diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,90764 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 12962, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 15.614504080242568, + "learning_rate": 1.2853470437017994e-07, + "loss": 2.9582, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 15.570163404465948, + "learning_rate": 2.570694087403599e-07, + "loss": 2.8727, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 14.356797485109597, + "learning_rate": 3.8560411311053987e-07, + "loss": 2.8515, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 15.781751164059694, + "learning_rate": 5.141388174807198e-07, + "loss": 2.9528, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 14.94068714667129, + "learning_rate": 6.426735218508997e-07, + "loss": 2.838, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 14.207690351340352, + "learning_rate": 7.712082262210797e-07, + "loss": 2.8576, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 13.860289726894845, + "learning_rate": 8.997429305912597e-07, + "loss": 2.7774, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 13.35998549398397, + "learning_rate": 1.0282776349614395e-06, + "loss": 2.8728, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 11.849071113840285, + "learning_rate": 1.1568123393316196e-06, + "loss": 2.7707, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 9.076004211529174, + "learning_rate": 1.2853470437017995e-06, + "loss": 2.6408, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 6.240390429933518, + "learning_rate": 1.4138817480719794e-06, + "loss": 2.696, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 5.378123612137972, + "learning_rate": 1.5424164524421595e-06, + "loss": 2.7634, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 5.567312081792433, + "learning_rate": 1.6709511568123394e-06, + "loss": 2.5598, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 8.044425629898154, + "learning_rate": 1.7994858611825194e-06, + "loss": 2.5837, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 6.467778096569555, + "learning_rate": 1.928020565552699e-06, + "loss": 2.5774, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 6.218632565331351, + "learning_rate": 2.056555269922879e-06, + "loss": 2.6803, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 3.8545724267389976, + "learning_rate": 2.1850899742930593e-06, + "loss": 2.5352, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 3.1826273541619954, + "learning_rate": 2.313624678663239e-06, + "loss": 2.4847, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 3.1569608664356097, + "learning_rate": 2.442159383033419e-06, + "loss": 2.464, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 2.9987558130636356, + "learning_rate": 2.570694087403599e-06, + "loss": 2.6248, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 2.630066207289744, + "learning_rate": 2.699228791773779e-06, + "loss": 2.445, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 2.191065794331003, + "learning_rate": 2.8277634961439587e-06, + "loss": 2.4244, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 1.9851148257093616, + "learning_rate": 2.956298200514139e-06, + "loss": 2.4332, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 8.049303273666483, + "learning_rate": 3.084832904884319e-06, + "loss": 2.5698, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 2.6044393980477123, + "learning_rate": 3.213367609254499e-06, + "loss": 2.4001, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 3.6235482559708783, + "learning_rate": 3.3419023136246787e-06, + "loss": 2.4217, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 1.2629172635241919, + "learning_rate": 3.470437017994859e-06, + "loss": 2.4073, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 1.300242551751265, + "learning_rate": 3.598971722365039e-06, + "loss": 2.5546, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 1.2481148376547564, + "learning_rate": 3.7275064267352188e-06, + "loss": 2.3982, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 1.177143906465669, + "learning_rate": 3.856041131105398e-06, + "loss": 2.3921, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 1.1262339135292236, + "learning_rate": 3.984575835475578e-06, + "loss": 2.3848, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 1.1201312761638555, + "learning_rate": 4.113110539845758e-06, + "loss": 2.3534, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 1.19831346765079, + "learning_rate": 4.241645244215939e-06, + "loss": 2.5288, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 1.1262109382931946, + "learning_rate": 4.370179948586119e-06, + "loss": 2.358, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 1.1677785952812396, + "learning_rate": 4.4987146529562985e-06, + "loss": 2.3458, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 1.1248316136252485, + "learning_rate": 4.627249357326478e-06, + "loss": 2.5331, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 0.9438076309994886, + "learning_rate": 4.755784061696658e-06, + "loss": 2.3739, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 0.8649453968522334, + "learning_rate": 4.884318766066838e-06, + "loss": 2.3573, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 0.8424219839650771, + "learning_rate": 5.012853470437019e-06, + "loss": 2.3465, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 0.8819647120903683, + "learning_rate": 5.141388174807198e-06, + "loss": 2.5345, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 0.9075458149810234, + "learning_rate": 5.269922879177378e-06, + "loss": 2.3511, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 1.0068870378933819, + "learning_rate": 5.398457583547558e-06, + "loss": 2.3529, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 0.9514509413390595, + "learning_rate": 5.526992287917738e-06, + "loss": 2.326, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 0.8556176001785393, + "learning_rate": 5.6555269922879175e-06, + "loss": 2.3198, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 0.9619745039800207, + "learning_rate": 5.784061696658098e-06, + "loss": 2.5048, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 0.9876638609130287, + "learning_rate": 5.912596401028278e-06, + "loss": 2.3354, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 0.8688675959187675, + "learning_rate": 6.041131105398458e-06, + "loss": 2.3303, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 0.8025143826323919, + "learning_rate": 6.169665809768638e-06, + "loss": 2.4709, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 0.8680776063762177, + "learning_rate": 6.298200514138818e-06, + "loss": 2.2979, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 1.0592149110299272, + "learning_rate": 6.426735218508998e-06, + "loss": 2.3169, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 0.8973191238306678, + "learning_rate": 6.5552699228791775e-06, + "loss": 2.3028, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 0.8449496542813714, + "learning_rate": 6.683804627249357e-06, + "loss": 2.4668, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 0.8771845820006734, + "learning_rate": 6.812339331619537e-06, + "loss": 2.3433, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 0.8790940901876545, + "learning_rate": 6.940874035989718e-06, + "loss": 2.298, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 0.803435281615288, + "learning_rate": 7.069408740359898e-06, + "loss": 2.3524, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 0.7185911576146709, + "learning_rate": 7.197943444730078e-06, + "loss": 2.3055, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 0.8062159097676136, + "learning_rate": 7.326478149100258e-06, + "loss": 2.4896, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 0.7551608013791536, + "learning_rate": 7.4550128534704376e-06, + "loss": 2.3538, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 0.743329166402104, + "learning_rate": 7.5835475578406175e-06, + "loss": 2.2654, + "step": 59 + }, + { + "epoch": 0.0, + "grad_norm": 0.839762047748952, + "learning_rate": 7.712082262210796e-06, + "loss": 2.4416, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 0.7605774823430541, + "learning_rate": 7.840616966580976e-06, + "loss": 2.3077, + "step": 61 + }, + { + "epoch": 0.0, + "grad_norm": 0.8025973417994856, + "learning_rate": 7.969151670951156e-06, + "loss": 2.2836, + "step": 62 + }, + { + "epoch": 0.0, + "grad_norm": 0.6864232970479806, + "learning_rate": 8.097686375321336e-06, + "loss": 2.2811, + "step": 63 + }, + { + "epoch": 0.0, + "grad_norm": 0.7940441918021376, + "learning_rate": 8.226221079691516e-06, + "loss": 2.2973, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 0.7392450587141398, + "learning_rate": 8.354755784061698e-06, + "loss": 2.5009, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 0.7406350762506708, + "learning_rate": 8.483290488431877e-06, + "loss": 2.3045, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 0.7543156254603282, + "learning_rate": 8.611825192802057e-06, + "loss": 2.2644, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 0.7848852496477787, + "learning_rate": 8.740359897172237e-06, + "loss": 2.4247, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 0.7365416779921541, + "learning_rate": 8.868894601542417e-06, + "loss": 2.3137, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 0.7350442586478796, + "learning_rate": 8.997429305912597e-06, + "loss": 2.2579, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 0.7230483333064687, + "learning_rate": 9.125964010282777e-06, + "loss": 2.2739, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 0.7269909307113563, + "learning_rate": 9.254498714652957e-06, + "loss": 2.4287, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 0.7520259645621727, + "learning_rate": 9.383033419023137e-06, + "loss": 2.2747, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 0.7960539911485558, + "learning_rate": 9.511568123393317e-06, + "loss": 2.2894, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 0.7160407111609224, + "learning_rate": 9.640102827763496e-06, + "loss": 2.2722, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 0.7409386960529175, + "learning_rate": 9.768637532133676e-06, + "loss": 2.2911, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 0.7121957913508553, + "learning_rate": 9.897172236503858e-06, + "loss": 2.4432, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 0.7342355334139097, + "learning_rate": 1.0025706940874038e-05, + "loss": 2.3059, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 0.7503926601087365, + "learning_rate": 1.0154241645244216e-05, + "loss": 2.2978, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 0.7431219726322306, + "learning_rate": 1.0282776349614396e-05, + "loss": 2.4642, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 0.6912343863017675, + "learning_rate": 1.0411311053984576e-05, + "loss": 2.2826, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 0.7535315971131915, + "learning_rate": 1.0539845758354756e-05, + "loss": 2.268, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 0.7293404309596656, + "learning_rate": 1.0668380462724936e-05, + "loss": 2.2365, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 0.7493574754176567, + "learning_rate": 1.0796915167095115e-05, + "loss": 2.4182, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 0.7741535205635848, + "learning_rate": 1.0925449871465295e-05, + "loss": 2.2451, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 0.7249225770927538, + "learning_rate": 1.1053984575835475e-05, + "loss": 2.2737, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 0.688407931585645, + "learning_rate": 1.1182519280205655e-05, + "loss": 2.2964, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 0.7228442476379507, + "learning_rate": 1.1311053984575835e-05, + "loss": 2.252, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 0.6985712773681425, + "learning_rate": 1.1439588688946017e-05, + "loss": 2.4318, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 0.743051302790943, + "learning_rate": 1.1568123393316196e-05, + "loss": 2.2714, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.7352474699051454, + "learning_rate": 1.1696658097686376e-05, + "loss": 2.2312, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 0.6857505375836221, + "learning_rate": 1.1825192802056556e-05, + "loss": 2.3752, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 0.7357208435435285, + "learning_rate": 1.1953727506426736e-05, + "loss": 2.2498, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 0.6828846057938996, + "learning_rate": 1.2082262210796916e-05, + "loss": 2.2675, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 0.7717079623683487, + "learning_rate": 1.2210796915167096e-05, + "loss": 2.2534, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.7853346252126335, + "learning_rate": 1.2339331619537276e-05, + "loss": 2.2224, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 0.7346116207273206, + "learning_rate": 1.2467866323907456e-05, + "loss": 2.4562, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 0.741239462158256, + "learning_rate": 1.2596401028277636e-05, + "loss": 2.2545, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 0.7603482137204725, + "learning_rate": 1.2724935732647817e-05, + "loss": 2.2573, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 0.7028756516841389, + "learning_rate": 1.2853470437017995e-05, + "loss": 2.2877, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.7381484539558425, + "learning_rate": 1.2982005141388177e-05, + "loss": 2.3774, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 0.7947011145062155, + "learning_rate": 1.3110539845758355e-05, + "loss": 2.2394, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 0.7327421330152774, + "learning_rate": 1.3239074550128535e-05, + "loss": 2.2408, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 0.7945811841184374, + "learning_rate": 1.3367609254498715e-05, + "loss": 2.4034, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 0.7489403340172621, + "learning_rate": 1.3496143958868895e-05, + "loss": 2.2526, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.6899589256194841, + "learning_rate": 1.3624678663239075e-05, + "loss": 2.2861, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 0.8663417148519842, + "learning_rate": 1.3753213367609254e-05, + "loss": 2.2177, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 0.7342809824930605, + "learning_rate": 1.3881748071979436e-05, + "loss": 2.2449, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 0.8160635567395097, + "learning_rate": 1.4010282776349614e-05, + "loss": 2.41, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 0.8034197795553303, + "learning_rate": 1.4138817480719796e-05, + "loss": 2.2181, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.7235235106141928, + "learning_rate": 1.4267352185089974e-05, + "loss": 2.2708, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 0.712692858685245, + "learning_rate": 1.4395886889460156e-05, + "loss": 2.2478, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 0.7718128085655253, + "learning_rate": 1.4524421593830334e-05, + "loss": 2.3759, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 0.7988679756765087, + "learning_rate": 1.4652956298200515e-05, + "loss": 2.2888, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 0.7682800261550204, + "learning_rate": 1.4781491002570694e-05, + "loss": 2.2442, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.8174401345904406, + "learning_rate": 1.4910025706940875e-05, + "loss": 2.2324, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 0.7337342640199395, + "learning_rate": 1.5038560411311053e-05, + "loss": 2.373, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 0.7233248531945753, + "learning_rate": 1.5167095115681235e-05, + "loss": 2.2296, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 0.8951712559300193, + "learning_rate": 1.5295629820051416e-05, + "loss": 2.2088, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 0.7515945129280316, + "learning_rate": 1.5424164524421593e-05, + "loss": 2.2358, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 0.8472798169972598, + "learning_rate": 1.5552699228791776e-05, + "loss": 2.3821, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 0.815944237925127, + "learning_rate": 1.5681233933161953e-05, + "loss": 2.2392, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 0.7637159644660926, + "learning_rate": 1.5809768637532136e-05, + "loss": 2.2272, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 0.8367666448646426, + "learning_rate": 1.5938303341902313e-05, + "loss": 2.4248, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 0.7114587818097278, + "learning_rate": 1.6066838046272496e-05, + "loss": 2.2506, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 0.8019529997505153, + "learning_rate": 1.6195372750642672e-05, + "loss": 2.2143, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 0.832489195816639, + "learning_rate": 1.6323907455012856e-05, + "loss": 2.2487, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 0.8340810174967511, + "learning_rate": 1.6452442159383032e-05, + "loss": 2.2134, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 0.7424160283345604, + "learning_rate": 1.6580976863753215e-05, + "loss": 2.391, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 0.8031178193982584, + "learning_rate": 1.6709511568123395e-05, + "loss": 2.2064, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 0.7399645111812496, + "learning_rate": 1.6838046272493575e-05, + "loss": 2.2394, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 0.8269335128138091, + "learning_rate": 1.6966580976863755e-05, + "loss": 2.2153, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 0.7693441703791892, + "learning_rate": 1.7095115681233935e-05, + "loss": 2.3841, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 0.7676822031426157, + "learning_rate": 1.7223650385604115e-05, + "loss": 2.1814, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 0.7804318459497673, + "learning_rate": 1.7352185089974295e-05, + "loss": 2.2391, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 0.7438932382567364, + "learning_rate": 1.7480719794344475e-05, + "loss": 2.3838, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 0.712586081466253, + "learning_rate": 1.7609254498714654e-05, + "loss": 2.2613, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 0.8600477181581904, + "learning_rate": 1.7737789203084834e-05, + "loss": 2.1858, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 0.7530114647052649, + "learning_rate": 1.7866323907455014e-05, + "loss": 2.2505, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 0.803068386434486, + "learning_rate": 1.7994858611825194e-05, + "loss": 2.1718, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 0.8325994802276592, + "learning_rate": 1.8123393316195374e-05, + "loss": 2.4214, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 0.749460914628141, + "learning_rate": 1.8251928020565554e-05, + "loss": 2.2305, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 0.7532191238420938, + "learning_rate": 1.8380462724935734e-05, + "loss": 2.23, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 0.9118671439579568, + "learning_rate": 1.8508997429305914e-05, + "loss": 2.2024, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 0.7432233383656482, + "learning_rate": 1.8637532133676093e-05, + "loss": 2.3729, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 0.8038797558993938, + "learning_rate": 1.8766066838046273e-05, + "loss": 2.2352, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 0.8068803892120634, + "learning_rate": 1.8894601542416453e-05, + "loss": 2.1927, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 0.7645221466645382, + "learning_rate": 1.9023136246786633e-05, + "loss": 2.2319, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 0.725793203223981, + "learning_rate": 1.9151670951156813e-05, + "loss": 2.3541, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 0.8255924430645825, + "learning_rate": 1.9280205655526993e-05, + "loss": 2.1842, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 0.8265884765552166, + "learning_rate": 1.9408740359897173e-05, + "loss": 2.1864, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 0.8378283781456477, + "learning_rate": 1.9537275064267353e-05, + "loss": 2.2437, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 0.7598232159202175, + "learning_rate": 1.9665809768637533e-05, + "loss": 2.4008, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 0.8257843017410677, + "learning_rate": 1.9794344473007716e-05, + "loss": 2.2326, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 0.7753745188253554, + "learning_rate": 1.9922879177377892e-05, + "loss": 2.1824, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 0.7138011035699597, + "learning_rate": 2.0051413881748076e-05, + "loss": 2.2453, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 0.774284079807111, + "learning_rate": 2.0179948586118252e-05, + "loss": 2.3797, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 0.8633054577213001, + "learning_rate": 2.0308483290488432e-05, + "loss": 2.1679, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 0.805168376446495, + "learning_rate": 2.0437017994858612e-05, + "loss": 2.1878, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 0.7921651051962181, + "learning_rate": 2.0565552699228792e-05, + "loss": 2.1983, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 0.8062833144635174, + "learning_rate": 2.069408740359897e-05, + "loss": 2.358, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 0.752102994062018, + "learning_rate": 2.082262210796915e-05, + "loss": 2.239, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 0.8531593906546486, + "learning_rate": 2.095115681233933e-05, + "loss": 2.2306, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 0.8449248806805474, + "learning_rate": 2.107969151670951e-05, + "loss": 2.1964, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 0.842256756711545, + "learning_rate": 2.120822622107969e-05, + "loss": 2.413, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 0.8473662940224512, + "learning_rate": 2.133676092544987e-05, + "loss": 2.222, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 0.8042223052263614, + "learning_rate": 2.1465295629820054e-05, + "loss": 2.2017, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 0.7950694658117617, + "learning_rate": 2.159383033419023e-05, + "loss": 2.1971, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 0.7704212442597661, + "learning_rate": 2.1722365038560414e-05, + "loss": 2.3261, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 0.9416442899451295, + "learning_rate": 2.185089974293059e-05, + "loss": 2.1464, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 0.8068008414002802, + "learning_rate": 2.1979434447300774e-05, + "loss": 2.2209, + "step": 171 + }, + { + "epoch": 0.01, + "grad_norm": 0.8974292527858956, + "learning_rate": 2.210796915167095e-05, + "loss": 2.1655, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 0.8479575723898999, + "learning_rate": 2.2236503856041134e-05, + "loss": 2.3451, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 0.7986356237148048, + "learning_rate": 2.236503856041131e-05, + "loss": 2.2272, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 0.8286607794795989, + "learning_rate": 2.2493573264781493e-05, + "loss": 2.1686, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 0.954741370859446, + "learning_rate": 2.262210796915167e-05, + "loss": 2.2007, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 0.8323671581646224, + "learning_rate": 2.2750642673521853e-05, + "loss": 2.3998, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 0.8806843936190892, + "learning_rate": 2.2879177377892033e-05, + "loss": 2.1764, + "step": 178 + }, + { + "epoch": 0.01, + "grad_norm": 0.857802849990949, + "learning_rate": 2.3007712082262213e-05, + "loss": 2.183, + "step": 179 + }, + { + "epoch": 0.01, + "grad_norm": 0.8708433848297302, + "learning_rate": 2.3136246786632393e-05, + "loss": 2.1778, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 0.840412034652977, + "learning_rate": 2.3264781491002573e-05, + "loss": 2.352, + "step": 181 + }, + { + "epoch": 0.01, + "grad_norm": 0.9344075211623188, + "learning_rate": 2.3393316195372753e-05, + "loss": 2.17, + "step": 182 + }, + { + "epoch": 0.01, + "grad_norm": 0.8812025888871806, + "learning_rate": 2.3521850899742933e-05, + "loss": 2.152, + "step": 183 + }, + { + "epoch": 0.01, + "grad_norm": 0.9509939865879609, + "learning_rate": 2.3650385604113112e-05, + "loss": 2.195, + "step": 184 + }, + { + "epoch": 0.01, + "grad_norm": 0.9945527159093788, + "learning_rate": 2.3778920308483292e-05, + "loss": 2.3279, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 0.886769605585205, + "learning_rate": 2.3907455012853472e-05, + "loss": 2.1942, + "step": 186 + }, + { + "epoch": 0.01, + "grad_norm": 0.8959021518910648, + "learning_rate": 2.403598971722365e-05, + "loss": 2.2206, + "step": 187 + }, + { + "epoch": 0.01, + "grad_norm": 1.1859858907128553, + "learning_rate": 2.4164524421593832e-05, + "loss": 2.1445, + "step": 188 + }, + { + "epoch": 0.01, + "grad_norm": 1.0094984017630166, + "learning_rate": 2.4293059125964012e-05, + "loss": 2.3268, + "step": 189 + }, + { + "epoch": 0.01, + "grad_norm": 0.9238777016137953, + "learning_rate": 2.4421593830334192e-05, + "loss": 2.1705, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 1.0038666111133774, + "learning_rate": 2.455012853470437e-05, + "loss": 2.1728, + "step": 191 + }, + { + "epoch": 0.01, + "grad_norm": 0.9933981749687495, + "learning_rate": 2.467866323907455e-05, + "loss": 2.1753, + "step": 192 + }, + { + "epoch": 0.01, + "grad_norm": 0.8109137074634817, + "learning_rate": 2.480719794344473e-05, + "loss": 2.3185, + "step": 193 + }, + { + "epoch": 0.01, + "grad_norm": 1.0106594923453656, + "learning_rate": 2.493573264781491e-05, + "loss": 2.1354, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 0.9243365227367555, + "learning_rate": 2.5064267352185088e-05, + "loss": 2.1799, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 0.9321626469181074, + "learning_rate": 2.519280205655527e-05, + "loss": 2.167, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 0.8646693304018523, + "learning_rate": 2.532133676092545e-05, + "loss": 2.347, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 0.8372559532358477, + "learning_rate": 2.5449871465295634e-05, + "loss": 2.1813, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 0.8396683369773447, + "learning_rate": 2.5578406169665807e-05, + "loss": 2.223, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 0.8817676829226687, + "learning_rate": 2.570694087403599e-05, + "loss": 2.1483, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 0.776459798803132, + "learning_rate": 2.583547557840617e-05, + "loss": 2.3365, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 0.8084753660969847, + "learning_rate": 2.5964010282776354e-05, + "loss": 2.1554, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 0.8764255240490545, + "learning_rate": 2.6092544987146534e-05, + "loss": 2.1585, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 0.8577833790854885, + "learning_rate": 2.622107969151671e-05, + "loss": 2.202, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 0.8781993404197235, + "learning_rate": 2.634961439588689e-05, + "loss": 2.3809, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 0.889180704863445, + "learning_rate": 2.647814910025707e-05, + "loss": 2.1945, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 0.9425559892998966, + "learning_rate": 2.6606683804627253e-05, + "loss": 2.1552, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 1.000696328058695, + "learning_rate": 2.673521850899743e-05, + "loss": 2.1882, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 0.8298721511261294, + "learning_rate": 2.686375321336761e-05, + "loss": 2.3159, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 0.8573486621291903, + "learning_rate": 2.699228791773779e-05, + "loss": 2.1083, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 0.9666676202109203, + "learning_rate": 2.7120822622107973e-05, + "loss": 2.1577, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 0.7871256442648927, + "learning_rate": 2.724935732647815e-05, + "loss": 2.2281, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 0.8433162502641173, + "learning_rate": 2.737789203084833e-05, + "loss": 2.3337, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 0.9560937954519017, + "learning_rate": 2.750642673521851e-05, + "loss": 2.1453, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 0.9238370998287431, + "learning_rate": 2.7634961439588692e-05, + "loss": 2.1703, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 0.9005431888321329, + "learning_rate": 2.7763496143958872e-05, + "loss": 2.1396, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 0.9048485134303587, + "learning_rate": 2.789203084832905e-05, + "loss": 2.3822, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 0.7455479070037024, + "learning_rate": 2.802056555269923e-05, + "loss": 2.1846, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 0.830115199097991, + "learning_rate": 2.8149100257069412e-05, + "loss": 2.1473, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 0.8636395035972432, + "learning_rate": 2.827763496143959e-05, + "loss": 2.1965, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 0.8054978869881467, + "learning_rate": 2.8406169665809768e-05, + "loss": 2.3366, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 0.8681404882413228, + "learning_rate": 2.8534704370179948e-05, + "loss": 2.1337, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 0.8927113650861616, + "learning_rate": 2.866323907455013e-05, + "loss": 2.1411, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 0.75671445839545, + "learning_rate": 2.879177377892031e-05, + "loss": 2.1998, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 0.7972525417440803, + "learning_rate": 2.892030848329049e-05, + "loss": 2.3657, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 0.8208457285454869, + "learning_rate": 2.9048843187660668e-05, + "loss": 2.1656, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 0.815931009332894, + "learning_rate": 2.917737789203085e-05, + "loss": 2.1466, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 0.8193856937777181, + "learning_rate": 2.930591259640103e-05, + "loss": 2.1581, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 0.8665210116298263, + "learning_rate": 2.943444730077121e-05, + "loss": 2.3562, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 0.7522637518379005, + "learning_rate": 2.9562982005141387e-05, + "loss": 2.1879, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 0.8082362465992756, + "learning_rate": 2.969151670951157e-05, + "loss": 2.1543, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 0.8964399948187011, + "learning_rate": 2.982005141388175e-05, + "loss": 2.1378, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 0.8247864590624835, + "learning_rate": 2.994858611825193e-05, + "loss": 2.3513, + "step": 233 + }, + { + "epoch": 0.02, + "grad_norm": 0.8703640276954714, + "learning_rate": 3.0077120822622107e-05, + "loss": 2.1312, + "step": 234 + }, + { + "epoch": 0.02, + "grad_norm": 0.9344182165770607, + "learning_rate": 3.0205655526992287e-05, + "loss": 2.1212, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 0.8699682147643341, + "learning_rate": 3.033419023136247e-05, + "loss": 2.1938, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 0.9682396468502663, + "learning_rate": 3.046272493573265e-05, + "loss": 2.3009, + "step": 237 + }, + { + "epoch": 0.02, + "grad_norm": 0.9204904605625427, + "learning_rate": 3.059125964010283e-05, + "loss": 2.1922, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 0.9957959750973201, + "learning_rate": 3.0719794344473006e-05, + "loss": 2.1544, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 0.8798991594277322, + "learning_rate": 3.0848329048843186e-05, + "loss": 2.1357, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 0.7840988764470934, + "learning_rate": 3.097686375321337e-05, + "loss": 2.3482, + "step": 241 + }, + { + "epoch": 0.02, + "grad_norm": 0.851310682314147, + "learning_rate": 3.110539845758355e-05, + "loss": 2.1685, + "step": 242 + }, + { + "epoch": 0.02, + "grad_norm": 0.8456942784305319, + "learning_rate": 3.1233933161953726e-05, + "loss": 2.2, + "step": 243 + }, + { + "epoch": 0.02, + "grad_norm": 0.8514480384706256, + "learning_rate": 3.1362467866323906e-05, + "loss": 2.1178, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 0.9099053986922037, + "learning_rate": 3.149100257069409e-05, + "loss": 2.2845, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 0.8279899922425604, + "learning_rate": 3.161953727506427e-05, + "loss": 2.1725, + "step": 246 + }, + { + "epoch": 0.02, + "grad_norm": 0.8313809058230442, + "learning_rate": 3.1748071979434445e-05, + "loss": 2.1451, + "step": 247 + }, + { + "epoch": 0.02, + "grad_norm": 0.9378075178520656, + "learning_rate": 3.1876606683804625e-05, + "loss": 2.1274, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 0.7622517964462712, + "learning_rate": 3.200514138817481e-05, + "loss": 2.1685, + "step": 249 + }, + { + "epoch": 0.02, + "grad_norm": 0.8909518195269749, + "learning_rate": 3.213367609254499e-05, + "loss": 2.3435, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 0.9650547266330559, + "learning_rate": 3.226221079691517e-05, + "loss": 2.1889, + "step": 251 + }, + { + "epoch": 0.02, + "grad_norm": 0.8733172155639845, + "learning_rate": 3.2390745501285345e-05, + "loss": 2.1284, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 1.0052534483850306, + "learning_rate": 3.251928020565553e-05, + "loss": 2.3263, + "step": 253 + }, + { + "epoch": 0.02, + "grad_norm": 0.8904747276526405, + "learning_rate": 3.264781491002571e-05, + "loss": 2.1521, + "step": 254 + }, + { + "epoch": 0.02, + "grad_norm": 0.9828398508170662, + "learning_rate": 3.277634961439589e-05, + "loss": 2.1533, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 0.9928896583152526, + "learning_rate": 3.2904884318766064e-05, + "loss": 2.1397, + "step": 256 + }, + { + "epoch": 0.02, + "grad_norm": 0.94910024758784, + "learning_rate": 3.3033419023136244e-05, + "loss": 2.3181, + "step": 257 + }, + { + "epoch": 0.02, + "grad_norm": 0.9210982285355511, + "learning_rate": 3.316195372750643e-05, + "loss": 2.1382, + "step": 258 + }, + { + "epoch": 0.02, + "grad_norm": 0.8893051305631425, + "learning_rate": 3.329048843187661e-05, + "loss": 2.1345, + "step": 259 + }, + { + "epoch": 0.02, + "grad_norm": 1.0069840917191275, + "learning_rate": 3.341902313624679e-05, + "loss": 2.0934, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 0.8144084792935744, + "learning_rate": 3.3547557840616964e-05, + "loss": 2.183, + "step": 261 + }, + { + "epoch": 0.02, + "grad_norm": 0.8894335034020656, + "learning_rate": 3.367609254498715e-05, + "loss": 2.2968, + "step": 262 + }, + { + "epoch": 0.02, + "grad_norm": 0.9737452255198756, + "learning_rate": 3.380462724935733e-05, + "loss": 2.0878, + "step": 263 + }, + { + "epoch": 0.02, + "grad_norm": 0.9534263584781247, + "learning_rate": 3.393316195372751e-05, + "loss": 2.1539, + "step": 264 + }, + { + "epoch": 0.02, + "grad_norm": 0.8728334944976406, + "learning_rate": 3.406169665809768e-05, + "loss": 2.3197, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 0.8775640232734667, + "learning_rate": 3.419023136246787e-05, + "loss": 2.1279, + "step": 266 + }, + { + "epoch": 0.02, + "grad_norm": 1.0188555166573248, + "learning_rate": 3.431876606683805e-05, + "loss": 2.1845, + "step": 267 + }, + { + "epoch": 0.02, + "grad_norm": 0.9484249961824899, + "learning_rate": 3.444730077120823e-05, + "loss": 2.1497, + "step": 268 + }, + { + "epoch": 0.02, + "grad_norm": 0.8124571658973927, + "learning_rate": 3.45758354755784e-05, + "loss": 2.3162, + "step": 269 + }, + { + "epoch": 0.02, + "grad_norm": 1.0071685392965257, + "learning_rate": 3.470437017994859e-05, + "loss": 2.1484, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 0.8037777173426008, + "learning_rate": 3.483290488431877e-05, + "loss": 2.1098, + "step": 271 + }, + { + "epoch": 0.02, + "grad_norm": 0.9498689277672548, + "learning_rate": 3.496143958868895e-05, + "loss": 2.1618, + "step": 272 + }, + { + "epoch": 0.02, + "grad_norm": 0.8346628375739239, + "learning_rate": 3.508997429305913e-05, + "loss": 2.3119, + "step": 273 + }, + { + "epoch": 0.02, + "grad_norm": 0.810258621972297, + "learning_rate": 3.521850899742931e-05, + "loss": 2.16, + "step": 274 + }, + { + "epoch": 0.02, + "grad_norm": 0.9832139792235742, + "learning_rate": 3.534704370179949e-05, + "loss": 2.0887, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 0.817191678457258, + "learning_rate": 3.547557840616967e-05, + "loss": 2.1241, + "step": 276 + }, + { + "epoch": 0.02, + "grad_norm": 0.8660000347947862, + "learning_rate": 3.560411311053985e-05, + "loss": 2.3138, + "step": 277 + }, + { + "epoch": 0.02, + "grad_norm": 0.8149833299945981, + "learning_rate": 3.573264781491003e-05, + "loss": 2.1002, + "step": 278 + }, + { + "epoch": 0.02, + "grad_norm": 0.8700381938745791, + "learning_rate": 3.586118251928021e-05, + "loss": 2.1454, + "step": 279 + }, + { + "epoch": 0.02, + "grad_norm": 0.8608244430225308, + "learning_rate": 3.598971722365039e-05, + "loss": 2.1345, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 0.8092453261614396, + "learning_rate": 3.611825192802057e-05, + "loss": 2.1237, + "step": 281 + }, + { + "epoch": 0.02, + "grad_norm": 0.9037228073990298, + "learning_rate": 3.624678663239075e-05, + "loss": 2.3116, + "step": 282 + }, + { + "epoch": 0.02, + "grad_norm": 0.8225646743470585, + "learning_rate": 3.637532133676093e-05, + "loss": 2.1258, + "step": 283 + }, + { + "epoch": 0.02, + "grad_norm": 0.8619318488594805, + "learning_rate": 3.650385604113111e-05, + "loss": 2.1242, + "step": 284 + }, + { + "epoch": 0.02, + "grad_norm": 0.8155844239593858, + "learning_rate": 3.663239074550129e-05, + "loss": 2.2905, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 0.8589412595483565, + "learning_rate": 3.676092544987147e-05, + "loss": 2.1385, + "step": 286 + }, + { + "epoch": 0.02, + "grad_norm": 0.9899722294961518, + "learning_rate": 3.688946015424165e-05, + "loss": 2.0781, + "step": 287 + }, + { + "epoch": 0.02, + "grad_norm": 0.8877622740079876, + "learning_rate": 3.701799485861183e-05, + "loss": 2.1446, + "step": 288 + }, + { + "epoch": 0.02, + "grad_norm": 1.05763635838708, + "learning_rate": 3.714652956298201e-05, + "loss": 2.3216, + "step": 289 + }, + { + "epoch": 0.02, + "grad_norm": 0.8500307408568465, + "learning_rate": 3.727506426735219e-05, + "loss": 2.0983, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 0.9452587122646533, + "learning_rate": 3.740359897172237e-05, + "loss": 2.165, + "step": 291 + }, + { + "epoch": 0.02, + "grad_norm": 0.8464230999175705, + "learning_rate": 3.753213367609255e-05, + "loss": 2.1721, + "step": 292 + }, + { + "epoch": 0.02, + "grad_norm": 0.8304922226407954, + "learning_rate": 3.766066838046273e-05, + "loss": 2.1029, + "step": 293 + }, + { + "epoch": 0.02, + "grad_norm": 0.9372525199525727, + "learning_rate": 3.7789203084832907e-05, + "loss": 2.2473, + "step": 294 + }, + { + "epoch": 0.02, + "grad_norm": 0.827681917055771, + "learning_rate": 3.7917737789203086e-05, + "loss": 2.1189, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 0.9080064957599923, + "learning_rate": 3.8046272493573266e-05, + "loss": 2.1084, + "step": 296 + }, + { + "epoch": 0.02, + "grad_norm": 0.7999667851413097, + "learning_rate": 3.8174807197943446e-05, + "loss": 2.2356, + "step": 297 + }, + { + "epoch": 0.02, + "grad_norm": 0.806392404946158, + "learning_rate": 3.8303341902313626e-05, + "loss": 2.1273, + "step": 298 + }, + { + "epoch": 0.02, + "grad_norm": 0.8292438743068946, + "learning_rate": 3.8431876606683806e-05, + "loss": 2.1474, + "step": 299 + }, + { + "epoch": 0.02, + "grad_norm": 0.9100527586708572, + "learning_rate": 3.8560411311053986e-05, + "loss": 2.097, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 0.8964113006938416, + "learning_rate": 3.8688946015424166e-05, + "loss": 2.2736, + "step": 301 + }, + { + "epoch": 0.02, + "grad_norm": 0.8617871886772082, + "learning_rate": 3.8817480719794346e-05, + "loss": 2.1159, + "step": 302 + }, + { + "epoch": 0.02, + "grad_norm": 0.8487198459931585, + "learning_rate": 3.8946015424164526e-05, + "loss": 2.1319, + "step": 303 + }, + { + "epoch": 0.02, + "grad_norm": 0.8677515989974598, + "learning_rate": 3.9074550128534705e-05, + "loss": 2.1331, + "step": 304 + }, + { + "epoch": 0.02, + "grad_norm": 0.7354985725581981, + "learning_rate": 3.9203084832904885e-05, + "loss": 2.1485, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 0.8500310764927246, + "learning_rate": 3.9331619537275065e-05, + "loss": 2.2693, + "step": 306 + }, + { + "epoch": 0.02, + "grad_norm": 0.8703580385080903, + "learning_rate": 3.9460154241645245e-05, + "loss": 2.1257, + "step": 307 + }, + { + "epoch": 0.02, + "grad_norm": 0.8518917215616878, + "learning_rate": 3.958868894601543e-05, + "loss": 2.097, + "step": 308 + }, + { + "epoch": 0.02, + "grad_norm": 0.8639792676320875, + "learning_rate": 3.9717223650385605e-05, + "loss": 2.2755, + "step": 309 + }, + { + "epoch": 0.02, + "grad_norm": 0.8303638114412453, + "learning_rate": 3.9845758354755785e-05, + "loss": 2.1282, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 0.7745614314235371, + "learning_rate": 3.9974293059125965e-05, + "loss": 2.0952, + "step": 311 + }, + { + "epoch": 0.02, + "grad_norm": 0.9118032615071021, + "learning_rate": 4.010282776349615e-05, + "loss": 2.1241, + "step": 312 + }, + { + "epoch": 0.02, + "grad_norm": 0.847354847594381, + "learning_rate": 4.0231362467866324e-05, + "loss": 2.0603, + "step": 313 + }, + { + "epoch": 0.02, + "grad_norm": 0.8909382340071971, + "learning_rate": 4.0359897172236504e-05, + "loss": 2.2731, + "step": 314 + }, + { + "epoch": 0.02, + "grad_norm": 0.8614456618939683, + "learning_rate": 4.0488431876606684e-05, + "loss": 2.0796, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 0.8677412969107535, + "learning_rate": 4.0616966580976864e-05, + "loss": 2.1428, + "step": 316 + }, + { + "epoch": 0.02, + "grad_norm": 0.8263118413678917, + "learning_rate": 4.0745501285347044e-05, + "loss": 2.1178, + "step": 317 + }, + { + "epoch": 0.02, + "grad_norm": 0.7926581803625108, + "learning_rate": 4.0874035989717224e-05, + "loss": 2.3389, + "step": 318 + }, + { + "epoch": 0.02, + "grad_norm": 0.9553198751918666, + "learning_rate": 4.1002570694087404e-05, + "loss": 2.0861, + "step": 319 + }, + { + "epoch": 0.02, + "grad_norm": 0.912762246066301, + "learning_rate": 4.1131105398457584e-05, + "loss": 2.1066, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 0.8088595278696781, + "learning_rate": 4.125964010282777e-05, + "loss": 2.2718, + "step": 321 + }, + { + "epoch": 0.02, + "grad_norm": 0.9162821547994565, + "learning_rate": 4.138817480719794e-05, + "loss": 2.1133, + "step": 322 + }, + { + "epoch": 0.02, + "grad_norm": 0.7552532762904902, + "learning_rate": 4.151670951156812e-05, + "loss": 2.1532, + "step": 323 + }, + { + "epoch": 0.02, + "grad_norm": 0.8563933631964012, + "learning_rate": 4.16452442159383e-05, + "loss": 2.1168, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 0.9445953598016068, + "learning_rate": 4.177377892030849e-05, + "loss": 2.0495, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 0.7547707013070538, + "learning_rate": 4.190231362467866e-05, + "loss": 2.2658, + "step": 326 + }, + { + "epoch": 0.03, + "grad_norm": 0.8198409608689639, + "learning_rate": 4.203084832904884e-05, + "loss": 2.068, + "step": 327 + }, + { + "epoch": 0.03, + "grad_norm": 0.9572399442192118, + "learning_rate": 4.215938303341902e-05, + "loss": 2.0412, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 0.8881850156329527, + "learning_rate": 4.228791773778921e-05, + "loss": 2.2808, + "step": 329 + }, + { + "epoch": 0.03, + "grad_norm": 0.7762157108582601, + "learning_rate": 4.241645244215938e-05, + "loss": 2.1584, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 0.8375757796062334, + "learning_rate": 4.254498714652956e-05, + "loss": 2.0602, + "step": 331 + }, + { + "epoch": 0.03, + "grad_norm": 0.9283286049859709, + "learning_rate": 4.267352185089974e-05, + "loss": 2.0707, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 0.7358818169110288, + "learning_rate": 4.280205655526993e-05, + "loss": 2.1139, + "step": 333 + }, + { + "epoch": 0.03, + "grad_norm": 0.9602188243305393, + "learning_rate": 4.293059125964011e-05, + "loss": 2.2886, + "step": 334 + }, + { + "epoch": 0.03, + "grad_norm": 0.7984682553531695, + "learning_rate": 4.305912596401028e-05, + "loss": 2.0655, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 0.8165703562307497, + "learning_rate": 4.318766066838046e-05, + "loss": 2.1523, + "step": 336 + }, + { + "epoch": 0.03, + "grad_norm": 0.8354262339886275, + "learning_rate": 4.331619537275065e-05, + "loss": 2.0885, + "step": 337 + }, + { + "epoch": 0.03, + "grad_norm": 0.8128764513509886, + "learning_rate": 4.344473007712083e-05, + "loss": 2.2437, + "step": 338 + }, + { + "epoch": 0.03, + "grad_norm": 0.825527774423894, + "learning_rate": 4.3573264781491e-05, + "loss": 2.0837, + "step": 339 + }, + { + "epoch": 0.03, + "grad_norm": 0.8044921835942778, + "learning_rate": 4.370179948586118e-05, + "loss": 2.0612, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 0.8433194570271689, + "learning_rate": 4.383033419023137e-05, + "loss": 2.2951, + "step": 341 + }, + { + "epoch": 0.03, + "grad_norm": 0.7738317605120996, + "learning_rate": 4.395886889460155e-05, + "loss": 2.1568, + "step": 342 + }, + { + "epoch": 0.03, + "grad_norm": 0.8012292417236062, + "learning_rate": 4.408740359897173e-05, + "loss": 2.0523, + "step": 343 + }, + { + "epoch": 0.03, + "grad_norm": 0.8473619987616035, + "learning_rate": 4.42159383033419e-05, + "loss": 2.0864, + "step": 344 + }, + { + "epoch": 0.03, + "grad_norm": 0.8056024692378703, + "learning_rate": 4.434447300771208e-05, + "loss": 2.1009, + "step": 345 + }, + { + "epoch": 0.03, + "grad_norm": 0.837167737608196, + "learning_rate": 4.447300771208227e-05, + "loss": 2.279, + "step": 346 + }, + { + "epoch": 0.03, + "grad_norm": 0.7570582597052302, + "learning_rate": 4.460154241645245e-05, + "loss": 2.0903, + "step": 347 + }, + { + "epoch": 0.03, + "grad_norm": 0.7519677905577871, + "learning_rate": 4.473007712082262e-05, + "loss": 2.1586, + "step": 348 + }, + { + "epoch": 0.03, + "grad_norm": 0.7740784841521778, + "learning_rate": 4.48586118251928e-05, + "loss": 2.0765, + "step": 349 + }, + { + "epoch": 0.03, + "grad_norm": 0.7368430313504458, + "learning_rate": 4.498714652956299e-05, + "loss": 2.2834, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 0.8693554172654819, + "learning_rate": 4.511568123393317e-05, + "loss": 2.0676, + "step": 351 + }, + { + "epoch": 0.03, + "grad_norm": 0.799719553685666, + "learning_rate": 4.524421593830334e-05, + "loss": 2.09, + "step": 352 + }, + { + "epoch": 0.03, + "grad_norm": 0.9054201750990999, + "learning_rate": 4.537275064267352e-05, + "loss": 2.2791, + "step": 353 + }, + { + "epoch": 0.03, + "grad_norm": 0.9016969309095005, + "learning_rate": 4.5501285347043706e-05, + "loss": 2.1525, + "step": 354 + }, + { + "epoch": 0.03, + "grad_norm": 0.9191063043798773, + "learning_rate": 4.5629820051413886e-05, + "loss": 2.0865, + "step": 355 + }, + { + "epoch": 0.03, + "grad_norm": 0.913842710979489, + "learning_rate": 4.5758354755784066e-05, + "loss": 2.1075, + "step": 356 + }, + { + "epoch": 0.03, + "grad_norm": 0.7883323826783198, + "learning_rate": 4.588688946015424e-05, + "loss": 2.0127, + "step": 357 + }, + { + "epoch": 0.03, + "grad_norm": 0.9350454557578266, + "learning_rate": 4.6015424164524426e-05, + "loss": 2.3214, + "step": 358 + }, + { + "epoch": 0.03, + "grad_norm": 0.8518201446433175, + "learning_rate": 4.6143958868894606e-05, + "loss": 2.0453, + "step": 359 + }, + { + "epoch": 0.03, + "grad_norm": 0.8523334988175477, + "learning_rate": 4.6272493573264786e-05, + "loss": 2.0779, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 0.7039829534606079, + "learning_rate": 4.640102827763496e-05, + "loss": 2.1315, + "step": 361 + }, + { + "epoch": 0.03, + "grad_norm": 0.8297513553222922, + "learning_rate": 4.6529562982005145e-05, + "loss": 2.2753, + "step": 362 + }, + { + "epoch": 0.03, + "grad_norm": 0.7983605852683096, + "learning_rate": 4.6658097686375325e-05, + "loss": 2.0706, + "step": 363 + }, + { + "epoch": 0.03, + "grad_norm": 0.8058756890740276, + "learning_rate": 4.6786632390745505e-05, + "loss": 2.1042, + "step": 364 + }, + { + "epoch": 0.03, + "grad_norm": 0.8100279961052802, + "learning_rate": 4.691516709511568e-05, + "loss": 2.1182, + "step": 365 + }, + { + "epoch": 0.03, + "grad_norm": 0.8765908325678339, + "learning_rate": 4.7043701799485865e-05, + "loss": 2.2436, + "step": 366 + }, + { + "epoch": 0.03, + "grad_norm": 0.7767884797247849, + "learning_rate": 4.7172236503856045e-05, + "loss": 2.1113, + "step": 367 + }, + { + "epoch": 0.03, + "grad_norm": 0.8669188181701808, + "learning_rate": 4.7300771208226225e-05, + "loss": 2.0812, + "step": 368 + }, + { + "epoch": 0.03, + "grad_norm": 0.8013753736440375, + "learning_rate": 4.7429305912596405e-05, + "loss": 2.1107, + "step": 369 + }, + { + "epoch": 0.03, + "grad_norm": 0.8184550380863294, + "learning_rate": 4.7557840616966585e-05, + "loss": 2.2659, + "step": 370 + }, + { + "epoch": 0.03, + "grad_norm": 0.8284297373516081, + "learning_rate": 4.7686375321336764e-05, + "loss": 2.0811, + "step": 371 + }, + { + "epoch": 0.03, + "grad_norm": 0.7532731279753423, + "learning_rate": 4.7814910025706944e-05, + "loss": 2.1254, + "step": 372 + }, + { + "epoch": 0.03, + "grad_norm": 0.8640461323037059, + "learning_rate": 4.7943444730077124e-05, + "loss": 2.1264, + "step": 373 + }, + { + "epoch": 0.03, + "grad_norm": 0.73475651413898, + "learning_rate": 4.80719794344473e-05, + "loss": 2.2675, + "step": 374 + }, + { + "epoch": 0.03, + "grad_norm": 0.7682674945788471, + "learning_rate": 4.8200514138817484e-05, + "loss": 2.082, + "step": 375 + }, + { + "epoch": 0.03, + "grad_norm": 0.8181447462931017, + "learning_rate": 4.8329048843187664e-05, + "loss": 2.0638, + "step": 376 + }, + { + "epoch": 0.03, + "grad_norm": 0.9344356888550188, + "learning_rate": 4.8457583547557844e-05, + "loss": 2.0894, + "step": 377 + }, + { + "epoch": 0.03, + "grad_norm": 0.7988065826780911, + "learning_rate": 4.8586118251928024e-05, + "loss": 2.2891, + "step": 378 + }, + { + "epoch": 0.03, + "grad_norm": 0.8682307708465136, + "learning_rate": 4.8714652956298204e-05, + "loss": 2.1165, + "step": 379 + }, + { + "epoch": 0.03, + "grad_norm": 0.892808523649001, + "learning_rate": 4.8843187660668383e-05, + "loss": 2.1021, + "step": 380 + }, + { + "epoch": 0.03, + "grad_norm": 0.8630464626378955, + "learning_rate": 4.897172236503856e-05, + "loss": 2.075, + "step": 381 + }, + { + "epoch": 0.03, + "grad_norm": 0.8721985941703809, + "learning_rate": 4.910025706940874e-05, + "loss": 2.2786, + "step": 382 + }, + { + "epoch": 0.03, + "grad_norm": 0.7583571634181621, + "learning_rate": 4.922879177377892e-05, + "loss": 2.0996, + "step": 383 + }, + { + "epoch": 0.03, + "grad_norm": 0.8640644130799212, + "learning_rate": 4.93573264781491e-05, + "loss": 2.0894, + "step": 384 + }, + { + "epoch": 0.03, + "grad_norm": 0.7433709916772382, + "learning_rate": 4.948586118251928e-05, + "loss": 2.1565, + "step": 385 + }, + { + "epoch": 0.03, + "grad_norm": 0.8026005748344281, + "learning_rate": 4.961439588688946e-05, + "loss": 2.2824, + "step": 386 + }, + { + "epoch": 0.03, + "grad_norm": 0.879464100607665, + "learning_rate": 4.974293059125964e-05, + "loss": 2.0902, + "step": 387 + }, + { + "epoch": 0.03, + "grad_norm": 0.9045204292307262, + "learning_rate": 4.987146529562982e-05, + "loss": 2.0909, + "step": 388 + }, + { + "epoch": 0.03, + "grad_norm": 0.9658830197996352, + "learning_rate": 5e-05, + "loss": 2.0984, + "step": 389 + }, + { + "epoch": 0.03, + "grad_norm": 0.8232653956471361, + "learning_rate": 4.9999999219573654e-05, + "loss": 2.2913, + "step": 390 + }, + { + "epoch": 0.03, + "grad_norm": 0.8628948057280135, + "learning_rate": 4.999999687829465e-05, + "loss": 2.0863, + "step": 391 + }, + { + "epoch": 0.03, + "grad_norm": 0.8100726439528145, + "learning_rate": 4.9999992976163134e-05, + "loss": 2.1178, + "step": 392 + }, + { + "epoch": 0.03, + "grad_norm": 0.7836585604140691, + "learning_rate": 4.9999987513179356e-05, + "loss": 2.0862, + "step": 393 + }, + { + "epoch": 0.03, + "grad_norm": 1.0861402659570965, + "learning_rate": 4.9999980489343654e-05, + "loss": 2.2711, + "step": 394 + }, + { + "epoch": 0.03, + "grad_norm": 1.0821659517085256, + "learning_rate": 4.999997190465647e-05, + "loss": 2.1257, + "step": 395 + }, + { + "epoch": 0.03, + "grad_norm": 0.7729306412833703, + "learning_rate": 4.999996175911834e-05, + "loss": 2.0677, + "step": 396 + }, + { + "epoch": 0.03, + "grad_norm": 0.8229142493691448, + "learning_rate": 4.9999950052729894e-05, + "loss": 2.0697, + "step": 397 + }, + { + "epoch": 0.03, + "grad_norm": 0.8472627694967156, + "learning_rate": 4.9999936785491864e-05, + "loss": 2.2627, + "step": 398 + }, + { + "epoch": 0.03, + "grad_norm": 0.8070808701616811, + "learning_rate": 4.9999921957405074e-05, + "loss": 2.0669, + "step": 399 + }, + { + "epoch": 0.03, + "grad_norm": 0.9981962830207696, + "learning_rate": 4.999990556847045e-05, + "loss": 2.0874, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 0.9552103813766676, + "learning_rate": 4.9999887618689026e-05, + "loss": 2.0531, + "step": 401 + }, + { + "epoch": 0.03, + "grad_norm": 0.8118047843911405, + "learning_rate": 4.9999868108061924e-05, + "loss": 2.27, + "step": 402 + }, + { + "epoch": 0.03, + "grad_norm": 1.0649515536668732, + "learning_rate": 4.999984703659034e-05, + "loss": 2.0751, + "step": 403 + }, + { + "epoch": 0.03, + "grad_norm": 0.8397503691400678, + "learning_rate": 4.9999824404275606e-05, + "loss": 2.1119, + "step": 404 + }, + { + "epoch": 0.03, + "grad_norm": 0.7943271780521511, + "learning_rate": 4.999980021111914e-05, + "loss": 2.0389, + "step": 405 + }, + { + "epoch": 0.03, + "grad_norm": 0.9842156607524565, + "learning_rate": 4.999977445712244e-05, + "loss": 2.2834, + "step": 406 + }, + { + "epoch": 0.03, + "grad_norm": 1.0710314643105574, + "learning_rate": 4.999974714228712e-05, + "loss": 2.1163, + "step": 407 + }, + { + "epoch": 0.03, + "grad_norm": 0.7490094165284531, + "learning_rate": 4.999971826661488e-05, + "loss": 2.0819, + "step": 408 + }, + { + "epoch": 0.03, + "grad_norm": 0.8996838239970464, + "learning_rate": 4.9999687830107534e-05, + "loss": 2.1127, + "step": 409 + }, + { + "epoch": 0.03, + "grad_norm": 0.825954463778069, + "learning_rate": 4.9999655832766966e-05, + "loss": 2.2581, + "step": 410 + }, + { + "epoch": 0.03, + "grad_norm": 0.8143067743139568, + "learning_rate": 4.9999622274595195e-05, + "loss": 2.0784, + "step": 411 + }, + { + "epoch": 0.03, + "grad_norm": 1.114100938748624, + "learning_rate": 4.9999587155594293e-05, + "loss": 2.0502, + "step": 412 + }, + { + "epoch": 0.03, + "grad_norm": 1.0480702527765249, + "learning_rate": 4.999955047576648e-05, + "loss": 2.0777, + "step": 413 + }, + { + "epoch": 0.03, + "grad_norm": 0.7625278706498451, + "learning_rate": 4.999951223511401e-05, + "loss": 2.2702, + "step": 414 + }, + { + "epoch": 0.03, + "grad_norm": 0.9427066331418958, + "learning_rate": 4.999947243363931e-05, + "loss": 2.1061, + "step": 415 + }, + { + "epoch": 0.03, + "grad_norm": 0.8437621763333893, + "learning_rate": 4.999943107134483e-05, + "loss": 2.1052, + "step": 416 + }, + { + "epoch": 0.03, + "grad_norm": 0.8712966312607735, + "learning_rate": 4.999938814823317e-05, + "loss": 2.1043, + "step": 417 + }, + { + "epoch": 0.03, + "grad_norm": 1.01293211230257, + "learning_rate": 4.999934366430702e-05, + "loss": 2.2984, + "step": 418 + }, + { + "epoch": 0.03, + "grad_norm": 1.0342316114886378, + "learning_rate": 4.999929761956913e-05, + "loss": 2.0889, + "step": 419 + }, + { + "epoch": 0.03, + "grad_norm": 0.8716933736049707, + "learning_rate": 4.9999250014022404e-05, + "loss": 2.0762, + "step": 420 + }, + { + "epoch": 0.03, + "grad_norm": 1.1097916543460231, + "learning_rate": 4.9999200847669795e-05, + "loss": 2.0562, + "step": 421 + }, + { + "epoch": 0.03, + "grad_norm": 0.9730110208891356, + "learning_rate": 4.999915012051437e-05, + "loss": 2.2521, + "step": 422 + }, + { + "epoch": 0.03, + "grad_norm": 0.8441898111175365, + "learning_rate": 4.9999097832559315e-05, + "loss": 2.1052, + "step": 423 + }, + { + "epoch": 0.03, + "grad_norm": 0.9843318050039119, + "learning_rate": 4.999904398380788e-05, + "loss": 2.0956, + "step": 424 + }, + { + "epoch": 0.03, + "grad_norm": 0.8381330910956554, + "learning_rate": 4.999898857426343e-05, + "loss": 2.0962, + "step": 425 + }, + { + "epoch": 0.03, + "grad_norm": 0.8238884586298355, + "learning_rate": 4.999893160392942e-05, + "loss": 2.2767, + "step": 426 + }, + { + "epoch": 0.03, + "grad_norm": 0.8387452914407116, + "learning_rate": 4.999887307280941e-05, + "loss": 2.0725, + "step": 427 + }, + { + "epoch": 0.03, + "grad_norm": 0.8065959427415655, + "learning_rate": 4.9998812980907066e-05, + "loss": 2.0702, + "step": 428 + }, + { + "epoch": 0.03, + "grad_norm": 0.9162430394232872, + "learning_rate": 4.999875132822612e-05, + "loss": 2.1245, + "step": 429 + }, + { + "epoch": 0.03, + "grad_norm": 0.7315352407959287, + "learning_rate": 4.9998688114770444e-05, + "loss": 2.2965, + "step": 430 + }, + { + "epoch": 0.03, + "grad_norm": 0.7526227424480181, + "learning_rate": 4.999862334054396e-05, + "loss": 2.0827, + "step": 431 + }, + { + "epoch": 0.03, + "grad_norm": 0.7982820462086706, + "learning_rate": 4.999855700555073e-05, + "loss": 2.083, + "step": 432 + }, + { + "epoch": 0.03, + "grad_norm": 0.8254111799129937, + "learning_rate": 4.9998489109794886e-05, + "loss": 2.0499, + "step": 433 + }, + { + "epoch": 0.03, + "grad_norm": 0.943682417353492, + "learning_rate": 4.999841965328068e-05, + "loss": 2.224, + "step": 434 + }, + { + "epoch": 0.03, + "grad_norm": 0.8211262943400262, + "learning_rate": 4.9998348636012425e-05, + "loss": 2.1633, + "step": 435 + }, + { + "epoch": 0.03, + "grad_norm": 0.894248237844582, + "learning_rate": 4.999827605799458e-05, + "loss": 2.0915, + "step": 436 + }, + { + "epoch": 0.03, + "grad_norm": 0.8498766710321624, + "learning_rate": 4.999820191923166e-05, + "loss": 2.1312, + "step": 437 + }, + { + "epoch": 0.03, + "grad_norm": 0.7753817699423873, + "learning_rate": 4.99981262197283e-05, + "loss": 2.2832, + "step": 438 + }, + { + "epoch": 0.03, + "grad_norm": 0.8617140213936059, + "learning_rate": 4.999804895948922e-05, + "loss": 2.0921, + "step": 439 + }, + { + "epoch": 0.03, + "grad_norm": 0.9247910752335331, + "learning_rate": 4.999797013851926e-05, + "loss": 2.0767, + "step": 440 + }, + { + "epoch": 0.03, + "grad_norm": 0.7291350620687566, + "learning_rate": 4.9997889756823316e-05, + "loss": 2.1226, + "step": 441 + }, + { + "epoch": 0.03, + "grad_norm": 0.8024549436229903, + "learning_rate": 4.999780781440643e-05, + "loss": 2.2603, + "step": 442 + }, + { + "epoch": 0.03, + "grad_norm": 0.8068057277949261, + "learning_rate": 4.999772431127371e-05, + "loss": 2.0221, + "step": 443 + }, + { + "epoch": 0.03, + "grad_norm": 0.7133210792997057, + "learning_rate": 4.999763924743036e-05, + "loss": 2.0713, + "step": 444 + }, + { + "epoch": 0.03, + "grad_norm": 0.7195987481563844, + "learning_rate": 4.99975526228817e-05, + "loss": 2.1268, + "step": 445 + }, + { + "epoch": 0.03, + "grad_norm": 0.8302340811057716, + "learning_rate": 4.9997464437633144e-05, + "loss": 2.2817, + "step": 446 + }, + { + "epoch": 0.03, + "grad_norm": 0.7290367502371367, + "learning_rate": 4.9997374691690194e-05, + "loss": 2.1175, + "step": 447 + }, + { + "epoch": 0.03, + "grad_norm": 0.7687906770699118, + "learning_rate": 4.9997283385058444e-05, + "loss": 2.0849, + "step": 448 + }, + { + "epoch": 0.03, + "grad_norm": 0.8398500397240121, + "learning_rate": 4.99971905177436e-05, + "loss": 2.0612, + "step": 449 + }, + { + "epoch": 0.03, + "grad_norm": 0.8304813759530085, + "learning_rate": 4.999709608975146e-05, + "loss": 2.2927, + "step": 450 + }, + { + "epoch": 0.03, + "grad_norm": 0.8440126339882029, + "learning_rate": 4.999700010108792e-05, + "loss": 2.06, + "step": 451 + }, + { + "epoch": 0.03, + "grad_norm": 0.9082574090154942, + "learning_rate": 4.999690255175898e-05, + "loss": 2.1177, + "step": 452 + }, + { + "epoch": 0.03, + "grad_norm": 0.7700450496803274, + "learning_rate": 4.9996803441770715e-05, + "loss": 2.0007, + "step": 453 + }, + { + "epoch": 0.04, + "grad_norm": 0.8853145379477192, + "learning_rate": 4.9996702771129336e-05, + "loss": 2.3271, + "step": 454 + }, + { + "epoch": 0.04, + "grad_norm": 0.8751604240901071, + "learning_rate": 4.9996600539841096e-05, + "loss": 2.0703, + "step": 455 + }, + { + "epoch": 0.04, + "grad_norm": 0.7817455875133111, + "learning_rate": 4.9996496747912404e-05, + "loss": 2.055, + "step": 456 + }, + { + "epoch": 0.04, + "grad_norm": 0.8914872877842717, + "learning_rate": 4.999639139534973e-05, + "loss": 2.0885, + "step": 457 + }, + { + "epoch": 0.04, + "grad_norm": 0.8627105516470674, + "learning_rate": 4.999628448215966e-05, + "loss": 2.2488, + "step": 458 + }, + { + "epoch": 0.04, + "grad_norm": 0.94988901800165, + "learning_rate": 4.999617600834886e-05, + "loss": 2.0616, + "step": 459 + }, + { + "epoch": 0.04, + "grad_norm": 0.7449114684799212, + "learning_rate": 4.99960659739241e-05, + "loss": 2.1143, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 0.8984075913582436, + "learning_rate": 4.999595437889226e-05, + "loss": 2.0737, + "step": 461 + }, + { + "epoch": 0.04, + "grad_norm": 0.9637318125513891, + "learning_rate": 4.99958412232603e-05, + "loss": 2.2363, + "step": 462 + }, + { + "epoch": 0.04, + "grad_norm": 0.734514391471459, + "learning_rate": 4.999572650703528e-05, + "loss": 2.0785, + "step": 463 + }, + { + "epoch": 0.04, + "grad_norm": 0.9640707929274376, + "learning_rate": 4.9995610230224375e-05, + "loss": 2.047, + "step": 464 + }, + { + "epoch": 0.04, + "grad_norm": 0.958605900611534, + "learning_rate": 4.9995492392834844e-05, + "loss": 2.1262, + "step": 465 + }, + { + "epoch": 0.04, + "grad_norm": 0.7942870212565192, + "learning_rate": 4.999537299487403e-05, + "loss": 2.155, + "step": 466 + }, + { + "epoch": 0.04, + "grad_norm": 1.0091026436455823, + "learning_rate": 4.9995252036349403e-05, + "loss": 2.2253, + "step": 467 + }, + { + "epoch": 0.04, + "grad_norm": 0.7574886362522087, + "learning_rate": 4.999512951726851e-05, + "loss": 2.0819, + "step": 468 + }, + { + "epoch": 0.04, + "grad_norm": 0.9080976371587154, + "learning_rate": 4.999500543763899e-05, + "loss": 2.0416, + "step": 469 + }, + { + "epoch": 0.04, + "grad_norm": 0.842101937580771, + "learning_rate": 4.9994879797468605e-05, + "loss": 2.216, + "step": 470 + }, + { + "epoch": 0.04, + "grad_norm": 0.7974925990151202, + "learning_rate": 4.999475259676519e-05, + "loss": 2.0701, + "step": 471 + }, + { + "epoch": 0.04, + "grad_norm": 0.9395515818029764, + "learning_rate": 4.999462383553669e-05, + "loss": 2.0932, + "step": 472 + }, + { + "epoch": 0.04, + "grad_norm": 0.7662249798342431, + "learning_rate": 4.999449351379115e-05, + "loss": 2.0613, + "step": 473 + }, + { + "epoch": 0.04, + "grad_norm": 0.9843705290372647, + "learning_rate": 4.999436163153669e-05, + "loss": 2.2254, + "step": 474 + }, + { + "epoch": 0.04, + "grad_norm": 0.7764718211067133, + "learning_rate": 4.999422818878155e-05, + "loss": 2.0745, + "step": 475 + }, + { + "epoch": 0.04, + "grad_norm": 0.7954718879826685, + "learning_rate": 4.999409318553408e-05, + "loss": 2.0587, + "step": 476 + }, + { + "epoch": 0.04, + "grad_norm": 0.8859752029518048, + "learning_rate": 4.999395662180269e-05, + "loss": 2.0857, + "step": 477 + }, + { + "epoch": 0.04, + "grad_norm": 0.7225096838504672, + "learning_rate": 4.99938184975959e-05, + "loss": 2.2386, + "step": 478 + }, + { + "epoch": 0.04, + "grad_norm": 0.827492685348671, + "learning_rate": 4.9993678812922356e-05, + "loss": 2.1525, + "step": 479 + }, + { + "epoch": 0.04, + "grad_norm": 0.7561700992463559, + "learning_rate": 4.999353756779076e-05, + "loss": 2.0558, + "step": 480 + }, + { + "epoch": 0.04, + "grad_norm": 0.803135801979098, + "learning_rate": 4.999339476220994e-05, + "loss": 2.0807, + "step": 481 + }, + { + "epoch": 0.04, + "grad_norm": 0.8385660241106662, + "learning_rate": 4.999325039618881e-05, + "loss": 2.2535, + "step": 482 + }, + { + "epoch": 0.04, + "grad_norm": 0.7978445504286632, + "learning_rate": 4.999310446973638e-05, + "loss": 2.0686, + "step": 483 + }, + { + "epoch": 0.04, + "grad_norm": 0.8051040540770208, + "learning_rate": 4.999295698286177e-05, + "loss": 2.1121, + "step": 484 + }, + { + "epoch": 0.04, + "grad_norm": 0.8936295716752525, + "learning_rate": 4.999280793557418e-05, + "loss": 2.1334, + "step": 485 + }, + { + "epoch": 0.04, + "grad_norm": 0.7439477016178776, + "learning_rate": 4.999265732788292e-05, + "loss": 2.2998, + "step": 486 + }, + { + "epoch": 0.04, + "grad_norm": 0.7612729488072276, + "learning_rate": 4.999250515979739e-05, + "loss": 2.0642, + "step": 487 + }, + { + "epoch": 0.04, + "grad_norm": 0.823381507772441, + "learning_rate": 4.999235143132708e-05, + "loss": 2.0587, + "step": 488 + }, + { + "epoch": 0.04, + "grad_norm": 0.8818117233228686, + "learning_rate": 4.999219614248161e-05, + "loss": 2.1089, + "step": 489 + }, + { + "epoch": 0.04, + "grad_norm": 0.7946174227550551, + "learning_rate": 4.9992039293270655e-05, + "loss": 2.179, + "step": 490 + }, + { + "epoch": 0.04, + "grad_norm": 0.808321536214124, + "learning_rate": 4.999188088370402e-05, + "loss": 2.1266, + "step": 491 + }, + { + "epoch": 0.04, + "grad_norm": 1.0304105346551413, + "learning_rate": 4.9991720913791604e-05, + "loss": 2.0502, + "step": 492 + }, + { + "epoch": 0.04, + "grad_norm": 0.8010317810616022, + "learning_rate": 4.9991559383543375e-05, + "loss": 2.0472, + "step": 493 + }, + { + "epoch": 0.04, + "grad_norm": 1.1288761195489778, + "learning_rate": 4.999139629296942e-05, + "loss": 2.2365, + "step": 494 + }, + { + "epoch": 0.04, + "grad_norm": 1.0966092443581594, + "learning_rate": 4.999123164207993e-05, + "loss": 2.1008, + "step": 495 + }, + { + "epoch": 0.04, + "grad_norm": 0.872445613599377, + "learning_rate": 4.999106543088519e-05, + "loss": 2.0911, + "step": 496 + }, + { + "epoch": 0.04, + "grad_norm": 0.8399811132152307, + "learning_rate": 4.999089765939556e-05, + "loss": 2.0984, + "step": 497 + }, + { + "epoch": 0.04, + "grad_norm": 0.8157452182265927, + "learning_rate": 4.999072832762153e-05, + "loss": 2.0636, + "step": 498 + }, + { + "epoch": 0.04, + "grad_norm": 0.7913392243514016, + "learning_rate": 4.999055743557366e-05, + "loss": 2.2203, + "step": 499 + }, + { + "epoch": 0.04, + "grad_norm": 0.9573702209199872, + "learning_rate": 4.999038498326263e-05, + "loss": 2.0432, + "step": 500 + }, + { + "epoch": 0.04, + "grad_norm": 0.8972903977893034, + "learning_rate": 4.9990210970699204e-05, + "loss": 2.1009, + "step": 501 + }, + { + "epoch": 0.04, + "grad_norm": 0.8210524161654924, + "learning_rate": 4.999003539789424e-05, + "loss": 2.2795, + "step": 502 + }, + { + "epoch": 0.04, + "grad_norm": 0.7701369965347726, + "learning_rate": 4.9989858264858704e-05, + "loss": 2.1121, + "step": 503 + }, + { + "epoch": 0.04, + "grad_norm": 0.8218291496648268, + "learning_rate": 4.998967957160365e-05, + "loss": 2.0224, + "step": 504 + }, + { + "epoch": 0.04, + "grad_norm": 0.7381038693476569, + "learning_rate": 4.998949931814025e-05, + "loss": 2.0335, + "step": 505 + }, + { + "epoch": 0.04, + "grad_norm": 1.0549697916733558, + "learning_rate": 4.998931750447975e-05, + "loss": 2.2512, + "step": 506 + }, + { + "epoch": 0.04, + "grad_norm": 0.862611490633934, + "learning_rate": 4.99891341306335e-05, + "loss": 2.0708, + "step": 507 + }, + { + "epoch": 0.04, + "grad_norm": 0.8681370223790387, + "learning_rate": 4.998894919661294e-05, + "loss": 2.0249, + "step": 508 + }, + { + "epoch": 0.04, + "grad_norm": 0.8435041001435267, + "learning_rate": 4.998876270242963e-05, + "loss": 2.1056, + "step": 509 + }, + { + "epoch": 0.04, + "grad_norm": 0.7242803843850976, + "learning_rate": 4.998857464809521e-05, + "loss": 2.1529, + "step": 510 + }, + { + "epoch": 0.04, + "grad_norm": 0.8768372330042844, + "learning_rate": 4.998838503362141e-05, + "loss": 2.2559, + "step": 511 + }, + { + "epoch": 0.04, + "grad_norm": 0.9657078745831267, + "learning_rate": 4.998819385902008e-05, + "loss": 2.0714, + "step": 512 + }, + { + "epoch": 0.04, + "grad_norm": 0.779099751623594, + "learning_rate": 4.998800112430316e-05, + "loss": 2.035, + "step": 513 + }, + { + "epoch": 0.04, + "grad_norm": 0.835395994110178, + "learning_rate": 4.9987806829482674e-05, + "loss": 2.2612, + "step": 514 + }, + { + "epoch": 0.04, + "grad_norm": 0.8113856584369281, + "learning_rate": 4.9987610974570755e-05, + "loss": 2.0451, + "step": 515 + }, + { + "epoch": 0.04, + "grad_norm": 0.7372958763898732, + "learning_rate": 4.9987413559579636e-05, + "loss": 2.1364, + "step": 516 + }, + { + "epoch": 0.04, + "grad_norm": 0.9354273122997846, + "learning_rate": 4.998721458452163e-05, + "loss": 2.0553, + "step": 517 + }, + { + "epoch": 0.04, + "grad_norm": 1.054942908460834, + "learning_rate": 4.9987014049409166e-05, + "loss": 2.2769, + "step": 518 + }, + { + "epoch": 0.04, + "grad_norm": 0.7067115783890964, + "learning_rate": 4.998681195425477e-05, + "loss": 2.1116, + "step": 519 + }, + { + "epoch": 0.04, + "grad_norm": 0.8711277437669996, + "learning_rate": 4.998660829907105e-05, + "loss": 2.0566, + "step": 520 + }, + { + "epoch": 0.04, + "grad_norm": 0.7411338331491617, + "learning_rate": 4.998640308387074e-05, + "loss": 2.0834, + "step": 521 + }, + { + "epoch": 0.04, + "grad_norm": 0.7683344720794699, + "learning_rate": 4.998619630866662e-05, + "loss": 2.0922, + "step": 522 + }, + { + "epoch": 0.04, + "grad_norm": 0.910371811868455, + "learning_rate": 4.998598797347163e-05, + "loss": 2.2432, + "step": 523 + }, + { + "epoch": 0.04, + "grad_norm": 0.8449364689376, + "learning_rate": 4.998577807829876e-05, + "loss": 2.1126, + "step": 524 + }, + { + "epoch": 0.04, + "grad_norm": 0.8074557482892698, + "learning_rate": 4.998556662316113e-05, + "loss": 2.0238, + "step": 525 + }, + { + "epoch": 0.04, + "grad_norm": 0.8265954745821756, + "learning_rate": 4.9985353608071924e-05, + "loss": 2.2513, + "step": 526 + }, + { + "epoch": 0.04, + "grad_norm": 0.9321021662758424, + "learning_rate": 4.998513903304445e-05, + "loss": 2.0778, + "step": 527 + }, + { + "epoch": 0.04, + "grad_norm": 0.7223864797102112, + "learning_rate": 4.998492289809211e-05, + "loss": 2.097, + "step": 528 + }, + { + "epoch": 0.04, + "grad_norm": 0.8450822121312791, + "learning_rate": 4.998470520322839e-05, + "loss": 2.0525, + "step": 529 + }, + { + "epoch": 0.04, + "grad_norm": 0.8239669241930608, + "learning_rate": 4.9984485948466885e-05, + "loss": 2.0569, + "step": 530 + }, + { + "epoch": 0.04, + "grad_norm": 0.7194351685241752, + "learning_rate": 4.9984265133821285e-05, + "loss": 2.2723, + "step": 531 + }, + { + "epoch": 0.04, + "grad_norm": 0.7797691788690546, + "learning_rate": 4.9984042759305375e-05, + "loss": 2.081, + "step": 532 + }, + { + "epoch": 0.04, + "grad_norm": 0.7645428016155795, + "learning_rate": 4.9983818824933034e-05, + "loss": 2.0396, + "step": 533 + }, + { + "epoch": 0.04, + "grad_norm": 0.7905174111791649, + "learning_rate": 4.998359333071825e-05, + "loss": 2.0939, + "step": 534 + }, + { + "epoch": 0.04, + "grad_norm": 0.8112360346491397, + "learning_rate": 4.99833662766751e-05, + "loss": 2.2677, + "step": 535 + }, + { + "epoch": 0.04, + "grad_norm": 0.7480168307173948, + "learning_rate": 4.9983137662817756e-05, + "loss": 2.0835, + "step": 536 + }, + { + "epoch": 0.04, + "grad_norm": 0.7638784106124105, + "learning_rate": 4.9982907489160495e-05, + "loss": 2.0679, + "step": 537 + }, + { + "epoch": 0.04, + "grad_norm": 0.7446809569697774, + "learning_rate": 4.998267575571769e-05, + "loss": 2.2101, + "step": 538 + }, + { + "epoch": 0.04, + "grad_norm": 0.8256424001709195, + "learning_rate": 4.99824424625038e-05, + "loss": 2.0693, + "step": 539 + }, + { + "epoch": 0.04, + "grad_norm": 1.0535945192683556, + "learning_rate": 4.99822076095334e-05, + "loss": 2.0769, + "step": 540 + }, + { + "epoch": 0.04, + "grad_norm": 0.7751979279518619, + "learning_rate": 4.9981971196821154e-05, + "loss": 2.0937, + "step": 541 + }, + { + "epoch": 0.04, + "grad_norm": 0.8516367696336125, + "learning_rate": 4.998173322438181e-05, + "loss": 2.0542, + "step": 542 + }, + { + "epoch": 0.04, + "grad_norm": 0.8712691538460843, + "learning_rate": 4.998149369223024e-05, + "loss": 2.2249, + "step": 543 + }, + { + "epoch": 0.04, + "grad_norm": 0.798473658618515, + "learning_rate": 4.9981252600381396e-05, + "loss": 2.1002, + "step": 544 + }, + { + "epoch": 0.04, + "grad_norm": 0.9008487150199367, + "learning_rate": 4.998100994885032e-05, + "loss": 2.0498, + "step": 545 + }, + { + "epoch": 0.04, + "grad_norm": 0.9204863140701205, + "learning_rate": 4.9980765737652166e-05, + "loss": 2.2054, + "step": 546 + }, + { + "epoch": 0.04, + "grad_norm": 0.734580972708866, + "learning_rate": 4.9980519966802185e-05, + "loss": 2.0943, + "step": 547 + }, + { + "epoch": 0.04, + "grad_norm": 0.8603391453781783, + "learning_rate": 4.998027263631573e-05, + "loss": 2.0992, + "step": 548 + }, + { + "epoch": 0.04, + "grad_norm": 1.0484663923127682, + "learning_rate": 4.998002374620821e-05, + "loss": 2.0548, + "step": 549 + }, + { + "epoch": 0.04, + "grad_norm": 0.8023831852440654, + "learning_rate": 4.9979773296495215e-05, + "loss": 2.0565, + "step": 550 + }, + { + "epoch": 0.04, + "grad_norm": 1.1112580955384295, + "learning_rate": 4.997952128719233e-05, + "loss": 2.2747, + "step": 551 + }, + { + "epoch": 0.04, + "grad_norm": 0.841365712502595, + "learning_rate": 4.997926771831533e-05, + "loss": 2.0796, + "step": 552 + }, + { + "epoch": 0.04, + "grad_norm": 1.0290392672159288, + "learning_rate": 4.9979012589880016e-05, + "loss": 2.1119, + "step": 553 + }, + { + "epoch": 0.04, + "grad_norm": 0.8556226930497965, + "learning_rate": 4.997875590190233e-05, + "loss": 2.032, + "step": 554 + }, + { + "epoch": 0.04, + "grad_norm": 0.8675387493299548, + "learning_rate": 4.997849765439831e-05, + "loss": 2.2422, + "step": 555 + }, + { + "epoch": 0.04, + "grad_norm": 0.8380065173917949, + "learning_rate": 4.9978237847384056e-05, + "loss": 2.0659, + "step": 556 + }, + { + "epoch": 0.04, + "grad_norm": 0.9092738694777557, + "learning_rate": 4.997797648087581e-05, + "loss": 2.0318, + "step": 557 + }, + { + "epoch": 0.04, + "grad_norm": 0.8886722847886525, + "learning_rate": 4.997771355488987e-05, + "loss": 2.2693, + "step": 558 + }, + { + "epoch": 0.04, + "grad_norm": 0.8860830104895143, + "learning_rate": 4.997744906944267e-05, + "loss": 2.1173, + "step": 559 + }, + { + "epoch": 0.04, + "grad_norm": 0.7234059020940159, + "learning_rate": 4.997718302455071e-05, + "loss": 2.0836, + "step": 560 + }, + { + "epoch": 0.04, + "grad_norm": 0.8871051934248314, + "learning_rate": 4.99769154202306e-05, + "loss": 2.0477, + "step": 561 + }, + { + "epoch": 0.04, + "grad_norm": 0.7697819244263002, + "learning_rate": 4.997664625649906e-05, + "loss": 2.0255, + "step": 562 + }, + { + "epoch": 0.04, + "grad_norm": 0.8989888184792155, + "learning_rate": 4.997637553337289e-05, + "loss": 2.2321, + "step": 563 + }, + { + "epoch": 0.04, + "grad_norm": 0.9919396328084688, + "learning_rate": 4.997610325086898e-05, + "loss": 2.0319, + "step": 564 + }, + { + "epoch": 0.04, + "grad_norm": 0.7548152415429998, + "learning_rate": 4.997582940900435e-05, + "loss": 2.1393, + "step": 565 + }, + { + "epoch": 0.04, + "grad_norm": 1.1222777399950583, + "learning_rate": 4.9975554007796075e-05, + "loss": 2.0713, + "step": 566 + }, + { + "epoch": 0.04, + "grad_norm": 0.7481217633436353, + "learning_rate": 4.997527704726137e-05, + "loss": 2.2721, + "step": 567 + }, + { + "epoch": 0.04, + "grad_norm": 1.0241147818502496, + "learning_rate": 4.9974998527417516e-05, + "loss": 2.0464, + "step": 568 + }, + { + "epoch": 0.04, + "grad_norm": 0.9176588570449181, + "learning_rate": 4.9974718448281905e-05, + "loss": 2.0913, + "step": 569 + }, + { + "epoch": 0.04, + "grad_norm": 0.9124820593264301, + "learning_rate": 4.997443680987202e-05, + "loss": 2.2672, + "step": 570 + }, + { + "epoch": 0.04, + "grad_norm": 1.0638159802716098, + "learning_rate": 4.9974153612205455e-05, + "loss": 2.0846, + "step": 571 + }, + { + "epoch": 0.04, + "grad_norm": 0.746867800549365, + "learning_rate": 4.997386885529988e-05, + "loss": 2.0898, + "step": 572 + }, + { + "epoch": 0.04, + "grad_norm": 0.8165204294101939, + "learning_rate": 4.997358253917307e-05, + "loss": 2.0445, + "step": 573 + }, + { + "epoch": 0.04, + "grad_norm": 0.9051826744388111, + "learning_rate": 4.9973294663842904e-05, + "loss": 2.1151, + "step": 574 + }, + { + "epoch": 0.04, + "grad_norm": 0.7631780637342404, + "learning_rate": 4.997300522932737e-05, + "loss": 2.202, + "step": 575 + }, + { + "epoch": 0.04, + "grad_norm": 0.9389529809623927, + "learning_rate": 4.997271423564453e-05, + "loss": 2.092, + "step": 576 + }, + { + "epoch": 0.04, + "grad_norm": 0.7585799286700476, + "learning_rate": 4.9972421682812544e-05, + "loss": 2.0455, + "step": 577 + }, + { + "epoch": 0.04, + "grad_norm": 0.8323795123262308, + "learning_rate": 4.9972127570849685e-05, + "loss": 2.1076, + "step": 578 + }, + { + "epoch": 0.04, + "grad_norm": 0.7759156795048144, + "learning_rate": 4.9971831899774314e-05, + "loss": 2.2282, + "step": 579 + }, + { + "epoch": 0.04, + "grad_norm": 0.7526626135025627, + "learning_rate": 4.997153466960489e-05, + "loss": 2.0582, + "step": 580 + }, + { + "epoch": 0.04, + "grad_norm": 0.7387212144657123, + "learning_rate": 4.997123588035997e-05, + "loss": 2.0538, + "step": 581 + }, + { + "epoch": 0.04, + "grad_norm": 0.7232414088638426, + "learning_rate": 4.997093553205822e-05, + "loss": 2.0179, + "step": 582 + }, + { + "epoch": 0.04, + "grad_norm": 0.8764033237826514, + "learning_rate": 4.9970633624718365e-05, + "loss": 2.2522, + "step": 583 + }, + { + "epoch": 0.05, + "grad_norm": 0.7979401619614107, + "learning_rate": 4.997033015835928e-05, + "loss": 2.1133, + "step": 584 + }, + { + "epoch": 0.05, + "grad_norm": 0.8163702227581208, + "learning_rate": 4.99700251329999e-05, + "loss": 2.0159, + "step": 585 + }, + { + "epoch": 0.05, + "grad_norm": 1.0210158783867431, + "learning_rate": 4.996971854865927e-05, + "loss": 2.0582, + "step": 586 + }, + { + "epoch": 0.05, + "grad_norm": 0.9062636761679665, + "learning_rate": 4.996941040535653e-05, + "loss": 2.2816, + "step": 587 + }, + { + "epoch": 0.05, + "grad_norm": 0.7584401047861605, + "learning_rate": 4.9969100703110925e-05, + "loss": 2.0102, + "step": 588 + }, + { + "epoch": 0.05, + "grad_norm": 0.8224547912898316, + "learning_rate": 4.99687894419418e-05, + "loss": 2.0369, + "step": 589 + }, + { + "epoch": 0.05, + "grad_norm": 0.7323426756947437, + "learning_rate": 4.996847662186856e-05, + "loss": 2.1031, + "step": 590 + }, + { + "epoch": 0.05, + "grad_norm": 0.7679761335970001, + "learning_rate": 4.9968162242910757e-05, + "loss": 2.1985, + "step": 591 + }, + { + "epoch": 0.05, + "grad_norm": 0.8608716612778514, + "learning_rate": 4.9967846305088005e-05, + "loss": 2.0491, + "step": 592 + }, + { + "epoch": 0.05, + "grad_norm": 0.7686714367519798, + "learning_rate": 4.996752880842005e-05, + "loss": 2.0833, + "step": 593 + }, + { + "epoch": 0.05, + "grad_norm": 0.8174475967419659, + "learning_rate": 4.9967209752926695e-05, + "loss": 2.03, + "step": 594 + }, + { + "epoch": 0.05, + "grad_norm": 0.8003719052594509, + "learning_rate": 4.996688913862787e-05, + "loss": 2.2281, + "step": 595 + }, + { + "epoch": 0.05, + "grad_norm": 0.6971591958556124, + "learning_rate": 4.9966566965543594e-05, + "loss": 2.1018, + "step": 596 + }, + { + "epoch": 0.05, + "grad_norm": 0.8152650553138222, + "learning_rate": 4.9966243233693965e-05, + "loss": 2.0052, + "step": 597 + }, + { + "epoch": 0.05, + "grad_norm": 0.7612005329221397, + "learning_rate": 4.996591794309922e-05, + "loss": 2.0542, + "step": 598 + }, + { + "epoch": 0.05, + "grad_norm": 0.8592121253849749, + "learning_rate": 4.996559109377965e-05, + "loss": 2.2939, + "step": 599 + }, + { + "epoch": 0.05, + "grad_norm": 0.7527600388607134, + "learning_rate": 4.996526268575566e-05, + "loss": 2.0373, + "step": 600 + }, + { + "epoch": 0.05, + "grad_norm": 0.8181432522229931, + "learning_rate": 4.9964932719047774e-05, + "loss": 2.0315, + "step": 601 + }, + { + "epoch": 0.05, + "grad_norm": 0.728095249099527, + "learning_rate": 4.996460119367657e-05, + "loss": 2.247, + "step": 602 + }, + { + "epoch": 0.05, + "grad_norm": 0.823861225954546, + "learning_rate": 4.996426810966276e-05, + "loss": 2.1257, + "step": 603 + }, + { + "epoch": 0.05, + "grad_norm": 0.8566471527004964, + "learning_rate": 4.9963933467027135e-05, + "loss": 2.0245, + "step": 604 + }, + { + "epoch": 0.05, + "grad_norm": 0.9018949958093674, + "learning_rate": 4.996359726579059e-05, + "loss": 2.0317, + "step": 605 + }, + { + "epoch": 0.05, + "grad_norm": 0.6934804251111074, + "learning_rate": 4.996325950597411e-05, + "loss": 2.0841, + "step": 606 + }, + { + "epoch": 0.05, + "grad_norm": 1.0238246791964976, + "learning_rate": 4.9962920187598794e-05, + "loss": 2.2802, + "step": 607 + }, + { + "epoch": 0.05, + "grad_norm": 0.7658273860821063, + "learning_rate": 4.996257931068582e-05, + "loss": 1.9985, + "step": 608 + }, + { + "epoch": 0.05, + "grad_norm": 0.8999920583122112, + "learning_rate": 4.996223687525647e-05, + "loss": 2.1551, + "step": 609 + }, + { + "epoch": 0.05, + "grad_norm": 0.7115632169492504, + "learning_rate": 4.9961892881332115e-05, + "loss": 2.0671, + "step": 610 + }, + { + "epoch": 0.05, + "grad_norm": 0.844916054757255, + "learning_rate": 4.9961547328934255e-05, + "loss": 2.2263, + "step": 611 + }, + { + "epoch": 0.05, + "grad_norm": 0.979185090718177, + "learning_rate": 4.996120021808445e-05, + "loss": 2.0705, + "step": 612 + }, + { + "epoch": 0.05, + "grad_norm": 0.8409447018634192, + "learning_rate": 4.996085154880437e-05, + "loss": 2.0975, + "step": 613 + }, + { + "epoch": 0.05, + "grad_norm": 0.9493599629933573, + "learning_rate": 4.9960501321115774e-05, + "loss": 2.0209, + "step": 614 + }, + { + "epoch": 0.05, + "grad_norm": 1.0160571035520876, + "learning_rate": 4.9960149535040555e-05, + "loss": 2.2452, + "step": 615 + }, + { + "epoch": 0.05, + "grad_norm": 0.8700336432888692, + "learning_rate": 4.995979619060065e-05, + "loss": 2.0839, + "step": 616 + }, + { + "epoch": 0.05, + "grad_norm": 0.9711781661645156, + "learning_rate": 4.995944128781814e-05, + "loss": 2.017, + "step": 617 + }, + { + "epoch": 0.05, + "grad_norm": 0.7326138468653915, + "learning_rate": 4.995908482671517e-05, + "loss": 2.0629, + "step": 618 + }, + { + "epoch": 0.05, + "grad_norm": 0.9086365312372525, + "learning_rate": 4.9958726807314005e-05, + "loss": 2.2077, + "step": 619 + }, + { + "epoch": 0.05, + "grad_norm": 0.7792561192390194, + "learning_rate": 4.995836722963699e-05, + "loss": 2.0191, + "step": 620 + }, + { + "epoch": 0.05, + "grad_norm": 0.7027386571563832, + "learning_rate": 4.995800609370658e-05, + "loss": 2.0948, + "step": 621 + }, + { + "epoch": 0.05, + "grad_norm": 0.8441521648038912, + "learning_rate": 4.9957643399545314e-05, + "loss": 2.031, + "step": 622 + }, + { + "epoch": 0.05, + "grad_norm": 0.7996128938507705, + "learning_rate": 4.995727914717584e-05, + "loss": 2.2353, + "step": 623 + }, + { + "epoch": 0.05, + "grad_norm": 0.8796829820860176, + "learning_rate": 4.9956913336620914e-05, + "loss": 2.0402, + "step": 624 + }, + { + "epoch": 0.05, + "grad_norm": 0.7531181917265757, + "learning_rate": 4.995654596790335e-05, + "loss": 2.0885, + "step": 625 + }, + { + "epoch": 0.05, + "grad_norm": 0.7631970803856661, + "learning_rate": 4.9956177041046104e-05, + "loss": 2.0634, + "step": 626 + }, + { + "epoch": 0.05, + "grad_norm": 0.8219772969939948, + "learning_rate": 4.995580655607221e-05, + "loss": 2.2877, + "step": 627 + }, + { + "epoch": 0.05, + "grad_norm": 0.7625813947059649, + "learning_rate": 4.995543451300477e-05, + "loss": 2.0404, + "step": 628 + }, + { + "epoch": 0.05, + "grad_norm": 0.9709920769976625, + "learning_rate": 4.995506091186705e-05, + "loss": 2.0751, + "step": 629 + }, + { + "epoch": 0.05, + "grad_norm": 0.749938883680589, + "learning_rate": 4.995468575268235e-05, + "loss": 2.0558, + "step": 630 + }, + { + "epoch": 0.05, + "grad_norm": 0.8616586664600657, + "learning_rate": 4.9954309035474114e-05, + "loss": 2.2479, + "step": 631 + }, + { + "epoch": 0.05, + "grad_norm": 0.7260712477242162, + "learning_rate": 4.9953930760265845e-05, + "loss": 2.0243, + "step": 632 + }, + { + "epoch": 0.05, + "grad_norm": 0.7800301390801145, + "learning_rate": 4.995355092708116e-05, + "loss": 2.0381, + "step": 633 + }, + { + "epoch": 0.05, + "grad_norm": 0.736937019428342, + "learning_rate": 4.9953169535943783e-05, + "loss": 2.111, + "step": 634 + }, + { + "epoch": 0.05, + "grad_norm": 0.7440417604194494, + "learning_rate": 4.995278658687752e-05, + "loss": 2.2512, + "step": 635 + }, + { + "epoch": 0.05, + "grad_norm": 0.9139757044768461, + "learning_rate": 4.995240207990629e-05, + "loss": 2.0686, + "step": 636 + }, + { + "epoch": 0.05, + "grad_norm": 0.8458488429428564, + "learning_rate": 4.995201601505408e-05, + "loss": 2.0943, + "step": 637 + }, + { + "epoch": 0.05, + "grad_norm": 0.8822602846091177, + "learning_rate": 4.9951628392345005e-05, + "loss": 2.083, + "step": 638 + }, + { + "epoch": 0.05, + "grad_norm": 0.751130550292683, + "learning_rate": 4.9951239211803276e-05, + "loss": 2.2268, + "step": 639 + }, + { + "epoch": 0.05, + "grad_norm": 0.9418203537671516, + "learning_rate": 4.995084847345317e-05, + "loss": 2.1336, + "step": 640 + }, + { + "epoch": 0.05, + "grad_norm": 0.8359962392838421, + "learning_rate": 4.9950456177319106e-05, + "loss": 2.0612, + "step": 641 + }, + { + "epoch": 0.05, + "grad_norm": 0.803935196710273, + "learning_rate": 4.995006232342555e-05, + "loss": 2.039, + "step": 642 + }, + { + "epoch": 0.05, + "grad_norm": 0.7842009919222735, + "learning_rate": 4.994966691179711e-05, + "loss": 2.2302, + "step": 643 + }, + { + "epoch": 0.05, + "grad_norm": 0.7721257625274404, + "learning_rate": 4.9949269942458474e-05, + "loss": 2.1023, + "step": 644 + }, + { + "epoch": 0.05, + "grad_norm": 0.7282385906513733, + "learning_rate": 4.994887141543442e-05, + "loss": 2.0157, + "step": 645 + }, + { + "epoch": 0.05, + "grad_norm": 0.7311720520935104, + "learning_rate": 4.9948471330749824e-05, + "loss": 2.0833, + "step": 646 + }, + { + "epoch": 0.05, + "grad_norm": 0.6827996303665989, + "learning_rate": 4.994806968842968e-05, + "loss": 2.2676, + "step": 647 + }, + { + "epoch": 0.05, + "grad_norm": 0.7254043660775278, + "learning_rate": 4.994766648849904e-05, + "loss": 2.0642, + "step": 648 + }, + { + "epoch": 0.05, + "grad_norm": 0.7423240797752403, + "learning_rate": 4.994726173098312e-05, + "loss": 2.0143, + "step": 649 + }, + { + "epoch": 0.05, + "grad_norm": 0.7168180420701663, + "learning_rate": 4.994685541590715e-05, + "loss": 2.0438, + "step": 650 + }, + { + "epoch": 0.05, + "grad_norm": 0.7072408166290011, + "learning_rate": 4.99464475432965e-05, + "loss": 2.2326, + "step": 651 + }, + { + "epoch": 0.05, + "grad_norm": 0.7075539052966868, + "learning_rate": 4.994603811317667e-05, + "loss": 2.1136, + "step": 652 + }, + { + "epoch": 0.05, + "grad_norm": 0.8316250582793244, + "learning_rate": 4.994562712557319e-05, + "loss": 2.0761, + "step": 653 + }, + { + "epoch": 0.05, + "grad_norm": 0.7940173536652535, + "learning_rate": 4.994521458051172e-05, + "loss": 2.0333, + "step": 654 + }, + { + "epoch": 0.05, + "grad_norm": 0.8849979608600422, + "learning_rate": 4.994480047801804e-05, + "loss": 2.2608, + "step": 655 + }, + { + "epoch": 0.05, + "grad_norm": 0.6930111884757989, + "learning_rate": 4.9944384818117985e-05, + "loss": 2.0892, + "step": 656 + }, + { + "epoch": 0.05, + "grad_norm": 0.8911855013612321, + "learning_rate": 4.994396760083751e-05, + "loss": 2.0042, + "step": 657 + }, + { + "epoch": 0.05, + "grad_norm": 0.8295575785003704, + "learning_rate": 4.994354882620267e-05, + "loss": 1.9972, + "step": 658 + }, + { + "epoch": 0.05, + "grad_norm": 0.7464702859065359, + "learning_rate": 4.994312849423961e-05, + "loss": 2.2733, + "step": 659 + }, + { + "epoch": 0.05, + "grad_norm": 0.8064579456158069, + "learning_rate": 4.9942706604974555e-05, + "loss": 2.0438, + "step": 660 + }, + { + "epoch": 0.05, + "grad_norm": 0.7902967432062032, + "learning_rate": 4.9942283158433865e-05, + "loss": 2.0561, + "step": 661 + }, + { + "epoch": 0.05, + "grad_norm": 0.7873763760899675, + "learning_rate": 4.994185815464398e-05, + "loss": 2.0291, + "step": 662 + }, + { + "epoch": 0.05, + "grad_norm": 0.8795987679934034, + "learning_rate": 4.9941431593631414e-05, + "loss": 2.2468, + "step": 663 + }, + { + "epoch": 0.05, + "grad_norm": 0.7181087428112289, + "learning_rate": 4.994100347542282e-05, + "loss": 2.1, + "step": 664 + }, + { + "epoch": 0.05, + "grad_norm": 0.8611595095002097, + "learning_rate": 4.994057380004492e-05, + "loss": 2.1339, + "step": 665 + }, + { + "epoch": 0.05, + "grad_norm": 0.8267398110866929, + "learning_rate": 4.994014256752454e-05, + "loss": 2.0357, + "step": 666 + }, + { + "epoch": 0.05, + "grad_norm": 0.8075389777165766, + "learning_rate": 4.99397097778886e-05, + "loss": 2.2799, + "step": 667 + }, + { + "epoch": 0.05, + "grad_norm": 0.8544295929998521, + "learning_rate": 4.993927543116412e-05, + "loss": 2.0479, + "step": 668 + }, + { + "epoch": 0.05, + "grad_norm": 0.8784910813934775, + "learning_rate": 4.993883952737822e-05, + "loss": 2.0485, + "step": 669 + }, + { + "epoch": 0.05, + "grad_norm": 0.7081845553094777, + "learning_rate": 4.993840206655812e-05, + "loss": 2.0574, + "step": 670 + }, + { + "epoch": 0.05, + "grad_norm": 0.7016368079186255, + "learning_rate": 4.9937963048731134e-05, + "loss": 2.2637, + "step": 671 + }, + { + "epoch": 0.05, + "grad_norm": 0.8409725228445841, + "learning_rate": 4.993752247392467e-05, + "loss": 2.0161, + "step": 672 + }, + { + "epoch": 0.05, + "grad_norm": 0.8861996372853229, + "learning_rate": 4.993708034216622e-05, + "loss": 2.0386, + "step": 673 + }, + { + "epoch": 0.05, + "grad_norm": 0.7843481388070138, + "learning_rate": 4.993663665348341e-05, + "loss": 2.0615, + "step": 674 + }, + { + "epoch": 0.05, + "grad_norm": 0.8429688090966334, + "learning_rate": 4.993619140790393e-05, + "loss": 2.2218, + "step": 675 + }, + { + "epoch": 0.05, + "grad_norm": 1.063902709714382, + "learning_rate": 4.993574460545557e-05, + "loss": 2.0353, + "step": 676 + }, + { + "epoch": 0.05, + "grad_norm": 0.8430582434241445, + "learning_rate": 4.9935296246166245e-05, + "loss": 2.1565, + "step": 677 + }, + { + "epoch": 0.05, + "grad_norm": 0.9687464770378915, + "learning_rate": 4.993484633006394e-05, + "loss": 2.0529, + "step": 678 + }, + { + "epoch": 0.05, + "grad_norm": 0.8519851547668436, + "learning_rate": 4.993439485717674e-05, + "loss": 2.2507, + "step": 679 + }, + { + "epoch": 0.05, + "grad_norm": 0.8263542105946974, + "learning_rate": 4.993394182753284e-05, + "loss": 2.0663, + "step": 680 + }, + { + "epoch": 0.05, + "grad_norm": 1.0702254450021844, + "learning_rate": 4.9933487241160516e-05, + "loss": 2.041, + "step": 681 + }, + { + "epoch": 0.05, + "grad_norm": 0.8437667150722616, + "learning_rate": 4.993303109808816e-05, + "loss": 2.044, + "step": 682 + }, + { + "epoch": 0.05, + "grad_norm": 1.067139137316206, + "learning_rate": 4.9932573398344236e-05, + "loss": 2.0943, + "step": 683 + }, + { + "epoch": 0.05, + "grad_norm": 0.9999358260689343, + "learning_rate": 4.993211414195733e-05, + "loss": 2.2078, + "step": 684 + }, + { + "epoch": 0.05, + "grad_norm": 0.8929280356637327, + "learning_rate": 4.993165332895612e-05, + "loss": 2.0841, + "step": 685 + }, + { + "epoch": 0.05, + "grad_norm": 0.953788517899573, + "learning_rate": 4.993119095936937e-05, + "loss": 2.066, + "step": 686 + }, + { + "epoch": 0.05, + "grad_norm": 0.8259038534549908, + "learning_rate": 4.9930727033225945e-05, + "loss": 2.2291, + "step": 687 + }, + { + "epoch": 0.05, + "grad_norm": 0.8592689725828306, + "learning_rate": 4.993026155055482e-05, + "loss": 2.0174, + "step": 688 + }, + { + "epoch": 0.05, + "grad_norm": 0.7815492604276704, + "learning_rate": 4.9929794511385044e-05, + "loss": 2.0364, + "step": 689 + }, + { + "epoch": 0.05, + "grad_norm": 0.7121539456110083, + "learning_rate": 4.992932591574578e-05, + "loss": 2.0971, + "step": 690 + }, + { + "epoch": 0.05, + "grad_norm": 0.7993185383495275, + "learning_rate": 4.99288557636663e-05, + "loss": 2.2007, + "step": 691 + }, + { + "epoch": 0.05, + "grad_norm": 0.7495261296115238, + "learning_rate": 4.9928384055175934e-05, + "loss": 2.041, + "step": 692 + }, + { + "epoch": 0.05, + "grad_norm": 0.8856085910383256, + "learning_rate": 4.992791079030414e-05, + "loss": 2.0365, + "step": 693 + }, + { + "epoch": 0.05, + "grad_norm": 0.7647786163909616, + "learning_rate": 4.9927435969080476e-05, + "loss": 2.0331, + "step": 694 + }, + { + "epoch": 0.05, + "grad_norm": 0.8602517470224418, + "learning_rate": 4.992695959153458e-05, + "loss": 2.2141, + "step": 695 + }, + { + "epoch": 0.05, + "grad_norm": 0.8208175579891402, + "learning_rate": 4.9926481657696186e-05, + "loss": 2.0653, + "step": 696 + }, + { + "epoch": 0.05, + "grad_norm": 0.8079899929003923, + "learning_rate": 4.9926002167595153e-05, + "loss": 2.0722, + "step": 697 + }, + { + "epoch": 0.05, + "grad_norm": 0.8167136440703118, + "learning_rate": 4.99255211212614e-05, + "loss": 2.0718, + "step": 698 + }, + { + "epoch": 0.05, + "grad_norm": 0.9042468225600693, + "learning_rate": 4.992503851872497e-05, + "loss": 2.2825, + "step": 699 + }, + { + "epoch": 0.05, + "grad_norm": 0.7273317571493322, + "learning_rate": 4.9924554360015986e-05, + "loss": 2.0564, + "step": 700 + }, + { + "epoch": 0.05, + "grad_norm": 0.8121517092205636, + "learning_rate": 4.992406864516468e-05, + "loss": 2.0045, + "step": 701 + }, + { + "epoch": 0.05, + "grad_norm": 0.8394669052353458, + "learning_rate": 4.992358137420138e-05, + "loss": 2.1349, + "step": 702 + }, + { + "epoch": 0.05, + "grad_norm": 0.7902429503949969, + "learning_rate": 4.992309254715651e-05, + "loss": 2.2078, + "step": 703 + }, + { + "epoch": 0.05, + "grad_norm": 0.9750147073260156, + "learning_rate": 4.9922602164060586e-05, + "loss": 2.0741, + "step": 704 + }, + { + "epoch": 0.05, + "grad_norm": 0.7831176577813663, + "learning_rate": 4.992211022494422e-05, + "loss": 2.0497, + "step": 705 + }, + { + "epoch": 0.05, + "grad_norm": 0.9215773769890955, + "learning_rate": 4.992161672983813e-05, + "loss": 2.0453, + "step": 706 + }, + { + "epoch": 0.05, + "grad_norm": 0.7117490398604537, + "learning_rate": 4.992112167877313e-05, + "loss": 2.2023, + "step": 707 + }, + { + "epoch": 0.05, + "grad_norm": 0.7623678902091698, + "learning_rate": 4.992062507178012e-05, + "loss": 2.0829, + "step": 708 + }, + { + "epoch": 0.05, + "grad_norm": 0.8021934458379629, + "learning_rate": 4.992012690889011e-05, + "loss": 2.0436, + "step": 709 + }, + { + "epoch": 0.05, + "grad_norm": 0.8240638728293711, + "learning_rate": 4.9919627190134214e-05, + "loss": 2.0802, + "step": 710 + }, + { + "epoch": 0.05, + "grad_norm": 0.8385189594782518, + "learning_rate": 4.9919125915543604e-05, + "loss": 2.2199, + "step": 711 + }, + { + "epoch": 0.05, + "grad_norm": 0.7448968165781115, + "learning_rate": 4.99186230851496e-05, + "loss": 2.0563, + "step": 712 + }, + { + "epoch": 0.06, + "grad_norm": 0.8913757722206899, + "learning_rate": 4.991811869898359e-05, + "loss": 2.0721, + "step": 713 + }, + { + "epoch": 0.06, + "grad_norm": 0.7106863283717292, + "learning_rate": 4.9917612757077056e-05, + "loss": 2.1059, + "step": 714 + }, + { + "epoch": 0.06, + "grad_norm": 0.7479462682033995, + "learning_rate": 4.9917105259461605e-05, + "loss": 1.9997, + "step": 715 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834815894441659, + "learning_rate": 4.991659620616891e-05, + "loss": 2.2784, + "step": 716 + }, + { + "epoch": 0.06, + "grad_norm": 0.714724318427169, + "learning_rate": 4.991608559723074e-05, + "loss": 2.0616, + "step": 717 + }, + { + "epoch": 0.06, + "grad_norm": 0.8042717531145045, + "learning_rate": 4.9915573432679005e-05, + "loss": 2.0689, + "step": 718 + }, + { + "epoch": 0.06, + "grad_norm": 0.7935815420813611, + "learning_rate": 4.991505971254566e-05, + "loss": 2.2461, + "step": 719 + }, + { + "epoch": 0.06, + "grad_norm": 0.7787098515282533, + "learning_rate": 4.991454443686278e-05, + "loss": 2.0594, + "step": 720 + }, + { + "epoch": 0.06, + "grad_norm": 0.9099736503795062, + "learning_rate": 4.991402760566254e-05, + "loss": 2.1082, + "step": 721 + }, + { + "epoch": 0.06, + "grad_norm": 0.7621661021654437, + "learning_rate": 4.991350921897721e-05, + "loss": 2.0533, + "step": 722 + }, + { + "epoch": 0.06, + "grad_norm": 0.9200315935884937, + "learning_rate": 4.991298927683915e-05, + "loss": 2.1915, + "step": 723 + }, + { + "epoch": 0.06, + "grad_norm": 0.7305474485812655, + "learning_rate": 4.9912467779280825e-05, + "loss": 2.0144, + "step": 724 + }, + { + "epoch": 0.06, + "grad_norm": 0.7781171457784287, + "learning_rate": 4.991194472633479e-05, + "loss": 2.0149, + "step": 725 + }, + { + "epoch": 0.06, + "grad_norm": 0.7233474849879434, + "learning_rate": 4.991142011803371e-05, + "loss": 2.0182, + "step": 726 + }, + { + "epoch": 0.06, + "grad_norm": 0.8977702470890788, + "learning_rate": 4.991089395441033e-05, + "loss": 2.0824, + "step": 727 + }, + { + "epoch": 0.06, + "grad_norm": 0.7223307154371491, + "learning_rate": 4.9910366235497503e-05, + "loss": 2.2124, + "step": 728 + }, + { + "epoch": 0.06, + "grad_norm": 0.7350429405381966, + "learning_rate": 4.9909836961328185e-05, + "loss": 2.0189, + "step": 729 + }, + { + "epoch": 0.06, + "grad_norm": 0.7981841479670778, + "learning_rate": 4.99093061319354e-05, + "loss": 2.0414, + "step": 730 + }, + { + "epoch": 0.06, + "grad_norm": 0.7145214242492883, + "learning_rate": 4.990877374735231e-05, + "loss": 2.2191, + "step": 731 + }, + { + "epoch": 0.06, + "grad_norm": 0.8365423771874068, + "learning_rate": 4.9908239807612153e-05, + "loss": 2.0124, + "step": 732 + }, + { + "epoch": 0.06, + "grad_norm": 0.6919689517950398, + "learning_rate": 4.9907704312748255e-05, + "loss": 2.1078, + "step": 733 + }, + { + "epoch": 0.06, + "grad_norm": 0.7618951027208049, + "learning_rate": 4.990716726279405e-05, + "loss": 2.0745, + "step": 734 + }, + { + "epoch": 0.06, + "grad_norm": 0.7305981607490506, + "learning_rate": 4.990662865778307e-05, + "loss": 2.0329, + "step": 735 + }, + { + "epoch": 0.06, + "grad_norm": 0.7463826793007735, + "learning_rate": 4.990608849774895e-05, + "loss": 2.2019, + "step": 736 + }, + { + "epoch": 0.06, + "grad_norm": 0.7064261747096207, + "learning_rate": 4.990554678272541e-05, + "loss": 2.0214, + "step": 737 + }, + { + "epoch": 0.06, + "grad_norm": 0.7913628349854176, + "learning_rate": 4.9905003512746264e-05, + "loss": 2.0662, + "step": 738 + }, + { + "epoch": 0.06, + "grad_norm": 0.681483463675515, + "learning_rate": 4.990445868784544e-05, + "loss": 2.0323, + "step": 739 + }, + { + "epoch": 0.06, + "grad_norm": 0.6599138744460065, + "learning_rate": 4.9903912308056946e-05, + "loss": 2.2373, + "step": 740 + }, + { + "epoch": 0.06, + "grad_norm": 0.7496447704112126, + "learning_rate": 4.9903364373414904e-05, + "loss": 2.0558, + "step": 741 + }, + { + "epoch": 0.06, + "grad_norm": 0.6753875808215875, + "learning_rate": 4.990281488395352e-05, + "loss": 1.9622, + "step": 742 + }, + { + "epoch": 0.06, + "grad_norm": 0.708285283325801, + "learning_rate": 4.990226383970709e-05, + "loss": 2.2127, + "step": 743 + }, + { + "epoch": 0.06, + "grad_norm": 0.6640098730573766, + "learning_rate": 4.9901711240710033e-05, + "loss": 2.044, + "step": 744 + }, + { + "epoch": 0.06, + "grad_norm": 0.6509362036911452, + "learning_rate": 4.990115708699685e-05, + "loss": 2.1253, + "step": 745 + }, + { + "epoch": 0.06, + "grad_norm": 0.6901334692916077, + "learning_rate": 4.990060137860212e-05, + "loss": 2.0085, + "step": 746 + }, + { + "epoch": 0.06, + "grad_norm": 0.7615038072105573, + "learning_rate": 4.990004411556056e-05, + "loss": 2.049, + "step": 747 + }, + { + "epoch": 0.06, + "grad_norm": 0.6931460809687433, + "learning_rate": 4.9899485297906954e-05, + "loss": 2.2397, + "step": 748 + }, + { + "epoch": 0.06, + "grad_norm": 0.6534189928436769, + "learning_rate": 4.989892492567619e-05, + "loss": 2.0762, + "step": 749 + }, + { + "epoch": 0.06, + "grad_norm": 0.7112982447357519, + "learning_rate": 4.989836299890326e-05, + "loss": 2.0737, + "step": 750 + }, + { + "epoch": 0.06, + "grad_norm": 0.7641410485605341, + "learning_rate": 4.989779951762324e-05, + "loss": 2.222, + "step": 751 + }, + { + "epoch": 0.06, + "grad_norm": 0.8505337638915853, + "learning_rate": 4.989723448187131e-05, + "loss": 2.0935, + "step": 752 + }, + { + "epoch": 0.06, + "grad_norm": 0.7147043281115546, + "learning_rate": 4.989666789168276e-05, + "loss": 2.0134, + "step": 753 + }, + { + "epoch": 0.06, + "grad_norm": 0.7304699287639821, + "learning_rate": 4.989609974709295e-05, + "loss": 2.0217, + "step": 754 + }, + { + "epoch": 0.06, + "grad_norm": 0.7256952314498918, + "learning_rate": 4.989553004813735e-05, + "loss": 2.2198, + "step": 755 + }, + { + "epoch": 0.06, + "grad_norm": 0.7552942679998607, + "learning_rate": 4.9894958794851544e-05, + "loss": 2.0898, + "step": 756 + }, + { + "epoch": 0.06, + "grad_norm": 0.7018188153634748, + "learning_rate": 4.9894385987271185e-05, + "loss": 2.0217, + "step": 757 + }, + { + "epoch": 0.06, + "grad_norm": 0.6743354492054912, + "learning_rate": 4.989381162543204e-05, + "loss": 2.0403, + "step": 758 + }, + { + "epoch": 0.06, + "grad_norm": 0.7935821089716525, + "learning_rate": 4.9893235709369976e-05, + "loss": 2.0584, + "step": 759 + }, + { + "epoch": 0.06, + "grad_norm": 0.871835295142175, + "learning_rate": 4.9892658239120934e-05, + "loss": 2.1947, + "step": 760 + }, + { + "epoch": 0.06, + "grad_norm": 0.6787437045570935, + "learning_rate": 4.989207921472098e-05, + "loss": 2.006, + "step": 761 + }, + { + "epoch": 0.06, + "grad_norm": 1.0358893907234512, + "learning_rate": 4.9891498636206267e-05, + "loss": 2.0885, + "step": 762 + }, + { + "epoch": 0.06, + "grad_norm": 0.6983656004377002, + "learning_rate": 4.9890916503613025e-05, + "loss": 2.2546, + "step": 763 + }, + { + "epoch": 0.06, + "grad_norm": 0.7465843139766267, + "learning_rate": 4.989033281697762e-05, + "loss": 2.0957, + "step": 764 + }, + { + "epoch": 0.06, + "grad_norm": 0.7048198575000174, + "learning_rate": 4.988974757633649e-05, + "loss": 2.001, + "step": 765 + }, + { + "epoch": 0.06, + "grad_norm": 0.7771613760800516, + "learning_rate": 4.9889160781726166e-05, + "loss": 2.0773, + "step": 766 + }, + { + "epoch": 0.06, + "grad_norm": 0.6950254925834636, + "learning_rate": 4.988857243318328e-05, + "loss": 2.0548, + "step": 767 + }, + { + "epoch": 0.06, + "grad_norm": 0.8341697096984066, + "learning_rate": 4.988798253074457e-05, + "loss": 2.2359, + "step": 768 + }, + { + "epoch": 0.06, + "grad_norm": 0.7186223256349434, + "learning_rate": 4.988739107444688e-05, + "loss": 2.0475, + "step": 769 + }, + { + "epoch": 0.06, + "grad_norm": 0.7411821927818484, + "learning_rate": 4.988679806432712e-05, + "loss": 2.1333, + "step": 770 + }, + { + "epoch": 0.06, + "grad_norm": 0.8520646889766157, + "learning_rate": 4.9886203500422313e-05, + "loss": 2.071, + "step": 771 + }, + { + "epoch": 0.06, + "grad_norm": 0.700077065116785, + "learning_rate": 4.9885607382769596e-05, + "loss": 2.2553, + "step": 772 + }, + { + "epoch": 0.06, + "grad_norm": 0.7404556462446583, + "learning_rate": 4.988500971140617e-05, + "loss": 2.0562, + "step": 773 + }, + { + "epoch": 0.06, + "grad_norm": 0.7289801149227552, + "learning_rate": 4.988441048636937e-05, + "loss": 2.0334, + "step": 774 + }, + { + "epoch": 0.06, + "grad_norm": 0.7153092758658268, + "learning_rate": 4.988380970769658e-05, + "loss": 2.2493, + "step": 775 + }, + { + "epoch": 0.06, + "grad_norm": 0.7462983965174421, + "learning_rate": 4.9883207375425334e-05, + "loss": 2.052, + "step": 776 + }, + { + "epoch": 0.06, + "grad_norm": 0.8049546590078663, + "learning_rate": 4.9882603489593226e-05, + "loss": 2.0558, + "step": 777 + }, + { + "epoch": 0.06, + "grad_norm": 0.7266973839399192, + "learning_rate": 4.988199805023796e-05, + "loss": 2.034, + "step": 778 + }, + { + "epoch": 0.06, + "grad_norm": 0.782933669916175, + "learning_rate": 4.988139105739734e-05, + "loss": 1.9865, + "step": 779 + }, + { + "epoch": 0.06, + "grad_norm": 0.704126248590267, + "learning_rate": 4.988078251110926e-05, + "loss": 2.2452, + "step": 780 + }, + { + "epoch": 0.06, + "grad_norm": 0.6504003730202843, + "learning_rate": 4.9880172411411716e-05, + "loss": 2.0553, + "step": 781 + }, + { + "epoch": 0.06, + "grad_norm": 0.7265142059382989, + "learning_rate": 4.9879560758342804e-05, + "loss": 2.0348, + "step": 782 + }, + { + "epoch": 0.06, + "grad_norm": 0.6737558360417277, + "learning_rate": 4.98789475519407e-05, + "loss": 2.1169, + "step": 783 + }, + { + "epoch": 0.06, + "grad_norm": 0.8120840395656, + "learning_rate": 4.9878332792243686e-05, + "loss": 2.2139, + "step": 784 + }, + { + "epoch": 0.06, + "grad_norm": 0.711600918910579, + "learning_rate": 4.9877716479290174e-05, + "loss": 2.0253, + "step": 785 + }, + { + "epoch": 0.06, + "grad_norm": 0.7814369244521375, + "learning_rate": 4.98770986131186e-05, + "loss": 2.0456, + "step": 786 + }, + { + "epoch": 0.06, + "grad_norm": 0.6818013651486091, + "learning_rate": 4.987647919376758e-05, + "loss": 2.2463, + "step": 787 + }, + { + "epoch": 0.06, + "grad_norm": 1.093077745814283, + "learning_rate": 4.9875858221275756e-05, + "loss": 2.0803, + "step": 788 + }, + { + "epoch": 0.06, + "grad_norm": 0.7075235454045999, + "learning_rate": 4.987523569568192e-05, + "loss": 2.0974, + "step": 789 + }, + { + "epoch": 0.06, + "grad_norm": 0.6675591461105268, + "learning_rate": 4.987461161702492e-05, + "loss": 2.0294, + "step": 790 + }, + { + "epoch": 0.06, + "grad_norm": 0.7661498172793076, + "learning_rate": 4.9873985985343734e-05, + "loss": 2.0526, + "step": 791 + }, + { + "epoch": 0.06, + "grad_norm": 0.7345125327153424, + "learning_rate": 4.987335880067742e-05, + "loss": 2.2249, + "step": 792 + }, + { + "epoch": 0.06, + "grad_norm": 0.7932124616173761, + "learning_rate": 4.987273006306513e-05, + "loss": 2.0285, + "step": 793 + }, + { + "epoch": 0.06, + "grad_norm": 0.6508623223374154, + "learning_rate": 4.987209977254612e-05, + "loss": 2.028, + "step": 794 + }, + { + "epoch": 0.06, + "grad_norm": 0.7867669477716656, + "learning_rate": 4.987146792915975e-05, + "loss": 2.1119, + "step": 795 + }, + { + "epoch": 0.06, + "grad_norm": 0.6763597433838995, + "learning_rate": 4.987083453294546e-05, + "loss": 2.2323, + "step": 796 + }, + { + "epoch": 0.06, + "grad_norm": 0.7683310136612457, + "learning_rate": 4.98701995839428e-05, + "loss": 2.0578, + "step": 797 + }, + { + "epoch": 0.06, + "grad_norm": 0.7761324967971456, + "learning_rate": 4.9869563082191415e-05, + "loss": 2.0284, + "step": 798 + }, + { + "epoch": 0.06, + "grad_norm": 0.726820297459208, + "learning_rate": 4.986892502773103e-05, + "loss": 2.0347, + "step": 799 + }, + { + "epoch": 0.06, + "grad_norm": 0.7967007859485941, + "learning_rate": 4.98682854206015e-05, + "loss": 2.2171, + "step": 800 + }, + { + "epoch": 0.06, + "grad_norm": 0.7471516554376232, + "learning_rate": 4.986764426084275e-05, + "loss": 2.1015, + "step": 801 + }, + { + "epoch": 0.06, + "grad_norm": 0.7625045814073039, + "learning_rate": 4.98670015484948e-05, + "loss": 2.0452, + "step": 802 + }, + { + "epoch": 0.06, + "grad_norm": 0.8003251758240643, + "learning_rate": 4.9866357283597794e-05, + "loss": 2.0244, + "step": 803 + }, + { + "epoch": 0.06, + "grad_norm": 0.7053028460745614, + "learning_rate": 4.986571146619195e-05, + "loss": 2.2306, + "step": 804 + }, + { + "epoch": 0.06, + "grad_norm": 0.7142548230919835, + "learning_rate": 4.9865064096317586e-05, + "loss": 2.0227, + "step": 805 + }, + { + "epoch": 0.06, + "grad_norm": 0.7505401564644995, + "learning_rate": 4.9864415174015124e-05, + "loss": 2.0823, + "step": 806 + }, + { + "epoch": 0.06, + "grad_norm": 0.7597643084817601, + "learning_rate": 4.9863764699325075e-05, + "loss": 2.2234, + "step": 807 + }, + { + "epoch": 0.06, + "grad_norm": 0.768779724659344, + "learning_rate": 4.986311267228805e-05, + "loss": 2.0879, + "step": 808 + }, + { + "epoch": 0.06, + "grad_norm": 0.8565394030667026, + "learning_rate": 4.986245909294477e-05, + "loss": 2.0535, + "step": 809 + }, + { + "epoch": 0.06, + "grad_norm": 0.6680538376376509, + "learning_rate": 4.986180396133603e-05, + "loss": 1.9681, + "step": 810 + }, + { + "epoch": 0.06, + "grad_norm": 0.8127246566167461, + "learning_rate": 4.986114727750273e-05, + "loss": 2.0534, + "step": 811 + }, + { + "epoch": 0.06, + "grad_norm": 0.7312139074271476, + "learning_rate": 4.986048904148587e-05, + "loss": 2.2078, + "step": 812 + }, + { + "epoch": 0.06, + "grad_norm": 0.8067891560152749, + "learning_rate": 4.9859829253326554e-05, + "loss": 2.046, + "step": 813 + }, + { + "epoch": 0.06, + "grad_norm": 0.6481194313573897, + "learning_rate": 4.9859167913065964e-05, + "loss": 2.1019, + "step": 814 + }, + { + "epoch": 0.06, + "grad_norm": 0.7939226992594048, + "learning_rate": 4.9858505020745406e-05, + "loss": 2.046, + "step": 815 + }, + { + "epoch": 0.06, + "grad_norm": 0.82158712380577, + "learning_rate": 4.985784057640626e-05, + "loss": 2.2539, + "step": 816 + }, + { + "epoch": 0.06, + "grad_norm": 0.7419622301819795, + "learning_rate": 4.985717458009e-05, + "loss": 2.0159, + "step": 817 + }, + { + "epoch": 0.06, + "grad_norm": 0.7668192282059307, + "learning_rate": 4.985650703183822e-05, + "loss": 2.0512, + "step": 818 + }, + { + "epoch": 0.06, + "grad_norm": 0.7630692495225115, + "learning_rate": 4.985583793169258e-05, + "loss": 2.2402, + "step": 819 + }, + { + "epoch": 0.06, + "grad_norm": 0.6915707947014003, + "learning_rate": 4.985516727969489e-05, + "loss": 2.0336, + "step": 820 + }, + { + "epoch": 0.06, + "grad_norm": 0.750965860634281, + "learning_rate": 4.9854495075886984e-05, + "loss": 2.0513, + "step": 821 + }, + { + "epoch": 0.06, + "grad_norm": 0.6914527944337195, + "learning_rate": 4.985382132031085e-05, + "loss": 2.0467, + "step": 822 + }, + { + "epoch": 0.06, + "grad_norm": 0.6926143551025613, + "learning_rate": 4.9853146013008554e-05, + "loss": 2.0265, + "step": 823 + }, + { + "epoch": 0.06, + "grad_norm": 0.7343547430760857, + "learning_rate": 4.985246915402224e-05, + "loss": 2.241, + "step": 824 + }, + { + "epoch": 0.06, + "grad_norm": 0.6710432336977166, + "learning_rate": 4.9851790743394186e-05, + "loss": 2.0356, + "step": 825 + }, + { + "epoch": 0.06, + "grad_norm": 0.7233493593153486, + "learning_rate": 4.9851110781166746e-05, + "loss": 2.1113, + "step": 826 + }, + { + "epoch": 0.06, + "grad_norm": 0.6718253909341829, + "learning_rate": 4.985042926738237e-05, + "loss": 2.012, + "step": 827 + }, + { + "epoch": 0.06, + "grad_norm": 0.8346908565575587, + "learning_rate": 4.98497462020836e-05, + "loss": 2.2423, + "step": 828 + }, + { + "epoch": 0.06, + "grad_norm": 0.73973075154058, + "learning_rate": 4.984906158531309e-05, + "loss": 2.0238, + "step": 829 + }, + { + "epoch": 0.06, + "grad_norm": 0.840117373218022, + "learning_rate": 4.984837541711359e-05, + "loss": 2.0475, + "step": 830 + }, + { + "epoch": 0.06, + "grad_norm": 0.6608448010903015, + "learning_rate": 4.984768769752794e-05, + "loss": 2.1229, + "step": 831 + }, + { + "epoch": 0.06, + "grad_norm": 0.9034436293011574, + "learning_rate": 4.984699842659905e-05, + "loss": 2.2663, + "step": 832 + }, + { + "epoch": 0.06, + "grad_norm": 0.6633732433207797, + "learning_rate": 4.9846307604369985e-05, + "loss": 2.0395, + "step": 833 + }, + { + "epoch": 0.06, + "grad_norm": 0.7343044334820176, + "learning_rate": 4.9845615230883865e-05, + "loss": 1.9828, + "step": 834 + }, + { + "epoch": 0.06, + "grad_norm": 0.7816831638217253, + "learning_rate": 4.984492130618391e-05, + "loss": 2.031, + "step": 835 + }, + { + "epoch": 0.06, + "grad_norm": 0.6939326101848108, + "learning_rate": 4.984422583031346e-05, + "loss": 2.1999, + "step": 836 + }, + { + "epoch": 0.06, + "grad_norm": 0.7522123002819346, + "learning_rate": 4.984352880331593e-05, + "loss": 2.0171, + "step": 837 + }, + { + "epoch": 0.06, + "grad_norm": 0.8171640916275302, + "learning_rate": 4.9842830225234835e-05, + "loss": 2.0524, + "step": 838 + }, + { + "epoch": 0.06, + "grad_norm": 0.6577343787602368, + "learning_rate": 4.9842130096113795e-05, + "loss": 2.1207, + "step": 839 + }, + { + "epoch": 0.06, + "grad_norm": 0.9935433694907367, + "learning_rate": 4.984142841599651e-05, + "loss": 2.2246, + "step": 840 + }, + { + "epoch": 0.06, + "grad_norm": 0.7223477368200224, + "learning_rate": 4.984072518492681e-05, + "loss": 2.0494, + "step": 841 + }, + { + "epoch": 0.06, + "grad_norm": 0.849218800020168, + "learning_rate": 4.9840020402948574e-05, + "loss": 2.0295, + "step": 842 + }, + { + "epoch": 0.07, + "grad_norm": 0.7577179823050312, + "learning_rate": 4.983931407010583e-05, + "loss": 2.0514, + "step": 843 + }, + { + "epoch": 0.07, + "grad_norm": 0.7713770670130917, + "learning_rate": 4.983860618644266e-05, + "loss": 2.26, + "step": 844 + }, + { + "epoch": 0.07, + "grad_norm": 0.8329551518722105, + "learning_rate": 4.983789675200327e-05, + "loss": 2.107, + "step": 845 + }, + { + "epoch": 0.07, + "grad_norm": 0.8276628941021626, + "learning_rate": 4.983718576683195e-05, + "loss": 2.0514, + "step": 846 + }, + { + "epoch": 0.07, + "grad_norm": 0.824191615076118, + "learning_rate": 4.983647323097309e-05, + "loss": 2.1109, + "step": 847 + }, + { + "epoch": 0.07, + "grad_norm": 0.9863282747024558, + "learning_rate": 4.9835759144471175e-05, + "loss": 2.2199, + "step": 848 + }, + { + "epoch": 0.07, + "grad_norm": 0.7489246358183148, + "learning_rate": 4.983504350737078e-05, + "loss": 2.0306, + "step": 849 + }, + { + "epoch": 0.07, + "grad_norm": 0.7440571818464188, + "learning_rate": 4.98343263197166e-05, + "loss": 2.0556, + "step": 850 + }, + { + "epoch": 0.07, + "grad_norm": 0.7850101165918115, + "learning_rate": 4.983360758155341e-05, + "loss": 2.1154, + "step": 851 + }, + { + "epoch": 0.07, + "grad_norm": 0.7008034623840368, + "learning_rate": 4.983288729292607e-05, + "loss": 2.2408, + "step": 852 + }, + { + "epoch": 0.07, + "grad_norm": 0.9255140032245944, + "learning_rate": 4.983216545387957e-05, + "loss": 2.0657, + "step": 853 + }, + { + "epoch": 0.07, + "grad_norm": 0.6977989356636721, + "learning_rate": 4.9831442064458964e-05, + "loss": 2.0126, + "step": 854 + }, + { + "epoch": 0.07, + "grad_norm": 0.869289560394945, + "learning_rate": 4.983071712470942e-05, + "loss": 2.0777, + "step": 855 + }, + { + "epoch": 0.07, + "grad_norm": 0.8215997227558899, + "learning_rate": 4.982999063467619e-05, + "loss": 2.216, + "step": 856 + }, + { + "epoch": 0.07, + "grad_norm": 0.8457242573412425, + "learning_rate": 4.982926259440465e-05, + "loss": 2.133, + "step": 857 + }, + { + "epoch": 0.07, + "grad_norm": 0.7648923112512502, + "learning_rate": 4.982853300394025e-05, + "loss": 2.0303, + "step": 858 + }, + { + "epoch": 0.07, + "grad_norm": 0.8038686071630206, + "learning_rate": 4.9827801863328527e-05, + "loss": 2.0273, + "step": 859 + }, + { + "epoch": 0.07, + "grad_norm": 0.8461515344313622, + "learning_rate": 4.982706917261514e-05, + "loss": 2.247, + "step": 860 + }, + { + "epoch": 0.07, + "grad_norm": 0.6989260927839712, + "learning_rate": 4.9826334931845835e-05, + "loss": 2.0176, + "step": 861 + }, + { + "epoch": 0.07, + "grad_norm": 0.7934748715928471, + "learning_rate": 4.982559914106645e-05, + "loss": 2.0366, + "step": 862 + }, + { + "epoch": 0.07, + "grad_norm": 0.6917894664885689, + "learning_rate": 4.982486180032293e-05, + "loss": 2.1194, + "step": 863 + }, + { + "epoch": 0.07, + "grad_norm": 0.7250799543659454, + "learning_rate": 4.98241229096613e-05, + "loss": 2.2303, + "step": 864 + }, + { + "epoch": 0.07, + "grad_norm": 0.7269420689931193, + "learning_rate": 4.9823382469127686e-05, + "loss": 2.0437, + "step": 865 + }, + { + "epoch": 0.07, + "grad_norm": 0.6809825934741925, + "learning_rate": 4.982264047876835e-05, + "loss": 2.0315, + "step": 866 + }, + { + "epoch": 0.07, + "grad_norm": 0.7951886783491885, + "learning_rate": 4.982189693862957e-05, + "loss": 2.0548, + "step": 867 + }, + { + "epoch": 0.07, + "grad_norm": 0.9263725221802728, + "learning_rate": 4.982115184875781e-05, + "loss": 2.1664, + "step": 868 + }, + { + "epoch": 0.07, + "grad_norm": 0.6987917071579538, + "learning_rate": 4.982040520919957e-05, + "loss": 2.0739, + "step": 869 + }, + { + "epoch": 0.07, + "grad_norm": 1.0335954251331716, + "learning_rate": 4.9819657020001466e-05, + "loss": 2.1281, + "step": 870 + }, + { + "epoch": 0.07, + "grad_norm": 0.7497860936467335, + "learning_rate": 4.9818907281210206e-05, + "loss": 2.0432, + "step": 871 + }, + { + "epoch": 0.07, + "grad_norm": 0.8221237995850927, + "learning_rate": 4.9818155992872614e-05, + "loss": 2.1887, + "step": 872 + }, + { + "epoch": 0.07, + "grad_norm": 0.925165863566801, + "learning_rate": 4.9817403155035595e-05, + "loss": 2.029, + "step": 873 + }, + { + "epoch": 0.07, + "grad_norm": 0.79730469064726, + "learning_rate": 4.9816648767746135e-05, + "loss": 2.0166, + "step": 874 + }, + { + "epoch": 0.07, + "grad_norm": 0.8109329590212345, + "learning_rate": 4.9815892831051336e-05, + "loss": 2.0579, + "step": 875 + }, + { + "epoch": 0.07, + "grad_norm": 0.811281504167831, + "learning_rate": 4.981513534499841e-05, + "loss": 2.2684, + "step": 876 + }, + { + "epoch": 0.07, + "grad_norm": 0.7600970178837713, + "learning_rate": 4.981437630963464e-05, + "loss": 2.0267, + "step": 877 + }, + { + "epoch": 0.07, + "grad_norm": 0.9612666325776142, + "learning_rate": 4.981361572500742e-05, + "loss": 2.0452, + "step": 878 + }, + { + "epoch": 0.07, + "grad_norm": 0.7421183489881553, + "learning_rate": 4.981285359116423e-05, + "loss": 2.0698, + "step": 879 + }, + { + "epoch": 0.07, + "grad_norm": 0.826324154297863, + "learning_rate": 4.9812089908152656e-05, + "loss": 2.2681, + "step": 880 + }, + { + "epoch": 0.07, + "grad_norm": 0.900948510584364, + "learning_rate": 4.981132467602038e-05, + "loss": 2.0657, + "step": 881 + }, + { + "epoch": 0.07, + "grad_norm": 0.7305268703880301, + "learning_rate": 4.9810557894815176e-05, + "loss": 2.0279, + "step": 882 + }, + { + "epoch": 0.07, + "grad_norm": 1.0192189923801236, + "learning_rate": 4.980978956458492e-05, + "loss": 2.0022, + "step": 883 + }, + { + "epoch": 0.07, + "grad_norm": 0.7921033842507202, + "learning_rate": 4.980901968537758e-05, + "loss": 2.251, + "step": 884 + }, + { + "epoch": 0.07, + "grad_norm": 0.9055087307965137, + "learning_rate": 4.9808248257241216e-05, + "loss": 2.021, + "step": 885 + }, + { + "epoch": 0.07, + "grad_norm": 1.028227290666769, + "learning_rate": 4.980747528022401e-05, + "loss": 2.0538, + "step": 886 + }, + { + "epoch": 0.07, + "grad_norm": 0.6681327839597142, + "learning_rate": 4.98067007543742e-05, + "loss": 2.0145, + "step": 887 + }, + { + "epoch": 0.07, + "grad_norm": 0.9031349792497351, + "learning_rate": 4.980592467974016e-05, + "loss": 2.2398, + "step": 888 + }, + { + "epoch": 0.07, + "grad_norm": 0.715313057331333, + "learning_rate": 4.980514705637034e-05, + "loss": 2.0258, + "step": 889 + }, + { + "epoch": 0.07, + "grad_norm": 0.7672590841322251, + "learning_rate": 4.980436788431328e-05, + "loss": 2.019, + "step": 890 + }, + { + "epoch": 0.07, + "grad_norm": 0.785246739293584, + "learning_rate": 4.980358716361764e-05, + "loss": 2.0424, + "step": 891 + }, + { + "epoch": 0.07, + "grad_norm": 0.7162691780674956, + "learning_rate": 4.9802804894332156e-05, + "loss": 2.2233, + "step": 892 + }, + { + "epoch": 0.07, + "grad_norm": 0.8781188225279981, + "learning_rate": 4.980202107650567e-05, + "loss": 2.0545, + "step": 893 + }, + { + "epoch": 0.07, + "grad_norm": 0.6872812849863329, + "learning_rate": 4.980123571018711e-05, + "loss": 2.1034, + "step": 894 + }, + { + "epoch": 0.07, + "grad_norm": 0.8039306146356703, + "learning_rate": 4.980044879542552e-05, + "loss": 2.0484, + "step": 895 + }, + { + "epoch": 0.07, + "grad_norm": 0.7968291476576818, + "learning_rate": 4.979966033227004e-05, + "loss": 2.2398, + "step": 896 + }, + { + "epoch": 0.07, + "grad_norm": 0.7159424103456706, + "learning_rate": 4.9798870320769886e-05, + "loss": 2.0183, + "step": 897 + }, + { + "epoch": 0.07, + "grad_norm": 0.7672306920926802, + "learning_rate": 4.979807876097437e-05, + "loss": 2.0342, + "step": 898 + }, + { + "epoch": 0.07, + "grad_norm": 0.7310034897072122, + "learning_rate": 4.979728565293293e-05, + "loss": 1.9777, + "step": 899 + }, + { + "epoch": 0.07, + "grad_norm": 0.7945518271347839, + "learning_rate": 4.9796490996695086e-05, + "loss": 2.212, + "step": 900 + }, + { + "epoch": 0.07, + "grad_norm": 0.9091171486390567, + "learning_rate": 4.979569479231043e-05, + "loss": 2.097, + "step": 901 + }, + { + "epoch": 0.07, + "grad_norm": 0.6862795583698919, + "learning_rate": 4.9794897039828694e-05, + "loss": 2.0075, + "step": 902 + }, + { + "epoch": 0.07, + "grad_norm": 0.7937909251281441, + "learning_rate": 4.9794097739299664e-05, + "loss": 2.0301, + "step": 903 + }, + { + "epoch": 0.07, + "grad_norm": 0.800197802644946, + "learning_rate": 4.979329689077327e-05, + "loss": 2.2486, + "step": 904 + }, + { + "epoch": 0.07, + "grad_norm": 0.8379376902739822, + "learning_rate": 4.979249449429948e-05, + "loss": 2.0303, + "step": 905 + }, + { + "epoch": 0.07, + "grad_norm": 0.6948687426230196, + "learning_rate": 4.979169054992842e-05, + "loss": 1.9992, + "step": 906 + }, + { + "epoch": 0.07, + "grad_norm": 0.8377682635008799, + "learning_rate": 4.979088505771027e-05, + "loss": 2.1031, + "step": 907 + }, + { + "epoch": 0.07, + "grad_norm": 0.6467559363841642, + "learning_rate": 4.979007801769533e-05, + "loss": 2.2709, + "step": 908 + }, + { + "epoch": 0.07, + "grad_norm": 0.7424639949571712, + "learning_rate": 4.9789269429933964e-05, + "loss": 2.0005, + "step": 909 + }, + { + "epoch": 0.07, + "grad_norm": 0.7432849247820381, + "learning_rate": 4.978845929447668e-05, + "loss": 2.0053, + "step": 910 + }, + { + "epoch": 0.07, + "grad_norm": 0.8244180404005984, + "learning_rate": 4.978764761137404e-05, + "loss": 1.9976, + "step": 911 + }, + { + "epoch": 0.07, + "grad_norm": 0.7312852236983929, + "learning_rate": 4.978683438067675e-05, + "loss": 2.2409, + "step": 912 + }, + { + "epoch": 0.07, + "grad_norm": 0.6755154803231432, + "learning_rate": 4.978601960243554e-05, + "loss": 2.1389, + "step": 913 + }, + { + "epoch": 0.07, + "grad_norm": 0.7008291060049643, + "learning_rate": 4.978520327670132e-05, + "loss": 2.0518, + "step": 914 + }, + { + "epoch": 0.07, + "grad_norm": 0.7482159958514782, + "learning_rate": 4.978438540352502e-05, + "loss": 1.9967, + "step": 915 + }, + { + "epoch": 0.07, + "grad_norm": 0.6732795052437436, + "learning_rate": 4.978356598295774e-05, + "loss": 2.2508, + "step": 916 + }, + { + "epoch": 0.07, + "grad_norm": 0.735602100029352, + "learning_rate": 4.978274501505061e-05, + "loss": 2.0472, + "step": 917 + }, + { + "epoch": 0.07, + "grad_norm": 0.6458421689863666, + "learning_rate": 4.978192249985491e-05, + "loss": 2.0497, + "step": 918 + }, + { + "epoch": 0.07, + "grad_norm": 0.7434733474547877, + "learning_rate": 4.978109843742198e-05, + "loss": 2.0599, + "step": 919 + }, + { + "epoch": 0.07, + "grad_norm": 0.7670719285295212, + "learning_rate": 4.9780272827803265e-05, + "loss": 2.2025, + "step": 920 + }, + { + "epoch": 0.07, + "grad_norm": 0.8235117677091967, + "learning_rate": 4.9779445671050315e-05, + "loss": 2.0536, + "step": 921 + }, + { + "epoch": 0.07, + "grad_norm": 0.6662852030688867, + "learning_rate": 4.977861696721479e-05, + "loss": 2.0026, + "step": 922 + }, + { + "epoch": 0.07, + "grad_norm": 0.7213365024835765, + "learning_rate": 4.977778671634839e-05, + "loss": 2.0379, + "step": 923 + }, + { + "epoch": 0.07, + "grad_norm": 0.7001861314640812, + "learning_rate": 4.9776954918503e-05, + "loss": 2.2172, + "step": 924 + }, + { + "epoch": 0.07, + "grad_norm": 0.6729168726967532, + "learning_rate": 4.977612157373052e-05, + "loss": 2.104, + "step": 925 + }, + { + "epoch": 0.07, + "grad_norm": 0.6654466314413828, + "learning_rate": 4.9775286682082984e-05, + "loss": 2.0259, + "step": 926 + }, + { + "epoch": 0.07, + "grad_norm": 0.733989863819936, + "learning_rate": 4.977445024361252e-05, + "loss": 2.0268, + "step": 927 + }, + { + "epoch": 0.07, + "grad_norm": 0.7558077891871695, + "learning_rate": 4.977361225837136e-05, + "loss": 2.2334, + "step": 928 + }, + { + "epoch": 0.07, + "grad_norm": 0.676743415651291, + "learning_rate": 4.977277272641181e-05, + "loss": 2.0472, + "step": 929 + }, + { + "epoch": 0.07, + "grad_norm": 0.9118849471534284, + "learning_rate": 4.977193164778628e-05, + "loss": 2.0408, + "step": 930 + }, + { + "epoch": 0.07, + "grad_norm": 0.8876821118985034, + "learning_rate": 4.9771089022547304e-05, + "loss": 2.0464, + "step": 931 + }, + { + "epoch": 0.07, + "grad_norm": 0.6333579493885405, + "learning_rate": 4.977024485074747e-05, + "loss": 2.1092, + "step": 932 + }, + { + "epoch": 0.07, + "grad_norm": 0.7400077346057291, + "learning_rate": 4.97693991324395e-05, + "loss": 2.2154, + "step": 933 + }, + { + "epoch": 0.07, + "grad_norm": 1.0783608963601525, + "learning_rate": 4.9768551867676186e-05, + "loss": 2.0487, + "step": 934 + }, + { + "epoch": 0.07, + "grad_norm": 0.8185687668141374, + "learning_rate": 4.9767703056510416e-05, + "loss": 1.9996, + "step": 935 + }, + { + "epoch": 0.07, + "grad_norm": 0.820888920686587, + "learning_rate": 4.9766852698995205e-05, + "loss": 2.2525, + "step": 936 + }, + { + "epoch": 0.07, + "grad_norm": 0.6874286826616396, + "learning_rate": 4.976600079518363e-05, + "loss": 2.021, + "step": 937 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874940933643579, + "learning_rate": 4.9765147345128884e-05, + "loss": 2.0826, + "step": 938 + }, + { + "epoch": 0.07, + "grad_norm": 0.71585370646607, + "learning_rate": 4.976429234888426e-05, + "loss": 2.0107, + "step": 939 + }, + { + "epoch": 0.07, + "grad_norm": 0.9736690348691248, + "learning_rate": 4.976343580650312e-05, + "loss": 2.241, + "step": 940 + }, + { + "epoch": 0.07, + "grad_norm": 0.7632309525353292, + "learning_rate": 4.976257771803896e-05, + "loss": 2.0125, + "step": 941 + }, + { + "epoch": 0.07, + "grad_norm": 0.8874391172412602, + "learning_rate": 4.9761718083545344e-05, + "loss": 2.0286, + "step": 942 + }, + { + "epoch": 0.07, + "grad_norm": 1.0125582744743706, + "learning_rate": 4.976085690307594e-05, + "loss": 2.0375, + "step": 943 + }, + { + "epoch": 0.07, + "grad_norm": 0.7545554717568546, + "learning_rate": 4.975999417668452e-05, + "loss": 2.1201, + "step": 944 + }, + { + "epoch": 0.07, + "grad_norm": 0.8204251486017243, + "learning_rate": 4.975912990442495e-05, + "loss": 2.2272, + "step": 945 + }, + { + "epoch": 0.07, + "grad_norm": 0.7925800742207649, + "learning_rate": 4.975826408635119e-05, + "loss": 2.0404, + "step": 946 + }, + { + "epoch": 0.07, + "grad_norm": 0.8528619441503231, + "learning_rate": 4.975739672251729e-05, + "loss": 1.9958, + "step": 947 + }, + { + "epoch": 0.07, + "grad_norm": 1.0799450757311806, + "learning_rate": 4.97565278129774e-05, + "loss": 2.2556, + "step": 948 + }, + { + "epoch": 0.07, + "grad_norm": 0.6630359748141954, + "learning_rate": 4.975565735778579e-05, + "loss": 2.04, + "step": 949 + }, + { + "epoch": 0.07, + "grad_norm": 1.0294361052356031, + "learning_rate": 4.9754785356996787e-05, + "loss": 2.0645, + "step": 950 + }, + { + "epoch": 0.07, + "grad_norm": 0.7712324828792639, + "learning_rate": 4.975391181066483e-05, + "loss": 1.9943, + "step": 951 + }, + { + "epoch": 0.07, + "grad_norm": 0.7424901430667604, + "learning_rate": 4.975303671884448e-05, + "loss": 2.0393, + "step": 952 + }, + { + "epoch": 0.07, + "grad_norm": 0.843649540127684, + "learning_rate": 4.975216008159035e-05, + "loss": 2.1948, + "step": 953 + }, + { + "epoch": 0.07, + "grad_norm": 0.5891375173476674, + "learning_rate": 4.975128189895719e-05, + "loss": 1.9325, + "step": 954 + }, + { + "epoch": 0.07, + "grad_norm": 0.7569030117340539, + "learning_rate": 4.9750402170999814e-05, + "loss": 2.003, + "step": 955 + }, + { + "epoch": 0.07, + "grad_norm": 0.7571528370796616, + "learning_rate": 4.974952089777316e-05, + "loss": 2.2574, + "step": 956 + }, + { + "epoch": 0.07, + "grad_norm": 0.7062070482456363, + "learning_rate": 4.9748638079332236e-05, + "loss": 2.1115, + "step": 957 + }, + { + "epoch": 0.07, + "grad_norm": 0.9686026724538899, + "learning_rate": 4.974775371573217e-05, + "loss": 2.0151, + "step": 958 + }, + { + "epoch": 0.07, + "grad_norm": 0.6758375395265377, + "learning_rate": 4.974686780702817e-05, + "loss": 2.0299, + "step": 959 + }, + { + "epoch": 0.07, + "grad_norm": 1.0976813306232893, + "learning_rate": 4.974598035327555e-05, + "loss": 2.173, + "step": 960 + }, + { + "epoch": 0.07, + "grad_norm": 0.8152299831492347, + "learning_rate": 4.974509135452972e-05, + "loss": 2.0238, + "step": 961 + }, + { + "epoch": 0.07, + "grad_norm": 0.8807975321880066, + "learning_rate": 4.974420081084618e-05, + "loss": 2.0186, + "step": 962 + }, + { + "epoch": 0.07, + "grad_norm": 0.6950261951814034, + "learning_rate": 4.9743308722280536e-05, + "loss": 2.0838, + "step": 963 + }, + { + "epoch": 0.07, + "grad_norm": 0.8125022838282693, + "learning_rate": 4.974241508888847e-05, + "loss": 2.0443, + "step": 964 + }, + { + "epoch": 0.07, + "grad_norm": 0.6237207771715597, + "learning_rate": 4.9741519910725796e-05, + "loss": 2.2202, + "step": 965 + }, + { + "epoch": 0.07, + "grad_norm": 0.7897099917993983, + "learning_rate": 4.974062318784839e-05, + "loss": 2.0259, + "step": 966 + }, + { + "epoch": 0.07, + "grad_norm": 0.6978263002965192, + "learning_rate": 4.973972492031224e-05, + "loss": 2.0418, + "step": 967 + }, + { + "epoch": 0.07, + "grad_norm": 0.7031922639373333, + "learning_rate": 4.973882510817344e-05, + "loss": 2.2428, + "step": 968 + }, + { + "epoch": 0.07, + "grad_norm": 0.7226546147937737, + "learning_rate": 4.9737923751488146e-05, + "loss": 2.1011, + "step": 969 + }, + { + "epoch": 0.07, + "grad_norm": 0.6977130645612353, + "learning_rate": 4.973702085031264e-05, + "loss": 2.074, + "step": 970 + }, + { + "epoch": 0.07, + "grad_norm": 0.8065794558247578, + "learning_rate": 4.973611640470331e-05, + "loss": 2.0792, + "step": 971 + }, + { + "epoch": 0.07, + "grad_norm": 0.8448694034276166, + "learning_rate": 4.973521041471662e-05, + "loss": 2.2251, + "step": 972 + }, + { + "epoch": 0.08, + "grad_norm": 0.7159249637319535, + "learning_rate": 4.973430288040912e-05, + "loss": 2.0086, + "step": 973 + }, + { + "epoch": 0.08, + "grad_norm": 0.7876933856360586, + "learning_rate": 4.973339380183749e-05, + "loss": 1.9869, + "step": 974 + }, + { + "epoch": 0.08, + "grad_norm": 0.6559759951805296, + "learning_rate": 4.973248317905847e-05, + "loss": 2.1253, + "step": 975 + }, + { + "epoch": 0.08, + "grad_norm": 0.7135362861633066, + "learning_rate": 4.973157101212892e-05, + "loss": 1.9926, + "step": 976 + }, + { + "epoch": 0.08, + "grad_norm": 0.7323847052407415, + "learning_rate": 4.97306573011058e-05, + "loss": 2.245, + "step": 977 + }, + { + "epoch": 0.08, + "grad_norm": 0.6721835144845716, + "learning_rate": 4.972974204604614e-05, + "loss": 2.0218, + "step": 978 + }, + { + "epoch": 0.08, + "grad_norm": 0.7170384418801566, + "learning_rate": 4.972882524700709e-05, + "loss": 2.0825, + "step": 979 + }, + { + "epoch": 0.08, + "grad_norm": 0.7933813675003109, + "learning_rate": 4.972790690404591e-05, + "loss": 2.1797, + "step": 980 + }, + { + "epoch": 0.08, + "grad_norm": 0.6553433956107843, + "learning_rate": 4.972698701721989e-05, + "loss": 2.1613, + "step": 981 + }, + { + "epoch": 0.08, + "grad_norm": 0.6943969689560172, + "learning_rate": 4.9726065586586514e-05, + "loss": 1.9498, + "step": 982 + }, + { + "epoch": 0.08, + "grad_norm": 0.7959267362030705, + "learning_rate": 4.9725142612203265e-05, + "loss": 2.0123, + "step": 983 + }, + { + "epoch": 0.08, + "grad_norm": 0.6298556809262316, + "learning_rate": 4.972421809412781e-05, + "loss": 2.037, + "step": 984 + }, + { + "epoch": 0.08, + "grad_norm": 0.8254187216597044, + "learning_rate": 4.972329203241783e-05, + "loss": 2.2099, + "step": 985 + }, + { + "epoch": 0.08, + "grad_norm": 0.6321830946557987, + "learning_rate": 4.9722364427131175e-05, + "loss": 2.0229, + "step": 986 + }, + { + "epoch": 0.08, + "grad_norm": 0.7304206440517998, + "learning_rate": 4.972143527832575e-05, + "loss": 2.045, + "step": 987 + }, + { + "epoch": 0.08, + "grad_norm": 0.6197332727535526, + "learning_rate": 4.972050458605955e-05, + "loss": 2.0629, + "step": 988 + }, + { + "epoch": 0.08, + "grad_norm": 0.6639498186932975, + "learning_rate": 4.97195723503907e-05, + "loss": 2.2372, + "step": 989 + }, + { + "epoch": 0.08, + "grad_norm": 0.6580922492264214, + "learning_rate": 4.97186385713774e-05, + "loss": 2.0381, + "step": 990 + }, + { + "epoch": 0.08, + "grad_norm": 0.6623205429303454, + "learning_rate": 4.971770324907794e-05, + "loss": 2.0401, + "step": 991 + }, + { + "epoch": 0.08, + "grad_norm": 0.7211961374016485, + "learning_rate": 4.9716766383550734e-05, + "loss": 2.2275, + "step": 992 + }, + { + "epoch": 0.08, + "grad_norm": 0.7168788784949109, + "learning_rate": 4.9715827974854254e-05, + "loss": 2.0469, + "step": 993 + }, + { + "epoch": 0.08, + "grad_norm": 0.6104165856442945, + "learning_rate": 4.9714888023047104e-05, + "loss": 2.0707, + "step": 994 + }, + { + "epoch": 0.08, + "grad_norm": 0.7124639837637254, + "learning_rate": 4.971394652818796e-05, + "loss": 2.0214, + "step": 995 + }, + { + "epoch": 0.08, + "grad_norm": 0.7020182905430483, + "learning_rate": 4.9713003490335614e-05, + "loss": 2.0101, + "step": 996 + }, + { + "epoch": 0.08, + "grad_norm": 0.6275850417776819, + "learning_rate": 4.971205890954893e-05, + "loss": 2.2318, + "step": 997 + }, + { + "epoch": 0.08, + "grad_norm": 0.6901571679344544, + "learning_rate": 4.971111278588689e-05, + "loss": 1.9885, + "step": 998 + }, + { + "epoch": 0.08, + "grad_norm": 0.6518103261036434, + "learning_rate": 4.971016511940856e-05, + "loss": 2.0716, + "step": 999 + }, + { + "epoch": 0.08, + "grad_norm": 0.6234726733576571, + "learning_rate": 4.970921591017311e-05, + "loss": 2.0668, + "step": 1000 + }, + { + "epoch": 0.08, + "grad_norm": 0.661947058249578, + "learning_rate": 4.9708265158239806e-05, + "loss": 2.1665, + "step": 1001 + }, + { + "epoch": 0.08, + "grad_norm": 0.6664500077721862, + "learning_rate": 4.970731286366801e-05, + "loss": 1.9984, + "step": 1002 + }, + { + "epoch": 0.08, + "grad_norm": 0.7111451604744019, + "learning_rate": 4.970635902651716e-05, + "loss": 2.0101, + "step": 1003 + }, + { + "epoch": 0.08, + "grad_norm": 0.7097839952634879, + "learning_rate": 4.970540364684683e-05, + "loss": 2.2351, + "step": 1004 + }, + { + "epoch": 0.08, + "grad_norm": 0.6495305803555842, + "learning_rate": 4.9704446724716643e-05, + "loss": 2.0391, + "step": 1005 + }, + { + "epoch": 0.08, + "grad_norm": 0.6183726723553741, + "learning_rate": 4.970348826018637e-05, + "loss": 2.1098, + "step": 1006 + }, + { + "epoch": 0.08, + "grad_norm": 0.7008105243982548, + "learning_rate": 4.9702528253315846e-05, + "loss": 2.0138, + "step": 1007 + }, + { + "epoch": 0.08, + "grad_norm": 0.6131618126007959, + "learning_rate": 4.970156670416499e-05, + "loss": 2.0179, + "step": 1008 + }, + { + "epoch": 0.08, + "grad_norm": 0.7841514087228121, + "learning_rate": 4.9700603612793864e-05, + "loss": 2.1816, + "step": 1009 + }, + { + "epoch": 0.08, + "grad_norm": 0.7992658807814428, + "learning_rate": 4.969963897926256e-05, + "loss": 2.0223, + "step": 1010 + }, + { + "epoch": 0.08, + "grad_norm": 0.9131768884047065, + "learning_rate": 4.969867280363134e-05, + "loss": 2.0633, + "step": 1011 + }, + { + "epoch": 0.08, + "grad_norm": 0.6761429244910706, + "learning_rate": 4.969770508596051e-05, + "loss": 2.1207, + "step": 1012 + }, + { + "epoch": 0.08, + "grad_norm": 0.9580250948896384, + "learning_rate": 4.969673582631049e-05, + "loss": 2.1782, + "step": 1013 + }, + { + "epoch": 0.08, + "grad_norm": 0.9106560560904544, + "learning_rate": 4.9695765024741796e-05, + "loss": 2.0303, + "step": 1014 + }, + { + "epoch": 0.08, + "grad_norm": 0.7384008520053995, + "learning_rate": 4.969479268131504e-05, + "loss": 2.0221, + "step": 1015 + }, + { + "epoch": 0.08, + "grad_norm": 0.8385369069628335, + "learning_rate": 4.9693818796090927e-05, + "loss": 2.0236, + "step": 1016 + }, + { + "epoch": 0.08, + "grad_norm": 0.7889825737723688, + "learning_rate": 4.969284336913026e-05, + "loss": 2.252, + "step": 1017 + }, + { + "epoch": 0.08, + "grad_norm": 0.6910643882549516, + "learning_rate": 4.969186640049394e-05, + "loss": 2.044, + "step": 1018 + }, + { + "epoch": 0.08, + "grad_norm": 0.7188617974176045, + "learning_rate": 4.9690887890242964e-05, + "loss": 2.1338, + "step": 1019 + }, + { + "epoch": 0.08, + "grad_norm": 0.7531724394565169, + "learning_rate": 4.9689907838438435e-05, + "loss": 2.0073, + "step": 1020 + }, + { + "epoch": 0.08, + "grad_norm": 0.6445354880030325, + "learning_rate": 4.968892624514152e-05, + "loss": 2.2523, + "step": 1021 + }, + { + "epoch": 0.08, + "grad_norm": 0.7243309486864397, + "learning_rate": 4.9687943110413516e-05, + "loss": 1.9924, + "step": 1022 + }, + { + "epoch": 0.08, + "grad_norm": 0.7009964450659534, + "learning_rate": 4.9686958434315804e-05, + "loss": 2.0295, + "step": 1023 + }, + { + "epoch": 0.08, + "grad_norm": 0.707505304161887, + "learning_rate": 4.968597221690986e-05, + "loss": 2.218, + "step": 1024 + }, + { + "epoch": 0.08, + "grad_norm": 0.7355397671951441, + "learning_rate": 4.968498445825726e-05, + "loss": 2.1093, + "step": 1025 + }, + { + "epoch": 0.08, + "grad_norm": 0.6660555201393771, + "learning_rate": 4.968399515841967e-05, + "loss": 2.0032, + "step": 1026 + }, + { + "epoch": 0.08, + "grad_norm": 0.7212233879366329, + "learning_rate": 4.968300431745886e-05, + "loss": 1.991, + "step": 1027 + }, + { + "epoch": 0.08, + "grad_norm": 0.6601724424398042, + "learning_rate": 4.968201193543669e-05, + "loss": 2.0575, + "step": 1028 + }, + { + "epoch": 0.08, + "grad_norm": 0.7947924365624697, + "learning_rate": 4.9681018012415114e-05, + "loss": 2.1704, + "step": 1029 + }, + { + "epoch": 0.08, + "grad_norm": 0.6978287433527, + "learning_rate": 4.96800225484562e-05, + "loss": 2.0008, + "step": 1030 + }, + { + "epoch": 0.08, + "grad_norm": 0.659031799836682, + "learning_rate": 4.9679025543622085e-05, + "loss": 2.095, + "step": 1031 + }, + { + "epoch": 0.08, + "grad_norm": 0.6936100193493179, + "learning_rate": 4.967802699797501e-05, + "loss": 2.0245, + "step": 1032 + }, + { + "epoch": 0.08, + "grad_norm": 0.6863481693580329, + "learning_rate": 4.967702691157734e-05, + "loss": 2.2254, + "step": 1033 + }, + { + "epoch": 0.08, + "grad_norm": 0.8047739314973198, + "learning_rate": 4.967602528449151e-05, + "loss": 2.0325, + "step": 1034 + }, + { + "epoch": 0.08, + "grad_norm": 0.721649843661402, + "learning_rate": 4.967502211678005e-05, + "loss": 2.0201, + "step": 1035 + }, + { + "epoch": 0.08, + "grad_norm": 0.7442001868959572, + "learning_rate": 4.967401740850558e-05, + "loss": 2.2478, + "step": 1036 + }, + { + "epoch": 0.08, + "grad_norm": 0.6350403227026801, + "learning_rate": 4.9673011159730844e-05, + "loss": 2.0951, + "step": 1037 + }, + { + "epoch": 0.08, + "grad_norm": 0.6673287110831463, + "learning_rate": 4.967200337051867e-05, + "loss": 2.0471, + "step": 1038 + }, + { + "epoch": 0.08, + "grad_norm": 0.6992422278366218, + "learning_rate": 4.967099404093196e-05, + "loss": 1.9937, + "step": 1039 + }, + { + "epoch": 0.08, + "grad_norm": 0.6697359326786231, + "learning_rate": 4.9669983171033744e-05, + "loss": 2.0163, + "step": 1040 + }, + { + "epoch": 0.08, + "grad_norm": 0.7289081795903044, + "learning_rate": 4.9668970760887135e-05, + "loss": 2.2485, + "step": 1041 + }, + { + "epoch": 0.08, + "grad_norm": 0.6515271142029379, + "learning_rate": 4.966795681055534e-05, + "loss": 2.0277, + "step": 1042 + }, + { + "epoch": 0.08, + "grad_norm": 0.7645822991036333, + "learning_rate": 4.9666941320101655e-05, + "loss": 2.0732, + "step": 1043 + }, + { + "epoch": 0.08, + "grad_norm": 0.6673669356419604, + "learning_rate": 4.966592428958949e-05, + "loss": 2.0778, + "step": 1044 + }, + { + "epoch": 0.08, + "grad_norm": 0.8072304795674596, + "learning_rate": 4.966490571908235e-05, + "loss": 2.2673, + "step": 1045 + }, + { + "epoch": 0.08, + "grad_norm": 0.7547983630791196, + "learning_rate": 4.9663885608643815e-05, + "loss": 2.0216, + "step": 1046 + }, + { + "epoch": 0.08, + "grad_norm": 0.7585184589199094, + "learning_rate": 4.9662863958337577e-05, + "loss": 1.9981, + "step": 1047 + }, + { + "epoch": 0.08, + "grad_norm": 0.7120197062285855, + "learning_rate": 4.966184076822743e-05, + "loss": 2.067, + "step": 1048 + }, + { + "epoch": 0.08, + "grad_norm": 0.8317450870447444, + "learning_rate": 4.966081603837725e-05, + "loss": 2.2282, + "step": 1049 + }, + { + "epoch": 0.08, + "grad_norm": 0.7107590506437731, + "learning_rate": 4.965978976885102e-05, + "loss": 2.0561, + "step": 1050 + }, + { + "epoch": 0.08, + "grad_norm": 0.7189081973991424, + "learning_rate": 4.96587619597128e-05, + "loss": 2.0027, + "step": 1051 + }, + { + "epoch": 0.08, + "grad_norm": 0.731351418285113, + "learning_rate": 4.965773261102678e-05, + "loss": 2.0292, + "step": 1052 + }, + { + "epoch": 0.08, + "grad_norm": 0.8858160599144653, + "learning_rate": 4.9656701722857203e-05, + "loss": 2.2305, + "step": 1053 + }, + { + "epoch": 0.08, + "grad_norm": 0.7457616477189996, + "learning_rate": 4.965566929526845e-05, + "loss": 2.0441, + "step": 1054 + }, + { + "epoch": 0.08, + "grad_norm": 0.8863291314227609, + "learning_rate": 4.965463532832498e-05, + "loss": 2.0729, + "step": 1055 + }, + { + "epoch": 0.08, + "grad_norm": 0.7711512030297716, + "learning_rate": 4.9653599822091336e-05, + "loss": 2.0952, + "step": 1056 + }, + { + "epoch": 0.08, + "grad_norm": 0.9152845673629448, + "learning_rate": 4.9652562776632186e-05, + "loss": 2.1901, + "step": 1057 + }, + { + "epoch": 0.08, + "grad_norm": 0.8398253263202407, + "learning_rate": 4.9651524192012255e-05, + "loss": 2.0426, + "step": 1058 + }, + { + "epoch": 0.08, + "grad_norm": 0.8054376367957267, + "learning_rate": 4.96504840682964e-05, + "loss": 2.0301, + "step": 1059 + }, + { + "epoch": 0.08, + "grad_norm": 0.771070531991092, + "learning_rate": 4.9649442405549564e-05, + "loss": 2.0705, + "step": 1060 + }, + { + "epoch": 0.08, + "grad_norm": 0.784323862701141, + "learning_rate": 4.964839920383677e-05, + "loss": 2.2204, + "step": 1061 + }, + { + "epoch": 0.08, + "grad_norm": 0.6701921618757497, + "learning_rate": 4.964735446322316e-05, + "loss": 2.1203, + "step": 1062 + }, + { + "epoch": 0.08, + "grad_norm": 0.8005268162701557, + "learning_rate": 4.964630818377396e-05, + "loss": 2.0721, + "step": 1063 + }, + { + "epoch": 0.08, + "grad_norm": 0.736304255389527, + "learning_rate": 4.964526036555448e-05, + "loss": 2.0102, + "step": 1064 + }, + { + "epoch": 0.08, + "grad_norm": 0.7854911415831769, + "learning_rate": 4.964421100863016e-05, + "loss": 2.2211, + "step": 1065 + }, + { + "epoch": 0.08, + "grad_norm": 0.6912173675863236, + "learning_rate": 4.96431601130665e-05, + "loss": 2.03, + "step": 1066 + }, + { + "epoch": 0.08, + "grad_norm": 0.872388200871447, + "learning_rate": 4.964210767892912e-05, + "loss": 1.9564, + "step": 1067 + }, + { + "epoch": 0.08, + "grad_norm": 0.8173542354284669, + "learning_rate": 4.9641053706283725e-05, + "loss": 2.0624, + "step": 1068 + }, + { + "epoch": 0.08, + "grad_norm": 0.7466383694390366, + "learning_rate": 4.9639998195196114e-05, + "loss": 2.1901, + "step": 1069 + }, + { + "epoch": 0.08, + "grad_norm": 0.9602262245815072, + "learning_rate": 4.96389411457322e-05, + "loss": 2.0349, + "step": 1070 + }, + { + "epoch": 0.08, + "grad_norm": 0.7223897110466765, + "learning_rate": 4.963788255795797e-05, + "loss": 2.0303, + "step": 1071 + }, + { + "epoch": 0.08, + "grad_norm": 0.7714234423989593, + "learning_rate": 4.963682243193951e-05, + "loss": 2.0497, + "step": 1072 + }, + { + "epoch": 0.08, + "grad_norm": 0.8033783682063672, + "learning_rate": 4.963576076774302e-05, + "loss": 2.2201, + "step": 1073 + }, + { + "epoch": 0.08, + "grad_norm": 0.6824882474199132, + "learning_rate": 4.9634697565434777e-05, + "loss": 2.0842, + "step": 1074 + }, + { + "epoch": 0.08, + "grad_norm": 0.774489191820849, + "learning_rate": 4.9633632825081166e-05, + "loss": 2.0389, + "step": 1075 + }, + { + "epoch": 0.08, + "grad_norm": 0.7365024192543138, + "learning_rate": 4.963256654674866e-05, + "loss": 2.046, + "step": 1076 + }, + { + "epoch": 0.08, + "grad_norm": 0.841699067956948, + "learning_rate": 4.963149873050383e-05, + "loss": 2.1969, + "step": 1077 + }, + { + "epoch": 0.08, + "grad_norm": 0.6731114895007412, + "learning_rate": 4.963042937641335e-05, + "loss": 2.0196, + "step": 1078 + }, + { + "epoch": 0.08, + "grad_norm": 0.7205027172424567, + "learning_rate": 4.962935848454397e-05, + "loss": 1.9993, + "step": 1079 + }, + { + "epoch": 0.08, + "grad_norm": 0.8527946200197376, + "learning_rate": 4.962828605496256e-05, + "loss": 2.0074, + "step": 1080 + }, + { + "epoch": 0.08, + "grad_norm": 0.7465398101084108, + "learning_rate": 4.9627212087736084e-05, + "loss": 2.2432, + "step": 1081 + }, + { + "epoch": 0.08, + "grad_norm": 0.652738685562523, + "learning_rate": 4.962613658293158e-05, + "loss": 2.0646, + "step": 1082 + }, + { + "epoch": 0.08, + "grad_norm": 0.7861249464464168, + "learning_rate": 4.962505954061621e-05, + "loss": 2.0743, + "step": 1083 + }, + { + "epoch": 0.08, + "grad_norm": 0.708890666243947, + "learning_rate": 4.962398096085721e-05, + "loss": 1.968, + "step": 1084 + }, + { + "epoch": 0.08, + "grad_norm": 0.7674057481884768, + "learning_rate": 4.962290084372191e-05, + "loss": 2.1829, + "step": 1085 + }, + { + "epoch": 0.08, + "grad_norm": 0.748623140066446, + "learning_rate": 4.962181918927777e-05, + "loss": 1.9904, + "step": 1086 + }, + { + "epoch": 0.08, + "grad_norm": 0.6798828789939435, + "learning_rate": 4.962073599759231e-05, + "loss": 2.1426, + "step": 1087 + }, + { + "epoch": 0.08, + "grad_norm": 0.7301733815725464, + "learning_rate": 4.9619651268733144e-05, + "loss": 2.0079, + "step": 1088 + }, + { + "epoch": 0.08, + "grad_norm": 0.68003568559764, + "learning_rate": 4.961856500276801e-05, + "loss": 2.2387, + "step": 1089 + }, + { + "epoch": 0.08, + "grad_norm": 0.6953394064931132, + "learning_rate": 4.961747719976474e-05, + "loss": 2.037, + "step": 1090 + }, + { + "epoch": 0.08, + "grad_norm": 0.7265080326878989, + "learning_rate": 4.961638785979123e-05, + "loss": 2.0535, + "step": 1091 + }, + { + "epoch": 0.08, + "grad_norm": 0.6410283530926739, + "learning_rate": 4.961529698291549e-05, + "loss": 2.0077, + "step": 1092 + }, + { + "epoch": 0.08, + "grad_norm": 0.6587762743896434, + "learning_rate": 4.961420456920566e-05, + "loss": 2.2499, + "step": 1093 + }, + { + "epoch": 0.08, + "grad_norm": 0.7541415627246982, + "learning_rate": 4.96131106187299e-05, + "loss": 1.9918, + "step": 1094 + }, + { + "epoch": 0.08, + "grad_norm": 0.7346643627920064, + "learning_rate": 4.961201513155654e-05, + "loss": 2.0532, + "step": 1095 + }, + { + "epoch": 0.08, + "grad_norm": 0.6976891395832233, + "learning_rate": 4.961091810775397e-05, + "loss": 2.0397, + "step": 1096 + }, + { + "epoch": 0.08, + "grad_norm": 0.8527623584074553, + "learning_rate": 4.960981954739067e-05, + "loss": 2.2478, + "step": 1097 + }, + { + "epoch": 0.08, + "grad_norm": 0.7338474575772996, + "learning_rate": 4.9608719450535236e-05, + "loss": 2.0291, + "step": 1098 + }, + { + "epoch": 0.08, + "grad_norm": 0.8544896969740515, + "learning_rate": 4.960761781725636e-05, + "loss": 2.1128, + "step": 1099 + }, + { + "epoch": 0.08, + "grad_norm": 0.6634281152760301, + "learning_rate": 4.96065146476228e-05, + "loss": 2.0321, + "step": 1100 + }, + { + "epoch": 0.08, + "grad_norm": 0.7655626578824397, + "learning_rate": 4.9605409941703464e-05, + "loss": 2.2023, + "step": 1101 + }, + { + "epoch": 0.09, + "grad_norm": 0.6733480948442727, + "learning_rate": 4.9604303699567286e-05, + "loss": 2.0059, + "step": 1102 + }, + { + "epoch": 0.09, + "grad_norm": 0.7740564112818662, + "learning_rate": 4.960319592128336e-05, + "loss": 1.9949, + "step": 1103 + }, + { + "epoch": 0.09, + "grad_norm": 0.6874472132497218, + "learning_rate": 4.960208660692084e-05, + "loss": 2.0361, + "step": 1104 + }, + { + "epoch": 0.09, + "grad_norm": 0.7380994477730938, + "learning_rate": 4.960097575654898e-05, + "loss": 2.2632, + "step": 1105 + }, + { + "epoch": 0.09, + "grad_norm": 0.852312319985487, + "learning_rate": 4.9599863370237144e-05, + "loss": 2.0524, + "step": 1106 + }, + { + "epoch": 0.09, + "grad_norm": 0.6781156244779359, + "learning_rate": 4.959874944805478e-05, + "loss": 2.0221, + "step": 1107 + }, + { + "epoch": 0.09, + "grad_norm": 0.9680788472103373, + "learning_rate": 4.9597633990071426e-05, + "loss": 1.9844, + "step": 1108 + }, + { + "epoch": 0.09, + "grad_norm": 0.8531478478552657, + "learning_rate": 4.959651699635674e-05, + "loss": 2.1821, + "step": 1109 + }, + { + "epoch": 0.09, + "grad_norm": 0.770146150294283, + "learning_rate": 4.959539846698045e-05, + "loss": 2.0408, + "step": 1110 + }, + { + "epoch": 0.09, + "grad_norm": 0.8851849038283203, + "learning_rate": 4.9594278402012395e-05, + "loss": 2.0737, + "step": 1111 + }, + { + "epoch": 0.09, + "grad_norm": 0.7026094322141042, + "learning_rate": 4.95931568015225e-05, + "loss": 2.0638, + "step": 1112 + }, + { + "epoch": 0.09, + "grad_norm": 0.873459353307161, + "learning_rate": 4.95920336655808e-05, + "loss": 2.2284, + "step": 1113 + }, + { + "epoch": 0.09, + "grad_norm": 0.7931822077349445, + "learning_rate": 4.959090899425741e-05, + "loss": 2.0271, + "step": 1114 + }, + { + "epoch": 0.09, + "grad_norm": 0.729894161943249, + "learning_rate": 4.958978278762255e-05, + "loss": 2.0379, + "step": 1115 + }, + { + "epoch": 0.09, + "grad_norm": 0.9047779535473272, + "learning_rate": 4.9588655045746534e-05, + "loss": 2.0224, + "step": 1116 + }, + { + "epoch": 0.09, + "grad_norm": 0.7682985554428163, + "learning_rate": 4.958752576869977e-05, + "loss": 2.2411, + "step": 1117 + }, + { + "epoch": 0.09, + "grad_norm": 0.8241863918368432, + "learning_rate": 4.958639495655276e-05, + "loss": 2.0876, + "step": 1118 + }, + { + "epoch": 0.09, + "grad_norm": 0.9637923812568879, + "learning_rate": 4.958526260937611e-05, + "loss": 1.9851, + "step": 1119 + }, + { + "epoch": 0.09, + "grad_norm": 0.7392565034796879, + "learning_rate": 4.958412872724052e-05, + "loss": 2.0205, + "step": 1120 + }, + { + "epoch": 0.09, + "grad_norm": 1.0138962511071723, + "learning_rate": 4.9582993310216774e-05, + "loss": 2.2164, + "step": 1121 + }, + { + "epoch": 0.09, + "grad_norm": 0.667317663133804, + "learning_rate": 4.958185635837578e-05, + "loss": 2.023, + "step": 1122 + }, + { + "epoch": 0.09, + "grad_norm": 0.8238105349904448, + "learning_rate": 4.958071787178849e-05, + "loss": 2.0417, + "step": 1123 + }, + { + "epoch": 0.09, + "grad_norm": 0.7462304113404287, + "learning_rate": 4.9579577850526015e-05, + "loss": 2.0927, + "step": 1124 + }, + { + "epoch": 0.09, + "grad_norm": 0.7411967413119631, + "learning_rate": 4.957843629465951e-05, + "loss": 2.1775, + "step": 1125 + }, + { + "epoch": 0.09, + "grad_norm": 0.8957837918437386, + "learning_rate": 4.9577293204260265e-05, + "loss": 2.0178, + "step": 1126 + }, + { + "epoch": 0.09, + "grad_norm": 0.7530335422985333, + "learning_rate": 4.957614857939964e-05, + "loss": 2.0388, + "step": 1127 + }, + { + "epoch": 0.09, + "grad_norm": 0.6921811191967561, + "learning_rate": 4.9575002420149095e-05, + "loss": 2.059, + "step": 1128 + }, + { + "epoch": 0.09, + "grad_norm": 0.8618525567933475, + "learning_rate": 4.957385472658019e-05, + "loss": 2.2355, + "step": 1129 + }, + { + "epoch": 0.09, + "grad_norm": 0.6074498672199484, + "learning_rate": 4.957270549876459e-05, + "loss": 2.0668, + "step": 1130 + }, + { + "epoch": 0.09, + "grad_norm": 0.7810323804261484, + "learning_rate": 4.957155473677403e-05, + "loss": 2.0422, + "step": 1131 + }, + { + "epoch": 0.09, + "grad_norm": 0.6618596058067698, + "learning_rate": 4.957040244068038e-05, + "loss": 2.022, + "step": 1132 + }, + { + "epoch": 0.09, + "grad_norm": 0.7021726401038475, + "learning_rate": 4.956924861055555e-05, + "loss": 2.2346, + "step": 1133 + }, + { + "epoch": 0.09, + "grad_norm": 0.7303958528888959, + "learning_rate": 4.9568093246471603e-05, + "loss": 2.0402, + "step": 1134 + }, + { + "epoch": 0.09, + "grad_norm": 0.6797179770565074, + "learning_rate": 4.956693634850067e-05, + "loss": 2.0024, + "step": 1135 + }, + { + "epoch": 0.09, + "grad_norm": 0.7872261872073871, + "learning_rate": 4.956577791671497e-05, + "loss": 2.021, + "step": 1136 + }, + { + "epoch": 0.09, + "grad_norm": 0.9852649964706484, + "learning_rate": 4.956461795118684e-05, + "loss": 2.2566, + "step": 1137 + }, + { + "epoch": 0.09, + "grad_norm": 0.6945653714374064, + "learning_rate": 4.9563456451988694e-05, + "loss": 2.0682, + "step": 1138 + }, + { + "epoch": 0.09, + "grad_norm": 0.9116516976907111, + "learning_rate": 4.9562293419193054e-05, + "loss": 2.0412, + "step": 1139 + }, + { + "epoch": 0.09, + "grad_norm": 0.6666076139848338, + "learning_rate": 4.956112885287253e-05, + "loss": 2.0195, + "step": 1140 + }, + { + "epoch": 0.09, + "grad_norm": 0.9190447813968373, + "learning_rate": 4.9559962753099835e-05, + "loss": 2.1911, + "step": 1141 + }, + { + "epoch": 0.09, + "grad_norm": 0.8397680311380011, + "learning_rate": 4.955879511994778e-05, + "loss": 2.0148, + "step": 1142 + }, + { + "epoch": 0.09, + "grad_norm": 0.8182235791403191, + "learning_rate": 4.955762595348924e-05, + "loss": 2.0797, + "step": 1143 + }, + { + "epoch": 0.09, + "grad_norm": 0.9612082887683635, + "learning_rate": 4.955645525379723e-05, + "loss": 1.9672, + "step": 1144 + }, + { + "epoch": 0.09, + "grad_norm": 0.7293208653101693, + "learning_rate": 4.9555283020944844e-05, + "loss": 2.1946, + "step": 1145 + }, + { + "epoch": 0.09, + "grad_norm": 0.8979454429122249, + "learning_rate": 4.955410925500526e-05, + "loss": 2.0267, + "step": 1146 + }, + { + "epoch": 0.09, + "grad_norm": 0.8792669306041917, + "learning_rate": 4.955293395605176e-05, + "loss": 2.0246, + "step": 1147 + }, + { + "epoch": 0.09, + "grad_norm": 0.7634640272201217, + "learning_rate": 4.955175712415773e-05, + "loss": 2.0883, + "step": 1148 + }, + { + "epoch": 0.09, + "grad_norm": 0.8582898375311494, + "learning_rate": 4.955057875939664e-05, + "loss": 2.0636, + "step": 1149 + }, + { + "epoch": 0.09, + "grad_norm": 0.6254136814594533, + "learning_rate": 4.954939886184207e-05, + "loss": 2.2292, + "step": 1150 + }, + { + "epoch": 0.09, + "grad_norm": 0.841958801591601, + "learning_rate": 4.9548217431567665e-05, + "loss": 2.0125, + "step": 1151 + }, + { + "epoch": 0.09, + "grad_norm": 0.7482969379492537, + "learning_rate": 4.9547034468647214e-05, + "loss": 2.0535, + "step": 1152 + }, + { + "epoch": 0.09, + "grad_norm": 0.8930921303952899, + "learning_rate": 4.9545849973154544e-05, + "loss": 2.1956, + "step": 1153 + }, + { + "epoch": 0.09, + "grad_norm": 0.733474757353843, + "learning_rate": 4.954466394516364e-05, + "loss": 2.0234, + "step": 1154 + }, + { + "epoch": 0.09, + "grad_norm": 0.7105650425854, + "learning_rate": 4.954347638474852e-05, + "loss": 2.1365, + "step": 1155 + }, + { + "epoch": 0.09, + "grad_norm": 0.6912419971479322, + "learning_rate": 4.954228729198335e-05, + "loss": 2.0261, + "step": 1156 + }, + { + "epoch": 0.09, + "grad_norm": 0.6522808082938539, + "learning_rate": 4.954109666694235e-05, + "loss": 2.2071, + "step": 1157 + }, + { + "epoch": 0.09, + "grad_norm": 0.807711966635246, + "learning_rate": 4.953990450969989e-05, + "loss": 2.0349, + "step": 1158 + }, + { + "epoch": 0.09, + "grad_norm": 0.674344849250202, + "learning_rate": 4.953871082033036e-05, + "loss": 2.0277, + "step": 1159 + }, + { + "epoch": 0.09, + "grad_norm": 0.7286528576338047, + "learning_rate": 4.953751559890831e-05, + "loss": 2.0594, + "step": 1160 + }, + { + "epoch": 0.09, + "grad_norm": 0.7295389341607927, + "learning_rate": 4.953631884550836e-05, + "loss": 2.0911, + "step": 1161 + }, + { + "epoch": 0.09, + "grad_norm": 0.7102316954071105, + "learning_rate": 4.953512056020523e-05, + "loss": 2.2327, + "step": 1162 + }, + { + "epoch": 0.09, + "grad_norm": 0.6825915413046217, + "learning_rate": 4.9533920743073725e-05, + "loss": 1.9923, + "step": 1163 + }, + { + "epoch": 0.09, + "grad_norm": 0.737601873845265, + "learning_rate": 4.9532719394188764e-05, + "loss": 1.9946, + "step": 1164 + }, + { + "epoch": 0.09, + "grad_norm": 0.7283855328717732, + "learning_rate": 4.9531516513625345e-05, + "loss": 2.2244, + "step": 1165 + }, + { + "epoch": 0.09, + "grad_norm": 0.7405415974333289, + "learning_rate": 4.953031210145858e-05, + "loss": 2.0497, + "step": 1166 + }, + { + "epoch": 0.09, + "grad_norm": 0.6350718675278094, + "learning_rate": 4.952910615776365e-05, + "loss": 2.0202, + "step": 1167 + }, + { + "epoch": 0.09, + "grad_norm": 0.6472411950872328, + "learning_rate": 4.9527898682615855e-05, + "loss": 2.0428, + "step": 1168 + }, + { + "epoch": 0.09, + "grad_norm": 0.7030429959244919, + "learning_rate": 4.952668967609058e-05, + "loss": 2.0084, + "step": 1169 + }, + { + "epoch": 0.09, + "grad_norm": 0.7266715028967139, + "learning_rate": 4.952547913826332e-05, + "loss": 2.1906, + "step": 1170 + }, + { + "epoch": 0.09, + "grad_norm": 0.6679616917197104, + "learning_rate": 4.952426706920963e-05, + "loss": 2.042, + "step": 1171 + }, + { + "epoch": 0.09, + "grad_norm": 0.7009517210809939, + "learning_rate": 4.952305346900521e-05, + "loss": 2.0376, + "step": 1172 + }, + { + "epoch": 0.09, + "grad_norm": 0.7035991577881616, + "learning_rate": 4.9521838337725814e-05, + "loss": 2.1955, + "step": 1173 + }, + { + "epoch": 0.09, + "grad_norm": 0.6990689106248329, + "learning_rate": 4.952062167544731e-05, + "loss": 2.127, + "step": 1174 + }, + { + "epoch": 0.09, + "grad_norm": 0.7225767453801756, + "learning_rate": 4.9519403482245666e-05, + "loss": 1.968, + "step": 1175 + }, + { + "epoch": 0.09, + "grad_norm": 0.6317048137352662, + "learning_rate": 4.9518183758196935e-05, + "loss": 2.0196, + "step": 1176 + }, + { + "epoch": 0.09, + "grad_norm": 0.7585815970200795, + "learning_rate": 4.951696250337727e-05, + "loss": 2.2158, + "step": 1177 + }, + { + "epoch": 0.09, + "grad_norm": 0.6414203807282861, + "learning_rate": 4.951573971786291e-05, + "loss": 2.0449, + "step": 1178 + }, + { + "epoch": 0.09, + "grad_norm": 0.6676496320229492, + "learning_rate": 4.951451540173021e-05, + "loss": 1.9986, + "step": 1179 + }, + { + "epoch": 0.09, + "grad_norm": 0.7573460421372907, + "learning_rate": 4.951328955505561e-05, + "loss": 2.082, + "step": 1180 + }, + { + "epoch": 0.09, + "grad_norm": 0.648673490929942, + "learning_rate": 4.951206217791564e-05, + "loss": 2.0352, + "step": 1181 + }, + { + "epoch": 0.09, + "grad_norm": 0.8001062662861226, + "learning_rate": 4.951083327038693e-05, + "loss": 2.2163, + "step": 1182 + }, + { + "epoch": 0.09, + "grad_norm": 0.7891533039280032, + "learning_rate": 4.95096028325462e-05, + "loss": 2.0259, + "step": 1183 + }, + { + "epoch": 0.09, + "grad_norm": 0.6498202081550398, + "learning_rate": 4.950837086447028e-05, + "loss": 2.0162, + "step": 1184 + }, + { + "epoch": 0.09, + "grad_norm": 0.851512622034079, + "learning_rate": 4.950713736623608e-05, + "loss": 2.2041, + "step": 1185 + }, + { + "epoch": 0.09, + "grad_norm": 0.6593544477671972, + "learning_rate": 4.950590233792062e-05, + "loss": 2.1312, + "step": 1186 + }, + { + "epoch": 0.09, + "grad_norm": 0.9568269752686577, + "learning_rate": 4.9504665779601e-05, + "loss": 2.0109, + "step": 1187 + }, + { + "epoch": 0.09, + "grad_norm": 0.8149241106213242, + "learning_rate": 4.950342769135443e-05, + "loss": 2.0578, + "step": 1188 + }, + { + "epoch": 0.09, + "grad_norm": 0.7001180957111567, + "learning_rate": 4.9502188073258215e-05, + "loss": 2.2101, + "step": 1189 + }, + { + "epoch": 0.09, + "grad_norm": 0.8304629760826161, + "learning_rate": 4.9500946925389734e-05, + "loss": 2.0463, + "step": 1190 + }, + { + "epoch": 0.09, + "grad_norm": 0.6827463911640193, + "learning_rate": 4.9499704247826486e-05, + "loss": 1.97, + "step": 1191 + }, + { + "epoch": 0.09, + "grad_norm": 0.7748230181632485, + "learning_rate": 4.949846004064605e-05, + "loss": 2.0596, + "step": 1192 + }, + { + "epoch": 0.09, + "grad_norm": 0.7744869918003887, + "learning_rate": 4.949721430392612e-05, + "loss": 1.974, + "step": 1193 + }, + { + "epoch": 0.09, + "grad_norm": 0.7183663680162962, + "learning_rate": 4.949596703774445e-05, + "loss": 2.1727, + "step": 1194 + }, + { + "epoch": 0.09, + "grad_norm": 0.8112647422572766, + "learning_rate": 4.949471824217894e-05, + "loss": 2.0321, + "step": 1195 + }, + { + "epoch": 0.09, + "grad_norm": 0.7032185809406998, + "learning_rate": 4.9493467917307537e-05, + "loss": 2.0183, + "step": 1196 + }, + { + "epoch": 0.09, + "grad_norm": 0.9315650264259715, + "learning_rate": 4.9492216063208306e-05, + "loss": 2.2519, + "step": 1197 + }, + { + "epoch": 0.09, + "grad_norm": 0.8683266485685761, + "learning_rate": 4.949096267995942e-05, + "loss": 2.0268, + "step": 1198 + }, + { + "epoch": 0.09, + "grad_norm": 0.68553023361516, + "learning_rate": 4.948970776763911e-05, + "loss": 2.0655, + "step": 1199 + }, + { + "epoch": 0.09, + "grad_norm": 1.0074078112362301, + "learning_rate": 4.948845132632575e-05, + "loss": 1.9924, + "step": 1200 + }, + { + "epoch": 0.09, + "grad_norm": 0.6501335033968308, + "learning_rate": 4.9487193356097764e-05, + "loss": 2.0105, + "step": 1201 + }, + { + "epoch": 0.09, + "grad_norm": 0.8667444071589213, + "learning_rate": 4.9485933857033706e-05, + "loss": 2.2467, + "step": 1202 + }, + { + "epoch": 0.09, + "grad_norm": 0.9627874343059033, + "learning_rate": 4.948467282921221e-05, + "loss": 2.039, + "step": 1203 + }, + { + "epoch": 0.09, + "grad_norm": 0.6795434128030994, + "learning_rate": 4.9483410272712e-05, + "loss": 2.0446, + "step": 1204 + }, + { + "epoch": 0.09, + "grad_norm": 0.9108529482133553, + "learning_rate": 4.94821461876119e-05, + "loss": 2.0781, + "step": 1205 + }, + { + "epoch": 0.09, + "grad_norm": 0.7375020791839677, + "learning_rate": 4.948088057399084e-05, + "loss": 2.1952, + "step": 1206 + }, + { + "epoch": 0.09, + "grad_norm": 0.8337950335735858, + "learning_rate": 4.947961343192784e-05, + "loss": 2.0452, + "step": 1207 + }, + { + "epoch": 0.09, + "grad_norm": 0.7883459438546738, + "learning_rate": 4.947834476150201e-05, + "loss": 1.998, + "step": 1208 + }, + { + "epoch": 0.09, + "grad_norm": 0.7483379397419843, + "learning_rate": 4.947707456279256e-05, + "loss": 2.1965, + "step": 1209 + }, + { + "epoch": 0.09, + "grad_norm": 0.891124485706932, + "learning_rate": 4.947580283587878e-05, + "loss": 2.0035, + "step": 1210 + }, + { + "epoch": 0.09, + "grad_norm": 0.5938230666014205, + "learning_rate": 4.9474529580840085e-05, + "loss": 2.0849, + "step": 1211 + }, + { + "epoch": 0.09, + "grad_norm": 0.7832320198452103, + "learning_rate": 4.9473254797755966e-05, + "loss": 2.0377, + "step": 1212 + }, + { + "epoch": 0.09, + "grad_norm": 0.7340048474777493, + "learning_rate": 4.9471978486706e-05, + "loss": 1.9824, + "step": 1213 + }, + { + "epoch": 0.09, + "grad_norm": 0.6411054456039458, + "learning_rate": 4.9470700647769904e-05, + "loss": 2.1894, + "step": 1214 + }, + { + "epoch": 0.09, + "grad_norm": 0.6750707950154565, + "learning_rate": 4.946942128102743e-05, + "loss": 1.9877, + "step": 1215 + }, + { + "epoch": 0.09, + "grad_norm": 0.7192942317818203, + "learning_rate": 4.946814038655846e-05, + "loss": 2.0281, + "step": 1216 + }, + { + "epoch": 0.09, + "grad_norm": 0.6375629204243052, + "learning_rate": 4.9466857964442964e-05, + "loss": 2.0929, + "step": 1217 + }, + { + "epoch": 0.09, + "grad_norm": 0.6631151567361007, + "learning_rate": 4.946557401476102e-05, + "loss": 2.17, + "step": 1218 + }, + { + "epoch": 0.09, + "grad_norm": 0.7004568972614962, + "learning_rate": 4.946428853759278e-05, + "loss": 2.0372, + "step": 1219 + }, + { + "epoch": 0.09, + "grad_norm": 0.7233063679299834, + "learning_rate": 4.946300153301851e-05, + "loss": 2.006, + "step": 1220 + }, + { + "epoch": 0.09, + "grad_norm": 0.635386485366084, + "learning_rate": 4.946171300111855e-05, + "loss": 2.2403, + "step": 1221 + }, + { + "epoch": 0.09, + "grad_norm": 0.6471590731319148, + "learning_rate": 4.946042294197336e-05, + "loss": 1.993, + "step": 1222 + }, + { + "epoch": 0.09, + "grad_norm": 0.5973655684212206, + "learning_rate": 4.945913135566348e-05, + "loss": 2.0638, + "step": 1223 + }, + { + "epoch": 0.09, + "grad_norm": 0.6790031391159621, + "learning_rate": 4.9457838242269546e-05, + "loss": 2.021, + "step": 1224 + }, + { + "epoch": 0.09, + "grad_norm": 0.623255845180241, + "learning_rate": 4.94565436018723e-05, + "loss": 2.0466, + "step": 1225 + }, + { + "epoch": 0.09, + "grad_norm": 0.6395915558908302, + "learning_rate": 4.9455247434552556e-05, + "loss": 2.2179, + "step": 1226 + }, + { + "epoch": 0.09, + "grad_norm": 0.6497210081429029, + "learning_rate": 4.945394974039126e-05, + "loss": 2.0188, + "step": 1227 + }, + { + "epoch": 0.09, + "grad_norm": 0.6588346421717401, + "learning_rate": 4.945265051946942e-05, + "loss": 2.0069, + "step": 1228 + }, + { + "epoch": 0.09, + "grad_norm": 0.7119112460293314, + "learning_rate": 4.945134977186816e-05, + "loss": 2.2778, + "step": 1229 + }, + { + "epoch": 0.09, + "grad_norm": 0.57135677946212, + "learning_rate": 4.9450047497668676e-05, + "loss": 2.0541, + "step": 1230 + }, + { + "epoch": 0.09, + "grad_norm": 0.7191549598440876, + "learning_rate": 4.9448743696952286e-05, + "loss": 2.0439, + "step": 1231 + }, + { + "epoch": 0.1, + "grad_norm": 0.7157867781202208, + "learning_rate": 4.944743836980039e-05, + "loss": 2.035, + "step": 1232 + }, + { + "epoch": 0.1, + "grad_norm": 0.6393656820040531, + "learning_rate": 4.9446131516294485e-05, + "loss": 2.0361, + "step": 1233 + }, + { + "epoch": 0.1, + "grad_norm": 0.7531502674304072, + "learning_rate": 4.944482313651616e-05, + "loss": 2.2221, + "step": 1234 + }, + { + "epoch": 0.1, + "grad_norm": 0.6858199931976968, + "learning_rate": 4.944351323054711e-05, + "loss": 2.0587, + "step": 1235 + }, + { + "epoch": 0.1, + "grad_norm": 0.6473603601128464, + "learning_rate": 4.944220179846911e-05, + "loss": 2.1235, + "step": 1236 + }, + { + "epoch": 0.1, + "grad_norm": 0.6552420101560866, + "learning_rate": 4.944088884036404e-05, + "loss": 2.036, + "step": 1237 + }, + { + "epoch": 0.1, + "grad_norm": 0.6266792740480249, + "learning_rate": 4.9439574356313865e-05, + "loss": 2.2228, + "step": 1238 + }, + { + "epoch": 0.1, + "grad_norm": 0.7034071256095411, + "learning_rate": 4.943825834640068e-05, + "loss": 1.9987, + "step": 1239 + }, + { + "epoch": 0.1, + "grad_norm": 0.6827806777704185, + "learning_rate": 4.9436940810706615e-05, + "loss": 2.0333, + "step": 1240 + }, + { + "epoch": 0.1, + "grad_norm": 0.6347223248704427, + "learning_rate": 4.943562174931396e-05, + "loss": 2.1688, + "step": 1241 + }, + { + "epoch": 0.1, + "grad_norm": 0.576430732108288, + "learning_rate": 4.943430116230504e-05, + "loss": 2.0933, + "step": 1242 + }, + { + "epoch": 0.1, + "grad_norm": 0.689732601582731, + "learning_rate": 4.943297904976233e-05, + "loss": 2.0322, + "step": 1243 + }, + { + "epoch": 0.1, + "grad_norm": 0.6478854315264666, + "learning_rate": 4.9431655411768364e-05, + "loss": 2.0045, + "step": 1244 + }, + { + "epoch": 0.1, + "grad_norm": 0.6702488097472884, + "learning_rate": 4.9430330248405783e-05, + "loss": 2.0521, + "step": 1245 + }, + { + "epoch": 0.1, + "grad_norm": 0.6325607528456259, + "learning_rate": 4.942900355975732e-05, + "loss": 2.1882, + "step": 1246 + }, + { + "epoch": 0.1, + "grad_norm": 0.7121582656263283, + "learning_rate": 4.942767534590581e-05, + "loss": 2.0184, + "step": 1247 + }, + { + "epoch": 0.1, + "grad_norm": 0.7541101976557016, + "learning_rate": 4.9426345606934175e-05, + "loss": 2.0567, + "step": 1248 + }, + { + "epoch": 0.1, + "grad_norm": 0.6546008262859417, + "learning_rate": 4.9425014342925436e-05, + "loss": 2.0022, + "step": 1249 + }, + { + "epoch": 0.1, + "grad_norm": 0.6733489189131282, + "learning_rate": 4.942368155396271e-05, + "loss": 2.2083, + "step": 1250 + }, + { + "epoch": 0.1, + "grad_norm": 0.6980470194802821, + "learning_rate": 4.942234724012922e-05, + "loss": 1.9886, + "step": 1251 + }, + { + "epoch": 0.1, + "grad_norm": 0.6431429934939142, + "learning_rate": 4.942101140150826e-05, + "loss": 2.0319, + "step": 1252 + }, + { + "epoch": 0.1, + "grad_norm": 0.7025040199556309, + "learning_rate": 4.941967403818322e-05, + "loss": 2.1678, + "step": 1253 + }, + { + "epoch": 0.1, + "grad_norm": 0.6006639095021568, + "learning_rate": 4.941833515023763e-05, + "loss": 2.0953, + "step": 1254 + }, + { + "epoch": 0.1, + "grad_norm": 0.7116648130760442, + "learning_rate": 4.941699473775505e-05, + "loss": 2.0384, + "step": 1255 + }, + { + "epoch": 0.1, + "grad_norm": 0.6614144283823683, + "learning_rate": 4.9415652800819186e-05, + "loss": 2.0064, + "step": 1256 + }, + { + "epoch": 0.1, + "grad_norm": 0.7422982275451593, + "learning_rate": 4.941430933951381e-05, + "loss": 1.9888, + "step": 1257 + }, + { + "epoch": 0.1, + "grad_norm": 0.6830282249418417, + "learning_rate": 4.9412964353922814e-05, + "loss": 2.2137, + "step": 1258 + }, + { + "epoch": 0.1, + "grad_norm": 0.6865252342123821, + "learning_rate": 4.941161784413016e-05, + "loss": 2.0533, + "step": 1259 + }, + { + "epoch": 0.1, + "grad_norm": 0.6278576003474102, + "learning_rate": 4.941026981021992e-05, + "loss": 1.9389, + "step": 1260 + }, + { + "epoch": 0.1, + "grad_norm": 0.6387162366100033, + "learning_rate": 4.940892025227625e-05, + "loss": 2.0637, + "step": 1261 + }, + { + "epoch": 0.1, + "grad_norm": 0.643538801418585, + "learning_rate": 4.9407569170383415e-05, + "loss": 2.2375, + "step": 1262 + }, + { + "epoch": 0.1, + "grad_norm": 0.6586212775070299, + "learning_rate": 4.940621656462577e-05, + "loss": 1.9945, + "step": 1263 + }, + { + "epoch": 0.1, + "grad_norm": 0.6856604374609993, + "learning_rate": 4.9404862435087765e-05, + "loss": 2.018, + "step": 1264 + }, + { + "epoch": 0.1, + "grad_norm": 0.6498519515362073, + "learning_rate": 4.940350678185394e-05, + "loss": 2.0121, + "step": 1265 + }, + { + "epoch": 0.1, + "grad_norm": 0.78305577298325, + "learning_rate": 4.940214960500893e-05, + "loss": 2.2076, + "step": 1266 + }, + { + "epoch": 0.1, + "grad_norm": 0.8216115922378247, + "learning_rate": 4.940079090463747e-05, + "loss": 2.0733, + "step": 1267 + }, + { + "epoch": 0.1, + "grad_norm": 0.6509087305087776, + "learning_rate": 4.93994306808244e-05, + "loss": 2.0349, + "step": 1268 + }, + { + "epoch": 0.1, + "grad_norm": 0.7400337480556364, + "learning_rate": 4.939806893365464e-05, + "loss": 2.0341, + "step": 1269 + }, + { + "epoch": 0.1, + "grad_norm": 0.7059298938795996, + "learning_rate": 4.93967056632132e-05, + "loss": 2.2271, + "step": 1270 + }, + { + "epoch": 0.1, + "grad_norm": 0.7350343326413614, + "learning_rate": 4.939534086958521e-05, + "loss": 2.0629, + "step": 1271 + }, + { + "epoch": 0.1, + "grad_norm": 0.7837355257272042, + "learning_rate": 4.939397455285586e-05, + "loss": 2.0479, + "step": 1272 + }, + { + "epoch": 0.1, + "grad_norm": 0.6389854103590814, + "learning_rate": 4.939260671311047e-05, + "loss": 2.0584, + "step": 1273 + }, + { + "epoch": 0.1, + "grad_norm": 0.7361658984681914, + "learning_rate": 4.939123735043444e-05, + "loss": 2.18, + "step": 1274 + }, + { + "epoch": 0.1, + "grad_norm": 0.7460637094562477, + "learning_rate": 4.938986646491325e-05, + "loss": 2.0407, + "step": 1275 + }, + { + "epoch": 0.1, + "grad_norm": 0.6700816310591816, + "learning_rate": 4.938849405663251e-05, + "loss": 2.0328, + "step": 1276 + }, + { + "epoch": 0.1, + "grad_norm": 0.7239776831526815, + "learning_rate": 4.9387120125677885e-05, + "loss": 2.0594, + "step": 1277 + }, + { + "epoch": 0.1, + "grad_norm": 0.7791851613503944, + "learning_rate": 4.938574467213518e-05, + "loss": 2.2643, + "step": 1278 + }, + { + "epoch": 0.1, + "grad_norm": 0.6894871124071639, + "learning_rate": 4.938436769609025e-05, + "loss": 2.0878, + "step": 1279 + }, + { + "epoch": 0.1, + "grad_norm": 0.900650097317208, + "learning_rate": 4.938298919762907e-05, + "loss": 2.0164, + "step": 1280 + }, + { + "epoch": 0.1, + "grad_norm": 0.6615225793496733, + "learning_rate": 4.9381609176837704e-05, + "loss": 1.9921, + "step": 1281 + }, + { + "epoch": 0.1, + "grad_norm": 0.83935847146629, + "learning_rate": 4.938022763380232e-05, + "loss": 2.2264, + "step": 1282 + }, + { + "epoch": 0.1, + "grad_norm": 0.7050713243975505, + "learning_rate": 4.937884456860916e-05, + "loss": 1.9998, + "step": 1283 + }, + { + "epoch": 0.1, + "grad_norm": 0.6365375351547815, + "learning_rate": 4.93774599813446e-05, + "loss": 2.0232, + "step": 1284 + }, + { + "epoch": 0.1, + "grad_norm": 0.7283724354987484, + "learning_rate": 4.937607387209505e-05, + "loss": 1.9966, + "step": 1285 + }, + { + "epoch": 0.1, + "grad_norm": 0.7911837829366256, + "learning_rate": 4.9374686240947075e-05, + "loss": 2.2074, + "step": 1286 + }, + { + "epoch": 0.1, + "grad_norm": 0.675441652626894, + "learning_rate": 4.9373297087987305e-05, + "loss": 1.9843, + "step": 1287 + }, + { + "epoch": 0.1, + "grad_norm": 0.8690720557947171, + "learning_rate": 4.937190641330247e-05, + "loss": 2.0517, + "step": 1288 + }, + { + "epoch": 0.1, + "grad_norm": 0.661902555249874, + "learning_rate": 4.9370514216979405e-05, + "loss": 1.991, + "step": 1289 + }, + { + "epoch": 0.1, + "grad_norm": 0.8720815221336945, + "learning_rate": 4.9369120499105005e-05, + "loss": 2.1959, + "step": 1290 + }, + { + "epoch": 0.1, + "grad_norm": 0.8070080440683559, + "learning_rate": 4.936772525976631e-05, + "loss": 2.0054, + "step": 1291 + }, + { + "epoch": 0.1, + "grad_norm": 0.6766548776666609, + "learning_rate": 4.936632849905042e-05, + "loss": 2.0826, + "step": 1292 + }, + { + "epoch": 0.1, + "grad_norm": 0.7497710861780327, + "learning_rate": 4.9364930217044537e-05, + "loss": 1.9755, + "step": 1293 + }, + { + "epoch": 0.1, + "grad_norm": 0.6591328786451597, + "learning_rate": 4.9363530413835976e-05, + "loss": 2.1417, + "step": 1294 + }, + { + "epoch": 0.1, + "grad_norm": 0.8106457848170325, + "learning_rate": 4.9362129089512117e-05, + "loss": 2.0181, + "step": 1295 + }, + { + "epoch": 0.1, + "grad_norm": 0.6594006074696173, + "learning_rate": 4.936072624416046e-05, + "loss": 2.075, + "step": 1296 + }, + { + "epoch": 0.1, + "grad_norm": 0.7134499562293972, + "learning_rate": 4.935932187786858e-05, + "loss": 1.9969, + "step": 1297 + }, + { + "epoch": 0.1, + "grad_norm": 0.7296888770916595, + "learning_rate": 4.935791599072418e-05, + "loss": 2.2605, + "step": 1298 + }, + { + "epoch": 0.1, + "grad_norm": 0.6823858980290215, + "learning_rate": 4.935650858281501e-05, + "loss": 2.0379, + "step": 1299 + }, + { + "epoch": 0.1, + "grad_norm": 0.969012912987145, + "learning_rate": 4.9355099654228945e-05, + "loss": 2.0309, + "step": 1300 + }, + { + "epoch": 0.1, + "grad_norm": 0.6737537170271464, + "learning_rate": 4.935368920505396e-05, + "loss": 1.9931, + "step": 1301 + }, + { + "epoch": 0.1, + "grad_norm": 1.070565565169915, + "learning_rate": 4.935227723537811e-05, + "loss": 2.2284, + "step": 1302 + }, + { + "epoch": 0.1, + "grad_norm": 0.716436167594749, + "learning_rate": 4.935086374528955e-05, + "loss": 2.0107, + "step": 1303 + }, + { + "epoch": 0.1, + "grad_norm": 0.7197695695049768, + "learning_rate": 4.934944873487654e-05, + "loss": 2.0889, + "step": 1304 + }, + { + "epoch": 0.1, + "grad_norm": 0.8559310835590055, + "learning_rate": 4.9348032204227406e-05, + "loss": 2.0113, + "step": 1305 + }, + { + "epoch": 0.1, + "grad_norm": 0.6776819088013176, + "learning_rate": 4.9346614153430604e-05, + "loss": 2.1967, + "step": 1306 + }, + { + "epoch": 0.1, + "grad_norm": 0.8774642127341507, + "learning_rate": 4.934519458257466e-05, + "loss": 2.056, + "step": 1307 + }, + { + "epoch": 0.1, + "grad_norm": 0.6902292612230814, + "learning_rate": 4.93437734917482e-05, + "loss": 1.9993, + "step": 1308 + }, + { + "epoch": 0.1, + "grad_norm": 0.8858225906196834, + "learning_rate": 4.934235088103996e-05, + "loss": 2.029, + "step": 1309 + }, + { + "epoch": 0.1, + "grad_norm": 0.8221413089279395, + "learning_rate": 4.934092675053875e-05, + "loss": 2.2356, + "step": 1310 + }, + { + "epoch": 0.1, + "grad_norm": 0.7368355179078083, + "learning_rate": 4.9339501100333486e-05, + "loss": 2.0555, + "step": 1311 + }, + { + "epoch": 0.1, + "grad_norm": 0.9112225608741429, + "learning_rate": 4.933807393051318e-05, + "loss": 1.9948, + "step": 1312 + }, + { + "epoch": 0.1, + "grad_norm": 0.6719035614778086, + "learning_rate": 4.933664524116694e-05, + "loss": 1.982, + "step": 1313 + }, + { + "epoch": 0.1, + "grad_norm": 1.019917254334078, + "learning_rate": 4.933521503238395e-05, + "loss": 2.2106, + "step": 1314 + }, + { + "epoch": 0.1, + "grad_norm": 0.6554400597774033, + "learning_rate": 4.933378330425352e-05, + "loss": 1.9513, + "step": 1315 + }, + { + "epoch": 0.1, + "grad_norm": 0.7338500694121778, + "learning_rate": 4.933235005686504e-05, + "loss": 2.0033, + "step": 1316 + }, + { + "epoch": 0.1, + "grad_norm": 0.725630760965212, + "learning_rate": 4.933091529030798e-05, + "loss": 2.0887, + "step": 1317 + }, + { + "epoch": 0.1, + "grad_norm": 0.6708245820323802, + "learning_rate": 4.932947900467193e-05, + "loss": 2.1988, + "step": 1318 + }, + { + "epoch": 0.1, + "grad_norm": 0.7575120674643232, + "learning_rate": 4.932804120004655e-05, + "loss": 2.0417, + "step": 1319 + }, + { + "epoch": 0.1, + "grad_norm": 0.6758547717404063, + "learning_rate": 4.932660187652162e-05, + "loss": 1.9296, + "step": 1320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8203942084712488, + "learning_rate": 4.932516103418699e-05, + "loss": 2.1005, + "step": 1321 + }, + { + "epoch": 0.1, + "grad_norm": 0.7472642061522002, + "learning_rate": 4.932371867313264e-05, + "loss": 2.2003, + "step": 1322 + }, + { + "epoch": 0.1, + "grad_norm": 0.7215367806390354, + "learning_rate": 4.9322274793448595e-05, + "loss": 2.0837, + "step": 1323 + }, + { + "epoch": 0.1, + "grad_norm": 0.8281465675308213, + "learning_rate": 4.932082939522502e-05, + "loss": 2.0344, + "step": 1324 + }, + { + "epoch": 0.1, + "grad_norm": 0.6762818786916303, + "learning_rate": 4.931938247855216e-05, + "loss": 2.0468, + "step": 1325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7774336220447284, + "learning_rate": 4.931793404352034e-05, + "loss": 2.2376, + "step": 1326 + }, + { + "epoch": 0.1, + "grad_norm": 0.6929016053717274, + "learning_rate": 4.9316484090220004e-05, + "loss": 1.9959, + "step": 1327 + }, + { + "epoch": 0.1, + "grad_norm": 0.6620327025676058, + "learning_rate": 4.931503261874166e-05, + "loss": 2.0216, + "step": 1328 + }, + { + "epoch": 0.1, + "grad_norm": 0.6691610602222117, + "learning_rate": 4.9313579629175955e-05, + "loss": 2.0597, + "step": 1329 + }, + { + "epoch": 0.1, + "grad_norm": 0.6837742184898068, + "learning_rate": 4.931212512161358e-05, + "loss": 2.1811, + "step": 1330 + }, + { + "epoch": 0.1, + "grad_norm": 0.6989880840977692, + "learning_rate": 4.931066909614536e-05, + "loss": 2.0448, + "step": 1331 + }, + { + "epoch": 0.1, + "grad_norm": 0.7094737200333332, + "learning_rate": 4.93092115528622e-05, + "loss": 2.051, + "step": 1332 + }, + { + "epoch": 0.1, + "grad_norm": 0.6346523142759848, + "learning_rate": 4.9307752491855096e-05, + "loss": 1.9916, + "step": 1333 + }, + { + "epoch": 0.1, + "grad_norm": 0.6398644637391256, + "learning_rate": 4.930629191321514e-05, + "loss": 2.2564, + "step": 1334 + }, + { + "epoch": 0.1, + "grad_norm": 0.624015196136929, + "learning_rate": 4.9304829817033536e-05, + "loss": 2.0788, + "step": 1335 + }, + { + "epoch": 0.1, + "grad_norm": 0.7021210510783147, + "learning_rate": 4.930336620340156e-05, + "loss": 2.0178, + "step": 1336 + }, + { + "epoch": 0.1, + "grad_norm": 0.7179074690529313, + "learning_rate": 4.930190107241058e-05, + "loss": 2.036, + "step": 1337 + }, + { + "epoch": 0.1, + "grad_norm": 0.7221112935571207, + "learning_rate": 4.9300434424152093e-05, + "loss": 2.2437, + "step": 1338 + }, + { + "epoch": 0.1, + "grad_norm": 0.6790095362899412, + "learning_rate": 4.929896625871765e-05, + "loss": 2.0287, + "step": 1339 + }, + { + "epoch": 0.1, + "grad_norm": 0.6566081755875817, + "learning_rate": 4.929749657619893e-05, + "loss": 2.0673, + "step": 1340 + }, + { + "epoch": 0.1, + "grad_norm": 0.6720157735102602, + "learning_rate": 4.929602537668767e-05, + "loss": 2.1102, + "step": 1341 + }, + { + "epoch": 0.1, + "grad_norm": 0.6603505205475175, + "learning_rate": 4.929455266027574e-05, + "loss": 2.1874, + "step": 1342 + }, + { + "epoch": 0.1, + "grad_norm": 0.7339029255369434, + "learning_rate": 4.929307842705508e-05, + "loss": 1.9862, + "step": 1343 + }, + { + "epoch": 0.1, + "grad_norm": 0.6284914964558868, + "learning_rate": 4.9291602677117745e-05, + "loss": 1.9712, + "step": 1344 + }, + { + "epoch": 0.1, + "grad_norm": 0.6893421358684864, + "learning_rate": 4.929012541055585e-05, + "loss": 2.0037, + "step": 1345 + }, + { + "epoch": 0.1, + "grad_norm": 0.7258442383574379, + "learning_rate": 4.9288646627461645e-05, + "loss": 2.1871, + "step": 1346 + }, + { + "epoch": 0.1, + "grad_norm": 0.6574591573777764, + "learning_rate": 4.9287166327927455e-05, + "loss": 2.0176, + "step": 1347 + }, + { + "epoch": 0.1, + "grad_norm": 0.646917874666992, + "learning_rate": 4.928568451204569e-05, + "loss": 2.0946, + "step": 1348 + }, + { + "epoch": 0.1, + "grad_norm": 0.6867314058364945, + "learning_rate": 4.928420117990887e-05, + "loss": 1.959, + "step": 1349 + }, + { + "epoch": 0.1, + "grad_norm": 0.7302301828126002, + "learning_rate": 4.928271633160962e-05, + "loss": 2.2413, + "step": 1350 + }, + { + "epoch": 0.1, + "grad_norm": 0.8222059838322406, + "learning_rate": 4.928122996724063e-05, + "loss": 2.0345, + "step": 1351 + }, + { + "epoch": 0.1, + "grad_norm": 0.78961780159155, + "learning_rate": 4.92797420868947e-05, + "loss": 2.0132, + "step": 1352 + }, + { + "epoch": 0.1, + "grad_norm": 0.7620716258153973, + "learning_rate": 4.927825269066473e-05, + "loss": 2.0386, + "step": 1353 + }, + { + "epoch": 0.1, + "grad_norm": 0.673236701377961, + "learning_rate": 4.92767617786437e-05, + "loss": 2.2611, + "step": 1354 + }, + { + "epoch": 0.1, + "grad_norm": 0.7473519180921998, + "learning_rate": 4.927526935092471e-05, + "loss": 2.0074, + "step": 1355 + }, + { + "epoch": 0.1, + "grad_norm": 0.7455857263136154, + "learning_rate": 4.927377540760092e-05, + "loss": 2.0332, + "step": 1356 + }, + { + "epoch": 0.1, + "grad_norm": 0.6684308072191367, + "learning_rate": 4.927227994876562e-05, + "loss": 2.0005, + "step": 1357 + }, + { + "epoch": 0.1, + "grad_norm": 0.7833241949470409, + "learning_rate": 4.927078297451217e-05, + "loss": 2.2257, + "step": 1358 + }, + { + "epoch": 0.1, + "grad_norm": 0.6724717742866224, + "learning_rate": 4.926928448493403e-05, + "loss": 2.0041, + "step": 1359 + }, + { + "epoch": 0.1, + "grad_norm": 0.7586771626220096, + "learning_rate": 4.9267784480124755e-05, + "loss": 2.0622, + "step": 1360 + }, + { + "epoch": 0.1, + "grad_norm": 0.7074053227017949, + "learning_rate": 4.9266282960178006e-05, + "loss": 2.01, + "step": 1361 + }, + { + "epoch": 0.11, + "grad_norm": 0.6660321763606343, + "learning_rate": 4.9264779925187524e-05, + "loss": 2.1879, + "step": 1362 + }, + { + "epoch": 0.11, + "grad_norm": 0.7068494611590002, + "learning_rate": 4.926327537524714e-05, + "loss": 2.0601, + "step": 1363 + }, + { + "epoch": 0.11, + "grad_norm": 0.6912167034183743, + "learning_rate": 4.926176931045081e-05, + "loss": 2.0017, + "step": 1364 + }, + { + "epoch": 0.11, + "grad_norm": 0.7897086876024525, + "learning_rate": 4.926026173089254e-05, + "loss": 2.0025, + "step": 1365 + }, + { + "epoch": 0.11, + "grad_norm": 0.8577721037195298, + "learning_rate": 4.925875263666648e-05, + "loss": 2.0945, + "step": 1366 + }, + { + "epoch": 0.11, + "grad_norm": 0.6607077861922018, + "learning_rate": 4.925724202786682e-05, + "loss": 2.1742, + "step": 1367 + }, + { + "epoch": 0.11, + "grad_norm": 0.8266398648704273, + "learning_rate": 4.92557299045879e-05, + "loss": 1.9941, + "step": 1368 + }, + { + "epoch": 0.11, + "grad_norm": 0.7324747896933725, + "learning_rate": 4.9254216266924114e-05, + "loss": 2.0076, + "step": 1369 + }, + { + "epoch": 0.11, + "grad_norm": 0.7938501904482388, + "learning_rate": 4.925270111496997e-05, + "loss": 2.2218, + "step": 1370 + }, + { + "epoch": 0.11, + "grad_norm": 0.6470950059920718, + "learning_rate": 4.925118444882006e-05, + "loss": 2.0211, + "step": 1371 + }, + { + "epoch": 0.11, + "grad_norm": 0.7670667226921369, + "learning_rate": 4.9249666268569086e-05, + "loss": 2.1107, + "step": 1372 + }, + { + "epoch": 0.11, + "grad_norm": 0.7477371405276937, + "learning_rate": 4.924814657431182e-05, + "loss": 2.0049, + "step": 1373 + }, + { + "epoch": 0.11, + "grad_norm": 0.6385979306079431, + "learning_rate": 4.924662536614314e-05, + "loss": 2.1831, + "step": 1374 + }, + { + "epoch": 0.11, + "grad_norm": 0.6470327373095323, + "learning_rate": 4.9245102644158046e-05, + "loss": 1.9855, + "step": 1375 + }, + { + "epoch": 0.11, + "grad_norm": 0.7842674063117663, + "learning_rate": 4.924357840845158e-05, + "loss": 2.018, + "step": 1376 + }, + { + "epoch": 0.11, + "grad_norm": 0.6581355206258155, + "learning_rate": 4.9242052659118935e-05, + "loss": 1.9806, + "step": 1377 + }, + { + "epoch": 0.11, + "grad_norm": 0.7782053478836232, + "learning_rate": 4.924052539625534e-05, + "loss": 2.2263, + "step": 1378 + }, + { + "epoch": 0.11, + "grad_norm": 0.7030568808399806, + "learning_rate": 4.923899661995617e-05, + "loss": 2.0524, + "step": 1379 + }, + { + "epoch": 0.11, + "grad_norm": 0.8329068407379203, + "learning_rate": 4.923746633031686e-05, + "loss": 2.0124, + "step": 1380 + }, + { + "epoch": 0.11, + "grad_norm": 0.7735910987759003, + "learning_rate": 4.9235934527432956e-05, + "loss": 2.0257, + "step": 1381 + }, + { + "epoch": 0.11, + "grad_norm": 0.7666188703619063, + "learning_rate": 4.923440121140009e-05, + "loss": 2.1792, + "step": 1382 + }, + { + "epoch": 0.11, + "grad_norm": 0.7339166525749653, + "learning_rate": 4.9232866382314014e-05, + "loss": 1.9829, + "step": 1383 + }, + { + "epoch": 0.11, + "grad_norm": 0.7588350992874737, + "learning_rate": 4.923133004027053e-05, + "loss": 2.0198, + "step": 1384 + }, + { + "epoch": 0.11, + "grad_norm": 0.6586022950087436, + "learning_rate": 4.922979218536557e-05, + "loss": 2.0854, + "step": 1385 + }, + { + "epoch": 0.11, + "grad_norm": 0.7254282207275699, + "learning_rate": 4.922825281769514e-05, + "loss": 2.0266, + "step": 1386 + }, + { + "epoch": 0.11, + "grad_norm": 0.8126760418600476, + "learning_rate": 4.922671193735536e-05, + "loss": 2.226, + "step": 1387 + }, + { + "epoch": 0.11, + "grad_norm": 0.640145186003117, + "learning_rate": 4.922516954444243e-05, + "loss": 2.0387, + "step": 1388 + }, + { + "epoch": 0.11, + "grad_norm": 0.6627040198743919, + "learning_rate": 4.922362563905264e-05, + "loss": 2.0046, + "step": 1389 + }, + { + "epoch": 0.11, + "grad_norm": 0.7388868399071236, + "learning_rate": 4.9222080221282387e-05, + "loss": 2.1921, + "step": 1390 + }, + { + "epoch": 0.11, + "grad_norm": 0.7315726696665364, + "learning_rate": 4.922053329122817e-05, + "loss": 2.0809, + "step": 1391 + }, + { + "epoch": 0.11, + "grad_norm": 0.645971740884652, + "learning_rate": 4.9218984848986544e-05, + "loss": 2.0262, + "step": 1392 + }, + { + "epoch": 0.11, + "grad_norm": 0.7486732423768055, + "learning_rate": 4.921743489465421e-05, + "loss": 2.0282, + "step": 1393 + }, + { + "epoch": 0.11, + "grad_norm": 0.6517187734383602, + "learning_rate": 4.921588342832792e-05, + "loss": 2.2206, + "step": 1394 + }, + { + "epoch": 0.11, + "grad_norm": 0.6837180450301867, + "learning_rate": 4.921433045010455e-05, + "loss": 2.0042, + "step": 1395 + }, + { + "epoch": 0.11, + "grad_norm": 0.7033723554480398, + "learning_rate": 4.921277596008106e-05, + "loss": 2.0309, + "step": 1396 + }, + { + "epoch": 0.11, + "grad_norm": 0.6239111702741212, + "learning_rate": 4.9211219958354496e-05, + "loss": 2.0923, + "step": 1397 + }, + { + "epoch": 0.11, + "grad_norm": 0.6479867167526784, + "learning_rate": 4.9209662445022003e-05, + "loss": 2.0287, + "step": 1398 + }, + { + "epoch": 0.11, + "grad_norm": 0.6640194757746232, + "learning_rate": 4.9208103420180826e-05, + "loss": 2.2056, + "step": 1399 + }, + { + "epoch": 0.11, + "grad_norm": 0.7296763663717897, + "learning_rate": 4.9206542883928305e-05, + "loss": 1.984, + "step": 1400 + }, + { + "epoch": 0.11, + "grad_norm": 0.6617176948865363, + "learning_rate": 4.920498083636188e-05, + "loss": 2.0014, + "step": 1401 + }, + { + "epoch": 0.11, + "grad_norm": 0.8683525948404506, + "learning_rate": 4.920341727757906e-05, + "loss": 2.19, + "step": 1402 + }, + { + "epoch": 0.11, + "grad_norm": 0.5884740506352705, + "learning_rate": 4.920185220767746e-05, + "loss": 2.0792, + "step": 1403 + }, + { + "epoch": 0.11, + "grad_norm": 0.7442585642204066, + "learning_rate": 4.920028562675481e-05, + "loss": 2.0436, + "step": 1404 + }, + { + "epoch": 0.11, + "grad_norm": 0.622773421178493, + "learning_rate": 4.919871753490891e-05, + "loss": 1.9681, + "step": 1405 + }, + { + "epoch": 0.11, + "grad_norm": 0.6565300645157086, + "learning_rate": 4.9197147932237664e-05, + "loss": 2.204, + "step": 1406 + }, + { + "epoch": 0.11, + "grad_norm": 0.6632049598364192, + "learning_rate": 4.9195576818839064e-05, + "loss": 1.9777, + "step": 1407 + }, + { + "epoch": 0.11, + "grad_norm": 0.5882294410579186, + "learning_rate": 4.919400419481121e-05, + "loss": 2.0131, + "step": 1408 + }, + { + "epoch": 0.11, + "grad_norm": 0.6676869785017955, + "learning_rate": 4.919243006025228e-05, + "loss": 2.0167, + "step": 1409 + }, + { + "epoch": 0.11, + "grad_norm": 0.5893579848265511, + "learning_rate": 4.9190854415260556e-05, + "loss": 2.0163, + "step": 1410 + }, + { + "epoch": 0.11, + "grad_norm": 0.6268881006949795, + "learning_rate": 4.918927725993442e-05, + "loss": 2.2333, + "step": 1411 + }, + { + "epoch": 0.11, + "grad_norm": 0.6071670743541296, + "learning_rate": 4.918769859437232e-05, + "loss": 1.9968, + "step": 1412 + }, + { + "epoch": 0.11, + "grad_norm": 0.6715931927167169, + "learning_rate": 4.9186118418672844e-05, + "loss": 2.043, + "step": 1413 + }, + { + "epoch": 0.11, + "grad_norm": 0.739076979411059, + "learning_rate": 4.918453673293463e-05, + "loss": 2.2027, + "step": 1414 + }, + { + "epoch": 0.11, + "grad_norm": 0.6128681262178067, + "learning_rate": 4.9182953537256435e-05, + "loss": 1.9508, + "step": 1415 + }, + { + "epoch": 0.11, + "grad_norm": 0.6802465883244089, + "learning_rate": 4.91813688317371e-05, + "loss": 2.0659, + "step": 1416 + }, + { + "epoch": 0.11, + "grad_norm": 0.645269551867952, + "learning_rate": 4.917978261647558e-05, + "loss": 2.0554, + "step": 1417 + }, + { + "epoch": 0.11, + "grad_norm": 0.6956842109527737, + "learning_rate": 4.917819489157089e-05, + "loss": 2.0351, + "step": 1418 + }, + { + "epoch": 0.11, + "grad_norm": 0.6898499348151153, + "learning_rate": 4.917660565712216e-05, + "loss": 2.2164, + "step": 1419 + }, + { + "epoch": 0.11, + "grad_norm": 0.7366164056749555, + "learning_rate": 4.9175014913228634e-05, + "loss": 2.0809, + "step": 1420 + }, + { + "epoch": 0.11, + "grad_norm": 0.8131213082688976, + "learning_rate": 4.9173422659989607e-05, + "loss": 1.9771, + "step": 1421 + }, + { + "epoch": 0.11, + "grad_norm": 0.6265425338560084, + "learning_rate": 4.9171828897504495e-05, + "loss": 2.0688, + "step": 1422 + }, + { + "epoch": 0.11, + "grad_norm": 0.7843504494406331, + "learning_rate": 4.917023362587281e-05, + "loss": 2.2418, + "step": 1423 + }, + { + "epoch": 0.11, + "grad_norm": 0.6971086001904286, + "learning_rate": 4.916863684519414e-05, + "loss": 2.0333, + "step": 1424 + }, + { + "epoch": 0.11, + "grad_norm": 0.9228081100045981, + "learning_rate": 4.916703855556819e-05, + "loss": 2.0243, + "step": 1425 + }, + { + "epoch": 0.11, + "grad_norm": 0.8479742020759614, + "learning_rate": 4.916543875709473e-05, + "loss": 2.2056, + "step": 1426 + }, + { + "epoch": 0.11, + "grad_norm": 0.6598787123874069, + "learning_rate": 4.9163837449873665e-05, + "loss": 1.9996, + "step": 1427 + }, + { + "epoch": 0.11, + "grad_norm": 0.7174901121575203, + "learning_rate": 4.916223463400496e-05, + "loss": 2.0726, + "step": 1428 + }, + { + "epoch": 0.11, + "grad_norm": 0.6346016832985462, + "learning_rate": 4.9160630309588686e-05, + "loss": 2.0128, + "step": 1429 + }, + { + "epoch": 0.11, + "grad_norm": 0.6929303770388461, + "learning_rate": 4.9159024476725003e-05, + "loss": 1.9945, + "step": 1430 + }, + { + "epoch": 0.11, + "grad_norm": 0.6479037582618368, + "learning_rate": 4.9157417135514183e-05, + "loss": 2.2212, + "step": 1431 + }, + { + "epoch": 0.11, + "grad_norm": 0.7025884846329573, + "learning_rate": 4.915580828605656e-05, + "loss": 1.9632, + "step": 1432 + }, + { + "epoch": 0.11, + "grad_norm": 0.6762814430433215, + "learning_rate": 4.9154197928452596e-05, + "loss": 1.99, + "step": 1433 + }, + { + "epoch": 0.11, + "grad_norm": 0.717427271152316, + "learning_rate": 4.915258606280283e-05, + "loss": 2.2395, + "step": 1434 + }, + { + "epoch": 0.11, + "grad_norm": 0.7645438601476247, + "learning_rate": 4.915097268920789e-05, + "loss": 2.0659, + "step": 1435 + }, + { + "epoch": 0.11, + "grad_norm": 0.7774831376577979, + "learning_rate": 4.914935780776851e-05, + "loss": 2.0418, + "step": 1436 + }, + { + "epoch": 0.11, + "grad_norm": 0.6832070153104234, + "learning_rate": 4.914774141858551e-05, + "loss": 2.041, + "step": 1437 + }, + { + "epoch": 0.11, + "grad_norm": 0.6548544928602852, + "learning_rate": 4.9146123521759824e-05, + "loss": 2.1845, + "step": 1438 + }, + { + "epoch": 0.11, + "grad_norm": 0.7154612609640302, + "learning_rate": 4.9144504117392445e-05, + "loss": 2.0099, + "step": 1439 + }, + { + "epoch": 0.11, + "grad_norm": 0.6506253952011838, + "learning_rate": 4.9142883205584485e-05, + "loss": 2.018, + "step": 1440 + }, + { + "epoch": 0.11, + "grad_norm": 0.6677169009202621, + "learning_rate": 4.9141260786437146e-05, + "loss": 2.0833, + "step": 1441 + }, + { + "epoch": 0.11, + "grad_norm": 0.6340480992538535, + "learning_rate": 4.913963686005172e-05, + "loss": 2.0133, + "step": 1442 + }, + { + "epoch": 0.11, + "grad_norm": 0.7369112995239134, + "learning_rate": 4.913801142652959e-05, + "loss": 2.2134, + "step": 1443 + }, + { + "epoch": 0.11, + "grad_norm": 0.7153467636897861, + "learning_rate": 4.913638448597227e-05, + "loss": 1.969, + "step": 1444 + }, + { + "epoch": 0.11, + "grad_norm": 0.8534291650550823, + "learning_rate": 4.913475603848129e-05, + "loss": 2.0239, + "step": 1445 + }, + { + "epoch": 0.11, + "grad_norm": 0.8527012168738066, + "learning_rate": 4.9133126084158344e-05, + "loss": 2.1673, + "step": 1446 + }, + { + "epoch": 0.11, + "grad_norm": 0.6865184574954233, + "learning_rate": 4.9131494623105204e-05, + "loss": 2.0749, + "step": 1447 + }, + { + "epoch": 0.11, + "grad_norm": 0.7413853463198135, + "learning_rate": 4.912986165542371e-05, + "loss": 2.0312, + "step": 1448 + }, + { + "epoch": 0.11, + "grad_norm": 0.8907502999290932, + "learning_rate": 4.912822718121583e-05, + "loss": 1.9743, + "step": 1449 + }, + { + "epoch": 0.11, + "grad_norm": 0.62262153506801, + "learning_rate": 4.91265912005836e-05, + "loss": 1.9712, + "step": 1450 + }, + { + "epoch": 0.11, + "grad_norm": 0.773709139725493, + "learning_rate": 4.9124953713629174e-05, + "loss": 2.2038, + "step": 1451 + }, + { + "epoch": 0.11, + "grad_norm": 0.7225878136381105, + "learning_rate": 4.9123314720454786e-05, + "loss": 2.0483, + "step": 1452 + }, + { + "epoch": 0.11, + "grad_norm": 0.6895708392708253, + "learning_rate": 4.912167422116275e-05, + "loss": 2.0664, + "step": 1453 + }, + { + "epoch": 0.11, + "grad_norm": 0.7912068401804979, + "learning_rate": 4.9120032215855496e-05, + "loss": 2.0234, + "step": 1454 + }, + { + "epoch": 0.11, + "grad_norm": 0.7033156723715096, + "learning_rate": 4.911838870463554e-05, + "loss": 2.2253, + "step": 1455 + }, + { + "epoch": 0.11, + "grad_norm": 0.7522270336473421, + "learning_rate": 4.911674368760551e-05, + "loss": 2.0282, + "step": 1456 + }, + { + "epoch": 0.11, + "grad_norm": 0.6380176066217332, + "learning_rate": 4.911509716486809e-05, + "loss": 1.9703, + "step": 1457 + }, + { + "epoch": 0.11, + "grad_norm": 0.8132472574000443, + "learning_rate": 4.911344913652609e-05, + "loss": 2.2366, + "step": 1458 + }, + { + "epoch": 0.11, + "grad_norm": 0.8299108877896236, + "learning_rate": 4.91117996026824e-05, + "loss": 2.0935, + "step": 1459 + }, + { + "epoch": 0.11, + "grad_norm": 0.7045345257515145, + "learning_rate": 4.9110148563439995e-05, + "loss": 2.0464, + "step": 1460 + }, + { + "epoch": 0.11, + "grad_norm": 0.8264982560829094, + "learning_rate": 4.9108496018901975e-05, + "loss": 2.0132, + "step": 1461 + }, + { + "epoch": 0.11, + "grad_norm": 0.6347887224176082, + "learning_rate": 4.9106841969171515e-05, + "loss": 1.9418, + "step": 1462 + }, + { + "epoch": 0.11, + "grad_norm": 0.8228208649537577, + "learning_rate": 4.9105186414351876e-05, + "loss": 2.1814, + "step": 1463 + }, + { + "epoch": 0.11, + "grad_norm": 0.6852758464567472, + "learning_rate": 4.910352935454642e-05, + "loss": 1.9875, + "step": 1464 + }, + { + "epoch": 0.11, + "grad_norm": 0.6862275519756383, + "learning_rate": 4.91018707898586e-05, + "loss": 1.991, + "step": 1465 + }, + { + "epoch": 0.11, + "grad_norm": 0.6894498481353553, + "learning_rate": 4.910021072039198e-05, + "loss": 2.0766, + "step": 1466 + }, + { + "epoch": 0.11, + "grad_norm": 0.6406473091023128, + "learning_rate": 4.90985491462502e-05, + "loss": 2.1514, + "step": 1467 + }, + { + "epoch": 0.11, + "grad_norm": 0.7072110393949574, + "learning_rate": 4.9096886067536996e-05, + "loss": 1.9942, + "step": 1468 + }, + { + "epoch": 0.11, + "grad_norm": 0.6918591279678531, + "learning_rate": 4.90952214843562e-05, + "loss": 1.993, + "step": 1469 + }, + { + "epoch": 0.11, + "grad_norm": 0.663486934650943, + "learning_rate": 4.909355539681174e-05, + "loss": 2.032, + "step": 1470 + }, + { + "epoch": 0.11, + "grad_norm": 0.8722624576923758, + "learning_rate": 4.9091887805007646e-05, + "loss": 2.2325, + "step": 1471 + }, + { + "epoch": 0.11, + "grad_norm": 0.7249328263112711, + "learning_rate": 4.9090218709048016e-05, + "loss": 2.1094, + "step": 1472 + }, + { + "epoch": 0.11, + "grad_norm": 0.6907016675778929, + "learning_rate": 4.9088548109037075e-05, + "loss": 2.0002, + "step": 1473 + }, + { + "epoch": 0.11, + "grad_norm": 0.7851199550941093, + "learning_rate": 4.908687600507911e-05, + "loss": 2.0562, + "step": 1474 + }, + { + "epoch": 0.11, + "grad_norm": 0.6553147923177362, + "learning_rate": 4.908520239727853e-05, + "loss": 2.2066, + "step": 1475 + }, + { + "epoch": 0.11, + "grad_norm": 0.6799776154550105, + "learning_rate": 4.9083527285739815e-05, + "loss": 1.9599, + "step": 1476 + }, + { + "epoch": 0.11, + "grad_norm": 0.7863076043653596, + "learning_rate": 4.9081850670567556e-05, + "loss": 2.0393, + "step": 1477 + }, + { + "epoch": 0.11, + "grad_norm": 0.6203190189389729, + "learning_rate": 4.908017255186643e-05, + "loss": 2.0469, + "step": 1478 + }, + { + "epoch": 0.11, + "grad_norm": 0.7434557719813637, + "learning_rate": 4.907849292974121e-05, + "loss": 2.202, + "step": 1479 + }, + { + "epoch": 0.11, + "grad_norm": 0.766558274567434, + "learning_rate": 4.9076811804296755e-05, + "loss": 2.0041, + "step": 1480 + }, + { + "epoch": 0.11, + "grad_norm": 0.664942800646058, + "learning_rate": 4.907512917563802e-05, + "loss": 1.9843, + "step": 1481 + }, + { + "epoch": 0.11, + "grad_norm": 0.7293549543803105, + "learning_rate": 4.907344504387008e-05, + "loss": 2.0135, + "step": 1482 + }, + { + "epoch": 0.11, + "grad_norm": 0.656950957100537, + "learning_rate": 4.907175940909807e-05, + "loss": 2.2146, + "step": 1483 + }, + { + "epoch": 0.11, + "grad_norm": 0.643413637013205, + "learning_rate": 4.907007227142723e-05, + "loss": 2.0782, + "step": 1484 + }, + { + "epoch": 0.11, + "grad_norm": 0.6199355752174294, + "learning_rate": 4.906838363096289e-05, + "loss": 2.0068, + "step": 1485 + }, + { + "epoch": 0.11, + "grad_norm": 0.6248417641856743, + "learning_rate": 4.9066693487810486e-05, + "loss": 2.0399, + "step": 1486 + }, + { + "epoch": 0.11, + "grad_norm": 0.7214019985594473, + "learning_rate": 4.906500184207555e-05, + "loss": 2.2296, + "step": 1487 + }, + { + "epoch": 0.11, + "grad_norm": 0.638379935770456, + "learning_rate": 4.906330869386367e-05, + "loss": 1.9561, + "step": 1488 + }, + { + "epoch": 0.11, + "grad_norm": 0.6697773568148789, + "learning_rate": 4.906161404328059e-05, + "loss": 1.9864, + "step": 1489 + }, + { + "epoch": 0.11, + "grad_norm": 0.5780295510364761, + "learning_rate": 4.905991789043209e-05, + "loss": 2.0631, + "step": 1490 + }, + { + "epoch": 0.12, + "grad_norm": 0.6523151894995519, + "learning_rate": 4.905822023542407e-05, + "loss": 2.2063, + "step": 1491 + }, + { + "epoch": 0.12, + "grad_norm": 0.6405427837066013, + "learning_rate": 4.905652107836254e-05, + "loss": 2.0093, + "step": 1492 + }, + { + "epoch": 0.12, + "grad_norm": 0.7534136238793245, + "learning_rate": 4.905482041935356e-05, + "loss": 2.0011, + "step": 1493 + }, + { + "epoch": 0.12, + "grad_norm": 0.670397597019326, + "learning_rate": 4.905311825850333e-05, + "loss": 2.0301, + "step": 1494 + }, + { + "epoch": 0.12, + "grad_norm": 0.7016625641974874, + "learning_rate": 4.905141459591811e-05, + "loss": 2.1791, + "step": 1495 + }, + { + "epoch": 0.12, + "grad_norm": 0.7376418064013709, + "learning_rate": 4.904970943170427e-05, + "loss": 2.0212, + "step": 1496 + }, + { + "epoch": 0.12, + "grad_norm": 0.6303882548001669, + "learning_rate": 4.904800276596828e-05, + "loss": 2.0689, + "step": 1497 + }, + { + "epoch": 0.12, + "grad_norm": 0.750003663635683, + "learning_rate": 4.904629459881668e-05, + "loss": 2.05, + "step": 1498 + }, + { + "epoch": 0.12, + "grad_norm": 0.7356302381575069, + "learning_rate": 4.904458493035612e-05, + "loss": 2.2429, + "step": 1499 + }, + { + "epoch": 0.12, + "grad_norm": 0.6686318271181354, + "learning_rate": 4.9042873760693345e-05, + "loss": 1.9609, + "step": 1500 + }, + { + "epoch": 0.12, + "grad_norm": 0.7809777811436641, + "learning_rate": 4.9041161089935194e-05, + "loss": 2.01, + "step": 1501 + }, + { + "epoch": 0.12, + "grad_norm": 0.6455630936021807, + "learning_rate": 4.903944691818859e-05, + "loss": 1.9877, + "step": 1502 + }, + { + "epoch": 0.12, + "grad_norm": 0.7342726700723904, + "learning_rate": 4.903773124556057e-05, + "loss": 2.222, + "step": 1503 + }, + { + "epoch": 0.12, + "grad_norm": 0.7438400129643592, + "learning_rate": 4.903601407215822e-05, + "loss": 2.0578, + "step": 1504 + }, + { + "epoch": 0.12, + "grad_norm": 0.6225040135293964, + "learning_rate": 4.903429539808878e-05, + "loss": 1.988, + "step": 1505 + }, + { + "epoch": 0.12, + "grad_norm": 0.7388586794179391, + "learning_rate": 4.903257522345954e-05, + "loss": 2.0275, + "step": 1506 + }, + { + "epoch": 0.12, + "grad_norm": 0.676428974597801, + "learning_rate": 4.903085354837791e-05, + "loss": 2.1751, + "step": 1507 + }, + { + "epoch": 0.12, + "grad_norm": 0.6581480506518382, + "learning_rate": 4.902913037295136e-05, + "loss": 1.9707, + "step": 1508 + }, + { + "epoch": 0.12, + "grad_norm": 0.7250774251461055, + "learning_rate": 4.9027405697287495e-05, + "loss": 2.0678, + "step": 1509 + }, + { + "epoch": 0.12, + "grad_norm": 0.756225713435815, + "learning_rate": 4.902567952149398e-05, + "loss": 1.9985, + "step": 1510 + }, + { + "epoch": 0.12, + "grad_norm": 0.6909473493638949, + "learning_rate": 4.902395184567859e-05, + "loss": 2.1961, + "step": 1511 + }, + { + "epoch": 0.12, + "grad_norm": 0.6870684254919689, + "learning_rate": 4.9022222669949197e-05, + "loss": 2.0012, + "step": 1512 + }, + { + "epoch": 0.12, + "grad_norm": 0.675522815320918, + "learning_rate": 4.902049199441376e-05, + "loss": 1.9893, + "step": 1513 + }, + { + "epoch": 0.12, + "grad_norm": 0.642581920324174, + "learning_rate": 4.901875981918033e-05, + "loss": 2.0157, + "step": 1514 + }, + { + "epoch": 0.12, + "grad_norm": 0.6328546304367614, + "learning_rate": 4.901702614435704e-05, + "loss": 2.2229, + "step": 1515 + }, + { + "epoch": 0.12, + "grad_norm": 0.6384657471304721, + "learning_rate": 4.9015290970052154e-05, + "loss": 2.05, + "step": 1516 + }, + { + "epoch": 0.12, + "grad_norm": 0.7481845841743291, + "learning_rate": 4.9013554296374e-05, + "loss": 2.019, + "step": 1517 + }, + { + "epoch": 0.12, + "grad_norm": 0.6071072945800561, + "learning_rate": 4.9011816123430984e-05, + "loss": 2.0117, + "step": 1518 + }, + { + "epoch": 0.12, + "grad_norm": 0.7733484827537251, + "learning_rate": 4.901007645133166e-05, + "loss": 2.2272, + "step": 1519 + }, + { + "epoch": 0.12, + "grad_norm": 0.7191250076478276, + "learning_rate": 4.900833528018463e-05, + "loss": 2.0274, + "step": 1520 + }, + { + "epoch": 0.12, + "grad_norm": 0.7835381137027306, + "learning_rate": 4.9006592610098585e-05, + "loss": 2.0886, + "step": 1521 + }, + { + "epoch": 0.12, + "grad_norm": 0.8706074794599856, + "learning_rate": 4.900484844118235e-05, + "loss": 2.013, + "step": 1522 + }, + { + "epoch": 0.12, + "grad_norm": 0.728879837046103, + "learning_rate": 4.9003102773544806e-05, + "loss": 2.2453, + "step": 1523 + }, + { + "epoch": 0.12, + "grad_norm": 0.9895335942455465, + "learning_rate": 4.900135560729496e-05, + "loss": 2.0735, + "step": 1524 + }, + { + "epoch": 0.12, + "grad_norm": 0.6775610701308648, + "learning_rate": 4.8999606942541876e-05, + "loss": 2.0092, + "step": 1525 + }, + { + "epoch": 0.12, + "grad_norm": 0.8945486850427139, + "learning_rate": 4.899785677939474e-05, + "loss": 2.0308, + "step": 1526 + }, + { + "epoch": 0.12, + "grad_norm": 0.7183996339804891, + "learning_rate": 4.899610511796282e-05, + "loss": 2.1864, + "step": 1527 + }, + { + "epoch": 0.12, + "grad_norm": 0.6335110279820241, + "learning_rate": 4.899435195835548e-05, + "loss": 2.0696, + "step": 1528 + }, + { + "epoch": 0.12, + "grad_norm": 0.7512250537838179, + "learning_rate": 4.899259730068217e-05, + "loss": 2.025, + "step": 1529 + }, + { + "epoch": 0.12, + "grad_norm": 0.6574494636396642, + "learning_rate": 4.8990841145052445e-05, + "loss": 2.0099, + "step": 1530 + }, + { + "epoch": 0.12, + "grad_norm": 0.8200064744820464, + "learning_rate": 4.898908349157596e-05, + "loss": 2.1645, + "step": 1531 + }, + { + "epoch": 0.12, + "grad_norm": 0.8586679926042361, + "learning_rate": 4.898732434036244e-05, + "loss": 2.0106, + "step": 1532 + }, + { + "epoch": 0.12, + "grad_norm": 0.6652020727271293, + "learning_rate": 4.8985563691521716e-05, + "loss": 2.0548, + "step": 1533 + }, + { + "epoch": 0.12, + "grad_norm": 1.02539381082077, + "learning_rate": 4.898380154516371e-05, + "loss": 2.0734, + "step": 1534 + }, + { + "epoch": 0.12, + "grad_norm": 0.8656481895243578, + "learning_rate": 4.898203790139846e-05, + "loss": 2.196, + "step": 1535 + }, + { + "epoch": 0.12, + "grad_norm": 0.8556766210930827, + "learning_rate": 4.8980272760336055e-05, + "loss": 2.0445, + "step": 1536 + }, + { + "epoch": 0.12, + "grad_norm": 0.9982375532611854, + "learning_rate": 4.897850612208671e-05, + "loss": 2.0179, + "step": 1537 + }, + { + "epoch": 0.12, + "grad_norm": 0.7142998302345301, + "learning_rate": 4.8976737986760725e-05, + "loss": 2.0623, + "step": 1538 + }, + { + "epoch": 0.12, + "grad_norm": 1.0613700319277675, + "learning_rate": 4.897496835446847e-05, + "loss": 2.2131, + "step": 1539 + }, + { + "epoch": 0.12, + "grad_norm": 0.6915873793323681, + "learning_rate": 4.8973197225320473e-05, + "loss": 2.1061, + "step": 1540 + }, + { + "epoch": 0.12, + "grad_norm": 0.9137395221800345, + "learning_rate": 4.897142459942728e-05, + "loss": 1.9875, + "step": 1541 + }, + { + "epoch": 0.12, + "grad_norm": 0.8664656178549999, + "learning_rate": 4.896965047689957e-05, + "loss": 1.9654, + "step": 1542 + }, + { + "epoch": 0.12, + "grad_norm": 0.8136781489277275, + "learning_rate": 4.896787485784811e-05, + "loss": 2.1726, + "step": 1543 + }, + { + "epoch": 0.12, + "grad_norm": 1.0280438076490326, + "learning_rate": 4.8966097742383765e-05, + "loss": 2.02, + "step": 1544 + }, + { + "epoch": 0.12, + "grad_norm": 0.7813941109639757, + "learning_rate": 4.8964319130617475e-05, + "loss": 1.9389, + "step": 1545 + }, + { + "epoch": 0.12, + "grad_norm": 1.035533366829317, + "learning_rate": 4.8962539022660304e-05, + "loss": 2.0627, + "step": 1546 + }, + { + "epoch": 0.12, + "grad_norm": 0.7602538631759572, + "learning_rate": 4.896075741862338e-05, + "loss": 2.1579, + "step": 1547 + }, + { + "epoch": 0.12, + "grad_norm": 0.789342306532993, + "learning_rate": 4.895897431861793e-05, + "loss": 1.9886, + "step": 1548 + }, + { + "epoch": 0.12, + "grad_norm": 0.9898172957004133, + "learning_rate": 4.8957189722755293e-05, + "loss": 1.9715, + "step": 1549 + }, + { + "epoch": 0.12, + "grad_norm": 0.7029873685542755, + "learning_rate": 4.895540363114688e-05, + "loss": 1.9931, + "step": 1550 + }, + { + "epoch": 0.12, + "grad_norm": 0.8891478307082431, + "learning_rate": 4.895361604390421e-05, + "loss": 2.2425, + "step": 1551 + }, + { + "epoch": 0.12, + "grad_norm": 0.7836904109212032, + "learning_rate": 4.895182696113888e-05, + "loss": 2.0798, + "step": 1552 + }, + { + "epoch": 0.12, + "grad_norm": 0.7116327861932176, + "learning_rate": 4.89500363829626e-05, + "loss": 1.9739, + "step": 1553 + }, + { + "epoch": 0.12, + "grad_norm": 0.9542128255001108, + "learning_rate": 4.894824430948716e-05, + "loss": 1.9707, + "step": 1554 + }, + { + "epoch": 0.12, + "grad_norm": 0.7535971870968055, + "learning_rate": 4.894645074082445e-05, + "loss": 2.1819, + "step": 1555 + }, + { + "epoch": 0.12, + "grad_norm": 0.8475963819539005, + "learning_rate": 4.8944655677086435e-05, + "loss": 2.028, + "step": 1556 + }, + { + "epoch": 0.12, + "grad_norm": 0.8359139022070011, + "learning_rate": 4.894285911838521e-05, + "loss": 2.032, + "step": 1557 + }, + { + "epoch": 0.12, + "grad_norm": 0.6385252085821237, + "learning_rate": 4.894106106483292e-05, + "loss": 2.017, + "step": 1558 + }, + { + "epoch": 0.12, + "grad_norm": 0.8793057892398624, + "learning_rate": 4.893926151654185e-05, + "loss": 2.229, + "step": 1559 + }, + { + "epoch": 0.12, + "grad_norm": 0.6070047003070554, + "learning_rate": 4.893746047362432e-05, + "loss": 2.0172, + "step": 1560 + }, + { + "epoch": 0.12, + "grad_norm": 0.8313779730718615, + "learning_rate": 4.893565793619281e-05, + "loss": 2.0286, + "step": 1561 + }, + { + "epoch": 0.12, + "grad_norm": 0.6392284320226778, + "learning_rate": 4.893385390435984e-05, + "loss": 1.9973, + "step": 1562 + }, + { + "epoch": 0.12, + "grad_norm": 0.7276386086022384, + "learning_rate": 4.8932048378238046e-05, + "loss": 2.1987, + "step": 1563 + }, + { + "epoch": 0.12, + "grad_norm": 0.7078267249440373, + "learning_rate": 4.893024135794015e-05, + "loss": 2.0001, + "step": 1564 + }, + { + "epoch": 0.12, + "grad_norm": 0.5761364664713644, + "learning_rate": 4.8928432843578996e-05, + "loss": 2.0929, + "step": 1565 + }, + { + "epoch": 0.12, + "grad_norm": 0.694638786286016, + "learning_rate": 4.8926622835267463e-05, + "loss": 2.0052, + "step": 1566 + }, + { + "epoch": 0.12, + "grad_norm": 0.602180231384444, + "learning_rate": 4.892481133311858e-05, + "loss": 2.1902, + "step": 1567 + }, + { + "epoch": 0.12, + "grad_norm": 0.6291354898008008, + "learning_rate": 4.892299833724544e-05, + "loss": 1.9592, + "step": 1568 + }, + { + "epoch": 0.12, + "grad_norm": 0.666561263907537, + "learning_rate": 4.892118384776124e-05, + "loss": 1.9886, + "step": 1569 + }, + { + "epoch": 0.12, + "grad_norm": 0.7256364950114741, + "learning_rate": 4.891936786477925e-05, + "loss": 2.0393, + "step": 1570 + }, + { + "epoch": 0.12, + "grad_norm": 0.6332533624869824, + "learning_rate": 4.891755038841288e-05, + "loss": 2.2497, + "step": 1571 + }, + { + "epoch": 0.12, + "grad_norm": 0.7873463478785132, + "learning_rate": 4.891573141877557e-05, + "loss": 2.0263, + "step": 1572 + }, + { + "epoch": 0.12, + "grad_norm": 0.6103041586379244, + "learning_rate": 4.891391095598089e-05, + "loss": 2.0066, + "step": 1573 + }, + { + "epoch": 0.12, + "grad_norm": 0.9432593701523323, + "learning_rate": 4.8912089000142524e-05, + "loss": 2.0334, + "step": 1574 + }, + { + "epoch": 0.12, + "grad_norm": 0.6640726176650178, + "learning_rate": 4.89102655513742e-05, + "loss": 2.2272, + "step": 1575 + }, + { + "epoch": 0.12, + "grad_norm": 0.6726939452587867, + "learning_rate": 4.8908440609789776e-05, + "loss": 2.045, + "step": 1576 + }, + { + "epoch": 0.12, + "grad_norm": 0.7856395934724953, + "learning_rate": 4.890661417550319e-05, + "loss": 2.0774, + "step": 1577 + }, + { + "epoch": 0.12, + "grad_norm": 0.615826528286066, + "learning_rate": 4.8904786248628464e-05, + "loss": 2.0443, + "step": 1578 + }, + { + "epoch": 0.12, + "grad_norm": 0.8122669988640299, + "learning_rate": 4.8902956829279735e-05, + "loss": 2.2142, + "step": 1579 + }, + { + "epoch": 0.12, + "grad_norm": 0.7039585237719892, + "learning_rate": 4.890112591757121e-05, + "loss": 1.9933, + "step": 1580 + }, + { + "epoch": 0.12, + "grad_norm": 0.7762731046148803, + "learning_rate": 4.889929351361721e-05, + "loss": 2.0444, + "step": 1581 + }, + { + "epoch": 0.12, + "grad_norm": 0.7212878374222162, + "learning_rate": 4.889745961753213e-05, + "loss": 2.012, + "step": 1582 + }, + { + "epoch": 0.12, + "grad_norm": 0.634371596967165, + "learning_rate": 4.889562422943047e-05, + "loss": 2.24, + "step": 1583 + }, + { + "epoch": 0.12, + "grad_norm": 0.6477497512795606, + "learning_rate": 4.8893787349426834e-05, + "loss": 2.0603, + "step": 1584 + }, + { + "epoch": 0.12, + "grad_norm": 0.6675986346829489, + "learning_rate": 4.889194897763589e-05, + "loss": 2.0018, + "step": 1585 + }, + { + "epoch": 0.12, + "grad_norm": 0.6876871842471952, + "learning_rate": 4.8890109114172424e-05, + "loss": 1.9922, + "step": 1586 + }, + { + "epoch": 0.12, + "grad_norm": 0.7691590454992162, + "learning_rate": 4.8888267759151294e-05, + "loss": 2.1962, + "step": 1587 + }, + { + "epoch": 0.12, + "grad_norm": 0.65129547971366, + "learning_rate": 4.888642491268748e-05, + "loss": 2.0185, + "step": 1588 + }, + { + "epoch": 0.12, + "grad_norm": 0.6647064232407159, + "learning_rate": 4.8884580574896035e-05, + "loss": 2.0428, + "step": 1589 + }, + { + "epoch": 0.12, + "grad_norm": 0.7667339058106031, + "learning_rate": 4.888273474589209e-05, + "loss": 2.0748, + "step": 1590 + }, + { + "epoch": 0.12, + "grad_norm": 0.7962866151133833, + "learning_rate": 4.888088742579091e-05, + "loss": 2.179, + "step": 1591 + }, + { + "epoch": 0.12, + "grad_norm": 0.6636730443976754, + "learning_rate": 4.887903861470783e-05, + "loss": 2.0091, + "step": 1592 + }, + { + "epoch": 0.12, + "grad_norm": 0.8112790103102209, + "learning_rate": 4.8877188312758256e-05, + "loss": 2.0187, + "step": 1593 + }, + { + "epoch": 0.12, + "grad_norm": 0.676995185450621, + "learning_rate": 4.887533652005774e-05, + "loss": 1.9847, + "step": 1594 + }, + { + "epoch": 0.12, + "grad_norm": 0.964438644471062, + "learning_rate": 4.8873483236721876e-05, + "loss": 2.1875, + "step": 1595 + }, + { + "epoch": 0.12, + "grad_norm": 0.6755408188736353, + "learning_rate": 4.887162846286638e-05, + "loss": 2.0883, + "step": 1596 + }, + { + "epoch": 0.12, + "grad_norm": 0.8683921549347674, + "learning_rate": 4.8869772198607055e-05, + "loss": 2.0016, + "step": 1597 + }, + { + "epoch": 0.12, + "grad_norm": 0.7126116507965032, + "learning_rate": 4.886791444405979e-05, + "loss": 2.0219, + "step": 1598 + }, + { + "epoch": 0.12, + "grad_norm": 0.678864731622033, + "learning_rate": 4.886605519934058e-05, + "loss": 2.2106, + "step": 1599 + }, + { + "epoch": 0.12, + "grad_norm": 0.6742139819262983, + "learning_rate": 4.886419446456549e-05, + "loss": 2.0287, + "step": 1600 + }, + { + "epoch": 0.12, + "grad_norm": 0.7170332726417835, + "learning_rate": 4.8862332239850703e-05, + "loss": 1.9693, + "step": 1601 + }, + { + "epoch": 0.12, + "grad_norm": 0.6758436084036683, + "learning_rate": 4.886046852531249e-05, + "loss": 2.0744, + "step": 1602 + }, + { + "epoch": 0.12, + "grad_norm": 0.6296118483492044, + "learning_rate": 4.885860332106721e-05, + "loss": 1.9917, + "step": 1603 + }, + { + "epoch": 0.12, + "grad_norm": 0.6878704827091733, + "learning_rate": 4.88567366272313e-05, + "loss": 2.1597, + "step": 1604 + }, + { + "epoch": 0.12, + "grad_norm": 0.665578910103583, + "learning_rate": 4.885486844392132e-05, + "loss": 1.9791, + "step": 1605 + }, + { + "epoch": 0.12, + "grad_norm": 0.7055552918869917, + "learning_rate": 4.885299877125391e-05, + "loss": 2.0025, + "step": 1606 + }, + { + "epoch": 0.12, + "grad_norm": 0.6842684815151263, + "learning_rate": 4.885112760934579e-05, + "loss": 2.2387, + "step": 1607 + }, + { + "epoch": 0.12, + "grad_norm": 0.7917567223357809, + "learning_rate": 4.884925495831379e-05, + "loss": 2.0838, + "step": 1608 + }, + { + "epoch": 0.12, + "grad_norm": 0.6622367410573917, + "learning_rate": 4.884738081827484e-05, + "loss": 1.9954, + "step": 1609 + }, + { + "epoch": 0.12, + "grad_norm": 0.8085825632022866, + "learning_rate": 4.8845505189345934e-05, + "loss": 2.0259, + "step": 1610 + }, + { + "epoch": 0.12, + "grad_norm": 0.5928021443823616, + "learning_rate": 4.8843628071644165e-05, + "loss": 2.1899, + "step": 1611 + }, + { + "epoch": 0.12, + "grad_norm": 0.7038223703472594, + "learning_rate": 4.8841749465286756e-05, + "loss": 1.9877, + "step": 1612 + }, + { + "epoch": 0.12, + "grad_norm": 0.6738111926743082, + "learning_rate": 4.8839869370390975e-05, + "loss": 1.9798, + "step": 1613 + }, + { + "epoch": 0.12, + "grad_norm": 0.6367237971210636, + "learning_rate": 4.883798778707422e-05, + "loss": 2.0266, + "step": 1614 + }, + { + "epoch": 0.12, + "grad_norm": 0.6578258973002554, + "learning_rate": 4.8836104715453964e-05, + "loss": 2.0078, + "step": 1615 + }, + { + "epoch": 0.12, + "grad_norm": 0.6619893085531953, + "learning_rate": 4.883422015564776e-05, + "loss": 2.2095, + "step": 1616 + }, + { + "epoch": 0.12, + "grad_norm": 0.7054812614205657, + "learning_rate": 4.8832334107773284e-05, + "loss": 1.9746, + "step": 1617 + }, + { + "epoch": 0.12, + "grad_norm": 0.6519881658362222, + "learning_rate": 4.8830446571948276e-05, + "loss": 1.9801, + "step": 1618 + }, + { + "epoch": 0.12, + "grad_norm": 0.646432943931197, + "learning_rate": 4.882855754829059e-05, + "loss": 2.2014, + "step": 1619 + }, + { + "epoch": 0.12, + "grad_norm": 0.6345553994015883, + "learning_rate": 4.882666703691817e-05, + "loss": 1.9976, + "step": 1620 + }, + { + "epoch": 0.13, + "grad_norm": 0.6086523847171076, + "learning_rate": 4.882477503794905e-05, + "loss": 2.1228, + "step": 1621 + }, + { + "epoch": 0.13, + "grad_norm": 0.7457897545748834, + "learning_rate": 4.882288155150134e-05, + "loss": 2.0059, + "step": 1622 + }, + { + "epoch": 0.13, + "grad_norm": 0.6650492792933501, + "learning_rate": 4.8820986577693276e-05, + "loss": 2.1606, + "step": 1623 + }, + { + "epoch": 0.13, + "grad_norm": 0.6978977152072615, + "learning_rate": 4.881909011664316e-05, + "loss": 2.0435, + "step": 1624 + }, + { + "epoch": 0.13, + "grad_norm": 0.6293032714526053, + "learning_rate": 4.8817192168469394e-05, + "loss": 2.0217, + "step": 1625 + }, + { + "epoch": 0.13, + "grad_norm": 0.6689911255570921, + "learning_rate": 4.881529273329048e-05, + "loss": 1.9957, + "step": 1626 + }, + { + "epoch": 0.13, + "grad_norm": 0.6303067500755346, + "learning_rate": 4.881339181122501e-05, + "loss": 2.1098, + "step": 1627 + }, + { + "epoch": 0.13, + "grad_norm": 0.6134512699079463, + "learning_rate": 4.881148940239165e-05, + "loss": 2.1788, + "step": 1628 + }, + { + "epoch": 0.13, + "grad_norm": 0.6117785189284667, + "learning_rate": 4.880958550690919e-05, + "loss": 1.9818, + "step": 1629 + }, + { + "epoch": 0.13, + "grad_norm": 0.6822191618286261, + "learning_rate": 4.8807680124896494e-05, + "loss": 2.0655, + "step": 1630 + }, + { + "epoch": 0.13, + "grad_norm": 0.701812133039931, + "learning_rate": 4.880577325647252e-05, + "loss": 2.1826, + "step": 1631 + }, + { + "epoch": 0.13, + "grad_norm": 0.6547127302048279, + "learning_rate": 4.8803864901756334e-05, + "loss": 1.9343, + "step": 1632 + }, + { + "epoch": 0.13, + "grad_norm": 0.6370819513658245, + "learning_rate": 4.880195506086707e-05, + "loss": 2.0536, + "step": 1633 + }, + { + "epoch": 0.13, + "grad_norm": 0.6695398593518003, + "learning_rate": 4.8800043733923974e-05, + "loss": 1.954, + "step": 1634 + }, + { + "epoch": 0.13, + "grad_norm": 0.7313766525092967, + "learning_rate": 4.879813092104637e-05, + "loss": 2.0286, + "step": 1635 + }, + { + "epoch": 0.13, + "grad_norm": 0.7536167257279901, + "learning_rate": 4.8796216622353686e-05, + "loss": 2.2029, + "step": 1636 + }, + { + "epoch": 0.13, + "grad_norm": 0.669188585636682, + "learning_rate": 4.879430083796545e-05, + "loss": 2.0162, + "step": 1637 + }, + { + "epoch": 0.13, + "grad_norm": 0.6985841310891843, + "learning_rate": 4.879238356800126e-05, + "loss": 1.981, + "step": 1638 + }, + { + "epoch": 0.13, + "grad_norm": 0.7026379047899727, + "learning_rate": 4.879046481258081e-05, + "loss": 2.0793, + "step": 1639 + }, + { + "epoch": 0.13, + "grad_norm": 0.7622717385476238, + "learning_rate": 4.8788544571823915e-05, + "loss": 2.1566, + "step": 1640 + }, + { + "epoch": 0.13, + "grad_norm": 0.6920474094895059, + "learning_rate": 4.878662284585046e-05, + "loss": 2.0048, + "step": 1641 + }, + { + "epoch": 0.13, + "grad_norm": 0.6975053090548534, + "learning_rate": 4.878469963478042e-05, + "loss": 1.9782, + "step": 1642 + }, + { + "epoch": 0.13, + "grad_norm": 0.7308433897087943, + "learning_rate": 4.878277493873388e-05, + "loss": 2.2721, + "step": 1643 + }, + { + "epoch": 0.13, + "grad_norm": 0.7687378647604103, + "learning_rate": 4.878084875783099e-05, + "loss": 2.0045, + "step": 1644 + }, + { + "epoch": 0.13, + "grad_norm": 0.721376253300633, + "learning_rate": 4.8778921092192014e-05, + "loss": 2.0155, + "step": 1645 + }, + { + "epoch": 0.13, + "grad_norm": 0.7226996388170284, + "learning_rate": 4.877699194193731e-05, + "loss": 2.0904, + "step": 1646 + }, + { + "epoch": 0.13, + "grad_norm": 0.7450728313212015, + "learning_rate": 4.8775061307187333e-05, + "loss": 1.9734, + "step": 1647 + }, + { + "epoch": 0.13, + "grad_norm": 0.6431438065196093, + "learning_rate": 4.8773129188062593e-05, + "loss": 2.1727, + "step": 1648 + }, + { + "epoch": 0.13, + "grad_norm": 0.6648747749361291, + "learning_rate": 4.877119558468374e-05, + "loss": 2.0085, + "step": 1649 + }, + { + "epoch": 0.13, + "grad_norm": 0.7009929775636612, + "learning_rate": 4.8769260497171495e-05, + "loss": 2.0091, + "step": 1650 + }, + { + "epoch": 0.13, + "grad_norm": 0.7118196263174075, + "learning_rate": 4.876732392564667e-05, + "loss": 2.217, + "step": 1651 + }, + { + "epoch": 0.13, + "grad_norm": 0.8675359410523498, + "learning_rate": 4.8765385870230176e-05, + "loss": 2.0873, + "step": 1652 + }, + { + "epoch": 0.13, + "grad_norm": 0.6562636976045251, + "learning_rate": 4.876344633104301e-05, + "loss": 2.0165, + "step": 1653 + }, + { + "epoch": 0.13, + "grad_norm": 0.8688049381520782, + "learning_rate": 4.876150530820627e-05, + "loss": 2.0225, + "step": 1654 + }, + { + "epoch": 0.13, + "grad_norm": 0.6796720403257119, + "learning_rate": 4.875956280184113e-05, + "loss": 2.236, + "step": 1655 + }, + { + "epoch": 0.13, + "grad_norm": 0.6745944701673521, + "learning_rate": 4.875761881206888e-05, + "loss": 2.0252, + "step": 1656 + }, + { + "epoch": 0.13, + "grad_norm": 0.6894460831030285, + "learning_rate": 4.875567333901089e-05, + "loss": 2.0136, + "step": 1657 + }, + { + "epoch": 0.13, + "grad_norm": 0.6499894300623692, + "learning_rate": 4.875372638278862e-05, + "loss": 2.0469, + "step": 1658 + }, + { + "epoch": 0.13, + "grad_norm": 0.7034366928974665, + "learning_rate": 4.8751777943523634e-05, + "loss": 2.012, + "step": 1659 + }, + { + "epoch": 0.13, + "grad_norm": 0.6023193271663739, + "learning_rate": 4.8749828021337575e-05, + "loss": 2.1878, + "step": 1660 + }, + { + "epoch": 0.13, + "grad_norm": 0.7701695634234397, + "learning_rate": 4.8747876616352186e-05, + "loss": 1.9906, + "step": 1661 + }, + { + "epoch": 0.13, + "grad_norm": 0.7107925637929976, + "learning_rate": 4.8745923728689305e-05, + "loss": 2.0193, + "step": 1662 + }, + { + "epoch": 0.13, + "grad_norm": 0.7108363744203235, + "learning_rate": 4.874396935847085e-05, + "loss": 2.2194, + "step": 1663 + }, + { + "epoch": 0.13, + "grad_norm": 0.7780080001755454, + "learning_rate": 4.874201350581885e-05, + "loss": 2.1008, + "step": 1664 + }, + { + "epoch": 0.13, + "grad_norm": 0.7751381995421351, + "learning_rate": 4.87400561708554e-05, + "loss": 1.9841, + "step": 1665 + }, + { + "epoch": 0.13, + "grad_norm": 0.7150848145473261, + "learning_rate": 4.873809735370273e-05, + "loss": 2.039, + "step": 1666 + }, + { + "epoch": 0.13, + "grad_norm": 0.7543903194220497, + "learning_rate": 4.8736137054483114e-05, + "loss": 2.0149, + "step": 1667 + }, + { + "epoch": 0.13, + "grad_norm": 0.5988994073553893, + "learning_rate": 4.873417527331896e-05, + "loss": 2.1877, + "step": 1668 + }, + { + "epoch": 0.13, + "grad_norm": 0.6758694067084882, + "learning_rate": 4.873221201033273e-05, + "loss": 1.9786, + "step": 1669 + }, + { + "epoch": 0.13, + "grad_norm": 0.7166566400845973, + "learning_rate": 4.873024726564702e-05, + "loss": 2.0844, + "step": 1670 + }, + { + "epoch": 0.13, + "grad_norm": 0.614621937838585, + "learning_rate": 4.8728281039384485e-05, + "loss": 1.9769, + "step": 1671 + }, + { + "epoch": 0.13, + "grad_norm": 0.7908840912065147, + "learning_rate": 4.8726313331667884e-05, + "loss": 2.1851, + "step": 1672 + }, + { + "epoch": 0.13, + "grad_norm": 0.8567382653011144, + "learning_rate": 4.872434414262008e-05, + "loss": 2.0293, + "step": 1673 + }, + { + "epoch": 0.13, + "grad_norm": 0.6801281216716422, + "learning_rate": 4.8722373472363995e-05, + "loss": 2.0525, + "step": 1674 + }, + { + "epoch": 0.13, + "grad_norm": 0.9845661601447437, + "learning_rate": 4.872040132102269e-05, + "loss": 2.2061, + "step": 1675 + }, + { + "epoch": 0.13, + "grad_norm": 0.8796047672036053, + "learning_rate": 4.871842768871928e-05, + "loss": 2.0246, + "step": 1676 + }, + { + "epoch": 0.13, + "grad_norm": 0.8034968704134742, + "learning_rate": 4.8716452575576996e-05, + "loss": 2.0719, + "step": 1677 + }, + { + "epoch": 0.13, + "grad_norm": 0.917931829774112, + "learning_rate": 4.871447598171914e-05, + "loss": 2.0193, + "step": 1678 + }, + { + "epoch": 0.13, + "grad_norm": 0.6950799806501033, + "learning_rate": 4.8712497907269136e-05, + "loss": 1.9867, + "step": 1679 + }, + { + "epoch": 0.13, + "grad_norm": 0.7494405794121956, + "learning_rate": 4.871051835235047e-05, + "loss": 2.2111, + "step": 1680 + }, + { + "epoch": 0.13, + "grad_norm": 0.6827160001705807, + "learning_rate": 4.870853731708674e-05, + "loss": 1.9973, + "step": 1681 + }, + { + "epoch": 0.13, + "grad_norm": 0.6647011152862121, + "learning_rate": 4.870655480160162e-05, + "loss": 1.9859, + "step": 1682 + }, + { + "epoch": 0.13, + "grad_norm": 0.617967958271628, + "learning_rate": 4.870457080601891e-05, + "loss": 2.0345, + "step": 1683 + }, + { + "epoch": 0.13, + "grad_norm": 0.6197415603830477, + "learning_rate": 4.870258533046245e-05, + "loss": 2.1756, + "step": 1684 + }, + { + "epoch": 0.13, + "grad_norm": 0.6347939948217158, + "learning_rate": 4.870059837505623e-05, + "loss": 1.977, + "step": 1685 + }, + { + "epoch": 0.13, + "grad_norm": 0.6514836045287381, + "learning_rate": 4.869860993992428e-05, + "loss": 2.0005, + "step": 1686 + }, + { + "epoch": 0.13, + "grad_norm": 0.6961361087451419, + "learning_rate": 4.8696620025190745e-05, + "loss": 2.043, + "step": 1687 + }, + { + "epoch": 0.13, + "grad_norm": 0.9934656610936521, + "learning_rate": 4.869462863097989e-05, + "loss": 2.1959, + "step": 1688 + }, + { + "epoch": 0.13, + "grad_norm": 0.6679691079294406, + "learning_rate": 4.869263575741602e-05, + "loss": 2.0317, + "step": 1689 + }, + { + "epoch": 0.13, + "grad_norm": 0.897886914361417, + "learning_rate": 4.869064140462357e-05, + "loss": 2.044, + "step": 1690 + }, + { + "epoch": 0.13, + "grad_norm": 0.8105274619240014, + "learning_rate": 4.8688645572727056e-05, + "loss": 2.0175, + "step": 1691 + }, + { + "epoch": 0.13, + "grad_norm": 0.8083769139633945, + "learning_rate": 4.868664826185108e-05, + "loss": 2.211, + "step": 1692 + }, + { + "epoch": 0.13, + "grad_norm": 0.8407175834469127, + "learning_rate": 4.868464947212035e-05, + "loss": 1.962, + "step": 1693 + }, + { + "epoch": 0.13, + "grad_norm": 0.7955257249228689, + "learning_rate": 4.868264920365965e-05, + "loss": 2.0583, + "step": 1694 + }, + { + "epoch": 0.13, + "grad_norm": 0.7256681543945334, + "learning_rate": 4.8680647456593874e-05, + "loss": 2.0863, + "step": 1695 + }, + { + "epoch": 0.13, + "grad_norm": 0.7678117183049155, + "learning_rate": 4.867864423104799e-05, + "loss": 2.1936, + "step": 1696 + }, + { + "epoch": 0.13, + "grad_norm": 0.8129346087303382, + "learning_rate": 4.867663952714707e-05, + "loss": 2.0104, + "step": 1697 + }, + { + "epoch": 0.13, + "grad_norm": 0.83111487245195, + "learning_rate": 4.8674633345016286e-05, + "loss": 1.978, + "step": 1698 + }, + { + "epoch": 0.13, + "grad_norm": 0.9700786480357096, + "learning_rate": 4.867262568478088e-05, + "loss": 2.0256, + "step": 1699 + }, + { + "epoch": 0.13, + "grad_norm": 0.7269961196035782, + "learning_rate": 4.867061654656621e-05, + "loss": 2.1374, + "step": 1700 + }, + { + "epoch": 0.13, + "grad_norm": 1.0480162515968983, + "learning_rate": 4.86686059304977e-05, + "loss": 2.0581, + "step": 1701 + }, + { + "epoch": 0.13, + "grad_norm": 0.6418276441783758, + "learning_rate": 4.8666593836700894e-05, + "loss": 2.0294, + "step": 1702 + }, + { + "epoch": 0.13, + "grad_norm": 0.8910414350926767, + "learning_rate": 4.86645802653014e-05, + "loss": 2.0612, + "step": 1703 + }, + { + "epoch": 0.13, + "grad_norm": 0.7781934119691012, + "learning_rate": 4.866256521642495e-05, + "loss": 2.1672, + "step": 1704 + }, + { + "epoch": 0.13, + "grad_norm": 0.7732012912474907, + "learning_rate": 4.866054869019735e-05, + "loss": 1.9943, + "step": 1705 + }, + { + "epoch": 0.13, + "grad_norm": 0.7562789775001519, + "learning_rate": 4.865853068674449e-05, + "loss": 1.9888, + "step": 1706 + }, + { + "epoch": 0.13, + "grad_norm": 0.8698717910725112, + "learning_rate": 4.865651120619237e-05, + "loss": 2.2336, + "step": 1707 + }, + { + "epoch": 0.13, + "grad_norm": 1.0561513301322838, + "learning_rate": 4.8654490248667066e-05, + "loss": 2.0689, + "step": 1708 + }, + { + "epoch": 0.13, + "grad_norm": 0.6765396813612639, + "learning_rate": 4.865246781429476e-05, + "loss": 1.9957, + "step": 1709 + }, + { + "epoch": 0.13, + "grad_norm": 0.8605045789616634, + "learning_rate": 4.865044390320173e-05, + "loss": 2.0038, + "step": 1710 + }, + { + "epoch": 0.13, + "grad_norm": 0.7478064347571007, + "learning_rate": 4.8648418515514324e-05, + "loss": 1.9915, + "step": 1711 + }, + { + "epoch": 0.13, + "grad_norm": 0.6858473212893377, + "learning_rate": 4.8646391651359e-05, + "loss": 2.1879, + "step": 1712 + }, + { + "epoch": 0.13, + "grad_norm": 0.8456511077570654, + "learning_rate": 4.864436331086231e-05, + "loss": 2.0012, + "step": 1713 + }, + { + "epoch": 0.13, + "grad_norm": 0.5993920168755866, + "learning_rate": 4.864233349415088e-05, + "loss": 2.0795, + "step": 1714 + }, + { + "epoch": 0.13, + "grad_norm": 0.6367367952583745, + "learning_rate": 4.864030220135144e-05, + "loss": 1.9592, + "step": 1715 + }, + { + "epoch": 0.13, + "grad_norm": 0.7510258089623232, + "learning_rate": 4.863826943259082e-05, + "loss": 2.2426, + "step": 1716 + }, + { + "epoch": 0.13, + "grad_norm": 0.6964838031100102, + "learning_rate": 4.8636235187995936e-05, + "loss": 1.9594, + "step": 1717 + }, + { + "epoch": 0.13, + "grad_norm": 0.7643991946731155, + "learning_rate": 4.863419946769379e-05, + "loss": 2.0051, + "step": 1718 + }, + { + "epoch": 0.13, + "grad_norm": 0.7387461030209671, + "learning_rate": 4.863216227181147e-05, + "loss": 1.9968, + "step": 1719 + }, + { + "epoch": 0.13, + "grad_norm": 0.7309874893534596, + "learning_rate": 4.863012360047618e-05, + "loss": 2.2375, + "step": 1720 + }, + { + "epoch": 0.13, + "grad_norm": 0.8503260656207527, + "learning_rate": 4.86280834538152e-05, + "loss": 2.0143, + "step": 1721 + }, + { + "epoch": 0.13, + "grad_norm": 0.6916383412403738, + "learning_rate": 4.862604183195591e-05, + "loss": 2.0233, + "step": 1722 + }, + { + "epoch": 0.13, + "grad_norm": 0.7300764838506615, + "learning_rate": 4.862399873502576e-05, + "loss": 2.0122, + "step": 1723 + }, + { + "epoch": 0.13, + "grad_norm": 0.6441696230647352, + "learning_rate": 4.862195416315232e-05, + "loss": 2.2013, + "step": 1724 + }, + { + "epoch": 0.13, + "grad_norm": 0.8475887569106031, + "learning_rate": 4.861990811646324e-05, + "loss": 1.9869, + "step": 1725 + }, + { + "epoch": 0.13, + "grad_norm": 0.6608040455635477, + "learning_rate": 4.861786059508626e-05, + "loss": 2.0757, + "step": 1726 + }, + { + "epoch": 0.13, + "grad_norm": 0.7768408457895387, + "learning_rate": 4.861581159914923e-05, + "loss": 2.0192, + "step": 1727 + }, + { + "epoch": 0.13, + "grad_norm": 0.7510877715204681, + "learning_rate": 4.8613761128780055e-05, + "loss": 2.2371, + "step": 1728 + }, + { + "epoch": 0.13, + "grad_norm": 0.7517153443662197, + "learning_rate": 4.861170918410678e-05, + "loss": 2.0471, + "step": 1729 + }, + { + "epoch": 0.13, + "grad_norm": 0.7452508890509288, + "learning_rate": 4.8609655765257486e-05, + "loss": 1.9749, + "step": 1730 + }, + { + "epoch": 0.13, + "grad_norm": 0.6394154916518292, + "learning_rate": 4.860760087236039e-05, + "loss": 1.9623, + "step": 1731 + }, + { + "epoch": 0.13, + "grad_norm": 0.775191706083146, + "learning_rate": 4.8605544505543796e-05, + "loss": 2.2399, + "step": 1732 + }, + { + "epoch": 0.13, + "grad_norm": 0.732192740280029, + "learning_rate": 4.860348666493609e-05, + "loss": 1.9739, + "step": 1733 + }, + { + "epoch": 0.13, + "grad_norm": 0.6941870679191552, + "learning_rate": 4.860142735066574e-05, + "loss": 1.9561, + "step": 1734 + }, + { + "epoch": 0.13, + "grad_norm": 0.7763156501705997, + "learning_rate": 4.859936656286132e-05, + "loss": 1.9537, + "step": 1735 + }, + { + "epoch": 0.13, + "grad_norm": 0.6504050006754993, + "learning_rate": 4.85973043016515e-05, + "loss": 2.1775, + "step": 1736 + }, + { + "epoch": 0.13, + "grad_norm": 0.7003261217402635, + "learning_rate": 4.8595240567165036e-05, + "loss": 1.9991, + "step": 1737 + }, + { + "epoch": 0.13, + "grad_norm": 0.6667079562988385, + "learning_rate": 4.859317535953077e-05, + "loss": 2.027, + "step": 1738 + }, + { + "epoch": 0.13, + "grad_norm": 0.6341486741370379, + "learning_rate": 4.859110867887764e-05, + "loss": 2.0825, + "step": 1739 + }, + { + "epoch": 0.13, + "grad_norm": 0.6467983622511467, + "learning_rate": 4.858904052533469e-05, + "loss": 2.1926, + "step": 1740 + }, + { + "epoch": 0.13, + "grad_norm": 0.658715271192428, + "learning_rate": 4.858697089903102e-05, + "loss": 2.0488, + "step": 1741 + }, + { + "epoch": 0.13, + "grad_norm": 0.6425733424568277, + "learning_rate": 4.8584899800095864e-05, + "loss": 2.0168, + "step": 1742 + }, + { + "epoch": 0.13, + "grad_norm": 0.6061894869884389, + "learning_rate": 4.858282722865852e-05, + "loss": 2.0345, + "step": 1743 + }, + { + "epoch": 0.13, + "grad_norm": 0.626872384955726, + "learning_rate": 4.85807531848484e-05, + "loss": 2.2092, + "step": 1744 + }, + { + "epoch": 0.13, + "grad_norm": 0.5975668826038398, + "learning_rate": 4.857867766879498e-05, + "loss": 2.1198, + "step": 1745 + }, + { + "epoch": 0.13, + "grad_norm": 0.6837035240959958, + "learning_rate": 4.8576600680627855e-05, + "loss": 1.9965, + "step": 1746 + }, + { + "epoch": 0.13, + "grad_norm": 0.6679771309077187, + "learning_rate": 4.8574522220476693e-05, + "loss": 1.9785, + "step": 1747 + }, + { + "epoch": 0.13, + "grad_norm": 0.6578942978104355, + "learning_rate": 4.857244228847126e-05, + "loss": 2.2314, + "step": 1748 + }, + { + "epoch": 0.13, + "grad_norm": 0.7001025761483601, + "learning_rate": 4.857036088474142e-05, + "loss": 1.9813, + "step": 1749 + }, + { + "epoch": 0.14, + "grad_norm": 0.6140900857659639, + "learning_rate": 4.856827800941711e-05, + "loss": 1.9905, + "step": 1750 + }, + { + "epoch": 0.14, + "grad_norm": 0.7319357294825499, + "learning_rate": 4.85661936626284e-05, + "loss": 2.07, + "step": 1751 + }, + { + "epoch": 0.14, + "grad_norm": 0.6979669462922744, + "learning_rate": 4.85641078445054e-05, + "loss": 2.1857, + "step": 1752 + }, + { + "epoch": 0.14, + "grad_norm": 1.0420516663690453, + "learning_rate": 4.856202055517834e-05, + "loss": 1.9879, + "step": 1753 + }, + { + "epoch": 0.14, + "grad_norm": 0.766404546105132, + "learning_rate": 4.855993179477755e-05, + "loss": 1.9905, + "step": 1754 + }, + { + "epoch": 0.14, + "grad_norm": 0.5866602742975688, + "learning_rate": 4.855784156343342e-05, + "loss": 1.999, + "step": 1755 + }, + { + "epoch": 0.14, + "grad_norm": 0.6937368995058981, + "learning_rate": 4.8555749861276464e-05, + "loss": 2.2051, + "step": 1756 + }, + { + "epoch": 0.14, + "grad_norm": 0.597893382237688, + "learning_rate": 4.8553656688437285e-05, + "loss": 2.0892, + "step": 1757 + }, + { + "epoch": 0.14, + "grad_norm": 0.6354028578838833, + "learning_rate": 4.855156204504655e-05, + "loss": 1.9973, + "step": 1758 + }, + { + "epoch": 0.14, + "grad_norm": 0.8012703333719722, + "learning_rate": 4.854946593123505e-05, + "loss": 2.0129, + "step": 1759 + }, + { + "epoch": 0.14, + "grad_norm": 0.6128584734467191, + "learning_rate": 4.854736834713364e-05, + "loss": 2.193, + "step": 1760 + }, + { + "epoch": 0.14, + "grad_norm": 0.6822113073547863, + "learning_rate": 4.8545269292873296e-05, + "loss": 2.0225, + "step": 1761 + }, + { + "epoch": 0.14, + "grad_norm": 0.5919420980363895, + "learning_rate": 4.854316876858506e-05, + "loss": 1.9967, + "step": 1762 + }, + { + "epoch": 0.14, + "grad_norm": 0.6405758031831892, + "learning_rate": 4.854106677440008e-05, + "loss": 1.9971, + "step": 1763 + }, + { + "epoch": 0.14, + "grad_norm": 0.7483749278589876, + "learning_rate": 4.853896331044959e-05, + "loss": 2.2328, + "step": 1764 + }, + { + "epoch": 0.14, + "grad_norm": 0.8002111886575328, + "learning_rate": 4.8536858376864926e-05, + "loss": 2.013, + "step": 1765 + }, + { + "epoch": 0.14, + "grad_norm": 0.671024819792495, + "learning_rate": 4.8534751973777503e-05, + "loss": 2.0494, + "step": 1766 + }, + { + "epoch": 0.14, + "grad_norm": 0.6801693266773705, + "learning_rate": 4.8532644101318826e-05, + "loss": 2.0053, + "step": 1767 + }, + { + "epoch": 0.14, + "grad_norm": 0.7351967137811298, + "learning_rate": 4.853053475962051e-05, + "loss": 2.2053, + "step": 1768 + }, + { + "epoch": 0.14, + "grad_norm": 0.802477307371425, + "learning_rate": 4.852842394881423e-05, + "loss": 2.0428, + "step": 1769 + }, + { + "epoch": 0.14, + "grad_norm": 0.6622435680034102, + "learning_rate": 4.85263116690318e-05, + "loss": 2.0499, + "step": 1770 + }, + { + "epoch": 0.14, + "grad_norm": 1.2915161924095737, + "learning_rate": 4.852419792040507e-05, + "loss": 2.012, + "step": 1771 + }, + { + "epoch": 0.14, + "grad_norm": 0.6785565182063865, + "learning_rate": 4.8522082703066035e-05, + "loss": 2.1959, + "step": 1772 + }, + { + "epoch": 0.14, + "grad_norm": 0.7987657957515331, + "learning_rate": 4.851996601714674e-05, + "loss": 1.9894, + "step": 1773 + }, + { + "epoch": 0.14, + "grad_norm": 0.7400324296134823, + "learning_rate": 4.851784786277935e-05, + "loss": 2.0346, + "step": 1774 + }, + { + "epoch": 0.14, + "grad_norm": 0.6903197819339421, + "learning_rate": 4.85157282400961e-05, + "loss": 1.9459, + "step": 1775 + }, + { + "epoch": 0.14, + "grad_norm": 0.7810173412521904, + "learning_rate": 4.851360714922933e-05, + "loss": 2.269, + "step": 1776 + }, + { + "epoch": 0.14, + "grad_norm": 0.6619569884971317, + "learning_rate": 4.851148459031147e-05, + "loss": 1.9881, + "step": 1777 + }, + { + "epoch": 0.14, + "grad_norm": 0.7205435777252497, + "learning_rate": 4.8509360563475045e-05, + "loss": 1.9773, + "step": 1778 + }, + { + "epoch": 0.14, + "grad_norm": 0.8441379042372801, + "learning_rate": 4.850723506885265e-05, + "loss": 2.01, + "step": 1779 + }, + { + "epoch": 0.14, + "grad_norm": 0.6344637065548948, + "learning_rate": 4.850510810657701e-05, + "loss": 2.1996, + "step": 1780 + }, + { + "epoch": 0.14, + "grad_norm": 0.825882300894643, + "learning_rate": 4.85029796767809e-05, + "loss": 2.0101, + "step": 1781 + }, + { + "epoch": 0.14, + "grad_norm": 0.7931028711494718, + "learning_rate": 4.850084977959722e-05, + "loss": 2.0837, + "step": 1782 + }, + { + "epoch": 0.14, + "grad_norm": 0.7617546898901234, + "learning_rate": 4.849871841515895e-05, + "loss": 1.9774, + "step": 1783 + }, + { + "epoch": 0.14, + "grad_norm": 0.7939058964877811, + "learning_rate": 4.849658558359915e-05, + "loss": 2.2086, + "step": 1784 + }, + { + "epoch": 0.14, + "grad_norm": 0.703272893238437, + "learning_rate": 4.849445128505099e-05, + "loss": 2.0059, + "step": 1785 + }, + { + "epoch": 0.14, + "grad_norm": 0.7225836065492971, + "learning_rate": 4.849231551964771e-05, + "loss": 1.9649, + "step": 1786 + }, + { + "epoch": 0.14, + "grad_norm": 0.8384191599681904, + "learning_rate": 4.849017828752267e-05, + "loss": 1.9603, + "step": 1787 + }, + { + "epoch": 0.14, + "grad_norm": 0.8981483033690908, + "learning_rate": 4.8488039588809296e-05, + "loss": 2.2443, + "step": 1788 + }, + { + "epoch": 0.14, + "grad_norm": 0.9480267823215893, + "learning_rate": 4.848589942364112e-05, + "loss": 1.9873, + "step": 1789 + }, + { + "epoch": 0.14, + "grad_norm": 0.7435715976088432, + "learning_rate": 4.8483757792151766e-05, + "loss": 1.9971, + "step": 1790 + }, + { + "epoch": 0.14, + "grad_norm": 0.9093753824461049, + "learning_rate": 4.848161469447493e-05, + "loss": 2.0132, + "step": 1791 + }, + { + "epoch": 0.14, + "grad_norm": 0.9136018486931248, + "learning_rate": 4.847947013074443e-05, + "loss": 2.1466, + "step": 1792 + }, + { + "epoch": 0.14, + "grad_norm": 0.7329625345457703, + "learning_rate": 4.8477324101094144e-05, + "loss": 2.0256, + "step": 1793 + }, + { + "epoch": 0.14, + "grad_norm": 0.9088924735466958, + "learning_rate": 4.847517660565808e-05, + "loss": 1.9981, + "step": 1794 + }, + { + "epoch": 0.14, + "grad_norm": 0.6709794582582006, + "learning_rate": 4.8473027644570296e-05, + "loss": 2.0677, + "step": 1795 + }, + { + "epoch": 0.14, + "grad_norm": 0.8333346130846855, + "learning_rate": 4.847087721796496e-05, + "loss": 2.2208, + "step": 1796 + }, + { + "epoch": 0.14, + "grad_norm": 0.7001607343194188, + "learning_rate": 4.846872532597635e-05, + "loss": 2.0311, + "step": 1797 + }, + { + "epoch": 0.14, + "grad_norm": 0.7770965623623268, + "learning_rate": 4.8466571968738804e-05, + "loss": 2.0218, + "step": 1798 + }, + { + "epoch": 0.14, + "grad_norm": 0.9596627643835487, + "learning_rate": 4.8464417146386764e-05, + "loss": 2.0097, + "step": 1799 + }, + { + "epoch": 0.14, + "grad_norm": 0.906223162372791, + "learning_rate": 4.8462260859054766e-05, + "loss": 2.2109, + "step": 1800 + }, + { + "epoch": 0.14, + "grad_norm": 0.9874280350530131, + "learning_rate": 4.846010310687744e-05, + "loss": 2.0896, + "step": 1801 + }, + { + "epoch": 0.14, + "grad_norm": 1.0652570015294909, + "learning_rate": 4.84579438899895e-05, + "loss": 2.0179, + "step": 1802 + }, + { + "epoch": 0.14, + "grad_norm": 1.0234377552282155, + "learning_rate": 4.8455783208525754e-05, + "loss": 2.0279, + "step": 1803 + }, + { + "epoch": 0.14, + "grad_norm": 1.177207769473578, + "learning_rate": 4.845362106262111e-05, + "loss": 2.2034, + "step": 1804 + }, + { + "epoch": 0.14, + "grad_norm": 0.8083954663005583, + "learning_rate": 4.845145745241054e-05, + "loss": 2.0515, + "step": 1805 + }, + { + "epoch": 0.14, + "grad_norm": 0.9474424782828786, + "learning_rate": 4.844929237802915e-05, + "loss": 2.0307, + "step": 1806 + }, + { + "epoch": 0.14, + "grad_norm": 1.0081894923458448, + "learning_rate": 4.8447125839612106e-05, + "loss": 2.0754, + "step": 1807 + }, + { + "epoch": 0.14, + "grad_norm": 1.0398278686173632, + "learning_rate": 4.844495783729467e-05, + "loss": 2.1963, + "step": 1808 + }, + { + "epoch": 0.14, + "grad_norm": 0.8088859471188138, + "learning_rate": 4.84427883712122e-05, + "loss": 2.0053, + "step": 1809 + }, + { + "epoch": 0.14, + "grad_norm": 0.8088999841143488, + "learning_rate": 4.844061744150015e-05, + "loss": 2.0445, + "step": 1810 + }, + { + "epoch": 0.14, + "grad_norm": 0.8313782440265521, + "learning_rate": 4.843844504829405e-05, + "loss": 1.9776, + "step": 1811 + }, + { + "epoch": 0.14, + "grad_norm": 0.8764648906148655, + "learning_rate": 4.843627119172954e-05, + "loss": 2.2167, + "step": 1812 + }, + { + "epoch": 0.14, + "grad_norm": 0.7575454956956268, + "learning_rate": 4.843409587194234e-05, + "loss": 2.0908, + "step": 1813 + }, + { + "epoch": 0.14, + "grad_norm": 0.9162060400524485, + "learning_rate": 4.843191908906827e-05, + "loss": 2.0242, + "step": 1814 + }, + { + "epoch": 0.14, + "grad_norm": 0.7916566155701187, + "learning_rate": 4.842974084324323e-05, + "loss": 2.025, + "step": 1815 + }, + { + "epoch": 0.14, + "grad_norm": 0.7869885681845203, + "learning_rate": 4.8427561134603216e-05, + "loss": 2.2043, + "step": 1816 + }, + { + "epoch": 0.14, + "grad_norm": 0.7863558495998596, + "learning_rate": 4.8425379963284324e-05, + "loss": 1.9739, + "step": 1817 + }, + { + "epoch": 0.14, + "grad_norm": 0.7673110396455858, + "learning_rate": 4.842319732942272e-05, + "loss": 1.9612, + "step": 1818 + }, + { + "epoch": 0.14, + "grad_norm": 0.6860692483886814, + "learning_rate": 4.842101323315468e-05, + "loss": 2.1035, + "step": 1819 + }, + { + "epoch": 0.14, + "grad_norm": 0.9223342422367793, + "learning_rate": 4.841882767461657e-05, + "loss": 1.9804, + "step": 1820 + }, + { + "epoch": 0.14, + "grad_norm": 0.7213069056202914, + "learning_rate": 4.8416640653944844e-05, + "loss": 2.192, + "step": 1821 + }, + { + "epoch": 0.14, + "grad_norm": 0.756393747374661, + "learning_rate": 4.8414452171276036e-05, + "loss": 2.0308, + "step": 1822 + }, + { + "epoch": 0.14, + "grad_norm": 0.8385305648117451, + "learning_rate": 4.8412262226746797e-05, + "loss": 1.9676, + "step": 1823 + }, + { + "epoch": 0.14, + "grad_norm": 0.7558679817858103, + "learning_rate": 4.8410070820493844e-05, + "loss": 2.207, + "step": 1824 + }, + { + "epoch": 0.14, + "grad_norm": 0.8893072318185858, + "learning_rate": 4.8407877952654003e-05, + "loss": 2.0141, + "step": 1825 + }, + { + "epoch": 0.14, + "grad_norm": 0.789600043792327, + "learning_rate": 4.840568362336418e-05, + "loss": 2.0557, + "step": 1826 + }, + { + "epoch": 0.14, + "grad_norm": 0.7795883107531075, + "learning_rate": 4.840348783276137e-05, + "loss": 1.9814, + "step": 1827 + }, + { + "epoch": 0.14, + "grad_norm": 0.809858936296519, + "learning_rate": 4.840129058098267e-05, + "loss": 2.1665, + "step": 1828 + }, + { + "epoch": 0.14, + "grad_norm": 0.713842734851453, + "learning_rate": 4.839909186816527e-05, + "loss": 2.0311, + "step": 1829 + }, + { + "epoch": 0.14, + "grad_norm": 0.6990516633469149, + "learning_rate": 4.839689169444644e-05, + "loss": 1.998, + "step": 1830 + }, + { + "epoch": 0.14, + "grad_norm": 0.8448272917620883, + "learning_rate": 4.8394690059963534e-05, + "loss": 2.035, + "step": 1831 + }, + { + "epoch": 0.14, + "grad_norm": 0.6974921906357421, + "learning_rate": 4.8392486964854034e-05, + "loss": 2.08, + "step": 1832 + }, + { + "epoch": 0.14, + "grad_norm": 0.8253978659784993, + "learning_rate": 4.839028240925546e-05, + "loss": 2.2284, + "step": 1833 + }, + { + "epoch": 0.14, + "grad_norm": 0.6686619814700958, + "learning_rate": 4.838807639330548e-05, + "loss": 1.9985, + "step": 1834 + }, + { + "epoch": 0.14, + "grad_norm": 0.6646760242105587, + "learning_rate": 4.8385868917141796e-05, + "loss": 2.0572, + "step": 1835 + }, + { + "epoch": 0.14, + "grad_norm": 0.6952305784420879, + "learning_rate": 4.838365998090225e-05, + "loss": 2.155, + "step": 1836 + }, + { + "epoch": 0.14, + "grad_norm": 0.7589404232117765, + "learning_rate": 4.838144958472475e-05, + "loss": 2.0008, + "step": 1837 + }, + { + "epoch": 0.14, + "grad_norm": 0.643111749401844, + "learning_rate": 4.8379237728747304e-05, + "loss": 2.0747, + "step": 1838 + }, + { + "epoch": 0.14, + "grad_norm": 0.6949470416109794, + "learning_rate": 4.837702441310799e-05, + "loss": 2.0142, + "step": 1839 + }, + { + "epoch": 0.14, + "grad_norm": 0.6166374309213687, + "learning_rate": 4.837480963794502e-05, + "loss": 2.1865, + "step": 1840 + }, + { + "epoch": 0.14, + "grad_norm": 0.7026553248019648, + "learning_rate": 4.837259340339665e-05, + "loss": 1.9819, + "step": 1841 + }, + { + "epoch": 0.14, + "grad_norm": 0.6210912148703593, + "learning_rate": 4.8370375709601254e-05, + "loss": 1.983, + "step": 1842 + }, + { + "epoch": 0.14, + "grad_norm": 0.8130754301565566, + "learning_rate": 4.8368156556697295e-05, + "loss": 1.9699, + "step": 1843 + }, + { + "epoch": 0.14, + "grad_norm": 0.5783749364887686, + "learning_rate": 4.8365935944823334e-05, + "loss": 2.0559, + "step": 1844 + }, + { + "epoch": 0.14, + "grad_norm": 0.7052941996369728, + "learning_rate": 4.8363713874117986e-05, + "loss": 2.2139, + "step": 1845 + }, + { + "epoch": 0.14, + "grad_norm": 0.7037952660026171, + "learning_rate": 4.8361490344720014e-05, + "loss": 1.9923, + "step": 1846 + }, + { + "epoch": 0.14, + "grad_norm": 0.6403449202149538, + "learning_rate": 4.8359265356768225e-05, + "loss": 2.0055, + "step": 1847 + }, + { + "epoch": 0.14, + "grad_norm": 0.6435531386604566, + "learning_rate": 4.835703891040154e-05, + "loss": 2.2097, + "step": 1848 + }, + { + "epoch": 0.14, + "grad_norm": 0.6201528865643201, + "learning_rate": 4.835481100575896e-05, + "loss": 2.0243, + "step": 1849 + }, + { + "epoch": 0.14, + "grad_norm": 0.6420688759533919, + "learning_rate": 4.8352581642979576e-05, + "loss": 2.0811, + "step": 1850 + }, + { + "epoch": 0.14, + "grad_norm": 0.6292286828867993, + "learning_rate": 4.83503508222026e-05, + "loss": 2.0127, + "step": 1851 + }, + { + "epoch": 0.14, + "grad_norm": 0.7858229513658695, + "learning_rate": 4.834811854356729e-05, + "loss": 1.9806, + "step": 1852 + }, + { + "epoch": 0.14, + "grad_norm": 0.6858820191676133, + "learning_rate": 4.8345884807213024e-05, + "loss": 2.2172, + "step": 1853 + }, + { + "epoch": 0.14, + "grad_norm": 0.5768020906808438, + "learning_rate": 4.834364961327926e-05, + "loss": 1.9764, + "step": 1854 + }, + { + "epoch": 0.14, + "grad_norm": 0.6775038752978104, + "learning_rate": 4.8341412961905554e-05, + "loss": 1.9987, + "step": 1855 + }, + { + "epoch": 0.14, + "grad_norm": 0.6548377793358822, + "learning_rate": 4.8339174853231536e-05, + "loss": 2.209, + "step": 1856 + }, + { + "epoch": 0.14, + "grad_norm": 0.6034509240509452, + "learning_rate": 4.833693528739697e-05, + "loss": 2.0124, + "step": 1857 + }, + { + "epoch": 0.14, + "grad_norm": 0.849058493620179, + "learning_rate": 4.833469426454166e-05, + "loss": 2.0144, + "step": 1858 + }, + { + "epoch": 0.14, + "grad_norm": 0.6226748063919738, + "learning_rate": 4.833245178480552e-05, + "loss": 2.0212, + "step": 1859 + }, + { + "epoch": 0.14, + "grad_norm": 0.8929410495882787, + "learning_rate": 4.833020784832857e-05, + "loss": 2.1992, + "step": 1860 + }, + { + "epoch": 0.14, + "grad_norm": 0.9988434051563644, + "learning_rate": 4.832796245525089e-05, + "loss": 2.0101, + "step": 1861 + }, + { + "epoch": 0.14, + "grad_norm": 0.7577316817795313, + "learning_rate": 4.832571560571269e-05, + "loss": 2.0279, + "step": 1862 + }, + { + "epoch": 0.14, + "grad_norm": 0.8374854428542543, + "learning_rate": 4.832346729985423e-05, + "loss": 2.0645, + "step": 1863 + }, + { + "epoch": 0.14, + "grad_norm": 0.6492857480310614, + "learning_rate": 4.8321217537815907e-05, + "loss": 1.9531, + "step": 1864 + }, + { + "epoch": 0.14, + "grad_norm": 0.9223165128853924, + "learning_rate": 4.8318966319738156e-05, + "loss": 2.2079, + "step": 1865 + }, + { + "epoch": 0.14, + "grad_norm": 0.7060071403335778, + "learning_rate": 4.831671364576155e-05, + "loss": 1.9684, + "step": 1866 + }, + { + "epoch": 0.14, + "grad_norm": 0.8558848330627417, + "learning_rate": 4.831445951602671e-05, + "loss": 2.0156, + "step": 1867 + }, + { + "epoch": 0.14, + "grad_norm": 0.6866388768474617, + "learning_rate": 4.831220393067439e-05, + "loss": 2.222, + "step": 1868 + }, + { + "epoch": 0.14, + "grad_norm": 0.7799654960106693, + "learning_rate": 4.830994688984542e-05, + "loss": 2.0399, + "step": 1869 + }, + { + "epoch": 0.14, + "grad_norm": 0.6580485849579769, + "learning_rate": 4.830768839368069e-05, + "loss": 1.9877, + "step": 1870 + }, + { + "epoch": 0.14, + "grad_norm": 0.7177393830844447, + "learning_rate": 4.830542844232123e-05, + "loss": 1.9956, + "step": 1871 + }, + { + "epoch": 0.14, + "grad_norm": 0.6939986757423202, + "learning_rate": 4.830316703590814e-05, + "loss": 2.1717, + "step": 1872 + }, + { + "epoch": 0.14, + "grad_norm": 0.6069003436719608, + "learning_rate": 4.8300904174582585e-05, + "loss": 2.0164, + "step": 1873 + }, + { + "epoch": 0.14, + "grad_norm": 0.7233845452068256, + "learning_rate": 4.829863985848587e-05, + "loss": 2.0013, + "step": 1874 + }, + { + "epoch": 0.14, + "grad_norm": 0.6915819562936643, + "learning_rate": 4.829637408775935e-05, + "loss": 2.0679, + "step": 1875 + }, + { + "epoch": 0.14, + "grad_norm": 0.6423483577736271, + "learning_rate": 4.829410686254449e-05, + "loss": 2.0362, + "step": 1876 + }, + { + "epoch": 0.14, + "grad_norm": 0.7685073532638866, + "learning_rate": 4.829183818298285e-05, + "loss": 2.1481, + "step": 1877 + }, + { + "epoch": 0.14, + "grad_norm": 0.6462386515353771, + "learning_rate": 4.828956804921607e-05, + "loss": 1.9984, + "step": 1878 + }, + { + "epoch": 0.14, + "grad_norm": 0.8045501896651835, + "learning_rate": 4.828729646138587e-05, + "loss": 1.9787, + "step": 1879 + }, + { + "epoch": 0.15, + "grad_norm": 0.658713606850531, + "learning_rate": 4.828502341963409e-05, + "loss": 2.1825, + "step": 1880 + }, + { + "epoch": 0.15, + "grad_norm": 0.7051819751401018, + "learning_rate": 4.828274892410264e-05, + "loss": 2.0672, + "step": 1881 + }, + { + "epoch": 0.15, + "grad_norm": 0.7473361284914114, + "learning_rate": 4.828047297493353e-05, + "loss": 1.9811, + "step": 1882 + }, + { + "epoch": 0.15, + "grad_norm": 0.7964074112670938, + "learning_rate": 4.827819557226885e-05, + "loss": 2.0251, + "step": 1883 + }, + { + "epoch": 0.15, + "grad_norm": 0.6383579195253555, + "learning_rate": 4.8275916716250796e-05, + "loss": 2.0265, + "step": 1884 + }, + { + "epoch": 0.15, + "grad_norm": 0.762900198290114, + "learning_rate": 4.8273636407021635e-05, + "loss": 2.1988, + "step": 1885 + }, + { + "epoch": 0.15, + "grad_norm": 0.6131933965567656, + "learning_rate": 4.827135464472375e-05, + "loss": 2.0072, + "step": 1886 + }, + { + "epoch": 0.15, + "grad_norm": 0.6246149938472124, + "learning_rate": 4.826907142949959e-05, + "loss": 1.9977, + "step": 1887 + }, + { + "epoch": 0.15, + "grad_norm": 1.0021715128205675, + "learning_rate": 4.826678676149171e-05, + "loss": 2.0794, + "step": 1888 + }, + { + "epoch": 0.15, + "grad_norm": 0.6416065211239265, + "learning_rate": 4.8264500640842744e-05, + "loss": 2.2062, + "step": 1889 + }, + { + "epoch": 0.15, + "grad_norm": 0.6671776828913605, + "learning_rate": 4.826221306769544e-05, + "loss": 1.9693, + "step": 1890 + }, + { + "epoch": 0.15, + "grad_norm": 0.655776507763887, + "learning_rate": 4.82599240421926e-05, + "loss": 2.0237, + "step": 1891 + }, + { + "epoch": 0.15, + "grad_norm": 0.776274065158058, + "learning_rate": 4.8257633564477156e-05, + "loss": 2.1839, + "step": 1892 + }, + { + "epoch": 0.15, + "grad_norm": 0.6304223728930962, + "learning_rate": 4.82553416346921e-05, + "loss": 1.9989, + "step": 1893 + }, + { + "epoch": 0.15, + "grad_norm": 0.6710501259125442, + "learning_rate": 4.825304825298053e-05, + "loss": 2.0553, + "step": 1894 + }, + { + "epoch": 0.15, + "grad_norm": 0.6477136174584734, + "learning_rate": 4.8250753419485634e-05, + "loss": 2.0014, + "step": 1895 + }, + { + "epoch": 0.15, + "grad_norm": 0.7097378178470494, + "learning_rate": 4.824845713435068e-05, + "loss": 1.9884, + "step": 1896 + }, + { + "epoch": 0.15, + "grad_norm": 0.6694838222329936, + "learning_rate": 4.824615939771905e-05, + "loss": 2.2176, + "step": 1897 + }, + { + "epoch": 0.15, + "grad_norm": 0.6585410827628839, + "learning_rate": 4.8243860209734185e-05, + "loss": 2.0357, + "step": 1898 + }, + { + "epoch": 0.15, + "grad_norm": 0.6307576693203919, + "learning_rate": 4.824155957053964e-05, + "loss": 1.9644, + "step": 1899 + }, + { + "epoch": 0.15, + "grad_norm": 0.666368502614613, + "learning_rate": 4.823925748027905e-05, + "loss": 2.0461, + "step": 1900 + }, + { + "epoch": 0.15, + "grad_norm": 0.7578814712075588, + "learning_rate": 4.823695393909615e-05, + "loss": 2.1967, + "step": 1901 + }, + { + "epoch": 0.15, + "grad_norm": 0.686373879384365, + "learning_rate": 4.823464894713475e-05, + "loss": 2.0403, + "step": 1902 + }, + { + "epoch": 0.15, + "grad_norm": 0.8093966061880892, + "learning_rate": 4.823234250453878e-05, + "loss": 2.0092, + "step": 1903 + }, + { + "epoch": 0.15, + "grad_norm": 0.6722061570107992, + "learning_rate": 4.823003461145221e-05, + "loss": 1.9932, + "step": 1904 + }, + { + "epoch": 0.15, + "grad_norm": 0.6468213522949562, + "learning_rate": 4.8227725268019155e-05, + "loss": 2.1959, + "step": 1905 + }, + { + "epoch": 0.15, + "grad_norm": 0.71842789560096, + "learning_rate": 4.822541447438379e-05, + "loss": 2.1094, + "step": 1906 + }, + { + "epoch": 0.15, + "grad_norm": 0.6263837249534764, + "learning_rate": 4.822310223069039e-05, + "loss": 2.0399, + "step": 1907 + }, + { + "epoch": 0.15, + "grad_norm": 0.7170854163795776, + "learning_rate": 4.822078853708331e-05, + "loss": 2.0163, + "step": 1908 + }, + { + "epoch": 0.15, + "grad_norm": 0.6789374841806087, + "learning_rate": 4.8218473393707004e-05, + "loss": 2.1903, + "step": 1909 + }, + { + "epoch": 0.15, + "grad_norm": 0.6670422654854251, + "learning_rate": 4.8216156800706026e-05, + "loss": 1.9975, + "step": 1910 + }, + { + "epoch": 0.15, + "grad_norm": 0.7294112481300277, + "learning_rate": 4.8213838758225005e-05, + "loss": 1.9488, + "step": 1911 + }, + { + "epoch": 0.15, + "grad_norm": 0.6262879132211079, + "learning_rate": 4.821151926640866e-05, + "loss": 2.1806, + "step": 1912 + }, + { + "epoch": 0.15, + "grad_norm": 0.7180347551115842, + "learning_rate": 4.8209198325401815e-05, + "loss": 2.0577, + "step": 1913 + }, + { + "epoch": 0.15, + "grad_norm": 0.7097144834912172, + "learning_rate": 4.820687593534938e-05, + "loss": 1.987, + "step": 1914 + }, + { + "epoch": 0.15, + "grad_norm": 0.6546451942575606, + "learning_rate": 4.820455209639634e-05, + "loss": 1.9904, + "step": 1915 + }, + { + "epoch": 0.15, + "grad_norm": 0.7038046517450455, + "learning_rate": 4.820222680868778e-05, + "loss": 2.004, + "step": 1916 + }, + { + "epoch": 0.15, + "grad_norm": 0.715691801539892, + "learning_rate": 4.819990007236889e-05, + "loss": 2.1634, + "step": 1917 + }, + { + "epoch": 0.15, + "grad_norm": 0.6782980425053148, + "learning_rate": 4.819757188758492e-05, + "loss": 2.0101, + "step": 1918 + }, + { + "epoch": 0.15, + "grad_norm": 0.6785400127678617, + "learning_rate": 4.8195242254481254e-05, + "loss": 2.0339, + "step": 1919 + }, + { + "epoch": 0.15, + "grad_norm": 0.6591438780865508, + "learning_rate": 4.819291117320332e-05, + "loss": 2.0253, + "step": 1920 + }, + { + "epoch": 0.15, + "grad_norm": 0.7028496178773267, + "learning_rate": 4.819057864389666e-05, + "loss": 2.1877, + "step": 1921 + }, + { + "epoch": 0.15, + "grad_norm": 0.6534432145073824, + "learning_rate": 4.8188244666706905e-05, + "loss": 2.0238, + "step": 1922 + }, + { + "epoch": 0.15, + "grad_norm": 0.7055207477321518, + "learning_rate": 4.818590924177978e-05, + "loss": 1.9773, + "step": 1923 + }, + { + "epoch": 0.15, + "grad_norm": 0.6952323058825431, + "learning_rate": 4.818357236926109e-05, + "loss": 2.2041, + "step": 1924 + }, + { + "epoch": 0.15, + "grad_norm": 0.737407850953508, + "learning_rate": 4.818123404929673e-05, + "loss": 2.0453, + "step": 1925 + }, + { + "epoch": 0.15, + "grad_norm": 0.6750130506500975, + "learning_rate": 4.81788942820327e-05, + "loss": 2.0366, + "step": 1926 + }, + { + "epoch": 0.15, + "grad_norm": 0.7294570243800683, + "learning_rate": 4.817655306761508e-05, + "loss": 1.996, + "step": 1927 + }, + { + "epoch": 0.15, + "grad_norm": 0.6396067830300864, + "learning_rate": 4.8174210406190046e-05, + "loss": 1.9892, + "step": 1928 + }, + { + "epoch": 0.15, + "grad_norm": 0.6899997590369725, + "learning_rate": 4.817186629790385e-05, + "loss": 2.266, + "step": 1929 + }, + { + "epoch": 0.15, + "grad_norm": 0.727265449128919, + "learning_rate": 4.816952074290285e-05, + "loss": 1.9957, + "step": 1930 + }, + { + "epoch": 0.15, + "grad_norm": 0.7189510403150956, + "learning_rate": 4.816717374133348e-05, + "loss": 2.1046, + "step": 1931 + }, + { + "epoch": 0.15, + "grad_norm": 0.6318497078622048, + "learning_rate": 4.8164825293342294e-05, + "loss": 1.9332, + "step": 1932 + }, + { + "epoch": 0.15, + "grad_norm": 0.7132285419760882, + "learning_rate": 4.816247539907589e-05, + "loss": 2.2157, + "step": 1933 + }, + { + "epoch": 0.15, + "grad_norm": 0.6304749875756387, + "learning_rate": 4.8160124058681e-05, + "loss": 1.9547, + "step": 1934 + }, + { + "epoch": 0.15, + "grad_norm": 0.6227453425372939, + "learning_rate": 4.815777127230442e-05, + "loss": 1.9644, + "step": 1935 + }, + { + "epoch": 0.15, + "grad_norm": 0.819201227138268, + "learning_rate": 4.815541704009304e-05, + "loss": 1.9863, + "step": 1936 + }, + { + "epoch": 0.15, + "grad_norm": 0.6120868767218024, + "learning_rate": 4.8153061362193855e-05, + "loss": 2.1967, + "step": 1937 + }, + { + "epoch": 0.15, + "grad_norm": 0.7179887357398979, + "learning_rate": 4.8150704238753926e-05, + "loss": 1.9425, + "step": 1938 + }, + { + "epoch": 0.15, + "grad_norm": 0.5769099376860864, + "learning_rate": 4.814834566992043e-05, + "loss": 2.0275, + "step": 1939 + }, + { + "epoch": 0.15, + "grad_norm": 0.6524795357590113, + "learning_rate": 4.814598565584062e-05, + "loss": 1.9855, + "step": 1940 + }, + { + "epoch": 0.15, + "grad_norm": 0.6266808267090597, + "learning_rate": 4.814362419666184e-05, + "loss": 2.1876, + "step": 1941 + }, + { + "epoch": 0.15, + "grad_norm": 0.6636713796999617, + "learning_rate": 4.814126129253152e-05, + "loss": 1.9444, + "step": 1942 + }, + { + "epoch": 0.15, + "grad_norm": 0.6648586144715893, + "learning_rate": 4.8138896943597193e-05, + "loss": 2.025, + "step": 1943 + }, + { + "epoch": 0.15, + "grad_norm": 0.5964756895947774, + "learning_rate": 4.813653115000647e-05, + "loss": 2.0571, + "step": 1944 + }, + { + "epoch": 0.15, + "grad_norm": 0.6043784260492732, + "learning_rate": 4.813416391190707e-05, + "loss": 2.1955, + "step": 1945 + }, + { + "epoch": 0.15, + "grad_norm": 0.6485594120218846, + "learning_rate": 4.813179522944678e-05, + "loss": 1.9647, + "step": 1946 + }, + { + "epoch": 0.15, + "grad_norm": 0.660139386834233, + "learning_rate": 4.812942510277347e-05, + "loss": 1.9782, + "step": 1947 + }, + { + "epoch": 0.15, + "grad_norm": 0.6625504557723374, + "learning_rate": 4.812705353203514e-05, + "loss": 2.0261, + "step": 1948 + }, + { + "epoch": 0.15, + "grad_norm": 0.6107333917391423, + "learning_rate": 4.8124680517379856e-05, + "loss": 2.221, + "step": 1949 + }, + { + "epoch": 0.15, + "grad_norm": 0.6368868631259151, + "learning_rate": 4.8122306058955765e-05, + "loss": 2.0587, + "step": 1950 + }, + { + "epoch": 0.15, + "grad_norm": 0.6312905399777267, + "learning_rate": 4.811993015691112e-05, + "loss": 2.0026, + "step": 1951 + }, + { + "epoch": 0.15, + "grad_norm": 0.6379599757539238, + "learning_rate": 4.811755281139425e-05, + "loss": 2.001, + "step": 1952 + }, + { + "epoch": 0.15, + "grad_norm": 0.6006845087547474, + "learning_rate": 4.8115174022553586e-05, + "loss": 2.198, + "step": 1953 + }, + { + "epoch": 0.15, + "grad_norm": 0.6085232232463885, + "learning_rate": 4.811279379053766e-05, + "loss": 2.0021, + "step": 1954 + }, + { + "epoch": 0.15, + "grad_norm": 0.6878227538795225, + "learning_rate": 4.811041211549506e-05, + "loss": 1.9862, + "step": 1955 + }, + { + "epoch": 0.15, + "grad_norm": 0.6502181177691235, + "learning_rate": 4.81080289975745e-05, + "loss": 2.0238, + "step": 1956 + }, + { + "epoch": 0.15, + "grad_norm": 0.7062331536371547, + "learning_rate": 4.810564443692475e-05, + "loss": 2.1624, + "step": 1957 + }, + { + "epoch": 0.15, + "grad_norm": 0.6823517708161424, + "learning_rate": 4.81032584336947e-05, + "loss": 1.9992, + "step": 1958 + }, + { + "epoch": 0.15, + "grad_norm": 0.7654489912713323, + "learning_rate": 4.810087098803332e-05, + "loss": 1.9859, + "step": 1959 + }, + { + "epoch": 0.15, + "grad_norm": 0.6343047554835318, + "learning_rate": 4.809848210008966e-05, + "loss": 1.9663, + "step": 1960 + }, + { + "epoch": 0.15, + "grad_norm": 0.7305276526388191, + "learning_rate": 4.8096091770012876e-05, + "loss": 2.2205, + "step": 1961 + }, + { + "epoch": 0.15, + "grad_norm": 0.5734896610164785, + "learning_rate": 4.809369999795219e-05, + "loss": 2.0651, + "step": 1962 + }, + { + "epoch": 0.15, + "grad_norm": 0.6876374327108107, + "learning_rate": 4.809130678405696e-05, + "loss": 2.0589, + "step": 1963 + }, + { + "epoch": 0.15, + "grad_norm": 0.6793109928680939, + "learning_rate": 4.808891212847657e-05, + "loss": 1.9602, + "step": 1964 + }, + { + "epoch": 0.15, + "grad_norm": 0.6450520195103199, + "learning_rate": 4.8086516031360546e-05, + "loss": 2.1662, + "step": 1965 + }, + { + "epoch": 0.15, + "grad_norm": 0.6866501038595605, + "learning_rate": 4.8084118492858486e-05, + "loss": 1.9506, + "step": 1966 + }, + { + "epoch": 0.15, + "grad_norm": 0.6866882009060182, + "learning_rate": 4.808171951312008e-05, + "loss": 2.0368, + "step": 1967 + }, + { + "epoch": 0.15, + "grad_norm": 0.5971420071909138, + "learning_rate": 4.80793190922951e-05, + "loss": 2.0533, + "step": 1968 + }, + { + "epoch": 0.15, + "grad_norm": 0.6137395946089833, + "learning_rate": 4.807691723053342e-05, + "loss": 2.1681, + "step": 1969 + }, + { + "epoch": 0.15, + "grad_norm": 0.6531782676571379, + "learning_rate": 4.807451392798499e-05, + "loss": 1.9953, + "step": 1970 + }, + { + "epoch": 0.15, + "grad_norm": 0.6139987833004621, + "learning_rate": 4.807210918479987e-05, + "loss": 2.0002, + "step": 1971 + }, + { + "epoch": 0.15, + "grad_norm": 0.6648496682012286, + "learning_rate": 4.806970300112819e-05, + "loss": 2.0317, + "step": 1972 + }, + { + "epoch": 0.15, + "grad_norm": 0.6266498790253385, + "learning_rate": 4.806729537712017e-05, + "loss": 2.1787, + "step": 1973 + }, + { + "epoch": 0.15, + "grad_norm": 0.6496864058024427, + "learning_rate": 4.806488631292614e-05, + "loss": 1.9903, + "step": 1974 + }, + { + "epoch": 0.15, + "grad_norm": 0.5938894547134859, + "learning_rate": 4.806247580869651e-05, + "loss": 2.0873, + "step": 1975 + }, + { + "epoch": 0.15, + "grad_norm": 0.617821628766627, + "learning_rate": 4.8060063864581765e-05, + "loss": 2.0123, + "step": 1976 + }, + { + "epoch": 0.15, + "grad_norm": 0.6649397791322516, + "learning_rate": 4.805765048073251e-05, + "loss": 2.219, + "step": 1977 + }, + { + "epoch": 0.15, + "grad_norm": 0.6373434319364385, + "learning_rate": 4.80552356572994e-05, + "loss": 1.9543, + "step": 1978 + }, + { + "epoch": 0.15, + "grad_norm": 0.7608710489778561, + "learning_rate": 4.8052819394433225e-05, + "loss": 2.0333, + "step": 1979 + }, + { + "epoch": 0.15, + "grad_norm": 0.6116075345314019, + "learning_rate": 4.805040169228483e-05, + "loss": 2.0084, + "step": 1980 + }, + { + "epoch": 0.15, + "grad_norm": 0.7803187258145476, + "learning_rate": 4.804798255100515e-05, + "loss": 2.2264, + "step": 1981 + }, + { + "epoch": 0.15, + "grad_norm": 0.6517347828304965, + "learning_rate": 4.804556197074526e-05, + "loss": 2.0052, + "step": 1982 + }, + { + "epoch": 0.15, + "grad_norm": 0.7152816541509863, + "learning_rate": 4.804313995165625e-05, + "loss": 1.9957, + "step": 1983 + }, + { + "epoch": 0.15, + "grad_norm": 0.6822068536330941, + "learning_rate": 4.8040716493889346e-05, + "loss": 1.9954, + "step": 1984 + }, + { + "epoch": 0.15, + "grad_norm": 0.651347863029358, + "learning_rate": 4.8038291597595865e-05, + "loss": 2.1961, + "step": 1985 + }, + { + "epoch": 0.15, + "grad_norm": 0.673210263013792, + "learning_rate": 4.803586526292719e-05, + "loss": 2.0414, + "step": 1986 + }, + { + "epoch": 0.15, + "grad_norm": 0.6918784095515743, + "learning_rate": 4.803343749003482e-05, + "loss": 2.0696, + "step": 1987 + }, + { + "epoch": 0.15, + "grad_norm": 0.7075527213672979, + "learning_rate": 4.803100827907031e-05, + "loss": 2.0001, + "step": 1988 + }, + { + "epoch": 0.15, + "grad_norm": 0.5981893489644124, + "learning_rate": 4.802857763018535e-05, + "loss": 2.1629, + "step": 1989 + }, + { + "epoch": 0.15, + "grad_norm": 0.6081048844232899, + "learning_rate": 4.8026145543531685e-05, + "loss": 1.9654, + "step": 1990 + }, + { + "epoch": 0.15, + "grad_norm": 0.8983575774834642, + "learning_rate": 4.8023712019261155e-05, + "loss": 1.9739, + "step": 1991 + }, + { + "epoch": 0.15, + "grad_norm": 0.6052069133552328, + "learning_rate": 4.8021277057525705e-05, + "loss": 1.9801, + "step": 1992 + }, + { + "epoch": 0.15, + "grad_norm": 0.6536860243191052, + "learning_rate": 4.8018840658477346e-05, + "loss": 2.217, + "step": 1993 + }, + { + "epoch": 0.15, + "grad_norm": 0.663562872718384, + "learning_rate": 4.801640282226821e-05, + "loss": 1.9379, + "step": 1994 + }, + { + "epoch": 0.15, + "grad_norm": 0.6527102329063471, + "learning_rate": 4.801396354905049e-05, + "loss": 1.9478, + "step": 1995 + }, + { + "epoch": 0.15, + "grad_norm": 0.6080631281335737, + "learning_rate": 4.8011522838976484e-05, + "loss": 2.0177, + "step": 1996 + }, + { + "epoch": 0.15, + "grad_norm": 0.7024265501649074, + "learning_rate": 4.800908069219857e-05, + "loss": 2.1758, + "step": 1997 + }, + { + "epoch": 0.15, + "grad_norm": 0.6714636259650334, + "learning_rate": 4.800663710886922e-05, + "loss": 1.9997, + "step": 1998 + }, + { + "epoch": 0.15, + "grad_norm": 0.5729278918161292, + "learning_rate": 4.800419208914101e-05, + "loss": 2.0836, + "step": 1999 + }, + { + "epoch": 0.15, + "grad_norm": 0.6197662764388889, + "learning_rate": 4.800174563316658e-05, + "loss": 2.0152, + "step": 2000 + }, + { + "epoch": 0.15, + "grad_norm": 0.6993932632955054, + "learning_rate": 4.799929774109868e-05, + "loss": 2.1788, + "step": 2001 + }, + { + "epoch": 0.15, + "grad_norm": 0.6362776724510202, + "learning_rate": 4.7996848413090134e-05, + "loss": 2.0305, + "step": 2002 + }, + { + "epoch": 0.15, + "grad_norm": 0.5515626975110235, + "learning_rate": 4.799439764929388e-05, + "loss": 1.9601, + "step": 2003 + }, + { + "epoch": 0.15, + "grad_norm": 0.6544932753332472, + "learning_rate": 4.79919454498629e-05, + "loss": 1.9933, + "step": 2004 + }, + { + "epoch": 0.15, + "grad_norm": 0.6665142569981537, + "learning_rate": 4.798949181495032e-05, + "loss": 2.2188, + "step": 2005 + }, + { + "epoch": 0.15, + "grad_norm": 0.6877752435755332, + "learning_rate": 4.7987036744709326e-05, + "loss": 2.0901, + "step": 2006 + }, + { + "epoch": 0.15, + "grad_norm": 0.7005530904857376, + "learning_rate": 4.7984580239293184e-05, + "loss": 2.0132, + "step": 2007 + }, + { + "epoch": 0.15, + "grad_norm": 0.6703711316980305, + "learning_rate": 4.798212229885528e-05, + "loss": 1.9825, + "step": 2008 + }, + { + "epoch": 0.15, + "grad_norm": 0.7980617066581983, + "learning_rate": 4.797966292354907e-05, + "loss": 2.2063, + "step": 2009 + }, + { + "epoch": 0.16, + "grad_norm": 0.7124983593212242, + "learning_rate": 4.797720211352811e-05, + "loss": 1.9624, + "step": 2010 + }, + { + "epoch": 0.16, + "grad_norm": 0.6286220111536401, + "learning_rate": 4.7974739868946015e-05, + "loss": 2.0244, + "step": 2011 + }, + { + "epoch": 0.16, + "grad_norm": 0.6824240014931058, + "learning_rate": 4.797227618995654e-05, + "loss": 2.0565, + "step": 2012 + }, + { + "epoch": 0.16, + "grad_norm": 0.6318707084812021, + "learning_rate": 4.7969811076713476e-05, + "loss": 2.1954, + "step": 2013 + }, + { + "epoch": 0.16, + "grad_norm": 0.8507820928260817, + "learning_rate": 4.7967344529370755e-05, + "loss": 2.0504, + "step": 2014 + }, + { + "epoch": 0.16, + "grad_norm": 0.58717746675603, + "learning_rate": 4.796487654808236e-05, + "loss": 1.9898, + "step": 2015 + }, + { + "epoch": 0.16, + "grad_norm": 0.6394849373650409, + "learning_rate": 4.796240713300238e-05, + "loss": 1.9997, + "step": 2016 + }, + { + "epoch": 0.16, + "grad_norm": 0.6180948353544421, + "learning_rate": 4.795993628428499e-05, + "loss": 2.2295, + "step": 2017 + }, + { + "epoch": 0.16, + "grad_norm": 0.5926810606580063, + "learning_rate": 4.795746400208445e-05, + "loss": 2.0249, + "step": 2018 + }, + { + "epoch": 0.16, + "grad_norm": 0.6329189186096706, + "learning_rate": 4.795499028655513e-05, + "loss": 2.0, + "step": 2019 + }, + { + "epoch": 0.16, + "grad_norm": 0.6714982697090845, + "learning_rate": 4.795251513785146e-05, + "loss": 1.9714, + "step": 2020 + }, + { + "epoch": 0.16, + "grad_norm": 0.5925491323527572, + "learning_rate": 4.795003855612798e-05, + "loss": 2.2292, + "step": 2021 + }, + { + "epoch": 0.16, + "grad_norm": 0.6979715828390779, + "learning_rate": 4.794756054153932e-05, + "loss": 1.9905, + "step": 2022 + }, + { + "epoch": 0.16, + "grad_norm": 0.61371908200805, + "learning_rate": 4.794508109424017e-05, + "loss": 1.9676, + "step": 2023 + }, + { + "epoch": 0.16, + "grad_norm": 0.8079443305970975, + "learning_rate": 4.794260021438535e-05, + "loss": 2.0677, + "step": 2024 + }, + { + "epoch": 0.16, + "grad_norm": 0.6251343873604478, + "learning_rate": 4.7940117902129756e-05, + "loss": 2.1648, + "step": 2025 + }, + { + "epoch": 0.16, + "grad_norm": 0.6899623895310409, + "learning_rate": 4.793763415762835e-05, + "loss": 1.9882, + "step": 2026 + }, + { + "epoch": 0.16, + "grad_norm": 0.59601104007301, + "learning_rate": 4.793514898103623e-05, + "loss": 1.9357, + "step": 2027 + }, + { + "epoch": 0.16, + "grad_norm": 0.7354077466212326, + "learning_rate": 4.793266237250852e-05, + "loss": 1.9805, + "step": 2028 + }, + { + "epoch": 0.16, + "grad_norm": 0.640648493276136, + "learning_rate": 4.7930174332200494e-05, + "loss": 2.2447, + "step": 2029 + }, + { + "epoch": 0.16, + "grad_norm": 0.7283083453591035, + "learning_rate": 4.792768486026749e-05, + "loss": 2.0533, + "step": 2030 + }, + { + "epoch": 0.16, + "grad_norm": 0.8283802324875262, + "learning_rate": 4.792519395686493e-05, + "loss": 2.0199, + "step": 2031 + }, + { + "epoch": 0.16, + "grad_norm": 0.6271099743607036, + "learning_rate": 4.792270162214833e-05, + "loss": 2.0296, + "step": 2032 + }, + { + "epoch": 0.16, + "grad_norm": 0.7249711768231938, + "learning_rate": 4.7920207856273306e-05, + "loss": 2.1937, + "step": 2033 + }, + { + "epoch": 0.16, + "grad_norm": 0.6344268820761548, + "learning_rate": 4.7917712659395535e-05, + "loss": 1.9857, + "step": 2034 + }, + { + "epoch": 0.16, + "grad_norm": 0.7262464653830112, + "learning_rate": 4.7915216031670815e-05, + "loss": 2.0272, + "step": 2035 + }, + { + "epoch": 0.16, + "grad_norm": 0.7068685898391689, + "learning_rate": 4.7912717973255025e-05, + "loss": 1.979, + "step": 2036 + }, + { + "epoch": 0.16, + "grad_norm": 0.6946991295264463, + "learning_rate": 4.791021848430413e-05, + "loss": 2.0553, + "step": 2037 + }, + { + "epoch": 0.16, + "grad_norm": 0.7196671744328093, + "learning_rate": 4.7907717564974176e-05, + "loss": 2.1877, + "step": 2038 + }, + { + "epoch": 0.16, + "grad_norm": 0.7350932805947377, + "learning_rate": 4.790521521542129e-05, + "loss": 1.9808, + "step": 2039 + }, + { + "epoch": 0.16, + "grad_norm": 0.7540355637161716, + "learning_rate": 4.790271143580174e-05, + "loss": 2.0044, + "step": 2040 + }, + { + "epoch": 0.16, + "grad_norm": 0.7341810212691784, + "learning_rate": 4.7900206226271823e-05, + "loss": 2.2063, + "step": 2041 + }, + { + "epoch": 0.16, + "grad_norm": 0.8960686007635085, + "learning_rate": 4.7897699586987955e-05, + "loss": 1.9756, + "step": 2042 + }, + { + "epoch": 0.16, + "grad_norm": 0.6861972935524167, + "learning_rate": 4.789519151810664e-05, + "loss": 2.0331, + "step": 2043 + }, + { + "epoch": 0.16, + "grad_norm": 1.0491275542035767, + "learning_rate": 4.7892682019784454e-05, + "loss": 2.0042, + "step": 2044 + }, + { + "epoch": 0.16, + "grad_norm": 1.0501591407251794, + "learning_rate": 4.789017109217809e-05, + "loss": 2.1573, + "step": 2045 + }, + { + "epoch": 0.16, + "grad_norm": 0.6687818144803892, + "learning_rate": 4.788765873544431e-05, + "loss": 2.0165, + "step": 2046 + }, + { + "epoch": 0.16, + "grad_norm": 1.1710635130302376, + "learning_rate": 4.788514494973996e-05, + "loss": 2.0076, + "step": 2047 + }, + { + "epoch": 0.16, + "grad_norm": 0.6839678899582097, + "learning_rate": 4.7882629735222004e-05, + "loss": 1.9858, + "step": 2048 + }, + { + "epoch": 0.16, + "grad_norm": 0.9295397337971999, + "learning_rate": 4.7880113092047475e-05, + "loss": 2.0672, + "step": 2049 + }, + { + "epoch": 0.16, + "grad_norm": 1.1173896136462103, + "learning_rate": 4.787759502037348e-05, + "loss": 2.1929, + "step": 2050 + }, + { + "epoch": 0.16, + "grad_norm": 0.6115905052026146, + "learning_rate": 4.7875075520357246e-05, + "loss": 2.0094, + "step": 2051 + }, + { + "epoch": 0.16, + "grad_norm": 1.0712491627937821, + "learning_rate": 4.7872554592156085e-05, + "loss": 2.0632, + "step": 2052 + }, + { + "epoch": 0.16, + "grad_norm": 0.900964613569214, + "learning_rate": 4.787003223592737e-05, + "loss": 2.1353, + "step": 2053 + }, + { + "epoch": 0.16, + "grad_norm": 0.6805119405914696, + "learning_rate": 4.786750845182859e-05, + "loss": 1.994, + "step": 2054 + }, + { + "epoch": 0.16, + "grad_norm": 0.9279147423168745, + "learning_rate": 4.786498324001732e-05, + "loss": 2.0383, + "step": 2055 + }, + { + "epoch": 0.16, + "grad_norm": 0.7407466233806347, + "learning_rate": 4.786245660065121e-05, + "loss": 2.006, + "step": 2056 + }, + { + "epoch": 0.16, + "grad_norm": 0.8411688707239905, + "learning_rate": 4.785992853388801e-05, + "loss": 2.226, + "step": 2057 + }, + { + "epoch": 0.16, + "grad_norm": 0.7817317712360857, + "learning_rate": 4.785739903988557e-05, + "loss": 2.0314, + "step": 2058 + }, + { + "epoch": 0.16, + "grad_norm": 0.681575626284558, + "learning_rate": 4.7854868118801806e-05, + "loss": 1.9696, + "step": 2059 + }, + { + "epoch": 0.16, + "grad_norm": 0.8794660252609096, + "learning_rate": 4.785233577079473e-05, + "loss": 1.9634, + "step": 2060 + }, + { + "epoch": 0.16, + "grad_norm": 0.7363778253619299, + "learning_rate": 4.784980199602245e-05, + "loss": 2.1946, + "step": 2061 + }, + { + "epoch": 0.16, + "grad_norm": 0.7704576839771932, + "learning_rate": 4.7847266794643165e-05, + "loss": 2.0641, + "step": 2062 + }, + { + "epoch": 0.16, + "grad_norm": 0.6955109504115887, + "learning_rate": 4.784473016681516e-05, + "loss": 2.0238, + "step": 2063 + }, + { + "epoch": 0.16, + "grad_norm": 0.8911060735920282, + "learning_rate": 4.78421921126968e-05, + "loss": 2.0271, + "step": 2064 + }, + { + "epoch": 0.16, + "grad_norm": 0.678539981222637, + "learning_rate": 4.783965263244654e-05, + "loss": 2.2187, + "step": 2065 + }, + { + "epoch": 0.16, + "grad_norm": 0.8116756337196732, + "learning_rate": 4.783711172622295e-05, + "loss": 1.9659, + "step": 2066 + }, + { + "epoch": 0.16, + "grad_norm": 0.6507957117608317, + "learning_rate": 4.783456939418465e-05, + "loss": 1.9952, + "step": 2067 + }, + { + "epoch": 0.16, + "grad_norm": 0.8306903190028612, + "learning_rate": 4.783202563649038e-05, + "loss": 2.1081, + "step": 2068 + }, + { + "epoch": 0.16, + "grad_norm": 0.6866248866498299, + "learning_rate": 4.782948045329895e-05, + "loss": 1.9792, + "step": 2069 + }, + { + "epoch": 0.16, + "grad_norm": 0.6740795505613211, + "learning_rate": 4.7826933844769273e-05, + "loss": 2.1661, + "step": 2070 + }, + { + "epoch": 0.16, + "grad_norm": 0.6695911551004502, + "learning_rate": 4.782438581106034e-05, + "loss": 1.9619, + "step": 2071 + }, + { + "epoch": 0.16, + "grad_norm": 0.6161563889870678, + "learning_rate": 4.782183635233124e-05, + "loss": 1.9638, + "step": 2072 + }, + { + "epoch": 0.16, + "grad_norm": 0.7081932149913969, + "learning_rate": 4.781928546874113e-05, + "loss": 2.1521, + "step": 2073 + }, + { + "epoch": 0.16, + "grad_norm": 0.5868916252570111, + "learning_rate": 4.7816733160449287e-05, + "loss": 2.0228, + "step": 2074 + }, + { + "epoch": 0.16, + "grad_norm": 0.6835951888062113, + "learning_rate": 4.7814179427615066e-05, + "loss": 2.041, + "step": 2075 + }, + { + "epoch": 0.16, + "grad_norm": 0.7369878413881155, + "learning_rate": 4.78116242703979e-05, + "loss": 2.0081, + "step": 2076 + }, + { + "epoch": 0.16, + "grad_norm": 0.7352898438406729, + "learning_rate": 4.780906768895731e-05, + "loss": 2.1729, + "step": 2077 + }, + { + "epoch": 0.16, + "grad_norm": 0.7692482600277273, + "learning_rate": 4.7806509683452937e-05, + "loss": 2.0021, + "step": 2078 + }, + { + "epoch": 0.16, + "grad_norm": 0.6717009874348432, + "learning_rate": 4.780395025404446e-05, + "loss": 1.961, + "step": 2079 + }, + { + "epoch": 0.16, + "grad_norm": 0.8596008915387896, + "learning_rate": 4.780138940089169e-05, + "loss": 2.0469, + "step": 2080 + }, + { + "epoch": 0.16, + "grad_norm": 0.6957045568805382, + "learning_rate": 4.779882712415451e-05, + "loss": 2.0324, + "step": 2081 + }, + { + "epoch": 0.16, + "grad_norm": 0.7573576574557406, + "learning_rate": 4.77962634239929e-05, + "loss": 2.2071, + "step": 2082 + }, + { + "epoch": 0.16, + "grad_norm": 0.8061973592927387, + "learning_rate": 4.779369830056691e-05, + "loss": 2.006, + "step": 2083 + }, + { + "epoch": 0.16, + "grad_norm": 0.685884752514872, + "learning_rate": 4.7791131754036694e-05, + "loss": 2.0075, + "step": 2084 + }, + { + "epoch": 0.16, + "grad_norm": 0.7603544711573783, + "learning_rate": 4.778856378456249e-05, + "loss": 2.2021, + "step": 2085 + }, + { + "epoch": 0.16, + "grad_norm": 0.7646471324838224, + "learning_rate": 4.778599439230465e-05, + "loss": 2.0604, + "step": 2086 + }, + { + "epoch": 0.16, + "grad_norm": 0.8127998258039101, + "learning_rate": 4.778342357742355e-05, + "loss": 1.9673, + "step": 2087 + }, + { + "epoch": 0.16, + "grad_norm": 0.9142923762387998, + "learning_rate": 4.778085134007973e-05, + "loss": 1.9968, + "step": 2088 + }, + { + "epoch": 0.16, + "grad_norm": 0.7660587641226316, + "learning_rate": 4.777827768043377e-05, + "loss": 2.1904, + "step": 2089 + }, + { + "epoch": 0.16, + "grad_norm": 0.898660551761882, + "learning_rate": 4.7775702598646366e-05, + "loss": 1.9953, + "step": 2090 + }, + { + "epoch": 0.16, + "grad_norm": 0.924365474350514, + "learning_rate": 4.777312609487828e-05, + "loss": 1.9076, + "step": 2091 + }, + { + "epoch": 0.16, + "grad_norm": 0.8210222319038528, + "learning_rate": 4.7770548169290374e-05, + "loss": 2.0383, + "step": 2092 + }, + { + "epoch": 0.16, + "grad_norm": 0.9666485252711392, + "learning_rate": 4.7767968822043605e-05, + "loss": 2.0846, + "step": 2093 + }, + { + "epoch": 0.16, + "grad_norm": 0.7445174180160211, + "learning_rate": 4.7765388053299015e-05, + "loss": 2.21, + "step": 2094 + }, + { + "epoch": 0.16, + "grad_norm": 0.6692976444926391, + "learning_rate": 4.776280586321772e-05, + "loss": 1.9446, + "step": 2095 + }, + { + "epoch": 0.16, + "grad_norm": 0.6136148132588995, + "learning_rate": 4.7760222251960945e-05, + "loss": 1.9659, + "step": 2096 + }, + { + "epoch": 0.16, + "grad_norm": 0.7467297347061005, + "learning_rate": 4.7757637219689995e-05, + "loss": 2.1797, + "step": 2097 + }, + { + "epoch": 0.16, + "grad_norm": 0.7382853476037023, + "learning_rate": 4.775505076656626e-05, + "loss": 1.9875, + "step": 2098 + }, + { + "epoch": 0.16, + "grad_norm": 0.8627338129766725, + "learning_rate": 4.775246289275123e-05, + "loss": 2.0751, + "step": 2099 + }, + { + "epoch": 0.16, + "grad_norm": 0.7232280959031868, + "learning_rate": 4.7749873598406464e-05, + "loss": 2.013, + "step": 2100 + }, + { + "epoch": 0.16, + "grad_norm": 0.6658641609627705, + "learning_rate": 4.774728288369363e-05, + "loss": 1.969, + "step": 2101 + }, + { + "epoch": 0.16, + "grad_norm": 0.7188561573172418, + "learning_rate": 4.774469074877449e-05, + "loss": 2.1508, + "step": 2102 + }, + { + "epoch": 0.16, + "grad_norm": 0.6502199778916018, + "learning_rate": 4.774209719381086e-05, + "loss": 1.9486, + "step": 2103 + }, + { + "epoch": 0.16, + "grad_norm": 0.6451419553568801, + "learning_rate": 4.773950221896467e-05, + "loss": 2.0225, + "step": 2104 + }, + { + "epoch": 0.16, + "grad_norm": 0.6527264398930251, + "learning_rate": 4.773690582439795e-05, + "loss": 2.0611, + "step": 2105 + }, + { + "epoch": 0.16, + "grad_norm": 0.7486942629641066, + "learning_rate": 4.7734308010272785e-05, + "loss": 2.1865, + "step": 2106 + }, + { + "epoch": 0.16, + "grad_norm": 0.6594084732036383, + "learning_rate": 4.773170877675139e-05, + "loss": 1.9565, + "step": 2107 + }, + { + "epoch": 0.16, + "grad_norm": 0.682377732959749, + "learning_rate": 4.772910812399603e-05, + "loss": 2.0087, + "step": 2108 + }, + { + "epoch": 0.16, + "grad_norm": 0.6899658529279881, + "learning_rate": 4.7726506052169065e-05, + "loss": 2.2371, + "step": 2109 + }, + { + "epoch": 0.16, + "grad_norm": 0.8117764705299684, + "learning_rate": 4.7723902561432964e-05, + "loss": 1.9949, + "step": 2110 + }, + { + "epoch": 0.16, + "grad_norm": 0.6115773217696118, + "learning_rate": 4.772129765195028e-05, + "loss": 2.0599, + "step": 2111 + }, + { + "epoch": 0.16, + "grad_norm": 0.7327408694576583, + "learning_rate": 4.7718691323883644e-05, + "loss": 1.9613, + "step": 2112 + }, + { + "epoch": 0.16, + "grad_norm": 0.6346770945252507, + "learning_rate": 4.771608357739578e-05, + "loss": 1.9727, + "step": 2113 + }, + { + "epoch": 0.16, + "grad_norm": 0.7402626310764088, + "learning_rate": 4.771347441264949e-05, + "loss": 2.1908, + "step": 2114 + }, + { + "epoch": 0.16, + "grad_norm": 0.6964219120716855, + "learning_rate": 4.771086382980768e-05, + "loss": 2.0159, + "step": 2115 + }, + { + "epoch": 0.16, + "grad_norm": 0.6398498349708037, + "learning_rate": 4.770825182903335e-05, + "loss": 2.0072, + "step": 2116 + }, + { + "epoch": 0.16, + "grad_norm": 0.8105697010974076, + "learning_rate": 4.770563841048957e-05, + "loss": 2.0898, + "step": 2117 + }, + { + "epoch": 0.16, + "grad_norm": 0.5687941002014472, + "learning_rate": 4.770302357433951e-05, + "loss": 2.1923, + "step": 2118 + }, + { + "epoch": 0.16, + "grad_norm": 0.759742122709048, + "learning_rate": 4.7700407320746414e-05, + "loss": 1.9789, + "step": 2119 + }, + { + "epoch": 0.16, + "grad_norm": 0.6839453939032593, + "learning_rate": 4.769778964987364e-05, + "loss": 2.0206, + "step": 2120 + }, + { + "epoch": 0.16, + "grad_norm": 0.6673818755837441, + "learning_rate": 4.769517056188461e-05, + "loss": 1.9856, + "step": 2121 + }, + { + "epoch": 0.16, + "grad_norm": 0.6906704042737747, + "learning_rate": 4.7692550056942855e-05, + "loss": 2.1954, + "step": 2122 + }, + { + "epoch": 0.16, + "grad_norm": 0.65477996902443, + "learning_rate": 4.7689928135211966e-05, + "loss": 2.0563, + "step": 2123 + }, + { + "epoch": 0.16, + "grad_norm": 0.6225360719067285, + "learning_rate": 4.768730479685566e-05, + "loss": 2.0519, + "step": 2124 + }, + { + "epoch": 0.16, + "grad_norm": 0.5905329067382125, + "learning_rate": 4.76846800420377e-05, + "loss": 2.0434, + "step": 2125 + }, + { + "epoch": 0.16, + "grad_norm": 0.626912742947821, + "learning_rate": 4.768205387092198e-05, + "loss": 2.1541, + "step": 2126 + }, + { + "epoch": 0.16, + "grad_norm": 0.6771285613181283, + "learning_rate": 4.7679426283672466e-05, + "loss": 2.0241, + "step": 2127 + }, + { + "epoch": 0.16, + "grad_norm": 0.623145668525327, + "learning_rate": 4.7676797280453194e-05, + "loss": 2.0126, + "step": 2128 + }, + { + "epoch": 0.16, + "grad_norm": 0.6815742173356221, + "learning_rate": 4.767416686142831e-05, + "loss": 2.1446, + "step": 2129 + }, + { + "epoch": 0.16, + "grad_norm": 0.5941494331752277, + "learning_rate": 4.7671535026762035e-05, + "loss": 2.0679, + "step": 2130 + }, + { + "epoch": 0.16, + "grad_norm": 0.6488961613468942, + "learning_rate": 4.76689017766187e-05, + "loss": 1.9835, + "step": 2131 + }, + { + "epoch": 0.16, + "grad_norm": 0.5971346561956788, + "learning_rate": 4.76662671111627e-05, + "loss": 2.0281, + "step": 2132 + }, + { + "epoch": 0.16, + "grad_norm": 0.6922259929399077, + "learning_rate": 4.766363103055852e-05, + "loss": 1.9969, + "step": 2133 + }, + { + "epoch": 0.16, + "grad_norm": 0.7375878470013998, + "learning_rate": 4.7660993534970753e-05, + "loss": 2.2527, + "step": 2134 + }, + { + "epoch": 0.16, + "grad_norm": 0.7771057332345209, + "learning_rate": 4.765835462456407e-05, + "loss": 1.9622, + "step": 2135 + }, + { + "epoch": 0.16, + "grad_norm": 0.6883744502895583, + "learning_rate": 4.765571429950323e-05, + "loss": 2.0554, + "step": 2136 + }, + { + "epoch": 0.16, + "grad_norm": 0.6717032548924661, + "learning_rate": 4.765307255995307e-05, + "loss": 1.9963, + "step": 2137 + }, + { + "epoch": 0.16, + "grad_norm": 0.8194844959616974, + "learning_rate": 4.7650429406078525e-05, + "loss": 2.1399, + "step": 2138 + }, + { + "epoch": 0.17, + "grad_norm": 0.682835427229905, + "learning_rate": 4.764778483804462e-05, + "loss": 2.0105, + "step": 2139 + }, + { + "epoch": 0.17, + "grad_norm": 0.9498380490845082, + "learning_rate": 4.764513885601647e-05, + "loss": 2.0031, + "step": 2140 + }, + { + "epoch": 0.17, + "grad_norm": 0.7035057578387215, + "learning_rate": 4.764249146015928e-05, + "loss": 2.1658, + "step": 2141 + }, + { + "epoch": 0.17, + "grad_norm": 0.8128048563623967, + "learning_rate": 4.763984265063832e-05, + "loss": 2.0525, + "step": 2142 + }, + { + "epoch": 0.17, + "grad_norm": 0.8574265438414361, + "learning_rate": 4.763719242761898e-05, + "loss": 1.9948, + "step": 2143 + }, + { + "epoch": 0.17, + "grad_norm": 0.6717391272248605, + "learning_rate": 4.7634540791266726e-05, + "loss": 2.0465, + "step": 2144 + }, + { + "epoch": 0.17, + "grad_norm": 0.7365831202557926, + "learning_rate": 4.76318877417471e-05, + "loss": 2.0318, + "step": 2145 + }, + { + "epoch": 0.17, + "grad_norm": 0.6025212183952929, + "learning_rate": 4.762923327922575e-05, + "loss": 2.2009, + "step": 2146 + }, + { + "epoch": 0.17, + "grad_norm": 0.6409536703728919, + "learning_rate": 4.7626577403868405e-05, + "loss": 1.9338, + "step": 2147 + }, + { + "epoch": 0.17, + "grad_norm": 0.7296461043848717, + "learning_rate": 4.762392011584088e-05, + "loss": 2.0974, + "step": 2148 + }, + { + "epoch": 0.17, + "grad_norm": 0.6625176358972278, + "learning_rate": 4.7621261415309074e-05, + "loss": 1.9974, + "step": 2149 + }, + { + "epoch": 0.17, + "grad_norm": 0.7108608134604655, + "learning_rate": 4.7618601302439e-05, + "loss": 2.143, + "step": 2150 + }, + { + "epoch": 0.17, + "grad_norm": 0.8277748439559212, + "learning_rate": 4.7615939777396715e-05, + "loss": 2.0087, + "step": 2151 + }, + { + "epoch": 0.17, + "grad_norm": 0.6338536422644437, + "learning_rate": 4.761327684034841e-05, + "loss": 1.9785, + "step": 2152 + }, + { + "epoch": 0.17, + "grad_norm": 0.7681194675428614, + "learning_rate": 4.761061249146033e-05, + "loss": 2.0017, + "step": 2153 + }, + { + "epoch": 0.17, + "grad_norm": 0.6113398171467649, + "learning_rate": 4.760794673089882e-05, + "loss": 2.2249, + "step": 2154 + }, + { + "epoch": 0.17, + "grad_norm": 0.7807389275853509, + "learning_rate": 4.760527955883033e-05, + "loss": 2.0717, + "step": 2155 + }, + { + "epoch": 0.17, + "grad_norm": 0.7329144150012221, + "learning_rate": 4.760261097542137e-05, + "loss": 1.9917, + "step": 2156 + }, + { + "epoch": 0.17, + "grad_norm": 0.6648096668284537, + "learning_rate": 4.7599940980838546e-05, + "loss": 1.9643, + "step": 2157 + }, + { + "epoch": 0.17, + "grad_norm": 0.6756730895888895, + "learning_rate": 4.759726957524857e-05, + "loss": 2.1812, + "step": 2158 + }, + { + "epoch": 0.17, + "grad_norm": 0.6185432429127606, + "learning_rate": 4.759459675881822e-05, + "loss": 2.0056, + "step": 2159 + }, + { + "epoch": 0.17, + "grad_norm": 0.7308715087862035, + "learning_rate": 4.759192253171437e-05, + "loss": 2.0104, + "step": 2160 + }, + { + "epoch": 0.17, + "grad_norm": 0.6355642578672368, + "learning_rate": 4.758924689410399e-05, + "loss": 2.1115, + "step": 2161 + }, + { + "epoch": 0.17, + "grad_norm": 0.6643475290261182, + "learning_rate": 4.7586569846154125e-05, + "loss": 2.2076, + "step": 2162 + }, + { + "epoch": 0.17, + "grad_norm": 0.6017302560997428, + "learning_rate": 4.7583891388031925e-05, + "loss": 1.9824, + "step": 2163 + }, + { + "epoch": 0.17, + "grad_norm": 0.6470610368288269, + "learning_rate": 4.75812115199046e-05, + "loss": 2.0207, + "step": 2164 + }, + { + "epoch": 0.17, + "grad_norm": 0.7109337064743986, + "learning_rate": 4.757853024193948e-05, + "loss": 2.0368, + "step": 2165 + }, + { + "epoch": 0.17, + "grad_norm": 0.7042583797246986, + "learning_rate": 4.757584755430396e-05, + "loss": 2.1905, + "step": 2166 + }, + { + "epoch": 0.17, + "grad_norm": 0.7838207228156526, + "learning_rate": 4.7573163457165534e-05, + "loss": 2.117, + "step": 2167 + }, + { + "epoch": 0.17, + "grad_norm": 0.7148682030422492, + "learning_rate": 4.757047795069178e-05, + "loss": 1.9718, + "step": 2168 + }, + { + "epoch": 0.17, + "grad_norm": 0.7888765709744396, + "learning_rate": 4.756779103505036e-05, + "loss": 1.9649, + "step": 2169 + }, + { + "epoch": 0.17, + "grad_norm": 0.8252544255032028, + "learning_rate": 4.7565102710409035e-05, + "loss": 2.1416, + "step": 2170 + }, + { + "epoch": 0.17, + "grad_norm": 0.6453524738858872, + "learning_rate": 4.756241297693566e-05, + "loss": 1.9725, + "step": 2171 + }, + { + "epoch": 0.17, + "grad_norm": 0.8951344953789735, + "learning_rate": 4.755972183479814e-05, + "loss": 1.9662, + "step": 2172 + }, + { + "epoch": 0.17, + "grad_norm": 0.6635546744142443, + "learning_rate": 4.755702928416452e-05, + "loss": 2.047, + "step": 2173 + }, + { + "epoch": 0.17, + "grad_norm": 0.6945403993087533, + "learning_rate": 4.755433532520289e-05, + "loss": 2.2012, + "step": 2174 + }, + { + "epoch": 0.17, + "grad_norm": 0.7612797178756655, + "learning_rate": 4.7551639958081454e-05, + "loss": 1.9838, + "step": 2175 + }, + { + "epoch": 0.17, + "grad_norm": 0.5700647447494938, + "learning_rate": 4.754894318296849e-05, + "loss": 1.9949, + "step": 2176 + }, + { + "epoch": 0.17, + "grad_norm": 0.7242183190116863, + "learning_rate": 4.754624500003236e-05, + "loss": 1.9739, + "step": 2177 + }, + { + "epoch": 0.17, + "grad_norm": 0.6305583716674078, + "learning_rate": 4.754354540944154e-05, + "loss": 2.1961, + "step": 2178 + }, + { + "epoch": 0.17, + "grad_norm": 0.6938293942764818, + "learning_rate": 4.754084441136457e-05, + "loss": 2.0865, + "step": 2179 + }, + { + "epoch": 0.17, + "grad_norm": 0.6773749508302627, + "learning_rate": 4.753814200597008e-05, + "loss": 2.0195, + "step": 2180 + }, + { + "epoch": 0.17, + "grad_norm": 0.8435730733284926, + "learning_rate": 4.753543819342679e-05, + "loss": 2.0346, + "step": 2181 + }, + { + "epoch": 0.17, + "grad_norm": 0.8816794609161358, + "learning_rate": 4.753273297390353e-05, + "loss": 2.1961, + "step": 2182 + }, + { + "epoch": 0.17, + "grad_norm": 0.709258290640546, + "learning_rate": 4.753002634756918e-05, + "loss": 1.9799, + "step": 2183 + }, + { + "epoch": 0.17, + "grad_norm": 0.669868895224133, + "learning_rate": 4.752731831459272e-05, + "loss": 2.0076, + "step": 2184 + }, + { + "epoch": 0.17, + "grad_norm": 0.6814754120180553, + "learning_rate": 4.752460887514324e-05, + "loss": 1.9609, + "step": 2185 + }, + { + "epoch": 0.17, + "grad_norm": 0.6959610837901892, + "learning_rate": 4.752189802938989e-05, + "loss": 2.2133, + "step": 2186 + }, + { + "epoch": 0.17, + "grad_norm": 0.671086802904581, + "learning_rate": 4.7519185777501927e-05, + "loss": 1.9807, + "step": 2187 + }, + { + "epoch": 0.17, + "grad_norm": 0.6947207885745841, + "learning_rate": 4.751647211964869e-05, + "loss": 2.0377, + "step": 2188 + }, + { + "epoch": 0.17, + "grad_norm": 0.7706935463950707, + "learning_rate": 4.751375705599959e-05, + "loss": 2.0318, + "step": 2189 + }, + { + "epoch": 0.17, + "grad_norm": 0.6575969282435566, + "learning_rate": 4.751104058672415e-05, + "loss": 2.1668, + "step": 2190 + }, + { + "epoch": 0.17, + "grad_norm": 0.7027936953318382, + "learning_rate": 4.750832271199197e-05, + "loss": 1.9819, + "step": 2191 + }, + { + "epoch": 0.17, + "grad_norm": 0.755792748557436, + "learning_rate": 4.750560343197274e-05, + "loss": 2.0773, + "step": 2192 + }, + { + "epoch": 0.17, + "grad_norm": 0.6651096194239591, + "learning_rate": 4.7502882746836234e-05, + "loss": 1.9705, + "step": 2193 + }, + { + "epoch": 0.17, + "grad_norm": 0.7122685757349444, + "learning_rate": 4.75001606567523e-05, + "loss": 2.132, + "step": 2194 + }, + { + "epoch": 0.17, + "grad_norm": 0.6180555298714466, + "learning_rate": 4.749743716189092e-05, + "loss": 2.0136, + "step": 2195 + }, + { + "epoch": 0.17, + "grad_norm": 0.7015893149285746, + "learning_rate": 4.74947122624221e-05, + "loss": 1.9645, + "step": 2196 + }, + { + "epoch": 0.17, + "grad_norm": 0.7161202047281838, + "learning_rate": 4.749198595851599e-05, + "loss": 2.0103, + "step": 2197 + }, + { + "epoch": 0.17, + "grad_norm": 0.9211103885903537, + "learning_rate": 4.748925825034281e-05, + "loss": 2.2655, + "step": 2198 + }, + { + "epoch": 0.17, + "grad_norm": 0.8479565679864598, + "learning_rate": 4.7486529138072834e-05, + "loss": 1.9188, + "step": 2199 + }, + { + "epoch": 0.17, + "grad_norm": 0.6857165073713344, + "learning_rate": 4.748379862187647e-05, + "loss": 2.0017, + "step": 2200 + }, + { + "epoch": 0.17, + "grad_norm": 0.7015212024503608, + "learning_rate": 4.748106670192419e-05, + "loss": 1.982, + "step": 2201 + }, + { + "epoch": 0.17, + "grad_norm": 0.637524591985425, + "learning_rate": 4.7478333378386564e-05, + "loss": 2.1817, + "step": 2202 + }, + { + "epoch": 0.17, + "grad_norm": 0.7922389156854768, + "learning_rate": 4.747559865143425e-05, + "loss": 1.9866, + "step": 2203 + }, + { + "epoch": 0.17, + "grad_norm": 0.5721586071046244, + "learning_rate": 4.747286252123797e-05, + "loss": 2.0164, + "step": 2204 + }, + { + "epoch": 0.17, + "grad_norm": 0.8408036989563741, + "learning_rate": 4.747012498796856e-05, + "loss": 1.9922, + "step": 2205 + }, + { + "epoch": 0.17, + "grad_norm": 0.6903678311567681, + "learning_rate": 4.746738605179694e-05, + "loss": 2.1998, + "step": 2206 + }, + { + "epoch": 0.17, + "grad_norm": 0.7054701838058777, + "learning_rate": 4.7464645712894115e-05, + "loss": 1.9875, + "step": 2207 + }, + { + "epoch": 0.17, + "grad_norm": 0.7177184396523119, + "learning_rate": 4.746190397143116e-05, + "loss": 2.015, + "step": 2208 + }, + { + "epoch": 0.17, + "grad_norm": 0.7169249856911328, + "learning_rate": 4.745916082757928e-05, + "loss": 2.0019, + "step": 2209 + }, + { + "epoch": 0.17, + "grad_norm": 0.7588436353791614, + "learning_rate": 4.745641628150972e-05, + "loss": 2.2288, + "step": 2210 + }, + { + "epoch": 0.17, + "grad_norm": 0.7199095361220854, + "learning_rate": 4.745367033339383e-05, + "loss": 2.0128, + "step": 2211 + }, + { + "epoch": 0.17, + "grad_norm": 0.9067348744253737, + "learning_rate": 4.7450922983403067e-05, + "loss": 2.0238, + "step": 2212 + }, + { + "epoch": 0.17, + "grad_norm": 0.7484307023813518, + "learning_rate": 4.744817423170895e-05, + "loss": 1.9302, + "step": 2213 + }, + { + "epoch": 0.17, + "grad_norm": 0.8863818961107608, + "learning_rate": 4.744542407848309e-05, + "loss": 2.2086, + "step": 2214 + }, + { + "epoch": 0.17, + "grad_norm": 0.7061056169893236, + "learning_rate": 4.7442672523897205e-05, + "loss": 1.9792, + "step": 2215 + }, + { + "epoch": 0.17, + "grad_norm": 0.8597940885522649, + "learning_rate": 4.7439919568123075e-05, + "loss": 1.973, + "step": 2216 + }, + { + "epoch": 0.17, + "grad_norm": 0.6799079728415194, + "learning_rate": 4.743716521133258e-05, + "loss": 2.1038, + "step": 2217 + }, + { + "epoch": 0.17, + "grad_norm": 0.8826031338624678, + "learning_rate": 4.743440945369769e-05, + "loss": 2.2559, + "step": 2218 + }, + { + "epoch": 0.17, + "grad_norm": 0.6554724407852188, + "learning_rate": 4.743165229539045e-05, + "loss": 2.064, + "step": 2219 + }, + { + "epoch": 0.17, + "grad_norm": 0.8283170721138711, + "learning_rate": 4.742889373658301e-05, + "loss": 1.9921, + "step": 2220 + }, + { + "epoch": 0.17, + "grad_norm": 0.769524174784727, + "learning_rate": 4.742613377744759e-05, + "loss": 2.0017, + "step": 2221 + }, + { + "epoch": 0.17, + "grad_norm": 0.9865132812343029, + "learning_rate": 4.742337241815651e-05, + "loss": 2.1628, + "step": 2222 + }, + { + "epoch": 0.17, + "grad_norm": 0.7770542092231372, + "learning_rate": 4.742060965888218e-05, + "loss": 2.0369, + "step": 2223 + }, + { + "epoch": 0.17, + "grad_norm": 0.9627244630515337, + "learning_rate": 4.741784549979707e-05, + "loss": 2.0156, + "step": 2224 + }, + { + "epoch": 0.17, + "grad_norm": 0.8230627296314685, + "learning_rate": 4.741507994107378e-05, + "loss": 1.9922, + "step": 2225 + }, + { + "epoch": 0.17, + "grad_norm": 1.0440074939971662, + "learning_rate": 4.741231298288496e-05, + "loss": 2.1958, + "step": 2226 + }, + { + "epoch": 0.17, + "grad_norm": 0.7999073981294432, + "learning_rate": 4.7409544625403376e-05, + "loss": 2.0093, + "step": 2227 + }, + { + "epoch": 0.17, + "grad_norm": 0.9101365691576917, + "learning_rate": 4.7406774868801854e-05, + "loss": 1.9779, + "step": 2228 + }, + { + "epoch": 0.17, + "grad_norm": 0.7328674918503169, + "learning_rate": 4.740400371325333e-05, + "loss": 2.1031, + "step": 2229 + }, + { + "epoch": 0.17, + "grad_norm": 0.8182139240926867, + "learning_rate": 4.740123115893081e-05, + "loss": 2.1808, + "step": 2230 + }, + { + "epoch": 0.17, + "grad_norm": 0.770112800204449, + "learning_rate": 4.7398457206007404e-05, + "loss": 2.0, + "step": 2231 + }, + { + "epoch": 0.17, + "grad_norm": 0.7397900456538322, + "learning_rate": 4.7395681854656304e-05, + "loss": 2.046, + "step": 2232 + }, + { + "epoch": 0.17, + "grad_norm": 0.9795482557525457, + "learning_rate": 4.7392905105050786e-05, + "loss": 1.9678, + "step": 2233 + }, + { + "epoch": 0.17, + "grad_norm": 0.6687499942936698, + "learning_rate": 4.73901269573642e-05, + "loss": 2.1468, + "step": 2234 + }, + { + "epoch": 0.17, + "grad_norm": 0.9748739052543072, + "learning_rate": 4.738734741177002e-05, + "loss": 2.0643, + "step": 2235 + }, + { + "epoch": 0.17, + "grad_norm": 0.611640619787887, + "learning_rate": 4.738456646844176e-05, + "loss": 1.9843, + "step": 2236 + }, + { + "epoch": 0.17, + "grad_norm": 0.664586089769723, + "learning_rate": 4.738178412755306e-05, + "loss": 2.0164, + "step": 2237 + }, + { + "epoch": 0.17, + "grad_norm": 0.8897428639021979, + "learning_rate": 4.737900038927763e-05, + "loss": 2.2117, + "step": 2238 + }, + { + "epoch": 0.17, + "grad_norm": 0.6341164165295204, + "learning_rate": 4.737621525378927e-05, + "loss": 1.9878, + "step": 2239 + }, + { + "epoch": 0.17, + "grad_norm": 0.7995184631401555, + "learning_rate": 4.737342872126187e-05, + "loss": 1.9893, + "step": 2240 + }, + { + "epoch": 0.17, + "grad_norm": 0.7331363588765615, + "learning_rate": 4.7370640791869404e-05, + "loss": 2.0029, + "step": 2241 + }, + { + "epoch": 0.17, + "grad_norm": 0.7675685791988233, + "learning_rate": 4.736785146578593e-05, + "loss": 2.1929, + "step": 2242 + }, + { + "epoch": 0.17, + "grad_norm": 0.7447667360232648, + "learning_rate": 4.73650607431856e-05, + "loss": 1.9471, + "step": 2243 + }, + { + "epoch": 0.17, + "grad_norm": 0.6388670042930327, + "learning_rate": 4.736226862424265e-05, + "loss": 1.9826, + "step": 2244 + }, + { + "epoch": 0.17, + "grad_norm": 0.831382957636824, + "learning_rate": 4.7359475109131404e-05, + "loss": 2.0248, + "step": 2245 + }, + { + "epoch": 0.17, + "grad_norm": 0.6354317487868717, + "learning_rate": 4.735668019802627e-05, + "loss": 2.147, + "step": 2246 + }, + { + "epoch": 0.17, + "grad_norm": 0.8316490916330804, + "learning_rate": 4.735388389110175e-05, + "loss": 1.9758, + "step": 2247 + }, + { + "epoch": 0.17, + "grad_norm": 0.8564514551536018, + "learning_rate": 4.735108618853242e-05, + "loss": 2.0641, + "step": 2248 + }, + { + "epoch": 0.17, + "grad_norm": 0.6832468664648573, + "learning_rate": 4.734828709049297e-05, + "loss": 2.017, + "step": 2249 + }, + { + "epoch": 0.17, + "grad_norm": 0.8471333581249486, + "learning_rate": 4.734548659715814e-05, + "loss": 2.2009, + "step": 2250 + }, + { + "epoch": 0.17, + "grad_norm": 0.6124243485508711, + "learning_rate": 4.734268470870279e-05, + "loss": 1.9736, + "step": 2251 + }, + { + "epoch": 0.17, + "grad_norm": 0.7743167500632855, + "learning_rate": 4.733988142530184e-05, + "loss": 1.9879, + "step": 2252 + }, + { + "epoch": 0.17, + "grad_norm": 0.6994242435417036, + "learning_rate": 4.733707674713032e-05, + "loss": 1.9677, + "step": 2253 + }, + { + "epoch": 0.17, + "grad_norm": 0.6255152331373967, + "learning_rate": 4.733427067436334e-05, + "loss": 2.0716, + "step": 2254 + }, + { + "epoch": 0.17, + "grad_norm": 0.8560530345274114, + "learning_rate": 4.7331463207176085e-05, + "loss": 2.1861, + "step": 2255 + }, + { + "epoch": 0.17, + "grad_norm": 0.7110351004510782, + "learning_rate": 4.732865434574385e-05, + "loss": 1.9771, + "step": 2256 + }, + { + "epoch": 0.17, + "grad_norm": 0.7478884831751502, + "learning_rate": 4.732584409024199e-05, + "loss": 1.9959, + "step": 2257 + }, + { + "epoch": 0.17, + "grad_norm": 0.7481174060157333, + "learning_rate": 4.732303244084596e-05, + "loss": 2.22, + "step": 2258 + }, + { + "epoch": 0.17, + "grad_norm": 0.6823758471538995, + "learning_rate": 4.732021939773133e-05, + "loss": 2.0039, + "step": 2259 + }, + { + "epoch": 0.17, + "grad_norm": 0.8592711529355944, + "learning_rate": 4.731740496107369e-05, + "loss": 2.0505, + "step": 2260 + }, + { + "epoch": 0.17, + "grad_norm": 0.6770663520000079, + "learning_rate": 4.731458913104878e-05, + "loss": 2.0073, + "step": 2261 + }, + { + "epoch": 0.17, + "grad_norm": 0.6564807024453408, + "learning_rate": 4.73117719078324e-05, + "loss": 2.148, + "step": 2262 + }, + { + "epoch": 0.17, + "grad_norm": 0.8674542013229403, + "learning_rate": 4.7308953291600445e-05, + "loss": 1.9932, + "step": 2263 + }, + { + "epoch": 0.17, + "grad_norm": 0.7098646804239714, + "learning_rate": 4.7306133282528884e-05, + "loss": 2.0072, + "step": 2264 + }, + { + "epoch": 0.17, + "grad_norm": 0.7863901683057218, + "learning_rate": 4.7303311880793785e-05, + "loss": 1.97, + "step": 2265 + }, + { + "epoch": 0.17, + "grad_norm": 0.8253347442705621, + "learning_rate": 4.730048908657131e-05, + "loss": 2.0451, + "step": 2266 + }, + { + "epoch": 0.17, + "grad_norm": 0.8262551837465105, + "learning_rate": 4.729766490003768e-05, + "loss": 2.1899, + "step": 2267 + }, + { + "epoch": 0.17, + "grad_norm": 0.7402038367310115, + "learning_rate": 4.729483932136923e-05, + "loss": 2.0274, + "step": 2268 + }, + { + "epoch": 0.18, + "grad_norm": 0.7775684593042144, + "learning_rate": 4.729201235074238e-05, + "loss": 1.9667, + "step": 2269 + }, + { + "epoch": 0.18, + "grad_norm": 0.6828038902179513, + "learning_rate": 4.728918398833361e-05, + "loss": 2.2323, + "step": 2270 + }, + { + "epoch": 0.18, + "grad_norm": 0.6458227013858772, + "learning_rate": 4.7286354234319524e-05, + "loss": 2.0015, + "step": 2271 + }, + { + "epoch": 0.18, + "grad_norm": 0.6611350587631385, + "learning_rate": 4.728352308887679e-05, + "loss": 1.9831, + "step": 2272 + }, + { + "epoch": 0.18, + "grad_norm": 0.6701283915036454, + "learning_rate": 4.728069055218216e-05, + "loss": 2.0867, + "step": 2273 + }, + { + "epoch": 0.18, + "grad_norm": 0.5739344712462449, + "learning_rate": 4.727785662441249e-05, + "loss": 2.1736, + "step": 2274 + }, + { + "epoch": 0.18, + "grad_norm": 0.6429531847119568, + "learning_rate": 4.7275021305744716e-05, + "loss": 2.0118, + "step": 2275 + }, + { + "epoch": 0.18, + "grad_norm": 0.7812366594742745, + "learning_rate": 4.7272184596355844e-05, + "loss": 1.9718, + "step": 2276 + }, + { + "epoch": 0.18, + "grad_norm": 0.7607882151070329, + "learning_rate": 4.7269346496422996e-05, + "loss": 2.0225, + "step": 2277 + }, + { + "epoch": 0.18, + "grad_norm": 0.6176206893234318, + "learning_rate": 4.726650700612336e-05, + "loss": 2.1607, + "step": 2278 + }, + { + "epoch": 0.18, + "grad_norm": 0.6333759925842067, + "learning_rate": 4.7263666125634224e-05, + "loss": 2.094, + "step": 2279 + }, + { + "epoch": 0.18, + "grad_norm": 0.6006265029852581, + "learning_rate": 4.726082385513295e-05, + "loss": 2.0022, + "step": 2280 + }, + { + "epoch": 0.18, + "grad_norm": 0.596499968652833, + "learning_rate": 4.725798019479699e-05, + "loss": 1.9532, + "step": 2281 + }, + { + "epoch": 0.18, + "grad_norm": 0.6467741825110729, + "learning_rate": 4.7255135144803884e-05, + "loss": 2.2438, + "step": 2282 + }, + { + "epoch": 0.18, + "grad_norm": 0.6138294467899366, + "learning_rate": 4.725228870533127e-05, + "loss": 2.0103, + "step": 2283 + }, + { + "epoch": 0.18, + "grad_norm": 0.8036779673294767, + "learning_rate": 4.724944087655686e-05, + "loss": 1.964, + "step": 2284 + }, + { + "epoch": 0.18, + "grad_norm": 0.82541946793859, + "learning_rate": 4.724659165865845e-05, + "loss": 2.0979, + "step": 2285 + }, + { + "epoch": 0.18, + "grad_norm": 0.626936905684608, + "learning_rate": 4.724374105181393e-05, + "loss": 1.9684, + "step": 2286 + }, + { + "epoch": 0.18, + "grad_norm": 0.6147490892790837, + "learning_rate": 4.724088905620127e-05, + "loss": 2.1902, + "step": 2287 + }, + { + "epoch": 0.18, + "grad_norm": 0.7228858432519913, + "learning_rate": 4.7238035671998555e-05, + "loss": 2.0171, + "step": 2288 + }, + { + "epoch": 0.18, + "grad_norm": 0.6408512904840232, + "learning_rate": 4.723518089938391e-05, + "loss": 2.028, + "step": 2289 + }, + { + "epoch": 0.18, + "grad_norm": 0.6334019702754733, + "learning_rate": 4.723232473853557e-05, + "loss": 2.1964, + "step": 2290 + }, + { + "epoch": 0.18, + "grad_norm": 0.8022596608891602, + "learning_rate": 4.722946718963187e-05, + "loss": 2.0482, + "step": 2291 + }, + { + "epoch": 0.18, + "grad_norm": 0.744178095420191, + "learning_rate": 4.7226608252851215e-05, + "loss": 1.994, + "step": 2292 + }, + { + "epoch": 0.18, + "grad_norm": 0.6400314233154624, + "learning_rate": 4.722374792837209e-05, + "loss": 2.0074, + "step": 2293 + }, + { + "epoch": 0.18, + "grad_norm": 0.7664532159925955, + "learning_rate": 4.722088621637309e-05, + "loss": 2.214, + "step": 2294 + }, + { + "epoch": 0.18, + "grad_norm": 0.6593368383900502, + "learning_rate": 4.721802311703287e-05, + "loss": 1.9766, + "step": 2295 + }, + { + "epoch": 0.18, + "grad_norm": 0.9289243100644894, + "learning_rate": 4.72151586305302e-05, + "loss": 2.0148, + "step": 2296 + }, + { + "epoch": 0.18, + "grad_norm": 0.6120081628988943, + "learning_rate": 4.7212292757043916e-05, + "loss": 2.0284, + "step": 2297 + }, + { + "epoch": 0.18, + "grad_norm": 0.9375660098034472, + "learning_rate": 4.7209425496752945e-05, + "loss": 1.9266, + "step": 2298 + }, + { + "epoch": 0.18, + "grad_norm": 0.7405549037046271, + "learning_rate": 4.72065568498363e-05, + "loss": 2.163, + "step": 2299 + }, + { + "epoch": 0.18, + "grad_norm": 0.7500093703188777, + "learning_rate": 4.720368681647308e-05, + "loss": 1.9329, + "step": 2300 + }, + { + "epoch": 0.18, + "grad_norm": 0.855537338743673, + "learning_rate": 4.720081539684248e-05, + "loss": 2.0065, + "step": 2301 + }, + { + "epoch": 0.18, + "grad_norm": 0.6596873923541644, + "learning_rate": 4.719794259112377e-05, + "loss": 2.1889, + "step": 2302 + }, + { + "epoch": 0.18, + "grad_norm": 0.804295185037117, + "learning_rate": 4.719506839949631e-05, + "loss": 2.0001, + "step": 2303 + }, + { + "epoch": 0.18, + "grad_norm": 0.6570906406545723, + "learning_rate": 4.719219282213956e-05, + "loss": 2.1111, + "step": 2304 + }, + { + "epoch": 0.18, + "grad_norm": 0.7352542623935455, + "learning_rate": 4.718931585923304e-05, + "loss": 1.9935, + "step": 2305 + }, + { + "epoch": 0.18, + "grad_norm": 0.7498062613271765, + "learning_rate": 4.718643751095637e-05, + "loss": 2.1957, + "step": 2306 + }, + { + "epoch": 0.18, + "grad_norm": 0.7033133567511664, + "learning_rate": 4.718355777748927e-05, + "loss": 1.9835, + "step": 2307 + }, + { + "epoch": 0.18, + "grad_norm": 0.6600743093053663, + "learning_rate": 4.7180676659011516e-05, + "loss": 1.9515, + "step": 2308 + }, + { + "epoch": 0.18, + "grad_norm": 0.683478817451781, + "learning_rate": 4.7177794155703004e-05, + "loss": 1.9768, + "step": 2309 + }, + { + "epoch": 0.18, + "grad_norm": 0.6630316729717854, + "learning_rate": 4.7174910267743695e-05, + "loss": 2.0506, + "step": 2310 + }, + { + "epoch": 0.18, + "grad_norm": 0.64931722064103, + "learning_rate": 4.7172024995313634e-05, + "loss": 2.1634, + "step": 2311 + }, + { + "epoch": 0.18, + "grad_norm": 0.7905698167488725, + "learning_rate": 4.716913833859298e-05, + "loss": 1.9659, + "step": 2312 + }, + { + "epoch": 0.18, + "grad_norm": 0.6510276257360043, + "learning_rate": 4.7166250297761936e-05, + "loss": 1.9519, + "step": 2313 + }, + { + "epoch": 0.18, + "grad_norm": 0.7963777072486765, + "learning_rate": 4.716336087300083e-05, + "loss": 2.208, + "step": 2314 + }, + { + "epoch": 0.18, + "grad_norm": 0.6898933415254367, + "learning_rate": 4.716047006449005e-05, + "loss": 1.9863, + "step": 2315 + }, + { + "epoch": 0.18, + "grad_norm": 0.8683789186144247, + "learning_rate": 4.715757787241009e-05, + "loss": 2.0516, + "step": 2316 + }, + { + "epoch": 0.18, + "grad_norm": 0.8200129932086837, + "learning_rate": 4.715468429694152e-05, + "loss": 2.0297, + "step": 2317 + }, + { + "epoch": 0.18, + "grad_norm": 0.6914844773068379, + "learning_rate": 4.715178933826499e-05, + "loss": 1.9969, + "step": 2318 + }, + { + "epoch": 0.18, + "grad_norm": 0.8232841994859744, + "learning_rate": 4.7148892996561256e-05, + "loss": 2.2172, + "step": 2319 + }, + { + "epoch": 0.18, + "grad_norm": 0.7338345967617775, + "learning_rate": 4.714599527201113e-05, + "loss": 2.0036, + "step": 2320 + }, + { + "epoch": 0.18, + "grad_norm": 0.7050474363717165, + "learning_rate": 4.7143096164795554e-05, + "loss": 2.0321, + "step": 2321 + }, + { + "epoch": 0.18, + "grad_norm": 0.7349141590850311, + "learning_rate": 4.714019567509552e-05, + "loss": 2.0546, + "step": 2322 + }, + { + "epoch": 0.18, + "grad_norm": 0.6496724210225138, + "learning_rate": 4.713729380309211e-05, + "loss": 2.1735, + "step": 2323 + }, + { + "epoch": 0.18, + "grad_norm": 0.7487579177400563, + "learning_rate": 4.71343905489665e-05, + "loss": 1.9592, + "step": 2324 + }, + { + "epoch": 0.18, + "grad_norm": 0.6852632013917966, + "learning_rate": 4.7131485912899964e-05, + "loss": 1.985, + "step": 2325 + }, + { + "epoch": 0.18, + "grad_norm": 0.7545291562493922, + "learning_rate": 4.712857989507384e-05, + "loss": 2.1665, + "step": 2326 + }, + { + "epoch": 0.18, + "grad_norm": 0.644067669464844, + "learning_rate": 4.712567249566958e-05, + "loss": 1.9815, + "step": 2327 + }, + { + "epoch": 0.18, + "grad_norm": 0.902208787096129, + "learning_rate": 4.7122763714868676e-05, + "loss": 2.006, + "step": 2328 + }, + { + "epoch": 0.18, + "grad_norm": 0.7741701573641254, + "learning_rate": 4.7119853552852754e-05, + "loss": 1.9633, + "step": 2329 + }, + { + "epoch": 0.18, + "grad_norm": 0.8202262785045211, + "learning_rate": 4.711694200980351e-05, + "loss": 2.0201, + "step": 2330 + }, + { + "epoch": 0.18, + "grad_norm": 0.7125623033661771, + "learning_rate": 4.711402908590271e-05, + "loss": 2.1368, + "step": 2331 + }, + { + "epoch": 0.18, + "grad_norm": 0.8195414581391616, + "learning_rate": 4.7111114781332236e-05, + "loss": 1.9875, + "step": 2332 + }, + { + "epoch": 0.18, + "grad_norm": 0.6773615832984973, + "learning_rate": 4.710819909627403e-05, + "loss": 2.0147, + "step": 2333 + }, + { + "epoch": 0.18, + "grad_norm": 0.7920689320867647, + "learning_rate": 4.7105282030910125e-05, + "loss": 2.2293, + "step": 2334 + }, + { + "epoch": 0.18, + "grad_norm": 0.5946582109202839, + "learning_rate": 4.710236358542266e-05, + "loss": 2.0753, + "step": 2335 + }, + { + "epoch": 0.18, + "grad_norm": 0.657712051303984, + "learning_rate": 4.7099443759993837e-05, + "loss": 1.983, + "step": 2336 + }, + { + "epoch": 0.18, + "grad_norm": 0.6048672789078698, + "learning_rate": 4.709652255480594e-05, + "loss": 1.9798, + "step": 2337 + }, + { + "epoch": 0.18, + "grad_norm": 0.7193073746587328, + "learning_rate": 4.709359997004138e-05, + "loss": 1.9517, + "step": 2338 + }, + { + "epoch": 0.18, + "grad_norm": 0.6935720562506166, + "learning_rate": 4.7090676005882604e-05, + "loss": 2.206, + "step": 2339 + }, + { + "epoch": 0.18, + "grad_norm": 0.7610782032850921, + "learning_rate": 4.7087750662512176e-05, + "loss": 1.9729, + "step": 2340 + }, + { + "epoch": 0.18, + "grad_norm": 0.8141273375887564, + "learning_rate": 4.708482394011273e-05, + "loss": 1.9995, + "step": 2341 + }, + { + "epoch": 0.18, + "grad_norm": 0.7121293148824727, + "learning_rate": 4.708189583886701e-05, + "loss": 1.9934, + "step": 2342 + }, + { + "epoch": 0.18, + "grad_norm": 0.7981987079489882, + "learning_rate": 4.707896635895781e-05, + "loss": 2.1746, + "step": 2343 + }, + { + "epoch": 0.18, + "grad_norm": 0.7721896421473234, + "learning_rate": 4.7076035500568036e-05, + "loss": 1.9816, + "step": 2344 + }, + { + "epoch": 0.18, + "grad_norm": 0.7399160644936708, + "learning_rate": 4.7073103263880684e-05, + "loss": 2.0032, + "step": 2345 + }, + { + "epoch": 0.18, + "grad_norm": 0.8728616936472825, + "learning_rate": 4.707016964907881e-05, + "loss": 2.1918, + "step": 2346 + }, + { + "epoch": 0.18, + "grad_norm": 0.7415057420742341, + "learning_rate": 4.7067234656345585e-05, + "loss": 2.0882, + "step": 2347 + }, + { + "epoch": 0.18, + "grad_norm": 0.8717150180252916, + "learning_rate": 4.706429828586424e-05, + "loss": 2.0265, + "step": 2348 + }, + { + "epoch": 0.18, + "grad_norm": 0.6964360105938643, + "learning_rate": 4.706136053781811e-05, + "loss": 1.982, + "step": 2349 + }, + { + "epoch": 0.18, + "grad_norm": 0.7922125513153762, + "learning_rate": 4.705842141239061e-05, + "loss": 1.9785, + "step": 2350 + }, + { + "epoch": 0.18, + "grad_norm": 0.8620178246255268, + "learning_rate": 4.7055480909765246e-05, + "loss": 2.2115, + "step": 2351 + }, + { + "epoch": 0.18, + "grad_norm": 0.6431078819446007, + "learning_rate": 4.7052539030125605e-05, + "loss": 1.9849, + "step": 2352 + }, + { + "epoch": 0.18, + "grad_norm": 0.8656972101826599, + "learning_rate": 4.7049595773655354e-05, + "loss": 2.0261, + "step": 2353 + }, + { + "epoch": 0.18, + "grad_norm": 0.5748791145891335, + "learning_rate": 4.7046651140538254e-05, + "loss": 2.0088, + "step": 2354 + }, + { + "epoch": 0.18, + "grad_norm": 0.9484448275287364, + "learning_rate": 4.704370513095816e-05, + "loss": 2.1681, + "step": 2355 + }, + { + "epoch": 0.18, + "grad_norm": 0.6714215953690877, + "learning_rate": 4.7040757745099e-05, + "loss": 1.9677, + "step": 2356 + }, + { + "epoch": 0.18, + "grad_norm": 0.7568710651115563, + "learning_rate": 4.703780898314478e-05, + "loss": 2.0134, + "step": 2357 + }, + { + "epoch": 0.18, + "grad_norm": 0.7322763783823012, + "learning_rate": 4.7034858845279614e-05, + "loss": 2.1731, + "step": 2358 + }, + { + "epoch": 0.18, + "grad_norm": 0.6248436094859314, + "learning_rate": 4.7031907331687686e-05, + "loss": 2.0461, + "step": 2359 + }, + { + "epoch": 0.18, + "grad_norm": 0.7858325477692505, + "learning_rate": 4.7028954442553276e-05, + "loss": 1.9798, + "step": 2360 + }, + { + "epoch": 0.18, + "grad_norm": 0.6867747856137486, + "learning_rate": 4.7026000178060745e-05, + "loss": 1.9432, + "step": 2361 + }, + { + "epoch": 0.18, + "grad_norm": 0.7100661754213549, + "learning_rate": 4.702304453839453e-05, + "loss": 2.0057, + "step": 2362 + }, + { + "epoch": 0.18, + "grad_norm": 0.7705055658611255, + "learning_rate": 4.7020087523739186e-05, + "loss": 2.1714, + "step": 2363 + }, + { + "epoch": 0.18, + "grad_norm": 0.6361664936963715, + "learning_rate": 4.7017129134279307e-05, + "loss": 2.0617, + "step": 2364 + }, + { + "epoch": 0.18, + "grad_norm": 0.62851620789004, + "learning_rate": 4.7014169370199615e-05, + "loss": 2.0086, + "step": 2365 + }, + { + "epoch": 0.18, + "grad_norm": 0.7360203106700033, + "learning_rate": 4.701120823168489e-05, + "loss": 2.0948, + "step": 2366 + }, + { + "epoch": 0.18, + "grad_norm": 0.7045687881489059, + "learning_rate": 4.7008245718920005e-05, + "loss": 2.1657, + "step": 2367 + }, + { + "epoch": 0.18, + "grad_norm": 0.6437362584729972, + "learning_rate": 4.7005281832089934e-05, + "loss": 2.0071, + "step": 2368 + }, + { + "epoch": 0.18, + "grad_norm": 0.7407344383757167, + "learning_rate": 4.7002316571379715e-05, + "loss": 2.0272, + "step": 2369 + }, + { + "epoch": 0.18, + "grad_norm": 0.6528316540081227, + "learning_rate": 4.699934993697448e-05, + "loss": 1.9847, + "step": 2370 + }, + { + "epoch": 0.18, + "grad_norm": 0.707142817813905, + "learning_rate": 4.6996381929059466e-05, + "loss": 2.232, + "step": 2371 + }, + { + "epoch": 0.18, + "grad_norm": 0.7533255255702686, + "learning_rate": 4.699341254781995e-05, + "loss": 2.0115, + "step": 2372 + }, + { + "epoch": 0.18, + "grad_norm": 0.6602538492950344, + "learning_rate": 4.6990441793441344e-05, + "loss": 1.9749, + "step": 2373 + }, + { + "epoch": 0.18, + "grad_norm": 0.7460006573247863, + "learning_rate": 4.698746966610912e-05, + "loss": 1.984, + "step": 2374 + }, + { + "epoch": 0.18, + "grad_norm": 0.644995270240468, + "learning_rate": 4.6984496166008835e-05, + "loss": 2.2163, + "step": 2375 + }, + { + "epoch": 0.18, + "grad_norm": 0.8094113320180328, + "learning_rate": 4.6981521293326136e-05, + "loss": 1.994, + "step": 2376 + }, + { + "epoch": 0.18, + "grad_norm": 0.6921645976726828, + "learning_rate": 4.697854504824677e-05, + "loss": 1.9765, + "step": 2377 + }, + { + "epoch": 0.18, + "grad_norm": 0.6157273220243186, + "learning_rate": 4.697556743095654e-05, + "loss": 2.0594, + "step": 2378 + }, + { + "epoch": 0.18, + "grad_norm": 0.8109147459615079, + "learning_rate": 4.697258844164136e-05, + "loss": 2.2266, + "step": 2379 + }, + { + "epoch": 0.18, + "grad_norm": 0.600012315143977, + "learning_rate": 4.696960808048721e-05, + "loss": 1.9498, + "step": 2380 + }, + { + "epoch": 0.18, + "grad_norm": 0.7866080512635013, + "learning_rate": 4.696662634768019e-05, + "loss": 1.9794, + "step": 2381 + }, + { + "epoch": 0.18, + "grad_norm": 0.6267702135950278, + "learning_rate": 4.6963643243406427e-05, + "loss": 1.9943, + "step": 2382 + }, + { + "epoch": 0.18, + "grad_norm": 0.6558067975411246, + "learning_rate": 4.69606587678522e-05, + "loss": 2.2223, + "step": 2383 + }, + { + "epoch": 0.18, + "grad_norm": 0.6278321107074327, + "learning_rate": 4.695767292120383e-05, + "loss": 2.0248, + "step": 2384 + }, + { + "epoch": 0.18, + "grad_norm": 0.6757816358998711, + "learning_rate": 4.695468570364773e-05, + "loss": 2.0317, + "step": 2385 + }, + { + "epoch": 0.18, + "grad_norm": 0.6539438843570307, + "learning_rate": 4.695169711537041e-05, + "loss": 1.9651, + "step": 2386 + }, + { + "epoch": 0.18, + "grad_norm": 0.6967888616323279, + "learning_rate": 4.694870715655846e-05, + "loss": 2.1839, + "step": 2387 + }, + { + "epoch": 0.18, + "grad_norm": 0.6999453531174823, + "learning_rate": 4.694571582739856e-05, + "loss": 1.99, + "step": 2388 + }, + { + "epoch": 0.18, + "grad_norm": 0.6556003573154653, + "learning_rate": 4.6942723128077456e-05, + "loss": 1.9779, + "step": 2389 + }, + { + "epoch": 0.18, + "grad_norm": 0.784948135223806, + "learning_rate": 4.693972905878201e-05, + "loss": 2.2027, + "step": 2390 + }, + { + "epoch": 0.18, + "grad_norm": 0.63280086298786, + "learning_rate": 4.693673361969915e-05, + "loss": 2.0406, + "step": 2391 + }, + { + "epoch": 0.18, + "grad_norm": 0.6852467634835429, + "learning_rate": 4.693373681101589e-05, + "loss": 2.0333, + "step": 2392 + }, + { + "epoch": 0.18, + "grad_norm": 0.6550995047689948, + "learning_rate": 4.6930738632919334e-05, + "loss": 2.0042, + "step": 2393 + }, + { + "epoch": 0.18, + "grad_norm": 0.6820817184019495, + "learning_rate": 4.6927739085596665e-05, + "loss": 2.0063, + "step": 2394 + }, + { + "epoch": 0.18, + "grad_norm": 0.6858583753288315, + "learning_rate": 4.692473816923517e-05, + "loss": 2.1682, + "step": 2395 + }, + { + "epoch": 0.18, + "grad_norm": 0.6994882500529027, + "learning_rate": 4.69217358840222e-05, + "loss": 1.9989, + "step": 2396 + }, + { + "epoch": 0.18, + "grad_norm": 0.6813766758544803, + "learning_rate": 4.691873223014521e-05, + "loss": 2.0753, + "step": 2397 + }, + { + "epoch": 0.19, + "grad_norm": 0.743267037534749, + "learning_rate": 4.691572720779171e-05, + "loss": 1.9845, + "step": 2398 + }, + { + "epoch": 0.19, + "grad_norm": 0.7464977535491373, + "learning_rate": 4.6912720817149344e-05, + "loss": 2.1735, + "step": 2399 + }, + { + "epoch": 0.19, + "grad_norm": 0.6988264981373272, + "learning_rate": 4.6909713058405786e-05, + "loss": 1.997, + "step": 2400 + }, + { + "epoch": 0.19, + "grad_norm": 0.7415633551152035, + "learning_rate": 4.6906703931748837e-05, + "loss": 1.9978, + "step": 2401 + }, + { + "epoch": 0.19, + "grad_norm": 0.6862739221398028, + "learning_rate": 4.690369343736636e-05, + "loss": 1.9716, + "step": 2402 + }, + { + "epoch": 0.19, + "grad_norm": 0.7525748910971672, + "learning_rate": 4.690068157544633e-05, + "loss": 2.2354, + "step": 2403 + }, + { + "epoch": 0.19, + "grad_norm": 0.7222412199060225, + "learning_rate": 4.689766834617677e-05, + "loss": 2.0065, + "step": 2404 + }, + { + "epoch": 0.19, + "grad_norm": 0.8914678611446539, + "learning_rate": 4.689465374974583e-05, + "loss": 1.9628, + "step": 2405 + }, + { + "epoch": 0.19, + "grad_norm": 0.6815979702097102, + "learning_rate": 4.68916377863417e-05, + "loss": 2.02, + "step": 2406 + }, + { + "epoch": 0.19, + "grad_norm": 0.9786463779104423, + "learning_rate": 4.6888620456152685e-05, + "loss": 2.2259, + "step": 2407 + }, + { + "epoch": 0.19, + "grad_norm": 0.6766761212809368, + "learning_rate": 4.6885601759367185e-05, + "loss": 2.0244, + "step": 2408 + }, + { + "epoch": 0.19, + "grad_norm": 0.9457390830997467, + "learning_rate": 4.6882581696173654e-05, + "loss": 2.0258, + "step": 2409 + }, + { + "epoch": 0.19, + "grad_norm": 0.6682738258013812, + "learning_rate": 4.6879560266760655e-05, + "loss": 2.0148, + "step": 2410 + }, + { + "epoch": 0.19, + "grad_norm": 0.9996729742622479, + "learning_rate": 4.6876537471316825e-05, + "loss": 2.1908, + "step": 2411 + }, + { + "epoch": 0.19, + "grad_norm": 0.7582930961960058, + "learning_rate": 4.687351331003089e-05, + "loss": 1.9427, + "step": 2412 + }, + { + "epoch": 0.19, + "grad_norm": 0.9209677685399305, + "learning_rate": 4.687048778309165e-05, + "loss": 2.0308, + "step": 2413 + }, + { + "epoch": 0.19, + "grad_norm": 0.7093914039779269, + "learning_rate": 4.686746089068803e-05, + "loss": 1.9277, + "step": 2414 + }, + { + "epoch": 0.19, + "grad_norm": 0.6020204047246959, + "learning_rate": 4.6864432633008984e-05, + "loss": 2.2177, + "step": 2415 + }, + { + "epoch": 0.19, + "grad_norm": 0.6690675547998777, + "learning_rate": 4.686140301024358e-05, + "loss": 1.9599, + "step": 2416 + }, + { + "epoch": 0.19, + "grad_norm": 0.6935338427150075, + "learning_rate": 4.6858372022580985e-05, + "loss": 1.9729, + "step": 2417 + }, + { + "epoch": 0.19, + "grad_norm": 0.6817606223249436, + "learning_rate": 4.685533967021043e-05, + "loss": 1.995, + "step": 2418 + }, + { + "epoch": 0.19, + "grad_norm": 0.6533889145307378, + "learning_rate": 4.685230595332123e-05, + "loss": 2.1781, + "step": 2419 + }, + { + "epoch": 0.19, + "grad_norm": 0.6272304993663916, + "learning_rate": 4.6849270872102795e-05, + "loss": 1.9654, + "step": 2420 + }, + { + "epoch": 0.19, + "grad_norm": 0.6208635882396977, + "learning_rate": 4.684623442674463e-05, + "loss": 1.9978, + "step": 2421 + }, + { + "epoch": 0.19, + "grad_norm": 0.7090455630635236, + "learning_rate": 4.684319661743629e-05, + "loss": 2.0711, + "step": 2422 + }, + { + "epoch": 0.19, + "grad_norm": 0.785345367071294, + "learning_rate": 4.684015744436746e-05, + "loss": 2.1855, + "step": 2423 + }, + { + "epoch": 0.19, + "grad_norm": 0.7682614166296461, + "learning_rate": 4.683711690772788e-05, + "loss": 1.996, + "step": 2424 + }, + { + "epoch": 0.19, + "grad_norm": 0.673925360401252, + "learning_rate": 4.683407500770739e-05, + "loss": 1.9595, + "step": 2425 + }, + { + "epoch": 0.19, + "grad_norm": 0.7463416876586877, + "learning_rate": 4.6831031744495886e-05, + "loss": 1.9953, + "step": 2426 + }, + { + "epoch": 0.19, + "grad_norm": 0.6868098204062206, + "learning_rate": 4.682798711828339e-05, + "loss": 2.1238, + "step": 2427 + }, + { + "epoch": 0.19, + "grad_norm": 0.7755090925700292, + "learning_rate": 4.6824941129259984e-05, + "loss": 2.0348, + "step": 2428 + }, + { + "epoch": 0.19, + "grad_norm": 0.6580598810441544, + "learning_rate": 4.6821893777615853e-05, + "loss": 1.9601, + "step": 2429 + }, + { + "epoch": 0.19, + "grad_norm": 0.5903494072969236, + "learning_rate": 4.6818845063541237e-05, + "loss": 2.0145, + "step": 2430 + }, + { + "epoch": 0.19, + "grad_norm": 0.6606038154073814, + "learning_rate": 4.68157949872265e-05, + "loss": 2.1669, + "step": 2431 + }, + { + "epoch": 0.19, + "grad_norm": 0.5882545759951369, + "learning_rate": 4.681274354886205e-05, + "loss": 2.011, + "step": 2432 + }, + { + "epoch": 0.19, + "grad_norm": 0.6009549583719351, + "learning_rate": 4.680969074863842e-05, + "loss": 1.991, + "step": 2433 + }, + { + "epoch": 0.19, + "grad_norm": 0.6440689349448566, + "learning_rate": 4.680663658674619e-05, + "loss": 2.0577, + "step": 2434 + }, + { + "epoch": 0.19, + "grad_norm": 0.593953457515576, + "learning_rate": 4.680358106337607e-05, + "loss": 2.1275, + "step": 2435 + }, + { + "epoch": 0.19, + "grad_norm": 0.6209700270235651, + "learning_rate": 4.6800524178718794e-05, + "loss": 1.9511, + "step": 2436 + }, + { + "epoch": 0.19, + "grad_norm": 0.629379427169456, + "learning_rate": 4.6797465932965244e-05, + "loss": 1.9902, + "step": 2437 + }, + { + "epoch": 0.19, + "grad_norm": 0.6134594950209334, + "learning_rate": 4.6794406326306355e-05, + "loss": 1.972, + "step": 2438 + }, + { + "epoch": 0.19, + "grad_norm": 0.7002850129913979, + "learning_rate": 4.6791345358933136e-05, + "loss": 2.1939, + "step": 2439 + }, + { + "epoch": 0.19, + "grad_norm": 0.5788637489564847, + "learning_rate": 4.678828303103671e-05, + "loss": 2.0575, + "step": 2440 + }, + { + "epoch": 0.19, + "grad_norm": 0.7493564311339685, + "learning_rate": 4.6785219342808267e-05, + "loss": 2.0173, + "step": 2441 + }, + { + "epoch": 0.19, + "grad_norm": 0.5873567821096899, + "learning_rate": 4.678215429443908e-05, + "loss": 1.9608, + "step": 2442 + }, + { + "epoch": 0.19, + "grad_norm": 0.6946516804751978, + "learning_rate": 4.677908788612052e-05, + "loss": 2.1833, + "step": 2443 + }, + { + "epoch": 0.19, + "grad_norm": 0.5749428253025071, + "learning_rate": 4.677602011804403e-05, + "loss": 1.9992, + "step": 2444 + }, + { + "epoch": 0.19, + "grad_norm": 0.667333628695, + "learning_rate": 4.677295099040115e-05, + "loss": 2.0027, + "step": 2445 + }, + { + "epoch": 0.19, + "grad_norm": 0.564125826582981, + "learning_rate": 4.6769880503383494e-05, + "loss": 2.0503, + "step": 2446 + }, + { + "epoch": 0.19, + "grad_norm": 0.6558112280922334, + "learning_rate": 4.676680865718276e-05, + "loss": 2.1799, + "step": 2447 + }, + { + "epoch": 0.19, + "grad_norm": 0.6205326355591049, + "learning_rate": 4.676373545199075e-05, + "loss": 1.9966, + "step": 2448 + }, + { + "epoch": 0.19, + "grad_norm": 0.6013401005735152, + "learning_rate": 4.676066088799932e-05, + "loss": 1.993, + "step": 2449 + }, + { + "epoch": 0.19, + "grad_norm": 0.7082665361921955, + "learning_rate": 4.675758496540044e-05, + "loss": 2.0307, + "step": 2450 + }, + { + "epoch": 0.19, + "grad_norm": 0.6815068920718871, + "learning_rate": 4.675450768438615e-05, + "loss": 2.2017, + "step": 2451 + }, + { + "epoch": 0.19, + "grad_norm": 0.7863016694375223, + "learning_rate": 4.6751429045148564e-05, + "loss": 1.9444, + "step": 2452 + }, + { + "epoch": 0.19, + "grad_norm": 0.641644814287374, + "learning_rate": 4.6748349047879915e-05, + "loss": 2.0592, + "step": 2453 + }, + { + "epoch": 0.19, + "grad_norm": 0.6637420359780016, + "learning_rate": 4.674526769277249e-05, + "loss": 1.9671, + "step": 2454 + }, + { + "epoch": 0.19, + "grad_norm": 0.8215598966312048, + "learning_rate": 4.674218498001867e-05, + "loss": 2.1768, + "step": 2455 + }, + { + "epoch": 0.19, + "grad_norm": 0.6256409272116019, + "learning_rate": 4.6739100909810924e-05, + "loss": 1.9473, + "step": 2456 + }, + { + "epoch": 0.19, + "grad_norm": 0.7411965677407634, + "learning_rate": 4.67360154823418e-05, + "loss": 2.0086, + "step": 2457 + }, + { + "epoch": 0.19, + "grad_norm": 0.6874492364254969, + "learning_rate": 4.6732928697803934e-05, + "loss": 2.0286, + "step": 2458 + }, + { + "epoch": 0.19, + "grad_norm": 0.7214778841287579, + "learning_rate": 4.6729840556390055e-05, + "loss": 2.2292, + "step": 2459 + }, + { + "epoch": 0.19, + "grad_norm": 0.8902793631900654, + "learning_rate": 4.672675105829296e-05, + "loss": 2.0058, + "step": 2460 + }, + { + "epoch": 0.19, + "grad_norm": 0.6500647663352871, + "learning_rate": 4.6723660203705534e-05, + "loss": 1.9868, + "step": 2461 + }, + { + "epoch": 0.19, + "grad_norm": 0.8034596807430502, + "learning_rate": 4.6720567992820766e-05, + "loss": 1.9587, + "step": 2462 + }, + { + "epoch": 0.19, + "grad_norm": 0.6284560820977981, + "learning_rate": 4.6717474425831696e-05, + "loss": 2.1853, + "step": 2463 + }, + { + "epoch": 0.19, + "grad_norm": 0.7452628968630005, + "learning_rate": 4.6714379502931495e-05, + "loss": 1.9979, + "step": 2464 + }, + { + "epoch": 0.19, + "grad_norm": 0.9020655853096545, + "learning_rate": 4.671128322431337e-05, + "loss": 2.1166, + "step": 2465 + }, + { + "epoch": 0.19, + "grad_norm": 0.6905652344042049, + "learning_rate": 4.670818559017064e-05, + "loss": 1.9956, + "step": 2466 + }, + { + "epoch": 0.19, + "grad_norm": 0.9088184554615418, + "learning_rate": 4.67050866006967e-05, + "loss": 2.1709, + "step": 2467 + }, + { + "epoch": 0.19, + "grad_norm": 0.9334777492351023, + "learning_rate": 4.6701986256085046e-05, + "loss": 2.0273, + "step": 2468 + }, + { + "epoch": 0.19, + "grad_norm": 0.7607925636820858, + "learning_rate": 4.669888455652923e-05, + "loss": 2.0003, + "step": 2469 + }, + { + "epoch": 0.19, + "grad_norm": 0.942315784948906, + "learning_rate": 4.669578150222291e-05, + "loss": 1.9725, + "step": 2470 + }, + { + "epoch": 0.19, + "grad_norm": 0.6436585398899733, + "learning_rate": 4.669267709335983e-05, + "loss": 2.0215, + "step": 2471 + }, + { + "epoch": 0.19, + "grad_norm": 0.9908122001421829, + "learning_rate": 4.66895713301338e-05, + "loss": 2.159, + "step": 2472 + }, + { + "epoch": 0.19, + "grad_norm": 0.7194983822927975, + "learning_rate": 4.6686464212738724e-05, + "loss": 2.0396, + "step": 2473 + }, + { + "epoch": 0.19, + "grad_norm": 0.710400495671655, + "learning_rate": 4.6683355741368594e-05, + "loss": 2.0254, + "step": 2474 + }, + { + "epoch": 0.19, + "grad_norm": 0.8116954626767107, + "learning_rate": 4.66802459162175e-05, + "loss": 2.1892, + "step": 2475 + }, + { + "epoch": 0.19, + "grad_norm": 0.5833524321714841, + "learning_rate": 4.6677134737479576e-05, + "loss": 1.9489, + "step": 2476 + }, + { + "epoch": 0.19, + "grad_norm": 0.745788769867993, + "learning_rate": 4.6674022205349085e-05, + "loss": 2.0047, + "step": 2477 + }, + { + "epoch": 0.19, + "grad_norm": 0.6438366094071706, + "learning_rate": 4.6670908320020346e-05, + "loss": 1.9714, + "step": 2478 + }, + { + "epoch": 0.19, + "grad_norm": 0.7288017204382724, + "learning_rate": 4.666779308168778e-05, + "loss": 2.1895, + "step": 2479 + }, + { + "epoch": 0.19, + "grad_norm": 0.8968327544160674, + "learning_rate": 4.666467649054587e-05, + "loss": 1.9725, + "step": 2480 + }, + { + "epoch": 0.19, + "grad_norm": 0.6789129988395259, + "learning_rate": 4.666155854678921e-05, + "loss": 1.9716, + "step": 2481 + }, + { + "epoch": 0.19, + "grad_norm": 0.8969814698472731, + "learning_rate": 4.6658439250612465e-05, + "loss": 1.966, + "step": 2482 + }, + { + "epoch": 0.19, + "grad_norm": 0.6383086482161018, + "learning_rate": 4.665531860221038e-05, + "loss": 2.1536, + "step": 2483 + }, + { + "epoch": 0.19, + "grad_norm": 1.088937701805825, + "learning_rate": 4.665219660177779e-05, + "loss": 2.0763, + "step": 2484 + }, + { + "epoch": 0.19, + "grad_norm": 0.7555931351443268, + "learning_rate": 4.664907324950962e-05, + "loss": 1.9906, + "step": 2485 + }, + { + "epoch": 0.19, + "grad_norm": 0.853980941632526, + "learning_rate": 4.664594854560087e-05, + "loss": 1.998, + "step": 2486 + }, + { + "epoch": 0.19, + "grad_norm": 0.8620016157143996, + "learning_rate": 4.664282249024663e-05, + "loss": 2.1926, + "step": 2487 + }, + { + "epoch": 0.19, + "grad_norm": 0.6415795421207837, + "learning_rate": 4.6639695083642064e-05, + "loss": 2.0103, + "step": 2488 + }, + { + "epoch": 0.19, + "grad_norm": 0.8729458814588819, + "learning_rate": 4.6636566325982446e-05, + "loss": 1.9789, + "step": 2489 + }, + { + "epoch": 0.19, + "grad_norm": 0.7013923046202719, + "learning_rate": 4.66334362174631e-05, + "loss": 2.0691, + "step": 2490 + }, + { + "epoch": 0.19, + "grad_norm": 0.9335828328259878, + "learning_rate": 4.6630304758279464e-05, + "loss": 2.1607, + "step": 2491 + }, + { + "epoch": 0.19, + "grad_norm": 0.8134259988417311, + "learning_rate": 4.6627171948627035e-05, + "loss": 2.0134, + "step": 2492 + }, + { + "epoch": 0.19, + "grad_norm": 0.6366219479972837, + "learning_rate": 4.662403778870142e-05, + "loss": 1.9299, + "step": 2493 + }, + { + "epoch": 0.19, + "grad_norm": 0.8955690564080252, + "learning_rate": 4.662090227869829e-05, + "loss": 2.0202, + "step": 2494 + }, + { + "epoch": 0.19, + "grad_norm": 0.6893279595012406, + "learning_rate": 4.661776541881341e-05, + "loss": 2.1555, + "step": 2495 + }, + { + "epoch": 0.19, + "grad_norm": 0.8340311214658663, + "learning_rate": 4.661462720924262e-05, + "loss": 2.0391, + "step": 2496 + }, + { + "epoch": 0.19, + "grad_norm": 0.6726823172223888, + "learning_rate": 4.661148765018187e-05, + "loss": 1.9803, + "step": 2497 + }, + { + "epoch": 0.19, + "grad_norm": 0.7181456520297506, + "learning_rate": 4.660834674182716e-05, + "loss": 1.9901, + "step": 2498 + }, + { + "epoch": 0.19, + "grad_norm": 0.8283961551635887, + "learning_rate": 4.6605204484374595e-05, + "loss": 2.1864, + "step": 2499 + }, + { + "epoch": 0.19, + "grad_norm": 0.5952219369838586, + "learning_rate": 4.6602060878020356e-05, + "loss": 1.9861, + "step": 2500 + }, + { + "epoch": 0.19, + "grad_norm": 0.9130134026602053, + "learning_rate": 4.659891592296071e-05, + "loss": 2.0087, + "step": 2501 + }, + { + "epoch": 0.19, + "grad_norm": 0.6755865748467891, + "learning_rate": 4.6595769619392017e-05, + "loss": 2.0502, + "step": 2502 + }, + { + "epoch": 0.19, + "grad_norm": 0.9135357547477468, + "learning_rate": 4.65926219675107e-05, + "loss": 1.9739, + "step": 2503 + }, + { + "epoch": 0.19, + "grad_norm": 0.835995010263368, + "learning_rate": 4.65894729675133e-05, + "loss": 2.1934, + "step": 2504 + }, + { + "epoch": 0.19, + "grad_norm": 0.702827364277536, + "learning_rate": 4.65863226195964e-05, + "loss": 1.9983, + "step": 2505 + }, + { + "epoch": 0.19, + "grad_norm": 0.8758870258136955, + "learning_rate": 4.658317092395671e-05, + "loss": 2.0134, + "step": 2506 + }, + { + "epoch": 0.19, + "grad_norm": 0.5975634318088306, + "learning_rate": 4.6580017880790996e-05, + "loss": 2.1986, + "step": 2507 + }, + { + "epoch": 0.19, + "grad_norm": 0.721687439031074, + "learning_rate": 4.65768634902961e-05, + "loss": 2.0557, + "step": 2508 + }, + { + "epoch": 0.19, + "grad_norm": 0.6428451818146449, + "learning_rate": 4.657370775266898e-05, + "loss": 1.9956, + "step": 2509 + }, + { + "epoch": 0.19, + "grad_norm": 0.7195816553338311, + "learning_rate": 4.6570550668106664e-05, + "loss": 1.9929, + "step": 2510 + }, + { + "epoch": 0.19, + "grad_norm": 0.888510653377431, + "learning_rate": 4.656739223680625e-05, + "loss": 2.2015, + "step": 2511 + }, + { + "epoch": 0.19, + "grad_norm": 0.6077243773500849, + "learning_rate": 4.656423245896494e-05, + "loss": 1.9667, + "step": 2512 + }, + { + "epoch": 0.19, + "grad_norm": 0.7652849593710155, + "learning_rate": 4.656107133478001e-05, + "loss": 2.002, + "step": 2513 + }, + { + "epoch": 0.19, + "grad_norm": 0.6359814216983952, + "learning_rate": 4.6557908864448824e-05, + "loss": 1.9514, + "step": 2514 + }, + { + "epoch": 0.19, + "grad_norm": 0.703376085952886, + "learning_rate": 4.6554745048168825e-05, + "loss": 2.0369, + "step": 2515 + }, + { + "epoch": 0.19, + "grad_norm": 0.8130185953694179, + "learning_rate": 4.6551579886137544e-05, + "loss": 2.1346, + "step": 2516 + }, + { + "epoch": 0.19, + "grad_norm": 0.5785374215210156, + "learning_rate": 4.654841337855259e-05, + "loss": 2.0095, + "step": 2517 + }, + { + "epoch": 0.19, + "grad_norm": 0.7922265488236757, + "learning_rate": 4.6545245525611665e-05, + "loss": 1.9888, + "step": 2518 + }, + { + "epoch": 0.19, + "grad_norm": 0.6792421944658242, + "learning_rate": 4.654207632751255e-05, + "loss": 2.1704, + "step": 2519 + }, + { + "epoch": 0.19, + "grad_norm": 0.6529081425535241, + "learning_rate": 4.6538905784453126e-05, + "loss": 1.9981, + "step": 2520 + }, + { + "epoch": 0.19, + "grad_norm": 0.6233506140798122, + "learning_rate": 4.653573389663132e-05, + "loss": 2.0454, + "step": 2521 + }, + { + "epoch": 0.19, + "grad_norm": 0.6143485948861483, + "learning_rate": 4.6532560664245175e-05, + "loss": 2.0105, + "step": 2522 + }, + { + "epoch": 0.19, + "grad_norm": 0.5878004162490259, + "learning_rate": 4.652938608749281e-05, + "loss": 2.152, + "step": 2523 + }, + { + "epoch": 0.19, + "grad_norm": 0.6329684725306782, + "learning_rate": 4.6526210166572427e-05, + "loss": 1.9562, + "step": 2524 + }, + { + "epoch": 0.19, + "grad_norm": 0.6058239023321682, + "learning_rate": 4.652303290168231e-05, + "loss": 2.0145, + "step": 2525 + }, + { + "epoch": 0.19, + "grad_norm": 0.5722880139831528, + "learning_rate": 4.651985429302083e-05, + "loss": 2.0082, + "step": 2526 + }, + { + "epoch": 0.19, + "grad_norm": 0.5812835698838148, + "learning_rate": 4.651667434078645e-05, + "loss": 2.0681, + "step": 2527 + }, + { + "epoch": 0.2, + "grad_norm": 0.6398155508932993, + "learning_rate": 4.651349304517768e-05, + "loss": 2.1892, + "step": 2528 + }, + { + "epoch": 0.2, + "grad_norm": 0.6711898483730057, + "learning_rate": 4.6510310406393176e-05, + "loss": 2.0353, + "step": 2529 + }, + { + "epoch": 0.2, + "grad_norm": 0.6060037677450617, + "learning_rate": 4.650712642463162e-05, + "loss": 1.9769, + "step": 2530 + }, + { + "epoch": 0.2, + "grad_norm": 0.7229068100509315, + "learning_rate": 4.6503941100091816e-05, + "loss": 2.1607, + "step": 2531 + }, + { + "epoch": 0.2, + "grad_norm": 0.5770847722411103, + "learning_rate": 4.6500754432972615e-05, + "loss": 1.9753, + "step": 2532 + }, + { + "epoch": 0.2, + "grad_norm": 0.6245591970449133, + "learning_rate": 4.6497566423473e-05, + "loss": 2.0326, + "step": 2533 + }, + { + "epoch": 0.2, + "grad_norm": 0.6484247102216382, + "learning_rate": 4.6494377071791996e-05, + "loss": 2.0061, + "step": 2534 + }, + { + "epoch": 0.2, + "grad_norm": 0.6980271774582169, + "learning_rate": 4.649118637812872e-05, + "loss": 1.9911, + "step": 2535 + }, + { + "epoch": 0.2, + "grad_norm": 0.6816791513504178, + "learning_rate": 4.648799434268241e-05, + "loss": 2.2098, + "step": 2536 + }, + { + "epoch": 0.2, + "grad_norm": 0.7105850829778596, + "learning_rate": 4.648480096565232e-05, + "loss": 1.9778, + "step": 2537 + }, + { + "epoch": 0.2, + "grad_norm": 0.6374545856159565, + "learning_rate": 4.6481606247237854e-05, + "loss": 1.9386, + "step": 2538 + }, + { + "epoch": 0.2, + "grad_norm": 0.7769387561562383, + "learning_rate": 4.647841018763846e-05, + "loss": 2.2127, + "step": 2539 + }, + { + "epoch": 0.2, + "grad_norm": 0.6266625671429538, + "learning_rate": 4.647521278705368e-05, + "loss": 2.0484, + "step": 2540 + }, + { + "epoch": 0.2, + "grad_norm": 0.6609623499769247, + "learning_rate": 4.647201404568314e-05, + "loss": 1.9669, + "step": 2541 + }, + { + "epoch": 0.2, + "grad_norm": 0.7607320207173331, + "learning_rate": 4.6468813963726556e-05, + "loss": 2.0151, + "step": 2542 + }, + { + "epoch": 0.2, + "grad_norm": 0.7523095930951635, + "learning_rate": 4.646561254138372e-05, + "loss": 2.1938, + "step": 2543 + }, + { + "epoch": 0.2, + "grad_norm": 0.6403148566506762, + "learning_rate": 4.646240977885451e-05, + "loss": 1.9833, + "step": 2544 + }, + { + "epoch": 0.2, + "grad_norm": 0.6661875918009426, + "learning_rate": 4.645920567633889e-05, + "loss": 2.0046, + "step": 2545 + }, + { + "epoch": 0.2, + "grad_norm": 0.6264816346894045, + "learning_rate": 4.645600023403691e-05, + "loss": 2.1022, + "step": 2546 + }, + { + "epoch": 0.2, + "grad_norm": 0.5803142986930665, + "learning_rate": 4.645279345214868e-05, + "loss": 1.9611, + "step": 2547 + }, + { + "epoch": 0.2, + "grad_norm": 0.650348397766849, + "learning_rate": 4.644958533087443e-05, + "loss": 2.1951, + "step": 2548 + }, + { + "epoch": 0.2, + "grad_norm": 0.5958432771031661, + "learning_rate": 4.6446375870414446e-05, + "loss": 1.9965, + "step": 2549 + }, + { + "epoch": 0.2, + "grad_norm": 0.6126603952489209, + "learning_rate": 4.644316507096912e-05, + "loss": 2.023, + "step": 2550 + }, + { + "epoch": 0.2, + "grad_norm": 0.6381507234176351, + "learning_rate": 4.64399529327389e-05, + "loss": 2.1905, + "step": 2551 + }, + { + "epoch": 0.2, + "grad_norm": 0.5460717086865683, + "learning_rate": 4.643673945592435e-05, + "loss": 2.0771, + "step": 2552 + }, + { + "epoch": 0.2, + "grad_norm": 0.6064345758734442, + "learning_rate": 4.643352464072608e-05, + "loss": 2.0046, + "step": 2553 + }, + { + "epoch": 0.2, + "grad_norm": 0.6930770852689816, + "learning_rate": 4.643030848734483e-05, + "loss": 1.9558, + "step": 2554 + }, + { + "epoch": 0.2, + "grad_norm": 0.5887981747283252, + "learning_rate": 4.642709099598137e-05, + "loss": 1.9727, + "step": 2555 + }, + { + "epoch": 0.2, + "grad_norm": 0.7391375565275787, + "learning_rate": 4.64238721668366e-05, + "loss": 2.1845, + "step": 2556 + }, + { + "epoch": 0.2, + "grad_norm": 0.6099042053901921, + "learning_rate": 4.642065200011147e-05, + "loss": 1.9381, + "step": 2557 + }, + { + "epoch": 0.2, + "grad_norm": 0.6410366767558648, + "learning_rate": 4.641743049600705e-05, + "loss": 2.0433, + "step": 2558 + }, + { + "epoch": 0.2, + "grad_norm": 0.5850955014806185, + "learning_rate": 4.641420765472446e-05, + "loss": 1.968, + "step": 2559 + }, + { + "epoch": 0.2, + "grad_norm": 0.7167077705853634, + "learning_rate": 4.641098347646491e-05, + "loss": 2.1841, + "step": 2560 + }, + { + "epoch": 0.2, + "grad_norm": 0.6040635664927856, + "learning_rate": 4.6407757961429696e-05, + "loss": 2.0183, + "step": 2561 + }, + { + "epoch": 0.2, + "grad_norm": 0.5906424527254666, + "learning_rate": 4.640453110982022e-05, + "loss": 1.9386, + "step": 2562 + }, + { + "epoch": 0.2, + "grad_norm": 0.6744807983581735, + "learning_rate": 4.640130292183793e-05, + "loss": 2.1929, + "step": 2563 + }, + { + "epoch": 0.2, + "grad_norm": 0.6552171141975681, + "learning_rate": 4.639807339768437e-05, + "loss": 2.1057, + "step": 2564 + }, + { + "epoch": 0.2, + "grad_norm": 0.5951574643730799, + "learning_rate": 4.63948425375612e-05, + "loss": 1.9701, + "step": 2565 + }, + { + "epoch": 0.2, + "grad_norm": 0.6844788004792994, + "learning_rate": 4.6391610341670105e-05, + "loss": 1.9789, + "step": 2566 + }, + { + "epoch": 0.2, + "grad_norm": 0.6408402412644457, + "learning_rate": 4.6388376810212905e-05, + "loss": 1.9848, + "step": 2567 + }, + { + "epoch": 0.2, + "grad_norm": 0.6167414151099627, + "learning_rate": 4.6385141943391466e-05, + "loss": 2.1842, + "step": 2568 + }, + { + "epoch": 0.2, + "grad_norm": 0.6755076403896096, + "learning_rate": 4.6381905741407774e-05, + "loss": 1.9806, + "step": 2569 + }, + { + "epoch": 0.2, + "grad_norm": 0.6814254611283641, + "learning_rate": 4.637866820446386e-05, + "loss": 1.9505, + "step": 2570 + }, + { + "epoch": 0.2, + "grad_norm": 0.6557317028419934, + "learning_rate": 4.637542933276186e-05, + "loss": 2.0537, + "step": 2571 + }, + { + "epoch": 0.2, + "grad_norm": 0.6007755011050209, + "learning_rate": 4.6372189126504004e-05, + "loss": 2.1509, + "step": 2572 + }, + { + "epoch": 0.2, + "grad_norm": 0.7141725935732299, + "learning_rate": 4.636894758589257e-05, + "loss": 2.0141, + "step": 2573 + }, + { + "epoch": 0.2, + "grad_norm": 0.6034986590006324, + "learning_rate": 4.6365704711129973e-05, + "loss": 1.9326, + "step": 2574 + }, + { + "epoch": 0.2, + "grad_norm": 0.6847327216872016, + "learning_rate": 4.636246050241864e-05, + "loss": 2.1582, + "step": 2575 + }, + { + "epoch": 0.2, + "grad_norm": 0.6466119695120638, + "learning_rate": 4.635921495996115e-05, + "loss": 2.0249, + "step": 2576 + }, + { + "epoch": 0.2, + "grad_norm": 0.5275950533765944, + "learning_rate": 4.635596808396011e-05, + "loss": 2.0512, + "step": 2577 + }, + { + "epoch": 0.2, + "grad_norm": 0.5945026369793002, + "learning_rate": 4.635271987461827e-05, + "loss": 1.9339, + "step": 2578 + }, + { + "epoch": 0.2, + "grad_norm": 0.6578289301259045, + "learning_rate": 4.634947033213839e-05, + "loss": 1.9694, + "step": 2579 + }, + { + "epoch": 0.2, + "grad_norm": 0.6301945375508965, + "learning_rate": 4.634621945672338e-05, + "loss": 2.1696, + "step": 2580 + }, + { + "epoch": 0.2, + "grad_norm": 0.6098334593422985, + "learning_rate": 4.6342967248576194e-05, + "loss": 2.0155, + "step": 2581 + }, + { + "epoch": 0.2, + "grad_norm": 0.5744365610517691, + "learning_rate": 4.633971370789989e-05, + "loss": 2.0213, + "step": 2582 + }, + { + "epoch": 0.2, + "grad_norm": 0.6221750693439922, + "learning_rate": 4.633645883489759e-05, + "loss": 2.0296, + "step": 2583 + }, + { + "epoch": 0.2, + "grad_norm": 0.6405853238989996, + "learning_rate": 4.633320262977251e-05, + "loss": 2.1845, + "step": 2584 + }, + { + "epoch": 0.2, + "grad_norm": 0.6666321309748923, + "learning_rate": 4.632994509272795e-05, + "loss": 2.0011, + "step": 2585 + }, + { + "epoch": 0.2, + "grad_norm": 0.6275811478950984, + "learning_rate": 4.6326686223967295e-05, + "loss": 1.9741, + "step": 2586 + }, + { + "epoch": 0.2, + "grad_norm": 0.6140932652479769, + "learning_rate": 4.632342602369401e-05, + "loss": 1.9649, + "step": 2587 + }, + { + "epoch": 0.2, + "grad_norm": 0.5771464129573756, + "learning_rate": 4.632016449211163e-05, + "loss": 2.172, + "step": 2588 + }, + { + "epoch": 0.2, + "grad_norm": 0.64391990799126, + "learning_rate": 4.6316901629423806e-05, + "loss": 2.0208, + "step": 2589 + }, + { + "epoch": 0.2, + "grad_norm": 0.6455979550112021, + "learning_rate": 4.6313637435834235e-05, + "loss": 1.9673, + "step": 2590 + }, + { + "epoch": 0.2, + "grad_norm": 0.7881033449711133, + "learning_rate": 4.631037191154672e-05, + "loss": 1.9972, + "step": 2591 + }, + { + "epoch": 0.2, + "grad_norm": 0.6636328167441744, + "learning_rate": 4.630710505676514e-05, + "loss": 2.1885, + "step": 2592 + }, + { + "epoch": 0.2, + "grad_norm": 0.681236055433829, + "learning_rate": 4.6303836871693465e-05, + "loss": 1.965, + "step": 2593 + }, + { + "epoch": 0.2, + "grad_norm": 0.6256080838698901, + "learning_rate": 4.630056735653574e-05, + "loss": 2.0288, + "step": 2594 + }, + { + "epoch": 0.2, + "grad_norm": 0.714541968053065, + "learning_rate": 4.629729651149608e-05, + "loss": 2.1176, + "step": 2595 + }, + { + "epoch": 0.2, + "grad_norm": 0.7330178717854247, + "learning_rate": 4.629402433677871e-05, + "loss": 2.2151, + "step": 2596 + }, + { + "epoch": 0.2, + "grad_norm": 0.6947625806918035, + "learning_rate": 4.629075083258792e-05, + "loss": 1.9903, + "step": 2597 + }, + { + "epoch": 0.2, + "grad_norm": 0.6977404663409702, + "learning_rate": 4.62874759991281e-05, + "loss": 1.9885, + "step": 2598 + }, + { + "epoch": 0.2, + "grad_norm": 0.5868231309195191, + "learning_rate": 4.628419983660369e-05, + "loss": 1.9212, + "step": 2599 + }, + { + "epoch": 0.2, + "grad_norm": 0.8153837171888013, + "learning_rate": 4.6280922345219255e-05, + "loss": 2.2106, + "step": 2600 + }, + { + "epoch": 0.2, + "grad_norm": 0.6169069233598656, + "learning_rate": 4.627764352517942e-05, + "loss": 2.0273, + "step": 2601 + }, + { + "epoch": 0.2, + "grad_norm": 0.8114057796826045, + "learning_rate": 4.627436337668888e-05, + "loss": 2.078, + "step": 2602 + }, + { + "epoch": 0.2, + "grad_norm": 0.6880936366867727, + "learning_rate": 4.6271081899952435e-05, + "loss": 2.0006, + "step": 2603 + }, + { + "epoch": 0.2, + "grad_norm": 0.6814061456837147, + "learning_rate": 4.626779909517497e-05, + "loss": 2.2114, + "step": 2604 + }, + { + "epoch": 0.2, + "grad_norm": 0.6313515731882557, + "learning_rate": 4.626451496256143e-05, + "loss": 1.9582, + "step": 2605 + }, + { + "epoch": 0.2, + "grad_norm": 0.7245878825509109, + "learning_rate": 4.6261229502316866e-05, + "loss": 1.9866, + "step": 2606 + }, + { + "epoch": 0.2, + "grad_norm": 0.648329670544724, + "learning_rate": 4.6257942714646405e-05, + "loss": 2.2024, + "step": 2607 + }, + { + "epoch": 0.2, + "grad_norm": 0.7756371981498712, + "learning_rate": 4.625465459975524e-05, + "loss": 2.056, + "step": 2608 + }, + { + "epoch": 0.2, + "grad_norm": 0.6740421069458721, + "learning_rate": 4.625136515784868e-05, + "loss": 1.9531, + "step": 2609 + }, + { + "epoch": 0.2, + "grad_norm": 0.6511230351410632, + "learning_rate": 4.62480743891321e-05, + "loss": 1.9944, + "step": 2610 + }, + { + "epoch": 0.2, + "grad_norm": 0.6263895442328945, + "learning_rate": 4.624478229381093e-05, + "loss": 1.9938, + "step": 2611 + }, + { + "epoch": 0.2, + "grad_norm": 0.6874119041578052, + "learning_rate": 4.624148887209073e-05, + "loss": 2.2105, + "step": 2612 + }, + { + "epoch": 0.2, + "grad_norm": 0.6195395378963254, + "learning_rate": 4.623819412417713e-05, + "loss": 2.0065, + "step": 2613 + }, + { + "epoch": 0.2, + "grad_norm": 0.6165988337872469, + "learning_rate": 4.6234898050275805e-05, + "loss": 2.0464, + "step": 2614 + }, + { + "epoch": 0.2, + "grad_norm": 0.6480559727054603, + "learning_rate": 4.623160065059257e-05, + "loss": 1.9912, + "step": 2615 + }, + { + "epoch": 0.2, + "grad_norm": 0.6252933703490435, + "learning_rate": 4.622830192533328e-05, + "loss": 2.1766, + "step": 2616 + }, + { + "epoch": 0.2, + "grad_norm": 0.6042034575515792, + "learning_rate": 4.622500187470389e-05, + "loss": 1.9972, + "step": 2617 + }, + { + "epoch": 0.2, + "grad_norm": 0.7110084457404399, + "learning_rate": 4.622170049891045e-05, + "loss": 1.9954, + "step": 2618 + }, + { + "epoch": 0.2, + "grad_norm": 0.6202881382255014, + "learning_rate": 4.6218397798159054e-05, + "loss": 1.9858, + "step": 2619 + }, + { + "epoch": 0.2, + "grad_norm": 0.664962645004124, + "learning_rate": 4.621509377265592e-05, + "loss": 2.2143, + "step": 2620 + }, + { + "epoch": 0.2, + "grad_norm": 0.6597752179624428, + "learning_rate": 4.6211788422607334e-05, + "loss": 1.9589, + "step": 2621 + }, + { + "epoch": 0.2, + "grad_norm": 0.6344383572699662, + "learning_rate": 4.620848174821965e-05, + "loss": 1.9528, + "step": 2622 + }, + { + "epoch": 0.2, + "grad_norm": 0.7308068826075466, + "learning_rate": 4.620517374969933e-05, + "loss": 1.9578, + "step": 2623 + }, + { + "epoch": 0.2, + "grad_norm": 0.8427333752346938, + "learning_rate": 4.6201864427252896e-05, + "loss": 2.1922, + "step": 2624 + }, + { + "epoch": 0.2, + "grad_norm": 0.5974980874181669, + "learning_rate": 4.619855378108696e-05, + "loss": 1.9527, + "step": 2625 + }, + { + "epoch": 0.2, + "grad_norm": 0.999716199534339, + "learning_rate": 4.619524181140823e-05, + "loss": 2.0393, + "step": 2626 + }, + { + "epoch": 0.2, + "grad_norm": 0.650342992780213, + "learning_rate": 4.619192851842349e-05, + "loss": 1.973, + "step": 2627 + }, + { + "epoch": 0.2, + "grad_norm": 0.8062088109528851, + "learning_rate": 4.6188613902339587e-05, + "loss": 2.1825, + "step": 2628 + }, + { + "epoch": 0.2, + "grad_norm": 0.8563120718683732, + "learning_rate": 4.6185297963363475e-05, + "loss": 1.9669, + "step": 2629 + }, + { + "epoch": 0.2, + "grad_norm": 0.6138571827653525, + "learning_rate": 4.618198070170217e-05, + "loss": 1.9532, + "step": 2630 + }, + { + "epoch": 0.2, + "grad_norm": 0.7365087840322205, + "learning_rate": 4.61786621175628e-05, + "loss": 1.9943, + "step": 2631 + }, + { + "epoch": 0.2, + "grad_norm": 0.6883989549332556, + "learning_rate": 4.617534221115255e-05, + "loss": 2.1513, + "step": 2632 + }, + { + "epoch": 0.2, + "grad_norm": 0.6900936606222442, + "learning_rate": 4.61720209826787e-05, + "loss": 2.0413, + "step": 2633 + }, + { + "epoch": 0.2, + "grad_norm": 0.6495303036975949, + "learning_rate": 4.616869843234859e-05, + "loss": 1.9844, + "step": 2634 + }, + { + "epoch": 0.2, + "grad_norm": 0.7365622207438768, + "learning_rate": 4.616537456036969e-05, + "loss": 1.9726, + "step": 2635 + }, + { + "epoch": 0.2, + "grad_norm": 0.7641384791071775, + "learning_rate": 4.61620493669495e-05, + "loss": 2.1947, + "step": 2636 + }, + { + "epoch": 0.2, + "grad_norm": 0.6528772197007925, + "learning_rate": 4.6158722852295627e-05, + "loss": 1.9836, + "step": 2637 + }, + { + "epoch": 0.2, + "grad_norm": 0.7533304123828032, + "learning_rate": 4.6155395016615766e-05, + "loss": 1.9642, + "step": 2638 + }, + { + "epoch": 0.2, + "grad_norm": 0.7467104639484458, + "learning_rate": 4.615206586011769e-05, + "loss": 2.0856, + "step": 2639 + }, + { + "epoch": 0.2, + "grad_norm": 0.6372760177405661, + "learning_rate": 4.614873538300925e-05, + "loss": 2.1544, + "step": 2640 + }, + { + "epoch": 0.2, + "grad_norm": 0.6304150273704419, + "learning_rate": 4.614540358549837e-05, + "loss": 1.9661, + "step": 2641 + }, + { + "epoch": 0.2, + "grad_norm": 0.6541947253914094, + "learning_rate": 4.614207046779308e-05, + "loss": 1.9513, + "step": 2642 + }, + { + "epoch": 0.2, + "grad_norm": 0.5775906541814531, + "learning_rate": 4.6138736030101475e-05, + "loss": 1.9696, + "step": 2643 + }, + { + "epoch": 0.2, + "grad_norm": 0.6529792868366469, + "learning_rate": 4.613540027263175e-05, + "loss": 2.1803, + "step": 2644 + }, + { + "epoch": 0.2, + "grad_norm": 0.6877087040045944, + "learning_rate": 4.613206319559214e-05, + "loss": 2.0543, + "step": 2645 + }, + { + "epoch": 0.2, + "grad_norm": 0.6514369290234705, + "learning_rate": 4.612872479919103e-05, + "loss": 1.9421, + "step": 2646 + }, + { + "epoch": 0.2, + "grad_norm": 0.7543707099073174, + "learning_rate": 4.612538508363682e-05, + "loss": 1.9848, + "step": 2647 + }, + { + "epoch": 0.2, + "grad_norm": 0.6986106815333605, + "learning_rate": 4.6122044049138045e-05, + "loss": 2.1594, + "step": 2648 + }, + { + "epoch": 0.2, + "grad_norm": 0.6788654308420587, + "learning_rate": 4.611870169590328e-05, + "loss": 2.0143, + "step": 2649 + }, + { + "epoch": 0.2, + "grad_norm": 0.6031561172947046, + "learning_rate": 4.611535802414121e-05, + "loss": 2.009, + "step": 2650 + }, + { + "epoch": 0.2, + "grad_norm": 0.7350969260289836, + "learning_rate": 4.61120130340606e-05, + "loss": 2.0743, + "step": 2651 + }, + { + "epoch": 0.2, + "grad_norm": 0.7566004503572001, + "learning_rate": 4.6108666725870286e-05, + "loss": 2.2192, + "step": 2652 + }, + { + "epoch": 0.2, + "grad_norm": 0.6676839095512593, + "learning_rate": 4.6105319099779186e-05, + "loss": 2.0207, + "step": 2653 + }, + { + "epoch": 0.2, + "grad_norm": 0.6748837361455958, + "learning_rate": 4.610197015599632e-05, + "loss": 2.003, + "step": 2654 + }, + { + "epoch": 0.2, + "grad_norm": 0.7238881204449563, + "learning_rate": 4.6098619894730766e-05, + "loss": 1.9342, + "step": 2655 + }, + { + "epoch": 0.2, + "grad_norm": 0.6523191040277376, + "learning_rate": 4.6095268316191685e-05, + "loss": 2.1813, + "step": 2656 + }, + { + "epoch": 0.2, + "grad_norm": 0.742330441610128, + "learning_rate": 4.6091915420588355e-05, + "loss": 2.1066, + "step": 2657 + }, + { + "epoch": 0.21, + "grad_norm": 0.699416442856155, + "learning_rate": 4.6088561208130094e-05, + "loss": 1.9849, + "step": 2658 + }, + { + "epoch": 0.21, + "grad_norm": 0.6588847160306627, + "learning_rate": 4.6085205679026325e-05, + "loss": 1.955, + "step": 2659 + }, + { + "epoch": 0.21, + "grad_norm": 0.6638981429281201, + "learning_rate": 4.608184883348654e-05, + "loss": 2.1703, + "step": 2660 + }, + { + "epoch": 0.21, + "grad_norm": 0.6753161118399083, + "learning_rate": 4.6078490671720324e-05, + "loss": 1.9424, + "step": 2661 + }, + { + "epoch": 0.21, + "grad_norm": 0.7448796303868312, + "learning_rate": 4.607513119393735e-05, + "loss": 1.9945, + "step": 2662 + }, + { + "epoch": 0.21, + "grad_norm": 0.6639803543587954, + "learning_rate": 4.607177040034735e-05, + "loss": 1.9961, + "step": 2663 + }, + { + "epoch": 0.21, + "grad_norm": 0.5901072399585455, + "learning_rate": 4.6068408291160165e-05, + "loss": 2.2116, + "step": 2664 + }, + { + "epoch": 0.21, + "grad_norm": 0.7356833094454719, + "learning_rate": 4.6065044866585694e-05, + "loss": 2.0004, + "step": 2665 + }, + { + "epoch": 0.21, + "grad_norm": 0.7021501499989917, + "learning_rate": 4.606168012683394e-05, + "loss": 1.9792, + "step": 2666 + }, + { + "epoch": 0.21, + "grad_norm": 0.6148561520798782, + "learning_rate": 4.6058314072114966e-05, + "loss": 1.9557, + "step": 2667 + }, + { + "epoch": 0.21, + "grad_norm": 0.5675773032041115, + "learning_rate": 4.605494670263894e-05, + "loss": 2.1266, + "step": 2668 + }, + { + "epoch": 0.21, + "grad_norm": 0.6593238118243776, + "learning_rate": 4.6051578018616095e-05, + "loss": 1.9912, + "step": 2669 + }, + { + "epoch": 0.21, + "grad_norm": 0.7428701580648603, + "learning_rate": 4.604820802025675e-05, + "loss": 2.0746, + "step": 2670 + }, + { + "epoch": 0.21, + "grad_norm": 0.6066019308120973, + "learning_rate": 4.6044836707771314e-05, + "loss": 1.926, + "step": 2671 + }, + { + "epoch": 0.21, + "grad_norm": 0.8349110964232412, + "learning_rate": 4.604146408137026e-05, + "loss": 2.1856, + "step": 2672 + }, + { + "epoch": 0.21, + "grad_norm": 0.7760398526903683, + "learning_rate": 4.6038090141264165e-05, + "loss": 1.938, + "step": 2673 + }, + { + "epoch": 0.21, + "grad_norm": 0.583652163062359, + "learning_rate": 4.603471488766368e-05, + "loss": 1.9962, + "step": 2674 + }, + { + "epoch": 0.21, + "grad_norm": 0.6732494966439296, + "learning_rate": 4.6031338320779534e-05, + "loss": 1.9694, + "step": 2675 + }, + { + "epoch": 0.21, + "grad_norm": 0.592423970933599, + "learning_rate": 4.6027960440822533e-05, + "loss": 2.2178, + "step": 2676 + }, + { + "epoch": 0.21, + "grad_norm": 0.8374765612741423, + "learning_rate": 4.602458124800357e-05, + "loss": 1.9702, + "step": 2677 + }, + { + "epoch": 0.21, + "grad_norm": 0.6565341808899915, + "learning_rate": 4.6021200742533645e-05, + "loss": 2.0096, + "step": 2678 + }, + { + "epoch": 0.21, + "grad_norm": 0.6695837195252338, + "learning_rate": 4.601781892462379e-05, + "loss": 1.9901, + "step": 2679 + }, + { + "epoch": 0.21, + "grad_norm": 0.7057589195590709, + "learning_rate": 4.601443579448516e-05, + "loss": 2.1426, + "step": 2680 + }, + { + "epoch": 0.21, + "grad_norm": 0.6282253001598147, + "learning_rate": 4.601105135232897e-05, + "loss": 1.9709, + "step": 2681 + }, + { + "epoch": 0.21, + "grad_norm": 0.7592042474834655, + "learning_rate": 4.600766559836653e-05, + "loss": 2.0136, + "step": 2682 + }, + { + "epoch": 0.21, + "grad_norm": 0.6342157574431733, + "learning_rate": 4.600427853280922e-05, + "loss": 1.9526, + "step": 2683 + }, + { + "epoch": 0.21, + "grad_norm": 0.7069254077056205, + "learning_rate": 4.6000890155868524e-05, + "loss": 2.1806, + "step": 2684 + }, + { + "epoch": 0.21, + "grad_norm": 0.6246569782481525, + "learning_rate": 4.599750046775597e-05, + "loss": 1.9486, + "step": 2685 + }, + { + "epoch": 0.21, + "grad_norm": 0.7178039593665737, + "learning_rate": 4.599410946868321e-05, + "loss": 1.9738, + "step": 2686 + }, + { + "epoch": 0.21, + "grad_norm": 0.6769499236158081, + "learning_rate": 4.599071715886195e-05, + "loss": 1.9682, + "step": 2687 + }, + { + "epoch": 0.21, + "grad_norm": 0.7933150635822529, + "learning_rate": 4.5987323538503984e-05, + "loss": 2.1817, + "step": 2688 + }, + { + "epoch": 0.21, + "grad_norm": 0.7538366145808504, + "learning_rate": 4.598392860782119e-05, + "loss": 2.0383, + "step": 2689 + }, + { + "epoch": 0.21, + "grad_norm": 0.6930574343061014, + "learning_rate": 4.598053236702553e-05, + "loss": 2.0215, + "step": 2690 + }, + { + "epoch": 0.21, + "grad_norm": 0.7111614023256035, + "learning_rate": 4.597713481632905e-05, + "loss": 1.9819, + "step": 2691 + }, + { + "epoch": 0.21, + "grad_norm": 0.6052159770218407, + "learning_rate": 4.597373595594386e-05, + "loss": 2.1928, + "step": 2692 + }, + { + "epoch": 0.21, + "grad_norm": 0.7161440277678321, + "learning_rate": 4.5970335786082176e-05, + "loss": 1.9881, + "step": 2693 + }, + { + "epoch": 0.21, + "grad_norm": 0.5729690222187727, + "learning_rate": 4.596693430695628e-05, + "loss": 1.9705, + "step": 2694 + }, + { + "epoch": 0.21, + "grad_norm": 0.6668970882865378, + "learning_rate": 4.5963531518778535e-05, + "loss": 2.1186, + "step": 2695 + }, + { + "epoch": 0.21, + "grad_norm": 0.6602973681236253, + "learning_rate": 4.59601274217614e-05, + "loss": 2.179, + "step": 2696 + }, + { + "epoch": 0.21, + "grad_norm": 0.6645224252351648, + "learning_rate": 4.5956722016117404e-05, + "loss": 2.0442, + "step": 2697 + }, + { + "epoch": 0.21, + "grad_norm": 0.6187834337762769, + "learning_rate": 4.5953315302059166e-05, + "loss": 1.9885, + "step": 2698 + }, + { + "epoch": 0.21, + "grad_norm": 0.6341351296008236, + "learning_rate": 4.594990727979937e-05, + "loss": 2.0357, + "step": 2699 + }, + { + "epoch": 0.21, + "grad_norm": 0.6583401768576727, + "learning_rate": 4.594649794955079e-05, + "loss": 2.1957, + "step": 2700 + }, + { + "epoch": 0.21, + "grad_norm": 0.698903175071379, + "learning_rate": 4.59430873115263e-05, + "loss": 2.0707, + "step": 2701 + }, + { + "epoch": 0.21, + "grad_norm": 0.6021695324554158, + "learning_rate": 4.593967536593884e-05, + "loss": 1.9307, + "step": 2702 + }, + { + "epoch": 0.21, + "grad_norm": 0.7372200389546351, + "learning_rate": 4.593626211300142e-05, + "loss": 2.0022, + "step": 2703 + }, + { + "epoch": 0.21, + "grad_norm": 0.7038707811265971, + "learning_rate": 4.593284755292715e-05, + "loss": 2.1875, + "step": 2704 + }, + { + "epoch": 0.21, + "grad_norm": 0.6931745239945363, + "learning_rate": 4.5929431685929204e-05, + "loss": 1.9846, + "step": 2705 + }, + { + "epoch": 0.21, + "grad_norm": 0.9749747679426978, + "learning_rate": 4.592601451222086e-05, + "loss": 1.9921, + "step": 2706 + }, + { + "epoch": 0.21, + "grad_norm": 0.6304867022249925, + "learning_rate": 4.592259603201546e-05, + "loss": 2.0321, + "step": 2707 + }, + { + "epoch": 0.21, + "grad_norm": 0.9620294751423234, + "learning_rate": 4.591917624552645e-05, + "loss": 2.149, + "step": 2708 + }, + { + "epoch": 0.21, + "grad_norm": 0.7179772686185427, + "learning_rate": 4.591575515296732e-05, + "loss": 1.9819, + "step": 2709 + }, + { + "epoch": 0.21, + "grad_norm": 0.7208279071564487, + "learning_rate": 4.591233275455168e-05, + "loss": 1.9119, + "step": 2710 + }, + { + "epoch": 0.21, + "grad_norm": 0.7848438247414634, + "learning_rate": 4.590890905049319e-05, + "loss": 1.9823, + "step": 2711 + }, + { + "epoch": 0.21, + "grad_norm": 0.7024802442464062, + "learning_rate": 4.590548404100561e-05, + "loss": 2.201, + "step": 2712 + }, + { + "epoch": 0.21, + "grad_norm": 0.6369270187313578, + "learning_rate": 4.5902057726302784e-05, + "loss": 2.0848, + "step": 2713 + }, + { + "epoch": 0.21, + "grad_norm": 0.6828478863561541, + "learning_rate": 4.5898630106598625e-05, + "loss": 1.993, + "step": 2714 + }, + { + "epoch": 0.21, + "grad_norm": 0.7181238008600065, + "learning_rate": 4.5895201182107134e-05, + "loss": 1.9576, + "step": 2715 + }, + { + "epoch": 0.21, + "grad_norm": 0.6776059441125085, + "learning_rate": 4.58917709530424e-05, + "loss": 2.118, + "step": 2716 + }, + { + "epoch": 0.21, + "grad_norm": 0.7677641259041942, + "learning_rate": 4.588833941961856e-05, + "loss": 2.0412, + "step": 2717 + }, + { + "epoch": 0.21, + "grad_norm": 0.7368767351520903, + "learning_rate": 4.58849065820499e-05, + "loss": 2.0143, + "step": 2718 + }, + { + "epoch": 0.21, + "grad_norm": 0.7477193378112147, + "learning_rate": 4.588147244055072e-05, + "loss": 1.9962, + "step": 2719 + }, + { + "epoch": 0.21, + "grad_norm": 0.6996894669502343, + "learning_rate": 4.587803699533543e-05, + "loss": 2.0909, + "step": 2720 + }, + { + "epoch": 0.21, + "grad_norm": 0.696941883265145, + "learning_rate": 4.587460024661852e-05, + "loss": 2.2067, + "step": 2721 + }, + { + "epoch": 0.21, + "grad_norm": 0.6531248811442996, + "learning_rate": 4.587116219461456e-05, + "loss": 1.9893, + "step": 2722 + }, + { + "epoch": 0.21, + "grad_norm": 0.6570546490300734, + "learning_rate": 4.5867722839538205e-05, + "loss": 1.9928, + "step": 2723 + }, + { + "epoch": 0.21, + "grad_norm": 0.6718208182122217, + "learning_rate": 4.586428218160419e-05, + "loss": 2.1419, + "step": 2724 + }, + { + "epoch": 0.21, + "grad_norm": 0.6721359786965663, + "learning_rate": 4.586084022102732e-05, + "loss": 1.9408, + "step": 2725 + }, + { + "epoch": 0.21, + "grad_norm": 0.6473584802823314, + "learning_rate": 4.58573969580225e-05, + "loss": 2.0356, + "step": 2726 + }, + { + "epoch": 0.21, + "grad_norm": 0.6230859480633122, + "learning_rate": 4.58539523928047e-05, + "loss": 1.985, + "step": 2727 + }, + { + "epoch": 0.21, + "grad_norm": 0.8196654156442967, + "learning_rate": 4.5850506525588986e-05, + "loss": 2.1846, + "step": 2728 + }, + { + "epoch": 0.21, + "grad_norm": 0.6580276847375983, + "learning_rate": 4.5847059356590494e-05, + "loss": 2.0321, + "step": 2729 + }, + { + "epoch": 0.21, + "grad_norm": 0.7960055602568584, + "learning_rate": 4.5843610886024445e-05, + "loss": 2.0207, + "step": 2730 + }, + { + "epoch": 0.21, + "grad_norm": 0.6257598563119594, + "learning_rate": 4.584016111410614e-05, + "loss": 1.9627, + "step": 2731 + }, + { + "epoch": 0.21, + "grad_norm": 0.6786767927403474, + "learning_rate": 4.583671004105096e-05, + "loss": 2.065, + "step": 2732 + }, + { + "epoch": 0.21, + "grad_norm": 0.7355666708735067, + "learning_rate": 4.583325766707437e-05, + "loss": 2.1655, + "step": 2733 + }, + { + "epoch": 0.21, + "grad_norm": 0.8065959056684828, + "learning_rate": 4.582980399239193e-05, + "loss": 1.9871, + "step": 2734 + }, + { + "epoch": 0.21, + "grad_norm": 0.6523713117534081, + "learning_rate": 4.5826349017219244e-05, + "loss": 1.994, + "step": 2735 + }, + { + "epoch": 0.21, + "grad_norm": 0.7643096877117508, + "learning_rate": 4.582289274177204e-05, + "loss": 2.1942, + "step": 2736 + }, + { + "epoch": 0.21, + "grad_norm": 0.6550475104022917, + "learning_rate": 4.581943516626609e-05, + "loss": 2.02, + "step": 2737 + }, + { + "epoch": 0.21, + "grad_norm": 0.6967186393956918, + "learning_rate": 4.5815976290917285e-05, + "loss": 2.0852, + "step": 2738 + }, + { + "epoch": 0.21, + "grad_norm": 0.5945616636292965, + "learning_rate": 4.5812516115941564e-05, + "loss": 2.0147, + "step": 2739 + }, + { + "epoch": 0.21, + "grad_norm": 0.6756966445469972, + "learning_rate": 4.580905464155496e-05, + "loss": 2.1629, + "step": 2740 + }, + { + "epoch": 0.21, + "grad_norm": 0.6396973306899563, + "learning_rate": 4.580559186797358e-05, + "loss": 1.9821, + "step": 2741 + }, + { + "epoch": 0.21, + "grad_norm": 0.6490476811755255, + "learning_rate": 4.580212779541364e-05, + "loss": 2.0194, + "step": 2742 + }, + { + "epoch": 0.21, + "grad_norm": 0.6293092174399644, + "learning_rate": 4.57986624240914e-05, + "loss": 1.9912, + "step": 2743 + }, + { + "epoch": 0.21, + "grad_norm": 0.5853137351391694, + "learning_rate": 4.5795195754223227e-05, + "loss": 2.0212, + "step": 2744 + }, + { + "epoch": 0.21, + "grad_norm": 0.6222177440489787, + "learning_rate": 4.579172778602555e-05, + "loss": 2.1442, + "step": 2745 + }, + { + "epoch": 0.21, + "grad_norm": 0.6778188359882265, + "learning_rate": 4.578825851971489e-05, + "loss": 1.9919, + "step": 2746 + }, + { + "epoch": 0.21, + "grad_norm": 0.587600823091895, + "learning_rate": 4.5784787955507856e-05, + "loss": 1.9981, + "step": 2747 + }, + { + "epoch": 0.21, + "grad_norm": 0.7124436175571408, + "learning_rate": 4.578131609362112e-05, + "loss": 2.1332, + "step": 2748 + }, + { + "epoch": 0.21, + "grad_norm": 0.594628326436112, + "learning_rate": 4.5777842934271455e-05, + "loss": 1.9561, + "step": 2749 + }, + { + "epoch": 0.21, + "grad_norm": 0.6627225645232516, + "learning_rate": 4.577436847767569e-05, + "loss": 1.9689, + "step": 2750 + }, + { + "epoch": 0.21, + "grad_norm": 0.7631883599138035, + "learning_rate": 4.5770892724050754e-05, + "loss": 2.0177, + "step": 2751 + }, + { + "epoch": 0.21, + "grad_norm": 0.6207930598184283, + "learning_rate": 4.5767415673613664e-05, + "loss": 1.9675, + "step": 2752 + }, + { + "epoch": 0.21, + "grad_norm": 0.789268479224515, + "learning_rate": 4.57639373265815e-05, + "loss": 2.2035, + "step": 2753 + }, + { + "epoch": 0.21, + "grad_norm": 0.6564910549515972, + "learning_rate": 4.576045768317143e-05, + "loss": 1.9582, + "step": 2754 + }, + { + "epoch": 0.21, + "grad_norm": 0.6885668117653446, + "learning_rate": 4.5756976743600694e-05, + "loss": 2.0111, + "step": 2755 + }, + { + "epoch": 0.21, + "grad_norm": 0.726912666993224, + "learning_rate": 4.5753494508086634e-05, + "loss": 2.1499, + "step": 2756 + }, + { + "epoch": 0.21, + "grad_norm": 0.6295799172352778, + "learning_rate": 4.575001097684665e-05, + "loss": 2.0606, + "step": 2757 + }, + { + "epoch": 0.21, + "grad_norm": 0.6244854012366476, + "learning_rate": 4.5746526150098234e-05, + "loss": 1.9885, + "step": 2758 + }, + { + "epoch": 0.21, + "grad_norm": 0.7174359452624782, + "learning_rate": 4.574304002805897e-05, + "loss": 1.9835, + "step": 2759 + }, + { + "epoch": 0.21, + "grad_norm": 0.6227259967417186, + "learning_rate": 4.5739552610946495e-05, + "loss": 2.1767, + "step": 2760 + }, + { + "epoch": 0.21, + "grad_norm": 0.5984088042514014, + "learning_rate": 4.5736063898978553e-05, + "loss": 2.002, + "step": 2761 + }, + { + "epoch": 0.21, + "grad_norm": 0.6806538349505173, + "learning_rate": 4.573257389237296e-05, + "loss": 1.9991, + "step": 2762 + }, + { + "epoch": 0.21, + "grad_norm": 0.5726933289209153, + "learning_rate": 4.57290825913476e-05, + "loss": 2.0095, + "step": 2763 + }, + { + "epoch": 0.21, + "grad_norm": 0.7360269692392019, + "learning_rate": 4.572558999612047e-05, + "loss": 1.9736, + "step": 2764 + }, + { + "epoch": 0.21, + "grad_norm": 0.645296711900151, + "learning_rate": 4.5722096106909595e-05, + "loss": 2.1521, + "step": 2765 + }, + { + "epoch": 0.21, + "grad_norm": 0.7201120969061646, + "learning_rate": 4.5718600923933144e-05, + "loss": 1.993, + "step": 2766 + }, + { + "epoch": 0.21, + "grad_norm": 0.6444093676344362, + "learning_rate": 4.571510444740932e-05, + "loss": 2.0214, + "step": 2767 + }, + { + "epoch": 0.21, + "grad_norm": 0.5893068827066283, + "learning_rate": 4.571160667755643e-05, + "loss": 2.1302, + "step": 2768 + }, + { + "epoch": 0.21, + "grad_norm": 0.6783447357780432, + "learning_rate": 4.5708107614592845e-05, + "loss": 2.0205, + "step": 2769 + }, + { + "epoch": 0.21, + "grad_norm": 0.6773233340650222, + "learning_rate": 4.570460725873703e-05, + "loss": 1.992, + "step": 2770 + }, + { + "epoch": 0.21, + "grad_norm": 0.6207222696489187, + "learning_rate": 4.570110561020753e-05, + "loss": 1.9541, + "step": 2771 + }, + { + "epoch": 0.21, + "grad_norm": 0.6739335409830205, + "learning_rate": 4.5697602669222964e-05, + "loss": 1.9661, + "step": 2772 + }, + { + "epoch": 0.21, + "grad_norm": 0.7606699792598559, + "learning_rate": 4.569409843600203e-05, + "loss": 2.1788, + "step": 2773 + }, + { + "epoch": 0.21, + "grad_norm": 0.6711149755631587, + "learning_rate": 4.569059291076352e-05, + "loss": 1.9947, + "step": 2774 + }, + { + "epoch": 0.21, + "grad_norm": 0.6465873488697467, + "learning_rate": 4.56870860937263e-05, + "loss": 2.0585, + "step": 2775 + }, + { + "epoch": 0.21, + "grad_norm": 0.6411975458999427, + "learning_rate": 4.568357798510931e-05, + "loss": 2.0042, + "step": 2776 + }, + { + "epoch": 0.21, + "grad_norm": 0.6888143430209183, + "learning_rate": 4.568006858513157e-05, + "loss": 2.1754, + "step": 2777 + }, + { + "epoch": 0.21, + "grad_norm": 0.8714153148177743, + "learning_rate": 4.56765578940122e-05, + "loss": 1.9843, + "step": 2778 + }, + { + "epoch": 0.21, + "grad_norm": 0.8656697630216207, + "learning_rate": 4.567304591197037e-05, + "loss": 1.9876, + "step": 2779 + }, + { + "epoch": 0.21, + "grad_norm": 0.7065480436484274, + "learning_rate": 4.566953263922537e-05, + "loss": 2.2021, + "step": 2780 + }, + { + "epoch": 0.21, + "grad_norm": 0.9803238992281242, + "learning_rate": 4.5666018075996516e-05, + "loss": 1.9878, + "step": 2781 + }, + { + "epoch": 0.21, + "grad_norm": 0.6178701951645434, + "learning_rate": 4.566250222250327e-05, + "loss": 1.9952, + "step": 2782 + }, + { + "epoch": 0.21, + "grad_norm": 0.8836848567588546, + "learning_rate": 4.5658985078965124e-05, + "loss": 1.9502, + "step": 2783 + }, + { + "epoch": 0.21, + "grad_norm": 0.8274683172290525, + "learning_rate": 4.5655466645601666e-05, + "loss": 2.013, + "step": 2784 + }, + { + "epoch": 0.21, + "grad_norm": 0.8645014730633772, + "learning_rate": 4.565194692263257e-05, + "loss": 2.1835, + "step": 2785 + }, + { + "epoch": 0.21, + "grad_norm": 1.2188305478409456, + "learning_rate": 4.5648425910277594e-05, + "loss": 2.0041, + "step": 2786 + }, + { + "epoch": 0.22, + "grad_norm": 0.6473009802391637, + "learning_rate": 4.5644903608756564e-05, + "loss": 1.999, + "step": 2787 + }, + { + "epoch": 0.22, + "grad_norm": 0.8297091501769635, + "learning_rate": 4.5641380018289384e-05, + "loss": 2.0303, + "step": 2788 + }, + { + "epoch": 0.22, + "grad_norm": 0.8988633567086005, + "learning_rate": 4.563785513909605e-05, + "loss": 2.2072, + "step": 2789 + }, + { + "epoch": 0.22, + "grad_norm": 0.7429093022412904, + "learning_rate": 4.563432897139664e-05, + "loss": 1.9594, + "step": 2790 + }, + { + "epoch": 0.22, + "grad_norm": 0.7786389766083004, + "learning_rate": 4.563080151541131e-05, + "loss": 1.9557, + "step": 2791 + }, + { + "epoch": 0.22, + "grad_norm": 0.6080534146007245, + "learning_rate": 4.5627272771360284e-05, + "loss": 2.1346, + "step": 2792 + }, + { + "epoch": 0.22, + "grad_norm": 0.7987251936834332, + "learning_rate": 4.562374273946388e-05, + "loss": 2.0202, + "step": 2793 + }, + { + "epoch": 0.22, + "grad_norm": 0.7283780239896972, + "learning_rate": 4.562021141994249e-05, + "loss": 2.0411, + "step": 2794 + }, + { + "epoch": 0.22, + "grad_norm": 0.6071314653849001, + "learning_rate": 4.561667881301659e-05, + "loss": 1.9707, + "step": 2795 + }, + { + "epoch": 0.22, + "grad_norm": 0.9252676566816939, + "learning_rate": 4.561314491890674e-05, + "loss": 1.9586, + "step": 2796 + }, + { + "epoch": 0.22, + "grad_norm": 0.6070199284302388, + "learning_rate": 4.560960973783357e-05, + "loss": 2.1933, + "step": 2797 + }, + { + "epoch": 0.22, + "grad_norm": 0.7924570906830115, + "learning_rate": 4.56060732700178e-05, + "loss": 2.0251, + "step": 2798 + }, + { + "epoch": 0.22, + "grad_norm": 0.6245592793203827, + "learning_rate": 4.560253551568022e-05, + "loss": 1.9828, + "step": 2799 + }, + { + "epoch": 0.22, + "grad_norm": 0.6175846472322825, + "learning_rate": 4.5598996475041716e-05, + "loss": 2.0201, + "step": 2800 + }, + { + "epoch": 0.22, + "grad_norm": 0.7242371757675056, + "learning_rate": 4.5595456148323235e-05, + "loss": 2.195, + "step": 2801 + }, + { + "epoch": 0.22, + "grad_norm": 0.6500032961745279, + "learning_rate": 4.559191453574582e-05, + "loss": 1.988, + "step": 2802 + }, + { + "epoch": 0.22, + "grad_norm": 0.6826942890717906, + "learning_rate": 4.558837163753059e-05, + "loss": 2.0091, + "step": 2803 + }, + { + "epoch": 0.22, + "grad_norm": 0.6229778172735722, + "learning_rate": 4.558482745389874e-05, + "loss": 1.9857, + "step": 2804 + }, + { + "epoch": 0.22, + "grad_norm": 0.63313069533344, + "learning_rate": 4.558128198507153e-05, + "loss": 2.134, + "step": 2805 + }, + { + "epoch": 0.22, + "grad_norm": 0.5999104918410882, + "learning_rate": 4.557773523127036e-05, + "loss": 2.0455, + "step": 2806 + }, + { + "epoch": 0.22, + "grad_norm": 0.6294359560155657, + "learning_rate": 4.557418719271663e-05, + "loss": 1.9403, + "step": 2807 + }, + { + "epoch": 0.22, + "grad_norm": 0.5755913347329538, + "learning_rate": 4.5570637869631876e-05, + "loss": 1.9286, + "step": 2808 + }, + { + "epoch": 0.22, + "grad_norm": 0.665712105095883, + "learning_rate": 4.55670872622377e-05, + "loss": 2.1782, + "step": 2809 + }, + { + "epoch": 0.22, + "grad_norm": 0.6305847707072353, + "learning_rate": 4.556353537075576e-05, + "loss": 1.9615, + "step": 2810 + }, + { + "epoch": 0.22, + "grad_norm": 0.6941507505291515, + "learning_rate": 4.555998219540783e-05, + "loss": 1.9509, + "step": 2811 + }, + { + "epoch": 0.22, + "grad_norm": 0.7167520786121081, + "learning_rate": 4.555642773641576e-05, + "loss": 2.2022, + "step": 2812 + }, + { + "epoch": 0.22, + "grad_norm": 0.8126009864774975, + "learning_rate": 4.5552871994001454e-05, + "loss": 2.0776, + "step": 2813 + }, + { + "epoch": 0.22, + "grad_norm": 0.6836392065689031, + "learning_rate": 4.554931496838692e-05, + "loss": 1.9893, + "step": 2814 + }, + { + "epoch": 0.22, + "grad_norm": 0.7756634706788672, + "learning_rate": 4.5545756659794234e-05, + "loss": 2.0069, + "step": 2815 + }, + { + "epoch": 0.22, + "grad_norm": 0.60678672741813, + "learning_rate": 4.554219706844555e-05, + "loss": 2.016, + "step": 2816 + }, + { + "epoch": 0.22, + "grad_norm": 0.7056114940862528, + "learning_rate": 4.5538636194563126e-05, + "loss": 2.1671, + "step": 2817 + }, + { + "epoch": 0.22, + "grad_norm": 0.6152703262317227, + "learning_rate": 4.553507403836925e-05, + "loss": 2.0159, + "step": 2818 + }, + { + "epoch": 0.22, + "grad_norm": 0.6795958586602747, + "learning_rate": 4.5531510600086356e-05, + "loss": 1.9866, + "step": 2819 + }, + { + "epoch": 0.22, + "grad_norm": 0.6154295809613098, + "learning_rate": 4.552794587993691e-05, + "loss": 1.9898, + "step": 2820 + }, + { + "epoch": 0.22, + "grad_norm": 0.7639723736967522, + "learning_rate": 4.552437987814346e-05, + "loss": 2.202, + "step": 2821 + }, + { + "epoch": 0.22, + "grad_norm": 0.6465414274009734, + "learning_rate": 4.552081259492867e-05, + "loss": 2.0013, + "step": 2822 + }, + { + "epoch": 0.22, + "grad_norm": 0.6405697302521749, + "learning_rate": 4.5517244030515235e-05, + "loss": 1.9681, + "step": 2823 + }, + { + "epoch": 0.22, + "grad_norm": 0.7215042282520571, + "learning_rate": 4.5513674185125976e-05, + "loss": 2.1878, + "step": 2824 + }, + { + "epoch": 0.22, + "grad_norm": 0.6246500836520427, + "learning_rate": 4.551010305898377e-05, + "loss": 2.0427, + "step": 2825 + }, + { + "epoch": 0.22, + "grad_norm": 0.7177701784596208, + "learning_rate": 4.550653065231156e-05, + "loss": 1.9767, + "step": 2826 + }, + { + "epoch": 0.22, + "grad_norm": 0.6100182645441249, + "learning_rate": 4.550295696533241e-05, + "loss": 1.9884, + "step": 2827 + }, + { + "epoch": 0.22, + "grad_norm": 0.6253906657069692, + "learning_rate": 4.549938199826943e-05, + "loss": 1.9576, + "step": 2828 + }, + { + "epoch": 0.22, + "grad_norm": 0.6574391423232303, + "learning_rate": 4.549580575134581e-05, + "loss": 2.1227, + "step": 2829 + }, + { + "epoch": 0.22, + "grad_norm": 0.6410611009248435, + "learning_rate": 4.549222822478484e-05, + "loss": 1.9439, + "step": 2830 + }, + { + "epoch": 0.22, + "grad_norm": 0.6364492611243694, + "learning_rate": 4.548864941880988e-05, + "loss": 2.0551, + "step": 2831 + }, + { + "epoch": 0.22, + "grad_norm": 0.6567136142944537, + "learning_rate": 4.5485069333644365e-05, + "loss": 2.0015, + "step": 2832 + }, + { + "epoch": 0.22, + "grad_norm": 0.7503929174713984, + "learning_rate": 4.548148796951182e-05, + "loss": 2.2002, + "step": 2833 + }, + { + "epoch": 0.22, + "grad_norm": 0.5530971981044319, + "learning_rate": 4.5477905326635834e-05, + "loss": 1.9807, + "step": 2834 + }, + { + "epoch": 0.22, + "grad_norm": 0.6798052421349903, + "learning_rate": 4.54743214052401e-05, + "loss": 1.994, + "step": 2835 + }, + { + "epoch": 0.22, + "grad_norm": 0.6800876655630853, + "learning_rate": 4.547073620554837e-05, + "loss": 1.9732, + "step": 2836 + }, + { + "epoch": 0.22, + "grad_norm": 0.5855190065165854, + "learning_rate": 4.5467149727784475e-05, + "loss": 2.2371, + "step": 2837 + }, + { + "epoch": 0.22, + "grad_norm": 0.7001853668201681, + "learning_rate": 4.546356197217235e-05, + "loss": 2.005, + "step": 2838 + }, + { + "epoch": 0.22, + "grad_norm": 0.6267456078592311, + "learning_rate": 4.545997293893598e-05, + "loss": 1.9511, + "step": 2839 + }, + { + "epoch": 0.22, + "grad_norm": 0.6182001894960636, + "learning_rate": 4.545638262829945e-05, + "loss": 1.9601, + "step": 2840 + }, + { + "epoch": 0.22, + "grad_norm": 0.6826371254965823, + "learning_rate": 4.5452791040486916e-05, + "loss": 2.1518, + "step": 2841 + }, + { + "epoch": 0.22, + "grad_norm": 0.7307901848650923, + "learning_rate": 4.5449198175722616e-05, + "loss": 1.9406, + "step": 2842 + }, + { + "epoch": 0.22, + "grad_norm": 0.5820433448026973, + "learning_rate": 4.544560403423086e-05, + "loss": 2.0016, + "step": 2843 + }, + { + "epoch": 0.22, + "grad_norm": 0.6314021283020453, + "learning_rate": 4.544200861623606e-05, + "loss": 2.0596, + "step": 2844 + }, + { + "epoch": 0.22, + "grad_norm": 0.6428405994029349, + "learning_rate": 4.543841192196267e-05, + "loss": 2.1636, + "step": 2845 + }, + { + "epoch": 0.22, + "grad_norm": 0.6435160655798973, + "learning_rate": 4.543481395163528e-05, + "loss": 2.0009, + "step": 2846 + }, + { + "epoch": 0.22, + "grad_norm": 0.6594843761958827, + "learning_rate": 4.54312147054785e-05, + "loss": 1.9885, + "step": 2847 + }, + { + "epoch": 0.22, + "grad_norm": 0.5994132154448734, + "learning_rate": 4.542761418371705e-05, + "loss": 1.9679, + "step": 2848 + }, + { + "epoch": 0.22, + "grad_norm": 0.80087867596022, + "learning_rate": 4.5424012386575724e-05, + "loss": 2.1821, + "step": 2849 + }, + { + "epoch": 0.22, + "grad_norm": 0.6222519603267266, + "learning_rate": 4.542040931427941e-05, + "loss": 2.0123, + "step": 2850 + }, + { + "epoch": 0.22, + "grad_norm": 0.8102587140396517, + "learning_rate": 4.541680496705304e-05, + "loss": 2.0074, + "step": 2851 + }, + { + "epoch": 0.22, + "grad_norm": 0.642088701245558, + "learning_rate": 4.541319934512167e-05, + "loss": 1.956, + "step": 2852 + }, + { + "epoch": 0.22, + "grad_norm": 0.8197684316205066, + "learning_rate": 4.5409592448710416e-05, + "loss": 2.1598, + "step": 2853 + }, + { + "epoch": 0.22, + "grad_norm": 0.6182804461324081, + "learning_rate": 4.540598427804444e-05, + "loss": 1.9841, + "step": 2854 + }, + { + "epoch": 0.22, + "grad_norm": 0.7758159466236181, + "learning_rate": 4.540237483334905e-05, + "loss": 1.978, + "step": 2855 + }, + { + "epoch": 0.22, + "grad_norm": 0.7579463687232904, + "learning_rate": 4.539876411484958e-05, + "loss": 2.0344, + "step": 2856 + }, + { + "epoch": 0.22, + "grad_norm": 0.8260274656854518, + "learning_rate": 4.539515212277147e-05, + "loss": 2.1892, + "step": 2857 + }, + { + "epoch": 0.22, + "grad_norm": 0.5942964706685796, + "learning_rate": 4.539153885734022e-05, + "loss": 1.9459, + "step": 2858 + }, + { + "epoch": 0.22, + "grad_norm": 0.7282235040093462, + "learning_rate": 4.5387924318781424e-05, + "loss": 1.9631, + "step": 2859 + }, + { + "epoch": 0.22, + "grad_norm": 0.6374913565744571, + "learning_rate": 4.538430850732077e-05, + "loss": 1.9775, + "step": 2860 + }, + { + "epoch": 0.22, + "grad_norm": 0.7534812306001003, + "learning_rate": 4.538069142318398e-05, + "loss": 2.1807, + "step": 2861 + }, + { + "epoch": 0.22, + "grad_norm": 0.590841649452214, + "learning_rate": 4.537707306659692e-05, + "loss": 2.0586, + "step": 2862 + }, + { + "epoch": 0.22, + "grad_norm": 0.8078345745360919, + "learning_rate": 4.537345343778546e-05, + "loss": 1.9146, + "step": 2863 + }, + { + "epoch": 0.22, + "grad_norm": 0.6232718261513431, + "learning_rate": 4.536983253697561e-05, + "loss": 2.0345, + "step": 2864 + }, + { + "epoch": 0.22, + "grad_norm": 0.7956778886829186, + "learning_rate": 4.536621036439344e-05, + "loss": 2.1656, + "step": 2865 + }, + { + "epoch": 0.22, + "grad_norm": 0.6220027138141474, + "learning_rate": 4.5362586920265085e-05, + "loss": 1.9612, + "step": 2866 + }, + { + "epoch": 0.22, + "grad_norm": 0.6011225600815533, + "learning_rate": 4.5358962204816775e-05, + "loss": 2.0283, + "step": 2867 + }, + { + "epoch": 0.22, + "grad_norm": 0.6305369247695293, + "learning_rate": 4.535533621827482e-05, + "loss": 2.0083, + "step": 2868 + }, + { + "epoch": 0.22, + "grad_norm": 3.7610780113415268, + "learning_rate": 4.53517089608656e-05, + "loss": 2.2337, + "step": 2869 + }, + { + "epoch": 0.22, + "grad_norm": 0.6052291106605415, + "learning_rate": 4.5348080432815587e-05, + "loss": 1.9581, + "step": 2870 + }, + { + "epoch": 0.22, + "grad_norm": 20.166794132613884, + "learning_rate": 4.534445063435132e-05, + "loss": 1.9625, + "step": 2871 + }, + { + "epoch": 0.22, + "grad_norm": 22.43504401389042, + "learning_rate": 4.534081956569942e-05, + "loss": 2.7738, + "step": 2872 + }, + { + "epoch": 0.22, + "grad_norm": 41.96701761382132, + "learning_rate": 4.53371872270866e-05, + "loss": 4.0904, + "step": 2873 + }, + { + "epoch": 0.22, + "grad_norm": 43.03228087610891, + "learning_rate": 4.533355361873962e-05, + "loss": 3.6379, + "step": 2874 + }, + { + "epoch": 0.22, + "grad_norm": 29.851978052510876, + "learning_rate": 4.532991874088537e-05, + "loss": 3.0166, + "step": 2875 + }, + { + "epoch": 0.22, + "grad_norm": 30.87212281375283, + "learning_rate": 4.532628259375077e-05, + "loss": 3.7492, + "step": 2876 + }, + { + "epoch": 0.22, + "grad_norm": 95.95200393937546, + "learning_rate": 4.532264517756284e-05, + "loss": 6.0122, + "step": 2877 + }, + { + "epoch": 0.22, + "grad_norm": 12.859550550670065, + "learning_rate": 4.531900649254869e-05, + "loss": 3.0449, + "step": 2878 + }, + { + "epoch": 0.22, + "grad_norm": 13.925404649856581, + "learning_rate": 4.531536653893549e-05, + "loss": 2.9083, + "step": 2879 + }, + { + "epoch": 0.22, + "grad_norm": 22.55321120338719, + "learning_rate": 4.53117253169505e-05, + "loss": 2.8096, + "step": 2880 + }, + { + "epoch": 0.22, + "grad_norm": 18.911858792571532, + "learning_rate": 4.530808282682106e-05, + "loss": 2.761, + "step": 2881 + }, + { + "epoch": 0.22, + "grad_norm": 5.573116485632248, + "learning_rate": 4.5304439068774575e-05, + "loss": 2.4804, + "step": 2882 + }, + { + "epoch": 0.22, + "grad_norm": 4.758761567269109, + "learning_rate": 4.5300794043038546e-05, + "loss": 2.3944, + "step": 2883 + }, + { + "epoch": 0.22, + "grad_norm": 2.933964688501277, + "learning_rate": 4.529714774984055e-05, + "loss": 2.2645, + "step": 2884 + }, + { + "epoch": 0.22, + "grad_norm": 2.7089508265074214, + "learning_rate": 4.529350018940824e-05, + "loss": 2.5618, + "step": 2885 + }, + { + "epoch": 0.22, + "grad_norm": 2.1554988423120256, + "learning_rate": 4.528985136196934e-05, + "loss": 2.2297, + "step": 2886 + }, + { + "epoch": 0.22, + "grad_norm": 3.155356489064996, + "learning_rate": 4.528620126775167e-05, + "loss": 2.3073, + "step": 2887 + }, + { + "epoch": 0.22, + "grad_norm": 6.342922549592311, + "learning_rate": 4.528254990698311e-05, + "loss": 2.2671, + "step": 2888 + }, + { + "epoch": 0.22, + "grad_norm": 4.250761253886918, + "learning_rate": 4.5278897279891646e-05, + "loss": 2.4565, + "step": 2889 + }, + { + "epoch": 0.22, + "grad_norm": 3.230708315515155, + "learning_rate": 4.527524338670531e-05, + "loss": 2.2688, + "step": 2890 + }, + { + "epoch": 0.22, + "grad_norm": 1.9813439467396243, + "learning_rate": 4.5271588227652243e-05, + "loss": 2.2126, + "step": 2891 + }, + { + "epoch": 0.22, + "grad_norm": 1.395299301826921, + "learning_rate": 4.5267931802960636e-05, + "loss": 2.1905, + "step": 2892 + }, + { + "epoch": 0.22, + "grad_norm": 1.4766864393796755, + "learning_rate": 4.5264274112858794e-05, + "loss": 2.3974, + "step": 2893 + }, + { + "epoch": 0.22, + "grad_norm": 1.9217125126653207, + "learning_rate": 4.526061515757507e-05, + "loss": 2.1368, + "step": 2894 + }, + { + "epoch": 0.22, + "grad_norm": 1.0962335322072025, + "learning_rate": 4.5256954937337906e-05, + "loss": 2.1603, + "step": 2895 + }, + { + "epoch": 0.22, + "grad_norm": 0.9636543407468255, + "learning_rate": 4.5253293452375825e-05, + "loss": 2.1114, + "step": 2896 + }, + { + "epoch": 0.22, + "grad_norm": 0.800738884958415, + "learning_rate": 4.524963070291744e-05, + "loss": 2.3236, + "step": 2897 + }, + { + "epoch": 0.22, + "grad_norm": 1.4290960109962267, + "learning_rate": 4.524596668919141e-05, + "loss": 2.1552, + "step": 2898 + }, + { + "epoch": 0.22, + "grad_norm": 1.1070494786359306, + "learning_rate": 4.5242301411426516e-05, + "loss": 2.0799, + "step": 2899 + }, + { + "epoch": 0.22, + "grad_norm": 0.6719305712665202, + "learning_rate": 4.5238634869851585e-05, + "loss": 2.1752, + "step": 2900 + }, + { + "epoch": 0.22, + "grad_norm": 0.7126071967751997, + "learning_rate": 4.523496706469554e-05, + "loss": 2.3153, + "step": 2901 + }, + { + "epoch": 0.22, + "grad_norm": 0.8126390229111553, + "learning_rate": 4.523129799618737e-05, + "loss": 2.1053, + "step": 2902 + }, + { + "epoch": 0.22, + "grad_norm": 0.6622483201105618, + "learning_rate": 4.522762766455615e-05, + "loss": 2.0884, + "step": 2903 + }, + { + "epoch": 0.22, + "grad_norm": 1.1386163835490424, + "learning_rate": 4.522395607003105e-05, + "loss": 2.0542, + "step": 2904 + }, + { + "epoch": 0.22, + "grad_norm": 0.7274368793363496, + "learning_rate": 4.522028321284128e-05, + "loss": 2.2581, + "step": 2905 + }, + { + "epoch": 0.22, + "grad_norm": 3.3954679378161785, + "learning_rate": 4.521660909321617e-05, + "loss": 2.1787, + "step": 2906 + }, + { + "epoch": 0.22, + "grad_norm": 0.6702922530941131, + "learning_rate": 4.521293371138509e-05, + "loss": 2.0398, + "step": 2907 + }, + { + "epoch": 0.22, + "grad_norm": 0.6784667388469936, + "learning_rate": 4.520925706757753e-05, + "loss": 2.0598, + "step": 2908 + }, + { + "epoch": 0.22, + "grad_norm": 0.6977581545666485, + "learning_rate": 4.5205579162023026e-05, + "loss": 2.2989, + "step": 2909 + }, + { + "epoch": 0.22, + "grad_norm": 2.2077387236904, + "learning_rate": 4.5201899994951214e-05, + "loss": 2.085, + "step": 2910 + }, + { + "epoch": 0.22, + "grad_norm": 0.7423388495954832, + "learning_rate": 4.519821956659179e-05, + "loss": 2.1073, + "step": 2911 + }, + { + "epoch": 0.22, + "grad_norm": 0.5923475715394003, + "learning_rate": 4.519453787717455e-05, + "loss": 2.1546, + "step": 2912 + }, + { + "epoch": 0.22, + "grad_norm": 0.5899555901537639, + "learning_rate": 4.519085492692933e-05, + "loss": 2.2955, + "step": 2913 + }, + { + "epoch": 0.22, + "grad_norm": 0.5907632278075337, + "learning_rate": 4.518717071608611e-05, + "loss": 1.99, + "step": 2914 + }, + { + "epoch": 0.22, + "grad_norm": 0.5796207448440479, + "learning_rate": 4.518348524487488e-05, + "loss": 2.0275, + "step": 2915 + }, + { + "epoch": 0.22, + "grad_norm": 0.653544279356977, + "learning_rate": 4.517979851352575e-05, + "loss": 2.0724, + "step": 2916 + }, + { + "epoch": 0.23, + "grad_norm": 0.9798715934058801, + "learning_rate": 4.517611052226891e-05, + "loss": 2.2267, + "step": 2917 + }, + { + "epoch": 0.23, + "grad_norm": 0.5531928906800047, + "learning_rate": 4.5172421271334596e-05, + "loss": 2.078, + "step": 2918 + }, + { + "epoch": 0.23, + "grad_norm": 0.6070879817857471, + "learning_rate": 4.516873076095315e-05, + "loss": 2.0652, + "step": 2919 + }, + { + "epoch": 0.23, + "grad_norm": 0.8722210199743303, + "learning_rate": 4.516503899135499e-05, + "loss": 2.0582, + "step": 2920 + }, + { + "epoch": 0.23, + "grad_norm": 0.6066966263477893, + "learning_rate": 4.516134596277061e-05, + "loss": 2.215, + "step": 2921 + }, + { + "epoch": 0.23, + "grad_norm": 0.5451661407756073, + "learning_rate": 4.5157651675430566e-05, + "loss": 2.0542, + "step": 2922 + }, + { + "epoch": 0.23, + "grad_norm": 0.561401151559369, + "learning_rate": 4.515395612956552e-05, + "loss": 2.0164, + "step": 2923 + }, + { + "epoch": 0.23, + "grad_norm": 1.095686439886581, + "learning_rate": 4.515025932540621e-05, + "loss": 2.0789, + "step": 2924 + }, + { + "epoch": 0.23, + "grad_norm": 0.5867301536369358, + "learning_rate": 4.5146561263183415e-05, + "loss": 2.2568, + "step": 2925 + }, + { + "epoch": 0.23, + "grad_norm": 0.5198274109784404, + "learning_rate": 4.5142861943128054e-05, + "loss": 2.0863, + "step": 2926 + }, + { + "epoch": 0.23, + "grad_norm": 0.6159994081806842, + "learning_rate": 4.5139161365471064e-05, + "loss": 2.0759, + "step": 2927 + }, + { + "epoch": 0.23, + "grad_norm": 1.5133054358121572, + "learning_rate": 4.513545953044349e-05, + "loss": 2.046, + "step": 2928 + }, + { + "epoch": 0.23, + "grad_norm": 0.5555267937876979, + "learning_rate": 4.513175643827647e-05, + "loss": 2.2429, + "step": 2929 + }, + { + "epoch": 0.23, + "grad_norm": 0.5531950471914892, + "learning_rate": 4.512805208920118e-05, + "loss": 2.0411, + "step": 2930 + }, + { + "epoch": 0.23, + "grad_norm": 0.5415491042252507, + "learning_rate": 4.5124346483448923e-05, + "loss": 2.0981, + "step": 2931 + }, + { + "epoch": 0.23, + "grad_norm": 0.5330258970161332, + "learning_rate": 4.512063962125104e-05, + "loss": 2.0328, + "step": 2932 + }, + { + "epoch": 0.23, + "grad_norm": 0.6063221214067308, + "learning_rate": 4.5116931502838954e-05, + "loss": 2.1844, + "step": 2933 + }, + { + "epoch": 0.23, + "grad_norm": 0.5297301841530939, + "learning_rate": 4.511322212844421e-05, + "loss": 2.0133, + "step": 2934 + }, + { + "epoch": 0.23, + "grad_norm": 0.6498937401538403, + "learning_rate": 4.5109511498298365e-05, + "loss": 2.0899, + "step": 2935 + }, + { + "epoch": 0.23, + "grad_norm": 0.5772001406626477, + "learning_rate": 4.5105799612633115e-05, + "loss": 2.0273, + "step": 2936 + }, + { + "epoch": 0.23, + "grad_norm": 0.6162029377259111, + "learning_rate": 4.51020864716802e-05, + "loss": 2.0866, + "step": 2937 + }, + { + "epoch": 0.23, + "grad_norm": 1.5330025861520211, + "learning_rate": 4.509837207567144e-05, + "loss": 2.2516, + "step": 2938 + }, + { + "epoch": 0.23, + "grad_norm": 0.5806761849143963, + "learning_rate": 4.5094656424838746e-05, + "loss": 2.0366, + "step": 2939 + }, + { + "epoch": 0.23, + "grad_norm": 0.5327825711569419, + "learning_rate": 4.509093951941411e-05, + "loss": 2.0163, + "step": 2940 + }, + { + "epoch": 0.23, + "grad_norm": 0.5273173302245768, + "learning_rate": 4.508722135962957e-05, + "loss": 2.2318, + "step": 2941 + }, + { + "epoch": 0.23, + "grad_norm": 0.6483279268246621, + "learning_rate": 4.5083501945717285e-05, + "loss": 2.0135, + "step": 2942 + }, + { + "epoch": 0.23, + "grad_norm": 0.523400570331478, + "learning_rate": 4.507978127790947e-05, + "loss": 2.0888, + "step": 2943 + }, + { + "epoch": 0.23, + "grad_norm": 0.6060362394075197, + "learning_rate": 4.507605935643842e-05, + "loss": 2.0282, + "step": 2944 + }, + { + "epoch": 0.23, + "grad_norm": 0.6482317093725517, + "learning_rate": 4.507233618153651e-05, + "loss": 2.2234, + "step": 2945 + }, + { + "epoch": 0.23, + "grad_norm": 0.5557428825438474, + "learning_rate": 4.5068611753436194e-05, + "loss": 2.0408, + "step": 2946 + }, + { + "epoch": 0.23, + "grad_norm": 0.6543333379952416, + "learning_rate": 4.506488607236999e-05, + "loss": 1.9892, + "step": 2947 + }, + { + "epoch": 0.23, + "grad_norm": 0.5513633603021613, + "learning_rate": 4.506115913857054e-05, + "loss": 1.9946, + "step": 2948 + }, + { + "epoch": 0.23, + "grad_norm": 0.570053182848793, + "learning_rate": 4.50574309522705e-05, + "loss": 2.0692, + "step": 2949 + }, + { + "epoch": 0.23, + "grad_norm": 0.5295210194236916, + "learning_rate": 4.505370151370265e-05, + "loss": 2.244, + "step": 2950 + }, + { + "epoch": 0.23, + "grad_norm": 0.5539428778005544, + "learning_rate": 4.504997082309983e-05, + "loss": 2.0081, + "step": 2951 + }, + { + "epoch": 0.23, + "grad_norm": 0.5551310281167554, + "learning_rate": 4.504623888069497e-05, + "loss": 2.0688, + "step": 2952 + }, + { + "epoch": 0.23, + "grad_norm": 0.5666713688531781, + "learning_rate": 4.504250568672106e-05, + "loss": 2.2231, + "step": 2953 + }, + { + "epoch": 0.23, + "grad_norm": 0.5435564900447342, + "learning_rate": 4.503877124141118e-05, + "loss": 2.0005, + "step": 2954 + }, + { + "epoch": 0.23, + "grad_norm": 0.9176725209323147, + "learning_rate": 4.5035035544998495e-05, + "loss": 2.0763, + "step": 2955 + }, + { + "epoch": 0.23, + "grad_norm": 0.5211079552103035, + "learning_rate": 4.503129859771623e-05, + "loss": 2.0328, + "step": 2956 + }, + { + "epoch": 0.23, + "grad_norm": 0.7508112094451146, + "learning_rate": 4.502756039979771e-05, + "loss": 2.2406, + "step": 2957 + }, + { + "epoch": 0.23, + "grad_norm": 0.5498865043891005, + "learning_rate": 4.502382095147631e-05, + "loss": 2.0041, + "step": 2958 + }, + { + "epoch": 0.23, + "grad_norm": 0.5929272534292733, + "learning_rate": 4.502008025298552e-05, + "loss": 2.0225, + "step": 2959 + }, + { + "epoch": 0.23, + "grad_norm": 0.537894786026507, + "learning_rate": 4.501633830455887e-05, + "loss": 2.0411, + "step": 2960 + }, + { + "epoch": 0.23, + "grad_norm": 0.5770085207610334, + "learning_rate": 4.501259510642999e-05, + "loss": 2.2783, + "step": 2961 + }, + { + "epoch": 0.23, + "grad_norm": 0.4964370893046266, + "learning_rate": 4.500885065883258e-05, + "loss": 2.1108, + "step": 2962 + }, + { + "epoch": 0.23, + "grad_norm": 0.5565558101560913, + "learning_rate": 4.5005104962000436e-05, + "loss": 1.9829, + "step": 2963 + }, + { + "epoch": 0.23, + "grad_norm": 0.5350522070567845, + "learning_rate": 4.50013580161674e-05, + "loss": 2.0104, + "step": 2964 + }, + { + "epoch": 0.23, + "grad_norm": 0.5002452148888681, + "learning_rate": 4.4997609821567405e-05, + "loss": 2.1977, + "step": 2965 + }, + { + "epoch": 0.23, + "grad_norm": 0.5190043063126969, + "learning_rate": 4.499386037843448e-05, + "loss": 2.036, + "step": 2966 + }, + { + "epoch": 0.23, + "grad_norm": 0.5389545859104026, + "learning_rate": 4.499010968700272e-05, + "loss": 2.0363, + "step": 2967 + }, + { + "epoch": 0.23, + "grad_norm": 0.48623826020737193, + "learning_rate": 4.4986357747506294e-05, + "loss": 2.0875, + "step": 2968 + }, + { + "epoch": 0.23, + "grad_norm": 0.8390979814475955, + "learning_rate": 4.498260456017944e-05, + "loss": 1.9898, + "step": 2969 + }, + { + "epoch": 0.23, + "grad_norm": 0.5146167482196662, + "learning_rate": 4.497885012525651e-05, + "loss": 2.1814, + "step": 2970 + }, + { + "epoch": 0.23, + "grad_norm": 0.5450492026554314, + "learning_rate": 4.497509444297188e-05, + "loss": 2.0319, + "step": 2971 + }, + { + "epoch": 0.23, + "grad_norm": 0.5573433855941269, + "learning_rate": 4.4971337513560035e-05, + "loss": 2.005, + "step": 2972 + }, + { + "epoch": 0.23, + "grad_norm": 0.5267879523033264, + "learning_rate": 4.496757933725555e-05, + "loss": 2.2428, + "step": 2973 + }, + { + "epoch": 0.23, + "grad_norm": 0.5070948517884898, + "learning_rate": 4.4963819914293065e-05, + "loss": 2.0826, + "step": 2974 + }, + { + "epoch": 0.23, + "grad_norm": 0.5446417774020413, + "learning_rate": 4.496005924490728e-05, + "loss": 2.037, + "step": 2975 + }, + { + "epoch": 0.23, + "grad_norm": 0.5307354551005237, + "learning_rate": 4.495629732933301e-05, + "loss": 1.9974, + "step": 2976 + }, + { + "epoch": 0.23, + "grad_norm": 0.6865498864630941, + "learning_rate": 4.495253416780511e-05, + "loss": 2.1938, + "step": 2977 + }, + { + "epoch": 0.23, + "grad_norm": 0.542953294256001, + "learning_rate": 4.494876976055853e-05, + "loss": 2.0188, + "step": 2978 + }, + { + "epoch": 0.23, + "grad_norm": 0.5551509845664487, + "learning_rate": 4.4945004107828306e-05, + "loss": 1.9786, + "step": 2979 + }, + { + "epoch": 0.23, + "grad_norm": 0.5064097033216102, + "learning_rate": 4.4941237209849543e-05, + "loss": 2.0564, + "step": 2980 + }, + { + "epoch": 0.23, + "grad_norm": 0.5435959713107112, + "learning_rate": 4.4937469066857405e-05, + "loss": 2.0109, + "step": 2981 + }, + { + "epoch": 0.23, + "grad_norm": 0.5859318797318583, + "learning_rate": 4.493369967908719e-05, + "loss": 2.1615, + "step": 2982 + }, + { + "epoch": 0.23, + "grad_norm": 0.6323865969125004, + "learning_rate": 4.49299290467742e-05, + "loss": 2.0481, + "step": 2983 + }, + { + "epoch": 0.23, + "grad_norm": 0.551989575536368, + "learning_rate": 4.492615717015387e-05, + "loss": 1.9887, + "step": 2984 + }, + { + "epoch": 0.23, + "grad_norm": 0.525058140906379, + "learning_rate": 4.492238404946169e-05, + "loss": 2.2161, + "step": 2985 + }, + { + "epoch": 0.23, + "grad_norm": 1.2797184377603426, + "learning_rate": 4.491860968493323e-05, + "loss": 2.0796, + "step": 2986 + }, + { + "epoch": 0.23, + "grad_norm": 0.6315811335961712, + "learning_rate": 4.4914834076804135e-05, + "loss": 2.0358, + "step": 2987 + }, + { + "epoch": 0.23, + "grad_norm": 0.5695245497296412, + "learning_rate": 4.491105722531014e-05, + "loss": 2.022, + "step": 2988 + }, + { + "epoch": 0.23, + "grad_norm": 0.5533580537009234, + "learning_rate": 4.4907279130687045e-05, + "loss": 1.9987, + "step": 2989 + }, + { + "epoch": 0.23, + "grad_norm": 0.6465884235430764, + "learning_rate": 4.490349979317073e-05, + "loss": 2.1776, + "step": 2990 + }, + { + "epoch": 0.23, + "grad_norm": 0.5298594952994982, + "learning_rate": 4.4899719212997167e-05, + "loss": 2.0113, + "step": 2991 + }, + { + "epoch": 0.23, + "grad_norm": 0.6758067072269289, + "learning_rate": 4.489593739040238e-05, + "loss": 2.0028, + "step": 2992 + }, + { + "epoch": 0.23, + "grad_norm": 0.541197338964297, + "learning_rate": 4.489215432562248e-05, + "loss": 2.1124, + "step": 2993 + }, + { + "epoch": 0.23, + "grad_norm": 0.5292519909755773, + "learning_rate": 4.4888370018893666e-05, + "loss": 2.2642, + "step": 2994 + }, + { + "epoch": 0.23, + "grad_norm": 0.5542301378424841, + "learning_rate": 4.488458447045222e-05, + "loss": 2.0293, + "step": 2995 + }, + { + "epoch": 0.23, + "grad_norm": 0.59741399901344, + "learning_rate": 4.488079768053447e-05, + "loss": 2.0269, + "step": 2996 + }, + { + "epoch": 0.23, + "grad_norm": 0.549636190479805, + "learning_rate": 4.487700964937684e-05, + "loss": 2.1843, + "step": 2997 + }, + { + "epoch": 0.23, + "grad_norm": 0.5422848003070199, + "learning_rate": 4.487322037721586e-05, + "loss": 2.0089, + "step": 2998 + }, + { + "epoch": 0.23, + "grad_norm": 0.5463208575157817, + "learning_rate": 4.486942986428808e-05, + "loss": 2.0904, + "step": 2999 + }, + { + "epoch": 0.23, + "grad_norm": 0.6459195025201658, + "learning_rate": 4.4865638110830165e-05, + "loss": 2.0105, + "step": 3000 + }, + { + "epoch": 0.23, + "grad_norm": 0.5776394486813085, + "learning_rate": 4.486184511707886e-05, + "loss": 2.004, + "step": 3001 + }, + { + "epoch": 0.23, + "grad_norm": 1.4757109906581949, + "learning_rate": 4.485805088327096e-05, + "loss": 2.2294, + "step": 3002 + }, + { + "epoch": 0.23, + "grad_norm": 0.6572543101453379, + "learning_rate": 4.4854255409643374e-05, + "loss": 1.9792, + "step": 3003 + }, + { + "epoch": 0.23, + "grad_norm": 0.6001542724307072, + "learning_rate": 4.4850458696433056e-05, + "loss": 2.0043, + "step": 3004 + }, + { + "epoch": 0.23, + "grad_norm": 0.538310238946539, + "learning_rate": 4.4846660743877056e-05, + "loss": 2.0936, + "step": 3005 + }, + { + "epoch": 0.23, + "grad_norm": 0.5412959976112217, + "learning_rate": 4.484286155221249e-05, + "loss": 2.1894, + "step": 3006 + }, + { + "epoch": 0.23, + "grad_norm": 0.6916963892320801, + "learning_rate": 4.483906112167656e-05, + "loss": 2.0347, + "step": 3007 + }, + { + "epoch": 0.23, + "grad_norm": 0.5378241418322235, + "learning_rate": 4.4835259452506544e-05, + "loss": 2.0322, + "step": 3008 + }, + { + "epoch": 0.23, + "grad_norm": 0.5293597395163336, + "learning_rate": 4.48314565449398e-05, + "loss": 2.2025, + "step": 3009 + }, + { + "epoch": 0.23, + "grad_norm": 0.5350689164969286, + "learning_rate": 4.482765239921375e-05, + "loss": 1.9953, + "step": 3010 + }, + { + "epoch": 0.23, + "grad_norm": 0.5125460775930459, + "learning_rate": 4.482384701556591e-05, + "loss": 2.0868, + "step": 3011 + }, + { + "epoch": 0.23, + "grad_norm": 0.5211971935313184, + "learning_rate": 4.482004039423385e-05, + "loss": 2.0176, + "step": 3012 + }, + { + "epoch": 0.23, + "grad_norm": 0.5327324633742367, + "learning_rate": 4.481623253545526e-05, + "loss": 2.0096, + "step": 3013 + }, + { + "epoch": 0.23, + "grad_norm": 0.5357325452896446, + "learning_rate": 4.481242343946787e-05, + "loss": 2.2573, + "step": 3014 + }, + { + "epoch": 0.23, + "grad_norm": 0.5107360098936532, + "learning_rate": 4.480861310650948e-05, + "loss": 2.0136, + "step": 3015 + }, + { + "epoch": 0.23, + "grad_norm": 0.5290318445763023, + "learning_rate": 4.4804801536818e-05, + "loss": 2.0329, + "step": 3016 + }, + { + "epoch": 0.23, + "grad_norm": 0.5453232158124682, + "learning_rate": 4.48009887306314e-05, + "loss": 2.1781, + "step": 3017 + }, + { + "epoch": 0.23, + "grad_norm": 0.5487487935389634, + "learning_rate": 4.4797174688187735e-05, + "loss": 2.045, + "step": 3018 + }, + { + "epoch": 0.23, + "grad_norm": 0.5505090687133408, + "learning_rate": 4.4793359409725115e-05, + "loss": 2.0087, + "step": 3019 + }, + { + "epoch": 0.23, + "grad_norm": 0.5448423394806262, + "learning_rate": 4.4789542895481765e-05, + "loss": 1.9594, + "step": 3020 + }, + { + "epoch": 0.23, + "grad_norm": 0.5858109388536379, + "learning_rate": 4.4785725145695944e-05, + "loss": 2.01, + "step": 3021 + }, + { + "epoch": 0.23, + "grad_norm": 0.6557275508743523, + "learning_rate": 4.478190616060603e-05, + "loss": 2.1915, + "step": 3022 + }, + { + "epoch": 0.23, + "grad_norm": 0.6174072734885235, + "learning_rate": 4.4778085940450446e-05, + "loss": 1.9886, + "step": 3023 + }, + { + "epoch": 0.23, + "grad_norm": 0.5644054020764692, + "learning_rate": 4.47742644854677e-05, + "loss": 2.07, + "step": 3024 + }, + { + "epoch": 0.23, + "grad_norm": 0.6221490422401692, + "learning_rate": 4.4770441795896396e-05, + "loss": 2.0201, + "step": 3025 + }, + { + "epoch": 0.23, + "grad_norm": 0.5690496479047262, + "learning_rate": 4.476661787197519e-05, + "loss": 2.2106, + "step": 3026 + }, + { + "epoch": 0.23, + "grad_norm": 0.5240771090098434, + "learning_rate": 4.4762792713942823e-05, + "loss": 2.0388, + "step": 3027 + }, + { + "epoch": 0.23, + "grad_norm": 0.6163574606302348, + "learning_rate": 4.475896632203812e-05, + "loss": 1.9703, + "step": 3028 + }, + { + "epoch": 0.23, + "grad_norm": 0.5223947186697642, + "learning_rate": 4.475513869649998e-05, + "loss": 2.1751, + "step": 3029 + }, + { + "epoch": 0.23, + "grad_norm": 0.5272071190591924, + "learning_rate": 4.4751309837567365e-05, + "loss": 2.0733, + "step": 3030 + }, + { + "epoch": 0.23, + "grad_norm": 0.5209451475983699, + "learning_rate": 4.4747479745479356e-05, + "loss": 1.9709, + "step": 3031 + }, + { + "epoch": 0.23, + "grad_norm": 0.5859910033416534, + "learning_rate": 4.474364842047505e-05, + "loss": 1.9599, + "step": 3032 + }, + { + "epoch": 0.23, + "grad_norm": 0.5208483879212257, + "learning_rate": 4.4739815862793665e-05, + "loss": 1.989, + "step": 3033 + }, + { + "epoch": 0.23, + "grad_norm": 0.5530816959492062, + "learning_rate": 4.4735982072674476e-05, + "loss": 2.22, + "step": 3034 + }, + { + "epoch": 0.23, + "grad_norm": 0.5502348586409187, + "learning_rate": 4.473214705035686e-05, + "loss": 2.0282, + "step": 3035 + }, + { + "epoch": 0.23, + "grad_norm": 0.531787217889324, + "learning_rate": 4.472831079608024e-05, + "loss": 2.0597, + "step": 3036 + }, + { + "epoch": 0.23, + "grad_norm": 0.5415553944593385, + "learning_rate": 4.4724473310084127e-05, + "loss": 2.0026, + "step": 3037 + }, + { + "epoch": 0.23, + "grad_norm": 0.5673921069995485, + "learning_rate": 4.4720634592608116e-05, + "loss": 2.1918, + "step": 3038 + }, + { + "epoch": 0.23, + "grad_norm": 0.5315174309918465, + "learning_rate": 4.471679464389187e-05, + "loss": 2.0076, + "step": 3039 + }, + { + "epoch": 0.23, + "grad_norm": 0.5764970181306173, + "learning_rate": 4.471295346417515e-05, + "loss": 1.9987, + "step": 3040 + }, + { + "epoch": 0.23, + "grad_norm": 0.5904856550179022, + "learning_rate": 4.470911105369774e-05, + "loss": 2.1834, + "step": 3041 + }, + { + "epoch": 0.23, + "grad_norm": 0.5784748903700866, + "learning_rate": 4.4705267412699575e-05, + "loss": 2.094, + "step": 3042 + }, + { + "epoch": 0.23, + "grad_norm": 0.5652651017744937, + "learning_rate": 4.470142254142061e-05, + "loss": 1.9928, + "step": 3043 + }, + { + "epoch": 0.23, + "grad_norm": 0.5251313855767901, + "learning_rate": 4.46975764401009e-05, + "loss": 2.0166, + "step": 3044 + }, + { + "epoch": 0.23, + "grad_norm": 0.6055042450496926, + "learning_rate": 4.4693729108980575e-05, + "loss": 2.0269, + "step": 3045 + }, + { + "epoch": 0.23, + "grad_norm": 0.5659951318790962, + "learning_rate": 4.4689880548299835e-05, + "loss": 2.2015, + "step": 3046 + }, + { + "epoch": 0.24, + "grad_norm": 0.5883111019040398, + "learning_rate": 4.468603075829897e-05, + "loss": 2.0019, + "step": 3047 + }, + { + "epoch": 0.24, + "grad_norm": 0.570511111654427, + "learning_rate": 4.4682179739218324e-05, + "loss": 1.99, + "step": 3048 + }, + { + "epoch": 0.24, + "grad_norm": 0.5144297455149546, + "learning_rate": 4.467832749129835e-05, + "loss": 2.0553, + "step": 3049 + }, + { + "epoch": 0.24, + "grad_norm": 0.5520219098859435, + "learning_rate": 4.467447401477954e-05, + "loss": 2.1901, + "step": 3050 + }, + { + "epoch": 0.24, + "grad_norm": 0.5605898048733032, + "learning_rate": 4.4670619309902505e-05, + "loss": 2.0287, + "step": 3051 + }, + { + "epoch": 0.24, + "grad_norm": 0.5462322543635677, + "learning_rate": 4.4666763376907885e-05, + "loss": 2.0158, + "step": 3052 + }, + { + "epoch": 0.24, + "grad_norm": 0.6783121051907095, + "learning_rate": 4.466290621603644e-05, + "loss": 2.041, + "step": 3053 + }, + { + "epoch": 0.24, + "grad_norm": 0.6015734271325199, + "learning_rate": 4.4659047827528986e-05, + "loss": 2.2325, + "step": 3054 + }, + { + "epoch": 0.24, + "grad_norm": 0.5001942558233803, + "learning_rate": 4.465518821162641e-05, + "loss": 2.0691, + "step": 3055 + }, + { + "epoch": 0.24, + "grad_norm": 0.5426410082398664, + "learning_rate": 4.465132736856969e-05, + "loss": 2.0224, + "step": 3056 + }, + { + "epoch": 0.24, + "grad_norm": 0.553152788743633, + "learning_rate": 4.464746529859987e-05, + "loss": 1.9675, + "step": 3057 + }, + { + "epoch": 0.24, + "grad_norm": 0.5585410004654184, + "learning_rate": 4.464360200195808e-05, + "loss": 2.251, + "step": 3058 + }, + { + "epoch": 0.24, + "grad_norm": 0.6035344159380756, + "learning_rate": 4.463973747888553e-05, + "loss": 2.0561, + "step": 3059 + }, + { + "epoch": 0.24, + "grad_norm": 0.5716873282611552, + "learning_rate": 4.4635871729623465e-05, + "loss": 1.954, + "step": 3060 + }, + { + "epoch": 0.24, + "grad_norm": 0.5413273484563424, + "learning_rate": 4.463200475441327e-05, + "loss": 2.037, + "step": 3061 + }, + { + "epoch": 0.24, + "grad_norm": 0.5620685447740756, + "learning_rate": 4.4628136553496375e-05, + "loss": 2.2371, + "step": 3062 + }, + { + "epoch": 0.24, + "grad_norm": 0.5256083268662024, + "learning_rate": 4.462426712711428e-05, + "loss": 2.0082, + "step": 3063 + }, + { + "epoch": 0.24, + "grad_norm": 0.5839402228358669, + "learning_rate": 4.462039647550856e-05, + "loss": 1.975, + "step": 3064 + }, + { + "epoch": 0.24, + "grad_norm": 0.5243750347545947, + "learning_rate": 4.46165245989209e-05, + "loss": 1.9719, + "step": 3065 + }, + { + "epoch": 0.24, + "grad_norm": 0.5869131625149748, + "learning_rate": 4.461265149759301e-05, + "loss": 2.2012, + "step": 3066 + }, + { + "epoch": 0.24, + "grad_norm": 0.5523463400391795, + "learning_rate": 4.460877717176672e-05, + "loss": 2.0669, + "step": 3067 + }, + { + "epoch": 0.24, + "grad_norm": 0.5321862799436589, + "learning_rate": 4.460490162168392e-05, + "loss": 1.9933, + "step": 3068 + }, + { + "epoch": 0.24, + "grad_norm": 0.6087579949830706, + "learning_rate": 4.460102484758656e-05, + "loss": 2.0305, + "step": 3069 + }, + { + "epoch": 0.24, + "grad_norm": 0.5283733021628734, + "learning_rate": 4.4597146849716706e-05, + "loss": 2.2382, + "step": 3070 + }, + { + "epoch": 0.24, + "grad_norm": 0.5608277290882208, + "learning_rate": 4.459326762831647e-05, + "loss": 2.0233, + "step": 3071 + }, + { + "epoch": 0.24, + "grad_norm": 0.6122561098971167, + "learning_rate": 4.458938718362804e-05, + "loss": 1.9949, + "step": 3072 + }, + { + "epoch": 0.24, + "grad_norm": 0.5639313766544823, + "learning_rate": 4.458550551589369e-05, + "loss": 2.0509, + "step": 3073 + }, + { + "epoch": 0.24, + "grad_norm": 0.5873973947858502, + "learning_rate": 4.458162262535578e-05, + "loss": 2.2305, + "step": 3074 + }, + { + "epoch": 0.24, + "grad_norm": 0.5196865169561186, + "learning_rate": 4.4577738512256716e-05, + "loss": 1.9592, + "step": 3075 + }, + { + "epoch": 0.24, + "grad_norm": 0.5556544493886333, + "learning_rate": 4.457385317683902e-05, + "loss": 1.9434, + "step": 3076 + }, + { + "epoch": 0.24, + "grad_norm": 0.538819078460489, + "learning_rate": 4.456996661934525e-05, + "loss": 2.0025, + "step": 3077 + }, + { + "epoch": 0.24, + "grad_norm": 0.528421568111892, + "learning_rate": 4.456607884001808e-05, + "loss": 2.2043, + "step": 3078 + }, + { + "epoch": 0.24, + "grad_norm": 0.6105065073154947, + "learning_rate": 4.4562189839100214e-05, + "loss": 1.9861, + "step": 3079 + }, + { + "epoch": 0.24, + "grad_norm": 0.5140314215635273, + "learning_rate": 4.455829961683448e-05, + "loss": 2.0949, + "step": 3080 + }, + { + "epoch": 0.24, + "grad_norm": 0.5519385280790672, + "learning_rate": 4.455440817346375e-05, + "loss": 1.9669, + "step": 3081 + }, + { + "epoch": 0.24, + "grad_norm": 0.6072245415538848, + "learning_rate": 4.4550515509231e-05, + "loss": 2.1725, + "step": 3082 + }, + { + "epoch": 0.24, + "grad_norm": 0.5272914001221394, + "learning_rate": 4.454662162437924e-05, + "loss": 2.0246, + "step": 3083 + }, + { + "epoch": 0.24, + "grad_norm": 0.5878263895969793, + "learning_rate": 4.454272651915158e-05, + "loss": 1.9421, + "step": 3084 + }, + { + "epoch": 0.24, + "grad_norm": 0.5632942506680477, + "learning_rate": 4.4538830193791235e-05, + "loss": 2.0143, + "step": 3085 + }, + { + "epoch": 0.24, + "grad_norm": 0.6011929662413099, + "learning_rate": 4.453493264854145e-05, + "loss": 2.2458, + "step": 3086 + }, + { + "epoch": 0.24, + "grad_norm": 0.5667613697920171, + "learning_rate": 4.4531033883645575e-05, + "loss": 1.9677, + "step": 3087 + }, + { + "epoch": 0.24, + "grad_norm": 0.5986957592903076, + "learning_rate": 4.452713389934701e-05, + "loss": 1.9914, + "step": 3088 + }, + { + "epoch": 0.24, + "grad_norm": 0.5781240041222101, + "learning_rate": 4.4523232695889264e-05, + "loss": 1.959, + "step": 3089 + }, + { + "epoch": 0.24, + "grad_norm": 0.6047900673537016, + "learning_rate": 4.451933027351589e-05, + "loss": 2.1844, + "step": 3090 + }, + { + "epoch": 0.24, + "grad_norm": 0.51749651286676, + "learning_rate": 4.451542663247054e-05, + "loss": 1.931, + "step": 3091 + }, + { + "epoch": 0.24, + "grad_norm": 0.608999343968275, + "learning_rate": 4.451152177299694e-05, + "loss": 2.0843, + "step": 3092 + }, + { + "epoch": 0.24, + "grad_norm": 0.5613464348121842, + "learning_rate": 4.4507615695338875e-05, + "loss": 2.0248, + "step": 3093 + }, + { + "epoch": 0.24, + "grad_norm": 0.5398056805071721, + "learning_rate": 4.4503708399740226e-05, + "loss": 2.2453, + "step": 3094 + }, + { + "epoch": 0.24, + "grad_norm": 0.5497581670785875, + "learning_rate": 4.449979988644494e-05, + "loss": 1.964, + "step": 3095 + }, + { + "epoch": 0.24, + "grad_norm": 0.5752277970428307, + "learning_rate": 4.449589015569704e-05, + "loss": 2.0048, + "step": 3096 + }, + { + "epoch": 0.24, + "grad_norm": 0.5429129664166198, + "learning_rate": 4.449197920774062e-05, + "loss": 2.0177, + "step": 3097 + }, + { + "epoch": 0.24, + "grad_norm": 0.5791858818444132, + "learning_rate": 4.448806704281986e-05, + "loss": 2.2341, + "step": 3098 + }, + { + "epoch": 0.24, + "grad_norm": 0.5311848929243846, + "learning_rate": 4.4484153661179026e-05, + "loss": 1.9975, + "step": 3099 + }, + { + "epoch": 0.24, + "grad_norm": 0.5833882168181015, + "learning_rate": 4.448023906306243e-05, + "loss": 1.9617, + "step": 3100 + }, + { + "epoch": 0.24, + "grad_norm": 0.636568355941607, + "learning_rate": 4.4476323248714483e-05, + "loss": 1.9783, + "step": 3101 + }, + { + "epoch": 0.24, + "grad_norm": 0.5190088671941866, + "learning_rate": 4.447240621837967e-05, + "loss": 2.1412, + "step": 3102 + }, + { + "epoch": 0.24, + "grad_norm": 0.6360666606981059, + "learning_rate": 4.446848797230254e-05, + "loss": 1.9726, + "step": 3103 + }, + { + "epoch": 0.24, + "grad_norm": 0.5417395305641296, + "learning_rate": 4.4464568510727725e-05, + "loss": 2.029, + "step": 3104 + }, + { + "epoch": 0.24, + "grad_norm": 0.5191390302221186, + "learning_rate": 4.446064783389994e-05, + "loss": 1.9548, + "step": 3105 + }, + { + "epoch": 0.24, + "grad_norm": 0.6407095709248257, + "learning_rate": 4.445672594206395e-05, + "loss": 2.1665, + "step": 3106 + }, + { + "epoch": 0.24, + "grad_norm": 0.5637518108239881, + "learning_rate": 4.445280283546465e-05, + "loss": 1.9612, + "step": 3107 + }, + { + "epoch": 0.24, + "grad_norm": 0.6665951520584106, + "learning_rate": 4.4448878514346934e-05, + "loss": 1.9967, + "step": 3108 + }, + { + "epoch": 0.24, + "grad_norm": 0.5235293209956146, + "learning_rate": 4.444495297895585e-05, + "loss": 2.0021, + "step": 3109 + }, + { + "epoch": 0.24, + "grad_norm": 0.5376388757881346, + "learning_rate": 4.4441026229536464e-05, + "loss": 2.1908, + "step": 3110 + }, + { + "epoch": 0.24, + "grad_norm": 0.5325745761634787, + "learning_rate": 4.443709826633394e-05, + "loss": 2.0606, + "step": 3111 + }, + { + "epoch": 0.24, + "grad_norm": 0.5406888155637817, + "learning_rate": 4.4433169089593536e-05, + "loss": 1.977, + "step": 3112 + }, + { + "epoch": 0.24, + "grad_norm": 0.523972810914124, + "learning_rate": 4.4429238699560536e-05, + "loss": 2.0081, + "step": 3113 + }, + { + "epoch": 0.24, + "grad_norm": 0.5858073658422365, + "learning_rate": 4.442530709648036e-05, + "loss": 2.1435, + "step": 3114 + }, + { + "epoch": 0.24, + "grad_norm": 0.5872113277258418, + "learning_rate": 4.442137428059846e-05, + "loss": 2.0062, + "step": 3115 + }, + { + "epoch": 0.24, + "grad_norm": 0.5492919489949836, + "learning_rate": 4.441744025216037e-05, + "loss": 2.0071, + "step": 3116 + }, + { + "epoch": 0.24, + "grad_norm": 0.563643418318092, + "learning_rate": 4.4413505011411714e-05, + "loss": 2.0355, + "step": 3117 + }, + { + "epoch": 0.24, + "grad_norm": 0.6653275474714171, + "learning_rate": 4.44095685585982e-05, + "loss": 2.1831, + "step": 3118 + }, + { + "epoch": 0.24, + "grad_norm": 0.5606047937706002, + "learning_rate": 4.4405630893965575e-05, + "loss": 1.9636, + "step": 3119 + }, + { + "epoch": 0.24, + "grad_norm": 0.6809669035294279, + "learning_rate": 4.4401692017759696e-05, + "loss": 1.9992, + "step": 3120 + }, + { + "epoch": 0.24, + "grad_norm": 0.601881053831601, + "learning_rate": 4.439775193022648e-05, + "loss": 1.9874, + "step": 3121 + }, + { + "epoch": 0.24, + "grad_norm": 0.6073000875923724, + "learning_rate": 4.439381063161192e-05, + "loss": 2.2055, + "step": 3122 + }, + { + "epoch": 0.24, + "grad_norm": 0.5848905662342224, + "learning_rate": 4.438986812216209e-05, + "loss": 2.0522, + "step": 3123 + }, + { + "epoch": 0.24, + "grad_norm": 0.65239186504742, + "learning_rate": 4.438592440212315e-05, + "loss": 2.0192, + "step": 3124 + }, + { + "epoch": 0.24, + "grad_norm": 0.5781483351767635, + "learning_rate": 4.43819794717413e-05, + "loss": 1.9548, + "step": 3125 + }, + { + "epoch": 0.24, + "grad_norm": 0.5548028137848052, + "learning_rate": 4.437803333126285e-05, + "loss": 2.2145, + "step": 3126 + }, + { + "epoch": 0.24, + "grad_norm": 0.5811463187359873, + "learning_rate": 4.4374085980934163e-05, + "loss": 1.9876, + "step": 3127 + }, + { + "epoch": 0.24, + "grad_norm": 0.5499536181347944, + "learning_rate": 4.437013742100171e-05, + "loss": 1.9467, + "step": 3128 + }, + { + "epoch": 0.24, + "grad_norm": 0.5579082879306126, + "learning_rate": 4.4366187651712e-05, + "loss": 2.0134, + "step": 3129 + }, + { + "epoch": 0.24, + "grad_norm": 0.5679271814494812, + "learning_rate": 4.436223667331163e-05, + "loss": 2.2096, + "step": 3130 + }, + { + "epoch": 0.24, + "grad_norm": 0.5264303092148903, + "learning_rate": 4.435828448604729e-05, + "loss": 2.031, + "step": 3131 + }, + { + "epoch": 0.24, + "grad_norm": 0.5704734844603361, + "learning_rate": 4.4354331090165723e-05, + "loss": 2.0001, + "step": 3132 + }, + { + "epoch": 0.24, + "grad_norm": 0.6073458689126624, + "learning_rate": 4.435037648591376e-05, + "loss": 1.982, + "step": 3133 + }, + { + "epoch": 0.24, + "grad_norm": 0.526037735751892, + "learning_rate": 4.434642067353829e-05, + "loss": 2.175, + "step": 3134 + }, + { + "epoch": 0.24, + "grad_norm": 0.6173599401846186, + "learning_rate": 4.434246365328631e-05, + "loss": 2.1007, + "step": 3135 + }, + { + "epoch": 0.24, + "grad_norm": 0.5558361519249272, + "learning_rate": 4.433850542540486e-05, + "loss": 1.9811, + "step": 3136 + }, + { + "epoch": 0.24, + "grad_norm": 0.6145086916061856, + "learning_rate": 4.433454599014107e-05, + "loss": 2.022, + "step": 3137 + }, + { + "epoch": 0.24, + "grad_norm": 0.5668464564979698, + "learning_rate": 4.4330585347742144e-05, + "loss": 2.2221, + "step": 3138 + }, + { + "epoch": 0.24, + "grad_norm": 0.5176113435286955, + "learning_rate": 4.432662349845537e-05, + "loss": 1.9587, + "step": 3139 + }, + { + "epoch": 0.24, + "grad_norm": 0.7541378393650352, + "learning_rate": 4.4322660442528086e-05, + "loss": 1.9759, + "step": 3140 + }, + { + "epoch": 0.24, + "grad_norm": 0.6357930037935415, + "learning_rate": 4.431869618020773e-05, + "loss": 1.9759, + "step": 3141 + }, + { + "epoch": 0.24, + "grad_norm": 0.5381859501403309, + "learning_rate": 4.4314730711741816e-05, + "loss": 2.2125, + "step": 3142 + }, + { + "epoch": 0.24, + "grad_norm": 0.6597049375354532, + "learning_rate": 4.4310764037377915e-05, + "loss": 1.9753, + "step": 3143 + }, + { + "epoch": 0.24, + "grad_norm": 0.5665729499487908, + "learning_rate": 4.430679615736368e-05, + "loss": 2.0038, + "step": 3144 + }, + { + "epoch": 0.24, + "grad_norm": 0.6275760667624822, + "learning_rate": 4.430282707194685e-05, + "loss": 1.9638, + "step": 3145 + }, + { + "epoch": 0.24, + "grad_norm": 0.5536298067015744, + "learning_rate": 4.429885678137523e-05, + "loss": 2.1439, + "step": 3146 + }, + { + "epoch": 0.24, + "grad_norm": 0.6063565792104014, + "learning_rate": 4.42948852858967e-05, + "loss": 2.0419, + "step": 3147 + }, + { + "epoch": 0.24, + "grad_norm": 0.6036209212881035, + "learning_rate": 4.429091258575922e-05, + "loss": 2.081, + "step": 3148 + }, + { + "epoch": 0.24, + "grad_norm": 0.5577181903040976, + "learning_rate": 4.4286938681210806e-05, + "loss": 2.0124, + "step": 3149 + }, + { + "epoch": 0.24, + "grad_norm": 0.6013356735735057, + "learning_rate": 4.428296357249958e-05, + "loss": 2.1882, + "step": 3150 + }, + { + "epoch": 0.24, + "grad_norm": 0.5494488809182315, + "learning_rate": 4.4278987259873725e-05, + "loss": 1.9539, + "step": 3151 + }, + { + "epoch": 0.24, + "grad_norm": 0.5816399600246024, + "learning_rate": 4.4275009743581496e-05, + "loss": 1.9172, + "step": 3152 + }, + { + "epoch": 0.24, + "grad_norm": 0.5485595113700477, + "learning_rate": 4.4271031023871226e-05, + "loss": 1.9945, + "step": 3153 + }, + { + "epoch": 0.24, + "grad_norm": 0.5072244229612652, + "learning_rate": 4.426705110099132e-05, + "loss": 2.0357, + "step": 3154 + }, + { + "epoch": 0.24, + "grad_norm": 0.5809621397385337, + "learning_rate": 4.426306997519026e-05, + "loss": 2.209, + "step": 3155 + }, + { + "epoch": 0.24, + "grad_norm": 0.5407912006525201, + "learning_rate": 4.425908764671661e-05, + "loss": 2.0054, + "step": 3156 + }, + { + "epoch": 0.24, + "grad_norm": 0.6263865302044387, + "learning_rate": 4.4255104115819e-05, + "loss": 1.9729, + "step": 3157 + }, + { + "epoch": 0.24, + "grad_norm": 0.5433905362304362, + "learning_rate": 4.4251119382746134e-05, + "loss": 2.1682, + "step": 3158 + }, + { + "epoch": 0.24, + "grad_norm": 0.5713421487637378, + "learning_rate": 4.424713344774681e-05, + "loss": 1.9676, + "step": 3159 + }, + { + "epoch": 0.24, + "grad_norm": 0.5477496155262481, + "learning_rate": 4.4243146311069864e-05, + "loss": 2.0346, + "step": 3160 + }, + { + "epoch": 0.24, + "grad_norm": 0.6021761618875515, + "learning_rate": 4.423915797296425e-05, + "loss": 2.0064, + "step": 3161 + }, + { + "epoch": 0.24, + "grad_norm": 0.6531923646236424, + "learning_rate": 4.4235168433678964e-05, + "loss": 2.2051, + "step": 3162 + }, + { + "epoch": 0.24, + "grad_norm": 0.5666492547643132, + "learning_rate": 4.423117769346309e-05, + "loss": 1.9859, + "step": 3163 + }, + { + "epoch": 0.24, + "grad_norm": 0.6673958926570421, + "learning_rate": 4.422718575256579e-05, + "loss": 1.9826, + "step": 3164 + }, + { + "epoch": 0.24, + "grad_norm": 0.6099876724947764, + "learning_rate": 4.42231926112363e-05, + "loss": 1.9916, + "step": 3165 + }, + { + "epoch": 0.24, + "grad_norm": 0.710935982016432, + "learning_rate": 4.421919826972391e-05, + "loss": 2.1488, + "step": 3166 + }, + { + "epoch": 0.24, + "grad_norm": 0.5346503417930785, + "learning_rate": 4.4215202728278036e-05, + "loss": 2.0467, + "step": 3167 + }, + { + "epoch": 0.24, + "grad_norm": 0.6288269047593603, + "learning_rate": 4.421120598714811e-05, + "loss": 1.9476, + "step": 3168 + }, + { + "epoch": 0.24, + "grad_norm": 0.5835183734857277, + "learning_rate": 4.4207208046583676e-05, + "loss": 1.9326, + "step": 3169 + }, + { + "epoch": 0.24, + "grad_norm": 0.787401906139243, + "learning_rate": 4.420320890683434e-05, + "loss": 2.2128, + "step": 3170 + }, + { + "epoch": 0.24, + "grad_norm": 0.5571158610331691, + "learning_rate": 4.419920856814977e-05, + "loss": 1.944, + "step": 3171 + }, + { + "epoch": 0.24, + "grad_norm": 0.7772338409273627, + "learning_rate": 4.4195207030779753e-05, + "loss": 1.952, + "step": 3172 + }, + { + "epoch": 0.24, + "grad_norm": 0.6350613062603871, + "learning_rate": 4.419120429497409e-05, + "loss": 2.1112, + "step": 3173 + }, + { + "epoch": 0.24, + "grad_norm": 0.8228409984522378, + "learning_rate": 4.418720036098272e-05, + "loss": 2.1932, + "step": 3174 + }, + { + "epoch": 0.24, + "grad_norm": 0.7123757766598553, + "learning_rate": 4.41831952290556e-05, + "loss": 1.9612, + "step": 3175 + }, + { + "epoch": 0.25, + "grad_norm": 0.711682191904777, + "learning_rate": 4.4179188899442785e-05, + "loss": 2.0422, + "step": 3176 + }, + { + "epoch": 0.25, + "grad_norm": 0.7173901475751948, + "learning_rate": 4.417518137239443e-05, + "loss": 2.0154, + "step": 3177 + }, + { + "epoch": 0.25, + "grad_norm": 0.5594013799117218, + "learning_rate": 4.417117264816072e-05, + "loss": 2.1322, + "step": 3178 + }, + { + "epoch": 0.25, + "grad_norm": 0.7223450837574492, + "learning_rate": 4.416716272699195e-05, + "loss": 2.0359, + "step": 3179 + }, + { + "epoch": 0.25, + "grad_norm": 0.5891216083427832, + "learning_rate": 4.4163151609138476e-05, + "loss": 1.9662, + "step": 3180 + }, + { + "epoch": 0.25, + "grad_norm": 0.6860727842026612, + "learning_rate": 4.4159139294850714e-05, + "loss": 1.9849, + "step": 3181 + }, + { + "epoch": 0.25, + "grad_norm": 0.6033572521887411, + "learning_rate": 4.415512578437918e-05, + "loss": 2.1296, + "step": 3182 + }, + { + "epoch": 0.25, + "grad_norm": 0.6047372855600465, + "learning_rate": 4.415111107797445e-05, + "loss": 1.9784, + "step": 3183 + }, + { + "epoch": 0.25, + "grad_norm": 0.613395815152527, + "learning_rate": 4.414709517588719e-05, + "loss": 1.9669, + "step": 3184 + }, + { + "epoch": 0.25, + "grad_norm": 0.6426145319120751, + "learning_rate": 4.414307807836811e-05, + "loss": 2.0461, + "step": 3185 + }, + { + "epoch": 0.25, + "grad_norm": 0.5595279473275597, + "learning_rate": 4.4139059785668026e-05, + "loss": 1.9541, + "step": 3186 + }, + { + "epoch": 0.25, + "grad_norm": 0.6924084155046255, + "learning_rate": 4.413504029803782e-05, + "loss": 2.1733, + "step": 3187 + }, + { + "epoch": 0.25, + "grad_norm": 0.5854552082090089, + "learning_rate": 4.413101961572843e-05, + "loss": 2.0053, + "step": 3188 + }, + { + "epoch": 0.25, + "grad_norm": 0.5657365601591201, + "learning_rate": 4.4126997738990904e-05, + "loss": 1.997, + "step": 3189 + }, + { + "epoch": 0.25, + "grad_norm": 0.6407234745274227, + "learning_rate": 4.4122974668076325e-05, + "loss": 2.2066, + "step": 3190 + }, + { + "epoch": 0.25, + "grad_norm": 0.5916786023686055, + "learning_rate": 4.411895040323588e-05, + "loss": 2.0836, + "step": 3191 + }, + { + "epoch": 0.25, + "grad_norm": 0.6032633930374778, + "learning_rate": 4.4114924944720816e-05, + "loss": 2.0106, + "step": 3192 + }, + { + "epoch": 0.25, + "grad_norm": 0.6342046580559905, + "learning_rate": 4.411089829278247e-05, + "loss": 1.945, + "step": 3193 + }, + { + "epoch": 0.25, + "grad_norm": 0.6389317662153187, + "learning_rate": 4.410687044767223e-05, + "loss": 2.1984, + "step": 3194 + }, + { + "epoch": 0.25, + "grad_norm": 0.6542441894082088, + "learning_rate": 4.410284140964157e-05, + "loss": 1.9958, + "step": 3195 + }, + { + "epoch": 0.25, + "grad_norm": 0.5937683115309178, + "learning_rate": 4.409881117894205e-05, + "loss": 1.9803, + "step": 3196 + }, + { + "epoch": 0.25, + "grad_norm": 0.5925280636945824, + "learning_rate": 4.4094779755825286e-05, + "loss": 1.9499, + "step": 3197 + }, + { + "epoch": 0.25, + "grad_norm": 0.5232937824866151, + "learning_rate": 4.409074714054298e-05, + "loss": 2.0667, + "step": 3198 + }, + { + "epoch": 0.25, + "grad_norm": 0.5308756033358528, + "learning_rate": 4.4086713333346904e-05, + "loss": 2.1992, + "step": 3199 + }, + { + "epoch": 0.25, + "grad_norm": 0.549670410488058, + "learning_rate": 4.40826783344889e-05, + "loss": 1.9567, + "step": 3200 + }, + { + "epoch": 0.25, + "grad_norm": 0.6109857083520281, + "learning_rate": 4.40786421442209e-05, + "loss": 1.9406, + "step": 3201 + }, + { + "epoch": 0.25, + "grad_norm": 0.5449108441190984, + "learning_rate": 4.407460476279489e-05, + "loss": 2.187, + "step": 3202 + }, + { + "epoch": 0.25, + "grad_norm": 0.5815709353056968, + "learning_rate": 4.4070566190462946e-05, + "loss": 2.0069, + "step": 3203 + }, + { + "epoch": 0.25, + "grad_norm": 0.5953099370600462, + "learning_rate": 4.406652642747721e-05, + "loss": 2.068, + "step": 3204 + }, + { + "epoch": 0.25, + "grad_norm": 0.5977740126693556, + "learning_rate": 4.40624854740899e-05, + "loss": 1.9676, + "step": 3205 + }, + { + "epoch": 0.25, + "grad_norm": 0.5857330017678563, + "learning_rate": 4.405844333055331e-05, + "loss": 2.011, + "step": 3206 + }, + { + "epoch": 0.25, + "grad_norm": 0.5383836634076384, + "learning_rate": 4.405439999711981e-05, + "loss": 2.2049, + "step": 3207 + }, + { + "epoch": 0.25, + "grad_norm": 0.5608673415896887, + "learning_rate": 4.405035547404185e-05, + "loss": 2.0398, + "step": 3208 + }, + { + "epoch": 0.25, + "grad_norm": 0.5436006016514275, + "learning_rate": 4.404630976157193e-05, + "loss": 1.9866, + "step": 3209 + }, + { + "epoch": 0.25, + "grad_norm": 0.5530611390368625, + "learning_rate": 4.404226285996265e-05, + "loss": 2.0972, + "step": 3210 + }, + { + "epoch": 0.25, + "grad_norm": 0.5285892806830715, + "learning_rate": 4.403821476946667e-05, + "loss": 2.1996, + "step": 3211 + }, + { + "epoch": 0.25, + "grad_norm": 0.951716874295601, + "learning_rate": 4.4034165490336725e-05, + "loss": 1.9245, + "step": 3212 + }, + { + "epoch": 0.25, + "grad_norm": 0.561144986898985, + "learning_rate": 4.403011502282564e-05, + "loss": 1.9978, + "step": 3213 + }, + { + "epoch": 0.25, + "grad_norm": 0.6404935613111418, + "learning_rate": 4.4026063367186296e-05, + "loss": 2.2001, + "step": 3214 + }, + { + "epoch": 0.25, + "grad_norm": 0.5516836754040394, + "learning_rate": 4.4022010523671655e-05, + "loss": 1.9813, + "step": 3215 + }, + { + "epoch": 0.25, + "grad_norm": 0.5183665819571563, + "learning_rate": 4.401795649253475e-05, + "loss": 2.0415, + "step": 3216 + }, + { + "epoch": 0.25, + "grad_norm": 0.6092366520886762, + "learning_rate": 4.40139012740287e-05, + "loss": 2.0294, + "step": 3217 + }, + { + "epoch": 0.25, + "grad_norm": 0.564567496348813, + "learning_rate": 4.400984486840668e-05, + "loss": 1.9832, + "step": 3218 + }, + { + "epoch": 0.25, + "grad_norm": 0.5862631154947727, + "learning_rate": 4.400578727592195e-05, + "loss": 2.1957, + "step": 3219 + }, + { + "epoch": 0.25, + "grad_norm": 0.557654457466685, + "learning_rate": 4.400172849682783e-05, + "loss": 1.9822, + "step": 3220 + }, + { + "epoch": 0.25, + "grad_norm": 0.6379741199535122, + "learning_rate": 4.399766853137776e-05, + "loss": 2.0048, + "step": 3221 + }, + { + "epoch": 0.25, + "grad_norm": 0.6000896596623431, + "learning_rate": 4.3993607379825184e-05, + "loss": 2.0727, + "step": 3222 + }, + { + "epoch": 0.25, + "grad_norm": 0.5509959785645787, + "learning_rate": 4.398954504242368e-05, + "loss": 2.1717, + "step": 3223 + }, + { + "epoch": 0.25, + "grad_norm": 0.548364913786382, + "learning_rate": 4.3985481519426864e-05, + "loss": 1.9587, + "step": 3224 + }, + { + "epoch": 0.25, + "grad_norm": 0.5515049139128383, + "learning_rate": 4.398141681108844e-05, + "loss": 1.9693, + "step": 3225 + }, + { + "epoch": 0.25, + "grad_norm": 0.605026834898499, + "learning_rate": 4.397735091766219e-05, + "loss": 2.1739, + "step": 3226 + }, + { + "epoch": 0.25, + "grad_norm": 0.5297361453332348, + "learning_rate": 4.397328383940196e-05, + "loss": 1.912, + "step": 3227 + }, + { + "epoch": 0.25, + "grad_norm": 0.5813205396252151, + "learning_rate": 4.396921557656168e-05, + "loss": 2.0003, + "step": 3228 + }, + { + "epoch": 0.25, + "grad_norm": 0.571226219283037, + "learning_rate": 4.396514612939534e-05, + "loss": 2.0465, + "step": 3229 + }, + { + "epoch": 0.25, + "grad_norm": 0.5635189241268812, + "learning_rate": 4.3961075498157024e-05, + "loss": 1.9911, + "step": 3230 + }, + { + "epoch": 0.25, + "grad_norm": 0.637102595991445, + "learning_rate": 4.395700368310086e-05, + "loss": 2.2116, + "step": 3231 + }, + { + "epoch": 0.25, + "grad_norm": 0.5920066194814708, + "learning_rate": 4.395293068448108e-05, + "loss": 1.9721, + "step": 3232 + }, + { + "epoch": 0.25, + "grad_norm": 0.56295339516942, + "learning_rate": 4.394885650255198e-05, + "loss": 1.996, + "step": 3233 + }, + { + "epoch": 0.25, + "grad_norm": 0.6921409740498028, + "learning_rate": 4.394478113756792e-05, + "loss": 2.2119, + "step": 3234 + }, + { + "epoch": 0.25, + "grad_norm": 0.5468943033338183, + "learning_rate": 4.394070458978336e-05, + "loss": 2.0813, + "step": 3235 + }, + { + "epoch": 0.25, + "grad_norm": 0.5740285660313974, + "learning_rate": 4.393662685945279e-05, + "loss": 1.9983, + "step": 3236 + }, + { + "epoch": 0.25, + "grad_norm": 0.5914369146109548, + "learning_rate": 4.393254794683082e-05, + "loss": 1.9919, + "step": 3237 + }, + { + "epoch": 0.25, + "grad_norm": 0.6080906127468796, + "learning_rate": 4.3928467852172094e-05, + "loss": 2.0118, + "step": 3238 + }, + { + "epoch": 0.25, + "grad_norm": 0.529411558711373, + "learning_rate": 4.392438657573137e-05, + "loss": 2.1488, + "step": 3239 + }, + { + "epoch": 0.25, + "grad_norm": 0.6052089419108222, + "learning_rate": 4.392030411776344e-05, + "loss": 1.9719, + "step": 3240 + }, + { + "epoch": 0.25, + "grad_norm": 0.5075801827915655, + "learning_rate": 4.39162204785232e-05, + "loss": 2.0834, + "step": 3241 + }, + { + "epoch": 0.25, + "grad_norm": 0.5661231696978897, + "learning_rate": 4.391213565826561e-05, + "loss": 1.9713, + "step": 3242 + }, + { + "epoch": 0.25, + "grad_norm": 0.5422298781509127, + "learning_rate": 4.390804965724569e-05, + "loss": 2.1621, + "step": 3243 + }, + { + "epoch": 0.25, + "grad_norm": 0.5721002124573331, + "learning_rate": 4.390396247571856e-05, + "loss": 2.0071, + "step": 3244 + }, + { + "epoch": 0.25, + "grad_norm": 0.5599377792621094, + "learning_rate": 4.3899874113939394e-05, + "loss": 1.9911, + "step": 3245 + }, + { + "epoch": 0.25, + "grad_norm": 0.5728331060929721, + "learning_rate": 4.389578457216345e-05, + "loss": 2.1599, + "step": 3246 + }, + { + "epoch": 0.25, + "grad_norm": 0.5723067273784446, + "learning_rate": 4.389169385064603e-05, + "loss": 2.0599, + "step": 3247 + }, + { + "epoch": 0.25, + "grad_norm": 0.5792381135786382, + "learning_rate": 4.3887601949642565e-05, + "loss": 1.9542, + "step": 3248 + }, + { + "epoch": 0.25, + "grad_norm": 0.5513730197014358, + "learning_rate": 4.3883508869408525e-05, + "loss": 2.0039, + "step": 3249 + }, + { + "epoch": 0.25, + "grad_norm": 0.5651637647062797, + "learning_rate": 4.387941461019944e-05, + "loss": 2.0264, + "step": 3250 + }, + { + "epoch": 0.25, + "grad_norm": 0.6029764896312595, + "learning_rate": 4.387531917227095e-05, + "loss": 2.1798, + "step": 3251 + }, + { + "epoch": 0.25, + "grad_norm": 0.5851202139593245, + "learning_rate": 4.3871222555878736e-05, + "loss": 1.9864, + "step": 3252 + }, + { + "epoch": 0.25, + "grad_norm": 0.5637226189453804, + "learning_rate": 4.386712476127858e-05, + "loss": 2.0785, + "step": 3253 + }, + { + "epoch": 0.25, + "grad_norm": 0.6260525313325669, + "learning_rate": 4.386302578872631e-05, + "loss": 1.9864, + "step": 3254 + }, + { + "epoch": 0.25, + "grad_norm": 0.6541163004093657, + "learning_rate": 4.385892563847785e-05, + "loss": 2.1882, + "step": 3255 + }, + { + "epoch": 0.25, + "grad_norm": 0.6197452157633231, + "learning_rate": 4.3854824310789193e-05, + "loss": 1.9975, + "step": 3256 + }, + { + "epoch": 0.25, + "grad_norm": 0.6661287455633321, + "learning_rate": 4.38507218059164e-05, + "loss": 1.9299, + "step": 3257 + }, + { + "epoch": 0.25, + "grad_norm": 0.6142717208272338, + "learning_rate": 4.384661812411559e-05, + "loss": 2.1479, + "step": 3258 + }, + { + "epoch": 0.25, + "grad_norm": 0.6815042646816536, + "learning_rate": 4.384251326564299e-05, + "loss": 1.9521, + "step": 3259 + }, + { + "epoch": 0.25, + "grad_norm": 0.6676586756612, + "learning_rate": 4.3838407230754885e-05, + "loss": 2.0306, + "step": 3260 + }, + { + "epoch": 0.25, + "grad_norm": 0.7664134930290358, + "learning_rate": 4.383430001970763e-05, + "loss": 1.9782, + "step": 3261 + }, + { + "epoch": 0.25, + "grad_norm": 0.5950400451396344, + "learning_rate": 4.383019163275764e-05, + "loss": 1.9454, + "step": 3262 + }, + { + "epoch": 0.25, + "grad_norm": 0.6731873750827059, + "learning_rate": 4.382608207016145e-05, + "loss": 2.1592, + "step": 3263 + }, + { + "epoch": 0.25, + "grad_norm": 0.7378730353702728, + "learning_rate": 4.38219713321756e-05, + "loss": 2.0057, + "step": 3264 + }, + { + "epoch": 0.25, + "grad_norm": 0.5912092684372481, + "learning_rate": 4.3817859419056764e-05, + "loss": 1.9883, + "step": 3265 + }, + { + "epoch": 0.25, + "grad_norm": 0.7862470092637582, + "learning_rate": 4.381374633106165e-05, + "loss": 2.0338, + "step": 3266 + }, + { + "epoch": 0.25, + "grad_norm": 0.5452156022144231, + "learning_rate": 4.380963206844707e-05, + "loss": 2.1739, + "step": 3267 + }, + { + "epoch": 0.25, + "grad_norm": 0.7542629381111885, + "learning_rate": 4.380551663146989e-05, + "loss": 1.9914, + "step": 3268 + }, + { + "epoch": 0.25, + "grad_norm": 0.6568776710991511, + "learning_rate": 4.380140002038704e-05, + "loss": 1.9548, + "step": 3269 + }, + { + "epoch": 0.25, + "grad_norm": 1.0378808608981127, + "learning_rate": 4.379728223545557e-05, + "loss": 1.9983, + "step": 3270 + }, + { + "epoch": 0.25, + "grad_norm": 0.6024612506549143, + "learning_rate": 4.379316327693253e-05, + "loss": 2.1696, + "step": 3271 + }, + { + "epoch": 0.25, + "grad_norm": 0.6521915125566463, + "learning_rate": 4.3789043145075115e-05, + "loss": 2.0377, + "step": 3272 + }, + { + "epoch": 0.25, + "grad_norm": 0.8575947938912385, + "learning_rate": 4.378492184014054e-05, + "loss": 1.9609, + "step": 3273 + }, + { + "epoch": 0.25, + "grad_norm": 0.631245056545762, + "learning_rate": 4.378079936238613e-05, + "loss": 1.986, + "step": 3274 + }, + { + "epoch": 0.25, + "grad_norm": 0.5619109139750522, + "learning_rate": 4.377667571206926e-05, + "loss": 2.203, + "step": 3275 + }, + { + "epoch": 0.25, + "grad_norm": 0.717538494358374, + "learning_rate": 4.377255088944738e-05, + "loss": 1.9305, + "step": 3276 + }, + { + "epoch": 0.25, + "grad_norm": 0.6133662391586667, + "learning_rate": 4.3768424894778045e-05, + "loss": 1.9429, + "step": 3277 + }, + { + "epoch": 0.25, + "grad_norm": 0.5759920123206065, + "learning_rate": 4.376429772831883e-05, + "loss": 2.0151, + "step": 3278 + }, + { + "epoch": 0.25, + "grad_norm": 0.6415481196106938, + "learning_rate": 4.376016939032742e-05, + "loss": 2.1659, + "step": 3279 + }, + { + "epoch": 0.25, + "grad_norm": 0.5931806179380206, + "learning_rate": 4.3756039881061574e-05, + "loss": 2.0373, + "step": 3280 + }, + { + "epoch": 0.25, + "grad_norm": 0.656448388706346, + "learning_rate": 4.3751909200779106e-05, + "loss": 1.987, + "step": 3281 + }, + { + "epoch": 0.25, + "grad_norm": 0.6091018394057396, + "learning_rate": 4.3747777349737903e-05, + "loss": 1.9788, + "step": 3282 + }, + { + "epoch": 0.25, + "grad_norm": 0.574948755391725, + "learning_rate": 4.374364432819595e-05, + "loss": 2.2037, + "step": 3283 + }, + { + "epoch": 0.25, + "grad_norm": 0.5352410128152982, + "learning_rate": 4.3739510136411286e-05, + "loss": 2.0342, + "step": 3284 + }, + { + "epoch": 0.25, + "grad_norm": 0.6116577759711661, + "learning_rate": 4.373537477464201e-05, + "loss": 1.9694, + "step": 3285 + }, + { + "epoch": 0.25, + "grad_norm": 0.5453778945851593, + "learning_rate": 4.373123824314633e-05, + "loss": 1.9504, + "step": 3286 + }, + { + "epoch": 0.25, + "grad_norm": 0.6170036439003084, + "learning_rate": 4.3727100542182486e-05, + "loss": 2.2053, + "step": 3287 + }, + { + "epoch": 0.25, + "grad_norm": 0.6202737169495443, + "learning_rate": 4.372296167200883e-05, + "loss": 1.9792, + "step": 3288 + }, + { + "epoch": 0.25, + "grad_norm": 0.5979822188804368, + "learning_rate": 4.3718821632883756e-05, + "loss": 1.9862, + "step": 3289 + }, + { + "epoch": 0.25, + "grad_norm": 0.6348792791900096, + "learning_rate": 4.371468042506576e-05, + "loss": 1.9002, + "step": 3290 + }, + { + "epoch": 0.25, + "grad_norm": 0.6280045932829521, + "learning_rate": 4.371053804881337e-05, + "loss": 2.1955, + "step": 3291 + }, + { + "epoch": 0.25, + "grad_norm": 0.7290309835986681, + "learning_rate": 4.3706394504385226e-05, + "loss": 1.9748, + "step": 3292 + }, + { + "epoch": 0.25, + "grad_norm": 0.6870690638773312, + "learning_rate": 4.370224979204003e-05, + "loss": 1.9886, + "step": 3293 + }, + { + "epoch": 0.25, + "grad_norm": 0.6793562269941067, + "learning_rate": 4.369810391203655e-05, + "loss": 1.9683, + "step": 3294 + }, + { + "epoch": 0.25, + "grad_norm": 0.6613749313176212, + "learning_rate": 4.3693956864633626e-05, + "loss": 2.1846, + "step": 3295 + }, + { + "epoch": 0.25, + "grad_norm": 0.6103176616772819, + "learning_rate": 4.368980865009017e-05, + "loss": 1.9372, + "step": 3296 + }, + { + "epoch": 0.25, + "grad_norm": 0.6792652753475463, + "learning_rate": 4.368565926866519e-05, + "loss": 2.0678, + "step": 3297 + }, + { + "epoch": 0.25, + "grad_norm": 0.5682146921406908, + "learning_rate": 4.368150872061774e-05, + "loss": 1.9393, + "step": 3298 + }, + { + "epoch": 0.25, + "grad_norm": 0.7450505181751049, + "learning_rate": 4.3677357006206954e-05, + "loss": 2.1528, + "step": 3299 + }, + { + "epoch": 0.25, + "grad_norm": 0.6700431380599663, + "learning_rate": 4.367320412569204e-05, + "loss": 1.9847, + "step": 3300 + }, + { + "epoch": 0.25, + "grad_norm": 0.7540918979853474, + "learning_rate": 4.3669050079332274e-05, + "loss": 1.9376, + "step": 3301 + }, + { + "epoch": 0.25, + "grad_norm": 0.6213174934804948, + "learning_rate": 4.3664894867387027e-05, + "loss": 1.9505, + "step": 3302 + }, + { + "epoch": 0.25, + "grad_norm": 0.8059883836267961, + "learning_rate": 4.366073849011572e-05, + "loss": 2.2246, + "step": 3303 + }, + { + "epoch": 0.25, + "grad_norm": 0.5488375739823435, + "learning_rate": 4.3656580947777836e-05, + "loss": 2.0007, + "step": 3304 + }, + { + "epoch": 0.25, + "grad_norm": 0.6484767385118575, + "learning_rate": 4.365242224063297e-05, + "loss": 1.9556, + "step": 3305 + }, + { + "epoch": 0.26, + "grad_norm": 0.662754931395236, + "learning_rate": 4.364826236894075e-05, + "loss": 1.9855, + "step": 3306 + }, + { + "epoch": 0.26, + "grad_norm": 0.6733678515166376, + "learning_rate": 4.364410133296091e-05, + "loss": 2.1629, + "step": 3307 + }, + { + "epoch": 0.26, + "grad_norm": 0.6135568690289923, + "learning_rate": 4.363993913295322e-05, + "loss": 1.9612, + "step": 3308 + }, + { + "epoch": 0.26, + "grad_norm": 0.7336750901568297, + "learning_rate": 4.3635775769177566e-05, + "loss": 2.1149, + "step": 3309 + }, + { + "epoch": 0.26, + "grad_norm": 0.5524292795243116, + "learning_rate": 4.3631611241893874e-05, + "loss": 1.9865, + "step": 3310 + }, + { + "epoch": 0.26, + "grad_norm": 0.6138231739158613, + "learning_rate": 4.362744555136214e-05, + "loss": 2.1715, + "step": 3311 + }, + { + "epoch": 0.26, + "grad_norm": 0.6072329554306661, + "learning_rate": 4.3623278697842465e-05, + "loss": 2.044, + "step": 3312 + }, + { + "epoch": 0.26, + "grad_norm": 0.6522704183978929, + "learning_rate": 4.3619110681594996e-05, + "loss": 1.9732, + "step": 3313 + }, + { + "epoch": 0.26, + "grad_norm": 0.6357362447883339, + "learning_rate": 4.361494150287996e-05, + "loss": 1.9921, + "step": 3314 + }, + { + "epoch": 0.26, + "grad_norm": 1.319668401554552, + "learning_rate": 4.361077116195764e-05, + "loss": 2.1918, + "step": 3315 + }, + { + "epoch": 0.26, + "grad_norm": 0.6166332987090383, + "learning_rate": 4.360659965908843e-05, + "loss": 1.9881, + "step": 3316 + }, + { + "epoch": 0.26, + "grad_norm": 0.765480390873279, + "learning_rate": 4.360242699453278e-05, + "loss": 2.0332, + "step": 3317 + }, + { + "epoch": 0.26, + "grad_norm": 0.6681156507324202, + "learning_rate": 4.3598253168551165e-05, + "loss": 2.0119, + "step": 3318 + }, + { + "epoch": 0.26, + "grad_norm": 0.7144967362057545, + "learning_rate": 4.359407818140422e-05, + "loss": 2.2072, + "step": 3319 + }, + { + "epoch": 0.26, + "grad_norm": 0.6340455968661336, + "learning_rate": 4.358990203335258e-05, + "loss": 1.9728, + "step": 3320 + }, + { + "epoch": 0.26, + "grad_norm": 1.3053541099892205, + "learning_rate": 4.3585724724656985e-05, + "loss": 1.9663, + "step": 3321 + }, + { + "epoch": 0.26, + "grad_norm": 0.7740309550279676, + "learning_rate": 4.358154625557825e-05, + "loss": 2.0592, + "step": 3322 + }, + { + "epoch": 0.26, + "grad_norm": 0.5487803186105508, + "learning_rate": 4.357736662637725e-05, + "loss": 2.1661, + "step": 3323 + }, + { + "epoch": 0.26, + "grad_norm": 0.7036017129551192, + "learning_rate": 4.357318583731492e-05, + "loss": 2.0091, + "step": 3324 + }, + { + "epoch": 0.26, + "grad_norm": 0.7003622111423636, + "learning_rate": 4.35690038886523e-05, + "loss": 2.0285, + "step": 3325 + }, + { + "epoch": 0.26, + "grad_norm": 0.6566763101795272, + "learning_rate": 4.3564820780650496e-05, + "loss": 1.9786, + "step": 3326 + }, + { + "epoch": 0.26, + "grad_norm": 0.755847884163255, + "learning_rate": 4.356063651357066e-05, + "loss": 2.2053, + "step": 3327 + }, + { + "epoch": 0.26, + "grad_norm": 0.7177248356640873, + "learning_rate": 4.355645108767403e-05, + "loss": 2.0294, + "step": 3328 + }, + { + "epoch": 0.26, + "grad_norm": 6.4502657034409685, + "learning_rate": 4.355226450322194e-05, + "loss": 1.9614, + "step": 3329 + }, + { + "epoch": 0.26, + "grad_norm": 0.6330405130890208, + "learning_rate": 4.354807676047575e-05, + "loss": 1.9972, + "step": 3330 + }, + { + "epoch": 0.26, + "grad_norm": 0.5886191931052352, + "learning_rate": 4.354388785969694e-05, + "loss": 2.2162, + "step": 3331 + }, + { + "epoch": 0.26, + "grad_norm": 0.6482163122056924, + "learning_rate": 4.353969780114703e-05, + "loss": 1.972, + "step": 3332 + }, + { + "epoch": 0.26, + "grad_norm": 0.6868287014165596, + "learning_rate": 4.3535506585087615e-05, + "loss": 1.9384, + "step": 3333 + }, + { + "epoch": 0.26, + "grad_norm": 0.837657520789086, + "learning_rate": 4.353131421178038e-05, + "loss": 2.0328, + "step": 3334 + }, + { + "epoch": 0.26, + "grad_norm": 0.6424904278086672, + "learning_rate": 4.352712068148708e-05, + "loss": 2.1864, + "step": 3335 + }, + { + "epoch": 0.26, + "grad_norm": 0.644194454034765, + "learning_rate": 4.352292599446951e-05, + "loss": 1.9619, + "step": 3336 + }, + { + "epoch": 0.26, + "grad_norm": 0.6396380955528167, + "learning_rate": 4.351873015098958e-05, + "loss": 2.0026, + "step": 3337 + }, + { + "epoch": 0.26, + "grad_norm": 0.5787244809705991, + "learning_rate": 4.351453315130926e-05, + "loss": 2.0064, + "step": 3338 + }, + { + "epoch": 0.26, + "grad_norm": 0.5886744309229474, + "learning_rate": 4.3510334995690566e-05, + "loss": 2.1857, + "step": 3339 + }, + { + "epoch": 0.26, + "grad_norm": 0.5810043770484355, + "learning_rate": 4.3506135684395624e-05, + "loss": 2.0112, + "step": 3340 + }, + { + "epoch": 0.26, + "grad_norm": 0.5734649583177113, + "learning_rate": 4.3501935217686593e-05, + "loss": 1.9991, + "step": 3341 + }, + { + "epoch": 0.26, + "grad_norm": 0.6017723494101405, + "learning_rate": 4.349773359582575e-05, + "loss": 1.9533, + "step": 3342 + }, + { + "epoch": 0.26, + "grad_norm": 0.6643558397714299, + "learning_rate": 4.34935308190754e-05, + "loss": 2.1964, + "step": 3343 + }, + { + "epoch": 0.26, + "grad_norm": 0.8339375034204594, + "learning_rate": 4.348932688769796e-05, + "loss": 1.9827, + "step": 3344 + }, + { + "epoch": 0.26, + "grad_norm": 0.7680698767254079, + "learning_rate": 4.348512180195587e-05, + "loss": 1.9871, + "step": 3345 + }, + { + "epoch": 0.26, + "grad_norm": 0.5630311519894491, + "learning_rate": 4.3480915562111704e-05, + "loss": 2.0061, + "step": 3346 + }, + { + "epoch": 0.26, + "grad_norm": 0.7274786665264129, + "learning_rate": 4.347670816842804e-05, + "loss": 2.2065, + "step": 3347 + }, + { + "epoch": 0.26, + "grad_norm": 0.5736075195281751, + "learning_rate": 4.347249962116759e-05, + "loss": 1.9347, + "step": 3348 + }, + { + "epoch": 0.26, + "grad_norm": 0.8256146733591994, + "learning_rate": 4.3468289920593105e-05, + "loss": 2.0248, + "step": 3349 + }, + { + "epoch": 0.26, + "grad_norm": 0.6201616414372438, + "learning_rate": 4.346407906696741e-05, + "loss": 2.0011, + "step": 3350 + }, + { + "epoch": 0.26, + "grad_norm": 0.5449729622105118, + "learning_rate": 4.3459867060553406e-05, + "loss": 2.1792, + "step": 3351 + }, + { + "epoch": 0.26, + "grad_norm": 0.5966042895886446, + "learning_rate": 4.345565390161406e-05, + "loss": 1.9734, + "step": 3352 + }, + { + "epoch": 0.26, + "grad_norm": 0.5739747873451795, + "learning_rate": 4.345143959041243e-05, + "loss": 2.0546, + "step": 3353 + }, + { + "epoch": 0.26, + "grad_norm": 0.5709630257652846, + "learning_rate": 4.3447224127211615e-05, + "loss": 1.9761, + "step": 3354 + }, + { + "epoch": 0.26, + "grad_norm": 0.5395760554946689, + "learning_rate": 4.344300751227483e-05, + "loss": 2.2084, + "step": 3355 + }, + { + "epoch": 0.26, + "grad_norm": 0.649107722615574, + "learning_rate": 4.3438789745865316e-05, + "loss": 2.0233, + "step": 3356 + }, + { + "epoch": 0.26, + "grad_norm": 0.6035839514149086, + "learning_rate": 4.3434570828246404e-05, + "loss": 1.9721, + "step": 3357 + }, + { + "epoch": 0.26, + "grad_norm": 0.5679824546491449, + "learning_rate": 4.34303507596815e-05, + "loss": 1.9765, + "step": 3358 + }, + { + "epoch": 0.26, + "grad_norm": 0.6048435047577161, + "learning_rate": 4.34261295404341e-05, + "loss": 2.2147, + "step": 3359 + }, + { + "epoch": 0.26, + "grad_norm": 0.5830877140802417, + "learning_rate": 4.342190717076773e-05, + "loss": 1.9578, + "step": 3360 + }, + { + "epoch": 0.26, + "grad_norm": 0.5382855391966205, + "learning_rate": 4.341768365094601e-05, + "loss": 1.9491, + "step": 3361 + }, + { + "epoch": 0.26, + "grad_norm": 0.5322028188645407, + "learning_rate": 4.3413458981232645e-05, + "loss": 1.9556, + "step": 3362 + }, + { + "epoch": 0.26, + "grad_norm": 0.5714307426896813, + "learning_rate": 4.340923316189138e-05, + "loss": 2.1344, + "step": 3363 + }, + { + "epoch": 0.26, + "grad_norm": 0.552061407346916, + "learning_rate": 4.3405006193186076e-05, + "loss": 1.976, + "step": 3364 + }, + { + "epoch": 0.26, + "grad_norm": 0.49554861170762793, + "learning_rate": 4.340077807538062e-05, + "loss": 2.0375, + "step": 3365 + }, + { + "epoch": 0.26, + "grad_norm": 0.5813390994549189, + "learning_rate": 4.3396548808738996e-05, + "loss": 1.9707, + "step": 3366 + }, + { + "epoch": 0.26, + "grad_norm": 0.5767326573369294, + "learning_rate": 4.339231839352526e-05, + "loss": 2.1926, + "step": 3367 + }, + { + "epoch": 0.26, + "grad_norm": 0.5883719675809757, + "learning_rate": 4.338808683000352e-05, + "loss": 1.9938, + "step": 3368 + }, + { + "epoch": 0.26, + "grad_norm": 0.6041851103058128, + "learning_rate": 4.3383854118437985e-05, + "loss": 1.9535, + "step": 3369 + }, + { + "epoch": 0.26, + "grad_norm": 0.5795335634206933, + "learning_rate": 4.337962025909291e-05, + "loss": 1.9767, + "step": 3370 + }, + { + "epoch": 0.26, + "grad_norm": 0.6099125154451848, + "learning_rate": 4.337538525223264e-05, + "loss": 2.0771, + "step": 3371 + }, + { + "epoch": 0.26, + "grad_norm": 0.6215558318063329, + "learning_rate": 4.337114909812158e-05, + "loss": 2.1499, + "step": 3372 + }, + { + "epoch": 0.26, + "grad_norm": 0.6072367112798203, + "learning_rate": 4.3366911797024213e-05, + "loss": 1.9993, + "step": 3373 + }, + { + "epoch": 0.26, + "grad_norm": 0.6916851379853962, + "learning_rate": 4.3362673349205084e-05, + "loss": 1.9861, + "step": 3374 + }, + { + "epoch": 0.26, + "grad_norm": 0.6713459140871444, + "learning_rate": 4.3358433754928826e-05, + "loss": 2.1917, + "step": 3375 + }, + { + "epoch": 0.26, + "grad_norm": 0.6458754280666313, + "learning_rate": 4.335419301446014e-05, + "loss": 1.9721, + "step": 3376 + }, + { + "epoch": 0.26, + "grad_norm": 0.7231586738055606, + "learning_rate": 4.334995112806377e-05, + "loss": 1.9626, + "step": 3377 + }, + { + "epoch": 0.26, + "grad_norm": 0.6093023144109259, + "learning_rate": 4.3345708096004574e-05, + "loss": 2.0173, + "step": 3378 + }, + { + "epoch": 0.26, + "grad_norm": 0.7243109850912278, + "learning_rate": 4.334146391854745e-05, + "loss": 2.2143, + "step": 3379 + }, + { + "epoch": 0.26, + "grad_norm": 0.6681658855781014, + "learning_rate": 4.3337218595957384e-05, + "loss": 1.9983, + "step": 3380 + }, + { + "epoch": 0.26, + "grad_norm": 0.6424942320027673, + "learning_rate": 4.333297212849944e-05, + "loss": 1.9515, + "step": 3381 + }, + { + "epoch": 0.26, + "grad_norm": 0.6980032375813264, + "learning_rate": 4.332872451643872e-05, + "loss": 2.0125, + "step": 3382 + }, + { + "epoch": 0.26, + "grad_norm": 0.7749696413298154, + "learning_rate": 4.332447576004044e-05, + "loss": 2.1838, + "step": 3383 + }, + { + "epoch": 0.26, + "grad_norm": 0.6050023657534769, + "learning_rate": 4.332022585956986e-05, + "loss": 1.9945, + "step": 3384 + }, + { + "epoch": 0.26, + "grad_norm": 0.5621619946895124, + "learning_rate": 4.331597481529232e-05, + "loss": 1.9254, + "step": 3385 + }, + { + "epoch": 0.26, + "grad_norm": 0.7756037528524355, + "learning_rate": 4.331172262747322e-05, + "loss": 1.9577, + "step": 3386 + }, + { + "epoch": 0.26, + "grad_norm": 0.6034669934600024, + "learning_rate": 4.3307469296378055e-05, + "loss": 2.1973, + "step": 3387 + }, + { + "epoch": 0.26, + "grad_norm": 0.6964275124768771, + "learning_rate": 4.330321482227237e-05, + "loss": 1.9351, + "step": 3388 + }, + { + "epoch": 0.26, + "grad_norm": 0.6379454919002772, + "learning_rate": 4.3298959205421795e-05, + "loss": 1.9611, + "step": 3389 + }, + { + "epoch": 0.26, + "grad_norm": 0.6556110713189239, + "learning_rate": 4.329470244609202e-05, + "loss": 2.0276, + "step": 3390 + }, + { + "epoch": 0.26, + "grad_norm": 0.5790557258477006, + "learning_rate": 4.329044454454882e-05, + "loss": 2.1435, + "step": 3391 + }, + { + "epoch": 0.26, + "grad_norm": 0.6192100637553798, + "learning_rate": 4.328618550105802e-05, + "loss": 1.966, + "step": 3392 + }, + { + "epoch": 0.26, + "grad_norm": 0.6438452810327356, + "learning_rate": 4.328192531588555e-05, + "loss": 1.9956, + "step": 3393 + }, + { + "epoch": 0.26, + "grad_norm": 0.6844017010091079, + "learning_rate": 4.327766398929737e-05, + "loss": 1.9829, + "step": 3394 + }, + { + "epoch": 0.26, + "grad_norm": 0.6605226868598343, + "learning_rate": 4.327340152155954e-05, + "loss": 2.1895, + "step": 3395 + }, + { + "epoch": 0.26, + "grad_norm": 0.6714904356371987, + "learning_rate": 4.3269137912938185e-05, + "loss": 2.0371, + "step": 3396 + }, + { + "epoch": 0.26, + "grad_norm": 0.6537050452483303, + "learning_rate": 4.32648731636995e-05, + "loss": 1.9836, + "step": 3397 + }, + { + "epoch": 0.26, + "grad_norm": 0.596123098414866, + "learning_rate": 4.326060727410975e-05, + "loss": 1.9887, + "step": 3398 + }, + { + "epoch": 0.26, + "grad_norm": 0.6259851481223893, + "learning_rate": 4.325634024443527e-05, + "loss": 2.1737, + "step": 3399 + }, + { + "epoch": 0.26, + "grad_norm": 0.7210761280243484, + "learning_rate": 4.325207207494247e-05, + "loss": 1.9736, + "step": 3400 + }, + { + "epoch": 0.26, + "grad_norm": 0.6334299517706657, + "learning_rate": 4.3247802765897825e-05, + "loss": 1.9371, + "step": 3401 + }, + { + "epoch": 0.26, + "grad_norm": 0.5958427464469437, + "learning_rate": 4.324353231756789e-05, + "loss": 2.0299, + "step": 3402 + }, + { + "epoch": 0.26, + "grad_norm": 0.5876429673431605, + "learning_rate": 4.3239260730219286e-05, + "loss": 1.9757, + "step": 3403 + }, + { + "epoch": 0.26, + "grad_norm": 0.6369202401944646, + "learning_rate": 4.323498800411872e-05, + "loss": 2.1648, + "step": 3404 + }, + { + "epoch": 0.26, + "grad_norm": 0.5352769254506956, + "learning_rate": 4.323071413953292e-05, + "loss": 1.9338, + "step": 3405 + }, + { + "epoch": 0.26, + "grad_norm": 0.5900630856857613, + "learning_rate": 4.322643913672876e-05, + "loss": 1.9717, + "step": 3406 + }, + { + "epoch": 0.26, + "grad_norm": 0.6130648586107842, + "learning_rate": 4.322216299597312e-05, + "loss": 2.1544, + "step": 3407 + }, + { + "epoch": 0.26, + "grad_norm": 0.5598692510465991, + "learning_rate": 4.321788571753299e-05, + "loss": 1.9428, + "step": 3408 + }, + { + "epoch": 0.26, + "grad_norm": 0.6758401584062906, + "learning_rate": 4.321360730167541e-05, + "loss": 2.0142, + "step": 3409 + }, + { + "epoch": 0.26, + "grad_norm": 0.6073161985836986, + "learning_rate": 4.3209327748667506e-05, + "loss": 1.9879, + "step": 3410 + }, + { + "epoch": 0.26, + "grad_norm": 0.6877559943121544, + "learning_rate": 4.320504705877646e-05, + "loss": 2.1397, + "step": 3411 + }, + { + "epoch": 0.26, + "grad_norm": 0.7371740557359265, + "learning_rate": 4.320076523226954e-05, + "loss": 2.015, + "step": 3412 + }, + { + "epoch": 0.26, + "grad_norm": 0.6645100746706, + "learning_rate": 4.319648226941408e-05, + "loss": 2.0135, + "step": 3413 + }, + { + "epoch": 0.26, + "grad_norm": 0.6395566917933644, + "learning_rate": 4.3192198170477474e-05, + "loss": 1.9693, + "step": 3414 + }, + { + "epoch": 0.26, + "grad_norm": 0.7190239991781591, + "learning_rate": 4.3187912935727204e-05, + "loss": 2.0273, + "step": 3415 + }, + { + "epoch": 0.26, + "grad_norm": 0.6344239998005745, + "learning_rate": 4.318362656543081e-05, + "loss": 2.1596, + "step": 3416 + }, + { + "epoch": 0.26, + "grad_norm": 0.5633102568156599, + "learning_rate": 4.3179339059855904e-05, + "loss": 1.9564, + "step": 3417 + }, + { + "epoch": 0.26, + "grad_norm": 0.720819688413015, + "learning_rate": 4.3175050419270185e-05, + "loss": 2.0135, + "step": 3418 + }, + { + "epoch": 0.26, + "grad_norm": 0.602534218540415, + "learning_rate": 4.3170760643941396e-05, + "loss": 2.1653, + "step": 3419 + }, + { + "epoch": 0.26, + "grad_norm": 0.6547663549626784, + "learning_rate": 4.316646973413738e-05, + "loss": 1.927, + "step": 3420 + }, + { + "epoch": 0.26, + "grad_norm": 0.6419636206338514, + "learning_rate": 4.316217769012603e-05, + "loss": 2.0368, + "step": 3421 + }, + { + "epoch": 0.26, + "grad_norm": 0.5962335720010901, + "learning_rate": 4.315788451217531e-05, + "loss": 1.9468, + "step": 3422 + }, + { + "epoch": 0.26, + "grad_norm": 0.7040722502169042, + "learning_rate": 4.315359020055327e-05, + "loss": 1.9613, + "step": 3423 + }, + { + "epoch": 0.26, + "grad_norm": 0.6533006368460195, + "learning_rate": 4.314929475552801e-05, + "loss": 2.2073, + "step": 3424 + }, + { + "epoch": 0.26, + "grad_norm": 0.5992028038902805, + "learning_rate": 4.314499817736773e-05, + "loss": 2.0205, + "step": 3425 + }, + { + "epoch": 0.26, + "grad_norm": 0.6436494245218551, + "learning_rate": 4.314070046634067e-05, + "loss": 1.9634, + "step": 3426 + }, + { + "epoch": 0.26, + "grad_norm": 0.6298600433972793, + "learning_rate": 4.313640162271514e-05, + "loss": 2.0242, + "step": 3427 + }, + { + "epoch": 0.26, + "grad_norm": 0.622413120761477, + "learning_rate": 4.313210164675957e-05, + "loss": 2.1217, + "step": 3428 + }, + { + "epoch": 0.26, + "grad_norm": 0.6373264323441286, + "learning_rate": 4.312780053874239e-05, + "loss": 1.9176, + "step": 3429 + }, + { + "epoch": 0.26, + "grad_norm": 0.6206090643465045, + "learning_rate": 4.312349829893217e-05, + "loss": 1.9809, + "step": 3430 + }, + { + "epoch": 0.26, + "grad_norm": 0.5844543139264697, + "learning_rate": 4.3119194927597486e-05, + "loss": 2.1895, + "step": 3431 + }, + { + "epoch": 0.26, + "grad_norm": 0.5985566092734054, + "learning_rate": 4.3114890425007023e-05, + "loss": 1.9694, + "step": 3432 + }, + { + "epoch": 0.26, + "grad_norm": 0.5846860511863712, + "learning_rate": 4.311058479142954e-05, + "loss": 2.0556, + "step": 3433 + }, + { + "epoch": 0.26, + "grad_norm": 0.6473493468413358, + "learning_rate": 4.310627802713385e-05, + "loss": 1.9571, + "step": 3434 + }, + { + "epoch": 0.27, + "grad_norm": 0.5674972753856153, + "learning_rate": 4.310197013238884e-05, + "loss": 1.965, + "step": 3435 + }, + { + "epoch": 0.27, + "grad_norm": 0.714437546061388, + "learning_rate": 4.309766110746346e-05, + "loss": 2.121, + "step": 3436 + }, + { + "epoch": 0.27, + "grad_norm": 0.5960384068287663, + "learning_rate": 4.309335095262676e-05, + "loss": 1.958, + "step": 3437 + }, + { + "epoch": 0.27, + "grad_norm": 0.5710089477893111, + "learning_rate": 4.308903966814782e-05, + "loss": 1.9827, + "step": 3438 + }, + { + "epoch": 0.27, + "grad_norm": 0.7151940356901383, + "learning_rate": 4.308472725429583e-05, + "loss": 2.1806, + "step": 3439 + }, + { + "epoch": 0.27, + "grad_norm": 0.560446721333317, + "learning_rate": 4.308041371134003e-05, + "loss": 2.0418, + "step": 3440 + }, + { + "epoch": 0.27, + "grad_norm": 0.663684064300936, + "learning_rate": 4.307609903954971e-05, + "loss": 1.9948, + "step": 3441 + }, + { + "epoch": 0.27, + "grad_norm": 0.5690660750320651, + "learning_rate": 4.307178323919429e-05, + "loss": 2.0209, + "step": 3442 + }, + { + "epoch": 0.27, + "grad_norm": 0.7775104219250217, + "learning_rate": 4.3067466310543184e-05, + "loss": 2.151, + "step": 3443 + }, + { + "epoch": 0.27, + "grad_norm": 0.5791585319961575, + "learning_rate": 4.3063148253865934e-05, + "loss": 2.0003, + "step": 3444 + }, + { + "epoch": 0.27, + "grad_norm": 0.6865458958024743, + "learning_rate": 4.305882906943214e-05, + "loss": 1.9677, + "step": 3445 + }, + { + "epoch": 0.27, + "grad_norm": 0.7435018116992348, + "learning_rate": 4.305450875751146e-05, + "loss": 2.0587, + "step": 3446 + }, + { + "epoch": 0.27, + "grad_norm": 0.574147006628735, + "learning_rate": 4.3050187318373624e-05, + "loss": 2.0023, + "step": 3447 + }, + { + "epoch": 0.27, + "grad_norm": 0.8285860855126721, + "learning_rate": 4.3045864752288445e-05, + "loss": 2.172, + "step": 3448 + }, + { + "epoch": 0.27, + "grad_norm": 0.712835756689722, + "learning_rate": 4.304154105952579e-05, + "loss": 1.9576, + "step": 3449 + }, + { + "epoch": 0.27, + "grad_norm": 0.6669105143428722, + "learning_rate": 4.303721624035561e-05, + "loss": 2.003, + "step": 3450 + }, + { + "epoch": 0.27, + "grad_norm": 0.6677012174665905, + "learning_rate": 4.303289029504793e-05, + "loss": 2.1459, + "step": 3451 + }, + { + "epoch": 0.27, + "grad_norm": 0.7140700264503651, + "learning_rate": 4.302856322387282e-05, + "loss": 2.0528, + "step": 3452 + }, + { + "epoch": 0.27, + "grad_norm": 0.5543060339324902, + "learning_rate": 4.302423502710044e-05, + "loss": 2.0025, + "step": 3453 + }, + { + "epoch": 0.27, + "grad_norm": 0.7029809762978481, + "learning_rate": 4.301990570500104e-05, + "loss": 1.9334, + "step": 3454 + }, + { + "epoch": 0.27, + "grad_norm": 0.6933270363547249, + "learning_rate": 4.3015575257844886e-05, + "loss": 1.9917, + "step": 3455 + }, + { + "epoch": 0.27, + "grad_norm": 0.6170275333723124, + "learning_rate": 4.301124368590236e-05, + "loss": 2.1338, + "step": 3456 + }, + { + "epoch": 0.27, + "grad_norm": 0.6934229534188497, + "learning_rate": 4.30069109894439e-05, + "loss": 1.9524, + "step": 3457 + }, + { + "epoch": 0.27, + "grad_norm": 0.7483831431055771, + "learning_rate": 4.300257716874001e-05, + "loss": 2.0487, + "step": 3458 + }, + { + "epoch": 0.27, + "grad_norm": 0.7167312180081552, + "learning_rate": 4.299824222406128e-05, + "loss": 2.0266, + "step": 3459 + }, + { + "epoch": 0.27, + "grad_norm": 0.6944895412136012, + "learning_rate": 4.2993906155678345e-05, + "loss": 2.1665, + "step": 3460 + }, + { + "epoch": 0.27, + "grad_norm": 0.596947623670719, + "learning_rate": 4.298956896386192e-05, + "loss": 1.9717, + "step": 3461 + }, + { + "epoch": 0.27, + "grad_norm": 0.5938412641282599, + "learning_rate": 4.2985230648882815e-05, + "loss": 1.9447, + "step": 3462 + }, + { + "epoch": 0.27, + "grad_norm": 0.7036160327129625, + "learning_rate": 4.298089121101187e-05, + "loss": 2.1877, + "step": 3463 + }, + { + "epoch": 0.27, + "grad_norm": 0.5550288729252366, + "learning_rate": 4.297655065052001e-05, + "loss": 2.1042, + "step": 3464 + }, + { + "epoch": 0.27, + "grad_norm": 0.6381540682304744, + "learning_rate": 4.297220896767825e-05, + "loss": 1.969, + "step": 3465 + }, + { + "epoch": 0.27, + "grad_norm": 0.5916594263963448, + "learning_rate": 4.296786616275765e-05, + "loss": 1.9972, + "step": 3466 + }, + { + "epoch": 0.27, + "grad_norm": 0.5480182294382414, + "learning_rate": 4.296352223602936e-05, + "loss": 1.9891, + "step": 3467 + }, + { + "epoch": 0.27, + "grad_norm": 0.5908297581273191, + "learning_rate": 4.2959177187764576e-05, + "loss": 2.207, + "step": 3468 + }, + { + "epoch": 0.27, + "grad_norm": 0.568584987974873, + "learning_rate": 4.2954831018234575e-05, + "loss": 1.9753, + "step": 3469 + }, + { + "epoch": 0.27, + "grad_norm": 0.5644334140005699, + "learning_rate": 4.295048372771072e-05, + "loss": 1.9906, + "step": 3470 + }, + { + "epoch": 0.27, + "grad_norm": 0.5875469908771835, + "learning_rate": 4.294613531646443e-05, + "loss": 2.0638, + "step": 3471 + }, + { + "epoch": 0.27, + "grad_norm": 0.5592134964810097, + "learning_rate": 4.294178578476718e-05, + "loss": 2.1786, + "step": 3472 + }, + { + "epoch": 0.27, + "grad_norm": 0.5542664728941481, + "learning_rate": 4.2937435132890534e-05, + "loss": 1.9597, + "step": 3473 + }, + { + "epoch": 0.27, + "grad_norm": 0.5675406162569849, + "learning_rate": 4.293308336110613e-05, + "loss": 1.9186, + "step": 3474 + }, + { + "epoch": 0.27, + "grad_norm": 0.588528405367183, + "learning_rate": 4.2928730469685654e-05, + "loss": 2.1698, + "step": 3475 + }, + { + "epoch": 0.27, + "grad_norm": 0.5699638321728565, + "learning_rate": 4.292437645890089e-05, + "loss": 1.992, + "step": 3476 + }, + { + "epoch": 0.27, + "grad_norm": 0.5481057694382944, + "learning_rate": 4.292002132902366e-05, + "loss": 2.0218, + "step": 3477 + }, + { + "epoch": 0.27, + "grad_norm": 0.5445649490794684, + "learning_rate": 4.291566508032589e-05, + "loss": 2.0208, + "step": 3478 + }, + { + "epoch": 0.27, + "grad_norm": 0.5956980812701832, + "learning_rate": 4.291130771307955e-05, + "loss": 1.9786, + "step": 3479 + }, + { + "epoch": 0.27, + "grad_norm": 0.5826647802249104, + "learning_rate": 4.2906949227556684e-05, + "loss": 2.1842, + "step": 3480 + }, + { + "epoch": 0.27, + "grad_norm": 0.5498371002027868, + "learning_rate": 4.290258962402941e-05, + "loss": 1.9583, + "step": 3481 + }, + { + "epoch": 0.27, + "grad_norm": 0.5373103475212477, + "learning_rate": 4.289822890276992e-05, + "loss": 1.9537, + "step": 3482 + }, + { + "epoch": 0.27, + "grad_norm": 0.6016919352380754, + "learning_rate": 4.289386706405047e-05, + "loss": 2.051, + "step": 3483 + }, + { + "epoch": 0.27, + "grad_norm": 0.5436215769634162, + "learning_rate": 4.28895041081434e-05, + "loss": 2.1338, + "step": 3484 + }, + { + "epoch": 0.27, + "grad_norm": 0.5808837844726863, + "learning_rate": 4.2885140035321094e-05, + "loss": 2.016, + "step": 3485 + }, + { + "epoch": 0.27, + "grad_norm": 0.6209222730763857, + "learning_rate": 4.288077484585602e-05, + "loss": 1.9391, + "step": 3486 + }, + { + "epoch": 0.27, + "grad_norm": 0.5734919608423779, + "learning_rate": 4.2876408540020716e-05, + "loss": 1.9745, + "step": 3487 + }, + { + "epoch": 0.27, + "grad_norm": 0.60775074838826, + "learning_rate": 4.287204111808778e-05, + "loss": 2.1612, + "step": 3488 + }, + { + "epoch": 0.27, + "grad_norm": 0.5629158467320676, + "learning_rate": 4.286767258032991e-05, + "loss": 2.0187, + "step": 3489 + }, + { + "epoch": 0.27, + "grad_norm": 0.5691637534575228, + "learning_rate": 4.286330292701983e-05, + "loss": 1.9552, + "step": 3490 + }, + { + "epoch": 0.27, + "grad_norm": 0.611181244835064, + "learning_rate": 4.285893215843037e-05, + "loss": 1.9683, + "step": 3491 + }, + { + "epoch": 0.27, + "grad_norm": 0.609905064067679, + "learning_rate": 4.285456027483441e-05, + "loss": 2.1753, + "step": 3492 + }, + { + "epoch": 0.27, + "grad_norm": 0.5841646607916439, + "learning_rate": 4.2850187276504896e-05, + "loss": 2.009, + "step": 3493 + }, + { + "epoch": 0.27, + "grad_norm": 0.680033094241314, + "learning_rate": 4.2845813163714865e-05, + "loss": 1.9663, + "step": 3494 + }, + { + "epoch": 0.27, + "grad_norm": 0.6142553429540658, + "learning_rate": 4.28414379367374e-05, + "loss": 2.1743, + "step": 3495 + }, + { + "epoch": 0.27, + "grad_norm": 0.6085222728724563, + "learning_rate": 4.2837061595845676e-05, + "loss": 2.0248, + "step": 3496 + }, + { + "epoch": 0.27, + "grad_norm": 0.5852490288440427, + "learning_rate": 4.2832684141312916e-05, + "loss": 1.966, + "step": 3497 + }, + { + "epoch": 0.27, + "grad_norm": 0.5201748258194919, + "learning_rate": 4.282830557341243e-05, + "loss": 1.9415, + "step": 3498 + }, + { + "epoch": 0.27, + "grad_norm": 0.6259861356398713, + "learning_rate": 4.2823925892417586e-05, + "loss": 1.9744, + "step": 3499 + }, + { + "epoch": 0.27, + "grad_norm": 0.5707288307950139, + "learning_rate": 4.2819545098601833e-05, + "loss": 2.1729, + "step": 3500 + }, + { + "epoch": 0.27, + "grad_norm": 0.5469073015790462, + "learning_rate": 4.281516319223866e-05, + "loss": 1.9067, + "step": 3501 + }, + { + "epoch": 0.27, + "grad_norm": 0.5567885146327375, + "learning_rate": 4.281078017360167e-05, + "loss": 2.0487, + "step": 3502 + }, + { + "epoch": 0.27, + "grad_norm": 0.5760709394883137, + "learning_rate": 4.2806396042964506e-05, + "loss": 1.9843, + "step": 3503 + }, + { + "epoch": 0.27, + "grad_norm": 0.5365022875675651, + "learning_rate": 4.2802010800600887e-05, + "loss": 2.2496, + "step": 3504 + }, + { + "epoch": 0.27, + "grad_norm": 0.5724367302351668, + "learning_rate": 4.27976244467846e-05, + "loss": 1.9944, + "step": 3505 + }, + { + "epoch": 0.27, + "grad_norm": 0.5566146561870787, + "learning_rate": 4.2793236981789507e-05, + "loss": 1.9878, + "step": 3506 + }, + { + "epoch": 0.27, + "grad_norm": 0.5527466764045958, + "learning_rate": 4.278884840588953e-05, + "loss": 1.9375, + "step": 3507 + }, + { + "epoch": 0.27, + "grad_norm": 0.5495380399005712, + "learning_rate": 4.278445871935866e-05, + "loss": 2.1744, + "step": 3508 + }, + { + "epoch": 0.27, + "grad_norm": 0.5737131415803041, + "learning_rate": 4.278006792247099e-05, + "loss": 1.9496, + "step": 3509 + }, + { + "epoch": 0.27, + "grad_norm": 0.5389822741873364, + "learning_rate": 4.277567601550063e-05, + "loss": 1.994, + "step": 3510 + }, + { + "epoch": 0.27, + "grad_norm": 0.6161358148327485, + "learning_rate": 4.277128299872178e-05, + "loss": 1.9585, + "step": 3511 + }, + { + "epoch": 0.27, + "grad_norm": 0.6175374787626668, + "learning_rate": 4.276688887240874e-05, + "loss": 2.1893, + "step": 3512 + }, + { + "epoch": 0.27, + "grad_norm": 0.6075712968975086, + "learning_rate": 4.276249363683584e-05, + "loss": 1.9821, + "step": 3513 + }, + { + "epoch": 0.27, + "grad_norm": 0.5094120351756142, + "learning_rate": 4.2758097292277485e-05, + "loss": 2.0547, + "step": 3514 + }, + { + "epoch": 0.27, + "grad_norm": 0.6095360460770262, + "learning_rate": 4.2753699839008165e-05, + "loss": 1.928, + "step": 3515 + }, + { + "epoch": 0.27, + "grad_norm": 0.6031052188996743, + "learning_rate": 4.2749301277302436e-05, + "loss": 2.1757, + "step": 3516 + }, + { + "epoch": 0.27, + "grad_norm": 0.5632300347640559, + "learning_rate": 4.2744901607434907e-05, + "loss": 1.9888, + "step": 3517 + }, + { + "epoch": 0.27, + "grad_norm": 0.5516064651210182, + "learning_rate": 4.274050082968027e-05, + "loss": 1.9775, + "step": 3518 + }, + { + "epoch": 0.27, + "grad_norm": 0.6232206105397682, + "learning_rate": 4.27360989443133e-05, + "loss": 1.9351, + "step": 3519 + }, + { + "epoch": 0.27, + "grad_norm": 0.5510369828956985, + "learning_rate": 4.2731695951608796e-05, + "loss": 2.1984, + "step": 3520 + }, + { + "epoch": 0.27, + "grad_norm": 0.6898463706651472, + "learning_rate": 4.272729185184168e-05, + "loss": 1.9656, + "step": 3521 + }, + { + "epoch": 0.27, + "grad_norm": 0.5434207460363776, + "learning_rate": 4.2722886645286896e-05, + "loss": 1.9417, + "step": 3522 + }, + { + "epoch": 0.27, + "grad_norm": 0.6158325564207758, + "learning_rate": 4.27184803322195e-05, + "loss": 1.9336, + "step": 3523 + }, + { + "epoch": 0.27, + "grad_norm": 0.5631465895752555, + "learning_rate": 4.271407291291459e-05, + "loss": 2.1689, + "step": 3524 + }, + { + "epoch": 0.27, + "grad_norm": 0.6401951007254659, + "learning_rate": 4.270966438764733e-05, + "loss": 1.9601, + "step": 3525 + }, + { + "epoch": 0.27, + "grad_norm": 0.5830976587396529, + "learning_rate": 4.270525475669297e-05, + "loss": 1.9174, + "step": 3526 + }, + { + "epoch": 0.27, + "grad_norm": 0.6569767585344545, + "learning_rate": 4.270084402032682e-05, + "loss": 2.0499, + "step": 3527 + }, + { + "epoch": 0.27, + "grad_norm": 0.6160872431867402, + "learning_rate": 4.269643217882427e-05, + "loss": 2.1516, + "step": 3528 + }, + { + "epoch": 0.27, + "grad_norm": 0.5621347350661055, + "learning_rate": 4.2692019232460755e-05, + "loss": 1.9803, + "step": 3529 + }, + { + "epoch": 0.27, + "grad_norm": 0.6838445083236101, + "learning_rate": 4.2687605181511796e-05, + "loss": 1.9519, + "step": 3530 + }, + { + "epoch": 0.27, + "grad_norm": 0.6362231150004192, + "learning_rate": 4.2683190026252984e-05, + "loss": 1.9883, + "step": 3531 + }, + { + "epoch": 0.27, + "grad_norm": 0.6638870352443634, + "learning_rate": 4.267877376695998e-05, + "loss": 2.1864, + "step": 3532 + }, + { + "epoch": 0.27, + "grad_norm": 0.5835125272321948, + "learning_rate": 4.26743564039085e-05, + "loss": 2.0174, + "step": 3533 + }, + { + "epoch": 0.27, + "grad_norm": 0.5749277383190746, + "learning_rate": 4.266993793737434e-05, + "loss": 1.9989, + "step": 3534 + }, + { + "epoch": 0.27, + "grad_norm": 0.6846675944468018, + "learning_rate": 4.266551836763336e-05, + "loss": 1.9736, + "step": 3535 + }, + { + "epoch": 0.27, + "grad_norm": 0.6025763259762551, + "learning_rate": 4.2661097694961506e-05, + "loss": 2.1971, + "step": 3536 + }, + { + "epoch": 0.27, + "grad_norm": 0.6117460003735535, + "learning_rate": 4.2656675919634756e-05, + "loss": 2.0025, + "step": 3537 + }, + { + "epoch": 0.27, + "grad_norm": 0.6263141921681835, + "learning_rate": 4.2652253041929205e-05, + "loss": 1.9023, + "step": 3538 + }, + { + "epoch": 0.27, + "grad_norm": 0.6386828240778866, + "learning_rate": 4.264782906212098e-05, + "loss": 2.0203, + "step": 3539 + }, + { + "epoch": 0.27, + "grad_norm": 0.5962488546407848, + "learning_rate": 4.264340398048628e-05, + "loss": 2.177, + "step": 3540 + }, + { + "epoch": 0.27, + "grad_norm": 0.6400321250285554, + "learning_rate": 4.263897779730139e-05, + "loss": 1.9227, + "step": 3541 + }, + { + "epoch": 0.27, + "grad_norm": 0.73282255923245, + "learning_rate": 4.2634550512842654e-05, + "loss": 1.9957, + "step": 3542 + }, + { + "epoch": 0.27, + "grad_norm": 0.5829278847102797, + "learning_rate": 4.263012212738649e-05, + "loss": 1.9299, + "step": 3543 + }, + { + "epoch": 0.27, + "grad_norm": 0.7623627785148214, + "learning_rate": 4.262569264120937e-05, + "loss": 2.1304, + "step": 3544 + }, + { + "epoch": 0.27, + "grad_norm": 0.6822532274608936, + "learning_rate": 4.262126205458785e-05, + "loss": 2.0581, + "step": 3545 + }, + { + "epoch": 0.27, + "grad_norm": 0.5622786746964618, + "learning_rate": 4.2616830367798555e-05, + "loss": 1.9939, + "step": 3546 + }, + { + "epoch": 0.27, + "grad_norm": 0.8214571913001184, + "learning_rate": 4.2612397581118165e-05, + "loss": 1.9589, + "step": 3547 + }, + { + "epoch": 0.27, + "grad_norm": 0.5774588481513999, + "learning_rate": 4.260796369482344e-05, + "loss": 2.1912, + "step": 3548 + }, + { + "epoch": 0.27, + "grad_norm": 0.692268545416117, + "learning_rate": 4.2603528709191206e-05, + "loss": 1.9431, + "step": 3549 + }, + { + "epoch": 0.27, + "grad_norm": 0.6791923928694871, + "learning_rate": 4.259909262449836e-05, + "loss": 1.9993, + "step": 3550 + }, + { + "epoch": 0.27, + "grad_norm": 0.5751758330601876, + "learning_rate": 4.259465544102186e-05, + "loss": 2.0241, + "step": 3551 + }, + { + "epoch": 0.27, + "grad_norm": 0.631682776599926, + "learning_rate": 4.259021715903874e-05, + "loss": 2.1648, + "step": 3552 + }, + { + "epoch": 0.27, + "grad_norm": 0.6333248117430617, + "learning_rate": 4.2585777778826106e-05, + "loss": 2.0004, + "step": 3553 + }, + { + "epoch": 0.27, + "grad_norm": 0.5575869903289661, + "learning_rate": 4.258133730066112e-05, + "loss": 2.0006, + "step": 3554 + }, + { + "epoch": 0.27, + "grad_norm": 0.6207432318493338, + "learning_rate": 4.257689572482101e-05, + "loss": 1.9789, + "step": 3555 + }, + { + "epoch": 0.27, + "grad_norm": 0.6236564842883453, + "learning_rate": 4.25724530515831e-05, + "loss": 2.1376, + "step": 3556 + }, + { + "epoch": 0.27, + "grad_norm": 0.6018490087744807, + "learning_rate": 4.256800928122475e-05, + "loss": 1.9989, + "step": 3557 + }, + { + "epoch": 0.27, + "grad_norm": 0.5837737169990872, + "learning_rate": 4.2563564414023425e-05, + "loss": 2.0121, + "step": 3558 + }, + { + "epoch": 0.27, + "grad_norm": 0.5678897506246137, + "learning_rate": 4.2559118450256606e-05, + "loss": 1.9457, + "step": 3559 + }, + { + "epoch": 0.27, + "grad_norm": 0.5795636050288442, + "learning_rate": 4.25546713902019e-05, + "loss": 2.145, + "step": 3560 + }, + { + "epoch": 0.27, + "grad_norm": 0.5758600301418108, + "learning_rate": 4.255022323413693e-05, + "loss": 1.9447, + "step": 3561 + }, + { + "epoch": 0.27, + "grad_norm": 0.6114102311399662, + "learning_rate": 4.2545773982339435e-05, + "loss": 1.9254, + "step": 3562 + }, + { + "epoch": 0.27, + "grad_norm": 0.5894951209657461, + "learning_rate": 4.254132363508718e-05, + "loss": 1.9254, + "step": 3563 + }, + { + "epoch": 0.27, + "grad_norm": 0.6579455900537443, + "learning_rate": 4.2536872192658036e-05, + "loss": 2.2953, + "step": 3564 + }, + { + "epoch": 0.28, + "grad_norm": 0.5968628123988545, + "learning_rate": 4.2532419655329914e-05, + "loss": 1.9795, + "step": 3565 + }, + { + "epoch": 0.28, + "grad_norm": 0.6507131490752657, + "learning_rate": 4.252796602338081e-05, + "loss": 1.9623, + "step": 3566 + }, + { + "epoch": 0.28, + "grad_norm": 0.6137508647944913, + "learning_rate": 4.252351129708878e-05, + "loss": 2.0012, + "step": 3567 + }, + { + "epoch": 0.28, + "grad_norm": 0.6348735388023643, + "learning_rate": 4.251905547673195e-05, + "loss": 2.1883, + "step": 3568 + }, + { + "epoch": 0.28, + "grad_norm": 0.5625714378638849, + "learning_rate": 4.2514598562588525e-05, + "loss": 1.9412, + "step": 3569 + }, + { + "epoch": 0.28, + "grad_norm": 0.6659271808445483, + "learning_rate": 4.251014055493675e-05, + "loss": 2.0677, + "step": 3570 + }, + { + "epoch": 0.28, + "grad_norm": 0.5666380485461262, + "learning_rate": 4.2505681454054976e-05, + "loss": 1.9743, + "step": 3571 + }, + { + "epoch": 0.28, + "grad_norm": 0.6660798478226465, + "learning_rate": 4.250122126022158e-05, + "loss": 2.189, + "step": 3572 + }, + { + "epoch": 0.28, + "grad_norm": 0.6894931664937108, + "learning_rate": 4.2496759973715057e-05, + "loss": 1.9843, + "step": 3573 + }, + { + "epoch": 0.28, + "grad_norm": 0.6268832561340679, + "learning_rate": 4.249229759481392e-05, + "loss": 1.9398, + "step": 3574 + }, + { + "epoch": 0.28, + "grad_norm": 0.69379858918675, + "learning_rate": 4.2487834123796796e-05, + "loss": 1.967, + "step": 3575 + }, + { + "epoch": 0.28, + "grad_norm": 0.652530298300391, + "learning_rate": 4.2483369560942334e-05, + "loss": 2.1919, + "step": 3576 + }, + { + "epoch": 0.28, + "grad_norm": 0.6466408698572437, + "learning_rate": 4.247890390652929e-05, + "loss": 1.9517, + "step": 3577 + }, + { + "epoch": 0.28, + "grad_norm": 0.7099083335149067, + "learning_rate": 4.2474437160836475e-05, + "loss": 2.0187, + "step": 3578 + }, + { + "epoch": 0.28, + "grad_norm": 0.6085220034334953, + "learning_rate": 4.246996932414276e-05, + "loss": 1.9295, + "step": 3579 + }, + { + "epoch": 0.28, + "grad_norm": 0.6055128721805431, + "learning_rate": 4.246550039672709e-05, + "loss": 2.1425, + "step": 3580 + }, + { + "epoch": 0.28, + "grad_norm": 0.6548863555268384, + "learning_rate": 4.246103037886848e-05, + "loss": 1.9823, + "step": 3581 + }, + { + "epoch": 0.28, + "grad_norm": 0.5452235542876803, + "learning_rate": 4.2456559270846006e-05, + "loss": 2.0255, + "step": 3582 + }, + { + "epoch": 0.28, + "grad_norm": 0.60667387122125, + "learning_rate": 4.245208707293883e-05, + "loss": 1.9773, + "step": 3583 + }, + { + "epoch": 0.28, + "grad_norm": 0.5532194008719898, + "learning_rate": 4.2447613785426166e-05, + "loss": 2.158, + "step": 3584 + }, + { + "epoch": 0.28, + "grad_norm": 0.6363204850272001, + "learning_rate": 4.2443139408587295e-05, + "loss": 1.98, + "step": 3585 + }, + { + "epoch": 0.28, + "grad_norm": 0.6124182363098932, + "learning_rate": 4.243866394270157e-05, + "loss": 1.9029, + "step": 3586 + }, + { + "epoch": 0.28, + "grad_norm": 0.6270547089256402, + "learning_rate": 4.243418738804842e-05, + "loss": 1.9367, + "step": 3587 + }, + { + "epoch": 0.28, + "grad_norm": 0.5550182301349645, + "learning_rate": 4.242970974490732e-05, + "loss": 2.1668, + "step": 3588 + }, + { + "epoch": 0.28, + "grad_norm": 0.524093186701037, + "learning_rate": 4.242523101355784e-05, + "loss": 2.0382, + "step": 3589 + }, + { + "epoch": 0.28, + "grad_norm": 0.58839779694442, + "learning_rate": 4.242075119427961e-05, + "loss": 1.9615, + "step": 3590 + }, + { + "epoch": 0.28, + "grad_norm": 0.5847928684804756, + "learning_rate": 4.241627028735231e-05, + "loss": 1.9393, + "step": 3591 + }, + { + "epoch": 0.28, + "grad_norm": 0.5915792754609337, + "learning_rate": 4.2411788293055714e-05, + "loss": 2.1656, + "step": 3592 + }, + { + "epoch": 0.28, + "grad_norm": 0.5692197619522463, + "learning_rate": 4.240730521166964e-05, + "loss": 1.9471, + "step": 3593 + }, + { + "epoch": 0.28, + "grad_norm": 0.5302539583684146, + "learning_rate": 4.2402821043474e-05, + "loss": 1.9468, + "step": 3594 + }, + { + "epoch": 0.28, + "grad_norm": 0.5800767614250936, + "learning_rate": 4.239833578874874e-05, + "loss": 2.0516, + "step": 3595 + }, + { + "epoch": 0.28, + "grad_norm": 0.610295258296427, + "learning_rate": 4.2393849447773906e-05, + "loss": 2.169, + "step": 3596 + }, + { + "epoch": 0.28, + "grad_norm": 0.5541249686121217, + "learning_rate": 4.238936202082959e-05, + "loss": 1.978, + "step": 3597 + }, + { + "epoch": 0.28, + "grad_norm": 0.6451316626778092, + "learning_rate": 4.238487350819597e-05, + "loss": 1.922, + "step": 3598 + }, + { + "epoch": 0.28, + "grad_norm": 0.594026999119571, + "learning_rate": 4.238038391015328e-05, + "loss": 1.9515, + "step": 3599 + }, + { + "epoch": 0.28, + "grad_norm": 0.6913281622954055, + "learning_rate": 4.237589322698182e-05, + "loss": 2.1972, + "step": 3600 + }, + { + "epoch": 0.28, + "grad_norm": 0.5723853604082905, + "learning_rate": 4.2371401458961965e-05, + "loss": 2.0434, + "step": 3601 + }, + { + "epoch": 0.28, + "grad_norm": 0.5638424577995461, + "learning_rate": 4.2366908606374154e-05, + "loss": 1.9402, + "step": 3602 + }, + { + "epoch": 0.28, + "grad_norm": 0.5650151943904794, + "learning_rate": 4.23624146694989e-05, + "loss": 1.9579, + "step": 3603 + }, + { + "epoch": 0.28, + "grad_norm": 0.6580841009359153, + "learning_rate": 4.2357919648616755e-05, + "loss": 2.1871, + "step": 3604 + }, + { + "epoch": 0.28, + "grad_norm": 0.6327577715920094, + "learning_rate": 4.235342354400839e-05, + "loss": 1.9753, + "step": 3605 + }, + { + "epoch": 0.28, + "grad_norm": 0.5814117013969183, + "learning_rate": 4.23489263559545e-05, + "loss": 1.9585, + "step": 3606 + }, + { + "epoch": 0.28, + "grad_norm": 0.6753239856025098, + "learning_rate": 4.2344428084735865e-05, + "loss": 2.0341, + "step": 3607 + }, + { + "epoch": 0.28, + "grad_norm": 0.5747846217294922, + "learning_rate": 4.2339928730633335e-05, + "loss": 1.9766, + "step": 3608 + }, + { + "epoch": 0.28, + "grad_norm": 0.6735753855103506, + "learning_rate": 4.2335428293927823e-05, + "loss": 2.1485, + "step": 3609 + }, + { + "epoch": 0.28, + "grad_norm": 0.617259619016361, + "learning_rate": 4.23309267749003e-05, + "loss": 1.9316, + "step": 3610 + }, + { + "epoch": 0.28, + "grad_norm": 0.5617756648780682, + "learning_rate": 4.232642417383182e-05, + "loss": 1.9694, + "step": 3611 + }, + { + "epoch": 0.28, + "grad_norm": 0.6015126110128293, + "learning_rate": 4.2321920491003505e-05, + "loss": 2.1618, + "step": 3612 + }, + { + "epoch": 0.28, + "grad_norm": 0.5470485102142324, + "learning_rate": 4.231741572669654e-05, + "loss": 1.9627, + "step": 3613 + }, + { + "epoch": 0.28, + "grad_norm": 0.6729148117988755, + "learning_rate": 4.2312909881192164e-05, + "loss": 1.9969, + "step": 3614 + }, + { + "epoch": 0.28, + "grad_norm": 0.6340268730218388, + "learning_rate": 4.23084029547717e-05, + "loss": 1.9647, + "step": 3615 + }, + { + "epoch": 0.28, + "grad_norm": 0.6350587568855608, + "learning_rate": 4.2303894947716525e-05, + "loss": 2.1933, + "step": 3616 + }, + { + "epoch": 0.28, + "grad_norm": 0.5810210642753171, + "learning_rate": 4.229938586030812e-05, + "loss": 1.9459, + "step": 3617 + }, + { + "epoch": 0.28, + "grad_norm": 0.6103930794581048, + "learning_rate": 4.2294875692827976e-05, + "loss": 1.9919, + "step": 3618 + }, + { + "epoch": 0.28, + "grad_norm": 0.5695218794060828, + "learning_rate": 4.229036444555769e-05, + "loss": 1.9077, + "step": 3619 + }, + { + "epoch": 0.28, + "grad_norm": 0.538446693709765, + "learning_rate": 4.228585211877893e-05, + "loss": 2.0638, + "step": 3620 + }, + { + "epoch": 0.28, + "grad_norm": 0.5967420469338331, + "learning_rate": 4.2281338712773416e-05, + "loss": 2.1515, + "step": 3621 + }, + { + "epoch": 0.28, + "grad_norm": 0.566508999026493, + "learning_rate": 4.227682422782293e-05, + "loss": 1.9358, + "step": 3622 + }, + { + "epoch": 0.28, + "grad_norm": 0.5647954476459947, + "learning_rate": 4.227230866420932e-05, + "loss": 1.9863, + "step": 3623 + }, + { + "epoch": 0.28, + "grad_norm": 0.621558945295215, + "learning_rate": 4.226779202221453e-05, + "loss": 2.146, + "step": 3624 + }, + { + "epoch": 0.28, + "grad_norm": 0.5398533495844497, + "learning_rate": 4.2263274302120546e-05, + "loss": 1.9566, + "step": 3625 + }, + { + "epoch": 0.28, + "grad_norm": 0.577457045530722, + "learning_rate": 4.225875550420943e-05, + "loss": 2.07, + "step": 3626 + }, + { + "epoch": 0.28, + "grad_norm": 0.5599499195681623, + "learning_rate": 4.2254235628763306e-05, + "loss": 1.9593, + "step": 3627 + }, + { + "epoch": 0.28, + "grad_norm": 0.5528940413054003, + "learning_rate": 4.224971467606437e-05, + "loss": 2.1501, + "step": 3628 + }, + { + "epoch": 0.28, + "grad_norm": 0.5897714662077573, + "learning_rate": 4.224519264639488e-05, + "loss": 1.9869, + "step": 3629 + }, + { + "epoch": 0.28, + "grad_norm": 0.58070952960925, + "learning_rate": 4.2240669540037184e-05, + "loss": 1.955, + "step": 3630 + }, + { + "epoch": 0.28, + "grad_norm": 0.599571654178837, + "learning_rate": 4.223614535727364e-05, + "loss": 1.9775, + "step": 3631 + }, + { + "epoch": 0.28, + "grad_norm": 0.6161370519779155, + "learning_rate": 4.2231620098386745e-05, + "loss": 2.0454, + "step": 3632 + }, + { + "epoch": 0.28, + "grad_norm": 0.5641901942704897, + "learning_rate": 4.2227093763659015e-05, + "loss": 2.18, + "step": 3633 + }, + { + "epoch": 0.28, + "grad_norm": 0.6682596416905594, + "learning_rate": 4.222256635337305e-05, + "loss": 1.9559, + "step": 3634 + }, + { + "epoch": 0.28, + "grad_norm": 0.62565096363727, + "learning_rate": 4.221803786781152e-05, + "loss": 1.9523, + "step": 3635 + }, + { + "epoch": 0.28, + "grad_norm": 0.5632185529378687, + "learning_rate": 4.221350830725714e-05, + "loss": 2.2169, + "step": 3636 + }, + { + "epoch": 0.28, + "grad_norm": 0.6509183960603653, + "learning_rate": 4.220897767199274e-05, + "loss": 1.965, + "step": 3637 + }, + { + "epoch": 0.28, + "grad_norm": 0.6192762582637117, + "learning_rate": 4.2204445962301157e-05, + "loss": 2.068, + "step": 3638 + }, + { + "epoch": 0.28, + "grad_norm": 0.680031407651879, + "learning_rate": 4.219991317846534e-05, + "loss": 1.9664, + "step": 3639 + }, + { + "epoch": 0.28, + "grad_norm": 0.6869830079391385, + "learning_rate": 4.219537932076828e-05, + "loss": 1.9401, + "step": 3640 + }, + { + "epoch": 0.28, + "grad_norm": 0.6236129672761939, + "learning_rate": 4.219084438949305e-05, + "loss": 2.1334, + "step": 3641 + }, + { + "epoch": 0.28, + "grad_norm": 0.7697485014305365, + "learning_rate": 4.218630838492278e-05, + "loss": 1.9315, + "step": 3642 + }, + { + "epoch": 0.28, + "grad_norm": 0.6271973277901026, + "learning_rate": 4.218177130734068e-05, + "loss": 1.9801, + "step": 3643 + }, + { + "epoch": 0.28, + "grad_norm": 0.7424544855148989, + "learning_rate": 4.2177233157030005e-05, + "loss": 2.1955, + "step": 3644 + }, + { + "epoch": 0.28, + "grad_norm": 0.7151312443573993, + "learning_rate": 4.21726939342741e-05, + "loss": 2.0733, + "step": 3645 + }, + { + "epoch": 0.28, + "grad_norm": 0.5976545533176514, + "learning_rate": 4.216815363935637e-05, + "loss": 1.9701, + "step": 3646 + }, + { + "epoch": 0.28, + "grad_norm": 0.9199907789832565, + "learning_rate": 4.2163612272560285e-05, + "loss": 1.9659, + "step": 3647 + }, + { + "epoch": 0.28, + "grad_norm": 0.7140632782975725, + "learning_rate": 4.2159069834169366e-05, + "loss": 2.1708, + "step": 3648 + }, + { + "epoch": 0.28, + "grad_norm": 0.8030525450295289, + "learning_rate": 4.215452632446723e-05, + "loss": 1.9654, + "step": 3649 + }, + { + "epoch": 0.28, + "grad_norm": 0.7162447764218811, + "learning_rate": 4.214998174373754e-05, + "loss": 2.0132, + "step": 3650 + }, + { + "epoch": 0.28, + "grad_norm": 0.717803541402079, + "learning_rate": 4.2145436092264036e-05, + "loss": 1.9878, + "step": 3651 + }, + { + "epoch": 0.28, + "grad_norm": 0.7983918925216665, + "learning_rate": 4.214088937033053e-05, + "loss": 1.9996, + "step": 3652 + }, + { + "epoch": 0.28, + "grad_norm": 0.6256055976217183, + "learning_rate": 4.213634157822087e-05, + "loss": 2.1418, + "step": 3653 + }, + { + "epoch": 0.28, + "grad_norm": 0.6976242666758139, + "learning_rate": 4.213179271621901e-05, + "loss": 1.945, + "step": 3654 + }, + { + "epoch": 0.28, + "grad_norm": 0.7891417967130395, + "learning_rate": 4.2127242784608964e-05, + "loss": 1.9337, + "step": 3655 + }, + { + "epoch": 0.28, + "grad_norm": 0.7066755555934711, + "learning_rate": 4.2122691783674786e-05, + "loss": 2.1451, + "step": 3656 + }, + { + "epoch": 0.28, + "grad_norm": 0.6921345097396429, + "learning_rate": 4.211813971370061e-05, + "loss": 2.0568, + "step": 3657 + }, + { + "epoch": 0.28, + "grad_norm": 0.66107110219017, + "learning_rate": 4.211358657497065e-05, + "loss": 1.9833, + "step": 3658 + }, + { + "epoch": 0.28, + "grad_norm": 0.6506189651012769, + "learning_rate": 4.2109032367769184e-05, + "loss": 1.9113, + "step": 3659 + }, + { + "epoch": 0.28, + "grad_norm": 0.7299732300683766, + "learning_rate": 4.210447709238054e-05, + "loss": 2.1513, + "step": 3660 + }, + { + "epoch": 0.28, + "grad_norm": 0.6536139581473334, + "learning_rate": 4.209992074908912e-05, + "loss": 2.0024, + "step": 3661 + }, + { + "epoch": 0.28, + "grad_norm": 0.7630059739097375, + "learning_rate": 4.20953633381794e-05, + "loss": 1.9434, + "step": 3662 + }, + { + "epoch": 0.28, + "grad_norm": 0.627320424637434, + "learning_rate": 4.209080485993592e-05, + "loss": 2.0309, + "step": 3663 + }, + { + "epoch": 0.28, + "grad_norm": 0.6609549788504543, + "learning_rate": 4.208624531464328e-05, + "loss": 1.9348, + "step": 3664 + }, + { + "epoch": 0.28, + "grad_norm": 0.8144441512133345, + "learning_rate": 4.2081684702586146e-05, + "loss": 2.1636, + "step": 3665 + }, + { + "epoch": 0.28, + "grad_norm": 0.6368279950705021, + "learning_rate": 4.207712302404927e-05, + "loss": 1.9436, + "step": 3666 + }, + { + "epoch": 0.28, + "grad_norm": 0.67509957430749, + "learning_rate": 4.207256027931745e-05, + "loss": 1.9727, + "step": 3667 + }, + { + "epoch": 0.28, + "grad_norm": 0.7246471972744529, + "learning_rate": 4.2067996468675554e-05, + "loss": 2.1569, + "step": 3668 + }, + { + "epoch": 0.28, + "grad_norm": 0.667741795146805, + "learning_rate": 4.206343159240852e-05, + "loss": 2.0284, + "step": 3669 + }, + { + "epoch": 0.28, + "grad_norm": 0.7179018458694636, + "learning_rate": 4.205886565080135e-05, + "loss": 1.9626, + "step": 3670 + }, + { + "epoch": 0.28, + "grad_norm": 0.7388418184937322, + "learning_rate": 4.205429864413912e-05, + "loss": 2.0461, + "step": 3671 + }, + { + "epoch": 0.28, + "grad_norm": 0.7171662474538194, + "learning_rate": 4.2049730572706975e-05, + "loss": 1.9874, + "step": 3672 + }, + { + "epoch": 0.28, + "grad_norm": 0.5848568328763255, + "learning_rate": 4.2045161436790105e-05, + "loss": 2.1068, + "step": 3673 + }, + { + "epoch": 0.28, + "grad_norm": 0.6474219331114105, + "learning_rate": 4.204059123667378e-05, + "loss": 1.9379, + "step": 3674 + }, + { + "epoch": 0.28, + "grad_norm": 0.7147258856222993, + "learning_rate": 4.203601997264333e-05, + "loss": 1.9313, + "step": 3675 + }, + { + "epoch": 0.28, + "grad_norm": 0.534804032294127, + "learning_rate": 4.203144764498418e-05, + "loss": 1.9753, + "step": 3676 + }, + { + "epoch": 0.28, + "grad_norm": 0.7070121154045783, + "learning_rate": 4.2026874253981784e-05, + "loss": 2.1294, + "step": 3677 + }, + { + "epoch": 0.28, + "grad_norm": 0.6047364049344168, + "learning_rate": 4.2022299799921676e-05, + "loss": 1.9219, + "step": 3678 + }, + { + "epoch": 0.28, + "grad_norm": 0.6540707221114954, + "learning_rate": 4.2017724283089465e-05, + "loss": 1.9848, + "step": 3679 + }, + { + "epoch": 0.28, + "grad_norm": 0.6470687965261076, + "learning_rate": 4.201314770377082e-05, + "loss": 2.1433, + "step": 3680 + }, + { + "epoch": 0.28, + "grad_norm": 0.6776147701039897, + "learning_rate": 4.200857006225146e-05, + "loss": 1.9983, + "step": 3681 + }, + { + "epoch": 0.28, + "grad_norm": 0.6336883861353526, + "learning_rate": 4.200399135881721e-05, + "loss": 2.0358, + "step": 3682 + }, + { + "epoch": 0.28, + "grad_norm": 0.6296053071156608, + "learning_rate": 4.199941159375392e-05, + "loss": 1.9927, + "step": 3683 + }, + { + "epoch": 0.28, + "grad_norm": 0.6452340141289513, + "learning_rate": 4.199483076734754e-05, + "loss": 1.9312, + "step": 3684 + }, + { + "epoch": 0.28, + "grad_norm": 0.8134644464980094, + "learning_rate": 4.1990248879884044e-05, + "loss": 2.1655, + "step": 3685 + }, + { + "epoch": 0.28, + "grad_norm": 0.706150096135686, + "learning_rate": 4.198566593164952e-05, + "loss": 1.9844, + "step": 3686 + }, + { + "epoch": 0.28, + "grad_norm": 0.7440402121958606, + "learning_rate": 4.1981081922930086e-05, + "loss": 1.9808, + "step": 3687 + }, + { + "epoch": 0.28, + "grad_norm": 0.8651733446142279, + "learning_rate": 4.197649685401195e-05, + "loss": 2.0224, + "step": 3688 + }, + { + "epoch": 0.28, + "grad_norm": 0.716079448776735, + "learning_rate": 4.197191072518139e-05, + "loss": 2.1476, + "step": 3689 + }, + { + "epoch": 0.28, + "grad_norm": 0.7528872485841568, + "learning_rate": 4.19673235367247e-05, + "loss": 1.98, + "step": 3690 + }, + { + "epoch": 0.28, + "grad_norm": 0.5623067613899365, + "learning_rate": 4.1962735288928305e-05, + "loss": 1.9248, + "step": 3691 + }, + { + "epoch": 0.28, + "grad_norm": 0.8775328457312157, + "learning_rate": 4.195814598207866e-05, + "loss": 2.1676, + "step": 3692 + }, + { + "epoch": 0.28, + "grad_norm": 0.6086395262624, + "learning_rate": 4.1953555616462306e-05, + "loss": 1.9684, + "step": 3693 + }, + { + "epoch": 0.28, + "grad_norm": 0.7308533708234124, + "learning_rate": 4.194896419236581e-05, + "loss": 2.0282, + "step": 3694 + }, + { + "epoch": 0.29, + "grad_norm": 0.6431569238318813, + "learning_rate": 4.194437171007587e-05, + "loss": 1.9371, + "step": 3695 + }, + { + "epoch": 0.29, + "grad_norm": 0.651955892258961, + "learning_rate": 4.193977816987919e-05, + "loss": 1.9868, + "step": 3696 + }, + { + "epoch": 0.29, + "grad_norm": 0.6106225390211322, + "learning_rate": 4.193518357206256e-05, + "loss": 2.1603, + "step": 3697 + }, + { + "epoch": 0.29, + "grad_norm": 0.7169168433234424, + "learning_rate": 4.193058791691285e-05, + "loss": 1.9885, + "step": 3698 + }, + { + "epoch": 0.29, + "grad_norm": 0.6363548164402616, + "learning_rate": 4.192599120471699e-05, + "loss": 2.0001, + "step": 3699 + }, + { + "epoch": 0.29, + "grad_norm": 0.590361218538044, + "learning_rate": 4.192139343576196e-05, + "loss": 2.0251, + "step": 3700 + }, + { + "epoch": 0.29, + "grad_norm": 0.7181527413578267, + "learning_rate": 4.1916794610334816e-05, + "loss": 2.1661, + "step": 3701 + }, + { + "epoch": 0.29, + "grad_norm": 0.6217415248038604, + "learning_rate": 4.191219472872271e-05, + "loss": 1.9796, + "step": 3702 + }, + { + "epoch": 0.29, + "grad_norm": 0.6439466922081448, + "learning_rate": 4.19075937912128e-05, + "loss": 1.9892, + "step": 3703 + }, + { + "epoch": 0.29, + "grad_norm": 0.6608511847562606, + "learning_rate": 4.1902991798092336e-05, + "loss": 1.936, + "step": 3704 + }, + { + "epoch": 0.29, + "grad_norm": 0.6023502963879678, + "learning_rate": 4.1898388749648674e-05, + "loss": 2.1612, + "step": 3705 + }, + { + "epoch": 0.29, + "grad_norm": 0.5909319257948227, + "learning_rate": 4.189378464616918e-05, + "loss": 1.9095, + "step": 3706 + }, + { + "epoch": 0.29, + "grad_norm": 0.5664454631982433, + "learning_rate": 4.18891794879413e-05, + "loss": 2.0211, + "step": 3707 + }, + { + "epoch": 0.29, + "grad_norm": 0.592439128006405, + "learning_rate": 4.188457327525257e-05, + "loss": 1.9933, + "step": 3708 + }, + { + "epoch": 0.29, + "grad_norm": 0.6306148919285279, + "learning_rate": 4.187996600839056e-05, + "loss": 2.1649, + "step": 3709 + }, + { + "epoch": 0.29, + "grad_norm": 0.5813971131414821, + "learning_rate": 4.187535768764293e-05, + "loss": 1.9943, + "step": 3710 + }, + { + "epoch": 0.29, + "grad_norm": 0.6002910987017142, + "learning_rate": 4.18707483132974e-05, + "loss": 1.9844, + "step": 3711 + }, + { + "epoch": 0.29, + "grad_norm": 0.6474852591967267, + "learning_rate": 4.1866137885641736e-05, + "loss": 2.1512, + "step": 3712 + }, + { + "epoch": 0.29, + "grad_norm": 0.5539846145113527, + "learning_rate": 4.18615264049638e-05, + "loss": 2.019, + "step": 3713 + }, + { + "epoch": 0.29, + "grad_norm": 0.5927722425702392, + "learning_rate": 4.1856913871551506e-05, + "loss": 1.9364, + "step": 3714 + }, + { + "epoch": 0.29, + "grad_norm": 0.6342850121416378, + "learning_rate": 4.185230028569283e-05, + "loss": 1.9444, + "step": 3715 + }, + { + "epoch": 0.29, + "grad_norm": 0.6604777794372962, + "learning_rate": 4.18476856476758e-05, + "loss": 1.9796, + "step": 3716 + }, + { + "epoch": 0.29, + "grad_norm": 0.5853262536874004, + "learning_rate": 4.1843069957788564e-05, + "loss": 2.1418, + "step": 3717 + }, + { + "epoch": 0.29, + "grad_norm": 0.6001695196600496, + "learning_rate": 4.183845321631926e-05, + "loss": 1.9306, + "step": 3718 + }, + { + "epoch": 0.29, + "grad_norm": 0.647535376377552, + "learning_rate": 4.1833835423556164e-05, + "loss": 2.0501, + "step": 3719 + }, + { + "epoch": 0.29, + "grad_norm": 0.6275327676420367, + "learning_rate": 4.1829216579787554e-05, + "loss": 1.9902, + "step": 3720 + }, + { + "epoch": 0.29, + "grad_norm": 0.5887762892279188, + "learning_rate": 4.1824596685301827e-05, + "loss": 2.1678, + "step": 3721 + }, + { + "epoch": 0.29, + "grad_norm": 0.6468948178157242, + "learning_rate": 4.181997574038741e-05, + "loss": 1.9457, + "step": 3722 + }, + { + "epoch": 0.29, + "grad_norm": 0.5916795784286839, + "learning_rate": 4.18153537453328e-05, + "loss": 1.9751, + "step": 3723 + }, + { + "epoch": 0.29, + "grad_norm": 0.6727172830218018, + "learning_rate": 4.1810730700426585e-05, + "loss": 1.995, + "step": 3724 + }, + { + "epoch": 0.29, + "grad_norm": 0.5879229392431792, + "learning_rate": 4.1806106605957385e-05, + "loss": 2.2419, + "step": 3725 + }, + { + "epoch": 0.29, + "grad_norm": 0.5803117247129603, + "learning_rate": 4.1801481462213923e-05, + "loss": 2.0243, + "step": 3726 + }, + { + "epoch": 0.29, + "grad_norm": 0.6433096028801361, + "learning_rate": 4.179685526948494e-05, + "loss": 1.9618, + "step": 3727 + }, + { + "epoch": 0.29, + "grad_norm": 0.6557278814775276, + "learning_rate": 4.179222802805928e-05, + "loss": 1.9889, + "step": 3728 + }, + { + "epoch": 0.29, + "grad_norm": 0.6609669072280904, + "learning_rate": 4.178759973822584e-05, + "loss": 2.1725, + "step": 3729 + }, + { + "epoch": 0.29, + "grad_norm": 0.6070272964311362, + "learning_rate": 4.178297040027358e-05, + "loss": 1.9638, + "step": 3730 + }, + { + "epoch": 0.29, + "grad_norm": 0.6422282982085109, + "learning_rate": 4.1778340014491537e-05, + "loss": 2.0354, + "step": 3731 + }, + { + "epoch": 0.29, + "grad_norm": 0.7263148650498896, + "learning_rate": 4.1773708581168805e-05, + "loss": 1.967, + "step": 3732 + }, + { + "epoch": 0.29, + "grad_norm": 0.6768927745311344, + "learning_rate": 4.176907610059453e-05, + "loss": 2.1521, + "step": 3733 + }, + { + "epoch": 0.29, + "grad_norm": 0.6760268932536927, + "learning_rate": 4.1764442573057936e-05, + "loss": 1.9839, + "step": 3734 + }, + { + "epoch": 0.29, + "grad_norm": 0.8601180800964421, + "learning_rate": 4.1759807998848335e-05, + "loss": 1.9589, + "step": 3735 + }, + { + "epoch": 0.29, + "grad_norm": 0.6381109812213627, + "learning_rate": 4.175517237825507e-05, + "loss": 1.9288, + "step": 3736 + }, + { + "epoch": 0.29, + "grad_norm": 0.8315830592472763, + "learning_rate": 4.175053571156756e-05, + "loss": 2.2191, + "step": 3737 + }, + { + "epoch": 0.29, + "grad_norm": 0.6895210048175766, + "learning_rate": 4.174589799907529e-05, + "loss": 2.0246, + "step": 3738 + }, + { + "epoch": 0.29, + "grad_norm": 0.6992436444783046, + "learning_rate": 4.1741259241067814e-05, + "loss": 1.9902, + "step": 3739 + }, + { + "epoch": 0.29, + "grad_norm": 0.674783121252672, + "learning_rate": 4.1736619437834754e-05, + "loss": 1.955, + "step": 3740 + }, + { + "epoch": 0.29, + "grad_norm": 0.7897336829256808, + "learning_rate": 4.1731978589665774e-05, + "loss": 2.1828, + "step": 3741 + }, + { + "epoch": 0.29, + "grad_norm": 0.691425194650292, + "learning_rate": 4.172733669685065e-05, + "loss": 1.9785, + "step": 3742 + }, + { + "epoch": 0.29, + "grad_norm": 0.6987882457763207, + "learning_rate": 4.1722693759679165e-05, + "loss": 1.9203, + "step": 3743 + }, + { + "epoch": 0.29, + "grad_norm": 0.5805207964582163, + "learning_rate": 4.171804977844122e-05, + "loss": 2.0378, + "step": 3744 + }, + { + "epoch": 0.29, + "grad_norm": 0.7442671714135737, + "learning_rate": 4.1713404753426744e-05, + "loss": 2.1423, + "step": 3745 + }, + { + "epoch": 0.29, + "grad_norm": 0.6770209912668741, + "learning_rate": 4.170875868492575e-05, + "loss": 2.0107, + "step": 3746 + }, + { + "epoch": 0.29, + "grad_norm": 0.6920145614139671, + "learning_rate": 4.170411157322831e-05, + "loss": 1.9467, + "step": 3747 + }, + { + "epoch": 0.29, + "grad_norm": 0.7737534641922775, + "learning_rate": 4.169946341862456e-05, + "loss": 1.9685, + "step": 3748 + }, + { + "epoch": 0.29, + "grad_norm": 0.5401961910033874, + "learning_rate": 4.169481422140471e-05, + "loss": 2.1523, + "step": 3749 + }, + { + "epoch": 0.29, + "grad_norm": 0.7122973873269187, + "learning_rate": 4.169016398185902e-05, + "loss": 2.0053, + "step": 3750 + }, + { + "epoch": 0.29, + "grad_norm": 0.625318261047493, + "learning_rate": 4.168551270027783e-05, + "loss": 2.0251, + "step": 3751 + }, + { + "epoch": 0.29, + "grad_norm": 0.5696754666203367, + "learning_rate": 4.168086037695153e-05, + "loss": 1.9869, + "step": 3752 + }, + { + "epoch": 0.29, + "grad_norm": 0.6690044794565485, + "learning_rate": 4.16762070121706e-05, + "loss": 2.1924, + "step": 3753 + }, + { + "epoch": 0.29, + "grad_norm": 0.5967770646292277, + "learning_rate": 4.167155260622555e-05, + "loss": 2.0024, + "step": 3754 + }, + { + "epoch": 0.29, + "grad_norm": 0.598320225419967, + "learning_rate": 4.1666897159406984e-05, + "loss": 1.9937, + "step": 3755 + }, + { + "epoch": 0.29, + "grad_norm": 0.589541358190962, + "learning_rate": 4.166224067200556e-05, + "loss": 2.0132, + "step": 3756 + }, + { + "epoch": 0.29, + "grad_norm": 0.6334616475154086, + "learning_rate": 4.1657583144312004e-05, + "loss": 2.1651, + "step": 3757 + }, + { + "epoch": 0.29, + "grad_norm": 0.6560129408766884, + "learning_rate": 4.1652924576617104e-05, + "loss": 1.9557, + "step": 3758 + }, + { + "epoch": 0.29, + "grad_norm": 0.6146363391234785, + "learning_rate": 4.1648264969211704e-05, + "loss": 1.9219, + "step": 3759 + }, + { + "epoch": 0.29, + "grad_norm": 0.5892504839759339, + "learning_rate": 4.164360432238672e-05, + "loss": 1.9663, + "step": 3760 + }, + { + "epoch": 0.29, + "grad_norm": 0.6571287875756067, + "learning_rate": 4.163894263643317e-05, + "loss": 2.1751, + "step": 3761 + }, + { + "epoch": 0.29, + "grad_norm": 0.6322659039374557, + "learning_rate": 4.163427991164206e-05, + "loss": 2.0274, + "step": 3762 + }, + { + "epoch": 0.29, + "grad_norm": 0.6154836965409817, + "learning_rate": 4.162961614830452e-05, + "loss": 1.9442, + "step": 3763 + }, + { + "epoch": 0.29, + "grad_norm": 0.6225135413794903, + "learning_rate": 4.162495134671173e-05, + "loss": 1.9632, + "step": 3764 + }, + { + "epoch": 0.29, + "grad_norm": 0.795049432926198, + "learning_rate": 4.162028550715493e-05, + "loss": 2.1672, + "step": 3765 + }, + { + "epoch": 0.29, + "grad_norm": 0.7213558375102108, + "learning_rate": 4.1615618629925424e-05, + "loss": 1.9773, + "step": 3766 + }, + { + "epoch": 0.29, + "grad_norm": 0.6433100509610542, + "learning_rate": 4.161095071531459e-05, + "loss": 1.9275, + "step": 3767 + }, + { + "epoch": 0.29, + "grad_norm": 0.6187964614148259, + "learning_rate": 4.1606281763613866e-05, + "loss": 1.9209, + "step": 3768 + }, + { + "epoch": 0.29, + "grad_norm": 0.6156113816969525, + "learning_rate": 4.160161177511475e-05, + "loss": 2.1876, + "step": 3769 + }, + { + "epoch": 0.29, + "grad_norm": 0.6513655067227441, + "learning_rate": 4.1596940750108815e-05, + "loss": 1.9644, + "step": 3770 + }, + { + "epoch": 0.29, + "grad_norm": 0.5770425557614786, + "learning_rate": 4.159226868888768e-05, + "loss": 1.9649, + "step": 3771 + }, + { + "epoch": 0.29, + "grad_norm": 0.704248245161816, + "learning_rate": 4.1587595591743046e-05, + "loss": 1.9884, + "step": 3772 + }, + { + "epoch": 0.29, + "grad_norm": 0.7243638607066908, + "learning_rate": 4.158292145896668e-05, + "loss": 2.1617, + "step": 3773 + }, + { + "epoch": 0.29, + "grad_norm": 0.6268560745072803, + "learning_rate": 4.1578246290850406e-05, + "loss": 1.958, + "step": 3774 + }, + { + "epoch": 0.29, + "grad_norm": 0.7851414752569611, + "learning_rate": 4.157357008768611e-05, + "loss": 2.0203, + "step": 3775 + }, + { + "epoch": 0.29, + "grad_norm": 0.6003588521340534, + "learning_rate": 4.156889284976575e-05, + "loss": 1.9558, + "step": 3776 + }, + { + "epoch": 0.29, + "grad_norm": 0.7828218098407062, + "learning_rate": 4.156421457738133e-05, + "loss": 2.1569, + "step": 3777 + }, + { + "epoch": 0.29, + "grad_norm": 0.640437062290225, + "learning_rate": 4.155953527082496e-05, + "loss": 1.9903, + "step": 3778 + }, + { + "epoch": 0.29, + "grad_norm": 0.7795827255519555, + "learning_rate": 4.155485493038877e-05, + "loss": 1.9329, + "step": 3779 + }, + { + "epoch": 0.29, + "grad_norm": 0.6161625806171004, + "learning_rate": 4.155017355636497e-05, + "loss": 2.0008, + "step": 3780 + }, + { + "epoch": 0.29, + "grad_norm": 0.8121995485210262, + "learning_rate": 4.1545491149045855e-05, + "loss": 2.2146, + "step": 3781 + }, + { + "epoch": 0.29, + "grad_norm": 0.5827979963584551, + "learning_rate": 4.154080770872375e-05, + "loss": 1.9621, + "step": 3782 + }, + { + "epoch": 0.29, + "grad_norm": 0.745533884517018, + "learning_rate": 4.153612323569108e-05, + "loss": 1.9825, + "step": 3783 + }, + { + "epoch": 0.29, + "grad_norm": 0.6806726306196428, + "learning_rate": 4.15314377302403e-05, + "loss": 1.9997, + "step": 3784 + }, + { + "epoch": 0.29, + "grad_norm": 0.6785866854606835, + "learning_rate": 4.1526751192663935e-05, + "loss": 2.1872, + "step": 3785 + }, + { + "epoch": 0.29, + "grad_norm": 0.6685545687231972, + "learning_rate": 4.1522063623254614e-05, + "loss": 1.9986, + "step": 3786 + }, + { + "epoch": 0.29, + "grad_norm": 0.6158387432072003, + "learning_rate": 4.151737502230498e-05, + "loss": 2.0551, + "step": 3787 + }, + { + "epoch": 0.29, + "grad_norm": 0.5783198797559468, + "learning_rate": 4.151268539010777e-05, + "loss": 1.9408, + "step": 3788 + }, + { + "epoch": 0.29, + "grad_norm": 0.7293824441419369, + "learning_rate": 4.150799472695578e-05, + "loss": 2.1578, + "step": 3789 + }, + { + "epoch": 0.29, + "grad_norm": 0.614454177983162, + "learning_rate": 4.150330303314186e-05, + "loss": 1.9386, + "step": 3790 + }, + { + "epoch": 0.29, + "grad_norm": 0.6715579437601324, + "learning_rate": 4.1498610308958944e-05, + "loss": 1.9401, + "step": 3791 + }, + { + "epoch": 0.29, + "grad_norm": 0.6604314832863485, + "learning_rate": 4.14939165547e-05, + "loss": 1.9834, + "step": 3792 + }, + { + "epoch": 0.29, + "grad_norm": 0.6463366341505608, + "learning_rate": 4.148922177065808e-05, + "loss": 2.1602, + "step": 3793 + }, + { + "epoch": 0.29, + "grad_norm": 0.6125310322242448, + "learning_rate": 4.1484525957126306e-05, + "loss": 1.9544, + "step": 3794 + }, + { + "epoch": 0.29, + "grad_norm": 0.6675931853812922, + "learning_rate": 4.147982911439786e-05, + "loss": 1.9501, + "step": 3795 + }, + { + "epoch": 0.29, + "grad_norm": 0.5795349176630711, + "learning_rate": 4.147513124276599e-05, + "loss": 1.9548, + "step": 3796 + }, + { + "epoch": 0.29, + "grad_norm": 0.6957731277957201, + "learning_rate": 4.1470432342523986e-05, + "loss": 2.2193, + "step": 3797 + }, + { + "epoch": 0.29, + "grad_norm": 0.5775759814995692, + "learning_rate": 4.146573241396523e-05, + "loss": 1.9176, + "step": 3798 + }, + { + "epoch": 0.29, + "grad_norm": 0.7166914214504236, + "learning_rate": 4.1461031457383155e-05, + "loss": 1.9598, + "step": 3799 + }, + { + "epoch": 0.29, + "grad_norm": 0.6249005085959003, + "learning_rate": 4.1456329473071266e-05, + "loss": 2.0379, + "step": 3800 + }, + { + "epoch": 0.29, + "grad_norm": 0.6984090408310287, + "learning_rate": 4.1451626461323124e-05, + "loss": 2.2095, + "step": 3801 + }, + { + "epoch": 0.29, + "grad_norm": 0.6744236103759379, + "learning_rate": 4.144692242243235e-05, + "loss": 1.9289, + "step": 3802 + }, + { + "epoch": 0.29, + "grad_norm": 0.6968875299821077, + "learning_rate": 4.144221735669265e-05, + "loss": 1.9086, + "step": 3803 + }, + { + "epoch": 0.29, + "grad_norm": 0.5967936196153527, + "learning_rate": 4.143751126439777e-05, + "loss": 2.0279, + "step": 3804 + }, + { + "epoch": 0.29, + "grad_norm": 0.6990424801649224, + "learning_rate": 4.143280414584154e-05, + "loss": 2.2224, + "step": 3805 + }, + { + "epoch": 0.29, + "grad_norm": 0.5412543314057722, + "learning_rate": 4.1428096001317834e-05, + "loss": 2.0582, + "step": 3806 + }, + { + "epoch": 0.29, + "grad_norm": 0.5916577335081531, + "learning_rate": 4.142338683112061e-05, + "loss": 1.9157, + "step": 3807 + }, + { + "epoch": 0.29, + "grad_norm": 0.6903530340385335, + "learning_rate": 4.1418676635543875e-05, + "loss": 1.9892, + "step": 3808 + }, + { + "epoch": 0.29, + "grad_norm": 0.6203696010839237, + "learning_rate": 4.141396541488171e-05, + "loss": 2.1407, + "step": 3809 + }, + { + "epoch": 0.29, + "grad_norm": 0.5971054918061296, + "learning_rate": 4.1409253169428255e-05, + "loss": 1.9674, + "step": 3810 + }, + { + "epoch": 0.29, + "grad_norm": 0.5558782665163996, + "learning_rate": 4.140453989947771e-05, + "loss": 1.9476, + "step": 3811 + }, + { + "epoch": 0.29, + "grad_norm": 0.5782427070370348, + "learning_rate": 4.139982560532435e-05, + "loss": 2.0582, + "step": 3812 + }, + { + "epoch": 0.29, + "grad_norm": 0.585325603865789, + "learning_rate": 4.13951102872625e-05, + "loss": 2.1974, + "step": 3813 + }, + { + "epoch": 0.29, + "grad_norm": 0.6051801542283487, + "learning_rate": 4.1390393945586576e-05, + "loss": 1.9737, + "step": 3814 + }, + { + "epoch": 0.29, + "grad_norm": 0.5680878182741933, + "learning_rate": 4.1385676580591016e-05, + "loss": 2.0094, + "step": 3815 + }, + { + "epoch": 0.29, + "grad_norm": 0.6363873734387459, + "learning_rate": 4.1380958192570346e-05, + "loss": 1.9964, + "step": 3816 + }, + { + "epoch": 0.29, + "grad_norm": 0.6007227104821521, + "learning_rate": 4.137623878181917e-05, + "loss": 2.1475, + "step": 3817 + }, + { + "epoch": 0.29, + "grad_norm": 0.6815804682887711, + "learning_rate": 4.137151834863213e-05, + "loss": 2.0552, + "step": 3818 + }, + { + "epoch": 0.29, + "grad_norm": 0.6222419737891229, + "learning_rate": 4.136679689330394e-05, + "loss": 1.9573, + "step": 3819 + }, + { + "epoch": 0.29, + "grad_norm": 0.6444219139377292, + "learning_rate": 4.1362074416129395e-05, + "loss": 1.9884, + "step": 3820 + }, + { + "epoch": 0.29, + "grad_norm": 0.6472578944575412, + "learning_rate": 4.1357350917403314e-05, + "loss": 2.133, + "step": 3821 + }, + { + "epoch": 0.29, + "grad_norm": 0.5571814318918725, + "learning_rate": 4.135262639742062e-05, + "loss": 1.8615, + "step": 3822 + }, + { + "epoch": 0.29, + "grad_norm": 0.6234326717590418, + "learning_rate": 4.1347900856476285e-05, + "loss": 1.9736, + "step": 3823 + }, + { + "epoch": 0.3, + "grad_norm": 0.522757702886885, + "learning_rate": 4.134317429486534e-05, + "loss": 1.9592, + "step": 3824 + }, + { + "epoch": 0.3, + "grad_norm": 0.587477050938539, + "learning_rate": 4.133844671288288e-05, + "loss": 2.0882, + "step": 3825 + }, + { + "epoch": 0.3, + "grad_norm": 0.6210542377236401, + "learning_rate": 4.133371811082408e-05, + "loss": 2.1116, + "step": 3826 + }, + { + "epoch": 0.3, + "grad_norm": 0.5511949637425722, + "learning_rate": 4.1328988488984156e-05, + "loss": 1.9628, + "step": 3827 + }, + { + "epoch": 0.3, + "grad_norm": 0.5773843327705133, + "learning_rate": 4.1324257847658385e-05, + "loss": 1.9791, + "step": 3828 + }, + { + "epoch": 0.3, + "grad_norm": 0.7238198762020395, + "learning_rate": 4.131952618714215e-05, + "loss": 2.1866, + "step": 3829 + }, + { + "epoch": 0.3, + "grad_norm": 0.6251000848677782, + "learning_rate": 4.1314793507730855e-05, + "loss": 1.9598, + "step": 3830 + }, + { + "epoch": 0.3, + "grad_norm": 0.7334419789332116, + "learning_rate": 4.131005980971997e-05, + "loss": 2.0115, + "step": 3831 + }, + { + "epoch": 0.3, + "grad_norm": 0.61654808122787, + "learning_rate": 4.130532509340505e-05, + "loss": 1.9669, + "step": 3832 + }, + { + "epoch": 0.3, + "grad_norm": 0.7098605691584817, + "learning_rate": 4.13005893590817e-05, + "loss": 2.0955, + "step": 3833 + }, + { + "epoch": 0.3, + "grad_norm": 0.6233435367167875, + "learning_rate": 4.129585260704559e-05, + "loss": 1.9566, + "step": 3834 + }, + { + "epoch": 0.3, + "grad_norm": 0.5885074043276831, + "learning_rate": 4.129111483759247e-05, + "loss": 1.967, + "step": 3835 + }, + { + "epoch": 0.3, + "grad_norm": 0.6161076841351686, + "learning_rate": 4.128637605101811e-05, + "loss": 1.9802, + "step": 3836 + }, + { + "epoch": 0.3, + "grad_norm": 0.524378497797327, + "learning_rate": 4.1281636247618386e-05, + "loss": 2.0529, + "step": 3837 + }, + { + "epoch": 0.3, + "grad_norm": 0.667027443925809, + "learning_rate": 4.127689542768923e-05, + "loss": 2.1559, + "step": 3838 + }, + { + "epoch": 0.3, + "grad_norm": 0.6157032631702479, + "learning_rate": 4.127215359152663e-05, + "loss": 1.9427, + "step": 3839 + }, + { + "epoch": 0.3, + "grad_norm": 0.5900975255754316, + "learning_rate": 4.1267410739426625e-05, + "loss": 1.9955, + "step": 3840 + }, + { + "epoch": 0.3, + "grad_norm": 0.8209839775849878, + "learning_rate": 4.126266687168535e-05, + "loss": 2.1679, + "step": 3841 + }, + { + "epoch": 0.3, + "grad_norm": 0.6071237933102744, + "learning_rate": 4.125792198859896e-05, + "loss": 1.9518, + "step": 3842 + }, + { + "epoch": 0.3, + "grad_norm": 0.7098167601851108, + "learning_rate": 4.1253176090463716e-05, + "loss": 2.0249, + "step": 3843 + }, + { + "epoch": 0.3, + "grad_norm": 0.627390929245382, + "learning_rate": 4.124842917757593e-05, + "loss": 1.9704, + "step": 3844 + }, + { + "epoch": 0.3, + "grad_norm": 0.6204529250606994, + "learning_rate": 4.1243681250231945e-05, + "loss": 2.1617, + "step": 3845 + }, + { + "epoch": 0.3, + "grad_norm": 0.6361972559365728, + "learning_rate": 4.123893230872822e-05, + "loss": 1.9475, + "step": 3846 + }, + { + "epoch": 0.3, + "grad_norm": 0.5938836620228498, + "learning_rate": 4.123418235336123e-05, + "loss": 1.9606, + "step": 3847 + }, + { + "epoch": 0.3, + "grad_norm": 0.6716685132204049, + "learning_rate": 4.122943138442755e-05, + "loss": 1.9911, + "step": 3848 + }, + { + "epoch": 0.3, + "grad_norm": 0.6664801702246396, + "learning_rate": 4.12246794022238e-05, + "loss": 2.0585, + "step": 3849 + }, + { + "epoch": 0.3, + "grad_norm": 0.6241313133185189, + "learning_rate": 4.121992640704665e-05, + "loss": 2.1804, + "step": 3850 + }, + { + "epoch": 0.3, + "grad_norm": 0.6469842430902533, + "learning_rate": 4.121517239919287e-05, + "loss": 1.9379, + "step": 3851 + }, + { + "epoch": 0.3, + "grad_norm": 0.6268787637599117, + "learning_rate": 4.121041737895926e-05, + "loss": 1.9418, + "step": 3852 + }, + { + "epoch": 0.3, + "grad_norm": 0.6673374740659214, + "learning_rate": 4.12056613466427e-05, + "loss": 2.1749, + "step": 3853 + }, + { + "epoch": 0.3, + "grad_norm": 0.6010038114486702, + "learning_rate": 4.1200904302540136e-05, + "loss": 1.9673, + "step": 3854 + }, + { + "epoch": 0.3, + "grad_norm": 0.5980945872837211, + "learning_rate": 4.1196146246948555e-05, + "loss": 1.9698, + "step": 3855 + }, + { + "epoch": 0.3, + "grad_norm": 0.6505349547823434, + "learning_rate": 4.119138718016502e-05, + "loss": 2.0493, + "step": 3856 + }, + { + "epoch": 0.3, + "grad_norm": 0.6289551684898544, + "learning_rate": 4.1186627102486674e-05, + "loss": 1.9395, + "step": 3857 + }, + { + "epoch": 0.3, + "grad_norm": 0.6917665537546885, + "learning_rate": 4.1181866014210705e-05, + "loss": 2.1503, + "step": 3858 + }, + { + "epoch": 0.3, + "grad_norm": 0.6641802093286319, + "learning_rate": 4.117710391563436e-05, + "loss": 1.9582, + "step": 3859 + }, + { + "epoch": 0.3, + "grad_norm": 0.5819769923215931, + "learning_rate": 4.1172340807054966e-05, + "loss": 2.0016, + "step": 3860 + }, + { + "epoch": 0.3, + "grad_norm": 0.7389922370062213, + "learning_rate": 4.116757668876989e-05, + "loss": 2.1497, + "step": 3861 + }, + { + "epoch": 0.3, + "grad_norm": 0.5921953994301721, + "learning_rate": 4.1162811561076584e-05, + "loss": 2.0115, + "step": 3862 + }, + { + "epoch": 0.3, + "grad_norm": 0.7361337257409508, + "learning_rate": 4.115804542427256e-05, + "loss": 1.9585, + "step": 3863 + }, + { + "epoch": 0.3, + "grad_norm": 0.6390181972698236, + "learning_rate": 4.1153278278655386e-05, + "loss": 1.9763, + "step": 3864 + }, + { + "epoch": 0.3, + "grad_norm": 0.6448865750421973, + "learning_rate": 4.1148510124522674e-05, + "loss": 2.1723, + "step": 3865 + }, + { + "epoch": 0.3, + "grad_norm": 0.6366256531070978, + "learning_rate": 4.114374096217214e-05, + "loss": 1.979, + "step": 3866 + }, + { + "epoch": 0.3, + "grad_norm": 0.6361417826220834, + "learning_rate": 4.113897079190154e-05, + "loss": 1.9386, + "step": 3867 + }, + { + "epoch": 0.3, + "grad_norm": 0.7078568579808728, + "learning_rate": 4.1134199614008695e-05, + "loss": 2.0403, + "step": 3868 + }, + { + "epoch": 0.3, + "grad_norm": 0.663271888653804, + "learning_rate": 4.112942742879149e-05, + "loss": 1.9351, + "step": 3869 + }, + { + "epoch": 0.3, + "grad_norm": 0.6333105124393905, + "learning_rate": 4.112465423654786e-05, + "loss": 2.1861, + "step": 3870 + }, + { + "epoch": 0.3, + "grad_norm": 0.590322939627224, + "learning_rate": 4.111988003757583e-05, + "loss": 2.0093, + "step": 3871 + }, + { + "epoch": 0.3, + "grad_norm": 0.6361424907464709, + "learning_rate": 4.111510483217347e-05, + "loss": 1.9733, + "step": 3872 + }, + { + "epoch": 0.3, + "grad_norm": 0.601702979866309, + "learning_rate": 4.1110328620638904e-05, + "loss": 2.1791, + "step": 3873 + }, + { + "epoch": 0.3, + "grad_norm": 0.61119646632048, + "learning_rate": 4.110555140327035e-05, + "loss": 2.0144, + "step": 3874 + }, + { + "epoch": 0.3, + "grad_norm": 0.6710618336666347, + "learning_rate": 4.110077318036605e-05, + "loss": 1.8737, + "step": 3875 + }, + { + "epoch": 0.3, + "grad_norm": 0.7984900059817356, + "learning_rate": 4.1095993952224344e-05, + "loss": 1.9586, + "step": 3876 + }, + { + "epoch": 0.3, + "grad_norm": 0.6716178927325301, + "learning_rate": 4.109121371914361e-05, + "loss": 2.1351, + "step": 3877 + }, + { + "epoch": 0.3, + "grad_norm": 0.7196671657757124, + "learning_rate": 4.108643248142229e-05, + "loss": 1.9578, + "step": 3878 + }, + { + "epoch": 0.3, + "grad_norm": 0.6514675716239146, + "learning_rate": 4.108165023935891e-05, + "loss": 1.9881, + "step": 3879 + }, + { + "epoch": 0.3, + "grad_norm": 0.7576338022037286, + "learning_rate": 4.1076866993252044e-05, + "loss": 2.0097, + "step": 3880 + }, + { + "epoch": 0.3, + "grad_norm": 0.6239123027027155, + "learning_rate": 4.107208274340032e-05, + "loss": 1.9381, + "step": 3881 + }, + { + "epoch": 0.3, + "grad_norm": 0.8501749336640424, + "learning_rate": 4.106729749010245e-05, + "loss": 2.1388, + "step": 3882 + }, + { + "epoch": 0.3, + "grad_norm": 0.8044566692530245, + "learning_rate": 4.106251123365719e-05, + "loss": 2.0012, + "step": 3883 + }, + { + "epoch": 0.3, + "grad_norm": 0.7300007290830302, + "learning_rate": 4.105772397436337e-05, + "loss": 1.9595, + "step": 3884 + }, + { + "epoch": 0.3, + "grad_norm": 0.8729567628153265, + "learning_rate": 4.105293571251988e-05, + "loss": 2.1429, + "step": 3885 + }, + { + "epoch": 0.3, + "grad_norm": 0.6933568471502953, + "learning_rate": 4.1048146448425656e-05, + "loss": 1.9622, + "step": 3886 + }, + { + "epoch": 0.3, + "grad_norm": 0.6015375889673975, + "learning_rate": 4.104335618237972e-05, + "loss": 2.0398, + "step": 3887 + }, + { + "epoch": 0.3, + "grad_norm": 0.6893158007409019, + "learning_rate": 4.103856491468116e-05, + "loss": 2.0003, + "step": 3888 + }, + { + "epoch": 0.3, + "grad_norm": 0.7030049488384783, + "learning_rate": 4.1033772645629095e-05, + "loss": 1.9561, + "step": 3889 + }, + { + "epoch": 0.3, + "grad_norm": 0.8214531954570513, + "learning_rate": 4.1028979375522745e-05, + "loss": 2.1591, + "step": 3890 + }, + { + "epoch": 0.3, + "grad_norm": 0.7519576420218346, + "learning_rate": 4.1024185104661354e-05, + "loss": 1.9539, + "step": 3891 + }, + { + "epoch": 0.3, + "grad_norm": 0.7359641436046099, + "learning_rate": 4.101938983334427e-05, + "loss": 1.9944, + "step": 3892 + }, + { + "epoch": 0.3, + "grad_norm": 0.7713006825437768, + "learning_rate": 4.1014593561870855e-05, + "loss": 2.0327, + "step": 3893 + }, + { + "epoch": 0.3, + "grad_norm": 0.8861839918944877, + "learning_rate": 4.1009796290540586e-05, + "loss": 2.1134, + "step": 3894 + }, + { + "epoch": 0.3, + "grad_norm": 0.6414954375270823, + "learning_rate": 4.1004998019652966e-05, + "loss": 1.9314, + "step": 3895 + }, + { + "epoch": 0.3, + "grad_norm": 0.9701965403462911, + "learning_rate": 4.1000198749507555e-05, + "loss": 1.9541, + "step": 3896 + }, + { + "epoch": 0.3, + "grad_norm": 0.6865817510739247, + "learning_rate": 4.099539848040401e-05, + "loss": 2.1611, + "step": 3897 + }, + { + "epoch": 0.3, + "grad_norm": 0.8992472177473495, + "learning_rate": 4.0990597212642036e-05, + "loss": 1.9594, + "step": 3898 + }, + { + "epoch": 0.3, + "grad_norm": 0.8409069075456934, + "learning_rate": 4.0985794946521385e-05, + "loss": 2.0357, + "step": 3899 + }, + { + "epoch": 0.3, + "grad_norm": 0.6275592996063134, + "learning_rate": 4.098099168234187e-05, + "loss": 1.9735, + "step": 3900 + }, + { + "epoch": 0.3, + "grad_norm": 0.874378122345896, + "learning_rate": 4.097618742040341e-05, + "loss": 1.9749, + "step": 3901 + }, + { + "epoch": 0.3, + "grad_norm": 0.8336322301344887, + "learning_rate": 4.0971382161005936e-05, + "loss": 2.16, + "step": 3902 + }, + { + "epoch": 0.3, + "grad_norm": 0.756447500049861, + "learning_rate": 4.0966575904449455e-05, + "loss": 1.9693, + "step": 3903 + }, + { + "epoch": 0.3, + "grad_norm": 0.7280308604770998, + "learning_rate": 4.096176865103405e-05, + "loss": 1.9693, + "step": 3904 + }, + { + "epoch": 0.3, + "grad_norm": 0.7692940030010743, + "learning_rate": 4.095696040105986e-05, + "loss": 1.9629, + "step": 3905 + }, + { + "epoch": 0.3, + "grad_norm": 0.7008616252657456, + "learning_rate": 4.095215115482707e-05, + "loss": 2.1898, + "step": 3906 + }, + { + "epoch": 0.3, + "grad_norm": 0.7994155487578607, + "learning_rate": 4.094734091263596e-05, + "loss": 1.9899, + "step": 3907 + }, + { + "epoch": 0.3, + "grad_norm": 0.6206056063284756, + "learning_rate": 4.0942529674786835e-05, + "loss": 1.9675, + "step": 3908 + }, + { + "epoch": 0.3, + "grad_norm": 0.7773799855828458, + "learning_rate": 4.0937717441580094e-05, + "loss": 2.1524, + "step": 3909 + }, + { + "epoch": 0.3, + "grad_norm": 0.6650349460778588, + "learning_rate": 4.093290421331618e-05, + "loss": 1.9509, + "step": 3910 + }, + { + "epoch": 0.3, + "grad_norm": 0.5244867637826205, + "learning_rate": 4.0928089990295596e-05, + "loss": 2.0021, + "step": 3911 + }, + { + "epoch": 0.3, + "grad_norm": 0.8067763805729119, + "learning_rate": 4.0923274772818926e-05, + "loss": 1.9896, + "step": 3912 + }, + { + "epoch": 0.3, + "grad_norm": 0.709736541042762, + "learning_rate": 4.091845856118679e-05, + "loss": 1.9559, + "step": 3913 + }, + { + "epoch": 0.3, + "grad_norm": 0.7596323234363336, + "learning_rate": 4.0913641355699905e-05, + "loss": 2.1884, + "step": 3914 + }, + { + "epoch": 0.3, + "grad_norm": 0.7109707777121279, + "learning_rate": 4.0908823156659e-05, + "loss": 1.9795, + "step": 3915 + }, + { + "epoch": 0.3, + "grad_norm": 0.7377098262679318, + "learning_rate": 4.090400396436492e-05, + "loss": 2.0191, + "step": 3916 + }, + { + "epoch": 0.3, + "grad_norm": 0.6128321289481717, + "learning_rate": 4.089918377911853e-05, + "loss": 2.1359, + "step": 3917 + }, + { + "epoch": 0.3, + "grad_norm": 0.6487596788145268, + "learning_rate": 4.089436260122079e-05, + "loss": 2.0608, + "step": 3918 + }, + { + "epoch": 0.3, + "grad_norm": 0.5528278559792273, + "learning_rate": 4.088954043097269e-05, + "loss": 1.9681, + "step": 3919 + }, + { + "epoch": 0.3, + "grad_norm": 0.6877384704484403, + "learning_rate": 4.088471726867531e-05, + "loss": 1.9483, + "step": 3920 + }, + { + "epoch": 0.3, + "grad_norm": 0.5882308502786506, + "learning_rate": 4.087989311462978e-05, + "loss": 1.9969, + "step": 3921 + }, + { + "epoch": 0.3, + "grad_norm": 0.6646453401542289, + "learning_rate": 4.0875067969137274e-05, + "loss": 2.1848, + "step": 3922 + }, + { + "epoch": 0.3, + "grad_norm": 0.5977845743029465, + "learning_rate": 4.087024183249906e-05, + "loss": 1.9554, + "step": 3923 + }, + { + "epoch": 0.3, + "grad_norm": 0.7494689848784223, + "learning_rate": 4.086541470501646e-05, + "loss": 2.0282, + "step": 3924 + }, + { + "epoch": 0.3, + "grad_norm": 0.5366623985619712, + "learning_rate": 4.086058658699083e-05, + "loss": 1.9795, + "step": 3925 + }, + { + "epoch": 0.3, + "grad_norm": 0.6488769816750848, + "learning_rate": 4.085575747872363e-05, + "loss": 2.1629, + "step": 3926 + }, + { + "epoch": 0.3, + "grad_norm": 0.5618492324406718, + "learning_rate": 4.085092738051636e-05, + "loss": 1.9689, + "step": 3927 + }, + { + "epoch": 0.3, + "grad_norm": 0.528711944747505, + "learning_rate": 4.084609629267057e-05, + "loss": 1.9608, + "step": 3928 + }, + { + "epoch": 0.3, + "grad_norm": 0.6366696275030724, + "learning_rate": 4.08412642154879e-05, + "loss": 2.2045, + "step": 3929 + }, + { + "epoch": 0.3, + "grad_norm": 0.5767927321291471, + "learning_rate": 4.083643114927001e-05, + "loss": 2.0507, + "step": 3930 + }, + { + "epoch": 0.3, + "grad_norm": 0.6262810275058607, + "learning_rate": 4.083159709431867e-05, + "loss": 1.9761, + "step": 3931 + }, + { + "epoch": 0.3, + "grad_norm": 0.603558159394882, + "learning_rate": 4.08267620509357e-05, + "loss": 1.9745, + "step": 3932 + }, + { + "epoch": 0.3, + "grad_norm": 0.5701235201637165, + "learning_rate": 4.0821926019422954e-05, + "loss": 1.9826, + "step": 3933 + }, + { + "epoch": 0.3, + "grad_norm": 0.5999710044862449, + "learning_rate": 4.0817089000082354e-05, + "loss": 2.1976, + "step": 3934 + }, + { + "epoch": 0.3, + "grad_norm": 0.6436869824710597, + "learning_rate": 4.081225099321592e-05, + "loss": 2.0037, + "step": 3935 + }, + { + "epoch": 0.3, + "grad_norm": 0.5418557418995505, + "learning_rate": 4.08074119991257e-05, + "loss": 2.0913, + "step": 3936 + }, + { + "epoch": 0.3, + "grad_norm": 0.652760146539893, + "learning_rate": 4.0802572018113804e-05, + "loss": 1.9648, + "step": 3937 + }, + { + "epoch": 0.3, + "grad_norm": 0.615856227564425, + "learning_rate": 4.0797731050482414e-05, + "loss": 2.1223, + "step": 3938 + }, + { + "epoch": 0.3, + "grad_norm": 0.5337244278544372, + "learning_rate": 4.079288909653378e-05, + "loss": 1.9399, + "step": 3939 + }, + { + "epoch": 0.3, + "grad_norm": 0.6072894525284867, + "learning_rate": 4.078804615657021e-05, + "loss": 2.0075, + "step": 3940 + }, + { + "epoch": 0.3, + "grad_norm": 0.6463664617478929, + "learning_rate": 4.078320223089406e-05, + "loss": 1.9656, + "step": 3941 + }, + { + "epoch": 0.3, + "grad_norm": 0.6053858812193568, + "learning_rate": 4.077835731980774e-05, + "loss": 2.1589, + "step": 3942 + }, + { + "epoch": 0.3, + "grad_norm": 0.6018016711849219, + "learning_rate": 4.077351142361376e-05, + "loss": 1.977, + "step": 3943 + }, + { + "epoch": 0.3, + "grad_norm": 0.5989532290503011, + "learning_rate": 4.076866454261466e-05, + "loss": 1.9794, + "step": 3944 + }, + { + "epoch": 0.3, + "grad_norm": 0.6046932673990401, + "learning_rate": 4.0763816677113064e-05, + "loss": 1.993, + "step": 3945 + }, + { + "epoch": 0.3, + "grad_norm": 0.6115075893355694, + "learning_rate": 4.075896782741163e-05, + "loss": 2.1672, + "step": 3946 + }, + { + "epoch": 0.3, + "grad_norm": 0.7013460309228474, + "learning_rate": 4.075411799381309e-05, + "loss": 1.9498, + "step": 3947 + }, + { + "epoch": 0.3, + "grad_norm": 0.5536393500853389, + "learning_rate": 4.074926717662025e-05, + "loss": 1.9795, + "step": 3948 + }, + { + "epoch": 0.3, + "grad_norm": 0.5944213003659885, + "learning_rate": 4.074441537613596e-05, + "loss": 2.0185, + "step": 3949 + }, + { + "epoch": 0.3, + "grad_norm": 0.6489011507801296, + "learning_rate": 4.0739562592663134e-05, + "loss": 2.1803, + "step": 3950 + }, + { + "epoch": 0.3, + "grad_norm": 0.5330079028742769, + "learning_rate": 4.0734708826504755e-05, + "loss": 1.9274, + "step": 3951 + }, + { + "epoch": 0.3, + "grad_norm": 0.611252372527336, + "learning_rate": 4.0729854077963866e-05, + "loss": 1.965, + "step": 3952 + }, + { + "epoch": 0.3, + "grad_norm": 0.6750862461291328, + "learning_rate": 4.072499834734357e-05, + "loss": 1.9219, + "step": 3953 + }, + { + "epoch": 0.31, + "grad_norm": 0.655206426453125, + "learning_rate": 4.072014163494702e-05, + "loss": 2.1192, + "step": 3954 + }, + { + "epoch": 0.31, + "grad_norm": 0.7161222534966286, + "learning_rate": 4.071528394107746e-05, + "loss": 1.9455, + "step": 3955 + }, + { + "epoch": 0.31, + "grad_norm": 0.6284146936861615, + "learning_rate": 4.0710425266038145e-05, + "loss": 1.9684, + "step": 3956 + }, + { + "epoch": 0.31, + "grad_norm": 0.6844628586151592, + "learning_rate": 4.070556561013246e-05, + "loss": 2.0016, + "step": 3957 + }, + { + "epoch": 0.31, + "grad_norm": 0.7755004546749505, + "learning_rate": 4.070070497366378e-05, + "loss": 2.196, + "step": 3958 + }, + { + "epoch": 0.31, + "grad_norm": 0.6326489269355683, + "learning_rate": 4.069584335693559e-05, + "loss": 1.9699, + "step": 3959 + }, + { + "epoch": 0.31, + "grad_norm": 0.8097240980216724, + "learning_rate": 4.0690980760251426e-05, + "loss": 1.9669, + "step": 3960 + }, + { + "epoch": 0.31, + "grad_norm": 0.6677787931024276, + "learning_rate": 4.068611718391487e-05, + "loss": 2.0097, + "step": 3961 + }, + { + "epoch": 0.31, + "grad_norm": 0.688784473718283, + "learning_rate": 4.068125262822958e-05, + "loss": 2.1466, + "step": 3962 + }, + { + "epoch": 0.31, + "grad_norm": 0.7263695693723677, + "learning_rate": 4.067638709349927e-05, + "loss": 1.9492, + "step": 3963 + }, + { + "epoch": 0.31, + "grad_norm": 0.6962450382958119, + "learning_rate": 4.06715205800277e-05, + "loss": 1.9657, + "step": 3964 + }, + { + "epoch": 0.31, + "grad_norm": 0.7149673577783155, + "learning_rate": 4.0666653088118734e-05, + "loss": 1.9378, + "step": 3965 + }, + { + "epoch": 0.31, + "grad_norm": 0.8102051381467807, + "learning_rate": 4.066178461807625e-05, + "loss": 2.1599, + "step": 3966 + }, + { + "epoch": 0.31, + "grad_norm": 0.6126262700724647, + "learning_rate": 4.065691517020421e-05, + "loss": 2.0774, + "step": 3967 + }, + { + "epoch": 0.31, + "grad_norm": 0.6809055049876596, + "learning_rate": 4.065204474480664e-05, + "loss": 1.9269, + "step": 3968 + }, + { + "epoch": 0.31, + "grad_norm": 0.6687990448358586, + "learning_rate": 4.0647173342187615e-05, + "loss": 1.9146, + "step": 3969 + }, + { + "epoch": 0.31, + "grad_norm": 0.707960760128314, + "learning_rate": 4.064230096265128e-05, + "loss": 2.161, + "step": 3970 + }, + { + "epoch": 0.31, + "grad_norm": 0.6992339231666369, + "learning_rate": 4.0637427606501833e-05, + "loss": 1.9466, + "step": 3971 + }, + { + "epoch": 0.31, + "grad_norm": 0.6548835889887717, + "learning_rate": 4.063255327404354e-05, + "loss": 1.9599, + "step": 3972 + }, + { + "epoch": 0.31, + "grad_norm": 0.6445787414692051, + "learning_rate": 4.062767796558073e-05, + "loss": 1.9736, + "step": 3973 + }, + { + "epoch": 0.31, + "grad_norm": 0.6183177348038901, + "learning_rate": 4.062280168141778e-05, + "loss": 2.1858, + "step": 3974 + }, + { + "epoch": 0.31, + "grad_norm": 0.5799925654320878, + "learning_rate": 4.0617924421859144e-05, + "loss": 1.9523, + "step": 3975 + }, + { + "epoch": 0.31, + "grad_norm": 0.5812799647488383, + "learning_rate": 4.061304618720932e-05, + "loss": 1.9651, + "step": 3976 + }, + { + "epoch": 0.31, + "grad_norm": 0.6254982255938857, + "learning_rate": 4.060816697777289e-05, + "loss": 1.8825, + "step": 3977 + }, + { + "epoch": 0.31, + "grad_norm": 0.6802300448778695, + "learning_rate": 4.060328679385448e-05, + "loss": 2.1185, + "step": 3978 + }, + { + "epoch": 0.31, + "grad_norm": 0.6273730577655596, + "learning_rate": 4.059840563575876e-05, + "loss": 1.9549, + "step": 3979 + }, + { + "epoch": 0.31, + "grad_norm": 0.5928283508554224, + "learning_rate": 4.059352350379051e-05, + "loss": 2.005, + "step": 3980 + }, + { + "epoch": 0.31, + "grad_norm": 0.6396703254040137, + "learning_rate": 4.058864039825452e-05, + "loss": 1.9481, + "step": 3981 + }, + { + "epoch": 0.31, + "grad_norm": 0.633288209867391, + "learning_rate": 4.058375631945568e-05, + "loss": 2.1884, + "step": 3982 + }, + { + "epoch": 0.31, + "grad_norm": 0.5722078112696001, + "learning_rate": 4.05788712676989e-05, + "loss": 1.9676, + "step": 3983 + }, + { + "epoch": 0.31, + "grad_norm": 0.673606589168583, + "learning_rate": 4.0573985243289195e-05, + "loss": 1.9897, + "step": 3984 + }, + { + "epoch": 0.31, + "grad_norm": 0.6106146849179724, + "learning_rate": 4.0569098246531615e-05, + "loss": 1.9488, + "step": 3985 + }, + { + "epoch": 0.31, + "grad_norm": 0.6830378494852114, + "learning_rate": 4.056421027773126e-05, + "loss": 2.1669, + "step": 3986 + }, + { + "epoch": 0.31, + "grad_norm": 0.5848723800476755, + "learning_rate": 4.055932133719333e-05, + "loss": 2.0048, + "step": 3987 + }, + { + "epoch": 0.31, + "grad_norm": 0.56868799365382, + "learning_rate": 4.0554431425223046e-05, + "loss": 1.9433, + "step": 3988 + }, + { + "epoch": 0.31, + "grad_norm": 0.6174129913622555, + "learning_rate": 4.054954054212571e-05, + "loss": 2.0008, + "step": 3989 + }, + { + "epoch": 0.31, + "grad_norm": 0.6788571167867056, + "learning_rate": 4.054464868820667e-05, + "loss": 2.1301, + "step": 3990 + }, + { + "epoch": 0.31, + "grad_norm": 0.5954529197305675, + "learning_rate": 4.0539755863771367e-05, + "loss": 1.9696, + "step": 3991 + }, + { + "epoch": 0.31, + "grad_norm": 0.747791661504132, + "learning_rate": 4.0534862069125254e-05, + "loss": 2.0345, + "step": 3992 + }, + { + "epoch": 0.31, + "grad_norm": 0.6381212657735547, + "learning_rate": 4.052996730457389e-05, + "loss": 1.9531, + "step": 3993 + }, + { + "epoch": 0.31, + "grad_norm": 0.7038711363901147, + "learning_rate": 4.052507157042287e-05, + "loss": 2.1784, + "step": 3994 + }, + { + "epoch": 0.31, + "grad_norm": 0.6146548233511422, + "learning_rate": 4.052017486697784e-05, + "loss": 1.9686, + "step": 3995 + }, + { + "epoch": 0.31, + "grad_norm": 0.6803219620205003, + "learning_rate": 4.0515277194544554e-05, + "loss": 1.9474, + "step": 3996 + }, + { + "epoch": 0.31, + "grad_norm": 0.6469660966486128, + "learning_rate": 4.051037855342876e-05, + "loss": 1.9724, + "step": 3997 + }, + { + "epoch": 0.31, + "grad_norm": 0.7130486245223812, + "learning_rate": 4.050547894393633e-05, + "loss": 2.1833, + "step": 3998 + }, + { + "epoch": 0.31, + "grad_norm": 0.6113273627151158, + "learning_rate": 4.0500578366373134e-05, + "loss": 1.9974, + "step": 3999 + }, + { + "epoch": 0.31, + "grad_norm": 0.7685298234374351, + "learning_rate": 4.049567682104516e-05, + "loss": 1.9503, + "step": 4000 + }, + { + "epoch": 0.31, + "grad_norm": 0.6208536724757183, + "learning_rate": 4.049077430825843e-05, + "loss": 2.0512, + "step": 4001 + }, + { + "epoch": 0.31, + "grad_norm": 0.8181386935703927, + "learning_rate": 4.048587082831901e-05, + "loss": 2.1764, + "step": 4002 + }, + { + "epoch": 0.31, + "grad_norm": 0.6101616068591584, + "learning_rate": 4.048096638153306e-05, + "loss": 1.9338, + "step": 4003 + }, + { + "epoch": 0.31, + "grad_norm": 0.6578592184641787, + "learning_rate": 4.047606096820679e-05, + "loss": 1.9506, + "step": 4004 + }, + { + "epoch": 0.31, + "grad_norm": 0.7672901417031481, + "learning_rate": 4.0471154588646455e-05, + "loss": 2.0373, + "step": 4005 + }, + { + "epoch": 0.31, + "grad_norm": 0.6933477230872233, + "learning_rate": 4.0466247243158376e-05, + "loss": 2.1908, + "step": 4006 + }, + { + "epoch": 0.31, + "grad_norm": 0.8540479356400489, + "learning_rate": 4.046133893204894e-05, + "loss": 1.9528, + "step": 4007 + }, + { + "epoch": 0.31, + "grad_norm": 0.6878251296685576, + "learning_rate": 4.04564296556246e-05, + "loss": 1.9557, + "step": 4008 + }, + { + "epoch": 0.31, + "grad_norm": 0.7105579766739942, + "learning_rate": 4.045151941419188e-05, + "loss": 1.9516, + "step": 4009 + }, + { + "epoch": 0.31, + "grad_norm": 0.8189345288667786, + "learning_rate": 4.044660820805731e-05, + "loss": 2.149, + "step": 4010 + }, + { + "epoch": 0.31, + "grad_norm": 0.5344511898656646, + "learning_rate": 4.044169603752753e-05, + "loss": 2.0342, + "step": 4011 + }, + { + "epoch": 0.31, + "grad_norm": 0.6863763187420581, + "learning_rate": 4.0436782902909234e-05, + "loss": 1.9208, + "step": 4012 + }, + { + "epoch": 0.31, + "grad_norm": 0.725220391489797, + "learning_rate": 4.043186880450916e-05, + "loss": 1.9492, + "step": 4013 + }, + { + "epoch": 0.31, + "grad_norm": 0.6705762837053068, + "learning_rate": 4.042695374263413e-05, + "loss": 2.2205, + "step": 4014 + }, + { + "epoch": 0.31, + "grad_norm": 0.8170922709142047, + "learning_rate": 4.0422037717591e-05, + "loss": 1.9958, + "step": 4015 + }, + { + "epoch": 0.31, + "grad_norm": 0.6541893670330549, + "learning_rate": 4.0417120729686694e-05, + "loss": 1.9424, + "step": 4016 + }, + { + "epoch": 0.31, + "grad_norm": 0.8970390296533451, + "learning_rate": 4.041220277922822e-05, + "loss": 2.0265, + "step": 4017 + }, + { + "epoch": 0.31, + "grad_norm": 0.7807026077532898, + "learning_rate": 4.040728386652261e-05, + "loss": 2.1844, + "step": 4018 + }, + { + "epoch": 0.31, + "grad_norm": 0.7573740902493367, + "learning_rate": 4.040236399187696e-05, + "loss": 2.013, + "step": 4019 + }, + { + "epoch": 0.31, + "grad_norm": 0.833450128425402, + "learning_rate": 4.039744315559846e-05, + "loss": 1.9287, + "step": 4020 + }, + { + "epoch": 0.31, + "grad_norm": 0.6706700585675034, + "learning_rate": 4.0392521357994315e-05, + "loss": 1.9206, + "step": 4021 + }, + { + "epoch": 0.31, + "grad_norm": 0.6587641498120164, + "learning_rate": 4.0387598599371844e-05, + "loss": 2.1812, + "step": 4022 + }, + { + "epoch": 0.31, + "grad_norm": 0.7913950435021273, + "learning_rate": 4.0382674880038377e-05, + "loss": 2.0243, + "step": 4023 + }, + { + "epoch": 0.31, + "grad_norm": 0.7083470022022188, + "learning_rate": 4.0377750200301325e-05, + "loss": 1.9828, + "step": 4024 + }, + { + "epoch": 0.31, + "grad_norm": 0.7614277294632309, + "learning_rate": 4.037282456046815e-05, + "loss": 1.9319, + "step": 4025 + }, + { + "epoch": 0.31, + "grad_norm": 0.7367759670715945, + "learning_rate": 4.036789796084638e-05, + "loss": 2.1518, + "step": 4026 + }, + { + "epoch": 0.31, + "grad_norm": 0.8088455260098338, + "learning_rate": 4.036297040174362e-05, + "loss": 1.9458, + "step": 4027 + }, + { + "epoch": 0.31, + "grad_norm": 0.6464907246155637, + "learning_rate": 4.035804188346749e-05, + "loss": 1.9363, + "step": 4028 + }, + { + "epoch": 0.31, + "grad_norm": 0.7437880248343192, + "learning_rate": 4.035311240632572e-05, + "loss": 2.0191, + "step": 4029 + }, + { + "epoch": 0.31, + "grad_norm": 0.7913735731501759, + "learning_rate": 4.0348181970626075e-05, + "loss": 2.1226, + "step": 4030 + }, + { + "epoch": 0.31, + "grad_norm": 0.6486981255868638, + "learning_rate": 4.0343250576676375e-05, + "loss": 1.9796, + "step": 4031 + }, + { + "epoch": 0.31, + "grad_norm": 0.6713058533336963, + "learning_rate": 4.03383182247845e-05, + "loss": 1.9512, + "step": 4032 + }, + { + "epoch": 0.31, + "grad_norm": 0.6447257924957461, + "learning_rate": 4.033338491525842e-05, + "loss": 1.9843, + "step": 4033 + }, + { + "epoch": 0.31, + "grad_norm": 0.6190412919800601, + "learning_rate": 4.032845064840612e-05, + "loss": 2.1511, + "step": 4034 + }, + { + "epoch": 0.31, + "grad_norm": 0.5563706820721747, + "learning_rate": 4.032351542453569e-05, + "loss": 1.9204, + "step": 4035 + }, + { + "epoch": 0.31, + "grad_norm": 0.6621936152872246, + "learning_rate": 4.0318579243955226e-05, + "loss": 2.0138, + "step": 4036 + }, + { + "epoch": 0.31, + "grad_norm": 0.5951986835296329, + "learning_rate": 4.0313642106972924e-05, + "loss": 1.8928, + "step": 4037 + }, + { + "epoch": 0.31, + "grad_norm": 0.7612010858730531, + "learning_rate": 4.0308704013897046e-05, + "loss": 2.1399, + "step": 4038 + }, + { + "epoch": 0.31, + "grad_norm": 0.6335425540729737, + "learning_rate": 4.030376496503589e-05, + "loss": 1.9681, + "step": 4039 + }, + { + "epoch": 0.31, + "grad_norm": 0.6512516302522636, + "learning_rate": 4.029882496069781e-05, + "loss": 1.8656, + "step": 4040 + }, + { + "epoch": 0.31, + "grad_norm": 0.6668516782976716, + "learning_rate": 4.029388400119124e-05, + "loss": 1.9722, + "step": 4041 + }, + { + "epoch": 0.31, + "grad_norm": 0.6543969273058352, + "learning_rate": 4.0288942086824666e-05, + "loss": 2.0603, + "step": 4042 + }, + { + "epoch": 0.31, + "grad_norm": 0.7095261547541843, + "learning_rate": 4.028399921790663e-05, + "loss": 2.111, + "step": 4043 + }, + { + "epoch": 0.31, + "grad_norm": 0.7275530170920769, + "learning_rate": 4.0279055394745735e-05, + "loss": 1.955, + "step": 4044 + }, + { + "epoch": 0.31, + "grad_norm": 0.6040600540830103, + "learning_rate": 4.027411061765064e-05, + "loss": 1.9669, + "step": 4045 + }, + { + "epoch": 0.31, + "grad_norm": 0.8112563656138503, + "learning_rate": 4.0269164886930075e-05, + "loss": 2.1463, + "step": 4046 + }, + { + "epoch": 0.31, + "grad_norm": 0.6998928681235164, + "learning_rate": 4.026421820289281e-05, + "loss": 1.9192, + "step": 4047 + }, + { + "epoch": 0.31, + "grad_norm": 0.765060021458267, + "learning_rate": 4.025927056584771e-05, + "loss": 2.0376, + "step": 4048 + }, + { + "epoch": 0.31, + "grad_norm": 0.6231500733024301, + "learning_rate": 4.025432197610365e-05, + "loss": 1.9835, + "step": 4049 + }, + { + "epoch": 0.31, + "grad_norm": 0.6408508241256927, + "learning_rate": 4.0249372433969604e-05, + "loss": 2.1321, + "step": 4050 + }, + { + "epoch": 0.31, + "grad_norm": 0.6238985287983541, + "learning_rate": 4.0244421939754596e-05, + "loss": 1.9963, + "step": 4051 + }, + { + "epoch": 0.31, + "grad_norm": 0.6009776238444348, + "learning_rate": 4.0239470493767704e-05, + "loss": 1.9228, + "step": 4052 + }, + { + "epoch": 0.31, + "grad_norm": 0.6494780339165497, + "learning_rate": 4.023451809631805e-05, + "loss": 1.9753, + "step": 4053 + }, + { + "epoch": 0.31, + "grad_norm": 0.5178308636558642, + "learning_rate": 4.022956474771486e-05, + "loss": 2.0482, + "step": 4054 + }, + { + "epoch": 0.31, + "grad_norm": 0.7089063311765201, + "learning_rate": 4.0224610448267374e-05, + "loss": 2.1219, + "step": 4055 + }, + { + "epoch": 0.31, + "grad_norm": 0.6881296990431022, + "learning_rate": 4.021965519828491e-05, + "loss": 1.9831, + "step": 4056 + }, + { + "epoch": 0.31, + "grad_norm": 0.5829662410557068, + "learning_rate": 4.021469899807685e-05, + "loss": 1.9113, + "step": 4057 + }, + { + "epoch": 0.31, + "grad_norm": 0.7235164656320588, + "learning_rate": 4.020974184795262e-05, + "loss": 2.1464, + "step": 4058 + }, + { + "epoch": 0.31, + "grad_norm": 0.613069299045964, + "learning_rate": 4.020478374822174e-05, + "loss": 1.9108, + "step": 4059 + }, + { + "epoch": 0.31, + "grad_norm": 0.566183753466578, + "learning_rate": 4.019982469919374e-05, + "loss": 2.0192, + "step": 4060 + }, + { + "epoch": 0.31, + "grad_norm": 0.6621362930446608, + "learning_rate": 4.0194864701178236e-05, + "loss": 1.9858, + "step": 4061 + }, + { + "epoch": 0.31, + "grad_norm": 0.5832710915149316, + "learning_rate": 4.0189903754484916e-05, + "loss": 2.134, + "step": 4062 + }, + { + "epoch": 0.31, + "grad_norm": 0.5300774396538228, + "learning_rate": 4.0184941859423495e-05, + "loss": 1.9766, + "step": 4063 + }, + { + "epoch": 0.31, + "grad_norm": 0.594071852916594, + "learning_rate": 4.017997901630378e-05, + "loss": 1.9055, + "step": 4064 + }, + { + "epoch": 0.31, + "grad_norm": 0.6664524059594977, + "learning_rate": 4.0175015225435605e-05, + "loss": 1.9479, + "step": 4065 + }, + { + "epoch": 0.31, + "grad_norm": 0.5964200763707886, + "learning_rate": 4.01700504871289e-05, + "loss": 2.1945, + "step": 4066 + }, + { + "epoch": 0.31, + "grad_norm": 0.6767975939423628, + "learning_rate": 4.016508480169361e-05, + "loss": 2.0171, + "step": 4067 + }, + { + "epoch": 0.31, + "grad_norm": 0.6941159025302919, + "learning_rate": 4.0160118169439784e-05, + "loss": 1.9541, + "step": 4068 + }, + { + "epoch": 0.31, + "grad_norm": 0.6975722077131571, + "learning_rate": 4.0155150590677505e-05, + "loss": 1.931, + "step": 4069 + }, + { + "epoch": 0.31, + "grad_norm": 0.6624950585239348, + "learning_rate": 4.015018206571691e-05, + "loss": 2.149, + "step": 4070 + }, + { + "epoch": 0.31, + "grad_norm": 0.6328763987781628, + "learning_rate": 4.0145212594868206e-05, + "loss": 1.9379, + "step": 4071 + }, + { + "epoch": 0.31, + "grad_norm": 0.6509446988740296, + "learning_rate": 4.014024217844167e-05, + "loss": 1.9456, + "step": 4072 + }, + { + "epoch": 0.31, + "grad_norm": 0.701774793259767, + "learning_rate": 4.0135270816747616e-05, + "loss": 1.9844, + "step": 4073 + }, + { + "epoch": 0.31, + "grad_norm": 0.5660466799537208, + "learning_rate": 4.013029851009642e-05, + "loss": 1.9132, + "step": 4074 + }, + { + "epoch": 0.31, + "grad_norm": 0.9094343887196557, + "learning_rate": 4.012532525879854e-05, + "loss": 2.2041, + "step": 4075 + }, + { + "epoch": 0.31, + "grad_norm": 0.6423610864906237, + "learning_rate": 4.012035106316446e-05, + "loss": 1.9683, + "step": 4076 + }, + { + "epoch": 0.31, + "grad_norm": 0.7225926440714548, + "learning_rate": 4.011537592350475e-05, + "loss": 1.9301, + "step": 4077 + }, + { + "epoch": 0.31, + "grad_norm": 0.5885874847862682, + "learning_rate": 4.0110399840130034e-05, + "loss": 2.1384, + "step": 4078 + }, + { + "epoch": 0.31, + "grad_norm": 0.591280796075023, + "learning_rate": 4.010542281335097e-05, + "loss": 2.0321, + "step": 4079 + }, + { + "epoch": 0.31, + "grad_norm": 0.6572687791010098, + "learning_rate": 4.0100444843478316e-05, + "loss": 1.9494, + "step": 4080 + }, + { + "epoch": 0.31, + "grad_norm": 0.7116668405227692, + "learning_rate": 4.009546593082284e-05, + "loss": 1.9259, + "step": 4081 + }, + { + "epoch": 0.31, + "grad_norm": 0.6647111284321758, + "learning_rate": 4.0090486075695425e-05, + "loss": 2.181, + "step": 4082 + }, + { + "epoch": 0.31, + "grad_norm": 0.5488426282994372, + "learning_rate": 4.0085505278406965e-05, + "loss": 1.9043, + "step": 4083 + }, + { + "epoch": 0.32, + "grad_norm": 0.6767314701789492, + "learning_rate": 4.0080523539268445e-05, + "loss": 1.9771, + "step": 4084 + }, + { + "epoch": 0.32, + "grad_norm": 0.5424138911592522, + "learning_rate": 4.0075540858590883e-05, + "loss": 2.0103, + "step": 4085 + }, + { + "epoch": 0.32, + "grad_norm": 0.6156755671808553, + "learning_rate": 4.007055723668538e-05, + "loss": 1.9421, + "step": 4086 + }, + { + "epoch": 0.32, + "grad_norm": 0.5763372860269943, + "learning_rate": 4.006557267386306e-05, + "loss": 2.1497, + "step": 4087 + }, + { + "epoch": 0.32, + "grad_norm": 0.5724485197989426, + "learning_rate": 4.006058717043516e-05, + "loss": 1.9518, + "step": 4088 + }, + { + "epoch": 0.32, + "grad_norm": 0.6776327466516002, + "learning_rate": 4.005560072671293e-05, + "loss": 1.9684, + "step": 4089 + }, + { + "epoch": 0.32, + "grad_norm": 0.5891820338573822, + "learning_rate": 4.00506133430077e-05, + "loss": 2.1816, + "step": 4090 + }, + { + "epoch": 0.32, + "grad_norm": 0.5833905310001206, + "learning_rate": 4.004562501963085e-05, + "loss": 2.0274, + "step": 4091 + }, + { + "epoch": 0.32, + "grad_norm": 0.5584142273382847, + "learning_rate": 4.0040635756893815e-05, + "loss": 1.9575, + "step": 4092 + }, + { + "epoch": 0.32, + "grad_norm": 0.6308651916216067, + "learning_rate": 4.00356455551081e-05, + "loss": 1.9465, + "step": 4093 + }, + { + "epoch": 0.32, + "grad_norm": 0.7290515560126466, + "learning_rate": 4.0030654414585274e-05, + "loss": 2.1995, + "step": 4094 + }, + { + "epoch": 0.32, + "grad_norm": 0.5546197178194088, + "learning_rate": 4.0025662335636935e-05, + "loss": 1.9544, + "step": 4095 + }, + { + "epoch": 0.32, + "grad_norm": 0.7673565137874488, + "learning_rate": 4.002066931857478e-05, + "loss": 1.9614, + "step": 4096 + }, + { + "epoch": 0.32, + "grad_norm": 0.6278296844606506, + "learning_rate": 4.001567536371053e-05, + "loss": 1.976, + "step": 4097 + }, + { + "epoch": 0.32, + "grad_norm": 0.726820248686488, + "learning_rate": 4.001068047135598e-05, + "loss": 2.0045, + "step": 4098 + }, + { + "epoch": 0.32, + "grad_norm": 0.7171819765104391, + "learning_rate": 4.000568464182298e-05, + "loss": 2.174, + "step": 4099 + }, + { + "epoch": 0.32, + "grad_norm": 0.6500659507829263, + "learning_rate": 4.0000687875423434e-05, + "loss": 1.9581, + "step": 4100 + }, + { + "epoch": 0.32, + "grad_norm": 0.6972264706245118, + "learning_rate": 3.999569017246934e-05, + "loss": 1.9549, + "step": 4101 + }, + { + "epoch": 0.32, + "grad_norm": 0.5898157219807859, + "learning_rate": 3.99906915332727e-05, + "loss": 2.1611, + "step": 4102 + }, + { + "epoch": 0.32, + "grad_norm": 0.6506849130612727, + "learning_rate": 3.998569195814559e-05, + "loss": 1.9515, + "step": 4103 + }, + { + "epoch": 0.32, + "grad_norm": 0.6667674551883965, + "learning_rate": 3.998069144740018e-05, + "loss": 2.0535, + "step": 4104 + }, + { + "epoch": 0.32, + "grad_norm": 0.6384655034348754, + "learning_rate": 3.9975690001348656e-05, + "loss": 1.9372, + "step": 4105 + }, + { + "epoch": 0.32, + "grad_norm": 0.7788984270879734, + "learning_rate": 3.997068762030328e-05, + "loss": 2.0143, + "step": 4106 + }, + { + "epoch": 0.32, + "grad_norm": 0.563719882087236, + "learning_rate": 3.996568430457639e-05, + "loss": 2.1671, + "step": 4107 + }, + { + "epoch": 0.32, + "grad_norm": 0.7170814891339737, + "learning_rate": 3.9960680054480336e-05, + "loss": 1.9732, + "step": 4108 + }, + { + "epoch": 0.32, + "grad_norm": 0.6033591562490503, + "learning_rate": 3.995567487032756e-05, + "loss": 1.9629, + "step": 4109 + }, + { + "epoch": 0.32, + "grad_norm": 0.5668281568158399, + "learning_rate": 3.9950668752430584e-05, + "loss": 1.9932, + "step": 4110 + }, + { + "epoch": 0.32, + "grad_norm": 0.7483063348298942, + "learning_rate": 3.994566170110192e-05, + "loss": 2.1169, + "step": 4111 + }, + { + "epoch": 0.32, + "grad_norm": 0.6168689571475807, + "learning_rate": 3.994065371665421e-05, + "loss": 1.9824, + "step": 4112 + }, + { + "epoch": 0.32, + "grad_norm": 0.7382031205135126, + "learning_rate": 3.99356447994001e-05, + "loss": 1.9817, + "step": 4113 + }, + { + "epoch": 0.32, + "grad_norm": 0.8086618970629478, + "learning_rate": 3.993063494965234e-05, + "loss": 2.1664, + "step": 4114 + }, + { + "epoch": 0.32, + "grad_norm": 0.639274337963226, + "learning_rate": 3.9925624167723686e-05, + "loss": 2.0025, + "step": 4115 + }, + { + "epoch": 0.32, + "grad_norm": 0.7961330145395729, + "learning_rate": 3.9920612453927014e-05, + "loss": 2.0132, + "step": 4116 + }, + { + "epoch": 0.32, + "grad_norm": 0.6386178915926113, + "learning_rate": 3.9915599808575204e-05, + "loss": 1.9651, + "step": 4117 + }, + { + "epoch": 0.32, + "grad_norm": 0.716198739890325, + "learning_rate": 3.991058623198123e-05, + "loss": 1.9417, + "step": 4118 + }, + { + "epoch": 0.32, + "grad_norm": 0.8806786422332284, + "learning_rate": 3.99055717244581e-05, + "loss": 2.2179, + "step": 4119 + }, + { + "epoch": 0.32, + "grad_norm": 0.6563777969517502, + "learning_rate": 3.990055628631889e-05, + "loss": 1.9123, + "step": 4120 + }, + { + "epoch": 0.32, + "grad_norm": 0.8316082380364341, + "learning_rate": 3.9895539917876734e-05, + "loss": 2.0081, + "step": 4121 + }, + { + "epoch": 0.32, + "grad_norm": 0.6531517471364706, + "learning_rate": 3.989052261944484e-05, + "loss": 2.1482, + "step": 4122 + }, + { + "epoch": 0.32, + "grad_norm": 0.9491221114104248, + "learning_rate": 3.9885504391336446e-05, + "loss": 2.0655, + "step": 4123 + }, + { + "epoch": 0.32, + "grad_norm": 0.5969032576001848, + "learning_rate": 3.988048523386485e-05, + "loss": 1.9306, + "step": 4124 + }, + { + "epoch": 0.32, + "grad_norm": 0.6930020906973378, + "learning_rate": 3.987546514734345e-05, + "loss": 1.8648, + "step": 4125 + }, + { + "epoch": 0.32, + "grad_norm": 0.7359572301489702, + "learning_rate": 3.9870444132085624e-05, + "loss": 2.1703, + "step": 4126 + }, + { + "epoch": 0.32, + "grad_norm": 0.6229532862650323, + "learning_rate": 3.98654221884049e-05, + "loss": 1.9508, + "step": 4127 + }, + { + "epoch": 0.32, + "grad_norm": 0.6771936719024844, + "learning_rate": 3.9860399316614796e-05, + "loss": 1.9391, + "step": 4128 + }, + { + "epoch": 0.32, + "grad_norm": 0.6030715902413544, + "learning_rate": 3.985537551702892e-05, + "loss": 2.0164, + "step": 4129 + }, + { + "epoch": 0.32, + "grad_norm": 0.5462154005934228, + "learning_rate": 3.9850350789960915e-05, + "loss": 1.9326, + "step": 4130 + }, + { + "epoch": 0.32, + "grad_norm": 0.618767830556047, + "learning_rate": 3.984532513572451e-05, + "loss": 2.1289, + "step": 4131 + }, + { + "epoch": 0.32, + "grad_norm": 0.568389126318579, + "learning_rate": 3.984029855463347e-05, + "loss": 1.9101, + "step": 4132 + }, + { + "epoch": 0.32, + "grad_norm": 0.6222226575696356, + "learning_rate": 3.983527104700162e-05, + "loss": 1.968, + "step": 4133 + }, + { + "epoch": 0.32, + "grad_norm": 0.6016777030481472, + "learning_rate": 3.9830242613142856e-05, + "loss": 2.1575, + "step": 4134 + }, + { + "epoch": 0.32, + "grad_norm": 0.6511502054969768, + "learning_rate": 3.982521325337112e-05, + "loss": 2.0416, + "step": 4135 + }, + { + "epoch": 0.32, + "grad_norm": 0.6534986331541173, + "learning_rate": 3.982018296800042e-05, + "loss": 1.9462, + "step": 4136 + }, + { + "epoch": 0.32, + "grad_norm": 0.6222479396519502, + "learning_rate": 3.981515175734481e-05, + "loss": 1.9251, + "step": 4137 + }, + { + "epoch": 0.32, + "grad_norm": 0.6610825183974631, + "learning_rate": 3.981011962171842e-05, + "loss": 1.9168, + "step": 4138 + }, + { + "epoch": 0.32, + "grad_norm": 0.6508214780388522, + "learning_rate": 3.9805086561435424e-05, + "loss": 2.1497, + "step": 4139 + }, + { + "epoch": 0.32, + "grad_norm": 0.6701706073600894, + "learning_rate": 3.9800052576810044e-05, + "loss": 1.9367, + "step": 4140 + }, + { + "epoch": 0.32, + "grad_norm": 0.6906092533384679, + "learning_rate": 3.9795017668156584e-05, + "loss": 2.0669, + "step": 4141 + }, + { + "epoch": 0.32, + "grad_norm": 0.6498540114859206, + "learning_rate": 3.978998183578939e-05, + "loss": 2.0183, + "step": 4142 + }, + { + "epoch": 0.32, + "grad_norm": 0.6682406019479503, + "learning_rate": 3.978494508002287e-05, + "loss": 2.1736, + "step": 4143 + }, + { + "epoch": 0.32, + "grad_norm": 0.5991606630291433, + "learning_rate": 3.97799074011715e-05, + "loss": 1.9409, + "step": 4144 + }, + { + "epoch": 0.32, + "grad_norm": 0.679494942795775, + "learning_rate": 3.977486879954979e-05, + "loss": 1.9034, + "step": 4145 + }, + { + "epoch": 0.32, + "grad_norm": 0.6458545376487609, + "learning_rate": 3.976982927547232e-05, + "loss": 2.1579, + "step": 4146 + }, + { + "epoch": 0.32, + "grad_norm": 0.5785166783159018, + "learning_rate": 3.976478882925373e-05, + "loss": 2.0051, + "step": 4147 + }, + { + "epoch": 0.32, + "grad_norm": 0.6818663488915552, + "learning_rate": 3.975974746120872e-05, + "loss": 1.9113, + "step": 4148 + }, + { + "epoch": 0.32, + "grad_norm": 0.6200890306013992, + "learning_rate": 3.975470517165205e-05, + "loss": 1.9861, + "step": 4149 + }, + { + "epoch": 0.32, + "grad_norm": 0.6172356514311024, + "learning_rate": 3.974966196089851e-05, + "loss": 2.0036, + "step": 4150 + }, + { + "epoch": 0.32, + "grad_norm": 0.6667376797902171, + "learning_rate": 3.974461782926299e-05, + "loss": 2.1371, + "step": 4151 + }, + { + "epoch": 0.32, + "grad_norm": 0.5861910524732865, + "learning_rate": 3.97395727770604e-05, + "loss": 1.9552, + "step": 4152 + }, + { + "epoch": 0.32, + "grad_norm": 0.5958368244158986, + "learning_rate": 3.973452680460574e-05, + "loss": 1.9543, + "step": 4153 + }, + { + "epoch": 0.32, + "grad_norm": 0.5202759880150478, + "learning_rate": 3.972947991221403e-05, + "loss": 2.0163, + "step": 4154 + }, + { + "epoch": 0.32, + "grad_norm": 0.7141005077994359, + "learning_rate": 3.972443210020038e-05, + "loss": 2.1162, + "step": 4155 + }, + { + "epoch": 0.32, + "grad_norm": 0.6307918856221736, + "learning_rate": 3.9719383368879946e-05, + "loss": 1.924, + "step": 4156 + }, + { + "epoch": 0.32, + "grad_norm": 0.6467399845723995, + "learning_rate": 3.971433371856794e-05, + "loss": 1.9376, + "step": 4157 + }, + { + "epoch": 0.32, + "grad_norm": 0.6559244853490612, + "learning_rate": 3.970928314957963e-05, + "loss": 1.9626, + "step": 4158 + }, + { + "epoch": 0.32, + "grad_norm": 0.6787886862581384, + "learning_rate": 3.970423166223035e-05, + "loss": 2.1393, + "step": 4159 + }, + { + "epoch": 0.32, + "grad_norm": 0.6743291378837947, + "learning_rate": 3.9699179256835484e-05, + "loss": 2.0393, + "step": 4160 + }, + { + "epoch": 0.32, + "grad_norm": 0.6788849442685472, + "learning_rate": 3.9694125933710464e-05, + "loss": 1.9498, + "step": 4161 + }, + { + "epoch": 0.32, + "grad_norm": 0.689141391814546, + "learning_rate": 3.96890716931708e-05, + "loss": 1.9654, + "step": 4162 + }, + { + "epoch": 0.32, + "grad_norm": 0.8747882560307242, + "learning_rate": 3.968401653553204e-05, + "loss": 2.1416, + "step": 4163 + }, + { + "epoch": 0.32, + "grad_norm": 0.5828133104793877, + "learning_rate": 3.9678960461109816e-05, + "loss": 1.9815, + "step": 4164 + }, + { + "epoch": 0.32, + "grad_norm": 0.6578941699912043, + "learning_rate": 3.967390347021978e-05, + "loss": 1.9491, + "step": 4165 + }, + { + "epoch": 0.32, + "grad_norm": 0.6690431374311795, + "learning_rate": 3.966884556317767e-05, + "loss": 2.0695, + "step": 4166 + }, + { + "epoch": 0.32, + "grad_norm": 0.5929844733012585, + "learning_rate": 3.966378674029927e-05, + "loss": 2.1407, + "step": 4167 + }, + { + "epoch": 0.32, + "grad_norm": 0.736529336430372, + "learning_rate": 3.965872700190042e-05, + "loss": 1.9762, + "step": 4168 + }, + { + "epoch": 0.32, + "grad_norm": 0.6092250471496666, + "learning_rate": 3.9653666348297024e-05, + "loss": 1.8881, + "step": 4169 + }, + { + "epoch": 0.32, + "grad_norm": 0.6962698082383253, + "learning_rate": 3.964860477980504e-05, + "loss": 2.0023, + "step": 4170 + }, + { + "epoch": 0.32, + "grad_norm": 0.574335469008907, + "learning_rate": 3.9643542296740486e-05, + "loss": 2.142, + "step": 4171 + }, + { + "epoch": 0.32, + "grad_norm": 0.685396584835887, + "learning_rate": 3.963847889941943e-05, + "loss": 2.0115, + "step": 4172 + }, + { + "epoch": 0.32, + "grad_norm": 0.5942806629623203, + "learning_rate": 3.9633414588158e-05, + "loss": 1.9602, + "step": 4173 + }, + { + "epoch": 0.32, + "grad_norm": 0.6221320088094293, + "learning_rate": 3.9628349363272375e-05, + "loss": 1.9347, + "step": 4174 + }, + { + "epoch": 0.32, + "grad_norm": 0.6190268661794806, + "learning_rate": 3.962328322507881e-05, + "loss": 2.1584, + "step": 4175 + }, + { + "epoch": 0.32, + "grad_norm": 0.5854184979955704, + "learning_rate": 3.96182161738936e-05, + "loss": 1.9456, + "step": 4176 + }, + { + "epoch": 0.32, + "grad_norm": 0.6761812111916473, + "learning_rate": 3.961314821003309e-05, + "loss": 1.9894, + "step": 4177 + }, + { + "epoch": 0.32, + "grad_norm": 0.6677913233325974, + "learning_rate": 3.960807933381372e-05, + "loss": 2.0588, + "step": 4178 + }, + { + "epoch": 0.32, + "grad_norm": 0.6302217176351721, + "learning_rate": 3.960300954555194e-05, + "loss": 2.1687, + "step": 4179 + }, + { + "epoch": 0.32, + "grad_norm": 0.6817323947918734, + "learning_rate": 3.959793884556428e-05, + "loss": 2.017, + "step": 4180 + }, + { + "epoch": 0.32, + "grad_norm": 0.5683292709927948, + "learning_rate": 3.959286723416733e-05, + "loss": 1.92, + "step": 4181 + }, + { + "epoch": 0.32, + "grad_norm": 0.7522464788947631, + "learning_rate": 3.958779471167773e-05, + "loss": 1.9146, + "step": 4182 + }, + { + "epoch": 0.32, + "grad_norm": 0.5944227507167528, + "learning_rate": 3.958272127841218e-05, + "loss": 2.1575, + "step": 4183 + }, + { + "epoch": 0.32, + "grad_norm": 0.611117292103314, + "learning_rate": 3.957764693468743e-05, + "loss": 1.9312, + "step": 4184 + }, + { + "epoch": 0.32, + "grad_norm": 0.5486262008145932, + "learning_rate": 3.95725716808203e-05, + "loss": 2.055, + "step": 4185 + }, + { + "epoch": 0.32, + "grad_norm": 0.5441369815693254, + "learning_rate": 3.956749551712765e-05, + "loss": 1.9596, + "step": 4186 + }, + { + "epoch": 0.32, + "grad_norm": 0.7913050312491605, + "learning_rate": 3.9562418443926405e-05, + "loss": 2.1651, + "step": 4187 + }, + { + "epoch": 0.32, + "grad_norm": 0.5654104549632992, + "learning_rate": 3.9557340461533566e-05, + "loss": 1.9722, + "step": 4188 + }, + { + "epoch": 0.32, + "grad_norm": 0.6621842290953107, + "learning_rate": 3.955226157026615e-05, + "loss": 1.9271, + "step": 4189 + }, + { + "epoch": 0.32, + "grad_norm": 0.5553873429162748, + "learning_rate": 3.954718177044126e-05, + "loss": 1.975, + "step": 4190 + }, + { + "epoch": 0.32, + "grad_norm": 0.5891143605088437, + "learning_rate": 3.954210106237606e-05, + "loss": 2.21, + "step": 4191 + }, + { + "epoch": 0.32, + "grad_norm": 0.5878468753216854, + "learning_rate": 3.953701944638775e-05, + "loss": 1.959, + "step": 4192 + }, + { + "epoch": 0.32, + "grad_norm": 0.5670275086268481, + "learning_rate": 3.953193692279359e-05, + "loss": 1.9134, + "step": 4193 + }, + { + "epoch": 0.32, + "grad_norm": 0.6460113726730338, + "learning_rate": 3.95268534919109e-05, + "loss": 1.9846, + "step": 4194 + }, + { + "epoch": 0.32, + "grad_norm": 0.616892812174001, + "learning_rate": 3.9521769154057085e-05, + "loss": 2.0986, + "step": 4195 + }, + { + "epoch": 0.32, + "grad_norm": 0.5844693900956399, + "learning_rate": 3.9516683909549556e-05, + "loss": 1.9758, + "step": 4196 + }, + { + "epoch": 0.32, + "grad_norm": 0.7695225825371955, + "learning_rate": 3.9511597758705823e-05, + "loss": 2.0918, + "step": 4197 + }, + { + "epoch": 0.32, + "grad_norm": 0.6108845788741807, + "learning_rate": 3.9506510701843415e-05, + "loss": 1.9384, + "step": 4198 + }, + { + "epoch": 0.32, + "grad_norm": 0.7358984466156699, + "learning_rate": 3.9501422739279956e-05, + "loss": 2.1465, + "step": 4199 + }, + { + "epoch": 0.32, + "grad_norm": 0.5858954601419409, + "learning_rate": 3.94963338713331e-05, + "loss": 1.9532, + "step": 4200 + }, + { + "epoch": 0.32, + "grad_norm": 0.6010029902198067, + "learning_rate": 3.9491244098320575e-05, + "loss": 1.9341, + "step": 4201 + }, + { + "epoch": 0.32, + "grad_norm": 0.7187548294193934, + "learning_rate": 3.948615342056014e-05, + "loss": 2.0105, + "step": 4202 + }, + { + "epoch": 0.32, + "grad_norm": 0.7410447610254437, + "learning_rate": 3.948106183836964e-05, + "loss": 2.237, + "step": 4203 + }, + { + "epoch": 0.32, + "grad_norm": 0.773160094178751, + "learning_rate": 3.947596935206697e-05, + "loss": 1.9106, + "step": 4204 + }, + { + "epoch": 0.32, + "grad_norm": 0.7089167731289175, + "learning_rate": 3.9470875961970046e-05, + "loss": 1.9779, + "step": 4205 + }, + { + "epoch": 0.32, + "grad_norm": 0.6271017327688269, + "learning_rate": 3.94657816683969e-05, + "loss": 1.9877, + "step": 4206 + }, + { + "epoch": 0.32, + "grad_norm": 0.7965188562601738, + "learning_rate": 3.9460686471665575e-05, + "loss": 2.1462, + "step": 4207 + }, + { + "epoch": 0.32, + "grad_norm": 0.5556564766072845, + "learning_rate": 3.945559037209419e-05, + "loss": 1.9445, + "step": 4208 + }, + { + "epoch": 0.32, + "grad_norm": 0.6317411375553889, + "learning_rate": 3.945049337000091e-05, + "loss": 2.0155, + "step": 4209 + }, + { + "epoch": 0.32, + "grad_norm": 0.6331097891280774, + "learning_rate": 3.944539546570396e-05, + "loss": 1.9554, + "step": 4210 + }, + { + "epoch": 0.32, + "grad_norm": 0.5788739374701954, + "learning_rate": 3.944029665952163e-05, + "loss": 2.1392, + "step": 4211 + }, + { + "epoch": 0.32, + "grad_norm": 0.5807134202646161, + "learning_rate": 3.943519695177226e-05, + "loss": 1.9856, + "step": 4212 + }, + { + "epoch": 0.33, + "grad_norm": 0.5754240782281697, + "learning_rate": 3.943009634277425e-05, + "loss": 1.9365, + "step": 4213 + }, + { + "epoch": 0.33, + "grad_norm": 0.6038530614824176, + "learning_rate": 3.942499483284603e-05, + "loss": 1.9384, + "step": 4214 + }, + { + "epoch": 0.33, + "grad_norm": 0.6271145163137049, + "learning_rate": 3.9419892422306135e-05, + "loss": 2.1352, + "step": 4215 + }, + { + "epoch": 0.33, + "grad_norm": 0.5607835222965799, + "learning_rate": 3.941478911147312e-05, + "loss": 2.0255, + "step": 4216 + }, + { + "epoch": 0.33, + "grad_norm": 0.6467699712595358, + "learning_rate": 3.940968490066559e-05, + "loss": 1.9653, + "step": 4217 + }, + { + "epoch": 0.33, + "grad_norm": 0.5785419738591893, + "learning_rate": 3.940457979020224e-05, + "loss": 1.9432, + "step": 4218 + }, + { + "epoch": 0.33, + "grad_norm": 0.6162449185767824, + "learning_rate": 3.93994737804018e-05, + "loss": 2.2019, + "step": 4219 + }, + { + "epoch": 0.33, + "grad_norm": 0.693963652144824, + "learning_rate": 3.9394366871583056e-05, + "loss": 1.9089, + "step": 4220 + }, + { + "epoch": 0.33, + "grad_norm": 0.6798087583273534, + "learning_rate": 3.938925906406486e-05, + "loss": 1.9198, + "step": 4221 + }, + { + "epoch": 0.33, + "grad_norm": 0.5918439343952753, + "learning_rate": 3.9384150358166104e-05, + "loss": 2.0332, + "step": 4222 + }, + { + "epoch": 0.33, + "grad_norm": 0.8535466206240492, + "learning_rate": 3.937904075420575e-05, + "loss": 2.1347, + "step": 4223 + }, + { + "epoch": 0.33, + "grad_norm": 0.7366642374484893, + "learning_rate": 3.937393025250281e-05, + "loss": 2.0028, + "step": 4224 + }, + { + "epoch": 0.33, + "grad_norm": 0.7072022369107425, + "learning_rate": 3.936881885337636e-05, + "loss": 1.9384, + "step": 4225 + }, + { + "epoch": 0.33, + "grad_norm": 0.7737388429356021, + "learning_rate": 3.9363706557145517e-05, + "loss": 1.9621, + "step": 4226 + }, + { + "epoch": 0.33, + "grad_norm": 0.6076327274112083, + "learning_rate": 3.935859336412946e-05, + "loss": 2.1192, + "step": 4227 + }, + { + "epoch": 0.33, + "grad_norm": 0.6815344495596531, + "learning_rate": 3.935347927464744e-05, + "loss": 2.0016, + "step": 4228 + }, + { + "epoch": 0.33, + "grad_norm": 0.6615853294729421, + "learning_rate": 3.934836428901875e-05, + "loss": 1.9588, + "step": 4229 + }, + { + "epoch": 0.33, + "grad_norm": 0.6235311926772648, + "learning_rate": 3.9343248407562714e-05, + "loss": 1.9616, + "step": 4230 + }, + { + "epoch": 0.33, + "grad_norm": 0.6510289824220373, + "learning_rate": 3.933813163059877e-05, + "loss": 2.1426, + "step": 4231 + }, + { + "epoch": 0.33, + "grad_norm": 0.6026470396838804, + "learning_rate": 3.933301395844636e-05, + "loss": 1.9037, + "step": 4232 + }, + { + "epoch": 0.33, + "grad_norm": 0.6478324745573237, + "learning_rate": 3.932789539142501e-05, + "loss": 1.9229, + "step": 4233 + }, + { + "epoch": 0.33, + "grad_norm": 0.6088147320383938, + "learning_rate": 3.932277592985429e-05, + "loss": 2.0331, + "step": 4234 + }, + { + "epoch": 0.33, + "grad_norm": 0.5955612231736747, + "learning_rate": 3.9317655574053834e-05, + "loss": 2.1625, + "step": 4235 + }, + { + "epoch": 0.33, + "grad_norm": 0.7205104638984847, + "learning_rate": 3.9312534324343315e-05, + "loss": 1.9806, + "step": 4236 + }, + { + "epoch": 0.33, + "grad_norm": 0.5683236566141067, + "learning_rate": 3.930741218104248e-05, + "loss": 1.8869, + "step": 4237 + }, + { + "epoch": 0.33, + "grad_norm": 0.6544598333093086, + "learning_rate": 3.9302289144471133e-05, + "loss": 1.9651, + "step": 4238 + }, + { + "epoch": 0.33, + "grad_norm": 0.6059811547721828, + "learning_rate": 3.9297165214949116e-05, + "loss": 2.1661, + "step": 4239 + }, + { + "epoch": 0.33, + "grad_norm": 0.6112877250749854, + "learning_rate": 3.929204039279634e-05, + "loss": 2.0016, + "step": 4240 + }, + { + "epoch": 0.33, + "grad_norm": 0.6280721444993135, + "learning_rate": 3.9286914678332766e-05, + "loss": 1.9413, + "step": 4241 + }, + { + "epoch": 0.33, + "grad_norm": 0.6559420218860966, + "learning_rate": 3.9281788071878424e-05, + "loss": 1.9214, + "step": 4242 + }, + { + "epoch": 0.33, + "grad_norm": 0.7951705025332012, + "learning_rate": 3.927666057375338e-05, + "loss": 2.1691, + "step": 4243 + }, + { + "epoch": 0.33, + "grad_norm": 0.6606755004220511, + "learning_rate": 3.9271532184277756e-05, + "loss": 1.9617, + "step": 4244 + }, + { + "epoch": 0.33, + "grad_norm": 0.6690951439912823, + "learning_rate": 3.926640290377176e-05, + "loss": 1.907, + "step": 4245 + }, + { + "epoch": 0.33, + "grad_norm": 0.5447368601992475, + "learning_rate": 3.926127273255562e-05, + "loss": 1.8932, + "step": 4246 + }, + { + "epoch": 0.33, + "grad_norm": 0.6504429514132948, + "learning_rate": 3.925614167094963e-05, + "loss": 2.1912, + "step": 4247 + }, + { + "epoch": 0.33, + "grad_norm": 0.6345062244363715, + "learning_rate": 3.925100971927415e-05, + "loss": 1.9291, + "step": 4248 + }, + { + "epoch": 0.33, + "grad_norm": 0.5416326667971353, + "learning_rate": 3.92458768778496e-05, + "loss": 1.9463, + "step": 4249 + }, + { + "epoch": 0.33, + "grad_norm": 0.6624254902130771, + "learning_rate": 3.9240743146996425e-05, + "loss": 1.9344, + "step": 4250 + }, + { + "epoch": 0.33, + "grad_norm": 0.654982927006778, + "learning_rate": 3.9235608527035165e-05, + "loss": 2.1271, + "step": 4251 + }, + { + "epoch": 0.33, + "grad_norm": 0.6756192245296007, + "learning_rate": 3.9230473018286373e-05, + "loss": 1.9666, + "step": 4252 + }, + { + "epoch": 0.33, + "grad_norm": 0.6637202496845914, + "learning_rate": 3.922533662107068e-05, + "loss": 2.025, + "step": 4253 + }, + { + "epoch": 0.33, + "grad_norm": 0.7659217657022728, + "learning_rate": 3.92201993357088e-05, + "loss": 1.9497, + "step": 4254 + }, + { + "epoch": 0.33, + "grad_norm": 0.6193462346349753, + "learning_rate": 3.921506116252145e-05, + "loss": 2.1023, + "step": 4255 + }, + { + "epoch": 0.33, + "grad_norm": 0.7100720856542366, + "learning_rate": 3.920992210182944e-05, + "loss": 1.9934, + "step": 4256 + }, + { + "epoch": 0.33, + "grad_norm": 0.5583371440301395, + "learning_rate": 3.920478215395361e-05, + "loss": 1.9546, + "step": 4257 + }, + { + "epoch": 0.33, + "grad_norm": 0.7031087404109865, + "learning_rate": 3.919964131921487e-05, + "loss": 1.9732, + "step": 4258 + }, + { + "epoch": 0.33, + "grad_norm": 0.6378499759337778, + "learning_rate": 3.919449959793421e-05, + "loss": 2.0445, + "step": 4259 + }, + { + "epoch": 0.33, + "grad_norm": 0.6315181914154472, + "learning_rate": 3.918935699043261e-05, + "loss": 2.1002, + "step": 4260 + }, + { + "epoch": 0.33, + "grad_norm": 0.6398218696219451, + "learning_rate": 3.918421349703117e-05, + "loss": 1.9075, + "step": 4261 + }, + { + "epoch": 0.33, + "grad_norm": 0.6279144776807133, + "learning_rate": 3.9179069118051e-05, + "loss": 1.9694, + "step": 4262 + }, + { + "epoch": 0.33, + "grad_norm": 0.6009627840596174, + "learning_rate": 3.917392385381331e-05, + "loss": 2.1633, + "step": 4263 + }, + { + "epoch": 0.33, + "grad_norm": 0.6551057753180778, + "learning_rate": 3.9168777704639316e-05, + "loss": 1.9732, + "step": 4264 + }, + { + "epoch": 0.33, + "grad_norm": 0.662148499968243, + "learning_rate": 3.9163630670850326e-05, + "loss": 2.0494, + "step": 4265 + }, + { + "epoch": 0.33, + "grad_norm": 0.6359425411127517, + "learning_rate": 3.9158482752767686e-05, + "loss": 1.9126, + "step": 4266 + }, + { + "epoch": 0.33, + "grad_norm": 0.7941442442133257, + "learning_rate": 3.915333395071281e-05, + "loss": 2.1518, + "step": 4267 + }, + { + "epoch": 0.33, + "grad_norm": 0.6103810564615236, + "learning_rate": 3.914818426500715e-05, + "loss": 1.9139, + "step": 4268 + }, + { + "epoch": 0.33, + "grad_norm": 0.8147103784645705, + "learning_rate": 3.914303369597222e-05, + "loss": 1.9763, + "step": 4269 + }, + { + "epoch": 0.33, + "grad_norm": 0.6973755384124137, + "learning_rate": 3.913788224392959e-05, + "loss": 1.973, + "step": 4270 + }, + { + "epoch": 0.33, + "grad_norm": 0.7215870957036106, + "learning_rate": 3.91327299092009e-05, + "loss": 2.1512, + "step": 4271 + }, + { + "epoch": 0.33, + "grad_norm": 0.6969747208710462, + "learning_rate": 3.9127576692107825e-05, + "loss": 2.0517, + "step": 4272 + }, + { + "epoch": 0.33, + "grad_norm": 0.7328462801748226, + "learning_rate": 3.91224225929721e-05, + "loss": 1.9495, + "step": 4273 + }, + { + "epoch": 0.33, + "grad_norm": 0.7118597534176863, + "learning_rate": 3.911726761211551e-05, + "loss": 1.9704, + "step": 4274 + }, + { + "epoch": 0.33, + "grad_norm": 0.6432313818551401, + "learning_rate": 3.91121117498599e-05, + "loss": 2.1426, + "step": 4275 + }, + { + "epoch": 0.33, + "grad_norm": 0.6427143902515152, + "learning_rate": 3.9106955006527196e-05, + "loss": 1.8722, + "step": 4276 + }, + { + "epoch": 0.33, + "grad_norm": 0.7940138342593733, + "learning_rate": 3.910179738243933e-05, + "loss": 1.9619, + "step": 4277 + }, + { + "epoch": 0.33, + "grad_norm": 0.6804206834817563, + "learning_rate": 3.909663887791832e-05, + "loss": 2.0195, + "step": 4278 + }, + { + "epoch": 0.33, + "grad_norm": 0.8703396583957184, + "learning_rate": 3.9091479493286245e-05, + "loss": 2.191, + "step": 4279 + }, + { + "epoch": 0.33, + "grad_norm": 0.7819235342186928, + "learning_rate": 3.908631922886521e-05, + "loss": 1.9707, + "step": 4280 + }, + { + "epoch": 0.33, + "grad_norm": 0.7830730519810698, + "learning_rate": 3.90811580849774e-05, + "loss": 1.9649, + "step": 4281 + }, + { + "epoch": 0.33, + "grad_norm": 0.8258504872351909, + "learning_rate": 3.907599606194503e-05, + "loss": 1.9669, + "step": 4282 + }, + { + "epoch": 0.33, + "grad_norm": 0.8381687185617055, + "learning_rate": 3.9070833160090415e-05, + "loss": 2.1841, + "step": 4283 + }, + { + "epoch": 0.33, + "grad_norm": 0.9962400620273721, + "learning_rate": 3.906566937973588e-05, + "loss": 1.9938, + "step": 4284 + }, + { + "epoch": 0.33, + "grad_norm": 0.7057815614552411, + "learning_rate": 3.906050472120382e-05, + "loss": 1.9663, + "step": 4285 + }, + { + "epoch": 0.33, + "grad_norm": 0.9838644196236582, + "learning_rate": 3.9055339184816686e-05, + "loss": 1.9667, + "step": 4286 + }, + { + "epoch": 0.33, + "grad_norm": 0.6803952165310518, + "learning_rate": 3.905017277089698e-05, + "loss": 2.1447, + "step": 4287 + }, + { + "epoch": 0.33, + "grad_norm": 0.8107295805123587, + "learning_rate": 3.9045005479767285e-05, + "loss": 1.9513, + "step": 4288 + }, + { + "epoch": 0.33, + "grad_norm": 0.6636508748800759, + "learning_rate": 3.9039837311750185e-05, + "loss": 1.9765, + "step": 4289 + }, + { + "epoch": 0.33, + "grad_norm": 0.6921984582453629, + "learning_rate": 3.9034668267168374e-05, + "loss": 2.0056, + "step": 4290 + }, + { + "epoch": 0.33, + "grad_norm": 0.6569365539144029, + "learning_rate": 3.902949834634456e-05, + "loss": 1.9962, + "step": 4291 + }, + { + "epoch": 0.33, + "grad_norm": 0.5861950974921066, + "learning_rate": 3.902432754960153e-05, + "loss": 2.1495, + "step": 4292 + }, + { + "epoch": 0.33, + "grad_norm": 0.770440891264291, + "learning_rate": 3.901915587726212e-05, + "loss": 1.9043, + "step": 4293 + }, + { + "epoch": 0.33, + "grad_norm": 0.5990832208443667, + "learning_rate": 3.901398332964922e-05, + "loss": 1.9989, + "step": 4294 + }, + { + "epoch": 0.33, + "grad_norm": 0.6385444299617757, + "learning_rate": 3.900880990708576e-05, + "loss": 2.1644, + "step": 4295 + }, + { + "epoch": 0.33, + "grad_norm": 0.6277662409168571, + "learning_rate": 3.9003635609894753e-05, + "loss": 2.0426, + "step": 4296 + }, + { + "epoch": 0.33, + "grad_norm": 0.6105182656034237, + "learning_rate": 3.8998460438399256e-05, + "loss": 1.9699, + "step": 4297 + }, + { + "epoch": 0.33, + "grad_norm": 0.7437269659624429, + "learning_rate": 3.8993284392922356e-05, + "loss": 1.9393, + "step": 4298 + }, + { + "epoch": 0.33, + "grad_norm": 0.6414177949379191, + "learning_rate": 3.898810747378723e-05, + "loss": 2.1556, + "step": 4299 + }, + { + "epoch": 0.33, + "grad_norm": 0.6308444532499686, + "learning_rate": 3.8982929681317085e-05, + "loss": 1.9512, + "step": 4300 + }, + { + "epoch": 0.33, + "grad_norm": 0.6251334020427104, + "learning_rate": 3.8977751015835204e-05, + "loss": 1.9447, + "step": 4301 + }, + { + "epoch": 0.33, + "grad_norm": 0.6893460225896615, + "learning_rate": 3.89725714776649e-05, + "loss": 1.9364, + "step": 4302 + }, + { + "epoch": 0.33, + "grad_norm": 0.6024077758931757, + "learning_rate": 3.8967391067129554e-05, + "loss": 1.9991, + "step": 4303 + }, + { + "epoch": 0.33, + "grad_norm": 0.653707150620456, + "learning_rate": 3.8962209784552615e-05, + "loss": 2.1409, + "step": 4304 + }, + { + "epoch": 0.33, + "grad_norm": 0.6336298885563304, + "learning_rate": 3.8957027630257545e-05, + "loss": 1.9564, + "step": 4305 + }, + { + "epoch": 0.33, + "grad_norm": 0.6230794956562666, + "learning_rate": 3.895184460456792e-05, + "loss": 1.9522, + "step": 4306 + }, + { + "epoch": 0.33, + "grad_norm": 0.6579539485605063, + "learning_rate": 3.8946660707807316e-05, + "loss": 2.1485, + "step": 4307 + }, + { + "epoch": 0.33, + "grad_norm": 0.5840641715645104, + "learning_rate": 3.894147594029939e-05, + "loss": 2.0345, + "step": 4308 + }, + { + "epoch": 0.33, + "grad_norm": 0.6744405857109206, + "learning_rate": 3.893629030236784e-05, + "loss": 2.0084, + "step": 4309 + }, + { + "epoch": 0.33, + "grad_norm": 0.6286397086298595, + "learning_rate": 3.8931103794336445e-05, + "loss": 1.9664, + "step": 4310 + }, + { + "epoch": 0.33, + "grad_norm": 0.7740672559762488, + "learning_rate": 3.8925916416529e-05, + "loss": 2.1496, + "step": 4311 + }, + { + "epoch": 0.33, + "grad_norm": 0.6060055357519761, + "learning_rate": 3.8920728169269395e-05, + "loss": 1.9364, + "step": 4312 + }, + { + "epoch": 0.33, + "grad_norm": 0.619143471942503, + "learning_rate": 3.891553905288153e-05, + "loss": 1.9894, + "step": 4313 + }, + { + "epoch": 0.33, + "grad_norm": 0.6153381781409927, + "learning_rate": 3.89103490676894e-05, + "loss": 1.9647, + "step": 4314 + }, + { + "epoch": 0.33, + "grad_norm": 0.6093023878888556, + "learning_rate": 3.8905158214017036e-05, + "loss": 2.0394, + "step": 4315 + }, + { + "epoch": 0.33, + "grad_norm": 0.6673039096948462, + "learning_rate": 3.889996649218852e-05, + "loss": 2.1217, + "step": 4316 + }, + { + "epoch": 0.33, + "grad_norm": 0.5923850916158117, + "learning_rate": 3.889477390252799e-05, + "loss": 1.9286, + "step": 4317 + }, + { + "epoch": 0.33, + "grad_norm": 0.66482560773539, + "learning_rate": 3.8889580445359644e-05, + "loss": 1.9539, + "step": 4318 + }, + { + "epoch": 0.33, + "grad_norm": 0.7097007569225484, + "learning_rate": 3.8884386121007735e-05, + "loss": 2.1569, + "step": 4319 + }, + { + "epoch": 0.33, + "grad_norm": 0.648878996987364, + "learning_rate": 3.887919092979656e-05, + "loss": 1.9512, + "step": 4320 + }, + { + "epoch": 0.33, + "grad_norm": 0.7942225475166459, + "learning_rate": 3.8873994872050476e-05, + "loss": 2.0265, + "step": 4321 + }, + { + "epoch": 0.33, + "grad_norm": 0.6908363231555014, + "learning_rate": 3.8868797948093907e-05, + "loss": 1.9484, + "step": 4322 + }, + { + "epoch": 0.33, + "grad_norm": 0.7984410250597462, + "learning_rate": 3.88636001582513e-05, + "loss": 1.9443, + "step": 4323 + }, + { + "epoch": 0.33, + "grad_norm": 0.7372637636973678, + "learning_rate": 3.8858401502847184e-05, + "loss": 2.1734, + "step": 4324 + }, + { + "epoch": 0.33, + "grad_norm": 0.7212128789613863, + "learning_rate": 3.885320198220613e-05, + "loss": 1.9871, + "step": 4325 + }, + { + "epoch": 0.33, + "grad_norm": 0.5613039960028418, + "learning_rate": 3.884800159665276e-05, + "loss": 1.9985, + "step": 4326 + }, + { + "epoch": 0.33, + "grad_norm": 0.5810105846741793, + "learning_rate": 3.884280034651177e-05, + "loss": 2.0023, + "step": 4327 + }, + { + "epoch": 0.33, + "grad_norm": 0.603984859952067, + "learning_rate": 3.8837598232107875e-05, + "loss": 2.1717, + "step": 4328 + }, + { + "epoch": 0.33, + "grad_norm": 0.5867783536452, + "learning_rate": 3.8832395253765885e-05, + "loss": 1.9714, + "step": 4329 + }, + { + "epoch": 0.33, + "grad_norm": 0.6227137080868063, + "learning_rate": 3.882719141181063e-05, + "loss": 1.9408, + "step": 4330 + }, + { + "epoch": 0.33, + "grad_norm": 0.5776335378307313, + "learning_rate": 3.882198670656702e-05, + "loss": 2.1308, + "step": 4331 + }, + { + "epoch": 0.33, + "grad_norm": 0.5850417625810728, + "learning_rate": 3.881678113835999e-05, + "loss": 1.9955, + "step": 4332 + }, + { + "epoch": 0.33, + "grad_norm": 0.575204070898516, + "learning_rate": 3.8811574707514554e-05, + "loss": 1.8894, + "step": 4333 + }, + { + "epoch": 0.33, + "grad_norm": 0.5405457335076372, + "learning_rate": 3.880636741435577e-05, + "loss": 2.0448, + "step": 4334 + }, + { + "epoch": 0.33, + "grad_norm": 0.6162480331095656, + "learning_rate": 3.880115925920875e-05, + "loss": 1.9569, + "step": 4335 + }, + { + "epoch": 0.33, + "grad_norm": 0.6264051050997514, + "learning_rate": 3.879595024239866e-05, + "loss": 2.1405, + "step": 4336 + }, + { + "epoch": 0.33, + "grad_norm": 0.5916383912221922, + "learning_rate": 3.879074036425073e-05, + "loss": 1.9364, + "step": 4337 + }, + { + "epoch": 0.33, + "grad_norm": 0.6654279517878134, + "learning_rate": 3.878552962509022e-05, + "loss": 1.9617, + "step": 4338 + }, + { + "epoch": 0.33, + "grad_norm": 0.5542521841408227, + "learning_rate": 3.878031802524246e-05, + "loss": 2.1424, + "step": 4339 + }, + { + "epoch": 0.33, + "grad_norm": 0.5934101125279453, + "learning_rate": 3.877510556503283e-05, + "loss": 2.049, + "step": 4340 + }, + { + "epoch": 0.33, + "grad_norm": 0.6888907264278066, + "learning_rate": 3.8769892244786777e-05, + "loss": 1.9537, + "step": 4341 + }, + { + "epoch": 0.33, + "grad_norm": 0.5756450583300693, + "learning_rate": 3.8764678064829786e-05, + "loss": 1.9567, + "step": 4342 + }, + { + "epoch": 0.34, + "grad_norm": 0.6200812269677566, + "learning_rate": 3.875946302548738e-05, + "loss": 1.9485, + "step": 4343 + }, + { + "epoch": 0.34, + "grad_norm": 0.6677011459949054, + "learning_rate": 3.8754247127085196e-05, + "loss": 2.1762, + "step": 4344 + }, + { + "epoch": 0.34, + "grad_norm": 0.626758298631475, + "learning_rate": 3.874903036994885e-05, + "loss": 1.9647, + "step": 4345 + }, + { + "epoch": 0.34, + "grad_norm": 0.8052057796280683, + "learning_rate": 3.874381275440405e-05, + "loss": 2.0362, + "step": 4346 + }, + { + "epoch": 0.34, + "grad_norm": 0.5778446515164742, + "learning_rate": 3.873859428077656e-05, + "loss": 1.9055, + "step": 4347 + }, + { + "epoch": 0.34, + "grad_norm": 1.0083069561266784, + "learning_rate": 3.8733374949392196e-05, + "loss": 2.1773, + "step": 4348 + }, + { + "epoch": 0.34, + "grad_norm": 0.604158060705805, + "learning_rate": 3.8728154760576817e-05, + "loss": 1.9015, + "step": 4349 + }, + { + "epoch": 0.34, + "grad_norm": 0.9759068361012028, + "learning_rate": 3.8722933714656326e-05, + "loss": 1.9355, + "step": 4350 + }, + { + "epoch": 0.34, + "grad_norm": 0.6302412004092347, + "learning_rate": 3.8717711811956724e-05, + "loss": 2.1948, + "step": 4351 + }, + { + "epoch": 0.34, + "grad_norm": 0.9263154794949575, + "learning_rate": 3.8712489052804016e-05, + "loss": 2.0147, + "step": 4352 + }, + { + "epoch": 0.34, + "grad_norm": 0.712102964740718, + "learning_rate": 3.8707265437524276e-05, + "loss": 1.9625, + "step": 4353 + }, + { + "epoch": 0.34, + "grad_norm": 0.7703831464616566, + "learning_rate": 3.8702040966443647e-05, + "loss": 1.9142, + "step": 4354 + }, + { + "epoch": 0.34, + "grad_norm": 0.6927806315144396, + "learning_rate": 3.8696815639888316e-05, + "loss": 1.9254, + "step": 4355 + }, + { + "epoch": 0.34, + "grad_norm": 0.7080955706200333, + "learning_rate": 3.869158945818451e-05, + "loss": 2.1294, + "step": 4356 + }, + { + "epoch": 0.34, + "grad_norm": 0.6949109716807168, + "learning_rate": 3.8686362421658536e-05, + "loss": 1.9396, + "step": 4357 + }, + { + "epoch": 0.34, + "grad_norm": 0.6792063113000733, + "learning_rate": 3.868113453063672e-05, + "loss": 2.0811, + "step": 4358 + }, + { + "epoch": 0.34, + "grad_norm": 0.5793790023500945, + "learning_rate": 3.867590578544548e-05, + "loss": 1.918, + "step": 4359 + }, + { + "epoch": 0.34, + "grad_norm": 0.5617554401257666, + "learning_rate": 3.867067618641126e-05, + "loss": 2.0772, + "step": 4360 + }, + { + "epoch": 0.34, + "grad_norm": 0.5864697959019031, + "learning_rate": 3.8665445733860564e-05, + "loss": 1.9212, + "step": 4361 + }, + { + "epoch": 0.34, + "grad_norm": 0.6403485845629397, + "learning_rate": 3.866021442811995e-05, + "loss": 1.9769, + "step": 4362 + }, + { + "epoch": 0.34, + "grad_norm": 0.6047298692041521, + "learning_rate": 3.865498226951603e-05, + "loss": 2.1612, + "step": 4363 + }, + { + "epoch": 0.34, + "grad_norm": 0.6036323065027595, + "learning_rate": 3.864974925837547e-05, + "loss": 1.9416, + "step": 4364 + }, + { + "epoch": 0.34, + "grad_norm": 0.6426389247713166, + "learning_rate": 3.8644515395024996e-05, + "loss": 2.0031, + "step": 4365 + }, + { + "epoch": 0.34, + "grad_norm": 0.6451309771276795, + "learning_rate": 3.863928067979137e-05, + "loss": 1.9549, + "step": 4366 + }, + { + "epoch": 0.34, + "grad_norm": 0.657265998403602, + "learning_rate": 3.863404511300142e-05, + "loss": 1.9616, + "step": 4367 + }, + { + "epoch": 0.34, + "grad_norm": 0.6926548820296801, + "learning_rate": 3.862880869498202e-05, + "loss": 2.1542, + "step": 4368 + }, + { + "epoch": 0.34, + "grad_norm": 0.6532338635759335, + "learning_rate": 3.862357142606011e-05, + "loss": 1.9475, + "step": 4369 + }, + { + "epoch": 0.34, + "grad_norm": 0.6792288448474958, + "learning_rate": 3.8618333306562656e-05, + "loss": 1.9408, + "step": 4370 + }, + { + "epoch": 0.34, + "grad_norm": 0.6737249830675784, + "learning_rate": 3.8613094336816724e-05, + "loss": 2.0663, + "step": 4371 + }, + { + "epoch": 0.34, + "grad_norm": 0.5585360617454067, + "learning_rate": 3.860785451714939e-05, + "loss": 2.1716, + "step": 4372 + }, + { + "epoch": 0.34, + "grad_norm": 0.6505144601677421, + "learning_rate": 3.8602613847887784e-05, + "loss": 1.9449, + "step": 4373 + }, + { + "epoch": 0.34, + "grad_norm": 0.5817532365077317, + "learning_rate": 3.859737232935911e-05, + "loss": 1.9681, + "step": 4374 + }, + { + "epoch": 0.34, + "grad_norm": 0.5983945038749495, + "learning_rate": 3.859212996189064e-05, + "loss": 1.9459, + "step": 4375 + }, + { + "epoch": 0.34, + "grad_norm": 0.7314114556193463, + "learning_rate": 3.858688674580965e-05, + "loss": 2.1313, + "step": 4376 + }, + { + "epoch": 0.34, + "grad_norm": 0.5909548692520957, + "learning_rate": 3.85816426814435e-05, + "loss": 2.0277, + "step": 4377 + }, + { + "epoch": 0.34, + "grad_norm": 0.6623723731456705, + "learning_rate": 3.857639776911961e-05, + "loss": 2.0025, + "step": 4378 + }, + { + "epoch": 0.34, + "grad_norm": 0.6125545316274319, + "learning_rate": 3.857115200916543e-05, + "loss": 1.9303, + "step": 4379 + }, + { + "epoch": 0.34, + "grad_norm": 0.7056132544504672, + "learning_rate": 3.8565905401908483e-05, + "loss": 2.082, + "step": 4380 + }, + { + "epoch": 0.34, + "grad_norm": 0.5864366618594531, + "learning_rate": 3.8560657947676325e-05, + "loss": 1.9583, + "step": 4381 + }, + { + "epoch": 0.34, + "grad_norm": 0.625493065204582, + "learning_rate": 3.855540964679658e-05, + "loss": 1.9232, + "step": 4382 + }, + { + "epoch": 0.34, + "grad_norm": 0.5816248450736996, + "learning_rate": 3.855016049959693e-05, + "loss": 2.0525, + "step": 4383 + }, + { + "epoch": 0.34, + "grad_norm": 0.5736163535612869, + "learning_rate": 3.85449105064051e-05, + "loss": 2.1753, + "step": 4384 + }, + { + "epoch": 0.34, + "grad_norm": 0.6021596190690466, + "learning_rate": 3.8539659667548855e-05, + "loss": 1.9268, + "step": 4385 + }, + { + "epoch": 0.34, + "grad_norm": 0.6033668431664059, + "learning_rate": 3.8534407983356036e-05, + "loss": 1.9663, + "step": 4386 + }, + { + "epoch": 0.34, + "grad_norm": 0.5648118830651282, + "learning_rate": 3.852915545415452e-05, + "loss": 1.9762, + "step": 4387 + }, + { + "epoch": 0.34, + "grad_norm": 0.6291279868575248, + "learning_rate": 3.852390208027226e-05, + "loss": 2.0589, + "step": 4388 + }, + { + "epoch": 0.34, + "grad_norm": 0.5625199705409151, + "learning_rate": 3.851864786203724e-05, + "loss": 2.0122, + "step": 4389 + }, + { + "epoch": 0.34, + "grad_norm": 0.5596898088384515, + "learning_rate": 3.851339279977748e-05, + "loss": 1.9733, + "step": 4390 + }, + { + "epoch": 0.34, + "grad_norm": 0.6822178071031308, + "learning_rate": 3.85081368938211e-05, + "loss": 1.9744, + "step": 4391 + }, + { + "epoch": 0.34, + "grad_norm": 0.7498263891852862, + "learning_rate": 3.850288014449625e-05, + "loss": 2.1548, + "step": 4392 + }, + { + "epoch": 0.34, + "grad_norm": 0.5981234279327199, + "learning_rate": 3.849762255213112e-05, + "loss": 1.9223, + "step": 4393 + }, + { + "epoch": 0.34, + "grad_norm": 0.7302829030366942, + "learning_rate": 3.849236411705395e-05, + "loss": 1.9801, + "step": 4394 + }, + { + "epoch": 0.34, + "grad_norm": 0.6763328139377589, + "learning_rate": 3.8487104839593074e-05, + "loss": 2.1811, + "step": 4395 + }, + { + "epoch": 0.34, + "grad_norm": 0.6171388269099434, + "learning_rate": 3.848184472007683e-05, + "loss": 2.0034, + "step": 4396 + }, + { + "epoch": 0.34, + "grad_norm": 0.6999880754595095, + "learning_rate": 3.847658375883364e-05, + "loss": 1.9193, + "step": 4397 + }, + { + "epoch": 0.34, + "grad_norm": 0.5972728499854988, + "learning_rate": 3.847132195619195e-05, + "loss": 1.9404, + "step": 4398 + }, + { + "epoch": 0.34, + "grad_norm": 0.6070771283482037, + "learning_rate": 3.846605931248031e-05, + "loss": 1.979, + "step": 4399 + }, + { + "epoch": 0.34, + "grad_norm": 0.6099604479022251, + "learning_rate": 3.846079582802725e-05, + "loss": 2.182, + "step": 4400 + }, + { + "epoch": 0.34, + "grad_norm": 0.5608375736396208, + "learning_rate": 3.845553150316141e-05, + "loss": 1.9627, + "step": 4401 + }, + { + "epoch": 0.34, + "grad_norm": 0.5672638693170281, + "learning_rate": 3.8450266338211474e-05, + "loss": 2.0279, + "step": 4402 + }, + { + "epoch": 0.34, + "grad_norm": 0.6660033917176815, + "learning_rate": 3.844500033350614e-05, + "loss": 1.9438, + "step": 4403 + }, + { + "epoch": 0.34, + "grad_norm": 0.6282443245464764, + "learning_rate": 3.8439733489374215e-05, + "loss": 2.1681, + "step": 4404 + }, + { + "epoch": 0.34, + "grad_norm": 0.5622702359925827, + "learning_rate": 3.84344658061445e-05, + "loss": 1.9212, + "step": 4405 + }, + { + "epoch": 0.34, + "grad_norm": 0.7458993525389047, + "learning_rate": 3.8429197284145916e-05, + "loss": 1.9342, + "step": 4406 + }, + { + "epoch": 0.34, + "grad_norm": 0.6235234656751504, + "learning_rate": 3.8423927923707356e-05, + "loss": 1.9417, + "step": 4407 + }, + { + "epoch": 0.34, + "grad_norm": 0.7945183695090019, + "learning_rate": 3.841865772515785e-05, + "loss": 2.1982, + "step": 4408 + }, + { + "epoch": 0.34, + "grad_norm": 0.5578727356276851, + "learning_rate": 3.8413386688826406e-05, + "loss": 1.9745, + "step": 4409 + }, + { + "epoch": 0.34, + "grad_norm": 0.7824336407270559, + "learning_rate": 3.8408114815042126e-05, + "loss": 1.975, + "step": 4410 + }, + { + "epoch": 0.34, + "grad_norm": 0.6216351827846404, + "learning_rate": 3.840284210413415e-05, + "loss": 1.976, + "step": 4411 + }, + { + "epoch": 0.34, + "grad_norm": 0.8872750739774029, + "learning_rate": 3.839756855643169e-05, + "loss": 2.2, + "step": 4412 + }, + { + "epoch": 0.34, + "grad_norm": 0.6151798659465543, + "learning_rate": 3.839229417226399e-05, + "loss": 1.9122, + "step": 4413 + }, + { + "epoch": 0.34, + "grad_norm": 0.7198660855755996, + "learning_rate": 3.838701895196034e-05, + "loss": 2.0136, + "step": 4414 + }, + { + "epoch": 0.34, + "grad_norm": 0.6931496767608801, + "learning_rate": 3.8381742895850106e-05, + "loss": 1.9569, + "step": 4415 + }, + { + "epoch": 0.34, + "grad_norm": 0.6448028692130285, + "learning_rate": 3.8376466004262684e-05, + "loss": 2.1357, + "step": 4416 + }, + { + "epoch": 0.34, + "grad_norm": 0.6644199868732823, + "learning_rate": 3.837118827752755e-05, + "loss": 1.9638, + "step": 4417 + }, + { + "epoch": 0.34, + "grad_norm": 0.7433498353314635, + "learning_rate": 3.836590971597419e-05, + "loss": 1.9767, + "step": 4418 + }, + { + "epoch": 0.34, + "grad_norm": 0.7621119758579099, + "learning_rate": 3.836063031993217e-05, + "loss": 1.9419, + "step": 4419 + }, + { + "epoch": 0.34, + "grad_norm": 0.623431164086012, + "learning_rate": 3.835535008973113e-05, + "loss": 2.1778, + "step": 4420 + }, + { + "epoch": 0.34, + "grad_norm": 0.6519281478657304, + "learning_rate": 3.8350069025700714e-05, + "loss": 1.9413, + "step": 4421 + }, + { + "epoch": 0.34, + "grad_norm": 0.6741287649515262, + "learning_rate": 3.8344787128170644e-05, + "loss": 1.8843, + "step": 4422 + }, + { + "epoch": 0.34, + "grad_norm": 0.6543883916637756, + "learning_rate": 3.833950439747069e-05, + "loss": 1.9404, + "step": 4423 + }, + { + "epoch": 0.34, + "grad_norm": 0.639618998694041, + "learning_rate": 3.833422083393068e-05, + "loss": 2.1588, + "step": 4424 + }, + { + "epoch": 0.34, + "grad_norm": 0.7792888971563574, + "learning_rate": 3.832893643788048e-05, + "loss": 1.9739, + "step": 4425 + }, + { + "epoch": 0.34, + "grad_norm": 0.5972069956774114, + "learning_rate": 3.8323651209650026e-05, + "loss": 1.9193, + "step": 4426 + }, + { + "epoch": 0.34, + "grad_norm": 0.63954927210489, + "learning_rate": 3.831836514956929e-05, + "loss": 2.0351, + "step": 4427 + }, + { + "epoch": 0.34, + "grad_norm": 0.6868894707069756, + "learning_rate": 3.83130782579683e-05, + "loss": 2.104, + "step": 4428 + }, + { + "epoch": 0.34, + "grad_norm": 0.7123481801193416, + "learning_rate": 3.8307790535177146e-05, + "loss": 1.9262, + "step": 4429 + }, + { + "epoch": 0.34, + "grad_norm": 0.6752594360746733, + "learning_rate": 3.830250198152596e-05, + "loss": 1.9485, + "step": 4430 + }, + { + "epoch": 0.34, + "grad_norm": 0.68835547310715, + "learning_rate": 3.829721259734494e-05, + "loss": 1.9427, + "step": 4431 + }, + { + "epoch": 0.34, + "grad_norm": 0.6021798034399275, + "learning_rate": 3.829192238296428e-05, + "loss": 2.1595, + "step": 4432 + }, + { + "epoch": 0.34, + "grad_norm": 0.6177533265273177, + "learning_rate": 3.828663133871433e-05, + "loss": 2.0128, + "step": 4433 + }, + { + "epoch": 0.34, + "grad_norm": 0.5531690626980392, + "learning_rate": 3.828133946492539e-05, + "loss": 1.9513, + "step": 4434 + }, + { + "epoch": 0.34, + "grad_norm": 0.6151763873065207, + "learning_rate": 3.827604676192787e-05, + "loss": 1.9621, + "step": 4435 + }, + { + "epoch": 0.34, + "grad_norm": 0.7075990693408382, + "learning_rate": 3.82707532300522e-05, + "loss": 2.1406, + "step": 4436 + }, + { + "epoch": 0.34, + "grad_norm": 0.5348308837089606, + "learning_rate": 3.82654588696289e-05, + "loss": 1.9271, + "step": 4437 + }, + { + "epoch": 0.34, + "grad_norm": 0.7167115773271293, + "learning_rate": 3.826016368098851e-05, + "loss": 1.9938, + "step": 4438 + }, + { + "epoch": 0.34, + "grad_norm": 0.596114671235856, + "learning_rate": 3.8254867664461624e-05, + "loss": 2.0721, + "step": 4439 + }, + { + "epoch": 0.34, + "grad_norm": 0.6546141997728795, + "learning_rate": 3.82495708203789e-05, + "loss": 2.1227, + "step": 4440 + }, + { + "epoch": 0.34, + "grad_norm": 0.575612789577337, + "learning_rate": 3.824427314907103e-05, + "loss": 1.9296, + "step": 4441 + }, + { + "epoch": 0.34, + "grad_norm": 0.686612674473643, + "learning_rate": 3.823897465086878e-05, + "loss": 1.9645, + "step": 4442 + }, + { + "epoch": 0.34, + "grad_norm": 0.5830525850696061, + "learning_rate": 3.8233675326102966e-05, + "loss": 1.9667, + "step": 4443 + }, + { + "epoch": 0.34, + "grad_norm": 0.7455931264622753, + "learning_rate": 3.8228375175104424e-05, + "loss": 2.1471, + "step": 4444 + }, + { + "epoch": 0.34, + "grad_norm": 0.544897455350434, + "learning_rate": 3.8223074198204075e-05, + "loss": 2.0501, + "step": 4445 + }, + { + "epoch": 0.34, + "grad_norm": 0.5614974197327285, + "learning_rate": 3.82177723957329e-05, + "loss": 1.9458, + "step": 4446 + }, + { + "epoch": 0.34, + "grad_norm": 0.7265878008804447, + "learning_rate": 3.821246976802187e-05, + "loss": 1.9887, + "step": 4447 + }, + { + "epoch": 0.34, + "grad_norm": 0.6900083680557307, + "learning_rate": 3.820716631540209e-05, + "loss": 2.0828, + "step": 4448 + }, + { + "epoch": 0.34, + "grad_norm": 0.5796403133224415, + "learning_rate": 3.820186203820466e-05, + "loss": 1.8965, + "step": 4449 + }, + { + "epoch": 0.34, + "grad_norm": 0.7623232512879177, + "learning_rate": 3.8196556936760746e-05, + "loss": 1.9752, + "step": 4450 + }, + { + "epoch": 0.34, + "grad_norm": 0.579542656646331, + "learning_rate": 3.8191251011401566e-05, + "loss": 1.9393, + "step": 4451 + }, + { + "epoch": 0.34, + "grad_norm": 0.7949012346198973, + "learning_rate": 3.81859442624584e-05, + "loss": 2.1719, + "step": 4452 + }, + { + "epoch": 0.34, + "grad_norm": 0.6544462388500241, + "learning_rate": 3.818063669026256e-05, + "loss": 1.9141, + "step": 4453 + }, + { + "epoch": 0.34, + "grad_norm": 0.7140605103685778, + "learning_rate": 3.817532829514543e-05, + "loss": 1.9451, + "step": 4454 + }, + { + "epoch": 0.34, + "grad_norm": 0.6435867400521149, + "learning_rate": 3.817001907743842e-05, + "loss": 1.9955, + "step": 4455 + }, + { + "epoch": 0.34, + "grad_norm": 0.8514679103256637, + "learning_rate": 3.8164709037473024e-05, + "loss": 2.1574, + "step": 4456 + }, + { + "epoch": 0.34, + "grad_norm": 0.8132548458648815, + "learning_rate": 3.815939817558076e-05, + "loss": 1.9691, + "step": 4457 + }, + { + "epoch": 0.34, + "grad_norm": 0.6757275666489964, + "learning_rate": 3.81540864920932e-05, + "loss": 2.0172, + "step": 4458 + }, + { + "epoch": 0.34, + "grad_norm": 0.7751533094119201, + "learning_rate": 3.814877398734199e-05, + "loss": 1.9146, + "step": 4459 + }, + { + "epoch": 0.34, + "grad_norm": 0.7394760283740757, + "learning_rate": 3.8143460661658806e-05, + "loss": 2.1714, + "step": 4460 + }, + { + "epoch": 0.34, + "grad_norm": 0.6619551291268945, + "learning_rate": 3.8138146515375366e-05, + "loss": 1.9629, + "step": 4461 + }, + { + "epoch": 0.34, + "grad_norm": 0.6562099669408633, + "learning_rate": 3.8132831548823465e-05, + "loss": 1.9479, + "step": 4462 + }, + { + "epoch": 0.34, + "grad_norm": 0.67942495489139, + "learning_rate": 3.8127515762334956e-05, + "loss": 1.9477, + "step": 4463 + }, + { + "epoch": 0.34, + "grad_norm": 0.6520331737264866, + "learning_rate": 3.81221991562417e-05, + "loss": 2.1779, + "step": 4464 + }, + { + "epoch": 0.34, + "grad_norm": 0.7283209839781335, + "learning_rate": 3.811688173087564e-05, + "loss": 1.951, + "step": 4465 + }, + { + "epoch": 0.34, + "grad_norm": 0.8320847976965166, + "learning_rate": 3.811156348656877e-05, + "loss": 1.8864, + "step": 4466 + }, + { + "epoch": 0.34, + "grad_norm": 0.7255569647509523, + "learning_rate": 3.810624442365313e-05, + "loss": 1.9596, + "step": 4467 + }, + { + "epoch": 0.34, + "grad_norm": 0.8860217960053602, + "learning_rate": 3.810092454246081e-05, + "loss": 2.1661, + "step": 4468 + }, + { + "epoch": 0.34, + "grad_norm": 0.6937212016566178, + "learning_rate": 3.809560384332395e-05, + "loss": 1.912, + "step": 4469 + }, + { + "epoch": 0.34, + "grad_norm": 0.8612660362157865, + "learning_rate": 3.809028232657474e-05, + "loss": 2.044, + "step": 4470 + }, + { + "epoch": 0.34, + "grad_norm": 0.6364416117114474, + "learning_rate": 3.8084959992545435e-05, + "loss": 1.9291, + "step": 4471 + }, + { + "epoch": 0.35, + "grad_norm": 0.8559921493649146, + "learning_rate": 3.807963684156831e-05, + "loss": 2.1473, + "step": 4472 + }, + { + "epoch": 0.35, + "grad_norm": 0.6460329994162132, + "learning_rate": 3.807431287397574e-05, + "loss": 1.9031, + "step": 4473 + }, + { + "epoch": 0.35, + "grad_norm": 0.7874675989436741, + "learning_rate": 3.80689880901001e-05, + "loss": 1.99, + "step": 4474 + }, + { + "epoch": 0.35, + "grad_norm": 0.6465324431444173, + "learning_rate": 3.8063662490273846e-05, + "loss": 2.0194, + "step": 4475 + }, + { + "epoch": 0.35, + "grad_norm": 0.9237668974303416, + "learning_rate": 3.805833607482947e-05, + "loss": 2.0214, + "step": 4476 + }, + { + "epoch": 0.35, + "grad_norm": 0.6687627969254356, + "learning_rate": 3.8053008844099535e-05, + "loss": 2.1645, + "step": 4477 + }, + { + "epoch": 0.35, + "grad_norm": 0.711514186846853, + "learning_rate": 3.8047680798416634e-05, + "loss": 1.9223, + "step": 4478 + }, + { + "epoch": 0.35, + "grad_norm": 0.6976499322967237, + "learning_rate": 3.804235193811341e-05, + "loss": 1.9665, + "step": 4479 + }, + { + "epoch": 0.35, + "grad_norm": 0.6610651322310258, + "learning_rate": 3.8037022263522584e-05, + "loss": 2.1418, + "step": 4480 + }, + { + "epoch": 0.35, + "grad_norm": 0.6567772680014503, + "learning_rate": 3.8031691774976904e-05, + "loss": 1.9634, + "step": 4481 + }, + { + "epoch": 0.35, + "grad_norm": 0.6559507738504657, + "learning_rate": 3.802636047280917e-05, + "loss": 1.9563, + "step": 4482 + }, + { + "epoch": 0.35, + "grad_norm": 0.653243600597226, + "learning_rate": 3.802102835735223e-05, + "loss": 2.0213, + "step": 4483 + }, + { + "epoch": 0.35, + "grad_norm": 0.5940082702885116, + "learning_rate": 3.8015695428939005e-05, + "loss": 2.1738, + "step": 4484 + }, + { + "epoch": 0.35, + "grad_norm": 0.78616982316246, + "learning_rate": 3.801036168790244e-05, + "loss": 1.9112, + "step": 4485 + }, + { + "epoch": 0.35, + "grad_norm": 0.5933655010084593, + "learning_rate": 3.800502713457556e-05, + "loss": 1.9269, + "step": 4486 + }, + { + "epoch": 0.35, + "grad_norm": 0.6409105469604371, + "learning_rate": 3.799969176929139e-05, + "loss": 1.9534, + "step": 4487 + }, + { + "epoch": 0.35, + "grad_norm": 0.7339980246127552, + "learning_rate": 3.799435559238306e-05, + "loss": 2.1626, + "step": 4488 + }, + { + "epoch": 0.35, + "grad_norm": 0.5962549545325753, + "learning_rate": 3.7989018604183744e-05, + "loss": 2.0306, + "step": 4489 + }, + { + "epoch": 0.35, + "grad_norm": 0.6200094656196918, + "learning_rate": 3.798368080502663e-05, + "loss": 1.9883, + "step": 4490 + }, + { + "epoch": 0.35, + "grad_norm": 0.6756244266265415, + "learning_rate": 3.797834219524498e-05, + "loss": 1.9493, + "step": 4491 + }, + { + "epoch": 0.35, + "grad_norm": 0.5571328057955283, + "learning_rate": 3.797300277517212e-05, + "loss": 2.1108, + "step": 4492 + }, + { + "epoch": 0.35, + "grad_norm": 0.619284757357046, + "learning_rate": 3.7967662545141394e-05, + "loss": 1.9247, + "step": 4493 + }, + { + "epoch": 0.35, + "grad_norm": 0.6761823760877507, + "learning_rate": 3.796232150548622e-05, + "loss": 1.9609, + "step": 4494 + }, + { + "epoch": 0.35, + "grad_norm": 0.5857258520826704, + "learning_rate": 3.795697965654007e-05, + "loss": 2.0241, + "step": 4495 + }, + { + "epoch": 0.35, + "grad_norm": 0.5817072957315393, + "learning_rate": 3.795163699863645e-05, + "loss": 2.1535, + "step": 4496 + }, + { + "epoch": 0.35, + "grad_norm": 0.7111934921047798, + "learning_rate": 3.7946293532108924e-05, + "loss": 1.9019, + "step": 4497 + }, + { + "epoch": 0.35, + "grad_norm": 0.5520885716905221, + "learning_rate": 3.794094925729111e-05, + "loss": 1.9475, + "step": 4498 + }, + { + "epoch": 0.35, + "grad_norm": 0.648726651824061, + "learning_rate": 3.793560417451667e-05, + "loss": 1.908, + "step": 4499 + }, + { + "epoch": 0.35, + "grad_norm": 0.6470228555717719, + "learning_rate": 3.793025828411932e-05, + "loss": 2.1746, + "step": 4500 + }, + { + "epoch": 0.35, + "grad_norm": 0.7383317037246777, + "learning_rate": 3.792491158643283e-05, + "loss": 2.0167, + "step": 4501 + }, + { + "epoch": 0.35, + "grad_norm": 0.7120164336355987, + "learning_rate": 3.7919564081791016e-05, + "loss": 1.9274, + "step": 4502 + }, + { + "epoch": 0.35, + "grad_norm": 0.6461439735147896, + "learning_rate": 3.791421577052774e-05, + "loss": 1.9233, + "step": 4503 + }, + { + "epoch": 0.35, + "grad_norm": 0.7043594748429448, + "learning_rate": 3.790886665297691e-05, + "loss": 2.1682, + "step": 4504 + }, + { + "epoch": 0.35, + "grad_norm": 0.748685788347361, + "learning_rate": 3.790351672947251e-05, + "loss": 1.9622, + "step": 4505 + }, + { + "epoch": 0.35, + "grad_norm": 0.599421878969426, + "learning_rate": 3.7898166000348553e-05, + "loss": 1.9282, + "step": 4506 + }, + { + "epoch": 0.35, + "grad_norm": 0.7748510910912709, + "learning_rate": 3.789281446593911e-05, + "loss": 2.0331, + "step": 4507 + }, + { + "epoch": 0.35, + "grad_norm": 0.6130002666553089, + "learning_rate": 3.788746212657829e-05, + "loss": 1.9256, + "step": 4508 + }, + { + "epoch": 0.35, + "grad_norm": 0.7125775114491252, + "learning_rate": 3.788210898260026e-05, + "loss": 2.1376, + "step": 4509 + }, + { + "epoch": 0.35, + "grad_norm": 0.6629979972676115, + "learning_rate": 3.787675503433926e-05, + "loss": 1.9068, + "step": 4510 + }, + { + "epoch": 0.35, + "grad_norm": 0.7042523414491536, + "learning_rate": 3.7871400282129535e-05, + "loss": 1.9617, + "step": 4511 + }, + { + "epoch": 0.35, + "grad_norm": 0.7484435187424696, + "learning_rate": 3.786604472630541e-05, + "loss": 2.1678, + "step": 4512 + }, + { + "epoch": 0.35, + "grad_norm": 0.7139644599465554, + "learning_rate": 3.786068836720126e-05, + "loss": 1.9437, + "step": 4513 + }, + { + "epoch": 0.35, + "grad_norm": 0.7328390801583808, + "learning_rate": 3.78553312051515e-05, + "loss": 2.0425, + "step": 4514 + }, + { + "epoch": 0.35, + "grad_norm": 0.802550608482326, + "learning_rate": 3.784997324049061e-05, + "loss": 1.9291, + "step": 4515 + }, + { + "epoch": 0.35, + "grad_norm": 0.5774978766688895, + "learning_rate": 3.78446144735531e-05, + "loss": 2.1619, + "step": 4516 + }, + { + "epoch": 0.35, + "grad_norm": 0.7348814858362367, + "learning_rate": 3.7839254904673535e-05, + "loss": 1.9604, + "step": 4517 + }, + { + "epoch": 0.35, + "grad_norm": 0.6635614605971497, + "learning_rate": 3.7833894534186547e-05, + "loss": 1.9451, + "step": 4518 + }, + { + "epoch": 0.35, + "grad_norm": 0.7963735056630429, + "learning_rate": 3.78285333624268e-05, + "loss": 1.9446, + "step": 4519 + }, + { + "epoch": 0.35, + "grad_norm": 0.5528505259637595, + "learning_rate": 3.7823171389729015e-05, + "loss": 2.0135, + "step": 4520 + }, + { + "epoch": 0.35, + "grad_norm": 0.9342537638908034, + "learning_rate": 3.7817808616427954e-05, + "loss": 2.1162, + "step": 4521 + }, + { + "epoch": 0.35, + "grad_norm": 0.5836072343727278, + "learning_rate": 3.7812445042858445e-05, + "loss": 1.9477, + "step": 4522 + }, + { + "epoch": 0.35, + "grad_norm": 0.6562720672801294, + "learning_rate": 3.7807080669355364e-05, + "loss": 1.9449, + "step": 4523 + }, + { + "epoch": 0.35, + "grad_norm": 0.7239260240502563, + "learning_rate": 3.780171549625362e-05, + "loss": 2.1631, + "step": 4524 + }, + { + "epoch": 0.35, + "grad_norm": 0.6211668040384145, + "learning_rate": 3.7796349523888194e-05, + "loss": 1.9366, + "step": 4525 + }, + { + "epoch": 0.35, + "grad_norm": 0.601126692496266, + "learning_rate": 3.779098275259409e-05, + "loss": 2.0233, + "step": 4526 + }, + { + "epoch": 0.35, + "grad_norm": 0.6654757451046609, + "learning_rate": 3.778561518270639e-05, + "loss": 1.9472, + "step": 4527 + }, + { + "epoch": 0.35, + "grad_norm": 0.6809630006499146, + "learning_rate": 3.778024681456021e-05, + "loss": 2.145, + "step": 4528 + }, + { + "epoch": 0.35, + "grad_norm": 0.611702543825706, + "learning_rate": 3.7774877648490715e-05, + "loss": 1.967, + "step": 4529 + }, + { + "epoch": 0.35, + "grad_norm": 0.629286604144254, + "learning_rate": 3.776950768483313e-05, + "loss": 1.9239, + "step": 4530 + }, + { + "epoch": 0.35, + "grad_norm": 0.6652129692701337, + "learning_rate": 3.7764136923922714e-05, + "loss": 1.9758, + "step": 4531 + }, + { + "epoch": 0.35, + "grad_norm": 0.6611273692524587, + "learning_rate": 3.775876536609481e-05, + "loss": 2.0146, + "step": 4532 + }, + { + "epoch": 0.35, + "grad_norm": 0.6568179251180883, + "learning_rate": 3.7753393011684756e-05, + "loss": 2.1347, + "step": 4533 + }, + { + "epoch": 0.35, + "grad_norm": 0.7095948969757396, + "learning_rate": 3.774801986102798e-05, + "loss": 1.9302, + "step": 4534 + }, + { + "epoch": 0.35, + "grad_norm": 0.5734242437712674, + "learning_rate": 3.774264591445997e-05, + "loss": 1.9522, + "step": 4535 + }, + { + "epoch": 0.35, + "grad_norm": 0.5928707338214555, + "learning_rate": 3.7737271172316225e-05, + "loss": 2.1217, + "step": 4536 + }, + { + "epoch": 0.35, + "grad_norm": 0.6125327935601353, + "learning_rate": 3.7731895634932316e-05, + "loss": 1.9343, + "step": 4537 + }, + { + "epoch": 0.35, + "grad_norm": 0.5503117636574768, + "learning_rate": 3.7726519302643846e-05, + "loss": 2.0371, + "step": 4538 + }, + { + "epoch": 0.35, + "grad_norm": 0.6093649456146506, + "learning_rate": 3.77211421757865e-05, + "loss": 1.95, + "step": 4539 + }, + { + "epoch": 0.35, + "grad_norm": 0.5441527425897305, + "learning_rate": 3.771576425469599e-05, + "loss": 1.9412, + "step": 4540 + }, + { + "epoch": 0.35, + "grad_norm": 0.626570346505215, + "learning_rate": 3.771038553970808e-05, + "loss": 2.1658, + "step": 4541 + }, + { + "epoch": 0.35, + "grad_norm": 0.6631777972993519, + "learning_rate": 3.7705006031158583e-05, + "loss": 1.9206, + "step": 4542 + }, + { + "epoch": 0.35, + "grad_norm": 0.5529961738904918, + "learning_rate": 3.7699625729383367e-05, + "loss": 1.9418, + "step": 4543 + }, + { + "epoch": 0.35, + "grad_norm": 0.596120332339497, + "learning_rate": 3.769424463471835e-05, + "loss": 2.1932, + "step": 4544 + }, + { + "epoch": 0.35, + "grad_norm": 0.5402253345057711, + "learning_rate": 3.7688862747499475e-05, + "loss": 2.0261, + "step": 4545 + }, + { + "epoch": 0.35, + "grad_norm": 0.6280176015203585, + "learning_rate": 3.768348006806279e-05, + "loss": 1.9731, + "step": 4546 + }, + { + "epoch": 0.35, + "grad_norm": 0.5648992552483801, + "learning_rate": 3.767809659674433e-05, + "loss": 1.9255, + "step": 4547 + }, + { + "epoch": 0.35, + "grad_norm": 0.6884928959955109, + "learning_rate": 3.767271233388022e-05, + "loss": 2.0994, + "step": 4548 + }, + { + "epoch": 0.35, + "grad_norm": 0.5801285625143808, + "learning_rate": 3.766732727980661e-05, + "loss": 1.9387, + "step": 4549 + }, + { + "epoch": 0.35, + "grad_norm": 0.665814298024065, + "learning_rate": 3.766194143485974e-05, + "loss": 1.9447, + "step": 4550 + }, + { + "epoch": 0.35, + "grad_norm": 0.625749741598418, + "learning_rate": 3.7656554799375826e-05, + "loss": 2.0227, + "step": 4551 + }, + { + "epoch": 0.35, + "grad_norm": 0.7114293565033241, + "learning_rate": 3.765116737369121e-05, + "loss": 1.9427, + "step": 4552 + }, + { + "epoch": 0.35, + "grad_norm": 0.7288519728761113, + "learning_rate": 3.7645779158142244e-05, + "loss": 2.1523, + "step": 4553 + }, + { + "epoch": 0.35, + "grad_norm": 0.6360551637515837, + "learning_rate": 3.7640390153065344e-05, + "loss": 1.9417, + "step": 4554 + }, + { + "epoch": 0.35, + "grad_norm": 0.6212972976738201, + "learning_rate": 3.7635000358796944e-05, + "loss": 1.9311, + "step": 4555 + }, + { + "epoch": 0.35, + "grad_norm": 0.8032232522016756, + "learning_rate": 3.762960977567357e-05, + "loss": 2.1672, + "step": 4556 + }, + { + "epoch": 0.35, + "grad_norm": 0.5694755003946796, + "learning_rate": 3.7624218404031776e-05, + "loss": 2.1032, + "step": 4557 + }, + { + "epoch": 0.35, + "grad_norm": 0.7995504593559443, + "learning_rate": 3.761882624420817e-05, + "loss": 1.9575, + "step": 4558 + }, + { + "epoch": 0.35, + "grad_norm": 0.6626166008222993, + "learning_rate": 3.761343329653939e-05, + "loss": 1.9275, + "step": 4559 + }, + { + "epoch": 0.35, + "grad_norm": 0.7426541296423854, + "learning_rate": 3.7608039561362165e-05, + "loss": 1.9405, + "step": 4560 + }, + { + "epoch": 0.35, + "grad_norm": 0.692703020133761, + "learning_rate": 3.760264503901323e-05, + "loss": 2.147, + "step": 4561 + }, + { + "epoch": 0.35, + "grad_norm": 0.632365593826782, + "learning_rate": 3.7597249729829385e-05, + "loss": 1.9448, + "step": 4562 + }, + { + "epoch": 0.35, + "grad_norm": 0.5947682356225279, + "learning_rate": 3.759185363414749e-05, + "loss": 1.9847, + "step": 4563 + }, + { + "epoch": 0.35, + "grad_norm": 0.5834192104801217, + "learning_rate": 3.758645675230446e-05, + "loss": 1.9446, + "step": 4564 + }, + { + "epoch": 0.35, + "grad_norm": 0.593489115597403, + "learning_rate": 3.758105908463721e-05, + "loss": 2.1533, + "step": 4565 + }, + { + "epoch": 0.35, + "grad_norm": 0.5725828287935691, + "learning_rate": 3.7575660631482765e-05, + "loss": 1.9059, + "step": 4566 + }, + { + "epoch": 0.35, + "grad_norm": 0.5894493672114854, + "learning_rate": 3.757026139317816e-05, + "loss": 1.9845, + "step": 4567 + }, + { + "epoch": 0.35, + "grad_norm": 0.6519983219193053, + "learning_rate": 3.7564861370060506e-05, + "loss": 2.1509, + "step": 4568 + }, + { + "epoch": 0.35, + "grad_norm": 0.6498794438670903, + "learning_rate": 3.755946056246693e-05, + "loss": 1.9526, + "step": 4569 + }, + { + "epoch": 0.35, + "grad_norm": 0.6132216474169538, + "learning_rate": 3.755405897073464e-05, + "loss": 1.9738, + "step": 4570 + }, + { + "epoch": 0.35, + "grad_norm": 0.7414255346001511, + "learning_rate": 3.754865659520087e-05, + "loss": 2.0065, + "step": 4571 + }, + { + "epoch": 0.35, + "grad_norm": 0.6065536364009866, + "learning_rate": 3.754325343620293e-05, + "loss": 2.0018, + "step": 4572 + }, + { + "epoch": 0.35, + "grad_norm": 0.7188275435021811, + "learning_rate": 3.753784949407814e-05, + "loss": 2.0866, + "step": 4573 + }, + { + "epoch": 0.35, + "grad_norm": 0.6628994937750039, + "learning_rate": 3.7532444769163894e-05, + "loss": 1.913, + "step": 4574 + }, + { + "epoch": 0.35, + "grad_norm": 0.6052328597426868, + "learning_rate": 3.7527039261797645e-05, + "loss": 1.9235, + "step": 4575 + }, + { + "epoch": 0.35, + "grad_norm": 0.6140602934212733, + "learning_rate": 3.752163297231687e-05, + "loss": 2.0424, + "step": 4576 + }, + { + "epoch": 0.35, + "grad_norm": 0.6206060788932011, + "learning_rate": 3.751622590105911e-05, + "loss": 2.0892, + "step": 4577 + }, + { + "epoch": 0.35, + "grad_norm": 0.6504112075589991, + "learning_rate": 3.751081804836194e-05, + "loss": 2.0051, + "step": 4578 + }, + { + "epoch": 0.35, + "grad_norm": 0.5746055505232299, + "learning_rate": 3.7505409414563017e-05, + "loss": 1.9563, + "step": 4579 + }, + { + "epoch": 0.35, + "grad_norm": 0.6647188108881102, + "learning_rate": 3.7500000000000003e-05, + "loss": 2.1705, + "step": 4580 + }, + { + "epoch": 0.35, + "grad_norm": 0.5775354576732673, + "learning_rate": 3.7494589805010635e-05, + "loss": 1.9662, + "step": 4581 + }, + { + "epoch": 0.35, + "grad_norm": 0.5691829658577531, + "learning_rate": 3.74891788299327e-05, + "loss": 2.0184, + "step": 4582 + }, + { + "epoch": 0.35, + "grad_norm": 0.5773646747269596, + "learning_rate": 3.748376707510403e-05, + "loss": 1.9053, + "step": 4583 + }, + { + "epoch": 0.35, + "grad_norm": 0.6010039732758989, + "learning_rate": 3.747835454086249e-05, + "loss": 1.9363, + "step": 4584 + }, + { + "epoch": 0.35, + "grad_norm": 0.6254087318681725, + "learning_rate": 3.747294122754601e-05, + "loss": 2.1394, + "step": 4585 + }, + { + "epoch": 0.35, + "grad_norm": 0.6252779143695171, + "learning_rate": 3.7467527135492583e-05, + "loss": 1.9304, + "step": 4586 + }, + { + "epoch": 0.35, + "grad_norm": 0.6472007465791715, + "learning_rate": 3.746211226504021e-05, + "loss": 2.0061, + "step": 4587 + }, + { + "epoch": 0.35, + "grad_norm": 0.5835858692826364, + "learning_rate": 3.745669661652696e-05, + "loss": 2.0139, + "step": 4588 + }, + { + "epoch": 0.35, + "grad_norm": 0.7380918619140692, + "learning_rate": 3.745128019029099e-05, + "loss": 2.1064, + "step": 4589 + }, + { + "epoch": 0.35, + "grad_norm": 0.5414109110382487, + "learning_rate": 3.744586298667043e-05, + "loss": 1.8944, + "step": 4590 + }, + { + "epoch": 0.35, + "grad_norm": 0.545539508588627, + "learning_rate": 3.7440445006003524e-05, + "loss": 1.8998, + "step": 4591 + }, + { + "epoch": 0.35, + "grad_norm": 0.6462009997921957, + "learning_rate": 3.7435026248628525e-05, + "loss": 1.9209, + "step": 4592 + }, + { + "epoch": 0.35, + "grad_norm": 0.5849379071625298, + "learning_rate": 3.742960671488376e-05, + "loss": 2.2009, + "step": 4593 + }, + { + "epoch": 0.35, + "grad_norm": 0.6122243423458665, + "learning_rate": 3.7424186405107576e-05, + "loss": 2.0145, + "step": 4594 + }, + { + "epoch": 0.35, + "grad_norm": 0.6869390300064838, + "learning_rate": 3.74187653196384e-05, + "loss": 1.9925, + "step": 4595 + }, + { + "epoch": 0.35, + "grad_norm": 0.562011134177658, + "learning_rate": 3.741334345881469e-05, + "loss": 1.945, + "step": 4596 + }, + { + "epoch": 0.35, + "grad_norm": 0.70982922155756, + "learning_rate": 3.7407920822974955e-05, + "loss": 2.1168, + "step": 4597 + }, + { + "epoch": 0.35, + "grad_norm": 0.6793283496652244, + "learning_rate": 3.740249741245773e-05, + "loss": 1.9613, + "step": 4598 + }, + { + "epoch": 0.35, + "grad_norm": 0.6071544331153595, + "learning_rate": 3.739707322760166e-05, + "loss": 1.9349, + "step": 4599 + }, + { + "epoch": 0.35, + "grad_norm": 0.7589780363811607, + "learning_rate": 3.7391648268745374e-05, + "loss": 2.1522, + "step": 4600 + }, + { + "epoch": 0.35, + "grad_norm": 0.5527873324192897, + "learning_rate": 3.738622253622758e-05, + "loss": 2.0132, + "step": 4601 + }, + { + "epoch": 0.36, + "grad_norm": 0.6481257788788469, + "learning_rate": 3.738079603038703e-05, + "loss": 1.8904, + "step": 4602 + }, + { + "epoch": 0.36, + "grad_norm": 0.7039334567917342, + "learning_rate": 3.737536875156252e-05, + "loss": 1.9387, + "step": 4603 + }, + { + "epoch": 0.36, + "grad_norm": 0.581156301829832, + "learning_rate": 3.736994070009291e-05, + "loss": 1.9444, + "step": 4604 + }, + { + "epoch": 0.36, + "grad_norm": 0.6517879382266721, + "learning_rate": 3.736451187631708e-05, + "loss": 2.1677, + "step": 4605 + }, + { + "epoch": 0.36, + "grad_norm": 0.5979746394869523, + "learning_rate": 3.735908228057397e-05, + "loss": 1.9477, + "step": 4606 + }, + { + "epoch": 0.36, + "grad_norm": 0.5357640041017192, + "learning_rate": 3.735365191320259e-05, + "loss": 2.0413, + "step": 4607 + }, + { + "epoch": 0.36, + "grad_norm": 0.5755159086779102, + "learning_rate": 3.734822077454197e-05, + "loss": 1.9118, + "step": 4608 + }, + { + "epoch": 0.36, + "grad_norm": 0.6934114478928152, + "learning_rate": 3.734278886493119e-05, + "loss": 2.0967, + "step": 4609 + }, + { + "epoch": 0.36, + "grad_norm": 0.6273077499067635, + "learning_rate": 3.7337356184709404e-05, + "loss": 1.9517, + "step": 4610 + }, + { + "epoch": 0.36, + "grad_norm": 0.6147067829217797, + "learning_rate": 3.733192273421579e-05, + "loss": 1.9286, + "step": 4611 + }, + { + "epoch": 0.36, + "grad_norm": 0.6378051186075927, + "learning_rate": 3.7326488513789575e-05, + "loss": 2.1279, + "step": 4612 + }, + { + "epoch": 0.36, + "grad_norm": 0.584584887172597, + "learning_rate": 3.732105352377004e-05, + "loss": 2.013, + "step": 4613 + }, + { + "epoch": 0.36, + "grad_norm": 0.5904111437185489, + "learning_rate": 3.731561776449653e-05, + "loss": 1.9558, + "step": 4614 + }, + { + "epoch": 0.36, + "grad_norm": 0.5988825537706705, + "learning_rate": 3.73101812363084e-05, + "loss": 1.9887, + "step": 4615 + }, + { + "epoch": 0.36, + "grad_norm": 0.6920621311075653, + "learning_rate": 3.7304743939545086e-05, + "loss": 1.9542, + "step": 4616 + }, + { + "epoch": 0.36, + "grad_norm": 0.598883419367774, + "learning_rate": 3.729930587454605e-05, + "loss": 2.1527, + "step": 4617 + }, + { + "epoch": 0.36, + "grad_norm": 0.7763987532804442, + "learning_rate": 3.729386704165084e-05, + "loss": 1.9328, + "step": 4618 + }, + { + "epoch": 0.36, + "grad_norm": 0.5822567768376813, + "learning_rate": 3.728842744119899e-05, + "loss": 2.018, + "step": 4619 + }, + { + "epoch": 0.36, + "grad_norm": 0.7015109472716764, + "learning_rate": 3.728298707353014e-05, + "loss": 1.9737, + "step": 4620 + }, + { + "epoch": 0.36, + "grad_norm": 0.5499912200871575, + "learning_rate": 3.7277545938983944e-05, + "loss": 2.1166, + "step": 4621 + }, + { + "epoch": 0.36, + "grad_norm": 0.6120910517543294, + "learning_rate": 3.7272104037900126e-05, + "loss": 1.9298, + "step": 4622 + }, + { + "epoch": 0.36, + "grad_norm": 0.5852681695519044, + "learning_rate": 3.7266661370618426e-05, + "loss": 1.979, + "step": 4623 + }, + { + "epoch": 0.36, + "grad_norm": 0.6191672159876855, + "learning_rate": 3.7261217937478666e-05, + "loss": 1.9266, + "step": 4624 + }, + { + "epoch": 0.36, + "grad_norm": 0.5847564464928743, + "learning_rate": 3.725577373882071e-05, + "loss": 2.174, + "step": 4625 + }, + { + "epoch": 0.36, + "grad_norm": 0.5490442193106174, + "learning_rate": 3.725032877498444e-05, + "loss": 1.9365, + "step": 4626 + }, + { + "epoch": 0.36, + "grad_norm": 0.6520073043825677, + "learning_rate": 3.7244883046309825e-05, + "loss": 1.9205, + "step": 4627 + }, + { + "epoch": 0.36, + "grad_norm": 0.6419321881823317, + "learning_rate": 3.7239436553136855e-05, + "loss": 1.9641, + "step": 4628 + }, + { + "epoch": 0.36, + "grad_norm": 0.5850553093256736, + "learning_rate": 3.723398929580558e-05, + "loss": 2.1423, + "step": 4629 + }, + { + "epoch": 0.36, + "grad_norm": 0.6727839364118863, + "learning_rate": 3.7228541274656095e-05, + "loss": 1.9831, + "step": 4630 + }, + { + "epoch": 0.36, + "grad_norm": 0.6275247077144798, + "learning_rate": 3.722309249002854e-05, + "loss": 1.9254, + "step": 4631 + }, + { + "epoch": 0.36, + "grad_norm": 0.6077404667060448, + "learning_rate": 3.7217642942263116e-05, + "loss": 2.0486, + "step": 4632 + }, + { + "epoch": 0.36, + "grad_norm": 0.8651727763107769, + "learning_rate": 3.721219263170005e-05, + "loss": 2.144, + "step": 4633 + }, + { + "epoch": 0.36, + "grad_norm": 0.6093622984162658, + "learning_rate": 3.720674155867962e-05, + "loss": 1.9273, + "step": 4634 + }, + { + "epoch": 0.36, + "grad_norm": 0.6736119062326317, + "learning_rate": 3.720128972354217e-05, + "loss": 1.9216, + "step": 4635 + }, + { + "epoch": 0.36, + "grad_norm": 0.6630065011592754, + "learning_rate": 3.719583712662809e-05, + "loss": 1.9821, + "step": 4636 + }, + { + "epoch": 0.36, + "grad_norm": 0.6053358140119253, + "learning_rate": 3.719038376827778e-05, + "loss": 2.1618, + "step": 4637 + }, + { + "epoch": 0.36, + "grad_norm": 0.5799597435925008, + "learning_rate": 3.718492964883175e-05, + "loss": 1.9346, + "step": 4638 + }, + { + "epoch": 0.36, + "grad_norm": 0.6228775409115401, + "learning_rate": 3.717947476863049e-05, + "loss": 2.0254, + "step": 4639 + }, + { + "epoch": 0.36, + "grad_norm": 0.6759821630854005, + "learning_rate": 3.7174019128014595e-05, + "loss": 1.9418, + "step": 4640 + }, + { + "epoch": 0.36, + "grad_norm": 0.6459516164249385, + "learning_rate": 3.7168562727324665e-05, + "loss": 2.0994, + "step": 4641 + }, + { + "epoch": 0.36, + "grad_norm": 0.8044837298201137, + "learning_rate": 3.716310556690138e-05, + "loss": 1.983, + "step": 4642 + }, + { + "epoch": 0.36, + "grad_norm": 0.617742525089041, + "learning_rate": 3.715764764708544e-05, + "loss": 1.9656, + "step": 4643 + }, + { + "epoch": 0.36, + "grad_norm": 0.6550254102178965, + "learning_rate": 3.7152188968217626e-05, + "loss": 2.0012, + "step": 4644 + }, + { + "epoch": 0.36, + "grad_norm": 0.6018839108485253, + "learning_rate": 3.714672953063872e-05, + "loss": 2.1283, + "step": 4645 + }, + { + "epoch": 0.36, + "grad_norm": 0.5707417904779045, + "learning_rate": 3.714126933468959e-05, + "loss": 1.9697, + "step": 4646 + }, + { + "epoch": 0.36, + "grad_norm": 0.6809655619899423, + "learning_rate": 3.713580838071115e-05, + "loss": 1.9183, + "step": 4647 + }, + { + "epoch": 0.36, + "grad_norm": 0.5941669650791962, + "learning_rate": 3.7130346669044334e-05, + "loss": 1.9262, + "step": 4648 + }, + { + "epoch": 0.36, + "grad_norm": 0.7518815762349834, + "learning_rate": 3.712488420003013e-05, + "loss": 2.0998, + "step": 4649 + }, + { + "epoch": 0.36, + "grad_norm": 0.5725059687229308, + "learning_rate": 3.711942097400961e-05, + "loss": 2.0428, + "step": 4650 + }, + { + "epoch": 0.36, + "grad_norm": 0.6596717414011584, + "learning_rate": 3.711395699132384e-05, + "loss": 1.9658, + "step": 4651 + }, + { + "epoch": 0.36, + "grad_norm": 0.6030059234909771, + "learning_rate": 3.7108492252313975e-05, + "loss": 1.9322, + "step": 4652 + }, + { + "epoch": 0.36, + "grad_norm": 0.5484816064757371, + "learning_rate": 3.710302675732119e-05, + "loss": 2.1407, + "step": 4653 + }, + { + "epoch": 0.36, + "grad_norm": 0.647664935855046, + "learning_rate": 3.709756050668673e-05, + "loss": 2.0041, + "step": 4654 + }, + { + "epoch": 0.36, + "grad_norm": 0.579449708215444, + "learning_rate": 3.709209350075187e-05, + "loss": 1.9277, + "step": 4655 + }, + { + "epoch": 0.36, + "grad_norm": 0.7497199718254093, + "learning_rate": 3.7086625739857936e-05, + "loss": 2.0176, + "step": 4656 + }, + { + "epoch": 0.36, + "grad_norm": 0.59496679369951, + "learning_rate": 3.70811572243463e-05, + "loss": 2.1822, + "step": 4657 + }, + { + "epoch": 0.36, + "grad_norm": 0.6484314983319903, + "learning_rate": 3.707568795455839e-05, + "loss": 1.9863, + "step": 4658 + }, + { + "epoch": 0.36, + "grad_norm": 0.5690762120048712, + "learning_rate": 3.707021793083568e-05, + "loss": 1.9237, + "step": 4659 + }, + { + "epoch": 0.36, + "grad_norm": 0.6705043008743519, + "learning_rate": 3.7064747153519666e-05, + "loss": 1.9436, + "step": 4660 + }, + { + "epoch": 0.36, + "grad_norm": 0.6605600977110833, + "learning_rate": 3.705927562295193e-05, + "loss": 2.1577, + "step": 4661 + }, + { + "epoch": 0.36, + "grad_norm": 0.5789292077746689, + "learning_rate": 3.705380333947408e-05, + "loss": 1.974, + "step": 4662 + }, + { + "epoch": 0.36, + "grad_norm": 0.7034311252114132, + "learning_rate": 3.704833030342777e-05, + "loss": 2.0697, + "step": 4663 + }, + { + "epoch": 0.36, + "grad_norm": 0.5831696916825349, + "learning_rate": 3.7042856515154695e-05, + "loss": 1.9434, + "step": 4664 + }, + { + "epoch": 0.36, + "grad_norm": 0.9460172960487814, + "learning_rate": 3.703738197499663e-05, + "loss": 2.1367, + "step": 4665 + }, + { + "epoch": 0.36, + "grad_norm": 0.5654026776816257, + "learning_rate": 3.7031906683295336e-05, + "loss": 1.9181, + "step": 4666 + }, + { + "epoch": 0.36, + "grad_norm": 0.846738244729651, + "learning_rate": 3.702643064039269e-05, + "loss": 1.9382, + "step": 4667 + }, + { + "epoch": 0.36, + "grad_norm": 0.6118843728036611, + "learning_rate": 3.7020953846630574e-05, + "loss": 1.9653, + "step": 4668 + }, + { + "epoch": 0.36, + "grad_norm": 0.7572111987931353, + "learning_rate": 3.7015476302350924e-05, + "loss": 2.1739, + "step": 4669 + }, + { + "epoch": 0.36, + "grad_norm": 0.5949346599788876, + "learning_rate": 3.700999800789574e-05, + "loss": 1.9535, + "step": 4670 + }, + { + "epoch": 0.36, + "grad_norm": 0.7023064675007107, + "learning_rate": 3.700451896360702e-05, + "loss": 1.9342, + "step": 4671 + }, + { + "epoch": 0.36, + "grad_norm": 0.7893816354472377, + "learning_rate": 3.699903916982688e-05, + "loss": 1.9422, + "step": 4672 + }, + { + "epoch": 0.36, + "grad_norm": 0.5871888810771583, + "learning_rate": 3.699355862689743e-05, + "loss": 2.1476, + "step": 4673 + }, + { + "epoch": 0.36, + "grad_norm": 0.7243676654051104, + "learning_rate": 3.6988077335160844e-05, + "loss": 1.9757, + "step": 4674 + }, + { + "epoch": 0.36, + "grad_norm": 0.6273895543975264, + "learning_rate": 3.698259529495933e-05, + "loss": 1.9991, + "step": 4675 + }, + { + "epoch": 0.36, + "grad_norm": 0.6656282908365744, + "learning_rate": 3.697711250663518e-05, + "loss": 1.944, + "step": 4676 + }, + { + "epoch": 0.36, + "grad_norm": 0.7190901801235662, + "learning_rate": 3.697162897053069e-05, + "loss": 2.1093, + "step": 4677 + }, + { + "epoch": 0.36, + "grad_norm": 0.761173362190466, + "learning_rate": 3.696614468698822e-05, + "loss": 1.984, + "step": 4678 + }, + { + "epoch": 0.36, + "grad_norm": 0.6424122385025459, + "learning_rate": 3.6960659656350186e-05, + "loss": 1.9448, + "step": 4679 + }, + { + "epoch": 0.36, + "grad_norm": 0.890952911421364, + "learning_rate": 3.695517387895902e-05, + "loss": 1.9384, + "step": 4680 + }, + { + "epoch": 0.36, + "grad_norm": 0.6989737145639305, + "learning_rate": 3.694968735515725e-05, + "loss": 2.1309, + "step": 4681 + }, + { + "epoch": 0.36, + "grad_norm": 0.7220639939720056, + "learning_rate": 3.694420008528739e-05, + "loss": 1.9886, + "step": 4682 + }, + { + "epoch": 0.36, + "grad_norm": 0.7970861329113466, + "learning_rate": 3.6938712069692074e-05, + "loss": 1.9776, + "step": 4683 + }, + { + "epoch": 0.36, + "grad_norm": 0.6411988496901369, + "learning_rate": 3.693322330871391e-05, + "loss": 1.9829, + "step": 4684 + }, + { + "epoch": 0.36, + "grad_norm": 0.7227587877892967, + "learning_rate": 3.692773380269558e-05, + "loss": 2.1231, + "step": 4685 + }, + { + "epoch": 0.36, + "grad_norm": 0.6833635231869992, + "learning_rate": 3.692224355197984e-05, + "loss": 1.94, + "step": 4686 + }, + { + "epoch": 0.36, + "grad_norm": 0.6645386764457574, + "learning_rate": 3.691675255690946e-05, + "loss": 1.9642, + "step": 4687 + }, + { + "epoch": 0.36, + "grad_norm": 0.6965385015727485, + "learning_rate": 3.691126081782725e-05, + "loss": 1.9576, + "step": 4688 + }, + { + "epoch": 0.36, + "grad_norm": 0.7688105334265255, + "learning_rate": 3.69057683350761e-05, + "loss": 2.1369, + "step": 4689 + }, + { + "epoch": 0.36, + "grad_norm": 0.621924953292858, + "learning_rate": 3.6900275108998925e-05, + "loss": 1.9282, + "step": 4690 + }, + { + "epoch": 0.36, + "grad_norm": 0.7901117835822349, + "learning_rate": 3.689478113993869e-05, + "loss": 1.9855, + "step": 4691 + }, + { + "epoch": 0.36, + "grad_norm": 0.6908094168079423, + "learning_rate": 3.688928642823839e-05, + "loss": 1.9461, + "step": 4692 + }, + { + "epoch": 0.36, + "grad_norm": 0.7312677029226718, + "learning_rate": 3.688379097424111e-05, + "loss": 2.1617, + "step": 4693 + }, + { + "epoch": 0.36, + "grad_norm": 0.6717444361560017, + "learning_rate": 3.6878294778289934e-05, + "loss": 1.9665, + "step": 4694 + }, + { + "epoch": 0.36, + "grad_norm": 0.8909326024452142, + "learning_rate": 3.687279784072802e-05, + "loss": 1.9488, + "step": 4695 + }, + { + "epoch": 0.36, + "grad_norm": 0.7271402416990989, + "learning_rate": 3.686730016189856e-05, + "loss": 1.9845, + "step": 4696 + }, + { + "epoch": 0.36, + "grad_norm": 0.6966872470924167, + "learning_rate": 3.68618017421448e-05, + "loss": 2.1745, + "step": 4697 + }, + { + "epoch": 0.36, + "grad_norm": 0.6524514981011487, + "learning_rate": 3.685630258181003e-05, + "loss": 1.9557, + "step": 4698 + }, + { + "epoch": 0.36, + "grad_norm": 0.737429228420455, + "learning_rate": 3.6850802681237586e-05, + "loss": 1.9708, + "step": 4699 + }, + { + "epoch": 0.36, + "grad_norm": 0.6159700820543748, + "learning_rate": 3.684530204077083e-05, + "loss": 2.0117, + "step": 4700 + }, + { + "epoch": 0.36, + "grad_norm": 0.6066024934144761, + "learning_rate": 3.6839800660753224e-05, + "loss": 2.1746, + "step": 4701 + }, + { + "epoch": 0.36, + "grad_norm": 0.6359433901257521, + "learning_rate": 3.683429854152821e-05, + "loss": 1.9269, + "step": 4702 + }, + { + "epoch": 0.36, + "grad_norm": 0.6289758389005274, + "learning_rate": 3.682879568343933e-05, + "loss": 1.9624, + "step": 4703 + }, + { + "epoch": 0.36, + "grad_norm": 0.6541772419167793, + "learning_rate": 3.682329208683014e-05, + "loss": 1.9802, + "step": 4704 + }, + { + "epoch": 0.36, + "grad_norm": 0.6026982855266556, + "learning_rate": 3.681778775204425e-05, + "loss": 2.1602, + "step": 4705 + }, + { + "epoch": 0.36, + "grad_norm": 0.7724504028348298, + "learning_rate": 3.681228267942533e-05, + "loss": 2.0269, + "step": 4706 + }, + { + "epoch": 0.36, + "grad_norm": 0.6211520140337724, + "learning_rate": 3.680677686931707e-05, + "loss": 1.9657, + "step": 4707 + }, + { + "epoch": 0.36, + "grad_norm": 0.7451407836550549, + "learning_rate": 3.680127032206323e-05, + "loss": 2.0122, + "step": 4708 + }, + { + "epoch": 0.36, + "grad_norm": 0.8191650691320711, + "learning_rate": 3.67957630380076e-05, + "loss": 2.1867, + "step": 4709 + }, + { + "epoch": 0.36, + "grad_norm": 0.6745953566886871, + "learning_rate": 3.6790255017494025e-05, + "loss": 1.9312, + "step": 4710 + }, + { + "epoch": 0.36, + "grad_norm": 0.7616853766009727, + "learning_rate": 3.6784746260866394e-05, + "loss": 1.96, + "step": 4711 + }, + { + "epoch": 0.36, + "grad_norm": 0.7295046801112041, + "learning_rate": 3.677923676846864e-05, + "loss": 2.0049, + "step": 4712 + }, + { + "epoch": 0.36, + "grad_norm": 0.694161046260058, + "learning_rate": 3.6773726540644746e-05, + "loss": 2.1122, + "step": 4713 + }, + { + "epoch": 0.36, + "grad_norm": 0.6638928477942693, + "learning_rate": 3.676821557773874e-05, + "loss": 1.9574, + "step": 4714 + }, + { + "epoch": 0.36, + "grad_norm": 0.6624925543645899, + "learning_rate": 3.676270388009469e-05, + "loss": 1.9721, + "step": 4715 + }, + { + "epoch": 0.36, + "grad_norm": 0.6660978387182667, + "learning_rate": 3.675719144805671e-05, + "loss": 1.9273, + "step": 4716 + }, + { + "epoch": 0.36, + "grad_norm": 0.9183644923551257, + "learning_rate": 3.675167828196896e-05, + "loss": 2.1165, + "step": 4717 + }, + { + "epoch": 0.36, + "grad_norm": 0.6559180353931214, + "learning_rate": 3.6746164382175674e-05, + "loss": 1.9967, + "step": 4718 + }, + { + "epoch": 0.36, + "grad_norm": 0.7120517330747328, + "learning_rate": 3.6740649749021084e-05, + "loss": 1.9402, + "step": 4719 + }, + { + "epoch": 0.36, + "grad_norm": 0.6409065073598506, + "learning_rate": 3.67351343828495e-05, + "loss": 1.9214, + "step": 4720 + }, + { + "epoch": 0.36, + "grad_norm": 0.6731334120056713, + "learning_rate": 3.6729618284005266e-05, + "loss": 2.1697, + "step": 4721 + }, + { + "epoch": 0.36, + "grad_norm": 0.6367337670013409, + "learning_rate": 3.672410145283278e-05, + "loss": 1.9172, + "step": 4722 + }, + { + "epoch": 0.36, + "grad_norm": 0.650152979979224, + "learning_rate": 3.671858388967647e-05, + "loss": 1.9346, + "step": 4723 + }, + { + "epoch": 0.36, + "grad_norm": 0.6358511097484627, + "learning_rate": 3.671306559488083e-05, + "loss": 1.8852, + "step": 4724 + }, + { + "epoch": 0.36, + "grad_norm": 0.7829484360152792, + "learning_rate": 3.670754656879038e-05, + "loss": 1.9841, + "step": 4725 + }, + { + "epoch": 0.36, + "grad_norm": 0.5437294739990459, + "learning_rate": 3.6702026811749715e-05, + "loss": 2.1451, + "step": 4726 + }, + { + "epoch": 0.36, + "grad_norm": 0.8352595488512609, + "learning_rate": 3.669650632410343e-05, + "loss": 1.9545, + "step": 4727 + }, + { + "epoch": 0.36, + "grad_norm": 0.6711262599207909, + "learning_rate": 3.669098510619621e-05, + "loss": 1.9366, + "step": 4728 + }, + { + "epoch": 0.36, + "grad_norm": 0.645527468457305, + "learning_rate": 3.6685463158372764e-05, + "loss": 2.08, + "step": 4729 + }, + { + "epoch": 0.36, + "grad_norm": 0.7214608629543685, + "learning_rate": 3.667994048097785e-05, + "loss": 1.9247, + "step": 4730 + }, + { + "epoch": 0.36, + "grad_norm": 0.6107787139542818, + "learning_rate": 3.667441707435626e-05, + "loss": 2.0255, + "step": 4731 + }, + { + "epoch": 0.37, + "grad_norm": 0.5908272047238038, + "learning_rate": 3.666889293885286e-05, + "loss": 1.9859, + "step": 4732 + }, + { + "epoch": 0.37, + "grad_norm": 0.7067818259224394, + "learning_rate": 3.666336807481253e-05, + "loss": 2.1366, + "step": 4733 + }, + { + "epoch": 0.37, + "grad_norm": 0.6059287932626503, + "learning_rate": 3.6657842482580225e-05, + "loss": 1.9511, + "step": 4734 + }, + { + "epoch": 0.37, + "grad_norm": 0.6573364868243076, + "learning_rate": 3.6652316162500925e-05, + "loss": 1.9758, + "step": 4735 + }, + { + "epoch": 0.37, + "grad_norm": 0.6362458790995237, + "learning_rate": 3.664678911491964e-05, + "loss": 1.9272, + "step": 4736 + }, + { + "epoch": 0.37, + "grad_norm": 0.722959643300314, + "learning_rate": 3.664126134018149e-05, + "loss": 2.0347, + "step": 4737 + }, + { + "epoch": 0.37, + "grad_norm": 0.6504901755942433, + "learning_rate": 3.663573283863155e-05, + "loss": 2.1483, + "step": 4738 + }, + { + "epoch": 0.37, + "grad_norm": 0.7863910147687538, + "learning_rate": 3.663020361061501e-05, + "loss": 1.9469, + "step": 4739 + }, + { + "epoch": 0.37, + "grad_norm": 0.6254541932283408, + "learning_rate": 3.6624673656477085e-05, + "loss": 1.913, + "step": 4740 + }, + { + "epoch": 0.37, + "grad_norm": 0.6893589688892566, + "learning_rate": 3.661914297656304e-05, + "loss": 2.1198, + "step": 4741 + }, + { + "epoch": 0.37, + "grad_norm": 0.5943159851954114, + "learning_rate": 3.661361157121816e-05, + "loss": 1.9524, + "step": 4742 + }, + { + "epoch": 0.37, + "grad_norm": 0.6369550146610724, + "learning_rate": 3.660807944078778e-05, + "loss": 2.0145, + "step": 4743 + }, + { + "epoch": 0.37, + "grad_norm": 0.598288693577857, + "learning_rate": 3.6602546585617336e-05, + "loss": 1.9399, + "step": 4744 + }, + { + "epoch": 0.37, + "grad_norm": 0.6413492942640383, + "learning_rate": 3.659701300605224e-05, + "loss": 2.1052, + "step": 4745 + }, + { + "epoch": 0.37, + "grad_norm": 0.5991417708818674, + "learning_rate": 3.6591478702437976e-05, + "loss": 1.9681, + "step": 4746 + }, + { + "epoch": 0.37, + "grad_norm": 0.6241898250773444, + "learning_rate": 3.658594367512008e-05, + "loss": 1.9846, + "step": 4747 + }, + { + "epoch": 0.37, + "grad_norm": 0.5775838621061189, + "learning_rate": 3.6580407924444114e-05, + "loss": 1.9257, + "step": 4748 + }, + { + "epoch": 0.37, + "grad_norm": 0.5942154168384388, + "learning_rate": 3.657487145075572e-05, + "loss": 2.1569, + "step": 4749 + }, + { + "epoch": 0.37, + "grad_norm": 0.5953021834184642, + "learning_rate": 3.656933425440055e-05, + "loss": 2.0365, + "step": 4750 + }, + { + "epoch": 0.37, + "grad_norm": 0.6199424827545824, + "learning_rate": 3.656379633572431e-05, + "loss": 1.9542, + "step": 4751 + }, + { + "epoch": 0.37, + "grad_norm": 0.6194022955660908, + "learning_rate": 3.6558257695072764e-05, + "loss": 1.9068, + "step": 4752 + }, + { + "epoch": 0.37, + "grad_norm": 0.6733998969564405, + "learning_rate": 3.6552718332791705e-05, + "loss": 2.1807, + "step": 4753 + }, + { + "epoch": 0.37, + "grad_norm": 0.7006791335968228, + "learning_rate": 3.654717824922698e-05, + "loss": 1.9273, + "step": 4754 + }, + { + "epoch": 0.37, + "grad_norm": 0.5792439530125585, + "learning_rate": 3.654163744472448e-05, + "loss": 1.952, + "step": 4755 + }, + { + "epoch": 0.37, + "grad_norm": 0.6828276972817997, + "learning_rate": 3.653609591963014e-05, + "loss": 2.0, + "step": 4756 + }, + { + "epoch": 0.37, + "grad_norm": 0.625322314376544, + "learning_rate": 3.653055367428993e-05, + "loss": 1.9745, + "step": 4757 + }, + { + "epoch": 0.37, + "grad_norm": 0.7885891218744532, + "learning_rate": 3.65250107090499e-05, + "loss": 2.1744, + "step": 4758 + }, + { + "epoch": 0.37, + "grad_norm": 0.6003271068487426, + "learning_rate": 3.65194670242561e-05, + "loss": 1.8843, + "step": 4759 + }, + { + "epoch": 0.37, + "grad_norm": 0.8609184344399764, + "learning_rate": 3.651392262025465e-05, + "loss": 1.9825, + "step": 4760 + }, + { + "epoch": 0.37, + "grad_norm": 0.6101162523801026, + "learning_rate": 3.6508377497391714e-05, + "loss": 2.1402, + "step": 4761 + }, + { + "epoch": 0.37, + "grad_norm": 0.8534718123886675, + "learning_rate": 3.6502831656013496e-05, + "loss": 2.0237, + "step": 4762 + }, + { + "epoch": 0.37, + "grad_norm": 0.696970966415562, + "learning_rate": 3.649728509646624e-05, + "loss": 1.9946, + "step": 4763 + }, + { + "epoch": 0.37, + "grad_norm": 0.9040466391560001, + "learning_rate": 3.649173781909624e-05, + "loss": 1.9549, + "step": 4764 + }, + { + "epoch": 0.37, + "grad_norm": 0.7925418800453099, + "learning_rate": 3.648618982424984e-05, + "loss": 2.1358, + "step": 4765 + }, + { + "epoch": 0.37, + "grad_norm": 0.6176131791561329, + "learning_rate": 3.648064111227343e-05, + "loss": 1.9197, + "step": 4766 + }, + { + "epoch": 0.37, + "grad_norm": 0.9971629430327391, + "learning_rate": 3.647509168351342e-05, + "loss": 1.9576, + "step": 4767 + }, + { + "epoch": 0.37, + "grad_norm": 0.6735379764414092, + "learning_rate": 3.6469541538316305e-05, + "loss": 1.9922, + "step": 4768 + }, + { + "epoch": 0.37, + "grad_norm": 0.7999531466813391, + "learning_rate": 3.6463990677028584e-05, + "loss": 1.9518, + "step": 4769 + }, + { + "epoch": 0.37, + "grad_norm": 0.7424020831917181, + "learning_rate": 3.645843909999684e-05, + "loss": 2.1188, + "step": 4770 + }, + { + "epoch": 0.37, + "grad_norm": 0.8578322560891996, + "learning_rate": 3.645288680756766e-05, + "loss": 1.9651, + "step": 4771 + }, + { + "epoch": 0.37, + "grad_norm": 0.7497432528668888, + "learning_rate": 3.6447333800087714e-05, + "loss": 1.9299, + "step": 4772 + }, + { + "epoch": 0.37, + "grad_norm": 0.6746771859867038, + "learning_rate": 3.644178007790369e-05, + "loss": 2.1816, + "step": 4773 + }, + { + "epoch": 0.37, + "grad_norm": 0.7167248304321332, + "learning_rate": 3.643622564136233e-05, + "loss": 2.0064, + "step": 4774 + }, + { + "epoch": 0.37, + "grad_norm": 0.5722167492806013, + "learning_rate": 3.6430670490810424e-05, + "loss": 1.8921, + "step": 4775 + }, + { + "epoch": 0.37, + "grad_norm": 0.6989817001031443, + "learning_rate": 3.64251146265948e-05, + "loss": 1.9661, + "step": 4776 + }, + { + "epoch": 0.37, + "grad_norm": 0.7228226005619175, + "learning_rate": 3.641955804906234e-05, + "loss": 1.988, + "step": 4777 + }, + { + "epoch": 0.37, + "grad_norm": 0.6480983627475008, + "learning_rate": 3.641400075855995e-05, + "loss": 2.1424, + "step": 4778 + }, + { + "epoch": 0.37, + "grad_norm": 0.7215222448976328, + "learning_rate": 3.640844275543461e-05, + "loss": 1.9284, + "step": 4779 + }, + { + "epoch": 0.37, + "grad_norm": 0.6382733868481769, + "learning_rate": 3.6402884040033325e-05, + "loss": 2.0043, + "step": 4780 + }, + { + "epoch": 0.37, + "grad_norm": 0.72673542802697, + "learning_rate": 3.639732461270313e-05, + "loss": 2.0087, + "step": 4781 + }, + { + "epoch": 0.37, + "grad_norm": 0.7352246283865421, + "learning_rate": 3.6391764473791146e-05, + "loss": 2.1643, + "step": 4782 + }, + { + "epoch": 0.37, + "grad_norm": 0.730022278407708, + "learning_rate": 3.638620362364451e-05, + "loss": 1.9394, + "step": 4783 + }, + { + "epoch": 0.37, + "grad_norm": 0.7040886428413377, + "learning_rate": 3.638064206261041e-05, + "loss": 1.9217, + "step": 4784 + }, + { + "epoch": 0.37, + "grad_norm": 0.64922167465325, + "learning_rate": 3.6375079791036066e-05, + "loss": 2.135, + "step": 4785 + }, + { + "epoch": 0.37, + "grad_norm": 0.6802041321356217, + "learning_rate": 3.636951680926876e-05, + "loss": 1.9149, + "step": 4786 + }, + { + "epoch": 0.37, + "grad_norm": 0.6351113225951748, + "learning_rate": 3.6363953117655815e-05, + "loss": 1.9879, + "step": 4787 + }, + { + "epoch": 0.37, + "grad_norm": 0.6329126638139047, + "learning_rate": 3.635838871654459e-05, + "loss": 1.9912, + "step": 4788 + }, + { + "epoch": 0.37, + "grad_norm": 0.6524280305072329, + "learning_rate": 3.63528236062825e-05, + "loss": 1.934, + "step": 4789 + }, + { + "epoch": 0.37, + "grad_norm": 0.59834984059157, + "learning_rate": 3.6347257787216984e-05, + "loss": 2.1532, + "step": 4790 + }, + { + "epoch": 0.37, + "grad_norm": 0.819354849115079, + "learning_rate": 3.634169125969556e-05, + "loss": 1.9114, + "step": 4791 + }, + { + "epoch": 0.37, + "grad_norm": 0.5574501477083332, + "learning_rate": 3.6336124024065756e-05, + "loss": 1.9329, + "step": 4792 + }, + { + "epoch": 0.37, + "grad_norm": 0.8849853051718746, + "learning_rate": 3.633055608067515e-05, + "loss": 2.0182, + "step": 4793 + }, + { + "epoch": 0.37, + "grad_norm": 0.734568881521357, + "learning_rate": 3.6324987429871396e-05, + "loss": 2.1256, + "step": 4794 + }, + { + "epoch": 0.37, + "grad_norm": 0.7362110709940252, + "learning_rate": 3.631941807200214e-05, + "loss": 1.948, + "step": 4795 + }, + { + "epoch": 0.37, + "grad_norm": 0.7688663947144935, + "learning_rate": 3.631384800741512e-05, + "loss": 1.9248, + "step": 4796 + }, + { + "epoch": 0.37, + "grad_norm": 0.8674906297711769, + "learning_rate": 3.6308277236458087e-05, + "loss": 2.1146, + "step": 4797 + }, + { + "epoch": 0.37, + "grad_norm": 0.6068105948524357, + "learning_rate": 3.630270575947885e-05, + "loss": 1.9392, + "step": 4798 + }, + { + "epoch": 0.37, + "grad_norm": 0.6256214382059277, + "learning_rate": 3.629713357682526e-05, + "loss": 1.999, + "step": 4799 + }, + { + "epoch": 0.37, + "grad_norm": 0.6179384383323173, + "learning_rate": 3.629156068884521e-05, + "loss": 1.9301, + "step": 4800 + }, + { + "epoch": 0.37, + "grad_norm": 0.6741669033673735, + "learning_rate": 3.628598709588664e-05, + "loss": 1.9412, + "step": 4801 + }, + { + "epoch": 0.37, + "grad_norm": 0.6883006040068876, + "learning_rate": 3.6280412798297536e-05, + "loss": 2.1709, + "step": 4802 + }, + { + "epoch": 0.37, + "grad_norm": 0.6397816357689469, + "learning_rate": 3.6274837796425915e-05, + "loss": 1.9127, + "step": 4803 + }, + { + "epoch": 0.37, + "grad_norm": 0.6680436694491237, + "learning_rate": 3.626926209061985e-05, + "loss": 1.916, + "step": 4804 + }, + { + "epoch": 0.37, + "grad_norm": 0.6352155594469302, + "learning_rate": 3.626368568122746e-05, + "loss": 1.9939, + "step": 4805 + }, + { + "epoch": 0.37, + "grad_norm": 0.6595464689303211, + "learning_rate": 3.625810856859691e-05, + "loss": 2.1554, + "step": 4806 + }, + { + "epoch": 0.37, + "grad_norm": 0.6368539874875424, + "learning_rate": 3.625253075307638e-05, + "loss": 1.9122, + "step": 4807 + }, + { + "epoch": 0.37, + "grad_norm": 0.576583132290356, + "learning_rate": 3.624695223501413e-05, + "loss": 1.9027, + "step": 4808 + }, + { + "epoch": 0.37, + "grad_norm": 0.8398802022237688, + "learning_rate": 3.624137301475846e-05, + "loss": 1.9326, + "step": 4809 + }, + { + "epoch": 0.37, + "grad_norm": 0.6850803838526457, + "learning_rate": 3.6235793092657677e-05, + "loss": 2.1494, + "step": 4810 + }, + { + "epoch": 0.37, + "grad_norm": 0.8661114941413354, + "learning_rate": 3.623021246906018e-05, + "loss": 1.9252, + "step": 4811 + }, + { + "epoch": 0.37, + "grad_norm": 0.7045008245636176, + "learning_rate": 3.622463114431439e-05, + "loss": 2.0383, + "step": 4812 + }, + { + "epoch": 0.37, + "grad_norm": 0.7955888263780125, + "learning_rate": 3.621904911876876e-05, + "loss": 1.9266, + "step": 4813 + }, + { + "epoch": 0.37, + "grad_norm": 0.7030212117884018, + "learning_rate": 3.62134663927718e-05, + "loss": 2.1516, + "step": 4814 + }, + { + "epoch": 0.37, + "grad_norm": 0.6507131457896277, + "learning_rate": 3.620788296667206e-05, + "loss": 1.9343, + "step": 4815 + }, + { + "epoch": 0.37, + "grad_norm": 0.6526919725551548, + "learning_rate": 3.620229884081816e-05, + "loss": 1.9412, + "step": 4816 + }, + { + "epoch": 0.37, + "grad_norm": 0.8476586273965105, + "learning_rate": 3.619671401555872e-05, + "loss": 2.1126, + "step": 4817 + }, + { + "epoch": 0.37, + "grad_norm": 0.6220980261084174, + "learning_rate": 3.619112849124242e-05, + "loss": 2.0459, + "step": 4818 + }, + { + "epoch": 0.37, + "grad_norm": 0.6651082744578745, + "learning_rate": 3.6185542268218e-05, + "loss": 1.9323, + "step": 4819 + }, + { + "epoch": 0.37, + "grad_norm": 0.6903831526174564, + "learning_rate": 3.617995534683422e-05, + "loss": 1.9248, + "step": 4820 + }, + { + "epoch": 0.37, + "grad_norm": 0.7014190415757046, + "learning_rate": 3.61743677274399e-05, + "loss": 1.9466, + "step": 4821 + }, + { + "epoch": 0.37, + "grad_norm": 0.6976815471371333, + "learning_rate": 3.61687794103839e-05, + "loss": 2.0984, + "step": 4822 + }, + { + "epoch": 0.37, + "grad_norm": 0.7382070190135815, + "learning_rate": 3.6163190396015115e-05, + "loss": 1.9494, + "step": 4823 + }, + { + "epoch": 0.37, + "grad_norm": 0.7367671758006374, + "learning_rate": 3.61576006846825e-05, + "loss": 1.9747, + "step": 4824 + }, + { + "epoch": 0.37, + "grad_norm": 0.7143127810736432, + "learning_rate": 3.615201027673504e-05, + "loss": 1.9418, + "step": 4825 + }, + { + "epoch": 0.37, + "grad_norm": 0.6618584439855055, + "learning_rate": 3.6146419172521755e-05, + "loss": 2.1213, + "step": 4826 + }, + { + "epoch": 0.37, + "grad_norm": 0.628905297473289, + "learning_rate": 3.614082737239174e-05, + "loss": 1.9553, + "step": 4827 + }, + { + "epoch": 0.37, + "grad_norm": 0.745367877986337, + "learning_rate": 3.613523487669409e-05, + "loss": 1.9467, + "step": 4828 + }, + { + "epoch": 0.37, + "grad_norm": 0.7563125224947433, + "learning_rate": 3.612964168577799e-05, + "loss": 2.159, + "step": 4829 + }, + { + "epoch": 0.37, + "grad_norm": 0.6378071852890695, + "learning_rate": 3.6124047799992644e-05, + "loss": 2.0224, + "step": 4830 + }, + { + "epoch": 0.37, + "grad_norm": 0.7233810739824218, + "learning_rate": 3.611845321968729e-05, + "loss": 1.9317, + "step": 4831 + }, + { + "epoch": 0.37, + "grad_norm": 0.6379880379420682, + "learning_rate": 3.611285794521123e-05, + "loss": 1.9459, + "step": 4832 + }, + { + "epoch": 0.37, + "grad_norm": 0.6098284416219479, + "learning_rate": 3.610726197691379e-05, + "loss": 1.9352, + "step": 4833 + }, + { + "epoch": 0.37, + "grad_norm": 0.6395096838876897, + "learning_rate": 3.610166531514436e-05, + "loss": 2.1485, + "step": 4834 + }, + { + "epoch": 0.37, + "grad_norm": 0.7531053626722699, + "learning_rate": 3.609606796025235e-05, + "loss": 1.935, + "step": 4835 + }, + { + "epoch": 0.37, + "grad_norm": 0.5613887591457554, + "learning_rate": 3.609046991258724e-05, + "loss": 2.0268, + "step": 4836 + }, + { + "epoch": 0.37, + "grad_norm": 0.6253388748590373, + "learning_rate": 3.608487117249853e-05, + "loss": 1.8833, + "step": 4837 + }, + { + "epoch": 0.37, + "grad_norm": 0.7218240222397928, + "learning_rate": 3.607927174033577e-05, + "loss": 2.1792, + "step": 4838 + }, + { + "epoch": 0.37, + "grad_norm": 0.5902828546062213, + "learning_rate": 3.607367161644857e-05, + "loss": 1.9617, + "step": 4839 + }, + { + "epoch": 0.37, + "grad_norm": 0.7533100967655497, + "learning_rate": 3.6068070801186556e-05, + "loss": 1.9455, + "step": 4840 + }, + { + "epoch": 0.37, + "grad_norm": 0.558009128469453, + "learning_rate": 3.606246929489941e-05, + "loss": 1.9555, + "step": 4841 + }, + { + "epoch": 0.37, + "grad_norm": 0.6486365976513113, + "learning_rate": 3.605686709793686e-05, + "loss": 2.1386, + "step": 4842 + }, + { + "epoch": 0.37, + "grad_norm": 0.6321387886310299, + "learning_rate": 3.605126421064868e-05, + "loss": 1.9676, + "step": 4843 + }, + { + "epoch": 0.37, + "grad_norm": 0.65173096131481, + "learning_rate": 3.604566063338467e-05, + "loss": 1.9478, + "step": 4844 + }, + { + "epoch": 0.37, + "grad_norm": 0.6228662913301347, + "learning_rate": 3.604005636649469e-05, + "loss": 1.9486, + "step": 4845 + }, + { + "epoch": 0.37, + "grad_norm": 0.624371585335125, + "learning_rate": 3.603445141032864e-05, + "loss": 2.115, + "step": 4846 + }, + { + "epoch": 0.37, + "grad_norm": 0.5968152884125457, + "learning_rate": 3.602884576523646e-05, + "loss": 1.9123, + "step": 4847 + }, + { + "epoch": 0.37, + "grad_norm": 0.747768162418612, + "learning_rate": 3.602323943156813e-05, + "loss": 1.9538, + "step": 4848 + }, + { + "epoch": 0.37, + "grad_norm": 0.6437661254966658, + "learning_rate": 3.601763240967367e-05, + "loss": 2.0026, + "step": 4849 + }, + { + "epoch": 0.37, + "grad_norm": 0.6028479013512997, + "learning_rate": 3.601202469990317e-05, + "loss": 2.1292, + "step": 4850 + }, + { + "epoch": 0.37, + "grad_norm": 0.701882364962353, + "learning_rate": 3.600641630260671e-05, + "loss": 1.9085, + "step": 4851 + }, + { + "epoch": 0.37, + "grad_norm": 0.6543341446933661, + "learning_rate": 3.600080721813449e-05, + "loss": 1.9492, + "step": 4852 + }, + { + "epoch": 0.37, + "grad_norm": 0.6241082689790098, + "learning_rate": 3.599519744683667e-05, + "loss": 1.9316, + "step": 4853 + }, + { + "epoch": 0.37, + "grad_norm": 0.7367186568189865, + "learning_rate": 3.59895869890635e-05, + "loss": 2.1732, + "step": 4854 + }, + { + "epoch": 0.37, + "grad_norm": 0.5782439832713605, + "learning_rate": 3.598397584516527e-05, + "loss": 2.0535, + "step": 4855 + }, + { + "epoch": 0.37, + "grad_norm": 0.7170534204782196, + "learning_rate": 3.5978364015492316e-05, + "loss": 1.9535, + "step": 4856 + }, + { + "epoch": 0.37, + "grad_norm": 0.5957469533969957, + "learning_rate": 3.597275150039499e-05, + "loss": 1.8993, + "step": 4857 + }, + { + "epoch": 0.37, + "grad_norm": 0.7500520501754648, + "learning_rate": 3.596713830022369e-05, + "loss": 2.1402, + "step": 4858 + }, + { + "epoch": 0.37, + "grad_norm": 0.6488243955121235, + "learning_rate": 3.596152441532892e-05, + "loss": 1.8859, + "step": 4859 + }, + { + "epoch": 0.37, + "grad_norm": 0.6621914445148278, + "learning_rate": 3.595590984606114e-05, + "loss": 1.9178, + "step": 4860 + }, + { + "epoch": 0.38, + "grad_norm": 0.6487321180927275, + "learning_rate": 3.59502945927709e-05, + "loss": 1.9983, + "step": 4861 + }, + { + "epoch": 0.38, + "grad_norm": 0.6293516161470537, + "learning_rate": 3.5944678655808776e-05, + "loss": 2.1425, + "step": 4862 + }, + { + "epoch": 0.38, + "grad_norm": 0.676160178749284, + "learning_rate": 3.5939062035525415e-05, + "loss": 1.9776, + "step": 4863 + }, + { + "epoch": 0.38, + "grad_norm": 0.7545774529736037, + "learning_rate": 3.5933444732271466e-05, + "loss": 1.9758, + "step": 4864 + }, + { + "epoch": 0.38, + "grad_norm": 0.7370220867433243, + "learning_rate": 3.592782674639765e-05, + "loss": 1.915, + "step": 4865 + }, + { + "epoch": 0.38, + "grad_norm": 0.7241577712820746, + "learning_rate": 3.592220807825471e-05, + "loss": 2.1472, + "step": 4866 + }, + { + "epoch": 0.38, + "grad_norm": 0.6535015884361915, + "learning_rate": 3.5916588728193454e-05, + "loss": 2.004, + "step": 4867 + }, + { + "epoch": 0.38, + "grad_norm": 0.8188842802311205, + "learning_rate": 3.5910968696564715e-05, + "loss": 1.9369, + "step": 4868 + }, + { + "epoch": 0.38, + "grad_norm": 0.6576870158359984, + "learning_rate": 3.5905347983719384e-05, + "loss": 1.9254, + "step": 4869 + }, + { + "epoch": 0.38, + "grad_norm": 0.862925927629241, + "learning_rate": 3.589972659000838e-05, + "loss": 2.1162, + "step": 4870 + }, + { + "epoch": 0.38, + "grad_norm": 0.6625663915038383, + "learning_rate": 3.589410451578266e-05, + "loss": 1.9184, + "step": 4871 + }, + { + "epoch": 0.38, + "grad_norm": 0.7044216793811316, + "learning_rate": 3.5888481761393236e-05, + "loss": 1.9468, + "step": 4872 + }, + { + "epoch": 0.38, + "grad_norm": 0.5931061083974939, + "learning_rate": 3.588285832719117e-05, + "loss": 1.9318, + "step": 4873 + }, + { + "epoch": 0.38, + "grad_norm": 0.7799332121425692, + "learning_rate": 3.5877234213527555e-05, + "loss": 2.1624, + "step": 4874 + }, + { + "epoch": 0.38, + "grad_norm": 0.5893027717145287, + "learning_rate": 3.5871609420753515e-05, + "loss": 1.9149, + "step": 4875 + }, + { + "epoch": 0.38, + "grad_norm": 0.625378115703623, + "learning_rate": 3.586598394922025e-05, + "loss": 1.9441, + "step": 4876 + }, + { + "epoch": 0.38, + "grad_norm": 0.625346058798477, + "learning_rate": 3.586035779927896e-05, + "loss": 1.9113, + "step": 4877 + }, + { + "epoch": 0.38, + "grad_norm": 0.7466397160493984, + "learning_rate": 3.585473097128092e-05, + "loss": 2.1596, + "step": 4878 + }, + { + "epoch": 0.38, + "grad_norm": 0.6286924315052096, + "learning_rate": 3.584910346557743e-05, + "loss": 1.942, + "step": 4879 + }, + { + "epoch": 0.38, + "grad_norm": 0.7182046538500895, + "learning_rate": 3.584347528251983e-05, + "loss": 1.9596, + "step": 4880 + }, + { + "epoch": 0.38, + "grad_norm": 0.6564135419418355, + "learning_rate": 3.583784642245954e-05, + "loss": 1.9692, + "step": 4881 + }, + { + "epoch": 0.38, + "grad_norm": 0.6719896381860888, + "learning_rate": 3.583221688574798e-05, + "loss": 2.1145, + "step": 4882 + }, + { + "epoch": 0.38, + "grad_norm": 0.6626820275549926, + "learning_rate": 3.582658667273661e-05, + "loss": 1.943, + "step": 4883 + }, + { + "epoch": 0.38, + "grad_norm": 0.6305779435852528, + "learning_rate": 3.582095578377695e-05, + "loss": 1.9804, + "step": 4884 + }, + { + "epoch": 0.38, + "grad_norm": 0.7769676042755611, + "learning_rate": 3.581532421922058e-05, + "loss": 1.9346, + "step": 4885 + }, + { + "epoch": 0.38, + "grad_norm": 0.6636510673320748, + "learning_rate": 3.580969197941907e-05, + "loss": 2.1317, + "step": 4886 + }, + { + "epoch": 0.38, + "grad_norm": 0.5964077681184267, + "learning_rate": 3.58040590647241e-05, + "loss": 1.9183, + "step": 4887 + }, + { + "epoch": 0.38, + "grad_norm": 0.7905805499799053, + "learning_rate": 3.5798425475487326e-05, + "loss": 1.8928, + "step": 4888 + }, + { + "epoch": 0.38, + "grad_norm": 0.6668423946944634, + "learning_rate": 3.57927912120605e-05, + "loss": 1.9675, + "step": 4889 + }, + { + "epoch": 0.38, + "grad_norm": 0.693228153826767, + "learning_rate": 3.578715627479537e-05, + "loss": 2.1948, + "step": 4890 + }, + { + "epoch": 0.38, + "grad_norm": 0.6261805740600579, + "learning_rate": 3.578152066404377e-05, + "loss": 1.9651, + "step": 4891 + }, + { + "epoch": 0.38, + "grad_norm": 0.5750266366947432, + "learning_rate": 3.5775884380157535e-05, + "loss": 1.9995, + "step": 4892 + }, + { + "epoch": 0.38, + "grad_norm": 0.5897497716502431, + "learning_rate": 3.5770247423488576e-05, + "loss": 1.9676, + "step": 4893 + }, + { + "epoch": 0.38, + "grad_norm": 0.6938595282712247, + "learning_rate": 3.5764609794388824e-05, + "loss": 2.1646, + "step": 4894 + }, + { + "epoch": 0.38, + "grad_norm": 0.5818011666412489, + "learning_rate": 3.575897149321026e-05, + "loss": 1.9607, + "step": 4895 + }, + { + "epoch": 0.38, + "grad_norm": 0.781899539806489, + "learning_rate": 3.57533325203049e-05, + "loss": 1.9604, + "step": 4896 + }, + { + "epoch": 0.38, + "grad_norm": 0.6104675117794495, + "learning_rate": 3.574769287602482e-05, + "loss": 1.9061, + "step": 4897 + }, + { + "epoch": 0.38, + "grad_norm": 0.7949453918841302, + "learning_rate": 3.574205256072212e-05, + "loss": 2.1601, + "step": 4898 + }, + { + "epoch": 0.38, + "grad_norm": 0.6233673003276599, + "learning_rate": 3.573641157474896e-05, + "loss": 1.9039, + "step": 4899 + }, + { + "epoch": 0.38, + "grad_norm": 0.6875097083360802, + "learning_rate": 3.5730769918457505e-05, + "loss": 1.918, + "step": 4900 + }, + { + "epoch": 0.38, + "grad_norm": 0.6298407881285911, + "learning_rate": 3.5725127592200004e-05, + "loss": 1.9486, + "step": 4901 + }, + { + "epoch": 0.38, + "grad_norm": 0.6879005699387595, + "learning_rate": 3.571948459632874e-05, + "loss": 2.201, + "step": 4902 + }, + { + "epoch": 0.38, + "grad_norm": 0.585224647237516, + "learning_rate": 3.5713840931196e-05, + "loss": 1.9238, + "step": 4903 + }, + { + "epoch": 0.38, + "grad_norm": 0.6634765641228637, + "learning_rate": 3.5708196597154156e-05, + "loss": 1.9315, + "step": 4904 + }, + { + "epoch": 0.38, + "grad_norm": 0.6282448634397899, + "learning_rate": 3.570255159455561e-05, + "loss": 1.9945, + "step": 4905 + }, + { + "epoch": 0.38, + "grad_norm": 0.6665441768835734, + "learning_rate": 3.569690592375281e-05, + "loss": 2.188, + "step": 4906 + }, + { + "epoch": 0.38, + "grad_norm": 0.6435155328731993, + "learning_rate": 3.569125958509822e-05, + "loss": 1.9922, + "step": 4907 + }, + { + "epoch": 0.38, + "grad_norm": 0.841636529822681, + "learning_rate": 3.5685612578944386e-05, + "loss": 1.9607, + "step": 4908 + }, + { + "epoch": 0.38, + "grad_norm": 0.579221368705233, + "learning_rate": 3.5679964905643845e-05, + "loss": 1.962, + "step": 4909 + }, + { + "epoch": 0.38, + "grad_norm": 1.0878884409838723, + "learning_rate": 3.567431656554923e-05, + "loss": 2.1543, + "step": 4910 + }, + { + "epoch": 0.38, + "grad_norm": 0.6524587137712553, + "learning_rate": 3.566866755901318e-05, + "loss": 2.0048, + "step": 4911 + }, + { + "epoch": 0.38, + "grad_norm": 0.8985314317031181, + "learning_rate": 3.566301788638838e-05, + "loss": 1.9171, + "step": 4912 + }, + { + "epoch": 0.38, + "grad_norm": 0.7219355575710548, + "learning_rate": 3.565736754802758e-05, + "loss": 1.959, + "step": 4913 + }, + { + "epoch": 0.38, + "grad_norm": 0.9435077329563867, + "learning_rate": 3.565171654428353e-05, + "loss": 2.141, + "step": 4914 + }, + { + "epoch": 0.38, + "grad_norm": 0.8014999943373552, + "learning_rate": 3.564606487550907e-05, + "loss": 1.9465, + "step": 4915 + }, + { + "epoch": 0.38, + "grad_norm": 0.8448964834738598, + "learning_rate": 3.564041254205703e-05, + "loss": 1.8788, + "step": 4916 + }, + { + "epoch": 0.38, + "grad_norm": 0.7853180201926094, + "learning_rate": 3.563475954428034e-05, + "loss": 2.0332, + "step": 4917 + }, + { + "epoch": 0.38, + "grad_norm": 0.8271357735683744, + "learning_rate": 3.562910588253191e-05, + "loss": 2.1353, + "step": 4918 + }, + { + "epoch": 0.38, + "grad_norm": 0.7044526796355662, + "learning_rate": 3.562345155716474e-05, + "loss": 1.8886, + "step": 4919 + }, + { + "epoch": 0.38, + "grad_norm": 0.7620216103355353, + "learning_rate": 3.561779656853184e-05, + "loss": 1.9581, + "step": 4920 + }, + { + "epoch": 0.38, + "grad_norm": 0.7557933361188971, + "learning_rate": 3.5612140916986295e-05, + "loss": 1.9606, + "step": 4921 + }, + { + "epoch": 0.38, + "grad_norm": 0.7488438260002206, + "learning_rate": 3.5606484602881184e-05, + "loss": 2.1036, + "step": 4922 + }, + { + "epoch": 0.38, + "grad_norm": 0.8269938362513124, + "learning_rate": 3.560082762656967e-05, + "loss": 2.0296, + "step": 4923 + }, + { + "epoch": 0.38, + "grad_norm": 0.6384114549163041, + "learning_rate": 3.5595169988404945e-05, + "loss": 1.9036, + "step": 4924 + }, + { + "epoch": 0.38, + "grad_norm": 0.796767598397149, + "learning_rate": 3.558951168874022e-05, + "loss": 1.9511, + "step": 4925 + }, + { + "epoch": 0.38, + "grad_norm": 0.6574925523142856, + "learning_rate": 3.5583852727928784e-05, + "loss": 2.1584, + "step": 4926 + }, + { + "epoch": 0.38, + "grad_norm": 0.7779394647116428, + "learning_rate": 3.5578193106323945e-05, + "loss": 1.8908, + "step": 4927 + }, + { + "epoch": 0.38, + "grad_norm": 0.7288697736838304, + "learning_rate": 3.557253282427905e-05, + "loss": 1.9244, + "step": 4928 + }, + { + "epoch": 0.38, + "grad_norm": 0.8030971869065199, + "learning_rate": 3.55668718821475e-05, + "loss": 1.9508, + "step": 4929 + }, + { + "epoch": 0.38, + "grad_norm": 0.5713921305334442, + "learning_rate": 3.556121028028272e-05, + "loss": 2.1512, + "step": 4930 + }, + { + "epoch": 0.38, + "grad_norm": 0.8450327594662159, + "learning_rate": 3.555554801903821e-05, + "loss": 1.9129, + "step": 4931 + }, + { + "epoch": 0.38, + "grad_norm": 0.6902317094232462, + "learning_rate": 3.5549885098767466e-05, + "loss": 1.9306, + "step": 4932 + }, + { + "epoch": 0.38, + "grad_norm": 0.767841052157392, + "learning_rate": 3.5544221519824054e-05, + "loss": 1.9266, + "step": 4933 + }, + { + "epoch": 0.38, + "grad_norm": 0.8576160523484292, + "learning_rate": 3.553855728256158e-05, + "loss": 2.1496, + "step": 4934 + }, + { + "epoch": 0.38, + "grad_norm": 0.7284099207394579, + "learning_rate": 3.5532892387333684e-05, + "loss": 1.9734, + "step": 4935 + }, + { + "epoch": 0.38, + "grad_norm": 0.8124906419782457, + "learning_rate": 3.552722683449404e-05, + "loss": 1.9928, + "step": 4936 + }, + { + "epoch": 0.38, + "grad_norm": 0.6548622394625544, + "learning_rate": 3.5521560624396374e-05, + "loss": 1.9522, + "step": 4937 + }, + { + "epoch": 0.38, + "grad_norm": 0.7241905203745482, + "learning_rate": 3.551589375739446e-05, + "loss": 2.1221, + "step": 4938 + }, + { + "epoch": 0.38, + "grad_norm": 0.7048180412665803, + "learning_rate": 3.55102262338421e-05, + "loss": 1.8886, + "step": 4939 + }, + { + "epoch": 0.38, + "grad_norm": 0.72704645228362, + "learning_rate": 3.550455805409314e-05, + "loss": 1.9417, + "step": 4940 + }, + { + "epoch": 0.38, + "grad_norm": 0.739575565672194, + "learning_rate": 3.549888921850146e-05, + "loss": 1.9104, + "step": 4941 + }, + { + "epoch": 0.38, + "grad_norm": 0.8694837718308387, + "learning_rate": 3.5493219727421005e-05, + "loss": 2.0068, + "step": 4942 + }, + { + "epoch": 0.38, + "grad_norm": 0.6557171092454571, + "learning_rate": 3.548754958120573e-05, + "loss": 2.1225, + "step": 4943 + }, + { + "epoch": 0.38, + "grad_norm": 0.7203125294237179, + "learning_rate": 3.548187878020965e-05, + "loss": 1.9071, + "step": 4944 + }, + { + "epoch": 0.38, + "grad_norm": 0.733140397295901, + "learning_rate": 3.547620732478681e-05, + "loss": 1.9407, + "step": 4945 + }, + { + "epoch": 0.38, + "grad_norm": 0.7584475131848014, + "learning_rate": 3.547053521529132e-05, + "loss": 2.1462, + "step": 4946 + }, + { + "epoch": 0.38, + "grad_norm": 0.6869873449389399, + "learning_rate": 3.54648624520773e-05, + "loss": 1.8969, + "step": 4947 + }, + { + "epoch": 0.38, + "grad_norm": 0.7851867765378411, + "learning_rate": 3.545918903549893e-05, + "loss": 1.9889, + "step": 4948 + }, + { + "epoch": 0.38, + "grad_norm": 0.6615166145106793, + "learning_rate": 3.545351496591042e-05, + "loss": 1.9647, + "step": 4949 + }, + { + "epoch": 0.38, + "grad_norm": 0.9990315158721332, + "learning_rate": 3.544784024366602e-05, + "loss": 2.1321, + "step": 4950 + }, + { + "epoch": 0.38, + "grad_norm": 0.6313974984637188, + "learning_rate": 3.5442164869120035e-05, + "loss": 1.9503, + "step": 4951 + }, + { + "epoch": 0.38, + "grad_norm": 0.9676895609822459, + "learning_rate": 3.5436488842626796e-05, + "loss": 1.9408, + "step": 4952 + }, + { + "epoch": 0.38, + "grad_norm": 0.78490869894415, + "learning_rate": 3.5430812164540696e-05, + "loss": 1.9296, + "step": 4953 + }, + { + "epoch": 0.38, + "grad_norm": 0.7165856847143085, + "learning_rate": 3.542513483521613e-05, + "loss": 2.0675, + "step": 4954 + }, + { + "epoch": 0.38, + "grad_norm": 0.8895762981929953, + "learning_rate": 3.541945685500758e-05, + "loss": 2.1712, + "step": 4955 + }, + { + "epoch": 0.38, + "grad_norm": 0.7316392203747876, + "learning_rate": 3.5413778224269526e-05, + "loss": 1.9718, + "step": 4956 + }, + { + "epoch": 0.38, + "grad_norm": 0.672448181299301, + "learning_rate": 3.540809894335652e-05, + "loss": 1.9351, + "step": 4957 + }, + { + "epoch": 0.38, + "grad_norm": 0.9657108158480306, + "learning_rate": 3.540241901262314e-05, + "loss": 2.2049, + "step": 4958 + }, + { + "epoch": 0.38, + "grad_norm": 0.7977634046555407, + "learning_rate": 3.5396738432424004e-05, + "loss": 2.0132, + "step": 4959 + }, + { + "epoch": 0.38, + "grad_norm": 0.6908695068692342, + "learning_rate": 3.5391057203113774e-05, + "loss": 1.9712, + "step": 4960 + }, + { + "epoch": 0.38, + "grad_norm": 0.7776545020907336, + "learning_rate": 3.5385375325047166e-05, + "loss": 1.9858, + "step": 4961 + }, + { + "epoch": 0.38, + "grad_norm": 0.6761425136956788, + "learning_rate": 3.53796927985789e-05, + "loss": 2.1517, + "step": 4962 + }, + { + "epoch": 0.38, + "grad_norm": 0.6420998580007115, + "learning_rate": 3.5374009624063784e-05, + "loss": 1.9702, + "step": 4963 + }, + { + "epoch": 0.38, + "grad_norm": 0.6368251509227237, + "learning_rate": 3.5368325801856614e-05, + "loss": 1.9241, + "step": 4964 + }, + { + "epoch": 0.38, + "grad_norm": 0.6608187805130326, + "learning_rate": 3.536264133231229e-05, + "loss": 1.9057, + "step": 4965 + }, + { + "epoch": 0.38, + "grad_norm": 0.6747763875665331, + "learning_rate": 3.5356956215785675e-05, + "loss": 2.1867, + "step": 4966 + }, + { + "epoch": 0.38, + "grad_norm": 0.5750928136821136, + "learning_rate": 3.535127045263176e-05, + "loss": 2.0105, + "step": 4967 + }, + { + "epoch": 0.38, + "grad_norm": 0.6220107590320654, + "learning_rate": 3.534558404320549e-05, + "loss": 1.9061, + "step": 4968 + }, + { + "epoch": 0.38, + "grad_norm": 0.6439262243647773, + "learning_rate": 3.5339896987861905e-05, + "loss": 1.991, + "step": 4969 + }, + { + "epoch": 0.38, + "grad_norm": 0.6117646048809102, + "learning_rate": 3.5334209286956076e-05, + "loss": 2.1053, + "step": 4970 + }, + { + "epoch": 0.38, + "grad_norm": 0.5962279058014149, + "learning_rate": 3.532852094084312e-05, + "loss": 1.9544, + "step": 4971 + }, + { + "epoch": 0.38, + "grad_norm": 0.541364344839125, + "learning_rate": 3.532283194987816e-05, + "loss": 1.9493, + "step": 4972 + }, + { + "epoch": 0.38, + "grad_norm": 0.5976789882406672, + "learning_rate": 3.531714231441639e-05, + "loss": 2.0178, + "step": 4973 + }, + { + "epoch": 0.38, + "grad_norm": 0.6112561014419314, + "learning_rate": 3.531145203481305e-05, + "loss": 1.9598, + "step": 4974 + }, + { + "epoch": 0.38, + "grad_norm": 0.5996308575886273, + "learning_rate": 3.53057611114234e-05, + "loss": 2.1614, + "step": 4975 + }, + { + "epoch": 0.38, + "grad_norm": 0.6022651087823346, + "learning_rate": 3.530006954460274e-05, + "loss": 1.929, + "step": 4976 + }, + { + "epoch": 0.38, + "grad_norm": 0.6955935245841902, + "learning_rate": 3.529437733470643e-05, + "loss": 1.9489, + "step": 4977 + }, + { + "epoch": 0.38, + "grad_norm": 0.6997916345993581, + "learning_rate": 3.528868448208985e-05, + "loss": 2.1126, + "step": 4978 + }, + { + "epoch": 0.38, + "grad_norm": 0.6566727541303923, + "learning_rate": 3.5282990987108426e-05, + "loss": 1.9735, + "step": 4979 + }, + { + "epoch": 0.38, + "grad_norm": 0.6589771113389836, + "learning_rate": 3.5277296850117635e-05, + "loss": 1.9152, + "step": 4980 + }, + { + "epoch": 0.38, + "grad_norm": 0.8259057297426624, + "learning_rate": 3.527160207147299e-05, + "loss": 1.9651, + "step": 4981 + }, + { + "epoch": 0.38, + "grad_norm": 0.6598443558399713, + "learning_rate": 3.526590665153002e-05, + "loss": 2.1058, + "step": 4982 + }, + { + "epoch": 0.38, + "grad_norm": 0.8279044215666387, + "learning_rate": 3.5260210590644327e-05, + "loss": 1.9355, + "step": 4983 + }, + { + "epoch": 0.38, + "grad_norm": 0.7152544510800812, + "learning_rate": 3.525451388917154e-05, + "loss": 1.9371, + "step": 4984 + }, + { + "epoch": 0.38, + "grad_norm": 0.7716289501710197, + "learning_rate": 3.5248816547467315e-05, + "loss": 2.011, + "step": 4985 + }, + { + "epoch": 0.38, + "grad_norm": 0.7228448912774911, + "learning_rate": 3.524311856588738e-05, + "loss": 1.9735, + "step": 4986 + }, + { + "epoch": 0.38, + "grad_norm": 0.7886004156693251, + "learning_rate": 3.523741994478747e-05, + "loss": 2.1254, + "step": 4987 + }, + { + "epoch": 0.38, + "grad_norm": 0.6564041856943515, + "learning_rate": 3.523172068452337e-05, + "loss": 1.9346, + "step": 4988 + }, + { + "epoch": 0.38, + "grad_norm": 0.7224110735626381, + "learning_rate": 3.5226020785450934e-05, + "loss": 1.9472, + "step": 4989 + }, + { + "epoch": 0.38, + "grad_norm": 0.7593298512664858, + "learning_rate": 3.522032024792599e-05, + "loss": 2.1758, + "step": 4990 + }, + { + "epoch": 0.39, + "grad_norm": 0.5950790309987041, + "learning_rate": 3.521461907230447e-05, + "loss": 1.9615, + "step": 4991 + }, + { + "epoch": 0.39, + "grad_norm": 0.6328394128244219, + "learning_rate": 3.520891725894233e-05, + "loss": 2.048, + "step": 4992 + }, + { + "epoch": 0.39, + "grad_norm": 0.6572697820220891, + "learning_rate": 3.520321480819554e-05, + "loss": 1.9425, + "step": 4993 + }, + { + "epoch": 0.39, + "grad_norm": 0.6348769675057717, + "learning_rate": 3.519751172042013e-05, + "loss": 1.8967, + "step": 4994 + }, + { + "epoch": 0.39, + "grad_norm": 0.7013175129305821, + "learning_rate": 3.519180799597217e-05, + "loss": 2.1684, + "step": 4995 + }, + { + "epoch": 0.39, + "grad_norm": 0.6169724105799803, + "learning_rate": 3.518610363520778e-05, + "loss": 1.951, + "step": 4996 + }, + { + "epoch": 0.39, + "grad_norm": 0.6601227095969193, + "learning_rate": 3.518039863848309e-05, + "loss": 1.9641, + "step": 4997 + }, + { + "epoch": 0.39, + "grad_norm": 0.7020753131930161, + "learning_rate": 3.517469300615428e-05, + "loss": 2.0315, + "step": 4998 + }, + { + "epoch": 0.39, + "grad_norm": 0.7163562740692608, + "learning_rate": 3.51689867385776e-05, + "loss": 2.1527, + "step": 4999 + }, + { + "epoch": 0.39, + "grad_norm": 0.7087977709407456, + "learning_rate": 3.516327983610929e-05, + "loss": 1.9428, + "step": 5000 + }, + { + "epoch": 0.39, + "grad_norm": 0.6083215029152462, + "learning_rate": 3.515757229910568e-05, + "loss": 1.9484, + "step": 5001 + }, + { + "epoch": 0.39, + "grad_norm": 0.9528040944593796, + "learning_rate": 3.5151864127923096e-05, + "loss": 2.1106, + "step": 5002 + }, + { + "epoch": 0.39, + "grad_norm": 0.5690130556597205, + "learning_rate": 3.514615532291793e-05, + "loss": 1.9163, + "step": 5003 + }, + { + "epoch": 0.39, + "grad_norm": 0.9381438920999967, + "learning_rate": 3.5140445884446606e-05, + "loss": 2.0292, + "step": 5004 + }, + { + "epoch": 0.39, + "grad_norm": 0.616909352266048, + "learning_rate": 3.5134735812865586e-05, + "loss": 1.9625, + "step": 5005 + }, + { + "epoch": 0.39, + "grad_norm": 0.8110078976931545, + "learning_rate": 3.512902510853138e-05, + "loss": 1.9324, + "step": 5006 + }, + { + "epoch": 0.39, + "grad_norm": 0.6071756179973448, + "learning_rate": 3.512331377180052e-05, + "loss": 2.1176, + "step": 5007 + }, + { + "epoch": 0.39, + "grad_norm": 0.7455602813805682, + "learning_rate": 3.51176018030296e-05, + "loss": 1.949, + "step": 5008 + }, + { + "epoch": 0.39, + "grad_norm": 0.6383701987621012, + "learning_rate": 3.511188920257523e-05, + "loss": 1.8885, + "step": 5009 + }, + { + "epoch": 0.39, + "grad_norm": 0.6369806590233833, + "learning_rate": 3.5106175970794074e-05, + "loss": 1.9903, + "step": 5010 + }, + { + "epoch": 0.39, + "grad_norm": 0.669527485607258, + "learning_rate": 3.5100462108042844e-05, + "loss": 2.0959, + "step": 5011 + }, + { + "epoch": 0.39, + "grad_norm": 0.6052509258200691, + "learning_rate": 3.509474761467826e-05, + "loss": 1.9715, + "step": 5012 + }, + { + "epoch": 0.39, + "grad_norm": 0.6352368860994843, + "learning_rate": 3.508903249105712e-05, + "loss": 1.9829, + "step": 5013 + }, + { + "epoch": 0.39, + "grad_norm": 0.648413771381859, + "learning_rate": 3.508331673753624e-05, + "loss": 2.1369, + "step": 5014 + }, + { + "epoch": 0.39, + "grad_norm": 0.6130999575956402, + "learning_rate": 3.507760035447246e-05, + "loss": 1.9496, + "step": 5015 + }, + { + "epoch": 0.39, + "grad_norm": 0.5758889217406541, + "learning_rate": 3.5071883342222695e-05, + "loss": 2.0148, + "step": 5016 + }, + { + "epoch": 0.39, + "grad_norm": 0.5820698654532159, + "learning_rate": 3.506616570114387e-05, + "loss": 1.9013, + "step": 5017 + }, + { + "epoch": 0.39, + "grad_norm": 0.6273154194628972, + "learning_rate": 3.506044743159298e-05, + "loss": 1.927, + "step": 5018 + }, + { + "epoch": 0.39, + "grad_norm": 0.5980274872029098, + "learning_rate": 3.505472853392703e-05, + "loss": 2.1856, + "step": 5019 + }, + { + "epoch": 0.39, + "grad_norm": 0.6521227812636816, + "learning_rate": 3.504900900850305e-05, + "loss": 1.9512, + "step": 5020 + }, + { + "epoch": 0.39, + "grad_norm": 0.6477494473361861, + "learning_rate": 3.504328885567818e-05, + "loss": 1.9951, + "step": 5021 + }, + { + "epoch": 0.39, + "grad_norm": 0.6135790221560959, + "learning_rate": 3.5037568075809514e-05, + "loss": 2.1131, + "step": 5022 + }, + { + "epoch": 0.39, + "grad_norm": 0.7305116539084799, + "learning_rate": 3.503184666925424e-05, + "loss": 2.0182, + "step": 5023 + }, + { + "epoch": 0.39, + "grad_norm": 0.6656652082082719, + "learning_rate": 3.5026124636369564e-05, + "loss": 1.9354, + "step": 5024 + }, + { + "epoch": 0.39, + "grad_norm": 0.9870328842349357, + "learning_rate": 3.5020401977512745e-05, + "loss": 1.9769, + "step": 5025 + }, + { + "epoch": 0.39, + "grad_norm": 0.6597119962730804, + "learning_rate": 3.5014678693041065e-05, + "loss": 1.9542, + "step": 5026 + }, + { + "epoch": 0.39, + "grad_norm": 0.8940749741245584, + "learning_rate": 3.500895478331185e-05, + "loss": 2.2081, + "step": 5027 + }, + { + "epoch": 0.39, + "grad_norm": 0.6404446824598085, + "learning_rate": 3.500323024868247e-05, + "loss": 1.9265, + "step": 5028 + }, + { + "epoch": 0.39, + "grad_norm": 0.811943312396918, + "learning_rate": 3.499750508951033e-05, + "loss": 1.994, + "step": 5029 + }, + { + "epoch": 0.39, + "grad_norm": 0.5744118806779627, + "learning_rate": 3.499177930615288e-05, + "loss": 1.951, + "step": 5030 + }, + { + "epoch": 0.39, + "grad_norm": 0.8107830469888612, + "learning_rate": 3.4986052898967596e-05, + "loss": 2.1361, + "step": 5031 + }, + { + "epoch": 0.39, + "grad_norm": 0.6383419898317009, + "learning_rate": 3.498032586831202e-05, + "loss": 1.9297, + "step": 5032 + }, + { + "epoch": 0.39, + "grad_norm": 0.5795718250406815, + "learning_rate": 3.497459821454368e-05, + "loss": 1.9623, + "step": 5033 + }, + { + "epoch": 0.39, + "grad_norm": 0.8100169774083417, + "learning_rate": 3.49688699380202e-05, + "loss": 2.1598, + "step": 5034 + }, + { + "epoch": 0.39, + "grad_norm": 0.638604078315698, + "learning_rate": 3.496314103909922e-05, + "loss": 2.0557, + "step": 5035 + }, + { + "epoch": 0.39, + "grad_norm": 0.5789338985855809, + "learning_rate": 3.4957411518138424e-05, + "loss": 1.933, + "step": 5036 + }, + { + "epoch": 0.39, + "grad_norm": 0.7036222176596998, + "learning_rate": 3.495168137549551e-05, + "loss": 1.9634, + "step": 5037 + }, + { + "epoch": 0.39, + "grad_norm": 0.6122702391052178, + "learning_rate": 3.494595061152825e-05, + "loss": 1.9126, + "step": 5038 + }, + { + "epoch": 0.39, + "grad_norm": 0.6107893314926427, + "learning_rate": 3.494021922659444e-05, + "loss": 2.0844, + "step": 5039 + }, + { + "epoch": 0.39, + "grad_norm": 0.7109800160386094, + "learning_rate": 3.4934487221051896e-05, + "loss": 1.9057, + "step": 5040 + }, + { + "epoch": 0.39, + "grad_norm": 0.632276211162601, + "learning_rate": 3.4928754595258507e-05, + "loss": 2.0348, + "step": 5041 + }, + { + "epoch": 0.39, + "grad_norm": 0.684807517385371, + "learning_rate": 3.492302134957218e-05, + "loss": 1.9132, + "step": 5042 + }, + { + "epoch": 0.39, + "grad_norm": 0.8238146461518109, + "learning_rate": 3.491728748435087e-05, + "loss": 2.1217, + "step": 5043 + }, + { + "epoch": 0.39, + "grad_norm": 0.704284460585149, + "learning_rate": 3.491155299995256e-05, + "loss": 1.9214, + "step": 5044 + }, + { + "epoch": 0.39, + "grad_norm": 0.6484063574546234, + "learning_rate": 3.490581789673527e-05, + "loss": 1.8958, + "step": 5045 + }, + { + "epoch": 0.39, + "grad_norm": 0.6476115655836854, + "learning_rate": 3.4900082175057084e-05, + "loss": 2.1198, + "step": 5046 + }, + { + "epoch": 0.39, + "grad_norm": 0.588357662251807, + "learning_rate": 3.48943458352761e-05, + "loss": 1.9271, + "step": 5047 + }, + { + "epoch": 0.39, + "grad_norm": 0.6138398041989258, + "learning_rate": 3.488860887775045e-05, + "loss": 1.9617, + "step": 5048 + }, + { + "epoch": 0.39, + "grad_norm": 0.7224529356434634, + "learning_rate": 3.4882871302838326e-05, + "loss": 1.9337, + "step": 5049 + }, + { + "epoch": 0.39, + "grad_norm": 0.6085381517050298, + "learning_rate": 3.4877133110897953e-05, + "loss": 1.9781, + "step": 5050 + }, + { + "epoch": 0.39, + "grad_norm": 0.689891322227274, + "learning_rate": 3.4871394302287576e-05, + "loss": 2.1299, + "step": 5051 + }, + { + "epoch": 0.39, + "grad_norm": 0.5673033528489645, + "learning_rate": 3.4865654877365503e-05, + "loss": 1.9211, + "step": 5052 + }, + { + "epoch": 0.39, + "grad_norm": 0.7063793365332953, + "learning_rate": 3.4859914836490063e-05, + "loss": 1.9259, + "step": 5053 + }, + { + "epoch": 0.39, + "grad_norm": 0.5602466605579784, + "learning_rate": 3.485417418001964e-05, + "loss": 1.9988, + "step": 5054 + }, + { + "epoch": 0.39, + "grad_norm": 0.6672311114395776, + "learning_rate": 3.484843290831264e-05, + "loss": 2.0885, + "step": 5055 + }, + { + "epoch": 0.39, + "grad_norm": 0.6463298415058347, + "learning_rate": 3.484269102172751e-05, + "loss": 1.9214, + "step": 5056 + }, + { + "epoch": 0.39, + "grad_norm": 0.6085751309892113, + "learning_rate": 3.483694852062276e-05, + "loss": 1.9387, + "step": 5057 + }, + { + "epoch": 0.39, + "grad_norm": 0.6513838993862322, + "learning_rate": 3.483120540535688e-05, + "loss": 1.9219, + "step": 5058 + }, + { + "epoch": 0.39, + "grad_norm": 0.5525022947518117, + "learning_rate": 3.482546167628847e-05, + "loss": 2.1275, + "step": 5059 + }, + { + "epoch": 0.39, + "grad_norm": 0.6651188752857388, + "learning_rate": 3.4819717333776126e-05, + "loss": 2.0334, + "step": 5060 + }, + { + "epoch": 0.39, + "grad_norm": 0.6299755797869692, + "learning_rate": 3.481397237817849e-05, + "loss": 1.9483, + "step": 5061 + }, + { + "epoch": 0.39, + "grad_norm": 0.6649929131221995, + "learning_rate": 3.480822680985424e-05, + "loss": 1.9433, + "step": 5062 + }, + { + "epoch": 0.39, + "grad_norm": 0.6515585989302406, + "learning_rate": 3.480248062916209e-05, + "loss": 2.117, + "step": 5063 + }, + { + "epoch": 0.39, + "grad_norm": 0.6126332814546808, + "learning_rate": 3.479673383646081e-05, + "loss": 1.9619, + "step": 5064 + }, + { + "epoch": 0.39, + "grad_norm": 0.6522764331628698, + "learning_rate": 3.47909864321092e-05, + "loss": 1.9519, + "step": 5065 + }, + { + "epoch": 0.39, + "grad_norm": 0.572194786763794, + "learning_rate": 3.4785238416466074e-05, + "loss": 2.0115, + "step": 5066 + }, + { + "epoch": 0.39, + "grad_norm": 0.5890212179732006, + "learning_rate": 3.477948978989031e-05, + "loss": 2.1467, + "step": 5067 + }, + { + "epoch": 0.39, + "grad_norm": 0.6135908098063265, + "learning_rate": 3.477374055274083e-05, + "loss": 1.9341, + "step": 5068 + }, + { + "epoch": 0.39, + "grad_norm": 0.5592208126466649, + "learning_rate": 3.4767990705376576e-05, + "loss": 1.9307, + "step": 5069 + }, + { + "epoch": 0.39, + "grad_norm": 0.605346290090158, + "learning_rate": 3.476224024815653e-05, + "loss": 1.9213, + "step": 5070 + }, + { + "epoch": 0.39, + "grad_norm": 0.6277069168830549, + "learning_rate": 3.475648918143973e-05, + "loss": 2.1826, + "step": 5071 + }, + { + "epoch": 0.39, + "grad_norm": 0.6185838947817488, + "learning_rate": 3.475073750558522e-05, + "loss": 2.0301, + "step": 5072 + }, + { + "epoch": 0.39, + "grad_norm": 0.598390697223414, + "learning_rate": 3.4744985220952115e-05, + "loss": 1.9124, + "step": 5073 + }, + { + "epoch": 0.39, + "grad_norm": 0.5998682517890915, + "learning_rate": 3.4739232327899554e-05, + "loss": 1.9641, + "step": 5074 + }, + { + "epoch": 0.39, + "grad_norm": 0.6185740550141803, + "learning_rate": 3.47334788267867e-05, + "loss": 2.1101, + "step": 5075 + }, + { + "epoch": 0.39, + "grad_norm": 0.6503722389680315, + "learning_rate": 3.4727724717972775e-05, + "loss": 1.9473, + "step": 5076 + }, + { + "epoch": 0.39, + "grad_norm": 0.6252433398013737, + "learning_rate": 3.4721970001817043e-05, + "loss": 1.8973, + "step": 5077 + }, + { + "epoch": 0.39, + "grad_norm": 0.8114455057663799, + "learning_rate": 3.471621467867878e-05, + "loss": 1.897, + "step": 5078 + }, + { + "epoch": 0.39, + "grad_norm": 0.6976009962830167, + "learning_rate": 3.471045874891732e-05, + "loss": 2.1843, + "step": 5079 + }, + { + "epoch": 0.39, + "grad_norm": 0.6506971509349028, + "learning_rate": 3.4704702212892026e-05, + "loss": 1.9501, + "step": 5080 + }, + { + "epoch": 0.39, + "grad_norm": 0.735880400545291, + "learning_rate": 3.469894507096231e-05, + "loss": 1.9218, + "step": 5081 + }, + { + "epoch": 0.39, + "grad_norm": 0.6150772405365771, + "learning_rate": 3.469318732348761e-05, + "loss": 1.9252, + "step": 5082 + }, + { + "epoch": 0.39, + "grad_norm": 0.7313290514504924, + "learning_rate": 3.468742897082741e-05, + "loss": 2.1071, + "step": 5083 + }, + { + "epoch": 0.39, + "grad_norm": 0.6328257819271537, + "learning_rate": 3.4681670013341204e-05, + "loss": 1.9276, + "step": 5084 + }, + { + "epoch": 0.39, + "grad_norm": 0.650721268479889, + "learning_rate": 3.467591045138857e-05, + "loss": 1.9722, + "step": 5085 + }, + { + "epoch": 0.39, + "grad_norm": 0.6296090873765905, + "learning_rate": 3.467015028532911e-05, + "loss": 1.9098, + "step": 5086 + }, + { + "epoch": 0.39, + "grad_norm": 0.6848619633990509, + "learning_rate": 3.4664389515522435e-05, + "loss": 2.1327, + "step": 5087 + }, + { + "epoch": 0.39, + "grad_norm": 0.6510473511446323, + "learning_rate": 3.465862814232822e-05, + "loss": 1.9506, + "step": 5088 + }, + { + "epoch": 0.39, + "grad_norm": 0.7600050779940416, + "learning_rate": 3.465286616610617e-05, + "loss": 1.928, + "step": 5089 + }, + { + "epoch": 0.39, + "grad_norm": 0.6204448387553597, + "learning_rate": 3.464710358721604e-05, + "loss": 1.9436, + "step": 5090 + }, + { + "epoch": 0.39, + "grad_norm": 0.7748582401322675, + "learning_rate": 3.4641340406017585e-05, + "loss": 2.1407, + "step": 5091 + }, + { + "epoch": 0.39, + "grad_norm": 0.6826420282782385, + "learning_rate": 3.463557662287065e-05, + "loss": 1.9402, + "step": 5092 + }, + { + "epoch": 0.39, + "grad_norm": 0.6613574219917621, + "learning_rate": 3.4629812238135085e-05, + "loss": 1.9415, + "step": 5093 + }, + { + "epoch": 0.39, + "grad_norm": 0.6959311685514882, + "learning_rate": 3.4624047252170776e-05, + "loss": 1.933, + "step": 5094 + }, + { + "epoch": 0.39, + "grad_norm": 0.678041244402601, + "learning_rate": 3.461828166533766e-05, + "loss": 2.1241, + "step": 5095 + }, + { + "epoch": 0.39, + "grad_norm": 0.5685807298551447, + "learning_rate": 3.461251547799571e-05, + "loss": 1.9364, + "step": 5096 + }, + { + "epoch": 0.39, + "grad_norm": 0.6884281035315439, + "learning_rate": 3.460674869050493e-05, + "loss": 1.9722, + "step": 5097 + }, + { + "epoch": 0.39, + "grad_norm": 0.6516877836476774, + "learning_rate": 3.460098130322536e-05, + "loss": 1.9021, + "step": 5098 + }, + { + "epoch": 0.39, + "grad_norm": 0.6246031735400638, + "learning_rate": 3.459521331651709e-05, + "loss": 2.1431, + "step": 5099 + }, + { + "epoch": 0.39, + "grad_norm": 0.6126417966221773, + "learning_rate": 3.458944473074023e-05, + "loss": 1.922, + "step": 5100 + }, + { + "epoch": 0.39, + "grad_norm": 0.6750844793239923, + "learning_rate": 3.4583675546254947e-05, + "loss": 1.9017, + "step": 5101 + }, + { + "epoch": 0.39, + "grad_norm": 0.6670977954542251, + "learning_rate": 3.4577905763421424e-05, + "loss": 1.9748, + "step": 5102 + }, + { + "epoch": 0.39, + "grad_norm": 0.7198755473431313, + "learning_rate": 3.457213538259989e-05, + "loss": 2.1711, + "step": 5103 + }, + { + "epoch": 0.39, + "grad_norm": 0.5750742742821333, + "learning_rate": 3.456636440415064e-05, + "loss": 1.931, + "step": 5104 + }, + { + "epoch": 0.39, + "grad_norm": 0.7123374391740395, + "learning_rate": 3.456059282843395e-05, + "loss": 1.9418, + "step": 5105 + }, + { + "epoch": 0.39, + "grad_norm": 0.7983166650273611, + "learning_rate": 3.455482065581017e-05, + "loss": 1.9267, + "step": 5106 + }, + { + "epoch": 0.39, + "grad_norm": 0.763056605363453, + "learning_rate": 3.4549047886639685e-05, + "loss": 2.1487, + "step": 5107 + }, + { + "epoch": 0.39, + "grad_norm": 0.7288017896855449, + "learning_rate": 3.454327452128292e-05, + "loss": 1.893, + "step": 5108 + }, + { + "epoch": 0.39, + "grad_norm": 0.6259020509081491, + "learning_rate": 3.453750056010031e-05, + "loss": 1.9561, + "step": 5109 + }, + { + "epoch": 0.39, + "grad_norm": 0.6858846928339414, + "learning_rate": 3.453172600345237e-05, + "loss": 2.0127, + "step": 5110 + }, + { + "epoch": 0.39, + "grad_norm": 0.7253630144634484, + "learning_rate": 3.4525950851699615e-05, + "loss": 2.1008, + "step": 5111 + }, + { + "epoch": 0.39, + "grad_norm": 0.6431262069070509, + "learning_rate": 3.452017510520262e-05, + "loss": 1.9834, + "step": 5112 + }, + { + "epoch": 0.39, + "grad_norm": 0.6694494870553823, + "learning_rate": 3.451439876432197e-05, + "loss": 1.9657, + "step": 5113 + }, + { + "epoch": 0.39, + "grad_norm": 0.6672835466280991, + "learning_rate": 3.450862182941832e-05, + "loss": 1.9153, + "step": 5114 + }, + { + "epoch": 0.39, + "grad_norm": 0.7988169812524438, + "learning_rate": 3.4502844300852365e-05, + "loss": 2.1356, + "step": 5115 + }, + { + "epoch": 0.39, + "grad_norm": 0.5528837048793839, + "learning_rate": 3.449706617898479e-05, + "loss": 2.01, + "step": 5116 + }, + { + "epoch": 0.39, + "grad_norm": 0.8164440584757596, + "learning_rate": 3.4491287464176356e-05, + "loss": 1.9029, + "step": 5117 + }, + { + "epoch": 0.39, + "grad_norm": 0.6419419950076354, + "learning_rate": 3.448550815678786e-05, + "loss": 1.8905, + "step": 5118 + }, + { + "epoch": 0.39, + "grad_norm": 0.7065539452675651, + "learning_rate": 3.447972825718012e-05, + "loss": 2.1337, + "step": 5119 + }, + { + "epoch": 0.4, + "grad_norm": 0.6587030263711571, + "learning_rate": 3.4473947765714e-05, + "loss": 1.9352, + "step": 5120 + }, + { + "epoch": 0.4, + "grad_norm": 0.6222426182698918, + "learning_rate": 3.4468166682750396e-05, + "loss": 1.8794, + "step": 5121 + }, + { + "epoch": 0.4, + "grad_norm": 0.598407123406919, + "learning_rate": 3.4462385008650256e-05, + "loss": 2.0528, + "step": 5122 + }, + { + "epoch": 0.4, + "grad_norm": 0.6774662075057788, + "learning_rate": 3.4456602743774546e-05, + "loss": 2.1225, + "step": 5123 + }, + { + "epoch": 0.4, + "grad_norm": 0.6068316567688256, + "learning_rate": 3.4450819888484284e-05, + "loss": 1.9259, + "step": 5124 + }, + { + "epoch": 0.4, + "grad_norm": 0.6501330345693576, + "learning_rate": 3.44450364431405e-05, + "loss": 1.9687, + "step": 5125 + }, + { + "epoch": 0.4, + "grad_norm": 0.8185496278360419, + "learning_rate": 3.443925240810429e-05, + "loss": 1.9759, + "step": 5126 + }, + { + "epoch": 0.4, + "grad_norm": 0.6778249108819614, + "learning_rate": 3.443346778373678e-05, + "loss": 2.1438, + "step": 5127 + }, + { + "epoch": 0.4, + "grad_norm": 0.5906612399808495, + "learning_rate": 3.442768257039913e-05, + "loss": 2.0352, + "step": 5128 + }, + { + "epoch": 0.4, + "grad_norm": 0.6928571371964668, + "learning_rate": 3.442189676845252e-05, + "loss": 1.9655, + "step": 5129 + }, + { + "epoch": 0.4, + "grad_norm": 0.664110489034663, + "learning_rate": 3.441611037825819e-05, + "loss": 1.9252, + "step": 5130 + }, + { + "epoch": 0.4, + "grad_norm": 0.5979560810888491, + "learning_rate": 3.4410323400177403e-05, + "loss": 2.1254, + "step": 5131 + }, + { + "epoch": 0.4, + "grad_norm": 0.6088116302105208, + "learning_rate": 3.4404535834571475e-05, + "loss": 1.937, + "step": 5132 + }, + { + "epoch": 0.4, + "grad_norm": 0.6322525961600001, + "learning_rate": 3.439874768180174e-05, + "loss": 1.9341, + "step": 5133 + }, + { + "epoch": 0.4, + "grad_norm": 0.5610481539786276, + "learning_rate": 3.439295894222957e-05, + "loss": 2.0191, + "step": 5134 + }, + { + "epoch": 0.4, + "grad_norm": 0.695969413194961, + "learning_rate": 3.438716961621638e-05, + "loss": 2.1643, + "step": 5135 + }, + { + "epoch": 0.4, + "grad_norm": 0.6649232098148886, + "learning_rate": 3.438137970412364e-05, + "loss": 1.9539, + "step": 5136 + }, + { + "epoch": 0.4, + "grad_norm": 0.776742545986537, + "learning_rate": 3.437558920631282e-05, + "loss": 1.9219, + "step": 5137 + }, + { + "epoch": 0.4, + "grad_norm": 0.7022998487518577, + "learning_rate": 3.436979812314545e-05, + "loss": 1.9381, + "step": 5138 + }, + { + "epoch": 0.4, + "grad_norm": 0.754871912085761, + "learning_rate": 3.4364006454983085e-05, + "loss": 2.163, + "step": 5139 + }, + { + "epoch": 0.4, + "grad_norm": 0.6805855494567521, + "learning_rate": 3.435821420218734e-05, + "loss": 1.9095, + "step": 5140 + }, + { + "epoch": 0.4, + "grad_norm": 0.6674972179146449, + "learning_rate": 3.435242136511984e-05, + "loss": 1.9671, + "step": 5141 + }, + { + "epoch": 0.4, + "grad_norm": 0.7204555658970736, + "learning_rate": 3.434662794414224e-05, + "loss": 1.9212, + "step": 5142 + }, + { + "epoch": 0.4, + "grad_norm": 0.6916334257099047, + "learning_rate": 3.434083393961627e-05, + "loss": 2.1897, + "step": 5143 + }, + { + "epoch": 0.4, + "grad_norm": 0.7097736671270447, + "learning_rate": 3.4335039351903655e-05, + "loss": 1.9501, + "step": 5144 + }, + { + "epoch": 0.4, + "grad_norm": 0.78387467883872, + "learning_rate": 3.43292441813662e-05, + "loss": 1.9351, + "step": 5145 + }, + { + "epoch": 0.4, + "grad_norm": 0.7159987457329327, + "learning_rate": 3.4323448428365685e-05, + "loss": 1.8985, + "step": 5146 + }, + { + "epoch": 0.4, + "grad_norm": 0.8751289202502383, + "learning_rate": 3.431765209326399e-05, + "loss": 2.189, + "step": 5147 + }, + { + "epoch": 0.4, + "grad_norm": 0.8509625811467332, + "learning_rate": 3.4311855176423e-05, + "loss": 1.8786, + "step": 5148 + }, + { + "epoch": 0.4, + "grad_norm": 0.7757564645598389, + "learning_rate": 3.430605767820464e-05, + "loss": 1.9526, + "step": 5149 + }, + { + "epoch": 0.4, + "grad_norm": 1.1200283631676975, + "learning_rate": 3.430025959897086e-05, + "loss": 1.9862, + "step": 5150 + }, + { + "epoch": 0.4, + "grad_norm": 0.895042336920376, + "learning_rate": 3.429446093908368e-05, + "loss": 2.1759, + "step": 5151 + }, + { + "epoch": 0.4, + "grad_norm": 1.1542142256196717, + "learning_rate": 3.428866169890511e-05, + "loss": 1.9336, + "step": 5152 + }, + { + "epoch": 0.4, + "grad_norm": 1.1158703541461679, + "learning_rate": 3.4282861878797235e-05, + "loss": 1.971, + "step": 5153 + }, + { + "epoch": 0.4, + "grad_norm": 0.7370615823645804, + "learning_rate": 3.4277061479122165e-05, + "loss": 1.9838, + "step": 5154 + }, + { + "epoch": 0.4, + "grad_norm": 1.22579558972302, + "learning_rate": 3.427126050024203e-05, + "loss": 2.1637, + "step": 5155 + }, + { + "epoch": 0.4, + "grad_norm": 0.6998609977686412, + "learning_rate": 3.426545894251901e-05, + "loss": 1.9077, + "step": 5156 + }, + { + "epoch": 0.4, + "grad_norm": 1.0131026859083376, + "learning_rate": 3.425965680631533e-05, + "loss": 1.9276, + "step": 5157 + }, + { + "epoch": 0.4, + "grad_norm": 0.7119424614406688, + "learning_rate": 3.4253854091993235e-05, + "loss": 1.9032, + "step": 5158 + }, + { + "epoch": 0.4, + "grad_norm": 0.776952531289085, + "learning_rate": 3.424805079991501e-05, + "loss": 2.013, + "step": 5159 + }, + { + "epoch": 0.4, + "grad_norm": 0.7172813218013243, + "learning_rate": 3.424224693044298e-05, + "loss": 2.1131, + "step": 5160 + }, + { + "epoch": 0.4, + "grad_norm": 0.728952270225371, + "learning_rate": 3.423644248393952e-05, + "loss": 1.9089, + "step": 5161 + }, + { + "epoch": 0.4, + "grad_norm": 0.6588193576705801, + "learning_rate": 3.4230637460767e-05, + "loss": 1.9099, + "step": 5162 + }, + { + "epoch": 0.4, + "grad_norm": 0.7497192611985066, + "learning_rate": 3.4224831861287864e-05, + "loss": 2.127, + "step": 5163 + }, + { + "epoch": 0.4, + "grad_norm": 0.6744664704845145, + "learning_rate": 3.4219025685864574e-05, + "loss": 1.9533, + "step": 5164 + }, + { + "epoch": 0.4, + "grad_norm": 0.7467798192513476, + "learning_rate": 3.421321893485965e-05, + "loss": 1.9963, + "step": 5165 + }, + { + "epoch": 0.4, + "grad_norm": 0.7099871825638486, + "learning_rate": 3.420741160863561e-05, + "loss": 1.9176, + "step": 5166 + }, + { + "epoch": 0.4, + "grad_norm": 0.8651618133622729, + "learning_rate": 3.4201603707555037e-05, + "loss": 2.0902, + "step": 5167 + }, + { + "epoch": 0.4, + "grad_norm": 0.7327223380270227, + "learning_rate": 3.419579523198054e-05, + "loss": 1.976, + "step": 5168 + }, + { + "epoch": 0.4, + "grad_norm": 0.7989076192145107, + "learning_rate": 3.418998618227478e-05, + "loss": 1.945, + "step": 5169 + }, + { + "epoch": 0.4, + "grad_norm": 0.680794191298754, + "learning_rate": 3.418417655880043e-05, + "loss": 1.9292, + "step": 5170 + }, + { + "epoch": 0.4, + "grad_norm": 0.6296391274873299, + "learning_rate": 3.41783663619202e-05, + "loss": 2.1723, + "step": 5171 + }, + { + "epoch": 0.4, + "grad_norm": 0.714581826324136, + "learning_rate": 3.417255559199686e-05, + "loss": 1.995, + "step": 5172 + }, + { + "epoch": 0.4, + "grad_norm": 0.6517959202400113, + "learning_rate": 3.416674424939318e-05, + "loss": 1.9151, + "step": 5173 + }, + { + "epoch": 0.4, + "grad_norm": 0.7430243647891767, + "learning_rate": 3.416093233447201e-05, + "loss": 1.9562, + "step": 5174 + }, + { + "epoch": 0.4, + "grad_norm": 0.7116917518055694, + "learning_rate": 3.41551198475962e-05, + "loss": 2.1601, + "step": 5175 + }, + { + "epoch": 0.4, + "grad_norm": 0.8043435232940346, + "learning_rate": 3.414930678912864e-05, + "loss": 1.9281, + "step": 5176 + }, + { + "epoch": 0.4, + "grad_norm": 0.7233351272705736, + "learning_rate": 3.414349315943227e-05, + "loss": 1.9291, + "step": 5177 + }, + { + "epoch": 0.4, + "grad_norm": 0.6889890429916535, + "learning_rate": 3.4137678958870065e-05, + "loss": 2.0542, + "step": 5178 + }, + { + "epoch": 0.4, + "grad_norm": 0.7207627617406904, + "learning_rate": 3.413186418780503e-05, + "loss": 2.1352, + "step": 5179 + }, + { + "epoch": 0.4, + "grad_norm": 0.6497740913649946, + "learning_rate": 3.41260488466002e-05, + "loss": 1.9394, + "step": 5180 + }, + { + "epoch": 0.4, + "grad_norm": 0.8390346520117312, + "learning_rate": 3.412023293561864e-05, + "loss": 1.9551, + "step": 5181 + }, + { + "epoch": 0.4, + "grad_norm": 0.6449023928647714, + "learning_rate": 3.411441645522347e-05, + "loss": 1.938, + "step": 5182 + }, + { + "epoch": 0.4, + "grad_norm": 0.8190888596988902, + "learning_rate": 3.410859940577784e-05, + "loss": 2.1311, + "step": 5183 + }, + { + "epoch": 0.4, + "grad_norm": 0.6824027525943179, + "learning_rate": 3.4102781787644936e-05, + "loss": 1.9881, + "step": 5184 + }, + { + "epoch": 0.4, + "grad_norm": 0.6869264416125215, + "learning_rate": 3.4096963601187964e-05, + "loss": 1.9124, + "step": 5185 + }, + { + "epoch": 0.4, + "grad_norm": 0.7333373906617172, + "learning_rate": 3.409114484677019e-05, + "loss": 1.9365, + "step": 5186 + }, + { + "epoch": 0.4, + "grad_norm": 0.7163243037554896, + "learning_rate": 3.408532552475489e-05, + "loss": 2.1024, + "step": 5187 + }, + { + "epoch": 0.4, + "grad_norm": 0.6718368610810902, + "learning_rate": 3.407950563550539e-05, + "loss": 1.8952, + "step": 5188 + }, + { + "epoch": 0.4, + "grad_norm": 0.849432910916129, + "learning_rate": 3.4073685179385055e-05, + "loss": 1.9269, + "step": 5189 + }, + { + "epoch": 0.4, + "grad_norm": 0.625871996430383, + "learning_rate": 3.4067864156757285e-05, + "loss": 2.0522, + "step": 5190 + }, + { + "epoch": 0.4, + "grad_norm": 0.7717397180791657, + "learning_rate": 3.40620425679855e-05, + "loss": 1.9598, + "step": 5191 + }, + { + "epoch": 0.4, + "grad_norm": 0.7919723249946629, + "learning_rate": 3.405622041343317e-05, + "loss": 2.1885, + "step": 5192 + }, + { + "epoch": 0.4, + "grad_norm": 0.6797896803009621, + "learning_rate": 3.40503976934638e-05, + "loss": 1.9569, + "step": 5193 + }, + { + "epoch": 0.4, + "grad_norm": 0.8431752009887734, + "learning_rate": 3.404457440844092e-05, + "loss": 1.9272, + "step": 5194 + }, + { + "epoch": 0.4, + "grad_norm": 0.6138715832117367, + "learning_rate": 3.403875055872809e-05, + "loss": 2.141, + "step": 5195 + }, + { + "epoch": 0.4, + "grad_norm": 0.7532240803932055, + "learning_rate": 3.403292614468895e-05, + "loss": 1.9997, + "step": 5196 + }, + { + "epoch": 0.4, + "grad_norm": 0.8005897661975706, + "learning_rate": 3.402710116668711e-05, + "loss": 1.9514, + "step": 5197 + }, + { + "epoch": 0.4, + "grad_norm": 0.561932043487301, + "learning_rate": 3.402127562508628e-05, + "loss": 1.9238, + "step": 5198 + }, + { + "epoch": 0.4, + "grad_norm": 0.8603888360652088, + "learning_rate": 3.4015449520250123e-05, + "loss": 2.1168, + "step": 5199 + }, + { + "epoch": 0.4, + "grad_norm": 0.6083117502512289, + "learning_rate": 3.4009622852542436e-05, + "loss": 1.9282, + "step": 5200 + }, + { + "epoch": 0.4, + "grad_norm": 0.5974118334823821, + "learning_rate": 3.400379562232698e-05, + "loss": 1.9231, + "step": 5201 + }, + { + "epoch": 0.4, + "grad_norm": 0.8409479283076279, + "learning_rate": 3.399796782996757e-05, + "loss": 1.989, + "step": 5202 + }, + { + "epoch": 0.4, + "grad_norm": 0.6229204406977977, + "learning_rate": 3.399213947582805e-05, + "loss": 1.969, + "step": 5203 + }, + { + "epoch": 0.4, + "grad_norm": 0.8096436491396863, + "learning_rate": 3.3986310560272346e-05, + "loss": 2.1007, + "step": 5204 + }, + { + "epoch": 0.4, + "grad_norm": 0.7074908868530467, + "learning_rate": 3.398048108366435e-05, + "loss": 1.9152, + "step": 5205 + }, + { + "epoch": 0.4, + "grad_norm": 0.7490728766885801, + "learning_rate": 3.3974651046368025e-05, + "loss": 1.9479, + "step": 5206 + }, + { + "epoch": 0.4, + "grad_norm": 0.7154405943947776, + "learning_rate": 3.396882044874736e-05, + "loss": 2.0789, + "step": 5207 + }, + { + "epoch": 0.4, + "grad_norm": 0.6013850840004539, + "learning_rate": 3.39629892911664e-05, + "loss": 1.9299, + "step": 5208 + }, + { + "epoch": 0.4, + "grad_norm": 0.7772636156467689, + "learning_rate": 3.3957157573989196e-05, + "loss": 2.0059, + "step": 5209 + }, + { + "epoch": 0.4, + "grad_norm": 0.5949626288378417, + "learning_rate": 3.395132529757984e-05, + "loss": 1.9148, + "step": 5210 + }, + { + "epoch": 0.4, + "grad_norm": 0.7835083503038963, + "learning_rate": 3.394549246230248e-05, + "loss": 1.9476, + "step": 5211 + }, + { + "epoch": 0.4, + "grad_norm": 0.6232077671416494, + "learning_rate": 3.3939659068521277e-05, + "loss": 2.1236, + "step": 5212 + }, + { + "epoch": 0.4, + "grad_norm": 0.5492288599285367, + "learning_rate": 3.3933825116600426e-05, + "loss": 1.952, + "step": 5213 + }, + { + "epoch": 0.4, + "grad_norm": 0.7358976361594033, + "learning_rate": 3.392799060690418e-05, + "loss": 1.932, + "step": 5214 + }, + { + "epoch": 0.4, + "grad_norm": 0.7269091727364939, + "learning_rate": 3.392215553979679e-05, + "loss": 2.0332, + "step": 5215 + }, + { + "epoch": 0.4, + "grad_norm": 0.7253663669870873, + "learning_rate": 3.3916319915642593e-05, + "loss": 2.1421, + "step": 5216 + }, + { + "epoch": 0.4, + "grad_norm": 0.6163733515459247, + "learning_rate": 3.3910483734805907e-05, + "loss": 1.9196, + "step": 5217 + }, + { + "epoch": 0.4, + "grad_norm": 0.6822118512181622, + "learning_rate": 3.390464699765112e-05, + "loss": 1.8877, + "step": 5218 + }, + { + "epoch": 0.4, + "grad_norm": 0.7501477255554793, + "learning_rate": 3.3898809704542645e-05, + "loss": 2.1221, + "step": 5219 + }, + { + "epoch": 0.4, + "grad_norm": 0.6202622808850948, + "learning_rate": 3.3892971855844905e-05, + "loss": 1.9652, + "step": 5220 + }, + { + "epoch": 0.4, + "grad_norm": 0.8934645349024312, + "learning_rate": 3.388713345192241e-05, + "loss": 2.0123, + "step": 5221 + }, + { + "epoch": 0.4, + "grad_norm": 0.661446574841369, + "learning_rate": 3.388129449313966e-05, + "loss": 1.9432, + "step": 5222 + }, + { + "epoch": 0.4, + "grad_norm": 0.7651857552611896, + "learning_rate": 3.3875454979861224e-05, + "loss": 1.885, + "step": 5223 + }, + { + "epoch": 0.4, + "grad_norm": 0.728772148889215, + "learning_rate": 3.386961491245165e-05, + "loss": 2.1677, + "step": 5224 + }, + { + "epoch": 0.4, + "grad_norm": 0.9152544393392451, + "learning_rate": 3.386377429127559e-05, + "loss": 1.9461, + "step": 5225 + }, + { + "epoch": 0.4, + "grad_norm": 0.7011376699196756, + "learning_rate": 3.385793311669769e-05, + "loss": 1.9182, + "step": 5226 + }, + { + "epoch": 0.4, + "grad_norm": 0.6548572417212039, + "learning_rate": 3.385209138908264e-05, + "loss": 2.1628, + "step": 5227 + }, + { + "epoch": 0.4, + "grad_norm": 0.755820390360719, + "learning_rate": 3.384624910879515e-05, + "loss": 2.0142, + "step": 5228 + }, + { + "epoch": 0.4, + "grad_norm": 0.7237642728698935, + "learning_rate": 3.384040627619999e-05, + "loss": 1.9402, + "step": 5229 + }, + { + "epoch": 0.4, + "grad_norm": 0.5743953572042363, + "learning_rate": 3.3834562891661953e-05, + "loss": 1.9626, + "step": 5230 + }, + { + "epoch": 0.4, + "grad_norm": 0.692353865690534, + "learning_rate": 3.3828718955545856e-05, + "loss": 2.0868, + "step": 5231 + }, + { + "epoch": 0.4, + "grad_norm": 0.536562168032147, + "learning_rate": 3.382287446821657e-05, + "loss": 1.9185, + "step": 5232 + }, + { + "epoch": 0.4, + "grad_norm": 0.6804395895438056, + "learning_rate": 3.3817029430038984e-05, + "loss": 1.9131, + "step": 5233 + }, + { + "epoch": 0.4, + "grad_norm": 0.545639237880857, + "learning_rate": 3.3811183841378026e-05, + "loss": 2.0617, + "step": 5234 + }, + { + "epoch": 0.4, + "grad_norm": 0.6906458231078563, + "learning_rate": 3.3805337702598665e-05, + "loss": 1.9701, + "step": 5235 + }, + { + "epoch": 0.4, + "grad_norm": 0.7034130215905531, + "learning_rate": 3.37994910140659e-05, + "loss": 2.132, + "step": 5236 + }, + { + "epoch": 0.4, + "grad_norm": 0.6340922560260684, + "learning_rate": 3.379364377614476e-05, + "loss": 1.8999, + "step": 5237 + }, + { + "epoch": 0.4, + "grad_norm": 0.8457953857030698, + "learning_rate": 3.378779598920031e-05, + "loss": 1.9566, + "step": 5238 + }, + { + "epoch": 0.4, + "grad_norm": 0.6584616535164477, + "learning_rate": 3.3781947653597664e-05, + "loss": 2.1106, + "step": 5239 + }, + { + "epoch": 0.4, + "grad_norm": 0.6286440522181891, + "learning_rate": 3.377609876970194e-05, + "loss": 2.0084, + "step": 5240 + }, + { + "epoch": 0.4, + "grad_norm": 0.6934126197897016, + "learning_rate": 3.3770249337878324e-05, + "loss": 1.9316, + "step": 5241 + }, + { + "epoch": 0.4, + "grad_norm": 0.6051795487898327, + "learning_rate": 3.376439935849201e-05, + "loss": 1.9447, + "step": 5242 + }, + { + "epoch": 0.4, + "grad_norm": 0.7181415837208338, + "learning_rate": 3.3758548831908236e-05, + "loss": 1.9312, + "step": 5243 + }, + { + "epoch": 0.4, + "grad_norm": 0.6367579874078206, + "learning_rate": 3.3752697758492286e-05, + "loss": 2.1441, + "step": 5244 + }, + { + "epoch": 0.4, + "grad_norm": 0.6806886415859817, + "learning_rate": 3.374684613860945e-05, + "loss": 1.943, + "step": 5245 + }, + { + "epoch": 0.4, + "grad_norm": 0.6978920071413222, + "learning_rate": 3.374099397262508e-05, + "loss": 2.0205, + "step": 5246 + }, + { + "epoch": 0.4, + "grad_norm": 0.6608756208767823, + "learning_rate": 3.3735141260904546e-05, + "loss": 1.9107, + "step": 5247 + }, + { + "epoch": 0.4, + "grad_norm": 0.6651245321275171, + "learning_rate": 3.372928800381326e-05, + "loss": 2.0967, + "step": 5248 + }, + { + "epoch": 0.4, + "grad_norm": 0.6118157271258999, + "learning_rate": 3.3723434201716664e-05, + "loss": 1.9745, + "step": 5249 + }, + { + "epoch": 0.41, + "grad_norm": 0.6605765584694455, + "learning_rate": 3.3717579854980223e-05, + "loss": 1.9419, + "step": 5250 + }, + { + "epoch": 0.41, + "grad_norm": 0.6217061660600536, + "learning_rate": 3.371172496396947e-05, + "loss": 2.1354, + "step": 5251 + }, + { + "epoch": 0.41, + "grad_norm": 0.6241051748013673, + "learning_rate": 3.3705869529049935e-05, + "loss": 2.0294, + "step": 5252 + }, + { + "epoch": 0.41, + "grad_norm": 0.6555008498835914, + "learning_rate": 3.3700013550587206e-05, + "loss": 2.0088, + "step": 5253 + }, + { + "epoch": 0.41, + "grad_norm": 0.5542913075640968, + "learning_rate": 3.369415702894688e-05, + "loss": 1.9328, + "step": 5254 + }, + { + "epoch": 0.41, + "grad_norm": 0.6435369289148044, + "learning_rate": 3.368829996449463e-05, + "loss": 1.9698, + "step": 5255 + }, + { + "epoch": 0.41, + "grad_norm": 0.6417280768547451, + "learning_rate": 3.368244235759611e-05, + "loss": 2.1303, + "step": 5256 + }, + { + "epoch": 0.41, + "grad_norm": 0.6886953462472054, + "learning_rate": 3.3676584208617044e-05, + "loss": 1.9449, + "step": 5257 + }, + { + "epoch": 0.41, + "grad_norm": 0.6210181981287859, + "learning_rate": 3.3670725517923186e-05, + "loss": 1.9421, + "step": 5258 + }, + { + "epoch": 0.41, + "grad_norm": 0.6393147429425503, + "learning_rate": 3.366486628588031e-05, + "loss": 1.9926, + "step": 5259 + }, + { + "epoch": 0.41, + "grad_norm": 0.6190023792123103, + "learning_rate": 3.365900651285424e-05, + "loss": 2.1097, + "step": 5260 + }, + { + "epoch": 0.41, + "grad_norm": 0.5924726801705815, + "learning_rate": 3.3653146199210816e-05, + "loss": 2.0024, + "step": 5261 + }, + { + "epoch": 0.41, + "grad_norm": 0.6666711527369532, + "learning_rate": 3.364728534531593e-05, + "loss": 1.9541, + "step": 5262 + }, + { + "epoch": 0.41, + "grad_norm": 0.7360866032981873, + "learning_rate": 3.36414239515355e-05, + "loss": 2.153, + "step": 5263 + }, + { + "epoch": 0.41, + "grad_norm": 0.6590162771064586, + "learning_rate": 3.3635562018235464e-05, + "loss": 1.9536, + "step": 5264 + }, + { + "epoch": 0.41, + "grad_norm": 0.6593605614296099, + "learning_rate": 3.362969954578182e-05, + "loss": 1.9707, + "step": 5265 + }, + { + "epoch": 0.41, + "grad_norm": 0.6401941888386696, + "learning_rate": 3.3623836534540585e-05, + "loss": 1.9432, + "step": 5266 + }, + { + "epoch": 0.41, + "grad_norm": 0.6374019819026574, + "learning_rate": 3.36179729848778e-05, + "loss": 1.9842, + "step": 5267 + }, + { + "epoch": 0.41, + "grad_norm": 0.5891797848415179, + "learning_rate": 3.361210889715957e-05, + "loss": 2.1383, + "step": 5268 + }, + { + "epoch": 0.41, + "grad_norm": 0.6791328842955693, + "learning_rate": 3.3606244271751985e-05, + "loss": 1.9178, + "step": 5269 + }, + { + "epoch": 0.41, + "grad_norm": 0.7093524321758472, + "learning_rate": 3.360037910902123e-05, + "loss": 1.9443, + "step": 5270 + }, + { + "epoch": 0.41, + "grad_norm": 0.6167685012106178, + "learning_rate": 3.359451340933347e-05, + "loss": 2.0262, + "step": 5271 + }, + { + "epoch": 0.41, + "grad_norm": 0.7361821044169229, + "learning_rate": 3.3588647173054924e-05, + "loss": 2.1283, + "step": 5272 + }, + { + "epoch": 0.41, + "grad_norm": 0.7803715391666181, + "learning_rate": 3.3582780400551864e-05, + "loss": 1.9659, + "step": 5273 + }, + { + "epoch": 0.41, + "grad_norm": 0.612198908835539, + "learning_rate": 3.357691309219057e-05, + "loss": 1.9545, + "step": 5274 + }, + { + "epoch": 0.41, + "grad_norm": 0.6046219214048734, + "learning_rate": 3.357104524833733e-05, + "loss": 1.947, + "step": 5275 + }, + { + "epoch": 0.41, + "grad_norm": 0.6937068032221769, + "learning_rate": 3.356517686935855e-05, + "loss": 2.1869, + "step": 5276 + }, + { + "epoch": 0.41, + "grad_norm": 0.6028598148032348, + "learning_rate": 3.355930795562058e-05, + "loss": 2.0009, + "step": 5277 + }, + { + "epoch": 0.41, + "grad_norm": 0.652012611738902, + "learning_rate": 3.3553438507489857e-05, + "loss": 1.9558, + "step": 5278 + }, + { + "epoch": 0.41, + "grad_norm": 0.6308769959970407, + "learning_rate": 3.354756852533282e-05, + "loss": 1.961, + "step": 5279 + }, + { + "epoch": 0.41, + "grad_norm": 0.553347355467075, + "learning_rate": 3.3541698009515974e-05, + "loss": 2.1434, + "step": 5280 + }, + { + "epoch": 0.41, + "grad_norm": 0.6119953478046416, + "learning_rate": 3.353582696040583e-05, + "loss": 1.8929, + "step": 5281 + }, + { + "epoch": 0.41, + "grad_norm": 0.6226083767859955, + "learning_rate": 3.352995537836894e-05, + "loss": 1.9324, + "step": 5282 + }, + { + "epoch": 0.41, + "grad_norm": 0.5923486480085616, + "learning_rate": 3.35240832637719e-05, + "loss": 2.0477, + "step": 5283 + }, + { + "epoch": 0.41, + "grad_norm": 0.753970142294538, + "learning_rate": 3.3518210616981314e-05, + "loss": 2.1125, + "step": 5284 + }, + { + "epoch": 0.41, + "grad_norm": 0.614150341462612, + "learning_rate": 3.351233743836386e-05, + "loss": 1.9301, + "step": 5285 + }, + { + "epoch": 0.41, + "grad_norm": 0.8223604775807146, + "learning_rate": 3.35064637282862e-05, + "loss": 1.9172, + "step": 5286 + }, + { + "epoch": 0.41, + "grad_norm": 0.7853024617602508, + "learning_rate": 3.350058948711506e-05, + "loss": 1.9313, + "step": 5287 + }, + { + "epoch": 0.41, + "grad_norm": 0.7462653367780909, + "learning_rate": 3.34947147152172e-05, + "loss": 2.1277, + "step": 5288 + }, + { + "epoch": 0.41, + "grad_norm": 0.6979593356434883, + "learning_rate": 3.3488839412959404e-05, + "loss": 1.8937, + "step": 5289 + }, + { + "epoch": 0.41, + "grad_norm": 0.591860886886428, + "learning_rate": 3.34829635807085e-05, + "loss": 2.0153, + "step": 5290 + }, + { + "epoch": 0.41, + "grad_norm": 0.677212997866265, + "learning_rate": 3.3477087218831324e-05, + "loss": 1.9012, + "step": 5291 + }, + { + "epoch": 0.41, + "grad_norm": 0.7726955864109494, + "learning_rate": 3.347121032769476e-05, + "loss": 2.1036, + "step": 5292 + }, + { + "epoch": 0.41, + "grad_norm": 0.684098049955425, + "learning_rate": 3.346533290766574e-05, + "loss": 1.9699, + "step": 5293 + }, + { + "epoch": 0.41, + "grad_norm": 0.6406872766735322, + "learning_rate": 3.3459454959111216e-05, + "loss": 1.932, + "step": 5294 + }, + { + "epoch": 0.41, + "grad_norm": 0.7139993032799766, + "learning_rate": 3.345357648239816e-05, + "loss": 1.9265, + "step": 5295 + }, + { + "epoch": 0.41, + "grad_norm": 0.7681864756959326, + "learning_rate": 3.3447697477893604e-05, + "loss": 2.1703, + "step": 5296 + }, + { + "epoch": 0.41, + "grad_norm": 0.718380306151459, + "learning_rate": 3.344181794596458e-05, + "loss": 1.9471, + "step": 5297 + }, + { + "epoch": 0.41, + "grad_norm": 0.6512072857555886, + "learning_rate": 3.343593788697819e-05, + "loss": 1.9173, + "step": 5298 + }, + { + "epoch": 0.41, + "grad_norm": 0.6574842632797722, + "learning_rate": 3.3430057301301545e-05, + "loss": 1.9076, + "step": 5299 + }, + { + "epoch": 0.41, + "grad_norm": 0.633373791204063, + "learning_rate": 3.342417618930178e-05, + "loss": 2.1679, + "step": 5300 + }, + { + "epoch": 0.41, + "grad_norm": 0.5966169302392149, + "learning_rate": 3.341829455134609e-05, + "loss": 1.931, + "step": 5301 + }, + { + "epoch": 0.41, + "grad_norm": 0.6409462902292787, + "learning_rate": 3.3412412387801694e-05, + "loss": 2.0083, + "step": 5302 + }, + { + "epoch": 0.41, + "grad_norm": 0.5929602334546218, + "learning_rate": 3.3406529699035826e-05, + "loss": 1.9623, + "step": 5303 + }, + { + "epoch": 0.41, + "grad_norm": 0.5766598760441138, + "learning_rate": 3.3400646485415784e-05, + "loss": 2.1611, + "step": 5304 + }, + { + "epoch": 0.41, + "grad_norm": 0.6439478219571193, + "learning_rate": 3.339476274730886e-05, + "loss": 1.8813, + "step": 5305 + }, + { + "epoch": 0.41, + "grad_norm": 0.69001488862924, + "learning_rate": 3.338887848508242e-05, + "loss": 1.9284, + "step": 5306 + }, + { + "epoch": 0.41, + "grad_norm": 0.6379634097320331, + "learning_rate": 3.338299369910383e-05, + "loss": 1.9562, + "step": 5307 + }, + { + "epoch": 0.41, + "grad_norm": 0.7348691268440591, + "learning_rate": 3.337710838974051e-05, + "loss": 2.1931, + "step": 5308 + }, + { + "epoch": 0.41, + "grad_norm": 0.586891011009019, + "learning_rate": 3.3371222557359894e-05, + "loss": 1.932, + "step": 5309 + }, + { + "epoch": 0.41, + "grad_norm": 0.7085664646717423, + "learning_rate": 3.336533620232946e-05, + "loss": 1.9048, + "step": 5310 + }, + { + "epoch": 0.41, + "grad_norm": 0.6181471965254565, + "learning_rate": 3.3359449325016725e-05, + "loss": 1.911, + "step": 5311 + }, + { + "epoch": 0.41, + "grad_norm": 0.5879129201872632, + "learning_rate": 3.335356192578923e-05, + "loss": 2.1318, + "step": 5312 + }, + { + "epoch": 0.41, + "grad_norm": 0.6472406361103712, + "learning_rate": 3.334767400501455e-05, + "loss": 1.9394, + "step": 5313 + }, + { + "epoch": 0.41, + "grad_norm": 0.5692944377047722, + "learning_rate": 3.334178556306028e-05, + "loss": 1.9983, + "step": 5314 + }, + { + "epoch": 0.41, + "grad_norm": 0.5861367237346459, + "learning_rate": 3.333589660029407e-05, + "loss": 1.8992, + "step": 5315 + }, + { + "epoch": 0.41, + "grad_norm": 0.5696021726610209, + "learning_rate": 3.3330007117083596e-05, + "loss": 2.1327, + "step": 5316 + }, + { + "epoch": 0.41, + "grad_norm": 0.5841289705934051, + "learning_rate": 3.3324117113796554e-05, + "loss": 1.9826, + "step": 5317 + }, + { + "epoch": 0.41, + "grad_norm": 0.5924441354951783, + "learning_rate": 3.3318226590800676e-05, + "loss": 1.911, + "step": 5318 + }, + { + "epoch": 0.41, + "grad_norm": 0.5634746192330765, + "learning_rate": 3.331233554846375e-05, + "loss": 1.9629, + "step": 5319 + }, + { + "epoch": 0.41, + "grad_norm": 0.6060996788835779, + "learning_rate": 3.330644398715357e-05, + "loss": 2.0848, + "step": 5320 + }, + { + "epoch": 0.41, + "grad_norm": 0.6252072286373167, + "learning_rate": 3.3300551907237966e-05, + "loss": 2.0015, + "step": 5321 + }, + { + "epoch": 0.41, + "grad_norm": 0.5882399546778966, + "learning_rate": 3.3294659309084795e-05, + "loss": 1.8883, + "step": 5322 + }, + { + "epoch": 0.41, + "grad_norm": 0.6282342767627428, + "learning_rate": 3.328876619306199e-05, + "loss": 1.9294, + "step": 5323 + }, + { + "epoch": 0.41, + "grad_norm": 0.7159374445704546, + "learning_rate": 3.3282872559537445e-05, + "loss": 2.1121, + "step": 5324 + }, + { + "epoch": 0.41, + "grad_norm": 0.6241206143967585, + "learning_rate": 3.327697840887914e-05, + "loss": 1.9856, + "step": 5325 + }, + { + "epoch": 0.41, + "grad_norm": 0.6719809961015538, + "learning_rate": 3.327108374145507e-05, + "loss": 1.9072, + "step": 5326 + }, + { + "epoch": 0.41, + "grad_norm": 0.6331125481645862, + "learning_rate": 3.326518855763327e-05, + "loss": 2.0484, + "step": 5327 + }, + { + "epoch": 0.41, + "grad_norm": 0.7214303477512397, + "learning_rate": 3.325929285778179e-05, + "loss": 2.1567, + "step": 5328 + }, + { + "epoch": 0.41, + "grad_norm": 0.5861930785042947, + "learning_rate": 3.325339664226873e-05, + "loss": 1.9386, + "step": 5329 + }, + { + "epoch": 0.41, + "grad_norm": 0.5961781830149099, + "learning_rate": 3.3247499911462204e-05, + "loss": 1.9516, + "step": 5330 + }, + { + "epoch": 0.41, + "grad_norm": 0.7115045127246735, + "learning_rate": 3.324160266573038e-05, + "loss": 1.9597, + "step": 5331 + }, + { + "epoch": 0.41, + "grad_norm": 0.7211409242964176, + "learning_rate": 3.323570490544145e-05, + "loss": 2.1304, + "step": 5332 + }, + { + "epoch": 0.41, + "grad_norm": 0.8215052802825855, + "learning_rate": 3.3229806630963626e-05, + "loss": 1.9903, + "step": 5333 + }, + { + "epoch": 0.41, + "grad_norm": 0.6843410490338113, + "learning_rate": 3.322390784266517e-05, + "loss": 1.969, + "step": 5334 + }, + { + "epoch": 0.41, + "grad_norm": 0.6323225278612354, + "learning_rate": 3.321800854091435e-05, + "loss": 1.9766, + "step": 5335 + }, + { + "epoch": 0.41, + "grad_norm": 0.710342787216553, + "learning_rate": 3.3212108726079504e-05, + "loss": 2.1685, + "step": 5336 + }, + { + "epoch": 0.41, + "grad_norm": 0.6274321409641549, + "learning_rate": 3.320620839852897e-05, + "loss": 1.9584, + "step": 5337 + }, + { + "epoch": 0.41, + "grad_norm": 0.6616181259084215, + "learning_rate": 3.3200307558631145e-05, + "loss": 1.8952, + "step": 5338 + }, + { + "epoch": 0.41, + "grad_norm": 0.6943513058564507, + "learning_rate": 3.319440620675442e-05, + "loss": 2.011, + "step": 5339 + }, + { + "epoch": 0.41, + "grad_norm": 0.6387000761587426, + "learning_rate": 3.318850434326726e-05, + "loss": 2.0719, + "step": 5340 + }, + { + "epoch": 0.41, + "grad_norm": 0.7940324939661112, + "learning_rate": 3.318260196853813e-05, + "loss": 1.9866, + "step": 5341 + }, + { + "epoch": 0.41, + "grad_norm": 0.7355360939558231, + "learning_rate": 3.3176699082935545e-05, + "loss": 1.9287, + "step": 5342 + }, + { + "epoch": 0.41, + "grad_norm": 0.5906442451607258, + "learning_rate": 3.317079568682804e-05, + "loss": 1.8854, + "step": 5343 + }, + { + "epoch": 0.41, + "grad_norm": 0.9712313172517418, + "learning_rate": 3.31648917805842e-05, + "loss": 2.1572, + "step": 5344 + }, + { + "epoch": 0.41, + "grad_norm": 0.6233534474131831, + "learning_rate": 3.315898736457263e-05, + "loss": 2.0169, + "step": 5345 + }, + { + "epoch": 0.41, + "grad_norm": 0.8243955919054732, + "learning_rate": 3.3153082439161955e-05, + "loss": 1.9047, + "step": 5346 + }, + { + "epoch": 0.41, + "grad_norm": 0.7424397036822097, + "learning_rate": 3.314717700472085e-05, + "loss": 1.9045, + "step": 5347 + }, + { + "epoch": 0.41, + "grad_norm": 0.762248712150822, + "learning_rate": 3.314127106161801e-05, + "loss": 2.1389, + "step": 5348 + }, + { + "epoch": 0.41, + "grad_norm": 0.763049404309697, + "learning_rate": 3.3135364610222176e-05, + "loss": 1.8824, + "step": 5349 + }, + { + "epoch": 0.41, + "grad_norm": 0.7094588973745759, + "learning_rate": 3.312945765090211e-05, + "loss": 1.9473, + "step": 5350 + }, + { + "epoch": 0.41, + "grad_norm": 0.6796285764005191, + "learning_rate": 3.31235501840266e-05, + "loss": 1.9082, + "step": 5351 + }, + { + "epoch": 0.41, + "grad_norm": 0.8510634566621629, + "learning_rate": 3.311764220996449e-05, + "loss": 2.2012, + "step": 5352 + }, + { + "epoch": 0.41, + "grad_norm": 0.7778469676119967, + "learning_rate": 3.311173372908462e-05, + "loss": 1.9048, + "step": 5353 + }, + { + "epoch": 0.41, + "grad_norm": 0.614625148193122, + "learning_rate": 3.310582474175589e-05, + "loss": 1.97, + "step": 5354 + }, + { + "epoch": 0.41, + "grad_norm": 0.7622313466563077, + "learning_rate": 3.309991524834723e-05, + "loss": 1.9458, + "step": 5355 + }, + { + "epoch": 0.41, + "grad_norm": 0.6917900793142051, + "learning_rate": 3.309400524922758e-05, + "loss": 2.143, + "step": 5356 + }, + { + "epoch": 0.41, + "grad_norm": 0.7178232395697096, + "learning_rate": 3.308809474476593e-05, + "loss": 1.9195, + "step": 5357 + }, + { + "epoch": 0.41, + "grad_norm": 0.7657897095043855, + "learning_rate": 3.30821837353313e-05, + "loss": 2.0424, + "step": 5358 + }, + { + "epoch": 0.41, + "grad_norm": 0.7310202310255814, + "learning_rate": 3.3076272221292735e-05, + "loss": 1.9099, + "step": 5359 + }, + { + "epoch": 0.41, + "grad_norm": 0.647379420288956, + "learning_rate": 3.307036020301932e-05, + "loss": 2.0979, + "step": 5360 + }, + { + "epoch": 0.41, + "grad_norm": 0.8142841419592612, + "learning_rate": 3.306444768088016e-05, + "loss": 1.9309, + "step": 5361 + }, + { + "epoch": 0.41, + "grad_norm": 0.7231203214133937, + "learning_rate": 3.305853465524441e-05, + "loss": 1.9071, + "step": 5362 + }, + { + "epoch": 0.41, + "grad_norm": 0.7493323650010434, + "learning_rate": 3.305262112648123e-05, + "loss": 1.9176, + "step": 5363 + }, + { + "epoch": 0.41, + "grad_norm": 0.6321117970619141, + "learning_rate": 3.3046707094959836e-05, + "loss": 2.143, + "step": 5364 + }, + { + "epoch": 0.41, + "grad_norm": 0.8904915285736467, + "learning_rate": 3.304079256104945e-05, + "loss": 1.8658, + "step": 5365 + }, + { + "epoch": 0.41, + "grad_norm": 0.5726260305969468, + "learning_rate": 3.3034877525119365e-05, + "loss": 1.9868, + "step": 5366 + }, + { + "epoch": 0.41, + "grad_norm": 0.8061666037883968, + "learning_rate": 3.3028961987538866e-05, + "loss": 1.9327, + "step": 5367 + }, + { + "epoch": 0.41, + "grad_norm": 0.6501762678587258, + "learning_rate": 3.3023045948677276e-05, + "loss": 2.1338, + "step": 5368 + }, + { + "epoch": 0.41, + "grad_norm": 0.6640013442665536, + "learning_rate": 3.3017129408903975e-05, + "loss": 1.9243, + "step": 5369 + }, + { + "epoch": 0.41, + "grad_norm": 0.6466435086930428, + "learning_rate": 3.301121236858835e-05, + "loss": 2.025, + "step": 5370 + }, + { + "epoch": 0.41, + "grad_norm": 0.6449506710824027, + "learning_rate": 3.3005294828099834e-05, + "loss": 1.9383, + "step": 5371 + }, + { + "epoch": 0.41, + "grad_norm": 0.8232486897759911, + "learning_rate": 3.2999376787807864e-05, + "loss": 2.1042, + "step": 5372 + }, + { + "epoch": 0.41, + "grad_norm": 0.7443728813879235, + "learning_rate": 3.2993458248081944e-05, + "loss": 1.936, + "step": 5373 + }, + { + "epoch": 0.41, + "grad_norm": 0.7590241250146753, + "learning_rate": 3.2987539209291584e-05, + "loss": 1.9107, + "step": 5374 + }, + { + "epoch": 0.41, + "grad_norm": 0.5787622383389469, + "learning_rate": 3.2981619671806335e-05, + "loss": 1.8928, + "step": 5375 + }, + { + "epoch": 0.41, + "grad_norm": 0.825712362311262, + "learning_rate": 3.297569963599579e-05, + "loss": 2.1247, + "step": 5376 + }, + { + "epoch": 0.41, + "grad_norm": 0.5490339581906496, + "learning_rate": 3.2969779102229545e-05, + "loss": 2.0606, + "step": 5377 + }, + { + "epoch": 0.41, + "grad_norm": 0.6229286280893256, + "learning_rate": 3.2963858070877254e-05, + "loss": 1.8879, + "step": 5378 + }, + { + "epoch": 0.41, + "grad_norm": 0.6643637550041719, + "learning_rate": 3.295793654230858e-05, + "loss": 1.9659, + "step": 5379 + }, + { + "epoch": 0.42, + "grad_norm": 0.6463529286718409, + "learning_rate": 3.2952014516893234e-05, + "loss": 2.1603, + "step": 5380 + }, + { + "epoch": 0.42, + "grad_norm": 0.6484300848430331, + "learning_rate": 3.294609199500097e-05, + "loss": 1.9299, + "step": 5381 + }, + { + "epoch": 0.42, + "grad_norm": 0.6379223253281268, + "learning_rate": 3.2940168977001526e-05, + "loss": 1.9676, + "step": 5382 + }, + { + "epoch": 0.42, + "grad_norm": 0.5928337322562393, + "learning_rate": 3.293424546326472e-05, + "loss": 2.0145, + "step": 5383 + }, + { + "epoch": 0.42, + "grad_norm": 0.6034318649683867, + "learning_rate": 3.292832145416037e-05, + "loss": 2.1486, + "step": 5384 + }, + { + "epoch": 0.42, + "grad_norm": 0.6452086210617555, + "learning_rate": 3.292239695005835e-05, + "loss": 1.9154, + "step": 5385 + }, + { + "epoch": 0.42, + "grad_norm": 0.565199029063573, + "learning_rate": 3.291647195132853e-05, + "loss": 1.8916, + "step": 5386 + }, + { + "epoch": 0.42, + "grad_norm": 0.611928703412391, + "learning_rate": 3.291054645834085e-05, + "loss": 1.9355, + "step": 5387 + }, + { + "epoch": 0.42, + "grad_norm": 0.641325278371438, + "learning_rate": 3.2904620471465265e-05, + "loss": 2.0621, + "step": 5388 + }, + { + "epoch": 0.42, + "grad_norm": 0.5642565882519102, + "learning_rate": 3.2898693991071745e-05, + "loss": 2.0146, + "step": 5389 + }, + { + "epoch": 0.42, + "grad_norm": 0.6294307780830736, + "learning_rate": 3.28927670175303e-05, + "loss": 1.8861, + "step": 5390 + }, + { + "epoch": 0.42, + "grad_norm": 0.6456823197143003, + "learning_rate": 3.2886839551211e-05, + "loss": 2.0031, + "step": 5391 + }, + { + "epoch": 0.42, + "grad_norm": 0.6659878202889287, + "learning_rate": 3.28809115924839e-05, + "loss": 2.162, + "step": 5392 + }, + { + "epoch": 0.42, + "grad_norm": 0.6324399690362816, + "learning_rate": 3.2874983141719115e-05, + "loss": 1.9593, + "step": 5393 + }, + { + "epoch": 0.42, + "grad_norm": 0.602348209580063, + "learning_rate": 3.286905419928678e-05, + "loss": 1.9467, + "step": 5394 + }, + { + "epoch": 0.42, + "grad_norm": 0.5756152802113174, + "learning_rate": 3.286312476555707e-05, + "loss": 1.9929, + "step": 5395 + }, + { + "epoch": 0.42, + "grad_norm": 0.6515698233982119, + "learning_rate": 3.2857194840900176e-05, + "loss": 2.1354, + "step": 5396 + }, + { + "epoch": 0.42, + "grad_norm": 0.6215240250757839, + "learning_rate": 3.285126442568633e-05, + "loss": 1.9331, + "step": 5397 + }, + { + "epoch": 0.42, + "grad_norm": 0.6643247006869244, + "learning_rate": 3.284533352028579e-05, + "loss": 1.8621, + "step": 5398 + }, + { + "epoch": 0.42, + "grad_norm": 0.6937257105518946, + "learning_rate": 3.2839402125068855e-05, + "loss": 1.9668, + "step": 5399 + }, + { + "epoch": 0.42, + "grad_norm": 0.6590564962345531, + "learning_rate": 3.283347024040583e-05, + "loss": 2.1266, + "step": 5400 + }, + { + "epoch": 0.42, + "grad_norm": 0.6415555727897525, + "learning_rate": 3.282753786666708e-05, + "loss": 1.9967, + "step": 5401 + }, + { + "epoch": 0.42, + "grad_norm": 0.7811069309996574, + "learning_rate": 3.2821605004222986e-05, + "loss": 1.9463, + "step": 5402 + }, + { + "epoch": 0.42, + "grad_norm": 0.6789272133585802, + "learning_rate": 3.2815671653443966e-05, + "loss": 1.9341, + "step": 5403 + }, + { + "epoch": 0.42, + "grad_norm": 0.6393927864589408, + "learning_rate": 3.2809737814700446e-05, + "loss": 2.1274, + "step": 5404 + }, + { + "epoch": 0.42, + "grad_norm": 0.720660913358279, + "learning_rate": 3.280380348836292e-05, + "loss": 1.9389, + "step": 5405 + }, + { + "epoch": 0.42, + "grad_norm": 0.5962042049683706, + "learning_rate": 3.279786867480189e-05, + "loss": 1.9441, + "step": 5406 + }, + { + "epoch": 0.42, + "grad_norm": 0.6429573779968364, + "learning_rate": 3.279193337438786e-05, + "loss": 1.9165, + "step": 5407 + }, + { + "epoch": 0.42, + "grad_norm": 0.5831571987918922, + "learning_rate": 3.278599758749143e-05, + "loss": 2.0326, + "step": 5408 + }, + { + "epoch": 0.42, + "grad_norm": 0.6558730896963416, + "learning_rate": 3.278006131448319e-05, + "loss": 2.1279, + "step": 5409 + }, + { + "epoch": 0.42, + "grad_norm": 0.7726335095758782, + "learning_rate": 3.2774124555733764e-05, + "loss": 1.9557, + "step": 5410 + }, + { + "epoch": 0.42, + "grad_norm": 0.634100231303346, + "learning_rate": 3.27681873116138e-05, + "loss": 1.9333, + "step": 5411 + }, + { + "epoch": 0.42, + "grad_norm": 0.8239728322209793, + "learning_rate": 3.2762249582493984e-05, + "loss": 2.1315, + "step": 5412 + }, + { + "epoch": 0.42, + "grad_norm": 0.6687361561821789, + "learning_rate": 3.2756311368745053e-05, + "loss": 1.92, + "step": 5413 + }, + { + "epoch": 0.42, + "grad_norm": 0.6801857455707073, + "learning_rate": 3.275037267073773e-05, + "loss": 1.9852, + "step": 5414 + }, + { + "epoch": 0.42, + "grad_norm": 0.8131654143789754, + "learning_rate": 3.274443348884281e-05, + "loss": 1.9139, + "step": 5415 + }, + { + "epoch": 0.42, + "grad_norm": 0.7387883901254066, + "learning_rate": 3.273849382343107e-05, + "loss": 2.1647, + "step": 5416 + }, + { + "epoch": 0.42, + "grad_norm": 0.8309319043134368, + "learning_rate": 3.2732553674873396e-05, + "loss": 1.9756, + "step": 5417 + }, + { + "epoch": 0.42, + "grad_norm": 0.6997358030108618, + "learning_rate": 3.272661304354062e-05, + "loss": 1.9328, + "step": 5418 + }, + { + "epoch": 0.42, + "grad_norm": 0.6219879378199415, + "learning_rate": 3.2720671929803645e-05, + "loss": 1.9756, + "step": 5419 + }, + { + "epoch": 0.42, + "grad_norm": 0.7561806188237168, + "learning_rate": 3.271473033403341e-05, + "loss": 2.031, + "step": 5420 + }, + { + "epoch": 0.42, + "grad_norm": 0.6677278711034889, + "learning_rate": 3.270878825660087e-05, + "loss": 2.1608, + "step": 5421 + }, + { + "epoch": 0.42, + "grad_norm": 0.6945289703547172, + "learning_rate": 3.2702845697877e-05, + "loss": 1.9434, + "step": 5422 + }, + { + "epoch": 0.42, + "grad_norm": 0.7416432792052098, + "learning_rate": 3.2696902658232835e-05, + "loss": 1.9324, + "step": 5423 + }, + { + "epoch": 0.42, + "grad_norm": 0.7182562747949965, + "learning_rate": 3.269095913803942e-05, + "loss": 2.1002, + "step": 5424 + }, + { + "epoch": 0.42, + "grad_norm": 0.6315693837351214, + "learning_rate": 3.268501513766783e-05, + "loss": 1.8572, + "step": 5425 + }, + { + "epoch": 0.42, + "grad_norm": 0.8830270748114272, + "learning_rate": 3.267907065748917e-05, + "loss": 2.009, + "step": 5426 + }, + { + "epoch": 0.42, + "grad_norm": 0.5447074759827956, + "learning_rate": 3.2673125697874585e-05, + "loss": 1.9355, + "step": 5427 + }, + { + "epoch": 0.42, + "grad_norm": 0.8267359268744935, + "learning_rate": 3.266718025919524e-05, + "loss": 1.9089, + "step": 5428 + }, + { + "epoch": 0.42, + "grad_norm": 0.739055047598812, + "learning_rate": 3.266123434182233e-05, + "loss": 2.1481, + "step": 5429 + }, + { + "epoch": 0.42, + "grad_norm": 0.6565457511060875, + "learning_rate": 3.265528794612709e-05, + "loss": 1.8831, + "step": 5430 + }, + { + "epoch": 0.42, + "grad_norm": 0.8211801035284677, + "learning_rate": 3.264934107248079e-05, + "loss": 1.9369, + "step": 5431 + }, + { + "epoch": 0.42, + "grad_norm": 0.6170832083440241, + "learning_rate": 3.264339372125468e-05, + "loss": 2.0434, + "step": 5432 + }, + { + "epoch": 0.42, + "grad_norm": 0.8385473880863283, + "learning_rate": 3.263744589282011e-05, + "loss": 2.1591, + "step": 5433 + }, + { + "epoch": 0.42, + "grad_norm": 0.6369368375090488, + "learning_rate": 3.2631497587548415e-05, + "loss": 1.912, + "step": 5434 + }, + { + "epoch": 0.42, + "grad_norm": 0.6332721939866883, + "learning_rate": 3.262554880581098e-05, + "loss": 1.9064, + "step": 5435 + }, + { + "epoch": 0.42, + "grad_norm": 0.8214737652989269, + "learning_rate": 3.26195995479792e-05, + "loss": 2.0765, + "step": 5436 + }, + { + "epoch": 0.42, + "grad_norm": 0.6195597384186993, + "learning_rate": 3.261364981442452e-05, + "loss": 1.9174, + "step": 5437 + }, + { + "epoch": 0.42, + "grad_norm": 0.7362173295151322, + "learning_rate": 3.26076996055184e-05, + "loss": 1.9226, + "step": 5438 + }, + { + "epoch": 0.42, + "grad_norm": 0.6311151273567933, + "learning_rate": 3.260174892163235e-05, + "loss": 2.0216, + "step": 5439 + }, + { + "epoch": 0.42, + "grad_norm": 0.554020995852219, + "learning_rate": 3.259579776313789e-05, + "loss": 1.9296, + "step": 5440 + }, + { + "epoch": 0.42, + "grad_norm": 0.881589836792152, + "learning_rate": 3.258984613040655e-05, + "loss": 2.1549, + "step": 5441 + }, + { + "epoch": 0.42, + "grad_norm": 0.5622356008906851, + "learning_rate": 3.2583894023809956e-05, + "loss": 1.9503, + "step": 5442 + }, + { + "epoch": 0.42, + "grad_norm": 0.7343325030107479, + "learning_rate": 3.25779414437197e-05, + "loss": 1.9232, + "step": 5443 + }, + { + "epoch": 0.42, + "grad_norm": 0.6680789443046955, + "learning_rate": 3.2571988390507416e-05, + "loss": 2.1192, + "step": 5444 + }, + { + "epoch": 0.42, + "grad_norm": 0.6236030928499064, + "learning_rate": 3.2566034864544804e-05, + "loss": 2.0281, + "step": 5445 + }, + { + "epoch": 0.42, + "grad_norm": 0.6963852352563481, + "learning_rate": 3.2560080866203545e-05, + "loss": 1.9275, + "step": 5446 + }, + { + "epoch": 0.42, + "grad_norm": 0.5962980734923388, + "learning_rate": 3.2554126395855385e-05, + "loss": 1.9583, + "step": 5447 + }, + { + "epoch": 0.42, + "grad_norm": 0.618694015992922, + "learning_rate": 3.2548171453872076e-05, + "loss": 2.1159, + "step": 5448 + }, + { + "epoch": 0.42, + "grad_norm": 0.6097435989779305, + "learning_rate": 3.254221604062542e-05, + "loss": 1.969, + "step": 5449 + }, + { + "epoch": 0.42, + "grad_norm": 0.5839369802511799, + "learning_rate": 3.253626015648723e-05, + "loss": 1.976, + "step": 5450 + }, + { + "epoch": 0.42, + "grad_norm": 0.5962427501335629, + "learning_rate": 3.253030380182936e-05, + "loss": 1.99, + "step": 5451 + }, + { + "epoch": 0.42, + "grad_norm": 0.6226046656599126, + "learning_rate": 3.252434697702369e-05, + "loss": 2.0126, + "step": 5452 + }, + { + "epoch": 0.42, + "grad_norm": 0.6503967928360542, + "learning_rate": 3.2518389682442126e-05, + "loss": 2.1454, + "step": 5453 + }, + { + "epoch": 0.42, + "grad_norm": 0.6229639323652619, + "learning_rate": 3.25124319184566e-05, + "loss": 1.957, + "step": 5454 + }, + { + "epoch": 0.42, + "grad_norm": 0.6585957902523308, + "learning_rate": 3.2506473685439096e-05, + "loss": 1.9081, + "step": 5455 + }, + { + "epoch": 0.42, + "grad_norm": 0.7721535816273783, + "learning_rate": 3.25005149837616e-05, + "loss": 2.1421, + "step": 5456 + }, + { + "epoch": 0.42, + "grad_norm": 0.6653778234987971, + "learning_rate": 3.249455581379614e-05, + "loss": 2.0075, + "step": 5457 + }, + { + "epoch": 0.42, + "grad_norm": 0.6623979170580239, + "learning_rate": 3.248859617591477e-05, + "loss": 1.963, + "step": 5458 + }, + { + "epoch": 0.42, + "grad_norm": 0.6740516845005219, + "learning_rate": 3.248263607048958e-05, + "loss": 1.9438, + "step": 5459 + }, + { + "epoch": 0.42, + "grad_norm": 0.6493063185098801, + "learning_rate": 3.247667549789268e-05, + "loss": 1.9434, + "step": 5460 + }, + { + "epoch": 0.42, + "grad_norm": 0.7555680808154528, + "learning_rate": 3.247071445849622e-05, + "loss": 2.1238, + "step": 5461 + }, + { + "epoch": 0.42, + "grad_norm": 0.6736019075933498, + "learning_rate": 3.246475295267236e-05, + "loss": 1.9137, + "step": 5462 + }, + { + "epoch": 0.42, + "grad_norm": 0.7857940747407072, + "learning_rate": 3.245879098079331e-05, + "loss": 2.0145, + "step": 5463 + }, + { + "epoch": 0.42, + "grad_norm": 0.5621384193787339, + "learning_rate": 3.245282854323129e-05, + "loss": 1.942, + "step": 5464 + }, + { + "epoch": 0.42, + "grad_norm": 0.7935732787791099, + "learning_rate": 3.244686564035857e-05, + "loss": 2.1737, + "step": 5465 + }, + { + "epoch": 0.42, + "grad_norm": 0.5655421507494806, + "learning_rate": 3.2440902272547435e-05, + "loss": 1.9185, + "step": 5466 + }, + { + "epoch": 0.42, + "grad_norm": 0.7009622857266311, + "learning_rate": 3.243493844017021e-05, + "loss": 1.9592, + "step": 5467 + }, + { + "epoch": 0.42, + "grad_norm": 0.5869157228127593, + "learning_rate": 3.242897414359923e-05, + "loss": 2.1205, + "step": 5468 + }, + { + "epoch": 0.42, + "grad_norm": 0.7839826632691027, + "learning_rate": 3.2423009383206876e-05, + "loss": 1.9139, + "step": 5469 + }, + { + "epoch": 0.42, + "grad_norm": 0.6880840664426252, + "learning_rate": 3.241704415936555e-05, + "loss": 2.0146, + "step": 5470 + }, + { + "epoch": 0.42, + "grad_norm": 0.6882937678105371, + "learning_rate": 3.241107847244769e-05, + "loss": 1.9057, + "step": 5471 + }, + { + "epoch": 0.42, + "grad_norm": 0.6652235381871626, + "learning_rate": 3.240511232282575e-05, + "loss": 1.9161, + "step": 5472 + }, + { + "epoch": 0.42, + "grad_norm": 0.6758711507376635, + "learning_rate": 3.239914571087223e-05, + "loss": 2.1404, + "step": 5473 + }, + { + "epoch": 0.42, + "grad_norm": 0.6828744622650211, + "learning_rate": 3.239317863695965e-05, + "loss": 1.9411, + "step": 5474 + }, + { + "epoch": 0.42, + "grad_norm": 0.7109241498687872, + "learning_rate": 3.238721110146055e-05, + "loss": 1.946, + "step": 5475 + }, + { + "epoch": 0.42, + "grad_norm": 0.6202686161398212, + "learning_rate": 3.2381243104747515e-05, + "loss": 2.005, + "step": 5476 + }, + { + "epoch": 0.42, + "grad_norm": 0.6073099042464479, + "learning_rate": 3.237527464719315e-05, + "loss": 2.1776, + "step": 5477 + }, + { + "epoch": 0.42, + "grad_norm": 0.6354792856647093, + "learning_rate": 3.2369305729170104e-05, + "loss": 1.9666, + "step": 5478 + }, + { + "epoch": 0.42, + "grad_norm": 0.6266403012475132, + "learning_rate": 3.236333635105101e-05, + "loss": 1.9383, + "step": 5479 + }, + { + "epoch": 0.42, + "grad_norm": 0.7119905599063874, + "learning_rate": 3.235736651320858e-05, + "loss": 2.1329, + "step": 5480 + }, + { + "epoch": 0.42, + "grad_norm": 0.5632344650770692, + "learning_rate": 3.235139621601554e-05, + "loss": 1.8772, + "step": 5481 + }, + { + "epoch": 0.42, + "grad_norm": 0.6895694931197734, + "learning_rate": 3.234542545984464e-05, + "loss": 1.9847, + "step": 5482 + }, + { + "epoch": 0.42, + "grad_norm": 0.57348800410612, + "learning_rate": 3.233945424506864e-05, + "loss": 1.9015, + "step": 5483 + }, + { + "epoch": 0.42, + "grad_norm": 0.7073989494594237, + "learning_rate": 3.233348257206036e-05, + "loss": 1.9312, + "step": 5484 + }, + { + "epoch": 0.42, + "grad_norm": 0.6053516722515452, + "learning_rate": 3.232751044119264e-05, + "loss": 2.1203, + "step": 5485 + }, + { + "epoch": 0.42, + "grad_norm": 0.7736522186800224, + "learning_rate": 3.232153785283835e-05, + "loss": 1.965, + "step": 5486 + }, + { + "epoch": 0.42, + "grad_norm": 0.66586357634712, + "learning_rate": 3.2315564807370366e-05, + "loss": 1.8798, + "step": 5487 + }, + { + "epoch": 0.42, + "grad_norm": 0.654245001755994, + "learning_rate": 3.230959130516161e-05, + "loss": 1.9973, + "step": 5488 + }, + { + "epoch": 0.42, + "grad_norm": 0.6061864268640438, + "learning_rate": 3.230361734658505e-05, + "loss": 2.0671, + "step": 5489 + }, + { + "epoch": 0.42, + "grad_norm": 0.7680848019920755, + "learning_rate": 3.229764293201366e-05, + "loss": 1.9135, + "step": 5490 + }, + { + "epoch": 0.42, + "grad_norm": 0.6036311509983753, + "learning_rate": 3.2291668061820436e-05, + "loss": 1.884, + "step": 5491 + }, + { + "epoch": 0.42, + "grad_norm": 0.7897024667505609, + "learning_rate": 3.228569273637841e-05, + "loss": 1.9369, + "step": 5492 + }, + { + "epoch": 0.42, + "grad_norm": 0.6258449289895553, + "learning_rate": 3.227971695606067e-05, + "loss": 2.0891, + "step": 5493 + }, + { + "epoch": 0.42, + "grad_norm": 0.6916593594434459, + "learning_rate": 3.227374072124029e-05, + "loss": 2.0673, + "step": 5494 + }, + { + "epoch": 0.42, + "grad_norm": 0.6518117188330713, + "learning_rate": 3.22677640322904e-05, + "loss": 1.9512, + "step": 5495 + }, + { + "epoch": 0.42, + "grad_norm": 0.6092611170364233, + "learning_rate": 3.226178688958415e-05, + "loss": 1.9958, + "step": 5496 + }, + { + "epoch": 0.42, + "grad_norm": 0.6112249442078188, + "learning_rate": 3.225580929349469e-05, + "loss": 2.1535, + "step": 5497 + }, + { + "epoch": 0.42, + "grad_norm": 0.6476680973428768, + "learning_rate": 3.224983124439526e-05, + "loss": 1.905, + "step": 5498 + }, + { + "epoch": 0.42, + "grad_norm": 0.6653773667041585, + "learning_rate": 3.224385274265909e-05, + "loss": 1.8951, + "step": 5499 + }, + { + "epoch": 0.42, + "grad_norm": 0.6582542361566793, + "learning_rate": 3.2237873788659426e-05, + "loss": 2.0665, + "step": 5500 + }, + { + "epoch": 0.42, + "grad_norm": 0.6227587498986435, + "learning_rate": 3.223189438276957e-05, + "loss": 2.061, + "step": 5501 + }, + { + "epoch": 0.42, + "grad_norm": 0.676275070605795, + "learning_rate": 3.222591452536283e-05, + "loss": 1.8989, + "step": 5502 + }, + { + "epoch": 0.42, + "grad_norm": 0.5952723171790377, + "learning_rate": 3.2219934216812577e-05, + "loss": 1.9088, + "step": 5503 + }, + { + "epoch": 0.42, + "grad_norm": 0.6425753209429459, + "learning_rate": 3.221395345749216e-05, + "loss": 1.9099, + "step": 5504 + }, + { + "epoch": 0.42, + "grad_norm": 0.6438114003272419, + "learning_rate": 3.220797224777499e-05, + "loss": 2.1085, + "step": 5505 + }, + { + "epoch": 0.42, + "grad_norm": 0.5728218918483337, + "learning_rate": 3.220199058803451e-05, + "loss": 1.9183, + "step": 5506 + }, + { + "epoch": 0.42, + "grad_norm": 0.6184371594927183, + "learning_rate": 3.219600847864417e-05, + "loss": 2.0645, + "step": 5507 + }, + { + "epoch": 0.42, + "grad_norm": 0.627973492224833, + "learning_rate": 3.219002591997746e-05, + "loss": 1.9195, + "step": 5508 + }, + { + "epoch": 0.43, + "grad_norm": 0.7046136339300492, + "learning_rate": 3.218404291240789e-05, + "loss": 2.1546, + "step": 5509 + }, + { + "epoch": 0.43, + "grad_norm": 0.7030012737852505, + "learning_rate": 3.217805945630901e-05, + "loss": 1.9139, + "step": 5510 + }, + { + "epoch": 0.43, + "grad_norm": 0.6097944281340764, + "learning_rate": 3.2172075552054396e-05, + "loss": 1.9455, + "step": 5511 + }, + { + "epoch": 0.43, + "grad_norm": 0.6250208566547525, + "learning_rate": 3.2166091200017634e-05, + "loss": 1.9487, + "step": 5512 + }, + { + "epoch": 0.43, + "grad_norm": 0.6892774558588758, + "learning_rate": 3.216010640057236e-05, + "loss": 2.1717, + "step": 5513 + }, + { + "epoch": 0.43, + "grad_norm": 0.6820070281238775, + "learning_rate": 3.215412115409223e-05, + "loss": 1.932, + "step": 5514 + }, + { + "epoch": 0.43, + "grad_norm": 0.719605332206247, + "learning_rate": 3.2148135460950926e-05, + "loss": 1.9452, + "step": 5515 + }, + { + "epoch": 0.43, + "grad_norm": 0.7167754269098408, + "learning_rate": 3.214214932152216e-05, + "loss": 1.9655, + "step": 5516 + }, + { + "epoch": 0.43, + "grad_norm": 0.7925441305797717, + "learning_rate": 3.213616273617967e-05, + "loss": 2.1331, + "step": 5517 + }, + { + "epoch": 0.43, + "grad_norm": 0.9049723788698983, + "learning_rate": 3.213017570529723e-05, + "loss": 1.9337, + "step": 5518 + }, + { + "epoch": 0.43, + "grad_norm": 0.7241100713603708, + "learning_rate": 3.212418822924862e-05, + "loss": 1.9803, + "step": 5519 + }, + { + "epoch": 0.43, + "grad_norm": 0.8024369542836135, + "learning_rate": 3.211820030840768e-05, + "loss": 1.8979, + "step": 5520 + }, + { + "epoch": 0.43, + "grad_norm": 0.8232868303700673, + "learning_rate": 3.211221194314825e-05, + "loss": 2.1254, + "step": 5521 + }, + { + "epoch": 0.43, + "grad_norm": 0.8988761093127632, + "learning_rate": 3.2106223133844207e-05, + "loss": 1.9475, + "step": 5522 + }, + { + "epoch": 0.43, + "grad_norm": 0.7382648375615933, + "learning_rate": 3.2100233880869466e-05, + "loss": 1.9, + "step": 5523 + }, + { + "epoch": 0.43, + "grad_norm": 0.7229839387085425, + "learning_rate": 3.209424418459795e-05, + "loss": 1.9229, + "step": 5524 + }, + { + "epoch": 0.43, + "grad_norm": 0.8075262864280117, + "learning_rate": 3.208825404540363e-05, + "loss": 2.1694, + "step": 5525 + }, + { + "epoch": 0.43, + "grad_norm": 1.0498941409031897, + "learning_rate": 3.2082263463660484e-05, + "loss": 1.9127, + "step": 5526 + }, + { + "epoch": 0.43, + "grad_norm": 0.7824956633924119, + "learning_rate": 3.207627243974254e-05, + "loss": 1.9317, + "step": 5527 + }, + { + "epoch": 0.43, + "grad_norm": 0.695756948294812, + "learning_rate": 3.207028097402385e-05, + "loss": 1.9239, + "step": 5528 + }, + { + "epoch": 0.43, + "grad_norm": 0.869502937624567, + "learning_rate": 3.2064289066878456e-05, + "loss": 2.1836, + "step": 5529 + }, + { + "epoch": 0.43, + "grad_norm": 0.6652202611194128, + "learning_rate": 3.205829671868048e-05, + "loss": 1.911, + "step": 5530 + }, + { + "epoch": 0.43, + "grad_norm": 0.7013150867735364, + "learning_rate": 3.205230392980404e-05, + "loss": 1.9228, + "step": 5531 + }, + { + "epoch": 0.43, + "grad_norm": 0.5496800131590572, + "learning_rate": 3.20463107006233e-05, + "loss": 2.0032, + "step": 5532 + }, + { + "epoch": 0.43, + "grad_norm": 0.7706156566641672, + "learning_rate": 3.2040317031512436e-05, + "loss": 2.1507, + "step": 5533 + }, + { + "epoch": 0.43, + "grad_norm": 0.6176484443005774, + "learning_rate": 3.2034322922845655e-05, + "loss": 1.9458, + "step": 5534 + }, + { + "epoch": 0.43, + "grad_norm": 0.5469379133064656, + "learning_rate": 3.2028328374997196e-05, + "loss": 1.9403, + "step": 5535 + }, + { + "epoch": 0.43, + "grad_norm": 0.639909366424169, + "learning_rate": 3.202233338834133e-05, + "loss": 1.9298, + "step": 5536 + }, + { + "epoch": 0.43, + "grad_norm": 0.6565769827278872, + "learning_rate": 3.201633796325233e-05, + "loss": 2.1173, + "step": 5537 + }, + { + "epoch": 0.43, + "grad_norm": 0.5550613749009332, + "learning_rate": 3.201034210010453e-05, + "loss": 2.0534, + "step": 5538 + }, + { + "epoch": 0.43, + "grad_norm": 0.6131837862882319, + "learning_rate": 3.200434579927228e-05, + "loss": 1.9521, + "step": 5539 + }, + { + "epoch": 0.43, + "grad_norm": 0.653824083945356, + "learning_rate": 3.1998349061129936e-05, + "loss": 1.883, + "step": 5540 + }, + { + "epoch": 0.43, + "grad_norm": 0.6178267137072395, + "learning_rate": 3.199235188605192e-05, + "loss": 2.1442, + "step": 5541 + }, + { + "epoch": 0.43, + "grad_norm": 0.7666114406094763, + "learning_rate": 3.198635427441266e-05, + "loss": 1.9135, + "step": 5542 + }, + { + "epoch": 0.43, + "grad_norm": 0.6891744295974432, + "learning_rate": 3.198035622658658e-05, + "loss": 1.9241, + "step": 5543 + }, + { + "epoch": 0.43, + "grad_norm": 0.8053479588396891, + "learning_rate": 3.1974357742948196e-05, + "loss": 2.0171, + "step": 5544 + }, + { + "epoch": 0.43, + "grad_norm": 0.645338650666459, + "learning_rate": 3.1968358823872006e-05, + "loss": 2.1128, + "step": 5545 + }, + { + "epoch": 0.43, + "grad_norm": 0.5481985921662712, + "learning_rate": 3.196235946973255e-05, + "loss": 1.9602, + "step": 5546 + }, + { + "epoch": 0.43, + "grad_norm": 0.6978525522805725, + "learning_rate": 3.195635968090439e-05, + "loss": 1.92, + "step": 5547 + }, + { + "epoch": 0.43, + "grad_norm": 0.6134188586923429, + "learning_rate": 3.195035945776212e-05, + "loss": 1.9142, + "step": 5548 + }, + { + "epoch": 0.43, + "grad_norm": 0.7686567179918987, + "learning_rate": 3.194435880068034e-05, + "loss": 2.1173, + "step": 5549 + }, + { + "epoch": 0.43, + "grad_norm": 0.6178725405164761, + "learning_rate": 3.193835771003373e-05, + "loss": 1.9653, + "step": 5550 + }, + { + "epoch": 0.43, + "grad_norm": 0.6683409509862176, + "learning_rate": 3.193235618619694e-05, + "loss": 1.9064, + "step": 5551 + }, + { + "epoch": 0.43, + "grad_norm": 0.6730131551392984, + "learning_rate": 3.192635422954467e-05, + "loss": 1.898, + "step": 5552 + }, + { + "epoch": 0.43, + "grad_norm": 0.6284940050193553, + "learning_rate": 3.1920351840451657e-05, + "loss": 2.1349, + "step": 5553 + }, + { + "epoch": 0.43, + "grad_norm": 0.6939775343718775, + "learning_rate": 3.191434901929265e-05, + "loss": 1.9622, + "step": 5554 + }, + { + "epoch": 0.43, + "grad_norm": 0.6332149772626218, + "learning_rate": 3.190834576644242e-05, + "loss": 1.9276, + "step": 5555 + }, + { + "epoch": 0.43, + "grad_norm": 0.6489657017125712, + "learning_rate": 3.190234208227578e-05, + "loss": 1.9118, + "step": 5556 + }, + { + "epoch": 0.43, + "grad_norm": 0.6571878936188528, + "learning_rate": 3.1896337967167586e-05, + "loss": 2.1403, + "step": 5557 + }, + { + "epoch": 0.43, + "grad_norm": 0.5664121402087847, + "learning_rate": 3.1890333421492674e-05, + "loss": 1.9243, + "step": 5558 + }, + { + "epoch": 0.43, + "grad_norm": 0.7547122291313327, + "learning_rate": 3.1884328445625937e-05, + "loss": 1.8924, + "step": 5559 + }, + { + "epoch": 0.43, + "grad_norm": 0.6396239459256722, + "learning_rate": 3.18783230399423e-05, + "loss": 1.9602, + "step": 5560 + }, + { + "epoch": 0.43, + "grad_norm": 0.635847358947385, + "learning_rate": 3.187231720481669e-05, + "loss": 2.1302, + "step": 5561 + }, + { + "epoch": 0.43, + "grad_norm": 0.6876565785077545, + "learning_rate": 3.186631094062409e-05, + "loss": 1.9288, + "step": 5562 + }, + { + "epoch": 0.43, + "grad_norm": 0.6089386780474755, + "learning_rate": 3.186030424773949e-05, + "loss": 2.0301, + "step": 5563 + }, + { + "epoch": 0.43, + "grad_norm": 0.5876104511747701, + "learning_rate": 3.1854297126537916e-05, + "loss": 1.9272, + "step": 5564 + }, + { + "epoch": 0.43, + "grad_norm": 0.6949961849253385, + "learning_rate": 3.184828957739441e-05, + "loss": 2.1834, + "step": 5565 + }, + { + "epoch": 0.43, + "grad_norm": 0.5832294479535828, + "learning_rate": 3.184228160068405e-05, + "loss": 1.9594, + "step": 5566 + }, + { + "epoch": 0.43, + "grad_norm": 0.71915185201773, + "learning_rate": 3.183627319678195e-05, + "loss": 1.9801, + "step": 5567 + }, + { + "epoch": 0.43, + "grad_norm": 0.5781935776393736, + "learning_rate": 3.1830264366063226e-05, + "loss": 1.9394, + "step": 5568 + }, + { + "epoch": 0.43, + "grad_norm": 0.6775409416722471, + "learning_rate": 3.182425510890303e-05, + "loss": 2.1399, + "step": 5569 + }, + { + "epoch": 0.43, + "grad_norm": 0.6175411048432007, + "learning_rate": 3.181824542567656e-05, + "loss": 1.916, + "step": 5570 + }, + { + "epoch": 0.43, + "grad_norm": 0.6560404504115196, + "learning_rate": 3.181223531675901e-05, + "loss": 1.9763, + "step": 5571 + }, + { + "epoch": 0.43, + "grad_norm": 0.7631719080507129, + "learning_rate": 3.180622478252564e-05, + "loss": 1.9363, + "step": 5572 + }, + { + "epoch": 0.43, + "grad_norm": 0.7248499098020429, + "learning_rate": 3.1800213823351685e-05, + "loss": 2.125, + "step": 5573 + }, + { + "epoch": 0.43, + "grad_norm": 0.7869909938454049, + "learning_rate": 3.1794202439612444e-05, + "loss": 1.9454, + "step": 5574 + }, + { + "epoch": 0.43, + "grad_norm": 0.6510848517505108, + "learning_rate": 3.1788190631683244e-05, + "loss": 1.9797, + "step": 5575 + }, + { + "epoch": 0.43, + "grad_norm": 0.6339881798816766, + "learning_rate": 3.178217839993941e-05, + "loss": 1.9007, + "step": 5576 + }, + { + "epoch": 0.43, + "grad_norm": 0.7958293017951832, + "learning_rate": 3.1776165744756314e-05, + "loss": 2.1362, + "step": 5577 + }, + { + "epoch": 0.43, + "grad_norm": 0.6717389265772505, + "learning_rate": 3.1770152666509353e-05, + "loss": 1.9518, + "step": 5578 + }, + { + "epoch": 0.43, + "grad_norm": 0.6488673600725035, + "learning_rate": 3.176413916557395e-05, + "loss": 1.9405, + "step": 5579 + }, + { + "epoch": 0.43, + "grad_norm": 0.6598453826940345, + "learning_rate": 3.1758125242325554e-05, + "loss": 1.9191, + "step": 5580 + }, + { + "epoch": 0.43, + "grad_norm": 0.6914219150642945, + "learning_rate": 3.175211089713963e-05, + "loss": 2.1558, + "step": 5581 + }, + { + "epoch": 0.43, + "grad_norm": 0.6103646244784562, + "learning_rate": 3.174609613039169e-05, + "loss": 1.8826, + "step": 5582 + }, + { + "epoch": 0.43, + "grad_norm": 0.6979368666189041, + "learning_rate": 3.174008094245725e-05, + "loss": 1.9413, + "step": 5583 + }, + { + "epoch": 0.43, + "grad_norm": 0.6121559807410863, + "learning_rate": 3.173406533371187e-05, + "loss": 1.8936, + "step": 5584 + }, + { + "epoch": 0.43, + "grad_norm": 0.710151507960139, + "learning_rate": 3.172804930453113e-05, + "loss": 2.101, + "step": 5585 + }, + { + "epoch": 0.43, + "grad_norm": 0.6376532148603582, + "learning_rate": 3.172203285529063e-05, + "loss": 1.9711, + "step": 5586 + }, + { + "epoch": 0.43, + "grad_norm": 0.5740796587668582, + "learning_rate": 3.1716015986366005e-05, + "loss": 1.9534, + "step": 5587 + }, + { + "epoch": 0.43, + "grad_norm": 0.5661990887156229, + "learning_rate": 3.170999869813291e-05, + "loss": 2.0021, + "step": 5588 + }, + { + "epoch": 0.43, + "grad_norm": 0.657153919988527, + "learning_rate": 3.170398099096703e-05, + "loss": 2.1396, + "step": 5589 + }, + { + "epoch": 0.43, + "grad_norm": 0.6297288853270356, + "learning_rate": 3.169796286524408e-05, + "loss": 1.989, + "step": 5590 + }, + { + "epoch": 0.43, + "grad_norm": 0.5844081551829533, + "learning_rate": 3.169194432133979e-05, + "loss": 1.9137, + "step": 5591 + }, + { + "epoch": 0.43, + "grad_norm": 0.6411655285201882, + "learning_rate": 3.1685925359629924e-05, + "loss": 1.9242, + "step": 5592 + }, + { + "epoch": 0.43, + "grad_norm": 0.5957149856691657, + "learning_rate": 3.1679905980490274e-05, + "loss": 2.1384, + "step": 5593 + }, + { + "epoch": 0.43, + "grad_norm": 0.6402308920806045, + "learning_rate": 3.167388618429665e-05, + "loss": 1.9841, + "step": 5594 + }, + { + "epoch": 0.43, + "grad_norm": 0.5446746250188561, + "learning_rate": 3.166786597142489e-05, + "loss": 1.874, + "step": 5595 + }, + { + "epoch": 0.43, + "grad_norm": 0.597463264678572, + "learning_rate": 3.166184534225087e-05, + "loss": 1.8861, + "step": 5596 + }, + { + "epoch": 0.43, + "grad_norm": 0.6503667002407351, + "learning_rate": 3.165582429715049e-05, + "loss": 2.1238, + "step": 5597 + }, + { + "epoch": 0.43, + "grad_norm": 0.5549774539797778, + "learning_rate": 3.164980283649964e-05, + "loss": 1.9113, + "step": 5598 + }, + { + "epoch": 0.43, + "grad_norm": 0.6981562770849379, + "learning_rate": 3.164378096067428e-05, + "loss": 1.9204, + "step": 5599 + }, + { + "epoch": 0.43, + "grad_norm": 0.6651272135530678, + "learning_rate": 3.1637758670050396e-05, + "loss": 1.9922, + "step": 5600 + }, + { + "epoch": 0.43, + "grad_norm": 0.6938228074042723, + "learning_rate": 3.1631735965003955e-05, + "loss": 2.1331, + "step": 5601 + }, + { + "epoch": 0.43, + "grad_norm": 0.6700507001131234, + "learning_rate": 3.1625712845911e-05, + "loss": 1.9222, + "step": 5602 + }, + { + "epoch": 0.43, + "grad_norm": 0.5687402131253495, + "learning_rate": 3.161968931314757e-05, + "loss": 1.9802, + "step": 5603 + }, + { + "epoch": 0.43, + "grad_norm": 0.7408564606199575, + "learning_rate": 3.1613665367089744e-05, + "loss": 1.9038, + "step": 5604 + }, + { + "epoch": 0.43, + "grad_norm": 0.6017354416684018, + "learning_rate": 3.160764100811362e-05, + "loss": 2.0945, + "step": 5605 + }, + { + "epoch": 0.43, + "grad_norm": 0.6633917462520157, + "learning_rate": 3.160161623659532e-05, + "loss": 1.9678, + "step": 5606 + }, + { + "epoch": 0.43, + "grad_norm": 0.5746071633726025, + "learning_rate": 3.1595591052911e-05, + "loss": 1.9158, + "step": 5607 + }, + { + "epoch": 0.43, + "grad_norm": 0.6561268664491572, + "learning_rate": 3.1589565457436834e-05, + "loss": 1.9476, + "step": 5608 + }, + { + "epoch": 0.43, + "grad_norm": 0.6480544738128139, + "learning_rate": 3.1583539450549036e-05, + "loss": 2.1389, + "step": 5609 + }, + { + "epoch": 0.43, + "grad_norm": 0.649009919750529, + "learning_rate": 3.157751303262381e-05, + "loss": 1.9035, + "step": 5610 + }, + { + "epoch": 0.43, + "grad_norm": 0.6935282012022059, + "learning_rate": 3.1571486204037426e-05, + "loss": 1.9505, + "step": 5611 + }, + { + "epoch": 0.43, + "grad_norm": 0.6754449665646393, + "learning_rate": 3.1565458965166166e-05, + "loss": 2.0261, + "step": 5612 + }, + { + "epoch": 0.43, + "grad_norm": 0.8167251778993289, + "learning_rate": 3.1559431316386335e-05, + "loss": 2.091, + "step": 5613 + }, + { + "epoch": 0.43, + "grad_norm": 0.6611146503877938, + "learning_rate": 3.155340325807425e-05, + "loss": 1.9362, + "step": 5614 + }, + { + "epoch": 0.43, + "grad_norm": 0.7208759102287223, + "learning_rate": 3.1547374790606293e-05, + "loss": 1.8937, + "step": 5615 + }, + { + "epoch": 0.43, + "grad_norm": 0.6500901377578676, + "learning_rate": 3.154134591435882e-05, + "loss": 1.8853, + "step": 5616 + }, + { + "epoch": 0.43, + "grad_norm": 0.7823771440414354, + "learning_rate": 3.153531662970825e-05, + "loss": 2.1279, + "step": 5617 + }, + { + "epoch": 0.43, + "grad_norm": 0.8283768467376464, + "learning_rate": 3.1529286937031016e-05, + "loss": 1.9031, + "step": 5618 + }, + { + "epoch": 0.43, + "grad_norm": 0.835448366300111, + "learning_rate": 3.1523256836703573e-05, + "loss": 1.9898, + "step": 5619 + }, + { + "epoch": 0.43, + "grad_norm": 0.7975719300266685, + "learning_rate": 3.151722632910241e-05, + "loss": 1.8939, + "step": 5620 + }, + { + "epoch": 0.43, + "grad_norm": 0.9297861943058162, + "learning_rate": 3.151119541460403e-05, + "loss": 2.1439, + "step": 5621 + }, + { + "epoch": 0.43, + "grad_norm": 0.6940642597277984, + "learning_rate": 3.150516409358498e-05, + "loss": 1.9605, + "step": 5622 + }, + { + "epoch": 0.43, + "grad_norm": 0.8434133156426439, + "learning_rate": 3.14991323664218e-05, + "loss": 1.9468, + "step": 5623 + }, + { + "epoch": 0.43, + "grad_norm": 0.7307312587276985, + "learning_rate": 3.149310023349109e-05, + "loss": 1.9061, + "step": 5624 + }, + { + "epoch": 0.43, + "grad_norm": 0.6250806731301657, + "learning_rate": 3.148706769516947e-05, + "loss": 1.9862, + "step": 5625 + }, + { + "epoch": 0.43, + "grad_norm": 0.7314703992850408, + "learning_rate": 3.148103475183356e-05, + "loss": 2.1846, + "step": 5626 + }, + { + "epoch": 0.43, + "grad_norm": 0.7413703977005917, + "learning_rate": 3.147500140386002e-05, + "loss": 1.9285, + "step": 5627 + }, + { + "epoch": 0.43, + "grad_norm": 0.6007459767992146, + "learning_rate": 3.146896765162554e-05, + "loss": 1.9477, + "step": 5628 + }, + { + "epoch": 0.43, + "grad_norm": 0.7063260244334643, + "learning_rate": 3.1462933495506844e-05, + "loss": 2.1046, + "step": 5629 + }, + { + "epoch": 0.43, + "grad_norm": 0.6583464691097333, + "learning_rate": 3.1456898935880654e-05, + "loss": 1.9515, + "step": 5630 + }, + { + "epoch": 0.43, + "grad_norm": 0.595900724631551, + "learning_rate": 3.1450863973123735e-05, + "loss": 2.0259, + "step": 5631 + }, + { + "epoch": 0.43, + "grad_norm": 0.7036715175892024, + "learning_rate": 3.1444828607612875e-05, + "loss": 1.9199, + "step": 5632 + }, + { + "epoch": 0.43, + "grad_norm": 0.7564329608678626, + "learning_rate": 3.1438792839724895e-05, + "loss": 2.1449, + "step": 5633 + }, + { + "epoch": 0.43, + "grad_norm": 0.5711565792424932, + "learning_rate": 3.1432756669836624e-05, + "loss": 1.9007, + "step": 5634 + }, + { + "epoch": 0.43, + "grad_norm": 0.8805444805309195, + "learning_rate": 3.1426720098324936e-05, + "loss": 1.95, + "step": 5635 + }, + { + "epoch": 0.43, + "grad_norm": 0.6766885487840181, + "learning_rate": 3.14206831255667e-05, + "loss": 1.9246, + "step": 5636 + }, + { + "epoch": 0.43, + "grad_norm": 0.8552016492847282, + "learning_rate": 3.141464575193884e-05, + "loss": 1.9995, + "step": 5637 + }, + { + "epoch": 0.43, + "grad_norm": 0.7930754126393524, + "learning_rate": 3.1408607977818306e-05, + "loss": 2.126, + "step": 5638 + }, + { + "epoch": 0.44, + "grad_norm": 0.6392465697169919, + "learning_rate": 3.140256980358204e-05, + "loss": 1.9739, + "step": 5639 + }, + { + "epoch": 0.44, + "grad_norm": 0.6505359465621804, + "learning_rate": 3.139653122960704e-05, + "loss": 1.9368, + "step": 5640 + }, + { + "epoch": 0.44, + "grad_norm": 0.9011323354788731, + "learning_rate": 3.139049225627031e-05, + "loss": 2.1172, + "step": 5641 + }, + { + "epoch": 0.44, + "grad_norm": 0.6137511859891005, + "learning_rate": 3.1384452883948906e-05, + "loss": 1.9576, + "step": 5642 + }, + { + "epoch": 0.44, + "grad_norm": 0.8351084893802556, + "learning_rate": 3.137841311301988e-05, + "loss": 2.0007, + "step": 5643 + }, + { + "epoch": 0.44, + "grad_norm": 0.7214810994985507, + "learning_rate": 3.137237294386032e-05, + "loss": 1.9329, + "step": 5644 + }, + { + "epoch": 0.44, + "grad_norm": 0.6237961656532114, + "learning_rate": 3.136633237684733e-05, + "loss": 1.9463, + "step": 5645 + }, + { + "epoch": 0.44, + "grad_norm": 0.7963826803090053, + "learning_rate": 3.1360291412358055e-05, + "loss": 2.1485, + "step": 5646 + }, + { + "epoch": 0.44, + "grad_norm": 0.7336081837304146, + "learning_rate": 3.135425005076967e-05, + "loss": 1.9525, + "step": 5647 + }, + { + "epoch": 0.44, + "grad_norm": 0.6102364683341208, + "learning_rate": 3.134820829245935e-05, + "loss": 1.9593, + "step": 5648 + }, + { + "epoch": 0.44, + "grad_norm": 0.8611686594721094, + "learning_rate": 3.1342166137804305e-05, + "loss": 2.0992, + "step": 5649 + }, + { + "epoch": 0.44, + "grad_norm": 0.7283125798332221, + "learning_rate": 3.133612358718177e-05, + "loss": 2.0374, + "step": 5650 + }, + { + "epoch": 0.44, + "grad_norm": 0.7680529055810161, + "learning_rate": 3.1330080640969013e-05, + "loss": 1.9495, + "step": 5651 + }, + { + "epoch": 0.44, + "grad_norm": 0.8126057726636566, + "learning_rate": 3.132403729954331e-05, + "loss": 1.9785, + "step": 5652 + }, + { + "epoch": 0.44, + "grad_norm": 0.907884186185073, + "learning_rate": 3.131799356328199e-05, + "loss": 2.1239, + "step": 5653 + }, + { + "epoch": 0.44, + "grad_norm": 0.8293905053910474, + "learning_rate": 3.1311949432562374e-05, + "loss": 1.9556, + "step": 5654 + }, + { + "epoch": 0.44, + "grad_norm": 0.9725845588753346, + "learning_rate": 3.130590490776182e-05, + "loss": 1.9439, + "step": 5655 + }, + { + "epoch": 0.44, + "grad_norm": 0.5958940545997812, + "learning_rate": 3.129985998925772e-05, + "loss": 1.9927, + "step": 5656 + }, + { + "epoch": 0.44, + "grad_norm": 0.9049355484599579, + "learning_rate": 3.1293814677427475e-05, + "loss": 1.9656, + "step": 5657 + }, + { + "epoch": 0.44, + "grad_norm": 0.7549136803220631, + "learning_rate": 3.128776897264853e-05, + "loss": 2.1347, + "step": 5658 + }, + { + "epoch": 0.44, + "grad_norm": 0.6720022788834428, + "learning_rate": 3.128172287529834e-05, + "loss": 1.9347, + "step": 5659 + }, + { + "epoch": 0.44, + "grad_norm": 0.6583619839814041, + "learning_rate": 3.127567638575438e-05, + "loss": 1.9452, + "step": 5660 + }, + { + "epoch": 0.44, + "grad_norm": 0.8349044684190652, + "learning_rate": 3.126962950439416e-05, + "loss": 2.1666, + "step": 5661 + }, + { + "epoch": 0.44, + "grad_norm": 0.6399870177308284, + "learning_rate": 3.126358223159522e-05, + "loss": 2.0164, + "step": 5662 + }, + { + "epoch": 0.44, + "grad_norm": 0.6637986645778988, + "learning_rate": 3.1257534567735105e-05, + "loss": 1.9026, + "step": 5663 + }, + { + "epoch": 0.44, + "grad_norm": 0.7877965929691404, + "learning_rate": 3.12514865131914e-05, + "loss": 1.9328, + "step": 5664 + }, + { + "epoch": 0.44, + "grad_norm": 0.6548425843584832, + "learning_rate": 3.1245438068341716e-05, + "loss": 2.118, + "step": 5665 + }, + { + "epoch": 0.44, + "grad_norm": 0.8428785557154008, + "learning_rate": 3.123938923356367e-05, + "loss": 1.909, + "step": 5666 + }, + { + "epoch": 0.44, + "grad_norm": 0.7076931364083444, + "learning_rate": 3.1233340009234916e-05, + "loss": 1.948, + "step": 5667 + }, + { + "epoch": 0.44, + "grad_norm": 0.9209240343831496, + "learning_rate": 3.122729039573315e-05, + "loss": 2.0336, + "step": 5668 + }, + { + "epoch": 0.44, + "grad_norm": 0.6994883790430878, + "learning_rate": 3.1221240393436065e-05, + "loss": 1.9338, + "step": 5669 + }, + { + "epoch": 0.44, + "grad_norm": 0.7258248339480239, + "learning_rate": 3.121519000272138e-05, + "loss": 2.0986, + "step": 5670 + }, + { + "epoch": 0.44, + "grad_norm": 0.7379881905749857, + "learning_rate": 3.120913922396684e-05, + "loss": 1.9479, + "step": 5671 + }, + { + "epoch": 0.44, + "grad_norm": 0.7116379554679477, + "learning_rate": 3.1203088057550244e-05, + "loss": 1.9304, + "step": 5672 + }, + { + "epoch": 0.44, + "grad_norm": 0.6419883003008122, + "learning_rate": 3.1197036503849375e-05, + "loss": 2.1337, + "step": 5673 + }, + { + "epoch": 0.44, + "grad_norm": 0.5931873374662571, + "learning_rate": 3.119098456324206e-05, + "loss": 1.9675, + "step": 5674 + }, + { + "epoch": 0.44, + "grad_norm": 0.6365796046810436, + "learning_rate": 3.118493223610613e-05, + "loss": 1.9293, + "step": 5675 + }, + { + "epoch": 0.44, + "grad_norm": 0.6763007361020857, + "learning_rate": 3.1178879522819496e-05, + "loss": 1.8999, + "step": 5676 + }, + { + "epoch": 0.44, + "grad_norm": 0.5798559912838727, + "learning_rate": 3.117282642376002e-05, + "loss": 1.9353, + "step": 5677 + }, + { + "epoch": 0.44, + "grad_norm": 0.6138782324790876, + "learning_rate": 3.1166772939305624e-05, + "loss": 2.1953, + "step": 5678 + }, + { + "epoch": 0.44, + "grad_norm": 0.6053371296245269, + "learning_rate": 3.116071906983427e-05, + "loss": 1.9841, + "step": 5679 + }, + { + "epoch": 0.44, + "grad_norm": 0.5811291216702393, + "learning_rate": 3.115466481572391e-05, + "loss": 1.9546, + "step": 5680 + }, + { + "epoch": 0.44, + "grad_norm": 0.5583008752776615, + "learning_rate": 3.114861017735255e-05, + "loss": 2.0254, + "step": 5681 + }, + { + "epoch": 0.44, + "grad_norm": 0.6959782999116002, + "learning_rate": 3.114255515509818e-05, + "loss": 2.1192, + "step": 5682 + }, + { + "epoch": 0.44, + "grad_norm": 0.6288505739894407, + "learning_rate": 3.1136499749338884e-05, + "loss": 1.9543, + "step": 5683 + }, + { + "epoch": 0.44, + "grad_norm": 0.6022301968296992, + "learning_rate": 3.113044396045268e-05, + "loss": 1.9214, + "step": 5684 + }, + { + "epoch": 0.44, + "grad_norm": 0.6258144349345304, + "learning_rate": 3.1124387788817675e-05, + "loss": 2.0662, + "step": 5685 + }, + { + "epoch": 0.44, + "grad_norm": 0.6814655204425702, + "learning_rate": 3.111833123481198e-05, + "loss": 1.9214, + "step": 5686 + }, + { + "epoch": 0.44, + "grad_norm": 0.5806089413387597, + "learning_rate": 3.111227429881374e-05, + "loss": 2.0211, + "step": 5687 + }, + { + "epoch": 0.44, + "grad_norm": 0.6995239871323705, + "learning_rate": 3.11062169812011e-05, + "loss": 1.9629, + "step": 5688 + }, + { + "epoch": 0.44, + "grad_norm": 0.6256702290667359, + "learning_rate": 3.1100159282352245e-05, + "loss": 1.8834, + "step": 5689 + }, + { + "epoch": 0.44, + "grad_norm": 0.7182005483318241, + "learning_rate": 3.1094101202645396e-05, + "loss": 2.0998, + "step": 5690 + }, + { + "epoch": 0.44, + "grad_norm": 0.5897869539217989, + "learning_rate": 3.1088042742458774e-05, + "loss": 1.9761, + "step": 5691 + }, + { + "epoch": 0.44, + "grad_norm": 0.712672048422269, + "learning_rate": 3.108198390217062e-05, + "loss": 1.9743, + "step": 5692 + }, + { + "epoch": 0.44, + "grad_norm": 0.5958747449883048, + "learning_rate": 3.1075924682159235e-05, + "loss": 1.9697, + "step": 5693 + }, + { + "epoch": 0.44, + "grad_norm": 0.7223463824161566, + "learning_rate": 3.1069865082802915e-05, + "loss": 2.1399, + "step": 5694 + }, + { + "epoch": 0.44, + "grad_norm": 0.6468503222814457, + "learning_rate": 3.106380510447998e-05, + "loss": 1.9359, + "step": 5695 + }, + { + "epoch": 0.44, + "grad_norm": 0.6973626502674143, + "learning_rate": 3.105774474756878e-05, + "loss": 1.8797, + "step": 5696 + }, + { + "epoch": 0.44, + "grad_norm": 0.5766560756694544, + "learning_rate": 3.105168401244769e-05, + "loss": 2.1195, + "step": 5697 + }, + { + "epoch": 0.44, + "grad_norm": 0.6723336036288439, + "learning_rate": 3.1045622899495105e-05, + "loss": 1.9232, + "step": 5698 + }, + { + "epoch": 0.44, + "grad_norm": 0.5929080382003645, + "learning_rate": 3.103956140908944e-05, + "loss": 1.9183, + "step": 5699 + }, + { + "epoch": 0.44, + "grad_norm": 0.6355364240260755, + "learning_rate": 3.1033499541609154e-05, + "loss": 1.9449, + "step": 5700 + }, + { + "epoch": 0.44, + "grad_norm": 0.6227281413567548, + "learning_rate": 3.102743729743271e-05, + "loss": 1.9086, + "step": 5701 + }, + { + "epoch": 0.44, + "grad_norm": 0.6266973476625657, + "learning_rate": 3.1021374676938584e-05, + "loss": 2.1226, + "step": 5702 + }, + { + "epoch": 0.44, + "grad_norm": 0.6539028996800659, + "learning_rate": 3.10153116805053e-05, + "loss": 1.9272, + "step": 5703 + }, + { + "epoch": 0.44, + "grad_norm": 0.6681228223165171, + "learning_rate": 3.1009248308511394e-05, + "loss": 1.9208, + "step": 5704 + }, + { + "epoch": 0.44, + "grad_norm": 0.6159281972341079, + "learning_rate": 3.100318456133544e-05, + "loss": 2.1283, + "step": 5705 + }, + { + "epoch": 0.44, + "grad_norm": 0.7052663774126838, + "learning_rate": 3.0997120439356006e-05, + "loss": 1.9883, + "step": 5706 + }, + { + "epoch": 0.44, + "grad_norm": 0.6501867927821594, + "learning_rate": 3.099105594295171e-05, + "loss": 1.9268, + "step": 5707 + }, + { + "epoch": 0.44, + "grad_norm": 0.6176458681958992, + "learning_rate": 3.098499107250118e-05, + "loss": 1.9281, + "step": 5708 + }, + { + "epoch": 0.44, + "grad_norm": 0.6047726259960889, + "learning_rate": 3.097892582838307e-05, + "loss": 1.8908, + "step": 5709 + }, + { + "epoch": 0.44, + "grad_norm": 0.671476239877798, + "learning_rate": 3.0972860210976046e-05, + "loss": 2.0854, + "step": 5710 + }, + { + "epoch": 0.44, + "grad_norm": 0.6699469831719684, + "learning_rate": 3.096679422065883e-05, + "loss": 1.8663, + "step": 5711 + }, + { + "epoch": 0.44, + "grad_norm": 0.6341770898239449, + "learning_rate": 3.096072785781015e-05, + "loss": 1.9885, + "step": 5712 + }, + { + "epoch": 0.44, + "grad_norm": 0.6226054819923315, + "learning_rate": 3.095466112280873e-05, + "loss": 1.9509, + "step": 5713 + }, + { + "epoch": 0.44, + "grad_norm": 0.6880827740637214, + "learning_rate": 3.094859401603336e-05, + "loss": 2.0943, + "step": 5714 + }, + { + "epoch": 0.44, + "grad_norm": 0.662426860200876, + "learning_rate": 3.0942526537862827e-05, + "loss": 1.9288, + "step": 5715 + }, + { + "epoch": 0.44, + "grad_norm": 0.6754217997764672, + "learning_rate": 3.0936458688675946e-05, + "loss": 1.9322, + "step": 5716 + }, + { + "epoch": 0.44, + "grad_norm": 0.8095193802523499, + "learning_rate": 3.093039046885156e-05, + "loss": 2.1083, + "step": 5717 + }, + { + "epoch": 0.44, + "grad_norm": 0.5588495657601058, + "learning_rate": 3.092432187876853e-05, + "loss": 1.9885, + "step": 5718 + }, + { + "epoch": 0.44, + "grad_norm": 0.6877874677781738, + "learning_rate": 3.091825291880576e-05, + "loss": 1.9208, + "step": 5719 + }, + { + "epoch": 0.44, + "grad_norm": 0.7145755284049263, + "learning_rate": 3.0912183589342145e-05, + "loss": 1.9058, + "step": 5720 + }, + { + "epoch": 0.44, + "grad_norm": 0.6521407847622906, + "learning_rate": 3.090611389075661e-05, + "loss": 1.9402, + "step": 5721 + }, + { + "epoch": 0.44, + "grad_norm": 0.6378500896549859, + "learning_rate": 3.0900043823428135e-05, + "loss": 2.112, + "step": 5722 + }, + { + "epoch": 0.44, + "grad_norm": 0.6985318167672901, + "learning_rate": 3.0893973387735687e-05, + "loss": 1.9081, + "step": 5723 + }, + { + "epoch": 0.44, + "grad_norm": 0.7396519116591681, + "learning_rate": 3.088790258405826e-05, + "loss": 1.9504, + "step": 5724 + }, + { + "epoch": 0.44, + "grad_norm": 0.6966317296586871, + "learning_rate": 3.088183141277489e-05, + "loss": 1.9531, + "step": 5725 + }, + { + "epoch": 0.44, + "grad_norm": 0.8077000601469055, + "learning_rate": 3.087575987426462e-05, + "loss": 2.0959, + "step": 5726 + }, + { + "epoch": 0.44, + "grad_norm": 0.6151521071501472, + "learning_rate": 3.086968796890652e-05, + "loss": 1.9102, + "step": 5727 + }, + { + "epoch": 0.44, + "grad_norm": 0.6104241636886986, + "learning_rate": 3.0863615697079684e-05, + "loss": 1.9434, + "step": 5728 + }, + { + "epoch": 0.44, + "grad_norm": 0.730877139267078, + "learning_rate": 3.0857543059163244e-05, + "loss": 1.8747, + "step": 5729 + }, + { + "epoch": 0.44, + "grad_norm": 0.6452296799580808, + "learning_rate": 3.085147005553632e-05, + "loss": 2.1487, + "step": 5730 + }, + { + "epoch": 0.44, + "grad_norm": 0.6712202380413377, + "learning_rate": 3.084539668657808e-05, + "loss": 1.9207, + "step": 5731 + }, + { + "epoch": 0.44, + "grad_norm": 0.8546406976221929, + "learning_rate": 3.083932295266771e-05, + "loss": 1.9288, + "step": 5732 + }, + { + "epoch": 0.44, + "grad_norm": 0.842712235570716, + "learning_rate": 3.0833248854184426e-05, + "loss": 1.9128, + "step": 5733 + }, + { + "epoch": 0.44, + "grad_norm": 0.6239950412938232, + "learning_rate": 3.0827174391507455e-05, + "loss": 2.1123, + "step": 5734 + }, + { + "epoch": 0.44, + "grad_norm": 1.0118108361250793, + "learning_rate": 3.082109956501604e-05, + "loss": 1.9422, + "step": 5735 + }, + { + "epoch": 0.44, + "grad_norm": 0.6014669940825281, + "learning_rate": 3.081502437508946e-05, + "loss": 1.9319, + "step": 5736 + }, + { + "epoch": 0.44, + "grad_norm": 0.7780192406599721, + "learning_rate": 3.080894882210704e-05, + "loss": 2.0019, + "step": 5737 + }, + { + "epoch": 0.44, + "grad_norm": 0.8025051982257406, + "learning_rate": 3.0802872906448066e-05, + "loss": 2.0809, + "step": 5738 + }, + { + "epoch": 0.44, + "grad_norm": 0.6156759401258579, + "learning_rate": 3.07967966284919e-05, + "loss": 1.939, + "step": 5739 + }, + { + "epoch": 0.44, + "grad_norm": 0.6413774824302707, + "learning_rate": 3.079071998861791e-05, + "loss": 1.926, + "step": 5740 + }, + { + "epoch": 0.44, + "grad_norm": 0.7518592741742948, + "learning_rate": 3.078464298720548e-05, + "loss": 1.9426, + "step": 5741 + }, + { + "epoch": 0.44, + "grad_norm": 0.7987791983415258, + "learning_rate": 3.077856562463402e-05, + "loss": 2.1456, + "step": 5742 + }, + { + "epoch": 0.44, + "grad_norm": 0.737582029992249, + "learning_rate": 3.0772487901282977e-05, + "loss": 1.9901, + "step": 5743 + }, + { + "epoch": 0.44, + "grad_norm": 0.7857942931527094, + "learning_rate": 3.0766409817531795e-05, + "loss": 1.9798, + "step": 5744 + }, + { + "epoch": 0.44, + "grad_norm": 0.70463175540639, + "learning_rate": 3.076033137375996e-05, + "loss": 1.9092, + "step": 5745 + }, + { + "epoch": 0.44, + "grad_norm": 0.8344312086943486, + "learning_rate": 3.075425257034697e-05, + "loss": 2.0679, + "step": 5746 + }, + { + "epoch": 0.44, + "grad_norm": 0.760871066110913, + "learning_rate": 3.074817340767236e-05, + "loss": 1.9413, + "step": 5747 + }, + { + "epoch": 0.44, + "grad_norm": 0.6373982971954872, + "learning_rate": 3.074209388611567e-05, + "loss": 1.9049, + "step": 5748 + }, + { + "epoch": 0.44, + "grad_norm": 0.7423452997813833, + "learning_rate": 3.0736014006056465e-05, + "loss": 2.0043, + "step": 5749 + }, + { + "epoch": 0.44, + "grad_norm": 0.6956013611059232, + "learning_rate": 3.072993376787434e-05, + "loss": 2.1138, + "step": 5750 + }, + { + "epoch": 0.44, + "grad_norm": 0.608469913008348, + "learning_rate": 3.072385317194891e-05, + "loss": 1.8915, + "step": 5751 + }, + { + "epoch": 0.44, + "grad_norm": 0.756110784860381, + "learning_rate": 3.071777221865982e-05, + "loss": 1.8639, + "step": 5752 + }, + { + "epoch": 0.44, + "grad_norm": 0.6833615027991287, + "learning_rate": 3.071169090838672e-05, + "loss": 1.9444, + "step": 5753 + }, + { + "epoch": 0.44, + "grad_norm": 0.9189515067290148, + "learning_rate": 3.0705609241509284e-05, + "loss": 2.0931, + "step": 5754 + }, + { + "epoch": 0.44, + "grad_norm": 0.7567967621122482, + "learning_rate": 3.0699527218407236e-05, + "loss": 2.023, + "step": 5755 + }, + { + "epoch": 0.44, + "grad_norm": 0.7913196701736857, + "learning_rate": 3.0693444839460276e-05, + "loss": 1.9515, + "step": 5756 + }, + { + "epoch": 0.44, + "grad_norm": 0.7879972756291038, + "learning_rate": 3.0687362105048175e-05, + "loss": 1.9332, + "step": 5757 + }, + { + "epoch": 0.44, + "grad_norm": 0.8393335126977188, + "learning_rate": 3.068127901555069e-05, + "loss": 2.1426, + "step": 5758 + }, + { + "epoch": 0.44, + "grad_norm": 0.827387334529188, + "learning_rate": 3.067519557134762e-05, + "loss": 1.9112, + "step": 5759 + }, + { + "epoch": 0.44, + "grad_norm": 0.6661379443057521, + "learning_rate": 3.0669111772818784e-05, + "loss": 1.9109, + "step": 5760 + }, + { + "epoch": 0.44, + "grad_norm": 0.6901456791948904, + "learning_rate": 3.066302762034399e-05, + "loss": 2.0122, + "step": 5761 + }, + { + "epoch": 0.44, + "grad_norm": 0.7603237042872206, + "learning_rate": 3.065694311430313e-05, + "loss": 2.1706, + "step": 5762 + }, + { + "epoch": 0.44, + "grad_norm": 0.7077419352404956, + "learning_rate": 3.065085825507607e-05, + "loss": 1.8644, + "step": 5763 + }, + { + "epoch": 0.44, + "grad_norm": 0.6731703485827933, + "learning_rate": 3.064477304304272e-05, + "loss": 1.9301, + "step": 5764 + }, + { + "epoch": 0.44, + "grad_norm": 0.7114193835034094, + "learning_rate": 3.0638687478582986e-05, + "loss": 1.9856, + "step": 5765 + }, + { + "epoch": 0.44, + "grad_norm": 0.6114941728036395, + "learning_rate": 3.063260156207684e-05, + "loss": 2.1226, + "step": 5766 + }, + { + "epoch": 0.44, + "grad_norm": 0.6161022771549636, + "learning_rate": 3.062651529390424e-05, + "loss": 1.9145, + "step": 5767 + }, + { + "epoch": 0.44, + "grad_norm": 0.6684632530489678, + "learning_rate": 3.0620428674445165e-05, + "loss": 2.0433, + "step": 5768 + }, + { + "epoch": 0.45, + "grad_norm": 0.6118152255175296, + "learning_rate": 3.061434170407965e-05, + "loss": 1.9451, + "step": 5769 + }, + { + "epoch": 0.45, + "grad_norm": 0.6348987746710177, + "learning_rate": 3.0608254383187714e-05, + "loss": 2.0961, + "step": 5770 + }, + { + "epoch": 0.45, + "grad_norm": 0.594512903309019, + "learning_rate": 3.060216671214942e-05, + "loss": 1.9386, + "step": 5771 + }, + { + "epoch": 0.45, + "grad_norm": 0.7134842303316508, + "learning_rate": 3.059607869134484e-05, + "loss": 1.918, + "step": 5772 + }, + { + "epoch": 0.45, + "grad_norm": 0.6122237054063439, + "learning_rate": 3.0589990321154076e-05, + "loss": 1.9393, + "step": 5773 + }, + { + "epoch": 0.45, + "grad_norm": 0.6732249884679624, + "learning_rate": 3.058390160195725e-05, + "loss": 2.1934, + "step": 5774 + }, + { + "epoch": 0.45, + "grad_norm": 0.5879932233373174, + "learning_rate": 3.057781253413452e-05, + "loss": 1.9816, + "step": 5775 + }, + { + "epoch": 0.45, + "grad_norm": 0.6194843408405601, + "learning_rate": 3.057172311806603e-05, + "loss": 1.9238, + "step": 5776 + }, + { + "epoch": 0.45, + "grad_norm": 0.6374707411736114, + "learning_rate": 3.056563335413198e-05, + "loss": 1.8724, + "step": 5777 + }, + { + "epoch": 0.45, + "grad_norm": 0.6014176864905327, + "learning_rate": 3.055954324271258e-05, + "loss": 2.1753, + "step": 5778 + }, + { + "epoch": 0.45, + "grad_norm": 0.6591498976409459, + "learning_rate": 3.055345278418804e-05, + "loss": 1.9615, + "step": 5779 + }, + { + "epoch": 0.45, + "grad_norm": 0.6067070575975457, + "learning_rate": 3.054736197893865e-05, + "loss": 2.0082, + "step": 5780 + }, + { + "epoch": 0.45, + "grad_norm": 0.7425535479890805, + "learning_rate": 3.0541270827344645e-05, + "loss": 1.902, + "step": 5781 + }, + { + "epoch": 0.45, + "grad_norm": 0.6934969637879375, + "learning_rate": 3.0535179329786336e-05, + "loss": 2.1057, + "step": 5782 + }, + { + "epoch": 0.45, + "grad_norm": 0.6481807528759657, + "learning_rate": 3.052908748664406e-05, + "loss": 1.9545, + "step": 5783 + }, + { + "epoch": 0.45, + "grad_norm": 0.7054520982920129, + "learning_rate": 3.052299529829813e-05, + "loss": 1.9623, + "step": 5784 + }, + { + "epoch": 0.45, + "grad_norm": 0.7810095589884813, + "learning_rate": 3.051690276512891e-05, + "loss": 1.9144, + "step": 5785 + }, + { + "epoch": 0.45, + "grad_norm": 0.6483734637919202, + "learning_rate": 3.051080988751679e-05, + "loss": 2.1813, + "step": 5786 + }, + { + "epoch": 0.45, + "grad_norm": 0.6941977951243173, + "learning_rate": 3.0504716665842174e-05, + "loss": 1.8944, + "step": 5787 + }, + { + "epoch": 0.45, + "grad_norm": 0.7810229032536363, + "learning_rate": 3.049862310048548e-05, + "loss": 1.9613, + "step": 5788 + }, + { + "epoch": 0.45, + "grad_norm": 0.5888749814878694, + "learning_rate": 3.049252919182715e-05, + "loss": 1.8577, + "step": 5789 + }, + { + "epoch": 0.45, + "grad_norm": 0.9656947102507605, + "learning_rate": 3.048643494024766e-05, + "loss": 2.1447, + "step": 5790 + }, + { + "epoch": 0.45, + "grad_norm": 0.6301585562103391, + "learning_rate": 3.0480340346127505e-05, + "loss": 1.9057, + "step": 5791 + }, + { + "epoch": 0.45, + "grad_norm": 0.7956605755982032, + "learning_rate": 3.0474245409847185e-05, + "loss": 1.9545, + "step": 5792 + }, + { + "epoch": 0.45, + "grad_norm": 0.7283144757418191, + "learning_rate": 3.0468150131787238e-05, + "loss": 1.8993, + "step": 5793 + }, + { + "epoch": 0.45, + "grad_norm": 0.7746248303744437, + "learning_rate": 3.046205451232821e-05, + "loss": 2.0672, + "step": 5794 + }, + { + "epoch": 0.45, + "grad_norm": 0.6175403243772434, + "learning_rate": 3.0455958551850682e-05, + "loss": 1.9436, + "step": 5795 + }, + { + "epoch": 0.45, + "grad_norm": 0.5948147446023655, + "learning_rate": 3.044986225073525e-05, + "loss": 1.9149, + "step": 5796 + }, + { + "epoch": 0.45, + "grad_norm": 0.6912182738719852, + "learning_rate": 3.0443765609362528e-05, + "loss": 1.9246, + "step": 5797 + }, + { + "epoch": 0.45, + "grad_norm": 0.5967504650693743, + "learning_rate": 3.0437668628113153e-05, + "loss": 2.1317, + "step": 5798 + }, + { + "epoch": 0.45, + "grad_norm": 0.5895091483558293, + "learning_rate": 3.043157130736779e-05, + "loss": 1.976, + "step": 5799 + }, + { + "epoch": 0.45, + "grad_norm": 0.5451492981864545, + "learning_rate": 3.0425473647507115e-05, + "loss": 1.8784, + "step": 5800 + }, + { + "epoch": 0.45, + "grad_norm": 0.5733678371711695, + "learning_rate": 3.041937564891183e-05, + "loss": 1.9525, + "step": 5801 + }, + { + "epoch": 0.45, + "grad_norm": 0.6781993896953907, + "learning_rate": 3.0413277311962668e-05, + "loss": 2.0332, + "step": 5802 + }, + { + "epoch": 0.45, + "grad_norm": 0.6823059792144617, + "learning_rate": 3.040717863704035e-05, + "loss": 1.9345, + "step": 5803 + }, + { + "epoch": 0.45, + "grad_norm": 0.6834360640782295, + "learning_rate": 3.0401079624525668e-05, + "loss": 1.9425, + "step": 5804 + }, + { + "epoch": 0.45, + "grad_norm": 0.6150847125185505, + "learning_rate": 3.03949802747994e-05, + "loss": 1.9875, + "step": 5805 + }, + { + "epoch": 0.45, + "grad_norm": 0.6982209554060324, + "learning_rate": 3.0388880588242347e-05, + "loss": 2.1652, + "step": 5806 + }, + { + "epoch": 0.45, + "grad_norm": 0.6245965626235614, + "learning_rate": 3.038278056523533e-05, + "loss": 1.9285, + "step": 5807 + }, + { + "epoch": 0.45, + "grad_norm": 0.6570708627418822, + "learning_rate": 3.037668020615922e-05, + "loss": 1.917, + "step": 5808 + }, + { + "epoch": 0.45, + "grad_norm": 0.5709824748440416, + "learning_rate": 3.037057951139488e-05, + "loss": 1.8992, + "step": 5809 + }, + { + "epoch": 0.45, + "grad_norm": 0.5846108487519303, + "learning_rate": 3.036447848132319e-05, + "loss": 2.1099, + "step": 5810 + }, + { + "epoch": 0.45, + "grad_norm": 0.7109491030643428, + "learning_rate": 3.0358377116325075e-05, + "loss": 2.0347, + "step": 5811 + }, + { + "epoch": 0.45, + "grad_norm": 0.6270243396177597, + "learning_rate": 3.0352275416781463e-05, + "loss": 1.907, + "step": 5812 + }, + { + "epoch": 0.45, + "grad_norm": 0.6364134201729412, + "learning_rate": 3.0346173383073307e-05, + "loss": 1.9384, + "step": 5813 + }, + { + "epoch": 0.45, + "grad_norm": 0.6674900450779712, + "learning_rate": 3.0340071015581583e-05, + "loss": 2.1365, + "step": 5814 + }, + { + "epoch": 0.45, + "grad_norm": 0.5837690441920116, + "learning_rate": 3.0333968314687296e-05, + "loss": 1.9339, + "step": 5815 + }, + { + "epoch": 0.45, + "grad_norm": 0.6997362233780504, + "learning_rate": 3.0327865280771445e-05, + "loss": 1.9428, + "step": 5816 + }, + { + "epoch": 0.45, + "grad_norm": 0.6275056328771808, + "learning_rate": 3.032176191421508e-05, + "loss": 1.9665, + "step": 5817 + }, + { + "epoch": 0.45, + "grad_norm": 0.6967948643742953, + "learning_rate": 3.0315658215399256e-05, + "loss": 2.1072, + "step": 5818 + }, + { + "epoch": 0.45, + "grad_norm": 0.6505052680559663, + "learning_rate": 3.0309554184705058e-05, + "loss": 1.9309, + "step": 5819 + }, + { + "epoch": 0.45, + "grad_norm": 0.6099629371334845, + "learning_rate": 3.0303449822513575e-05, + "loss": 1.9285, + "step": 5820 + }, + { + "epoch": 0.45, + "grad_norm": 0.6247597694038475, + "learning_rate": 3.0297345129205933e-05, + "loss": 1.9295, + "step": 5821 + }, + { + "epoch": 0.45, + "grad_norm": 0.6442285496729565, + "learning_rate": 3.0291240105163272e-05, + "loss": 2.1128, + "step": 5822 + }, + { + "epoch": 0.45, + "grad_norm": 0.6061204908232027, + "learning_rate": 3.0285134750766754e-05, + "loss": 2.0129, + "step": 5823 + }, + { + "epoch": 0.45, + "grad_norm": 0.6482506134397642, + "learning_rate": 3.0279029066397563e-05, + "loss": 1.9383, + "step": 5824 + }, + { + "epoch": 0.45, + "grad_norm": 0.690503483661787, + "learning_rate": 3.02729230524369e-05, + "loss": 1.9615, + "step": 5825 + }, + { + "epoch": 0.45, + "grad_norm": 0.627034202065041, + "learning_rate": 3.026681670926599e-05, + "loss": 2.0785, + "step": 5826 + }, + { + "epoch": 0.45, + "grad_norm": 0.7078633677456196, + "learning_rate": 3.0260710037266082e-05, + "loss": 1.9748, + "step": 5827 + }, + { + "epoch": 0.45, + "grad_norm": 0.6251306460929781, + "learning_rate": 3.0254603036818423e-05, + "loss": 1.8976, + "step": 5828 + }, + { + "epoch": 0.45, + "grad_norm": 0.644219769805675, + "learning_rate": 3.0248495708304315e-05, + "loss": 2.007, + "step": 5829 + }, + { + "epoch": 0.45, + "grad_norm": 0.6517518456469886, + "learning_rate": 3.024238805210506e-05, + "loss": 2.1587, + "step": 5830 + }, + { + "epoch": 0.45, + "grad_norm": 0.5818419248020729, + "learning_rate": 3.0236280068601987e-05, + "loss": 1.9044, + "step": 5831 + }, + { + "epoch": 0.45, + "grad_norm": 0.7287355598340334, + "learning_rate": 3.0230171758176428e-05, + "loss": 1.9495, + "step": 5832 + }, + { + "epoch": 0.45, + "grad_norm": 0.6768832661481076, + "learning_rate": 3.022406312120976e-05, + "loss": 1.9114, + "step": 5833 + }, + { + "epoch": 0.45, + "grad_norm": 0.6207307031416446, + "learning_rate": 3.0217954158083384e-05, + "loss": 2.1368, + "step": 5834 + }, + { + "epoch": 0.45, + "grad_norm": 0.7258980759954399, + "learning_rate": 3.0211844869178686e-05, + "loss": 1.9388, + "step": 5835 + }, + { + "epoch": 0.45, + "grad_norm": 0.6388601802372186, + "learning_rate": 3.02057352548771e-05, + "loss": 1.9966, + "step": 5836 + }, + { + "epoch": 0.45, + "grad_norm": 0.6227692425556223, + "learning_rate": 3.0199625315560078e-05, + "loss": 1.8945, + "step": 5837 + }, + { + "epoch": 0.45, + "grad_norm": 0.5706924334742495, + "learning_rate": 3.0193515051609083e-05, + "loss": 2.1304, + "step": 5838 + }, + { + "epoch": 0.45, + "grad_norm": 0.6382479775592301, + "learning_rate": 3.018740446340561e-05, + "loss": 1.9298, + "step": 5839 + }, + { + "epoch": 0.45, + "grad_norm": 0.7066930751598656, + "learning_rate": 3.018129355133117e-05, + "loss": 1.9715, + "step": 5840 + }, + { + "epoch": 0.45, + "grad_norm": 0.5358642174619391, + "learning_rate": 3.0175182315767285e-05, + "loss": 1.9365, + "step": 5841 + }, + { + "epoch": 0.45, + "grad_norm": 0.6866363591956317, + "learning_rate": 3.0169070757095507e-05, + "loss": 2.0069, + "step": 5842 + }, + { + "epoch": 0.45, + "grad_norm": 0.600059190155287, + "learning_rate": 3.016295887569741e-05, + "loss": 2.0635, + "step": 5843 + }, + { + "epoch": 0.45, + "grad_norm": 0.568068985388958, + "learning_rate": 3.0156846671954576e-05, + "loss": 1.9238, + "step": 5844 + }, + { + "epoch": 0.45, + "grad_norm": 0.7167827825017467, + "learning_rate": 3.0150734146248626e-05, + "loss": 1.9442, + "step": 5845 + }, + { + "epoch": 0.45, + "grad_norm": 0.5374978304369556, + "learning_rate": 3.0144621298961173e-05, + "loss": 2.1267, + "step": 5846 + }, + { + "epoch": 0.45, + "grad_norm": 0.6757165958268606, + "learning_rate": 3.0138508130473887e-05, + "loss": 1.9142, + "step": 5847 + }, + { + "epoch": 0.45, + "grad_norm": 0.6948448174678183, + "learning_rate": 3.0132394641168427e-05, + "loss": 2.007, + "step": 5848 + }, + { + "epoch": 0.45, + "grad_norm": 0.6077437555357943, + "learning_rate": 3.012628083142649e-05, + "loss": 1.9272, + "step": 5849 + }, + { + "epoch": 0.45, + "grad_norm": 0.7672750313597165, + "learning_rate": 3.012016670162977e-05, + "loss": 2.1119, + "step": 5850 + }, + { + "epoch": 0.45, + "grad_norm": 0.5958027950611892, + "learning_rate": 3.0114052252160013e-05, + "loss": 1.8916, + "step": 5851 + }, + { + "epoch": 0.45, + "grad_norm": 0.6362691561284282, + "learning_rate": 3.0107937483398967e-05, + "loss": 1.9124, + "step": 5852 + }, + { + "epoch": 0.45, + "grad_norm": 0.6498122597783662, + "learning_rate": 3.0101822395728403e-05, + "loss": 1.921, + "step": 5853 + }, + { + "epoch": 0.45, + "grad_norm": 0.604066288166943, + "learning_rate": 3.0095706989530096e-05, + "loss": 2.1652, + "step": 5854 + }, + { + "epoch": 0.45, + "grad_norm": 0.6256895403644306, + "learning_rate": 3.008959126518588e-05, + "loss": 1.9921, + "step": 5855 + }, + { + "epoch": 0.45, + "grad_norm": 0.6248806026458444, + "learning_rate": 3.0083475223077574e-05, + "loss": 1.893, + "step": 5856 + }, + { + "epoch": 0.45, + "grad_norm": 0.6109976970539495, + "learning_rate": 3.007735886358702e-05, + "loss": 1.9482, + "step": 5857 + }, + { + "epoch": 0.45, + "grad_norm": 0.7434508351032425, + "learning_rate": 3.0071242187096093e-05, + "loss": 2.0882, + "step": 5858 + }, + { + "epoch": 0.45, + "grad_norm": 0.5989737490732091, + "learning_rate": 3.0065125193986682e-05, + "loss": 1.9545, + "step": 5859 + }, + { + "epoch": 0.45, + "grad_norm": 0.7457862906738794, + "learning_rate": 3.00590078846407e-05, + "loss": 1.9459, + "step": 5860 + }, + { + "epoch": 0.45, + "grad_norm": 0.6021411980242811, + "learning_rate": 3.0052890259440074e-05, + "loss": 2.0195, + "step": 5861 + }, + { + "epoch": 0.45, + "grad_norm": 0.7722521010901666, + "learning_rate": 3.0046772318766753e-05, + "loss": 1.9465, + "step": 5862 + }, + { + "epoch": 0.45, + "grad_norm": 0.6555835462201973, + "learning_rate": 3.00406540630027e-05, + "loss": 2.1223, + "step": 5863 + }, + { + "epoch": 0.45, + "grad_norm": 0.6467735934076191, + "learning_rate": 3.0034535492529904e-05, + "loss": 1.9069, + "step": 5864 + }, + { + "epoch": 0.45, + "grad_norm": 0.6481897514592124, + "learning_rate": 3.002841660773038e-05, + "loss": 1.8667, + "step": 5865 + }, + { + "epoch": 0.45, + "grad_norm": 0.6425457021568652, + "learning_rate": 3.0022297408986146e-05, + "loss": 2.1423, + "step": 5866 + }, + { + "epoch": 0.45, + "grad_norm": 0.6027530357585592, + "learning_rate": 3.0016177896679255e-05, + "loss": 2.0292, + "step": 5867 + }, + { + "epoch": 0.45, + "grad_norm": 0.6240375855683122, + "learning_rate": 3.0010058071191772e-05, + "loss": 1.9629, + "step": 5868 + }, + { + "epoch": 0.45, + "grad_norm": 0.5726931814811277, + "learning_rate": 3.0003937932905778e-05, + "loss": 1.8965, + "step": 5869 + }, + { + "epoch": 0.45, + "grad_norm": 0.6322633159460269, + "learning_rate": 2.9997817482203388e-05, + "loss": 2.1334, + "step": 5870 + }, + { + "epoch": 0.45, + "grad_norm": 0.5850224271326997, + "learning_rate": 2.999169671946671e-05, + "loss": 1.9914, + "step": 5871 + }, + { + "epoch": 0.45, + "grad_norm": 0.5361120137306, + "learning_rate": 2.9985575645077912e-05, + "loss": 1.8992, + "step": 5872 + }, + { + "epoch": 0.45, + "grad_norm": 0.5719195961820016, + "learning_rate": 2.9979454259419138e-05, + "loss": 2.0251, + "step": 5873 + }, + { + "epoch": 0.45, + "grad_norm": 0.5979140306424473, + "learning_rate": 2.9973332562872585e-05, + "loss": 1.8826, + "step": 5874 + }, + { + "epoch": 0.45, + "grad_norm": 0.6108812639731959, + "learning_rate": 2.9967210555820446e-05, + "loss": 2.0957, + "step": 5875 + }, + { + "epoch": 0.45, + "grad_norm": 0.5824445855687179, + "learning_rate": 2.9961088238644937e-05, + "loss": 1.9262, + "step": 5876 + }, + { + "epoch": 0.45, + "grad_norm": 0.5955769642508199, + "learning_rate": 2.9954965611728324e-05, + "loss": 1.9625, + "step": 5877 + }, + { + "epoch": 0.45, + "grad_norm": 0.6965183239099628, + "learning_rate": 2.9948842675452843e-05, + "loss": 2.154, + "step": 5878 + }, + { + "epoch": 0.45, + "grad_norm": 0.5728572728103319, + "learning_rate": 2.9942719430200782e-05, + "loss": 2.0247, + "step": 5879 + }, + { + "epoch": 0.45, + "grad_norm": 0.5873205140366959, + "learning_rate": 2.993659587635444e-05, + "loss": 1.8742, + "step": 5880 + }, + { + "epoch": 0.45, + "grad_norm": 0.5757376840800539, + "learning_rate": 2.9930472014296152e-05, + "loss": 1.915, + "step": 5881 + }, + { + "epoch": 0.45, + "grad_norm": 0.7817920063761639, + "learning_rate": 2.992434784440823e-05, + "loss": 2.1434, + "step": 5882 + }, + { + "epoch": 0.45, + "grad_norm": 0.6871254658795236, + "learning_rate": 2.9918223367073038e-05, + "loss": 1.9094, + "step": 5883 + }, + { + "epoch": 0.45, + "grad_norm": 0.6902483786040653, + "learning_rate": 2.9912098582672964e-05, + "loss": 1.941, + "step": 5884 + }, + { + "epoch": 0.45, + "grad_norm": 0.6770423390800582, + "learning_rate": 2.99059734915904e-05, + "loss": 1.9193, + "step": 5885 + }, + { + "epoch": 0.45, + "grad_norm": 0.654342515134921, + "learning_rate": 2.989984809420775e-05, + "loss": 2.0164, + "step": 5886 + }, + { + "epoch": 0.45, + "grad_norm": 0.6264815406638509, + "learning_rate": 2.989372239090745e-05, + "loss": 2.114, + "step": 5887 + }, + { + "epoch": 0.45, + "grad_norm": 0.7499529035550805, + "learning_rate": 2.9887596382071965e-05, + "loss": 1.9465, + "step": 5888 + }, + { + "epoch": 0.45, + "grad_norm": 0.6541832866045406, + "learning_rate": 2.9881470068083763e-05, + "loss": 1.907, + "step": 5889 + }, + { + "epoch": 0.45, + "grad_norm": 0.6916484585900159, + "learning_rate": 2.9875343449325323e-05, + "loss": 2.1104, + "step": 5890 + }, + { + "epoch": 0.45, + "grad_norm": 0.6748633955122447, + "learning_rate": 2.986921652617916e-05, + "loss": 1.9339, + "step": 5891 + }, + { + "epoch": 0.45, + "grad_norm": 0.6295381116869506, + "learning_rate": 2.986308929902782e-05, + "loss": 2.0131, + "step": 5892 + }, + { + "epoch": 0.45, + "grad_norm": 0.6349348436545615, + "learning_rate": 2.985696176825382e-05, + "loss": 1.9599, + "step": 5893 + }, + { + "epoch": 0.45, + "grad_norm": 0.6700301491722512, + "learning_rate": 2.9850833934239752e-05, + "loss": 1.9179, + "step": 5894 + }, + { + "epoch": 0.45, + "grad_norm": 0.5780490373822696, + "learning_rate": 2.9844705797368193e-05, + "loss": 2.0867, + "step": 5895 + }, + { + "epoch": 0.45, + "grad_norm": 0.5788800115791749, + "learning_rate": 2.9838577358021758e-05, + "loss": 1.9781, + "step": 5896 + }, + { + "epoch": 0.45, + "grad_norm": 0.5914250052963899, + "learning_rate": 2.9832448616583042e-05, + "loss": 1.9192, + "step": 5897 + }, + { + "epoch": 0.46, + "grad_norm": 0.5997608681927281, + "learning_rate": 2.982631957343472e-05, + "loss": 1.9879, + "step": 5898 + }, + { + "epoch": 0.46, + "grad_norm": 0.5849522910100994, + "learning_rate": 2.9820190228959445e-05, + "loss": 2.1126, + "step": 5899 + }, + { + "epoch": 0.46, + "grad_norm": 0.6182279116715895, + "learning_rate": 2.9814060583539883e-05, + "loss": 1.9309, + "step": 5900 + }, + { + "epoch": 0.46, + "grad_norm": 0.567416295909268, + "learning_rate": 2.9807930637558733e-05, + "loss": 1.9174, + "step": 5901 + }, + { + "epoch": 0.46, + "grad_norm": 0.6423126065949616, + "learning_rate": 2.980180039139874e-05, + "loss": 2.1392, + "step": 5902 + }, + { + "epoch": 0.46, + "grad_norm": 0.6030654245451491, + "learning_rate": 2.9795669845442614e-05, + "loss": 1.9263, + "step": 5903 + }, + { + "epoch": 0.46, + "grad_norm": 0.547532943309135, + "learning_rate": 2.978953900007312e-05, + "loss": 1.9269, + "step": 5904 + }, + { + "epoch": 0.46, + "grad_norm": 0.5745689368935785, + "learning_rate": 2.9783407855673025e-05, + "loss": 1.9176, + "step": 5905 + }, + { + "epoch": 0.46, + "grad_norm": 0.6950814474030836, + "learning_rate": 2.9777276412625132e-05, + "loss": 1.9407, + "step": 5906 + }, + { + "epoch": 0.46, + "grad_norm": 0.6198642487117558, + "learning_rate": 2.977114467131225e-05, + "loss": 2.1529, + "step": 5907 + }, + { + "epoch": 0.46, + "grad_norm": 0.5575607576923353, + "learning_rate": 2.9765012632117202e-05, + "loss": 1.9215, + "step": 5908 + }, + { + "epoch": 0.46, + "grad_norm": 0.6393011271732744, + "learning_rate": 2.9758880295422835e-05, + "loss": 1.9153, + "step": 5909 + }, + { + "epoch": 0.46, + "grad_norm": 0.5837295561767646, + "learning_rate": 2.975274766161203e-05, + "loss": 1.9391, + "step": 5910 + }, + { + "epoch": 0.46, + "grad_norm": 0.7341705152820133, + "learning_rate": 2.9746614731067658e-05, + "loss": 2.1301, + "step": 5911 + }, + { + "epoch": 0.46, + "grad_norm": 0.5781172239487322, + "learning_rate": 2.9740481504172634e-05, + "loss": 1.9259, + "step": 5912 + }, + { + "epoch": 0.46, + "grad_norm": 0.6829223166055113, + "learning_rate": 2.9734347981309868e-05, + "loss": 1.934, + "step": 5913 + }, + { + "epoch": 0.46, + "grad_norm": 0.6853101534161736, + "learning_rate": 2.9728214162862317e-05, + "loss": 2.1407, + "step": 5914 + }, + { + "epoch": 0.46, + "grad_norm": 0.6815798800857179, + "learning_rate": 2.9722080049212926e-05, + "loss": 1.9295, + "step": 5915 + }, + { + "epoch": 0.46, + "grad_norm": 0.6201937169440519, + "learning_rate": 2.971594564074468e-05, + "loss": 1.9333, + "step": 5916 + }, + { + "epoch": 0.46, + "grad_norm": 0.6875413870756768, + "learning_rate": 2.9709810937840577e-05, + "loss": 2.0487, + "step": 5917 + }, + { + "epoch": 0.46, + "grad_norm": 0.6372626365975891, + "learning_rate": 2.9703675940883625e-05, + "loss": 1.9225, + "step": 5918 + }, + { + "epoch": 0.46, + "grad_norm": 0.6004064747032164, + "learning_rate": 2.9697540650256854e-05, + "loss": 2.0802, + "step": 5919 + }, + { + "epoch": 0.46, + "grad_norm": 0.5515527150902807, + "learning_rate": 2.9691405066343336e-05, + "loss": 1.8321, + "step": 5920 + }, + { + "epoch": 0.46, + "grad_norm": 0.6598469618246547, + "learning_rate": 2.9685269189526123e-05, + "loss": 1.9312, + "step": 5921 + }, + { + "epoch": 0.46, + "grad_norm": 0.6479674919511998, + "learning_rate": 2.9679133020188297e-05, + "loss": 2.1647, + "step": 5922 + }, + { + "epoch": 0.46, + "grad_norm": 0.5835508491915485, + "learning_rate": 2.9672996558712986e-05, + "loss": 1.9966, + "step": 5923 + }, + { + "epoch": 0.46, + "grad_norm": 0.6440159767565734, + "learning_rate": 2.9666859805483305e-05, + "loss": 1.9207, + "step": 5924 + }, + { + "epoch": 0.46, + "grad_norm": 0.6500028754809394, + "learning_rate": 2.9660722760882387e-05, + "loss": 1.9209, + "step": 5925 + }, + { + "epoch": 0.46, + "grad_norm": 0.5388042773105715, + "learning_rate": 2.96545854252934e-05, + "loss": 1.9555, + "step": 5926 + }, + { + "epoch": 0.46, + "grad_norm": 0.5819980533110257, + "learning_rate": 2.964844779909952e-05, + "loss": 2.1586, + "step": 5927 + }, + { + "epoch": 0.46, + "grad_norm": 0.5728317851910332, + "learning_rate": 2.9642309882683955e-05, + "loss": 1.9181, + "step": 5928 + }, + { + "epoch": 0.46, + "grad_norm": 0.5808699988925805, + "learning_rate": 2.9636171676429914e-05, + "loss": 2.0103, + "step": 5929 + }, + { + "epoch": 0.46, + "grad_norm": 0.5857061366846728, + "learning_rate": 2.9630033180720625e-05, + "loss": 1.922, + "step": 5930 + }, + { + "epoch": 0.46, + "grad_norm": 0.5757872076384501, + "learning_rate": 2.9623894395939344e-05, + "loss": 2.1355, + "step": 5931 + }, + { + "epoch": 0.46, + "grad_norm": 0.6384088026438381, + "learning_rate": 2.9617755322469336e-05, + "loss": 1.8799, + "step": 5932 + }, + { + "epoch": 0.46, + "grad_norm": 0.5739669432667489, + "learning_rate": 2.96116159606939e-05, + "loss": 1.9056, + "step": 5933 + }, + { + "epoch": 0.46, + "grad_norm": 0.7187417204353276, + "learning_rate": 2.9605476310996333e-05, + "loss": 2.1319, + "step": 5934 + }, + { + "epoch": 0.46, + "grad_norm": 0.5733529554445508, + "learning_rate": 2.9599336373759957e-05, + "loss": 1.9944, + "step": 5935 + }, + { + "epoch": 0.46, + "grad_norm": 0.6047752585470587, + "learning_rate": 2.959319614936812e-05, + "loss": 1.9309, + "step": 5936 + }, + { + "epoch": 0.46, + "grad_norm": 0.62405650142704, + "learning_rate": 2.958705563820418e-05, + "loss": 1.8992, + "step": 5937 + }, + { + "epoch": 0.46, + "grad_norm": 0.6292321132456158, + "learning_rate": 2.9580914840651508e-05, + "loss": 1.9078, + "step": 5938 + }, + { + "epoch": 0.46, + "grad_norm": 0.6208311396776585, + "learning_rate": 2.9574773757093505e-05, + "loss": 2.06, + "step": 5939 + }, + { + "epoch": 0.46, + "grad_norm": 0.5850421044653606, + "learning_rate": 2.9568632387913586e-05, + "loss": 1.9316, + "step": 5940 + }, + { + "epoch": 0.46, + "grad_norm": 0.6455775185099398, + "learning_rate": 2.9562490733495173e-05, + "loss": 2.0399, + "step": 5941 + }, + { + "epoch": 0.46, + "grad_norm": 0.5914731793845333, + "learning_rate": 2.955634879422173e-05, + "loss": 1.9121, + "step": 5942 + }, + { + "epoch": 0.46, + "grad_norm": 0.6141170356408592, + "learning_rate": 2.95502065704767e-05, + "loss": 2.1214, + "step": 5943 + }, + { + "epoch": 0.46, + "grad_norm": 0.6032996933972882, + "learning_rate": 2.954406406264359e-05, + "loss": 1.9002, + "step": 5944 + }, + { + "epoch": 0.46, + "grad_norm": 0.6409087770806413, + "learning_rate": 2.953792127110589e-05, + "loss": 1.9393, + "step": 5945 + }, + { + "epoch": 0.46, + "grad_norm": 0.6407734269741273, + "learning_rate": 2.953177819624713e-05, + "loss": 1.8988, + "step": 5946 + }, + { + "epoch": 0.46, + "grad_norm": 0.6535392139358569, + "learning_rate": 2.9525634838450834e-05, + "loss": 2.0778, + "step": 5947 + }, + { + "epoch": 0.46, + "grad_norm": 0.6638810545480212, + "learning_rate": 2.951949119810056e-05, + "loss": 2.0145, + "step": 5948 + }, + { + "epoch": 0.46, + "grad_norm": 0.6015073651731506, + "learning_rate": 2.951334727557989e-05, + "loss": 1.9383, + "step": 5949 + }, + { + "epoch": 0.46, + "grad_norm": 0.6427577626067742, + "learning_rate": 2.9507203071272404e-05, + "loss": 1.9349, + "step": 5950 + }, + { + "epoch": 0.46, + "grad_norm": 0.7472902852878983, + "learning_rate": 2.9501058585561713e-05, + "loss": 2.143, + "step": 5951 + }, + { + "epoch": 0.46, + "grad_norm": 0.6198206890509291, + "learning_rate": 2.9494913818831443e-05, + "loss": 1.9639, + "step": 5952 + }, + { + "epoch": 0.46, + "grad_norm": 0.5979936800180621, + "learning_rate": 2.9488768771465246e-05, + "loss": 1.9325, + "step": 5953 + }, + { + "epoch": 0.46, + "grad_norm": 0.5777256783062885, + "learning_rate": 2.948262344384677e-05, + "loss": 2.0319, + "step": 5954 + }, + { + "epoch": 0.46, + "grad_norm": 0.7053294183986611, + "learning_rate": 2.9476477836359694e-05, + "loss": 2.1467, + "step": 5955 + }, + { + "epoch": 0.46, + "grad_norm": 0.563736640740669, + "learning_rate": 2.947033194938772e-05, + "loss": 1.9303, + "step": 5956 + }, + { + "epoch": 0.46, + "grad_norm": 0.755739956087039, + "learning_rate": 2.946418578331456e-05, + "loss": 1.9339, + "step": 5957 + }, + { + "epoch": 0.46, + "grad_norm": 0.5795689857617631, + "learning_rate": 2.9458039338523936e-05, + "loss": 1.8778, + "step": 5958 + }, + { + "epoch": 0.46, + "grad_norm": 0.6715298216653058, + "learning_rate": 2.9451892615399607e-05, + "loss": 2.1671, + "step": 5959 + }, + { + "epoch": 0.46, + "grad_norm": 0.6365313182095643, + "learning_rate": 2.9445745614325337e-05, + "loss": 1.9715, + "step": 5960 + }, + { + "epoch": 0.46, + "grad_norm": 0.7217220329131973, + "learning_rate": 2.9439598335684893e-05, + "loss": 1.918, + "step": 5961 + }, + { + "epoch": 0.46, + "grad_norm": 0.6536711268682291, + "learning_rate": 2.943345077986209e-05, + "loss": 1.9244, + "step": 5962 + }, + { + "epoch": 0.46, + "grad_norm": 0.5971157805791927, + "learning_rate": 2.9427302947240747e-05, + "loss": 2.1523, + "step": 5963 + }, + { + "epoch": 0.46, + "grad_norm": 0.6721388121043124, + "learning_rate": 2.9421154838204702e-05, + "loss": 1.8868, + "step": 5964 + }, + { + "epoch": 0.46, + "grad_norm": 0.5901097327456989, + "learning_rate": 2.9415006453137777e-05, + "loss": 1.9206, + "step": 5965 + }, + { + "epoch": 0.46, + "grad_norm": 0.6013916989419127, + "learning_rate": 2.940885779242387e-05, + "loss": 1.9623, + "step": 5966 + }, + { + "epoch": 0.46, + "grad_norm": 0.6500430553437768, + "learning_rate": 2.940270885644687e-05, + "loss": 2.109, + "step": 5967 + }, + { + "epoch": 0.46, + "grad_norm": 0.5305325770273547, + "learning_rate": 2.939655964559066e-05, + "loss": 1.9062, + "step": 5968 + }, + { + "epoch": 0.46, + "grad_norm": 0.5686425637501226, + "learning_rate": 2.9390410160239162e-05, + "loss": 1.9069, + "step": 5969 + }, + { + "epoch": 0.46, + "grad_norm": 0.6181288397718674, + "learning_rate": 2.9384260400776332e-05, + "loss": 1.9082, + "step": 5970 + }, + { + "epoch": 0.46, + "grad_norm": 0.6010878185833171, + "learning_rate": 2.9378110367586115e-05, + "loss": 2.0718, + "step": 5971 + }, + { + "epoch": 0.46, + "grad_norm": 0.6404898518074024, + "learning_rate": 2.9371960061052478e-05, + "loss": 2.0232, + "step": 5972 + }, + { + "epoch": 0.46, + "grad_norm": 0.6122741082486938, + "learning_rate": 2.936580948155941e-05, + "loss": 1.9552, + "step": 5973 + }, + { + "epoch": 0.46, + "grad_norm": 0.7802081123553569, + "learning_rate": 2.9359658629490927e-05, + "loss": 1.913, + "step": 5974 + }, + { + "epoch": 0.46, + "grad_norm": 0.6671633665411347, + "learning_rate": 2.935350750523105e-05, + "loss": 2.187, + "step": 5975 + }, + { + "epoch": 0.46, + "grad_norm": 0.6844170526407003, + "learning_rate": 2.9347356109163803e-05, + "loss": 1.9088, + "step": 5976 + }, + { + "epoch": 0.46, + "grad_norm": 0.7637585308538776, + "learning_rate": 2.9341204441673266e-05, + "loss": 1.9218, + "step": 5977 + }, + { + "epoch": 0.46, + "grad_norm": 0.6329181075836443, + "learning_rate": 2.9335052503143488e-05, + "loss": 1.9727, + "step": 5978 + }, + { + "epoch": 0.46, + "grad_norm": 0.6628045370020772, + "learning_rate": 2.9328900293958583e-05, + "loss": 2.134, + "step": 5979 + }, + { + "epoch": 0.46, + "grad_norm": 0.7207841108382089, + "learning_rate": 2.932274781450265e-05, + "loss": 1.9223, + "step": 5980 + }, + { + "epoch": 0.46, + "grad_norm": 0.6168220054113233, + "learning_rate": 2.9316595065159815e-05, + "loss": 1.9768, + "step": 5981 + }, + { + "epoch": 0.46, + "grad_norm": 0.7358260198982544, + "learning_rate": 2.9310442046314208e-05, + "loss": 1.9494, + "step": 5982 + }, + { + "epoch": 0.46, + "grad_norm": 0.717560500509033, + "learning_rate": 2.930428875835e-05, + "loss": 2.1116, + "step": 5983 + }, + { + "epoch": 0.46, + "grad_norm": 0.7831643477036163, + "learning_rate": 2.929813520165136e-05, + "loss": 1.9086, + "step": 5984 + }, + { + "epoch": 0.46, + "grad_norm": 0.5854716702326174, + "learning_rate": 2.929198137660249e-05, + "loss": 1.9832, + "step": 5985 + }, + { + "epoch": 0.46, + "grad_norm": 0.7115282403627274, + "learning_rate": 2.9285827283587587e-05, + "loss": 1.8967, + "step": 5986 + }, + { + "epoch": 0.46, + "grad_norm": 0.7733198993500927, + "learning_rate": 2.9279672922990874e-05, + "loss": 2.1253, + "step": 5987 + }, + { + "epoch": 0.46, + "grad_norm": 0.6566746307875865, + "learning_rate": 2.9273518295196606e-05, + "loss": 1.9572, + "step": 5988 + }, + { + "epoch": 0.46, + "grad_norm": 0.7421258753400743, + "learning_rate": 2.9267363400589036e-05, + "loss": 1.8925, + "step": 5989 + }, + { + "epoch": 0.46, + "grad_norm": 0.6622406955733426, + "learning_rate": 2.926120823955244e-05, + "loss": 1.9287, + "step": 5990 + }, + { + "epoch": 0.46, + "grad_norm": 0.7180158241893673, + "learning_rate": 2.9255052812471095e-05, + "loss": 2.1562, + "step": 5991 + }, + { + "epoch": 0.46, + "grad_norm": 0.7136305695771915, + "learning_rate": 2.9248897119729336e-05, + "loss": 1.9286, + "step": 5992 + }, + { + "epoch": 0.46, + "grad_norm": 0.6507400524003629, + "learning_rate": 2.9242741161711467e-05, + "loss": 1.9081, + "step": 5993 + }, + { + "epoch": 0.46, + "grad_norm": 0.6761297807425809, + "learning_rate": 2.9236584938801842e-05, + "loss": 1.9281, + "step": 5994 + }, + { + "epoch": 0.46, + "grad_norm": 0.7256114366956248, + "learning_rate": 2.9230428451384807e-05, + "loss": 2.1501, + "step": 5995 + }, + { + "epoch": 0.46, + "grad_norm": 0.8544743399555044, + "learning_rate": 2.922427169984476e-05, + "loss": 1.9347, + "step": 5996 + }, + { + "epoch": 0.46, + "grad_norm": 0.796932107963117, + "learning_rate": 2.9218114684566067e-05, + "loss": 1.9943, + "step": 5997 + }, + { + "epoch": 0.46, + "grad_norm": 0.8625829152119489, + "learning_rate": 2.9211957405933144e-05, + "loss": 1.8956, + "step": 5998 + }, + { + "epoch": 0.46, + "grad_norm": 0.8907416044556657, + "learning_rate": 2.9205799864330425e-05, + "loss": 2.1307, + "step": 5999 + }, + { + "epoch": 0.46, + "grad_norm": 0.8481354418486149, + "learning_rate": 2.9199642060142333e-05, + "loss": 1.9001, + "step": 6000 + }, + { + "epoch": 0.46, + "grad_norm": 0.8110458884367083, + "learning_rate": 2.919348399375334e-05, + "loss": 1.9431, + "step": 6001 + }, + { + "epoch": 0.46, + "grad_norm": 0.6983623568008385, + "learning_rate": 2.9187325665547915e-05, + "loss": 1.9364, + "step": 6002 + }, + { + "epoch": 0.46, + "grad_norm": 0.885705006631116, + "learning_rate": 2.918116707591054e-05, + "loss": 2.1654, + "step": 6003 + }, + { + "epoch": 0.46, + "grad_norm": 0.6231988290505693, + "learning_rate": 2.9175008225225735e-05, + "loss": 1.9289, + "step": 6004 + }, + { + "epoch": 0.46, + "grad_norm": 0.9352950724195798, + "learning_rate": 2.9168849113878015e-05, + "loss": 1.9102, + "step": 6005 + }, + { + "epoch": 0.46, + "grad_norm": 0.6339089231534399, + "learning_rate": 2.9162689742251913e-05, + "loss": 1.8939, + "step": 6006 + }, + { + "epoch": 0.46, + "grad_norm": 0.9594279193490054, + "learning_rate": 2.9156530110732e-05, + "loss": 2.1353, + "step": 6007 + }, + { + "epoch": 0.46, + "grad_norm": 0.6788463141369327, + "learning_rate": 2.9150370219702826e-05, + "loss": 1.9246, + "step": 6008 + }, + { + "epoch": 0.46, + "grad_norm": 0.8518331399669477, + "learning_rate": 2.9144210069548993e-05, + "loss": 1.9682, + "step": 6009 + }, + { + "epoch": 0.46, + "grad_norm": 0.7246511535194453, + "learning_rate": 2.91380496606551e-05, + "loss": 1.9815, + "step": 6010 + }, + { + "epoch": 0.46, + "grad_norm": 0.9307261416658241, + "learning_rate": 2.9131888993405772e-05, + "loss": 2.1287, + "step": 6011 + }, + { + "epoch": 0.46, + "grad_norm": 0.6475655844128312, + "learning_rate": 2.9125728068185632e-05, + "loss": 1.8807, + "step": 6012 + }, + { + "epoch": 0.46, + "grad_norm": 0.7629216445526122, + "learning_rate": 2.911956688537934e-05, + "loss": 1.9203, + "step": 6013 + }, + { + "epoch": 0.46, + "grad_norm": 0.5200944476500788, + "learning_rate": 2.9113405445371572e-05, + "loss": 1.879, + "step": 6014 + }, + { + "epoch": 0.46, + "grad_norm": 0.7816722641709961, + "learning_rate": 2.9107243748547002e-05, + "loss": 2.1537, + "step": 6015 + }, + { + "epoch": 0.46, + "grad_norm": 0.5489949333037935, + "learning_rate": 2.910108179529032e-05, + "loss": 1.9887, + "step": 6016 + }, + { + "epoch": 0.46, + "grad_norm": 0.741292539163876, + "learning_rate": 2.9094919585986263e-05, + "loss": 1.9236, + "step": 6017 + }, + { + "epoch": 0.46, + "grad_norm": 0.5803594920771034, + "learning_rate": 2.9088757121019562e-05, + "loss": 1.9517, + "step": 6018 + }, + { + "epoch": 0.46, + "grad_norm": 0.7821026421642108, + "learning_rate": 2.908259440077495e-05, + "loss": 2.1103, + "step": 6019 + }, + { + "epoch": 0.46, + "grad_norm": 0.7419957883203638, + "learning_rate": 2.9076431425637186e-05, + "loss": 1.921, + "step": 6020 + }, + { + "epoch": 0.46, + "grad_norm": 0.7552171151522863, + "learning_rate": 2.907026819599108e-05, + "loss": 1.9162, + "step": 6021 + }, + { + "epoch": 0.46, + "grad_norm": 0.6109273457977054, + "learning_rate": 2.9064104712221402e-05, + "loss": 1.9777, + "step": 6022 + }, + { + "epoch": 0.46, + "grad_norm": 0.8284574135245422, + "learning_rate": 2.9057940974712967e-05, + "loss": 2.123, + "step": 6023 + }, + { + "epoch": 0.46, + "grad_norm": 0.5625931250820133, + "learning_rate": 2.9051776983850608e-05, + "loss": 1.9472, + "step": 6024 + }, + { + "epoch": 0.46, + "grad_norm": 0.7130654200441631, + "learning_rate": 2.9045612740019172e-05, + "loss": 1.9037, + "step": 6025 + }, + { + "epoch": 0.46, + "grad_norm": 0.6123249707788289, + "learning_rate": 2.9039448243603508e-05, + "loss": 1.9549, + "step": 6026 + }, + { + "epoch": 0.46, + "grad_norm": 0.6920475267902129, + "learning_rate": 2.9033283494988496e-05, + "loss": 2.126, + "step": 6027 + }, + { + "epoch": 0.47, + "grad_norm": 0.6273737386071347, + "learning_rate": 2.902711849455903e-05, + "loss": 2.0022, + "step": 6028 + }, + { + "epoch": 0.47, + "grad_norm": 0.7092443818611979, + "learning_rate": 2.9020953242700012e-05, + "loss": 1.9157, + "step": 6029 + }, + { + "epoch": 0.47, + "grad_norm": 0.6526950394158906, + "learning_rate": 2.9014787739796357e-05, + "loss": 1.8765, + "step": 6030 + }, + { + "epoch": 0.47, + "grad_norm": 0.6886205787344196, + "learning_rate": 2.900862198623302e-05, + "loss": 2.0765, + "step": 6031 + }, + { + "epoch": 0.47, + "grad_norm": 0.608315282366492, + "learning_rate": 2.9002455982394944e-05, + "loss": 1.9495, + "step": 6032 + }, + { + "epoch": 0.47, + "grad_norm": 0.6204321830864946, + "learning_rate": 2.899628972866709e-05, + "loss": 1.9036, + "step": 6033 + }, + { + "epoch": 0.47, + "grad_norm": 0.7325716062979188, + "learning_rate": 2.8990123225434456e-05, + "loss": 1.9479, + "step": 6034 + }, + { + "epoch": 0.47, + "grad_norm": 0.7898018552032495, + "learning_rate": 2.8983956473082035e-05, + "loss": 2.1685, + "step": 6035 + }, + { + "epoch": 0.47, + "grad_norm": 0.6740754423569145, + "learning_rate": 2.8977789471994855e-05, + "loss": 1.9594, + "step": 6036 + }, + { + "epoch": 0.47, + "grad_norm": 0.5931584178554936, + "learning_rate": 2.897162222255792e-05, + "loss": 1.8588, + "step": 6037 + }, + { + "epoch": 0.47, + "grad_norm": 0.6105781461985351, + "learning_rate": 2.8965454725156304e-05, + "loss": 1.964, + "step": 6038 + }, + { + "epoch": 0.47, + "grad_norm": 0.7733784573568997, + "learning_rate": 2.8959286980175065e-05, + "loss": 2.1584, + "step": 6039 + }, + { + "epoch": 0.47, + "grad_norm": 0.6629231910999642, + "learning_rate": 2.895311898799927e-05, + "loss": 1.8734, + "step": 6040 + }, + { + "epoch": 0.47, + "grad_norm": 0.6336630662818065, + "learning_rate": 2.8946950749014012e-05, + "loss": 1.9989, + "step": 6041 + }, + { + "epoch": 0.47, + "grad_norm": 0.6976598515104996, + "learning_rate": 2.89407822636044e-05, + "loss": 1.9579, + "step": 6042 + }, + { + "epoch": 0.47, + "grad_norm": 0.6588442803929545, + "learning_rate": 2.8934613532155576e-05, + "loss": 2.1298, + "step": 6043 + }, + { + "epoch": 0.47, + "grad_norm": 0.5984901173273851, + "learning_rate": 2.892844455505266e-05, + "loss": 1.9541, + "step": 6044 + }, + { + "epoch": 0.47, + "grad_norm": 0.6124076658784989, + "learning_rate": 2.8922275332680804e-05, + "loss": 1.88, + "step": 6045 + }, + { + "epoch": 0.47, + "grad_norm": 0.693190037613699, + "learning_rate": 2.8916105865425193e-05, + "loss": 1.9359, + "step": 6046 + }, + { + "epoch": 0.47, + "grad_norm": 0.6854303850161515, + "learning_rate": 2.890993615367101e-05, + "loss": 2.1658, + "step": 6047 + }, + { + "epoch": 0.47, + "grad_norm": 0.620317473937023, + "learning_rate": 2.890376619780344e-05, + "loss": 1.9665, + "step": 6048 + }, + { + "epoch": 0.47, + "grad_norm": 0.6017633292389617, + "learning_rate": 2.8897595998207717e-05, + "loss": 1.9135, + "step": 6049 + }, + { + "epoch": 0.47, + "grad_norm": 0.6695049851816347, + "learning_rate": 2.8891425555269058e-05, + "loss": 1.9482, + "step": 6050 + }, + { + "epoch": 0.47, + "grad_norm": 0.6579856773407556, + "learning_rate": 2.888525486937272e-05, + "loss": 2.1544, + "step": 6051 + }, + { + "epoch": 0.47, + "grad_norm": 0.6943533170253708, + "learning_rate": 2.8879083940903957e-05, + "loss": 1.9584, + "step": 6052 + }, + { + "epoch": 0.47, + "grad_norm": 0.7728571549998186, + "learning_rate": 2.8872912770248044e-05, + "loss": 1.9971, + "step": 6053 + }, + { + "epoch": 0.47, + "grad_norm": 0.790100486381364, + "learning_rate": 2.8866741357790283e-05, + "loss": 1.921, + "step": 6054 + }, + { + "epoch": 0.47, + "grad_norm": 0.8250976546382595, + "learning_rate": 2.8860569703915975e-05, + "loss": 2.1303, + "step": 6055 + }, + { + "epoch": 0.47, + "grad_norm": 0.5617133295986723, + "learning_rate": 2.8854397809010436e-05, + "loss": 1.8921, + "step": 6056 + }, + { + "epoch": 0.47, + "grad_norm": 0.7769489026042259, + "learning_rate": 2.884822567345901e-05, + "loss": 1.9517, + "step": 6057 + }, + { + "epoch": 0.47, + "grad_norm": 0.6641212926741072, + "learning_rate": 2.8842053297647047e-05, + "loss": 1.9326, + "step": 6058 + }, + { + "epoch": 0.47, + "grad_norm": 0.6963086897952515, + "learning_rate": 2.883588068195991e-05, + "loss": 1.9416, + "step": 6059 + }, + { + "epoch": 0.47, + "grad_norm": 0.7279431345902708, + "learning_rate": 2.882970782678298e-05, + "loss": 2.107, + "step": 6060 + }, + { + "epoch": 0.47, + "grad_norm": 0.6968588048479214, + "learning_rate": 2.882353473250167e-05, + "loss": 1.9444, + "step": 6061 + }, + { + "epoch": 0.47, + "grad_norm": 0.7109336254401915, + "learning_rate": 2.8817361399501375e-05, + "loss": 1.9478, + "step": 6062 + }, + { + "epoch": 0.47, + "grad_norm": 0.7157506190781077, + "learning_rate": 2.881118782816752e-05, + "loss": 2.1161, + "step": 6063 + }, + { + "epoch": 0.47, + "grad_norm": 0.6585982358935669, + "learning_rate": 2.8805014018885556e-05, + "loss": 1.9232, + "step": 6064 + }, + { + "epoch": 0.47, + "grad_norm": 0.6466991773353137, + "learning_rate": 2.8798839972040935e-05, + "loss": 1.8945, + "step": 6065 + }, + { + "epoch": 0.47, + "grad_norm": 0.6788620826862066, + "learning_rate": 2.8792665688019127e-05, + "loss": 1.9795, + "step": 6066 + }, + { + "epoch": 0.47, + "grad_norm": 0.7929808849009231, + "learning_rate": 2.8786491167205613e-05, + "loss": 2.1469, + "step": 6067 + }, + { + "epoch": 0.47, + "grad_norm": 0.765492373526748, + "learning_rate": 2.8780316409985915e-05, + "loss": 1.8901, + "step": 6068 + }, + { + "epoch": 0.47, + "grad_norm": 0.6430955345410521, + "learning_rate": 2.8774141416745524e-05, + "loss": 1.8766, + "step": 6069 + }, + { + "epoch": 0.47, + "grad_norm": 0.7970949401462726, + "learning_rate": 2.8767966187869983e-05, + "loss": 1.9247, + "step": 6070 + }, + { + "epoch": 0.47, + "grad_norm": 0.7224962921434076, + "learning_rate": 2.8761790723744834e-05, + "loss": 2.1156, + "step": 6071 + }, + { + "epoch": 0.47, + "grad_norm": 0.7898177335206944, + "learning_rate": 2.8755615024755633e-05, + "loss": 1.9961, + "step": 6072 + }, + { + "epoch": 0.47, + "grad_norm": 0.8446886317861725, + "learning_rate": 2.8749439091287962e-05, + "loss": 1.9131, + "step": 6073 + }, + { + "epoch": 0.47, + "grad_norm": 0.7758007328045595, + "learning_rate": 2.87432629237274e-05, + "loss": 1.9052, + "step": 6074 + }, + { + "epoch": 0.47, + "grad_norm": 0.951844290078726, + "learning_rate": 2.8737086522459563e-05, + "loss": 2.1448, + "step": 6075 + }, + { + "epoch": 0.47, + "grad_norm": 0.5842599623911655, + "learning_rate": 2.8730909887870057e-05, + "loss": 1.9293, + "step": 6076 + }, + { + "epoch": 0.47, + "grad_norm": 0.9858072108986572, + "learning_rate": 2.872473302034452e-05, + "loss": 1.9186, + "step": 6077 + }, + { + "epoch": 0.47, + "grad_norm": 0.6881601169750788, + "learning_rate": 2.87185559202686e-05, + "loss": 2.0033, + "step": 6078 + }, + { + "epoch": 0.47, + "grad_norm": 0.7856721072346264, + "learning_rate": 2.871237858802796e-05, + "loss": 1.9302, + "step": 6079 + }, + { + "epoch": 0.47, + "grad_norm": 0.7997111429447697, + "learning_rate": 2.8706201024008267e-05, + "loss": 2.1093, + "step": 6080 + }, + { + "epoch": 0.47, + "grad_norm": 0.626295539419116, + "learning_rate": 2.8700023228595225e-05, + "loss": 1.8793, + "step": 6081 + }, + { + "epoch": 0.47, + "grad_norm": 0.7778128002423056, + "learning_rate": 2.869384520217453e-05, + "loss": 1.9124, + "step": 6082 + }, + { + "epoch": 0.47, + "grad_norm": 0.7314199482865292, + "learning_rate": 2.868766694513191e-05, + "loss": 2.0705, + "step": 6083 + }, + { + "epoch": 0.47, + "grad_norm": 0.5738717192730649, + "learning_rate": 2.868148845785308e-05, + "loss": 2.0203, + "step": 6084 + }, + { + "epoch": 0.47, + "grad_norm": 0.792030696512876, + "learning_rate": 2.867530974072381e-05, + "loss": 1.9414, + "step": 6085 + }, + { + "epoch": 0.47, + "grad_norm": 0.7172438226032153, + "learning_rate": 2.8669130794129863e-05, + "loss": 1.9345, + "step": 6086 + }, + { + "epoch": 0.47, + "grad_norm": 0.8222602144114246, + "learning_rate": 2.8662951618456996e-05, + "loss": 2.1253, + "step": 6087 + }, + { + "epoch": 0.47, + "grad_norm": 0.7206897007433584, + "learning_rate": 2.865677221409101e-05, + "loss": 1.9173, + "step": 6088 + }, + { + "epoch": 0.47, + "grad_norm": 0.6571108072797999, + "learning_rate": 2.8650592581417718e-05, + "loss": 1.8818, + "step": 6089 + }, + { + "epoch": 0.47, + "grad_norm": 0.911692898390632, + "learning_rate": 2.8644412720822933e-05, + "loss": 1.9718, + "step": 6090 + }, + { + "epoch": 0.47, + "grad_norm": 0.5496703802805056, + "learning_rate": 2.863823263269249e-05, + "loss": 1.8786, + "step": 6091 + }, + { + "epoch": 0.47, + "grad_norm": 0.7499032976311122, + "learning_rate": 2.863205231741224e-05, + "loss": 2.1219, + "step": 6092 + }, + { + "epoch": 0.47, + "grad_norm": 0.6175999930661693, + "learning_rate": 2.862587177536804e-05, + "loss": 1.8987, + "step": 6093 + }, + { + "epoch": 0.47, + "grad_norm": 0.6292828661928492, + "learning_rate": 2.861969100694577e-05, + "loss": 1.9428, + "step": 6094 + }, + { + "epoch": 0.47, + "grad_norm": 0.7337999951298932, + "learning_rate": 2.861351001253132e-05, + "loss": 2.1104, + "step": 6095 + }, + { + "epoch": 0.47, + "grad_norm": 0.675859228106822, + "learning_rate": 2.8607328792510595e-05, + "loss": 1.898, + "step": 6096 + }, + { + "epoch": 0.47, + "grad_norm": 0.661865080266223, + "learning_rate": 2.8601147347269514e-05, + "loss": 1.9985, + "step": 6097 + }, + { + "epoch": 0.47, + "grad_norm": 0.5937158924458813, + "learning_rate": 2.8594965677194013e-05, + "loss": 1.9324, + "step": 6098 + }, + { + "epoch": 0.47, + "grad_norm": 0.6074448734327125, + "learning_rate": 2.858878378267003e-05, + "loss": 2.1222, + "step": 6099 + }, + { + "epoch": 0.47, + "grad_norm": 0.6270234800502213, + "learning_rate": 2.858260166408354e-05, + "loss": 1.892, + "step": 6100 + }, + { + "epoch": 0.47, + "grad_norm": 0.6092753354246233, + "learning_rate": 2.8576419321820507e-05, + "loss": 1.8842, + "step": 6101 + }, + { + "epoch": 0.47, + "grad_norm": 0.6812256014787198, + "learning_rate": 2.857023675626692e-05, + "loss": 1.9156, + "step": 6102 + }, + { + "epoch": 0.47, + "grad_norm": 0.6009945673494914, + "learning_rate": 2.856405396780879e-05, + "loss": 2.0136, + "step": 6103 + }, + { + "epoch": 0.47, + "grad_norm": 0.7282707919402174, + "learning_rate": 2.8557870956832132e-05, + "loss": 2.151, + "step": 6104 + }, + { + "epoch": 0.47, + "grad_norm": 0.5514084935556735, + "learning_rate": 2.855168772372297e-05, + "loss": 1.9077, + "step": 6105 + }, + { + "epoch": 0.47, + "grad_norm": 0.6554203992770687, + "learning_rate": 2.854550426886735e-05, + "loss": 1.9441, + "step": 6106 + }, + { + "epoch": 0.47, + "grad_norm": 0.7310541932742173, + "learning_rate": 2.8539320592651336e-05, + "loss": 2.1253, + "step": 6107 + }, + { + "epoch": 0.47, + "grad_norm": 0.594755774719508, + "learning_rate": 2.8533136695461004e-05, + "loss": 1.8911, + "step": 6108 + }, + { + "epoch": 0.47, + "grad_norm": 0.618015005561634, + "learning_rate": 2.8526952577682425e-05, + "loss": 1.9736, + "step": 6109 + }, + { + "epoch": 0.47, + "grad_norm": 0.5911892757457342, + "learning_rate": 2.8520768239701707e-05, + "loss": 1.9123, + "step": 6110 + }, + { + "epoch": 0.47, + "grad_norm": 0.7290930809107585, + "learning_rate": 2.8514583681904972e-05, + "loss": 1.9127, + "step": 6111 + }, + { + "epoch": 0.47, + "grad_norm": 0.5634256326604189, + "learning_rate": 2.8508398904678334e-05, + "loss": 2.1181, + "step": 6112 + }, + { + "epoch": 0.47, + "grad_norm": 0.6367236849331929, + "learning_rate": 2.8502213908407937e-05, + "loss": 1.9486, + "step": 6113 + }, + { + "epoch": 0.47, + "grad_norm": 0.6176580245284353, + "learning_rate": 2.8496028693479938e-05, + "loss": 1.9372, + "step": 6114 + }, + { + "epoch": 0.47, + "grad_norm": 0.7009382742760467, + "learning_rate": 2.8489843260280513e-05, + "loss": 2.0241, + "step": 6115 + }, + { + "epoch": 0.47, + "grad_norm": 0.6130760663920718, + "learning_rate": 2.8483657609195835e-05, + "loss": 2.1075, + "step": 6116 + }, + { + "epoch": 0.47, + "grad_norm": 0.6104278131448414, + "learning_rate": 2.8477471740612094e-05, + "loss": 1.896, + "step": 6117 + }, + { + "epoch": 0.47, + "grad_norm": 0.5945735973185974, + "learning_rate": 2.8471285654915512e-05, + "loss": 1.9348, + "step": 6118 + }, + { + "epoch": 0.47, + "grad_norm": 0.6541514904034934, + "learning_rate": 2.8465099352492307e-05, + "loss": 2.0572, + "step": 6119 + }, + { + "epoch": 0.47, + "grad_norm": 0.5787861155529801, + "learning_rate": 2.845891283372871e-05, + "loss": 1.9138, + "step": 6120 + }, + { + "epoch": 0.47, + "grad_norm": 0.5971763521927723, + "learning_rate": 2.845272609901098e-05, + "loss": 1.9641, + "step": 6121 + }, + { + "epoch": 0.47, + "grad_norm": 0.6436377618779282, + "learning_rate": 2.844653914872537e-05, + "loss": 1.9603, + "step": 6122 + }, + { + "epoch": 0.47, + "grad_norm": 0.6244497706521172, + "learning_rate": 2.844035198325817e-05, + "loss": 1.8929, + "step": 6123 + }, + { + "epoch": 0.47, + "grad_norm": 0.6290549793758208, + "learning_rate": 2.8434164602995655e-05, + "loss": 2.129, + "step": 6124 + }, + { + "epoch": 0.47, + "grad_norm": 0.6793031626704319, + "learning_rate": 2.8427977008324142e-05, + "loss": 1.8963, + "step": 6125 + }, + { + "epoch": 0.47, + "grad_norm": 0.6042645089643837, + "learning_rate": 2.8421789199629944e-05, + "loss": 1.9367, + "step": 6126 + }, + { + "epoch": 0.47, + "grad_norm": 0.7570077202776087, + "learning_rate": 2.8415601177299377e-05, + "loss": 2.1181, + "step": 6127 + }, + { + "epoch": 0.47, + "grad_norm": 0.5524239611151556, + "learning_rate": 2.8409412941718805e-05, + "loss": 2.0019, + "step": 6128 + }, + { + "epoch": 0.47, + "grad_norm": 0.6257208772204805, + "learning_rate": 2.840322449327458e-05, + "loss": 1.9449, + "step": 6129 + }, + { + "epoch": 0.47, + "grad_norm": 0.6241737322143389, + "learning_rate": 2.8397035832353065e-05, + "loss": 1.9102, + "step": 6130 + }, + { + "epoch": 0.47, + "grad_norm": 0.5951944577986961, + "learning_rate": 2.8390846959340638e-05, + "loss": 2.1369, + "step": 6131 + }, + { + "epoch": 0.47, + "grad_norm": 0.5806862807364876, + "learning_rate": 2.8384657874623716e-05, + "loss": 1.9321, + "step": 6132 + }, + { + "epoch": 0.47, + "grad_norm": 0.626804204217459, + "learning_rate": 2.8378468578588703e-05, + "loss": 1.9192, + "step": 6133 + }, + { + "epoch": 0.47, + "grad_norm": 0.5864251812693121, + "learning_rate": 2.837227907162201e-05, + "loss": 2.0457, + "step": 6134 + }, + { + "epoch": 0.47, + "grad_norm": 0.6549455162764742, + "learning_rate": 2.836608935411008e-05, + "loss": 1.9088, + "step": 6135 + }, + { + "epoch": 0.47, + "grad_norm": 0.5614088505271921, + "learning_rate": 2.8359899426439374e-05, + "loss": 2.0919, + "step": 6136 + }, + { + "epoch": 0.47, + "grad_norm": 0.7003340149978751, + "learning_rate": 2.8353709288996333e-05, + "loss": 1.9978, + "step": 6137 + }, + { + "epoch": 0.47, + "grad_norm": 0.5766927691662888, + "learning_rate": 2.8347518942167444e-05, + "loss": 1.9665, + "step": 6138 + }, + { + "epoch": 0.47, + "grad_norm": 0.548015639669285, + "learning_rate": 2.83413283863392e-05, + "loss": 2.1375, + "step": 6139 + }, + { + "epoch": 0.47, + "grad_norm": 0.5188959315707593, + "learning_rate": 2.8335137621898095e-05, + "loss": 1.8921, + "step": 6140 + }, + { + "epoch": 0.47, + "grad_norm": 0.5845271040985408, + "learning_rate": 2.8328946649230647e-05, + "loss": 1.9146, + "step": 6141 + }, + { + "epoch": 0.47, + "grad_norm": 0.5985565151098161, + "learning_rate": 2.8322755468723387e-05, + "loss": 1.9291, + "step": 6142 + }, + { + "epoch": 0.47, + "grad_norm": 0.5583602207056978, + "learning_rate": 2.8316564080762854e-05, + "loss": 1.9573, + "step": 6143 + }, + { + "epoch": 0.47, + "grad_norm": 0.5808160069754263, + "learning_rate": 2.8310372485735597e-05, + "loss": 2.1284, + "step": 6144 + }, + { + "epoch": 0.47, + "grad_norm": 0.6245984797766763, + "learning_rate": 2.8304180684028187e-05, + "loss": 1.9659, + "step": 6145 + }, + { + "epoch": 0.47, + "grad_norm": 0.5463832253494476, + "learning_rate": 2.8297988676027203e-05, + "loss": 1.9819, + "step": 6146 + }, + { + "epoch": 0.47, + "grad_norm": 0.6290164631193843, + "learning_rate": 2.8291796462119246e-05, + "loss": 1.9101, + "step": 6147 + }, + { + "epoch": 0.47, + "grad_norm": 0.5937066422120306, + "learning_rate": 2.8285604042690894e-05, + "loss": 2.1315, + "step": 6148 + }, + { + "epoch": 0.47, + "grad_norm": 0.6532670149783002, + "learning_rate": 2.8279411418128798e-05, + "loss": 1.9147, + "step": 6149 + }, + { + "epoch": 0.47, + "grad_norm": 0.6641452628773036, + "learning_rate": 2.8273218588819573e-05, + "loss": 1.9303, + "step": 6150 + }, + { + "epoch": 0.47, + "grad_norm": 0.6492564426112483, + "learning_rate": 2.8267025555149867e-05, + "loss": 2.1218, + "step": 6151 + }, + { + "epoch": 0.47, + "grad_norm": 0.6654462429076632, + "learning_rate": 2.8260832317506326e-05, + "loss": 1.9879, + "step": 6152 + }, + { + "epoch": 0.47, + "grad_norm": 0.6035637213693915, + "learning_rate": 2.825463887627563e-05, + "loss": 1.8904, + "step": 6153 + }, + { + "epoch": 0.47, + "grad_norm": 0.5842202944278496, + "learning_rate": 2.8248445231844466e-05, + "loss": 1.9454, + "step": 6154 + }, + { + "epoch": 0.47, + "grad_norm": 0.6810767396249412, + "learning_rate": 2.8242251384599516e-05, + "loss": 1.9095, + "step": 6155 + }, + { + "epoch": 0.47, + "grad_norm": 0.6713845080647166, + "learning_rate": 2.8236057334927494e-05, + "loss": 2.1393, + "step": 6156 + }, + { + "epoch": 0.48, + "grad_norm": 0.5653520002476087, + "learning_rate": 2.8229863083215113e-05, + "loss": 1.9318, + "step": 6157 + }, + { + "epoch": 0.48, + "grad_norm": 0.7087058206744752, + "learning_rate": 2.822366862984912e-05, + "loss": 1.8836, + "step": 6158 + }, + { + "epoch": 0.48, + "grad_norm": 0.6229565829493346, + "learning_rate": 2.8217473975216247e-05, + "loss": 1.954, + "step": 6159 + }, + { + "epoch": 0.48, + "grad_norm": 0.7815621087382343, + "learning_rate": 2.8211279119703253e-05, + "loss": 2.0912, + "step": 6160 + }, + { + "epoch": 0.48, + "grad_norm": 0.5396882479747217, + "learning_rate": 2.820508406369692e-05, + "loss": 1.9726, + "step": 6161 + }, + { + "epoch": 0.48, + "grad_norm": 0.7976339825171728, + "learning_rate": 2.8198888807584013e-05, + "loss": 1.9525, + "step": 6162 + }, + { + "epoch": 0.48, + "grad_norm": 0.6038848610897559, + "learning_rate": 2.8192693351751342e-05, + "loss": 1.858, + "step": 6163 + }, + { + "epoch": 0.48, + "grad_norm": 0.703904694513044, + "learning_rate": 2.8186497696585707e-05, + "loss": 2.1853, + "step": 6164 + }, + { + "epoch": 0.48, + "grad_norm": 0.5839196280763157, + "learning_rate": 2.8180301842473926e-05, + "loss": 1.9819, + "step": 6165 + }, + { + "epoch": 0.48, + "grad_norm": 0.7046973964425847, + "learning_rate": 2.817410578980284e-05, + "loss": 1.9532, + "step": 6166 + }, + { + "epoch": 0.48, + "grad_norm": 0.5848964288431363, + "learning_rate": 2.8167909538959292e-05, + "loss": 1.9219, + "step": 6167 + }, + { + "epoch": 0.48, + "grad_norm": 0.754947494831866, + "learning_rate": 2.8161713090330132e-05, + "loss": 2.1235, + "step": 6168 + }, + { + "epoch": 0.48, + "grad_norm": 0.6366837932948054, + "learning_rate": 2.8155516444302237e-05, + "loss": 1.8788, + "step": 6169 + }, + { + "epoch": 0.48, + "grad_norm": 0.7108136727958364, + "learning_rate": 2.8149319601262492e-05, + "loss": 1.8844, + "step": 6170 + }, + { + "epoch": 0.48, + "grad_norm": 0.6030695718504766, + "learning_rate": 2.8143122561597784e-05, + "loss": 2.0339, + "step": 6171 + }, + { + "epoch": 0.48, + "grad_norm": 0.6403831619276319, + "learning_rate": 2.8136925325695017e-05, + "loss": 2.1471, + "step": 6172 + }, + { + "epoch": 0.48, + "grad_norm": 0.6154977051718946, + "learning_rate": 2.813072789394113e-05, + "loss": 1.8562, + "step": 6173 + }, + { + "epoch": 0.48, + "grad_norm": 0.7061323377689693, + "learning_rate": 2.812453026672302e-05, + "loss": 1.9135, + "step": 6174 + }, + { + "epoch": 0.48, + "grad_norm": 0.6152239140420555, + "learning_rate": 2.811833244442766e-05, + "loss": 1.9313, + "step": 6175 + }, + { + "epoch": 0.48, + "grad_norm": 0.7293025277019005, + "learning_rate": 2.8112134427441998e-05, + "loss": 2.1367, + "step": 6176 + }, + { + "epoch": 0.48, + "grad_norm": 0.576462525545831, + "learning_rate": 2.8105936216152996e-05, + "loss": 2.0389, + "step": 6177 + }, + { + "epoch": 0.48, + "grad_norm": 0.6069390237623686, + "learning_rate": 2.809973781094763e-05, + "loss": 1.9373, + "step": 6178 + }, + { + "epoch": 0.48, + "grad_norm": 0.6314378157304235, + "learning_rate": 2.8093539212212905e-05, + "loss": 1.9404, + "step": 6179 + }, + { + "epoch": 0.48, + "grad_norm": 0.7083517174409519, + "learning_rate": 2.8087340420335818e-05, + "loss": 2.1407, + "step": 6180 + }, + { + "epoch": 0.48, + "grad_norm": 0.6085739442812378, + "learning_rate": 2.8081141435703384e-05, + "loss": 1.8795, + "step": 6181 + }, + { + "epoch": 0.48, + "grad_norm": 0.686347668277774, + "learning_rate": 2.8074942258702625e-05, + "loss": 1.9214, + "step": 6182 + }, + { + "epoch": 0.48, + "grad_norm": 0.7562017784454768, + "learning_rate": 2.8068742889720602e-05, + "loss": 2.1409, + "step": 6183 + }, + { + "epoch": 0.48, + "grad_norm": 0.6131328833498226, + "learning_rate": 2.8062543329144347e-05, + "loss": 2.0185, + "step": 6184 + }, + { + "epoch": 0.48, + "grad_norm": 0.7561551937959166, + "learning_rate": 2.8056343577360932e-05, + "loss": 1.9472, + "step": 6185 + }, + { + "epoch": 0.48, + "grad_norm": 0.7064216136558119, + "learning_rate": 2.805014363475743e-05, + "loss": 1.9215, + "step": 6186 + }, + { + "epoch": 0.48, + "grad_norm": 0.5945729312441711, + "learning_rate": 2.804394350172093e-05, + "loss": 1.8748, + "step": 6187 + }, + { + "epoch": 0.48, + "grad_norm": 0.8767651977002929, + "learning_rate": 2.8037743178638536e-05, + "loss": 2.1211, + "step": 6188 + }, + { + "epoch": 0.48, + "grad_norm": 0.7120981152521955, + "learning_rate": 2.8031542665897353e-05, + "loss": 1.9144, + "step": 6189 + }, + { + "epoch": 0.48, + "grad_norm": 0.6513010227419045, + "learning_rate": 2.802534196388451e-05, + "loss": 1.9783, + "step": 6190 + }, + { + "epoch": 0.48, + "grad_norm": 0.6527548377306654, + "learning_rate": 2.8019141072987138e-05, + "loss": 1.8964, + "step": 6191 + }, + { + "epoch": 0.48, + "grad_norm": 0.7180908197047047, + "learning_rate": 2.8012939993592386e-05, + "loss": 2.0873, + "step": 6192 + }, + { + "epoch": 0.48, + "grad_norm": 0.5644956398775843, + "learning_rate": 2.800673872608741e-05, + "loss": 1.9113, + "step": 6193 + }, + { + "epoch": 0.48, + "grad_norm": 0.6723416665426786, + "learning_rate": 2.8000537270859393e-05, + "loss": 1.8887, + "step": 6194 + }, + { + "epoch": 0.48, + "grad_norm": 0.6503820382184902, + "learning_rate": 2.7994335628295497e-05, + "loss": 1.9419, + "step": 6195 + }, + { + "epoch": 0.48, + "grad_norm": 0.6320788178361572, + "learning_rate": 2.798813379878293e-05, + "loss": 2.1741, + "step": 6196 + }, + { + "epoch": 0.48, + "grad_norm": 0.5783328004963862, + "learning_rate": 2.798193178270889e-05, + "loss": 1.9284, + "step": 6197 + }, + { + "epoch": 0.48, + "grad_norm": 0.6536369794030019, + "learning_rate": 2.7975729580460613e-05, + "loss": 1.9154, + "step": 6198 + }, + { + "epoch": 0.48, + "grad_norm": 0.599713590673247, + "learning_rate": 2.79695271924253e-05, + "loss": 1.9504, + "step": 6199 + }, + { + "epoch": 0.48, + "grad_norm": 0.6992959079936097, + "learning_rate": 2.796332461899021e-05, + "loss": 2.1409, + "step": 6200 + }, + { + "epoch": 0.48, + "grad_norm": 0.5718521089859425, + "learning_rate": 2.7957121860542595e-05, + "loss": 1.9513, + "step": 6201 + }, + { + "epoch": 0.48, + "grad_norm": 0.6516463898451217, + "learning_rate": 2.795091891746971e-05, + "loss": 1.9985, + "step": 6202 + }, + { + "epoch": 0.48, + "grad_norm": 0.6394872787664603, + "learning_rate": 2.7944715790158826e-05, + "loss": 1.9417, + "step": 6203 + }, + { + "epoch": 0.48, + "grad_norm": 0.6762012398491848, + "learning_rate": 2.7938512478997247e-05, + "loss": 2.1098, + "step": 6204 + }, + { + "epoch": 0.48, + "grad_norm": 0.5778828320413963, + "learning_rate": 2.7932308984372264e-05, + "loss": 1.8797, + "step": 6205 + }, + { + "epoch": 0.48, + "grad_norm": 0.5898964487828565, + "learning_rate": 2.792610530667118e-05, + "loss": 1.9294, + "step": 6206 + }, + { + "epoch": 0.48, + "grad_norm": 0.7816938189025686, + "learning_rate": 2.7919901446281327e-05, + "loss": 1.9356, + "step": 6207 + }, + { + "epoch": 0.48, + "grad_norm": 0.5807735827522142, + "learning_rate": 2.7913697403590022e-05, + "loss": 2.2003, + "step": 6208 + }, + { + "epoch": 0.48, + "grad_norm": 0.7416073177594824, + "learning_rate": 2.790749317898463e-05, + "loss": 1.9687, + "step": 6209 + }, + { + "epoch": 0.48, + "grad_norm": 0.6570188210974146, + "learning_rate": 2.7901288772852486e-05, + "loss": 1.9539, + "step": 6210 + }, + { + "epoch": 0.48, + "grad_norm": 0.6492437513193234, + "learning_rate": 2.7895084185580968e-05, + "loss": 1.9314, + "step": 6211 + }, + { + "epoch": 0.48, + "grad_norm": 0.7676241786619878, + "learning_rate": 2.7888879417557447e-05, + "loss": 2.0943, + "step": 6212 + }, + { + "epoch": 0.48, + "grad_norm": 0.6137155261653936, + "learning_rate": 2.788267446916932e-05, + "loss": 1.9101, + "step": 6213 + }, + { + "epoch": 0.48, + "grad_norm": 0.7993431988100764, + "learning_rate": 2.787646934080398e-05, + "loss": 1.9186, + "step": 6214 + }, + { + "epoch": 0.48, + "grad_norm": 0.6681505255680475, + "learning_rate": 2.7870264032848847e-05, + "loss": 2.001, + "step": 6215 + }, + { + "epoch": 0.48, + "grad_norm": 0.6817111604437344, + "learning_rate": 2.7864058545691335e-05, + "loss": 2.0707, + "step": 6216 + }, + { + "epoch": 0.48, + "grad_norm": 0.7807959088349454, + "learning_rate": 2.7857852879718883e-05, + "loss": 1.9475, + "step": 6217 + }, + { + "epoch": 0.48, + "grad_norm": 0.6483716676870414, + "learning_rate": 2.785164703531894e-05, + "loss": 1.8711, + "step": 6218 + }, + { + "epoch": 0.48, + "grad_norm": 0.7819843106273833, + "learning_rate": 2.7845441012878958e-05, + "loss": 1.9566, + "step": 6219 + }, + { + "epoch": 0.48, + "grad_norm": 0.8153074197466349, + "learning_rate": 2.7839234812786398e-05, + "loss": 2.1171, + "step": 6220 + }, + { + "epoch": 0.48, + "grad_norm": 0.7899667362837469, + "learning_rate": 2.7833028435428744e-05, + "loss": 2.0442, + "step": 6221 + }, + { + "epoch": 0.48, + "grad_norm": 0.7751969810908127, + "learning_rate": 2.7826821881193488e-05, + "loss": 1.9313, + "step": 6222 + }, + { + "epoch": 0.48, + "grad_norm": 0.7277906156214009, + "learning_rate": 2.782061515046814e-05, + "loss": 1.934, + "step": 6223 + }, + { + "epoch": 0.48, + "grad_norm": 0.6810980516631513, + "learning_rate": 2.781440824364019e-05, + "loss": 2.1377, + "step": 6224 + }, + { + "epoch": 0.48, + "grad_norm": 0.683502361825806, + "learning_rate": 2.7808201161097164e-05, + "loss": 1.9203, + "step": 6225 + }, + { + "epoch": 0.48, + "grad_norm": 0.7533439572775853, + "learning_rate": 2.780199390322662e-05, + "loss": 1.9185, + "step": 6226 + }, + { + "epoch": 0.48, + "grad_norm": 0.6924290575414431, + "learning_rate": 2.7795786470416075e-05, + "loss": 2.053, + "step": 6227 + }, + { + "epoch": 0.48, + "grad_norm": 0.6600270829700075, + "learning_rate": 2.77895788630531e-05, + "loss": 2.1536, + "step": 6228 + }, + { + "epoch": 0.48, + "grad_norm": 0.6329372795595144, + "learning_rate": 2.778337108152525e-05, + "loss": 1.9058, + "step": 6229 + }, + { + "epoch": 0.48, + "grad_norm": 0.639861170322704, + "learning_rate": 2.7777163126220118e-05, + "loss": 1.9785, + "step": 6230 + }, + { + "epoch": 0.48, + "grad_norm": 0.7040172819721391, + "learning_rate": 2.7770954997525277e-05, + "loss": 1.9785, + "step": 6231 + }, + { + "epoch": 0.48, + "grad_norm": 0.7186256311206687, + "learning_rate": 2.7764746695828338e-05, + "loss": 2.1167, + "step": 6232 + }, + { + "epoch": 0.48, + "grad_norm": 0.5552706841250771, + "learning_rate": 2.77585382215169e-05, + "loss": 2.0279, + "step": 6233 + }, + { + "epoch": 0.48, + "grad_norm": 0.6550710334390859, + "learning_rate": 2.7752329574978587e-05, + "loss": 1.9226, + "step": 6234 + }, + { + "epoch": 0.48, + "grad_norm": 0.7397534607079066, + "learning_rate": 2.7746120756601036e-05, + "loss": 1.8638, + "step": 6235 + }, + { + "epoch": 0.48, + "grad_norm": 0.6768705864741504, + "learning_rate": 2.773991176677188e-05, + "loss": 2.086, + "step": 6236 + }, + { + "epoch": 0.48, + "grad_norm": 0.6123262735101047, + "learning_rate": 2.773370260587878e-05, + "loss": 1.8896, + "step": 6237 + }, + { + "epoch": 0.48, + "grad_norm": 0.6817026461177041, + "learning_rate": 2.7727493274309397e-05, + "loss": 1.9399, + "step": 6238 + }, + { + "epoch": 0.48, + "grad_norm": 0.6704821989407344, + "learning_rate": 2.77212837724514e-05, + "loss": 2.013, + "step": 6239 + }, + { + "epoch": 0.48, + "grad_norm": 0.6769753792453249, + "learning_rate": 2.7715074100692485e-05, + "loss": 2.1333, + "step": 6240 + }, + { + "epoch": 0.48, + "grad_norm": 0.6488463278760194, + "learning_rate": 2.7708864259420343e-05, + "loss": 1.9872, + "step": 6241 + }, + { + "epoch": 0.48, + "grad_norm": 0.7098015590597466, + "learning_rate": 2.7702654249022658e-05, + "loss": 1.9316, + "step": 6242 + }, + { + "epoch": 0.48, + "grad_norm": 0.6843690213052005, + "learning_rate": 2.769644406988718e-05, + "loss": 1.9908, + "step": 6243 + }, + { + "epoch": 0.48, + "grad_norm": 0.8762668527890835, + "learning_rate": 2.7690233722401622e-05, + "loss": 2.0975, + "step": 6244 + }, + { + "epoch": 0.48, + "grad_norm": 0.5770720704679605, + "learning_rate": 2.7684023206953718e-05, + "loss": 1.939, + "step": 6245 + }, + { + "epoch": 0.48, + "grad_norm": 0.7628804012746702, + "learning_rate": 2.7677812523931214e-05, + "loss": 2.0027, + "step": 6246 + }, + { + "epoch": 0.48, + "grad_norm": 0.6034161507441359, + "learning_rate": 2.7671601673721875e-05, + "loss": 1.9441, + "step": 6247 + }, + { + "epoch": 0.48, + "grad_norm": 0.7497617206672372, + "learning_rate": 2.7665390656713474e-05, + "loss": 2.1, + "step": 6248 + }, + { + "epoch": 0.48, + "grad_norm": 0.6197471813617871, + "learning_rate": 2.7659179473293785e-05, + "loss": 1.9101, + "step": 6249 + }, + { + "epoch": 0.48, + "grad_norm": 0.6742153165713498, + "learning_rate": 2.7652968123850586e-05, + "loss": 1.9398, + "step": 6250 + }, + { + "epoch": 0.48, + "grad_norm": 0.6490613315645393, + "learning_rate": 2.7646756608771707e-05, + "loss": 1.9596, + "step": 6251 + }, + { + "epoch": 0.48, + "grad_norm": 0.6230335999776684, + "learning_rate": 2.7640544928444927e-05, + "loss": 2.1408, + "step": 6252 + }, + { + "epoch": 0.48, + "grad_norm": 0.5873823734704713, + "learning_rate": 2.7634333083258078e-05, + "loss": 1.8557, + "step": 6253 + }, + { + "epoch": 0.48, + "grad_norm": 0.7394570898366446, + "learning_rate": 2.7628121073598996e-05, + "loss": 1.9595, + "step": 6254 + }, + { + "epoch": 0.48, + "grad_norm": 0.603952903734535, + "learning_rate": 2.762190889985552e-05, + "loss": 1.9438, + "step": 6255 + }, + { + "epoch": 0.48, + "grad_norm": 0.7066202772028929, + "learning_rate": 2.76156965624155e-05, + "loss": 2.0827, + "step": 6256 + }, + { + "epoch": 0.48, + "grad_norm": 0.619922854523307, + "learning_rate": 2.7609484061666797e-05, + "loss": 1.9264, + "step": 6257 + }, + { + "epoch": 0.48, + "grad_norm": 0.6373186179076702, + "learning_rate": 2.7603271397997283e-05, + "loss": 2.0163, + "step": 6258 + }, + { + "epoch": 0.48, + "grad_norm": 0.7085674482236836, + "learning_rate": 2.759705857179484e-05, + "loss": 1.9401, + "step": 6259 + }, + { + "epoch": 0.48, + "grad_norm": 0.8460768376705932, + "learning_rate": 2.7590845583447366e-05, + "loss": 2.1166, + "step": 6260 + }, + { + "epoch": 0.48, + "grad_norm": 0.6407379021518438, + "learning_rate": 2.7584632433342755e-05, + "loss": 1.8905, + "step": 6261 + }, + { + "epoch": 0.48, + "grad_norm": 0.822180832229733, + "learning_rate": 2.7578419121868927e-05, + "loss": 1.8983, + "step": 6262 + }, + { + "epoch": 0.48, + "grad_norm": 0.6852591998882038, + "learning_rate": 2.7572205649413795e-05, + "loss": 1.9379, + "step": 6263 + }, + { + "epoch": 0.48, + "grad_norm": 0.9455577536761596, + "learning_rate": 2.7565992016365305e-05, + "loss": 2.2034, + "step": 6264 + }, + { + "epoch": 0.48, + "grad_norm": 0.6990673944455624, + "learning_rate": 2.755977822311139e-05, + "loss": 1.9272, + "step": 6265 + }, + { + "epoch": 0.48, + "grad_norm": 0.8673850285092596, + "learning_rate": 2.7553564270040004e-05, + "loss": 1.91, + "step": 6266 + }, + { + "epoch": 0.48, + "grad_norm": 0.7243019347774309, + "learning_rate": 2.7547350157539102e-05, + "loss": 1.9443, + "step": 6267 + }, + { + "epoch": 0.48, + "grad_norm": 0.7808072509537783, + "learning_rate": 2.754113588599667e-05, + "loss": 2.0962, + "step": 6268 + }, + { + "epoch": 0.48, + "grad_norm": 0.6098559360605493, + "learning_rate": 2.7534921455800688e-05, + "loss": 1.9306, + "step": 6269 + }, + { + "epoch": 0.48, + "grad_norm": 0.7130255314547961, + "learning_rate": 2.7528706867339155e-05, + "loss": 2.0059, + "step": 6270 + }, + { + "epoch": 0.48, + "grad_norm": 0.6684206521914795, + "learning_rate": 2.7522492121000053e-05, + "loss": 1.9323, + "step": 6271 + }, + { + "epoch": 0.48, + "grad_norm": 0.8614901461539285, + "learning_rate": 2.75162772171714e-05, + "loss": 2.1314, + "step": 6272 + }, + { + "epoch": 0.48, + "grad_norm": 0.6344629885555926, + "learning_rate": 2.7510062156241233e-05, + "loss": 1.9489, + "step": 6273 + }, + { + "epoch": 0.48, + "grad_norm": 0.7758436503945126, + "learning_rate": 2.7503846938597573e-05, + "loss": 1.8288, + "step": 6274 + }, + { + "epoch": 0.48, + "grad_norm": 0.605220214384712, + "learning_rate": 2.7497631564628463e-05, + "loss": 1.9024, + "step": 6275 + }, + { + "epoch": 0.48, + "grad_norm": 0.7868727511479419, + "learning_rate": 2.7491416034721947e-05, + "loss": 2.1032, + "step": 6276 + }, + { + "epoch": 0.48, + "grad_norm": 0.631998244984183, + "learning_rate": 2.74852003492661e-05, + "loss": 1.9976, + "step": 6277 + }, + { + "epoch": 0.48, + "grad_norm": 0.677611523979507, + "learning_rate": 2.7478984508648982e-05, + "loss": 1.9082, + "step": 6278 + }, + { + "epoch": 0.48, + "grad_norm": 0.6037355082206972, + "learning_rate": 2.7472768513258674e-05, + "loss": 1.9162, + "step": 6279 + }, + { + "epoch": 0.48, + "grad_norm": 0.7325722620758016, + "learning_rate": 2.746655236348328e-05, + "loss": 2.1248, + "step": 6280 + }, + { + "epoch": 0.48, + "grad_norm": 0.6374940967422656, + "learning_rate": 2.746033605971088e-05, + "loss": 1.9327, + "step": 6281 + }, + { + "epoch": 0.48, + "grad_norm": 0.6198196505856042, + "learning_rate": 2.745411960232959e-05, + "loss": 1.9322, + "step": 6282 + }, + { + "epoch": 0.48, + "grad_norm": 0.8806410171992154, + "learning_rate": 2.7447902991727538e-05, + "loss": 2.0063, + "step": 6283 + }, + { + "epoch": 0.48, + "grad_norm": 0.6195516100498301, + "learning_rate": 2.7441686228292845e-05, + "loss": 2.1015, + "step": 6284 + }, + { + "epoch": 0.48, + "grad_norm": 0.6724804372020051, + "learning_rate": 2.7435469312413646e-05, + "loss": 1.929, + "step": 6285 + }, + { + "epoch": 0.48, + "grad_norm": 0.7303353372836486, + "learning_rate": 2.74292522444781e-05, + "loss": 1.9625, + "step": 6286 + }, + { + "epoch": 0.49, + "grad_norm": 0.6388992246806826, + "learning_rate": 2.742303502487435e-05, + "loss": 1.9379, + "step": 6287 + }, + { + "epoch": 0.49, + "grad_norm": 0.6474355819923994, + "learning_rate": 2.741681765399058e-05, + "loss": 2.1119, + "step": 6288 + }, + { + "epoch": 0.49, + "grad_norm": 0.706857478088858, + "learning_rate": 2.7410600132214943e-05, + "loss": 2.0147, + "step": 6289 + }, + { + "epoch": 0.49, + "grad_norm": 0.6004460174570148, + "learning_rate": 2.740438245993564e-05, + "loss": 1.9207, + "step": 6290 + }, + { + "epoch": 0.49, + "grad_norm": 0.6111838454979798, + "learning_rate": 2.739816463754087e-05, + "loss": 1.9263, + "step": 6291 + }, + { + "epoch": 0.49, + "grad_norm": 0.7399164053496282, + "learning_rate": 2.739194666541882e-05, + "loss": 2.0721, + "step": 6292 + }, + { + "epoch": 0.49, + "grad_norm": 0.6003525359252971, + "learning_rate": 2.7385728543957712e-05, + "loss": 1.881, + "step": 6293 + }, + { + "epoch": 0.49, + "grad_norm": 0.6059831079086222, + "learning_rate": 2.737951027354578e-05, + "loss": 1.9259, + "step": 6294 + }, + { + "epoch": 0.49, + "grad_norm": 0.8741102411667938, + "learning_rate": 2.7373291854571242e-05, + "loss": 1.9761, + "step": 6295 + }, + { + "epoch": 0.49, + "grad_norm": 0.6247820810764958, + "learning_rate": 2.736707328742234e-05, + "loss": 1.9127, + "step": 6296 + }, + { + "epoch": 0.49, + "grad_norm": 0.9251347795626271, + "learning_rate": 2.7360854572487328e-05, + "loss": 2.0991, + "step": 6297 + }, + { + "epoch": 0.49, + "grad_norm": 0.6272725172961547, + "learning_rate": 2.7354635710154476e-05, + "loss": 1.9306, + "step": 6298 + }, + { + "epoch": 0.49, + "grad_norm": 0.9061320015216355, + "learning_rate": 2.734841670081204e-05, + "loss": 1.9352, + "step": 6299 + }, + { + "epoch": 0.49, + "grad_norm": 0.5739559382400902, + "learning_rate": 2.7342197544848298e-05, + "loss": 2.1169, + "step": 6300 + }, + { + "epoch": 0.49, + "grad_norm": 0.8159598425787489, + "learning_rate": 2.7335978242651538e-05, + "loss": 1.988, + "step": 6301 + }, + { + "epoch": 0.49, + "grad_norm": 0.5958532971495627, + "learning_rate": 2.7329758794610067e-05, + "loss": 1.8957, + "step": 6302 + }, + { + "epoch": 0.49, + "grad_norm": 0.6864914249263036, + "learning_rate": 2.732353920111218e-05, + "loss": 1.9273, + "step": 6303 + }, + { + "epoch": 0.49, + "grad_norm": 0.7457351298560015, + "learning_rate": 2.73173194625462e-05, + "loss": 2.1102, + "step": 6304 + }, + { + "epoch": 0.49, + "grad_norm": 0.6434126360288767, + "learning_rate": 2.7311099579300443e-05, + "loss": 1.9104, + "step": 6305 + }, + { + "epoch": 0.49, + "grad_norm": 0.7744130679943296, + "learning_rate": 2.730487955176324e-05, + "loss": 1.9333, + "step": 6306 + }, + { + "epoch": 0.49, + "grad_norm": 0.6963486857136643, + "learning_rate": 2.7298659380322945e-05, + "loss": 1.9047, + "step": 6307 + }, + { + "epoch": 0.49, + "grad_norm": 0.6944298369570352, + "learning_rate": 2.7292439065367897e-05, + "loss": 1.961, + "step": 6308 + }, + { + "epoch": 0.49, + "grad_norm": 0.7594025378885407, + "learning_rate": 2.7286218607286474e-05, + "loss": 2.0973, + "step": 6309 + }, + { + "epoch": 0.49, + "grad_norm": 0.6702155753060279, + "learning_rate": 2.7279998006467013e-05, + "loss": 1.9062, + "step": 6310 + }, + { + "epoch": 0.49, + "grad_norm": 0.7141238335824098, + "learning_rate": 2.7273777263297918e-05, + "loss": 1.9002, + "step": 6311 + }, + { + "epoch": 0.49, + "grad_norm": 0.7850918367434662, + "learning_rate": 2.7267556378167562e-05, + "loss": 2.0622, + "step": 6312 + }, + { + "epoch": 0.49, + "grad_norm": 0.7853737021525058, + "learning_rate": 2.7261335351464356e-05, + "loss": 1.9067, + "step": 6313 + }, + { + "epoch": 0.49, + "grad_norm": 0.7180192248553696, + "learning_rate": 2.7255114183576686e-05, + "loss": 1.9566, + "step": 6314 + }, + { + "epoch": 0.49, + "grad_norm": 0.7293039767112508, + "learning_rate": 2.7248892874892973e-05, + "loss": 1.9476, + "step": 6315 + }, + { + "epoch": 0.49, + "grad_norm": 1.0638097239181599, + "learning_rate": 2.7242671425801653e-05, + "loss": 2.1024, + "step": 6316 + }, + { + "epoch": 0.49, + "grad_norm": 0.6103435582410526, + "learning_rate": 2.723644983669113e-05, + "loss": 1.9338, + "step": 6317 + }, + { + "epoch": 0.49, + "grad_norm": 0.9525562407998518, + "learning_rate": 2.7230228107949852e-05, + "loss": 1.9767, + "step": 6318 + }, + { + "epoch": 0.49, + "grad_norm": 0.7404284053967155, + "learning_rate": 2.7224006239966276e-05, + "loss": 1.9288, + "step": 6319 + }, + { + "epoch": 0.49, + "grad_norm": 0.7664604999074345, + "learning_rate": 2.7217784233128863e-05, + "loss": 1.9619, + "step": 6320 + }, + { + "epoch": 0.49, + "grad_norm": 0.8083402247020052, + "learning_rate": 2.721156208782606e-05, + "loss": 2.1128, + "step": 6321 + }, + { + "epoch": 0.49, + "grad_norm": 0.8589773230302058, + "learning_rate": 2.7205339804446352e-05, + "loss": 1.9453, + "step": 6322 + }, + { + "epoch": 0.49, + "grad_norm": 0.8037461703300274, + "learning_rate": 2.719911738337822e-05, + "loss": 1.9369, + "step": 6323 + }, + { + "epoch": 0.49, + "grad_norm": 0.902684100404686, + "learning_rate": 2.7192894825010157e-05, + "loss": 2.036, + "step": 6324 + }, + { + "epoch": 0.49, + "grad_norm": 0.7397437051689738, + "learning_rate": 2.718667212973066e-05, + "loss": 1.9031, + "step": 6325 + }, + { + "epoch": 0.49, + "grad_norm": 0.8863541555251446, + "learning_rate": 2.718044929792824e-05, + "loss": 2.0063, + "step": 6326 + }, + { + "epoch": 0.49, + "grad_norm": 0.6734382630234651, + "learning_rate": 2.7174226329991415e-05, + "loss": 1.9337, + "step": 6327 + }, + { + "epoch": 0.49, + "grad_norm": 0.7439477573656978, + "learning_rate": 2.7168003226308704e-05, + "loss": 1.9413, + "step": 6328 + }, + { + "epoch": 0.49, + "grad_norm": 0.8405441101287547, + "learning_rate": 2.7161779987268644e-05, + "loss": 2.1336, + "step": 6329 + }, + { + "epoch": 0.49, + "grad_norm": 0.7845992489489523, + "learning_rate": 2.715555661325978e-05, + "loss": 1.9426, + "step": 6330 + }, + { + "epoch": 0.49, + "grad_norm": 0.684220392349569, + "learning_rate": 2.7149333104670665e-05, + "loss": 1.9015, + "step": 6331 + }, + { + "epoch": 0.49, + "grad_norm": 0.7445128853872242, + "learning_rate": 2.7143109461889854e-05, + "loss": 2.0821, + "step": 6332 + }, + { + "epoch": 0.49, + "grad_norm": 0.6254908015871811, + "learning_rate": 2.7136885685305914e-05, + "loss": 2.0031, + "step": 6333 + }, + { + "epoch": 0.49, + "grad_norm": 0.7128510934743082, + "learning_rate": 2.7130661775307425e-05, + "loss": 1.9537, + "step": 6334 + }, + { + "epoch": 0.49, + "grad_norm": 0.5891874934560332, + "learning_rate": 2.712443773228296e-05, + "loss": 1.9551, + "step": 6335 + }, + { + "epoch": 0.49, + "grad_norm": 0.7148586621068198, + "learning_rate": 2.7118213556621124e-05, + "loss": 2.0711, + "step": 6336 + }, + { + "epoch": 0.49, + "grad_norm": 0.5901565348664094, + "learning_rate": 2.7111989248710517e-05, + "loss": 1.899, + "step": 6337 + }, + { + "epoch": 0.49, + "grad_norm": 0.6258220849754685, + "learning_rate": 2.710576480893975e-05, + "loss": 1.9008, + "step": 6338 + }, + { + "epoch": 0.49, + "grad_norm": 0.6659096296748049, + "learning_rate": 2.7099540237697425e-05, + "loss": 1.985, + "step": 6339 + }, + { + "epoch": 0.49, + "grad_norm": 0.6057466423014145, + "learning_rate": 2.7093315535372177e-05, + "loss": 1.9626, + "step": 6340 + }, + { + "epoch": 0.49, + "grad_norm": 0.6249653946376338, + "learning_rate": 2.7087090702352645e-05, + "loss": 2.1248, + "step": 6341 + }, + { + "epoch": 0.49, + "grad_norm": 0.671220399695298, + "learning_rate": 2.7080865739027466e-05, + "loss": 1.9481, + "step": 6342 + }, + { + "epoch": 0.49, + "grad_norm": 0.7340574327747496, + "learning_rate": 2.7074640645785294e-05, + "loss": 1.9409, + "step": 6343 + }, + { + "epoch": 0.49, + "grad_norm": 0.6511500081887206, + "learning_rate": 2.706841542301477e-05, + "loss": 2.0593, + "step": 6344 + }, + { + "epoch": 0.49, + "grad_norm": 0.6696786609030081, + "learning_rate": 2.7062190071104593e-05, + "loss": 1.9721, + "step": 6345 + }, + { + "epoch": 0.49, + "grad_norm": 0.5917858614202364, + "learning_rate": 2.7055964590443406e-05, + "loss": 1.9453, + "step": 6346 + }, + { + "epoch": 0.49, + "grad_norm": 0.6694715221395168, + "learning_rate": 2.704973898141991e-05, + "loss": 1.9565, + "step": 6347 + }, + { + "epoch": 0.49, + "grad_norm": 0.6180434968984945, + "learning_rate": 2.7043513244422787e-05, + "loss": 2.1066, + "step": 6348 + }, + { + "epoch": 0.49, + "grad_norm": 0.7467616967784585, + "learning_rate": 2.703728737984073e-05, + "loss": 1.9194, + "step": 6349 + }, + { + "epoch": 0.49, + "grad_norm": 0.6380841685937406, + "learning_rate": 2.7031061388062462e-05, + "loss": 1.9194, + "step": 6350 + }, + { + "epoch": 0.49, + "grad_norm": 0.6718007430123296, + "learning_rate": 2.7024835269476683e-05, + "loss": 1.9237, + "step": 6351 + }, + { + "epoch": 0.49, + "grad_norm": 0.6836913112075511, + "learning_rate": 2.701860902447212e-05, + "loss": 1.9175, + "step": 6352 + }, + { + "epoch": 0.49, + "grad_norm": 0.7741449222510344, + "learning_rate": 2.7012382653437502e-05, + "loss": 2.1073, + "step": 6353 + }, + { + "epoch": 0.49, + "grad_norm": 0.674702675501228, + "learning_rate": 2.700615615676157e-05, + "loss": 1.8957, + "step": 6354 + }, + { + "epoch": 0.49, + "grad_norm": 0.692738056319925, + "learning_rate": 2.6999929534833067e-05, + "loss": 1.9649, + "step": 6355 + }, + { + "epoch": 0.49, + "grad_norm": 0.7722980354248258, + "learning_rate": 2.6993702788040752e-05, + "loss": 2.1176, + "step": 6356 + }, + { + "epoch": 0.49, + "grad_norm": 0.6283095319800152, + "learning_rate": 2.6987475916773375e-05, + "loss": 1.9743, + "step": 6357 + }, + { + "epoch": 0.49, + "grad_norm": 0.7239859593458725, + "learning_rate": 2.698124892141971e-05, + "loss": 1.9184, + "step": 6358 + }, + { + "epoch": 0.49, + "grad_norm": 0.7990575234945749, + "learning_rate": 2.6975021802368543e-05, + "loss": 1.9814, + "step": 6359 + }, + { + "epoch": 0.49, + "grad_norm": 0.7371279858209258, + "learning_rate": 2.6968794560008652e-05, + "loss": 1.9591, + "step": 6360 + }, + { + "epoch": 0.49, + "grad_norm": 0.57694478099228, + "learning_rate": 2.6962567194728817e-05, + "loss": 2.1411, + "step": 6361 + }, + { + "epoch": 0.49, + "grad_norm": 0.9142407508577615, + "learning_rate": 2.695633970691786e-05, + "loss": 1.9122, + "step": 6362 + }, + { + "epoch": 0.49, + "grad_norm": 0.7185005034699699, + "learning_rate": 2.6950112096964582e-05, + "loss": 1.9562, + "step": 6363 + }, + { + "epoch": 0.49, + "grad_norm": 0.7208891590353855, + "learning_rate": 2.694388436525779e-05, + "loss": 1.9924, + "step": 6364 + }, + { + "epoch": 0.49, + "grad_norm": 0.7494004827651258, + "learning_rate": 2.69376565121863e-05, + "loss": 2.1343, + "step": 6365 + }, + { + "epoch": 0.49, + "grad_norm": 0.7569180915849669, + "learning_rate": 2.6931428538138965e-05, + "loss": 1.9115, + "step": 6366 + }, + { + "epoch": 0.49, + "grad_norm": 0.5876711456789118, + "learning_rate": 2.6925200443504612e-05, + "loss": 1.9731, + "step": 6367 + }, + { + "epoch": 0.49, + "grad_norm": 0.9403923503396039, + "learning_rate": 2.6918972228672083e-05, + "loss": 2.131, + "step": 6368 + }, + { + "epoch": 0.49, + "grad_norm": 0.6975016124287862, + "learning_rate": 2.691274389403023e-05, + "loss": 1.9383, + "step": 6369 + }, + { + "epoch": 0.49, + "grad_norm": 0.6691601448721995, + "learning_rate": 2.690651543996793e-05, + "loss": 1.9907, + "step": 6370 + }, + { + "epoch": 0.49, + "grad_norm": 0.7262561516712985, + "learning_rate": 2.690028686687403e-05, + "loss": 1.8903, + "step": 6371 + }, + { + "epoch": 0.49, + "grad_norm": 0.7972537822963063, + "learning_rate": 2.689405817513742e-05, + "loss": 1.9515, + "step": 6372 + }, + { + "epoch": 0.49, + "grad_norm": 0.6614371799723311, + "learning_rate": 2.6887829365146972e-05, + "loss": 2.1152, + "step": 6373 + }, + { + "epoch": 0.49, + "grad_norm": 0.7331056166193822, + "learning_rate": 2.688160043729158e-05, + "loss": 1.9515, + "step": 6374 + }, + { + "epoch": 0.49, + "grad_norm": 0.640742495327223, + "learning_rate": 2.6875371391960154e-05, + "loss": 1.856, + "step": 6375 + }, + { + "epoch": 0.49, + "grad_norm": 0.6652746394054031, + "learning_rate": 2.6869142229541576e-05, + "loss": 1.9937, + "step": 6376 + }, + { + "epoch": 0.49, + "grad_norm": 0.623689740819728, + "learning_rate": 2.6862912950424773e-05, + "loss": 2.1254, + "step": 6377 + }, + { + "epoch": 0.49, + "grad_norm": 0.6182803784742932, + "learning_rate": 2.6856683554998664e-05, + "loss": 1.9425, + "step": 6378 + }, + { + "epoch": 0.49, + "grad_norm": 0.6217050867780594, + "learning_rate": 2.6850454043652172e-05, + "loss": 1.9413, + "step": 6379 + }, + { + "epoch": 0.49, + "grad_norm": 0.7833390775011795, + "learning_rate": 2.6844224416774232e-05, + "loss": 1.9165, + "step": 6380 + }, + { + "epoch": 0.49, + "grad_norm": 0.655663190329253, + "learning_rate": 2.683799467475379e-05, + "loss": 2.0983, + "step": 6381 + }, + { + "epoch": 0.49, + "grad_norm": 0.8302883528141125, + "learning_rate": 2.6831764817979776e-05, + "loss": 1.999, + "step": 6382 + }, + { + "epoch": 0.49, + "grad_norm": 0.7625945604520008, + "learning_rate": 2.682553484684117e-05, + "loss": 1.8862, + "step": 6383 + }, + { + "epoch": 0.49, + "grad_norm": 0.8751855810560286, + "learning_rate": 2.681930476172692e-05, + "loss": 1.9799, + "step": 6384 + }, + { + "epoch": 0.49, + "grad_norm": 0.7121713981357994, + "learning_rate": 2.6813074563026007e-05, + "loss": 2.1168, + "step": 6385 + }, + { + "epoch": 0.49, + "grad_norm": 0.617725812316514, + "learning_rate": 2.6806844251127396e-05, + "loss": 1.9398, + "step": 6386 + }, + { + "epoch": 0.49, + "grad_norm": 0.8606776568208174, + "learning_rate": 2.6800613826420073e-05, + "loss": 1.8627, + "step": 6387 + }, + { + "epoch": 0.49, + "grad_norm": 0.6623085162935066, + "learning_rate": 2.679438328929304e-05, + "loss": 1.9927, + "step": 6388 + }, + { + "epoch": 0.49, + "grad_norm": 0.7345463378154149, + "learning_rate": 2.6788152640135284e-05, + "loss": 2.1387, + "step": 6389 + }, + { + "epoch": 0.49, + "grad_norm": 0.7439848691363004, + "learning_rate": 2.678192187933581e-05, + "loss": 1.9252, + "step": 6390 + }, + { + "epoch": 0.49, + "grad_norm": 0.6534642994032598, + "learning_rate": 2.6775691007283627e-05, + "loss": 1.9147, + "step": 6391 + }, + { + "epoch": 0.49, + "grad_norm": 0.6808793804983158, + "learning_rate": 2.676946002436777e-05, + "loss": 1.8683, + "step": 6392 + }, + { + "epoch": 0.49, + "grad_norm": 0.6173372498357584, + "learning_rate": 2.6763228930977248e-05, + "loss": 2.1185, + "step": 6393 + }, + { + "epoch": 0.49, + "grad_norm": 0.694527177978532, + "learning_rate": 2.6756997727501104e-05, + "loss": 1.9406, + "step": 6394 + }, + { + "epoch": 0.49, + "grad_norm": 0.7075817026082758, + "learning_rate": 2.675076641432837e-05, + "loss": 1.9601, + "step": 6395 + }, + { + "epoch": 0.49, + "grad_norm": 0.6227519438643089, + "learning_rate": 2.6744534991848098e-05, + "loss": 1.8889, + "step": 6396 + }, + { + "epoch": 0.49, + "grad_norm": 0.6963333923752751, + "learning_rate": 2.6738303460449342e-05, + "loss": 2.122, + "step": 6397 + }, + { + "epoch": 0.49, + "grad_norm": 0.5646072270557386, + "learning_rate": 2.673207182052116e-05, + "loss": 1.9139, + "step": 6398 + }, + { + "epoch": 0.49, + "grad_norm": 0.6538531963622448, + "learning_rate": 2.6725840072452613e-05, + "loss": 1.9323, + "step": 6399 + }, + { + "epoch": 0.49, + "grad_norm": 0.5916119778924133, + "learning_rate": 2.6719608216632785e-05, + "loss": 2.1269, + "step": 6400 + }, + { + "epoch": 0.49, + "grad_norm": 0.5195173999519469, + "learning_rate": 2.6713376253450752e-05, + "loss": 1.9843, + "step": 6401 + }, + { + "epoch": 0.49, + "grad_norm": 0.6435930025104557, + "learning_rate": 2.6707144183295596e-05, + "loss": 1.9354, + "step": 6402 + }, + { + "epoch": 0.49, + "grad_norm": 0.5893239821802132, + "learning_rate": 2.6700912006556428e-05, + "loss": 1.9228, + "step": 6403 + }, + { + "epoch": 0.49, + "grad_norm": 0.6378697042324742, + "learning_rate": 2.6694679723622317e-05, + "loss": 1.9492, + "step": 6404 + }, + { + "epoch": 0.49, + "grad_norm": 0.6668632156797163, + "learning_rate": 2.6688447334882406e-05, + "loss": 2.0899, + "step": 6405 + }, + { + "epoch": 0.49, + "grad_norm": 0.5982915319177238, + "learning_rate": 2.668221484072579e-05, + "loss": 1.9246, + "step": 6406 + }, + { + "epoch": 0.49, + "grad_norm": 0.6175635190154075, + "learning_rate": 2.667598224154158e-05, + "loss": 1.9874, + "step": 6407 + }, + { + "epoch": 0.49, + "grad_norm": 0.7548753674621945, + "learning_rate": 2.6669749537718914e-05, + "loss": 1.9697, + "step": 6408 + }, + { + "epoch": 0.49, + "grad_norm": 0.6075771625951945, + "learning_rate": 2.666351672964693e-05, + "loss": 2.0868, + "step": 6409 + }, + { + "epoch": 0.49, + "grad_norm": 0.7002282873478666, + "learning_rate": 2.665728381771477e-05, + "loss": 1.8903, + "step": 6410 + }, + { + "epoch": 0.49, + "grad_norm": 0.7007065430102314, + "learning_rate": 2.6651050802311562e-05, + "loss": 1.8867, + "step": 6411 + }, + { + "epoch": 0.49, + "grad_norm": 0.5632550025659581, + "learning_rate": 2.664481768382647e-05, + "loss": 1.9242, + "step": 6412 + }, + { + "epoch": 0.49, + "grad_norm": 0.8547363471462214, + "learning_rate": 2.6638584462648663e-05, + "loss": 2.1747, + "step": 6413 + }, + { + "epoch": 0.49, + "grad_norm": 0.6100679688798435, + "learning_rate": 2.6632351139167288e-05, + "loss": 1.9371, + "step": 6414 + }, + { + "epoch": 0.49, + "grad_norm": 0.6077719150285603, + "learning_rate": 2.6626117713771526e-05, + "loss": 1.9165, + "step": 6415 + }, + { + "epoch": 0.49, + "grad_norm": 0.8199742785346006, + "learning_rate": 2.661988418685055e-05, + "loss": 1.9009, + "step": 6416 + }, + { + "epoch": 0.5, + "grad_norm": 0.6128586632987055, + "learning_rate": 2.6613650558793563e-05, + "loss": 2.0733, + "step": 6417 + }, + { + "epoch": 0.5, + "grad_norm": 0.6736447465802785, + "learning_rate": 2.6607416829989735e-05, + "loss": 1.916, + "step": 6418 + }, + { + "epoch": 0.5, + "grad_norm": 0.7715410248714414, + "learning_rate": 2.660118300082827e-05, + "loss": 1.9957, + "step": 6419 + }, + { + "epoch": 0.5, + "grad_norm": 0.5569462191645618, + "learning_rate": 2.659494907169837e-05, + "loss": 1.9038, + "step": 6420 + }, + { + "epoch": 0.5, + "grad_norm": 0.7890419025935528, + "learning_rate": 2.6588715042989253e-05, + "loss": 2.0887, + "step": 6421 + }, + { + "epoch": 0.5, + "grad_norm": 0.6375991649496321, + "learning_rate": 2.658248091509013e-05, + "loss": 1.9304, + "step": 6422 + }, + { + "epoch": 0.5, + "grad_norm": 0.6158677615368133, + "learning_rate": 2.6576246688390216e-05, + "loss": 1.8878, + "step": 6423 + }, + { + "epoch": 0.5, + "grad_norm": 0.608750926854217, + "learning_rate": 2.657001236327875e-05, + "loss": 1.8975, + "step": 6424 + }, + { + "epoch": 0.5, + "grad_norm": 0.7927305185666081, + "learning_rate": 2.6563777940144963e-05, + "loss": 2.074, + "step": 6425 + }, + { + "epoch": 0.5, + "grad_norm": 0.5685464491802472, + "learning_rate": 2.6557543419378096e-05, + "loss": 2.0006, + "step": 6426 + }, + { + "epoch": 0.5, + "grad_norm": 0.7335274472471456, + "learning_rate": 2.6551308801367385e-05, + "loss": 1.926, + "step": 6427 + }, + { + "epoch": 0.5, + "grad_norm": 0.6472048511415299, + "learning_rate": 2.6545074086502108e-05, + "loss": 1.8998, + "step": 6428 + }, + { + "epoch": 0.5, + "grad_norm": 0.6226244305351178, + "learning_rate": 2.6538839275171495e-05, + "loss": 2.1072, + "step": 6429 + }, + { + "epoch": 0.5, + "grad_norm": 0.6405350237664789, + "learning_rate": 2.653260436776483e-05, + "loss": 1.9084, + "step": 6430 + }, + { + "epoch": 0.5, + "grad_norm": 0.722162397830998, + "learning_rate": 2.6526369364671383e-05, + "loss": 1.9867, + "step": 6431 + }, + { + "epoch": 0.5, + "grad_norm": 0.5839888910470223, + "learning_rate": 2.6520134266280423e-05, + "loss": 1.9984, + "step": 6432 + }, + { + "epoch": 0.5, + "grad_norm": 0.9008688104355426, + "learning_rate": 2.651389907298123e-05, + "loss": 2.1423, + "step": 6433 + }, + { + "epoch": 0.5, + "grad_norm": 0.6555502658788382, + "learning_rate": 2.6507663785163102e-05, + "loss": 1.8524, + "step": 6434 + }, + { + "epoch": 0.5, + "grad_norm": 0.7458811807973122, + "learning_rate": 2.6501428403215333e-05, + "loss": 1.8861, + "step": 6435 + }, + { + "epoch": 0.5, + "grad_norm": 0.6126057195498361, + "learning_rate": 2.6495192927527218e-05, + "loss": 1.9225, + "step": 6436 + }, + { + "epoch": 0.5, + "grad_norm": 0.8195722288029695, + "learning_rate": 2.648895735848807e-05, + "loss": 2.0759, + "step": 6437 + }, + { + "epoch": 0.5, + "grad_norm": 0.5526570915836037, + "learning_rate": 2.6482721696487194e-05, + "loss": 2.0409, + "step": 6438 + }, + { + "epoch": 0.5, + "grad_norm": 0.8074294850016852, + "learning_rate": 2.6476485941913907e-05, + "loss": 1.9042, + "step": 6439 + }, + { + "epoch": 0.5, + "grad_norm": 0.6485487909687631, + "learning_rate": 2.6470250095157544e-05, + "loss": 1.952, + "step": 6440 + }, + { + "epoch": 0.5, + "grad_norm": 0.8235302213354181, + "learning_rate": 2.6464014156607425e-05, + "loss": 2.1099, + "step": 6441 + }, + { + "epoch": 0.5, + "grad_norm": 0.7040433830711543, + "learning_rate": 2.645777812665289e-05, + "loss": 1.9142, + "step": 6442 + }, + { + "epoch": 0.5, + "grad_norm": 0.6830980778597556, + "learning_rate": 2.6451542005683273e-05, + "loss": 1.8876, + "step": 6443 + }, + { + "epoch": 0.5, + "grad_norm": 0.6751410545835068, + "learning_rate": 2.6445305794087933e-05, + "loss": 1.9794, + "step": 6444 + }, + { + "epoch": 0.5, + "grad_norm": 0.8226084228673222, + "learning_rate": 2.6439069492256207e-05, + "loss": 2.1037, + "step": 6445 + }, + { + "epoch": 0.5, + "grad_norm": 0.7110663717682013, + "learning_rate": 2.6432833100577464e-05, + "loss": 1.9225, + "step": 6446 + }, + { + "epoch": 0.5, + "grad_norm": 0.7825733965495835, + "learning_rate": 2.6426596619441064e-05, + "loss": 1.9231, + "step": 6447 + }, + { + "epoch": 0.5, + "grad_norm": 0.6180548781077644, + "learning_rate": 2.6420360049236374e-05, + "loss": 1.9135, + "step": 6448 + }, + { + "epoch": 0.5, + "grad_norm": 0.6477929110806399, + "learning_rate": 2.6414123390352774e-05, + "loss": 2.1098, + "step": 6449 + }, + { + "epoch": 0.5, + "grad_norm": 0.6782042156932895, + "learning_rate": 2.6407886643179642e-05, + "loss": 1.9674, + "step": 6450 + }, + { + "epoch": 0.5, + "grad_norm": 0.6313235860653843, + "learning_rate": 2.6401649808106365e-05, + "loss": 1.9113, + "step": 6451 + }, + { + "epoch": 0.5, + "grad_norm": 0.590025222204397, + "learning_rate": 2.6395412885522332e-05, + "loss": 1.9045, + "step": 6452 + }, + { + "epoch": 0.5, + "grad_norm": 0.7673810216325135, + "learning_rate": 2.638917587581694e-05, + "loss": 2.1172, + "step": 6453 + }, + { + "epoch": 0.5, + "grad_norm": 0.6403630723685011, + "learning_rate": 2.638293877937959e-05, + "loss": 1.9157, + "step": 6454 + }, + { + "epoch": 0.5, + "grad_norm": 0.7478088267732901, + "learning_rate": 2.6376701596599685e-05, + "loss": 1.908, + "step": 6455 + }, + { + "epoch": 0.5, + "grad_norm": 0.7785362892239078, + "learning_rate": 2.6370464327866646e-05, + "loss": 1.8698, + "step": 6456 + }, + { + "epoch": 0.5, + "grad_norm": 0.6430051336674103, + "learning_rate": 2.6364226973569893e-05, + "loss": 2.1497, + "step": 6457 + }, + { + "epoch": 0.5, + "grad_norm": 0.6146206002096767, + "learning_rate": 2.6357989534098848e-05, + "loss": 1.9205, + "step": 6458 + }, + { + "epoch": 0.5, + "grad_norm": 0.5705869803621944, + "learning_rate": 2.635175200984293e-05, + "loss": 1.8827, + "step": 6459 + }, + { + "epoch": 0.5, + "grad_norm": 0.7840429222527192, + "learning_rate": 2.634551440119159e-05, + "loss": 1.9066, + "step": 6460 + }, + { + "epoch": 0.5, + "grad_norm": 0.580990543547921, + "learning_rate": 2.633927670853425e-05, + "loss": 2.1212, + "step": 6461 + }, + { + "epoch": 0.5, + "grad_norm": 0.9785381031087194, + "learning_rate": 2.6333038932260366e-05, + "loss": 1.9464, + "step": 6462 + }, + { + "epoch": 0.5, + "grad_norm": 0.5972155522336412, + "learning_rate": 2.6326801072759388e-05, + "loss": 1.9938, + "step": 6463 + }, + { + "epoch": 0.5, + "grad_norm": 0.6739338406529702, + "learning_rate": 2.6320563130420762e-05, + "loss": 1.9367, + "step": 6464 + }, + { + "epoch": 0.5, + "grad_norm": 0.7606005480781356, + "learning_rate": 2.6314325105633958e-05, + "loss": 2.058, + "step": 6465 + }, + { + "epoch": 0.5, + "grad_norm": 0.5444003072458795, + "learning_rate": 2.630808699878844e-05, + "loss": 1.9059, + "step": 6466 + }, + { + "epoch": 0.5, + "grad_norm": 0.603631708142459, + "learning_rate": 2.6301848810273676e-05, + "loss": 1.9152, + "step": 6467 + }, + { + "epoch": 0.5, + "grad_norm": 0.682464709504073, + "learning_rate": 2.6295610540479144e-05, + "loss": 1.9084, + "step": 6468 + }, + { + "epoch": 0.5, + "grad_norm": 0.6001165683683274, + "learning_rate": 2.628937218979432e-05, + "loss": 2.1897, + "step": 6469 + }, + { + "epoch": 0.5, + "grad_norm": 0.6119697145205543, + "learning_rate": 2.6283133758608697e-05, + "loss": 1.9305, + "step": 6470 + }, + { + "epoch": 0.5, + "grad_norm": 0.6257823626186746, + "learning_rate": 2.6276895247311762e-05, + "loss": 1.9475, + "step": 6471 + }, + { + "epoch": 0.5, + "grad_norm": 0.585516300421782, + "learning_rate": 2.627065665629301e-05, + "loss": 1.9104, + "step": 6472 + }, + { + "epoch": 0.5, + "grad_norm": 0.6904933322458482, + "learning_rate": 2.6264417985941934e-05, + "loss": 2.1072, + "step": 6473 + }, + { + "epoch": 0.5, + "grad_norm": 0.5985068987064216, + "learning_rate": 2.625817923664806e-05, + "loss": 1.8894, + "step": 6474 + }, + { + "epoch": 0.5, + "grad_norm": 0.7306728873087909, + "learning_rate": 2.625194040880089e-05, + "loss": 1.9422, + "step": 6475 + }, + { + "epoch": 0.5, + "grad_norm": 0.6368274552658867, + "learning_rate": 2.6245701502789926e-05, + "loss": 1.8867, + "step": 6476 + }, + { + "epoch": 0.5, + "grad_norm": 0.7380934310298212, + "learning_rate": 2.6239462519004708e-05, + "loss": 2.1191, + "step": 6477 + }, + { + "epoch": 0.5, + "grad_norm": 0.5841557164929541, + "learning_rate": 2.6233223457834753e-05, + "loss": 1.8899, + "step": 6478 + }, + { + "epoch": 0.5, + "grad_norm": 0.6219095969145549, + "learning_rate": 2.6226984319669588e-05, + "loss": 1.9118, + "step": 6479 + }, + { + "epoch": 0.5, + "grad_norm": 0.5612442367264727, + "learning_rate": 2.6220745104898747e-05, + "loss": 1.9137, + "step": 6480 + }, + { + "epoch": 0.5, + "grad_norm": 0.6326602545118899, + "learning_rate": 2.6214505813911782e-05, + "loss": 2.0018, + "step": 6481 + }, + { + "epoch": 0.5, + "grad_norm": 0.5925353628429101, + "learning_rate": 2.6208266447098235e-05, + "loss": 2.0961, + "step": 6482 + }, + { + "epoch": 0.5, + "grad_norm": 0.6412055795370248, + "learning_rate": 2.620202700484764e-05, + "loss": 1.9081, + "step": 6483 + }, + { + "epoch": 0.5, + "grad_norm": 0.6140144795272754, + "learning_rate": 2.6195787487549554e-05, + "loss": 1.9592, + "step": 6484 + }, + { + "epoch": 0.5, + "grad_norm": 0.6572373121609558, + "learning_rate": 2.6189547895593562e-05, + "loss": 2.1652, + "step": 6485 + }, + { + "epoch": 0.5, + "grad_norm": 0.5594062648790008, + "learning_rate": 2.6183308229369196e-05, + "loss": 1.9392, + "step": 6486 + }, + { + "epoch": 0.5, + "grad_norm": 0.6535715238490272, + "learning_rate": 2.6177068489266037e-05, + "loss": 1.8974, + "step": 6487 + }, + { + "epoch": 0.5, + "grad_norm": 0.5662751974351232, + "learning_rate": 2.617082867567366e-05, + "loss": 1.9877, + "step": 6488 + }, + { + "epoch": 0.5, + "grad_norm": 0.6078156268936326, + "learning_rate": 2.6164588788981636e-05, + "loss": 2.0873, + "step": 6489 + }, + { + "epoch": 0.5, + "grad_norm": 0.6188123406915048, + "learning_rate": 2.615834882957955e-05, + "loss": 1.9034, + "step": 6490 + }, + { + "epoch": 0.5, + "grad_norm": 0.6312126786743251, + "learning_rate": 2.6152108797856988e-05, + "loss": 1.9074, + "step": 6491 + }, + { + "epoch": 0.5, + "grad_norm": 0.6345859343140237, + "learning_rate": 2.614586869420354e-05, + "loss": 1.9321, + "step": 6492 + }, + { + "epoch": 0.5, + "grad_norm": 0.7595484134926646, + "learning_rate": 2.6139628519008806e-05, + "loss": 2.1029, + "step": 6493 + }, + { + "epoch": 0.5, + "grad_norm": 0.8093661214668302, + "learning_rate": 2.613338827266238e-05, + "loss": 1.9802, + "step": 6494 + }, + { + "epoch": 0.5, + "grad_norm": 0.6021368062009931, + "learning_rate": 2.6127147955553864e-05, + "loss": 1.8901, + "step": 6495 + }, + { + "epoch": 0.5, + "grad_norm": 0.7603385245092367, + "learning_rate": 2.6120907568072883e-05, + "loss": 1.9472, + "step": 6496 + }, + { + "epoch": 0.5, + "grad_norm": 0.7216827404475619, + "learning_rate": 2.611466711060902e-05, + "loss": 2.1154, + "step": 6497 + }, + { + "epoch": 0.5, + "grad_norm": 0.5904495242187202, + "learning_rate": 2.610842658355192e-05, + "loss": 1.8947, + "step": 6498 + }, + { + "epoch": 0.5, + "grad_norm": 0.6135900713172538, + "learning_rate": 2.6102185987291193e-05, + "loss": 1.922, + "step": 6499 + }, + { + "epoch": 0.5, + "grad_norm": 0.7206665211888598, + "learning_rate": 2.6095945322216475e-05, + "loss": 1.9861, + "step": 6500 + }, + { + "epoch": 0.5, + "grad_norm": 0.6640372432298424, + "learning_rate": 2.6089704588717375e-05, + "loss": 2.1029, + "step": 6501 + }, + { + "epoch": 0.5, + "grad_norm": 0.5726830092544704, + "learning_rate": 2.6083463787183545e-05, + "loss": 1.9174, + "step": 6502 + }, + { + "epoch": 0.5, + "grad_norm": 0.6900662103599454, + "learning_rate": 2.607722291800463e-05, + "loss": 1.8905, + "step": 6503 + }, + { + "epoch": 0.5, + "grad_norm": 0.7248149977913002, + "learning_rate": 2.6070981981570253e-05, + "loss": 1.9053, + "step": 6504 + }, + { + "epoch": 0.5, + "grad_norm": 0.6124037329195494, + "learning_rate": 2.6064740978270068e-05, + "loss": 2.0932, + "step": 6505 + }, + { + "epoch": 0.5, + "grad_norm": 0.6154079137857783, + "learning_rate": 2.6058499908493734e-05, + "loss": 1.95, + "step": 6506 + }, + { + "epoch": 0.5, + "grad_norm": 0.6410602571612857, + "learning_rate": 2.6052258772630904e-05, + "loss": 1.9309, + "step": 6507 + }, + { + "epoch": 0.5, + "grad_norm": 0.7614619568929409, + "learning_rate": 2.6046017571071233e-05, + "loss": 1.914, + "step": 6508 + }, + { + "epoch": 0.5, + "grad_norm": 0.6217193523042852, + "learning_rate": 2.6039776304204394e-05, + "loss": 2.0801, + "step": 6509 + }, + { + "epoch": 0.5, + "grad_norm": 0.7819015384910641, + "learning_rate": 2.603353497242004e-05, + "loss": 1.9156, + "step": 6510 + }, + { + "epoch": 0.5, + "grad_norm": 0.6537187431996537, + "learning_rate": 2.6027293576107858e-05, + "loss": 1.9082, + "step": 6511 + }, + { + "epoch": 0.5, + "grad_norm": 0.673756057435738, + "learning_rate": 2.6021052115657513e-05, + "loss": 1.9758, + "step": 6512 + }, + { + "epoch": 0.5, + "grad_norm": 0.6239100848813969, + "learning_rate": 2.6014810591458693e-05, + "loss": 1.969, + "step": 6513 + }, + { + "epoch": 0.5, + "grad_norm": 0.706847355813064, + "learning_rate": 2.600856900390108e-05, + "loss": 2.1096, + "step": 6514 + }, + { + "epoch": 0.5, + "grad_norm": 0.5976473344697637, + "learning_rate": 2.600232735337436e-05, + "loss": 1.9286, + "step": 6515 + }, + { + "epoch": 0.5, + "grad_norm": 0.5737124371633711, + "learning_rate": 2.5996085640268226e-05, + "loss": 1.9315, + "step": 6516 + }, + { + "epoch": 0.5, + "grad_norm": 0.674491351773786, + "learning_rate": 2.5989843864972367e-05, + "loss": 2.1327, + "step": 6517 + }, + { + "epoch": 0.5, + "grad_norm": 0.6371537861890869, + "learning_rate": 2.5983602027876497e-05, + "loss": 1.9665, + "step": 6518 + }, + { + "epoch": 0.5, + "grad_norm": 0.5739627246603651, + "learning_rate": 2.597736012937031e-05, + "loss": 2.0252, + "step": 6519 + }, + { + "epoch": 0.5, + "grad_norm": 0.5989448315254908, + "learning_rate": 2.597111816984351e-05, + "loss": 1.917, + "step": 6520 + }, + { + "epoch": 0.5, + "grad_norm": 0.7805313688530142, + "learning_rate": 2.596487614968583e-05, + "loss": 2.1378, + "step": 6521 + }, + { + "epoch": 0.5, + "grad_norm": 0.556115292494162, + "learning_rate": 2.595863406928695e-05, + "loss": 1.9427, + "step": 6522 + }, + { + "epoch": 0.5, + "grad_norm": 0.6606468995493125, + "learning_rate": 2.5952391929036607e-05, + "loss": 1.9071, + "step": 6523 + }, + { + "epoch": 0.5, + "grad_norm": 0.7276127053848916, + "learning_rate": 2.5946149729324532e-05, + "loss": 1.9441, + "step": 6524 + }, + { + "epoch": 0.5, + "grad_norm": 0.647926321420105, + "learning_rate": 2.593990747054044e-05, + "loss": 2.0367, + "step": 6525 + }, + { + "epoch": 0.5, + "grad_norm": 0.6235778308925776, + "learning_rate": 2.5933665153074066e-05, + "loss": 2.0931, + "step": 6526 + }, + { + "epoch": 0.5, + "grad_norm": 0.6940199105293988, + "learning_rate": 2.592742277731513e-05, + "loss": 1.877, + "step": 6527 + }, + { + "epoch": 0.5, + "grad_norm": 0.6561189324662845, + "learning_rate": 2.5921180343653397e-05, + "loss": 1.8998, + "step": 6528 + }, + { + "epoch": 0.5, + "grad_norm": 0.7855837172417871, + "learning_rate": 2.5914937852478582e-05, + "loss": 2.1151, + "step": 6529 + }, + { + "epoch": 0.5, + "grad_norm": 0.6618184785480731, + "learning_rate": 2.5908695304180436e-05, + "loss": 1.9466, + "step": 6530 + }, + { + "epoch": 0.5, + "grad_norm": 0.6232953111634054, + "learning_rate": 2.5902452699148706e-05, + "loss": 1.9658, + "step": 6531 + }, + { + "epoch": 0.5, + "grad_norm": 0.7454729600205101, + "learning_rate": 2.5896210037773155e-05, + "loss": 1.9026, + "step": 6532 + }, + { + "epoch": 0.5, + "grad_norm": 0.7197768050331184, + "learning_rate": 2.5889967320443525e-05, + "loss": 2.0729, + "step": 6533 + }, + { + "epoch": 0.5, + "grad_norm": 0.6301825914804862, + "learning_rate": 2.588372454754958e-05, + "loss": 1.9216, + "step": 6534 + }, + { + "epoch": 0.5, + "grad_norm": 0.7410699171847606, + "learning_rate": 2.5877481719481084e-05, + "loss": 1.961, + "step": 6535 + }, + { + "epoch": 0.5, + "grad_norm": 0.7172321039022076, + "learning_rate": 2.58712388366278e-05, + "loss": 1.9479, + "step": 6536 + }, + { + "epoch": 0.5, + "grad_norm": 0.6191187844892279, + "learning_rate": 2.5864995899379492e-05, + "loss": 1.9541, + "step": 6537 + }, + { + "epoch": 0.5, + "grad_norm": 0.8812027701116729, + "learning_rate": 2.5858752908125933e-05, + "loss": 2.143, + "step": 6538 + }, + { + "epoch": 0.5, + "grad_norm": 0.6271000850811852, + "learning_rate": 2.5852509863256906e-05, + "loss": 1.9163, + "step": 6539 + }, + { + "epoch": 0.5, + "grad_norm": 0.7495511646401828, + "learning_rate": 2.5846266765162185e-05, + "loss": 1.9088, + "step": 6540 + }, + { + "epoch": 0.5, + "grad_norm": 0.7451757222132465, + "learning_rate": 2.5840023614231555e-05, + "loss": 2.1284, + "step": 6541 + }, + { + "epoch": 0.5, + "grad_norm": 0.6579809278641219, + "learning_rate": 2.5833780410854803e-05, + "loss": 1.8942, + "step": 6542 + }, + { + "epoch": 0.5, + "grad_norm": 0.7037405966466191, + "learning_rate": 2.5827537155421717e-05, + "loss": 1.9404, + "step": 6543 + }, + { + "epoch": 0.5, + "grad_norm": 0.6230426557387939, + "learning_rate": 2.5821293848322074e-05, + "loss": 2.0121, + "step": 6544 + }, + { + "epoch": 0.5, + "grad_norm": 0.6133644587188964, + "learning_rate": 2.5815050489945693e-05, + "loss": 1.8939, + "step": 6545 + }, + { + "epoch": 0.51, + "grad_norm": 0.6627209813478885, + "learning_rate": 2.5808807080682357e-05, + "loss": 2.1186, + "step": 6546 + }, + { + "epoch": 0.51, + "grad_norm": 0.670813289271015, + "learning_rate": 2.5802563620921877e-05, + "loss": 1.8857, + "step": 6547 + }, + { + "epoch": 0.51, + "grad_norm": 0.6815405987066258, + "learning_rate": 2.5796320111054046e-05, + "loss": 1.93, + "step": 6548 + }, + { + "epoch": 0.51, + "grad_norm": 0.6400182904428378, + "learning_rate": 2.5790076551468678e-05, + "loss": 2.113, + "step": 6549 + }, + { + "epoch": 0.51, + "grad_norm": 0.696414317097647, + "learning_rate": 2.5783832942555597e-05, + "loss": 2.0038, + "step": 6550 + }, + { + "epoch": 0.51, + "grad_norm": 0.6315532102106972, + "learning_rate": 2.57775892847046e-05, + "loss": 1.8825, + "step": 6551 + }, + { + "epoch": 0.51, + "grad_norm": 0.6095253865456688, + "learning_rate": 2.5771345578305505e-05, + "loss": 1.8986, + "step": 6552 + }, + { + "epoch": 0.51, + "grad_norm": 0.754273026621178, + "learning_rate": 2.5765101823748138e-05, + "loss": 2.0933, + "step": 6553 + }, + { + "epoch": 0.51, + "grad_norm": 0.6101022920436412, + "learning_rate": 2.5758858021422327e-05, + "loss": 1.9192, + "step": 6554 + }, + { + "epoch": 0.51, + "grad_norm": 0.6922359524671611, + "learning_rate": 2.5752614171717888e-05, + "loss": 1.9294, + "step": 6555 + }, + { + "epoch": 0.51, + "grad_norm": 0.7249867819013628, + "learning_rate": 2.574637027502465e-05, + "loss": 1.988, + "step": 6556 + }, + { + "epoch": 0.51, + "grad_norm": 0.6196852222361716, + "learning_rate": 2.5740126331732457e-05, + "loss": 1.9272, + "step": 6557 + }, + { + "epoch": 0.51, + "grad_norm": 0.7469761063330268, + "learning_rate": 2.573388234223113e-05, + "loss": 2.0865, + "step": 6558 + }, + { + "epoch": 0.51, + "grad_norm": 0.5995948496574428, + "learning_rate": 2.5727638306910518e-05, + "loss": 1.9225, + "step": 6559 + }, + { + "epoch": 0.51, + "grad_norm": 0.6704497077651614, + "learning_rate": 2.5721394226160454e-05, + "loss": 1.8708, + "step": 6560 + }, + { + "epoch": 0.51, + "grad_norm": 0.6901500235670213, + "learning_rate": 2.571515010037079e-05, + "loss": 2.1018, + "step": 6561 + }, + { + "epoch": 0.51, + "grad_norm": 0.7004204854742906, + "learning_rate": 2.5708905929931365e-05, + "loss": 1.9743, + "step": 6562 + }, + { + "epoch": 0.51, + "grad_norm": 0.6347871873770429, + "learning_rate": 2.570266171523203e-05, + "loss": 1.9534, + "step": 6563 + }, + { + "epoch": 0.51, + "grad_norm": 0.6473977143340702, + "learning_rate": 2.5696417456662635e-05, + "loss": 1.936, + "step": 6564 + }, + { + "epoch": 0.51, + "grad_norm": 0.6020022940889188, + "learning_rate": 2.569017315461304e-05, + "loss": 2.1082, + "step": 6565 + }, + { + "epoch": 0.51, + "grad_norm": 0.6402247237128365, + "learning_rate": 2.5683928809473097e-05, + "loss": 1.9557, + "step": 6566 + }, + { + "epoch": 0.51, + "grad_norm": 0.5779581137798265, + "learning_rate": 2.567768442163267e-05, + "loss": 1.9866, + "step": 6567 + }, + { + "epoch": 0.51, + "grad_norm": 0.5404170883335784, + "learning_rate": 2.5671439991481626e-05, + "loss": 2.0246, + "step": 6568 + }, + { + "epoch": 0.51, + "grad_norm": 0.6021764084011956, + "learning_rate": 2.5665195519409825e-05, + "loss": 1.9269, + "step": 6569 + }, + { + "epoch": 0.51, + "grad_norm": 0.572015219090514, + "learning_rate": 2.5658951005807124e-05, + "loss": 2.134, + "step": 6570 + }, + { + "epoch": 0.51, + "grad_norm": 0.5833732534412206, + "learning_rate": 2.5652706451063414e-05, + "loss": 1.8916, + "step": 6571 + }, + { + "epoch": 0.51, + "grad_norm": 0.6407664972173585, + "learning_rate": 2.564646185556856e-05, + "loss": 1.8819, + "step": 6572 + }, + { + "epoch": 0.51, + "grad_norm": 0.5724012674693831, + "learning_rate": 2.564021721971243e-05, + "loss": 2.0751, + "step": 6573 + }, + { + "epoch": 0.51, + "grad_norm": 0.6070858081393523, + "learning_rate": 2.5633972543884903e-05, + "loss": 1.9252, + "step": 6574 + }, + { + "epoch": 0.51, + "grad_norm": 0.5840496408222126, + "learning_rate": 2.5627727828475884e-05, + "loss": 1.9422, + "step": 6575 + }, + { + "epoch": 0.51, + "grad_norm": 0.5945039173567086, + "learning_rate": 2.562148307387523e-05, + "loss": 1.8583, + "step": 6576 + }, + { + "epoch": 0.51, + "grad_norm": 0.7302556369481313, + "learning_rate": 2.561523828047283e-05, + "loss": 1.9206, + "step": 6577 + }, + { + "epoch": 0.51, + "grad_norm": 0.7939308568202998, + "learning_rate": 2.5608993448658576e-05, + "loss": 2.1211, + "step": 6578 + }, + { + "epoch": 0.51, + "grad_norm": 0.6155024660609106, + "learning_rate": 2.5602748578822366e-05, + "loss": 1.9103, + "step": 6579 + }, + { + "epoch": 0.51, + "grad_norm": 0.7240653221384331, + "learning_rate": 2.5596503671354082e-05, + "loss": 1.9608, + "step": 6580 + }, + { + "epoch": 0.51, + "grad_norm": 0.5702597143346358, + "learning_rate": 2.5590258726643622e-05, + "loss": 1.9685, + "step": 6581 + }, + { + "epoch": 0.51, + "grad_norm": 0.5627715759863336, + "learning_rate": 2.5584013745080887e-05, + "loss": 2.1114, + "step": 6582 + }, + { + "epoch": 0.51, + "grad_norm": 0.6022562582895445, + "learning_rate": 2.557776872705578e-05, + "loss": 1.9566, + "step": 6583 + }, + { + "epoch": 0.51, + "grad_norm": 0.6007677309871938, + "learning_rate": 2.5571523672958187e-05, + "loss": 1.9237, + "step": 6584 + }, + { + "epoch": 0.51, + "grad_norm": 0.612852549201217, + "learning_rate": 2.556527858317803e-05, + "loss": 2.1438, + "step": 6585 + }, + { + "epoch": 0.51, + "grad_norm": 0.578082825537637, + "learning_rate": 2.5559033458105214e-05, + "loss": 1.8709, + "step": 6586 + }, + { + "epoch": 0.51, + "grad_norm": 0.6058746397140969, + "learning_rate": 2.555278829812963e-05, + "loss": 2.0134, + "step": 6587 + }, + { + "epoch": 0.51, + "grad_norm": 0.6107176893918208, + "learning_rate": 2.5546543103641208e-05, + "loss": 1.9255, + "step": 6588 + }, + { + "epoch": 0.51, + "grad_norm": 0.5478592703328194, + "learning_rate": 2.5540297875029855e-05, + "loss": 1.8468, + "step": 6589 + }, + { + "epoch": 0.51, + "grad_norm": 0.6146878771812208, + "learning_rate": 2.553405261268549e-05, + "loss": 2.117, + "step": 6590 + }, + { + "epoch": 0.51, + "grad_norm": 0.5628835116745478, + "learning_rate": 2.5527807316998014e-05, + "loss": 1.95, + "step": 6591 + }, + { + "epoch": 0.51, + "grad_norm": 0.6261919377483969, + "learning_rate": 2.552156198835737e-05, + "loss": 1.9386, + "step": 6592 + }, + { + "epoch": 0.51, + "grad_norm": 0.5894541127077549, + "learning_rate": 2.5515316627153467e-05, + "loss": 1.9644, + "step": 6593 + }, + { + "epoch": 0.51, + "grad_norm": 0.5803564882349125, + "learning_rate": 2.550907123377623e-05, + "loss": 2.1085, + "step": 6594 + }, + { + "epoch": 0.51, + "grad_norm": 0.5893129725344164, + "learning_rate": 2.5502825808615576e-05, + "loss": 1.9014, + "step": 6595 + }, + { + "epoch": 0.51, + "grad_norm": 0.5996772034919101, + "learning_rate": 2.549658035206145e-05, + "loss": 1.9233, + "step": 6596 + }, + { + "epoch": 0.51, + "grad_norm": 0.5835587734110113, + "learning_rate": 2.549033486450378e-05, + "loss": 1.9002, + "step": 6597 + }, + { + "epoch": 0.51, + "grad_norm": 0.622206582894593, + "learning_rate": 2.548408934633248e-05, + "loss": 2.1586, + "step": 6598 + }, + { + "epoch": 0.51, + "grad_norm": 0.630864665239744, + "learning_rate": 2.5477843797937484e-05, + "loss": 1.9711, + "step": 6599 + }, + { + "epoch": 0.51, + "grad_norm": 0.6289054898510056, + "learning_rate": 2.5471598219708752e-05, + "loss": 1.9334, + "step": 6600 + }, + { + "epoch": 0.51, + "grad_norm": 0.7711055284334014, + "learning_rate": 2.5465352612036203e-05, + "loss": 1.9544, + "step": 6601 + }, + { + "epoch": 0.51, + "grad_norm": 0.5941108726458566, + "learning_rate": 2.5459106975309776e-05, + "loss": 2.0669, + "step": 6602 + }, + { + "epoch": 0.51, + "grad_norm": 0.7933195619145668, + "learning_rate": 2.5452861309919413e-05, + "loss": 1.9245, + "step": 6603 + }, + { + "epoch": 0.51, + "grad_norm": 0.6616488977692362, + "learning_rate": 2.544661561625506e-05, + "loss": 1.9095, + "step": 6604 + }, + { + "epoch": 0.51, + "grad_norm": 0.6973954469667004, + "learning_rate": 2.544036989470666e-05, + "loss": 2.1797, + "step": 6605 + }, + { + "epoch": 0.51, + "grad_norm": 0.6452638391147235, + "learning_rate": 2.5434124145664158e-05, + "loss": 1.9915, + "step": 6606 + }, + { + "epoch": 0.51, + "grad_norm": 0.6125737213034402, + "learning_rate": 2.54278783695175e-05, + "loss": 1.928, + "step": 6607 + }, + { + "epoch": 0.51, + "grad_norm": 0.7839168365955907, + "learning_rate": 2.542163256665664e-05, + "loss": 1.925, + "step": 6608 + }, + { + "epoch": 0.51, + "grad_norm": 0.6268008301437035, + "learning_rate": 2.5415386737471525e-05, + "loss": 1.8918, + "step": 6609 + }, + { + "epoch": 0.51, + "grad_norm": 0.6244913844101304, + "learning_rate": 2.540914088235211e-05, + "loss": 2.1603, + "step": 6610 + }, + { + "epoch": 0.51, + "grad_norm": 0.7305445559108353, + "learning_rate": 2.5402895001688347e-05, + "loss": 1.8815, + "step": 6611 + }, + { + "epoch": 0.51, + "grad_norm": 0.6097771757402122, + "learning_rate": 2.5396649095870202e-05, + "loss": 2.0172, + "step": 6612 + }, + { + "epoch": 0.51, + "grad_norm": 0.7104598836472195, + "learning_rate": 2.539040316528762e-05, + "loss": 1.9346, + "step": 6613 + }, + { + "epoch": 0.51, + "grad_norm": 0.8201450661432044, + "learning_rate": 2.5384157210330562e-05, + "loss": 2.1178, + "step": 6614 + }, + { + "epoch": 0.51, + "grad_norm": 0.5908416191008313, + "learning_rate": 2.5377911231388994e-05, + "loss": 1.8939, + "step": 6615 + }, + { + "epoch": 0.51, + "grad_norm": 0.7295479850326737, + "learning_rate": 2.537166522885287e-05, + "loss": 1.9063, + "step": 6616 + }, + { + "epoch": 0.51, + "grad_norm": 0.7544398311537737, + "learning_rate": 2.5365419203112163e-05, + "loss": 2.0982, + "step": 6617 + }, + { + "epoch": 0.51, + "grad_norm": 0.6296044960054024, + "learning_rate": 2.5359173154556837e-05, + "loss": 1.9909, + "step": 6618 + }, + { + "epoch": 0.51, + "grad_norm": 0.7362914480592476, + "learning_rate": 2.5352927083576856e-05, + "loss": 1.88, + "step": 6619 + }, + { + "epoch": 0.51, + "grad_norm": 0.6252893834786406, + "learning_rate": 2.5346680990562183e-05, + "loss": 1.9465, + "step": 6620 + }, + { + "epoch": 0.51, + "grad_norm": 0.6992760315702686, + "learning_rate": 2.5340434875902786e-05, + "loss": 1.8827, + "step": 6621 + }, + { + "epoch": 0.51, + "grad_norm": 0.7032932056196257, + "learning_rate": 2.533418873998865e-05, + "loss": 2.0916, + "step": 6622 + }, + { + "epoch": 0.51, + "grad_norm": 0.5785093266873672, + "learning_rate": 2.5327942583209737e-05, + "loss": 1.9074, + "step": 6623 + }, + { + "epoch": 0.51, + "grad_norm": 0.7826355115178765, + "learning_rate": 2.5321696405956018e-05, + "loss": 1.905, + "step": 6624 + }, + { + "epoch": 0.51, + "grad_norm": 0.5951128670746993, + "learning_rate": 2.5315450208617474e-05, + "loss": 1.9331, + "step": 6625 + }, + { + "epoch": 0.51, + "grad_norm": 0.6259712750432452, + "learning_rate": 2.5309203991584073e-05, + "loss": 2.111, + "step": 6626 + }, + { + "epoch": 0.51, + "grad_norm": 0.6766416194774147, + "learning_rate": 2.5302957755245805e-05, + "loss": 1.9084, + "step": 6627 + }, + { + "epoch": 0.51, + "grad_norm": 0.6037285168651996, + "learning_rate": 2.529671149999263e-05, + "loss": 1.8978, + "step": 6628 + }, + { + "epoch": 0.51, + "grad_norm": 0.6127325765893574, + "learning_rate": 2.5290465226214543e-05, + "loss": 1.9681, + "step": 6629 + }, + { + "epoch": 0.51, + "grad_norm": 0.5841792177438028, + "learning_rate": 2.528421893430152e-05, + "loss": 2.1269, + "step": 6630 + }, + { + "epoch": 0.51, + "grad_norm": 0.617397095474975, + "learning_rate": 2.5277972624643537e-05, + "loss": 1.908, + "step": 6631 + }, + { + "epoch": 0.51, + "grad_norm": 0.5633587189688742, + "learning_rate": 2.527172629763058e-05, + "loss": 1.9798, + "step": 6632 + }, + { + "epoch": 0.51, + "grad_norm": 0.589941329595494, + "learning_rate": 2.5265479953652637e-05, + "loss": 1.921, + "step": 6633 + }, + { + "epoch": 0.51, + "grad_norm": 0.6327104751330486, + "learning_rate": 2.525923359309969e-05, + "loss": 2.1278, + "step": 6634 + }, + { + "epoch": 0.51, + "grad_norm": 0.6186814560941952, + "learning_rate": 2.5252987216361723e-05, + "loss": 1.8606, + "step": 6635 + }, + { + "epoch": 0.51, + "grad_norm": 0.6145968434655722, + "learning_rate": 2.524674082382873e-05, + "loss": 1.925, + "step": 6636 + }, + { + "epoch": 0.51, + "grad_norm": 0.6630123417458124, + "learning_rate": 2.524049441589069e-05, + "loss": 1.9979, + "step": 6637 + }, + { + "epoch": 0.51, + "grad_norm": 0.6087926754554818, + "learning_rate": 2.5234247992937593e-05, + "loss": 2.1213, + "step": 6638 + }, + { + "epoch": 0.51, + "grad_norm": 0.5742631285193094, + "learning_rate": 2.522800155535943e-05, + "loss": 1.9055, + "step": 6639 + }, + { + "epoch": 0.51, + "grad_norm": 0.6937187167504592, + "learning_rate": 2.5221755103546207e-05, + "loss": 1.8946, + "step": 6640 + }, + { + "epoch": 0.51, + "grad_norm": 0.6016097139590426, + "learning_rate": 2.521550863788789e-05, + "loss": 1.965, + "step": 6641 + }, + { + "epoch": 0.51, + "grad_norm": 0.6677489183552128, + "learning_rate": 2.520926215877448e-05, + "loss": 2.0933, + "step": 6642 + }, + { + "epoch": 0.51, + "grad_norm": 0.5908826423794707, + "learning_rate": 2.520301566659598e-05, + "loss": 2.0006, + "step": 6643 + }, + { + "epoch": 0.51, + "grad_norm": 0.737560666984083, + "learning_rate": 2.5196769161742386e-05, + "loss": 1.9149, + "step": 6644 + }, + { + "epoch": 0.51, + "grad_norm": 0.5887517383948331, + "learning_rate": 2.5190522644603676e-05, + "loss": 1.9481, + "step": 6645 + }, + { + "epoch": 0.51, + "grad_norm": 0.7762551666231263, + "learning_rate": 2.5184276115569843e-05, + "loss": 2.1162, + "step": 6646 + }, + { + "epoch": 0.51, + "grad_norm": 0.6401774345065644, + "learning_rate": 2.5178029575030916e-05, + "loss": 1.902, + "step": 6647 + }, + { + "epoch": 0.51, + "grad_norm": 0.8045769911011456, + "learning_rate": 2.517178302337686e-05, + "loss": 1.888, + "step": 6648 + }, + { + "epoch": 0.51, + "grad_norm": 0.5811641553877283, + "learning_rate": 2.5165536460997685e-05, + "loss": 1.9934, + "step": 6649 + }, + { + "epoch": 0.51, + "grad_norm": 0.6901640646059801, + "learning_rate": 2.5159289888283392e-05, + "loss": 2.0762, + "step": 6650 + }, + { + "epoch": 0.51, + "grad_norm": 0.6641364706304923, + "learning_rate": 2.5153043305623974e-05, + "loss": 1.9632, + "step": 6651 + }, + { + "epoch": 0.51, + "grad_norm": 0.6830171689028038, + "learning_rate": 2.5146796713409438e-05, + "loss": 1.9316, + "step": 6652 + }, + { + "epoch": 0.51, + "grad_norm": 0.6258924788664454, + "learning_rate": 2.5140550112029775e-05, + "loss": 1.8769, + "step": 6653 + }, + { + "epoch": 0.51, + "grad_norm": 0.6910472998453835, + "learning_rate": 2.513430350187499e-05, + "loss": 2.1065, + "step": 6654 + }, + { + "epoch": 0.51, + "grad_norm": 0.6023365616298708, + "learning_rate": 2.512805688333509e-05, + "loss": 1.9598, + "step": 6655 + }, + { + "epoch": 0.51, + "grad_norm": 0.651133693258318, + "learning_rate": 2.5121810256800072e-05, + "loss": 1.9025, + "step": 6656 + }, + { + "epoch": 0.51, + "grad_norm": 0.6834977825180181, + "learning_rate": 2.5115563622659936e-05, + "loss": 1.9227, + "step": 6657 + }, + { + "epoch": 0.51, + "grad_norm": 0.5868590545569323, + "learning_rate": 2.51093169813047e-05, + "loss": 2.0791, + "step": 6658 + }, + { + "epoch": 0.51, + "grad_norm": 0.6011948248089714, + "learning_rate": 2.510307033312434e-05, + "loss": 1.9041, + "step": 6659 + }, + { + "epoch": 0.51, + "grad_norm": 0.799201478210623, + "learning_rate": 2.509682367850888e-05, + "loss": 1.8954, + "step": 6660 + }, + { + "epoch": 0.51, + "grad_norm": 0.6106814700613309, + "learning_rate": 2.5090577017848323e-05, + "loss": 1.9026, + "step": 6661 + }, + { + "epoch": 0.51, + "grad_norm": 0.9408401486680976, + "learning_rate": 2.5084330351532675e-05, + "loss": 2.1758, + "step": 6662 + }, + { + "epoch": 0.51, + "grad_norm": 0.6554253349014847, + "learning_rate": 2.5078083679951924e-05, + "loss": 1.9211, + "step": 6663 + }, + { + "epoch": 0.51, + "grad_norm": 0.7121920533366128, + "learning_rate": 2.5071837003496095e-05, + "loss": 1.9488, + "step": 6664 + }, + { + "epoch": 0.51, + "grad_norm": 0.7799823614125067, + "learning_rate": 2.5065590322555193e-05, + "loss": 1.9208, + "step": 6665 + }, + { + "epoch": 0.51, + "grad_norm": 0.6308743232210926, + "learning_rate": 2.5059343637519212e-05, + "loss": 2.1013, + "step": 6666 + }, + { + "epoch": 0.51, + "grad_norm": 0.7979534779466928, + "learning_rate": 2.505309694877816e-05, + "loss": 1.8896, + "step": 6667 + }, + { + "epoch": 0.51, + "grad_norm": 0.6173264794306369, + "learning_rate": 2.5046850256722048e-05, + "loss": 2.0149, + "step": 6668 + }, + { + "epoch": 0.51, + "grad_norm": 0.7864081946453376, + "learning_rate": 2.5040603561740894e-05, + "loss": 1.899, + "step": 6669 + }, + { + "epoch": 0.51, + "grad_norm": 0.7364723312186022, + "learning_rate": 2.5034356864224685e-05, + "loss": 2.0625, + "step": 6670 + }, + { + "epoch": 0.51, + "grad_norm": 0.8247596131665619, + "learning_rate": 2.5028110164563434e-05, + "loss": 1.9049, + "step": 6671 + }, + { + "epoch": 0.51, + "grad_norm": 0.7132997313923205, + "learning_rate": 2.502186346314715e-05, + "loss": 1.9216, + "step": 6672 + }, + { + "epoch": 0.51, + "grad_norm": 0.7463588765872892, + "learning_rate": 2.501561676036584e-05, + "loss": 1.9391, + "step": 6673 + }, + { + "epoch": 0.51, + "grad_norm": 0.6538608084078793, + "learning_rate": 2.5009370056609512e-05, + "loss": 2.1831, + "step": 6674 + }, + { + "epoch": 0.51, + "grad_norm": 0.6643066015080465, + "learning_rate": 2.5003123352268176e-05, + "loss": 1.885, + "step": 6675 + }, + { + "epoch": 0.52, + "grad_norm": 0.720157166336245, + "learning_rate": 2.4996876647731833e-05, + "loss": 1.9004, + "step": 6676 + }, + { + "epoch": 0.52, + "grad_norm": 0.612813983232992, + "learning_rate": 2.4990629943390497e-05, + "loss": 1.9212, + "step": 6677 + }, + { + "epoch": 0.52, + "grad_norm": 0.5979299110423234, + "learning_rate": 2.4984383239634167e-05, + "loss": 2.103, + "step": 6678 + }, + { + "epoch": 0.52, + "grad_norm": 0.6832136148110365, + "learning_rate": 2.4978136536852856e-05, + "loss": 1.8912, + "step": 6679 + }, + { + "epoch": 0.52, + "grad_norm": 0.5999895238823745, + "learning_rate": 2.497188983543658e-05, + "loss": 1.9868, + "step": 6680 + }, + { + "epoch": 0.52, + "grad_norm": 0.7620609205589449, + "learning_rate": 2.4965643135775324e-05, + "loss": 1.9181, + "step": 6681 + }, + { + "epoch": 0.52, + "grad_norm": 0.6423405015893627, + "learning_rate": 2.4959396438259115e-05, + "loss": 2.0946, + "step": 6682 + }, + { + "epoch": 0.52, + "grad_norm": 0.7936071310518991, + "learning_rate": 2.4953149743277958e-05, + "loss": 1.894, + "step": 6683 + }, + { + "epoch": 0.52, + "grad_norm": 0.7077760201526929, + "learning_rate": 2.4946903051221842e-05, + "loss": 1.8507, + "step": 6684 + }, + { + "epoch": 0.52, + "grad_norm": 0.5887798889227074, + "learning_rate": 2.49406563624808e-05, + "loss": 1.8935, + "step": 6685 + }, + { + "epoch": 0.52, + "grad_norm": 0.8497011918958953, + "learning_rate": 2.4934409677444813e-05, + "loss": 2.1291, + "step": 6686 + }, + { + "epoch": 0.52, + "grad_norm": 0.6640881353257697, + "learning_rate": 2.4928162996503904e-05, + "loss": 1.9164, + "step": 6687 + }, + { + "epoch": 0.52, + "grad_norm": 0.8178929498874834, + "learning_rate": 2.4921916320048082e-05, + "loss": 1.9089, + "step": 6688 + }, + { + "epoch": 0.52, + "grad_norm": 0.6573935696544478, + "learning_rate": 2.491566964846733e-05, + "loss": 1.8432, + "step": 6689 + }, + { + "epoch": 0.52, + "grad_norm": 1.0749758311940638, + "learning_rate": 2.4909422982151683e-05, + "loss": 2.1485, + "step": 6690 + }, + { + "epoch": 0.52, + "grad_norm": 0.633142134461561, + "learning_rate": 2.4903176321491125e-05, + "loss": 1.9045, + "step": 6691 + }, + { + "epoch": 0.52, + "grad_norm": 0.8586981657256701, + "learning_rate": 2.489692966687566e-05, + "loss": 1.8607, + "step": 6692 + }, + { + "epoch": 0.52, + "grad_norm": 0.6127538538250511, + "learning_rate": 2.4890683018695317e-05, + "loss": 1.9529, + "step": 6693 + }, + { + "epoch": 0.52, + "grad_norm": 0.8780725795540756, + "learning_rate": 2.488443637734007e-05, + "loss": 2.0996, + "step": 6694 + }, + { + "epoch": 0.52, + "grad_norm": 0.5397061852810898, + "learning_rate": 2.4878189743199927e-05, + "loss": 1.8956, + "step": 6695 + }, + { + "epoch": 0.52, + "grad_norm": 0.7186891442921187, + "learning_rate": 2.4871943116664917e-05, + "loss": 1.9034, + "step": 6696 + }, + { + "epoch": 0.52, + "grad_norm": 0.6146453114973145, + "learning_rate": 2.4865696498125013e-05, + "loss": 1.9031, + "step": 6697 + }, + { + "epoch": 0.52, + "grad_norm": 0.6217876282561405, + "learning_rate": 2.4859449887970238e-05, + "loss": 2.1317, + "step": 6698 + }, + { + "epoch": 0.52, + "grad_norm": 0.7026887031648654, + "learning_rate": 2.485320328659057e-05, + "loss": 1.9936, + "step": 6699 + }, + { + "epoch": 0.52, + "grad_norm": 0.5953854447743928, + "learning_rate": 2.4846956694376025e-05, + "loss": 1.9078, + "step": 6700 + }, + { + "epoch": 0.52, + "grad_norm": 0.6630793745386788, + "learning_rate": 2.4840710111716614e-05, + "loss": 1.9119, + "step": 6701 + }, + { + "epoch": 0.52, + "grad_norm": 0.7620197279551758, + "learning_rate": 2.4834463539002317e-05, + "loss": 2.1157, + "step": 6702 + }, + { + "epoch": 0.52, + "grad_norm": 0.6243289771808985, + "learning_rate": 2.482821697662315e-05, + "loss": 1.9319, + "step": 6703 + }, + { + "epoch": 0.52, + "grad_norm": 0.6316703398832386, + "learning_rate": 2.482197042496909e-05, + "loss": 1.972, + "step": 6704 + }, + { + "epoch": 0.52, + "grad_norm": 0.678801402402233, + "learning_rate": 2.4815723884430153e-05, + "loss": 1.9919, + "step": 6705 + }, + { + "epoch": 0.52, + "grad_norm": 0.5825680057129189, + "learning_rate": 2.4809477355396336e-05, + "loss": 2.1045, + "step": 6706 + }, + { + "epoch": 0.52, + "grad_norm": 0.7844047551011293, + "learning_rate": 2.480323083825762e-05, + "loss": 1.9317, + "step": 6707 + }, + { + "epoch": 0.52, + "grad_norm": 0.6614241627814321, + "learning_rate": 2.479698433340402e-05, + "loss": 1.9111, + "step": 6708 + }, + { + "epoch": 0.52, + "grad_norm": 0.6151569102807789, + "learning_rate": 2.4790737841225524e-05, + "loss": 1.9354, + "step": 6709 + }, + { + "epoch": 0.52, + "grad_norm": 0.6900031948324293, + "learning_rate": 2.4784491362112113e-05, + "loss": 2.1168, + "step": 6710 + }, + { + "epoch": 0.52, + "grad_norm": 0.6581494566532976, + "learning_rate": 2.4778244896453803e-05, + "loss": 1.9439, + "step": 6711 + }, + { + "epoch": 0.52, + "grad_norm": 0.6657633475316107, + "learning_rate": 2.477199844464057e-05, + "loss": 1.9274, + "step": 6712 + }, + { + "epoch": 0.52, + "grad_norm": 0.7144507489745362, + "learning_rate": 2.4765752007062406e-05, + "loss": 1.9003, + "step": 6713 + }, + { + "epoch": 0.52, + "grad_norm": 0.7623985367728991, + "learning_rate": 2.475950558410932e-05, + "loss": 2.1121, + "step": 6714 + }, + { + "epoch": 0.52, + "grad_norm": 0.6632496185060474, + "learning_rate": 2.475325917617128e-05, + "loss": 1.907, + "step": 6715 + }, + { + "epoch": 0.52, + "grad_norm": 0.740477192041616, + "learning_rate": 2.4747012783638272e-05, + "loss": 1.9347, + "step": 6716 + }, + { + "epoch": 0.52, + "grad_norm": 0.5760196357184919, + "learning_rate": 2.4740766406900315e-05, + "loss": 1.9901, + "step": 6717 + }, + { + "epoch": 0.52, + "grad_norm": 0.7993674826330882, + "learning_rate": 2.4734520046347362e-05, + "loss": 2.097, + "step": 6718 + }, + { + "epoch": 0.52, + "grad_norm": 0.6563414546606893, + "learning_rate": 2.472827370236942e-05, + "loss": 1.9282, + "step": 6719 + }, + { + "epoch": 0.52, + "grad_norm": 0.7020165258798277, + "learning_rate": 2.472202737535647e-05, + "loss": 1.9044, + "step": 6720 + }, + { + "epoch": 0.52, + "grad_norm": 0.5795772846152659, + "learning_rate": 2.471578106569848e-05, + "loss": 1.888, + "step": 6721 + }, + { + "epoch": 0.52, + "grad_norm": 0.7765358359830016, + "learning_rate": 2.4709534773785462e-05, + "loss": 2.1495, + "step": 6722 + }, + { + "epoch": 0.52, + "grad_norm": 0.5982578700613664, + "learning_rate": 2.4703288500007372e-05, + "loss": 1.8962, + "step": 6723 + }, + { + "epoch": 0.52, + "grad_norm": 0.7118588956260865, + "learning_rate": 2.4697042244754208e-05, + "loss": 1.9662, + "step": 6724 + }, + { + "epoch": 0.52, + "grad_norm": 0.607385491131942, + "learning_rate": 2.469079600841593e-05, + "loss": 1.9547, + "step": 6725 + }, + { + "epoch": 0.52, + "grad_norm": 0.7996836038041695, + "learning_rate": 2.468454979138253e-05, + "loss": 2.0931, + "step": 6726 + }, + { + "epoch": 0.52, + "grad_norm": 0.593839911260235, + "learning_rate": 2.467830359404399e-05, + "loss": 1.8912, + "step": 6727 + }, + { + "epoch": 0.52, + "grad_norm": 0.7344143099607419, + "learning_rate": 2.4672057416790266e-05, + "loss": 1.8788, + "step": 6728 + }, + { + "epoch": 0.52, + "grad_norm": 0.6128024914626605, + "learning_rate": 2.4665811260011353e-05, + "loss": 1.8832, + "step": 6729 + }, + { + "epoch": 0.52, + "grad_norm": 0.5686391140724573, + "learning_rate": 2.465956512409722e-05, + "loss": 1.956, + "step": 6730 + }, + { + "epoch": 0.52, + "grad_norm": 0.6762861090534112, + "learning_rate": 2.4653319009437823e-05, + "loss": 2.108, + "step": 6731 + }, + { + "epoch": 0.52, + "grad_norm": 0.680451203074685, + "learning_rate": 2.4647072916423157e-05, + "loss": 1.9085, + "step": 6732 + }, + { + "epoch": 0.52, + "grad_norm": 0.713221164139241, + "learning_rate": 2.4640826845443165e-05, + "loss": 1.9304, + "step": 6733 + }, + { + "epoch": 0.52, + "grad_norm": 0.7218507434423468, + "learning_rate": 2.463458079688784e-05, + "loss": 2.1379, + "step": 6734 + }, + { + "epoch": 0.52, + "grad_norm": 0.6526674507286963, + "learning_rate": 2.4628334771147136e-05, + "loss": 1.933, + "step": 6735 + }, + { + "epoch": 0.52, + "grad_norm": 0.6464778420717437, + "learning_rate": 2.4622088768611008e-05, + "loss": 1.9781, + "step": 6736 + }, + { + "epoch": 0.52, + "grad_norm": 0.5949194859757655, + "learning_rate": 2.4615842789669447e-05, + "loss": 1.8958, + "step": 6737 + }, + { + "epoch": 0.52, + "grad_norm": 0.5796289542634331, + "learning_rate": 2.460959683471239e-05, + "loss": 2.1069, + "step": 6738 + }, + { + "epoch": 0.52, + "grad_norm": 0.6106510587974017, + "learning_rate": 2.46033509041298e-05, + "loss": 1.9695, + "step": 6739 + }, + { + "epoch": 0.52, + "grad_norm": 0.624575271276694, + "learning_rate": 2.4597104998311656e-05, + "loss": 1.8893, + "step": 6740 + }, + { + "epoch": 0.52, + "grad_norm": 0.5724035341380189, + "learning_rate": 2.4590859117647892e-05, + "loss": 1.9249, + "step": 6741 + }, + { + "epoch": 0.52, + "grad_norm": 0.6213111489395294, + "learning_rate": 2.4584613262528474e-05, + "loss": 1.9701, + "step": 6742 + }, + { + "epoch": 0.52, + "grad_norm": 0.697271578999779, + "learning_rate": 2.4578367433343365e-05, + "loss": 2.1436, + "step": 6743 + }, + { + "epoch": 0.52, + "grad_norm": 0.6243794315928739, + "learning_rate": 2.4572121630482505e-05, + "loss": 1.9727, + "step": 6744 + }, + { + "epoch": 0.52, + "grad_norm": 0.5818349899542608, + "learning_rate": 2.4565875854335855e-05, + "loss": 1.9395, + "step": 6745 + }, + { + "epoch": 0.52, + "grad_norm": 0.6990770382030523, + "learning_rate": 2.4559630105293348e-05, + "loss": 2.1298, + "step": 6746 + }, + { + "epoch": 0.52, + "grad_norm": 0.6459763055133859, + "learning_rate": 2.4553384383744947e-05, + "loss": 1.9348, + "step": 6747 + }, + { + "epoch": 0.52, + "grad_norm": 0.5520473712682026, + "learning_rate": 2.45471386900806e-05, + "loss": 1.9527, + "step": 6748 + }, + { + "epoch": 0.52, + "grad_norm": 0.6405644222285571, + "learning_rate": 2.4540893024690233e-05, + "loss": 1.8977, + "step": 6749 + }, + { + "epoch": 0.52, + "grad_norm": 0.6213141383741682, + "learning_rate": 2.453464738796381e-05, + "loss": 2.1142, + "step": 6750 + }, + { + "epoch": 0.52, + "grad_norm": 0.5806709769443219, + "learning_rate": 2.452840178029125e-05, + "loss": 1.8977, + "step": 6751 + }, + { + "epoch": 0.52, + "grad_norm": 0.7474948406269726, + "learning_rate": 2.4522156202062515e-05, + "loss": 1.9109, + "step": 6752 + }, + { + "epoch": 0.52, + "grad_norm": 0.677271788432471, + "learning_rate": 2.4515910653667534e-05, + "loss": 1.9066, + "step": 6753 + }, + { + "epoch": 0.52, + "grad_norm": 0.5992337099234126, + "learning_rate": 2.450966513549623e-05, + "loss": 2.1152, + "step": 6754 + }, + { + "epoch": 0.52, + "grad_norm": 0.5830395139034389, + "learning_rate": 2.450341964793855e-05, + "loss": 1.9954, + "step": 6755 + }, + { + "epoch": 0.52, + "grad_norm": 0.5840318406664341, + "learning_rate": 2.4497174191384427e-05, + "loss": 1.9175, + "step": 6756 + }, + { + "epoch": 0.52, + "grad_norm": 0.5804305668201363, + "learning_rate": 2.4490928766223774e-05, + "loss": 1.9423, + "step": 6757 + }, + { + "epoch": 0.52, + "grad_norm": 0.7197838169407231, + "learning_rate": 2.4484683372846542e-05, + "loss": 2.0937, + "step": 6758 + }, + { + "epoch": 0.52, + "grad_norm": 0.6172439288259548, + "learning_rate": 2.4478438011642637e-05, + "loss": 1.8963, + "step": 6759 + }, + { + "epoch": 0.52, + "grad_norm": 0.6235788729125963, + "learning_rate": 2.4472192683001985e-05, + "loss": 1.9157, + "step": 6760 + }, + { + "epoch": 0.52, + "grad_norm": 0.5959534785565576, + "learning_rate": 2.4465947387314524e-05, + "loss": 1.9853, + "step": 6761 + }, + { + "epoch": 0.52, + "grad_norm": 0.5975694025758582, + "learning_rate": 2.445970212497015e-05, + "loss": 1.9189, + "step": 6762 + }, + { + "epoch": 0.52, + "grad_norm": 0.5889693467279564, + "learning_rate": 2.445345689635879e-05, + "loss": 2.0962, + "step": 6763 + }, + { + "epoch": 0.52, + "grad_norm": 0.638321035293485, + "learning_rate": 2.4447211701870377e-05, + "loss": 1.9231, + "step": 6764 + }, + { + "epoch": 0.52, + "grad_norm": 0.5869589860641043, + "learning_rate": 2.4440966541894792e-05, + "loss": 1.8795, + "step": 6765 + }, + { + "epoch": 0.52, + "grad_norm": 0.5699530902625876, + "learning_rate": 2.4434721416821975e-05, + "loss": 2.1158, + "step": 6766 + }, + { + "epoch": 0.52, + "grad_norm": 0.6050436652707758, + "learning_rate": 2.4428476327041816e-05, + "loss": 2.0187, + "step": 6767 + }, + { + "epoch": 0.52, + "grad_norm": 0.6708975022240512, + "learning_rate": 2.4422231272944224e-05, + "loss": 1.9343, + "step": 6768 + }, + { + "epoch": 0.52, + "grad_norm": 0.5464048787942786, + "learning_rate": 2.4415986254919115e-05, + "loss": 1.9244, + "step": 6769 + }, + { + "epoch": 0.52, + "grad_norm": 0.6976577689042746, + "learning_rate": 2.440974127335638e-05, + "loss": 2.1208, + "step": 6770 + }, + { + "epoch": 0.52, + "grad_norm": 0.5730240615629292, + "learning_rate": 2.4403496328645927e-05, + "loss": 1.9276, + "step": 6771 + }, + { + "epoch": 0.52, + "grad_norm": 0.7571714232426864, + "learning_rate": 2.439725142117764e-05, + "loss": 1.9251, + "step": 6772 + }, + { + "epoch": 0.52, + "grad_norm": 0.6625868087940432, + "learning_rate": 2.4391006551341427e-05, + "loss": 1.9773, + "step": 6773 + }, + { + "epoch": 0.52, + "grad_norm": 0.63218111458179, + "learning_rate": 2.438476171952718e-05, + "loss": 1.8817, + "step": 6774 + }, + { + "epoch": 0.52, + "grad_norm": 0.8624729973375317, + "learning_rate": 2.437851692612478e-05, + "loss": 2.1295, + "step": 6775 + }, + { + "epoch": 0.52, + "grad_norm": 0.6057129145136635, + "learning_rate": 2.437227217152412e-05, + "loss": 1.8881, + "step": 6776 + }, + { + "epoch": 0.52, + "grad_norm": 0.6593773451083194, + "learning_rate": 2.43660274561151e-05, + "loss": 1.9353, + "step": 6777 + }, + { + "epoch": 0.52, + "grad_norm": 0.7473076565520184, + "learning_rate": 2.4359782780287575e-05, + "loss": 2.0973, + "step": 6778 + }, + { + "epoch": 0.52, + "grad_norm": 0.6276330959639643, + "learning_rate": 2.4353538144431454e-05, + "loss": 1.9284, + "step": 6779 + }, + { + "epoch": 0.52, + "grad_norm": 0.7225193005370535, + "learning_rate": 2.4347293548936595e-05, + "loss": 1.9298, + "step": 6780 + }, + { + "epoch": 0.52, + "grad_norm": 0.6884143929539044, + "learning_rate": 2.4341048994192878e-05, + "loss": 1.9273, + "step": 6781 + }, + { + "epoch": 0.52, + "grad_norm": 0.669166626157464, + "learning_rate": 2.4334804480590188e-05, + "loss": 2.066, + "step": 6782 + }, + { + "epoch": 0.52, + "grad_norm": 0.7284814876054923, + "learning_rate": 2.4328560008518377e-05, + "loss": 1.9085, + "step": 6783 + }, + { + "epoch": 0.52, + "grad_norm": 0.6239209916842718, + "learning_rate": 2.4322315578367334e-05, + "loss": 1.9425, + "step": 6784 + }, + { + "epoch": 0.52, + "grad_norm": 0.7803236080098342, + "learning_rate": 2.4316071190526905e-05, + "loss": 1.8679, + "step": 6785 + }, + { + "epoch": 0.52, + "grad_norm": 0.6369172920398191, + "learning_rate": 2.430982684538696e-05, + "loss": 1.9699, + "step": 6786 + }, + { + "epoch": 0.52, + "grad_norm": 0.7015151308620332, + "learning_rate": 2.430358254333737e-05, + "loss": 2.1318, + "step": 6787 + }, + { + "epoch": 0.52, + "grad_norm": 0.6460169474328119, + "learning_rate": 2.429733828476798e-05, + "loss": 1.8822, + "step": 6788 + }, + { + "epoch": 0.52, + "grad_norm": 0.5749543660504861, + "learning_rate": 2.4291094070068638e-05, + "loss": 1.8478, + "step": 6789 + }, + { + "epoch": 0.52, + "grad_norm": 0.6489338329207572, + "learning_rate": 2.4284849899629217e-05, + "loss": 2.0708, + "step": 6790 + }, + { + "epoch": 0.52, + "grad_norm": 0.6296951755367514, + "learning_rate": 2.4278605773839548e-05, + "loss": 1.9237, + "step": 6791 + }, + { + "epoch": 0.52, + "grad_norm": 0.563245463226585, + "learning_rate": 2.4272361693089495e-05, + "loss": 1.9968, + "step": 6792 + }, + { + "epoch": 0.52, + "grad_norm": 0.7218330509470475, + "learning_rate": 2.4266117657768874e-05, + "loss": 1.9514, + "step": 6793 + }, + { + "epoch": 0.52, + "grad_norm": 0.6303823179613774, + "learning_rate": 2.4259873668267553e-05, + "loss": 1.8751, + "step": 6794 + }, + { + "epoch": 0.52, + "grad_norm": 0.7076920535276987, + "learning_rate": 2.425362972497536e-05, + "loss": 2.0869, + "step": 6795 + }, + { + "epoch": 0.52, + "grad_norm": 0.656480926864414, + "learning_rate": 2.424738582828212e-05, + "loss": 1.963, + "step": 6796 + }, + { + "epoch": 0.52, + "grad_norm": 0.7363724995566108, + "learning_rate": 2.4241141978577675e-05, + "loss": 1.9566, + "step": 6797 + }, + { + "epoch": 0.52, + "grad_norm": 0.7277395544747648, + "learning_rate": 2.4234898176251868e-05, + "loss": 1.9801, + "step": 6798 + }, + { + "epoch": 0.52, + "grad_norm": 0.8379841473739562, + "learning_rate": 2.42286544216945e-05, + "loss": 2.0948, + "step": 6799 + }, + { + "epoch": 0.52, + "grad_norm": 0.6659577058511705, + "learning_rate": 2.422241071529541e-05, + "loss": 1.9077, + "step": 6800 + }, + { + "epoch": 0.52, + "grad_norm": 0.7714648438478618, + "learning_rate": 2.421616705744441e-05, + "loss": 1.9729, + "step": 6801 + }, + { + "epoch": 0.52, + "grad_norm": 0.6628822201225683, + "learning_rate": 2.4209923448531318e-05, + "loss": 2.0946, + "step": 6802 + }, + { + "epoch": 0.52, + "grad_norm": 0.5952311084057027, + "learning_rate": 2.4203679888945963e-05, + "loss": 1.8796, + "step": 6803 + }, + { + "epoch": 0.52, + "grad_norm": 0.6836393055619138, + "learning_rate": 2.419743637907813e-05, + "loss": 2.0108, + "step": 6804 + }, + { + "epoch": 0.52, + "grad_norm": 0.6903539165802698, + "learning_rate": 2.4191192919317653e-05, + "loss": 1.9067, + "step": 6805 + }, + { + "epoch": 0.53, + "grad_norm": 0.708099156768583, + "learning_rate": 2.4184949510054313e-05, + "loss": 1.9122, + "step": 6806 + }, + { + "epoch": 0.53, + "grad_norm": 0.6796728385968701, + "learning_rate": 2.4178706151677925e-05, + "loss": 2.0893, + "step": 6807 + }, + { + "epoch": 0.53, + "grad_norm": 0.6327854860876525, + "learning_rate": 2.4172462844578295e-05, + "loss": 1.9177, + "step": 6808 + }, + { + "epoch": 0.53, + "grad_norm": 0.7414510937443434, + "learning_rate": 2.4166219589145202e-05, + "loss": 1.9004, + "step": 6809 + }, + { + "epoch": 0.53, + "grad_norm": 0.7781712396548345, + "learning_rate": 2.4159976385768444e-05, + "loss": 2.1063, + "step": 6810 + }, + { + "epoch": 0.53, + "grad_norm": 0.621843997687271, + "learning_rate": 2.4153733234837817e-05, + "loss": 1.9824, + "step": 6811 + }, + { + "epoch": 0.53, + "grad_norm": 0.7435361891021265, + "learning_rate": 2.4147490136743096e-05, + "loss": 1.8615, + "step": 6812 + }, + { + "epoch": 0.53, + "grad_norm": 0.7920600782553857, + "learning_rate": 2.414124709187408e-05, + "loss": 1.9069, + "step": 6813 + }, + { + "epoch": 0.53, + "grad_norm": 0.6051091967503096, + "learning_rate": 2.4135004100620517e-05, + "loss": 1.9507, + "step": 6814 + }, + { + "epoch": 0.53, + "grad_norm": 0.6672953505824236, + "learning_rate": 2.4128761163372203e-05, + "loss": 2.1116, + "step": 6815 + }, + { + "epoch": 0.53, + "grad_norm": 0.6394488405411185, + "learning_rate": 2.4122518280518922e-05, + "loss": 1.9096, + "step": 6816 + }, + { + "epoch": 0.53, + "grad_norm": 0.7029695181896743, + "learning_rate": 2.4116275452450424e-05, + "loss": 1.9282, + "step": 6817 + }, + { + "epoch": 0.53, + "grad_norm": 0.6386608439110977, + "learning_rate": 2.4110032679556484e-05, + "loss": 1.916, + "step": 6818 + }, + { + "epoch": 0.53, + "grad_norm": 0.7052065251058428, + "learning_rate": 2.410378996222685e-05, + "loss": 2.089, + "step": 6819 + }, + { + "epoch": 0.53, + "grad_norm": 0.6749873781333043, + "learning_rate": 2.4097547300851296e-05, + "loss": 1.8826, + "step": 6820 + }, + { + "epoch": 0.53, + "grad_norm": 0.7167518747102501, + "learning_rate": 2.4091304695819574e-05, + "loss": 1.9472, + "step": 6821 + }, + { + "epoch": 0.53, + "grad_norm": 0.7432231883065988, + "learning_rate": 2.4085062147521427e-05, + "loss": 2.1274, + "step": 6822 + }, + { + "epoch": 0.53, + "grad_norm": 0.6918260628988899, + "learning_rate": 2.407881965634661e-05, + "loss": 1.9907, + "step": 6823 + }, + { + "epoch": 0.53, + "grad_norm": 0.6266301248228702, + "learning_rate": 2.407257722268487e-05, + "loss": 1.8851, + "step": 6824 + }, + { + "epoch": 0.53, + "grad_norm": 0.7717103865401205, + "learning_rate": 2.406633484692594e-05, + "loss": 1.9052, + "step": 6825 + }, + { + "epoch": 0.53, + "grad_norm": 0.6047759068097547, + "learning_rate": 2.406009252945957e-05, + "loss": 1.9096, + "step": 6826 + }, + { + "epoch": 0.53, + "grad_norm": 0.8291367883720913, + "learning_rate": 2.4053850270675474e-05, + "loss": 2.1058, + "step": 6827 + }, + { + "epoch": 0.53, + "grad_norm": 0.5644723587796809, + "learning_rate": 2.404760807096339e-05, + "loss": 1.9254, + "step": 6828 + }, + { + "epoch": 0.53, + "grad_norm": 0.5899525154037856, + "learning_rate": 2.4041365930713058e-05, + "loss": 1.9809, + "step": 6829 + }, + { + "epoch": 0.53, + "grad_norm": 0.6335833926549232, + "learning_rate": 2.403512385031418e-05, + "loss": 1.896, + "step": 6830 + }, + { + "epoch": 0.53, + "grad_norm": 0.651372802921379, + "learning_rate": 2.4028881830156484e-05, + "loss": 2.1591, + "step": 6831 + }, + { + "epoch": 0.53, + "grad_norm": 0.588525335985015, + "learning_rate": 2.4022639870629695e-05, + "loss": 1.8998, + "step": 6832 + }, + { + "epoch": 0.53, + "grad_norm": 0.6097101569219551, + "learning_rate": 2.4016397972123502e-05, + "loss": 1.9111, + "step": 6833 + }, + { + "epoch": 0.53, + "grad_norm": 0.6704457411346033, + "learning_rate": 2.4010156135027635e-05, + "loss": 2.0914, + "step": 6834 + }, + { + "epoch": 0.53, + "grad_norm": 0.6255031233854543, + "learning_rate": 2.4003914359731783e-05, + "loss": 1.9935, + "step": 6835 + }, + { + "epoch": 0.53, + "grad_norm": 0.6501068350777695, + "learning_rate": 2.3997672646625642e-05, + "loss": 1.9106, + "step": 6836 + }, + { + "epoch": 0.53, + "grad_norm": 0.5436276998753237, + "learning_rate": 2.3991430996098926e-05, + "loss": 1.9294, + "step": 6837 + }, + { + "epoch": 0.53, + "grad_norm": 0.6567220748704277, + "learning_rate": 2.3985189408541313e-05, + "loss": 1.9128, + "step": 6838 + }, + { + "epoch": 0.53, + "grad_norm": 0.7579279803622189, + "learning_rate": 2.3978947884342496e-05, + "loss": 2.0822, + "step": 6839 + }, + { + "epoch": 0.53, + "grad_norm": 0.6192621446964727, + "learning_rate": 2.397270642389215e-05, + "loss": 1.9266, + "step": 6840 + }, + { + "epoch": 0.53, + "grad_norm": 0.6473291489568552, + "learning_rate": 2.3966465027579965e-05, + "loss": 1.8916, + "step": 6841 + }, + { + "epoch": 0.53, + "grad_norm": 0.680980680998492, + "learning_rate": 2.396022369579562e-05, + "loss": 1.9915, + "step": 6842 + }, + { + "epoch": 0.53, + "grad_norm": 0.5652075683804121, + "learning_rate": 2.395398242892877e-05, + "loss": 2.0981, + "step": 6843 + }, + { + "epoch": 0.53, + "grad_norm": 0.6204969797342692, + "learning_rate": 2.39477412273691e-05, + "loss": 1.8843, + "step": 6844 + }, + { + "epoch": 0.53, + "grad_norm": 0.624807351666636, + "learning_rate": 2.3941500091506275e-05, + "loss": 1.9323, + "step": 6845 + }, + { + "epoch": 0.53, + "grad_norm": 0.6369086123194133, + "learning_rate": 2.3935259021729935e-05, + "loss": 1.9658, + "step": 6846 + }, + { + "epoch": 0.53, + "grad_norm": 0.6217138948400651, + "learning_rate": 2.392901801842976e-05, + "loss": 2.0679, + "step": 6847 + }, + { + "epoch": 0.53, + "grad_norm": 0.5476398496472505, + "learning_rate": 2.392277708199538e-05, + "loss": 1.9317, + "step": 6848 + }, + { + "epoch": 0.53, + "grad_norm": 0.6066945188728551, + "learning_rate": 2.3916536212816454e-05, + "loss": 1.9177, + "step": 6849 + }, + { + "epoch": 0.53, + "grad_norm": 0.5873595244939671, + "learning_rate": 2.391029541128263e-05, + "loss": 1.9272, + "step": 6850 + }, + { + "epoch": 0.53, + "grad_norm": 0.7177781426609902, + "learning_rate": 2.3904054677783534e-05, + "loss": 2.1173, + "step": 6851 + }, + { + "epoch": 0.53, + "grad_norm": 0.5705803263904187, + "learning_rate": 2.3897814012708813e-05, + "loss": 1.9098, + "step": 6852 + }, + { + "epoch": 0.53, + "grad_norm": 0.6393458847388561, + "learning_rate": 2.3891573416448085e-05, + "loss": 1.9201, + "step": 6853 + }, + { + "epoch": 0.53, + "grad_norm": 0.6531558168703689, + "learning_rate": 2.388533288939098e-05, + "loss": 1.9883, + "step": 6854 + }, + { + "epoch": 0.53, + "grad_norm": 0.6441874429943146, + "learning_rate": 2.387909243192713e-05, + "loss": 2.0952, + "step": 6855 + }, + { + "epoch": 0.53, + "grad_norm": 0.609066281396595, + "learning_rate": 2.3872852044446138e-05, + "loss": 1.9278, + "step": 6856 + }, + { + "epoch": 0.53, + "grad_norm": 0.6546182544626906, + "learning_rate": 2.386661172733762e-05, + "loss": 1.8682, + "step": 6857 + }, + { + "epoch": 0.53, + "grad_norm": 0.675446999757002, + "learning_rate": 2.38603714809912e-05, + "loss": 1.8778, + "step": 6858 + }, + { + "epoch": 0.53, + "grad_norm": 0.7444320340379087, + "learning_rate": 2.3854131305796462e-05, + "loss": 2.1021, + "step": 6859 + }, + { + "epoch": 0.53, + "grad_norm": 0.6819584812800328, + "learning_rate": 2.384789120214302e-05, + "loss": 2.0283, + "step": 6860 + }, + { + "epoch": 0.53, + "grad_norm": 0.5771843788204984, + "learning_rate": 2.3841651170420452e-05, + "loss": 1.8946, + "step": 6861 + }, + { + "epoch": 0.53, + "grad_norm": 0.6172188110496265, + "learning_rate": 2.3835411211018363e-05, + "loss": 1.9105, + "step": 6862 + }, + { + "epoch": 0.53, + "grad_norm": 0.7448204919284072, + "learning_rate": 2.382917132432635e-05, + "loss": 2.0971, + "step": 6863 + }, + { + "epoch": 0.53, + "grad_norm": 0.5821705947384628, + "learning_rate": 2.3822931510733965e-05, + "loss": 1.9144, + "step": 6864 + }, + { + "epoch": 0.53, + "grad_norm": 0.6005340323705847, + "learning_rate": 2.3816691770630803e-05, + "loss": 1.8606, + "step": 6865 + }, + { + "epoch": 0.53, + "grad_norm": 0.6484878491223819, + "learning_rate": 2.3810452104406444e-05, + "loss": 2.0203, + "step": 6866 + }, + { + "epoch": 0.53, + "grad_norm": 0.6096836462339968, + "learning_rate": 2.380421251245044e-05, + "loss": 2.1376, + "step": 6867 + }, + { + "epoch": 0.53, + "grad_norm": 0.7009150752552232, + "learning_rate": 2.379797299515237e-05, + "loss": 1.9604, + "step": 6868 + }, + { + "epoch": 0.53, + "grad_norm": 0.6553904931408051, + "learning_rate": 2.3791733552901774e-05, + "loss": 1.9431, + "step": 6869 + }, + { + "epoch": 0.53, + "grad_norm": 0.6193090366345624, + "learning_rate": 2.3785494186088217e-05, + "loss": 1.9141, + "step": 6870 + }, + { + "epoch": 0.53, + "grad_norm": 0.7736903595860248, + "learning_rate": 2.3779254895101256e-05, + "loss": 2.0722, + "step": 6871 + }, + { + "epoch": 0.53, + "grad_norm": 0.6561150774563063, + "learning_rate": 2.3773015680330414e-05, + "loss": 1.8629, + "step": 6872 + }, + { + "epoch": 0.53, + "grad_norm": 0.5182705139781527, + "learning_rate": 2.3766776542165256e-05, + "loss": 1.904, + "step": 6873 + }, + { + "epoch": 0.53, + "grad_norm": 0.5810549487202489, + "learning_rate": 2.3760537480995298e-05, + "loss": 1.9271, + "step": 6874 + }, + { + "epoch": 0.53, + "grad_norm": 0.8188873168414694, + "learning_rate": 2.3754298497210073e-05, + "loss": 2.063, + "step": 6875 + }, + { + "epoch": 0.53, + "grad_norm": 0.702023092199348, + "learning_rate": 2.374805959119912e-05, + "loss": 1.8776, + "step": 6876 + }, + { + "epoch": 0.53, + "grad_norm": 0.5803541173771632, + "learning_rate": 2.3741820763351945e-05, + "loss": 1.9236, + "step": 6877 + }, + { + "epoch": 0.53, + "grad_norm": 0.6470537838943383, + "learning_rate": 2.373558201405806e-05, + "loss": 1.8677, + "step": 6878 + }, + { + "epoch": 0.53, + "grad_norm": 0.6631028917874696, + "learning_rate": 2.3729343343707e-05, + "loss": 2.1301, + "step": 6879 + }, + { + "epoch": 0.53, + "grad_norm": 0.623893806627512, + "learning_rate": 2.372310475268824e-05, + "loss": 1.9193, + "step": 6880 + }, + { + "epoch": 0.53, + "grad_norm": 0.6154738471162933, + "learning_rate": 2.3716866241391312e-05, + "loss": 1.9054, + "step": 6881 + }, + { + "epoch": 0.53, + "grad_norm": 0.6468615388599988, + "learning_rate": 2.3710627810205686e-05, + "loss": 1.9558, + "step": 6882 + }, + { + "epoch": 0.53, + "grad_norm": 0.6929877096400356, + "learning_rate": 2.3704389459520855e-05, + "loss": 2.1175, + "step": 6883 + }, + { + "epoch": 0.53, + "grad_norm": 0.5970163384843282, + "learning_rate": 2.369815118972633e-05, + "loss": 1.9196, + "step": 6884 + }, + { + "epoch": 0.53, + "grad_norm": 0.6720238952797764, + "learning_rate": 2.3691913001211567e-05, + "loss": 1.9358, + "step": 6885 + }, + { + "epoch": 0.53, + "grad_norm": 0.6089547532220582, + "learning_rate": 2.368567489436605e-05, + "loss": 1.8918, + "step": 6886 + }, + { + "epoch": 0.53, + "grad_norm": 0.6229818684048509, + "learning_rate": 2.367943686957924e-05, + "loss": 2.0991, + "step": 6887 + }, + { + "epoch": 0.53, + "grad_norm": 0.570459583142368, + "learning_rate": 2.3673198927240618e-05, + "loss": 1.9391, + "step": 6888 + }, + { + "epoch": 0.53, + "grad_norm": 0.5588454685117317, + "learning_rate": 2.3666961067739646e-05, + "loss": 1.9468, + "step": 6889 + }, + { + "epoch": 0.53, + "grad_norm": 0.6496758721614757, + "learning_rate": 2.3660723291465757e-05, + "loss": 1.8884, + "step": 6890 + }, + { + "epoch": 0.53, + "grad_norm": 0.5668348741268975, + "learning_rate": 2.365448559880842e-05, + "loss": 2.1318, + "step": 6891 + }, + { + "epoch": 0.53, + "grad_norm": 0.6481880115062101, + "learning_rate": 2.3648247990157077e-05, + "loss": 1.9129, + "step": 6892 + }, + { + "epoch": 0.53, + "grad_norm": 0.5671130049193563, + "learning_rate": 2.3642010465901158e-05, + "loss": 1.9233, + "step": 6893 + }, + { + "epoch": 0.53, + "grad_norm": 0.565254955019476, + "learning_rate": 2.3635773026430112e-05, + "loss": 1.8941, + "step": 6894 + }, + { + "epoch": 0.53, + "grad_norm": 0.5617922894977659, + "learning_rate": 2.362953567213336e-05, + "loss": 2.0552, + "step": 6895 + }, + { + "epoch": 0.53, + "grad_norm": 0.5900120968178542, + "learning_rate": 2.3623298403400318e-05, + "loss": 1.9113, + "step": 6896 + }, + { + "epoch": 0.53, + "grad_norm": 0.59758674281974, + "learning_rate": 2.3617061220620422e-05, + "loss": 1.9938, + "step": 6897 + }, + { + "epoch": 0.53, + "grad_norm": 0.6684784597662042, + "learning_rate": 2.3610824124183063e-05, + "loss": 1.929, + "step": 6898 + }, + { + "epoch": 0.53, + "grad_norm": 0.6054468772221763, + "learning_rate": 2.3604587114477677e-05, + "loss": 2.0654, + "step": 6899 + }, + { + "epoch": 0.53, + "grad_norm": 0.5964270755876917, + "learning_rate": 2.359835019189364e-05, + "loss": 1.884, + "step": 6900 + }, + { + "epoch": 0.53, + "grad_norm": 0.5766920747884321, + "learning_rate": 2.3592113356820357e-05, + "loss": 1.9003, + "step": 6901 + }, + { + "epoch": 0.53, + "grad_norm": 0.6625545172774548, + "learning_rate": 2.358587660964723e-05, + "loss": 1.8528, + "step": 6902 + }, + { + "epoch": 0.53, + "grad_norm": 0.5957701778591085, + "learning_rate": 2.3579639950763628e-05, + "loss": 2.1228, + "step": 6903 + }, + { + "epoch": 0.53, + "grad_norm": 0.5332229546551666, + "learning_rate": 2.3573403380558935e-05, + "loss": 2.0034, + "step": 6904 + }, + { + "epoch": 0.53, + "grad_norm": 0.5887189021077798, + "learning_rate": 2.3567166899422542e-05, + "loss": 1.9305, + "step": 6905 + }, + { + "epoch": 0.53, + "grad_norm": 0.688491201664447, + "learning_rate": 2.35609305077438e-05, + "loss": 1.8917, + "step": 6906 + }, + { + "epoch": 0.53, + "grad_norm": 0.6404379052876008, + "learning_rate": 2.355469420591208e-05, + "loss": 2.0586, + "step": 6907 + }, + { + "epoch": 0.53, + "grad_norm": 0.6316202744321762, + "learning_rate": 2.354845799431673e-05, + "loss": 1.94, + "step": 6908 + }, + { + "epoch": 0.53, + "grad_norm": 0.6118239124633812, + "learning_rate": 2.3542221873347117e-05, + "loss": 1.9468, + "step": 6909 + }, + { + "epoch": 0.53, + "grad_norm": 0.604040834599595, + "learning_rate": 2.3535985843392585e-05, + "loss": 1.9769, + "step": 6910 + }, + { + "epoch": 0.53, + "grad_norm": 0.7044546934138206, + "learning_rate": 2.352974990484246e-05, + "loss": 2.1114, + "step": 6911 + }, + { + "epoch": 0.53, + "grad_norm": 0.589162926996751, + "learning_rate": 2.3523514058086092e-05, + "loss": 1.9311, + "step": 6912 + }, + { + "epoch": 0.53, + "grad_norm": 0.7182601365819111, + "learning_rate": 2.3517278303512812e-05, + "loss": 1.8462, + "step": 6913 + }, + { + "epoch": 0.53, + "grad_norm": 0.6005610685268739, + "learning_rate": 2.3511042641511936e-05, + "loss": 1.9243, + "step": 6914 + }, + { + "epoch": 0.53, + "grad_norm": 0.7061502814093771, + "learning_rate": 2.350480707247279e-05, + "loss": 2.1309, + "step": 6915 + }, + { + "epoch": 0.53, + "grad_norm": 0.5499280755367931, + "learning_rate": 2.3498571596784672e-05, + "loss": 1.9845, + "step": 6916 + }, + { + "epoch": 0.53, + "grad_norm": 0.8461499592735158, + "learning_rate": 2.34923362148369e-05, + "loss": 1.9292, + "step": 6917 + }, + { + "epoch": 0.53, + "grad_norm": 0.630147152218096, + "learning_rate": 2.348610092701878e-05, + "loss": 1.9001, + "step": 6918 + }, + { + "epoch": 0.53, + "grad_norm": 0.7115403702532992, + "learning_rate": 2.3479865733719583e-05, + "loss": 2.1299, + "step": 6919 + }, + { + "epoch": 0.53, + "grad_norm": 0.6065194402136416, + "learning_rate": 2.3473630635328626e-05, + "loss": 1.8717, + "step": 6920 + }, + { + "epoch": 0.53, + "grad_norm": 0.6420221356455456, + "learning_rate": 2.3467395632235175e-05, + "loss": 1.8935, + "step": 6921 + }, + { + "epoch": 0.53, + "grad_norm": 0.6782267090405564, + "learning_rate": 2.3461160724828504e-05, + "loss": 2.0023, + "step": 6922 + }, + { + "epoch": 0.53, + "grad_norm": 0.6184186246641804, + "learning_rate": 2.34549259134979e-05, + "loss": 2.1346, + "step": 6923 + }, + { + "epoch": 0.53, + "grad_norm": 0.5995745473603085, + "learning_rate": 2.3448691198632618e-05, + "loss": 1.9167, + "step": 6924 + }, + { + "epoch": 0.53, + "grad_norm": 0.5724627745614244, + "learning_rate": 2.3442456580621906e-05, + "loss": 1.903, + "step": 6925 + }, + { + "epoch": 0.53, + "grad_norm": 0.6059457914412485, + "learning_rate": 2.3436222059855046e-05, + "loss": 1.9007, + "step": 6926 + }, + { + "epoch": 0.53, + "grad_norm": 0.7008904390475549, + "learning_rate": 2.3429987636721255e-05, + "loss": 2.094, + "step": 6927 + }, + { + "epoch": 0.53, + "grad_norm": 0.5477791583777565, + "learning_rate": 2.3423753311609793e-05, + "loss": 1.945, + "step": 6928 + }, + { + "epoch": 0.53, + "grad_norm": 0.5938890089036918, + "learning_rate": 2.341751908490988e-05, + "loss": 1.9185, + "step": 6929 + }, + { + "epoch": 0.53, + "grad_norm": 0.6770447220216039, + "learning_rate": 2.3411284957010746e-05, + "loss": 1.91, + "step": 6930 + }, + { + "epoch": 0.53, + "grad_norm": 0.70596531483098, + "learning_rate": 2.3405050928301632e-05, + "loss": 2.1294, + "step": 6931 + }, + { + "epoch": 0.53, + "grad_norm": 0.5728492471548148, + "learning_rate": 2.3398816999171734e-05, + "loss": 1.8909, + "step": 6932 + }, + { + "epoch": 0.53, + "grad_norm": 0.6840785627932982, + "learning_rate": 2.3392583170010277e-05, + "loss": 1.8891, + "step": 6933 + }, + { + "epoch": 0.53, + "grad_norm": 0.6181292946772861, + "learning_rate": 2.3386349441206442e-05, + "loss": 1.939, + "step": 6934 + }, + { + "epoch": 0.54, + "grad_norm": 0.7004103747028102, + "learning_rate": 2.3380115813149452e-05, + "loss": 2.1441, + "step": 6935 + }, + { + "epoch": 0.54, + "grad_norm": 0.623852175591801, + "learning_rate": 2.3373882286228487e-05, + "loss": 1.9123, + "step": 6936 + }, + { + "epoch": 0.54, + "grad_norm": 0.6033455416480228, + "learning_rate": 2.336764886083272e-05, + "loss": 1.9239, + "step": 6937 + }, + { + "epoch": 0.54, + "grad_norm": 0.6598874445654936, + "learning_rate": 2.3361415537351343e-05, + "loss": 1.9185, + "step": 6938 + }, + { + "epoch": 0.54, + "grad_norm": 0.7243329797556521, + "learning_rate": 2.3355182316173534e-05, + "loss": 2.172, + "step": 6939 + }, + { + "epoch": 0.54, + "grad_norm": 0.6220763789740399, + "learning_rate": 2.334894919768844e-05, + "loss": 1.9007, + "step": 6940 + }, + { + "epoch": 0.54, + "grad_norm": 0.6732834833874951, + "learning_rate": 2.3342716182285244e-05, + "loss": 1.9679, + "step": 6941 + }, + { + "epoch": 0.54, + "grad_norm": 0.6290743538021611, + "learning_rate": 2.3336483270353075e-05, + "loss": 1.896, + "step": 6942 + }, + { + "epoch": 0.54, + "grad_norm": 0.7647600318389801, + "learning_rate": 2.3330250462281085e-05, + "loss": 2.1093, + "step": 6943 + }, + { + "epoch": 0.54, + "grad_norm": 0.6296311758101999, + "learning_rate": 2.332401775845843e-05, + "loss": 1.9479, + "step": 6944 + }, + { + "epoch": 0.54, + "grad_norm": 0.6859822628177811, + "learning_rate": 2.331778515927422e-05, + "loss": 1.9131, + "step": 6945 + }, + { + "epoch": 0.54, + "grad_norm": 0.5937622532003612, + "learning_rate": 2.3311552665117596e-05, + "loss": 1.8953, + "step": 6946 + }, + { + "epoch": 0.54, + "grad_norm": 0.62087894693453, + "learning_rate": 2.3305320276377686e-05, + "loss": 1.9931, + "step": 6947 + }, + { + "epoch": 0.54, + "grad_norm": 0.6519940319026273, + "learning_rate": 2.3299087993443578e-05, + "loss": 2.0583, + "step": 6948 + }, + { + "epoch": 0.54, + "grad_norm": 0.5664911436855777, + "learning_rate": 2.3292855816704406e-05, + "loss": 1.8998, + "step": 6949 + }, + { + "epoch": 0.54, + "grad_norm": 0.5829259214555926, + "learning_rate": 2.3286623746549254e-05, + "loss": 1.9355, + "step": 6950 + }, + { + "epoch": 0.54, + "grad_norm": 0.6418516789200658, + "learning_rate": 2.3280391783367214e-05, + "loss": 2.1351, + "step": 6951 + }, + { + "epoch": 0.54, + "grad_norm": 0.7138007940635156, + "learning_rate": 2.3274159927547392e-05, + "loss": 1.9246, + "step": 6952 + }, + { + "epoch": 0.54, + "grad_norm": 0.6026687616839503, + "learning_rate": 2.326792817947885e-05, + "loss": 2.0096, + "step": 6953 + }, + { + "epoch": 0.54, + "grad_norm": 0.6200187077054666, + "learning_rate": 2.326169653955067e-05, + "loss": 1.8937, + "step": 6954 + }, + { + "epoch": 0.54, + "grad_norm": 0.6451318910676073, + "learning_rate": 2.3255465008151905e-05, + "loss": 2.1026, + "step": 6955 + }, + { + "epoch": 0.54, + "grad_norm": 0.6039961510056435, + "learning_rate": 2.3249233585671636e-05, + "loss": 1.9181, + "step": 6956 + }, + { + "epoch": 0.54, + "grad_norm": 0.6301588727294065, + "learning_rate": 2.3243002272498908e-05, + "loss": 1.8861, + "step": 6957 + }, + { + "epoch": 0.54, + "grad_norm": 0.6535534668817495, + "learning_rate": 2.3236771069022755e-05, + "loss": 1.9266, + "step": 6958 + }, + { + "epoch": 0.54, + "grad_norm": 0.6153450633121307, + "learning_rate": 2.3230539975632238e-05, + "loss": 2.0735, + "step": 6959 + }, + { + "epoch": 0.54, + "grad_norm": 0.5817948886325971, + "learning_rate": 2.3224308992716382e-05, + "loss": 1.9895, + "step": 6960 + }, + { + "epoch": 0.54, + "grad_norm": 0.6647028274507037, + "learning_rate": 2.3218078120664197e-05, + "loss": 1.8726, + "step": 6961 + }, + { + "epoch": 0.54, + "grad_norm": 0.6404021436884167, + "learning_rate": 2.321184735986473e-05, + "loss": 1.9051, + "step": 6962 + }, + { + "epoch": 0.54, + "grad_norm": 0.6104217841963283, + "learning_rate": 2.3205616710706966e-05, + "loss": 2.0636, + "step": 6963 + }, + { + "epoch": 0.54, + "grad_norm": 0.562373572926197, + "learning_rate": 2.3199386173579926e-05, + "loss": 1.8685, + "step": 6964 + }, + { + "epoch": 0.54, + "grad_norm": 0.6511361838470012, + "learning_rate": 2.319315574887261e-05, + "loss": 1.8733, + "step": 6965 + }, + { + "epoch": 0.54, + "grad_norm": 0.5410075843839882, + "learning_rate": 2.3186925436973996e-05, + "loss": 1.9435, + "step": 6966 + }, + { + "epoch": 0.54, + "grad_norm": 0.6898610281706871, + "learning_rate": 2.318069523827308e-05, + "loss": 2.0948, + "step": 6967 + }, + { + "epoch": 0.54, + "grad_norm": 0.6079141679002845, + "learning_rate": 2.3174465153158835e-05, + "loss": 1.8928, + "step": 6968 + }, + { + "epoch": 0.54, + "grad_norm": 0.6755281245097672, + "learning_rate": 2.316823518202022e-05, + "loss": 1.8831, + "step": 6969 + }, + { + "epoch": 0.54, + "grad_norm": 0.6964386660556257, + "learning_rate": 2.316200532524622e-05, + "loss": 1.9361, + "step": 6970 + }, + { + "epoch": 0.54, + "grad_norm": 0.7251950405762728, + "learning_rate": 2.3155775583225774e-05, + "loss": 2.1257, + "step": 6971 + }, + { + "epoch": 0.54, + "grad_norm": 0.6855173412700594, + "learning_rate": 2.314954595634783e-05, + "loss": 2.0141, + "step": 6972 + }, + { + "epoch": 0.54, + "grad_norm": 0.6617032673510361, + "learning_rate": 2.3143316445001342e-05, + "loss": 1.9213, + "step": 6973 + }, + { + "epoch": 0.54, + "grad_norm": 0.7249613973160681, + "learning_rate": 2.3137087049575233e-05, + "loss": 1.9375, + "step": 6974 + }, + { + "epoch": 0.54, + "grad_norm": 0.8622200660367813, + "learning_rate": 2.3130857770458436e-05, + "loss": 2.0947, + "step": 6975 + }, + { + "epoch": 0.54, + "grad_norm": 0.5616372763328307, + "learning_rate": 2.3124628608039856e-05, + "loss": 1.9241, + "step": 6976 + }, + { + "epoch": 0.54, + "grad_norm": 0.813247333180486, + "learning_rate": 2.3118399562708415e-05, + "loss": 1.8971, + "step": 6977 + }, + { + "epoch": 0.54, + "grad_norm": 0.7622497027040896, + "learning_rate": 2.3112170634853034e-05, + "loss": 1.9777, + "step": 6978 + }, + { + "epoch": 0.54, + "grad_norm": 0.6114024234879807, + "learning_rate": 2.3105941824862586e-05, + "loss": 1.8715, + "step": 6979 + }, + { + "epoch": 0.54, + "grad_norm": 0.7345018359457328, + "learning_rate": 2.3099713133125967e-05, + "loss": 2.1165, + "step": 6980 + }, + { + "epoch": 0.54, + "grad_norm": 0.6750225725094887, + "learning_rate": 2.3093484560032074e-05, + "loss": 1.8935, + "step": 6981 + }, + { + "epoch": 0.54, + "grad_norm": 0.6722410139024572, + "learning_rate": 2.308725610596977e-05, + "loss": 1.9261, + "step": 6982 + }, + { + "epoch": 0.54, + "grad_norm": 0.6002796526862085, + "learning_rate": 2.308102777132793e-05, + "loss": 2.0829, + "step": 6983 + }, + { + "epoch": 0.54, + "grad_norm": 0.6908741450115808, + "learning_rate": 2.3074799556495393e-05, + "loss": 2.0061, + "step": 6984 + }, + { + "epoch": 0.54, + "grad_norm": 0.6850103079609436, + "learning_rate": 2.3068571461861037e-05, + "loss": 1.8672, + "step": 6985 + }, + { + "epoch": 0.54, + "grad_norm": 0.591922588698335, + "learning_rate": 2.3062343487813707e-05, + "loss": 1.9733, + "step": 6986 + }, + { + "epoch": 0.54, + "grad_norm": 0.6652737791434807, + "learning_rate": 2.305611563474222e-05, + "loss": 2.1129, + "step": 6987 + }, + { + "epoch": 0.54, + "grad_norm": 0.5924844956271506, + "learning_rate": 2.3049887903035434e-05, + "loss": 1.9452, + "step": 6988 + }, + { + "epoch": 0.54, + "grad_norm": 0.5684996174977273, + "learning_rate": 2.3043660293082146e-05, + "loss": 1.8864, + "step": 6989 + }, + { + "epoch": 0.54, + "grad_norm": 0.617460985069477, + "learning_rate": 2.3037432805271182e-05, + "loss": 1.916, + "step": 6990 + }, + { + "epoch": 0.54, + "grad_norm": 0.5281311389438791, + "learning_rate": 2.303120543999136e-05, + "loss": 2.0102, + "step": 6991 + }, + { + "epoch": 0.54, + "grad_norm": 0.6979070499430653, + "learning_rate": 2.3024978197631462e-05, + "loss": 2.1233, + "step": 6992 + }, + { + "epoch": 0.54, + "grad_norm": 0.5409055517401112, + "learning_rate": 2.3018751078580287e-05, + "loss": 1.9143, + "step": 6993 + }, + { + "epoch": 0.54, + "grad_norm": 0.6920508027160223, + "learning_rate": 2.3012524083226634e-05, + "loss": 1.8603, + "step": 6994 + }, + { + "epoch": 0.54, + "grad_norm": 0.5625641840138551, + "learning_rate": 2.3006297211959253e-05, + "loss": 2.1046, + "step": 6995 + }, + { + "epoch": 0.54, + "grad_norm": 0.6173929899220647, + "learning_rate": 2.300007046516694e-05, + "loss": 1.9481, + "step": 6996 + }, + { + "epoch": 0.54, + "grad_norm": 0.5926224062531321, + "learning_rate": 2.2993843843238434e-05, + "loss": 1.9576, + "step": 6997 + }, + { + "epoch": 0.54, + "grad_norm": 0.6107784547323151, + "learning_rate": 2.2987617346562497e-05, + "loss": 1.9127, + "step": 6998 + }, + { + "epoch": 0.54, + "grad_norm": 0.579781604217849, + "learning_rate": 2.2981390975527885e-05, + "loss": 2.1035, + "step": 6999 + }, + { + "epoch": 0.54, + "grad_norm": 0.5959741326920936, + "learning_rate": 2.2975164730523323e-05, + "loss": 1.9177, + "step": 7000 + }, + { + "epoch": 0.54, + "grad_norm": 0.593494398983973, + "learning_rate": 2.296893861193755e-05, + "loss": 1.8908, + "step": 7001 + }, + { + "epoch": 0.54, + "grad_norm": 0.6607170372594442, + "learning_rate": 2.296271262015927e-05, + "loss": 1.8771, + "step": 7002 + }, + { + "epoch": 0.54, + "grad_norm": 0.5588231769927112, + "learning_rate": 2.2956486755577222e-05, + "loss": 1.9588, + "step": 7003 + }, + { + "epoch": 0.54, + "grad_norm": 0.648435541585525, + "learning_rate": 2.29502610185801e-05, + "loss": 2.0974, + "step": 7004 + }, + { + "epoch": 0.54, + "grad_norm": 0.6408984872079808, + "learning_rate": 2.2944035409556596e-05, + "loss": 1.8749, + "step": 7005 + }, + { + "epoch": 0.54, + "grad_norm": 0.6234841430035625, + "learning_rate": 2.2937809928895413e-05, + "loss": 1.8781, + "step": 7006 + }, + { + "epoch": 0.54, + "grad_norm": 0.7036612319436399, + "learning_rate": 2.2931584576985232e-05, + "loss": 2.1122, + "step": 7007 + }, + { + "epoch": 0.54, + "grad_norm": 0.5788380251171683, + "learning_rate": 2.292535935421471e-05, + "loss": 1.9241, + "step": 7008 + }, + { + "epoch": 0.54, + "grad_norm": 0.6810023589386787, + "learning_rate": 2.291913426097254e-05, + "loss": 1.9958, + "step": 7009 + }, + { + "epoch": 0.54, + "grad_norm": 0.6200881174158608, + "learning_rate": 2.2912909297647357e-05, + "loss": 1.9272, + "step": 7010 + }, + { + "epoch": 0.54, + "grad_norm": 0.6523238988010632, + "learning_rate": 2.2906684464627826e-05, + "loss": 1.868, + "step": 7011 + }, + { + "epoch": 0.54, + "grad_norm": 0.6205149522966484, + "learning_rate": 2.2900459762302588e-05, + "loss": 2.1478, + "step": 7012 + }, + { + "epoch": 0.54, + "grad_norm": 0.5813464064364479, + "learning_rate": 2.289423519106026e-05, + "loss": 1.9218, + "step": 7013 + }, + { + "epoch": 0.54, + "grad_norm": 0.5636650836602358, + "learning_rate": 2.2888010751289485e-05, + "loss": 1.8626, + "step": 7014 + }, + { + "epoch": 0.54, + "grad_norm": 0.6417591896562348, + "learning_rate": 2.288178644337888e-05, + "loss": 1.9584, + "step": 7015 + }, + { + "epoch": 0.54, + "grad_norm": 0.6052769187449286, + "learning_rate": 2.287556226771704e-05, + "loss": 2.0555, + "step": 7016 + }, + { + "epoch": 0.54, + "grad_norm": 0.591835350630733, + "learning_rate": 2.2869338224692584e-05, + "loss": 1.9158, + "step": 7017 + }, + { + "epoch": 0.54, + "grad_norm": 0.6124122539614082, + "learning_rate": 2.286311431469409e-05, + "loss": 1.9145, + "step": 7018 + }, + { + "epoch": 0.54, + "grad_norm": 0.6667524110642084, + "learning_rate": 2.285689053811015e-05, + "loss": 2.1128, + "step": 7019 + }, + { + "epoch": 0.54, + "grad_norm": 0.6081311829630692, + "learning_rate": 2.285066689532934e-05, + "loss": 1.853, + "step": 7020 + }, + { + "epoch": 0.54, + "grad_norm": 0.687556115632128, + "learning_rate": 2.2844443386740223e-05, + "loss": 1.8619, + "step": 7021 + }, + { + "epoch": 0.54, + "grad_norm": 0.6382971623618422, + "learning_rate": 2.2838220012731365e-05, + "loss": 1.9921, + "step": 7022 + }, + { + "epoch": 0.54, + "grad_norm": 0.6057546403634597, + "learning_rate": 2.2831996773691302e-05, + "loss": 1.9165, + "step": 7023 + }, + { + "epoch": 0.54, + "grad_norm": 0.6067936404980967, + "learning_rate": 2.2825773670008594e-05, + "loss": 2.0945, + "step": 7024 + }, + { + "epoch": 0.54, + "grad_norm": 0.663462780835987, + "learning_rate": 2.281955070207177e-05, + "loss": 1.8995, + "step": 7025 + }, + { + "epoch": 0.54, + "grad_norm": 0.6777704425548444, + "learning_rate": 2.2813327870269342e-05, + "loss": 1.9194, + "step": 7026 + }, + { + "epoch": 0.54, + "grad_norm": 0.7056129903038216, + "learning_rate": 2.2807105174989842e-05, + "loss": 2.0746, + "step": 7027 + }, + { + "epoch": 0.54, + "grad_norm": 0.7074882555796836, + "learning_rate": 2.2800882616621782e-05, + "loss": 1.9873, + "step": 7028 + }, + { + "epoch": 0.54, + "grad_norm": 0.7463059703195273, + "learning_rate": 2.279466019555365e-05, + "loss": 1.8292, + "step": 7029 + }, + { + "epoch": 0.54, + "grad_norm": 0.5826551625192009, + "learning_rate": 2.2788437912173953e-05, + "loss": 1.9047, + "step": 7030 + }, + { + "epoch": 0.54, + "grad_norm": 0.751733655989344, + "learning_rate": 2.2782215766871143e-05, + "loss": 1.9323, + "step": 7031 + }, + { + "epoch": 0.54, + "grad_norm": 0.6846844569600771, + "learning_rate": 2.2775993760033723e-05, + "loss": 2.0615, + "step": 7032 + }, + { + "epoch": 0.54, + "grad_norm": 0.6789548700333394, + "learning_rate": 2.2769771892050154e-05, + "loss": 1.9029, + "step": 7033 + }, + { + "epoch": 0.54, + "grad_norm": 0.5713224070040431, + "learning_rate": 2.2763550163308877e-05, + "loss": 1.9606, + "step": 7034 + }, + { + "epoch": 0.54, + "grad_norm": 0.6912832095467348, + "learning_rate": 2.2757328574198363e-05, + "loss": 1.9541, + "step": 7035 + }, + { + "epoch": 0.54, + "grad_norm": 0.6331018500034579, + "learning_rate": 2.275110712510703e-05, + "loss": 2.1113, + "step": 7036 + }, + { + "epoch": 0.54, + "grad_norm": 0.7698986496254807, + "learning_rate": 2.2744885816423313e-05, + "loss": 1.9137, + "step": 7037 + }, + { + "epoch": 0.54, + "grad_norm": 0.6602287729423949, + "learning_rate": 2.2738664648535653e-05, + "loss": 1.9441, + "step": 7038 + }, + { + "epoch": 0.54, + "grad_norm": 0.8384485578352099, + "learning_rate": 2.273244362183244e-05, + "loss": 2.1119, + "step": 7039 + }, + { + "epoch": 0.54, + "grad_norm": 0.6644004986678486, + "learning_rate": 2.2726222736702084e-05, + "loss": 1.9533, + "step": 7040 + }, + { + "epoch": 0.54, + "grad_norm": 0.6269602940696372, + "learning_rate": 2.2720001993532993e-05, + "loss": 1.8806, + "step": 7041 + }, + { + "epoch": 0.54, + "grad_norm": 0.9113031923531637, + "learning_rate": 2.2713781392713535e-05, + "loss": 1.9083, + "step": 7042 + }, + { + "epoch": 0.54, + "grad_norm": 0.674585077090671, + "learning_rate": 2.2707560934632105e-05, + "loss": 1.8596, + "step": 7043 + }, + { + "epoch": 0.54, + "grad_norm": 0.8963966075763483, + "learning_rate": 2.270134061967706e-05, + "loss": 2.1252, + "step": 7044 + }, + { + "epoch": 0.54, + "grad_norm": 0.5903137312976403, + "learning_rate": 2.2695120448236758e-05, + "loss": 1.9081, + "step": 7045 + }, + { + "epoch": 0.54, + "grad_norm": 0.8005989408765137, + "learning_rate": 2.2688900420699566e-05, + "loss": 1.9593, + "step": 7046 + }, + { + "epoch": 0.54, + "grad_norm": 0.6172194273735723, + "learning_rate": 2.2682680537453807e-05, + "loss": 1.8885, + "step": 7047 + }, + { + "epoch": 0.54, + "grad_norm": 0.650572484361134, + "learning_rate": 2.2676460798887828e-05, + "loss": 2.1261, + "step": 7048 + }, + { + "epoch": 0.54, + "grad_norm": 0.5911149005308879, + "learning_rate": 2.2670241205389936e-05, + "loss": 1.9152, + "step": 7049 + }, + { + "epoch": 0.54, + "grad_norm": 0.6391682859057229, + "learning_rate": 2.2664021757348465e-05, + "loss": 1.9219, + "step": 7050 + }, + { + "epoch": 0.54, + "grad_norm": 0.6184611343089382, + "learning_rate": 2.2657802455151715e-05, + "loss": 2.0442, + "step": 7051 + }, + { + "epoch": 0.54, + "grad_norm": 0.617959307286655, + "learning_rate": 2.2651583299187967e-05, + "loss": 1.9262, + "step": 7052 + }, + { + "epoch": 0.54, + "grad_norm": 0.5855954349373058, + "learning_rate": 2.2645364289845527e-05, + "loss": 1.9751, + "step": 7053 + }, + { + "epoch": 0.54, + "grad_norm": 0.6068063082177816, + "learning_rate": 2.2639145427512675e-05, + "loss": 1.9319, + "step": 7054 + }, + { + "epoch": 0.54, + "grad_norm": 0.6119161880553763, + "learning_rate": 2.2632926712577664e-05, + "loss": 1.8622, + "step": 7055 + }, + { + "epoch": 0.54, + "grad_norm": 0.7401214494617729, + "learning_rate": 2.2626708145428767e-05, + "loss": 2.0986, + "step": 7056 + }, + { + "epoch": 0.54, + "grad_norm": 0.650526575413362, + "learning_rate": 2.262048972645423e-05, + "loss": 1.8896, + "step": 7057 + }, + { + "epoch": 0.54, + "grad_norm": 0.6153992105687388, + "learning_rate": 2.2614271456042287e-05, + "loss": 1.867, + "step": 7058 + }, + { + "epoch": 0.54, + "grad_norm": 0.6958404556936367, + "learning_rate": 2.2608053334581188e-05, + "loss": 1.9781, + "step": 7059 + }, + { + "epoch": 0.54, + "grad_norm": 0.8525084546608769, + "learning_rate": 2.2601835362459137e-05, + "loss": 2.1342, + "step": 7060 + }, + { + "epoch": 0.54, + "grad_norm": 0.5961400193711438, + "learning_rate": 2.2595617540064362e-05, + "loss": 1.8963, + "step": 7061 + }, + { + "epoch": 0.54, + "grad_norm": 0.7133749822418753, + "learning_rate": 2.2589399867785066e-05, + "loss": 1.8691, + "step": 7062 + }, + { + "epoch": 0.54, + "grad_norm": 0.769794224711012, + "learning_rate": 2.2583182346009427e-05, + "loss": 1.9175, + "step": 7063 + }, + { + "epoch": 0.54, + "grad_norm": 0.6764473170183589, + "learning_rate": 2.2576964975125653e-05, + "loss": 2.136, + "step": 7064 + }, + { + "epoch": 0.55, + "grad_norm": 0.6971862049428335, + "learning_rate": 2.2570747755521906e-05, + "loss": 1.9702, + "step": 7065 + }, + { + "epoch": 0.55, + "grad_norm": 0.6229210677628433, + "learning_rate": 2.2564530687586353e-05, + "loss": 1.892, + "step": 7066 + }, + { + "epoch": 0.55, + "grad_norm": 0.7336707328230961, + "learning_rate": 2.2558313771707164e-05, + "loss": 1.9791, + "step": 7067 + }, + { + "epoch": 0.55, + "grad_norm": 0.6234561192939817, + "learning_rate": 2.2552097008272465e-05, + "loss": 2.1177, + "step": 7068 + }, + { + "epoch": 0.55, + "grad_norm": 0.6286443785210944, + "learning_rate": 2.2545880397670418e-05, + "loss": 1.9123, + "step": 7069 + }, + { + "epoch": 0.55, + "grad_norm": 0.6793349183753359, + "learning_rate": 2.2539663940289126e-05, + "loss": 1.912, + "step": 7070 + }, + { + "epoch": 0.55, + "grad_norm": 0.6109221225200698, + "learning_rate": 2.253344763651673e-05, + "loss": 1.9851, + "step": 7071 + }, + { + "epoch": 0.55, + "grad_norm": 0.6411202618269222, + "learning_rate": 2.2527231486741335e-05, + "loss": 2.0817, + "step": 7072 + }, + { + "epoch": 0.55, + "grad_norm": 0.6277596865889165, + "learning_rate": 2.2521015491351024e-05, + "loss": 1.8819, + "step": 7073 + }, + { + "epoch": 0.55, + "grad_norm": 0.6490474024323507, + "learning_rate": 2.2514799650733903e-05, + "loss": 1.8837, + "step": 7074 + }, + { + "epoch": 0.55, + "grad_norm": 0.6498254397721837, + "learning_rate": 2.2508583965278056e-05, + "loss": 1.959, + "step": 7075 + }, + { + "epoch": 0.55, + "grad_norm": 0.6953074094947508, + "learning_rate": 2.2502368435371546e-05, + "loss": 2.1674, + "step": 7076 + }, + { + "epoch": 0.55, + "grad_norm": 0.7777982722019334, + "learning_rate": 2.249615306140244e-05, + "loss": 1.9823, + "step": 7077 + }, + { + "epoch": 0.55, + "grad_norm": 0.6517462064972002, + "learning_rate": 2.248993784375877e-05, + "loss": 1.8857, + "step": 7078 + }, + { + "epoch": 0.55, + "grad_norm": 0.6744181231832219, + "learning_rate": 2.2483722782828602e-05, + "loss": 1.9429, + "step": 7079 + }, + { + "epoch": 0.55, + "grad_norm": 0.6113470241886698, + "learning_rate": 2.247750787899996e-05, + "loss": 2.0814, + "step": 7080 + }, + { + "epoch": 0.55, + "grad_norm": 0.6370281683862717, + "learning_rate": 2.2471293132660854e-05, + "loss": 1.9052, + "step": 7081 + }, + { + "epoch": 0.55, + "grad_norm": 0.6864563819048298, + "learning_rate": 2.2465078544199315e-05, + "loss": 1.8725, + "step": 7082 + }, + { + "epoch": 0.55, + "grad_norm": 0.5688018938728608, + "learning_rate": 2.245886411400333e-05, + "loss": 2.0922, + "step": 7083 + }, + { + "epoch": 0.55, + "grad_norm": 0.6339467111976518, + "learning_rate": 2.2452649842460897e-05, + "loss": 1.985, + "step": 7084 + }, + { + "epoch": 0.55, + "grad_norm": 0.5978812465602381, + "learning_rate": 2.2446435729960005e-05, + "loss": 1.9389, + "step": 7085 + }, + { + "epoch": 0.55, + "grad_norm": 0.6214510825501665, + "learning_rate": 2.244022177688862e-05, + "loss": 1.8599, + "step": 7086 + }, + { + "epoch": 0.55, + "grad_norm": 0.6063567140804703, + "learning_rate": 2.2434007983634698e-05, + "loss": 1.9341, + "step": 7087 + }, + { + "epoch": 0.55, + "grad_norm": 0.6603026354214003, + "learning_rate": 2.2427794350586208e-05, + "loss": 2.0587, + "step": 7088 + }, + { + "epoch": 0.55, + "grad_norm": 0.6498869179808687, + "learning_rate": 2.242158087813108e-05, + "loss": 1.9256, + "step": 7089 + }, + { + "epoch": 0.55, + "grad_norm": 0.5645758018308584, + "learning_rate": 2.241536756665725e-05, + "loss": 1.9558, + "step": 7090 + }, + { + "epoch": 0.55, + "grad_norm": 0.5449768337855257, + "learning_rate": 2.240915441655264e-05, + "loss": 1.906, + "step": 7091 + }, + { + "epoch": 0.55, + "grad_norm": 0.6862921970938655, + "learning_rate": 2.2402941428205158e-05, + "loss": 2.1415, + "step": 7092 + }, + { + "epoch": 0.55, + "grad_norm": 0.6004315119994686, + "learning_rate": 2.239672860200272e-05, + "loss": 1.8968, + "step": 7093 + }, + { + "epoch": 0.55, + "grad_norm": 0.6422995537792966, + "learning_rate": 2.239051593833321e-05, + "loss": 1.9705, + "step": 7094 + }, + { + "epoch": 0.55, + "grad_norm": 0.6016649101271928, + "learning_rate": 2.23843034375845e-05, + "loss": 1.9008, + "step": 7095 + }, + { + "epoch": 0.55, + "grad_norm": 0.7433319391126177, + "learning_rate": 2.2378091100144486e-05, + "loss": 2.165, + "step": 7096 + }, + { + "epoch": 0.55, + "grad_norm": 0.6302796150504841, + "learning_rate": 2.2371878926401007e-05, + "loss": 1.9497, + "step": 7097 + }, + { + "epoch": 0.55, + "grad_norm": 0.7388779265309346, + "learning_rate": 2.236566691674193e-05, + "loss": 1.8547, + "step": 7098 + }, + { + "epoch": 0.55, + "grad_norm": 0.6020923971267467, + "learning_rate": 2.235945507155508e-05, + "loss": 1.9102, + "step": 7099 + }, + { + "epoch": 0.55, + "grad_norm": 0.5904257903188902, + "learning_rate": 2.2353243391228302e-05, + "loss": 2.085, + "step": 7100 + }, + { + "epoch": 0.55, + "grad_norm": 0.6017370117735489, + "learning_rate": 2.2347031876149416e-05, + "loss": 1.9043, + "step": 7101 + }, + { + "epoch": 0.55, + "grad_norm": 0.6253550887842904, + "learning_rate": 2.234082052670622e-05, + "loss": 1.9643, + "step": 7102 + }, + { + "epoch": 0.55, + "grad_norm": 0.5765322305345614, + "learning_rate": 2.2334609343286532e-05, + "loss": 1.9271, + "step": 7103 + }, + { + "epoch": 0.55, + "grad_norm": 0.5997474369010031, + "learning_rate": 2.232839832627813e-05, + "loss": 2.1051, + "step": 7104 + }, + { + "epoch": 0.55, + "grad_norm": 0.61023107429698, + "learning_rate": 2.232218747606879e-05, + "loss": 1.9001, + "step": 7105 + }, + { + "epoch": 0.55, + "grad_norm": 0.7220776415909888, + "learning_rate": 2.231597679304629e-05, + "loss": 1.9247, + "step": 7106 + }, + { + "epoch": 0.55, + "grad_norm": 0.602710895891604, + "learning_rate": 2.230976627759838e-05, + "loss": 1.9171, + "step": 7107 + }, + { + "epoch": 0.55, + "grad_norm": 0.6274813470842318, + "learning_rate": 2.230355593011282e-05, + "loss": 2.0654, + "step": 7108 + }, + { + "epoch": 0.55, + "grad_norm": 0.6293276020049982, + "learning_rate": 2.2297345750977345e-05, + "loss": 1.8803, + "step": 7109 + }, + { + "epoch": 0.55, + "grad_norm": 0.6902353520763248, + "learning_rate": 2.2291135740579666e-05, + "loss": 1.9062, + "step": 7110 + }, + { + "epoch": 0.55, + "grad_norm": 0.5897435616821807, + "learning_rate": 2.2284925899307524e-05, + "loss": 1.8983, + "step": 7111 + }, + { + "epoch": 0.55, + "grad_norm": 0.6894477058140261, + "learning_rate": 2.2278716227548602e-05, + "loss": 2.0887, + "step": 7112 + }, + { + "epoch": 0.55, + "grad_norm": 0.5631959590502968, + "learning_rate": 2.2272506725690602e-05, + "loss": 1.8773, + "step": 7113 + }, + { + "epoch": 0.55, + "grad_norm": 0.5674763541325322, + "learning_rate": 2.2266297394121224e-05, + "loss": 1.8867, + "step": 7114 + }, + { + "epoch": 0.55, + "grad_norm": 0.644300278402013, + "learning_rate": 2.2260088233228124e-05, + "loss": 1.9708, + "step": 7115 + }, + { + "epoch": 0.55, + "grad_norm": 0.6722651746994859, + "learning_rate": 2.2253879243398976e-05, + "loss": 2.1445, + "step": 7116 + }, + { + "epoch": 0.55, + "grad_norm": 0.5909110145659097, + "learning_rate": 2.2247670425021416e-05, + "loss": 1.8911, + "step": 7117 + }, + { + "epoch": 0.55, + "grad_norm": 0.7002382137309457, + "learning_rate": 2.2241461778483107e-05, + "loss": 1.9433, + "step": 7118 + }, + { + "epoch": 0.55, + "grad_norm": 0.6592361345742014, + "learning_rate": 2.2235253304171674e-05, + "loss": 1.951, + "step": 7119 + }, + { + "epoch": 0.55, + "grad_norm": 0.5902326106815577, + "learning_rate": 2.222904500247473e-05, + "loss": 2.0921, + "step": 7120 + }, + { + "epoch": 0.55, + "grad_norm": 0.657334138711458, + "learning_rate": 2.2222836873779888e-05, + "loss": 1.9693, + "step": 7121 + }, + { + "epoch": 0.55, + "grad_norm": 0.5950747318325049, + "learning_rate": 2.221662891847476e-05, + "loss": 1.9305, + "step": 7122 + }, + { + "epoch": 0.55, + "grad_norm": 0.6781177497196382, + "learning_rate": 2.2210421136946905e-05, + "loss": 1.8639, + "step": 7123 + }, + { + "epoch": 0.55, + "grad_norm": 0.7076692536163869, + "learning_rate": 2.2204213529583934e-05, + "loss": 2.0714, + "step": 7124 + }, + { + "epoch": 0.55, + "grad_norm": 0.7220402174776631, + "learning_rate": 2.2198006096773385e-05, + "loss": 1.9185, + "step": 7125 + }, + { + "epoch": 0.55, + "grad_norm": 0.6844509331292953, + "learning_rate": 2.2191798838902835e-05, + "loss": 1.9145, + "step": 7126 + }, + { + "epoch": 0.55, + "grad_norm": 0.7607255365029794, + "learning_rate": 2.218559175635982e-05, + "loss": 2.0228, + "step": 7127 + }, + { + "epoch": 0.55, + "grad_norm": 0.8821401844558856, + "learning_rate": 2.2179384849531867e-05, + "loss": 2.092, + "step": 7128 + }, + { + "epoch": 0.55, + "grad_norm": 0.6025159822689634, + "learning_rate": 2.217317811880651e-05, + "loss": 1.9039, + "step": 7129 + }, + { + "epoch": 0.55, + "grad_norm": 0.6164067586434357, + "learning_rate": 2.216697156457126e-05, + "loss": 1.9117, + "step": 7130 + }, + { + "epoch": 0.55, + "grad_norm": 0.7827460749410914, + "learning_rate": 2.2160765187213604e-05, + "loss": 1.8779, + "step": 7131 + }, + { + "epoch": 0.55, + "grad_norm": 0.6121111607789798, + "learning_rate": 2.2154558987121055e-05, + "loss": 2.0825, + "step": 7132 + }, + { + "epoch": 0.55, + "grad_norm": 0.6589871554842689, + "learning_rate": 2.214835296468107e-05, + "loss": 1.9043, + "step": 7133 + }, + { + "epoch": 0.55, + "grad_norm": 0.6041585164555597, + "learning_rate": 2.2142147120281116e-05, + "loss": 1.8571, + "step": 7134 + }, + { + "epoch": 0.55, + "grad_norm": 0.7757840823701326, + "learning_rate": 2.213594145430867e-05, + "loss": 1.9162, + "step": 7135 + }, + { + "epoch": 0.55, + "grad_norm": 0.5754137911446694, + "learning_rate": 2.2129735967151162e-05, + "loss": 2.118, + "step": 7136 + }, + { + "epoch": 0.55, + "grad_norm": 0.6791259800983376, + "learning_rate": 2.212353065919603e-05, + "loss": 1.8735, + "step": 7137 + }, + { + "epoch": 0.55, + "grad_norm": 0.706139065615846, + "learning_rate": 2.2117325530830687e-05, + "loss": 1.8688, + "step": 7138 + }, + { + "epoch": 0.55, + "grad_norm": 0.7306988954336419, + "learning_rate": 2.2111120582442556e-05, + "loss": 1.914, + "step": 7139 + }, + { + "epoch": 0.55, + "grad_norm": 0.7065324494053647, + "learning_rate": 2.210491581441904e-05, + "loss": 2.1362, + "step": 7140 + }, + { + "epoch": 0.55, + "grad_norm": 0.6015279587928598, + "learning_rate": 2.2098711227147524e-05, + "loss": 1.8962, + "step": 7141 + }, + { + "epoch": 0.55, + "grad_norm": 0.6616721901679053, + "learning_rate": 2.2092506821015374e-05, + "loss": 1.8964, + "step": 7142 + }, + { + "epoch": 0.55, + "grad_norm": 0.6736976791115087, + "learning_rate": 2.208630259640998e-05, + "loss": 1.9148, + "step": 7143 + }, + { + "epoch": 0.55, + "grad_norm": 0.6437729616502058, + "learning_rate": 2.2080098553718682e-05, + "loss": 2.0852, + "step": 7144 + }, + { + "epoch": 0.55, + "grad_norm": 0.5836531457632554, + "learning_rate": 2.207389469332883e-05, + "loss": 1.9099, + "step": 7145 + }, + { + "epoch": 0.55, + "grad_norm": 0.780522278964798, + "learning_rate": 2.206769101562774e-05, + "loss": 2.0065, + "step": 7146 + }, + { + "epoch": 0.55, + "grad_norm": 0.6294901526396731, + "learning_rate": 2.2061487521002756e-05, + "loss": 1.8784, + "step": 7147 + }, + { + "epoch": 0.55, + "grad_norm": 0.659262610955476, + "learning_rate": 2.205528420984118e-05, + "loss": 2.1164, + "step": 7148 + }, + { + "epoch": 0.55, + "grad_norm": 0.7506478021350258, + "learning_rate": 2.2049081082530296e-05, + "loss": 1.9208, + "step": 7149 + }, + { + "epoch": 0.55, + "grad_norm": 0.6470726917128564, + "learning_rate": 2.2042878139457414e-05, + "loss": 1.9499, + "step": 7150 + }, + { + "epoch": 0.55, + "grad_norm": 0.6630278570646492, + "learning_rate": 2.2036675381009796e-05, + "loss": 1.8826, + "step": 7151 + }, + { + "epoch": 0.55, + "grad_norm": 0.5836915036791764, + "learning_rate": 2.2030472807574702e-05, + "loss": 2.1425, + "step": 7152 + }, + { + "epoch": 0.55, + "grad_norm": 0.6778376505702007, + "learning_rate": 2.2024270419539396e-05, + "loss": 1.9369, + "step": 7153 + }, + { + "epoch": 0.55, + "grad_norm": 0.6020272307999652, + "learning_rate": 2.201806821729111e-05, + "loss": 1.897, + "step": 7154 + }, + { + "epoch": 0.55, + "grad_norm": 0.6378113685334109, + "learning_rate": 2.201186620121707e-05, + "loss": 1.9331, + "step": 7155 + }, + { + "epoch": 0.55, + "grad_norm": 0.7196204078253812, + "learning_rate": 2.200566437170451e-05, + "loss": 2.0464, + "step": 7156 + }, + { + "epoch": 0.55, + "grad_norm": 0.6870507394380193, + "learning_rate": 2.199946272914061e-05, + "loss": 1.9842, + "step": 7157 + }, + { + "epoch": 0.55, + "grad_norm": 0.5903679297446236, + "learning_rate": 2.1993261273912595e-05, + "loss": 2.0015, + "step": 7158 + }, + { + "epoch": 0.55, + "grad_norm": 0.7061143685041945, + "learning_rate": 2.198706000640762e-05, + "loss": 1.9472, + "step": 7159 + }, + { + "epoch": 0.55, + "grad_norm": 0.7848484033832144, + "learning_rate": 2.198085892701286e-05, + "loss": 2.096, + "step": 7160 + }, + { + "epoch": 0.55, + "grad_norm": 0.6541871937377336, + "learning_rate": 2.1974658036115496e-05, + "loss": 1.9383, + "step": 7161 + }, + { + "epoch": 0.55, + "grad_norm": 0.7265036105007652, + "learning_rate": 2.1968457334102653e-05, + "loss": 1.9324, + "step": 7162 + }, + { + "epoch": 0.55, + "grad_norm": 0.6588526125122433, + "learning_rate": 2.1962256821361474e-05, + "loss": 1.8729, + "step": 7163 + }, + { + "epoch": 0.55, + "grad_norm": 0.698886202931842, + "learning_rate": 2.1956056498279072e-05, + "loss": 1.9966, + "step": 7164 + }, + { + "epoch": 0.55, + "grad_norm": 0.6022777693830814, + "learning_rate": 2.1949856365242573e-05, + "loss": 2.0972, + "step": 7165 + }, + { + "epoch": 0.55, + "grad_norm": 0.6355794948438966, + "learning_rate": 2.194365642263908e-05, + "loss": 1.9053, + "step": 7166 + }, + { + "epoch": 0.55, + "grad_norm": 0.6804858176317747, + "learning_rate": 2.193745667085566e-05, + "loss": 1.8692, + "step": 7167 + }, + { + "epoch": 0.55, + "grad_norm": 0.8027507694284006, + "learning_rate": 2.1931257110279404e-05, + "loss": 2.0691, + "step": 7168 + }, + { + "epoch": 0.55, + "grad_norm": 0.5709697890511846, + "learning_rate": 2.192505774129738e-05, + "loss": 1.892, + "step": 7169 + }, + { + "epoch": 0.55, + "grad_norm": 0.8380485138601247, + "learning_rate": 2.1918858564296625e-05, + "loss": 1.9127, + "step": 7170 + }, + { + "epoch": 0.55, + "grad_norm": 0.6667429478176476, + "learning_rate": 2.1912659579664194e-05, + "loss": 2.0148, + "step": 7171 + }, + { + "epoch": 0.55, + "grad_norm": 0.8201416312174684, + "learning_rate": 2.19064607877871e-05, + "loss": 2.1194, + "step": 7172 + }, + { + "epoch": 0.55, + "grad_norm": 0.6685640113804465, + "learning_rate": 2.1900262189052374e-05, + "loss": 1.8882, + "step": 7173 + }, + { + "epoch": 0.55, + "grad_norm": 0.734331434556597, + "learning_rate": 2.1894063783847017e-05, + "loss": 1.9626, + "step": 7174 + }, + { + "epoch": 0.55, + "grad_norm": 0.7149495157399715, + "learning_rate": 2.1887865572558008e-05, + "loss": 1.8565, + "step": 7175 + }, + { + "epoch": 0.55, + "grad_norm": 0.6527159256163847, + "learning_rate": 2.188166755557234e-05, + "loss": 2.0872, + "step": 7176 + }, + { + "epoch": 0.55, + "grad_norm": 0.6156317127751327, + "learning_rate": 2.1875469733276984e-05, + "loss": 1.9913, + "step": 7177 + }, + { + "epoch": 0.55, + "grad_norm": 0.6557012985822566, + "learning_rate": 2.1869272106058877e-05, + "loss": 1.9005, + "step": 7178 + }, + { + "epoch": 0.55, + "grad_norm": 0.6481861921031615, + "learning_rate": 2.1863074674304985e-05, + "loss": 1.8875, + "step": 7179 + }, + { + "epoch": 0.55, + "grad_norm": 0.6915144718678985, + "learning_rate": 2.1856877438402225e-05, + "loss": 2.0598, + "step": 7180 + }, + { + "epoch": 0.55, + "grad_norm": 0.5659897229314858, + "learning_rate": 2.1850680398737507e-05, + "loss": 1.9198, + "step": 7181 + }, + { + "epoch": 0.55, + "grad_norm": 0.7180510336848404, + "learning_rate": 2.1844483555697765e-05, + "loss": 1.9171, + "step": 7182 + }, + { + "epoch": 0.55, + "grad_norm": 0.6970089792392445, + "learning_rate": 2.183828690966987e-05, + "loss": 2.0298, + "step": 7183 + }, + { + "epoch": 0.55, + "grad_norm": 0.7449044811721637, + "learning_rate": 2.183209046104072e-05, + "loss": 2.0964, + "step": 7184 + }, + { + "epoch": 0.55, + "grad_norm": 0.6001297285281055, + "learning_rate": 2.1825894210197166e-05, + "loss": 1.9219, + "step": 7185 + }, + { + "epoch": 0.55, + "grad_norm": 0.7751054949107328, + "learning_rate": 2.1819698157526077e-05, + "loss": 1.9525, + "step": 7186 + }, + { + "epoch": 0.55, + "grad_norm": 0.6261356355558343, + "learning_rate": 2.1813502303414306e-05, + "loss": 1.8893, + "step": 7187 + }, + { + "epoch": 0.55, + "grad_norm": 0.5529366505647058, + "learning_rate": 2.1807306648248667e-05, + "loss": 2.0767, + "step": 7188 + }, + { + "epoch": 0.55, + "grad_norm": 0.5735127884941142, + "learning_rate": 2.180111119241599e-05, + "loss": 1.9527, + "step": 7189 + }, + { + "epoch": 0.55, + "grad_norm": 0.6297245764528177, + "learning_rate": 2.179491593630309e-05, + "loss": 1.8792, + "step": 7190 + }, + { + "epoch": 0.55, + "grad_norm": 0.6635401985904764, + "learning_rate": 2.178872088029675e-05, + "loss": 1.9294, + "step": 7191 + }, + { + "epoch": 0.55, + "grad_norm": 0.5366730749779657, + "learning_rate": 2.1782526024783765e-05, + "loss": 2.1127, + "step": 7192 + }, + { + "epoch": 0.55, + "grad_norm": 0.637139723089521, + "learning_rate": 2.1776331370150886e-05, + "loss": 1.8955, + "step": 7193 + }, + { + "epoch": 0.56, + "grad_norm": 0.6160504497415435, + "learning_rate": 2.1770136916784886e-05, + "loss": 1.9309, + "step": 7194 + }, + { + "epoch": 0.56, + "grad_norm": 0.595431003120135, + "learning_rate": 2.1763942665072515e-05, + "loss": 1.9768, + "step": 7195 + }, + { + "epoch": 0.56, + "grad_norm": 0.6425601155605549, + "learning_rate": 2.1757748615400486e-05, + "loss": 1.9279, + "step": 7196 + }, + { + "epoch": 0.56, + "grad_norm": 0.6066156650627091, + "learning_rate": 2.175155476815554e-05, + "loss": 2.1865, + "step": 7197 + }, + { + "epoch": 0.56, + "grad_norm": 0.6224862680018973, + "learning_rate": 2.174536112372437e-05, + "loss": 1.9068, + "step": 7198 + }, + { + "epoch": 0.56, + "grad_norm": 0.6522896334917498, + "learning_rate": 2.1739167682493673e-05, + "loss": 1.8614, + "step": 7199 + }, + { + "epoch": 0.56, + "grad_norm": 0.6530008537812411, + "learning_rate": 2.1732974444850143e-05, + "loss": 2.0685, + "step": 7200 + }, + { + "epoch": 0.56, + "grad_norm": 0.6220273372286494, + "learning_rate": 2.1726781411180436e-05, + "loss": 1.8989, + "step": 7201 + }, + { + "epoch": 0.56, + "grad_norm": 0.6875934935557652, + "learning_rate": 2.1720588581871205e-05, + "loss": 2.0173, + "step": 7202 + }, + { + "epoch": 0.56, + "grad_norm": 0.647425471230818, + "learning_rate": 2.1714395957309108e-05, + "loss": 1.895, + "step": 7203 + }, + { + "epoch": 0.56, + "grad_norm": 0.6975663778019863, + "learning_rate": 2.170820353788076e-05, + "loss": 2.132, + "step": 7204 + }, + { + "epoch": 0.56, + "grad_norm": 0.652413674375587, + "learning_rate": 2.1702011323972806e-05, + "loss": 1.9458, + "step": 7205 + }, + { + "epoch": 0.56, + "grad_norm": 0.5861768745372263, + "learning_rate": 2.169581931597182e-05, + "loss": 1.8725, + "step": 7206 + }, + { + "epoch": 0.56, + "grad_norm": 0.6773070762895714, + "learning_rate": 2.1689627514264406e-05, + "loss": 1.9253, + "step": 7207 + }, + { + "epoch": 0.56, + "grad_norm": 0.6701305382543101, + "learning_rate": 2.168343591923715e-05, + "loss": 1.9629, + "step": 7208 + }, + { + "epoch": 0.56, + "grad_norm": 0.6551773956243744, + "learning_rate": 2.1677244531276615e-05, + "loss": 2.1104, + "step": 7209 + }, + { + "epoch": 0.56, + "grad_norm": 0.7680767086947965, + "learning_rate": 2.167105335076935e-05, + "loss": 1.8678, + "step": 7210 + }, + { + "epoch": 0.56, + "grad_norm": 0.6623728301108752, + "learning_rate": 2.1664862378101907e-05, + "loss": 1.9475, + "step": 7211 + }, + { + "epoch": 0.56, + "grad_norm": 0.6544914406511996, + "learning_rate": 2.1658671613660805e-05, + "loss": 2.0887, + "step": 7212 + }, + { + "epoch": 0.56, + "grad_norm": 0.7121214269180476, + "learning_rate": 2.1652481057832565e-05, + "loss": 1.9314, + "step": 7213 + }, + { + "epoch": 0.56, + "grad_norm": 0.5707876974294038, + "learning_rate": 2.1646290711003673e-05, + "loss": 1.9861, + "step": 7214 + }, + { + "epoch": 0.56, + "grad_norm": 0.6926351362731875, + "learning_rate": 2.1640100573560635e-05, + "loss": 1.8641, + "step": 7215 + }, + { + "epoch": 0.56, + "grad_norm": 0.6122877260474989, + "learning_rate": 2.1633910645889928e-05, + "loss": 1.8801, + "step": 7216 + }, + { + "epoch": 0.56, + "grad_norm": 0.6669921394037489, + "learning_rate": 2.162772092837799e-05, + "loss": 2.1207, + "step": 7217 + }, + { + "epoch": 0.56, + "grad_norm": 0.5918065698270532, + "learning_rate": 2.1621531421411306e-05, + "loss": 1.9429, + "step": 7218 + }, + { + "epoch": 0.56, + "grad_norm": 0.6268004447010787, + "learning_rate": 2.1615342125376287e-05, + "loss": 1.8946, + "step": 7219 + }, + { + "epoch": 0.56, + "grad_norm": 0.5598774952385015, + "learning_rate": 2.1609153040659358e-05, + "loss": 1.9713, + "step": 7220 + }, + { + "epoch": 0.56, + "grad_norm": 0.7073847708333127, + "learning_rate": 2.1602964167646947e-05, + "loss": 2.1206, + "step": 7221 + }, + { + "epoch": 0.56, + "grad_norm": 0.5838708180880664, + "learning_rate": 2.1596775506725424e-05, + "loss": 1.8763, + "step": 7222 + }, + { + "epoch": 0.56, + "grad_norm": 0.5800284778255044, + "learning_rate": 2.1590587058281197e-05, + "loss": 1.8516, + "step": 7223 + }, + { + "epoch": 0.56, + "grad_norm": 0.657618647389656, + "learning_rate": 2.158439882270063e-05, + "loss": 2.1243, + "step": 7224 + }, + { + "epoch": 0.56, + "grad_norm": 0.5562270119868195, + "learning_rate": 2.1578210800370065e-05, + "loss": 1.9324, + "step": 7225 + }, + { + "epoch": 0.56, + "grad_norm": 0.5721847601824016, + "learning_rate": 2.1572022991675867e-05, + "loss": 1.993, + "step": 7226 + }, + { + "epoch": 0.56, + "grad_norm": 0.5690747419460479, + "learning_rate": 2.156583539700435e-05, + "loss": 1.911, + "step": 7227 + }, + { + "epoch": 0.56, + "grad_norm": 0.5881490877444991, + "learning_rate": 2.155964801674183e-05, + "loss": 1.9269, + "step": 7228 + }, + { + "epoch": 0.56, + "grad_norm": 0.6207478585431586, + "learning_rate": 2.155346085127463e-05, + "loss": 2.1148, + "step": 7229 + }, + { + "epoch": 0.56, + "grad_norm": 0.6411459013281953, + "learning_rate": 2.1547273900989026e-05, + "loss": 1.9268, + "step": 7230 + }, + { + "epoch": 0.56, + "grad_norm": 0.558632660448574, + "learning_rate": 2.1541087166271298e-05, + "loss": 1.8909, + "step": 7231 + }, + { + "epoch": 0.56, + "grad_norm": 0.6412867247519702, + "learning_rate": 2.15349006475077e-05, + "loss": 2.1073, + "step": 7232 + }, + { + "epoch": 0.56, + "grad_norm": 0.5938810372108251, + "learning_rate": 2.152871434508449e-05, + "loss": 1.9594, + "step": 7233 + }, + { + "epoch": 0.56, + "grad_norm": 0.6025254743018162, + "learning_rate": 2.1522528259387915e-05, + "loss": 1.8702, + "step": 7234 + }, + { + "epoch": 0.56, + "grad_norm": 0.5538153892434978, + "learning_rate": 2.1516342390804174e-05, + "loss": 1.9244, + "step": 7235 + }, + { + "epoch": 0.56, + "grad_norm": 0.6292929905538829, + "learning_rate": 2.1510156739719486e-05, + "loss": 2.0605, + "step": 7236 + }, + { + "epoch": 0.56, + "grad_norm": 0.589755722392195, + "learning_rate": 2.1503971306520068e-05, + "loss": 1.9073, + "step": 7237 + }, + { + "epoch": 0.56, + "grad_norm": 0.6042969964154412, + "learning_rate": 2.1497786091592066e-05, + "loss": 1.9172, + "step": 7238 + }, + { + "epoch": 0.56, + "grad_norm": 0.5792345466784315, + "learning_rate": 2.1491601095321678e-05, + "loss": 1.9593, + "step": 7239 + }, + { + "epoch": 0.56, + "grad_norm": 0.6446262371470731, + "learning_rate": 2.1485416318095034e-05, + "loss": 1.8965, + "step": 7240 + }, + { + "epoch": 0.56, + "grad_norm": 0.6075509528148927, + "learning_rate": 2.1479231760298292e-05, + "loss": 2.127, + "step": 7241 + }, + { + "epoch": 0.56, + "grad_norm": 0.5905849354335069, + "learning_rate": 2.147304742231758e-05, + "loss": 1.8929, + "step": 7242 + }, + { + "epoch": 0.56, + "grad_norm": 0.6174299116019957, + "learning_rate": 2.1466863304539002e-05, + "loss": 1.9116, + "step": 7243 + }, + { + "epoch": 0.56, + "grad_norm": 0.6828452486405352, + "learning_rate": 2.1460679407348663e-05, + "loss": 2.0654, + "step": 7244 + }, + { + "epoch": 0.56, + "grad_norm": 0.585932819302075, + "learning_rate": 2.1454495731132656e-05, + "loss": 1.9564, + "step": 7245 + }, + { + "epoch": 0.56, + "grad_norm": 0.6512186412691896, + "learning_rate": 2.1448312276277033e-05, + "loss": 1.9333, + "step": 7246 + }, + { + "epoch": 0.56, + "grad_norm": 0.7471593192695719, + "learning_rate": 2.1442129043167874e-05, + "loss": 1.866, + "step": 7247 + }, + { + "epoch": 0.56, + "grad_norm": 0.6217386041095689, + "learning_rate": 2.1435946032191214e-05, + "loss": 1.9202, + "step": 7248 + }, + { + "epoch": 0.56, + "grad_norm": 0.966798505952979, + "learning_rate": 2.1429763243733078e-05, + "loss": 2.1007, + "step": 7249 + }, + { + "epoch": 0.56, + "grad_norm": 0.5942478061829854, + "learning_rate": 2.14235806781795e-05, + "loss": 1.8993, + "step": 7250 + }, + { + "epoch": 0.56, + "grad_norm": 0.8772466218251117, + "learning_rate": 2.1417398335916466e-05, + "loss": 2.0149, + "step": 7251 + }, + { + "epoch": 0.56, + "grad_norm": 0.674575726489538, + "learning_rate": 2.1411216217329975e-05, + "loss": 1.8765, + "step": 7252 + }, + { + "epoch": 0.56, + "grad_norm": 0.6232146997120385, + "learning_rate": 2.1405034322805993e-05, + "loss": 2.0617, + "step": 7253 + }, + { + "epoch": 0.56, + "grad_norm": 0.780735005891494, + "learning_rate": 2.1398852652730485e-05, + "loss": 1.9695, + "step": 7254 + }, + { + "epoch": 0.56, + "grad_norm": 0.8718858315235172, + "learning_rate": 2.139267120748941e-05, + "loss": 1.9176, + "step": 7255 + }, + { + "epoch": 0.56, + "grad_norm": 0.6427575541782409, + "learning_rate": 2.1386489987468685e-05, + "loss": 2.1129, + "step": 7256 + }, + { + "epoch": 0.56, + "grad_norm": 0.6988038591974072, + "learning_rate": 2.1380308993054232e-05, + "loss": 1.9629, + "step": 7257 + }, + { + "epoch": 0.56, + "grad_norm": 0.8126967783396629, + "learning_rate": 2.1374128224631968e-05, + "loss": 1.9121, + "step": 7258 + }, + { + "epoch": 0.56, + "grad_norm": 0.6480828606258465, + "learning_rate": 2.1367947682587766e-05, + "loss": 1.8874, + "step": 7259 + }, + { + "epoch": 0.56, + "grad_norm": 0.7559179858354168, + "learning_rate": 2.136176736730752e-05, + "loss": 1.8846, + "step": 7260 + }, + { + "epoch": 0.56, + "grad_norm": 0.6264785205289588, + "learning_rate": 2.135558727917707e-05, + "loss": 2.0998, + "step": 7261 + }, + { + "epoch": 0.56, + "grad_norm": 0.6855266999770363, + "learning_rate": 2.1349407418582285e-05, + "loss": 1.9076, + "step": 7262 + }, + { + "epoch": 0.56, + "grad_norm": 0.6841639230184681, + "learning_rate": 2.1343227785908996e-05, + "loss": 1.8807, + "step": 7263 + }, + { + "epoch": 0.56, + "grad_norm": 0.5186418502883205, + "learning_rate": 2.133704838154301e-05, + "loss": 1.924, + "step": 7264 + }, + { + "epoch": 0.56, + "grad_norm": 0.6156556500052351, + "learning_rate": 2.1330869205870153e-05, + "loss": 2.0754, + "step": 7265 + }, + { + "epoch": 0.56, + "grad_norm": 0.6744965324455826, + "learning_rate": 2.1324690259276194e-05, + "loss": 1.8892, + "step": 7266 + }, + { + "epoch": 0.56, + "grad_norm": 0.5900687565550549, + "learning_rate": 2.1318511542146917e-05, + "loss": 1.8578, + "step": 7267 + }, + { + "epoch": 0.56, + "grad_norm": 0.5964819836294853, + "learning_rate": 2.1312333054868104e-05, + "loss": 2.0788, + "step": 7268 + }, + { + "epoch": 0.56, + "grad_norm": 0.6430949106129895, + "learning_rate": 2.130615479782548e-05, + "loss": 1.9343, + "step": 7269 + }, + { + "epoch": 0.56, + "grad_norm": 0.6288165514492883, + "learning_rate": 2.1299976771404777e-05, + "loss": 1.9859, + "step": 7270 + }, + { + "epoch": 0.56, + "grad_norm": 0.6304949588771951, + "learning_rate": 2.129379897599174e-05, + "loss": 1.9052, + "step": 7271 + }, + { + "epoch": 0.56, + "grad_norm": 0.6029355302321225, + "learning_rate": 2.1287621411972045e-05, + "loss": 1.9103, + "step": 7272 + }, + { + "epoch": 0.56, + "grad_norm": 0.8158719594910049, + "learning_rate": 2.1281444079731407e-05, + "loss": 2.0746, + "step": 7273 + }, + { + "epoch": 0.56, + "grad_norm": 0.6241475591582576, + "learning_rate": 2.1275266979655487e-05, + "loss": 1.9031, + "step": 7274 + }, + { + "epoch": 0.56, + "grad_norm": 0.5825824557244752, + "learning_rate": 2.1269090112129942e-05, + "loss": 1.8996, + "step": 7275 + }, + { + "epoch": 0.56, + "grad_norm": 0.7931492913138694, + "learning_rate": 2.1262913477540443e-05, + "loss": 1.9908, + "step": 7276 + }, + { + "epoch": 0.56, + "grad_norm": 0.8127104084335548, + "learning_rate": 2.1256737076272604e-05, + "loss": 2.1054, + "step": 7277 + }, + { + "epoch": 0.56, + "grad_norm": 0.7282542847005522, + "learning_rate": 2.125056090871204e-05, + "loss": 1.9168, + "step": 7278 + }, + { + "epoch": 0.56, + "grad_norm": 0.8202587071699139, + "learning_rate": 2.1244384975244373e-05, + "loss": 1.9363, + "step": 7279 + }, + { + "epoch": 0.56, + "grad_norm": 0.7050566243680747, + "learning_rate": 2.123820927625517e-05, + "loss": 1.8982, + "step": 7280 + }, + { + "epoch": 0.56, + "grad_norm": 0.6849295607571945, + "learning_rate": 2.1232033812130026e-05, + "loss": 2.1236, + "step": 7281 + }, + { + "epoch": 0.56, + "grad_norm": 0.6767800671926392, + "learning_rate": 2.1225858583254478e-05, + "loss": 1.9975, + "step": 7282 + }, + { + "epoch": 0.56, + "grad_norm": 0.6911295310121451, + "learning_rate": 2.1219683590014087e-05, + "loss": 1.914, + "step": 7283 + }, + { + "epoch": 0.56, + "grad_norm": 0.7651820409333433, + "learning_rate": 2.121350883279439e-05, + "loss": 1.9307, + "step": 7284 + }, + { + "epoch": 0.56, + "grad_norm": 0.7892108656387842, + "learning_rate": 2.120733431198088e-05, + "loss": 2.0932, + "step": 7285 + }, + { + "epoch": 0.56, + "grad_norm": 0.7156816524560146, + "learning_rate": 2.1201160027959077e-05, + "loss": 1.9326, + "step": 7286 + }, + { + "epoch": 0.56, + "grad_norm": 0.6845370047602501, + "learning_rate": 2.119498598111445e-05, + "loss": 1.8781, + "step": 7287 + }, + { + "epoch": 0.56, + "grad_norm": 0.7145397295165097, + "learning_rate": 2.1188812171832484e-05, + "loss": 2.0986, + "step": 7288 + }, + { + "epoch": 0.56, + "grad_norm": 0.6967629017218039, + "learning_rate": 2.1182638600498638e-05, + "loss": 1.9753, + "step": 7289 + }, + { + "epoch": 0.56, + "grad_norm": 0.6563082609900799, + "learning_rate": 2.1176465267498335e-05, + "loss": 1.9112, + "step": 7290 + }, + { + "epoch": 0.56, + "grad_norm": 0.765324743082738, + "learning_rate": 2.1170292173217016e-05, + "loss": 1.8701, + "step": 7291 + }, + { + "epoch": 0.56, + "grad_norm": 0.6646597247079286, + "learning_rate": 2.1164119318040097e-05, + "loss": 1.9354, + "step": 7292 + }, + { + "epoch": 0.56, + "grad_norm": 0.5549407964088721, + "learning_rate": 2.1157946702352956e-05, + "loss": 2.1084, + "step": 7293 + }, + { + "epoch": 0.56, + "grad_norm": 0.6800305775096903, + "learning_rate": 2.1151774326540995e-05, + "loss": 1.8631, + "step": 7294 + }, + { + "epoch": 0.56, + "grad_norm": 0.5770964736927102, + "learning_rate": 2.1145602190989573e-05, + "loss": 1.9447, + "step": 7295 + }, + { + "epoch": 0.56, + "grad_norm": 0.6505654133223174, + "learning_rate": 2.1139430296084028e-05, + "loss": 1.8349, + "step": 7296 + }, + { + "epoch": 0.56, + "grad_norm": 0.5746499518406967, + "learning_rate": 2.1133258642209723e-05, + "loss": 2.0849, + "step": 7297 + }, + { + "epoch": 0.56, + "grad_norm": 0.7566038324024239, + "learning_rate": 2.112708722975196e-05, + "loss": 1.904, + "step": 7298 + }, + { + "epoch": 0.56, + "grad_norm": 0.6691683081121365, + "learning_rate": 2.112091605909606e-05, + "loss": 1.8989, + "step": 7299 + }, + { + "epoch": 0.56, + "grad_norm": 0.5890623129883249, + "learning_rate": 2.111474513062729e-05, + "loss": 2.0621, + "step": 7300 + }, + { + "epoch": 0.56, + "grad_norm": 0.6424871177697925, + "learning_rate": 2.110857444473095e-05, + "loss": 1.9735, + "step": 7301 + }, + { + "epoch": 0.56, + "grad_norm": 0.5936359825885794, + "learning_rate": 2.11024040017923e-05, + "loss": 1.9183, + "step": 7302 + }, + { + "epoch": 0.56, + "grad_norm": 0.6188357063057587, + "learning_rate": 2.1096233802196566e-05, + "loss": 1.9112, + "step": 7303 + }, + { + "epoch": 0.56, + "grad_norm": 0.5953951359081591, + "learning_rate": 2.1090063846328994e-05, + "loss": 1.8927, + "step": 7304 + }, + { + "epoch": 0.56, + "grad_norm": 0.6504911132892564, + "learning_rate": 2.108389413457481e-05, + "loss": 2.0904, + "step": 7305 + }, + { + "epoch": 0.56, + "grad_norm": 0.6652314757090627, + "learning_rate": 2.10777246673192e-05, + "loss": 1.9195, + "step": 7306 + }, + { + "epoch": 0.56, + "grad_norm": 0.5849517295895404, + "learning_rate": 2.1071555444947356e-05, + "loss": 1.9119, + "step": 7307 + }, + { + "epoch": 0.56, + "grad_norm": 0.6236488856542042, + "learning_rate": 2.106538646784443e-05, + "loss": 1.8592, + "step": 7308 + }, + { + "epoch": 0.56, + "grad_norm": 0.6425707000101294, + "learning_rate": 2.10592177363956e-05, + "loss": 2.1318, + "step": 7309 + }, + { + "epoch": 0.56, + "grad_norm": 0.737830631510237, + "learning_rate": 2.1053049250986e-05, + "loss": 1.9342, + "step": 7310 + }, + { + "epoch": 0.56, + "grad_norm": 0.6455412232761667, + "learning_rate": 2.1046881012000737e-05, + "loss": 1.9663, + "step": 7311 + }, + { + "epoch": 0.56, + "grad_norm": 0.6239959017085986, + "learning_rate": 2.1040713019824947e-05, + "loss": 1.8808, + "step": 7312 + }, + { + "epoch": 0.56, + "grad_norm": 0.7998685955768088, + "learning_rate": 2.1034545274843702e-05, + "loss": 2.1561, + "step": 7313 + }, + { + "epoch": 0.56, + "grad_norm": 0.6866229055262413, + "learning_rate": 2.1028377777442076e-05, + "loss": 1.9229, + "step": 7314 + }, + { + "epoch": 0.56, + "grad_norm": 0.6133971319030368, + "learning_rate": 2.1022210528005158e-05, + "loss": 1.8722, + "step": 7315 + }, + { + "epoch": 0.56, + "grad_norm": 0.6879325007277484, + "learning_rate": 2.101604352691797e-05, + "loss": 1.9324, + "step": 7316 + }, + { + "epoch": 0.56, + "grad_norm": 0.703810009496628, + "learning_rate": 2.1009876774565546e-05, + "loss": 2.1263, + "step": 7317 + }, + { + "epoch": 0.56, + "grad_norm": 0.6151415270233368, + "learning_rate": 2.1003710271332914e-05, + "loss": 1.8897, + "step": 7318 + }, + { + "epoch": 0.56, + "grad_norm": 0.6151800921032786, + "learning_rate": 2.0997544017605062e-05, + "loss": 1.933, + "step": 7319 + }, + { + "epoch": 0.56, + "grad_norm": 0.6247689092229443, + "learning_rate": 2.099137801376699e-05, + "loss": 1.9846, + "step": 7320 + }, + { + "epoch": 0.56, + "grad_norm": 0.5750809373967433, + "learning_rate": 2.0985212260203646e-05, + "loss": 2.0955, + "step": 7321 + }, + { + "epoch": 0.56, + "grad_norm": 0.6239106670710417, + "learning_rate": 2.097904675729999e-05, + "loss": 1.8829, + "step": 7322 + }, + { + "epoch": 0.56, + "grad_norm": 0.6638413466005451, + "learning_rate": 2.0972881505440977e-05, + "loss": 1.8935, + "step": 7323 + }, + { + "epoch": 0.57, + "grad_norm": 0.7277011628816623, + "learning_rate": 2.0966716505011507e-05, + "loss": 1.8811, + "step": 7324 + }, + { + "epoch": 0.57, + "grad_norm": 0.7630823199090656, + "learning_rate": 2.096055175639649e-05, + "loss": 2.1231, + "step": 7325 + }, + { + "epoch": 0.57, + "grad_norm": 0.6393340549565498, + "learning_rate": 2.0954387259980834e-05, + "loss": 1.9858, + "step": 7326 + }, + { + "epoch": 0.57, + "grad_norm": 0.655572543583511, + "learning_rate": 2.0948223016149395e-05, + "loss": 1.9043, + "step": 7327 + }, + { + "epoch": 0.57, + "grad_norm": 0.7111844582569002, + "learning_rate": 2.0942059025287042e-05, + "loss": 1.921, + "step": 7328 + }, + { + "epoch": 0.57, + "grad_norm": 0.762136831360782, + "learning_rate": 2.0935895287778607e-05, + "loss": 2.1168, + "step": 7329 + }, + { + "epoch": 0.57, + "grad_norm": 0.5797093263866309, + "learning_rate": 2.0929731804008927e-05, + "loss": 1.8717, + "step": 7330 + }, + { + "epoch": 0.57, + "grad_norm": 0.766374980836551, + "learning_rate": 2.0923568574362816e-05, + "loss": 1.8769, + "step": 7331 + }, + { + "epoch": 0.57, + "grad_norm": 0.6227355717302747, + "learning_rate": 2.091740559922506e-05, + "loss": 1.9866, + "step": 7332 + }, + { + "epoch": 0.57, + "grad_norm": 0.6500367381723878, + "learning_rate": 2.091124287898045e-05, + "loss": 2.1388, + "step": 7333 + }, + { + "epoch": 0.57, + "grad_norm": 0.6039645270186607, + "learning_rate": 2.090508041401374e-05, + "loss": 1.9131, + "step": 7334 + }, + { + "epoch": 0.57, + "grad_norm": 0.5906587379808558, + "learning_rate": 2.089891820470968e-05, + "loss": 1.8895, + "step": 7335 + }, + { + "epoch": 0.57, + "grad_norm": 0.6914270717833539, + "learning_rate": 2.089275625145301e-05, + "loss": 1.9106, + "step": 7336 + }, + { + "epoch": 0.57, + "grad_norm": 0.6782870065893911, + "learning_rate": 2.088659455462843e-05, + "loss": 2.1223, + "step": 7337 + }, + { + "epoch": 0.57, + "grad_norm": 0.6588272938888124, + "learning_rate": 2.0880433114620655e-05, + "loss": 1.9942, + "step": 7338 + }, + { + "epoch": 0.57, + "grad_norm": 0.702487022460331, + "learning_rate": 2.0874271931814374e-05, + "loss": 1.8877, + "step": 7339 + }, + { + "epoch": 0.57, + "grad_norm": 0.6768333364061985, + "learning_rate": 2.086811100659423e-05, + "loss": 1.8878, + "step": 7340 + }, + { + "epoch": 0.57, + "grad_norm": 0.6666201058640474, + "learning_rate": 2.0861950339344903e-05, + "loss": 2.1104, + "step": 7341 + }, + { + "epoch": 0.57, + "grad_norm": 0.6435467248136331, + "learning_rate": 2.0855789930451013e-05, + "loss": 1.8925, + "step": 7342 + }, + { + "epoch": 0.57, + "grad_norm": 0.6320909845695809, + "learning_rate": 2.0849629780297177e-05, + "loss": 1.9657, + "step": 7343 + }, + { + "epoch": 0.57, + "grad_norm": 0.6593159093040133, + "learning_rate": 2.0843469889268012e-05, + "loss": 1.9709, + "step": 7344 + }, + { + "epoch": 0.57, + "grad_norm": 0.6994436826118214, + "learning_rate": 2.0837310257748093e-05, + "loss": 2.1234, + "step": 7345 + }, + { + "epoch": 0.57, + "grad_norm": 0.5976903538861065, + "learning_rate": 2.0831150886121998e-05, + "loss": 1.8707, + "step": 7346 + }, + { + "epoch": 0.57, + "grad_norm": 0.7060931059646764, + "learning_rate": 2.0824991774774267e-05, + "loss": 1.9359, + "step": 7347 + }, + { + "epoch": 0.57, + "grad_norm": 0.6027539226784565, + "learning_rate": 2.081883292408946e-05, + "loss": 1.9441, + "step": 7348 + }, + { + "epoch": 0.57, + "grad_norm": 0.705751071262903, + "learning_rate": 2.0812674334452098e-05, + "loss": 2.0955, + "step": 7349 + }, + { + "epoch": 0.57, + "grad_norm": 0.6501754017414836, + "learning_rate": 2.0806516006246667e-05, + "loss": 1.8891, + "step": 7350 + }, + { + "epoch": 0.57, + "grad_norm": 0.6324623236416029, + "learning_rate": 2.0800357939857666e-05, + "loss": 1.9692, + "step": 7351 + }, + { + "epoch": 0.57, + "grad_norm": 0.6050688502804717, + "learning_rate": 2.0794200135669584e-05, + "loss": 1.8858, + "step": 7352 + }, + { + "epoch": 0.57, + "grad_norm": 0.729193483662727, + "learning_rate": 2.078804259406686e-05, + "loss": 2.1256, + "step": 7353 + }, + { + "epoch": 0.57, + "grad_norm": 0.5865860927438855, + "learning_rate": 2.0781885315433945e-05, + "loss": 1.953, + "step": 7354 + }, + { + "epoch": 0.57, + "grad_norm": 0.609515615648004, + "learning_rate": 2.0775728300155247e-05, + "loss": 1.9525, + "step": 7355 + }, + { + "epoch": 0.57, + "grad_norm": 0.6442694883220836, + "learning_rate": 2.0769571548615192e-05, + "loss": 1.8235, + "step": 7356 + }, + { + "epoch": 0.57, + "grad_norm": 0.6364531234374721, + "learning_rate": 2.0763415061198167e-05, + "loss": 2.1417, + "step": 7357 + }, + { + "epoch": 0.57, + "grad_norm": 0.5782392300388424, + "learning_rate": 2.0757258838288535e-05, + "loss": 1.9585, + "step": 7358 + }, + { + "epoch": 0.57, + "grad_norm": 0.5868560016539307, + "learning_rate": 2.075110288027067e-05, + "loss": 1.9465, + "step": 7359 + }, + { + "epoch": 0.57, + "grad_norm": 0.6356635225885338, + "learning_rate": 2.074494718752891e-05, + "loss": 1.898, + "step": 7360 + }, + { + "epoch": 0.57, + "grad_norm": 0.6293176831216851, + "learning_rate": 2.0738791760447566e-05, + "loss": 2.1025, + "step": 7361 + }, + { + "epoch": 0.57, + "grad_norm": 0.6016449991054256, + "learning_rate": 2.0732636599410973e-05, + "loss": 1.9183, + "step": 7362 + }, + { + "epoch": 0.57, + "grad_norm": 0.6386926365839832, + "learning_rate": 2.07264817048034e-05, + "loss": 1.9901, + "step": 7363 + }, + { + "epoch": 0.57, + "grad_norm": 0.653190534971601, + "learning_rate": 2.0720327077009125e-05, + "loss": 1.9277, + "step": 7364 + }, + { + "epoch": 0.57, + "grad_norm": 0.6671414947956004, + "learning_rate": 2.0714172716412422e-05, + "loss": 2.1002, + "step": 7365 + }, + { + "epoch": 0.57, + "grad_norm": 0.6823965286627397, + "learning_rate": 2.0708018623397518e-05, + "loss": 1.9273, + "step": 7366 + }, + { + "epoch": 0.57, + "grad_norm": 0.6225618021359707, + "learning_rate": 2.070186479834865e-05, + "loss": 1.8743, + "step": 7367 + }, + { + "epoch": 0.57, + "grad_norm": 0.6371447632782691, + "learning_rate": 2.0695711241650005e-05, + "loss": 1.895, + "step": 7368 + }, + { + "epoch": 0.57, + "grad_norm": 0.6596598709758402, + "learning_rate": 2.068955795368579e-05, + "loss": 2.11, + "step": 7369 + }, + { + "epoch": 0.57, + "grad_norm": 0.6276795787017508, + "learning_rate": 2.0683404934840194e-05, + "loss": 1.9647, + "step": 7370 + }, + { + "epoch": 0.57, + "grad_norm": 0.645061532308815, + "learning_rate": 2.0677252185497357e-05, + "loss": 1.8867, + "step": 7371 + }, + { + "epoch": 0.57, + "grad_norm": 0.6170649555892768, + "learning_rate": 2.0671099706041413e-05, + "loss": 1.8827, + "step": 7372 + }, + { + "epoch": 0.57, + "grad_norm": 0.6017387813683415, + "learning_rate": 2.0664947496856514e-05, + "loss": 2.0895, + "step": 7373 + }, + { + "epoch": 0.57, + "grad_norm": 0.5851078981143658, + "learning_rate": 2.0658795558326743e-05, + "loss": 1.8712, + "step": 7374 + }, + { + "epoch": 0.57, + "grad_norm": 0.5973031696518187, + "learning_rate": 2.0652643890836206e-05, + "loss": 1.9821, + "step": 7375 + }, + { + "epoch": 0.57, + "grad_norm": 0.6775773923427596, + "learning_rate": 2.064649249476896e-05, + "loss": 1.9546, + "step": 7376 + }, + { + "epoch": 0.57, + "grad_norm": 0.6113860286992369, + "learning_rate": 2.0640341370509076e-05, + "loss": 2.1126, + "step": 7377 + }, + { + "epoch": 0.57, + "grad_norm": 0.6929904803168362, + "learning_rate": 2.0634190518440595e-05, + "loss": 1.8947, + "step": 7378 + }, + { + "epoch": 0.57, + "grad_norm": 0.7734925789046544, + "learning_rate": 2.0628039938947528e-05, + "loss": 1.9135, + "step": 7379 + }, + { + "epoch": 0.57, + "grad_norm": 0.6616484667557089, + "learning_rate": 2.0621889632413897e-05, + "loss": 1.9011, + "step": 7380 + }, + { + "epoch": 0.57, + "grad_norm": 0.7183585584146585, + "learning_rate": 2.0615739599223677e-05, + "loss": 2.1266, + "step": 7381 + }, + { + "epoch": 0.57, + "grad_norm": 0.7285490928099024, + "learning_rate": 2.0609589839760837e-05, + "loss": 1.9792, + "step": 7382 + }, + { + "epoch": 0.57, + "grad_norm": 0.6570023923710698, + "learning_rate": 2.0603440354409352e-05, + "loss": 1.8497, + "step": 7383 + }, + { + "epoch": 0.57, + "grad_norm": 0.7165867499995137, + "learning_rate": 2.0597291143553137e-05, + "loss": 1.8766, + "step": 7384 + }, + { + "epoch": 0.57, + "grad_norm": 0.7159072975781767, + "learning_rate": 2.059114220757613e-05, + "loss": 2.1021, + "step": 7385 + }, + { + "epoch": 0.57, + "grad_norm": 0.5531413868131627, + "learning_rate": 2.0584993546862226e-05, + "loss": 1.8672, + "step": 7386 + }, + { + "epoch": 0.57, + "grad_norm": 0.6555195095108425, + "learning_rate": 2.0578845161795307e-05, + "loss": 1.9261, + "step": 7387 + }, + { + "epoch": 0.57, + "grad_norm": 0.6276633997778613, + "learning_rate": 2.057269705275926e-05, + "loss": 2.019, + "step": 7388 + }, + { + "epoch": 0.57, + "grad_norm": 0.6387465327063436, + "learning_rate": 2.056654922013791e-05, + "loss": 2.0728, + "step": 7389 + }, + { + "epoch": 0.57, + "grad_norm": 0.6036005097645605, + "learning_rate": 2.0560401664315103e-05, + "loss": 1.9428, + "step": 7390 + }, + { + "epoch": 0.57, + "grad_norm": 0.6996799062165104, + "learning_rate": 2.0554254385674672e-05, + "loss": 1.8858, + "step": 7391 + }, + { + "epoch": 0.57, + "grad_norm": 0.6748253409045181, + "learning_rate": 2.05481073846004e-05, + "loss": 1.9227, + "step": 7392 + }, + { + "epoch": 0.57, + "grad_norm": 0.7628063743483974, + "learning_rate": 2.0541960661476063e-05, + "loss": 2.1167, + "step": 7393 + }, + { + "epoch": 0.57, + "grad_norm": 0.658668073202702, + "learning_rate": 2.0535814216685447e-05, + "loss": 1.987, + "step": 7394 + }, + { + "epoch": 0.57, + "grad_norm": 0.7283869058421752, + "learning_rate": 2.0529668050612285e-05, + "loss": 1.9042, + "step": 7395 + }, + { + "epoch": 0.57, + "grad_norm": 0.6069732163069816, + "learning_rate": 2.0523522163640316e-05, + "loss": 1.8773, + "step": 7396 + }, + { + "epoch": 0.57, + "grad_norm": 0.6449391014440827, + "learning_rate": 2.0517376556153236e-05, + "loss": 2.1408, + "step": 7397 + }, + { + "epoch": 0.57, + "grad_norm": 0.6505226632544117, + "learning_rate": 2.051123122853476e-05, + "loss": 1.8859, + "step": 7398 + }, + { + "epoch": 0.57, + "grad_norm": 0.5534992712997799, + "learning_rate": 2.0505086181168563e-05, + "loss": 1.8782, + "step": 7399 + }, + { + "epoch": 0.57, + "grad_norm": 0.6117614993161784, + "learning_rate": 2.049894141443829e-05, + "loss": 1.9413, + "step": 7400 + }, + { + "epoch": 0.57, + "grad_norm": 0.6527196665692702, + "learning_rate": 2.049279692872761e-05, + "loss": 2.0682, + "step": 7401 + }, + { + "epoch": 0.57, + "grad_norm": 0.640524576464648, + "learning_rate": 2.048665272442012e-05, + "loss": 1.8479, + "step": 7402 + }, + { + "epoch": 0.57, + "grad_norm": 0.658504031410833, + "learning_rate": 2.0480508801899444e-05, + "loss": 1.8991, + "step": 7403 + }, + { + "epoch": 0.57, + "grad_norm": 0.7941770572559292, + "learning_rate": 2.047436516154918e-05, + "loss": 1.9206, + "step": 7404 + }, + { + "epoch": 0.57, + "grad_norm": 0.6878974126968476, + "learning_rate": 2.046822180375288e-05, + "loss": 2.0717, + "step": 7405 + }, + { + "epoch": 0.57, + "grad_norm": 0.71425617941636, + "learning_rate": 2.0462078728894113e-05, + "loss": 2.0106, + "step": 7406 + }, + { + "epoch": 0.57, + "grad_norm": 0.8648724224857081, + "learning_rate": 2.0455935937356418e-05, + "loss": 1.9283, + "step": 7407 + }, + { + "epoch": 0.57, + "grad_norm": 0.626782430205911, + "learning_rate": 2.0449793429523302e-05, + "loss": 1.8618, + "step": 7408 + }, + { + "epoch": 0.57, + "grad_norm": 0.7414114821446328, + "learning_rate": 2.0443651205778284e-05, + "loss": 2.1174, + "step": 7409 + }, + { + "epoch": 0.57, + "grad_norm": 0.6968132671299696, + "learning_rate": 2.0437509266504832e-05, + "loss": 1.9201, + "step": 7410 + }, + { + "epoch": 0.57, + "grad_norm": 0.7281002042507275, + "learning_rate": 2.0431367612086416e-05, + "loss": 1.8651, + "step": 7411 + }, + { + "epoch": 0.57, + "grad_norm": 0.6356516589441528, + "learning_rate": 2.04252262429065e-05, + "loss": 1.8927, + "step": 7412 + }, + { + "epoch": 0.57, + "grad_norm": 0.7431854250079011, + "learning_rate": 2.0419085159348498e-05, + "loss": 1.9704, + "step": 7413 + }, + { + "epoch": 0.57, + "grad_norm": 0.8332549538721065, + "learning_rate": 2.041294436179583e-05, + "loss": 2.1103, + "step": 7414 + }, + { + "epoch": 0.57, + "grad_norm": 0.5797416300580368, + "learning_rate": 2.0406803850631883e-05, + "loss": 1.8642, + "step": 7415 + }, + { + "epoch": 0.57, + "grad_norm": 0.7284414722257874, + "learning_rate": 2.040066362624004e-05, + "loss": 1.8877, + "step": 7416 + }, + { + "epoch": 0.57, + "grad_norm": 0.7216333573857389, + "learning_rate": 2.0394523689003673e-05, + "loss": 2.0827, + "step": 7417 + }, + { + "epoch": 0.57, + "grad_norm": 0.6284776663562003, + "learning_rate": 2.0388384039306106e-05, + "loss": 1.8757, + "step": 7418 + }, + { + "epoch": 0.57, + "grad_norm": 0.5951179781957419, + "learning_rate": 2.038224467753066e-05, + "loss": 1.9637, + "step": 7419 + }, + { + "epoch": 0.57, + "grad_norm": 0.7521373977327651, + "learning_rate": 2.0376105604060662e-05, + "loss": 1.9383, + "step": 7420 + }, + { + "epoch": 0.57, + "grad_norm": 0.6787394134085954, + "learning_rate": 2.0369966819279384e-05, + "loss": 2.0367, + "step": 7421 + }, + { + "epoch": 0.57, + "grad_norm": 0.6373840400038623, + "learning_rate": 2.03638283235701e-05, + "loss": 1.8479, + "step": 7422 + }, + { + "epoch": 0.57, + "grad_norm": 0.737724974414012, + "learning_rate": 2.035769011731605e-05, + "loss": 1.8992, + "step": 7423 + }, + { + "epoch": 0.57, + "grad_norm": 0.6356524877603701, + "learning_rate": 2.0351552200900483e-05, + "loss": 1.9707, + "step": 7424 + }, + { + "epoch": 0.57, + "grad_norm": 0.6993567715220683, + "learning_rate": 2.034541457470661e-05, + "loss": 1.9438, + "step": 7425 + }, + { + "epoch": 0.57, + "grad_norm": 0.6278721841817009, + "learning_rate": 2.0339277239117622e-05, + "loss": 2.0964, + "step": 7426 + }, + { + "epoch": 0.57, + "grad_norm": 0.7216983972602918, + "learning_rate": 2.0333140194516705e-05, + "loss": 1.9102, + "step": 7427 + }, + { + "epoch": 0.57, + "grad_norm": 0.5843360294576814, + "learning_rate": 2.032700344128702e-05, + "loss": 1.9312, + "step": 7428 + }, + { + "epoch": 0.57, + "grad_norm": 0.7878950908674687, + "learning_rate": 2.0320866979811702e-05, + "loss": 2.055, + "step": 7429 + }, + { + "epoch": 0.57, + "grad_norm": 0.6416528585355762, + "learning_rate": 2.0314730810473886e-05, + "loss": 1.8602, + "step": 7430 + }, + { + "epoch": 0.57, + "grad_norm": 0.645743870292766, + "learning_rate": 2.0308594933656673e-05, + "loss": 2.0019, + "step": 7431 + }, + { + "epoch": 0.57, + "grad_norm": 0.6327059867524288, + "learning_rate": 2.030245934974314e-05, + "loss": 1.895, + "step": 7432 + }, + { + "epoch": 0.57, + "grad_norm": 0.5957065992942077, + "learning_rate": 2.0296324059116384e-05, + "loss": 1.8854, + "step": 7433 + }, + { + "epoch": 0.57, + "grad_norm": 0.6853454415237912, + "learning_rate": 2.0290189062159426e-05, + "loss": 2.1029, + "step": 7434 + }, + { + "epoch": 0.57, + "grad_norm": 0.5830829976964061, + "learning_rate": 2.028405435925533e-05, + "loss": 1.8808, + "step": 7435 + }, + { + "epoch": 0.57, + "grad_norm": 0.661245305855937, + "learning_rate": 2.027791995078708e-05, + "loss": 1.8743, + "step": 7436 + }, + { + "epoch": 0.57, + "grad_norm": 0.7477937381320924, + "learning_rate": 2.0271785837137685e-05, + "loss": 2.0629, + "step": 7437 + }, + { + "epoch": 0.57, + "grad_norm": 0.706015409677576, + "learning_rate": 2.0265652018690135e-05, + "loss": 1.9585, + "step": 7438 + }, + { + "epoch": 0.57, + "grad_norm": 0.7595538683054777, + "learning_rate": 2.0259518495827372e-05, + "loss": 1.9137, + "step": 7439 + }, + { + "epoch": 0.57, + "grad_norm": 0.5607071566507457, + "learning_rate": 2.025338526893234e-05, + "loss": 1.8659, + "step": 7440 + }, + { + "epoch": 0.57, + "grad_norm": 0.8435936549757441, + "learning_rate": 2.0247252338387976e-05, + "loss": 2.1034, + "step": 7441 + }, + { + "epoch": 0.57, + "grad_norm": 0.6633180763715603, + "learning_rate": 2.0241119704577168e-05, + "loss": 1.8802, + "step": 7442 + }, + { + "epoch": 0.57, + "grad_norm": 0.5874910635053093, + "learning_rate": 2.023498736788281e-05, + "loss": 1.8765, + "step": 7443 + }, + { + "epoch": 0.57, + "grad_norm": 0.7818123573771695, + "learning_rate": 2.0228855328687756e-05, + "loss": 1.9539, + "step": 7444 + }, + { + "epoch": 0.57, + "grad_norm": 0.6425650006863312, + "learning_rate": 2.022272358737487e-05, + "loss": 1.8853, + "step": 7445 + }, + { + "epoch": 0.57, + "grad_norm": 0.7359990422258524, + "learning_rate": 2.021659214432698e-05, + "loss": 2.118, + "step": 7446 + }, + { + "epoch": 0.57, + "grad_norm": 0.6259407194982133, + "learning_rate": 2.0210460999926887e-05, + "loss": 1.8953, + "step": 7447 + }, + { + "epoch": 0.57, + "grad_norm": 0.627919178173089, + "learning_rate": 2.0204330154557395e-05, + "loss": 1.8928, + "step": 7448 + }, + { + "epoch": 0.57, + "grad_norm": 0.5863685512161994, + "learning_rate": 2.0198199608601263e-05, + "loss": 2.0774, + "step": 7449 + }, + { + "epoch": 0.57, + "grad_norm": 0.6990582930259674, + "learning_rate": 2.0192069362441262e-05, + "loss": 2.0039, + "step": 7450 + }, + { + "epoch": 0.57, + "grad_norm": 0.6186505863705892, + "learning_rate": 2.0185939416460133e-05, + "loss": 1.8586, + "step": 7451 + }, + { + "epoch": 0.57, + "grad_norm": 0.5804258153121453, + "learning_rate": 2.0179809771040564e-05, + "loss": 1.8572, + "step": 7452 + }, + { + "epoch": 0.57, + "grad_norm": 0.6708033748391804, + "learning_rate": 2.0173680426565282e-05, + "loss": 2.0746, + "step": 7453 + }, + { + "epoch": 0.58, + "grad_norm": 0.5670344301200633, + "learning_rate": 2.016755138341696e-05, + "loss": 1.8525, + "step": 7454 + }, + { + "epoch": 0.58, + "grad_norm": 0.6115158828790319, + "learning_rate": 2.016142264197825e-05, + "loss": 1.9054, + "step": 7455 + }, + { + "epoch": 0.58, + "grad_norm": 0.5784293196161365, + "learning_rate": 2.015529420263181e-05, + "loss": 1.9921, + "step": 7456 + }, + { + "epoch": 0.58, + "grad_norm": 0.5936630638482707, + "learning_rate": 2.0149166065760254e-05, + "loss": 1.8874, + "step": 7457 + }, + { + "epoch": 0.58, + "grad_norm": 0.6060218442028465, + "learning_rate": 2.0143038231746182e-05, + "loss": 2.0903, + "step": 7458 + }, + { + "epoch": 0.58, + "grad_norm": 0.5762059045394486, + "learning_rate": 2.0136910700972193e-05, + "loss": 1.9866, + "step": 7459 + }, + { + "epoch": 0.58, + "grad_norm": 0.5996693658244541, + "learning_rate": 2.0130783473820843e-05, + "loss": 1.9174, + "step": 7460 + }, + { + "epoch": 0.58, + "grad_norm": 0.6404792004372338, + "learning_rate": 2.0124656550674693e-05, + "loss": 2.1065, + "step": 7461 + }, + { + "epoch": 0.58, + "grad_norm": 0.6121254749829028, + "learning_rate": 2.0118529931916247e-05, + "loss": 1.9494, + "step": 7462 + }, + { + "epoch": 0.58, + "grad_norm": 0.6516623998966036, + "learning_rate": 2.0112403617928037e-05, + "loss": 1.8624, + "step": 7463 + }, + { + "epoch": 0.58, + "grad_norm": 0.5898528230755222, + "learning_rate": 2.0106277609092557e-05, + "loss": 1.9155, + "step": 7464 + }, + { + "epoch": 0.58, + "grad_norm": 0.6476386149013306, + "learning_rate": 2.0100151905792257e-05, + "loss": 1.8746, + "step": 7465 + }, + { + "epoch": 0.58, + "grad_norm": 0.6260923744569447, + "learning_rate": 2.00940265084096e-05, + "loss": 2.1048, + "step": 7466 + }, + { + "epoch": 0.58, + "grad_norm": 0.5857347870856231, + "learning_rate": 2.008790141732704e-05, + "loss": 1.8918, + "step": 7467 + }, + { + "epoch": 0.58, + "grad_norm": 0.5651491519892858, + "learning_rate": 2.0081776632926965e-05, + "loss": 1.8136, + "step": 7468 + }, + { + "epoch": 0.58, + "grad_norm": 0.5522517109868166, + "learning_rate": 2.007565215559178e-05, + "loss": 1.9117, + "step": 7469 + }, + { + "epoch": 0.58, + "grad_norm": 0.6346710075024526, + "learning_rate": 2.0069527985703857e-05, + "loss": 2.1084, + "step": 7470 + }, + { + "epoch": 0.58, + "grad_norm": 0.6665164053294654, + "learning_rate": 2.0063404123645556e-05, + "loss": 1.8704, + "step": 7471 + }, + { + "epoch": 0.58, + "grad_norm": 0.5901807610479669, + "learning_rate": 2.0057280569799223e-05, + "loss": 1.9049, + "step": 7472 + }, + { + "epoch": 0.58, + "grad_norm": 0.635968917172567, + "learning_rate": 2.0051157324547163e-05, + "loss": 2.1212, + "step": 7473 + }, + { + "epoch": 0.58, + "grad_norm": 0.605951827647163, + "learning_rate": 2.0045034388271682e-05, + "loss": 1.9667, + "step": 7474 + }, + { + "epoch": 0.58, + "grad_norm": 0.6223305054312995, + "learning_rate": 2.0038911761355066e-05, + "loss": 1.9933, + "step": 7475 + }, + { + "epoch": 0.58, + "grad_norm": 0.5882975045259876, + "learning_rate": 2.003278944417956e-05, + "loss": 1.8758, + "step": 7476 + }, + { + "epoch": 0.58, + "grad_norm": 0.6558002044253558, + "learning_rate": 2.0026667437127425e-05, + "loss": 1.9152, + "step": 7477 + }, + { + "epoch": 0.58, + "grad_norm": 0.6593655389980609, + "learning_rate": 2.0020545740580868e-05, + "loss": 2.0409, + "step": 7478 + }, + { + "epoch": 0.58, + "grad_norm": 0.5719161447590927, + "learning_rate": 2.001442435492209e-05, + "loss": 1.9346, + "step": 7479 + }, + { + "epoch": 0.58, + "grad_norm": 0.6163830647250391, + "learning_rate": 2.000830328053329e-05, + "loss": 1.8942, + "step": 7480 + }, + { + "epoch": 0.58, + "grad_norm": 0.6298884309303193, + "learning_rate": 2.0002182517796618e-05, + "loss": 1.9555, + "step": 7481 + }, + { + "epoch": 0.58, + "grad_norm": 0.6152251923132696, + "learning_rate": 1.9996062067094228e-05, + "loss": 2.0791, + "step": 7482 + }, + { + "epoch": 0.58, + "grad_norm": 0.6171229527220804, + "learning_rate": 1.9989941928808237e-05, + "loss": 1.8887, + "step": 7483 + }, + { + "epoch": 0.58, + "grad_norm": 0.7590752427832077, + "learning_rate": 1.9983822103320747e-05, + "loss": 1.879, + "step": 7484 + }, + { + "epoch": 0.58, + "grad_norm": 0.6410255358750863, + "learning_rate": 1.9977702591013857e-05, + "loss": 2.147, + "step": 7485 + }, + { + "epoch": 0.58, + "grad_norm": 0.6097421190168215, + "learning_rate": 1.997158339226963e-05, + "loss": 1.7997, + "step": 7486 + }, + { + "epoch": 0.58, + "grad_norm": 0.6546721909584039, + "learning_rate": 1.9965464507470095e-05, + "loss": 1.9504, + "step": 7487 + }, + { + "epoch": 0.58, + "grad_norm": 0.664879367497345, + "learning_rate": 1.9959345936997306e-05, + "loss": 1.9098, + "step": 7488 + }, + { + "epoch": 0.58, + "grad_norm": 0.6574980666547309, + "learning_rate": 1.9953227681233256e-05, + "loss": 1.9149, + "step": 7489 + }, + { + "epoch": 0.58, + "grad_norm": 0.6670381344607366, + "learning_rate": 1.9947109740559936e-05, + "loss": 2.1465, + "step": 7490 + }, + { + "epoch": 0.58, + "grad_norm": 0.6490529542813748, + "learning_rate": 1.9940992115359303e-05, + "loss": 1.8828, + "step": 7491 + }, + { + "epoch": 0.58, + "grad_norm": 0.7091719813146908, + "learning_rate": 1.993487480601332e-05, + "loss": 1.8648, + "step": 7492 + }, + { + "epoch": 0.58, + "grad_norm": 0.663230339097832, + "learning_rate": 1.992875781290392e-05, + "loss": 1.9842, + "step": 7493 + }, + { + "epoch": 0.58, + "grad_norm": 0.6451242447074287, + "learning_rate": 1.992264113641299e-05, + "loss": 2.0838, + "step": 7494 + }, + { + "epoch": 0.58, + "grad_norm": 0.6761461984001201, + "learning_rate": 1.991652477692244e-05, + "loss": 1.8725, + "step": 7495 + }, + { + "epoch": 0.58, + "grad_norm": 0.6764888629671318, + "learning_rate": 1.991040873481413e-05, + "loss": 1.8958, + "step": 7496 + }, + { + "epoch": 0.58, + "grad_norm": 0.6348643339141381, + "learning_rate": 1.9904293010469906e-05, + "loss": 1.8808, + "step": 7497 + }, + { + "epoch": 0.58, + "grad_norm": 0.6372306706892445, + "learning_rate": 1.989817760427161e-05, + "loss": 2.0818, + "step": 7498 + }, + { + "epoch": 0.58, + "grad_norm": 0.5985030957992815, + "learning_rate": 1.9892062516601036e-05, + "loss": 1.8452, + "step": 7499 + }, + { + "epoch": 0.58, + "grad_norm": 0.6256361061427789, + "learning_rate": 1.988594774783999e-05, + "loss": 1.9576, + "step": 7500 + }, + { + "epoch": 0.58, + "grad_norm": 0.6360511425134711, + "learning_rate": 1.9879833298370238e-05, + "loss": 1.8892, + "step": 7501 + }, + { + "epoch": 0.58, + "grad_norm": 0.6999622105388574, + "learning_rate": 1.9873719168573517e-05, + "loss": 2.0999, + "step": 7502 + }, + { + "epoch": 0.58, + "grad_norm": 0.5395675035008263, + "learning_rate": 1.9867605358831583e-05, + "loss": 1.8828, + "step": 7503 + }, + { + "epoch": 0.58, + "grad_norm": 0.6807422017356013, + "learning_rate": 1.986149186952612e-05, + "loss": 1.9035, + "step": 7504 + }, + { + "epoch": 0.58, + "grad_norm": 0.6111838818270434, + "learning_rate": 1.9855378701038822e-05, + "loss": 2.1125, + "step": 7505 + }, + { + "epoch": 0.58, + "grad_norm": 0.5613667252474608, + "learning_rate": 1.9849265853751383e-05, + "loss": 1.9703, + "step": 7506 + }, + { + "epoch": 0.58, + "grad_norm": 0.6696172171386587, + "learning_rate": 1.9843153328045427e-05, + "loss": 1.9168, + "step": 7507 + }, + { + "epoch": 0.58, + "grad_norm": 0.599626743049355, + "learning_rate": 1.9837041124302592e-05, + "loss": 1.8893, + "step": 7508 + }, + { + "epoch": 0.58, + "grad_norm": 0.6155077201274783, + "learning_rate": 1.98309292429045e-05, + "loss": 1.874, + "step": 7509 + }, + { + "epoch": 0.58, + "grad_norm": 0.6281653756446095, + "learning_rate": 1.982481768423272e-05, + "loss": 2.1106, + "step": 7510 + }, + { + "epoch": 0.58, + "grad_norm": 0.6422081517210234, + "learning_rate": 1.9818706448668843e-05, + "loss": 1.8745, + "step": 7511 + }, + { + "epoch": 0.58, + "grad_norm": 0.5335942756138671, + "learning_rate": 1.9812595536594393e-05, + "loss": 1.9267, + "step": 7512 + }, + { + "epoch": 0.58, + "grad_norm": 0.5873523027594504, + "learning_rate": 1.9806484948390916e-05, + "loss": 1.8589, + "step": 7513 + }, + { + "epoch": 0.58, + "grad_norm": 0.6550029628677968, + "learning_rate": 1.980037468443993e-05, + "loss": 2.0874, + "step": 7514 + }, + { + "epoch": 0.58, + "grad_norm": 0.5955135580517517, + "learning_rate": 1.979426474512291e-05, + "loss": 1.8733, + "step": 7515 + }, + { + "epoch": 0.58, + "grad_norm": 0.6351558349762674, + "learning_rate": 1.978815513082133e-05, + "loss": 1.8892, + "step": 7516 + }, + { + "epoch": 0.58, + "grad_norm": 0.5896240603083429, + "learning_rate": 1.9782045841916625e-05, + "loss": 2.1121, + "step": 7517 + }, + { + "epoch": 0.58, + "grad_norm": 0.6049194885746961, + "learning_rate": 1.977593687879024e-05, + "loss": 1.993, + "step": 7518 + }, + { + "epoch": 0.58, + "grad_norm": 0.6476357499497972, + "learning_rate": 1.976982824182358e-05, + "loss": 1.8646, + "step": 7519 + }, + { + "epoch": 0.58, + "grad_norm": 0.6076586817350905, + "learning_rate": 1.9763719931398022e-05, + "loss": 1.9183, + "step": 7520 + }, + { + "epoch": 0.58, + "grad_norm": 0.5797685028477424, + "learning_rate": 1.9757611947894943e-05, + "loss": 1.925, + "step": 7521 + }, + { + "epoch": 0.58, + "grad_norm": 0.6096386985363572, + "learning_rate": 1.9751504291695694e-05, + "loss": 2.1126, + "step": 7522 + }, + { + "epoch": 0.58, + "grad_norm": 0.5943040080437846, + "learning_rate": 1.974539696318158e-05, + "loss": 1.88, + "step": 7523 + }, + { + "epoch": 0.58, + "grad_norm": 0.5497812217171836, + "learning_rate": 1.973928996273393e-05, + "loss": 1.9296, + "step": 7524 + }, + { + "epoch": 0.58, + "grad_norm": 0.5525565185515982, + "learning_rate": 1.9733183290734016e-05, + "loss": 1.8942, + "step": 7525 + }, + { + "epoch": 0.58, + "grad_norm": 0.577473909009619, + "learning_rate": 1.97270769475631e-05, + "loss": 2.0955, + "step": 7526 + }, + { + "epoch": 0.58, + "grad_norm": 0.5569508128697931, + "learning_rate": 1.9720970933602443e-05, + "loss": 1.8713, + "step": 7527 + }, + { + "epoch": 0.58, + "grad_norm": 0.5597587193983119, + "learning_rate": 1.971486524923325e-05, + "loss": 1.7974, + "step": 7528 + }, + { + "epoch": 0.58, + "grad_norm": 0.5770106661095573, + "learning_rate": 1.970875989483674e-05, + "loss": 1.859, + "step": 7529 + }, + { + "epoch": 0.58, + "grad_norm": 0.6099842965258105, + "learning_rate": 1.9702654870794072e-05, + "loss": 2.101, + "step": 7530 + }, + { + "epoch": 0.58, + "grad_norm": 0.5573426705048805, + "learning_rate": 1.9696550177486427e-05, + "loss": 1.9299, + "step": 7531 + }, + { + "epoch": 0.58, + "grad_norm": 0.5369077858623763, + "learning_rate": 1.969044581529495e-05, + "loss": 1.9068, + "step": 7532 + }, + { + "epoch": 0.58, + "grad_norm": 0.6089210354151908, + "learning_rate": 1.9684341784600747e-05, + "loss": 1.888, + "step": 7533 + }, + { + "epoch": 0.58, + "grad_norm": 0.7114880549646531, + "learning_rate": 1.967823808578492e-05, + "loss": 2.1132, + "step": 7534 + }, + { + "epoch": 0.58, + "grad_norm": 0.5641173395370542, + "learning_rate": 1.967213471922856e-05, + "loss": 1.8494, + "step": 7535 + }, + { + "epoch": 0.58, + "grad_norm": 0.6504060120101758, + "learning_rate": 1.9666031685312713e-05, + "loss": 1.8698, + "step": 7536 + }, + { + "epoch": 0.58, + "grad_norm": 0.7751881732319554, + "learning_rate": 1.9659928984418423e-05, + "loss": 2.0236, + "step": 7537 + }, + { + "epoch": 0.58, + "grad_norm": 0.65248424949771, + "learning_rate": 1.96538266169267e-05, + "loss": 2.1419, + "step": 7538 + }, + { + "epoch": 0.58, + "grad_norm": 0.7119397484158105, + "learning_rate": 1.9647724583218543e-05, + "loss": 1.9427, + "step": 7539 + }, + { + "epoch": 0.58, + "grad_norm": 0.6102276425834192, + "learning_rate": 1.9641622883674937e-05, + "loss": 1.9154, + "step": 7540 + }, + { + "epoch": 0.58, + "grad_norm": 0.6075383596120245, + "learning_rate": 1.9635521518676813e-05, + "loss": 1.8703, + "step": 7541 + }, + { + "epoch": 0.58, + "grad_norm": 0.6724357469914515, + "learning_rate": 1.9629420488605128e-05, + "loss": 2.0996, + "step": 7542 + }, + { + "epoch": 0.58, + "grad_norm": 0.5967935028428883, + "learning_rate": 1.9623319793840787e-05, + "loss": 1.9402, + "step": 7543 + }, + { + "epoch": 0.58, + "grad_norm": 0.6798387192989632, + "learning_rate": 1.961721943476467e-05, + "loss": 1.8666, + "step": 7544 + }, + { + "epoch": 0.58, + "grad_norm": 0.5955447451059178, + "learning_rate": 1.9611119411757665e-05, + "loss": 1.8419, + "step": 7545 + }, + { + "epoch": 0.58, + "grad_norm": 0.6602241962033706, + "learning_rate": 1.9605019725200605e-05, + "loss": 2.0853, + "step": 7546 + }, + { + "epoch": 0.58, + "grad_norm": 0.6837053047327782, + "learning_rate": 1.959892037547433e-05, + "loss": 1.9084, + "step": 7547 + }, + { + "epoch": 0.58, + "grad_norm": 0.6317991216773129, + "learning_rate": 1.9592821362959653e-05, + "loss": 1.9026, + "step": 7548 + }, + { + "epoch": 0.58, + "grad_norm": 0.6683632362655592, + "learning_rate": 1.9586722688037338e-05, + "loss": 1.9967, + "step": 7549 + }, + { + "epoch": 0.58, + "grad_norm": 0.6633348025790817, + "learning_rate": 1.9580624351088176e-05, + "loss": 2.0673, + "step": 7550 + }, + { + "epoch": 0.58, + "grad_norm": 0.602838920944642, + "learning_rate": 1.957452635249289e-05, + "loss": 1.8658, + "step": 7551 + }, + { + "epoch": 0.58, + "grad_norm": 0.664692394167682, + "learning_rate": 1.956842869263221e-05, + "loss": 1.8804, + "step": 7552 + }, + { + "epoch": 0.58, + "grad_norm": 0.7348265005897375, + "learning_rate": 1.9562331371886853e-05, + "loss": 1.8952, + "step": 7553 + }, + { + "epoch": 0.58, + "grad_norm": 0.6979152235019084, + "learning_rate": 1.9556234390637478e-05, + "loss": 2.1225, + "step": 7554 + }, + { + "epoch": 0.58, + "grad_norm": 0.6594635941213265, + "learning_rate": 1.955013774926475e-05, + "loss": 1.9475, + "step": 7555 + }, + { + "epoch": 0.58, + "grad_norm": 0.7153893886509993, + "learning_rate": 1.9544041448149323e-05, + "loss": 1.8636, + "step": 7556 + }, + { + "epoch": 0.58, + "grad_norm": 0.6232601181222743, + "learning_rate": 1.9537945487671794e-05, + "loss": 1.8631, + "step": 7557 + }, + { + "epoch": 0.58, + "grad_norm": 0.6831352141143314, + "learning_rate": 1.9531849868212775e-05, + "loss": 2.0824, + "step": 7558 + }, + { + "epoch": 0.58, + "grad_norm": 0.5646218633526511, + "learning_rate": 1.9525754590152818e-05, + "loss": 1.8719, + "step": 7559 + }, + { + "epoch": 0.58, + "grad_norm": 0.636855344582006, + "learning_rate": 1.9519659653872498e-05, + "loss": 1.8465, + "step": 7560 + }, + { + "epoch": 0.58, + "grad_norm": 0.669076372448756, + "learning_rate": 1.9513565059752343e-05, + "loss": 1.8569, + "step": 7561 + }, + { + "epoch": 0.58, + "grad_norm": 0.5967328342704521, + "learning_rate": 1.9507470808172855e-05, + "loss": 2.1491, + "step": 7562 + }, + { + "epoch": 0.58, + "grad_norm": 0.7086050301968272, + "learning_rate": 1.9501376899514533e-05, + "loss": 1.8971, + "step": 7563 + }, + { + "epoch": 0.58, + "grad_norm": 0.6896803251796363, + "learning_rate": 1.949528333415783e-05, + "loss": 1.9368, + "step": 7564 + }, + { + "epoch": 0.58, + "grad_norm": 0.6199854808112897, + "learning_rate": 1.948919011248321e-05, + "loss": 1.841, + "step": 7565 + }, + { + "epoch": 0.58, + "grad_norm": 0.687056408491387, + "learning_rate": 1.9483097234871094e-05, + "loss": 2.071, + "step": 7566 + }, + { + "epoch": 0.58, + "grad_norm": 0.7042535060235954, + "learning_rate": 1.9477004701701874e-05, + "loss": 1.9079, + "step": 7567 + }, + { + "epoch": 0.58, + "grad_norm": 0.5570597359259216, + "learning_rate": 1.9470912513355942e-05, + "loss": 1.9658, + "step": 7568 + }, + { + "epoch": 0.58, + "grad_norm": 0.6629395922030592, + "learning_rate": 1.9464820670213663e-05, + "loss": 1.8729, + "step": 7569 + }, + { + "epoch": 0.58, + "grad_norm": 0.7239165032245175, + "learning_rate": 1.9458729172655357e-05, + "loss": 2.0527, + "step": 7570 + }, + { + "epoch": 0.58, + "grad_norm": 0.6368581978630224, + "learning_rate": 1.9452638021061364e-05, + "loss": 1.8782, + "step": 7571 + }, + { + "epoch": 0.58, + "grad_norm": 0.5908619781751578, + "learning_rate": 1.944654721581196e-05, + "loss": 1.8752, + "step": 7572 + }, + { + "epoch": 0.58, + "grad_norm": 0.7667053191603881, + "learning_rate": 1.944045675728743e-05, + "loss": 1.8701, + "step": 7573 + }, + { + "epoch": 0.58, + "grad_norm": 0.6085696768060196, + "learning_rate": 1.9434366645868028e-05, + "loss": 2.1137, + "step": 7574 + }, + { + "epoch": 0.58, + "grad_norm": 0.753390430733746, + "learning_rate": 1.9428276881933977e-05, + "loss": 1.8631, + "step": 7575 + }, + { + "epoch": 0.58, + "grad_norm": 0.63005162168378, + "learning_rate": 1.9422187465865495e-05, + "loss": 1.9281, + "step": 7576 + }, + { + "epoch": 0.58, + "grad_norm": 0.578794465516103, + "learning_rate": 1.941609839804275e-05, + "loss": 1.8995, + "step": 7577 + }, + { + "epoch": 0.58, + "grad_norm": 0.6501152993703927, + "learning_rate": 1.9410009678845923e-05, + "loss": 2.1268, + "step": 7578 + }, + { + "epoch": 0.58, + "grad_norm": 0.6252823273451281, + "learning_rate": 1.9403921308655174e-05, + "loss": 1.9138, + "step": 7579 + }, + { + "epoch": 0.58, + "grad_norm": 0.5666028147532545, + "learning_rate": 1.939783328785059e-05, + "loss": 1.9616, + "step": 7580 + }, + { + "epoch": 0.58, + "grad_norm": 0.5504643615582026, + "learning_rate": 1.9391745616812285e-05, + "loss": 1.8821, + "step": 7581 + }, + { + "epoch": 0.58, + "grad_norm": 0.6995082849356324, + "learning_rate": 1.9385658295920357e-05, + "loss": 2.0964, + "step": 7582 + }, + { + "epoch": 0.59, + "grad_norm": 0.6160606732619602, + "learning_rate": 1.9379571325554834e-05, + "loss": 1.8769, + "step": 7583 + }, + { + "epoch": 0.59, + "grad_norm": 0.5878447444420979, + "learning_rate": 1.9373484706095775e-05, + "loss": 1.8947, + "step": 7584 + }, + { + "epoch": 0.59, + "grad_norm": 0.7063162727936425, + "learning_rate": 1.936739843792316e-05, + "loss": 1.8695, + "step": 7585 + }, + { + "epoch": 0.59, + "grad_norm": 0.5468114631380889, + "learning_rate": 1.9361312521417013e-05, + "loss": 2.1268, + "step": 7586 + }, + { + "epoch": 0.59, + "grad_norm": 0.6016263376687625, + "learning_rate": 1.9355226956957293e-05, + "loss": 1.8518, + "step": 7587 + }, + { + "epoch": 0.59, + "grad_norm": 0.5718826876833379, + "learning_rate": 1.934914174492393e-05, + "loss": 1.8744, + "step": 7588 + }, + { + "epoch": 0.59, + "grad_norm": 0.6938886081879279, + "learning_rate": 1.934305688569687e-05, + "loss": 1.8735, + "step": 7589 + }, + { + "epoch": 0.59, + "grad_norm": 0.611434917581038, + "learning_rate": 1.9336972379656015e-05, + "loss": 2.0847, + "step": 7590 + }, + { + "epoch": 0.59, + "grad_norm": 0.6697880557438126, + "learning_rate": 1.9330888227181225e-05, + "loss": 1.8967, + "step": 7591 + }, + { + "epoch": 0.59, + "grad_norm": 0.644376929742968, + "learning_rate": 1.9324804428652386e-05, + "loss": 1.874, + "step": 7592 + }, + { + "epoch": 0.59, + "grad_norm": 0.6863416530407801, + "learning_rate": 1.931872098444931e-05, + "loss": 1.9604, + "step": 7593 + }, + { + "epoch": 0.59, + "grad_norm": 0.6221475829163491, + "learning_rate": 1.9312637894951824e-05, + "loss": 2.0881, + "step": 7594 + }, + { + "epoch": 0.59, + "grad_norm": 0.658100105169115, + "learning_rate": 1.9306555160539726e-05, + "loss": 1.8559, + "step": 7595 + }, + { + "epoch": 0.59, + "grad_norm": 0.6787245946516081, + "learning_rate": 1.9300472781592767e-05, + "loss": 1.8732, + "step": 7596 + }, + { + "epoch": 0.59, + "grad_norm": 0.5583594449455466, + "learning_rate": 1.9294390758490715e-05, + "loss": 1.8852, + "step": 7597 + }, + { + "epoch": 0.59, + "grad_norm": 0.633676691118613, + "learning_rate": 1.928830909161329e-05, + "loss": 2.1102, + "step": 7598 + }, + { + "epoch": 0.59, + "grad_norm": 0.6366774170746637, + "learning_rate": 1.928222778134018e-05, + "loss": 1.9181, + "step": 7599 + }, + { + "epoch": 0.59, + "grad_norm": 0.5583888740415418, + "learning_rate": 1.927614682805109e-05, + "loss": 1.8672, + "step": 7600 + }, + { + "epoch": 0.59, + "grad_norm": 0.6867549589236431, + "learning_rate": 1.9270066232125663e-05, + "loss": 1.9151, + "step": 7601 + }, + { + "epoch": 0.59, + "grad_norm": 0.6007781558950073, + "learning_rate": 1.9263985993943537e-05, + "loss": 2.1635, + "step": 7602 + }, + { + "epoch": 0.59, + "grad_norm": 0.5531207100722589, + "learning_rate": 1.925790611388434e-05, + "loss": 1.879, + "step": 7603 + }, + { + "epoch": 0.59, + "grad_norm": 0.611486952598035, + "learning_rate": 1.925182659232764e-05, + "loss": 1.9385, + "step": 7604 + }, + { + "epoch": 0.59, + "grad_norm": 0.6065691808500147, + "learning_rate": 1.9245747429653037e-05, + "loss": 1.9616, + "step": 7605 + }, + { + "epoch": 0.59, + "grad_norm": 0.6324305611315473, + "learning_rate": 1.9239668626240044e-05, + "loss": 2.1589, + "step": 7606 + }, + { + "epoch": 0.59, + "grad_norm": 0.6595277076650461, + "learning_rate": 1.923359018246821e-05, + "loss": 1.8229, + "step": 7607 + }, + { + "epoch": 0.59, + "grad_norm": 0.6920637110195723, + "learning_rate": 1.9227512098717036e-05, + "loss": 1.8776, + "step": 7608 + }, + { + "epoch": 0.59, + "grad_norm": 0.6464730782806439, + "learning_rate": 1.9221434375365983e-05, + "loss": 1.9045, + "step": 7609 + }, + { + "epoch": 0.59, + "grad_norm": 0.651696193506599, + "learning_rate": 1.9215357012794532e-05, + "loss": 2.1166, + "step": 7610 + }, + { + "epoch": 0.59, + "grad_norm": 0.5421532025738192, + "learning_rate": 1.92092800113821e-05, + "loss": 2.0113, + "step": 7611 + }, + { + "epoch": 0.59, + "grad_norm": 0.5955992891345671, + "learning_rate": 1.9203203371508106e-05, + "loss": 1.8837, + "step": 7612 + }, + { + "epoch": 0.59, + "grad_norm": 0.6563145396795834, + "learning_rate": 1.9197127093551946e-05, + "loss": 1.863, + "step": 7613 + }, + { + "epoch": 0.59, + "grad_norm": 0.7051864180143707, + "learning_rate": 1.9191051177892968e-05, + "loss": 2.0835, + "step": 7614 + }, + { + "epoch": 0.59, + "grad_norm": 0.5824176436838076, + "learning_rate": 1.9184975624910532e-05, + "loss": 1.9472, + "step": 7615 + }, + { + "epoch": 0.59, + "grad_norm": 0.6618173573657026, + "learning_rate": 1.917890043498397e-05, + "loss": 1.8721, + "step": 7616 + }, + { + "epoch": 0.59, + "grad_norm": 0.7768219192627006, + "learning_rate": 1.917282560849255e-05, + "loss": 1.9269, + "step": 7617 + }, + { + "epoch": 0.59, + "grad_norm": 0.6594454665436558, + "learning_rate": 1.916675114581558e-05, + "loss": 2.1636, + "step": 7618 + }, + { + "epoch": 0.59, + "grad_norm": 0.7607844642340414, + "learning_rate": 1.9160677047332294e-05, + "loss": 1.8994, + "step": 7619 + }, + { + "epoch": 0.59, + "grad_norm": 0.5839653610549591, + "learning_rate": 1.915460331342192e-05, + "loss": 1.9218, + "step": 7620 + }, + { + "epoch": 0.59, + "grad_norm": 0.6375672953904478, + "learning_rate": 1.914852994446369e-05, + "loss": 1.8676, + "step": 7621 + }, + { + "epoch": 0.59, + "grad_norm": 0.6728387487782017, + "learning_rate": 1.914245694083677e-05, + "loss": 2.0477, + "step": 7622 + }, + { + "epoch": 0.59, + "grad_norm": 0.5714255795539871, + "learning_rate": 1.9136384302920312e-05, + "loss": 1.8571, + "step": 7623 + }, + { + "epoch": 0.59, + "grad_norm": 0.706055256844909, + "learning_rate": 1.913031203109349e-05, + "loss": 2.0368, + "step": 7624 + }, + { + "epoch": 0.59, + "grad_norm": 0.603293087212047, + "learning_rate": 1.9124240125735392e-05, + "loss": 1.8949, + "step": 7625 + }, + { + "epoch": 0.59, + "grad_norm": 0.7114611120341588, + "learning_rate": 1.9118168587225125e-05, + "loss": 2.0529, + "step": 7626 + }, + { + "epoch": 0.59, + "grad_norm": 0.5596024286181308, + "learning_rate": 1.9112097415941747e-05, + "loss": 1.8863, + "step": 7627 + }, + { + "epoch": 0.59, + "grad_norm": 0.7051319325984238, + "learning_rate": 1.9106026612264316e-05, + "loss": 1.8772, + "step": 7628 + }, + { + "epoch": 0.59, + "grad_norm": 0.7242464445597571, + "learning_rate": 1.9099956176571868e-05, + "loss": 1.9038, + "step": 7629 + }, + { + "epoch": 0.59, + "grad_norm": 0.5763240749728643, + "learning_rate": 1.9093886109243386e-05, + "loss": 1.9214, + "step": 7630 + }, + { + "epoch": 0.59, + "grad_norm": 0.8007476587952197, + "learning_rate": 1.9087816410657867e-05, + "loss": 2.0672, + "step": 7631 + }, + { + "epoch": 0.59, + "grad_norm": 0.6473781147589445, + "learning_rate": 1.9081747081194243e-05, + "loss": 1.8783, + "step": 7632 + }, + { + "epoch": 0.59, + "grad_norm": 0.6372584365790553, + "learning_rate": 1.9075678121231466e-05, + "loss": 1.9152, + "step": 7633 + }, + { + "epoch": 0.59, + "grad_norm": 0.5798538309930465, + "learning_rate": 1.906960953114845e-05, + "loss": 2.0876, + "step": 7634 + }, + { + "epoch": 0.59, + "grad_norm": 0.6779708767780566, + "learning_rate": 1.9063541311324063e-05, + "loss": 1.9284, + "step": 7635 + }, + { + "epoch": 0.59, + "grad_norm": 0.5876085668122086, + "learning_rate": 1.9057473462137183e-05, + "loss": 2.0232, + "step": 7636 + }, + { + "epoch": 0.59, + "grad_norm": 0.6045640124363295, + "learning_rate": 1.9051405983966654e-05, + "loss": 1.9048, + "step": 7637 + }, + { + "epoch": 0.59, + "grad_norm": 0.6803396659912817, + "learning_rate": 1.9045338877191273e-05, + "loss": 2.0784, + "step": 7638 + }, + { + "epoch": 0.59, + "grad_norm": 0.6129913206714195, + "learning_rate": 1.903927214218986e-05, + "loss": 1.8991, + "step": 7639 + }, + { + "epoch": 0.59, + "grad_norm": 0.6158775224672377, + "learning_rate": 1.9033205779341174e-05, + "loss": 1.8946, + "step": 7640 + }, + { + "epoch": 0.59, + "grad_norm": 0.6578830794031483, + "learning_rate": 1.902713978902395e-05, + "loss": 1.8633, + "step": 7641 + }, + { + "epoch": 0.59, + "grad_norm": 0.6716137483931451, + "learning_rate": 1.9021074171616944e-05, + "loss": 1.9555, + "step": 7642 + }, + { + "epoch": 0.59, + "grad_norm": 0.6002736563887744, + "learning_rate": 1.9015008927498826e-05, + "loss": 2.0767, + "step": 7643 + }, + { + "epoch": 0.59, + "grad_norm": 0.637040197673345, + "learning_rate": 1.9008944057048303e-05, + "loss": 1.9329, + "step": 7644 + }, + { + "epoch": 0.59, + "grad_norm": 0.5944808095142273, + "learning_rate": 1.9002879560644e-05, + "loss": 1.9367, + "step": 7645 + }, + { + "epoch": 0.59, + "grad_norm": 0.6928997993598467, + "learning_rate": 1.8996815438664562e-05, + "loss": 2.1515, + "step": 7646 + }, + { + "epoch": 0.59, + "grad_norm": 0.5642027716309448, + "learning_rate": 1.8990751691488605e-05, + "loss": 1.8794, + "step": 7647 + }, + { + "epoch": 0.59, + "grad_norm": 0.5426371725645626, + "learning_rate": 1.8984688319494707e-05, + "loss": 1.8746, + "step": 7648 + }, + { + "epoch": 0.59, + "grad_norm": 0.5755880918336511, + "learning_rate": 1.8978625323061422e-05, + "loss": 2.025, + "step": 7649 + }, + { + "epoch": 0.59, + "grad_norm": 0.5959507840409963, + "learning_rate": 1.8972562702567305e-05, + "loss": 1.8835, + "step": 7650 + }, + { + "epoch": 0.59, + "grad_norm": 0.5611507187399206, + "learning_rate": 1.896650045839085e-05, + "loss": 2.1309, + "step": 7651 + }, + { + "epoch": 0.59, + "grad_norm": 0.5395980553674166, + "learning_rate": 1.8960438590910567e-05, + "loss": 1.8881, + "step": 7652 + }, + { + "epoch": 0.59, + "grad_norm": 0.5809652481476495, + "learning_rate": 1.8954377100504904e-05, + "loss": 1.8932, + "step": 7653 + }, + { + "epoch": 0.59, + "grad_norm": 0.595966721291081, + "learning_rate": 1.894831598755232e-05, + "loss": 2.1252, + "step": 7654 + }, + { + "epoch": 0.59, + "grad_norm": 0.5735137991161089, + "learning_rate": 1.894225525243123e-05, + "loss": 1.9248, + "step": 7655 + }, + { + "epoch": 0.59, + "grad_norm": 0.5451566076934459, + "learning_rate": 1.8936194895520027e-05, + "loss": 1.8523, + "step": 7656 + }, + { + "epoch": 0.59, + "grad_norm": 0.5920669946347059, + "learning_rate": 1.893013491719709e-05, + "loss": 1.8835, + "step": 7657 + }, + { + "epoch": 0.59, + "grad_norm": 0.5737393704937246, + "learning_rate": 1.892407531784077e-05, + "loss": 2.1175, + "step": 7658 + }, + { + "epoch": 0.59, + "grad_norm": 0.650125666799056, + "learning_rate": 1.8918016097829376e-05, + "loss": 1.864, + "step": 7659 + }, + { + "epoch": 0.59, + "grad_norm": 0.5931904915324301, + "learning_rate": 1.8911957257541235e-05, + "loss": 1.905, + "step": 7660 + }, + { + "epoch": 0.59, + "grad_norm": 0.5930247996829384, + "learning_rate": 1.8905898797354603e-05, + "loss": 1.9136, + "step": 7661 + }, + { + "epoch": 0.59, + "grad_norm": 0.5787902606841083, + "learning_rate": 1.8899840717647747e-05, + "loss": 1.8665, + "step": 7662 + }, + { + "epoch": 0.59, + "grad_norm": 0.5587285463762074, + "learning_rate": 1.8893783018798906e-05, + "loss": 2.0741, + "step": 7663 + }, + { + "epoch": 0.59, + "grad_norm": 0.6440621057514645, + "learning_rate": 1.888772570118626e-05, + "loss": 1.913, + "step": 7664 + }, + { + "epoch": 0.59, + "grad_norm": 0.6042225792984329, + "learning_rate": 1.8881668765188024e-05, + "loss": 1.8945, + "step": 7665 + }, + { + "epoch": 0.59, + "grad_norm": 0.6284955078525634, + "learning_rate": 1.887561221118233e-05, + "loss": 2.1106, + "step": 7666 + }, + { + "epoch": 0.59, + "grad_norm": 0.5557327841831808, + "learning_rate": 1.8869556039547327e-05, + "loss": 2.0176, + "step": 7667 + }, + { + "epoch": 0.59, + "grad_norm": 0.5555472631674698, + "learning_rate": 1.8863500250661132e-05, + "loss": 1.9302, + "step": 7668 + }, + { + "epoch": 0.59, + "grad_norm": 0.6076933169783432, + "learning_rate": 1.8857444844901816e-05, + "loss": 1.9055, + "step": 7669 + }, + { + "epoch": 0.59, + "grad_norm": 0.6086386264331047, + "learning_rate": 1.8851389822647452e-05, + "loss": 2.0957, + "step": 7670 + }, + { + "epoch": 0.59, + "grad_norm": 0.589318693426939, + "learning_rate": 1.8845335184276093e-05, + "loss": 1.8835, + "step": 7671 + }, + { + "epoch": 0.59, + "grad_norm": 0.6233262412149501, + "learning_rate": 1.8839280930165734e-05, + "loss": 1.8331, + "step": 7672 + }, + { + "epoch": 0.59, + "grad_norm": 0.578148119549877, + "learning_rate": 1.883322706069438e-05, + "loss": 2.0009, + "step": 7673 + }, + { + "epoch": 0.59, + "grad_norm": 0.6096090856143407, + "learning_rate": 1.8827173576239988e-05, + "loss": 1.9099, + "step": 7674 + }, + { + "epoch": 0.59, + "grad_norm": 0.6330269737034272, + "learning_rate": 1.8821120477180503e-05, + "loss": 2.0796, + "step": 7675 + }, + { + "epoch": 0.59, + "grad_norm": 0.6019862947484418, + "learning_rate": 1.881506776389387e-05, + "loss": 1.947, + "step": 7676 + }, + { + "epoch": 0.59, + "grad_norm": 0.6449003333759745, + "learning_rate": 1.880901543675795e-05, + "loss": 1.9347, + "step": 7677 + }, + { + "epoch": 0.59, + "grad_norm": 0.7011496248020106, + "learning_rate": 1.8802963496150635e-05, + "loss": 2.0848, + "step": 7678 + }, + { + "epoch": 0.59, + "grad_norm": 0.6035376871469559, + "learning_rate": 1.8796911942449758e-05, + "loss": 1.9058, + "step": 7679 + }, + { + "epoch": 0.59, + "grad_norm": 0.6177581435147302, + "learning_rate": 1.879086077603316e-05, + "loss": 1.979, + "step": 7680 + }, + { + "epoch": 0.59, + "grad_norm": 0.631416139822634, + "learning_rate": 1.8784809997278633e-05, + "loss": 1.906, + "step": 7681 + }, + { + "epoch": 0.59, + "grad_norm": 0.5876923499469475, + "learning_rate": 1.877875960656394e-05, + "loss": 1.8442, + "step": 7682 + }, + { + "epoch": 0.59, + "grad_norm": 0.5814283022024014, + "learning_rate": 1.877270960426685e-05, + "loss": 2.0722, + "step": 7683 + }, + { + "epoch": 0.59, + "grad_norm": 0.6296999651721421, + "learning_rate": 1.8766659990765083e-05, + "loss": 1.8841, + "step": 7684 + }, + { + "epoch": 0.59, + "grad_norm": 0.6039020759272172, + "learning_rate": 1.8760610766436335e-05, + "loss": 1.8647, + "step": 7685 + }, + { + "epoch": 0.59, + "grad_norm": 0.591855462939525, + "learning_rate": 1.8754561931658296e-05, + "loss": 1.98, + "step": 7686 + }, + { + "epoch": 0.59, + "grad_norm": 0.5790357429639996, + "learning_rate": 1.874851348680861e-05, + "loss": 2.1164, + "step": 7687 + }, + { + "epoch": 0.59, + "grad_norm": 0.6119618387567786, + "learning_rate": 1.87424654322649e-05, + "loss": 1.8756, + "step": 7688 + }, + { + "epoch": 0.59, + "grad_norm": 0.6679472949345356, + "learning_rate": 1.8736417768404792e-05, + "loss": 1.9204, + "step": 7689 + }, + { + "epoch": 0.59, + "grad_norm": 0.5625197921548905, + "learning_rate": 1.873037049560585e-05, + "loss": 2.0564, + "step": 7690 + }, + { + "epoch": 0.59, + "grad_norm": 0.7021173541041683, + "learning_rate": 1.8724323614245626e-05, + "loss": 1.8542, + "step": 7691 + }, + { + "epoch": 0.59, + "grad_norm": 0.6189479197917626, + "learning_rate": 1.871827712470167e-05, + "loss": 1.9473, + "step": 7692 + }, + { + "epoch": 0.59, + "grad_norm": 0.6044790573714799, + "learning_rate": 1.8712231027351475e-05, + "loss": 1.8695, + "step": 7693 + }, + { + "epoch": 0.59, + "grad_norm": 0.6365205537043823, + "learning_rate": 1.870618532257253e-05, + "loss": 1.956, + "step": 7694 + }, + { + "epoch": 0.59, + "grad_norm": 0.7221952991466091, + "learning_rate": 1.870014001074229e-05, + "loss": 2.0779, + "step": 7695 + }, + { + "epoch": 0.59, + "grad_norm": 0.6370865901904805, + "learning_rate": 1.8694095092238186e-05, + "loss": 1.8588, + "step": 7696 + }, + { + "epoch": 0.59, + "grad_norm": 0.7765337784512784, + "learning_rate": 1.868805056743764e-05, + "loss": 1.8748, + "step": 7697 + }, + { + "epoch": 0.59, + "grad_norm": 0.5785412881356533, + "learning_rate": 1.868200643671802e-05, + "loss": 1.9235, + "step": 7698 + }, + { + "epoch": 0.59, + "grad_norm": 0.7833795339311975, + "learning_rate": 1.8675962700456696e-05, + "loss": 2.0906, + "step": 7699 + }, + { + "epoch": 0.59, + "grad_norm": 0.7302896496016024, + "learning_rate": 1.8669919359030996e-05, + "loss": 1.924, + "step": 7700 + }, + { + "epoch": 0.59, + "grad_norm": 0.6321616853204284, + "learning_rate": 1.8663876412818238e-05, + "loss": 1.9304, + "step": 7701 + }, + { + "epoch": 0.59, + "grad_norm": 0.6107288873436914, + "learning_rate": 1.8657833862195708e-05, + "loss": 2.1407, + "step": 7702 + }, + { + "epoch": 0.59, + "grad_norm": 0.6918899882460737, + "learning_rate": 1.8651791707540656e-05, + "loss": 1.9031, + "step": 7703 + }, + { + "epoch": 0.59, + "grad_norm": 0.5983769026275159, + "learning_rate": 1.864574994923033e-05, + "loss": 1.9669, + "step": 7704 + }, + { + "epoch": 0.59, + "grad_norm": 0.6237951129478277, + "learning_rate": 1.8639708587641947e-05, + "loss": 1.9017, + "step": 7705 + }, + { + "epoch": 0.59, + "grad_norm": 0.6324504575700634, + "learning_rate": 1.8633667623152673e-05, + "loss": 1.8928, + "step": 7706 + }, + { + "epoch": 0.59, + "grad_norm": 0.6281415625933285, + "learning_rate": 1.8627627056139692e-05, + "loss": 2.1019, + "step": 7707 + }, + { + "epoch": 0.59, + "grad_norm": 0.6614321870183782, + "learning_rate": 1.862158688698013e-05, + "loss": 1.8833, + "step": 7708 + }, + { + "epoch": 0.59, + "grad_norm": 0.6280023889953259, + "learning_rate": 1.8615547116051097e-05, + "loss": 1.8863, + "step": 7709 + }, + { + "epoch": 0.59, + "grad_norm": 0.6248936056974818, + "learning_rate": 1.8609507743729694e-05, + "loss": 2.083, + "step": 7710 + }, + { + "epoch": 0.59, + "grad_norm": 0.6450804657683494, + "learning_rate": 1.8603468770392964e-05, + "loss": 1.9482, + "step": 7711 + }, + { + "epoch": 0.59, + "grad_norm": 0.6406340177667768, + "learning_rate": 1.859743019641797e-05, + "loss": 1.897, + "step": 7712 + }, + { + "epoch": 0.6, + "grad_norm": 0.6726984593177419, + "learning_rate": 1.8591392022181703e-05, + "loss": 1.8997, + "step": 7713 + }, + { + "epoch": 0.6, + "grad_norm": 0.6072362926928292, + "learning_rate": 1.8585354248061155e-05, + "loss": 1.8714, + "step": 7714 + }, + { + "epoch": 0.6, + "grad_norm": 0.5853056436290325, + "learning_rate": 1.8579316874433305e-05, + "loss": 2.0732, + "step": 7715 + }, + { + "epoch": 0.6, + "grad_norm": 0.5740760803316709, + "learning_rate": 1.8573279901675073e-05, + "loss": 1.866, + "step": 7716 + }, + { + "epoch": 0.6, + "grad_norm": 0.5924569794105321, + "learning_rate": 1.856724333016337e-05, + "loss": 1.9407, + "step": 7717 + }, + { + "epoch": 0.6, + "grad_norm": 0.5658751626210109, + "learning_rate": 1.8561207160275107e-05, + "loss": 1.9026, + "step": 7718 + }, + { + "epoch": 0.6, + "grad_norm": 0.6292890968869956, + "learning_rate": 1.8555171392387127e-05, + "loss": 2.0437, + "step": 7719 + }, + { + "epoch": 0.6, + "grad_norm": 0.5884278253519223, + "learning_rate": 1.8549136026876278e-05, + "loss": 1.8897, + "step": 7720 + }, + { + "epoch": 0.6, + "grad_norm": 0.5941294603155431, + "learning_rate": 1.8543101064119355e-05, + "loss": 1.9087, + "step": 7721 + }, + { + "epoch": 0.6, + "grad_norm": 0.6409276571564958, + "learning_rate": 1.853706650449317e-05, + "loss": 2.0555, + "step": 7722 + }, + { + "epoch": 0.6, + "grad_norm": 0.6044054904189281, + "learning_rate": 1.853103234837447e-05, + "loss": 1.9716, + "step": 7723 + }, + { + "epoch": 0.6, + "grad_norm": 0.6079388183732213, + "learning_rate": 1.8524998596139987e-05, + "loss": 1.914, + "step": 7724 + }, + { + "epoch": 0.6, + "grad_norm": 0.6633073225337965, + "learning_rate": 1.8518965248166455e-05, + "loss": 1.8744, + "step": 7725 + }, + { + "epoch": 0.6, + "grad_norm": 0.6743036103359565, + "learning_rate": 1.8512932304830537e-05, + "loss": 1.9096, + "step": 7726 + }, + { + "epoch": 0.6, + "grad_norm": 0.5814667760419157, + "learning_rate": 1.8506899766508908e-05, + "loss": 2.1345, + "step": 7727 + }, + { + "epoch": 0.6, + "grad_norm": 0.5985511854595066, + "learning_rate": 1.8500867633578206e-05, + "loss": 1.8968, + "step": 7728 + }, + { + "epoch": 0.6, + "grad_norm": 0.6194039839198576, + "learning_rate": 1.8494835906415026e-05, + "loss": 1.9508, + "step": 7729 + }, + { + "epoch": 0.6, + "grad_norm": 0.630587953349486, + "learning_rate": 1.848880458539597e-05, + "loss": 1.8826, + "step": 7730 + }, + { + "epoch": 0.6, + "grad_norm": 0.666797202881205, + "learning_rate": 1.84827736708976e-05, + "loss": 2.0822, + "step": 7731 + }, + { + "epoch": 0.6, + "grad_norm": 0.5535212223626834, + "learning_rate": 1.847674316329643e-05, + "loss": 1.8889, + "step": 7732 + }, + { + "epoch": 0.6, + "grad_norm": 0.5835434616088623, + "learning_rate": 1.8470713062968993e-05, + "loss": 1.8848, + "step": 7733 + }, + { + "epoch": 0.6, + "grad_norm": 0.6145188838056859, + "learning_rate": 1.846468337029176e-05, + "loss": 2.0532, + "step": 7734 + }, + { + "epoch": 0.6, + "grad_norm": 0.5934392537100782, + "learning_rate": 1.8458654085641185e-05, + "loss": 1.9976, + "step": 7735 + }, + { + "epoch": 0.6, + "grad_norm": 0.617151614941236, + "learning_rate": 1.8452625209393716e-05, + "loss": 1.8462, + "step": 7736 + }, + { + "epoch": 0.6, + "grad_norm": 0.6344854068682579, + "learning_rate": 1.844659674192575e-05, + "loss": 1.9037, + "step": 7737 + }, + { + "epoch": 0.6, + "grad_norm": 0.6264861498899883, + "learning_rate": 1.8440568683613668e-05, + "loss": 1.8669, + "step": 7738 + }, + { + "epoch": 0.6, + "grad_norm": 0.5723406710105847, + "learning_rate": 1.8434541034833837e-05, + "loss": 2.1105, + "step": 7739 + }, + { + "epoch": 0.6, + "grad_norm": 0.5758895671529257, + "learning_rate": 1.8428513795962577e-05, + "loss": 1.9048, + "step": 7740 + }, + { + "epoch": 0.6, + "grad_norm": 0.5647157800545446, + "learning_rate": 1.84224869673762e-05, + "loss": 1.8653, + "step": 7741 + }, + { + "epoch": 0.6, + "grad_norm": 0.5416794166380693, + "learning_rate": 1.8416460549450977e-05, + "loss": 2.0007, + "step": 7742 + }, + { + "epoch": 0.6, + "grad_norm": 0.5715399729988719, + "learning_rate": 1.8410434542563162e-05, + "loss": 2.0936, + "step": 7743 + }, + { + "epoch": 0.6, + "grad_norm": 0.6038103633140979, + "learning_rate": 1.8404408947089004e-05, + "loss": 1.8245, + "step": 7744 + }, + { + "epoch": 0.6, + "grad_norm": 0.6865303338332547, + "learning_rate": 1.8398383763404683e-05, + "loss": 1.8822, + "step": 7745 + }, + { + "epoch": 0.6, + "grad_norm": 0.5997130829296533, + "learning_rate": 1.839235899188639e-05, + "loss": 1.8675, + "step": 7746 + }, + { + "epoch": 0.6, + "grad_norm": 0.595432684154704, + "learning_rate": 1.838633463291026e-05, + "loss": 2.0342, + "step": 7747 + }, + { + "epoch": 0.6, + "grad_norm": 0.544027924728637, + "learning_rate": 1.8380310686852435e-05, + "loss": 1.9624, + "step": 7748 + }, + { + "epoch": 0.6, + "grad_norm": 0.6739226841857188, + "learning_rate": 1.837428715408901e-05, + "loss": 1.9288, + "step": 7749 + }, + { + "epoch": 0.6, + "grad_norm": 0.6483018252734095, + "learning_rate": 1.8368264034996047e-05, + "loss": 1.8997, + "step": 7750 + }, + { + "epoch": 0.6, + "grad_norm": 0.598280320278809, + "learning_rate": 1.836224132994961e-05, + "loss": 2.1073, + "step": 7751 + }, + { + "epoch": 0.6, + "grad_norm": 0.6217780171334572, + "learning_rate": 1.835621903932572e-05, + "loss": 1.8639, + "step": 7752 + }, + { + "epoch": 0.6, + "grad_norm": 0.6310027601285159, + "learning_rate": 1.8350197163500362e-05, + "loss": 1.8839, + "step": 7753 + }, + { + "epoch": 0.6, + "grad_norm": 0.7017178980785458, + "learning_rate": 1.8344175702849523e-05, + "loss": 1.9401, + "step": 7754 + }, + { + "epoch": 0.6, + "grad_norm": 0.7024779219891703, + "learning_rate": 1.8338154657749128e-05, + "loss": 2.0857, + "step": 7755 + }, + { + "epoch": 0.6, + "grad_norm": 0.6629654364110027, + "learning_rate": 1.8332134028575104e-05, + "loss": 1.8908, + "step": 7756 + }, + { + "epoch": 0.6, + "grad_norm": 0.6335287202610022, + "learning_rate": 1.8326113815703356e-05, + "loss": 1.8859, + "step": 7757 + }, + { + "epoch": 0.6, + "grad_norm": 0.7035097575842879, + "learning_rate": 1.8320094019509725e-05, + "loss": 1.9079, + "step": 7758 + }, + { + "epoch": 0.6, + "grad_norm": 0.6674914708528802, + "learning_rate": 1.8314074640370078e-05, + "loss": 2.1142, + "step": 7759 + }, + { + "epoch": 0.6, + "grad_norm": 0.7532304095738104, + "learning_rate": 1.8308055678660214e-05, + "loss": 1.9745, + "step": 7760 + }, + { + "epoch": 0.6, + "grad_norm": 0.5922029710105496, + "learning_rate": 1.830203713475592e-05, + "loss": 1.9778, + "step": 7761 + }, + { + "epoch": 0.6, + "grad_norm": 0.7977118274274759, + "learning_rate": 1.829601900903297e-05, + "loss": 1.9227, + "step": 7762 + }, + { + "epoch": 0.6, + "grad_norm": 0.6873866370585509, + "learning_rate": 1.8290001301867095e-05, + "loss": 2.0672, + "step": 7763 + }, + { + "epoch": 0.6, + "grad_norm": 0.6362474355501763, + "learning_rate": 1.8283984013633994e-05, + "loss": 1.8277, + "step": 7764 + }, + { + "epoch": 0.6, + "grad_norm": 0.7595118133481907, + "learning_rate": 1.8277967144709375e-05, + "loss": 1.8891, + "step": 7765 + }, + { + "epoch": 0.6, + "grad_norm": 0.6614081217776412, + "learning_rate": 1.8271950695468877e-05, + "loss": 1.8994, + "step": 7766 + }, + { + "epoch": 0.6, + "grad_norm": 0.8510539012794145, + "learning_rate": 1.8265934666288138e-05, + "loss": 2.1158, + "step": 7767 + }, + { + "epoch": 0.6, + "grad_norm": 0.6470941999315772, + "learning_rate": 1.8259919057542753e-05, + "loss": 1.8989, + "step": 7768 + }, + { + "epoch": 0.6, + "grad_norm": 0.7667160417145628, + "learning_rate": 1.8253903869608313e-05, + "loss": 1.9311, + "step": 7769 + }, + { + "epoch": 0.6, + "grad_norm": 0.61583410037006, + "learning_rate": 1.8247889102860376e-05, + "loss": 1.8668, + "step": 7770 + }, + { + "epoch": 0.6, + "grad_norm": 0.6115283995877181, + "learning_rate": 1.8241874757674452e-05, + "loss": 2.0583, + "step": 7771 + }, + { + "epoch": 0.6, + "grad_norm": 0.6218731402090237, + "learning_rate": 1.8235860834426054e-05, + "loss": 1.8667, + "step": 7772 + }, + { + "epoch": 0.6, + "grad_norm": 0.6225194025522878, + "learning_rate": 1.8229847333490656e-05, + "loss": 1.9771, + "step": 7773 + }, + { + "epoch": 0.6, + "grad_norm": 0.585509736846809, + "learning_rate": 1.8223834255243695e-05, + "loss": 1.8606, + "step": 7774 + }, + { + "epoch": 0.6, + "grad_norm": 0.6233976512524234, + "learning_rate": 1.8217821600060604e-05, + "loss": 2.1005, + "step": 7775 + }, + { + "epoch": 0.6, + "grad_norm": 0.6346375244118908, + "learning_rate": 1.821180936831676e-05, + "loss": 1.883, + "step": 7776 + }, + { + "epoch": 0.6, + "grad_norm": 0.5644265855740093, + "learning_rate": 1.8205797560387555e-05, + "loss": 1.8575, + "step": 7777 + }, + { + "epoch": 0.6, + "grad_norm": 0.6023407771751983, + "learning_rate": 1.819978617664832e-05, + "loss": 1.9059, + "step": 7778 + }, + { + "epoch": 0.6, + "grad_norm": 0.5831614747697381, + "learning_rate": 1.819377521747436e-05, + "loss": 2.1664, + "step": 7779 + }, + { + "epoch": 0.6, + "grad_norm": 0.6005857545495962, + "learning_rate": 1.8187764683240987e-05, + "loss": 1.8854, + "step": 7780 + }, + { + "epoch": 0.6, + "grad_norm": 0.6008182808219541, + "learning_rate": 1.8181754574323446e-05, + "loss": 1.9133, + "step": 7781 + }, + { + "epoch": 0.6, + "grad_norm": 0.6701638954489487, + "learning_rate": 1.817574489109697e-05, + "loss": 1.9047, + "step": 7782 + }, + { + "epoch": 0.6, + "grad_norm": 0.7797914057030707, + "learning_rate": 1.8169735633936784e-05, + "loss": 2.0497, + "step": 7783 + }, + { + "epoch": 0.6, + "grad_norm": 0.5536464968490604, + "learning_rate": 1.8163726803218062e-05, + "loss": 1.9119, + "step": 7784 + }, + { + "epoch": 0.6, + "grad_norm": 0.6704797841778731, + "learning_rate": 1.815771839931595e-05, + "loss": 1.9655, + "step": 7785 + }, + { + "epoch": 0.6, + "grad_norm": 0.6139415734454531, + "learning_rate": 1.8151710422605596e-05, + "loss": 1.8866, + "step": 7786 + }, + { + "epoch": 0.6, + "grad_norm": 0.7212104729721152, + "learning_rate": 1.8145702873462093e-05, + "loss": 2.0949, + "step": 7787 + }, + { + "epoch": 0.6, + "grad_norm": 0.6511792471052809, + "learning_rate": 1.8139695752260523e-05, + "loss": 1.8368, + "step": 7788 + }, + { + "epoch": 0.6, + "grad_norm": 0.590368059501848, + "learning_rate": 1.8133689059375914e-05, + "loss": 1.8924, + "step": 7789 + }, + { + "epoch": 0.6, + "grad_norm": 0.7139687289831533, + "learning_rate": 1.812768279518331e-05, + "loss": 1.9371, + "step": 7790 + }, + { + "epoch": 0.6, + "grad_norm": 0.7018265757162682, + "learning_rate": 1.8121676960057707e-05, + "loss": 2.1112, + "step": 7791 + }, + { + "epoch": 0.6, + "grad_norm": 0.7482878230500701, + "learning_rate": 1.8115671554374066e-05, + "loss": 1.9234, + "step": 7792 + }, + { + "epoch": 0.6, + "grad_norm": 0.6287800205060552, + "learning_rate": 1.8109666578507336e-05, + "loss": 1.9281, + "step": 7793 + }, + { + "epoch": 0.6, + "grad_norm": 0.7918995443383934, + "learning_rate": 1.810366203283242e-05, + "loss": 1.8884, + "step": 7794 + }, + { + "epoch": 0.6, + "grad_norm": 0.6771759262240806, + "learning_rate": 1.8097657917724213e-05, + "loss": 2.0591, + "step": 7795 + }, + { + "epoch": 0.6, + "grad_norm": 0.6795670841674857, + "learning_rate": 1.8091654233557588e-05, + "loss": 1.8965, + "step": 7796 + }, + { + "epoch": 0.6, + "grad_norm": 0.6368717744136054, + "learning_rate": 1.808565098070736e-05, + "loss": 1.9103, + "step": 7797 + }, + { + "epoch": 0.6, + "grad_norm": 0.6647974750713925, + "learning_rate": 1.8079648159548346e-05, + "loss": 2.0173, + "step": 7798 + }, + { + "epoch": 0.6, + "grad_norm": 0.6999442391893519, + "learning_rate": 1.807364577045534e-05, + "loss": 2.1013, + "step": 7799 + }, + { + "epoch": 0.6, + "grad_norm": 0.6073664548570264, + "learning_rate": 1.8067643813803066e-05, + "loss": 1.8972, + "step": 7800 + }, + { + "epoch": 0.6, + "grad_norm": 0.7299174686241575, + "learning_rate": 1.806164228996628e-05, + "loss": 1.9237, + "step": 7801 + }, + { + "epoch": 0.6, + "grad_norm": 0.6031002959850923, + "learning_rate": 1.8055641199319662e-05, + "loss": 1.8913, + "step": 7802 + }, + { + "epoch": 0.6, + "grad_norm": 0.600973150473757, + "learning_rate": 1.804964054223789e-05, + "loss": 2.0941, + "step": 7803 + }, + { + "epoch": 0.6, + "grad_norm": 0.5593916640226199, + "learning_rate": 1.8043640319095618e-05, + "loss": 2.0043, + "step": 7804 + }, + { + "epoch": 0.6, + "grad_norm": 0.6171216055780611, + "learning_rate": 1.8037640530267458e-05, + "loss": 1.8854, + "step": 7805 + }, + { + "epoch": 0.6, + "grad_norm": 0.5691476928332655, + "learning_rate": 1.8031641176127993e-05, + "loss": 1.9638, + "step": 7806 + }, + { + "epoch": 0.6, + "grad_norm": 0.5861069403314976, + "learning_rate": 1.802564225705181e-05, + "loss": 2.091, + "step": 7807 + }, + { + "epoch": 0.6, + "grad_norm": 0.5431777712881763, + "learning_rate": 1.8019643773413418e-05, + "loss": 1.8843, + "step": 7808 + }, + { + "epoch": 0.6, + "grad_norm": 0.6019298883685359, + "learning_rate": 1.801364572558735e-05, + "loss": 1.8988, + "step": 7809 + }, + { + "epoch": 0.6, + "grad_norm": 0.5790242965063109, + "learning_rate": 1.800764811394808e-05, + "loss": 1.9872, + "step": 7810 + }, + { + "epoch": 0.6, + "grad_norm": 0.6369057263097551, + "learning_rate": 1.8001650938870056e-05, + "loss": 2.0637, + "step": 7811 + }, + { + "epoch": 0.6, + "grad_norm": 0.5517602621617457, + "learning_rate": 1.7995654200727725e-05, + "loss": 1.8532, + "step": 7812 + }, + { + "epoch": 0.6, + "grad_norm": 0.5596762157021434, + "learning_rate": 1.7989657899895475e-05, + "loss": 1.9205, + "step": 7813 + }, + { + "epoch": 0.6, + "grad_norm": 0.6250233531507094, + "learning_rate": 1.7983662036747682e-05, + "loss": 1.9066, + "step": 7814 + }, + { + "epoch": 0.6, + "grad_norm": 0.5848115747426713, + "learning_rate": 1.7977666611658682e-05, + "loss": 2.0706, + "step": 7815 + }, + { + "epoch": 0.6, + "grad_norm": 0.5774253218411696, + "learning_rate": 1.7971671625002813e-05, + "loss": 1.9222, + "step": 7816 + }, + { + "epoch": 0.6, + "grad_norm": 0.680425238664995, + "learning_rate": 1.7965677077154358e-05, + "loss": 1.8846, + "step": 7817 + }, + { + "epoch": 0.6, + "grad_norm": 0.6590543752562206, + "learning_rate": 1.795968296848757e-05, + "loss": 1.921, + "step": 7818 + }, + { + "epoch": 0.6, + "grad_norm": 0.5760212269972022, + "learning_rate": 1.79536892993767e-05, + "loss": 2.1341, + "step": 7819 + }, + { + "epoch": 0.6, + "grad_norm": 0.6014094710801818, + "learning_rate": 1.794769607019597e-05, + "loss": 1.8809, + "step": 7820 + }, + { + "epoch": 0.6, + "grad_norm": 0.6076167046700441, + "learning_rate": 1.7941703281319526e-05, + "loss": 1.9012, + "step": 7821 + }, + { + "epoch": 0.6, + "grad_norm": 0.565255626712296, + "learning_rate": 1.7935710933121553e-05, + "loss": 1.9993, + "step": 7822 + }, + { + "epoch": 0.6, + "grad_norm": 0.6269915996758877, + "learning_rate": 1.792971902597616e-05, + "loss": 2.1131, + "step": 7823 + }, + { + "epoch": 0.6, + "grad_norm": 0.5768757771115035, + "learning_rate": 1.7923727560257457e-05, + "loss": 1.9345, + "step": 7824 + }, + { + "epoch": 0.6, + "grad_norm": 0.6170705832434309, + "learning_rate": 1.791773653633952e-05, + "loss": 1.8921, + "step": 7825 + }, + { + "epoch": 0.6, + "grad_norm": 0.5969539826377342, + "learning_rate": 1.7911745954596373e-05, + "loss": 1.8605, + "step": 7826 + }, + { + "epoch": 0.6, + "grad_norm": 0.6066256125022826, + "learning_rate": 1.7905755815402058e-05, + "loss": 2.0766, + "step": 7827 + }, + { + "epoch": 0.6, + "grad_norm": 0.5944986377650323, + "learning_rate": 1.7899766119130547e-05, + "loss": 1.8571, + "step": 7828 + }, + { + "epoch": 0.6, + "grad_norm": 0.5858110077330112, + "learning_rate": 1.7893776866155796e-05, + "loss": 1.9526, + "step": 7829 + }, + { + "epoch": 0.6, + "grad_norm": 0.5677919074652281, + "learning_rate": 1.7887788056851763e-05, + "loss": 1.8803, + "step": 7830 + }, + { + "epoch": 0.6, + "grad_norm": 0.6300631390689946, + "learning_rate": 1.788179969159233e-05, + "loss": 2.0733, + "step": 7831 + }, + { + "epoch": 0.6, + "grad_norm": 0.5922216597867523, + "learning_rate": 1.787581177075138e-05, + "loss": 1.9638, + "step": 7832 + }, + { + "epoch": 0.6, + "grad_norm": 0.5736583336007127, + "learning_rate": 1.786982429470278e-05, + "loss": 1.8957, + "step": 7833 + }, + { + "epoch": 0.6, + "grad_norm": 0.6553455345654143, + "learning_rate": 1.786383726382034e-05, + "loss": 1.9086, + "step": 7834 + }, + { + "epoch": 0.6, + "grad_norm": 0.5445063604073471, + "learning_rate": 1.7857850678477853e-05, + "loss": 2.1732, + "step": 7835 + }, + { + "epoch": 0.6, + "grad_norm": 0.5949209908124793, + "learning_rate": 1.785186453904908e-05, + "loss": 1.891, + "step": 7836 + }, + { + "epoch": 0.6, + "grad_norm": 0.601220615707868, + "learning_rate": 1.7845878845907777e-05, + "loss": 1.947, + "step": 7837 + }, + { + "epoch": 0.6, + "grad_norm": 0.5717188789287955, + "learning_rate": 1.7839893599427655e-05, + "loss": 1.919, + "step": 7838 + }, + { + "epoch": 0.6, + "grad_norm": 0.6157986394718005, + "learning_rate": 1.7833908799982375e-05, + "loss": 2.0781, + "step": 7839 + }, + { + "epoch": 0.6, + "grad_norm": 0.5897577683343103, + "learning_rate": 1.7827924447945607e-05, + "loss": 1.8707, + "step": 7840 + }, + { + "epoch": 0.6, + "grad_norm": 0.544929613959994, + "learning_rate": 1.782194054369099e-05, + "loss": 1.9656, + "step": 7841 + }, + { + "epoch": 0.6, + "grad_norm": 0.6379077735656136, + "learning_rate": 1.781595708759211e-05, + "loss": 1.8916, + "step": 7842 + }, + { + "epoch": 0.61, + "grad_norm": 0.6000366617506121, + "learning_rate": 1.780997408002255e-05, + "loss": 2.1222, + "step": 7843 + }, + { + "epoch": 0.61, + "grad_norm": 0.6179728577898299, + "learning_rate": 1.7803991521355833e-05, + "loss": 1.9332, + "step": 7844 + }, + { + "epoch": 0.61, + "grad_norm": 0.6523102133110338, + "learning_rate": 1.7798009411965488e-05, + "loss": 1.8804, + "step": 7845 + }, + { + "epoch": 0.61, + "grad_norm": 0.7151860094893757, + "learning_rate": 1.779202775222501e-05, + "loss": 1.8776, + "step": 7846 + }, + { + "epoch": 0.61, + "grad_norm": 0.6217671189010079, + "learning_rate": 1.7786046542507843e-05, + "loss": 1.9642, + "step": 7847 + }, + { + "epoch": 0.61, + "grad_norm": 0.6190687440613986, + "learning_rate": 1.7780065783187433e-05, + "loss": 2.0869, + "step": 7848 + }, + { + "epoch": 0.61, + "grad_norm": 0.5916595747307961, + "learning_rate": 1.7774085474637174e-05, + "loss": 1.8928, + "step": 7849 + }, + { + "epoch": 0.61, + "grad_norm": 0.6289275709568086, + "learning_rate": 1.7768105617230436e-05, + "loss": 1.8397, + "step": 7850 + }, + { + "epoch": 0.61, + "grad_norm": 0.6002033890970713, + "learning_rate": 1.7762126211340584e-05, + "loss": 2.1026, + "step": 7851 + }, + { + "epoch": 0.61, + "grad_norm": 0.6448147130258309, + "learning_rate": 1.775614725734092e-05, + "loss": 1.8848, + "step": 7852 + }, + { + "epoch": 0.61, + "grad_norm": 0.555557086988092, + "learning_rate": 1.775016875560474e-05, + "loss": 1.9457, + "step": 7853 + }, + { + "epoch": 0.61, + "grad_norm": 0.6107464537399255, + "learning_rate": 1.7744190706505314e-05, + "loss": 1.8869, + "step": 7854 + }, + { + "epoch": 0.61, + "grad_norm": 0.5869432236206674, + "learning_rate": 1.7738213110415862e-05, + "loss": 2.0733, + "step": 7855 + }, + { + "epoch": 0.61, + "grad_norm": 0.6157146054263752, + "learning_rate": 1.773223596770961e-05, + "loss": 1.9056, + "step": 7856 + }, + { + "epoch": 0.61, + "grad_norm": 0.6584867691868652, + "learning_rate": 1.7726259278759716e-05, + "loss": 1.9017, + "step": 7857 + }, + { + "epoch": 0.61, + "grad_norm": 0.6264213813329534, + "learning_rate": 1.772028304393933e-05, + "loss": 1.8453, + "step": 7858 + }, + { + "epoch": 0.61, + "grad_norm": 0.7118522151676001, + "learning_rate": 1.771430726362159e-05, + "loss": 2.1332, + "step": 7859 + }, + { + "epoch": 0.61, + "grad_norm": 0.5368481785471216, + "learning_rate": 1.7708331938179573e-05, + "loss": 2.0009, + "step": 7860 + }, + { + "epoch": 0.61, + "grad_norm": 0.5710401867510809, + "learning_rate": 1.770235706798635e-05, + "loss": 1.8649, + "step": 7861 + }, + { + "epoch": 0.61, + "grad_norm": 0.6109026683273162, + "learning_rate": 1.7696382653414954e-05, + "loss": 1.9261, + "step": 7862 + }, + { + "epoch": 0.61, + "grad_norm": 0.5869656608413821, + "learning_rate": 1.769040869483839e-05, + "loss": 2.1461, + "step": 7863 + }, + { + "epoch": 0.61, + "grad_norm": 0.6068625502702995, + "learning_rate": 1.7684435192629646e-05, + "loss": 1.9137, + "step": 7864 + }, + { + "epoch": 0.61, + "grad_norm": 0.5809425731552649, + "learning_rate": 1.767846214716166e-05, + "loss": 1.8826, + "step": 7865 + }, + { + "epoch": 0.61, + "grad_norm": 0.5677613606869302, + "learning_rate": 1.7672489558807358e-05, + "loss": 1.9327, + "step": 7866 + }, + { + "epoch": 0.61, + "grad_norm": 0.6166180087456454, + "learning_rate": 1.7666517427939647e-05, + "loss": 1.8603, + "step": 7867 + }, + { + "epoch": 0.61, + "grad_norm": 0.5764775943284338, + "learning_rate": 1.7660545754931366e-05, + "loss": 2.0908, + "step": 7868 + }, + { + "epoch": 0.61, + "grad_norm": 0.5927651333936975, + "learning_rate": 1.7654574540155375e-05, + "loss": 1.8477, + "step": 7869 + }, + { + "epoch": 0.61, + "grad_norm": 0.6216857035616014, + "learning_rate": 1.7648603783984467e-05, + "loss": 1.9295, + "step": 7870 + }, + { + "epoch": 0.61, + "grad_norm": 0.59652069168757, + "learning_rate": 1.7642633486791417e-05, + "loss": 2.1204, + "step": 7871 + }, + { + "epoch": 0.61, + "grad_norm": 0.5288864603747875, + "learning_rate": 1.7636663648949e-05, + "loss": 2.0155, + "step": 7872 + }, + { + "epoch": 0.61, + "grad_norm": 0.5794646324459539, + "learning_rate": 1.7630694270829902e-05, + "loss": 1.9097, + "step": 7873 + }, + { + "epoch": 0.61, + "grad_norm": 0.607118808374081, + "learning_rate": 1.762472535280685e-05, + "loss": 1.8734, + "step": 7874 + }, + { + "epoch": 0.61, + "grad_norm": 0.639673190950008, + "learning_rate": 1.7618756895252488e-05, + "loss": 2.0686, + "step": 7875 + }, + { + "epoch": 0.61, + "grad_norm": 0.5895451515529958, + "learning_rate": 1.7612788898539448e-05, + "loss": 1.8973, + "step": 7876 + }, + { + "epoch": 0.61, + "grad_norm": 0.6220253777305389, + "learning_rate": 1.7606821363040354e-05, + "loss": 1.9109, + "step": 7877 + }, + { + "epoch": 0.61, + "grad_norm": 0.7342770980332243, + "learning_rate": 1.760085428912777e-05, + "loss": 1.9813, + "step": 7878 + }, + { + "epoch": 0.61, + "grad_norm": 0.5549089337712442, + "learning_rate": 1.759488767717425e-05, + "loss": 1.8884, + "step": 7879 + }, + { + "epoch": 0.61, + "grad_norm": 0.7152730103099663, + "learning_rate": 1.7588921527552317e-05, + "loss": 2.0773, + "step": 7880 + }, + { + "epoch": 0.61, + "grad_norm": 0.6943605257457981, + "learning_rate": 1.7582955840634458e-05, + "loss": 1.852, + "step": 7881 + }, + { + "epoch": 0.61, + "grad_norm": 0.6611591251877922, + "learning_rate": 1.7576990616793137e-05, + "loss": 1.8504, + "step": 7882 + }, + { + "epoch": 0.61, + "grad_norm": 0.5935913874892953, + "learning_rate": 1.7571025856400776e-05, + "loss": 2.0245, + "step": 7883 + }, + { + "epoch": 0.61, + "grad_norm": 0.684131538217943, + "learning_rate": 1.7565061559829796e-05, + "loss": 1.9444, + "step": 7884 + }, + { + "epoch": 0.61, + "grad_norm": 0.6948814435617148, + "learning_rate": 1.7559097727452574e-05, + "loss": 1.8492, + "step": 7885 + }, + { + "epoch": 0.61, + "grad_norm": 0.6375461746237897, + "learning_rate": 1.7553134359641434e-05, + "loss": 1.8901, + "step": 7886 + }, + { + "epoch": 0.61, + "grad_norm": 0.7377319021951324, + "learning_rate": 1.754717145676871e-05, + "loss": 2.1231, + "step": 7887 + }, + { + "epoch": 0.61, + "grad_norm": 0.6110164086481045, + "learning_rate": 1.75412090192067e-05, + "loss": 1.8822, + "step": 7888 + }, + { + "epoch": 0.61, + "grad_norm": 0.5990784653626101, + "learning_rate": 1.7535247047327647e-05, + "loss": 1.8701, + "step": 7889 + }, + { + "epoch": 0.61, + "grad_norm": 0.6213652946377782, + "learning_rate": 1.7529285541503793e-05, + "loss": 1.9267, + "step": 7890 + }, + { + "epoch": 0.61, + "grad_norm": 0.6712567975216626, + "learning_rate": 1.7523324502107322e-05, + "loss": 1.9811, + "step": 7891 + }, + { + "epoch": 0.61, + "grad_norm": 0.6564395912470025, + "learning_rate": 1.751736392951042e-05, + "loss": 2.0606, + "step": 7892 + }, + { + "epoch": 0.61, + "grad_norm": 0.6159426270957444, + "learning_rate": 1.7511403824085237e-05, + "loss": 1.8315, + "step": 7893 + }, + { + "epoch": 0.61, + "grad_norm": 0.78930003480808, + "learning_rate": 1.7505444186203863e-05, + "loss": 1.8509, + "step": 7894 + }, + { + "epoch": 0.61, + "grad_norm": 0.6570447162742777, + "learning_rate": 1.749948501623841e-05, + "loss": 2.063, + "step": 7895 + }, + { + "epoch": 0.61, + "grad_norm": 0.5827395437317916, + "learning_rate": 1.7493526314560914e-05, + "loss": 1.8527, + "step": 7896 + }, + { + "epoch": 0.61, + "grad_norm": 0.6478877776408313, + "learning_rate": 1.7487568081543402e-05, + "loss": 1.9804, + "step": 7897 + }, + { + "epoch": 0.61, + "grad_norm": 0.6965102124717057, + "learning_rate": 1.7481610317557886e-05, + "loss": 1.907, + "step": 7898 + }, + { + "epoch": 0.61, + "grad_norm": 0.5703523088545954, + "learning_rate": 1.7475653022976322e-05, + "loss": 1.8926, + "step": 7899 + }, + { + "epoch": 0.61, + "grad_norm": 0.7261274965392124, + "learning_rate": 1.7469696198170645e-05, + "loss": 2.0773, + "step": 7900 + }, + { + "epoch": 0.61, + "grad_norm": 0.5916676651136727, + "learning_rate": 1.7463739843512777e-05, + "loss": 1.9314, + "step": 7901 + }, + { + "epoch": 0.61, + "grad_norm": 0.6047730800406996, + "learning_rate": 1.7457783959374586e-05, + "loss": 1.9088, + "step": 7902 + }, + { + "epoch": 0.61, + "grad_norm": 0.6273730038263792, + "learning_rate": 1.7451828546127933e-05, + "loss": 2.0144, + "step": 7903 + }, + { + "epoch": 0.61, + "grad_norm": 0.6265284706347284, + "learning_rate": 1.7445873604144618e-05, + "loss": 2.0741, + "step": 7904 + }, + { + "epoch": 0.61, + "grad_norm": 0.5870418695440645, + "learning_rate": 1.7439919133796454e-05, + "loss": 1.8845, + "step": 7905 + }, + { + "epoch": 0.61, + "grad_norm": 0.5911542426237453, + "learning_rate": 1.7433965135455205e-05, + "loss": 1.8938, + "step": 7906 + }, + { + "epoch": 0.61, + "grad_norm": 0.7172121942309946, + "learning_rate": 1.7428011609492583e-05, + "loss": 2.0289, + "step": 7907 + }, + { + "epoch": 0.61, + "grad_norm": 0.6178258872050997, + "learning_rate": 1.742205855628031e-05, + "loss": 1.9205, + "step": 7908 + }, + { + "epoch": 0.61, + "grad_norm": 0.5437219516372771, + "learning_rate": 1.741610597619005e-05, + "loss": 1.9571, + "step": 7909 + }, + { + "epoch": 0.61, + "grad_norm": 0.5808736437971359, + "learning_rate": 1.741015386959345e-05, + "loss": 1.8994, + "step": 7910 + }, + { + "epoch": 0.61, + "grad_norm": 0.6386257751831419, + "learning_rate": 1.7404202236862126e-05, + "loss": 1.908, + "step": 7911 + }, + { + "epoch": 0.61, + "grad_norm": 0.5856127589847484, + "learning_rate": 1.7398251078367653e-05, + "loss": 2.1146, + "step": 7912 + }, + { + "epoch": 0.61, + "grad_norm": 0.5849799831314236, + "learning_rate": 1.7392300394481597e-05, + "loss": 1.8754, + "step": 7913 + }, + { + "epoch": 0.61, + "grad_norm": 0.577959089254045, + "learning_rate": 1.738635018557549e-05, + "loss": 1.8759, + "step": 7914 + }, + { + "epoch": 0.61, + "grad_norm": 0.7037860917785987, + "learning_rate": 1.73804004520208e-05, + "loss": 2.1306, + "step": 7915 + }, + { + "epoch": 0.61, + "grad_norm": 0.5369241546013384, + "learning_rate": 1.737445119418903e-05, + "loss": 1.9128, + "step": 7916 + }, + { + "epoch": 0.61, + "grad_norm": 0.5684730169652269, + "learning_rate": 1.736850241245159e-05, + "loss": 1.8928, + "step": 7917 + }, + { + "epoch": 0.61, + "grad_norm": 0.6438222618279876, + "learning_rate": 1.736255410717989e-05, + "loss": 1.9077, + "step": 7918 + }, + { + "epoch": 0.61, + "grad_norm": 0.589757951522892, + "learning_rate": 1.7356606278745325e-05, + "loss": 2.0778, + "step": 7919 + }, + { + "epoch": 0.61, + "grad_norm": 0.6387325451317934, + "learning_rate": 1.735065892751922e-05, + "loss": 1.9, + "step": 7920 + }, + { + "epoch": 0.61, + "grad_norm": 0.6508457076639893, + "learning_rate": 1.7344712053872904e-05, + "loss": 1.8971, + "step": 7921 + }, + { + "epoch": 0.61, + "grad_norm": 0.6313086783591261, + "learning_rate": 1.7338765658177673e-05, + "loss": 1.9701, + "step": 7922 + }, + { + "epoch": 0.61, + "grad_norm": 0.6351958365253333, + "learning_rate": 1.7332819740804763e-05, + "loss": 1.909, + "step": 7923 + }, + { + "epoch": 0.61, + "grad_norm": 0.5854426547907485, + "learning_rate": 1.7326874302125425e-05, + "loss": 2.088, + "step": 7924 + }, + { + "epoch": 0.61, + "grad_norm": 0.6417122455130803, + "learning_rate": 1.7320929342510835e-05, + "loss": 1.9338, + "step": 7925 + }, + { + "epoch": 0.61, + "grad_norm": 0.6150854322462879, + "learning_rate": 1.7314984862332174e-05, + "loss": 1.8561, + "step": 7926 + }, + { + "epoch": 0.61, + "grad_norm": 0.6194812651014531, + "learning_rate": 1.7309040861960588e-05, + "loss": 2.1163, + "step": 7927 + }, + { + "epoch": 0.61, + "grad_norm": 0.6255036604490646, + "learning_rate": 1.7303097341767168e-05, + "loss": 1.9524, + "step": 7928 + }, + { + "epoch": 0.61, + "grad_norm": 0.5913152906707704, + "learning_rate": 1.729715430212301e-05, + "loss": 1.8699, + "step": 7929 + }, + { + "epoch": 0.61, + "grad_norm": 0.630252163603967, + "learning_rate": 1.7291211743399142e-05, + "loss": 1.8665, + "step": 7930 + }, + { + "epoch": 0.61, + "grad_norm": 0.6743948522811635, + "learning_rate": 1.7285269665966597e-05, + "loss": 1.8937, + "step": 7931 + }, + { + "epoch": 0.61, + "grad_norm": 0.5741099734215458, + "learning_rate": 1.7279328070196367e-05, + "loss": 2.0774, + "step": 7932 + }, + { + "epoch": 0.61, + "grad_norm": 0.5663571055188146, + "learning_rate": 1.7273386956459387e-05, + "loss": 1.8711, + "step": 7933 + }, + { + "epoch": 0.61, + "grad_norm": 0.5469398317891959, + "learning_rate": 1.7267446325126613e-05, + "loss": 1.984, + "step": 7934 + }, + { + "epoch": 0.61, + "grad_norm": 0.590180910990213, + "learning_rate": 1.726150617656893e-05, + "loss": 1.9194, + "step": 7935 + }, + { + "epoch": 0.61, + "grad_norm": 0.6495951347388106, + "learning_rate": 1.7255566511157202e-05, + "loss": 2.1191, + "step": 7936 + }, + { + "epoch": 0.61, + "grad_norm": 0.5711526199658411, + "learning_rate": 1.724962732926228e-05, + "loss": 1.9093, + "step": 7937 + }, + { + "epoch": 0.61, + "grad_norm": 0.6701154678327149, + "learning_rate": 1.724368863125495e-05, + "loss": 1.8442, + "step": 7938 + }, + { + "epoch": 0.61, + "grad_norm": 0.6142836841447056, + "learning_rate": 1.723775041750601e-05, + "loss": 2.0674, + "step": 7939 + }, + { + "epoch": 0.61, + "grad_norm": 0.6181215189326719, + "learning_rate": 1.7231812688386207e-05, + "loss": 1.9722, + "step": 7940 + }, + { + "epoch": 0.61, + "grad_norm": 0.6913465182502596, + "learning_rate": 1.722587544426624e-05, + "loss": 1.9164, + "step": 7941 + }, + { + "epoch": 0.61, + "grad_norm": 0.5774474714587979, + "learning_rate": 1.7219938685516813e-05, + "loss": 1.8898, + "step": 7942 + }, + { + "epoch": 0.61, + "grad_norm": 0.6105863409077543, + "learning_rate": 1.721400241250857e-05, + "loss": 1.8607, + "step": 7943 + }, + { + "epoch": 0.61, + "grad_norm": 0.6597433325417092, + "learning_rate": 1.7208066625612137e-05, + "loss": 2.0587, + "step": 7944 + }, + { + "epoch": 0.61, + "grad_norm": 0.5939433443558272, + "learning_rate": 1.7202131325198128e-05, + "loss": 1.9571, + "step": 7945 + }, + { + "epoch": 0.61, + "grad_norm": 0.6551387871101169, + "learning_rate": 1.7196196511637084e-05, + "loss": 1.8808, + "step": 7946 + }, + { + "epoch": 0.61, + "grad_norm": 0.6203557296536555, + "learning_rate": 1.7190262185299553e-05, + "loss": 1.9544, + "step": 7947 + }, + { + "epoch": 0.61, + "grad_norm": 0.6123601245854662, + "learning_rate": 1.7184328346556043e-05, + "loss": 2.0858, + "step": 7948 + }, + { + "epoch": 0.61, + "grad_norm": 0.6121778463590674, + "learning_rate": 1.7178394995777017e-05, + "loss": 1.886, + "step": 7949 + }, + { + "epoch": 0.61, + "grad_norm": 0.6997189840662202, + "learning_rate": 1.7172462133332928e-05, + "loss": 1.8644, + "step": 7950 + }, + { + "epoch": 0.61, + "grad_norm": 0.6597529635170581, + "learning_rate": 1.7166529759594175e-05, + "loss": 1.9119, + "step": 7951 + }, + { + "epoch": 0.61, + "grad_norm": 0.6709672561238749, + "learning_rate": 1.716059787493115e-05, + "loss": 2.1265, + "step": 7952 + }, + { + "epoch": 0.61, + "grad_norm": 0.6120303691723041, + "learning_rate": 1.7154666479714217e-05, + "loss": 1.9662, + "step": 7953 + }, + { + "epoch": 0.61, + "grad_norm": 0.7134738882650948, + "learning_rate": 1.7148735574313678e-05, + "loss": 1.8967, + "step": 7954 + }, + { + "epoch": 0.61, + "grad_norm": 0.5841904401426483, + "learning_rate": 1.7142805159099823e-05, + "loss": 1.9011, + "step": 7955 + }, + { + "epoch": 0.61, + "grad_norm": 0.5456764169555162, + "learning_rate": 1.7136875234442935e-05, + "loss": 2.0628, + "step": 7956 + }, + { + "epoch": 0.61, + "grad_norm": 0.6893908449369878, + "learning_rate": 1.713094580071322e-05, + "loss": 1.8831, + "step": 7957 + }, + { + "epoch": 0.61, + "grad_norm": 0.5742596989805362, + "learning_rate": 1.712501685828089e-05, + "loss": 1.9134, + "step": 7958 + }, + { + "epoch": 0.61, + "grad_norm": 0.6545005024332411, + "learning_rate": 1.71190884075161e-05, + "loss": 1.9738, + "step": 7959 + }, + { + "epoch": 0.61, + "grad_norm": 0.6106907723159131, + "learning_rate": 1.7113160448789004e-05, + "loss": 2.0977, + "step": 7960 + }, + { + "epoch": 0.61, + "grad_norm": 0.6451038191553446, + "learning_rate": 1.7107232982469703e-05, + "loss": 1.9291, + "step": 7961 + }, + { + "epoch": 0.61, + "grad_norm": 0.7239972334341865, + "learning_rate": 1.7101306008928264e-05, + "loss": 1.9091, + "step": 7962 + }, + { + "epoch": 0.61, + "grad_norm": 0.662887261971421, + "learning_rate": 1.7095379528534747e-05, + "loss": 1.8839, + "step": 7963 + }, + { + "epoch": 0.61, + "grad_norm": 0.6312263534688255, + "learning_rate": 1.7089453541659156e-05, + "loss": 2.0567, + "step": 7964 + }, + { + "epoch": 0.61, + "grad_norm": 0.7483024201679156, + "learning_rate": 1.7083528048671472e-05, + "loss": 1.945, + "step": 7965 + }, + { + "epoch": 0.61, + "grad_norm": 0.635652201073088, + "learning_rate": 1.707760304994166e-05, + "loss": 1.8519, + "step": 7966 + }, + { + "epoch": 0.61, + "grad_norm": 0.607762180081153, + "learning_rate": 1.7071678545839637e-05, + "loss": 1.9039, + "step": 7967 + }, + { + "epoch": 0.61, + "grad_norm": 0.7066561170774155, + "learning_rate": 1.706575453673528e-05, + "loss": 2.0635, + "step": 7968 + }, + { + "epoch": 0.61, + "grad_norm": 0.5958641381948798, + "learning_rate": 1.705983102299848e-05, + "loss": 1.8844, + "step": 7969 + }, + { + "epoch": 0.61, + "grad_norm": 0.5972990343197131, + "learning_rate": 1.7053908004999034e-05, + "loss": 1.9133, + "step": 7970 + }, + { + "epoch": 0.61, + "grad_norm": 0.6463145125872612, + "learning_rate": 1.7047985483106765e-05, + "loss": 1.946, + "step": 7971 + }, + { + "epoch": 0.62, + "grad_norm": 0.588798646411515, + "learning_rate": 1.7042063457691427e-05, + "loss": 2.0574, + "step": 7972 + }, + { + "epoch": 0.62, + "grad_norm": 0.5766981243692957, + "learning_rate": 1.7036141929122752e-05, + "loss": 1.8695, + "step": 7973 + }, + { + "epoch": 0.62, + "grad_norm": 0.5941651588530273, + "learning_rate": 1.703022089777046e-05, + "loss": 1.9097, + "step": 7974 + }, + { + "epoch": 0.62, + "grad_norm": 0.5730527789485307, + "learning_rate": 1.702430036400422e-05, + "loss": 1.9164, + "step": 7975 + }, + { + "epoch": 0.62, + "grad_norm": 0.631103891784438, + "learning_rate": 1.7018380328193674e-05, + "loss": 2.0435, + "step": 7976 + }, + { + "epoch": 0.62, + "grad_norm": 0.5534488734333493, + "learning_rate": 1.7012460790708425e-05, + "loss": 1.7841, + "step": 7977 + }, + { + "epoch": 0.62, + "grad_norm": 0.5734930696584916, + "learning_rate": 1.7006541751918065e-05, + "loss": 1.9679, + "step": 7978 + }, + { + "epoch": 0.62, + "grad_norm": 0.5548599008500483, + "learning_rate": 1.7000623212192145e-05, + "loss": 1.8742, + "step": 7979 + }, + { + "epoch": 0.62, + "grad_norm": 0.6245666297705111, + "learning_rate": 1.699470517190018e-05, + "loss": 2.0611, + "step": 7980 + }, + { + "epoch": 0.62, + "grad_norm": 0.606544668125844, + "learning_rate": 1.698878763141165e-05, + "loss": 1.9305, + "step": 7981 + }, + { + "epoch": 0.62, + "grad_norm": 0.6158267457037985, + "learning_rate": 1.698287059109603e-05, + "loss": 1.9005, + "step": 7982 + }, + { + "epoch": 0.62, + "grad_norm": 0.560299826039455, + "learning_rate": 1.6976954051322723e-05, + "loss": 1.84, + "step": 7983 + }, + { + "epoch": 0.62, + "grad_norm": 0.6148135511510494, + "learning_rate": 1.697103801246115e-05, + "loss": 2.1081, + "step": 7984 + }, + { + "epoch": 0.62, + "grad_norm": 0.5975577107629523, + "learning_rate": 1.696512247488064e-05, + "loss": 1.8782, + "step": 7985 + }, + { + "epoch": 0.62, + "grad_norm": 0.6683331777920812, + "learning_rate": 1.695920743895055e-05, + "loss": 1.9223, + "step": 7986 + }, + { + "epoch": 0.62, + "grad_norm": 0.6126261745148434, + "learning_rate": 1.6953292905040176e-05, + "loss": 1.8976, + "step": 7987 + }, + { + "epoch": 0.62, + "grad_norm": 0.6679495069078509, + "learning_rate": 1.6947378873518773e-05, + "loss": 2.0959, + "step": 7988 + }, + { + "epoch": 0.62, + "grad_norm": 0.6938483756843288, + "learning_rate": 1.69414653447556e-05, + "loss": 1.9325, + "step": 7989 + }, + { + "epoch": 0.62, + "grad_norm": 0.5716343155119477, + "learning_rate": 1.6935552319119842e-05, + "loss": 1.9796, + "step": 7990 + }, + { + "epoch": 0.62, + "grad_norm": 0.6483450979338703, + "learning_rate": 1.692963979698068e-05, + "loss": 1.9206, + "step": 7991 + }, + { + "epoch": 0.62, + "grad_norm": 0.5561805460373539, + "learning_rate": 1.692372777870727e-05, + "loss": 2.1099, + "step": 7992 + }, + { + "epoch": 0.62, + "grad_norm": 0.6956486869538951, + "learning_rate": 1.6917816264668705e-05, + "loss": 1.9271, + "step": 7993 + }, + { + "epoch": 0.62, + "grad_norm": 0.602162545252122, + "learning_rate": 1.6911905255234073e-05, + "loss": 1.8839, + "step": 7994 + }, + { + "epoch": 0.62, + "grad_norm": 0.5901686064514177, + "learning_rate": 1.6905994750772427e-05, + "loss": 1.8695, + "step": 7995 + }, + { + "epoch": 0.62, + "grad_norm": 0.569357300453312, + "learning_rate": 1.690008475165278e-05, + "loss": 2.127, + "step": 7996 + }, + { + "epoch": 0.62, + "grad_norm": 0.6560344160546161, + "learning_rate": 1.6894175258244116e-05, + "loss": 1.8329, + "step": 7997 + }, + { + "epoch": 0.62, + "grad_norm": 0.7627662379042516, + "learning_rate": 1.6888266270915386e-05, + "loss": 1.8988, + "step": 7998 + }, + { + "epoch": 0.62, + "grad_norm": 0.597755388550354, + "learning_rate": 1.688235779003552e-05, + "loss": 1.9014, + "step": 7999 + }, + { + "epoch": 0.62, + "grad_norm": 0.6881618806615041, + "learning_rate": 1.687644981597341e-05, + "loss": 2.0975, + "step": 8000 + }, + { + "epoch": 0.62, + "grad_norm": 0.7977816978091596, + "learning_rate": 1.6870542349097896e-05, + "loss": 1.8794, + "step": 8001 + }, + { + "epoch": 0.62, + "grad_norm": 0.6734383211323276, + "learning_rate": 1.6864635389777823e-05, + "loss": 1.9372, + "step": 8002 + }, + { + "epoch": 0.62, + "grad_norm": 0.7203124991084491, + "learning_rate": 1.6858728938381996e-05, + "loss": 1.8298, + "step": 8003 + }, + { + "epoch": 0.62, + "grad_norm": 0.6864009574616214, + "learning_rate": 1.6852822995279162e-05, + "loss": 2.1231, + "step": 8004 + }, + { + "epoch": 0.62, + "grad_norm": 0.6852291387443736, + "learning_rate": 1.684691756083806e-05, + "loss": 1.8848, + "step": 8005 + }, + { + "epoch": 0.62, + "grad_norm": 0.6096937971114873, + "learning_rate": 1.684101263542738e-05, + "loss": 1.9214, + "step": 8006 + }, + { + "epoch": 0.62, + "grad_norm": 0.7029510562460212, + "learning_rate": 1.68351082194158e-05, + "loss": 1.8697, + "step": 8007 + }, + { + "epoch": 0.62, + "grad_norm": 0.7127286186459997, + "learning_rate": 1.6829204313171966e-05, + "loss": 2.123, + "step": 8008 + }, + { + "epoch": 0.62, + "grad_norm": 0.6243969119573524, + "learning_rate": 1.682330091706446e-05, + "loss": 1.9176, + "step": 8009 + }, + { + "epoch": 0.62, + "grad_norm": 0.7802181979540191, + "learning_rate": 1.6817398031461877e-05, + "loss": 1.8532, + "step": 8010 + }, + { + "epoch": 0.62, + "grad_norm": 0.5591925316772424, + "learning_rate": 1.681149565673275e-05, + "loss": 1.836, + "step": 8011 + }, + { + "epoch": 0.62, + "grad_norm": 0.9567767408081158, + "learning_rate": 1.680559379324558e-05, + "loss": 2.0832, + "step": 8012 + }, + { + "epoch": 0.62, + "grad_norm": 0.5533084232426075, + "learning_rate": 1.6799692441368864e-05, + "loss": 1.8367, + "step": 8013 + }, + { + "epoch": 0.62, + "grad_norm": 0.7830512871304877, + "learning_rate": 1.6793791601471032e-05, + "loss": 1.8869, + "step": 8014 + }, + { + "epoch": 0.62, + "grad_norm": 0.616983149810398, + "learning_rate": 1.67878912739205e-05, + "loss": 1.9416, + "step": 8015 + }, + { + "epoch": 0.62, + "grad_norm": 0.6995988982109247, + "learning_rate": 1.6781991459085655e-05, + "loss": 2.0572, + "step": 8016 + }, + { + "epoch": 0.62, + "grad_norm": 0.75433762192975, + "learning_rate": 1.6776092157334837e-05, + "loss": 1.933, + "step": 8017 + }, + { + "epoch": 0.62, + "grad_norm": 0.6567393243415063, + "learning_rate": 1.6770193369036387e-05, + "loss": 1.8639, + "step": 8018 + }, + { + "epoch": 0.62, + "grad_norm": 0.7404922005921006, + "learning_rate": 1.6764295094558558e-05, + "loss": 1.8564, + "step": 8019 + }, + { + "epoch": 0.62, + "grad_norm": 0.6152472413509229, + "learning_rate": 1.6758397334269617e-05, + "loss": 2.0461, + "step": 8020 + }, + { + "epoch": 0.62, + "grad_norm": 0.6866267694510039, + "learning_rate": 1.6752500088537802e-05, + "loss": 1.9558, + "step": 8021 + }, + { + "epoch": 0.62, + "grad_norm": 0.6343017888169888, + "learning_rate": 1.674660335773128e-05, + "loss": 1.8898, + "step": 8022 + }, + { + "epoch": 0.62, + "grad_norm": 0.6819270641416761, + "learning_rate": 1.674070714221822e-05, + "loss": 1.917, + "step": 8023 + }, + { + "epoch": 0.62, + "grad_norm": 0.7417602449024082, + "learning_rate": 1.673481144236674e-05, + "loss": 2.0914, + "step": 8024 + }, + { + "epoch": 0.62, + "grad_norm": 0.5716904858969567, + "learning_rate": 1.6728916258544934e-05, + "loss": 1.9304, + "step": 8025 + }, + { + "epoch": 0.62, + "grad_norm": 0.9571997724407122, + "learning_rate": 1.6723021591120868e-05, + "loss": 1.914, + "step": 8026 + }, + { + "epoch": 0.62, + "grad_norm": 0.6170397039361923, + "learning_rate": 1.671712744046256e-05, + "loss": 1.972, + "step": 8027 + }, + { + "epoch": 0.62, + "grad_norm": 0.7542187494579101, + "learning_rate": 1.671123380693802e-05, + "loss": 2.0995, + "step": 8028 + }, + { + "epoch": 0.62, + "grad_norm": 0.6364200626318585, + "learning_rate": 1.6705340690915204e-05, + "loss": 1.8761, + "step": 8029 + }, + { + "epoch": 0.62, + "grad_norm": 0.6798029863896007, + "learning_rate": 1.669944809276204e-05, + "loss": 1.9271, + "step": 8030 + }, + { + "epoch": 0.62, + "grad_norm": 0.6346467238584688, + "learning_rate": 1.669355601284644e-05, + "loss": 1.8897, + "step": 8031 + }, + { + "epoch": 0.62, + "grad_norm": 0.6258482406132627, + "learning_rate": 1.6687664451536257e-05, + "loss": 2.1138, + "step": 8032 + }, + { + "epoch": 0.62, + "grad_norm": 0.6937016290866295, + "learning_rate": 1.668177340919932e-05, + "loss": 1.9456, + "step": 8033 + }, + { + "epoch": 0.62, + "grad_norm": 0.6942954855108763, + "learning_rate": 1.6675882886203455e-05, + "loss": 1.9401, + "step": 8034 + }, + { + "epoch": 0.62, + "grad_norm": 0.6378264902439612, + "learning_rate": 1.6669992882916407e-05, + "loss": 1.9037, + "step": 8035 + }, + { + "epoch": 0.62, + "grad_norm": 0.7453561900100464, + "learning_rate": 1.666410339970593e-05, + "loss": 2.1065, + "step": 8036 + }, + { + "epoch": 0.62, + "grad_norm": 0.6114200967986942, + "learning_rate": 1.6658214436939727e-05, + "loss": 1.8579, + "step": 8037 + }, + { + "epoch": 0.62, + "grad_norm": 0.7159341784228856, + "learning_rate": 1.6652325994985457e-05, + "loss": 1.8801, + "step": 8038 + }, + { + "epoch": 0.62, + "grad_norm": 0.6965514550871307, + "learning_rate": 1.6646438074210773e-05, + "loss": 1.8748, + "step": 8039 + }, + { + "epoch": 0.62, + "grad_norm": 0.5231232297080328, + "learning_rate": 1.664055067498328e-05, + "loss": 2.1366, + "step": 8040 + }, + { + "epoch": 0.62, + "grad_norm": 0.7130533992530428, + "learning_rate": 1.663466379767054e-05, + "loss": 1.9333, + "step": 8041 + }, + { + "epoch": 0.62, + "grad_norm": 0.7032800803893571, + "learning_rate": 1.6628777442640116e-05, + "loss": 1.9024, + "step": 8042 + }, + { + "epoch": 0.62, + "grad_norm": 0.6120943397108941, + "learning_rate": 1.66228916102595e-05, + "loss": 1.8988, + "step": 8043 + }, + { + "epoch": 0.62, + "grad_norm": 0.6785806725400173, + "learning_rate": 1.6617006300896183e-05, + "loss": 2.08, + "step": 8044 + }, + { + "epoch": 0.62, + "grad_norm": 0.5878644076970218, + "learning_rate": 1.6611121514917585e-05, + "loss": 1.8978, + "step": 8045 + }, + { + "epoch": 0.62, + "grad_norm": 0.5831659900656351, + "learning_rate": 1.660523725269114e-05, + "loss": 1.9658, + "step": 8046 + }, + { + "epoch": 0.62, + "grad_norm": 0.5602775535804109, + "learning_rate": 1.659935351458423e-05, + "loss": 1.824, + "step": 8047 + }, + { + "epoch": 0.62, + "grad_norm": 0.6002489386191445, + "learning_rate": 1.6593470300964177e-05, + "loss": 2.0646, + "step": 8048 + }, + { + "epoch": 0.62, + "grad_norm": 0.5564676984264488, + "learning_rate": 1.658758761219831e-05, + "loss": 1.8593, + "step": 8049 + }, + { + "epoch": 0.62, + "grad_norm": 0.6264873869681522, + "learning_rate": 1.6581705448653916e-05, + "loss": 1.8952, + "step": 8050 + }, + { + "epoch": 0.62, + "grad_norm": 0.6058678630819602, + "learning_rate": 1.6575823810698227e-05, + "loss": 1.8987, + "step": 8051 + }, + { + "epoch": 0.62, + "grad_norm": 0.5782693188947625, + "learning_rate": 1.656994269869847e-05, + "loss": 2.1094, + "step": 8052 + }, + { + "epoch": 0.62, + "grad_norm": 0.6357996098006783, + "learning_rate": 1.6564062113021814e-05, + "loss": 1.8721, + "step": 8053 + }, + { + "epoch": 0.62, + "grad_norm": 1.515303778245165, + "learning_rate": 1.655818205403542e-05, + "loss": 1.8825, + "step": 8054 + }, + { + "epoch": 0.62, + "grad_norm": 0.7160412712075651, + "learning_rate": 1.6552302522106405e-05, + "loss": 1.8827, + "step": 8055 + }, + { + "epoch": 0.62, + "grad_norm": 1.0602593173735444, + "learning_rate": 1.6546423517601838e-05, + "loss": 2.1012, + "step": 8056 + }, + { + "epoch": 0.62, + "grad_norm": 0.6502543486562072, + "learning_rate": 1.654054504088879e-05, + "loss": 1.8866, + "step": 8057 + }, + { + "epoch": 0.62, + "grad_norm": 0.593784157115339, + "learning_rate": 1.653466709233426e-05, + "loss": 2.0046, + "step": 8058 + }, + { + "epoch": 0.62, + "grad_norm": 0.6077538852885653, + "learning_rate": 1.652878967230524e-05, + "loss": 1.891, + "step": 8059 + }, + { + "epoch": 0.62, + "grad_norm": 0.5954354868094053, + "learning_rate": 1.6522912781168685e-05, + "loss": 2.1012, + "step": 8060 + }, + { + "epoch": 0.62, + "grad_norm": 0.5827105956302798, + "learning_rate": 1.651703641929151e-05, + "loss": 1.9216, + "step": 8061 + }, + { + "epoch": 0.62, + "grad_norm": 0.5644056136574299, + "learning_rate": 1.651116058704059e-05, + "loss": 1.9216, + "step": 8062 + }, + { + "epoch": 0.62, + "grad_norm": 0.6259181331048347, + "learning_rate": 1.6505285284782802e-05, + "loss": 1.8646, + "step": 8063 + }, + { + "epoch": 0.62, + "grad_norm": 0.5904613980319957, + "learning_rate": 1.6499410512884945e-05, + "loss": 2.1182, + "step": 8064 + }, + { + "epoch": 0.62, + "grad_norm": 0.5345021678364518, + "learning_rate": 1.6493536271713816e-05, + "loss": 1.9739, + "step": 8065 + }, + { + "epoch": 0.62, + "grad_norm": 0.6547393274403956, + "learning_rate": 1.6487662561636154e-05, + "loss": 1.9231, + "step": 8066 + }, + { + "epoch": 0.62, + "grad_norm": 0.55041248391652, + "learning_rate": 1.6481789383018682e-05, + "loss": 1.8416, + "step": 8067 + }, + { + "epoch": 0.62, + "grad_norm": 0.5587007572077188, + "learning_rate": 1.647591673622811e-05, + "loss": 2.0942, + "step": 8068 + }, + { + "epoch": 0.62, + "grad_norm": 0.6008870407817143, + "learning_rate": 1.6470044621631063e-05, + "loss": 1.8771, + "step": 8069 + }, + { + "epoch": 0.62, + "grad_norm": 0.5771818797147513, + "learning_rate": 1.6464173039594172e-05, + "loss": 1.8802, + "step": 8070 + }, + { + "epoch": 0.62, + "grad_norm": 0.5496455565021258, + "learning_rate": 1.6458301990484032e-05, + "loss": 1.9363, + "step": 8071 + }, + { + "epoch": 0.62, + "grad_norm": 0.5569270637929251, + "learning_rate": 1.645243147466718e-05, + "loss": 2.1303, + "step": 8072 + }, + { + "epoch": 0.62, + "grad_norm": 0.6290749331081951, + "learning_rate": 1.6446561492510156e-05, + "loss": 1.8487, + "step": 8073 + }, + { + "epoch": 0.62, + "grad_norm": 0.6659922722590701, + "learning_rate": 1.6440692044379425e-05, + "loss": 1.947, + "step": 8074 + }, + { + "epoch": 0.62, + "grad_norm": 0.6061690885773195, + "learning_rate": 1.643482313064146e-05, + "loss": 1.9335, + "step": 8075 + }, + { + "epoch": 0.62, + "grad_norm": 0.5302501742388732, + "learning_rate": 1.6428954751662674e-05, + "loss": 2.0805, + "step": 8076 + }, + { + "epoch": 0.62, + "grad_norm": 0.5643789972439974, + "learning_rate": 1.6423086907809442e-05, + "loss": 1.9611, + "step": 8077 + }, + { + "epoch": 0.62, + "grad_norm": 0.5978386274008384, + "learning_rate": 1.6417219599448142e-05, + "loss": 1.8761, + "step": 8078 + }, + { + "epoch": 0.62, + "grad_norm": 0.5626958470384708, + "learning_rate": 1.641135282694508e-05, + "loss": 1.8671, + "step": 8079 + }, + { + "epoch": 0.62, + "grad_norm": 0.6167363174344708, + "learning_rate": 1.6405486590666532e-05, + "loss": 2.0731, + "step": 8080 + }, + { + "epoch": 0.62, + "grad_norm": 0.5752784107257081, + "learning_rate": 1.6399620890978778e-05, + "loss": 1.9475, + "step": 8081 + }, + { + "epoch": 0.62, + "grad_norm": 0.5718197703786133, + "learning_rate": 1.6393755728248017e-05, + "loss": 1.8973, + "step": 8082 + }, + { + "epoch": 0.62, + "grad_norm": 0.6568533469105671, + "learning_rate": 1.6387891102840437e-05, + "loss": 1.9846, + "step": 8083 + }, + { + "epoch": 0.62, + "grad_norm": 0.5827023216585865, + "learning_rate": 1.6382027015122202e-05, + "loss": 1.8643, + "step": 8084 + }, + { + "epoch": 0.62, + "grad_norm": 0.581859477546589, + "learning_rate": 1.6376163465459417e-05, + "loss": 2.087, + "step": 8085 + }, + { + "epoch": 0.62, + "grad_norm": 0.6712733176972534, + "learning_rate": 1.6370300454218184e-05, + "loss": 1.8773, + "step": 8086 + }, + { + "epoch": 0.62, + "grad_norm": 0.5836764467520185, + "learning_rate": 1.6364437981764542e-05, + "loss": 1.8948, + "step": 8087 + }, + { + "epoch": 0.62, + "grad_norm": 0.6397438923478695, + "learning_rate": 1.6358576048464502e-05, + "loss": 2.0889, + "step": 8088 + }, + { + "epoch": 0.62, + "grad_norm": 0.5826641247811815, + "learning_rate": 1.6352714654684072e-05, + "loss": 1.9598, + "step": 8089 + }, + { + "epoch": 0.62, + "grad_norm": 0.5377883252599942, + "learning_rate": 1.6346853800789186e-05, + "loss": 1.8654, + "step": 8090 + }, + { + "epoch": 0.62, + "grad_norm": 0.627100768298841, + "learning_rate": 1.6340993487145774e-05, + "loss": 1.8855, + "step": 8091 + }, + { + "epoch": 0.62, + "grad_norm": 0.5961259797891036, + "learning_rate": 1.6335133714119694e-05, + "loss": 2.1214, + "step": 8092 + }, + { + "epoch": 0.62, + "grad_norm": 0.6048038036339286, + "learning_rate": 1.6329274482076817e-05, + "loss": 1.8785, + "step": 8093 + }, + { + "epoch": 0.62, + "grad_norm": 0.5718464391050702, + "learning_rate": 1.6323415791382965e-05, + "loss": 1.8645, + "step": 8094 + }, + { + "epoch": 0.62, + "grad_norm": 0.5973612566044231, + "learning_rate": 1.6317557642403896e-05, + "loss": 1.8553, + "step": 8095 + }, + { + "epoch": 0.62, + "grad_norm": 0.6668321467019985, + "learning_rate": 1.631170003550538e-05, + "loss": 1.9516, + "step": 8096 + }, + { + "epoch": 0.62, + "grad_norm": 0.6406095896078066, + "learning_rate": 1.6305842971053123e-05, + "loss": 2.0735, + "step": 8097 + }, + { + "epoch": 0.62, + "grad_norm": 0.5694496974267224, + "learning_rate": 1.6299986449412797e-05, + "loss": 1.8583, + "step": 8098 + }, + { + "epoch": 0.62, + "grad_norm": 0.6471903012742594, + "learning_rate": 1.629413047095007e-05, + "loss": 1.8983, + "step": 8099 + }, + { + "epoch": 0.62, + "grad_norm": 0.6770508875523102, + "learning_rate": 1.628827503603053e-05, + "loss": 2.0704, + "step": 8100 + }, + { + "epoch": 0.62, + "grad_norm": 0.6027164083039429, + "learning_rate": 1.628242014501978e-05, + "loss": 1.912, + "step": 8101 + }, + { + "epoch": 0.63, + "grad_norm": 0.6107296977646794, + "learning_rate": 1.627656579828335e-05, + "loss": 1.974, + "step": 8102 + }, + { + "epoch": 0.63, + "grad_norm": 0.6033916630560902, + "learning_rate": 1.6270711996186745e-05, + "loss": 1.9159, + "step": 8103 + }, + { + "epoch": 0.63, + "grad_norm": 0.6333899100718187, + "learning_rate": 1.6264858739095457e-05, + "loss": 2.0812, + "step": 8104 + }, + { + "epoch": 0.63, + "grad_norm": 0.6231071912438237, + "learning_rate": 1.625900602737493e-05, + "loss": 1.8665, + "step": 8105 + }, + { + "epoch": 0.63, + "grad_norm": 0.604826120684237, + "learning_rate": 1.6253153861390552e-05, + "loss": 1.83, + "step": 8106 + }, + { + "epoch": 0.63, + "grad_norm": 0.6174874650203618, + "learning_rate": 1.6247302241507723e-05, + "loss": 1.9156, + "step": 8107 + }, + { + "epoch": 0.63, + "grad_norm": 0.5875313358786617, + "learning_rate": 1.6241451168091767e-05, + "loss": 1.9797, + "step": 8108 + }, + { + "epoch": 0.63, + "grad_norm": 0.6266089327591563, + "learning_rate": 1.6235600641507993e-05, + "loss": 2.0616, + "step": 8109 + }, + { + "epoch": 0.63, + "grad_norm": 0.5832433585138151, + "learning_rate": 1.6229750662121682e-05, + "loss": 1.8802, + "step": 8110 + }, + { + "epoch": 0.63, + "grad_norm": 0.5934576083999693, + "learning_rate": 1.6223901230298062e-05, + "loss": 1.8223, + "step": 8111 + }, + { + "epoch": 0.63, + "grad_norm": 0.6013801694484762, + "learning_rate": 1.621805234640235e-05, + "loss": 2.0854, + "step": 8112 + }, + { + "epoch": 0.63, + "grad_norm": 0.588291008313574, + "learning_rate": 1.6212204010799692e-05, + "loss": 1.9156, + "step": 8113 + }, + { + "epoch": 0.63, + "grad_norm": 0.5721952548075647, + "learning_rate": 1.6206356223855247e-05, + "loss": 1.9885, + "step": 8114 + }, + { + "epoch": 0.63, + "grad_norm": 0.5699337598355331, + "learning_rate": 1.6200508985934113e-05, + "loss": 1.9044, + "step": 8115 + }, + { + "epoch": 0.63, + "grad_norm": 0.6409316333204232, + "learning_rate": 1.619466229740134e-05, + "loss": 1.8509, + "step": 8116 + }, + { + "epoch": 0.63, + "grad_norm": 0.6857036804747035, + "learning_rate": 1.6188816158621977e-05, + "loss": 2.0861, + "step": 8117 + }, + { + "epoch": 0.63, + "grad_norm": 0.5625723024370958, + "learning_rate": 1.6182970569961025e-05, + "loss": 1.9028, + "step": 8118 + }, + { + "epoch": 0.63, + "grad_norm": 0.6138287574014092, + "learning_rate": 1.617712553178344e-05, + "loss": 1.9157, + "step": 8119 + }, + { + "epoch": 0.63, + "grad_norm": 0.6368201486613727, + "learning_rate": 1.6171281044454153e-05, + "loss": 1.9289, + "step": 8120 + }, + { + "epoch": 0.63, + "grad_norm": 0.5930858523858696, + "learning_rate": 1.6165437108338052e-05, + "loss": 2.0664, + "step": 8121 + }, + { + "epoch": 0.63, + "grad_norm": 0.6615515391843885, + "learning_rate": 1.6159593723800012e-05, + "loss": 1.8469, + "step": 8122 + }, + { + "epoch": 0.63, + "grad_norm": 0.601377849741503, + "learning_rate": 1.615375089120486e-05, + "loss": 1.8893, + "step": 8123 + }, + { + "epoch": 0.63, + "grad_norm": 0.587868546604295, + "learning_rate": 1.6147908610917367e-05, + "loss": 2.0896, + "step": 8124 + }, + { + "epoch": 0.63, + "grad_norm": 0.6414396838159648, + "learning_rate": 1.6142066883302315e-05, + "loss": 1.8464, + "step": 8125 + }, + { + "epoch": 0.63, + "grad_norm": 0.6636118403606509, + "learning_rate": 1.613622570872441e-05, + "loss": 1.8806, + "step": 8126 + }, + { + "epoch": 0.63, + "grad_norm": 0.5315243631804367, + "learning_rate": 1.613038508754835e-05, + "loss": 1.8916, + "step": 8127 + }, + { + "epoch": 0.63, + "grad_norm": 0.5443498168180257, + "learning_rate": 1.6124545020138792e-05, + "loss": 1.8791, + "step": 8128 + }, + { + "epoch": 0.63, + "grad_norm": 0.6572416383230438, + "learning_rate": 1.6118705506860344e-05, + "loss": 2.0854, + "step": 8129 + }, + { + "epoch": 0.63, + "grad_norm": 0.6465249023691553, + "learning_rate": 1.611286654807759e-05, + "loss": 1.9097, + "step": 8130 + }, + { + "epoch": 0.63, + "grad_norm": 0.5706125345136264, + "learning_rate": 1.61070281441551e-05, + "loss": 1.8534, + "step": 8131 + }, + { + "epoch": 0.63, + "grad_norm": 0.65104707713467, + "learning_rate": 1.6101190295457364e-05, + "loss": 2.069, + "step": 8132 + }, + { + "epoch": 0.63, + "grad_norm": 0.6095387153789739, + "learning_rate": 1.6095353002348883e-05, + "loss": 1.9897, + "step": 8133 + }, + { + "epoch": 0.63, + "grad_norm": 0.6541979343828871, + "learning_rate": 1.6089516265194096e-05, + "loss": 1.8967, + "step": 8134 + }, + { + "epoch": 0.63, + "grad_norm": 0.7041542623288523, + "learning_rate": 1.6083680084357406e-05, + "loss": 1.9204, + "step": 8135 + }, + { + "epoch": 0.63, + "grad_norm": 0.6013131482303682, + "learning_rate": 1.6077844460203206e-05, + "loss": 2.0851, + "step": 8136 + }, + { + "epoch": 0.63, + "grad_norm": 0.5713963058728061, + "learning_rate": 1.6072009393095827e-05, + "loss": 1.8316, + "step": 8137 + }, + { + "epoch": 0.63, + "grad_norm": 0.7461458228105966, + "learning_rate": 1.6066174883399583e-05, + "loss": 1.8683, + "step": 8138 + }, + { + "epoch": 0.63, + "grad_norm": 0.5712805059533478, + "learning_rate": 1.606034093147873e-05, + "loss": 1.9879, + "step": 8139 + }, + { + "epoch": 0.63, + "grad_norm": 0.5743408072585097, + "learning_rate": 1.6054507537697524e-05, + "loss": 1.8603, + "step": 8140 + }, + { + "epoch": 0.63, + "grad_norm": 0.62309224352662, + "learning_rate": 1.6048674702420167e-05, + "loss": 2.0376, + "step": 8141 + }, + { + "epoch": 0.63, + "grad_norm": 0.6157498767052207, + "learning_rate": 1.6042842426010813e-05, + "loss": 1.9029, + "step": 8142 + }, + { + "epoch": 0.63, + "grad_norm": 0.581655074193218, + "learning_rate": 1.6037010708833606e-05, + "loss": 1.8947, + "step": 8143 + }, + { + "epoch": 0.63, + "grad_norm": 0.5974668525946777, + "learning_rate": 1.6031179551252646e-05, + "loss": 2.032, + "step": 8144 + }, + { + "epoch": 0.63, + "grad_norm": 0.5023700833854972, + "learning_rate": 1.602534895363198e-05, + "loss": 1.9623, + "step": 8145 + }, + { + "epoch": 0.63, + "grad_norm": 0.6143190152339365, + "learning_rate": 1.6019518916335662e-05, + "loss": 1.8728, + "step": 8146 + }, + { + "epoch": 0.63, + "grad_norm": 0.5465392302896779, + "learning_rate": 1.6013689439727663e-05, + "loss": 1.906, + "step": 8147 + }, + { + "epoch": 0.63, + "grad_norm": 0.6719044804724982, + "learning_rate": 1.600786052417194e-05, + "loss": 1.8757, + "step": 8148 + }, + { + "epoch": 0.63, + "grad_norm": 0.595666847333441, + "learning_rate": 1.6002032170032443e-05, + "loss": 2.1048, + "step": 8149 + }, + { + "epoch": 0.63, + "grad_norm": 0.7392415314386237, + "learning_rate": 1.599620437767303e-05, + "loss": 1.8648, + "step": 8150 + }, + { + "epoch": 0.63, + "grad_norm": 0.6580973388601695, + "learning_rate": 1.599037714745757e-05, + "loss": 1.9703, + "step": 8151 + }, + { + "epoch": 0.63, + "grad_norm": 0.6301201200346209, + "learning_rate": 1.5984550479749882e-05, + "loss": 1.8936, + "step": 8152 + }, + { + "epoch": 0.63, + "grad_norm": 0.7824068914292536, + "learning_rate": 1.5978724374913732e-05, + "loss": 2.0885, + "step": 8153 + }, + { + "epoch": 0.63, + "grad_norm": 0.5938557897911837, + "learning_rate": 1.5972898833312893e-05, + "loss": 1.9188, + "step": 8154 + }, + { + "epoch": 0.63, + "grad_norm": 0.5867135984751461, + "learning_rate": 1.5967073855311056e-05, + "loss": 1.8744, + "step": 8155 + }, + { + "epoch": 0.63, + "grad_norm": 0.5764174067017281, + "learning_rate": 1.5961249441271903e-05, + "loss": 2.098, + "step": 8156 + }, + { + "epoch": 0.63, + "grad_norm": 0.5878453083439222, + "learning_rate": 1.595542559155909e-05, + "loss": 1.8809, + "step": 8157 + }, + { + "epoch": 0.63, + "grad_norm": 0.6221777970606243, + "learning_rate": 1.594960230653621e-05, + "loss": 1.9561, + "step": 8158 + }, + { + "epoch": 0.63, + "grad_norm": 0.5714815620707111, + "learning_rate": 1.594377958656684e-05, + "loss": 1.8767, + "step": 8159 + }, + { + "epoch": 0.63, + "grad_norm": 0.6778263028093088, + "learning_rate": 1.5937957432014506e-05, + "loss": 1.9047, + "step": 8160 + }, + { + "epoch": 0.63, + "grad_norm": 0.5769145023959021, + "learning_rate": 1.593213584324272e-05, + "loss": 2.1239, + "step": 8161 + }, + { + "epoch": 0.63, + "grad_norm": 0.6434559364530071, + "learning_rate": 1.592631482061495e-05, + "loss": 1.8986, + "step": 8162 + }, + { + "epoch": 0.63, + "grad_norm": 0.6005564674170942, + "learning_rate": 1.5920494364494616e-05, + "loss": 1.883, + "step": 8163 + }, + { + "epoch": 0.63, + "grad_norm": 0.561925540620594, + "learning_rate": 1.5914674475245115e-05, + "loss": 1.9474, + "step": 8164 + }, + { + "epoch": 0.63, + "grad_norm": 0.6616754726054338, + "learning_rate": 1.590885515322982e-05, + "loss": 2.1058, + "step": 8165 + }, + { + "epoch": 0.63, + "grad_norm": 0.5455132285433694, + "learning_rate": 1.590303639881204e-05, + "loss": 1.8727, + "step": 8166 + }, + { + "epoch": 0.63, + "grad_norm": 0.6495259356825086, + "learning_rate": 1.5897218212355077e-05, + "loss": 1.8749, + "step": 8167 + }, + { + "epoch": 0.63, + "grad_norm": 0.6147887527625137, + "learning_rate": 1.5891400594222163e-05, + "loss": 1.8735, + "step": 8168 + }, + { + "epoch": 0.63, + "grad_norm": 0.6270307980884934, + "learning_rate": 1.5885583544776534e-05, + "loss": 2.118, + "step": 8169 + }, + { + "epoch": 0.63, + "grad_norm": 0.6925350570389632, + "learning_rate": 1.5879767064381375e-05, + "loss": 1.9491, + "step": 8170 + }, + { + "epoch": 0.63, + "grad_norm": 0.5710676017424163, + "learning_rate": 1.5873951153399808e-05, + "loss": 1.9204, + "step": 8171 + }, + { + "epoch": 0.63, + "grad_norm": 0.6574959194793875, + "learning_rate": 1.5868135812194977e-05, + "loss": 1.9109, + "step": 8172 + }, + { + "epoch": 0.63, + "grad_norm": 0.6029470086877811, + "learning_rate": 1.5862321041129934e-05, + "loss": 2.0745, + "step": 8173 + }, + { + "epoch": 0.63, + "grad_norm": 0.6605546681612915, + "learning_rate": 1.5856506840567724e-05, + "loss": 1.926, + "step": 8174 + }, + { + "epoch": 0.63, + "grad_norm": 0.631577726888011, + "learning_rate": 1.5850693210871364e-05, + "loss": 1.8794, + "step": 8175 + }, + { + "epoch": 0.63, + "grad_norm": 0.5567390926673188, + "learning_rate": 1.584488015240381e-05, + "loss": 2.0051, + "step": 8176 + }, + { + "epoch": 0.63, + "grad_norm": 0.6831978573502983, + "learning_rate": 1.583906766552799e-05, + "loss": 2.0304, + "step": 8177 + }, + { + "epoch": 0.63, + "grad_norm": 0.625262748015255, + "learning_rate": 1.5833255750606825e-05, + "loss": 1.8433, + "step": 8178 + }, + { + "epoch": 0.63, + "grad_norm": 0.7187073260242481, + "learning_rate": 1.582744440800315e-05, + "loss": 1.8865, + "step": 8179 + }, + { + "epoch": 0.63, + "grad_norm": 0.6115921162894893, + "learning_rate": 1.582163363807981e-05, + "loss": 1.8639, + "step": 8180 + }, + { + "epoch": 0.63, + "grad_norm": 0.746344698450898, + "learning_rate": 1.581582344119958e-05, + "loss": 2.1107, + "step": 8181 + }, + { + "epoch": 0.63, + "grad_norm": 0.6236604105758277, + "learning_rate": 1.5810013817725216e-05, + "loss": 1.9527, + "step": 8182 + }, + { + "epoch": 0.63, + "grad_norm": 0.6120799183592102, + "learning_rate": 1.580420476801946e-05, + "loss": 1.8962, + "step": 8183 + }, + { + "epoch": 0.63, + "grad_norm": 0.6785819149712855, + "learning_rate": 1.5798396292444966e-05, + "loss": 1.8985, + "step": 8184 + }, + { + "epoch": 0.63, + "grad_norm": 0.5700043150546789, + "learning_rate": 1.5792588391364393e-05, + "loss": 2.1238, + "step": 8185 + }, + { + "epoch": 0.63, + "grad_norm": 0.6551128092140062, + "learning_rate": 1.578678106514036e-05, + "loss": 1.8622, + "step": 8186 + }, + { + "epoch": 0.63, + "grad_norm": 0.5569663536734482, + "learning_rate": 1.578097431413543e-05, + "loss": 1.9006, + "step": 8187 + }, + { + "epoch": 0.63, + "grad_norm": 0.6273827823125815, + "learning_rate": 1.577516813871215e-05, + "loss": 2.1056, + "step": 8188 + }, + { + "epoch": 0.63, + "grad_norm": 0.5433925996740084, + "learning_rate": 1.5769362539233007e-05, + "loss": 1.9165, + "step": 8189 + }, + { + "epoch": 0.63, + "grad_norm": 0.5739637496134635, + "learning_rate": 1.576355751606049e-05, + "loss": 1.8591, + "step": 8190 + }, + { + "epoch": 0.63, + "grad_norm": 0.6371749102403651, + "learning_rate": 1.5757753069557022e-05, + "loss": 1.8659, + "step": 8191 + }, + { + "epoch": 0.63, + "grad_norm": 0.5614759520207058, + "learning_rate": 1.5751949200084993e-05, + "loss": 1.9139, + "step": 8192 + }, + { + "epoch": 0.63, + "grad_norm": 0.791254540914331, + "learning_rate": 1.5746145908006778e-05, + "loss": 2.0515, + "step": 8193 + }, + { + "epoch": 0.63, + "grad_norm": 0.5432546090652003, + "learning_rate": 1.574034319368468e-05, + "loss": 1.8471, + "step": 8194 + }, + { + "epoch": 0.63, + "grad_norm": 0.6580852910200371, + "learning_rate": 1.5734541057480993e-05, + "loss": 1.9622, + "step": 8195 + }, + { + "epoch": 0.63, + "grad_norm": 0.6109275097657062, + "learning_rate": 1.5728739499757984e-05, + "loss": 1.876, + "step": 8196 + }, + { + "epoch": 0.63, + "grad_norm": 0.7098363063708722, + "learning_rate": 1.5722938520877838e-05, + "loss": 2.0967, + "step": 8197 + }, + { + "epoch": 0.63, + "grad_norm": 0.7277659192718275, + "learning_rate": 1.5717138121202764e-05, + "loss": 1.9058, + "step": 8198 + }, + { + "epoch": 0.63, + "grad_norm": 0.5582410277991453, + "learning_rate": 1.571133830109489e-05, + "loss": 1.8916, + "step": 8199 + }, + { + "epoch": 0.63, + "grad_norm": 0.6894891811058498, + "learning_rate": 1.5705539060916323e-05, + "loss": 1.8803, + "step": 8200 + }, + { + "epoch": 0.63, + "grad_norm": 0.6723858026494186, + "learning_rate": 1.569974040102914e-05, + "loss": 2.0869, + "step": 8201 + }, + { + "epoch": 0.63, + "grad_norm": 0.65493513551692, + "learning_rate": 1.5693942321795366e-05, + "loss": 1.8294, + "step": 8202 + }, + { + "epoch": 0.63, + "grad_norm": 0.6075682452729689, + "learning_rate": 1.5688144823576996e-05, + "loss": 1.8741, + "step": 8203 + }, + { + "epoch": 0.63, + "grad_norm": 0.6680175207035346, + "learning_rate": 1.5682347906736013e-05, + "loss": 1.8766, + "step": 8204 + }, + { + "epoch": 0.63, + "grad_norm": 0.7719439429786313, + "learning_rate": 1.5676551571634317e-05, + "loss": 2.0715, + "step": 8205 + }, + { + "epoch": 0.63, + "grad_norm": 0.5527198752998035, + "learning_rate": 1.567075581863382e-05, + "loss": 1.8949, + "step": 8206 + }, + { + "epoch": 0.63, + "grad_norm": 0.7039805020857247, + "learning_rate": 1.566496064809635e-05, + "loss": 1.9128, + "step": 8207 + }, + { + "epoch": 0.63, + "grad_norm": 0.678894358340965, + "learning_rate": 1.565916606038374e-05, + "loss": 1.8901, + "step": 8208 + }, + { + "epoch": 0.63, + "grad_norm": 0.6226494381252005, + "learning_rate": 1.5653372055857772e-05, + "loss": 2.052, + "step": 8209 + }, + { + "epoch": 0.63, + "grad_norm": 0.6068881925983989, + "learning_rate": 1.564757863488017e-05, + "loss": 1.8918, + "step": 8210 + }, + { + "epoch": 0.63, + "grad_norm": 0.710772123518608, + "learning_rate": 1.5641785797812665e-05, + "loss": 1.8985, + "step": 8211 + }, + { + "epoch": 0.63, + "grad_norm": 0.5776936970601333, + "learning_rate": 1.563599354501692e-05, + "loss": 1.8813, + "step": 8212 + }, + { + "epoch": 0.63, + "grad_norm": 0.5847462085535985, + "learning_rate": 1.5630201876854556e-05, + "loss": 2.1456, + "step": 8213 + }, + { + "epoch": 0.63, + "grad_norm": 0.5717775815285082, + "learning_rate": 1.562441079368719e-05, + "loss": 1.8814, + "step": 8214 + }, + { + "epoch": 0.63, + "grad_norm": 0.6150031356894797, + "learning_rate": 1.5618620295876365e-05, + "loss": 1.8829, + "step": 8215 + }, + { + "epoch": 0.63, + "grad_norm": 0.5620292866264495, + "learning_rate": 1.561283038378362e-05, + "loss": 1.863, + "step": 8216 + }, + { + "epoch": 0.63, + "grad_norm": 0.5698428993535279, + "learning_rate": 1.5607041057770443e-05, + "loss": 2.1106, + "step": 8217 + }, + { + "epoch": 0.63, + "grad_norm": 0.6124337157572259, + "learning_rate": 1.560125231819827e-05, + "loss": 1.8827, + "step": 8218 + }, + { + "epoch": 0.63, + "grad_norm": 0.6624098066420331, + "learning_rate": 1.559546416542853e-05, + "loss": 1.8848, + "step": 8219 + }, + { + "epoch": 0.63, + "grad_norm": 0.5558559020972711, + "learning_rate": 1.55896765998226e-05, + "loss": 1.9914, + "step": 8220 + }, + { + "epoch": 0.63, + "grad_norm": 0.6771763274958177, + "learning_rate": 1.558388962174181e-05, + "loss": 2.1127, + "step": 8221 + }, + { + "epoch": 0.63, + "grad_norm": 0.7440466134590237, + "learning_rate": 1.5578103231547485e-05, + "loss": 1.8577, + "step": 8222 + }, + { + "epoch": 0.63, + "grad_norm": 0.5614016490479711, + "learning_rate": 1.5572317429600874e-05, + "loss": 1.8803, + "step": 8223 + }, + { + "epoch": 0.63, + "grad_norm": 0.5485893364419799, + "learning_rate": 1.5566532216263214e-05, + "loss": 1.8685, + "step": 8224 + }, + { + "epoch": 0.63, + "grad_norm": 0.6493762376982991, + "learning_rate": 1.5560747591895707e-05, + "loss": 2.0743, + "step": 8225 + }, + { + "epoch": 0.63, + "grad_norm": 0.5732961619871015, + "learning_rate": 1.5554963556859502e-05, + "loss": 1.9832, + "step": 8226 + }, + { + "epoch": 0.63, + "grad_norm": 0.5707537925842697, + "learning_rate": 1.5549180111515732e-05, + "loss": 1.8635, + "step": 8227 + }, + { + "epoch": 0.63, + "grad_norm": 0.7161740029301322, + "learning_rate": 1.5543397256225457e-05, + "loss": 1.9338, + "step": 8228 + }, + { + "epoch": 0.63, + "grad_norm": 0.5728558791292477, + "learning_rate": 1.553761499134974e-05, + "loss": 2.0471, + "step": 8229 + }, + { + "epoch": 0.63, + "grad_norm": 0.6120882533885048, + "learning_rate": 1.5531833317249607e-05, + "loss": 1.9354, + "step": 8230 + }, + { + "epoch": 0.64, + "grad_norm": 0.6077008716899439, + "learning_rate": 1.552605223428601e-05, + "loss": 1.8686, + "step": 8231 + }, + { + "epoch": 0.64, + "grad_norm": 0.5953917133863135, + "learning_rate": 1.5520271742819883e-05, + "loss": 1.9322, + "step": 8232 + }, + { + "epoch": 0.64, + "grad_norm": 0.7260728902014262, + "learning_rate": 1.551449184321215e-05, + "loss": 2.1019, + "step": 8233 + }, + { + "epoch": 0.64, + "grad_norm": 0.5898128424381098, + "learning_rate": 1.550871253582365e-05, + "loss": 1.8632, + "step": 8234 + }, + { + "epoch": 0.64, + "grad_norm": 0.635470300023864, + "learning_rate": 1.550293382101522e-05, + "loss": 1.9043, + "step": 8235 + }, + { + "epoch": 0.64, + "grad_norm": 0.6179457716260498, + "learning_rate": 1.549715569914764e-05, + "loss": 1.8717, + "step": 8236 + }, + { + "epoch": 0.64, + "grad_norm": 0.786198922971832, + "learning_rate": 1.5491378170581673e-05, + "loss": 2.0932, + "step": 8237 + }, + { + "epoch": 0.64, + "grad_norm": 0.5624426271090147, + "learning_rate": 1.5485601235678036e-05, + "loss": 1.9573, + "step": 8238 + }, + { + "epoch": 0.64, + "grad_norm": 0.6588177540776137, + "learning_rate": 1.547982489479739e-05, + "loss": 1.8455, + "step": 8239 + }, + { + "epoch": 0.64, + "grad_norm": 0.7217626135836811, + "learning_rate": 1.547404914830039e-05, + "loss": 1.9021, + "step": 8240 + }, + { + "epoch": 0.64, + "grad_norm": 0.6935998256831328, + "learning_rate": 1.5468273996547634e-05, + "loss": 2.092, + "step": 8241 + }, + { + "epoch": 0.64, + "grad_norm": 0.698241910157609, + "learning_rate": 1.5462499439899687e-05, + "loss": 1.8916, + "step": 8242 + }, + { + "epoch": 0.64, + "grad_norm": 0.6767138739625068, + "learning_rate": 1.5456725478717088e-05, + "loss": 1.8829, + "step": 8243 + }, + { + "epoch": 0.64, + "grad_norm": 0.7023304067465366, + "learning_rate": 1.5450952113360317e-05, + "loss": 1.8764, + "step": 8244 + }, + { + "epoch": 0.64, + "grad_norm": 0.6698611768945949, + "learning_rate": 1.544517934418983e-05, + "loss": 2.1403, + "step": 8245 + }, + { + "epoch": 0.64, + "grad_norm": 0.6979222954784412, + "learning_rate": 1.543940717156606e-05, + "loss": 1.8792, + "step": 8246 + }, + { + "epoch": 0.64, + "grad_norm": 0.6839476217949796, + "learning_rate": 1.5433635595849362e-05, + "loss": 1.846, + "step": 8247 + }, + { + "epoch": 0.64, + "grad_norm": 0.6525108767904783, + "learning_rate": 1.5427864617400106e-05, + "loss": 1.8906, + "step": 8248 + }, + { + "epoch": 0.64, + "grad_norm": 0.6955640084463295, + "learning_rate": 1.5422094236578582e-05, + "loss": 2.1157, + "step": 8249 + }, + { + "epoch": 0.64, + "grad_norm": 0.664098642127394, + "learning_rate": 1.5416324453745056e-05, + "loss": 1.8921, + "step": 8250 + }, + { + "epoch": 0.64, + "grad_norm": 0.7276079778300725, + "learning_rate": 1.5410555269259775e-05, + "loss": 1.9502, + "step": 8251 + }, + { + "epoch": 0.64, + "grad_norm": 0.696146237692106, + "learning_rate": 1.5404786683482917e-05, + "loss": 1.906, + "step": 8252 + }, + { + "epoch": 0.64, + "grad_norm": 0.7047891673446766, + "learning_rate": 1.539901869677464e-05, + "loss": 2.0576, + "step": 8253 + }, + { + "epoch": 0.64, + "grad_norm": 0.712452434085597, + "learning_rate": 1.5393251309495078e-05, + "loss": 1.8319, + "step": 8254 + }, + { + "epoch": 0.64, + "grad_norm": 0.6657790011896529, + "learning_rate": 1.53874845220043e-05, + "loss": 1.868, + "step": 8255 + }, + { + "epoch": 0.64, + "grad_norm": 0.632458669051041, + "learning_rate": 1.538171833466235e-05, + "loss": 1.8939, + "step": 8256 + }, + { + "epoch": 0.64, + "grad_norm": 0.875252718223018, + "learning_rate": 1.537595274782923e-05, + "loss": 2.1076, + "step": 8257 + }, + { + "epoch": 0.64, + "grad_norm": 0.668075048234358, + "learning_rate": 1.5370187761864925e-05, + "loss": 1.8784, + "step": 8258 + }, + { + "epoch": 0.64, + "grad_norm": 0.6593903000086498, + "learning_rate": 1.5364423377129362e-05, + "loss": 1.8844, + "step": 8259 + }, + { + "epoch": 0.64, + "grad_norm": 0.6841854400582955, + "learning_rate": 1.535865959398242e-05, + "loss": 1.8852, + "step": 8260 + }, + { + "epoch": 0.64, + "grad_norm": 0.8715785511613727, + "learning_rate": 1.535289641278398e-05, + "loss": 2.0431, + "step": 8261 + }, + { + "epoch": 0.64, + "grad_norm": 0.6125051896033447, + "learning_rate": 1.5347133833893834e-05, + "loss": 1.8947, + "step": 8262 + }, + { + "epoch": 0.64, + "grad_norm": 0.6869835391280711, + "learning_rate": 1.5341371857671782e-05, + "loss": 1.9404, + "step": 8263 + }, + { + "epoch": 0.64, + "grad_norm": 0.7505560144207256, + "learning_rate": 1.5335610484477574e-05, + "loss": 1.8743, + "step": 8264 + }, + { + "epoch": 0.64, + "grad_norm": 0.5855600688663578, + "learning_rate": 1.5329849714670894e-05, + "loss": 2.1, + "step": 8265 + }, + { + "epoch": 0.64, + "grad_norm": 0.5937593652213607, + "learning_rate": 1.5324089548611427e-05, + "loss": 1.8643, + "step": 8266 + }, + { + "epoch": 0.64, + "grad_norm": 0.6755362418864037, + "learning_rate": 1.53183299866588e-05, + "loss": 1.855, + "step": 8267 + }, + { + "epoch": 0.64, + "grad_norm": 0.6361405546204497, + "learning_rate": 1.53125710291726e-05, + "loss": 1.858, + "step": 8268 + }, + { + "epoch": 0.64, + "grad_norm": 0.6494852668769009, + "learning_rate": 1.5306812676512398e-05, + "loss": 2.0751, + "step": 8269 + }, + { + "epoch": 0.64, + "grad_norm": 0.6303017898280341, + "learning_rate": 1.5301054929037696e-05, + "loss": 1.8737, + "step": 8270 + }, + { + "epoch": 0.64, + "grad_norm": 0.6548025075988254, + "learning_rate": 1.529529778710797e-05, + "loss": 1.8957, + "step": 8271 + }, + { + "epoch": 0.64, + "grad_norm": 0.6117351221638625, + "learning_rate": 1.5289541251082685e-05, + "loss": 1.9119, + "step": 8272 + }, + { + "epoch": 0.64, + "grad_norm": 0.7217486094577534, + "learning_rate": 1.5283785321321227e-05, + "loss": 2.0763, + "step": 8273 + }, + { + "epoch": 0.64, + "grad_norm": 0.6044672171650762, + "learning_rate": 1.527802999818297e-05, + "loss": 1.9282, + "step": 8274 + }, + { + "epoch": 0.64, + "grad_norm": 0.6058762495678016, + "learning_rate": 1.5272275282027227e-05, + "loss": 1.8802, + "step": 8275 + }, + { + "epoch": 0.64, + "grad_norm": 0.6435101311129808, + "learning_rate": 1.5266521173213306e-05, + "loss": 1.9759, + "step": 8276 + }, + { + "epoch": 0.64, + "grad_norm": 0.5998722890330975, + "learning_rate": 1.526076767210046e-05, + "loss": 2.1124, + "step": 8277 + }, + { + "epoch": 0.64, + "grad_norm": 0.6601584134614529, + "learning_rate": 1.5255014779047888e-05, + "loss": 1.8473, + "step": 8278 + }, + { + "epoch": 0.64, + "grad_norm": 0.6360509053621417, + "learning_rate": 1.5249262494414779e-05, + "loss": 1.8821, + "step": 8279 + }, + { + "epoch": 0.64, + "grad_norm": 0.5754572570256475, + "learning_rate": 1.5243510818560278e-05, + "loss": 1.9129, + "step": 8280 + }, + { + "epoch": 0.64, + "grad_norm": 0.7073060019844518, + "learning_rate": 1.5237759751843472e-05, + "loss": 2.1283, + "step": 8281 + }, + { + "epoch": 0.64, + "grad_norm": 0.5409692528019727, + "learning_rate": 1.5232009294623433e-05, + "loss": 1.9735, + "step": 8282 + }, + { + "epoch": 0.64, + "grad_norm": 0.5741485297007035, + "learning_rate": 1.5226259447259173e-05, + "loss": 1.8682, + "step": 8283 + }, + { + "epoch": 0.64, + "grad_norm": 0.559161722635525, + "learning_rate": 1.5220510210109692e-05, + "loss": 1.8501, + "step": 8284 + }, + { + "epoch": 0.64, + "grad_norm": 0.5986846022673626, + "learning_rate": 1.521476158353394e-05, + "loss": 2.1265, + "step": 8285 + }, + { + "epoch": 0.64, + "grad_norm": 0.6098624342768384, + "learning_rate": 1.5209013567890809e-05, + "loss": 1.8849, + "step": 8286 + }, + { + "epoch": 0.64, + "grad_norm": 0.5847258215495189, + "learning_rate": 1.5203266163539195e-05, + "loss": 1.8308, + "step": 8287 + }, + { + "epoch": 0.64, + "grad_norm": 0.5826882362576981, + "learning_rate": 1.5197519370837914e-05, + "loss": 1.9887, + "step": 8288 + }, + { + "epoch": 0.64, + "grad_norm": 0.6131710668401387, + "learning_rate": 1.5191773190145765e-05, + "loss": 2.0917, + "step": 8289 + }, + { + "epoch": 0.64, + "grad_norm": 0.6737468307936287, + "learning_rate": 1.5186027621821519e-05, + "loss": 1.9153, + "step": 8290 + }, + { + "epoch": 0.64, + "grad_norm": 0.5850376527198209, + "learning_rate": 1.5180282666223876e-05, + "loss": 1.8875, + "step": 8291 + }, + { + "epoch": 0.64, + "grad_norm": 0.6631375221770343, + "learning_rate": 1.5174538323711526e-05, + "loss": 1.9183, + "step": 8292 + }, + { + "epoch": 0.64, + "grad_norm": 0.5964725218158218, + "learning_rate": 1.516879459464312e-05, + "loss": 2.1034, + "step": 8293 + }, + { + "epoch": 0.64, + "grad_norm": 0.5944332287387949, + "learning_rate": 1.516305147937725e-05, + "loss": 1.9352, + "step": 8294 + }, + { + "epoch": 0.64, + "grad_norm": 0.6335496563320524, + "learning_rate": 1.5157308978272491e-05, + "loss": 1.8337, + "step": 8295 + }, + { + "epoch": 0.64, + "grad_norm": 0.6109860443396689, + "learning_rate": 1.5151567091687366e-05, + "loss": 1.8912, + "step": 8296 + }, + { + "epoch": 0.64, + "grad_norm": 0.6440179883257581, + "learning_rate": 1.5145825819980358e-05, + "loss": 2.1178, + "step": 8297 + }, + { + "epoch": 0.64, + "grad_norm": 0.6459999153058461, + "learning_rate": 1.5140085163509937e-05, + "loss": 1.8803, + "step": 8298 + }, + { + "epoch": 0.64, + "grad_norm": 0.721322398752142, + "learning_rate": 1.5134345122634502e-05, + "loss": 1.8711, + "step": 8299 + }, + { + "epoch": 0.64, + "grad_norm": 0.7338500907295465, + "learning_rate": 1.5128605697712423e-05, + "loss": 1.9667, + "step": 8300 + }, + { + "epoch": 0.64, + "grad_norm": 0.58298275120827, + "learning_rate": 1.5122866889102056e-05, + "loss": 1.892, + "step": 8301 + }, + { + "epoch": 0.64, + "grad_norm": 0.6829896328795524, + "learning_rate": 1.5117128697161675e-05, + "loss": 2.1107, + "step": 8302 + }, + { + "epoch": 0.64, + "grad_norm": 0.6022251087855807, + "learning_rate": 1.5111391122249558e-05, + "loss": 1.8814, + "step": 8303 + }, + { + "epoch": 0.64, + "grad_norm": 0.6012433719777383, + "learning_rate": 1.5105654164723906e-05, + "loss": 1.9031, + "step": 8304 + }, + { + "epoch": 0.64, + "grad_norm": 0.7934396214992626, + "learning_rate": 1.5099917824942917e-05, + "loss": 2.1218, + "step": 8305 + }, + { + "epoch": 0.64, + "grad_norm": 0.5628738128644897, + "learning_rate": 1.5094182103264734e-05, + "loss": 1.9238, + "step": 8306 + }, + { + "epoch": 0.64, + "grad_norm": 0.6208050780137758, + "learning_rate": 1.5088447000047445e-05, + "loss": 1.9961, + "step": 8307 + }, + { + "epoch": 0.64, + "grad_norm": 0.678462383842667, + "learning_rate": 1.5082712515649138e-05, + "loss": 1.883, + "step": 8308 + }, + { + "epoch": 0.64, + "grad_norm": 0.5734550053619724, + "learning_rate": 1.5076978650427826e-05, + "loss": 2.0707, + "step": 8309 + }, + { + "epoch": 0.64, + "grad_norm": 0.6609625791081309, + "learning_rate": 1.5071245404741496e-05, + "loss": 1.8878, + "step": 8310 + }, + { + "epoch": 0.64, + "grad_norm": 0.5957946995340899, + "learning_rate": 1.5065512778948113e-05, + "loss": 1.931, + "step": 8311 + }, + { + "epoch": 0.64, + "grad_norm": 0.5922935311235172, + "learning_rate": 1.5059780773405569e-05, + "loss": 1.9025, + "step": 8312 + }, + { + "epoch": 0.64, + "grad_norm": 0.6260024107634815, + "learning_rate": 1.5054049388471752e-05, + "loss": 1.924, + "step": 8313 + }, + { + "epoch": 0.64, + "grad_norm": 0.6567235781613207, + "learning_rate": 1.5048318624504499e-05, + "loss": 2.103, + "step": 8314 + }, + { + "epoch": 0.64, + "grad_norm": 0.6422142457679305, + "learning_rate": 1.504258848186158e-05, + "loss": 1.9157, + "step": 8315 + }, + { + "epoch": 0.64, + "grad_norm": 0.6090879090397989, + "learning_rate": 1.5036858960900784e-05, + "loss": 1.9059, + "step": 8316 + }, + { + "epoch": 0.64, + "grad_norm": 0.5860529407594672, + "learning_rate": 1.5031130061979804e-05, + "loss": 2.0723, + "step": 8317 + }, + { + "epoch": 0.64, + "grad_norm": 0.6070556915618565, + "learning_rate": 1.5025401785456323e-05, + "loss": 1.892, + "step": 8318 + }, + { + "epoch": 0.64, + "grad_norm": 0.5812505114581853, + "learning_rate": 1.5019674131687997e-05, + "loss": 1.9216, + "step": 8319 + }, + { + "epoch": 0.64, + "grad_norm": 0.5483005522117286, + "learning_rate": 1.501394710103241e-05, + "loss": 1.9201, + "step": 8320 + }, + { + "epoch": 0.64, + "grad_norm": 0.6217673454563182, + "learning_rate": 1.5008220693847133e-05, + "loss": 2.065, + "step": 8321 + }, + { + "epoch": 0.64, + "grad_norm": 0.5785405452553793, + "learning_rate": 1.5002494910489675e-05, + "loss": 1.9242, + "step": 8322 + }, + { + "epoch": 0.64, + "grad_norm": 0.5759303158367849, + "learning_rate": 1.4996769751317535e-05, + "loss": 1.888, + "step": 8323 + }, + { + "epoch": 0.64, + "grad_norm": 0.6392581200989591, + "learning_rate": 1.4991045216688162e-05, + "loss": 1.9223, + "step": 8324 + }, + { + "epoch": 0.64, + "grad_norm": 0.5625454460311049, + "learning_rate": 1.4985321306958944e-05, + "loss": 2.0063, + "step": 8325 + }, + { + "epoch": 0.64, + "grad_norm": 0.6748058489025044, + "learning_rate": 1.4979598022487254e-05, + "loss": 2.11, + "step": 8326 + }, + { + "epoch": 0.64, + "grad_norm": 0.5538524199096411, + "learning_rate": 1.4973875363630435e-05, + "loss": 1.8799, + "step": 8327 + }, + { + "epoch": 0.64, + "grad_norm": 0.6342147857005462, + "learning_rate": 1.4968153330745766e-05, + "loss": 1.854, + "step": 8328 + }, + { + "epoch": 0.64, + "grad_norm": 0.6144464139240059, + "learning_rate": 1.4962431924190495e-05, + "loss": 2.0567, + "step": 8329 + }, + { + "epoch": 0.64, + "grad_norm": 0.6177040436793608, + "learning_rate": 1.4956711144321828e-05, + "loss": 1.8772, + "step": 8330 + }, + { + "epoch": 0.64, + "grad_norm": 0.5697342071879058, + "learning_rate": 1.4950990991496943e-05, + "loss": 1.9566, + "step": 8331 + }, + { + "epoch": 0.64, + "grad_norm": 0.5800006235252373, + "learning_rate": 1.4945271466072986e-05, + "loss": 1.9476, + "step": 8332 + }, + { + "epoch": 0.64, + "grad_norm": 0.6809219626952162, + "learning_rate": 1.4939552568407023e-05, + "loss": 1.8311, + "step": 8333 + }, + { + "epoch": 0.64, + "grad_norm": 0.6479902340806296, + "learning_rate": 1.4933834298856125e-05, + "loss": 2.125, + "step": 8334 + }, + { + "epoch": 0.64, + "grad_norm": 0.6766866714776264, + "learning_rate": 1.4928116657777314e-05, + "loss": 1.8615, + "step": 8335 + }, + { + "epoch": 0.64, + "grad_norm": 0.624355159845173, + "learning_rate": 1.4922399645527541e-05, + "loss": 1.8756, + "step": 8336 + }, + { + "epoch": 0.64, + "grad_norm": 0.6115643295686684, + "learning_rate": 1.4916683262463771e-05, + "loss": 2.0561, + "step": 8337 + }, + { + "epoch": 0.64, + "grad_norm": 0.6002869454987185, + "learning_rate": 1.4910967508942885e-05, + "loss": 1.9247, + "step": 8338 + }, + { + "epoch": 0.64, + "grad_norm": 0.5905011575551776, + "learning_rate": 1.4905252385321738e-05, + "loss": 1.8617, + "step": 8339 + }, + { + "epoch": 0.64, + "grad_norm": 0.665978707803345, + "learning_rate": 1.4899537891957166e-05, + "loss": 1.8919, + "step": 8340 + }, + { + "epoch": 0.64, + "grad_norm": 0.5565799305527607, + "learning_rate": 1.4893824029205927e-05, + "loss": 2.064, + "step": 8341 + }, + { + "epoch": 0.64, + "grad_norm": 0.5470586564642844, + "learning_rate": 1.4888110797424782e-05, + "loss": 1.8414, + "step": 8342 + }, + { + "epoch": 0.64, + "grad_norm": 0.5678370175685654, + "learning_rate": 1.488239819697041e-05, + "loss": 1.927, + "step": 8343 + }, + { + "epoch": 0.64, + "grad_norm": 0.548526596206015, + "learning_rate": 1.4876686228199476e-05, + "loss": 1.9146, + "step": 8344 + }, + { + "epoch": 0.64, + "grad_norm": 0.5649908976600285, + "learning_rate": 1.4870974891468625e-05, + "loss": 1.9099, + "step": 8345 + }, + { + "epoch": 0.64, + "grad_norm": 0.6362717066922087, + "learning_rate": 1.4865264187134418e-05, + "loss": 2.0419, + "step": 8346 + }, + { + "epoch": 0.64, + "grad_norm": 0.5799718748809531, + "learning_rate": 1.4859554115553392e-05, + "loss": 1.9097, + "step": 8347 + }, + { + "epoch": 0.64, + "grad_norm": 0.7240228413988585, + "learning_rate": 1.4853844677082077e-05, + "loss": 1.9195, + "step": 8348 + }, + { + "epoch": 0.64, + "grad_norm": 0.6464349700166891, + "learning_rate": 1.4848135872076913e-05, + "loss": 2.0589, + "step": 8349 + }, + { + "epoch": 0.64, + "grad_norm": 0.6094590967829089, + "learning_rate": 1.4842427700894335e-05, + "loss": 1.9454, + "step": 8350 + }, + { + "epoch": 0.64, + "grad_norm": 0.692654192896129, + "learning_rate": 1.4836720163890714e-05, + "loss": 1.8715, + "step": 8351 + }, + { + "epoch": 0.64, + "grad_norm": 0.5742143711666354, + "learning_rate": 1.483101326142241e-05, + "loss": 1.9249, + "step": 8352 + }, + { + "epoch": 0.64, + "grad_norm": 0.7213744787539169, + "learning_rate": 1.4825306993845727e-05, + "loss": 2.0971, + "step": 8353 + }, + { + "epoch": 0.64, + "grad_norm": 0.5890199882972156, + "learning_rate": 1.4819601361516922e-05, + "loss": 1.8812, + "step": 8354 + }, + { + "epoch": 0.64, + "grad_norm": 0.5900389314356714, + "learning_rate": 1.4813896364792231e-05, + "loss": 1.861, + "step": 8355 + }, + { + "epoch": 0.64, + "grad_norm": 0.6403901725483335, + "learning_rate": 1.4808192004027833e-05, + "loss": 1.959, + "step": 8356 + }, + { + "epoch": 0.64, + "grad_norm": 0.5522444832337519, + "learning_rate": 1.4802488279579873e-05, + "loss": 1.9121, + "step": 8357 + }, + { + "epoch": 0.64, + "grad_norm": 0.6040347173582511, + "learning_rate": 1.479678519180447e-05, + "loss": 2.09, + "step": 8358 + }, + { + "epoch": 0.64, + "grad_norm": 0.5918151870480092, + "learning_rate": 1.4791082741057674e-05, + "loss": 1.9118, + "step": 8359 + }, + { + "epoch": 0.64, + "grad_norm": 0.6178034941637252, + "learning_rate": 1.4785380927695525e-05, + "loss": 1.8849, + "step": 8360 + }, + { + "epoch": 0.65, + "grad_norm": 0.5923204492687153, + "learning_rate": 1.4779679752074016e-05, + "loss": 2.1176, + "step": 8361 + }, + { + "epoch": 0.65, + "grad_norm": 0.5937402765545233, + "learning_rate": 1.4773979214549075e-05, + "loss": 1.9154, + "step": 8362 + }, + { + "epoch": 0.65, + "grad_norm": 0.5496306359528972, + "learning_rate": 1.476827931547663e-05, + "loss": 1.8407, + "step": 8363 + }, + { + "epoch": 0.65, + "grad_norm": 0.6835988362926855, + "learning_rate": 1.4762580055212535e-05, + "loss": 1.8992, + "step": 8364 + }, + { + "epoch": 0.65, + "grad_norm": 0.6618641400451147, + "learning_rate": 1.4756881434112621e-05, + "loss": 1.876, + "step": 8365 + }, + { + "epoch": 0.65, + "grad_norm": 0.558850417452291, + "learning_rate": 1.4751183452532685e-05, + "loss": 2.123, + "step": 8366 + }, + { + "epoch": 0.65, + "grad_norm": 0.6686298556803639, + "learning_rate": 1.4745486110828469e-05, + "loss": 1.9513, + "step": 8367 + }, + { + "epoch": 0.65, + "grad_norm": 0.6551899067909253, + "learning_rate": 1.4739789409355676e-05, + "loss": 1.9246, + "step": 8368 + }, + { + "epoch": 0.65, + "grad_norm": 0.632266763181478, + "learning_rate": 1.4734093348469986e-05, + "loss": 1.9525, + "step": 8369 + }, + { + "epoch": 0.65, + "grad_norm": 0.6531190688063936, + "learning_rate": 1.4728397928527022e-05, + "loss": 2.1023, + "step": 8370 + }, + { + "epoch": 0.65, + "grad_norm": 0.5759786680048378, + "learning_rate": 1.4722703149882372e-05, + "loss": 1.9022, + "step": 8371 + }, + { + "epoch": 0.65, + "grad_norm": 0.6418757468144191, + "learning_rate": 1.4717009012891575e-05, + "loss": 1.8912, + "step": 8372 + }, + { + "epoch": 0.65, + "grad_norm": 0.6208691702859511, + "learning_rate": 1.4711315517910158e-05, + "loss": 2.1196, + "step": 8373 + }, + { + "epoch": 0.65, + "grad_norm": 0.6593373464025568, + "learning_rate": 1.4705622665293584e-05, + "loss": 1.8869, + "step": 8374 + }, + { + "epoch": 0.65, + "grad_norm": 0.6248539678183344, + "learning_rate": 1.4699930455397266e-05, + "loss": 1.9512, + "step": 8375 + }, + { + "epoch": 0.65, + "grad_norm": 0.6262350862361253, + "learning_rate": 1.4694238888576611e-05, + "loss": 1.9118, + "step": 8376 + }, + { + "epoch": 0.65, + "grad_norm": 0.7035650364266526, + "learning_rate": 1.4688547965186956e-05, + "loss": 1.8821, + "step": 8377 + }, + { + "epoch": 0.65, + "grad_norm": 0.6402756077840743, + "learning_rate": 1.4682857685583613e-05, + "loss": 2.133, + "step": 8378 + }, + { + "epoch": 0.65, + "grad_norm": 0.757344491442185, + "learning_rate": 1.4677168050121853e-05, + "loss": 1.8662, + "step": 8379 + }, + { + "epoch": 0.65, + "grad_norm": 0.6612969382278916, + "learning_rate": 1.467147905915689e-05, + "loss": 1.8899, + "step": 8380 + }, + { + "epoch": 0.65, + "grad_norm": 0.5714825887140408, + "learning_rate": 1.4665790713043923e-05, + "loss": 1.975, + "step": 8381 + }, + { + "epoch": 0.65, + "grad_norm": 0.788005066422033, + "learning_rate": 1.4660103012138101e-05, + "loss": 2.1076, + "step": 8382 + }, + { + "epoch": 0.65, + "grad_norm": 0.5671699072782598, + "learning_rate": 1.4654415956794518e-05, + "loss": 1.9042, + "step": 8383 + }, + { + "epoch": 0.65, + "grad_norm": 0.7448664976130364, + "learning_rate": 1.4648729547368256e-05, + "loss": 1.8882, + "step": 8384 + }, + { + "epoch": 0.65, + "grad_norm": 0.5978554749946987, + "learning_rate": 1.4643043784214327e-05, + "loss": 1.9024, + "step": 8385 + }, + { + "epoch": 0.65, + "grad_norm": 0.6162028055246319, + "learning_rate": 1.4637358667687717e-05, + "loss": 2.1219, + "step": 8386 + }, + { + "epoch": 0.65, + "grad_norm": 0.6358085826389558, + "learning_rate": 1.4631674198143386e-05, + "loss": 1.9215, + "step": 8387 + }, + { + "epoch": 0.65, + "grad_norm": 0.6253095673043599, + "learning_rate": 1.4625990375936227e-05, + "loss": 1.8879, + "step": 8388 + }, + { + "epoch": 0.65, + "grad_norm": 0.5891575975168217, + "learning_rate": 1.4620307201421108e-05, + "loss": 1.8759, + "step": 8389 + }, + { + "epoch": 0.65, + "grad_norm": 0.5844824307884063, + "learning_rate": 1.4614624674952842e-05, + "loss": 2.0822, + "step": 8390 + }, + { + "epoch": 0.65, + "grad_norm": 0.6076718295782593, + "learning_rate": 1.4608942796886222e-05, + "loss": 1.8833, + "step": 8391 + }, + { + "epoch": 0.65, + "grad_norm": 0.610044044050712, + "learning_rate": 1.4603261567576005e-05, + "loss": 1.9266, + "step": 8392 + }, + { + "epoch": 0.65, + "grad_norm": 0.6041904689968519, + "learning_rate": 1.4597580987376867e-05, + "loss": 2.0367, + "step": 8393 + }, + { + "epoch": 0.65, + "grad_norm": 0.5486691567567659, + "learning_rate": 1.4591901056643481e-05, + "loss": 1.9878, + "step": 8394 + }, + { + "epoch": 0.65, + "grad_norm": 0.60939536967728, + "learning_rate": 1.4586221775730483e-05, + "loss": 1.8734, + "step": 8395 + }, + { + "epoch": 0.65, + "grad_norm": 0.7025366799675828, + "learning_rate": 1.458054314499242e-05, + "loss": 1.9182, + "step": 8396 + }, + { + "epoch": 0.65, + "grad_norm": 0.5760048945283269, + "learning_rate": 1.4574865164783874e-05, + "loss": 1.9016, + "step": 8397 + }, + { + "epoch": 0.65, + "grad_norm": 0.6066221588001964, + "learning_rate": 1.4569187835459313e-05, + "loss": 2.093, + "step": 8398 + }, + { + "epoch": 0.65, + "grad_norm": 0.6190251384747406, + "learning_rate": 1.4563511157373194e-05, + "loss": 1.8641, + "step": 8399 + }, + { + "epoch": 0.65, + "grad_norm": 0.5231950066328885, + "learning_rate": 1.455783513087997e-05, + "loss": 1.9563, + "step": 8400 + }, + { + "epoch": 0.65, + "grad_norm": 0.6522367159585565, + "learning_rate": 1.4552159756333988e-05, + "loss": 1.8752, + "step": 8401 + }, + { + "epoch": 0.65, + "grad_norm": 0.6378421827321957, + "learning_rate": 1.4546485034089582e-05, + "loss": 2.0896, + "step": 8402 + }, + { + "epoch": 0.65, + "grad_norm": 0.5467172471794276, + "learning_rate": 1.4540810964501078e-05, + "loss": 1.9024, + "step": 8403 + }, + { + "epoch": 0.65, + "grad_norm": 0.6147354972623675, + "learning_rate": 1.4535137547922705e-05, + "loss": 1.8624, + "step": 8404 + }, + { + "epoch": 0.65, + "grad_norm": 0.6367802605389595, + "learning_rate": 1.4529464784708685e-05, + "loss": 2.1177, + "step": 8405 + }, + { + "epoch": 0.65, + "grad_norm": 0.6344875033579794, + "learning_rate": 1.4523792675213191e-05, + "loss": 1.9227, + "step": 8406 + }, + { + "epoch": 0.65, + "grad_norm": 0.5676284131238892, + "learning_rate": 1.4518121219790349e-05, + "loss": 1.8619, + "step": 8407 + }, + { + "epoch": 0.65, + "grad_norm": 0.6291751101252852, + "learning_rate": 1.4512450418794279e-05, + "loss": 1.8821, + "step": 8408 + }, + { + "epoch": 0.65, + "grad_norm": 0.6382279217446245, + "learning_rate": 1.4506780272579004e-05, + "loss": 1.9027, + "step": 8409 + }, + { + "epoch": 0.65, + "grad_norm": 0.5791729965552005, + "learning_rate": 1.4501110781498545e-05, + "loss": 2.0835, + "step": 8410 + }, + { + "epoch": 0.65, + "grad_norm": 0.5808640192402451, + "learning_rate": 1.4495441945906868e-05, + "loss": 1.9074, + "step": 8411 + }, + { + "epoch": 0.65, + "grad_norm": 0.7144172097834264, + "learning_rate": 1.4489773766157905e-05, + "loss": 1.9677, + "step": 8412 + }, + { + "epoch": 0.65, + "grad_norm": 0.5637800667798738, + "learning_rate": 1.4484106242605544e-05, + "loss": 1.8858, + "step": 8413 + }, + { + "epoch": 0.65, + "grad_norm": 0.5957657132017352, + "learning_rate": 1.4478439375603628e-05, + "loss": 2.0952, + "step": 8414 + }, + { + "epoch": 0.65, + "grad_norm": 0.5997853878901057, + "learning_rate": 1.4472773165505965e-05, + "loss": 1.9064, + "step": 8415 + }, + { + "epoch": 0.65, + "grad_norm": 0.5553566637781663, + "learning_rate": 1.4467107612666325e-05, + "loss": 1.8395, + "step": 8416 + }, + { + "epoch": 0.65, + "grad_norm": 0.5707236780590035, + "learning_rate": 1.4461442717438423e-05, + "loss": 1.8524, + "step": 8417 + }, + { + "epoch": 0.65, + "grad_norm": 0.5481949855272942, + "learning_rate": 1.4455778480175947e-05, + "loss": 2.1056, + "step": 8418 + }, + { + "epoch": 0.65, + "grad_norm": 0.6026943304665449, + "learning_rate": 1.4450114901232536e-05, + "loss": 1.933, + "step": 8419 + }, + { + "epoch": 0.65, + "grad_norm": 0.6367322090836317, + "learning_rate": 1.4444451980961793e-05, + "loss": 1.9229, + "step": 8420 + }, + { + "epoch": 0.65, + "grad_norm": 0.5822269286977031, + "learning_rate": 1.4438789719717278e-05, + "loss": 1.8309, + "step": 8421 + }, + { + "epoch": 0.65, + "grad_norm": 0.5598154256625976, + "learning_rate": 1.4433128117852504e-05, + "loss": 2.0662, + "step": 8422 + }, + { + "epoch": 0.65, + "grad_norm": 0.6089580740826536, + "learning_rate": 1.4427467175720963e-05, + "loss": 1.9101, + "step": 8423 + }, + { + "epoch": 0.65, + "grad_norm": 0.6353665177947495, + "learning_rate": 1.442180689367606e-05, + "loss": 1.8802, + "step": 8424 + }, + { + "epoch": 0.65, + "grad_norm": 0.5852755869134891, + "learning_rate": 1.4416147272071217e-05, + "loss": 1.9708, + "step": 8425 + }, + { + "epoch": 0.65, + "grad_norm": 0.5779986415234073, + "learning_rate": 1.4410488311259779e-05, + "loss": 2.046, + "step": 8426 + }, + { + "epoch": 0.65, + "grad_norm": 0.6119835022295036, + "learning_rate": 1.4404830011595061e-05, + "loss": 1.8668, + "step": 8427 + }, + { + "epoch": 0.65, + "grad_norm": 0.6115490455169098, + "learning_rate": 1.439917237343033e-05, + "loss": 1.9312, + "step": 8428 + }, + { + "epoch": 0.65, + "grad_norm": 0.5523500629186943, + "learning_rate": 1.4393515397118817e-05, + "loss": 1.8471, + "step": 8429 + }, + { + "epoch": 0.65, + "grad_norm": 0.6853963367796468, + "learning_rate": 1.438785908301371e-05, + "loss": 2.0839, + "step": 8430 + }, + { + "epoch": 0.65, + "grad_norm": 0.5360391308820578, + "learning_rate": 1.4382203431468166e-05, + "loss": 1.946, + "step": 8431 + }, + { + "epoch": 0.65, + "grad_norm": 0.5828613091107977, + "learning_rate": 1.4376548442835264e-05, + "loss": 1.8952, + "step": 8432 + }, + { + "epoch": 0.65, + "grad_norm": 0.6538969990510641, + "learning_rate": 1.4370894117468093e-05, + "loss": 1.8855, + "step": 8433 + }, + { + "epoch": 0.65, + "grad_norm": 0.6040078346928618, + "learning_rate": 1.4365240455719676e-05, + "loss": 2.0953, + "step": 8434 + }, + { + "epoch": 0.65, + "grad_norm": 0.5997250562285082, + "learning_rate": 1.4359587457942966e-05, + "loss": 1.9147, + "step": 8435 + }, + { + "epoch": 0.65, + "grad_norm": 0.6396231405765763, + "learning_rate": 1.4353935124490944e-05, + "loss": 1.8607, + "step": 8436 + }, + { + "epoch": 0.65, + "grad_norm": 0.5480410615934718, + "learning_rate": 1.4348283455716477e-05, + "loss": 1.9447, + "step": 8437 + }, + { + "epoch": 0.65, + "grad_norm": 0.6175805536951733, + "learning_rate": 1.4342632451972423e-05, + "loss": 2.0425, + "step": 8438 + }, + { + "epoch": 0.65, + "grad_norm": 0.7063913237575837, + "learning_rate": 1.4336982113611627e-05, + "loss": 1.8255, + "step": 8439 + }, + { + "epoch": 0.65, + "grad_norm": 0.5839859235407079, + "learning_rate": 1.4331332440986833e-05, + "loss": 1.8658, + "step": 8440 + }, + { + "epoch": 0.65, + "grad_norm": 0.6021326410331537, + "learning_rate": 1.432568343445077e-05, + "loss": 1.8743, + "step": 8441 + }, + { + "epoch": 0.65, + "grad_norm": 0.6228495152851654, + "learning_rate": 1.4320035094356163e-05, + "loss": 2.0522, + "step": 8442 + }, + { + "epoch": 0.65, + "grad_norm": 0.5945449277991676, + "learning_rate": 1.431438742105562e-05, + "loss": 1.9605, + "step": 8443 + }, + { + "epoch": 0.65, + "grad_norm": 0.6207654692319132, + "learning_rate": 1.4308740414901786e-05, + "loss": 1.9179, + "step": 8444 + }, + { + "epoch": 0.65, + "grad_norm": 0.6675638651978442, + "learning_rate": 1.4303094076247198e-05, + "loss": 1.8708, + "step": 8445 + }, + { + "epoch": 0.65, + "grad_norm": 0.579769209260611, + "learning_rate": 1.4297448405444386e-05, + "loss": 2.0948, + "step": 8446 + }, + { + "epoch": 0.65, + "grad_norm": 0.5662214284765524, + "learning_rate": 1.4291803402845852e-05, + "loss": 1.9296, + "step": 8447 + }, + { + "epoch": 0.65, + "grad_norm": 0.600883709921342, + "learning_rate": 1.4286159068804012e-05, + "loss": 1.8854, + "step": 8448 + }, + { + "epoch": 0.65, + "grad_norm": 0.5407549030319233, + "learning_rate": 1.4280515403671269e-05, + "loss": 1.9608, + "step": 8449 + }, + { + "epoch": 0.65, + "grad_norm": 0.6162919457358811, + "learning_rate": 1.4274872407800002e-05, + "loss": 2.0603, + "step": 8450 + }, + { + "epoch": 0.65, + "grad_norm": 0.6100484827663457, + "learning_rate": 1.4269230081542504e-05, + "loss": 1.9084, + "step": 8451 + }, + { + "epoch": 0.65, + "grad_norm": 0.636818154258723, + "learning_rate": 1.426358842525105e-05, + "loss": 1.8274, + "step": 8452 + }, + { + "epoch": 0.65, + "grad_norm": 0.6708886457156216, + "learning_rate": 1.4257947439277883e-05, + "loss": 1.8816, + "step": 8453 + }, + { + "epoch": 0.65, + "grad_norm": 0.5963036956356728, + "learning_rate": 1.4252307123975184e-05, + "loss": 2.0885, + "step": 8454 + }, + { + "epoch": 0.65, + "grad_norm": 0.5860545459372719, + "learning_rate": 1.4246667479695102e-05, + "loss": 1.8928, + "step": 8455 + }, + { + "epoch": 0.65, + "grad_norm": 0.5687690006596076, + "learning_rate": 1.424102850678975e-05, + "loss": 1.9581, + "step": 8456 + }, + { + "epoch": 0.65, + "grad_norm": 0.5661182447874715, + "learning_rate": 1.4235390205611182e-05, + "loss": 1.9138, + "step": 8457 + }, + { + "epoch": 0.65, + "grad_norm": 0.6227025414832638, + "learning_rate": 1.422975257651143e-05, + "loss": 2.0783, + "step": 8458 + }, + { + "epoch": 0.65, + "grad_norm": 0.6077340263005613, + "learning_rate": 1.4224115619842466e-05, + "loss": 1.9016, + "step": 8459 + }, + { + "epoch": 0.65, + "grad_norm": 0.5448702481143165, + "learning_rate": 1.4218479335956237e-05, + "loss": 1.7924, + "step": 8460 + }, + { + "epoch": 0.65, + "grad_norm": 0.6442842715583527, + "learning_rate": 1.4212843725204627e-05, + "loss": 1.8729, + "step": 8461 + }, + { + "epoch": 0.65, + "grad_norm": 0.610442592279379, + "learning_rate": 1.4207208787939504e-05, + "loss": 2.0597, + "step": 8462 + }, + { + "epoch": 0.65, + "grad_norm": 0.571406911976173, + "learning_rate": 1.4201574524512673e-05, + "loss": 1.9445, + "step": 8463 + }, + { + "epoch": 0.65, + "grad_norm": 0.5648312902225111, + "learning_rate": 1.4195940935275906e-05, + "loss": 1.8947, + "step": 8464 + }, + { + "epoch": 0.65, + "grad_norm": 0.7253143853235747, + "learning_rate": 1.4190308020580931e-05, + "loss": 1.8727, + "step": 8465 + }, + { + "epoch": 0.65, + "grad_norm": 0.5946337252723355, + "learning_rate": 1.4184675780779433e-05, + "loss": 2.0862, + "step": 8466 + }, + { + "epoch": 0.65, + "grad_norm": 0.582638211781166, + "learning_rate": 1.4179044216223054e-05, + "loss": 1.8253, + "step": 8467 + }, + { + "epoch": 0.65, + "grad_norm": 0.6323046328427233, + "learning_rate": 1.4173413327263401e-05, + "loss": 1.9609, + "step": 8468 + }, + { + "epoch": 0.65, + "grad_norm": 0.5870801734128277, + "learning_rate": 1.4167783114252031e-05, + "loss": 1.9358, + "step": 8469 + }, + { + "epoch": 0.65, + "grad_norm": 0.5997595708730505, + "learning_rate": 1.4162153577540472e-05, + "loss": 2.0762, + "step": 8470 + }, + { + "epoch": 0.65, + "grad_norm": 0.6321850180025512, + "learning_rate": 1.4156524717480162e-05, + "loss": 1.9419, + "step": 8471 + }, + { + "epoch": 0.65, + "grad_norm": 0.5563100483904052, + "learning_rate": 1.4150896534422574e-05, + "loss": 1.8244, + "step": 8472 + }, + { + "epoch": 0.65, + "grad_norm": 0.5696775635313168, + "learning_rate": 1.4145269028719096e-05, + "loss": 1.8365, + "step": 8473 + }, + { + "epoch": 0.65, + "grad_norm": 0.6878602264916693, + "learning_rate": 1.4139642200721043e-05, + "loss": 2.0902, + "step": 8474 + }, + { + "epoch": 0.65, + "grad_norm": 0.6334518736809681, + "learning_rate": 1.4134016050779755e-05, + "loss": 1.9119, + "step": 8475 + }, + { + "epoch": 0.65, + "grad_norm": 0.5501471026815904, + "learning_rate": 1.412839057924648e-05, + "loss": 1.9088, + "step": 8476 + }, + { + "epoch": 0.65, + "grad_norm": 0.7509255308771137, + "learning_rate": 1.4122765786472447e-05, + "loss": 1.8323, + "step": 8477 + }, + { + "epoch": 0.65, + "grad_norm": 0.6452526786712618, + "learning_rate": 1.4117141672808837e-05, + "loss": 2.0613, + "step": 8478 + }, + { + "epoch": 0.65, + "grad_norm": 0.5759912342349364, + "learning_rate": 1.4111518238606763e-05, + "loss": 1.856, + "step": 8479 + }, + { + "epoch": 0.65, + "grad_norm": 0.551535028670185, + "learning_rate": 1.4105895484217346e-05, + "loss": 1.9951, + "step": 8480 + }, + { + "epoch": 0.65, + "grad_norm": 0.6794228304001027, + "learning_rate": 1.4100273409991638e-05, + "loss": 1.8793, + "step": 8481 + }, + { + "epoch": 0.65, + "grad_norm": 0.658221075901024, + "learning_rate": 1.4094652016280618e-05, + "loss": 2.0915, + "step": 8482 + }, + { + "epoch": 0.65, + "grad_norm": 0.5688625301369679, + "learning_rate": 1.4089031303435279e-05, + "loss": 1.8485, + "step": 8483 + }, + { + "epoch": 0.65, + "grad_norm": 0.6129076960848894, + "learning_rate": 1.4083411271806554e-05, + "loss": 1.8873, + "step": 8484 + }, + { + "epoch": 0.65, + "grad_norm": 0.5701859911290824, + "learning_rate": 1.4077791921745286e-05, + "loss": 1.9065, + "step": 8485 + }, + { + "epoch": 0.65, + "grad_norm": 0.6010933927380878, + "learning_rate": 1.4072173253602364e-05, + "loss": 2.0676, + "step": 8486 + }, + { + "epoch": 0.65, + "grad_norm": 0.5426578050004103, + "learning_rate": 1.4066555267728543e-05, + "loss": 1.9963, + "step": 8487 + }, + { + "epoch": 0.65, + "grad_norm": 0.5553893945729792, + "learning_rate": 1.4060937964474585e-05, + "loss": 1.8465, + "step": 8488 + }, + { + "epoch": 0.65, + "grad_norm": 0.6730931781520945, + "learning_rate": 1.4055321344191228e-05, + "loss": 1.9021, + "step": 8489 + }, + { + "epoch": 0.65, + "grad_norm": 0.5653963508369981, + "learning_rate": 1.4049705407229103e-05, + "loss": 2.0911, + "step": 8490 + }, + { + "epoch": 0.66, + "grad_norm": 0.5862701075061838, + "learning_rate": 1.4044090153938871e-05, + "loss": 1.875, + "step": 8491 + }, + { + "epoch": 0.66, + "grad_norm": 0.5737780929865324, + "learning_rate": 1.403847558467109e-05, + "loss": 1.8891, + "step": 8492 + }, + { + "epoch": 0.66, + "grad_norm": 0.5486844852116715, + "learning_rate": 1.4032861699776297e-05, + "loss": 1.955, + "step": 8493 + }, + { + "epoch": 0.66, + "grad_norm": 0.6260768524180835, + "learning_rate": 1.4027248499605028e-05, + "loss": 2.0892, + "step": 8494 + }, + { + "epoch": 0.66, + "grad_norm": 0.5527390406160034, + "learning_rate": 1.4021635984507698e-05, + "loss": 1.9001, + "step": 8495 + }, + { + "epoch": 0.66, + "grad_norm": 0.5770027029984508, + "learning_rate": 1.4016024154834722e-05, + "loss": 1.8301, + "step": 8496 + }, + { + "epoch": 0.66, + "grad_norm": 0.5907552349465512, + "learning_rate": 1.4010413010936508e-05, + "loss": 1.9299, + "step": 8497 + }, + { + "epoch": 0.66, + "grad_norm": 0.6062324672381054, + "learning_rate": 1.400480255316334e-05, + "loss": 2.0519, + "step": 8498 + }, + { + "epoch": 0.66, + "grad_norm": 0.5978566604214869, + "learning_rate": 1.399919278186552e-05, + "loss": 1.9683, + "step": 8499 + }, + { + "epoch": 0.66, + "grad_norm": 0.5792716538486619, + "learning_rate": 1.3993583697393287e-05, + "loss": 1.8334, + "step": 8500 + }, + { + "epoch": 0.66, + "grad_norm": 0.6486085030997955, + "learning_rate": 1.398797530009684e-05, + "loss": 1.9165, + "step": 8501 + }, + { + "epoch": 0.66, + "grad_norm": 0.6364920988361558, + "learning_rate": 1.3982367590326334e-05, + "loss": 2.1785, + "step": 8502 + }, + { + "epoch": 0.66, + "grad_norm": 0.6034087084768031, + "learning_rate": 1.397676056843188e-05, + "loss": 1.8647, + "step": 8503 + }, + { + "epoch": 0.66, + "grad_norm": 0.6287188234446461, + "learning_rate": 1.3971154234763545e-05, + "loss": 1.8712, + "step": 8504 + }, + { + "epoch": 0.66, + "grad_norm": 0.7453187348247245, + "learning_rate": 1.3965548589671363e-05, + "loss": 1.9466, + "step": 8505 + }, + { + "epoch": 0.66, + "grad_norm": 0.7166162908838116, + "learning_rate": 1.3959943633505312e-05, + "loss": 2.0211, + "step": 8506 + }, + { + "epoch": 0.66, + "grad_norm": 0.6406683667796296, + "learning_rate": 1.3954339366615334e-05, + "loss": 1.8351, + "step": 8507 + }, + { + "epoch": 0.66, + "grad_norm": 0.7466193511423979, + "learning_rate": 1.3948735789351325e-05, + "loss": 1.8836, + "step": 8508 + }, + { + "epoch": 0.66, + "grad_norm": 0.621780792721436, + "learning_rate": 1.3943132902063144e-05, + "loss": 1.917, + "step": 8509 + }, + { + "epoch": 0.66, + "grad_norm": 0.57014835824952, + "learning_rate": 1.3937530705100593e-05, + "loss": 2.0921, + "step": 8510 + }, + { + "epoch": 0.66, + "grad_norm": 0.6341438096239674, + "learning_rate": 1.3931929198813451e-05, + "loss": 1.9834, + "step": 8511 + }, + { + "epoch": 0.66, + "grad_norm": 0.6290707977599788, + "learning_rate": 1.3926328383551434e-05, + "loss": 1.8971, + "step": 8512 + }, + { + "epoch": 0.66, + "grad_norm": 0.6191908593992131, + "learning_rate": 1.3920728259664232e-05, + "loss": 1.8975, + "step": 8513 + }, + { + "epoch": 0.66, + "grad_norm": 0.5833253584950423, + "learning_rate": 1.3915128827501475e-05, + "loss": 2.0609, + "step": 8514 + }, + { + "epoch": 0.66, + "grad_norm": 0.6005955911894293, + "learning_rate": 1.3909530087412765e-05, + "loss": 1.8793, + "step": 8515 + }, + { + "epoch": 0.66, + "grad_norm": 0.5971804806446672, + "learning_rate": 1.3903932039747653e-05, + "loss": 1.8696, + "step": 8516 + }, + { + "epoch": 0.66, + "grad_norm": 0.58994638363175, + "learning_rate": 1.3898334684855647e-05, + "loss": 1.8766, + "step": 8517 + }, + { + "epoch": 0.66, + "grad_norm": 0.5751458873973857, + "learning_rate": 1.3892738023086216e-05, + "loss": 1.9172, + "step": 8518 + }, + { + "epoch": 0.66, + "grad_norm": 0.5993312938921292, + "learning_rate": 1.3887142054788777e-05, + "loss": 2.0507, + "step": 8519 + }, + { + "epoch": 0.66, + "grad_norm": 0.5672412881572056, + "learning_rate": 1.3881546780312721e-05, + "loss": 1.9102, + "step": 8520 + }, + { + "epoch": 0.66, + "grad_norm": 0.5807934501430776, + "learning_rate": 1.3875952200007358e-05, + "loss": 1.9018, + "step": 8521 + }, + { + "epoch": 0.66, + "grad_norm": 0.6413226685929957, + "learning_rate": 1.3870358314222007e-05, + "loss": 2.0798, + "step": 8522 + }, + { + "epoch": 0.66, + "grad_norm": 0.580456197932248, + "learning_rate": 1.3864765123305906e-05, + "loss": 1.8941, + "step": 8523 + }, + { + "epoch": 0.66, + "grad_norm": 0.5822159749078407, + "learning_rate": 1.3859172627608266e-05, + "loss": 1.9366, + "step": 8524 + }, + { + "epoch": 0.66, + "grad_norm": 0.6010838648521095, + "learning_rate": 1.3853580827478257e-05, + "loss": 1.8688, + "step": 8525 + }, + { + "epoch": 0.66, + "grad_norm": 0.5884866549209309, + "learning_rate": 1.3847989723264964e-05, + "loss": 2.0797, + "step": 8526 + }, + { + "epoch": 0.66, + "grad_norm": 0.545073034576549, + "learning_rate": 1.38423993153175e-05, + "loss": 1.8967, + "step": 8527 + }, + { + "epoch": 0.66, + "grad_norm": 0.5609372254453845, + "learning_rate": 1.383680960398489e-05, + "loss": 1.8685, + "step": 8528 + }, + { + "epoch": 0.66, + "grad_norm": 0.5645828410811794, + "learning_rate": 1.3831220589616101e-05, + "loss": 1.8775, + "step": 8529 + }, + { + "epoch": 0.66, + "grad_norm": 0.5511053468525992, + "learning_rate": 1.3825632272560096e-05, + "loss": 2.0025, + "step": 8530 + }, + { + "epoch": 0.66, + "grad_norm": 0.6517839308943691, + "learning_rate": 1.382004465316579e-05, + "loss": 2.0883, + "step": 8531 + }, + { + "epoch": 0.66, + "grad_norm": 0.563880873236184, + "learning_rate": 1.3814457731782003e-05, + "loss": 1.9002, + "step": 8532 + }, + { + "epoch": 0.66, + "grad_norm": 0.5884613126007006, + "learning_rate": 1.3808871508757587e-05, + "loss": 1.913, + "step": 8533 + }, + { + "epoch": 0.66, + "grad_norm": 0.6063064397574519, + "learning_rate": 1.3803285984441294e-05, + "loss": 2.0526, + "step": 8534 + }, + { + "epoch": 0.66, + "grad_norm": 0.5800230364768042, + "learning_rate": 1.3797701159181839e-05, + "loss": 1.8493, + "step": 8535 + }, + { + "epoch": 0.66, + "grad_norm": 0.5499297325574661, + "learning_rate": 1.3792117033327942e-05, + "loss": 1.9589, + "step": 8536 + }, + { + "epoch": 0.66, + "grad_norm": 0.6478588615736024, + "learning_rate": 1.3786533607228214e-05, + "loss": 1.8968, + "step": 8537 + }, + { + "epoch": 0.66, + "grad_norm": 0.6329771958707212, + "learning_rate": 1.3780950881231256e-05, + "loss": 2.0763, + "step": 8538 + }, + { + "epoch": 0.66, + "grad_norm": 0.5358917360353697, + "learning_rate": 1.3775368855685625e-05, + "loss": 1.8916, + "step": 8539 + }, + { + "epoch": 0.66, + "grad_norm": 0.637109025169848, + "learning_rate": 1.3769787530939817e-05, + "loss": 1.891, + "step": 8540 + }, + { + "epoch": 0.66, + "grad_norm": 0.5673965260128847, + "learning_rate": 1.3764206907342331e-05, + "loss": 1.8718, + "step": 8541 + }, + { + "epoch": 0.66, + "grad_norm": 0.6521987901552707, + "learning_rate": 1.3758626985241552e-05, + "loss": 2.0852, + "step": 8542 + }, + { + "epoch": 0.66, + "grad_norm": 0.5571003005752742, + "learning_rate": 1.3753047764985864e-05, + "loss": 1.9525, + "step": 8543 + }, + { + "epoch": 0.66, + "grad_norm": 0.6230516333206275, + "learning_rate": 1.3747469246923628e-05, + "loss": 1.9026, + "step": 8544 + }, + { + "epoch": 0.66, + "grad_norm": 0.6340404145684204, + "learning_rate": 1.3741891431403098e-05, + "loss": 1.8577, + "step": 8545 + }, + { + "epoch": 0.66, + "grad_norm": 0.6600575389834339, + "learning_rate": 1.3736314318772541e-05, + "loss": 2.0429, + "step": 8546 + }, + { + "epoch": 0.66, + "grad_norm": 0.6498873619680668, + "learning_rate": 1.3730737909380154e-05, + "loss": 1.8626, + "step": 8547 + }, + { + "epoch": 0.66, + "grad_norm": 0.5623245105571738, + "learning_rate": 1.372516220357409e-05, + "loss": 1.8865, + "step": 8548 + }, + { + "epoch": 0.66, + "grad_norm": 0.6607093895623688, + "learning_rate": 1.3719587201702471e-05, + "loss": 1.9498, + "step": 8549 + }, + { + "epoch": 0.66, + "grad_norm": 0.6563606008018121, + "learning_rate": 1.3714012904113363e-05, + "loss": 1.8501, + "step": 8550 + }, + { + "epoch": 0.66, + "grad_norm": 0.6198558532331665, + "learning_rate": 1.3708439311154794e-05, + "loss": 2.0818, + "step": 8551 + }, + { + "epoch": 0.66, + "grad_norm": 0.5700943533081028, + "learning_rate": 1.3702866423174746e-05, + "loss": 1.8936, + "step": 8552 + }, + { + "epoch": 0.66, + "grad_norm": 0.6431288884653616, + "learning_rate": 1.3697294240521155e-05, + "loss": 1.8353, + "step": 8553 + }, + { + "epoch": 0.66, + "grad_norm": 0.7200407733227885, + "learning_rate": 1.369172276354192e-05, + "loss": 2.0534, + "step": 8554 + }, + { + "epoch": 0.66, + "grad_norm": 0.5515226836936954, + "learning_rate": 1.3686151992584887e-05, + "loss": 1.9655, + "step": 8555 + }, + { + "epoch": 0.66, + "grad_norm": 0.5671820526737776, + "learning_rate": 1.3680581927997866e-05, + "loss": 1.8547, + "step": 8556 + }, + { + "epoch": 0.66, + "grad_norm": 0.6704553659143992, + "learning_rate": 1.3675012570128612e-05, + "loss": 1.8765, + "step": 8557 + }, + { + "epoch": 0.66, + "grad_norm": 0.592979961376165, + "learning_rate": 1.3669443919324849e-05, + "loss": 2.0761, + "step": 8558 + }, + { + "epoch": 0.66, + "grad_norm": 0.6612134705520717, + "learning_rate": 1.3663875975934248e-05, + "loss": 1.9118, + "step": 8559 + }, + { + "epoch": 0.66, + "grad_norm": 0.6290439276435041, + "learning_rate": 1.3658308740304442e-05, + "loss": 1.9336, + "step": 8560 + }, + { + "epoch": 0.66, + "grad_norm": 0.6302113149443741, + "learning_rate": 1.3652742212783015e-05, + "loss": 1.9294, + "step": 8561 + }, + { + "epoch": 0.66, + "grad_norm": 0.659622096518625, + "learning_rate": 1.3647176393717509e-05, + "loss": 1.9164, + "step": 8562 + }, + { + "epoch": 0.66, + "grad_norm": 0.5893036326325897, + "learning_rate": 1.3641611283455414e-05, + "loss": 2.0492, + "step": 8563 + }, + { + "epoch": 0.66, + "grad_norm": 0.6128457363763159, + "learning_rate": 1.3636046882344191e-05, + "loss": 1.8784, + "step": 8564 + }, + { + "epoch": 0.66, + "grad_norm": 0.5718976069132464, + "learning_rate": 1.3630483190731246e-05, + "loss": 1.9224, + "step": 8565 + }, + { + "epoch": 0.66, + "grad_norm": 0.6117637899435415, + "learning_rate": 1.3624920208963943e-05, + "loss": 2.0845, + "step": 8566 + }, + { + "epoch": 0.66, + "grad_norm": 0.5645666454087888, + "learning_rate": 1.3619357937389607e-05, + "loss": 1.8944, + "step": 8567 + }, + { + "epoch": 0.66, + "grad_norm": 0.6152548072642815, + "learning_rate": 1.3613796376355493e-05, + "loss": 1.8912, + "step": 8568 + }, + { + "epoch": 0.66, + "grad_norm": 0.5636969463493041, + "learning_rate": 1.3608235526208851e-05, + "loss": 1.8919, + "step": 8569 + }, + { + "epoch": 0.66, + "grad_norm": 0.552762268172415, + "learning_rate": 1.3602675387296878e-05, + "loss": 2.0277, + "step": 8570 + }, + { + "epoch": 0.66, + "grad_norm": 0.5999415492243528, + "learning_rate": 1.3597115959966683e-05, + "loss": 1.9362, + "step": 8571 + }, + { + "epoch": 0.66, + "grad_norm": 0.6282172049431161, + "learning_rate": 1.3591557244565401e-05, + "loss": 1.9132, + "step": 8572 + }, + { + "epoch": 0.66, + "grad_norm": 0.5534342352584115, + "learning_rate": 1.3585999241440046e-05, + "loss": 1.8664, + "step": 8573 + }, + { + "epoch": 0.66, + "grad_norm": 0.6279400443349271, + "learning_rate": 1.358044195093766e-05, + "loss": 1.9564, + "step": 8574 + }, + { + "epoch": 0.66, + "grad_norm": 0.6930726390298751, + "learning_rate": 1.3574885373405205e-05, + "loss": 2.0806, + "step": 8575 + }, + { + "epoch": 0.66, + "grad_norm": 0.6196094716214805, + "learning_rate": 1.3569329509189572e-05, + "loss": 1.8736, + "step": 8576 + }, + { + "epoch": 0.66, + "grad_norm": 0.5951292243552486, + "learning_rate": 1.3563774358637666e-05, + "loss": 1.8785, + "step": 8577 + }, + { + "epoch": 0.66, + "grad_norm": 0.7286490270615972, + "learning_rate": 1.3558219922096318e-05, + "loss": 2.0865, + "step": 8578 + }, + { + "epoch": 0.66, + "grad_norm": 0.5723622479482635, + "learning_rate": 1.3552666199912285e-05, + "loss": 1.8561, + "step": 8579 + }, + { + "epoch": 0.66, + "grad_norm": 0.6699243728236776, + "learning_rate": 1.3547113192432348e-05, + "loss": 1.9449, + "step": 8580 + }, + { + "epoch": 0.66, + "grad_norm": 0.604466441751994, + "learning_rate": 1.3541560900003172e-05, + "loss": 1.8642, + "step": 8581 + }, + { + "epoch": 0.66, + "grad_norm": 0.6409522116104243, + "learning_rate": 1.3536009322971417e-05, + "loss": 1.824, + "step": 8582 + }, + { + "epoch": 0.66, + "grad_norm": 0.6572523274642812, + "learning_rate": 1.353045846168371e-05, + "loss": 2.0556, + "step": 8583 + }, + { + "epoch": 0.66, + "grad_norm": 0.610483919632084, + "learning_rate": 1.352490831648659e-05, + "loss": 1.8716, + "step": 8584 + }, + { + "epoch": 0.66, + "grad_norm": 0.6766008775200689, + "learning_rate": 1.3519358887726585e-05, + "loss": 1.9247, + "step": 8585 + }, + { + "epoch": 0.66, + "grad_norm": 0.6005071354693734, + "learning_rate": 1.3513810175750167e-05, + "loss": 1.991, + "step": 8586 + }, + { + "epoch": 0.66, + "grad_norm": 0.6092829767934, + "learning_rate": 1.3508262180903757e-05, + "loss": 2.0896, + "step": 8587 + }, + { + "epoch": 0.66, + "grad_norm": 0.7091040707658487, + "learning_rate": 1.350271490353377e-05, + "loss": 1.8663, + "step": 8588 + }, + { + "epoch": 0.66, + "grad_norm": 0.6521786289256927, + "learning_rate": 1.349716834398651e-05, + "loss": 1.8663, + "step": 8589 + }, + { + "epoch": 0.66, + "grad_norm": 0.581375004306449, + "learning_rate": 1.3491622502608278e-05, + "loss": 2.0701, + "step": 8590 + }, + { + "epoch": 0.66, + "grad_norm": 0.6356977326703559, + "learning_rate": 1.3486077379745351e-05, + "loss": 1.9062, + "step": 8591 + }, + { + "epoch": 0.66, + "grad_norm": 0.5360480351020158, + "learning_rate": 1.3480532975743903e-05, + "loss": 1.9742, + "step": 8592 + }, + { + "epoch": 0.66, + "grad_norm": 0.6339020989294368, + "learning_rate": 1.3474989290950105e-05, + "loss": 1.9318, + "step": 8593 + }, + { + "epoch": 0.66, + "grad_norm": 0.6414836588003963, + "learning_rate": 1.346944632571007e-05, + "loss": 1.818, + "step": 8594 + }, + { + "epoch": 0.66, + "grad_norm": 0.599580117894308, + "learning_rate": 1.3463904080369872e-05, + "loss": 2.0789, + "step": 8595 + }, + { + "epoch": 0.66, + "grad_norm": 0.5615634126070519, + "learning_rate": 1.3458362555275531e-05, + "loss": 1.8855, + "step": 8596 + }, + { + "epoch": 0.66, + "grad_norm": 0.6112730891410497, + "learning_rate": 1.3452821750773032e-05, + "loss": 1.8439, + "step": 8597 + }, + { + "epoch": 0.66, + "grad_norm": 0.5481926855027783, + "learning_rate": 1.3447281667208305e-05, + "loss": 1.9753, + "step": 8598 + }, + { + "epoch": 0.66, + "grad_norm": 0.5859228881668126, + "learning_rate": 1.3441742304927247e-05, + "loss": 2.0449, + "step": 8599 + }, + { + "epoch": 0.66, + "grad_norm": 0.6293356603405511, + "learning_rate": 1.3436203664275699e-05, + "loss": 1.8994, + "step": 8600 + }, + { + "epoch": 0.66, + "grad_norm": 0.5443950816119576, + "learning_rate": 1.3430665745599458e-05, + "loss": 1.907, + "step": 8601 + }, + { + "epoch": 0.66, + "grad_norm": 0.5837096071741881, + "learning_rate": 1.3425128549244284e-05, + "loss": 1.9029, + "step": 8602 + }, + { + "epoch": 0.66, + "grad_norm": 0.5931245888165687, + "learning_rate": 1.3419592075555887e-05, + "loss": 2.1035, + "step": 8603 + }, + { + "epoch": 0.66, + "grad_norm": 0.5616027434009185, + "learning_rate": 1.3414056324879931e-05, + "loss": 1.8675, + "step": 8604 + }, + { + "epoch": 0.66, + "grad_norm": 0.5342495136644386, + "learning_rate": 1.3408521297562032e-05, + "loss": 1.9561, + "step": 8605 + }, + { + "epoch": 0.66, + "grad_norm": 0.5621810607453066, + "learning_rate": 1.340298699394777e-05, + "loss": 1.9151, + "step": 8606 + }, + { + "epoch": 0.66, + "grad_norm": 0.5983527875689366, + "learning_rate": 1.3397453414382668e-05, + "loss": 2.0649, + "step": 8607 + }, + { + "epoch": 0.66, + "grad_norm": 0.6036558960586824, + "learning_rate": 1.3391920559212218e-05, + "loss": 1.8935, + "step": 8608 + }, + { + "epoch": 0.66, + "grad_norm": 0.6834322158330604, + "learning_rate": 1.3386388428781854e-05, + "loss": 1.8728, + "step": 8609 + }, + { + "epoch": 0.66, + "grad_norm": 0.556994328282094, + "learning_rate": 1.3380857023436966e-05, + "loss": 2.1294, + "step": 8610 + }, + { + "epoch": 0.66, + "grad_norm": 0.5596047244802386, + "learning_rate": 1.3375326343522912e-05, + "loss": 1.9589, + "step": 8611 + }, + { + "epoch": 0.66, + "grad_norm": 0.7035321085298167, + "learning_rate": 1.3369796389384987e-05, + "loss": 1.8387, + "step": 8612 + }, + { + "epoch": 0.66, + "grad_norm": 0.6056846529259186, + "learning_rate": 1.3364267161368452e-05, + "loss": 1.8656, + "step": 8613 + }, + { + "epoch": 0.66, + "grad_norm": 0.6670725191531433, + "learning_rate": 1.3358738659818529e-05, + "loss": 1.8667, + "step": 8614 + }, + { + "epoch": 0.66, + "grad_norm": 0.6668562559978036, + "learning_rate": 1.3353210885080356e-05, + "loss": 2.0726, + "step": 8615 + }, + { + "epoch": 0.66, + "grad_norm": 0.6006221780662695, + "learning_rate": 1.3347683837499081e-05, + "loss": 1.8713, + "step": 8616 + }, + { + "epoch": 0.66, + "grad_norm": 0.5437169844469553, + "learning_rate": 1.3342157517419784e-05, + "loss": 1.9375, + "step": 8617 + }, + { + "epoch": 0.66, + "grad_norm": 0.6638882216412134, + "learning_rate": 1.3336631925187464e-05, + "loss": 1.8647, + "step": 8618 + }, + { + "epoch": 0.66, + "grad_norm": 0.6079776967948477, + "learning_rate": 1.3331107061147152e-05, + "loss": 2.0908, + "step": 8619 + }, + { + "epoch": 0.67, + "grad_norm": 0.5663355289423392, + "learning_rate": 1.3325582925643736e-05, + "loss": 1.8549, + "step": 8620 + }, + { + "epoch": 0.67, + "grad_norm": 0.5505350455027813, + "learning_rate": 1.3320059519022155e-05, + "loss": 1.8134, + "step": 8621 + }, + { + "epoch": 0.67, + "grad_norm": 0.6372308119260377, + "learning_rate": 1.3314536841627245e-05, + "loss": 2.0911, + "step": 8622 + }, + { + "epoch": 0.67, + "grad_norm": 0.6125913963207185, + "learning_rate": 1.330901489380379e-05, + "loss": 1.9626, + "step": 8623 + }, + { + "epoch": 0.67, + "grad_norm": 0.5903497701163174, + "learning_rate": 1.3303493675896567e-05, + "loss": 1.8914, + "step": 8624 + }, + { + "epoch": 0.67, + "grad_norm": 0.5583309349050712, + "learning_rate": 1.3297973188250298e-05, + "loss": 1.8922, + "step": 8625 + }, + { + "epoch": 0.67, + "grad_norm": 0.6648625876564936, + "learning_rate": 1.3292453431209616e-05, + "loss": 1.9234, + "step": 8626 + }, + { + "epoch": 0.67, + "grad_norm": 0.6481893625143103, + "learning_rate": 1.3286934405119183e-05, + "loss": 2.1129, + "step": 8627 + }, + { + "epoch": 0.67, + "grad_norm": 0.6127304809058359, + "learning_rate": 1.3281416110323541e-05, + "loss": 1.8729, + "step": 8628 + }, + { + "epoch": 0.67, + "grad_norm": 0.5871048174818186, + "learning_rate": 1.3275898547167223e-05, + "loss": 1.97, + "step": 8629 + }, + { + "epoch": 0.67, + "grad_norm": 0.6042252402753703, + "learning_rate": 1.3270381715994742e-05, + "loss": 1.898, + "step": 8630 + }, + { + "epoch": 0.67, + "grad_norm": 0.6207772287126526, + "learning_rate": 1.3264865617150508e-05, + "loss": 2.0631, + "step": 8631 + }, + { + "epoch": 0.67, + "grad_norm": 0.5761240972595963, + "learning_rate": 1.3259350250978914e-05, + "loss": 1.9032, + "step": 8632 + }, + { + "epoch": 0.67, + "grad_norm": 0.6168258906492459, + "learning_rate": 1.3253835617824333e-05, + "loss": 1.9284, + "step": 8633 + }, + { + "epoch": 0.67, + "grad_norm": 0.6313108853701159, + "learning_rate": 1.324832171803104e-05, + "loss": 1.8823, + "step": 8634 + }, + { + "epoch": 0.67, + "grad_norm": 0.5543843764108831, + "learning_rate": 1.3242808551943297e-05, + "loss": 2.0722, + "step": 8635 + }, + { + "epoch": 0.67, + "grad_norm": 0.6124466219280645, + "learning_rate": 1.323729611990532e-05, + "loss": 1.992, + "step": 8636 + }, + { + "epoch": 0.67, + "grad_norm": 0.5751113074120218, + "learning_rate": 1.3231784422261256e-05, + "loss": 1.8129, + "step": 8637 + }, + { + "epoch": 0.67, + "grad_norm": 0.5787250491079584, + "learning_rate": 1.322627345935526e-05, + "loss": 1.8522, + "step": 8638 + }, + { + "epoch": 0.67, + "grad_norm": 0.61873852970918, + "learning_rate": 1.3220763231531363e-05, + "loss": 2.0986, + "step": 8639 + }, + { + "epoch": 0.67, + "grad_norm": 0.6139043570440373, + "learning_rate": 1.321525373913361e-05, + "loss": 1.8932, + "step": 8640 + }, + { + "epoch": 0.67, + "grad_norm": 0.647349524027658, + "learning_rate": 1.3209744982505981e-05, + "loss": 1.8338, + "step": 8641 + }, + { + "epoch": 0.67, + "grad_norm": 0.5659427456241436, + "learning_rate": 1.3204236961992406e-05, + "loss": 1.9393, + "step": 8642 + }, + { + "epoch": 0.67, + "grad_norm": 0.5873982729163277, + "learning_rate": 1.3198729677936777e-05, + "loss": 2.1229, + "step": 8643 + }, + { + "epoch": 0.67, + "grad_norm": 0.6567380965448689, + "learning_rate": 1.3193223130682936e-05, + "loss": 1.8528, + "step": 8644 + }, + { + "epoch": 0.67, + "grad_norm": 0.6325024617637471, + "learning_rate": 1.3187717320574678e-05, + "loss": 1.9045, + "step": 8645 + }, + { + "epoch": 0.67, + "grad_norm": 0.5699967685561543, + "learning_rate": 1.318221224795575e-05, + "loss": 1.8783, + "step": 8646 + }, + { + "epoch": 0.67, + "grad_norm": 0.7058278910740858, + "learning_rate": 1.3176707913169864e-05, + "loss": 2.0709, + "step": 8647 + }, + { + "epoch": 0.67, + "grad_norm": 0.6344523613551085, + "learning_rate": 1.3171204316560672e-05, + "loss": 1.9545, + "step": 8648 + }, + { + "epoch": 0.67, + "grad_norm": 0.5531069157011015, + "learning_rate": 1.3165701458471791e-05, + "loss": 1.9011, + "step": 8649 + }, + { + "epoch": 0.67, + "grad_norm": 0.6963226978869259, + "learning_rate": 1.3160199339246782e-05, + "loss": 1.8903, + "step": 8650 + }, + { + "epoch": 0.67, + "grad_norm": 0.6847067252507361, + "learning_rate": 1.3154697959229171e-05, + "loss": 2.0682, + "step": 8651 + }, + { + "epoch": 0.67, + "grad_norm": 0.5702210634528158, + "learning_rate": 1.3149197318762425e-05, + "loss": 1.8759, + "step": 8652 + }, + { + "epoch": 0.67, + "grad_norm": 0.6337334650472904, + "learning_rate": 1.3143697418189976e-05, + "loss": 1.8783, + "step": 8653 + }, + { + "epoch": 0.67, + "grad_norm": 0.6429446647410276, + "learning_rate": 1.31381982578552e-05, + "loss": 1.9204, + "step": 8654 + }, + { + "epoch": 0.67, + "grad_norm": 0.5727540017137633, + "learning_rate": 1.3132699838101442e-05, + "loss": 2.0938, + "step": 8655 + }, + { + "epoch": 0.67, + "grad_norm": 0.6290632682576639, + "learning_rate": 1.3127202159271983e-05, + "loss": 1.8858, + "step": 8656 + }, + { + "epoch": 0.67, + "grad_norm": 0.6711671010643725, + "learning_rate": 1.3121705221710068e-05, + "loss": 1.9096, + "step": 8657 + }, + { + "epoch": 0.67, + "grad_norm": 0.6011141916907088, + "learning_rate": 1.3116209025758891e-05, + "loss": 1.8669, + "step": 8658 + }, + { + "epoch": 0.67, + "grad_norm": 0.6049575083717088, + "learning_rate": 1.3110713571761607e-05, + "loss": 2.053, + "step": 8659 + }, + { + "epoch": 0.67, + "grad_norm": 0.5716973986454544, + "learning_rate": 1.3105218860061316e-05, + "loss": 2.0254, + "step": 8660 + }, + { + "epoch": 0.67, + "grad_norm": 0.6318554013884782, + "learning_rate": 1.3099724891001086e-05, + "loss": 1.8212, + "step": 8661 + }, + { + "epoch": 0.67, + "grad_norm": 0.5822776419222844, + "learning_rate": 1.30942316649239e-05, + "loss": 1.8674, + "step": 8662 + }, + { + "epoch": 0.67, + "grad_norm": 0.6539473255754054, + "learning_rate": 1.3088739182172754e-05, + "loss": 2.0807, + "step": 8663 + }, + { + "epoch": 0.67, + "grad_norm": 0.5726907059048509, + "learning_rate": 1.3083247443090557e-05, + "loss": 1.8876, + "step": 8664 + }, + { + "epoch": 0.67, + "grad_norm": 0.6364942106172853, + "learning_rate": 1.3077756448020163e-05, + "loss": 1.9013, + "step": 8665 + }, + { + "epoch": 0.67, + "grad_norm": 0.6030113906748293, + "learning_rate": 1.307226619730442e-05, + "loss": 1.8653, + "step": 8666 + }, + { + "epoch": 0.67, + "grad_norm": 0.60789121040305, + "learning_rate": 1.3066776691286108e-05, + "loss": 2.1202, + "step": 8667 + }, + { + "epoch": 0.67, + "grad_norm": 0.6401168588582484, + "learning_rate": 1.306128793030793e-05, + "loss": 1.8666, + "step": 8668 + }, + { + "epoch": 0.67, + "grad_norm": 0.6399983815398669, + "learning_rate": 1.3055799914712613e-05, + "loss": 1.8938, + "step": 8669 + }, + { + "epoch": 0.67, + "grad_norm": 0.5508051395557823, + "learning_rate": 1.3050312644842755e-05, + "loss": 1.8825, + "step": 8670 + }, + { + "epoch": 0.67, + "grad_norm": 0.6947899353881971, + "learning_rate": 1.3044826121040976e-05, + "loss": 2.0546, + "step": 8671 + }, + { + "epoch": 0.67, + "grad_norm": 0.6564700720229625, + "learning_rate": 1.303934034364983e-05, + "loss": 1.8261, + "step": 8672 + }, + { + "epoch": 0.67, + "grad_norm": 0.5809735156918099, + "learning_rate": 1.3033855313011779e-05, + "loss": 1.9565, + "step": 8673 + }, + { + "epoch": 0.67, + "grad_norm": 0.5619105179484444, + "learning_rate": 1.3028371029469321e-05, + "loss": 1.8757, + "step": 8674 + }, + { + "epoch": 0.67, + "grad_norm": 0.6853825321241442, + "learning_rate": 1.3022887493364828e-05, + "loss": 2.0622, + "step": 8675 + }, + { + "epoch": 0.67, + "grad_norm": 0.6197992129906217, + "learning_rate": 1.3017404705040665e-05, + "loss": 1.882, + "step": 8676 + }, + { + "epoch": 0.67, + "grad_norm": 0.6033869209812119, + "learning_rate": 1.301192266483917e-05, + "loss": 1.8982, + "step": 8677 + }, + { + "epoch": 0.67, + "grad_norm": 0.6420913792748507, + "learning_rate": 1.3006441373102582e-05, + "loss": 1.8986, + "step": 8678 + }, + { + "epoch": 0.67, + "grad_norm": 0.6021570211050509, + "learning_rate": 1.300096083017312e-05, + "loss": 2.1299, + "step": 8679 + }, + { + "epoch": 0.67, + "grad_norm": 0.6610012231118445, + "learning_rate": 1.2995481036392987e-05, + "loss": 1.8846, + "step": 8680 + }, + { + "epoch": 0.67, + "grad_norm": 0.5652170866260483, + "learning_rate": 1.2990001992104275e-05, + "loss": 1.8632, + "step": 8681 + }, + { + "epoch": 0.67, + "grad_norm": 0.579067354333605, + "learning_rate": 1.2984523697649082e-05, + "loss": 1.9132, + "step": 8682 + }, + { + "epoch": 0.67, + "grad_norm": 0.6874822570013811, + "learning_rate": 1.2979046153369431e-05, + "loss": 2.0483, + "step": 8683 + }, + { + "epoch": 0.67, + "grad_norm": 0.5956231186543932, + "learning_rate": 1.2973569359607307e-05, + "loss": 1.8835, + "step": 8684 + }, + { + "epoch": 0.67, + "grad_norm": 0.5349684656914901, + "learning_rate": 1.2968093316704668e-05, + "loss": 1.9981, + "step": 8685 + }, + { + "epoch": 0.67, + "grad_norm": 0.6591905216131088, + "learning_rate": 1.2962618025003387e-05, + "loss": 1.9206, + "step": 8686 + }, + { + "epoch": 0.67, + "grad_norm": 0.5736513586558483, + "learning_rate": 1.2957143484845307e-05, + "loss": 2.0825, + "step": 8687 + }, + { + "epoch": 0.67, + "grad_norm": 0.581533273416157, + "learning_rate": 1.2951669696572239e-05, + "loss": 1.8792, + "step": 8688 + }, + { + "epoch": 0.67, + "grad_norm": 0.5878595518519131, + "learning_rate": 1.2946196660525925e-05, + "loss": 1.8829, + "step": 8689 + }, + { + "epoch": 0.67, + "grad_norm": 0.5631029998026801, + "learning_rate": 1.2940724377048072e-05, + "loss": 1.8717, + "step": 8690 + }, + { + "epoch": 0.67, + "grad_norm": 0.6711591498302996, + "learning_rate": 1.2935252846480336e-05, + "loss": 2.0881, + "step": 8691 + }, + { + "epoch": 0.67, + "grad_norm": 0.6632857706655103, + "learning_rate": 1.2929782069164328e-05, + "loss": 1.8821, + "step": 8692 + }, + { + "epoch": 0.67, + "grad_norm": 0.5705579918390798, + "learning_rate": 1.292431204544161e-05, + "loss": 1.8638, + "step": 8693 + }, + { + "epoch": 0.67, + "grad_norm": 0.6644491969516353, + "learning_rate": 1.2918842775653703e-05, + "loss": 1.8954, + "step": 8694 + }, + { + "epoch": 0.67, + "grad_norm": 0.5713225296310331, + "learning_rate": 1.291337426014207e-05, + "loss": 2.0361, + "step": 8695 + }, + { + "epoch": 0.67, + "grad_norm": 0.6832915837978382, + "learning_rate": 1.2907906499248135e-05, + "loss": 1.8719, + "step": 8696 + }, + { + "epoch": 0.67, + "grad_norm": 0.5582669651517246, + "learning_rate": 1.2902439493313273e-05, + "loss": 1.8815, + "step": 8697 + }, + { + "epoch": 0.67, + "grad_norm": 0.7521352281468984, + "learning_rate": 1.2896973242678812e-05, + "loss": 1.9321, + "step": 8698 + }, + { + "epoch": 0.67, + "grad_norm": 0.6377691421672715, + "learning_rate": 1.2891507747686033e-05, + "loss": 1.9999, + "step": 8699 + }, + { + "epoch": 0.67, + "grad_norm": 0.5736105985285261, + "learning_rate": 1.2886043008676175e-05, + "loss": 1.8454, + "step": 8700 + }, + { + "epoch": 0.67, + "grad_norm": 0.7559359865908138, + "learning_rate": 1.28805790259904e-05, + "loss": 1.8744, + "step": 8701 + }, + { + "epoch": 0.67, + "grad_norm": 0.6358727315505697, + "learning_rate": 1.2875115799969873e-05, + "loss": 1.8908, + "step": 8702 + }, + { + "epoch": 0.67, + "grad_norm": 0.5709064312530099, + "learning_rate": 1.2869653330955677e-05, + "loss": 2.1159, + "step": 8703 + }, + { + "epoch": 0.67, + "grad_norm": 0.6272993111009666, + "learning_rate": 1.2864191619288857e-05, + "loss": 1.9256, + "step": 8704 + }, + { + "epoch": 0.67, + "grad_norm": 0.6534564457869096, + "learning_rate": 1.2858730665310409e-05, + "loss": 1.8781, + "step": 8705 + }, + { + "epoch": 0.67, + "grad_norm": 0.6502841609183282, + "learning_rate": 1.285327046936128e-05, + "loss": 1.8261, + "step": 8706 + }, + { + "epoch": 0.67, + "grad_norm": 0.5600801028258066, + "learning_rate": 1.2847811031782381e-05, + "loss": 2.0815, + "step": 8707 + }, + { + "epoch": 0.67, + "grad_norm": 0.5605817157645283, + "learning_rate": 1.2842352352914566e-05, + "loss": 1.8538, + "step": 8708 + }, + { + "epoch": 0.67, + "grad_norm": 0.5952696877176576, + "learning_rate": 1.2836894433098623e-05, + "loss": 1.8875, + "step": 8709 + }, + { + "epoch": 0.67, + "grad_norm": 0.6160494490497608, + "learning_rate": 1.2831437272675337e-05, + "loss": 1.9535, + "step": 8710 + }, + { + "epoch": 0.67, + "grad_norm": 0.5622216973261807, + "learning_rate": 1.282598087198542e-05, + "loss": 2.0664, + "step": 8711 + }, + { + "epoch": 0.67, + "grad_norm": 0.6090350722916298, + "learning_rate": 1.2820525231369513e-05, + "loss": 1.92, + "step": 8712 + }, + { + "epoch": 0.67, + "grad_norm": 0.6111858315437384, + "learning_rate": 1.2815070351168257e-05, + "loss": 1.7962, + "step": 8713 + }, + { + "epoch": 0.67, + "grad_norm": 0.6091675126430611, + "learning_rate": 1.2809616231722226e-05, + "loss": 1.86, + "step": 8714 + }, + { + "epoch": 0.67, + "grad_norm": 0.6199471517289616, + "learning_rate": 1.2804162873371917e-05, + "loss": 2.073, + "step": 8715 + }, + { + "epoch": 0.67, + "grad_norm": 0.7489872968338708, + "learning_rate": 1.2798710276457838e-05, + "loss": 1.9163, + "step": 8716 + }, + { + "epoch": 0.67, + "grad_norm": 0.5888493704435825, + "learning_rate": 1.279325844132038e-05, + "loss": 1.8946, + "step": 8717 + }, + { + "epoch": 0.67, + "grad_norm": 0.5820104097078809, + "learning_rate": 1.2787807368299953e-05, + "loss": 1.8968, + "step": 8718 + }, + { + "epoch": 0.67, + "grad_norm": 0.6798128080974845, + "learning_rate": 1.2782357057736893e-05, + "loss": 2.0939, + "step": 8719 + }, + { + "epoch": 0.67, + "grad_norm": 0.6774942971158335, + "learning_rate": 1.2776907509971453e-05, + "loss": 1.8877, + "step": 8720 + }, + { + "epoch": 0.67, + "grad_norm": 0.6069237605141844, + "learning_rate": 1.2771458725343913e-05, + "loss": 1.8867, + "step": 8721 + }, + { + "epoch": 0.67, + "grad_norm": 0.6015872260863675, + "learning_rate": 1.2766010704194429e-05, + "loss": 1.877, + "step": 8722 + }, + { + "epoch": 0.67, + "grad_norm": 0.5476016472505173, + "learning_rate": 1.2760563446863144e-05, + "loss": 2.1346, + "step": 8723 + }, + { + "epoch": 0.67, + "grad_norm": 0.5648279566715566, + "learning_rate": 1.2755116953690185e-05, + "loss": 1.8936, + "step": 8724 + }, + { + "epoch": 0.67, + "grad_norm": 0.6231963609137519, + "learning_rate": 1.274967122501557e-05, + "loss": 1.9462, + "step": 8725 + }, + { + "epoch": 0.67, + "grad_norm": 0.6387268435225342, + "learning_rate": 1.2744226261179292e-05, + "loss": 1.9723, + "step": 8726 + }, + { + "epoch": 0.67, + "grad_norm": 0.5736787226257911, + "learning_rate": 1.2738782062521338e-05, + "loss": 2.0329, + "step": 8727 + }, + { + "epoch": 0.67, + "grad_norm": 0.6097582913622503, + "learning_rate": 1.273333862938158e-05, + "loss": 1.8371, + "step": 8728 + }, + { + "epoch": 0.67, + "grad_norm": 0.617895908236208, + "learning_rate": 1.2727895962099886e-05, + "loss": 1.9564, + "step": 8729 + }, + { + "epoch": 0.67, + "grad_norm": 0.5677837355447011, + "learning_rate": 1.2722454061016059e-05, + "loss": 1.8741, + "step": 8730 + }, + { + "epoch": 0.67, + "grad_norm": 0.6623722375513699, + "learning_rate": 1.2717012926469867e-05, + "loss": 2.0816, + "step": 8731 + }, + { + "epoch": 0.67, + "grad_norm": 0.6722682742782295, + "learning_rate": 1.2711572558801016e-05, + "loss": 1.9458, + "step": 8732 + }, + { + "epoch": 0.67, + "grad_norm": 0.6579497860217047, + "learning_rate": 1.2706132958349171e-05, + "loss": 1.8893, + "step": 8733 + }, + { + "epoch": 0.67, + "grad_norm": 0.6600464651354379, + "learning_rate": 1.2700694125453949e-05, + "loss": 1.8756, + "step": 8734 + }, + { + "epoch": 0.67, + "grad_norm": 0.6848976771683926, + "learning_rate": 1.2695256060454922e-05, + "loss": 1.9648, + "step": 8735 + }, + { + "epoch": 0.67, + "grad_norm": 0.7453968710889805, + "learning_rate": 1.2689818763691608e-05, + "loss": 2.0642, + "step": 8736 + }, + { + "epoch": 0.67, + "grad_norm": 0.5756189041668505, + "learning_rate": 1.2684382235503478e-05, + "loss": 1.9392, + "step": 8737 + }, + { + "epoch": 0.67, + "grad_norm": 0.6617490335438091, + "learning_rate": 1.2678946476229959e-05, + "loss": 1.8536, + "step": 8738 + }, + { + "epoch": 0.67, + "grad_norm": 0.7506384916913145, + "learning_rate": 1.2673511486210429e-05, + "loss": 2.1051, + "step": 8739 + }, + { + "epoch": 0.67, + "grad_norm": 0.581873285162329, + "learning_rate": 1.2668077265784214e-05, + "loss": 1.8506, + "step": 8740 + }, + { + "epoch": 0.67, + "grad_norm": 0.595695703777978, + "learning_rate": 1.2662643815290598e-05, + "loss": 1.9281, + "step": 8741 + }, + { + "epoch": 0.67, + "grad_norm": 0.6947511778094969, + "learning_rate": 1.265721113506881e-05, + "loss": 1.8439, + "step": 8742 + }, + { + "epoch": 0.67, + "grad_norm": 0.6449173664146374, + "learning_rate": 1.2651779225458038e-05, + "loss": 1.998, + "step": 8743 + }, + { + "epoch": 0.67, + "grad_norm": 0.6224350317698285, + "learning_rate": 1.2646348086797417e-05, + "loss": 1.8989, + "step": 8744 + }, + { + "epoch": 0.67, + "grad_norm": 0.6790410130954976, + "learning_rate": 1.2640917719426032e-05, + "loss": 1.9553, + "step": 8745 + }, + { + "epoch": 0.67, + "grad_norm": 0.583889671221694, + "learning_rate": 1.263548812368293e-05, + "loss": 1.9089, + "step": 8746 + }, + { + "epoch": 0.67, + "grad_norm": 0.5969628962423894, + "learning_rate": 1.2630059299907096e-05, + "loss": 1.9486, + "step": 8747 + }, + { + "epoch": 0.67, + "grad_norm": 0.5840452399107695, + "learning_rate": 1.262463124843748e-05, + "loss": 2.0745, + "step": 8748 + }, + { + "epoch": 0.67, + "grad_norm": 0.6437609757353934, + "learning_rate": 1.261920396961297e-05, + "loss": 1.871, + "step": 8749 + }, + { + "epoch": 0.68, + "grad_norm": 0.5717221585252856, + "learning_rate": 1.2613777463772431e-05, + "loss": 1.8757, + "step": 8750 + }, + { + "epoch": 0.68, + "grad_norm": 0.5723605872703034, + "learning_rate": 1.2608351731254628e-05, + "loss": 2.1028, + "step": 8751 + }, + { + "epoch": 0.68, + "grad_norm": 0.5586435057115842, + "learning_rate": 1.2602926772398344e-05, + "loss": 1.8964, + "step": 8752 + }, + { + "epoch": 0.68, + "grad_norm": 0.5954265559492661, + "learning_rate": 1.2597502587542267e-05, + "loss": 1.8493, + "step": 8753 + }, + { + "epoch": 0.68, + "grad_norm": 0.5942395390469012, + "learning_rate": 1.2592079177025056e-05, + "loss": 1.9371, + "step": 8754 + }, + { + "epoch": 0.68, + "grad_norm": 0.6281826293343765, + "learning_rate": 1.2586656541185327e-05, + "loss": 2.0527, + "step": 8755 + }, + { + "epoch": 0.68, + "grad_norm": 0.6432272039943556, + "learning_rate": 1.2581234680361604e-05, + "loss": 1.9081, + "step": 8756 + }, + { + "epoch": 0.68, + "grad_norm": 0.6475550466941161, + "learning_rate": 1.2575813594892425e-05, + "loss": 1.9237, + "step": 8757 + }, + { + "epoch": 0.68, + "grad_norm": 0.5851996478684167, + "learning_rate": 1.2570393285116255e-05, + "loss": 1.8926, + "step": 8758 + }, + { + "epoch": 0.68, + "grad_norm": 0.6510044105894766, + "learning_rate": 1.2564973751371472e-05, + "loss": 2.0843, + "step": 8759 + }, + { + "epoch": 0.68, + "grad_norm": 0.7124363025477015, + "learning_rate": 1.2559554993996475e-05, + "loss": 1.9566, + "step": 8760 + }, + { + "epoch": 0.68, + "grad_norm": 0.5451627289675968, + "learning_rate": 1.2554137013329575e-05, + "loss": 1.8964, + "step": 8761 + }, + { + "epoch": 0.68, + "grad_norm": 0.5661923208633487, + "learning_rate": 1.2548719809709014e-05, + "loss": 1.8784, + "step": 8762 + }, + { + "epoch": 0.68, + "grad_norm": 0.6699562863447603, + "learning_rate": 1.2543303383473042e-05, + "loss": 2.1363, + "step": 8763 + }, + { + "epoch": 0.68, + "grad_norm": 0.714096483959045, + "learning_rate": 1.2537887734959802e-05, + "loss": 1.9116, + "step": 8764 + }, + { + "epoch": 0.68, + "grad_norm": 0.5793116362609599, + "learning_rate": 1.2532472864507422e-05, + "loss": 1.9082, + "step": 8765 + }, + { + "epoch": 0.68, + "grad_norm": 0.6544742867909544, + "learning_rate": 1.2527058772453994e-05, + "loss": 1.9758, + "step": 8766 + }, + { + "epoch": 0.68, + "grad_norm": 0.7800509719635721, + "learning_rate": 1.252164545913751e-05, + "loss": 1.9075, + "step": 8767 + }, + { + "epoch": 0.68, + "grad_norm": 0.5501263086274368, + "learning_rate": 1.2516232924895981e-05, + "loss": 2.0744, + "step": 8768 + }, + { + "epoch": 0.68, + "grad_norm": 0.644350824962762, + "learning_rate": 1.2510821170067305e-05, + "loss": 1.8681, + "step": 8769 + }, + { + "epoch": 0.68, + "grad_norm": 0.7992225186303318, + "learning_rate": 1.2505410194989359e-05, + "loss": 1.8871, + "step": 8770 + }, + { + "epoch": 0.68, + "grad_norm": 0.6096261619417377, + "learning_rate": 1.2500000000000006e-05, + "loss": 2.0819, + "step": 8771 + }, + { + "epoch": 0.68, + "grad_norm": 0.6788025443794176, + "learning_rate": 1.2494590585436994e-05, + "loss": 1.9351, + "step": 8772 + }, + { + "epoch": 0.68, + "grad_norm": 0.6759960931689781, + "learning_rate": 1.2489181951638054e-05, + "loss": 1.8647, + "step": 8773 + }, + { + "epoch": 0.68, + "grad_norm": 0.5961771150630039, + "learning_rate": 1.24837740989409e-05, + "loss": 1.8634, + "step": 8774 + }, + { + "epoch": 0.68, + "grad_norm": 0.628692511645392, + "learning_rate": 1.2478367027683138e-05, + "loss": 2.0496, + "step": 8775 + }, + { + "epoch": 0.68, + "grad_norm": 0.7063952800830876, + "learning_rate": 1.2472960738202364e-05, + "loss": 1.8908, + "step": 8776 + }, + { + "epoch": 0.68, + "grad_norm": 0.7021218590966916, + "learning_rate": 1.2467555230836112e-05, + "loss": 1.8638, + "step": 8777 + }, + { + "epoch": 0.68, + "grad_norm": 0.5610603436504511, + "learning_rate": 1.2462150505921872e-05, + "loss": 1.9725, + "step": 8778 + }, + { + "epoch": 0.68, + "grad_norm": 0.6714642554802152, + "learning_rate": 1.2456746563797081e-05, + "loss": 1.8773, + "step": 8779 + }, + { + "epoch": 0.68, + "grad_norm": 0.6677076745605204, + "learning_rate": 1.2451343404799132e-05, + "loss": 2.073, + "step": 8780 + }, + { + "epoch": 0.68, + "grad_norm": 0.5861338936795244, + "learning_rate": 1.2445941029265357e-05, + "loss": 1.8537, + "step": 8781 + }, + { + "epoch": 0.68, + "grad_norm": 0.619341397626377, + "learning_rate": 1.2440539437533075e-05, + "loss": 1.8861, + "step": 8782 + }, + { + "epoch": 0.68, + "grad_norm": 0.6662572628293926, + "learning_rate": 1.24351386299395e-05, + "loss": 2.0641, + "step": 8783 + }, + { + "epoch": 0.68, + "grad_norm": 0.705633029341517, + "learning_rate": 1.2429738606821842e-05, + "loss": 1.8344, + "step": 8784 + }, + { + "epoch": 0.68, + "grad_norm": 0.6145125928571025, + "learning_rate": 1.2424339368517241e-05, + "loss": 1.9325, + "step": 8785 + }, + { + "epoch": 0.68, + "grad_norm": 0.5865299997432878, + "learning_rate": 1.2418940915362792e-05, + "loss": 1.8549, + "step": 8786 + }, + { + "epoch": 0.68, + "grad_norm": 0.7310860049384014, + "learning_rate": 1.241354324769555e-05, + "loss": 2.0772, + "step": 8787 + }, + { + "epoch": 0.68, + "grad_norm": 0.5932857037420229, + "learning_rate": 1.240814636585251e-05, + "loss": 1.8949, + "step": 8788 + }, + { + "epoch": 0.68, + "grad_norm": 0.6091244034045606, + "learning_rate": 1.2402750270170619e-05, + "loss": 1.8901, + "step": 8789 + }, + { + "epoch": 0.68, + "grad_norm": 0.7422317422346834, + "learning_rate": 1.2397354960986778e-05, + "loss": 1.8867, + "step": 8790 + }, + { + "epoch": 0.68, + "grad_norm": 0.5548355465916321, + "learning_rate": 1.2391960438637843e-05, + "loss": 1.9537, + "step": 8791 + }, + { + "epoch": 0.68, + "grad_norm": 0.6548998747751967, + "learning_rate": 1.2386566703460612e-05, + "loss": 2.0772, + "step": 8792 + }, + { + "epoch": 0.68, + "grad_norm": 0.6379166524627451, + "learning_rate": 1.238117375579184e-05, + "loss": 1.9479, + "step": 8793 + }, + { + "epoch": 0.68, + "grad_norm": 0.6107371196293422, + "learning_rate": 1.2375781595968229e-05, + "loss": 1.8995, + "step": 8794 + }, + { + "epoch": 0.68, + "grad_norm": 0.6392314146193777, + "learning_rate": 1.2370390224326434e-05, + "loss": 2.0415, + "step": 8795 + }, + { + "epoch": 0.68, + "grad_norm": 0.583465814577676, + "learning_rate": 1.236499964120306e-05, + "loss": 1.9118, + "step": 8796 + }, + { + "epoch": 0.68, + "grad_norm": 0.6644074057218952, + "learning_rate": 1.2359609846934675e-05, + "loss": 1.967, + "step": 8797 + }, + { + "epoch": 0.68, + "grad_norm": 0.5946191306395001, + "learning_rate": 1.2354220841857755e-05, + "loss": 1.8817, + "step": 8798 + }, + { + "epoch": 0.68, + "grad_norm": 0.6077100075031053, + "learning_rate": 1.234883262630879e-05, + "loss": 1.9032, + "step": 8799 + }, + { + "epoch": 0.68, + "grad_norm": 0.6666430370992059, + "learning_rate": 1.2343445200624174e-05, + "loss": 2.0219, + "step": 8800 + }, + { + "epoch": 0.68, + "grad_norm": 0.5626193383892211, + "learning_rate": 1.233805856514027e-05, + "loss": 1.7948, + "step": 8801 + }, + { + "epoch": 0.68, + "grad_norm": 0.6549132647330811, + "learning_rate": 1.2332672720193395e-05, + "loss": 1.8796, + "step": 8802 + }, + { + "epoch": 0.68, + "grad_norm": 0.6518883757999413, + "learning_rate": 1.2327287666119784e-05, + "loss": 1.9805, + "step": 8803 + }, + { + "epoch": 0.68, + "grad_norm": 0.6077235601742319, + "learning_rate": 1.2321903403255672e-05, + "loss": 2.0926, + "step": 8804 + }, + { + "epoch": 0.68, + "grad_norm": 0.6216077700134023, + "learning_rate": 1.2316519931937223e-05, + "loss": 1.8612, + "step": 8805 + }, + { + "epoch": 0.68, + "grad_norm": 0.6743369199991857, + "learning_rate": 1.2311137252500521e-05, + "loss": 1.9098, + "step": 8806 + }, + { + "epoch": 0.68, + "grad_norm": 0.6188633363446723, + "learning_rate": 1.2305755365281658e-05, + "loss": 2.0393, + "step": 8807 + }, + { + "epoch": 0.68, + "grad_norm": 0.5770180604480277, + "learning_rate": 1.2300374270616646e-05, + "loss": 1.8961, + "step": 8808 + }, + { + "epoch": 0.68, + "grad_norm": 0.5520806635559293, + "learning_rate": 1.229499396884142e-05, + "loss": 1.9026, + "step": 8809 + }, + { + "epoch": 0.68, + "grad_norm": 0.6135668284606721, + "learning_rate": 1.2289614460291932e-05, + "loss": 1.8928, + "step": 8810 + }, + { + "epoch": 0.68, + "grad_norm": 0.6145064538993978, + "learning_rate": 1.228423574530402e-05, + "loss": 1.8568, + "step": 8811 + }, + { + "epoch": 0.68, + "grad_norm": 0.585015988062962, + "learning_rate": 1.22788578242135e-05, + "loss": 2.0865, + "step": 8812 + }, + { + "epoch": 0.68, + "grad_norm": 0.5957807166786357, + "learning_rate": 1.2273480697356163e-05, + "loss": 1.836, + "step": 8813 + }, + { + "epoch": 0.68, + "grad_norm": 0.5567437321043897, + "learning_rate": 1.2268104365067702e-05, + "loss": 1.9116, + "step": 8814 + }, + { + "epoch": 0.68, + "grad_norm": 0.6368441550066674, + "learning_rate": 1.2262728827683776e-05, + "loss": 2.1079, + "step": 8815 + }, + { + "epoch": 0.68, + "grad_norm": 0.5674927031967735, + "learning_rate": 1.2257354085540034e-05, + "loss": 1.9611, + "step": 8816 + }, + { + "epoch": 0.68, + "grad_norm": 0.5725891608370609, + "learning_rate": 1.2251980138972009e-05, + "loss": 1.8493, + "step": 8817 + }, + { + "epoch": 0.68, + "grad_norm": 0.6423943796850572, + "learning_rate": 1.2246606988315251e-05, + "loss": 1.8758, + "step": 8818 + }, + { + "epoch": 0.68, + "grad_norm": 0.6585748741224486, + "learning_rate": 1.2241234633905202e-05, + "loss": 1.8859, + "step": 8819 + }, + { + "epoch": 0.68, + "grad_norm": 0.6650097545311923, + "learning_rate": 1.2235863076077279e-05, + "loss": 2.0411, + "step": 8820 + }, + { + "epoch": 0.68, + "grad_norm": 0.593182295754304, + "learning_rate": 1.2230492315166881e-05, + "loss": 1.8899, + "step": 8821 + }, + { + "epoch": 0.68, + "grad_norm": 0.6889164621736298, + "learning_rate": 1.2225122351509294e-05, + "loss": 1.9523, + "step": 8822 + }, + { + "epoch": 0.68, + "grad_norm": 0.5929854939059906, + "learning_rate": 1.2219753185439801e-05, + "loss": 1.8882, + "step": 8823 + }, + { + "epoch": 0.68, + "grad_norm": 0.6254372045136838, + "learning_rate": 1.2214384817293618e-05, + "loss": 2.0553, + "step": 8824 + }, + { + "epoch": 0.68, + "grad_norm": 0.6058496520986417, + "learning_rate": 1.2209017247405916e-05, + "loss": 1.8983, + "step": 8825 + }, + { + "epoch": 0.68, + "grad_norm": 0.6168582899922456, + "learning_rate": 1.2203650476111814e-05, + "loss": 1.9247, + "step": 8826 + }, + { + "epoch": 0.68, + "grad_norm": 0.6046463546199636, + "learning_rate": 1.2198284503746382e-05, + "loss": 2.087, + "step": 8827 + }, + { + "epoch": 0.68, + "grad_norm": 0.6461352854307241, + "learning_rate": 1.2192919330644639e-05, + "loss": 1.925, + "step": 8828 + }, + { + "epoch": 0.68, + "grad_norm": 0.6035699882418979, + "learning_rate": 1.2187554957141556e-05, + "loss": 1.9059, + "step": 8829 + }, + { + "epoch": 0.68, + "grad_norm": 0.5421313133058074, + "learning_rate": 1.218219138357205e-05, + "loss": 1.8411, + "step": 8830 + }, + { + "epoch": 0.68, + "grad_norm": 0.5412003136050683, + "learning_rate": 1.2176828610270994e-05, + "loss": 1.9012, + "step": 8831 + }, + { + "epoch": 0.68, + "grad_norm": 0.5596224861619358, + "learning_rate": 1.2171466637573206e-05, + "loss": 2.0319, + "step": 8832 + }, + { + "epoch": 0.68, + "grad_norm": 0.6028913675159295, + "learning_rate": 1.2166105465813458e-05, + "loss": 1.9113, + "step": 8833 + }, + { + "epoch": 0.68, + "grad_norm": 0.5302855995389067, + "learning_rate": 1.2160745095326467e-05, + "loss": 1.9256, + "step": 8834 + }, + { + "epoch": 0.68, + "grad_norm": 0.5375513492834442, + "learning_rate": 1.2155385526446906e-05, + "loss": 1.8875, + "step": 8835 + }, + { + "epoch": 0.68, + "grad_norm": 0.5805135815755255, + "learning_rate": 1.2150026759509392e-05, + "loss": 2.1003, + "step": 8836 + }, + { + "epoch": 0.68, + "grad_norm": 0.6524744472052294, + "learning_rate": 1.21446687948485e-05, + "loss": 1.9039, + "step": 8837 + }, + { + "epoch": 0.68, + "grad_norm": 0.6662964546437532, + "learning_rate": 1.2139311632798741e-05, + "loss": 1.8916, + "step": 8838 + }, + { + "epoch": 0.68, + "grad_norm": 0.5951191752042527, + "learning_rate": 1.2133955273694595e-05, + "loss": 2.0872, + "step": 8839 + }, + { + "epoch": 0.68, + "grad_norm": 0.5958990018643873, + "learning_rate": 1.2128599717870474e-05, + "loss": 1.9315, + "step": 8840 + }, + { + "epoch": 0.68, + "grad_norm": 0.6241624007750445, + "learning_rate": 1.2123244965660748e-05, + "loss": 1.8489, + "step": 8841 + }, + { + "epoch": 0.68, + "grad_norm": 0.6068461162751405, + "learning_rate": 1.211789101739974e-05, + "loss": 1.9049, + "step": 8842 + }, + { + "epoch": 0.68, + "grad_norm": 0.5675627637799707, + "learning_rate": 1.2112537873421714e-05, + "loss": 1.8964, + "step": 8843 + }, + { + "epoch": 0.68, + "grad_norm": 0.5508374290162419, + "learning_rate": 1.2107185534060906e-05, + "loss": 2.0937, + "step": 8844 + }, + { + "epoch": 0.68, + "grad_norm": 0.5976408528316398, + "learning_rate": 1.2101833999651446e-05, + "loss": 1.8768, + "step": 8845 + }, + { + "epoch": 0.68, + "grad_norm": 0.5834468549355947, + "learning_rate": 1.2096483270527489e-05, + "loss": 1.8995, + "step": 8846 + }, + { + "epoch": 0.68, + "grad_norm": 0.5437978974703385, + "learning_rate": 1.2091133347023099e-05, + "loss": 1.9792, + "step": 8847 + }, + { + "epoch": 0.68, + "grad_norm": 0.5937528487611289, + "learning_rate": 1.2085784229472266e-05, + "loss": 2.0644, + "step": 8848 + }, + { + "epoch": 0.68, + "grad_norm": 0.5584538869829736, + "learning_rate": 1.2080435918208997e-05, + "loss": 1.9118, + "step": 8849 + }, + { + "epoch": 0.68, + "grad_norm": 0.5983310220381675, + "learning_rate": 1.2075088413567168e-05, + "loss": 1.8529, + "step": 8850 + }, + { + "epoch": 0.68, + "grad_norm": 0.5942272224507877, + "learning_rate": 1.2069741715880676e-05, + "loss": 1.8811, + "step": 8851 + }, + { + "epoch": 0.68, + "grad_norm": 0.6019786597771181, + "learning_rate": 1.2064395825483335e-05, + "loss": 2.0718, + "step": 8852 + }, + { + "epoch": 0.68, + "grad_norm": 0.6501016498188937, + "learning_rate": 1.2059050742708888e-05, + "loss": 1.999, + "step": 8853 + }, + { + "epoch": 0.68, + "grad_norm": 0.5902441354163089, + "learning_rate": 1.2053706467891074e-05, + "loss": 1.8607, + "step": 8854 + }, + { + "epoch": 0.68, + "grad_norm": 0.5682853050172568, + "learning_rate": 1.2048363001363557e-05, + "loss": 1.8497, + "step": 8855 + }, + { + "epoch": 0.68, + "grad_norm": 0.6331533186245029, + "learning_rate": 1.204302034345993e-05, + "loss": 2.0867, + "step": 8856 + }, + { + "epoch": 0.68, + "grad_norm": 0.5568823901478561, + "learning_rate": 1.2037678494513787e-05, + "loss": 1.9061, + "step": 8857 + }, + { + "epoch": 0.68, + "grad_norm": 0.5942284791509133, + "learning_rate": 1.203233745485862e-05, + "loss": 1.8537, + "step": 8858 + }, + { + "epoch": 0.68, + "grad_norm": 0.5681410685758121, + "learning_rate": 1.2026997224827884e-05, + "loss": 1.9808, + "step": 8859 + }, + { + "epoch": 0.68, + "grad_norm": 0.5806763038192028, + "learning_rate": 1.2021657804755027e-05, + "loss": 2.0807, + "step": 8860 + }, + { + "epoch": 0.68, + "grad_norm": 0.5715662261749004, + "learning_rate": 1.201631919497338e-05, + "loss": 1.8743, + "step": 8861 + }, + { + "epoch": 0.68, + "grad_norm": 0.6674112982937261, + "learning_rate": 1.2010981395816257e-05, + "loss": 1.8547, + "step": 8862 + }, + { + "epoch": 0.68, + "grad_norm": 0.7059145218294421, + "learning_rate": 1.2005644407616943e-05, + "loss": 1.9202, + "step": 8863 + }, + { + "epoch": 0.68, + "grad_norm": 0.6036741212008151, + "learning_rate": 1.200030823070861e-05, + "loss": 2.1309, + "step": 8864 + }, + { + "epoch": 0.68, + "grad_norm": 0.6185832540199722, + "learning_rate": 1.1994972865424458e-05, + "loss": 1.9757, + "step": 8865 + }, + { + "epoch": 0.68, + "grad_norm": 0.6150503598869486, + "learning_rate": 1.1989638312097567e-05, + "loss": 1.8713, + "step": 8866 + }, + { + "epoch": 0.68, + "grad_norm": 0.6263990557912182, + "learning_rate": 1.1984304571060994e-05, + "loss": 1.882, + "step": 8867 + }, + { + "epoch": 0.68, + "grad_norm": 0.6550664095643863, + "learning_rate": 1.1978971642647777e-05, + "loss": 2.1173, + "step": 8868 + }, + { + "epoch": 0.68, + "grad_norm": 0.663525514048317, + "learning_rate": 1.1973639527190842e-05, + "loss": 1.8936, + "step": 8869 + }, + { + "epoch": 0.68, + "grad_norm": 0.609334133523607, + "learning_rate": 1.1968308225023104e-05, + "loss": 1.9084, + "step": 8870 + }, + { + "epoch": 0.68, + "grad_norm": 0.6579913967017702, + "learning_rate": 1.1962977736477418e-05, + "loss": 2.0755, + "step": 8871 + }, + { + "epoch": 0.68, + "grad_norm": 0.5412014913664036, + "learning_rate": 1.1957648061886592e-05, + "loss": 1.9265, + "step": 8872 + }, + { + "epoch": 0.68, + "grad_norm": 0.6529214344120083, + "learning_rate": 1.1952319201583375e-05, + "loss": 1.8839, + "step": 8873 + }, + { + "epoch": 0.68, + "grad_norm": 0.690778401363497, + "learning_rate": 1.1946991155900473e-05, + "loss": 1.908, + "step": 8874 + }, + { + "epoch": 0.68, + "grad_norm": 0.5600634155290309, + "learning_rate": 1.1941663925170536e-05, + "loss": 1.876, + "step": 8875 + }, + { + "epoch": 0.68, + "grad_norm": 0.574977154053681, + "learning_rate": 1.1936337509726165e-05, + "loss": 2.1155, + "step": 8876 + }, + { + "epoch": 0.68, + "grad_norm": 0.6845992120299016, + "learning_rate": 1.1931011909899909e-05, + "loss": 1.8366, + "step": 8877 + }, + { + "epoch": 0.68, + "grad_norm": 0.6181392346632707, + "learning_rate": 1.1925687126024268e-05, + "loss": 1.9286, + "step": 8878 + }, + { + "epoch": 0.69, + "grad_norm": 0.5679508684837916, + "learning_rate": 1.1920363158431691e-05, + "loss": 1.806, + "step": 8879 + }, + { + "epoch": 0.69, + "grad_norm": 0.5953247742092924, + "learning_rate": 1.1915040007454576e-05, + "loss": 2.0837, + "step": 8880 + }, + { + "epoch": 0.69, + "grad_norm": 0.6235760091207185, + "learning_rate": 1.1909717673425266e-05, + "loss": 1.8956, + "step": 8881 + }, + { + "epoch": 0.69, + "grad_norm": 0.5604918472928406, + "learning_rate": 1.1904396156676058e-05, + "loss": 1.8441, + "step": 8882 + }, + { + "epoch": 0.69, + "grad_norm": 0.6777452150674497, + "learning_rate": 1.1899075457539196e-05, + "loss": 1.8254, + "step": 8883 + }, + { + "epoch": 0.69, + "grad_norm": 0.6023650247270822, + "learning_rate": 1.1893755576346874e-05, + "loss": 2.1022, + "step": 8884 + }, + { + "epoch": 0.69, + "grad_norm": 0.6027617035413784, + "learning_rate": 1.1888436513431233e-05, + "loss": 1.8963, + "step": 8885 + }, + { + "epoch": 0.69, + "grad_norm": 0.6539234529159501, + "learning_rate": 1.1883118269124364e-05, + "loss": 1.8679, + "step": 8886 + }, + { + "epoch": 0.69, + "grad_norm": 0.6230029363436617, + "learning_rate": 1.1877800843758308e-05, + "loss": 1.8864, + "step": 8887 + }, + { + "epoch": 0.69, + "grad_norm": 0.6428341291666918, + "learning_rate": 1.187248423766505e-05, + "loss": 2.0338, + "step": 8888 + }, + { + "epoch": 0.69, + "grad_norm": 0.613766123947066, + "learning_rate": 1.186716845117653e-05, + "loss": 1.8405, + "step": 8889 + }, + { + "epoch": 0.69, + "grad_norm": 0.5532933969053196, + "learning_rate": 1.1861853484624638e-05, + "loss": 1.9795, + "step": 8890 + }, + { + "epoch": 0.69, + "grad_norm": 0.6169487386176906, + "learning_rate": 1.1856539338341213e-05, + "loss": 1.868, + "step": 8891 + }, + { + "epoch": 0.69, + "grad_norm": 0.5960371645660122, + "learning_rate": 1.1851226012658015e-05, + "loss": 2.087, + "step": 8892 + }, + { + "epoch": 0.69, + "grad_norm": 0.5721951551728145, + "learning_rate": 1.18459135079068e-05, + "loss": 1.8576, + "step": 8893 + }, + { + "epoch": 0.69, + "grad_norm": 0.6220000348786602, + "learning_rate": 1.1840601824419254e-05, + "loss": 1.9147, + "step": 8894 + }, + { + "epoch": 0.69, + "grad_norm": 0.5900780264400767, + "learning_rate": 1.1835290962526977e-05, + "loss": 1.8963, + "step": 8895 + }, + { + "epoch": 0.69, + "grad_norm": 0.5439461672235679, + "learning_rate": 1.1829980922561578e-05, + "loss": 2.0645, + "step": 8896 + }, + { + "epoch": 0.69, + "grad_norm": 0.6130964610757279, + "learning_rate": 1.1824671704854574e-05, + "loss": 1.8606, + "step": 8897 + }, + { + "epoch": 0.69, + "grad_norm": 0.6065454504427, + "learning_rate": 1.181936330973744e-05, + "loss": 1.8275, + "step": 8898 + }, + { + "epoch": 0.69, + "grad_norm": 0.6110003175225011, + "learning_rate": 1.181405573754161e-05, + "loss": 1.9116, + "step": 8899 + }, + { + "epoch": 0.69, + "grad_norm": 0.6051730352268181, + "learning_rate": 1.1808748988598432e-05, + "loss": 2.1061, + "step": 8900 + }, + { + "epoch": 0.69, + "grad_norm": 0.5542959880384529, + "learning_rate": 1.1803443063239253e-05, + "loss": 1.9033, + "step": 8901 + }, + { + "epoch": 0.69, + "grad_norm": 0.5991864687821332, + "learning_rate": 1.179813796179535e-05, + "loss": 1.8917, + "step": 8902 + }, + { + "epoch": 0.69, + "grad_norm": 0.5835249598383039, + "learning_rate": 1.1792833684597907e-05, + "loss": 1.9983, + "step": 8903 + }, + { + "epoch": 0.69, + "grad_norm": 0.618029452236078, + "learning_rate": 1.1787530231978134e-05, + "loss": 2.0739, + "step": 8904 + }, + { + "epoch": 0.69, + "grad_norm": 0.6509764650270039, + "learning_rate": 1.1782227604267115e-05, + "loss": 1.8991, + "step": 8905 + }, + { + "epoch": 0.69, + "grad_norm": 0.5639464677399753, + "learning_rate": 1.1776925801795918e-05, + "loss": 1.8663, + "step": 8906 + }, + { + "epoch": 0.69, + "grad_norm": 0.6631683405041704, + "learning_rate": 1.1771624824895584e-05, + "loss": 1.8441, + "step": 8907 + }, + { + "epoch": 0.69, + "grad_norm": 0.7244930170314408, + "learning_rate": 1.1766324673897048e-05, + "loss": 2.044, + "step": 8908 + }, + { + "epoch": 0.69, + "grad_norm": 0.536179444579546, + "learning_rate": 1.1761025349131216e-05, + "loss": 1.9369, + "step": 8909 + }, + { + "epoch": 0.69, + "grad_norm": 0.698844605755115, + "learning_rate": 1.1755726850928978e-05, + "loss": 1.8842, + "step": 8910 + }, + { + "epoch": 0.69, + "grad_norm": 0.586336241273513, + "learning_rate": 1.1750429179621113e-05, + "loss": 1.9146, + "step": 8911 + }, + { + "epoch": 0.69, + "grad_norm": 0.6575251979932237, + "learning_rate": 1.1745132335538383e-05, + "loss": 2.1108, + "step": 8912 + }, + { + "epoch": 0.69, + "grad_norm": 0.7304544296409403, + "learning_rate": 1.1739836319011496e-05, + "loss": 1.9049, + "step": 8913 + }, + { + "epoch": 0.69, + "grad_norm": 0.5706283641550365, + "learning_rate": 1.1734541130371091e-05, + "loss": 1.8179, + "step": 8914 + }, + { + "epoch": 0.69, + "grad_norm": 0.7423358573463116, + "learning_rate": 1.17292467699478e-05, + "loss": 1.9911, + "step": 8915 + }, + { + "epoch": 0.69, + "grad_norm": 0.6387765270577971, + "learning_rate": 1.1723953238072138e-05, + "loss": 2.0679, + "step": 8916 + }, + { + "epoch": 0.69, + "grad_norm": 0.6428468345137285, + "learning_rate": 1.1718660535074617e-05, + "loss": 1.8761, + "step": 8917 + }, + { + "epoch": 0.69, + "grad_norm": 0.6521253366768809, + "learning_rate": 1.1713368661285679e-05, + "loss": 1.8638, + "step": 8918 + }, + { + "epoch": 0.69, + "grad_norm": 0.552787582184753, + "learning_rate": 1.170807761703572e-05, + "loss": 1.9127, + "step": 8919 + }, + { + "epoch": 0.69, + "grad_norm": 0.6269348775012485, + "learning_rate": 1.1702787402655077e-05, + "loss": 2.0874, + "step": 8920 + }, + { + "epoch": 0.69, + "grad_norm": 0.6747665622647425, + "learning_rate": 1.1697498018474045e-05, + "loss": 1.987, + "step": 8921 + }, + { + "epoch": 0.69, + "grad_norm": 0.6037883675324819, + "learning_rate": 1.169220946482286e-05, + "loss": 1.8746, + "step": 8922 + }, + { + "epoch": 0.69, + "grad_norm": 0.7517759081716553, + "learning_rate": 1.1686921742031706e-05, + "loss": 1.9458, + "step": 8923 + }, + { + "epoch": 0.69, + "grad_norm": 0.689681542275298, + "learning_rate": 1.168163485043072e-05, + "loss": 2.0796, + "step": 8924 + }, + { + "epoch": 0.69, + "grad_norm": 0.6042231150947777, + "learning_rate": 1.1676348790349984e-05, + "loss": 1.8489, + "step": 8925 + }, + { + "epoch": 0.69, + "grad_norm": 0.7444022091593935, + "learning_rate": 1.1671063562119527e-05, + "loss": 1.9177, + "step": 8926 + }, + { + "epoch": 0.69, + "grad_norm": 0.5372101566769004, + "learning_rate": 1.1665779166069329e-05, + "loss": 1.9518, + "step": 8927 + }, + { + "epoch": 0.69, + "grad_norm": 0.7321614206163566, + "learning_rate": 1.1660495602529317e-05, + "loss": 2.0587, + "step": 8928 + }, + { + "epoch": 0.69, + "grad_norm": 0.5558316182831381, + "learning_rate": 1.1655212871829363e-05, + "loss": 1.8393, + "step": 8929 + }, + { + "epoch": 0.69, + "grad_norm": 0.5861100510809774, + "learning_rate": 1.1649930974299292e-05, + "loss": 1.8132, + "step": 8930 + }, + { + "epoch": 0.69, + "grad_norm": 0.6681001288460169, + "learning_rate": 1.1644649910268874e-05, + "loss": 1.8697, + "step": 8931 + }, + { + "epoch": 0.69, + "grad_norm": 0.713585293462639, + "learning_rate": 1.1639369680067826e-05, + "loss": 2.0735, + "step": 8932 + }, + { + "epoch": 0.69, + "grad_norm": 0.6395908342134343, + "learning_rate": 1.1634090284025818e-05, + "loss": 1.8464, + "step": 8933 + }, + { + "epoch": 0.69, + "grad_norm": 0.6980486669324184, + "learning_rate": 1.1628811722472461e-05, + "loss": 1.9269, + "step": 8934 + }, + { + "epoch": 0.69, + "grad_norm": 0.6742857271931811, + "learning_rate": 1.1623533995737318e-05, + "loss": 1.8838, + "step": 8935 + }, + { + "epoch": 0.69, + "grad_norm": 0.6776423470723552, + "learning_rate": 1.1618257104149898e-05, + "loss": 2.0972, + "step": 8936 + }, + { + "epoch": 0.69, + "grad_norm": 0.6642118329494529, + "learning_rate": 1.1612981048039662e-05, + "loss": 1.8567, + "step": 8937 + }, + { + "epoch": 0.69, + "grad_norm": 0.5561285820399503, + "learning_rate": 1.1607705827736023e-05, + "loss": 1.8689, + "step": 8938 + }, + { + "epoch": 0.69, + "grad_norm": 0.7644962606635218, + "learning_rate": 1.1602431443568309e-05, + "loss": 1.9054, + "step": 8939 + }, + { + "epoch": 0.69, + "grad_norm": 0.5814468708664705, + "learning_rate": 1.1597157895865849e-05, + "loss": 2.1014, + "step": 8940 + }, + { + "epoch": 0.69, + "grad_norm": 0.6463987066225427, + "learning_rate": 1.1591885184957888e-05, + "loss": 1.8453, + "step": 8941 + }, + { + "epoch": 0.69, + "grad_norm": 0.7092471512929696, + "learning_rate": 1.15866133111736e-05, + "loss": 1.8881, + "step": 8942 + }, + { + "epoch": 0.69, + "grad_norm": 0.6306521969659294, + "learning_rate": 1.1581342274842156e-05, + "loss": 1.9119, + "step": 8943 + }, + { + "epoch": 0.69, + "grad_norm": 0.8056851128874467, + "learning_rate": 1.1576072076292646e-05, + "loss": 2.0935, + "step": 8944 + }, + { + "epoch": 0.69, + "grad_norm": 0.5987624892222396, + "learning_rate": 1.1570802715854088e-05, + "loss": 1.8671, + "step": 8945 + }, + { + "epoch": 0.69, + "grad_norm": 0.7241315503274826, + "learning_rate": 1.1565534193855502e-05, + "loss": 1.9342, + "step": 8946 + }, + { + "epoch": 0.69, + "grad_norm": 0.7160714612811266, + "learning_rate": 1.1560266510625786e-05, + "loss": 1.9409, + "step": 8947 + }, + { + "epoch": 0.69, + "grad_norm": 0.5935638016157081, + "learning_rate": 1.1554999666493857e-05, + "loss": 2.0735, + "step": 8948 + }, + { + "epoch": 0.69, + "grad_norm": 0.7389213777941352, + "learning_rate": 1.154973366178854e-05, + "loss": 1.8752, + "step": 8949 + }, + { + "epoch": 0.69, + "grad_norm": 0.6656055625503415, + "learning_rate": 1.1544468496838585e-05, + "loss": 1.8694, + "step": 8950 + }, + { + "epoch": 0.69, + "grad_norm": 0.6598248342308699, + "learning_rate": 1.1539204171972756e-05, + "loss": 1.8352, + "step": 8951 + }, + { + "epoch": 0.69, + "grad_norm": 0.6899235881956752, + "learning_rate": 1.1533940687519704e-05, + "loss": 1.9687, + "step": 8952 + }, + { + "epoch": 0.69, + "grad_norm": 0.6734762562583627, + "learning_rate": 1.1528678043808041e-05, + "loss": 2.0654, + "step": 8953 + }, + { + "epoch": 0.69, + "grad_norm": 0.588025099494492, + "learning_rate": 1.1523416241166371e-05, + "loss": 1.8717, + "step": 8954 + }, + { + "epoch": 0.69, + "grad_norm": 0.6447800920734613, + "learning_rate": 1.1518155279923179e-05, + "loss": 1.904, + "step": 8955 + }, + { + "epoch": 0.69, + "grad_norm": 0.5886658190258836, + "learning_rate": 1.1512895160406928e-05, + "loss": 2.1072, + "step": 8956 + }, + { + "epoch": 0.69, + "grad_norm": 0.6128230081493294, + "learning_rate": 1.1507635882946055e-05, + "loss": 1.8783, + "step": 8957 + }, + { + "epoch": 0.69, + "grad_norm": 0.5646989950639825, + "learning_rate": 1.1502377447868895e-05, + "loss": 1.977, + "step": 8958 + }, + { + "epoch": 0.69, + "grad_norm": 0.6610830664979006, + "learning_rate": 1.149711985550376e-05, + "loss": 1.8535, + "step": 8959 + }, + { + "epoch": 0.69, + "grad_norm": 0.6162754333595482, + "learning_rate": 1.1491863106178905e-05, + "loss": 2.0493, + "step": 8960 + }, + { + "epoch": 0.69, + "grad_norm": 0.5623413457056937, + "learning_rate": 1.1486607200222516e-05, + "loss": 1.904, + "step": 8961 + }, + { + "epoch": 0.69, + "grad_norm": 0.630117947374411, + "learning_rate": 1.1481352137962775e-05, + "loss": 1.8877, + "step": 8962 + }, + { + "epoch": 0.69, + "grad_norm": 0.6497054129635882, + "learning_rate": 1.1476097919727746e-05, + "loss": 1.9027, + "step": 8963 + }, + { + "epoch": 0.69, + "grad_norm": 0.5903965738586977, + "learning_rate": 1.1470844545845483e-05, + "loss": 2.0835, + "step": 8964 + }, + { + "epoch": 0.69, + "grad_norm": 0.6425310664912935, + "learning_rate": 1.1465592016643973e-05, + "loss": 1.987, + "step": 8965 + }, + { + "epoch": 0.69, + "grad_norm": 0.6062504648587747, + "learning_rate": 1.1460340332451152e-05, + "loss": 1.8975, + "step": 8966 + }, + { + "epoch": 0.69, + "grad_norm": 0.6036239617901162, + "learning_rate": 1.145508949359491e-05, + "loss": 1.8289, + "step": 8967 + }, + { + "epoch": 0.69, + "grad_norm": 0.6590383456361064, + "learning_rate": 1.1449839500403072e-05, + "loss": 2.0469, + "step": 8968 + }, + { + "epoch": 0.69, + "grad_norm": 0.5849750715025981, + "learning_rate": 1.1444590353203421e-05, + "loss": 1.8668, + "step": 8969 + }, + { + "epoch": 0.69, + "grad_norm": 0.5630310422033301, + "learning_rate": 1.1439342052323682e-05, + "loss": 1.8713, + "step": 8970 + }, + { + "epoch": 0.69, + "grad_norm": 0.6500602837798692, + "learning_rate": 1.1434094598091527e-05, + "loss": 1.9178, + "step": 8971 + }, + { + "epoch": 0.69, + "grad_norm": 0.6798654638692546, + "learning_rate": 1.1428847990834576e-05, + "loss": 2.0882, + "step": 8972 + }, + { + "epoch": 0.69, + "grad_norm": 0.5567403147370228, + "learning_rate": 1.1423602230880396e-05, + "loss": 1.8623, + "step": 8973 + }, + { + "epoch": 0.69, + "grad_norm": 0.638065412208186, + "learning_rate": 1.1418357318556503e-05, + "loss": 1.8666, + "step": 8974 + }, + { + "epoch": 0.69, + "grad_norm": 0.6264968581154093, + "learning_rate": 1.1413113254190355e-05, + "loss": 1.8236, + "step": 8975 + }, + { + "epoch": 0.69, + "grad_norm": 0.6431754052019392, + "learning_rate": 1.1407870038109367e-05, + "loss": 2.0461, + "step": 8976 + }, + { + "epoch": 0.69, + "grad_norm": 0.6445769863296433, + "learning_rate": 1.1402627670640886e-05, + "loss": 1.9494, + "step": 8977 + }, + { + "epoch": 0.69, + "grad_norm": 0.5650827138130116, + "learning_rate": 1.1397386152112222e-05, + "loss": 1.8509, + "step": 8978 + }, + { + "epoch": 0.69, + "grad_norm": 0.6146498328787695, + "learning_rate": 1.1392145482850619e-05, + "loss": 1.8565, + "step": 8979 + }, + { + "epoch": 0.69, + "grad_norm": 0.581297197723896, + "learning_rate": 1.1386905663183279e-05, + "loss": 2.0822, + "step": 8980 + }, + { + "epoch": 0.69, + "grad_norm": 0.5799198228726854, + "learning_rate": 1.138166669343734e-05, + "loss": 1.8947, + "step": 8981 + }, + { + "epoch": 0.69, + "grad_norm": 0.6046031559924472, + "learning_rate": 1.1376428573939896e-05, + "loss": 1.8561, + "step": 8982 + }, + { + "epoch": 0.69, + "grad_norm": 0.5858783894629405, + "learning_rate": 1.1371191305017981e-05, + "loss": 1.9879, + "step": 8983 + }, + { + "epoch": 0.69, + "grad_norm": 0.6035202749223898, + "learning_rate": 1.1365954886998584e-05, + "loss": 1.9014, + "step": 8984 + }, + { + "epoch": 0.69, + "grad_norm": 0.6150137928215254, + "learning_rate": 1.1360719320208644e-05, + "loss": 2.0604, + "step": 8985 + }, + { + "epoch": 0.69, + "grad_norm": 0.6308692671391954, + "learning_rate": 1.1355484604975007e-05, + "loss": 1.8866, + "step": 8986 + }, + { + "epoch": 0.69, + "grad_norm": 0.5441074458398664, + "learning_rate": 1.1350250741624527e-05, + "loss": 1.8655, + "step": 8987 + }, + { + "epoch": 0.69, + "grad_norm": 0.5849942020288769, + "learning_rate": 1.134501773048398e-05, + "loss": 2.0749, + "step": 8988 + }, + { + "epoch": 0.69, + "grad_norm": 0.5441499006926636, + "learning_rate": 1.133978557188005e-05, + "loss": 1.9374, + "step": 8989 + }, + { + "epoch": 0.69, + "grad_norm": 0.5771236807412552, + "learning_rate": 1.1334554266139438e-05, + "loss": 1.8723, + "step": 8990 + }, + { + "epoch": 0.69, + "grad_norm": 0.5852552216350425, + "learning_rate": 1.1329323813588752e-05, + "loss": 1.8802, + "step": 8991 + }, + { + "epoch": 0.69, + "grad_norm": 0.5829212308814022, + "learning_rate": 1.132409421455452e-05, + "loss": 2.09, + "step": 8992 + }, + { + "epoch": 0.69, + "grad_norm": 0.5872424583607305, + "learning_rate": 1.1318865469363288e-05, + "loss": 1.9085, + "step": 8993 + }, + { + "epoch": 0.69, + "grad_norm": 0.6049008394115704, + "learning_rate": 1.1313637578341468e-05, + "loss": 1.9283, + "step": 8994 + }, + { + "epoch": 0.69, + "grad_norm": 0.5677177169557862, + "learning_rate": 1.130841054181549e-05, + "loss": 1.863, + "step": 8995 + }, + { + "epoch": 0.69, + "grad_norm": 0.5592029557494277, + "learning_rate": 1.1303184360111695e-05, + "loss": 1.9552, + "step": 8996 + }, + { + "epoch": 0.69, + "grad_norm": 0.6133598281410187, + "learning_rate": 1.1297959033556352e-05, + "loss": 2.0903, + "step": 8997 + }, + { + "epoch": 0.69, + "grad_norm": 0.6049364312685719, + "learning_rate": 1.1292734562475731e-05, + "loss": 1.8911, + "step": 8998 + }, + { + "epoch": 0.69, + "grad_norm": 0.5942166249025516, + "learning_rate": 1.1287510947195998e-05, + "loss": 1.8813, + "step": 8999 + }, + { + "epoch": 0.69, + "grad_norm": 0.5876166467493542, + "learning_rate": 1.1282288188043277e-05, + "loss": 2.0959, + "step": 9000 + }, + { + "epoch": 0.69, + "grad_norm": 0.5487513118352851, + "learning_rate": 1.1277066285343676e-05, + "loss": 1.8581, + "step": 9001 + }, + { + "epoch": 0.69, + "grad_norm": 0.5524661667346662, + "learning_rate": 1.1271845239423196e-05, + "loss": 1.8847, + "step": 9002 + }, + { + "epoch": 0.69, + "grad_norm": 0.5491182847563656, + "learning_rate": 1.1266625050607801e-05, + "loss": 1.9084, + "step": 9003 + }, + { + "epoch": 0.69, + "grad_norm": 0.5738646314008986, + "learning_rate": 1.1261405719223445e-05, + "loss": 2.0226, + "step": 9004 + }, + { + "epoch": 0.69, + "grad_norm": 0.5421923009558371, + "learning_rate": 1.1256187245595956e-05, + "loss": 1.8445, + "step": 9005 + }, + { + "epoch": 0.69, + "grad_norm": 0.5859893055896866, + "learning_rate": 1.1250969630051161e-05, + "loss": 1.902, + "step": 9006 + }, + { + "epoch": 0.69, + "grad_norm": 0.6179165487566488, + "learning_rate": 1.1245752872914813e-05, + "loss": 1.8717, + "step": 9007 + }, + { + "epoch": 0.69, + "grad_norm": 0.5920237269196194, + "learning_rate": 1.1240536974512616e-05, + "loss": 1.9672, + "step": 9008 + }, + { + "epoch": 0.7, + "grad_norm": 0.5996679037578515, + "learning_rate": 1.1235321935170223e-05, + "loss": 2.0506, + "step": 9009 + }, + { + "epoch": 0.7, + "grad_norm": 0.6058979153581787, + "learning_rate": 1.123010775521323e-05, + "loss": 1.8233, + "step": 9010 + }, + { + "epoch": 0.7, + "grad_norm": 0.5778100973649726, + "learning_rate": 1.1224894434967165e-05, + "loss": 1.8551, + "step": 9011 + }, + { + "epoch": 0.7, + "grad_norm": 0.613146258795698, + "learning_rate": 1.121968197475755e-05, + "loss": 2.0915, + "step": 9012 + }, + { + "epoch": 0.7, + "grad_norm": 0.5547189804107404, + "learning_rate": 1.1214470374909792e-05, + "loss": 1.88, + "step": 9013 + }, + { + "epoch": 0.7, + "grad_norm": 0.5669628106820677, + "learning_rate": 1.120925963574928e-05, + "loss": 1.912, + "step": 9014 + }, + { + "epoch": 0.7, + "grad_norm": 0.6637462011967322, + "learning_rate": 1.1204049757601342e-05, + "loss": 1.8702, + "step": 9015 + }, + { + "epoch": 0.7, + "grad_norm": 0.6329998871514068, + "learning_rate": 1.1198840740791253e-05, + "loss": 1.885, + "step": 9016 + }, + { + "epoch": 0.7, + "grad_norm": 0.6415384885773406, + "learning_rate": 1.1193632585644234e-05, + "loss": 2.0906, + "step": 9017 + }, + { + "epoch": 0.7, + "grad_norm": 0.5885668861348707, + "learning_rate": 1.1188425292485452e-05, + "loss": 1.8545, + "step": 9018 + }, + { + "epoch": 0.7, + "grad_norm": 0.6120703274370523, + "learning_rate": 1.1183218861640016e-05, + "loss": 1.8903, + "step": 9019 + }, + { + "epoch": 0.7, + "grad_norm": 0.6447567062713911, + "learning_rate": 1.1178013293432987e-05, + "loss": 2.0459, + "step": 9020 + }, + { + "epoch": 0.7, + "grad_norm": 0.6119559065528996, + "learning_rate": 1.1172808588189371e-05, + "loss": 2.0141, + "step": 9021 + }, + { + "epoch": 0.7, + "grad_norm": 0.5624525125415928, + "learning_rate": 1.116760474623412e-05, + "loss": 1.9349, + "step": 9022 + }, + { + "epoch": 0.7, + "grad_norm": 0.6079882131605245, + "learning_rate": 1.1162401767892128e-05, + "loss": 1.8844, + "step": 9023 + }, + { + "epoch": 0.7, + "grad_norm": 0.6587785908513255, + "learning_rate": 1.115719965348824e-05, + "loss": 2.0327, + "step": 9024 + }, + { + "epoch": 0.7, + "grad_norm": 0.6628911569822925, + "learning_rate": 1.1151998403347244e-05, + "loss": 1.878, + "step": 9025 + }, + { + "epoch": 0.7, + "grad_norm": 0.6310908490151304, + "learning_rate": 1.1146798017793878e-05, + "loss": 1.8653, + "step": 9026 + }, + { + "epoch": 0.7, + "grad_norm": 0.6924362140089038, + "learning_rate": 1.1141598497152822e-05, + "loss": 1.9822, + "step": 9027 + }, + { + "epoch": 0.7, + "grad_norm": 0.5751115490065275, + "learning_rate": 1.1136399841748705e-05, + "loss": 1.9184, + "step": 9028 + }, + { + "epoch": 0.7, + "grad_norm": 0.709002621491511, + "learning_rate": 1.1131202051906098e-05, + "loss": 2.0618, + "step": 9029 + }, + { + "epoch": 0.7, + "grad_norm": 0.6492433261792121, + "learning_rate": 1.1126005127949521e-05, + "loss": 1.8877, + "step": 9030 + }, + { + "epoch": 0.7, + "grad_norm": 0.5857699762376762, + "learning_rate": 1.1120809070203439e-05, + "loss": 1.7941, + "step": 9031 + }, + { + "epoch": 0.7, + "grad_norm": 0.6875346506182057, + "learning_rate": 1.1115613878992273e-05, + "loss": 2.1179, + "step": 9032 + }, + { + "epoch": 0.7, + "grad_norm": 0.5931447089468332, + "learning_rate": 1.1110419554640353e-05, + "loss": 1.9593, + "step": 9033 + }, + { + "epoch": 0.7, + "grad_norm": 0.5659188177705613, + "learning_rate": 1.1105226097472011e-05, + "loss": 1.841, + "step": 9034 + }, + { + "epoch": 0.7, + "grad_norm": 0.6412129882068893, + "learning_rate": 1.110003350781149e-05, + "loss": 1.8918, + "step": 9035 + }, + { + "epoch": 0.7, + "grad_norm": 0.6367599403419106, + "learning_rate": 1.1094841785982964e-05, + "loss": 1.8299, + "step": 9036 + }, + { + "epoch": 0.7, + "grad_norm": 0.7019631410934414, + "learning_rate": 1.10896509323106e-05, + "loss": 2.0714, + "step": 9037 + }, + { + "epoch": 0.7, + "grad_norm": 0.5648713714214756, + "learning_rate": 1.108446094711848e-05, + "loss": 1.8836, + "step": 9038 + }, + { + "epoch": 0.7, + "grad_norm": 0.6315288337701673, + "learning_rate": 1.1079271830730612e-05, + "loss": 1.8922, + "step": 9039 + }, + { + "epoch": 0.7, + "grad_norm": 0.7319047211924279, + "learning_rate": 1.1074083583471009e-05, + "loss": 1.8929, + "step": 9040 + }, + { + "epoch": 0.7, + "grad_norm": 0.786482489658743, + "learning_rate": 1.106889620566357e-05, + "loss": 2.0979, + "step": 9041 + }, + { + "epoch": 0.7, + "grad_norm": 0.6220242033234383, + "learning_rate": 1.106370969763216e-05, + "loss": 1.9183, + "step": 9042 + }, + { + "epoch": 0.7, + "grad_norm": 0.7118737215799696, + "learning_rate": 1.1058524059700625e-05, + "loss": 1.8291, + "step": 9043 + }, + { + "epoch": 0.7, + "grad_norm": 0.7103684660373127, + "learning_rate": 1.1053339292192685e-05, + "loss": 2.0744, + "step": 9044 + }, + { + "epoch": 0.7, + "grad_norm": 0.5451329559212765, + "learning_rate": 1.104815539543208e-05, + "loss": 1.9221, + "step": 9045 + }, + { + "epoch": 0.7, + "grad_norm": 0.7052156067961092, + "learning_rate": 1.1042972369742457e-05, + "loss": 1.8948, + "step": 9046 + }, + { + "epoch": 0.7, + "grad_norm": 0.7082693283465217, + "learning_rate": 1.1037790215447388e-05, + "loss": 1.8731, + "step": 9047 + }, + { + "epoch": 0.7, + "grad_norm": 0.6234874898157641, + "learning_rate": 1.103260893287045e-05, + "loss": 1.8789, + "step": 9048 + }, + { + "epoch": 0.7, + "grad_norm": 0.6569024854737763, + "learning_rate": 1.1027428522335109e-05, + "loss": 2.0847, + "step": 9049 + }, + { + "epoch": 0.7, + "grad_norm": 0.7109161951346876, + "learning_rate": 1.1022248984164795e-05, + "loss": 1.8649, + "step": 9050 + }, + { + "epoch": 0.7, + "grad_norm": 0.6294946277085567, + "learning_rate": 1.101707031868292e-05, + "loss": 1.8409, + "step": 9051 + }, + { + "epoch": 0.7, + "grad_norm": 0.5459657403953199, + "learning_rate": 1.1011892526212778e-05, + "loss": 1.9045, + "step": 9052 + }, + { + "epoch": 0.7, + "grad_norm": 0.7548898115060142, + "learning_rate": 1.100671560707765e-05, + "loss": 2.0695, + "step": 9053 + }, + { + "epoch": 0.7, + "grad_norm": 0.6757110874202107, + "learning_rate": 1.1001539561600752e-05, + "loss": 1.8898, + "step": 9054 + }, + { + "epoch": 0.7, + "grad_norm": 0.6284426952565466, + "learning_rate": 1.0996364390105249e-05, + "loss": 1.8846, + "step": 9055 + }, + { + "epoch": 0.7, + "grad_norm": 0.7151443084353183, + "learning_rate": 1.099119009291424e-05, + "loss": 2.07, + "step": 9056 + }, + { + "epoch": 0.7, + "grad_norm": 0.6037357861414693, + "learning_rate": 1.0986016670350789e-05, + "loss": 1.8184, + "step": 9057 + }, + { + "epoch": 0.7, + "grad_norm": 0.5663437382965097, + "learning_rate": 1.0980844122737876e-05, + "loss": 1.9734, + "step": 9058 + }, + { + "epoch": 0.7, + "grad_norm": 0.636895098695771, + "learning_rate": 1.0975672450398474e-05, + "loss": 1.8723, + "step": 9059 + }, + { + "epoch": 0.7, + "grad_norm": 0.585696111484951, + "learning_rate": 1.0970501653655446e-05, + "loss": 1.8623, + "step": 9060 + }, + { + "epoch": 0.7, + "grad_norm": 0.6018070926836733, + "learning_rate": 1.0965331732831635e-05, + "loss": 2.0335, + "step": 9061 + }, + { + "epoch": 0.7, + "grad_norm": 0.589457054084724, + "learning_rate": 1.096016268824982e-05, + "loss": 1.879, + "step": 9062 + }, + { + "epoch": 0.7, + "grad_norm": 0.5881471392466731, + "learning_rate": 1.0954994520232725e-05, + "loss": 1.9167, + "step": 9063 + }, + { + "epoch": 0.7, + "grad_norm": 0.5892091476514738, + "learning_rate": 1.094982722910302e-05, + "loss": 1.9291, + "step": 9064 + }, + { + "epoch": 0.7, + "grad_norm": 0.5824712548333607, + "learning_rate": 1.0944660815183322e-05, + "loss": 2.066, + "step": 9065 + }, + { + "epoch": 0.7, + "grad_norm": 0.6014375458918635, + "learning_rate": 1.0939495278796189e-05, + "loss": 1.881, + "step": 9066 + }, + { + "epoch": 0.7, + "grad_norm": 0.659247263436494, + "learning_rate": 1.093433062026413e-05, + "loss": 1.8505, + "step": 9067 + }, + { + "epoch": 0.7, + "grad_norm": 0.5729738108592379, + "learning_rate": 1.092916683990959e-05, + "loss": 1.8158, + "step": 9068 + }, + { + "epoch": 0.7, + "grad_norm": 0.6485817235210111, + "learning_rate": 1.092400393805497e-05, + "loss": 2.1079, + "step": 9069 + }, + { + "epoch": 0.7, + "grad_norm": 0.5696438041200594, + "learning_rate": 1.091884191502261e-05, + "loss": 1.9341, + "step": 9070 + }, + { + "epoch": 0.7, + "grad_norm": 0.5811623664936997, + "learning_rate": 1.0913680771134796e-05, + "loss": 1.9206, + "step": 9071 + }, + { + "epoch": 0.7, + "grad_norm": 0.6172993000228082, + "learning_rate": 1.090852050671376e-05, + "loss": 1.8677, + "step": 9072 + }, + { + "epoch": 0.7, + "grad_norm": 0.6330724661300089, + "learning_rate": 1.090336112208168e-05, + "loss": 2.0588, + "step": 9073 + }, + { + "epoch": 0.7, + "grad_norm": 0.5584832196605616, + "learning_rate": 1.0898202617560679e-05, + "loss": 1.8711, + "step": 9074 + }, + { + "epoch": 0.7, + "grad_norm": 0.6649929924127639, + "learning_rate": 1.0893044993472807e-05, + "loss": 1.8367, + "step": 9075 + }, + { + "epoch": 0.7, + "grad_norm": 0.5651047852612145, + "learning_rate": 1.0887888250140097e-05, + "loss": 1.9192, + "step": 9076 + }, + { + "epoch": 0.7, + "grad_norm": 0.5994202001465511, + "learning_rate": 1.0882732387884495e-05, + "loss": 2.0713, + "step": 9077 + }, + { + "epoch": 0.7, + "grad_norm": 0.5774354138388746, + "learning_rate": 1.0877577407027908e-05, + "loss": 1.8924, + "step": 9078 + }, + { + "epoch": 0.7, + "grad_norm": 0.5672691327615841, + "learning_rate": 1.087242330789218e-05, + "loss": 1.8676, + "step": 9079 + }, + { + "epoch": 0.7, + "grad_norm": 0.5945602608888274, + "learning_rate": 1.0867270090799101e-05, + "loss": 1.867, + "step": 9080 + }, + { + "epoch": 0.7, + "grad_norm": 0.56695498511441, + "learning_rate": 1.0862117756070408e-05, + "loss": 2.0643, + "step": 9081 + }, + { + "epoch": 0.7, + "grad_norm": 0.6187372765823543, + "learning_rate": 1.0856966304027794e-05, + "loss": 1.8731, + "step": 9082 + }, + { + "epoch": 0.7, + "grad_norm": 0.5944595247670489, + "learning_rate": 1.0851815734992856e-05, + "loss": 1.9807, + "step": 9083 + }, + { + "epoch": 0.7, + "grad_norm": 0.5949849583666381, + "learning_rate": 1.0846666049287193e-05, + "loss": 1.8762, + "step": 9084 + }, + { + "epoch": 0.7, + "grad_norm": 0.553331177626958, + "learning_rate": 1.0841517247232322e-05, + "loss": 2.0812, + "step": 9085 + }, + { + "epoch": 0.7, + "grad_norm": 0.5816110901336285, + "learning_rate": 1.0836369329149673e-05, + "loss": 1.883, + "step": 9086 + }, + { + "epoch": 0.7, + "grad_norm": 0.6027924349682429, + "learning_rate": 1.0831222295360693e-05, + "loss": 1.8307, + "step": 9087 + }, + { + "epoch": 0.7, + "grad_norm": 0.5891867453439753, + "learning_rate": 1.0826076146186703e-05, + "loss": 2.0519, + "step": 9088 + }, + { + "epoch": 0.7, + "grad_norm": 0.5764090670972408, + "learning_rate": 1.0820930881948996e-05, + "loss": 1.9517, + "step": 9089 + }, + { + "epoch": 0.7, + "grad_norm": 0.639198495741725, + "learning_rate": 1.0815786502968844e-05, + "loss": 1.8725, + "step": 9090 + }, + { + "epoch": 0.7, + "grad_norm": 0.6119584558755411, + "learning_rate": 1.081064300956739e-05, + "loss": 1.8848, + "step": 9091 + }, + { + "epoch": 0.7, + "grad_norm": 0.6049137738412469, + "learning_rate": 1.0805500402065793e-05, + "loss": 1.8525, + "step": 9092 + }, + { + "epoch": 0.7, + "grad_norm": 0.594575451201536, + "learning_rate": 1.0800358680785131e-05, + "loss": 2.064, + "step": 9093 + }, + { + "epoch": 0.7, + "grad_norm": 0.7778320771048078, + "learning_rate": 1.079521784604639e-05, + "loss": 1.8795, + "step": 9094 + }, + { + "epoch": 0.7, + "grad_norm": 0.5881063539509765, + "learning_rate": 1.0790077898170573e-05, + "loss": 1.9414, + "step": 9095 + }, + { + "epoch": 0.7, + "grad_norm": 0.6233175357656116, + "learning_rate": 1.0784938837478556e-05, + "loss": 1.8184, + "step": 9096 + }, + { + "epoch": 0.7, + "grad_norm": 0.6708820210209048, + "learning_rate": 1.0779800664291198e-05, + "loss": 2.0366, + "step": 9097 + }, + { + "epoch": 0.7, + "grad_norm": 0.6427088609917424, + "learning_rate": 1.077466337892932e-05, + "loss": 1.888, + "step": 9098 + }, + { + "epoch": 0.7, + "grad_norm": 0.6058515373365156, + "learning_rate": 1.0769526981713637e-05, + "loss": 1.8744, + "step": 9099 + }, + { + "epoch": 0.7, + "grad_norm": 0.6066250554972017, + "learning_rate": 1.0764391472964846e-05, + "loss": 1.8707, + "step": 9100 + }, + { + "epoch": 0.7, + "grad_norm": 0.541145392620769, + "learning_rate": 1.0759256853003578e-05, + "loss": 2.1121, + "step": 9101 + }, + { + "epoch": 0.7, + "grad_norm": 0.6215361040592173, + "learning_rate": 1.0754123122150403e-05, + "loss": 1.9273, + "step": 9102 + }, + { + "epoch": 0.7, + "grad_norm": 0.6156539468634563, + "learning_rate": 1.0748990280725846e-05, + "loss": 1.8976, + "step": 9103 + }, + { + "epoch": 0.7, + "grad_norm": 0.597990725334442, + "learning_rate": 1.0743858329050371e-05, + "loss": 1.8982, + "step": 9104 + }, + { + "epoch": 0.7, + "grad_norm": 0.6452914495627337, + "learning_rate": 1.0738727267444387e-05, + "loss": 2.0279, + "step": 9105 + }, + { + "epoch": 0.7, + "grad_norm": 0.6054249485480094, + "learning_rate": 1.0733597096228249e-05, + "loss": 1.835, + "step": 9106 + }, + { + "epoch": 0.7, + "grad_norm": 0.5516434133446267, + "learning_rate": 1.0728467815722248e-05, + "loss": 1.916, + "step": 9107 + }, + { + "epoch": 0.7, + "grad_norm": 0.570161196032374, + "learning_rate": 1.0723339426246631e-05, + "loss": 1.8933, + "step": 9108 + }, + { + "epoch": 0.7, + "grad_norm": 0.5996528106237009, + "learning_rate": 1.0718211928121585e-05, + "loss": 2.0236, + "step": 9109 + }, + { + "epoch": 0.7, + "grad_norm": 0.585697169691358, + "learning_rate": 1.0713085321667238e-05, + "loss": 1.8724, + "step": 9110 + }, + { + "epoch": 0.7, + "grad_norm": 0.6188694330078404, + "learning_rate": 1.0707959607203665e-05, + "loss": 1.8512, + "step": 9111 + }, + { + "epoch": 0.7, + "grad_norm": 0.5919157344421371, + "learning_rate": 1.0702834785050891e-05, + "loss": 1.8169, + "step": 9112 + }, + { + "epoch": 0.7, + "grad_norm": 0.6611276542969717, + "learning_rate": 1.0697710855528872e-05, + "loss": 2.0389, + "step": 9113 + }, + { + "epoch": 0.7, + "grad_norm": 0.5837143797542886, + "learning_rate": 1.0692587818957523e-05, + "loss": 1.9498, + "step": 9114 + }, + { + "epoch": 0.7, + "grad_norm": 0.592156555293683, + "learning_rate": 1.0687465675656688e-05, + "loss": 1.844, + "step": 9115 + }, + { + "epoch": 0.7, + "grad_norm": 0.5896782269159244, + "learning_rate": 1.0682344425946172e-05, + "loss": 1.8374, + "step": 9116 + }, + { + "epoch": 0.7, + "grad_norm": 0.5969475499696985, + "learning_rate": 1.0677224070145711e-05, + "loss": 2.0708, + "step": 9117 + }, + { + "epoch": 0.7, + "grad_norm": 0.6530626391598137, + "learning_rate": 1.067210460857499e-05, + "loss": 1.9103, + "step": 9118 + }, + { + "epoch": 0.7, + "grad_norm": 0.6318855688340977, + "learning_rate": 1.0666986041553639e-05, + "loss": 1.8565, + "step": 9119 + }, + { + "epoch": 0.7, + "grad_norm": 0.5507586475693985, + "learning_rate": 1.0661868369401232e-05, + "loss": 1.9463, + "step": 9120 + }, + { + "epoch": 0.7, + "grad_norm": 0.6459514645073638, + "learning_rate": 1.0656751592437295e-05, + "loss": 2.0995, + "step": 9121 + }, + { + "epoch": 0.7, + "grad_norm": 0.618714926262875, + "learning_rate": 1.0651635710981259e-05, + "loss": 1.8599, + "step": 9122 + }, + { + "epoch": 0.7, + "grad_norm": 0.640309620779375, + "learning_rate": 1.064652072535256e-05, + "loss": 1.8614, + "step": 9123 + }, + { + "epoch": 0.7, + "grad_norm": 0.6005298326894235, + "learning_rate": 1.0641406635870548e-05, + "loss": 1.8736, + "step": 9124 + }, + { + "epoch": 0.7, + "grad_norm": 0.5912396905761028, + "learning_rate": 1.0636293442854486e-05, + "loss": 2.0848, + "step": 9125 + }, + { + "epoch": 0.7, + "grad_norm": 0.5368890569683601, + "learning_rate": 1.0631181146623645e-05, + "loss": 1.9706, + "step": 9126 + }, + { + "epoch": 0.7, + "grad_norm": 0.5780489721092229, + "learning_rate": 1.0626069747497189e-05, + "loss": 1.8754, + "step": 9127 + }, + { + "epoch": 0.7, + "grad_norm": 0.7092488767632632, + "learning_rate": 1.0620959245794252e-05, + "loss": 1.843, + "step": 9128 + }, + { + "epoch": 0.7, + "grad_norm": 0.6404823415655542, + "learning_rate": 1.0615849641833905e-05, + "loss": 2.0839, + "step": 9129 + }, + { + "epoch": 0.7, + "grad_norm": 0.6090130922027726, + "learning_rate": 1.061074093593514e-05, + "loss": 1.8885, + "step": 9130 + }, + { + "epoch": 0.7, + "grad_norm": 0.6531765974865161, + "learning_rate": 1.0605633128416941e-05, + "loss": 1.9048, + "step": 9131 + }, + { + "epoch": 0.7, + "grad_norm": 0.6355725339599639, + "learning_rate": 1.0600526219598208e-05, + "loss": 1.9389, + "step": 9132 + }, + { + "epoch": 0.7, + "grad_norm": 0.6311116807264902, + "learning_rate": 1.059542020979776e-05, + "loss": 2.0494, + "step": 9133 + }, + { + "epoch": 0.7, + "grad_norm": 0.5977921596929435, + "learning_rate": 1.0590315099334419e-05, + "loss": 1.874, + "step": 9134 + }, + { + "epoch": 0.7, + "grad_norm": 0.6165528990222782, + "learning_rate": 1.0585210888526898e-05, + "loss": 1.8753, + "step": 9135 + }, + { + "epoch": 0.7, + "grad_norm": 0.7361342408858902, + "learning_rate": 1.0580107577693868e-05, + "loss": 1.8894, + "step": 9136 + }, + { + "epoch": 0.7, + "grad_norm": 0.5651574776338373, + "learning_rate": 1.0575005167153976e-05, + "loss": 2.071, + "step": 9137 + }, + { + "epoch": 0.7, + "grad_norm": 0.53760120567633, + "learning_rate": 1.0569903657225766e-05, + "loss": 2.0183, + "step": 9138 + }, + { + "epoch": 0.71, + "grad_norm": 0.7385791444295767, + "learning_rate": 1.056480304822774e-05, + "loss": 1.8683, + "step": 9139 + }, + { + "epoch": 0.71, + "grad_norm": 0.6451145320076157, + "learning_rate": 1.0559703340478378e-05, + "loss": 1.8854, + "step": 9140 + }, + { + "epoch": 0.71, + "grad_norm": 0.5810387693752033, + "learning_rate": 1.055460453429604e-05, + "loss": 2.1023, + "step": 9141 + }, + { + "epoch": 0.71, + "grad_norm": 0.5908092875667219, + "learning_rate": 1.0549506629999104e-05, + "loss": 1.8354, + "step": 9142 + }, + { + "epoch": 0.71, + "grad_norm": 0.5982518529546936, + "learning_rate": 1.0544409627905823e-05, + "loss": 1.8512, + "step": 9143 + }, + { + "epoch": 0.71, + "grad_norm": 0.6154504164744401, + "learning_rate": 1.0539313528334427e-05, + "loss": 1.8631, + "step": 9144 + }, + { + "epoch": 0.71, + "grad_norm": 0.6007975283152956, + "learning_rate": 1.053421833160311e-05, + "loss": 2.0937, + "step": 9145 + }, + { + "epoch": 0.71, + "grad_norm": 0.6324812558371196, + "learning_rate": 1.0529124038029958e-05, + "loss": 1.8819, + "step": 9146 + }, + { + "epoch": 0.71, + "grad_norm": 0.6269389856174107, + "learning_rate": 1.0524030647933045e-05, + "loss": 1.8783, + "step": 9147 + }, + { + "epoch": 0.71, + "grad_norm": 0.5913707038973363, + "learning_rate": 1.0518938161630365e-05, + "loss": 1.831, + "step": 9148 + }, + { + "epoch": 0.71, + "grad_norm": 0.5759501360372297, + "learning_rate": 1.0513846579439866e-05, + "loss": 2.0864, + "step": 9149 + }, + { + "epoch": 0.71, + "grad_norm": 0.5372606769174609, + "learning_rate": 1.0508755901679434e-05, + "loss": 1.8412, + "step": 9150 + }, + { + "epoch": 0.71, + "grad_norm": 0.5898067814951259, + "learning_rate": 1.0503666128666903e-05, + "loss": 1.9408, + "step": 9151 + }, + { + "epoch": 0.71, + "grad_norm": 0.647693619439521, + "learning_rate": 1.049857726072005e-05, + "loss": 1.8602, + "step": 9152 + }, + { + "epoch": 0.71, + "grad_norm": 0.5743667830549007, + "learning_rate": 1.0493489298156589e-05, + "loss": 2.0675, + "step": 9153 + }, + { + "epoch": 0.71, + "grad_norm": 0.5847235096846988, + "learning_rate": 1.0488402241294187e-05, + "loss": 1.8692, + "step": 9154 + }, + { + "epoch": 0.71, + "grad_norm": 0.6210592134530121, + "learning_rate": 1.0483316090450448e-05, + "loss": 1.8803, + "step": 9155 + }, + { + "epoch": 0.71, + "grad_norm": 0.637971455570303, + "learning_rate": 1.0478230845942922e-05, + "loss": 1.8875, + "step": 9156 + }, + { + "epoch": 0.71, + "grad_norm": 0.598139127248657, + "learning_rate": 1.0473146508089102e-05, + "loss": 2.1185, + "step": 9157 + }, + { + "epoch": 0.71, + "grad_norm": 0.5761171302761096, + "learning_rate": 1.0468063077206422e-05, + "loss": 1.855, + "step": 9158 + }, + { + "epoch": 0.71, + "grad_norm": 0.7166434478134002, + "learning_rate": 1.0462980553612262e-05, + "loss": 1.9408, + "step": 9159 + }, + { + "epoch": 0.71, + "grad_norm": 0.6374484654212716, + "learning_rate": 1.0457898937623947e-05, + "loss": 1.8762, + "step": 9160 + }, + { + "epoch": 0.71, + "grad_norm": 0.6478868611453413, + "learning_rate": 1.0452818229558741e-05, + "loss": 2.113, + "step": 9161 + }, + { + "epoch": 0.71, + "grad_norm": 0.6128435431946185, + "learning_rate": 1.0447738429733855e-05, + "loss": 1.8489, + "step": 9162 + }, + { + "epoch": 0.71, + "grad_norm": 0.6614622207371631, + "learning_rate": 1.044265953846644e-05, + "loss": 1.9346, + "step": 9163 + }, + { + "epoch": 0.71, + "grad_norm": 0.6086541404354413, + "learning_rate": 1.0437581556073594e-05, + "loss": 1.8777, + "step": 9164 + }, + { + "epoch": 0.71, + "grad_norm": 0.6600978241140333, + "learning_rate": 1.0432504482872355e-05, + "loss": 2.0631, + "step": 9165 + }, + { + "epoch": 0.71, + "grad_norm": 0.5955817409664877, + "learning_rate": 1.0427428319179705e-05, + "loss": 1.9142, + "step": 9166 + }, + { + "epoch": 0.71, + "grad_norm": 0.6612831621137155, + "learning_rate": 1.0422353065312573e-05, + "loss": 1.9185, + "step": 9167 + }, + { + "epoch": 0.71, + "grad_norm": 0.6342337894144602, + "learning_rate": 1.0417278721587834e-05, + "loss": 1.8357, + "step": 9168 + }, + { + "epoch": 0.71, + "grad_norm": 0.6358924008997958, + "learning_rate": 1.0412205288322272e-05, + "loss": 2.1075, + "step": 9169 + }, + { + "epoch": 0.71, + "grad_norm": 0.6253993767833744, + "learning_rate": 1.0407132765832672e-05, + "loss": 1.9541, + "step": 9170 + }, + { + "epoch": 0.71, + "grad_norm": 0.6434223852720643, + "learning_rate": 1.040206115443573e-05, + "loss": 1.8634, + "step": 9171 + }, + { + "epoch": 0.71, + "grad_norm": 0.7152873966179553, + "learning_rate": 1.0396990454448064e-05, + "loss": 1.8401, + "step": 9172 + }, + { + "epoch": 0.71, + "grad_norm": 0.6087864617375922, + "learning_rate": 1.0391920666186283e-05, + "loss": 2.0705, + "step": 9173 + }, + { + "epoch": 0.71, + "grad_norm": 0.617145200602466, + "learning_rate": 1.0386851789966906e-05, + "loss": 1.8484, + "step": 9174 + }, + { + "epoch": 0.71, + "grad_norm": 0.6190056093802709, + "learning_rate": 1.0381783826106407e-05, + "loss": 1.8507, + "step": 9175 + }, + { + "epoch": 0.71, + "grad_norm": 0.6541125680774376, + "learning_rate": 1.03767167749212e-05, + "loss": 1.9178, + "step": 9176 + }, + { + "epoch": 0.71, + "grad_norm": 0.5851915526654308, + "learning_rate": 1.0371650636727625e-05, + "loss": 2.0858, + "step": 9177 + }, + { + "epoch": 0.71, + "grad_norm": 0.5937121348079563, + "learning_rate": 1.0366585411842004e-05, + "loss": 1.8649, + "step": 9178 + }, + { + "epoch": 0.71, + "grad_norm": 0.5598166854965481, + "learning_rate": 1.036152110058058e-05, + "loss": 1.854, + "step": 9179 + }, + { + "epoch": 0.71, + "grad_norm": 0.6081817359937839, + "learning_rate": 1.0356457703259512e-05, + "loss": 1.885, + "step": 9180 + }, + { + "epoch": 0.71, + "grad_norm": 0.6222803257389463, + "learning_rate": 1.0351395220194961e-05, + "loss": 2.0485, + "step": 9181 + }, + { + "epoch": 0.71, + "grad_norm": 0.5549624694026287, + "learning_rate": 1.034633365170298e-05, + "loss": 1.9528, + "step": 9182 + }, + { + "epoch": 0.71, + "grad_norm": 0.562846511501338, + "learning_rate": 1.034127299809958e-05, + "loss": 1.8949, + "step": 9183 + }, + { + "epoch": 0.71, + "grad_norm": 0.562859662000185, + "learning_rate": 1.0336213259700742e-05, + "loss": 1.9456, + "step": 9184 + }, + { + "epoch": 0.71, + "grad_norm": 0.5694966323681612, + "learning_rate": 1.033115443682234e-05, + "loss": 2.0711, + "step": 9185 + }, + { + "epoch": 0.71, + "grad_norm": 0.6002793016046768, + "learning_rate": 1.0326096529780221e-05, + "loss": 1.9055, + "step": 9186 + }, + { + "epoch": 0.71, + "grad_norm": 0.5856210081111061, + "learning_rate": 1.0321039538890196e-05, + "loss": 1.8865, + "step": 9187 + }, + { + "epoch": 0.71, + "grad_norm": 0.5369310040700239, + "learning_rate": 1.0315983464467965e-05, + "loss": 1.9726, + "step": 9188 + }, + { + "epoch": 0.71, + "grad_norm": 0.5365621363147519, + "learning_rate": 1.031092830682921e-05, + "loss": 2.0697, + "step": 9189 + }, + { + "epoch": 0.71, + "grad_norm": 0.5416529600726808, + "learning_rate": 1.0305874066289545e-05, + "loss": 1.8485, + "step": 9190 + }, + { + "epoch": 0.71, + "grad_norm": 0.5773556099348008, + "learning_rate": 1.0300820743164517e-05, + "loss": 1.8534, + "step": 9191 + }, + { + "epoch": 0.71, + "grad_norm": 0.6056971620359966, + "learning_rate": 1.0295768337769654e-05, + "loss": 1.8932, + "step": 9192 + }, + { + "epoch": 0.71, + "grad_norm": 0.632840341869504, + "learning_rate": 1.0290716850420373e-05, + "loss": 2.1244, + "step": 9193 + }, + { + "epoch": 0.71, + "grad_norm": 0.5715013093662434, + "learning_rate": 1.0285666281432054e-05, + "loss": 1.995, + "step": 9194 + }, + { + "epoch": 0.71, + "grad_norm": 0.5758681065116839, + "learning_rate": 1.0280616631120058e-05, + "loss": 1.8977, + "step": 9195 + }, + { + "epoch": 0.71, + "grad_norm": 0.6121426899550583, + "learning_rate": 1.0275567899799627e-05, + "loss": 1.8658, + "step": 9196 + }, + { + "epoch": 0.71, + "grad_norm": 0.7007971085436007, + "learning_rate": 1.0270520087785979e-05, + "loss": 2.0747, + "step": 9197 + }, + { + "epoch": 0.71, + "grad_norm": 0.6246483726393967, + "learning_rate": 1.0265473195394271e-05, + "loss": 1.8726, + "step": 9198 + }, + { + "epoch": 0.71, + "grad_norm": 0.6167679172073429, + "learning_rate": 1.0260427222939603e-05, + "loss": 1.8508, + "step": 9199 + }, + { + "epoch": 0.71, + "grad_norm": 0.6576087787338054, + "learning_rate": 1.0255382170737017e-05, + "loss": 1.8802, + "step": 9200 + }, + { + "epoch": 0.71, + "grad_norm": 0.6744936214540866, + "learning_rate": 1.0250338039101492e-05, + "loss": 1.9684, + "step": 9201 + }, + { + "epoch": 0.71, + "grad_norm": 0.6939741958046834, + "learning_rate": 1.0245294828347956e-05, + "loss": 2.1056, + "step": 9202 + }, + { + "epoch": 0.71, + "grad_norm": 0.5684010768455341, + "learning_rate": 1.024025253879128e-05, + "loss": 1.8547, + "step": 9203 + }, + { + "epoch": 0.71, + "grad_norm": 0.7084248380870669, + "learning_rate": 1.0235211170746272e-05, + "loss": 1.8738, + "step": 9204 + }, + { + "epoch": 0.71, + "grad_norm": 0.6754908617304664, + "learning_rate": 1.0230170724527687e-05, + "loss": 2.1127, + "step": 9205 + }, + { + "epoch": 0.71, + "grad_norm": 0.5801160443704926, + "learning_rate": 1.022513120045022e-05, + "loss": 1.8622, + "step": 9206 + }, + { + "epoch": 0.71, + "grad_norm": 0.6979906434215114, + "learning_rate": 1.0220092598828506e-05, + "loss": 1.9711, + "step": 9207 + }, + { + "epoch": 0.71, + "grad_norm": 0.5714614698029041, + "learning_rate": 1.021505491997713e-05, + "loss": 1.8611, + "step": 9208 + }, + { + "epoch": 0.71, + "grad_norm": 0.573599780203101, + "learning_rate": 1.0210018164210613e-05, + "loss": 2.0945, + "step": 9209 + }, + { + "epoch": 0.71, + "grad_norm": 0.5933984367883695, + "learning_rate": 1.0204982331843421e-05, + "loss": 1.8765, + "step": 9210 + }, + { + "epoch": 0.71, + "grad_norm": 0.6089982838662868, + "learning_rate": 1.019994742318996e-05, + "loss": 1.8725, + "step": 9211 + }, + { + "epoch": 0.71, + "grad_norm": 0.6617475841757581, + "learning_rate": 1.0194913438564585e-05, + "loss": 1.9014, + "step": 9212 + }, + { + "epoch": 0.71, + "grad_norm": 0.5486714680552005, + "learning_rate": 1.0189880378281583e-05, + "loss": 1.9564, + "step": 9213 + }, + { + "epoch": 0.71, + "grad_norm": 0.5624858566544474, + "learning_rate": 1.018484824265519e-05, + "loss": 2.0566, + "step": 9214 + }, + { + "epoch": 0.71, + "grad_norm": 0.6607205287791696, + "learning_rate": 1.0179817031999592e-05, + "loss": 1.9036, + "step": 9215 + }, + { + "epoch": 0.71, + "grad_norm": 0.581678025841982, + "learning_rate": 1.0174786746628882e-05, + "loss": 1.9132, + "step": 9216 + }, + { + "epoch": 0.71, + "grad_norm": 0.5755037036557926, + "learning_rate": 1.0169757386857148e-05, + "loss": 2.0799, + "step": 9217 + }, + { + "epoch": 0.71, + "grad_norm": 0.6000446840684024, + "learning_rate": 1.0164728952998393e-05, + "loss": 1.8344, + "step": 9218 + }, + { + "epoch": 0.71, + "grad_norm": 0.5289722188984569, + "learning_rate": 1.0159701445366537e-05, + "loss": 1.8892, + "step": 9219 + }, + { + "epoch": 0.71, + "grad_norm": 0.6116870511369378, + "learning_rate": 1.0154674864275492e-05, + "loss": 1.8453, + "step": 9220 + }, + { + "epoch": 0.71, + "grad_norm": 0.6018517427676207, + "learning_rate": 1.0149649210039095e-05, + "loss": 2.0773, + "step": 9221 + }, + { + "epoch": 0.71, + "grad_norm": 0.5771253800150048, + "learning_rate": 1.0144624482971082e-05, + "loss": 1.8452, + "step": 9222 + }, + { + "epoch": 0.71, + "grad_norm": 0.5581315325428106, + "learning_rate": 1.0139600683385211e-05, + "loss": 1.8911, + "step": 9223 + }, + { + "epoch": 0.71, + "grad_norm": 0.6220632438330608, + "learning_rate": 1.0134577811595098e-05, + "loss": 1.8631, + "step": 9224 + }, + { + "epoch": 0.71, + "grad_norm": 0.528929299601843, + "learning_rate": 1.012955586791437e-05, + "loss": 1.9446, + "step": 9225 + }, + { + "epoch": 0.71, + "grad_norm": 0.5723426600599879, + "learning_rate": 1.0124534852656569e-05, + "loss": 2.0402, + "step": 9226 + }, + { + "epoch": 0.71, + "grad_norm": 0.5786359998373249, + "learning_rate": 1.0119514766135146e-05, + "loss": 1.862, + "step": 9227 + }, + { + "epoch": 0.71, + "grad_norm": 0.6125392689275904, + "learning_rate": 1.011449560866356e-05, + "loss": 1.842, + "step": 9228 + }, + { + "epoch": 0.71, + "grad_norm": 0.5790911191526162, + "learning_rate": 1.0109477380555172e-05, + "loss": 2.0697, + "step": 9229 + }, + { + "epoch": 0.71, + "grad_norm": 0.554725571518991, + "learning_rate": 1.0104460082123263e-05, + "loss": 1.8748, + "step": 9230 + }, + { + "epoch": 0.71, + "grad_norm": 0.5634733682984511, + "learning_rate": 1.009944371368112e-05, + "loss": 1.8902, + "step": 9231 + }, + { + "epoch": 0.71, + "grad_norm": 0.6302671616588267, + "learning_rate": 1.0094428275541914e-05, + "loss": 1.9676, + "step": 9232 + }, + { + "epoch": 0.71, + "grad_norm": 0.6394008497227877, + "learning_rate": 1.0089413768018774e-05, + "loss": 1.8946, + "step": 9233 + }, + { + "epoch": 0.71, + "grad_norm": 0.5990141180389505, + "learning_rate": 1.0084400191424804e-05, + "loss": 2.1013, + "step": 9234 + }, + { + "epoch": 0.71, + "grad_norm": 0.6270817832424838, + "learning_rate": 1.0079387546072996e-05, + "loss": 1.8989, + "step": 9235 + }, + { + "epoch": 0.71, + "grad_norm": 0.6068966137492929, + "learning_rate": 1.007437583227632e-05, + "loss": 1.8663, + "step": 9236 + }, + { + "epoch": 0.71, + "grad_norm": 0.5884188737728199, + "learning_rate": 1.0069365050347674e-05, + "loss": 2.0516, + "step": 9237 + }, + { + "epoch": 0.71, + "grad_norm": 0.6081604267299447, + "learning_rate": 1.0064355200599897e-05, + "loss": 1.9799, + "step": 9238 + }, + { + "epoch": 0.71, + "grad_norm": 0.6268185048734554, + "learning_rate": 1.0059346283345802e-05, + "loss": 1.9341, + "step": 9239 + }, + { + "epoch": 0.71, + "grad_norm": 0.5728982552830466, + "learning_rate": 1.0054338298898086e-05, + "loss": 1.8625, + "step": 9240 + }, + { + "epoch": 0.71, + "grad_norm": 0.5748449065888819, + "learning_rate": 1.0049331247569418e-05, + "loss": 2.0726, + "step": 9241 + }, + { + "epoch": 0.71, + "grad_norm": 0.6534999963919573, + "learning_rate": 1.0044325129672438e-05, + "loss": 1.9365, + "step": 9242 + }, + { + "epoch": 0.71, + "grad_norm": 0.6369739411115005, + "learning_rate": 1.0039319945519671e-05, + "loss": 1.8588, + "step": 9243 + }, + { + "epoch": 0.71, + "grad_norm": 0.6036211428324699, + "learning_rate": 1.0034315695423619e-05, + "loss": 1.9864, + "step": 9244 + }, + { + "epoch": 0.71, + "grad_norm": 0.5631335037004745, + "learning_rate": 1.002931237969672e-05, + "loss": 1.9031, + "step": 9245 + }, + { + "epoch": 0.71, + "grad_norm": 0.6229290213338599, + "learning_rate": 1.002430999865135e-05, + "loss": 2.0904, + "step": 9246 + }, + { + "epoch": 0.71, + "grad_norm": 0.614150747591734, + "learning_rate": 1.0019308552599827e-05, + "loss": 1.9015, + "step": 9247 + }, + { + "epoch": 0.71, + "grad_norm": 0.6020129951521515, + "learning_rate": 1.0014308041854414e-05, + "loss": 1.8864, + "step": 9248 + }, + { + "epoch": 0.71, + "grad_norm": 0.5790584168232801, + "learning_rate": 1.0009308466727313e-05, + "loss": 2.0549, + "step": 9249 + }, + { + "epoch": 0.71, + "grad_norm": 0.5634899495336889, + "learning_rate": 1.0004309827530667e-05, + "loss": 1.9986, + "step": 9250 + }, + { + "epoch": 0.71, + "grad_norm": 0.5895714685956942, + "learning_rate": 9.999312124576563e-06, + "loss": 1.848, + "step": 9251 + }, + { + "epoch": 0.71, + "grad_norm": 0.6765046884733992, + "learning_rate": 9.994315358177028e-06, + "loss": 1.831, + "step": 9252 + }, + { + "epoch": 0.71, + "grad_norm": 0.5876158533732383, + "learning_rate": 9.989319528644028e-06, + "loss": 1.8814, + "step": 9253 + }, + { + "epoch": 0.71, + "grad_norm": 0.6043639542678267, + "learning_rate": 9.984324636289477e-06, + "loss": 2.0544, + "step": 9254 + }, + { + "epoch": 0.71, + "grad_norm": 0.5817899931944838, + "learning_rate": 9.979330681425226e-06, + "loss": 1.9131, + "step": 9255 + }, + { + "epoch": 0.71, + "grad_norm": 0.6290646883876165, + "learning_rate": 9.974337664363065e-06, + "loss": 1.9108, + "step": 9256 + }, + { + "epoch": 0.71, + "grad_norm": 0.5638846314317384, + "learning_rate": 9.969345585414732e-06, + "loss": 1.8902, + "step": 9257 + }, + { + "epoch": 0.71, + "grad_norm": 0.6045638657858438, + "learning_rate": 9.964354444891902e-06, + "loss": 2.0607, + "step": 9258 + }, + { + "epoch": 0.71, + "grad_norm": 0.5782976219339114, + "learning_rate": 9.959364243106189e-06, + "loss": 1.8826, + "step": 9259 + }, + { + "epoch": 0.71, + "grad_norm": 0.5915224915822787, + "learning_rate": 9.954374980369158e-06, + "loss": 1.8847, + "step": 9260 + }, + { + "epoch": 0.71, + "grad_norm": 0.6635595631159843, + "learning_rate": 9.949386656992305e-06, + "loss": 2.0862, + "step": 9261 + }, + { + "epoch": 0.71, + "grad_norm": 0.627402783443919, + "learning_rate": 9.944399273287081e-06, + "loss": 1.8954, + "step": 9262 + }, + { + "epoch": 0.71, + "grad_norm": 0.5471039164639492, + "learning_rate": 9.93941282956484e-06, + "loss": 1.9705, + "step": 9263 + }, + { + "epoch": 0.71, + "grad_norm": 0.6161588034091146, + "learning_rate": 9.93442732613694e-06, + "loss": 1.8726, + "step": 9264 + }, + { + "epoch": 0.71, + "grad_norm": 0.5791925210441221, + "learning_rate": 9.929442763314638e-06, + "loss": 1.8929, + "step": 9265 + }, + { + "epoch": 0.71, + "grad_norm": 0.6022714609027946, + "learning_rate": 9.92445914140912e-06, + "loss": 2.0459, + "step": 9266 + }, + { + "epoch": 0.71, + "grad_norm": 0.5699188505675418, + "learning_rate": 9.919476460731559e-06, + "loss": 1.8858, + "step": 9267 + }, + { + "epoch": 0.72, + "grad_norm": 0.5560944783914413, + "learning_rate": 9.914494721593043e-06, + "loss": 1.8523, + "step": 9268 + }, + { + "epoch": 0.72, + "grad_norm": 0.6095869623516343, + "learning_rate": 9.909513924304576e-06, + "loss": 1.9441, + "step": 9269 + }, + { + "epoch": 0.72, + "grad_norm": 0.5579724162565891, + "learning_rate": 9.904534069177165e-06, + "loss": 2.0803, + "step": 9270 + }, + { + "epoch": 0.72, + "grad_norm": 0.6019718343321604, + "learning_rate": 9.89955515652169e-06, + "loss": 1.8929, + "step": 9271 + }, + { + "epoch": 0.72, + "grad_norm": 0.6244384296453995, + "learning_rate": 9.894577186649028e-06, + "loss": 1.887, + "step": 9272 + }, + { + "epoch": 0.72, + "grad_norm": 0.5687379927810733, + "learning_rate": 9.889600159869977e-06, + "loss": 2.0805, + "step": 9273 + }, + { + "epoch": 0.72, + "grad_norm": 0.5901804755337678, + "learning_rate": 9.884624076495245e-06, + "loss": 1.8696, + "step": 9274 + }, + { + "epoch": 0.72, + "grad_norm": 0.6001413139691167, + "learning_rate": 9.879648936835537e-06, + "loss": 1.9881, + "step": 9275 + }, + { + "epoch": 0.72, + "grad_norm": 0.5814126159887417, + "learning_rate": 9.87467474120147e-06, + "loss": 1.8028, + "step": 9276 + }, + { + "epoch": 0.72, + "grad_norm": 0.6070093577798656, + "learning_rate": 9.869701489903576e-06, + "loss": 1.8732, + "step": 9277 + }, + { + "epoch": 0.72, + "grad_norm": 0.5828542656286376, + "learning_rate": 9.864729183252395e-06, + "loss": 2.0749, + "step": 9278 + }, + { + "epoch": 0.72, + "grad_norm": 0.6883931659187985, + "learning_rate": 9.859757821558337e-06, + "loss": 1.8866, + "step": 9279 + }, + { + "epoch": 0.72, + "grad_norm": 0.6404242987489672, + "learning_rate": 9.85478740513179e-06, + "loss": 1.8502, + "step": 9280 + }, + { + "epoch": 0.72, + "grad_norm": 0.5586243460154057, + "learning_rate": 9.8498179342831e-06, + "loss": 1.9476, + "step": 9281 + }, + { + "epoch": 0.72, + "grad_norm": 0.6985995473612314, + "learning_rate": 9.844849409322507e-06, + "loss": 2.0613, + "step": 9282 + }, + { + "epoch": 0.72, + "grad_norm": 0.6077835055473094, + "learning_rate": 9.839881830560222e-06, + "loss": 1.8633, + "step": 9283 + }, + { + "epoch": 0.72, + "grad_norm": 0.6164790779805088, + "learning_rate": 9.834915198306394e-06, + "loss": 1.8772, + "step": 9284 + }, + { + "epoch": 0.72, + "grad_norm": 0.6740379464332183, + "learning_rate": 9.829949512871112e-06, + "loss": 1.8693, + "step": 9285 + }, + { + "epoch": 0.72, + "grad_norm": 0.5611387469339576, + "learning_rate": 9.8249847745644e-06, + "loss": 2.0391, + "step": 9286 + }, + { + "epoch": 0.72, + "grad_norm": 0.6192591806087414, + "learning_rate": 9.820020983696228e-06, + "loss": 1.902, + "step": 9287 + }, + { + "epoch": 0.72, + "grad_norm": 0.6901033779313758, + "learning_rate": 9.815058140576502e-06, + "loss": 1.9258, + "step": 9288 + }, + { + "epoch": 0.72, + "grad_norm": 0.5461962916869743, + "learning_rate": 9.810096245515094e-06, + "loss": 1.812, + "step": 9289 + }, + { + "epoch": 0.72, + "grad_norm": 0.6273547982635934, + "learning_rate": 9.805135298821768e-06, + "loss": 1.9908, + "step": 9290 + }, + { + "epoch": 0.72, + "grad_norm": 0.6162977639484502, + "learning_rate": 9.80017530080627e-06, + "loss": 1.9147, + "step": 9291 + }, + { + "epoch": 0.72, + "grad_norm": 0.5834847988762599, + "learning_rate": 9.795216251778266e-06, + "loss": 1.871, + "step": 9292 + }, + { + "epoch": 0.72, + "grad_norm": 0.5876223912801993, + "learning_rate": 9.790258152047376e-06, + "loss": 2.0444, + "step": 9293 + }, + { + "epoch": 0.72, + "grad_norm": 0.5290928720043104, + "learning_rate": 9.785301001923155e-06, + "loss": 1.9755, + "step": 9294 + }, + { + "epoch": 0.72, + "grad_norm": 0.5791252143136313, + "learning_rate": 9.780344801715096e-06, + "loss": 1.8599, + "step": 9295 + }, + { + "epoch": 0.72, + "grad_norm": 0.588392931823372, + "learning_rate": 9.775389551732633e-06, + "loss": 1.9003, + "step": 9296 + }, + { + "epoch": 0.72, + "grad_norm": 0.5529629571744131, + "learning_rate": 9.770435252285146e-06, + "loss": 1.8638, + "step": 9297 + }, + { + "epoch": 0.72, + "grad_norm": 0.537131013553097, + "learning_rate": 9.76548190368195e-06, + "loss": 2.1159, + "step": 9298 + }, + { + "epoch": 0.72, + "grad_norm": 0.5839084429182194, + "learning_rate": 9.760529506232306e-06, + "loss": 1.842, + "step": 9299 + }, + { + "epoch": 0.72, + "grad_norm": 0.5553734722973852, + "learning_rate": 9.755578060245406e-06, + "loss": 1.9605, + "step": 9300 + }, + { + "epoch": 0.72, + "grad_norm": 0.5456225138734266, + "learning_rate": 9.750627566030395e-06, + "loss": 1.8193, + "step": 9301 + }, + { + "epoch": 0.72, + "grad_norm": 0.5394570531306575, + "learning_rate": 9.745678023896353e-06, + "loss": 2.0634, + "step": 9302 + }, + { + "epoch": 0.72, + "grad_norm": 0.598553817223424, + "learning_rate": 9.7407294341523e-06, + "loss": 1.8828, + "step": 9303 + }, + { + "epoch": 0.72, + "grad_norm": 0.5753502339997031, + "learning_rate": 9.735781797107188e-06, + "loss": 1.9376, + "step": 9304 + }, + { + "epoch": 0.72, + "grad_norm": 0.557206453832556, + "learning_rate": 9.730835113069931e-06, + "loss": 2.0949, + "step": 9305 + }, + { + "epoch": 0.72, + "grad_norm": 0.5718145664194376, + "learning_rate": 9.725889382349362e-06, + "loss": 1.9781, + "step": 9306 + }, + { + "epoch": 0.72, + "grad_norm": 0.6252340794307238, + "learning_rate": 9.72094460525427e-06, + "loss": 1.8936, + "step": 9307 + }, + { + "epoch": 0.72, + "grad_norm": 0.6068262860839396, + "learning_rate": 9.71600078209337e-06, + "loss": 1.7976, + "step": 9308 + }, + { + "epoch": 0.72, + "grad_norm": 0.5593851232236426, + "learning_rate": 9.711057913175334e-06, + "loss": 1.8861, + "step": 9309 + }, + { + "epoch": 0.72, + "grad_norm": 0.6343592871161907, + "learning_rate": 9.706115998808757e-06, + "loss": 2.0901, + "step": 9310 + }, + { + "epoch": 0.72, + "grad_norm": 0.603726929348084, + "learning_rate": 9.701175039302191e-06, + "loss": 1.9028, + "step": 9311 + }, + { + "epoch": 0.72, + "grad_norm": 0.540844101733581, + "learning_rate": 9.696235034964122e-06, + "loss": 1.9085, + "step": 9312 + }, + { + "epoch": 0.72, + "grad_norm": 0.5818997421729912, + "learning_rate": 9.691295986102953e-06, + "loss": 1.889, + "step": 9313 + }, + { + "epoch": 0.72, + "grad_norm": 0.5774282749784887, + "learning_rate": 9.686357893027073e-06, + "loss": 2.0891, + "step": 9314 + }, + { + "epoch": 0.72, + "grad_norm": 0.5455396012599567, + "learning_rate": 9.68142075604479e-06, + "loss": 1.8386, + "step": 9315 + }, + { + "epoch": 0.72, + "grad_norm": 0.5560231317072828, + "learning_rate": 9.67648457546432e-06, + "loss": 1.9105, + "step": 9316 + }, + { + "epoch": 0.72, + "grad_norm": 0.5911131358901061, + "learning_rate": 9.671549351593889e-06, + "loss": 1.8223, + "step": 9317 + }, + { + "epoch": 0.72, + "grad_norm": 0.5950767093686201, + "learning_rate": 9.666615084741593e-06, + "loss": 2.1064, + "step": 9318 + }, + { + "epoch": 0.72, + "grad_norm": 0.6629213462850306, + "learning_rate": 9.661681775215498e-06, + "loss": 1.8686, + "step": 9319 + }, + { + "epoch": 0.72, + "grad_norm": 0.5162172326512674, + "learning_rate": 9.656749423323639e-06, + "loss": 1.8834, + "step": 9320 + }, + { + "epoch": 0.72, + "grad_norm": 0.5952177313652275, + "learning_rate": 9.65181802937393e-06, + "loss": 1.8938, + "step": 9321 + }, + { + "epoch": 0.72, + "grad_norm": 0.6794197089375936, + "learning_rate": 9.646887593674278e-06, + "loss": 2.0959, + "step": 9322 + }, + { + "epoch": 0.72, + "grad_norm": 0.6171538800800993, + "learning_rate": 9.641958116532518e-06, + "loss": 1.8379, + "step": 9323 + }, + { + "epoch": 0.72, + "grad_norm": 0.6181360025815352, + "learning_rate": 9.637029598256386e-06, + "loss": 1.9082, + "step": 9324 + }, + { + "epoch": 0.72, + "grad_norm": 0.600046441319299, + "learning_rate": 9.632102039153628e-06, + "loss": 1.9493, + "step": 9325 + }, + { + "epoch": 0.72, + "grad_norm": 0.5735399393542862, + "learning_rate": 9.627175439531862e-06, + "loss": 2.069, + "step": 9326 + }, + { + "epoch": 0.72, + "grad_norm": 0.6123741758597079, + "learning_rate": 9.62224979969868e-06, + "loss": 1.86, + "step": 9327 + }, + { + "epoch": 0.72, + "grad_norm": 0.5808501812293857, + "learning_rate": 9.617325119961631e-06, + "loss": 1.8406, + "step": 9328 + }, + { + "epoch": 0.72, + "grad_norm": 0.5973991678002561, + "learning_rate": 9.612401400628162e-06, + "loss": 1.8617, + "step": 9329 + }, + { + "epoch": 0.72, + "grad_norm": 0.559626654858846, + "learning_rate": 9.607478642005685e-06, + "loss": 2.1187, + "step": 9330 + }, + { + "epoch": 0.72, + "grad_norm": 0.5545233784869005, + "learning_rate": 9.602556844401553e-06, + "loss": 1.9134, + "step": 9331 + }, + { + "epoch": 0.72, + "grad_norm": 0.5700435269417434, + "learning_rate": 9.59763600812305e-06, + "loss": 1.8431, + "step": 9332 + }, + { + "epoch": 0.72, + "grad_norm": 0.6178587835056821, + "learning_rate": 9.592716133477406e-06, + "loss": 1.895, + "step": 9333 + }, + { + "epoch": 0.72, + "grad_norm": 0.5575293996099993, + "learning_rate": 9.587797220771789e-06, + "loss": 2.1126, + "step": 9334 + }, + { + "epoch": 0.72, + "grad_norm": 0.5992280849496696, + "learning_rate": 9.582879270313298e-06, + "loss": 1.8719, + "step": 9335 + }, + { + "epoch": 0.72, + "grad_norm": 0.6058746809145183, + "learning_rate": 9.577962282409002e-06, + "loss": 1.9174, + "step": 9336 + }, + { + "epoch": 0.72, + "grad_norm": 0.6047944422993576, + "learning_rate": 9.573046257365873e-06, + "loss": 1.9769, + "step": 9337 + }, + { + "epoch": 0.72, + "grad_norm": 0.6084504487046691, + "learning_rate": 9.56813119549084e-06, + "loss": 2.0983, + "step": 9338 + }, + { + "epoch": 0.72, + "grad_norm": 0.6121038646427984, + "learning_rate": 9.56321709709077e-06, + "loss": 1.873, + "step": 9339 + }, + { + "epoch": 0.72, + "grad_norm": 0.6689693776999579, + "learning_rate": 9.558303962472475e-06, + "loss": 1.8835, + "step": 9340 + }, + { + "epoch": 0.72, + "grad_norm": 0.6413289947369122, + "learning_rate": 9.553391791942701e-06, + "loss": 1.8679, + "step": 9341 + }, + { + "epoch": 0.72, + "grad_norm": 0.5876338706567029, + "learning_rate": 9.54848058580813e-06, + "loss": 2.0759, + "step": 9342 + }, + { + "epoch": 0.72, + "grad_norm": 0.5513303313729443, + "learning_rate": 9.543570344375394e-06, + "loss": 1.9421, + "step": 9343 + }, + { + "epoch": 0.72, + "grad_norm": 0.5783839120406689, + "learning_rate": 9.53866106795106e-06, + "loss": 1.8721, + "step": 9344 + }, + { + "epoch": 0.72, + "grad_norm": 0.5686676762625879, + "learning_rate": 9.53375275684163e-06, + "loss": 1.8483, + "step": 9345 + }, + { + "epoch": 0.72, + "grad_norm": 0.6792951813258494, + "learning_rate": 9.528845411353554e-06, + "loss": 2.0668, + "step": 9346 + }, + { + "epoch": 0.72, + "grad_norm": 0.5815055745084498, + "learning_rate": 9.523939031793214e-06, + "loss": 1.8526, + "step": 9347 + }, + { + "epoch": 0.72, + "grad_norm": 0.6181259903879921, + "learning_rate": 9.519033618466938e-06, + "loss": 1.8633, + "step": 9348 + }, + { + "epoch": 0.72, + "grad_norm": 0.5856358340516395, + "learning_rate": 9.514129171680992e-06, + "loss": 1.8275, + "step": 9349 + }, + { + "epoch": 0.72, + "grad_norm": 0.5724716323062503, + "learning_rate": 9.509225691741578e-06, + "loss": 2.1071, + "step": 9350 + }, + { + "epoch": 0.72, + "grad_norm": 0.6029574466175578, + "learning_rate": 9.504323178954852e-06, + "loss": 1.8944, + "step": 9351 + }, + { + "epoch": 0.72, + "grad_norm": 0.5814076117174487, + "learning_rate": 9.499421633626867e-06, + "loss": 1.8776, + "step": 9352 + }, + { + "epoch": 0.72, + "grad_norm": 0.6288234765318846, + "learning_rate": 9.494521056063679e-06, + "loss": 1.9064, + "step": 9353 + }, + { + "epoch": 0.72, + "grad_norm": 0.5662019462728898, + "learning_rate": 9.489621446571239e-06, + "loss": 2.0946, + "step": 9354 + }, + { + "epoch": 0.72, + "grad_norm": 0.6283084354313933, + "learning_rate": 9.484722805455452e-06, + "loss": 1.8854, + "step": 9355 + }, + { + "epoch": 0.72, + "grad_norm": 0.5574689220273532, + "learning_rate": 9.479825133022156e-06, + "loss": 1.9201, + "step": 9356 + }, + { + "epoch": 0.72, + "grad_norm": 0.5822509772374407, + "learning_rate": 9.474928429577135e-06, + "loss": 1.8668, + "step": 9357 + }, + { + "epoch": 0.72, + "grad_norm": 0.5515969106181204, + "learning_rate": 9.470032695426111e-06, + "loss": 2.0724, + "step": 9358 + }, + { + "epoch": 0.72, + "grad_norm": 0.6058671368309673, + "learning_rate": 9.465137930874757e-06, + "loss": 1.9054, + "step": 9359 + }, + { + "epoch": 0.72, + "grad_norm": 0.572273402916584, + "learning_rate": 9.460244136228638e-06, + "loss": 1.845, + "step": 9360 + }, + { + "epoch": 0.72, + "grad_norm": 0.5944290046213113, + "learning_rate": 9.455351311793328e-06, + "loss": 1.8932, + "step": 9361 + }, + { + "epoch": 0.72, + "grad_norm": 0.6212275596774309, + "learning_rate": 9.450459457874305e-06, + "loss": 2.145, + "step": 9362 + }, + { + "epoch": 0.72, + "grad_norm": 0.602064953299212, + "learning_rate": 9.445568574776958e-06, + "loss": 1.8713, + "step": 9363 + }, + { + "epoch": 0.72, + "grad_norm": 0.5613116881050803, + "learning_rate": 9.440678662806682e-06, + "loss": 1.9028, + "step": 9364 + }, + { + "epoch": 0.72, + "grad_norm": 0.6295597609600294, + "learning_rate": 9.435789722268745e-06, + "loss": 1.8922, + "step": 9365 + }, + { + "epoch": 0.72, + "grad_norm": 0.5943555245365125, + "learning_rate": 9.430901753468388e-06, + "loss": 2.0702, + "step": 9366 + }, + { + "epoch": 0.72, + "grad_norm": 0.6060005159332501, + "learning_rate": 9.426014756710813e-06, + "loss": 1.836, + "step": 9367 + }, + { + "epoch": 0.72, + "grad_norm": 0.5658496325921856, + "learning_rate": 9.421128732301098e-06, + "loss": 1.9566, + "step": 9368 + }, + { + "epoch": 0.72, + "grad_norm": 0.587653914794449, + "learning_rate": 9.416243680544323e-06, + "loss": 1.8741, + "step": 9369 + }, + { + "epoch": 0.72, + "grad_norm": 0.5743503097097566, + "learning_rate": 9.411359601745484e-06, + "loss": 2.0568, + "step": 9370 + }, + { + "epoch": 0.72, + "grad_norm": 0.5399595927334069, + "learning_rate": 9.40647649620949e-06, + "loss": 1.8602, + "step": 9371 + }, + { + "epoch": 0.72, + "grad_norm": 0.6146152043421261, + "learning_rate": 9.401594364241242e-06, + "loss": 1.8741, + "step": 9372 + }, + { + "epoch": 0.72, + "grad_norm": 0.6778021536665929, + "learning_rate": 9.396713206145532e-06, + "loss": 1.8854, + "step": 9373 + }, + { + "epoch": 0.72, + "grad_norm": 0.5458681620257805, + "learning_rate": 9.391833022227106e-06, + "loss": 2.0699, + "step": 9374 + }, + { + "epoch": 0.72, + "grad_norm": 0.5714016278551034, + "learning_rate": 9.386953812790683e-06, + "loss": 1.9165, + "step": 9375 + }, + { + "epoch": 0.72, + "grad_norm": 0.5719024554631115, + "learning_rate": 9.382075578140862e-06, + "loss": 1.8661, + "step": 9376 + }, + { + "epoch": 0.72, + "grad_norm": 0.6901872939184617, + "learning_rate": 9.377198318582226e-06, + "loss": 1.8194, + "step": 9377 + }, + { + "epoch": 0.72, + "grad_norm": 0.6563965744280347, + "learning_rate": 9.372322034419276e-06, + "loss": 2.0907, + "step": 9378 + }, + { + "epoch": 0.72, + "grad_norm": 0.6260836635659269, + "learning_rate": 9.367446725956464e-06, + "loss": 1.843, + "step": 9379 + }, + { + "epoch": 0.72, + "grad_norm": 0.5835467090669153, + "learning_rate": 9.362572393498172e-06, + "loss": 1.8845, + "step": 9380 + }, + { + "epoch": 0.72, + "grad_norm": 0.5699337617864676, + "learning_rate": 9.357699037348724e-06, + "loss": 1.934, + "step": 9381 + }, + { + "epoch": 0.72, + "grad_norm": 0.6023244084027957, + "learning_rate": 9.35282665781239e-06, + "loss": 2.0561, + "step": 9382 + }, + { + "epoch": 0.72, + "grad_norm": 0.621481098854153, + "learning_rate": 9.347955255193364e-06, + "loss": 1.9285, + "step": 9383 + }, + { + "epoch": 0.72, + "grad_norm": 0.5603901255737668, + "learning_rate": 9.343084829795793e-06, + "loss": 1.8745, + "step": 9384 + }, + { + "epoch": 0.72, + "grad_norm": 0.6488501978683392, + "learning_rate": 9.338215381923753e-06, + "loss": 1.8573, + "step": 9385 + }, + { + "epoch": 0.72, + "grad_norm": 0.5892705241135113, + "learning_rate": 9.333346911881271e-06, + "loss": 2.0976, + "step": 9386 + }, + { + "epoch": 0.72, + "grad_norm": 0.5760798551442249, + "learning_rate": 9.328479419972303e-06, + "loss": 1.9216, + "step": 9387 + }, + { + "epoch": 0.72, + "grad_norm": 0.6692091759960711, + "learning_rate": 9.32361290650074e-06, + "loss": 1.8401, + "step": 9388 + }, + { + "epoch": 0.72, + "grad_norm": 0.621527679608654, + "learning_rate": 9.318747371770426e-06, + "loss": 1.8896, + "step": 9389 + }, + { + "epoch": 0.72, + "grad_norm": 0.6029148964668166, + "learning_rate": 9.313882816085135e-06, + "loss": 2.0533, + "step": 9390 + }, + { + "epoch": 0.72, + "grad_norm": 0.6068531747242455, + "learning_rate": 9.309019239748578e-06, + "loss": 1.913, + "step": 9391 + }, + { + "epoch": 0.72, + "grad_norm": 0.6044104142362767, + "learning_rate": 9.30415664306441e-06, + "loss": 1.8432, + "step": 9392 + }, + { + "epoch": 0.72, + "grad_norm": 0.5425562376020766, + "learning_rate": 9.299295026336225e-06, + "loss": 1.9906, + "step": 9393 + }, + { + "epoch": 0.72, + "grad_norm": 0.5785055803951957, + "learning_rate": 9.294434389867548e-06, + "loss": 2.0687, + "step": 9394 + }, + { + "epoch": 0.72, + "grad_norm": 0.6341989758770389, + "learning_rate": 9.289574733961856e-06, + "loss": 1.9035, + "step": 9395 + }, + { + "epoch": 0.72, + "grad_norm": 0.5692716145176596, + "learning_rate": 9.284716058922553e-06, + "loss": 1.8729, + "step": 9396 + }, + { + "epoch": 0.72, + "grad_norm": 0.5825671876383791, + "learning_rate": 9.279858365052985e-06, + "loss": 1.9146, + "step": 9397 + }, + { + "epoch": 0.73, + "grad_norm": 0.6695415546558015, + "learning_rate": 9.275001652656446e-06, + "loss": 2.0661, + "step": 9398 + }, + { + "epoch": 0.73, + "grad_norm": 0.5401738138764443, + "learning_rate": 9.270145922036139e-06, + "loss": 1.9267, + "step": 9399 + }, + { + "epoch": 0.73, + "grad_norm": 0.58323333393003, + "learning_rate": 9.26529117349525e-06, + "loss": 1.8519, + "step": 9400 + }, + { + "epoch": 0.73, + "grad_norm": 0.6051823583841881, + "learning_rate": 9.260437407336872e-06, + "loss": 1.8358, + "step": 9401 + }, + { + "epoch": 0.73, + "grad_norm": 0.6234844133354178, + "learning_rate": 9.255584623864046e-06, + "loss": 2.0876, + "step": 9402 + }, + { + "epoch": 0.73, + "grad_norm": 0.5723569817697856, + "learning_rate": 9.250732823379754e-06, + "loss": 1.8554, + "step": 9403 + }, + { + "epoch": 0.73, + "grad_norm": 0.5681590229916659, + "learning_rate": 9.245882006186913e-06, + "loss": 1.8464, + "step": 9404 + }, + { + "epoch": 0.73, + "grad_norm": 0.5952711982025045, + "learning_rate": 9.241032172588373e-06, + "loss": 1.9631, + "step": 9405 + }, + { + "epoch": 0.73, + "grad_norm": 0.6163154862120638, + "learning_rate": 9.236183322886945e-06, + "loss": 2.1091, + "step": 9406 + }, + { + "epoch": 0.73, + "grad_norm": 0.6041059810488946, + "learning_rate": 9.231335457385335e-06, + "loss": 1.8495, + "step": 9407 + }, + { + "epoch": 0.73, + "grad_norm": 0.5921346946201703, + "learning_rate": 9.226488576386238e-06, + "loss": 1.8783, + "step": 9408 + }, + { + "epoch": 0.73, + "grad_norm": 0.5920007996975722, + "learning_rate": 9.22164268019227e-06, + "loss": 1.8993, + "step": 9409 + }, + { + "epoch": 0.73, + "grad_norm": 0.5962259412905768, + "learning_rate": 9.21679776910595e-06, + "loss": 2.0703, + "step": 9410 + }, + { + "epoch": 0.73, + "grad_norm": 0.6611801894038234, + "learning_rate": 9.211953843429799e-06, + "loss": 1.8377, + "step": 9411 + }, + { + "epoch": 0.73, + "grad_norm": 0.5846130304552307, + "learning_rate": 9.207110903466224e-06, + "loss": 1.9769, + "step": 9412 + }, + { + "epoch": 0.73, + "grad_norm": 0.6086267574917718, + "learning_rate": 9.202268949517584e-06, + "loss": 1.8348, + "step": 9413 + }, + { + "epoch": 0.73, + "grad_norm": 0.6175484212041576, + "learning_rate": 9.197427981886208e-06, + "loss": 2.0989, + "step": 9414 + }, + { + "epoch": 0.73, + "grad_norm": 0.6192458373781426, + "learning_rate": 9.192588000874314e-06, + "loss": 1.8728, + "step": 9415 + }, + { + "epoch": 0.73, + "grad_norm": 0.7007182080599461, + "learning_rate": 9.187749006784083e-06, + "loss": 1.8523, + "step": 9416 + }, + { + "epoch": 0.73, + "grad_norm": 0.6307553660134358, + "learning_rate": 9.182910999917655e-06, + "loss": 1.8979, + "step": 9417 + }, + { + "epoch": 0.73, + "grad_norm": 0.5083796241128526, + "learning_rate": 9.178073980577053e-06, + "loss": 1.9393, + "step": 9418 + }, + { + "epoch": 0.73, + "grad_norm": 0.6303539237793349, + "learning_rate": 9.173237949064308e-06, + "loss": 2.0958, + "step": 9419 + }, + { + "epoch": 0.73, + "grad_norm": 0.6209768820241488, + "learning_rate": 9.168402905681331e-06, + "loss": 1.9208, + "step": 9420 + }, + { + "epoch": 0.73, + "grad_norm": 0.5384370279887928, + "learning_rate": 9.163568850729986e-06, + "loss": 1.9146, + "step": 9421 + }, + { + "epoch": 0.73, + "grad_norm": 0.6571652833999045, + "learning_rate": 9.158735784512112e-06, + "loss": 2.0403, + "step": 9422 + }, + { + "epoch": 0.73, + "grad_norm": 0.6489007292984263, + "learning_rate": 9.153903707329437e-06, + "loss": 1.8428, + "step": 9423 + }, + { + "epoch": 0.73, + "grad_norm": 0.5703503555823981, + "learning_rate": 9.149072619483636e-06, + "loss": 2.0104, + "step": 9424 + }, + { + "epoch": 0.73, + "grad_norm": 0.5668344564488843, + "learning_rate": 9.144242521276369e-06, + "loss": 1.8733, + "step": 9425 + }, + { + "epoch": 0.73, + "grad_norm": 0.5746449095381313, + "learning_rate": 9.13941341300917e-06, + "loss": 2.0575, + "step": 9426 + }, + { + "epoch": 0.73, + "grad_norm": 0.6435069762237243, + "learning_rate": 9.13458529498355e-06, + "loss": 1.8776, + "step": 9427 + }, + { + "epoch": 0.73, + "grad_norm": 0.5907001994440858, + "learning_rate": 9.129758167500944e-06, + "loss": 1.8389, + "step": 9428 + }, + { + "epoch": 0.73, + "grad_norm": 0.6096899242007797, + "learning_rate": 9.124932030862735e-06, + "loss": 1.8674, + "step": 9429 + }, + { + "epoch": 0.73, + "grad_norm": 0.6442030602900712, + "learning_rate": 9.120106885370233e-06, + "loss": 1.9779, + "step": 9430 + }, + { + "epoch": 0.73, + "grad_norm": 0.6170124255336897, + "learning_rate": 9.115282731324696e-06, + "loss": 2.0336, + "step": 9431 + }, + { + "epoch": 0.73, + "grad_norm": 0.5595169007306818, + "learning_rate": 9.110459569027313e-06, + "loss": 1.8517, + "step": 9432 + }, + { + "epoch": 0.73, + "grad_norm": 0.5871197982990949, + "learning_rate": 9.105637398779216e-06, + "loss": 1.8355, + "step": 9433 + }, + { + "epoch": 0.73, + "grad_norm": 0.5660837130148821, + "learning_rate": 9.10081622088147e-06, + "loss": 2.0648, + "step": 9434 + }, + { + "epoch": 0.73, + "grad_norm": 0.5760349262808508, + "learning_rate": 9.095996035635084e-06, + "loss": 1.8565, + "step": 9435 + }, + { + "epoch": 0.73, + "grad_norm": 0.5729605771402125, + "learning_rate": 9.091176843341001e-06, + "loss": 1.9461, + "step": 9436 + }, + { + "epoch": 0.73, + "grad_norm": 0.6030907602604958, + "learning_rate": 9.086358644300102e-06, + "loss": 1.8891, + "step": 9437 + }, + { + "epoch": 0.73, + "grad_norm": 0.6453222172628725, + "learning_rate": 9.08154143881321e-06, + "loss": 2.0475, + "step": 9438 + }, + { + "epoch": 0.73, + "grad_norm": 0.5942308760362073, + "learning_rate": 9.07672522718108e-06, + "loss": 1.9126, + "step": 9439 + }, + { + "epoch": 0.73, + "grad_norm": 0.6188324557634042, + "learning_rate": 9.071910009704407e-06, + "loss": 1.86, + "step": 9440 + }, + { + "epoch": 0.73, + "grad_norm": 0.6245435240497579, + "learning_rate": 9.067095786683826e-06, + "loss": 1.8684, + "step": 9441 + }, + { + "epoch": 0.73, + "grad_norm": 0.6534368737723625, + "learning_rate": 9.06228255841991e-06, + "loss": 2.0437, + "step": 9442 + }, + { + "epoch": 0.73, + "grad_norm": 0.5507354831165161, + "learning_rate": 9.057470325213166e-06, + "loss": 1.941, + "step": 9443 + }, + { + "epoch": 0.73, + "grad_norm": 0.639838833256376, + "learning_rate": 9.052659087364046e-06, + "loss": 1.9231, + "step": 9444 + }, + { + "epoch": 0.73, + "grad_norm": 0.6260917906716007, + "learning_rate": 9.04784884517294e-06, + "loss": 1.8535, + "step": 9445 + }, + { + "epoch": 0.73, + "grad_norm": 0.6576683699220596, + "learning_rate": 9.043039598940144e-06, + "loss": 2.054, + "step": 9446 + }, + { + "epoch": 0.73, + "grad_norm": 0.6235239081672191, + "learning_rate": 9.03823134896595e-06, + "loss": 1.8874, + "step": 9447 + }, + { + "epoch": 0.73, + "grad_norm": 0.5951418950763161, + "learning_rate": 9.033424095550556e-06, + "loss": 1.8911, + "step": 9448 + }, + { + "epoch": 0.73, + "grad_norm": 0.5845715382159836, + "learning_rate": 9.02861783899407e-06, + "loss": 1.9123, + "step": 9449 + }, + { + "epoch": 0.73, + "grad_norm": 0.6876808743205632, + "learning_rate": 9.023812579596588e-06, + "loss": 1.9128, + "step": 9450 + }, + { + "epoch": 0.73, + "grad_norm": 0.5611318276007188, + "learning_rate": 9.019008317658123e-06, + "loss": 2.0361, + "step": 9451 + }, + { + "epoch": 0.73, + "grad_norm": 0.6032761363567714, + "learning_rate": 9.014205053478619e-06, + "loss": 1.8789, + "step": 9452 + }, + { + "epoch": 0.73, + "grad_norm": 0.6347957666673468, + "learning_rate": 9.009402787357973e-06, + "loss": 1.8825, + "step": 9453 + }, + { + "epoch": 0.73, + "grad_norm": 0.5791982929822131, + "learning_rate": 9.004601519595988e-06, + "loss": 2.1024, + "step": 9454 + }, + { + "epoch": 0.73, + "grad_norm": 0.5729473772425203, + "learning_rate": 8.999801250492446e-06, + "loss": 1.971, + "step": 9455 + }, + { + "epoch": 0.73, + "grad_norm": 0.6257006091154033, + "learning_rate": 8.995001980347052e-06, + "loss": 1.8897, + "step": 9456 + }, + { + "epoch": 0.73, + "grad_norm": 0.6076539047162678, + "learning_rate": 8.990203709459417e-06, + "loss": 1.835, + "step": 9457 + }, + { + "epoch": 0.73, + "grad_norm": 0.5850401819996345, + "learning_rate": 8.985406438129142e-06, + "loss": 2.055, + "step": 9458 + }, + { + "epoch": 0.73, + "grad_norm": 0.5689146963705463, + "learning_rate": 8.980610166655742e-06, + "loss": 1.8846, + "step": 9459 + }, + { + "epoch": 0.73, + "grad_norm": 0.575624075341681, + "learning_rate": 8.975814895338645e-06, + "loss": 1.8583, + "step": 9460 + }, + { + "epoch": 0.73, + "grad_norm": 0.6124471190544843, + "learning_rate": 8.971020624477267e-06, + "loss": 1.9483, + "step": 9461 + }, + { + "epoch": 0.73, + "grad_norm": 0.5870489599018549, + "learning_rate": 8.96622735437091e-06, + "loss": 1.8386, + "step": 9462 + }, + { + "epoch": 0.73, + "grad_norm": 0.6236758126541322, + "learning_rate": 8.961435085318841e-06, + "loss": 2.0685, + "step": 9463 + }, + { + "epoch": 0.73, + "grad_norm": 0.6193977701908088, + "learning_rate": 8.956643817620286e-06, + "loss": 1.8592, + "step": 9464 + }, + { + "epoch": 0.73, + "grad_norm": 0.5877995699443526, + "learning_rate": 8.951853551574346e-06, + "loss": 1.906, + "step": 9465 + }, + { + "epoch": 0.73, + "grad_norm": 0.6054047890924181, + "learning_rate": 8.947064287480134e-06, + "loss": 2.0601, + "step": 9466 + }, + { + "epoch": 0.73, + "grad_norm": 0.5390324628691262, + "learning_rate": 8.942276025636636e-06, + "loss": 1.939, + "step": 9467 + }, + { + "epoch": 0.73, + "grad_norm": 0.6175430859112587, + "learning_rate": 8.937488766342806e-06, + "loss": 1.8923, + "step": 9468 + }, + { + "epoch": 0.73, + "grad_norm": 0.6229227645876635, + "learning_rate": 8.932702509897556e-06, + "loss": 1.8753, + "step": 9469 + }, + { + "epoch": 0.73, + "grad_norm": 0.5736164626858687, + "learning_rate": 8.927917256599683e-06, + "loss": 1.8698, + "step": 9470 + }, + { + "epoch": 0.73, + "grad_norm": 0.6379059567881266, + "learning_rate": 8.923133006747955e-06, + "loss": 2.0279, + "step": 9471 + }, + { + "epoch": 0.73, + "grad_norm": 0.5760810313443809, + "learning_rate": 8.918349760641095e-06, + "loss": 1.8627, + "step": 9472 + }, + { + "epoch": 0.73, + "grad_norm": 0.5697705538618044, + "learning_rate": 8.913567518577718e-06, + "loss": 1.8896, + "step": 9473 + }, + { + "epoch": 0.73, + "grad_norm": 0.6981249684323197, + "learning_rate": 8.908786280856405e-06, + "loss": 1.9395, + "step": 9474 + }, + { + "epoch": 0.73, + "grad_norm": 0.6218000335349834, + "learning_rate": 8.904006047775668e-06, + "loss": 2.1116, + "step": 9475 + }, + { + "epoch": 0.73, + "grad_norm": 0.6178864632704987, + "learning_rate": 8.899226819633954e-06, + "loss": 1.9145, + "step": 9476 + }, + { + "epoch": 0.73, + "grad_norm": 0.7450733242307268, + "learning_rate": 8.89444859672966e-06, + "loss": 1.8561, + "step": 9477 + }, + { + "epoch": 0.73, + "grad_norm": 0.5841526035503444, + "learning_rate": 8.889671379361098e-06, + "loss": 2.0579, + "step": 9478 + }, + { + "epoch": 0.73, + "grad_norm": 0.6473474635039632, + "learning_rate": 8.884895167826537e-06, + "loss": 1.8439, + "step": 9479 + }, + { + "epoch": 0.73, + "grad_norm": 0.7216526059860939, + "learning_rate": 8.880119962424175e-06, + "loss": 1.9143, + "step": 9480 + }, + { + "epoch": 0.73, + "grad_norm": 0.6470226772967926, + "learning_rate": 8.875345763452142e-06, + "loss": 1.8636, + "step": 9481 + }, + { + "epoch": 0.73, + "grad_norm": 0.6166961930964023, + "learning_rate": 8.870572571208516e-06, + "loss": 1.8053, + "step": 9482 + }, + { + "epoch": 0.73, + "grad_norm": 0.875935645520168, + "learning_rate": 8.865800385991307e-06, + "loss": 2.0708, + "step": 9483 + }, + { + "epoch": 0.73, + "grad_norm": 0.5715072908908646, + "learning_rate": 8.86102920809846e-06, + "loss": 1.857, + "step": 9484 + }, + { + "epoch": 0.73, + "grad_norm": 0.6951406734229545, + "learning_rate": 8.856259037827861e-06, + "loss": 1.8902, + "step": 9485 + }, + { + "epoch": 0.73, + "grad_norm": 0.7076093690015756, + "learning_rate": 8.851489875477328e-06, + "loss": 1.9391, + "step": 9486 + }, + { + "epoch": 0.73, + "grad_norm": 0.5949623925899069, + "learning_rate": 8.846721721344625e-06, + "loss": 2.047, + "step": 9487 + }, + { + "epoch": 0.73, + "grad_norm": 0.6090980475835315, + "learning_rate": 8.841954575727442e-06, + "loss": 1.824, + "step": 9488 + }, + { + "epoch": 0.73, + "grad_norm": 0.7688083262452369, + "learning_rate": 8.837188438923413e-06, + "loss": 1.8532, + "step": 9489 + }, + { + "epoch": 0.73, + "grad_norm": 0.6183425126155839, + "learning_rate": 8.832423311230112e-06, + "loss": 2.1082, + "step": 9490 + }, + { + "epoch": 0.73, + "grad_norm": 0.6694578889361724, + "learning_rate": 8.82765919294504e-06, + "loss": 1.8578, + "step": 9491 + }, + { + "epoch": 0.73, + "grad_norm": 0.7091615420809887, + "learning_rate": 8.82289608436564e-06, + "loss": 1.8628, + "step": 9492 + }, + { + "epoch": 0.73, + "grad_norm": 0.6218040623554549, + "learning_rate": 8.818133985789295e-06, + "loss": 1.8747, + "step": 9493 + }, + { + "epoch": 0.73, + "grad_norm": 0.6245583656484505, + "learning_rate": 8.813372897513325e-06, + "loss": 1.8746, + "step": 9494 + }, + { + "epoch": 0.73, + "grad_norm": 0.729931058114289, + "learning_rate": 8.808612819834989e-06, + "loss": 2.0604, + "step": 9495 + }, + { + "epoch": 0.73, + "grad_norm": 0.6182482245011539, + "learning_rate": 8.803853753051452e-06, + "loss": 1.8452, + "step": 9496 + }, + { + "epoch": 0.73, + "grad_norm": 0.6247423435013503, + "learning_rate": 8.79909569745987e-06, + "loss": 1.8744, + "step": 9497 + }, + { + "epoch": 0.73, + "grad_norm": 0.795580986524546, + "learning_rate": 8.794338653357297e-06, + "loss": 2.0337, + "step": 9498 + }, + { + "epoch": 0.73, + "grad_norm": 0.5750633129359688, + "learning_rate": 8.78958262104074e-06, + "loss": 1.9174, + "step": 9499 + }, + { + "epoch": 0.73, + "grad_norm": 0.604489153943089, + "learning_rate": 8.784827600807139e-06, + "loss": 1.8853, + "step": 9500 + }, + { + "epoch": 0.73, + "grad_norm": 0.6949752365476995, + "learning_rate": 8.78007359295335e-06, + "loss": 1.874, + "step": 9501 + }, + { + "epoch": 0.73, + "grad_norm": 0.6135693159496811, + "learning_rate": 8.775320597776206e-06, + "loss": 1.8847, + "step": 9502 + }, + { + "epoch": 0.73, + "grad_norm": 0.6462318566111473, + "learning_rate": 8.770568615572461e-06, + "loss": 2.1086, + "step": 9503 + }, + { + "epoch": 0.73, + "grad_norm": 0.5733103533254105, + "learning_rate": 8.765817646638769e-06, + "loss": 1.8917, + "step": 9504 + }, + { + "epoch": 0.73, + "grad_norm": 0.5564097713988024, + "learning_rate": 8.761067691271785e-06, + "loss": 1.958, + "step": 9505 + }, + { + "epoch": 0.73, + "grad_norm": 0.6457923816094181, + "learning_rate": 8.756318749768061e-06, + "loss": 1.8115, + "step": 9506 + }, + { + "epoch": 0.73, + "grad_norm": 0.5834141388875931, + "learning_rate": 8.751570822424074e-06, + "loss": 2.0751, + "step": 9507 + }, + { + "epoch": 0.73, + "grad_norm": 0.6269065322068194, + "learning_rate": 8.746823909536286e-06, + "loss": 1.8994, + "step": 9508 + }, + { + "epoch": 0.73, + "grad_norm": 0.5626929307055278, + "learning_rate": 8.742078011401045e-06, + "loss": 1.8665, + "step": 9509 + }, + { + "epoch": 0.73, + "grad_norm": 0.5966242219412808, + "learning_rate": 8.737333128314653e-06, + "loss": 2.0732, + "step": 9510 + }, + { + "epoch": 0.73, + "grad_norm": 0.5810034332604023, + "learning_rate": 8.732589260573379e-06, + "loss": 1.9736, + "step": 9511 + }, + { + "epoch": 0.73, + "grad_norm": 0.5707036521543274, + "learning_rate": 8.727846408473377e-06, + "loss": 1.8542, + "step": 9512 + }, + { + "epoch": 0.73, + "grad_norm": 0.5879599653983297, + "learning_rate": 8.723104572310773e-06, + "loss": 1.878, + "step": 9513 + }, + { + "epoch": 0.73, + "grad_norm": 0.6057033813881587, + "learning_rate": 8.718363752381616e-06, + "loss": 1.8696, + "step": 9514 + }, + { + "epoch": 0.73, + "grad_norm": 0.5509816841704259, + "learning_rate": 8.713623948981892e-06, + "loss": 2.0485, + "step": 9515 + }, + { + "epoch": 0.73, + "grad_norm": 0.5338385562154905, + "learning_rate": 8.708885162407542e-06, + "loss": 1.8676, + "step": 9516 + }, + { + "epoch": 0.73, + "grad_norm": 0.5407978599157268, + "learning_rate": 8.704147392954412e-06, + "loss": 1.9647, + "step": 9517 + }, + { + "epoch": 0.73, + "grad_norm": 0.5859708347920605, + "learning_rate": 8.699410640918295e-06, + "loss": 1.8861, + "step": 9518 + }, + { + "epoch": 0.73, + "grad_norm": 0.5582291647799464, + "learning_rate": 8.694674906594954e-06, + "loss": 2.0295, + "step": 9519 + }, + { + "epoch": 0.73, + "grad_norm": 0.5877479785782478, + "learning_rate": 8.689940190280035e-06, + "loss": 1.8888, + "step": 9520 + }, + { + "epoch": 0.73, + "grad_norm": 0.6442823406633241, + "learning_rate": 8.685206492269155e-06, + "loss": 1.8627, + "step": 9521 + }, + { + "epoch": 0.73, + "grad_norm": 0.6453134998717286, + "learning_rate": 8.680473812857853e-06, + "loss": 2.0367, + "step": 9522 + }, + { + "epoch": 0.73, + "grad_norm": 0.5119894099689503, + "learning_rate": 8.675742152341615e-06, + "loss": 1.9228, + "step": 9523 + }, + { + "epoch": 0.73, + "grad_norm": 0.5429165080201868, + "learning_rate": 8.671011511015855e-06, + "loss": 1.8102, + "step": 9524 + }, + { + "epoch": 0.73, + "grad_norm": 0.6342332387544026, + "learning_rate": 8.666281889175928e-06, + "loss": 1.862, + "step": 9525 + }, + { + "epoch": 0.73, + "grad_norm": 0.5509788989570872, + "learning_rate": 8.661553287117125e-06, + "loss": 1.8252, + "step": 9526 + }, + { + "epoch": 0.73, + "grad_norm": 0.5627231021312027, + "learning_rate": 8.656825705134666e-06, + "loss": 2.0805, + "step": 9527 + }, + { + "epoch": 0.74, + "grad_norm": 0.5525287267445802, + "learning_rate": 8.652099143523721e-06, + "loss": 1.8514, + "step": 9528 + }, + { + "epoch": 0.74, + "grad_norm": 0.55115614443392, + "learning_rate": 8.647373602579383e-06, + "loss": 1.9534, + "step": 9529 + }, + { + "epoch": 0.74, + "grad_norm": 0.5957817560656531, + "learning_rate": 8.642649082596693e-06, + "loss": 1.9548, + "step": 9530 + }, + { + "epoch": 0.74, + "grad_norm": 0.6657659770938509, + "learning_rate": 8.637925583870614e-06, + "loss": 2.0309, + "step": 9531 + }, + { + "epoch": 0.74, + "grad_norm": 0.5866233619020875, + "learning_rate": 8.633203106696062e-06, + "loss": 1.8447, + "step": 9532 + }, + { + "epoch": 0.74, + "grad_norm": 0.5808934747309141, + "learning_rate": 8.628481651367876e-06, + "loss": 1.8767, + "step": 9533 + }, + { + "epoch": 0.74, + "grad_norm": 0.535185379678177, + "learning_rate": 8.623761218180834e-06, + "loss": 1.8669, + "step": 9534 + }, + { + "epoch": 0.74, + "grad_norm": 0.6063668267229426, + "learning_rate": 8.619041807429656e-06, + "loss": 2.07, + "step": 9535 + }, + { + "epoch": 0.74, + "grad_norm": 0.5902608817947048, + "learning_rate": 8.614323419408993e-06, + "loss": 1.9499, + "step": 9536 + }, + { + "epoch": 0.74, + "grad_norm": 0.5687856952619362, + "learning_rate": 8.60960605441343e-06, + "loss": 1.849, + "step": 9537 + }, + { + "epoch": 0.74, + "grad_norm": 0.5695324132287162, + "learning_rate": 8.604889712737496e-06, + "loss": 1.8757, + "step": 9538 + }, + { + "epoch": 0.74, + "grad_norm": 0.5526098141112786, + "learning_rate": 8.600174394675652e-06, + "loss": 2.0706, + "step": 9539 + }, + { + "epoch": 0.74, + "grad_norm": 0.5786330331838251, + "learning_rate": 8.59546010052229e-06, + "loss": 1.8599, + "step": 9540 + }, + { + "epoch": 0.74, + "grad_norm": 0.5602785682551447, + "learning_rate": 8.590746830571748e-06, + "loss": 1.8586, + "step": 9541 + }, + { + "epoch": 0.74, + "grad_norm": 0.5985409438282746, + "learning_rate": 8.5860345851183e-06, + "loss": 1.9022, + "step": 9542 + }, + { + "epoch": 0.74, + "grad_norm": 0.592201552467059, + "learning_rate": 8.581323364456125e-06, + "loss": 2.0641, + "step": 9543 + }, + { + "epoch": 0.74, + "grad_norm": 0.5578314571771809, + "learning_rate": 8.576613168879392e-06, + "loss": 1.8497, + "step": 9544 + }, + { + "epoch": 0.74, + "grad_norm": 0.6244974886306275, + "learning_rate": 8.571903998682174e-06, + "loss": 1.871, + "step": 9545 + }, + { + "epoch": 0.74, + "grad_norm": 0.5907029026001209, + "learning_rate": 8.567195854158464e-06, + "loss": 1.8929, + "step": 9546 + }, + { + "epoch": 0.74, + "grad_norm": 0.623254435928751, + "learning_rate": 8.56248873560224e-06, + "loss": 2.0496, + "step": 9547 + }, + { + "epoch": 0.74, + "grad_norm": 0.5260493697608941, + "learning_rate": 8.557782643307353e-06, + "loss": 1.9155, + "step": 9548 + }, + { + "epoch": 0.74, + "grad_norm": 0.57905602423377, + "learning_rate": 8.55307757756765e-06, + "loss": 1.8251, + "step": 9549 + }, + { + "epoch": 0.74, + "grad_norm": 0.5434398927915, + "learning_rate": 8.54837353867689e-06, + "loss": 1.887, + "step": 9550 + }, + { + "epoch": 0.74, + "grad_norm": 0.5880842494468408, + "learning_rate": 8.543670526928738e-06, + "loss": 2.0498, + "step": 9551 + }, + { + "epoch": 0.74, + "grad_norm": 0.5738337689230462, + "learning_rate": 8.538968542616846e-06, + "loss": 1.8387, + "step": 9552 + }, + { + "epoch": 0.74, + "grad_norm": 0.5543706091646878, + "learning_rate": 8.53426758603478e-06, + "loss": 1.9034, + "step": 9553 + }, + { + "epoch": 0.74, + "grad_norm": 0.545187432898989, + "learning_rate": 8.529567657476017e-06, + "loss": 1.9263, + "step": 9554 + }, + { + "epoch": 0.74, + "grad_norm": 0.5707560369565038, + "learning_rate": 8.524868757234022e-06, + "loss": 2.1097, + "step": 9555 + }, + { + "epoch": 0.74, + "grad_norm": 0.58160244042883, + "learning_rate": 8.520170885602143e-06, + "loss": 1.8936, + "step": 9556 + }, + { + "epoch": 0.74, + "grad_norm": 0.585963558994109, + "learning_rate": 8.515474042873691e-06, + "loss": 1.8756, + "step": 9557 + }, + { + "epoch": 0.74, + "grad_norm": 0.5746945313357992, + "learning_rate": 8.51077822934193e-06, + "loss": 1.8649, + "step": 9558 + }, + { + "epoch": 0.74, + "grad_norm": 0.6145953177314585, + "learning_rate": 8.506083445300015e-06, + "loss": 2.0292, + "step": 9559 + }, + { + "epoch": 0.74, + "grad_norm": 0.6025504246175644, + "learning_rate": 8.50138969104107e-06, + "loss": 1.89, + "step": 9560 + }, + { + "epoch": 0.74, + "grad_norm": 0.5796718589293484, + "learning_rate": 8.496696966858145e-06, + "loss": 2.0067, + "step": 9561 + }, + { + "epoch": 0.74, + "grad_norm": 0.6362908349276662, + "learning_rate": 8.492005273044214e-06, + "loss": 1.8651, + "step": 9562 + }, + { + "epoch": 0.74, + "grad_norm": 0.5986553542263581, + "learning_rate": 8.48731460989223e-06, + "loss": 2.0692, + "step": 9563 + }, + { + "epoch": 0.74, + "grad_norm": 0.6173634114156699, + "learning_rate": 8.482624977695023e-06, + "loss": 1.8443, + "step": 9564 + }, + { + "epoch": 0.74, + "grad_norm": 0.5768448676413439, + "learning_rate": 8.477936376745382e-06, + "loss": 1.8563, + "step": 9565 + }, + { + "epoch": 0.74, + "grad_norm": 0.5773784051898269, + "learning_rate": 8.473248807336067e-06, + "loss": 1.8736, + "step": 9566 + }, + { + "epoch": 0.74, + "grad_norm": 0.5840541693790218, + "learning_rate": 8.468562269759714e-06, + "loss": 2.1422, + "step": 9567 + }, + { + "epoch": 0.74, + "grad_norm": 0.5919013030193284, + "learning_rate": 8.463876764308928e-06, + "loss": 1.8795, + "step": 9568 + }, + { + "epoch": 0.74, + "grad_norm": 0.5927752171363201, + "learning_rate": 8.459192291276252e-06, + "loss": 1.8589, + "step": 9569 + }, + { + "epoch": 0.74, + "grad_norm": 0.6030607086938009, + "learning_rate": 8.454508850954151e-06, + "loss": 1.8474, + "step": 9570 + }, + { + "epoch": 0.74, + "grad_norm": 0.6274000771088933, + "learning_rate": 8.449826443635032e-06, + "loss": 2.0389, + "step": 9571 + }, + { + "epoch": 0.74, + "grad_norm": 0.5826636988271239, + "learning_rate": 8.445145069611237e-06, + "loss": 1.8763, + "step": 9572 + }, + { + "epoch": 0.74, + "grad_norm": 0.5066516616232706, + "learning_rate": 8.440464729175048e-06, + "loss": 1.9179, + "step": 9573 + }, + { + "epoch": 0.74, + "grad_norm": 0.6007634397741938, + "learning_rate": 8.435785422618672e-06, + "loss": 1.8771, + "step": 9574 + }, + { + "epoch": 0.74, + "grad_norm": 0.5789947552220854, + "learning_rate": 8.43110715023426e-06, + "loss": 2.0213, + "step": 9575 + }, + { + "epoch": 0.74, + "grad_norm": 0.5483313299968551, + "learning_rate": 8.426429912313896e-06, + "loss": 1.9292, + "step": 9576 + }, + { + "epoch": 0.74, + "grad_norm": 0.5629356730812739, + "learning_rate": 8.421753709149596e-06, + "loss": 1.8348, + "step": 9577 + }, + { + "epoch": 0.74, + "grad_norm": 0.5942744150441326, + "learning_rate": 8.417078541033319e-06, + "loss": 1.8496, + "step": 9578 + }, + { + "epoch": 0.74, + "grad_norm": 0.5961591159253581, + "learning_rate": 8.412404408256955e-06, + "loss": 2.0767, + "step": 9579 + }, + { + "epoch": 0.74, + "grad_norm": 0.5786137425216643, + "learning_rate": 8.407731311112324e-06, + "loss": 1.8441, + "step": 9580 + }, + { + "epoch": 0.74, + "grad_norm": 0.6185220628752255, + "learning_rate": 8.40305924989119e-06, + "loss": 1.9175, + "step": 9581 + }, + { + "epoch": 0.74, + "grad_norm": 0.5593484465853727, + "learning_rate": 8.398388224885251e-06, + "loss": 1.8667, + "step": 9582 + }, + { + "epoch": 0.74, + "grad_norm": 0.5969887176216477, + "learning_rate": 8.393718236386136e-06, + "loss": 2.0689, + "step": 9583 + }, + { + "epoch": 0.74, + "grad_norm": 0.5382555744946411, + "learning_rate": 8.38904928468541e-06, + "loss": 1.875, + "step": 9584 + }, + { + "epoch": 0.74, + "grad_norm": 0.607727273528077, + "learning_rate": 8.384381370074578e-06, + "loss": 1.9763, + "step": 9585 + }, + { + "epoch": 0.74, + "grad_norm": 0.6004819773619966, + "learning_rate": 8.379714492845076e-06, + "loss": 1.8455, + "step": 9586 + }, + { + "epoch": 0.74, + "grad_norm": 0.5606449985766947, + "learning_rate": 8.375048653288275e-06, + "loss": 2.0609, + "step": 9587 + }, + { + "epoch": 0.74, + "grad_norm": 0.5899888914027556, + "learning_rate": 8.370383851695485e-06, + "loss": 1.9002, + "step": 9588 + }, + { + "epoch": 0.74, + "grad_norm": 0.5586240303395801, + "learning_rate": 8.365720088357956e-06, + "loss": 1.8756, + "step": 9589 + }, + { + "epoch": 0.74, + "grad_norm": 0.5565193793639786, + "learning_rate": 8.361057363566837e-06, + "loss": 1.8502, + "step": 9590 + }, + { + "epoch": 0.74, + "grad_norm": 0.6007831541356444, + "learning_rate": 8.356395677613274e-06, + "loss": 2.0462, + "step": 9591 + }, + { + "epoch": 0.74, + "grad_norm": 0.531465543705732, + "learning_rate": 8.35173503078831e-06, + "loss": 1.9586, + "step": 9592 + }, + { + "epoch": 0.74, + "grad_norm": 0.5649096710271295, + "learning_rate": 8.347075423382902e-06, + "loss": 1.8504, + "step": 9593 + }, + { + "epoch": 0.74, + "grad_norm": 0.5834646403933313, + "learning_rate": 8.342416855688003e-06, + "loss": 1.8553, + "step": 9594 + }, + { + "epoch": 0.74, + "grad_norm": 0.560449618071684, + "learning_rate": 8.337759327994447e-06, + "loss": 2.0834, + "step": 9595 + }, + { + "epoch": 0.74, + "grad_norm": 0.570883296007682, + "learning_rate": 8.333102840593015e-06, + "loss": 1.8468, + "step": 9596 + }, + { + "epoch": 0.74, + "grad_norm": 0.6417575501851226, + "learning_rate": 8.328447393774458e-06, + "loss": 1.8318, + "step": 9597 + }, + { + "epoch": 0.74, + "grad_norm": 0.6370372213582371, + "learning_rate": 8.323792987829404e-06, + "loss": 1.9754, + "step": 9598 + }, + { + "epoch": 0.74, + "grad_norm": 0.5932703061201452, + "learning_rate": 8.319139623048469e-06, + "loss": 2.0881, + "step": 9599 + }, + { + "epoch": 0.74, + "grad_norm": 0.6287596935434779, + "learning_rate": 8.314487299722182e-06, + "loss": 1.8761, + "step": 9600 + }, + { + "epoch": 0.74, + "grad_norm": 0.5686017983670711, + "learning_rate": 8.309836018140982e-06, + "loss": 1.8558, + "step": 9601 + }, + { + "epoch": 0.74, + "grad_norm": 0.5786967322864862, + "learning_rate": 8.305185778595301e-06, + "loss": 1.873, + "step": 9602 + }, + { + "epoch": 0.74, + "grad_norm": 0.5813350082562291, + "learning_rate": 8.300536581375447e-06, + "loss": 2.0983, + "step": 9603 + }, + { + "epoch": 0.74, + "grad_norm": 0.5203254756064826, + "learning_rate": 8.295888426771693e-06, + "loss": 1.9227, + "step": 9604 + }, + { + "epoch": 0.74, + "grad_norm": 0.5848132054829336, + "learning_rate": 8.291241315074257e-06, + "loss": 1.8743, + "step": 9605 + }, + { + "epoch": 0.74, + "grad_norm": 0.5888031505444721, + "learning_rate": 8.286595246573262e-06, + "loss": 1.8805, + "step": 9606 + }, + { + "epoch": 0.74, + "grad_norm": 0.5994527790881932, + "learning_rate": 8.281950221558779e-06, + "loss": 2.0444, + "step": 9607 + }, + { + "epoch": 0.74, + "grad_norm": 0.5566278808405757, + "learning_rate": 8.277306240320838e-06, + "loss": 1.8583, + "step": 9608 + }, + { + "epoch": 0.74, + "grad_norm": 0.6227852225968724, + "learning_rate": 8.272663303149358e-06, + "loss": 1.8875, + "step": 9609 + }, + { + "epoch": 0.74, + "grad_norm": 0.5633665756810162, + "learning_rate": 8.268021410334225e-06, + "loss": 1.8765, + "step": 9610 + }, + { + "epoch": 0.74, + "grad_norm": 0.5652108792174515, + "learning_rate": 8.263380562165254e-06, + "loss": 2.0175, + "step": 9611 + }, + { + "epoch": 0.74, + "grad_norm": 0.5492196933046195, + "learning_rate": 8.258740758932182e-06, + "loss": 1.8596, + "step": 9612 + }, + { + "epoch": 0.74, + "grad_norm": 0.6070523775592923, + "learning_rate": 8.254102000924716e-06, + "loss": 1.9048, + "step": 9613 + }, + { + "epoch": 0.74, + "grad_norm": 0.5796414910253367, + "learning_rate": 8.249464288432446e-06, + "loss": 1.873, + "step": 9614 + }, + { + "epoch": 0.74, + "grad_norm": 0.570817504933111, + "learning_rate": 8.244827621744936e-06, + "loss": 2.0529, + "step": 9615 + }, + { + "epoch": 0.74, + "grad_norm": 0.5381811426702727, + "learning_rate": 8.240192001151668e-06, + "loss": 1.979, + "step": 9616 + }, + { + "epoch": 0.74, + "grad_norm": 0.595967925907073, + "learning_rate": 8.235557426942065e-06, + "loss": 1.831, + "step": 9617 + }, + { + "epoch": 0.74, + "grad_norm": 0.5511386037989031, + "learning_rate": 8.23092389940548e-06, + "loss": 1.8653, + "step": 9618 + }, + { + "epoch": 0.74, + "grad_norm": 0.5641650130504673, + "learning_rate": 8.226291418831206e-06, + "loss": 2.0384, + "step": 9619 + }, + { + "epoch": 0.74, + "grad_norm": 0.5852091556456945, + "learning_rate": 8.221659985508468e-06, + "loss": 1.8671, + "step": 9620 + }, + { + "epoch": 0.74, + "grad_norm": 0.5470918243119084, + "learning_rate": 8.217029599726422e-06, + "loss": 1.8894, + "step": 9621 + }, + { + "epoch": 0.74, + "grad_norm": 0.594198270327469, + "learning_rate": 8.212400261774166e-06, + "loss": 1.8649, + "step": 9622 + }, + { + "epoch": 0.74, + "grad_norm": 0.536672827691908, + "learning_rate": 8.207771971940729e-06, + "loss": 2.1247, + "step": 9623 + }, + { + "epoch": 0.74, + "grad_norm": 0.5904438585017769, + "learning_rate": 8.203144730515066e-06, + "loss": 1.947, + "step": 9624 + }, + { + "epoch": 0.74, + "grad_norm": 0.5980538145827599, + "learning_rate": 8.198518537786087e-06, + "loss": 1.8866, + "step": 9625 + }, + { + "epoch": 0.74, + "grad_norm": 0.5838714026412606, + "learning_rate": 8.193893394042614e-06, + "loss": 1.8535, + "step": 9626 + }, + { + "epoch": 0.74, + "grad_norm": 0.5414048183201188, + "learning_rate": 8.189269299573419e-06, + "loss": 2.0958, + "step": 9627 + }, + { + "epoch": 0.74, + "grad_norm": 0.6833080128211375, + "learning_rate": 8.18464625466721e-06, + "loss": 1.8503, + "step": 9628 + }, + { + "epoch": 0.74, + "grad_norm": 0.4973246209478797, + "learning_rate": 8.180024259612598e-06, + "loss": 1.9884, + "step": 9629 + }, + { + "epoch": 0.74, + "grad_norm": 0.5896880089665866, + "learning_rate": 8.175403314698178e-06, + "loss": 1.8781, + "step": 9630 + }, + { + "epoch": 0.74, + "grad_norm": 0.5791239060325994, + "learning_rate": 8.170783420212447e-06, + "loss": 2.0162, + "step": 9631 + }, + { + "epoch": 0.74, + "grad_norm": 0.5575897269130325, + "learning_rate": 8.16616457644384e-06, + "loss": 1.8666, + "step": 9632 + }, + { + "epoch": 0.74, + "grad_norm": 0.5749575684852171, + "learning_rate": 8.161546783680738e-06, + "loss": 1.8645, + "step": 9633 + }, + { + "epoch": 0.74, + "grad_norm": 0.5654675519974227, + "learning_rate": 8.156930042211442e-06, + "loss": 1.8981, + "step": 9634 + }, + { + "epoch": 0.74, + "grad_norm": 0.5492550071909558, + "learning_rate": 8.152314352324194e-06, + "loss": 1.9606, + "step": 9635 + }, + { + "epoch": 0.74, + "grad_norm": 0.5701287154088475, + "learning_rate": 8.147699714307186e-06, + "loss": 2.1076, + "step": 9636 + }, + { + "epoch": 0.74, + "grad_norm": 0.5438180106521757, + "learning_rate": 8.143086128448496e-06, + "loss": 1.8455, + "step": 9637 + }, + { + "epoch": 0.74, + "grad_norm": 0.5930548318201123, + "learning_rate": 8.1384735950362e-06, + "loss": 1.9113, + "step": 9638 + }, + { + "epoch": 0.74, + "grad_norm": 0.5586758359959807, + "learning_rate": 8.133862114358274e-06, + "loss": 2.0901, + "step": 9639 + }, + { + "epoch": 0.74, + "grad_norm": 0.5688910209225688, + "learning_rate": 8.129251686702605e-06, + "loss": 1.8825, + "step": 9640 + }, + { + "epoch": 0.74, + "grad_norm": 0.5335892056233568, + "learning_rate": 8.12464231235707e-06, + "loss": 1.9273, + "step": 9641 + }, + { + "epoch": 0.74, + "grad_norm": 0.571242473452962, + "learning_rate": 8.120033991609449e-06, + "loss": 1.873, + "step": 9642 + }, + { + "epoch": 0.74, + "grad_norm": 0.5449500026848987, + "learning_rate": 8.115426724747436e-06, + "loss": 2.0696, + "step": 9643 + }, + { + "epoch": 0.74, + "grad_norm": 0.6168661471637013, + "learning_rate": 8.110820512058708e-06, + "loss": 1.8567, + "step": 9644 + }, + { + "epoch": 0.74, + "grad_norm": 0.5982839104189641, + "learning_rate": 8.106215353830825e-06, + "loss": 1.8543, + "step": 9645 + }, + { + "epoch": 0.74, + "grad_norm": 0.5709563867642586, + "learning_rate": 8.101611250351327e-06, + "loss": 1.8973, + "step": 9646 + }, + { + "epoch": 0.74, + "grad_norm": 0.5805845267794946, + "learning_rate": 8.097008201907668e-06, + "loss": 2.0924, + "step": 9647 + }, + { + "epoch": 0.74, + "grad_norm": 0.6675423746985499, + "learning_rate": 8.092406208787206e-06, + "loss": 1.9715, + "step": 9648 + }, + { + "epoch": 0.74, + "grad_norm": 0.5567842990883047, + "learning_rate": 8.087805271277305e-06, + "loss": 1.8504, + "step": 9649 + }, + { + "epoch": 0.74, + "grad_norm": 0.5616935367192633, + "learning_rate": 8.083205389665186e-06, + "loss": 1.9171, + "step": 9650 + }, + { + "epoch": 0.74, + "grad_norm": 0.6542306493533755, + "learning_rate": 8.078606564238042e-06, + "loss": 1.9939, + "step": 9651 + }, + { + "epoch": 0.74, + "grad_norm": 0.5811367268022334, + "learning_rate": 8.07400879528302e-06, + "loss": 1.8694, + "step": 9652 + }, + { + "epoch": 0.74, + "grad_norm": 0.5702946889745036, + "learning_rate": 8.069412083087158e-06, + "loss": 1.8269, + "step": 9653 + }, + { + "epoch": 0.74, + "grad_norm": 0.5782426372043452, + "learning_rate": 8.064816427937442e-06, + "loss": 1.9597, + "step": 9654 + }, + { + "epoch": 0.74, + "grad_norm": 0.5688237485072568, + "learning_rate": 8.060221830120823e-06, + "loss": 2.0419, + "step": 9655 + }, + { + "epoch": 0.74, + "grad_norm": 0.601790059969058, + "learning_rate": 8.05562828992414e-06, + "loss": 1.8492, + "step": 9656 + }, + { + "epoch": 0.75, + "grad_norm": 0.6176574832801204, + "learning_rate": 8.051035807634191e-06, + "loss": 1.8154, + "step": 9657 + }, + { + "epoch": 0.75, + "grad_norm": 0.5636739377178016, + "learning_rate": 8.046444383537704e-06, + "loss": 1.8865, + "step": 9658 + }, + { + "epoch": 0.75, + "grad_norm": 0.6104109326153345, + "learning_rate": 8.041854017921341e-06, + "loss": 2.083, + "step": 9659 + }, + { + "epoch": 0.75, + "grad_norm": 0.5989550837830616, + "learning_rate": 8.0372647110717e-06, + "loss": 1.9429, + "step": 9660 + }, + { + "epoch": 0.75, + "grad_norm": 0.616621771430774, + "learning_rate": 8.032676463275304e-06, + "loss": 1.8406, + "step": 9661 + }, + { + "epoch": 0.75, + "grad_norm": 0.5978121767910628, + "learning_rate": 8.028089274818624e-06, + "loss": 1.8407, + "step": 9662 + }, + { + "epoch": 0.75, + "grad_norm": 0.5813324048841958, + "learning_rate": 8.023503145988048e-06, + "loss": 2.0978, + "step": 9663 + }, + { + "epoch": 0.75, + "grad_norm": 0.5794615904976046, + "learning_rate": 8.018918077069915e-06, + "loss": 1.8452, + "step": 9664 + }, + { + "epoch": 0.75, + "grad_norm": 0.5955023272413151, + "learning_rate": 8.014334068350487e-06, + "loss": 1.833, + "step": 9665 + }, + { + "epoch": 0.75, + "grad_norm": 0.5989621659558192, + "learning_rate": 8.00975112011596e-06, + "loss": 1.9333, + "step": 9666 + }, + { + "epoch": 0.75, + "grad_norm": 0.6141144551139448, + "learning_rate": 8.00516923265247e-06, + "loss": 1.8854, + "step": 9667 + }, + { + "epoch": 0.75, + "grad_norm": 0.6078873609887718, + "learning_rate": 8.000588406246081e-06, + "loss": 2.0883, + "step": 9668 + }, + { + "epoch": 0.75, + "grad_norm": 0.6111136433575022, + "learning_rate": 7.996008641182792e-06, + "loss": 1.8803, + "step": 9669 + }, + { + "epoch": 0.75, + "grad_norm": 0.5755441578275446, + "learning_rate": 7.99142993774854e-06, + "loss": 1.9041, + "step": 9670 + }, + { + "epoch": 0.75, + "grad_norm": 0.5787381342875569, + "learning_rate": 7.986852296229185e-06, + "loss": 2.0641, + "step": 9671 + }, + { + "epoch": 0.75, + "grad_norm": 0.5539860368666587, + "learning_rate": 7.98227571691054e-06, + "loss": 1.9362, + "step": 9672 + }, + { + "epoch": 0.75, + "grad_norm": 0.5889241734194053, + "learning_rate": 7.977700200078328e-06, + "loss": 1.8934, + "step": 9673 + }, + { + "epoch": 0.75, + "grad_norm": 0.5966601194218284, + "learning_rate": 7.973125746018222e-06, + "loss": 1.923, + "step": 9674 + }, + { + "epoch": 0.75, + "grad_norm": 0.6085391258984884, + "learning_rate": 7.968552355015832e-06, + "loss": 2.048, + "step": 9675 + }, + { + "epoch": 0.75, + "grad_norm": 0.6107953431120102, + "learning_rate": 7.963980027356671e-06, + "loss": 1.9115, + "step": 9676 + }, + { + "epoch": 0.75, + "grad_norm": 0.5722037609499528, + "learning_rate": 7.959408763326228e-06, + "loss": 1.825, + "step": 9677 + }, + { + "epoch": 0.75, + "grad_norm": 0.5698494172093421, + "learning_rate": 7.954838563209902e-06, + "loss": 1.855, + "step": 9678 + }, + { + "epoch": 0.75, + "grad_norm": 0.5817429264241074, + "learning_rate": 7.950269427293029e-06, + "loss": 1.9298, + "step": 9679 + }, + { + "epoch": 0.75, + "grad_norm": 0.6052188128890056, + "learning_rate": 7.945701355860876e-06, + "loss": 2.0255, + "step": 9680 + }, + { + "epoch": 0.75, + "grad_norm": 0.5814743663973506, + "learning_rate": 7.941134349198647e-06, + "loss": 1.8807, + "step": 9681 + }, + { + "epoch": 0.75, + "grad_norm": 0.5770736638403569, + "learning_rate": 7.936568407591482e-06, + "loss": 1.8497, + "step": 9682 + }, + { + "epoch": 0.75, + "grad_norm": 0.5300683017713927, + "learning_rate": 7.932003531324458e-06, + "loss": 2.0566, + "step": 9683 + }, + { + "epoch": 0.75, + "grad_norm": 0.7121966790739129, + "learning_rate": 7.927439720682555e-06, + "loss": 1.8543, + "step": 9684 + }, + { + "epoch": 0.75, + "grad_norm": 0.6295933228326592, + "learning_rate": 7.922876975950733e-06, + "loss": 1.9619, + "step": 9685 + }, + { + "epoch": 0.75, + "grad_norm": 0.5867667112963422, + "learning_rate": 7.918315297413864e-06, + "loss": 1.8397, + "step": 9686 + }, + { + "epoch": 0.75, + "grad_norm": 0.5956495850258585, + "learning_rate": 7.913754685356727e-06, + "loss": 1.8173, + "step": 9687 + }, + { + "epoch": 0.75, + "grad_norm": 0.6122745384863749, + "learning_rate": 7.909195140064088e-06, + "loss": 2.0553, + "step": 9688 + }, + { + "epoch": 0.75, + "grad_norm": 0.5679854273857808, + "learning_rate": 7.904636661820614e-06, + "loss": 1.8728, + "step": 9689 + }, + { + "epoch": 0.75, + "grad_norm": 0.7021718818558582, + "learning_rate": 7.900079250910886e-06, + "loss": 1.9154, + "step": 9690 + }, + { + "epoch": 0.75, + "grad_norm": 0.5972613644143542, + "learning_rate": 7.895522907619474e-06, + "loss": 1.8983, + "step": 9691 + }, + { + "epoch": 0.75, + "grad_norm": 0.6005479026744721, + "learning_rate": 7.890967632230827e-06, + "loss": 2.0285, + "step": 9692 + }, + { + "epoch": 0.75, + "grad_norm": 0.6709578902539571, + "learning_rate": 7.88641342502935e-06, + "loss": 1.8893, + "step": 9693 + }, + { + "epoch": 0.75, + "grad_norm": 0.6394719621218592, + "learning_rate": 7.8818602862994e-06, + "loss": 1.8235, + "step": 9694 + }, + { + "epoch": 0.75, + "grad_norm": 0.5776090049291386, + "learning_rate": 7.877308216325222e-06, + "loss": 2.064, + "step": 9695 + }, + { + "epoch": 0.75, + "grad_norm": 0.6110683813273184, + "learning_rate": 7.872757215391047e-06, + "loss": 1.8333, + "step": 9696 + }, + { + "epoch": 0.75, + "grad_norm": 0.6150996879712762, + "learning_rate": 7.868207283780992e-06, + "loss": 1.9594, + "step": 9697 + }, + { + "epoch": 0.75, + "grad_norm": 0.5979223829977564, + "learning_rate": 7.86365842177913e-06, + "loss": 1.9047, + "step": 9698 + }, + { + "epoch": 0.75, + "grad_norm": 0.6424338696092472, + "learning_rate": 7.859110629669487e-06, + "loss": 1.8873, + "step": 9699 + }, + { + "epoch": 0.75, + "grad_norm": 0.6229086406716716, + "learning_rate": 7.854563907735971e-06, + "loss": 2.0732, + "step": 9700 + }, + { + "epoch": 0.75, + "grad_norm": 0.5835406958390547, + "learning_rate": 7.850018256262461e-06, + "loss": 1.8864, + "step": 9701 + }, + { + "epoch": 0.75, + "grad_norm": 0.5451495543382027, + "learning_rate": 7.84547367553278e-06, + "loss": 1.8416, + "step": 9702 + }, + { + "epoch": 0.75, + "grad_norm": 0.5891973135618125, + "learning_rate": 7.840930165830644e-06, + "loss": 1.9485, + "step": 9703 + }, + { + "epoch": 0.75, + "grad_norm": 0.6133954450114006, + "learning_rate": 7.836387727439728e-06, + "loss": 2.0584, + "step": 9704 + }, + { + "epoch": 0.75, + "grad_norm": 0.5971132094306681, + "learning_rate": 7.831846360643635e-06, + "loss": 1.8663, + "step": 9705 + }, + { + "epoch": 0.75, + "grad_norm": 0.559862026843912, + "learning_rate": 7.827306065725903e-06, + "loss": 1.8273, + "step": 9706 + }, + { + "epoch": 0.75, + "grad_norm": 0.6073417034894042, + "learning_rate": 7.82276684297e-06, + "loss": 2.1045, + "step": 9707 + }, + { + "epoch": 0.75, + "grad_norm": 0.5859007018997301, + "learning_rate": 7.81822869265933e-06, + "loss": 1.873, + "step": 9708 + }, + { + "epoch": 0.75, + "grad_norm": 0.6463676277298301, + "learning_rate": 7.813691615077227e-06, + "loss": 1.8535, + "step": 9709 + }, + { + "epoch": 0.75, + "grad_norm": 0.5455862969665533, + "learning_rate": 7.80915561050696e-06, + "loss": 1.8422, + "step": 9710 + }, + { + "epoch": 0.75, + "grad_norm": 0.5449465566255458, + "learning_rate": 7.804620679231727e-06, + "loss": 1.8465, + "step": 9711 + }, + { + "epoch": 0.75, + "grad_norm": 0.6612260854950597, + "learning_rate": 7.800086821534669e-06, + "loss": 2.0853, + "step": 9712 + }, + { + "epoch": 0.75, + "grad_norm": 0.5928490079703577, + "learning_rate": 7.79555403769885e-06, + "loss": 1.8618, + "step": 9713 + }, + { + "epoch": 0.75, + "grad_norm": 0.6218533343621533, + "learning_rate": 7.791022328007267e-06, + "loss": 1.8742, + "step": 9714 + }, + { + "epoch": 0.75, + "grad_norm": 0.653408542850214, + "learning_rate": 7.786491692742856e-06, + "loss": 2.049, + "step": 9715 + }, + { + "epoch": 0.75, + "grad_norm": 0.5662238959546546, + "learning_rate": 7.781962132188486e-06, + "loss": 1.9141, + "step": 9716 + }, + { + "epoch": 0.75, + "grad_norm": 0.6121501601449149, + "learning_rate": 7.777433646626952e-06, + "loss": 1.8712, + "step": 9717 + }, + { + "epoch": 0.75, + "grad_norm": 0.6222948474183657, + "learning_rate": 7.772906236340987e-06, + "loss": 1.8277, + "step": 9718 + }, + { + "epoch": 0.75, + "grad_norm": 0.6139331526219693, + "learning_rate": 7.76837990161326e-06, + "loss": 1.8301, + "step": 9719 + }, + { + "epoch": 0.75, + "grad_norm": 0.6163817830151918, + "learning_rate": 7.763854642726362e-06, + "loss": 2.048, + "step": 9720 + }, + { + "epoch": 0.75, + "grad_norm": 0.6399370941854875, + "learning_rate": 7.759330459962827e-06, + "loss": 1.8919, + "step": 9721 + }, + { + "epoch": 0.75, + "grad_norm": 0.5121582182416242, + "learning_rate": 7.754807353605117e-06, + "loss": 1.9126, + "step": 9722 + }, + { + "epoch": 0.75, + "grad_norm": 0.545924164595504, + "learning_rate": 7.75028532393563e-06, + "loss": 1.8336, + "step": 9723 + }, + { + "epoch": 0.75, + "grad_norm": 0.6061016226510417, + "learning_rate": 7.745764371236693e-06, + "loss": 2.0902, + "step": 9724 + }, + { + "epoch": 0.75, + "grad_norm": 0.5433844277571601, + "learning_rate": 7.741244495790578e-06, + "loss": 1.8588, + "step": 9725 + }, + { + "epoch": 0.75, + "grad_norm": 0.6546356627910671, + "learning_rate": 7.736725697879454e-06, + "loss": 1.8949, + "step": 9726 + }, + { + "epoch": 0.75, + "grad_norm": 0.5495456697213985, + "learning_rate": 7.732207977785472e-06, + "loss": 2.0562, + "step": 9727 + }, + { + "epoch": 0.75, + "grad_norm": 0.5444023375842085, + "learning_rate": 7.727691335790682e-06, + "loss": 1.9151, + "step": 9728 + }, + { + "epoch": 0.75, + "grad_norm": 0.5837129439414848, + "learning_rate": 7.72317577217708e-06, + "loss": 1.8481, + "step": 9729 + }, + { + "epoch": 0.75, + "grad_norm": 0.5839048951679258, + "learning_rate": 7.718661287226595e-06, + "loss": 1.8934, + "step": 9730 + }, + { + "epoch": 0.75, + "grad_norm": 0.6370023380745496, + "learning_rate": 7.714147881221065e-06, + "loss": 1.8901, + "step": 9731 + }, + { + "epoch": 0.75, + "grad_norm": 0.5782269737825148, + "learning_rate": 7.709635554442304e-06, + "loss": 2.0652, + "step": 9732 + }, + { + "epoch": 0.75, + "grad_norm": 0.579088363364256, + "learning_rate": 7.705124307172032e-06, + "loss": 1.8941, + "step": 9733 + }, + { + "epoch": 0.75, + "grad_norm": 0.5307138206736076, + "learning_rate": 7.700614139691883e-06, + "loss": 1.9473, + "step": 9734 + }, + { + "epoch": 0.75, + "grad_norm": 0.5906331102066462, + "learning_rate": 7.69610505228347e-06, + "loss": 1.8922, + "step": 9735 + }, + { + "epoch": 0.75, + "grad_norm": 0.5787807830362273, + "learning_rate": 7.691597045228313e-06, + "loss": 2.0765, + "step": 9736 + }, + { + "epoch": 0.75, + "grad_norm": 0.572947877888531, + "learning_rate": 7.687090118807839e-06, + "loss": 1.8036, + "step": 9737 + }, + { + "epoch": 0.75, + "grad_norm": 0.5874298471430289, + "learning_rate": 7.682584273303469e-06, + "loss": 1.8566, + "step": 9738 + }, + { + "epoch": 0.75, + "grad_norm": 0.6240506201421289, + "learning_rate": 7.678079508996497e-06, + "loss": 2.1029, + "step": 9739 + }, + { + "epoch": 0.75, + "grad_norm": 0.6075277251535941, + "learning_rate": 7.673575826168175e-06, + "loss": 1.8466, + "step": 9740 + }, + { + "epoch": 0.75, + "grad_norm": 0.545044315614853, + "learning_rate": 7.669073225099707e-06, + "loss": 1.9593, + "step": 9741 + }, + { + "epoch": 0.75, + "grad_norm": 0.5998179639304626, + "learning_rate": 7.664571706072179e-06, + "loss": 1.8515, + "step": 9742 + }, + { + "epoch": 0.75, + "grad_norm": 0.5820251803700716, + "learning_rate": 7.660071269366669e-06, + "loss": 1.8923, + "step": 9743 + }, + { + "epoch": 0.75, + "grad_norm": 0.6166619720779593, + "learning_rate": 7.65557191526414e-06, + "loss": 2.0312, + "step": 9744 + }, + { + "epoch": 0.75, + "grad_norm": 0.6098554399864821, + "learning_rate": 7.651073644045498e-06, + "loss": 1.8669, + "step": 9745 + }, + { + "epoch": 0.75, + "grad_norm": 0.5854523204816042, + "learning_rate": 7.646576455991617e-06, + "loss": 1.8429, + "step": 9746 + }, + { + "epoch": 0.75, + "grad_norm": 0.5493494448527086, + "learning_rate": 7.642080351383249e-06, + "loss": 1.9263, + "step": 9747 + }, + { + "epoch": 0.75, + "grad_norm": 0.5638602079774317, + "learning_rate": 7.637585330501108e-06, + "loss": 2.0687, + "step": 9748 + }, + { + "epoch": 0.75, + "grad_norm": 0.5726613699603732, + "learning_rate": 7.633091393625852e-06, + "loss": 1.8277, + "step": 9749 + }, + { + "epoch": 0.75, + "grad_norm": 0.5881831992311246, + "learning_rate": 7.62859854103804e-06, + "loss": 1.8824, + "step": 9750 + }, + { + "epoch": 0.75, + "grad_norm": 0.5563221688713246, + "learning_rate": 7.624106773018183e-06, + "loss": 1.8518, + "step": 9751 + }, + { + "epoch": 0.75, + "grad_norm": 0.5550242258484831, + "learning_rate": 7.619616089846723e-06, + "loss": 2.0594, + "step": 9752 + }, + { + "epoch": 0.75, + "grad_norm": 0.5459083150519495, + "learning_rate": 7.615126491804034e-06, + "loss": 1.9124, + "step": 9753 + }, + { + "epoch": 0.75, + "grad_norm": 0.5804658829412, + "learning_rate": 7.610637979170412e-06, + "loss": 1.8372, + "step": 9754 + }, + { + "epoch": 0.75, + "grad_norm": 0.6008353982092726, + "learning_rate": 7.606150552226101e-06, + "loss": 1.865, + "step": 9755 + }, + { + "epoch": 0.75, + "grad_norm": 0.5501898660385509, + "learning_rate": 7.601664211251267e-06, + "loss": 2.0516, + "step": 9756 + }, + { + "epoch": 0.75, + "grad_norm": 0.5751352264385308, + "learning_rate": 7.59717895652601e-06, + "loss": 1.8731, + "step": 9757 + }, + { + "epoch": 0.75, + "grad_norm": 0.6027609687475649, + "learning_rate": 7.592694788330363e-06, + "loss": 1.9024, + "step": 9758 + }, + { + "epoch": 0.75, + "grad_norm": 0.5506082735635925, + "learning_rate": 7.588211706944293e-06, + "loss": 1.95, + "step": 9759 + }, + { + "epoch": 0.75, + "grad_norm": 0.5600949765927711, + "learning_rate": 7.5837297126476915e-06, + "loss": 2.0792, + "step": 9760 + }, + { + "epoch": 0.75, + "grad_norm": 0.6440422346153363, + "learning_rate": 7.579248805720396e-06, + "loss": 1.8502, + "step": 9761 + }, + { + "epoch": 0.75, + "grad_norm": 0.610829375382557, + "learning_rate": 7.574768986442163e-06, + "loss": 1.8533, + "step": 9762 + }, + { + "epoch": 0.75, + "grad_norm": 0.5897168343803262, + "learning_rate": 7.570290255092685e-06, + "loss": 1.8644, + "step": 9763 + }, + { + "epoch": 0.75, + "grad_norm": 0.5772752792068511, + "learning_rate": 7.56581261195159e-06, + "loss": 2.1087, + "step": 9764 + }, + { + "epoch": 0.75, + "grad_norm": 0.5719931050046899, + "learning_rate": 7.561336057298435e-06, + "loss": 1.946, + "step": 9765 + }, + { + "epoch": 0.75, + "grad_norm": 0.5709006687761398, + "learning_rate": 7.5568605914127095e-06, + "loss": 1.9239, + "step": 9766 + }, + { + "epoch": 0.75, + "grad_norm": 0.5688497969778306, + "learning_rate": 7.552386214573837e-06, + "loss": 1.8844, + "step": 9767 + }, + { + "epoch": 0.75, + "grad_norm": 0.5958779531833449, + "learning_rate": 7.547912927061168e-06, + "loss": 2.0398, + "step": 9768 + }, + { + "epoch": 0.75, + "grad_norm": 0.6459584054559888, + "learning_rate": 7.543440729153992e-06, + "loss": 1.9062, + "step": 9769 + }, + { + "epoch": 0.75, + "grad_norm": 0.6115568693029245, + "learning_rate": 7.5389696211315255e-06, + "loss": 1.9047, + "step": 9770 + }, + { + "epoch": 0.75, + "grad_norm": 0.6244667792077003, + "learning_rate": 7.534499603272912e-06, + "loss": 1.8798, + "step": 9771 + }, + { + "epoch": 0.75, + "grad_norm": 0.663040378961429, + "learning_rate": 7.530030675857253e-06, + "loss": 2.0619, + "step": 9772 + }, + { + "epoch": 0.75, + "grad_norm": 0.5577212686680397, + "learning_rate": 7.525562839163527e-06, + "loss": 1.8688, + "step": 9773 + }, + { + "epoch": 0.75, + "grad_norm": 0.5755058030475402, + "learning_rate": 7.521096093470709e-06, + "loss": 1.8774, + "step": 9774 + }, + { + "epoch": 0.75, + "grad_norm": 0.6109098626842512, + "learning_rate": 7.5166304390576645e-06, + "loss": 1.904, + "step": 9775 + }, + { + "epoch": 0.75, + "grad_norm": 0.5884879109454476, + "learning_rate": 7.5121658762032084e-06, + "loss": 2.073, + "step": 9776 + }, + { + "epoch": 0.75, + "grad_norm": 0.5743116729118976, + "learning_rate": 7.507702405186084e-06, + "loss": 1.9015, + "step": 9777 + }, + { + "epoch": 0.75, + "grad_norm": 0.5630612218224895, + "learning_rate": 7.503240026284947e-06, + "loss": 1.9045, + "step": 9778 + }, + { + "epoch": 0.75, + "grad_norm": 0.6215457547191732, + "learning_rate": 7.498778739778417e-06, + "loss": 1.8948, + "step": 9779 + }, + { + "epoch": 0.75, + "grad_norm": 0.5752771312826238, + "learning_rate": 7.494318545945037e-06, + "loss": 2.0286, + "step": 9780 + }, + { + "epoch": 0.75, + "grad_norm": 0.597033133231822, + "learning_rate": 7.489859445063249e-06, + "loss": 1.8679, + "step": 9781 + }, + { + "epoch": 0.75, + "grad_norm": 0.5991020672115519, + "learning_rate": 7.48540143741148e-06, + "loss": 1.845, + "step": 9782 + }, + { + "epoch": 0.75, + "grad_norm": 0.6248990381958771, + "learning_rate": 7.480944523268055e-06, + "loss": 1.8684, + "step": 9783 + }, + { + "epoch": 0.75, + "grad_norm": 0.5210730829590403, + "learning_rate": 7.476488702911219e-06, + "loss": 2.1165, + "step": 9784 + }, + { + "epoch": 0.75, + "grad_norm": 0.6124024595590598, + "learning_rate": 7.472033976619197e-06, + "loss": 1.8835, + "step": 9785 + }, + { + "epoch": 0.75, + "grad_norm": 0.5606094281203028, + "learning_rate": 7.467580344670095e-06, + "loss": 1.8673, + "step": 9786 + }, + { + "epoch": 0.76, + "grad_norm": 0.5868345203915163, + "learning_rate": 7.463127807341966e-06, + "loss": 1.8819, + "step": 9787 + }, + { + "epoch": 0.76, + "grad_norm": 0.5882130574798546, + "learning_rate": 7.458676364912828e-06, + "loss": 2.0731, + "step": 9788 + }, + { + "epoch": 0.76, + "grad_norm": 0.6480282677175998, + "learning_rate": 7.4542260176605785e-06, + "loss": 1.8775, + "step": 9789 + }, + { + "epoch": 0.76, + "grad_norm": 0.5934217682097165, + "learning_rate": 7.449776765863079e-06, + "loss": 1.9618, + "step": 9790 + }, + { + "epoch": 0.76, + "grad_norm": 0.6015078183139233, + "learning_rate": 7.445328609798113e-06, + "loss": 1.9043, + "step": 9791 + }, + { + "epoch": 0.76, + "grad_norm": 0.5997422970979445, + "learning_rate": 7.440881549743392e-06, + "loss": 2.0682, + "step": 9792 + }, + { + "epoch": 0.76, + "grad_norm": 0.556133555298735, + "learning_rate": 7.436435585976587e-06, + "loss": 1.8984, + "step": 9793 + }, + { + "epoch": 0.76, + "grad_norm": 0.7484329735057513, + "learning_rate": 7.43199071877525e-06, + "loss": 1.8942, + "step": 9794 + }, + { + "epoch": 0.76, + "grad_norm": 0.5860758986796064, + "learning_rate": 7.427546948416897e-06, + "loss": 1.9382, + "step": 9795 + }, + { + "epoch": 0.76, + "grad_norm": 0.630075014559968, + "learning_rate": 7.423104275178994e-06, + "loss": 2.1123, + "step": 9796 + }, + { + "epoch": 0.76, + "grad_norm": 0.6341688469944133, + "learning_rate": 7.41866269933889e-06, + "loss": 1.8937, + "step": 9797 + }, + { + "epoch": 0.76, + "grad_norm": 0.5923632793244357, + "learning_rate": 7.414222221173903e-06, + "loss": 1.9039, + "step": 9798 + }, + { + "epoch": 0.76, + "grad_norm": 0.5676605150522307, + "learning_rate": 7.409782840961263e-06, + "loss": 1.8915, + "step": 9799 + }, + { + "epoch": 0.76, + "grad_norm": 0.6297880850959686, + "learning_rate": 7.4053445589781445e-06, + "loss": 2.0539, + "step": 9800 + }, + { + "epoch": 0.76, + "grad_norm": 0.6033852085852077, + "learning_rate": 7.400907375501645e-06, + "loss": 1.8696, + "step": 9801 + }, + { + "epoch": 0.76, + "grad_norm": 0.56362465181915, + "learning_rate": 7.396471290808799e-06, + "loss": 1.8617, + "step": 9802 + }, + { + "epoch": 0.76, + "grad_norm": 0.5312819220950944, + "learning_rate": 7.392036305176564e-06, + "loss": 1.8899, + "step": 9803 + }, + { + "epoch": 0.76, + "grad_norm": 0.5669515056799804, + "learning_rate": 7.3876024188818395e-06, + "loss": 2.0494, + "step": 9804 + }, + { + "epoch": 0.76, + "grad_norm": 0.6067697446509266, + "learning_rate": 7.3831696322014505e-06, + "loss": 1.8969, + "step": 9805 + }, + { + "epoch": 0.76, + "grad_norm": 0.5524362610887664, + "learning_rate": 7.378737945412151e-06, + "loss": 1.8958, + "step": 9806 + }, + { + "epoch": 0.76, + "grad_norm": 0.6457859036960178, + "learning_rate": 7.374307358790633e-06, + "loss": 1.8729, + "step": 9807 + }, + { + "epoch": 0.76, + "grad_norm": 0.5969772096202524, + "learning_rate": 7.369877872613515e-06, + "loss": 2.0848, + "step": 9808 + }, + { + "epoch": 0.76, + "grad_norm": 0.5109428898763667, + "learning_rate": 7.365449487157347e-06, + "loss": 1.9325, + "step": 9809 + }, + { + "epoch": 0.76, + "grad_norm": 0.6627969106344849, + "learning_rate": 7.361022202698612e-06, + "loss": 1.8732, + "step": 9810 + }, + { + "epoch": 0.76, + "grad_norm": 0.606796263046673, + "learning_rate": 7.356596019513723e-06, + "loss": 1.8129, + "step": 9811 + }, + { + "epoch": 0.76, + "grad_norm": 0.6056828403086492, + "learning_rate": 7.3521709378790275e-06, + "loss": 2.0809, + "step": 9812 + }, + { + "epoch": 0.76, + "grad_norm": 0.634480860933343, + "learning_rate": 7.347746958070798e-06, + "loss": 1.8592, + "step": 9813 + }, + { + "epoch": 0.76, + "grad_norm": 0.6552062772959275, + "learning_rate": 7.343324080365244e-06, + "loss": 1.8457, + "step": 9814 + }, + { + "epoch": 0.76, + "grad_norm": 0.5735452854076498, + "learning_rate": 7.338902305038503e-06, + "loss": 1.9174, + "step": 9815 + }, + { + "epoch": 0.76, + "grad_norm": 0.5840527400736987, + "learning_rate": 7.334481632366643e-06, + "loss": 2.0661, + "step": 9816 + }, + { + "epoch": 0.76, + "grad_norm": 0.552672251090426, + "learning_rate": 7.330062062625667e-06, + "loss": 1.8793, + "step": 9817 + }, + { + "epoch": 0.76, + "grad_norm": 0.5743133305934345, + "learning_rate": 7.325643596091508e-06, + "loss": 1.8126, + "step": 9818 + }, + { + "epoch": 0.76, + "grad_norm": 0.6042115091194048, + "learning_rate": 7.321226233040035e-06, + "loss": 1.8613, + "step": 9819 + }, + { + "epoch": 0.76, + "grad_norm": 0.5905324398901795, + "learning_rate": 7.316809973747016e-06, + "loss": 2.0659, + "step": 9820 + }, + { + "epoch": 0.76, + "grad_norm": 0.6234941129279045, + "learning_rate": 7.312394818488205e-06, + "loss": 1.9728, + "step": 9821 + }, + { + "epoch": 0.76, + "grad_norm": 0.5773807809380678, + "learning_rate": 7.307980767539257e-06, + "loss": 1.8797, + "step": 9822 + }, + { + "epoch": 0.76, + "grad_norm": 0.5713096930685233, + "learning_rate": 7.303567821175733e-06, + "loss": 1.8893, + "step": 9823 + }, + { + "epoch": 0.76, + "grad_norm": 0.6245686703163551, + "learning_rate": 7.299155979673184e-06, + "loss": 2.0715, + "step": 9824 + }, + { + "epoch": 0.76, + "grad_norm": 0.5987778343956072, + "learning_rate": 7.294745243307027e-06, + "loss": 1.8755, + "step": 9825 + }, + { + "epoch": 0.76, + "grad_norm": 0.5838754051737682, + "learning_rate": 7.290335612352669e-06, + "loss": 1.8576, + "step": 9826 + }, + { + "epoch": 0.76, + "grad_norm": 0.6420828747248806, + "learning_rate": 7.285927087085423e-06, + "loss": 1.8691, + "step": 9827 + }, + { + "epoch": 0.76, + "grad_norm": 0.6045700993834725, + "learning_rate": 7.281519667780501e-06, + "loss": 2.0805, + "step": 9828 + }, + { + "epoch": 0.76, + "grad_norm": 0.589475220001574, + "learning_rate": 7.277113354713103e-06, + "loss": 1.825, + "step": 9829 + }, + { + "epoch": 0.76, + "grad_norm": 0.5477253206238963, + "learning_rate": 7.2727081481583335e-06, + "loss": 1.8102, + "step": 9830 + }, + { + "epoch": 0.76, + "grad_norm": 0.6099778415126593, + "learning_rate": 7.268304048391205e-06, + "loss": 1.8386, + "step": 9831 + }, + { + "epoch": 0.76, + "grad_norm": 0.6169853212817328, + "learning_rate": 7.263901055686714e-06, + "loss": 2.0795, + "step": 9832 + }, + { + "epoch": 0.76, + "grad_norm": 0.5508458606721717, + "learning_rate": 7.259499170319734e-06, + "loss": 1.894, + "step": 9833 + }, + { + "epoch": 0.76, + "grad_norm": 0.5349659496897745, + "learning_rate": 7.255098392565093e-06, + "loss": 1.9179, + "step": 9834 + }, + { + "epoch": 0.76, + "grad_norm": 0.5857638243257839, + "learning_rate": 7.2506987226975716e-06, + "loss": 1.8731, + "step": 9835 + }, + { + "epoch": 0.76, + "grad_norm": 0.5710302620260148, + "learning_rate": 7.246300160991839e-06, + "loss": 2.0431, + "step": 9836 + }, + { + "epoch": 0.76, + "grad_norm": 0.5738535332957648, + "learning_rate": 7.241902707722511e-06, + "loss": 1.8598, + "step": 9837 + }, + { + "epoch": 0.76, + "grad_norm": 0.5996640473099134, + "learning_rate": 7.237506363164168e-06, + "loss": 1.882, + "step": 9838 + }, + { + "epoch": 0.76, + "grad_norm": 0.6237261172152307, + "learning_rate": 7.233111127591255e-06, + "loss": 1.859, + "step": 9839 + }, + { + "epoch": 0.76, + "grad_norm": 0.5218589257120632, + "learning_rate": 7.228717001278221e-06, + "loss": 2.0833, + "step": 9840 + }, + { + "epoch": 0.76, + "grad_norm": 0.6014920159700984, + "learning_rate": 7.224323984499379e-06, + "loss": 1.8829, + "step": 9841 + }, + { + "epoch": 0.76, + "grad_norm": 0.581148539569195, + "learning_rate": 7.21993207752901e-06, + "loss": 1.894, + "step": 9842 + }, + { + "epoch": 0.76, + "grad_norm": 0.5489330794673478, + "learning_rate": 7.215541280641338e-06, + "loss": 1.8547, + "step": 9843 + }, + { + "epoch": 0.76, + "grad_norm": 0.6121026608985648, + "learning_rate": 7.211151594110477e-06, + "loss": 2.0526, + "step": 9844 + }, + { + "epoch": 0.76, + "grad_norm": 0.5817484792295209, + "learning_rate": 7.206763018210499e-06, + "loss": 1.8829, + "step": 9845 + }, + { + "epoch": 0.76, + "grad_norm": 0.5624682427572908, + "learning_rate": 7.202375553215404e-06, + "loss": 1.9599, + "step": 9846 + }, + { + "epoch": 0.76, + "grad_norm": 0.5884247706087061, + "learning_rate": 7.197989199399116e-06, + "loss": 1.8925, + "step": 9847 + }, + { + "epoch": 0.76, + "grad_norm": 0.6499740023749654, + "learning_rate": 7.1936039570354965e-06, + "loss": 2.0711, + "step": 9848 + }, + { + "epoch": 0.76, + "grad_norm": 0.5615400199454946, + "learning_rate": 7.189219826398333e-06, + "loss": 1.8745, + "step": 9849 + }, + { + "epoch": 0.76, + "grad_norm": 0.590790370419666, + "learning_rate": 7.184836807761342e-06, + "loss": 1.8706, + "step": 9850 + }, + { + "epoch": 0.76, + "grad_norm": 0.6201380499080543, + "learning_rate": 7.180454901398176e-06, + "loss": 1.8563, + "step": 9851 + }, + { + "epoch": 0.76, + "grad_norm": 0.5389661428004826, + "learning_rate": 7.176074107582417e-06, + "loss": 1.8901, + "step": 9852 + }, + { + "epoch": 0.76, + "grad_norm": 0.5718947937785859, + "learning_rate": 7.171694426587572e-06, + "loss": 2.065, + "step": 9853 + }, + { + "epoch": 0.76, + "grad_norm": 0.622479199982762, + "learning_rate": 7.167315858687085e-06, + "loss": 1.918, + "step": 9854 + }, + { + "epoch": 0.76, + "grad_norm": 0.6052910816377971, + "learning_rate": 7.1629384041543265e-06, + "loss": 1.8096, + "step": 9855 + }, + { + "epoch": 0.76, + "grad_norm": 0.6701438330129966, + "learning_rate": 7.158562063262603e-06, + "loss": 2.1117, + "step": 9856 + }, + { + "epoch": 0.76, + "grad_norm": 0.6168430298355714, + "learning_rate": 7.154186836285143e-06, + "loss": 1.8551, + "step": 9857 + }, + { + "epoch": 0.76, + "grad_norm": 0.5941065879165437, + "learning_rate": 7.149812723495109e-06, + "loss": 1.8708, + "step": 9858 + }, + { + "epoch": 0.76, + "grad_norm": 0.5585317496293507, + "learning_rate": 7.145439725165598e-06, + "loss": 1.9305, + "step": 9859 + }, + { + "epoch": 0.76, + "grad_norm": 0.647558653147079, + "learning_rate": 7.1410678415696346e-06, + "loss": 2.0959, + "step": 9860 + }, + { + "epoch": 0.76, + "grad_norm": 0.5756354007410476, + "learning_rate": 7.136697072980172e-06, + "loss": 1.843, + "step": 9861 + }, + { + "epoch": 0.76, + "grad_norm": 0.6188633118354876, + "learning_rate": 7.132327419670095e-06, + "loss": 1.8542, + "step": 9862 + }, + { + "epoch": 0.76, + "grad_norm": 0.602723005834421, + "learning_rate": 7.127958881912219e-06, + "loss": 1.8844, + "step": 9863 + }, + { + "epoch": 0.76, + "grad_norm": 0.6575052113461541, + "learning_rate": 7.12359145997929e-06, + "loss": 2.0612, + "step": 9864 + }, + { + "epoch": 0.76, + "grad_norm": 0.5371976658340089, + "learning_rate": 7.119225154143985e-06, + "loss": 1.9597, + "step": 9865 + }, + { + "epoch": 0.76, + "grad_norm": 0.6238552885204242, + "learning_rate": 7.114859964678918e-06, + "loss": 1.8646, + "step": 9866 + }, + { + "epoch": 0.76, + "grad_norm": 0.6448140663064231, + "learning_rate": 7.1104958918566e-06, + "loss": 1.859, + "step": 9867 + }, + { + "epoch": 0.76, + "grad_norm": 0.6313224309788756, + "learning_rate": 7.106132935949525e-06, + "loss": 2.0695, + "step": 9868 + }, + { + "epoch": 0.76, + "grad_norm": 0.5780677362809897, + "learning_rate": 7.10177109723009e-06, + "loss": 1.8542, + "step": 9869 + }, + { + "epoch": 0.76, + "grad_norm": 0.6567428839385522, + "learning_rate": 7.097410375970595e-06, + "loss": 1.8648, + "step": 9870 + }, + { + "epoch": 0.76, + "grad_norm": 0.5548950452897783, + "learning_rate": 7.093050772443324e-06, + "loss": 1.908, + "step": 9871 + }, + { + "epoch": 0.76, + "grad_norm": 0.5624506428149934, + "learning_rate": 7.088692286920456e-06, + "loss": 2.0273, + "step": 9872 + }, + { + "epoch": 0.76, + "grad_norm": 0.5788219561782851, + "learning_rate": 7.084334919674112e-06, + "loss": 1.9013, + "step": 9873 + }, + { + "epoch": 0.76, + "grad_norm": 0.6075821232200778, + "learning_rate": 7.079978670976345e-06, + "loss": 1.8772, + "step": 9874 + }, + { + "epoch": 0.76, + "grad_norm": 0.5760104755851339, + "learning_rate": 7.075623541099111e-06, + "loss": 1.8438, + "step": 9875 + }, + { + "epoch": 0.76, + "grad_norm": 0.5901380409973135, + "learning_rate": 7.071269530314342e-06, + "loss": 2.0426, + "step": 9876 + }, + { + "epoch": 0.76, + "grad_norm": 0.5599765420974949, + "learning_rate": 7.066916638893878e-06, + "loss": 1.9542, + "step": 9877 + }, + { + "epoch": 0.76, + "grad_norm": 0.5559067159582881, + "learning_rate": 7.062564867109464e-06, + "loss": 1.8511, + "step": 9878 + }, + { + "epoch": 0.76, + "grad_norm": 0.6438417692350227, + "learning_rate": 7.058214215232828e-06, + "loss": 1.8989, + "step": 9879 + }, + { + "epoch": 0.76, + "grad_norm": 0.6810140748723532, + "learning_rate": 7.05386468353558e-06, + "loss": 2.0964, + "step": 9880 + }, + { + "epoch": 0.76, + "grad_norm": 0.6039493022166044, + "learning_rate": 7.049516272289275e-06, + "loss": 1.8921, + "step": 9881 + }, + { + "epoch": 0.76, + "grad_norm": 0.6586785237096983, + "learning_rate": 7.045168981765426e-06, + "loss": 1.8629, + "step": 9882 + }, + { + "epoch": 0.76, + "grad_norm": 0.7018095238463288, + "learning_rate": 7.040822812235434e-06, + "loss": 1.9476, + "step": 9883 + }, + { + "epoch": 0.76, + "grad_norm": 0.6113648033187449, + "learning_rate": 7.036477763970642e-06, + "loss": 1.8693, + "step": 9884 + }, + { + "epoch": 0.76, + "grad_norm": 0.5861326403573981, + "learning_rate": 7.032133837242352e-06, + "loss": 2.0457, + "step": 9885 + }, + { + "epoch": 0.76, + "grad_norm": 0.660770291217403, + "learning_rate": 7.027791032321757e-06, + "loss": 1.8811, + "step": 9886 + }, + { + "epoch": 0.76, + "grad_norm": 0.6476727127953458, + "learning_rate": 7.023449349479996e-06, + "loss": 1.8569, + "step": 9887 + }, + { + "epoch": 0.76, + "grad_norm": 0.5885927269379675, + "learning_rate": 7.019108788988143e-06, + "loss": 2.0204, + "step": 9888 + }, + { + "epoch": 0.76, + "grad_norm": 0.6160449409279809, + "learning_rate": 7.01476935111719e-06, + "loss": 1.8407, + "step": 9889 + }, + { + "epoch": 0.76, + "grad_norm": 0.6237541720839994, + "learning_rate": 7.010431036138085e-06, + "loss": 1.9501, + "step": 9890 + }, + { + "epoch": 0.76, + "grad_norm": 0.6145499552931796, + "learning_rate": 7.006093844321665e-06, + "loss": 1.8518, + "step": 9891 + }, + { + "epoch": 0.76, + "grad_norm": 0.6294071976101056, + "learning_rate": 7.001757775938728e-06, + "loss": 2.0135, + "step": 9892 + }, + { + "epoch": 0.76, + "grad_norm": 0.6610598908331337, + "learning_rate": 6.997422831259992e-06, + "loss": 1.9065, + "step": 9893 + }, + { + "epoch": 0.76, + "grad_norm": 0.5609240293097617, + "learning_rate": 6.993089010556103e-06, + "loss": 1.8445, + "step": 9894 + }, + { + "epoch": 0.76, + "grad_norm": 0.6073650543201157, + "learning_rate": 6.988756314097644e-06, + "loss": 1.8489, + "step": 9895 + }, + { + "epoch": 0.76, + "grad_norm": 0.6492080809728918, + "learning_rate": 6.984424742155119e-06, + "loss": 1.9524, + "step": 9896 + }, + { + "epoch": 0.76, + "grad_norm": 0.6164924151985212, + "learning_rate": 6.980094294998968e-06, + "loss": 2.0541, + "step": 9897 + }, + { + "epoch": 0.76, + "grad_norm": 0.5857045627558285, + "learning_rate": 6.975764972899556e-06, + "loss": 1.8433, + "step": 9898 + }, + { + "epoch": 0.76, + "grad_norm": 0.5855218376581135, + "learning_rate": 6.971436776127182e-06, + "loss": 1.8002, + "step": 9899 + }, + { + "epoch": 0.76, + "grad_norm": 0.6309242972311062, + "learning_rate": 6.967109704952077e-06, + "loss": 2.0417, + "step": 9900 + }, + { + "epoch": 0.76, + "grad_norm": 0.584947267469249, + "learning_rate": 6.962783759644393e-06, + "loss": 1.8094, + "step": 9901 + }, + { + "epoch": 0.76, + "grad_norm": 0.5961253138212652, + "learning_rate": 6.9584589404742166e-06, + "loss": 1.9693, + "step": 9902 + }, + { + "epoch": 0.76, + "grad_norm": 0.568661916869947, + "learning_rate": 6.954135247711563e-06, + "loss": 1.8323, + "step": 9903 + }, + { + "epoch": 0.76, + "grad_norm": 0.5825947746596803, + "learning_rate": 6.949812681626383e-06, + "loss": 1.8477, + "step": 9904 + }, + { + "epoch": 0.76, + "grad_norm": 0.5475381638221739, + "learning_rate": 6.9454912424885475e-06, + "loss": 2.0453, + "step": 9905 + }, + { + "epoch": 0.76, + "grad_norm": 0.5660396805192651, + "learning_rate": 6.941170930567864e-06, + "loss": 1.8565, + "step": 9906 + }, + { + "epoch": 0.76, + "grad_norm": 0.6140274907693337, + "learning_rate": 6.936851746134068e-06, + "loss": 1.8912, + "step": 9907 + }, + { + "epoch": 0.76, + "grad_norm": 0.5529963599960196, + "learning_rate": 6.932533689456822e-06, + "loss": 1.9491, + "step": 9908 + }, + { + "epoch": 0.76, + "grad_norm": 0.6078343002861833, + "learning_rate": 6.92821676080572e-06, + "loss": 2.0841, + "step": 9909 + }, + { + "epoch": 0.76, + "grad_norm": 0.5747974280690421, + "learning_rate": 6.923900960450289e-06, + "loss": 1.9019, + "step": 9910 + }, + { + "epoch": 0.76, + "grad_norm": 0.5718888992215727, + "learning_rate": 6.919586288659977e-06, + "loss": 1.8654, + "step": 9911 + }, + { + "epoch": 0.76, + "grad_norm": 0.5884662966864908, + "learning_rate": 6.915272745704168e-06, + "loss": 2.0349, + "step": 9912 + }, + { + "epoch": 0.76, + "grad_norm": 0.5813157646006861, + "learning_rate": 6.910960331852187e-06, + "loss": 1.8775, + "step": 9913 + }, + { + "epoch": 0.76, + "grad_norm": 0.549247814829925, + "learning_rate": 6.906649047373246e-06, + "loss": 1.9923, + "step": 9914 + }, + { + "epoch": 0.76, + "grad_norm": 0.5920064263214806, + "learning_rate": 6.902338892536542e-06, + "loss": 1.8701, + "step": 9915 + }, + { + "epoch": 0.77, + "grad_norm": 0.5802551890479, + "learning_rate": 6.898029867611177e-06, + "loss": 1.8897, + "step": 9916 + }, + { + "epoch": 0.77, + "grad_norm": 0.5719162576192439, + "learning_rate": 6.893721972866154e-06, + "loss": 2.0842, + "step": 9917 + }, + { + "epoch": 0.77, + "grad_norm": 0.5752422800025054, + "learning_rate": 6.889415208570463e-06, + "loss": 1.8801, + "step": 9918 + }, + { + "epoch": 0.77, + "grad_norm": 0.5762258227887975, + "learning_rate": 6.885109574992984e-06, + "loss": 1.8074, + "step": 9919 + }, + { + "epoch": 0.77, + "grad_norm": 0.5635999605545179, + "learning_rate": 6.8808050724025185e-06, + "loss": 2.0545, + "step": 9920 + }, + { + "epoch": 0.77, + "grad_norm": 0.6084867791842884, + "learning_rate": 6.876501701067844e-06, + "loss": 1.9374, + "step": 9921 + }, + { + "epoch": 0.77, + "grad_norm": 0.571157135239423, + "learning_rate": 6.872199461257606e-06, + "loss": 1.8836, + "step": 9922 + }, + { + "epoch": 0.77, + "grad_norm": 0.5904110741756465, + "learning_rate": 6.867898353240435e-06, + "loss": 1.8385, + "step": 9923 + }, + { + "epoch": 0.77, + "grad_norm": 0.5582102773559309, + "learning_rate": 6.863598377284863e-06, + "loss": 2.0694, + "step": 9924 + }, + { + "epoch": 0.77, + "grad_norm": 0.5941987551340875, + "learning_rate": 6.859299533659336e-06, + "loss": 1.8539, + "step": 9925 + }, + { + "epoch": 0.77, + "grad_norm": 0.5910099621134584, + "learning_rate": 6.855001822632278e-06, + "loss": 1.8286, + "step": 9926 + }, + { + "epoch": 0.77, + "grad_norm": 0.6149555828179953, + "learning_rate": 6.850705244471992e-06, + "loss": 1.9277, + "step": 9927 + }, + { + "epoch": 0.77, + "grad_norm": 0.6028207502760927, + "learning_rate": 6.846409799446729e-06, + "loss": 1.8903, + "step": 9928 + }, + { + "epoch": 0.77, + "grad_norm": 0.5428019352296549, + "learning_rate": 6.842115487824693e-06, + "loss": 2.0702, + "step": 9929 + }, + { + "epoch": 0.77, + "grad_norm": 0.6691235264557737, + "learning_rate": 6.837822309873976e-06, + "loss": 1.8943, + "step": 9930 + }, + { + "epoch": 0.77, + "grad_norm": 0.6329276741508363, + "learning_rate": 6.833530265862614e-06, + "loss": 1.8274, + "step": 9931 + }, + { + "epoch": 0.77, + "grad_norm": 0.6185793765716361, + "learning_rate": 6.8292393560586035e-06, + "loss": 2.0335, + "step": 9932 + }, + { + "epoch": 0.77, + "grad_norm": 0.5752119925983595, + "learning_rate": 6.82494958072982e-06, + "loss": 1.9542, + "step": 9933 + }, + { + "epoch": 0.77, + "grad_norm": 0.579375854234088, + "learning_rate": 6.8206609401441e-06, + "loss": 1.899, + "step": 9934 + }, + { + "epoch": 0.77, + "grad_norm": 0.5940027930950578, + "learning_rate": 6.816373434569198e-06, + "loss": 1.8573, + "step": 9935 + }, + { + "epoch": 0.77, + "grad_norm": 0.6862348211290475, + "learning_rate": 6.8120870642727955e-06, + "loss": 1.8643, + "step": 9936 + }, + { + "epoch": 0.77, + "grad_norm": 0.6395998523386474, + "learning_rate": 6.807801829522534e-06, + "loss": 2.0391, + "step": 9937 + }, + { + "epoch": 0.77, + "grad_norm": 0.5597532720150011, + "learning_rate": 6.803517730585929e-06, + "loss": 1.8869, + "step": 9938 + }, + { + "epoch": 0.77, + "grad_norm": 0.5854461778465972, + "learning_rate": 6.799234767730464e-06, + "loss": 2.0131, + "step": 9939 + }, + { + "epoch": 0.77, + "grad_norm": 0.6222303392716293, + "learning_rate": 6.794952941223545e-06, + "loss": 1.8833, + "step": 9940 + }, + { + "epoch": 0.77, + "grad_norm": 0.611920714170305, + "learning_rate": 6.790672251332503e-06, + "loss": 2.0961, + "step": 9941 + }, + { + "epoch": 0.77, + "grad_norm": 0.6002530474081406, + "learning_rate": 6.786392698324595e-06, + "loss": 1.8336, + "step": 9942 + }, + { + "epoch": 0.77, + "grad_norm": 0.6044973178823012, + "learning_rate": 6.782114282467017e-06, + "loss": 1.9133, + "step": 9943 + }, + { + "epoch": 0.77, + "grad_norm": 0.612883647746902, + "learning_rate": 6.777837004026885e-06, + "loss": 2.0679, + "step": 9944 + }, + { + "epoch": 0.77, + "grad_norm": 0.5166189446993887, + "learning_rate": 6.773560863271247e-06, + "loss": 1.9493, + "step": 9945 + }, + { + "epoch": 0.77, + "grad_norm": 0.6223243861212813, + "learning_rate": 6.769285860467078e-06, + "loss": 1.8781, + "step": 9946 + }, + { + "epoch": 0.77, + "grad_norm": 0.6182267530224876, + "learning_rate": 6.76501199588129e-06, + "loss": 1.8873, + "step": 9947 + }, + { + "epoch": 0.77, + "grad_norm": 0.5544029840968173, + "learning_rate": 6.760739269780714e-06, + "loss": 1.8818, + "step": 9948 + }, + { + "epoch": 0.77, + "grad_norm": 0.5743640282013892, + "learning_rate": 6.75646768243211e-06, + "loss": 2.0683, + "step": 9949 + }, + { + "epoch": 0.77, + "grad_norm": 0.6017063720815752, + "learning_rate": 6.752197234102179e-06, + "loss": 1.856, + "step": 9950 + }, + { + "epoch": 0.77, + "grad_norm": 0.5936190725243424, + "learning_rate": 6.747927925057537e-06, + "loss": 1.82, + "step": 9951 + }, + { + "epoch": 0.77, + "grad_norm": 0.587064800910742, + "learning_rate": 6.743659755564738e-06, + "loss": 1.9624, + "step": 9952 + }, + { + "epoch": 0.77, + "grad_norm": 0.5637034760619979, + "learning_rate": 6.739392725890256e-06, + "loss": 2.0994, + "step": 9953 + }, + { + "epoch": 0.77, + "grad_norm": 0.5717245413617875, + "learning_rate": 6.735126836300504e-06, + "loss": 1.8872, + "step": 9954 + }, + { + "epoch": 0.77, + "grad_norm": 0.5772121715304169, + "learning_rate": 6.730862087061818e-06, + "loss": 1.89, + "step": 9955 + }, + { + "epoch": 0.77, + "grad_norm": 0.666442961052075, + "learning_rate": 6.7265984784404655e-06, + "loss": 1.9956, + "step": 9956 + }, + { + "epoch": 0.77, + "grad_norm": 0.5830694592813649, + "learning_rate": 6.722336010702634e-06, + "loss": 1.8649, + "step": 9957 + }, + { + "epoch": 0.77, + "grad_norm": 0.5619501115789374, + "learning_rate": 6.718074684114456e-06, + "loss": 1.9065, + "step": 9958 + }, + { + "epoch": 0.77, + "grad_norm": 0.5680654249010272, + "learning_rate": 6.713814498941978e-06, + "loss": 1.9055, + "step": 9959 + }, + { + "epoch": 0.77, + "grad_norm": 0.5780008396698069, + "learning_rate": 6.7095554554511916e-06, + "loss": 1.8141, + "step": 9960 + }, + { + "epoch": 0.77, + "grad_norm": 0.608119798314736, + "learning_rate": 6.70529755390798e-06, + "loss": 2.0891, + "step": 9961 + }, + { + "epoch": 0.77, + "grad_norm": 0.5561305834915199, + "learning_rate": 6.701040794578206e-06, + "loss": 1.8134, + "step": 9962 + }, + { + "epoch": 0.77, + "grad_norm": 0.6067786735025787, + "learning_rate": 6.696785177727638e-06, + "loss": 1.8829, + "step": 9963 + }, + { + "epoch": 0.77, + "grad_norm": 0.5943768684151364, + "learning_rate": 6.692530703621947e-06, + "loss": 1.9586, + "step": 9964 + }, + { + "epoch": 0.77, + "grad_norm": 0.557904373419838, + "learning_rate": 6.688277372526783e-06, + "loss": 2.0555, + "step": 9965 + }, + { + "epoch": 0.77, + "grad_norm": 0.6175311489517533, + "learning_rate": 6.684025184707693e-06, + "loss": 1.8625, + "step": 9966 + }, + { + "epoch": 0.77, + "grad_norm": 0.6023966439951527, + "learning_rate": 6.679774140430143e-06, + "loss": 1.9094, + "step": 9967 + }, + { + "epoch": 0.77, + "grad_norm": 0.5867447031494165, + "learning_rate": 6.675524239959566e-06, + "loss": 1.8962, + "step": 9968 + }, + { + "epoch": 0.77, + "grad_norm": 0.5933499724920428, + "learning_rate": 6.671275483561285e-06, + "loss": 2.0624, + "step": 9969 + }, + { + "epoch": 0.77, + "grad_norm": 0.567088893904526, + "learning_rate": 6.667027871500562e-06, + "loss": 1.9079, + "step": 9970 + }, + { + "epoch": 0.77, + "grad_norm": 0.5785237753922738, + "learning_rate": 6.66278140404262e-06, + "loss": 1.8581, + "step": 9971 + }, + { + "epoch": 0.77, + "grad_norm": 0.6474861858884289, + "learning_rate": 6.658536081452549e-06, + "loss": 1.8548, + "step": 9972 + }, + { + "epoch": 0.77, + "grad_norm": 0.5314996654002195, + "learning_rate": 6.654291903995436e-06, + "loss": 2.0279, + "step": 9973 + }, + { + "epoch": 0.77, + "grad_norm": 0.5869809651041972, + "learning_rate": 6.650048871936237e-06, + "loss": 1.8561, + "step": 9974 + }, + { + "epoch": 0.77, + "grad_norm": 0.5830605481135058, + "learning_rate": 6.645806985539863e-06, + "loss": 1.8325, + "step": 9975 + }, + { + "epoch": 0.77, + "grad_norm": 0.612416575592877, + "learning_rate": 6.6415662450711765e-06, + "loss": 2.0703, + "step": 9976 + }, + { + "epoch": 0.77, + "grad_norm": 0.5914717951755843, + "learning_rate": 6.637326650794917e-06, + "loss": 1.9208, + "step": 9977 + }, + { + "epoch": 0.77, + "grad_norm": 0.6242455548079016, + "learning_rate": 6.633088202975785e-06, + "loss": 1.919, + "step": 9978 + }, + { + "epoch": 0.77, + "grad_norm": 0.5772746947727049, + "learning_rate": 6.628850901878425e-06, + "loss": 1.9028, + "step": 9979 + }, + { + "epoch": 0.77, + "grad_norm": 0.5576240883809834, + "learning_rate": 6.6246147477673656e-06, + "loss": 1.8122, + "step": 9980 + }, + { + "epoch": 0.77, + "grad_norm": 0.6249005168028868, + "learning_rate": 6.620379740907095e-06, + "loss": 2.058, + "step": 9981 + }, + { + "epoch": 0.77, + "grad_norm": 0.5790472165284056, + "learning_rate": 6.6161458815620216e-06, + "loss": 1.8779, + "step": 9982 + }, + { + "epoch": 0.77, + "grad_norm": 0.636280980716656, + "learning_rate": 6.611913169996487e-06, + "loss": 1.8997, + "step": 9983 + }, + { + "epoch": 0.77, + "grad_norm": 0.5848290117705705, + "learning_rate": 6.60768160647475e-06, + "loss": 1.8785, + "step": 9984 + }, + { + "epoch": 0.77, + "grad_norm": 0.5541827202225574, + "learning_rate": 6.603451191261009e-06, + "loss": 2.087, + "step": 9985 + }, + { + "epoch": 0.77, + "grad_norm": 0.5742661888654794, + "learning_rate": 6.599221924619378e-06, + "loss": 1.8494, + "step": 9986 + }, + { + "epoch": 0.77, + "grad_norm": 0.6299499531508945, + "learning_rate": 6.5949938068139285e-06, + "loss": 1.8677, + "step": 9987 + }, + { + "epoch": 0.77, + "grad_norm": 0.6773542594643112, + "learning_rate": 6.59076683810862e-06, + "loss": 1.865, + "step": 9988 + }, + { + "epoch": 0.77, + "grad_norm": 0.5700730851073591, + "learning_rate": 6.586541018767362e-06, + "loss": 2.115, + "step": 9989 + }, + { + "epoch": 0.77, + "grad_norm": 0.552819991442068, + "learning_rate": 6.582316349053994e-06, + "loss": 1.8921, + "step": 9990 + }, + { + "epoch": 0.77, + "grad_norm": 0.6390164747217233, + "learning_rate": 6.578092829232277e-06, + "loss": 1.8666, + "step": 9991 + }, + { + "epoch": 0.77, + "grad_norm": 0.6055115445308528, + "learning_rate": 6.5738704595659065e-06, + "loss": 1.8626, + "step": 9992 + }, + { + "epoch": 0.77, + "grad_norm": 0.5979122394726587, + "learning_rate": 6.5696492403184984e-06, + "loss": 2.0472, + "step": 9993 + }, + { + "epoch": 0.77, + "grad_norm": 0.5779865783798013, + "learning_rate": 6.565429171753601e-06, + "loss": 1.9004, + "step": 9994 + }, + { + "epoch": 0.77, + "grad_norm": 0.589463043358146, + "learning_rate": 6.561210254134692e-06, + "loss": 1.9451, + "step": 9995 + }, + { + "epoch": 0.77, + "grad_norm": 0.5716028717010009, + "learning_rate": 6.556992487725175e-06, + "loss": 1.8594, + "step": 9996 + }, + { + "epoch": 0.77, + "grad_norm": 0.5782449804425381, + "learning_rate": 6.5527758727883834e-06, + "loss": 2.0549, + "step": 9997 + }, + { + "epoch": 0.77, + "grad_norm": 0.6043053307236973, + "learning_rate": 6.548560409587576e-06, + "loss": 1.8392, + "step": 9998 + }, + { + "epoch": 0.77, + "grad_norm": 0.6028622129263858, + "learning_rate": 6.544346098385942e-06, + "loss": 1.8521, + "step": 9999 + }, + { + "epoch": 0.77, + "grad_norm": 0.5864628895617997, + "learning_rate": 6.540132939446603e-06, + "loss": 1.8805, + "step": 10000 + }, + { + "epoch": 0.77, + "grad_norm": 0.586361992547937, + "learning_rate": 6.535920933032596e-06, + "loss": 2.0866, + "step": 10001 + }, + { + "epoch": 0.77, + "grad_norm": 0.5873368297972714, + "learning_rate": 6.531710079406906e-06, + "loss": 1.8355, + "step": 10002 + }, + { + "epoch": 0.77, + "grad_norm": 0.5791391711167567, + "learning_rate": 6.52750037883241e-06, + "loss": 1.895, + "step": 10003 + }, + { + "epoch": 0.77, + "grad_norm": 0.5485657013195775, + "learning_rate": 6.523291831571959e-06, + "loss": 1.8427, + "step": 10004 + }, + { + "epoch": 0.77, + "grad_norm": 0.5477465347979119, + "learning_rate": 6.519084437888304e-06, + "loss": 2.0585, + "step": 10005 + }, + { + "epoch": 0.77, + "grad_norm": 0.575453858685787, + "learning_rate": 6.51487819804413e-06, + "loss": 1.9265, + "step": 10006 + }, + { + "epoch": 0.77, + "grad_norm": 0.5478010492156601, + "learning_rate": 6.5106731123020546e-06, + "loss": 1.8839, + "step": 10007 + }, + { + "epoch": 0.77, + "grad_norm": 0.5769967714588066, + "learning_rate": 6.506469180924599e-06, + "loss": 1.9629, + "step": 10008 + }, + { + "epoch": 0.77, + "grad_norm": 0.6034280670721905, + "learning_rate": 6.502266404174253e-06, + "loss": 2.0596, + "step": 10009 + }, + { + "epoch": 0.77, + "grad_norm": 0.6033345144568478, + "learning_rate": 6.498064782313415e-06, + "loss": 1.8825, + "step": 10010 + }, + { + "epoch": 0.77, + "grad_norm": 0.6067575839944989, + "learning_rate": 6.4938643156043835e-06, + "loss": 1.8883, + "step": 10011 + }, + { + "epoch": 0.77, + "grad_norm": 0.5867719907349904, + "learning_rate": 6.489665004309437e-06, + "loss": 1.8519, + "step": 10012 + }, + { + "epoch": 0.77, + "grad_norm": 0.5746031227098906, + "learning_rate": 6.485466848690752e-06, + "loss": 2.0812, + "step": 10013 + }, + { + "epoch": 0.77, + "grad_norm": 0.6096205149191006, + "learning_rate": 6.481269849010419e-06, + "loss": 1.9063, + "step": 10014 + }, + { + "epoch": 0.77, + "grad_norm": 0.6076453387857592, + "learning_rate": 6.477074005530498e-06, + "loss": 1.8952, + "step": 10015 + }, + { + "epoch": 0.77, + "grad_norm": 0.5946102402060761, + "learning_rate": 6.4728793185129336e-06, + "loss": 1.8409, + "step": 10016 + }, + { + "epoch": 0.77, + "grad_norm": 0.5968892988986049, + "learning_rate": 6.46868578821962e-06, + "loss": 2.0379, + "step": 10017 + }, + { + "epoch": 0.77, + "grad_norm": 0.5942751560893951, + "learning_rate": 6.464493414912396e-06, + "loss": 1.91, + "step": 10018 + }, + { + "epoch": 0.77, + "grad_norm": 0.5648972887726359, + "learning_rate": 6.460302198852977e-06, + "loss": 1.9067, + "step": 10019 + }, + { + "epoch": 0.77, + "grad_norm": 0.5942393337560976, + "learning_rate": 6.456112140303061e-06, + "loss": 1.9133, + "step": 10020 + }, + { + "epoch": 0.77, + "grad_norm": 0.6585322177155132, + "learning_rate": 6.451923239524254e-06, + "loss": 2.0709, + "step": 10021 + }, + { + "epoch": 0.77, + "grad_norm": 0.6141989158744526, + "learning_rate": 6.4477354967780626e-06, + "loss": 1.8884, + "step": 10022 + }, + { + "epoch": 0.77, + "grad_norm": 0.6002928844728935, + "learning_rate": 6.443548912325972e-06, + "loss": 1.8747, + "step": 10023 + }, + { + "epoch": 0.77, + "grad_norm": 0.5634394492724144, + "learning_rate": 6.439363486429348e-06, + "loss": 1.8797, + "step": 10024 + }, + { + "epoch": 0.77, + "grad_norm": 0.5761662899349146, + "learning_rate": 6.435179219349502e-06, + "loss": 2.0783, + "step": 10025 + }, + { + "epoch": 0.77, + "grad_norm": 0.5579647921737981, + "learning_rate": 6.430996111347698e-06, + "loss": 1.9562, + "step": 10026 + }, + { + "epoch": 0.77, + "grad_norm": 0.5659666406618951, + "learning_rate": 6.426814162685086e-06, + "loss": 1.8415, + "step": 10027 + }, + { + "epoch": 0.77, + "grad_norm": 0.5787391743069016, + "learning_rate": 6.422633373622763e-06, + "loss": 1.8564, + "step": 10028 + }, + { + "epoch": 0.77, + "grad_norm": 0.6630315907972386, + "learning_rate": 6.418453744421757e-06, + "loss": 2.0494, + "step": 10029 + }, + { + "epoch": 0.77, + "grad_norm": 0.6110796604399334, + "learning_rate": 6.414275275343018e-06, + "loss": 1.8535, + "step": 10030 + }, + { + "epoch": 0.77, + "grad_norm": 0.5794576965042313, + "learning_rate": 6.410097966647427e-06, + "loss": 1.8939, + "step": 10031 + }, + { + "epoch": 0.77, + "grad_norm": 0.5864510194271094, + "learning_rate": 6.4059218185957895e-06, + "loss": 1.9318, + "step": 10032 + }, + { + "epoch": 0.77, + "grad_norm": 0.655566102467746, + "learning_rate": 6.401746831448838e-06, + "loss": 2.067, + "step": 10033 + }, + { + "epoch": 0.77, + "grad_norm": 0.5915265496063535, + "learning_rate": 6.397573005467236e-06, + "loss": 1.9018, + "step": 10034 + }, + { + "epoch": 0.77, + "grad_norm": 0.5551210299638268, + "learning_rate": 6.3934003409115705e-06, + "loss": 1.8719, + "step": 10035 + }, + { + "epoch": 0.77, + "grad_norm": 0.5871593044917669, + "learning_rate": 6.3892288380423605e-06, + "loss": 1.868, + "step": 10036 + }, + { + "epoch": 0.77, + "grad_norm": 0.5966944048283587, + "learning_rate": 6.38505849712005e-06, + "loss": 2.0476, + "step": 10037 + }, + { + "epoch": 0.77, + "grad_norm": 0.5842778970241503, + "learning_rate": 6.3808893184050086e-06, + "loss": 1.8378, + "step": 10038 + }, + { + "epoch": 0.77, + "grad_norm": 0.5544912691981234, + "learning_rate": 6.376721302157537e-06, + "loss": 1.9651, + "step": 10039 + }, + { + "epoch": 0.77, + "grad_norm": 0.5771080296047124, + "learning_rate": 6.37255444863786e-06, + "loss": 1.8301, + "step": 10040 + }, + { + "epoch": 0.77, + "grad_norm": 0.5491719794670327, + "learning_rate": 6.368388758106133e-06, + "loss": 2.0695, + "step": 10041 + }, + { + "epoch": 0.77, + "grad_norm": 0.5903850670669745, + "learning_rate": 6.3642242308224385e-06, + "loss": 1.8568, + "step": 10042 + }, + { + "epoch": 0.77, + "grad_norm": 0.5658938617819126, + "learning_rate": 6.36006086704678e-06, + "loss": 1.8107, + "step": 10043 + }, + { + "epoch": 0.77, + "grad_norm": 0.5305314551735982, + "learning_rate": 6.355898667039098e-06, + "loss": 1.8473, + "step": 10044 + }, + { + "epoch": 0.77, + "grad_norm": 0.5692333602334103, + "learning_rate": 6.351737631059251e-06, + "loss": 2.0614, + "step": 10045 + }, + { + "epoch": 0.78, + "grad_norm": 0.6000196452009824, + "learning_rate": 6.347577759367035e-06, + "loss": 1.8995, + "step": 10046 + }, + { + "epoch": 0.78, + "grad_norm": 0.5866873000412588, + "learning_rate": 6.343419052222166e-06, + "loss": 1.832, + "step": 10047 + }, + { + "epoch": 0.78, + "grad_norm": 0.5718061035122218, + "learning_rate": 6.339261509884287e-06, + "loss": 1.8479, + "step": 10048 + }, + { + "epoch": 0.78, + "grad_norm": 0.6466662187942003, + "learning_rate": 6.335105132612981e-06, + "loss": 2.0644, + "step": 10049 + }, + { + "epoch": 0.78, + "grad_norm": 0.5977666912694315, + "learning_rate": 6.330949920667725e-06, + "loss": 1.8454, + "step": 10050 + }, + { + "epoch": 0.78, + "grad_norm": 0.5484974019375556, + "learning_rate": 6.3267958743079644e-06, + "loss": 1.9043, + "step": 10051 + }, + { + "epoch": 0.78, + "grad_norm": 0.5749453761136436, + "learning_rate": 6.322642993793051e-06, + "loss": 1.8355, + "step": 10052 + }, + { + "epoch": 0.78, + "grad_norm": 0.5399241629411601, + "learning_rate": 6.318491279382263e-06, + "loss": 2.0221, + "step": 10053 + }, + { + "epoch": 0.78, + "grad_norm": 0.6115173243275183, + "learning_rate": 6.314340731334811e-06, + "loss": 1.9055, + "step": 10054 + }, + { + "epoch": 0.78, + "grad_norm": 0.5852342682935882, + "learning_rate": 6.31019134990983e-06, + "loss": 1.8264, + "step": 10055 + }, + { + "epoch": 0.78, + "grad_norm": 0.6147865791758184, + "learning_rate": 6.306043135366382e-06, + "loss": 1.8352, + "step": 10056 + }, + { + "epoch": 0.78, + "grad_norm": 0.5151068172060812, + "learning_rate": 6.301896087963463e-06, + "loss": 2.138, + "step": 10057 + }, + { + "epoch": 0.78, + "grad_norm": 0.6004863813288748, + "learning_rate": 6.297750207959973e-06, + "loss": 1.8971, + "step": 10058 + }, + { + "epoch": 0.78, + "grad_norm": 0.5557952775898698, + "learning_rate": 6.2936054956147734e-06, + "loss": 1.8587, + "step": 10059 + }, + { + "epoch": 0.78, + "grad_norm": 0.5763869969958483, + "learning_rate": 6.28946195118664e-06, + "loss": 1.8159, + "step": 10060 + }, + { + "epoch": 0.78, + "grad_norm": 0.5549057652601986, + "learning_rate": 6.285319574934248e-06, + "loss": 2.0673, + "step": 10061 + }, + { + "epoch": 0.78, + "grad_norm": 0.6102100694782228, + "learning_rate": 6.28117836711625e-06, + "loss": 1.9474, + "step": 10062 + }, + { + "epoch": 0.78, + "grad_norm": 0.5262440448497068, + "learning_rate": 6.277038327991178e-06, + "loss": 1.9861, + "step": 10063 + }, + { + "epoch": 0.78, + "grad_norm": 0.5857097821819889, + "learning_rate": 6.27289945781751e-06, + "loss": 1.8657, + "step": 10064 + }, + { + "epoch": 0.78, + "grad_norm": 0.554190064908165, + "learning_rate": 6.268761756853678e-06, + "loss": 2.0046, + "step": 10065 + }, + { + "epoch": 0.78, + "grad_norm": 0.6066211580840386, + "learning_rate": 6.264625225357992e-06, + "loss": 1.8893, + "step": 10066 + }, + { + "epoch": 0.78, + "grad_norm": 0.6125863165231086, + "learning_rate": 6.260489863588714e-06, + "loss": 1.8816, + "step": 10067 + }, + { + "epoch": 0.78, + "grad_norm": 0.5681644085835287, + "learning_rate": 6.256355671804051e-06, + "loss": 1.8538, + "step": 10068 + }, + { + "epoch": 0.78, + "grad_norm": 0.5595138644142642, + "learning_rate": 6.252222650262088e-06, + "loss": 2.0469, + "step": 10069 + }, + { + "epoch": 0.78, + "grad_norm": 0.5808291677875814, + "learning_rate": 6.248090799220902e-06, + "loss": 1.9365, + "step": 10070 + }, + { + "epoch": 0.78, + "grad_norm": 0.5972158629384291, + "learning_rate": 6.24396011893843e-06, + "loss": 1.7938, + "step": 10071 + }, + { + "epoch": 0.78, + "grad_norm": 0.5568687597262374, + "learning_rate": 6.2398306096725725e-06, + "loss": 1.8651, + "step": 10072 + }, + { + "epoch": 0.78, + "grad_norm": 0.593375835736498, + "learning_rate": 6.235702271681176e-06, + "loss": 2.0963, + "step": 10073 + }, + { + "epoch": 0.78, + "grad_norm": 0.6032003588416528, + "learning_rate": 6.231575105221965e-06, + "loss": 1.889, + "step": 10074 + }, + { + "epoch": 0.78, + "grad_norm": 0.5807758911649286, + "learning_rate": 6.227449110552622e-06, + "loss": 1.8522, + "step": 10075 + }, + { + "epoch": 0.78, + "grad_norm": 0.5355647110133122, + "learning_rate": 6.2233242879307485e-06, + "loss": 1.9854, + "step": 10076 + }, + { + "epoch": 0.78, + "grad_norm": 0.5517738752798058, + "learning_rate": 6.219200637613878e-06, + "loss": 2.0728, + "step": 10077 + }, + { + "epoch": 0.78, + "grad_norm": 0.5522783668122618, + "learning_rate": 6.215078159859467e-06, + "loss": 1.8672, + "step": 10078 + }, + { + "epoch": 0.78, + "grad_norm": 0.5545673671107626, + "learning_rate": 6.2109568549248955e-06, + "loss": 1.8258, + "step": 10079 + }, + { + "epoch": 0.78, + "grad_norm": 0.5912673635993275, + "learning_rate": 6.2068367230674746e-06, + "loss": 1.8731, + "step": 10080 + }, + { + "epoch": 0.78, + "grad_norm": 0.634378555627443, + "learning_rate": 6.202717764544441e-06, + "loss": 2.0781, + "step": 10081 + }, + { + "epoch": 0.78, + "grad_norm": 0.5439353515640469, + "learning_rate": 6.1985999796129565e-06, + "loss": 1.9569, + "step": 10082 + }, + { + "epoch": 0.78, + "grad_norm": 0.5953419649431108, + "learning_rate": 6.194483368530116e-06, + "loss": 1.8085, + "step": 10083 + }, + { + "epoch": 0.78, + "grad_norm": 0.5688934715157574, + "learning_rate": 6.1903679315529345e-06, + "loss": 1.906, + "step": 10084 + }, + { + "epoch": 0.78, + "grad_norm": 0.5715189741768948, + "learning_rate": 6.1862536689383546e-06, + "loss": 2.0499, + "step": 10085 + }, + { + "epoch": 0.78, + "grad_norm": 0.6208629482778797, + "learning_rate": 6.1821405809432465e-06, + "loss": 1.8366, + "step": 10086 + }, + { + "epoch": 0.78, + "grad_norm": 0.5753310144701023, + "learning_rate": 6.178028667824407e-06, + "loss": 1.9116, + "step": 10087 + }, + { + "epoch": 0.78, + "grad_norm": 0.49365565170200715, + "learning_rate": 6.1739179298385605e-06, + "loss": 1.9509, + "step": 10088 + }, + { + "epoch": 0.78, + "grad_norm": 0.5771349074060218, + "learning_rate": 6.1698083672423575e-06, + "loss": 1.9057, + "step": 10089 + }, + { + "epoch": 0.78, + "grad_norm": 0.5833238491934783, + "learning_rate": 6.165699980292375e-06, + "loss": 2.0414, + "step": 10090 + }, + { + "epoch": 0.78, + "grad_norm": 0.5954294323002491, + "learning_rate": 6.161592769245114e-06, + "loss": 1.8867, + "step": 10091 + }, + { + "epoch": 0.78, + "grad_norm": 0.5806750673585022, + "learning_rate": 6.157486734357007e-06, + "loss": 1.8525, + "step": 10092 + }, + { + "epoch": 0.78, + "grad_norm": 0.5945895819094865, + "learning_rate": 6.153381875884412e-06, + "loss": 2.0745, + "step": 10093 + }, + { + "epoch": 0.78, + "grad_norm": 0.5709817744439, + "learning_rate": 6.149278194083611e-06, + "loss": 1.9667, + "step": 10094 + }, + { + "epoch": 0.78, + "grad_norm": 0.6474232843319209, + "learning_rate": 6.145175689210808e-06, + "loss": 1.9097, + "step": 10095 + }, + { + "epoch": 0.78, + "grad_norm": 0.5598658412041255, + "learning_rate": 6.141074361522156e-06, + "loss": 1.847, + "step": 10096 + }, + { + "epoch": 0.78, + "grad_norm": 0.6146565546275735, + "learning_rate": 6.136974211273691e-06, + "loss": 2.0331, + "step": 10097 + }, + { + "epoch": 0.78, + "grad_norm": 0.5777107508715803, + "learning_rate": 6.132875238721425e-06, + "loss": 1.8992, + "step": 10098 + }, + { + "epoch": 0.78, + "grad_norm": 0.630145675797826, + "learning_rate": 6.12877744412127e-06, + "loss": 1.9002, + "step": 10099 + }, + { + "epoch": 0.78, + "grad_norm": 0.6038323782580428, + "learning_rate": 6.1246808277290514e-06, + "loss": 1.8445, + "step": 10100 + }, + { + "epoch": 0.78, + "grad_norm": 0.5228287717230661, + "learning_rate": 6.120585389800559e-06, + "loss": 1.9604, + "step": 10101 + }, + { + "epoch": 0.78, + "grad_norm": 0.6381279711780531, + "learning_rate": 6.1164911305914785e-06, + "loss": 2.0549, + "step": 10102 + }, + { + "epoch": 0.78, + "grad_norm": 0.5851651295483327, + "learning_rate": 6.1123980503574314e-06, + "loss": 1.834, + "step": 10103 + }, + { + "epoch": 0.78, + "grad_norm": 0.5771326240035205, + "learning_rate": 6.108306149353973e-06, + "loss": 1.8883, + "step": 10104 + }, + { + "epoch": 0.78, + "grad_norm": 0.5689894770075464, + "learning_rate": 6.1042154278365595e-06, + "loss": 2.1256, + "step": 10105 + }, + { + "epoch": 0.78, + "grad_norm": 0.575100721833533, + "learning_rate": 6.100125886060606e-06, + "loss": 1.8302, + "step": 10106 + }, + { + "epoch": 0.78, + "grad_norm": 0.5216443231597006, + "learning_rate": 6.096037524281445e-06, + "loss": 1.9294, + "step": 10107 + }, + { + "epoch": 0.78, + "grad_norm": 0.5852360694403315, + "learning_rate": 6.091950342754307e-06, + "loss": 1.8879, + "step": 10108 + }, + { + "epoch": 0.78, + "grad_norm": 0.5665430211497783, + "learning_rate": 6.0878643417344e-06, + "loss": 2.0706, + "step": 10109 + }, + { + "epoch": 0.78, + "grad_norm": 0.5664408951656804, + "learning_rate": 6.0837795214768065e-06, + "loss": 1.829, + "step": 10110 + }, + { + "epoch": 0.78, + "grad_norm": 0.5585570813029677, + "learning_rate": 6.079695882236561e-06, + "loss": 1.8622, + "step": 10111 + }, + { + "epoch": 0.78, + "grad_norm": 0.5863249716499088, + "learning_rate": 6.075613424268642e-06, + "loss": 1.953, + "step": 10112 + }, + { + "epoch": 0.78, + "grad_norm": 0.576412366030406, + "learning_rate": 6.071532147827913e-06, + "loss": 1.9511, + "step": 10113 + }, + { + "epoch": 0.78, + "grad_norm": 0.6428748149986354, + "learning_rate": 6.067452053169184e-06, + "loss": 2.1085, + "step": 10114 + }, + { + "epoch": 0.78, + "grad_norm": 0.6100309206460723, + "learning_rate": 6.063373140547215e-06, + "loss": 1.8199, + "step": 10115 + }, + { + "epoch": 0.78, + "grad_norm": 0.6048292381368248, + "learning_rate": 6.059295410216642e-06, + "loss": 1.8836, + "step": 10116 + }, + { + "epoch": 0.78, + "grad_norm": 0.5347002340662244, + "learning_rate": 6.05521886243208e-06, + "loss": 1.9985, + "step": 10117 + }, + { + "epoch": 0.78, + "grad_norm": 0.6686727425979928, + "learning_rate": 6.0511434974480255e-06, + "loss": 1.8423, + "step": 10118 + }, + { + "epoch": 0.78, + "grad_norm": 0.6028589174735475, + "learning_rate": 6.047069315518916e-06, + "loss": 1.8959, + "step": 10119 + }, + { + "epoch": 0.78, + "grad_norm": 0.6132176449514787, + "learning_rate": 6.042996316899147e-06, + "loss": 1.8677, + "step": 10120 + }, + { + "epoch": 0.78, + "grad_norm": 0.6806251823939801, + "learning_rate": 6.0389245018429856e-06, + "loss": 1.8489, + "step": 10121 + }, + { + "epoch": 0.78, + "grad_norm": 0.5805168963197802, + "learning_rate": 6.034853870604665e-06, + "loss": 2.1149, + "step": 10122 + }, + { + "epoch": 0.78, + "grad_norm": 0.6969058015977357, + "learning_rate": 6.030784423438323e-06, + "loss": 1.8891, + "step": 10123 + }, + { + "epoch": 0.78, + "grad_norm": 0.609700487912176, + "learning_rate": 6.026716160598042e-06, + "loss": 1.9001, + "step": 10124 + }, + { + "epoch": 0.78, + "grad_norm": 0.6152482489133086, + "learning_rate": 6.022649082337814e-06, + "loss": 2.0529, + "step": 10125 + }, + { + "epoch": 0.78, + "grad_norm": 0.6003211240117182, + "learning_rate": 6.018583188911564e-06, + "loss": 1.9437, + "step": 10126 + }, + { + "epoch": 0.78, + "grad_norm": 0.6002182451010295, + "learning_rate": 6.014518480573142e-06, + "loss": 1.836, + "step": 10127 + }, + { + "epoch": 0.78, + "grad_norm": 0.6660014445356606, + "learning_rate": 6.010454957576328e-06, + "loss": 1.8901, + "step": 10128 + }, + { + "epoch": 0.78, + "grad_norm": 0.6216928804887418, + "learning_rate": 6.00639262017482e-06, + "loss": 2.049, + "step": 10129 + }, + { + "epoch": 0.78, + "grad_norm": 0.5429054514471208, + "learning_rate": 6.002331468622249e-06, + "loss": 1.8567, + "step": 10130 + }, + { + "epoch": 0.78, + "grad_norm": 0.6413393093060423, + "learning_rate": 5.998271503172171e-06, + "loss": 1.8519, + "step": 10131 + }, + { + "epoch": 0.78, + "grad_norm": 0.5902031983831537, + "learning_rate": 5.99421272407806e-06, + "loss": 1.9589, + "step": 10132 + }, + { + "epoch": 0.78, + "grad_norm": 0.5941626227190078, + "learning_rate": 5.990155131593328e-06, + "loss": 1.8737, + "step": 10133 + }, + { + "epoch": 0.78, + "grad_norm": 0.5955417343391998, + "learning_rate": 5.986098725971307e-06, + "loss": 2.0747, + "step": 10134 + }, + { + "epoch": 0.78, + "grad_norm": 0.552707163100088, + "learning_rate": 5.9820435074652515e-06, + "loss": 1.8385, + "step": 10135 + }, + { + "epoch": 0.78, + "grad_norm": 0.677261042180884, + "learning_rate": 5.9779894763283495e-06, + "loss": 1.8682, + "step": 10136 + }, + { + "epoch": 0.78, + "grad_norm": 0.5713822323112319, + "learning_rate": 5.973936632813706e-06, + "loss": 2.0908, + "step": 10137 + }, + { + "epoch": 0.78, + "grad_norm": 0.5748302642485908, + "learning_rate": 5.9698849771743645e-06, + "loss": 1.9437, + "step": 10138 + }, + { + "epoch": 0.78, + "grad_norm": 0.667278025961368, + "learning_rate": 5.965834509663276e-06, + "loss": 1.8593, + "step": 10139 + }, + { + "epoch": 0.78, + "grad_norm": 0.6520434595527309, + "learning_rate": 5.961785230533337e-06, + "loss": 1.8723, + "step": 10140 + }, + { + "epoch": 0.78, + "grad_norm": 0.7034314452073165, + "learning_rate": 5.957737140037356e-06, + "loss": 2.0831, + "step": 10141 + }, + { + "epoch": 0.78, + "grad_norm": 0.5838837759361781, + "learning_rate": 5.9536902384280725e-06, + "loss": 1.8429, + "step": 10142 + }, + { + "epoch": 0.78, + "grad_norm": 0.6326573336011698, + "learning_rate": 5.949644525958159e-06, + "loss": 1.8971, + "step": 10143 + }, + { + "epoch": 0.78, + "grad_norm": 0.5522052282123743, + "learning_rate": 5.945600002880186e-06, + "loss": 1.9224, + "step": 10144 + }, + { + "epoch": 0.78, + "grad_norm": 0.6074494212096242, + "learning_rate": 5.941556669446688e-06, + "loss": 1.8474, + "step": 10145 + }, + { + "epoch": 0.78, + "grad_norm": 0.6623475156410622, + "learning_rate": 5.937514525910107e-06, + "loss": 2.0108, + "step": 10146 + }, + { + "epoch": 0.78, + "grad_norm": 0.5992695229736276, + "learning_rate": 5.933473572522791e-06, + "loss": 1.8422, + "step": 10147 + }, + { + "epoch": 0.78, + "grad_norm": 0.5801299422822204, + "learning_rate": 5.929433809537055e-06, + "loss": 1.8989, + "step": 10148 + }, + { + "epoch": 0.78, + "grad_norm": 0.5457602110075374, + "learning_rate": 5.925395237205111e-06, + "loss": 2.1121, + "step": 10149 + }, + { + "epoch": 0.78, + "grad_norm": 0.6447570840681043, + "learning_rate": 5.9213578557791025e-06, + "loss": 1.9697, + "step": 10150 + }, + { + "epoch": 0.78, + "grad_norm": 0.6355062517673185, + "learning_rate": 5.917321665511105e-06, + "loss": 1.8199, + "step": 10151 + }, + { + "epoch": 0.78, + "grad_norm": 0.57744702331984, + "learning_rate": 5.9132866666530975e-06, + "loss": 1.859, + "step": 10152 + }, + { + "epoch": 0.78, + "grad_norm": 0.5662753112895077, + "learning_rate": 5.9092528594570205e-06, + "loss": 1.8927, + "step": 10153 + }, + { + "epoch": 0.78, + "grad_norm": 0.6213024308491631, + "learning_rate": 5.905220244174719e-06, + "loss": 2.0644, + "step": 10154 + }, + { + "epoch": 0.78, + "grad_norm": 0.6821567518067191, + "learning_rate": 5.901188821057948e-06, + "loss": 1.8458, + "step": 10155 + }, + { + "epoch": 0.78, + "grad_norm": 0.6508438258807706, + "learning_rate": 5.897158590358434e-06, + "loss": 1.8065, + "step": 10156 + }, + { + "epoch": 0.78, + "grad_norm": 0.5853371132935865, + "learning_rate": 5.893129552327781e-06, + "loss": 1.934, + "step": 10157 + }, + { + "epoch": 0.78, + "grad_norm": 0.6669399420881194, + "learning_rate": 5.889101707217531e-06, + "loss": 2.1007, + "step": 10158 + }, + { + "epoch": 0.78, + "grad_norm": 0.6416726353851406, + "learning_rate": 5.885075055279188e-06, + "loss": 1.8645, + "step": 10159 + }, + { + "epoch": 0.78, + "grad_norm": 0.5765620991236448, + "learning_rate": 5.881049596764126e-06, + "loss": 1.8283, + "step": 10160 + }, + { + "epoch": 0.78, + "grad_norm": 0.6897390952731093, + "learning_rate": 5.877025331923677e-06, + "loss": 2.0477, + "step": 10161 + }, + { + "epoch": 0.78, + "grad_norm": 0.605277413880483, + "learning_rate": 5.873002261009106e-06, + "loss": 1.8406, + "step": 10162 + }, + { + "epoch": 0.78, + "grad_norm": 0.5393009996615813, + "learning_rate": 5.868980384271575e-06, + "loss": 1.9112, + "step": 10163 + }, + { + "epoch": 0.78, + "grad_norm": 0.6269097657371308, + "learning_rate": 5.864959701962189e-06, + "loss": 1.887, + "step": 10164 + }, + { + "epoch": 0.78, + "grad_norm": 0.6095295341773123, + "learning_rate": 5.860940214331978e-06, + "loss": 1.8742, + "step": 10165 + }, + { + "epoch": 0.78, + "grad_norm": 0.5721410707849374, + "learning_rate": 5.856921921631889e-06, + "loss": 1.9884, + "step": 10166 + }, + { + "epoch": 0.78, + "grad_norm": 0.6392445124683205, + "learning_rate": 5.852904824112821e-06, + "loss": 1.8701, + "step": 10167 + }, + { + "epoch": 0.78, + "grad_norm": 0.5904792149223216, + "learning_rate": 5.848888922025553e-06, + "loss": 1.8669, + "step": 10168 + }, + { + "epoch": 0.78, + "grad_norm": 0.5273618143885774, + "learning_rate": 5.844874215620818e-06, + "loss": 1.9654, + "step": 10169 + }, + { + "epoch": 0.78, + "grad_norm": 0.5493000320003958, + "learning_rate": 5.8408607051492924e-06, + "loss": 2.0672, + "step": 10170 + }, + { + "epoch": 0.78, + "grad_norm": 0.5757970836402747, + "learning_rate": 5.836848390861533e-06, + "loss": 1.8803, + "step": 10171 + }, + { + "epoch": 0.78, + "grad_norm": 0.5844356893432947, + "learning_rate": 5.832837273008051e-06, + "loss": 1.8375, + "step": 10172 + }, + { + "epoch": 0.78, + "grad_norm": 0.567491091008223, + "learning_rate": 5.828827351839281e-06, + "loss": 2.0416, + "step": 10173 + }, + { + "epoch": 0.78, + "grad_norm": 0.5832385356753257, + "learning_rate": 5.824818627605577e-06, + "loss": 1.8882, + "step": 10174 + }, + { + "epoch": 0.78, + "grad_norm": 0.5536219733833632, + "learning_rate": 5.820811100557217e-06, + "loss": 1.9457, + "step": 10175 + }, + { + "epoch": 0.79, + "grad_norm": 0.5688684334503991, + "learning_rate": 5.816804770944412e-06, + "loss": 1.8933, + "step": 10176 + }, + { + "epoch": 0.79, + "grad_norm": 0.5751088571899027, + "learning_rate": 5.812799639017291e-06, + "loss": 1.8959, + "step": 10177 + }, + { + "epoch": 0.79, + "grad_norm": 0.5346562415783215, + "learning_rate": 5.808795705025913e-06, + "loss": 2.0489, + "step": 10178 + }, + { + "epoch": 0.79, + "grad_norm": 0.567526388646483, + "learning_rate": 5.8047929692202575e-06, + "loss": 1.8752, + "step": 10179 + }, + { + "epoch": 0.79, + "grad_norm": 0.5950583902507007, + "learning_rate": 5.800791431850233e-06, + "loss": 1.8292, + "step": 10180 + }, + { + "epoch": 0.79, + "grad_norm": 0.5553104342275026, + "learning_rate": 5.796791093165671e-06, + "loss": 1.9726, + "step": 10181 + }, + { + "epoch": 0.79, + "grad_norm": 0.5703071925588128, + "learning_rate": 5.792791953416329e-06, + "loss": 2.0853, + "step": 10182 + }, + { + "epoch": 0.79, + "grad_norm": 0.5771191189014386, + "learning_rate": 5.788794012851892e-06, + "loss": 1.9173, + "step": 10183 + }, + { + "epoch": 0.79, + "grad_norm": 0.5711228591058355, + "learning_rate": 5.784797271721967e-06, + "loss": 1.8595, + "step": 10184 + }, + { + "epoch": 0.79, + "grad_norm": 0.5812246908652152, + "learning_rate": 5.780801730276086e-06, + "loss": 1.8691, + "step": 10185 + }, + { + "epoch": 0.79, + "grad_norm": 0.6215626897561156, + "learning_rate": 5.776807388763706e-06, + "loss": 2.0689, + "step": 10186 + }, + { + "epoch": 0.79, + "grad_norm": 0.5774305712186452, + "learning_rate": 5.772814247434213e-06, + "loss": 1.8735, + "step": 10187 + }, + { + "epoch": 0.79, + "grad_norm": 0.5445873376729672, + "learning_rate": 5.768822306536914e-06, + "loss": 1.9326, + "step": 10188 + }, + { + "epoch": 0.79, + "grad_norm": 0.5452877879908641, + "learning_rate": 5.764831566321041e-06, + "loss": 1.8502, + "step": 10189 + }, + { + "epoch": 0.79, + "grad_norm": 0.5603654199093803, + "learning_rate": 5.760842027035762e-06, + "loss": 2.0599, + "step": 10190 + }, + { + "epoch": 0.79, + "grad_norm": 0.5841340235052108, + "learning_rate": 5.756853688930136e-06, + "loss": 1.8113, + "step": 10191 + }, + { + "epoch": 0.79, + "grad_norm": 0.5566513136453618, + "learning_rate": 5.7528665522531945e-06, + "loss": 1.8716, + "step": 10192 + }, + { + "epoch": 0.79, + "grad_norm": 0.5279703432641484, + "learning_rate": 5.7488806172538715e-06, + "loss": 2.038, + "step": 10193 + }, + { + "epoch": 0.79, + "grad_norm": 0.5368760779345237, + "learning_rate": 5.7448958841810005e-06, + "loss": 1.9449, + "step": 10194 + }, + { + "epoch": 0.79, + "grad_norm": 0.5515294187408957, + "learning_rate": 5.74091235328339e-06, + "loss": 1.8733, + "step": 10195 + }, + { + "epoch": 0.79, + "grad_norm": 0.5694584250660059, + "learning_rate": 5.736930024809747e-06, + "loss": 1.895, + "step": 10196 + }, + { + "epoch": 0.79, + "grad_norm": 0.5653134365590797, + "learning_rate": 5.732948899008683e-06, + "loss": 1.8986, + "step": 10197 + }, + { + "epoch": 0.79, + "grad_norm": 0.5749694234134166, + "learning_rate": 5.728968976128784e-06, + "loss": 2.0621, + "step": 10198 + }, + { + "epoch": 0.79, + "grad_norm": 0.5923613021910564, + "learning_rate": 5.724990256418505e-06, + "loss": 1.8973, + "step": 10199 + }, + { + "epoch": 0.79, + "grad_norm": 0.5283108617445471, + "learning_rate": 5.721012740126275e-06, + "loss": 1.9051, + "step": 10200 + }, + { + "epoch": 0.79, + "grad_norm": 0.5701399239105553, + "learning_rate": 5.717036427500428e-06, + "loss": 1.8832, + "step": 10201 + }, + { + "epoch": 0.79, + "grad_norm": 0.597149923971932, + "learning_rate": 5.713061318789198e-06, + "loss": 2.0832, + "step": 10202 + }, + { + "epoch": 0.79, + "grad_norm": 0.5598766534175446, + "learning_rate": 5.709087414240797e-06, + "loss": 1.7911, + "step": 10203 + }, + { + "epoch": 0.79, + "grad_norm": 0.5507342236312438, + "learning_rate": 5.7051147141033105e-06, + "loss": 1.8805, + "step": 10204 + }, + { + "epoch": 0.79, + "grad_norm": 0.5412444472953101, + "learning_rate": 5.701143218624768e-06, + "loss": 1.8313, + "step": 10205 + }, + { + "epoch": 0.79, + "grad_norm": 0.5666461445034541, + "learning_rate": 5.697172928053154e-06, + "loss": 2.1236, + "step": 10206 + }, + { + "epoch": 0.79, + "grad_norm": 0.6036745699692719, + "learning_rate": 5.693203842636324e-06, + "loss": 1.8774, + "step": 10207 + }, + { + "epoch": 0.79, + "grad_norm": 0.6092923342936672, + "learning_rate": 5.689235962622086e-06, + "loss": 1.8851, + "step": 10208 + }, + { + "epoch": 0.79, + "grad_norm": 0.5542305622028801, + "learning_rate": 5.685269288258191e-06, + "loss": 1.9141, + "step": 10209 + }, + { + "epoch": 0.79, + "grad_norm": 0.5886553949991596, + "learning_rate": 5.6813038197922745e-06, + "loss": 2.0935, + "step": 10210 + }, + { + "epoch": 0.79, + "grad_norm": 0.6211429523901579, + "learning_rate": 5.677339557471923e-06, + "loss": 1.87, + "step": 10211 + }, + { + "epoch": 0.79, + "grad_norm": 0.5520842572665948, + "learning_rate": 5.673376501544641e-06, + "loss": 1.9255, + "step": 10212 + }, + { + "epoch": 0.79, + "grad_norm": 0.6052437487103176, + "learning_rate": 5.669414652257857e-06, + "loss": 1.9263, + "step": 10213 + }, + { + "epoch": 0.79, + "grad_norm": 0.5622522547144251, + "learning_rate": 5.665454009858939e-06, + "loss": 2.0954, + "step": 10214 + }, + { + "epoch": 0.79, + "grad_norm": 0.5889837654984621, + "learning_rate": 5.66149457459515e-06, + "loss": 1.8227, + "step": 10215 + }, + { + "epoch": 0.79, + "grad_norm": 0.6061514488626564, + "learning_rate": 5.6575363467136904e-06, + "loss": 1.8723, + "step": 10216 + }, + { + "epoch": 0.79, + "grad_norm": 0.5612073265682498, + "learning_rate": 5.6535793264617136e-06, + "loss": 1.9035, + "step": 10217 + }, + { + "epoch": 0.79, + "grad_norm": 0.5771406306708882, + "learning_rate": 5.649623514086247e-06, + "loss": 2.045, + "step": 10218 + }, + { + "epoch": 0.79, + "grad_norm": 0.584967315559342, + "learning_rate": 5.6456689098342815e-06, + "loss": 1.9497, + "step": 10219 + }, + { + "epoch": 0.79, + "grad_norm": 0.6024156814405159, + "learning_rate": 5.641715513952711e-06, + "loss": 1.7953, + "step": 10220 + }, + { + "epoch": 0.79, + "grad_norm": 0.5769820008209147, + "learning_rate": 5.63776332668837e-06, + "loss": 1.8681, + "step": 10221 + }, + { + "epoch": 0.79, + "grad_norm": 0.5515671326118492, + "learning_rate": 5.633812348288006e-06, + "loss": 2.0348, + "step": 10222 + }, + { + "epoch": 0.79, + "grad_norm": 0.6110101064489594, + "learning_rate": 5.629862578998296e-06, + "loss": 1.8227, + "step": 10223 + }, + { + "epoch": 0.79, + "grad_norm": 0.6265852542030033, + "learning_rate": 5.625914019065836e-06, + "loss": 1.8829, + "step": 10224 + }, + { + "epoch": 0.79, + "grad_norm": 0.5913058524612007, + "learning_rate": 5.621966668737158e-06, + "loss": 1.9123, + "step": 10225 + }, + { + "epoch": 0.79, + "grad_norm": 0.5452853986978798, + "learning_rate": 5.618020528258708e-06, + "loss": 2.0632, + "step": 10226 + }, + { + "epoch": 0.79, + "grad_norm": 0.5585681239462417, + "learning_rate": 5.614075597876858e-06, + "loss": 1.8609, + "step": 10227 + }, + { + "epoch": 0.79, + "grad_norm": 0.5688685312968452, + "learning_rate": 5.61013187783791e-06, + "loss": 1.8949, + "step": 10228 + }, + { + "epoch": 0.79, + "grad_norm": 0.592374676559332, + "learning_rate": 5.606189368388082e-06, + "loss": 1.8682, + "step": 10229 + }, + { + "epoch": 0.79, + "grad_norm": 0.6009408796425828, + "learning_rate": 5.602248069773525e-06, + "loss": 2.0481, + "step": 10230 + }, + { + "epoch": 0.79, + "grad_norm": 0.5492701933860198, + "learning_rate": 5.598307982240308e-06, + "loss": 1.9623, + "step": 10231 + }, + { + "epoch": 0.79, + "grad_norm": 0.5834691019948706, + "learning_rate": 5.594369106034431e-06, + "loss": 1.8692, + "step": 10232 + }, + { + "epoch": 0.79, + "grad_norm": 0.5432090300988969, + "learning_rate": 5.590431441401805e-06, + "loss": 1.8241, + "step": 10233 + }, + { + "epoch": 0.79, + "grad_norm": 0.5942845273288151, + "learning_rate": 5.586494988588287e-06, + "loss": 2.0979, + "step": 10234 + }, + { + "epoch": 0.79, + "grad_norm": 0.6139610389185082, + "learning_rate": 5.582559747839636e-06, + "loss": 1.8265, + "step": 10235 + }, + { + "epoch": 0.79, + "grad_norm": 0.538122442200722, + "learning_rate": 5.578625719401548e-06, + "loss": 1.8358, + "step": 10236 + }, + { + "epoch": 0.79, + "grad_norm": 0.580654600951496, + "learning_rate": 5.574692903519652e-06, + "loss": 1.9259, + "step": 10237 + }, + { + "epoch": 0.79, + "grad_norm": 0.6225132240193826, + "learning_rate": 5.570761300439462e-06, + "loss": 2.0463, + "step": 10238 + }, + { + "epoch": 0.79, + "grad_norm": 0.5581917644171344, + "learning_rate": 5.566830910406468e-06, + "loss": 1.9289, + "step": 10239 + }, + { + "epoch": 0.79, + "grad_norm": 0.5544339373049293, + "learning_rate": 5.562901733666062e-06, + "loss": 1.8166, + "step": 10240 + }, + { + "epoch": 0.79, + "grad_norm": 0.5939519999422949, + "learning_rate": 5.558973770463535e-06, + "loss": 1.8533, + "step": 10241 + }, + { + "epoch": 0.79, + "grad_norm": 0.5787973258444494, + "learning_rate": 5.555047021044152e-06, + "loss": 2.0659, + "step": 10242 + }, + { + "epoch": 0.79, + "grad_norm": 0.5746496768888303, + "learning_rate": 5.551121485653068e-06, + "loss": 1.9367, + "step": 10243 + }, + { + "epoch": 0.79, + "grad_norm": 0.5506921545418012, + "learning_rate": 5.547197164535356e-06, + "loss": 1.8886, + "step": 10244 + }, + { + "epoch": 0.79, + "grad_norm": 0.5731905075281262, + "learning_rate": 5.543274057936055e-06, + "loss": 1.8807, + "step": 10245 + }, + { + "epoch": 0.79, + "grad_norm": 0.5398101599746967, + "learning_rate": 5.539352166100068e-06, + "loss": 2.0526, + "step": 10246 + }, + { + "epoch": 0.79, + "grad_norm": 0.5745703364437389, + "learning_rate": 5.535431489272278e-06, + "loss": 1.8848, + "step": 10247 + }, + { + "epoch": 0.79, + "grad_norm": 0.5645346973901552, + "learning_rate": 5.531512027697469e-06, + "loss": 1.8675, + "step": 10248 + }, + { + "epoch": 0.79, + "grad_norm": 0.5659046088054787, + "learning_rate": 5.527593781620333e-06, + "loss": 1.832, + "step": 10249 + }, + { + "epoch": 0.79, + "grad_norm": 0.5266341694714294, + "learning_rate": 5.523676751285514e-06, + "loss": 2.0943, + "step": 10250 + }, + { + "epoch": 0.79, + "grad_norm": 0.5512121494694953, + "learning_rate": 5.519760936937576e-06, + "loss": 1.8815, + "step": 10251 + }, + { + "epoch": 0.79, + "grad_norm": 0.5818293062018851, + "learning_rate": 5.515846338820974e-06, + "loss": 1.8412, + "step": 10252 + }, + { + "epoch": 0.79, + "grad_norm": 0.5774122436165149, + "learning_rate": 5.511932957180143e-06, + "loss": 1.8504, + "step": 10253 + }, + { + "epoch": 0.79, + "grad_norm": 0.55915741770971, + "learning_rate": 5.508020792259388e-06, + "loss": 2.0508, + "step": 10254 + }, + { + "epoch": 0.79, + "grad_norm": 0.5837820683469448, + "learning_rate": 5.504109844302965e-06, + "loss": 1.8319, + "step": 10255 + }, + { + "epoch": 0.79, + "grad_norm": 0.5094465647589013, + "learning_rate": 5.500200113555071e-06, + "loss": 1.874, + "step": 10256 + }, + { + "epoch": 0.79, + "grad_norm": 0.5567812950145951, + "learning_rate": 5.496291600259781e-06, + "loss": 1.8742, + "step": 10257 + }, + { + "epoch": 0.79, + "grad_norm": 0.5528570780184897, + "learning_rate": 5.49238430466113e-06, + "loss": 2.035, + "step": 10258 + }, + { + "epoch": 0.79, + "grad_norm": 0.5731744766953656, + "learning_rate": 5.488478227003066e-06, + "loss": 1.8885, + "step": 10259 + }, + { + "epoch": 0.79, + "grad_norm": 0.554468470868587, + "learning_rate": 5.484573367529464e-06, + "loss": 1.8441, + "step": 10260 + }, + { + "epoch": 0.79, + "grad_norm": 0.584561113207535, + "learning_rate": 5.480669726484117e-06, + "loss": 1.8617, + "step": 10261 + }, + { + "epoch": 0.79, + "grad_norm": 0.5253242205105245, + "learning_rate": 5.476767304110744e-06, + "loss": 2.0885, + "step": 10262 + }, + { + "epoch": 0.79, + "grad_norm": 0.5727124144099541, + "learning_rate": 5.472866100652988e-06, + "loss": 1.8749, + "step": 10263 + }, + { + "epoch": 0.79, + "grad_norm": 0.5599191396672536, + "learning_rate": 5.468966116354432e-06, + "loss": 1.8963, + "step": 10264 + }, + { + "epoch": 0.79, + "grad_norm": 0.5746913336673557, + "learning_rate": 5.4650673514585525e-06, + "loss": 1.8766, + "step": 10265 + }, + { + "epoch": 0.79, + "grad_norm": 0.5809373332848577, + "learning_rate": 5.4611698062087685e-06, + "loss": 2.0741, + "step": 10266 + }, + { + "epoch": 0.79, + "grad_norm": 0.5537955606503521, + "learning_rate": 5.457273480848421e-06, + "loss": 1.8758, + "step": 10267 + }, + { + "epoch": 0.79, + "grad_norm": 0.5621255490953787, + "learning_rate": 5.453378375620771e-06, + "loss": 1.9321, + "step": 10268 + }, + { + "epoch": 0.79, + "grad_norm": 0.6335233778371688, + "learning_rate": 5.449484490769013e-06, + "loss": 1.8508, + "step": 10269 + }, + { + "epoch": 0.79, + "grad_norm": 0.565445242133362, + "learning_rate": 5.44559182653625e-06, + "loss": 2.053, + "step": 10270 + }, + { + "epoch": 0.79, + "grad_norm": 0.5887890407133843, + "learning_rate": 5.441700383165524e-06, + "loss": 1.8463, + "step": 10271 + }, + { + "epoch": 0.79, + "grad_norm": 0.5899514262415477, + "learning_rate": 5.43781016089979e-06, + "loss": 1.8304, + "step": 10272 + }, + { + "epoch": 0.79, + "grad_norm": 0.5901655907447991, + "learning_rate": 5.4339211599819305e-06, + "loss": 1.8753, + "step": 10273 + }, + { + "epoch": 0.79, + "grad_norm": 0.5496382704505604, + "learning_rate": 5.430033380654751e-06, + "loss": 2.0473, + "step": 10274 + }, + { + "epoch": 0.79, + "grad_norm": 0.5809429077436236, + "learning_rate": 5.426146823160985e-06, + "loss": 1.849, + "step": 10275 + }, + { + "epoch": 0.79, + "grad_norm": 0.5957383967825098, + "learning_rate": 5.422261487743283e-06, + "loss": 1.8604, + "step": 10276 + }, + { + "epoch": 0.79, + "grad_norm": 0.6129061434008679, + "learning_rate": 5.4183773746442236e-06, + "loss": 1.883, + "step": 10277 + }, + { + "epoch": 0.79, + "grad_norm": 0.5565066566791456, + "learning_rate": 5.414494484106308e-06, + "loss": 2.0398, + "step": 10278 + }, + { + "epoch": 0.79, + "grad_norm": 0.5982237011439069, + "learning_rate": 5.410612816371971e-06, + "loss": 1.8983, + "step": 10279 + }, + { + "epoch": 0.79, + "grad_norm": 0.6127915927204217, + "learning_rate": 5.4067323716835345e-06, + "loss": 1.8712, + "step": 10280 + }, + { + "epoch": 0.79, + "grad_norm": 0.5821930123210192, + "learning_rate": 5.402853150283294e-06, + "loss": 1.9474, + "step": 10281 + }, + { + "epoch": 0.79, + "grad_norm": 0.656713435681311, + "learning_rate": 5.398975152413438e-06, + "loss": 2.0953, + "step": 10282 + }, + { + "epoch": 0.79, + "grad_norm": 0.5840902521479423, + "learning_rate": 5.395098378316088e-06, + "loss": 1.8489, + "step": 10283 + }, + { + "epoch": 0.79, + "grad_norm": 0.55350460863578, + "learning_rate": 5.391222828233283e-06, + "loss": 1.8798, + "step": 10284 + }, + { + "epoch": 0.79, + "grad_norm": 0.6144365686094893, + "learning_rate": 5.387348502406994e-06, + "loss": 1.818, + "step": 10285 + }, + { + "epoch": 0.79, + "grad_norm": 0.5814257581474069, + "learning_rate": 5.383475401079108e-06, + "loss": 2.0623, + "step": 10286 + }, + { + "epoch": 0.79, + "grad_norm": 0.5293098189172737, + "learning_rate": 5.3796035244914475e-06, + "loss": 1.961, + "step": 10287 + }, + { + "epoch": 0.79, + "grad_norm": 0.5636973968520742, + "learning_rate": 5.3757328728857244e-06, + "loss": 1.8499, + "step": 10288 + }, + { + "epoch": 0.79, + "grad_norm": 0.560371910246276, + "learning_rate": 5.371863446503628e-06, + "loss": 1.788, + "step": 10289 + }, + { + "epoch": 0.79, + "grad_norm": 0.6198146161406368, + "learning_rate": 5.367995245586735e-06, + "loss": 2.0474, + "step": 10290 + }, + { + "epoch": 0.79, + "grad_norm": 0.5998493297919627, + "learning_rate": 5.3641282703765335e-06, + "loss": 1.8854, + "step": 10291 + }, + { + "epoch": 0.79, + "grad_norm": 0.5513247079457374, + "learning_rate": 5.360262521114487e-06, + "loss": 1.8743, + "step": 10292 + }, + { + "epoch": 0.79, + "grad_norm": 0.5830003159871909, + "learning_rate": 5.356397998041926e-06, + "loss": 1.9419, + "step": 10293 + }, + { + "epoch": 0.79, + "grad_norm": 0.6324548413570229, + "learning_rate": 5.35253470140013e-06, + "loss": 2.0546, + "step": 10294 + }, + { + "epoch": 0.79, + "grad_norm": 0.6026276940550389, + "learning_rate": 5.348672631430318e-06, + "loss": 1.8717, + "step": 10295 + }, + { + "epoch": 0.79, + "grad_norm": 0.5514225263636126, + "learning_rate": 5.344811788373591e-06, + "loss": 1.8382, + "step": 10296 + }, + { + "epoch": 0.79, + "grad_norm": 0.5671284110691961, + "learning_rate": 5.340952172471017e-06, + "loss": 1.8905, + "step": 10297 + }, + { + "epoch": 0.79, + "grad_norm": 0.5675155609848035, + "learning_rate": 5.3370937839635645e-06, + "loss": 2.039, + "step": 10298 + }, + { + "epoch": 0.79, + "grad_norm": 0.5305825517509315, + "learning_rate": 5.3332366230921136e-06, + "loss": 1.9334, + "step": 10299 + }, + { + "epoch": 0.79, + "grad_norm": 0.54277746848529, + "learning_rate": 5.329380690097505e-06, + "loss": 1.8432, + "step": 10300 + }, + { + "epoch": 0.79, + "grad_norm": 0.5508172745350512, + "learning_rate": 5.325525985220461e-06, + "loss": 1.83, + "step": 10301 + }, + { + "epoch": 0.79, + "grad_norm": 0.5645998126572026, + "learning_rate": 5.32167250870165e-06, + "loss": 2.0421, + "step": 10302 + }, + { + "epoch": 0.79, + "grad_norm": 0.5578295076236371, + "learning_rate": 5.317820260781678e-06, + "loss": 1.8688, + "step": 10303 + }, + { + "epoch": 0.79, + "grad_norm": 0.5759097367056776, + "learning_rate": 5.313969241701039e-06, + "loss": 1.8703, + "step": 10304 + }, + { + "epoch": 0.8, + "grad_norm": 0.574778720207803, + "learning_rate": 5.310119451700168e-06, + "loss": 1.8642, + "step": 10305 + }, + { + "epoch": 0.8, + "grad_norm": 0.5367015864686491, + "learning_rate": 5.30627089101943e-06, + "loss": 1.9388, + "step": 10306 + }, + { + "epoch": 0.8, + "grad_norm": 0.6049783532682824, + "learning_rate": 5.302423559899106e-06, + "loss": 2.0682, + "step": 10307 + }, + { + "epoch": 0.8, + "grad_norm": 0.6111634323387574, + "learning_rate": 5.298577458579396e-06, + "loss": 1.8124, + "step": 10308 + }, + { + "epoch": 0.8, + "grad_norm": 0.5836421462073448, + "learning_rate": 5.29473258730043e-06, + "loss": 1.8467, + "step": 10309 + }, + { + "epoch": 0.8, + "grad_norm": 0.5305833433133336, + "learning_rate": 5.290888946302255e-06, + "loss": 2.0496, + "step": 10310 + }, + { + "epoch": 0.8, + "grad_norm": 0.5445315896367134, + "learning_rate": 5.287046535824863e-06, + "loss": 1.9016, + "step": 10311 + }, + { + "epoch": 0.8, + "grad_norm": 0.558590605602343, + "learning_rate": 5.283205356108134e-06, + "loss": 1.9273, + "step": 10312 + }, + { + "epoch": 0.8, + "grad_norm": 0.6059423219905423, + "learning_rate": 5.279365407391889e-06, + "loss": 1.8909, + "step": 10313 + }, + { + "epoch": 0.8, + "grad_norm": 0.5661399926850279, + "learning_rate": 5.27552668991588e-06, + "loss": 2.0573, + "step": 10314 + }, + { + "epoch": 0.8, + "grad_norm": 0.5675871387087331, + "learning_rate": 5.271689203919769e-06, + "loss": 1.8707, + "step": 10315 + }, + { + "epoch": 0.8, + "grad_norm": 0.5689459248486165, + "learning_rate": 5.267852949643146e-06, + "loss": 1.8222, + "step": 10316 + }, + { + "epoch": 0.8, + "grad_norm": 0.5765163887617202, + "learning_rate": 5.2640179273255235e-06, + "loss": 1.9105, + "step": 10317 + }, + { + "epoch": 0.8, + "grad_norm": 0.6306191073475599, + "learning_rate": 5.260184137206342e-06, + "loss": 1.9107, + "step": 10318 + }, + { + "epoch": 0.8, + "grad_norm": 0.6997031386173649, + "learning_rate": 5.256351579524957e-06, + "loss": 2.0389, + "step": 10319 + }, + { + "epoch": 0.8, + "grad_norm": 0.5807543663854395, + "learning_rate": 5.252520254520652e-06, + "loss": 1.8686, + "step": 10320 + }, + { + "epoch": 0.8, + "grad_norm": 0.5790411930285899, + "learning_rate": 5.248690162432632e-06, + "loss": 1.8863, + "step": 10321 + }, + { + "epoch": 0.8, + "grad_norm": 0.5862191611179373, + "learning_rate": 5.244861303500026e-06, + "loss": 2.0881, + "step": 10322 + }, + { + "epoch": 0.8, + "grad_norm": 0.5560918722047492, + "learning_rate": 5.241033677961884e-06, + "loss": 1.8146, + "step": 10323 + }, + { + "epoch": 0.8, + "grad_norm": 0.5430021711267736, + "learning_rate": 5.237207286057183e-06, + "loss": 1.9507, + "step": 10324 + }, + { + "epoch": 0.8, + "grad_norm": 0.5560114458057376, + "learning_rate": 5.2333821280248165e-06, + "loss": 1.8663, + "step": 10325 + }, + { + "epoch": 0.8, + "grad_norm": 0.6010762322339794, + "learning_rate": 5.229558204103615e-06, + "loss": 2.0784, + "step": 10326 + }, + { + "epoch": 0.8, + "grad_norm": 0.5592874855314791, + "learning_rate": 5.225735514532301e-06, + "loss": 1.7995, + "step": 10327 + }, + { + "epoch": 0.8, + "grad_norm": 0.6498090355589624, + "learning_rate": 5.221914059549559e-06, + "loss": 1.9245, + "step": 10328 + }, + { + "epoch": 0.8, + "grad_norm": 0.5893423299268513, + "learning_rate": 5.218093839393973e-06, + "loss": 1.8647, + "step": 10329 + }, + { + "epoch": 0.8, + "grad_norm": 0.5894203492278574, + "learning_rate": 5.214274854304055e-06, + "loss": 1.9527, + "step": 10330 + }, + { + "epoch": 0.8, + "grad_norm": 0.6373028922919814, + "learning_rate": 5.210457104518241e-06, + "loss": 2.0314, + "step": 10331 + }, + { + "epoch": 0.8, + "grad_norm": 0.6335270602257104, + "learning_rate": 5.206640590274886e-06, + "loss": 1.8584, + "step": 10332 + }, + { + "epoch": 0.8, + "grad_norm": 0.5496610046732414, + "learning_rate": 5.20282531181227e-06, + "loss": 1.8243, + "step": 10333 + }, + { + "epoch": 0.8, + "grad_norm": 0.5998920978285385, + "learning_rate": 5.199011269368609e-06, + "loss": 2.0822, + "step": 10334 + }, + { + "epoch": 0.8, + "grad_norm": 0.5983489863751883, + "learning_rate": 5.195198463182005e-06, + "loss": 1.9002, + "step": 10335 + }, + { + "epoch": 0.8, + "grad_norm": 0.5707070103568428, + "learning_rate": 5.191386893490527e-06, + "loss": 1.8556, + "step": 10336 + }, + { + "epoch": 0.8, + "grad_norm": 0.5744978603061615, + "learning_rate": 5.187576560532145e-06, + "loss": 1.914, + "step": 10337 + }, + { + "epoch": 0.8, + "grad_norm": 0.5672192672954681, + "learning_rate": 5.1837674645447395e-06, + "loss": 1.8454, + "step": 10338 + }, + { + "epoch": 0.8, + "grad_norm": 0.5913556339760355, + "learning_rate": 5.179959605766152e-06, + "loss": 2.0527, + "step": 10339 + }, + { + "epoch": 0.8, + "grad_norm": 0.680219680864323, + "learning_rate": 5.176152984434102e-06, + "loss": 1.8865, + "step": 10340 + }, + { + "epoch": 0.8, + "grad_norm": 0.584683757062755, + "learning_rate": 5.172347600786251e-06, + "loss": 1.842, + "step": 10341 + }, + { + "epoch": 0.8, + "grad_norm": 0.5760081367948665, + "learning_rate": 5.16854345506021e-06, + "loss": 2.0348, + "step": 10342 + }, + { + "epoch": 0.8, + "grad_norm": 0.5717842824326596, + "learning_rate": 5.16474054749346e-06, + "loss": 1.9429, + "step": 10343 + }, + { + "epoch": 0.8, + "grad_norm": 0.68775606948644, + "learning_rate": 5.16093887832344e-06, + "loss": 1.8564, + "step": 10344 + }, + { + "epoch": 0.8, + "grad_norm": 0.6043264048554993, + "learning_rate": 5.157138447787518e-06, + "loss": 1.9089, + "step": 10345 + }, + { + "epoch": 0.8, + "grad_norm": 0.6394038609420872, + "learning_rate": 5.153339256122946e-06, + "loss": 2.0669, + "step": 10346 + }, + { + "epoch": 0.8, + "grad_norm": 0.636460306375277, + "learning_rate": 5.149541303566951e-06, + "loss": 1.8703, + "step": 10347 + }, + { + "epoch": 0.8, + "grad_norm": 0.5820783354923312, + "learning_rate": 5.145744590356633e-06, + "loss": 1.8293, + "step": 10348 + }, + { + "epoch": 0.8, + "grad_norm": 0.5801644385919899, + "learning_rate": 5.141949116729036e-06, + "loss": 1.9309, + "step": 10349 + }, + { + "epoch": 0.8, + "grad_norm": 0.5601047242978071, + "learning_rate": 5.138154882921148e-06, + "loss": 1.8695, + "step": 10350 + }, + { + "epoch": 0.8, + "grad_norm": 0.6222478712100765, + "learning_rate": 5.134361889169839e-06, + "loss": 2.0389, + "step": 10351 + }, + { + "epoch": 0.8, + "grad_norm": 0.577659493208015, + "learning_rate": 5.130570135711929e-06, + "loss": 1.8828, + "step": 10352 + }, + { + "epoch": 0.8, + "grad_norm": 0.5373195670886877, + "learning_rate": 5.126779622784147e-06, + "loss": 1.8101, + "step": 10353 + }, + { + "epoch": 0.8, + "grad_norm": 0.5292527344723461, + "learning_rate": 5.122990350623158e-06, + "loss": 2.0719, + "step": 10354 + }, + { + "epoch": 0.8, + "grad_norm": 0.5090977627271175, + "learning_rate": 5.119202319465538e-06, + "loss": 1.9228, + "step": 10355 + }, + { + "epoch": 0.8, + "grad_norm": 0.5594528704275471, + "learning_rate": 5.115415529547787e-06, + "loss": 1.844, + "step": 10356 + }, + { + "epoch": 0.8, + "grad_norm": 0.6029495574235477, + "learning_rate": 5.111629981106334e-06, + "loss": 1.8856, + "step": 10357 + }, + { + "epoch": 0.8, + "grad_norm": 0.6052338691824617, + "learning_rate": 5.107845674377526e-06, + "loss": 2.0608, + "step": 10358 + }, + { + "epoch": 0.8, + "grad_norm": 0.5491371500563812, + "learning_rate": 5.104062609597629e-06, + "loss": 1.8839, + "step": 10359 + }, + { + "epoch": 0.8, + "grad_norm": 0.5773457593640353, + "learning_rate": 5.100280787002839e-06, + "loss": 1.8259, + "step": 10360 + }, + { + "epoch": 0.8, + "grad_norm": 0.5649773987155331, + "learning_rate": 5.0965002068292685e-06, + "loss": 1.9423, + "step": 10361 + }, + { + "epoch": 0.8, + "grad_norm": 0.5785805466704477, + "learning_rate": 5.092720869312959e-06, + "loss": 1.873, + "step": 10362 + }, + { + "epoch": 0.8, + "grad_norm": 0.5676805001921199, + "learning_rate": 5.088942774689864e-06, + "loss": 2.0345, + "step": 10363 + }, + { + "epoch": 0.8, + "grad_norm": 0.5715183490442326, + "learning_rate": 5.08516592319587e-06, + "loss": 1.8918, + "step": 10364 + }, + { + "epoch": 0.8, + "grad_norm": 0.6174112879230108, + "learning_rate": 5.081390315066778e-06, + "loss": 1.8792, + "step": 10365 + }, + { + "epoch": 0.8, + "grad_norm": 0.5993335732602378, + "learning_rate": 5.077615950538317e-06, + "loss": 2.0485, + "step": 10366 + }, + { + "epoch": 0.8, + "grad_norm": 0.5683376129831172, + "learning_rate": 5.073842829846134e-06, + "loss": 1.8389, + "step": 10367 + }, + { + "epoch": 0.8, + "grad_norm": 0.542394646173869, + "learning_rate": 5.070070953225803e-06, + "loss": 1.8919, + "step": 10368 + }, + { + "epoch": 0.8, + "grad_norm": 0.6262665714458804, + "learning_rate": 5.066300320912817e-06, + "loss": 1.8945, + "step": 10369 + }, + { + "epoch": 0.8, + "grad_norm": 0.626755441614926, + "learning_rate": 5.062530933142593e-06, + "loss": 1.8764, + "step": 10370 + }, + { + "epoch": 0.8, + "grad_norm": 0.5954528902464622, + "learning_rate": 5.058762790150465e-06, + "loss": 2.007, + "step": 10371 + }, + { + "epoch": 0.8, + "grad_norm": 0.5930677156983201, + "learning_rate": 5.054995892171699e-06, + "loss": 1.869, + "step": 10372 + }, + { + "epoch": 0.8, + "grad_norm": 0.5780747408663028, + "learning_rate": 5.051230239441482e-06, + "loss": 1.8801, + "step": 10373 + }, + { + "epoch": 0.8, + "grad_norm": 0.5585698091130455, + "learning_rate": 5.047465832194897e-06, + "loss": 1.9302, + "step": 10374 + }, + { + "epoch": 0.8, + "grad_norm": 0.5935062474674005, + "learning_rate": 5.043702670666995e-06, + "loss": 2.0592, + "step": 10375 + }, + { + "epoch": 0.8, + "grad_norm": 0.6037233417699033, + "learning_rate": 5.039940755092726e-06, + "loss": 1.8571, + "step": 10376 + }, + { + "epoch": 0.8, + "grad_norm": 0.5804962300112098, + "learning_rate": 5.03618008570694e-06, + "loss": 1.8558, + "step": 10377 + }, + { + "epoch": 0.8, + "grad_norm": 0.5873479664586692, + "learning_rate": 5.03242066274445e-06, + "loss": 2.1213, + "step": 10378 + }, + { + "epoch": 0.8, + "grad_norm": 0.5398112015742577, + "learning_rate": 5.028662486439967e-06, + "loss": 1.816, + "step": 10379 + }, + { + "epoch": 0.8, + "grad_norm": 0.5314620328229054, + "learning_rate": 5.024905557028131e-06, + "loss": 1.9437, + "step": 10380 + }, + { + "epoch": 0.8, + "grad_norm": 0.562245946522114, + "learning_rate": 5.0211498747435056e-06, + "loss": 1.8473, + "step": 10381 + }, + { + "epoch": 0.8, + "grad_norm": 0.5601921681062941, + "learning_rate": 5.017395439820554e-06, + "loss": 1.8853, + "step": 10382 + }, + { + "epoch": 0.8, + "grad_norm": 0.6182870045260961, + "learning_rate": 5.013642252493705e-06, + "loss": 2.0214, + "step": 10383 + }, + { + "epoch": 0.8, + "grad_norm": 0.6224546570590392, + "learning_rate": 5.009890312997284e-06, + "loss": 1.8768, + "step": 10384 + }, + { + "epoch": 0.8, + "grad_norm": 0.640378831898014, + "learning_rate": 5.006139621565517e-06, + "loss": 1.8567, + "step": 10385 + }, + { + "epoch": 0.8, + "grad_norm": 0.5551483050093223, + "learning_rate": 5.002390178432603e-06, + "loss": 1.9441, + "step": 10386 + }, + { + "epoch": 0.8, + "grad_norm": 0.5719679927224324, + "learning_rate": 4.998641983832617e-06, + "loss": 2.0436, + "step": 10387 + }, + { + "epoch": 0.8, + "grad_norm": 0.5670876010467094, + "learning_rate": 4.9948950379995725e-06, + "loss": 1.8826, + "step": 10388 + }, + { + "epoch": 0.8, + "grad_norm": 0.5727792955927588, + "learning_rate": 4.991149341167425e-06, + "loss": 1.8285, + "step": 10389 + }, + { + "epoch": 0.8, + "grad_norm": 0.5724534693483202, + "learning_rate": 4.987404893570019e-06, + "loss": 2.0774, + "step": 10390 + }, + { + "epoch": 0.8, + "grad_norm": 0.603741929050136, + "learning_rate": 4.983661695441133e-06, + "loss": 1.9009, + "step": 10391 + }, + { + "epoch": 0.8, + "grad_norm": 0.5281196676362316, + "learning_rate": 4.979919747014486e-06, + "loss": 1.9007, + "step": 10392 + }, + { + "epoch": 0.8, + "grad_norm": 0.5728356756697508, + "learning_rate": 4.976179048523685e-06, + "loss": 1.8816, + "step": 10393 + }, + { + "epoch": 0.8, + "grad_norm": 0.6421015162427863, + "learning_rate": 4.972439600202297e-06, + "loss": 1.8389, + "step": 10394 + }, + { + "epoch": 0.8, + "grad_norm": 0.637883448070874, + "learning_rate": 4.968701402283773e-06, + "loss": 2.0585, + "step": 10395 + }, + { + "epoch": 0.8, + "grad_norm": 0.5505554823735205, + "learning_rate": 4.964964455001505e-06, + "loss": 1.8754, + "step": 10396 + }, + { + "epoch": 0.8, + "grad_norm": 0.5519493623918355, + "learning_rate": 4.961228758588824e-06, + "loss": 1.8408, + "step": 10397 + }, + { + "epoch": 0.8, + "grad_norm": 0.6057503850916376, + "learning_rate": 4.957494313278949e-06, + "loss": 2.0569, + "step": 10398 + }, + { + "epoch": 0.8, + "grad_norm": 0.5477108863437177, + "learning_rate": 4.953761119305031e-06, + "loss": 1.9866, + "step": 10399 + }, + { + "epoch": 0.8, + "grad_norm": 0.585291830808401, + "learning_rate": 4.950029176900172e-06, + "loss": 1.8872, + "step": 10400 + }, + { + "epoch": 0.8, + "grad_norm": 0.578500486052067, + "learning_rate": 4.9462984862973545e-06, + "loss": 1.8129, + "step": 10401 + }, + { + "epoch": 0.8, + "grad_norm": 0.5874258755013646, + "learning_rate": 4.942569047729506e-06, + "loss": 1.84, + "step": 10402 + }, + { + "epoch": 0.8, + "grad_norm": 0.5399540475919716, + "learning_rate": 4.938840861429467e-06, + "loss": 2.0339, + "step": 10403 + }, + { + "epoch": 0.8, + "grad_norm": 0.5579303258079316, + "learning_rate": 4.9351139276300085e-06, + "loss": 1.8504, + "step": 10404 + }, + { + "epoch": 0.8, + "grad_norm": 0.5244675783005178, + "learning_rate": 4.931388246563815e-06, + "loss": 1.8727, + "step": 10405 + }, + { + "epoch": 0.8, + "grad_norm": 0.5791870072121792, + "learning_rate": 4.927663818463496e-06, + "loss": 1.8567, + "step": 10406 + }, + { + "epoch": 0.8, + "grad_norm": 0.6195623670387986, + "learning_rate": 4.923940643561583e-06, + "loss": 2.0648, + "step": 10407 + }, + { + "epoch": 0.8, + "grad_norm": 0.5944153120036526, + "learning_rate": 4.920218722090533e-06, + "loss": 1.8782, + "step": 10408 + }, + { + "epoch": 0.8, + "grad_norm": 0.5413637079012079, + "learning_rate": 4.916498054282717e-06, + "loss": 1.7945, + "step": 10409 + }, + { + "epoch": 0.8, + "grad_norm": 0.5368396943795246, + "learning_rate": 4.912778640370433e-06, + "loss": 2.0301, + "step": 10410 + }, + { + "epoch": 0.8, + "grad_norm": 0.5278498142011506, + "learning_rate": 4.9090604805859e-06, + "loss": 1.9584, + "step": 10411 + }, + { + "epoch": 0.8, + "grad_norm": 0.5501669685041819, + "learning_rate": 4.905343575161256e-06, + "loss": 1.8437, + "step": 10412 + }, + { + "epoch": 0.8, + "grad_norm": 0.5233635488254041, + "learning_rate": 4.901627924328561e-06, + "loss": 1.821, + "step": 10413 + }, + { + "epoch": 0.8, + "grad_norm": 0.5766796825557069, + "learning_rate": 4.897913528319803e-06, + "loss": 1.8523, + "step": 10414 + }, + { + "epoch": 0.8, + "grad_norm": 0.551979371801613, + "learning_rate": 4.894200387366884e-06, + "loss": 2.0413, + "step": 10415 + }, + { + "epoch": 0.8, + "grad_norm": 0.5836250799455329, + "learning_rate": 4.890488501701634e-06, + "loss": 1.8462, + "step": 10416 + }, + { + "epoch": 0.8, + "grad_norm": 0.5950701799724265, + "learning_rate": 4.886777871555798e-06, + "loss": 1.9546, + "step": 10417 + }, + { + "epoch": 0.8, + "grad_norm": 0.5826279314693502, + "learning_rate": 4.883068497161045e-06, + "loss": 1.8413, + "step": 10418 + }, + { + "epoch": 0.8, + "grad_norm": 0.5561326880613706, + "learning_rate": 4.879360378748968e-06, + "loss": 2.0547, + "step": 10419 + }, + { + "epoch": 0.8, + "grad_norm": 0.5741954912855409, + "learning_rate": 4.875653516551087e-06, + "loss": 1.8776, + "step": 10420 + }, + { + "epoch": 0.8, + "grad_norm": 0.5858028268982356, + "learning_rate": 4.871947910798818e-06, + "loss": 1.8612, + "step": 10421 + }, + { + "epoch": 0.8, + "grad_norm": 0.5646329302176061, + "learning_rate": 4.868243561723535e-06, + "loss": 1.8294, + "step": 10422 + }, + { + "epoch": 0.8, + "grad_norm": 0.5745474944816826, + "learning_rate": 4.864540469556517e-06, + "loss": 2.1368, + "step": 10423 + }, + { + "epoch": 0.8, + "grad_norm": 0.5901469417854697, + "learning_rate": 4.860838634528941e-06, + "loss": 1.8662, + "step": 10424 + }, + { + "epoch": 0.8, + "grad_norm": 0.602480374412365, + "learning_rate": 4.857138056871951e-06, + "loss": 1.8809, + "step": 10425 + }, + { + "epoch": 0.8, + "grad_norm": 0.5743160014398665, + "learning_rate": 4.853438736816582e-06, + "loss": 1.8233, + "step": 10426 + }, + { + "epoch": 0.8, + "grad_norm": 0.5670704660474511, + "learning_rate": 4.849740674593795e-06, + "loss": 2.0468, + "step": 10427 + }, + { + "epoch": 0.8, + "grad_norm": 0.5857919946391883, + "learning_rate": 4.846043870434483e-06, + "loss": 1.8831, + "step": 10428 + }, + { + "epoch": 0.8, + "grad_norm": 0.5751834023916403, + "learning_rate": 4.842348324569435e-06, + "loss": 1.8539, + "step": 10429 + }, + { + "epoch": 0.8, + "grad_norm": 0.5440895214545866, + "learning_rate": 4.838654037229395e-06, + "loss": 1.9342, + "step": 10430 + }, + { + "epoch": 0.8, + "grad_norm": 0.5792815689905975, + "learning_rate": 4.83496100864502e-06, + "loss": 2.065, + "step": 10431 + }, + { + "epoch": 0.8, + "grad_norm": 0.5886922179406396, + "learning_rate": 4.831269239046851e-06, + "loss": 1.8552, + "step": 10432 + }, + { + "epoch": 0.8, + "grad_norm": 0.5836859869292756, + "learning_rate": 4.827578728665408e-06, + "loss": 1.8887, + "step": 10433 + }, + { + "epoch": 0.8, + "grad_norm": 0.5818632893246573, + "learning_rate": 4.8238894777311e-06, + "loss": 1.8484, + "step": 10434 + }, + { + "epoch": 0.81, + "grad_norm": 0.5230721166385053, + "learning_rate": 4.820201486474246e-06, + "loss": 1.9947, + "step": 10435 + }, + { + "epoch": 0.81, + "grad_norm": 0.5177893117169098, + "learning_rate": 4.816514755125126e-06, + "loss": 1.961, + "step": 10436 + }, + { + "epoch": 0.81, + "grad_norm": 0.5637150078055186, + "learning_rate": 4.8128292839139e-06, + "loss": 1.8317, + "step": 10437 + }, + { + "epoch": 0.81, + "grad_norm": 0.5691140861968128, + "learning_rate": 4.809145073070667e-06, + "loss": 1.8061, + "step": 10438 + }, + { + "epoch": 0.81, + "grad_norm": 0.5718479059451684, + "learning_rate": 4.805462122825463e-06, + "loss": 2.0331, + "step": 10439 + }, + { + "epoch": 0.81, + "grad_norm": 0.5818875503730755, + "learning_rate": 4.801780433408215e-06, + "loss": 1.9082, + "step": 10440 + }, + { + "epoch": 0.81, + "grad_norm": 0.599549074130894, + "learning_rate": 4.798100005048791e-06, + "loss": 1.8758, + "step": 10441 + }, + { + "epoch": 0.81, + "grad_norm": 0.5083303564810173, + "learning_rate": 4.7944208379769766e-06, + "loss": 1.9687, + "step": 10442 + }, + { + "epoch": 0.81, + "grad_norm": 0.5749447036694735, + "learning_rate": 4.790742932422468e-06, + "loss": 2.0679, + "step": 10443 + }, + { + "epoch": 0.81, + "grad_norm": 0.5390275564723571, + "learning_rate": 4.787066288614914e-06, + "loss": 1.8556, + "step": 10444 + }, + { + "epoch": 0.81, + "grad_norm": 0.5713121176561802, + "learning_rate": 4.783390906783841e-06, + "loss": 1.8419, + "step": 10445 + }, + { + "epoch": 0.81, + "grad_norm": 0.5581697193853977, + "learning_rate": 4.77971678715872e-06, + "loss": 1.8443, + "step": 10446 + }, + { + "epoch": 0.81, + "grad_norm": 0.6283991937112697, + "learning_rate": 4.77604392996896e-06, + "loss": 2.082, + "step": 10447 + }, + { + "epoch": 0.81, + "grad_norm": 0.5738822714978264, + "learning_rate": 4.772372335443851e-06, + "loss": 1.9904, + "step": 10448 + }, + { + "epoch": 0.81, + "grad_norm": 0.5563085336015282, + "learning_rate": 4.768702003812636e-06, + "loss": 1.8566, + "step": 10449 + }, + { + "epoch": 0.81, + "grad_norm": 0.5285736616919252, + "learning_rate": 4.765032935304467e-06, + "loss": 1.8808, + "step": 10450 + }, + { + "epoch": 0.81, + "grad_norm": 0.621729538054919, + "learning_rate": 4.761365130148418e-06, + "loss": 2.0317, + "step": 10451 + }, + { + "epoch": 0.81, + "grad_norm": 0.5617033943441008, + "learning_rate": 4.757698588573487e-06, + "loss": 1.8598, + "step": 10452 + }, + { + "epoch": 0.81, + "grad_norm": 0.5682788021599456, + "learning_rate": 4.754033310808589e-06, + "loss": 1.8582, + "step": 10453 + }, + { + "epoch": 0.81, + "grad_norm": 0.5658360035958425, + "learning_rate": 4.750369297082566e-06, + "loss": 1.8712, + "step": 10454 + }, + { + "epoch": 0.81, + "grad_norm": 0.5395946876281712, + "learning_rate": 4.746706547624174e-06, + "loss": 2.0595, + "step": 10455 + }, + { + "epoch": 0.81, + "grad_norm": 0.5644131661364578, + "learning_rate": 4.743045062662096e-06, + "loss": 1.877, + "step": 10456 + }, + { + "epoch": 0.81, + "grad_norm": 0.5340837464456463, + "learning_rate": 4.739384842424935e-06, + "loss": 1.848, + "step": 10457 + }, + { + "epoch": 0.81, + "grad_norm": 0.6144970831514104, + "learning_rate": 4.735725887141207e-06, + "loss": 1.8686, + "step": 10458 + }, + { + "epoch": 0.81, + "grad_norm": 0.5434290523420635, + "learning_rate": 4.7320681970393605e-06, + "loss": 2.0927, + "step": 10459 + }, + { + "epoch": 0.81, + "grad_norm": 0.5851139938577856, + "learning_rate": 4.72841177234776e-06, + "loss": 1.8149, + "step": 10460 + }, + { + "epoch": 0.81, + "grad_norm": 0.5356937969411537, + "learning_rate": 4.72475661329469e-06, + "loss": 1.8551, + "step": 10461 + }, + { + "epoch": 0.81, + "grad_norm": 0.5821718114167291, + "learning_rate": 4.721102720108356e-06, + "loss": 1.8609, + "step": 10462 + }, + { + "epoch": 0.81, + "grad_norm": 0.5778070883959584, + "learning_rate": 4.717450093016889e-06, + "loss": 2.0455, + "step": 10463 + }, + { + "epoch": 0.81, + "grad_norm": 0.545703023020577, + "learning_rate": 4.713798732248334e-06, + "loss": 1.8449, + "step": 10464 + }, + { + "epoch": 0.81, + "grad_norm": 0.5576551028034836, + "learning_rate": 4.710148638030665e-06, + "loss": 1.8547, + "step": 10465 + }, + { + "epoch": 0.81, + "grad_norm": 0.5751212877928249, + "learning_rate": 4.7064998105917666e-06, + "loss": 1.8656, + "step": 10466 + }, + { + "epoch": 0.81, + "grad_norm": 0.5718368178092039, + "learning_rate": 4.702852250159451e-06, + "loss": 2.1307, + "step": 10467 + }, + { + "epoch": 0.81, + "grad_norm": 0.5622218784145107, + "learning_rate": 4.699205956961455e-06, + "loss": 1.9162, + "step": 10468 + }, + { + "epoch": 0.81, + "grad_norm": 0.5472202939953829, + "learning_rate": 4.695560931225429e-06, + "loss": 1.83, + "step": 10469 + }, + { + "epoch": 0.81, + "grad_norm": 0.5559056806275278, + "learning_rate": 4.691917173178953e-06, + "loss": 1.8336, + "step": 10470 + }, + { + "epoch": 0.81, + "grad_norm": 0.5554318272060033, + "learning_rate": 4.688274683049501e-06, + "loss": 2.0599, + "step": 10471 + }, + { + "epoch": 0.81, + "grad_norm": 0.5631269649243916, + "learning_rate": 4.684633461064511e-06, + "loss": 1.8143, + "step": 10472 + }, + { + "epoch": 0.81, + "grad_norm": 0.5213349474067015, + "learning_rate": 4.680993507451317e-06, + "loss": 1.9708, + "step": 10473 + }, + { + "epoch": 0.81, + "grad_norm": 0.589386748457972, + "learning_rate": 4.677354822437161e-06, + "loss": 1.8629, + "step": 10474 + }, + { + "epoch": 0.81, + "grad_norm": 0.551180884404929, + "learning_rate": 4.673717406249239e-06, + "loss": 2.0454, + "step": 10475 + }, + { + "epoch": 0.81, + "grad_norm": 0.5598026545608378, + "learning_rate": 4.6700812591146335e-06, + "loss": 1.9079, + "step": 10476 + }, + { + "epoch": 0.81, + "grad_norm": 0.5759549581278354, + "learning_rate": 4.666446381260378e-06, + "loss": 1.8469, + "step": 10477 + }, + { + "epoch": 0.81, + "grad_norm": 0.5898736000653801, + "learning_rate": 4.6628127729134105e-06, + "loss": 1.8376, + "step": 10478 + }, + { + "epoch": 0.81, + "grad_norm": 0.5802252246443256, + "learning_rate": 4.659180434300581e-06, + "loss": 2.1074, + "step": 10479 + }, + { + "epoch": 0.81, + "grad_norm": 0.5552607861964837, + "learning_rate": 4.6555493656486815e-06, + "loss": 1.8412, + "step": 10480 + }, + { + "epoch": 0.81, + "grad_norm": 0.5390279090910323, + "learning_rate": 4.651919567184421e-06, + "loss": 1.8018, + "step": 10481 + }, + { + "epoch": 0.81, + "grad_norm": 0.5696023229054242, + "learning_rate": 4.648291039134397e-06, + "loss": 1.8407, + "step": 10482 + }, + { + "epoch": 0.81, + "grad_norm": 0.5878588761577364, + "learning_rate": 4.644663781725187e-06, + "loss": 2.077, + "step": 10483 + }, + { + "epoch": 0.81, + "grad_norm": 0.5595398778147734, + "learning_rate": 4.641037795183231e-06, + "loss": 1.8689, + "step": 10484 + }, + { + "epoch": 0.81, + "grad_norm": 0.5832328246729406, + "learning_rate": 4.6374130797349175e-06, + "loss": 1.7857, + "step": 10485 + }, + { + "epoch": 0.81, + "grad_norm": 0.6016599359661842, + "learning_rate": 4.63378963560657e-06, + "loss": 1.9533, + "step": 10486 + }, + { + "epoch": 0.81, + "grad_norm": 0.5699129715939523, + "learning_rate": 4.630167463024393e-06, + "loss": 2.0349, + "step": 10487 + }, + { + "epoch": 0.81, + "grad_norm": 0.5873996622885543, + "learning_rate": 4.626546562214546e-06, + "loss": 1.8121, + "step": 10488 + }, + { + "epoch": 0.81, + "grad_norm": 0.6251358619305442, + "learning_rate": 4.622926933403091e-06, + "loss": 1.8198, + "step": 10489 + }, + { + "epoch": 0.81, + "grad_norm": 0.6082653078986949, + "learning_rate": 4.619308576816012e-06, + "loss": 1.8298, + "step": 10490 + }, + { + "epoch": 0.81, + "grad_norm": 0.5633985137890262, + "learning_rate": 4.615691492679239e-06, + "loss": 2.0661, + "step": 10491 + }, + { + "epoch": 0.81, + "grad_norm": 0.5154932160367471, + "learning_rate": 4.612075681218578e-06, + "loss": 1.9036, + "step": 10492 + }, + { + "epoch": 0.81, + "grad_norm": 0.5992158380181931, + "learning_rate": 4.608461142659784e-06, + "loss": 1.8709, + "step": 10493 + }, + { + "epoch": 0.81, + "grad_norm": 0.5833879023377221, + "learning_rate": 4.604847877228543e-06, + "loss": 1.8265, + "step": 10494 + }, + { + "epoch": 0.81, + "grad_norm": 0.5421769690340095, + "learning_rate": 4.6012358851504285e-06, + "loss": 2.1087, + "step": 10495 + }, + { + "epoch": 0.81, + "grad_norm": 0.5889878707401719, + "learning_rate": 4.5976251666509565e-06, + "loss": 1.8288, + "step": 10496 + }, + { + "epoch": 0.81, + "grad_norm": 0.5672418726249308, + "learning_rate": 4.594015721955561e-06, + "loss": 1.9037, + "step": 10497 + }, + { + "epoch": 0.81, + "grad_norm": 0.5553774836258919, + "learning_rate": 4.5904075512895964e-06, + "loss": 1.9864, + "step": 10498 + }, + { + "epoch": 0.81, + "grad_norm": 0.5682713802690531, + "learning_rate": 4.5868006548783316e-06, + "loss": 2.0856, + "step": 10499 + }, + { + "epoch": 0.81, + "grad_norm": 0.579594831237363, + "learning_rate": 4.58319503294696e-06, + "loss": 1.9021, + "step": 10500 + }, + { + "epoch": 0.81, + "grad_norm": 0.5577872742786512, + "learning_rate": 4.579590685720597e-06, + "loss": 1.8465, + "step": 10501 + }, + { + "epoch": 0.81, + "grad_norm": 0.5811784312294996, + "learning_rate": 4.575987613424279e-06, + "loss": 1.8962, + "step": 10502 + }, + { + "epoch": 0.81, + "grad_norm": 0.563951559438157, + "learning_rate": 4.572385816282956e-06, + "loss": 2.0676, + "step": 10503 + }, + { + "epoch": 0.81, + "grad_norm": 0.5452488235748394, + "learning_rate": 4.568785294521505e-06, + "loss": 1.9255, + "step": 10504 + }, + { + "epoch": 0.81, + "grad_norm": 0.540224968107299, + "learning_rate": 4.565186048364722e-06, + "loss": 1.8362, + "step": 10505 + }, + { + "epoch": 0.81, + "grad_norm": 0.5661015197357013, + "learning_rate": 4.5615880780373235e-06, + "loss": 1.8333, + "step": 10506 + }, + { + "epoch": 0.81, + "grad_norm": 0.6071440240362667, + "learning_rate": 4.557991383763943e-06, + "loss": 2.067, + "step": 10507 + }, + { + "epoch": 0.81, + "grad_norm": 0.5530019723039468, + "learning_rate": 4.55439596576914e-06, + "loss": 1.8678, + "step": 10508 + }, + { + "epoch": 0.81, + "grad_norm": 0.5743756792914757, + "learning_rate": 4.55080182427739e-06, + "loss": 1.8947, + "step": 10509 + }, + { + "epoch": 0.81, + "grad_norm": 0.5965351939529518, + "learning_rate": 4.547208959513085e-06, + "loss": 1.9512, + "step": 10510 + }, + { + "epoch": 0.81, + "grad_norm": 0.5634071259324567, + "learning_rate": 4.543617371700551e-06, + "loss": 2.1034, + "step": 10511 + }, + { + "epoch": 0.81, + "grad_norm": 0.5429486144623774, + "learning_rate": 4.54002706106402e-06, + "loss": 1.8489, + "step": 10512 + }, + { + "epoch": 0.81, + "grad_norm": 0.6010379119475018, + "learning_rate": 4.536438027827652e-06, + "loss": 1.7957, + "step": 10513 + }, + { + "epoch": 0.81, + "grad_norm": 0.5582237640091071, + "learning_rate": 4.5328502722155244e-06, + "loss": 1.8773, + "step": 10514 + }, + { + "epoch": 0.81, + "grad_norm": 0.6214373048028072, + "learning_rate": 4.529263794451635e-06, + "loss": 2.0235, + "step": 10515 + }, + { + "epoch": 0.81, + "grad_norm": 0.5390223991927653, + "learning_rate": 4.525678594759903e-06, + "loss": 1.847, + "step": 10516 + }, + { + "epoch": 0.81, + "grad_norm": 0.5310958657562963, + "learning_rate": 4.5220946733641716e-06, + "loss": 1.9537, + "step": 10517 + }, + { + "epoch": 0.81, + "grad_norm": 0.6392939906769093, + "learning_rate": 4.518512030488184e-06, + "loss": 1.8614, + "step": 10518 + }, + { + "epoch": 0.81, + "grad_norm": 0.5624030554027067, + "learning_rate": 4.514930666355638e-06, + "loss": 2.0752, + "step": 10519 + }, + { + "epoch": 0.81, + "grad_norm": 0.6107495245275211, + "learning_rate": 4.51135058119013e-06, + "loss": 1.9063, + "step": 10520 + }, + { + "epoch": 0.81, + "grad_norm": 0.5958147860329133, + "learning_rate": 4.507771775215161e-06, + "loss": 1.8578, + "step": 10521 + }, + { + "epoch": 0.81, + "grad_norm": 0.6077220240074688, + "learning_rate": 4.504194248654198e-06, + "loss": 1.8405, + "step": 10522 + }, + { + "epoch": 0.81, + "grad_norm": 0.5892603901999128, + "learning_rate": 4.5006180017305745e-06, + "loss": 1.9627, + "step": 10523 + }, + { + "epoch": 0.81, + "grad_norm": 0.55879207097425, + "learning_rate": 4.497043034667589e-06, + "loss": 2.0468, + "step": 10524 + }, + { + "epoch": 0.81, + "grad_norm": 0.5759871833829338, + "learning_rate": 4.493469347688442e-06, + "loss": 1.8877, + "step": 10525 + }, + { + "epoch": 0.81, + "grad_norm": 0.5856719915663197, + "learning_rate": 4.489896941016233e-06, + "loss": 1.8726, + "step": 10526 + }, + { + "epoch": 0.81, + "grad_norm": 0.5959876767390332, + "learning_rate": 4.486325814874023e-06, + "loss": 2.0274, + "step": 10527 + }, + { + "epoch": 0.81, + "grad_norm": 0.5982217904457898, + "learning_rate": 4.48275596948477e-06, + "loss": 1.8946, + "step": 10528 + }, + { + "epoch": 0.81, + "grad_norm": 0.609111479823473, + "learning_rate": 4.4791874050713365e-06, + "loss": 1.9245, + "step": 10529 + }, + { + "epoch": 0.81, + "grad_norm": 0.5896932177228011, + "learning_rate": 4.475620121856547e-06, + "loss": 1.8745, + "step": 10530 + }, + { + "epoch": 0.81, + "grad_norm": 0.565351407453564, + "learning_rate": 4.472054120063104e-06, + "loss": 2.0897, + "step": 10531 + }, + { + "epoch": 0.81, + "grad_norm": 0.5373687706260235, + "learning_rate": 4.468489399913648e-06, + "loss": 1.8591, + "step": 10532 + }, + { + "epoch": 0.81, + "grad_norm": 0.6110272412522735, + "learning_rate": 4.464925961630753e-06, + "loss": 1.8503, + "step": 10533 + }, + { + "epoch": 0.81, + "grad_norm": 0.6148657184463854, + "learning_rate": 4.461363805436889e-06, + "loss": 1.8377, + "step": 10534 + }, + { + "epoch": 0.81, + "grad_norm": 0.5443570187087716, + "learning_rate": 4.457802931554453e-06, + "loss": 1.9492, + "step": 10535 + }, + { + "epoch": 0.81, + "grad_norm": 0.6184282258689743, + "learning_rate": 4.454243340205771e-06, + "loss": 2.0631, + "step": 10536 + }, + { + "epoch": 0.81, + "grad_norm": 0.572016278642415, + "learning_rate": 4.450685031613083e-06, + "loss": 1.8403, + "step": 10537 + }, + { + "epoch": 0.81, + "grad_norm": 0.5856567995080311, + "learning_rate": 4.447128005998547e-06, + "loss": 1.8818, + "step": 10538 + }, + { + "epoch": 0.81, + "grad_norm": 0.5454554922355379, + "learning_rate": 4.443572263584242e-06, + "loss": 2.0648, + "step": 10539 + }, + { + "epoch": 0.81, + "grad_norm": 0.5820548089774745, + "learning_rate": 4.440017804592161e-06, + "loss": 1.8422, + "step": 10540 + }, + { + "epoch": 0.81, + "grad_norm": 0.6047407887122879, + "learning_rate": 4.436464629244244e-06, + "loss": 1.9451, + "step": 10541 + }, + { + "epoch": 0.81, + "grad_norm": 0.6140644233269725, + "learning_rate": 4.432912737762312e-06, + "loss": 1.8327, + "step": 10542 + }, + { + "epoch": 0.81, + "grad_norm": 0.5568272860779881, + "learning_rate": 4.429362130368128e-06, + "loss": 2.0586, + "step": 10543 + }, + { + "epoch": 0.81, + "grad_norm": 0.5523858173090604, + "learning_rate": 4.4258128072833745e-06, + "loss": 1.9109, + "step": 10544 + }, + { + "epoch": 0.81, + "grad_norm": 0.5665358581469662, + "learning_rate": 4.422264768729648e-06, + "loss": 1.8393, + "step": 10545 + }, + { + "epoch": 0.81, + "grad_norm": 0.5656893694996717, + "learning_rate": 4.418718014928466e-06, + "loss": 1.8616, + "step": 10546 + }, + { + "epoch": 0.81, + "grad_norm": 0.5642854776490852, + "learning_rate": 4.41517254610127e-06, + "loss": 2.0375, + "step": 10547 + }, + { + "epoch": 0.81, + "grad_norm": 0.5690070827515783, + "learning_rate": 4.411628362469417e-06, + "loss": 1.9732, + "step": 10548 + }, + { + "epoch": 0.81, + "grad_norm": 0.564564307647766, + "learning_rate": 4.408085464254183e-06, + "loss": 1.8156, + "step": 10549 + }, + { + "epoch": 0.81, + "grad_norm": 0.5537797760222043, + "learning_rate": 4.404543851676768e-06, + "loss": 1.8613, + "step": 10550 + }, + { + "epoch": 0.81, + "grad_norm": 0.5445309394615668, + "learning_rate": 4.401003524958286e-06, + "loss": 2.0653, + "step": 10551 + }, + { + "epoch": 0.81, + "grad_norm": 0.5608439116844629, + "learning_rate": 4.39746448431978e-06, + "loss": 1.8558, + "step": 10552 + }, + { + "epoch": 0.81, + "grad_norm": 0.5891345587663599, + "learning_rate": 4.393926729982204e-06, + "loss": 1.8296, + "step": 10553 + }, + { + "epoch": 0.81, + "grad_norm": 0.5557733066866404, + "learning_rate": 4.39039026216643e-06, + "loss": 1.9472, + "step": 10554 + }, + { + "epoch": 0.81, + "grad_norm": 0.5431614137210012, + "learning_rate": 4.38685508109326e-06, + "loss": 1.8638, + "step": 10555 + }, + { + "epoch": 0.81, + "grad_norm": 0.6253208720823549, + "learning_rate": 4.383321186983411e-06, + "loss": 2.0757, + "step": 10556 + }, + { + "epoch": 0.81, + "grad_norm": 0.5606981555762648, + "learning_rate": 4.379788580057515e-06, + "loss": 1.8714, + "step": 10557 + }, + { + "epoch": 0.81, + "grad_norm": 0.5902668514626077, + "learning_rate": 4.3762572605361264e-06, + "loss": 1.9062, + "step": 10558 + }, + { + "epoch": 0.81, + "grad_norm": 0.5442211779161195, + "learning_rate": 4.37272722863972e-06, + "loss": 2.0565, + "step": 10559 + }, + { + "epoch": 0.81, + "grad_norm": 0.5428041391839626, + "learning_rate": 4.3691984845886934e-06, + "loss": 1.9226, + "step": 10560 + }, + { + "epoch": 0.81, + "grad_norm": 0.5543268175278592, + "learning_rate": 4.36567102860336e-06, + "loss": 1.8203, + "step": 10561 + }, + { + "epoch": 0.81, + "grad_norm": 0.5752307068894305, + "learning_rate": 4.3621448609039525e-06, + "loss": 1.8843, + "step": 10562 + }, + { + "epoch": 0.81, + "grad_norm": 0.5926579422871165, + "learning_rate": 4.358619981710621e-06, + "loss": 2.0152, + "step": 10563 + }, + { + "epoch": 0.81, + "grad_norm": 0.5792831571610885, + "learning_rate": 4.355096391243449e-06, + "loss": 1.8817, + "step": 10564 + }, + { + "epoch": 0.82, + "grad_norm": 0.5731650021611585, + "learning_rate": 4.351574089722408e-06, + "loss": 1.9217, + "step": 10565 + }, + { + "epoch": 0.82, + "grad_norm": 0.5156083002728282, + "learning_rate": 4.348053077367428e-06, + "loss": 1.9291, + "step": 10566 + }, + { + "epoch": 0.82, + "grad_norm": 0.6094471595873626, + "learning_rate": 4.344533354398342e-06, + "loss": 1.8604, + "step": 10567 + }, + { + "epoch": 0.82, + "grad_norm": 0.5379255466176427, + "learning_rate": 4.341014921034881e-06, + "loss": 2.0517, + "step": 10568 + }, + { + "epoch": 0.82, + "grad_norm": 0.598855412568698, + "learning_rate": 4.337497777496738e-06, + "loss": 1.8574, + "step": 10569 + }, + { + "epoch": 0.82, + "grad_norm": 0.5742304056283951, + "learning_rate": 4.3339819240034885e-06, + "loss": 1.8847, + "step": 10570 + }, + { + "epoch": 0.82, + "grad_norm": 0.5746026109802733, + "learning_rate": 4.33046736077464e-06, + "loss": 2.0352, + "step": 10571 + }, + { + "epoch": 0.82, + "grad_norm": 0.550613946682847, + "learning_rate": 4.326954088029636e-06, + "loss": 1.9641, + "step": 10572 + }, + { + "epoch": 0.82, + "grad_norm": 0.6041230013151062, + "learning_rate": 4.3234421059878045e-06, + "loss": 1.8474, + "step": 10573 + }, + { + "epoch": 0.82, + "grad_norm": 0.5893225723839458, + "learning_rate": 4.3199314148684304e-06, + "loss": 1.8612, + "step": 10574 + }, + { + "epoch": 0.82, + "grad_norm": 0.601748834173107, + "learning_rate": 4.3164220148907e-06, + "loss": 2.0793, + "step": 10575 + }, + { + "epoch": 0.82, + "grad_norm": 0.5681197801509726, + "learning_rate": 4.312913906273703e-06, + "loss": 1.8371, + "step": 10576 + }, + { + "epoch": 0.82, + "grad_norm": 0.5699461870048294, + "learning_rate": 4.309407089236486e-06, + "loss": 1.8715, + "step": 10577 + }, + { + "epoch": 0.82, + "grad_norm": 0.6395778649460857, + "learning_rate": 4.305901563997977e-06, + "loss": 1.8728, + "step": 10578 + }, + { + "epoch": 0.82, + "grad_norm": 0.5654437413902124, + "learning_rate": 4.302397330777039e-06, + "loss": 1.9314, + "step": 10579 + }, + { + "epoch": 0.82, + "grad_norm": 0.5878659707735111, + "learning_rate": 4.298894389792479e-06, + "loss": 2.0977, + "step": 10580 + }, + { + "epoch": 0.82, + "grad_norm": 0.6028337652154623, + "learning_rate": 4.295392741262977e-06, + "loss": 1.8658, + "step": 10581 + }, + { + "epoch": 0.82, + "grad_norm": 0.5625756455965717, + "learning_rate": 4.291892385407156e-06, + "loss": 1.8498, + "step": 10582 + }, + { + "epoch": 0.82, + "grad_norm": 0.631036590586062, + "learning_rate": 4.288393322443579e-06, + "loss": 2.0587, + "step": 10583 + }, + { + "epoch": 0.82, + "grad_norm": 0.6055564674823034, + "learning_rate": 4.2848955525906846e-06, + "loss": 1.8628, + "step": 10584 + }, + { + "epoch": 0.82, + "grad_norm": 0.6304497022301477, + "learning_rate": 4.281399076066861e-06, + "loss": 1.9385, + "step": 10585 + }, + { + "epoch": 0.82, + "grad_norm": 0.611662023560275, + "learning_rate": 4.277903893090407e-06, + "loss": 1.8918, + "step": 10586 + }, + { + "epoch": 0.82, + "grad_norm": 0.5944052306495856, + "learning_rate": 4.274410003879534e-06, + "loss": 1.859, + "step": 10587 + }, + { + "epoch": 0.82, + "grad_norm": 0.6610418643533845, + "learning_rate": 4.2709174086524e-06, + "loss": 2.0596, + "step": 10588 + }, + { + "epoch": 0.82, + "grad_norm": 0.6087664896515093, + "learning_rate": 4.267426107627046e-06, + "loss": 1.9043, + "step": 10589 + }, + { + "epoch": 0.82, + "grad_norm": 0.5550199618821093, + "learning_rate": 4.263936101021448e-06, + "loss": 1.8859, + "step": 10590 + }, + { + "epoch": 0.82, + "grad_norm": 0.5806254836161225, + "learning_rate": 4.260447389053507e-06, + "loss": 1.9557, + "step": 10591 + }, + { + "epoch": 0.82, + "grad_norm": 0.6157435715465363, + "learning_rate": 4.256959971941038e-06, + "loss": 2.0248, + "step": 10592 + }, + { + "epoch": 0.82, + "grad_norm": 0.5612962236954546, + "learning_rate": 4.253473849901768e-06, + "loss": 1.8265, + "step": 10593 + }, + { + "epoch": 0.82, + "grad_norm": 0.5925141038601712, + "learning_rate": 4.249989023153358e-06, + "loss": 1.852, + "step": 10594 + }, + { + "epoch": 0.82, + "grad_norm": 0.5740414024905831, + "learning_rate": 4.246505491913375e-06, + "loss": 2.0621, + "step": 10595 + }, + { + "epoch": 0.82, + "grad_norm": 0.6304525744570076, + "learning_rate": 4.243023256399312e-06, + "loss": 1.836, + "step": 10596 + }, + { + "epoch": 0.82, + "grad_norm": 0.5720337605326904, + "learning_rate": 4.2395423168285765e-06, + "loss": 1.9533, + "step": 10597 + }, + { + "epoch": 0.82, + "grad_norm": 0.6709423768403865, + "learning_rate": 4.236062673418503e-06, + "loss": 1.9178, + "step": 10598 + }, + { + "epoch": 0.82, + "grad_norm": 0.5946918594321177, + "learning_rate": 4.232584326386335e-06, + "loss": 1.8735, + "step": 10599 + }, + { + "epoch": 0.82, + "grad_norm": 0.5824161634228078, + "learning_rate": 4.229107275949245e-06, + "loss": 2.0783, + "step": 10600 + }, + { + "epoch": 0.82, + "grad_norm": 0.5697304777629438, + "learning_rate": 4.225631522324317e-06, + "loss": 1.8634, + "step": 10601 + }, + { + "epoch": 0.82, + "grad_norm": 0.5947652246841143, + "learning_rate": 4.222157065728552e-06, + "loss": 1.8837, + "step": 10602 + }, + { + "epoch": 0.82, + "grad_norm": 0.5709404250396083, + "learning_rate": 4.2186839063788895e-06, + "loss": 2.0695, + "step": 10603 + }, + { + "epoch": 0.82, + "grad_norm": 0.5113522847744275, + "learning_rate": 4.215212044492148e-06, + "loss": 1.9255, + "step": 10604 + }, + { + "epoch": 0.82, + "grad_norm": 0.5568738428940869, + "learning_rate": 4.211741480285112e-06, + "loss": 1.8672, + "step": 10605 + }, + { + "epoch": 0.82, + "grad_norm": 0.5475215485222347, + "learning_rate": 4.208272213974454e-06, + "loss": 1.865, + "step": 10606 + }, + { + "epoch": 0.82, + "grad_norm": 0.5508275450451428, + "learning_rate": 4.2048042457767775e-06, + "loss": 2.0345, + "step": 10607 + }, + { + "epoch": 0.82, + "grad_norm": 0.6170877164647876, + "learning_rate": 4.201337575908601e-06, + "loss": 1.8203, + "step": 10608 + }, + { + "epoch": 0.82, + "grad_norm": 0.543294493269657, + "learning_rate": 4.1978722045863616e-06, + "loss": 1.8665, + "step": 10609 + }, + { + "epoch": 0.82, + "grad_norm": 0.5486194732218859, + "learning_rate": 4.1944081320264176e-06, + "loss": 1.9353, + "step": 10610 + }, + { + "epoch": 0.82, + "grad_norm": 0.6080563575543125, + "learning_rate": 4.190945358445053e-06, + "loss": 1.9004, + "step": 10611 + }, + { + "epoch": 0.82, + "grad_norm": 0.5832549123247058, + "learning_rate": 4.187483884058441e-06, + "loss": 2.0686, + "step": 10612 + }, + { + "epoch": 0.82, + "grad_norm": 0.560764645460689, + "learning_rate": 4.184023709082718e-06, + "loss": 1.8479, + "step": 10613 + }, + { + "epoch": 0.82, + "grad_norm": 0.5765711989157728, + "learning_rate": 4.1805648337339135e-06, + "loss": 1.8794, + "step": 10614 + }, + { + "epoch": 0.82, + "grad_norm": 0.5644684676203171, + "learning_rate": 4.177107258227964e-06, + "loss": 2.0754, + "step": 10615 + }, + { + "epoch": 0.82, + "grad_norm": 0.5636783934960637, + "learning_rate": 4.1736509827807634e-06, + "loss": 1.9728, + "step": 10616 + }, + { + "epoch": 0.82, + "grad_norm": 0.5632071127042587, + "learning_rate": 4.170196007608082e-06, + "loss": 1.8444, + "step": 10617 + }, + { + "epoch": 0.82, + "grad_norm": 0.5614474201714444, + "learning_rate": 4.16674233292563e-06, + "loss": 1.7933, + "step": 10618 + }, + { + "epoch": 0.82, + "grad_norm": 0.5696355733475515, + "learning_rate": 4.1632899589490485e-06, + "loss": 1.8119, + "step": 10619 + }, + { + "epoch": 0.82, + "grad_norm": 0.5820098392482016, + "learning_rate": 4.159838885893866e-06, + "loss": 2.0465, + "step": 10620 + }, + { + "epoch": 0.82, + "grad_norm": 0.541708654382157, + "learning_rate": 4.156389113975559e-06, + "loss": 1.8205, + "step": 10621 + }, + { + "epoch": 0.82, + "grad_norm": 0.5089678322528497, + "learning_rate": 4.152940643409514e-06, + "loss": 1.9418, + "step": 10622 + }, + { + "epoch": 0.82, + "grad_norm": 0.5644980771877202, + "learning_rate": 4.149493474411012e-06, + "loss": 1.8693, + "step": 10623 + }, + { + "epoch": 0.82, + "grad_norm": 0.5887673251328183, + "learning_rate": 4.146047607195302e-06, + "loss": 2.0303, + "step": 10624 + }, + { + "epoch": 0.82, + "grad_norm": 0.5873937234005123, + "learning_rate": 4.142603041977505e-06, + "loss": 1.8806, + "step": 10625 + }, + { + "epoch": 0.82, + "grad_norm": 0.5542495182549052, + "learning_rate": 4.139159778972679e-06, + "loss": 1.8346, + "step": 10626 + }, + { + "epoch": 0.82, + "grad_norm": 0.6154343886627305, + "learning_rate": 4.135717818395818e-06, + "loss": 2.073, + "step": 10627 + }, + { + "epoch": 0.82, + "grad_norm": 0.5116603399246111, + "learning_rate": 4.1322771604618e-06, + "loss": 1.9651, + "step": 10628 + }, + { + "epoch": 0.82, + "grad_norm": 0.5724575021791194, + "learning_rate": 4.128837805385438e-06, + "loss": 1.8553, + "step": 10629 + }, + { + "epoch": 0.82, + "grad_norm": 0.6022243427740438, + "learning_rate": 4.125399753381487e-06, + "loss": 1.8254, + "step": 10630 + }, + { + "epoch": 0.82, + "grad_norm": 0.5612549249224116, + "learning_rate": 4.121963004664578e-06, + "loss": 1.8825, + "step": 10631 + }, + { + "epoch": 0.82, + "grad_norm": 0.5802257401342035, + "learning_rate": 4.118527559449287e-06, + "loss": 2.0873, + "step": 10632 + }, + { + "epoch": 0.82, + "grad_norm": 0.5841035115219498, + "learning_rate": 4.115093417950102e-06, + "loss": 1.8762, + "step": 10633 + }, + { + "epoch": 0.82, + "grad_norm": 0.575701519642263, + "learning_rate": 4.111660580381435e-06, + "loss": 1.802, + "step": 10634 + }, + { + "epoch": 0.82, + "grad_norm": 0.5328275065702764, + "learning_rate": 4.108229046957609e-06, + "loss": 1.9027, + "step": 10635 + }, + { + "epoch": 0.82, + "grad_norm": 0.5591962355437892, + "learning_rate": 4.104798817892869e-06, + "loss": 2.041, + "step": 10636 + }, + { + "epoch": 0.82, + "grad_norm": 0.6006509975482716, + "learning_rate": 4.10136989340138e-06, + "loss": 1.9045, + "step": 10637 + }, + { + "epoch": 0.82, + "grad_norm": 0.5700619826788629, + "learning_rate": 4.09794227369722e-06, + "loss": 1.8795, + "step": 10638 + }, + { + "epoch": 0.82, + "grad_norm": 0.5570069764791588, + "learning_rate": 4.094515958994391e-06, + "loss": 1.8577, + "step": 10639 + }, + { + "epoch": 0.82, + "grad_norm": 0.5798105674632692, + "learning_rate": 4.091090949506815e-06, + "loss": 1.9932, + "step": 10640 + }, + { + "epoch": 0.82, + "grad_norm": 0.5382403147617756, + "learning_rate": 4.087667245448326e-06, + "loss": 1.9242, + "step": 10641 + }, + { + "epoch": 0.82, + "grad_norm": 0.5450971594536743, + "learning_rate": 4.084244847032678e-06, + "loss": 1.8647, + "step": 10642 + }, + { + "epoch": 0.82, + "grad_norm": 0.6067586082974338, + "learning_rate": 4.080823754473554e-06, + "loss": 1.8473, + "step": 10643 + }, + { + "epoch": 0.82, + "grad_norm": 0.6361510777469066, + "learning_rate": 4.077403967984536e-06, + "loss": 2.0681, + "step": 10644 + }, + { + "epoch": 0.82, + "grad_norm": 0.6102597959052373, + "learning_rate": 4.073985487779141e-06, + "loss": 1.8577, + "step": 10645 + }, + { + "epoch": 0.82, + "grad_norm": 0.5634807330602346, + "learning_rate": 4.070568314070802e-06, + "loss": 1.8892, + "step": 10646 + }, + { + "epoch": 0.82, + "grad_norm": 0.5483968184630161, + "learning_rate": 4.0671524470728615e-06, + "loss": 1.8988, + "step": 10647 + }, + { + "epoch": 0.82, + "grad_norm": 0.6037163714275811, + "learning_rate": 4.063737886998587e-06, + "loss": 2.0665, + "step": 10648 + }, + { + "epoch": 0.82, + "grad_norm": 0.5803500412220749, + "learning_rate": 4.060324634061166e-06, + "loss": 1.8924, + "step": 10649 + }, + { + "epoch": 0.82, + "grad_norm": 0.6439519683100701, + "learning_rate": 4.056912688473704e-06, + "loss": 1.9049, + "step": 10650 + }, + { + "epoch": 0.82, + "grad_norm": 0.6001678568741484, + "learning_rate": 4.053502050449209e-06, + "loss": 1.8642, + "step": 10651 + }, + { + "epoch": 0.82, + "grad_norm": 0.5898285140265045, + "learning_rate": 4.050092720200638e-06, + "loss": 2.0217, + "step": 10652 + }, + { + "epoch": 0.82, + "grad_norm": 0.5286710434074449, + "learning_rate": 4.046684697940847e-06, + "loss": 1.8705, + "step": 10653 + }, + { + "epoch": 0.82, + "grad_norm": 0.5460732152789862, + "learning_rate": 4.043277983882598e-06, + "loss": 1.8931, + "step": 10654 + }, + { + "epoch": 0.82, + "grad_norm": 0.5559422028337601, + "learning_rate": 4.039872578238602e-06, + "loss": 1.8296, + "step": 10655 + }, + { + "epoch": 0.82, + "grad_norm": 0.6630854926405113, + "learning_rate": 4.03646848122147e-06, + "loss": 2.0648, + "step": 10656 + }, + { + "epoch": 0.82, + "grad_norm": 0.5929550745571056, + "learning_rate": 4.033065693043728e-06, + "loss": 1.8743, + "step": 10657 + }, + { + "epoch": 0.82, + "grad_norm": 0.5892262969771478, + "learning_rate": 4.029664213917836e-06, + "loss": 1.8921, + "step": 10658 + }, + { + "epoch": 0.82, + "grad_norm": 0.5296473977046164, + "learning_rate": 4.026264044056144e-06, + "loss": 1.9301, + "step": 10659 + }, + { + "epoch": 0.82, + "grad_norm": 0.5276090018150382, + "learning_rate": 4.022865183670954e-06, + "loss": 2.0714, + "step": 10660 + }, + { + "epoch": 0.82, + "grad_norm": 0.5709064409267968, + "learning_rate": 4.019467632974475e-06, + "loss": 1.829, + "step": 10661 + }, + { + "epoch": 0.82, + "grad_norm": 0.608692827250539, + "learning_rate": 4.016071392178808e-06, + "loss": 1.8517, + "step": 10662 + }, + { + "epoch": 0.82, + "grad_norm": 0.6022284470369109, + "learning_rate": 4.012676461496015e-06, + "loss": 1.8832, + "step": 10663 + }, + { + "epoch": 0.82, + "grad_norm": 0.6088131575344443, + "learning_rate": 4.009282841138057e-06, + "loss": 2.1101, + "step": 10664 + }, + { + "epoch": 0.82, + "grad_norm": 0.5970617794871753, + "learning_rate": 4.005890531316789e-06, + "loss": 1.8285, + "step": 10665 + }, + { + "epoch": 0.82, + "grad_norm": 0.5392640046202631, + "learning_rate": 4.002499532244033e-06, + "loss": 1.9196, + "step": 10666 + }, + { + "epoch": 0.82, + "grad_norm": 0.538572824491611, + "learning_rate": 3.999109844131485e-06, + "loss": 1.8297, + "step": 10667 + }, + { + "epoch": 0.82, + "grad_norm": 0.6138075043850049, + "learning_rate": 3.995721467190777e-06, + "loss": 2.041, + "step": 10668 + }, + { + "epoch": 0.82, + "grad_norm": 0.571605445608972, + "learning_rate": 3.992334401633477e-06, + "loss": 1.8457, + "step": 10669 + }, + { + "epoch": 0.82, + "grad_norm": 0.6403212187680287, + "learning_rate": 3.988948647671031e-06, + "loss": 1.8729, + "step": 10670 + }, + { + "epoch": 0.82, + "grad_norm": 0.565269087068171, + "learning_rate": 3.9855642055148496e-06, + "loss": 1.8737, + "step": 10671 + }, + { + "epoch": 0.82, + "grad_norm": 0.537088129794888, + "learning_rate": 3.982181075376218e-06, + "loss": 2.0976, + "step": 10672 + }, + { + "epoch": 0.82, + "grad_norm": 0.5805221163471836, + "learning_rate": 3.978799257466356e-06, + "loss": 1.843, + "step": 10673 + }, + { + "epoch": 0.82, + "grad_norm": 0.5966064014751619, + "learning_rate": 3.975418751996429e-06, + "loss": 1.8961, + "step": 10674 + }, + { + "epoch": 0.82, + "grad_norm": 0.6045539483690441, + "learning_rate": 3.972039559177473e-06, + "loss": 1.9182, + "step": 10675 + }, + { + "epoch": 0.82, + "grad_norm": 0.625646862038081, + "learning_rate": 3.968661679220468e-06, + "loss": 2.0668, + "step": 10676 + }, + { + "epoch": 0.82, + "grad_norm": 0.6233525712289661, + "learning_rate": 3.965285112336326e-06, + "loss": 1.8189, + "step": 10677 + }, + { + "epoch": 0.82, + "grad_norm": 0.5390807768627728, + "learning_rate": 3.961909858735838e-06, + "loss": 1.9383, + "step": 10678 + }, + { + "epoch": 0.82, + "grad_norm": 0.5727035669787839, + "learning_rate": 3.958535918629744e-06, + "loss": 1.8585, + "step": 10679 + }, + { + "epoch": 0.82, + "grad_norm": 0.5707310984824652, + "learning_rate": 3.955163292228695e-06, + "loss": 2.0756, + "step": 10680 + }, + { + "epoch": 0.82, + "grad_norm": 0.5608016359449933, + "learning_rate": 3.951791979743255e-06, + "loss": 1.8043, + "step": 10681 + }, + { + "epoch": 0.82, + "grad_norm": 0.6299898186736491, + "learning_rate": 3.948421981383913e-06, + "loss": 1.8583, + "step": 10682 + }, + { + "epoch": 0.82, + "grad_norm": 0.6301800931983205, + "learning_rate": 3.945053297361065e-06, + "loss": 1.8624, + "step": 10683 + }, + { + "epoch": 0.82, + "grad_norm": 0.5194553934447379, + "learning_rate": 3.941685927885036e-06, + "loss": 2.0464, + "step": 10684 + }, + { + "epoch": 0.82, + "grad_norm": 0.5795581959263472, + "learning_rate": 3.9383198731660655e-06, + "loss": 1.8038, + "step": 10685 + }, + { + "epoch": 0.82, + "grad_norm": 0.6058273957992596, + "learning_rate": 3.934955133414309e-06, + "loss": 1.8635, + "step": 10686 + }, + { + "epoch": 0.82, + "grad_norm": 0.6143540863082474, + "learning_rate": 3.9315917088398405e-06, + "loss": 1.8206, + "step": 10687 + }, + { + "epoch": 0.82, + "grad_norm": 0.5691748800863341, + "learning_rate": 3.928229599652653e-06, + "loss": 2.0207, + "step": 10688 + }, + { + "epoch": 0.82, + "grad_norm": 0.5670368091645882, + "learning_rate": 3.924868806062657e-06, + "loss": 1.8942, + "step": 10689 + }, + { + "epoch": 0.82, + "grad_norm": 0.526738411069201, + "learning_rate": 3.92150932827968e-06, + "loss": 1.9671, + "step": 10690 + }, + { + "epoch": 0.82, + "grad_norm": 0.5419907575417912, + "learning_rate": 3.918151166513465e-06, + "loss": 1.8159, + "step": 10691 + }, + { + "epoch": 0.82, + "grad_norm": 0.5598445857364877, + "learning_rate": 3.914794320973683e-06, + "loss": 2.0529, + "step": 10692 + }, + { + "epoch": 0.82, + "grad_norm": 0.6126120280339779, + "learning_rate": 3.911438791869909e-06, + "loss": 1.8707, + "step": 10693 + }, + { + "epoch": 0.83, + "grad_norm": 0.5383020458193987, + "learning_rate": 3.908084579411647e-06, + "loss": 1.899, + "step": 10694 + }, + { + "epoch": 0.83, + "grad_norm": 0.563856598439715, + "learning_rate": 3.904731683808313e-06, + "loss": 1.8763, + "step": 10695 + }, + { + "epoch": 0.83, + "grad_norm": 0.5456713368450398, + "learning_rate": 3.901380105269242e-06, + "loss": 2.0243, + "step": 10696 + }, + { + "epoch": 0.83, + "grad_norm": 0.5269004386001627, + "learning_rate": 3.898029844003684e-06, + "loss": 1.9599, + "step": 10697 + }, + { + "epoch": 0.83, + "grad_norm": 0.6062287795023077, + "learning_rate": 3.894680900220812e-06, + "loss": 1.8332, + "step": 10698 + }, + { + "epoch": 0.83, + "grad_norm": 0.6244027539377481, + "learning_rate": 3.891333274129716e-06, + "loss": 1.858, + "step": 10699 + }, + { + "epoch": 0.83, + "grad_norm": 0.567995497445232, + "learning_rate": 3.887986965939405e-06, + "loss": 2.0448, + "step": 10700 + }, + { + "epoch": 0.83, + "grad_norm": 0.5861379639235234, + "learning_rate": 3.8846419758587884e-06, + "loss": 1.8497, + "step": 10701 + }, + { + "epoch": 0.83, + "grad_norm": 0.5934809415454316, + "learning_rate": 3.8812983040967195e-06, + "loss": 1.852, + "step": 10702 + }, + { + "epoch": 0.83, + "grad_norm": 0.5191989313707627, + "learning_rate": 3.877955950861958e-06, + "loss": 1.9266, + "step": 10703 + }, + { + "epoch": 0.83, + "grad_norm": 0.5363355264282338, + "learning_rate": 3.8746149163631785e-06, + "loss": 2.0707, + "step": 10704 + }, + { + "epoch": 0.83, + "grad_norm": 0.5832172674021526, + "learning_rate": 3.8712752008089795e-06, + "loss": 1.843, + "step": 10705 + }, + { + "epoch": 0.83, + "grad_norm": 0.5687438601743268, + "learning_rate": 3.8679368044078565e-06, + "loss": 1.8778, + "step": 10706 + }, + { + "epoch": 0.83, + "grad_norm": 0.5837821123838463, + "learning_rate": 3.864599727368257e-06, + "loss": 1.8834, + "step": 10707 + }, + { + "epoch": 0.83, + "grad_norm": 0.5449700873798506, + "learning_rate": 3.861263969898529e-06, + "loss": 2.0545, + "step": 10708 + }, + { + "epoch": 0.83, + "grad_norm": 0.531165510768281, + "learning_rate": 3.857929532206922e-06, + "loss": 1.8991, + "step": 10709 + }, + { + "epoch": 0.83, + "grad_norm": 0.5958949169788778, + "learning_rate": 3.854596414501633e-06, + "loss": 1.8997, + "step": 10710 + }, + { + "epoch": 0.83, + "grad_norm": 0.6351689814939461, + "learning_rate": 3.851264616990763e-06, + "loss": 1.8516, + "step": 10711 + }, + { + "epoch": 0.83, + "grad_norm": 0.5573929354426366, + "learning_rate": 3.847934139882311e-06, + "loss": 2.1147, + "step": 10712 + }, + { + "epoch": 0.83, + "grad_norm": 0.5503276909580921, + "learning_rate": 3.8446049833842366e-06, + "loss": 1.9063, + "step": 10713 + }, + { + "epoch": 0.83, + "grad_norm": 0.5894230886448758, + "learning_rate": 3.841277147704378e-06, + "loss": 1.8588, + "step": 10714 + }, + { + "epoch": 0.83, + "grad_norm": 0.6304056089298457, + "learning_rate": 3.837950633050505e-06, + "loss": 1.9215, + "step": 10715 + }, + { + "epoch": 0.83, + "grad_norm": 0.5722506324458648, + "learning_rate": 3.834625439630318e-06, + "loss": 2.0479, + "step": 10716 + }, + { + "epoch": 0.83, + "grad_norm": 0.5665467464368783, + "learning_rate": 3.83130156765141e-06, + "loss": 1.8004, + "step": 10717 + }, + { + "epoch": 0.83, + "grad_norm": 0.5643343684276644, + "learning_rate": 3.827979017321309e-06, + "loss": 1.8476, + "step": 10718 + }, + { + "epoch": 0.83, + "grad_norm": 0.6208498023858295, + "learning_rate": 3.824657788847452e-06, + "loss": 1.9089, + "step": 10719 + }, + { + "epoch": 0.83, + "grad_norm": 0.5602600700380348, + "learning_rate": 3.821337882437198e-06, + "loss": 2.0319, + "step": 10720 + }, + { + "epoch": 0.83, + "grad_norm": 0.5103762411744561, + "learning_rate": 3.818019298297834e-06, + "loss": 1.9146, + "step": 10721 + }, + { + "epoch": 0.83, + "grad_norm": 0.6206672104740065, + "learning_rate": 3.814702036636536e-06, + "loss": 1.8843, + "step": 10722 + }, + { + "epoch": 0.83, + "grad_norm": 0.5904050295468254, + "learning_rate": 3.811386097660416e-06, + "loss": 1.8494, + "step": 10723 + }, + { + "epoch": 0.83, + "grad_norm": 0.5897503173551677, + "learning_rate": 3.8080714815765196e-06, + "loss": 2.0914, + "step": 10724 + }, + { + "epoch": 0.83, + "grad_norm": 0.570551204299024, + "learning_rate": 3.8047581885917727e-06, + "loss": 1.8968, + "step": 10725 + }, + { + "epoch": 0.83, + "grad_norm": 0.578588601196136, + "learning_rate": 3.8014462189130434e-06, + "loss": 1.8782, + "step": 10726 + }, + { + "epoch": 0.83, + "grad_norm": 0.5738960214784489, + "learning_rate": 3.7981355727471135e-06, + "loss": 1.8017, + "step": 10727 + }, + { + "epoch": 0.83, + "grad_norm": 0.5574976927972151, + "learning_rate": 3.7948262503006787e-06, + "loss": 2.1131, + "step": 10728 + }, + { + "epoch": 0.83, + "grad_norm": 0.542004617031652, + "learning_rate": 3.7915182517803544e-06, + "loss": 1.8234, + "step": 10729 + }, + { + "epoch": 0.83, + "grad_norm": 0.5699865695084094, + "learning_rate": 3.7882115773926726e-06, + "loss": 1.8617, + "step": 10730 + }, + { + "epoch": 0.83, + "grad_norm": 0.573859240587759, + "learning_rate": 3.7849062273440815e-06, + "loss": 1.7971, + "step": 10731 + }, + { + "epoch": 0.83, + "grad_norm": 0.5536900172434109, + "learning_rate": 3.78160220184095e-06, + "loss": 2.0503, + "step": 10732 + }, + { + "epoch": 0.83, + "grad_norm": 0.5766615933089808, + "learning_rate": 3.7782995010895607e-06, + "loss": 1.8866, + "step": 10733 + }, + { + "epoch": 0.83, + "grad_norm": 0.5572580468141878, + "learning_rate": 3.7749981252961114e-06, + "loss": 1.9125, + "step": 10734 + }, + { + "epoch": 0.83, + "grad_norm": 0.6396995862758699, + "learning_rate": 3.771698074666727e-06, + "loss": 1.8355, + "step": 10735 + }, + { + "epoch": 0.83, + "grad_norm": 0.5987649765962956, + "learning_rate": 3.7683993494074367e-06, + "loss": 2.0659, + "step": 10736 + }, + { + "epoch": 0.83, + "grad_norm": 0.5787813025254561, + "learning_rate": 3.765101949724198e-06, + "loss": 1.8796, + "step": 10737 + }, + { + "epoch": 0.83, + "grad_norm": 0.6123504960491841, + "learning_rate": 3.7618058758228798e-06, + "loss": 1.8602, + "step": 10738 + }, + { + "epoch": 0.83, + "grad_norm": 0.6639818734042311, + "learning_rate": 3.75851112790927e-06, + "loss": 1.8905, + "step": 10739 + }, + { + "epoch": 0.83, + "grad_norm": 0.554442448707175, + "learning_rate": 3.7552177061890736e-06, + "loss": 1.9199, + "step": 10740 + }, + { + "epoch": 0.83, + "grad_norm": 0.544240338617265, + "learning_rate": 3.7519256108679097e-06, + "loss": 2.0012, + "step": 10741 + }, + { + "epoch": 0.83, + "grad_norm": 0.6049756624723479, + "learning_rate": 3.748634842151319e-06, + "loss": 1.8389, + "step": 10742 + }, + { + "epoch": 0.83, + "grad_norm": 0.5952696472087444, + "learning_rate": 3.74534540024476e-06, + "loss": 1.8304, + "step": 10743 + }, + { + "epoch": 0.83, + "grad_norm": 0.538891829038343, + "learning_rate": 3.7420572853536008e-06, + "loss": 2.0192, + "step": 10744 + }, + { + "epoch": 0.83, + "grad_norm": 0.5593710980442993, + "learning_rate": 3.738770497683136e-06, + "loss": 1.8495, + "step": 10745 + }, + { + "epoch": 0.83, + "grad_norm": 0.5977112551711441, + "learning_rate": 3.735485037438574e-06, + "loss": 1.9231, + "step": 10746 + }, + { + "epoch": 0.83, + "grad_norm": 0.5894606584638741, + "learning_rate": 3.732200904825042e-06, + "loss": 1.851, + "step": 10747 + }, + { + "epoch": 0.83, + "grad_norm": 0.5791515076342623, + "learning_rate": 3.728918100047568e-06, + "loss": 2.0371, + "step": 10748 + }, + { + "epoch": 0.83, + "grad_norm": 0.5619769957405876, + "learning_rate": 3.7256366233111266e-06, + "loss": 1.9034, + "step": 10749 + }, + { + "epoch": 0.83, + "grad_norm": 0.5988008336224916, + "learning_rate": 3.722356474820593e-06, + "loss": 1.8153, + "step": 10750 + }, + { + "epoch": 0.83, + "grad_norm": 0.6287153649341175, + "learning_rate": 3.7190776547807447e-06, + "loss": 1.8467, + "step": 10751 + }, + { + "epoch": 0.83, + "grad_norm": 0.5754778825571893, + "learning_rate": 3.715800163396313e-06, + "loss": 2.0103, + "step": 10752 + }, + { + "epoch": 0.83, + "grad_norm": 0.5344265867023649, + "learning_rate": 3.712524000871903e-06, + "loss": 1.9268, + "step": 10753 + }, + { + "epoch": 0.83, + "grad_norm": 0.5389847035852853, + "learning_rate": 3.7092491674120766e-06, + "loss": 1.8182, + "step": 10754 + }, + { + "epoch": 0.83, + "grad_norm": 0.5807825516462479, + "learning_rate": 3.705975663221295e-06, + "loss": 1.8956, + "step": 10755 + }, + { + "epoch": 0.83, + "grad_norm": 0.6285476000003524, + "learning_rate": 3.7027034885039224e-06, + "loss": 2.0265, + "step": 10756 + }, + { + "epoch": 0.83, + "grad_norm": 0.5684692427237742, + "learning_rate": 3.699432643464265e-06, + "loss": 1.8601, + "step": 10757 + }, + { + "epoch": 0.83, + "grad_norm": 0.5688725012336092, + "learning_rate": 3.6961631283065374e-06, + "loss": 1.8645, + "step": 10758 + }, + { + "epoch": 0.83, + "grad_norm": 0.5695517873279012, + "learning_rate": 3.692894943234856e-06, + "loss": 1.9329, + "step": 10759 + }, + { + "epoch": 0.83, + "grad_norm": 0.5809810119288172, + "learning_rate": 3.6896280884532842e-06, + "loss": 2.0728, + "step": 10760 + }, + { + "epoch": 0.83, + "grad_norm": 0.5656820586016746, + "learning_rate": 3.6863625641657712e-06, + "loss": 1.8249, + "step": 10761 + }, + { + "epoch": 0.83, + "grad_norm": 0.5583207574363943, + "learning_rate": 3.683098370576196e-06, + "loss": 1.8458, + "step": 10762 + }, + { + "epoch": 0.83, + "grad_norm": 0.5400000532881264, + "learning_rate": 3.679835507888371e-06, + "loss": 1.861, + "step": 10763 + }, + { + "epoch": 0.83, + "grad_norm": 0.5895824726573278, + "learning_rate": 3.676573976305997e-06, + "loss": 2.0258, + "step": 10764 + }, + { + "epoch": 0.83, + "grad_norm": 0.5540433994483275, + "learning_rate": 3.6733137760327108e-06, + "loss": 1.962, + "step": 10765 + }, + { + "epoch": 0.83, + "grad_norm": 0.5439044586910192, + "learning_rate": 3.6700549072720554e-06, + "loss": 1.8099, + "step": 10766 + }, + { + "epoch": 0.83, + "grad_norm": 0.6100317532157468, + "learning_rate": 3.66679737022749e-06, + "loss": 1.8957, + "step": 10767 + }, + { + "epoch": 0.83, + "grad_norm": 0.5532694667341393, + "learning_rate": 3.663541165102419e-06, + "loss": 2.0533, + "step": 10768 + }, + { + "epoch": 0.83, + "grad_norm": 0.5763894349661339, + "learning_rate": 3.6602862921001178e-06, + "loss": 1.8437, + "step": 10769 + }, + { + "epoch": 0.83, + "grad_norm": 0.5574496609973877, + "learning_rate": 3.657032751423803e-06, + "loss": 1.8427, + "step": 10770 + }, + { + "epoch": 0.83, + "grad_norm": 0.5366708548972129, + "learning_rate": 3.6537805432766248e-06, + "loss": 1.9604, + "step": 10771 + }, + { + "epoch": 0.83, + "grad_norm": 0.5461627965779805, + "learning_rate": 3.650529667861613e-06, + "loss": 1.8717, + "step": 10772 + }, + { + "epoch": 0.83, + "grad_norm": 0.5939330798584653, + "learning_rate": 3.6472801253817412e-06, + "loss": 2.0636, + "step": 10773 + }, + { + "epoch": 0.83, + "grad_norm": 0.5828431606194739, + "learning_rate": 3.644031916039889e-06, + "loss": 1.8655, + "step": 10774 + }, + { + "epoch": 0.83, + "grad_norm": 0.5394274105516511, + "learning_rate": 3.6407850400388575e-06, + "loss": 1.8615, + "step": 10775 + }, + { + "epoch": 0.83, + "grad_norm": 0.5717235089222511, + "learning_rate": 3.637539497581363e-06, + "loss": 2.0144, + "step": 10776 + }, + { + "epoch": 0.83, + "grad_norm": 0.5220374457371912, + "learning_rate": 3.634295288870035e-06, + "loss": 1.9098, + "step": 10777 + }, + { + "epoch": 0.83, + "grad_norm": 0.5824677543095469, + "learning_rate": 3.6310524141074248e-06, + "loss": 1.8224, + "step": 10778 + }, + { + "epoch": 0.83, + "grad_norm": 0.5517912304144045, + "learning_rate": 3.6278108734959985e-06, + "loss": 1.8404, + "step": 10779 + }, + { + "epoch": 0.83, + "grad_norm": 0.5553743721089511, + "learning_rate": 3.624570667238139e-06, + "loss": 2.059, + "step": 10780 + }, + { + "epoch": 0.83, + "grad_norm": 0.5476866306798203, + "learning_rate": 3.6213317955361454e-06, + "loss": 1.878, + "step": 10781 + }, + { + "epoch": 0.83, + "grad_norm": 0.5596477394266737, + "learning_rate": 3.6180942585922334e-06, + "loss": 1.8362, + "step": 10782 + }, + { + "epoch": 0.83, + "grad_norm": 0.5437451009748437, + "learning_rate": 3.614858056608536e-06, + "loss": 1.825, + "step": 10783 + }, + { + "epoch": 0.83, + "grad_norm": 0.5873150764828959, + "learning_rate": 3.611623189787103e-06, + "loss": 1.928, + "step": 10784 + }, + { + "epoch": 0.83, + "grad_norm": 0.5337628838598454, + "learning_rate": 3.608389658329897e-06, + "loss": 2.0679, + "step": 10785 + }, + { + "epoch": 0.83, + "grad_norm": 0.5849488556801015, + "learning_rate": 3.6051574624388074e-06, + "loss": 1.8616, + "step": 10786 + }, + { + "epoch": 0.83, + "grad_norm": 0.5811216632282263, + "learning_rate": 3.6019266023156277e-06, + "loss": 1.882, + "step": 10787 + }, + { + "epoch": 0.83, + "grad_norm": 0.564177530193364, + "learning_rate": 3.598697078162075e-06, + "loss": 2.0347, + "step": 10788 + }, + { + "epoch": 0.83, + "grad_norm": 0.5371775963769169, + "learning_rate": 3.5954688901797846e-06, + "loss": 1.8737, + "step": 10789 + }, + { + "epoch": 0.83, + "grad_norm": 0.5321897077376401, + "learning_rate": 3.592242038570301e-06, + "loss": 1.9171, + "step": 10790 + }, + { + "epoch": 0.83, + "grad_norm": 0.6619604305610112, + "learning_rate": 3.5890165235350938e-06, + "loss": 1.8731, + "step": 10791 + }, + { + "epoch": 0.83, + "grad_norm": 0.5602237946319516, + "learning_rate": 3.5857923452755433e-06, + "loss": 2.0482, + "step": 10792 + }, + { + "epoch": 0.83, + "grad_norm": 0.5595352119008782, + "learning_rate": 3.58256950399295e-06, + "loss": 1.8452, + "step": 10793 + }, + { + "epoch": 0.83, + "grad_norm": 0.597463378281254, + "learning_rate": 3.57934799988853e-06, + "loss": 1.8423, + "step": 10794 + }, + { + "epoch": 0.83, + "grad_norm": 0.5951380225806212, + "learning_rate": 3.5761278331634037e-06, + "loss": 1.7885, + "step": 10795 + }, + { + "epoch": 0.83, + "grad_norm": 0.5163799770447567, + "learning_rate": 3.572909004018632e-06, + "loss": 1.9421, + "step": 10796 + }, + { + "epoch": 0.83, + "grad_norm": 0.5487157096653337, + "learning_rate": 3.569691512655182e-06, + "loss": 2.0615, + "step": 10797 + }, + { + "epoch": 0.83, + "grad_norm": 0.5929985441592112, + "learning_rate": 3.5664753592739188e-06, + "loss": 1.853, + "step": 10798 + }, + { + "epoch": 0.83, + "grad_norm": 0.6028244117953934, + "learning_rate": 3.563260544075661e-06, + "loss": 1.8583, + "step": 10799 + }, + { + "epoch": 0.83, + "grad_norm": 0.5502729670974442, + "learning_rate": 3.560047067261099e-06, + "loss": 2.0917, + "step": 10800 + }, + { + "epoch": 0.83, + "grad_norm": 0.5900503362596001, + "learning_rate": 3.556834929030883e-06, + "loss": 1.8284, + "step": 10801 + }, + { + "epoch": 0.83, + "grad_norm": 0.5593725769303093, + "learning_rate": 3.553624129585559e-06, + "loss": 1.9733, + "step": 10802 + }, + { + "epoch": 0.83, + "grad_norm": 0.5769159843991529, + "learning_rate": 3.5504146691255736e-06, + "loss": 1.7989, + "step": 10803 + }, + { + "epoch": 0.83, + "grad_norm": 0.6188408346993773, + "learning_rate": 3.547206547851323e-06, + "loss": 1.8711, + "step": 10804 + }, + { + "epoch": 0.83, + "grad_norm": 0.5774724604516033, + "learning_rate": 3.543999765963102e-06, + "loss": 2.0039, + "step": 10805 + }, + { + "epoch": 0.83, + "grad_norm": 0.5727024566900277, + "learning_rate": 3.540794323661109e-06, + "loss": 1.8579, + "step": 10806 + }, + { + "epoch": 0.83, + "grad_norm": 0.5377008856995557, + "learning_rate": 3.5375902211454925e-06, + "loss": 1.8773, + "step": 10807 + }, + { + "epoch": 0.83, + "grad_norm": 0.5173820121251498, + "learning_rate": 3.534387458616284e-06, + "loss": 1.9311, + "step": 10808 + }, + { + "epoch": 0.83, + "grad_norm": 0.6088263802567797, + "learning_rate": 3.5311860362734454e-06, + "loss": 2.0766, + "step": 10809 + }, + { + "epoch": 0.83, + "grad_norm": 0.5589596116059125, + "learning_rate": 3.5279859543168674e-06, + "loss": 1.8362, + "step": 10810 + }, + { + "epoch": 0.83, + "grad_norm": 0.6205291753948583, + "learning_rate": 3.5247872129463318e-06, + "loss": 1.8313, + "step": 10811 + }, + { + "epoch": 0.83, + "grad_norm": 0.5648615650274053, + "learning_rate": 3.5215898123615477e-06, + "loss": 2.0192, + "step": 10812 + }, + { + "epoch": 0.83, + "grad_norm": 0.5620175775536919, + "learning_rate": 3.518393752762156e-06, + "loss": 1.8636, + "step": 10813 + }, + { + "epoch": 0.83, + "grad_norm": 0.5921677924134402, + "learning_rate": 3.5151990343476853e-06, + "loss": 1.8834, + "step": 10814 + }, + { + "epoch": 0.83, + "grad_norm": 0.5027932497238712, + "learning_rate": 3.512005657317602e-06, + "loss": 1.9328, + "step": 10815 + }, + { + "epoch": 0.83, + "grad_norm": 0.6179834376305374, + "learning_rate": 3.5088136218712817e-06, + "loss": 1.8615, + "step": 10816 + }, + { + "epoch": 0.83, + "grad_norm": 0.5851861744880031, + "learning_rate": 3.5056229282080077e-06, + "loss": 2.0146, + "step": 10817 + }, + { + "epoch": 0.83, + "grad_norm": 0.5495288408140643, + "learning_rate": 3.502433576527006e-06, + "loss": 1.8705, + "step": 10818 + }, + { + "epoch": 0.83, + "grad_norm": 0.5778685820041513, + "learning_rate": 3.4992455670273843e-06, + "loss": 1.8614, + "step": 10819 + }, + { + "epoch": 0.83, + "grad_norm": 0.5452358348438684, + "learning_rate": 3.4960588999081916e-06, + "loss": 2.0577, + "step": 10820 + }, + { + "epoch": 0.83, + "grad_norm": 0.5407436032092618, + "learning_rate": 3.49287357536838e-06, + "loss": 1.9434, + "step": 10821 + }, + { + "epoch": 0.83, + "grad_norm": 0.5451299243195878, + "learning_rate": 3.4896895936068263e-06, + "loss": 1.8468, + "step": 10822 + }, + { + "epoch": 0.83, + "grad_norm": 0.5892697917447388, + "learning_rate": 3.486506954822316e-06, + "loss": 1.8974, + "step": 10823 + }, + { + "epoch": 0.84, + "grad_norm": 0.5441851278320088, + "learning_rate": 3.483325659213557e-06, + "loss": 1.8046, + "step": 10824 + }, + { + "epoch": 0.84, + "grad_norm": 0.5295188915764267, + "learning_rate": 3.480145706979171e-06, + "loss": 2.0737, + "step": 10825 + }, + { + "epoch": 0.84, + "grad_norm": 0.6030719770605287, + "learning_rate": 3.4769670983176935e-06, + "loss": 1.8629, + "step": 10826 + }, + { + "epoch": 0.84, + "grad_norm": 0.6035899084310105, + "learning_rate": 3.4737898334275792e-06, + "loss": 1.9344, + "step": 10827 + }, + { + "epoch": 0.84, + "grad_norm": 0.5473661709414861, + "learning_rate": 3.4706139125071977e-06, + "loss": 1.8696, + "step": 10828 + }, + { + "epoch": 0.84, + "grad_norm": 0.5739439369620827, + "learning_rate": 3.4674393357548322e-06, + "loss": 2.069, + "step": 10829 + }, + { + "epoch": 0.84, + "grad_norm": 0.5413005008842366, + "learning_rate": 3.464266103368688e-06, + "loss": 1.8865, + "step": 10830 + }, + { + "epoch": 0.84, + "grad_norm": 0.5906839263018648, + "learning_rate": 3.461094215546884e-06, + "loss": 1.8743, + "step": 10831 + }, + { + "epoch": 0.84, + "grad_norm": 0.5576929035833064, + "learning_rate": 3.457923672487451e-06, + "loss": 2.0499, + "step": 10832 + }, + { + "epoch": 0.84, + "grad_norm": 0.5293207939269674, + "learning_rate": 3.4547544743883385e-06, + "loss": 1.9605, + "step": 10833 + }, + { + "epoch": 0.84, + "grad_norm": 0.5572364052476882, + "learning_rate": 3.451586621447417e-06, + "loss": 1.8204, + "step": 10834 + }, + { + "epoch": 0.84, + "grad_norm": 0.5455729339537025, + "learning_rate": 3.4484201138624632e-06, + "loss": 1.8502, + "step": 10835 + }, + { + "epoch": 0.84, + "grad_norm": 0.5810245520922116, + "learning_rate": 3.445254951831181e-06, + "loss": 1.8398, + "step": 10836 + }, + { + "epoch": 0.84, + "grad_norm": 0.5660997967884946, + "learning_rate": 3.4420911355511785e-06, + "loss": 2.0328, + "step": 10837 + }, + { + "epoch": 0.84, + "grad_norm": 0.5545514371393105, + "learning_rate": 3.4389286652199896e-06, + "loss": 1.8644, + "step": 10838 + }, + { + "epoch": 0.84, + "grad_norm": 0.5361064017113826, + "learning_rate": 3.435767541035059e-06, + "loss": 1.9717, + "step": 10839 + }, + { + "epoch": 0.84, + "grad_norm": 0.5702882641858916, + "learning_rate": 3.4326077631937485e-06, + "loss": 1.8423, + "step": 10840 + }, + { + "epoch": 0.84, + "grad_norm": 0.5600441781492409, + "learning_rate": 3.429449331893342e-06, + "loss": 2.0623, + "step": 10841 + }, + { + "epoch": 0.84, + "grad_norm": 0.5745624838647883, + "learning_rate": 3.426292247331017e-06, + "loss": 1.8127, + "step": 10842 + }, + { + "epoch": 0.84, + "grad_norm": 0.571814145647379, + "learning_rate": 3.4231365097039e-06, + "loss": 1.8425, + "step": 10843 + }, + { + "epoch": 0.84, + "grad_norm": 0.5445758035333338, + "learning_rate": 3.419982119209017e-06, + "loss": 2.0191, + "step": 10844 + }, + { + "epoch": 0.84, + "grad_norm": 0.5649250270253655, + "learning_rate": 3.4168290760432902e-06, + "loss": 1.9191, + "step": 10845 + }, + { + "epoch": 0.84, + "grad_norm": 0.5562775792943725, + "learning_rate": 3.413677380403596e-06, + "loss": 1.922, + "step": 10846 + }, + { + "epoch": 0.84, + "grad_norm": 0.553313119925402, + "learning_rate": 3.4105270324867073e-06, + "loss": 1.8749, + "step": 10847 + }, + { + "epoch": 0.84, + "grad_norm": 0.5644913423287873, + "learning_rate": 3.407378032489297e-06, + "loss": 1.8885, + "step": 10848 + }, + { + "epoch": 0.84, + "grad_norm": 0.5338045614090035, + "learning_rate": 3.4042303806079922e-06, + "loss": 2.0532, + "step": 10849 + }, + { + "epoch": 0.84, + "grad_norm": 0.5535319692979269, + "learning_rate": 3.401084077039293e-06, + "loss": 1.8533, + "step": 10850 + }, + { + "epoch": 0.84, + "grad_norm": 0.548198985056893, + "learning_rate": 3.3979391219796487e-06, + "loss": 1.8851, + "step": 10851 + }, + { + "epoch": 0.84, + "grad_norm": 0.5213757869730109, + "learning_rate": 3.3947955156254153e-06, + "loss": 1.9287, + "step": 10852 + }, + { + "epoch": 0.84, + "grad_norm": 0.5571967257297079, + "learning_rate": 3.3916532581728414e-06, + "loss": 2.0456, + "step": 10853 + }, + { + "epoch": 0.84, + "grad_norm": 0.5415759166943671, + "learning_rate": 3.3885123498181348e-06, + "loss": 1.8124, + "step": 10854 + }, + { + "epoch": 0.84, + "grad_norm": 0.5567354341952665, + "learning_rate": 3.3853727907573796e-06, + "loss": 1.8581, + "step": 10855 + }, + { + "epoch": 0.84, + "grad_norm": 0.542965612653651, + "learning_rate": 3.382234581186591e-06, + "loss": 1.8745, + "step": 10856 + }, + { + "epoch": 0.84, + "grad_norm": 0.5594755061050475, + "learning_rate": 3.3790977213017187e-06, + "loss": 2.0588, + "step": 10857 + }, + { + "epoch": 0.84, + "grad_norm": 0.5408739885797237, + "learning_rate": 3.375962211298589e-06, + "loss": 1.9478, + "step": 10858 + }, + { + "epoch": 0.84, + "grad_norm": 0.5603631408160092, + "learning_rate": 3.372828051372967e-06, + "loss": 1.8481, + "step": 10859 + }, + { + "epoch": 0.84, + "grad_norm": 0.537985529359681, + "learning_rate": 3.369695241720547e-06, + "loss": 1.8287, + "step": 10860 + }, + { + "epoch": 0.84, + "grad_norm": 0.5668723929917758, + "learning_rate": 3.366563782536905e-06, + "loss": 2.0508, + "step": 10861 + }, + { + "epoch": 0.84, + "grad_norm": 0.5451123759197846, + "learning_rate": 3.3634336740175605e-06, + "loss": 1.8814, + "step": 10862 + }, + { + "epoch": 0.84, + "grad_norm": 0.5806161231990192, + "learning_rate": 3.360304916357937e-06, + "loss": 1.8504, + "step": 10863 + }, + { + "epoch": 0.84, + "grad_norm": 0.5286821665757154, + "learning_rate": 3.3571775097533704e-06, + "loss": 1.9745, + "step": 10864 + }, + { + "epoch": 0.84, + "grad_norm": 0.5312105031970817, + "learning_rate": 3.3540514543991347e-06, + "loss": 2.0771, + "step": 10865 + }, + { + "epoch": 0.84, + "grad_norm": 0.5456644626425059, + "learning_rate": 3.3509267504903853e-06, + "loss": 1.8138, + "step": 10866 + }, + { + "epoch": 0.84, + "grad_norm": 0.6022251467290242, + "learning_rate": 3.347803398222213e-06, + "loss": 1.8542, + "step": 10867 + }, + { + "epoch": 0.84, + "grad_norm": 0.5652011274413492, + "learning_rate": 3.344681397789626e-06, + "loss": 1.8681, + "step": 10868 + }, + { + "epoch": 0.84, + "grad_norm": 0.5216526325442508, + "learning_rate": 3.34156074938754e-06, + "loss": 2.0074, + "step": 10869 + }, + { + "epoch": 0.84, + "grad_norm": 0.4881695947940889, + "learning_rate": 3.3384414532107918e-06, + "loss": 1.9275, + "step": 10870 + }, + { + "epoch": 0.84, + "grad_norm": 0.589856056191756, + "learning_rate": 3.335323509454133e-06, + "loss": 1.8702, + "step": 10871 + }, + { + "epoch": 0.84, + "grad_norm": 0.5820299323170798, + "learning_rate": 3.332206918312225e-06, + "loss": 1.8092, + "step": 10872 + }, + { + "epoch": 0.84, + "grad_norm": 0.5421502780921352, + "learning_rate": 3.3290916799796567e-06, + "loss": 2.051, + "step": 10873 + }, + { + "epoch": 0.84, + "grad_norm": 0.5819498483380261, + "learning_rate": 3.3259777946509195e-06, + "loss": 1.851, + "step": 10874 + }, + { + "epoch": 0.84, + "grad_norm": 0.583393186932019, + "learning_rate": 3.3228652625204276e-06, + "loss": 1.9099, + "step": 10875 + }, + { + "epoch": 0.84, + "grad_norm": 0.5471643139005614, + "learning_rate": 3.3197540837825085e-06, + "loss": 2.1232, + "step": 10876 + }, + { + "epoch": 0.84, + "grad_norm": 0.5257979191058662, + "learning_rate": 3.316644258631407e-06, + "loss": 1.8862, + "step": 10877 + }, + { + "epoch": 0.84, + "grad_norm": 0.5764249499473517, + "learning_rate": 3.313535787261282e-06, + "loss": 1.8363, + "step": 10878 + }, + { + "epoch": 0.84, + "grad_norm": 0.563350269811615, + "learning_rate": 3.3104286698662083e-06, + "loss": 1.8594, + "step": 10879 + }, + { + "epoch": 0.84, + "grad_norm": 0.6021398286114723, + "learning_rate": 3.3073229066401756e-06, + "loss": 1.7979, + "step": 10880 + }, + { + "epoch": 0.84, + "grad_norm": 0.5760971456272964, + "learning_rate": 3.3042184977770898e-06, + "loss": 2.008, + "step": 10881 + }, + { + "epoch": 0.84, + "grad_norm": 0.5710239034222614, + "learning_rate": 3.301115443470773e-06, + "loss": 1.9209, + "step": 10882 + }, + { + "epoch": 0.84, + "grad_norm": 0.5725731292578958, + "learning_rate": 3.2980137439149574e-06, + "loss": 1.9172, + "step": 10883 + }, + { + "epoch": 0.84, + "grad_norm": 0.5988762070048066, + "learning_rate": 3.2949133993032984e-06, + "loss": 1.8472, + "step": 10884 + }, + { + "epoch": 0.84, + "grad_norm": 0.5632641812432931, + "learning_rate": 3.2918144098293635e-06, + "loss": 2.1158, + "step": 10885 + }, + { + "epoch": 0.84, + "grad_norm": 0.5721633369808168, + "learning_rate": 3.2887167756866345e-06, + "loss": 1.8525, + "step": 10886 + }, + { + "epoch": 0.84, + "grad_norm": 0.5819395697415632, + "learning_rate": 3.285620497068509e-06, + "loss": 1.8494, + "step": 10887 + }, + { + "epoch": 0.84, + "grad_norm": 0.6221131603716856, + "learning_rate": 3.2825255741683077e-06, + "loss": 1.886, + "step": 10888 + }, + { + "epoch": 0.84, + "grad_norm": 0.6126319192034867, + "learning_rate": 3.2794320071792394e-06, + "loss": 2.0642, + "step": 10889 + }, + { + "epoch": 0.84, + "grad_norm": 0.6057442093698397, + "learning_rate": 3.2763397962944695e-06, + "loss": 1.9056, + "step": 10890 + }, + { + "epoch": 0.84, + "grad_norm": 0.5712629758685175, + "learning_rate": 3.273248941707052e-06, + "loss": 1.8698, + "step": 10891 + }, + { + "epoch": 0.84, + "grad_norm": 0.5912102428183791, + "learning_rate": 3.2701594436099493e-06, + "loss": 1.8882, + "step": 10892 + }, + { + "epoch": 0.84, + "grad_norm": 0.6044906752970196, + "learning_rate": 3.2670713021960647e-06, + "loss": 2.0466, + "step": 10893 + }, + { + "epoch": 0.84, + "grad_norm": 0.6025411656744994, + "learning_rate": 3.2639845176582056e-06, + "loss": 1.8497, + "step": 10894 + }, + { + "epoch": 0.84, + "grad_norm": 0.5228589107430774, + "learning_rate": 3.2608990901890762e-06, + "loss": 1.9279, + "step": 10895 + }, + { + "epoch": 0.84, + "grad_norm": 0.5645994358684547, + "learning_rate": 3.2578150199813363e-06, + "loss": 1.8453, + "step": 10896 + }, + { + "epoch": 0.84, + "grad_norm": 0.5939542135803495, + "learning_rate": 3.254732307227509e-06, + "loss": 2.056, + "step": 10897 + }, + { + "epoch": 0.84, + "grad_norm": 0.5699327238498245, + "learning_rate": 3.251650952120083e-06, + "loss": 1.901, + "step": 10898 + }, + { + "epoch": 0.84, + "grad_norm": 0.5802531639070976, + "learning_rate": 3.2485709548514405e-06, + "loss": 1.9231, + "step": 10899 + }, + { + "epoch": 0.84, + "grad_norm": 0.5811166266384296, + "learning_rate": 3.2454923156138544e-06, + "loss": 1.8618, + "step": 10900 + }, + { + "epoch": 0.84, + "grad_norm": 0.5001782263775642, + "learning_rate": 3.242415034599566e-06, + "loss": 2.0583, + "step": 10901 + }, + { + "epoch": 0.84, + "grad_norm": 0.5450268345550175, + "learning_rate": 3.239339112000686e-06, + "loss": 1.8481, + "step": 10902 + }, + { + "epoch": 0.84, + "grad_norm": 0.5675184993638455, + "learning_rate": 3.2362645480092517e-06, + "loss": 1.8528, + "step": 10903 + }, + { + "epoch": 0.84, + "grad_norm": 0.5609127780451145, + "learning_rate": 3.233191342817243e-06, + "loss": 1.8931, + "step": 10904 + }, + { + "epoch": 0.84, + "grad_norm": 0.5432441480349907, + "learning_rate": 3.2301194966165127e-06, + "loss": 2.055, + "step": 10905 + }, + { + "epoch": 0.84, + "grad_norm": 0.5601793007087711, + "learning_rate": 3.227049009598848e-06, + "loss": 1.8685, + "step": 10906 + }, + { + "epoch": 0.84, + "grad_norm": 0.5745000688554834, + "learning_rate": 3.2239798819559713e-06, + "loss": 1.8877, + "step": 10907 + }, + { + "epoch": 0.84, + "grad_norm": 0.5694937797266293, + "learning_rate": 3.2209121138794846e-06, + "loss": 1.917, + "step": 10908 + }, + { + "epoch": 0.84, + "grad_norm": 0.5494321123759563, + "learning_rate": 3.2178457055609234e-06, + "loss": 2.0821, + "step": 10909 + }, + { + "epoch": 0.84, + "grad_norm": 0.5613103130268279, + "learning_rate": 3.214780657191738e-06, + "loss": 1.8897, + "step": 10910 + }, + { + "epoch": 0.84, + "grad_norm": 0.5613196158389909, + "learning_rate": 3.2117169689632935e-06, + "loss": 1.9026, + "step": 10911 + }, + { + "epoch": 0.84, + "grad_norm": 0.5763871917601698, + "learning_rate": 3.208654641066866e-06, + "loss": 1.8172, + "step": 10912 + }, + { + "epoch": 0.84, + "grad_norm": 0.6088420855459398, + "learning_rate": 3.2055936736936493e-06, + "loss": 2.0673, + "step": 10913 + }, + { + "epoch": 0.84, + "grad_norm": 0.5670884902561305, + "learning_rate": 3.202534067034757e-06, + "loss": 1.916, + "step": 10914 + }, + { + "epoch": 0.84, + "grad_norm": 0.5546009904752476, + "learning_rate": 3.199475821281206e-06, + "loss": 1.8339, + "step": 10915 + }, + { + "epoch": 0.84, + "grad_norm": 0.6141315829959625, + "learning_rate": 3.1964189366239377e-06, + "loss": 1.8576, + "step": 10916 + }, + { + "epoch": 0.84, + "grad_norm": 0.5363785658436857, + "learning_rate": 3.1933634132538083e-06, + "loss": 2.0606, + "step": 10917 + }, + { + "epoch": 0.84, + "grad_norm": 0.5980089163191169, + "learning_rate": 3.190309251361584e-06, + "loss": 1.862, + "step": 10918 + }, + { + "epoch": 0.84, + "grad_norm": 0.5687030982623018, + "learning_rate": 3.1872564511379517e-06, + "loss": 1.8173, + "step": 10919 + }, + { + "epoch": 0.84, + "grad_norm": 0.5407838935298391, + "learning_rate": 3.1842050127735057e-06, + "loss": 1.9234, + "step": 10920 + }, + { + "epoch": 0.84, + "grad_norm": 0.5858651323103572, + "learning_rate": 3.1811549364587634e-06, + "loss": 2.062, + "step": 10921 + }, + { + "epoch": 0.84, + "grad_norm": 0.6159956817345205, + "learning_rate": 3.1781062223841528e-06, + "loss": 1.7894, + "step": 10922 + }, + { + "epoch": 0.84, + "grad_norm": 0.566405201962037, + "learning_rate": 3.1750588707400162e-06, + "loss": 1.868, + "step": 10923 + }, + { + "epoch": 0.84, + "grad_norm": 0.5644229331228829, + "learning_rate": 3.1720128817166156e-06, + "loss": 1.8439, + "step": 10924 + }, + { + "epoch": 0.84, + "grad_norm": 0.6059954353350857, + "learning_rate": 3.1689682555041208e-06, + "loss": 2.0537, + "step": 10925 + }, + { + "epoch": 0.84, + "grad_norm": 0.4957434842587024, + "learning_rate": 3.165924992292621e-06, + "loss": 1.9416, + "step": 10926 + }, + { + "epoch": 0.84, + "grad_norm": 0.5608216883341663, + "learning_rate": 3.1628830922721202e-06, + "loss": 1.8501, + "step": 10927 + }, + { + "epoch": 0.84, + "grad_norm": 0.5648481454963536, + "learning_rate": 3.159842555632539e-06, + "loss": 1.9111, + "step": 10928 + }, + { + "epoch": 0.84, + "grad_norm": 0.5336754229406827, + "learning_rate": 3.1568033825637082e-06, + "loss": 2.0868, + "step": 10929 + }, + { + "epoch": 0.84, + "grad_norm": 0.5313713704557796, + "learning_rate": 3.1537655732553768e-06, + "loss": 1.8309, + "step": 10930 + }, + { + "epoch": 0.84, + "grad_norm": 0.5927933687774304, + "learning_rate": 3.150729127897206e-06, + "loss": 1.8666, + "step": 10931 + }, + { + "epoch": 0.84, + "grad_norm": 0.5860486204504295, + "learning_rate": 3.147694046678776e-06, + "loss": 1.8593, + "step": 10932 + }, + { + "epoch": 0.84, + "grad_norm": 0.5233292949254459, + "learning_rate": 3.1446603297895784e-06, + "loss": 2.0888, + "step": 10933 + }, + { + "epoch": 0.84, + "grad_norm": 0.5306765012175834, + "learning_rate": 3.1416279774190177e-06, + "loss": 1.8644, + "step": 10934 + }, + { + "epoch": 0.84, + "grad_norm": 0.542430463344358, + "learning_rate": 3.138596989756426e-06, + "loss": 1.8481, + "step": 10935 + }, + { + "epoch": 0.84, + "grad_norm": 0.5812676091764443, + "learning_rate": 3.135567366991024e-06, + "loss": 1.8621, + "step": 10936 + }, + { + "epoch": 0.84, + "grad_norm": 0.5591068149272558, + "learning_rate": 3.1325391093119745e-06, + "loss": 2.0399, + "step": 10937 + }, + { + "epoch": 0.84, + "grad_norm": 0.5547368034731127, + "learning_rate": 3.1295122169083506e-06, + "loss": 1.8667, + "step": 10938 + }, + { + "epoch": 0.84, + "grad_norm": 0.548716081856602, + "learning_rate": 3.126486689969113e-06, + "loss": 1.935, + "step": 10939 + }, + { + "epoch": 0.84, + "grad_norm": 0.5694557164154117, + "learning_rate": 3.123462528683177e-06, + "loss": 1.8678, + "step": 10940 + }, + { + "epoch": 0.84, + "grad_norm": 0.5401284296612855, + "learning_rate": 3.1204397332393497e-06, + "loss": 2.0801, + "step": 10941 + }, + { + "epoch": 0.84, + "grad_norm": 0.5625482705614778, + "learning_rate": 3.117418303826344e-06, + "loss": 1.84, + "step": 10942 + }, + { + "epoch": 0.84, + "grad_norm": 0.5467805514481275, + "learning_rate": 3.114398240632818e-06, + "loss": 1.8563, + "step": 10943 + }, + { + "epoch": 0.84, + "grad_norm": 0.5606917962000476, + "learning_rate": 3.1113795438473146e-06, + "loss": 1.8411, + "step": 10944 + }, + { + "epoch": 0.84, + "grad_norm": 0.5234531034048913, + "learning_rate": 3.1083622136583028e-06, + "loss": 2.0926, + "step": 10945 + }, + { + "epoch": 0.84, + "grad_norm": 0.5821915255224257, + "learning_rate": 3.1053462502541793e-06, + "loss": 1.8482, + "step": 10946 + }, + { + "epoch": 0.84, + "grad_norm": 0.5831788725882885, + "learning_rate": 3.102331653823226e-06, + "loss": 1.8606, + "step": 10947 + }, + { + "epoch": 0.84, + "grad_norm": 0.5679157632610933, + "learning_rate": 3.0993184245536767e-06, + "loss": 1.8617, + "step": 10948 + }, + { + "epoch": 0.84, + "grad_norm": 0.5772493333324574, + "learning_rate": 3.0963065626336414e-06, + "loss": 2.0526, + "step": 10949 + }, + { + "epoch": 0.84, + "grad_norm": 0.6189221110985661, + "learning_rate": 3.093296068251164e-06, + "loss": 1.8168, + "step": 10950 + }, + { + "epoch": 0.84, + "grad_norm": 0.5508559474949465, + "learning_rate": 3.0902869415942197e-06, + "loss": 1.9296, + "step": 10951 + }, + { + "epoch": 0.84, + "grad_norm": 0.5825031704800143, + "learning_rate": 3.0872791828506635e-06, + "loss": 1.8297, + "step": 10952 + }, + { + "epoch": 0.85, + "grad_norm": 0.6086337161606319, + "learning_rate": 3.0842727922082844e-06, + "loss": 2.0893, + "step": 10953 + }, + { + "epoch": 0.85, + "grad_norm": 0.5409128068430991, + "learning_rate": 3.0812677698547955e-06, + "loss": 1.8471, + "step": 10954 + }, + { + "epoch": 0.85, + "grad_norm": 0.5657734012788627, + "learning_rate": 3.0782641159777998e-06, + "loss": 1.8651, + "step": 10955 + }, + { + "epoch": 0.85, + "grad_norm": 0.5812652155657768, + "learning_rate": 3.0752618307648308e-06, + "loss": 1.8285, + "step": 10956 + }, + { + "epoch": 0.85, + "grad_norm": 0.5378682695544696, + "learning_rate": 3.0722609144033356e-06, + "loss": 1.9144, + "step": 10957 + }, + { + "epoch": 0.85, + "grad_norm": 0.5585393706339652, + "learning_rate": 3.0692613670806724e-06, + "loss": 2.0567, + "step": 10958 + }, + { + "epoch": 0.85, + "grad_norm": 0.5799617958904439, + "learning_rate": 3.0662631889841143e-06, + "loss": 1.9037, + "step": 10959 + }, + { + "epoch": 0.85, + "grad_norm": 0.6310044991387034, + "learning_rate": 3.0632663803008555e-06, + "loss": 1.84, + "step": 10960 + }, + { + "epoch": 0.85, + "grad_norm": 0.5530864289185683, + "learning_rate": 3.0602709412179853e-06, + "loss": 2.0856, + "step": 10961 + }, + { + "epoch": 0.85, + "grad_norm": 0.5511277973527723, + "learning_rate": 3.057276871922543e-06, + "loss": 1.8509, + "step": 10962 + }, + { + "epoch": 0.85, + "grad_norm": 0.534423446839176, + "learning_rate": 3.054284172601446e-06, + "loss": 1.8204, + "step": 10963 + }, + { + "epoch": 0.85, + "grad_norm": 0.6004772203396654, + "learning_rate": 3.0512928434415387e-06, + "loss": 1.9541, + "step": 10964 + }, + { + "epoch": 0.85, + "grad_norm": 0.5555100380601801, + "learning_rate": 3.048302884629592e-06, + "loss": 2.0494, + "step": 10965 + }, + { + "epoch": 0.85, + "grad_norm": 0.556165659204902, + "learning_rate": 3.045314296352272e-06, + "loss": 1.832, + "step": 10966 + }, + { + "epoch": 0.85, + "grad_norm": 0.5470158456892792, + "learning_rate": 3.0423270787961727e-06, + "loss": 1.8445, + "step": 10967 + }, + { + "epoch": 0.85, + "grad_norm": 0.5374116575970441, + "learning_rate": 3.039341232147799e-06, + "loss": 1.8538, + "step": 10968 + }, + { + "epoch": 0.85, + "grad_norm": 0.5631786532645231, + "learning_rate": 3.0363567565935695e-06, + "loss": 2.0561, + "step": 10969 + }, + { + "epoch": 0.85, + "grad_norm": 0.5373128964890701, + "learning_rate": 3.0333736523198182e-06, + "loss": 1.9225, + "step": 10970 + }, + { + "epoch": 0.85, + "grad_norm": 0.6310472502490602, + "learning_rate": 3.0303919195127872e-06, + "loss": 1.8468, + "step": 10971 + }, + { + "epoch": 0.85, + "grad_norm": 0.5652327565739999, + "learning_rate": 3.027411558358645e-06, + "loss": 1.7827, + "step": 10972 + }, + { + "epoch": 0.85, + "grad_norm": 0.5759118657274799, + "learning_rate": 3.0244325690434643e-06, + "loss": 2.0348, + "step": 10973 + }, + { + "epoch": 0.85, + "grad_norm": 0.60265365008215, + "learning_rate": 3.021454951753236e-06, + "loss": 1.8139, + "step": 10974 + }, + { + "epoch": 0.85, + "grad_norm": 0.5787671158801412, + "learning_rate": 3.0184787066738634e-06, + "loss": 1.8254, + "step": 10975 + }, + { + "epoch": 0.85, + "grad_norm": 0.5131282797751109, + "learning_rate": 3.015503833991171e-06, + "loss": 1.9287, + "step": 10976 + }, + { + "epoch": 0.85, + "grad_norm": 0.5752678084997072, + "learning_rate": 3.0125303338908895e-06, + "loss": 2.083, + "step": 10977 + }, + { + "epoch": 0.85, + "grad_norm": 0.5763334798925462, + "learning_rate": 3.0095582065586574e-06, + "loss": 1.8385, + "step": 10978 + }, + { + "epoch": 0.85, + "grad_norm": 0.5873987680843143, + "learning_rate": 3.006587452180054e-06, + "loss": 1.8466, + "step": 10979 + }, + { + "epoch": 0.85, + "grad_norm": 0.5741686458833027, + "learning_rate": 3.003618070940542e-06, + "loss": 1.8635, + "step": 10980 + }, + { + "epoch": 0.85, + "grad_norm": 0.5907973108532687, + "learning_rate": 3.0006500630255204e-06, + "loss": 2.1192, + "step": 10981 + }, + { + "epoch": 0.85, + "grad_norm": 0.5079033073657315, + "learning_rate": 2.997683428620296e-06, + "loss": 1.9007, + "step": 10982 + }, + { + "epoch": 0.85, + "grad_norm": 0.578098589806834, + "learning_rate": 2.994718167910071e-06, + "loss": 1.864, + "step": 10983 + }, + { + "epoch": 0.85, + "grad_norm": 0.5473025274105937, + "learning_rate": 2.9917542810799976e-06, + "loss": 1.9249, + "step": 10984 + }, + { + "epoch": 0.85, + "grad_norm": 0.5811269206084355, + "learning_rate": 2.988791768315122e-06, + "loss": 2.0584, + "step": 10985 + }, + { + "epoch": 0.85, + "grad_norm": 0.5400387978958406, + "learning_rate": 2.98583062980039e-06, + "loss": 1.8277, + "step": 10986 + }, + { + "epoch": 0.85, + "grad_norm": 0.610797745623506, + "learning_rate": 2.982870865720691e-06, + "loss": 1.8161, + "step": 10987 + }, + { + "epoch": 0.85, + "grad_norm": 0.5767610086673044, + "learning_rate": 2.979912476260821e-06, + "loss": 1.9089, + "step": 10988 + }, + { + "epoch": 0.85, + "grad_norm": 0.5506393541426384, + "learning_rate": 2.9769554616054627e-06, + "loss": 1.8206, + "step": 10989 + }, + { + "epoch": 0.85, + "grad_norm": 0.555791523176043, + "learning_rate": 2.9739998219392608e-06, + "loss": 2.0572, + "step": 10990 + }, + { + "epoch": 0.85, + "grad_norm": 0.5455962531658465, + "learning_rate": 2.9710455574467277e-06, + "loss": 1.8515, + "step": 10991 + }, + { + "epoch": 0.85, + "grad_norm": 0.6155663929220291, + "learning_rate": 2.968092668312314e-06, + "loss": 1.866, + "step": 10992 + }, + { + "epoch": 0.85, + "grad_norm": 0.5563076759270154, + "learning_rate": 2.9651411547203937e-06, + "loss": 2.0706, + "step": 10993 + }, + { + "epoch": 0.85, + "grad_norm": 0.5879381629948147, + "learning_rate": 2.962191016855223e-06, + "loss": 1.81, + "step": 10994 + }, + { + "epoch": 0.85, + "grad_norm": 0.5216280954920404, + "learning_rate": 2.959242254901007e-06, + "loss": 1.8923, + "step": 10995 + }, + { + "epoch": 0.85, + "grad_norm": 0.5590860226410004, + "learning_rate": 2.956294869041845e-06, + "loss": 1.8979, + "step": 10996 + }, + { + "epoch": 0.85, + "grad_norm": 0.5498133373839367, + "learning_rate": 2.9533488594617435e-06, + "loss": 2.045, + "step": 10997 + }, + { + "epoch": 0.85, + "grad_norm": 0.5495819613580668, + "learning_rate": 2.9504042263446522e-06, + "loss": 1.8799, + "step": 10998 + }, + { + "epoch": 0.85, + "grad_norm": 0.5731031295335662, + "learning_rate": 2.947460969874402e-06, + "loss": 1.9329, + "step": 10999 + }, + { + "epoch": 0.85, + "grad_norm": 0.5467644516566684, + "learning_rate": 2.9445190902347543e-06, + "loss": 1.8432, + "step": 11000 + }, + { + "epoch": 0.85, + "grad_norm": 0.5379478976555605, + "learning_rate": 2.9415785876093947e-06, + "loss": 1.9561, + "step": 11001 + }, + { + "epoch": 0.85, + "grad_norm": 0.5411904796126497, + "learning_rate": 2.938639462181897e-06, + "loss": 2.0329, + "step": 11002 + }, + { + "epoch": 0.85, + "grad_norm": 0.5676258839137424, + "learning_rate": 2.935701714135769e-06, + "loss": 1.8489, + "step": 11003 + }, + { + "epoch": 0.85, + "grad_norm": 0.5795091086435591, + "learning_rate": 2.9327653436544226e-06, + "loss": 1.9103, + "step": 11004 + }, + { + "epoch": 0.85, + "grad_norm": 0.5523509532403, + "learning_rate": 2.929830350921192e-06, + "loss": 2.0373, + "step": 11005 + }, + { + "epoch": 0.85, + "grad_norm": 0.5790198262462639, + "learning_rate": 2.9268967361193193e-06, + "loss": 1.824, + "step": 11006 + }, + { + "epoch": 0.85, + "grad_norm": 0.5201229156966158, + "learning_rate": 2.923964499431964e-06, + "loss": 1.9696, + "step": 11007 + }, + { + "epoch": 0.85, + "grad_norm": 0.5720206230304521, + "learning_rate": 2.9210336410421924e-06, + "loss": 1.8691, + "step": 11008 + }, + { + "epoch": 0.85, + "grad_norm": 0.5365668277764181, + "learning_rate": 2.918104161132995e-06, + "loss": 2.0197, + "step": 11009 + }, + { + "epoch": 0.85, + "grad_norm": 0.5939603982323544, + "learning_rate": 2.91517605988727e-06, + "loss": 1.8698, + "step": 11010 + }, + { + "epoch": 0.85, + "grad_norm": 0.5519467082479718, + "learning_rate": 2.9122493374878295e-06, + "loss": 1.8179, + "step": 11011 + }, + { + "epoch": 0.85, + "grad_norm": 0.5525193006052546, + "learning_rate": 2.9093239941173994e-06, + "loss": 1.8397, + "step": 11012 + }, + { + "epoch": 0.85, + "grad_norm": 0.5273563961587224, + "learning_rate": 2.906400029958625e-06, + "loss": 1.9815, + "step": 11013 + }, + { + "epoch": 0.85, + "grad_norm": 0.5924625918768084, + "learning_rate": 2.9034774451940612e-06, + "loss": 2.0666, + "step": 11014 + }, + { + "epoch": 0.85, + "grad_norm": 0.5778011474494228, + "learning_rate": 2.900556240006172e-06, + "loss": 1.8578, + "step": 11015 + }, + { + "epoch": 0.85, + "grad_norm": 0.6480074084187046, + "learning_rate": 2.8976364145773455e-06, + "loss": 1.8581, + "step": 11016 + }, + { + "epoch": 0.85, + "grad_norm": 0.5763887235236398, + "learning_rate": 2.8947179690898773e-06, + "loss": 2.0487, + "step": 11017 + }, + { + "epoch": 0.85, + "grad_norm": 0.5742864038819305, + "learning_rate": 2.8918009037259746e-06, + "loss": 1.8369, + "step": 11018 + }, + { + "epoch": 0.85, + "grad_norm": 0.540195170980317, + "learning_rate": 2.888885218667767e-06, + "loss": 1.9255, + "step": 11019 + }, + { + "epoch": 0.85, + "grad_norm": 0.5701069033442724, + "learning_rate": 2.8859709140972886e-06, + "loss": 1.8987, + "step": 11020 + }, + { + "epoch": 0.85, + "grad_norm": 0.554268675967514, + "learning_rate": 2.8830579901964916e-06, + "loss": 1.8862, + "step": 11021 + }, + { + "epoch": 0.85, + "grad_norm": 0.590149951011285, + "learning_rate": 2.8801464471472446e-06, + "loss": 2.0442, + "step": 11022 + }, + { + "epoch": 0.85, + "grad_norm": 0.5372193084564831, + "learning_rate": 2.877236285131324e-06, + "loss": 1.8532, + "step": 11023 + }, + { + "epoch": 0.85, + "grad_norm": 0.5839194672203313, + "learning_rate": 2.8743275043304318e-06, + "loss": 1.8678, + "step": 11024 + }, + { + "epoch": 0.85, + "grad_norm": 0.5454407550459932, + "learning_rate": 2.8714201049261562e-06, + "loss": 2.051, + "step": 11025 + }, + { + "epoch": 0.85, + "grad_norm": 0.5287070490814902, + "learning_rate": 2.8685140871000355e-06, + "loss": 1.9101, + "step": 11026 + }, + { + "epoch": 0.85, + "grad_norm": 0.5658380179881011, + "learning_rate": 2.8656094510335046e-06, + "loss": 1.802, + "step": 11027 + }, + { + "epoch": 0.85, + "grad_norm": 0.586186235560385, + "learning_rate": 2.8627061969078967e-06, + "loss": 1.8452, + "step": 11028 + }, + { + "epoch": 0.85, + "grad_norm": 0.5434137414814252, + "learning_rate": 2.8598043249044916e-06, + "loss": 2.0927, + "step": 11029 + }, + { + "epoch": 0.85, + "grad_norm": 0.5818504002713629, + "learning_rate": 2.856903835204447e-06, + "loss": 1.8513, + "step": 11030 + }, + { + "epoch": 0.85, + "grad_norm": 0.5455938290187642, + "learning_rate": 2.8540047279888652e-06, + "loss": 1.8966, + "step": 11031 + }, + { + "epoch": 0.85, + "grad_norm": 0.5180221728074166, + "learning_rate": 2.851107003438755e-06, + "loss": 1.9282, + "step": 11032 + }, + { + "epoch": 0.85, + "grad_norm": 0.5802955978263918, + "learning_rate": 2.8482106617350125e-06, + "loss": 1.8015, + "step": 11033 + }, + { + "epoch": 0.85, + "grad_norm": 0.543842652437521, + "learning_rate": 2.8453157030584847e-06, + "loss": 2.0644, + "step": 11034 + }, + { + "epoch": 0.85, + "grad_norm": 0.5894892583123816, + "learning_rate": 2.8424221275899165e-06, + "loss": 1.8706, + "step": 11035 + }, + { + "epoch": 0.85, + "grad_norm": 0.5479315470375944, + "learning_rate": 2.839529935509952e-06, + "loss": 1.854, + "step": 11036 + }, + { + "epoch": 0.85, + "grad_norm": 0.546320347066512, + "learning_rate": 2.83663912699918e-06, + "loss": 2.074, + "step": 11037 + }, + { + "epoch": 0.85, + "grad_norm": 0.5693772512103398, + "learning_rate": 2.8337497022380698e-06, + "loss": 1.9029, + "step": 11038 + }, + { + "epoch": 0.85, + "grad_norm": 0.5787814168398973, + "learning_rate": 2.8308616614070248e-06, + "loss": 1.8227, + "step": 11039 + }, + { + "epoch": 0.85, + "grad_norm": 0.5663286254562697, + "learning_rate": 2.827975004686367e-06, + "loss": 1.8353, + "step": 11040 + }, + { + "epoch": 0.85, + "grad_norm": 0.5330453057641804, + "learning_rate": 2.825089732256311e-06, + "loss": 1.8034, + "step": 11041 + }, + { + "epoch": 0.85, + "grad_norm": 0.5325817204145, + "learning_rate": 2.8222058442969963e-06, + "loss": 2.0201, + "step": 11042 + }, + { + "epoch": 0.85, + "grad_norm": 0.552892674884264, + "learning_rate": 2.819323340988486e-06, + "loss": 1.8741, + "step": 11043 + }, + { + "epoch": 0.85, + "grad_norm": 0.5596738376664717, + "learning_rate": 2.8164422225107345e-06, + "loss": 1.9557, + "step": 11044 + }, + { + "epoch": 0.85, + "grad_norm": 0.5512342164996215, + "learning_rate": 2.8135624890436335e-06, + "loss": 1.9056, + "step": 11045 + }, + { + "epoch": 0.85, + "grad_norm": 0.5943153710304545, + "learning_rate": 2.8106841407669673e-06, + "loss": 2.0774, + "step": 11046 + }, + { + "epoch": 0.85, + "grad_norm": 0.5800654507401186, + "learning_rate": 2.8078071778604416e-06, + "loss": 1.9021, + "step": 11047 + }, + { + "epoch": 0.85, + "grad_norm": 0.5718435752845573, + "learning_rate": 2.8049316005036906e-06, + "loss": 1.8718, + "step": 11048 + }, + { + "epoch": 0.85, + "grad_norm": 0.5714808889154996, + "learning_rate": 2.8020574088762326e-06, + "loss": 2.0614, + "step": 11049 + }, + { + "epoch": 0.85, + "grad_norm": 0.5344693755388021, + "learning_rate": 2.7991846031575257e-06, + "loss": 1.9715, + "step": 11050 + }, + { + "epoch": 0.85, + "grad_norm": 0.5713569354584445, + "learning_rate": 2.7963131835269247e-06, + "loss": 1.8571, + "step": 11051 + }, + { + "epoch": 0.85, + "grad_norm": 0.5644392717326377, + "learning_rate": 2.7934431501637076e-06, + "loss": 1.8638, + "step": 11052 + }, + { + "epoch": 0.85, + "grad_norm": 0.5820972981252023, + "learning_rate": 2.790574503247062e-06, + "loss": 1.8757, + "step": 11053 + }, + { + "epoch": 0.85, + "grad_norm": 0.5154199781045713, + "learning_rate": 2.787707242956086e-06, + "loss": 2.0486, + "step": 11054 + }, + { + "epoch": 0.85, + "grad_norm": 0.5784381480450455, + "learning_rate": 2.784841369469801e-06, + "loss": 1.8639, + "step": 11055 + }, + { + "epoch": 0.85, + "grad_norm": 0.5704443151540566, + "learning_rate": 2.78197688296713e-06, + "loss": 1.8681, + "step": 11056 + }, + { + "epoch": 0.85, + "grad_norm": 0.5520884808935981, + "learning_rate": 2.779113783626916e-06, + "loss": 1.9462, + "step": 11057 + }, + { + "epoch": 0.85, + "grad_norm": 0.5697594945467133, + "learning_rate": 2.7762520716279135e-06, + "loss": 2.0659, + "step": 11058 + }, + { + "epoch": 0.85, + "grad_norm": 0.579430920283695, + "learning_rate": 2.773391747148793e-06, + "loss": 1.897, + "step": 11059 + }, + { + "epoch": 0.85, + "grad_norm": 0.593009555462676, + "learning_rate": 2.7705328103681317e-06, + "loss": 1.8401, + "step": 11060 + }, + { + "epoch": 0.85, + "grad_norm": 0.5733792092830888, + "learning_rate": 2.767675261464431e-06, + "loss": 2.0061, + "step": 11061 + }, + { + "epoch": 0.85, + "grad_norm": 0.553163924029067, + "learning_rate": 2.764819100616095e-06, + "loss": 1.8295, + "step": 11062 + }, + { + "epoch": 0.85, + "grad_norm": 0.5909269760666099, + "learning_rate": 2.7619643280014486e-06, + "loss": 1.9441, + "step": 11063 + }, + { + "epoch": 0.85, + "grad_norm": 0.6127396813111059, + "learning_rate": 2.759110943798726e-06, + "loss": 1.8773, + "step": 11064 + }, + { + "epoch": 0.85, + "grad_norm": 0.5813109143152801, + "learning_rate": 2.756258948186072e-06, + "loss": 1.8449, + "step": 11065 + }, + { + "epoch": 0.85, + "grad_norm": 0.5424026098165504, + "learning_rate": 2.753408341341554e-06, + "loss": 2.0664, + "step": 11066 + }, + { + "epoch": 0.85, + "grad_norm": 0.5736207285773557, + "learning_rate": 2.7505591234431414e-06, + "loss": 1.8714, + "step": 11067 + }, + { + "epoch": 0.85, + "grad_norm": 0.5671986559764755, + "learning_rate": 2.7477112946687283e-06, + "loss": 1.8405, + "step": 11068 + }, + { + "epoch": 0.85, + "grad_norm": 0.5592837508433316, + "learning_rate": 2.744864855196114e-06, + "loss": 1.9559, + "step": 11069 + }, + { + "epoch": 0.85, + "grad_norm": 0.5642971883243525, + "learning_rate": 2.742019805203014e-06, + "loss": 2.0829, + "step": 11070 + }, + { + "epoch": 0.85, + "grad_norm": 0.5550203508228634, + "learning_rate": 2.7391761448670587e-06, + "loss": 1.8859, + "step": 11071 + }, + { + "epoch": 0.85, + "grad_norm": 0.5598563273713845, + "learning_rate": 2.736333874365776e-06, + "loss": 1.8646, + "step": 11072 + }, + { + "epoch": 0.85, + "grad_norm": 0.6150124921818244, + "learning_rate": 2.733492993876638e-06, + "loss": 1.8223, + "step": 11073 + }, + { + "epoch": 0.85, + "grad_norm": 0.5560040065382178, + "learning_rate": 2.7306535035770077e-06, + "loss": 2.0464, + "step": 11074 + }, + { + "epoch": 0.85, + "grad_norm": 0.5695290748116774, + "learning_rate": 2.727815403644157e-06, + "loss": 1.9434, + "step": 11075 + }, + { + "epoch": 0.85, + "grad_norm": 0.5874578429537222, + "learning_rate": 2.7249786942552895e-06, + "loss": 1.8526, + "step": 11076 + }, + { + "epoch": 0.85, + "grad_norm": 0.6023082165974242, + "learning_rate": 2.72214337558751e-06, + "loss": 1.8827, + "step": 11077 + }, + { + "epoch": 0.85, + "grad_norm": 0.59503292452373, + "learning_rate": 2.7193094478178417e-06, + "loss": 2.01, + "step": 11078 + }, + { + "epoch": 0.85, + "grad_norm": 0.6195899635284864, + "learning_rate": 2.7164769111232195e-06, + "loss": 1.8103, + "step": 11079 + }, + { + "epoch": 0.85, + "grad_norm": 0.5808438458319892, + "learning_rate": 2.713645765680478e-06, + "loss": 1.8664, + "step": 11080 + }, + { + "epoch": 0.85, + "grad_norm": 0.6030336493853367, + "learning_rate": 2.7108160116663893e-06, + "loss": 2.0332, + "step": 11081 + }, + { + "epoch": 0.85, + "grad_norm": 0.4978833017158907, + "learning_rate": 2.707987649257629e-06, + "loss": 1.9048, + "step": 11082 + }, + { + "epoch": 0.86, + "grad_norm": 0.5501691564461823, + "learning_rate": 2.705160678630769e-06, + "loss": 1.829, + "step": 11083 + }, + { + "epoch": 0.86, + "grad_norm": 0.5842797071934983, + "learning_rate": 2.7023350999623246e-06, + "loss": 1.85, + "step": 11084 + }, + { + "epoch": 0.86, + "grad_norm": 0.6227716556008069, + "learning_rate": 2.699510913428699e-06, + "loss": 1.903, + "step": 11085 + }, + { + "epoch": 0.86, + "grad_norm": 0.5997720983786747, + "learning_rate": 2.6966881192062116e-06, + "loss": 2.0441, + "step": 11086 + }, + { + "epoch": 0.86, + "grad_norm": 0.5842314093446619, + "learning_rate": 2.693866717471122e-06, + "loss": 1.9234, + "step": 11087 + }, + { + "epoch": 0.86, + "grad_norm": 0.49736657373171883, + "learning_rate": 2.691046708399561e-06, + "loss": 1.8658, + "step": 11088 + }, + { + "epoch": 0.86, + "grad_norm": 0.5608805639727138, + "learning_rate": 2.6882280921675996e-06, + "loss": 1.832, + "step": 11089 + }, + { + "epoch": 0.86, + "grad_norm": 0.5547555250748492, + "learning_rate": 2.685410868951227e-06, + "loss": 2.0387, + "step": 11090 + }, + { + "epoch": 0.86, + "grad_norm": 0.545393927578426, + "learning_rate": 2.682595038926314e-06, + "loss": 1.8974, + "step": 11091 + }, + { + "epoch": 0.86, + "grad_norm": 0.5538317197948686, + "learning_rate": 2.6797806022686834e-06, + "loss": 1.8561, + "step": 11092 + }, + { + "epoch": 0.86, + "grad_norm": 0.5974283220348613, + "learning_rate": 2.6769675591540394e-06, + "loss": 2.022, + "step": 11093 + }, + { + "epoch": 0.86, + "grad_norm": 0.5221108121006991, + "learning_rate": 2.674155909758011e-06, + "loss": 1.9385, + "step": 11094 + }, + { + "epoch": 0.86, + "grad_norm": 0.6037087757415877, + "learning_rate": 2.6713456542561575e-06, + "loss": 1.821, + "step": 11095 + }, + { + "epoch": 0.86, + "grad_norm": 0.5604649967570153, + "learning_rate": 2.668536792823917e-06, + "loss": 1.8948, + "step": 11096 + }, + { + "epoch": 0.86, + "grad_norm": 0.6123391002984477, + "learning_rate": 2.665729325636665e-06, + "loss": 1.7996, + "step": 11097 + }, + { + "epoch": 0.86, + "grad_norm": 0.537079244148139, + "learning_rate": 2.6629232528696814e-06, + "loss": 2.0488, + "step": 11098 + }, + { + "epoch": 0.86, + "grad_norm": 0.5524344482271178, + "learning_rate": 2.6601185746981644e-06, + "loss": 1.8586, + "step": 11099 + }, + { + "epoch": 0.86, + "grad_norm": 0.5460961107505126, + "learning_rate": 2.657315291297219e-06, + "loss": 1.9524, + "step": 11100 + }, + { + "epoch": 0.86, + "grad_norm": 0.5326339762230258, + "learning_rate": 2.654513402841863e-06, + "loss": 1.8351, + "step": 11101 + }, + { + "epoch": 0.86, + "grad_norm": 0.5703173537087283, + "learning_rate": 2.651712909507034e-06, + "loss": 2.056, + "step": 11102 + }, + { + "epoch": 0.86, + "grad_norm": 0.5439627309345092, + "learning_rate": 2.6489138114675795e-06, + "loss": 1.8461, + "step": 11103 + }, + { + "epoch": 0.86, + "grad_norm": 0.596770569540055, + "learning_rate": 2.6461161088982555e-06, + "loss": 1.8309, + "step": 11104 + }, + { + "epoch": 0.86, + "grad_norm": 0.5574797731060114, + "learning_rate": 2.643319801973734e-06, + "loss": 1.822, + "step": 11105 + }, + { + "epoch": 0.86, + "grad_norm": 0.5144228978523118, + "learning_rate": 2.6405248908686008e-06, + "loss": 2.0973, + "step": 11106 + }, + { + "epoch": 0.86, + "grad_norm": 0.5613189547104983, + "learning_rate": 2.637731375757352e-06, + "loss": 1.8585, + "step": 11107 + }, + { + "epoch": 0.86, + "grad_norm": 0.5770588056935186, + "learning_rate": 2.634939256814403e-06, + "loss": 1.8428, + "step": 11108 + }, + { + "epoch": 0.86, + "grad_norm": 0.6226534630888009, + "learning_rate": 2.632148534214074e-06, + "loss": 1.8736, + "step": 11109 + }, + { + "epoch": 0.86, + "grad_norm": 0.61972989152355, + "learning_rate": 2.6293592081305994e-06, + "loss": 2.018, + "step": 11110 + }, + { + "epoch": 0.86, + "grad_norm": 0.5801818192202404, + "learning_rate": 2.6265712787381297e-06, + "loss": 1.8536, + "step": 11111 + }, + { + "epoch": 0.86, + "grad_norm": 0.5564235419488669, + "learning_rate": 2.6237847462107305e-06, + "loss": 1.8799, + "step": 11112 + }, + { + "epoch": 0.86, + "grad_norm": 0.5395678256829634, + "learning_rate": 2.6209996107223718e-06, + "loss": 1.9146, + "step": 11113 + }, + { + "epoch": 0.86, + "grad_norm": 0.6091093964342706, + "learning_rate": 2.6182158724469413e-06, + "loss": 2.0359, + "step": 11114 + }, + { + "epoch": 0.86, + "grad_norm": 0.5667788613242687, + "learning_rate": 2.6154335315582423e-06, + "loss": 1.8653, + "step": 11115 + }, + { + "epoch": 0.86, + "grad_norm": 0.5751331994344332, + "learning_rate": 2.612652588229986e-06, + "loss": 1.8962, + "step": 11116 + }, + { + "epoch": 0.86, + "grad_norm": 0.5315577363396237, + "learning_rate": 2.609873042635799e-06, + "loss": 1.8471, + "step": 11117 + }, + { + "epoch": 0.86, + "grad_norm": 0.527203059812803, + "learning_rate": 2.607094894949222e-06, + "loss": 2.0455, + "step": 11118 + }, + { + "epoch": 0.86, + "grad_norm": 0.5589480608292238, + "learning_rate": 2.6043181453436953e-06, + "loss": 1.9284, + "step": 11119 + }, + { + "epoch": 0.86, + "grad_norm": 0.5503262045370972, + "learning_rate": 2.6015427939925923e-06, + "loss": 1.9212, + "step": 11120 + }, + { + "epoch": 0.86, + "grad_norm": 0.5866505630335165, + "learning_rate": 2.598768841069196e-06, + "loss": 1.8739, + "step": 11121 + }, + { + "epoch": 0.86, + "grad_norm": 0.5673783842449909, + "learning_rate": 2.595996286746674e-06, + "loss": 2.0257, + "step": 11122 + }, + { + "epoch": 0.86, + "grad_norm": 0.5654470605003413, + "learning_rate": 2.5932251311981513e-06, + "loss": 1.8353, + "step": 11123 + }, + { + "epoch": 0.86, + "grad_norm": 0.5656856236489031, + "learning_rate": 2.590455374596634e-06, + "loss": 1.9151, + "step": 11124 + }, + { + "epoch": 0.86, + "grad_norm": 0.5199376742175843, + "learning_rate": 2.58768701711504e-06, + "loss": 1.9037, + "step": 11125 + }, + { + "epoch": 0.86, + "grad_norm": 0.5839023796593644, + "learning_rate": 2.584920058926227e-06, + "loss": 2.0741, + "step": 11126 + }, + { + "epoch": 0.86, + "grad_norm": 0.581743288866878, + "learning_rate": 2.5821545002029298e-06, + "loss": 1.865, + "step": 11127 + }, + { + "epoch": 0.86, + "grad_norm": 0.5656500166113706, + "learning_rate": 2.5793903411178268e-06, + "loss": 1.8409, + "step": 11128 + }, + { + "epoch": 0.86, + "grad_norm": 0.5799791227023938, + "learning_rate": 2.5766275818434936e-06, + "loss": 1.8291, + "step": 11129 + }, + { + "epoch": 0.86, + "grad_norm": 0.5750515017697811, + "learning_rate": 2.5738662225524096e-06, + "loss": 2.0255, + "step": 11130 + }, + { + "epoch": 0.86, + "grad_norm": 0.5020256537431663, + "learning_rate": 2.5711062634169975e-06, + "loss": 1.8861, + "step": 11131 + }, + { + "epoch": 0.86, + "grad_norm": 0.542742324875227, + "learning_rate": 2.5683477046095557e-06, + "loss": 1.8566, + "step": 11132 + }, + { + "epoch": 0.86, + "grad_norm": 0.5690100595693123, + "learning_rate": 2.5655905463023127e-06, + "loss": 1.8812, + "step": 11133 + }, + { + "epoch": 0.86, + "grad_norm": 0.5718892803191697, + "learning_rate": 2.5628347886674232e-06, + "loss": 2.055, + "step": 11134 + }, + { + "epoch": 0.86, + "grad_norm": 0.5714850464725609, + "learning_rate": 2.5600804318769295e-06, + "loss": 1.8915, + "step": 11135 + }, + { + "epoch": 0.86, + "grad_norm": 0.5487985890329029, + "learning_rate": 2.5573274761027944e-06, + "loss": 1.8847, + "step": 11136 + }, + { + "epoch": 0.86, + "grad_norm": 0.5385141133625567, + "learning_rate": 2.5545759215169106e-06, + "loss": 1.9681, + "step": 11137 + }, + { + "epoch": 0.86, + "grad_norm": 0.5571988797925721, + "learning_rate": 2.551825768291055e-06, + "loss": 2.0664, + "step": 11138 + }, + { + "epoch": 0.86, + "grad_norm": 0.5812598552715662, + "learning_rate": 2.5490770165969375e-06, + "loss": 1.8618, + "step": 11139 + }, + { + "epoch": 0.86, + "grad_norm": 0.546920929007709, + "learning_rate": 2.546329666606173e-06, + "loss": 1.8501, + "step": 11140 + }, + { + "epoch": 0.86, + "grad_norm": 0.5826021627139846, + "learning_rate": 2.543583718490283e-06, + "loss": 1.8611, + "step": 11141 + }, + { + "epoch": 0.86, + "grad_norm": 0.5394600820489752, + "learning_rate": 2.540839172420725e-06, + "loss": 2.03, + "step": 11142 + }, + { + "epoch": 0.86, + "grad_norm": 0.5624079003924163, + "learning_rate": 2.538096028568837e-06, + "loss": 1.8547, + "step": 11143 + }, + { + "epoch": 0.86, + "grad_norm": 0.5166128284060002, + "learning_rate": 2.5353542871058844e-06, + "loss": 1.8952, + "step": 11144 + }, + { + "epoch": 0.86, + "grad_norm": 0.5802555013582236, + "learning_rate": 2.5326139482030638e-06, + "loss": 1.8134, + "step": 11145 + }, + { + "epoch": 0.86, + "grad_norm": 0.5471388468595368, + "learning_rate": 2.5298750120314437e-06, + "loss": 2.0419, + "step": 11146 + }, + { + "epoch": 0.86, + "grad_norm": 0.5428048842060942, + "learning_rate": 2.527137478762037e-06, + "loss": 1.8362, + "step": 11147 + }, + { + "epoch": 0.86, + "grad_norm": 0.5641489798165573, + "learning_rate": 2.524401348565761e-06, + "loss": 1.7885, + "step": 11148 + }, + { + "epoch": 0.86, + "grad_norm": 0.5509013513429136, + "learning_rate": 2.521666621613439e-06, + "loss": 1.8314, + "step": 11149 + }, + { + "epoch": 0.86, + "grad_norm": 0.49930698100206855, + "learning_rate": 2.5189332980758124e-06, + "loss": 2.026, + "step": 11150 + }, + { + "epoch": 0.86, + "grad_norm": 0.607406993017461, + "learning_rate": 2.5162013781235337e-06, + "loss": 1.9186, + "step": 11151 + }, + { + "epoch": 0.86, + "grad_norm": 0.5829267850116784, + "learning_rate": 2.5134708619271722e-06, + "loss": 1.8205, + "step": 11152 + }, + { + "epoch": 0.86, + "grad_norm": 0.6433692587466404, + "learning_rate": 2.5107417496572e-06, + "loss": 1.8933, + "step": 11153 + }, + { + "epoch": 0.86, + "grad_norm": 0.5270570066146664, + "learning_rate": 2.508014041484008e-06, + "loss": 2.0295, + "step": 11154 + }, + { + "epoch": 0.86, + "grad_norm": 0.5701922517705831, + "learning_rate": 2.5052877375778995e-06, + "loss": 1.8695, + "step": 11155 + }, + { + "epoch": 0.86, + "grad_norm": 0.5192785253260583, + "learning_rate": 2.5025628381090877e-06, + "loss": 1.9016, + "step": 11156 + }, + { + "epoch": 0.86, + "grad_norm": 0.5560929580792876, + "learning_rate": 2.499839343247698e-06, + "loss": 1.8535, + "step": 11157 + }, + { + "epoch": 0.86, + "grad_norm": 0.5930083717615772, + "learning_rate": 2.4971172531637747e-06, + "loss": 2.0489, + "step": 11158 + }, + { + "epoch": 0.86, + "grad_norm": 0.5545289073908967, + "learning_rate": 2.494396568027263e-06, + "loss": 1.8454, + "step": 11159 + }, + { + "epoch": 0.86, + "grad_norm": 0.584851600785787, + "learning_rate": 2.491677288008029e-06, + "loss": 1.8323, + "step": 11160 + }, + { + "epoch": 0.86, + "grad_norm": 0.5569525378811834, + "learning_rate": 2.488959413275849e-06, + "loss": 1.8851, + "step": 11161 + }, + { + "epoch": 0.86, + "grad_norm": 0.5395363285090012, + "learning_rate": 2.4862429440004108e-06, + "loss": 2.0596, + "step": 11162 + }, + { + "epoch": 0.86, + "grad_norm": 0.5730595459105783, + "learning_rate": 2.4835278803513136e-06, + "loss": 1.8707, + "step": 11163 + }, + { + "epoch": 0.86, + "grad_norm": 0.5550563237059293, + "learning_rate": 2.4808142224980733e-06, + "loss": 1.8547, + "step": 11164 + }, + { + "epoch": 0.86, + "grad_norm": 0.5399600083213264, + "learning_rate": 2.478101970610114e-06, + "loss": 1.8785, + "step": 11165 + }, + { + "epoch": 0.86, + "grad_norm": 0.6212810292663378, + "learning_rate": 2.475391124856763e-06, + "loss": 2.065, + "step": 11166 + }, + { + "epoch": 0.86, + "grad_norm": 0.5650863642847908, + "learning_rate": 2.4726816854072797e-06, + "loss": 1.8569, + "step": 11167 + }, + { + "epoch": 0.86, + "grad_norm": 0.537784248189289, + "learning_rate": 2.469973652430832e-06, + "loss": 1.913, + "step": 11168 + }, + { + "epoch": 0.86, + "grad_norm": 0.5544963826747129, + "learning_rate": 2.467267026096473e-06, + "loss": 1.8637, + "step": 11169 + }, + { + "epoch": 0.86, + "grad_norm": 0.5882843999607376, + "learning_rate": 2.464561806573204e-06, + "loss": 2.0303, + "step": 11170 + }, + { + "epoch": 0.86, + "grad_norm": 0.5344342368678255, + "learning_rate": 2.4618579940299256e-06, + "loss": 1.866, + "step": 11171 + }, + { + "epoch": 0.86, + "grad_norm": 0.5335356737719169, + "learning_rate": 2.459155588635434e-06, + "loss": 1.8494, + "step": 11172 + }, + { + "epoch": 0.86, + "grad_norm": 0.5591720025866959, + "learning_rate": 2.4564545905584657e-06, + "loss": 1.8787, + "step": 11173 + }, + { + "epoch": 0.86, + "grad_norm": 0.5624590075536152, + "learning_rate": 2.453754999967639e-06, + "loss": 2.0874, + "step": 11174 + }, + { + "epoch": 0.86, + "grad_norm": 0.5660977564009094, + "learning_rate": 2.451056817031516e-06, + "loss": 1.8929, + "step": 11175 + }, + { + "epoch": 0.86, + "grad_norm": 0.5927977742228033, + "learning_rate": 2.4483600419185542e-06, + "loss": 1.8572, + "step": 11176 + }, + { + "epoch": 0.86, + "grad_norm": 0.5549569209061838, + "learning_rate": 2.4456646747971102e-06, + "loss": 1.867, + "step": 11177 + }, + { + "epoch": 0.86, + "grad_norm": 0.5562491030038597, + "learning_rate": 2.4429707158354853e-06, + "loss": 2.0772, + "step": 11178 + }, + { + "epoch": 0.86, + "grad_norm": 0.6960403182335471, + "learning_rate": 2.440278165201859e-06, + "loss": 1.8104, + "step": 11179 + }, + { + "epoch": 0.86, + "grad_norm": 0.5572973378499211, + "learning_rate": 2.4375870230643417e-06, + "loss": 1.8433, + "step": 11180 + }, + { + "epoch": 0.86, + "grad_norm": 0.5408198478186312, + "learning_rate": 2.434897289590965e-06, + "loss": 1.9341, + "step": 11181 + }, + { + "epoch": 0.86, + "grad_norm": 0.5521888656464179, + "learning_rate": 2.432208964949642e-06, + "loss": 2.0475, + "step": 11182 + }, + { + "epoch": 0.86, + "grad_norm": 0.5415081898986202, + "learning_rate": 2.429522049308222e-06, + "loss": 1.8598, + "step": 11183 + }, + { + "epoch": 0.86, + "grad_norm": 0.5739269056097686, + "learning_rate": 2.4268365428344736e-06, + "loss": 1.8756, + "step": 11184 + }, + { + "epoch": 0.86, + "grad_norm": 0.5607125664400467, + "learning_rate": 2.4241524456960457e-06, + "loss": 1.7849, + "step": 11185 + }, + { + "epoch": 0.86, + "grad_norm": 0.5317577450246797, + "learning_rate": 2.4214697580605237e-06, + "loss": 2.084, + "step": 11186 + }, + { + "epoch": 0.86, + "grad_norm": 0.49680180249323114, + "learning_rate": 2.418788480095402e-06, + "loss": 1.9178, + "step": 11187 + }, + { + "epoch": 0.86, + "grad_norm": 0.5510305202390415, + "learning_rate": 2.416108611968079e-06, + "loss": 1.8763, + "step": 11188 + }, + { + "epoch": 0.86, + "grad_norm": 0.5638856389418918, + "learning_rate": 2.4134301538458747e-06, + "loss": 1.8331, + "step": 11189 + }, + { + "epoch": 0.86, + "grad_norm": 0.5402537285074656, + "learning_rate": 2.4107531058960104e-06, + "loss": 2.0089, + "step": 11190 + }, + { + "epoch": 0.86, + "grad_norm": 0.5493474489942526, + "learning_rate": 2.408077468285627e-06, + "loss": 1.8438, + "step": 11191 + }, + { + "epoch": 0.86, + "grad_norm": 0.5497207178117967, + "learning_rate": 2.405403241181786e-06, + "loss": 1.8629, + "step": 11192 + }, + { + "epoch": 0.86, + "grad_norm": 0.5228326010400871, + "learning_rate": 2.402730424751437e-06, + "loss": 1.9137, + "step": 11193 + }, + { + "epoch": 0.86, + "grad_norm": 0.5415726696041544, + "learning_rate": 2.4000590191614574e-06, + "loss": 2.0364, + "step": 11194 + }, + { + "epoch": 0.86, + "grad_norm": 0.5532820321452955, + "learning_rate": 2.397389024578639e-06, + "loss": 1.8601, + "step": 11195 + }, + { + "epoch": 0.86, + "grad_norm": 0.5767398956765146, + "learning_rate": 2.394720441169676e-06, + "loss": 1.895, + "step": 11196 + }, + { + "epoch": 0.86, + "grad_norm": 0.5523924828731, + "learning_rate": 2.3920532691011825e-06, + "loss": 1.8137, + "step": 11197 + }, + { + "epoch": 0.86, + "grad_norm": 0.5371665340955613, + "learning_rate": 2.389387508539678e-06, + "loss": 2.0274, + "step": 11198 + }, + { + "epoch": 0.86, + "grad_norm": 0.5144604742738368, + "learning_rate": 2.3867231596515964e-06, + "loss": 1.9241, + "step": 11199 + }, + { + "epoch": 0.86, + "grad_norm": 0.563427837684983, + "learning_rate": 2.3840602226032875e-06, + "loss": 1.8656, + "step": 11200 + }, + { + "epoch": 0.86, + "grad_norm": 0.543917409932888, + "learning_rate": 2.3813986975610077e-06, + "loss": 1.8236, + "step": 11201 + }, + { + "epoch": 0.86, + "grad_norm": 0.5466573688013141, + "learning_rate": 2.378738584690926e-06, + "loss": 2.0644, + "step": 11202 + }, + { + "epoch": 0.86, + "grad_norm": 0.5434375361939563, + "learning_rate": 2.3760798841591243e-06, + "loss": 1.8917, + "step": 11203 + }, + { + "epoch": 0.86, + "grad_norm": 0.5573268266755599, + "learning_rate": 2.373422596131597e-06, + "loss": 1.834, + "step": 11204 + }, + { + "epoch": 0.86, + "grad_norm": 0.5551949520423707, + "learning_rate": 2.3707667207742506e-06, + "loss": 1.8884, + "step": 11205 + }, + { + "epoch": 0.86, + "grad_norm": 0.5108744672462963, + "learning_rate": 2.3681122582529e-06, + "loss": 1.9497, + "step": 11206 + }, + { + "epoch": 0.86, + "grad_norm": 0.5636372601658254, + "learning_rate": 2.3654592087332755e-06, + "loss": 2.056, + "step": 11207 + }, + { + "epoch": 0.86, + "grad_norm": 0.5650551694457353, + "learning_rate": 2.362807572381018e-06, + "loss": 1.8548, + "step": 11208 + }, + { + "epoch": 0.86, + "grad_norm": 0.5544478434957251, + "learning_rate": 2.36015734936168e-06, + "loss": 1.8559, + "step": 11209 + }, + { + "epoch": 0.86, + "grad_norm": 0.5408471781778638, + "learning_rate": 2.3575085398407247e-06, + "loss": 2.0323, + "step": 11210 + }, + { + "epoch": 0.86, + "grad_norm": 0.549856534880969, + "learning_rate": 2.3548611439835305e-06, + "loss": 1.8444, + "step": 11211 + }, + { + "epoch": 0.86, + "grad_norm": 0.5227804950916602, + "learning_rate": 2.3522151619553875e-06, + "loss": 1.944, + "step": 11212 + }, + { + "epoch": 0.87, + "grad_norm": 0.6072816110239908, + "learning_rate": 2.34957059392148e-06, + "loss": 1.8504, + "step": 11213 + }, + { + "epoch": 0.87, + "grad_norm": 0.5289102551760602, + "learning_rate": 2.3469274400469366e-06, + "loss": 2.0188, + "step": 11214 + }, + { + "epoch": 0.87, + "grad_norm": 0.5399085851550032, + "learning_rate": 2.344285700496779e-06, + "loss": 1.8554, + "step": 11215 + }, + { + "epoch": 0.87, + "grad_norm": 0.5679989989771184, + "learning_rate": 2.3416453754359273e-06, + "loss": 1.8708, + "step": 11216 + }, + { + "epoch": 0.87, + "grad_norm": 0.5481679765093012, + "learning_rate": 2.3390064650292447e-06, + "loss": 1.8268, + "step": 11217 + }, + { + "epoch": 0.87, + "grad_norm": 0.5532323108883072, + "learning_rate": 2.336368969441485e-06, + "loss": 1.9146, + "step": 11218 + }, + { + "epoch": 0.87, + "grad_norm": 0.5825307857881219, + "learning_rate": 2.333732888837306e-06, + "loss": 2.0575, + "step": 11219 + }, + { + "epoch": 0.87, + "grad_norm": 0.5613955901073903, + "learning_rate": 2.3310982233813063e-06, + "loss": 1.8782, + "step": 11220 + }, + { + "epoch": 0.87, + "grad_norm": 0.5576089643400929, + "learning_rate": 2.3284649732379677e-06, + "loss": 1.8228, + "step": 11221 + }, + { + "epoch": 0.87, + "grad_norm": 0.5413130156732641, + "learning_rate": 2.3258331385716926e-06, + "loss": 2.0829, + "step": 11222 + }, + { + "epoch": 0.87, + "grad_norm": 0.625205475949688, + "learning_rate": 2.3232027195468104e-06, + "loss": 1.8306, + "step": 11223 + }, + { + "epoch": 0.87, + "grad_norm": 0.5702348429046165, + "learning_rate": 2.3205737163275343e-06, + "loss": 1.9641, + "step": 11224 + }, + { + "epoch": 0.87, + "grad_norm": 0.5606726303290873, + "learning_rate": 2.3179461290780137e-06, + "loss": 1.8508, + "step": 11225 + }, + { + "epoch": 0.87, + "grad_norm": 0.5429020737250992, + "learning_rate": 2.3153199579623e-06, + "loss": 2.0305, + "step": 11226 + }, + { + "epoch": 0.87, + "grad_norm": 0.5759606799846586, + "learning_rate": 2.3126952031443467e-06, + "loss": 1.8796, + "step": 11227 + }, + { + "epoch": 0.87, + "grad_norm": 0.5431037186086397, + "learning_rate": 2.310071864788041e-06, + "loss": 1.8805, + "step": 11228 + }, + { + "epoch": 0.87, + "grad_norm": 0.5793859567745122, + "learning_rate": 2.307449943057155e-06, + "loss": 1.8562, + "step": 11229 + }, + { + "epoch": 0.87, + "grad_norm": 0.5391712122048643, + "learning_rate": 2.3048294381153885e-06, + "loss": 2.048, + "step": 11230 + }, + { + "epoch": 0.87, + "grad_norm": 0.5368238269759552, + "learning_rate": 2.3022103501263657e-06, + "loss": 1.9706, + "step": 11231 + }, + { + "epoch": 0.87, + "grad_norm": 0.5889449249856246, + "learning_rate": 2.299592679253587e-06, + "loss": 1.805, + "step": 11232 + }, + { + "epoch": 0.87, + "grad_norm": 0.5723530325559715, + "learning_rate": 2.2969764256604965e-06, + "loss": 1.8663, + "step": 11233 + }, + { + "epoch": 0.87, + "grad_norm": 0.5322989001498653, + "learning_rate": 2.2943615895104325e-06, + "loss": 2.0234, + "step": 11234 + }, + { + "epoch": 0.87, + "grad_norm": 0.5472885639284986, + "learning_rate": 2.291748170966651e-06, + "loss": 1.8409, + "step": 11235 + }, + { + "epoch": 0.87, + "grad_norm": 0.5765461213881032, + "learning_rate": 2.2891361701923213e-06, + "loss": 1.8413, + "step": 11236 + }, + { + "epoch": 0.87, + "grad_norm": 0.5162892566322802, + "learning_rate": 2.2865255873505185e-06, + "loss": 1.9394, + "step": 11237 + }, + { + "epoch": 0.87, + "grad_norm": 0.5544108274293472, + "learning_rate": 2.2839164226042266e-06, + "loss": 1.8491, + "step": 11238 + }, + { + "epoch": 0.87, + "grad_norm": 0.5530816195654853, + "learning_rate": 2.2813086761163614e-06, + "loss": 2.0496, + "step": 11239 + }, + { + "epoch": 0.87, + "grad_norm": 0.5559668185007612, + "learning_rate": 2.2787023480497246e-06, + "loss": 1.863, + "step": 11240 + }, + { + "epoch": 0.87, + "grad_norm": 0.5561916753836837, + "learning_rate": 2.2760974385670377e-06, + "loss": 1.8567, + "step": 11241 + }, + { + "epoch": 0.87, + "grad_norm": 0.5433597473236561, + "learning_rate": 2.2734939478309432e-06, + "loss": 2.0422, + "step": 11242 + }, + { + "epoch": 0.87, + "grad_norm": 0.5310572430934506, + "learning_rate": 2.270891876003983e-06, + "loss": 1.9369, + "step": 11243 + }, + { + "epoch": 0.87, + "grad_norm": 0.5518141067798691, + "learning_rate": 2.2682912232486143e-06, + "loss": 1.776, + "step": 11244 + }, + { + "epoch": 0.87, + "grad_norm": 0.5677971958347475, + "learning_rate": 2.2656919897272115e-06, + "loss": 1.8395, + "step": 11245 + }, + { + "epoch": 0.87, + "grad_norm": 0.537189654853249, + "learning_rate": 2.2630941756020512e-06, + "loss": 2.0076, + "step": 11246 + }, + { + "epoch": 0.87, + "grad_norm": 0.5532704582541089, + "learning_rate": 2.2604977810353286e-06, + "loss": 1.8821, + "step": 11247 + }, + { + "epoch": 0.87, + "grad_norm": 0.5453766876422301, + "learning_rate": 2.2579028061891465e-06, + "loss": 1.8514, + "step": 11248 + }, + { + "epoch": 0.87, + "grad_norm": 0.5185142381893495, + "learning_rate": 2.2553092512255153e-06, + "loss": 1.9292, + "step": 11249 + }, + { + "epoch": 0.87, + "grad_norm": 0.5655559500521713, + "learning_rate": 2.2527171163063687e-06, + "loss": 1.8736, + "step": 11250 + }, + { + "epoch": 0.87, + "grad_norm": 0.5403050542895828, + "learning_rate": 2.2501264015935387e-06, + "loss": 2.0669, + "step": 11251 + }, + { + "epoch": 0.87, + "grad_norm": 0.5315193236127808, + "learning_rate": 2.2475371072487762e-06, + "loss": 1.7833, + "step": 11252 + }, + { + "epoch": 0.87, + "grad_norm": 0.5524684059112607, + "learning_rate": 2.2449492334337436e-06, + "loss": 1.8304, + "step": 11253 + }, + { + "epoch": 0.87, + "grad_norm": 0.5562949745471019, + "learning_rate": 2.2423627803100145e-06, + "loss": 2.0566, + "step": 11254 + }, + { + "epoch": 0.87, + "grad_norm": 0.5431313701969532, + "learning_rate": 2.239777748039057e-06, + "loss": 1.977, + "step": 11255 + }, + { + "epoch": 0.87, + "grad_norm": 0.5889820961085118, + "learning_rate": 2.237194136782281e-06, + "loss": 1.8436, + "step": 11256 + }, + { + "epoch": 0.87, + "grad_norm": 0.5430057266518863, + "learning_rate": 2.234611946700987e-06, + "loss": 1.8809, + "step": 11257 + }, + { + "epoch": 0.87, + "grad_norm": 0.5540410499123491, + "learning_rate": 2.2320311779563924e-06, + "loss": 1.8056, + "step": 11258 + }, + { + "epoch": 0.87, + "grad_norm": 0.5392564061816609, + "learning_rate": 2.229451830709625e-06, + "loss": 2.0326, + "step": 11259 + }, + { + "epoch": 0.87, + "grad_norm": 0.5704134989746348, + "learning_rate": 2.2268739051217236e-06, + "loss": 1.8539, + "step": 11260 + }, + { + "epoch": 0.87, + "grad_norm": 0.5519489409415285, + "learning_rate": 2.2242974013536365e-06, + "loss": 1.8195, + "step": 11261 + }, + { + "epoch": 0.87, + "grad_norm": 0.5405882761437483, + "learning_rate": 2.221722319566233e-06, + "loss": 1.9229, + "step": 11262 + }, + { + "epoch": 0.87, + "grad_norm": 0.5465540579041684, + "learning_rate": 2.2191486599202726e-06, + "loss": 2.0346, + "step": 11263 + }, + { + "epoch": 0.87, + "grad_norm": 0.5618174097076192, + "learning_rate": 2.2165764225764524e-06, + "loss": 1.805, + "step": 11264 + }, + { + "epoch": 0.87, + "grad_norm": 0.5387537874727099, + "learning_rate": 2.2140056076953658e-06, + "loss": 1.8417, + "step": 11265 + }, + { + "epoch": 0.87, + "grad_norm": 0.557298248759387, + "learning_rate": 2.211436215437507e-06, + "loss": 2.0192, + "step": 11266 + }, + { + "epoch": 0.87, + "grad_norm": 0.5592222717797685, + "learning_rate": 2.2088682459633137e-06, + "loss": 1.8461, + "step": 11267 + }, + { + "epoch": 0.87, + "grad_norm": 0.5176376167307225, + "learning_rate": 2.206301699433097e-06, + "loss": 1.9796, + "step": 11268 + }, + { + "epoch": 0.87, + "grad_norm": 0.5484814386352039, + "learning_rate": 2.203736576007104e-06, + "loss": 1.8503, + "step": 11269 + }, + { + "epoch": 0.87, + "grad_norm": 0.5614535391894523, + "learning_rate": 2.201172875845492e-06, + "loss": 1.8505, + "step": 11270 + }, + { + "epoch": 0.87, + "grad_norm": 0.5659117374812458, + "learning_rate": 2.1986105991083084e-06, + "loss": 2.0148, + "step": 11271 + }, + { + "epoch": 0.87, + "grad_norm": 0.5308201278777612, + "learning_rate": 2.196049745955539e-06, + "loss": 1.8477, + "step": 11272 + }, + { + "epoch": 0.87, + "grad_norm": 0.6116829807195332, + "learning_rate": 2.19349031654707e-06, + "loss": 1.8185, + "step": 11273 + }, + { + "epoch": 0.87, + "grad_norm": 0.5429642187792052, + "learning_rate": 2.190932311042684e-06, + "loss": 1.8634, + "step": 11274 + }, + { + "epoch": 0.87, + "grad_norm": 0.5489604407823094, + "learning_rate": 2.1883757296021045e-06, + "loss": 2.0837, + "step": 11275 + }, + { + "epoch": 0.87, + "grad_norm": 0.5470825973465622, + "learning_rate": 2.1858205723849362e-06, + "loss": 1.9031, + "step": 11276 + }, + { + "epoch": 0.87, + "grad_norm": 0.5579022900911772, + "learning_rate": 2.1832668395507074e-06, + "loss": 1.8191, + "step": 11277 + }, + { + "epoch": 0.87, + "grad_norm": 0.572529990047102, + "learning_rate": 2.180714531258873e-06, + "loss": 2.0263, + "step": 11278 + }, + { + "epoch": 0.87, + "grad_norm": 0.5616079358340998, + "learning_rate": 2.178163647668771e-06, + "loss": 1.8397, + "step": 11279 + }, + { + "epoch": 0.87, + "grad_norm": 0.5269903301876449, + "learning_rate": 2.1756141889396647e-06, + "loss": 1.9306, + "step": 11280 + }, + { + "epoch": 0.87, + "grad_norm": 0.5694279824480117, + "learning_rate": 2.1730661552307323e-06, + "loss": 1.8678, + "step": 11281 + }, + { + "epoch": 0.87, + "grad_norm": 0.5545374310775893, + "learning_rate": 2.1705195467010525e-06, + "loss": 1.8395, + "step": 11282 + }, + { + "epoch": 0.87, + "grad_norm": 0.5872291465127009, + "learning_rate": 2.1679743635096255e-06, + "loss": 2.0756, + "step": 11283 + }, + { + "epoch": 0.87, + "grad_norm": 0.5864343754941461, + "learning_rate": 2.1654306058153522e-06, + "loss": 1.8858, + "step": 11284 + }, + { + "epoch": 0.87, + "grad_norm": 0.565070031796905, + "learning_rate": 2.1628882737770555e-06, + "loss": 1.8763, + "step": 11285 + }, + { + "epoch": 0.87, + "grad_norm": 0.5125167296322625, + "learning_rate": 2.1603473675534614e-06, + "loss": 1.9787, + "step": 11286 + }, + { + "epoch": 0.87, + "grad_norm": 0.5446003377478872, + "learning_rate": 2.1578078873032063e-06, + "loss": 2.0932, + "step": 11287 + }, + { + "epoch": 0.87, + "grad_norm": 0.568133999438589, + "learning_rate": 2.1552698331848447e-06, + "loss": 1.9141, + "step": 11288 + }, + { + "epoch": 0.87, + "grad_norm": 0.5488982915893751, + "learning_rate": 2.1527332053568357e-06, + "loss": 1.8652, + "step": 11289 + }, + { + "epoch": 0.87, + "grad_norm": 0.5873104977773866, + "learning_rate": 2.150198003977552e-06, + "loss": 1.8005, + "step": 11290 + }, + { + "epoch": 0.87, + "grad_norm": 0.52261890273868, + "learning_rate": 2.1476642292052763e-06, + "loss": 2.047, + "step": 11291 + }, + { + "epoch": 0.87, + "grad_norm": 0.5647540322864588, + "learning_rate": 2.1451318811982006e-06, + "loss": 1.8114, + "step": 11292 + }, + { + "epoch": 0.87, + "grad_norm": 0.535383911330196, + "learning_rate": 2.1426009601144326e-06, + "loss": 1.959, + "step": 11293 + }, + { + "epoch": 0.87, + "grad_norm": 0.537429994868881, + "learning_rate": 2.1400714661119897e-06, + "loss": 1.7978, + "step": 11294 + }, + { + "epoch": 0.87, + "grad_norm": 0.5743541758929745, + "learning_rate": 2.137543399348793e-06, + "loss": 2.0562, + "step": 11295 + }, + { + "epoch": 0.87, + "grad_norm": 0.5513022731371093, + "learning_rate": 2.135016759982683e-06, + "loss": 1.8234, + "step": 11296 + }, + { + "epoch": 0.87, + "grad_norm": 0.5576755971816011, + "learning_rate": 2.1324915481714113e-06, + "loss": 1.8453, + "step": 11297 + }, + { + "epoch": 0.87, + "grad_norm": 0.559126085397488, + "learning_rate": 2.129967764072632e-06, + "loss": 2.0561, + "step": 11298 + }, + { + "epoch": 0.87, + "grad_norm": 0.5393568575597373, + "learning_rate": 2.127445407843917e-06, + "loss": 1.9609, + "step": 11299 + }, + { + "epoch": 0.87, + "grad_norm": 0.5512449096638727, + "learning_rate": 2.1249244796427508e-06, + "loss": 1.8465, + "step": 11300 + }, + { + "epoch": 0.87, + "grad_norm": 0.5332588795434223, + "learning_rate": 2.1224049796265246e-06, + "loss": 1.7936, + "step": 11301 + }, + { + "epoch": 0.87, + "grad_norm": 0.5378823733753211, + "learning_rate": 2.119886907952531e-06, + "loss": 1.8513, + "step": 11302 + }, + { + "epoch": 0.87, + "grad_norm": 0.5529996483875359, + "learning_rate": 2.117370264777996e-06, + "loss": 2.0297, + "step": 11303 + }, + { + "epoch": 0.87, + "grad_norm": 0.5598368815471738, + "learning_rate": 2.1148550502600396e-06, + "loss": 1.8486, + "step": 11304 + }, + { + "epoch": 0.87, + "grad_norm": 0.49884275269143374, + "learning_rate": 2.112341264555698e-06, + "loss": 1.9286, + "step": 11305 + }, + { + "epoch": 0.87, + "grad_norm": 0.5471970783075354, + "learning_rate": 2.1098289078219147e-06, + "loss": 1.8486, + "step": 11306 + }, + { + "epoch": 0.87, + "grad_norm": 0.5603608659261473, + "learning_rate": 2.107317980215548e-06, + "loss": 2.073, + "step": 11307 + }, + { + "epoch": 0.87, + "grad_norm": 0.5838193588011032, + "learning_rate": 2.104808481893367e-06, + "loss": 1.8544, + "step": 11308 + }, + { + "epoch": 0.87, + "grad_norm": 0.6172612241126606, + "learning_rate": 2.102300413012051e-06, + "loss": 1.8497, + "step": 11309 + }, + { + "epoch": 0.87, + "grad_norm": 0.5350741290922535, + "learning_rate": 2.099793773728176e-06, + "loss": 2.0337, + "step": 11310 + }, + { + "epoch": 0.87, + "grad_norm": 0.5498345428351682, + "learning_rate": 2.09728856419826e-06, + "loss": 1.954, + "step": 11311 + }, + { + "epoch": 0.87, + "grad_norm": 0.5452153391717481, + "learning_rate": 2.094784784578707e-06, + "loss": 1.9139, + "step": 11312 + }, + { + "epoch": 0.87, + "grad_norm": 0.5623295867506422, + "learning_rate": 2.09228243502583e-06, + "loss": 1.8498, + "step": 11313 + }, + { + "epoch": 0.87, + "grad_norm": 0.5969262846712321, + "learning_rate": 2.089781515695877e-06, + "loss": 1.8558, + "step": 11314 + }, + { + "epoch": 0.87, + "grad_norm": 0.5631674729019587, + "learning_rate": 2.0872820267449745e-06, + "loss": 2.0479, + "step": 11315 + }, + { + "epoch": 0.87, + "grad_norm": 0.5304841242386363, + "learning_rate": 2.0847839683291792e-06, + "loss": 1.8174, + "step": 11316 + }, + { + "epoch": 0.87, + "grad_norm": 0.552456689601667, + "learning_rate": 2.082287340604469e-06, + "loss": 1.9302, + "step": 11317 + }, + { + "epoch": 0.87, + "grad_norm": 0.5686911682245245, + "learning_rate": 2.0797921437267055e-06, + "loss": 1.8951, + "step": 11318 + }, + { + "epoch": 0.87, + "grad_norm": 0.5544318525301177, + "learning_rate": 2.077298377851669e-06, + "loss": 2.0505, + "step": 11319 + }, + { + "epoch": 0.87, + "grad_norm": 0.572812764271841, + "learning_rate": 2.0748060431350747e-06, + "loss": 1.8521, + "step": 11320 + }, + { + "epoch": 0.87, + "grad_norm": 0.5645169750106035, + "learning_rate": 2.0723151397325086e-06, + "loss": 1.8578, + "step": 11321 + }, + { + "epoch": 0.87, + "grad_norm": 0.5966568233554009, + "learning_rate": 2.0698256677995077e-06, + "loss": 1.8113, + "step": 11322 + }, + { + "epoch": 0.87, + "grad_norm": 0.5262893152324377, + "learning_rate": 2.0673376274914836e-06, + "loss": 2.0343, + "step": 11323 + }, + { + "epoch": 0.87, + "grad_norm": 0.5323368165568843, + "learning_rate": 2.0648510189637785e-06, + "loss": 1.9887, + "step": 11324 + }, + { + "epoch": 0.87, + "grad_norm": 0.5478733957343552, + "learning_rate": 2.0623658423716496e-06, + "loss": 1.8484, + "step": 11325 + }, + { + "epoch": 0.87, + "grad_norm": 0.5558995215599773, + "learning_rate": 2.05988209787025e-06, + "loss": 1.8338, + "step": 11326 + }, + { + "epoch": 0.87, + "grad_norm": 0.5793835482879174, + "learning_rate": 2.057399785614653e-06, + "loss": 2.0891, + "step": 11327 + }, + { + "epoch": 0.87, + "grad_norm": 0.544927015199699, + "learning_rate": 2.0549189057598337e-06, + "loss": 1.8477, + "step": 11328 + }, + { + "epoch": 0.87, + "grad_norm": 0.5780353365902464, + "learning_rate": 2.052439458460689e-06, + "loss": 1.8575, + "step": 11329 + }, + { + "epoch": 0.87, + "grad_norm": 0.5453804382469871, + "learning_rate": 2.049961443872023e-06, + "loss": 1.8991, + "step": 11330 + }, + { + "epoch": 0.87, + "grad_norm": 0.5523543164279991, + "learning_rate": 2.047484862148541e-06, + "loss": 2.0741, + "step": 11331 + }, + { + "epoch": 0.87, + "grad_norm": 0.576603720609261, + "learning_rate": 2.045009713444873e-06, + "loss": 1.8801, + "step": 11332 + }, + { + "epoch": 0.87, + "grad_norm": 0.5861415410715166, + "learning_rate": 2.042535997915551e-06, + "loss": 1.8513, + "step": 11333 + }, + { + "epoch": 0.87, + "grad_norm": 0.5689708095546199, + "learning_rate": 2.0400637157150156e-06, + "loss": 1.8432, + "step": 11334 + }, + { + "epoch": 0.87, + "grad_norm": 0.567836307375821, + "learning_rate": 2.0375928669976256e-06, + "loss": 2.0546, + "step": 11335 + }, + { + "epoch": 0.87, + "grad_norm": 0.5188205952028175, + "learning_rate": 2.035123451917645e-06, + "loss": 1.9323, + "step": 11336 + }, + { + "epoch": 0.87, + "grad_norm": 0.5630712092179299, + "learning_rate": 2.0326554706292465e-06, + "loss": 1.862, + "step": 11337 + }, + { + "epoch": 0.87, + "grad_norm": 0.5653816110602845, + "learning_rate": 2.030188923286522e-06, + "loss": 1.8151, + "step": 11338 + }, + { + "epoch": 0.87, + "grad_norm": 0.5406949830404786, + "learning_rate": 2.0277238100434666e-06, + "loss": 2.0524, + "step": 11339 + }, + { + "epoch": 0.87, + "grad_norm": 0.5676194855851181, + "learning_rate": 2.0252601310539825e-06, + "loss": 1.8532, + "step": 11340 + }, + { + "epoch": 0.87, + "grad_norm": 0.558816523811979, + "learning_rate": 2.0227978864718944e-06, + "loss": 1.8615, + "step": 11341 + }, + { + "epoch": 0.88, + "grad_norm": 0.5446730467179306, + "learning_rate": 2.0203370764509262e-06, + "loss": 1.9245, + "step": 11342 + }, + { + "epoch": 0.88, + "grad_norm": 0.5594999543495495, + "learning_rate": 2.0178777011447157e-06, + "loss": 2.0295, + "step": 11343 + }, + { + "epoch": 0.88, + "grad_norm": 0.563304431702629, + "learning_rate": 2.0154197607068133e-06, + "loss": 1.8574, + "step": 11344 + }, + { + "epoch": 0.88, + "grad_norm": 0.5596852370277634, + "learning_rate": 2.0129632552906787e-06, + "loss": 1.8692, + "step": 11345 + }, + { + "epoch": 0.88, + "grad_norm": 0.5440368394663957, + "learning_rate": 2.0105081850496836e-06, + "loss": 1.9005, + "step": 11346 + }, + { + "epoch": 0.88, + "grad_norm": 0.556728443812638, + "learning_rate": 2.0080545501371033e-06, + "loss": 2.077, + "step": 11347 + }, + { + "epoch": 0.88, + "grad_norm": 0.5515716062235766, + "learning_rate": 2.0056023507061343e-06, + "loss": 1.9558, + "step": 11348 + }, + { + "epoch": 0.88, + "grad_norm": 0.5791199292798315, + "learning_rate": 2.0031515869098655e-06, + "loss": 1.8771, + "step": 11349 + }, + { + "epoch": 0.88, + "grad_norm": 0.5434476139362671, + "learning_rate": 2.000702258901324e-06, + "loss": 1.8385, + "step": 11350 + }, + { + "epoch": 0.88, + "grad_norm": 0.559490711483806, + "learning_rate": 1.9982543668334244e-06, + "loss": 2.0258, + "step": 11351 + }, + { + "epoch": 0.88, + "grad_norm": 0.548634427973004, + "learning_rate": 1.9958079108589906e-06, + "loss": 1.851, + "step": 11352 + }, + { + "epoch": 0.88, + "grad_norm": 0.5698944808313946, + "learning_rate": 1.993362891130779e-06, + "loss": 1.8042, + "step": 11353 + }, + { + "epoch": 0.88, + "grad_norm": 0.5711369165580861, + "learning_rate": 1.990919307801434e-06, + "loss": 1.8625, + "step": 11354 + }, + { + "epoch": 0.88, + "grad_norm": 0.52956372077662, + "learning_rate": 1.9884771610235193e-06, + "loss": 2.0948, + "step": 11355 + }, + { + "epoch": 0.88, + "grad_norm": 0.5867014579665674, + "learning_rate": 1.986036450949516e-06, + "loss": 1.8711, + "step": 11356 + }, + { + "epoch": 0.88, + "grad_norm": 0.5320978489803737, + "learning_rate": 1.9835971777317913e-06, + "loss": 1.8458, + "step": 11357 + }, + { + "epoch": 0.88, + "grad_norm": 0.5656434533262092, + "learning_rate": 1.981159341522651e-06, + "loss": 1.8417, + "step": 11358 + }, + { + "epoch": 0.88, + "grad_norm": 0.5370699392084854, + "learning_rate": 1.9787229424743035e-06, + "loss": 2.006, + "step": 11359 + }, + { + "epoch": 0.88, + "grad_norm": 0.5621554495264767, + "learning_rate": 1.9762879807388444e-06, + "loss": 1.8233, + "step": 11360 + }, + { + "epoch": 0.88, + "grad_norm": 0.5114883561620484, + "learning_rate": 1.9738544564683214e-06, + "loss": 1.9193, + "step": 11361 + }, + { + "epoch": 0.88, + "grad_norm": 0.5634356465972353, + "learning_rate": 1.9714223698146543e-06, + "loss": 1.7716, + "step": 11362 + }, + { + "epoch": 0.88, + "grad_norm": 0.5413141255429869, + "learning_rate": 1.9689917209296887e-06, + "loss": 2.0252, + "step": 11363 + }, + { + "epoch": 0.88, + "grad_norm": 0.5646520316611936, + "learning_rate": 1.9665625099651917e-06, + "loss": 1.8821, + "step": 11364 + }, + { + "epoch": 0.88, + "grad_norm": 0.5588443832031327, + "learning_rate": 1.9641347370728174e-06, + "loss": 1.8671, + "step": 11365 + }, + { + "epoch": 0.88, + "grad_norm": 0.5776577099415924, + "learning_rate": 1.961708402404139e-06, + "loss": 1.8839, + "step": 11366 + }, + { + "epoch": 0.88, + "grad_norm": 0.4730394318420244, + "learning_rate": 1.9592835061106595e-06, + "loss": 2.0708, + "step": 11367 + }, + { + "epoch": 0.88, + "grad_norm": 0.5601752312403557, + "learning_rate": 1.9568600483437533e-06, + "loss": 1.8701, + "step": 11368 + }, + { + "epoch": 0.88, + "grad_norm": 0.5624520862470677, + "learning_rate": 1.9544380292547486e-06, + "loss": 1.8635, + "step": 11369 + }, + { + "epoch": 0.88, + "grad_norm": 0.5524109785465701, + "learning_rate": 1.9520174489948473e-06, + "loss": 1.8733, + "step": 11370 + }, + { + "epoch": 0.88, + "grad_norm": 0.5763411800158951, + "learning_rate": 1.9495983077151726e-06, + "loss": 2.063, + "step": 11371 + }, + { + "epoch": 0.88, + "grad_norm": 0.5834554644109189, + "learning_rate": 1.947180605566781e-06, + "loss": 1.8621, + "step": 11372 + }, + { + "epoch": 0.88, + "grad_norm": 0.553018427423777, + "learning_rate": 1.9447643427006e-06, + "loss": 1.921, + "step": 11373 + }, + { + "epoch": 0.88, + "grad_norm": 0.5554544631154585, + "learning_rate": 1.9423495192674945e-06, + "loss": 1.8736, + "step": 11374 + }, + { + "epoch": 0.88, + "grad_norm": 0.5592957752143144, + "learning_rate": 1.9399361354182354e-06, + "loss": 2.0579, + "step": 11375 + }, + { + "epoch": 0.88, + "grad_norm": 0.5917400585614372, + "learning_rate": 1.937524191303497e-06, + "loss": 1.8608, + "step": 11376 + }, + { + "epoch": 0.88, + "grad_norm": 0.5682246420780163, + "learning_rate": 1.9351136870738614e-06, + "loss": 1.8746, + "step": 11377 + }, + { + "epoch": 0.88, + "grad_norm": 0.5812271034965071, + "learning_rate": 1.9327046228798358e-06, + "loss": 1.8705, + "step": 11378 + }, + { + "epoch": 0.88, + "grad_norm": 0.4914333441989122, + "learning_rate": 1.9302969988718195e-06, + "loss": 2.0685, + "step": 11379 + }, + { + "epoch": 0.88, + "grad_norm": 0.5499568516794129, + "learning_rate": 1.927890815200137e-06, + "loss": 1.8482, + "step": 11380 + }, + { + "epoch": 0.88, + "grad_norm": 0.590792570107143, + "learning_rate": 1.925486072015012e-06, + "loss": 1.8396, + "step": 11381 + }, + { + "epoch": 0.88, + "grad_norm": 0.5607462689493338, + "learning_rate": 1.923082769466586e-06, + "loss": 1.8812, + "step": 11382 + }, + { + "epoch": 0.88, + "grad_norm": 0.5427023693670741, + "learning_rate": 1.9206809077049025e-06, + "loss": 2.0409, + "step": 11383 + }, + { + "epoch": 0.88, + "grad_norm": 0.5502759350673243, + "learning_rate": 1.9182804868799254e-06, + "loss": 1.822, + "step": 11384 + }, + { + "epoch": 0.88, + "grad_norm": 0.6185817430145331, + "learning_rate": 1.9158815071415147e-06, + "loss": 1.9241, + "step": 11385 + }, + { + "epoch": 0.88, + "grad_norm": 0.5644341920983977, + "learning_rate": 1.9134839686394568e-06, + "loss": 1.9634, + "step": 11386 + }, + { + "epoch": 0.88, + "grad_norm": 0.5484151840681039, + "learning_rate": 1.911087871523437e-06, + "loss": 2.0262, + "step": 11387 + }, + { + "epoch": 0.88, + "grad_norm": 0.6193101845250805, + "learning_rate": 1.9086932159430527e-06, + "loss": 1.864, + "step": 11388 + }, + { + "epoch": 0.88, + "grad_norm": 0.5808121623256574, + "learning_rate": 1.9063000020478117e-06, + "loss": 1.8507, + "step": 11389 + }, + { + "epoch": 0.88, + "grad_norm": 0.5582539100275166, + "learning_rate": 1.903908229987131e-06, + "loss": 1.8784, + "step": 11390 + }, + { + "epoch": 0.88, + "grad_norm": 0.5365706460361583, + "learning_rate": 1.9015178999103433e-06, + "loss": 2.0549, + "step": 11391 + }, + { + "epoch": 0.88, + "grad_norm": 0.5205814788034887, + "learning_rate": 1.8991290119666828e-06, + "loss": 1.9109, + "step": 11392 + }, + { + "epoch": 0.88, + "grad_norm": 0.5679481654679627, + "learning_rate": 1.896741566305299e-06, + "loss": 1.8288, + "step": 11393 + }, + { + "epoch": 0.88, + "grad_norm": 0.5418815057964814, + "learning_rate": 1.8943555630752507e-06, + "loss": 1.8601, + "step": 11394 + }, + { + "epoch": 0.88, + "grad_norm": 0.5862655515690436, + "learning_rate": 1.8919710024255105e-06, + "loss": 2.0581, + "step": 11395 + }, + { + "epoch": 0.88, + "grad_norm": 0.5640665429497063, + "learning_rate": 1.8895878845049398e-06, + "loss": 1.8747, + "step": 11396 + }, + { + "epoch": 0.88, + "grad_norm": 0.5455842901947854, + "learning_rate": 1.8872062094623416e-06, + "loss": 1.8234, + "step": 11397 + }, + { + "epoch": 0.88, + "grad_norm": 0.5312684942535182, + "learning_rate": 1.884825977446414e-06, + "loss": 1.944, + "step": 11398 + }, + { + "epoch": 0.88, + "grad_norm": 0.5432389523187544, + "learning_rate": 1.8824471886057516e-06, + "loss": 2.0851, + "step": 11399 + }, + { + "epoch": 0.88, + "grad_norm": 0.5791278155183416, + "learning_rate": 1.8800698430888857e-06, + "loss": 1.8549, + "step": 11400 + }, + { + "epoch": 0.88, + "grad_norm": 0.5599541645812369, + "learning_rate": 1.8776939410442397e-06, + "loss": 1.8802, + "step": 11401 + }, + { + "epoch": 0.88, + "grad_norm": 0.5446443623878265, + "learning_rate": 1.875319482620147e-06, + "loss": 1.8289, + "step": 11402 + }, + { + "epoch": 0.88, + "grad_norm": 0.5573991198873848, + "learning_rate": 1.8729464679648618e-06, + "loss": 2.0435, + "step": 11403 + }, + { + "epoch": 0.88, + "grad_norm": 0.5030148175000844, + "learning_rate": 1.870574897226529e-06, + "loss": 1.8941, + "step": 11404 + }, + { + "epoch": 0.88, + "grad_norm": 0.5576267883753275, + "learning_rate": 1.8682047705532308e-06, + "loss": 1.8351, + "step": 11405 + }, + { + "epoch": 0.88, + "grad_norm": 0.5480264130596546, + "learning_rate": 1.8658360880929371e-06, + "loss": 1.7954, + "step": 11406 + }, + { + "epoch": 0.88, + "grad_norm": 0.577334551147996, + "learning_rate": 1.863468849993527e-06, + "loss": 2.0625, + "step": 11407 + }, + { + "epoch": 0.88, + "grad_norm": 0.5517706061202078, + "learning_rate": 1.8611030564028075e-06, + "loss": 1.8786, + "step": 11408 + }, + { + "epoch": 0.88, + "grad_norm": 0.5760873933821362, + "learning_rate": 1.8587387074684852e-06, + "loss": 1.9031, + "step": 11409 + }, + { + "epoch": 0.88, + "grad_norm": 0.5928241719947808, + "learning_rate": 1.8563758033381645e-06, + "loss": 1.8635, + "step": 11410 + }, + { + "epoch": 0.88, + "grad_norm": 0.5049704377248493, + "learning_rate": 1.8540143441593854e-06, + "loss": 2.1061, + "step": 11411 + }, + { + "epoch": 0.88, + "grad_norm": 0.5885543974706761, + "learning_rate": 1.8516543300795746e-06, + "loss": 1.8573, + "step": 11412 + }, + { + "epoch": 0.88, + "grad_norm": 0.5434842731844524, + "learning_rate": 1.8492957612460754e-06, + "loss": 1.9061, + "step": 11413 + }, + { + "epoch": 0.88, + "grad_norm": 0.6006982994514282, + "learning_rate": 1.846938637806156e-06, + "loss": 1.7901, + "step": 11414 + }, + { + "epoch": 0.88, + "grad_norm": 0.5305207585592315, + "learning_rate": 1.8445829599069682e-06, + "loss": 2.0208, + "step": 11415 + }, + { + "epoch": 0.88, + "grad_norm": 0.5634056904571927, + "learning_rate": 1.8422287276955886e-06, + "loss": 1.834, + "step": 11416 + }, + { + "epoch": 0.88, + "grad_norm": 0.5138256777073237, + "learning_rate": 1.8398759413190053e-06, + "loss": 1.9281, + "step": 11417 + }, + { + "epoch": 0.88, + "grad_norm": 0.5794606665028862, + "learning_rate": 1.8375246009241092e-06, + "loss": 1.875, + "step": 11418 + }, + { + "epoch": 0.88, + "grad_norm": 0.5374531556585735, + "learning_rate": 1.8351747066577134e-06, + "loss": 2.0518, + "step": 11419 + }, + { + "epoch": 0.88, + "grad_norm": 0.5708448603297026, + "learning_rate": 1.8328262586665173e-06, + "loss": 1.8389, + "step": 11420 + }, + { + "epoch": 0.88, + "grad_norm": 0.5434416526072174, + "learning_rate": 1.8304792570971508e-06, + "loss": 1.8417, + "step": 11421 + }, + { + "epoch": 0.88, + "grad_norm": 0.5487952362585242, + "learning_rate": 1.828133702096152e-06, + "loss": 1.9191, + "step": 11422 + }, + { + "epoch": 0.88, + "grad_norm": 0.5023233947685983, + "learning_rate": 1.8257895938099568e-06, + "loss": 1.9448, + "step": 11423 + }, + { + "epoch": 0.88, + "grad_norm": 0.5217867061043434, + "learning_rate": 1.8234469323849201e-06, + "loss": 2.0017, + "step": 11424 + }, + { + "epoch": 0.88, + "grad_norm": 0.5360032944416055, + "learning_rate": 1.8211057179673003e-06, + "loss": 1.8718, + "step": 11425 + }, + { + "epoch": 0.88, + "grad_norm": 0.5630404258046613, + "learning_rate": 1.8187659507032718e-06, + "loss": 1.835, + "step": 11426 + }, + { + "epoch": 0.88, + "grad_norm": 0.5449949965008825, + "learning_rate": 1.8164276307389178e-06, + "loss": 2.0378, + "step": 11427 + }, + { + "epoch": 0.88, + "grad_norm": 0.5432824272167419, + "learning_rate": 1.8140907582202271e-06, + "loss": 1.8485, + "step": 11428 + }, + { + "epoch": 0.88, + "grad_norm": 0.5268602587517842, + "learning_rate": 1.8117553332930997e-06, + "loss": 1.8888, + "step": 11429 + }, + { + "epoch": 0.88, + "grad_norm": 0.5839837550403904, + "learning_rate": 1.8094213561033469e-06, + "loss": 1.8385, + "step": 11430 + }, + { + "epoch": 0.88, + "grad_norm": 0.5310612007541586, + "learning_rate": 1.8070888267966878e-06, + "loss": 2.0793, + "step": 11431 + }, + { + "epoch": 0.88, + "grad_norm": 0.5337712370840574, + "learning_rate": 1.8047577455187537e-06, + "loss": 1.8606, + "step": 11432 + }, + { + "epoch": 0.88, + "grad_norm": 0.5718593742179925, + "learning_rate": 1.8024281124150805e-06, + "loss": 1.8169, + "step": 11433 + }, + { + "epoch": 0.88, + "grad_norm": 0.5447043727200745, + "learning_rate": 1.8000999276311187e-06, + "loss": 1.8874, + "step": 11434 + }, + { + "epoch": 0.88, + "grad_norm": 0.517376371045215, + "learning_rate": 1.7977731913122242e-06, + "loss": 1.9434, + "step": 11435 + }, + { + "epoch": 0.88, + "grad_norm": 0.537247692656742, + "learning_rate": 1.7954479036036698e-06, + "loss": 2.0057, + "step": 11436 + }, + { + "epoch": 0.88, + "grad_norm": 0.558218381539825, + "learning_rate": 1.7931240646506254e-06, + "loss": 1.8266, + "step": 11437 + }, + { + "epoch": 0.88, + "grad_norm": 0.5632202271515055, + "learning_rate": 1.790801674598186e-06, + "loss": 1.7982, + "step": 11438 + }, + { + "epoch": 0.88, + "grad_norm": 0.5868016870077111, + "learning_rate": 1.7884807335913412e-06, + "loss": 1.9939, + "step": 11439 + }, + { + "epoch": 0.88, + "grad_norm": 0.5621041248749206, + "learning_rate": 1.786161241775e-06, + "loss": 1.8937, + "step": 11440 + }, + { + "epoch": 0.88, + "grad_norm": 0.6043670282233546, + "learning_rate": 1.7838431992939775e-06, + "loss": 1.8092, + "step": 11441 + }, + { + "epoch": 0.88, + "grad_norm": 0.5236691426667504, + "learning_rate": 1.7815266062929992e-06, + "loss": 1.921, + "step": 11442 + }, + { + "epoch": 0.88, + "grad_norm": 0.5348056290663287, + "learning_rate": 1.7792114629166967e-06, + "loss": 2.063, + "step": 11443 + }, + { + "epoch": 0.88, + "grad_norm": 0.5660441627646823, + "learning_rate": 1.7768977693096184e-06, + "loss": 1.8692, + "step": 11444 + }, + { + "epoch": 0.88, + "grad_norm": 0.5796437953649689, + "learning_rate": 1.774585525616218e-06, + "loss": 1.8092, + "step": 11445 + }, + { + "epoch": 0.88, + "grad_norm": 0.5477984053334748, + "learning_rate": 1.7722747319808469e-06, + "loss": 1.8704, + "step": 11446 + }, + { + "epoch": 0.88, + "grad_norm": 0.5251831251568906, + "learning_rate": 1.7699653885477923e-06, + "loss": 2.0335, + "step": 11447 + }, + { + "epoch": 0.88, + "grad_norm": 0.5026893527417187, + "learning_rate": 1.7676574954612335e-06, + "loss": 1.8634, + "step": 11448 + }, + { + "epoch": 0.88, + "grad_norm": 0.5377440378348431, + "learning_rate": 1.7653510528652494e-06, + "loss": 1.8886, + "step": 11449 + }, + { + "epoch": 0.88, + "grad_norm": 0.547145007729135, + "learning_rate": 1.7630460609038556e-06, + "loss": 1.8442, + "step": 11450 + }, + { + "epoch": 0.88, + "grad_norm": 0.5469870296677756, + "learning_rate": 1.760742519720951e-06, + "loss": 2.0614, + "step": 11451 + }, + { + "epoch": 0.88, + "grad_norm": 0.594287691495631, + "learning_rate": 1.7584404294603618e-06, + "loss": 1.8553, + "step": 11452 + }, + { + "epoch": 0.88, + "grad_norm": 0.5539632393921048, + "learning_rate": 1.7561397902658205e-06, + "loss": 1.8254, + "step": 11453 + }, + { + "epoch": 0.88, + "grad_norm": 0.5817113916503119, + "learning_rate": 1.753840602280954e-06, + "loss": 1.9122, + "step": 11454 + }, + { + "epoch": 0.88, + "grad_norm": 0.5498871052031432, + "learning_rate": 1.7515428656493167e-06, + "loss": 1.826, + "step": 11455 + }, + { + "epoch": 0.88, + "grad_norm": 0.5349252140993577, + "learning_rate": 1.7492465805143715e-06, + "loss": 1.9992, + "step": 11456 + }, + { + "epoch": 0.88, + "grad_norm": 0.5698287777477677, + "learning_rate": 1.7469517470194707e-06, + "loss": 1.8563, + "step": 11457 + }, + { + "epoch": 0.88, + "grad_norm": 0.6149504626125878, + "learning_rate": 1.7446583653079052e-06, + "loss": 1.8079, + "step": 11458 + }, + { + "epoch": 0.88, + "grad_norm": 0.5711284169011605, + "learning_rate": 1.742366435522852e-06, + "loss": 2.04, + "step": 11459 + }, + { + "epoch": 0.88, + "grad_norm": 0.5119927951453412, + "learning_rate": 1.7400759578073994e-06, + "loss": 1.9854, + "step": 11460 + }, + { + "epoch": 0.88, + "grad_norm": 0.5780208662192877, + "learning_rate": 1.7377869323045693e-06, + "loss": 1.8439, + "step": 11461 + }, + { + "epoch": 0.88, + "grad_norm": 0.5811806932942469, + "learning_rate": 1.7354993591572583e-06, + "loss": 1.8628, + "step": 11462 + }, + { + "epoch": 0.88, + "grad_norm": 0.52653393976474, + "learning_rate": 1.7332132385082966e-06, + "loss": 1.9811, + "step": 11463 + }, + { + "epoch": 0.88, + "grad_norm": 0.5545719825569585, + "learning_rate": 1.7309285705004174e-06, + "loss": 1.7913, + "step": 11464 + }, + { + "epoch": 0.88, + "grad_norm": 0.5519025613258323, + "learning_rate": 1.728645355276251e-06, + "loss": 1.8902, + "step": 11465 + }, + { + "epoch": 0.88, + "grad_norm": 0.5177559789773891, + "learning_rate": 1.7263635929783666e-06, + "loss": 1.9415, + "step": 11466 + }, + { + "epoch": 0.88, + "grad_norm": 0.5385086436625898, + "learning_rate": 1.7240832837492089e-06, + "loss": 1.8887, + "step": 11467 + }, + { + "epoch": 0.88, + "grad_norm": 0.5442795652765817, + "learning_rate": 1.7218044277311468e-06, + "loss": 2.0151, + "step": 11468 + }, + { + "epoch": 0.88, + "grad_norm": 0.5479580565307639, + "learning_rate": 1.7195270250664725e-06, + "loss": 1.8735, + "step": 11469 + }, + { + "epoch": 0.88, + "grad_norm": 0.5864212348747019, + "learning_rate": 1.7172510758973608e-06, + "loss": 1.8612, + "step": 11470 + }, + { + "epoch": 0.88, + "grad_norm": 0.575159541732696, + "learning_rate": 1.7149765803659124e-06, + "loss": 2.0793, + "step": 11471 + }, + { + "epoch": 0.89, + "grad_norm": 0.533729930601059, + "learning_rate": 1.7127035386141326e-06, + "loss": 1.8003, + "step": 11472 + }, + { + "epoch": 0.89, + "grad_norm": 0.5089293084138778, + "learning_rate": 1.710431950783939e-06, + "loss": 1.9449, + "step": 11473 + }, + { + "epoch": 0.89, + "grad_norm": 0.5476500538044772, + "learning_rate": 1.7081618170171538e-06, + "loss": 1.8473, + "step": 11474 + }, + { + "epoch": 0.89, + "grad_norm": 0.5704101526407619, + "learning_rate": 1.7058931374555108e-06, + "loss": 1.801, + "step": 11475 + }, + { + "epoch": 0.89, + "grad_norm": 0.5662651394140729, + "learning_rate": 1.7036259122406557e-06, + "loss": 2.0124, + "step": 11476 + }, + { + "epoch": 0.89, + "grad_norm": 0.549025468735945, + "learning_rate": 1.7013601415141383e-06, + "loss": 1.8221, + "step": 11477 + }, + { + "epoch": 0.89, + "grad_norm": 0.5502669478535362, + "learning_rate": 1.6990958254174182e-06, + "loss": 1.8643, + "step": 11478 + }, + { + "epoch": 0.89, + "grad_norm": 0.5412541320252907, + "learning_rate": 1.6968329640918712e-06, + "loss": 1.9434, + "step": 11479 + }, + { + "epoch": 0.89, + "grad_norm": 0.5721593338532074, + "learning_rate": 1.694571557678773e-06, + "loss": 2.0734, + "step": 11480 + }, + { + "epoch": 0.89, + "grad_norm": 0.5630047601013076, + "learning_rate": 1.6923116063193107e-06, + "loss": 1.7974, + "step": 11481 + }, + { + "epoch": 0.89, + "grad_norm": 0.5553723584718063, + "learning_rate": 1.690053110154588e-06, + "loss": 1.9089, + "step": 11482 + }, + { + "epoch": 0.89, + "grad_norm": 0.5585254013197967, + "learning_rate": 1.687796069325609e-06, + "loss": 2.1024, + "step": 11483 + }, + { + "epoch": 0.89, + "grad_norm": 0.5636307723415748, + "learning_rate": 1.6855404839732913e-06, + "loss": 1.843, + "step": 11484 + }, + { + "epoch": 0.89, + "grad_norm": 0.5511016877619674, + "learning_rate": 1.6832863542384585e-06, + "loss": 1.9488, + "step": 11485 + }, + { + "epoch": 0.89, + "grad_norm": 0.5599689550315416, + "learning_rate": 1.6810336802618453e-06, + "loss": 1.8306, + "step": 11486 + }, + { + "epoch": 0.89, + "grad_norm": 0.5537086824748512, + "learning_rate": 1.6787824621840976e-06, + "loss": 1.8575, + "step": 11487 + }, + { + "epoch": 0.89, + "grad_norm": 0.5536654417187133, + "learning_rate": 1.6765327001457665e-06, + "loss": 2.0624, + "step": 11488 + }, + { + "epoch": 0.89, + "grad_norm": 0.5438282667619144, + "learning_rate": 1.674284394287312e-06, + "loss": 1.8376, + "step": 11489 + }, + { + "epoch": 0.89, + "grad_norm": 0.5323617698369226, + "learning_rate": 1.672037544749111e-06, + "loss": 1.8746, + "step": 11490 + }, + { + "epoch": 0.89, + "grad_norm": 0.5631009029239705, + "learning_rate": 1.6697921516714372e-06, + "loss": 1.945, + "step": 11491 + }, + { + "epoch": 0.89, + "grad_norm": 0.5884476755481036, + "learning_rate": 1.6675482151944866e-06, + "loss": 2.0714, + "step": 11492 + }, + { + "epoch": 0.89, + "grad_norm": 0.5872801935769651, + "learning_rate": 1.6653057354583445e-06, + "loss": 1.8592, + "step": 11493 + }, + { + "epoch": 0.89, + "grad_norm": 0.5703552451578564, + "learning_rate": 1.6630647126030325e-06, + "loss": 1.8877, + "step": 11494 + }, + { + "epoch": 0.89, + "grad_norm": 0.5367921988048878, + "learning_rate": 1.6608251467684632e-06, + "loss": 2.0715, + "step": 11495 + }, + { + "epoch": 0.89, + "grad_norm": 0.5788993200993916, + "learning_rate": 1.658587038094453e-06, + "loss": 1.8256, + "step": 11496 + }, + { + "epoch": 0.89, + "grad_norm": 0.5302187426392724, + "learning_rate": 1.6563503867207481e-06, + "loss": 1.9384, + "step": 11497 + }, + { + "epoch": 0.89, + "grad_norm": 0.5715673241539011, + "learning_rate": 1.6541151927869868e-06, + "loss": 1.8412, + "step": 11498 + }, + { + "epoch": 0.89, + "grad_norm": 0.5790838234986355, + "learning_rate": 1.6518814564327134e-06, + "loss": 1.8505, + "step": 11499 + }, + { + "epoch": 0.89, + "grad_norm": 0.5350456843446728, + "learning_rate": 1.6496491777974077e-06, + "loss": 2.0565, + "step": 11500 + }, + { + "epoch": 0.89, + "grad_norm": 0.5759006085601842, + "learning_rate": 1.647418357020422e-06, + "loss": 1.8292, + "step": 11501 + }, + { + "epoch": 0.89, + "grad_norm": 0.5306123612452441, + "learning_rate": 1.6451889942410453e-06, + "loss": 1.8654, + "step": 11502 + }, + { + "epoch": 0.89, + "grad_norm": 0.5426362792230239, + "learning_rate": 1.6429610895984688e-06, + "loss": 2.0804, + "step": 11503 + }, + { + "epoch": 0.89, + "grad_norm": 0.5216841658217515, + "learning_rate": 1.6407346432317755e-06, + "loss": 1.8741, + "step": 11504 + }, + { + "epoch": 0.89, + "grad_norm": 0.5667584330820248, + "learning_rate": 1.6385096552799906e-06, + "loss": 1.8444, + "step": 11505 + }, + { + "epoch": 0.89, + "grad_norm": 0.6019417627288464, + "learning_rate": 1.636286125882014e-06, + "loss": 1.8824, + "step": 11506 + }, + { + "epoch": 0.89, + "grad_norm": 0.5803286184776754, + "learning_rate": 1.6340640551766706e-06, + "loss": 1.8879, + "step": 11507 + }, + { + "epoch": 0.89, + "grad_norm": 0.5595180063103656, + "learning_rate": 1.6318434433027051e-06, + "loss": 2.0683, + "step": 11508 + }, + { + "epoch": 0.89, + "grad_norm": 0.564393284386757, + "learning_rate": 1.6296242903987514e-06, + "loss": 1.8471, + "step": 11509 + }, + { + "epoch": 0.89, + "grad_norm": 0.526819182177988, + "learning_rate": 1.6274065966033592e-06, + "loss": 1.9064, + "step": 11510 + }, + { + "epoch": 0.89, + "grad_norm": 0.5797477153724305, + "learning_rate": 1.6251903620549901e-06, + "loss": 1.8588, + "step": 11511 + }, + { + "epoch": 0.89, + "grad_norm": 0.5282103382720872, + "learning_rate": 1.6229755868920116e-06, + "loss": 1.9987, + "step": 11512 + }, + { + "epoch": 0.89, + "grad_norm": 0.5686101759770306, + "learning_rate": 1.6207622712527043e-06, + "loss": 1.8895, + "step": 11513 + }, + { + "epoch": 0.89, + "grad_norm": 0.5798136941689981, + "learning_rate": 1.6185504152752523e-06, + "loss": 1.8486, + "step": 11514 + }, + { + "epoch": 0.89, + "grad_norm": 0.578283250504507, + "learning_rate": 1.616340019097748e-06, + "loss": 2.0023, + "step": 11515 + }, + { + "epoch": 0.89, + "grad_norm": 0.5522000020238053, + "learning_rate": 1.6141310828582058e-06, + "loss": 1.9585, + "step": 11516 + }, + { + "epoch": 0.89, + "grad_norm": 0.5671364730243942, + "learning_rate": 1.6119236066945264e-06, + "loss": 1.8699, + "step": 11517 + }, + { + "epoch": 0.89, + "grad_norm": 0.5521964335085521, + "learning_rate": 1.6097175907445389e-06, + "loss": 1.8465, + "step": 11518 + }, + { + "epoch": 0.89, + "grad_norm": 0.5544334709279299, + "learning_rate": 1.6075130351459715e-06, + "loss": 1.8755, + "step": 11519 + }, + { + "epoch": 0.89, + "grad_norm": 0.5406042487548138, + "learning_rate": 1.6053099400364645e-06, + "loss": 2.0455, + "step": 11520 + }, + { + "epoch": 0.89, + "grad_norm": 0.527987913374905, + "learning_rate": 1.6031083055535662e-06, + "loss": 1.8534, + "step": 11521 + }, + { + "epoch": 0.89, + "grad_norm": 0.5154940508786238, + "learning_rate": 1.6009081318347302e-06, + "loss": 1.9454, + "step": 11522 + }, + { + "epoch": 0.89, + "grad_norm": 0.5219694964022139, + "learning_rate": 1.5987094190173273e-06, + "loss": 1.8532, + "step": 11523 + }, + { + "epoch": 0.89, + "grad_norm": 0.5672431341750163, + "learning_rate": 1.5965121672386313e-06, + "loss": 2.1037, + "step": 11524 + }, + { + "epoch": 0.89, + "grad_norm": 0.5613193408313942, + "learning_rate": 1.5943163766358238e-06, + "loss": 1.81, + "step": 11525 + }, + { + "epoch": 0.89, + "grad_norm": 0.5686138602445893, + "learning_rate": 1.592122047345998e-06, + "loss": 1.848, + "step": 11526 + }, + { + "epoch": 0.89, + "grad_norm": 0.5236972679363534, + "learning_rate": 1.5899291795061554e-06, + "loss": 2.001, + "step": 11527 + }, + { + "epoch": 0.89, + "grad_norm": 0.5166000909253287, + "learning_rate": 1.5877377732532033e-06, + "loss": 1.9114, + "step": 11528 + }, + { + "epoch": 0.89, + "grad_norm": 0.5556321395789722, + "learning_rate": 1.5855478287239628e-06, + "loss": 1.866, + "step": 11529 + }, + { + "epoch": 0.89, + "grad_norm": 0.5464626028019876, + "learning_rate": 1.5833593460551606e-06, + "loss": 1.8368, + "step": 11530 + }, + { + "epoch": 0.89, + "grad_norm": 0.5407746105954445, + "learning_rate": 1.581172325383437e-06, + "loss": 1.822, + "step": 11531 + }, + { + "epoch": 0.89, + "grad_norm": 0.5333676049003504, + "learning_rate": 1.5789867668453223e-06, + "loss": 2.0637, + "step": 11532 + }, + { + "epoch": 0.89, + "grad_norm": 0.5396263740254015, + "learning_rate": 1.5768026705772848e-06, + "loss": 1.834, + "step": 11533 + }, + { + "epoch": 0.89, + "grad_norm": 0.5438394018269245, + "learning_rate": 1.5746200367156823e-06, + "loss": 1.8543, + "step": 11534 + }, + { + "epoch": 0.89, + "grad_norm": 0.5321757616084442, + "learning_rate": 1.572438865396783e-06, + "loss": 1.9551, + "step": 11535 + }, + { + "epoch": 0.89, + "grad_norm": 0.541679020778305, + "learning_rate": 1.5702591567567705e-06, + "loss": 2.0459, + "step": 11536 + }, + { + "epoch": 0.89, + "grad_norm": 0.5683657345046792, + "learning_rate": 1.5680809109317295e-06, + "loss": 1.8654, + "step": 11537 + }, + { + "epoch": 0.89, + "grad_norm": 0.5805774880487938, + "learning_rate": 1.5659041280576576e-06, + "loss": 1.8293, + "step": 11538 + }, + { + "epoch": 0.89, + "grad_norm": 0.5705951698548497, + "learning_rate": 1.5637288082704649e-06, + "loss": 1.8518, + "step": 11539 + }, + { + "epoch": 0.89, + "grad_norm": 0.5400605409766198, + "learning_rate": 1.5615549517059513e-06, + "loss": 2.0092, + "step": 11540 + }, + { + "epoch": 0.89, + "grad_norm": 0.5350277584267021, + "learning_rate": 1.5593825584998556e-06, + "loss": 1.8851, + "step": 11541 + }, + { + "epoch": 0.89, + "grad_norm": 0.5958600529362325, + "learning_rate": 1.5572116287878081e-06, + "loss": 1.8933, + "step": 11542 + }, + { + "epoch": 0.89, + "grad_norm": 0.5643169319921733, + "learning_rate": 1.5550421627053336e-06, + "loss": 1.8949, + "step": 11543 + }, + { + "epoch": 0.89, + "grad_norm": 0.5688988531788705, + "learning_rate": 1.5528741603879016e-06, + "loss": 2.0592, + "step": 11544 + }, + { + "epoch": 0.89, + "grad_norm": 0.5569108556299291, + "learning_rate": 1.5507076219708538e-06, + "loss": 1.876, + "step": 11545 + }, + { + "epoch": 0.89, + "grad_norm": 0.5608668024932708, + "learning_rate": 1.5485425475894572e-06, + "loss": 1.829, + "step": 11546 + }, + { + "epoch": 0.89, + "grad_norm": 0.5186299500402831, + "learning_rate": 1.5463789373788978e-06, + "loss": 1.9358, + "step": 11547 + }, + { + "epoch": 0.89, + "grad_norm": 0.6071739448379364, + "learning_rate": 1.5442167914742455e-06, + "loss": 2.0678, + "step": 11548 + }, + { + "epoch": 0.89, + "grad_norm": 0.5662753644720798, + "learning_rate": 1.5420561100105007e-06, + "loss": 1.8703, + "step": 11549 + }, + { + "epoch": 0.89, + "grad_norm": 0.5635858042631295, + "learning_rate": 1.5398968931225639e-06, + "loss": 1.8335, + "step": 11550 + }, + { + "epoch": 0.89, + "grad_norm": 0.5759985110604045, + "learning_rate": 1.5377391409452324e-06, + "loss": 1.841, + "step": 11551 + }, + { + "epoch": 0.89, + "grad_norm": 0.5429365477523836, + "learning_rate": 1.5355828536132405e-06, + "loss": 2.0888, + "step": 11552 + }, + { + "epoch": 0.89, + "grad_norm": 0.5408100568461693, + "learning_rate": 1.5334280312612e-06, + "loss": 1.9563, + "step": 11553 + }, + { + "epoch": 0.89, + "grad_norm": 0.5577544197795093, + "learning_rate": 1.5312746740236471e-06, + "loss": 1.8509, + "step": 11554 + }, + { + "epoch": 0.89, + "grad_norm": 0.5462128136671481, + "learning_rate": 1.529122782035039e-06, + "loss": 1.8493, + "step": 11555 + }, + { + "epoch": 0.89, + "grad_norm": 0.5372412256542201, + "learning_rate": 1.5269723554297095e-06, + "loss": 2.0328, + "step": 11556 + }, + { + "epoch": 0.89, + "grad_norm": 0.5428278282472243, + "learning_rate": 1.5248233943419205e-06, + "loss": 1.8162, + "step": 11557 + }, + { + "epoch": 0.89, + "grad_norm": 0.5694209568403608, + "learning_rate": 1.5226758989058565e-06, + "loss": 1.8687, + "step": 11558 + }, + { + "epoch": 0.89, + "grad_norm": 0.5586634864506735, + "learning_rate": 1.5205298692555769e-06, + "loss": 1.8877, + "step": 11559 + }, + { + "epoch": 0.89, + "grad_norm": 0.4931589729173802, + "learning_rate": 1.5183853055250746e-06, + "loss": 2.1076, + "step": 11560 + }, + { + "epoch": 0.89, + "grad_norm": 0.5618606027167793, + "learning_rate": 1.5162422078482424e-06, + "loss": 1.846, + "step": 11561 + }, + { + "epoch": 0.89, + "grad_norm": 0.5478665746683364, + "learning_rate": 1.5141005763588845e-06, + "loss": 1.8341, + "step": 11562 + }, + { + "epoch": 0.89, + "grad_norm": 0.5391092299472615, + "learning_rate": 1.5119604111907077e-06, + "loss": 1.8563, + "step": 11563 + }, + { + "epoch": 0.89, + "grad_norm": 0.5154712713242152, + "learning_rate": 1.5098217124773356e-06, + "loss": 2.0463, + "step": 11564 + }, + { + "epoch": 0.89, + "grad_norm": 0.5399775172940153, + "learning_rate": 1.5076844803522922e-06, + "loss": 1.8257, + "step": 11565 + }, + { + "epoch": 0.89, + "grad_norm": 0.505914977858674, + "learning_rate": 1.505548714949015e-06, + "loss": 1.8391, + "step": 11566 + }, + { + "epoch": 0.89, + "grad_norm": 0.5647101880926146, + "learning_rate": 1.503414416400853e-06, + "loss": 1.8434, + "step": 11567 + }, + { + "epoch": 0.89, + "grad_norm": 0.5565693985751698, + "learning_rate": 1.5012815848410523e-06, + "loss": 2.0487, + "step": 11568 + }, + { + "epoch": 0.89, + "grad_norm": 0.5825650948006594, + "learning_rate": 1.4991502204027786e-06, + "loss": 1.8467, + "step": 11569 + }, + { + "epoch": 0.89, + "grad_norm": 0.5404904882481903, + "learning_rate": 1.4970203232190977e-06, + "loss": 1.792, + "step": 11570 + }, + { + "epoch": 0.89, + "grad_norm": 0.5325039152037033, + "learning_rate": 1.494891893422995e-06, + "loss": 1.8435, + "step": 11571 + }, + { + "epoch": 0.89, + "grad_norm": 0.5016319729314044, + "learning_rate": 1.4927649311473501e-06, + "loss": 2.0959, + "step": 11572 + }, + { + "epoch": 0.89, + "grad_norm": 0.5341816761030543, + "learning_rate": 1.4906394365249627e-06, + "loss": 1.8666, + "step": 11573 + }, + { + "epoch": 0.89, + "grad_norm": 0.5328364186729707, + "learning_rate": 1.4885154096885318e-06, + "loss": 1.8636, + "step": 11574 + }, + { + "epoch": 0.89, + "grad_norm": 0.5541914577813613, + "learning_rate": 1.486392850770671e-06, + "loss": 1.8676, + "step": 11575 + }, + { + "epoch": 0.89, + "grad_norm": 0.5382688032761922, + "learning_rate": 1.4842717599039047e-06, + "loss": 2.0766, + "step": 11576 + }, + { + "epoch": 0.89, + "grad_norm": 0.5569870313632911, + "learning_rate": 1.4821521372206547e-06, + "loss": 1.8406, + "step": 11577 + }, + { + "epoch": 0.89, + "grad_norm": 0.5217034421803733, + "learning_rate": 1.4800339828532627e-06, + "loss": 1.9297, + "step": 11578 + }, + { + "epoch": 0.89, + "grad_norm": 0.5428979127909535, + "learning_rate": 1.4779172969339672e-06, + "loss": 1.8456, + "step": 11579 + }, + { + "epoch": 0.89, + "grad_norm": 0.5481496322229213, + "learning_rate": 1.475802079594929e-06, + "loss": 2.0423, + "step": 11580 + }, + { + "epoch": 0.89, + "grad_norm": 0.5434713311108774, + "learning_rate": 1.4736883309682065e-06, + "loss": 1.8353, + "step": 11581 + }, + { + "epoch": 0.89, + "grad_norm": 0.5706542306496056, + "learning_rate": 1.471576051185769e-06, + "loss": 1.8321, + "step": 11582 + }, + { + "epoch": 0.89, + "grad_norm": 0.5665692413734926, + "learning_rate": 1.4694652403794972e-06, + "loss": 1.8212, + "step": 11583 + }, + { + "epoch": 0.89, + "grad_norm": 0.500725732836912, + "learning_rate": 1.4673558986811776e-06, + "loss": 2.126, + "step": 11584 + }, + { + "epoch": 0.89, + "grad_norm": 0.5564945456815288, + "learning_rate": 1.4652480262225015e-06, + "loss": 1.8164, + "step": 11585 + }, + { + "epoch": 0.89, + "grad_norm": 0.5628586916486585, + "learning_rate": 1.4631416231350782e-06, + "loss": 1.8236, + "step": 11586 + }, + { + "epoch": 0.89, + "grad_norm": 0.5373099881686948, + "learning_rate": 1.4610366895504074e-06, + "loss": 1.8577, + "step": 11587 + }, + { + "epoch": 0.89, + "grad_norm": 0.5370315493794655, + "learning_rate": 1.4589332255999205e-06, + "loss": 2.0491, + "step": 11588 + }, + { + "epoch": 0.89, + "grad_norm": 0.5786041662723317, + "learning_rate": 1.4568312314149456e-06, + "loss": 1.8325, + "step": 11589 + }, + { + "epoch": 0.89, + "grad_norm": 0.5399534401068655, + "learning_rate": 1.4547307071267086e-06, + "loss": 1.8105, + "step": 11590 + }, + { + "epoch": 0.89, + "grad_norm": 0.5409039572752965, + "learning_rate": 1.4526316528663653e-06, + "loss": 1.9102, + "step": 11591 + }, + { + "epoch": 0.89, + "grad_norm": 0.5588041357744535, + "learning_rate": 1.450534068764961e-06, + "loss": 2.0673, + "step": 11592 + }, + { + "epoch": 0.89, + "grad_norm": 0.5550623775895449, + "learning_rate": 1.448437954953452e-06, + "loss": 1.8847, + "step": 11593 + }, + { + "epoch": 0.89, + "grad_norm": 0.5754586748824821, + "learning_rate": 1.4463433115627229e-06, + "loss": 1.8457, + "step": 11594 + }, + { + "epoch": 0.89, + "grad_norm": 0.5609369312532884, + "learning_rate": 1.444250138723538e-06, + "loss": 1.887, + "step": 11595 + }, + { + "epoch": 0.89, + "grad_norm": 0.5536132673369919, + "learning_rate": 1.4421584365665847e-06, + "loss": 2.0327, + "step": 11596 + }, + { + "epoch": 0.89, + "grad_norm": 0.5121870789793983, + "learning_rate": 1.4400682052224613e-06, + "loss": 1.9637, + "step": 11597 + }, + { + "epoch": 0.89, + "grad_norm": 0.5425302361243759, + "learning_rate": 1.4379794448216633e-06, + "loss": 1.8411, + "step": 11598 + }, + { + "epoch": 0.89, + "grad_norm": 0.5469752757658726, + "learning_rate": 1.4358921554946087e-06, + "loss": 1.8819, + "step": 11599 + }, + { + "epoch": 0.89, + "grad_norm": 0.5362345275848386, + "learning_rate": 1.433806337371607e-06, + "loss": 2.0141, + "step": 11600 + }, + { + "epoch": 0.9, + "grad_norm": 0.546044911409612, + "learning_rate": 1.431721990582885e-06, + "loss": 1.8677, + "step": 11601 + }, + { + "epoch": 0.9, + "grad_norm": 0.5554629976181261, + "learning_rate": 1.429639115258588e-06, + "loss": 1.8623, + "step": 11602 + }, + { + "epoch": 0.9, + "grad_norm": 0.5253589802437095, + "learning_rate": 1.4275577115287458e-06, + "loss": 1.8856, + "step": 11603 + }, + { + "epoch": 0.9, + "grad_norm": 0.5395582732106576, + "learning_rate": 1.4254777795233098e-06, + "loss": 2.0669, + "step": 11604 + }, + { + "epoch": 0.9, + "grad_norm": 0.5429214973083368, + "learning_rate": 1.4233993193721513e-06, + "loss": 1.8088, + "step": 11605 + }, + { + "epoch": 0.9, + "grad_norm": 0.5437350852995847, + "learning_rate": 1.4213223312050216e-06, + "loss": 1.8232, + "step": 11606 + }, + { + "epoch": 0.9, + "grad_norm": 0.5605509528789246, + "learning_rate": 1.4192468151516064e-06, + "loss": 1.7867, + "step": 11607 + }, + { + "epoch": 0.9, + "grad_norm": 0.555345067347659, + "learning_rate": 1.4171727713414824e-06, + "loss": 2.1308, + "step": 11608 + }, + { + "epoch": 0.9, + "grad_norm": 0.5017882847950005, + "learning_rate": 1.415100199904143e-06, + "loss": 1.9585, + "step": 11609 + }, + { + "epoch": 0.9, + "grad_norm": 0.5699754789317231, + "learning_rate": 1.4130291009689879e-06, + "loss": 1.8357, + "step": 11610 + }, + { + "epoch": 0.9, + "grad_norm": 0.550954707692324, + "learning_rate": 1.4109594746653243e-06, + "loss": 1.8447, + "step": 11611 + }, + { + "epoch": 0.9, + "grad_norm": 0.5496112979347849, + "learning_rate": 1.408891321122366e-06, + "loss": 2.0417, + "step": 11612 + }, + { + "epoch": 0.9, + "grad_norm": 0.5744276835341443, + "learning_rate": 1.406824640469237e-06, + "loss": 1.8653, + "step": 11613 + }, + { + "epoch": 0.9, + "grad_norm": 0.5914070211002466, + "learning_rate": 1.404759432834968e-06, + "loss": 1.8646, + "step": 11614 + }, + { + "epoch": 0.9, + "grad_norm": 0.5227930174030866, + "learning_rate": 1.4026956983485024e-06, + "loss": 1.9122, + "step": 11615 + }, + { + "epoch": 0.9, + "grad_norm": 0.5945733806069876, + "learning_rate": 1.400633437138682e-06, + "loss": 2.0478, + "step": 11616 + }, + { + "epoch": 0.9, + "grad_norm": 0.5677315986787986, + "learning_rate": 1.3985726493342649e-06, + "loss": 1.8489, + "step": 11617 + }, + { + "epoch": 0.9, + "grad_norm": 0.5269670738629265, + "learning_rate": 1.3965133350639149e-06, + "loss": 1.8789, + "step": 11618 + }, + { + "epoch": 0.9, + "grad_norm": 0.5400614841820163, + "learning_rate": 1.3944554944562038e-06, + "loss": 1.8268, + "step": 11619 + }, + { + "epoch": 0.9, + "grad_norm": 0.5632628281370748, + "learning_rate": 1.3923991276396097e-06, + "loss": 2.0724, + "step": 11620 + }, + { + "epoch": 0.9, + "grad_norm": 0.562864145863654, + "learning_rate": 1.3903442347425188e-06, + "loss": 1.8104, + "step": 11621 + }, + { + "epoch": 0.9, + "grad_norm": 0.49793932970079546, + "learning_rate": 1.3882908158932311e-06, + "loss": 1.9099, + "step": 11622 + }, + { + "epoch": 0.9, + "grad_norm": 0.5600569635142791, + "learning_rate": 1.3862388712199442e-06, + "loss": 1.865, + "step": 11623 + }, + { + "epoch": 0.9, + "grad_norm": 0.5484038013626298, + "learning_rate": 1.384188400850775e-06, + "loss": 2.0173, + "step": 11624 + }, + { + "epoch": 0.9, + "grad_norm": 0.5638574969012546, + "learning_rate": 1.3821394049137405e-06, + "loss": 1.8346, + "step": 11625 + }, + { + "epoch": 0.9, + "grad_norm": 0.5373387734770145, + "learning_rate": 1.3800918835367633e-06, + "loss": 1.8105, + "step": 11626 + }, + { + "epoch": 0.9, + "grad_norm": 0.5398784543311672, + "learning_rate": 1.3780458368476856e-06, + "loss": 1.889, + "step": 11627 + }, + { + "epoch": 0.9, + "grad_norm": 0.5550276347529988, + "learning_rate": 1.3760012649742499e-06, + "loss": 2.1092, + "step": 11628 + }, + { + "epoch": 0.9, + "grad_norm": 0.5396034582180304, + "learning_rate": 1.373958168044101e-06, + "loss": 1.8298, + "step": 11629 + }, + { + "epoch": 0.9, + "grad_norm": 0.5451938993298576, + "learning_rate": 1.3719165461848038e-06, + "loss": 1.8466, + "step": 11630 + }, + { + "epoch": 0.9, + "grad_norm": 0.590680006946242, + "learning_rate": 1.3698763995238228e-06, + "loss": 1.811, + "step": 11631 + }, + { + "epoch": 0.9, + "grad_norm": 0.5212617300719344, + "learning_rate": 1.3678377281885312e-06, + "loss": 2.0822, + "step": 11632 + }, + { + "epoch": 0.9, + "grad_norm": 0.5284491757533188, + "learning_rate": 1.3658005323062218e-06, + "loss": 1.8538, + "step": 11633 + }, + { + "epoch": 0.9, + "grad_norm": 0.49128686588545867, + "learning_rate": 1.3637648120040675e-06, + "loss": 1.9444, + "step": 11634 + }, + { + "epoch": 0.9, + "grad_norm": 0.5294896628727015, + "learning_rate": 1.361730567409178e-06, + "loss": 1.83, + "step": 11635 + }, + { + "epoch": 0.9, + "grad_norm": 0.5443466021337594, + "learning_rate": 1.3596977986485627e-06, + "loss": 2.0661, + "step": 11636 + }, + { + "epoch": 0.9, + "grad_norm": 0.5445264210530161, + "learning_rate": 1.3576665058491229e-06, + "loss": 1.8738, + "step": 11637 + }, + { + "epoch": 0.9, + "grad_norm": 0.5671393281054745, + "learning_rate": 1.3556366891376932e-06, + "loss": 1.8182, + "step": 11638 + }, + { + "epoch": 0.9, + "grad_norm": 0.5590281380217906, + "learning_rate": 1.353608348641e-06, + "loss": 1.8231, + "step": 11639 + }, + { + "epoch": 0.9, + "grad_norm": 0.5314347333718293, + "learning_rate": 1.3515814844856756e-06, + "loss": 1.9337, + "step": 11640 + }, + { + "epoch": 0.9, + "grad_norm": 0.5351449113856471, + "learning_rate": 1.349556096798274e-06, + "loss": 2.0345, + "step": 11641 + }, + { + "epoch": 0.9, + "grad_norm": 0.5696918765728142, + "learning_rate": 1.3475321857052386e-06, + "loss": 1.7705, + "step": 11642 + }, + { + "epoch": 0.9, + "grad_norm": 0.5399951073542054, + "learning_rate": 1.3455097513329352e-06, + "loss": 1.9177, + "step": 11643 + }, + { + "epoch": 0.9, + "grad_norm": 0.5609708560243463, + "learning_rate": 1.3434887938076373e-06, + "loss": 2.0378, + "step": 11644 + }, + { + "epoch": 0.9, + "grad_norm": 0.5716074265111633, + "learning_rate": 1.341469313255514e-06, + "loss": 1.8565, + "step": 11645 + }, + { + "epoch": 0.9, + "grad_norm": 0.4970874654286937, + "learning_rate": 1.3394513098026585e-06, + "loss": 1.95, + "step": 11646 + }, + { + "epoch": 0.9, + "grad_norm": 0.5404093075670032, + "learning_rate": 1.3374347835750534e-06, + "loss": 1.8409, + "step": 11647 + }, + { + "epoch": 0.9, + "grad_norm": 0.5148236035340942, + "learning_rate": 1.335419734698598e-06, + "loss": 2.0339, + "step": 11648 + }, + { + "epoch": 0.9, + "grad_norm": 0.545245452747753, + "learning_rate": 1.333406163299114e-06, + "loss": 1.8663, + "step": 11649 + }, + { + "epoch": 0.9, + "grad_norm": 0.5406087523417343, + "learning_rate": 1.3313940695023065e-06, + "loss": 1.8486, + "step": 11650 + }, + { + "epoch": 0.9, + "grad_norm": 0.5475893700078736, + "learning_rate": 1.329383453433794e-06, + "loss": 1.8387, + "step": 11651 + }, + { + "epoch": 0.9, + "grad_norm": 0.5335378735327975, + "learning_rate": 1.327374315219121e-06, + "loss": 2.0847, + "step": 11652 + }, + { + "epoch": 0.9, + "grad_norm": 0.5138035852166473, + "learning_rate": 1.3253666549837174e-06, + "loss": 1.915, + "step": 11653 + }, + { + "epoch": 0.9, + "grad_norm": 0.5635210039457367, + "learning_rate": 1.3233604728529303e-06, + "loss": 1.8059, + "step": 11654 + }, + { + "epoch": 0.9, + "grad_norm": 0.604284014446974, + "learning_rate": 1.321355768952015e-06, + "loss": 1.8535, + "step": 11655 + }, + { + "epoch": 0.9, + "grad_norm": 0.5427330845038708, + "learning_rate": 1.3193525434061327e-06, + "loss": 2.0185, + "step": 11656 + }, + { + "epoch": 0.9, + "grad_norm": 0.5603510476231374, + "learning_rate": 1.3173507963403526e-06, + "loss": 1.7882, + "step": 11657 + }, + { + "epoch": 0.9, + "grad_norm": 0.5449415385601569, + "learning_rate": 1.315350527879655e-06, + "loss": 1.8209, + "step": 11658 + }, + { + "epoch": 0.9, + "grad_norm": 0.49752840259731257, + "learning_rate": 1.3133517381489241e-06, + "loss": 1.8991, + "step": 11659 + }, + { + "epoch": 0.9, + "grad_norm": 0.5679971385295358, + "learning_rate": 1.3113544272729483e-06, + "loss": 2.0728, + "step": 11660 + }, + { + "epoch": 0.9, + "grad_norm": 0.5516762220737355, + "learning_rate": 1.3093585953764337e-06, + "loss": 1.8588, + "step": 11661 + }, + { + "epoch": 0.9, + "grad_norm": 0.5615364809411306, + "learning_rate": 1.3073642425839833e-06, + "loss": 1.8555, + "step": 11662 + }, + { + "epoch": 0.9, + "grad_norm": 0.5645680627661618, + "learning_rate": 1.3053713690201169e-06, + "loss": 1.8519, + "step": 11663 + }, + { + "epoch": 0.9, + "grad_norm": 0.5618147188517575, + "learning_rate": 1.3033799748092545e-06, + "loss": 2.0133, + "step": 11664 + }, + { + "epoch": 0.9, + "grad_norm": 0.5089274696036804, + "learning_rate": 1.3013900600757301e-06, + "loss": 1.9022, + "step": 11665 + }, + { + "epoch": 0.9, + "grad_norm": 0.5982750960852059, + "learning_rate": 1.2994016249437802e-06, + "loss": 1.8409, + "step": 11666 + }, + { + "epoch": 0.9, + "grad_norm": 0.5474835445239772, + "learning_rate": 1.29741466953755e-06, + "loss": 1.8468, + "step": 11667 + }, + { + "epoch": 0.9, + "grad_norm": 0.5720836908376999, + "learning_rate": 1.2954291939810959e-06, + "loss": 2.0688, + "step": 11668 + }, + { + "epoch": 0.9, + "grad_norm": 0.5502129279239272, + "learning_rate": 1.293445198398377e-06, + "loss": 1.8325, + "step": 11669 + }, + { + "epoch": 0.9, + "grad_norm": 0.5445088075760147, + "learning_rate": 1.2914626829132637e-06, + "loss": 1.8304, + "step": 11670 + }, + { + "epoch": 0.9, + "grad_norm": 0.49086490734368843, + "learning_rate": 1.2894816476495319e-06, + "loss": 1.9582, + "step": 11671 + }, + { + "epoch": 0.9, + "grad_norm": 0.5596714564502115, + "learning_rate": 1.2875020927308661e-06, + "loss": 1.8509, + "step": 11672 + }, + { + "epoch": 0.9, + "grad_norm": 0.5506238006737157, + "learning_rate": 1.2855240182808593e-06, + "loss": 2.0923, + "step": 11673 + }, + { + "epoch": 0.9, + "grad_norm": 0.5852959940018374, + "learning_rate": 1.2835474244230095e-06, + "loss": 1.7989, + "step": 11674 + }, + { + "epoch": 0.9, + "grad_norm": 0.5407076481321933, + "learning_rate": 1.2815723112807266e-06, + "loss": 1.8466, + "step": 11675 + }, + { + "epoch": 0.9, + "grad_norm": 0.5177528869089557, + "learning_rate": 1.2795986789773145e-06, + "loss": 2.0896, + "step": 11676 + }, + { + "epoch": 0.9, + "grad_norm": 0.493932006825372, + "learning_rate": 1.2776265276360055e-06, + "loss": 1.91, + "step": 11677 + }, + { + "epoch": 0.9, + "grad_norm": 0.540197842995922, + "learning_rate": 1.275655857379926e-06, + "loss": 1.8271, + "step": 11678 + }, + { + "epoch": 0.9, + "grad_norm": 0.551498650963775, + "learning_rate": 1.2736866683321164e-06, + "loss": 1.8192, + "step": 11679 + }, + { + "epoch": 0.9, + "grad_norm": 0.5517703114815048, + "learning_rate": 1.27171896061552e-06, + "loss": 2.0621, + "step": 11680 + }, + { + "epoch": 0.9, + "grad_norm": 0.5264024798795706, + "learning_rate": 1.2697527343529802e-06, + "loss": 1.8793, + "step": 11681 + }, + { + "epoch": 0.9, + "grad_norm": 0.5444054298611706, + "learning_rate": 1.2677879896672656e-06, + "loss": 1.8478, + "step": 11682 + }, + { + "epoch": 0.9, + "grad_norm": 0.5374627600267693, + "learning_rate": 1.2658247266810475e-06, + "loss": 1.8385, + "step": 11683 + }, + { + "epoch": 0.9, + "grad_norm": 0.5115284993837694, + "learning_rate": 1.263862945516886e-06, + "loss": 1.8877, + "step": 11684 + }, + { + "epoch": 0.9, + "grad_norm": 0.5485831110715437, + "learning_rate": 1.2619026462972721e-06, + "loss": 2.0309, + "step": 11685 + }, + { + "epoch": 0.9, + "grad_norm": 0.5550212097352878, + "learning_rate": 1.2599438291446026e-06, + "loss": 1.8081, + "step": 11686 + }, + { + "epoch": 0.9, + "grad_norm": 0.561576066890958, + "learning_rate": 1.257986494181157e-06, + "loss": 1.83, + "step": 11687 + }, + { + "epoch": 0.9, + "grad_norm": 0.5693421753245838, + "learning_rate": 1.2560306415291546e-06, + "loss": 2.1009, + "step": 11688 + }, + { + "epoch": 0.9, + "grad_norm": 0.5574440831779182, + "learning_rate": 1.254076271310703e-06, + "loss": 1.8411, + "step": 11689 + }, + { + "epoch": 0.9, + "grad_norm": 0.5141377125876563, + "learning_rate": 1.252123383647813e-06, + "loss": 1.9309, + "step": 11690 + }, + { + "epoch": 0.9, + "grad_norm": 0.5282351216418917, + "learning_rate": 1.250171978662426e-06, + "loss": 1.8565, + "step": 11691 + }, + { + "epoch": 0.9, + "grad_norm": 0.5655292069881842, + "learning_rate": 1.248222056476367e-06, + "loss": 1.8624, + "step": 11692 + }, + { + "epoch": 0.9, + "grad_norm": 0.5475046202705339, + "learning_rate": 1.2462736172113798e-06, + "loss": 2.0698, + "step": 11693 + }, + { + "epoch": 0.9, + "grad_norm": 0.5568818256302587, + "learning_rate": 1.2443266609891147e-06, + "loss": 1.8501, + "step": 11694 + }, + { + "epoch": 0.9, + "grad_norm": 0.5423448451050904, + "learning_rate": 1.2423811879311215e-06, + "loss": 1.7978, + "step": 11695 + }, + { + "epoch": 0.9, + "grad_norm": 0.508833125413515, + "learning_rate": 1.2404371981588753e-06, + "loss": 1.8741, + "step": 11696 + }, + { + "epoch": 0.9, + "grad_norm": 0.5277871966599968, + "learning_rate": 1.2384946917937402e-06, + "loss": 2.0685, + "step": 11697 + }, + { + "epoch": 0.9, + "grad_norm": 0.529910598485824, + "learning_rate": 1.2365536689569912e-06, + "loss": 1.833, + "step": 11698 + }, + { + "epoch": 0.9, + "grad_norm": 0.5434716499213597, + "learning_rate": 1.2346141297698288e-06, + "loss": 1.8591, + "step": 11699 + }, + { + "epoch": 0.9, + "grad_norm": 0.540306294605111, + "learning_rate": 1.232676074353331e-06, + "loss": 2.0499, + "step": 11700 + }, + { + "epoch": 0.9, + "grad_norm": 0.5552632078108508, + "learning_rate": 1.2307395028285067e-06, + "loss": 1.8912, + "step": 11701 + }, + { + "epoch": 0.9, + "grad_norm": 0.5373591383581019, + "learning_rate": 1.2288044153162593e-06, + "loss": 1.9035, + "step": 11702 + }, + { + "epoch": 0.9, + "grad_norm": 0.5587410217514533, + "learning_rate": 1.2268708119374083e-06, + "loss": 1.8817, + "step": 11703 + }, + { + "epoch": 0.9, + "grad_norm": 0.5687774803904874, + "learning_rate": 1.2249386928126744e-06, + "loss": 1.836, + "step": 11704 + }, + { + "epoch": 0.9, + "grad_norm": 0.5164803666724173, + "learning_rate": 1.2230080580626885e-06, + "loss": 2.0257, + "step": 11705 + }, + { + "epoch": 0.9, + "grad_norm": 0.5431744424935566, + "learning_rate": 1.2210789078079876e-06, + "loss": 1.8537, + "step": 11706 + }, + { + "epoch": 0.9, + "grad_norm": 0.5543007448868885, + "learning_rate": 1.219151242169017e-06, + "loss": 1.8623, + "step": 11707 + }, + { + "epoch": 0.9, + "grad_norm": 0.5328856847781371, + "learning_rate": 1.217225061266128e-06, + "loss": 2.0623, + "step": 11708 + }, + { + "epoch": 0.9, + "grad_norm": 0.540747534150304, + "learning_rate": 1.215300365219582e-06, + "loss": 1.9069, + "step": 11709 + }, + { + "epoch": 0.9, + "grad_norm": 0.5311373694961518, + "learning_rate": 1.213377154149542e-06, + "loss": 1.8586, + "step": 11710 + }, + { + "epoch": 0.9, + "grad_norm": 0.5701537270693223, + "learning_rate": 1.2114554281760866e-06, + "loss": 1.8764, + "step": 11711 + }, + { + "epoch": 0.9, + "grad_norm": 0.5382629987349338, + "learning_rate": 1.2095351874191946e-06, + "loss": 2.0619, + "step": 11712 + }, + { + "epoch": 0.9, + "grad_norm": 0.5529203820074862, + "learning_rate": 1.207616431998751e-06, + "loss": 1.8775, + "step": 11713 + }, + { + "epoch": 0.9, + "grad_norm": 0.5471952414976659, + "learning_rate": 1.2056991620345599e-06, + "loss": 1.8319, + "step": 11714 + }, + { + "epoch": 0.9, + "grad_norm": 0.535681572745494, + "learning_rate": 1.203783377646317e-06, + "loss": 1.9151, + "step": 11715 + }, + { + "epoch": 0.9, + "grad_norm": 0.5675284741828543, + "learning_rate": 1.2018690789536352e-06, + "loss": 1.8395, + "step": 11716 + }, + { + "epoch": 0.9, + "grad_norm": 0.5304234249325372, + "learning_rate": 1.19995626607603e-06, + "loss": 2.0355, + "step": 11717 + }, + { + "epoch": 0.9, + "grad_norm": 0.558426551540738, + "learning_rate": 1.1980449391329307e-06, + "loss": 1.8874, + "step": 11718 + }, + { + "epoch": 0.9, + "grad_norm": 0.5679184444588375, + "learning_rate": 1.196135098243667e-06, + "loss": 1.877, + "step": 11719 + }, + { + "epoch": 0.9, + "grad_norm": 0.5652704495527306, + "learning_rate": 1.1942267435274767e-06, + "loss": 2.0452, + "step": 11720 + }, + { + "epoch": 0.9, + "grad_norm": 0.5105423601508826, + "learning_rate": 1.1923198751035064e-06, + "loss": 1.9018, + "step": 11721 + }, + { + "epoch": 0.9, + "grad_norm": 0.589169380400336, + "learning_rate": 1.1904144930908161e-06, + "loss": 1.8704, + "step": 11722 + }, + { + "epoch": 0.9, + "grad_norm": 0.5453583277882987, + "learning_rate": 1.1885105976083555e-06, + "loss": 1.8728, + "step": 11723 + }, + { + "epoch": 0.9, + "grad_norm": 0.5318337947410641, + "learning_rate": 1.1866081887749986e-06, + "loss": 1.842, + "step": 11724 + }, + { + "epoch": 0.9, + "grad_norm": 0.5404784524096817, + "learning_rate": 1.1847072667095256e-06, + "loss": 2.0476, + "step": 11725 + }, + { + "epoch": 0.9, + "grad_norm": 0.563705007895485, + "learning_rate": 1.1828078315306052e-06, + "loss": 1.8918, + "step": 11726 + }, + { + "epoch": 0.9, + "grad_norm": 0.4945931238775916, + "learning_rate": 1.1809098833568426e-06, + "loss": 1.9327, + "step": 11727 + }, + { + "epoch": 0.9, + "grad_norm": 0.5627656930134513, + "learning_rate": 1.179013422306724e-06, + "loss": 1.8128, + "step": 11728 + }, + { + "epoch": 0.9, + "grad_norm": 0.569377198273101, + "learning_rate": 1.177118448498657e-06, + "loss": 2.0869, + "step": 11729 + }, + { + "epoch": 0.9, + "grad_norm": 0.5350003569583077, + "learning_rate": 1.1752249620509553e-06, + "loss": 1.8696, + "step": 11730 + }, + { + "epoch": 0.91, + "grad_norm": 0.568220048848639, + "learning_rate": 1.1733329630818273e-06, + "loss": 1.8396, + "step": 11731 + }, + { + "epoch": 0.91, + "grad_norm": 0.5397075551396808, + "learning_rate": 1.1714424517094065e-06, + "loss": 1.9979, + "step": 11732 + }, + { + "epoch": 0.91, + "grad_norm": 0.518590728379184, + "learning_rate": 1.1695534280517283e-06, + "loss": 1.8887, + "step": 11733 + }, + { + "epoch": 0.91, + "grad_norm": 0.5525872491023931, + "learning_rate": 1.1676658922267215e-06, + "loss": 1.8226, + "step": 11734 + }, + { + "epoch": 0.91, + "grad_norm": 0.5787253552890166, + "learning_rate": 1.1657798443522466e-06, + "loss": 1.8767, + "step": 11735 + }, + { + "epoch": 0.91, + "grad_norm": 0.5826549138136291, + "learning_rate": 1.1638952845460432e-06, + "loss": 1.8482, + "step": 11736 + }, + { + "epoch": 0.91, + "grad_norm": 0.5625548084775526, + "learning_rate": 1.1620122129257783e-06, + "loss": 2.0367, + "step": 11737 + }, + { + "epoch": 0.91, + "grad_norm": 0.5314618488795448, + "learning_rate": 1.1601306296090243e-06, + "loss": 1.8234, + "step": 11738 + }, + { + "epoch": 0.91, + "grad_norm": 0.5505654203767429, + "learning_rate": 1.1582505347132482e-06, + "loss": 1.8673, + "step": 11739 + }, + { + "epoch": 0.91, + "grad_norm": 0.50946198351235, + "learning_rate": 1.1563719283558371e-06, + "loss": 1.911, + "step": 11740 + }, + { + "epoch": 0.91, + "grad_norm": 0.5279208645648353, + "learning_rate": 1.1544948106540775e-06, + "loss": 2.0436, + "step": 11741 + }, + { + "epoch": 0.91, + "grad_norm": 0.5302752674669353, + "learning_rate": 1.1526191817251618e-06, + "loss": 1.78, + "step": 11742 + }, + { + "epoch": 0.91, + "grad_norm": 0.5365834237020748, + "learning_rate": 1.1507450416862075e-06, + "loss": 1.8179, + "step": 11743 + }, + { + "epoch": 0.91, + "grad_norm": 0.5765674355993516, + "learning_rate": 1.1488723906542098e-06, + "loss": 2.0456, + "step": 11744 + }, + { + "epoch": 0.91, + "grad_norm": 0.5583160738885791, + "learning_rate": 1.1470012287460918e-06, + "loss": 1.8891, + "step": 11745 + }, + { + "epoch": 0.91, + "grad_norm": 0.4981355220775474, + "learning_rate": 1.1451315560786796e-06, + "loss": 1.9219, + "step": 11746 + }, + { + "epoch": 0.91, + "grad_norm": 0.537244029985671, + "learning_rate": 1.143263372768702e-06, + "loss": 1.7573, + "step": 11747 + }, + { + "epoch": 0.91, + "grad_norm": 0.5670799208266605, + "learning_rate": 1.141396678932799e-06, + "loss": 1.8452, + "step": 11748 + }, + { + "epoch": 0.91, + "grad_norm": 0.5628178483534041, + "learning_rate": 1.1395314746875134e-06, + "loss": 2.0636, + "step": 11749 + }, + { + "epoch": 0.91, + "grad_norm": 0.5561326788896732, + "learning_rate": 1.1376677601492996e-06, + "loss": 1.8317, + "step": 11750 + }, + { + "epoch": 0.91, + "grad_norm": 0.5712972328181289, + "learning_rate": 1.1358055354345172e-06, + "loss": 1.816, + "step": 11751 + }, + { + "epoch": 0.91, + "grad_norm": 0.505380192796172, + "learning_rate": 1.1339448006594282e-06, + "loss": 1.9051, + "step": 11752 + }, + { + "epoch": 0.91, + "grad_norm": 0.5248917854481459, + "learning_rate": 1.1320855559402127e-06, + "loss": 2.0336, + "step": 11753 + }, + { + "epoch": 0.91, + "grad_norm": 0.5396657551550129, + "learning_rate": 1.1302278013929468e-06, + "loss": 1.8935, + "step": 11754 + }, + { + "epoch": 0.91, + "grad_norm": 0.5739422266499928, + "learning_rate": 1.1283715371336212e-06, + "loss": 1.8355, + "step": 11755 + }, + { + "epoch": 0.91, + "grad_norm": 0.5331158887257462, + "learning_rate": 1.1265167632781238e-06, + "loss": 1.8548, + "step": 11756 + }, + { + "epoch": 0.91, + "grad_norm": 0.534123068524729, + "learning_rate": 1.1246634799422617e-06, + "loss": 2.01, + "step": 11757 + }, + { + "epoch": 0.91, + "grad_norm": 0.4985067594614268, + "learning_rate": 1.1228116872417398e-06, + "loss": 1.9783, + "step": 11758 + }, + { + "epoch": 0.91, + "grad_norm": 0.5682062384155223, + "learning_rate": 1.120961385292177e-06, + "loss": 1.8863, + "step": 11759 + }, + { + "epoch": 0.91, + "grad_norm": 0.5769116928791308, + "learning_rate": 1.1191125742090885e-06, + "loss": 1.8365, + "step": 11760 + }, + { + "epoch": 0.91, + "grad_norm": 0.5190870138042077, + "learning_rate": 1.1172652541079103e-06, + "loss": 2.0264, + "step": 11761 + }, + { + "epoch": 0.91, + "grad_norm": 0.532632312095928, + "learning_rate": 1.1154194251039752e-06, + "loss": 1.8304, + "step": 11762 + }, + { + "epoch": 0.91, + "grad_norm": 0.5539867629519758, + "learning_rate": 1.113575087312524e-06, + "loss": 1.8593, + "step": 11763 + }, + { + "epoch": 0.91, + "grad_norm": 0.5203881259700491, + "learning_rate": 1.1117322408487064e-06, + "loss": 1.9438, + "step": 11764 + }, + { + "epoch": 0.91, + "grad_norm": 0.5354982316190441, + "learning_rate": 1.109890885827583e-06, + "loss": 2.0468, + "step": 11765 + }, + { + "epoch": 0.91, + "grad_norm": 0.5548755198085497, + "learning_rate": 1.108051022364115e-06, + "loss": 1.8531, + "step": 11766 + }, + { + "epoch": 0.91, + "grad_norm": 0.5369463412592698, + "learning_rate": 1.1062126505731713e-06, + "loss": 1.813, + "step": 11767 + }, + { + "epoch": 0.91, + "grad_norm": 0.5457424466873846, + "learning_rate": 1.1043757705695297e-06, + "loss": 1.8733, + "step": 11768 + }, + { + "epoch": 0.91, + "grad_norm": 0.5427023109282663, + "learning_rate": 1.102540382467876e-06, + "loss": 2.0437, + "step": 11769 + }, + { + "epoch": 0.91, + "grad_norm": 0.5457317006538139, + "learning_rate": 1.1007064863827966e-06, + "loss": 1.8539, + "step": 11770 + }, + { + "epoch": 0.91, + "grad_norm": 0.510378003769033, + "learning_rate": 1.098874082428794e-06, + "loss": 1.945, + "step": 11771 + }, + { + "epoch": 0.91, + "grad_norm": 0.5535389413191191, + "learning_rate": 1.0970431707202717e-06, + "loss": 1.8788, + "step": 11772 + }, + { + "epoch": 0.91, + "grad_norm": 0.5452996261570157, + "learning_rate": 1.0952137513715348e-06, + "loss": 2.0343, + "step": 11773 + }, + { + "epoch": 0.91, + "grad_norm": 0.544547770610094, + "learning_rate": 1.093385824496815e-06, + "loss": 1.8433, + "step": 11774 + }, + { + "epoch": 0.91, + "grad_norm": 0.5557503081773384, + "learning_rate": 1.0915593902102227e-06, + "loss": 1.8576, + "step": 11775 + }, + { + "epoch": 0.91, + "grad_norm": 0.5642614401685496, + "learning_rate": 1.0897344486257955e-06, + "loss": 1.8029, + "step": 11776 + }, + { + "epoch": 0.91, + "grad_norm": 0.5114328009568181, + "learning_rate": 1.0879109998574804e-06, + "loss": 2.0922, + "step": 11777 + }, + { + "epoch": 0.91, + "grad_norm": 0.5535535634538401, + "learning_rate": 1.0860890440191035e-06, + "loss": 1.841, + "step": 11778 + }, + { + "epoch": 0.91, + "grad_norm": 0.564030046211814, + "learning_rate": 1.0842685812244347e-06, + "loss": 1.8548, + "step": 11779 + }, + { + "epoch": 0.91, + "grad_norm": 0.51640556280171, + "learning_rate": 1.0824496115871303e-06, + "loss": 1.8047, + "step": 11780 + }, + { + "epoch": 0.91, + "grad_norm": 0.554209636544499, + "learning_rate": 1.0806321352207439e-06, + "loss": 2.0592, + "step": 11781 + }, + { + "epoch": 0.91, + "grad_norm": 0.5818740071796774, + "learning_rate": 1.0788161522387651e-06, + "loss": 1.8419, + "step": 11782 + }, + { + "epoch": 0.91, + "grad_norm": 0.5232021561864928, + "learning_rate": 1.0770016627545593e-06, + "loss": 1.9036, + "step": 11783 + }, + { + "epoch": 0.91, + "grad_norm": 0.5569178603423323, + "learning_rate": 1.075188666881416e-06, + "loss": 1.849, + "step": 11784 + }, + { + "epoch": 0.91, + "grad_norm": 0.5489386621755648, + "learning_rate": 1.0733771647325363e-06, + "loss": 2.0182, + "step": 11785 + }, + { + "epoch": 0.91, + "grad_norm": 0.5512114216480695, + "learning_rate": 1.0715671564210106e-06, + "loss": 1.884, + "step": 11786 + }, + { + "epoch": 0.91, + "grad_norm": 0.5556669596772178, + "learning_rate": 1.0697586420598427e-06, + "loss": 1.8795, + "step": 11787 + }, + { + "epoch": 0.91, + "grad_norm": 0.5540836288405323, + "learning_rate": 1.067951621761959e-06, + "loss": 1.8172, + "step": 11788 + }, + { + "epoch": 0.91, + "grad_norm": 0.5082737828472793, + "learning_rate": 1.0661460956401664e-06, + "loss": 2.1239, + "step": 11789 + }, + { + "epoch": 0.91, + "grad_norm": 0.5452476690803758, + "learning_rate": 1.0643420638071971e-06, + "loss": 1.8652, + "step": 11790 + }, + { + "epoch": 0.91, + "grad_norm": 0.5463291548225684, + "learning_rate": 1.0625395263756805e-06, + "loss": 1.8522, + "step": 11791 + }, + { + "epoch": 0.91, + "grad_norm": 0.5582753436705997, + "learning_rate": 1.060738483458157e-06, + "loss": 1.8517, + "step": 11792 + }, + { + "epoch": 0.91, + "grad_norm": 0.5411743462484173, + "learning_rate": 1.0589389351670809e-06, + "loss": 2.0387, + "step": 11793 + }, + { + "epoch": 0.91, + "grad_norm": 0.5494928086444191, + "learning_rate": 1.0571408816147987e-06, + "loss": 1.8261, + "step": 11794 + }, + { + "epoch": 0.91, + "grad_norm": 0.51612670444131, + "learning_rate": 1.055344322913568e-06, + "loss": 1.9503, + "step": 11795 + }, + { + "epoch": 0.91, + "grad_norm": 0.5774723610747716, + "learning_rate": 1.0535492591755597e-06, + "loss": 1.8094, + "step": 11796 + }, + { + "epoch": 0.91, + "grad_norm": 0.5319660726300384, + "learning_rate": 1.0517556905128457e-06, + "loss": 2.0339, + "step": 11797 + }, + { + "epoch": 0.91, + "grad_norm": 0.5421727360932004, + "learning_rate": 1.0499636170374056e-06, + "loss": 1.8485, + "step": 11798 + }, + { + "epoch": 0.91, + "grad_norm": 0.5670375665884982, + "learning_rate": 1.048173038861125e-06, + "loss": 1.8999, + "step": 11799 + }, + { + "epoch": 0.91, + "grad_norm": 0.5772303193809998, + "learning_rate": 1.0463839560958006e-06, + "loss": 1.8618, + "step": 11800 + }, + { + "epoch": 0.91, + "grad_norm": 0.5480437310431202, + "learning_rate": 1.0445963688531264e-06, + "loss": 2.0734, + "step": 11801 + }, + { + "epoch": 0.91, + "grad_norm": 0.517455140826079, + "learning_rate": 1.042810277244713e-06, + "loss": 1.8693, + "step": 11802 + }, + { + "epoch": 0.91, + "grad_norm": 0.5783830105799179, + "learning_rate": 1.041025681382074e-06, + "loss": 1.8309, + "step": 11803 + }, + { + "epoch": 0.91, + "grad_norm": 0.5635013311611135, + "learning_rate": 1.0392425813766287e-06, + "loss": 1.8114, + "step": 11804 + }, + { + "epoch": 0.91, + "grad_norm": 0.575997976180365, + "learning_rate": 1.0374609773396992e-06, + "loss": 2.0573, + "step": 11805 + }, + { + "epoch": 0.91, + "grad_norm": 0.548188256073481, + "learning_rate": 1.0356808693825244e-06, + "loss": 1.8728, + "step": 11806 + }, + { + "epoch": 0.91, + "grad_norm": 0.5502040637247195, + "learning_rate": 1.03390225761624e-06, + "loss": 1.8539, + "step": 11807 + }, + { + "epoch": 0.91, + "grad_norm": 0.5188579662928685, + "learning_rate": 1.0321251421518935e-06, + "loss": 1.9308, + "step": 11808 + }, + { + "epoch": 0.91, + "grad_norm": 0.5457352640913005, + "learning_rate": 1.0303495231004324e-06, + "loss": 2.0722, + "step": 11809 + }, + { + "epoch": 0.91, + "grad_norm": 0.5609458793380842, + "learning_rate": 1.0285754005727233e-06, + "loss": 1.8498, + "step": 11810 + }, + { + "epoch": 0.91, + "grad_norm": 0.5482076933283306, + "learning_rate": 1.0268027746795307e-06, + "loss": 1.8252, + "step": 11811 + }, + { + "epoch": 0.91, + "grad_norm": 0.5931409980376064, + "learning_rate": 1.025031645531524e-06, + "loss": 1.8747, + "step": 11812 + }, + { + "epoch": 0.91, + "grad_norm": 0.5587364256945818, + "learning_rate": 1.0232620132392817e-06, + "loss": 2.0968, + "step": 11813 + }, + { + "epoch": 0.91, + "grad_norm": 0.5222389838168767, + "learning_rate": 1.0214938779132932e-06, + "loss": 1.9463, + "step": 11814 + }, + { + "epoch": 0.91, + "grad_norm": 0.5329884277189912, + "learning_rate": 1.0197272396639478e-06, + "loss": 1.8673, + "step": 11815 + }, + { + "epoch": 0.91, + "grad_norm": 0.5403492472536453, + "learning_rate": 1.0179620986015465e-06, + "loss": 1.8685, + "step": 11816 + }, + { + "epoch": 0.91, + "grad_norm": 0.5545107649082348, + "learning_rate": 1.0161984548362869e-06, + "loss": 2.0621, + "step": 11817 + }, + { + "epoch": 0.91, + "grad_norm": 0.5415432863081561, + "learning_rate": 1.0144363084782864e-06, + "loss": 1.859, + "step": 11818 + }, + { + "epoch": 0.91, + "grad_norm": 0.567183814006836, + "learning_rate": 1.0126756596375686e-06, + "loss": 1.8533, + "step": 11819 + }, + { + "epoch": 0.91, + "grad_norm": 0.4962614085235359, + "learning_rate": 1.0109165084240424e-06, + "loss": 1.9117, + "step": 11820 + }, + { + "epoch": 0.91, + "grad_norm": 0.5123647679954968, + "learning_rate": 1.0091588549475534e-06, + "loss": 2.0395, + "step": 11821 + }, + { + "epoch": 0.91, + "grad_norm": 0.5399521632202037, + "learning_rate": 1.007402699317836e-06, + "loss": 1.8251, + "step": 11822 + }, + { + "epoch": 0.91, + "grad_norm": 0.5698949193600872, + "learning_rate": 1.0056480416445273e-06, + "loss": 1.8861, + "step": 11823 + }, + { + "epoch": 0.91, + "grad_norm": 0.5317784965506607, + "learning_rate": 1.0038948820371874e-06, + "loss": 1.8325, + "step": 11824 + }, + { + "epoch": 0.91, + "grad_norm": 0.5310758506657064, + "learning_rate": 1.0021432206052645e-06, + "loss": 2.0241, + "step": 11825 + }, + { + "epoch": 0.91, + "grad_norm": 0.5221094901156447, + "learning_rate": 1.0003930574581266e-06, + "loss": 1.9432, + "step": 11826 + }, + { + "epoch": 0.91, + "grad_norm": 0.5571145825080576, + "learning_rate": 9.986443927050475e-07, + "loss": 1.895, + "step": 11827 + }, + { + "epoch": 0.91, + "grad_norm": 0.5262985987607308, + "learning_rate": 9.96897226455193e-07, + "loss": 1.9005, + "step": 11828 + }, + { + "epoch": 0.91, + "grad_norm": 0.56579582424519, + "learning_rate": 9.951515588176558e-07, + "loss": 2.0806, + "step": 11829 + }, + { + "epoch": 0.91, + "grad_norm": 0.5497881730011629, + "learning_rate": 9.934073899014213e-07, + "loss": 1.8488, + "step": 11830 + }, + { + "epoch": 0.91, + "grad_norm": 0.5471937685333395, + "learning_rate": 9.9166471981538e-07, + "loss": 1.8683, + "step": 11831 + }, + { + "epoch": 0.91, + "grad_norm": 0.5391937218653906, + "learning_rate": 9.899235486683451e-07, + "loss": 1.8482, + "step": 11832 + }, + { + "epoch": 0.91, + "grad_norm": 0.5224584781116529, + "learning_rate": 9.881838765690154e-07, + "loss": 2.0845, + "step": 11833 + }, + { + "epoch": 0.91, + "grad_norm": 0.5753857611989643, + "learning_rate": 9.864457036260072e-07, + "loss": 1.8834, + "step": 11834 + }, + { + "epoch": 0.91, + "grad_norm": 0.5418116486071205, + "learning_rate": 9.8470902994785e-07, + "loss": 1.902, + "step": 11835 + }, + { + "epoch": 0.91, + "grad_norm": 0.5563870468251938, + "learning_rate": 9.829738556429629e-07, + "loss": 1.8204, + "step": 11836 + }, + { + "epoch": 0.91, + "grad_norm": 0.528432009452502, + "learning_rate": 9.812401808196813e-07, + "loss": 2.0238, + "step": 11837 + }, + { + "epoch": 0.91, + "grad_norm": 0.5306562901430418, + "learning_rate": 9.795080055862466e-07, + "loss": 1.8485, + "step": 11838 + }, + { + "epoch": 0.91, + "grad_norm": 0.5152606913602151, + "learning_rate": 9.777773300508026e-07, + "loss": 1.9189, + "step": 11839 + }, + { + "epoch": 0.91, + "grad_norm": 0.5265339910541998, + "learning_rate": 9.760481543214128e-07, + "loss": 1.8243, + "step": 11840 + }, + { + "epoch": 0.91, + "grad_norm": 0.533441625616188, + "learning_rate": 9.743204785060246e-07, + "loss": 2.0289, + "step": 11841 + }, + { + "epoch": 0.91, + "grad_norm": 0.5363355485539634, + "learning_rate": 9.725943027125124e-07, + "loss": 1.8521, + "step": 11842 + }, + { + "epoch": 0.91, + "grad_norm": 0.5691911052026314, + "learning_rate": 9.70869627048643e-07, + "loss": 1.8488, + "step": 11843 + }, + { + "epoch": 0.91, + "grad_norm": 0.544980388832805, + "learning_rate": 9.691464516220966e-07, + "loss": 1.84, + "step": 11844 + }, + { + "epoch": 0.91, + "grad_norm": 0.5081780620891033, + "learning_rate": 9.674247765404571e-07, + "loss": 2.0888, + "step": 11845 + }, + { + "epoch": 0.91, + "grad_norm": 0.5600623798900044, + "learning_rate": 9.657046019112188e-07, + "loss": 1.8304, + "step": 11846 + }, + { + "epoch": 0.91, + "grad_norm": 0.5476236120841461, + "learning_rate": 9.639859278417761e-07, + "loss": 1.87, + "step": 11847 + }, + { + "epoch": 0.91, + "grad_norm": 0.5410034801548375, + "learning_rate": 9.622687544394382e-07, + "loss": 1.8196, + "step": 11848 + }, + { + "epoch": 0.91, + "grad_norm": 0.5711445190650498, + "learning_rate": 9.605530818114077e-07, + "loss": 2.0827, + "step": 11849 + }, + { + "epoch": 0.91, + "grad_norm": 0.5370827074356536, + "learning_rate": 9.58838910064805e-07, + "loss": 1.8848, + "step": 11850 + }, + { + "epoch": 0.91, + "grad_norm": 0.522896154780989, + "learning_rate": 9.571262393066548e-07, + "loss": 1.8907, + "step": 11851 + }, + { + "epoch": 0.91, + "grad_norm": 0.549417599896585, + "learning_rate": 9.554150696438834e-07, + "loss": 1.8545, + "step": 11852 + }, + { + "epoch": 0.91, + "grad_norm": 0.5608399590410367, + "learning_rate": 9.537054011833247e-07, + "loss": 2.0427, + "step": 11853 + }, + { + "epoch": 0.91, + "grad_norm": 0.5605400075515798, + "learning_rate": 9.519972340317262e-07, + "loss": 1.896, + "step": 11854 + }, + { + "epoch": 0.91, + "grad_norm": 0.5587087769314447, + "learning_rate": 9.50290568295728e-07, + "loss": 1.8249, + "step": 11855 + }, + { + "epoch": 0.91, + "grad_norm": 0.5425341471811116, + "learning_rate": 9.48585404081892e-07, + "loss": 1.8339, + "step": 11856 + }, + { + "epoch": 0.91, + "grad_norm": 0.5388478651779817, + "learning_rate": 9.468817414966719e-07, + "loss": 2.0671, + "step": 11857 + }, + { + "epoch": 0.91, + "grad_norm": 0.505354736284435, + "learning_rate": 9.451795806464381e-07, + "loss": 1.94, + "step": 11858 + }, + { + "epoch": 0.91, + "grad_norm": 0.5663372866657157, + "learning_rate": 9.43478921637464e-07, + "loss": 1.8341, + "step": 11859 + }, + { + "epoch": 0.91, + "grad_norm": 0.5885815708016894, + "learning_rate": 9.417797645759285e-07, + "loss": 1.8658, + "step": 11860 + }, + { + "epoch": 0.92, + "grad_norm": 0.5469144714152501, + "learning_rate": 9.400821095679136e-07, + "loss": 2.0374, + "step": 11861 + }, + { + "epoch": 0.92, + "grad_norm": 0.5473432494973925, + "learning_rate": 9.383859567194148e-07, + "loss": 1.8638, + "step": 11862 + }, + { + "epoch": 0.92, + "grad_norm": 0.5469174168439362, + "learning_rate": 9.36691306136328e-07, + "loss": 1.8536, + "step": 11863 + }, + { + "epoch": 0.92, + "grad_norm": 0.4974083892842534, + "learning_rate": 9.349981579244549e-07, + "loss": 1.9522, + "step": 11864 + }, + { + "epoch": 0.92, + "grad_norm": 0.574733731721297, + "learning_rate": 9.333065121895107e-07, + "loss": 2.1142, + "step": 11865 + }, + { + "epoch": 0.92, + "grad_norm": 0.5607069446456302, + "learning_rate": 9.31616369037111e-07, + "loss": 1.8515, + "step": 11866 + }, + { + "epoch": 0.92, + "grad_norm": 0.5805434259357729, + "learning_rate": 9.299277285727714e-07, + "loss": 1.81, + "step": 11867 + }, + { + "epoch": 0.92, + "grad_norm": 0.5495817125195068, + "learning_rate": 9.282405909019298e-07, + "loss": 1.8622, + "step": 11868 + }, + { + "epoch": 0.92, + "grad_norm": 0.5443258688246629, + "learning_rate": 9.265549561299213e-07, + "loss": 2.0641, + "step": 11869 + }, + { + "epoch": 0.92, + "grad_norm": 0.5226504797415402, + "learning_rate": 9.248708243619758e-07, + "loss": 1.922, + "step": 11870 + }, + { + "epoch": 0.92, + "grad_norm": 0.5643166882521794, + "learning_rate": 9.231881957032534e-07, + "loss": 1.8333, + "step": 11871 + }, + { + "epoch": 0.92, + "grad_norm": 0.5662526832419851, + "learning_rate": 9.21507070258798e-07, + "loss": 1.847, + "step": 11872 + }, + { + "epoch": 0.92, + "grad_norm": 0.5338919919606523, + "learning_rate": 9.198274481335728e-07, + "loss": 2.0022, + "step": 11873 + }, + { + "epoch": 0.92, + "grad_norm": 0.5620204242449471, + "learning_rate": 9.181493294324494e-07, + "loss": 1.8835, + "step": 11874 + }, + { + "epoch": 0.92, + "grad_norm": 0.5277450015660122, + "learning_rate": 9.164727142601859e-07, + "loss": 1.8417, + "step": 11875 + }, + { + "epoch": 0.92, + "grad_norm": 0.5012193863953166, + "learning_rate": 9.147976027214761e-07, + "loss": 1.8792, + "step": 11876 + }, + { + "epoch": 0.92, + "grad_norm": 0.5795859162532453, + "learning_rate": 9.131239949208947e-07, + "loss": 2.0434, + "step": 11877 + }, + { + "epoch": 0.92, + "grad_norm": 0.5418394861459361, + "learning_rate": 9.114518909629278e-07, + "loss": 1.8316, + "step": 11878 + }, + { + "epoch": 0.92, + "grad_norm": 0.5660283943019464, + "learning_rate": 9.097812909519859e-07, + "loss": 1.8387, + "step": 11879 + }, + { + "epoch": 0.92, + "grad_norm": 0.5736851482552975, + "learning_rate": 9.081121949923582e-07, + "loss": 1.8834, + "step": 11880 + }, + { + "epoch": 0.92, + "grad_norm": 0.5757140368323163, + "learning_rate": 9.064446031882556e-07, + "loss": 2.0582, + "step": 11881 + }, + { + "epoch": 0.92, + "grad_norm": 0.5180111595949544, + "learning_rate": 9.047785156438033e-07, + "loss": 1.9046, + "step": 11882 + }, + { + "epoch": 0.92, + "grad_norm": 0.5560004338699833, + "learning_rate": 9.031139324630095e-07, + "loss": 1.8751, + "step": 11883 + }, + { + "epoch": 0.92, + "grad_norm": 0.544977212663069, + "learning_rate": 9.014508537498051e-07, + "loss": 1.8412, + "step": 11884 + }, + { + "epoch": 0.92, + "grad_norm": 0.5382486691967211, + "learning_rate": 8.997892796080209e-07, + "loss": 2.097, + "step": 11885 + }, + { + "epoch": 0.92, + "grad_norm": 0.556459628801389, + "learning_rate": 8.981292101414019e-07, + "loss": 1.8018, + "step": 11886 + }, + { + "epoch": 0.92, + "grad_norm": 0.5458398133795308, + "learning_rate": 8.96470645453587e-07, + "loss": 1.8675, + "step": 11887 + }, + { + "epoch": 0.92, + "grad_norm": 0.5384218146489703, + "learning_rate": 8.948135856481298e-07, + "loss": 1.8591, + "step": 11888 + }, + { + "epoch": 0.92, + "grad_norm": 0.5335689472800059, + "learning_rate": 8.931580308284893e-07, + "loss": 1.9734, + "step": 11889 + }, + { + "epoch": 0.92, + "grad_norm": 0.5448979333650018, + "learning_rate": 8.915039810980242e-07, + "loss": 2.0614, + "step": 11890 + }, + { + "epoch": 0.92, + "grad_norm": 0.5341996357206228, + "learning_rate": 8.898514365600074e-07, + "loss": 1.8323, + "step": 11891 + }, + { + "epoch": 0.92, + "grad_norm": 0.5462844441674541, + "learning_rate": 8.882003973176095e-07, + "loss": 1.797, + "step": 11892 + }, + { + "epoch": 0.92, + "grad_norm": 0.5686052216123942, + "learning_rate": 8.865508634739172e-07, + "loss": 2.0506, + "step": 11893 + }, + { + "epoch": 0.92, + "grad_norm": 0.5421570968389228, + "learning_rate": 8.84902835131915e-07, + "loss": 1.8453, + "step": 11894 + }, + { + "epoch": 0.92, + "grad_norm": 0.5246606184418561, + "learning_rate": 8.832563123944926e-07, + "loss": 1.9431, + "step": 11895 + }, + { + "epoch": 0.92, + "grad_norm": 0.5710368911481135, + "learning_rate": 8.816112953644567e-07, + "loss": 1.8928, + "step": 11896 + }, + { + "epoch": 0.92, + "grad_norm": 0.5574967825633839, + "learning_rate": 8.799677841445059e-07, + "loss": 2.0443, + "step": 11897 + }, + { + "epoch": 0.92, + "grad_norm": 0.5468648614169509, + "learning_rate": 8.78325778837255e-07, + "loss": 1.8276, + "step": 11898 + }, + { + "epoch": 0.92, + "grad_norm": 0.5980513193997323, + "learning_rate": 8.766852795452224e-07, + "loss": 1.8486, + "step": 11899 + }, + { + "epoch": 0.92, + "grad_norm": 0.5638881637125096, + "learning_rate": 8.750462863708259e-07, + "loss": 1.8658, + "step": 11900 + }, + { + "epoch": 0.92, + "grad_norm": 0.5021591731341097, + "learning_rate": 8.734087994163975e-07, + "loss": 1.9167, + "step": 11901 + }, + { + "epoch": 0.92, + "grad_norm": 0.5465755382603655, + "learning_rate": 8.717728187841722e-07, + "loss": 2.0215, + "step": 11902 + }, + { + "epoch": 0.92, + "grad_norm": 0.5423741351545511, + "learning_rate": 8.701383445762934e-07, + "loss": 1.8442, + "step": 11903 + }, + { + "epoch": 0.92, + "grad_norm": 0.5428343966907364, + "learning_rate": 8.685053768948043e-07, + "loss": 1.8557, + "step": 11904 + }, + { + "epoch": 0.92, + "grad_norm": 0.534695345026071, + "learning_rate": 8.668739158416622e-07, + "loss": 2.0782, + "step": 11905 + }, + { + "epoch": 0.92, + "grad_norm": 0.5719366583091761, + "learning_rate": 8.652439615187163e-07, + "loss": 1.8724, + "step": 11906 + }, + { + "epoch": 0.92, + "grad_norm": 0.5087360655115176, + "learning_rate": 8.636155140277408e-07, + "loss": 1.883, + "step": 11907 + }, + { + "epoch": 0.92, + "grad_norm": 0.5816074291386367, + "learning_rate": 8.619885734704042e-07, + "loss": 1.8206, + "step": 11908 + }, + { + "epoch": 0.92, + "grad_norm": 0.5646659408652039, + "learning_rate": 8.603631399482809e-07, + "loss": 1.878, + "step": 11909 + }, + { + "epoch": 0.92, + "grad_norm": 0.5314669057588279, + "learning_rate": 8.587392135628591e-07, + "loss": 2.0513, + "step": 11910 + }, + { + "epoch": 0.92, + "grad_norm": 0.5532194759434288, + "learning_rate": 8.57116794415519e-07, + "loss": 1.8029, + "step": 11911 + }, + { + "epoch": 0.92, + "grad_norm": 0.539268647887158, + "learning_rate": 8.554958826075599e-07, + "loss": 1.8628, + "step": 11912 + }, + { + "epoch": 0.92, + "grad_norm": 0.5081376630713487, + "learning_rate": 8.538764782401842e-07, + "loss": 1.9415, + "step": 11913 + }, + { + "epoch": 0.92, + "grad_norm": 0.5603904436695119, + "learning_rate": 8.52258581414489e-07, + "loss": 2.0655, + "step": 11914 + }, + { + "epoch": 0.92, + "grad_norm": 0.5476354524819602, + "learning_rate": 8.506421922314934e-07, + "loss": 1.8395, + "step": 11915 + }, + { + "epoch": 0.92, + "grad_norm": 0.5638888659668961, + "learning_rate": 8.490273107921165e-07, + "loss": 1.8492, + "step": 11916 + }, + { + "epoch": 0.92, + "grad_norm": 0.5228077267876022, + "learning_rate": 8.474139371971751e-07, + "loss": 2.0645, + "step": 11917 + }, + { + "epoch": 0.92, + "grad_norm": 0.5409271899646221, + "learning_rate": 8.45802071547408e-07, + "loss": 1.8687, + "step": 11918 + }, + { + "epoch": 0.92, + "grad_norm": 0.5621099302541483, + "learning_rate": 8.44191713943443e-07, + "loss": 1.839, + "step": 11919 + }, + { + "epoch": 0.92, + "grad_norm": 0.5502732220891616, + "learning_rate": 8.425828644858219e-07, + "loss": 1.9343, + "step": 11920 + }, + { + "epoch": 0.92, + "grad_norm": 0.5857710001146665, + "learning_rate": 8.409755232749977e-07, + "loss": 1.8051, + "step": 11921 + }, + { + "epoch": 0.92, + "grad_norm": 0.5695933943707873, + "learning_rate": 8.393696904113152e-07, + "loss": 2.0803, + "step": 11922 + }, + { + "epoch": 0.92, + "grad_norm": 0.5618243614538307, + "learning_rate": 8.377653659950441e-07, + "loss": 1.8803, + "step": 11923 + }, + { + "epoch": 0.92, + "grad_norm": 0.531793837697941, + "learning_rate": 8.361625501263348e-07, + "loss": 1.8341, + "step": 11924 + }, + { + "epoch": 0.92, + "grad_norm": 0.552410194383956, + "learning_rate": 8.345612429052658e-07, + "loss": 2.0645, + "step": 11925 + }, + { + "epoch": 0.92, + "grad_norm": 0.5021487485531813, + "learning_rate": 8.329614444318179e-07, + "loss": 1.9366, + "step": 11926 + }, + { + "epoch": 0.92, + "grad_norm": 0.5400174975334813, + "learning_rate": 8.313631548058642e-07, + "loss": 1.8499, + "step": 11927 + }, + { + "epoch": 0.92, + "grad_norm": 0.5691426724455668, + "learning_rate": 8.297663741271916e-07, + "loss": 1.8464, + "step": 11928 + }, + { + "epoch": 0.92, + "grad_norm": 0.5134475907701236, + "learning_rate": 8.281711024955063e-07, + "loss": 2.0638, + "step": 11929 + }, + { + "epoch": 0.92, + "grad_norm": 0.5432226467773476, + "learning_rate": 8.265773400103954e-07, + "loss": 1.8662, + "step": 11930 + }, + { + "epoch": 0.92, + "grad_norm": 0.5298302401537774, + "learning_rate": 8.249850867713682e-07, + "loss": 1.8365, + "step": 11931 + }, + { + "epoch": 0.92, + "grad_norm": 0.5060669420050612, + "learning_rate": 8.233943428778368e-07, + "loss": 1.9298, + "step": 11932 + }, + { + "epoch": 0.92, + "grad_norm": 0.5486658629518246, + "learning_rate": 8.218051084291162e-07, + "loss": 1.8796, + "step": 11933 + }, + { + "epoch": 0.92, + "grad_norm": 0.5153681845088866, + "learning_rate": 8.202173835244271e-07, + "loss": 2.0416, + "step": 11934 + }, + { + "epoch": 0.92, + "grad_norm": 0.5483279979116691, + "learning_rate": 8.186311682629011e-07, + "loss": 1.8554, + "step": 11935 + }, + { + "epoch": 0.92, + "grad_norm": 0.5314938246671458, + "learning_rate": 8.170464627435704e-07, + "loss": 1.82, + "step": 11936 + }, + { + "epoch": 0.92, + "grad_norm": 0.5298649424089256, + "learning_rate": 8.15463267065375e-07, + "loss": 2.0815, + "step": 11937 + }, + { + "epoch": 0.92, + "grad_norm": 0.5278995215211619, + "learning_rate": 8.138815813271611e-07, + "loss": 1.9195, + "step": 11938 + }, + { + "epoch": 0.92, + "grad_norm": 0.5718884048293611, + "learning_rate": 8.123014056276801e-07, + "loss": 1.8503, + "step": 11939 + }, + { + "epoch": 0.92, + "grad_norm": 0.5672190365499018, + "learning_rate": 8.107227400655864e-07, + "loss": 1.8631, + "step": 11940 + }, + { + "epoch": 0.92, + "grad_norm": 0.5472847075074475, + "learning_rate": 8.09145584739443e-07, + "loss": 1.854, + "step": 11941 + }, + { + "epoch": 0.92, + "grad_norm": 0.5397064844627799, + "learning_rate": 8.07569939747721e-07, + "loss": 2.0831, + "step": 11942 + }, + { + "epoch": 0.92, + "grad_norm": 0.5576858655157813, + "learning_rate": 8.059958051887917e-07, + "loss": 1.8548, + "step": 11943 + }, + { + "epoch": 0.92, + "grad_norm": 0.5005606177798906, + "learning_rate": 8.044231811609376e-07, + "loss": 1.8836, + "step": 11944 + }, + { + "epoch": 0.92, + "grad_norm": 0.5706739615983839, + "learning_rate": 8.028520677623414e-07, + "loss": 1.8601, + "step": 11945 + }, + { + "epoch": 0.92, + "grad_norm": 0.5290625460223916, + "learning_rate": 8.012824650910938e-07, + "loss": 2.0629, + "step": 11946 + }, + { + "epoch": 0.92, + "grad_norm": 0.5576930280306315, + "learning_rate": 7.997143732451945e-07, + "loss": 1.8738, + "step": 11947 + }, + { + "epoch": 0.92, + "grad_norm": 0.5630809797159464, + "learning_rate": 7.981477923225428e-07, + "loss": 1.8955, + "step": 11948 + }, + { + "epoch": 0.92, + "grad_norm": 0.5546566379075027, + "learning_rate": 7.965827224209466e-07, + "loss": 2.0761, + "step": 11949 + }, + { + "epoch": 0.92, + "grad_norm": 0.5702444145629301, + "learning_rate": 7.950191636381249e-07, + "loss": 1.8768, + "step": 11950 + }, + { + "epoch": 0.92, + "grad_norm": 0.5116258652672493, + "learning_rate": 7.934571160716914e-07, + "loss": 1.9289, + "step": 11951 + }, + { + "epoch": 0.92, + "grad_norm": 0.5583553447553387, + "learning_rate": 7.918965798191763e-07, + "loss": 1.8486, + "step": 11952 + }, + { + "epoch": 0.92, + "grad_norm": 0.5521591979988606, + "learning_rate": 7.90337554977999e-07, + "loss": 1.8504, + "step": 11953 + }, + { + "epoch": 0.92, + "grad_norm": 0.5431407955011353, + "learning_rate": 7.887800416455093e-07, + "loss": 2.0314, + "step": 11954 + }, + { + "epoch": 0.92, + "grad_norm": 0.6003466095351941, + "learning_rate": 7.872240399189434e-07, + "loss": 1.8956, + "step": 11955 + }, + { + "epoch": 0.92, + "grad_norm": 0.5541574981327462, + "learning_rate": 7.856695498954486e-07, + "loss": 1.8989, + "step": 11956 + }, + { + "epoch": 0.92, + "grad_norm": 0.5086403327007514, + "learning_rate": 7.841165716720833e-07, + "loss": 1.876, + "step": 11957 + }, + { + "epoch": 0.92, + "grad_norm": 0.5613833546116672, + "learning_rate": 7.825651053457949e-07, + "loss": 2.0115, + "step": 11958 + }, + { + "epoch": 0.92, + "grad_norm": 0.5538188415848389, + "learning_rate": 7.810151510134589e-07, + "loss": 1.8556, + "step": 11959 + }, + { + "epoch": 0.92, + "grad_norm": 0.5390133573782259, + "learning_rate": 7.794667087718421e-07, + "loss": 1.7937, + "step": 11960 + }, + { + "epoch": 0.92, + "grad_norm": 0.5089928894748439, + "learning_rate": 7.779197787176145e-07, + "loss": 2.0045, + "step": 11961 + }, + { + "epoch": 0.92, + "grad_norm": 0.5459011013537541, + "learning_rate": 7.763743609473628e-07, + "loss": 1.8717, + "step": 11962 + }, + { + "epoch": 0.92, + "grad_norm": 0.5186150708865325, + "learning_rate": 7.748304555575792e-07, + "loss": 1.9197, + "step": 11963 + }, + { + "epoch": 0.92, + "grad_norm": 0.5549381498410941, + "learning_rate": 7.73288062644642e-07, + "loss": 1.8279, + "step": 11964 + }, + { + "epoch": 0.92, + "grad_norm": 0.5466436116645752, + "learning_rate": 7.717471823048605e-07, + "loss": 1.9041, + "step": 11965 + }, + { + "epoch": 0.92, + "grad_norm": 0.5474183394380007, + "learning_rate": 7.702078146344354e-07, + "loss": 2.1042, + "step": 11966 + }, + { + "epoch": 0.92, + "grad_norm": 0.5713697551234194, + "learning_rate": 7.686699597294705e-07, + "loss": 1.8527, + "step": 11967 + }, + { + "epoch": 0.92, + "grad_norm": 0.5911695677904448, + "learning_rate": 7.671336176859917e-07, + "loss": 1.8762, + "step": 11968 + }, + { + "epoch": 0.92, + "grad_norm": 0.542545247927804, + "learning_rate": 7.655987885999056e-07, + "loss": 1.9272, + "step": 11969 + }, + { + "epoch": 0.92, + "grad_norm": 0.5542683880007954, + "learning_rate": 7.640654725670465e-07, + "loss": 2.0525, + "step": 11970 + }, + { + "epoch": 0.92, + "grad_norm": 0.5647088679849149, + "learning_rate": 7.625336696831464e-07, + "loss": 1.8235, + "step": 11971 + }, + { + "epoch": 0.92, + "grad_norm": 0.5472405972053979, + "learning_rate": 7.610033800438344e-07, + "loss": 1.8649, + "step": 11972 + }, + { + "epoch": 0.92, + "grad_norm": 0.5640594451569386, + "learning_rate": 7.594746037446642e-07, + "loss": 1.8706, + "step": 11973 + }, + { + "epoch": 0.92, + "grad_norm": 0.5311015609978653, + "learning_rate": 7.579473408810739e-07, + "loss": 2.0539, + "step": 11974 + }, + { + "epoch": 0.92, + "grad_norm": 0.5284631743060324, + "learning_rate": 7.564215915484146e-07, + "loss": 1.9215, + "step": 11975 + }, + { + "epoch": 0.92, + "grad_norm": 0.5402489815762331, + "learning_rate": 7.548973558419603e-07, + "loss": 1.8788, + "step": 11976 + }, + { + "epoch": 0.92, + "grad_norm": 0.5405191594318214, + "learning_rate": 7.533746338568599e-07, + "loss": 1.8656, + "step": 11977 + }, + { + "epoch": 0.92, + "grad_norm": 0.5359344702495117, + "learning_rate": 7.518534256881898e-07, + "loss": 2.0282, + "step": 11978 + }, + { + "epoch": 0.92, + "grad_norm": 0.5406650486089863, + "learning_rate": 7.503337314309217e-07, + "loss": 1.8385, + "step": 11979 + }, + { + "epoch": 0.92, + "grad_norm": 0.5429358016034893, + "learning_rate": 7.488155511799433e-07, + "loss": 1.8745, + "step": 11980 + }, + { + "epoch": 0.92, + "grad_norm": 0.5114506174523652, + "learning_rate": 7.472988850300317e-07, + "loss": 2.0855, + "step": 11981 + }, + { + "epoch": 0.92, + "grad_norm": 0.5296394043509212, + "learning_rate": 7.45783733075886e-07, + "loss": 1.9833, + "step": 11982 + }, + { + "epoch": 0.92, + "grad_norm": 0.548173780356348, + "learning_rate": 7.442700954121001e-07, + "loss": 1.8445, + "step": 11983 + }, + { + "epoch": 0.92, + "grad_norm": 0.5317315758945761, + "learning_rate": 7.42757972133179e-07, + "loss": 1.7907, + "step": 11984 + }, + { + "epoch": 0.92, + "grad_norm": 0.5706128891721373, + "learning_rate": 7.412473633335277e-07, + "loss": 1.868, + "step": 11985 + }, + { + "epoch": 0.92, + "grad_norm": 0.5618555972535536, + "learning_rate": 7.397382691074595e-07, + "loss": 2.0272, + "step": 11986 + }, + { + "epoch": 0.92, + "grad_norm": 0.5499407966878038, + "learning_rate": 7.382306895491964e-07, + "loss": 1.8174, + "step": 11987 + }, + { + "epoch": 0.92, + "grad_norm": 0.5200596198104859, + "learning_rate": 7.367246247528603e-07, + "loss": 1.9104, + "step": 11988 + }, + { + "epoch": 0.92, + "grad_norm": 0.5436961977209263, + "learning_rate": 7.352200748124816e-07, + "loss": 1.8366, + "step": 11989 + }, + { + "epoch": 0.93, + "grad_norm": 0.532953175263536, + "learning_rate": 7.337170398219962e-07, + "loss": 2.0576, + "step": 11990 + }, + { + "epoch": 0.93, + "grad_norm": 0.534701341634024, + "learning_rate": 7.322155198752456e-07, + "loss": 1.8574, + "step": 11991 + }, + { + "epoch": 0.93, + "grad_norm": 0.5433682535268415, + "learning_rate": 7.307155150659744e-07, + "loss": 1.8851, + "step": 11992 + }, + { + "epoch": 0.93, + "grad_norm": 0.5719929165540277, + "learning_rate": 7.292170254878328e-07, + "loss": 1.83, + "step": 11993 + }, + { + "epoch": 0.93, + "grad_norm": 0.5096531979117557, + "learning_rate": 7.277200512343818e-07, + "loss": 2.0602, + "step": 11994 + }, + { + "epoch": 0.93, + "grad_norm": 0.5396676452755502, + "learning_rate": 7.262245923990773e-07, + "loss": 1.8827, + "step": 11995 + }, + { + "epoch": 0.93, + "grad_norm": 0.5474877917638199, + "learning_rate": 7.247306490752948e-07, + "loss": 1.8967, + "step": 11996 + }, + { + "epoch": 0.93, + "grad_norm": 0.5404838591180459, + "learning_rate": 7.232382213563011e-07, + "loss": 1.8592, + "step": 11997 + }, + { + "epoch": 0.93, + "grad_norm": 0.5297255502773219, + "learning_rate": 7.217473093352745e-07, + "loss": 2.0471, + "step": 11998 + }, + { + "epoch": 0.93, + "grad_norm": 0.5999482569534146, + "learning_rate": 7.202579131053072e-07, + "loss": 1.8287, + "step": 11999 + }, + { + "epoch": 0.93, + "grad_norm": 0.5157188688486453, + "learning_rate": 7.187700327593749e-07, + "loss": 1.9667, + "step": 12000 + }, + { + "epoch": 0.93, + "grad_norm": 0.5399714605368815, + "learning_rate": 7.172836683903811e-07, + "loss": 1.8492, + "step": 12001 + }, + { + "epoch": 0.93, + "grad_norm": 0.5670168143656699, + "learning_rate": 7.157988200911291e-07, + "loss": 2.06, + "step": 12002 + }, + { + "epoch": 0.93, + "grad_norm": 0.5443737022249971, + "learning_rate": 7.143154879543118e-07, + "loss": 1.8194, + "step": 12003 + }, + { + "epoch": 0.93, + "grad_norm": 0.5778497121678692, + "learning_rate": 7.12833672072552e-07, + "loss": 1.88, + "step": 12004 + }, + { + "epoch": 0.93, + "grad_norm": 0.5742933519939378, + "learning_rate": 7.113533725383564e-07, + "loss": 1.8713, + "step": 12005 + }, + { + "epoch": 0.93, + "grad_norm": 0.4891852658328119, + "learning_rate": 7.098745894441511e-07, + "loss": 2.0514, + "step": 12006 + }, + { + "epoch": 0.93, + "grad_norm": 0.5432824662067035, + "learning_rate": 7.083973228822648e-07, + "loss": 1.8328, + "step": 12007 + }, + { + "epoch": 0.93, + "grad_norm": 0.5380265985228503, + "learning_rate": 7.069215729449186e-07, + "loss": 1.8358, + "step": 12008 + }, + { + "epoch": 0.93, + "grad_norm": 0.5668338340377961, + "learning_rate": 7.054473397242634e-07, + "loss": 1.8835, + "step": 12009 + }, + { + "epoch": 0.93, + "grad_norm": 0.5732653691430536, + "learning_rate": 7.03974623312334e-07, + "loss": 2.0706, + "step": 12010 + }, + { + "epoch": 0.93, + "grad_norm": 0.5487969385206273, + "learning_rate": 7.025034238010764e-07, + "loss": 1.8164, + "step": 12011 + }, + { + "epoch": 0.93, + "grad_norm": 0.5357722996604882, + "learning_rate": 7.010337412823531e-07, + "loss": 1.8167, + "step": 12012 + }, + { + "epoch": 0.93, + "grad_norm": 0.5136236043033853, + "learning_rate": 6.995655758479103e-07, + "loss": 1.923, + "step": 12013 + }, + { + "epoch": 0.93, + "grad_norm": 0.5157797273743138, + "learning_rate": 6.980989275894162e-07, + "loss": 2.0702, + "step": 12014 + }, + { + "epoch": 0.93, + "grad_norm": 0.5532182062532611, + "learning_rate": 6.966337965984476e-07, + "loss": 1.8432, + "step": 12015 + }, + { + "epoch": 0.93, + "grad_norm": 0.539683100410497, + "learning_rate": 6.951701829664675e-07, + "loss": 1.8699, + "step": 12016 + }, + { + "epoch": 0.93, + "grad_norm": 0.5603943534852046, + "learning_rate": 6.937080867848583e-07, + "loss": 1.8639, + "step": 12017 + }, + { + "epoch": 0.93, + "grad_norm": 0.559670081500926, + "learning_rate": 6.922475081449082e-07, + "loss": 2.06, + "step": 12018 + }, + { + "epoch": 0.93, + "grad_norm": 0.499788380330511, + "learning_rate": 6.907884471378023e-07, + "loss": 1.9303, + "step": 12019 + }, + { + "epoch": 0.93, + "grad_norm": 0.5373315425909709, + "learning_rate": 6.893309038546431e-07, + "loss": 1.8847, + "step": 12020 + }, + { + "epoch": 0.93, + "grad_norm": 0.5554722734027194, + "learning_rate": 6.878748783864242e-07, + "loss": 1.8624, + "step": 12021 + }, + { + "epoch": 0.93, + "grad_norm": 0.548067132090641, + "learning_rate": 6.864203708240507e-07, + "loss": 2.0301, + "step": 12022 + }, + { + "epoch": 0.93, + "grad_norm": 0.5509567734675266, + "learning_rate": 6.849673812583391e-07, + "loss": 1.8359, + "step": 12023 + }, + { + "epoch": 0.93, + "grad_norm": 0.5798341510670232, + "learning_rate": 6.835159097800026e-07, + "loss": 1.8936, + "step": 12024 + }, + { + "epoch": 0.93, + "grad_norm": 0.5129686984181393, + "learning_rate": 6.820659564796605e-07, + "loss": 1.9308, + "step": 12025 + }, + { + "epoch": 0.93, + "grad_norm": 0.5655841540613078, + "learning_rate": 6.806175214478433e-07, + "loss": 2.0763, + "step": 12026 + }, + { + "epoch": 0.93, + "grad_norm": 0.5525845501780215, + "learning_rate": 6.791706047749785e-07, + "loss": 1.8715, + "step": 12027 + }, + { + "epoch": 0.93, + "grad_norm": 0.5596378698238804, + "learning_rate": 6.777252065514078e-07, + "loss": 1.8456, + "step": 12028 + }, + { + "epoch": 0.93, + "grad_norm": 0.5615912898264964, + "learning_rate": 6.762813268673701e-07, + "loss": 1.8172, + "step": 12029 + }, + { + "epoch": 0.93, + "grad_norm": 0.532558233627528, + "learning_rate": 6.748389658130128e-07, + "loss": 2.054, + "step": 12030 + }, + { + "epoch": 0.93, + "grad_norm": 0.5270721388356946, + "learning_rate": 6.733981234783887e-07, + "loss": 1.9429, + "step": 12031 + }, + { + "epoch": 0.93, + "grad_norm": 0.5644681120940344, + "learning_rate": 6.719587999534565e-07, + "loss": 1.8606, + "step": 12032 + }, + { + "epoch": 0.93, + "grad_norm": 0.5283696269313857, + "learning_rate": 6.705209953280778e-07, + "loss": 1.8082, + "step": 12033 + }, + { + "epoch": 0.93, + "grad_norm": 0.5394656269774584, + "learning_rate": 6.690847096920222e-07, + "loss": 2.0301, + "step": 12034 + }, + { + "epoch": 0.93, + "grad_norm": 0.5517927351674337, + "learning_rate": 6.676499431349625e-07, + "loss": 1.911, + "step": 12035 + }, + { + "epoch": 0.93, + "grad_norm": 0.562096439790101, + "learning_rate": 6.662166957464772e-07, + "loss": 1.8715, + "step": 12036 + }, + { + "epoch": 0.93, + "grad_norm": 0.5552496653630097, + "learning_rate": 6.647849676160472e-07, + "loss": 1.8506, + "step": 12037 + }, + { + "epoch": 0.93, + "grad_norm": 0.49935203096553693, + "learning_rate": 6.633547588330652e-07, + "loss": 2.0622, + "step": 12038 + }, + { + "epoch": 0.93, + "grad_norm": 0.5599250506696739, + "learning_rate": 6.619260694868235e-07, + "loss": 1.8735, + "step": 12039 + }, + { + "epoch": 0.93, + "grad_norm": 0.5456614907798658, + "learning_rate": 6.604988996665173e-07, + "loss": 1.8643, + "step": 12040 + }, + { + "epoch": 0.93, + "grad_norm": 0.5536663243623563, + "learning_rate": 6.590732494612562e-07, + "loss": 1.8719, + "step": 12041 + }, + { + "epoch": 0.93, + "grad_norm": 0.5463697326516083, + "learning_rate": 6.576491189600465e-07, + "loss": 2.0502, + "step": 12042 + }, + { + "epoch": 0.93, + "grad_norm": 0.5562017834588785, + "learning_rate": 6.562265082518059e-07, + "loss": 1.8859, + "step": 12043 + }, + { + "epoch": 0.93, + "grad_norm": 0.5320660893448803, + "learning_rate": 6.54805417425347e-07, + "loss": 1.9479, + "step": 12044 + }, + { + "epoch": 0.93, + "grad_norm": 0.5448159752183019, + "learning_rate": 6.533858465694015e-07, + "loss": 1.856, + "step": 12045 + }, + { + "epoch": 0.93, + "grad_norm": 0.5157673866739633, + "learning_rate": 6.519677957725956e-07, + "loss": 2.0269, + "step": 12046 + }, + { + "epoch": 0.93, + "grad_norm": 0.5448171398172151, + "learning_rate": 6.505512651234613e-07, + "loss": 1.8996, + "step": 12047 + }, + { + "epoch": 0.93, + "grad_norm": 0.5646446400961729, + "learning_rate": 6.491362547104446e-07, + "loss": 1.8423, + "step": 12048 + }, + { + "epoch": 0.93, + "grad_norm": 0.5313713198139957, + "learning_rate": 6.477227646218887e-07, + "loss": 1.9068, + "step": 12049 + }, + { + "epoch": 0.93, + "grad_norm": 0.5110933774164131, + "learning_rate": 6.463107949460395e-07, + "loss": 2.136, + "step": 12050 + }, + { + "epoch": 0.93, + "grad_norm": 0.5560249762278169, + "learning_rate": 6.449003457710546e-07, + "loss": 1.8414, + "step": 12051 + }, + { + "epoch": 0.93, + "grad_norm": 0.5378572424290625, + "learning_rate": 6.434914171849965e-07, + "loss": 1.8422, + "step": 12052 + }, + { + "epoch": 0.93, + "grad_norm": 0.5481566825593707, + "learning_rate": 6.420840092758257e-07, + "loss": 1.8436, + "step": 12053 + }, + { + "epoch": 0.93, + "grad_norm": 0.5565134971772293, + "learning_rate": 6.40678122131419e-07, + "loss": 2.0849, + "step": 12054 + }, + { + "epoch": 0.93, + "grad_norm": 0.5413968469171467, + "learning_rate": 6.392737558395423e-07, + "loss": 1.8559, + "step": 12055 + }, + { + "epoch": 0.93, + "grad_norm": 0.5034657565372168, + "learning_rate": 6.378709104878839e-07, + "loss": 1.9399, + "step": 12056 + }, + { + "epoch": 0.93, + "grad_norm": 0.551262151656272, + "learning_rate": 6.364695861640318e-07, + "loss": 1.8214, + "step": 12057 + }, + { + "epoch": 0.93, + "grad_norm": 0.5337161582732907, + "learning_rate": 6.350697829554636e-07, + "loss": 2.0487, + "step": 12058 + }, + { + "epoch": 0.93, + "grad_norm": 0.5366577107713069, + "learning_rate": 6.33671500949587e-07, + "loss": 1.8142, + "step": 12059 + }, + { + "epoch": 0.93, + "grad_norm": 0.5456591215677576, + "learning_rate": 6.322747402336959e-07, + "loss": 1.8366, + "step": 12060 + }, + { + "epoch": 0.93, + "grad_norm": 0.5318763837532458, + "learning_rate": 6.308795008949958e-07, + "loss": 1.832, + "step": 12061 + }, + { + "epoch": 0.93, + "grad_norm": 0.5027404758948059, + "learning_rate": 6.294857830206058e-07, + "loss": 2.104, + "step": 12062 + }, + { + "epoch": 0.93, + "grad_norm": 0.5495700252760988, + "learning_rate": 6.280935866975313e-07, + "loss": 1.853, + "step": 12063 + }, + { + "epoch": 0.93, + "grad_norm": 0.5527659919622202, + "learning_rate": 6.267029120126944e-07, + "loss": 1.8391, + "step": 12064 + }, + { + "epoch": 0.93, + "grad_norm": 0.5389161929513338, + "learning_rate": 6.253137590529257e-07, + "loss": 1.7941, + "step": 12065 + }, + { + "epoch": 0.93, + "grad_norm": 0.5190827140714885, + "learning_rate": 6.239261279049529e-07, + "loss": 2.0275, + "step": 12066 + }, + { + "epoch": 0.93, + "grad_norm": 0.5547715258785623, + "learning_rate": 6.225400186554098e-07, + "loss": 1.8141, + "step": 12067 + }, + { + "epoch": 0.93, + "grad_norm": 0.540224719139451, + "learning_rate": 6.21155431390838e-07, + "loss": 1.8219, + "step": 12068 + }, + { + "epoch": 0.93, + "grad_norm": 0.5085356653522338, + "learning_rate": 6.197723661976824e-07, + "loss": 1.9397, + "step": 12069 + }, + { + "epoch": 0.93, + "grad_norm": 0.546295112903078, + "learning_rate": 6.183908231622986e-07, + "loss": 2.073, + "step": 12070 + }, + { + "epoch": 0.93, + "grad_norm": 0.5262887184138038, + "learning_rate": 6.170108023709348e-07, + "loss": 1.8918, + "step": 12071 + }, + { + "epoch": 0.93, + "grad_norm": 0.5632086587309858, + "learning_rate": 6.15632303909755e-07, + "loss": 1.8636, + "step": 12072 + }, + { + "epoch": 0.93, + "grad_norm": 0.5524100704268221, + "learning_rate": 6.142553278648239e-07, + "loss": 1.852, + "step": 12073 + }, + { + "epoch": 0.93, + "grad_norm": 0.5279465094978973, + "learning_rate": 6.128798743221143e-07, + "loss": 2.1131, + "step": 12074 + }, + { + "epoch": 0.93, + "grad_norm": 0.5096687476904901, + "learning_rate": 6.115059433674963e-07, + "loss": 1.9601, + "step": 12075 + }, + { + "epoch": 0.93, + "grad_norm": 0.5510228145494964, + "learning_rate": 6.101335350867515e-07, + "loss": 1.8322, + "step": 12076 + }, + { + "epoch": 0.93, + "grad_norm": 0.5529306529549411, + "learning_rate": 6.087626495655696e-07, + "loss": 1.8869, + "step": 12077 + }, + { + "epoch": 0.93, + "grad_norm": 0.5379775757040772, + "learning_rate": 6.073932868895349e-07, + "loss": 2.0862, + "step": 12078 + }, + { + "epoch": 0.93, + "grad_norm": 0.5583679009194233, + "learning_rate": 6.060254471441456e-07, + "loss": 1.8529, + "step": 12079 + }, + { + "epoch": 0.93, + "grad_norm": 0.5473126156099183, + "learning_rate": 6.046591304148003e-07, + "loss": 1.8528, + "step": 12080 + }, + { + "epoch": 0.93, + "grad_norm": 0.5186878496649145, + "learning_rate": 6.032943367868027e-07, + "loss": 1.9441, + "step": 12081 + }, + { + "epoch": 0.93, + "grad_norm": 0.5389580000196879, + "learning_rate": 6.019310663453653e-07, + "loss": 2.0365, + "step": 12082 + }, + { + "epoch": 0.93, + "grad_norm": 0.6111879803525038, + "learning_rate": 6.005693191756006e-07, + "loss": 1.8427, + "step": 12083 + }, + { + "epoch": 0.93, + "grad_norm": 0.5498179434010276, + "learning_rate": 5.992090953625295e-07, + "loss": 1.8574, + "step": 12084 + }, + { + "epoch": 0.93, + "grad_norm": 0.5493380725347365, + "learning_rate": 5.978503949910757e-07, + "loss": 1.8433, + "step": 12085 + }, + { + "epoch": 0.93, + "grad_norm": 0.519289108908545, + "learning_rate": 5.964932181460658e-07, + "loss": 1.9947, + "step": 12086 + }, + { + "epoch": 0.93, + "grad_norm": 0.5216815187795476, + "learning_rate": 5.951375649122404e-07, + "loss": 1.9278, + "step": 12087 + }, + { + "epoch": 0.93, + "grad_norm": 0.5695668700667512, + "learning_rate": 5.937834353742316e-07, + "loss": 1.8564, + "step": 12088 + }, + { + "epoch": 0.93, + "grad_norm": 0.5295866250049729, + "learning_rate": 5.924308296165859e-07, + "loss": 1.7958, + "step": 12089 + }, + { + "epoch": 0.93, + "grad_norm": 0.5129824341499704, + "learning_rate": 5.910797477237551e-07, + "loss": 2.0275, + "step": 12090 + }, + { + "epoch": 0.93, + "grad_norm": 0.5554984307209867, + "learning_rate": 5.897301897800883e-07, + "loss": 1.8665, + "step": 12091 + }, + { + "epoch": 0.93, + "grad_norm": 0.5217822294880273, + "learning_rate": 5.883821558698432e-07, + "loss": 1.8822, + "step": 12092 + }, + { + "epoch": 0.93, + "grad_norm": 0.5038629141065102, + "learning_rate": 5.870356460771915e-07, + "loss": 1.9113, + "step": 12093 + }, + { + "epoch": 0.93, + "grad_norm": 0.5222649185787611, + "learning_rate": 5.856906604861906e-07, + "loss": 2.0156, + "step": 12094 + }, + { + "epoch": 0.93, + "grad_norm": 0.547598621198585, + "learning_rate": 5.84347199180818e-07, + "loss": 1.8406, + "step": 12095 + }, + { + "epoch": 0.93, + "grad_norm": 0.5473717519739777, + "learning_rate": 5.830052622449566e-07, + "loss": 1.8839, + "step": 12096 + }, + { + "epoch": 0.93, + "grad_norm": 0.5436525485233353, + "learning_rate": 5.816648497623783e-07, + "loss": 1.8307, + "step": 12097 + }, + { + "epoch": 0.93, + "grad_norm": 0.5383867410736927, + "learning_rate": 5.8032596181678e-07, + "loss": 2.0667, + "step": 12098 + }, + { + "epoch": 0.93, + "grad_norm": 0.5630590629347338, + "learning_rate": 5.789885984917504e-07, + "loss": 1.864, + "step": 12099 + }, + { + "epoch": 0.93, + "grad_norm": 0.5056727428791555, + "learning_rate": 5.776527598707837e-07, + "loss": 1.9184, + "step": 12100 + }, + { + "epoch": 0.93, + "grad_norm": 0.5333257597445358, + "learning_rate": 5.763184460372884e-07, + "loss": 1.8192, + "step": 12101 + }, + { + "epoch": 0.93, + "grad_norm": 0.5225802417729573, + "learning_rate": 5.749856570745643e-07, + "loss": 2.0243, + "step": 12102 + }, + { + "epoch": 0.93, + "grad_norm": 0.5792814458973582, + "learning_rate": 5.736543930658284e-07, + "loss": 1.8291, + "step": 12103 + }, + { + "epoch": 0.93, + "grad_norm": 0.5608060018255706, + "learning_rate": 5.723246540941946e-07, + "loss": 1.8407, + "step": 12104 + }, + { + "epoch": 0.93, + "grad_norm": 0.5602728477966785, + "learning_rate": 5.709964402426826e-07, + "loss": 1.8661, + "step": 12105 + }, + { + "epoch": 0.93, + "grad_norm": 0.5064918229671801, + "learning_rate": 5.696697515942235e-07, + "loss": 1.8765, + "step": 12106 + }, + { + "epoch": 0.93, + "grad_norm": 0.5194614451807128, + "learning_rate": 5.683445882316396e-07, + "loss": 2.004, + "step": 12107 + }, + { + "epoch": 0.93, + "grad_norm": 0.57016010923494, + "learning_rate": 5.670209502376706e-07, + "loss": 1.8499, + "step": 12108 + }, + { + "epoch": 0.93, + "grad_norm": 0.54206056839908, + "learning_rate": 5.656988376949613e-07, + "loss": 1.8657, + "step": 12109 + }, + { + "epoch": 0.93, + "grad_norm": 0.5465392592248804, + "learning_rate": 5.643782506860484e-07, + "loss": 2.0576, + "step": 12110 + }, + { + "epoch": 0.93, + "grad_norm": 0.5509752857808684, + "learning_rate": 5.630591892933857e-07, + "loss": 1.8976, + "step": 12111 + }, + { + "epoch": 0.93, + "grad_norm": 0.4984076012332594, + "learning_rate": 5.617416535993291e-07, + "loss": 1.8952, + "step": 12112 + }, + { + "epoch": 0.93, + "grad_norm": 0.5809744286810843, + "learning_rate": 5.604256436861355e-07, + "loss": 1.8262, + "step": 12113 + }, + { + "epoch": 0.93, + "grad_norm": 0.5374952332161287, + "learning_rate": 5.591111596359666e-07, + "loss": 2.014, + "step": 12114 + }, + { + "epoch": 0.93, + "grad_norm": 0.5332028291482898, + "learning_rate": 5.577982015308958e-07, + "loss": 1.8591, + "step": 12115 + }, + { + "epoch": 0.93, + "grad_norm": 0.5585024938748564, + "learning_rate": 5.564867694528935e-07, + "loss": 1.8533, + "step": 12116 + }, + { + "epoch": 0.93, + "grad_norm": 0.5560685959676128, + "learning_rate": 5.551768634838417e-07, + "loss": 1.8669, + "step": 12117 + }, + { + "epoch": 0.93, + "grad_norm": 0.5440056492524306, + "learning_rate": 5.538684837055163e-07, + "loss": 1.9523, + "step": 12118 + }, + { + "epoch": 0.93, + "grad_norm": 0.5121292914098255, + "learning_rate": 5.525616301996078e-07, + "loss": 2.0361, + "step": 12119 + }, + { + "epoch": 0.94, + "grad_norm": 0.5462760916854598, + "learning_rate": 5.512563030477146e-07, + "loss": 1.821, + "step": 12120 + }, + { + "epoch": 0.94, + "grad_norm": 0.5418468983622584, + "learning_rate": 5.499525023313273e-07, + "loss": 1.8454, + "step": 12121 + }, + { + "epoch": 0.94, + "grad_norm": 0.5346413962763046, + "learning_rate": 5.486502281318473e-07, + "loss": 2.0669, + "step": 12122 + }, + { + "epoch": 0.94, + "grad_norm": 0.5398009281824697, + "learning_rate": 5.473494805305818e-07, + "loss": 1.8483, + "step": 12123 + }, + { + "epoch": 0.94, + "grad_norm": 0.5003600787040667, + "learning_rate": 5.460502596087408e-07, + "loss": 1.9661, + "step": 12124 + }, + { + "epoch": 0.94, + "grad_norm": 0.5622621099640973, + "learning_rate": 5.447525654474428e-07, + "loss": 1.8474, + "step": 12125 + }, + { + "epoch": 0.94, + "grad_norm": 0.5426874842493473, + "learning_rate": 5.434563981277063e-07, + "loss": 1.8621, + "step": 12126 + }, + { + "epoch": 0.94, + "grad_norm": 0.5202740840655923, + "learning_rate": 5.421617577304583e-07, + "loss": 2.0296, + "step": 12127 + }, + { + "epoch": 0.94, + "grad_norm": 0.5584009958630821, + "learning_rate": 5.408686443365257e-07, + "loss": 1.8683, + "step": 12128 + }, + { + "epoch": 0.94, + "grad_norm": 0.5326150135490557, + "learning_rate": 5.39577058026644e-07, + "loss": 1.8318, + "step": 12129 + }, + { + "epoch": 0.94, + "grad_norm": 0.5064788744843992, + "learning_rate": 5.382869988814543e-07, + "loss": 2.0582, + "step": 12130 + }, + { + "epoch": 0.94, + "grad_norm": 0.5045057071303688, + "learning_rate": 5.369984669814976e-07, + "loss": 1.9598, + "step": 12131 + }, + { + "epoch": 0.94, + "grad_norm": 0.563094415580324, + "learning_rate": 5.357114624072235e-07, + "loss": 1.8792, + "step": 12132 + }, + { + "epoch": 0.94, + "grad_norm": 0.5425507622075342, + "learning_rate": 5.344259852389844e-07, + "loss": 1.8703, + "step": 12133 + }, + { + "epoch": 0.94, + "grad_norm": 0.5453864811909958, + "learning_rate": 5.331420355570382e-07, + "loss": 2.0554, + "step": 12134 + }, + { + "epoch": 0.94, + "grad_norm": 0.5407871471121478, + "learning_rate": 5.31859613441546e-07, + "loss": 1.8406, + "step": 12135 + }, + { + "epoch": 0.94, + "grad_norm": 0.5592319995955283, + "learning_rate": 5.305787189725769e-07, + "loss": 1.8804, + "step": 12136 + }, + { + "epoch": 0.94, + "grad_norm": 0.5074730849703841, + "learning_rate": 5.292993522301005e-07, + "loss": 1.9213, + "step": 12137 + }, + { + "epoch": 0.94, + "grad_norm": 0.5455377662819488, + "learning_rate": 5.280215132939942e-07, + "loss": 1.8096, + "step": 12138 + }, + { + "epoch": 0.94, + "grad_norm": 0.5436737766226606, + "learning_rate": 5.267452022440389e-07, + "loss": 2.0154, + "step": 12139 + }, + { + "epoch": 0.94, + "grad_norm": 0.5421312487172725, + "learning_rate": 5.254704191599208e-07, + "loss": 1.829, + "step": 12140 + }, + { + "epoch": 0.94, + "grad_norm": 0.5774860151621288, + "learning_rate": 5.241971641212234e-07, + "loss": 1.8712, + "step": 12141 + }, + { + "epoch": 0.94, + "grad_norm": 0.5295691503089527, + "learning_rate": 5.229254372074471e-07, + "loss": 2.0486, + "step": 12142 + }, + { + "epoch": 0.94, + "grad_norm": 0.5112865990683462, + "learning_rate": 5.21655238497995e-07, + "loss": 1.9115, + "step": 12143 + }, + { + "epoch": 0.94, + "grad_norm": 0.5491790652722256, + "learning_rate": 5.203865680721593e-07, + "loss": 1.8245, + "step": 12144 + }, + { + "epoch": 0.94, + "grad_norm": 0.5339669367896346, + "learning_rate": 5.191194260091597e-07, + "loss": 1.8649, + "step": 12145 + }, + { + "epoch": 0.94, + "grad_norm": 0.5189749243388174, + "learning_rate": 5.178538123881055e-07, + "loss": 1.9766, + "step": 12146 + }, + { + "epoch": 0.94, + "grad_norm": 0.5475449812726021, + "learning_rate": 5.165897272880082e-07, + "loss": 1.8079, + "step": 12147 + }, + { + "epoch": 0.94, + "grad_norm": 0.5409235159761037, + "learning_rate": 5.153271707877994e-07, + "loss": 1.8517, + "step": 12148 + }, + { + "epoch": 0.94, + "grad_norm": 0.5255110994406206, + "learning_rate": 5.140661429662963e-07, + "loss": 1.9023, + "step": 12149 + }, + { + "epoch": 0.94, + "grad_norm": 0.5399704780944399, + "learning_rate": 5.128066439022361e-07, + "loss": 1.8641, + "step": 12150 + }, + { + "epoch": 0.94, + "grad_norm": 0.5615741487356377, + "learning_rate": 5.115486736742559e-07, + "loss": 2.0317, + "step": 12151 + }, + { + "epoch": 0.94, + "grad_norm": 0.572540203570783, + "learning_rate": 5.102922323608872e-07, + "loss": 1.856, + "step": 12152 + }, + { + "epoch": 0.94, + "grad_norm": 0.5576647822411424, + "learning_rate": 5.090373200405868e-07, + "loss": 1.8761, + "step": 12153 + }, + { + "epoch": 0.94, + "grad_norm": 0.5357490382229382, + "learning_rate": 5.077839367916948e-07, + "loss": 2.0908, + "step": 12154 + }, + { + "epoch": 0.94, + "grad_norm": 0.48986963106455633, + "learning_rate": 5.06532082692468e-07, + "loss": 1.9065, + "step": 12155 + }, + { + "epoch": 0.94, + "grad_norm": 0.5482354454489646, + "learning_rate": 5.052817578210661e-07, + "loss": 1.8325, + "step": 12156 + }, + { + "epoch": 0.94, + "grad_norm": 0.5298763947650685, + "learning_rate": 5.040329622555517e-07, + "loss": 1.8998, + "step": 12157 + }, + { + "epoch": 0.94, + "grad_norm": 0.5187499227749742, + "learning_rate": 5.027856960738875e-07, + "loss": 1.8549, + "step": 12158 + }, + { + "epoch": 0.94, + "grad_norm": 0.5404049222300696, + "learning_rate": 5.015399593539555e-07, + "loss": 2.0579, + "step": 12159 + }, + { + "epoch": 0.94, + "grad_norm": 0.5470673906702084, + "learning_rate": 5.002957521735213e-07, + "loss": 1.8497, + "step": 12160 + }, + { + "epoch": 0.94, + "grad_norm": 0.5641814186955232, + "learning_rate": 4.990530746102729e-07, + "loss": 1.8228, + "step": 12161 + }, + { + "epoch": 0.94, + "grad_norm": 0.5319470538022956, + "learning_rate": 4.978119267417925e-07, + "loss": 1.9395, + "step": 12162 + }, + { + "epoch": 0.94, + "grad_norm": 0.5422591456966249, + "learning_rate": 4.965723086455709e-07, + "loss": 2.0241, + "step": 12163 + }, + { + "epoch": 0.94, + "grad_norm": 0.5647452775183375, + "learning_rate": 4.95334220399002e-07, + "loss": 1.8166, + "step": 12164 + }, + { + "epoch": 0.94, + "grad_norm": 0.5504909963979242, + "learning_rate": 4.940976620793875e-07, + "loss": 1.823, + "step": 12165 + }, + { + "epoch": 0.94, + "grad_norm": 0.5185672040985381, + "learning_rate": 4.928626337639241e-07, + "loss": 2.0392, + "step": 12166 + }, + { + "epoch": 0.94, + "grad_norm": 0.5586598039786779, + "learning_rate": 4.916291355297309e-07, + "loss": 1.8082, + "step": 12167 + }, + { + "epoch": 0.94, + "grad_norm": 0.5266276729778374, + "learning_rate": 4.9039716745381e-07, + "loss": 1.9233, + "step": 12168 + }, + { + "epoch": 0.94, + "grad_norm": 0.5730368155960067, + "learning_rate": 4.891667296130802e-07, + "loss": 1.8899, + "step": 12169 + }, + { + "epoch": 0.94, + "grad_norm": 0.5512821852406627, + "learning_rate": 4.879378220843667e-07, + "loss": 1.8265, + "step": 12170 + }, + { + "epoch": 0.94, + "grad_norm": 0.5144916815875704, + "learning_rate": 4.867104449443938e-07, + "loss": 2.0208, + "step": 12171 + }, + { + "epoch": 0.94, + "grad_norm": 0.524171546956164, + "learning_rate": 4.854845982697892e-07, + "loss": 1.8305, + "step": 12172 + }, + { + "epoch": 0.94, + "grad_norm": 0.5479290579506326, + "learning_rate": 4.842602821370917e-07, + "loss": 1.8427, + "step": 12173 + }, + { + "epoch": 0.94, + "grad_norm": 0.4999135790266691, + "learning_rate": 4.830374966227346e-07, + "loss": 1.9056, + "step": 12174 + }, + { + "epoch": 0.94, + "grad_norm": 0.5423823762111387, + "learning_rate": 4.81816241803068e-07, + "loss": 2.0381, + "step": 12175 + }, + { + "epoch": 0.94, + "grad_norm": 0.5362492005951296, + "learning_rate": 4.805965177543365e-07, + "loss": 1.8068, + "step": 12176 + }, + { + "epoch": 0.94, + "grad_norm": 0.5421371683272366, + "learning_rate": 4.7937832455269e-07, + "loss": 1.8633, + "step": 12177 + }, + { + "epoch": 0.94, + "grad_norm": 0.5657536872965847, + "learning_rate": 4.781616622741903e-07, + "loss": 2.0864, + "step": 12178 + }, + { + "epoch": 0.94, + "grad_norm": 0.5266896015864112, + "learning_rate": 4.769465309947957e-07, + "loss": 1.8672, + "step": 12179 + }, + { + "epoch": 0.94, + "grad_norm": 0.5122677548849798, + "learning_rate": 4.7573293079037076e-07, + "loss": 1.8986, + "step": 12180 + }, + { + "epoch": 0.94, + "grad_norm": 0.5413571229997403, + "learning_rate": 4.745208617366881e-07, + "loss": 1.8546, + "step": 12181 + }, + { + "epoch": 0.94, + "grad_norm": 0.5809757271057946, + "learning_rate": 4.7331032390942333e-07, + "loss": 1.8566, + "step": 12182 + }, + { + "epoch": 0.94, + "grad_norm": 0.5157749460031795, + "learning_rate": 4.721013173841493e-07, + "loss": 2.0547, + "step": 12183 + }, + { + "epoch": 0.94, + "grad_norm": 0.5520335341724514, + "learning_rate": 4.7089384223635567e-07, + "loss": 1.7968, + "step": 12184 + }, + { + "epoch": 0.94, + "grad_norm": 0.564075199584124, + "learning_rate": 4.696878985414266e-07, + "loss": 1.8278, + "step": 12185 + }, + { + "epoch": 0.94, + "grad_norm": 0.5569996639026346, + "learning_rate": 4.6848348637465735e-07, + "loss": 2.0797, + "step": 12186 + }, + { + "epoch": 0.94, + "grad_norm": 0.5191816331003827, + "learning_rate": 4.672806058112406e-07, + "loss": 1.9974, + "step": 12187 + }, + { + "epoch": 0.94, + "grad_norm": 0.5765193637109733, + "learning_rate": 4.660792569262773e-07, + "loss": 1.8405, + "step": 12188 + }, + { + "epoch": 0.94, + "grad_norm": 0.5386959376417068, + "learning_rate": 4.6487943979477424e-07, + "loss": 1.8342, + "step": 12189 + }, + { + "epoch": 0.94, + "grad_norm": 0.5510767576306024, + "learning_rate": 4.6368115449164364e-07, + "loss": 1.8199, + "step": 12190 + }, + { + "epoch": 0.94, + "grad_norm": 0.5587408109876917, + "learning_rate": 4.6248440109169235e-07, + "loss": 2.0511, + "step": 12191 + }, + { + "epoch": 0.94, + "grad_norm": 0.5524100859254958, + "learning_rate": 4.6128917966964393e-07, + "loss": 1.8395, + "step": 12192 + }, + { + "epoch": 0.94, + "grad_norm": 0.5241241427212794, + "learning_rate": 4.60095490300122e-07, + "loss": 1.9507, + "step": 12193 + }, + { + "epoch": 0.94, + "grad_norm": 0.537602196274474, + "learning_rate": 4.5890333305764475e-07, + "loss": 1.913, + "step": 12194 + }, + { + "epoch": 0.94, + "grad_norm": 0.5211663766292899, + "learning_rate": 4.5771270801665824e-07, + "loss": 2.0403, + "step": 12195 + }, + { + "epoch": 0.94, + "grad_norm": 0.5478911134978154, + "learning_rate": 4.5652361525148345e-07, + "loss": 1.901, + "step": 12196 + }, + { + "epoch": 0.94, + "grad_norm": 0.548952006633524, + "learning_rate": 4.5533605483636666e-07, + "loss": 1.8567, + "step": 12197 + }, + { + "epoch": 0.94, + "grad_norm": 0.5539852273587849, + "learning_rate": 4.5415002684545404e-07, + "loss": 2.0462, + "step": 12198 + }, + { + "epoch": 0.94, + "grad_norm": 0.5013202601978779, + "learning_rate": 4.52965531352792e-07, + "loss": 1.9344, + "step": 12199 + }, + { + "epoch": 0.94, + "grad_norm": 0.5392221144311241, + "learning_rate": 4.517825684323324e-07, + "loss": 1.8405, + "step": 12200 + }, + { + "epoch": 0.94, + "grad_norm": 0.5577884990522757, + "learning_rate": 4.5060113815793556e-07, + "loss": 1.8669, + "step": 12201 + }, + { + "epoch": 0.94, + "grad_norm": 0.5391962976494958, + "learning_rate": 4.494212406033593e-07, + "loss": 1.799, + "step": 12202 + }, + { + "epoch": 0.94, + "grad_norm": 0.555755275843181, + "learning_rate": 4.482428758422724e-07, + "loss": 2.0166, + "step": 12203 + }, + { + "epoch": 0.94, + "grad_norm": 0.5518085292901486, + "learning_rate": 4.470660439482438e-07, + "loss": 1.8602, + "step": 12204 + }, + { + "epoch": 0.94, + "grad_norm": 0.5230656526445953, + "learning_rate": 4.4589074499474525e-07, + "loss": 1.9116, + "step": 12205 + }, + { + "epoch": 0.94, + "grad_norm": 0.568985452113987, + "learning_rate": 4.447169790551625e-07, + "loss": 1.8377, + "step": 12206 + }, + { + "epoch": 0.94, + "grad_norm": 0.5314224309406822, + "learning_rate": 4.435447462027731e-07, + "loss": 2.0176, + "step": 12207 + }, + { + "epoch": 0.94, + "grad_norm": 0.5590341387344524, + "learning_rate": 4.4237404651076565e-07, + "loss": 1.8626, + "step": 12208 + }, + { + "epoch": 0.94, + "grad_norm": 0.5551050586448392, + "learning_rate": 4.4120488005223173e-07, + "loss": 1.8873, + "step": 12209 + }, + { + "epoch": 0.94, + "grad_norm": 0.5603297600317924, + "learning_rate": 4.4003724690016566e-07, + "loss": 1.846, + "step": 12210 + }, + { + "epoch": 0.94, + "grad_norm": 0.49990366928934526, + "learning_rate": 4.3887114712747035e-07, + "loss": 2.0786, + "step": 12211 + }, + { + "epoch": 0.94, + "grad_norm": 0.5505149694964356, + "learning_rate": 4.377065808069486e-07, + "loss": 1.8472, + "step": 12212 + }, + { + "epoch": 0.94, + "grad_norm": 0.5415482672162248, + "learning_rate": 4.365435480113089e-07, + "loss": 1.8593, + "step": 12213 + }, + { + "epoch": 0.94, + "grad_norm": 0.5403853125018282, + "learning_rate": 4.3538204881316824e-07, + "loss": 1.8149, + "step": 12214 + }, + { + "epoch": 0.94, + "grad_norm": 0.5257227583946817, + "learning_rate": 4.3422208328503523e-07, + "loss": 2.0478, + "step": 12215 + }, + { + "epoch": 0.94, + "grad_norm": 0.559025423293466, + "learning_rate": 4.330636514993408e-07, + "loss": 1.8964, + "step": 12216 + }, + { + "epoch": 0.94, + "grad_norm": 0.5637217293375276, + "learning_rate": 4.3190675352840216e-07, + "loss": 1.8179, + "step": 12217 + }, + { + "epoch": 0.94, + "grad_norm": 0.5104559733500768, + "learning_rate": 4.3075138944445595e-07, + "loss": 1.9275, + "step": 12218 + }, + { + "epoch": 0.94, + "grad_norm": 0.5289990924999646, + "learning_rate": 4.2959755931963055e-07, + "loss": 2.0687, + "step": 12219 + }, + { + "epoch": 0.94, + "grad_norm": 0.5617302931137816, + "learning_rate": 4.284452632259711e-07, + "loss": 1.8438, + "step": 12220 + }, + { + "epoch": 0.94, + "grad_norm": 0.5614950693369076, + "learning_rate": 4.272945012354146e-07, + "loss": 1.8278, + "step": 12221 + }, + { + "epoch": 0.94, + "grad_norm": 0.5690435567735518, + "learning_rate": 4.2614527341980906e-07, + "loss": 1.8742, + "step": 12222 + }, + { + "epoch": 0.94, + "grad_norm": 0.5391416957886751, + "learning_rate": 4.249975798509054e-07, + "loss": 2.0235, + "step": 12223 + }, + { + "epoch": 0.94, + "grad_norm": 0.49598629779309555, + "learning_rate": 4.238514206003602e-07, + "loss": 1.9298, + "step": 12224 + }, + { + "epoch": 0.94, + "grad_norm": 0.5598855292605013, + "learning_rate": 4.2270679573973296e-07, + "loss": 1.8675, + "step": 12225 + }, + { + "epoch": 0.94, + "grad_norm": 0.5377604720596268, + "learning_rate": 4.215637053404858e-07, + "loss": 1.8927, + "step": 12226 + }, + { + "epoch": 0.94, + "grad_norm": 0.5243668154673904, + "learning_rate": 4.2042214947398684e-07, + "loss": 2.081, + "step": 12227 + }, + { + "epoch": 0.94, + "grad_norm": 0.5512437084153002, + "learning_rate": 4.192821282115095e-07, + "loss": 1.9157, + "step": 12228 + }, + { + "epoch": 0.94, + "grad_norm": 0.5310905822238625, + "learning_rate": 4.181436416242301e-07, + "loss": 1.8376, + "step": 12229 + }, + { + "epoch": 0.94, + "grad_norm": 0.5016822173488662, + "learning_rate": 4.1700668978322533e-07, + "loss": 1.9272, + "step": 12230 + }, + { + "epoch": 0.94, + "grad_norm": 0.5835360070773158, + "learning_rate": 4.1587127275948266e-07, + "loss": 2.0966, + "step": 12231 + }, + { + "epoch": 0.94, + "grad_norm": 0.5636751502306065, + "learning_rate": 4.1473739062389273e-07, + "loss": 1.8681, + "step": 12232 + }, + { + "epoch": 0.94, + "grad_norm": 0.5389633755504954, + "learning_rate": 4.1360504344724327e-07, + "loss": 1.8612, + "step": 12233 + }, + { + "epoch": 0.94, + "grad_norm": 0.5591933203759875, + "learning_rate": 4.124742313002361e-07, + "loss": 1.8127, + "step": 12234 + }, + { + "epoch": 0.94, + "grad_norm": 0.58168621871857, + "learning_rate": 4.113449542534703e-07, + "loss": 2.0429, + "step": 12235 + }, + { + "epoch": 0.94, + "grad_norm": 0.5028949709261705, + "learning_rate": 4.1021721237745337e-07, + "loss": 1.9433, + "step": 12236 + }, + { + "epoch": 0.94, + "grad_norm": 0.5713537806442365, + "learning_rate": 4.0909100574259284e-07, + "loss": 1.8276, + "step": 12237 + }, + { + "epoch": 0.94, + "grad_norm": 0.5685058728049907, + "learning_rate": 4.0796633441920196e-07, + "loss": 1.8422, + "step": 12238 + }, + { + "epoch": 0.94, + "grad_norm": 0.5159785340553931, + "learning_rate": 4.0684319847749954e-07, + "loss": 2.0439, + "step": 12239 + }, + { + "epoch": 0.94, + "grad_norm": 0.5789110531664214, + "learning_rate": 4.057215979876072e-07, + "loss": 1.8515, + "step": 12240 + }, + { + "epoch": 0.94, + "grad_norm": 0.5468434558351187, + "learning_rate": 4.0460153301954964e-07, + "loss": 1.8555, + "step": 12241 + }, + { + "epoch": 0.94, + "grad_norm": 0.520424528174259, + "learning_rate": 4.034830036432652e-07, + "loss": 1.8924, + "step": 12242 + }, + { + "epoch": 0.94, + "grad_norm": 0.5177848054482475, + "learning_rate": 4.0236600992857585e-07, + "loss": 1.993, + "step": 12243 + }, + { + "epoch": 0.94, + "grad_norm": 0.5410933823546056, + "learning_rate": 4.012505519452259e-07, + "loss": 1.86, + "step": 12244 + }, + { + "epoch": 0.94, + "grad_norm": 0.5461110229319871, + "learning_rate": 4.0013662976286226e-07, + "loss": 1.8738, + "step": 12245 + }, + { + "epoch": 0.94, + "grad_norm": 0.5509420023329147, + "learning_rate": 3.9902424345102384e-07, + "loss": 1.8659, + "step": 12246 + }, + { + "epoch": 0.94, + "grad_norm": 0.5303491835729459, + "learning_rate": 3.979133930791662e-07, + "loss": 1.9885, + "step": 12247 + }, + { + "epoch": 0.94, + "grad_norm": 0.554539817806879, + "learning_rate": 3.9680407871664496e-07, + "loss": 1.8653, + "step": 12248 + }, + { + "epoch": 0.94, + "grad_norm": 0.5142982438360112, + "learning_rate": 3.956963004327158e-07, + "loss": 1.9383, + "step": 12249 + }, + { + "epoch": 0.95, + "grad_norm": 0.5693899343743858, + "learning_rate": 3.945900582965456e-07, + "loss": 1.8727, + "step": 12250 + }, + { + "epoch": 0.95, + "grad_norm": 0.5378790521144408, + "learning_rate": 3.9348535237719863e-07, + "loss": 2.0658, + "step": 12251 + }, + { + "epoch": 0.95, + "grad_norm": 0.54164503752832, + "learning_rate": 3.923821827436447e-07, + "loss": 1.8345, + "step": 12252 + }, + { + "epoch": 0.95, + "grad_norm": 0.5369444453511621, + "learning_rate": 3.912805494647648e-07, + "loss": 1.817, + "step": 12253 + }, + { + "epoch": 0.95, + "grad_norm": 0.5482704559172112, + "learning_rate": 3.9018045260933445e-07, + "loss": 1.8346, + "step": 12254 + }, + { + "epoch": 0.95, + "grad_norm": 0.529781017295403, + "learning_rate": 3.8908189224603763e-07, + "loss": 2.069, + "step": 12255 + }, + { + "epoch": 0.95, + "grad_norm": 0.5509203384048785, + "learning_rate": 3.8798486844346393e-07, + "loss": 1.8348, + "step": 12256 + }, + { + "epoch": 0.95, + "grad_norm": 0.5495009492182514, + "learning_rate": 3.868893812701002e-07, + "loss": 1.911, + "step": 12257 + }, + { + "epoch": 0.95, + "grad_norm": 0.5653906395793388, + "learning_rate": 3.857954307943501e-07, + "loss": 1.9123, + "step": 12258 + }, + { + "epoch": 0.95, + "grad_norm": 0.5340609782581943, + "learning_rate": 3.847030170845062e-07, + "loss": 2.0473, + "step": 12259 + }, + { + "epoch": 0.95, + "grad_norm": 0.5434235608959901, + "learning_rate": 3.83612140208775e-07, + "loss": 1.8683, + "step": 12260 + }, + { + "epoch": 0.95, + "grad_norm": 0.5271745985071611, + "learning_rate": 3.82522800235266e-07, + "loss": 1.9213, + "step": 12261 + }, + { + "epoch": 0.95, + "grad_norm": 0.5295137234310723, + "learning_rate": 3.8143499723198863e-07, + "loss": 1.8267, + "step": 12262 + }, + { + "epoch": 0.95, + "grad_norm": 0.5313218753984338, + "learning_rate": 3.803487312668608e-07, + "loss": 2.0484, + "step": 12263 + }, + { + "epoch": 0.95, + "grad_norm": 0.5771844660821385, + "learning_rate": 3.7926400240770044e-07, + "loss": 1.882, + "step": 12264 + }, + { + "epoch": 0.95, + "grad_norm": 0.5523301716143051, + "learning_rate": 3.78180810722234e-07, + "loss": 1.8582, + "step": 12265 + }, + { + "epoch": 0.95, + "grad_norm": 0.5394392939218419, + "learning_rate": 3.7709915627808787e-07, + "loss": 1.8433, + "step": 12266 + }, + { + "epoch": 0.95, + "grad_norm": 0.4899183774861811, + "learning_rate": 3.7601903914279425e-07, + "loss": 2.0708, + "step": 12267 + }, + { + "epoch": 0.95, + "grad_norm": 0.5519238246461585, + "learning_rate": 3.7494045938379075e-07, + "loss": 1.8505, + "step": 12268 + }, + { + "epoch": 0.95, + "grad_norm": 0.5627441240378865, + "learning_rate": 3.7386341706841797e-07, + "loss": 1.877, + "step": 12269 + }, + { + "epoch": 0.95, + "grad_norm": 0.5594633713796201, + "learning_rate": 3.727879122639166e-07, + "loss": 1.857, + "step": 12270 + }, + { + "epoch": 0.95, + "grad_norm": 0.5301094687640432, + "learning_rate": 3.717139450374385e-07, + "loss": 2.0582, + "step": 12271 + }, + { + "epoch": 0.95, + "grad_norm": 0.5698654002278227, + "learning_rate": 3.706415154560328e-07, + "loss": 1.8756, + "step": 12272 + }, + { + "epoch": 0.95, + "grad_norm": 0.5221582840636342, + "learning_rate": 3.695706235866597e-07, + "loss": 1.9205, + "step": 12273 + }, + { + "epoch": 0.95, + "grad_norm": 0.5465714589680125, + "learning_rate": 3.6850126949617427e-07, + "loss": 1.834, + "step": 12274 + }, + { + "epoch": 0.95, + "grad_norm": 0.5080209074639347, + "learning_rate": 3.6743345325134515e-07, + "loss": 2.0151, + "step": 12275 + }, + { + "epoch": 0.95, + "grad_norm": 0.5659795046138616, + "learning_rate": 3.6636717491883856e-07, + "loss": 1.8135, + "step": 12276 + }, + { + "epoch": 0.95, + "grad_norm": 0.5668541237269371, + "learning_rate": 3.6530243456522615e-07, + "loss": 1.8756, + "step": 12277 + }, + { + "epoch": 0.95, + "grad_norm": 0.5418833038128792, + "learning_rate": 3.6423923225698265e-07, + "loss": 1.8742, + "step": 12278 + }, + { + "epoch": 0.95, + "grad_norm": 0.5194763297402295, + "learning_rate": 3.6317756806049374e-07, + "loss": 2.085, + "step": 12279 + }, + { + "epoch": 0.95, + "grad_norm": 0.4996968547903169, + "learning_rate": 3.6211744204203703e-07, + "loss": 1.9095, + "step": 12280 + }, + { + "epoch": 0.95, + "grad_norm": 0.5346625108320361, + "learning_rate": 3.610588542678039e-07, + "loss": 1.7892, + "step": 12281 + }, + { + "epoch": 0.95, + "grad_norm": 0.5359006757533082, + "learning_rate": 3.600018048038861e-07, + "loss": 1.8787, + "step": 12282 + }, + { + "epoch": 0.95, + "grad_norm": 0.5397848866104188, + "learning_rate": 3.5894629371628073e-07, + "loss": 2.0539, + "step": 12283 + }, + { + "epoch": 0.95, + "grad_norm": 0.5493073636740198, + "learning_rate": 3.5789232107088786e-07, + "loss": 1.8393, + "step": 12284 + }, + { + "epoch": 0.95, + "grad_norm": 0.5549291537615051, + "learning_rate": 3.568398869335049e-07, + "loss": 1.8239, + "step": 12285 + }, + { + "epoch": 0.95, + "grad_norm": 0.5191194307834666, + "learning_rate": 3.557889913698459e-07, + "loss": 1.9295, + "step": 12286 + }, + { + "epoch": 0.95, + "grad_norm": 0.550780643054991, + "learning_rate": 3.547396344455223e-07, + "loss": 2.106, + "step": 12287 + }, + { + "epoch": 0.95, + "grad_norm": 0.532555199114257, + "learning_rate": 3.536918162260483e-07, + "loss": 1.8294, + "step": 12288 + }, + { + "epoch": 0.95, + "grad_norm": 0.5536490952472499, + "learning_rate": 3.5264553677684386e-07, + "loss": 1.883, + "step": 12289 + }, + { + "epoch": 0.95, + "grad_norm": 0.5538876429713857, + "learning_rate": 3.516007961632317e-07, + "loss": 1.8884, + "step": 12290 + }, + { + "epoch": 0.95, + "grad_norm": 0.5402184041294837, + "learning_rate": 3.5055759445044025e-07, + "loss": 2.0005, + "step": 12291 + }, + { + "epoch": 0.95, + "grad_norm": 0.4989007938093657, + "learning_rate": 3.4951593170360066e-07, + "loss": 1.9393, + "step": 12292 + }, + { + "epoch": 0.95, + "grad_norm": 0.5640278942238962, + "learning_rate": 3.4847580798774983e-07, + "loss": 1.8135, + "step": 12293 + }, + { + "epoch": 0.95, + "grad_norm": 0.526503125647013, + "learning_rate": 3.474372233678219e-07, + "loss": 1.8382, + "step": 12294 + }, + { + "epoch": 0.95, + "grad_norm": 0.5370124172405291, + "learning_rate": 3.4640017790866506e-07, + "loss": 2.0725, + "step": 12295 + }, + { + "epoch": 0.95, + "grad_norm": 0.5458650143859223, + "learning_rate": 3.453646716750247e-07, + "loss": 1.8321, + "step": 12296 + }, + { + "epoch": 0.95, + "grad_norm": 0.5810408513485574, + "learning_rate": 3.4433070473155194e-07, + "loss": 1.8435, + "step": 12297 + }, + { + "epoch": 0.95, + "grad_norm": 0.508044175636359, + "learning_rate": 3.4329827714280336e-07, + "loss": 1.9192, + "step": 12298 + }, + { + "epoch": 0.95, + "grad_norm": 0.5245742263355347, + "learning_rate": 3.422673889732303e-07, + "loss": 2.0712, + "step": 12299 + }, + { + "epoch": 0.95, + "grad_norm": 0.5554381885030606, + "learning_rate": 3.412380402872062e-07, + "loss": 1.8641, + "step": 12300 + }, + { + "epoch": 0.95, + "grad_norm": 0.5508865719830663, + "learning_rate": 3.40210231148988e-07, + "loss": 1.7974, + "step": 12301 + }, + { + "epoch": 0.95, + "grad_norm": 0.5603484641934283, + "learning_rate": 3.3918396162275214e-07, + "loss": 1.8769, + "step": 12302 + }, + { + "epoch": 0.95, + "grad_norm": 0.5285344672910878, + "learning_rate": 3.3815923177256956e-07, + "loss": 2.1095, + "step": 12303 + }, + { + "epoch": 0.95, + "grad_norm": 0.5047771148356289, + "learning_rate": 3.3713604166241973e-07, + "loss": 1.922, + "step": 12304 + }, + { + "epoch": 0.95, + "grad_norm": 0.5493741563796235, + "learning_rate": 3.361143913561848e-07, + "loss": 1.8481, + "step": 12305 + }, + { + "epoch": 0.95, + "grad_norm": 0.5149484210637101, + "learning_rate": 3.3509428091764996e-07, + "loss": 1.8607, + "step": 12306 + }, + { + "epoch": 0.95, + "grad_norm": 0.5254544785840605, + "learning_rate": 3.3407571041050577e-07, + "loss": 2.0649, + "step": 12307 + }, + { + "epoch": 0.95, + "grad_norm": 0.5284349277228865, + "learning_rate": 3.3305867989834596e-07, + "loss": 1.8172, + "step": 12308 + }, + { + "epoch": 0.95, + "grad_norm": 0.5528527644226342, + "learning_rate": 3.3204318944466406e-07, + "loss": 1.806, + "step": 12309 + }, + { + "epoch": 0.95, + "grad_norm": 0.5477552595381143, + "learning_rate": 3.310292391128678e-07, + "loss": 1.9241, + "step": 12310 + }, + { + "epoch": 0.95, + "grad_norm": 0.4989331953621159, + "learning_rate": 3.300168289662564e-07, + "loss": 2.0885, + "step": 12311 + }, + { + "epoch": 0.95, + "grad_norm": 0.551681746850437, + "learning_rate": 3.2900595906804053e-07, + "loss": 1.8621, + "step": 12312 + }, + { + "epoch": 0.95, + "grad_norm": 0.5506843293759881, + "learning_rate": 3.2799662948133635e-07, + "loss": 1.8343, + "step": 12313 + }, + { + "epoch": 0.95, + "grad_norm": 0.5886823406736493, + "learning_rate": 3.2698884026915734e-07, + "loss": 1.876, + "step": 12314 + }, + { + "epoch": 0.95, + "grad_norm": 0.5604859639607, + "learning_rate": 3.2598259149442254e-07, + "loss": 2.0219, + "step": 12315 + }, + { + "epoch": 0.95, + "grad_norm": 0.5779869988479575, + "learning_rate": 3.2497788321995957e-07, + "loss": 1.8455, + "step": 12316 + }, + { + "epoch": 0.95, + "grad_norm": 0.502363936274882, + "learning_rate": 3.239747155084932e-07, + "loss": 1.9543, + "step": 12317 + }, + { + "epoch": 0.95, + "grad_norm": 0.553722107028103, + "learning_rate": 3.229730884226567e-07, + "loss": 1.8599, + "step": 12318 + }, + { + "epoch": 0.95, + "grad_norm": 0.5629480882485736, + "learning_rate": 3.219730020249889e-07, + "loss": 2.0727, + "step": 12319 + }, + { + "epoch": 0.95, + "grad_norm": 0.587033926305675, + "learning_rate": 3.2097445637792324e-07, + "loss": 1.8629, + "step": 12320 + }, + { + "epoch": 0.95, + "grad_norm": 0.5472509987159287, + "learning_rate": 3.1997745154380977e-07, + "loss": 1.8708, + "step": 12321 + }, + { + "epoch": 0.95, + "grad_norm": 0.5751421174249743, + "learning_rate": 3.189819875848876e-07, + "loss": 1.8544, + "step": 12322 + }, + { + "epoch": 0.95, + "grad_norm": 0.5402861685040083, + "learning_rate": 3.179880645633182e-07, + "loss": 1.9086, + "step": 12323 + }, + { + "epoch": 0.95, + "grad_norm": 0.5241695954677373, + "learning_rate": 3.169956825411435e-07, + "loss": 2.0229, + "step": 12324 + }, + { + "epoch": 0.95, + "grad_norm": 0.5430333294428568, + "learning_rate": 3.160048415803335e-07, + "loss": 1.8735, + "step": 12325 + }, + { + "epoch": 0.95, + "grad_norm": 0.5504624968284116, + "learning_rate": 3.150155417427442e-07, + "loss": 1.8777, + "step": 12326 + }, + { + "epoch": 0.95, + "grad_norm": 0.5337205091858703, + "learning_rate": 3.140277830901428e-07, + "loss": 2.0682, + "step": 12327 + }, + { + "epoch": 0.95, + "grad_norm": 0.5847126676620163, + "learning_rate": 3.1304156568419675e-07, + "loss": 1.8632, + "step": 12328 + }, + { + "epoch": 0.95, + "grad_norm": 0.5057182113660615, + "learning_rate": 3.1205688958648725e-07, + "loss": 1.9295, + "step": 12329 + }, + { + "epoch": 0.95, + "grad_norm": 0.5211825750872021, + "learning_rate": 3.110737548584847e-07, + "loss": 1.8267, + "step": 12330 + }, + { + "epoch": 0.95, + "grad_norm": 0.5182513614885402, + "learning_rate": 3.1009216156157315e-07, + "loss": 2.0463, + "step": 12331 + }, + { + "epoch": 0.95, + "grad_norm": 0.5537858737262462, + "learning_rate": 3.091121097570343e-07, + "loss": 1.85, + "step": 12332 + }, + { + "epoch": 0.95, + "grad_norm": 0.5690584144719656, + "learning_rate": 3.08133599506058e-07, + "loss": 1.7827, + "step": 12333 + }, + { + "epoch": 0.95, + "grad_norm": 0.5473953630463927, + "learning_rate": 3.0715663086974265e-07, + "loss": 1.8585, + "step": 12334 + }, + { + "epoch": 0.95, + "grad_norm": 0.5607094569621096, + "learning_rate": 3.0618120390907555e-07, + "loss": 2.0552, + "step": 12335 + }, + { + "epoch": 0.95, + "grad_norm": 0.4985698285304345, + "learning_rate": 3.0520731868496355e-07, + "loss": 1.8852, + "step": 12336 + }, + { + "epoch": 0.95, + "grad_norm": 0.579430602208256, + "learning_rate": 3.04234975258208e-07, + "loss": 1.8359, + "step": 12337 + }, + { + "epoch": 0.95, + "grad_norm": 0.5783297342988696, + "learning_rate": 3.032641736895103e-07, + "loss": 1.9002, + "step": 12338 + }, + { + "epoch": 0.95, + "grad_norm": 0.5155027762659877, + "learning_rate": 3.022949140394915e-07, + "loss": 1.9951, + "step": 12339 + }, + { + "epoch": 0.95, + "grad_norm": 0.5263107517911539, + "learning_rate": 3.0132719636866146e-07, + "loss": 1.8384, + "step": 12340 + }, + { + "epoch": 0.95, + "grad_norm": 0.5398907210331203, + "learning_rate": 3.0036102073743855e-07, + "loss": 1.8755, + "step": 12341 + }, + { + "epoch": 0.95, + "grad_norm": 0.5041202731640542, + "learning_rate": 2.993963872061467e-07, + "loss": 1.9217, + "step": 12342 + }, + { + "epoch": 0.95, + "grad_norm": 0.5451679361804138, + "learning_rate": 2.9843329583501e-07, + "loss": 1.848, + "step": 12343 + }, + { + "epoch": 0.95, + "grad_norm": 0.5274908179825102, + "learning_rate": 2.974717466841581e-07, + "loss": 2.0717, + "step": 12344 + }, + { + "epoch": 0.95, + "grad_norm": 0.5598256165934916, + "learning_rate": 2.965117398136291e-07, + "loss": 1.8405, + "step": 12345 + }, + { + "epoch": 0.95, + "grad_norm": 0.5737466474565918, + "learning_rate": 2.955532752833501e-07, + "loss": 1.8285, + "step": 12346 + }, + { + "epoch": 0.95, + "grad_norm": 0.5370546368731641, + "learning_rate": 2.945963531531759e-07, + "loss": 2.0828, + "step": 12347 + }, + { + "epoch": 0.95, + "grad_norm": 0.5228934012992893, + "learning_rate": 2.936409734828394e-07, + "loss": 1.8933, + "step": 12348 + }, + { + "epoch": 0.95, + "grad_norm": 0.5694115245380343, + "learning_rate": 2.9268713633199276e-07, + "loss": 1.895, + "step": 12349 + }, + { + "epoch": 0.95, + "grad_norm": 0.577088092875283, + "learning_rate": 2.9173484176019393e-07, + "loss": 1.8677, + "step": 12350 + }, + { + "epoch": 0.95, + "grad_norm": 0.5231379914295355, + "learning_rate": 2.907840898268871e-07, + "loss": 2.0329, + "step": 12351 + }, + { + "epoch": 0.95, + "grad_norm": 0.5520524327905233, + "learning_rate": 2.8983488059144135e-07, + "loss": 1.803, + "step": 12352 + }, + { + "epoch": 0.95, + "grad_norm": 0.5858015789056435, + "learning_rate": 2.8888721411311494e-07, + "loss": 1.8046, + "step": 12353 + }, + { + "epoch": 0.95, + "grad_norm": 0.5491024458431032, + "learning_rate": 2.879410904510743e-07, + "loss": 1.9226, + "step": 12354 + }, + { + "epoch": 0.95, + "grad_norm": 0.5810116934631, + "learning_rate": 2.8699650966439174e-07, + "loss": 1.8539, + "step": 12355 + }, + { + "epoch": 0.95, + "grad_norm": 0.5426844998355125, + "learning_rate": 2.8605347181203946e-07, + "loss": 2.0453, + "step": 12356 + }, + { + "epoch": 0.95, + "grad_norm": 0.5569202869658801, + "learning_rate": 2.8511197695289815e-07, + "loss": 1.8935, + "step": 12357 + }, + { + "epoch": 0.95, + "grad_norm": 0.5423199332722081, + "learning_rate": 2.841720251457458e-07, + "loss": 1.836, + "step": 12358 + }, + { + "epoch": 0.95, + "grad_norm": 0.5448797739159761, + "learning_rate": 2.8323361644926883e-07, + "loss": 2.1123, + "step": 12359 + }, + { + "epoch": 0.95, + "grad_norm": 0.4761314574841633, + "learning_rate": 2.822967509220592e-07, + "loss": 1.9265, + "step": 12360 + }, + { + "epoch": 0.95, + "grad_norm": 0.5420007857759062, + "learning_rate": 2.813614286226035e-07, + "loss": 1.8603, + "step": 12361 + }, + { + "epoch": 0.95, + "grad_norm": 0.5652302429272086, + "learning_rate": 2.804276496093022e-07, + "loss": 1.8996, + "step": 12362 + }, + { + "epoch": 0.95, + "grad_norm": 0.5186137867521708, + "learning_rate": 2.794954139404532e-07, + "loss": 2.0704, + "step": 12363 + }, + { + "epoch": 0.95, + "grad_norm": 0.5674649644703988, + "learning_rate": 2.7856472167425707e-07, + "loss": 1.8336, + "step": 12364 + }, + { + "epoch": 0.95, + "grad_norm": 0.5600444592478302, + "learning_rate": 2.776355728688257e-07, + "loss": 1.8927, + "step": 12365 + }, + { + "epoch": 0.95, + "grad_norm": 0.5541777553761364, + "learning_rate": 2.767079675821682e-07, + "loss": 1.8561, + "step": 12366 + }, + { + "epoch": 0.95, + "grad_norm": 0.5018088116267028, + "learning_rate": 2.757819058721967e-07, + "loss": 1.9305, + "step": 12367 + }, + { + "epoch": 0.95, + "grad_norm": 0.528825069283403, + "learning_rate": 2.7485738779673143e-07, + "loss": 2.0703, + "step": 12368 + }, + { + "epoch": 0.95, + "grad_norm": 0.5407507583267065, + "learning_rate": 2.7393441341349304e-07, + "loss": 1.8262, + "step": 12369 + }, + { + "epoch": 0.95, + "grad_norm": 0.5603465324800216, + "learning_rate": 2.7301298278010755e-07, + "loss": 1.8728, + "step": 12370 + }, + { + "epoch": 0.95, + "grad_norm": 0.5182275020324781, + "learning_rate": 2.720930959540985e-07, + "loss": 2.0472, + "step": 12371 + }, + { + "epoch": 0.95, + "grad_norm": 0.5566520161880496, + "learning_rate": 2.7117475299290594e-07, + "loss": 1.8426, + "step": 12372 + }, + { + "epoch": 0.95, + "grad_norm": 0.5137081713735653, + "learning_rate": 2.702579539538619e-07, + "loss": 1.9067, + "step": 12373 + }, + { + "epoch": 0.95, + "grad_norm": 0.5379390680171702, + "learning_rate": 2.6934269889420384e-07, + "loss": 1.8573, + "step": 12374 + }, + { + "epoch": 0.95, + "grad_norm": 0.5805139441556773, + "learning_rate": 2.6842898787107773e-07, + "loss": 1.8504, + "step": 12375 + }, + { + "epoch": 0.95, + "grad_norm": 0.5328653307281224, + "learning_rate": 2.675168209415324e-07, + "loss": 2.0335, + "step": 12376 + }, + { + "epoch": 0.95, + "grad_norm": 0.553313503276388, + "learning_rate": 2.6660619816251397e-07, + "loss": 1.8511, + "step": 12377 + }, + { + "epoch": 0.95, + "grad_norm": 0.5497424013014071, + "learning_rate": 2.6569711959087693e-07, + "loss": 1.8623, + "step": 12378 + }, + { + "epoch": 0.96, + "grad_norm": 0.5449162646944041, + "learning_rate": 2.647895852833815e-07, + "loss": 1.9373, + "step": 12379 + }, + { + "epoch": 0.96, + "grad_norm": 0.5414848391700594, + "learning_rate": 2.6388359529668506e-07, + "loss": 2.0285, + "step": 12380 + }, + { + "epoch": 0.96, + "grad_norm": 0.544323035405987, + "learning_rate": 2.6297914968735625e-07, + "loss": 1.7962, + "step": 12381 + }, + { + "epoch": 0.96, + "grad_norm": 0.5334283845142075, + "learning_rate": 2.6207624851185833e-07, + "loss": 1.8634, + "step": 12382 + }, + { + "epoch": 0.96, + "grad_norm": 0.5460798667873793, + "learning_rate": 2.6117489182656827e-07, + "loss": 2.0472, + "step": 12383 + }, + { + "epoch": 0.96, + "grad_norm": 0.5602045842838083, + "learning_rate": 2.6027507968776067e-07, + "loss": 1.8211, + "step": 12384 + }, + { + "epoch": 0.96, + "grad_norm": 0.5066887380268822, + "learning_rate": 2.593768121516099e-07, + "loss": 1.9263, + "step": 12385 + }, + { + "epoch": 0.96, + "grad_norm": 0.5424878535430804, + "learning_rate": 2.584800892742073e-07, + "loss": 1.8474, + "step": 12386 + }, + { + "epoch": 0.96, + "grad_norm": 0.536058109031376, + "learning_rate": 2.575849111115275e-07, + "loss": 1.8758, + "step": 12387 + }, + { + "epoch": 0.96, + "grad_norm": 0.5500069813474334, + "learning_rate": 2.5669127771946743e-07, + "loss": 2.0468, + "step": 12388 + }, + { + "epoch": 0.96, + "grad_norm": 0.542400335799354, + "learning_rate": 2.557991891538214e-07, + "loss": 1.8158, + "step": 12389 + }, + { + "epoch": 0.96, + "grad_norm": 0.5089344621594065, + "learning_rate": 2.549086454702837e-07, + "loss": 1.8096, + "step": 12390 + }, + { + "epoch": 0.96, + "grad_norm": 0.5108176060258455, + "learning_rate": 2.5401964672445154e-07, + "loss": 1.895, + "step": 12391 + }, + { + "epoch": 0.96, + "grad_norm": 0.5298504557373275, + "learning_rate": 2.531321929718333e-07, + "loss": 2.0107, + "step": 12392 + }, + { + "epoch": 0.96, + "grad_norm": 0.5352394143456641, + "learning_rate": 2.522462842678347e-07, + "loss": 1.8577, + "step": 12393 + }, + { + "epoch": 0.96, + "grad_norm": 0.5390564275935895, + "learning_rate": 2.513619206677698e-07, + "loss": 1.8268, + "step": 12394 + }, + { + "epoch": 0.96, + "grad_norm": 0.5288484907912087, + "learning_rate": 2.504791022268471e-07, + "loss": 2.093, + "step": 12395 + }, + { + "epoch": 0.96, + "grad_norm": 0.5619047506054939, + "learning_rate": 2.495978290001866e-07, + "loss": 1.8471, + "step": 12396 + }, + { + "epoch": 0.96, + "grad_norm": 0.5651827248256176, + "learning_rate": 2.4871810104281356e-07, + "loss": 1.925, + "step": 12397 + }, + { + "epoch": 0.96, + "grad_norm": 0.4992166862084879, + "learning_rate": 2.4783991840964805e-07, + "loss": 1.9487, + "step": 12398 + }, + { + "epoch": 0.96, + "grad_norm": 0.5230245578010629, + "learning_rate": 2.469632811555239e-07, + "loss": 1.8326, + "step": 12399 + }, + { + "epoch": 0.96, + "grad_norm": 0.5348566496717568, + "learning_rate": 2.4608818933516686e-07, + "loss": 2.0406, + "step": 12400 + }, + { + "epoch": 0.96, + "grad_norm": 0.5240812360986895, + "learning_rate": 2.452146430032165e-07, + "loss": 1.8626, + "step": 12401 + }, + { + "epoch": 0.96, + "grad_norm": 0.5448488850190394, + "learning_rate": 2.4434264221421255e-07, + "loss": 1.8479, + "step": 12402 + }, + { + "epoch": 0.96, + "grad_norm": 0.528819997492905, + "learning_rate": 2.434721870225948e-07, + "loss": 2.0754, + "step": 12403 + }, + { + "epoch": 0.96, + "grad_norm": 0.521779962255133, + "learning_rate": 2.4260327748271136e-07, + "loss": 1.9587, + "step": 12404 + }, + { + "epoch": 0.96, + "grad_norm": 0.5457535017597165, + "learning_rate": 2.4173591364881066e-07, + "loss": 1.8263, + "step": 12405 + }, + { + "epoch": 0.96, + "grad_norm": 0.556643670626656, + "learning_rate": 2.408700955750492e-07, + "loss": 1.8918, + "step": 12406 + }, + { + "epoch": 0.96, + "grad_norm": 0.5809544877309927, + "learning_rate": 2.400058233154784e-07, + "loss": 1.8611, + "step": 12407 + }, + { + "epoch": 0.96, + "grad_norm": 0.5382186129127595, + "learning_rate": 2.3914309692406045e-07, + "loss": 2.0274, + "step": 12408 + }, + { + "epoch": 0.96, + "grad_norm": 0.5224938864866695, + "learning_rate": 2.382819164546579e-07, + "loss": 1.8325, + "step": 12409 + }, + { + "epoch": 0.96, + "grad_norm": 0.5420614482794511, + "learning_rate": 2.3742228196104166e-07, + "loss": 1.91, + "step": 12410 + }, + { + "epoch": 0.96, + "grad_norm": 0.5381463122715112, + "learning_rate": 2.3656419349687986e-07, + "loss": 1.8296, + "step": 12411 + }, + { + "epoch": 0.96, + "grad_norm": 0.5261606163973161, + "learning_rate": 2.3570765111574354e-07, + "loss": 2.0603, + "step": 12412 + }, + { + "epoch": 0.96, + "grad_norm": 0.557720406692031, + "learning_rate": 2.3485265487111497e-07, + "loss": 1.8317, + "step": 12413 + }, + { + "epoch": 0.96, + "grad_norm": 0.5059603519031224, + "learning_rate": 2.3399920481637083e-07, + "loss": 1.7887, + "step": 12414 + }, + { + "epoch": 0.96, + "grad_norm": 0.5556067719900546, + "learning_rate": 2.331473010047991e-07, + "loss": 2.0533, + "step": 12415 + }, + { + "epoch": 0.96, + "grad_norm": 0.5352671376490844, + "learning_rate": 2.3229694348958497e-07, + "loss": 1.9135, + "step": 12416 + }, + { + "epoch": 0.96, + "grad_norm": 0.5460479944231387, + "learning_rate": 2.314481323238249e-07, + "loss": 1.8527, + "step": 12417 + }, + { + "epoch": 0.96, + "grad_norm": 0.5762529193680177, + "learning_rate": 2.3060086756050426e-07, + "loss": 1.8838, + "step": 12418 + }, + { + "epoch": 0.96, + "grad_norm": 0.5794252363952456, + "learning_rate": 2.2975514925253072e-07, + "loss": 1.8713, + "step": 12419 + }, + { + "epoch": 0.96, + "grad_norm": 0.5316220642173061, + "learning_rate": 2.2891097745270097e-07, + "loss": 2.0842, + "step": 12420 + }, + { + "epoch": 0.96, + "grad_norm": 0.5396891263817313, + "learning_rate": 2.2806835221372003e-07, + "loss": 1.8976, + "step": 12421 + }, + { + "epoch": 0.96, + "grad_norm": 0.5018710322250157, + "learning_rate": 2.2722727358819584e-07, + "loss": 1.9232, + "step": 12422 + }, + { + "epoch": 0.96, + "grad_norm": 0.5320132030177137, + "learning_rate": 2.2638774162864751e-07, + "loss": 1.8492, + "step": 12423 + }, + { + "epoch": 0.96, + "grad_norm": 0.5182018472798056, + "learning_rate": 2.2554975638747754e-07, + "loss": 2.0256, + "step": 12424 + }, + { + "epoch": 0.96, + "grad_norm": 0.549548814503514, + "learning_rate": 2.2471331791701912e-07, + "loss": 1.8569, + "step": 12425 + }, + { + "epoch": 0.96, + "grad_norm": 0.5434820488029828, + "learning_rate": 2.2387842626948319e-07, + "loss": 1.8361, + "step": 12426 + }, + { + "epoch": 0.96, + "grad_norm": 0.554275302082161, + "learning_rate": 2.230450814970031e-07, + "loss": 1.8847, + "step": 12427 + }, + { + "epoch": 0.96, + "grad_norm": 0.5276322185752452, + "learning_rate": 2.2221328365160386e-07, + "loss": 2.0158, + "step": 12428 + }, + { + "epoch": 0.96, + "grad_norm": 0.5062758494390275, + "learning_rate": 2.2138303278521622e-07, + "loss": 1.9021, + "step": 12429 + }, + { + "epoch": 0.96, + "grad_norm": 0.5523816308607804, + "learning_rate": 2.20554328949682e-07, + "loss": 1.8757, + "step": 12430 + }, + { + "epoch": 0.96, + "grad_norm": 0.5417471527920115, + "learning_rate": 2.1972717219673754e-07, + "loss": 1.8609, + "step": 12431 + }, + { + "epoch": 0.96, + "grad_norm": 0.5010203023610377, + "learning_rate": 2.1890156257802495e-07, + "loss": 2.0392, + "step": 12432 + }, + { + "epoch": 0.96, + "grad_norm": 0.5671128469426724, + "learning_rate": 2.1807750014509186e-07, + "loss": 1.8746, + "step": 12433 + }, + { + "epoch": 0.96, + "grad_norm": 0.5450920054437463, + "learning_rate": 2.1725498494938602e-07, + "loss": 1.8084, + "step": 12434 + }, + { + "epoch": 0.96, + "grad_norm": 0.506297053819699, + "learning_rate": 2.1643401704226075e-07, + "loss": 1.9425, + "step": 12435 + }, + { + "epoch": 0.96, + "grad_norm": 0.5185994738775137, + "learning_rate": 2.1561459647497783e-07, + "loss": 2.0042, + "step": 12436 + }, + { + "epoch": 0.96, + "grad_norm": 0.548419339855689, + "learning_rate": 2.1479672329868805e-07, + "loss": 1.8826, + "step": 12437 + }, + { + "epoch": 0.96, + "grad_norm": 0.5537267920297948, + "learning_rate": 2.1398039756445886e-07, + "loss": 1.805, + "step": 12438 + }, + { + "epoch": 0.96, + "grad_norm": 0.5423706764836805, + "learning_rate": 2.1316561932325785e-07, + "loss": 1.8168, + "step": 12439 + }, + { + "epoch": 0.96, + "grad_norm": 0.513332141458772, + "learning_rate": 2.1235238862595541e-07, + "loss": 2.0412, + "step": 12440 + }, + { + "epoch": 0.96, + "grad_norm": 0.5121199465188533, + "learning_rate": 2.1154070552332206e-07, + "loss": 1.9127, + "step": 12441 + }, + { + "epoch": 0.96, + "grad_norm": 0.545177019456379, + "learning_rate": 2.1073057006603669e-07, + "loss": 1.8367, + "step": 12442 + }, + { + "epoch": 0.96, + "grad_norm": 0.5396070560126897, + "learning_rate": 2.0992198230467829e-07, + "loss": 1.8216, + "step": 12443 + }, + { + "epoch": 0.96, + "grad_norm": 0.5089123989148564, + "learning_rate": 2.091149422897315e-07, + "loss": 2.0106, + "step": 12444 + }, + { + "epoch": 0.96, + "grad_norm": 0.5256701337816259, + "learning_rate": 2.08309450071581e-07, + "loss": 1.8689, + "step": 12445 + }, + { + "epoch": 0.96, + "grad_norm": 0.5253200391424355, + "learning_rate": 2.0750550570051984e-07, + "loss": 1.8227, + "step": 12446 + }, + { + "epoch": 0.96, + "grad_norm": 0.5052653570887029, + "learning_rate": 2.067031092267385e-07, + "loss": 1.9541, + "step": 12447 + }, + { + "epoch": 0.96, + "grad_norm": 0.5224144641889229, + "learning_rate": 2.059022607003358e-07, + "loss": 2.0869, + "step": 12448 + }, + { + "epoch": 0.96, + "grad_norm": 0.5717522517304106, + "learning_rate": 2.0510296017131336e-07, + "loss": 1.8272, + "step": 12449 + }, + { + "epoch": 0.96, + "grad_norm": 0.5423504368693773, + "learning_rate": 2.0430520768957295e-07, + "loss": 1.8448, + "step": 12450 + }, + { + "epoch": 0.96, + "grad_norm": 0.5559752937273479, + "learning_rate": 2.0350900330491917e-07, + "loss": 1.8296, + "step": 12451 + }, + { + "epoch": 0.96, + "grad_norm": 0.5219547363193888, + "learning_rate": 2.0271434706706782e-07, + "loss": 2.0336, + "step": 12452 + }, + { + "epoch": 0.96, + "grad_norm": 0.5079057157754275, + "learning_rate": 2.0192123902562642e-07, + "loss": 1.9552, + "step": 12453 + }, + { + "epoch": 0.96, + "grad_norm": 0.5485193374667694, + "learning_rate": 2.011296792301165e-07, + "loss": 1.8395, + "step": 12454 + }, + { + "epoch": 0.96, + "grad_norm": 0.5293042047432247, + "learning_rate": 2.003396677299596e-07, + "loss": 1.8367, + "step": 12455 + }, + { + "epoch": 0.96, + "grad_norm": 0.5118128830368989, + "learning_rate": 1.9955120457447462e-07, + "loss": 2.0296, + "step": 12456 + }, + { + "epoch": 0.96, + "grad_norm": 0.5561386644196412, + "learning_rate": 1.9876428981288887e-07, + "loss": 1.87, + "step": 12457 + }, + { + "epoch": 0.96, + "grad_norm": 0.546218140041735, + "learning_rate": 1.97978923494338e-07, + "loss": 1.8283, + "step": 12458 + }, + { + "epoch": 0.96, + "grad_norm": 0.534924277059728, + "learning_rate": 1.9719510566784948e-07, + "loss": 1.8438, + "step": 12459 + }, + { + "epoch": 0.96, + "grad_norm": 0.5437760333591966, + "learning_rate": 1.9641283638236474e-07, + "loss": 2.0904, + "step": 12460 + }, + { + "epoch": 0.96, + "grad_norm": 0.5519887971950597, + "learning_rate": 1.9563211568672247e-07, + "loss": 1.8316, + "step": 12461 + }, + { + "epoch": 0.96, + "grad_norm": 0.5343767342809388, + "learning_rate": 1.948529436296642e-07, + "loss": 1.8202, + "step": 12462 + }, + { + "epoch": 0.96, + "grad_norm": 0.5475635817389152, + "learning_rate": 1.9407532025983997e-07, + "loss": 1.85, + "step": 12463 + }, + { + "epoch": 0.96, + "grad_norm": 0.5381371053068487, + "learning_rate": 1.9329924562579705e-07, + "loss": 2.0429, + "step": 12464 + }, + { + "epoch": 0.96, + "grad_norm": 0.5456808466454736, + "learning_rate": 1.9252471977599385e-07, + "loss": 1.8589, + "step": 12465 + }, + { + "epoch": 0.96, + "grad_norm": 0.5102659653264047, + "learning_rate": 1.917517427587834e-07, + "loss": 1.9421, + "step": 12466 + }, + { + "epoch": 0.96, + "grad_norm": 0.568506163859508, + "learning_rate": 1.9098031462242705e-07, + "loss": 1.8757, + "step": 12467 + }, + { + "epoch": 0.96, + "grad_norm": 0.533655515387091, + "learning_rate": 1.9021043541508354e-07, + "loss": 1.9989, + "step": 12468 + }, + { + "epoch": 0.96, + "grad_norm": 0.5743923118102602, + "learning_rate": 1.8944210518482552e-07, + "loss": 1.8672, + "step": 12469 + }, + { + "epoch": 0.96, + "grad_norm": 0.5570272160654681, + "learning_rate": 1.8867532397962573e-07, + "loss": 1.8704, + "step": 12470 + }, + { + "epoch": 0.96, + "grad_norm": 0.558709378767941, + "learning_rate": 1.8791009184734588e-07, + "loss": 1.8772, + "step": 12471 + }, + { + "epoch": 0.96, + "grad_norm": 0.5356752272922317, + "learning_rate": 1.871464088357755e-07, + "loss": 2.1065, + "step": 12472 + }, + { + "epoch": 0.96, + "grad_norm": 0.5583683539766308, + "learning_rate": 1.863842749925848e-07, + "loss": 1.8405, + "step": 12473 + }, + { + "epoch": 0.96, + "grad_norm": 0.533271416335019, + "learning_rate": 1.8562369036536076e-07, + "loss": 1.8053, + "step": 12474 + }, + { + "epoch": 0.96, + "grad_norm": 0.5319187372987666, + "learning_rate": 1.848646550015931e-07, + "loss": 1.818, + "step": 12475 + }, + { + "epoch": 0.96, + "grad_norm": 0.5440419798774808, + "learning_rate": 1.8410716894866342e-07, + "loss": 2.0316, + "step": 12476 + }, + { + "epoch": 0.96, + "grad_norm": 0.5462511133741799, + "learning_rate": 1.8335123225387275e-07, + "loss": 1.8281, + "step": 12477 + }, + { + "epoch": 0.96, + "grad_norm": 0.4862759817667618, + "learning_rate": 1.8259684496441387e-07, + "loss": 1.8953, + "step": 12478 + }, + { + "epoch": 0.96, + "grad_norm": 0.5691828856888608, + "learning_rate": 1.8184400712738525e-07, + "loss": 1.8863, + "step": 12479 + }, + { + "epoch": 0.96, + "grad_norm": 0.530777394843586, + "learning_rate": 1.8109271878979372e-07, + "loss": 2.0429, + "step": 12480 + }, + { + "epoch": 0.96, + "grad_norm": 0.5742917196912182, + "learning_rate": 1.8034297999854066e-07, + "loss": 1.7977, + "step": 12481 + }, + { + "epoch": 0.96, + "grad_norm": 0.537871279138477, + "learning_rate": 1.7959479080043305e-07, + "loss": 1.8537, + "step": 12482 + }, + { + "epoch": 0.96, + "grad_norm": 0.5769397914140664, + "learning_rate": 1.788481512421919e-07, + "loss": 1.8751, + "step": 12483 + }, + { + "epoch": 0.96, + "grad_norm": 0.5067905128618029, + "learning_rate": 1.7810306137042987e-07, + "loss": 2.1057, + "step": 12484 + }, + { + "epoch": 0.96, + "grad_norm": 0.5509915554759925, + "learning_rate": 1.7735952123165978e-07, + "loss": 1.9035, + "step": 12485 + }, + { + "epoch": 0.96, + "grad_norm": 0.5519980230005824, + "learning_rate": 1.7661753087231115e-07, + "loss": 1.8594, + "step": 12486 + }, + { + "epoch": 0.96, + "grad_norm": 0.5292472963154756, + "learning_rate": 1.7587709033870803e-07, + "loss": 1.8472, + "step": 12487 + }, + { + "epoch": 0.96, + "grad_norm": 0.5157220036224456, + "learning_rate": 1.7513819967707735e-07, + "loss": 2.0532, + "step": 12488 + }, + { + "epoch": 0.96, + "grad_norm": 0.5377902554672803, + "learning_rate": 1.7440085893355163e-07, + "loss": 1.8323, + "step": 12489 + }, + { + "epoch": 0.96, + "grad_norm": 0.5715956149480098, + "learning_rate": 1.7366506815416627e-07, + "loss": 1.7942, + "step": 12490 + }, + { + "epoch": 0.96, + "grad_norm": 0.4998533636995358, + "learning_rate": 1.7293082738486232e-07, + "loss": 1.8844, + "step": 12491 + }, + { + "epoch": 0.96, + "grad_norm": 0.5110151907441478, + "learning_rate": 1.7219813667147532e-07, + "loss": 2.0146, + "step": 12492 + }, + { + "epoch": 0.96, + "grad_norm": 0.5357589423193935, + "learning_rate": 1.714669960597548e-07, + "loss": 1.8153, + "step": 12493 + }, + { + "epoch": 0.96, + "grad_norm": 0.540694750369853, + "learning_rate": 1.7073740559535033e-07, + "loss": 1.864, + "step": 12494 + }, + { + "epoch": 0.96, + "grad_norm": 0.5152038171756893, + "learning_rate": 1.7000936532380884e-07, + "loss": 1.8593, + "step": 12495 + }, + { + "epoch": 0.96, + "grad_norm": 0.5327718587610514, + "learning_rate": 1.6928287529058563e-07, + "loss": 2.0654, + "step": 12496 + }, + { + "epoch": 0.96, + "grad_norm": 0.5011354476819369, + "learning_rate": 1.685579355410416e-07, + "loss": 1.898, + "step": 12497 + }, + { + "epoch": 0.96, + "grad_norm": 0.5593724148558932, + "learning_rate": 1.6783454612043502e-07, + "loss": 1.8614, + "step": 12498 + }, + { + "epoch": 0.96, + "grad_norm": 0.5398587042589721, + "learning_rate": 1.671127070739298e-07, + "loss": 1.8156, + "step": 12499 + }, + { + "epoch": 0.96, + "grad_norm": 0.5092281612917596, + "learning_rate": 1.6639241844659537e-07, + "loss": 2.0143, + "step": 12500 + }, + { + "epoch": 0.96, + "grad_norm": 0.5489207537679166, + "learning_rate": 1.6567368028340135e-07, + "loss": 1.8882, + "step": 12501 + }, + { + "epoch": 0.96, + "grad_norm": 0.5502652506950222, + "learning_rate": 1.6495649262922297e-07, + "loss": 1.8519, + "step": 12502 + }, + { + "epoch": 0.96, + "grad_norm": 0.5101195734327651, + "learning_rate": 1.6424085552883274e-07, + "loss": 1.8702, + "step": 12503 + }, + { + "epoch": 0.96, + "grad_norm": 0.5190898083988714, + "learning_rate": 1.635267690269171e-07, + "loss": 2.0807, + "step": 12504 + }, + { + "epoch": 0.96, + "grad_norm": 0.5240690807551348, + "learning_rate": 1.6281423316805434e-07, + "loss": 1.8883, + "step": 12505 + }, + { + "epoch": 0.96, + "grad_norm": 0.5585956554202949, + "learning_rate": 1.6210324799673382e-07, + "loss": 1.8834, + "step": 12506 + }, + { + "epoch": 0.96, + "grad_norm": 0.540225742463416, + "learning_rate": 1.613938135573395e-07, + "loss": 1.8514, + "step": 12507 + }, + { + "epoch": 0.96, + "grad_norm": 0.5396760004924651, + "learning_rate": 1.6068592989417485e-07, + "loss": 2.0752, + "step": 12508 + }, + { + "epoch": 0.97, + "grad_norm": 0.5030728033992318, + "learning_rate": 1.5997959705142673e-07, + "loss": 1.9776, + "step": 12509 + }, + { + "epoch": 0.97, + "grad_norm": 0.5551317564585917, + "learning_rate": 1.5927481507319875e-07, + "loss": 1.8449, + "step": 12510 + }, + { + "epoch": 0.97, + "grad_norm": 0.545625308832388, + "learning_rate": 1.585715840034918e-07, + "loss": 1.8684, + "step": 12511 + }, + { + "epoch": 0.97, + "grad_norm": 0.5301462697166901, + "learning_rate": 1.5786990388621247e-07, + "loss": 2.0515, + "step": 12512 + }, + { + "epoch": 0.97, + "grad_norm": 0.5312824407391193, + "learning_rate": 1.5716977476516737e-07, + "loss": 1.8124, + "step": 12513 + }, + { + "epoch": 0.97, + "grad_norm": 0.5524569144586571, + "learning_rate": 1.5647119668407429e-07, + "loss": 1.8629, + "step": 12514 + }, + { + "epoch": 0.97, + "grad_norm": 0.5511747793035497, + "learning_rate": 1.5577416968654e-07, + "loss": 1.8754, + "step": 12515 + }, + { + "epoch": 0.97, + "grad_norm": 0.512940529710257, + "learning_rate": 1.550786938160881e-07, + "loss": 2.0836, + "step": 12516 + }, + { + "epoch": 0.97, + "grad_norm": 0.5852547267474459, + "learning_rate": 1.543847691161393e-07, + "loss": 1.836, + "step": 12517 + }, + { + "epoch": 0.97, + "grad_norm": 0.5355785276558362, + "learning_rate": 1.5369239563001736e-07, + "loss": 1.8866, + "step": 12518 + }, + { + "epoch": 0.97, + "grad_norm": 0.5487670181302974, + "learning_rate": 1.5300157340095156e-07, + "loss": 1.825, + "step": 12519 + }, + { + "epoch": 0.97, + "grad_norm": 0.5091948934643001, + "learning_rate": 1.523123024720713e-07, + "loss": 2.055, + "step": 12520 + }, + { + "epoch": 0.97, + "grad_norm": 0.5175750687676114, + "learning_rate": 1.5162458288640878e-07, + "loss": 1.829, + "step": 12521 + }, + { + "epoch": 0.97, + "grad_norm": 0.5090117129478666, + "learning_rate": 1.5093841468690472e-07, + "loss": 1.9673, + "step": 12522 + }, + { + "epoch": 0.97, + "grad_norm": 0.5327583228384077, + "learning_rate": 1.5025379791639705e-07, + "loss": 1.8166, + "step": 12523 + }, + { + "epoch": 0.97, + "grad_norm": 0.5393044911573703, + "learning_rate": 1.4957073261763212e-07, + "loss": 2.099, + "step": 12524 + }, + { + "epoch": 0.97, + "grad_norm": 0.5356423405696384, + "learning_rate": 1.4888921883325357e-07, + "loss": 1.8642, + "step": 12525 + }, + { + "epoch": 0.97, + "grad_norm": 0.5392096903622182, + "learning_rate": 1.4820925660581075e-07, + "loss": 1.8182, + "step": 12526 + }, + { + "epoch": 0.97, + "grad_norm": 0.5359680310615048, + "learning_rate": 1.4753084597776133e-07, + "loss": 1.8436, + "step": 12527 + }, + { + "epoch": 0.97, + "grad_norm": 0.5153624859697878, + "learning_rate": 1.4685398699145482e-07, + "loss": 2.064, + "step": 12528 + }, + { + "epoch": 0.97, + "grad_norm": 0.5812000556549394, + "learning_rate": 1.4617867968915178e-07, + "loss": 1.8328, + "step": 12529 + }, + { + "epoch": 0.97, + "grad_norm": 0.5578181879258931, + "learning_rate": 1.4550492411301851e-07, + "loss": 1.8122, + "step": 12530 + }, + { + "epoch": 0.97, + "grad_norm": 0.5374745210501191, + "learning_rate": 1.4483272030511585e-07, + "loss": 1.8452, + "step": 12531 + }, + { + "epoch": 0.97, + "grad_norm": 0.5299271136333544, + "learning_rate": 1.4416206830741298e-07, + "loss": 2.0439, + "step": 12532 + }, + { + "epoch": 0.97, + "grad_norm": 0.5420342024211667, + "learning_rate": 1.4349296816178746e-07, + "loss": 1.7912, + "step": 12533 + }, + { + "epoch": 0.97, + "grad_norm": 0.5325794389655181, + "learning_rate": 1.4282541991000598e-07, + "loss": 1.9735, + "step": 12534 + }, + { + "epoch": 0.97, + "grad_norm": 0.5306285657268817, + "learning_rate": 1.4215942359374902e-07, + "loss": 1.821, + "step": 12535 + }, + { + "epoch": 0.97, + "grad_norm": 0.5312574800939726, + "learning_rate": 1.414949792545972e-07, + "loss": 2.0612, + "step": 12536 + }, + { + "epoch": 0.97, + "grad_norm": 0.5526942710250652, + "learning_rate": 1.4083208693403683e-07, + "loss": 1.8297, + "step": 12537 + }, + { + "epoch": 0.97, + "grad_norm": 0.524278994127449, + "learning_rate": 1.4017074667345144e-07, + "loss": 1.8396, + "step": 12538 + }, + { + "epoch": 0.97, + "grad_norm": 0.5653408535402199, + "learning_rate": 1.3951095851413576e-07, + "loss": 1.8231, + "step": 12539 + }, + { + "epoch": 0.97, + "grad_norm": 0.4859839720252603, + "learning_rate": 1.3885272249727632e-07, + "loss": 1.9353, + "step": 12540 + }, + { + "epoch": 0.97, + "grad_norm": 0.5166221049756597, + "learning_rate": 1.3819603866397635e-07, + "loss": 2.0634, + "step": 12541 + }, + { + "epoch": 0.97, + "grad_norm": 0.5386256100596942, + "learning_rate": 1.375409070552336e-07, + "loss": 1.8795, + "step": 12542 + }, + { + "epoch": 0.97, + "grad_norm": 0.5714181341589131, + "learning_rate": 1.3688732771194868e-07, + "loss": 1.8362, + "step": 12543 + }, + { + "epoch": 0.97, + "grad_norm": 0.5083934969048498, + "learning_rate": 1.3623530067492785e-07, + "loss": 2.0956, + "step": 12544 + }, + { + "epoch": 0.97, + "grad_norm": 0.534113148459979, + "learning_rate": 1.3558482598487743e-07, + "loss": 1.8619, + "step": 12545 + }, + { + "epoch": 0.97, + "grad_norm": 0.5500580739872959, + "learning_rate": 1.3493590368241493e-07, + "loss": 1.8139, + "step": 12546 + }, + { + "epoch": 0.97, + "grad_norm": 0.5529064406659795, + "learning_rate": 1.342885338080524e-07, + "loss": 1.928, + "step": 12547 + }, + { + "epoch": 0.97, + "grad_norm": 0.5449751043424931, + "learning_rate": 1.3364271640220748e-07, + "loss": 2.0957, + "step": 12548 + }, + { + "epoch": 0.97, + "grad_norm": 0.5576994275258095, + "learning_rate": 1.3299845150520074e-07, + "loss": 1.821, + "step": 12549 + }, + { + "epoch": 0.97, + "grad_norm": 0.5547620307417171, + "learning_rate": 1.323557391572555e-07, + "loss": 1.844, + "step": 12550 + }, + { + "epoch": 0.97, + "grad_norm": 0.5462297236665662, + "learning_rate": 1.3171457939850086e-07, + "loss": 1.8459, + "step": 12551 + }, + { + "epoch": 0.97, + "grad_norm": 0.5120830552148097, + "learning_rate": 1.310749722689686e-07, + "loss": 2.0574, + "step": 12552 + }, + { + "epoch": 0.97, + "grad_norm": 0.49356176899517235, + "learning_rate": 1.3043691780859068e-07, + "loss": 1.9089, + "step": 12553 + }, + { + "epoch": 0.97, + "grad_norm": 0.5597195045515584, + "learning_rate": 1.2980041605719917e-07, + "loss": 1.8275, + "step": 12554 + }, + { + "epoch": 0.97, + "grad_norm": 0.5291635827550134, + "learning_rate": 1.2916546705453725e-07, + "loss": 1.8555, + "step": 12555 + }, + { + "epoch": 0.97, + "grad_norm": 0.5096586409762475, + "learning_rate": 1.2853207084025098e-07, + "loss": 1.9616, + "step": 12556 + }, + { + "epoch": 0.97, + "grad_norm": 0.5358808220035992, + "learning_rate": 1.279002274538782e-07, + "loss": 1.8707, + "step": 12557 + }, + { + "epoch": 0.97, + "grad_norm": 0.5254971260091508, + "learning_rate": 1.2726993693487067e-07, + "loss": 1.8466, + "step": 12558 + }, + { + "epoch": 0.97, + "grad_norm": 0.5153441807618191, + "learning_rate": 1.26641199322583e-07, + "loss": 1.9579, + "step": 12559 + }, + { + "epoch": 0.97, + "grad_norm": 0.5475401381067887, + "learning_rate": 1.2601401465626716e-07, + "loss": 1.8133, + "step": 12560 + }, + { + "epoch": 0.97, + "grad_norm": 0.5395166355395358, + "learning_rate": 1.253883829750835e-07, + "loss": 2.0474, + "step": 12561 + }, + { + "epoch": 0.97, + "grad_norm": 0.5673586689545557, + "learning_rate": 1.2476430431808682e-07, + "loss": 1.8737, + "step": 12562 + }, + { + "epoch": 0.97, + "grad_norm": 0.5242772064899629, + "learning_rate": 1.2414177872424603e-07, + "loss": 1.8382, + "step": 12563 + }, + { + "epoch": 0.97, + "grad_norm": 0.5247384427323458, + "learning_rate": 1.2352080623242724e-07, + "loss": 2.0358, + "step": 12564 + }, + { + "epoch": 0.97, + "grad_norm": 0.5178375965707774, + "learning_rate": 1.229013868813994e-07, + "loss": 1.8789, + "step": 12565 + }, + { + "epoch": 0.97, + "grad_norm": 0.5515400217574341, + "learning_rate": 1.2228352070983719e-07, + "loss": 1.8514, + "step": 12566 + }, + { + "epoch": 0.97, + "grad_norm": 0.531798453110105, + "learning_rate": 1.2166720775631247e-07, + "loss": 1.8277, + "step": 12567 + }, + { + "epoch": 0.97, + "grad_norm": 0.5312651652487288, + "learning_rate": 1.210524480593056e-07, + "loss": 1.9952, + "step": 12568 + }, + { + "epoch": 0.97, + "grad_norm": 0.5492669528373674, + "learning_rate": 1.2043924165720255e-07, + "loss": 1.8466, + "step": 12569 + }, + { + "epoch": 0.97, + "grad_norm": 0.5412493092558326, + "learning_rate": 1.1982758858828658e-07, + "loss": 1.899, + "step": 12570 + }, + { + "epoch": 0.97, + "grad_norm": 0.4885499120743349, + "learning_rate": 1.1921748889074104e-07, + "loss": 1.917, + "step": 12571 + }, + { + "epoch": 0.97, + "grad_norm": 0.5384729953621615, + "learning_rate": 1.186089426026632e-07, + "loss": 1.8693, + "step": 12572 + }, + { + "epoch": 0.97, + "grad_norm": 0.5248666451650283, + "learning_rate": 1.1800194976204215e-07, + "loss": 2.0541, + "step": 12573 + }, + { + "epoch": 0.97, + "grad_norm": 0.5723669480282808, + "learning_rate": 1.1739651040677812e-07, + "loss": 1.8354, + "step": 12574 + }, + { + "epoch": 0.97, + "grad_norm": 0.5457631398202833, + "learning_rate": 1.1679262457467144e-07, + "loss": 1.819, + "step": 12575 + }, + { + "epoch": 0.97, + "grad_norm": 0.5180906612387542, + "learning_rate": 1.161902923034197e-07, + "loss": 1.9986, + "step": 12576 + }, + { + "epoch": 0.97, + "grad_norm": 0.5371651065631734, + "learning_rate": 1.1558951363064008e-07, + "loss": 1.8462, + "step": 12577 + }, + { + "epoch": 0.97, + "grad_norm": 0.516921171311841, + "learning_rate": 1.1499028859383031e-07, + "loss": 1.8756, + "step": 12578 + }, + { + "epoch": 0.97, + "grad_norm": 0.5459777778827176, + "learning_rate": 1.1439261723040773e-07, + "loss": 1.8503, + "step": 12579 + }, + { + "epoch": 0.97, + "grad_norm": 0.5180000826287885, + "learning_rate": 1.137964995776869e-07, + "loss": 2.0285, + "step": 12580 + }, + { + "epoch": 0.97, + "grad_norm": 0.5323573443097662, + "learning_rate": 1.1320193567288529e-07, + "loss": 1.8159, + "step": 12581 + }, + { + "epoch": 0.97, + "grad_norm": 0.5450359683393713, + "learning_rate": 1.1260892555312597e-07, + "loss": 1.8563, + "step": 12582 + }, + { + "epoch": 0.97, + "grad_norm": 0.5612941595208045, + "learning_rate": 1.1201746925542933e-07, + "loss": 1.8445, + "step": 12583 + }, + { + "epoch": 0.97, + "grad_norm": 0.5023515488564199, + "learning_rate": 1.1142756681672417e-07, + "loss": 1.9386, + "step": 12584 + }, + { + "epoch": 0.97, + "grad_norm": 0.5185690796785887, + "learning_rate": 1.1083921827384214e-07, + "loss": 2.0632, + "step": 12585 + }, + { + "epoch": 0.97, + "grad_norm": 0.559092574513715, + "learning_rate": 1.1025242366351496e-07, + "loss": 1.8496, + "step": 12586 + }, + { + "epoch": 0.97, + "grad_norm": 0.5505028740963146, + "learning_rate": 1.0966718302237999e-07, + "loss": 1.8386, + "step": 12587 + }, + { + "epoch": 0.97, + "grad_norm": 0.5375084409045378, + "learning_rate": 1.0908349638697191e-07, + "loss": 2.0352, + "step": 12588 + }, + { + "epoch": 0.97, + "grad_norm": 0.5573371652510921, + "learning_rate": 1.0850136379373654e-07, + "loss": 1.8089, + "step": 12589 + }, + { + "epoch": 0.97, + "grad_norm": 0.5463213396848147, + "learning_rate": 1.0792078527901983e-07, + "loss": 1.9531, + "step": 12590 + }, + { + "epoch": 0.97, + "grad_norm": 0.562011590350056, + "learning_rate": 1.07341760879065e-07, + "loss": 1.862, + "step": 12591 + }, + { + "epoch": 0.97, + "grad_norm": 0.5691895403734718, + "learning_rate": 1.0676429063002646e-07, + "loss": 1.874, + "step": 12592 + }, + { + "epoch": 0.97, + "grad_norm": 0.5246059769537014, + "learning_rate": 1.0618837456795872e-07, + "loss": 2.0259, + "step": 12593 + }, + { + "epoch": 0.97, + "grad_norm": 0.5547160969649736, + "learning_rate": 1.0561401272881633e-07, + "loss": 1.867, + "step": 12594 + }, + { + "epoch": 0.97, + "grad_norm": 0.5401859237227011, + "learning_rate": 1.050412051484595e-07, + "loss": 1.8659, + "step": 12595 + }, + { + "epoch": 0.97, + "grad_norm": 0.4948326888696587, + "learning_rate": 1.044699518626513e-07, + "loss": 1.8602, + "step": 12596 + }, + { + "epoch": 0.97, + "grad_norm": 0.5654582025215356, + "learning_rate": 1.0390025290705485e-07, + "loss": 2.06, + "step": 12597 + }, + { + "epoch": 0.97, + "grad_norm": 0.5610644156053208, + "learning_rate": 1.0333210831724449e-07, + "loss": 1.8882, + "step": 12598 + }, + { + "epoch": 0.97, + "grad_norm": 0.535859326193958, + "learning_rate": 1.0276551812868906e-07, + "loss": 1.8899, + "step": 12599 + }, + { + "epoch": 0.97, + "grad_norm": 0.5432551680255661, + "learning_rate": 1.0220048237676305e-07, + "loss": 2.0705, + "step": 12600 + }, + { + "epoch": 0.97, + "grad_norm": 0.5563337790798661, + "learning_rate": 1.0163700109674101e-07, + "loss": 1.9066, + "step": 12601 + }, + { + "epoch": 0.97, + "grad_norm": 0.49674691099510326, + "learning_rate": 1.0107507432380591e-07, + "loss": 1.9466, + "step": 12602 + }, + { + "epoch": 0.97, + "grad_norm": 0.5515358471326531, + "learning_rate": 1.0051470209304359e-07, + "loss": 1.8315, + "step": 12603 + }, + { + "epoch": 0.97, + "grad_norm": 0.544961555038401, + "learning_rate": 9.995588443943715e-08, + "loss": 1.8383, + "step": 12604 + }, + { + "epoch": 0.97, + "grad_norm": 0.5488002659085539, + "learning_rate": 9.939862139787814e-08, + "loss": 2.0881, + "step": 12605 + }, + { + "epoch": 0.97, + "grad_norm": 0.5428586615860987, + "learning_rate": 9.88429130031554e-08, + "loss": 1.8387, + "step": 12606 + }, + { + "epoch": 0.97, + "grad_norm": 0.5472002396991176, + "learning_rate": 9.828875928996617e-08, + "loss": 1.8417, + "step": 12607 + }, + { + "epoch": 0.97, + "grad_norm": 0.5319973909105155, + "learning_rate": 9.773616029291055e-08, + "loss": 2.0772, + "step": 12608 + }, + { + "epoch": 0.97, + "grad_norm": 0.49273799229513887, + "learning_rate": 9.718511604648595e-08, + "loss": 1.9355, + "step": 12609 + }, + { + "epoch": 0.97, + "grad_norm": 0.5443128044985188, + "learning_rate": 9.663562658509817e-08, + "loss": 1.8679, + "step": 12610 + }, + { + "epoch": 0.97, + "grad_norm": 0.5350359420197642, + "learning_rate": 9.608769194305312e-08, + "loss": 1.8265, + "step": 12611 + }, + { + "epoch": 0.97, + "grad_norm": 0.5165794021836044, + "learning_rate": 9.55413121545623e-08, + "loss": 1.988, + "step": 12612 + }, + { + "epoch": 0.97, + "grad_norm": 0.5293335817613162, + "learning_rate": 9.499648725373456e-08, + "loss": 1.8485, + "step": 12613 + }, + { + "epoch": 0.97, + "grad_norm": 0.5394531034378792, + "learning_rate": 9.445321727459266e-08, + "loss": 1.8071, + "step": 12614 + }, + { + "epoch": 0.97, + "grad_norm": 0.5195230801665499, + "learning_rate": 9.391150225104838e-08, + "loss": 1.9285, + "step": 12615 + }, + { + "epoch": 0.97, + "grad_norm": 0.5449366559439456, + "learning_rate": 9.337134221692745e-08, + "loss": 1.8346, + "step": 12616 + }, + { + "epoch": 0.97, + "grad_norm": 0.5406203819873967, + "learning_rate": 9.283273720595287e-08, + "loss": 2.0912, + "step": 12617 + }, + { + "epoch": 0.97, + "grad_norm": 0.526284757663565, + "learning_rate": 9.229568725174776e-08, + "loss": 1.7782, + "step": 12618 + }, + { + "epoch": 0.97, + "grad_norm": 0.5416142510966998, + "learning_rate": 9.176019238785194e-08, + "loss": 1.8835, + "step": 12619 + }, + { + "epoch": 0.97, + "grad_norm": 0.5332863809409796, + "learning_rate": 9.122625264768869e-08, + "loss": 2.0115, + "step": 12620 + }, + { + "epoch": 0.97, + "grad_norm": 0.49979229266951447, + "learning_rate": 9.069386806460078e-08, + "loss": 1.9268, + "step": 12621 + }, + { + "epoch": 0.97, + "grad_norm": 0.5364194228608096, + "learning_rate": 9.016303867182274e-08, + "loss": 1.8509, + "step": 12622 + }, + { + "epoch": 0.97, + "grad_norm": 0.5586636663630387, + "learning_rate": 8.96337645024975e-08, + "loss": 1.8639, + "step": 12623 + }, + { + "epoch": 0.97, + "grad_norm": 0.5538907095114045, + "learning_rate": 8.910604558967361e-08, + "loss": 1.8214, + "step": 12624 + }, + { + "epoch": 0.97, + "grad_norm": 0.5431647550571213, + "learning_rate": 8.857988196629697e-08, + "loss": 2.0741, + "step": 12625 + }, + { + "epoch": 0.97, + "grad_norm": 0.5458643774152034, + "learning_rate": 8.805527366521349e-08, + "loss": 1.82, + "step": 12626 + }, + { + "epoch": 0.97, + "grad_norm": 0.5006335486984427, + "learning_rate": 8.753222071918032e-08, + "loss": 1.9412, + "step": 12627 + }, + { + "epoch": 0.97, + "grad_norm": 0.5407675028444235, + "learning_rate": 8.701072316085468e-08, + "loss": 1.8168, + "step": 12628 + }, + { + "epoch": 0.97, + "grad_norm": 0.5261728593233176, + "learning_rate": 8.649078102279662e-08, + "loss": 2.0601, + "step": 12629 + }, + { + "epoch": 0.97, + "grad_norm": 0.5370374367185273, + "learning_rate": 8.597239433746351e-08, + "loss": 1.8244, + "step": 12630 + }, + { + "epoch": 0.97, + "grad_norm": 0.5707207883258855, + "learning_rate": 8.545556313722391e-08, + "loss": 1.8486, + "step": 12631 + }, + { + "epoch": 0.97, + "grad_norm": 0.5229009867842098, + "learning_rate": 8.494028745434368e-08, + "loss": 1.9996, + "step": 12632 + }, + { + "epoch": 0.97, + "grad_norm": 0.5195900136527069, + "learning_rate": 8.442656732099707e-08, + "loss": 1.9094, + "step": 12633 + }, + { + "epoch": 0.97, + "grad_norm": 0.5350028033686787, + "learning_rate": 8.391440276925566e-08, + "loss": 1.8415, + "step": 12634 + }, + { + "epoch": 0.97, + "grad_norm": 0.5577082776266667, + "learning_rate": 8.340379383109387e-08, + "loss": 1.8343, + "step": 12635 + }, + { + "epoch": 0.97, + "grad_norm": 0.5382150867940216, + "learning_rate": 8.289474053839452e-08, + "loss": 1.7985, + "step": 12636 + }, + { + "epoch": 0.97, + "grad_norm": 0.532893845302689, + "learning_rate": 8.238724292294053e-08, + "loss": 2.052, + "step": 12637 + }, + { + "epoch": 0.98, + "grad_norm": 0.567900878615726, + "learning_rate": 8.188130101641212e-08, + "loss": 1.874, + "step": 12638 + }, + { + "epoch": 0.98, + "grad_norm": 0.5527065720761866, + "learning_rate": 8.137691485040066e-08, + "loss": 1.8213, + "step": 12639 + }, + { + "epoch": 0.98, + "grad_norm": 0.49983910040330676, + "learning_rate": 8.087408445639766e-08, + "loss": 1.9008, + "step": 12640 + }, + { + "epoch": 0.98, + "grad_norm": 0.51717154516016, + "learning_rate": 8.037280986579466e-08, + "loss": 2.0688, + "step": 12641 + }, + { + "epoch": 0.98, + "grad_norm": 0.5689292508001577, + "learning_rate": 7.987309110989161e-08, + "loss": 1.8857, + "step": 12642 + }, + { + "epoch": 0.98, + "grad_norm": 0.5358937832116302, + "learning_rate": 7.93749282198858e-08, + "loss": 1.8644, + "step": 12643 + }, + { + "epoch": 0.98, + "grad_norm": 0.519379638150312, + "learning_rate": 7.887832122687732e-08, + "loss": 1.8693, + "step": 12644 + }, + { + "epoch": 0.98, + "grad_norm": 0.5233268286120923, + "learning_rate": 7.838327016187474e-08, + "loss": 2.0274, + "step": 12645 + }, + { + "epoch": 0.98, + "grad_norm": 0.4973262696283313, + "learning_rate": 7.788977505578666e-08, + "loss": 1.9418, + "step": 12646 + }, + { + "epoch": 0.98, + "grad_norm": 0.5518111053824067, + "learning_rate": 7.739783593942174e-08, + "loss": 1.8502, + "step": 12647 + }, + { + "epoch": 0.98, + "grad_norm": 0.5725631126749992, + "learning_rate": 7.690745284349432e-08, + "loss": 1.807, + "step": 12648 + }, + { + "epoch": 0.98, + "grad_norm": 0.535070471072652, + "learning_rate": 7.64186257986188e-08, + "loss": 2.0662, + "step": 12649 + }, + { + "epoch": 0.98, + "grad_norm": 0.5620079190844915, + "learning_rate": 7.593135483532077e-08, + "loss": 1.8727, + "step": 12650 + }, + { + "epoch": 0.98, + "grad_norm": 0.5259601592682241, + "learning_rate": 7.544563998401755e-08, + "loss": 1.8083, + "step": 12651 + }, + { + "epoch": 0.98, + "grad_norm": 0.5291452998839081, + "learning_rate": 7.496148127503488e-08, + "loss": 1.9502, + "step": 12652 + }, + { + "epoch": 0.98, + "grad_norm": 0.5383358448263236, + "learning_rate": 7.447887873860415e-08, + "loss": 2.0279, + "step": 12653 + }, + { + "epoch": 0.98, + "grad_norm": 0.540220288201958, + "learning_rate": 7.399783240485125e-08, + "loss": 1.8525, + "step": 12654 + }, + { + "epoch": 0.98, + "grad_norm": 0.5495777913048978, + "learning_rate": 7.351834230381327e-08, + "loss": 1.8702, + "step": 12655 + }, + { + "epoch": 0.98, + "grad_norm": 0.5473260683270397, + "learning_rate": 7.304040846542459e-08, + "loss": 1.9092, + "step": 12656 + }, + { + "epoch": 0.98, + "grad_norm": 0.5188596261148773, + "learning_rate": 7.256403091952523e-08, + "loss": 2.0571, + "step": 12657 + }, + { + "epoch": 0.98, + "grad_norm": 0.4812111618076212, + "learning_rate": 7.208920969586085e-08, + "loss": 1.9482, + "step": 12658 + }, + { + "epoch": 0.98, + "grad_norm": 0.556614725038666, + "learning_rate": 7.161594482407164e-08, + "loss": 1.8053, + "step": 12659 + }, + { + "epoch": 0.98, + "grad_norm": 0.5718698252619451, + "learning_rate": 7.114423633370615e-08, + "loss": 1.811, + "step": 12660 + }, + { + "epoch": 0.98, + "grad_norm": 0.5309689179370667, + "learning_rate": 7.067408425421862e-08, + "loss": 2.0472, + "step": 12661 + }, + { + "epoch": 0.98, + "grad_norm": 0.5471651863457686, + "learning_rate": 7.020548861495779e-08, + "loss": 1.8587, + "step": 12662 + }, + { + "epoch": 0.98, + "grad_norm": 0.5371848841488011, + "learning_rate": 6.973844944518637e-08, + "loss": 1.8301, + "step": 12663 + }, + { + "epoch": 0.98, + "grad_norm": 0.5200849316253853, + "learning_rate": 6.927296677405604e-08, + "loss": 2.0161, + "step": 12664 + }, + { + "epoch": 0.98, + "grad_norm": 0.5224383581107138, + "learning_rate": 6.880904063063243e-08, + "loss": 1.9337, + "step": 12665 + }, + { + "epoch": 0.98, + "grad_norm": 0.5508369577985587, + "learning_rate": 6.834667104388403e-08, + "loss": 1.8609, + "step": 12666 + }, + { + "epoch": 0.98, + "grad_norm": 0.5438198142178579, + "learning_rate": 6.788585804267112e-08, + "loss": 1.8763, + "step": 12667 + }, + { + "epoch": 0.98, + "grad_norm": 0.5343235896165646, + "learning_rate": 6.742660165576786e-08, + "loss": 1.816, + "step": 12668 + }, + { + "epoch": 0.98, + "grad_norm": 0.5168653020761007, + "learning_rate": 6.696890191184856e-08, + "loss": 2.0808, + "step": 12669 + }, + { + "epoch": 0.98, + "grad_norm": 0.5459212451722081, + "learning_rate": 6.65127588394876e-08, + "loss": 1.918, + "step": 12670 + }, + { + "epoch": 0.98, + "grad_norm": 0.5018438398293998, + "learning_rate": 6.605817246716495e-08, + "loss": 1.9623, + "step": 12671 + }, + { + "epoch": 0.98, + "grad_norm": 0.5539811303810094, + "learning_rate": 6.560514282326347e-08, + "loss": 1.8767, + "step": 12672 + }, + { + "epoch": 0.98, + "grad_norm": 0.5200526372878406, + "learning_rate": 6.515366993606332e-08, + "loss": 1.9968, + "step": 12673 + }, + { + "epoch": 0.98, + "grad_norm": 0.5736847151433727, + "learning_rate": 6.470375383375582e-08, + "loss": 1.8673, + "step": 12674 + }, + { + "epoch": 0.98, + "grad_norm": 0.5591678346149859, + "learning_rate": 6.425539454442963e-08, + "loss": 1.8409, + "step": 12675 + }, + { + "epoch": 0.98, + "grad_norm": 0.5468278237356234, + "learning_rate": 6.380859209607626e-08, + "loss": 1.8718, + "step": 12676 + }, + { + "epoch": 0.98, + "grad_norm": 0.48315293383017843, + "learning_rate": 6.33633465165956e-08, + "loss": 2.0875, + "step": 12677 + }, + { + "epoch": 0.98, + "grad_norm": 0.5397592932382222, + "learning_rate": 6.291965783377929e-08, + "loss": 1.8753, + "step": 12678 + }, + { + "epoch": 0.98, + "grad_norm": 0.5525657239818212, + "learning_rate": 6.247752607533575e-08, + "loss": 1.8797, + "step": 12679 + }, + { + "epoch": 0.98, + "grad_norm": 0.5417268485491037, + "learning_rate": 6.203695126886788e-08, + "loss": 1.8334, + "step": 12680 + }, + { + "epoch": 0.98, + "grad_norm": 0.5182953659560007, + "learning_rate": 6.15979334418787e-08, + "loss": 2.095, + "step": 12681 + }, + { + "epoch": 0.98, + "grad_norm": 0.5443430182078263, + "learning_rate": 6.11604726217796e-08, + "loss": 1.8308, + "step": 12682 + }, + { + "epoch": 0.98, + "grad_norm": 0.513913194390248, + "learning_rate": 6.072456883588485e-08, + "loss": 1.9282, + "step": 12683 + }, + { + "epoch": 0.98, + "grad_norm": 0.5491914494730041, + "learning_rate": 6.0290222111406e-08, + "loss": 1.7579, + "step": 12684 + }, + { + "epoch": 0.98, + "grad_norm": 0.5271174798744462, + "learning_rate": 5.985743247546582e-08, + "loss": 2.0379, + "step": 12685 + }, + { + "epoch": 0.98, + "grad_norm": 0.5438864693030997, + "learning_rate": 5.942619995508159e-08, + "loss": 1.8065, + "step": 12686 + }, + { + "epoch": 0.98, + "grad_norm": 0.5470665758471396, + "learning_rate": 5.8996524577178967e-08, + "loss": 1.8731, + "step": 12687 + }, + { + "epoch": 0.98, + "grad_norm": 0.5660181968909646, + "learning_rate": 5.856840636858374e-08, + "loss": 1.8047, + "step": 12688 + }, + { + "epoch": 0.98, + "grad_norm": 0.49110596949710383, + "learning_rate": 5.814184535602452e-08, + "loss": 2.0712, + "step": 12689 + }, + { + "epoch": 0.98, + "grad_norm": 0.5480895925612354, + "learning_rate": 5.771684156613277e-08, + "loss": 1.8166, + "step": 12690 + }, + { + "epoch": 0.98, + "grad_norm": 0.5523892303984304, + "learning_rate": 5.7293395025445617e-08, + "loss": 1.8763, + "step": 12691 + }, + { + "epoch": 0.98, + "grad_norm": 0.5361268346768417, + "learning_rate": 5.687150576039746e-08, + "loss": 1.8493, + "step": 12692 + }, + { + "epoch": 0.98, + "grad_norm": 0.5253058924694681, + "learning_rate": 5.645117379733111e-08, + "loss": 2.0683, + "step": 12693 + }, + { + "epoch": 0.98, + "grad_norm": 0.5518078771811902, + "learning_rate": 5.603239916248948e-08, + "loss": 1.8244, + "step": 12694 + }, + { + "epoch": 0.98, + "grad_norm": 0.5589974677716034, + "learning_rate": 5.5615181882015534e-08, + "loss": 1.8401, + "step": 12695 + }, + { + "epoch": 0.98, + "grad_norm": 0.49192212574870675, + "learning_rate": 5.519952198196066e-08, + "loss": 1.9287, + "step": 12696 + }, + { + "epoch": 0.98, + "grad_norm": 0.5426366164477189, + "learning_rate": 5.478541948827631e-08, + "loss": 2.0957, + "step": 12697 + }, + { + "epoch": 0.98, + "grad_norm": 0.5561824100639771, + "learning_rate": 5.4372874426814045e-08, + "loss": 1.8399, + "step": 12698 + }, + { + "epoch": 0.98, + "grad_norm": 0.5607261188328716, + "learning_rate": 5.3961886823333806e-08, + "loss": 1.8499, + "step": 12699 + }, + { + "epoch": 0.98, + "grad_norm": 0.49683726883633333, + "learning_rate": 5.3552456703495626e-08, + "loss": 1.8472, + "step": 12700 + }, + { + "epoch": 0.98, + "grad_norm": 0.5489112065707532, + "learning_rate": 5.314458409285683e-08, + "loss": 2.034, + "step": 12701 + }, + { + "epoch": 0.98, + "grad_norm": 0.49833393531273956, + "learning_rate": 5.273826901688872e-08, + "loss": 1.9565, + "step": 12702 + }, + { + "epoch": 0.98, + "grad_norm": 0.5382170386068984, + "learning_rate": 5.2333511500954334e-08, + "loss": 1.8432, + "step": 12703 + }, + { + "epoch": 0.98, + "grad_norm": 0.5552571614438986, + "learning_rate": 5.193031157032513e-08, + "loss": 1.8412, + "step": 12704 + }, + { + "epoch": 0.98, + "grad_norm": 0.5499559579250465, + "learning_rate": 5.1528669250178184e-08, + "loss": 2.0328, + "step": 12705 + }, + { + "epoch": 0.98, + "grad_norm": 0.5443377080605588, + "learning_rate": 5.1128584565585115e-08, + "loss": 1.812, + "step": 12706 + }, + { + "epoch": 0.98, + "grad_norm": 0.5437971121918732, + "learning_rate": 5.073005754152871e-08, + "loss": 1.8017, + "step": 12707 + }, + { + "epoch": 0.98, + "grad_norm": 0.5155693935909547, + "learning_rate": 5.033308820289184e-08, + "loss": 1.9271, + "step": 12708 + }, + { + "epoch": 0.98, + "grad_norm": 0.5639006346933892, + "learning_rate": 4.993767657445192e-08, + "loss": 2.0794, + "step": 12709 + }, + { + "epoch": 0.98, + "grad_norm": 0.5385293324501685, + "learning_rate": 4.95438226809003e-08, + "loss": 1.8203, + "step": 12710 + }, + { + "epoch": 0.98, + "grad_norm": 0.5398241014775447, + "learning_rate": 4.915152654682842e-08, + "loss": 1.8872, + "step": 12711 + }, + { + "epoch": 0.98, + "grad_norm": 0.5888667607431607, + "learning_rate": 4.8760788196727804e-08, + "loss": 1.8844, + "step": 12712 + }, + { + "epoch": 0.98, + "grad_norm": 0.5250310451161447, + "learning_rate": 4.8371607654992825e-08, + "loss": 2.0716, + "step": 12713 + }, + { + "epoch": 0.98, + "grad_norm": 0.49539355719705613, + "learning_rate": 4.798398494592071e-08, + "loss": 1.9101, + "step": 12714 + }, + { + "epoch": 0.98, + "grad_norm": 0.5352536356496731, + "learning_rate": 4.759792009371711e-08, + "loss": 1.8494, + "step": 12715 + }, + { + "epoch": 0.98, + "grad_norm": 0.5463177850110065, + "learning_rate": 4.72134131224794e-08, + "loss": 1.846, + "step": 12716 + }, + { + "epoch": 0.98, + "grad_norm": 0.5268101760204572, + "learning_rate": 4.683046405621894e-08, + "loss": 2.0437, + "step": 12717 + }, + { + "epoch": 0.98, + "grad_norm": 0.5339729818327588, + "learning_rate": 4.64490729188416e-08, + "loss": 1.9023, + "step": 12718 + }, + { + "epoch": 0.98, + "grad_norm": 0.5390969025439178, + "learning_rate": 4.6069239734161665e-08, + "loss": 1.8271, + "step": 12719 + }, + { + "epoch": 0.98, + "grad_norm": 0.490460920450286, + "learning_rate": 4.5690964525890726e-08, + "loss": 1.9334, + "step": 12720 + }, + { + "epoch": 0.98, + "grad_norm": 0.5330683969267866, + "learning_rate": 4.531424731764877e-08, + "loss": 2.0577, + "step": 12721 + }, + { + "epoch": 0.98, + "grad_norm": 0.5812444356767384, + "learning_rate": 4.493908813295311e-08, + "loss": 1.8555, + "step": 12722 + }, + { + "epoch": 0.98, + "grad_norm": 0.5768701402542648, + "learning_rate": 4.4565486995229446e-08, + "loss": 1.8807, + "step": 12723 + }, + { + "epoch": 0.98, + "grad_norm": 0.5321720368405246, + "learning_rate": 4.419344392780356e-08, + "loss": 1.8588, + "step": 12724 + }, + { + "epoch": 0.98, + "grad_norm": 0.5291896766039336, + "learning_rate": 4.382295895389854e-08, + "loss": 2.0183, + "step": 12725 + }, + { + "epoch": 0.98, + "grad_norm": 0.5532818790852237, + "learning_rate": 4.3454032096651446e-08, + "loss": 1.8614, + "step": 12726 + }, + { + "epoch": 0.98, + "grad_norm": 0.5140013341804782, + "learning_rate": 4.308666337909106e-08, + "loss": 1.9338, + "step": 12727 + }, + { + "epoch": 0.98, + "grad_norm": 0.5476791640202888, + "learning_rate": 4.272085282415739e-08, + "loss": 1.8071, + "step": 12728 + }, + { + "epoch": 0.98, + "grad_norm": 0.5211579821670043, + "learning_rate": 4.23566004546877e-08, + "loss": 2.0501, + "step": 12729 + }, + { + "epoch": 0.98, + "grad_norm": 0.5343809462811061, + "learning_rate": 4.199390629342492e-08, + "loss": 1.8399, + "step": 12730 + }, + { + "epoch": 0.98, + "grad_norm": 0.5404780585593314, + "learning_rate": 4.1632770363012056e-08, + "loss": 1.8227, + "step": 12731 + }, + { + "epoch": 0.98, + "grad_norm": 0.5253215361773808, + "learning_rate": 4.127319268599494e-08, + "loss": 1.8566, + "step": 12732 + }, + { + "epoch": 0.98, + "grad_norm": 0.4780602218067452, + "learning_rate": 4.0915173284827857e-08, + "loss": 2.0651, + "step": 12733 + }, + { + "epoch": 0.98, + "grad_norm": 0.5581575483723127, + "learning_rate": 4.055871218185958e-08, + "loss": 1.8485, + "step": 12734 + }, + { + "epoch": 0.98, + "grad_norm": 0.5377137220996736, + "learning_rate": 4.020380939934732e-08, + "loss": 1.8824, + "step": 12735 + }, + { + "epoch": 0.98, + "grad_norm": 0.5487743107257379, + "learning_rate": 3.985046495944833e-08, + "loss": 1.8545, + "step": 12736 + }, + { + "epoch": 0.98, + "grad_norm": 0.5402240659215451, + "learning_rate": 3.949867888422554e-08, + "loss": 2.0564, + "step": 12737 + }, + { + "epoch": 0.98, + "grad_norm": 0.5396436311355212, + "learning_rate": 3.914845119563915e-08, + "loss": 1.8447, + "step": 12738 + }, + { + "epoch": 0.98, + "grad_norm": 0.4928139754264298, + "learning_rate": 3.8799781915555e-08, + "loss": 1.9578, + "step": 12739 + }, + { + "epoch": 0.98, + "grad_norm": 0.5679757530552128, + "learning_rate": 3.845267106574735e-08, + "loss": 1.7783, + "step": 12740 + }, + { + "epoch": 0.98, + "grad_norm": 0.5412951693886414, + "learning_rate": 3.810711866788219e-08, + "loss": 2.0156, + "step": 12741 + }, + { + "epoch": 0.98, + "grad_norm": 0.552954387511485, + "learning_rate": 3.776312474353394e-08, + "loss": 1.8231, + "step": 12742 + }, + { + "epoch": 0.98, + "grad_norm": 0.5495456785466195, + "learning_rate": 3.742068931418263e-08, + "loss": 1.8587, + "step": 12743 + }, + { + "epoch": 0.98, + "grad_norm": 0.562644544699006, + "learning_rate": 3.707981240120839e-08, + "loss": 1.8713, + "step": 12744 + }, + { + "epoch": 0.98, + "grad_norm": 0.5040839986101762, + "learning_rate": 3.674049402588864e-08, + "loss": 2.06, + "step": 12745 + }, + { + "epoch": 0.98, + "grad_norm": 0.5381314260734263, + "learning_rate": 3.640273420941198e-08, + "loss": 1.8638, + "step": 12746 + }, + { + "epoch": 0.98, + "grad_norm": 0.5216463037844793, + "learning_rate": 3.606653297286711e-08, + "loss": 1.817, + "step": 12747 + }, + { + "epoch": 0.98, + "grad_norm": 0.5577135071554226, + "learning_rate": 3.5731890337242776e-08, + "loss": 1.8524, + "step": 12748 + }, + { + "epoch": 0.98, + "grad_norm": 0.5322586414052444, + "learning_rate": 3.53988063234334e-08, + "loss": 1.9986, + "step": 12749 + }, + { + "epoch": 0.98, + "grad_norm": 0.5477515667008829, + "learning_rate": 3.506728095223066e-08, + "loss": 1.8136, + "step": 12750 + }, + { + "epoch": 0.98, + "grad_norm": 0.5119090284384246, + "learning_rate": 3.473731424433746e-08, + "loss": 1.9188, + "step": 12751 + }, + { + "epoch": 0.98, + "grad_norm": 0.5367923286069458, + "learning_rate": 3.440890622035398e-08, + "loss": 1.8416, + "step": 12752 + }, + { + "epoch": 0.98, + "grad_norm": 0.5351785567275853, + "learning_rate": 3.4082056900783254e-08, + "loss": 1.9968, + "step": 12753 + }, + { + "epoch": 0.98, + "grad_norm": 0.5358556438066955, + "learning_rate": 3.37567663060312e-08, + "loss": 1.8075, + "step": 12754 + }, + { + "epoch": 0.98, + "grad_norm": 0.5415079954579507, + "learning_rate": 3.3433034456412106e-08, + "loss": 1.9122, + "step": 12755 + }, + { + "epoch": 0.98, + "grad_norm": 0.540279277493834, + "learning_rate": 3.3110861372129264e-08, + "loss": 1.8225, + "step": 12756 + }, + { + "epoch": 0.98, + "grad_norm": 0.5070501218706311, + "learning_rate": 3.279024707330547e-08, + "loss": 2.0261, + "step": 12757 + }, + { + "epoch": 0.98, + "grad_norm": 0.5302924634080722, + "learning_rate": 3.2471191579952485e-08, + "loss": 1.9096, + "step": 12758 + }, + { + "epoch": 0.98, + "grad_norm": 0.605153847892772, + "learning_rate": 3.2153694911990494e-08, + "loss": 1.8467, + "step": 12759 + }, + { + "epoch": 0.98, + "grad_norm": 0.5431018642030828, + "learning_rate": 3.183775708924808e-08, + "loss": 1.8638, + "step": 12760 + }, + { + "epoch": 0.98, + "grad_norm": 0.5582861872441743, + "learning_rate": 3.1523378131442796e-08, + "loss": 2.0233, + "step": 12761 + }, + { + "epoch": 0.98, + "grad_norm": 0.5424774883403826, + "learning_rate": 3.1210558058208937e-08, + "loss": 1.8802, + "step": 12762 + }, + { + "epoch": 0.98, + "grad_norm": 0.5575470057551708, + "learning_rate": 3.089929688907256e-08, + "loss": 1.8521, + "step": 12763 + }, + { + "epoch": 0.98, + "grad_norm": 0.5116598775666753, + "learning_rate": 3.0589594643468114e-08, + "loss": 1.962, + "step": 12764 + }, + { + "epoch": 0.98, + "grad_norm": 0.5226371891507903, + "learning_rate": 3.02814513407329e-08, + "loss": 2.0378, + "step": 12765 + }, + { + "epoch": 0.98, + "grad_norm": 0.5606416762952409, + "learning_rate": 2.997486700010432e-08, + "loss": 1.89, + "step": 12766 + }, + { + "epoch": 0.98, + "grad_norm": 0.550864330832561, + "learning_rate": 2.9669841640725393e-08, + "loss": 1.84, + "step": 12767 + }, + { + "epoch": 0.99, + "grad_norm": 0.543953206855594, + "learning_rate": 2.9366375281639213e-08, + "loss": 1.8224, + "step": 12768 + }, + { + "epoch": 0.99, + "grad_norm": 0.5366439724100214, + "learning_rate": 2.906446794179174e-08, + "loss": 2.052, + "step": 12769 + }, + { + "epoch": 0.99, + "grad_norm": 0.5024024512682185, + "learning_rate": 2.8764119640031783e-08, + "loss": 1.9314, + "step": 12770 + }, + { + "epoch": 0.99, + "grad_norm": 0.5378747490599656, + "learning_rate": 2.8465330395111012e-08, + "loss": 1.8749, + "step": 12771 + }, + { + "epoch": 0.99, + "grad_norm": 0.5551855842379617, + "learning_rate": 2.816810022568672e-08, + "loss": 1.887, + "step": 12772 + }, + { + "epoch": 0.99, + "grad_norm": 0.5220511198792261, + "learning_rate": 2.7872429150316294e-08, + "loss": 2.0715, + "step": 12773 + }, + { + "epoch": 0.99, + "grad_norm": 0.571609528831424, + "learning_rate": 2.757831718745718e-08, + "loss": 1.8691, + "step": 12774 + }, + { + "epoch": 0.99, + "grad_norm": 0.5494269113386022, + "learning_rate": 2.7285764355472475e-08, + "loss": 1.8, + "step": 12775 + }, + { + "epoch": 0.99, + "grad_norm": 0.4988895170506658, + "learning_rate": 2.6994770672628123e-08, + "loss": 1.9143, + "step": 12776 + }, + { + "epoch": 0.99, + "grad_norm": 0.5465917675273891, + "learning_rate": 2.670533615709292e-08, + "loss": 1.8395, + "step": 12777 + }, + { + "epoch": 0.99, + "grad_norm": 0.5114986090501923, + "learning_rate": 2.6417460826935746e-08, + "loss": 2.0511, + "step": 12778 + }, + { + "epoch": 0.99, + "grad_norm": 0.5576799581000161, + "learning_rate": 2.613114470012834e-08, + "loss": 1.8487, + "step": 12779 + }, + { + "epoch": 0.99, + "grad_norm": 0.5363373919308031, + "learning_rate": 2.5846387794550842e-08, + "loss": 1.8412, + "step": 12780 + }, + { + "epoch": 0.99, + "grad_norm": 0.5395181084629698, + "learning_rate": 2.5563190127980696e-08, + "loss": 2.0086, + "step": 12781 + }, + { + "epoch": 0.99, + "grad_norm": 0.5174136543638879, + "learning_rate": 2.5281551718098208e-08, + "loss": 1.9393, + "step": 12782 + }, + { + "epoch": 0.99, + "grad_norm": 0.5556289835232506, + "learning_rate": 2.500147258248653e-08, + "loss": 1.8183, + "step": 12783 + }, + { + "epoch": 0.99, + "grad_norm": 0.5559679702027939, + "learning_rate": 2.4722952738631676e-08, + "loss": 1.86, + "step": 12784 + }, + { + "epoch": 0.99, + "grad_norm": 0.562802808749891, + "learning_rate": 2.4445992203925293e-08, + "loss": 2.0791, + "step": 12785 + }, + { + "epoch": 0.99, + "grad_norm": 0.5539333060397166, + "learning_rate": 2.417059099565633e-08, + "loss": 1.8789, + "step": 12786 + }, + { + "epoch": 0.99, + "grad_norm": 0.5751472653169857, + "learning_rate": 2.3896749131022135e-08, + "loss": 1.8365, + "step": 12787 + }, + { + "epoch": 0.99, + "grad_norm": 0.5485465960917203, + "learning_rate": 2.362446662711737e-08, + "loss": 1.8577, + "step": 12788 + }, + { + "epoch": 0.99, + "grad_norm": 0.5034027923825843, + "learning_rate": 2.3353743500942325e-08, + "loss": 1.9211, + "step": 12789 + }, + { + "epoch": 0.99, + "grad_norm": 0.5572957006449073, + "learning_rate": 2.308457976940015e-08, + "loss": 2.0417, + "step": 12790 + }, + { + "epoch": 0.99, + "grad_norm": 0.5312749746377146, + "learning_rate": 2.281697544929684e-08, + "loss": 1.8282, + "step": 12791 + }, + { + "epoch": 0.99, + "grad_norm": 0.555257056049469, + "learning_rate": 2.2550930557335704e-08, + "loss": 1.8038, + "step": 12792 + }, + { + "epoch": 0.99, + "grad_norm": 0.5103850517755295, + "learning_rate": 2.2286445110131225e-08, + "loss": 2.0027, + "step": 12793 + }, + { + "epoch": 0.99, + "grad_norm": 0.5326093446501065, + "learning_rate": 2.20235191241952e-08, + "loss": 1.8296, + "step": 12794 + }, + { + "epoch": 0.99, + "grad_norm": 0.5135536995616966, + "learning_rate": 2.1762152615942277e-08, + "loss": 1.8821, + "step": 12795 + }, + { + "epoch": 0.99, + "grad_norm": 0.540209221417365, + "learning_rate": 2.150234560169273e-08, + "loss": 1.8356, + "step": 12796 + }, + { + "epoch": 0.99, + "grad_norm": 0.5309298960346736, + "learning_rate": 2.124409809766692e-08, + "loss": 1.9961, + "step": 12797 + }, + { + "epoch": 0.99, + "grad_norm": 0.5561346650899518, + "learning_rate": 2.0987410119985284e-08, + "loss": 1.8623, + "step": 12798 + }, + { + "epoch": 0.99, + "grad_norm": 0.542254420812681, + "learning_rate": 2.073228168467667e-08, + "loss": 1.8488, + "step": 12799 + }, + { + "epoch": 0.99, + "grad_norm": 0.5565537588407203, + "learning_rate": 2.0478712807667222e-08, + "loss": 1.7944, + "step": 12800 + }, + { + "epoch": 0.99, + "grad_norm": 0.5131197592858376, + "learning_rate": 2.02267035047915e-08, + "loss": 1.9616, + "step": 12801 + }, + { + "epoch": 0.99, + "grad_norm": 0.5359541476718073, + "learning_rate": 1.997625379178414e-08, + "loss": 2.054, + "step": 12802 + }, + { + "epoch": 0.99, + "grad_norm": 0.5885334090488491, + "learning_rate": 1.9727363684277077e-08, + "loss": 1.8489, + "step": 12803 + }, + { + "epoch": 0.99, + "grad_norm": 0.5662611088438113, + "learning_rate": 1.948003319781344e-08, + "loss": 1.7935, + "step": 12804 + }, + { + "epoch": 0.99, + "grad_norm": 0.52930347528441, + "learning_rate": 1.9234262347833652e-08, + "loss": 2.0759, + "step": 12805 + }, + { + "epoch": 0.99, + "grad_norm": 0.5419895160582128, + "learning_rate": 1.8990051149683774e-08, + "loss": 1.8791, + "step": 12806 + }, + { + "epoch": 0.99, + "grad_norm": 0.5081769097193298, + "learning_rate": 1.8747399618607164e-08, + "loss": 1.958, + "step": 12807 + }, + { + "epoch": 0.99, + "grad_norm": 0.5274914532840101, + "learning_rate": 1.850630776975837e-08, + "loss": 1.827, + "step": 12808 + }, + { + "epoch": 0.99, + "grad_norm": 0.5476653170661598, + "learning_rate": 1.826677561818646e-08, + "loss": 1.8268, + "step": 12809 + }, + { + "epoch": 0.99, + "grad_norm": 0.5528649562007844, + "learning_rate": 1.8028803178846142e-08, + "loss": 2.0665, + "step": 12810 + }, + { + "epoch": 0.99, + "grad_norm": 0.5472353927704439, + "learning_rate": 1.7792390466597754e-08, + "loss": 1.8691, + "step": 12811 + }, + { + "epoch": 0.99, + "grad_norm": 0.5671658498593329, + "learning_rate": 1.7557537496198927e-08, + "loss": 1.8198, + "step": 12812 + }, + { + "epoch": 0.99, + "grad_norm": 0.5273872028797206, + "learning_rate": 1.732424428231294e-08, + "loss": 2.1192, + "step": 12813 + }, + { + "epoch": 0.99, + "grad_norm": 0.5108209403648459, + "learning_rate": 1.7092510839505914e-08, + "loss": 1.9177, + "step": 12814 + }, + { + "epoch": 0.99, + "grad_norm": 0.557838702541044, + "learning_rate": 1.6862337182246835e-08, + "loss": 1.8563, + "step": 12815 + }, + { + "epoch": 0.99, + "grad_norm": 0.5572753071706266, + "learning_rate": 1.663372332490476e-08, + "loss": 1.8785, + "step": 12816 + }, + { + "epoch": 0.99, + "grad_norm": 0.5287800529999468, + "learning_rate": 1.6406669281754382e-08, + "loss": 2.0596, + "step": 12817 + }, + { + "epoch": 0.99, + "grad_norm": 0.5336721475028753, + "learning_rate": 1.618117506697048e-08, + "loss": 1.8819, + "step": 12818 + }, + { + "epoch": 0.99, + "grad_norm": 0.5365070177425443, + "learning_rate": 1.5957240694630672e-08, + "loss": 1.8576, + "step": 12819 + }, + { + "epoch": 0.99, + "grad_norm": 0.5145073357756669, + "learning_rate": 1.5734866178718223e-08, + "loss": 1.9196, + "step": 12820 + }, + { + "epoch": 0.99, + "grad_norm": 0.5246799975326919, + "learning_rate": 1.5514051533116468e-08, + "loss": 1.8061, + "step": 12821 + }, + { + "epoch": 0.99, + "grad_norm": 0.5399867104724179, + "learning_rate": 1.529479677161161e-08, + "loss": 2.0693, + "step": 12822 + }, + { + "epoch": 0.99, + "grad_norm": 0.5453435124435558, + "learning_rate": 1.507710190788991e-08, + "loss": 1.8097, + "step": 12823 + }, + { + "epoch": 0.99, + "grad_norm": 0.5446354099529966, + "learning_rate": 1.4860966955548838e-08, + "loss": 1.8353, + "step": 12824 + }, + { + "epoch": 0.99, + "grad_norm": 0.5387154863444586, + "learning_rate": 1.4646391928077595e-08, + "loss": 2.0569, + "step": 12825 + }, + { + "epoch": 0.99, + "grad_norm": 0.49982161676903925, + "learning_rate": 1.4433376838873801e-08, + "loss": 1.8912, + "step": 12826 + }, + { + "epoch": 0.99, + "grad_norm": 0.5498456077376449, + "learning_rate": 1.4221921701237928e-08, + "loss": 1.8672, + "step": 12827 + }, + { + "epoch": 0.99, + "grad_norm": 0.5509377876659737, + "learning_rate": 1.4012026528370525e-08, + "loss": 1.8488, + "step": 12828 + }, + { + "epoch": 0.99, + "grad_norm": 0.5497211119502855, + "learning_rate": 1.3803691333377777e-08, + "loss": 2.052, + "step": 12829 + }, + { + "epoch": 0.99, + "grad_norm": 0.5382089975719664, + "learning_rate": 1.3596916129268721e-08, + "loss": 1.8808, + "step": 12830 + }, + { + "epoch": 0.99, + "grad_norm": 0.5422248182104982, + "learning_rate": 1.3391700928949703e-08, + "loss": 1.8315, + "step": 12831 + }, + { + "epoch": 0.99, + "grad_norm": 0.5103107601285621, + "learning_rate": 1.3188045745232691e-08, + "loss": 1.9446, + "step": 12832 + }, + { + "epoch": 0.99, + "grad_norm": 0.5749850218982084, + "learning_rate": 1.2985950590835294e-08, + "loss": 1.8522, + "step": 12833 + }, + { + "epoch": 0.99, + "grad_norm": 0.5302559700026642, + "learning_rate": 1.2785415478375196e-08, + "loss": 2.0538, + "step": 12834 + }, + { + "epoch": 0.99, + "grad_norm": 0.5515767994926126, + "learning_rate": 1.2586440420372936e-08, + "loss": 1.8736, + "step": 12835 + }, + { + "epoch": 0.99, + "grad_norm": 0.5333634540863439, + "learning_rate": 1.238902542924636e-08, + "loss": 1.8794, + "step": 12836 + }, + { + "epoch": 0.99, + "grad_norm": 0.5448599623840182, + "learning_rate": 1.2193170517327268e-08, + "loss": 2.0072, + "step": 12837 + }, + { + "epoch": 0.99, + "grad_norm": 0.5055703725745492, + "learning_rate": 1.1998875696839218e-08, + "loss": 1.9062, + "step": 12838 + }, + { + "epoch": 0.99, + "grad_norm": 0.5438334034623047, + "learning_rate": 1.1806140979916947e-08, + "loss": 1.9356, + "step": 12839 + }, + { + "epoch": 0.99, + "grad_norm": 0.5440237899180227, + "learning_rate": 1.161496637858972e-08, + "loss": 1.8395, + "step": 12840 + }, + { + "epoch": 0.99, + "grad_norm": 0.5403451737465077, + "learning_rate": 1.1425351904795212e-08, + "loss": 1.8121, + "step": 12841 + }, + { + "epoch": 0.99, + "grad_norm": 0.5040399856048634, + "learning_rate": 1.1237297570371175e-08, + "loss": 2.0805, + "step": 12842 + }, + { + "epoch": 0.99, + "grad_norm": 0.5361409237252649, + "learning_rate": 1.1050803387060992e-08, + "loss": 1.8551, + "step": 12843 + }, + { + "epoch": 0.99, + "grad_norm": 0.5511620807969663, + "learning_rate": 1.0865869366505355e-08, + "loss": 1.7945, + "step": 12844 + }, + { + "epoch": 0.99, + "grad_norm": 0.507633089375155, + "learning_rate": 1.068249552025058e-08, + "loss": 1.874, + "step": 12845 + }, + { + "epoch": 0.99, + "grad_norm": 0.5549417732088375, + "learning_rate": 1.0500681859745842e-08, + "loss": 2.0484, + "step": 12846 + }, + { + "epoch": 0.99, + "grad_norm": 0.5378655826097648, + "learning_rate": 1.032042839634595e-08, + "loss": 1.7979, + "step": 12847 + }, + { + "epoch": 0.99, + "grad_norm": 0.5470753199781028, + "learning_rate": 1.014173514129746e-08, + "loss": 1.8902, + "step": 12848 + }, + { + "epoch": 0.99, + "grad_norm": 0.5032611999544041, + "learning_rate": 9.964602105763665e-09, + "loss": 2.0236, + "step": 12849 + }, + { + "epoch": 0.99, + "grad_norm": 0.5495371835638619, + "learning_rate": 9.789029300802389e-09, + "loss": 1.8618, + "step": 12850 + }, + { + "epoch": 0.99, + "grad_norm": 0.5339553827663366, + "learning_rate": 9.615016737374306e-09, + "loss": 1.9248, + "step": 12851 + }, + { + "epoch": 0.99, + "grad_norm": 0.5411082800945316, + "learning_rate": 9.442564426342949e-09, + "loss": 1.7471, + "step": 12852 + }, + { + "epoch": 0.99, + "grad_norm": 0.5430415430204144, + "learning_rate": 9.271672378474706e-09, + "loss": 1.8393, + "step": 12853 + }, + { + "epoch": 0.99, + "grad_norm": 0.5322541222375715, + "learning_rate": 9.102340604441595e-09, + "loss": 2.0216, + "step": 12854 + }, + { + "epoch": 0.99, + "grad_norm": 0.5798996038023844, + "learning_rate": 8.934569114815717e-09, + "loss": 1.8642, + "step": 12855 + }, + { + "epoch": 0.99, + "grad_norm": 0.540324213626176, + "learning_rate": 8.768357920069247e-09, + "loss": 1.8543, + "step": 12856 + }, + { + "epoch": 0.99, + "grad_norm": 0.5259371568034433, + "learning_rate": 8.603707030582774e-09, + "loss": 1.9317, + "step": 12857 + }, + { + "epoch": 0.99, + "grad_norm": 0.5332460163335814, + "learning_rate": 8.440616456631411e-09, + "loss": 2.0635, + "step": 12858 + }, + { + "epoch": 0.99, + "grad_norm": 0.5427477069813599, + "learning_rate": 8.279086208401454e-09, + "loss": 1.8299, + "step": 12859 + }, + { + "epoch": 0.99, + "grad_norm": 0.5343159031207851, + "learning_rate": 8.119116295976503e-09, + "loss": 1.8659, + "step": 12860 + }, + { + "epoch": 0.99, + "grad_norm": 0.5434889605822667, + "learning_rate": 7.96070672934579e-09, + "loss": 1.8206, + "step": 12861 + }, + { + "epoch": 0.99, + "grad_norm": 0.5668502349485895, + "learning_rate": 7.803857518395852e-09, + "loss": 2.0587, + "step": 12862 + }, + { + "epoch": 0.99, + "grad_norm": 0.5116010395545514, + "learning_rate": 7.64856867292163e-09, + "loss": 1.9073, + "step": 12863 + }, + { + "epoch": 0.99, + "grad_norm": 0.5439204149447824, + "learning_rate": 7.494840202620922e-09, + "loss": 1.8806, + "step": 12864 + }, + { + "epoch": 0.99, + "grad_norm": 0.5394396024011466, + "learning_rate": 7.342672117086058e-09, + "loss": 1.8655, + "step": 12865 + }, + { + "epoch": 0.99, + "grad_norm": 0.5151057475224693, + "learning_rate": 7.1920644258205436e-09, + "loss": 2.0685, + "step": 12866 + }, + { + "epoch": 0.99, + "grad_norm": 0.530427163020553, + "learning_rate": 7.0430171382307455e-09, + "loss": 1.8967, + "step": 12867 + }, + { + "epoch": 0.99, + "grad_norm": 0.5585150485696962, + "learning_rate": 6.895530263617555e-09, + "loss": 1.8691, + "step": 12868 + }, + { + "epoch": 0.99, + "grad_norm": 0.5091573650165916, + "learning_rate": 6.749603811190275e-09, + "loss": 1.9402, + "step": 12869 + }, + { + "epoch": 0.99, + "grad_norm": 0.5225382375092924, + "learning_rate": 6.605237790058283e-09, + "loss": 2.0152, + "step": 12870 + }, + { + "epoch": 0.99, + "grad_norm": 0.5216198249999051, + "learning_rate": 6.462432209239367e-09, + "loss": 1.8616, + "step": 12871 + }, + { + "epoch": 0.99, + "grad_norm": 0.5236206213815553, + "learning_rate": 6.32118707764584e-09, + "loss": 1.8691, + "step": 12872 + }, + { + "epoch": 0.99, + "grad_norm": 0.5653374398981833, + "learning_rate": 6.181502404098427e-09, + "loss": 1.848, + "step": 12873 + }, + { + "epoch": 0.99, + "grad_norm": 0.5240068455431693, + "learning_rate": 6.043378197315153e-09, + "loss": 2.0557, + "step": 12874 + }, + { + "epoch": 0.99, + "grad_norm": 0.5535647209309493, + "learning_rate": 5.90681446592245e-09, + "loss": 1.8326, + "step": 12875 + }, + { + "epoch": 0.99, + "grad_norm": 0.5421142193506108, + "learning_rate": 5.771811218444056e-09, + "loss": 1.9204, + "step": 12876 + }, + { + "epoch": 0.99, + "grad_norm": 0.5651087921344566, + "learning_rate": 5.638368463312116e-09, + "loss": 1.8378, + "step": 12877 + }, + { + "epoch": 0.99, + "grad_norm": 0.5081701422759135, + "learning_rate": 5.506486208856076e-09, + "loss": 2.0438, + "step": 12878 + }, + { + "epoch": 0.99, + "grad_norm": 0.5450160909175693, + "learning_rate": 5.376164463311018e-09, + "loss": 1.8624, + "step": 12879 + }, + { + "epoch": 0.99, + "grad_norm": 0.5314499709272762, + "learning_rate": 5.247403234809323e-09, + "loss": 1.8817, + "step": 12880 + }, + { + "epoch": 0.99, + "grad_norm": 0.5574501845343649, + "learning_rate": 5.120202531397333e-09, + "loss": 2.0645, + "step": 12881 + }, + { + "epoch": 0.99, + "grad_norm": 0.48722957981921344, + "learning_rate": 4.994562361010368e-09, + "loss": 1.9237, + "step": 12882 + }, + { + "epoch": 0.99, + "grad_norm": 0.542611995529748, + "learning_rate": 4.87048273149493e-09, + "loss": 1.8454, + "step": 12883 + }, + { + "epoch": 0.99, + "grad_norm": 0.5330092863172161, + "learning_rate": 4.747963650597598e-09, + "loss": 1.8638, + "step": 12884 + }, + { + "epoch": 0.99, + "grad_norm": 0.5608851744263007, + "learning_rate": 4.627005125967809e-09, + "loss": 1.864, + "step": 12885 + }, + { + "epoch": 0.99, + "grad_norm": 0.5277984886168451, + "learning_rate": 4.507607165157857e-09, + "loss": 2.0985, + "step": 12886 + }, + { + "epoch": 0.99, + "grad_norm": 0.5839726330849501, + "learning_rate": 4.3897697756228874e-09, + "loss": 1.8729, + "step": 12887 + }, + { + "epoch": 0.99, + "grad_norm": 0.4972330756009016, + "learning_rate": 4.2734929647181285e-09, + "loss": 1.923, + "step": 12888 + }, + { + "epoch": 0.99, + "grad_norm": 0.5364795677232849, + "learning_rate": 4.15877673970444e-09, + "loss": 1.8419, + "step": 12889 + }, + { + "epoch": 0.99, + "grad_norm": 0.5186413746557103, + "learning_rate": 4.045621107745534e-09, + "loss": 2.0454, + "step": 12890 + }, + { + "epoch": 0.99, + "grad_norm": 0.5387920096109056, + "learning_rate": 3.93402607590243e-09, + "loss": 1.8888, + "step": 12891 + }, + { + "epoch": 0.99, + "grad_norm": 0.5312001458061415, + "learning_rate": 3.823991651144554e-09, + "loss": 1.8191, + "step": 12892 + }, + { + "epoch": 0.99, + "grad_norm": 0.5388291101410658, + "learning_rate": 3.7155178403441848e-09, + "loss": 1.814, + "step": 12893 + }, + { + "epoch": 0.99, + "grad_norm": 0.5047745144893855, + "learning_rate": 3.6086046502681326e-09, + "loss": 2.0588, + "step": 12894 + }, + { + "epoch": 0.99, + "grad_norm": 0.5627606259668229, + "learning_rate": 3.5032520875971643e-09, + "loss": 1.8592, + "step": 12895 + }, + { + "epoch": 0.99, + "grad_norm": 0.5541122171074505, + "learning_rate": 3.3994601589038e-09, + "loss": 1.8253, + "step": 12896 + }, + { + "epoch": 0.99, + "grad_norm": 0.5513142822874707, + "learning_rate": 3.2972288706717425e-09, + "loss": 1.8622, + "step": 12897 + }, + { + "epoch": 1.0, + "grad_norm": 0.5081497117862622, + "learning_rate": 3.196558229284774e-09, + "loss": 2.0356, + "step": 12898 + }, + { + "epoch": 1.0, + "grad_norm": 0.5395541136465374, + "learning_rate": 3.0974482410239813e-09, + "loss": 1.8324, + "step": 12899 + }, + { + "epoch": 1.0, + "grad_norm": 0.5028777404217911, + "learning_rate": 2.9998989120788578e-09, + "loss": 1.9053, + "step": 12900 + }, + { + "epoch": 1.0, + "grad_norm": 0.5483321901645064, + "learning_rate": 2.9039102485417524e-09, + "loss": 1.8417, + "step": 12901 + }, + { + "epoch": 1.0, + "grad_norm": 0.5625779688427248, + "learning_rate": 2.8094822564050936e-09, + "loss": 1.9986, + "step": 12902 + }, + { + "epoch": 1.0, + "grad_norm": 0.5501153195081722, + "learning_rate": 2.7166149415613905e-09, + "loss": 1.858, + "step": 12903 + }, + { + "epoch": 1.0, + "grad_norm": 0.5385904350419389, + "learning_rate": 2.6253083098115584e-09, + "loss": 1.849, + "step": 12904 + }, + { + "epoch": 1.0, + "grad_norm": 0.5515099757028045, + "learning_rate": 2.5355623668538164e-09, + "loss": 1.8753, + "step": 12905 + }, + { + "epoch": 1.0, + "grad_norm": 0.5529170780677597, + "learning_rate": 2.4473771182947914e-09, + "loss": 2.0499, + "step": 12906 + }, + { + "epoch": 1.0, + "grad_norm": 0.5137424625825454, + "learning_rate": 2.3607525696384135e-09, + "loss": 1.9149, + "step": 12907 + }, + { + "epoch": 1.0, + "grad_norm": 0.5259261199510776, + "learning_rate": 2.275688726294245e-09, + "loss": 1.8352, + "step": 12908 + }, + { + "epoch": 1.0, + "grad_norm": 0.5733449561476708, + "learning_rate": 2.1921855935719272e-09, + "loss": 1.8858, + "step": 12909 + }, + { + "epoch": 1.0, + "grad_norm": 0.5181933314992972, + "learning_rate": 2.110243176683957e-09, + "loss": 2.0265, + "step": 12910 + }, + { + "epoch": 1.0, + "grad_norm": 0.5415346639630024, + "learning_rate": 2.0298614807456874e-09, + "loss": 1.8162, + "step": 12911 + }, + { + "epoch": 1.0, + "grad_norm": 0.5366704449611802, + "learning_rate": 1.951040510780877e-09, + "loss": 1.8396, + "step": 12912 + }, + { + "epoch": 1.0, + "grad_norm": 0.5151198861679401, + "learning_rate": 1.873780271705039e-09, + "loss": 1.937, + "step": 12913 + }, + { + "epoch": 1.0, + "grad_norm": 0.5335693682582073, + "learning_rate": 1.7980807683448674e-09, + "loss": 2.0364, + "step": 12914 + }, + { + "epoch": 1.0, + "grad_norm": 0.5419626636876316, + "learning_rate": 1.7239420054243615e-09, + "loss": 1.8178, + "step": 12915 + }, + { + "epoch": 1.0, + "grad_norm": 0.5523098733901055, + "learning_rate": 1.651363987575927e-09, + "loss": 1.8896, + "step": 12916 + }, + { + "epoch": 1.0, + "grad_norm": 0.5362504726965033, + "learning_rate": 1.5803467193264977e-09, + "loss": 1.8363, + "step": 12917 + }, + { + "epoch": 1.0, + "grad_norm": 0.5446112410298122, + "learning_rate": 1.5108902051141903e-09, + "loss": 2.0733, + "step": 12918 + }, + { + "epoch": 1.0, + "grad_norm": 0.5106676437203901, + "learning_rate": 1.4429944492716507e-09, + "loss": 1.9181, + "step": 12919 + }, + { + "epoch": 1.0, + "grad_norm": 0.5372676542601247, + "learning_rate": 1.3766594560399305e-09, + "loss": 1.8278, + "step": 12920 + }, + { + "epoch": 1.0, + "grad_norm": 0.5683255149306662, + "learning_rate": 1.3118852295601613e-09, + "loss": 1.8624, + "step": 12921 + }, + { + "epoch": 1.0, + "grad_norm": 0.5194736067535166, + "learning_rate": 1.248671773876331e-09, + "loss": 2.0548, + "step": 12922 + }, + { + "epoch": 1.0, + "grad_norm": 0.5382111895933023, + "learning_rate": 1.187019092935282e-09, + "loss": 1.8067, + "step": 12923 + }, + { + "epoch": 1.0, + "grad_norm": 0.5270498365092458, + "learning_rate": 1.1269271905867128e-09, + "loss": 1.8474, + "step": 12924 + }, + { + "epoch": 1.0, + "grad_norm": 0.5219585103045453, + "learning_rate": 1.0683960705831774e-09, + "loss": 1.9149, + "step": 12925 + }, + { + "epoch": 1.0, + "grad_norm": 0.5208685240656405, + "learning_rate": 1.0114257365773094e-09, + "loss": 2.028, + "step": 12926 + }, + { + "epoch": 1.0, + "grad_norm": 0.5525641344438285, + "learning_rate": 9.56016192124598e-10, + "loss": 1.9042, + "step": 12927 + }, + { + "epoch": 1.0, + "grad_norm": 0.5554333474483573, + "learning_rate": 9.021674406889391e-10, + "loss": 1.8162, + "step": 12928 + }, + { + "epoch": 1.0, + "grad_norm": 0.5481007471207056, + "learning_rate": 8.498794856287573e-10, + "loss": 1.9002, + "step": 12929 + }, + { + "epoch": 1.0, + "grad_norm": 0.5302882133059988, + "learning_rate": 7.991523302108839e-10, + "loss": 2.0671, + "step": 12930 + }, + { + "epoch": 1.0, + "grad_norm": 0.508259019058269, + "learning_rate": 7.499859775994544e-10, + "loss": 1.9218, + "step": 12931 + }, + { + "epoch": 1.0, + "grad_norm": 0.5611338073183052, + "learning_rate": 7.023804308670112e-10, + "loss": 1.8314, + "step": 12932 + }, + { + "epoch": 1.0, + "grad_norm": 0.5397483938328642, + "learning_rate": 6.563356929834009e-10, + "loss": 1.7805, + "step": 12933 + }, + { + "epoch": 1.0, + "grad_norm": 0.5279416094867868, + "learning_rate": 6.118517668268764e-10, + "loss": 2.0409, + "step": 12934 + }, + { + "epoch": 1.0, + "grad_norm": 0.5424628778356461, + "learning_rate": 5.689286551702199e-10, + "loss": 1.8519, + "step": 12935 + }, + { + "epoch": 1.0, + "grad_norm": 0.5465559577822899, + "learning_rate": 5.275663606973958e-10, + "loss": 1.8336, + "step": 12936 + }, + { + "epoch": 1.0, + "grad_norm": 0.5806456675669902, + "learning_rate": 4.877648859868966e-10, + "loss": 1.8561, + "step": 12937 + }, + { + "epoch": 1.0, + "grad_norm": 0.4915816112944975, + "learning_rate": 4.495242335256222e-10, + "loss": 2.074, + "step": 12938 + }, + { + "epoch": 1.0, + "grad_norm": 0.5238544703738774, + "learning_rate": 4.128444057033276e-10, + "loss": 1.8602, + "step": 12939 + }, + { + "epoch": 1.0, + "grad_norm": 0.5440292299768047, + "learning_rate": 3.7772540480707217e-10, + "loss": 1.8325, + "step": 12940 + }, + { + "epoch": 1.0, + "grad_norm": 0.551005928058127, + "learning_rate": 3.44167233032322e-10, + "loss": 1.7959, + "step": 12941 + }, + { + "epoch": 1.0, + "grad_norm": 0.556787073139806, + "learning_rate": 3.1216989246907193e-10, + "loss": 2.0652, + "step": 12942 + }, + { + "epoch": 1.0, + "grad_norm": 0.5290659297564679, + "learning_rate": 2.8173338512127445e-10, + "loss": 1.8164, + "step": 12943 + }, + { + "epoch": 1.0, + "grad_norm": 0.5088968466529408, + "learning_rate": 2.5285771288463546e-10, + "loss": 1.9008, + "step": 12944 + }, + { + "epoch": 1.0, + "grad_norm": 0.5558595456533397, + "learning_rate": 2.255428775632673e-10, + "loss": 1.8267, + "step": 12945 + }, + { + "epoch": 1.0, + "grad_norm": 0.5157993397084163, + "learning_rate": 1.9978888086413795e-10, + "loss": 2.0419, + "step": 12946 + }, + { + "epoch": 1.0, + "grad_norm": 0.5342265901200998, + "learning_rate": 1.7559572439429516e-10, + "loss": 1.8417, + "step": 12947 + }, + { + "epoch": 1.0, + "grad_norm": 0.5382101598106954, + "learning_rate": 1.529634096636423e-10, + "loss": 1.831, + "step": 12948 + }, + { + "epoch": 1.0, + "grad_norm": 0.5776222194645314, + "learning_rate": 1.3189193808493815e-10, + "loss": 1.8744, + "step": 12949 + }, + { + "epoch": 1.0, + "grad_norm": 0.5188828639426037, + "learning_rate": 1.12381310973797e-10, + "loss": 2.1104, + "step": 12950 + }, + { + "epoch": 1.0, + "grad_norm": 0.5279330734845321, + "learning_rate": 9.443152954868861e-11, + "loss": 1.8617, + "step": 12951 + }, + { + "epoch": 1.0, + "grad_norm": 0.534475577994914, + "learning_rate": 7.804259493093824e-11, + "loss": 1.8338, + "step": 12952 + }, + { + "epoch": 1.0, + "grad_norm": 0.542408574656326, + "learning_rate": 6.321450814195107e-11, + "loss": 1.8309, + "step": 12953 + }, + { + "epoch": 1.0, + "grad_norm": 0.5495583978918248, + "learning_rate": 4.994727011153888e-11, + "loss": 2.0997, + "step": 12954 + }, + { + "epoch": 1.0, + "grad_norm": 0.5372520342820748, + "learning_rate": 3.8240881664042275e-11, + "loss": 1.8736, + "step": 12955 + }, + { + "epoch": 1.0, + "grad_norm": 0.5011767299602624, + "learning_rate": 2.8095343529432882e-11, + "loss": 1.8717, + "step": 12956 + }, + { + "epoch": 1.0, + "grad_norm": 0.5497651832940555, + "learning_rate": 1.9510656346088953e-11, + "loss": 1.806, + "step": 12957 + }, + { + "epoch": 1.0, + "grad_norm": 0.5328127220106561, + "learning_rate": 1.248682064691753e-11, + "loss": 2.08, + "step": 12958 + }, + { + "epoch": 1.0, + "grad_norm": 0.5335184717219762, + "learning_rate": 7.0238368704567124e-12, + "loss": 1.8916, + "step": 12959 + }, + { + "epoch": 1.0, + "grad_norm": 0.5320203093793444, + "learning_rate": 3.121705355324522e-12, + "loss": 1.8631, + "step": 12960 + }, + { + "epoch": 1.0, + "grad_norm": 0.5560938197837585, + "learning_rate": 7.804263513211397e-13, + "loss": 1.8308, + "step": 12961 + }, + { + "epoch": 1.0, + "grad_norm": 0.49048502620299095, + "learning_rate": 0.0, + "loss": 1.9668, + "step": 12962 + }, + { + "epoch": 1.0, + "step": 12962, + "total_flos": 0.0, + "train_loss": 0.23017303778275058, + "train_runtime": 7116.4219, + "train_samples_per_second": 1865.25, + "train_steps_per_second": 1.821 + } + ], + "logging_steps": 1.0, + "max_steps": 12962, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}