diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13704 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 19532, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005119803399549457, + "grad_norm": 1.4939812421798706, + "learning_rate": 7.679705099324186e-05, + "loss": 8.8823, + "step": 10 + }, + { + "epoch": 0.0010239606799098914, + "grad_norm": 1.7099491357803345, + "learning_rate": 0.00015359410198648372, + "loss": 8.4098, + "step": 20 + }, + { + "epoch": 0.0015359410198648373, + "grad_norm": 1.341354489326477, + "learning_rate": 0.00023039115297972558, + "loss": 7.809, + "step": 30 + }, + { + "epoch": 0.0020479213598197828, + "grad_norm": 1.2555238008499146, + "learning_rate": 0.00030718820397296744, + "loss": 7.2814, + "step": 40 + }, + { + "epoch": 0.0025599016997747285, + "grad_norm": 1.1430288553237915, + "learning_rate": 0.0003839852549662093, + "loss": 6.8009, + "step": 50 + }, + { + "epoch": 0.0030718820397296746, + "grad_norm": 0.7848866581916809, + "learning_rate": 0.00046078230595945115, + "loss": 6.4164, + "step": 60 + }, + { + "epoch": 0.0035838623796846203, + "grad_norm": 1.1270220279693604, + "learning_rate": 0.000537579356952693, + "loss": 6.1553, + "step": 70 + }, + { + "epoch": 0.0040958427196395655, + "grad_norm": 0.5496548414230347, + "learning_rate": 0.0006143764079459349, + "loss": 5.9572, + "step": 80 + }, + { + "epoch": 0.004607823059594511, + "grad_norm": 0.8258134126663208, + "learning_rate": 0.0006911734589391768, + "loss": 5.7536, + "step": 90 + }, + { + "epoch": 0.005119803399549457, + "grad_norm": 0.544425368309021, + "learning_rate": 0.0007679705099324186, + "loss": 5.6043, + "step": 100 + }, + { + "epoch": 0.005631783739504403, + "grad_norm": 1.129820466041565, + "learning_rate": 0.0008447675609256605, + "loss": 5.3984, + "step": 110 + }, + { + "epoch": 0.006143764079459349, + "grad_norm": 1.6234118938446045, + "learning_rate": 0.0009215646119189023, + "loss": 5.2392, + "step": 120 + }, + { + "epoch": 0.006655744419414295, + "grad_norm": 0.7183708548545837, + "learning_rate": 0.0009983616629121441, + "loss": 5.107, + "step": 130 + }, + { + "epoch": 0.0071677247593692405, + "grad_norm": 1.0296348333358765, + "learning_rate": 0.001075158713905386, + "loss": 4.9662, + "step": 140 + }, + { + "epoch": 0.007679705099324186, + "grad_norm": 1.6978133916854858, + "learning_rate": 0.001151955764898628, + "loss": 4.8161, + "step": 150 + }, + { + "epoch": 0.008191685439279131, + "grad_norm": 0.8946409821510315, + "learning_rate": 0.0012287528158918697, + "loss": 4.7119, + "step": 160 + }, + { + "epoch": 0.008703665779234078, + "grad_norm": 1.0135765075683594, + "learning_rate": 0.0013055498668851117, + "loss": 4.6082, + "step": 170 + }, + { + "epoch": 0.009215646119189022, + "grad_norm": 0.8236331343650818, + "learning_rate": 0.0013823469178783536, + "loss": 4.496, + "step": 180 + }, + { + "epoch": 0.009727626459143969, + "grad_norm": 1.161008596420288, + "learning_rate": 0.0014591439688715956, + "loss": 4.4071, + "step": 190 + }, + { + "epoch": 0.010239606799098914, + "grad_norm": 1.3253235816955566, + "learning_rate": 0.0015, + "loss": 4.3286, + "step": 200 + }, + { + "epoch": 0.01075158713905386, + "grad_norm": 1.6026867628097534, + "learning_rate": 0.0015, + "loss": 4.2374, + "step": 210 + }, + { + "epoch": 0.011263567479008805, + "grad_norm": 1.0043503046035767, + "learning_rate": 0.0015, + "loss": 4.1526, + "step": 220 + }, + { + "epoch": 0.011775547818963752, + "grad_norm": 0.963283121585846, + "learning_rate": 0.0015, + "loss": 4.0709, + "step": 230 + }, + { + "epoch": 0.012287528158918698, + "grad_norm": 0.8025517463684082, + "learning_rate": 0.0015, + "loss": 3.9997, + "step": 240 + }, + { + "epoch": 0.012799508498873643, + "grad_norm": 0.7000623345375061, + "learning_rate": 0.0015, + "loss": 3.91, + "step": 250 + }, + { + "epoch": 0.01331148883882859, + "grad_norm": 0.8964600563049316, + "learning_rate": 0.0015, + "loss": 3.8844, + "step": 260 + }, + { + "epoch": 0.013823469178783535, + "grad_norm": 0.7321097254753113, + "learning_rate": 0.0015, + "loss": 3.8324, + "step": 270 + }, + { + "epoch": 0.014335449518738481, + "grad_norm": 0.8242825269699097, + "learning_rate": 0.0015, + "loss": 3.7653, + "step": 280 + }, + { + "epoch": 0.014847429858693426, + "grad_norm": 1.045832633972168, + "learning_rate": 0.0015, + "loss": 3.7115, + "step": 290 + }, + { + "epoch": 0.015359410198648372, + "grad_norm": 1.0511783361434937, + "learning_rate": 0.0015, + "loss": 3.6754, + "step": 300 + }, + { + "epoch": 0.015871390538603317, + "grad_norm": 0.79283607006073, + "learning_rate": 0.0015, + "loss": 3.615, + "step": 310 + }, + { + "epoch": 0.016383370878558262, + "grad_norm": 0.7592840194702148, + "learning_rate": 0.0015, + "loss": 3.5692, + "step": 320 + }, + { + "epoch": 0.01689535121851321, + "grad_norm": 0.6317871809005737, + "learning_rate": 0.0015, + "loss": 3.5581, + "step": 330 + }, + { + "epoch": 0.017407331558468155, + "grad_norm": 0.8634727597236633, + "learning_rate": 0.0015, + "loss": 3.5035, + "step": 340 + }, + { + "epoch": 0.0179193118984231, + "grad_norm": 0.9801504611968994, + "learning_rate": 0.0015, + "loss": 3.4543, + "step": 350 + }, + { + "epoch": 0.018431292238378045, + "grad_norm": 0.9941282868385315, + "learning_rate": 0.0015, + "loss": 3.4323, + "step": 360 + }, + { + "epoch": 0.018943272578332993, + "grad_norm": 1.1075271368026733, + "learning_rate": 0.0015, + "loss": 3.3992, + "step": 370 + }, + { + "epoch": 0.019455252918287938, + "grad_norm": 0.9263769388198853, + "learning_rate": 0.0015, + "loss": 3.3484, + "step": 380 + }, + { + "epoch": 0.019967233258242883, + "grad_norm": 0.6879151463508606, + "learning_rate": 0.0015, + "loss": 3.3255, + "step": 390 + }, + { + "epoch": 0.020479213598197828, + "grad_norm": 1.0170198678970337, + "learning_rate": 0.0015, + "loss": 3.2744, + "step": 400 + }, + { + "epoch": 0.020991193938152776, + "grad_norm": 0.9534377455711365, + "learning_rate": 0.0015, + "loss": 3.2513, + "step": 410 + }, + { + "epoch": 0.02150317427810772, + "grad_norm": 1.1487725973129272, + "learning_rate": 0.0015, + "loss": 3.2043, + "step": 420 + }, + { + "epoch": 0.022015154618062666, + "grad_norm": 0.8081286549568176, + "learning_rate": 0.0015, + "loss": 3.1891, + "step": 430 + }, + { + "epoch": 0.02252713495801761, + "grad_norm": 0.8324559926986694, + "learning_rate": 0.0015, + "loss": 3.1025, + "step": 440 + }, + { + "epoch": 0.02303911529797256, + "grad_norm": 0.9536003470420837, + "learning_rate": 0.0015, + "loss": 3.1029, + "step": 450 + }, + { + "epoch": 0.023551095637927504, + "grad_norm": 1.3307809829711914, + "learning_rate": 0.0015, + "loss": 3.0508, + "step": 460 + }, + { + "epoch": 0.02406307597788245, + "grad_norm": 1.237606167793274, + "learning_rate": 0.0015, + "loss": 3.0528, + "step": 470 + }, + { + "epoch": 0.024575056317837397, + "grad_norm": 0.9293427467346191, + "learning_rate": 0.0015, + "loss": 2.9933, + "step": 480 + }, + { + "epoch": 0.02508703665779234, + "grad_norm": 0.8388038873672485, + "learning_rate": 0.0015, + "loss": 2.9593, + "step": 490 + }, + { + "epoch": 0.025599016997747286, + "grad_norm": 0.7568584084510803, + "learning_rate": 0.0015, + "loss": 2.9442, + "step": 500 + }, + { + "epoch": 0.02611099733770223, + "grad_norm": 0.7443001866340637, + "learning_rate": 0.0015, + "loss": 2.9138, + "step": 510 + }, + { + "epoch": 0.02662297767765718, + "grad_norm": 0.9567376375198364, + "learning_rate": 0.0015, + "loss": 2.8952, + "step": 520 + }, + { + "epoch": 0.027134958017612124, + "grad_norm": 0.7521085143089294, + "learning_rate": 0.0015, + "loss": 2.8719, + "step": 530 + }, + { + "epoch": 0.02764693835756707, + "grad_norm": 1.0200743675231934, + "learning_rate": 0.0015, + "loss": 2.8533, + "step": 540 + }, + { + "epoch": 0.028158918697522014, + "grad_norm": 0.8097197413444519, + "learning_rate": 0.0015, + "loss": 2.8476, + "step": 550 + }, + { + "epoch": 0.028670899037476962, + "grad_norm": 0.7335869669914246, + "learning_rate": 0.0015, + "loss": 2.7611, + "step": 560 + }, + { + "epoch": 0.029182879377431907, + "grad_norm": 0.7385020852088928, + "learning_rate": 0.0015, + "loss": 2.7824, + "step": 570 + }, + { + "epoch": 0.029694859717386852, + "grad_norm": 0.8730366826057434, + "learning_rate": 0.0015, + "loss": 2.7236, + "step": 580 + }, + { + "epoch": 0.030206840057341797, + "grad_norm": 0.8042418360710144, + "learning_rate": 0.0015, + "loss": 2.7331, + "step": 590 + }, + { + "epoch": 0.030718820397296745, + "grad_norm": 0.7750236392021179, + "learning_rate": 0.0015, + "loss": 2.6946, + "step": 600 + }, + { + "epoch": 0.03123080073725169, + "grad_norm": 1.130753755569458, + "learning_rate": 0.0015, + "loss": 2.7127, + "step": 610 + }, + { + "epoch": 0.031742781077206635, + "grad_norm": 0.7699748277664185, + "learning_rate": 0.0015, + "loss": 2.665, + "step": 620 + }, + { + "epoch": 0.03225476141716158, + "grad_norm": 0.7676917314529419, + "learning_rate": 0.0015, + "loss": 2.6516, + "step": 630 + }, + { + "epoch": 0.032766741757116524, + "grad_norm": 0.9566435217857361, + "learning_rate": 0.0015, + "loss": 2.6311, + "step": 640 + }, + { + "epoch": 0.03327872209707147, + "grad_norm": 0.9319092631340027, + "learning_rate": 0.0015, + "loss": 2.6062, + "step": 650 + }, + { + "epoch": 0.03379070243702642, + "grad_norm": 0.7314916849136353, + "learning_rate": 0.0015, + "loss": 2.5822, + "step": 660 + }, + { + "epoch": 0.03430268277698136, + "grad_norm": 0.765346109867096, + "learning_rate": 0.0015, + "loss": 2.587, + "step": 670 + }, + { + "epoch": 0.03481466311693631, + "grad_norm": 0.8714979887008667, + "learning_rate": 0.0015, + "loss": 2.5479, + "step": 680 + }, + { + "epoch": 0.03532664345689126, + "grad_norm": 0.7182953357696533, + "learning_rate": 0.0015, + "loss": 2.5388, + "step": 690 + }, + { + "epoch": 0.0358386237968462, + "grad_norm": 0.71555095911026, + "learning_rate": 0.0015, + "loss": 2.5196, + "step": 700 + }, + { + "epoch": 0.03635060413680115, + "grad_norm": 0.6901549696922302, + "learning_rate": 0.0015, + "loss": 2.4948, + "step": 710 + }, + { + "epoch": 0.03686258447675609, + "grad_norm": 0.7073848247528076, + "learning_rate": 0.0015, + "loss": 2.4814, + "step": 720 + }, + { + "epoch": 0.03737456481671104, + "grad_norm": 0.6590971350669861, + "learning_rate": 0.0015, + "loss": 2.4799, + "step": 730 + }, + { + "epoch": 0.037886545156665986, + "grad_norm": 0.6124588251113892, + "learning_rate": 0.0015, + "loss": 2.4529, + "step": 740 + }, + { + "epoch": 0.03839852549662093, + "grad_norm": 0.7170097231864929, + "learning_rate": 0.0015, + "loss": 2.4397, + "step": 750 + }, + { + "epoch": 0.038910505836575876, + "grad_norm": 0.7509459853172302, + "learning_rate": 0.0015, + "loss": 2.433, + "step": 760 + }, + { + "epoch": 0.039422486176530824, + "grad_norm": 0.8185219168663025, + "learning_rate": 0.0015, + "loss": 2.4364, + "step": 770 + }, + { + "epoch": 0.039934466516485766, + "grad_norm": 0.6452121734619141, + "learning_rate": 0.0015, + "loss": 2.4375, + "step": 780 + }, + { + "epoch": 0.040446446856440714, + "grad_norm": 0.7798700928688049, + "learning_rate": 0.0015, + "loss": 2.4082, + "step": 790 + }, + { + "epoch": 0.040958427196395655, + "grad_norm": 0.905072808265686, + "learning_rate": 0.0015, + "loss": 2.3811, + "step": 800 + }, + { + "epoch": 0.041470407536350604, + "grad_norm": 0.7047348618507385, + "learning_rate": 0.0015, + "loss": 2.3955, + "step": 810 + }, + { + "epoch": 0.04198238787630555, + "grad_norm": 0.6472852230072021, + "learning_rate": 0.0015, + "loss": 2.3776, + "step": 820 + }, + { + "epoch": 0.04249436821626049, + "grad_norm": 0.729308545589447, + "learning_rate": 0.0015, + "loss": 2.3465, + "step": 830 + }, + { + "epoch": 0.04300634855621544, + "grad_norm": 0.8292624950408936, + "learning_rate": 0.0015, + "loss": 2.3578, + "step": 840 + }, + { + "epoch": 0.04351832889617039, + "grad_norm": 0.6298139691352844, + "learning_rate": 0.0015, + "loss": 2.3349, + "step": 850 + }, + { + "epoch": 0.04403030923612533, + "grad_norm": 0.647214949131012, + "learning_rate": 0.0015, + "loss": 2.299, + "step": 860 + }, + { + "epoch": 0.04454228957608028, + "grad_norm": 0.7034851312637329, + "learning_rate": 0.0015, + "loss": 2.2927, + "step": 870 + }, + { + "epoch": 0.04505426991603522, + "grad_norm": 0.6373961567878723, + "learning_rate": 0.0015, + "loss": 2.2776, + "step": 880 + }, + { + "epoch": 0.04556625025599017, + "grad_norm": 0.8384701609611511, + "learning_rate": 0.0015, + "loss": 2.2948, + "step": 890 + }, + { + "epoch": 0.04607823059594512, + "grad_norm": 0.7856025695800781, + "learning_rate": 0.0015, + "loss": 2.3034, + "step": 900 + }, + { + "epoch": 0.04659021093590006, + "grad_norm": 0.6041284799575806, + "learning_rate": 0.0015, + "loss": 2.2773, + "step": 910 + }, + { + "epoch": 0.04710219127585501, + "grad_norm": 0.8801588416099548, + "learning_rate": 0.0015, + "loss": 2.2706, + "step": 920 + }, + { + "epoch": 0.047614171615809955, + "grad_norm": 0.7567424178123474, + "learning_rate": 0.0015, + "loss": 2.2754, + "step": 930 + }, + { + "epoch": 0.0481261519557649, + "grad_norm": 0.6421610713005066, + "learning_rate": 0.0015, + "loss": 2.2514, + "step": 940 + }, + { + "epoch": 0.048638132295719845, + "grad_norm": 0.7311142683029175, + "learning_rate": 0.0015, + "loss": 2.2005, + "step": 950 + }, + { + "epoch": 0.04915011263567479, + "grad_norm": 0.7399065494537354, + "learning_rate": 0.0015, + "loss": 2.2038, + "step": 960 + }, + { + "epoch": 0.049662092975629735, + "grad_norm": 0.708454430103302, + "learning_rate": 0.0015, + "loss": 2.1758, + "step": 970 + }, + { + "epoch": 0.05017407331558468, + "grad_norm": 0.6199438571929932, + "learning_rate": 0.0015, + "loss": 2.227, + "step": 980 + }, + { + "epoch": 0.050686053655539624, + "grad_norm": 0.6159200668334961, + "learning_rate": 0.0015, + "loss": 2.1547, + "step": 990 + }, + { + "epoch": 0.05119803399549457, + "grad_norm": 0.6560512781143188, + "learning_rate": 0.0015, + "loss": 2.1787, + "step": 1000 + }, + { + "epoch": 0.05171001433544952, + "grad_norm": 0.6151387691497803, + "learning_rate": 0.0015, + "loss": 2.1776, + "step": 1010 + }, + { + "epoch": 0.05222199467540446, + "grad_norm": 0.6162774562835693, + "learning_rate": 0.0015, + "loss": 2.1604, + "step": 1020 + }, + { + "epoch": 0.05273397501535941, + "grad_norm": 0.6564657092094421, + "learning_rate": 0.0015, + "loss": 2.1837, + "step": 1030 + }, + { + "epoch": 0.05324595535531436, + "grad_norm": 0.5790508985519409, + "learning_rate": 0.0015, + "loss": 2.1561, + "step": 1040 + }, + { + "epoch": 0.0537579356952693, + "grad_norm": 0.6484589576721191, + "learning_rate": 0.0015, + "loss": 2.1676, + "step": 1050 + }, + { + "epoch": 0.05426991603522425, + "grad_norm": 0.6969457268714905, + "learning_rate": 0.0015, + "loss": 2.1462, + "step": 1060 + }, + { + "epoch": 0.05478189637517919, + "grad_norm": 0.7145557403564453, + "learning_rate": 0.0015, + "loss": 2.13, + "step": 1070 + }, + { + "epoch": 0.05529387671513414, + "grad_norm": 0.6353093981742859, + "learning_rate": 0.0015, + "loss": 2.1197, + "step": 1080 + }, + { + "epoch": 0.055805857055089086, + "grad_norm": 0.5896279811859131, + "learning_rate": 0.0015, + "loss": 2.1177, + "step": 1090 + }, + { + "epoch": 0.05631783739504403, + "grad_norm": 0.6247608661651611, + "learning_rate": 0.0015, + "loss": 2.1123, + "step": 1100 + }, + { + "epoch": 0.056829817734998976, + "grad_norm": 0.6024080514907837, + "learning_rate": 0.0015, + "loss": 2.0949, + "step": 1110 + }, + { + "epoch": 0.057341798074953924, + "grad_norm": 0.7400630116462708, + "learning_rate": 0.0015, + "loss": 2.0915, + "step": 1120 + }, + { + "epoch": 0.057853778414908866, + "grad_norm": 0.6276081800460815, + "learning_rate": 0.0015, + "loss": 2.0916, + "step": 1130 + }, + { + "epoch": 0.058365758754863814, + "grad_norm": 0.7214579582214355, + "learning_rate": 0.0015, + "loss": 2.1027, + "step": 1140 + }, + { + "epoch": 0.05887773909481876, + "grad_norm": 0.7833266258239746, + "learning_rate": 0.0015, + "loss": 2.0884, + "step": 1150 + }, + { + "epoch": 0.059389719434773704, + "grad_norm": 0.7453588247299194, + "learning_rate": 0.0015, + "loss": 2.0764, + "step": 1160 + }, + { + "epoch": 0.05990169977472865, + "grad_norm": 0.5965461134910583, + "learning_rate": 0.0015, + "loss": 2.0941, + "step": 1170 + }, + { + "epoch": 0.06041368011468359, + "grad_norm": 0.6565614938735962, + "learning_rate": 0.0015, + "loss": 2.0396, + "step": 1180 + }, + { + "epoch": 0.06092566045463854, + "grad_norm": 0.670816957950592, + "learning_rate": 0.0015, + "loss": 2.0629, + "step": 1190 + }, + { + "epoch": 0.06143764079459349, + "grad_norm": 0.6220470666885376, + "learning_rate": 0.0015, + "loss": 2.064, + "step": 1200 + }, + { + "epoch": 0.06194962113454843, + "grad_norm": 0.5919376015663147, + "learning_rate": 0.0015, + "loss": 2.0385, + "step": 1210 + }, + { + "epoch": 0.06246160147450338, + "grad_norm": 0.6242793202400208, + "learning_rate": 0.0015, + "loss": 2.0487, + "step": 1220 + }, + { + "epoch": 0.06297358181445832, + "grad_norm": 0.5903810262680054, + "learning_rate": 0.0015, + "loss": 2.0348, + "step": 1230 + }, + { + "epoch": 0.06348556215441327, + "grad_norm": 0.6573896408081055, + "learning_rate": 0.0015, + "loss": 2.0186, + "step": 1240 + }, + { + "epoch": 0.06399754249436822, + "grad_norm": 0.6017488241195679, + "learning_rate": 0.0015, + "loss": 2.0126, + "step": 1250 + }, + { + "epoch": 0.06450952283432317, + "grad_norm": 0.533431351184845, + "learning_rate": 0.0015, + "loss": 2.026, + "step": 1260 + }, + { + "epoch": 0.06502150317427811, + "grad_norm": 0.5461450815200806, + "learning_rate": 0.0015, + "loss": 1.9961, + "step": 1270 + }, + { + "epoch": 0.06553348351423305, + "grad_norm": 0.5579766035079956, + "learning_rate": 0.0015, + "loss": 2.0064, + "step": 1280 + }, + { + "epoch": 0.066045463854188, + "grad_norm": 0.5514289736747742, + "learning_rate": 0.0015, + "loss": 2.0079, + "step": 1290 + }, + { + "epoch": 0.06655744419414295, + "grad_norm": 0.5938010215759277, + "learning_rate": 0.0015, + "loss": 1.9811, + "step": 1300 + }, + { + "epoch": 0.0670694245340979, + "grad_norm": 0.703124463558197, + "learning_rate": 0.0015, + "loss": 1.9634, + "step": 1310 + }, + { + "epoch": 0.06758140487405284, + "grad_norm": 0.545432448387146, + "learning_rate": 0.0015, + "loss": 1.9927, + "step": 1320 + }, + { + "epoch": 0.06809338521400778, + "grad_norm": 0.5673125386238098, + "learning_rate": 0.0015, + "loss": 1.9911, + "step": 1330 + }, + { + "epoch": 0.06860536555396272, + "grad_norm": 0.5682245492935181, + "learning_rate": 0.0015, + "loss": 1.9733, + "step": 1340 + }, + { + "epoch": 0.06911734589391767, + "grad_norm": 0.5960274934768677, + "learning_rate": 0.0015, + "loss": 1.9733, + "step": 1350 + }, + { + "epoch": 0.06962932623387262, + "grad_norm": 0.6102215051651001, + "learning_rate": 0.0015, + "loss": 1.9559, + "step": 1360 + }, + { + "epoch": 0.07014130657382757, + "grad_norm": 0.5990728735923767, + "learning_rate": 0.0015, + "loss": 1.9463, + "step": 1370 + }, + { + "epoch": 0.07065328691378252, + "grad_norm": 0.6161502003669739, + "learning_rate": 0.0015, + "loss": 1.978, + "step": 1380 + }, + { + "epoch": 0.07116526725373745, + "grad_norm": 0.5682898759841919, + "learning_rate": 0.0015, + "loss": 1.9558, + "step": 1390 + }, + { + "epoch": 0.0716772475936924, + "grad_norm": 0.5973048210144043, + "learning_rate": 0.0015, + "loss": 1.9376, + "step": 1400 + }, + { + "epoch": 0.07218922793364735, + "grad_norm": 0.5553535223007202, + "learning_rate": 0.0015, + "loss": 1.9468, + "step": 1410 + }, + { + "epoch": 0.0727012082736023, + "grad_norm": 0.5181711912155151, + "learning_rate": 0.0015, + "loss": 1.9188, + "step": 1420 + }, + { + "epoch": 0.07321318861355725, + "grad_norm": 0.6532855033874512, + "learning_rate": 0.0015, + "loss": 1.9069, + "step": 1430 + }, + { + "epoch": 0.07372516895351218, + "grad_norm": 0.531043291091919, + "learning_rate": 0.0015, + "loss": 1.9319, + "step": 1440 + }, + { + "epoch": 0.07423714929346713, + "grad_norm": 0.5700235962867737, + "learning_rate": 0.0015, + "loss": 1.8891, + "step": 1450 + }, + { + "epoch": 0.07474912963342208, + "grad_norm": 0.523414134979248, + "learning_rate": 0.0015, + "loss": 1.9165, + "step": 1460 + }, + { + "epoch": 0.07526110997337702, + "grad_norm": 0.5649904608726501, + "learning_rate": 0.0015, + "loss": 1.905, + "step": 1470 + }, + { + "epoch": 0.07577309031333197, + "grad_norm": 0.5912672877311707, + "learning_rate": 0.0015, + "loss": 1.9162, + "step": 1480 + }, + { + "epoch": 0.07628507065328691, + "grad_norm": 0.5597636699676514, + "learning_rate": 0.0015, + "loss": 1.9158, + "step": 1490 + }, + { + "epoch": 0.07679705099324186, + "grad_norm": 0.553896963596344, + "learning_rate": 0.0015, + "loss": 1.871, + "step": 1500 + }, + { + "epoch": 0.0773090313331968, + "grad_norm": 0.5018342137336731, + "learning_rate": 0.0015, + "loss": 1.9119, + "step": 1510 + }, + { + "epoch": 0.07782101167315175, + "grad_norm": 0.5367796421051025, + "learning_rate": 0.0015, + "loss": 1.8706, + "step": 1520 + }, + { + "epoch": 0.0783329920131067, + "grad_norm": 0.5023203492164612, + "learning_rate": 0.0015, + "loss": 1.8808, + "step": 1530 + }, + { + "epoch": 0.07884497235306165, + "grad_norm": 0.5962059497833252, + "learning_rate": 0.0015, + "loss": 1.9022, + "step": 1540 + }, + { + "epoch": 0.07935695269301658, + "grad_norm": 0.5200186967849731, + "learning_rate": 0.0015, + "loss": 1.8728, + "step": 1550 + }, + { + "epoch": 0.07986893303297153, + "grad_norm": 0.5361810922622681, + "learning_rate": 0.0015, + "loss": 1.8462, + "step": 1560 + }, + { + "epoch": 0.08038091337292648, + "grad_norm": 0.5771626830101013, + "learning_rate": 0.0015, + "loss": 1.873, + "step": 1570 + }, + { + "epoch": 0.08089289371288143, + "grad_norm": 0.5451227426528931, + "learning_rate": 0.0015, + "loss": 1.8693, + "step": 1580 + }, + { + "epoch": 0.08140487405283638, + "grad_norm": 0.5574854016304016, + "learning_rate": 0.0015, + "loss": 1.8615, + "step": 1590 + }, + { + "epoch": 0.08191685439279131, + "grad_norm": 0.574317455291748, + "learning_rate": 0.0015, + "loss": 1.8424, + "step": 1600 + }, + { + "epoch": 0.08242883473274626, + "grad_norm": 0.545906662940979, + "learning_rate": 0.0015, + "loss": 1.8572, + "step": 1610 + }, + { + "epoch": 0.08294081507270121, + "grad_norm": 0.5127050280570984, + "learning_rate": 0.0015, + "loss": 1.8391, + "step": 1620 + }, + { + "epoch": 0.08345279541265616, + "grad_norm": 0.5646129250526428, + "learning_rate": 0.0015, + "loss": 1.8316, + "step": 1630 + }, + { + "epoch": 0.0839647757526111, + "grad_norm": 0.5549367666244507, + "learning_rate": 0.0015, + "loss": 1.8371, + "step": 1640 + }, + { + "epoch": 0.08447675609256605, + "grad_norm": 0.5479699373245239, + "learning_rate": 0.0015, + "loss": 1.8378, + "step": 1650 + }, + { + "epoch": 0.08498873643252099, + "grad_norm": 0.5359328985214233, + "learning_rate": 0.0015, + "loss": 1.8372, + "step": 1660 + }, + { + "epoch": 0.08550071677247593, + "grad_norm": 0.5599870085716248, + "learning_rate": 0.0015, + "loss": 1.8499, + "step": 1670 + }, + { + "epoch": 0.08601269711243088, + "grad_norm": 0.5272551774978638, + "learning_rate": 0.0015, + "loss": 1.8381, + "step": 1680 + }, + { + "epoch": 0.08652467745238583, + "grad_norm": 0.534377932548523, + "learning_rate": 0.0015, + "loss": 1.8124, + "step": 1690 + }, + { + "epoch": 0.08703665779234078, + "grad_norm": 0.6432906985282898, + "learning_rate": 0.0015, + "loss": 1.8354, + "step": 1700 + }, + { + "epoch": 0.08754863813229571, + "grad_norm": 0.5227901935577393, + "learning_rate": 0.0015, + "loss": 1.8091, + "step": 1710 + }, + { + "epoch": 0.08806061847225066, + "grad_norm": 0.48951131105422974, + "learning_rate": 0.0015, + "loss": 1.7854, + "step": 1720 + }, + { + "epoch": 0.08857259881220561, + "grad_norm": 0.5127034783363342, + "learning_rate": 0.0015, + "loss": 1.8208, + "step": 1730 + }, + { + "epoch": 0.08908457915216056, + "grad_norm": 0.5147260427474976, + "learning_rate": 0.0015, + "loss": 1.8289, + "step": 1740 + }, + { + "epoch": 0.08959655949211551, + "grad_norm": 0.536268413066864, + "learning_rate": 0.0015, + "loss": 1.7894, + "step": 1750 + }, + { + "epoch": 0.09010853983207044, + "grad_norm": 0.537369966506958, + "learning_rate": 0.0015, + "loss": 1.7985, + "step": 1760 + }, + { + "epoch": 0.09062052017202539, + "grad_norm": 0.5217599868774414, + "learning_rate": 0.0015, + "loss": 1.8196, + "step": 1770 + }, + { + "epoch": 0.09113250051198034, + "grad_norm": 0.47711503505706787, + "learning_rate": 0.0015, + "loss": 1.7931, + "step": 1780 + }, + { + "epoch": 0.09164448085193529, + "grad_norm": 0.5544558763504028, + "learning_rate": 0.0015, + "loss": 1.8201, + "step": 1790 + }, + { + "epoch": 0.09215646119189023, + "grad_norm": 0.5024393200874329, + "learning_rate": 0.0015, + "loss": 1.7974, + "step": 1800 + }, + { + "epoch": 0.09266844153184518, + "grad_norm": 0.5126355290412903, + "learning_rate": 0.0015, + "loss": 1.7874, + "step": 1810 + }, + { + "epoch": 0.09318042187180012, + "grad_norm": 0.5882781744003296, + "learning_rate": 0.0015, + "loss": 1.791, + "step": 1820 + }, + { + "epoch": 0.09369240221175507, + "grad_norm": 0.508765697479248, + "learning_rate": 0.0015, + "loss": 1.7819, + "step": 1830 + }, + { + "epoch": 0.09420438255171001, + "grad_norm": 0.5449949502944946, + "learning_rate": 0.0015, + "loss": 1.7838, + "step": 1840 + }, + { + "epoch": 0.09471636289166496, + "grad_norm": 0.4996667802333832, + "learning_rate": 0.0015, + "loss": 1.7618, + "step": 1850 + }, + { + "epoch": 0.09522834323161991, + "grad_norm": 0.5014889240264893, + "learning_rate": 0.0015, + "loss": 1.7752, + "step": 1860 + }, + { + "epoch": 0.09574032357157485, + "grad_norm": 0.5011769533157349, + "learning_rate": 0.0015, + "loss": 1.7768, + "step": 1870 + }, + { + "epoch": 0.0962523039115298, + "grad_norm": 0.49963292479515076, + "learning_rate": 0.0015, + "loss": 1.778, + "step": 1880 + }, + { + "epoch": 0.09676428425148474, + "grad_norm": 0.46659213304519653, + "learning_rate": 0.0015, + "loss": 1.7668, + "step": 1890 + }, + { + "epoch": 0.09727626459143969, + "grad_norm": 0.5140760540962219, + "learning_rate": 0.0015, + "loss": 1.7448, + "step": 1900 + }, + { + "epoch": 0.09778824493139464, + "grad_norm": 0.49709445238113403, + "learning_rate": 0.0015, + "loss": 1.7573, + "step": 1910 + }, + { + "epoch": 0.09830022527134959, + "grad_norm": 0.464329332113266, + "learning_rate": 0.0015, + "loss": 1.7435, + "step": 1920 + }, + { + "epoch": 0.09881220561130452, + "grad_norm": 0.4815766215324402, + "learning_rate": 0.0015, + "loss": 1.7533, + "step": 1930 + }, + { + "epoch": 0.09932418595125947, + "grad_norm": 0.4601441025733948, + "learning_rate": 0.0015, + "loss": 1.7339, + "step": 1940 + }, + { + "epoch": 0.09983616629121442, + "grad_norm": 0.46905994415283203, + "learning_rate": 0.0015, + "loss": 1.7421, + "step": 1950 + }, + { + "epoch": 0.10034814663116937, + "grad_norm": 0.4927903413772583, + "learning_rate": 0.0015, + "loss": 1.7259, + "step": 1960 + }, + { + "epoch": 0.10086012697112431, + "grad_norm": 0.4930973947048187, + "learning_rate": 0.0015, + "loss": 1.735, + "step": 1970 + }, + { + "epoch": 0.10137210731107925, + "grad_norm": 0.4698399305343628, + "learning_rate": 0.0015, + "loss": 1.7478, + "step": 1980 + }, + { + "epoch": 0.1018840876510342, + "grad_norm": 0.5083284974098206, + "learning_rate": 0.0015, + "loss": 1.7491, + "step": 1990 + }, + { + "epoch": 0.10239606799098915, + "grad_norm": 0.4888325035572052, + "learning_rate": 0.0015, + "loss": 1.7261, + "step": 2000 + }, + { + "epoch": 0.1029080483309441, + "grad_norm": 0.524994432926178, + "learning_rate": 0.0015, + "loss": 1.7221, + "step": 2010 + }, + { + "epoch": 0.10342002867089904, + "grad_norm": 0.49820294976234436, + "learning_rate": 0.0015, + "loss": 1.7279, + "step": 2020 + }, + { + "epoch": 0.10393200901085399, + "grad_norm": 0.49288976192474365, + "learning_rate": 0.0015, + "loss": 1.746, + "step": 2030 + }, + { + "epoch": 0.10444398935080892, + "grad_norm": 0.4776252806186676, + "learning_rate": 0.0015, + "loss": 1.7384, + "step": 2040 + }, + { + "epoch": 0.10495596969076387, + "grad_norm": 0.46143004298210144, + "learning_rate": 0.0015, + "loss": 1.7037, + "step": 2050 + }, + { + "epoch": 0.10546795003071882, + "grad_norm": 0.4855809211730957, + "learning_rate": 0.0015, + "loss": 1.7052, + "step": 2060 + }, + { + "epoch": 0.10597993037067377, + "grad_norm": 0.491964727640152, + "learning_rate": 0.0015, + "loss": 1.7275, + "step": 2070 + }, + { + "epoch": 0.10649191071062872, + "grad_norm": 0.5072810053825378, + "learning_rate": 0.0015, + "loss": 1.7262, + "step": 2080 + }, + { + "epoch": 0.10700389105058365, + "grad_norm": 0.5020768642425537, + "learning_rate": 0.0015, + "loss": 1.7106, + "step": 2090 + }, + { + "epoch": 0.1075158713905386, + "grad_norm": 0.4881630837917328, + "learning_rate": 0.0015, + "loss": 1.7411, + "step": 2100 + }, + { + "epoch": 0.10802785173049355, + "grad_norm": 0.5104793906211853, + "learning_rate": 0.0015, + "loss": 1.7053, + "step": 2110 + }, + { + "epoch": 0.1085398320704485, + "grad_norm": 0.4574519991874695, + "learning_rate": 0.0015, + "loss": 1.7219, + "step": 2120 + }, + { + "epoch": 0.10905181241040345, + "grad_norm": 0.4427832365036011, + "learning_rate": 0.0015, + "loss": 1.6966, + "step": 2130 + }, + { + "epoch": 0.10956379275035838, + "grad_norm": 0.46723929047584534, + "learning_rate": 0.0015, + "loss": 1.7106, + "step": 2140 + }, + { + "epoch": 0.11007577309031333, + "grad_norm": 0.4710049629211426, + "learning_rate": 0.0015, + "loss": 1.7, + "step": 2150 + }, + { + "epoch": 0.11058775343026828, + "grad_norm": 0.46849745512008667, + "learning_rate": 0.0015, + "loss": 1.7071, + "step": 2160 + }, + { + "epoch": 0.11109973377022322, + "grad_norm": 0.4712335765361786, + "learning_rate": 0.0015, + "loss": 1.685, + "step": 2170 + }, + { + "epoch": 0.11161171411017817, + "grad_norm": 0.45318537950515747, + "learning_rate": 0.0015, + "loss": 1.6996, + "step": 2180 + }, + { + "epoch": 0.11212369445013312, + "grad_norm": 0.4772440791130066, + "learning_rate": 0.0015, + "loss": 1.705, + "step": 2190 + }, + { + "epoch": 0.11263567479008806, + "grad_norm": 0.4854085147380829, + "learning_rate": 0.0015, + "loss": 1.691, + "step": 2200 + }, + { + "epoch": 0.113147655130043, + "grad_norm": 0.4931398928165436, + "learning_rate": 0.0015, + "loss": 1.6979, + "step": 2210 + }, + { + "epoch": 0.11365963546999795, + "grad_norm": 0.4212550222873688, + "learning_rate": 0.0015, + "loss": 1.6792, + "step": 2220 + }, + { + "epoch": 0.1141716158099529, + "grad_norm": 0.4916476905345917, + "learning_rate": 0.0015, + "loss": 1.682, + "step": 2230 + }, + { + "epoch": 0.11468359614990785, + "grad_norm": 0.44974076747894287, + "learning_rate": 0.0015, + "loss": 1.6734, + "step": 2240 + }, + { + "epoch": 0.11519557648986278, + "grad_norm": 0.4464137554168701, + "learning_rate": 0.0015, + "loss": 1.7032, + "step": 2250 + }, + { + "epoch": 0.11570755682981773, + "grad_norm": 0.4473714530467987, + "learning_rate": 0.0015, + "loss": 1.6868, + "step": 2260 + }, + { + "epoch": 0.11621953716977268, + "grad_norm": 0.4802720844745636, + "learning_rate": 0.0015, + "loss": 1.6805, + "step": 2270 + }, + { + "epoch": 0.11673151750972763, + "grad_norm": 0.45060625672340393, + "learning_rate": 0.0015, + "loss": 1.6716, + "step": 2280 + }, + { + "epoch": 0.11724349784968258, + "grad_norm": 0.47407498955726624, + "learning_rate": 0.0015, + "loss": 1.6569, + "step": 2290 + }, + { + "epoch": 0.11775547818963752, + "grad_norm": 0.45615556836128235, + "learning_rate": 0.0015, + "loss": 1.6682, + "step": 2300 + }, + { + "epoch": 0.11826745852959246, + "grad_norm": 0.4670998156070709, + "learning_rate": 0.0015, + "loss": 1.6785, + "step": 2310 + }, + { + "epoch": 0.11877943886954741, + "grad_norm": 0.45432570576667786, + "learning_rate": 0.0015, + "loss": 1.674, + "step": 2320 + }, + { + "epoch": 0.11929141920950236, + "grad_norm": 0.44804081320762634, + "learning_rate": 0.0015, + "loss": 1.6619, + "step": 2330 + }, + { + "epoch": 0.1198033995494573, + "grad_norm": 0.4523905813694, + "learning_rate": 0.0015, + "loss": 1.6652, + "step": 2340 + }, + { + "epoch": 0.12031537988941225, + "grad_norm": 0.4514728784561157, + "learning_rate": 0.0015, + "loss": 1.6652, + "step": 2350 + }, + { + "epoch": 0.12082736022936719, + "grad_norm": 0.41209134459495544, + "learning_rate": 0.0015, + "loss": 1.658, + "step": 2360 + }, + { + "epoch": 0.12133934056932213, + "grad_norm": 0.4219752252101898, + "learning_rate": 0.0015, + "loss": 1.6379, + "step": 2370 + }, + { + "epoch": 0.12185132090927708, + "grad_norm": 0.47252357006073, + "learning_rate": 0.0015, + "loss": 1.6636, + "step": 2380 + }, + { + "epoch": 0.12236330124923203, + "grad_norm": 0.4292849004268646, + "learning_rate": 0.0015, + "loss": 1.6528, + "step": 2390 + }, + { + "epoch": 0.12287528158918698, + "grad_norm": 0.4734489917755127, + "learning_rate": 0.0015, + "loss": 1.6297, + "step": 2400 + }, + { + "epoch": 0.12338726192914191, + "grad_norm": 0.48543623089790344, + "learning_rate": 0.0015, + "loss": 1.6404, + "step": 2410 + }, + { + "epoch": 0.12389924226909686, + "grad_norm": 0.4184911549091339, + "learning_rate": 0.0015, + "loss": 1.6315, + "step": 2420 + }, + { + "epoch": 0.12441122260905181, + "grad_norm": 0.42600351572036743, + "learning_rate": 0.0015, + "loss": 1.6502, + "step": 2430 + }, + { + "epoch": 0.12492320294900676, + "grad_norm": 0.4201619029045105, + "learning_rate": 0.0015, + "loss": 1.6372, + "step": 2440 + }, + { + "epoch": 0.1254351832889617, + "grad_norm": 0.4165250360965729, + "learning_rate": 0.0015, + "loss": 1.6334, + "step": 2450 + }, + { + "epoch": 0.12594716362891664, + "grad_norm": 0.4470268487930298, + "learning_rate": 0.0015, + "loss": 1.6359, + "step": 2460 + }, + { + "epoch": 0.1264591439688716, + "grad_norm": 0.4310542941093445, + "learning_rate": 0.0015, + "loss": 1.6439, + "step": 2470 + }, + { + "epoch": 0.12697112430882654, + "grad_norm": 0.4297926425933838, + "learning_rate": 0.0015, + "loss": 1.6222, + "step": 2480 + }, + { + "epoch": 0.1274831046487815, + "grad_norm": 0.45335137844085693, + "learning_rate": 0.0015, + "loss": 1.6559, + "step": 2490 + }, + { + "epoch": 0.12799508498873644, + "grad_norm": 0.4176558256149292, + "learning_rate": 0.0015, + "loss": 1.6561, + "step": 2500 + }, + { + "epoch": 0.12850706532869138, + "grad_norm": 0.4358290433883667, + "learning_rate": 0.0015, + "loss": 1.6241, + "step": 2510 + }, + { + "epoch": 0.12901904566864633, + "grad_norm": 0.44109201431274414, + "learning_rate": 0.0015, + "loss": 1.6022, + "step": 2520 + }, + { + "epoch": 0.12953102600860128, + "grad_norm": 0.44387978315353394, + "learning_rate": 0.0015, + "loss": 1.6335, + "step": 2530 + }, + { + "epoch": 0.13004300634855623, + "grad_norm": 0.434861421585083, + "learning_rate": 0.0015, + "loss": 1.6377, + "step": 2540 + }, + { + "epoch": 0.13055498668851115, + "grad_norm": 0.419826865196228, + "learning_rate": 0.0015, + "loss": 1.6238, + "step": 2550 + }, + { + "epoch": 0.1310669670284661, + "grad_norm": 0.471110463142395, + "learning_rate": 0.0015, + "loss": 1.6383, + "step": 2560 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 0.44935643672943115, + "learning_rate": 0.0015, + "loss": 1.6006, + "step": 2570 + }, + { + "epoch": 0.132090927708376, + "grad_norm": 0.4497852027416229, + "learning_rate": 0.0015, + "loss": 1.6115, + "step": 2580 + }, + { + "epoch": 0.13260290804833094, + "grad_norm": 0.45850351452827454, + "learning_rate": 0.0015, + "loss": 1.6194, + "step": 2590 + }, + { + "epoch": 0.1331148883882859, + "grad_norm": 0.40869665145874023, + "learning_rate": 0.0015, + "loss": 1.6159, + "step": 2600 + }, + { + "epoch": 0.13362686872824084, + "grad_norm": 0.4347962737083435, + "learning_rate": 0.0015, + "loss": 1.6254, + "step": 2610 + }, + { + "epoch": 0.1341388490681958, + "grad_norm": 0.4899897873401642, + "learning_rate": 0.0015, + "loss": 1.6296, + "step": 2620 + }, + { + "epoch": 0.13465082940815074, + "grad_norm": 0.44309839606285095, + "learning_rate": 0.0015, + "loss": 1.6179, + "step": 2630 + }, + { + "epoch": 0.13516280974810568, + "grad_norm": 0.3890606164932251, + "learning_rate": 0.0015, + "loss": 1.6044, + "step": 2640 + }, + { + "epoch": 0.13567479008806063, + "grad_norm": 0.42358025908470154, + "learning_rate": 0.0015, + "loss": 1.619, + "step": 2650 + }, + { + "epoch": 0.13618677042801555, + "grad_norm": 0.42111581563949585, + "learning_rate": 0.0015, + "loss": 1.6127, + "step": 2660 + }, + { + "epoch": 0.1366987507679705, + "grad_norm": 0.4441932141780853, + "learning_rate": 0.0015, + "loss": 1.6224, + "step": 2670 + }, + { + "epoch": 0.13721073110792545, + "grad_norm": 0.4351959228515625, + "learning_rate": 0.0015, + "loss": 1.5957, + "step": 2680 + }, + { + "epoch": 0.1377227114478804, + "grad_norm": 0.43544304370880127, + "learning_rate": 0.0015, + "loss": 1.5925, + "step": 2690 + }, + { + "epoch": 0.13823469178783535, + "grad_norm": 0.4298728406429291, + "learning_rate": 0.0015, + "loss": 1.5893, + "step": 2700 + }, + { + "epoch": 0.1387466721277903, + "grad_norm": 0.4463229477405548, + "learning_rate": 0.0015, + "loss": 1.5881, + "step": 2710 + }, + { + "epoch": 0.13925865246774524, + "grad_norm": 0.43847158551216125, + "learning_rate": 0.0015, + "loss": 1.5982, + "step": 2720 + }, + { + "epoch": 0.1397706328077002, + "grad_norm": 0.44918614625930786, + "learning_rate": 0.0015, + "loss": 1.6095, + "step": 2730 + }, + { + "epoch": 0.14028261314765514, + "grad_norm": 0.45398586988449097, + "learning_rate": 0.0015, + "loss": 1.5985, + "step": 2740 + }, + { + "epoch": 0.1407945934876101, + "grad_norm": 0.41213494539260864, + "learning_rate": 0.0015, + "loss": 1.6153, + "step": 2750 + }, + { + "epoch": 0.14130657382756504, + "grad_norm": 0.41266897320747375, + "learning_rate": 0.0015, + "loss": 1.5919, + "step": 2760 + }, + { + "epoch": 0.14181855416751996, + "grad_norm": 0.42942896485328674, + "learning_rate": 0.0015, + "loss": 1.5793, + "step": 2770 + }, + { + "epoch": 0.1423305345074749, + "grad_norm": 0.4180223047733307, + "learning_rate": 0.0015, + "loss": 1.5938, + "step": 2780 + }, + { + "epoch": 0.14284251484742985, + "grad_norm": 0.4204559922218323, + "learning_rate": 0.0015, + "loss": 1.5927, + "step": 2790 + }, + { + "epoch": 0.1433544951873848, + "grad_norm": 0.43727442622184753, + "learning_rate": 0.0015, + "loss": 1.6018, + "step": 2800 + }, + { + "epoch": 0.14386647552733975, + "grad_norm": 0.4330785870552063, + "learning_rate": 0.0015, + "loss": 1.6004, + "step": 2810 + }, + { + "epoch": 0.1443784558672947, + "grad_norm": 0.415101021528244, + "learning_rate": 0.0015, + "loss": 1.5708, + "step": 2820 + }, + { + "epoch": 0.14489043620724965, + "grad_norm": 0.41477903723716736, + "learning_rate": 0.0015, + "loss": 1.5747, + "step": 2830 + }, + { + "epoch": 0.1454024165472046, + "grad_norm": 0.4343889653682709, + "learning_rate": 0.0015, + "loss": 1.5958, + "step": 2840 + }, + { + "epoch": 0.14591439688715954, + "grad_norm": 0.4018150866031647, + "learning_rate": 0.0015, + "loss": 1.5589, + "step": 2850 + }, + { + "epoch": 0.1464263772271145, + "grad_norm": 0.4799724817276001, + "learning_rate": 0.0015, + "loss": 1.5745, + "step": 2860 + }, + { + "epoch": 0.1469383575670694, + "grad_norm": 0.42355528473854065, + "learning_rate": 0.0015, + "loss": 1.5928, + "step": 2870 + }, + { + "epoch": 0.14745033790702436, + "grad_norm": 0.40638747811317444, + "learning_rate": 0.0015, + "loss": 1.5623, + "step": 2880 + }, + { + "epoch": 0.1479623182469793, + "grad_norm": 0.39846664667129517, + "learning_rate": 0.0015, + "loss": 1.577, + "step": 2890 + }, + { + "epoch": 0.14847429858693426, + "grad_norm": 0.4010321795940399, + "learning_rate": 0.0015, + "loss": 1.5821, + "step": 2900 + }, + { + "epoch": 0.1489862789268892, + "grad_norm": 0.42778313159942627, + "learning_rate": 0.0015, + "loss": 1.5623, + "step": 2910 + }, + { + "epoch": 0.14949825926684415, + "grad_norm": 0.39266425371170044, + "learning_rate": 0.0015, + "loss": 1.5821, + "step": 2920 + }, + { + "epoch": 0.1500102396067991, + "grad_norm": 0.40784794092178345, + "learning_rate": 0.0015, + "loss": 1.5664, + "step": 2930 + }, + { + "epoch": 0.15052221994675405, + "grad_norm": 0.43437501788139343, + "learning_rate": 0.0015, + "loss": 1.5658, + "step": 2940 + }, + { + "epoch": 0.151034200286709, + "grad_norm": 0.4373057186603546, + "learning_rate": 0.0015, + "loss": 1.5591, + "step": 2950 + }, + { + "epoch": 0.15154618062666395, + "grad_norm": 0.40370023250579834, + "learning_rate": 0.0015, + "loss": 1.555, + "step": 2960 + }, + { + "epoch": 0.1520581609666189, + "grad_norm": 0.4626748263835907, + "learning_rate": 0.0015, + "loss": 1.5808, + "step": 2970 + }, + { + "epoch": 0.15257014130657381, + "grad_norm": 0.4095107614994049, + "learning_rate": 0.0015, + "loss": 1.5705, + "step": 2980 + }, + { + "epoch": 0.15308212164652876, + "grad_norm": 0.4343841075897217, + "learning_rate": 0.0015, + "loss": 1.5738, + "step": 2990 + }, + { + "epoch": 0.1535941019864837, + "grad_norm": 0.42325645685195923, + "learning_rate": 0.0015, + "loss": 1.567, + "step": 3000 + }, + { + "epoch": 0.15410608232643866, + "grad_norm": 0.39237692952156067, + "learning_rate": 0.0015, + "loss": 1.5748, + "step": 3010 + }, + { + "epoch": 0.1546180626663936, + "grad_norm": 0.39682793617248535, + "learning_rate": 0.0015, + "loss": 1.5711, + "step": 3020 + }, + { + "epoch": 0.15513004300634856, + "grad_norm": 0.4060477614402771, + "learning_rate": 0.0015, + "loss": 1.5623, + "step": 3030 + }, + { + "epoch": 0.1556420233463035, + "grad_norm": 0.4088119864463806, + "learning_rate": 0.0015, + "loss": 1.5532, + "step": 3040 + }, + { + "epoch": 0.15615400368625845, + "grad_norm": 0.39976736903190613, + "learning_rate": 0.0015, + "loss": 1.5436, + "step": 3050 + }, + { + "epoch": 0.1566659840262134, + "grad_norm": 0.42855167388916016, + "learning_rate": 0.0015, + "loss": 1.5577, + "step": 3060 + }, + { + "epoch": 0.15717796436616835, + "grad_norm": 0.4451335072517395, + "learning_rate": 0.0015, + "loss": 1.5375, + "step": 3070 + }, + { + "epoch": 0.1576899447061233, + "grad_norm": 0.3867264688014984, + "learning_rate": 0.0015, + "loss": 1.5418, + "step": 3080 + }, + { + "epoch": 0.15820192504607822, + "grad_norm": 0.4165036976337433, + "learning_rate": 0.0015, + "loss": 1.564, + "step": 3090 + }, + { + "epoch": 0.15871390538603317, + "grad_norm": 0.3978787958621979, + "learning_rate": 0.0015, + "loss": 1.5408, + "step": 3100 + }, + { + "epoch": 0.15922588572598811, + "grad_norm": 0.37848272919654846, + "learning_rate": 0.0015, + "loss": 1.5477, + "step": 3110 + }, + { + "epoch": 0.15973786606594306, + "grad_norm": 0.4218755066394806, + "learning_rate": 0.0015, + "loss": 1.5533, + "step": 3120 + }, + { + "epoch": 0.160249846405898, + "grad_norm": 0.38090386986732483, + "learning_rate": 0.0015, + "loss": 1.5453, + "step": 3130 + }, + { + "epoch": 0.16076182674585296, + "grad_norm": 0.39693617820739746, + "learning_rate": 0.0015, + "loss": 1.5633, + "step": 3140 + }, + { + "epoch": 0.1612738070858079, + "grad_norm": 0.3855767250061035, + "learning_rate": 0.0015, + "loss": 1.5381, + "step": 3150 + }, + { + "epoch": 0.16178578742576286, + "grad_norm": 0.3672980070114136, + "learning_rate": 0.0015, + "loss": 1.5458, + "step": 3160 + }, + { + "epoch": 0.1622977677657178, + "grad_norm": 0.3810063302516937, + "learning_rate": 0.0015, + "loss": 1.559, + "step": 3170 + }, + { + "epoch": 0.16280974810567275, + "grad_norm": 0.4658653140068054, + "learning_rate": 0.0015, + "loss": 1.5274, + "step": 3180 + }, + { + "epoch": 0.1633217284456277, + "grad_norm": 0.40785935521125793, + "learning_rate": 0.0015, + "loss": 1.5279, + "step": 3190 + }, + { + "epoch": 0.16383370878558262, + "grad_norm": 0.40147677063941956, + "learning_rate": 0.0015, + "loss": 1.542, + "step": 3200 + }, + { + "epoch": 0.16434568912553757, + "grad_norm": 0.39116302132606506, + "learning_rate": 0.0015, + "loss": 1.5148, + "step": 3210 + }, + { + "epoch": 0.16485766946549252, + "grad_norm": 0.3875216245651245, + "learning_rate": 0.0015, + "loss": 1.5289, + "step": 3220 + }, + { + "epoch": 0.16536964980544747, + "grad_norm": 0.4106022119522095, + "learning_rate": 0.0015, + "loss": 1.5358, + "step": 3230 + }, + { + "epoch": 0.16588163014540241, + "grad_norm": 0.393637090921402, + "learning_rate": 0.0015, + "loss": 1.5334, + "step": 3240 + }, + { + "epoch": 0.16639361048535736, + "grad_norm": 0.3800962269306183, + "learning_rate": 0.0015, + "loss": 1.5364, + "step": 3250 + }, + { + "epoch": 0.1669055908253123, + "grad_norm": 0.3848235011100769, + "learning_rate": 0.0015, + "loss": 1.5411, + "step": 3260 + }, + { + "epoch": 0.16741757116526726, + "grad_norm": 0.38832154870033264, + "learning_rate": 0.0015, + "loss": 1.5373, + "step": 3270 + }, + { + "epoch": 0.1679295515052222, + "grad_norm": 0.43623119592666626, + "learning_rate": 0.0015, + "loss": 1.5558, + "step": 3280 + }, + { + "epoch": 0.16844153184517716, + "grad_norm": 0.3507107198238373, + "learning_rate": 0.0015, + "loss": 1.5365, + "step": 3290 + }, + { + "epoch": 0.1689535121851321, + "grad_norm": 0.38700392842292786, + "learning_rate": 0.0015, + "loss": 1.5383, + "step": 3300 + }, + { + "epoch": 0.16946549252508702, + "grad_norm": 0.38841623067855835, + "learning_rate": 0.0015, + "loss": 1.5399, + "step": 3310 + }, + { + "epoch": 0.16997747286504197, + "grad_norm": 0.39128798246383667, + "learning_rate": 0.0015, + "loss": 1.5271, + "step": 3320 + }, + { + "epoch": 0.17048945320499692, + "grad_norm": 0.38994646072387695, + "learning_rate": 0.0015, + "loss": 1.5317, + "step": 3330 + }, + { + "epoch": 0.17100143354495187, + "grad_norm": 0.37731438875198364, + "learning_rate": 0.0015, + "loss": 1.5251, + "step": 3340 + }, + { + "epoch": 0.17151341388490682, + "grad_norm": 0.4156712293624878, + "learning_rate": 0.0015, + "loss": 1.5221, + "step": 3350 + }, + { + "epoch": 0.17202539422486177, + "grad_norm": 0.38232874870300293, + "learning_rate": 0.0015, + "loss": 1.5196, + "step": 3360 + }, + { + "epoch": 0.17253737456481671, + "grad_norm": 0.3940838575363159, + "learning_rate": 0.0015, + "loss": 1.5213, + "step": 3370 + }, + { + "epoch": 0.17304935490477166, + "grad_norm": 0.4050334393978119, + "learning_rate": 0.0015, + "loss": 1.5159, + "step": 3380 + }, + { + "epoch": 0.1735613352447266, + "grad_norm": 0.3736588954925537, + "learning_rate": 0.0015, + "loss": 1.5157, + "step": 3390 + }, + { + "epoch": 0.17407331558468156, + "grad_norm": 0.40355414152145386, + "learning_rate": 0.0015, + "loss": 1.5446, + "step": 3400 + }, + { + "epoch": 0.1745852959246365, + "grad_norm": 0.37198445200920105, + "learning_rate": 0.0015, + "loss": 1.5322, + "step": 3410 + }, + { + "epoch": 0.17509727626459143, + "grad_norm": 0.35825085639953613, + "learning_rate": 0.0015, + "loss": 1.5136, + "step": 3420 + }, + { + "epoch": 0.17560925660454638, + "grad_norm": 0.4174591302871704, + "learning_rate": 0.0015, + "loss": 1.5092, + "step": 3430 + }, + { + "epoch": 0.17612123694450132, + "grad_norm": 0.38272011280059814, + "learning_rate": 0.0015, + "loss": 1.515, + "step": 3440 + }, + { + "epoch": 0.17663321728445627, + "grad_norm": 0.4088602364063263, + "learning_rate": 0.0015, + "loss": 1.5089, + "step": 3450 + }, + { + "epoch": 0.17714519762441122, + "grad_norm": 0.37706780433654785, + "learning_rate": 0.0015, + "loss": 1.513, + "step": 3460 + }, + { + "epoch": 0.17765717796436617, + "grad_norm": 0.3772091865539551, + "learning_rate": 0.0015, + "loss": 1.5096, + "step": 3470 + }, + { + "epoch": 0.17816915830432112, + "grad_norm": 0.3540133535861969, + "learning_rate": 0.0015, + "loss": 1.5099, + "step": 3480 + }, + { + "epoch": 0.17868113864427607, + "grad_norm": 0.36549830436706543, + "learning_rate": 0.0015, + "loss": 1.511, + "step": 3490 + }, + { + "epoch": 0.17919311898423101, + "grad_norm": 0.39273905754089355, + "learning_rate": 0.0015, + "loss": 1.5005, + "step": 3500 + }, + { + "epoch": 0.17970509932418596, + "grad_norm": 0.35500046610832214, + "learning_rate": 0.0015, + "loss": 1.4962, + "step": 3510 + }, + { + "epoch": 0.18021707966414088, + "grad_norm": 0.39818084239959717, + "learning_rate": 0.0015, + "loss": 1.4951, + "step": 3520 + }, + { + "epoch": 0.18072906000409583, + "grad_norm": 0.3649390637874603, + "learning_rate": 0.0015, + "loss": 1.5038, + "step": 3530 + }, + { + "epoch": 0.18124104034405078, + "grad_norm": 0.376000314950943, + "learning_rate": 0.0015, + "loss": 1.4945, + "step": 3540 + }, + { + "epoch": 0.18175302068400573, + "grad_norm": 0.3638756573200226, + "learning_rate": 0.0015, + "loss": 1.5012, + "step": 3550 + }, + { + "epoch": 0.18226500102396068, + "grad_norm": 0.3695107400417328, + "learning_rate": 0.0015, + "loss": 1.5261, + "step": 3560 + }, + { + "epoch": 0.18277698136391562, + "grad_norm": 0.424125999212265, + "learning_rate": 0.0015, + "loss": 1.5245, + "step": 3570 + }, + { + "epoch": 0.18328896170387057, + "grad_norm": 0.3683246374130249, + "learning_rate": 0.0015, + "loss": 1.507, + "step": 3580 + }, + { + "epoch": 0.18380094204382552, + "grad_norm": 0.3763924241065979, + "learning_rate": 0.0015, + "loss": 1.4671, + "step": 3590 + }, + { + "epoch": 0.18431292238378047, + "grad_norm": 0.3692323565483093, + "learning_rate": 0.0015, + "loss": 1.5182, + "step": 3600 + }, + { + "epoch": 0.18482490272373542, + "grad_norm": 0.37030673027038574, + "learning_rate": 0.0015, + "loss": 1.5037, + "step": 3610 + }, + { + "epoch": 0.18533688306369037, + "grad_norm": 0.3666503429412842, + "learning_rate": 0.0015, + "loss": 1.499, + "step": 3620 + }, + { + "epoch": 0.1858488634036453, + "grad_norm": 0.3609069287776947, + "learning_rate": 0.0015, + "loss": 1.5052, + "step": 3630 + }, + { + "epoch": 0.18636084374360024, + "grad_norm": 0.3748449683189392, + "learning_rate": 0.0015, + "loss": 1.4596, + "step": 3640 + }, + { + "epoch": 0.18687282408355518, + "grad_norm": 0.4080664813518524, + "learning_rate": 0.0015, + "loss": 1.5051, + "step": 3650 + }, + { + "epoch": 0.18738480442351013, + "grad_norm": 0.3743340075016022, + "learning_rate": 0.0015, + "loss": 1.4658, + "step": 3660 + }, + { + "epoch": 0.18789678476346508, + "grad_norm": 0.36924538016319275, + "learning_rate": 0.0015, + "loss": 1.474, + "step": 3670 + }, + { + "epoch": 0.18840876510342003, + "grad_norm": 0.3834936022758484, + "learning_rate": 0.0015, + "loss": 1.4952, + "step": 3680 + }, + { + "epoch": 0.18892074544337498, + "grad_norm": 0.3493509590625763, + "learning_rate": 0.0015, + "loss": 1.4765, + "step": 3690 + }, + { + "epoch": 0.18943272578332992, + "grad_norm": 0.3550162613391876, + "learning_rate": 0.0015, + "loss": 1.4928, + "step": 3700 + }, + { + "epoch": 0.18994470612328487, + "grad_norm": 0.3747323155403137, + "learning_rate": 0.0015, + "loss": 1.4872, + "step": 3710 + }, + { + "epoch": 0.19045668646323982, + "grad_norm": 0.3649948835372925, + "learning_rate": 0.0015, + "loss": 1.5015, + "step": 3720 + }, + { + "epoch": 0.19096866680319477, + "grad_norm": 0.37357765436172485, + "learning_rate": 0.0015, + "loss": 1.4828, + "step": 3730 + }, + { + "epoch": 0.1914806471431497, + "grad_norm": 0.36136525869369507, + "learning_rate": 0.0015, + "loss": 1.5063, + "step": 3740 + }, + { + "epoch": 0.19199262748310464, + "grad_norm": 0.35555464029312134, + "learning_rate": 0.0015, + "loss": 1.4797, + "step": 3750 + }, + { + "epoch": 0.1925046078230596, + "grad_norm": 0.3460323214530945, + "learning_rate": 0.0015, + "loss": 1.4913, + "step": 3760 + }, + { + "epoch": 0.19301658816301454, + "grad_norm": 0.35079696774482727, + "learning_rate": 0.0015, + "loss": 1.4714, + "step": 3770 + }, + { + "epoch": 0.19352856850296948, + "grad_norm": 0.3562418818473816, + "learning_rate": 0.0015, + "loss": 1.4816, + "step": 3780 + }, + { + "epoch": 0.19404054884292443, + "grad_norm": 0.3714292049407959, + "learning_rate": 0.0015, + "loss": 1.496, + "step": 3790 + }, + { + "epoch": 0.19455252918287938, + "grad_norm": 0.37646958231925964, + "learning_rate": 0.0015, + "loss": 1.4814, + "step": 3800 + }, + { + "epoch": 0.19506450952283433, + "grad_norm": 0.37127116322517395, + "learning_rate": 0.0015, + "loss": 1.4902, + "step": 3810 + }, + { + "epoch": 0.19557648986278928, + "grad_norm": 0.3644818961620331, + "learning_rate": 0.0015, + "loss": 1.4811, + "step": 3820 + }, + { + "epoch": 0.19608847020274423, + "grad_norm": 0.38677945733070374, + "learning_rate": 0.0015, + "loss": 1.5001, + "step": 3830 + }, + { + "epoch": 0.19660045054269917, + "grad_norm": 0.379823237657547, + "learning_rate": 0.0015, + "loss": 1.4665, + "step": 3840 + }, + { + "epoch": 0.1971124308826541, + "grad_norm": 0.37844884395599365, + "learning_rate": 0.0015, + "loss": 1.4783, + "step": 3850 + }, + { + "epoch": 0.19762441122260904, + "grad_norm": 0.36030471324920654, + "learning_rate": 0.0015, + "loss": 1.4883, + "step": 3860 + }, + { + "epoch": 0.198136391562564, + "grad_norm": 0.3515039384365082, + "learning_rate": 0.0015, + "loss": 1.4614, + "step": 3870 + }, + { + "epoch": 0.19864837190251894, + "grad_norm": 0.3469856381416321, + "learning_rate": 0.0015, + "loss": 1.4669, + "step": 3880 + }, + { + "epoch": 0.1991603522424739, + "grad_norm": 0.3526422381401062, + "learning_rate": 0.0015, + "loss": 1.4568, + "step": 3890 + }, + { + "epoch": 0.19967233258242884, + "grad_norm": 0.34970229864120483, + "learning_rate": 0.0015, + "loss": 1.4467, + "step": 3900 + }, + { + "epoch": 0.20018431292238378, + "grad_norm": 0.35208991169929504, + "learning_rate": 0.0015, + "loss": 1.5057, + "step": 3910 + }, + { + "epoch": 0.20069629326233873, + "grad_norm": 0.35446539521217346, + "learning_rate": 0.0015, + "loss": 1.4677, + "step": 3920 + }, + { + "epoch": 0.20120827360229368, + "grad_norm": 0.32680749893188477, + "learning_rate": 0.0015, + "loss": 1.4577, + "step": 3930 + }, + { + "epoch": 0.20172025394224863, + "grad_norm": 0.3479768931865692, + "learning_rate": 0.0015, + "loss": 1.4679, + "step": 3940 + }, + { + "epoch": 0.20223223428220358, + "grad_norm": 0.3349073529243469, + "learning_rate": 0.0015, + "loss": 1.4497, + "step": 3950 + }, + { + "epoch": 0.2027442146221585, + "grad_norm": 0.35016781091690063, + "learning_rate": 0.0015, + "loss": 1.449, + "step": 3960 + }, + { + "epoch": 0.20325619496211345, + "grad_norm": 0.349086195230484, + "learning_rate": 0.0015, + "loss": 1.4751, + "step": 3970 + }, + { + "epoch": 0.2037681753020684, + "grad_norm": 0.36575040221214294, + "learning_rate": 0.0015, + "loss": 1.4653, + "step": 3980 + }, + { + "epoch": 0.20428015564202334, + "grad_norm": 0.34002363681793213, + "learning_rate": 0.0015, + "loss": 1.4826, + "step": 3990 + }, + { + "epoch": 0.2047921359819783, + "grad_norm": 0.36541834473609924, + "learning_rate": 0.0015, + "loss": 1.4485, + "step": 4000 + }, + { + "epoch": 0.20530411632193324, + "grad_norm": 0.3874847888946533, + "learning_rate": 0.0015, + "loss": 1.478, + "step": 4010 + }, + { + "epoch": 0.2058160966618882, + "grad_norm": 0.36418798565864563, + "learning_rate": 0.0015, + "loss": 1.4629, + "step": 4020 + }, + { + "epoch": 0.20632807700184314, + "grad_norm": 0.34188389778137207, + "learning_rate": 0.0015, + "loss": 1.4784, + "step": 4030 + }, + { + "epoch": 0.20684005734179808, + "grad_norm": 0.35976287722587585, + "learning_rate": 0.0015, + "loss": 1.458, + "step": 4040 + }, + { + "epoch": 0.20735203768175303, + "grad_norm": 0.37284791469573975, + "learning_rate": 0.0015, + "loss": 1.471, + "step": 4050 + }, + { + "epoch": 0.20786401802170798, + "grad_norm": 0.3462198078632355, + "learning_rate": 0.0015, + "loss": 1.4748, + "step": 4060 + }, + { + "epoch": 0.2083759983616629, + "grad_norm": 0.3988822102546692, + "learning_rate": 0.0015, + "loss": 1.4576, + "step": 4070 + }, + { + "epoch": 0.20888797870161785, + "grad_norm": 0.361892431974411, + "learning_rate": 0.0015, + "loss": 1.4516, + "step": 4080 + }, + { + "epoch": 0.2093999590415728, + "grad_norm": 0.3648587763309479, + "learning_rate": 0.0015, + "loss": 1.4537, + "step": 4090 + }, + { + "epoch": 0.20991193938152775, + "grad_norm": 0.35592299699783325, + "learning_rate": 0.0015, + "loss": 1.4346, + "step": 4100 + }, + { + "epoch": 0.2104239197214827, + "grad_norm": 0.3457651138305664, + "learning_rate": 0.0015, + "loss": 1.4455, + "step": 4110 + }, + { + "epoch": 0.21093590006143764, + "grad_norm": 0.3580280542373657, + "learning_rate": 0.0015, + "loss": 1.452, + "step": 4120 + }, + { + "epoch": 0.2114478804013926, + "grad_norm": 0.3704809844493866, + "learning_rate": 0.0015, + "loss": 1.4655, + "step": 4130 + }, + { + "epoch": 0.21195986074134754, + "grad_norm": 0.37433552742004395, + "learning_rate": 0.0015, + "loss": 1.4526, + "step": 4140 + }, + { + "epoch": 0.2124718410813025, + "grad_norm": 0.35324522852897644, + "learning_rate": 0.0015, + "loss": 1.4651, + "step": 4150 + }, + { + "epoch": 0.21298382142125744, + "grad_norm": 0.34257858991622925, + "learning_rate": 0.0015, + "loss": 1.4454, + "step": 4160 + }, + { + "epoch": 0.21349580176121236, + "grad_norm": 0.34159529209136963, + "learning_rate": 0.0015, + "loss": 1.4561, + "step": 4170 + }, + { + "epoch": 0.2140077821011673, + "grad_norm": 0.3691791296005249, + "learning_rate": 0.0015, + "loss": 1.4496, + "step": 4180 + }, + { + "epoch": 0.21451976244112225, + "grad_norm": 0.3290902078151703, + "learning_rate": 0.0015, + "loss": 1.4477, + "step": 4190 + }, + { + "epoch": 0.2150317427810772, + "grad_norm": 0.35127583146095276, + "learning_rate": 0.0015, + "loss": 1.4389, + "step": 4200 + }, + { + "epoch": 0.21554372312103215, + "grad_norm": 0.3416004776954651, + "learning_rate": 0.0015, + "loss": 1.4569, + "step": 4210 + }, + { + "epoch": 0.2160557034609871, + "grad_norm": 0.33589133620262146, + "learning_rate": 0.0015, + "loss": 1.4536, + "step": 4220 + }, + { + "epoch": 0.21656768380094205, + "grad_norm": 0.3249707818031311, + "learning_rate": 0.0015, + "loss": 1.4421, + "step": 4230 + }, + { + "epoch": 0.217079664140897, + "grad_norm": 0.3269306719303131, + "learning_rate": 0.0015, + "loss": 1.4644, + "step": 4240 + }, + { + "epoch": 0.21759164448085194, + "grad_norm": 0.34012100100517273, + "learning_rate": 0.0015, + "loss": 1.4419, + "step": 4250 + }, + { + "epoch": 0.2181036248208069, + "grad_norm": 0.3248611390590668, + "learning_rate": 0.0015, + "loss": 1.4321, + "step": 4260 + }, + { + "epoch": 0.21861560516076184, + "grad_norm": 0.33508434891700745, + "learning_rate": 0.0015, + "loss": 1.4547, + "step": 4270 + }, + { + "epoch": 0.21912758550071676, + "grad_norm": 0.3807787597179413, + "learning_rate": 0.0015, + "loss": 1.441, + "step": 4280 + }, + { + "epoch": 0.2196395658406717, + "grad_norm": 0.34403491020202637, + "learning_rate": 0.0015, + "loss": 1.4309, + "step": 4290 + }, + { + "epoch": 0.22015154618062666, + "grad_norm": 0.339507520198822, + "learning_rate": 0.0015, + "loss": 1.4408, + "step": 4300 + }, + { + "epoch": 0.2206635265205816, + "grad_norm": 0.34783267974853516, + "learning_rate": 0.0015, + "loss": 1.4362, + "step": 4310 + }, + { + "epoch": 0.22117550686053655, + "grad_norm": 0.3477760851383209, + "learning_rate": 0.0015, + "loss": 1.4743, + "step": 4320 + }, + { + "epoch": 0.2216874872004915, + "grad_norm": 0.33150288462638855, + "learning_rate": 0.0015, + "loss": 1.4338, + "step": 4330 + }, + { + "epoch": 0.22219946754044645, + "grad_norm": 0.3353327810764313, + "learning_rate": 0.0015, + "loss": 1.4389, + "step": 4340 + }, + { + "epoch": 0.2227114478804014, + "grad_norm": 0.35436680912971497, + "learning_rate": 0.0015, + "loss": 1.4221, + "step": 4350 + }, + { + "epoch": 0.22322342822035635, + "grad_norm": 0.35052821040153503, + "learning_rate": 0.0015, + "loss": 1.4463, + "step": 4360 + }, + { + "epoch": 0.2237354085603113, + "grad_norm": 0.3383365273475647, + "learning_rate": 0.0015, + "loss": 1.4438, + "step": 4370 + }, + { + "epoch": 0.22424738890026624, + "grad_norm": 0.33028966188430786, + "learning_rate": 0.0015, + "loss": 1.4365, + "step": 4380 + }, + { + "epoch": 0.22475936924022116, + "grad_norm": 0.3439690172672272, + "learning_rate": 0.0015, + "loss": 1.434, + "step": 4390 + }, + { + "epoch": 0.2252713495801761, + "grad_norm": 0.3257237374782562, + "learning_rate": 0.0015, + "loss": 1.4268, + "step": 4400 + }, + { + "epoch": 0.22578332992013106, + "grad_norm": 0.34487271308898926, + "learning_rate": 0.0015, + "loss": 1.419, + "step": 4410 + }, + { + "epoch": 0.226295310260086, + "grad_norm": 0.3513702154159546, + "learning_rate": 0.0015, + "loss": 1.416, + "step": 4420 + }, + { + "epoch": 0.22680729060004096, + "grad_norm": 0.32178881764411926, + "learning_rate": 0.0015, + "loss": 1.4267, + "step": 4430 + }, + { + "epoch": 0.2273192709399959, + "grad_norm": 0.32011663913726807, + "learning_rate": 0.0015, + "loss": 1.4269, + "step": 4440 + }, + { + "epoch": 0.22783125127995085, + "grad_norm": 0.3356774151325226, + "learning_rate": 0.0015, + "loss": 1.4253, + "step": 4450 + }, + { + "epoch": 0.2283432316199058, + "grad_norm": 0.33938485383987427, + "learning_rate": 0.0015, + "loss": 1.4137, + "step": 4460 + }, + { + "epoch": 0.22885521195986075, + "grad_norm": 0.3313305675983429, + "learning_rate": 0.0015, + "loss": 1.4178, + "step": 4470 + }, + { + "epoch": 0.2293671922998157, + "grad_norm": 0.31967252492904663, + "learning_rate": 0.0015, + "loss": 1.4421, + "step": 4480 + }, + { + "epoch": 0.22987917263977065, + "grad_norm": 0.3485276401042938, + "learning_rate": 0.0015, + "loss": 1.4202, + "step": 4490 + }, + { + "epoch": 0.23039115297972557, + "grad_norm": 0.3465486764907837, + "learning_rate": 0.0015, + "loss": 1.4364, + "step": 4500 + }, + { + "epoch": 0.23090313331968051, + "grad_norm": 0.3443972170352936, + "learning_rate": 0.0015, + "loss": 1.4326, + "step": 4510 + }, + { + "epoch": 0.23141511365963546, + "grad_norm": 0.33160969614982605, + "learning_rate": 0.0015, + "loss": 1.4147, + "step": 4520 + }, + { + "epoch": 0.2319270939995904, + "grad_norm": 0.3427571952342987, + "learning_rate": 0.0015, + "loss": 1.4316, + "step": 4530 + }, + { + "epoch": 0.23243907433954536, + "grad_norm": 0.3282462954521179, + "learning_rate": 0.0015, + "loss": 1.3933, + "step": 4540 + }, + { + "epoch": 0.2329510546795003, + "grad_norm": 0.3840288519859314, + "learning_rate": 0.0015, + "loss": 1.4206, + "step": 4550 + }, + { + "epoch": 0.23346303501945526, + "grad_norm": 0.34188082814216614, + "learning_rate": 0.0015, + "loss": 1.4286, + "step": 4560 + }, + { + "epoch": 0.2339750153594102, + "grad_norm": 0.32480111718177795, + "learning_rate": 0.0015, + "loss": 1.4191, + "step": 4570 + }, + { + "epoch": 0.23448699569936515, + "grad_norm": 0.3416594862937927, + "learning_rate": 0.0015, + "loss": 1.432, + "step": 4580 + }, + { + "epoch": 0.2349989760393201, + "grad_norm": 0.32898756861686707, + "learning_rate": 0.0015, + "loss": 1.414, + "step": 4590 + }, + { + "epoch": 0.23551095637927505, + "grad_norm": 0.3290642499923706, + "learning_rate": 0.0015, + "loss": 1.4272, + "step": 4600 + }, + { + "epoch": 0.23602293671922997, + "grad_norm": 0.333150178194046, + "learning_rate": 0.0015, + "loss": 1.4254, + "step": 4610 + }, + { + "epoch": 0.23653491705918492, + "grad_norm": 0.30599096417427063, + "learning_rate": 0.0015, + "loss": 1.4255, + "step": 4620 + }, + { + "epoch": 0.23704689739913987, + "grad_norm": 0.34288567304611206, + "learning_rate": 0.0015, + "loss": 1.4027, + "step": 4630 + }, + { + "epoch": 0.23755887773909481, + "grad_norm": 0.36715662479400635, + "learning_rate": 0.0015, + "loss": 1.4155, + "step": 4640 + }, + { + "epoch": 0.23807085807904976, + "grad_norm": 0.32257118821144104, + "learning_rate": 0.0015, + "loss": 1.4178, + "step": 4650 + }, + { + "epoch": 0.2385828384190047, + "grad_norm": 0.3298852741718292, + "learning_rate": 0.0015, + "loss": 1.4149, + "step": 4660 + }, + { + "epoch": 0.23909481875895966, + "grad_norm": 0.32268422842025757, + "learning_rate": 0.0015, + "loss": 1.4384, + "step": 4670 + }, + { + "epoch": 0.2396067990989146, + "grad_norm": 0.33715546131134033, + "learning_rate": 0.0015, + "loss": 1.4014, + "step": 4680 + }, + { + "epoch": 0.24011877943886956, + "grad_norm": 0.3131064772605896, + "learning_rate": 0.0015, + "loss": 1.4163, + "step": 4690 + }, + { + "epoch": 0.2406307597788245, + "grad_norm": 0.3470405042171478, + "learning_rate": 0.0015, + "loss": 1.4186, + "step": 4700 + }, + { + "epoch": 0.24114274011877943, + "grad_norm": 0.35475459694862366, + "learning_rate": 0.0015, + "loss": 1.417, + "step": 4710 + }, + { + "epoch": 0.24165472045873437, + "grad_norm": 0.3337201178073883, + "learning_rate": 0.0015, + "loss": 1.4271, + "step": 4720 + }, + { + "epoch": 0.24216670079868932, + "grad_norm": 0.3554363548755646, + "learning_rate": 0.0015, + "loss": 1.4182, + "step": 4730 + }, + { + "epoch": 0.24267868113864427, + "grad_norm": 0.32346460223197937, + "learning_rate": 0.0015, + "loss": 1.421, + "step": 4740 + }, + { + "epoch": 0.24319066147859922, + "grad_norm": 0.3117121756076813, + "learning_rate": 0.0015, + "loss": 1.4278, + "step": 4750 + }, + { + "epoch": 0.24370264181855417, + "grad_norm": 0.3506932556629181, + "learning_rate": 0.0015, + "loss": 1.3881, + "step": 4760 + }, + { + "epoch": 0.24421462215850911, + "grad_norm": 0.3424610495567322, + "learning_rate": 0.0015, + "loss": 1.4236, + "step": 4770 + }, + { + "epoch": 0.24472660249846406, + "grad_norm": 0.3284012973308563, + "learning_rate": 0.0015, + "loss": 1.4147, + "step": 4780 + }, + { + "epoch": 0.245238582838419, + "grad_norm": 0.3341637849807739, + "learning_rate": 0.0015, + "loss": 1.4109, + "step": 4790 + }, + { + "epoch": 0.24575056317837396, + "grad_norm": 0.32382500171661377, + "learning_rate": 0.0015, + "loss": 1.4063, + "step": 4800 + }, + { + "epoch": 0.2462625435183289, + "grad_norm": 0.3269002437591553, + "learning_rate": 0.0015, + "loss": 1.42, + "step": 4810 + }, + { + "epoch": 0.24677452385828383, + "grad_norm": 0.33705347776412964, + "learning_rate": 0.0015, + "loss": 1.4108, + "step": 4820 + }, + { + "epoch": 0.24728650419823878, + "grad_norm": 0.32141435146331787, + "learning_rate": 0.0015, + "loss": 1.4012, + "step": 4830 + }, + { + "epoch": 0.24779848453819373, + "grad_norm": 0.32620713114738464, + "learning_rate": 0.0015, + "loss": 1.3946, + "step": 4840 + }, + { + "epoch": 0.24831046487814867, + "grad_norm": 0.3150465488433838, + "learning_rate": 0.0015, + "loss": 1.4239, + "step": 4850 + }, + { + "epoch": 0.24882244521810362, + "grad_norm": 0.3141099214553833, + "learning_rate": 0.0015, + "loss": 1.4248, + "step": 4860 + }, + { + "epoch": 0.24933442555805857, + "grad_norm": 0.31802797317504883, + "learning_rate": 0.0015, + "loss": 1.3965, + "step": 4870 + }, + { + "epoch": 0.24984640589801352, + "grad_norm": 0.31748947501182556, + "learning_rate": 0.0015, + "loss": 1.4222, + "step": 4880 + }, + { + "epoch": 0.25035838623796847, + "grad_norm": 0.30938032269477844, + "learning_rate": 0.0015, + "loss": 1.4001, + "step": 4890 + }, + { + "epoch": 0.2508703665779234, + "grad_norm": 0.3129180371761322, + "learning_rate": 0.0015, + "loss": 1.3958, + "step": 4900 + }, + { + "epoch": 0.25138234691787836, + "grad_norm": 0.31602999567985535, + "learning_rate": 0.0015, + "loss": 1.4114, + "step": 4910 + }, + { + "epoch": 0.2518943272578333, + "grad_norm": 0.3049462139606476, + "learning_rate": 0.0015, + "loss": 1.3868, + "step": 4920 + }, + { + "epoch": 0.25240630759778826, + "grad_norm": 0.3103995621204376, + "learning_rate": 0.0015, + "loss": 1.401, + "step": 4930 + }, + { + "epoch": 0.2529182879377432, + "grad_norm": 0.30271056294441223, + "learning_rate": 0.0015, + "loss": 1.4046, + "step": 4940 + }, + { + "epoch": 0.25343026827769816, + "grad_norm": 0.32372725009918213, + "learning_rate": 0.0015, + "loss": 1.3719, + "step": 4950 + }, + { + "epoch": 0.2539422486176531, + "grad_norm": 0.3129730224609375, + "learning_rate": 0.0015, + "loss": 1.3797, + "step": 4960 + }, + { + "epoch": 0.25445422895760805, + "grad_norm": 0.3240148425102234, + "learning_rate": 0.0015, + "loss": 1.4134, + "step": 4970 + }, + { + "epoch": 0.254966209297563, + "grad_norm": 0.30317404866218567, + "learning_rate": 0.0015, + "loss": 1.3894, + "step": 4980 + }, + { + "epoch": 0.2554781896375179, + "grad_norm": 0.33288583159446716, + "learning_rate": 0.0015, + "loss": 1.4132, + "step": 4990 + }, + { + "epoch": 0.25599016997747287, + "grad_norm": 0.3233846127986908, + "learning_rate": 0.0015, + "loss": 1.3762, + "step": 5000 + }, + { + "epoch": 0.2565021503174278, + "grad_norm": 0.30729755759239197, + "learning_rate": 0.0015, + "loss": 1.3975, + "step": 5010 + }, + { + "epoch": 0.25701413065738277, + "grad_norm": 0.3006018400192261, + "learning_rate": 0.0015, + "loss": 1.4047, + "step": 5020 + }, + { + "epoch": 0.2575261109973377, + "grad_norm": 0.3207467794418335, + "learning_rate": 0.0015, + "loss": 1.4084, + "step": 5030 + }, + { + "epoch": 0.25803809133729266, + "grad_norm": 0.3039129674434662, + "learning_rate": 0.0015, + "loss": 1.4209, + "step": 5040 + }, + { + "epoch": 0.2585500716772476, + "grad_norm": 0.29750290513038635, + "learning_rate": 0.0015, + "loss": 1.4156, + "step": 5050 + }, + { + "epoch": 0.25906205201720256, + "grad_norm": 0.314507395029068, + "learning_rate": 0.0015, + "loss": 1.3685, + "step": 5060 + }, + { + "epoch": 0.2595740323571575, + "grad_norm": 0.3176608681678772, + "learning_rate": 0.0015, + "loss": 1.3701, + "step": 5070 + }, + { + "epoch": 0.26008601269711246, + "grad_norm": 0.3273438513278961, + "learning_rate": 0.0015, + "loss": 1.3841, + "step": 5080 + }, + { + "epoch": 0.2605979930370674, + "grad_norm": 0.3173183798789978, + "learning_rate": 0.0015, + "loss": 1.3732, + "step": 5090 + }, + { + "epoch": 0.2611099733770223, + "grad_norm": 0.33317986130714417, + "learning_rate": 0.0015, + "loss": 1.3815, + "step": 5100 + }, + { + "epoch": 0.2616219537169773, + "grad_norm": 0.3045515716075897, + "learning_rate": 0.0015, + "loss": 1.4042, + "step": 5110 + }, + { + "epoch": 0.2621339340569322, + "grad_norm": 0.3056975305080414, + "learning_rate": 0.0015, + "loss": 1.4156, + "step": 5120 + }, + { + "epoch": 0.26264591439688717, + "grad_norm": 0.3231489956378937, + "learning_rate": 0.0015, + "loss": 1.4076, + "step": 5130 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.3215503990650177, + "learning_rate": 0.0015, + "loss": 1.3712, + "step": 5140 + }, + { + "epoch": 0.26366987507679707, + "grad_norm": 0.30379393696784973, + "learning_rate": 0.0015, + "loss": 1.3648, + "step": 5150 + }, + { + "epoch": 0.264181855416752, + "grad_norm": 0.2987072765827179, + "learning_rate": 0.0015, + "loss": 1.3859, + "step": 5160 + }, + { + "epoch": 0.26469383575670696, + "grad_norm": 0.3293174207210541, + "learning_rate": 0.0015, + "loss": 1.3974, + "step": 5170 + }, + { + "epoch": 0.2652058160966619, + "grad_norm": 0.34920957684516907, + "learning_rate": 0.0015, + "loss": 1.3868, + "step": 5180 + }, + { + "epoch": 0.26571779643661686, + "grad_norm": 0.3054308295249939, + "learning_rate": 0.0015, + "loss": 1.3838, + "step": 5190 + }, + { + "epoch": 0.2662297767765718, + "grad_norm": 0.3131832182407379, + "learning_rate": 0.0015, + "loss": 1.377, + "step": 5200 + }, + { + "epoch": 0.2667417571165267, + "grad_norm": 0.30868205428123474, + "learning_rate": 0.0015, + "loss": 1.3999, + "step": 5210 + }, + { + "epoch": 0.2672537374564817, + "grad_norm": 0.3193263113498688, + "learning_rate": 0.0015, + "loss": 1.3789, + "step": 5220 + }, + { + "epoch": 0.2677657177964366, + "grad_norm": 0.3142963945865631, + "learning_rate": 0.0015, + "loss": 1.3993, + "step": 5230 + }, + { + "epoch": 0.2682776981363916, + "grad_norm": 0.3012097179889679, + "learning_rate": 0.0015, + "loss": 1.3959, + "step": 5240 + }, + { + "epoch": 0.2687896784763465, + "grad_norm": 0.30580368638038635, + "learning_rate": 0.0015, + "loss": 1.4106, + "step": 5250 + }, + { + "epoch": 0.26930165881630147, + "grad_norm": 0.2862599790096283, + "learning_rate": 0.0015, + "loss": 1.3873, + "step": 5260 + }, + { + "epoch": 0.2698136391562564, + "grad_norm": 0.3221125602722168, + "learning_rate": 0.0015, + "loss": 1.3997, + "step": 5270 + }, + { + "epoch": 0.27032561949621137, + "grad_norm": 0.29167062044143677, + "learning_rate": 0.0015, + "loss": 1.3707, + "step": 5280 + }, + { + "epoch": 0.2708375998361663, + "grad_norm": 0.3372457027435303, + "learning_rate": 0.0015, + "loss": 1.3767, + "step": 5290 + }, + { + "epoch": 0.27134958017612126, + "grad_norm": 0.308940589427948, + "learning_rate": 0.0015, + "loss": 1.377, + "step": 5300 + }, + { + "epoch": 0.2718615605160762, + "grad_norm": 0.2946240305900574, + "learning_rate": 0.0015, + "loss": 1.3811, + "step": 5310 + }, + { + "epoch": 0.2723735408560311, + "grad_norm": 0.30118903517723083, + "learning_rate": 0.0015, + "loss": 1.3991, + "step": 5320 + }, + { + "epoch": 0.2728855211959861, + "grad_norm": 0.3128001093864441, + "learning_rate": 0.0015, + "loss": 1.3806, + "step": 5330 + }, + { + "epoch": 0.273397501535941, + "grad_norm": 0.3355924189090729, + "learning_rate": 0.0015, + "loss": 1.378, + "step": 5340 + }, + { + "epoch": 0.273909481875896, + "grad_norm": 0.29809674620628357, + "learning_rate": 0.0015, + "loss": 1.365, + "step": 5350 + }, + { + "epoch": 0.2744214622158509, + "grad_norm": 0.2897878885269165, + "learning_rate": 0.0015, + "loss": 1.3796, + "step": 5360 + }, + { + "epoch": 0.2749334425558059, + "grad_norm": 0.33131879568099976, + "learning_rate": 0.0015, + "loss": 1.3789, + "step": 5370 + }, + { + "epoch": 0.2754454228957608, + "grad_norm": 0.3270549476146698, + "learning_rate": 0.0015, + "loss": 1.3877, + "step": 5380 + }, + { + "epoch": 0.27595740323571577, + "grad_norm": 0.3001706898212433, + "learning_rate": 0.0015, + "loss": 1.376, + "step": 5390 + }, + { + "epoch": 0.2764693835756707, + "grad_norm": 0.3149849772453308, + "learning_rate": 0.0015, + "loss": 1.3815, + "step": 5400 + }, + { + "epoch": 0.27698136391562567, + "grad_norm": 0.28992435336112976, + "learning_rate": 0.0015, + "loss": 1.3731, + "step": 5410 + }, + { + "epoch": 0.2774933442555806, + "grad_norm": 0.295311838388443, + "learning_rate": 0.0015, + "loss": 1.3958, + "step": 5420 + }, + { + "epoch": 0.2780053245955355, + "grad_norm": 0.2988681495189667, + "learning_rate": 0.0015, + "loss": 1.3946, + "step": 5430 + }, + { + "epoch": 0.2785173049354905, + "grad_norm": 0.3085227608680725, + "learning_rate": 0.0015, + "loss": 1.3776, + "step": 5440 + }, + { + "epoch": 0.2790292852754454, + "grad_norm": 0.30014750361442566, + "learning_rate": 0.0015, + "loss": 1.3772, + "step": 5450 + }, + { + "epoch": 0.2795412656154004, + "grad_norm": 0.3058876693248749, + "learning_rate": 0.0015, + "loss": 1.3637, + "step": 5460 + }, + { + "epoch": 0.2800532459553553, + "grad_norm": 0.2952674925327301, + "learning_rate": 0.0015, + "loss": 1.3888, + "step": 5470 + }, + { + "epoch": 0.2805652262953103, + "grad_norm": 0.3016969561576843, + "learning_rate": 0.0015, + "loss": 1.3874, + "step": 5480 + }, + { + "epoch": 0.2810772066352652, + "grad_norm": 0.30375874042510986, + "learning_rate": 0.0015, + "loss": 1.3652, + "step": 5490 + }, + { + "epoch": 0.2815891869752202, + "grad_norm": 0.29380300641059875, + "learning_rate": 0.0015, + "loss": 1.3768, + "step": 5500 + }, + { + "epoch": 0.2821011673151751, + "grad_norm": 0.2994033992290497, + "learning_rate": 0.0015, + "loss": 1.376, + "step": 5510 + }, + { + "epoch": 0.28261314765513007, + "grad_norm": 0.3174065053462982, + "learning_rate": 0.0015, + "loss": 1.3873, + "step": 5520 + }, + { + "epoch": 0.283125127995085, + "grad_norm": 0.3069535791873932, + "learning_rate": 0.0015, + "loss": 1.3636, + "step": 5530 + }, + { + "epoch": 0.2836371083350399, + "grad_norm": 0.2826645076274872, + "learning_rate": 0.0015, + "loss": 1.3567, + "step": 5540 + }, + { + "epoch": 0.2841490886749949, + "grad_norm": 0.295926034450531, + "learning_rate": 0.0015, + "loss": 1.361, + "step": 5550 + }, + { + "epoch": 0.2846610690149498, + "grad_norm": 0.29257112741470337, + "learning_rate": 0.0015, + "loss": 1.3699, + "step": 5560 + }, + { + "epoch": 0.2851730493549048, + "grad_norm": 0.28169023990631104, + "learning_rate": 0.0015, + "loss": 1.353, + "step": 5570 + }, + { + "epoch": 0.2856850296948597, + "grad_norm": 0.31054553389549255, + "learning_rate": 0.0015, + "loss": 1.3955, + "step": 5580 + }, + { + "epoch": 0.2861970100348147, + "grad_norm": 0.28373947739601135, + "learning_rate": 0.0015, + "loss": 1.3843, + "step": 5590 + }, + { + "epoch": 0.2867089903747696, + "grad_norm": 0.29920247197151184, + "learning_rate": 0.0015, + "loss": 1.3588, + "step": 5600 + }, + { + "epoch": 0.2872209707147246, + "grad_norm": 0.2981637120246887, + "learning_rate": 0.0015, + "loss": 1.376, + "step": 5610 + }, + { + "epoch": 0.2877329510546795, + "grad_norm": 0.269811749458313, + "learning_rate": 0.0015, + "loss": 1.3733, + "step": 5620 + }, + { + "epoch": 0.2882449313946345, + "grad_norm": 0.28365617990493774, + "learning_rate": 0.0015, + "loss": 1.3376, + "step": 5630 + }, + { + "epoch": 0.2887569117345894, + "grad_norm": 0.2953552305698395, + "learning_rate": 0.0015, + "loss": 1.367, + "step": 5640 + }, + { + "epoch": 0.2892688920745443, + "grad_norm": 0.2910911440849304, + "learning_rate": 0.0015, + "loss": 1.3708, + "step": 5650 + }, + { + "epoch": 0.2897808724144993, + "grad_norm": 0.2998880445957184, + "learning_rate": 0.0015, + "loss": 1.3917, + "step": 5660 + }, + { + "epoch": 0.2902928527544542, + "grad_norm": 0.3000008165836334, + "learning_rate": 0.0015, + "loss": 1.3597, + "step": 5670 + }, + { + "epoch": 0.2908048330944092, + "grad_norm": 0.3019564747810364, + "learning_rate": 0.0015, + "loss": 1.3641, + "step": 5680 + }, + { + "epoch": 0.2913168134343641, + "grad_norm": 0.28087547421455383, + "learning_rate": 0.0015, + "loss": 1.3427, + "step": 5690 + }, + { + "epoch": 0.2918287937743191, + "grad_norm": 0.32179591059684753, + "learning_rate": 0.0015, + "loss": 1.3576, + "step": 5700 + }, + { + "epoch": 0.292340774114274, + "grad_norm": 0.30196836590766907, + "learning_rate": 0.0015, + "loss": 1.3866, + "step": 5710 + }, + { + "epoch": 0.292852754454229, + "grad_norm": 0.29928138852119446, + "learning_rate": 0.0015, + "loss": 1.3711, + "step": 5720 + }, + { + "epoch": 0.2933647347941839, + "grad_norm": 0.30917906761169434, + "learning_rate": 0.0015, + "loss": 1.3481, + "step": 5730 + }, + { + "epoch": 0.2938767151341388, + "grad_norm": 0.32579630613327026, + "learning_rate": 0.0015, + "loss": 1.3713, + "step": 5740 + }, + { + "epoch": 0.2943886954740938, + "grad_norm": 0.3042047321796417, + "learning_rate": 0.0015, + "loss": 1.3758, + "step": 5750 + }, + { + "epoch": 0.2949006758140487, + "grad_norm": 0.2910909354686737, + "learning_rate": 0.0015, + "loss": 1.3675, + "step": 5760 + }, + { + "epoch": 0.2954126561540037, + "grad_norm": 0.29718905687332153, + "learning_rate": 0.0015, + "loss": 1.3576, + "step": 5770 + }, + { + "epoch": 0.2959246364939586, + "grad_norm": 0.28392040729522705, + "learning_rate": 0.0015, + "loss": 1.3779, + "step": 5780 + }, + { + "epoch": 0.2964366168339136, + "grad_norm": 0.2852902114391327, + "learning_rate": 0.0015, + "loss": 1.3709, + "step": 5790 + }, + { + "epoch": 0.2969485971738685, + "grad_norm": 0.29683250188827515, + "learning_rate": 0.0015, + "loss": 1.3757, + "step": 5800 + }, + { + "epoch": 0.2974605775138235, + "grad_norm": 0.2882269620895386, + "learning_rate": 0.0015, + "loss": 1.3706, + "step": 5810 + }, + { + "epoch": 0.2979725578537784, + "grad_norm": 0.3086804449558258, + "learning_rate": 0.0015, + "loss": 1.3506, + "step": 5820 + }, + { + "epoch": 0.2984845381937334, + "grad_norm": 0.2780090868473053, + "learning_rate": 0.0015, + "loss": 1.3565, + "step": 5830 + }, + { + "epoch": 0.2989965185336883, + "grad_norm": 0.30415329337120056, + "learning_rate": 0.0015, + "loss": 1.3593, + "step": 5840 + }, + { + "epoch": 0.2995084988736432, + "grad_norm": 0.2865590751171112, + "learning_rate": 0.0015, + "loss": 1.3873, + "step": 5850 + }, + { + "epoch": 0.3000204792135982, + "grad_norm": 0.2798267900943756, + "learning_rate": 0.0015, + "loss": 1.3439, + "step": 5860 + }, + { + "epoch": 0.3005324595535531, + "grad_norm": 0.29937195777893066, + "learning_rate": 0.0015, + "loss": 1.3483, + "step": 5870 + }, + { + "epoch": 0.3010444398935081, + "grad_norm": 0.27708205580711365, + "learning_rate": 0.0015, + "loss": 1.3207, + "step": 5880 + }, + { + "epoch": 0.301556420233463, + "grad_norm": 0.2955605983734131, + "learning_rate": 0.0015, + "loss": 1.3524, + "step": 5890 + }, + { + "epoch": 0.302068400573418, + "grad_norm": 0.3226946294307709, + "learning_rate": 0.0015, + "loss": 1.3545, + "step": 5900 + }, + { + "epoch": 0.3025803809133729, + "grad_norm": 0.2925417721271515, + "learning_rate": 0.0015, + "loss": 1.3435, + "step": 5910 + }, + { + "epoch": 0.3030923612533279, + "grad_norm": 0.3087621331214905, + "learning_rate": 0.0015, + "loss": 1.3275, + "step": 5920 + }, + { + "epoch": 0.3036043415932828, + "grad_norm": 0.2996879518032074, + "learning_rate": 0.0015, + "loss": 1.3514, + "step": 5930 + }, + { + "epoch": 0.3041163219332378, + "grad_norm": 0.3085525333881378, + "learning_rate": 0.0015, + "loss": 1.3539, + "step": 5940 + }, + { + "epoch": 0.3046283022731927, + "grad_norm": 0.28985559940338135, + "learning_rate": 0.0015, + "loss": 1.3661, + "step": 5950 + }, + { + "epoch": 0.30514028261314763, + "grad_norm": 0.2889237701892853, + "learning_rate": 0.0015, + "loss": 1.3622, + "step": 5960 + }, + { + "epoch": 0.3056522629531026, + "grad_norm": 0.3278009593486786, + "learning_rate": 0.0015, + "loss": 1.3438, + "step": 5970 + }, + { + "epoch": 0.3061642432930575, + "grad_norm": 0.2967126965522766, + "learning_rate": 0.0015, + "loss": 1.3752, + "step": 5980 + }, + { + "epoch": 0.3066762236330125, + "grad_norm": 0.2810833752155304, + "learning_rate": 0.0015, + "loss": 1.3673, + "step": 5990 + }, + { + "epoch": 0.3071882039729674, + "grad_norm": 0.2842026650905609, + "learning_rate": 0.0015, + "loss": 1.3315, + "step": 6000 + }, + { + "epoch": 0.3077001843129224, + "grad_norm": 0.2904771864414215, + "learning_rate": 0.0015, + "loss": 1.3551, + "step": 6010 + }, + { + "epoch": 0.3082121646528773, + "grad_norm": 0.2798822224140167, + "learning_rate": 0.0015, + "loss": 1.374, + "step": 6020 + }, + { + "epoch": 0.3087241449928323, + "grad_norm": 0.2831931412220001, + "learning_rate": 0.0015, + "loss": 1.3449, + "step": 6030 + }, + { + "epoch": 0.3092361253327872, + "grad_norm": 0.27797648310661316, + "learning_rate": 0.0015, + "loss": 1.3427, + "step": 6040 + }, + { + "epoch": 0.3097481056727422, + "grad_norm": 0.2972757816314697, + "learning_rate": 0.0015, + "loss": 1.3498, + "step": 6050 + }, + { + "epoch": 0.3102600860126971, + "grad_norm": 0.2661411166191101, + "learning_rate": 0.0015, + "loss": 1.3391, + "step": 6060 + }, + { + "epoch": 0.31077206635265203, + "grad_norm": 0.2736954689025879, + "learning_rate": 0.0015, + "loss": 1.3637, + "step": 6070 + }, + { + "epoch": 0.311284046692607, + "grad_norm": 0.27739083766937256, + "learning_rate": 0.0015, + "loss": 1.3432, + "step": 6080 + }, + { + "epoch": 0.31179602703256193, + "grad_norm": 0.275734543800354, + "learning_rate": 0.0015, + "loss": 1.3523, + "step": 6090 + }, + { + "epoch": 0.3123080073725169, + "grad_norm": 0.29389500617980957, + "learning_rate": 0.0015, + "loss": 1.3566, + "step": 6100 + }, + { + "epoch": 0.3128199877124718, + "grad_norm": 0.3517824113368988, + "learning_rate": 0.0015, + "loss": 1.3401, + "step": 6110 + }, + { + "epoch": 0.3133319680524268, + "grad_norm": 0.2847048342227936, + "learning_rate": 0.0015, + "loss": 1.3345, + "step": 6120 + }, + { + "epoch": 0.3138439483923817, + "grad_norm": 0.2781658470630646, + "learning_rate": 0.0015, + "loss": 1.3165, + "step": 6130 + }, + { + "epoch": 0.3143559287323367, + "grad_norm": 0.27928218245506287, + "learning_rate": 0.0015, + "loss": 1.3419, + "step": 6140 + }, + { + "epoch": 0.3148679090722916, + "grad_norm": 0.29375484585762024, + "learning_rate": 0.0015, + "loss": 1.3424, + "step": 6150 + }, + { + "epoch": 0.3153798894122466, + "grad_norm": 0.2773997187614441, + "learning_rate": 0.0015, + "loss": 1.3153, + "step": 6160 + }, + { + "epoch": 0.3158918697522015, + "grad_norm": 0.2810317277908325, + "learning_rate": 0.0015, + "loss": 1.3633, + "step": 6170 + }, + { + "epoch": 0.31640385009215644, + "grad_norm": 0.2810805141925812, + "learning_rate": 0.0015, + "loss": 1.3388, + "step": 6180 + }, + { + "epoch": 0.3169158304321114, + "grad_norm": 0.27900010347366333, + "learning_rate": 0.0015, + "loss": 1.3494, + "step": 6190 + }, + { + "epoch": 0.31742781077206633, + "grad_norm": 0.2763247787952423, + "learning_rate": 0.0015, + "loss": 1.347, + "step": 6200 + }, + { + "epoch": 0.3179397911120213, + "grad_norm": 0.27593132853507996, + "learning_rate": 0.0015, + "loss": 1.3286, + "step": 6210 + }, + { + "epoch": 0.31845177145197623, + "grad_norm": 0.2928100526332855, + "learning_rate": 0.0015, + "loss": 1.3485, + "step": 6220 + }, + { + "epoch": 0.3189637517919312, + "grad_norm": 0.2809889316558838, + "learning_rate": 0.0015, + "loss": 1.3318, + "step": 6230 + }, + { + "epoch": 0.3194757321318861, + "grad_norm": 0.2984907329082489, + "learning_rate": 0.0015, + "loss": 1.3474, + "step": 6240 + }, + { + "epoch": 0.3199877124718411, + "grad_norm": 0.2861260771751404, + "learning_rate": 0.0015, + "loss": 1.3308, + "step": 6250 + }, + { + "epoch": 0.320499692811796, + "grad_norm": 0.30209678411483765, + "learning_rate": 0.0015, + "loss": 1.3438, + "step": 6260 + }, + { + "epoch": 0.321011673151751, + "grad_norm": 0.27839919924736023, + "learning_rate": 0.0015, + "loss": 1.3606, + "step": 6270 + }, + { + "epoch": 0.3215236534917059, + "grad_norm": 0.27120068669319153, + "learning_rate": 0.0015, + "loss": 1.3291, + "step": 6280 + }, + { + "epoch": 0.32203563383166084, + "grad_norm": 0.2891988158226013, + "learning_rate": 0.0015, + "loss": 1.3483, + "step": 6290 + }, + { + "epoch": 0.3225476141716158, + "grad_norm": 0.3099561929702759, + "learning_rate": 0.0015, + "loss": 1.3538, + "step": 6300 + }, + { + "epoch": 0.32305959451157074, + "grad_norm": 0.28136762976646423, + "learning_rate": 0.0015, + "loss": 1.344, + "step": 6310 + }, + { + "epoch": 0.3235715748515257, + "grad_norm": 0.27209803462028503, + "learning_rate": 0.0015, + "loss": 1.3395, + "step": 6320 + }, + { + "epoch": 0.32408355519148063, + "grad_norm": 0.2847345173358917, + "learning_rate": 0.0015, + "loss": 1.3278, + "step": 6330 + }, + { + "epoch": 0.3245955355314356, + "grad_norm": 0.29409244656562805, + "learning_rate": 0.0015, + "loss": 1.352, + "step": 6340 + }, + { + "epoch": 0.32510751587139053, + "grad_norm": 0.26782944798469543, + "learning_rate": 0.0015, + "loss": 1.3211, + "step": 6350 + }, + { + "epoch": 0.3256194962113455, + "grad_norm": 0.27680841088294983, + "learning_rate": 0.0015, + "loss": 1.3168, + "step": 6360 + }, + { + "epoch": 0.3261314765513004, + "grad_norm": 0.28913265466690063, + "learning_rate": 0.0015, + "loss": 1.3412, + "step": 6370 + }, + { + "epoch": 0.3266434568912554, + "grad_norm": 0.2598094046115875, + "learning_rate": 0.0015, + "loss": 1.3235, + "step": 6380 + }, + { + "epoch": 0.3271554372312103, + "grad_norm": 0.2622967064380646, + "learning_rate": 0.0015, + "loss": 1.3353, + "step": 6390 + }, + { + "epoch": 0.32766741757116524, + "grad_norm": 0.2802422046661377, + "learning_rate": 0.0015, + "loss": 1.3278, + "step": 6400 + }, + { + "epoch": 0.3281793979111202, + "grad_norm": 0.2863336503505707, + "learning_rate": 0.0015, + "loss": 1.3421, + "step": 6410 + }, + { + "epoch": 0.32869137825107514, + "grad_norm": 0.28782033920288086, + "learning_rate": 0.0015, + "loss": 1.3395, + "step": 6420 + }, + { + "epoch": 0.3292033585910301, + "grad_norm": 0.2650611698627472, + "learning_rate": 0.0015, + "loss": 1.3461, + "step": 6430 + }, + { + "epoch": 0.32971533893098504, + "grad_norm": 0.28210777044296265, + "learning_rate": 0.0015, + "loss": 1.3452, + "step": 6440 + }, + { + "epoch": 0.33022731927094, + "grad_norm": 0.29541024565696716, + "learning_rate": 0.0015, + "loss": 1.3304, + "step": 6450 + }, + { + "epoch": 0.33073929961089493, + "grad_norm": 0.27473190426826477, + "learning_rate": 0.0015, + "loss": 1.3277, + "step": 6460 + }, + { + "epoch": 0.3312512799508499, + "grad_norm": 0.2899293005466461, + "learning_rate": 0.0015, + "loss": 1.3193, + "step": 6470 + }, + { + "epoch": 0.33176326029080483, + "grad_norm": 0.2961236834526062, + "learning_rate": 0.0015, + "loss": 1.3252, + "step": 6480 + }, + { + "epoch": 0.3322752406307598, + "grad_norm": 0.2859441637992859, + "learning_rate": 0.0015, + "loss": 1.3327, + "step": 6490 + }, + { + "epoch": 0.3327872209707147, + "grad_norm": 0.26721256971359253, + "learning_rate": 0.0015, + "loss": 1.344, + "step": 6500 + }, + { + "epoch": 0.33329920131066965, + "grad_norm": 0.27258962392807007, + "learning_rate": 0.0015, + "loss": 1.3291, + "step": 6510 + }, + { + "epoch": 0.3338111816506246, + "grad_norm": 0.2868225872516632, + "learning_rate": 0.0015, + "loss": 1.3542, + "step": 6520 + }, + { + "epoch": 0.33432316199057954, + "grad_norm": 0.27058276534080505, + "learning_rate": 0.0015, + "loss": 1.3428, + "step": 6530 + }, + { + "epoch": 0.3348351423305345, + "grad_norm": 0.2648937404155731, + "learning_rate": 0.0015, + "loss": 1.3345, + "step": 6540 + }, + { + "epoch": 0.33534712267048944, + "grad_norm": 0.2588028609752655, + "learning_rate": 0.0015, + "loss": 1.3163, + "step": 6550 + }, + { + "epoch": 0.3358591030104444, + "grad_norm": 0.2773786783218384, + "learning_rate": 0.0015, + "loss": 1.3353, + "step": 6560 + }, + { + "epoch": 0.33637108335039934, + "grad_norm": 0.2635444402694702, + "learning_rate": 0.0015, + "loss": 1.3073, + "step": 6570 + }, + { + "epoch": 0.3368830636903543, + "grad_norm": 0.28633764386177063, + "learning_rate": 0.0015, + "loss": 1.3085, + "step": 6580 + }, + { + "epoch": 0.33739504403030923, + "grad_norm": 0.29486966133117676, + "learning_rate": 0.0015, + "loss": 1.3316, + "step": 6590 + }, + { + "epoch": 0.3379070243702642, + "grad_norm": 0.2629407048225403, + "learning_rate": 0.0015, + "loss": 1.3319, + "step": 6600 + }, + { + "epoch": 0.33841900471021913, + "grad_norm": 0.2779609262943268, + "learning_rate": 0.0015, + "loss": 1.3043, + "step": 6610 + }, + { + "epoch": 0.33893098505017405, + "grad_norm": 0.2911774218082428, + "learning_rate": 0.0015, + "loss": 1.361, + "step": 6620 + }, + { + "epoch": 0.339442965390129, + "grad_norm": 0.26540687680244446, + "learning_rate": 0.0015, + "loss": 1.3095, + "step": 6630 + }, + { + "epoch": 0.33995494573008395, + "grad_norm": 0.27710777521133423, + "learning_rate": 0.0015, + "loss": 1.3173, + "step": 6640 + }, + { + "epoch": 0.3404669260700389, + "grad_norm": 0.2614011764526367, + "learning_rate": 0.0015, + "loss": 1.3178, + "step": 6650 + }, + { + "epoch": 0.34097890640999384, + "grad_norm": 0.2797437906265259, + "learning_rate": 0.0015, + "loss": 1.3287, + "step": 6660 + }, + { + "epoch": 0.3414908867499488, + "grad_norm": 0.28846311569213867, + "learning_rate": 0.0015, + "loss": 1.3222, + "step": 6670 + }, + { + "epoch": 0.34200286708990374, + "grad_norm": 0.2507641911506653, + "learning_rate": 0.0015, + "loss": 1.3297, + "step": 6680 + }, + { + "epoch": 0.3425148474298587, + "grad_norm": 0.277458518743515, + "learning_rate": 0.0015, + "loss": 1.3092, + "step": 6690 + }, + { + "epoch": 0.34302682776981364, + "grad_norm": 0.28139162063598633, + "learning_rate": 0.0015, + "loss": 1.3509, + "step": 6700 + }, + { + "epoch": 0.3435388081097686, + "grad_norm": 0.26460030674934387, + "learning_rate": 0.0015, + "loss": 1.3357, + "step": 6710 + }, + { + "epoch": 0.34405078844972353, + "grad_norm": 0.2602977752685547, + "learning_rate": 0.0015, + "loss": 1.3375, + "step": 6720 + }, + { + "epoch": 0.34456276878967845, + "grad_norm": 0.3062650263309479, + "learning_rate": 0.0015, + "loss": 1.3225, + "step": 6730 + }, + { + "epoch": 0.34507474912963343, + "grad_norm": 0.27152612805366516, + "learning_rate": 0.0015, + "loss": 1.3326, + "step": 6740 + }, + { + "epoch": 0.34558672946958835, + "grad_norm": 0.2585943341255188, + "learning_rate": 0.0015, + "loss": 1.3275, + "step": 6750 + }, + { + "epoch": 0.3460987098095433, + "grad_norm": 0.2826108932495117, + "learning_rate": 0.0015, + "loss": 1.3143, + "step": 6760 + }, + { + "epoch": 0.34661069014949825, + "grad_norm": 0.2719128131866455, + "learning_rate": 0.0015, + "loss": 1.3136, + "step": 6770 + }, + { + "epoch": 0.3471226704894532, + "grad_norm": 0.2605542540550232, + "learning_rate": 0.0015, + "loss": 1.3207, + "step": 6780 + }, + { + "epoch": 0.34763465082940814, + "grad_norm": 0.26649779081344604, + "learning_rate": 0.0015, + "loss": 1.304, + "step": 6790 + }, + { + "epoch": 0.3481466311693631, + "grad_norm": 0.28349971771240234, + "learning_rate": 0.0015, + "loss": 1.3176, + "step": 6800 + }, + { + "epoch": 0.34865861150931804, + "grad_norm": 0.27145761251449585, + "learning_rate": 0.0015, + "loss": 1.3294, + "step": 6810 + }, + { + "epoch": 0.349170591849273, + "grad_norm": 0.26513341069221497, + "learning_rate": 0.0015, + "loss": 1.3299, + "step": 6820 + }, + { + "epoch": 0.34968257218922794, + "grad_norm": 0.2701232135295868, + "learning_rate": 0.0015, + "loss": 1.3028, + "step": 6830 + }, + { + "epoch": 0.35019455252918286, + "grad_norm": 0.27336186170578003, + "learning_rate": 0.0015, + "loss": 1.3253, + "step": 6840 + }, + { + "epoch": 0.35070653286913783, + "grad_norm": 0.26006847620010376, + "learning_rate": 0.0015, + "loss": 1.3097, + "step": 6850 + }, + { + "epoch": 0.35121851320909275, + "grad_norm": 0.2867346405982971, + "learning_rate": 0.0015, + "loss": 1.3489, + "step": 6860 + }, + { + "epoch": 0.35173049354904773, + "grad_norm": 0.2665490210056305, + "learning_rate": 0.0015, + "loss": 1.3029, + "step": 6870 + }, + { + "epoch": 0.35224247388900265, + "grad_norm": 0.26250341534614563, + "learning_rate": 0.0015, + "loss": 1.324, + "step": 6880 + }, + { + "epoch": 0.3527544542289576, + "grad_norm": 0.27404358983039856, + "learning_rate": 0.0015, + "loss": 1.3222, + "step": 6890 + }, + { + "epoch": 0.35326643456891255, + "grad_norm": 0.271932989358902, + "learning_rate": 0.0015, + "loss": 1.3068, + "step": 6900 + }, + { + "epoch": 0.3537784149088675, + "grad_norm": 0.25479060411453247, + "learning_rate": 0.0015, + "loss": 1.3143, + "step": 6910 + }, + { + "epoch": 0.35429039524882244, + "grad_norm": 0.2571351230144501, + "learning_rate": 0.0015, + "loss": 1.2886, + "step": 6920 + }, + { + "epoch": 0.35480237558877736, + "grad_norm": 0.2612917125225067, + "learning_rate": 0.0015, + "loss": 1.3199, + "step": 6930 + }, + { + "epoch": 0.35531435592873234, + "grad_norm": 0.2573522925376892, + "learning_rate": 0.0015, + "loss": 1.3143, + "step": 6940 + }, + { + "epoch": 0.35582633626868726, + "grad_norm": 0.2598212659358978, + "learning_rate": 0.0015, + "loss": 1.3039, + "step": 6950 + }, + { + "epoch": 0.35633831660864224, + "grad_norm": 0.2575034201145172, + "learning_rate": 0.0015, + "loss": 1.3095, + "step": 6960 + }, + { + "epoch": 0.35685029694859716, + "grad_norm": 0.2559545636177063, + "learning_rate": 0.0015, + "loss": 1.2971, + "step": 6970 + }, + { + "epoch": 0.35736227728855213, + "grad_norm": 0.26087066531181335, + "learning_rate": 0.0015, + "loss": 1.3023, + "step": 6980 + }, + { + "epoch": 0.35787425762850705, + "grad_norm": 0.2606737017631531, + "learning_rate": 0.0015, + "loss": 1.3098, + "step": 6990 + }, + { + "epoch": 0.35838623796846203, + "grad_norm": 0.27495986223220825, + "learning_rate": 0.0015, + "loss": 1.3249, + "step": 7000 + }, + { + "epoch": 0.35889821830841695, + "grad_norm": 0.25473734736442566, + "learning_rate": 0.0015, + "loss": 1.3253, + "step": 7010 + }, + { + "epoch": 0.3594101986483719, + "grad_norm": 0.2764824330806732, + "learning_rate": 0.0015, + "loss": 1.3101, + "step": 7020 + }, + { + "epoch": 0.35992217898832685, + "grad_norm": 0.27935823798179626, + "learning_rate": 0.0015, + "loss": 1.3268, + "step": 7030 + }, + { + "epoch": 0.36043415932828177, + "grad_norm": 0.26057881116867065, + "learning_rate": 0.0015, + "loss": 1.2999, + "step": 7040 + }, + { + "epoch": 0.36094613966823674, + "grad_norm": 0.27014756202697754, + "learning_rate": 0.0015, + "loss": 1.3083, + "step": 7050 + }, + { + "epoch": 0.36145812000819166, + "grad_norm": 0.26150983572006226, + "learning_rate": 0.0015, + "loss": 1.3059, + "step": 7060 + }, + { + "epoch": 0.36197010034814664, + "grad_norm": 0.2634667158126831, + "learning_rate": 0.0015, + "loss": 1.3325, + "step": 7070 + }, + { + "epoch": 0.36248208068810156, + "grad_norm": 0.2591879665851593, + "learning_rate": 0.0015, + "loss": 1.3004, + "step": 7080 + }, + { + "epoch": 0.36299406102805654, + "grad_norm": 0.27941566705703735, + "learning_rate": 0.0015, + "loss": 1.3216, + "step": 7090 + }, + { + "epoch": 0.36350604136801146, + "grad_norm": 0.2634701430797577, + "learning_rate": 0.0015, + "loss": 1.3043, + "step": 7100 + }, + { + "epoch": 0.36401802170796643, + "grad_norm": 0.2601988613605499, + "learning_rate": 0.0015, + "loss": 1.3128, + "step": 7110 + }, + { + "epoch": 0.36453000204792135, + "grad_norm": 0.2701079249382019, + "learning_rate": 0.0015, + "loss": 1.2908, + "step": 7120 + }, + { + "epoch": 0.36504198238787633, + "grad_norm": 0.2694578170776367, + "learning_rate": 0.0015, + "loss": 1.303, + "step": 7130 + }, + { + "epoch": 0.36555396272783125, + "grad_norm": 0.2465587705373764, + "learning_rate": 0.0015, + "loss": 1.3177, + "step": 7140 + }, + { + "epoch": 0.36606594306778617, + "grad_norm": 0.26136472821235657, + "learning_rate": 0.0015, + "loss": 1.3112, + "step": 7150 + }, + { + "epoch": 0.36657792340774115, + "grad_norm": 0.2548895478248596, + "learning_rate": 0.0015, + "loss": 1.3114, + "step": 7160 + }, + { + "epoch": 0.36708990374769607, + "grad_norm": 0.2586556673049927, + "learning_rate": 0.0015, + "loss": 1.3076, + "step": 7170 + }, + { + "epoch": 0.36760188408765104, + "grad_norm": 0.25887277722358704, + "learning_rate": 0.0015, + "loss": 1.3217, + "step": 7180 + }, + { + "epoch": 0.36811386442760596, + "grad_norm": 0.2628803253173828, + "learning_rate": 0.0015, + "loss": 1.3012, + "step": 7190 + }, + { + "epoch": 0.36862584476756094, + "grad_norm": 0.2630269527435303, + "learning_rate": 0.0015, + "loss": 1.3187, + "step": 7200 + }, + { + "epoch": 0.36913782510751586, + "grad_norm": 0.2589748501777649, + "learning_rate": 0.0015, + "loss": 1.2885, + "step": 7210 + }, + { + "epoch": 0.36964980544747084, + "grad_norm": 0.262361615896225, + "learning_rate": 0.0015, + "loss": 1.2962, + "step": 7220 + }, + { + "epoch": 0.37016178578742576, + "grad_norm": 0.24950037896633148, + "learning_rate": 0.0015, + "loss": 1.3026, + "step": 7230 + }, + { + "epoch": 0.37067376612738073, + "grad_norm": 0.2537461817264557, + "learning_rate": 0.0015, + "loss": 1.2971, + "step": 7240 + }, + { + "epoch": 0.37118574646733565, + "grad_norm": 0.25920331478118896, + "learning_rate": 0.0015, + "loss": 1.2951, + "step": 7250 + }, + { + "epoch": 0.3716977268072906, + "grad_norm": 0.2526357173919678, + "learning_rate": 0.0015, + "loss": 1.2989, + "step": 7260 + }, + { + "epoch": 0.37220970714724555, + "grad_norm": 0.28876397013664246, + "learning_rate": 0.0015, + "loss": 1.3063, + "step": 7270 + }, + { + "epoch": 0.37272168748720047, + "grad_norm": 0.27300864458084106, + "learning_rate": 0.0015, + "loss": 1.2954, + "step": 7280 + }, + { + "epoch": 0.37323366782715545, + "grad_norm": 0.26332223415374756, + "learning_rate": 0.0015, + "loss": 1.3329, + "step": 7290 + }, + { + "epoch": 0.37374564816711037, + "grad_norm": 0.26332515478134155, + "learning_rate": 0.0015, + "loss": 1.2908, + "step": 7300 + }, + { + "epoch": 0.37425762850706534, + "grad_norm": 0.2604503631591797, + "learning_rate": 0.0015, + "loss": 1.3002, + "step": 7310 + }, + { + "epoch": 0.37476960884702026, + "grad_norm": 0.25917840003967285, + "learning_rate": 0.0015, + "loss": 1.2983, + "step": 7320 + }, + { + "epoch": 0.37528158918697524, + "grad_norm": 0.26824817061424255, + "learning_rate": 0.0015, + "loss": 1.3183, + "step": 7330 + }, + { + "epoch": 0.37579356952693016, + "grad_norm": 0.2575696110725403, + "learning_rate": 0.0015, + "loss": 1.318, + "step": 7340 + }, + { + "epoch": 0.37630554986688514, + "grad_norm": 0.2578194737434387, + "learning_rate": 0.0015, + "loss": 1.2833, + "step": 7350 + }, + { + "epoch": 0.37681753020684006, + "grad_norm": 0.2768312096595764, + "learning_rate": 0.0015, + "loss": 1.2948, + "step": 7360 + }, + { + "epoch": 0.377329510546795, + "grad_norm": 0.2382088154554367, + "learning_rate": 0.0015, + "loss": 1.3, + "step": 7370 + }, + { + "epoch": 0.37784149088674995, + "grad_norm": 0.2637539803981781, + "learning_rate": 0.0015, + "loss": 1.2792, + "step": 7380 + }, + { + "epoch": 0.3783534712267049, + "grad_norm": 0.2832081615924835, + "learning_rate": 0.0015, + "loss": 1.3097, + "step": 7390 + }, + { + "epoch": 0.37886545156665985, + "grad_norm": 0.2672945261001587, + "learning_rate": 0.0015, + "loss": 1.2989, + "step": 7400 + }, + { + "epoch": 0.37937743190661477, + "grad_norm": 0.24696801602840424, + "learning_rate": 0.0015, + "loss": 1.3174, + "step": 7410 + }, + { + "epoch": 0.37988941224656975, + "grad_norm": 0.2638930082321167, + "learning_rate": 0.0015, + "loss": 1.295, + "step": 7420 + }, + { + "epoch": 0.38040139258652467, + "grad_norm": 0.2714937925338745, + "learning_rate": 0.0015, + "loss": 1.2917, + "step": 7430 + }, + { + "epoch": 0.38091337292647964, + "grad_norm": 0.2469353824853897, + "learning_rate": 0.0015, + "loss": 1.2919, + "step": 7440 + }, + { + "epoch": 0.38142535326643456, + "grad_norm": 0.25035470724105835, + "learning_rate": 0.0015, + "loss": 1.2896, + "step": 7450 + }, + { + "epoch": 0.38193733360638954, + "grad_norm": 0.26178446412086487, + "learning_rate": 0.0015, + "loss": 1.2891, + "step": 7460 + }, + { + "epoch": 0.38244931394634446, + "grad_norm": 0.26942870020866394, + "learning_rate": 0.0015, + "loss": 1.2723, + "step": 7470 + }, + { + "epoch": 0.3829612942862994, + "grad_norm": 0.26943838596343994, + "learning_rate": 0.0015, + "loss": 1.284, + "step": 7480 + }, + { + "epoch": 0.38347327462625436, + "grad_norm": 0.25865715742111206, + "learning_rate": 0.0015, + "loss": 1.3063, + "step": 7490 + }, + { + "epoch": 0.3839852549662093, + "grad_norm": 0.27455562353134155, + "learning_rate": 0.0015, + "loss": 1.2988, + "step": 7500 + }, + { + "epoch": 0.38449723530616425, + "grad_norm": 0.2636263370513916, + "learning_rate": 0.0015, + "loss": 1.2739, + "step": 7510 + }, + { + "epoch": 0.3850092156461192, + "grad_norm": 0.26559826731681824, + "learning_rate": 0.0015, + "loss": 1.2958, + "step": 7520 + }, + { + "epoch": 0.38552119598607415, + "grad_norm": 0.2592698335647583, + "learning_rate": 0.0015, + "loss": 1.2981, + "step": 7530 + }, + { + "epoch": 0.38603317632602907, + "grad_norm": 0.25872740149497986, + "learning_rate": 0.0015, + "loss": 1.3005, + "step": 7540 + }, + { + "epoch": 0.38654515666598405, + "grad_norm": 0.26369425654411316, + "learning_rate": 0.0015, + "loss": 1.3021, + "step": 7550 + }, + { + "epoch": 0.38705713700593897, + "grad_norm": 0.25757378339767456, + "learning_rate": 0.0015, + "loss": 1.302, + "step": 7560 + }, + { + "epoch": 0.38756911734589394, + "grad_norm": 0.27320241928100586, + "learning_rate": 0.0015, + "loss": 1.2802, + "step": 7570 + }, + { + "epoch": 0.38808109768584886, + "grad_norm": 0.2795805335044861, + "learning_rate": 0.0015, + "loss": 1.295, + "step": 7580 + }, + { + "epoch": 0.3885930780258038, + "grad_norm": 0.26023516058921814, + "learning_rate": 0.0015, + "loss": 1.2889, + "step": 7590 + }, + { + "epoch": 0.38910505836575876, + "grad_norm": 0.2582970857620239, + "learning_rate": 0.0015, + "loss": 1.302, + "step": 7600 + }, + { + "epoch": 0.3896170387057137, + "grad_norm": 0.2473934441804886, + "learning_rate": 0.0015, + "loss": 1.3023, + "step": 7610 + }, + { + "epoch": 0.39012901904566866, + "grad_norm": 0.2547856271266937, + "learning_rate": 0.0015, + "loss": 1.29, + "step": 7620 + }, + { + "epoch": 0.3906409993856236, + "grad_norm": 0.26764586567878723, + "learning_rate": 0.0015, + "loss": 1.2905, + "step": 7630 + }, + { + "epoch": 0.39115297972557855, + "grad_norm": 0.2481442391872406, + "learning_rate": 0.0015, + "loss": 1.3164, + "step": 7640 + }, + { + "epoch": 0.3916649600655335, + "grad_norm": 0.25532233715057373, + "learning_rate": 0.0015, + "loss": 1.2958, + "step": 7650 + }, + { + "epoch": 0.39217694040548845, + "grad_norm": 0.24001578986644745, + "learning_rate": 0.0015, + "loss": 1.2827, + "step": 7660 + }, + { + "epoch": 0.39268892074544337, + "grad_norm": 0.2489776611328125, + "learning_rate": 0.0015, + "loss": 1.2742, + "step": 7670 + }, + { + "epoch": 0.39320090108539835, + "grad_norm": 0.23535743355751038, + "learning_rate": 0.0015, + "loss": 1.2855, + "step": 7680 + }, + { + "epoch": 0.39371288142535327, + "grad_norm": 0.25811052322387695, + "learning_rate": 0.0015, + "loss": 1.2971, + "step": 7690 + }, + { + "epoch": 0.3942248617653082, + "grad_norm": 0.24241647124290466, + "learning_rate": 0.0015, + "loss": 1.2968, + "step": 7700 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 0.25648635625839233, + "learning_rate": 0.0015, + "loss": 1.2916, + "step": 7710 + }, + { + "epoch": 0.3952488224452181, + "grad_norm": 0.2703993618488312, + "learning_rate": 0.0015, + "loss": 1.2909, + "step": 7720 + }, + { + "epoch": 0.39576080278517306, + "grad_norm": 0.2558510899543762, + "learning_rate": 0.0015, + "loss": 1.2913, + "step": 7730 + }, + { + "epoch": 0.396272783125128, + "grad_norm": 0.2394089698791504, + "learning_rate": 0.0015, + "loss": 1.2968, + "step": 7740 + }, + { + "epoch": 0.39678476346508296, + "grad_norm": 0.2338177114725113, + "learning_rate": 0.0015, + "loss": 1.2894, + "step": 7750 + }, + { + "epoch": 0.3972967438050379, + "grad_norm": 0.25422418117523193, + "learning_rate": 0.0015, + "loss": 1.2958, + "step": 7760 + }, + { + "epoch": 0.39780872414499285, + "grad_norm": 0.2437313348054886, + "learning_rate": 0.0015, + "loss": 1.2878, + "step": 7770 + }, + { + "epoch": 0.3983207044849478, + "grad_norm": 0.26623979210853577, + "learning_rate": 0.0015, + "loss": 1.2915, + "step": 7780 + }, + { + "epoch": 0.39883268482490275, + "grad_norm": 0.24698524177074432, + "learning_rate": 0.0015, + "loss": 1.2949, + "step": 7790 + }, + { + "epoch": 0.39934466516485767, + "grad_norm": 0.23496921360492706, + "learning_rate": 0.0015, + "loss": 1.3069, + "step": 7800 + }, + { + "epoch": 0.3998566455048126, + "grad_norm": 0.2393864393234253, + "learning_rate": 0.0015, + "loss": 1.2913, + "step": 7810 + }, + { + "epoch": 0.40036862584476757, + "grad_norm": 0.24716414511203766, + "learning_rate": 0.0015, + "loss": 1.2829, + "step": 7820 + }, + { + "epoch": 0.4008806061847225, + "grad_norm": 0.24985013902187347, + "learning_rate": 0.0015, + "loss": 1.2773, + "step": 7830 + }, + { + "epoch": 0.40139258652467746, + "grad_norm": 0.24895814061164856, + "learning_rate": 0.0015, + "loss": 1.2889, + "step": 7840 + }, + { + "epoch": 0.4019045668646324, + "grad_norm": 0.2497827261686325, + "learning_rate": 0.0015, + "loss": 1.2747, + "step": 7850 + }, + { + "epoch": 0.40241654720458736, + "grad_norm": 0.23879243433475494, + "learning_rate": 0.0015, + "loss": 1.3071, + "step": 7860 + }, + { + "epoch": 0.4029285275445423, + "grad_norm": 0.24402157962322235, + "learning_rate": 0.0015, + "loss": 1.2924, + "step": 7870 + }, + { + "epoch": 0.40344050788449726, + "grad_norm": 0.24736930429935455, + "learning_rate": 0.0015, + "loss": 1.2643, + "step": 7880 + }, + { + "epoch": 0.4039524882244522, + "grad_norm": 0.2525321841239929, + "learning_rate": 0.0015, + "loss": 1.3014, + "step": 7890 + }, + { + "epoch": 0.40446446856440715, + "grad_norm": 0.2575211226940155, + "learning_rate": 0.0015, + "loss": 1.2625, + "step": 7900 + }, + { + "epoch": 0.4049764489043621, + "grad_norm": 0.24405083060264587, + "learning_rate": 0.0015, + "loss": 1.2834, + "step": 7910 + }, + { + "epoch": 0.405488429244317, + "grad_norm": 0.28250402212142944, + "learning_rate": 0.0015, + "loss": 1.2814, + "step": 7920 + }, + { + "epoch": 0.40600040958427197, + "grad_norm": 0.2795003056526184, + "learning_rate": 0.0015, + "loss": 1.3154, + "step": 7930 + }, + { + "epoch": 0.4065123899242269, + "grad_norm": 0.24883300065994263, + "learning_rate": 0.0015, + "loss": 1.2887, + "step": 7940 + }, + { + "epoch": 0.40702437026418187, + "grad_norm": 0.2502342164516449, + "learning_rate": 0.0015, + "loss": 1.3033, + "step": 7950 + }, + { + "epoch": 0.4075363506041368, + "grad_norm": 0.24973638355731964, + "learning_rate": 0.0015, + "loss": 1.2947, + "step": 7960 + }, + { + "epoch": 0.40804833094409176, + "grad_norm": 0.24371185898780823, + "learning_rate": 0.0015, + "loss": 1.2908, + "step": 7970 + }, + { + "epoch": 0.4085603112840467, + "grad_norm": 0.24570930004119873, + "learning_rate": 0.0015, + "loss": 1.2879, + "step": 7980 + }, + { + "epoch": 0.40907229162400166, + "grad_norm": 0.23717066645622253, + "learning_rate": 0.0015, + "loss": 1.2928, + "step": 7990 + }, + { + "epoch": 0.4095842719639566, + "grad_norm": 0.24726137518882751, + "learning_rate": 0.0015, + "loss": 1.2915, + "step": 8000 + }, + { + "epoch": 0.41009625230391156, + "grad_norm": 0.2352866679430008, + "learning_rate": 0.0015, + "loss": 1.2817, + "step": 8010 + }, + { + "epoch": 0.4106082326438665, + "grad_norm": 0.251365065574646, + "learning_rate": 0.0015, + "loss": 1.2979, + "step": 8020 + }, + { + "epoch": 0.4111202129838214, + "grad_norm": 0.22410385310649872, + "learning_rate": 0.0015, + "loss": 1.2749, + "step": 8030 + }, + { + "epoch": 0.4116321933237764, + "grad_norm": 0.25029605627059937, + "learning_rate": 0.0015, + "loss": 1.2862, + "step": 8040 + }, + { + "epoch": 0.4121441736637313, + "grad_norm": 0.25629550218582153, + "learning_rate": 0.0015, + "loss": 1.2749, + "step": 8050 + }, + { + "epoch": 0.41265615400368627, + "grad_norm": 0.23836827278137207, + "learning_rate": 0.0015, + "loss": 1.28, + "step": 8060 + }, + { + "epoch": 0.4131681343436412, + "grad_norm": 0.23752672970294952, + "learning_rate": 0.0015, + "loss": 1.2916, + "step": 8070 + }, + { + "epoch": 0.41368011468359617, + "grad_norm": 0.26047077775001526, + "learning_rate": 0.0015, + "loss": 1.2718, + "step": 8080 + }, + { + "epoch": 0.4141920950235511, + "grad_norm": 0.24297983944416046, + "learning_rate": 0.0015, + "loss": 1.2961, + "step": 8090 + }, + { + "epoch": 0.41470407536350606, + "grad_norm": 0.24528458714485168, + "learning_rate": 0.0015, + "loss": 1.2591, + "step": 8100 + }, + { + "epoch": 0.415216055703461, + "grad_norm": 0.24459367990493774, + "learning_rate": 0.0015, + "loss": 1.2754, + "step": 8110 + }, + { + "epoch": 0.41572803604341596, + "grad_norm": 0.24630287289619446, + "learning_rate": 0.0015, + "loss": 1.2864, + "step": 8120 + }, + { + "epoch": 0.4162400163833709, + "grad_norm": 0.2514908015727997, + "learning_rate": 0.0015, + "loss": 1.2847, + "step": 8130 + }, + { + "epoch": 0.4167519967233258, + "grad_norm": 0.227911576628685, + "learning_rate": 0.0015, + "loss": 1.2798, + "step": 8140 + }, + { + "epoch": 0.4172639770632808, + "grad_norm": 0.2512179911136627, + "learning_rate": 0.0015, + "loss": 1.2817, + "step": 8150 + }, + { + "epoch": 0.4177759574032357, + "grad_norm": 0.24971604347229004, + "learning_rate": 0.0015, + "loss": 1.2856, + "step": 8160 + }, + { + "epoch": 0.4182879377431907, + "grad_norm": 0.24980546534061432, + "learning_rate": 0.0015, + "loss": 1.2932, + "step": 8170 + }, + { + "epoch": 0.4187999180831456, + "grad_norm": 0.2510388493537903, + "learning_rate": 0.0015, + "loss": 1.2849, + "step": 8180 + }, + { + "epoch": 0.41931189842310057, + "grad_norm": 0.23916485905647278, + "learning_rate": 0.0015, + "loss": 1.2787, + "step": 8190 + }, + { + "epoch": 0.4198238787630555, + "grad_norm": 0.2525003254413605, + "learning_rate": 0.0015, + "loss": 1.2856, + "step": 8200 + }, + { + "epoch": 0.42033585910301047, + "grad_norm": 0.25865113735198975, + "learning_rate": 0.0015, + "loss": 1.2473, + "step": 8210 + }, + { + "epoch": 0.4208478394429654, + "grad_norm": 0.24689891934394836, + "learning_rate": 0.0015, + "loss": 1.2663, + "step": 8220 + }, + { + "epoch": 0.4213598197829203, + "grad_norm": 0.2257513701915741, + "learning_rate": 0.0015, + "loss": 1.2576, + "step": 8230 + }, + { + "epoch": 0.4218718001228753, + "grad_norm": 0.2339119166135788, + "learning_rate": 0.0015, + "loss": 1.3053, + "step": 8240 + }, + { + "epoch": 0.4223837804628302, + "grad_norm": 0.2590661942958832, + "learning_rate": 0.0015, + "loss": 1.2698, + "step": 8250 + }, + { + "epoch": 0.4228957608027852, + "grad_norm": 0.2483995407819748, + "learning_rate": 0.0015, + "loss": 1.2728, + "step": 8260 + }, + { + "epoch": 0.4234077411427401, + "grad_norm": 0.23534591495990753, + "learning_rate": 0.0015, + "loss": 1.2867, + "step": 8270 + }, + { + "epoch": 0.4239197214826951, + "grad_norm": 0.22678501904010773, + "learning_rate": 0.0015, + "loss": 1.2775, + "step": 8280 + }, + { + "epoch": 0.42443170182265, + "grad_norm": 0.2298179715871811, + "learning_rate": 0.0015, + "loss": 1.2866, + "step": 8290 + }, + { + "epoch": 0.424943682162605, + "grad_norm": 0.2495158165693283, + "learning_rate": 0.0015, + "loss": 1.2762, + "step": 8300 + }, + { + "epoch": 0.4254556625025599, + "grad_norm": 0.22808024287223816, + "learning_rate": 0.0015, + "loss": 1.269, + "step": 8310 + }, + { + "epoch": 0.42596764284251487, + "grad_norm": 0.24249188601970673, + "learning_rate": 0.0015, + "loss": 1.2881, + "step": 8320 + }, + { + "epoch": 0.4264796231824698, + "grad_norm": 0.2539406418800354, + "learning_rate": 0.0015, + "loss": 1.2618, + "step": 8330 + }, + { + "epoch": 0.4269916035224247, + "grad_norm": 0.2367791384458542, + "learning_rate": 0.0015, + "loss": 1.2762, + "step": 8340 + }, + { + "epoch": 0.4275035838623797, + "grad_norm": 0.2301592379808426, + "learning_rate": 0.0015, + "loss": 1.2724, + "step": 8350 + }, + { + "epoch": 0.4280155642023346, + "grad_norm": 0.24136430025100708, + "learning_rate": 0.0015, + "loss": 1.2629, + "step": 8360 + }, + { + "epoch": 0.4285275445422896, + "grad_norm": 0.23719066381454468, + "learning_rate": 0.0015, + "loss": 1.2624, + "step": 8370 + }, + { + "epoch": 0.4290395248822445, + "grad_norm": 0.2514694631099701, + "learning_rate": 0.0015, + "loss": 1.2686, + "step": 8380 + }, + { + "epoch": 0.4295515052221995, + "grad_norm": 0.24186182022094727, + "learning_rate": 0.0015, + "loss": 1.2823, + "step": 8390 + }, + { + "epoch": 0.4300634855621544, + "grad_norm": 0.23494115471839905, + "learning_rate": 0.0015, + "loss": 1.2534, + "step": 8400 + }, + { + "epoch": 0.4305754659021094, + "grad_norm": 0.2518327534198761, + "learning_rate": 0.0015, + "loss": 1.2913, + "step": 8410 + }, + { + "epoch": 0.4310874462420643, + "grad_norm": 0.23622803390026093, + "learning_rate": 0.0015, + "loss": 1.2652, + "step": 8420 + }, + { + "epoch": 0.4315994265820193, + "grad_norm": 0.22990188002586365, + "learning_rate": 0.0015, + "loss": 1.277, + "step": 8430 + }, + { + "epoch": 0.4321114069219742, + "grad_norm": 0.23679761588573456, + "learning_rate": 0.0015, + "loss": 1.2839, + "step": 8440 + }, + { + "epoch": 0.4326233872619291, + "grad_norm": 0.25512683391571045, + "learning_rate": 0.0015, + "loss": 1.2818, + "step": 8450 + }, + { + "epoch": 0.4331353676018841, + "grad_norm": 0.24284730851650238, + "learning_rate": 0.0015, + "loss": 1.2882, + "step": 8460 + }, + { + "epoch": 0.433647347941839, + "grad_norm": 0.24152646958827972, + "learning_rate": 0.0015, + "loss": 1.2727, + "step": 8470 + }, + { + "epoch": 0.434159328281794, + "grad_norm": 0.24133774638175964, + "learning_rate": 0.0015, + "loss": 1.2743, + "step": 8480 + }, + { + "epoch": 0.4346713086217489, + "grad_norm": 0.23270800709724426, + "learning_rate": 0.0015, + "loss": 1.2651, + "step": 8490 + }, + { + "epoch": 0.4351832889617039, + "grad_norm": 0.2446971833705902, + "learning_rate": 0.0015, + "loss": 1.268, + "step": 8500 + }, + { + "epoch": 0.4356952693016588, + "grad_norm": 0.23358875513076782, + "learning_rate": 0.0015, + "loss": 1.2774, + "step": 8510 + }, + { + "epoch": 0.4362072496416138, + "grad_norm": 0.22265927493572235, + "learning_rate": 0.0015, + "loss": 1.2602, + "step": 8520 + }, + { + "epoch": 0.4367192299815687, + "grad_norm": 0.22781646251678467, + "learning_rate": 0.0015, + "loss": 1.2724, + "step": 8530 + }, + { + "epoch": 0.4372312103215237, + "grad_norm": 0.23868761956691742, + "learning_rate": 0.0015, + "loss": 1.2581, + "step": 8540 + }, + { + "epoch": 0.4377431906614786, + "grad_norm": 0.2235594540834427, + "learning_rate": 0.0015, + "loss": 1.2741, + "step": 8550 + }, + { + "epoch": 0.4382551710014335, + "grad_norm": 0.2419920712709427, + "learning_rate": 0.0015, + "loss": 1.2765, + "step": 8560 + }, + { + "epoch": 0.4387671513413885, + "grad_norm": 0.27400338649749756, + "learning_rate": 0.0015, + "loss": 1.2635, + "step": 8570 + }, + { + "epoch": 0.4392791316813434, + "grad_norm": 0.23386618494987488, + "learning_rate": 0.0015, + "loss": 1.2806, + "step": 8580 + }, + { + "epoch": 0.4397911120212984, + "grad_norm": 0.24642907083034515, + "learning_rate": 0.0015, + "loss": 1.2739, + "step": 8590 + }, + { + "epoch": 0.4403030923612533, + "grad_norm": 0.2347201406955719, + "learning_rate": 0.0015, + "loss": 1.2581, + "step": 8600 + }, + { + "epoch": 0.4408150727012083, + "grad_norm": 0.22591201961040497, + "learning_rate": 0.0015, + "loss": 1.2882, + "step": 8610 + }, + { + "epoch": 0.4413270530411632, + "grad_norm": 0.2508542537689209, + "learning_rate": 0.0015, + "loss": 1.2699, + "step": 8620 + }, + { + "epoch": 0.4418390333811182, + "grad_norm": 0.2366652637720108, + "learning_rate": 0.0015, + "loss": 1.2522, + "step": 8630 + }, + { + "epoch": 0.4423510137210731, + "grad_norm": 0.22938509285449982, + "learning_rate": 0.0015, + "loss": 1.2676, + "step": 8640 + }, + { + "epoch": 0.4428629940610281, + "grad_norm": 0.22820281982421875, + "learning_rate": 0.0015, + "loss": 1.2712, + "step": 8650 + }, + { + "epoch": 0.443374974400983, + "grad_norm": 0.22258944809436798, + "learning_rate": 0.0015, + "loss": 1.2721, + "step": 8660 + }, + { + "epoch": 0.4438869547409379, + "grad_norm": 0.23942533135414124, + "learning_rate": 0.0015, + "loss": 1.2659, + "step": 8670 + }, + { + "epoch": 0.4443989350808929, + "grad_norm": 0.23312713205814362, + "learning_rate": 0.0015, + "loss": 1.2755, + "step": 8680 + }, + { + "epoch": 0.4449109154208478, + "grad_norm": 0.2283553183078766, + "learning_rate": 0.0015, + "loss": 1.2537, + "step": 8690 + }, + { + "epoch": 0.4454228957608028, + "grad_norm": 0.23631595075130463, + "learning_rate": 0.0015, + "loss": 1.2487, + "step": 8700 + }, + { + "epoch": 0.4459348761007577, + "grad_norm": 0.2447190135717392, + "learning_rate": 0.0015, + "loss": 1.2529, + "step": 8710 + }, + { + "epoch": 0.4464468564407127, + "grad_norm": 0.24584966897964478, + "learning_rate": 0.0015, + "loss": 1.2738, + "step": 8720 + }, + { + "epoch": 0.4469588367806676, + "grad_norm": 0.2374550849199295, + "learning_rate": 0.0015, + "loss": 1.2791, + "step": 8730 + }, + { + "epoch": 0.4474708171206226, + "grad_norm": 0.240436390042305, + "learning_rate": 0.0015, + "loss": 1.2518, + "step": 8740 + }, + { + "epoch": 0.4479827974605775, + "grad_norm": 0.23341523110866547, + "learning_rate": 0.0015, + "loss": 1.2688, + "step": 8750 + }, + { + "epoch": 0.4484947778005325, + "grad_norm": 0.24230003356933594, + "learning_rate": 0.0015, + "loss": 1.2379, + "step": 8760 + }, + { + "epoch": 0.4490067581404874, + "grad_norm": 0.2401583343744278, + "learning_rate": 0.0015, + "loss": 1.2699, + "step": 8770 + }, + { + "epoch": 0.4495187384804423, + "grad_norm": 0.22647708654403687, + "learning_rate": 0.0015, + "loss": 1.2656, + "step": 8780 + }, + { + "epoch": 0.4500307188203973, + "grad_norm": 0.24045558273792267, + "learning_rate": 0.0015, + "loss": 1.2531, + "step": 8790 + }, + { + "epoch": 0.4505426991603522, + "grad_norm": 0.2597295045852661, + "learning_rate": 0.0015, + "loss": 1.2568, + "step": 8800 + }, + { + "epoch": 0.4510546795003072, + "grad_norm": 0.22485364973545074, + "learning_rate": 0.0015, + "loss": 1.2478, + "step": 8810 + }, + { + "epoch": 0.4515666598402621, + "grad_norm": 0.23133698105812073, + "learning_rate": 0.0015, + "loss": 1.2688, + "step": 8820 + }, + { + "epoch": 0.4520786401802171, + "grad_norm": 0.22866465151309967, + "learning_rate": 0.0015, + "loss": 1.2516, + "step": 8830 + }, + { + "epoch": 0.452590620520172, + "grad_norm": 0.2258300632238388, + "learning_rate": 0.0015, + "loss": 1.2571, + "step": 8840 + }, + { + "epoch": 0.453102600860127, + "grad_norm": 0.23454922437667847, + "learning_rate": 0.0015, + "loss": 1.2413, + "step": 8850 + }, + { + "epoch": 0.4536145812000819, + "grad_norm": 0.22673968970775604, + "learning_rate": 0.0015, + "loss": 1.2504, + "step": 8860 + }, + { + "epoch": 0.4541265615400369, + "grad_norm": 0.24363909661769867, + "learning_rate": 0.0015, + "loss": 1.2511, + "step": 8870 + }, + { + "epoch": 0.4546385418799918, + "grad_norm": 0.25056564807891846, + "learning_rate": 0.0015, + "loss": 1.2423, + "step": 8880 + }, + { + "epoch": 0.45515052221994673, + "grad_norm": 0.2318125218153, + "learning_rate": 0.0015, + "loss": 1.2753, + "step": 8890 + }, + { + "epoch": 0.4556625025599017, + "grad_norm": 0.22525230050086975, + "learning_rate": 0.0015, + "loss": 1.2389, + "step": 8900 + }, + { + "epoch": 0.4561744828998566, + "grad_norm": 0.23389683663845062, + "learning_rate": 0.0015, + "loss": 1.2457, + "step": 8910 + }, + { + "epoch": 0.4566864632398116, + "grad_norm": 0.23282834887504578, + "learning_rate": 0.0015, + "loss": 1.2628, + "step": 8920 + }, + { + "epoch": 0.4571984435797665, + "grad_norm": 0.24000655114650726, + "learning_rate": 0.0015, + "loss": 1.2637, + "step": 8930 + }, + { + "epoch": 0.4577104239197215, + "grad_norm": 0.22707650065422058, + "learning_rate": 0.0015, + "loss": 1.2651, + "step": 8940 + }, + { + "epoch": 0.4582224042596764, + "grad_norm": 0.24544113874435425, + "learning_rate": 0.0015, + "loss": 1.2597, + "step": 8950 + }, + { + "epoch": 0.4587343845996314, + "grad_norm": 0.2471536099910736, + "learning_rate": 0.0015, + "loss": 1.2583, + "step": 8960 + }, + { + "epoch": 0.4592463649395863, + "grad_norm": 0.2399998903274536, + "learning_rate": 0.0015, + "loss": 1.2587, + "step": 8970 + }, + { + "epoch": 0.4597583452795413, + "grad_norm": 0.239053875207901, + "learning_rate": 0.0015, + "loss": 1.2604, + "step": 8980 + }, + { + "epoch": 0.4602703256194962, + "grad_norm": 0.23578478395938873, + "learning_rate": 0.0015, + "loss": 1.251, + "step": 8990 + }, + { + "epoch": 0.46078230595945113, + "grad_norm": 0.22768492996692657, + "learning_rate": 0.0015, + "loss": 1.2584, + "step": 9000 + }, + { + "epoch": 0.4612942862994061, + "grad_norm": 0.2407897710800171, + "learning_rate": 0.0015, + "loss": 1.2551, + "step": 9010 + }, + { + "epoch": 0.46180626663936103, + "grad_norm": 0.24113765358924866, + "learning_rate": 0.0015, + "loss": 1.2686, + "step": 9020 + }, + { + "epoch": 0.462318246979316, + "grad_norm": 0.23086939752101898, + "learning_rate": 0.0015, + "loss": 1.2521, + "step": 9030 + }, + { + "epoch": 0.4628302273192709, + "grad_norm": 0.2428579032421112, + "learning_rate": 0.0015, + "loss": 1.2539, + "step": 9040 + }, + { + "epoch": 0.4633422076592259, + "grad_norm": 0.23166462779045105, + "learning_rate": 0.0015, + "loss": 1.2452, + "step": 9050 + }, + { + "epoch": 0.4638541879991808, + "grad_norm": 0.23648124933242798, + "learning_rate": 0.0015, + "loss": 1.2522, + "step": 9060 + }, + { + "epoch": 0.4643661683391358, + "grad_norm": 0.23984448611736298, + "learning_rate": 0.0015, + "loss": 1.2556, + "step": 9070 + }, + { + "epoch": 0.4648781486790907, + "grad_norm": 0.22623547911643982, + "learning_rate": 0.0015, + "loss": 1.2496, + "step": 9080 + }, + { + "epoch": 0.4653901290190457, + "grad_norm": 0.23154547810554504, + "learning_rate": 0.0015, + "loss": 1.2688, + "step": 9090 + }, + { + "epoch": 0.4659021093590006, + "grad_norm": 0.24457304179668427, + "learning_rate": 0.0015, + "loss": 1.2457, + "step": 9100 + }, + { + "epoch": 0.46641408969895554, + "grad_norm": 0.22743169963359833, + "learning_rate": 0.0015, + "loss": 1.2533, + "step": 9110 + }, + { + "epoch": 0.4669260700389105, + "grad_norm": 0.23356840014457703, + "learning_rate": 0.0015, + "loss": 1.2529, + "step": 9120 + }, + { + "epoch": 0.46743805037886543, + "grad_norm": 0.23355025053024292, + "learning_rate": 0.0015, + "loss": 1.2595, + "step": 9130 + }, + { + "epoch": 0.4679500307188204, + "grad_norm": 0.21895302832126617, + "learning_rate": 0.0015, + "loss": 1.2613, + "step": 9140 + }, + { + "epoch": 0.46846201105877533, + "grad_norm": 0.23437921702861786, + "learning_rate": 0.0015, + "loss": 1.2631, + "step": 9150 + }, + { + "epoch": 0.4689739913987303, + "grad_norm": 0.22628231346607208, + "learning_rate": 0.0015, + "loss": 1.2634, + "step": 9160 + }, + { + "epoch": 0.4694859717386852, + "grad_norm": 0.2286689728498459, + "learning_rate": 0.0015, + "loss": 1.2412, + "step": 9170 + }, + { + "epoch": 0.4699979520786402, + "grad_norm": 0.21830707788467407, + "learning_rate": 0.0015, + "loss": 1.2714, + "step": 9180 + }, + { + "epoch": 0.4705099324185951, + "grad_norm": 0.2502080500125885, + "learning_rate": 0.0015, + "loss": 1.2419, + "step": 9190 + }, + { + "epoch": 0.4710219127585501, + "grad_norm": 0.21958868205547333, + "learning_rate": 0.0015, + "loss": 1.2406, + "step": 9200 + }, + { + "epoch": 0.471533893098505, + "grad_norm": 0.22988547384738922, + "learning_rate": 0.0015, + "loss": 1.2802, + "step": 9210 + }, + { + "epoch": 0.47204587343845994, + "grad_norm": 0.22131182253360748, + "learning_rate": 0.0015, + "loss": 1.2496, + "step": 9220 + }, + { + "epoch": 0.4725578537784149, + "grad_norm": 0.24254952371120453, + "learning_rate": 0.0015, + "loss": 1.2702, + "step": 9230 + }, + { + "epoch": 0.47306983411836984, + "grad_norm": 0.22780196368694305, + "learning_rate": 0.0015, + "loss": 1.2452, + "step": 9240 + }, + { + "epoch": 0.4735818144583248, + "grad_norm": 0.22993087768554688, + "learning_rate": 0.0015, + "loss": 1.2475, + "step": 9250 + }, + { + "epoch": 0.47409379479827973, + "grad_norm": 0.21792259812355042, + "learning_rate": 0.0015, + "loss": 1.2532, + "step": 9260 + }, + { + "epoch": 0.4746057751382347, + "grad_norm": 0.22392146289348602, + "learning_rate": 0.0015, + "loss": 1.2451, + "step": 9270 + }, + { + "epoch": 0.47511775547818963, + "grad_norm": 0.24879144132137299, + "learning_rate": 0.0015, + "loss": 1.2492, + "step": 9280 + }, + { + "epoch": 0.4756297358181446, + "grad_norm": 0.21757066249847412, + "learning_rate": 0.0015, + "loss": 1.2508, + "step": 9290 + }, + { + "epoch": 0.4761417161580995, + "grad_norm": 0.23313356935977936, + "learning_rate": 0.0015, + "loss": 1.2532, + "step": 9300 + }, + { + "epoch": 0.4766536964980545, + "grad_norm": 0.25208523869514465, + "learning_rate": 0.0015, + "loss": 1.2286, + "step": 9310 + }, + { + "epoch": 0.4771656768380094, + "grad_norm": 0.2262171059846878, + "learning_rate": 0.0015, + "loss": 1.2398, + "step": 9320 + }, + { + "epoch": 0.47767765717796434, + "grad_norm": 0.2252594530582428, + "learning_rate": 0.0015, + "loss": 1.2525, + "step": 9330 + }, + { + "epoch": 0.4781896375179193, + "grad_norm": 0.2281142771244049, + "learning_rate": 0.0015, + "loss": 1.2453, + "step": 9340 + }, + { + "epoch": 0.47870161785787424, + "grad_norm": 0.22341011464595795, + "learning_rate": 0.0015, + "loss": 1.2628, + "step": 9350 + }, + { + "epoch": 0.4792135981978292, + "grad_norm": 0.22117526829242706, + "learning_rate": 0.0015, + "loss": 1.2597, + "step": 9360 + }, + { + "epoch": 0.47972557853778414, + "grad_norm": 0.2359929233789444, + "learning_rate": 0.0015, + "loss": 1.2504, + "step": 9370 + }, + { + "epoch": 0.4802375588777391, + "grad_norm": 0.2348971962928772, + "learning_rate": 0.0015, + "loss": 1.2352, + "step": 9380 + }, + { + "epoch": 0.48074953921769403, + "grad_norm": 0.23461927473545074, + "learning_rate": 0.0015, + "loss": 1.2383, + "step": 9390 + }, + { + "epoch": 0.481261519557649, + "grad_norm": 0.2463158220052719, + "learning_rate": 0.0015, + "loss": 1.2329, + "step": 9400 + }, + { + "epoch": 0.48177349989760393, + "grad_norm": 0.240493506193161, + "learning_rate": 0.0015, + "loss": 1.2614, + "step": 9410 + }, + { + "epoch": 0.48228548023755885, + "grad_norm": 0.22357292473316193, + "learning_rate": 0.0015, + "loss": 1.2553, + "step": 9420 + }, + { + "epoch": 0.4827974605775138, + "grad_norm": 0.2223501205444336, + "learning_rate": 0.0015, + "loss": 1.245, + "step": 9430 + }, + { + "epoch": 0.48330944091746875, + "grad_norm": 0.2278713434934616, + "learning_rate": 0.0015, + "loss": 1.2544, + "step": 9440 + }, + { + "epoch": 0.4838214212574237, + "grad_norm": 0.23052051663398743, + "learning_rate": 0.0015, + "loss": 1.2614, + "step": 9450 + }, + { + "epoch": 0.48433340159737864, + "grad_norm": 0.22685429453849792, + "learning_rate": 0.0015, + "loss": 1.2613, + "step": 9460 + }, + { + "epoch": 0.4848453819373336, + "grad_norm": 0.22306014597415924, + "learning_rate": 0.0015, + "loss": 1.2289, + "step": 9470 + }, + { + "epoch": 0.48535736227728854, + "grad_norm": 0.22385765612125397, + "learning_rate": 0.0015, + "loss": 1.2452, + "step": 9480 + }, + { + "epoch": 0.4858693426172435, + "grad_norm": 0.22245322167873383, + "learning_rate": 0.0015, + "loss": 1.2541, + "step": 9490 + }, + { + "epoch": 0.48638132295719844, + "grad_norm": 0.2279806137084961, + "learning_rate": 0.0015, + "loss": 1.2557, + "step": 9500 + }, + { + "epoch": 0.4868933032971534, + "grad_norm": 0.2449760138988495, + "learning_rate": 0.0015, + "loss": 1.2358, + "step": 9510 + }, + { + "epoch": 0.48740528363710833, + "grad_norm": 0.22621648013591766, + "learning_rate": 0.0015, + "loss": 1.2466, + "step": 9520 + }, + { + "epoch": 0.48791726397706325, + "grad_norm": 0.22223225235939026, + "learning_rate": 0.0015, + "loss": 1.2522, + "step": 9530 + }, + { + "epoch": 0.48842924431701823, + "grad_norm": 0.23512163758277893, + "learning_rate": 0.0015, + "loss": 1.2542, + "step": 9540 + }, + { + "epoch": 0.48894122465697315, + "grad_norm": 0.21729685366153717, + "learning_rate": 0.0015, + "loss": 1.224, + "step": 9550 + }, + { + "epoch": 0.4894532049969281, + "grad_norm": 0.22177568078041077, + "learning_rate": 0.0015, + "loss": 1.2624, + "step": 9560 + }, + { + "epoch": 0.48996518533688305, + "grad_norm": 0.22674211859703064, + "learning_rate": 0.0015, + "loss": 1.2191, + "step": 9570 + }, + { + "epoch": 0.490477165676838, + "grad_norm": 0.25243934988975525, + "learning_rate": 0.0015, + "loss": 1.2327, + "step": 9580 + }, + { + "epoch": 0.49098914601679294, + "grad_norm": 0.22206014394760132, + "learning_rate": 0.0015, + "loss": 1.2369, + "step": 9590 + }, + { + "epoch": 0.4915011263567479, + "grad_norm": 0.21915268898010254, + "learning_rate": 0.0015, + "loss": 1.2475, + "step": 9600 + }, + { + "epoch": 0.49201310669670284, + "grad_norm": 0.219084694981575, + "learning_rate": 0.0015, + "loss": 1.2469, + "step": 9610 + }, + { + "epoch": 0.4925250870366578, + "grad_norm": 0.21210044622421265, + "learning_rate": 0.0015, + "loss": 1.2385, + "step": 9620 + }, + { + "epoch": 0.49303706737661274, + "grad_norm": 0.22252093255519867, + "learning_rate": 0.0015, + "loss": 1.2652, + "step": 9630 + }, + { + "epoch": 0.49354904771656766, + "grad_norm": 0.2407660186290741, + "learning_rate": 0.0015, + "loss": 1.2436, + "step": 9640 + }, + { + "epoch": 0.49406102805652263, + "grad_norm": 0.22691743075847626, + "learning_rate": 0.0015, + "loss": 1.2254, + "step": 9650 + }, + { + "epoch": 0.49457300839647755, + "grad_norm": 0.23666201531887054, + "learning_rate": 0.0015, + "loss": 1.2297, + "step": 9660 + }, + { + "epoch": 0.49508498873643253, + "grad_norm": 0.21549946069717407, + "learning_rate": 0.0015, + "loss": 1.238, + "step": 9670 + }, + { + "epoch": 0.49559696907638745, + "grad_norm": 0.22083760797977448, + "learning_rate": 0.0015, + "loss": 1.2531, + "step": 9680 + }, + { + "epoch": 0.4961089494163424, + "grad_norm": 0.23391181230545044, + "learning_rate": 0.0015, + "loss": 1.1973, + "step": 9690 + }, + { + "epoch": 0.49662092975629735, + "grad_norm": 0.21990463137626648, + "learning_rate": 0.0015, + "loss": 1.2357, + "step": 9700 + }, + { + "epoch": 0.4971329100962523, + "grad_norm": 0.22842243313789368, + "learning_rate": 0.0015, + "loss": 1.2566, + "step": 9710 + }, + { + "epoch": 0.49764489043620724, + "grad_norm": 0.2154964953660965, + "learning_rate": 0.0015, + "loss": 1.2489, + "step": 9720 + }, + { + "epoch": 0.4981568707761622, + "grad_norm": 0.23381535708904266, + "learning_rate": 0.0015, + "loss": 1.2379, + "step": 9730 + }, + { + "epoch": 0.49866885111611714, + "grad_norm": 0.23405200242996216, + "learning_rate": 0.0015, + "loss": 1.251, + "step": 9740 + }, + { + "epoch": 0.49918083145607206, + "grad_norm": 0.24905334413051605, + "learning_rate": 0.0015, + "loss": 1.2247, + "step": 9750 + }, + { + "epoch": 0.49969281179602704, + "grad_norm": 0.22687901556491852, + "learning_rate": 0.0015, + "loss": 1.2362, + "step": 9760 + }, + { + "epoch": 0.500204792135982, + "grad_norm": 0.21950958669185638, + "learning_rate": 0.0015, + "loss": 1.2304, + "step": 9770 + }, + { + "epoch": 0.5007167724759369, + "grad_norm": 0.24343635141849518, + "learning_rate": 0.0015, + "loss": 1.2313, + "step": 9780 + }, + { + "epoch": 0.5012287528158919, + "grad_norm": 0.2238016575574875, + "learning_rate": 0.0015, + "loss": 1.2504, + "step": 9790 + }, + { + "epoch": 0.5017407331558468, + "grad_norm": 0.22162608802318573, + "learning_rate": 0.0015, + "loss": 1.2242, + "step": 9800 + }, + { + "epoch": 0.5022527134958018, + "grad_norm": 0.2090781331062317, + "learning_rate": 0.0015, + "loss": 1.2214, + "step": 9810 + }, + { + "epoch": 0.5027646938357567, + "grad_norm": 0.23861265182495117, + "learning_rate": 0.0015, + "loss": 1.2554, + "step": 9820 + }, + { + "epoch": 0.5032766741757116, + "grad_norm": 0.24569468200206757, + "learning_rate": 0.0015, + "loss": 1.2525, + "step": 9830 + }, + { + "epoch": 0.5037886545156666, + "grad_norm": 0.22713309526443481, + "learning_rate": 0.0015, + "loss": 1.2513, + "step": 9840 + }, + { + "epoch": 0.5043006348556216, + "grad_norm": 0.22980822622776031, + "learning_rate": 0.0015, + "loss": 1.2493, + "step": 9850 + }, + { + "epoch": 0.5048126151955765, + "grad_norm": 0.23609554767608643, + "learning_rate": 0.0015, + "loss": 1.2366, + "step": 9860 + }, + { + "epoch": 0.5053245955355314, + "grad_norm": 0.2115827053785324, + "learning_rate": 0.0015, + "loss": 1.2558, + "step": 9870 + }, + { + "epoch": 0.5058365758754864, + "grad_norm": 0.20506598055362701, + "learning_rate": 0.0015, + "loss": 1.2421, + "step": 9880 + }, + { + "epoch": 0.5063485562154413, + "grad_norm": 0.21842671930789948, + "learning_rate": 0.0015, + "loss": 1.2328, + "step": 9890 + }, + { + "epoch": 0.5068605365553963, + "grad_norm": 0.2390349954366684, + "learning_rate": 0.0015, + "loss": 1.2494, + "step": 9900 + }, + { + "epoch": 0.5073725168953512, + "grad_norm": 0.21842844784259796, + "learning_rate": 0.0015, + "loss": 1.243, + "step": 9910 + }, + { + "epoch": 0.5078844972353062, + "grad_norm": 0.21210695803165436, + "learning_rate": 0.0015, + "loss": 1.2438, + "step": 9920 + }, + { + "epoch": 0.5083964775752611, + "grad_norm": 0.21826642751693726, + "learning_rate": 0.0015, + "loss": 1.2402, + "step": 9930 + }, + { + "epoch": 0.5089084579152161, + "grad_norm": 0.21249307692050934, + "learning_rate": 0.0015, + "loss": 1.2168, + "step": 9940 + }, + { + "epoch": 0.509420438255171, + "grad_norm": 0.22593854367733002, + "learning_rate": 0.0015, + "loss": 1.222, + "step": 9950 + }, + { + "epoch": 0.509932418595126, + "grad_norm": 0.22972868382930756, + "learning_rate": 0.0015, + "loss": 1.2577, + "step": 9960 + }, + { + "epoch": 0.5104443989350809, + "grad_norm": 0.21808108687400818, + "learning_rate": 0.0015, + "loss": 1.2301, + "step": 9970 + }, + { + "epoch": 0.5109563792750358, + "grad_norm": 0.21525093913078308, + "learning_rate": 0.0015, + "loss": 1.2412, + "step": 9980 + }, + { + "epoch": 0.5114683596149908, + "grad_norm": 0.22222475707530975, + "learning_rate": 0.0015, + "loss": 1.237, + "step": 9990 + }, + { + "epoch": 0.5119803399549457, + "grad_norm": 0.23491185903549194, + "learning_rate": 0.0015, + "loss": 1.2436, + "step": 10000 + }, + { + "epoch": 0.5124923202949007, + "grad_norm": 0.23327389359474182, + "learning_rate": 0.0015, + "loss": 1.223, + "step": 10010 + }, + { + "epoch": 0.5130043006348556, + "grad_norm": 0.21225926280021667, + "learning_rate": 0.0015, + "loss": 1.2215, + "step": 10020 + }, + { + "epoch": 0.5135162809748106, + "grad_norm": 0.21181495487689972, + "learning_rate": 0.0015, + "loss": 1.2297, + "step": 10030 + }, + { + "epoch": 0.5140282613147655, + "grad_norm": 0.21177121996879578, + "learning_rate": 0.0015, + "loss": 1.2228, + "step": 10040 + }, + { + "epoch": 0.5145402416547205, + "grad_norm": 0.22206859290599823, + "learning_rate": 0.0015, + "loss": 1.2579, + "step": 10050 + }, + { + "epoch": 0.5150522219946754, + "grad_norm": 0.21502964198589325, + "learning_rate": 0.0015, + "loss": 1.2298, + "step": 10060 + }, + { + "epoch": 0.5155642023346303, + "grad_norm": 0.22302408516407013, + "learning_rate": 0.0015, + "loss": 1.2226, + "step": 10070 + }, + { + "epoch": 0.5160761826745853, + "grad_norm": 0.21490171551704407, + "learning_rate": 0.0015, + "loss": 1.2554, + "step": 10080 + }, + { + "epoch": 0.5165881630145402, + "grad_norm": 0.22137999534606934, + "learning_rate": 0.0015, + "loss": 1.2189, + "step": 10090 + }, + { + "epoch": 0.5171001433544952, + "grad_norm": 0.21363165974617004, + "learning_rate": 0.0015, + "loss": 1.2533, + "step": 10100 + }, + { + "epoch": 0.5176121236944501, + "grad_norm": 0.23033399879932404, + "learning_rate": 0.0015, + "loss": 1.2406, + "step": 10110 + }, + { + "epoch": 0.5181241040344051, + "grad_norm": 0.22692923247814178, + "learning_rate": 0.0015, + "loss": 1.2294, + "step": 10120 + }, + { + "epoch": 0.51863608437436, + "grad_norm": 0.23053601384162903, + "learning_rate": 0.0015, + "loss": 1.2351, + "step": 10130 + }, + { + "epoch": 0.519148064714315, + "grad_norm": 0.21180744469165802, + "learning_rate": 0.0015, + "loss": 1.2518, + "step": 10140 + }, + { + "epoch": 0.5196600450542699, + "grad_norm": 0.2388363927602768, + "learning_rate": 0.0015, + "loss": 1.2188, + "step": 10150 + }, + { + "epoch": 0.5201720253942249, + "grad_norm": 0.22531351447105408, + "learning_rate": 0.0015, + "loss": 1.2242, + "step": 10160 + }, + { + "epoch": 0.5206840057341798, + "grad_norm": 0.2166026532649994, + "learning_rate": 0.0015, + "loss": 1.2122, + "step": 10170 + }, + { + "epoch": 0.5211959860741348, + "grad_norm": 0.23231609165668488, + "learning_rate": 0.0015, + "loss": 1.2078, + "step": 10180 + }, + { + "epoch": 0.5217079664140897, + "grad_norm": 0.2189248949289322, + "learning_rate": 0.0015, + "loss": 1.2392, + "step": 10190 + }, + { + "epoch": 0.5222199467540446, + "grad_norm": 0.21036341786384583, + "learning_rate": 0.0015, + "loss": 1.2325, + "step": 10200 + }, + { + "epoch": 0.5227319270939996, + "grad_norm": 0.21162335574626923, + "learning_rate": 0.0015, + "loss": 1.2348, + "step": 10210 + }, + { + "epoch": 0.5232439074339545, + "grad_norm": 0.21558861434459686, + "learning_rate": 0.0015, + "loss": 1.2343, + "step": 10220 + }, + { + "epoch": 0.5237558877739095, + "grad_norm": 0.22100234031677246, + "learning_rate": 0.0015, + "loss": 1.2373, + "step": 10230 + }, + { + "epoch": 0.5242678681138644, + "grad_norm": 0.225110724568367, + "learning_rate": 0.0015, + "loss": 1.2368, + "step": 10240 + }, + { + "epoch": 0.5247798484538194, + "grad_norm": 0.21674303710460663, + "learning_rate": 0.0015, + "loss": 1.2365, + "step": 10250 + }, + { + "epoch": 0.5252918287937743, + "grad_norm": 0.23076364398002625, + "learning_rate": 0.0015, + "loss": 1.2202, + "step": 10260 + }, + { + "epoch": 0.5258038091337293, + "grad_norm": 0.23180685937404633, + "learning_rate": 0.0015, + "loss": 1.234, + "step": 10270 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.21580268442630768, + "learning_rate": 0.0015, + "loss": 1.2372, + "step": 10280 + }, + { + "epoch": 0.5268277698136391, + "grad_norm": 0.2099384069442749, + "learning_rate": 0.0015, + "loss": 1.2118, + "step": 10290 + }, + { + "epoch": 0.5273397501535941, + "grad_norm": 0.23586790263652802, + "learning_rate": 0.0015, + "loss": 1.2482, + "step": 10300 + }, + { + "epoch": 0.527851730493549, + "grad_norm": 0.2149907946586609, + "learning_rate": 0.0015, + "loss": 1.2469, + "step": 10310 + }, + { + "epoch": 0.528363710833504, + "grad_norm": 0.21271546185016632, + "learning_rate": 0.0015, + "loss": 1.2325, + "step": 10320 + }, + { + "epoch": 0.5288756911734589, + "grad_norm": 0.20998185873031616, + "learning_rate": 0.0015, + "loss": 1.247, + "step": 10330 + }, + { + "epoch": 0.5293876715134139, + "grad_norm": 0.23234112560749054, + "learning_rate": 0.0015, + "loss": 1.2395, + "step": 10340 + }, + { + "epoch": 0.5298996518533688, + "grad_norm": 0.2261328250169754, + "learning_rate": 0.0015, + "loss": 1.2244, + "step": 10350 + }, + { + "epoch": 0.5304116321933238, + "grad_norm": 0.2102995663881302, + "learning_rate": 0.0015, + "loss": 1.2307, + "step": 10360 + }, + { + "epoch": 0.5309236125332787, + "grad_norm": 0.21107365190982819, + "learning_rate": 0.0015, + "loss": 1.2195, + "step": 10370 + }, + { + "epoch": 0.5314355928732337, + "grad_norm": 0.2249820977449417, + "learning_rate": 0.0015, + "loss": 1.2499, + "step": 10380 + }, + { + "epoch": 0.5319475732131886, + "grad_norm": 0.2142641544342041, + "learning_rate": 0.0015, + "loss": 1.2329, + "step": 10390 + }, + { + "epoch": 0.5324595535531436, + "grad_norm": 0.2172004133462906, + "learning_rate": 0.0015, + "loss": 1.2098, + "step": 10400 + }, + { + "epoch": 0.5329715338930985, + "grad_norm": 0.19984416663646698, + "learning_rate": 0.0015, + "loss": 1.2135, + "step": 10410 + }, + { + "epoch": 0.5334835142330534, + "grad_norm": 0.22618216276168823, + "learning_rate": 0.0015, + "loss": 1.2173, + "step": 10420 + }, + { + "epoch": 0.5339954945730084, + "grad_norm": 0.22356146574020386, + "learning_rate": 0.0015, + "loss": 1.2423, + "step": 10430 + }, + { + "epoch": 0.5345074749129634, + "grad_norm": 0.2300511598587036, + "learning_rate": 0.0015, + "loss": 1.2308, + "step": 10440 + }, + { + "epoch": 0.5350194552529183, + "grad_norm": 0.22442519664764404, + "learning_rate": 0.0015, + "loss": 1.2435, + "step": 10450 + }, + { + "epoch": 0.5355314355928732, + "grad_norm": 0.21556325256824493, + "learning_rate": 0.0015, + "loss": 1.2499, + "step": 10460 + }, + { + "epoch": 0.5360434159328282, + "grad_norm": 0.21608006954193115, + "learning_rate": 0.0015, + "loss": 1.2367, + "step": 10470 + }, + { + "epoch": 0.5365553962727831, + "grad_norm": 0.22256320714950562, + "learning_rate": 0.0015, + "loss": 1.2325, + "step": 10480 + }, + { + "epoch": 0.5370673766127381, + "grad_norm": 0.22661398351192474, + "learning_rate": 0.0015, + "loss": 1.2253, + "step": 10490 + }, + { + "epoch": 0.537579356952693, + "grad_norm": 0.21327906847000122, + "learning_rate": 0.0015, + "loss": 1.215, + "step": 10500 + }, + { + "epoch": 0.5380913372926479, + "grad_norm": 0.21695594489574432, + "learning_rate": 0.0015, + "loss": 1.2372, + "step": 10510 + }, + { + "epoch": 0.5386033176326029, + "grad_norm": 0.20584948360919952, + "learning_rate": 0.0015, + "loss": 1.2491, + "step": 10520 + }, + { + "epoch": 0.5391152979725579, + "grad_norm": 0.2212359756231308, + "learning_rate": 0.0015, + "loss": 1.2415, + "step": 10530 + }, + { + "epoch": 0.5396272783125128, + "grad_norm": 0.2696838974952698, + "learning_rate": 0.0015, + "loss": 1.2254, + "step": 10540 + }, + { + "epoch": 0.5401392586524677, + "grad_norm": 0.21417804062366486, + "learning_rate": 0.0015, + "loss": 1.2307, + "step": 10550 + }, + { + "epoch": 0.5406512389924227, + "grad_norm": 0.2126997709274292, + "learning_rate": 0.0015, + "loss": 1.2134, + "step": 10560 + }, + { + "epoch": 0.5411632193323777, + "grad_norm": 0.21690891683101654, + "learning_rate": 0.0015, + "loss": 1.2136, + "step": 10570 + }, + { + "epoch": 0.5416751996723326, + "grad_norm": 0.21153941750526428, + "learning_rate": 0.0015, + "loss": 1.2157, + "step": 10580 + }, + { + "epoch": 0.5421871800122875, + "grad_norm": 0.21089473366737366, + "learning_rate": 0.0015, + "loss": 1.2272, + "step": 10590 + }, + { + "epoch": 0.5426991603522425, + "grad_norm": 0.2564721703529358, + "learning_rate": 0.0015, + "loss": 1.2026, + "step": 10600 + }, + { + "epoch": 0.5432111406921974, + "grad_norm": 0.2235645204782486, + "learning_rate": 0.0015, + "loss": 1.2373, + "step": 10610 + }, + { + "epoch": 0.5437231210321524, + "grad_norm": 0.21624423563480377, + "learning_rate": 0.0015, + "loss": 1.2208, + "step": 10620 + }, + { + "epoch": 0.5442351013721073, + "grad_norm": 0.22423268854618073, + "learning_rate": 0.0015, + "loss": 1.2246, + "step": 10630 + }, + { + "epoch": 0.5447470817120622, + "grad_norm": 0.20781590044498444, + "learning_rate": 0.0015, + "loss": 1.2197, + "step": 10640 + }, + { + "epoch": 0.5452590620520172, + "grad_norm": 0.21837033331394196, + "learning_rate": 0.0015, + "loss": 1.2195, + "step": 10650 + }, + { + "epoch": 0.5457710423919722, + "grad_norm": 0.23481489717960358, + "learning_rate": 0.0015, + "loss": 1.2221, + "step": 10660 + }, + { + "epoch": 0.5462830227319271, + "grad_norm": 0.20522017776966095, + "learning_rate": 0.0015, + "loss": 1.2119, + "step": 10670 + }, + { + "epoch": 0.546795003071882, + "grad_norm": 0.24082933366298676, + "learning_rate": 0.0015, + "loss": 1.2115, + "step": 10680 + }, + { + "epoch": 0.547306983411837, + "grad_norm": 0.21289277076721191, + "learning_rate": 0.0015, + "loss": 1.2386, + "step": 10690 + }, + { + "epoch": 0.547818963751792, + "grad_norm": 0.21003836393356323, + "learning_rate": 0.0015, + "loss": 1.2107, + "step": 10700 + }, + { + "epoch": 0.5483309440917469, + "grad_norm": 0.21242666244506836, + "learning_rate": 0.0015, + "loss": 1.2429, + "step": 10710 + }, + { + "epoch": 0.5488429244317018, + "grad_norm": 0.2271721065044403, + "learning_rate": 0.0015, + "loss": 1.2314, + "step": 10720 + }, + { + "epoch": 0.5493549047716567, + "grad_norm": 0.21104945242404938, + "learning_rate": 0.0015, + "loss": 1.2342, + "step": 10730 + }, + { + "epoch": 0.5498668851116117, + "grad_norm": 0.2085346132516861, + "learning_rate": 0.0015, + "loss": 1.2271, + "step": 10740 + }, + { + "epoch": 0.5503788654515667, + "grad_norm": 0.22231942415237427, + "learning_rate": 0.0015, + "loss": 1.2306, + "step": 10750 + }, + { + "epoch": 0.5508908457915216, + "grad_norm": 0.21245570480823517, + "learning_rate": 0.0015, + "loss": 1.2258, + "step": 10760 + }, + { + "epoch": 0.5514028261314765, + "grad_norm": 0.19826675951480865, + "learning_rate": 0.0015, + "loss": 1.2163, + "step": 10770 + }, + { + "epoch": 0.5519148064714315, + "grad_norm": 0.22163072228431702, + "learning_rate": 0.0015, + "loss": 1.229, + "step": 10780 + }, + { + "epoch": 0.5524267868113865, + "grad_norm": 0.21903766691684723, + "learning_rate": 0.0015, + "loss": 1.2139, + "step": 10790 + }, + { + "epoch": 0.5529387671513414, + "grad_norm": 0.2075222283601761, + "learning_rate": 0.0015, + "loss": 1.2129, + "step": 10800 + }, + { + "epoch": 0.5534507474912963, + "grad_norm": 0.21938522160053253, + "learning_rate": 0.0015, + "loss": 1.2232, + "step": 10810 + }, + { + "epoch": 0.5539627278312513, + "grad_norm": 0.21770595014095306, + "learning_rate": 0.0015, + "loss": 1.2465, + "step": 10820 + }, + { + "epoch": 0.5544747081712063, + "grad_norm": 0.20712700486183167, + "learning_rate": 0.0015, + "loss": 1.2183, + "step": 10830 + }, + { + "epoch": 0.5549866885111612, + "grad_norm": 0.22477000951766968, + "learning_rate": 0.0015, + "loss": 1.2186, + "step": 10840 + }, + { + "epoch": 0.5554986688511161, + "grad_norm": 0.21939463913440704, + "learning_rate": 0.0015, + "loss": 1.2355, + "step": 10850 + }, + { + "epoch": 0.556010649191071, + "grad_norm": 0.2524956464767456, + "learning_rate": 0.0015, + "loss": 1.2092, + "step": 10860 + }, + { + "epoch": 0.556522629531026, + "grad_norm": 0.2115110456943512, + "learning_rate": 0.0015, + "loss": 1.2137, + "step": 10870 + }, + { + "epoch": 0.557034609870981, + "grad_norm": 0.20509475469589233, + "learning_rate": 0.0015, + "loss": 1.2234, + "step": 10880 + }, + { + "epoch": 0.5575465902109359, + "grad_norm": 0.21247826516628265, + "learning_rate": 0.0015, + "loss": 1.2234, + "step": 10890 + }, + { + "epoch": 0.5580585705508908, + "grad_norm": 0.21064293384552002, + "learning_rate": 0.0015, + "loss": 1.2289, + "step": 10900 + }, + { + "epoch": 0.5585705508908458, + "grad_norm": 0.21902692317962646, + "learning_rate": 0.0015, + "loss": 1.2085, + "step": 10910 + }, + { + "epoch": 0.5590825312308008, + "grad_norm": 0.21347709000110626, + "learning_rate": 0.0015, + "loss": 1.2151, + "step": 10920 + }, + { + "epoch": 0.5595945115707557, + "grad_norm": 0.20034797489643097, + "learning_rate": 0.0015, + "loss": 1.218, + "step": 10930 + }, + { + "epoch": 0.5601064919107106, + "grad_norm": 0.20223546028137207, + "learning_rate": 0.0015, + "loss": 1.2176, + "step": 10940 + }, + { + "epoch": 0.5606184722506655, + "grad_norm": 0.23771893978118896, + "learning_rate": 0.0015, + "loss": 1.2297, + "step": 10950 + }, + { + "epoch": 0.5611304525906206, + "grad_norm": 0.24617038667201996, + "learning_rate": 0.0015, + "loss": 1.2331, + "step": 10960 + }, + { + "epoch": 0.5616424329305755, + "grad_norm": 0.2169172167778015, + "learning_rate": 0.0015, + "loss": 1.2319, + "step": 10970 + }, + { + "epoch": 0.5621544132705304, + "grad_norm": 0.21281367540359497, + "learning_rate": 0.0015, + "loss": 1.2205, + "step": 10980 + }, + { + "epoch": 0.5626663936104853, + "grad_norm": 0.21705804765224457, + "learning_rate": 0.0015, + "loss": 1.2138, + "step": 10990 + }, + { + "epoch": 0.5631783739504403, + "grad_norm": 0.19822140038013458, + "learning_rate": 0.0015, + "loss": 1.2339, + "step": 11000 + }, + { + "epoch": 0.5636903542903953, + "grad_norm": 0.20427508652210236, + "learning_rate": 0.0015, + "loss": 1.2195, + "step": 11010 + }, + { + "epoch": 0.5642023346303502, + "grad_norm": 0.2140669971704483, + "learning_rate": 0.0015, + "loss": 1.1975, + "step": 11020 + }, + { + "epoch": 0.5647143149703051, + "grad_norm": 0.20858561992645264, + "learning_rate": 0.0015, + "loss": 1.208, + "step": 11030 + }, + { + "epoch": 0.5652262953102601, + "grad_norm": 0.21723324060440063, + "learning_rate": 0.0015, + "loss": 1.2193, + "step": 11040 + }, + { + "epoch": 0.5657382756502151, + "grad_norm": 0.21611307561397552, + "learning_rate": 0.0015, + "loss": 1.2199, + "step": 11050 + }, + { + "epoch": 0.56625025599017, + "grad_norm": 0.21373584866523743, + "learning_rate": 0.0015, + "loss": 1.2065, + "step": 11060 + }, + { + "epoch": 0.5667622363301249, + "grad_norm": 0.2058737874031067, + "learning_rate": 0.0015, + "loss": 1.2019, + "step": 11070 + }, + { + "epoch": 0.5672742166700798, + "grad_norm": 0.22086186707019806, + "learning_rate": 0.0015, + "loss": 1.2108, + "step": 11080 + }, + { + "epoch": 0.5677861970100349, + "grad_norm": 0.21599149703979492, + "learning_rate": 0.0015, + "loss": 1.209, + "step": 11090 + }, + { + "epoch": 0.5682981773499898, + "grad_norm": 0.22241829335689545, + "learning_rate": 0.0015, + "loss": 1.2054, + "step": 11100 + }, + { + "epoch": 0.5688101576899447, + "grad_norm": 0.19618919491767883, + "learning_rate": 0.0015, + "loss": 1.2293, + "step": 11110 + }, + { + "epoch": 0.5693221380298996, + "grad_norm": 0.19986511766910553, + "learning_rate": 0.0015, + "loss": 1.1945, + "step": 11120 + }, + { + "epoch": 0.5698341183698546, + "grad_norm": 0.20131878554821014, + "learning_rate": 0.0015, + "loss": 1.2082, + "step": 11130 + }, + { + "epoch": 0.5703460987098096, + "grad_norm": 0.20655354857444763, + "learning_rate": 0.0015, + "loss": 1.2111, + "step": 11140 + }, + { + "epoch": 0.5708580790497645, + "grad_norm": 0.2156609296798706, + "learning_rate": 0.0015, + "loss": 1.2288, + "step": 11150 + }, + { + "epoch": 0.5713700593897194, + "grad_norm": 0.20367379486560822, + "learning_rate": 0.0015, + "loss": 1.2229, + "step": 11160 + }, + { + "epoch": 0.5718820397296743, + "grad_norm": 0.20256848633289337, + "learning_rate": 0.0015, + "loss": 1.2236, + "step": 11170 + }, + { + "epoch": 0.5723940200696294, + "grad_norm": 0.20862998068332672, + "learning_rate": 0.0015, + "loss": 1.2153, + "step": 11180 + }, + { + "epoch": 0.5729060004095843, + "grad_norm": 0.21000482141971588, + "learning_rate": 0.0015, + "loss": 1.2164, + "step": 11190 + }, + { + "epoch": 0.5734179807495392, + "grad_norm": 0.21778449416160583, + "learning_rate": 0.0015, + "loss": 1.2221, + "step": 11200 + }, + { + "epoch": 0.5739299610894941, + "grad_norm": 0.20954222977161407, + "learning_rate": 0.0015, + "loss": 1.2257, + "step": 11210 + }, + { + "epoch": 0.5744419414294492, + "grad_norm": 0.21105293929576874, + "learning_rate": 0.0015, + "loss": 1.2218, + "step": 11220 + }, + { + "epoch": 0.5749539217694041, + "grad_norm": 0.2167726457118988, + "learning_rate": 0.0015, + "loss": 1.2193, + "step": 11230 + }, + { + "epoch": 0.575465902109359, + "grad_norm": 0.20207858085632324, + "learning_rate": 0.0015, + "loss": 1.2243, + "step": 11240 + }, + { + "epoch": 0.5759778824493139, + "grad_norm": 0.21475255489349365, + "learning_rate": 0.0015, + "loss": 1.2222, + "step": 11250 + }, + { + "epoch": 0.576489862789269, + "grad_norm": 0.22506240010261536, + "learning_rate": 0.0015, + "loss": 1.2255, + "step": 11260 + }, + { + "epoch": 0.5770018431292239, + "grad_norm": 0.23033161461353302, + "learning_rate": 0.0015, + "loss": 1.2287, + "step": 11270 + }, + { + "epoch": 0.5775138234691788, + "grad_norm": 0.20455433428287506, + "learning_rate": 0.0015, + "loss": 1.2141, + "step": 11280 + }, + { + "epoch": 0.5780258038091337, + "grad_norm": 0.22457818686962128, + "learning_rate": 0.0015, + "loss": 1.2329, + "step": 11290 + }, + { + "epoch": 0.5785377841490886, + "grad_norm": 0.2011692076921463, + "learning_rate": 0.0015, + "loss": 1.213, + "step": 11300 + }, + { + "epoch": 0.5790497644890437, + "grad_norm": 0.20488318800926208, + "learning_rate": 0.0015, + "loss": 1.2224, + "step": 11310 + }, + { + "epoch": 0.5795617448289986, + "grad_norm": 0.22065885365009308, + "learning_rate": 0.0015, + "loss": 1.231, + "step": 11320 + }, + { + "epoch": 0.5800737251689535, + "grad_norm": 0.20532485842704773, + "learning_rate": 0.0015, + "loss": 1.2051, + "step": 11330 + }, + { + "epoch": 0.5805857055089084, + "grad_norm": 0.20642031729221344, + "learning_rate": 0.0015, + "loss": 1.215, + "step": 11340 + }, + { + "epoch": 0.5810976858488635, + "grad_norm": 0.20660312473773956, + "learning_rate": 0.0015, + "loss": 1.2191, + "step": 11350 + }, + { + "epoch": 0.5816096661888184, + "grad_norm": 0.21046073734760284, + "learning_rate": 0.0015, + "loss": 1.2142, + "step": 11360 + }, + { + "epoch": 0.5821216465287733, + "grad_norm": 0.21846343576908112, + "learning_rate": 0.0015, + "loss": 1.2205, + "step": 11370 + }, + { + "epoch": 0.5826336268687282, + "grad_norm": 0.20589517056941986, + "learning_rate": 0.0015, + "loss": 1.2057, + "step": 11380 + }, + { + "epoch": 0.5831456072086831, + "grad_norm": 0.20691034197807312, + "learning_rate": 0.0015, + "loss": 1.2064, + "step": 11390 + }, + { + "epoch": 0.5836575875486382, + "grad_norm": 0.21649305522441864, + "learning_rate": 0.0015, + "loss": 1.2032, + "step": 11400 + }, + { + "epoch": 0.5841695678885931, + "grad_norm": 0.2329801321029663, + "learning_rate": 0.0015, + "loss": 1.2196, + "step": 11410 + }, + { + "epoch": 0.584681548228548, + "grad_norm": 0.23256272077560425, + "learning_rate": 0.0015, + "loss": 1.2124, + "step": 11420 + }, + { + "epoch": 0.5851935285685029, + "grad_norm": 0.2036832720041275, + "learning_rate": 0.0015, + "loss": 1.2098, + "step": 11430 + }, + { + "epoch": 0.585705508908458, + "grad_norm": 0.21199576556682587, + "learning_rate": 0.0015, + "loss": 1.2266, + "step": 11440 + }, + { + "epoch": 0.5862174892484129, + "grad_norm": 0.2015303373336792, + "learning_rate": 0.0015, + "loss": 1.1916, + "step": 11450 + }, + { + "epoch": 0.5867294695883678, + "grad_norm": 0.2176617681980133, + "learning_rate": 0.0015, + "loss": 1.1888, + "step": 11460 + }, + { + "epoch": 0.5872414499283227, + "grad_norm": 0.21515142917633057, + "learning_rate": 0.0015, + "loss": 1.2096, + "step": 11470 + }, + { + "epoch": 0.5877534302682776, + "grad_norm": 0.21731404960155487, + "learning_rate": 0.0015, + "loss": 1.2077, + "step": 11480 + }, + { + "epoch": 0.5882654106082327, + "grad_norm": 0.20664644241333008, + "learning_rate": 0.0015, + "loss": 1.2027, + "step": 11490 + }, + { + "epoch": 0.5887773909481876, + "grad_norm": 0.20170624554157257, + "learning_rate": 0.0015, + "loss": 1.233, + "step": 11500 + }, + { + "epoch": 0.5892893712881425, + "grad_norm": 0.2092912346124649, + "learning_rate": 0.0015, + "loss": 1.2004, + "step": 11510 + }, + { + "epoch": 0.5898013516280974, + "grad_norm": 0.204396590590477, + "learning_rate": 0.0015, + "loss": 1.2052, + "step": 11520 + }, + { + "epoch": 0.5903133319680525, + "grad_norm": 0.2075720578432083, + "learning_rate": 0.0015, + "loss": 1.2042, + "step": 11530 + }, + { + "epoch": 0.5908253123080074, + "grad_norm": 0.19743815064430237, + "learning_rate": 0.0015, + "loss": 1.1974, + "step": 11540 + }, + { + "epoch": 0.5913372926479623, + "grad_norm": 0.19972637295722961, + "learning_rate": 0.0015, + "loss": 1.2021, + "step": 11550 + }, + { + "epoch": 0.5918492729879172, + "grad_norm": 0.20364214479923248, + "learning_rate": 0.0015, + "loss": 1.2149, + "step": 11560 + }, + { + "epoch": 0.5923612533278723, + "grad_norm": 0.20440620183944702, + "learning_rate": 0.0015, + "loss": 1.1855, + "step": 11570 + }, + { + "epoch": 0.5928732336678272, + "grad_norm": 0.21338412165641785, + "learning_rate": 0.0015, + "loss": 1.2022, + "step": 11580 + }, + { + "epoch": 0.5933852140077821, + "grad_norm": 0.2067076861858368, + "learning_rate": 0.0015, + "loss": 1.2109, + "step": 11590 + }, + { + "epoch": 0.593897194347737, + "grad_norm": 0.20598556101322174, + "learning_rate": 0.0015, + "loss": 1.2132, + "step": 11600 + }, + { + "epoch": 0.5944091746876919, + "grad_norm": 0.21331733465194702, + "learning_rate": 0.0015, + "loss": 1.2021, + "step": 11610 + }, + { + "epoch": 0.594921155027647, + "grad_norm": 0.23132279515266418, + "learning_rate": 0.0015, + "loss": 1.1954, + "step": 11620 + }, + { + "epoch": 0.5954331353676019, + "grad_norm": 0.2226603478193283, + "learning_rate": 0.0015, + "loss": 1.2055, + "step": 11630 + }, + { + "epoch": 0.5959451157075568, + "grad_norm": 0.19999723136425018, + "learning_rate": 0.0015, + "loss": 1.1961, + "step": 11640 + }, + { + "epoch": 0.5964570960475117, + "grad_norm": 0.19226787984371185, + "learning_rate": 0.0015, + "loss": 1.2056, + "step": 11650 + }, + { + "epoch": 0.5969690763874668, + "grad_norm": 0.20891976356506348, + "learning_rate": 0.0015, + "loss": 1.2023, + "step": 11660 + }, + { + "epoch": 0.5974810567274217, + "grad_norm": 0.19218876957893372, + "learning_rate": 0.0015, + "loss": 1.2027, + "step": 11670 + }, + { + "epoch": 0.5979930370673766, + "grad_norm": 0.20928075909614563, + "learning_rate": 0.0015, + "loss": 1.2176, + "step": 11680 + }, + { + "epoch": 0.5985050174073315, + "grad_norm": 0.204718217253685, + "learning_rate": 0.0015, + "loss": 1.2014, + "step": 11690 + }, + { + "epoch": 0.5990169977472865, + "grad_norm": 0.22869887948036194, + "learning_rate": 0.0015, + "loss": 1.1888, + "step": 11700 + }, + { + "epoch": 0.5995289780872415, + "grad_norm": 0.19692908227443695, + "learning_rate": 0.0015, + "loss": 1.2161, + "step": 11710 + }, + { + "epoch": 0.6000409584271964, + "grad_norm": 0.2099919617176056, + "learning_rate": 0.0015, + "loss": 1.1968, + "step": 11720 + }, + { + "epoch": 0.6005529387671513, + "grad_norm": 0.20044675469398499, + "learning_rate": 0.0015, + "loss": 1.2071, + "step": 11730 + }, + { + "epoch": 0.6010649191071062, + "grad_norm": 0.20645897090435028, + "learning_rate": 0.0015, + "loss": 1.2142, + "step": 11740 + }, + { + "epoch": 0.6015768994470613, + "grad_norm": 0.20446518063545227, + "learning_rate": 0.0015, + "loss": 1.1907, + "step": 11750 + }, + { + "epoch": 0.6020888797870162, + "grad_norm": 0.19793803989887238, + "learning_rate": 0.0015, + "loss": 1.2237, + "step": 11760 + }, + { + "epoch": 0.6026008601269711, + "grad_norm": 0.23807552456855774, + "learning_rate": 0.0015, + "loss": 1.2072, + "step": 11770 + }, + { + "epoch": 0.603112840466926, + "grad_norm": 0.20290285348892212, + "learning_rate": 0.0015, + "loss": 1.2048, + "step": 11780 + }, + { + "epoch": 0.6036248208068811, + "grad_norm": 0.21725532412528992, + "learning_rate": 0.0015, + "loss": 1.1961, + "step": 11790 + }, + { + "epoch": 0.604136801146836, + "grad_norm": 0.20467454195022583, + "learning_rate": 0.0015, + "loss": 1.2301, + "step": 11800 + }, + { + "epoch": 0.6046487814867909, + "grad_norm": 0.20618268847465515, + "learning_rate": 0.0015, + "loss": 1.2026, + "step": 11810 + }, + { + "epoch": 0.6051607618267458, + "grad_norm": 0.2097761183977127, + "learning_rate": 0.0015, + "loss": 1.1992, + "step": 11820 + }, + { + "epoch": 0.6056727421667008, + "grad_norm": 0.21861404180526733, + "learning_rate": 0.0015, + "loss": 1.2047, + "step": 11830 + }, + { + "epoch": 0.6061847225066558, + "grad_norm": 0.2066473513841629, + "learning_rate": 0.0015, + "loss": 1.2022, + "step": 11840 + }, + { + "epoch": 0.6066967028466107, + "grad_norm": 0.203571155667305, + "learning_rate": 0.0015, + "loss": 1.1729, + "step": 11850 + }, + { + "epoch": 0.6072086831865656, + "grad_norm": 0.20523090660572052, + "learning_rate": 0.0015, + "loss": 1.222, + "step": 11860 + }, + { + "epoch": 0.6077206635265205, + "grad_norm": 0.2021731734275818, + "learning_rate": 0.0015, + "loss": 1.1983, + "step": 11870 + }, + { + "epoch": 0.6082326438664756, + "grad_norm": 0.20643019676208496, + "learning_rate": 0.0015, + "loss": 1.2147, + "step": 11880 + }, + { + "epoch": 0.6087446242064305, + "grad_norm": 0.21817174553871155, + "learning_rate": 0.0015, + "loss": 1.1988, + "step": 11890 + }, + { + "epoch": 0.6092566045463854, + "grad_norm": 0.21849657595157623, + "learning_rate": 0.0015, + "loss": 1.1908, + "step": 11900 + }, + { + "epoch": 0.6097685848863403, + "grad_norm": 0.21117383241653442, + "learning_rate": 0.0015, + "loss": 1.2318, + "step": 11910 + }, + { + "epoch": 0.6102805652262953, + "grad_norm": 0.2120293378829956, + "learning_rate": 0.0015, + "loss": 1.2071, + "step": 11920 + }, + { + "epoch": 0.6107925455662503, + "grad_norm": 0.20229868590831757, + "learning_rate": 0.0015, + "loss": 1.191, + "step": 11930 + }, + { + "epoch": 0.6113045259062052, + "grad_norm": 0.19626636803150177, + "learning_rate": 0.0015, + "loss": 1.2172, + "step": 11940 + }, + { + "epoch": 0.6118165062461601, + "grad_norm": 0.21968694031238556, + "learning_rate": 0.0015, + "loss": 1.1901, + "step": 11950 + }, + { + "epoch": 0.612328486586115, + "grad_norm": 0.22982917726039886, + "learning_rate": 0.0015, + "loss": 1.2023, + "step": 11960 + }, + { + "epoch": 0.6128404669260701, + "grad_norm": 0.20328094065189362, + "learning_rate": 0.0015, + "loss": 1.193, + "step": 11970 + }, + { + "epoch": 0.613352447266025, + "grad_norm": 0.20781250298023224, + "learning_rate": 0.0015, + "loss": 1.1871, + "step": 11980 + }, + { + "epoch": 0.6138644276059799, + "grad_norm": 0.1945171356201172, + "learning_rate": 0.0015, + "loss": 1.1954, + "step": 11990 + }, + { + "epoch": 0.6143764079459348, + "grad_norm": 0.2018270492553711, + "learning_rate": 0.0015, + "loss": 1.1848, + "step": 12000 + }, + { + "epoch": 0.6148883882858899, + "grad_norm": 0.20180918276309967, + "learning_rate": 0.0015, + "loss": 1.2081, + "step": 12010 + }, + { + "epoch": 0.6154003686258448, + "grad_norm": 0.20221208035945892, + "learning_rate": 0.0015, + "loss": 1.2076, + "step": 12020 + }, + { + "epoch": 0.6159123489657997, + "grad_norm": 0.2013401836156845, + "learning_rate": 0.0015, + "loss": 1.2211, + "step": 12030 + }, + { + "epoch": 0.6164243293057546, + "grad_norm": 0.20016033947467804, + "learning_rate": 0.0015, + "loss": 1.2037, + "step": 12040 + }, + { + "epoch": 0.6169363096457096, + "grad_norm": 0.20722372829914093, + "learning_rate": 0.0015, + "loss": 1.2052, + "step": 12050 + }, + { + "epoch": 0.6174482899856646, + "grad_norm": 0.21285022795200348, + "learning_rate": 0.0015, + "loss": 1.2066, + "step": 12060 + }, + { + "epoch": 0.6179602703256195, + "grad_norm": 0.21281997859477997, + "learning_rate": 0.0015, + "loss": 1.1955, + "step": 12070 + }, + { + "epoch": 0.6184722506655744, + "grad_norm": 0.19675594568252563, + "learning_rate": 0.0015, + "loss": 1.2088, + "step": 12080 + }, + { + "epoch": 0.6189842310055294, + "grad_norm": 0.21459296345710754, + "learning_rate": 0.0015, + "loss": 1.2255, + "step": 12090 + }, + { + "epoch": 0.6194962113454844, + "grad_norm": 0.20511606335639954, + "learning_rate": 0.0015, + "loss": 1.2, + "step": 12100 + }, + { + "epoch": 0.6200081916854393, + "grad_norm": 0.20228254795074463, + "learning_rate": 0.0015, + "loss": 1.1906, + "step": 12110 + }, + { + "epoch": 0.6205201720253942, + "grad_norm": 0.1966087371110916, + "learning_rate": 0.0015, + "loss": 1.1771, + "step": 12120 + }, + { + "epoch": 0.6210321523653491, + "grad_norm": 0.2050897479057312, + "learning_rate": 0.0015, + "loss": 1.1931, + "step": 12130 + }, + { + "epoch": 0.6215441327053041, + "grad_norm": 0.20761296153068542, + "learning_rate": 0.0015, + "loss": 1.1796, + "step": 12140 + }, + { + "epoch": 0.6220561130452591, + "grad_norm": 0.19282642006874084, + "learning_rate": 0.0015, + "loss": 1.2022, + "step": 12150 + }, + { + "epoch": 0.622568093385214, + "grad_norm": 0.2018144577741623, + "learning_rate": 0.0015, + "loss": 1.2151, + "step": 12160 + }, + { + "epoch": 0.6230800737251689, + "grad_norm": 0.19583159685134888, + "learning_rate": 0.0015, + "loss": 1.2027, + "step": 12170 + }, + { + "epoch": 0.6235920540651239, + "grad_norm": 0.22334228456020355, + "learning_rate": 0.0015, + "loss": 1.2158, + "step": 12180 + }, + { + "epoch": 0.6241040344050789, + "grad_norm": 0.2306404560804367, + "learning_rate": 0.0015, + "loss": 1.1856, + "step": 12190 + }, + { + "epoch": 0.6246160147450338, + "grad_norm": 0.21355292201042175, + "learning_rate": 0.0015, + "loss": 1.1723, + "step": 12200 + }, + { + "epoch": 0.6251279950849887, + "grad_norm": 0.19845044612884521, + "learning_rate": 0.0015, + "loss": 1.2052, + "step": 12210 + }, + { + "epoch": 0.6256399754249437, + "grad_norm": 0.2062026709318161, + "learning_rate": 0.0015, + "loss": 1.2093, + "step": 12220 + }, + { + "epoch": 0.6261519557648987, + "grad_norm": 0.20521892607212067, + "learning_rate": 0.0015, + "loss": 1.1888, + "step": 12230 + }, + { + "epoch": 0.6266639361048536, + "grad_norm": 0.20746907591819763, + "learning_rate": 0.0015, + "loss": 1.2038, + "step": 12240 + }, + { + "epoch": 0.6271759164448085, + "grad_norm": 0.19719459116458893, + "learning_rate": 0.0015, + "loss": 1.1995, + "step": 12250 + }, + { + "epoch": 0.6276878967847634, + "grad_norm": 0.20681564509868622, + "learning_rate": 0.0015, + "loss": 1.2157, + "step": 12260 + }, + { + "epoch": 0.6281998771247184, + "grad_norm": 0.20236019790172577, + "learning_rate": 0.0015, + "loss": 1.1859, + "step": 12270 + }, + { + "epoch": 0.6287118574646734, + "grad_norm": 0.22654055058956146, + "learning_rate": 0.0015, + "loss": 1.1961, + "step": 12280 + }, + { + "epoch": 0.6292238378046283, + "grad_norm": 0.1928294599056244, + "learning_rate": 0.0015, + "loss": 1.1932, + "step": 12290 + }, + { + "epoch": 0.6297358181445832, + "grad_norm": 0.21249711513519287, + "learning_rate": 0.0015, + "loss": 1.2018, + "step": 12300 + }, + { + "epoch": 0.6302477984845382, + "grad_norm": 0.19809094071388245, + "learning_rate": 0.0015, + "loss": 1.1806, + "step": 12310 + }, + { + "epoch": 0.6307597788244932, + "grad_norm": 0.1965721845626831, + "learning_rate": 0.0015, + "loss": 1.1956, + "step": 12320 + }, + { + "epoch": 0.6312717591644481, + "grad_norm": 0.20646794140338898, + "learning_rate": 0.0015, + "loss": 1.1907, + "step": 12330 + }, + { + "epoch": 0.631783739504403, + "grad_norm": 0.19848330318927765, + "learning_rate": 0.0015, + "loss": 1.2049, + "step": 12340 + }, + { + "epoch": 0.632295719844358, + "grad_norm": 0.19884952902793884, + "learning_rate": 0.0015, + "loss": 1.1886, + "step": 12350 + }, + { + "epoch": 0.6328077001843129, + "grad_norm": 0.21490252017974854, + "learning_rate": 0.0015, + "loss": 1.2033, + "step": 12360 + }, + { + "epoch": 0.6333196805242679, + "grad_norm": 0.21076445281505585, + "learning_rate": 0.0015, + "loss": 1.1725, + "step": 12370 + }, + { + "epoch": 0.6338316608642228, + "grad_norm": 0.20743723213672638, + "learning_rate": 0.0015, + "loss": 1.2118, + "step": 12380 + }, + { + "epoch": 0.6343436412041777, + "grad_norm": 0.2091572880744934, + "learning_rate": 0.0015, + "loss": 1.2058, + "step": 12390 + }, + { + "epoch": 0.6348556215441327, + "grad_norm": 0.19593819975852966, + "learning_rate": 0.0015, + "loss": 1.1789, + "step": 12400 + }, + { + "epoch": 0.6353676018840877, + "grad_norm": 0.21120460331439972, + "learning_rate": 0.0015, + "loss": 1.199, + "step": 12410 + }, + { + "epoch": 0.6358795822240426, + "grad_norm": 0.19703616201877594, + "learning_rate": 0.0015, + "loss": 1.2062, + "step": 12420 + }, + { + "epoch": 0.6363915625639975, + "grad_norm": 0.2228432148694992, + "learning_rate": 0.0015, + "loss": 1.2046, + "step": 12430 + }, + { + "epoch": 0.6369035429039525, + "grad_norm": 0.19556592404842377, + "learning_rate": 0.0015, + "loss": 1.1958, + "step": 12440 + }, + { + "epoch": 0.6374155232439075, + "grad_norm": 0.2118174135684967, + "learning_rate": 0.0015, + "loss": 1.2158, + "step": 12450 + }, + { + "epoch": 0.6379275035838624, + "grad_norm": 0.19802866876125336, + "learning_rate": 0.0015, + "loss": 1.1889, + "step": 12460 + }, + { + "epoch": 0.6384394839238173, + "grad_norm": 0.2045314460992813, + "learning_rate": 0.0015, + "loss": 1.2052, + "step": 12470 + }, + { + "epoch": 0.6389514642637723, + "grad_norm": 0.20061345398426056, + "learning_rate": 0.0015, + "loss": 1.1859, + "step": 12480 + }, + { + "epoch": 0.6394634446037272, + "grad_norm": 0.19872547686100006, + "learning_rate": 0.0015, + "loss": 1.2002, + "step": 12490 + }, + { + "epoch": 0.6399754249436822, + "grad_norm": 0.2001519650220871, + "learning_rate": 0.0015, + "loss": 1.192, + "step": 12500 + }, + { + "epoch": 0.6404874052836371, + "grad_norm": 0.20049947500228882, + "learning_rate": 0.0015, + "loss": 1.1919, + "step": 12510 + }, + { + "epoch": 0.640999385623592, + "grad_norm": 0.20143716037273407, + "learning_rate": 0.0015, + "loss": 1.1821, + "step": 12520 + }, + { + "epoch": 0.641511365963547, + "grad_norm": 0.19347570836544037, + "learning_rate": 0.0015, + "loss": 1.2135, + "step": 12530 + }, + { + "epoch": 0.642023346303502, + "grad_norm": 0.19492658972740173, + "learning_rate": 0.0015, + "loss": 1.1891, + "step": 12540 + }, + { + "epoch": 0.6425353266434569, + "grad_norm": 0.19527223706245422, + "learning_rate": 0.0015, + "loss": 1.2102, + "step": 12550 + }, + { + "epoch": 0.6430473069834118, + "grad_norm": 0.1927892118692398, + "learning_rate": 0.0015, + "loss": 1.1714, + "step": 12560 + }, + { + "epoch": 0.6435592873233668, + "grad_norm": 0.2009015530347824, + "learning_rate": 0.0015, + "loss": 1.2035, + "step": 12570 + }, + { + "epoch": 0.6440712676633217, + "grad_norm": 0.21776844561100006, + "learning_rate": 0.0015, + "loss": 1.1777, + "step": 12580 + }, + { + "epoch": 0.6445832480032767, + "grad_norm": 0.19154374301433563, + "learning_rate": 0.0015, + "loss": 1.1906, + "step": 12590 + }, + { + "epoch": 0.6450952283432316, + "grad_norm": 0.19381144642829895, + "learning_rate": 0.0015, + "loss": 1.1778, + "step": 12600 + }, + { + "epoch": 0.6456072086831866, + "grad_norm": 0.19017955660820007, + "learning_rate": 0.0015, + "loss": 1.1967, + "step": 12610 + }, + { + "epoch": 0.6461191890231415, + "grad_norm": 0.21785299479961395, + "learning_rate": 0.0015, + "loss": 1.2088, + "step": 12620 + }, + { + "epoch": 0.6466311693630965, + "grad_norm": 0.2039538025856018, + "learning_rate": 0.0015, + "loss": 1.1663, + "step": 12630 + }, + { + "epoch": 0.6471431497030514, + "grad_norm": 0.19732427597045898, + "learning_rate": 0.0015, + "loss": 1.1913, + "step": 12640 + }, + { + "epoch": 0.6476551300430063, + "grad_norm": 0.1911800503730774, + "learning_rate": 0.0015, + "loss": 1.2052, + "step": 12650 + }, + { + "epoch": 0.6481671103829613, + "grad_norm": 0.19413244724273682, + "learning_rate": 0.0015, + "loss": 1.1804, + "step": 12660 + }, + { + "epoch": 0.6486790907229162, + "grad_norm": 0.1838771104812622, + "learning_rate": 0.0015, + "loss": 1.1911, + "step": 12670 + }, + { + "epoch": 0.6491910710628712, + "grad_norm": 0.1838536560535431, + "learning_rate": 0.0015, + "loss": 1.1991, + "step": 12680 + }, + { + "epoch": 0.6497030514028261, + "grad_norm": 0.20453278720378876, + "learning_rate": 0.0015, + "loss": 1.1992, + "step": 12690 + }, + { + "epoch": 0.6502150317427811, + "grad_norm": 0.21677398681640625, + "learning_rate": 0.0015, + "loss": 1.1811, + "step": 12700 + }, + { + "epoch": 0.650727012082736, + "grad_norm": 0.19484928250312805, + "learning_rate": 0.0015, + "loss": 1.1924, + "step": 12710 + }, + { + "epoch": 0.651238992422691, + "grad_norm": 0.1887393295764923, + "learning_rate": 0.0015, + "loss": 1.1978, + "step": 12720 + }, + { + "epoch": 0.6517509727626459, + "grad_norm": 0.19239051640033722, + "learning_rate": 0.0015, + "loss": 1.2051, + "step": 12730 + }, + { + "epoch": 0.6522629531026009, + "grad_norm": 0.20435065031051636, + "learning_rate": 0.0015, + "loss": 1.153, + "step": 12740 + }, + { + "epoch": 0.6527749334425558, + "grad_norm": 0.2020270824432373, + "learning_rate": 0.0015, + "loss": 1.2096, + "step": 12750 + }, + { + "epoch": 0.6532869137825108, + "grad_norm": 0.21720841526985168, + "learning_rate": 0.0015, + "loss": 1.1776, + "step": 12760 + }, + { + "epoch": 0.6537988941224657, + "grad_norm": 0.19210828840732574, + "learning_rate": 0.0015, + "loss": 1.1894, + "step": 12770 + }, + { + "epoch": 0.6543108744624206, + "grad_norm": 0.19044719636440277, + "learning_rate": 0.0015, + "loss": 1.1894, + "step": 12780 + }, + { + "epoch": 0.6548228548023756, + "grad_norm": 0.20893365144729614, + "learning_rate": 0.0015, + "loss": 1.1916, + "step": 12790 + }, + { + "epoch": 0.6553348351423305, + "grad_norm": 0.20288752019405365, + "learning_rate": 0.0015, + "loss": 1.2018, + "step": 12800 + }, + { + "epoch": 0.6558468154822855, + "grad_norm": 0.1970445066690445, + "learning_rate": 0.0015, + "loss": 1.1728, + "step": 12810 + }, + { + "epoch": 0.6563587958222404, + "grad_norm": 0.19928324222564697, + "learning_rate": 0.0015, + "loss": 1.1959, + "step": 12820 + }, + { + "epoch": 0.6568707761621954, + "grad_norm": 0.1929846554994583, + "learning_rate": 0.0015, + "loss": 1.1885, + "step": 12830 + }, + { + "epoch": 0.6573827565021503, + "grad_norm": 0.20633605122566223, + "learning_rate": 0.0015, + "loss": 1.2145, + "step": 12840 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.19971442222595215, + "learning_rate": 0.0015, + "loss": 1.188, + "step": 12850 + }, + { + "epoch": 0.6584067171820602, + "grad_norm": 0.18677356839179993, + "learning_rate": 0.0015, + "loss": 1.1943, + "step": 12860 + }, + { + "epoch": 0.6589186975220152, + "grad_norm": 0.1940857172012329, + "learning_rate": 0.0015, + "loss": 1.1921, + "step": 12870 + }, + { + "epoch": 0.6594306778619701, + "grad_norm": 0.20788009464740753, + "learning_rate": 0.0015, + "loss": 1.1922, + "step": 12880 + }, + { + "epoch": 0.659942658201925, + "grad_norm": 0.20371931791305542, + "learning_rate": 0.0015, + "loss": 1.1963, + "step": 12890 + }, + { + "epoch": 0.66045463854188, + "grad_norm": 0.19461549818515778, + "learning_rate": 0.0015, + "loss": 1.1639, + "step": 12900 + }, + { + "epoch": 0.6609666188818349, + "grad_norm": 0.19904249906539917, + "learning_rate": 0.0015, + "loss": 1.1708, + "step": 12910 + }, + { + "epoch": 0.6614785992217899, + "grad_norm": 0.2062397003173828, + "learning_rate": 0.0015, + "loss": 1.1937, + "step": 12920 + }, + { + "epoch": 0.6619905795617448, + "grad_norm": 0.20642533898353577, + "learning_rate": 0.0015, + "loss": 1.1929, + "step": 12930 + }, + { + "epoch": 0.6625025599016998, + "grad_norm": 0.19433195888996124, + "learning_rate": 0.0015, + "loss": 1.1886, + "step": 12940 + }, + { + "epoch": 0.6630145402416547, + "grad_norm": 0.1951138973236084, + "learning_rate": 0.0015, + "loss": 1.1847, + "step": 12950 + }, + { + "epoch": 0.6635265205816097, + "grad_norm": 0.19220565259456635, + "learning_rate": 0.0015, + "loss": 1.1847, + "step": 12960 + }, + { + "epoch": 0.6640385009215646, + "grad_norm": 0.1887965053319931, + "learning_rate": 0.0015, + "loss": 1.1791, + "step": 12970 + }, + { + "epoch": 0.6645504812615196, + "grad_norm": 0.18562547862529755, + "learning_rate": 0.0015, + "loss": 1.1677, + "step": 12980 + }, + { + "epoch": 0.6650624616014745, + "grad_norm": 0.1826203167438507, + "learning_rate": 0.0015, + "loss": 1.1796, + "step": 12990 + }, + { + "epoch": 0.6655744419414295, + "grad_norm": 0.18740873038768768, + "learning_rate": 0.0015, + "loss": 1.1797, + "step": 13000 + }, + { + "epoch": 0.6660864222813844, + "grad_norm": 0.1979881227016449, + "learning_rate": 0.0015, + "loss": 1.198, + "step": 13010 + }, + { + "epoch": 0.6665984026213393, + "grad_norm": 0.20608335733413696, + "learning_rate": 0.0015, + "loss": 1.1926, + "step": 13020 + }, + { + "epoch": 0.6671103829612943, + "grad_norm": 0.21441541612148285, + "learning_rate": 0.0015, + "loss": 1.2049, + "step": 13030 + }, + { + "epoch": 0.6676223633012492, + "grad_norm": 0.22678618133068085, + "learning_rate": 0.0015, + "loss": 1.1917, + "step": 13040 + }, + { + "epoch": 0.6681343436412042, + "grad_norm": 0.19718590378761292, + "learning_rate": 0.0015, + "loss": 1.1968, + "step": 13050 + }, + { + "epoch": 0.6686463239811591, + "grad_norm": 0.19607524573802948, + "learning_rate": 0.0015, + "loss": 1.1721, + "step": 13060 + }, + { + "epoch": 0.6691583043211141, + "grad_norm": 0.19298435747623444, + "learning_rate": 0.0015, + "loss": 1.1979, + "step": 13070 + }, + { + "epoch": 0.669670284661069, + "grad_norm": 0.19610482454299927, + "learning_rate": 0.0015, + "loss": 1.1919, + "step": 13080 + }, + { + "epoch": 0.670182265001024, + "grad_norm": 0.19872240722179413, + "learning_rate": 0.0015, + "loss": 1.183, + "step": 13090 + }, + { + "epoch": 0.6706942453409789, + "grad_norm": 0.1863928586244583, + "learning_rate": 0.0015, + "loss": 1.1868, + "step": 13100 + }, + { + "epoch": 0.6712062256809338, + "grad_norm": 0.19495519995689392, + "learning_rate": 0.0015, + "loss": 1.2084, + "step": 13110 + }, + { + "epoch": 0.6717182060208888, + "grad_norm": 0.19348977506160736, + "learning_rate": 0.0015, + "loss": 1.1981, + "step": 13120 + }, + { + "epoch": 0.6722301863608438, + "grad_norm": 0.19418825209140778, + "learning_rate": 0.0015, + "loss": 1.2081, + "step": 13130 + }, + { + "epoch": 0.6727421667007987, + "grad_norm": 0.19263537228107452, + "learning_rate": 0.0015, + "loss": 1.181, + "step": 13140 + }, + { + "epoch": 0.6732541470407536, + "grad_norm": 0.19272197782993317, + "learning_rate": 0.0015, + "loss": 1.1908, + "step": 13150 + }, + { + "epoch": 0.6737661273807086, + "grad_norm": 0.19103066623210907, + "learning_rate": 0.0015, + "loss": 1.164, + "step": 13160 + }, + { + "epoch": 0.6742781077206635, + "grad_norm": 0.19996246695518494, + "learning_rate": 0.0015, + "loss": 1.1951, + "step": 13170 + }, + { + "epoch": 0.6747900880606185, + "grad_norm": 0.2288653403520584, + "learning_rate": 0.0015, + "loss": 1.2188, + "step": 13180 + }, + { + "epoch": 0.6753020684005734, + "grad_norm": 0.1978132575750351, + "learning_rate": 0.0015, + "loss": 1.177, + "step": 13190 + }, + { + "epoch": 0.6758140487405284, + "grad_norm": 0.2042623907327652, + "learning_rate": 0.0015, + "loss": 1.1833, + "step": 13200 + }, + { + "epoch": 0.6763260290804833, + "grad_norm": 0.1838945895433426, + "learning_rate": 0.0015, + "loss": 1.1638, + "step": 13210 + }, + { + "epoch": 0.6768380094204383, + "grad_norm": 0.18537567555904388, + "learning_rate": 0.0015, + "loss": 1.1879, + "step": 13220 + }, + { + "epoch": 0.6773499897603932, + "grad_norm": 0.19888518750667572, + "learning_rate": 0.0015, + "loss": 1.1648, + "step": 13230 + }, + { + "epoch": 0.6778619701003481, + "grad_norm": 0.20373912155628204, + "learning_rate": 0.0015, + "loss": 1.2043, + "step": 13240 + }, + { + "epoch": 0.6783739504403031, + "grad_norm": 0.19218416512012482, + "learning_rate": 0.0015, + "loss": 1.1553, + "step": 13250 + }, + { + "epoch": 0.678885930780258, + "grad_norm": 0.1989835649728775, + "learning_rate": 0.0015, + "loss": 1.1679, + "step": 13260 + }, + { + "epoch": 0.679397911120213, + "grad_norm": 0.20067016780376434, + "learning_rate": 0.0015, + "loss": 1.1827, + "step": 13270 + }, + { + "epoch": 0.6799098914601679, + "grad_norm": 0.19568151235580444, + "learning_rate": 0.0015, + "loss": 1.1839, + "step": 13280 + }, + { + "epoch": 0.6804218718001229, + "grad_norm": 0.2029784619808197, + "learning_rate": 0.0015, + "loss": 1.1787, + "step": 13290 + }, + { + "epoch": 0.6809338521400778, + "grad_norm": 0.19807346165180206, + "learning_rate": 0.0015, + "loss": 1.1763, + "step": 13300 + }, + { + "epoch": 0.6814458324800328, + "grad_norm": 0.1898653358221054, + "learning_rate": 0.0015, + "loss": 1.2075, + "step": 13310 + }, + { + "epoch": 0.6819578128199877, + "grad_norm": 0.2038862705230713, + "learning_rate": 0.0015, + "loss": 1.1773, + "step": 13320 + }, + { + "epoch": 0.6824697931599426, + "grad_norm": 0.18675602972507477, + "learning_rate": 0.0015, + "loss": 1.1888, + "step": 13330 + }, + { + "epoch": 0.6829817734998976, + "grad_norm": 0.20663636922836304, + "learning_rate": 0.0015, + "loss": 1.169, + "step": 13340 + }, + { + "epoch": 0.6834937538398526, + "grad_norm": 0.1998421996831894, + "learning_rate": 0.0015, + "loss": 1.1725, + "step": 13350 + }, + { + "epoch": 0.6840057341798075, + "grad_norm": 0.20095355808734894, + "learning_rate": 0.0015, + "loss": 1.1727, + "step": 13360 + }, + { + "epoch": 0.6845177145197624, + "grad_norm": 0.19053997099399567, + "learning_rate": 0.0015, + "loss": 1.1759, + "step": 13370 + }, + { + "epoch": 0.6850296948597174, + "grad_norm": 0.20177049934864044, + "learning_rate": 0.0015, + "loss": 1.1845, + "step": 13380 + }, + { + "epoch": 0.6855416751996724, + "grad_norm": 0.19868339598178864, + "learning_rate": 0.0015, + "loss": 1.178, + "step": 13390 + }, + { + "epoch": 0.6860536555396273, + "grad_norm": 0.1922164112329483, + "learning_rate": 0.0015, + "loss": 1.1536, + "step": 13400 + }, + { + "epoch": 0.6865656358795822, + "grad_norm": 0.2025415003299713, + "learning_rate": 0.0015, + "loss": 1.1849, + "step": 13410 + }, + { + "epoch": 0.6870776162195372, + "grad_norm": 0.19813013076782227, + "learning_rate": 0.0015, + "loss": 1.1803, + "step": 13420 + }, + { + "epoch": 0.6875895965594921, + "grad_norm": 0.18536531925201416, + "learning_rate": 0.0015, + "loss": 1.1686, + "step": 13430 + }, + { + "epoch": 0.6881015768994471, + "grad_norm": 0.1998080015182495, + "learning_rate": 0.0015, + "loss": 1.1949, + "step": 13440 + }, + { + "epoch": 0.688613557239402, + "grad_norm": 0.1955641508102417, + "learning_rate": 0.0015, + "loss": 1.1758, + "step": 13450 + }, + { + "epoch": 0.6891255375793569, + "grad_norm": 0.19140900671482086, + "learning_rate": 0.0015, + "loss": 1.1675, + "step": 13460 + }, + { + "epoch": 0.6896375179193119, + "grad_norm": 0.20261794328689575, + "learning_rate": 0.0015, + "loss": 1.1802, + "step": 13470 + }, + { + "epoch": 0.6901494982592669, + "grad_norm": 0.19682539999485016, + "learning_rate": 0.0015, + "loss": 1.1798, + "step": 13480 + }, + { + "epoch": 0.6906614785992218, + "grad_norm": 0.2020127922296524, + "learning_rate": 0.0015, + "loss": 1.172, + "step": 13490 + }, + { + "epoch": 0.6911734589391767, + "grad_norm": 0.19824573397636414, + "learning_rate": 0.0015, + "loss": 1.1888, + "step": 13500 + }, + { + "epoch": 0.6916854392791317, + "grad_norm": 0.20089636743068695, + "learning_rate": 0.0015, + "loss": 1.1865, + "step": 13510 + }, + { + "epoch": 0.6921974196190867, + "grad_norm": 0.1954367458820343, + "learning_rate": 0.0015, + "loss": 1.1734, + "step": 13520 + }, + { + "epoch": 0.6927093999590416, + "grad_norm": 0.1989155411720276, + "learning_rate": 0.0015, + "loss": 1.1676, + "step": 13530 + }, + { + "epoch": 0.6932213802989965, + "grad_norm": 0.20354506373405457, + "learning_rate": 0.0015, + "loss": 1.1638, + "step": 13540 + }, + { + "epoch": 0.6937333606389514, + "grad_norm": 0.18505001068115234, + "learning_rate": 0.0015, + "loss": 1.1623, + "step": 13550 + }, + { + "epoch": 0.6942453409789064, + "grad_norm": 0.19758115708827972, + "learning_rate": 0.0015, + "loss": 1.1715, + "step": 13560 + }, + { + "epoch": 0.6947573213188614, + "grad_norm": 0.19761599600315094, + "learning_rate": 0.0015, + "loss": 1.1892, + "step": 13570 + }, + { + "epoch": 0.6952693016588163, + "grad_norm": 0.2028966248035431, + "learning_rate": 0.0015, + "loss": 1.1779, + "step": 13580 + }, + { + "epoch": 0.6957812819987712, + "grad_norm": 0.1852991133928299, + "learning_rate": 0.0015, + "loss": 1.1756, + "step": 13590 + }, + { + "epoch": 0.6962932623387262, + "grad_norm": 0.18972176313400269, + "learning_rate": 0.0015, + "loss": 1.1583, + "step": 13600 + }, + { + "epoch": 0.6968052426786812, + "grad_norm": 0.18746834993362427, + "learning_rate": 0.0015, + "loss": 1.1758, + "step": 13610 + }, + { + "epoch": 0.6973172230186361, + "grad_norm": 0.1831192672252655, + "learning_rate": 0.0015, + "loss": 1.1904, + "step": 13620 + }, + { + "epoch": 0.697829203358591, + "grad_norm": 0.21230356395244598, + "learning_rate": 0.0015, + "loss": 1.1673, + "step": 13630 + }, + { + "epoch": 0.698341183698546, + "grad_norm": 0.2109021544456482, + "learning_rate": 0.0015, + "loss": 1.176, + "step": 13640 + }, + { + "epoch": 0.698853164038501, + "grad_norm": 0.18572686612606049, + "learning_rate": 0.0015, + "loss": 1.195, + "step": 13650 + }, + { + "epoch": 0.6993651443784559, + "grad_norm": 0.19169217348098755, + "learning_rate": 0.0015, + "loss": 1.1865, + "step": 13660 + }, + { + "epoch": 0.6998771247184108, + "grad_norm": 0.18918085098266602, + "learning_rate": 0.0015, + "loss": 1.1788, + "step": 13670 + }, + { + "epoch": 0.7003891050583657, + "grad_norm": 0.19315798580646515, + "learning_rate": 0.0014955269451601939, + "loss": 1.1739, + "step": 13680 + }, + { + "epoch": 0.7009010853983207, + "grad_norm": 0.18943412601947784, + "learning_rate": 0.0014896616625957439, + "loss": 1.1649, + "step": 13690 + }, + { + "epoch": 0.7014130657382757, + "grad_norm": 0.19846367835998535, + "learning_rate": 0.001483819382986655, + "loss": 1.1883, + "step": 13700 + }, + { + "epoch": 0.7019250460782306, + "grad_norm": 0.19269226491451263, + "learning_rate": 0.001478000016118014, + "loss": 1.1775, + "step": 13710 + }, + { + "epoch": 0.7024370264181855, + "grad_norm": 0.19260330498218536, + "learning_rate": 0.0014722034721287212, + "loss": 1.169, + "step": 13720 + }, + { + "epoch": 0.7029490067581405, + "grad_norm": 0.19868920743465424, + "learning_rate": 0.0014664296615101004, + "loss": 1.1671, + "step": 13730 + }, + { + "epoch": 0.7034609870980955, + "grad_norm": 0.1958989053964615, + "learning_rate": 0.0014606784951045186, + "loss": 1.2049, + "step": 13740 + }, + { + "epoch": 0.7039729674380504, + "grad_norm": 0.194174125790596, + "learning_rate": 0.0014549498841040086, + "loss": 1.1703, + "step": 13750 + }, + { + "epoch": 0.7044849477780053, + "grad_norm": 0.19567228853702545, + "learning_rate": 0.0014492437400488976, + "loss": 1.1649, + "step": 13760 + }, + { + "epoch": 0.7049969281179602, + "grad_norm": 0.191901296377182, + "learning_rate": 0.0014435599748264416, + "loss": 1.169, + "step": 13770 + }, + { + "epoch": 0.7055089084579153, + "grad_norm": 0.1933002918958664, + "learning_rate": 0.0014378985006694644, + "loss": 1.1873, + "step": 13780 + }, + { + "epoch": 0.7060208887978702, + "grad_norm": 0.20665253698825836, + "learning_rate": 0.0014322592301550022, + "loss": 1.1773, + "step": 13790 + }, + { + "epoch": 0.7065328691378251, + "grad_norm": 0.19543762505054474, + "learning_rate": 0.0014266420762029542, + "loss": 1.1738, + "step": 13800 + }, + { + "epoch": 0.70704484947778, + "grad_norm": 0.186002716422081, + "learning_rate": 0.0014210469520747377, + "loss": 1.1783, + "step": 13810 + }, + { + "epoch": 0.707556829817735, + "grad_norm": 0.1872335523366928, + "learning_rate": 0.0014154737713719476, + "loss": 1.1918, + "step": 13820 + }, + { + "epoch": 0.70806881015769, + "grad_norm": 0.1909414827823639, + "learning_rate": 0.0014099224480350252, + "loss": 1.1587, + "step": 13830 + }, + { + "epoch": 0.7085807904976449, + "grad_norm": 0.1957162618637085, + "learning_rate": 0.0014043928963419256, + "loss": 1.1783, + "step": 13840 + }, + { + "epoch": 0.7090927708375998, + "grad_norm": 0.1931842565536499, + "learning_rate": 0.0013988850309067965, + "loss": 1.1749, + "step": 13850 + }, + { + "epoch": 0.7096047511775547, + "grad_norm": 0.2018897980451584, + "learning_rate": 0.0013933987666786593, + "loss": 1.1457, + "step": 13860 + }, + { + "epoch": 0.7101167315175098, + "grad_norm": 0.1824326366186142, + "learning_rate": 0.0013879340189400947, + "loss": 1.1861, + "step": 13870 + }, + { + "epoch": 0.7106287118574647, + "grad_norm": 0.19200804829597473, + "learning_rate": 0.0013824907033059355, + "loss": 1.1669, + "step": 13880 + }, + { + "epoch": 0.7111406921974196, + "grad_norm": 0.18873439729213715, + "learning_rate": 0.001377068735721964, + "loss": 1.1555, + "step": 13890 + }, + { + "epoch": 0.7116526725373745, + "grad_norm": 0.19836601614952087, + "learning_rate": 0.0013716680324636122, + "loss": 1.1536, + "step": 13900 + }, + { + "epoch": 0.7121646528773296, + "grad_norm": 0.2006756067276001, + "learning_rate": 0.001366288510134671, + "loss": 1.1595, + "step": 13910 + }, + { + "epoch": 0.7126766332172845, + "grad_norm": 0.18679478764533997, + "learning_rate": 0.0013609300856660014, + "loss": 1.1762, + "step": 13920 + }, + { + "epoch": 0.7131886135572394, + "grad_norm": 0.19826917350292206, + "learning_rate": 0.001355592676314251, + "loss": 1.1752, + "step": 13930 + }, + { + "epoch": 0.7137005938971943, + "grad_norm": 0.18885891139507294, + "learning_rate": 0.0013502761996605787, + "loss": 1.1731, + "step": 13940 + }, + { + "epoch": 0.7142125742371493, + "grad_norm": 0.1888403594493866, + "learning_rate": 0.0013449805736093791, + "loss": 1.1536, + "step": 13950 + }, + { + "epoch": 0.7147245545771043, + "grad_norm": 0.20078985393047333, + "learning_rate": 0.0013397057163870173, + "loss": 1.1545, + "step": 13960 + }, + { + "epoch": 0.7152365349170592, + "grad_norm": 0.19156110286712646, + "learning_rate": 0.001334451546540564, + "loss": 1.148, + "step": 13970 + }, + { + "epoch": 0.7157485152570141, + "grad_norm": 0.19765546917915344, + "learning_rate": 0.0013292179829365398, + "loss": 1.1776, + "step": 13980 + }, + { + "epoch": 0.716260495596969, + "grad_norm": 0.1948610097169876, + "learning_rate": 0.001324004944759661, + "loss": 1.1597, + "step": 13990 + }, + { + "epoch": 0.7167724759369241, + "grad_norm": 0.1816781461238861, + "learning_rate": 0.0013188123515115915, + "loss": 1.1484, + "step": 14000 + }, + { + "epoch": 0.717284456276879, + "grad_norm": 0.2072591632604599, + "learning_rate": 0.0013136401230097012, + "loss": 1.1678, + "step": 14010 + }, + { + "epoch": 0.7177964366168339, + "grad_norm": 0.19381676614284515, + "learning_rate": 0.0013084881793858267, + "loss": 1.1714, + "step": 14020 + }, + { + "epoch": 0.7183084169567888, + "grad_norm": 0.178278848528862, + "learning_rate": 0.0013033564410850373, + "loss": 1.162, + "step": 14030 + }, + { + "epoch": 0.7188203972967439, + "grad_norm": 0.18733732402324677, + "learning_rate": 0.001298244828864409, + "loss": 1.1565, + "step": 14040 + }, + { + "epoch": 0.7193323776366988, + "grad_norm": 0.18614625930786133, + "learning_rate": 0.0012931532637917983, + "loss": 1.1678, + "step": 14050 + }, + { + "epoch": 0.7198443579766537, + "grad_norm": 0.17618735134601593, + "learning_rate": 0.0012880816672446245, + "loss": 1.1723, + "step": 14060 + }, + { + "epoch": 0.7203563383166086, + "grad_norm": 0.17765553295612335, + "learning_rate": 0.0012830299609086558, + "loss": 1.1511, + "step": 14070 + }, + { + "epoch": 0.7208683186565635, + "grad_norm": 0.19092194736003876, + "learning_rate": 0.0012779980667767994, + "loss": 1.1679, + "step": 14080 + }, + { + "epoch": 0.7213802989965186, + "grad_norm": 0.18768686056137085, + "learning_rate": 0.0012729859071478975, + "loss": 1.1668, + "step": 14090 + }, + { + "epoch": 0.7218922793364735, + "grad_norm": 0.18770349025726318, + "learning_rate": 0.0012679934046255271, + "loss": 1.1749, + "step": 14100 + }, + { + "epoch": 0.7224042596764284, + "grad_norm": 0.1935562640428543, + "learning_rate": 0.0012630204821168047, + "loss": 1.1535, + "step": 14110 + }, + { + "epoch": 0.7229162400163833, + "grad_norm": 0.17887477576732635, + "learning_rate": 0.0012580670628311967, + "loss": 1.1541, + "step": 14120 + }, + { + "epoch": 0.7234282203563384, + "grad_norm": 0.18734948337078094, + "learning_rate": 0.0012531330702793323, + "loss": 1.1669, + "step": 14130 + }, + { + "epoch": 0.7239402006962933, + "grad_norm": 0.17879174649715424, + "learning_rate": 0.0012482184282718238, + "loss": 1.1905, + "step": 14140 + }, + { + "epoch": 0.7244521810362482, + "grad_norm": 0.1950501948595047, + "learning_rate": 0.0012433230609180889, + "loss": 1.1446, + "step": 14150 + }, + { + "epoch": 0.7249641613762031, + "grad_norm": 0.1801559329032898, + "learning_rate": 0.0012384468926251798, + "loss": 1.1367, + "step": 14160 + }, + { + "epoch": 0.7254761417161582, + "grad_norm": 0.17999699711799622, + "learning_rate": 0.0012335898480966146, + "loss": 1.1402, + "step": 14170 + }, + { + "epoch": 0.7259881220561131, + "grad_norm": 0.18279437720775604, + "learning_rate": 0.0012287518523312166, + "loss": 1.1597, + "step": 14180 + }, + { + "epoch": 0.726500102396068, + "grad_norm": 0.19126516580581665, + "learning_rate": 0.001223932830621954, + "loss": 1.1604, + "step": 14190 + }, + { + "epoch": 0.7270120827360229, + "grad_norm": 0.18581058084964752, + "learning_rate": 0.0012191327085547877, + "loss": 1.1532, + "step": 14200 + }, + { + "epoch": 0.7275240630759778, + "grad_norm": 0.20243413746356964, + "learning_rate": 0.0012143514120075223, + "loss": 1.1495, + "step": 14210 + }, + { + "epoch": 0.7280360434159329, + "grad_norm": 0.19404320418834686, + "learning_rate": 0.0012095888671486597, + "loss": 1.1567, + "step": 14220 + }, + { + "epoch": 0.7285480237558878, + "grad_norm": 0.18503792583942413, + "learning_rate": 0.0012048450004362614, + "loss": 1.128, + "step": 14230 + }, + { + "epoch": 0.7290600040958427, + "grad_norm": 0.19073212146759033, + "learning_rate": 0.0012001197386168117, + "loss": 1.1458, + "step": 14240 + }, + { + "epoch": 0.7295719844357976, + "grad_norm": 0.2037813812494278, + "learning_rate": 0.0011954130087240865, + "loss": 1.1741, + "step": 14250 + }, + { + "epoch": 0.7300839647757527, + "grad_norm": 0.18591246008872986, + "learning_rate": 0.0011907247380780264, + "loss": 1.1458, + "step": 14260 + }, + { + "epoch": 0.7305959451157076, + "grad_norm": 0.18210938572883606, + "learning_rate": 0.0011860548542836156, + "loss": 1.1695, + "step": 14270 + }, + { + "epoch": 0.7311079254556625, + "grad_norm": 0.18794593214988708, + "learning_rate": 0.0011814032852297623, + "loss": 1.1458, + "step": 14280 + }, + { + "epoch": 0.7316199057956174, + "grad_norm": 0.1834757775068283, + "learning_rate": 0.001176769959088186, + "loss": 1.1485, + "step": 14290 + }, + { + "epoch": 0.7321318861355723, + "grad_norm": 0.1770770400762558, + "learning_rate": 0.0011721548043123092, + "loss": 1.1473, + "step": 14300 + }, + { + "epoch": 0.7326438664755274, + "grad_norm": 0.19540582597255707, + "learning_rate": 0.0011675577496361507, + "loss": 1.14, + "step": 14310 + }, + { + "epoch": 0.7331558468154823, + "grad_norm": 0.18834899365901947, + "learning_rate": 0.0011629787240732272, + "loss": 1.1326, + "step": 14320 + }, + { + "epoch": 0.7336678271554372, + "grad_norm": 0.18618904054164886, + "learning_rate": 0.0011584176569154553, + "loss": 1.1388, + "step": 14330 + }, + { + "epoch": 0.7341798074953921, + "grad_norm": 0.1807902604341507, + "learning_rate": 0.0011538744777320608, + "loss": 1.1448, + "step": 14340 + }, + { + "epoch": 0.7346917878353472, + "grad_norm": 0.18239812552928925, + "learning_rate": 0.0011493491163684908, + "loss": 1.1355, + "step": 14350 + }, + { + "epoch": 0.7352037681753021, + "grad_norm": 0.18156401813030243, + "learning_rate": 0.0011448415029453305, + "loss": 1.1309, + "step": 14360 + }, + { + "epoch": 0.735715748515257, + "grad_norm": 0.1813691258430481, + "learning_rate": 0.0011403515678572234, + "loss": 1.134, + "step": 14370 + }, + { + "epoch": 0.7362277288552119, + "grad_norm": 0.18241450190544128, + "learning_rate": 0.0011358792417717981, + "loss": 1.1378, + "step": 14380 + }, + { + "epoch": 0.736739709195167, + "grad_norm": 0.18394464254379272, + "learning_rate": 0.001131424455628596, + "loss": 1.1497, + "step": 14390 + }, + { + "epoch": 0.7372516895351219, + "grad_norm": 0.18612609803676605, + "learning_rate": 0.0011269871406380059, + "loss": 1.1669, + "step": 14400 + }, + { + "epoch": 0.7377636698750768, + "grad_norm": 0.18373136222362518, + "learning_rate": 0.001122567228280201, + "loss": 1.1453, + "step": 14410 + }, + { + "epoch": 0.7382756502150317, + "grad_norm": 0.193937748670578, + "learning_rate": 0.001118164650304082, + "loss": 1.1357, + "step": 14420 + }, + { + "epoch": 0.7387876305549866, + "grad_norm": 0.18261444568634033, + "learning_rate": 0.0011137793387262216, + "loss": 1.169, + "step": 14430 + }, + { + "epoch": 0.7392996108949417, + "grad_norm": 0.19592134654521942, + "learning_rate": 0.0011094112258298167, + "loss": 1.1518, + "step": 14440 + }, + { + "epoch": 0.7398115912348966, + "grad_norm": 0.17495043575763702, + "learning_rate": 0.0011050602441636402, + "loss": 1.1481, + "step": 14450 + }, + { + "epoch": 0.7403235715748515, + "grad_norm": 0.18108507990837097, + "learning_rate": 0.001100726326541002, + "loss": 1.1327, + "step": 14460 + }, + { + "epoch": 0.7408355519148064, + "grad_norm": 0.1797986775636673, + "learning_rate": 0.00109640940603871, + "loss": 1.1394, + "step": 14470 + }, + { + "epoch": 0.7413475322547615, + "grad_norm": 0.18484458327293396, + "learning_rate": 0.001092109415996037, + "loss": 1.1188, + "step": 14480 + }, + { + "epoch": 0.7418595125947164, + "grad_norm": 0.1784062534570694, + "learning_rate": 0.0010878262900136915, + "loss": 1.125, + "step": 14490 + }, + { + "epoch": 0.7423714929346713, + "grad_norm": 0.1869814693927765, + "learning_rate": 0.0010835599619527924, + "loss": 1.1417, + "step": 14500 + }, + { + "epoch": 0.7428834732746262, + "grad_norm": 0.18346761167049408, + "learning_rate": 0.0010793103659338475, + "loss": 1.1182, + "step": 14510 + }, + { + "epoch": 0.7433954536145811, + "grad_norm": 0.188985213637352, + "learning_rate": 0.0010750774363357356, + "loss": 1.1412, + "step": 14520 + }, + { + "epoch": 0.7439074339545362, + "grad_norm": 0.1802164912223816, + "learning_rate": 0.0010708611077946955, + "loss": 1.1338, + "step": 14530 + }, + { + "epoch": 0.7444194142944911, + "grad_norm": 0.17940784990787506, + "learning_rate": 0.0010666613152033133, + "loss": 1.1477, + "step": 14540 + }, + { + "epoch": 0.744931394634446, + "grad_norm": 0.19481126964092255, + "learning_rate": 0.00106247799370952, + "loss": 1.1306, + "step": 14550 + }, + { + "epoch": 0.7454433749744009, + "grad_norm": 0.17663590610027313, + "learning_rate": 0.0010583110787155889, + "loss": 1.1395, + "step": 14560 + }, + { + "epoch": 0.745955355314356, + "grad_norm": 0.18392081558704376, + "learning_rate": 0.001054160505877137, + "loss": 1.1339, + "step": 14570 + }, + { + "epoch": 0.7464673356543109, + "grad_norm": 0.1872582733631134, + "learning_rate": 0.0010500262111021333, + "loss": 1.1271, + "step": 14580 + }, + { + "epoch": 0.7469793159942658, + "grad_norm": 0.18514196574687958, + "learning_rate": 0.0010459081305499078, + "loss": 1.1561, + "step": 14590 + }, + { + "epoch": 0.7474912963342207, + "grad_norm": 0.18902930617332458, + "learning_rate": 0.0010418062006301674, + "loss": 1.1402, + "step": 14600 + }, + { + "epoch": 0.7480032766741758, + "grad_norm": 0.1824546903371811, + "learning_rate": 0.0010377203580020109, + "loss": 1.1439, + "step": 14610 + }, + { + "epoch": 0.7485152570141307, + "grad_norm": 0.1803770363330841, + "learning_rate": 0.001033650539572954, + "loss": 1.1313, + "step": 14620 + }, + { + "epoch": 0.7490272373540856, + "grad_norm": 0.19267936050891876, + "learning_rate": 0.0010295966824979534, + "loss": 1.1082, + "step": 14630 + }, + { + "epoch": 0.7495392176940405, + "grad_norm": 0.19047097861766815, + "learning_rate": 0.0010255587241784366, + "loss": 1.122, + "step": 14640 + }, + { + "epoch": 0.7500511980339954, + "grad_norm": 0.1689426302909851, + "learning_rate": 0.0010215366022613358, + "loss": 1.1172, + "step": 14650 + }, + { + "epoch": 0.7505631783739505, + "grad_norm": 0.18644796311855316, + "learning_rate": 0.0010175302546381246, + "loss": 1.146, + "step": 14660 + }, + { + "epoch": 0.7510751587139054, + "grad_norm": 0.18672852218151093, + "learning_rate": 0.0010135396194438586, + "loss": 1.1386, + "step": 14670 + }, + { + "epoch": 0.7515871390538603, + "grad_norm": 0.19166767597198486, + "learning_rate": 0.0010095646350562206, + "loss": 1.1365, + "step": 14680 + }, + { + "epoch": 0.7520991193938152, + "grad_norm": 0.18109376728534698, + "learning_rate": 0.0010056052400945696, + "loss": 1.113, + "step": 14690 + }, + { + "epoch": 0.7526110997337703, + "grad_norm": 0.17950654029846191, + "learning_rate": 0.0010016613734189915, + "loss": 1.1474, + "step": 14700 + }, + { + "epoch": 0.7531230800737252, + "grad_norm": 0.184305801987648, + "learning_rate": 0.0009977329741293565, + "loss": 1.1199, + "step": 14710 + }, + { + "epoch": 0.7536350604136801, + "grad_norm": 0.18768514692783356, + "learning_rate": 0.0009938199815643773, + "loss": 1.1451, + "step": 14720 + }, + { + "epoch": 0.754147040753635, + "grad_norm": 0.17981773614883423, + "learning_rate": 0.0009899223353006738, + "loss": 1.1423, + "step": 14730 + }, + { + "epoch": 0.75465902109359, + "grad_norm": 0.17722870409488678, + "learning_rate": 0.0009860399751518388, + "loss": 1.1208, + "step": 14740 + }, + { + "epoch": 0.755171001433545, + "grad_norm": 0.18367789685726166, + "learning_rate": 0.0009821728411675095, + "loss": 1.148, + "step": 14750 + }, + { + "epoch": 0.7556829817734999, + "grad_norm": 0.18441089987754822, + "learning_rate": 0.0009783208736324418, + "loss": 1.1112, + "step": 14760 + }, + { + "epoch": 0.7561949621134548, + "grad_norm": 0.1897488385438919, + "learning_rate": 0.000974484013065587, + "loss": 1.1231, + "step": 14770 + }, + { + "epoch": 0.7567069424534097, + "grad_norm": 0.18716907501220703, + "learning_rate": 0.0009706622002191746, + "loss": 1.1018, + "step": 14780 + }, + { + "epoch": 0.7572189227933648, + "grad_norm": 0.18121209740638733, + "learning_rate": 0.0009668553760777972, + "loss": 1.1225, + "step": 14790 + }, + { + "epoch": 0.7577309031333197, + "grad_norm": 0.19911837577819824, + "learning_rate": 0.0009630634818574985, + "loss": 1.1266, + "step": 14800 + }, + { + "epoch": 0.7582428834732746, + "grad_norm": 0.169275164604187, + "learning_rate": 0.0009592864590048661, + "loss": 1.1152, + "step": 14810 + }, + { + "epoch": 0.7587548638132295, + "grad_norm": 0.1855994015932083, + "learning_rate": 0.0009555242491961278, + "loss": 1.1318, + "step": 14820 + }, + { + "epoch": 0.7592668441531846, + "grad_norm": 0.17527516186237335, + "learning_rate": 0.0009517767943362495, + "loss": 1.0988, + "step": 14830 + }, + { + "epoch": 0.7597788244931395, + "grad_norm": 0.18066614866256714, + "learning_rate": 0.0009480440365580401, + "loss": 1.1097, + "step": 14840 + }, + { + "epoch": 0.7602908048330944, + "grad_norm": 0.17801222205162048, + "learning_rate": 0.000944325918221256, + "loss": 1.1196, + "step": 14850 + }, + { + "epoch": 0.7608027851730493, + "grad_norm": 0.19464291632175446, + "learning_rate": 0.0009406223819117125, + "loss": 1.1319, + "step": 14860 + }, + { + "epoch": 0.7613147655130043, + "grad_norm": 0.1878882348537445, + "learning_rate": 0.0009369333704403964, + "loss": 1.13, + "step": 14870 + }, + { + "epoch": 0.7618267458529593, + "grad_norm": 0.17626269161701202, + "learning_rate": 0.0009332588268425832, + "loss": 1.1181, + "step": 14880 + }, + { + "epoch": 0.7623387261929142, + "grad_norm": 0.1895529329776764, + "learning_rate": 0.0009295986943769574, + "loss": 1.1333, + "step": 14890 + }, + { + "epoch": 0.7628507065328691, + "grad_norm": 0.1784052848815918, + "learning_rate": 0.0009259529165247364, + "loss": 1.1242, + "step": 14900 + }, + { + "epoch": 0.763362686872824, + "grad_norm": 0.17965124547481537, + "learning_rate": 0.0009223214369887976, + "loss": 1.1258, + "step": 14910 + }, + { + "epoch": 0.7638746672127791, + "grad_norm": 0.17978616058826447, + "learning_rate": 0.0009187041996928093, + "loss": 1.1125, + "step": 14920 + }, + { + "epoch": 0.764386647552734, + "grad_norm": 0.18885265290737152, + "learning_rate": 0.0009151011487803643, + "loss": 1.1061, + "step": 14930 + }, + { + "epoch": 0.7648986278926889, + "grad_norm": 0.18489712476730347, + "learning_rate": 0.0009115122286141184, + "loss": 1.127, + "step": 14940 + }, + { + "epoch": 0.7654106082326438, + "grad_norm": 0.17437365651130676, + "learning_rate": 0.0009079373837749296, + "loss": 1.1148, + "step": 14950 + }, + { + "epoch": 0.7659225885725988, + "grad_norm": 0.18147113919258118, + "learning_rate": 0.0009043765590610044, + "loss": 1.1014, + "step": 14960 + }, + { + "epoch": 0.7664345689125538, + "grad_norm": 0.17263419926166534, + "learning_rate": 0.0009008296994870436, + "loss": 1.1118, + "step": 14970 + }, + { + "epoch": 0.7669465492525087, + "grad_norm": 0.17921820282936096, + "learning_rate": 0.000897296750283394, + "loss": 1.1245, + "step": 14980 + }, + { + "epoch": 0.7674585295924636, + "grad_norm": 0.17663663625717163, + "learning_rate": 0.0008937776568952028, + "loss": 1.1078, + "step": 14990 + }, + { + "epoch": 0.7679705099324186, + "grad_norm": 0.17961500585079193, + "learning_rate": 0.0008902723649815751, + "loss": 1.0977, + "step": 15000 + }, + { + "epoch": 0.7684824902723736, + "grad_norm": 0.18368123471736908, + "learning_rate": 0.0008867808204147341, + "loss": 1.103, + "step": 15010 + }, + { + "epoch": 0.7689944706123285, + "grad_norm": 0.18269400298595428, + "learning_rate": 0.0008833029692791867, + "loss": 1.108, + "step": 15020 + }, + { + "epoch": 0.7695064509522834, + "grad_norm": 0.1727774292230606, + "learning_rate": 0.0008798387578708893, + "loss": 1.1033, + "step": 15030 + }, + { + "epoch": 0.7700184312922383, + "grad_norm": 0.18222136795520782, + "learning_rate": 0.0008763881326964195, + "loss": 1.1089, + "step": 15040 + }, + { + "epoch": 0.7705304116321933, + "grad_norm": 0.1899970918893814, + "learning_rate": 0.0008729510404721502, + "loss": 1.1039, + "step": 15050 + }, + { + "epoch": 0.7710423919721483, + "grad_norm": 0.18128469586372375, + "learning_rate": 0.0008695274281234262, + "loss": 1.1078, + "step": 15060 + }, + { + "epoch": 0.7715543723121032, + "grad_norm": 0.18401475250720978, + "learning_rate": 0.0008661172427837451, + "loss": 1.1023, + "step": 15070 + }, + { + "epoch": 0.7720663526520581, + "grad_norm": 0.18456844985485077, + "learning_rate": 0.0008627204317939403, + "loss": 1.1187, + "step": 15080 + }, + { + "epoch": 0.7725783329920131, + "grad_norm": 0.18838796019554138, + "learning_rate": 0.0008593369427013692, + "loss": 1.0908, + "step": 15090 + }, + { + "epoch": 0.7730903133319681, + "grad_norm": 0.18515382707118988, + "learning_rate": 0.0008559667232591014, + "loss": 1.1099, + "step": 15100 + }, + { + "epoch": 0.773602293671923, + "grad_norm": 0.18746817111968994, + "learning_rate": 0.0008526097214251135, + "loss": 1.1073, + "step": 15110 + }, + { + "epoch": 0.7741142740118779, + "grad_norm": 0.18683654069900513, + "learning_rate": 0.0008492658853614846, + "loss": 1.1195, + "step": 15120 + }, + { + "epoch": 0.7746262543518329, + "grad_norm": 0.17560458183288574, + "learning_rate": 0.0008459351634335962, + "loss": 1.0919, + "step": 15130 + }, + { + "epoch": 0.7751382346917879, + "grad_norm": 0.17539164423942566, + "learning_rate": 0.0008426175042093346, + "loss": 1.1082, + "step": 15140 + }, + { + "epoch": 0.7756502150317428, + "grad_norm": 0.17442087829113007, + "learning_rate": 0.0008393128564582973, + "loss": 1.1077, + "step": 15150 + }, + { + "epoch": 0.7761621953716977, + "grad_norm": 0.17610372602939606, + "learning_rate": 0.0008360211691510009, + "loss": 1.0976, + "step": 15160 + }, + { + "epoch": 0.7766741757116526, + "grad_norm": 0.18700052797794342, + "learning_rate": 0.0008327423914580938, + "loss": 1.1116, + "step": 15170 + }, + { + "epoch": 0.7771861560516076, + "grad_norm": 0.18908992409706116, + "learning_rate": 0.0008294764727495717, + "loss": 1.1266, + "step": 15180 + }, + { + "epoch": 0.7776981363915626, + "grad_norm": 0.17554494738578796, + "learning_rate": 0.0008262233625939947, + "loss": 1.1228, + "step": 15190 + }, + { + "epoch": 0.7782101167315175, + "grad_norm": 0.1848273128271103, + "learning_rate": 0.0008229830107577095, + "loss": 1.1032, + "step": 15200 + }, + { + "epoch": 0.7787220970714724, + "grad_norm": 0.1751490831375122, + "learning_rate": 0.0008197553672040732, + "loss": 1.1022, + "step": 15210 + }, + { + "epoch": 0.7792340774114274, + "grad_norm": 0.19107986986637115, + "learning_rate": 0.0008165403820926805, + "loss": 1.1107, + "step": 15220 + }, + { + "epoch": 0.7797460577513824, + "grad_norm": 0.17038871347904205, + "learning_rate": 0.000813338005778595, + "loss": 1.0906, + "step": 15230 + }, + { + "epoch": 0.7802580380913373, + "grad_norm": 0.17573246359825134, + "learning_rate": 0.0008101481888115815, + "loss": 1.1185, + "step": 15240 + }, + { + "epoch": 0.7807700184312922, + "grad_norm": 0.18138054013252258, + "learning_rate": 0.000806970881935343, + "loss": 1.1068, + "step": 15250 + }, + { + "epoch": 0.7812819987712472, + "grad_norm": 0.18504558503627777, + "learning_rate": 0.00080380603608676, + "loss": 1.1187, + "step": 15260 + }, + { + "epoch": 0.7817939791112021, + "grad_norm": 0.1914263665676117, + "learning_rate": 0.0008006536023951326, + "loss": 1.1028, + "step": 15270 + }, + { + "epoch": 0.7823059594511571, + "grad_norm": 0.17930828034877777, + "learning_rate": 0.0007975135321814267, + "loss": 1.12, + "step": 15280 + }, + { + "epoch": 0.782817939791112, + "grad_norm": 0.18710237741470337, + "learning_rate": 0.0007943857769575209, + "loss": 1.0943, + "step": 15290 + }, + { + "epoch": 0.783329920131067, + "grad_norm": 0.18522420525550842, + "learning_rate": 0.0007912702884254589, + "loss": 1.1125, + "step": 15300 + }, + { + "epoch": 0.7838419004710219, + "grad_norm": 0.17634257674217224, + "learning_rate": 0.0007881670184767039, + "loss": 1.0855, + "step": 15310 + }, + { + "epoch": 0.7843538808109769, + "grad_norm": 0.1925361305475235, + "learning_rate": 0.0007850759191913941, + "loss": 1.0957, + "step": 15320 + }, + { + "epoch": 0.7848658611509318, + "grad_norm": 0.18163706362247467, + "learning_rate": 0.0007819969428376047, + "loss": 1.0994, + "step": 15330 + }, + { + "epoch": 0.7853778414908867, + "grad_norm": 0.1802321821451187, + "learning_rate": 0.0007789300418706098, + "loss": 1.1043, + "step": 15340 + }, + { + "epoch": 0.7858898218308417, + "grad_norm": 0.20434251427650452, + "learning_rate": 0.0007758751689321484, + "loss": 1.0943, + "step": 15350 + }, + { + "epoch": 0.7864018021707967, + "grad_norm": 0.1818198412656784, + "learning_rate": 0.0007728322768496924, + "loss": 1.0916, + "step": 15360 + }, + { + "epoch": 0.7869137825107516, + "grad_norm": 0.18060991168022156, + "learning_rate": 0.0007698013186357197, + "loss": 1.1122, + "step": 15370 + }, + { + "epoch": 0.7874257628507065, + "grad_norm": 0.18546059727668762, + "learning_rate": 0.0007667822474869874, + "loss": 1.1075, + "step": 15380 + }, + { + "epoch": 0.7879377431906615, + "grad_norm": 0.18823228776454926, + "learning_rate": 0.0007637750167838097, + "loss": 1.1197, + "step": 15390 + }, + { + "epoch": 0.7884497235306164, + "grad_norm": 0.17590127885341644, + "learning_rate": 0.0007607795800893374, + "loss": 1.0865, + "step": 15400 + }, + { + "epoch": 0.7889617038705714, + "grad_norm": 0.18602034449577332, + "learning_rate": 0.000757795891148842, + "loss": 1.1, + "step": 15410 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.19357922673225403, + "learning_rate": 0.0007548239038889995, + "loss": 1.1015, + "step": 15420 + }, + { + "epoch": 0.7899856645504812, + "grad_norm": 0.17590965330600739, + "learning_rate": 0.000751863572417181, + "loss": 1.1113, + "step": 15430 + }, + { + "epoch": 0.7904976448904362, + "grad_norm": 0.1751716434955597, + "learning_rate": 0.0007489148510207429, + "loss": 1.0898, + "step": 15440 + }, + { + "epoch": 0.7910096252303912, + "grad_norm": 0.17589299380779266, + "learning_rate": 0.000745977694166321, + "loss": 1.0931, + "step": 15450 + }, + { + "epoch": 0.7915216055703461, + "grad_norm": 0.17544785141944885, + "learning_rate": 0.0007430520564991282, + "loss": 1.0914, + "step": 15460 + }, + { + "epoch": 0.792033585910301, + "grad_norm": 0.18367989361286163, + "learning_rate": 0.0007401378928422531, + "loss": 1.1043, + "step": 15470 + }, + { + "epoch": 0.792545566250256, + "grad_norm": 0.17736022174358368, + "learning_rate": 0.0007372351581959634, + "loss": 1.1252, + "step": 15480 + }, + { + "epoch": 0.7930575465902109, + "grad_norm": 0.18722687661647797, + "learning_rate": 0.0007343438077370098, + "loss": 1.095, + "step": 15490 + }, + { + "epoch": 0.7935695269301659, + "grad_norm": 0.1756405234336853, + "learning_rate": 0.0007314637968179351, + "loss": 1.1017, + "step": 15500 + }, + { + "epoch": 0.7940815072701208, + "grad_norm": 0.17875617742538452, + "learning_rate": 0.0007285950809663841, + "loss": 1.0979, + "step": 15510 + }, + { + "epoch": 0.7945934876100758, + "grad_norm": 0.17093615233898163, + "learning_rate": 0.0007257376158844169, + "loss": 1.0886, + "step": 15520 + }, + { + "epoch": 0.7951054679500307, + "grad_norm": 0.18361063301563263, + "learning_rate": 0.0007228913574478252, + "loss": 1.1089, + "step": 15530 + }, + { + "epoch": 0.7956174482899857, + "grad_norm": 0.1857183277606964, + "learning_rate": 0.0007200562617054503, + "loss": 1.0806, + "step": 15540 + }, + { + "epoch": 0.7961294286299406, + "grad_norm": 0.1974077820777893, + "learning_rate": 0.0007172322848785056, + "loss": 1.088, + "step": 15550 + }, + { + "epoch": 0.7966414089698955, + "grad_norm": 0.173116534948349, + "learning_rate": 0.0007144193833598987, + "loss": 1.0921, + "step": 15560 + }, + { + "epoch": 0.7971533893098505, + "grad_norm": 0.17753879725933075, + "learning_rate": 0.0007116175137135599, + "loss": 1.0846, + "step": 15570 + }, + { + "epoch": 0.7976653696498055, + "grad_norm": 0.1796150505542755, + "learning_rate": 0.0007088266326737707, + "loss": 1.0816, + "step": 15580 + }, + { + "epoch": 0.7981773499897604, + "grad_norm": 0.17271041870117188, + "learning_rate": 0.0007060466971444953, + "loss": 1.0875, + "step": 15590 + }, + { + "epoch": 0.7986893303297153, + "grad_norm": 0.1766566038131714, + "learning_rate": 0.0007032776641987162, + "loss": 1.085, + "step": 15600 + }, + { + "epoch": 0.7992013106696703, + "grad_norm": 0.17464908957481384, + "learning_rate": 0.0007005194910777697, + "loss": 1.0669, + "step": 15610 + }, + { + "epoch": 0.7997132910096252, + "grad_norm": 0.18235880136489868, + "learning_rate": 0.0006977721351906876, + "loss": 1.0983, + "step": 15620 + }, + { + "epoch": 0.8002252713495802, + "grad_norm": 0.17582911252975464, + "learning_rate": 0.0006950355541135377, + "loss": 1.0748, + "step": 15630 + }, + { + "epoch": 0.8007372516895351, + "grad_norm": 0.18529601395130157, + "learning_rate": 0.0006923097055887701, + "loss": 1.082, + "step": 15640 + }, + { + "epoch": 0.80124923202949, + "grad_norm": 0.18771891295909882, + "learning_rate": 0.000689594547524564, + "loss": 1.0792, + "step": 15650 + }, + { + "epoch": 0.801761212369445, + "grad_norm": 0.18567664921283722, + "learning_rate": 0.0006868900379941773, + "loss": 1.0929, + "step": 15660 + }, + { + "epoch": 0.8022731927094, + "grad_norm": 0.18062008917331696, + "learning_rate": 0.0006841961352353004, + "loss": 1.0952, + "step": 15670 + }, + { + "epoch": 0.8027851730493549, + "grad_norm": 0.17383413016796112, + "learning_rate": 0.0006815127976494104, + "loss": 1.1029, + "step": 15680 + }, + { + "epoch": 0.8032971533893098, + "grad_norm": 0.17971891164779663, + "learning_rate": 0.0006788399838011287, + "loss": 1.1032, + "step": 15690 + }, + { + "epoch": 0.8038091337292648, + "grad_norm": 0.17936407029628754, + "learning_rate": 0.0006761776524175815, + "loss": 1.1001, + "step": 15700 + }, + { + "epoch": 0.8043211140692197, + "grad_norm": 0.18222102522850037, + "learning_rate": 0.0006735257623877627, + "loss": 1.0872, + "step": 15710 + }, + { + "epoch": 0.8048330944091747, + "grad_norm": 0.18015074729919434, + "learning_rate": 0.0006708842727618985, + "loss": 1.0991, + "step": 15720 + }, + { + "epoch": 0.8053450747491296, + "grad_norm": 0.17375022172927856, + "learning_rate": 0.0006682531427508156, + "loss": 1.0623, + "step": 15730 + }, + { + "epoch": 0.8058570550890846, + "grad_norm": 0.1764671802520752, + "learning_rate": 0.0006656323317253108, + "loss": 1.0984, + "step": 15740 + }, + { + "epoch": 0.8063690354290395, + "grad_norm": 0.1692001074552536, + "learning_rate": 0.0006630217992155241, + "loss": 1.0859, + "step": 15750 + }, + { + "epoch": 0.8068810157689945, + "grad_norm": 0.17819392681121826, + "learning_rate": 0.0006604215049103134, + "loss": 1.0899, + "step": 15760 + }, + { + "epoch": 0.8073929961089494, + "grad_norm": 0.17758633196353912, + "learning_rate": 0.0006578314086566325, + "loss": 1.0826, + "step": 15770 + }, + { + "epoch": 0.8079049764489044, + "grad_norm": 0.17600396275520325, + "learning_rate": 0.0006552514704589104, + "loss": 1.0912, + "step": 15780 + }, + { + "epoch": 0.8084169567888593, + "grad_norm": 0.177523672580719, + "learning_rate": 0.0006526816504784343, + "loss": 1.0814, + "step": 15790 + }, + { + "epoch": 0.8089289371288143, + "grad_norm": 0.17935074865818024, + "learning_rate": 0.0006501219090327343, + "loss": 1.0859, + "step": 15800 + }, + { + "epoch": 0.8094409174687692, + "grad_norm": 0.18292473256587982, + "learning_rate": 0.0006475722065949703, + "loss": 1.0716, + "step": 15810 + }, + { + "epoch": 0.8099528978087241, + "grad_norm": 0.18235322833061218, + "learning_rate": 0.000645032503793322, + "loss": 1.085, + "step": 15820 + }, + { + "epoch": 0.8104648781486791, + "grad_norm": 0.18412081897258759, + "learning_rate": 0.0006425027614103806, + "loss": 1.0872, + "step": 15830 + }, + { + "epoch": 0.810976858488634, + "grad_norm": 0.17389538884162903, + "learning_rate": 0.0006399829403825436, + "loss": 1.0935, + "step": 15840 + }, + { + "epoch": 0.811488838828589, + "grad_norm": 0.17470002174377441, + "learning_rate": 0.0006374730017994116, + "loss": 1.0603, + "step": 15850 + }, + { + "epoch": 0.8120008191685439, + "grad_norm": 0.17814920842647552, + "learning_rate": 0.0006349729069031867, + "loss": 1.1096, + "step": 15860 + }, + { + "epoch": 0.8125127995084989, + "grad_norm": 0.18193413317203522, + "learning_rate": 0.000632482617088075, + "loss": 1.076, + "step": 15870 + }, + { + "epoch": 0.8130247798484538, + "grad_norm": 0.18022698163986206, + "learning_rate": 0.0006300020938996901, + "loss": 1.0868, + "step": 15880 + }, + { + "epoch": 0.8135367601884088, + "grad_norm": 0.16944915056228638, + "learning_rate": 0.0006275312990344587, + "loss": 1.0857, + "step": 15890 + }, + { + "epoch": 0.8140487405283637, + "grad_norm": 0.17860791087150574, + "learning_rate": 0.0006250701943390303, + "loss": 1.0885, + "step": 15900 + }, + { + "epoch": 0.8145607208683187, + "grad_norm": 0.169233039021492, + "learning_rate": 0.0006226187418096868, + "loss": 1.0701, + "step": 15910 + }, + { + "epoch": 0.8150727012082736, + "grad_norm": 0.18404126167297363, + "learning_rate": 0.0006201769035917569, + "loss": 1.0862, + "step": 15920 + }, + { + "epoch": 0.8155846815482285, + "grad_norm": 0.1732415407896042, + "learning_rate": 0.0006177446419790303, + "loss": 1.0552, + "step": 15930 + }, + { + "epoch": 0.8160966618881835, + "grad_norm": 0.17680327594280243, + "learning_rate": 0.0006153219194131765, + "loss": 1.0839, + "step": 15940 + }, + { + "epoch": 0.8166086422281384, + "grad_norm": 0.168556347489357, + "learning_rate": 0.000612908698483164, + "loss": 1.0628, + "step": 15950 + }, + { + "epoch": 0.8171206225680934, + "grad_norm": 0.1826118528842926, + "learning_rate": 0.0006105049419246835, + "loss": 1.0855, + "step": 15960 + }, + { + "epoch": 0.8176326029080483, + "grad_norm": 0.17182965576648712, + "learning_rate": 0.0006081106126195717, + "loss": 1.0669, + "step": 15970 + }, + { + "epoch": 0.8181445832480033, + "grad_norm": 0.16935127973556519, + "learning_rate": 0.0006057256735952383, + "loss": 1.083, + "step": 15980 + }, + { + "epoch": 0.8186565635879582, + "grad_norm": 0.17464590072631836, + "learning_rate": 0.0006033500880240954, + "loss": 1.0671, + "step": 15990 + }, + { + "epoch": 0.8191685439279132, + "grad_norm": 0.17747105658054352, + "learning_rate": 0.0006009838192229885, + "loss": 1.0678, + "step": 16000 + }, + { + "epoch": 0.8196805242678681, + "grad_norm": 0.17449192702770233, + "learning_rate": 0.0005986268306526304, + "loss": 1.0796, + "step": 16010 + }, + { + "epoch": 0.8201925046078231, + "grad_norm": 0.17097654938697815, + "learning_rate": 0.0005962790859170364, + "loss": 1.0778, + "step": 16020 + }, + { + "epoch": 0.820704484947778, + "grad_norm": 0.16904379427433014, + "learning_rate": 0.0005939405487629626, + "loss": 1.0843, + "step": 16030 + }, + { + "epoch": 0.821216465287733, + "grad_norm": 0.17497345805168152, + "learning_rate": 0.0005916111830793466, + "loss": 1.101, + "step": 16040 + }, + { + "epoch": 0.8217284456276879, + "grad_norm": 0.1789994090795517, + "learning_rate": 0.0005892909528967487, + "loss": 1.0845, + "step": 16050 + }, + { + "epoch": 0.8222404259676428, + "grad_norm": 0.1678200364112854, + "learning_rate": 0.0005869798223867978, + "loss": 1.0606, + "step": 16060 + }, + { + "epoch": 0.8227524063075978, + "grad_norm": 0.17383365333080292, + "learning_rate": 0.000584677755861637, + "loss": 1.0674, + "step": 16070 + }, + { + "epoch": 0.8232643866475527, + "grad_norm": 0.17335745692253113, + "learning_rate": 0.0005823847177733732, + "loss": 1.0965, + "step": 16080 + }, + { + "epoch": 0.8237763669875077, + "grad_norm": 0.16967058181762695, + "learning_rate": 0.0005801006727135282, + "loss": 1.0677, + "step": 16090 + }, + { + "epoch": 0.8242883473274626, + "grad_norm": 0.16847650706768036, + "learning_rate": 0.0005778255854124912, + "loss": 1.0791, + "step": 16100 + }, + { + "epoch": 0.8248003276674176, + "grad_norm": 0.17251423001289368, + "learning_rate": 0.0005755594207389755, + "loss": 1.0806, + "step": 16110 + }, + { + "epoch": 0.8253123080073725, + "grad_norm": 0.17555896937847137, + "learning_rate": 0.0005733021436994743, + "loss": 1.066, + "step": 16120 + }, + { + "epoch": 0.8258242883473275, + "grad_norm": 0.16997992992401123, + "learning_rate": 0.000571053719437722, + "loss": 1.0876, + "step": 16130 + }, + { + "epoch": 0.8263362686872824, + "grad_norm": 0.17845116555690765, + "learning_rate": 0.0005688141132341551, + "loss": 1.085, + "step": 16140 + }, + { + "epoch": 0.8268482490272373, + "grad_norm": 0.1836511194705963, + "learning_rate": 0.0005665832905053756, + "loss": 1.0769, + "step": 16150 + }, + { + "epoch": 0.8273602293671923, + "grad_norm": 0.1753719449043274, + "learning_rate": 0.0005643612168036182, + "loss": 1.0742, + "step": 16160 + }, + { + "epoch": 0.8278722097071473, + "grad_norm": 0.17152993381023407, + "learning_rate": 0.0005621478578162176, + "loss": 1.0761, + "step": 16170 + }, + { + "epoch": 0.8283841900471022, + "grad_norm": 0.18273817002773285, + "learning_rate": 0.0005599431793650786, + "loss": 1.0803, + "step": 16180 + }, + { + "epoch": 0.8288961703870571, + "grad_norm": 0.1865053027868271, + "learning_rate": 0.0005577471474061485, + "loss": 1.0695, + "step": 16190 + }, + { + "epoch": 0.8294081507270121, + "grad_norm": 0.16600672900676727, + "learning_rate": 0.0005555597280288918, + "loss": 1.0844, + "step": 16200 + }, + { + "epoch": 0.829920131066967, + "grad_norm": 0.1850479394197464, + "learning_rate": 0.0005533808874557656, + "loss": 1.0658, + "step": 16210 + }, + { + "epoch": 0.830432111406922, + "grad_norm": 0.17687514424324036, + "learning_rate": 0.000551210592041699, + "loss": 1.072, + "step": 16220 + }, + { + "epoch": 0.8309440917468769, + "grad_norm": 0.1833869218826294, + "learning_rate": 0.000549048808273573, + "loss": 1.0739, + "step": 16230 + }, + { + "epoch": 0.8314560720868319, + "grad_norm": 0.1750813126564026, + "learning_rate": 0.0005468955027697031, + "loss": 1.0851, + "step": 16240 + }, + { + "epoch": 0.8319680524267868, + "grad_norm": 0.18595030903816223, + "learning_rate": 0.0005447506422793241, + "loss": 1.0615, + "step": 16250 + }, + { + "epoch": 0.8324800327667418, + "grad_norm": 0.1711542159318924, + "learning_rate": 0.0005426141936820762, + "loss": 1.0689, + "step": 16260 + }, + { + "epoch": 0.8329920131066967, + "grad_norm": 0.18596914410591125, + "learning_rate": 0.000540486123987494, + "loss": 1.0574, + "step": 16270 + }, + { + "epoch": 0.8335039934466516, + "grad_norm": 0.17115946114063263, + "learning_rate": 0.0005383664003344964, + "loss": 1.0703, + "step": 16280 + }, + { + "epoch": 0.8340159737866066, + "grad_norm": 0.1802951842546463, + "learning_rate": 0.0005362549899908805, + "loss": 1.074, + "step": 16290 + }, + { + "epoch": 0.8345279541265616, + "grad_norm": 0.18504950404167175, + "learning_rate": 0.0005341518603528143, + "loss": 1.0747, + "step": 16300 + }, + { + "epoch": 0.8350399344665165, + "grad_norm": 0.17508040368556976, + "learning_rate": 0.000532056978944335, + "loss": 1.0784, + "step": 16310 + }, + { + "epoch": 0.8355519148064714, + "grad_norm": 0.1866855025291443, + "learning_rate": 0.0005299703134168463, + "loss": 1.0799, + "step": 16320 + }, + { + "epoch": 0.8360638951464264, + "grad_norm": 0.16678877174854279, + "learning_rate": 0.0005278918315486196, + "loss": 1.0531, + "step": 16330 + }, + { + "epoch": 0.8365758754863813, + "grad_norm": 0.1872544288635254, + "learning_rate": 0.000525821501244296, + "loss": 1.0768, + "step": 16340 + }, + { + "epoch": 0.8370878558263363, + "grad_norm": 0.17887745797634125, + "learning_rate": 0.0005237592905343908, + "loss": 1.0552, + "step": 16350 + }, + { + "epoch": 0.8375998361662912, + "grad_norm": 0.1764066219329834, + "learning_rate": 0.0005217051675748001, + "loss": 1.0511, + "step": 16360 + }, + { + "epoch": 0.8381118165062461, + "grad_norm": 0.17765092849731445, + "learning_rate": 0.0005196591006463087, + "loss": 1.0645, + "step": 16370 + }, + { + "epoch": 0.8386237968462011, + "grad_norm": 0.17197942733764648, + "learning_rate": 0.0005176210581541006, + "loss": 1.0561, + "step": 16380 + }, + { + "epoch": 0.8391357771861561, + "grad_norm": 0.1778382807970047, + "learning_rate": 0.0005155910086272709, + "loss": 1.0818, + "step": 16390 + }, + { + "epoch": 0.839647757526111, + "grad_norm": 0.1758384257555008, + "learning_rate": 0.00051356892071834, + "loss": 1.0755, + "step": 16400 + }, + { + "epoch": 0.8401597378660659, + "grad_norm": 0.17765450477600098, + "learning_rate": 0.0005115547632027694, + "loss": 1.0622, + "step": 16410 + }, + { + "epoch": 0.8406717182060209, + "grad_norm": 0.1722906529903412, + "learning_rate": 0.0005095485049784797, + "loss": 1.0562, + "step": 16420 + }, + { + "epoch": 0.8411836985459759, + "grad_norm": 0.18041284382343292, + "learning_rate": 0.0005075501150653699, + "loss": 1.0563, + "step": 16430 + }, + { + "epoch": 0.8416956788859308, + "grad_norm": 0.1721327304840088, + "learning_rate": 0.0005055595626048399, + "loss": 1.0872, + "step": 16440 + }, + { + "epoch": 0.8422076592258857, + "grad_norm": 0.17623233795166016, + "learning_rate": 0.000503576816859313, + "loss": 1.0768, + "step": 16450 + }, + { + "epoch": 0.8427196395658406, + "grad_norm": 0.1824178546667099, + "learning_rate": 0.000501601847211762, + "loss": 1.0773, + "step": 16460 + }, + { + "epoch": 0.8432316199057956, + "grad_norm": 0.17492622137069702, + "learning_rate": 0.0004996346231652357, + "loss": 1.0751, + "step": 16470 + }, + { + "epoch": 0.8437436002457506, + "grad_norm": 0.19331291317939758, + "learning_rate": 0.0004976751143423888, + "loss": 1.0522, + "step": 16480 + }, + { + "epoch": 0.8442555805857055, + "grad_norm": 0.17318172752857208, + "learning_rate": 0.0004957232904850122, + "loss": 1.0611, + "step": 16490 + }, + { + "epoch": 0.8447675609256604, + "grad_norm": 0.18951846659183502, + "learning_rate": 0.0004937791214535661, + "loss": 1.0584, + "step": 16500 + }, + { + "epoch": 0.8452795412656154, + "grad_norm": 0.17713989317417145, + "learning_rate": 0.0004918425772267145, + "loss": 1.0542, + "step": 16510 + }, + { + "epoch": 0.8457915216055704, + "grad_norm": 0.16759324073791504, + "learning_rate": 0.0004899136279008613, + "loss": 1.0689, + "step": 16520 + }, + { + "epoch": 0.8463035019455253, + "grad_norm": 0.18664461374282837, + "learning_rate": 0.000487992243689689, + "loss": 1.0732, + "step": 16530 + }, + { + "epoch": 0.8468154822854802, + "grad_norm": 0.17348751425743103, + "learning_rate": 0.00048607839492369886, + "loss": 1.0762, + "step": 16540 + }, + { + "epoch": 0.8473274626254352, + "grad_norm": 0.17233343422412872, + "learning_rate": 0.0004841720520497518, + "loss": 1.0579, + "step": 16550 + }, + { + "epoch": 0.8478394429653902, + "grad_norm": 0.18232837319374084, + "learning_rate": 0.0004822731856306133, + "loss": 1.0576, + "step": 16560 + }, + { + "epoch": 0.8483514233053451, + "grad_norm": 0.17330168187618256, + "learning_rate": 0.000480381766344498, + "loss": 1.044, + "step": 16570 + }, + { + "epoch": 0.8488634036453, + "grad_norm": 0.1745171695947647, + "learning_rate": 0.00047849776498461725, + "loss": 1.07, + "step": 16580 + }, + { + "epoch": 0.8493753839852549, + "grad_norm": 0.1749190390110016, + "learning_rate": 0.00047662115245872787, + "loss": 1.0666, + "step": 16590 + }, + { + "epoch": 0.84988736432521, + "grad_norm": 0.17629800736904144, + "learning_rate": 0.0004747518997886834, + "loss": 1.0694, + "step": 16600 + }, + { + "epoch": 0.8503993446651649, + "grad_norm": 0.17141848802566528, + "learning_rate": 0.00047288997810998585, + "loss": 1.0752, + "step": 16610 + }, + { + "epoch": 0.8509113250051198, + "grad_norm": 0.16317421197891235, + "learning_rate": 0.00047103535867134064, + "loss": 1.0575, + "step": 16620 + }, + { + "epoch": 0.8514233053450747, + "grad_norm": 0.1698952317237854, + "learning_rate": 0.0004691880128342126, + "loss": 1.054, + "step": 16630 + }, + { + "epoch": 0.8519352856850297, + "grad_norm": 0.17862023413181305, + "learning_rate": 0.00046734791207238334, + "loss": 1.0578, + "step": 16640 + }, + { + "epoch": 0.8524472660249847, + "grad_norm": 0.17291221022605896, + "learning_rate": 0.0004655150279715109, + "loss": 1.0614, + "step": 16650 + }, + { + "epoch": 0.8529592463649396, + "grad_norm": 0.18683776259422302, + "learning_rate": 0.0004636893322286915, + "loss": 1.0587, + "step": 16660 + }, + { + "epoch": 0.8534712267048945, + "grad_norm": 0.17157678306102753, + "learning_rate": 0.00046187079665202144, + "loss": 1.0876, + "step": 16670 + }, + { + "epoch": 0.8539832070448494, + "grad_norm": 0.16680538654327393, + "learning_rate": 0.0004600593931601628, + "loss": 1.0608, + "step": 16680 + }, + { + "epoch": 0.8544951873848045, + "grad_norm": 0.17904032766819, + "learning_rate": 0.00045825509378190934, + "loss": 1.0622, + "step": 16690 + }, + { + "epoch": 0.8550071677247594, + "grad_norm": 0.17377473413944244, + "learning_rate": 0.0004564578706557547, + "loss": 1.0761, + "step": 16700 + }, + { + "epoch": 0.8555191480647143, + "grad_norm": 0.17606638371944427, + "learning_rate": 0.0004546676960294617, + "loss": 1.0627, + "step": 16710 + }, + { + "epoch": 0.8560311284046692, + "grad_norm": 0.1655128300189972, + "learning_rate": 0.0004528845422596346, + "loss": 1.0579, + "step": 16720 + }, + { + "epoch": 0.8565431087446242, + "grad_norm": 0.185993954539299, + "learning_rate": 0.0004511083818112919, + "loss": 1.0604, + "step": 16730 + }, + { + "epoch": 0.8570550890845792, + "grad_norm": 0.18218767642974854, + "learning_rate": 0.00044933918725744066, + "loss": 1.0595, + "step": 16740 + }, + { + "epoch": 0.8575670694245341, + "grad_norm": 0.16947178542613983, + "learning_rate": 0.000447576931278654, + "loss": 1.0494, + "step": 16750 + }, + { + "epoch": 0.858079049764489, + "grad_norm": 0.17753495275974274, + "learning_rate": 0.00044582158666264793, + "loss": 1.0522, + "step": 16760 + }, + { + "epoch": 0.858591030104444, + "grad_norm": 0.1756090372800827, + "learning_rate": 0.0004440731263038627, + "loss": 1.074, + "step": 16770 + }, + { + "epoch": 0.859103010444399, + "grad_norm": 0.18287988007068634, + "learning_rate": 0.00044233152320304276, + "loss": 1.0883, + "step": 16780 + }, + { + "epoch": 0.8596149907843539, + "grad_norm": 0.18234935402870178, + "learning_rate": 0.0004405967504668205, + "loss": 1.0481, + "step": 16790 + }, + { + "epoch": 0.8601269711243088, + "grad_norm": 0.17408689856529236, + "learning_rate": 0.0004388687813073016, + "loss": 1.0672, + "step": 16800 + }, + { + "epoch": 0.8606389514642637, + "grad_norm": 0.1746188998222351, + "learning_rate": 0.00043714758904165, + "loss": 1.0581, + "step": 16810 + }, + { + "epoch": 0.8611509318042188, + "grad_norm": 0.17414236068725586, + "learning_rate": 0.0004354331470916772, + "loss": 1.0296, + "step": 16820 + }, + { + "epoch": 0.8616629121441737, + "grad_norm": 0.17176198959350586, + "learning_rate": 0.00043372542898343074, + "loss": 1.048, + "step": 16830 + }, + { + "epoch": 0.8621748924841286, + "grad_norm": 0.17366254329681396, + "learning_rate": 0.0004320244083467865, + "loss": 1.0584, + "step": 16840 + }, + { + "epoch": 0.8626868728240835, + "grad_norm": 0.17431634664535522, + "learning_rate": 0.0004303300589150403, + "loss": 1.0747, + "step": 16850 + }, + { + "epoch": 0.8631988531640385, + "grad_norm": 0.17983673512935638, + "learning_rate": 0.0004286423545245033, + "loss": 1.0477, + "step": 16860 + }, + { + "epoch": 0.8637108335039935, + "grad_norm": 0.17973174154758453, + "learning_rate": 0.00042696126911409766, + "loss": 1.0733, + "step": 16870 + }, + { + "epoch": 0.8642228138439484, + "grad_norm": 0.17209124565124512, + "learning_rate": 0.0004252867767249536, + "loss": 1.0553, + "step": 16880 + }, + { + "epoch": 0.8647347941839033, + "grad_norm": 0.17548377811908722, + "learning_rate": 0.0004236188515000098, + "loss": 1.0317, + "step": 16890 + }, + { + "epoch": 0.8652467745238582, + "grad_norm": 0.1856032758951187, + "learning_rate": 0.0004219574676836124, + "loss": 1.0645, + "step": 16900 + }, + { + "epoch": 0.8657587548638133, + "grad_norm": 0.171828031539917, + "learning_rate": 0.0004203025996211187, + "loss": 1.0468, + "step": 16910 + }, + { + "epoch": 0.8662707352037682, + "grad_norm": 0.1737641543149948, + "learning_rate": 0.00041865422175850074, + "loss": 1.0593, + "step": 16920 + }, + { + "epoch": 0.8667827155437231, + "grad_norm": 0.17497050762176514, + "learning_rate": 0.00041701230864194997, + "loss": 1.0558, + "step": 16930 + }, + { + "epoch": 0.867294695883678, + "grad_norm": 0.1742735356092453, + "learning_rate": 0.00041537683491748515, + "loss": 1.0524, + "step": 16940 + }, + { + "epoch": 0.8678066762236331, + "grad_norm": 0.16955190896987915, + "learning_rate": 0.00041374777533055996, + "loss": 1.0734, + "step": 16950 + }, + { + "epoch": 0.868318656563588, + "grad_norm": 0.17131267488002777, + "learning_rate": 0.00041212510472567404, + "loss": 1.047, + "step": 16960 + }, + { + "epoch": 0.8688306369035429, + "grad_norm": 0.18686212599277496, + "learning_rate": 0.00041050879804598354, + "loss": 1.0628, + "step": 16970 + }, + { + "epoch": 0.8693426172434978, + "grad_norm": 0.18018223345279694, + "learning_rate": 0.0004088988303329146, + "loss": 1.0727, + "step": 16980 + }, + { + "epoch": 0.8698545975834528, + "grad_norm": 0.17378225922584534, + "learning_rate": 0.00040729517672577834, + "loss": 1.0608, + "step": 16990 + }, + { + "epoch": 0.8703665779234078, + "grad_norm": 0.17299434542655945, + "learning_rate": 0.0004056978124613862, + "loss": 1.0572, + "step": 17000 + }, + { + "epoch": 0.8708785582633627, + "grad_norm": 0.17272843420505524, + "learning_rate": 0.0004041067128736684, + "loss": 1.068, + "step": 17010 + }, + { + "epoch": 0.8713905386033176, + "grad_norm": 0.17482733726501465, + "learning_rate": 0.0004025218533932921, + "loss": 1.0434, + "step": 17020 + }, + { + "epoch": 0.8719025189432725, + "grad_norm": 0.17604181170463562, + "learning_rate": 0.00040094320954728313, + "loss": 1.0473, + "step": 17030 + }, + { + "epoch": 0.8724144992832276, + "grad_norm": 0.17563997209072113, + "learning_rate": 0.000399370756958647, + "loss": 1.0326, + "step": 17040 + }, + { + "epoch": 0.8729264796231825, + "grad_norm": 0.17245963215827942, + "learning_rate": 0.00039780447134599286, + "loss": 1.0473, + "step": 17050 + }, + { + "epoch": 0.8734384599631374, + "grad_norm": 0.1761290282011032, + "learning_rate": 0.00039624432852315933, + "loss": 1.0521, + "step": 17060 + }, + { + "epoch": 0.8739504403030923, + "grad_norm": 0.17559461295604706, + "learning_rate": 0.0003946903043988396, + "loss": 1.0499, + "step": 17070 + }, + { + "epoch": 0.8744624206430474, + "grad_norm": 0.16970165073871613, + "learning_rate": 0.00039314237497621053, + "loss": 1.0653, + "step": 17080 + }, + { + "epoch": 0.8749744009830023, + "grad_norm": 0.1792786717414856, + "learning_rate": 0.00039160051635256165, + "loss": 1.0554, + "step": 17090 + }, + { + "epoch": 0.8754863813229572, + "grad_norm": 0.16863805055618286, + "learning_rate": 0.0003900647047189262, + "loss": 1.0524, + "step": 17100 + }, + { + "epoch": 0.8759983616629121, + "grad_norm": 0.1794777661561966, + "learning_rate": 0.0003885349163597133, + "loss": 1.0741, + "step": 17110 + }, + { + "epoch": 0.876510342002867, + "grad_norm": 0.1949402540922165, + "learning_rate": 0.0003870111276523419, + "loss": 1.0458, + "step": 17120 + }, + { + "epoch": 0.8770223223428221, + "grad_norm": 0.17837046086788177, + "learning_rate": 0.0003854933150668761, + "loss": 1.0484, + "step": 17130 + }, + { + "epoch": 0.877534302682777, + "grad_norm": 0.16682222485542297, + "learning_rate": 0.00038398145516566133, + "loss": 1.0643, + "step": 17140 + }, + { + "epoch": 0.8780462830227319, + "grad_norm": 0.17241717875003815, + "learning_rate": 0.00038247552460296324, + "loss": 1.0561, + "step": 17150 + }, + { + "epoch": 0.8785582633626868, + "grad_norm": 0.16557161509990692, + "learning_rate": 0.00038097550012460626, + "loss": 1.0614, + "step": 17160 + }, + { + "epoch": 0.8790702437026419, + "grad_norm": 0.17597849667072296, + "learning_rate": 0.00037948135856761536, + "loss": 1.0541, + "step": 17170 + }, + { + "epoch": 0.8795822240425968, + "grad_norm": 0.17368751764297485, + "learning_rate": 0.00037799307685985786, + "loss": 1.0482, + "step": 17180 + }, + { + "epoch": 0.8800942043825517, + "grad_norm": 0.17278683185577393, + "learning_rate": 0.00037651063201968706, + "loss": 1.0493, + "step": 17190 + }, + { + "epoch": 0.8806061847225066, + "grad_norm": 0.17373493313789368, + "learning_rate": 0.00037503400115558816, + "loss": 1.0547, + "step": 17200 + }, + { + "epoch": 0.8811181650624617, + "grad_norm": 0.1761094480752945, + "learning_rate": 0.0003735631614658236, + "loss": 1.0476, + "step": 17210 + }, + { + "epoch": 0.8816301454024166, + "grad_norm": 0.1749420464038849, + "learning_rate": 0.00037209809023808216, + "loss": 1.0313, + "step": 17220 + }, + { + "epoch": 0.8821421257423715, + "grad_norm": 0.1756523847579956, + "learning_rate": 0.0003706387648491272, + "loss": 1.0551, + "step": 17230 + }, + { + "epoch": 0.8826541060823264, + "grad_norm": 0.1767933964729309, + "learning_rate": 0.0003691851627644478, + "loss": 1.0385, + "step": 17240 + }, + { + "epoch": 0.8831660864222813, + "grad_norm": 0.17991852760314941, + "learning_rate": 0.00036773726153791126, + "loss": 1.0534, + "step": 17250 + }, + { + "epoch": 0.8836780667622364, + "grad_norm": 0.17097926139831543, + "learning_rate": 0.00036629503881141533, + "loss": 1.0424, + "step": 17260 + }, + { + "epoch": 0.8841900471021913, + "grad_norm": 0.1836550533771515, + "learning_rate": 0.00036485847231454427, + "loss": 1.0627, + "step": 17270 + }, + { + "epoch": 0.8847020274421462, + "grad_norm": 0.18745499849319458, + "learning_rate": 0.00036342753986422373, + "loss": 1.0475, + "step": 17280 + }, + { + "epoch": 0.8852140077821011, + "grad_norm": 0.17117556929588318, + "learning_rate": 0.00036200221936437925, + "loss": 1.0457, + "step": 17290 + }, + { + "epoch": 0.8857259881220562, + "grad_norm": 0.17555800080299377, + "learning_rate": 0.0003605824888055944, + "loss": 1.0505, + "step": 17300 + }, + { + "epoch": 0.8862379684620111, + "grad_norm": 0.17367680370807648, + "learning_rate": 0.00035916832626477105, + "loss": 1.0433, + "step": 17310 + }, + { + "epoch": 0.886749948801966, + "grad_norm": 0.16771985590457916, + "learning_rate": 0.0003577597099047911, + "loss": 1.0405, + "step": 17320 + }, + { + "epoch": 0.8872619291419209, + "grad_norm": 0.17749017477035522, + "learning_rate": 0.00035635661797417894, + "loss": 1.0326, + "step": 17330 + }, + { + "epoch": 0.8877739094818758, + "grad_norm": 0.1756659597158432, + "learning_rate": 0.0003549590288067658, + "loss": 1.0481, + "step": 17340 + }, + { + "epoch": 0.8882858898218309, + "grad_norm": 0.17804957926273346, + "learning_rate": 0.00035356692082135497, + "loss": 1.0348, + "step": 17350 + }, + { + "epoch": 0.8887978701617858, + "grad_norm": 0.17013497650623322, + "learning_rate": 0.000352180272521389, + "loss": 1.0444, + "step": 17360 + }, + { + "epoch": 0.8893098505017407, + "grad_norm": 0.16462627053260803, + "learning_rate": 0.000350799062494617, + "loss": 1.0473, + "step": 17370 + }, + { + "epoch": 0.8898218308416956, + "grad_norm": 0.18292909860610962, + "learning_rate": 0.00034942326941276463, + "loss": 1.0548, + "step": 17380 + }, + { + "epoch": 0.8903338111816507, + "grad_norm": 0.16778182983398438, + "learning_rate": 0.00034805287203120474, + "loss": 1.0486, + "step": 17390 + }, + { + "epoch": 0.8908457915216056, + "grad_norm": 0.17783689498901367, + "learning_rate": 0.0003466878491886288, + "loss": 1.0422, + "step": 17400 + }, + { + "epoch": 0.8913577718615605, + "grad_norm": 0.17219282686710358, + "learning_rate": 0.0003453281798067208, + "loss": 1.036, + "step": 17410 + }, + { + "epoch": 0.8918697522015154, + "grad_norm": 0.17862632870674133, + "learning_rate": 0.00034397384288983114, + "loss": 1.0441, + "step": 17420 + }, + { + "epoch": 0.8923817325414705, + "grad_norm": 0.17450949549674988, + "learning_rate": 0.00034262481752465293, + "loss": 1.0629, + "step": 17430 + }, + { + "epoch": 0.8928937128814254, + "grad_norm": 0.17378470301628113, + "learning_rate": 0.00034128108287989866, + "loss": 1.0322, + "step": 17440 + }, + { + "epoch": 0.8934056932213803, + "grad_norm": 0.17379970848560333, + "learning_rate": 0.00033994261820597885, + "loss": 1.0553, + "step": 17450 + }, + { + "epoch": 0.8939176735613352, + "grad_norm": 0.17971958220005035, + "learning_rate": 0.00033860940283468143, + "loss": 1.0532, + "step": 17460 + }, + { + "epoch": 0.8944296539012901, + "grad_norm": 0.17435471713542938, + "learning_rate": 0.0003372814161788526, + "loss": 1.0289, + "step": 17470 + }, + { + "epoch": 0.8949416342412452, + "grad_norm": 0.17900234460830688, + "learning_rate": 0.00033595863773207914, + "loss": 1.0407, + "step": 17480 + }, + { + "epoch": 0.8954536145812001, + "grad_norm": 0.1703522503376007, + "learning_rate": 0.00033464104706837144, + "loss": 1.0505, + "step": 17490 + }, + { + "epoch": 0.895965594921155, + "grad_norm": 0.1772749274969101, + "learning_rate": 0.00033332862384184833, + "loss": 1.0504, + "step": 17500 + }, + { + "epoch": 0.8964775752611099, + "grad_norm": 0.19156505167484283, + "learning_rate": 0.0003320213477864227, + "loss": 1.0537, + "step": 17510 + }, + { + "epoch": 0.896989555601065, + "grad_norm": 0.17889319360256195, + "learning_rate": 0.00033071919871548877, + "loss": 1.0371, + "step": 17520 + }, + { + "epoch": 0.8975015359410199, + "grad_norm": 0.17776621878147125, + "learning_rate": 0.0003294221565216104, + "loss": 1.0498, + "step": 17530 + }, + { + "epoch": 0.8980135162809748, + "grad_norm": 0.1731380671262741, + "learning_rate": 0.0003281302011762101, + "loss": 1.048, + "step": 17540 + }, + { + "epoch": 0.8985254966209297, + "grad_norm": 0.17784886062145233, + "learning_rate": 0.0003268433127292607, + "loss": 1.0477, + "step": 17550 + }, + { + "epoch": 0.8990374769608847, + "grad_norm": 0.17313584685325623, + "learning_rate": 0.00032556147130897615, + "loss": 1.0323, + "step": 17560 + }, + { + "epoch": 0.8995494573008397, + "grad_norm": 0.17907077074050903, + "learning_rate": 0.00032428465712150536, + "loss": 1.0527, + "step": 17570 + }, + { + "epoch": 0.9000614376407946, + "grad_norm": 0.1737951934337616, + "learning_rate": 0.0003230128504506268, + "loss": 1.036, + "step": 17580 + }, + { + "epoch": 0.9005734179807495, + "grad_norm": 0.17653332650661469, + "learning_rate": 0.00032174603165744314, + "loss": 1.0478, + "step": 17590 + }, + { + "epoch": 0.9010853983207044, + "grad_norm": 0.16936801373958588, + "learning_rate": 0.00032048418118007897, + "loss": 1.0452, + "step": 17600 + }, + { + "epoch": 0.9015973786606595, + "grad_norm": 0.17044688761234283, + "learning_rate": 0.00031922727953337794, + "loss": 1.0433, + "step": 17610 + }, + { + "epoch": 0.9021093590006144, + "grad_norm": 0.16897530853748322, + "learning_rate": 0.0003179753073086024, + "loss": 1.041, + "step": 17620 + }, + { + "epoch": 0.9026213393405693, + "grad_norm": 0.17904484272003174, + "learning_rate": 0.00031672824517313354, + "loss": 1.0562, + "step": 17630 + }, + { + "epoch": 0.9031333196805242, + "grad_norm": 0.1729121208190918, + "learning_rate": 0.0003154860738701725, + "loss": 1.0345, + "step": 17640 + }, + { + "epoch": 0.9036453000204792, + "grad_norm": 0.17275741696357727, + "learning_rate": 0.00031424877421844385, + "loss": 1.0494, + "step": 17650 + }, + { + "epoch": 0.9041572803604342, + "grad_norm": 0.16756050288677216, + "learning_rate": 0.0003130163271118985, + "loss": 1.0305, + "step": 17660 + }, + { + "epoch": 0.9046692607003891, + "grad_norm": 0.17867998778820038, + "learning_rate": 0.00031178871351941924, + "loss": 1.045, + "step": 17670 + }, + { + "epoch": 0.905181241040344, + "grad_norm": 0.17364557087421417, + "learning_rate": 0.00031056591448452663, + "loss": 1.0407, + "step": 17680 + }, + { + "epoch": 0.905693221380299, + "grad_norm": 0.18060193955898285, + "learning_rate": 0.0003093479111250863, + "loss": 1.0404, + "step": 17690 + }, + { + "epoch": 0.906205201720254, + "grad_norm": 0.17321224510669708, + "learning_rate": 0.0003081346846330176, + "loss": 1.0338, + "step": 17700 + }, + { + "epoch": 0.9067171820602089, + "grad_norm": 0.1827027052640915, + "learning_rate": 0.0003069262162740026, + "loss": 1.0513, + "step": 17710 + }, + { + "epoch": 0.9072291624001638, + "grad_norm": 0.17330406606197357, + "learning_rate": 0.0003057224873871977, + "loss": 1.0537, + "step": 17720 + }, + { + "epoch": 0.9077411427401187, + "grad_norm": 0.1664852797985077, + "learning_rate": 0.00030452347938494435, + "loss": 1.0385, + "step": 17730 + }, + { + "epoch": 0.9082531230800738, + "grad_norm": 0.1791536808013916, + "learning_rate": 0.00030332917375248324, + "loss": 1.0205, + "step": 17740 + }, + { + "epoch": 0.9087651034200287, + "grad_norm": 0.168918177485466, + "learning_rate": 0.0003021395520476674, + "loss": 1.0278, + "step": 17750 + }, + { + "epoch": 0.9092770837599836, + "grad_norm": 0.17502665519714355, + "learning_rate": 0.00030095459590067796, + "loss": 1.0533, + "step": 17760 + }, + { + "epoch": 0.9097890640999385, + "grad_norm": 0.17242580652236938, + "learning_rate": 0.00029977428701374024, + "loss": 1.0465, + "step": 17770 + }, + { + "epoch": 0.9103010444398935, + "grad_norm": 0.16884900629520416, + "learning_rate": 0.0002985986071608414, + "loss": 1.0553, + "step": 17780 + }, + { + "epoch": 0.9108130247798485, + "grad_norm": 0.17999139428138733, + "learning_rate": 0.00029742753818744894, + "loss": 1.052, + "step": 17790 + }, + { + "epoch": 0.9113250051198034, + "grad_norm": 0.19205188751220703, + "learning_rate": 0.0002962610620102301, + "loss": 1.0386, + "step": 17800 + }, + { + "epoch": 0.9118369854597583, + "grad_norm": 0.17089873552322388, + "learning_rate": 0.00029509916061677314, + "loss": 1.0519, + "step": 17810 + }, + { + "epoch": 0.9123489657997133, + "grad_norm": 0.1669624298810959, + "learning_rate": 0.0002939418160653087, + "loss": 1.045, + "step": 17820 + }, + { + "epoch": 0.9128609461396683, + "grad_norm": 0.1757606416940689, + "learning_rate": 0.000292789010484433, + "loss": 1.0311, + "step": 17830 + }, + { + "epoch": 0.9133729264796232, + "grad_norm": 0.1726016104221344, + "learning_rate": 0.00029164072607283187, + "loss": 1.0302, + "step": 17840 + }, + { + "epoch": 0.9138849068195781, + "grad_norm": 0.17893843352794647, + "learning_rate": 0.0002904969450990057, + "loss": 1.0236, + "step": 17850 + }, + { + "epoch": 0.914396887159533, + "grad_norm": 0.17613349854946136, + "learning_rate": 0.00028935764990099594, + "loss": 1.0467, + "step": 17860 + }, + { + "epoch": 0.914908867499488, + "grad_norm": 0.1762663722038269, + "learning_rate": 0.00028822282288611204, + "loss": 1.0143, + "step": 17870 + }, + { + "epoch": 0.915420847839443, + "grad_norm": 0.17385472357273102, + "learning_rate": 0.00028709244653066, + "loss": 1.0373, + "step": 17880 + }, + { + "epoch": 0.9159328281793979, + "grad_norm": 0.173353374004364, + "learning_rate": 0.0002859665033796716, + "loss": 1.0231, + "step": 17890 + }, + { + "epoch": 0.9164448085193528, + "grad_norm": 0.1739385724067688, + "learning_rate": 0.0002848449760466353, + "loss": 1.0174, + "step": 17900 + }, + { + "epoch": 0.9169567888593078, + "grad_norm": 0.17758533358573914, + "learning_rate": 0.000283727847213227, + "loss": 1.0271, + "step": 17910 + }, + { + "epoch": 0.9174687691992628, + "grad_norm": 0.17424450814723969, + "learning_rate": 0.00028261509962904325, + "loss": 1.0464, + "step": 17920 + }, + { + "epoch": 0.9179807495392177, + "grad_norm": 0.18018485605716705, + "learning_rate": 0.0002815067161113347, + "loss": 1.0379, + "step": 17930 + }, + { + "epoch": 0.9184927298791726, + "grad_norm": 0.18166567385196686, + "learning_rate": 0.0002804026795447407, + "loss": 1.0364, + "step": 17940 + }, + { + "epoch": 0.9190047102191276, + "grad_norm": 0.17235900461673737, + "learning_rate": 0.00027930297288102513, + "loss": 1.052, + "step": 17950 + }, + { + "epoch": 0.9195166905590826, + "grad_norm": 0.17493902146816254, + "learning_rate": 0.000278207579138813, + "loss": 1.0377, + "step": 17960 + }, + { + "epoch": 0.9200286708990375, + "grad_norm": 0.17957419157028198, + "learning_rate": 0.0002771164814033282, + "loss": 1.0392, + "step": 17970 + }, + { + "epoch": 0.9205406512389924, + "grad_norm": 0.178439199924469, + "learning_rate": 0.00027602966282613264, + "loss": 1.0333, + "step": 17980 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.17528565227985382, + "learning_rate": 0.0002749471066248655, + "loss": 1.035, + "step": 17990 + }, + { + "epoch": 0.9215646119189023, + "grad_norm": 0.18786676228046417, + "learning_rate": 0.0002738687960829849, + "loss": 1.0263, + "step": 18000 + }, + { + "epoch": 0.9220765922588573, + "grad_norm": 0.18565250933170319, + "learning_rate": 0.00027279471454950873, + "loss": 1.0266, + "step": 18010 + }, + { + "epoch": 0.9225885725988122, + "grad_norm": 0.17576780915260315, + "learning_rate": 0.00027172484543875865, + "loss": 1.0472, + "step": 18020 + }, + { + "epoch": 0.9231005529387671, + "grad_norm": 0.17549046874046326, + "learning_rate": 0.00027065917223010303, + "loss": 1.0357, + "step": 18030 + }, + { + "epoch": 0.9236125332787221, + "grad_norm": 0.17524850368499756, + "learning_rate": 0.00026959767846770227, + "loss": 1.0194, + "step": 18040 + }, + { + "epoch": 0.9241245136186771, + "grad_norm": 0.18681474030017853, + "learning_rate": 0.00026854034776025495, + "loss": 1.0406, + "step": 18050 + }, + { + "epoch": 0.924636493958632, + "grad_norm": 0.1830626130104065, + "learning_rate": 0.000267487163780744, + "loss": 1.0445, + "step": 18060 + }, + { + "epoch": 0.9251484742985869, + "grad_norm": 0.1787140816450119, + "learning_rate": 0.00026643811026618537, + "loss": 1.0365, + "step": 18070 + }, + { + "epoch": 0.9256604546385419, + "grad_norm": 0.1781841665506363, + "learning_rate": 0.00026539317101737637, + "loss": 1.0278, + "step": 18080 + }, + { + "epoch": 0.9261724349784968, + "grad_norm": 0.18114568293094635, + "learning_rate": 0.00026435232989864576, + "loss": 1.0273, + "step": 18090 + }, + { + "epoch": 0.9266844153184518, + "grad_norm": 0.18065612018108368, + "learning_rate": 0.0002633155708376045, + "loss": 1.0435, + "step": 18100 + }, + { + "epoch": 0.9271963956584067, + "grad_norm": 0.17828424274921417, + "learning_rate": 0.0002622828778248974, + "loss": 1.0103, + "step": 18110 + }, + { + "epoch": 0.9277083759983616, + "grad_norm": 0.17807289958000183, + "learning_rate": 0.0002612542349139565, + "loss": 1.0437, + "step": 18120 + }, + { + "epoch": 0.9282203563383166, + "grad_norm": 0.17496445775032043, + "learning_rate": 0.0002602296262207541, + "loss": 1.0219, + "step": 18130 + }, + { + "epoch": 0.9287323366782716, + "grad_norm": 0.17806415259838104, + "learning_rate": 0.00025920903592355785, + "loss": 1.0256, + "step": 18140 + }, + { + "epoch": 0.9292443170182265, + "grad_norm": 0.17231720685958862, + "learning_rate": 0.00025819244826268654, + "loss": 1.0487, + "step": 18150 + }, + { + "epoch": 0.9297562973581814, + "grad_norm": 0.18158575892448425, + "learning_rate": 0.00025717984754026655, + "loss": 1.0258, + "step": 18160 + }, + { + "epoch": 0.9302682776981364, + "grad_norm": 0.17217537760734558, + "learning_rate": 0.0002561712181199894, + "loss": 1.012, + "step": 18170 + }, + { + "epoch": 0.9307802580380914, + "grad_norm": 0.16844135522842407, + "learning_rate": 0.0002551665444268703, + "loss": 1.0449, + "step": 18180 + }, + { + "epoch": 0.9312922383780463, + "grad_norm": 0.17478111386299133, + "learning_rate": 0.0002541658109470081, + "loss": 1.0357, + "step": 18190 + }, + { + "epoch": 0.9318042187180012, + "grad_norm": 0.17291343212127686, + "learning_rate": 0.00025316900222734496, + "loss": 1.0406, + "step": 18200 + }, + { + "epoch": 0.9323161990579562, + "grad_norm": 0.17205969989299774, + "learning_rate": 0.00025217610287542845, + "loss": 1.0263, + "step": 18210 + }, + { + "epoch": 0.9328281793979111, + "grad_norm": 0.17579463124275208, + "learning_rate": 0.0002511870975591733, + "loss": 1.0487, + "step": 18220 + }, + { + "epoch": 0.9333401597378661, + "grad_norm": 0.185591459274292, + "learning_rate": 0.00025020197100662507, + "loss": 1.0289, + "step": 18230 + }, + { + "epoch": 0.933852140077821, + "grad_norm": 0.18697933852672577, + "learning_rate": 0.0002492207080057241, + "loss": 1.0445, + "step": 18240 + }, + { + "epoch": 0.934364120417776, + "grad_norm": 0.1702352613210678, + "learning_rate": 0.00024824329340407056, + "loss": 1.017, + "step": 18250 + }, + { + "epoch": 0.9348761007577309, + "grad_norm": 0.17386525869369507, + "learning_rate": 0.0002472697121086907, + "loss": 1.0265, + "step": 18260 + }, + { + "epoch": 0.9353880810976859, + "grad_norm": 0.17194058001041412, + "learning_rate": 0.0002462999490858035, + "loss": 1.0305, + "step": 18270 + }, + { + "epoch": 0.9359000614376408, + "grad_norm": 0.17600733041763306, + "learning_rate": 0.00024533398936058893, + "loss": 1.0161, + "step": 18280 + }, + { + "epoch": 0.9364120417775957, + "grad_norm": 0.17031820118427277, + "learning_rate": 0.0002443718180169563, + "loss": 1.0435, + "step": 18290 + }, + { + "epoch": 0.9369240221175507, + "grad_norm": 0.17277632653713226, + "learning_rate": 0.00024341342019731398, + "loss": 1.0321, + "step": 18300 + }, + { + "epoch": 0.9374360024575056, + "grad_norm": 0.17314958572387695, + "learning_rate": 0.00024245878110234033, + "loss": 1.0419, + "step": 18310 + }, + { + "epoch": 0.9379479827974606, + "grad_norm": 0.17943693697452545, + "learning_rate": 0.0002415078859907547, + "loss": 1.0455, + "step": 18320 + }, + { + "epoch": 0.9384599631374155, + "grad_norm": 0.17218518257141113, + "learning_rate": 0.00024056072017909026, + "loss": 1.0174, + "step": 18330 + }, + { + "epoch": 0.9389719434773705, + "grad_norm": 0.1672009378671646, + "learning_rate": 0.0002396172690414667, + "loss": 1.0304, + "step": 18340 + }, + { + "epoch": 0.9394839238173254, + "grad_norm": 0.16872192919254303, + "learning_rate": 0.00023867751800936513, + "loss": 1.0334, + "step": 18350 + }, + { + "epoch": 0.9399959041572804, + "grad_norm": 0.17709334194660187, + "learning_rate": 0.0002377414525714023, + "loss": 1.043, + "step": 18360 + }, + { + "epoch": 0.9405078844972353, + "grad_norm": 0.17235656082630157, + "learning_rate": 0.00023680905827310717, + "loss": 1.0296, + "step": 18370 + }, + { + "epoch": 0.9410198648371902, + "grad_norm": 0.17677216231822968, + "learning_rate": 0.0002358803207166974, + "loss": 1.0304, + "step": 18380 + }, + { + "epoch": 0.9415318451771452, + "grad_norm": 0.17921361327171326, + "learning_rate": 0.00023495522556085693, + "loss": 1.0287, + "step": 18390 + }, + { + "epoch": 0.9420438255171002, + "grad_norm": 0.18774552643299103, + "learning_rate": 0.0002340337585205149, + "loss": 1.0303, + "step": 18400 + }, + { + "epoch": 0.9425558058570551, + "grad_norm": 0.1885557323694229, + "learning_rate": 0.00023311590536662463, + "loss": 1.0225, + "step": 18410 + }, + { + "epoch": 0.94306778619701, + "grad_norm": 0.17091277241706848, + "learning_rate": 0.00023220165192594432, + "loss": 1.0216, + "step": 18420 + }, + { + "epoch": 0.943579766536965, + "grad_norm": 0.17530862987041473, + "learning_rate": 0.00023129098408081777, + "loss": 1.0303, + "step": 18430 + }, + { + "epoch": 0.9440917468769199, + "grad_norm": 0.17937549948692322, + "learning_rate": 0.00023038388776895662, + "loss": 1.0234, + "step": 18440 + }, + { + "epoch": 0.9446037272168749, + "grad_norm": 0.1720314472913742, + "learning_rate": 0.00022948034898322335, + "loss": 1.0304, + "step": 18450 + }, + { + "epoch": 0.9451157075568298, + "grad_norm": 0.1731894463300705, + "learning_rate": 0.00022858035377141452, + "loss": 1.021, + "step": 18460 + }, + { + "epoch": 0.9456276878967848, + "grad_norm": 0.17468558251857758, + "learning_rate": 0.00022768388823604584, + "loss": 1.0224, + "step": 18470 + }, + { + "epoch": 0.9461396682367397, + "grad_norm": 0.17135438323020935, + "learning_rate": 0.00022679093853413717, + "loss": 1.0392, + "step": 18480 + }, + { + "epoch": 0.9466516485766947, + "grad_norm": 0.1784532517194748, + "learning_rate": 0.00022590149087699918, + "loss": 1.0183, + "step": 18490 + }, + { + "epoch": 0.9471636289166496, + "grad_norm": 0.18522332608699799, + "learning_rate": 0.00022501553153001985, + "loss": 1.0361, + "step": 18500 + }, + { + "epoch": 0.9476756092566045, + "grad_norm": 0.18401268124580383, + "learning_rate": 0.00022413304681245284, + "loss": 1.0329, + "step": 18510 + }, + { + "epoch": 0.9481875895965595, + "grad_norm": 0.16760528087615967, + "learning_rate": 0.00022325402309720624, + "loss": 1.0199, + "step": 18520 + }, + { + "epoch": 0.9486995699365144, + "grad_norm": 0.18120263516902924, + "learning_rate": 0.00022237844681063175, + "loss": 1.0252, + "step": 18530 + }, + { + "epoch": 0.9492115502764694, + "grad_norm": 0.1899506002664566, + "learning_rate": 0.00022150630443231562, + "loss": 1.0064, + "step": 18540 + }, + { + "epoch": 0.9497235306164243, + "grad_norm": 0.1819719672203064, + "learning_rate": 0.00022063758249486932, + "loss": 1.0246, + "step": 18550 + }, + { + "epoch": 0.9502355109563793, + "grad_norm": 0.17660754919052124, + "learning_rate": 0.00021977226758372213, + "loss": 1.0305, + "step": 18560 + }, + { + "epoch": 0.9507474912963342, + "grad_norm": 0.17415086925029755, + "learning_rate": 0.00021891034633691347, + "loss": 1.0369, + "step": 18570 + }, + { + "epoch": 0.9512594716362892, + "grad_norm": 0.17310403287410736, + "learning_rate": 0.00021805180544488684, + "loss": 1.0272, + "step": 18580 + }, + { + "epoch": 0.9517714519762441, + "grad_norm": 0.17484420537948608, + "learning_rate": 0.0002171966316502845, + "loss": 1.028, + "step": 18590 + }, + { + "epoch": 0.952283432316199, + "grad_norm": 0.18543212115764618, + "learning_rate": 0.00021634481174774217, + "loss": 1.0296, + "step": 18600 + }, + { + "epoch": 0.952795412656154, + "grad_norm": 0.1763850450515747, + "learning_rate": 0.00021549633258368582, + "loss": 1.0307, + "step": 18610 + }, + { + "epoch": 0.953307392996109, + "grad_norm": 0.16824059188365936, + "learning_rate": 0.00021465118105612805, + "loss": 1.0206, + "step": 18620 + }, + { + "epoch": 0.9538193733360639, + "grad_norm": 0.17931176722049713, + "learning_rate": 0.00021380934411446574, + "loss": 1.016, + "step": 18630 + }, + { + "epoch": 0.9543313536760188, + "grad_norm": 0.18147091567516327, + "learning_rate": 0.00021297080875927913, + "loss": 1.0211, + "step": 18640 + }, + { + "epoch": 0.9548433340159738, + "grad_norm": 0.18163631856441498, + "learning_rate": 0.00021213556204213033, + "loss": 1.0263, + "step": 18650 + }, + { + "epoch": 0.9553553143559287, + "grad_norm": 0.17591601610183716, + "learning_rate": 0.00021130359106536384, + "loss": 1.0417, + "step": 18660 + }, + { + "epoch": 0.9558672946958837, + "grad_norm": 0.17677730321884155, + "learning_rate": 0.00021047488298190723, + "loss": 1.0299, + "step": 18670 + }, + { + "epoch": 0.9563792750358386, + "grad_norm": 0.17326125502586365, + "learning_rate": 0.0002096494249950729, + "loss": 1.0268, + "step": 18680 + }, + { + "epoch": 0.9568912553757936, + "grad_norm": 0.1793946474790573, + "learning_rate": 0.00020882720435836026, + "loss": 1.0355, + "step": 18690 + }, + { + "epoch": 0.9574032357157485, + "grad_norm": 0.1703524887561798, + "learning_rate": 0.00020800820837525892, + "loss": 1.005, + "step": 18700 + }, + { + "epoch": 0.9579152160557035, + "grad_norm": 0.17965586483478546, + "learning_rate": 0.000207192424399053, + "loss": 1.0182, + "step": 18710 + }, + { + "epoch": 0.9584271963956584, + "grad_norm": 0.16650822758674622, + "learning_rate": 0.00020637983983262526, + "loss": 1.0304, + "step": 18720 + }, + { + "epoch": 0.9589391767356134, + "grad_norm": 0.1700984239578247, + "learning_rate": 0.00020557044212826323, + "loss": 1.0103, + "step": 18730 + }, + { + "epoch": 0.9594511570755683, + "grad_norm": 0.18094299733638763, + "learning_rate": 0.0002047642187874647, + "loss": 1.0247, + "step": 18740 + }, + { + "epoch": 0.9599631374155232, + "grad_norm": 0.16972561180591583, + "learning_rate": 0.0002039611573607455, + "loss": 1.0328, + "step": 18750 + }, + { + "epoch": 0.9604751177554782, + "grad_norm": 0.1718764752149582, + "learning_rate": 0.0002031612454474467, + "loss": 1.0015, + "step": 18760 + }, + { + "epoch": 0.9609870980954331, + "grad_norm": 0.17211291193962097, + "learning_rate": 0.00020236447069554324, + "loss": 1.0485, + "step": 18770 + }, + { + "epoch": 0.9614990784353881, + "grad_norm": 0.17325459420681, + "learning_rate": 0.00020157082080145356, + "loss": 1.0122, + "step": 18780 + }, + { + "epoch": 0.962011058775343, + "grad_norm": 0.1677115559577942, + "learning_rate": 0.00020078028350984888, + "loss": 1.0144, + "step": 18790 + }, + { + "epoch": 0.962523039115298, + "grad_norm": 0.17302511632442474, + "learning_rate": 0.00019999284661346487, + "loss": 1.0247, + "step": 18800 + }, + { + "epoch": 0.9630350194552529, + "grad_norm": 0.1713932901620865, + "learning_rate": 0.00019920849795291223, + "loss": 1.0135, + "step": 18810 + }, + { + "epoch": 0.9635469997952079, + "grad_norm": 0.1779249906539917, + "learning_rate": 0.00019842722541648977, + "loss": 1.0166, + "step": 18820 + }, + { + "epoch": 0.9640589801351628, + "grad_norm": 0.17072229087352753, + "learning_rate": 0.00019764901693999665, + "loss": 1.0214, + "step": 18830 + }, + { + "epoch": 0.9645709604751177, + "grad_norm": 0.17682915925979614, + "learning_rate": 0.00019687386050654655, + "loss": 1.0412, + "step": 18840 + }, + { + "epoch": 0.9650829408150727, + "grad_norm": 0.17209376394748688, + "learning_rate": 0.00019610174414638203, + "loss": 1.0139, + "step": 18850 + }, + { + "epoch": 0.9655949211550277, + "grad_norm": 0.16988667845726013, + "learning_rate": 0.0001953326559366896, + "loss": 1.03, + "step": 18860 + }, + { + "epoch": 0.9661069014949826, + "grad_norm": 0.17056208848953247, + "learning_rate": 0.0001945665840014157, + "loss": 1.0335, + "step": 18870 + }, + { + "epoch": 0.9666188818349375, + "grad_norm": 0.17054276168346405, + "learning_rate": 0.0001938035165110831, + "loss": 1.0281, + "step": 18880 + }, + { + "epoch": 0.9671308621748925, + "grad_norm": 0.17490647733211517, + "learning_rate": 0.00019304344168260865, + "loss": 1.0401, + "step": 18890 + }, + { + "epoch": 0.9676428425148474, + "grad_norm": 0.17823657393455505, + "learning_rate": 0.00019228634777912089, + "loss": 1.0225, + "step": 18900 + }, + { + "epoch": 0.9681548228548024, + "grad_norm": 0.1651022583246231, + "learning_rate": 0.00019153222310977906, + "loss": 1.0088, + "step": 18910 + }, + { + "epoch": 0.9686668031947573, + "grad_norm": 0.18135780096054077, + "learning_rate": 0.00019078105602959264, + "loss": 1.0289, + "step": 18920 + }, + { + "epoch": 0.9691787835347123, + "grad_norm": 0.17016355693340302, + "learning_rate": 0.00019003283493924117, + "loss": 1.0111, + "step": 18930 + }, + { + "epoch": 0.9696907638746672, + "grad_norm": 0.17754383385181427, + "learning_rate": 0.00018928754828489555, + "loss": 1.0291, + "step": 18940 + }, + { + "epoch": 0.9702027442146222, + "grad_norm": 0.16962246596813202, + "learning_rate": 0.00018854518455803946, + "loss": 1.0228, + "step": 18950 + }, + { + "epoch": 0.9707147245545771, + "grad_norm": 0.17820075154304504, + "learning_rate": 0.00018780573229529142, + "loss": 1.0231, + "step": 18960 + }, + { + "epoch": 0.971226704894532, + "grad_norm": 0.16597416996955872, + "learning_rate": 0.00018706918007822834, + "loss": 1.0327, + "step": 18970 + }, + { + "epoch": 0.971738685234487, + "grad_norm": 0.17721499502658844, + "learning_rate": 0.00018633551653320852, + "loss": 1.0084, + "step": 18980 + }, + { + "epoch": 0.972250665574442, + "grad_norm": 0.17141114175319672, + "learning_rate": 0.0001856047303311967, + "loss": 1.0361, + "step": 18990 + }, + { + "epoch": 0.9727626459143969, + "grad_norm": 0.17473644018173218, + "learning_rate": 0.0001848768101875884, + "loss": 1.0051, + "step": 19000 + }, + { + "epoch": 0.9732746262543518, + "grad_norm": 0.17746561765670776, + "learning_rate": 0.00018415174486203638, + "loss": 1.0266, + "step": 19010 + }, + { + "epoch": 0.9737866065943068, + "grad_norm": 0.16750702261924744, + "learning_rate": 0.00018342952315827656, + "loss": 1.0282, + "step": 19020 + }, + { + "epoch": 0.9742985869342617, + "grad_norm": 0.1748443841934204, + "learning_rate": 0.00018271013392395522, + "loss": 1.0183, + "step": 19030 + }, + { + "epoch": 0.9748105672742167, + "grad_norm": 0.17715822160243988, + "learning_rate": 0.0001819935660504572, + "loss": 1.0145, + "step": 19040 + }, + { + "epoch": 0.9753225476141716, + "grad_norm": 0.17972363531589508, + "learning_rate": 0.0001812798084727336, + "loss": 1.0069, + "step": 19050 + }, + { + "epoch": 0.9758345279541265, + "grad_norm": 0.17496472597122192, + "learning_rate": 0.00018056885016913175, + "loss": 1.0074, + "step": 19060 + }, + { + "epoch": 0.9763465082940815, + "grad_norm": 0.18323951959609985, + "learning_rate": 0.00017986068016122433, + "loss": 1.0487, + "step": 19070 + }, + { + "epoch": 0.9768584886340365, + "grad_norm": 0.16890741884708405, + "learning_rate": 0.00017915528751364033, + "loss": 1.0153, + "step": 19080 + }, + { + "epoch": 0.9773704689739914, + "grad_norm": 0.17116831243038177, + "learning_rate": 0.0001784526613338959, + "loss": 1.0132, + "step": 19090 + }, + { + "epoch": 0.9778824493139463, + "grad_norm": 0.17036503553390503, + "learning_rate": 0.00017775279077222617, + "loss": 1.0228, + "step": 19100 + }, + { + "epoch": 0.9783944296539013, + "grad_norm": 0.17859075963497162, + "learning_rate": 0.00017705566502141802, + "loss": 1.0123, + "step": 19110 + }, + { + "epoch": 0.9789064099938563, + "grad_norm": 0.17719532549381256, + "learning_rate": 0.00017636127331664266, + "loss": 1.0385, + "step": 19120 + }, + { + "epoch": 0.9794183903338112, + "grad_norm": 0.17673194408416748, + "learning_rate": 0.00017566960493528995, + "loss": 1.0224, + "step": 19130 + }, + { + "epoch": 0.9799303706737661, + "grad_norm": 0.1806950718164444, + "learning_rate": 0.00017498064919680242, + "loss": 1.0111, + "step": 19140 + }, + { + "epoch": 0.9804423510137211, + "grad_norm": 0.16843082010746002, + "learning_rate": 0.00017429439546251066, + "loss": 1.0059, + "step": 19150 + }, + { + "epoch": 0.980954331353676, + "grad_norm": 0.17275168001651764, + "learning_rate": 0.00017361083313546875, + "loss": 1.037, + "step": 19160 + }, + { + "epoch": 0.981466311693631, + "grad_norm": 0.17250047624111176, + "learning_rate": 0.0001729299516602907, + "loss": 1.0193, + "step": 19170 + }, + { + "epoch": 0.9819782920335859, + "grad_norm": 0.17009197175502777, + "learning_rate": 0.00017225174052298777, + "loss": 1.0412, + "step": 19180 + }, + { + "epoch": 0.9824902723735408, + "grad_norm": 0.16845643520355225, + "learning_rate": 0.0001715761892508056, + "loss": 1.0268, + "step": 19190 + }, + { + "epoch": 0.9830022527134958, + "grad_norm": 0.16763417422771454, + "learning_rate": 0.0001709032874120629, + "loss": 1.0425, + "step": 19200 + }, + { + "epoch": 0.9835142330534508, + "grad_norm": 0.1747148334980011, + "learning_rate": 0.00017023302461599015, + "loss": 1.0228, + "step": 19210 + }, + { + "epoch": 0.9840262133934057, + "grad_norm": 0.17626087367534637, + "learning_rate": 0.0001695653905125693, + "loss": 1.0142, + "step": 19220 + }, + { + "epoch": 0.9845381937333606, + "grad_norm": 0.17711155116558075, + "learning_rate": 0.00016890037479237377, + "loss": 1.0238, + "step": 19230 + }, + { + "epoch": 0.9850501740733156, + "grad_norm": 0.1858174353837967, + "learning_rate": 0.00016823796718640937, + "loss": 1.033, + "step": 19240 + }, + { + "epoch": 0.9855621544132706, + "grad_norm": 0.1855236142873764, + "learning_rate": 0.0001675781574659558, + "loss": 1.0276, + "step": 19250 + }, + { + "epoch": 0.9860741347532255, + "grad_norm": 0.16916634142398834, + "learning_rate": 0.0001669209354424084, + "loss": 1.0208, + "step": 19260 + }, + { + "epoch": 0.9865861150931804, + "grad_norm": 0.18142545223236084, + "learning_rate": 0.00016626629096712137, + "loss": 1.0302, + "step": 19270 + }, + { + "epoch": 0.9870980954331353, + "grad_norm": 0.16748617589473724, + "learning_rate": 0.00016561421393125036, + "loss": 1.0244, + "step": 19280 + }, + { + "epoch": 0.9876100757730903, + "grad_norm": 0.180519700050354, + "learning_rate": 0.000164964694265597, + "loss": 1.0009, + "step": 19290 + }, + { + "epoch": 0.9881220561130453, + "grad_norm": 0.16856172680854797, + "learning_rate": 0.00016431772194045298, + "loss": 1.009, + "step": 19300 + }, + { + "epoch": 0.9886340364530002, + "grad_norm": 0.17907920479774475, + "learning_rate": 0.00016367328696544536, + "loss": 1.0182, + "step": 19310 + }, + { + "epoch": 0.9891460167929551, + "grad_norm": 0.18012414872646332, + "learning_rate": 0.00016303137938938238, + "loss": 1.0238, + "step": 19320 + }, + { + "epoch": 0.9896579971329101, + "grad_norm": 0.17940422892570496, + "learning_rate": 0.0001623919893000996, + "loss": 1.035, + "step": 19330 + }, + { + "epoch": 0.9901699774728651, + "grad_norm": 0.17534732818603516, + "learning_rate": 0.00016175510682430694, + "loss": 1.0282, + "step": 19340 + }, + { + "epoch": 0.99068195781282, + "grad_norm": 0.17742076516151428, + "learning_rate": 0.0001611207221274363, + "loss": 1.0308, + "step": 19350 + }, + { + "epoch": 0.9911939381527749, + "grad_norm": 0.174584299325943, + "learning_rate": 0.00016048882541348943, + "loss": 1.0433, + "step": 19360 + }, + { + "epoch": 0.9917059184927299, + "grad_norm": 0.17817029356956482, + "learning_rate": 0.00015985940692488709, + "loss": 1.0088, + "step": 19370 + }, + { + "epoch": 0.9922178988326849, + "grad_norm": 0.1764860898256302, + "learning_rate": 0.00015923245694231792, + "loss": 1.0051, + "step": 19380 + }, + { + "epoch": 0.9927298791726398, + "grad_norm": 0.1679990142583847, + "learning_rate": 0.00015860796578458873, + "loss": 1.0383, + "step": 19390 + }, + { + "epoch": 0.9932418595125947, + "grad_norm": 0.17141203582286835, + "learning_rate": 0.00015798592380847468, + "loss": 1.0367, + "step": 19400 + }, + { + "epoch": 0.9937538398525496, + "grad_norm": 0.17301303148269653, + "learning_rate": 0.00015736632140857067, + "loss": 1.0227, + "step": 19410 + }, + { + "epoch": 0.9942658201925046, + "grad_norm": 0.17585515975952148, + "learning_rate": 0.00015674914901714278, + "loss": 1.0373, + "step": 19420 + }, + { + "epoch": 0.9947778005324596, + "grad_norm": 0.17036980390548706, + "learning_rate": 0.0001561343971039807, + "loss": 1.0025, + "step": 19430 + }, + { + "epoch": 0.9952897808724145, + "grad_norm": 0.1802191138267517, + "learning_rate": 0.00015552205617625053, + "loss": 1.0378, + "step": 19440 + }, + { + "epoch": 0.9958017612123694, + "grad_norm": 0.17641904950141907, + "learning_rate": 0.000154912116778348, + "loss": 1.0317, + "step": 19450 + }, + { + "epoch": 0.9963137415523244, + "grad_norm": 0.18595443665981293, + "learning_rate": 0.0001543045694917528, + "loss": 1.0081, + "step": 19460 + }, + { + "epoch": 0.9968257218922794, + "grad_norm": 0.17444072663784027, + "learning_rate": 0.0001536994049348828, + "loss": 1.0242, + "step": 19470 + }, + { + "epoch": 0.9973377022322343, + "grad_norm": 0.17894035577774048, + "learning_rate": 0.00015309661376294953, + "loss": 1.0269, + "step": 19480 + }, + { + "epoch": 0.9978496825721892, + "grad_norm": 0.17125560343265533, + "learning_rate": 0.00015249618666781352, + "loss": 1.0189, + "step": 19490 + }, + { + "epoch": 0.9983616629121441, + "grad_norm": 0.1681634485721588, + "learning_rate": 0.0001518981143778408, + "loss": 1.0014, + "step": 19500 + }, + { + "epoch": 0.9988736432520992, + "grad_norm": 0.17360231280326843, + "learning_rate": 0.0001513023876577597, + "loss": 1.0033, + "step": 19510 + }, + { + "epoch": 0.9993856235920541, + "grad_norm": 0.17242667078971863, + "learning_rate": 0.00015070899730851815, + "loss": 1.0236, + "step": 19520 + }, + { + "epoch": 0.999897603932009, + "grad_norm": 0.16095665097236633, + "learning_rate": 0.0001501179341671418, + "loss": 1.0393, + "step": 19530 + } + ], + "logging_steps": 10, + "max_steps": 19532, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.8288897328545792e+17, + "train_batch_size": 512, + "trial_name": null, + "trial_params": null + }