diff --git "a/checkpoint-3500/trainer_state.json" "b/checkpoint-3500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-3500/trainer_state.json" @@ -0,0 +1,24533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.33489618218352313, + "eval_steps": 500, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00033478406427854036, + "grad_norm": 2.5951156616210938, + "learning_rate": 5e-06, + "loss": 2.0872, + "step": 1 + }, + { + "epoch": 0.0006695681285570807, + "grad_norm": 2.011127471923828, + "learning_rate": 1e-05, + "loss": 2.0068, + "step": 2 + }, + { + "epoch": 0.001004352192835621, + "grad_norm": 2.058666467666626, + "learning_rate": 1.5e-05, + "loss": 2.0258, + "step": 3 + }, + { + "epoch": 0.0013391362571141614, + "grad_norm": 2.2120566368103027, + "learning_rate": 2e-05, + "loss": 2.0142, + "step": 4 + }, + { + "epoch": 0.0016739203213927017, + "grad_norm": 2.370628595352173, + "learning_rate": 2.5e-05, + "loss": 2.0344, + "step": 5 + }, + { + "epoch": 0.002008704385671242, + "grad_norm": 1.437334418296814, + "learning_rate": 3e-05, + "loss": 1.9203, + "step": 6 + }, + { + "epoch": 0.002343488449949782, + "grad_norm": 1.1889039278030396, + "learning_rate": 3.5e-05, + "loss": 1.9264, + "step": 7 + }, + { + "epoch": 0.002678272514228323, + "grad_norm": 1.0925624370574951, + "learning_rate": 4e-05, + "loss": 1.9148, + "step": 8 + }, + { + "epoch": 0.003013056578506863, + "grad_norm": 0.5106806755065918, + "learning_rate": 4.5e-05, + "loss": 1.9098, + "step": 9 + }, + { + "epoch": 0.0033478406427854034, + "grad_norm": 0.506732702255249, + "learning_rate": 5e-05, + "loss": 1.832, + "step": 10 + }, + { + "epoch": 0.0036826247070639436, + "grad_norm": 0.47460949420928955, + "learning_rate": 5.500000000000001e-05, + "loss": 1.879, + "step": 11 + }, + { + "epoch": 0.004017408771342484, + "grad_norm": 0.5833293199539185, + "learning_rate": 6e-05, + "loss": 1.8777, + "step": 12 + }, + { + "epoch": 0.004352192835621024, + "grad_norm": 0.4383687376976013, + "learning_rate": 6.500000000000001e-05, + "loss": 1.8559, + "step": 13 + }, + { + "epoch": 0.004686976899899564, + "grad_norm": 0.35056746006011963, + "learning_rate": 7e-05, + "loss": 1.7573, + "step": 14 + }, + { + "epoch": 0.0050217609641781055, + "grad_norm": 0.5545064210891724, + "learning_rate": 7.500000000000001e-05, + "loss": 1.7541, + "step": 15 + }, + { + "epoch": 0.005356545028456646, + "grad_norm": 0.3440060019493103, + "learning_rate": 8e-05, + "loss": 1.6876, + "step": 16 + }, + { + "epoch": 0.005691329092735186, + "grad_norm": 0.36561861634254456, + "learning_rate": 8.5e-05, + "loss": 1.7454, + "step": 17 + }, + { + "epoch": 0.006026113157013726, + "grad_norm": 0.282402902841568, + "learning_rate": 9e-05, + "loss": 1.8184, + "step": 18 + }, + { + "epoch": 0.0063608972212922665, + "grad_norm": 0.1981375813484192, + "learning_rate": 9.5e-05, + "loss": 1.7448, + "step": 19 + }, + { + "epoch": 0.006695681285570807, + "grad_norm": 0.16754242777824402, + "learning_rate": 0.0001, + "loss": 1.7555, + "step": 20 + }, + { + "epoch": 0.007030465349849347, + "grad_norm": 0.17915141582489014, + "learning_rate": 0.0001, + "loss": 1.7533, + "step": 21 + }, + { + "epoch": 0.007365249414127887, + "grad_norm": 0.1990516483783722, + "learning_rate": 0.0001, + "loss": 1.6819, + "step": 22 + }, + { + "epoch": 0.0077000334784064275, + "grad_norm": 0.20808538794517517, + "learning_rate": 0.0001, + "loss": 1.7345, + "step": 23 + }, + { + "epoch": 0.008034817542684968, + "grad_norm": 0.2500799894332886, + "learning_rate": 0.0001, + "loss": 1.7636, + "step": 24 + }, + { + "epoch": 0.008369601606963508, + "grad_norm": 0.2141977846622467, + "learning_rate": 0.0001, + "loss": 1.7475, + "step": 25 + }, + { + "epoch": 0.008704385671242048, + "grad_norm": 0.2018044888973236, + "learning_rate": 0.0001, + "loss": 1.6445, + "step": 26 + }, + { + "epoch": 0.009039169735520589, + "grad_norm": 0.19822722673416138, + "learning_rate": 0.0001, + "loss": 1.7604, + "step": 27 + }, + { + "epoch": 0.009373953799799129, + "grad_norm": 0.18675795197486877, + "learning_rate": 0.0001, + "loss": 1.7743, + "step": 28 + }, + { + "epoch": 0.009708737864077669, + "grad_norm": 0.16032469272613525, + "learning_rate": 0.0001, + "loss": 1.7221, + "step": 29 + }, + { + "epoch": 0.010043521928356211, + "grad_norm": 0.17107701301574707, + "learning_rate": 0.0001, + "loss": 1.7514, + "step": 30 + }, + { + "epoch": 0.010378305992634751, + "grad_norm": 0.1589154601097107, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 31 + }, + { + "epoch": 0.010713090056913292, + "grad_norm": 0.13346004486083984, + "learning_rate": 0.0001, + "loss": 1.6011, + "step": 32 + }, + { + "epoch": 0.011047874121191832, + "grad_norm": 0.1687479168176651, + "learning_rate": 0.0001, + "loss": 1.7694, + "step": 33 + }, + { + "epoch": 0.011382658185470372, + "grad_norm": 0.14785747230052948, + "learning_rate": 0.0001, + "loss": 1.6836, + "step": 34 + }, + { + "epoch": 0.011717442249748912, + "grad_norm": 0.13441652059555054, + "learning_rate": 0.0001, + "loss": 1.7087, + "step": 35 + }, + { + "epoch": 0.012052226314027453, + "grad_norm": 0.13479024171829224, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 36 + }, + { + "epoch": 0.012387010378305993, + "grad_norm": 0.15816231071949005, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 37 + }, + { + "epoch": 0.012721794442584533, + "grad_norm": 0.12814071774482727, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 38 + }, + { + "epoch": 0.013056578506863073, + "grad_norm": 0.129450261592865, + "learning_rate": 0.0001, + "loss": 1.7623, + "step": 39 + }, + { + "epoch": 0.013391362571141614, + "grad_norm": 0.13946504890918732, + "learning_rate": 0.0001, + "loss": 1.8067, + "step": 40 + }, + { + "epoch": 0.013726146635420154, + "grad_norm": 0.1161293238401413, + "learning_rate": 0.0001, + "loss": 1.688, + "step": 41 + }, + { + "epoch": 0.014060930699698694, + "grad_norm": 0.11296379566192627, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 42 + }, + { + "epoch": 0.014395714763977234, + "grad_norm": 0.12507247924804688, + "learning_rate": 0.0001, + "loss": 1.7287, + "step": 43 + }, + { + "epoch": 0.014730498828255775, + "grad_norm": 0.11496929079294205, + "learning_rate": 0.0001, + "loss": 1.626, + "step": 44 + }, + { + "epoch": 0.015065282892534315, + "grad_norm": 0.13881774246692657, + "learning_rate": 0.0001, + "loss": 1.7501, + "step": 45 + }, + { + "epoch": 0.015400066956812855, + "grad_norm": 0.1255090981721878, + "learning_rate": 0.0001, + "loss": 1.6952, + "step": 46 + }, + { + "epoch": 0.015734851021091397, + "grad_norm": 0.11783197522163391, + "learning_rate": 0.0001, + "loss": 1.6256, + "step": 47 + }, + { + "epoch": 0.016069635085369936, + "grad_norm": 0.12152993679046631, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 48 + }, + { + "epoch": 0.016404419149648478, + "grad_norm": 0.12172088027000427, + "learning_rate": 0.0001, + "loss": 1.6927, + "step": 49 + }, + { + "epoch": 0.016739203213927016, + "grad_norm": 0.13490882515907288, + "learning_rate": 0.0001, + "loss": 1.7372, + "step": 50 + }, + { + "epoch": 0.017073987278205558, + "grad_norm": 0.1124483197927475, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 51 + }, + { + "epoch": 0.017408771342484097, + "grad_norm": 0.11569201201200485, + "learning_rate": 0.0001, + "loss": 1.7156, + "step": 52 + }, + { + "epoch": 0.01774355540676264, + "grad_norm": 0.12394021451473236, + "learning_rate": 0.0001, + "loss": 1.6132, + "step": 53 + }, + { + "epoch": 0.018078339471041177, + "grad_norm": 0.11930014938116074, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 54 + }, + { + "epoch": 0.01841312353531972, + "grad_norm": 0.1183612123131752, + "learning_rate": 0.0001, + "loss": 1.6953, + "step": 55 + }, + { + "epoch": 0.018747907599598258, + "grad_norm": 0.11677711457014084, + "learning_rate": 0.0001, + "loss": 1.6936, + "step": 56 + }, + { + "epoch": 0.0190826916638768, + "grad_norm": 0.12049452215433121, + "learning_rate": 0.0001, + "loss": 1.6381, + "step": 57 + }, + { + "epoch": 0.019417475728155338, + "grad_norm": 0.11653623729944229, + "learning_rate": 0.0001, + "loss": 1.7704, + "step": 58 + }, + { + "epoch": 0.01975225979243388, + "grad_norm": 0.12089766561985016, + "learning_rate": 0.0001, + "loss": 1.6819, + "step": 59 + }, + { + "epoch": 0.020087043856712422, + "grad_norm": 0.12823008000850677, + "learning_rate": 0.0001, + "loss": 1.7584, + "step": 60 + }, + { + "epoch": 0.02042182792099096, + "grad_norm": 0.12439601868391037, + "learning_rate": 0.0001, + "loss": 1.6761, + "step": 61 + }, + { + "epoch": 0.020756611985269503, + "grad_norm": 0.12000609189271927, + "learning_rate": 0.0001, + "loss": 1.7014, + "step": 62 + }, + { + "epoch": 0.02109139604954804, + "grad_norm": 0.12034812569618225, + "learning_rate": 0.0001, + "loss": 1.7128, + "step": 63 + }, + { + "epoch": 0.021426180113826583, + "grad_norm": 0.11534720659255981, + "learning_rate": 0.0001, + "loss": 1.694, + "step": 64 + }, + { + "epoch": 0.02176096417810512, + "grad_norm": 0.11633310467004776, + "learning_rate": 0.0001, + "loss": 1.6718, + "step": 65 + }, + { + "epoch": 0.022095748242383664, + "grad_norm": 0.13419900834560394, + "learning_rate": 0.0001, + "loss": 1.707, + "step": 66 + }, + { + "epoch": 0.022430532306662202, + "grad_norm": 0.11928509920835495, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 67 + }, + { + "epoch": 0.022765316370940744, + "grad_norm": 0.11948949843645096, + "learning_rate": 0.0001, + "loss": 1.6304, + "step": 68 + }, + { + "epoch": 0.023100100435219283, + "grad_norm": 0.12679244577884674, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 69 + }, + { + "epoch": 0.023434884499497825, + "grad_norm": 0.10675504058599472, + "learning_rate": 0.0001, + "loss": 1.6785, + "step": 70 + }, + { + "epoch": 0.023769668563776363, + "grad_norm": 0.12108162045478821, + "learning_rate": 0.0001, + "loss": 1.6695, + "step": 71 + }, + { + "epoch": 0.024104452628054905, + "grad_norm": 0.11032188683748245, + "learning_rate": 0.0001, + "loss": 1.7293, + "step": 72 + }, + { + "epoch": 0.024439236692333444, + "grad_norm": 0.11592775583267212, + "learning_rate": 0.0001, + "loss": 1.6726, + "step": 73 + }, + { + "epoch": 0.024774020756611986, + "grad_norm": 0.11566442996263504, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 74 + }, + { + "epoch": 0.025108804820890524, + "grad_norm": 0.11673177778720856, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 75 + }, + { + "epoch": 0.025443588885169066, + "grad_norm": 0.1140669733285904, + "learning_rate": 0.0001, + "loss": 1.6886, + "step": 76 + }, + { + "epoch": 0.025778372949447605, + "grad_norm": 0.11448585987091064, + "learning_rate": 0.0001, + "loss": 1.6765, + "step": 77 + }, + { + "epoch": 0.026113157013726147, + "grad_norm": 0.11363522708415985, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 78 + }, + { + "epoch": 0.02644794107800469, + "grad_norm": 0.10882357507944107, + "learning_rate": 0.0001, + "loss": 1.6495, + "step": 79 + }, + { + "epoch": 0.026782725142283227, + "grad_norm": 0.11577261239290237, + "learning_rate": 0.0001, + "loss": 1.6941, + "step": 80 + }, + { + "epoch": 0.02711750920656177, + "grad_norm": 0.12674297392368317, + "learning_rate": 0.0001, + "loss": 1.7615, + "step": 81 + }, + { + "epoch": 0.027452293270840308, + "grad_norm": 0.11801646649837494, + "learning_rate": 0.0001, + "loss": 1.6414, + "step": 82 + }, + { + "epoch": 0.02778707733511885, + "grad_norm": 0.11615725606679916, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 83 + }, + { + "epoch": 0.028121861399397388, + "grad_norm": 0.1159651130437851, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 84 + }, + { + "epoch": 0.02845664546367593, + "grad_norm": 0.12539416551589966, + "learning_rate": 0.0001, + "loss": 1.7152, + "step": 85 + }, + { + "epoch": 0.02879142952795447, + "grad_norm": 0.10691766440868378, + "learning_rate": 0.0001, + "loss": 1.552, + "step": 86 + }, + { + "epoch": 0.02912621359223301, + "grad_norm": 0.11859432607889175, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 87 + }, + { + "epoch": 0.02946099765651155, + "grad_norm": 0.12362800538539886, + "learning_rate": 0.0001, + "loss": 1.6944, + "step": 88 + }, + { + "epoch": 0.02979578172079009, + "grad_norm": 0.12135861068964005, + "learning_rate": 0.0001, + "loss": 1.6703, + "step": 89 + }, + { + "epoch": 0.03013056578506863, + "grad_norm": 0.15077495574951172, + "learning_rate": 0.0001, + "loss": 1.7522, + "step": 90 + }, + { + "epoch": 0.03046534984934717, + "grad_norm": 0.1137770563364029, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 91 + }, + { + "epoch": 0.03080013391362571, + "grad_norm": 0.11616989970207214, + "learning_rate": 0.0001, + "loss": 1.7166, + "step": 92 + }, + { + "epoch": 0.031134917977904252, + "grad_norm": 0.14210130274295807, + "learning_rate": 0.0001, + "loss": 1.7889, + "step": 93 + }, + { + "epoch": 0.031469702042182794, + "grad_norm": 0.1261507272720337, + "learning_rate": 0.0001, + "loss": 1.6593, + "step": 94 + }, + { + "epoch": 0.03180448610646133, + "grad_norm": 0.13197694718837738, + "learning_rate": 0.0001, + "loss": 1.6182, + "step": 95 + }, + { + "epoch": 0.03213927017073987, + "grad_norm": 0.11830636113882065, + "learning_rate": 0.0001, + "loss": 1.6373, + "step": 96 + }, + { + "epoch": 0.03247405423501841, + "grad_norm": 0.12643662095069885, + "learning_rate": 0.0001, + "loss": 1.6601, + "step": 97 + }, + { + "epoch": 0.032808838299296955, + "grad_norm": 0.13787776231765747, + "learning_rate": 0.0001, + "loss": 1.7496, + "step": 98 + }, + { + "epoch": 0.033143622363575494, + "grad_norm": 0.1096898540854454, + "learning_rate": 0.0001, + "loss": 1.5582, + "step": 99 + }, + { + "epoch": 0.03347840642785403, + "grad_norm": 0.13948234915733337, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 100 + }, + { + "epoch": 0.03381319049213258, + "grad_norm": 0.11294490098953247, + "learning_rate": 0.0001, + "loss": 1.6703, + "step": 101 + }, + { + "epoch": 0.034147974556411116, + "grad_norm": 0.12141433358192444, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 102 + }, + { + "epoch": 0.034482758620689655, + "grad_norm": 0.13332489132881165, + "learning_rate": 0.0001, + "loss": 1.6761, + "step": 103 + }, + { + "epoch": 0.03481754268496819, + "grad_norm": 0.12173039466142654, + "learning_rate": 0.0001, + "loss": 1.6304, + "step": 104 + }, + { + "epoch": 0.03515232674924674, + "grad_norm": 0.12168910354375839, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 105 + }, + { + "epoch": 0.03548711081352528, + "grad_norm": 0.1244431585073471, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 106 + }, + { + "epoch": 0.035821894877803816, + "grad_norm": 0.12028734385967255, + "learning_rate": 0.0001, + "loss": 1.684, + "step": 107 + }, + { + "epoch": 0.036156678942082354, + "grad_norm": 0.12029126286506653, + "learning_rate": 0.0001, + "loss": 1.6799, + "step": 108 + }, + { + "epoch": 0.0364914630063609, + "grad_norm": 0.11806860566139221, + "learning_rate": 0.0001, + "loss": 1.7245, + "step": 109 + }, + { + "epoch": 0.03682624707063944, + "grad_norm": 0.12406452000141144, + "learning_rate": 0.0001, + "loss": 1.6881, + "step": 110 + }, + { + "epoch": 0.03716103113491798, + "grad_norm": 0.118985615670681, + "learning_rate": 0.0001, + "loss": 1.6675, + "step": 111 + }, + { + "epoch": 0.037495815199196515, + "grad_norm": 0.12949040532112122, + "learning_rate": 0.0001, + "loss": 1.6871, + "step": 112 + }, + { + "epoch": 0.03783059926347506, + "grad_norm": 0.12375173717737198, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 113 + }, + { + "epoch": 0.0381653833277536, + "grad_norm": 0.11779066920280457, + "learning_rate": 0.0001, + "loss": 1.7399, + "step": 114 + }, + { + "epoch": 0.03850016739203214, + "grad_norm": 0.1195269301533699, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 115 + }, + { + "epoch": 0.038834951456310676, + "grad_norm": 0.11929327249526978, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 116 + }, + { + "epoch": 0.03916973552058922, + "grad_norm": 0.11532218009233475, + "learning_rate": 0.0001, + "loss": 1.6395, + "step": 117 + }, + { + "epoch": 0.03950451958486776, + "grad_norm": 0.11126700043678284, + "learning_rate": 0.0001, + "loss": 1.622, + "step": 118 + }, + { + "epoch": 0.0398393036491463, + "grad_norm": 0.1309433877468109, + "learning_rate": 0.0001, + "loss": 1.5791, + "step": 119 + }, + { + "epoch": 0.040174087713424844, + "grad_norm": 0.12015924602746964, + "learning_rate": 0.0001, + "loss": 1.655, + "step": 120 + }, + { + "epoch": 0.04050887177770338, + "grad_norm": 0.12615351378917694, + "learning_rate": 0.0001, + "loss": 1.6215, + "step": 121 + }, + { + "epoch": 0.04084365584198192, + "grad_norm": 0.1387631893157959, + "learning_rate": 0.0001, + "loss": 1.7451, + "step": 122 + }, + { + "epoch": 0.04117843990626046, + "grad_norm": 0.1166117936372757, + "learning_rate": 0.0001, + "loss": 1.6537, + "step": 123 + }, + { + "epoch": 0.041513223970539005, + "grad_norm": 0.1521015763282776, + "learning_rate": 0.0001, + "loss": 1.6545, + "step": 124 + }, + { + "epoch": 0.041848008034817544, + "grad_norm": 0.1296280473470688, + "learning_rate": 0.0001, + "loss": 1.6355, + "step": 125 + }, + { + "epoch": 0.04218279209909608, + "grad_norm": 0.13189557194709778, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 126 + }, + { + "epoch": 0.04251757616337462, + "grad_norm": 0.1445418745279312, + "learning_rate": 0.0001, + "loss": 1.7444, + "step": 127 + }, + { + "epoch": 0.042852360227653166, + "grad_norm": 0.11560577899217606, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 128 + }, + { + "epoch": 0.043187144291931705, + "grad_norm": 0.16312864422798157, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 129 + }, + { + "epoch": 0.04352192835621024, + "grad_norm": 0.1284494251012802, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 130 + }, + { + "epoch": 0.04385671242048878, + "grad_norm": 0.11743518710136414, + "learning_rate": 0.0001, + "loss": 1.6273, + "step": 131 + }, + { + "epoch": 0.04419149648476733, + "grad_norm": 0.17127898335456848, + "learning_rate": 0.0001, + "loss": 1.5955, + "step": 132 + }, + { + "epoch": 0.044526280549045866, + "grad_norm": 0.1554144024848938, + "learning_rate": 0.0001, + "loss": 1.7738, + "step": 133 + }, + { + "epoch": 0.044861064613324404, + "grad_norm": 0.13085848093032837, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 134 + }, + { + "epoch": 0.04519584867760294, + "grad_norm": 0.1883288025856018, + "learning_rate": 0.0001, + "loss": 1.6159, + "step": 135 + }, + { + "epoch": 0.04553063274188149, + "grad_norm": 0.11826716363430023, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 136 + }, + { + "epoch": 0.04586541680616003, + "grad_norm": 0.15767724812030792, + "learning_rate": 0.0001, + "loss": 1.682, + "step": 137 + }, + { + "epoch": 0.046200200870438565, + "grad_norm": 0.14300817251205444, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 138 + }, + { + "epoch": 0.04653498493471711, + "grad_norm": 0.11646521836519241, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 139 + }, + { + "epoch": 0.04686976899899565, + "grad_norm": 0.12624727189540863, + "learning_rate": 0.0001, + "loss": 1.6128, + "step": 140 + }, + { + "epoch": 0.04720455306327419, + "grad_norm": 0.14111122488975525, + "learning_rate": 0.0001, + "loss": 1.618, + "step": 141 + }, + { + "epoch": 0.047539337127552726, + "grad_norm": 0.1404058188199997, + "learning_rate": 0.0001, + "loss": 1.66, + "step": 142 + }, + { + "epoch": 0.04787412119183127, + "grad_norm": 0.12555940449237823, + "learning_rate": 0.0001, + "loss": 1.666, + "step": 143 + }, + { + "epoch": 0.04820890525610981, + "grad_norm": 0.14494475722312927, + "learning_rate": 0.0001, + "loss": 1.6147, + "step": 144 + }, + { + "epoch": 0.04854368932038835, + "grad_norm": 0.12508632242679596, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 145 + }, + { + "epoch": 0.04887847338466689, + "grad_norm": 0.11790450662374496, + "learning_rate": 0.0001, + "loss": 1.7342, + "step": 146 + }, + { + "epoch": 0.04921325744894543, + "grad_norm": 0.1416400671005249, + "learning_rate": 0.0001, + "loss": 1.6673, + "step": 147 + }, + { + "epoch": 0.04954804151322397, + "grad_norm": 0.13537850975990295, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 148 + }, + { + "epoch": 0.04988282557750251, + "grad_norm": 0.12219058722257614, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 149 + }, + { + "epoch": 0.05021760964178105, + "grad_norm": 0.1398639678955078, + "learning_rate": 0.0001, + "loss": 1.6454, + "step": 150 + }, + { + "epoch": 0.050552393706059594, + "grad_norm": 0.14572647213935852, + "learning_rate": 0.0001, + "loss": 1.6094, + "step": 151 + }, + { + "epoch": 0.05088717777033813, + "grad_norm": 0.10937194526195526, + "learning_rate": 0.0001, + "loss": 1.5776, + "step": 152 + }, + { + "epoch": 0.05122196183461667, + "grad_norm": 0.1404120773077011, + "learning_rate": 0.0001, + "loss": 1.6112, + "step": 153 + }, + { + "epoch": 0.05155674589889521, + "grad_norm": 0.1480460911989212, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 154 + }, + { + "epoch": 0.051891529963173755, + "grad_norm": 0.10971348732709885, + "learning_rate": 0.0001, + "loss": 1.5744, + "step": 155 + }, + { + "epoch": 0.05222631402745229, + "grad_norm": 0.1468382179737091, + "learning_rate": 0.0001, + "loss": 1.7518, + "step": 156 + }, + { + "epoch": 0.05256109809173083, + "grad_norm": 0.13429516553878784, + "learning_rate": 0.0001, + "loss": 1.5812, + "step": 157 + }, + { + "epoch": 0.05289588215600938, + "grad_norm": 0.11399335414171219, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 158 + }, + { + "epoch": 0.053230666220287916, + "grad_norm": 0.13944409787654877, + "learning_rate": 0.0001, + "loss": 1.6789, + "step": 159 + }, + { + "epoch": 0.053565450284566454, + "grad_norm": 0.1390630453824997, + "learning_rate": 0.0001, + "loss": 1.6368, + "step": 160 + }, + { + "epoch": 0.05390023434884499, + "grad_norm": 0.1098702922463417, + "learning_rate": 0.0001, + "loss": 1.5462, + "step": 161 + }, + { + "epoch": 0.05423501841312354, + "grad_norm": 0.13710471987724304, + "learning_rate": 0.0001, + "loss": 1.7208, + "step": 162 + }, + { + "epoch": 0.05456980247740208, + "grad_norm": 0.1283336579799652, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 163 + }, + { + "epoch": 0.054904586541680615, + "grad_norm": 0.11550601571798325, + "learning_rate": 0.0001, + "loss": 1.7409, + "step": 164 + }, + { + "epoch": 0.055239370605959154, + "grad_norm": 0.12028289586305618, + "learning_rate": 0.0001, + "loss": 1.6685, + "step": 165 + }, + { + "epoch": 0.0555741546702377, + "grad_norm": 0.13237926363945007, + "learning_rate": 0.0001, + "loss": 1.6639, + "step": 166 + }, + { + "epoch": 0.05590893873451624, + "grad_norm": 0.11385014653205872, + "learning_rate": 0.0001, + "loss": 1.6742, + "step": 167 + }, + { + "epoch": 0.056243722798794776, + "grad_norm": 0.13613030314445496, + "learning_rate": 0.0001, + "loss": 1.6898, + "step": 168 + }, + { + "epoch": 0.056578506863073315, + "grad_norm": 0.12617048621177673, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 169 + }, + { + "epoch": 0.05691329092735186, + "grad_norm": 0.11637625098228455, + "learning_rate": 0.0001, + "loss": 1.6362, + "step": 170 + }, + { + "epoch": 0.0572480749916304, + "grad_norm": 0.13217699527740479, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 171 + }, + { + "epoch": 0.05758285905590894, + "grad_norm": 0.12088079750537872, + "learning_rate": 0.0001, + "loss": 1.4997, + "step": 172 + }, + { + "epoch": 0.057917643120187476, + "grad_norm": 0.11359237879514694, + "learning_rate": 0.0001, + "loss": 1.564, + "step": 173 + }, + { + "epoch": 0.05825242718446602, + "grad_norm": 0.12509793043136597, + "learning_rate": 0.0001, + "loss": 1.6855, + "step": 174 + }, + { + "epoch": 0.05858721124874456, + "grad_norm": 0.1233699694275856, + "learning_rate": 0.0001, + "loss": 1.665, + "step": 175 + }, + { + "epoch": 0.0589219953130231, + "grad_norm": 0.11172114312648773, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 176 + }, + { + "epoch": 0.059256779377301644, + "grad_norm": 0.12242110818624496, + "learning_rate": 0.0001, + "loss": 1.6736, + "step": 177 + }, + { + "epoch": 0.05959156344158018, + "grad_norm": 0.12275474518537521, + "learning_rate": 0.0001, + "loss": 1.6373, + "step": 178 + }, + { + "epoch": 0.05992634750585872, + "grad_norm": 0.11666038632392883, + "learning_rate": 0.0001, + "loss": 1.6957, + "step": 179 + }, + { + "epoch": 0.06026113157013726, + "grad_norm": 0.1209944486618042, + "learning_rate": 0.0001, + "loss": 1.618, + "step": 180 + }, + { + "epoch": 0.060595915634415805, + "grad_norm": 0.12028312683105469, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 181 + }, + { + "epoch": 0.06093069969869434, + "grad_norm": 0.11835712194442749, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 182 + }, + { + "epoch": 0.06126548376297288, + "grad_norm": 0.13166043162345886, + "learning_rate": 0.0001, + "loss": 1.6064, + "step": 183 + }, + { + "epoch": 0.06160026782725142, + "grad_norm": 0.1366170346736908, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 184 + }, + { + "epoch": 0.061935051891529966, + "grad_norm": 0.12185468524694443, + "learning_rate": 0.0001, + "loss": 1.5695, + "step": 185 + }, + { + "epoch": 0.062269835955808504, + "grad_norm": 0.12310407310724258, + "learning_rate": 0.0001, + "loss": 1.6799, + "step": 186 + }, + { + "epoch": 0.06260462002008704, + "grad_norm": 0.14412462711334229, + "learning_rate": 0.0001, + "loss": 1.5855, + "step": 187 + }, + { + "epoch": 0.06293940408436559, + "grad_norm": 0.11908841878175735, + "learning_rate": 0.0001, + "loss": 1.5752, + "step": 188 + }, + { + "epoch": 0.06327418814864412, + "grad_norm": 0.12137061357498169, + "learning_rate": 0.0001, + "loss": 1.6018, + "step": 189 + }, + { + "epoch": 0.06360897221292267, + "grad_norm": 0.128020778298378, + "learning_rate": 0.0001, + "loss": 1.5894, + "step": 190 + }, + { + "epoch": 0.06394375627720121, + "grad_norm": 0.13447493314743042, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 191 + }, + { + "epoch": 0.06427854034147974, + "grad_norm": 0.11885492503643036, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 192 + }, + { + "epoch": 0.06461332440575829, + "grad_norm": 0.13066913187503815, + "learning_rate": 0.0001, + "loss": 1.6807, + "step": 193 + }, + { + "epoch": 0.06494810847003682, + "grad_norm": 0.12650778889656067, + "learning_rate": 0.0001, + "loss": 1.6498, + "step": 194 + }, + { + "epoch": 0.06528289253431536, + "grad_norm": 0.116504967212677, + "learning_rate": 0.0001, + "loss": 1.6037, + "step": 195 + }, + { + "epoch": 0.06561767659859391, + "grad_norm": 0.12200898677110672, + "learning_rate": 0.0001, + "loss": 1.5816, + "step": 196 + }, + { + "epoch": 0.06595246066287244, + "grad_norm": 0.13350239396095276, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 197 + }, + { + "epoch": 0.06628724472715099, + "grad_norm": 0.12119137495756149, + "learning_rate": 0.0001, + "loss": 1.5747, + "step": 198 + }, + { + "epoch": 0.06662202879142953, + "grad_norm": 0.12292595952749252, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 199 + }, + { + "epoch": 0.06695681285570806, + "grad_norm": 0.14958657324314117, + "learning_rate": 0.0001, + "loss": 1.7248, + "step": 200 + }, + { + "epoch": 0.06729159691998661, + "grad_norm": 0.1206580251455307, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 201 + }, + { + "epoch": 0.06762638098426516, + "grad_norm": 0.13404549658298492, + "learning_rate": 0.0001, + "loss": 1.6827, + "step": 202 + }, + { + "epoch": 0.06796116504854369, + "grad_norm": 0.11746184527873993, + "learning_rate": 0.0001, + "loss": 1.5827, + "step": 203 + }, + { + "epoch": 0.06829594911282223, + "grad_norm": 0.1220933049917221, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 204 + }, + { + "epoch": 0.06863073317710076, + "grad_norm": 0.1395500898361206, + "learning_rate": 0.0001, + "loss": 1.6691, + "step": 205 + }, + { + "epoch": 0.06896551724137931, + "grad_norm": 0.12085775285959244, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 206 + }, + { + "epoch": 0.06930030130565785, + "grad_norm": 0.139579176902771, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 207 + }, + { + "epoch": 0.06963508536993639, + "grad_norm": 0.12011922895908356, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 208 + }, + { + "epoch": 0.06996986943421493, + "grad_norm": 0.11939892917871475, + "learning_rate": 0.0001, + "loss": 1.5816, + "step": 209 + }, + { + "epoch": 0.07030465349849348, + "grad_norm": 0.12651924788951874, + "learning_rate": 0.0001, + "loss": 1.5286, + "step": 210 + }, + { + "epoch": 0.07063943756277201, + "grad_norm": 0.13420534133911133, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 211 + }, + { + "epoch": 0.07097422162705055, + "grad_norm": 0.11868797987699509, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 212 + }, + { + "epoch": 0.07130900569132909, + "grad_norm": 0.11338218301534653, + "learning_rate": 0.0001, + "loss": 1.517, + "step": 213 + }, + { + "epoch": 0.07164378975560763, + "grad_norm": 0.14230981469154358, + "learning_rate": 0.0001, + "loss": 1.6773, + "step": 214 + }, + { + "epoch": 0.07197857381988618, + "grad_norm": 0.11315491795539856, + "learning_rate": 0.0001, + "loss": 1.5564, + "step": 215 + }, + { + "epoch": 0.07231335788416471, + "grad_norm": 0.12009023874998093, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 216 + }, + { + "epoch": 0.07264814194844325, + "grad_norm": 0.1332681030035019, + "learning_rate": 0.0001, + "loss": 1.6393, + "step": 217 + }, + { + "epoch": 0.0729829260127218, + "grad_norm": 0.12581905722618103, + "learning_rate": 0.0001, + "loss": 1.7155, + "step": 218 + }, + { + "epoch": 0.07331771007700033, + "grad_norm": 0.12259216606616974, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 219 + }, + { + "epoch": 0.07365249414127888, + "grad_norm": 0.13090763986110687, + "learning_rate": 0.0001, + "loss": 1.6692, + "step": 220 + }, + { + "epoch": 0.07398727820555742, + "grad_norm": 0.11311494559049606, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 221 + }, + { + "epoch": 0.07432206226983595, + "grad_norm": 0.1307578831911087, + "learning_rate": 0.0001, + "loss": 1.5978, + "step": 222 + }, + { + "epoch": 0.0746568463341145, + "grad_norm": 0.12622885406017303, + "learning_rate": 0.0001, + "loss": 1.7782, + "step": 223 + }, + { + "epoch": 0.07499163039839303, + "grad_norm": 0.11902297288179398, + "learning_rate": 0.0001, + "loss": 1.5689, + "step": 224 + }, + { + "epoch": 0.07532641446267158, + "grad_norm": 0.11696305125951767, + "learning_rate": 0.0001, + "loss": 1.6077, + "step": 225 + }, + { + "epoch": 0.07566119852695012, + "grad_norm": 0.11666855216026306, + "learning_rate": 0.0001, + "loss": 1.5568, + "step": 226 + }, + { + "epoch": 0.07599598259122865, + "grad_norm": 0.12056950479745865, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 227 + }, + { + "epoch": 0.0763307666555072, + "grad_norm": 0.11957021802663803, + "learning_rate": 0.0001, + "loss": 1.7184, + "step": 228 + }, + { + "epoch": 0.07666555071978574, + "grad_norm": 0.11590487509965897, + "learning_rate": 0.0001, + "loss": 1.6775, + "step": 229 + }, + { + "epoch": 0.07700033478406428, + "grad_norm": 0.11034328490495682, + "learning_rate": 0.0001, + "loss": 1.5773, + "step": 230 + }, + { + "epoch": 0.07733511884834282, + "grad_norm": 0.12097325176000595, + "learning_rate": 0.0001, + "loss": 1.5552, + "step": 231 + }, + { + "epoch": 0.07766990291262135, + "grad_norm": 0.11697199940681458, + "learning_rate": 0.0001, + "loss": 1.6762, + "step": 232 + }, + { + "epoch": 0.0780046869768999, + "grad_norm": 0.11488549411296844, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 233 + }, + { + "epoch": 0.07833947104117844, + "grad_norm": 0.12868645787239075, + "learning_rate": 0.0001, + "loss": 1.6596, + "step": 234 + }, + { + "epoch": 0.07867425510545697, + "grad_norm": 0.11428504437208176, + "learning_rate": 0.0001, + "loss": 1.5926, + "step": 235 + }, + { + "epoch": 0.07900903916973552, + "grad_norm": 0.14550745487213135, + "learning_rate": 0.0001, + "loss": 1.6773, + "step": 236 + }, + { + "epoch": 0.07934382323401407, + "grad_norm": 0.11800127476453781, + "learning_rate": 0.0001, + "loss": 1.7403, + "step": 237 + }, + { + "epoch": 0.0796786072982926, + "grad_norm": 0.12732075154781342, + "learning_rate": 0.0001, + "loss": 1.6886, + "step": 238 + }, + { + "epoch": 0.08001339136257114, + "grad_norm": 0.1188284233212471, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 239 + }, + { + "epoch": 0.08034817542684969, + "grad_norm": 0.12447573244571686, + "learning_rate": 0.0001, + "loss": 1.668, + "step": 240 + }, + { + "epoch": 0.08068295949112822, + "grad_norm": 0.129620760679245, + "learning_rate": 0.0001, + "loss": 1.6134, + "step": 241 + }, + { + "epoch": 0.08101774355540677, + "grad_norm": 0.12539665400981903, + "learning_rate": 0.0001, + "loss": 1.7069, + "step": 242 + }, + { + "epoch": 0.0813525276196853, + "grad_norm": 0.13554492592811584, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 243 + }, + { + "epoch": 0.08168731168396384, + "grad_norm": 0.11758473515510559, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 244 + }, + { + "epoch": 0.08202209574824239, + "grad_norm": 0.11309672147035599, + "learning_rate": 0.0001, + "loss": 1.5836, + "step": 245 + }, + { + "epoch": 0.08235687981252092, + "grad_norm": 0.12910054624080658, + "learning_rate": 0.0001, + "loss": 1.6104, + "step": 246 + }, + { + "epoch": 0.08269166387679946, + "grad_norm": 0.12267620116472244, + "learning_rate": 0.0001, + "loss": 1.6505, + "step": 247 + }, + { + "epoch": 0.08302644794107801, + "grad_norm": 0.12700802087783813, + "learning_rate": 0.0001, + "loss": 1.6474, + "step": 248 + }, + { + "epoch": 0.08336123200535654, + "grad_norm": 0.13106848299503326, + "learning_rate": 0.0001, + "loss": 1.7076, + "step": 249 + }, + { + "epoch": 0.08369601606963509, + "grad_norm": 0.12598051130771637, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 250 + }, + { + "epoch": 0.08403080013391362, + "grad_norm": 0.1270611584186554, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 251 + }, + { + "epoch": 0.08436558419819216, + "grad_norm": 0.1215846911072731, + "learning_rate": 0.0001, + "loss": 1.7082, + "step": 252 + }, + { + "epoch": 0.08470036826247071, + "grad_norm": 0.11944068968296051, + "learning_rate": 0.0001, + "loss": 1.6046, + "step": 253 + }, + { + "epoch": 0.08503515232674924, + "grad_norm": 0.12395983189344406, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 254 + }, + { + "epoch": 0.08536993639102779, + "grad_norm": 0.11616060882806778, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 255 + }, + { + "epoch": 0.08570472045530633, + "grad_norm": 0.1274399757385254, + "learning_rate": 0.0001, + "loss": 1.6023, + "step": 256 + }, + { + "epoch": 0.08603950451958486, + "grad_norm": 0.11419884115457535, + "learning_rate": 0.0001, + "loss": 1.6053, + "step": 257 + }, + { + "epoch": 0.08637428858386341, + "grad_norm": 0.11922091245651245, + "learning_rate": 0.0001, + "loss": 1.6771, + "step": 258 + }, + { + "epoch": 0.08670907264814195, + "grad_norm": 0.12727287411689758, + "learning_rate": 0.0001, + "loss": 1.5332, + "step": 259 + }, + { + "epoch": 0.08704385671242049, + "grad_norm": 0.12368068844079971, + "learning_rate": 0.0001, + "loss": 1.6962, + "step": 260 + }, + { + "epoch": 0.08737864077669903, + "grad_norm": 0.11546538770198822, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 261 + }, + { + "epoch": 0.08771342484097756, + "grad_norm": 0.13736455142498016, + "learning_rate": 0.0001, + "loss": 1.7133, + "step": 262 + }, + { + "epoch": 0.08804820890525611, + "grad_norm": 0.12773726880550385, + "learning_rate": 0.0001, + "loss": 1.6127, + "step": 263 + }, + { + "epoch": 0.08838299296953465, + "grad_norm": 0.12833422422409058, + "learning_rate": 0.0001, + "loss": 1.5803, + "step": 264 + }, + { + "epoch": 0.08871777703381319, + "grad_norm": 0.13427826762199402, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 265 + }, + { + "epoch": 0.08905256109809173, + "grad_norm": 0.1173439621925354, + "learning_rate": 0.0001, + "loss": 1.5457, + "step": 266 + }, + { + "epoch": 0.08938734516237028, + "grad_norm": 0.12156970053911209, + "learning_rate": 0.0001, + "loss": 1.5969, + "step": 267 + }, + { + "epoch": 0.08972212922664881, + "grad_norm": 0.15133506059646606, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 268 + }, + { + "epoch": 0.09005691329092735, + "grad_norm": 0.13353589177131653, + "learning_rate": 0.0001, + "loss": 1.545, + "step": 269 + }, + { + "epoch": 0.09039169735520589, + "grad_norm": 0.12940257787704468, + "learning_rate": 0.0001, + "loss": 1.6135, + "step": 270 + }, + { + "epoch": 0.09072648141948443, + "grad_norm": 0.12897267937660217, + "learning_rate": 0.0001, + "loss": 1.6413, + "step": 271 + }, + { + "epoch": 0.09106126548376298, + "grad_norm": 0.12336087226867676, + "learning_rate": 0.0001, + "loss": 1.702, + "step": 272 + }, + { + "epoch": 0.09139604954804151, + "grad_norm": 0.11277737468481064, + "learning_rate": 0.0001, + "loss": 1.5743, + "step": 273 + }, + { + "epoch": 0.09173083361232005, + "grad_norm": 0.11659134924411774, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 274 + }, + { + "epoch": 0.0920656176765986, + "grad_norm": 0.11736118793487549, + "learning_rate": 0.0001, + "loss": 1.655, + "step": 275 + }, + { + "epoch": 0.09240040174087713, + "grad_norm": 0.12133463472127914, + "learning_rate": 0.0001, + "loss": 1.6771, + "step": 276 + }, + { + "epoch": 0.09273518580515568, + "grad_norm": 0.11516664177179337, + "learning_rate": 0.0001, + "loss": 1.5545, + "step": 277 + }, + { + "epoch": 0.09306996986943422, + "grad_norm": 0.10916180163621902, + "learning_rate": 0.0001, + "loss": 1.5301, + "step": 278 + }, + { + "epoch": 0.09340475393371275, + "grad_norm": 0.11232040077447891, + "learning_rate": 0.0001, + "loss": 1.5489, + "step": 279 + }, + { + "epoch": 0.0937395379979913, + "grad_norm": 0.12515543401241302, + "learning_rate": 0.0001, + "loss": 1.6817, + "step": 280 + }, + { + "epoch": 0.09407432206226983, + "grad_norm": 0.11998307704925537, + "learning_rate": 0.0001, + "loss": 1.563, + "step": 281 + }, + { + "epoch": 0.09440910612654838, + "grad_norm": 0.12774354219436646, + "learning_rate": 0.0001, + "loss": 1.622, + "step": 282 + }, + { + "epoch": 0.09474389019082692, + "grad_norm": 0.12023581564426422, + "learning_rate": 0.0001, + "loss": 1.5367, + "step": 283 + }, + { + "epoch": 0.09507867425510545, + "grad_norm": 0.12877605855464935, + "learning_rate": 0.0001, + "loss": 1.5806, + "step": 284 + }, + { + "epoch": 0.095413458319384, + "grad_norm": 0.11994509398937225, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 285 + }, + { + "epoch": 0.09574824238366254, + "grad_norm": 0.12522728741168976, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 286 + }, + { + "epoch": 0.09608302644794108, + "grad_norm": 0.13130401074886322, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 287 + }, + { + "epoch": 0.09641781051221962, + "grad_norm": 0.1242026537656784, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 288 + }, + { + "epoch": 0.09675259457649815, + "grad_norm": 0.12561045587062836, + "learning_rate": 0.0001, + "loss": 1.7275, + "step": 289 + }, + { + "epoch": 0.0970873786407767, + "grad_norm": 0.11756443232297897, + "learning_rate": 0.0001, + "loss": 1.5905, + "step": 290 + }, + { + "epoch": 0.09742216270505524, + "grad_norm": 0.11787443608045578, + "learning_rate": 0.0001, + "loss": 1.5809, + "step": 291 + }, + { + "epoch": 0.09775694676933377, + "grad_norm": 0.11708027869462967, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 292 + }, + { + "epoch": 0.09809173083361232, + "grad_norm": 0.12011709064245224, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 293 + }, + { + "epoch": 0.09842651489789087, + "grad_norm": 0.12868238985538483, + "learning_rate": 0.0001, + "loss": 1.7539, + "step": 294 + }, + { + "epoch": 0.0987612989621694, + "grad_norm": 0.11626073718070984, + "learning_rate": 0.0001, + "loss": 1.6877, + "step": 295 + }, + { + "epoch": 0.09909608302644794, + "grad_norm": 0.1279468834400177, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 296 + }, + { + "epoch": 0.09943086709072649, + "grad_norm": 0.12956663966178894, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 297 + }, + { + "epoch": 0.09976565115500502, + "grad_norm": 0.11931903660297394, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 298 + }, + { + "epoch": 0.10010043521928357, + "grad_norm": 0.12837816774845123, + "learning_rate": 0.0001, + "loss": 1.5923, + "step": 299 + }, + { + "epoch": 0.1004352192835621, + "grad_norm": 0.12146858870983124, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 300 + }, + { + "epoch": 0.10077000334784064, + "grad_norm": 0.11455334722995758, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 301 + }, + { + "epoch": 0.10110478741211919, + "grad_norm": 0.12035822868347168, + "learning_rate": 0.0001, + "loss": 1.576, + "step": 302 + }, + { + "epoch": 0.10143957147639772, + "grad_norm": 0.12373282760381699, + "learning_rate": 0.0001, + "loss": 1.6688, + "step": 303 + }, + { + "epoch": 0.10177435554067626, + "grad_norm": 0.13985779881477356, + "learning_rate": 0.0001, + "loss": 1.667, + "step": 304 + }, + { + "epoch": 0.10210913960495481, + "grad_norm": 0.11246056109666824, + "learning_rate": 0.0001, + "loss": 1.6014, + "step": 305 + }, + { + "epoch": 0.10244392366923334, + "grad_norm": 0.13154080510139465, + "learning_rate": 0.0001, + "loss": 1.5909, + "step": 306 + }, + { + "epoch": 0.10277870773351189, + "grad_norm": 0.13235047459602356, + "learning_rate": 0.0001, + "loss": 1.6888, + "step": 307 + }, + { + "epoch": 0.10311349179779042, + "grad_norm": 0.13294562697410583, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 308 + }, + { + "epoch": 0.10344827586206896, + "grad_norm": 0.1274106800556183, + "learning_rate": 0.0001, + "loss": 1.7178, + "step": 309 + }, + { + "epoch": 0.10378305992634751, + "grad_norm": 0.11676975339651108, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 310 + }, + { + "epoch": 0.10411784399062604, + "grad_norm": 0.1180170550942421, + "learning_rate": 0.0001, + "loss": 1.5579, + "step": 311 + }, + { + "epoch": 0.10445262805490459, + "grad_norm": 0.1267906278371811, + "learning_rate": 0.0001, + "loss": 1.5994, + "step": 312 + }, + { + "epoch": 0.10478741211918313, + "grad_norm": 0.12398704141378403, + "learning_rate": 0.0001, + "loss": 1.5459, + "step": 313 + }, + { + "epoch": 0.10512219618346166, + "grad_norm": 0.12039758265018463, + "learning_rate": 0.0001, + "loss": 1.5995, + "step": 314 + }, + { + "epoch": 0.10545698024774021, + "grad_norm": 0.12191271781921387, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 315 + }, + { + "epoch": 0.10579176431201875, + "grad_norm": 0.1351427584886551, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 316 + }, + { + "epoch": 0.10612654837629729, + "grad_norm": 0.13542529940605164, + "learning_rate": 0.0001, + "loss": 1.5455, + "step": 317 + }, + { + "epoch": 0.10646133244057583, + "grad_norm": 0.13739462196826935, + "learning_rate": 0.0001, + "loss": 1.6414, + "step": 318 + }, + { + "epoch": 0.10679611650485436, + "grad_norm": 0.11810696870088577, + "learning_rate": 0.0001, + "loss": 1.7078, + "step": 319 + }, + { + "epoch": 0.10713090056913291, + "grad_norm": 0.13632580637931824, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 320 + }, + { + "epoch": 0.10746568463341145, + "grad_norm": 0.12454043328762054, + "learning_rate": 0.0001, + "loss": 1.6654, + "step": 321 + }, + { + "epoch": 0.10780046869768999, + "grad_norm": 0.11818061023950577, + "learning_rate": 0.0001, + "loss": 1.5693, + "step": 322 + }, + { + "epoch": 0.10813525276196853, + "grad_norm": 0.12229089438915253, + "learning_rate": 0.0001, + "loss": 1.6248, + "step": 323 + }, + { + "epoch": 0.10847003682624708, + "grad_norm": 0.11546499282121658, + "learning_rate": 0.0001, + "loss": 1.5091, + "step": 324 + }, + { + "epoch": 0.10880482089052561, + "grad_norm": 0.12005545943975449, + "learning_rate": 0.0001, + "loss": 1.5801, + "step": 325 + }, + { + "epoch": 0.10913960495480415, + "grad_norm": 0.12114623188972473, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 326 + }, + { + "epoch": 0.10947438901908269, + "grad_norm": 0.11608844995498657, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 327 + }, + { + "epoch": 0.10980917308336123, + "grad_norm": 0.11119306832551956, + "learning_rate": 0.0001, + "loss": 1.5515, + "step": 328 + }, + { + "epoch": 0.11014395714763978, + "grad_norm": 0.12586964666843414, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 329 + }, + { + "epoch": 0.11047874121191831, + "grad_norm": 0.127826526761055, + "learning_rate": 0.0001, + "loss": 1.7205, + "step": 330 + }, + { + "epoch": 0.11081352527619685, + "grad_norm": 0.11828092485666275, + "learning_rate": 0.0001, + "loss": 1.6711, + "step": 331 + }, + { + "epoch": 0.1111483093404754, + "grad_norm": 0.13583530485630035, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 332 + }, + { + "epoch": 0.11148309340475393, + "grad_norm": 0.11893647909164429, + "learning_rate": 0.0001, + "loss": 1.5707, + "step": 333 + }, + { + "epoch": 0.11181787746903248, + "grad_norm": 0.13151027262210846, + "learning_rate": 0.0001, + "loss": 1.6576, + "step": 334 + }, + { + "epoch": 0.11215266153331102, + "grad_norm": 0.11656352877616882, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 335 + }, + { + "epoch": 0.11248744559758955, + "grad_norm": 0.1267959475517273, + "learning_rate": 0.0001, + "loss": 1.5069, + "step": 336 + }, + { + "epoch": 0.1128222296618681, + "grad_norm": 0.12403184920549393, + "learning_rate": 0.0001, + "loss": 1.5273, + "step": 337 + }, + { + "epoch": 0.11315701372614663, + "grad_norm": 0.12692154943943024, + "learning_rate": 0.0001, + "loss": 1.6647, + "step": 338 + }, + { + "epoch": 0.11349179779042518, + "grad_norm": 0.11919606477022171, + "learning_rate": 0.0001, + "loss": 1.6833, + "step": 339 + }, + { + "epoch": 0.11382658185470372, + "grad_norm": 0.11304503679275513, + "learning_rate": 0.0001, + "loss": 1.5757, + "step": 340 + }, + { + "epoch": 0.11416136591898225, + "grad_norm": 0.11996794492006302, + "learning_rate": 0.0001, + "loss": 1.6102, + "step": 341 + }, + { + "epoch": 0.1144961499832608, + "grad_norm": 0.12606146931648254, + "learning_rate": 0.0001, + "loss": 1.59, + "step": 342 + }, + { + "epoch": 0.11483093404753934, + "grad_norm": 0.12146681547164917, + "learning_rate": 0.0001, + "loss": 1.4989, + "step": 343 + }, + { + "epoch": 0.11516571811181787, + "grad_norm": 0.13275377452373505, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 344 + }, + { + "epoch": 0.11550050217609642, + "grad_norm": 0.12684765458106995, + "learning_rate": 0.0001, + "loss": 1.542, + "step": 345 + }, + { + "epoch": 0.11583528624037495, + "grad_norm": 0.1186991035938263, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 346 + }, + { + "epoch": 0.1161700703046535, + "grad_norm": 0.12221034616231918, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 347 + }, + { + "epoch": 0.11650485436893204, + "grad_norm": 0.11776617169380188, + "learning_rate": 0.0001, + "loss": 1.5821, + "step": 348 + }, + { + "epoch": 0.11683963843321057, + "grad_norm": 0.13464072346687317, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 349 + }, + { + "epoch": 0.11717442249748912, + "grad_norm": 0.13101482391357422, + "learning_rate": 0.0001, + "loss": 1.5194, + "step": 350 + }, + { + "epoch": 0.11750920656176767, + "grad_norm": 0.11970439553260803, + "learning_rate": 0.0001, + "loss": 1.5891, + "step": 351 + }, + { + "epoch": 0.1178439906260462, + "grad_norm": 0.11731956154108047, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 352 + }, + { + "epoch": 0.11817877469032474, + "grad_norm": 0.1163954809308052, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 353 + }, + { + "epoch": 0.11851355875460329, + "grad_norm": 0.13119016587734222, + "learning_rate": 0.0001, + "loss": 1.6667, + "step": 354 + }, + { + "epoch": 0.11884834281888182, + "grad_norm": 0.11406403034925461, + "learning_rate": 0.0001, + "loss": 1.5391, + "step": 355 + }, + { + "epoch": 0.11918312688316036, + "grad_norm": 0.12543243169784546, + "learning_rate": 0.0001, + "loss": 1.6413, + "step": 356 + }, + { + "epoch": 0.1195179109474389, + "grad_norm": 0.11639681458473206, + "learning_rate": 0.0001, + "loss": 1.5946, + "step": 357 + }, + { + "epoch": 0.11985269501171744, + "grad_norm": 0.11582693457603455, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 358 + }, + { + "epoch": 0.12018747907599599, + "grad_norm": 0.12131619453430176, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 359 + }, + { + "epoch": 0.12052226314027452, + "grad_norm": 0.1220826804637909, + "learning_rate": 0.0001, + "loss": 1.4938, + "step": 360 + }, + { + "epoch": 0.12085704720455306, + "grad_norm": 0.12737631797790527, + "learning_rate": 0.0001, + "loss": 1.5622, + "step": 361 + }, + { + "epoch": 0.12119183126883161, + "grad_norm": 0.12794937193393707, + "learning_rate": 0.0001, + "loss": 1.5852, + "step": 362 + }, + { + "epoch": 0.12152661533311014, + "grad_norm": 0.11786255985498428, + "learning_rate": 0.0001, + "loss": 1.6532, + "step": 363 + }, + { + "epoch": 0.12186139939738869, + "grad_norm": 0.12443582713603973, + "learning_rate": 0.0001, + "loss": 1.5664, + "step": 364 + }, + { + "epoch": 0.12219618346166722, + "grad_norm": 0.124130979180336, + "learning_rate": 0.0001, + "loss": 1.5809, + "step": 365 + }, + { + "epoch": 0.12253096752594576, + "grad_norm": 0.11969106644392014, + "learning_rate": 0.0001, + "loss": 1.5073, + "step": 366 + }, + { + "epoch": 0.12286575159022431, + "grad_norm": 0.12146104872226715, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 367 + }, + { + "epoch": 0.12320053565450284, + "grad_norm": 0.11919710785150528, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 368 + }, + { + "epoch": 0.12353531971878139, + "grad_norm": 0.12359990924596786, + "learning_rate": 0.0001, + "loss": 1.6564, + "step": 369 + }, + { + "epoch": 0.12387010378305993, + "grad_norm": 0.12216739356517792, + "learning_rate": 0.0001, + "loss": 1.658, + "step": 370 + }, + { + "epoch": 0.12420488784733846, + "grad_norm": 0.12388269603252411, + "learning_rate": 0.0001, + "loss": 1.6542, + "step": 371 + }, + { + "epoch": 0.12453967191161701, + "grad_norm": 0.12631452083587646, + "learning_rate": 0.0001, + "loss": 1.5741, + "step": 372 + }, + { + "epoch": 0.12487445597589555, + "grad_norm": 0.11718172580003738, + "learning_rate": 0.0001, + "loss": 1.5247, + "step": 373 + }, + { + "epoch": 0.12520924004017409, + "grad_norm": 0.11787404865026474, + "learning_rate": 0.0001, + "loss": 1.604, + "step": 374 + }, + { + "epoch": 0.12554402410445262, + "grad_norm": 0.1190713569521904, + "learning_rate": 0.0001, + "loss": 1.5771, + "step": 375 + }, + { + "epoch": 0.12587880816873118, + "grad_norm": 0.11780121177434921, + "learning_rate": 0.0001, + "loss": 1.6445, + "step": 376 + }, + { + "epoch": 0.1262135922330097, + "grad_norm": 0.11370184272527695, + "learning_rate": 0.0001, + "loss": 1.4544, + "step": 377 + }, + { + "epoch": 0.12654837629728824, + "grad_norm": 0.12931419909000397, + "learning_rate": 0.0001, + "loss": 1.5261, + "step": 378 + }, + { + "epoch": 0.1268831603615668, + "grad_norm": 0.11074584722518921, + "learning_rate": 0.0001, + "loss": 1.5329, + "step": 379 + }, + { + "epoch": 0.12721794442584533, + "grad_norm": 0.1251228302717209, + "learning_rate": 0.0001, + "loss": 1.6181, + "step": 380 + }, + { + "epoch": 0.12755272849012386, + "grad_norm": 0.11304245889186859, + "learning_rate": 0.0001, + "loss": 1.5198, + "step": 381 + }, + { + "epoch": 0.12788751255440242, + "grad_norm": 0.11219135671854019, + "learning_rate": 0.0001, + "loss": 1.494, + "step": 382 + }, + { + "epoch": 0.12822229661868095, + "grad_norm": 0.13162165880203247, + "learning_rate": 0.0001, + "loss": 1.6073, + "step": 383 + }, + { + "epoch": 0.12855708068295948, + "grad_norm": 0.11944107711315155, + "learning_rate": 0.0001, + "loss": 1.6021, + "step": 384 + }, + { + "epoch": 0.12889186474723804, + "grad_norm": 0.11878252029418945, + "learning_rate": 0.0001, + "loss": 1.6051, + "step": 385 + }, + { + "epoch": 0.12922664881151658, + "grad_norm": 0.1224270910024643, + "learning_rate": 0.0001, + "loss": 1.596, + "step": 386 + }, + { + "epoch": 0.1295614328757951, + "grad_norm": 0.12815283238887787, + "learning_rate": 0.0001, + "loss": 1.6652, + "step": 387 + }, + { + "epoch": 0.12989621694007364, + "grad_norm": 0.11265059560537338, + "learning_rate": 0.0001, + "loss": 1.5478, + "step": 388 + }, + { + "epoch": 0.1302310010043522, + "grad_norm": 0.12850640714168549, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 389 + }, + { + "epoch": 0.13056578506863073, + "grad_norm": 0.11487656831741333, + "learning_rate": 0.0001, + "loss": 1.6089, + "step": 390 + }, + { + "epoch": 0.13090056913290926, + "grad_norm": 0.1160978302359581, + "learning_rate": 0.0001, + "loss": 1.6248, + "step": 391 + }, + { + "epoch": 0.13123535319718782, + "grad_norm": 0.12001185864210129, + "learning_rate": 0.0001, + "loss": 1.5911, + "step": 392 + }, + { + "epoch": 0.13157013726146635, + "grad_norm": 0.11623065918684006, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 393 + }, + { + "epoch": 0.13190492132574488, + "grad_norm": 0.11913128942251205, + "learning_rate": 0.0001, + "loss": 1.6233, + "step": 394 + }, + { + "epoch": 0.13223970539002344, + "grad_norm": 0.11658355593681335, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 395 + }, + { + "epoch": 0.13257448945430197, + "grad_norm": 0.11986858397722244, + "learning_rate": 0.0001, + "loss": 1.7414, + "step": 396 + }, + { + "epoch": 0.1329092735185805, + "grad_norm": 0.12659533321857452, + "learning_rate": 0.0001, + "loss": 1.6037, + "step": 397 + }, + { + "epoch": 0.13324405758285907, + "grad_norm": 0.11471698433160782, + "learning_rate": 0.0001, + "loss": 1.5939, + "step": 398 + }, + { + "epoch": 0.1335788416471376, + "grad_norm": 0.12152232974767685, + "learning_rate": 0.0001, + "loss": 1.5663, + "step": 399 + }, + { + "epoch": 0.13391362571141613, + "grad_norm": 0.12228668481111526, + "learning_rate": 0.0001, + "loss": 1.6717, + "step": 400 + }, + { + "epoch": 0.1342484097756947, + "grad_norm": 0.11998744308948517, + "learning_rate": 0.0001, + "loss": 1.6532, + "step": 401 + }, + { + "epoch": 0.13458319383997322, + "grad_norm": 0.12556074559688568, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 402 + }, + { + "epoch": 0.13491797790425175, + "grad_norm": 0.12216352671384811, + "learning_rate": 0.0001, + "loss": 1.6084, + "step": 403 + }, + { + "epoch": 0.1352527619685303, + "grad_norm": 0.1290225237607956, + "learning_rate": 0.0001, + "loss": 1.6369, + "step": 404 + }, + { + "epoch": 0.13558754603280884, + "grad_norm": 0.11453018337488174, + "learning_rate": 0.0001, + "loss": 1.5886, + "step": 405 + }, + { + "epoch": 0.13592233009708737, + "grad_norm": 0.12192509323358536, + "learning_rate": 0.0001, + "loss": 1.5788, + "step": 406 + }, + { + "epoch": 0.1362571141613659, + "grad_norm": 0.11374159157276154, + "learning_rate": 0.0001, + "loss": 1.5429, + "step": 407 + }, + { + "epoch": 0.13659189822564446, + "grad_norm": 0.11875942349433899, + "learning_rate": 0.0001, + "loss": 1.6524, + "step": 408 + }, + { + "epoch": 0.136926682289923, + "grad_norm": 0.12176533043384552, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 409 + }, + { + "epoch": 0.13726146635420153, + "grad_norm": 0.12200423330068588, + "learning_rate": 0.0001, + "loss": 1.7139, + "step": 410 + }, + { + "epoch": 0.1375962504184801, + "grad_norm": 0.11800340563058853, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 411 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 0.12321179360151291, + "learning_rate": 0.0001, + "loss": 1.6849, + "step": 412 + }, + { + "epoch": 0.13826581854703715, + "grad_norm": 0.12165375053882599, + "learning_rate": 0.0001, + "loss": 1.5823, + "step": 413 + }, + { + "epoch": 0.1386006026113157, + "grad_norm": 0.12587733566761017, + "learning_rate": 0.0001, + "loss": 1.5712, + "step": 414 + }, + { + "epoch": 0.13893538667559424, + "grad_norm": 0.11877655982971191, + "learning_rate": 0.0001, + "loss": 1.606, + "step": 415 + }, + { + "epoch": 0.13927017073987277, + "grad_norm": 0.11970411241054535, + "learning_rate": 0.0001, + "loss": 1.4995, + "step": 416 + }, + { + "epoch": 0.13960495480415133, + "grad_norm": 0.14770293235778809, + "learning_rate": 0.0001, + "loss": 1.7334, + "step": 417 + }, + { + "epoch": 0.13993973886842986, + "grad_norm": 0.11904104053974152, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 418 + }, + { + "epoch": 0.1402745229327084, + "grad_norm": 0.13043157756328583, + "learning_rate": 0.0001, + "loss": 1.5564, + "step": 419 + }, + { + "epoch": 0.14060930699698695, + "grad_norm": 0.1354888528585434, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 420 + }, + { + "epoch": 0.1409440910612655, + "grad_norm": 0.11834760010242462, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 421 + }, + { + "epoch": 0.14127887512554402, + "grad_norm": 0.13029152154922485, + "learning_rate": 0.0001, + "loss": 1.5007, + "step": 422 + }, + { + "epoch": 0.14161365918982258, + "grad_norm": 0.1352154165506363, + "learning_rate": 0.0001, + "loss": 1.5925, + "step": 423 + }, + { + "epoch": 0.1419484432541011, + "grad_norm": 0.13768818974494934, + "learning_rate": 0.0001, + "loss": 1.6513, + "step": 424 + }, + { + "epoch": 0.14228322731837964, + "grad_norm": 0.1345231682062149, + "learning_rate": 0.0001, + "loss": 1.6524, + "step": 425 + }, + { + "epoch": 0.14261801138265817, + "grad_norm": 0.11808541417121887, + "learning_rate": 0.0001, + "loss": 1.6038, + "step": 426 + }, + { + "epoch": 0.14295279544693673, + "grad_norm": 0.1403636336326599, + "learning_rate": 0.0001, + "loss": 1.5559, + "step": 427 + }, + { + "epoch": 0.14328757951121526, + "grad_norm": 0.13042065501213074, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 428 + }, + { + "epoch": 0.1436223635754938, + "grad_norm": 0.12809261679649353, + "learning_rate": 0.0001, + "loss": 1.5913, + "step": 429 + }, + { + "epoch": 0.14395714763977235, + "grad_norm": 0.13735899329185486, + "learning_rate": 0.0001, + "loss": 1.5814, + "step": 430 + }, + { + "epoch": 0.14429193170405089, + "grad_norm": 0.12458304315805435, + "learning_rate": 0.0001, + "loss": 1.6909, + "step": 431 + }, + { + "epoch": 0.14462671576832942, + "grad_norm": 0.11777736246585846, + "learning_rate": 0.0001, + "loss": 1.599, + "step": 432 + }, + { + "epoch": 0.14496149983260798, + "grad_norm": 0.11958497762680054, + "learning_rate": 0.0001, + "loss": 1.6224, + "step": 433 + }, + { + "epoch": 0.1452962838968865, + "grad_norm": 0.11626480519771576, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 434 + }, + { + "epoch": 0.14563106796116504, + "grad_norm": 0.12103210389614105, + "learning_rate": 0.0001, + "loss": 1.5581, + "step": 435 + }, + { + "epoch": 0.1459658520254436, + "grad_norm": 0.1175006702542305, + "learning_rate": 0.0001, + "loss": 1.6147, + "step": 436 + }, + { + "epoch": 0.14630063608972213, + "grad_norm": 0.1194823831319809, + "learning_rate": 0.0001, + "loss": 1.4559, + "step": 437 + }, + { + "epoch": 0.14663542015400066, + "grad_norm": 0.12060422450304031, + "learning_rate": 0.0001, + "loss": 1.706, + "step": 438 + }, + { + "epoch": 0.14697020421827922, + "grad_norm": 0.12133188545703888, + "learning_rate": 0.0001, + "loss": 1.6583, + "step": 439 + }, + { + "epoch": 0.14730498828255775, + "grad_norm": 0.11069684475660324, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 440 + }, + { + "epoch": 0.14763977234683628, + "grad_norm": 0.11735668778419495, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 441 + }, + { + "epoch": 0.14797455641111484, + "grad_norm": 0.11778223514556885, + "learning_rate": 0.0001, + "loss": 1.6483, + "step": 442 + }, + { + "epoch": 0.14830934047539338, + "grad_norm": 0.11628784239292145, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 443 + }, + { + "epoch": 0.1486441245396719, + "grad_norm": 0.12314952164888382, + "learning_rate": 0.0001, + "loss": 1.6362, + "step": 444 + }, + { + "epoch": 0.14897890860395044, + "grad_norm": 0.11853016167879105, + "learning_rate": 0.0001, + "loss": 1.6355, + "step": 445 + }, + { + "epoch": 0.149313692668229, + "grad_norm": 0.1322093904018402, + "learning_rate": 0.0001, + "loss": 1.7655, + "step": 446 + }, + { + "epoch": 0.14964847673250753, + "grad_norm": 0.11611328274011612, + "learning_rate": 0.0001, + "loss": 1.5878, + "step": 447 + }, + { + "epoch": 0.14998326079678606, + "grad_norm": 0.11989305913448334, + "learning_rate": 0.0001, + "loss": 1.5576, + "step": 448 + }, + { + "epoch": 0.15031804486106462, + "grad_norm": 0.11867792159318924, + "learning_rate": 0.0001, + "loss": 1.5453, + "step": 449 + }, + { + "epoch": 0.15065282892534315, + "grad_norm": 0.11955395340919495, + "learning_rate": 0.0001, + "loss": 1.6089, + "step": 450 + }, + { + "epoch": 0.15098761298962168, + "grad_norm": 0.13159644603729248, + "learning_rate": 0.0001, + "loss": 1.6053, + "step": 451 + }, + { + "epoch": 0.15132239705390024, + "grad_norm": 0.12264451384544373, + "learning_rate": 0.0001, + "loss": 1.6277, + "step": 452 + }, + { + "epoch": 0.15165718111817877, + "grad_norm": 0.1267840564250946, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 453 + }, + { + "epoch": 0.1519919651824573, + "grad_norm": 0.1316317319869995, + "learning_rate": 0.0001, + "loss": 1.5497, + "step": 454 + }, + { + "epoch": 0.15232674924673587, + "grad_norm": 0.12278051674365997, + "learning_rate": 0.0001, + "loss": 1.665, + "step": 455 + }, + { + "epoch": 0.1526615333110144, + "grad_norm": 0.13153740763664246, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 456 + }, + { + "epoch": 0.15299631737529293, + "grad_norm": 0.12118583172559738, + "learning_rate": 0.0001, + "loss": 1.5897, + "step": 457 + }, + { + "epoch": 0.1533311014395715, + "grad_norm": 0.12203945219516754, + "learning_rate": 0.0001, + "loss": 1.5709, + "step": 458 + }, + { + "epoch": 0.15366588550385002, + "grad_norm": 0.13483074307441711, + "learning_rate": 0.0001, + "loss": 1.662, + "step": 459 + }, + { + "epoch": 0.15400066956812855, + "grad_norm": 0.12122450023889542, + "learning_rate": 0.0001, + "loss": 1.6289, + "step": 460 + }, + { + "epoch": 0.1543354536324071, + "grad_norm": 0.1384558528661728, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 461 + }, + { + "epoch": 0.15467023769668564, + "grad_norm": 0.1436455249786377, + "learning_rate": 0.0001, + "loss": 1.6007, + "step": 462 + }, + { + "epoch": 0.15500502176096417, + "grad_norm": 0.12359965592622757, + "learning_rate": 0.0001, + "loss": 1.6757, + "step": 463 + }, + { + "epoch": 0.1553398058252427, + "grad_norm": 0.13497023284435272, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 464 + }, + { + "epoch": 0.15567458988952126, + "grad_norm": 0.12588655948638916, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 465 + }, + { + "epoch": 0.1560093739537998, + "grad_norm": 0.11950384825468063, + "learning_rate": 0.0001, + "loss": 1.6388, + "step": 466 + }, + { + "epoch": 0.15634415801807833, + "grad_norm": 0.13280175626277924, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 467 + }, + { + "epoch": 0.1566789420823569, + "grad_norm": 0.11717383563518524, + "learning_rate": 0.0001, + "loss": 1.6519, + "step": 468 + }, + { + "epoch": 0.15701372614663542, + "grad_norm": 0.12387187778949738, + "learning_rate": 0.0001, + "loss": 1.5661, + "step": 469 + }, + { + "epoch": 0.15734851021091395, + "grad_norm": 0.12535057961940765, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 470 + }, + { + "epoch": 0.1576832942751925, + "grad_norm": 0.12057804316282272, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 471 + }, + { + "epoch": 0.15801807833947104, + "grad_norm": 0.1360681802034378, + "learning_rate": 0.0001, + "loss": 1.7093, + "step": 472 + }, + { + "epoch": 0.15835286240374957, + "grad_norm": 0.11986411362886429, + "learning_rate": 0.0001, + "loss": 1.5864, + "step": 473 + }, + { + "epoch": 0.15868764646802813, + "grad_norm": 0.11335694789886475, + "learning_rate": 0.0001, + "loss": 1.5495, + "step": 474 + }, + { + "epoch": 0.15902243053230666, + "grad_norm": 0.11684451997280121, + "learning_rate": 0.0001, + "loss": 1.5295, + "step": 475 + }, + { + "epoch": 0.1593572145965852, + "grad_norm": 0.12882184982299805, + "learning_rate": 0.0001, + "loss": 1.6903, + "step": 476 + }, + { + "epoch": 0.15969199866086375, + "grad_norm": 0.12175029516220093, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 477 + }, + { + "epoch": 0.16002678272514229, + "grad_norm": 0.1330244094133377, + "learning_rate": 0.0001, + "loss": 1.5691, + "step": 478 + }, + { + "epoch": 0.16036156678942082, + "grad_norm": 0.12204015254974365, + "learning_rate": 0.0001, + "loss": 1.6557, + "step": 479 + }, + { + "epoch": 0.16069635085369938, + "grad_norm": 0.1265457272529602, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 480 + }, + { + "epoch": 0.1610311349179779, + "grad_norm": 0.13419146835803986, + "learning_rate": 0.0001, + "loss": 1.5694, + "step": 481 + }, + { + "epoch": 0.16136591898225644, + "grad_norm": 0.12663477659225464, + "learning_rate": 0.0001, + "loss": 1.49, + "step": 482 + }, + { + "epoch": 0.16170070304653497, + "grad_norm": 0.1386338621377945, + "learning_rate": 0.0001, + "loss": 1.5774, + "step": 483 + }, + { + "epoch": 0.16203548711081353, + "grad_norm": 0.1266423612833023, + "learning_rate": 0.0001, + "loss": 1.568, + "step": 484 + }, + { + "epoch": 0.16237027117509206, + "grad_norm": 0.11795584112405777, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 485 + }, + { + "epoch": 0.1627050552393706, + "grad_norm": 0.13247069716453552, + "learning_rate": 0.0001, + "loss": 1.5486, + "step": 486 + }, + { + "epoch": 0.16303983930364915, + "grad_norm": 0.12367638945579529, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 487 + }, + { + "epoch": 0.16337462336792768, + "grad_norm": 0.11988285183906555, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 488 + }, + { + "epoch": 0.16370940743220622, + "grad_norm": 0.12422308325767517, + "learning_rate": 0.0001, + "loss": 1.5753, + "step": 489 + }, + { + "epoch": 0.16404419149648478, + "grad_norm": 0.12060552090406418, + "learning_rate": 0.0001, + "loss": 1.6158, + "step": 490 + }, + { + "epoch": 0.1643789755607633, + "grad_norm": 0.1219470277428627, + "learning_rate": 0.0001, + "loss": 1.5057, + "step": 491 + }, + { + "epoch": 0.16471375962504184, + "grad_norm": 0.12771841883659363, + "learning_rate": 0.0001, + "loss": 1.6627, + "step": 492 + }, + { + "epoch": 0.1650485436893204, + "grad_norm": 0.11713176220655441, + "learning_rate": 0.0001, + "loss": 1.5697, + "step": 493 + }, + { + "epoch": 0.16538332775359893, + "grad_norm": 0.1419348567724228, + "learning_rate": 0.0001, + "loss": 1.7253, + "step": 494 + }, + { + "epoch": 0.16571811181787746, + "grad_norm": 0.1297536939382553, + "learning_rate": 0.0001, + "loss": 1.666, + "step": 495 + }, + { + "epoch": 0.16605289588215602, + "grad_norm": 0.12997077405452728, + "learning_rate": 0.0001, + "loss": 1.5825, + "step": 496 + }, + { + "epoch": 0.16638767994643455, + "grad_norm": 0.14354097843170166, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 497 + }, + { + "epoch": 0.16672246401071308, + "grad_norm": 0.12498887628316879, + "learning_rate": 0.0001, + "loss": 1.7003, + "step": 498 + }, + { + "epoch": 0.16705724807499164, + "grad_norm": 0.13219912350177765, + "learning_rate": 0.0001, + "loss": 1.6218, + "step": 499 + }, + { + "epoch": 0.16739203213927017, + "grad_norm": 0.13144424557685852, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 500 + }, + { + "epoch": 0.1677268162035487, + "grad_norm": 0.12147901952266693, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 501 + }, + { + "epoch": 0.16806160026782724, + "grad_norm": 0.13109005987644196, + "learning_rate": 0.0001, + "loss": 1.7168, + "step": 502 + }, + { + "epoch": 0.1683963843321058, + "grad_norm": 0.1306311935186386, + "learning_rate": 0.0001, + "loss": 1.5859, + "step": 503 + }, + { + "epoch": 0.16873116839638433, + "grad_norm": 0.115351103246212, + "learning_rate": 0.0001, + "loss": 1.6124, + "step": 504 + }, + { + "epoch": 0.16906595246066286, + "grad_norm": 0.12713004648685455, + "learning_rate": 0.0001, + "loss": 1.5558, + "step": 505 + }, + { + "epoch": 0.16940073652494142, + "grad_norm": 0.1304563283920288, + "learning_rate": 0.0001, + "loss": 1.676, + "step": 506 + }, + { + "epoch": 0.16973552058921995, + "grad_norm": 0.12284432351589203, + "learning_rate": 0.0001, + "loss": 1.5585, + "step": 507 + }, + { + "epoch": 0.17007030465349848, + "grad_norm": 0.12343181669712067, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 508 + }, + { + "epoch": 0.17040508871777704, + "grad_norm": 0.11459839344024658, + "learning_rate": 0.0001, + "loss": 1.6051, + "step": 509 + }, + { + "epoch": 0.17073987278205557, + "grad_norm": 0.11883780360221863, + "learning_rate": 0.0001, + "loss": 1.5064, + "step": 510 + }, + { + "epoch": 0.1710746568463341, + "grad_norm": 0.12307373434305191, + "learning_rate": 0.0001, + "loss": 1.5257, + "step": 511 + }, + { + "epoch": 0.17140944091061266, + "grad_norm": 0.11666516959667206, + "learning_rate": 0.0001, + "loss": 1.5842, + "step": 512 + }, + { + "epoch": 0.1717442249748912, + "grad_norm": 0.11493846029043198, + "learning_rate": 0.0001, + "loss": 1.6215, + "step": 513 + }, + { + "epoch": 0.17207900903916973, + "grad_norm": 0.1198093444108963, + "learning_rate": 0.0001, + "loss": 1.5875, + "step": 514 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 0.11997364461421967, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 515 + }, + { + "epoch": 0.17274857716772682, + "grad_norm": 0.12003917992115021, + "learning_rate": 0.0001, + "loss": 1.7019, + "step": 516 + }, + { + "epoch": 0.17308336123200535, + "grad_norm": 0.11761089414358139, + "learning_rate": 0.0001, + "loss": 1.5742, + "step": 517 + }, + { + "epoch": 0.1734181452962839, + "grad_norm": 0.12004124373197556, + "learning_rate": 0.0001, + "loss": 1.5947, + "step": 518 + }, + { + "epoch": 0.17375292936056244, + "grad_norm": 0.12139872461557388, + "learning_rate": 0.0001, + "loss": 1.4861, + "step": 519 + }, + { + "epoch": 0.17408771342484097, + "grad_norm": 0.12214326858520508, + "learning_rate": 0.0001, + "loss": 1.6953, + "step": 520 + }, + { + "epoch": 0.1744224974891195, + "grad_norm": 0.12239626795053482, + "learning_rate": 0.0001, + "loss": 1.5529, + "step": 521 + }, + { + "epoch": 0.17475728155339806, + "grad_norm": 0.11888886988162994, + "learning_rate": 0.0001, + "loss": 1.5099, + "step": 522 + }, + { + "epoch": 0.1750920656176766, + "grad_norm": 0.11585521697998047, + "learning_rate": 0.0001, + "loss": 1.5392, + "step": 523 + }, + { + "epoch": 0.17542684968195513, + "grad_norm": 0.1300823837518692, + "learning_rate": 0.0001, + "loss": 1.6598, + "step": 524 + }, + { + "epoch": 0.1757616337462337, + "grad_norm": 0.12741157412528992, + "learning_rate": 0.0001, + "loss": 1.5798, + "step": 525 + }, + { + "epoch": 0.17609641781051222, + "grad_norm": 0.11614137142896652, + "learning_rate": 0.0001, + "loss": 1.5343, + "step": 526 + }, + { + "epoch": 0.17643120187479075, + "grad_norm": 0.12221526354551315, + "learning_rate": 0.0001, + "loss": 1.552, + "step": 527 + }, + { + "epoch": 0.1767659859390693, + "grad_norm": 0.13221661746501923, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 528 + }, + { + "epoch": 0.17710077000334784, + "grad_norm": 0.12069322913885117, + "learning_rate": 0.0001, + "loss": 1.6148, + "step": 529 + }, + { + "epoch": 0.17743555406762637, + "grad_norm": 0.11254309117794037, + "learning_rate": 0.0001, + "loss": 1.5917, + "step": 530 + }, + { + "epoch": 0.17777033813190493, + "grad_norm": 0.11715224385261536, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 531 + }, + { + "epoch": 0.17810512219618346, + "grad_norm": 0.1183256059885025, + "learning_rate": 0.0001, + "loss": 1.4889, + "step": 532 + }, + { + "epoch": 0.178439906260462, + "grad_norm": 0.12182603031396866, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 533 + }, + { + "epoch": 0.17877469032474055, + "grad_norm": 0.1232253909111023, + "learning_rate": 0.0001, + "loss": 1.6754, + "step": 534 + }, + { + "epoch": 0.17910947438901909, + "grad_norm": 0.11796277016401291, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 535 + }, + { + "epoch": 0.17944425845329762, + "grad_norm": 0.13181637227535248, + "learning_rate": 0.0001, + "loss": 1.6505, + "step": 536 + }, + { + "epoch": 0.17977904251757618, + "grad_norm": 0.11481553316116333, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 537 + }, + { + "epoch": 0.1801138265818547, + "grad_norm": 0.12842705845832825, + "learning_rate": 0.0001, + "loss": 1.734, + "step": 538 + }, + { + "epoch": 0.18044861064613324, + "grad_norm": 0.1235375851392746, + "learning_rate": 0.0001, + "loss": 1.6496, + "step": 539 + }, + { + "epoch": 0.18078339471041177, + "grad_norm": 0.12111697345972061, + "learning_rate": 0.0001, + "loss": 1.5044, + "step": 540 + }, + { + "epoch": 0.18111817877469033, + "grad_norm": 0.12484171241521835, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 541 + }, + { + "epoch": 0.18145296283896886, + "grad_norm": 0.12675760686397552, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 542 + }, + { + "epoch": 0.1817877469032474, + "grad_norm": 0.12203079462051392, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 543 + }, + { + "epoch": 0.18212253096752595, + "grad_norm": 0.12013613432645798, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 544 + }, + { + "epoch": 0.18245731503180448, + "grad_norm": 0.12438444793224335, + "learning_rate": 0.0001, + "loss": 1.5849, + "step": 545 + }, + { + "epoch": 0.18279209909608302, + "grad_norm": 0.13607415556907654, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 546 + }, + { + "epoch": 0.18312688316036158, + "grad_norm": 0.1240532174706459, + "learning_rate": 0.0001, + "loss": 1.5205, + "step": 547 + }, + { + "epoch": 0.1834616672246401, + "grad_norm": 0.1510075032711029, + "learning_rate": 0.0001, + "loss": 1.6608, + "step": 548 + }, + { + "epoch": 0.18379645128891864, + "grad_norm": 0.11965179443359375, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 549 + }, + { + "epoch": 0.1841312353531972, + "grad_norm": 0.14874660968780518, + "learning_rate": 0.0001, + "loss": 1.6156, + "step": 550 + }, + { + "epoch": 0.18446601941747573, + "grad_norm": 0.1273370385169983, + "learning_rate": 0.0001, + "loss": 1.5117, + "step": 551 + }, + { + "epoch": 0.18480080348175426, + "grad_norm": 0.1213572546839714, + "learning_rate": 0.0001, + "loss": 1.5124, + "step": 552 + }, + { + "epoch": 0.18513558754603282, + "grad_norm": 0.1602640151977539, + "learning_rate": 0.0001, + "loss": 1.6318, + "step": 553 + }, + { + "epoch": 0.18547037161031135, + "grad_norm": 0.12859167158603668, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 554 + }, + { + "epoch": 0.18580515567458988, + "grad_norm": 0.13728216290473938, + "learning_rate": 0.0001, + "loss": 1.5873, + "step": 555 + }, + { + "epoch": 0.18613993973886844, + "grad_norm": 0.12880103290081024, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 556 + }, + { + "epoch": 0.18647472380314697, + "grad_norm": 0.1293378323316574, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 557 + }, + { + "epoch": 0.1868095078674255, + "grad_norm": 0.1387391984462738, + "learning_rate": 0.0001, + "loss": 1.6486, + "step": 558 + }, + { + "epoch": 0.18714429193170404, + "grad_norm": 0.14882785081863403, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 559 + }, + { + "epoch": 0.1874790759959826, + "grad_norm": 0.11521956324577332, + "learning_rate": 0.0001, + "loss": 1.5032, + "step": 560 + }, + { + "epoch": 0.18781386006026113, + "grad_norm": 0.12418463081121445, + "learning_rate": 0.0001, + "loss": 1.5422, + "step": 561 + }, + { + "epoch": 0.18814864412453966, + "grad_norm": 0.13123475015163422, + "learning_rate": 0.0001, + "loss": 1.6459, + "step": 562 + }, + { + "epoch": 0.18848342818881822, + "grad_norm": 0.12267505377531052, + "learning_rate": 0.0001, + "loss": 1.61, + "step": 563 + }, + { + "epoch": 0.18881821225309675, + "grad_norm": 0.12172992527484894, + "learning_rate": 0.0001, + "loss": 1.551, + "step": 564 + }, + { + "epoch": 0.18915299631737528, + "grad_norm": 0.12027712911367416, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 565 + }, + { + "epoch": 0.18948778038165384, + "grad_norm": 0.11598297208547592, + "learning_rate": 0.0001, + "loss": 1.5959, + "step": 566 + }, + { + "epoch": 0.18982256444593237, + "grad_norm": 0.11541326344013214, + "learning_rate": 0.0001, + "loss": 1.5936, + "step": 567 + }, + { + "epoch": 0.1901573485102109, + "grad_norm": 0.12343809008598328, + "learning_rate": 0.0001, + "loss": 1.6091, + "step": 568 + }, + { + "epoch": 0.19049213257448946, + "grad_norm": 0.11451027542352676, + "learning_rate": 0.0001, + "loss": 1.6203, + "step": 569 + }, + { + "epoch": 0.190826916638768, + "grad_norm": 0.1260651797056198, + "learning_rate": 0.0001, + "loss": 1.6105, + "step": 570 + }, + { + "epoch": 0.19116170070304653, + "grad_norm": 0.1183401346206665, + "learning_rate": 0.0001, + "loss": 1.583, + "step": 571 + }, + { + "epoch": 0.1914964847673251, + "grad_norm": 0.11767153441905975, + "learning_rate": 0.0001, + "loss": 1.5717, + "step": 572 + }, + { + "epoch": 0.19183126883160362, + "grad_norm": 0.11693871766328812, + "learning_rate": 0.0001, + "loss": 1.5783, + "step": 573 + }, + { + "epoch": 0.19216605289588215, + "grad_norm": 0.1267687827348709, + "learning_rate": 0.0001, + "loss": 1.5803, + "step": 574 + }, + { + "epoch": 0.1925008369601607, + "grad_norm": 0.11946652829647064, + "learning_rate": 0.0001, + "loss": 1.5575, + "step": 575 + }, + { + "epoch": 0.19283562102443924, + "grad_norm": 0.12602412700653076, + "learning_rate": 0.0001, + "loss": 1.7297, + "step": 576 + }, + { + "epoch": 0.19317040508871777, + "grad_norm": 0.12529441714286804, + "learning_rate": 0.0001, + "loss": 1.6877, + "step": 577 + }, + { + "epoch": 0.1935051891529963, + "grad_norm": 0.12578092515468597, + "learning_rate": 0.0001, + "loss": 1.5397, + "step": 578 + }, + { + "epoch": 0.19383997321727486, + "grad_norm": 0.12697197496891022, + "learning_rate": 0.0001, + "loss": 1.5541, + "step": 579 + }, + { + "epoch": 0.1941747572815534, + "grad_norm": 0.12927542626857758, + "learning_rate": 0.0001, + "loss": 1.6155, + "step": 580 + }, + { + "epoch": 0.19450954134583193, + "grad_norm": 0.1361040472984314, + "learning_rate": 0.0001, + "loss": 1.5857, + "step": 581 + }, + { + "epoch": 0.19484432541011049, + "grad_norm": 0.11877462267875671, + "learning_rate": 0.0001, + "loss": 1.5558, + "step": 582 + }, + { + "epoch": 0.19517910947438902, + "grad_norm": 0.14642973244190216, + "learning_rate": 0.0001, + "loss": 1.6476, + "step": 583 + }, + { + "epoch": 0.19551389353866755, + "grad_norm": 0.13428737223148346, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 584 + }, + { + "epoch": 0.1958486776029461, + "grad_norm": 0.1275390088558197, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 585 + }, + { + "epoch": 0.19618346166722464, + "grad_norm": 0.1398482322692871, + "learning_rate": 0.0001, + "loss": 1.4985, + "step": 586 + }, + { + "epoch": 0.19651824573150317, + "grad_norm": 0.12443619966506958, + "learning_rate": 0.0001, + "loss": 1.5726, + "step": 587 + }, + { + "epoch": 0.19685302979578173, + "grad_norm": 0.12923243641853333, + "learning_rate": 0.0001, + "loss": 1.5596, + "step": 588 + }, + { + "epoch": 0.19718781386006026, + "grad_norm": 0.14045698940753937, + "learning_rate": 0.0001, + "loss": 1.5475, + "step": 589 + }, + { + "epoch": 0.1975225979243388, + "grad_norm": 0.12687772512435913, + "learning_rate": 0.0001, + "loss": 1.7041, + "step": 590 + }, + { + "epoch": 0.19785738198861735, + "grad_norm": 0.14536388218402863, + "learning_rate": 0.0001, + "loss": 1.5724, + "step": 591 + }, + { + "epoch": 0.19819216605289589, + "grad_norm": 0.1331462413072586, + "learning_rate": 0.0001, + "loss": 1.6991, + "step": 592 + }, + { + "epoch": 0.19852695011717442, + "grad_norm": 0.13363464176654816, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 593 + }, + { + "epoch": 0.19886173418145298, + "grad_norm": 0.13291539251804352, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 594 + }, + { + "epoch": 0.1991965182457315, + "grad_norm": 0.1261158436536789, + "learning_rate": 0.0001, + "loss": 1.6129, + "step": 595 + }, + { + "epoch": 0.19953130231001004, + "grad_norm": 0.12324585020542145, + "learning_rate": 0.0001, + "loss": 1.6509, + "step": 596 + }, + { + "epoch": 0.19986608637428857, + "grad_norm": 0.11849376559257507, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 597 + }, + { + "epoch": 0.20020087043856713, + "grad_norm": 0.1167241707444191, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 598 + }, + { + "epoch": 0.20053565450284566, + "grad_norm": 0.11860879510641098, + "learning_rate": 0.0001, + "loss": 1.5962, + "step": 599 + }, + { + "epoch": 0.2008704385671242, + "grad_norm": 0.12385833263397217, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 600 + }, + { + "epoch": 0.20120522263140275, + "grad_norm": 0.12093829363584518, + "learning_rate": 0.0001, + "loss": 1.6914, + "step": 601 + }, + { + "epoch": 0.20154000669568128, + "grad_norm": 0.11839880049228668, + "learning_rate": 0.0001, + "loss": 1.5645, + "step": 602 + }, + { + "epoch": 0.20187479075995982, + "grad_norm": 0.11958955973386765, + "learning_rate": 0.0001, + "loss": 1.6964, + "step": 603 + }, + { + "epoch": 0.20220957482423838, + "grad_norm": 0.12148015946149826, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 604 + }, + { + "epoch": 0.2025443588885169, + "grad_norm": 0.11879414319992065, + "learning_rate": 0.0001, + "loss": 1.5696, + "step": 605 + }, + { + "epoch": 0.20287914295279544, + "grad_norm": 0.11815709620714188, + "learning_rate": 0.0001, + "loss": 1.5771, + "step": 606 + }, + { + "epoch": 0.203213927017074, + "grad_norm": 0.12391653656959534, + "learning_rate": 0.0001, + "loss": 1.4984, + "step": 607 + }, + { + "epoch": 0.20354871108135253, + "grad_norm": 0.12949740886688232, + "learning_rate": 0.0001, + "loss": 1.6746, + "step": 608 + }, + { + "epoch": 0.20388349514563106, + "grad_norm": 0.12630179524421692, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 609 + }, + { + "epoch": 0.20421827920990962, + "grad_norm": 0.13836237788200378, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 610 + }, + { + "epoch": 0.20455306327418815, + "grad_norm": 0.12105460464954376, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 611 + }, + { + "epoch": 0.20488784733846668, + "grad_norm": 0.13807529211044312, + "learning_rate": 0.0001, + "loss": 1.5858, + "step": 612 + }, + { + "epoch": 0.20522263140274524, + "grad_norm": 0.12660756707191467, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 613 + }, + { + "epoch": 0.20555741546702377, + "grad_norm": 0.11513250321149826, + "learning_rate": 0.0001, + "loss": 1.5572, + "step": 614 + }, + { + "epoch": 0.2058921995313023, + "grad_norm": 0.12499019503593445, + "learning_rate": 0.0001, + "loss": 1.5902, + "step": 615 + }, + { + "epoch": 0.20622698359558084, + "grad_norm": 0.13060630857944489, + "learning_rate": 0.0001, + "loss": 1.6933, + "step": 616 + }, + { + "epoch": 0.2065617676598594, + "grad_norm": 0.11751000583171844, + "learning_rate": 0.0001, + "loss": 1.6165, + "step": 617 + }, + { + "epoch": 0.20689655172413793, + "grad_norm": 0.12362553179264069, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 618 + }, + { + "epoch": 0.20723133578841646, + "grad_norm": 0.11933618783950806, + "learning_rate": 0.0001, + "loss": 1.6041, + "step": 619 + }, + { + "epoch": 0.20756611985269502, + "grad_norm": 0.12560446560382843, + "learning_rate": 0.0001, + "loss": 1.689, + "step": 620 + }, + { + "epoch": 0.20790090391697355, + "grad_norm": 0.12433163821697235, + "learning_rate": 0.0001, + "loss": 1.6717, + "step": 621 + }, + { + "epoch": 0.20823568798125208, + "grad_norm": 0.12220048159360886, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 622 + }, + { + "epoch": 0.20857047204553064, + "grad_norm": 0.11404889076948166, + "learning_rate": 0.0001, + "loss": 1.5362, + "step": 623 + }, + { + "epoch": 0.20890525610980917, + "grad_norm": 0.11990871280431747, + "learning_rate": 0.0001, + "loss": 1.5971, + "step": 624 + }, + { + "epoch": 0.2092400401740877, + "grad_norm": 0.11785005033016205, + "learning_rate": 0.0001, + "loss": 1.5641, + "step": 625 + }, + { + "epoch": 0.20957482423836626, + "grad_norm": 0.12312883138656616, + "learning_rate": 0.0001, + "loss": 1.617, + "step": 626 + }, + { + "epoch": 0.2099096083026448, + "grad_norm": 0.11449938267469406, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 627 + }, + { + "epoch": 0.21024439236692333, + "grad_norm": 0.1219322681427002, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 628 + }, + { + "epoch": 0.2105791764312019, + "grad_norm": 0.12152589112520218, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 629 + }, + { + "epoch": 0.21091396049548042, + "grad_norm": 0.11546038091182709, + "learning_rate": 0.0001, + "loss": 1.5969, + "step": 630 + }, + { + "epoch": 0.21124874455975895, + "grad_norm": 0.1294824779033661, + "learning_rate": 0.0001, + "loss": 1.5983, + "step": 631 + }, + { + "epoch": 0.2115835286240375, + "grad_norm": 0.12606552243232727, + "learning_rate": 0.0001, + "loss": 1.6026, + "step": 632 + }, + { + "epoch": 0.21191831268831604, + "grad_norm": 0.12761344015598297, + "learning_rate": 0.0001, + "loss": 1.6561, + "step": 633 + }, + { + "epoch": 0.21225309675259457, + "grad_norm": 0.11588580161333084, + "learning_rate": 0.0001, + "loss": 1.5967, + "step": 634 + }, + { + "epoch": 0.2125878808168731, + "grad_norm": 0.11629272252321243, + "learning_rate": 0.0001, + "loss": 1.5894, + "step": 635 + }, + { + "epoch": 0.21292266488115166, + "grad_norm": 0.1237213984131813, + "learning_rate": 0.0001, + "loss": 1.6113, + "step": 636 + }, + { + "epoch": 0.2132574489454302, + "grad_norm": 0.12293344736099243, + "learning_rate": 0.0001, + "loss": 1.5972, + "step": 637 + }, + { + "epoch": 0.21359223300970873, + "grad_norm": 0.1172887459397316, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 638 + }, + { + "epoch": 0.21392701707398729, + "grad_norm": 0.12403010576963425, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 639 + }, + { + "epoch": 0.21426180113826582, + "grad_norm": 0.12683235108852386, + "learning_rate": 0.0001, + "loss": 1.5197, + "step": 640 + }, + { + "epoch": 0.21459658520254435, + "grad_norm": 0.11593903601169586, + "learning_rate": 0.0001, + "loss": 1.5158, + "step": 641 + }, + { + "epoch": 0.2149313692668229, + "grad_norm": 0.1251828819513321, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 642 + }, + { + "epoch": 0.21526615333110144, + "grad_norm": 0.12358346581459045, + "learning_rate": 0.0001, + "loss": 1.6012, + "step": 643 + }, + { + "epoch": 0.21560093739537997, + "grad_norm": 0.11473721265792847, + "learning_rate": 0.0001, + "loss": 1.5365, + "step": 644 + }, + { + "epoch": 0.21593572145965853, + "grad_norm": 0.1184060201048851, + "learning_rate": 0.0001, + "loss": 1.4507, + "step": 645 + }, + { + "epoch": 0.21627050552393706, + "grad_norm": 0.12540043890476227, + "learning_rate": 0.0001, + "loss": 1.5854, + "step": 646 + }, + { + "epoch": 0.2166052895882156, + "grad_norm": 0.12070447206497192, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 647 + }, + { + "epoch": 0.21694007365249415, + "grad_norm": 0.11351459473371506, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 648 + }, + { + "epoch": 0.21727485771677268, + "grad_norm": 0.1242094486951828, + "learning_rate": 0.0001, + "loss": 1.5222, + "step": 649 + }, + { + "epoch": 0.21760964178105122, + "grad_norm": 0.12054958194494247, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 650 + }, + { + "epoch": 0.21794442584532978, + "grad_norm": 0.12539923191070557, + "learning_rate": 0.0001, + "loss": 1.5001, + "step": 651 + }, + { + "epoch": 0.2182792099096083, + "grad_norm": 0.12270530313253403, + "learning_rate": 0.0001, + "loss": 1.6769, + "step": 652 + }, + { + "epoch": 0.21861399397388684, + "grad_norm": 0.12920905649662018, + "learning_rate": 0.0001, + "loss": 1.5999, + "step": 653 + }, + { + "epoch": 0.21894877803816537, + "grad_norm": 0.13267312943935394, + "learning_rate": 0.0001, + "loss": 1.5382, + "step": 654 + }, + { + "epoch": 0.21928356210244393, + "grad_norm": 0.11984428763389587, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 655 + }, + { + "epoch": 0.21961834616672246, + "grad_norm": 0.1474982053041458, + "learning_rate": 0.0001, + "loss": 1.6709, + "step": 656 + }, + { + "epoch": 0.219953130231001, + "grad_norm": 0.13542193174362183, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 657 + }, + { + "epoch": 0.22028791429527955, + "grad_norm": 0.13832658529281616, + "learning_rate": 0.0001, + "loss": 1.6118, + "step": 658 + }, + { + "epoch": 0.22062269835955808, + "grad_norm": 0.15140588581562042, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 659 + }, + { + "epoch": 0.22095748242383662, + "grad_norm": 0.12110920995473862, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 660 + }, + { + "epoch": 0.22129226648811517, + "grad_norm": 0.14811581373214722, + "learning_rate": 0.0001, + "loss": 1.6642, + "step": 661 + }, + { + "epoch": 0.2216270505523937, + "grad_norm": 0.12733857333660126, + "learning_rate": 0.0001, + "loss": 1.5512, + "step": 662 + }, + { + "epoch": 0.22196183461667224, + "grad_norm": 0.13028332591056824, + "learning_rate": 0.0001, + "loss": 1.5613, + "step": 663 + }, + { + "epoch": 0.2222966186809508, + "grad_norm": 0.1242808997631073, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 664 + }, + { + "epoch": 0.22263140274522933, + "grad_norm": 0.12380847334861755, + "learning_rate": 0.0001, + "loss": 1.5926, + "step": 665 + }, + { + "epoch": 0.22296618680950786, + "grad_norm": 0.12564754486083984, + "learning_rate": 0.0001, + "loss": 1.5811, + "step": 666 + }, + { + "epoch": 0.22330097087378642, + "grad_norm": 0.1509399712085724, + "learning_rate": 0.0001, + "loss": 1.7172, + "step": 667 + }, + { + "epoch": 0.22363575493806495, + "grad_norm": 0.12397512793540955, + "learning_rate": 0.0001, + "loss": 1.642, + "step": 668 + }, + { + "epoch": 0.22397053900234348, + "grad_norm": 0.13826021552085876, + "learning_rate": 0.0001, + "loss": 1.6395, + "step": 669 + }, + { + "epoch": 0.22430532306662204, + "grad_norm": 0.1417902112007141, + "learning_rate": 0.0001, + "loss": 1.6169, + "step": 670 + }, + { + "epoch": 0.22464010713090057, + "grad_norm": 0.12220132350921631, + "learning_rate": 0.0001, + "loss": 1.5686, + "step": 671 + }, + { + "epoch": 0.2249748911951791, + "grad_norm": 0.13563144207000732, + "learning_rate": 0.0001, + "loss": 1.6556, + "step": 672 + }, + { + "epoch": 0.22530967525945764, + "grad_norm": 0.13794521987438202, + "learning_rate": 0.0001, + "loss": 1.5187, + "step": 673 + }, + { + "epoch": 0.2256444593237362, + "grad_norm": 0.12060145288705826, + "learning_rate": 0.0001, + "loss": 1.5901, + "step": 674 + }, + { + "epoch": 0.22597924338801473, + "grad_norm": 0.13909369707107544, + "learning_rate": 0.0001, + "loss": 1.5101, + "step": 675 + }, + { + "epoch": 0.22631402745229326, + "grad_norm": 0.13746792078018188, + "learning_rate": 0.0001, + "loss": 1.6084, + "step": 676 + }, + { + "epoch": 0.22664881151657182, + "grad_norm": 0.11612525582313538, + "learning_rate": 0.0001, + "loss": 1.606, + "step": 677 + }, + { + "epoch": 0.22698359558085035, + "grad_norm": 0.13988125324249268, + "learning_rate": 0.0001, + "loss": 1.6123, + "step": 678 + }, + { + "epoch": 0.22731837964512888, + "grad_norm": 0.13023462891578674, + "learning_rate": 0.0001, + "loss": 1.6202, + "step": 679 + }, + { + "epoch": 0.22765316370940744, + "grad_norm": 0.11764882504940033, + "learning_rate": 0.0001, + "loss": 1.5744, + "step": 680 + }, + { + "epoch": 0.22798794777368597, + "grad_norm": 0.12987253069877625, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 681 + }, + { + "epoch": 0.2283227318379645, + "grad_norm": 0.12687528133392334, + "learning_rate": 0.0001, + "loss": 1.6177, + "step": 682 + }, + { + "epoch": 0.22865751590224306, + "grad_norm": 0.117088183760643, + "learning_rate": 0.0001, + "loss": 1.5704, + "step": 683 + }, + { + "epoch": 0.2289922999665216, + "grad_norm": 0.13380305469036102, + "learning_rate": 0.0001, + "loss": 1.5013, + "step": 684 + }, + { + "epoch": 0.22932708403080013, + "grad_norm": 0.13155803084373474, + "learning_rate": 0.0001, + "loss": 1.6627, + "step": 685 + }, + { + "epoch": 0.2296618680950787, + "grad_norm": 0.12210634350776672, + "learning_rate": 0.0001, + "loss": 1.491, + "step": 686 + }, + { + "epoch": 0.22999665215935722, + "grad_norm": 0.12427474558353424, + "learning_rate": 0.0001, + "loss": 1.6381, + "step": 687 + }, + { + "epoch": 0.23033143622363575, + "grad_norm": 0.12354297190904617, + "learning_rate": 0.0001, + "loss": 1.5804, + "step": 688 + }, + { + "epoch": 0.2306662202879143, + "grad_norm": 0.11402271687984467, + "learning_rate": 0.0001, + "loss": 1.5562, + "step": 689 + }, + { + "epoch": 0.23100100435219284, + "grad_norm": 0.12571346759796143, + "learning_rate": 0.0001, + "loss": 1.6974, + "step": 690 + }, + { + "epoch": 0.23133578841647137, + "grad_norm": 0.12201119214296341, + "learning_rate": 0.0001, + "loss": 1.5866, + "step": 691 + }, + { + "epoch": 0.2316705724807499, + "grad_norm": 0.13017117977142334, + "learning_rate": 0.0001, + "loss": 1.6493, + "step": 692 + }, + { + "epoch": 0.23200535654502846, + "grad_norm": 0.11595404893159866, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 693 + }, + { + "epoch": 0.232340140609307, + "grad_norm": 0.11953503638505936, + "learning_rate": 0.0001, + "loss": 1.4952, + "step": 694 + }, + { + "epoch": 0.23267492467358553, + "grad_norm": 0.11844140291213989, + "learning_rate": 0.0001, + "loss": 1.6223, + "step": 695 + }, + { + "epoch": 0.23300970873786409, + "grad_norm": 0.12358598411083221, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 696 + }, + { + "epoch": 0.23334449280214262, + "grad_norm": 0.12384648621082306, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 697 + }, + { + "epoch": 0.23367927686642115, + "grad_norm": 0.11835581809282303, + "learning_rate": 0.0001, + "loss": 1.6098, + "step": 698 + }, + { + "epoch": 0.2340140609306997, + "grad_norm": 0.1138228103518486, + "learning_rate": 0.0001, + "loss": 1.4118, + "step": 699 + }, + { + "epoch": 0.23434884499497824, + "grad_norm": 0.11459102481603622, + "learning_rate": 0.0001, + "loss": 1.5633, + "step": 700 + }, + { + "epoch": 0.23468362905925677, + "grad_norm": 0.11587528139352798, + "learning_rate": 0.0001, + "loss": 1.6973, + "step": 701 + }, + { + "epoch": 0.23501841312353533, + "grad_norm": 0.13280251622200012, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 702 + }, + { + "epoch": 0.23535319718781386, + "grad_norm": 0.12264399230480194, + "learning_rate": 0.0001, + "loss": 1.656, + "step": 703 + }, + { + "epoch": 0.2356879812520924, + "grad_norm": 0.11608457565307617, + "learning_rate": 0.0001, + "loss": 1.5099, + "step": 704 + }, + { + "epoch": 0.23602276531637095, + "grad_norm": 0.12152610719203949, + "learning_rate": 0.0001, + "loss": 1.5169, + "step": 705 + }, + { + "epoch": 0.23635754938064948, + "grad_norm": 0.12914855778217316, + "learning_rate": 0.0001, + "loss": 1.5904, + "step": 706 + }, + { + "epoch": 0.23669233344492802, + "grad_norm": 0.13277898728847504, + "learning_rate": 0.0001, + "loss": 1.656, + "step": 707 + }, + { + "epoch": 0.23702711750920658, + "grad_norm": 0.12540487945079803, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 708 + }, + { + "epoch": 0.2373619015734851, + "grad_norm": 0.11845158785581589, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 709 + }, + { + "epoch": 0.23769668563776364, + "grad_norm": 0.11418534815311432, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 710 + }, + { + "epoch": 0.23803146970204217, + "grad_norm": 0.1384686678647995, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 711 + }, + { + "epoch": 0.23836625376632073, + "grad_norm": 0.12325987964868546, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 712 + }, + { + "epoch": 0.23870103783059926, + "grad_norm": 0.11931071430444717, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 713 + }, + { + "epoch": 0.2390358218948778, + "grad_norm": 0.12119931727647781, + "learning_rate": 0.0001, + "loss": 1.5289, + "step": 714 + }, + { + "epoch": 0.23937060595915635, + "grad_norm": 0.12172186374664307, + "learning_rate": 0.0001, + "loss": 1.6467, + "step": 715 + }, + { + "epoch": 0.23970539002343488, + "grad_norm": 0.12344299256801605, + "learning_rate": 0.0001, + "loss": 1.5616, + "step": 716 + }, + { + "epoch": 0.24004017408771341, + "grad_norm": 0.12173335254192352, + "learning_rate": 0.0001, + "loss": 1.6135, + "step": 717 + }, + { + "epoch": 0.24037495815199197, + "grad_norm": 0.1223810538649559, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 718 + }, + { + "epoch": 0.2407097422162705, + "grad_norm": 0.11744136363267899, + "learning_rate": 0.0001, + "loss": 1.5704, + "step": 719 + }, + { + "epoch": 0.24104452628054904, + "grad_norm": 0.12341196089982986, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 720 + }, + { + "epoch": 0.2413793103448276, + "grad_norm": 0.12578146159648895, + "learning_rate": 0.0001, + "loss": 1.604, + "step": 721 + }, + { + "epoch": 0.24171409440910613, + "grad_norm": 0.12708286941051483, + "learning_rate": 0.0001, + "loss": 1.5583, + "step": 722 + }, + { + "epoch": 0.24204887847338466, + "grad_norm": 0.11757246404886246, + "learning_rate": 0.0001, + "loss": 1.4911, + "step": 723 + }, + { + "epoch": 0.24238366253766322, + "grad_norm": 0.1309349238872528, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 724 + }, + { + "epoch": 0.24271844660194175, + "grad_norm": 0.13289286196231842, + "learning_rate": 0.0001, + "loss": 1.6547, + "step": 725 + }, + { + "epoch": 0.24305323066622028, + "grad_norm": 0.12044942378997803, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 726 + }, + { + "epoch": 0.24338801473049884, + "grad_norm": 0.12810328602790833, + "learning_rate": 0.0001, + "loss": 1.6775, + "step": 727 + }, + { + "epoch": 0.24372279879477737, + "grad_norm": 0.12643273174762726, + "learning_rate": 0.0001, + "loss": 1.4938, + "step": 728 + }, + { + "epoch": 0.2440575828590559, + "grad_norm": 0.1253504455089569, + "learning_rate": 0.0001, + "loss": 1.6482, + "step": 729 + }, + { + "epoch": 0.24439236692333444, + "grad_norm": 0.12725912034511566, + "learning_rate": 0.0001, + "loss": 1.4911, + "step": 730 + }, + { + "epoch": 0.244727150987613, + "grad_norm": 0.13506008684635162, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 731 + }, + { + "epoch": 0.24506193505189153, + "grad_norm": 0.12034797668457031, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 732 + }, + { + "epoch": 0.24539671911617006, + "grad_norm": 0.12169791758060455, + "learning_rate": 0.0001, + "loss": 1.6398, + "step": 733 + }, + { + "epoch": 0.24573150318044862, + "grad_norm": 0.1253383755683899, + "learning_rate": 0.0001, + "loss": 1.5921, + "step": 734 + }, + { + "epoch": 0.24606628724472715, + "grad_norm": 0.11854001134634018, + "learning_rate": 0.0001, + "loss": 1.598, + "step": 735 + }, + { + "epoch": 0.24640107130900568, + "grad_norm": 0.13825742900371552, + "learning_rate": 0.0001, + "loss": 1.6588, + "step": 736 + }, + { + "epoch": 0.24673585537328424, + "grad_norm": 0.1235450729727745, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 737 + }, + { + "epoch": 0.24707063943756277, + "grad_norm": 0.12598398327827454, + "learning_rate": 0.0001, + "loss": 1.6038, + "step": 738 + }, + { + "epoch": 0.2474054235018413, + "grad_norm": 0.14527225494384766, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 739 + }, + { + "epoch": 0.24774020756611986, + "grad_norm": 0.11842803657054901, + "learning_rate": 0.0001, + "loss": 1.5628, + "step": 740 + }, + { + "epoch": 0.2480749916303984, + "grad_norm": 0.12376052141189575, + "learning_rate": 0.0001, + "loss": 1.5271, + "step": 741 + }, + { + "epoch": 0.24840977569467693, + "grad_norm": 0.13634417951107025, + "learning_rate": 0.0001, + "loss": 1.7012, + "step": 742 + }, + { + "epoch": 0.24874455975895549, + "grad_norm": 0.12457748502492905, + "learning_rate": 0.0001, + "loss": 1.5623, + "step": 743 + }, + { + "epoch": 0.24907934382323402, + "grad_norm": 0.11860496550798416, + "learning_rate": 0.0001, + "loss": 1.6049, + "step": 744 + }, + { + "epoch": 0.24941412788751255, + "grad_norm": 0.12447136640548706, + "learning_rate": 0.0001, + "loss": 1.6967, + "step": 745 + }, + { + "epoch": 0.2497489119517911, + "grad_norm": 0.12220341712236404, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 746 + }, + { + "epoch": 0.2500836960160696, + "grad_norm": 0.11865612119436264, + "learning_rate": 0.0001, + "loss": 1.5519, + "step": 747 + }, + { + "epoch": 0.25041848008034817, + "grad_norm": 0.11847954988479614, + "learning_rate": 0.0001, + "loss": 1.5087, + "step": 748 + }, + { + "epoch": 0.25075326414462673, + "grad_norm": 0.12107084691524506, + "learning_rate": 0.0001, + "loss": 1.5995, + "step": 749 + }, + { + "epoch": 0.25108804820890523, + "grad_norm": 0.12188322097063065, + "learning_rate": 0.0001, + "loss": 1.6439, + "step": 750 + }, + { + "epoch": 0.2514228322731838, + "grad_norm": 0.12144109606742859, + "learning_rate": 0.0001, + "loss": 1.5613, + "step": 751 + }, + { + "epoch": 0.25175761633746235, + "grad_norm": 0.12133816629648209, + "learning_rate": 0.0001, + "loss": 1.5364, + "step": 752 + }, + { + "epoch": 0.25209240040174086, + "grad_norm": 0.11708073318004608, + "learning_rate": 0.0001, + "loss": 1.5221, + "step": 753 + }, + { + "epoch": 0.2524271844660194, + "grad_norm": 0.1203671544790268, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 754 + }, + { + "epoch": 0.252761968530298, + "grad_norm": 0.12079092115163803, + "learning_rate": 0.0001, + "loss": 1.5842, + "step": 755 + }, + { + "epoch": 0.2530967525945765, + "grad_norm": 0.1294735223054886, + "learning_rate": 0.0001, + "loss": 1.5994, + "step": 756 + }, + { + "epoch": 0.25343153665885504, + "grad_norm": 0.1251528263092041, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 757 + }, + { + "epoch": 0.2537663207231336, + "grad_norm": 0.12093610316514969, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 758 + }, + { + "epoch": 0.2541011047874121, + "grad_norm": 0.1214980036020279, + "learning_rate": 0.0001, + "loss": 1.5887, + "step": 759 + }, + { + "epoch": 0.25443588885169066, + "grad_norm": 0.12011279165744781, + "learning_rate": 0.0001, + "loss": 1.5973, + "step": 760 + }, + { + "epoch": 0.2547706729159692, + "grad_norm": 0.12630945444107056, + "learning_rate": 0.0001, + "loss": 1.6184, + "step": 761 + }, + { + "epoch": 0.2551054569802477, + "grad_norm": 0.12001120299100876, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 762 + }, + { + "epoch": 0.2554402410445263, + "grad_norm": 0.1369365155696869, + "learning_rate": 0.0001, + "loss": 1.5718, + "step": 763 + }, + { + "epoch": 0.25577502510880484, + "grad_norm": 0.1201329231262207, + "learning_rate": 0.0001, + "loss": 1.5354, + "step": 764 + }, + { + "epoch": 0.25610980917308335, + "grad_norm": 0.12741532921791077, + "learning_rate": 0.0001, + "loss": 1.6193, + "step": 765 + }, + { + "epoch": 0.2564445932373619, + "grad_norm": 0.12349703162908554, + "learning_rate": 0.0001, + "loss": 1.6143, + "step": 766 + }, + { + "epoch": 0.25677937730164047, + "grad_norm": 0.11855439841747284, + "learning_rate": 0.0001, + "loss": 1.6037, + "step": 767 + }, + { + "epoch": 0.25711416136591897, + "grad_norm": 0.12034845352172852, + "learning_rate": 0.0001, + "loss": 1.5317, + "step": 768 + }, + { + "epoch": 0.25744894543019753, + "grad_norm": 0.11987943202257156, + "learning_rate": 0.0001, + "loss": 1.535, + "step": 769 + }, + { + "epoch": 0.2577837294944761, + "grad_norm": 0.12118515372276306, + "learning_rate": 0.0001, + "loss": 1.5974, + "step": 770 + }, + { + "epoch": 0.2581185135587546, + "grad_norm": 0.12842996418476105, + "learning_rate": 0.0001, + "loss": 1.609, + "step": 771 + }, + { + "epoch": 0.25845329762303315, + "grad_norm": 0.12420446425676346, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 772 + }, + { + "epoch": 0.2587880816873117, + "grad_norm": 0.12443120032548904, + "learning_rate": 0.0001, + "loss": 1.6122, + "step": 773 + }, + { + "epoch": 0.2591228657515902, + "grad_norm": 0.11912049353122711, + "learning_rate": 0.0001, + "loss": 1.5209, + "step": 774 + }, + { + "epoch": 0.2594576498158688, + "grad_norm": 0.1273064911365509, + "learning_rate": 0.0001, + "loss": 1.608, + "step": 775 + }, + { + "epoch": 0.2597924338801473, + "grad_norm": 0.11585114896297455, + "learning_rate": 0.0001, + "loss": 1.3888, + "step": 776 + }, + { + "epoch": 0.26012721794442584, + "grad_norm": 0.12005290389060974, + "learning_rate": 0.0001, + "loss": 1.4666, + "step": 777 + }, + { + "epoch": 0.2604620020087044, + "grad_norm": 0.11954595148563385, + "learning_rate": 0.0001, + "loss": 1.5558, + "step": 778 + }, + { + "epoch": 0.2607967860729829, + "grad_norm": 0.1307271122932434, + "learning_rate": 0.0001, + "loss": 1.6063, + "step": 779 + }, + { + "epoch": 0.26113157013726146, + "grad_norm": 0.113981693983078, + "learning_rate": 0.0001, + "loss": 1.4857, + "step": 780 + }, + { + "epoch": 0.26146635420154, + "grad_norm": 0.1225418671965599, + "learning_rate": 0.0001, + "loss": 1.5508, + "step": 781 + }, + { + "epoch": 0.2618011382658185, + "grad_norm": 0.12919741868972778, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 782 + }, + { + "epoch": 0.2621359223300971, + "grad_norm": 0.11552941054105759, + "learning_rate": 0.0001, + "loss": 1.6183, + "step": 783 + }, + { + "epoch": 0.26247070639437564, + "grad_norm": 0.13457614183425903, + "learning_rate": 0.0001, + "loss": 1.6461, + "step": 784 + }, + { + "epoch": 0.26280549045865415, + "grad_norm": 0.11841408908367157, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 785 + }, + { + "epoch": 0.2631402745229327, + "grad_norm": 0.11701938509941101, + "learning_rate": 0.0001, + "loss": 1.5883, + "step": 786 + }, + { + "epoch": 0.26347505858721126, + "grad_norm": 0.14221838116645813, + "learning_rate": 0.0001, + "loss": 1.5904, + "step": 787 + }, + { + "epoch": 0.26380984265148977, + "grad_norm": 0.11813905090093613, + "learning_rate": 0.0001, + "loss": 1.5653, + "step": 788 + }, + { + "epoch": 0.2641446267157683, + "grad_norm": 0.1315639317035675, + "learning_rate": 0.0001, + "loss": 1.5811, + "step": 789 + }, + { + "epoch": 0.2644794107800469, + "grad_norm": 0.13400433957576752, + "learning_rate": 0.0001, + "loss": 1.5363, + "step": 790 + }, + { + "epoch": 0.2648141948443254, + "grad_norm": 0.12116281688213348, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 791 + }, + { + "epoch": 0.26514897890860395, + "grad_norm": 0.1382567137479782, + "learning_rate": 0.0001, + "loss": 1.592, + "step": 792 + }, + { + "epoch": 0.2654837629728825, + "grad_norm": 0.14005912840366364, + "learning_rate": 0.0001, + "loss": 1.6114, + "step": 793 + }, + { + "epoch": 0.265818547037161, + "grad_norm": 0.13382911682128906, + "learning_rate": 0.0001, + "loss": 1.5942, + "step": 794 + }, + { + "epoch": 0.26615333110143957, + "grad_norm": 0.12423510104417801, + "learning_rate": 0.0001, + "loss": 1.5378, + "step": 795 + }, + { + "epoch": 0.26648811516571813, + "grad_norm": 0.12228628993034363, + "learning_rate": 0.0001, + "loss": 1.5704, + "step": 796 + }, + { + "epoch": 0.26682289922999664, + "grad_norm": 0.1286916881799698, + "learning_rate": 0.0001, + "loss": 1.6037, + "step": 797 + }, + { + "epoch": 0.2671576832942752, + "grad_norm": 0.12864018976688385, + "learning_rate": 0.0001, + "loss": 1.6522, + "step": 798 + }, + { + "epoch": 0.26749246735855375, + "grad_norm": 0.12012400478124619, + "learning_rate": 0.0001, + "loss": 1.5275, + "step": 799 + }, + { + "epoch": 0.26782725142283226, + "grad_norm": 0.12273643165826797, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 800 + }, + { + "epoch": 0.2681620354871108, + "grad_norm": 0.13991284370422363, + "learning_rate": 0.0001, + "loss": 1.6271, + "step": 801 + }, + { + "epoch": 0.2684968195513894, + "grad_norm": 0.1236526146531105, + "learning_rate": 0.0001, + "loss": 1.57, + "step": 802 + }, + { + "epoch": 0.2688316036156679, + "grad_norm": 0.1302153319120407, + "learning_rate": 0.0001, + "loss": 1.5638, + "step": 803 + }, + { + "epoch": 0.26916638767994644, + "grad_norm": 0.11963735520839691, + "learning_rate": 0.0001, + "loss": 1.6089, + "step": 804 + }, + { + "epoch": 0.269501171744225, + "grad_norm": 0.13298673927783966, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 805 + }, + { + "epoch": 0.2698359558085035, + "grad_norm": 0.13616934418678284, + "learning_rate": 0.0001, + "loss": 1.653, + "step": 806 + }, + { + "epoch": 0.27017073987278206, + "grad_norm": 0.12497668713331223, + "learning_rate": 0.0001, + "loss": 1.5514, + "step": 807 + }, + { + "epoch": 0.2705055239370606, + "grad_norm": 0.11764683574438095, + "learning_rate": 0.0001, + "loss": 1.5878, + "step": 808 + }, + { + "epoch": 0.2708403080013391, + "grad_norm": 0.12114263325929642, + "learning_rate": 0.0001, + "loss": 1.5628, + "step": 809 + }, + { + "epoch": 0.2711750920656177, + "grad_norm": 0.1347784847021103, + "learning_rate": 0.0001, + "loss": 1.7159, + "step": 810 + }, + { + "epoch": 0.27150987612989624, + "grad_norm": 0.12009880691766739, + "learning_rate": 0.0001, + "loss": 1.6043, + "step": 811 + }, + { + "epoch": 0.27184466019417475, + "grad_norm": 0.1278241127729416, + "learning_rate": 0.0001, + "loss": 1.6309, + "step": 812 + }, + { + "epoch": 0.2721794442584533, + "grad_norm": 0.1216406300663948, + "learning_rate": 0.0001, + "loss": 1.5867, + "step": 813 + }, + { + "epoch": 0.2725142283227318, + "grad_norm": 0.11623333394527435, + "learning_rate": 0.0001, + "loss": 1.5272, + "step": 814 + }, + { + "epoch": 0.27284901238701037, + "grad_norm": 0.11762827634811401, + "learning_rate": 0.0001, + "loss": 1.4148, + "step": 815 + }, + { + "epoch": 0.27318379645128893, + "grad_norm": 0.12679798901081085, + "learning_rate": 0.0001, + "loss": 1.678, + "step": 816 + }, + { + "epoch": 0.27351858051556743, + "grad_norm": 0.12463215738534927, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 817 + }, + { + "epoch": 0.273853364579846, + "grad_norm": 0.12248417735099792, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 818 + }, + { + "epoch": 0.27418814864412455, + "grad_norm": 0.11953899264335632, + "learning_rate": 0.0001, + "loss": 1.5704, + "step": 819 + }, + { + "epoch": 0.27452293270840306, + "grad_norm": 0.12919148802757263, + "learning_rate": 0.0001, + "loss": 1.6948, + "step": 820 + }, + { + "epoch": 0.2748577167726816, + "grad_norm": 0.11798353493213654, + "learning_rate": 0.0001, + "loss": 1.4814, + "step": 821 + }, + { + "epoch": 0.2751925008369602, + "grad_norm": 0.13017946481704712, + "learning_rate": 0.0001, + "loss": 1.5837, + "step": 822 + }, + { + "epoch": 0.2755272849012387, + "grad_norm": 0.1253434419631958, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 823 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 0.11546894907951355, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 824 + }, + { + "epoch": 0.2761968530297958, + "grad_norm": 0.12117631733417511, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 825 + }, + { + "epoch": 0.2765316370940743, + "grad_norm": 0.12088704109191895, + "learning_rate": 0.0001, + "loss": 1.571, + "step": 826 + }, + { + "epoch": 0.27686642115835286, + "grad_norm": 0.12261457741260529, + "learning_rate": 0.0001, + "loss": 1.5143, + "step": 827 + }, + { + "epoch": 0.2772012052226314, + "grad_norm": 0.12313897162675858, + "learning_rate": 0.0001, + "loss": 1.621, + "step": 828 + }, + { + "epoch": 0.2775359892869099, + "grad_norm": 0.12563903629779816, + "learning_rate": 0.0001, + "loss": 1.657, + "step": 829 + }, + { + "epoch": 0.2778707733511885, + "grad_norm": 0.1187531128525734, + "learning_rate": 0.0001, + "loss": 1.5346, + "step": 830 + }, + { + "epoch": 0.27820555741546704, + "grad_norm": 0.12233595550060272, + "learning_rate": 0.0001, + "loss": 1.5835, + "step": 831 + }, + { + "epoch": 0.27854034147974555, + "grad_norm": 0.12235147505998611, + "learning_rate": 0.0001, + "loss": 1.6104, + "step": 832 + }, + { + "epoch": 0.2788751255440241, + "grad_norm": 0.11765027791261673, + "learning_rate": 0.0001, + "loss": 1.5489, + "step": 833 + }, + { + "epoch": 0.27920990960830266, + "grad_norm": 0.12349414080381393, + "learning_rate": 0.0001, + "loss": 1.6089, + "step": 834 + }, + { + "epoch": 0.27954469367258117, + "grad_norm": 0.12419009208679199, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 835 + }, + { + "epoch": 0.2798794777368597, + "grad_norm": 0.12406160682439804, + "learning_rate": 0.0001, + "loss": 1.5774, + "step": 836 + }, + { + "epoch": 0.2802142618011383, + "grad_norm": 0.11722970008850098, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 837 + }, + { + "epoch": 0.2805490458654168, + "grad_norm": 0.1188267171382904, + "learning_rate": 0.0001, + "loss": 1.5005, + "step": 838 + }, + { + "epoch": 0.28088382992969535, + "grad_norm": 0.11977598071098328, + "learning_rate": 0.0001, + "loss": 1.5556, + "step": 839 + }, + { + "epoch": 0.2812186139939739, + "grad_norm": 0.12196852266788483, + "learning_rate": 0.0001, + "loss": 1.604, + "step": 840 + }, + { + "epoch": 0.2815533980582524, + "grad_norm": 0.12035735696554184, + "learning_rate": 0.0001, + "loss": 1.5304, + "step": 841 + }, + { + "epoch": 0.281888182122531, + "grad_norm": 0.12375766038894653, + "learning_rate": 0.0001, + "loss": 1.5929, + "step": 842 + }, + { + "epoch": 0.28222296618680953, + "grad_norm": 0.1304924190044403, + "learning_rate": 0.0001, + "loss": 1.6148, + "step": 843 + }, + { + "epoch": 0.28255775025108804, + "grad_norm": 0.12864375114440918, + "learning_rate": 0.0001, + "loss": 1.4907, + "step": 844 + }, + { + "epoch": 0.2828925343153666, + "grad_norm": 0.12013059109449387, + "learning_rate": 0.0001, + "loss": 1.5051, + "step": 845 + }, + { + "epoch": 0.28322731837964515, + "grad_norm": 0.1277569979429245, + "learning_rate": 0.0001, + "loss": 1.5942, + "step": 846 + }, + { + "epoch": 0.28356210244392366, + "grad_norm": 0.13474377989768982, + "learning_rate": 0.0001, + "loss": 1.6098, + "step": 847 + }, + { + "epoch": 0.2838968865082022, + "grad_norm": 0.12635944783687592, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 848 + }, + { + "epoch": 0.2842316705724808, + "grad_norm": 0.12218885123729706, + "learning_rate": 0.0001, + "loss": 1.578, + "step": 849 + }, + { + "epoch": 0.2845664546367593, + "grad_norm": 0.12037128210067749, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 850 + }, + { + "epoch": 0.28490123870103784, + "grad_norm": 0.12386499345302582, + "learning_rate": 0.0001, + "loss": 1.6922, + "step": 851 + }, + { + "epoch": 0.28523602276531634, + "grad_norm": 0.1298052966594696, + "learning_rate": 0.0001, + "loss": 1.6589, + "step": 852 + }, + { + "epoch": 0.2855708068295949, + "grad_norm": 0.12143804877996445, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 853 + }, + { + "epoch": 0.28590559089387346, + "grad_norm": 0.11675681918859482, + "learning_rate": 0.0001, + "loss": 1.4877, + "step": 854 + }, + { + "epoch": 0.28624037495815197, + "grad_norm": 0.11870943009853363, + "learning_rate": 0.0001, + "loss": 1.5699, + "step": 855 + }, + { + "epoch": 0.2865751590224305, + "grad_norm": 0.12752340734004974, + "learning_rate": 0.0001, + "loss": 1.5648, + "step": 856 + }, + { + "epoch": 0.2869099430867091, + "grad_norm": 0.1254730522632599, + "learning_rate": 0.0001, + "loss": 1.5331, + "step": 857 + }, + { + "epoch": 0.2872447271509876, + "grad_norm": 0.12351144105195999, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 858 + }, + { + "epoch": 0.28757951121526615, + "grad_norm": 0.12823925912380219, + "learning_rate": 0.0001, + "loss": 1.4704, + "step": 859 + }, + { + "epoch": 0.2879142952795447, + "grad_norm": 0.12884090840816498, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 860 + }, + { + "epoch": 0.2882490793438232, + "grad_norm": 0.12310319393873215, + "learning_rate": 0.0001, + "loss": 1.5554, + "step": 861 + }, + { + "epoch": 0.28858386340810177, + "grad_norm": 0.12592901289463043, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 862 + }, + { + "epoch": 0.28891864747238033, + "grad_norm": 0.12326246500015259, + "learning_rate": 0.0001, + "loss": 1.5408, + "step": 863 + }, + { + "epoch": 0.28925343153665883, + "grad_norm": 0.12688298523426056, + "learning_rate": 0.0001, + "loss": 1.609, + "step": 864 + }, + { + "epoch": 0.2895882156009374, + "grad_norm": 0.13284268975257874, + "learning_rate": 0.0001, + "loss": 1.4774, + "step": 865 + }, + { + "epoch": 0.28992299966521595, + "grad_norm": 0.12346718460321426, + "learning_rate": 0.0001, + "loss": 1.5675, + "step": 866 + }, + { + "epoch": 0.29025778372949446, + "grad_norm": 0.12501643598079681, + "learning_rate": 0.0001, + "loss": 1.54, + "step": 867 + }, + { + "epoch": 0.290592567793773, + "grad_norm": 0.14129911363124847, + "learning_rate": 0.0001, + "loss": 1.5707, + "step": 868 + }, + { + "epoch": 0.2909273518580516, + "grad_norm": 0.11998032033443451, + "learning_rate": 0.0001, + "loss": 1.6025, + "step": 869 + }, + { + "epoch": 0.2912621359223301, + "grad_norm": 0.14502458274364471, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 870 + }, + { + "epoch": 0.29159691998660864, + "grad_norm": 0.13429078459739685, + "learning_rate": 0.0001, + "loss": 1.5773, + "step": 871 + }, + { + "epoch": 0.2919317040508872, + "grad_norm": 0.12702088057994843, + "learning_rate": 0.0001, + "loss": 1.5331, + "step": 872 + }, + { + "epoch": 0.2922664881151657, + "grad_norm": 0.1450689435005188, + "learning_rate": 0.0001, + "loss": 1.6426, + "step": 873 + }, + { + "epoch": 0.29260127217944426, + "grad_norm": 0.12571430206298828, + "learning_rate": 0.0001, + "loss": 1.5702, + "step": 874 + }, + { + "epoch": 0.2929360562437228, + "grad_norm": 0.15491126477718353, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 875 + }, + { + "epoch": 0.2932708403080013, + "grad_norm": 0.1497523933649063, + "learning_rate": 0.0001, + "loss": 1.6073, + "step": 876 + }, + { + "epoch": 0.2936056243722799, + "grad_norm": 0.12279631197452545, + "learning_rate": 0.0001, + "loss": 1.5836, + "step": 877 + }, + { + "epoch": 0.29394040843655844, + "grad_norm": 0.16039283573627472, + "learning_rate": 0.0001, + "loss": 1.6125, + "step": 878 + }, + { + "epoch": 0.29427519250083695, + "grad_norm": 0.1275695562362671, + "learning_rate": 0.0001, + "loss": 1.5279, + "step": 879 + }, + { + "epoch": 0.2946099765651155, + "grad_norm": 0.12885813415050507, + "learning_rate": 0.0001, + "loss": 1.5662, + "step": 880 + }, + { + "epoch": 0.29494476062939406, + "grad_norm": 0.1439967006444931, + "learning_rate": 0.0001, + "loss": 1.6408, + "step": 881 + }, + { + "epoch": 0.29527954469367257, + "grad_norm": 0.12064056098461151, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 882 + }, + { + "epoch": 0.29561432875795113, + "grad_norm": 0.12883847951889038, + "learning_rate": 0.0001, + "loss": 1.6024, + "step": 883 + }, + { + "epoch": 0.2959491128222297, + "grad_norm": 0.12654966115951538, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 884 + }, + { + "epoch": 0.2962838968865082, + "grad_norm": 0.13914820551872253, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 885 + }, + { + "epoch": 0.29661868095078675, + "grad_norm": 0.12559537589550018, + "learning_rate": 0.0001, + "loss": 1.515, + "step": 886 + }, + { + "epoch": 0.2969534650150653, + "grad_norm": 0.1451893299818039, + "learning_rate": 0.0001, + "loss": 1.5924, + "step": 887 + }, + { + "epoch": 0.2972882490793438, + "grad_norm": 0.13416925072669983, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 888 + }, + { + "epoch": 0.2976230331436224, + "grad_norm": 0.12274248152971268, + "learning_rate": 0.0001, + "loss": 1.6539, + "step": 889 + }, + { + "epoch": 0.2979578172079009, + "grad_norm": 0.143101766705513, + "learning_rate": 0.0001, + "loss": 1.5748, + "step": 890 + }, + { + "epoch": 0.29829260127217944, + "grad_norm": 0.12564097344875336, + "learning_rate": 0.0001, + "loss": 1.5875, + "step": 891 + }, + { + "epoch": 0.298627385336458, + "grad_norm": 0.12403486669063568, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 892 + }, + { + "epoch": 0.2989621694007365, + "grad_norm": 0.13099223375320435, + "learning_rate": 0.0001, + "loss": 1.5656, + "step": 893 + }, + { + "epoch": 0.29929695346501506, + "grad_norm": 0.12135787308216095, + "learning_rate": 0.0001, + "loss": 1.4958, + "step": 894 + }, + { + "epoch": 0.2996317375292936, + "grad_norm": 0.12442804127931595, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 895 + }, + { + "epoch": 0.2999665215935721, + "grad_norm": 0.12768028676509857, + "learning_rate": 0.0001, + "loss": 1.6719, + "step": 896 + }, + { + "epoch": 0.3003013056578507, + "grad_norm": 0.1240835040807724, + "learning_rate": 0.0001, + "loss": 1.5114, + "step": 897 + }, + { + "epoch": 0.30063608972212924, + "grad_norm": 0.12057949602603912, + "learning_rate": 0.0001, + "loss": 1.5864, + "step": 898 + }, + { + "epoch": 0.30097087378640774, + "grad_norm": 0.1332257241010666, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 899 + }, + { + "epoch": 0.3013056578506863, + "grad_norm": 0.12191877514123917, + "learning_rate": 0.0001, + "loss": 1.6016, + "step": 900 + }, + { + "epoch": 0.30164044191496486, + "grad_norm": 0.13481038808822632, + "learning_rate": 0.0001, + "loss": 1.5724, + "step": 901 + }, + { + "epoch": 0.30197522597924337, + "grad_norm": 0.12434981763362885, + "learning_rate": 0.0001, + "loss": 1.5873, + "step": 902 + }, + { + "epoch": 0.3023100100435219, + "grad_norm": 0.12398968636989594, + "learning_rate": 0.0001, + "loss": 1.5917, + "step": 903 + }, + { + "epoch": 0.3026447941078005, + "grad_norm": 0.13455741107463837, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 904 + }, + { + "epoch": 0.302979578172079, + "grad_norm": 0.12864330410957336, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 905 + }, + { + "epoch": 0.30331436223635755, + "grad_norm": 0.1306915581226349, + "learning_rate": 0.0001, + "loss": 1.5669, + "step": 906 + }, + { + "epoch": 0.3036491463006361, + "grad_norm": 0.12770214676856995, + "learning_rate": 0.0001, + "loss": 1.515, + "step": 907 + }, + { + "epoch": 0.3039839303649146, + "grad_norm": 0.12244972586631775, + "learning_rate": 0.0001, + "loss": 1.7102, + "step": 908 + }, + { + "epoch": 0.30431871442919317, + "grad_norm": 0.12544330954551697, + "learning_rate": 0.0001, + "loss": 1.5809, + "step": 909 + }, + { + "epoch": 0.30465349849347173, + "grad_norm": 0.12653569877147675, + "learning_rate": 0.0001, + "loss": 1.5504, + "step": 910 + }, + { + "epoch": 0.30498828255775023, + "grad_norm": 0.1295597404241562, + "learning_rate": 0.0001, + "loss": 1.6077, + "step": 911 + }, + { + "epoch": 0.3053230666220288, + "grad_norm": 0.13423195481300354, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 912 + }, + { + "epoch": 0.30565785068630735, + "grad_norm": 0.12957747280597687, + "learning_rate": 0.0001, + "loss": 1.72, + "step": 913 + }, + { + "epoch": 0.30599263475058586, + "grad_norm": 0.1274273693561554, + "learning_rate": 0.0001, + "loss": 1.5916, + "step": 914 + }, + { + "epoch": 0.3063274188148644, + "grad_norm": 0.12693728506565094, + "learning_rate": 0.0001, + "loss": 1.5582, + "step": 915 + }, + { + "epoch": 0.306662202879143, + "grad_norm": 0.12224942445755005, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 916 + }, + { + "epoch": 0.3069969869434215, + "grad_norm": 0.12495341151952744, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 917 + }, + { + "epoch": 0.30733177100770004, + "grad_norm": 0.12348316609859467, + "learning_rate": 0.0001, + "loss": 1.5617, + "step": 918 + }, + { + "epoch": 0.3076665550719786, + "grad_norm": 0.12086449563503265, + "learning_rate": 0.0001, + "loss": 1.5866, + "step": 919 + }, + { + "epoch": 0.3080013391362571, + "grad_norm": 0.12970371544361115, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 920 + }, + { + "epoch": 0.30833612320053566, + "grad_norm": 0.115717314183712, + "learning_rate": 0.0001, + "loss": 1.4493, + "step": 921 + }, + { + "epoch": 0.3086709072648142, + "grad_norm": 0.1250089704990387, + "learning_rate": 0.0001, + "loss": 1.5889, + "step": 922 + }, + { + "epoch": 0.3090056913290927, + "grad_norm": 0.11084622144699097, + "learning_rate": 0.0001, + "loss": 1.3815, + "step": 923 + }, + { + "epoch": 0.3093404753933713, + "grad_norm": 0.12127161026000977, + "learning_rate": 0.0001, + "loss": 1.5558, + "step": 924 + }, + { + "epoch": 0.30967525945764984, + "grad_norm": 0.12244665622711182, + "learning_rate": 0.0001, + "loss": 1.6409, + "step": 925 + }, + { + "epoch": 0.31001004352192835, + "grad_norm": 0.12553781270980835, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 926 + }, + { + "epoch": 0.3103448275862069, + "grad_norm": 0.12222031503915787, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 927 + }, + { + "epoch": 0.3106796116504854, + "grad_norm": 0.1246923953294754, + "learning_rate": 0.0001, + "loss": 1.719, + "step": 928 + }, + { + "epoch": 0.31101439571476397, + "grad_norm": 0.13237862288951874, + "learning_rate": 0.0001, + "loss": 1.6517, + "step": 929 + }, + { + "epoch": 0.31134917977904253, + "grad_norm": 0.11562683433294296, + "learning_rate": 0.0001, + "loss": 1.5043, + "step": 930 + }, + { + "epoch": 0.31168396384332103, + "grad_norm": 0.12860921025276184, + "learning_rate": 0.0001, + "loss": 1.5939, + "step": 931 + }, + { + "epoch": 0.3120187479075996, + "grad_norm": 0.11789809912443161, + "learning_rate": 0.0001, + "loss": 1.4763, + "step": 932 + }, + { + "epoch": 0.31235353197187815, + "grad_norm": 0.12612248957157135, + "learning_rate": 0.0001, + "loss": 1.6355, + "step": 933 + }, + { + "epoch": 0.31268831603615665, + "grad_norm": 0.14561748504638672, + "learning_rate": 0.0001, + "loss": 1.6897, + "step": 934 + }, + { + "epoch": 0.3130231001004352, + "grad_norm": 0.1276092380285263, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 935 + }, + { + "epoch": 0.3133578841647138, + "grad_norm": 0.13539274036884308, + "learning_rate": 0.0001, + "loss": 1.5562, + "step": 936 + }, + { + "epoch": 0.3136926682289923, + "grad_norm": 0.12490363419055939, + "learning_rate": 0.0001, + "loss": 1.5592, + "step": 937 + }, + { + "epoch": 0.31402745229327084, + "grad_norm": 0.12392627447843552, + "learning_rate": 0.0001, + "loss": 1.6344, + "step": 938 + }, + { + "epoch": 0.3143622363575494, + "grad_norm": 0.13469712436199188, + "learning_rate": 0.0001, + "loss": 1.7123, + "step": 939 + }, + { + "epoch": 0.3146970204218279, + "grad_norm": 0.13380196690559387, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 940 + }, + { + "epoch": 0.31503180448610646, + "grad_norm": 0.12370868027210236, + "learning_rate": 0.0001, + "loss": 1.5663, + "step": 941 + }, + { + "epoch": 0.315366588550385, + "grad_norm": 0.1381116360425949, + "learning_rate": 0.0001, + "loss": 1.5682, + "step": 942 + }, + { + "epoch": 0.3157013726146635, + "grad_norm": 0.15112708508968353, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 943 + }, + { + "epoch": 0.3160361566789421, + "grad_norm": 0.13402314484119415, + "learning_rate": 0.0001, + "loss": 1.67, + "step": 944 + }, + { + "epoch": 0.31637094074322064, + "grad_norm": 0.13505329191684723, + "learning_rate": 0.0001, + "loss": 1.5149, + "step": 945 + }, + { + "epoch": 0.31670572480749914, + "grad_norm": 0.1328267902135849, + "learning_rate": 0.0001, + "loss": 1.5129, + "step": 946 + }, + { + "epoch": 0.3170405088717777, + "grad_norm": 0.12792791426181793, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 947 + }, + { + "epoch": 0.31737529293605626, + "grad_norm": 0.11726494878530502, + "learning_rate": 0.0001, + "loss": 1.5581, + "step": 948 + }, + { + "epoch": 0.31771007700033477, + "grad_norm": 0.12302982062101364, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 949 + }, + { + "epoch": 0.3180448610646133, + "grad_norm": 0.1206970065832138, + "learning_rate": 0.0001, + "loss": 1.5066, + "step": 950 + }, + { + "epoch": 0.3183796451288919, + "grad_norm": 0.1165679469704628, + "learning_rate": 0.0001, + "loss": 1.5486, + "step": 951 + }, + { + "epoch": 0.3187144291931704, + "grad_norm": 0.12752187252044678, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 952 + }, + { + "epoch": 0.31904921325744895, + "grad_norm": 0.12091311067342758, + "learning_rate": 0.0001, + "loss": 1.5482, + "step": 953 + }, + { + "epoch": 0.3193839973217275, + "grad_norm": 0.12838125228881836, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 954 + }, + { + "epoch": 0.319718781386006, + "grad_norm": 0.11839887499809265, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 955 + }, + { + "epoch": 0.32005356545028457, + "grad_norm": 0.1277683675289154, + "learning_rate": 0.0001, + "loss": 1.5461, + "step": 956 + }, + { + "epoch": 0.32038834951456313, + "grad_norm": 0.12134066224098206, + "learning_rate": 0.0001, + "loss": 1.5649, + "step": 957 + }, + { + "epoch": 0.32072313357884163, + "grad_norm": 0.12735500931739807, + "learning_rate": 0.0001, + "loss": 1.608, + "step": 958 + }, + { + "epoch": 0.3210579176431202, + "grad_norm": 0.133828267455101, + "learning_rate": 0.0001, + "loss": 1.5675, + "step": 959 + }, + { + "epoch": 0.32139270170739875, + "grad_norm": 0.12437241524457932, + "learning_rate": 0.0001, + "loss": 1.6325, + "step": 960 + }, + { + "epoch": 0.32172748577167726, + "grad_norm": 0.12489302456378937, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 961 + }, + { + "epoch": 0.3220622698359558, + "grad_norm": 0.12957216799259186, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 962 + }, + { + "epoch": 0.3223970539002344, + "grad_norm": 0.1317603886127472, + "learning_rate": 0.0001, + "loss": 1.6061, + "step": 963 + }, + { + "epoch": 0.3227318379645129, + "grad_norm": 0.12075690180063248, + "learning_rate": 0.0001, + "loss": 1.5508, + "step": 964 + }, + { + "epoch": 0.32306662202879144, + "grad_norm": 0.11924642324447632, + "learning_rate": 0.0001, + "loss": 1.4772, + "step": 965 + }, + { + "epoch": 0.32340140609306994, + "grad_norm": 0.12515272200107574, + "learning_rate": 0.0001, + "loss": 1.5748, + "step": 966 + }, + { + "epoch": 0.3237361901573485, + "grad_norm": 0.11952123045921326, + "learning_rate": 0.0001, + "loss": 1.5852, + "step": 967 + }, + { + "epoch": 0.32407097422162706, + "grad_norm": 0.125240296125412, + "learning_rate": 0.0001, + "loss": 1.5388, + "step": 968 + }, + { + "epoch": 0.32440575828590557, + "grad_norm": 0.12284346669912338, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 969 + }, + { + "epoch": 0.3247405423501841, + "grad_norm": 0.11825854331254959, + "learning_rate": 0.0001, + "loss": 1.52, + "step": 970 + }, + { + "epoch": 0.3250753264144627, + "grad_norm": 0.1247822567820549, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 971 + }, + { + "epoch": 0.3254101104787412, + "grad_norm": 0.12490460276603699, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 972 + }, + { + "epoch": 0.32574489454301975, + "grad_norm": 0.11784359812736511, + "learning_rate": 0.0001, + "loss": 1.451, + "step": 973 + }, + { + "epoch": 0.3260796786072983, + "grad_norm": 0.12558013200759888, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 974 + }, + { + "epoch": 0.3264144626715768, + "grad_norm": 0.12492769211530685, + "learning_rate": 0.0001, + "loss": 1.6821, + "step": 975 + }, + { + "epoch": 0.32674924673585537, + "grad_norm": 0.11894410103559494, + "learning_rate": 0.0001, + "loss": 1.5476, + "step": 976 + }, + { + "epoch": 0.32708403080013393, + "grad_norm": 0.12406729906797409, + "learning_rate": 0.0001, + "loss": 1.5954, + "step": 977 + }, + { + "epoch": 0.32741881486441243, + "grad_norm": 0.12805567681789398, + "learning_rate": 0.0001, + "loss": 1.5216, + "step": 978 + }, + { + "epoch": 0.327753598928691, + "grad_norm": 0.12648111581802368, + "learning_rate": 0.0001, + "loss": 1.6923, + "step": 979 + }, + { + "epoch": 0.32808838299296955, + "grad_norm": 0.12503187358379364, + "learning_rate": 0.0001, + "loss": 1.6204, + "step": 980 + }, + { + "epoch": 0.32842316705724806, + "grad_norm": 0.12180895358324051, + "learning_rate": 0.0001, + "loss": 1.5764, + "step": 981 + }, + { + "epoch": 0.3287579511215266, + "grad_norm": 0.12118836492300034, + "learning_rate": 0.0001, + "loss": 1.4937, + "step": 982 + }, + { + "epoch": 0.3290927351858052, + "grad_norm": 0.12758868932724, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 983 + }, + { + "epoch": 0.3294275192500837, + "grad_norm": 0.1190565824508667, + "learning_rate": 0.0001, + "loss": 1.587, + "step": 984 + }, + { + "epoch": 0.32976230331436224, + "grad_norm": 0.12521426379680634, + "learning_rate": 0.0001, + "loss": 1.5403, + "step": 985 + }, + { + "epoch": 0.3300970873786408, + "grad_norm": 0.1259697824716568, + "learning_rate": 0.0001, + "loss": 1.5356, + "step": 986 + }, + { + "epoch": 0.3304318714429193, + "grad_norm": 0.12639686465263367, + "learning_rate": 0.0001, + "loss": 1.5941, + "step": 987 + }, + { + "epoch": 0.33076665550719786, + "grad_norm": 0.12533701956272125, + "learning_rate": 0.0001, + "loss": 1.6826, + "step": 988 + }, + { + "epoch": 0.3311014395714764, + "grad_norm": 0.1349916309118271, + "learning_rate": 0.0001, + "loss": 1.6818, + "step": 989 + }, + { + "epoch": 0.3314362236357549, + "grad_norm": 0.12522515654563904, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 990 + }, + { + "epoch": 0.3317710077000335, + "grad_norm": 0.12278946489095688, + "learning_rate": 0.0001, + "loss": 1.5098, + "step": 991 + }, + { + "epoch": 0.33210579176431204, + "grad_norm": 0.1286853700876236, + "learning_rate": 0.0001, + "loss": 1.5117, + "step": 992 + }, + { + "epoch": 0.33244057582859055, + "grad_norm": 0.1212511882185936, + "learning_rate": 0.0001, + "loss": 1.4762, + "step": 993 + }, + { + "epoch": 0.3327753598928691, + "grad_norm": 0.1347900927066803, + "learning_rate": 0.0001, + "loss": 1.6793, + "step": 994 + }, + { + "epoch": 0.33311014395714766, + "grad_norm": 0.11994650959968567, + "learning_rate": 0.0001, + "loss": 1.6026, + "step": 995 + }, + { + "epoch": 0.33344492802142617, + "grad_norm": 0.13167862594127655, + "learning_rate": 0.0001, + "loss": 1.6341, + "step": 996 + }, + { + "epoch": 0.3337797120857047, + "grad_norm": 0.13315805792808533, + "learning_rate": 0.0001, + "loss": 1.5414, + "step": 997 + }, + { + "epoch": 0.3341144961499833, + "grad_norm": 0.12088074535131454, + "learning_rate": 0.0001, + "loss": 1.5769, + "step": 998 + }, + { + "epoch": 0.3344492802142618, + "grad_norm": 0.13783089816570282, + "learning_rate": 0.0001, + "loss": 1.5365, + "step": 999 + }, + { + "epoch": 0.33478406427854035, + "grad_norm": 0.13187260925769806, + "learning_rate": 0.0001, + "loss": 1.5929, + "step": 1000 + }, + { + "epoch": 0.3351188483428189, + "grad_norm": 0.13189886510372162, + "learning_rate": 0.0001, + "loss": 1.5591, + "step": 1001 + }, + { + "epoch": 0.3354536324070974, + "grad_norm": 0.1421831101179123, + "learning_rate": 0.0001, + "loss": 1.5674, + "step": 1002 + }, + { + "epoch": 0.335788416471376, + "grad_norm": 0.1282414346933365, + "learning_rate": 0.0001, + "loss": 1.5696, + "step": 1003 + }, + { + "epoch": 0.3361232005356545, + "grad_norm": 0.13641226291656494, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 1004 + }, + { + "epoch": 0.33645798459993304, + "grad_norm": 0.14396816492080688, + "learning_rate": 0.0001, + "loss": 1.5648, + "step": 1005 + }, + { + "epoch": 0.3367927686642116, + "grad_norm": 0.12792754173278809, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 1006 + }, + { + "epoch": 0.3371275527284901, + "grad_norm": 0.1327052116394043, + "learning_rate": 0.0001, + "loss": 1.5746, + "step": 1007 + }, + { + "epoch": 0.33746233679276866, + "grad_norm": 0.14353278279304504, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 1008 + }, + { + "epoch": 0.3377971208570472, + "grad_norm": 0.137548565864563, + "learning_rate": 0.0001, + "loss": 1.6771, + "step": 1009 + }, + { + "epoch": 0.3381319049213257, + "grad_norm": 0.13727347552776337, + "learning_rate": 0.0001, + "loss": 1.6451, + "step": 1010 + }, + { + "epoch": 0.3384666889856043, + "grad_norm": 0.13395574688911438, + "learning_rate": 0.0001, + "loss": 1.5378, + "step": 1011 + }, + { + "epoch": 0.33880147304988284, + "grad_norm": 0.12692630290985107, + "learning_rate": 0.0001, + "loss": 1.5555, + "step": 1012 + }, + { + "epoch": 0.33913625711416134, + "grad_norm": 0.12900549173355103, + "learning_rate": 0.0001, + "loss": 1.5451, + "step": 1013 + }, + { + "epoch": 0.3394710411784399, + "grad_norm": 0.11654023826122284, + "learning_rate": 0.0001, + "loss": 1.5063, + "step": 1014 + }, + { + "epoch": 0.33980582524271846, + "grad_norm": 0.13518574833869934, + "learning_rate": 0.0001, + "loss": 1.5578, + "step": 1015 + }, + { + "epoch": 0.34014060930699697, + "grad_norm": 0.126609668135643, + "learning_rate": 0.0001, + "loss": 1.4299, + "step": 1016 + }, + { + "epoch": 0.3404753933712755, + "grad_norm": 0.12412185966968536, + "learning_rate": 0.0001, + "loss": 1.5083, + "step": 1017 + }, + { + "epoch": 0.3408101774355541, + "grad_norm": 0.12521536648273468, + "learning_rate": 0.0001, + "loss": 1.5264, + "step": 1018 + }, + { + "epoch": 0.3411449614998326, + "grad_norm": 0.12396744638681412, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 1019 + }, + { + "epoch": 0.34147974556411115, + "grad_norm": 0.12353380024433136, + "learning_rate": 0.0001, + "loss": 1.5615, + "step": 1020 + }, + { + "epoch": 0.3418145296283897, + "grad_norm": 0.1337115615606308, + "learning_rate": 0.0001, + "loss": 1.5777, + "step": 1021 + }, + { + "epoch": 0.3421493136926682, + "grad_norm": 0.13354641199111938, + "learning_rate": 0.0001, + "loss": 1.5417, + "step": 1022 + }, + { + "epoch": 0.34248409775694677, + "grad_norm": 0.12444625794887543, + "learning_rate": 0.0001, + "loss": 1.579, + "step": 1023 + }, + { + "epoch": 0.34281888182122533, + "grad_norm": 0.12876839935779572, + "learning_rate": 0.0001, + "loss": 1.4921, + "step": 1024 + }, + { + "epoch": 0.34315366588550383, + "grad_norm": 0.13097478449344635, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 1025 + }, + { + "epoch": 0.3434884499497824, + "grad_norm": 0.1257512867450714, + "learning_rate": 0.0001, + "loss": 1.5273, + "step": 1026 + }, + { + "epoch": 0.34382323401406095, + "grad_norm": 0.13378176093101501, + "learning_rate": 0.0001, + "loss": 1.5484, + "step": 1027 + }, + { + "epoch": 0.34415801807833946, + "grad_norm": 0.1325940638780594, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 1028 + }, + { + "epoch": 0.344492802142618, + "grad_norm": 0.11962547153234482, + "learning_rate": 0.0001, + "loss": 1.4859, + "step": 1029 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 0.12927503883838654, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 1030 + }, + { + "epoch": 0.3451623702711751, + "grad_norm": 0.13427825272083282, + "learning_rate": 0.0001, + "loss": 1.5514, + "step": 1031 + }, + { + "epoch": 0.34549715433545364, + "grad_norm": 0.13139428198337555, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 1032 + }, + { + "epoch": 0.3458319383997322, + "grad_norm": 0.12266752868890762, + "learning_rate": 0.0001, + "loss": 1.5226, + "step": 1033 + }, + { + "epoch": 0.3461667224640107, + "grad_norm": 0.14490726590156555, + "learning_rate": 0.0001, + "loss": 1.5562, + "step": 1034 + }, + { + "epoch": 0.34650150652828926, + "grad_norm": 0.11922143399715424, + "learning_rate": 0.0001, + "loss": 1.465, + "step": 1035 + }, + { + "epoch": 0.3468362905925678, + "grad_norm": 0.12442134320735931, + "learning_rate": 0.0001, + "loss": 1.5653, + "step": 1036 + }, + { + "epoch": 0.3471710746568463, + "grad_norm": 0.1383199840784073, + "learning_rate": 0.0001, + "loss": 1.5509, + "step": 1037 + }, + { + "epoch": 0.3475058587211249, + "grad_norm": 0.12311188876628876, + "learning_rate": 0.0001, + "loss": 1.5429, + "step": 1038 + }, + { + "epoch": 0.34784064278540344, + "grad_norm": 0.12368562817573547, + "learning_rate": 0.0001, + "loss": 1.6099, + "step": 1039 + }, + { + "epoch": 0.34817542684968195, + "grad_norm": 0.13235348463058472, + "learning_rate": 0.0001, + "loss": 1.586, + "step": 1040 + }, + { + "epoch": 0.3485102109139605, + "grad_norm": 0.12543101608753204, + "learning_rate": 0.0001, + "loss": 1.5094, + "step": 1041 + }, + { + "epoch": 0.348844994978239, + "grad_norm": 0.12461157888174057, + "learning_rate": 0.0001, + "loss": 1.6067, + "step": 1042 + }, + { + "epoch": 0.34917977904251757, + "grad_norm": 0.12375465035438538, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 1043 + }, + { + "epoch": 0.34951456310679613, + "grad_norm": 0.13041523098945618, + "learning_rate": 0.0001, + "loss": 1.6088, + "step": 1044 + }, + { + "epoch": 0.34984934717107463, + "grad_norm": 0.12022354453802109, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 1045 + }, + { + "epoch": 0.3501841312353532, + "grad_norm": 0.1251700222492218, + "learning_rate": 0.0001, + "loss": 1.5457, + "step": 1046 + }, + { + "epoch": 0.35051891529963175, + "grad_norm": 0.12562930583953857, + "learning_rate": 0.0001, + "loss": 1.501, + "step": 1047 + }, + { + "epoch": 0.35085369936391025, + "grad_norm": 0.13178695738315582, + "learning_rate": 0.0001, + "loss": 1.6332, + "step": 1048 + }, + { + "epoch": 0.3511884834281888, + "grad_norm": 0.12346772104501724, + "learning_rate": 0.0001, + "loss": 1.5875, + "step": 1049 + }, + { + "epoch": 0.3515232674924674, + "grad_norm": 0.12000037729740143, + "learning_rate": 0.0001, + "loss": 1.5166, + "step": 1050 + }, + { + "epoch": 0.3518580515567459, + "grad_norm": 0.13240620493888855, + "learning_rate": 0.0001, + "loss": 1.5801, + "step": 1051 + }, + { + "epoch": 0.35219283562102444, + "grad_norm": 0.12688381969928741, + "learning_rate": 0.0001, + "loss": 1.5581, + "step": 1052 + }, + { + "epoch": 0.352527619685303, + "grad_norm": 0.12421749532222748, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 1053 + }, + { + "epoch": 0.3528624037495815, + "grad_norm": 0.12876258790493011, + "learning_rate": 0.0001, + "loss": 1.4921, + "step": 1054 + }, + { + "epoch": 0.35319718781386006, + "grad_norm": 0.13299116492271423, + "learning_rate": 0.0001, + "loss": 1.5828, + "step": 1055 + }, + { + "epoch": 0.3535319718781386, + "grad_norm": 0.12605415284633636, + "learning_rate": 0.0001, + "loss": 1.5963, + "step": 1056 + }, + { + "epoch": 0.3538667559424171, + "grad_norm": 0.13100145757198334, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 1057 + }, + { + "epoch": 0.3542015400066957, + "grad_norm": 0.12380324304103851, + "learning_rate": 0.0001, + "loss": 1.5784, + "step": 1058 + }, + { + "epoch": 0.35453632407097424, + "grad_norm": 0.1288285106420517, + "learning_rate": 0.0001, + "loss": 1.5454, + "step": 1059 + }, + { + "epoch": 0.35487110813525274, + "grad_norm": 0.12464431673288345, + "learning_rate": 0.0001, + "loss": 1.5622, + "step": 1060 + }, + { + "epoch": 0.3552058921995313, + "grad_norm": 0.12694504857063293, + "learning_rate": 0.0001, + "loss": 1.5361, + "step": 1061 + }, + { + "epoch": 0.35554067626380986, + "grad_norm": 0.12736117839813232, + "learning_rate": 0.0001, + "loss": 1.5931, + "step": 1062 + }, + { + "epoch": 0.35587546032808837, + "grad_norm": 0.12816745042800903, + "learning_rate": 0.0001, + "loss": 1.584, + "step": 1063 + }, + { + "epoch": 0.3562102443923669, + "grad_norm": 0.12096529453992844, + "learning_rate": 0.0001, + "loss": 1.4851, + "step": 1064 + }, + { + "epoch": 0.3565450284566455, + "grad_norm": 0.12956807017326355, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 1065 + }, + { + "epoch": 0.356879812520924, + "grad_norm": 0.12413816154003143, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 1066 + }, + { + "epoch": 0.35721459658520255, + "grad_norm": 0.13675865530967712, + "learning_rate": 0.0001, + "loss": 1.498, + "step": 1067 + }, + { + "epoch": 0.3575493806494811, + "grad_norm": 0.12694036960601807, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 1068 + }, + { + "epoch": 0.3578841647137596, + "grad_norm": 0.13280896842479706, + "learning_rate": 0.0001, + "loss": 1.4662, + "step": 1069 + }, + { + "epoch": 0.35821894877803817, + "grad_norm": 0.13775292038917542, + "learning_rate": 0.0001, + "loss": 1.5833, + "step": 1070 + }, + { + "epoch": 0.35855373284231673, + "grad_norm": 0.12691499292850494, + "learning_rate": 0.0001, + "loss": 1.6034, + "step": 1071 + }, + { + "epoch": 0.35888851690659523, + "grad_norm": 0.13247890770435333, + "learning_rate": 0.0001, + "loss": 1.5617, + "step": 1072 + }, + { + "epoch": 0.3592233009708738, + "grad_norm": 0.1524164378643036, + "learning_rate": 0.0001, + "loss": 1.7153, + "step": 1073 + }, + { + "epoch": 0.35955808503515235, + "grad_norm": 0.12795189023017883, + "learning_rate": 0.0001, + "loss": 1.5657, + "step": 1074 + }, + { + "epoch": 0.35989286909943086, + "grad_norm": 0.12827672064304352, + "learning_rate": 0.0001, + "loss": 1.4345, + "step": 1075 + }, + { + "epoch": 0.3602276531637094, + "grad_norm": 0.13488048315048218, + "learning_rate": 0.0001, + "loss": 1.5137, + "step": 1076 + }, + { + "epoch": 0.360562437227988, + "grad_norm": 0.11891927570104599, + "learning_rate": 0.0001, + "loss": 1.508, + "step": 1077 + }, + { + "epoch": 0.3608972212922665, + "grad_norm": 0.1263907551765442, + "learning_rate": 0.0001, + "loss": 1.5969, + "step": 1078 + }, + { + "epoch": 0.36123200535654504, + "grad_norm": 0.12749949097633362, + "learning_rate": 0.0001, + "loss": 1.5646, + "step": 1079 + }, + { + "epoch": 0.36156678942082354, + "grad_norm": 0.12221404910087585, + "learning_rate": 0.0001, + "loss": 1.5279, + "step": 1080 + }, + { + "epoch": 0.3619015734851021, + "grad_norm": 0.12473400682210922, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 1081 + }, + { + "epoch": 0.36223635754938066, + "grad_norm": 0.13297304511070251, + "learning_rate": 0.0001, + "loss": 1.5636, + "step": 1082 + }, + { + "epoch": 0.36257114161365916, + "grad_norm": 0.1260288655757904, + "learning_rate": 0.0001, + "loss": 1.5429, + "step": 1083 + }, + { + "epoch": 0.3629059256779377, + "grad_norm": 0.12271251529455185, + "learning_rate": 0.0001, + "loss": 1.6139, + "step": 1084 + }, + { + "epoch": 0.3632407097422163, + "grad_norm": 0.13517338037490845, + "learning_rate": 0.0001, + "loss": 1.59, + "step": 1085 + }, + { + "epoch": 0.3635754938064948, + "grad_norm": 0.12335921078920364, + "learning_rate": 0.0001, + "loss": 1.5477, + "step": 1086 + }, + { + "epoch": 0.36391027787077335, + "grad_norm": 0.12416140735149384, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 1087 + }, + { + "epoch": 0.3642450619350519, + "grad_norm": 0.1330622136592865, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 1088 + }, + { + "epoch": 0.3645798459993304, + "grad_norm": 0.11882945895195007, + "learning_rate": 0.0001, + "loss": 1.5633, + "step": 1089 + }, + { + "epoch": 0.36491463006360897, + "grad_norm": 0.12056804448366165, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 1090 + }, + { + "epoch": 0.36524941412788753, + "grad_norm": 0.12773139774799347, + "learning_rate": 0.0001, + "loss": 1.5221, + "step": 1091 + }, + { + "epoch": 0.36558419819216603, + "grad_norm": 0.12159121781587601, + "learning_rate": 0.0001, + "loss": 1.5255, + "step": 1092 + }, + { + "epoch": 0.3659189822564446, + "grad_norm": 0.12454614788293839, + "learning_rate": 0.0001, + "loss": 1.5685, + "step": 1093 + }, + { + "epoch": 0.36625376632072315, + "grad_norm": 0.1252131462097168, + "learning_rate": 0.0001, + "loss": 1.5721, + "step": 1094 + }, + { + "epoch": 0.36658855038500165, + "grad_norm": 0.12228623777627945, + "learning_rate": 0.0001, + "loss": 1.5488, + "step": 1095 + }, + { + "epoch": 0.3669233344492802, + "grad_norm": 0.1220550686120987, + "learning_rate": 0.0001, + "loss": 1.524, + "step": 1096 + }, + { + "epoch": 0.3672581185135588, + "grad_norm": 0.12096890807151794, + "learning_rate": 0.0001, + "loss": 1.4846, + "step": 1097 + }, + { + "epoch": 0.3675929025778373, + "grad_norm": 0.12377587705850601, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 1098 + }, + { + "epoch": 0.36792768664211584, + "grad_norm": 0.12515562772750854, + "learning_rate": 0.0001, + "loss": 1.6078, + "step": 1099 + }, + { + "epoch": 0.3682624707063944, + "grad_norm": 0.12402921915054321, + "learning_rate": 0.0001, + "loss": 1.532, + "step": 1100 + }, + { + "epoch": 0.3685972547706729, + "grad_norm": 0.12373632192611694, + "learning_rate": 0.0001, + "loss": 1.512, + "step": 1101 + }, + { + "epoch": 0.36893203883495146, + "grad_norm": 0.12751725316047668, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 1102 + }, + { + "epoch": 0.36926682289923, + "grad_norm": 0.12221360951662064, + "learning_rate": 0.0001, + "loss": 1.4454, + "step": 1103 + }, + { + "epoch": 0.3696016069635085, + "grad_norm": 0.12299706041812897, + "learning_rate": 0.0001, + "loss": 1.5994, + "step": 1104 + }, + { + "epoch": 0.3699363910277871, + "grad_norm": 0.1294013112783432, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 1105 + }, + { + "epoch": 0.37027117509206564, + "grad_norm": 0.1240616887807846, + "learning_rate": 0.0001, + "loss": 1.5548, + "step": 1106 + }, + { + "epoch": 0.37060595915634414, + "grad_norm": 0.12403808534145355, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 1107 + }, + { + "epoch": 0.3709407432206227, + "grad_norm": 0.11872854828834534, + "learning_rate": 0.0001, + "loss": 1.4156, + "step": 1108 + }, + { + "epoch": 0.37127552728490126, + "grad_norm": 0.12752331793308258, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 1109 + }, + { + "epoch": 0.37161031134917977, + "grad_norm": 0.12329373508691788, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 1110 + }, + { + "epoch": 0.3719450954134583, + "grad_norm": 0.12340104579925537, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 1111 + }, + { + "epoch": 0.3722798794777369, + "grad_norm": 0.11669819802045822, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 1112 + }, + { + "epoch": 0.3726146635420154, + "grad_norm": 0.11677731573581696, + "learning_rate": 0.0001, + "loss": 1.5151, + "step": 1113 + }, + { + "epoch": 0.37294944760629395, + "grad_norm": 0.12206505239009857, + "learning_rate": 0.0001, + "loss": 1.6733, + "step": 1114 + }, + { + "epoch": 0.3732842316705725, + "grad_norm": 0.12234992533922195, + "learning_rate": 0.0001, + "loss": 1.5242, + "step": 1115 + }, + { + "epoch": 0.373619015734851, + "grad_norm": 0.12357670813798904, + "learning_rate": 0.0001, + "loss": 1.5432, + "step": 1116 + }, + { + "epoch": 0.37395379979912957, + "grad_norm": 0.12345674633979797, + "learning_rate": 0.0001, + "loss": 1.6483, + "step": 1117 + }, + { + "epoch": 0.3742885838634081, + "grad_norm": 0.1179901510477066, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 1118 + }, + { + "epoch": 0.37462336792768663, + "grad_norm": 0.12135247141122818, + "learning_rate": 0.0001, + "loss": 1.554, + "step": 1119 + }, + { + "epoch": 0.3749581519919652, + "grad_norm": 0.12836892902851105, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 1120 + }, + { + "epoch": 0.3752929360562437, + "grad_norm": 0.12851716578006744, + "learning_rate": 0.0001, + "loss": 1.6372, + "step": 1121 + }, + { + "epoch": 0.37562772012052226, + "grad_norm": 0.12096036225557327, + "learning_rate": 0.0001, + "loss": 1.5042, + "step": 1122 + }, + { + "epoch": 0.3759625041848008, + "grad_norm": 0.121758371591568, + "learning_rate": 0.0001, + "loss": 1.5561, + "step": 1123 + }, + { + "epoch": 0.3762972882490793, + "grad_norm": 0.12547370791435242, + "learning_rate": 0.0001, + "loss": 1.571, + "step": 1124 + }, + { + "epoch": 0.3766320723133579, + "grad_norm": 0.12488459795713425, + "learning_rate": 0.0001, + "loss": 1.6101, + "step": 1125 + }, + { + "epoch": 0.37696685637763644, + "grad_norm": 0.12440396845340729, + "learning_rate": 0.0001, + "loss": 1.4978, + "step": 1126 + }, + { + "epoch": 0.37730164044191494, + "grad_norm": 0.1293293535709381, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 1127 + }, + { + "epoch": 0.3776364245061935, + "grad_norm": 0.1270667314529419, + "learning_rate": 0.0001, + "loss": 1.5403, + "step": 1128 + }, + { + "epoch": 0.37797120857047206, + "grad_norm": 0.13023768365383148, + "learning_rate": 0.0001, + "loss": 1.6641, + "step": 1129 + }, + { + "epoch": 0.37830599263475057, + "grad_norm": 0.12713496387004852, + "learning_rate": 0.0001, + "loss": 1.5685, + "step": 1130 + }, + { + "epoch": 0.3786407766990291, + "grad_norm": 0.126458540558815, + "learning_rate": 0.0001, + "loss": 1.5624, + "step": 1131 + }, + { + "epoch": 0.3789755607633077, + "grad_norm": 0.12100820988416672, + "learning_rate": 0.0001, + "loss": 1.5158, + "step": 1132 + }, + { + "epoch": 0.3793103448275862, + "grad_norm": 0.13373976945877075, + "learning_rate": 0.0001, + "loss": 1.5151, + "step": 1133 + }, + { + "epoch": 0.37964512889186475, + "grad_norm": 0.12730540335178375, + "learning_rate": 0.0001, + "loss": 1.5701, + "step": 1134 + }, + { + "epoch": 0.3799799129561433, + "grad_norm": 0.13641048967838287, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 1135 + }, + { + "epoch": 0.3803146970204218, + "grad_norm": 0.13271461427211761, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 1136 + }, + { + "epoch": 0.38064948108470037, + "grad_norm": 0.12385160475969315, + "learning_rate": 0.0001, + "loss": 1.5374, + "step": 1137 + }, + { + "epoch": 0.38098426514897893, + "grad_norm": 0.12949350476264954, + "learning_rate": 0.0001, + "loss": 1.546, + "step": 1138 + }, + { + "epoch": 0.38131904921325743, + "grad_norm": 0.135132297873497, + "learning_rate": 0.0001, + "loss": 1.5913, + "step": 1139 + }, + { + "epoch": 0.381653833277536, + "grad_norm": 0.11533955484628677, + "learning_rate": 0.0001, + "loss": 1.3968, + "step": 1140 + }, + { + "epoch": 0.38198861734181455, + "grad_norm": 0.13532719016075134, + "learning_rate": 0.0001, + "loss": 1.5534, + "step": 1141 + }, + { + "epoch": 0.38232340140609306, + "grad_norm": 0.14101184904575348, + "learning_rate": 0.0001, + "loss": 1.557, + "step": 1142 + }, + { + "epoch": 0.3826581854703716, + "grad_norm": 0.12038899213075638, + "learning_rate": 0.0001, + "loss": 1.4831, + "step": 1143 + }, + { + "epoch": 0.3829929695346502, + "grad_norm": 0.13053514063358307, + "learning_rate": 0.0001, + "loss": 1.5882, + "step": 1144 + }, + { + "epoch": 0.3833277535989287, + "grad_norm": 0.12372793257236481, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 1145 + }, + { + "epoch": 0.38366253766320724, + "grad_norm": 0.12823140621185303, + "learning_rate": 0.0001, + "loss": 1.6126, + "step": 1146 + }, + { + "epoch": 0.3839973217274858, + "grad_norm": 0.12058600783348083, + "learning_rate": 0.0001, + "loss": 1.4713, + "step": 1147 + }, + { + "epoch": 0.3843321057917643, + "grad_norm": 0.12674620747566223, + "learning_rate": 0.0001, + "loss": 1.6126, + "step": 1148 + }, + { + "epoch": 0.38466688985604286, + "grad_norm": 0.1214526891708374, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 1149 + }, + { + "epoch": 0.3850016739203214, + "grad_norm": 0.12831653654575348, + "learning_rate": 0.0001, + "loss": 1.5479, + "step": 1150 + }, + { + "epoch": 0.3853364579845999, + "grad_norm": 0.12079459428787231, + "learning_rate": 0.0001, + "loss": 1.5544, + "step": 1151 + }, + { + "epoch": 0.3856712420488785, + "grad_norm": 0.12021779268980026, + "learning_rate": 0.0001, + "loss": 1.5536, + "step": 1152 + }, + { + "epoch": 0.38600602611315704, + "grad_norm": 0.13052217662334442, + "learning_rate": 0.0001, + "loss": 1.5482, + "step": 1153 + }, + { + "epoch": 0.38634081017743555, + "grad_norm": 0.12613235414028168, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 1154 + }, + { + "epoch": 0.3866755942417141, + "grad_norm": 0.12751324474811554, + "learning_rate": 0.0001, + "loss": 1.5513, + "step": 1155 + }, + { + "epoch": 0.3870103783059926, + "grad_norm": 0.11987000703811646, + "learning_rate": 0.0001, + "loss": 1.4836, + "step": 1156 + }, + { + "epoch": 0.38734516237027117, + "grad_norm": 0.13999362289905548, + "learning_rate": 0.0001, + "loss": 1.6763, + "step": 1157 + }, + { + "epoch": 0.3876799464345497, + "grad_norm": 0.128611221909523, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 1158 + }, + { + "epoch": 0.38801473049882823, + "grad_norm": 0.1292606145143509, + "learning_rate": 0.0001, + "loss": 1.6846, + "step": 1159 + }, + { + "epoch": 0.3883495145631068, + "grad_norm": 0.13090923428535461, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 1160 + }, + { + "epoch": 0.38868429862738535, + "grad_norm": 0.12356492131948471, + "learning_rate": 0.0001, + "loss": 1.5158, + "step": 1161 + }, + { + "epoch": 0.38901908269166385, + "grad_norm": 0.12005447596311569, + "learning_rate": 0.0001, + "loss": 1.62, + "step": 1162 + }, + { + "epoch": 0.3893538667559424, + "grad_norm": 0.12113460153341293, + "learning_rate": 0.0001, + "loss": 1.4954, + "step": 1163 + }, + { + "epoch": 0.38968865082022097, + "grad_norm": 0.11953802406787872, + "learning_rate": 0.0001, + "loss": 1.4891, + "step": 1164 + }, + { + "epoch": 0.3900234348844995, + "grad_norm": 0.1292644739151001, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 1165 + }, + { + "epoch": 0.39035821894877804, + "grad_norm": 0.12345704436302185, + "learning_rate": 0.0001, + "loss": 1.4939, + "step": 1166 + }, + { + "epoch": 0.3906930030130566, + "grad_norm": 0.12334253638982773, + "learning_rate": 0.0001, + "loss": 1.6058, + "step": 1167 + }, + { + "epoch": 0.3910277870773351, + "grad_norm": 0.13044217228889465, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 1168 + }, + { + "epoch": 0.39136257114161366, + "grad_norm": 0.12309286743402481, + "learning_rate": 0.0001, + "loss": 1.5007, + "step": 1169 + }, + { + "epoch": 0.3916973552058922, + "grad_norm": 0.12565681338310242, + "learning_rate": 0.0001, + "loss": 1.5172, + "step": 1170 + }, + { + "epoch": 0.3920321392701707, + "grad_norm": 0.13335129618644714, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 1171 + }, + { + "epoch": 0.3923669233344493, + "grad_norm": 0.12664766609668732, + "learning_rate": 0.0001, + "loss": 1.5471, + "step": 1172 + }, + { + "epoch": 0.39270170739872784, + "grad_norm": 0.12703973054885864, + "learning_rate": 0.0001, + "loss": 1.545, + "step": 1173 + }, + { + "epoch": 0.39303649146300634, + "grad_norm": 0.12242884933948517, + "learning_rate": 0.0001, + "loss": 1.4768, + "step": 1174 + }, + { + "epoch": 0.3933712755272849, + "grad_norm": 0.13055263459682465, + "learning_rate": 0.0001, + "loss": 1.4782, + "step": 1175 + }, + { + "epoch": 0.39370605959156346, + "grad_norm": 0.13161849975585938, + "learning_rate": 0.0001, + "loss": 1.621, + "step": 1176 + }, + { + "epoch": 0.39404084365584197, + "grad_norm": 0.1257203370332718, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 1177 + }, + { + "epoch": 0.3943756277201205, + "grad_norm": 0.14164592325687408, + "learning_rate": 0.0001, + "loss": 1.4884, + "step": 1178 + }, + { + "epoch": 0.3947104117843991, + "grad_norm": 0.12696050107479095, + "learning_rate": 0.0001, + "loss": 1.5829, + "step": 1179 + }, + { + "epoch": 0.3950451958486776, + "grad_norm": 0.12652398645877838, + "learning_rate": 0.0001, + "loss": 1.6345, + "step": 1180 + }, + { + "epoch": 0.39537997991295615, + "grad_norm": 0.12333660572767258, + "learning_rate": 0.0001, + "loss": 1.5375, + "step": 1181 + }, + { + "epoch": 0.3957147639772347, + "grad_norm": 0.13108794391155243, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 1182 + }, + { + "epoch": 0.3960495480415132, + "grad_norm": 0.13195887207984924, + "learning_rate": 0.0001, + "loss": 1.5939, + "step": 1183 + }, + { + "epoch": 0.39638433210579177, + "grad_norm": 0.12931646406650543, + "learning_rate": 0.0001, + "loss": 1.5317, + "step": 1184 + }, + { + "epoch": 0.39671911617007033, + "grad_norm": 0.12439566105604172, + "learning_rate": 0.0001, + "loss": 1.5391, + "step": 1185 + }, + { + "epoch": 0.39705390023434883, + "grad_norm": 0.12557551264762878, + "learning_rate": 0.0001, + "loss": 1.5723, + "step": 1186 + }, + { + "epoch": 0.3973886842986274, + "grad_norm": 0.13013330101966858, + "learning_rate": 0.0001, + "loss": 1.4812, + "step": 1187 + }, + { + "epoch": 0.39772346836290595, + "grad_norm": 0.12955336272716522, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 1188 + }, + { + "epoch": 0.39805825242718446, + "grad_norm": 0.1347295343875885, + "learning_rate": 0.0001, + "loss": 1.6634, + "step": 1189 + }, + { + "epoch": 0.398393036491463, + "grad_norm": 0.13187319040298462, + "learning_rate": 0.0001, + "loss": 1.5146, + "step": 1190 + }, + { + "epoch": 0.3987278205557416, + "grad_norm": 0.13010048866271973, + "learning_rate": 0.0001, + "loss": 1.5003, + "step": 1191 + }, + { + "epoch": 0.3990626046200201, + "grad_norm": 0.12330204248428345, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 1192 + }, + { + "epoch": 0.39939738868429864, + "grad_norm": 0.1346241533756256, + "learning_rate": 0.0001, + "loss": 1.5979, + "step": 1193 + }, + { + "epoch": 0.39973217274857714, + "grad_norm": 0.13725797832012177, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 1194 + }, + { + "epoch": 0.4000669568128557, + "grad_norm": 0.12039465457201004, + "learning_rate": 0.0001, + "loss": 1.4363, + "step": 1195 + }, + { + "epoch": 0.40040174087713426, + "grad_norm": 0.1276928186416626, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 1196 + }, + { + "epoch": 0.40073652494141276, + "grad_norm": 0.12903235852718353, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 1197 + }, + { + "epoch": 0.4010713090056913, + "grad_norm": 0.12678353488445282, + "learning_rate": 0.0001, + "loss": 1.5624, + "step": 1198 + }, + { + "epoch": 0.4014060930699699, + "grad_norm": 0.12884308397769928, + "learning_rate": 0.0001, + "loss": 1.5995, + "step": 1199 + }, + { + "epoch": 0.4017408771342484, + "grad_norm": 0.11986846476793289, + "learning_rate": 0.0001, + "loss": 1.4767, + "step": 1200 + }, + { + "epoch": 0.40207566119852695, + "grad_norm": 0.12227410078048706, + "learning_rate": 0.0001, + "loss": 1.5056, + "step": 1201 + }, + { + "epoch": 0.4024104452628055, + "grad_norm": 0.12593914568424225, + "learning_rate": 0.0001, + "loss": 1.5836, + "step": 1202 + }, + { + "epoch": 0.402745229327084, + "grad_norm": 0.12477041780948639, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 1203 + }, + { + "epoch": 0.40308001339136257, + "grad_norm": 0.1216067373752594, + "learning_rate": 0.0001, + "loss": 1.5824, + "step": 1204 + }, + { + "epoch": 0.4034147974556411, + "grad_norm": 0.13550971448421478, + "learning_rate": 0.0001, + "loss": 1.6635, + "step": 1205 + }, + { + "epoch": 0.40374958151991963, + "grad_norm": 0.12963739037513733, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 1206 + }, + { + "epoch": 0.4040843655841982, + "grad_norm": 0.11887506395578384, + "learning_rate": 0.0001, + "loss": 1.4933, + "step": 1207 + }, + { + "epoch": 0.40441914964847675, + "grad_norm": 0.13262464106082916, + "learning_rate": 0.0001, + "loss": 1.5759, + "step": 1208 + }, + { + "epoch": 0.40475393371275525, + "grad_norm": 0.13952501118183136, + "learning_rate": 0.0001, + "loss": 1.6918, + "step": 1209 + }, + { + "epoch": 0.4050887177770338, + "grad_norm": 0.13401460647583008, + "learning_rate": 0.0001, + "loss": 1.5102, + "step": 1210 + }, + { + "epoch": 0.4054235018413124, + "grad_norm": 0.14476630091667175, + "learning_rate": 0.0001, + "loss": 1.6817, + "step": 1211 + }, + { + "epoch": 0.4057582859055909, + "grad_norm": 0.1285640001296997, + "learning_rate": 0.0001, + "loss": 1.653, + "step": 1212 + }, + { + "epoch": 0.40609306996986944, + "grad_norm": 0.13845203816890717, + "learning_rate": 0.0001, + "loss": 1.5996, + "step": 1213 + }, + { + "epoch": 0.406427854034148, + "grad_norm": 0.13416174054145813, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 1214 + }, + { + "epoch": 0.4067626380984265, + "grad_norm": 0.1267634481191635, + "learning_rate": 0.0001, + "loss": 1.5257, + "step": 1215 + }, + { + "epoch": 0.40709742216270506, + "grad_norm": 0.13453447818756104, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 1216 + }, + { + "epoch": 0.4074322062269836, + "grad_norm": 0.12069771438837051, + "learning_rate": 0.0001, + "loss": 1.5516, + "step": 1217 + }, + { + "epoch": 0.4077669902912621, + "grad_norm": 0.12483450770378113, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 1218 + }, + { + "epoch": 0.4081017743555407, + "grad_norm": 0.14123085141181946, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 1219 + }, + { + "epoch": 0.40843655841981924, + "grad_norm": 0.12844936549663544, + "learning_rate": 0.0001, + "loss": 1.4936, + "step": 1220 + }, + { + "epoch": 0.40877134248409774, + "grad_norm": 0.13094481825828552, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 1221 + }, + { + "epoch": 0.4091061265483763, + "grad_norm": 0.12563113868236542, + "learning_rate": 0.0001, + "loss": 1.4708, + "step": 1222 + }, + { + "epoch": 0.40944091061265486, + "grad_norm": 0.12495769560337067, + "learning_rate": 0.0001, + "loss": 1.5012, + "step": 1223 + }, + { + "epoch": 0.40977569467693337, + "grad_norm": 0.12314360588788986, + "learning_rate": 0.0001, + "loss": 1.5769, + "step": 1224 + }, + { + "epoch": 0.4101104787412119, + "grad_norm": 0.1389753818511963, + "learning_rate": 0.0001, + "loss": 1.5978, + "step": 1225 + }, + { + "epoch": 0.4104452628054905, + "grad_norm": 0.12703324854373932, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 1226 + }, + { + "epoch": 0.410780046869769, + "grad_norm": 0.11995337903499603, + "learning_rate": 0.0001, + "loss": 1.5307, + "step": 1227 + }, + { + "epoch": 0.41111483093404755, + "grad_norm": 0.1330454796552658, + "learning_rate": 0.0001, + "loss": 1.6277, + "step": 1228 + }, + { + "epoch": 0.41144961499832605, + "grad_norm": 0.12632183730602264, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 1229 + }, + { + "epoch": 0.4117843990626046, + "grad_norm": 0.13255640864372253, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 1230 + }, + { + "epoch": 0.41211918312688317, + "grad_norm": 0.13822025060653687, + "learning_rate": 0.0001, + "loss": 1.5945, + "step": 1231 + }, + { + "epoch": 0.4124539671911617, + "grad_norm": 0.1303391307592392, + "learning_rate": 0.0001, + "loss": 1.5928, + "step": 1232 + }, + { + "epoch": 0.41278875125544023, + "grad_norm": 0.12309371680021286, + "learning_rate": 0.0001, + "loss": 1.4794, + "step": 1233 + }, + { + "epoch": 0.4131235353197188, + "grad_norm": 0.12375032901763916, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 1234 + }, + { + "epoch": 0.4134583193839973, + "grad_norm": 0.13613499701023102, + "learning_rate": 0.0001, + "loss": 1.621, + "step": 1235 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.13198764622211456, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 1236 + }, + { + "epoch": 0.4141278875125544, + "grad_norm": 0.1294814646244049, + "learning_rate": 0.0001, + "loss": 1.5836, + "step": 1237 + }, + { + "epoch": 0.4144626715768329, + "grad_norm": 0.12597797811031342, + "learning_rate": 0.0001, + "loss": 1.5988, + "step": 1238 + }, + { + "epoch": 0.4147974556411115, + "grad_norm": 0.12371232360601425, + "learning_rate": 0.0001, + "loss": 1.5432, + "step": 1239 + }, + { + "epoch": 0.41513223970539004, + "grad_norm": 0.12919354438781738, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 1240 + }, + { + "epoch": 0.41546702376966854, + "grad_norm": 0.12919418513774872, + "learning_rate": 0.0001, + "loss": 1.7431, + "step": 1241 + }, + { + "epoch": 0.4158018078339471, + "grad_norm": 0.12314452975988388, + "learning_rate": 0.0001, + "loss": 1.5407, + "step": 1242 + }, + { + "epoch": 0.41613659189822566, + "grad_norm": 0.1360636204481125, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 1243 + }, + { + "epoch": 0.41647137596250416, + "grad_norm": 0.12739785015583038, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 1244 + }, + { + "epoch": 0.4168061600267827, + "grad_norm": 0.12558461725711823, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 1245 + }, + { + "epoch": 0.4171409440910613, + "grad_norm": 0.130743145942688, + "learning_rate": 0.0001, + "loss": 1.6537, + "step": 1246 + }, + { + "epoch": 0.4174757281553398, + "grad_norm": 0.12714166939258575, + "learning_rate": 0.0001, + "loss": 1.4309, + "step": 1247 + }, + { + "epoch": 0.41781051221961835, + "grad_norm": 0.12849892675876617, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 1248 + }, + { + "epoch": 0.4181452962838969, + "grad_norm": 0.1366477757692337, + "learning_rate": 0.0001, + "loss": 1.6397, + "step": 1249 + }, + { + "epoch": 0.4184800803481754, + "grad_norm": 0.1324029415845871, + "learning_rate": 0.0001, + "loss": 1.5647, + "step": 1250 + }, + { + "epoch": 0.41881486441245397, + "grad_norm": 0.1272830069065094, + "learning_rate": 0.0001, + "loss": 1.633, + "step": 1251 + }, + { + "epoch": 0.41914964847673253, + "grad_norm": 0.12891270220279694, + "learning_rate": 0.0001, + "loss": 1.5571, + "step": 1252 + }, + { + "epoch": 0.41948443254101103, + "grad_norm": 0.1334099918603897, + "learning_rate": 0.0001, + "loss": 1.4905, + "step": 1253 + }, + { + "epoch": 0.4198192166052896, + "grad_norm": 0.12439723312854767, + "learning_rate": 0.0001, + "loss": 1.5859, + "step": 1254 + }, + { + "epoch": 0.42015400066956815, + "grad_norm": 0.13870543241500854, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 1255 + }, + { + "epoch": 0.42048878473384665, + "grad_norm": 0.13232079148292542, + "learning_rate": 0.0001, + "loss": 1.6566, + "step": 1256 + }, + { + "epoch": 0.4208235687981252, + "grad_norm": 0.12575885653495789, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 1257 + }, + { + "epoch": 0.4211583528624038, + "grad_norm": 0.12995895743370056, + "learning_rate": 0.0001, + "loss": 1.5703, + "step": 1258 + }, + { + "epoch": 0.4214931369266823, + "grad_norm": 0.12801054120063782, + "learning_rate": 0.0001, + "loss": 1.6326, + "step": 1259 + }, + { + "epoch": 0.42182792099096084, + "grad_norm": 0.12584693729877472, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 1260 + }, + { + "epoch": 0.4221627050552394, + "grad_norm": 0.13142889738082886, + "learning_rate": 0.0001, + "loss": 1.7081, + "step": 1261 + }, + { + "epoch": 0.4224974891195179, + "grad_norm": 0.12793239951133728, + "learning_rate": 0.0001, + "loss": 1.6032, + "step": 1262 + }, + { + "epoch": 0.42283227318379646, + "grad_norm": 0.12368165701627731, + "learning_rate": 0.0001, + "loss": 1.5468, + "step": 1263 + }, + { + "epoch": 0.423167057248075, + "grad_norm": 0.13081911206245422, + "learning_rate": 0.0001, + "loss": 1.6175, + "step": 1264 + }, + { + "epoch": 0.4235018413123535, + "grad_norm": 0.12801037728786469, + "learning_rate": 0.0001, + "loss": 1.537, + "step": 1265 + }, + { + "epoch": 0.4238366253766321, + "grad_norm": 0.1274782121181488, + "learning_rate": 0.0001, + "loss": 1.5277, + "step": 1266 + }, + { + "epoch": 0.4241714094409106, + "grad_norm": 0.1194332018494606, + "learning_rate": 0.0001, + "loss": 1.496, + "step": 1267 + }, + { + "epoch": 0.42450619350518914, + "grad_norm": 0.13174927234649658, + "learning_rate": 0.0001, + "loss": 1.5975, + "step": 1268 + }, + { + "epoch": 0.4248409775694677, + "grad_norm": 0.1254556030035019, + "learning_rate": 0.0001, + "loss": 1.6119, + "step": 1269 + }, + { + "epoch": 0.4251757616337462, + "grad_norm": 0.13203318417072296, + "learning_rate": 0.0001, + "loss": 1.5564, + "step": 1270 + }, + { + "epoch": 0.42551054569802477, + "grad_norm": 0.12941622734069824, + "learning_rate": 0.0001, + "loss": 1.6285, + "step": 1271 + }, + { + "epoch": 0.4258453297623033, + "grad_norm": 0.12527894973754883, + "learning_rate": 0.0001, + "loss": 1.5703, + "step": 1272 + }, + { + "epoch": 0.42618011382658183, + "grad_norm": 0.12617714703083038, + "learning_rate": 0.0001, + "loss": 1.6523, + "step": 1273 + }, + { + "epoch": 0.4265148978908604, + "grad_norm": 0.12326870858669281, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 1274 + }, + { + "epoch": 0.42684968195513895, + "grad_norm": 0.1295124888420105, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 1275 + }, + { + "epoch": 0.42718446601941745, + "grad_norm": 0.12248773872852325, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 1276 + }, + { + "epoch": 0.427519250083696, + "grad_norm": 0.12932232022285461, + "learning_rate": 0.0001, + "loss": 1.6162, + "step": 1277 + }, + { + "epoch": 0.42785403414797457, + "grad_norm": 0.1178537905216217, + "learning_rate": 0.0001, + "loss": 1.472, + "step": 1278 + }, + { + "epoch": 0.4281888182122531, + "grad_norm": 0.1269647628068924, + "learning_rate": 0.0001, + "loss": 1.5551, + "step": 1279 + }, + { + "epoch": 0.42852360227653163, + "grad_norm": 0.120000459253788, + "learning_rate": 0.0001, + "loss": 1.509, + "step": 1280 + }, + { + "epoch": 0.4288583863408102, + "grad_norm": 0.12708665430545807, + "learning_rate": 0.0001, + "loss": 1.5293, + "step": 1281 + }, + { + "epoch": 0.4291931704050887, + "grad_norm": 0.13209426403045654, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 1282 + }, + { + "epoch": 0.42952795446936726, + "grad_norm": 0.1305491328239441, + "learning_rate": 0.0001, + "loss": 1.5505, + "step": 1283 + }, + { + "epoch": 0.4298627385336458, + "grad_norm": 0.1237809956073761, + "learning_rate": 0.0001, + "loss": 1.5457, + "step": 1284 + }, + { + "epoch": 0.4301975225979243, + "grad_norm": 0.13375982642173767, + "learning_rate": 0.0001, + "loss": 1.5321, + "step": 1285 + }, + { + "epoch": 0.4305323066622029, + "grad_norm": 0.13597902655601501, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 1286 + }, + { + "epoch": 0.43086709072648144, + "grad_norm": 0.12488207966089249, + "learning_rate": 0.0001, + "loss": 1.5231, + "step": 1287 + }, + { + "epoch": 0.43120187479075994, + "grad_norm": 0.12950995564460754, + "learning_rate": 0.0001, + "loss": 1.7162, + "step": 1288 + }, + { + "epoch": 0.4315366588550385, + "grad_norm": 0.12734153866767883, + "learning_rate": 0.0001, + "loss": 1.5735, + "step": 1289 + }, + { + "epoch": 0.43187144291931706, + "grad_norm": 0.13684290647506714, + "learning_rate": 0.0001, + "loss": 1.5866, + "step": 1290 + }, + { + "epoch": 0.43220622698359557, + "grad_norm": 0.12665408849716187, + "learning_rate": 0.0001, + "loss": 1.5236, + "step": 1291 + }, + { + "epoch": 0.4325410110478741, + "grad_norm": 0.12092933058738708, + "learning_rate": 0.0001, + "loss": 1.4859, + "step": 1292 + }, + { + "epoch": 0.4328757951121527, + "grad_norm": 0.14012545347213745, + "learning_rate": 0.0001, + "loss": 1.6158, + "step": 1293 + }, + { + "epoch": 0.4332105791764312, + "grad_norm": 0.12820059061050415, + "learning_rate": 0.0001, + "loss": 1.5108, + "step": 1294 + }, + { + "epoch": 0.43354536324070975, + "grad_norm": 0.13247036933898926, + "learning_rate": 0.0001, + "loss": 1.6031, + "step": 1295 + }, + { + "epoch": 0.4338801473049883, + "grad_norm": 0.12412893772125244, + "learning_rate": 0.0001, + "loss": 1.5829, + "step": 1296 + }, + { + "epoch": 0.4342149313692668, + "grad_norm": 0.12657597661018372, + "learning_rate": 0.0001, + "loss": 1.5139, + "step": 1297 + }, + { + "epoch": 0.43454971543354537, + "grad_norm": 0.13494263589382172, + "learning_rate": 0.0001, + "loss": 1.6264, + "step": 1298 + }, + { + "epoch": 0.43488449949782393, + "grad_norm": 0.12553179264068604, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 1299 + }, + { + "epoch": 0.43521928356210243, + "grad_norm": 0.12029055505990982, + "learning_rate": 0.0001, + "loss": 1.5177, + "step": 1300 + }, + { + "epoch": 0.435554067626381, + "grad_norm": 0.12742608785629272, + "learning_rate": 0.0001, + "loss": 1.6345, + "step": 1301 + }, + { + "epoch": 0.43588885169065955, + "grad_norm": 0.12749677896499634, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 1302 + }, + { + "epoch": 0.43622363575493805, + "grad_norm": 0.13716910779476166, + "learning_rate": 0.0001, + "loss": 1.6064, + "step": 1303 + }, + { + "epoch": 0.4365584198192166, + "grad_norm": 0.11626800149679184, + "learning_rate": 0.0001, + "loss": 1.461, + "step": 1304 + }, + { + "epoch": 0.4368932038834951, + "grad_norm": 0.12892816960811615, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 1305 + }, + { + "epoch": 0.4372279879477737, + "grad_norm": 0.12171407043933868, + "learning_rate": 0.0001, + "loss": 1.5669, + "step": 1306 + }, + { + "epoch": 0.43756277201205224, + "grad_norm": 0.12705732882022858, + "learning_rate": 0.0001, + "loss": 1.5392, + "step": 1307 + }, + { + "epoch": 0.43789755607633074, + "grad_norm": 0.12489151209592819, + "learning_rate": 0.0001, + "loss": 1.5621, + "step": 1308 + }, + { + "epoch": 0.4382323401406093, + "grad_norm": 0.1306968778371811, + "learning_rate": 0.0001, + "loss": 1.5601, + "step": 1309 + }, + { + "epoch": 0.43856712420488786, + "grad_norm": 0.12457779794931412, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 1310 + }, + { + "epoch": 0.43890190826916636, + "grad_norm": 0.1351223587989807, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 1311 + }, + { + "epoch": 0.4392366923334449, + "grad_norm": 0.16403745114803314, + "learning_rate": 0.0001, + "loss": 1.6135, + "step": 1312 + }, + { + "epoch": 0.4395714763977235, + "grad_norm": 0.1373598426580429, + "learning_rate": 0.0001, + "loss": 1.6102, + "step": 1313 + }, + { + "epoch": 0.439906260462002, + "grad_norm": 0.12474294006824493, + "learning_rate": 0.0001, + "loss": 1.4732, + "step": 1314 + }, + { + "epoch": 0.44024104452628054, + "grad_norm": 0.13775482773780823, + "learning_rate": 0.0001, + "loss": 1.4623, + "step": 1315 + }, + { + "epoch": 0.4405758285905591, + "grad_norm": 0.12874817848205566, + "learning_rate": 0.0001, + "loss": 1.5885, + "step": 1316 + }, + { + "epoch": 0.4409106126548376, + "grad_norm": 0.13382995128631592, + "learning_rate": 0.0001, + "loss": 1.4458, + "step": 1317 + }, + { + "epoch": 0.44124539671911617, + "grad_norm": 0.1267126202583313, + "learning_rate": 0.0001, + "loss": 1.5709, + "step": 1318 + }, + { + "epoch": 0.4415801807833947, + "grad_norm": 0.12839357554912567, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 1319 + }, + { + "epoch": 0.44191496484767323, + "grad_norm": 0.13176332414150238, + "learning_rate": 0.0001, + "loss": 1.4342, + "step": 1320 + }, + { + "epoch": 0.4422497489119518, + "grad_norm": 0.13202795386314392, + "learning_rate": 0.0001, + "loss": 1.5997, + "step": 1321 + }, + { + "epoch": 0.44258453297623035, + "grad_norm": 0.12316932529211044, + "learning_rate": 0.0001, + "loss": 1.4323, + "step": 1322 + }, + { + "epoch": 0.44291931704050885, + "grad_norm": 0.1301979273557663, + "learning_rate": 0.0001, + "loss": 1.5882, + "step": 1323 + }, + { + "epoch": 0.4432541011047874, + "grad_norm": 0.1263076364994049, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 1324 + }, + { + "epoch": 0.44358888516906597, + "grad_norm": 0.12310474365949631, + "learning_rate": 0.0001, + "loss": 1.4898, + "step": 1325 + }, + { + "epoch": 0.4439236692333445, + "grad_norm": 0.12039102613925934, + "learning_rate": 0.0001, + "loss": 1.5324, + "step": 1326 + }, + { + "epoch": 0.44425845329762303, + "grad_norm": 0.12545818090438843, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 1327 + }, + { + "epoch": 0.4445932373619016, + "grad_norm": 0.1259836107492447, + "learning_rate": 0.0001, + "loss": 1.5059, + "step": 1328 + }, + { + "epoch": 0.4449280214261801, + "grad_norm": 0.12518031895160675, + "learning_rate": 0.0001, + "loss": 1.5958, + "step": 1329 + }, + { + "epoch": 0.44526280549045866, + "grad_norm": 0.12583878636360168, + "learning_rate": 0.0001, + "loss": 1.4837, + "step": 1330 + }, + { + "epoch": 0.4455975895547372, + "grad_norm": 0.12569929659366608, + "learning_rate": 0.0001, + "loss": 1.536, + "step": 1331 + }, + { + "epoch": 0.4459323736190157, + "grad_norm": 0.1288549304008484, + "learning_rate": 0.0001, + "loss": 1.5525, + "step": 1332 + }, + { + "epoch": 0.4462671576832943, + "grad_norm": 0.13198384642601013, + "learning_rate": 0.0001, + "loss": 1.542, + "step": 1333 + }, + { + "epoch": 0.44660194174757284, + "grad_norm": 0.1238170713186264, + "learning_rate": 0.0001, + "loss": 1.4021, + "step": 1334 + }, + { + "epoch": 0.44693672581185134, + "grad_norm": 0.13295157253742218, + "learning_rate": 0.0001, + "loss": 1.5553, + "step": 1335 + }, + { + "epoch": 0.4472715098761299, + "grad_norm": 0.13403776288032532, + "learning_rate": 0.0001, + "loss": 1.4761, + "step": 1336 + }, + { + "epoch": 0.44760629394040846, + "grad_norm": 0.13343052566051483, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 1337 + }, + { + "epoch": 0.44794107800468697, + "grad_norm": 0.125327467918396, + "learning_rate": 0.0001, + "loss": 1.5682, + "step": 1338 + }, + { + "epoch": 0.4482758620689655, + "grad_norm": 0.12958160042762756, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 1339 + }, + { + "epoch": 0.4486106461332441, + "grad_norm": 0.1384599506855011, + "learning_rate": 0.0001, + "loss": 1.5791, + "step": 1340 + }, + { + "epoch": 0.4489454301975226, + "grad_norm": 0.1257963478565216, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 1341 + }, + { + "epoch": 0.44928021426180115, + "grad_norm": 0.12630927562713623, + "learning_rate": 0.0001, + "loss": 1.5558, + "step": 1342 + }, + { + "epoch": 0.44961499832607965, + "grad_norm": 0.1268066167831421, + "learning_rate": 0.0001, + "loss": 1.5958, + "step": 1343 + }, + { + "epoch": 0.4499497823903582, + "grad_norm": 0.12455032020807266, + "learning_rate": 0.0001, + "loss": 1.5607, + "step": 1344 + }, + { + "epoch": 0.45028456645463677, + "grad_norm": 0.12265735119581223, + "learning_rate": 0.0001, + "loss": 1.5197, + "step": 1345 + }, + { + "epoch": 0.4506193505189153, + "grad_norm": 0.1307050883769989, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 1346 + }, + { + "epoch": 0.45095413458319383, + "grad_norm": 0.13128429651260376, + "learning_rate": 0.0001, + "loss": 1.5559, + "step": 1347 + }, + { + "epoch": 0.4512889186474724, + "grad_norm": 0.13010568916797638, + "learning_rate": 0.0001, + "loss": 1.5332, + "step": 1348 + }, + { + "epoch": 0.4516237027117509, + "grad_norm": 0.12650929391384125, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 1349 + }, + { + "epoch": 0.45195848677602946, + "grad_norm": 0.12306904792785645, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 1350 + }, + { + "epoch": 0.452293270840308, + "grad_norm": 0.13351021707057953, + "learning_rate": 0.0001, + "loss": 1.4737, + "step": 1351 + }, + { + "epoch": 0.4526280549045865, + "grad_norm": 0.12178155779838562, + "learning_rate": 0.0001, + "loss": 1.4775, + "step": 1352 + }, + { + "epoch": 0.4529628389688651, + "grad_norm": 0.13516512513160706, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 1353 + }, + { + "epoch": 0.45329762303314364, + "grad_norm": 0.12909267842769623, + "learning_rate": 0.0001, + "loss": 1.4684, + "step": 1354 + }, + { + "epoch": 0.45363240709742214, + "grad_norm": 0.12209142744541168, + "learning_rate": 0.0001, + "loss": 1.5198, + "step": 1355 + }, + { + "epoch": 0.4539671911617007, + "grad_norm": 0.1269826740026474, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 1356 + }, + { + "epoch": 0.45430197522597926, + "grad_norm": 0.13762542605400085, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 1357 + }, + { + "epoch": 0.45463675929025776, + "grad_norm": 0.1306358128786087, + "learning_rate": 0.0001, + "loss": 1.5829, + "step": 1358 + }, + { + "epoch": 0.4549715433545363, + "grad_norm": 0.1383924037218094, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 1359 + }, + { + "epoch": 0.4553063274188149, + "grad_norm": 0.13577204942703247, + "learning_rate": 0.0001, + "loss": 1.6067, + "step": 1360 + }, + { + "epoch": 0.4556411114830934, + "grad_norm": 0.12534180283546448, + "learning_rate": 0.0001, + "loss": 1.574, + "step": 1361 + }, + { + "epoch": 0.45597589554737195, + "grad_norm": 0.12367561459541321, + "learning_rate": 0.0001, + "loss": 1.5089, + "step": 1362 + }, + { + "epoch": 0.4563106796116505, + "grad_norm": 0.14012429118156433, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 1363 + }, + { + "epoch": 0.456645463675929, + "grad_norm": 0.13164697587490082, + "learning_rate": 0.0001, + "loss": 1.6058, + "step": 1364 + }, + { + "epoch": 0.45698024774020757, + "grad_norm": 0.14275015890598297, + "learning_rate": 0.0001, + "loss": 1.6945, + "step": 1365 + }, + { + "epoch": 0.4573150318044861, + "grad_norm": 0.1312190294265747, + "learning_rate": 0.0001, + "loss": 1.5595, + "step": 1366 + }, + { + "epoch": 0.45764981586876463, + "grad_norm": 0.1276426464319229, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 1367 + }, + { + "epoch": 0.4579845999330432, + "grad_norm": 0.12928691506385803, + "learning_rate": 0.0001, + "loss": 1.6555, + "step": 1368 + }, + { + "epoch": 0.45831938399732175, + "grad_norm": 0.12562155723571777, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 1369 + }, + { + "epoch": 0.45865416806160025, + "grad_norm": 0.12555162608623505, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 1370 + }, + { + "epoch": 0.4589889521258788, + "grad_norm": 0.13354945182800293, + "learning_rate": 0.0001, + "loss": 1.5802, + "step": 1371 + }, + { + "epoch": 0.4593237361901574, + "grad_norm": 0.13059929013252258, + "learning_rate": 0.0001, + "loss": 1.5152, + "step": 1372 + }, + { + "epoch": 0.4596585202544359, + "grad_norm": 0.1313420981168747, + "learning_rate": 0.0001, + "loss": 1.5411, + "step": 1373 + }, + { + "epoch": 0.45999330431871444, + "grad_norm": 0.13619214296340942, + "learning_rate": 0.0001, + "loss": 1.5348, + "step": 1374 + }, + { + "epoch": 0.460328088382993, + "grad_norm": 0.12227842211723328, + "learning_rate": 0.0001, + "loss": 1.5258, + "step": 1375 + }, + { + "epoch": 0.4606628724472715, + "grad_norm": 0.12962037324905396, + "learning_rate": 0.0001, + "loss": 1.6469, + "step": 1376 + }, + { + "epoch": 0.46099765651155006, + "grad_norm": 0.128581240773201, + "learning_rate": 0.0001, + "loss": 1.6151, + "step": 1377 + }, + { + "epoch": 0.4613324405758286, + "grad_norm": 0.12887564301490784, + "learning_rate": 0.0001, + "loss": 1.5741, + "step": 1378 + }, + { + "epoch": 0.4616672246401071, + "grad_norm": 0.12684863805770874, + "learning_rate": 0.0001, + "loss": 1.6168, + "step": 1379 + }, + { + "epoch": 0.4620020087043857, + "grad_norm": 0.11986137181520462, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 1380 + }, + { + "epoch": 0.4623367927686642, + "grad_norm": 0.12904709577560425, + "learning_rate": 0.0001, + "loss": 1.5247, + "step": 1381 + }, + { + "epoch": 0.46267157683294274, + "grad_norm": 0.12737007439136505, + "learning_rate": 0.0001, + "loss": 1.6354, + "step": 1382 + }, + { + "epoch": 0.4630063608972213, + "grad_norm": 0.13845406472682953, + "learning_rate": 0.0001, + "loss": 1.5696, + "step": 1383 + }, + { + "epoch": 0.4633411449614998, + "grad_norm": 0.1215730682015419, + "learning_rate": 0.0001, + "loss": 1.5277, + "step": 1384 + }, + { + "epoch": 0.46367592902577837, + "grad_norm": 0.12643855810165405, + "learning_rate": 0.0001, + "loss": 1.5691, + "step": 1385 + }, + { + "epoch": 0.4640107130900569, + "grad_norm": 0.12575271725654602, + "learning_rate": 0.0001, + "loss": 1.5075, + "step": 1386 + }, + { + "epoch": 0.46434549715433543, + "grad_norm": 0.13134850561618805, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 1387 + }, + { + "epoch": 0.464680281218614, + "grad_norm": 0.12751908600330353, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 1388 + }, + { + "epoch": 0.46501506528289255, + "grad_norm": 0.1260857880115509, + "learning_rate": 0.0001, + "loss": 1.581, + "step": 1389 + }, + { + "epoch": 0.46534984934717105, + "grad_norm": 0.13056620955467224, + "learning_rate": 0.0001, + "loss": 1.5604, + "step": 1390 + }, + { + "epoch": 0.4656846334114496, + "grad_norm": 0.12854252755641937, + "learning_rate": 0.0001, + "loss": 1.5729, + "step": 1391 + }, + { + "epoch": 0.46601941747572817, + "grad_norm": 0.12587207555770874, + "learning_rate": 0.0001, + "loss": 1.5685, + "step": 1392 + }, + { + "epoch": 0.4663542015400067, + "grad_norm": 0.13984687626361847, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 1393 + }, + { + "epoch": 0.46668898560428523, + "grad_norm": 0.1340693235397339, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 1394 + }, + { + "epoch": 0.4670237696685638, + "grad_norm": 0.12426851689815521, + "learning_rate": 0.0001, + "loss": 1.5614, + "step": 1395 + }, + { + "epoch": 0.4673585537328423, + "grad_norm": 0.14335423707962036, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 1396 + }, + { + "epoch": 0.46769333779712086, + "grad_norm": 0.1285167783498764, + "learning_rate": 0.0001, + "loss": 1.4816, + "step": 1397 + }, + { + "epoch": 0.4680281218613994, + "grad_norm": 0.12221338599920273, + "learning_rate": 0.0001, + "loss": 1.5412, + "step": 1398 + }, + { + "epoch": 0.4683629059256779, + "grad_norm": 0.13749419152736664, + "learning_rate": 0.0001, + "loss": 1.6426, + "step": 1399 + }, + { + "epoch": 0.4686976899899565, + "grad_norm": 0.1292765736579895, + "learning_rate": 0.0001, + "loss": 1.4826, + "step": 1400 + }, + { + "epoch": 0.46903247405423504, + "grad_norm": 0.12175814807415009, + "learning_rate": 0.0001, + "loss": 1.4674, + "step": 1401 + }, + { + "epoch": 0.46936725811851354, + "grad_norm": 0.13381820917129517, + "learning_rate": 0.0001, + "loss": 1.515, + "step": 1402 + }, + { + "epoch": 0.4697020421827921, + "grad_norm": 0.13659454882144928, + "learning_rate": 0.0001, + "loss": 1.5513, + "step": 1403 + }, + { + "epoch": 0.47003682624707066, + "grad_norm": 0.12511052191257477, + "learning_rate": 0.0001, + "loss": 1.5457, + "step": 1404 + }, + { + "epoch": 0.47037161031134916, + "grad_norm": 0.13325883448123932, + "learning_rate": 0.0001, + "loss": 1.5893, + "step": 1405 + }, + { + "epoch": 0.4707063943756277, + "grad_norm": 0.12582562863826752, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 1406 + }, + { + "epoch": 0.4710411784399063, + "grad_norm": 0.13141517341136932, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 1407 + }, + { + "epoch": 0.4713759625041848, + "grad_norm": 0.13099296391010284, + "learning_rate": 0.0001, + "loss": 1.5322, + "step": 1408 + }, + { + "epoch": 0.47171074656846335, + "grad_norm": 0.146238312125206, + "learning_rate": 0.0001, + "loss": 1.6397, + "step": 1409 + }, + { + "epoch": 0.4720455306327419, + "grad_norm": 0.12129180878400803, + "learning_rate": 0.0001, + "loss": 1.5033, + "step": 1410 + }, + { + "epoch": 0.4723803146970204, + "grad_norm": 0.125573992729187, + "learning_rate": 0.0001, + "loss": 1.571, + "step": 1411 + }, + { + "epoch": 0.47271509876129897, + "grad_norm": 0.14334800839424133, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 1412 + }, + { + "epoch": 0.47304988282557753, + "grad_norm": 0.1354663372039795, + "learning_rate": 0.0001, + "loss": 1.5733, + "step": 1413 + }, + { + "epoch": 0.47338466688985603, + "grad_norm": 0.13040928542613983, + "learning_rate": 0.0001, + "loss": 1.4702, + "step": 1414 + }, + { + "epoch": 0.4737194509541346, + "grad_norm": 0.12931925058364868, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 1415 + }, + { + "epoch": 0.47405423501841315, + "grad_norm": 0.13492871820926666, + "learning_rate": 0.0001, + "loss": 1.5827, + "step": 1416 + }, + { + "epoch": 0.47438901908269165, + "grad_norm": 0.12549789249897003, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 1417 + }, + { + "epoch": 0.4747238031469702, + "grad_norm": 0.13328687846660614, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 1418 + }, + { + "epoch": 0.4750585872112487, + "grad_norm": 0.13430629670619965, + "learning_rate": 0.0001, + "loss": 1.5663, + "step": 1419 + }, + { + "epoch": 0.4753933712755273, + "grad_norm": 0.12909024953842163, + "learning_rate": 0.0001, + "loss": 1.6085, + "step": 1420 + }, + { + "epoch": 0.47572815533980584, + "grad_norm": 0.13095097243785858, + "learning_rate": 0.0001, + "loss": 1.585, + "step": 1421 + }, + { + "epoch": 0.47606293940408434, + "grad_norm": 0.1313266009092331, + "learning_rate": 0.0001, + "loss": 1.5279, + "step": 1422 + }, + { + "epoch": 0.4763977234683629, + "grad_norm": 0.12739764153957367, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 1423 + }, + { + "epoch": 0.47673250753264146, + "grad_norm": 0.12780874967575073, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 1424 + }, + { + "epoch": 0.47706729159691996, + "grad_norm": 0.12299945950508118, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 1425 + }, + { + "epoch": 0.4774020756611985, + "grad_norm": 0.12845619022846222, + "learning_rate": 0.0001, + "loss": 1.5799, + "step": 1426 + }, + { + "epoch": 0.4777368597254771, + "grad_norm": 0.12429885566234589, + "learning_rate": 0.0001, + "loss": 1.565, + "step": 1427 + }, + { + "epoch": 0.4780716437897556, + "grad_norm": 0.12623021006584167, + "learning_rate": 0.0001, + "loss": 1.5579, + "step": 1428 + }, + { + "epoch": 0.47840642785403414, + "grad_norm": 0.121118925511837, + "learning_rate": 0.0001, + "loss": 1.5044, + "step": 1429 + }, + { + "epoch": 0.4787412119183127, + "grad_norm": 0.13029584288597107, + "learning_rate": 0.0001, + "loss": 1.5945, + "step": 1430 + }, + { + "epoch": 0.4790759959825912, + "grad_norm": 0.1309075504541397, + "learning_rate": 0.0001, + "loss": 1.5638, + "step": 1431 + }, + { + "epoch": 0.47941078004686977, + "grad_norm": 0.12302339822053909, + "learning_rate": 0.0001, + "loss": 1.553, + "step": 1432 + }, + { + "epoch": 0.4797455641111483, + "grad_norm": 0.13640674948692322, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 1433 + }, + { + "epoch": 0.48008034817542683, + "grad_norm": 0.12669233977794647, + "learning_rate": 0.0001, + "loss": 1.5603, + "step": 1434 + }, + { + "epoch": 0.4804151322397054, + "grad_norm": 0.14192534983158112, + "learning_rate": 0.0001, + "loss": 1.5648, + "step": 1435 + }, + { + "epoch": 0.48074991630398395, + "grad_norm": 0.12855654954910278, + "learning_rate": 0.0001, + "loss": 1.5782, + "step": 1436 + }, + { + "epoch": 0.48108470036826245, + "grad_norm": 0.13193868100643158, + "learning_rate": 0.0001, + "loss": 1.4815, + "step": 1437 + }, + { + "epoch": 0.481419484432541, + "grad_norm": 0.1313331574201584, + "learning_rate": 0.0001, + "loss": 1.597, + "step": 1438 + }, + { + "epoch": 0.48175426849681957, + "grad_norm": 0.14010664820671082, + "learning_rate": 0.0001, + "loss": 1.5911, + "step": 1439 + }, + { + "epoch": 0.4820890525610981, + "grad_norm": 0.12899306416511536, + "learning_rate": 0.0001, + "loss": 1.5346, + "step": 1440 + }, + { + "epoch": 0.48242383662537663, + "grad_norm": 0.14157001674175262, + "learning_rate": 0.0001, + "loss": 1.4947, + "step": 1441 + }, + { + "epoch": 0.4827586206896552, + "grad_norm": 0.12598420679569244, + "learning_rate": 0.0001, + "loss": 1.5713, + "step": 1442 + }, + { + "epoch": 0.4830934047539337, + "grad_norm": 0.12368304282426834, + "learning_rate": 0.0001, + "loss": 1.4691, + "step": 1443 + }, + { + "epoch": 0.48342818881821226, + "grad_norm": 0.15252211689949036, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 1444 + }, + { + "epoch": 0.4837629728824908, + "grad_norm": 0.12461958080530167, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 1445 + }, + { + "epoch": 0.4840977569467693, + "grad_norm": 0.13883721828460693, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 1446 + }, + { + "epoch": 0.4844325410110479, + "grad_norm": 0.14833161234855652, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 1447 + }, + { + "epoch": 0.48476732507532644, + "grad_norm": 0.12511619925498962, + "learning_rate": 0.0001, + "loss": 1.5765, + "step": 1448 + }, + { + "epoch": 0.48510210913960494, + "grad_norm": 0.1352238804101944, + "learning_rate": 0.0001, + "loss": 1.5231, + "step": 1449 + }, + { + "epoch": 0.4854368932038835, + "grad_norm": 0.14310289919376373, + "learning_rate": 0.0001, + "loss": 1.5516, + "step": 1450 + }, + { + "epoch": 0.48577167726816206, + "grad_norm": 0.1293793022632599, + "learning_rate": 0.0001, + "loss": 1.6124, + "step": 1451 + }, + { + "epoch": 0.48610646133244056, + "grad_norm": 0.1351606398820877, + "learning_rate": 0.0001, + "loss": 1.5535, + "step": 1452 + }, + { + "epoch": 0.4864412453967191, + "grad_norm": 0.1305823028087616, + "learning_rate": 0.0001, + "loss": 1.505, + "step": 1453 + }, + { + "epoch": 0.4867760294609977, + "grad_norm": 0.12973332405090332, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 1454 + }, + { + "epoch": 0.4871108135252762, + "grad_norm": 0.1279638260602951, + "learning_rate": 0.0001, + "loss": 1.5664, + "step": 1455 + }, + { + "epoch": 0.48744559758955475, + "grad_norm": 0.1322777271270752, + "learning_rate": 0.0001, + "loss": 1.605, + "step": 1456 + }, + { + "epoch": 0.48778038165383325, + "grad_norm": 0.14680039882659912, + "learning_rate": 0.0001, + "loss": 1.5243, + "step": 1457 + }, + { + "epoch": 0.4881151657181118, + "grad_norm": 0.12435714155435562, + "learning_rate": 0.0001, + "loss": 1.4835, + "step": 1458 + }, + { + "epoch": 0.48844994978239037, + "grad_norm": 0.13253144919872284, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 1459 + }, + { + "epoch": 0.4887847338466689, + "grad_norm": 0.14123192429542542, + "learning_rate": 0.0001, + "loss": 1.5795, + "step": 1460 + }, + { + "epoch": 0.48911951791094743, + "grad_norm": 0.1254579871892929, + "learning_rate": 0.0001, + "loss": 1.4829, + "step": 1461 + }, + { + "epoch": 0.489454301975226, + "grad_norm": 0.1407458633184433, + "learning_rate": 0.0001, + "loss": 1.5746, + "step": 1462 + }, + { + "epoch": 0.4897890860395045, + "grad_norm": 0.13967539370059967, + "learning_rate": 0.0001, + "loss": 1.611, + "step": 1463 + }, + { + "epoch": 0.49012387010378305, + "grad_norm": 0.13044650852680206, + "learning_rate": 0.0001, + "loss": 1.5614, + "step": 1464 + }, + { + "epoch": 0.4904586541680616, + "grad_norm": 0.13819964230060577, + "learning_rate": 0.0001, + "loss": 1.5579, + "step": 1465 + }, + { + "epoch": 0.4907934382323401, + "grad_norm": 0.12795104086399078, + "learning_rate": 0.0001, + "loss": 1.5373, + "step": 1466 + }, + { + "epoch": 0.4911282222966187, + "grad_norm": 0.13034126162528992, + "learning_rate": 0.0001, + "loss": 1.5077, + "step": 1467 + }, + { + "epoch": 0.49146300636089724, + "grad_norm": 0.1358436644077301, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 1468 + }, + { + "epoch": 0.49179779042517574, + "grad_norm": 0.12750184535980225, + "learning_rate": 0.0001, + "loss": 1.5638, + "step": 1469 + }, + { + "epoch": 0.4921325744894543, + "grad_norm": 0.13034793734550476, + "learning_rate": 0.0001, + "loss": 1.5053, + "step": 1470 + }, + { + "epoch": 0.49246735855373286, + "grad_norm": 0.1303941309452057, + "learning_rate": 0.0001, + "loss": 1.5342, + "step": 1471 + }, + { + "epoch": 0.49280214261801136, + "grad_norm": 0.12955164909362793, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 1472 + }, + { + "epoch": 0.4931369266822899, + "grad_norm": 0.12884975969791412, + "learning_rate": 0.0001, + "loss": 1.5389, + "step": 1473 + }, + { + "epoch": 0.4934717107465685, + "grad_norm": 0.1278049647808075, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 1474 + }, + { + "epoch": 0.493806494810847, + "grad_norm": 0.12420760840177536, + "learning_rate": 0.0001, + "loss": 1.4753, + "step": 1475 + }, + { + "epoch": 0.49414127887512554, + "grad_norm": 0.12760096788406372, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 1476 + }, + { + "epoch": 0.4944760629394041, + "grad_norm": 0.1320486068725586, + "learning_rate": 0.0001, + "loss": 1.5758, + "step": 1477 + }, + { + "epoch": 0.4948108470036826, + "grad_norm": 0.13898344337940216, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 1478 + }, + { + "epoch": 0.49514563106796117, + "grad_norm": 0.12908297777175903, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 1479 + }, + { + "epoch": 0.4954804151322397, + "grad_norm": 0.13149291276931763, + "learning_rate": 0.0001, + "loss": 1.5297, + "step": 1480 + }, + { + "epoch": 0.49581519919651823, + "grad_norm": 0.13526497781276703, + "learning_rate": 0.0001, + "loss": 1.5374, + "step": 1481 + }, + { + "epoch": 0.4961499832607968, + "grad_norm": 0.12223420292139053, + "learning_rate": 0.0001, + "loss": 1.5424, + "step": 1482 + }, + { + "epoch": 0.49648476732507535, + "grad_norm": 0.1266697198152542, + "learning_rate": 0.0001, + "loss": 1.5847, + "step": 1483 + }, + { + "epoch": 0.49681955138935385, + "grad_norm": 0.14440171420574188, + "learning_rate": 0.0001, + "loss": 1.5362, + "step": 1484 + }, + { + "epoch": 0.4971543354536324, + "grad_norm": 0.12831640243530273, + "learning_rate": 0.0001, + "loss": 1.5803, + "step": 1485 + }, + { + "epoch": 0.49748911951791097, + "grad_norm": 0.13665077090263367, + "learning_rate": 0.0001, + "loss": 1.5741, + "step": 1486 + }, + { + "epoch": 0.4978239035821895, + "grad_norm": 0.13725218176841736, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 1487 + }, + { + "epoch": 0.49815868764646803, + "grad_norm": 0.1271527111530304, + "learning_rate": 0.0001, + "loss": 1.6129, + "step": 1488 + }, + { + "epoch": 0.4984934717107466, + "grad_norm": 0.15319159626960754, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 1489 + }, + { + "epoch": 0.4988282557750251, + "grad_norm": 0.12440894544124603, + "learning_rate": 0.0001, + "loss": 1.4354, + "step": 1490 + }, + { + "epoch": 0.49916303983930366, + "grad_norm": 0.1261643022298813, + "learning_rate": 0.0001, + "loss": 1.609, + "step": 1491 + }, + { + "epoch": 0.4994978239035822, + "grad_norm": 0.14216668903827667, + "learning_rate": 0.0001, + "loss": 1.5599, + "step": 1492 + }, + { + "epoch": 0.4998326079678607, + "grad_norm": 0.13173174858093262, + "learning_rate": 0.0001, + "loss": 1.5056, + "step": 1493 + }, + { + "epoch": 0.5001673920321392, + "grad_norm": 0.12335377931594849, + "learning_rate": 0.0001, + "loss": 1.5544, + "step": 1494 + }, + { + "epoch": 0.5005021760964178, + "grad_norm": 0.13367588818073273, + "learning_rate": 0.0001, + "loss": 1.4908, + "step": 1495 + }, + { + "epoch": 0.5008369601606963, + "grad_norm": 0.13830317556858063, + "learning_rate": 0.0001, + "loss": 1.6147, + "step": 1496 + }, + { + "epoch": 0.5011717442249749, + "grad_norm": 0.13441935181617737, + "learning_rate": 0.0001, + "loss": 1.6855, + "step": 1497 + }, + { + "epoch": 0.5015065282892535, + "grad_norm": 0.14937585592269897, + "learning_rate": 0.0001, + "loss": 1.6021, + "step": 1498 + }, + { + "epoch": 0.501841312353532, + "grad_norm": 0.1289912909269333, + "learning_rate": 0.0001, + "loss": 1.5516, + "step": 1499 + }, + { + "epoch": 0.5021760964178105, + "grad_norm": 0.12371324002742767, + "learning_rate": 0.0001, + "loss": 1.5842, + "step": 1500 + }, + { + "epoch": 0.502510880482089, + "grad_norm": 0.12764602899551392, + "learning_rate": 0.0001, + "loss": 1.5836, + "step": 1501 + }, + { + "epoch": 0.5028456645463676, + "grad_norm": 0.12929953634738922, + "learning_rate": 0.0001, + "loss": 1.5656, + "step": 1502 + }, + { + "epoch": 0.5031804486106461, + "grad_norm": 0.1252906322479248, + "learning_rate": 0.0001, + "loss": 1.4856, + "step": 1503 + }, + { + "epoch": 0.5035152326749247, + "grad_norm": 0.13477809727191925, + "learning_rate": 0.0001, + "loss": 1.6185, + "step": 1504 + }, + { + "epoch": 0.5038500167392033, + "grad_norm": 0.12459214776754379, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 1505 + }, + { + "epoch": 0.5041848008034817, + "grad_norm": 0.12989842891693115, + "learning_rate": 0.0001, + "loss": 1.5325, + "step": 1506 + }, + { + "epoch": 0.5045195848677603, + "grad_norm": 0.12878334522247314, + "learning_rate": 0.0001, + "loss": 1.6504, + "step": 1507 + }, + { + "epoch": 0.5048543689320388, + "grad_norm": 0.14765828847885132, + "learning_rate": 0.0001, + "loss": 1.5978, + "step": 1508 + }, + { + "epoch": 0.5051891529963174, + "grad_norm": 0.1294100284576416, + "learning_rate": 0.0001, + "loss": 1.6909, + "step": 1509 + }, + { + "epoch": 0.505523937060596, + "grad_norm": 0.1304991990327835, + "learning_rate": 0.0001, + "loss": 1.6513, + "step": 1510 + }, + { + "epoch": 0.5058587211248745, + "grad_norm": 0.1318545788526535, + "learning_rate": 0.0001, + "loss": 1.5489, + "step": 1511 + }, + { + "epoch": 0.506193505189153, + "grad_norm": 0.13185527920722961, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 1512 + }, + { + "epoch": 0.5065282892534315, + "grad_norm": 0.13133597373962402, + "learning_rate": 0.0001, + "loss": 1.5853, + "step": 1513 + }, + { + "epoch": 0.5068630733177101, + "grad_norm": 0.14132916927337646, + "learning_rate": 0.0001, + "loss": 1.6844, + "step": 1514 + }, + { + "epoch": 0.5071978573819886, + "grad_norm": 0.12680397927761078, + "learning_rate": 0.0001, + "loss": 1.6048, + "step": 1515 + }, + { + "epoch": 0.5075326414462672, + "grad_norm": 0.125723198056221, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 1516 + }, + { + "epoch": 0.5078674255105456, + "grad_norm": 0.135573148727417, + "learning_rate": 0.0001, + "loss": 1.6619, + "step": 1517 + }, + { + "epoch": 0.5082022095748242, + "grad_norm": 0.12755006551742554, + "learning_rate": 0.0001, + "loss": 1.5376, + "step": 1518 + }, + { + "epoch": 0.5085369936391028, + "grad_norm": 0.1527450680732727, + "learning_rate": 0.0001, + "loss": 1.4984, + "step": 1519 + }, + { + "epoch": 0.5088717777033813, + "grad_norm": 0.12978217005729675, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 1520 + }, + { + "epoch": 0.5092065617676599, + "grad_norm": 0.13393737375736237, + "learning_rate": 0.0001, + "loss": 1.5267, + "step": 1521 + }, + { + "epoch": 0.5095413458319384, + "grad_norm": 0.13406458497047424, + "learning_rate": 0.0001, + "loss": 1.4858, + "step": 1522 + }, + { + "epoch": 0.5098761298962169, + "grad_norm": 0.13214215636253357, + "learning_rate": 0.0001, + "loss": 1.5391, + "step": 1523 + }, + { + "epoch": 0.5102109139604954, + "grad_norm": 0.13335101306438446, + "learning_rate": 0.0001, + "loss": 1.5791, + "step": 1524 + }, + { + "epoch": 0.510545698024774, + "grad_norm": 0.12885718047618866, + "learning_rate": 0.0001, + "loss": 1.532, + "step": 1525 + }, + { + "epoch": 0.5108804820890526, + "grad_norm": 0.12838226556777954, + "learning_rate": 0.0001, + "loss": 1.5186, + "step": 1526 + }, + { + "epoch": 0.5112152661533311, + "grad_norm": 0.13160903751850128, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 1527 + }, + { + "epoch": 0.5115500502176097, + "grad_norm": 0.1264614462852478, + "learning_rate": 0.0001, + "loss": 1.6005, + "step": 1528 + }, + { + "epoch": 0.5118848342818881, + "grad_norm": 0.13425403833389282, + "learning_rate": 0.0001, + "loss": 1.5413, + "step": 1529 + }, + { + "epoch": 0.5122196183461667, + "grad_norm": 0.12175809592008591, + "learning_rate": 0.0001, + "loss": 1.5128, + "step": 1530 + }, + { + "epoch": 0.5125544024104453, + "grad_norm": 0.1299484223127365, + "learning_rate": 0.0001, + "loss": 1.4981, + "step": 1531 + }, + { + "epoch": 0.5128891864747238, + "grad_norm": 0.12358542531728745, + "learning_rate": 0.0001, + "loss": 1.4794, + "step": 1532 + }, + { + "epoch": 0.5132239705390024, + "grad_norm": 0.12457676231861115, + "learning_rate": 0.0001, + "loss": 1.462, + "step": 1533 + }, + { + "epoch": 0.5135587546032809, + "grad_norm": 0.12775678932666779, + "learning_rate": 0.0001, + "loss": 1.4993, + "step": 1534 + }, + { + "epoch": 0.5138935386675594, + "grad_norm": 0.12386265397071838, + "learning_rate": 0.0001, + "loss": 1.504, + "step": 1535 + }, + { + "epoch": 0.5142283227318379, + "grad_norm": 0.13995805382728577, + "learning_rate": 0.0001, + "loss": 1.5912, + "step": 1536 + }, + { + "epoch": 0.5145631067961165, + "grad_norm": 0.1274706870317459, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 1537 + }, + { + "epoch": 0.5148978908603951, + "grad_norm": 0.12781144678592682, + "learning_rate": 0.0001, + "loss": 1.5379, + "step": 1538 + }, + { + "epoch": 0.5152326749246736, + "grad_norm": 0.12408823519945145, + "learning_rate": 0.0001, + "loss": 1.4709, + "step": 1539 + }, + { + "epoch": 0.5155674589889522, + "grad_norm": 0.12711866199970245, + "learning_rate": 0.0001, + "loss": 1.5529, + "step": 1540 + }, + { + "epoch": 0.5159022430532306, + "grad_norm": 0.12433881312608719, + "learning_rate": 0.0001, + "loss": 1.4641, + "step": 1541 + }, + { + "epoch": 0.5162370271175092, + "grad_norm": 0.13031256198883057, + "learning_rate": 0.0001, + "loss": 1.6042, + "step": 1542 + }, + { + "epoch": 0.5165718111817877, + "grad_norm": 0.1294173002243042, + "learning_rate": 0.0001, + "loss": 1.5269, + "step": 1543 + }, + { + "epoch": 0.5169065952460663, + "grad_norm": 0.1273273229598999, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 1544 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 0.13191919028759003, + "learning_rate": 0.0001, + "loss": 1.5684, + "step": 1545 + }, + { + "epoch": 0.5175761633746234, + "grad_norm": 0.13768093287944794, + "learning_rate": 0.0001, + "loss": 1.555, + "step": 1546 + }, + { + "epoch": 0.5179109474389019, + "grad_norm": 0.12926150858402252, + "learning_rate": 0.0001, + "loss": 1.4731, + "step": 1547 + }, + { + "epoch": 0.5182457315031804, + "grad_norm": 0.12586715817451477, + "learning_rate": 0.0001, + "loss": 1.4794, + "step": 1548 + }, + { + "epoch": 0.518580515567459, + "grad_norm": 0.12548579275608063, + "learning_rate": 0.0001, + "loss": 1.5266, + "step": 1549 + }, + { + "epoch": 0.5189152996317375, + "grad_norm": 0.12171539664268494, + "learning_rate": 0.0001, + "loss": 1.4205, + "step": 1550 + }, + { + "epoch": 0.5192500836960161, + "grad_norm": 0.13130709528923035, + "learning_rate": 0.0001, + "loss": 1.5927, + "step": 1551 + }, + { + "epoch": 0.5195848677602946, + "grad_norm": 0.1342555582523346, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 1552 + }, + { + "epoch": 0.5199196518245731, + "grad_norm": 0.12991021573543549, + "learning_rate": 0.0001, + "loss": 1.646, + "step": 1553 + }, + { + "epoch": 0.5202544358888517, + "grad_norm": 0.13074184954166412, + "learning_rate": 0.0001, + "loss": 1.4619, + "step": 1554 + }, + { + "epoch": 0.5205892199531302, + "grad_norm": 0.12969058752059937, + "learning_rate": 0.0001, + "loss": 1.5048, + "step": 1555 + }, + { + "epoch": 0.5209240040174088, + "grad_norm": 0.12283259630203247, + "learning_rate": 0.0001, + "loss": 1.4968, + "step": 1556 + }, + { + "epoch": 0.5212587880816874, + "grad_norm": 0.14244720339775085, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 1557 + }, + { + "epoch": 0.5215935721459658, + "grad_norm": 0.12856322526931763, + "learning_rate": 0.0001, + "loss": 1.5382, + "step": 1558 + }, + { + "epoch": 0.5219283562102444, + "grad_norm": 0.1262657344341278, + "learning_rate": 0.0001, + "loss": 1.5191, + "step": 1559 + }, + { + "epoch": 0.5222631402745229, + "grad_norm": 0.1350589543581009, + "learning_rate": 0.0001, + "loss": 1.5812, + "step": 1560 + }, + { + "epoch": 0.5225979243388015, + "grad_norm": 0.13602742552757263, + "learning_rate": 0.0001, + "loss": 1.6252, + "step": 1561 + }, + { + "epoch": 0.52293270840308, + "grad_norm": 0.1273350566625595, + "learning_rate": 0.0001, + "loss": 1.5607, + "step": 1562 + }, + { + "epoch": 0.5232674924673586, + "grad_norm": 0.1261235773563385, + "learning_rate": 0.0001, + "loss": 1.4537, + "step": 1563 + }, + { + "epoch": 0.523602276531637, + "grad_norm": 0.123395174741745, + "learning_rate": 0.0001, + "loss": 1.4839, + "step": 1564 + }, + { + "epoch": 0.5239370605959156, + "grad_norm": 0.12707623839378357, + "learning_rate": 0.0001, + "loss": 1.5671, + "step": 1565 + }, + { + "epoch": 0.5242718446601942, + "grad_norm": 0.119587741792202, + "learning_rate": 0.0001, + "loss": 1.4637, + "step": 1566 + }, + { + "epoch": 0.5246066287244727, + "grad_norm": 0.12568604946136475, + "learning_rate": 0.0001, + "loss": 1.5196, + "step": 1567 + }, + { + "epoch": 0.5249414127887513, + "grad_norm": 0.13292740285396576, + "learning_rate": 0.0001, + "loss": 1.5909, + "step": 1568 + }, + { + "epoch": 0.5252761968530298, + "grad_norm": 0.13198155164718628, + "learning_rate": 0.0001, + "loss": 1.6039, + "step": 1569 + }, + { + "epoch": 0.5256109809173083, + "grad_norm": 0.12587766349315643, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 1570 + }, + { + "epoch": 0.5259457649815868, + "grad_norm": 0.12726300954818726, + "learning_rate": 0.0001, + "loss": 1.5366, + "step": 1571 + }, + { + "epoch": 0.5262805490458654, + "grad_norm": 0.12479355186223984, + "learning_rate": 0.0001, + "loss": 1.5486, + "step": 1572 + }, + { + "epoch": 0.526615333110144, + "grad_norm": 0.1242307722568512, + "learning_rate": 0.0001, + "loss": 1.4547, + "step": 1573 + }, + { + "epoch": 0.5269501171744225, + "grad_norm": 0.12753188610076904, + "learning_rate": 0.0001, + "loss": 1.6649, + "step": 1574 + }, + { + "epoch": 0.5272849012387011, + "grad_norm": 0.12815521657466888, + "learning_rate": 0.0001, + "loss": 1.4489, + "step": 1575 + }, + { + "epoch": 0.5276196853029795, + "grad_norm": 0.1192578375339508, + "learning_rate": 0.0001, + "loss": 1.4078, + "step": 1576 + }, + { + "epoch": 0.5279544693672581, + "grad_norm": 0.12596169114112854, + "learning_rate": 0.0001, + "loss": 1.5369, + "step": 1577 + }, + { + "epoch": 0.5282892534315367, + "grad_norm": 0.13193419575691223, + "learning_rate": 0.0001, + "loss": 1.5601, + "step": 1578 + }, + { + "epoch": 0.5286240374958152, + "grad_norm": 0.1277266889810562, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 1579 + }, + { + "epoch": 0.5289588215600938, + "grad_norm": 0.12819704413414001, + "learning_rate": 0.0001, + "loss": 1.4713, + "step": 1580 + }, + { + "epoch": 0.5292936056243723, + "grad_norm": 0.1399090439081192, + "learning_rate": 0.0001, + "loss": 1.5978, + "step": 1581 + }, + { + "epoch": 0.5296283896886508, + "grad_norm": 0.1373160183429718, + "learning_rate": 0.0001, + "loss": 1.6614, + "step": 1582 + }, + { + "epoch": 0.5299631737529293, + "grad_norm": 0.1253012716770172, + "learning_rate": 0.0001, + "loss": 1.5317, + "step": 1583 + }, + { + "epoch": 0.5302979578172079, + "grad_norm": 0.124544158577919, + "learning_rate": 0.0001, + "loss": 1.4947, + "step": 1584 + }, + { + "epoch": 0.5306327418814865, + "grad_norm": 0.13060353696346283, + "learning_rate": 0.0001, + "loss": 1.5342, + "step": 1585 + }, + { + "epoch": 0.530967525945765, + "grad_norm": 0.12680500745773315, + "learning_rate": 0.0001, + "loss": 1.4597, + "step": 1586 + }, + { + "epoch": 0.5313023100100436, + "grad_norm": 0.13112664222717285, + "learning_rate": 0.0001, + "loss": 1.5978, + "step": 1587 + }, + { + "epoch": 0.531637094074322, + "grad_norm": 0.13016077876091003, + "learning_rate": 0.0001, + "loss": 1.5575, + "step": 1588 + }, + { + "epoch": 0.5319718781386006, + "grad_norm": 0.1273767054080963, + "learning_rate": 0.0001, + "loss": 1.607, + "step": 1589 + }, + { + "epoch": 0.5323066622028791, + "grad_norm": 0.1310475915670395, + "learning_rate": 0.0001, + "loss": 1.5066, + "step": 1590 + }, + { + "epoch": 0.5326414462671577, + "grad_norm": 0.12938565015792847, + "learning_rate": 0.0001, + "loss": 1.4933, + "step": 1591 + }, + { + "epoch": 0.5329762303314363, + "grad_norm": 0.12316200882196426, + "learning_rate": 0.0001, + "loss": 1.4752, + "step": 1592 + }, + { + "epoch": 0.5333110143957147, + "grad_norm": 0.13205035030841827, + "learning_rate": 0.0001, + "loss": 1.5061, + "step": 1593 + }, + { + "epoch": 0.5336457984599933, + "grad_norm": 0.12517520785331726, + "learning_rate": 0.0001, + "loss": 1.5237, + "step": 1594 + }, + { + "epoch": 0.5339805825242718, + "grad_norm": 0.1309306025505066, + "learning_rate": 0.0001, + "loss": 1.5975, + "step": 1595 + }, + { + "epoch": 0.5343153665885504, + "grad_norm": 0.13565212488174438, + "learning_rate": 0.0001, + "loss": 1.6888, + "step": 1596 + }, + { + "epoch": 0.534650150652829, + "grad_norm": 0.13044795393943787, + "learning_rate": 0.0001, + "loss": 1.547, + "step": 1597 + }, + { + "epoch": 0.5349849347171075, + "grad_norm": 0.12757791578769684, + "learning_rate": 0.0001, + "loss": 1.5788, + "step": 1598 + }, + { + "epoch": 0.535319718781386, + "grad_norm": 0.12625539302825928, + "learning_rate": 0.0001, + "loss": 1.6271, + "step": 1599 + }, + { + "epoch": 0.5356545028456645, + "grad_norm": 0.12980274856090546, + "learning_rate": 0.0001, + "loss": 1.4808, + "step": 1600 + }, + { + "epoch": 0.5359892869099431, + "grad_norm": 0.1339329481124878, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 1601 + }, + { + "epoch": 0.5363240709742216, + "grad_norm": 0.13570533692836761, + "learning_rate": 0.0001, + "loss": 1.5526, + "step": 1602 + }, + { + "epoch": 0.5366588550385002, + "grad_norm": 0.13043223321437836, + "learning_rate": 0.0001, + "loss": 1.5046, + "step": 1603 + }, + { + "epoch": 0.5369936391027788, + "grad_norm": 0.1268492341041565, + "learning_rate": 0.0001, + "loss": 1.4846, + "step": 1604 + }, + { + "epoch": 0.5373284231670572, + "grad_norm": 0.12844318151474, + "learning_rate": 0.0001, + "loss": 1.622, + "step": 1605 + }, + { + "epoch": 0.5376632072313358, + "grad_norm": 0.12543794512748718, + "learning_rate": 0.0001, + "loss": 1.4895, + "step": 1606 + }, + { + "epoch": 0.5379979912956143, + "grad_norm": 0.13247263431549072, + "learning_rate": 0.0001, + "loss": 1.5431, + "step": 1607 + }, + { + "epoch": 0.5383327753598929, + "grad_norm": 0.12495877593755722, + "learning_rate": 0.0001, + "loss": 1.5534, + "step": 1608 + }, + { + "epoch": 0.5386675594241714, + "grad_norm": 0.12770773470401764, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 1609 + }, + { + "epoch": 0.53900234348845, + "grad_norm": 0.1249793991446495, + "learning_rate": 0.0001, + "loss": 1.549, + "step": 1610 + }, + { + "epoch": 0.5393371275527284, + "grad_norm": 0.13602420687675476, + "learning_rate": 0.0001, + "loss": 1.6911, + "step": 1611 + }, + { + "epoch": 0.539671911617007, + "grad_norm": 0.1260257512331009, + "learning_rate": 0.0001, + "loss": 1.6155, + "step": 1612 + }, + { + "epoch": 0.5400066956812856, + "grad_norm": 0.13716067373752594, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 1613 + }, + { + "epoch": 0.5403414797455641, + "grad_norm": 0.12322457879781723, + "learning_rate": 0.0001, + "loss": 1.4567, + "step": 1614 + }, + { + "epoch": 0.5406762638098427, + "grad_norm": 0.1295168548822403, + "learning_rate": 0.0001, + "loss": 1.5388, + "step": 1615 + }, + { + "epoch": 0.5410110478741212, + "grad_norm": 0.13598200678825378, + "learning_rate": 0.0001, + "loss": 1.6189, + "step": 1616 + }, + { + "epoch": 0.5413458319383997, + "grad_norm": 0.12514351308345795, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 1617 + }, + { + "epoch": 0.5416806160026783, + "grad_norm": 0.13243642449378967, + "learning_rate": 0.0001, + "loss": 1.5211, + "step": 1618 + }, + { + "epoch": 0.5420154000669568, + "grad_norm": 0.14331547915935516, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 1619 + }, + { + "epoch": 0.5423501841312354, + "grad_norm": 0.13204847276210785, + "learning_rate": 0.0001, + "loss": 1.6131, + "step": 1620 + }, + { + "epoch": 0.5426849681955139, + "grad_norm": 0.13828937709331512, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 1621 + }, + { + "epoch": 0.5430197522597925, + "grad_norm": 0.13166444003582, + "learning_rate": 0.0001, + "loss": 1.556, + "step": 1622 + }, + { + "epoch": 0.5433545363240709, + "grad_norm": 0.131551131606102, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 1623 + }, + { + "epoch": 0.5436893203883495, + "grad_norm": 0.1386868953704834, + "learning_rate": 0.0001, + "loss": 1.626, + "step": 1624 + }, + { + "epoch": 0.544024104452628, + "grad_norm": 0.12754793465137482, + "learning_rate": 0.0001, + "loss": 1.5419, + "step": 1625 + }, + { + "epoch": 0.5443588885169066, + "grad_norm": 0.13059911131858826, + "learning_rate": 0.0001, + "loss": 1.5886, + "step": 1626 + }, + { + "epoch": 0.5446936725811852, + "grad_norm": 0.13056625425815582, + "learning_rate": 0.0001, + "loss": 1.5093, + "step": 1627 + }, + { + "epoch": 0.5450284566454636, + "grad_norm": 0.12965354323387146, + "learning_rate": 0.0001, + "loss": 1.5766, + "step": 1628 + }, + { + "epoch": 0.5453632407097422, + "grad_norm": 0.12052886188030243, + "learning_rate": 0.0001, + "loss": 1.5315, + "step": 1629 + }, + { + "epoch": 0.5456980247740207, + "grad_norm": 0.12897798418998718, + "learning_rate": 0.0001, + "loss": 1.6129, + "step": 1630 + }, + { + "epoch": 0.5460328088382993, + "grad_norm": 0.12880270183086395, + "learning_rate": 0.0001, + "loss": 1.6111, + "step": 1631 + }, + { + "epoch": 0.5463675929025779, + "grad_norm": 0.13251414895057678, + "learning_rate": 0.0001, + "loss": 1.5786, + "step": 1632 + }, + { + "epoch": 0.5467023769668564, + "grad_norm": 0.13067522644996643, + "learning_rate": 0.0001, + "loss": 1.5724, + "step": 1633 + }, + { + "epoch": 0.5470371610311349, + "grad_norm": 0.127615824341774, + "learning_rate": 0.0001, + "loss": 1.4672, + "step": 1634 + }, + { + "epoch": 0.5473719450954134, + "grad_norm": 0.12785358726978302, + "learning_rate": 0.0001, + "loss": 1.4379, + "step": 1635 + }, + { + "epoch": 0.547706729159692, + "grad_norm": 0.1336808055639267, + "learning_rate": 0.0001, + "loss": 1.5894, + "step": 1636 + }, + { + "epoch": 0.5480415132239705, + "grad_norm": 0.12709666788578033, + "learning_rate": 0.0001, + "loss": 1.5646, + "step": 1637 + }, + { + "epoch": 0.5483762972882491, + "grad_norm": 0.1278083175420761, + "learning_rate": 0.0001, + "loss": 1.5481, + "step": 1638 + }, + { + "epoch": 0.5487110813525277, + "grad_norm": 0.1273607462644577, + "learning_rate": 0.0001, + "loss": 1.6099, + "step": 1639 + }, + { + "epoch": 0.5490458654168061, + "grad_norm": 0.13073420524597168, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 1640 + }, + { + "epoch": 0.5493806494810847, + "grad_norm": 0.12339271605014801, + "learning_rate": 0.0001, + "loss": 1.4866, + "step": 1641 + }, + { + "epoch": 0.5497154335453632, + "grad_norm": 0.12296874821186066, + "learning_rate": 0.0001, + "loss": 1.4542, + "step": 1642 + }, + { + "epoch": 0.5500502176096418, + "grad_norm": 0.12228816747665405, + "learning_rate": 0.0001, + "loss": 1.5008, + "step": 1643 + }, + { + "epoch": 0.5503850016739203, + "grad_norm": 0.12167999148368835, + "learning_rate": 0.0001, + "loss": 1.4793, + "step": 1644 + }, + { + "epoch": 0.5507197857381989, + "grad_norm": 0.1323646754026413, + "learning_rate": 0.0001, + "loss": 1.6053, + "step": 1645 + }, + { + "epoch": 0.5510545698024774, + "grad_norm": 0.13682882487773895, + "learning_rate": 0.0001, + "loss": 1.5962, + "step": 1646 + }, + { + "epoch": 0.5513893538667559, + "grad_norm": 0.13337336480617523, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 1647 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.12662284076213837, + "learning_rate": 0.0001, + "loss": 1.4729, + "step": 1648 + }, + { + "epoch": 0.552058921995313, + "grad_norm": 0.13070893287658691, + "learning_rate": 0.0001, + "loss": 1.5548, + "step": 1649 + }, + { + "epoch": 0.5523937060595916, + "grad_norm": 0.1237405389547348, + "learning_rate": 0.0001, + "loss": 1.5731, + "step": 1650 + }, + { + "epoch": 0.5527284901238702, + "grad_norm": 0.12684407830238342, + "learning_rate": 0.0001, + "loss": 1.5927, + "step": 1651 + }, + { + "epoch": 0.5530632741881486, + "grad_norm": 0.13257922232151031, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 1652 + }, + { + "epoch": 0.5533980582524272, + "grad_norm": 0.12506547570228577, + "learning_rate": 0.0001, + "loss": 1.4954, + "step": 1653 + }, + { + "epoch": 0.5537328423167057, + "grad_norm": 0.13652825355529785, + "learning_rate": 0.0001, + "loss": 1.5936, + "step": 1654 + }, + { + "epoch": 0.5540676263809843, + "grad_norm": 0.1281632035970688, + "learning_rate": 0.0001, + "loss": 1.5239, + "step": 1655 + }, + { + "epoch": 0.5544024104452628, + "grad_norm": 0.1302935630083084, + "learning_rate": 0.0001, + "loss": 1.5731, + "step": 1656 + }, + { + "epoch": 0.5547371945095414, + "grad_norm": 0.13843512535095215, + "learning_rate": 0.0001, + "loss": 1.6028, + "step": 1657 + }, + { + "epoch": 0.5550719785738198, + "grad_norm": 0.13132615387439728, + "learning_rate": 0.0001, + "loss": 1.5167, + "step": 1658 + }, + { + "epoch": 0.5554067626380984, + "grad_norm": 0.1269274204969406, + "learning_rate": 0.0001, + "loss": 1.3276, + "step": 1659 + }, + { + "epoch": 0.555741546702377, + "grad_norm": 0.14026238024234772, + "learning_rate": 0.0001, + "loss": 1.5699, + "step": 1660 + }, + { + "epoch": 0.5560763307666555, + "grad_norm": 0.13259948790073395, + "learning_rate": 0.0001, + "loss": 1.5627, + "step": 1661 + }, + { + "epoch": 0.5564111148309341, + "grad_norm": 0.1282505840063095, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 1662 + }, + { + "epoch": 0.5567458988952126, + "grad_norm": 0.14385761320590973, + "learning_rate": 0.0001, + "loss": 1.5731, + "step": 1663 + }, + { + "epoch": 0.5570806829594911, + "grad_norm": 0.12249067425727844, + "learning_rate": 0.0001, + "loss": 1.5416, + "step": 1664 + }, + { + "epoch": 0.5574154670237697, + "grad_norm": 0.13182908296585083, + "learning_rate": 0.0001, + "loss": 1.5313, + "step": 1665 + }, + { + "epoch": 0.5577502510880482, + "grad_norm": 0.14085689187049866, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 1666 + }, + { + "epoch": 0.5580850351523268, + "grad_norm": 0.14808295667171478, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 1667 + }, + { + "epoch": 0.5584198192166053, + "grad_norm": 0.13931553065776825, + "learning_rate": 0.0001, + "loss": 1.5729, + "step": 1668 + }, + { + "epoch": 0.5587546032808838, + "grad_norm": 0.14633771777153015, + "learning_rate": 0.0001, + "loss": 1.5433, + "step": 1669 + }, + { + "epoch": 0.5590893873451623, + "grad_norm": 0.1228380873799324, + "learning_rate": 0.0001, + "loss": 1.544, + "step": 1670 + }, + { + "epoch": 0.5594241714094409, + "grad_norm": 0.12809088826179504, + "learning_rate": 0.0001, + "loss": 1.5724, + "step": 1671 + }, + { + "epoch": 0.5597589554737195, + "grad_norm": 0.13453969359397888, + "learning_rate": 0.0001, + "loss": 1.5062, + "step": 1672 + }, + { + "epoch": 0.560093739537998, + "grad_norm": 0.13969993591308594, + "learning_rate": 0.0001, + "loss": 1.6302, + "step": 1673 + }, + { + "epoch": 0.5604285236022766, + "grad_norm": 0.13022400438785553, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 1674 + }, + { + "epoch": 0.560763307666555, + "grad_norm": 0.13372890651226044, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 1675 + }, + { + "epoch": 0.5610980917308336, + "grad_norm": 0.1426994502544403, + "learning_rate": 0.0001, + "loss": 1.5737, + "step": 1676 + }, + { + "epoch": 0.5614328757951121, + "grad_norm": 0.1358005702495575, + "learning_rate": 0.0001, + "loss": 1.5812, + "step": 1677 + }, + { + "epoch": 0.5617676598593907, + "grad_norm": 0.1320638507604599, + "learning_rate": 0.0001, + "loss": 1.5414, + "step": 1678 + }, + { + "epoch": 0.5621024439236693, + "grad_norm": 0.13449324667453766, + "learning_rate": 0.0001, + "loss": 1.4752, + "step": 1679 + }, + { + "epoch": 0.5624372279879478, + "grad_norm": 0.13063769042491913, + "learning_rate": 0.0001, + "loss": 1.5002, + "step": 1680 + }, + { + "epoch": 0.5627720120522263, + "grad_norm": 0.12591435015201569, + "learning_rate": 0.0001, + "loss": 1.5331, + "step": 1681 + }, + { + "epoch": 0.5631067961165048, + "grad_norm": 0.144126296043396, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 1682 + }, + { + "epoch": 0.5634415801807834, + "grad_norm": 0.13355223834514618, + "learning_rate": 0.0001, + "loss": 1.546, + "step": 1683 + }, + { + "epoch": 0.563776364245062, + "grad_norm": 0.12519478797912598, + "learning_rate": 0.0001, + "loss": 1.5836, + "step": 1684 + }, + { + "epoch": 0.5641111483093405, + "grad_norm": 0.1350811868906021, + "learning_rate": 0.0001, + "loss": 1.577, + "step": 1685 + }, + { + "epoch": 0.5644459323736191, + "grad_norm": 0.14059753715991974, + "learning_rate": 0.0001, + "loss": 1.5457, + "step": 1686 + }, + { + "epoch": 0.5647807164378975, + "grad_norm": 0.13620074093341827, + "learning_rate": 0.0001, + "loss": 1.5318, + "step": 1687 + }, + { + "epoch": 0.5651155005021761, + "grad_norm": 0.13117417693138123, + "learning_rate": 0.0001, + "loss": 1.5413, + "step": 1688 + }, + { + "epoch": 0.5654502845664546, + "grad_norm": 0.14555278420448303, + "learning_rate": 0.0001, + "loss": 1.5775, + "step": 1689 + }, + { + "epoch": 0.5657850686307332, + "grad_norm": 0.12660092115402222, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 1690 + }, + { + "epoch": 0.5661198526950117, + "grad_norm": 0.12967108190059662, + "learning_rate": 0.0001, + "loss": 1.5755, + "step": 1691 + }, + { + "epoch": 0.5664546367592903, + "grad_norm": 0.13999544084072113, + "learning_rate": 0.0001, + "loss": 1.4471, + "step": 1692 + }, + { + "epoch": 0.5667894208235688, + "grad_norm": 0.13235735893249512, + "learning_rate": 0.0001, + "loss": 1.4967, + "step": 1693 + }, + { + "epoch": 0.5671242048878473, + "grad_norm": 0.1373562067747116, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 1694 + }, + { + "epoch": 0.5674589889521259, + "grad_norm": 0.1320851445198059, + "learning_rate": 0.0001, + "loss": 1.5259, + "step": 1695 + }, + { + "epoch": 0.5677937730164044, + "grad_norm": 0.13309001922607422, + "learning_rate": 0.0001, + "loss": 1.5604, + "step": 1696 + }, + { + "epoch": 0.568128557080683, + "grad_norm": 0.12666000425815582, + "learning_rate": 0.0001, + "loss": 1.5352, + "step": 1697 + }, + { + "epoch": 0.5684633411449616, + "grad_norm": 0.12397143244743347, + "learning_rate": 0.0001, + "loss": 1.5474, + "step": 1698 + }, + { + "epoch": 0.56879812520924, + "grad_norm": 0.1286936104297638, + "learning_rate": 0.0001, + "loss": 1.5125, + "step": 1699 + }, + { + "epoch": 0.5691329092735186, + "grad_norm": 0.12525172531604767, + "learning_rate": 0.0001, + "loss": 1.4172, + "step": 1700 + }, + { + "epoch": 0.5694676933377971, + "grad_norm": 0.13234922289848328, + "learning_rate": 0.0001, + "loss": 1.5374, + "step": 1701 + }, + { + "epoch": 0.5698024774020757, + "grad_norm": 0.13341423869132996, + "learning_rate": 0.0001, + "loss": 1.5615, + "step": 1702 + }, + { + "epoch": 0.5701372614663542, + "grad_norm": 0.12672466039657593, + "learning_rate": 0.0001, + "loss": 1.4147, + "step": 1703 + }, + { + "epoch": 0.5704720455306327, + "grad_norm": 0.13073183596134186, + "learning_rate": 0.0001, + "loss": 1.5237, + "step": 1704 + }, + { + "epoch": 0.5708068295949112, + "grad_norm": 0.13044412434101105, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 1705 + }, + { + "epoch": 0.5711416136591898, + "grad_norm": 0.13865146040916443, + "learning_rate": 0.0001, + "loss": 1.5648, + "step": 1706 + }, + { + "epoch": 0.5714763977234684, + "grad_norm": 0.13418787717819214, + "learning_rate": 0.0001, + "loss": 1.5948, + "step": 1707 + }, + { + "epoch": 0.5718111817877469, + "grad_norm": 0.1279216855764389, + "learning_rate": 0.0001, + "loss": 1.5465, + "step": 1708 + }, + { + "epoch": 0.5721459658520255, + "grad_norm": 0.13305789232254028, + "learning_rate": 0.0001, + "loss": 1.5768, + "step": 1709 + }, + { + "epoch": 0.5724807499163039, + "grad_norm": 0.12358289957046509, + "learning_rate": 0.0001, + "loss": 1.4377, + "step": 1710 + }, + { + "epoch": 0.5728155339805825, + "grad_norm": 0.128280371427536, + "learning_rate": 0.0001, + "loss": 1.5684, + "step": 1711 + }, + { + "epoch": 0.573150318044861, + "grad_norm": 0.1336420327425003, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 1712 + }, + { + "epoch": 0.5734851021091396, + "grad_norm": 0.13142135739326477, + "learning_rate": 0.0001, + "loss": 1.5821, + "step": 1713 + }, + { + "epoch": 0.5738198861734182, + "grad_norm": 0.1367759257555008, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 1714 + }, + { + "epoch": 0.5741546702376967, + "grad_norm": 0.1364768147468567, + "learning_rate": 0.0001, + "loss": 1.4889, + "step": 1715 + }, + { + "epoch": 0.5744894543019752, + "grad_norm": 0.12675487995147705, + "learning_rate": 0.0001, + "loss": 1.5789, + "step": 1716 + }, + { + "epoch": 0.5748242383662537, + "grad_norm": 0.13054460287094116, + "learning_rate": 0.0001, + "loss": 1.5653, + "step": 1717 + }, + { + "epoch": 0.5751590224305323, + "grad_norm": 0.14481523633003235, + "learning_rate": 0.0001, + "loss": 1.6135, + "step": 1718 + }, + { + "epoch": 0.5754938064948109, + "grad_norm": 0.1317768394947052, + "learning_rate": 0.0001, + "loss": 1.5015, + "step": 1719 + }, + { + "epoch": 0.5758285905590894, + "grad_norm": 0.13205017149448395, + "learning_rate": 0.0001, + "loss": 1.5667, + "step": 1720 + }, + { + "epoch": 0.576163374623368, + "grad_norm": 0.13702328503131866, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 1721 + }, + { + "epoch": 0.5764981586876464, + "grad_norm": 0.13435296714305878, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 1722 + }, + { + "epoch": 0.576832942751925, + "grad_norm": 0.13013921678066254, + "learning_rate": 0.0001, + "loss": 1.5948, + "step": 1723 + }, + { + "epoch": 0.5771677268162035, + "grad_norm": 0.12254009395837784, + "learning_rate": 0.0001, + "loss": 1.485, + "step": 1724 + }, + { + "epoch": 0.5775025108804821, + "grad_norm": 0.13023540377616882, + "learning_rate": 0.0001, + "loss": 1.6237, + "step": 1725 + }, + { + "epoch": 0.5778372949447607, + "grad_norm": 0.1339290589094162, + "learning_rate": 0.0001, + "loss": 1.5983, + "step": 1726 + }, + { + "epoch": 0.5781720790090392, + "grad_norm": 0.13126787543296814, + "learning_rate": 0.0001, + "loss": 1.5947, + "step": 1727 + }, + { + "epoch": 0.5785068630733177, + "grad_norm": 0.12525591254234314, + "learning_rate": 0.0001, + "loss": 1.4519, + "step": 1728 + }, + { + "epoch": 0.5788416471375962, + "grad_norm": 0.12789173424243927, + "learning_rate": 0.0001, + "loss": 1.5293, + "step": 1729 + }, + { + "epoch": 0.5791764312018748, + "grad_norm": 0.12775948643684387, + "learning_rate": 0.0001, + "loss": 1.5971, + "step": 1730 + }, + { + "epoch": 0.5795112152661533, + "grad_norm": 0.13437266647815704, + "learning_rate": 0.0001, + "loss": 1.595, + "step": 1731 + }, + { + "epoch": 0.5798459993304319, + "grad_norm": 0.13249057531356812, + "learning_rate": 0.0001, + "loss": 1.5524, + "step": 1732 + }, + { + "epoch": 0.5801807833947105, + "grad_norm": 0.12838158011436462, + "learning_rate": 0.0001, + "loss": 1.4641, + "step": 1733 + }, + { + "epoch": 0.5805155674589889, + "grad_norm": 0.1311095654964447, + "learning_rate": 0.0001, + "loss": 1.5964, + "step": 1734 + }, + { + "epoch": 0.5808503515232675, + "grad_norm": 0.12928825616836548, + "learning_rate": 0.0001, + "loss": 1.5153, + "step": 1735 + }, + { + "epoch": 0.581185135587546, + "grad_norm": 0.1317373663187027, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 1736 + }, + { + "epoch": 0.5815199196518246, + "grad_norm": 0.1291595846414566, + "learning_rate": 0.0001, + "loss": 1.4974, + "step": 1737 + }, + { + "epoch": 0.5818547037161031, + "grad_norm": 0.12890678644180298, + "learning_rate": 0.0001, + "loss": 1.5778, + "step": 1738 + }, + { + "epoch": 0.5821894877803817, + "grad_norm": 0.13605663180351257, + "learning_rate": 0.0001, + "loss": 1.5206, + "step": 1739 + }, + { + "epoch": 0.5825242718446602, + "grad_norm": 0.12535326182842255, + "learning_rate": 0.0001, + "loss": 1.4989, + "step": 1740 + }, + { + "epoch": 0.5828590559089387, + "grad_norm": 0.13682806491851807, + "learning_rate": 0.0001, + "loss": 1.5558, + "step": 1741 + }, + { + "epoch": 0.5831938399732173, + "grad_norm": 0.12900637090206146, + "learning_rate": 0.0001, + "loss": 1.5687, + "step": 1742 + }, + { + "epoch": 0.5835286240374958, + "grad_norm": 0.1287071406841278, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 1743 + }, + { + "epoch": 0.5838634081017744, + "grad_norm": 0.12810088694095612, + "learning_rate": 0.0001, + "loss": 1.5363, + "step": 1744 + }, + { + "epoch": 0.5841981921660528, + "grad_norm": 0.13105565309524536, + "learning_rate": 0.0001, + "loss": 1.5633, + "step": 1745 + }, + { + "epoch": 0.5845329762303314, + "grad_norm": 0.13414978981018066, + "learning_rate": 0.0001, + "loss": 1.5965, + "step": 1746 + }, + { + "epoch": 0.58486776029461, + "grad_norm": 0.12767766416072845, + "learning_rate": 0.0001, + "loss": 1.517, + "step": 1747 + }, + { + "epoch": 0.5852025443588885, + "grad_norm": 0.12798413634300232, + "learning_rate": 0.0001, + "loss": 1.4184, + "step": 1748 + }, + { + "epoch": 0.5855373284231671, + "grad_norm": 0.13183465600013733, + "learning_rate": 0.0001, + "loss": 1.4812, + "step": 1749 + }, + { + "epoch": 0.5858721124874456, + "grad_norm": 0.12950639426708221, + "learning_rate": 0.0001, + "loss": 1.4371, + "step": 1750 + }, + { + "epoch": 0.5862068965517241, + "grad_norm": 0.1397038698196411, + "learning_rate": 0.0001, + "loss": 1.5023, + "step": 1751 + }, + { + "epoch": 0.5865416806160026, + "grad_norm": 0.1396951824426651, + "learning_rate": 0.0001, + "loss": 1.5174, + "step": 1752 + }, + { + "epoch": 0.5868764646802812, + "grad_norm": 0.13188160955905914, + "learning_rate": 0.0001, + "loss": 1.511, + "step": 1753 + }, + { + "epoch": 0.5872112487445598, + "grad_norm": 0.13433519005775452, + "learning_rate": 0.0001, + "loss": 1.5214, + "step": 1754 + }, + { + "epoch": 0.5875460328088383, + "grad_norm": 0.13022519648075104, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 1755 + }, + { + "epoch": 0.5878808168731169, + "grad_norm": 0.12651024758815765, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 1756 + }, + { + "epoch": 0.5882156009373953, + "grad_norm": 0.13489894568920135, + "learning_rate": 0.0001, + "loss": 1.5363, + "step": 1757 + }, + { + "epoch": 0.5885503850016739, + "grad_norm": 0.13707391917705536, + "learning_rate": 0.0001, + "loss": 1.6495, + "step": 1758 + }, + { + "epoch": 0.5888851690659525, + "grad_norm": 0.12528660893440247, + "learning_rate": 0.0001, + "loss": 1.5296, + "step": 1759 + }, + { + "epoch": 0.589219953130231, + "grad_norm": 0.14160814881324768, + "learning_rate": 0.0001, + "loss": 1.5977, + "step": 1760 + }, + { + "epoch": 0.5895547371945096, + "grad_norm": 0.12557724118232727, + "learning_rate": 0.0001, + "loss": 1.4915, + "step": 1761 + }, + { + "epoch": 0.5898895212587881, + "grad_norm": 0.12706881761550903, + "learning_rate": 0.0001, + "loss": 1.5775, + "step": 1762 + }, + { + "epoch": 0.5902243053230666, + "grad_norm": 0.13343869149684906, + "learning_rate": 0.0001, + "loss": 1.6033, + "step": 1763 + }, + { + "epoch": 0.5905590893873451, + "grad_norm": 0.1284165382385254, + "learning_rate": 0.0001, + "loss": 1.5255, + "step": 1764 + }, + { + "epoch": 0.5908938734516237, + "grad_norm": 0.12860101461410522, + "learning_rate": 0.0001, + "loss": 1.4694, + "step": 1765 + }, + { + "epoch": 0.5912286575159023, + "grad_norm": 0.12808945775032043, + "learning_rate": 0.0001, + "loss": 1.6068, + "step": 1766 + }, + { + "epoch": 0.5915634415801808, + "grad_norm": 0.13219839334487915, + "learning_rate": 0.0001, + "loss": 1.5519, + "step": 1767 + }, + { + "epoch": 0.5918982256444594, + "grad_norm": 0.12471086531877518, + "learning_rate": 0.0001, + "loss": 1.4465, + "step": 1768 + }, + { + "epoch": 0.5922330097087378, + "grad_norm": 0.13721035420894623, + "learning_rate": 0.0001, + "loss": 1.5656, + "step": 1769 + }, + { + "epoch": 0.5925677937730164, + "grad_norm": 0.1299833208322525, + "learning_rate": 0.0001, + "loss": 1.4767, + "step": 1770 + }, + { + "epoch": 0.5929025778372949, + "grad_norm": 0.13570041954517365, + "learning_rate": 0.0001, + "loss": 1.5929, + "step": 1771 + }, + { + "epoch": 0.5932373619015735, + "grad_norm": 0.12360662966966629, + "learning_rate": 0.0001, + "loss": 1.4179, + "step": 1772 + }, + { + "epoch": 0.5935721459658521, + "grad_norm": 0.138414204120636, + "learning_rate": 0.0001, + "loss": 1.6123, + "step": 1773 + }, + { + "epoch": 0.5939069300301306, + "grad_norm": 0.1347961127758026, + "learning_rate": 0.0001, + "loss": 1.6135, + "step": 1774 + }, + { + "epoch": 0.5942417140944091, + "grad_norm": 0.1333123743534088, + "learning_rate": 0.0001, + "loss": 1.3935, + "step": 1775 + }, + { + "epoch": 0.5945764981586876, + "grad_norm": 0.13112439215183258, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 1776 + }, + { + "epoch": 0.5949112822229662, + "grad_norm": 0.1356613039970398, + "learning_rate": 0.0001, + "loss": 1.5338, + "step": 1777 + }, + { + "epoch": 0.5952460662872447, + "grad_norm": 0.13762056827545166, + "learning_rate": 0.0001, + "loss": 1.5684, + "step": 1778 + }, + { + "epoch": 0.5955808503515233, + "grad_norm": 0.13242678344249725, + "learning_rate": 0.0001, + "loss": 1.5946, + "step": 1779 + }, + { + "epoch": 0.5959156344158018, + "grad_norm": 0.1304038166999817, + "learning_rate": 0.0001, + "loss": 1.5634, + "step": 1780 + }, + { + "epoch": 0.5962504184800803, + "grad_norm": 0.13004854321479797, + "learning_rate": 0.0001, + "loss": 1.5612, + "step": 1781 + }, + { + "epoch": 0.5965852025443589, + "grad_norm": 0.13909399509429932, + "learning_rate": 0.0001, + "loss": 1.5613, + "step": 1782 + }, + { + "epoch": 0.5969199866086374, + "grad_norm": 0.13109537959098816, + "learning_rate": 0.0001, + "loss": 1.5769, + "step": 1783 + }, + { + "epoch": 0.597254770672916, + "grad_norm": 0.13889670372009277, + "learning_rate": 0.0001, + "loss": 1.5788, + "step": 1784 + }, + { + "epoch": 0.5975895547371946, + "grad_norm": 0.12981747090816498, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 1785 + }, + { + "epoch": 0.597924338801473, + "grad_norm": 0.12865106761455536, + "learning_rate": 0.0001, + "loss": 1.5907, + "step": 1786 + }, + { + "epoch": 0.5982591228657516, + "grad_norm": 0.13081815838813782, + "learning_rate": 0.0001, + "loss": 1.6513, + "step": 1787 + }, + { + "epoch": 0.5985939069300301, + "grad_norm": 0.1357847899198532, + "learning_rate": 0.0001, + "loss": 1.6925, + "step": 1788 + }, + { + "epoch": 0.5989286909943087, + "grad_norm": 0.1296125054359436, + "learning_rate": 0.0001, + "loss": 1.5362, + "step": 1789 + }, + { + "epoch": 0.5992634750585872, + "grad_norm": 0.13272371888160706, + "learning_rate": 0.0001, + "loss": 1.669, + "step": 1790 + }, + { + "epoch": 0.5995982591228658, + "grad_norm": 0.1340399980545044, + "learning_rate": 0.0001, + "loss": 1.5674, + "step": 1791 + }, + { + "epoch": 0.5999330431871442, + "grad_norm": 0.12497217208147049, + "learning_rate": 0.0001, + "loss": 1.4629, + "step": 1792 + }, + { + "epoch": 0.6002678272514228, + "grad_norm": 0.14285002648830414, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 1793 + }, + { + "epoch": 0.6006026113157014, + "grad_norm": 0.1328384429216385, + "learning_rate": 0.0001, + "loss": 1.5532, + "step": 1794 + }, + { + "epoch": 0.6009373953799799, + "grad_norm": 0.13168397545814514, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 1795 + }, + { + "epoch": 0.6012721794442585, + "grad_norm": 0.12567539513111115, + "learning_rate": 0.0001, + "loss": 1.5389, + "step": 1796 + }, + { + "epoch": 0.601606963508537, + "grad_norm": 0.13105528056621552, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 1797 + }, + { + "epoch": 0.6019417475728155, + "grad_norm": 0.1292327493429184, + "learning_rate": 0.0001, + "loss": 1.4713, + "step": 1798 + }, + { + "epoch": 0.602276531637094, + "grad_norm": 0.12788547575473785, + "learning_rate": 0.0001, + "loss": 1.5787, + "step": 1799 + }, + { + "epoch": 0.6026113157013726, + "grad_norm": 0.1307074874639511, + "learning_rate": 0.0001, + "loss": 1.6191, + "step": 1800 + }, + { + "epoch": 0.6029460997656512, + "grad_norm": 0.136485293507576, + "learning_rate": 0.0001, + "loss": 1.6063, + "step": 1801 + }, + { + "epoch": 0.6032808838299297, + "grad_norm": 0.12938566505908966, + "learning_rate": 0.0001, + "loss": 1.5466, + "step": 1802 + }, + { + "epoch": 0.6036156678942083, + "grad_norm": 0.12429405003786087, + "learning_rate": 0.0001, + "loss": 1.4672, + "step": 1803 + }, + { + "epoch": 0.6039504519584867, + "grad_norm": 0.12657684087753296, + "learning_rate": 0.0001, + "loss": 1.5159, + "step": 1804 + }, + { + "epoch": 0.6042852360227653, + "grad_norm": 0.13287223875522614, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 1805 + }, + { + "epoch": 0.6046200200870439, + "grad_norm": 0.13268281519412994, + "learning_rate": 0.0001, + "loss": 1.5282, + "step": 1806 + }, + { + "epoch": 0.6049548041513224, + "grad_norm": 0.1264685094356537, + "learning_rate": 0.0001, + "loss": 1.5795, + "step": 1807 + }, + { + "epoch": 0.605289588215601, + "grad_norm": 0.1276138424873352, + "learning_rate": 0.0001, + "loss": 1.4648, + "step": 1808 + }, + { + "epoch": 0.6056243722798795, + "grad_norm": 0.13063056766986847, + "learning_rate": 0.0001, + "loss": 1.5692, + "step": 1809 + }, + { + "epoch": 0.605959156344158, + "grad_norm": 0.12172877043485641, + "learning_rate": 0.0001, + "loss": 1.4785, + "step": 1810 + }, + { + "epoch": 0.6062939404084365, + "grad_norm": 0.13516037166118622, + "learning_rate": 0.0001, + "loss": 1.5316, + "step": 1811 + }, + { + "epoch": 0.6066287244727151, + "grad_norm": 0.12978719174861908, + "learning_rate": 0.0001, + "loss": 1.5103, + "step": 1812 + }, + { + "epoch": 0.6069635085369937, + "grad_norm": 0.1354977786540985, + "learning_rate": 0.0001, + "loss": 1.5368, + "step": 1813 + }, + { + "epoch": 0.6072982926012722, + "grad_norm": 0.12445911020040512, + "learning_rate": 0.0001, + "loss": 1.4966, + "step": 1814 + }, + { + "epoch": 0.6076330766655507, + "grad_norm": 0.13546685874462128, + "learning_rate": 0.0001, + "loss": 1.62, + "step": 1815 + }, + { + "epoch": 0.6079678607298292, + "grad_norm": 0.12861642241477966, + "learning_rate": 0.0001, + "loss": 1.5895, + "step": 1816 + }, + { + "epoch": 0.6083026447941078, + "grad_norm": 0.13455091416835785, + "learning_rate": 0.0001, + "loss": 1.5217, + "step": 1817 + }, + { + "epoch": 0.6086374288583863, + "grad_norm": 0.13514240086078644, + "learning_rate": 0.0001, + "loss": 1.5947, + "step": 1818 + }, + { + "epoch": 0.6089722129226649, + "grad_norm": 0.12753477692604065, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 1819 + }, + { + "epoch": 0.6093069969869435, + "grad_norm": 0.1335463970899582, + "learning_rate": 0.0001, + "loss": 1.5806, + "step": 1820 + }, + { + "epoch": 0.6096417810512219, + "grad_norm": 0.14587751030921936, + "learning_rate": 0.0001, + "loss": 1.5679, + "step": 1821 + }, + { + "epoch": 0.6099765651155005, + "grad_norm": 0.13787920773029327, + "learning_rate": 0.0001, + "loss": 1.4759, + "step": 1822 + }, + { + "epoch": 0.610311349179779, + "grad_norm": 0.135360449552536, + "learning_rate": 0.0001, + "loss": 1.4968, + "step": 1823 + }, + { + "epoch": 0.6106461332440576, + "grad_norm": 0.13543657958507538, + "learning_rate": 0.0001, + "loss": 1.5321, + "step": 1824 + }, + { + "epoch": 0.6109809173083361, + "grad_norm": 0.127221018075943, + "learning_rate": 0.0001, + "loss": 1.5239, + "step": 1825 + }, + { + "epoch": 0.6113157013726147, + "grad_norm": 0.1439230740070343, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 1826 + }, + { + "epoch": 0.6116504854368932, + "grad_norm": 0.13141925632953644, + "learning_rate": 0.0001, + "loss": 1.504, + "step": 1827 + }, + { + "epoch": 0.6119852695011717, + "grad_norm": 0.12811610102653503, + "learning_rate": 0.0001, + "loss": 1.6137, + "step": 1828 + }, + { + "epoch": 0.6123200535654503, + "grad_norm": 0.13353578746318817, + "learning_rate": 0.0001, + "loss": 1.5209, + "step": 1829 + }, + { + "epoch": 0.6126548376297288, + "grad_norm": 0.13006985187530518, + "learning_rate": 0.0001, + "loss": 1.4776, + "step": 1830 + }, + { + "epoch": 0.6129896216940074, + "grad_norm": 0.1350172609090805, + "learning_rate": 0.0001, + "loss": 1.5994, + "step": 1831 + }, + { + "epoch": 0.613324405758286, + "grad_norm": 0.13640815019607544, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 1832 + }, + { + "epoch": 0.6136591898225644, + "grad_norm": 0.14161550998687744, + "learning_rate": 0.0001, + "loss": 1.5486, + "step": 1833 + }, + { + "epoch": 0.613993973886843, + "grad_norm": 0.12927186489105225, + "learning_rate": 0.0001, + "loss": 1.5166, + "step": 1834 + }, + { + "epoch": 0.6143287579511215, + "grad_norm": 0.1287536919116974, + "learning_rate": 0.0001, + "loss": 1.496, + "step": 1835 + }, + { + "epoch": 0.6146635420154001, + "grad_norm": 0.13734175264835358, + "learning_rate": 0.0001, + "loss": 1.5638, + "step": 1836 + }, + { + "epoch": 0.6149983260796786, + "grad_norm": 0.13784490525722504, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 1837 + }, + { + "epoch": 0.6153331101439572, + "grad_norm": 0.1259312480688095, + "learning_rate": 0.0001, + "loss": 1.5208, + "step": 1838 + }, + { + "epoch": 0.6156678942082356, + "grad_norm": 0.15089771151542664, + "learning_rate": 0.0001, + "loss": 1.5251, + "step": 1839 + }, + { + "epoch": 0.6160026782725142, + "grad_norm": 0.14801523089408875, + "learning_rate": 0.0001, + "loss": 1.5706, + "step": 1840 + }, + { + "epoch": 0.6163374623367928, + "grad_norm": 0.1345253735780716, + "learning_rate": 0.0001, + "loss": 1.5695, + "step": 1841 + }, + { + "epoch": 0.6166722464010713, + "grad_norm": 0.15094773471355438, + "learning_rate": 0.0001, + "loss": 1.5744, + "step": 1842 + }, + { + "epoch": 0.6170070304653499, + "grad_norm": 0.13193759322166443, + "learning_rate": 0.0001, + "loss": 1.5345, + "step": 1843 + }, + { + "epoch": 0.6173418145296284, + "grad_norm": 0.12728765606880188, + "learning_rate": 0.0001, + "loss": 1.5026, + "step": 1844 + }, + { + "epoch": 0.6176765985939069, + "grad_norm": 0.14725570380687714, + "learning_rate": 0.0001, + "loss": 1.581, + "step": 1845 + }, + { + "epoch": 0.6180113826581854, + "grad_norm": 0.13824598491191864, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 1846 + }, + { + "epoch": 0.618346166722464, + "grad_norm": 0.12178414314985275, + "learning_rate": 0.0001, + "loss": 1.4936, + "step": 1847 + }, + { + "epoch": 0.6186809507867426, + "grad_norm": 0.156047984957695, + "learning_rate": 0.0001, + "loss": 1.5737, + "step": 1848 + }, + { + "epoch": 0.6190157348510211, + "grad_norm": 0.15707126259803772, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 1849 + }, + { + "epoch": 0.6193505189152997, + "grad_norm": 0.1378837376832962, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 1850 + }, + { + "epoch": 0.6196853029795781, + "grad_norm": 0.1423729658126831, + "learning_rate": 0.0001, + "loss": 1.5409, + "step": 1851 + }, + { + "epoch": 0.6200200870438567, + "grad_norm": 0.16630493104457855, + "learning_rate": 0.0001, + "loss": 1.6264, + "step": 1852 + }, + { + "epoch": 0.6203548711081353, + "grad_norm": 0.13753686845302582, + "learning_rate": 0.0001, + "loss": 1.6104, + "step": 1853 + }, + { + "epoch": 0.6206896551724138, + "grad_norm": 0.13337332010269165, + "learning_rate": 0.0001, + "loss": 1.5104, + "step": 1854 + }, + { + "epoch": 0.6210244392366924, + "grad_norm": 0.14229977130889893, + "learning_rate": 0.0001, + "loss": 1.4228, + "step": 1855 + }, + { + "epoch": 0.6213592233009708, + "grad_norm": 0.1403966248035431, + "learning_rate": 0.0001, + "loss": 1.5623, + "step": 1856 + }, + { + "epoch": 0.6216940073652494, + "grad_norm": 0.12786665558815002, + "learning_rate": 0.0001, + "loss": 1.5058, + "step": 1857 + }, + { + "epoch": 0.6220287914295279, + "grad_norm": 0.14748771488666534, + "learning_rate": 0.0001, + "loss": 1.5004, + "step": 1858 + }, + { + "epoch": 0.6223635754938065, + "grad_norm": 0.14041772484779358, + "learning_rate": 0.0001, + "loss": 1.6154, + "step": 1859 + }, + { + "epoch": 0.6226983595580851, + "grad_norm": 0.1256851702928543, + "learning_rate": 0.0001, + "loss": 1.4634, + "step": 1860 + }, + { + "epoch": 0.6230331436223636, + "grad_norm": 0.12676502764225006, + "learning_rate": 0.0001, + "loss": 1.5163, + "step": 1861 + }, + { + "epoch": 0.6233679276866421, + "grad_norm": 0.14927968382835388, + "learning_rate": 0.0001, + "loss": 1.5686, + "step": 1862 + }, + { + "epoch": 0.6237027117509206, + "grad_norm": 0.1308298408985138, + "learning_rate": 0.0001, + "loss": 1.5032, + "step": 1863 + }, + { + "epoch": 0.6240374958151992, + "grad_norm": 0.13208165764808655, + "learning_rate": 0.0001, + "loss": 1.5519, + "step": 1864 + }, + { + "epoch": 0.6243722798794777, + "grad_norm": 0.13822416961193085, + "learning_rate": 0.0001, + "loss": 1.5664, + "step": 1865 + }, + { + "epoch": 0.6247070639437563, + "grad_norm": 0.13646993041038513, + "learning_rate": 0.0001, + "loss": 1.5361, + "step": 1866 + }, + { + "epoch": 0.6250418480080349, + "grad_norm": 0.1273556500673294, + "learning_rate": 0.0001, + "loss": 1.546, + "step": 1867 + }, + { + "epoch": 0.6253766320723133, + "grad_norm": 0.13555049896240234, + "learning_rate": 0.0001, + "loss": 1.5288, + "step": 1868 + }, + { + "epoch": 0.6257114161365919, + "grad_norm": 0.13126762211322784, + "learning_rate": 0.0001, + "loss": 1.4659, + "step": 1869 + }, + { + "epoch": 0.6260462002008704, + "grad_norm": 0.1348927766084671, + "learning_rate": 0.0001, + "loss": 1.5812, + "step": 1870 + }, + { + "epoch": 0.626380984265149, + "grad_norm": 0.1363980621099472, + "learning_rate": 0.0001, + "loss": 1.6506, + "step": 1871 + }, + { + "epoch": 0.6267157683294275, + "grad_norm": 0.13422980904579163, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 1872 + }, + { + "epoch": 0.6270505523937061, + "grad_norm": 0.12745925784111023, + "learning_rate": 0.0001, + "loss": 1.4898, + "step": 1873 + }, + { + "epoch": 0.6273853364579846, + "grad_norm": 0.1292264759540558, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 1874 + }, + { + "epoch": 0.6277201205222631, + "grad_norm": 0.1412927806377411, + "learning_rate": 0.0001, + "loss": 1.5228, + "step": 1875 + }, + { + "epoch": 0.6280549045865417, + "grad_norm": 0.1328163594007492, + "learning_rate": 0.0001, + "loss": 1.5521, + "step": 1876 + }, + { + "epoch": 0.6283896886508202, + "grad_norm": 0.1258804351091385, + "learning_rate": 0.0001, + "loss": 1.4781, + "step": 1877 + }, + { + "epoch": 0.6287244727150988, + "grad_norm": 0.128944993019104, + "learning_rate": 0.0001, + "loss": 1.5123, + "step": 1878 + }, + { + "epoch": 0.6290592567793774, + "grad_norm": 0.1244087815284729, + "learning_rate": 0.0001, + "loss": 1.4386, + "step": 1879 + }, + { + "epoch": 0.6293940408436558, + "grad_norm": 0.12890097498893738, + "learning_rate": 0.0001, + "loss": 1.5266, + "step": 1880 + }, + { + "epoch": 0.6297288249079344, + "grad_norm": 0.1312391459941864, + "learning_rate": 0.0001, + "loss": 1.5395, + "step": 1881 + }, + { + "epoch": 0.6300636089722129, + "grad_norm": 0.13363149762153625, + "learning_rate": 0.0001, + "loss": 1.5721, + "step": 1882 + }, + { + "epoch": 0.6303983930364915, + "grad_norm": 0.13130998611450195, + "learning_rate": 0.0001, + "loss": 1.5542, + "step": 1883 + }, + { + "epoch": 0.63073317710077, + "grad_norm": 0.13050179183483124, + "learning_rate": 0.0001, + "loss": 1.5422, + "step": 1884 + }, + { + "epoch": 0.6310679611650486, + "grad_norm": 0.13548725843429565, + "learning_rate": 0.0001, + "loss": 1.5597, + "step": 1885 + }, + { + "epoch": 0.631402745229327, + "grad_norm": 0.13810521364212036, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 1886 + }, + { + "epoch": 0.6317375292936056, + "grad_norm": 0.12898769974708557, + "learning_rate": 0.0001, + "loss": 1.5091, + "step": 1887 + }, + { + "epoch": 0.6320723133578842, + "grad_norm": 0.13874949514865875, + "learning_rate": 0.0001, + "loss": 1.473, + "step": 1888 + }, + { + "epoch": 0.6324070974221627, + "grad_norm": 0.1275644749403, + "learning_rate": 0.0001, + "loss": 1.5844, + "step": 1889 + }, + { + "epoch": 0.6327418814864413, + "grad_norm": 0.13245896995067596, + "learning_rate": 0.0001, + "loss": 1.602, + "step": 1890 + }, + { + "epoch": 0.6330766655507197, + "grad_norm": 0.13937050104141235, + "learning_rate": 0.0001, + "loss": 1.6106, + "step": 1891 + }, + { + "epoch": 0.6334114496149983, + "grad_norm": 0.13569729030132294, + "learning_rate": 0.0001, + "loss": 1.523, + "step": 1892 + }, + { + "epoch": 0.6337462336792768, + "grad_norm": 0.1360468864440918, + "learning_rate": 0.0001, + "loss": 1.5032, + "step": 1893 + }, + { + "epoch": 0.6340810177435554, + "grad_norm": 0.12757538259029388, + "learning_rate": 0.0001, + "loss": 1.487, + "step": 1894 + }, + { + "epoch": 0.634415801807834, + "grad_norm": 0.13325755298137665, + "learning_rate": 0.0001, + "loss": 1.5386, + "step": 1895 + }, + { + "epoch": 0.6347505858721125, + "grad_norm": 0.1348341703414917, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 1896 + }, + { + "epoch": 0.635085369936391, + "grad_norm": 0.14284925162792206, + "learning_rate": 0.0001, + "loss": 1.636, + "step": 1897 + }, + { + "epoch": 0.6354201540006695, + "grad_norm": 0.12641146779060364, + "learning_rate": 0.0001, + "loss": 1.5172, + "step": 1898 + }, + { + "epoch": 0.6357549380649481, + "grad_norm": 0.1327671855688095, + "learning_rate": 0.0001, + "loss": 1.6519, + "step": 1899 + }, + { + "epoch": 0.6360897221292267, + "grad_norm": 0.13408274948596954, + "learning_rate": 0.0001, + "loss": 1.4722, + "step": 1900 + }, + { + "epoch": 0.6364245061935052, + "grad_norm": 0.13136939704418182, + "learning_rate": 0.0001, + "loss": 1.56, + "step": 1901 + }, + { + "epoch": 0.6367592902577838, + "grad_norm": 0.13018733263015747, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 1902 + }, + { + "epoch": 0.6370940743220622, + "grad_norm": 0.137217178940773, + "learning_rate": 0.0001, + "loss": 1.6224, + "step": 1903 + }, + { + "epoch": 0.6374288583863408, + "grad_norm": 0.12886135280132294, + "learning_rate": 0.0001, + "loss": 1.5993, + "step": 1904 + }, + { + "epoch": 0.6377636424506193, + "grad_norm": 0.12878277897834778, + "learning_rate": 0.0001, + "loss": 1.4407, + "step": 1905 + }, + { + "epoch": 0.6380984265148979, + "grad_norm": 0.12817195057868958, + "learning_rate": 0.0001, + "loss": 1.5113, + "step": 1906 + }, + { + "epoch": 0.6384332105791765, + "grad_norm": 0.12779603898525238, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 1907 + }, + { + "epoch": 0.638767994643455, + "grad_norm": 0.13575701415538788, + "learning_rate": 0.0001, + "loss": 1.5689, + "step": 1908 + }, + { + "epoch": 0.6391027787077335, + "grad_norm": 0.1292586326599121, + "learning_rate": 0.0001, + "loss": 1.5853, + "step": 1909 + }, + { + "epoch": 0.639437562772012, + "grad_norm": 0.13209429383277893, + "learning_rate": 0.0001, + "loss": 1.5374, + "step": 1910 + }, + { + "epoch": 0.6397723468362906, + "grad_norm": 0.13795161247253418, + "learning_rate": 0.0001, + "loss": 1.5752, + "step": 1911 + }, + { + "epoch": 0.6401071309005691, + "grad_norm": 0.13106195628643036, + "learning_rate": 0.0001, + "loss": 1.5074, + "step": 1912 + }, + { + "epoch": 0.6404419149648477, + "grad_norm": 0.1364029496908188, + "learning_rate": 0.0001, + "loss": 1.4415, + "step": 1913 + }, + { + "epoch": 0.6407766990291263, + "grad_norm": 0.13437704741954803, + "learning_rate": 0.0001, + "loss": 1.5179, + "step": 1914 + }, + { + "epoch": 0.6411114830934047, + "grad_norm": 0.12899838387966156, + "learning_rate": 0.0001, + "loss": 1.4437, + "step": 1915 + }, + { + "epoch": 0.6414462671576833, + "grad_norm": 0.1336640864610672, + "learning_rate": 0.0001, + "loss": 1.4988, + "step": 1916 + }, + { + "epoch": 0.6417810512219618, + "grad_norm": 0.13116469979286194, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 1917 + }, + { + "epoch": 0.6421158352862404, + "grad_norm": 0.1323315054178238, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 1918 + }, + { + "epoch": 0.642450619350519, + "grad_norm": 0.13012604415416718, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 1919 + }, + { + "epoch": 0.6427854034147975, + "grad_norm": 0.13358043134212494, + "learning_rate": 0.0001, + "loss": 1.4948, + "step": 1920 + }, + { + "epoch": 0.643120187479076, + "grad_norm": 0.13027198612689972, + "learning_rate": 0.0001, + "loss": 1.5749, + "step": 1921 + }, + { + "epoch": 0.6434549715433545, + "grad_norm": 0.11880921572446823, + "learning_rate": 0.0001, + "loss": 1.434, + "step": 1922 + }, + { + "epoch": 0.6437897556076331, + "grad_norm": 0.1275249421596527, + "learning_rate": 0.0001, + "loss": 1.5074, + "step": 1923 + }, + { + "epoch": 0.6441245396719116, + "grad_norm": 0.13402846455574036, + "learning_rate": 0.0001, + "loss": 1.6019, + "step": 1924 + }, + { + "epoch": 0.6444593237361902, + "grad_norm": 0.1263839304447174, + "learning_rate": 0.0001, + "loss": 1.494, + "step": 1925 + }, + { + "epoch": 0.6447941078004688, + "grad_norm": 0.12889358401298523, + "learning_rate": 0.0001, + "loss": 1.4811, + "step": 1926 + }, + { + "epoch": 0.6451288918647472, + "grad_norm": 0.13030682504177094, + "learning_rate": 0.0001, + "loss": 1.5573, + "step": 1927 + }, + { + "epoch": 0.6454636759290258, + "grad_norm": 0.12815749645233154, + "learning_rate": 0.0001, + "loss": 1.5839, + "step": 1928 + }, + { + "epoch": 0.6457984599933043, + "grad_norm": 0.13763943314552307, + "learning_rate": 0.0001, + "loss": 1.4967, + "step": 1929 + }, + { + "epoch": 0.6461332440575829, + "grad_norm": 0.12890425324440002, + "learning_rate": 0.0001, + "loss": 1.4861, + "step": 1930 + }, + { + "epoch": 0.6464680281218614, + "grad_norm": 0.13768140971660614, + "learning_rate": 0.0001, + "loss": 1.5095, + "step": 1931 + }, + { + "epoch": 0.6468028121861399, + "grad_norm": 0.1268666833639145, + "learning_rate": 0.0001, + "loss": 1.5237, + "step": 1932 + }, + { + "epoch": 0.6471375962504184, + "grad_norm": 0.13325713574886322, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 1933 + }, + { + "epoch": 0.647472380314697, + "grad_norm": 0.13848131895065308, + "learning_rate": 0.0001, + "loss": 1.4935, + "step": 1934 + }, + { + "epoch": 0.6478071643789756, + "grad_norm": 0.1393735706806183, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 1935 + }, + { + "epoch": 0.6481419484432541, + "grad_norm": 0.1441955864429474, + "learning_rate": 0.0001, + "loss": 1.6218, + "step": 1936 + }, + { + "epoch": 0.6484767325075327, + "grad_norm": 0.13111312687397003, + "learning_rate": 0.0001, + "loss": 1.5639, + "step": 1937 + }, + { + "epoch": 0.6488115165718111, + "grad_norm": 0.12940305471420288, + "learning_rate": 0.0001, + "loss": 1.5864, + "step": 1938 + }, + { + "epoch": 0.6491463006360897, + "grad_norm": 0.13657227158546448, + "learning_rate": 0.0001, + "loss": 1.5125, + "step": 1939 + }, + { + "epoch": 0.6494810847003682, + "grad_norm": 0.12390992790460587, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 1940 + }, + { + "epoch": 0.6498158687646468, + "grad_norm": 0.1316480040550232, + "learning_rate": 0.0001, + "loss": 1.5343, + "step": 1941 + }, + { + "epoch": 0.6501506528289254, + "grad_norm": 0.13427673280239105, + "learning_rate": 0.0001, + "loss": 1.5456, + "step": 1942 + }, + { + "epoch": 0.6504854368932039, + "grad_norm": 0.1284562200307846, + "learning_rate": 0.0001, + "loss": 1.5017, + "step": 1943 + }, + { + "epoch": 0.6508202209574824, + "grad_norm": 0.13431181013584137, + "learning_rate": 0.0001, + "loss": 1.45, + "step": 1944 + }, + { + "epoch": 0.6511550050217609, + "grad_norm": 0.13080428540706635, + "learning_rate": 0.0001, + "loss": 1.5035, + "step": 1945 + }, + { + "epoch": 0.6514897890860395, + "grad_norm": 0.13691136240959167, + "learning_rate": 0.0001, + "loss": 1.5145, + "step": 1946 + }, + { + "epoch": 0.651824573150318, + "grad_norm": 0.12990237772464752, + "learning_rate": 0.0001, + "loss": 1.5393, + "step": 1947 + }, + { + "epoch": 0.6521593572145966, + "grad_norm": 0.12529443204402924, + "learning_rate": 0.0001, + "loss": 1.468, + "step": 1948 + }, + { + "epoch": 0.6524941412788752, + "grad_norm": 0.13029485940933228, + "learning_rate": 0.0001, + "loss": 1.5229, + "step": 1949 + }, + { + "epoch": 0.6528289253431536, + "grad_norm": 0.13873140513896942, + "learning_rate": 0.0001, + "loss": 1.5667, + "step": 1950 + }, + { + "epoch": 0.6531637094074322, + "grad_norm": 0.13176368176937103, + "learning_rate": 0.0001, + "loss": 1.4231, + "step": 1951 + }, + { + "epoch": 0.6534984934717107, + "grad_norm": 0.13046538829803467, + "learning_rate": 0.0001, + "loss": 1.5151, + "step": 1952 + }, + { + "epoch": 0.6538332775359893, + "grad_norm": 0.1290617287158966, + "learning_rate": 0.0001, + "loss": 1.6184, + "step": 1953 + }, + { + "epoch": 0.6541680616002679, + "grad_norm": 0.13826888799667358, + "learning_rate": 0.0001, + "loss": 1.5597, + "step": 1954 + }, + { + "epoch": 0.6545028456645464, + "grad_norm": 0.1341448426246643, + "learning_rate": 0.0001, + "loss": 1.5763, + "step": 1955 + }, + { + "epoch": 0.6548376297288249, + "grad_norm": 0.1293526589870453, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 1956 + }, + { + "epoch": 0.6551724137931034, + "grad_norm": 0.12727828323841095, + "learning_rate": 0.0001, + "loss": 1.5195, + "step": 1957 + }, + { + "epoch": 0.655507197857382, + "grad_norm": 0.13981108367443085, + "learning_rate": 0.0001, + "loss": 1.6515, + "step": 1958 + }, + { + "epoch": 0.6558419819216605, + "grad_norm": 0.1339573711156845, + "learning_rate": 0.0001, + "loss": 1.4506, + "step": 1959 + }, + { + "epoch": 0.6561767659859391, + "grad_norm": 0.13203227519989014, + "learning_rate": 0.0001, + "loss": 1.5553, + "step": 1960 + }, + { + "epoch": 0.6565115500502177, + "grad_norm": 0.1276148110628128, + "learning_rate": 0.0001, + "loss": 1.5442, + "step": 1961 + }, + { + "epoch": 0.6568463341144961, + "grad_norm": 0.13206414878368378, + "learning_rate": 0.0001, + "loss": 1.4193, + "step": 1962 + }, + { + "epoch": 0.6571811181787747, + "grad_norm": 0.14616969227790833, + "learning_rate": 0.0001, + "loss": 1.6147, + "step": 1963 + }, + { + "epoch": 0.6575159022430532, + "grad_norm": 0.13604846596717834, + "learning_rate": 0.0001, + "loss": 1.5652, + "step": 1964 + }, + { + "epoch": 0.6578506863073318, + "grad_norm": 0.13196608424186707, + "learning_rate": 0.0001, + "loss": 1.565, + "step": 1965 + }, + { + "epoch": 0.6581854703716103, + "grad_norm": 0.14214178919792175, + "learning_rate": 0.0001, + "loss": 1.5692, + "step": 1966 + }, + { + "epoch": 0.6585202544358888, + "grad_norm": 0.1290048062801361, + "learning_rate": 0.0001, + "loss": 1.5004, + "step": 1967 + }, + { + "epoch": 0.6588550385001674, + "grad_norm": 0.13306178152561188, + "learning_rate": 0.0001, + "loss": 1.5913, + "step": 1968 + }, + { + "epoch": 0.6591898225644459, + "grad_norm": 0.1337195485830307, + "learning_rate": 0.0001, + "loss": 1.5888, + "step": 1969 + }, + { + "epoch": 0.6595246066287245, + "grad_norm": 0.1345224380493164, + "learning_rate": 0.0001, + "loss": 1.5513, + "step": 1970 + }, + { + "epoch": 0.659859390693003, + "grad_norm": 0.12885946035385132, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 1971 + }, + { + "epoch": 0.6601941747572816, + "grad_norm": 0.1352531760931015, + "learning_rate": 0.0001, + "loss": 1.5958, + "step": 1972 + }, + { + "epoch": 0.66052895882156, + "grad_norm": 0.12501929700374603, + "learning_rate": 0.0001, + "loss": 1.4162, + "step": 1973 + }, + { + "epoch": 0.6608637428858386, + "grad_norm": 0.1291869580745697, + "learning_rate": 0.0001, + "loss": 1.4463, + "step": 1974 + }, + { + "epoch": 0.6611985269501172, + "grad_norm": 0.14670369029045105, + "learning_rate": 0.0001, + "loss": 1.4661, + "step": 1975 + }, + { + "epoch": 0.6615333110143957, + "grad_norm": 0.13643884658813477, + "learning_rate": 0.0001, + "loss": 1.5677, + "step": 1976 + }, + { + "epoch": 0.6618680950786743, + "grad_norm": 0.13746634125709534, + "learning_rate": 0.0001, + "loss": 1.4903, + "step": 1977 + }, + { + "epoch": 0.6622028791429528, + "grad_norm": 0.14677157998085022, + "learning_rate": 0.0001, + "loss": 1.5492, + "step": 1978 + }, + { + "epoch": 0.6625376632072313, + "grad_norm": 0.1345069259405136, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 1979 + }, + { + "epoch": 0.6628724472715098, + "grad_norm": 0.13783417642116547, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 1980 + }, + { + "epoch": 0.6632072313357884, + "grad_norm": 0.13266097009181976, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 1981 + }, + { + "epoch": 0.663542015400067, + "grad_norm": 0.13931085169315338, + "learning_rate": 0.0001, + "loss": 1.5797, + "step": 1982 + }, + { + "epoch": 0.6638767994643455, + "grad_norm": 0.13039837777614594, + "learning_rate": 0.0001, + "loss": 1.4508, + "step": 1983 + }, + { + "epoch": 0.6642115835286241, + "grad_norm": 0.13921616971492767, + "learning_rate": 0.0001, + "loss": 1.6177, + "step": 1984 + }, + { + "epoch": 0.6645463675929025, + "grad_norm": 0.1381753534078598, + "learning_rate": 0.0001, + "loss": 1.6578, + "step": 1985 + }, + { + "epoch": 0.6648811516571811, + "grad_norm": 0.1361846625804901, + "learning_rate": 0.0001, + "loss": 1.5422, + "step": 1986 + }, + { + "epoch": 0.6652159357214597, + "grad_norm": 0.14170324802398682, + "learning_rate": 0.0001, + "loss": 1.6339, + "step": 1987 + }, + { + "epoch": 0.6655507197857382, + "grad_norm": 0.13164804875850677, + "learning_rate": 0.0001, + "loss": 1.5623, + "step": 1988 + }, + { + "epoch": 0.6658855038500168, + "grad_norm": 0.13766439259052277, + "learning_rate": 0.0001, + "loss": 1.5661, + "step": 1989 + }, + { + "epoch": 0.6662202879142953, + "grad_norm": 0.1340639889240265, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 1990 + }, + { + "epoch": 0.6665550719785738, + "grad_norm": 0.132024347782135, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 1991 + }, + { + "epoch": 0.6668898560428523, + "grad_norm": 0.13272161781787872, + "learning_rate": 0.0001, + "loss": 1.4522, + "step": 1992 + }, + { + "epoch": 0.6672246401071309, + "grad_norm": 0.14372223615646362, + "learning_rate": 0.0001, + "loss": 1.581, + "step": 1993 + }, + { + "epoch": 0.6675594241714095, + "grad_norm": 0.13869139552116394, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 1994 + }, + { + "epoch": 0.667894208235688, + "grad_norm": 0.12776124477386475, + "learning_rate": 0.0001, + "loss": 1.51, + "step": 1995 + }, + { + "epoch": 0.6682289922999666, + "grad_norm": 0.13583005964756012, + "learning_rate": 0.0001, + "loss": 1.5771, + "step": 1996 + }, + { + "epoch": 0.668563776364245, + "grad_norm": 0.13394635915756226, + "learning_rate": 0.0001, + "loss": 1.5605, + "step": 1997 + }, + { + "epoch": 0.6688985604285236, + "grad_norm": 0.13842739164829254, + "learning_rate": 0.0001, + "loss": 1.5541, + "step": 1998 + }, + { + "epoch": 0.6692333444928021, + "grad_norm": 0.13265378773212433, + "learning_rate": 0.0001, + "loss": 1.5772, + "step": 1999 + }, + { + "epoch": 0.6695681285570807, + "grad_norm": 0.13662943243980408, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 2000 + }, + { + "epoch": 0.6699029126213593, + "grad_norm": 0.12512929737567902, + "learning_rate": 0.0001, + "loss": 1.4162, + "step": 2001 + }, + { + "epoch": 0.6702376966856378, + "grad_norm": 0.1327543556690216, + "learning_rate": 0.0001, + "loss": 1.4978, + "step": 2002 + }, + { + "epoch": 0.6705724807499163, + "grad_norm": 0.13269194960594177, + "learning_rate": 0.0001, + "loss": 1.5998, + "step": 2003 + }, + { + "epoch": 0.6709072648141948, + "grad_norm": 0.14017336070537567, + "learning_rate": 0.0001, + "loss": 1.5785, + "step": 2004 + }, + { + "epoch": 0.6712420488784734, + "grad_norm": 0.1304367482662201, + "learning_rate": 0.0001, + "loss": 1.4781, + "step": 2005 + }, + { + "epoch": 0.671576832942752, + "grad_norm": 0.13442495465278625, + "learning_rate": 0.0001, + "loss": 1.5358, + "step": 2006 + }, + { + "epoch": 0.6719116170070305, + "grad_norm": 0.13490137457847595, + "learning_rate": 0.0001, + "loss": 1.6273, + "step": 2007 + }, + { + "epoch": 0.672246401071309, + "grad_norm": 0.1324394941329956, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 2008 + }, + { + "epoch": 0.6725811851355875, + "grad_norm": 0.12797103822231293, + "learning_rate": 0.0001, + "loss": 1.551, + "step": 2009 + }, + { + "epoch": 0.6729159691998661, + "grad_norm": 0.13374999165534973, + "learning_rate": 0.0001, + "loss": 1.5571, + "step": 2010 + }, + { + "epoch": 0.6732507532641446, + "grad_norm": 0.13020572066307068, + "learning_rate": 0.0001, + "loss": 1.4756, + "step": 2011 + }, + { + "epoch": 0.6735855373284232, + "grad_norm": 0.12501733005046844, + "learning_rate": 0.0001, + "loss": 1.5073, + "step": 2012 + }, + { + "epoch": 0.6739203213927017, + "grad_norm": 0.12433689087629318, + "learning_rate": 0.0001, + "loss": 1.4574, + "step": 2013 + }, + { + "epoch": 0.6742551054569802, + "grad_norm": 0.14026397466659546, + "learning_rate": 0.0001, + "loss": 1.5513, + "step": 2014 + }, + { + "epoch": 0.6745898895212588, + "grad_norm": 0.1340554803609848, + "learning_rate": 0.0001, + "loss": 1.5686, + "step": 2015 + }, + { + "epoch": 0.6749246735855373, + "grad_norm": 0.12796646356582642, + "learning_rate": 0.0001, + "loss": 1.4842, + "step": 2016 + }, + { + "epoch": 0.6752594576498159, + "grad_norm": 0.1362949162721634, + "learning_rate": 0.0001, + "loss": 1.5763, + "step": 2017 + }, + { + "epoch": 0.6755942417140944, + "grad_norm": 0.1347300410270691, + "learning_rate": 0.0001, + "loss": 1.5975, + "step": 2018 + }, + { + "epoch": 0.675929025778373, + "grad_norm": 0.13647662103176117, + "learning_rate": 0.0001, + "loss": 1.5395, + "step": 2019 + }, + { + "epoch": 0.6762638098426514, + "grad_norm": 0.13441947102546692, + "learning_rate": 0.0001, + "loss": 1.5726, + "step": 2020 + }, + { + "epoch": 0.67659859390693, + "grad_norm": 0.13435856997966766, + "learning_rate": 0.0001, + "loss": 1.6806, + "step": 2021 + }, + { + "epoch": 0.6769333779712086, + "grad_norm": 0.1239754781126976, + "learning_rate": 0.0001, + "loss": 1.4045, + "step": 2022 + }, + { + "epoch": 0.6772681620354871, + "grad_norm": 0.13493669033050537, + "learning_rate": 0.0001, + "loss": 1.5606, + "step": 2023 + }, + { + "epoch": 0.6776029460997657, + "grad_norm": 0.12938407063484192, + "learning_rate": 0.0001, + "loss": 1.5201, + "step": 2024 + }, + { + "epoch": 0.6779377301640442, + "grad_norm": 0.12213901430368423, + "learning_rate": 0.0001, + "loss": 1.4436, + "step": 2025 + }, + { + "epoch": 0.6782725142283227, + "grad_norm": 0.14107517898082733, + "learning_rate": 0.0001, + "loss": 1.5584, + "step": 2026 + }, + { + "epoch": 0.6786072982926012, + "grad_norm": 0.13082027435302734, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 2027 + }, + { + "epoch": 0.6789420823568798, + "grad_norm": 0.14623381197452545, + "learning_rate": 0.0001, + "loss": 1.668, + "step": 2028 + }, + { + "epoch": 0.6792768664211584, + "grad_norm": 0.12862159311771393, + "learning_rate": 0.0001, + "loss": 1.5534, + "step": 2029 + }, + { + "epoch": 0.6796116504854369, + "grad_norm": 0.13177117705345154, + "learning_rate": 0.0001, + "loss": 1.5564, + "step": 2030 + }, + { + "epoch": 0.6799464345497155, + "grad_norm": 0.12835298478603363, + "learning_rate": 0.0001, + "loss": 1.479, + "step": 2031 + }, + { + "epoch": 0.6802812186139939, + "grad_norm": 0.14096349477767944, + "learning_rate": 0.0001, + "loss": 1.6175, + "step": 2032 + }, + { + "epoch": 0.6806160026782725, + "grad_norm": 0.12646090984344482, + "learning_rate": 0.0001, + "loss": 1.4861, + "step": 2033 + }, + { + "epoch": 0.680950786742551, + "grad_norm": 0.137931689620018, + "learning_rate": 0.0001, + "loss": 1.5051, + "step": 2034 + }, + { + "epoch": 0.6812855708068296, + "grad_norm": 0.13240592181682587, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 2035 + }, + { + "epoch": 0.6816203548711082, + "grad_norm": 0.1362670511007309, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 2036 + }, + { + "epoch": 0.6819551389353867, + "grad_norm": 0.13148629665374756, + "learning_rate": 0.0001, + "loss": 1.521, + "step": 2037 + }, + { + "epoch": 0.6822899229996652, + "grad_norm": 0.13285885751247406, + "learning_rate": 0.0001, + "loss": 1.5122, + "step": 2038 + }, + { + "epoch": 0.6826247070639437, + "grad_norm": 0.1264655739068985, + "learning_rate": 0.0001, + "loss": 1.4886, + "step": 2039 + }, + { + "epoch": 0.6829594911282223, + "grad_norm": 0.12677529454231262, + "learning_rate": 0.0001, + "loss": 1.5068, + "step": 2040 + }, + { + "epoch": 0.6832942751925009, + "grad_norm": 0.13277101516723633, + "learning_rate": 0.0001, + "loss": 1.6065, + "step": 2041 + }, + { + "epoch": 0.6836290592567794, + "grad_norm": 0.13291488587856293, + "learning_rate": 0.0001, + "loss": 1.5755, + "step": 2042 + }, + { + "epoch": 0.6839638433210579, + "grad_norm": 0.13058260083198547, + "learning_rate": 0.0001, + "loss": 1.5286, + "step": 2043 + }, + { + "epoch": 0.6842986273853364, + "grad_norm": 0.13059435784816742, + "learning_rate": 0.0001, + "loss": 1.5803, + "step": 2044 + }, + { + "epoch": 0.684633411449615, + "grad_norm": 0.12917304039001465, + "learning_rate": 0.0001, + "loss": 1.576, + "step": 2045 + }, + { + "epoch": 0.6849681955138935, + "grad_norm": 0.12822791934013367, + "learning_rate": 0.0001, + "loss": 1.5201, + "step": 2046 + }, + { + "epoch": 0.6853029795781721, + "grad_norm": 0.14006927609443665, + "learning_rate": 0.0001, + "loss": 1.5445, + "step": 2047 + }, + { + "epoch": 0.6856377636424507, + "grad_norm": 0.13502942025661469, + "learning_rate": 0.0001, + "loss": 1.543, + "step": 2048 + }, + { + "epoch": 0.6859725477067291, + "grad_norm": 0.1351221352815628, + "learning_rate": 0.0001, + "loss": 1.5594, + "step": 2049 + }, + { + "epoch": 0.6863073317710077, + "grad_norm": 0.13474461436271667, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 2050 + }, + { + "epoch": 0.6866421158352862, + "grad_norm": 0.1317591369152069, + "learning_rate": 0.0001, + "loss": 1.5681, + "step": 2051 + }, + { + "epoch": 0.6869768998995648, + "grad_norm": 0.1300475299358368, + "learning_rate": 0.0001, + "loss": 1.5426, + "step": 2052 + }, + { + "epoch": 0.6873116839638433, + "grad_norm": 0.1308741718530655, + "learning_rate": 0.0001, + "loss": 1.5649, + "step": 2053 + }, + { + "epoch": 0.6876464680281219, + "grad_norm": 0.1339602768421173, + "learning_rate": 0.0001, + "loss": 1.5422, + "step": 2054 + }, + { + "epoch": 0.6879812520924004, + "grad_norm": 0.12556122243404388, + "learning_rate": 0.0001, + "loss": 1.3939, + "step": 2055 + }, + { + "epoch": 0.6883160361566789, + "grad_norm": 0.1331097036600113, + "learning_rate": 0.0001, + "loss": 1.5725, + "step": 2056 + }, + { + "epoch": 0.6886508202209575, + "grad_norm": 0.12769033014774323, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 2057 + }, + { + "epoch": 0.688985604285236, + "grad_norm": 0.13246020674705505, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 2058 + }, + { + "epoch": 0.6893203883495146, + "grad_norm": 0.13371361792087555, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 2059 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.1314792037010193, + "learning_rate": 0.0001, + "loss": 1.4943, + "step": 2060 + }, + { + "epoch": 0.6899899564780716, + "grad_norm": 0.13194666802883148, + "learning_rate": 0.0001, + "loss": 1.5983, + "step": 2061 + }, + { + "epoch": 0.6903247405423502, + "grad_norm": 0.13631388545036316, + "learning_rate": 0.0001, + "loss": 1.4932, + "step": 2062 + }, + { + "epoch": 0.6906595246066287, + "grad_norm": 0.1319463849067688, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 2063 + }, + { + "epoch": 0.6909943086709073, + "grad_norm": 0.14124637842178345, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 2064 + }, + { + "epoch": 0.6913290927351858, + "grad_norm": 0.12954577803611755, + "learning_rate": 0.0001, + "loss": 1.4153, + "step": 2065 + }, + { + "epoch": 0.6916638767994644, + "grad_norm": 0.1325748711824417, + "learning_rate": 0.0001, + "loss": 1.5766, + "step": 2066 + }, + { + "epoch": 0.6919986608637428, + "grad_norm": 0.13064290583133698, + "learning_rate": 0.0001, + "loss": 1.4995, + "step": 2067 + }, + { + "epoch": 0.6923334449280214, + "grad_norm": 0.1248745545744896, + "learning_rate": 0.0001, + "loss": 1.5077, + "step": 2068 + }, + { + "epoch": 0.6926682289923, + "grad_norm": 0.1278417706489563, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 2069 + }, + { + "epoch": 0.6930030130565785, + "grad_norm": 0.13311515748500824, + "learning_rate": 0.0001, + "loss": 1.5251, + "step": 2070 + }, + { + "epoch": 0.6933377971208571, + "grad_norm": 0.13218218088150024, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 2071 + }, + { + "epoch": 0.6936725811851356, + "grad_norm": 0.13042452931404114, + "learning_rate": 0.0001, + "loss": 1.5534, + "step": 2072 + }, + { + "epoch": 0.6940073652494141, + "grad_norm": 0.1393493264913559, + "learning_rate": 0.0001, + "loss": 1.594, + "step": 2073 + }, + { + "epoch": 0.6943421493136926, + "grad_norm": 0.1298573911190033, + "learning_rate": 0.0001, + "loss": 1.518, + "step": 2074 + }, + { + "epoch": 0.6946769333779712, + "grad_norm": 0.13325051963329315, + "learning_rate": 0.0001, + "loss": 1.5068, + "step": 2075 + }, + { + "epoch": 0.6950117174422498, + "grad_norm": 0.1269649714231491, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 2076 + }, + { + "epoch": 0.6953465015065283, + "grad_norm": 0.12699490785598755, + "learning_rate": 0.0001, + "loss": 1.4228, + "step": 2077 + }, + { + "epoch": 0.6956812855708069, + "grad_norm": 0.1379399597644806, + "learning_rate": 0.0001, + "loss": 1.4138, + "step": 2078 + }, + { + "epoch": 0.6960160696350853, + "grad_norm": 0.13343951106071472, + "learning_rate": 0.0001, + "loss": 1.5947, + "step": 2079 + }, + { + "epoch": 0.6963508536993639, + "grad_norm": 0.13461847603321075, + "learning_rate": 0.0001, + "loss": 1.5333, + "step": 2080 + }, + { + "epoch": 0.6966856377636425, + "grad_norm": 0.1299065202474594, + "learning_rate": 0.0001, + "loss": 1.5415, + "step": 2081 + }, + { + "epoch": 0.697020421827921, + "grad_norm": 0.1272873431444168, + "learning_rate": 0.0001, + "loss": 1.4443, + "step": 2082 + }, + { + "epoch": 0.6973552058921996, + "grad_norm": 0.136282280087471, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 2083 + }, + { + "epoch": 0.697689989956478, + "grad_norm": 0.12842769920825958, + "learning_rate": 0.0001, + "loss": 1.5574, + "step": 2084 + }, + { + "epoch": 0.6980247740207566, + "grad_norm": 0.12897315621376038, + "learning_rate": 0.0001, + "loss": 1.6162, + "step": 2085 + }, + { + "epoch": 0.6983595580850351, + "grad_norm": 0.13097885251045227, + "learning_rate": 0.0001, + "loss": 1.4949, + "step": 2086 + }, + { + "epoch": 0.6986943421493137, + "grad_norm": 0.13251438736915588, + "learning_rate": 0.0001, + "loss": 1.5041, + "step": 2087 + }, + { + "epoch": 0.6990291262135923, + "grad_norm": 0.1319066435098648, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 2088 + }, + { + "epoch": 0.6993639102778708, + "grad_norm": 0.13142657279968262, + "learning_rate": 0.0001, + "loss": 1.452, + "step": 2089 + }, + { + "epoch": 0.6996986943421493, + "grad_norm": 0.13348999619483948, + "learning_rate": 0.0001, + "loss": 1.4905, + "step": 2090 + }, + { + "epoch": 0.7000334784064278, + "grad_norm": 0.13037413358688354, + "learning_rate": 0.0001, + "loss": 1.4949, + "step": 2091 + }, + { + "epoch": 0.7003682624707064, + "grad_norm": 0.15308037400245667, + "learning_rate": 0.0001, + "loss": 1.6023, + "step": 2092 + }, + { + "epoch": 0.7007030465349849, + "grad_norm": 0.128286212682724, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 2093 + }, + { + "epoch": 0.7010378305992635, + "grad_norm": 0.13967067003250122, + "learning_rate": 0.0001, + "loss": 1.5577, + "step": 2094 + }, + { + "epoch": 0.7013726146635421, + "grad_norm": 0.13320837914943695, + "learning_rate": 0.0001, + "loss": 1.5923, + "step": 2095 + }, + { + "epoch": 0.7017073987278205, + "grad_norm": 0.12857401371002197, + "learning_rate": 0.0001, + "loss": 1.4623, + "step": 2096 + }, + { + "epoch": 0.7020421827920991, + "grad_norm": 0.12525291740894318, + "learning_rate": 0.0001, + "loss": 1.5126, + "step": 2097 + }, + { + "epoch": 0.7023769668563776, + "grad_norm": 0.1316770762205124, + "learning_rate": 0.0001, + "loss": 1.5433, + "step": 2098 + }, + { + "epoch": 0.7027117509206562, + "grad_norm": 0.1343490481376648, + "learning_rate": 0.0001, + "loss": 1.5085, + "step": 2099 + }, + { + "epoch": 0.7030465349849347, + "grad_norm": 0.12864871323108673, + "learning_rate": 0.0001, + "loss": 1.46, + "step": 2100 + }, + { + "epoch": 0.7033813190492133, + "grad_norm": 0.13915804028511047, + "learning_rate": 0.0001, + "loss": 1.6961, + "step": 2101 + }, + { + "epoch": 0.7037161031134918, + "grad_norm": 0.12709419429302216, + "learning_rate": 0.0001, + "loss": 1.4931, + "step": 2102 + }, + { + "epoch": 0.7040508871777703, + "grad_norm": 0.1383008360862732, + "learning_rate": 0.0001, + "loss": 1.5925, + "step": 2103 + }, + { + "epoch": 0.7043856712420489, + "grad_norm": 0.1338641494512558, + "learning_rate": 0.0001, + "loss": 1.4715, + "step": 2104 + }, + { + "epoch": 0.7047204553063274, + "grad_norm": 0.12291635572910309, + "learning_rate": 0.0001, + "loss": 1.3746, + "step": 2105 + }, + { + "epoch": 0.705055239370606, + "grad_norm": 0.13391555845737457, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 2106 + }, + { + "epoch": 0.7053900234348845, + "grad_norm": 0.13259120285511017, + "learning_rate": 0.0001, + "loss": 1.6069, + "step": 2107 + }, + { + "epoch": 0.705724807499163, + "grad_norm": 0.13009488582611084, + "learning_rate": 0.0001, + "loss": 1.534, + "step": 2108 + }, + { + "epoch": 0.7060595915634416, + "grad_norm": 0.12612484395503998, + "learning_rate": 0.0001, + "loss": 1.4612, + "step": 2109 + }, + { + "epoch": 0.7063943756277201, + "grad_norm": 0.12470883876085281, + "learning_rate": 0.0001, + "loss": 1.4388, + "step": 2110 + }, + { + "epoch": 0.7067291596919987, + "grad_norm": 0.13072682917118073, + "learning_rate": 0.0001, + "loss": 1.5083, + "step": 2111 + }, + { + "epoch": 0.7070639437562772, + "grad_norm": 0.13037820160388947, + "learning_rate": 0.0001, + "loss": 1.4514, + "step": 2112 + }, + { + "epoch": 0.7073987278205558, + "grad_norm": 0.1304703801870346, + "learning_rate": 0.0001, + "loss": 1.4644, + "step": 2113 + }, + { + "epoch": 0.7077335118848342, + "grad_norm": 0.1345730423927307, + "learning_rate": 0.0001, + "loss": 1.4849, + "step": 2114 + }, + { + "epoch": 0.7080682959491128, + "grad_norm": 0.14024527370929718, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 2115 + }, + { + "epoch": 0.7084030800133914, + "grad_norm": 0.13666972517967224, + "learning_rate": 0.0001, + "loss": 1.4858, + "step": 2116 + }, + { + "epoch": 0.7087378640776699, + "grad_norm": 0.13574914634227753, + "learning_rate": 0.0001, + "loss": 1.5258, + "step": 2117 + }, + { + "epoch": 0.7090726481419485, + "grad_norm": 0.1362755447626114, + "learning_rate": 0.0001, + "loss": 1.5592, + "step": 2118 + }, + { + "epoch": 0.7094074322062269, + "grad_norm": 0.12771886587142944, + "learning_rate": 0.0001, + "loss": 1.459, + "step": 2119 + }, + { + "epoch": 0.7097422162705055, + "grad_norm": 0.13762152194976807, + "learning_rate": 0.0001, + "loss": 1.5934, + "step": 2120 + }, + { + "epoch": 0.710077000334784, + "grad_norm": 0.13554149866104126, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 2121 + }, + { + "epoch": 0.7104117843990626, + "grad_norm": 0.1313951313495636, + "learning_rate": 0.0001, + "loss": 1.517, + "step": 2122 + }, + { + "epoch": 0.7107465684633412, + "grad_norm": 0.12920212745666504, + "learning_rate": 0.0001, + "loss": 1.4647, + "step": 2123 + }, + { + "epoch": 0.7110813525276197, + "grad_norm": 0.13671697676181793, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 2124 + }, + { + "epoch": 0.7114161365918982, + "grad_norm": 0.12860006093978882, + "learning_rate": 0.0001, + "loss": 1.5304, + "step": 2125 + }, + { + "epoch": 0.7117509206561767, + "grad_norm": 0.12372934073209763, + "learning_rate": 0.0001, + "loss": 1.4964, + "step": 2126 + }, + { + "epoch": 0.7120857047204553, + "grad_norm": 0.13640989363193512, + "learning_rate": 0.0001, + "loss": 1.521, + "step": 2127 + }, + { + "epoch": 0.7124204887847339, + "grad_norm": 0.13121746480464935, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 2128 + }, + { + "epoch": 0.7127552728490124, + "grad_norm": 0.1307837963104248, + "learning_rate": 0.0001, + "loss": 1.5688, + "step": 2129 + }, + { + "epoch": 0.713090056913291, + "grad_norm": 0.13141870498657227, + "learning_rate": 0.0001, + "loss": 1.5435, + "step": 2130 + }, + { + "epoch": 0.7134248409775694, + "grad_norm": 0.13490049540996552, + "learning_rate": 0.0001, + "loss": 1.5421, + "step": 2131 + }, + { + "epoch": 0.713759625041848, + "grad_norm": 0.13801416754722595, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 2132 + }, + { + "epoch": 0.7140944091061265, + "grad_norm": 0.13066011667251587, + "learning_rate": 0.0001, + "loss": 1.4629, + "step": 2133 + }, + { + "epoch": 0.7144291931704051, + "grad_norm": 0.13355465233325958, + "learning_rate": 0.0001, + "loss": 1.6363, + "step": 2134 + }, + { + "epoch": 0.7147639772346837, + "grad_norm": 0.12968328595161438, + "learning_rate": 0.0001, + "loss": 1.4454, + "step": 2135 + }, + { + "epoch": 0.7150987612989622, + "grad_norm": 0.14093713462352753, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 2136 + }, + { + "epoch": 0.7154335453632407, + "grad_norm": 0.13097916543483734, + "learning_rate": 0.0001, + "loss": 1.5531, + "step": 2137 + }, + { + "epoch": 0.7157683294275192, + "grad_norm": 0.1295294314622879, + "learning_rate": 0.0001, + "loss": 1.5923, + "step": 2138 + }, + { + "epoch": 0.7161031134917978, + "grad_norm": 0.13776849210262299, + "learning_rate": 0.0001, + "loss": 1.5992, + "step": 2139 + }, + { + "epoch": 0.7164378975560763, + "grad_norm": 0.13502860069274902, + "learning_rate": 0.0001, + "loss": 1.4677, + "step": 2140 + }, + { + "epoch": 0.7167726816203549, + "grad_norm": 0.13480490446090698, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 2141 + }, + { + "epoch": 0.7171074656846335, + "grad_norm": 0.13483154773712158, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 2142 + }, + { + "epoch": 0.7174422497489119, + "grad_norm": 0.14340271055698395, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 2143 + }, + { + "epoch": 0.7177770338131905, + "grad_norm": 0.13620589673519135, + "learning_rate": 0.0001, + "loss": 1.5193, + "step": 2144 + }, + { + "epoch": 0.718111817877469, + "grad_norm": 0.13150522112846375, + "learning_rate": 0.0001, + "loss": 1.5038, + "step": 2145 + }, + { + "epoch": 0.7184466019417476, + "grad_norm": 0.13259613513946533, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 2146 + }, + { + "epoch": 0.7187813860060261, + "grad_norm": 0.1307973563671112, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 2147 + }, + { + "epoch": 0.7191161700703047, + "grad_norm": 0.13372613489627838, + "learning_rate": 0.0001, + "loss": 1.5352, + "step": 2148 + }, + { + "epoch": 0.7194509541345832, + "grad_norm": 0.13534867763519287, + "learning_rate": 0.0001, + "loss": 1.4652, + "step": 2149 + }, + { + "epoch": 0.7197857381988617, + "grad_norm": 0.1332571804523468, + "learning_rate": 0.0001, + "loss": 1.5532, + "step": 2150 + }, + { + "epoch": 0.7201205222631403, + "grad_norm": 0.13172098994255066, + "learning_rate": 0.0001, + "loss": 1.4728, + "step": 2151 + }, + { + "epoch": 0.7204553063274188, + "grad_norm": 0.12765897810459137, + "learning_rate": 0.0001, + "loss": 1.4597, + "step": 2152 + }, + { + "epoch": 0.7207900903916974, + "grad_norm": 0.13026951253414154, + "learning_rate": 0.0001, + "loss": 1.4877, + "step": 2153 + }, + { + "epoch": 0.721124874455976, + "grad_norm": 0.1389724761247635, + "learning_rate": 0.0001, + "loss": 1.5332, + "step": 2154 + }, + { + "epoch": 0.7214596585202544, + "grad_norm": 0.13382194936275482, + "learning_rate": 0.0001, + "loss": 1.5179, + "step": 2155 + }, + { + "epoch": 0.721794442584533, + "grad_norm": 0.12780801951885223, + "learning_rate": 0.0001, + "loss": 1.4393, + "step": 2156 + }, + { + "epoch": 0.7221292266488115, + "grad_norm": 0.1323569118976593, + "learning_rate": 0.0001, + "loss": 1.5528, + "step": 2157 + }, + { + "epoch": 0.7224640107130901, + "grad_norm": 0.1358579397201538, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 2158 + }, + { + "epoch": 0.7227987947773686, + "grad_norm": 0.13905704021453857, + "learning_rate": 0.0001, + "loss": 1.5979, + "step": 2159 + }, + { + "epoch": 0.7231335788416471, + "grad_norm": 0.1356305480003357, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 2160 + }, + { + "epoch": 0.7234683629059256, + "grad_norm": 0.13545480370521545, + "learning_rate": 0.0001, + "loss": 1.5622, + "step": 2161 + }, + { + "epoch": 0.7238031469702042, + "grad_norm": 0.13289092481136322, + "learning_rate": 0.0001, + "loss": 1.5253, + "step": 2162 + }, + { + "epoch": 0.7241379310344828, + "grad_norm": 0.130274698138237, + "learning_rate": 0.0001, + "loss": 1.4498, + "step": 2163 + }, + { + "epoch": 0.7244727150987613, + "grad_norm": 0.13009384274482727, + "learning_rate": 0.0001, + "loss": 1.5593, + "step": 2164 + }, + { + "epoch": 0.7248074991630399, + "grad_norm": 0.13778330385684967, + "learning_rate": 0.0001, + "loss": 1.4054, + "step": 2165 + }, + { + "epoch": 0.7251422832273183, + "grad_norm": 0.14639288187026978, + "learning_rate": 0.0001, + "loss": 1.5563, + "step": 2166 + }, + { + "epoch": 0.7254770672915969, + "grad_norm": 0.14019513130187988, + "learning_rate": 0.0001, + "loss": 1.6143, + "step": 2167 + }, + { + "epoch": 0.7258118513558754, + "grad_norm": 0.15255634486675262, + "learning_rate": 0.0001, + "loss": 1.4999, + "step": 2168 + }, + { + "epoch": 0.726146635420154, + "grad_norm": 0.133973628282547, + "learning_rate": 0.0001, + "loss": 1.5648, + "step": 2169 + }, + { + "epoch": 0.7264814194844326, + "grad_norm": 0.14227105677127838, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 2170 + }, + { + "epoch": 0.7268162035487111, + "grad_norm": 0.13694263994693756, + "learning_rate": 0.0001, + "loss": 1.5454, + "step": 2171 + }, + { + "epoch": 0.7271509876129896, + "grad_norm": 0.1395786851644516, + "learning_rate": 0.0001, + "loss": 1.6018, + "step": 2172 + }, + { + "epoch": 0.7274857716772681, + "grad_norm": 0.13695751130580902, + "learning_rate": 0.0001, + "loss": 1.5542, + "step": 2173 + }, + { + "epoch": 0.7278205557415467, + "grad_norm": 0.14114227890968323, + "learning_rate": 0.0001, + "loss": 1.4742, + "step": 2174 + }, + { + "epoch": 0.7281553398058253, + "grad_norm": 0.14633609354496002, + "learning_rate": 0.0001, + "loss": 1.5335, + "step": 2175 + }, + { + "epoch": 0.7284901238701038, + "grad_norm": 0.12929964065551758, + "learning_rate": 0.0001, + "loss": 1.4759, + "step": 2176 + }, + { + "epoch": 0.7288249079343824, + "grad_norm": 0.14383701980113983, + "learning_rate": 0.0001, + "loss": 1.5744, + "step": 2177 + }, + { + "epoch": 0.7291596919986608, + "grad_norm": 0.14609093964099884, + "learning_rate": 0.0001, + "loss": 1.4927, + "step": 2178 + }, + { + "epoch": 0.7294944760629394, + "grad_norm": 0.13813704252243042, + "learning_rate": 0.0001, + "loss": 1.535, + "step": 2179 + }, + { + "epoch": 0.7298292601272179, + "grad_norm": 0.13343721628189087, + "learning_rate": 0.0001, + "loss": 1.5239, + "step": 2180 + }, + { + "epoch": 0.7301640441914965, + "grad_norm": 0.13793961703777313, + "learning_rate": 0.0001, + "loss": 1.4959, + "step": 2181 + }, + { + "epoch": 0.7304988282557751, + "grad_norm": 0.14635740220546722, + "learning_rate": 0.0001, + "loss": 1.5759, + "step": 2182 + }, + { + "epoch": 0.7308336123200536, + "grad_norm": 0.13331273198127747, + "learning_rate": 0.0001, + "loss": 1.5169, + "step": 2183 + }, + { + "epoch": 0.7311683963843321, + "grad_norm": 0.13492250442504883, + "learning_rate": 0.0001, + "loss": 1.4711, + "step": 2184 + }, + { + "epoch": 0.7315031804486106, + "grad_norm": 0.14489556849002838, + "learning_rate": 0.0001, + "loss": 1.584, + "step": 2185 + }, + { + "epoch": 0.7318379645128892, + "grad_norm": 0.13701508939266205, + "learning_rate": 0.0001, + "loss": 1.5844, + "step": 2186 + }, + { + "epoch": 0.7321727485771677, + "grad_norm": 0.1370009034872055, + "learning_rate": 0.0001, + "loss": 1.5287, + "step": 2187 + }, + { + "epoch": 0.7325075326414463, + "grad_norm": 0.14577260613441467, + "learning_rate": 0.0001, + "loss": 1.4752, + "step": 2188 + }, + { + "epoch": 0.7328423167057249, + "grad_norm": 0.1377391368150711, + "learning_rate": 0.0001, + "loss": 1.5484, + "step": 2189 + }, + { + "epoch": 0.7331771007700033, + "grad_norm": 0.1396346390247345, + "learning_rate": 0.0001, + "loss": 1.5405, + "step": 2190 + }, + { + "epoch": 0.7335118848342819, + "grad_norm": 0.1492149382829666, + "learning_rate": 0.0001, + "loss": 1.5028, + "step": 2191 + }, + { + "epoch": 0.7338466688985604, + "grad_norm": 0.13928255438804626, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 2192 + }, + { + "epoch": 0.734181452962839, + "grad_norm": 0.13838155567646027, + "learning_rate": 0.0001, + "loss": 1.5661, + "step": 2193 + }, + { + "epoch": 0.7345162370271175, + "grad_norm": 0.1435183733701706, + "learning_rate": 0.0001, + "loss": 1.6133, + "step": 2194 + }, + { + "epoch": 0.734851021091396, + "grad_norm": 0.13500259816646576, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 2195 + }, + { + "epoch": 0.7351858051556746, + "grad_norm": 0.13238045573234558, + "learning_rate": 0.0001, + "loss": 1.5435, + "step": 2196 + }, + { + "epoch": 0.7355205892199531, + "grad_norm": 0.13493601977825165, + "learning_rate": 0.0001, + "loss": 1.5117, + "step": 2197 + }, + { + "epoch": 0.7358553732842317, + "grad_norm": 0.1433602273464203, + "learning_rate": 0.0001, + "loss": 1.5921, + "step": 2198 + }, + { + "epoch": 0.7361901573485102, + "grad_norm": 0.13165898621082306, + "learning_rate": 0.0001, + "loss": 1.5648, + "step": 2199 + }, + { + "epoch": 0.7365249414127888, + "grad_norm": 0.1355050653219223, + "learning_rate": 0.0001, + "loss": 1.5998, + "step": 2200 + }, + { + "epoch": 0.7368597254770672, + "grad_norm": 0.1296299695968628, + "learning_rate": 0.0001, + "loss": 1.3903, + "step": 2201 + }, + { + "epoch": 0.7371945095413458, + "grad_norm": 0.13563255965709686, + "learning_rate": 0.0001, + "loss": 1.5462, + "step": 2202 + }, + { + "epoch": 0.7375292936056244, + "grad_norm": 0.13449116051197052, + "learning_rate": 0.0001, + "loss": 1.5344, + "step": 2203 + }, + { + "epoch": 0.7378640776699029, + "grad_norm": 0.12928107380867004, + "learning_rate": 0.0001, + "loss": 1.5212, + "step": 2204 + }, + { + "epoch": 0.7381988617341815, + "grad_norm": 0.13199785351753235, + "learning_rate": 0.0001, + "loss": 1.5408, + "step": 2205 + }, + { + "epoch": 0.73853364579846, + "grad_norm": 0.13608896732330322, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 2206 + }, + { + "epoch": 0.7388684298627385, + "grad_norm": 0.1248575821518898, + "learning_rate": 0.0001, + "loss": 1.4513, + "step": 2207 + }, + { + "epoch": 0.739203213927017, + "grad_norm": 0.1319798231124878, + "learning_rate": 0.0001, + "loss": 1.5231, + "step": 2208 + }, + { + "epoch": 0.7395379979912956, + "grad_norm": 0.1297694742679596, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 2209 + }, + { + "epoch": 0.7398727820555742, + "grad_norm": 0.13263830542564392, + "learning_rate": 0.0001, + "loss": 1.5746, + "step": 2210 + }, + { + "epoch": 0.7402075661198527, + "grad_norm": 0.1352548599243164, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 2211 + }, + { + "epoch": 0.7405423501841313, + "grad_norm": 0.13107185065746307, + "learning_rate": 0.0001, + "loss": 1.5053, + "step": 2212 + }, + { + "epoch": 0.7408771342484097, + "grad_norm": 0.13326485455036163, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 2213 + }, + { + "epoch": 0.7412119183126883, + "grad_norm": 0.14211507141590118, + "learning_rate": 0.0001, + "loss": 1.5694, + "step": 2214 + }, + { + "epoch": 0.7415467023769668, + "grad_norm": 0.13121196627616882, + "learning_rate": 0.0001, + "loss": 1.4977, + "step": 2215 + }, + { + "epoch": 0.7418814864412454, + "grad_norm": 0.13140466809272766, + "learning_rate": 0.0001, + "loss": 1.568, + "step": 2216 + }, + { + "epoch": 0.742216270505524, + "grad_norm": 0.1365407258272171, + "learning_rate": 0.0001, + "loss": 1.6667, + "step": 2217 + }, + { + "epoch": 0.7425510545698025, + "grad_norm": 0.13460293412208557, + "learning_rate": 0.0001, + "loss": 1.5813, + "step": 2218 + }, + { + "epoch": 0.742885838634081, + "grad_norm": 0.13729612529277802, + "learning_rate": 0.0001, + "loss": 1.5491, + "step": 2219 + }, + { + "epoch": 0.7432206226983595, + "grad_norm": 0.13383755087852478, + "learning_rate": 0.0001, + "loss": 1.5678, + "step": 2220 + }, + { + "epoch": 0.7435554067626381, + "grad_norm": 0.13744328916072845, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 2221 + }, + { + "epoch": 0.7438901908269167, + "grad_norm": 0.12934266030788422, + "learning_rate": 0.0001, + "loss": 1.5429, + "step": 2222 + }, + { + "epoch": 0.7442249748911952, + "grad_norm": 0.1308993250131607, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 2223 + }, + { + "epoch": 0.7445597589554738, + "grad_norm": 0.1382169872522354, + "learning_rate": 0.0001, + "loss": 1.6019, + "step": 2224 + }, + { + "epoch": 0.7448945430197522, + "grad_norm": 0.13184891641139984, + "learning_rate": 0.0001, + "loss": 1.5357, + "step": 2225 + }, + { + "epoch": 0.7452293270840308, + "grad_norm": 0.1404266655445099, + "learning_rate": 0.0001, + "loss": 1.5935, + "step": 2226 + }, + { + "epoch": 0.7455641111483093, + "grad_norm": 0.13625003397464752, + "learning_rate": 0.0001, + "loss": 1.5588, + "step": 2227 + }, + { + "epoch": 0.7458988952125879, + "grad_norm": 0.1287645548582077, + "learning_rate": 0.0001, + "loss": 1.435, + "step": 2228 + }, + { + "epoch": 0.7462336792768665, + "grad_norm": 0.13726918399333954, + "learning_rate": 0.0001, + "loss": 1.5453, + "step": 2229 + }, + { + "epoch": 0.746568463341145, + "grad_norm": 0.13299064338207245, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 2230 + }, + { + "epoch": 0.7469032474054235, + "grad_norm": 0.13553793728351593, + "learning_rate": 0.0001, + "loss": 1.5395, + "step": 2231 + }, + { + "epoch": 0.747238031469702, + "grad_norm": 0.13683359324932098, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 2232 + }, + { + "epoch": 0.7475728155339806, + "grad_norm": 0.13002213835716248, + "learning_rate": 0.0001, + "loss": 1.5245, + "step": 2233 + }, + { + "epoch": 0.7479075995982591, + "grad_norm": 0.13479109108448029, + "learning_rate": 0.0001, + "loss": 1.5724, + "step": 2234 + }, + { + "epoch": 0.7482423836625377, + "grad_norm": 0.13677366077899933, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 2235 + }, + { + "epoch": 0.7485771677268162, + "grad_norm": 0.14970214664936066, + "learning_rate": 0.0001, + "loss": 1.6145, + "step": 2236 + }, + { + "epoch": 0.7489119517910947, + "grad_norm": 0.1285363882780075, + "learning_rate": 0.0001, + "loss": 1.4591, + "step": 2237 + }, + { + "epoch": 0.7492467358553733, + "grad_norm": 0.14044371247291565, + "learning_rate": 0.0001, + "loss": 1.511, + "step": 2238 + }, + { + "epoch": 0.7495815199196518, + "grad_norm": 0.13310682773590088, + "learning_rate": 0.0001, + "loss": 1.5777, + "step": 2239 + }, + { + "epoch": 0.7499163039839304, + "grad_norm": 0.14290130138397217, + "learning_rate": 0.0001, + "loss": 1.5075, + "step": 2240 + }, + { + "epoch": 0.750251088048209, + "grad_norm": 0.1509731411933899, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 2241 + }, + { + "epoch": 0.7505858721124874, + "grad_norm": 0.13322798907756805, + "learning_rate": 0.0001, + "loss": 1.5722, + "step": 2242 + }, + { + "epoch": 0.750920656176766, + "grad_norm": 0.1355818659067154, + "learning_rate": 0.0001, + "loss": 1.4922, + "step": 2243 + }, + { + "epoch": 0.7512554402410445, + "grad_norm": 0.14394080638885498, + "learning_rate": 0.0001, + "loss": 1.5976, + "step": 2244 + }, + { + "epoch": 0.7515902243053231, + "grad_norm": 0.135832279920578, + "learning_rate": 0.0001, + "loss": 1.5138, + "step": 2245 + }, + { + "epoch": 0.7519250083696016, + "grad_norm": 0.13906393945217133, + "learning_rate": 0.0001, + "loss": 1.5351, + "step": 2246 + }, + { + "epoch": 0.7522597924338802, + "grad_norm": 0.13090325891971588, + "learning_rate": 0.0001, + "loss": 1.4505, + "step": 2247 + }, + { + "epoch": 0.7525945764981586, + "grad_norm": 0.13537496328353882, + "learning_rate": 0.0001, + "loss": 1.4955, + "step": 2248 + }, + { + "epoch": 0.7529293605624372, + "grad_norm": 0.1373416930437088, + "learning_rate": 0.0001, + "loss": 1.541, + "step": 2249 + }, + { + "epoch": 0.7532641446267158, + "grad_norm": 0.1294248253107071, + "learning_rate": 0.0001, + "loss": 1.4943, + "step": 2250 + }, + { + "epoch": 0.7535989286909943, + "grad_norm": 0.12977437674999237, + "learning_rate": 0.0001, + "loss": 1.5315, + "step": 2251 + }, + { + "epoch": 0.7539337127552729, + "grad_norm": 0.13353915512561798, + "learning_rate": 0.0001, + "loss": 1.4855, + "step": 2252 + }, + { + "epoch": 0.7542684968195514, + "grad_norm": 0.1338808536529541, + "learning_rate": 0.0001, + "loss": 1.5483, + "step": 2253 + }, + { + "epoch": 0.7546032808838299, + "grad_norm": 0.13082879781723022, + "learning_rate": 0.0001, + "loss": 1.5276, + "step": 2254 + }, + { + "epoch": 0.7549380649481084, + "grad_norm": 0.12903323769569397, + "learning_rate": 0.0001, + "loss": 1.5506, + "step": 2255 + }, + { + "epoch": 0.755272849012387, + "grad_norm": 0.1312693953514099, + "learning_rate": 0.0001, + "loss": 1.4347, + "step": 2256 + }, + { + "epoch": 0.7556076330766656, + "grad_norm": 0.13503922522068024, + "learning_rate": 0.0001, + "loss": 1.5089, + "step": 2257 + }, + { + "epoch": 0.7559424171409441, + "grad_norm": 0.13478560745716095, + "learning_rate": 0.0001, + "loss": 1.4717, + "step": 2258 + }, + { + "epoch": 0.7562772012052227, + "grad_norm": 0.14111362397670746, + "learning_rate": 0.0001, + "loss": 1.4982, + "step": 2259 + }, + { + "epoch": 0.7566119852695011, + "grad_norm": 0.13715283572673798, + "learning_rate": 0.0001, + "loss": 1.5166, + "step": 2260 + }, + { + "epoch": 0.7569467693337797, + "grad_norm": 0.14457426965236664, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 2261 + }, + { + "epoch": 0.7572815533980582, + "grad_norm": 0.13212622702121735, + "learning_rate": 0.0001, + "loss": 1.4653, + "step": 2262 + }, + { + "epoch": 0.7576163374623368, + "grad_norm": 0.136484295129776, + "learning_rate": 0.0001, + "loss": 1.4416, + "step": 2263 + }, + { + "epoch": 0.7579511215266154, + "grad_norm": 0.13701216876506805, + "learning_rate": 0.0001, + "loss": 1.5158, + "step": 2264 + }, + { + "epoch": 0.7582859055908939, + "grad_norm": 0.13045822083950043, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 2265 + }, + { + "epoch": 0.7586206896551724, + "grad_norm": 0.13484729826450348, + "learning_rate": 0.0001, + "loss": 1.4919, + "step": 2266 + }, + { + "epoch": 0.7589554737194509, + "grad_norm": 0.1352708488702774, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 2267 + }, + { + "epoch": 0.7592902577837295, + "grad_norm": 0.13968177139759064, + "learning_rate": 0.0001, + "loss": 1.5983, + "step": 2268 + }, + { + "epoch": 0.759625041848008, + "grad_norm": 0.13527031242847443, + "learning_rate": 0.0001, + "loss": 1.5361, + "step": 2269 + }, + { + "epoch": 0.7599598259122866, + "grad_norm": 0.13342413306236267, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 2270 + }, + { + "epoch": 0.7602946099765651, + "grad_norm": 0.13037632405757904, + "learning_rate": 0.0001, + "loss": 1.4433, + "step": 2271 + }, + { + "epoch": 0.7606293940408436, + "grad_norm": 0.12888109683990479, + "learning_rate": 0.0001, + "loss": 1.5565, + "step": 2272 + }, + { + "epoch": 0.7609641781051222, + "grad_norm": 0.13160650432109833, + "learning_rate": 0.0001, + "loss": 1.6344, + "step": 2273 + }, + { + "epoch": 0.7612989621694007, + "grad_norm": 0.13456179201602936, + "learning_rate": 0.0001, + "loss": 1.5983, + "step": 2274 + }, + { + "epoch": 0.7616337462336793, + "grad_norm": 0.12624886631965637, + "learning_rate": 0.0001, + "loss": 1.4877, + "step": 2275 + }, + { + "epoch": 0.7619685302979579, + "grad_norm": 0.13493984937667847, + "learning_rate": 0.0001, + "loss": 1.6083, + "step": 2276 + }, + { + "epoch": 0.7623033143622363, + "grad_norm": 0.13616621494293213, + "learning_rate": 0.0001, + "loss": 1.59, + "step": 2277 + }, + { + "epoch": 0.7626380984265149, + "grad_norm": 0.1309913843870163, + "learning_rate": 0.0001, + "loss": 1.5356, + "step": 2278 + }, + { + "epoch": 0.7629728824907934, + "grad_norm": 0.1269841343164444, + "learning_rate": 0.0001, + "loss": 1.442, + "step": 2279 + }, + { + "epoch": 0.763307666555072, + "grad_norm": 0.13083530962467194, + "learning_rate": 0.0001, + "loss": 1.4919, + "step": 2280 + }, + { + "epoch": 0.7636424506193505, + "grad_norm": 0.13288795948028564, + "learning_rate": 0.0001, + "loss": 1.5919, + "step": 2281 + }, + { + "epoch": 0.7639772346836291, + "grad_norm": 0.1334894597530365, + "learning_rate": 0.0001, + "loss": 1.5203, + "step": 2282 + }, + { + "epoch": 0.7643120187479076, + "grad_norm": 0.1322222203016281, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 2283 + }, + { + "epoch": 0.7646468028121861, + "grad_norm": 0.13740068674087524, + "learning_rate": 0.0001, + "loss": 1.5966, + "step": 2284 + }, + { + "epoch": 0.7649815868764647, + "grad_norm": 0.13021446764469147, + "learning_rate": 0.0001, + "loss": 1.5163, + "step": 2285 + }, + { + "epoch": 0.7653163709407432, + "grad_norm": 0.13992641866207123, + "learning_rate": 0.0001, + "loss": 1.5116, + "step": 2286 + }, + { + "epoch": 0.7656511550050218, + "grad_norm": 0.13332848250865936, + "learning_rate": 0.0001, + "loss": 1.5066, + "step": 2287 + }, + { + "epoch": 0.7659859390693003, + "grad_norm": 0.12683235108852386, + "learning_rate": 0.0001, + "loss": 1.4933, + "step": 2288 + }, + { + "epoch": 0.7663207231335788, + "grad_norm": 0.13610418140888214, + "learning_rate": 0.0001, + "loss": 1.5115, + "step": 2289 + }, + { + "epoch": 0.7666555071978574, + "grad_norm": 0.13530276715755463, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 2290 + }, + { + "epoch": 0.7669902912621359, + "grad_norm": 0.13067664206027985, + "learning_rate": 0.0001, + "loss": 1.4806, + "step": 2291 + }, + { + "epoch": 0.7673250753264145, + "grad_norm": 0.12956401705741882, + "learning_rate": 0.0001, + "loss": 1.4432, + "step": 2292 + }, + { + "epoch": 0.767659859390693, + "grad_norm": 0.1368110626935959, + "learning_rate": 0.0001, + "loss": 1.5858, + "step": 2293 + }, + { + "epoch": 0.7679946434549716, + "grad_norm": 0.13342629373073578, + "learning_rate": 0.0001, + "loss": 1.4773, + "step": 2294 + }, + { + "epoch": 0.76832942751925, + "grad_norm": 0.13525448739528656, + "learning_rate": 0.0001, + "loss": 1.5574, + "step": 2295 + }, + { + "epoch": 0.7686642115835286, + "grad_norm": 0.14219002425670624, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 2296 + }, + { + "epoch": 0.7689989956478072, + "grad_norm": 0.13410523533821106, + "learning_rate": 0.0001, + "loss": 1.5414, + "step": 2297 + }, + { + "epoch": 0.7693337797120857, + "grad_norm": 0.1366255283355713, + "learning_rate": 0.0001, + "loss": 1.5588, + "step": 2298 + }, + { + "epoch": 0.7696685637763643, + "grad_norm": 0.14335733652114868, + "learning_rate": 0.0001, + "loss": 1.4797, + "step": 2299 + }, + { + "epoch": 0.7700033478406428, + "grad_norm": 0.13368913531303406, + "learning_rate": 0.0001, + "loss": 1.5068, + "step": 2300 + }, + { + "epoch": 0.7703381319049213, + "grad_norm": 0.14045390486717224, + "learning_rate": 0.0001, + "loss": 1.5532, + "step": 2301 + }, + { + "epoch": 0.7706729159691998, + "grad_norm": 0.13820236921310425, + "learning_rate": 0.0001, + "loss": 1.4334, + "step": 2302 + }, + { + "epoch": 0.7710077000334784, + "grad_norm": 0.13486477732658386, + "learning_rate": 0.0001, + "loss": 1.6277, + "step": 2303 + }, + { + "epoch": 0.771342484097757, + "grad_norm": 0.1374381184577942, + "learning_rate": 0.0001, + "loss": 1.4995, + "step": 2304 + }, + { + "epoch": 0.7716772681620355, + "grad_norm": 0.14841946959495544, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 2305 + }, + { + "epoch": 0.7720120522263141, + "grad_norm": 0.13106206059455872, + "learning_rate": 0.0001, + "loss": 1.5009, + "step": 2306 + }, + { + "epoch": 0.7723468362905925, + "grad_norm": 0.13768276572227478, + "learning_rate": 0.0001, + "loss": 1.5289, + "step": 2307 + }, + { + "epoch": 0.7726816203548711, + "grad_norm": 0.14987289905548096, + "learning_rate": 0.0001, + "loss": 1.5654, + "step": 2308 + }, + { + "epoch": 0.7730164044191496, + "grad_norm": 0.13422365486621857, + "learning_rate": 0.0001, + "loss": 1.5781, + "step": 2309 + }, + { + "epoch": 0.7733511884834282, + "grad_norm": 0.14007548987865448, + "learning_rate": 0.0001, + "loss": 1.468, + "step": 2310 + }, + { + "epoch": 0.7736859725477068, + "grad_norm": 0.140237495303154, + "learning_rate": 0.0001, + "loss": 1.4408, + "step": 2311 + }, + { + "epoch": 0.7740207566119852, + "grad_norm": 0.1331593543291092, + "learning_rate": 0.0001, + "loss": 1.5213, + "step": 2312 + }, + { + "epoch": 0.7743555406762638, + "grad_norm": 0.13670580089092255, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 2313 + }, + { + "epoch": 0.7746903247405423, + "grad_norm": 0.13198411464691162, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 2314 + }, + { + "epoch": 0.7750251088048209, + "grad_norm": 0.14384810626506805, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 2315 + }, + { + "epoch": 0.7753598928690995, + "grad_norm": 0.12808088958263397, + "learning_rate": 0.0001, + "loss": 1.4751, + "step": 2316 + }, + { + "epoch": 0.775694676933378, + "grad_norm": 0.14130346477031708, + "learning_rate": 0.0001, + "loss": 1.5306, + "step": 2317 + }, + { + "epoch": 0.7760294609976565, + "grad_norm": 0.13153797388076782, + "learning_rate": 0.0001, + "loss": 1.5046, + "step": 2318 + }, + { + "epoch": 0.776364245061935, + "grad_norm": 0.13447383046150208, + "learning_rate": 0.0001, + "loss": 1.5288, + "step": 2319 + }, + { + "epoch": 0.7766990291262136, + "grad_norm": 0.13588428497314453, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 2320 + }, + { + "epoch": 0.7770338131904921, + "grad_norm": 0.1414654701948166, + "learning_rate": 0.0001, + "loss": 1.6252, + "step": 2321 + }, + { + "epoch": 0.7773685972547707, + "grad_norm": 0.14798319339752197, + "learning_rate": 0.0001, + "loss": 1.5182, + "step": 2322 + }, + { + "epoch": 0.7777033813190493, + "grad_norm": 0.13594651222229004, + "learning_rate": 0.0001, + "loss": 1.59, + "step": 2323 + }, + { + "epoch": 0.7780381653833277, + "grad_norm": 0.13689537346363068, + "learning_rate": 0.0001, + "loss": 1.5312, + "step": 2324 + }, + { + "epoch": 0.7783729494476063, + "grad_norm": 0.13842853903770447, + "learning_rate": 0.0001, + "loss": 1.5453, + "step": 2325 + }, + { + "epoch": 0.7787077335118848, + "grad_norm": 0.14006944000720978, + "learning_rate": 0.0001, + "loss": 1.5789, + "step": 2326 + }, + { + "epoch": 0.7790425175761634, + "grad_norm": 0.1328335702419281, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 2327 + }, + { + "epoch": 0.7793773016404419, + "grad_norm": 0.1366383582353592, + "learning_rate": 0.0001, + "loss": 1.5861, + "step": 2328 + }, + { + "epoch": 0.7797120857047205, + "grad_norm": 0.1384078413248062, + "learning_rate": 0.0001, + "loss": 1.4768, + "step": 2329 + }, + { + "epoch": 0.780046869768999, + "grad_norm": 0.13138563930988312, + "learning_rate": 0.0001, + "loss": 1.5415, + "step": 2330 + }, + { + "epoch": 0.7803816538332775, + "grad_norm": 0.13533802330493927, + "learning_rate": 0.0001, + "loss": 1.5351, + "step": 2331 + }, + { + "epoch": 0.7807164378975561, + "grad_norm": 0.12634359300136566, + "learning_rate": 0.0001, + "loss": 1.4854, + "step": 2332 + }, + { + "epoch": 0.7810512219618346, + "grad_norm": 0.14045196771621704, + "learning_rate": 0.0001, + "loss": 1.5979, + "step": 2333 + }, + { + "epoch": 0.7813860060261132, + "grad_norm": 0.12970393896102905, + "learning_rate": 0.0001, + "loss": 1.4883, + "step": 2334 + }, + { + "epoch": 0.7817207900903917, + "grad_norm": 0.13416926562786102, + "learning_rate": 0.0001, + "loss": 1.538, + "step": 2335 + }, + { + "epoch": 0.7820555741546702, + "grad_norm": 0.12993508577346802, + "learning_rate": 0.0001, + "loss": 1.3861, + "step": 2336 + }, + { + "epoch": 0.7823903582189488, + "grad_norm": 0.1441780924797058, + "learning_rate": 0.0001, + "loss": 1.5082, + "step": 2337 + }, + { + "epoch": 0.7827251422832273, + "grad_norm": 0.1340634673833847, + "learning_rate": 0.0001, + "loss": 1.5308, + "step": 2338 + }, + { + "epoch": 0.7830599263475059, + "grad_norm": 0.1375696063041687, + "learning_rate": 0.0001, + "loss": 1.4726, + "step": 2339 + }, + { + "epoch": 0.7833947104117844, + "grad_norm": 0.13143296539783478, + "learning_rate": 0.0001, + "loss": 1.5403, + "step": 2340 + }, + { + "epoch": 0.783729494476063, + "grad_norm": 0.14144007861614227, + "learning_rate": 0.0001, + "loss": 1.5596, + "step": 2341 + }, + { + "epoch": 0.7840642785403414, + "grad_norm": 0.1288491189479828, + "learning_rate": 0.0001, + "loss": 1.4793, + "step": 2342 + }, + { + "epoch": 0.78439906260462, + "grad_norm": 0.13762634992599487, + "learning_rate": 0.0001, + "loss": 1.5224, + "step": 2343 + }, + { + "epoch": 0.7847338466688986, + "grad_norm": 0.1369268000125885, + "learning_rate": 0.0001, + "loss": 1.5678, + "step": 2344 + }, + { + "epoch": 0.7850686307331771, + "grad_norm": 0.1348867565393448, + "learning_rate": 0.0001, + "loss": 1.5764, + "step": 2345 + }, + { + "epoch": 0.7854034147974557, + "grad_norm": 0.13499613106250763, + "learning_rate": 0.0001, + "loss": 1.5317, + "step": 2346 + }, + { + "epoch": 0.7857381988617341, + "grad_norm": 0.136494979262352, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 2347 + }, + { + "epoch": 0.7860729829260127, + "grad_norm": 0.13742174208164215, + "learning_rate": 0.0001, + "loss": 1.5524, + "step": 2348 + }, + { + "epoch": 0.7864077669902912, + "grad_norm": 0.1315702348947525, + "learning_rate": 0.0001, + "loss": 1.5199, + "step": 2349 + }, + { + "epoch": 0.7867425510545698, + "grad_norm": 0.1344085931777954, + "learning_rate": 0.0001, + "loss": 1.5222, + "step": 2350 + }, + { + "epoch": 0.7870773351188484, + "grad_norm": 0.1331881582736969, + "learning_rate": 0.0001, + "loss": 1.4746, + "step": 2351 + }, + { + "epoch": 0.7874121191831269, + "grad_norm": 0.13880756497383118, + "learning_rate": 0.0001, + "loss": 1.5027, + "step": 2352 + }, + { + "epoch": 0.7877469032474054, + "grad_norm": 0.1315576285123825, + "learning_rate": 0.0001, + "loss": 1.5833, + "step": 2353 + }, + { + "epoch": 0.7880816873116839, + "grad_norm": 0.1278029829263687, + "learning_rate": 0.0001, + "loss": 1.4475, + "step": 2354 + }, + { + "epoch": 0.7884164713759625, + "grad_norm": 0.14114075899124146, + "learning_rate": 0.0001, + "loss": 1.4451, + "step": 2355 + }, + { + "epoch": 0.788751255440241, + "grad_norm": 0.1352827101945877, + "learning_rate": 0.0001, + "loss": 1.4816, + "step": 2356 + }, + { + "epoch": 0.7890860395045196, + "grad_norm": 0.1316574364900589, + "learning_rate": 0.0001, + "loss": 1.4572, + "step": 2357 + }, + { + "epoch": 0.7894208235687982, + "grad_norm": 0.13792237639427185, + "learning_rate": 0.0001, + "loss": 1.6108, + "step": 2358 + }, + { + "epoch": 0.7897556076330766, + "grad_norm": 0.1365162879228592, + "learning_rate": 0.0001, + "loss": 1.5303, + "step": 2359 + }, + { + "epoch": 0.7900903916973552, + "grad_norm": 0.13918493688106537, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 2360 + }, + { + "epoch": 0.7904251757616337, + "grad_norm": 0.1277536302804947, + "learning_rate": 0.0001, + "loss": 1.5365, + "step": 2361 + }, + { + "epoch": 0.7907599598259123, + "grad_norm": 0.13407327234745026, + "learning_rate": 0.0001, + "loss": 1.4571, + "step": 2362 + }, + { + "epoch": 0.7910947438901909, + "grad_norm": 0.1346539407968521, + "learning_rate": 0.0001, + "loss": 1.4506, + "step": 2363 + }, + { + "epoch": 0.7914295279544694, + "grad_norm": 0.13160093128681183, + "learning_rate": 0.0001, + "loss": 1.4457, + "step": 2364 + }, + { + "epoch": 0.7917643120187479, + "grad_norm": 0.13025003671646118, + "learning_rate": 0.0001, + "loss": 1.56, + "step": 2365 + }, + { + "epoch": 0.7920990960830264, + "grad_norm": 0.14476409554481506, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 2366 + }, + { + "epoch": 0.792433880147305, + "grad_norm": 0.13053929805755615, + "learning_rate": 0.0001, + "loss": 1.4338, + "step": 2367 + }, + { + "epoch": 0.7927686642115835, + "grad_norm": 0.13872520625591278, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 2368 + }, + { + "epoch": 0.7931034482758621, + "grad_norm": 0.14061668515205383, + "learning_rate": 0.0001, + "loss": 1.4886, + "step": 2369 + }, + { + "epoch": 0.7934382323401407, + "grad_norm": 0.130232036113739, + "learning_rate": 0.0001, + "loss": 1.4023, + "step": 2370 + }, + { + "epoch": 0.7937730164044191, + "grad_norm": 0.23358748853206635, + "learning_rate": 0.0001, + "loss": 1.457, + "step": 2371 + }, + { + "epoch": 0.7941078004686977, + "grad_norm": 0.13233914971351624, + "learning_rate": 0.0001, + "loss": 1.4307, + "step": 2372 + }, + { + "epoch": 0.7944425845329762, + "grad_norm": 0.13504283130168915, + "learning_rate": 0.0001, + "loss": 1.5976, + "step": 2373 + }, + { + "epoch": 0.7947773685972548, + "grad_norm": 0.13976161181926727, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 2374 + }, + { + "epoch": 0.7951121526615333, + "grad_norm": 0.1336098313331604, + "learning_rate": 0.0001, + "loss": 1.4469, + "step": 2375 + }, + { + "epoch": 0.7954469367258119, + "grad_norm": 0.13648861646652222, + "learning_rate": 0.0001, + "loss": 1.4964, + "step": 2376 + }, + { + "epoch": 0.7957817207900904, + "grad_norm": 0.13627798855304718, + "learning_rate": 0.0001, + "loss": 1.5834, + "step": 2377 + }, + { + "epoch": 0.7961165048543689, + "grad_norm": 0.14114542305469513, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 2378 + }, + { + "epoch": 0.7964512889186475, + "grad_norm": 0.13499446213245392, + "learning_rate": 0.0001, + "loss": 1.5174, + "step": 2379 + }, + { + "epoch": 0.796786072982926, + "grad_norm": 0.14620280265808105, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 2380 + }, + { + "epoch": 0.7971208570472046, + "grad_norm": 0.13239939510822296, + "learning_rate": 0.0001, + "loss": 1.5274, + "step": 2381 + }, + { + "epoch": 0.7974556411114831, + "grad_norm": 0.13517913222312927, + "learning_rate": 0.0001, + "loss": 1.5291, + "step": 2382 + }, + { + "epoch": 0.7977904251757616, + "grad_norm": 0.1352391242980957, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 2383 + }, + { + "epoch": 0.7981252092400402, + "grad_norm": 0.14000670611858368, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 2384 + }, + { + "epoch": 0.7984599933043187, + "grad_norm": 0.1349296271800995, + "learning_rate": 0.0001, + "loss": 1.5001, + "step": 2385 + }, + { + "epoch": 0.7987947773685973, + "grad_norm": 0.1352308988571167, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 2386 + }, + { + "epoch": 0.7991295614328758, + "grad_norm": 0.1368694305419922, + "learning_rate": 0.0001, + "loss": 1.5861, + "step": 2387 + }, + { + "epoch": 0.7994643454971543, + "grad_norm": 0.1355554759502411, + "learning_rate": 0.0001, + "loss": 1.5377, + "step": 2388 + }, + { + "epoch": 0.7997991295614328, + "grad_norm": 0.13328254222869873, + "learning_rate": 0.0001, + "loss": 1.5517, + "step": 2389 + }, + { + "epoch": 0.8001339136257114, + "grad_norm": 0.13724930584430695, + "learning_rate": 0.0001, + "loss": 1.5987, + "step": 2390 + }, + { + "epoch": 0.80046869768999, + "grad_norm": 0.13542616367340088, + "learning_rate": 0.0001, + "loss": 1.6654, + "step": 2391 + }, + { + "epoch": 0.8008034817542685, + "grad_norm": 0.1366943120956421, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 2392 + }, + { + "epoch": 0.8011382658185471, + "grad_norm": 0.13868063688278198, + "learning_rate": 0.0001, + "loss": 1.587, + "step": 2393 + }, + { + "epoch": 0.8014730498828255, + "grad_norm": 0.1393207311630249, + "learning_rate": 0.0001, + "loss": 1.5559, + "step": 2394 + }, + { + "epoch": 0.8018078339471041, + "grad_norm": 0.13909262418746948, + "learning_rate": 0.0001, + "loss": 1.5007, + "step": 2395 + }, + { + "epoch": 0.8021426180113826, + "grad_norm": 0.12949267029762268, + "learning_rate": 0.0001, + "loss": 1.5108, + "step": 2396 + }, + { + "epoch": 0.8024774020756612, + "grad_norm": 0.12755730748176575, + "learning_rate": 0.0001, + "loss": 1.5008, + "step": 2397 + }, + { + "epoch": 0.8028121861399398, + "grad_norm": 0.12899887561798096, + "learning_rate": 0.0001, + "loss": 1.3877, + "step": 2398 + }, + { + "epoch": 0.8031469702042183, + "grad_norm": 0.1423116773366928, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 2399 + }, + { + "epoch": 0.8034817542684968, + "grad_norm": 0.13548225164413452, + "learning_rate": 0.0001, + "loss": 1.5214, + "step": 2400 + }, + { + "epoch": 0.8038165383327753, + "grad_norm": 0.13150808215141296, + "learning_rate": 0.0001, + "loss": 1.4772, + "step": 2401 + }, + { + "epoch": 0.8041513223970539, + "grad_norm": 0.13790038228034973, + "learning_rate": 0.0001, + "loss": 1.5704, + "step": 2402 + }, + { + "epoch": 0.8044861064613325, + "grad_norm": 0.13106264173984528, + "learning_rate": 0.0001, + "loss": 1.5073, + "step": 2403 + }, + { + "epoch": 0.804820890525611, + "grad_norm": 0.13568797707557678, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 2404 + }, + { + "epoch": 0.8051556745898896, + "grad_norm": 0.13882842659950256, + "learning_rate": 0.0001, + "loss": 1.5571, + "step": 2405 + }, + { + "epoch": 0.805490458654168, + "grad_norm": 0.1312180459499359, + "learning_rate": 0.0001, + "loss": 1.5625, + "step": 2406 + }, + { + "epoch": 0.8058252427184466, + "grad_norm": 0.12823453545570374, + "learning_rate": 0.0001, + "loss": 1.5046, + "step": 2407 + }, + { + "epoch": 0.8061600267827251, + "grad_norm": 0.13207179307937622, + "learning_rate": 0.0001, + "loss": 1.5031, + "step": 2408 + }, + { + "epoch": 0.8064948108470037, + "grad_norm": 0.1277305632829666, + "learning_rate": 0.0001, + "loss": 1.4867, + "step": 2409 + }, + { + "epoch": 0.8068295949112823, + "grad_norm": 0.13227322697639465, + "learning_rate": 0.0001, + "loss": 1.5019, + "step": 2410 + }, + { + "epoch": 0.8071643789755608, + "grad_norm": 0.1336304098367691, + "learning_rate": 0.0001, + "loss": 1.4424, + "step": 2411 + }, + { + "epoch": 0.8074991630398393, + "grad_norm": 0.13859078288078308, + "learning_rate": 0.0001, + "loss": 1.5301, + "step": 2412 + }, + { + "epoch": 0.8078339471041178, + "grad_norm": 0.1342136412858963, + "learning_rate": 0.0001, + "loss": 1.485, + "step": 2413 + }, + { + "epoch": 0.8081687311683964, + "grad_norm": 0.14003999531269073, + "learning_rate": 0.0001, + "loss": 1.5313, + "step": 2414 + }, + { + "epoch": 0.8085035152326749, + "grad_norm": 0.13216662406921387, + "learning_rate": 0.0001, + "loss": 1.52, + "step": 2415 + }, + { + "epoch": 0.8088382992969535, + "grad_norm": 0.1373407393693924, + "learning_rate": 0.0001, + "loss": 1.5157, + "step": 2416 + }, + { + "epoch": 0.8091730833612321, + "grad_norm": 0.13850343227386475, + "learning_rate": 0.0001, + "loss": 1.4971, + "step": 2417 + }, + { + "epoch": 0.8095078674255105, + "grad_norm": 0.1334608793258667, + "learning_rate": 0.0001, + "loss": 1.5237, + "step": 2418 + }, + { + "epoch": 0.8098426514897891, + "grad_norm": 0.13133668899536133, + "learning_rate": 0.0001, + "loss": 1.5053, + "step": 2419 + }, + { + "epoch": 0.8101774355540676, + "grad_norm": 0.13715368509292603, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 2420 + }, + { + "epoch": 0.8105122196183462, + "grad_norm": 0.14129430055618286, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 2421 + }, + { + "epoch": 0.8108470036826247, + "grad_norm": 0.133287250995636, + "learning_rate": 0.0001, + "loss": 1.4701, + "step": 2422 + }, + { + "epoch": 0.8111817877469032, + "grad_norm": 0.137081116437912, + "learning_rate": 0.0001, + "loss": 1.4562, + "step": 2423 + }, + { + "epoch": 0.8115165718111818, + "grad_norm": 0.13136571645736694, + "learning_rate": 0.0001, + "loss": 1.5014, + "step": 2424 + }, + { + "epoch": 0.8118513558754603, + "grad_norm": 0.13660964369773865, + "learning_rate": 0.0001, + "loss": 1.5533, + "step": 2425 + }, + { + "epoch": 0.8121861399397389, + "grad_norm": 0.145840123295784, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 2426 + }, + { + "epoch": 0.8125209240040174, + "grad_norm": 0.13612517714500427, + "learning_rate": 0.0001, + "loss": 1.4968, + "step": 2427 + }, + { + "epoch": 0.812855708068296, + "grad_norm": 0.14182846248149872, + "learning_rate": 0.0001, + "loss": 1.5507, + "step": 2428 + }, + { + "epoch": 0.8131904921325744, + "grad_norm": 0.13697752356529236, + "learning_rate": 0.0001, + "loss": 1.5241, + "step": 2429 + }, + { + "epoch": 0.813525276196853, + "grad_norm": 0.14000248908996582, + "learning_rate": 0.0001, + "loss": 1.6002, + "step": 2430 + }, + { + "epoch": 0.8138600602611316, + "grad_norm": 0.13774293661117554, + "learning_rate": 0.0001, + "loss": 1.5198, + "step": 2431 + }, + { + "epoch": 0.8141948443254101, + "grad_norm": 0.13524143397808075, + "learning_rate": 0.0001, + "loss": 1.5326, + "step": 2432 + }, + { + "epoch": 0.8145296283896887, + "grad_norm": 0.13584178686141968, + "learning_rate": 0.0001, + "loss": 1.5313, + "step": 2433 + }, + { + "epoch": 0.8148644124539672, + "grad_norm": 0.13589173555374146, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 2434 + }, + { + "epoch": 0.8151991965182457, + "grad_norm": 0.1420723795890808, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 2435 + }, + { + "epoch": 0.8155339805825242, + "grad_norm": 0.13078542053699493, + "learning_rate": 0.0001, + "loss": 1.4239, + "step": 2436 + }, + { + "epoch": 0.8158687646468028, + "grad_norm": 0.14007273316383362, + "learning_rate": 0.0001, + "loss": 1.5912, + "step": 2437 + }, + { + "epoch": 0.8162035487110814, + "grad_norm": 0.13472947478294373, + "learning_rate": 0.0001, + "loss": 1.5146, + "step": 2438 + }, + { + "epoch": 0.8165383327753599, + "grad_norm": 0.13456539809703827, + "learning_rate": 0.0001, + "loss": 1.5277, + "step": 2439 + }, + { + "epoch": 0.8168731168396385, + "grad_norm": 0.13376279175281525, + "learning_rate": 0.0001, + "loss": 1.4554, + "step": 2440 + }, + { + "epoch": 0.8172079009039169, + "grad_norm": 0.13720721006393433, + "learning_rate": 0.0001, + "loss": 1.5463, + "step": 2441 + }, + { + "epoch": 0.8175426849681955, + "grad_norm": 0.1363624483346939, + "learning_rate": 0.0001, + "loss": 1.537, + "step": 2442 + }, + { + "epoch": 0.817877469032474, + "grad_norm": 0.13379956781864166, + "learning_rate": 0.0001, + "loss": 1.5831, + "step": 2443 + }, + { + "epoch": 0.8182122530967526, + "grad_norm": 0.13432839512825012, + "learning_rate": 0.0001, + "loss": 1.5511, + "step": 2444 + }, + { + "epoch": 0.8185470371610312, + "grad_norm": 0.1365717500448227, + "learning_rate": 0.0001, + "loss": 1.4519, + "step": 2445 + }, + { + "epoch": 0.8188818212253097, + "grad_norm": 0.13430190086364746, + "learning_rate": 0.0001, + "loss": 1.4878, + "step": 2446 + }, + { + "epoch": 0.8192166052895882, + "grad_norm": 0.13606110215187073, + "learning_rate": 0.0001, + "loss": 1.5585, + "step": 2447 + }, + { + "epoch": 0.8195513893538667, + "grad_norm": 0.13404667377471924, + "learning_rate": 0.0001, + "loss": 1.5156, + "step": 2448 + }, + { + "epoch": 0.8198861734181453, + "grad_norm": 0.14223212003707886, + "learning_rate": 0.0001, + "loss": 1.5904, + "step": 2449 + }, + { + "epoch": 0.8202209574824239, + "grad_norm": 0.13209384679794312, + "learning_rate": 0.0001, + "loss": 1.551, + "step": 2450 + }, + { + "epoch": 0.8205557415467024, + "grad_norm": 0.13522854447364807, + "learning_rate": 0.0001, + "loss": 1.5325, + "step": 2451 + }, + { + "epoch": 0.820890525610981, + "grad_norm": 0.13555531203746796, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 2452 + }, + { + "epoch": 0.8212253096752594, + "grad_norm": 0.13121196627616882, + "learning_rate": 0.0001, + "loss": 1.5208, + "step": 2453 + }, + { + "epoch": 0.821560093739538, + "grad_norm": 0.13988123834133148, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 2454 + }, + { + "epoch": 0.8218948778038165, + "grad_norm": 0.1347675770521164, + "learning_rate": 0.0001, + "loss": 1.5212, + "step": 2455 + }, + { + "epoch": 0.8222296618680951, + "grad_norm": 0.13975632190704346, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 2456 + }, + { + "epoch": 0.8225644459323737, + "grad_norm": 0.1271917223930359, + "learning_rate": 0.0001, + "loss": 1.4209, + "step": 2457 + }, + { + "epoch": 0.8228992299966521, + "grad_norm": 0.13226144015789032, + "learning_rate": 0.0001, + "loss": 1.5397, + "step": 2458 + }, + { + "epoch": 0.8232340140609307, + "grad_norm": 0.1391698569059372, + "learning_rate": 0.0001, + "loss": 1.5394, + "step": 2459 + }, + { + "epoch": 0.8235687981252092, + "grad_norm": 0.13757720589637756, + "learning_rate": 0.0001, + "loss": 1.5465, + "step": 2460 + }, + { + "epoch": 0.8239035821894878, + "grad_norm": 0.13116374611854553, + "learning_rate": 0.0001, + "loss": 1.5072, + "step": 2461 + }, + { + "epoch": 0.8242383662537663, + "grad_norm": 0.13408921658992767, + "learning_rate": 0.0001, + "loss": 1.5398, + "step": 2462 + }, + { + "epoch": 0.8245731503180449, + "grad_norm": 0.13682673871517181, + "learning_rate": 0.0001, + "loss": 1.574, + "step": 2463 + }, + { + "epoch": 0.8249079343823233, + "grad_norm": 0.12918630242347717, + "learning_rate": 0.0001, + "loss": 1.4619, + "step": 2464 + }, + { + "epoch": 0.8252427184466019, + "grad_norm": 0.14337001740932465, + "learning_rate": 0.0001, + "loss": 1.5494, + "step": 2465 + }, + { + "epoch": 0.8255775025108805, + "grad_norm": 0.13083745539188385, + "learning_rate": 0.0001, + "loss": 1.4594, + "step": 2466 + }, + { + "epoch": 0.825912286575159, + "grad_norm": 0.13452093303203583, + "learning_rate": 0.0001, + "loss": 1.5114, + "step": 2467 + }, + { + "epoch": 0.8262470706394376, + "grad_norm": 0.1375538408756256, + "learning_rate": 0.0001, + "loss": 1.5472, + "step": 2468 + }, + { + "epoch": 0.8265818547037161, + "grad_norm": 0.13618512451648712, + "learning_rate": 0.0001, + "loss": 1.5067, + "step": 2469 + }, + { + "epoch": 0.8269166387679946, + "grad_norm": 0.13334475457668304, + "learning_rate": 0.0001, + "loss": 1.5626, + "step": 2470 + }, + { + "epoch": 0.8272514228322732, + "grad_norm": 0.12935003638267517, + "learning_rate": 0.0001, + "loss": 1.4524, + "step": 2471 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.1333768367767334, + "learning_rate": 0.0001, + "loss": 1.4809, + "step": 2472 + }, + { + "epoch": 0.8279209909608303, + "grad_norm": 0.139461949467659, + "learning_rate": 0.0001, + "loss": 1.5265, + "step": 2473 + }, + { + "epoch": 0.8282557750251088, + "grad_norm": 0.14345921576023102, + "learning_rate": 0.0001, + "loss": 1.5911, + "step": 2474 + }, + { + "epoch": 0.8285905590893874, + "grad_norm": 0.12835142016410828, + "learning_rate": 0.0001, + "loss": 1.4934, + "step": 2475 + }, + { + "epoch": 0.8289253431536658, + "grad_norm": 0.13207587599754333, + "learning_rate": 0.0001, + "loss": 1.5013, + "step": 2476 + }, + { + "epoch": 0.8292601272179444, + "grad_norm": 0.14216424524784088, + "learning_rate": 0.0001, + "loss": 1.5783, + "step": 2477 + }, + { + "epoch": 0.829594911282223, + "grad_norm": 0.1372382938861847, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 2478 + }, + { + "epoch": 0.8299296953465015, + "grad_norm": 0.14100505411624908, + "learning_rate": 0.0001, + "loss": 1.5893, + "step": 2479 + }, + { + "epoch": 0.8302644794107801, + "grad_norm": 0.13831539452075958, + "learning_rate": 0.0001, + "loss": 1.5308, + "step": 2480 + }, + { + "epoch": 0.8305992634750586, + "grad_norm": 0.13254091143608093, + "learning_rate": 0.0001, + "loss": 1.509, + "step": 2481 + }, + { + "epoch": 0.8309340475393371, + "grad_norm": 0.13434451818466187, + "learning_rate": 0.0001, + "loss": 1.4544, + "step": 2482 + }, + { + "epoch": 0.8312688316036156, + "grad_norm": 0.13452693819999695, + "learning_rate": 0.0001, + "loss": 1.4875, + "step": 2483 + }, + { + "epoch": 0.8316036156678942, + "grad_norm": 0.13497060537338257, + "learning_rate": 0.0001, + "loss": 1.4973, + "step": 2484 + }, + { + "epoch": 0.8319383997321728, + "grad_norm": 0.13919363915920258, + "learning_rate": 0.0001, + "loss": 1.4425, + "step": 2485 + }, + { + "epoch": 0.8322731837964513, + "grad_norm": 0.14376235008239746, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 2486 + }, + { + "epoch": 0.8326079678607299, + "grad_norm": 0.13027647137641907, + "learning_rate": 0.0001, + "loss": 1.4899, + "step": 2487 + }, + { + "epoch": 0.8329427519250083, + "grad_norm": 0.1342213749885559, + "learning_rate": 0.0001, + "loss": 1.4716, + "step": 2488 + }, + { + "epoch": 0.8332775359892869, + "grad_norm": 0.1298682540655136, + "learning_rate": 0.0001, + "loss": 1.4359, + "step": 2489 + }, + { + "epoch": 0.8336123200535654, + "grad_norm": 0.13764667510986328, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 2490 + }, + { + "epoch": 0.833947104117844, + "grad_norm": 0.13023105263710022, + "learning_rate": 0.0001, + "loss": 1.4276, + "step": 2491 + }, + { + "epoch": 0.8342818881821226, + "grad_norm": 0.1355689913034439, + "learning_rate": 0.0001, + "loss": 1.4635, + "step": 2492 + }, + { + "epoch": 0.8346166722464011, + "grad_norm": 0.13397172093391418, + "learning_rate": 0.0001, + "loss": 1.5855, + "step": 2493 + }, + { + "epoch": 0.8349514563106796, + "grad_norm": 0.13192683458328247, + "learning_rate": 0.0001, + "loss": 1.5209, + "step": 2494 + }, + { + "epoch": 0.8352862403749581, + "grad_norm": 0.13405252993106842, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 2495 + }, + { + "epoch": 0.8356210244392367, + "grad_norm": 0.13375818729400635, + "learning_rate": 0.0001, + "loss": 1.4467, + "step": 2496 + }, + { + "epoch": 0.8359558085035153, + "grad_norm": 0.12543916702270508, + "learning_rate": 0.0001, + "loss": 1.3992, + "step": 2497 + }, + { + "epoch": 0.8362905925677938, + "grad_norm": 0.13587196171283722, + "learning_rate": 0.0001, + "loss": 1.487, + "step": 2498 + }, + { + "epoch": 0.8366253766320723, + "grad_norm": 0.13462427258491516, + "learning_rate": 0.0001, + "loss": 1.5455, + "step": 2499 + }, + { + "epoch": 0.8369601606963508, + "grad_norm": 0.13338516652584076, + "learning_rate": 0.0001, + "loss": 1.5612, + "step": 2500 + }, + { + "epoch": 0.0003348961821835231, + "grad_norm": 0.1373075246810913, + "learning_rate": 0.0001, + "loss": 1.5044, + "step": 2501 + }, + { + "epoch": 0.0006697923643670462, + "grad_norm": 0.13018828630447388, + "learning_rate": 0.0001, + "loss": 1.4832, + "step": 2502 + }, + { + "epoch": 0.0010046885465505692, + "grad_norm": 0.1399035006761551, + "learning_rate": 0.0001, + "loss": 1.5154, + "step": 2503 + }, + { + "epoch": 0.0013395847287340924, + "grad_norm": 0.13555970788002014, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 2504 + }, + { + "epoch": 0.0016744809109176155, + "grad_norm": 0.13555824756622314, + "learning_rate": 0.0001, + "loss": 1.4698, + "step": 2505 + }, + { + "epoch": 0.0020093770931011385, + "grad_norm": 0.13882486522197723, + "learning_rate": 0.0001, + "loss": 1.527, + "step": 2506 + }, + { + "epoch": 0.002344273275284662, + "grad_norm": 0.1495855301618576, + "learning_rate": 0.0001, + "loss": 1.5397, + "step": 2507 + }, + { + "epoch": 0.0026791694574681848, + "grad_norm": 0.13565705716609955, + "learning_rate": 0.0001, + "loss": 1.4942, + "step": 2508 + }, + { + "epoch": 0.003014065639651708, + "grad_norm": 0.13897860050201416, + "learning_rate": 0.0001, + "loss": 1.5108, + "step": 2509 + }, + { + "epoch": 0.003348961821835231, + "grad_norm": 0.13958251476287842, + "learning_rate": 0.0001, + "loss": 1.4347, + "step": 2510 + }, + { + "epoch": 0.003683858004018754, + "grad_norm": 0.14172150194644928, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 2511 + }, + { + "epoch": 0.004018754186202277, + "grad_norm": 0.14103008806705475, + "learning_rate": 0.0001, + "loss": 1.4782, + "step": 2512 + }, + { + "epoch": 0.004353650368385801, + "grad_norm": 0.1375630795955658, + "learning_rate": 0.0001, + "loss": 1.4742, + "step": 2513 + }, + { + "epoch": 0.004688546550569324, + "grad_norm": 0.13548851013183594, + "learning_rate": 0.0001, + "loss": 1.4329, + "step": 2514 + }, + { + "epoch": 0.005023442732752847, + "grad_norm": 0.1332726776599884, + "learning_rate": 0.0001, + "loss": 1.4935, + "step": 2515 + }, + { + "epoch": 0.0053583389149363695, + "grad_norm": 0.1388276070356369, + "learning_rate": 0.0001, + "loss": 1.5153, + "step": 2516 + }, + { + "epoch": 0.0056932350971198925, + "grad_norm": 0.14141787588596344, + "learning_rate": 0.0001, + "loss": 1.5171, + "step": 2517 + }, + { + "epoch": 0.006028131279303416, + "grad_norm": 0.13511618971824646, + "learning_rate": 0.0001, + "loss": 1.4681, + "step": 2518 + }, + { + "epoch": 0.006363027461486939, + "grad_norm": 0.13339103758335114, + "learning_rate": 0.0001, + "loss": 1.5153, + "step": 2519 + }, + { + "epoch": 0.006697923643670462, + "grad_norm": 0.13787244260311127, + "learning_rate": 0.0001, + "loss": 1.4729, + "step": 2520 + }, + { + "epoch": 0.007032819825853985, + "grad_norm": 0.13926245272159576, + "learning_rate": 0.0001, + "loss": 1.5248, + "step": 2521 + }, + { + "epoch": 0.007367716008037508, + "grad_norm": 0.1414000689983368, + "learning_rate": 0.0001, + "loss": 1.5346, + "step": 2522 + }, + { + "epoch": 0.007702612190221032, + "grad_norm": 0.13984519243240356, + "learning_rate": 0.0001, + "loss": 1.47, + "step": 2523 + }, + { + "epoch": 0.008037508372404554, + "grad_norm": 0.13971388339996338, + "learning_rate": 0.0001, + "loss": 1.519, + "step": 2524 + }, + { + "epoch": 0.008372404554588079, + "grad_norm": 0.13793839514255524, + "learning_rate": 0.0001, + "loss": 1.4681, + "step": 2525 + }, + { + "epoch": 0.008707300736771601, + "grad_norm": 0.136174738407135, + "learning_rate": 0.0001, + "loss": 1.3919, + "step": 2526 + }, + { + "epoch": 0.009042196918955124, + "grad_norm": 0.13941626250743866, + "learning_rate": 0.0001, + "loss": 1.5496, + "step": 2527 + }, + { + "epoch": 0.009377093101138647, + "grad_norm": 0.13217207789421082, + "learning_rate": 0.0001, + "loss": 1.4442, + "step": 2528 + }, + { + "epoch": 0.00971198928332217, + "grad_norm": 0.14001114666461945, + "learning_rate": 0.0001, + "loss": 1.5094, + "step": 2529 + }, + { + "epoch": 0.010046885465505693, + "grad_norm": 0.14493252336978912, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 2530 + }, + { + "epoch": 0.010381781647689216, + "grad_norm": 0.1410461664199829, + "learning_rate": 0.0001, + "loss": 1.4883, + "step": 2531 + }, + { + "epoch": 0.010716677829872739, + "grad_norm": 0.13383805751800537, + "learning_rate": 0.0001, + "loss": 1.4484, + "step": 2532 + }, + { + "epoch": 0.011051574012056262, + "grad_norm": 0.1375914365053177, + "learning_rate": 0.0001, + "loss": 1.4361, + "step": 2533 + }, + { + "epoch": 0.011386470194239785, + "grad_norm": 0.13821835815906525, + "learning_rate": 0.0001, + "loss": 1.4563, + "step": 2534 + }, + { + "epoch": 0.01172136637642331, + "grad_norm": 0.14385046064853668, + "learning_rate": 0.0001, + "loss": 1.5055, + "step": 2535 + }, + { + "epoch": 0.012056262558606833, + "grad_norm": 0.13575609028339386, + "learning_rate": 0.0001, + "loss": 1.4273, + "step": 2536 + }, + { + "epoch": 0.012391158740790355, + "grad_norm": 0.14160922169685364, + "learning_rate": 0.0001, + "loss": 1.4898, + "step": 2537 + }, + { + "epoch": 0.012726054922973878, + "grad_norm": 0.1364385187625885, + "learning_rate": 0.0001, + "loss": 1.3803, + "step": 2538 + }, + { + "epoch": 0.013060951105157401, + "grad_norm": 0.14626376330852509, + "learning_rate": 0.0001, + "loss": 1.5782, + "step": 2539 + }, + { + "epoch": 0.013395847287340924, + "grad_norm": 0.140034481883049, + "learning_rate": 0.0001, + "loss": 1.4472, + "step": 2540 + }, + { + "epoch": 0.013730743469524447, + "grad_norm": 0.13497884571552277, + "learning_rate": 0.0001, + "loss": 1.4742, + "step": 2541 + }, + { + "epoch": 0.01406563965170797, + "grad_norm": 0.13669952750205994, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 2542 + }, + { + "epoch": 0.014400535833891493, + "grad_norm": 0.14313064515590668, + "learning_rate": 0.0001, + "loss": 1.5265, + "step": 2543 + }, + { + "epoch": 0.014735432016075016, + "grad_norm": 0.14192776381969452, + "learning_rate": 0.0001, + "loss": 1.5202, + "step": 2544 + }, + { + "epoch": 0.01507032819825854, + "grad_norm": 0.13427315652370453, + "learning_rate": 0.0001, + "loss": 1.4417, + "step": 2545 + }, + { + "epoch": 0.015405224380442064, + "grad_norm": 0.13986799120903015, + "learning_rate": 0.0001, + "loss": 1.5255, + "step": 2546 + }, + { + "epoch": 0.015740120562625585, + "grad_norm": 0.13709712028503418, + "learning_rate": 0.0001, + "loss": 1.4431, + "step": 2547 + }, + { + "epoch": 0.016075016744809108, + "grad_norm": 0.13961805403232574, + "learning_rate": 0.0001, + "loss": 1.486, + "step": 2548 + }, + { + "epoch": 0.01640991292699263, + "grad_norm": 0.1424587219953537, + "learning_rate": 0.0001, + "loss": 1.5153, + "step": 2549 + }, + { + "epoch": 0.016744809109176157, + "grad_norm": 0.141858771443367, + "learning_rate": 0.0001, + "loss": 1.4655, + "step": 2550 + }, + { + "epoch": 0.01707970529135968, + "grad_norm": 0.1496685892343521, + "learning_rate": 0.0001, + "loss": 1.4828, + "step": 2551 + }, + { + "epoch": 0.017414601473543203, + "grad_norm": 0.14911946654319763, + "learning_rate": 0.0001, + "loss": 1.5248, + "step": 2552 + }, + { + "epoch": 0.017749497655726726, + "grad_norm": 0.1409626305103302, + "learning_rate": 0.0001, + "loss": 1.4795, + "step": 2553 + }, + { + "epoch": 0.01808439383791025, + "grad_norm": 0.1455659717321396, + "learning_rate": 0.0001, + "loss": 1.5117, + "step": 2554 + }, + { + "epoch": 0.018419290020093772, + "grad_norm": 0.1373487412929535, + "learning_rate": 0.0001, + "loss": 1.4457, + "step": 2555 + }, + { + "epoch": 0.018754186202277295, + "grad_norm": 0.1419982612133026, + "learning_rate": 0.0001, + "loss": 1.4763, + "step": 2556 + }, + { + "epoch": 0.019089082384460818, + "grad_norm": 0.13673114776611328, + "learning_rate": 0.0001, + "loss": 1.4535, + "step": 2557 + }, + { + "epoch": 0.01942397856664434, + "grad_norm": 0.1524597704410553, + "learning_rate": 0.0001, + "loss": 1.5723, + "step": 2558 + }, + { + "epoch": 0.019758874748827863, + "grad_norm": 0.14004305005073547, + "learning_rate": 0.0001, + "loss": 1.5207, + "step": 2559 + }, + { + "epoch": 0.020093770931011386, + "grad_norm": 0.13520407676696777, + "learning_rate": 0.0001, + "loss": 1.4977, + "step": 2560 + }, + { + "epoch": 0.02042866711319491, + "grad_norm": 0.14743876457214355, + "learning_rate": 0.0001, + "loss": 1.4901, + "step": 2561 + }, + { + "epoch": 0.020763563295378432, + "grad_norm": 0.1442539095878601, + "learning_rate": 0.0001, + "loss": 1.5069, + "step": 2562 + }, + { + "epoch": 0.021098459477561955, + "grad_norm": 0.14434701204299927, + "learning_rate": 0.0001, + "loss": 1.5274, + "step": 2563 + }, + { + "epoch": 0.021433355659745478, + "grad_norm": 0.1463836133480072, + "learning_rate": 0.0001, + "loss": 1.5482, + "step": 2564 + }, + { + "epoch": 0.021768251841929, + "grad_norm": 0.1377059519290924, + "learning_rate": 0.0001, + "loss": 1.4227, + "step": 2565 + }, + { + "epoch": 0.022103148024112524, + "grad_norm": 0.13638120889663696, + "learning_rate": 0.0001, + "loss": 1.4764, + "step": 2566 + }, + { + "epoch": 0.022438044206296047, + "grad_norm": 0.15557897090911865, + "learning_rate": 0.0001, + "loss": 1.5515, + "step": 2567 + }, + { + "epoch": 0.02277294038847957, + "grad_norm": 0.14417023956775665, + "learning_rate": 0.0001, + "loss": 1.636, + "step": 2568 + }, + { + "epoch": 0.023107836570663093, + "grad_norm": 0.1372545212507248, + "learning_rate": 0.0001, + "loss": 1.4169, + "step": 2569 + }, + { + "epoch": 0.02344273275284662, + "grad_norm": 0.1555853635072708, + "learning_rate": 0.0001, + "loss": 1.4845, + "step": 2570 + }, + { + "epoch": 0.023777628935030142, + "grad_norm": 0.15075907111167908, + "learning_rate": 0.0001, + "loss": 1.4158, + "step": 2571 + }, + { + "epoch": 0.024112525117213665, + "grad_norm": 0.132953479886055, + "learning_rate": 0.0001, + "loss": 1.3884, + "step": 2572 + }, + { + "epoch": 0.024447421299397188, + "grad_norm": 0.1466650664806366, + "learning_rate": 0.0001, + "loss": 1.4772, + "step": 2573 + }, + { + "epoch": 0.02478231748158071, + "grad_norm": 0.1482735276222229, + "learning_rate": 0.0001, + "loss": 1.5177, + "step": 2574 + }, + { + "epoch": 0.025117213663764234, + "grad_norm": 0.13384978473186493, + "learning_rate": 0.0001, + "loss": 1.4631, + "step": 2575 + }, + { + "epoch": 0.025452109845947757, + "grad_norm": 0.14189793169498444, + "learning_rate": 0.0001, + "loss": 1.4961, + "step": 2576 + }, + { + "epoch": 0.02578700602813128, + "grad_norm": 0.1458515226840973, + "learning_rate": 0.0001, + "loss": 1.4043, + "step": 2577 + }, + { + "epoch": 0.026121902210314803, + "grad_norm": 0.1455136090517044, + "learning_rate": 0.0001, + "loss": 1.4856, + "step": 2578 + }, + { + "epoch": 0.026456798392498326, + "grad_norm": 0.1397375911474228, + "learning_rate": 0.0001, + "loss": 1.4641, + "step": 2579 + }, + { + "epoch": 0.02679169457468185, + "grad_norm": 0.15315301716327667, + "learning_rate": 0.0001, + "loss": 1.5293, + "step": 2580 + }, + { + "epoch": 0.02712659075686537, + "grad_norm": 0.14230938255786896, + "learning_rate": 0.0001, + "loss": 1.5069, + "step": 2581 + }, + { + "epoch": 0.027461486939048894, + "grad_norm": 0.1534007042646408, + "learning_rate": 0.0001, + "loss": 1.4823, + "step": 2582 + }, + { + "epoch": 0.027796383121232417, + "grad_norm": 0.1446549892425537, + "learning_rate": 0.0001, + "loss": 1.5113, + "step": 2583 + }, + { + "epoch": 0.02813127930341594, + "grad_norm": 0.14052511751651764, + "learning_rate": 0.0001, + "loss": 1.4786, + "step": 2584 + }, + { + "epoch": 0.028466175485599463, + "grad_norm": 0.1368207037448883, + "learning_rate": 0.0001, + "loss": 1.4235, + "step": 2585 + }, + { + "epoch": 0.028801071667782986, + "grad_norm": 0.15163308382034302, + "learning_rate": 0.0001, + "loss": 1.5705, + "step": 2586 + }, + { + "epoch": 0.02913596784996651, + "grad_norm": 0.13964402675628662, + "learning_rate": 0.0001, + "loss": 1.4089, + "step": 2587 + }, + { + "epoch": 0.029470864032150032, + "grad_norm": 0.13717153668403625, + "learning_rate": 0.0001, + "loss": 1.4685, + "step": 2588 + }, + { + "epoch": 0.029805760214333555, + "grad_norm": 0.15048347413539886, + "learning_rate": 0.0001, + "loss": 1.5594, + "step": 2589 + }, + { + "epoch": 0.03014065639651708, + "grad_norm": 0.14722289144992828, + "learning_rate": 0.0001, + "loss": 1.4536, + "step": 2590 + }, + { + "epoch": 0.030475552578700604, + "grad_norm": 0.14477623999118805, + "learning_rate": 0.0001, + "loss": 1.4998, + "step": 2591 + }, + { + "epoch": 0.030810448760884127, + "grad_norm": 0.14907695353031158, + "learning_rate": 0.0001, + "loss": 1.5043, + "step": 2592 + }, + { + "epoch": 0.03114534494306765, + "grad_norm": 0.14083077013492584, + "learning_rate": 0.0001, + "loss": 1.4757, + "step": 2593 + }, + { + "epoch": 0.03148024112525117, + "grad_norm": 0.13702519237995148, + "learning_rate": 0.0001, + "loss": 1.4473, + "step": 2594 + }, + { + "epoch": 0.031815137307434696, + "grad_norm": 0.14483878016471863, + "learning_rate": 0.0001, + "loss": 1.5523, + "step": 2595 + }, + { + "epoch": 0.032150033489618215, + "grad_norm": 0.14438624680042267, + "learning_rate": 0.0001, + "loss": 1.5179, + "step": 2596 + }, + { + "epoch": 0.03248492967180174, + "grad_norm": 0.13237686455249786, + "learning_rate": 0.0001, + "loss": 1.4097, + "step": 2597 + }, + { + "epoch": 0.03281982585398526, + "grad_norm": 0.1421421617269516, + "learning_rate": 0.0001, + "loss": 1.4931, + "step": 2598 + }, + { + "epoch": 0.03315472203616879, + "grad_norm": 0.1436019092798233, + "learning_rate": 0.0001, + "loss": 1.493, + "step": 2599 + }, + { + "epoch": 0.033489618218352314, + "grad_norm": 0.14162643253803253, + "learning_rate": 0.0001, + "loss": 1.4743, + "step": 2600 + }, + { + "epoch": 0.033824514400535834, + "grad_norm": 0.13866618275642395, + "learning_rate": 0.0001, + "loss": 1.4849, + "step": 2601 + }, + { + "epoch": 0.03415941058271936, + "grad_norm": 0.1426554024219513, + "learning_rate": 0.0001, + "loss": 1.5222, + "step": 2602 + }, + { + "epoch": 0.03449430676490288, + "grad_norm": 0.14943377673625946, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 2603 + }, + { + "epoch": 0.034829202947086406, + "grad_norm": 0.14554472267627716, + "learning_rate": 0.0001, + "loss": 1.5683, + "step": 2604 + }, + { + "epoch": 0.035164099129269925, + "grad_norm": 0.15278968214988708, + "learning_rate": 0.0001, + "loss": 1.5867, + "step": 2605 + }, + { + "epoch": 0.03549899531145345, + "grad_norm": 0.1403978019952774, + "learning_rate": 0.0001, + "loss": 1.4167, + "step": 2606 + }, + { + "epoch": 0.03583389149363697, + "grad_norm": 0.14401455223560333, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 2607 + }, + { + "epoch": 0.0361687876758205, + "grad_norm": 0.14237728714942932, + "learning_rate": 0.0001, + "loss": 1.4703, + "step": 2608 + }, + { + "epoch": 0.03650368385800402, + "grad_norm": 0.15331515669822693, + "learning_rate": 0.0001, + "loss": 1.4727, + "step": 2609 + }, + { + "epoch": 0.036838580040187544, + "grad_norm": 0.13513866066932678, + "learning_rate": 0.0001, + "loss": 1.4841, + "step": 2610 + }, + { + "epoch": 0.03717347622237106, + "grad_norm": 0.1455763429403305, + "learning_rate": 0.0001, + "loss": 1.5234, + "step": 2611 + }, + { + "epoch": 0.03750837240455459, + "grad_norm": 0.15021777153015137, + "learning_rate": 0.0001, + "loss": 1.4737, + "step": 2612 + }, + { + "epoch": 0.03784326858673811, + "grad_norm": 0.1495695263147354, + "learning_rate": 0.0001, + "loss": 1.4734, + "step": 2613 + }, + { + "epoch": 0.038178164768921635, + "grad_norm": 0.14363637566566467, + "learning_rate": 0.0001, + "loss": 1.5417, + "step": 2614 + }, + { + "epoch": 0.038513060951105155, + "grad_norm": 0.14860506355762482, + "learning_rate": 0.0001, + "loss": 1.4967, + "step": 2615 + }, + { + "epoch": 0.03884795713328868, + "grad_norm": 0.138936385512352, + "learning_rate": 0.0001, + "loss": 1.5451, + "step": 2616 + }, + { + "epoch": 0.0391828533154722, + "grad_norm": 0.13959284126758575, + "learning_rate": 0.0001, + "loss": 1.4186, + "step": 2617 + }, + { + "epoch": 0.03951774949765573, + "grad_norm": 0.13905645906925201, + "learning_rate": 0.0001, + "loss": 1.4981, + "step": 2618 + }, + { + "epoch": 0.039852645679839246, + "grad_norm": 0.13835424184799194, + "learning_rate": 0.0001, + "loss": 1.4646, + "step": 2619 + }, + { + "epoch": 0.04018754186202277, + "grad_norm": 0.1373290866613388, + "learning_rate": 0.0001, + "loss": 1.461, + "step": 2620 + }, + { + "epoch": 0.0405224380442063, + "grad_norm": 0.1450372189283371, + "learning_rate": 0.0001, + "loss": 1.4827, + "step": 2621 + }, + { + "epoch": 0.04085733422638982, + "grad_norm": 0.1416875571012497, + "learning_rate": 0.0001, + "loss": 1.4285, + "step": 2622 + }, + { + "epoch": 0.041192230408573345, + "grad_norm": 0.14558644592761993, + "learning_rate": 0.0001, + "loss": 1.4835, + "step": 2623 + }, + { + "epoch": 0.041527126590756865, + "grad_norm": 0.14103256165981293, + "learning_rate": 0.0001, + "loss": 1.4838, + "step": 2624 + }, + { + "epoch": 0.04186202277294039, + "grad_norm": 0.13511677086353302, + "learning_rate": 0.0001, + "loss": 1.4154, + "step": 2625 + }, + { + "epoch": 0.04219691895512391, + "grad_norm": 0.1407540738582611, + "learning_rate": 0.0001, + "loss": 1.3846, + "step": 2626 + }, + { + "epoch": 0.04253181513730744, + "grad_norm": 0.13562853634357452, + "learning_rate": 0.0001, + "loss": 1.3934, + "step": 2627 + }, + { + "epoch": 0.042866711319490956, + "grad_norm": 0.14359675347805023, + "learning_rate": 0.0001, + "loss": 1.5502, + "step": 2628 + }, + { + "epoch": 0.04320160750167448, + "grad_norm": 0.14287690818309784, + "learning_rate": 0.0001, + "loss": 1.5875, + "step": 2629 + }, + { + "epoch": 0.043536503683858, + "grad_norm": 0.14544172585010529, + "learning_rate": 0.0001, + "loss": 1.5727, + "step": 2630 + }, + { + "epoch": 0.04387139986604153, + "grad_norm": 0.13922414183616638, + "learning_rate": 0.0001, + "loss": 1.3776, + "step": 2631 + }, + { + "epoch": 0.04420629604822505, + "grad_norm": 0.13899439573287964, + "learning_rate": 0.0001, + "loss": 1.4755, + "step": 2632 + }, + { + "epoch": 0.044541192230408574, + "grad_norm": 0.14173531532287598, + "learning_rate": 0.0001, + "loss": 1.4999, + "step": 2633 + }, + { + "epoch": 0.044876088412592094, + "grad_norm": 0.13438910245895386, + "learning_rate": 0.0001, + "loss": 1.4053, + "step": 2634 + }, + { + "epoch": 0.04521098459477562, + "grad_norm": 0.14383186399936676, + "learning_rate": 0.0001, + "loss": 1.515, + "step": 2635 + }, + { + "epoch": 0.04554588077695914, + "grad_norm": 0.1358998566865921, + "learning_rate": 0.0001, + "loss": 1.4713, + "step": 2636 + }, + { + "epoch": 0.045880776959142666, + "grad_norm": 0.13926327228546143, + "learning_rate": 0.0001, + "loss": 1.443, + "step": 2637 + }, + { + "epoch": 0.046215673141326186, + "grad_norm": 0.14442448318004608, + "learning_rate": 0.0001, + "loss": 1.5082, + "step": 2638 + }, + { + "epoch": 0.04655056932350971, + "grad_norm": 0.13538411259651184, + "learning_rate": 0.0001, + "loss": 1.4567, + "step": 2639 + }, + { + "epoch": 0.04688546550569324, + "grad_norm": 0.14301930367946625, + "learning_rate": 0.0001, + "loss": 1.4305, + "step": 2640 + }, + { + "epoch": 0.04722036168787676, + "grad_norm": 0.13956156373023987, + "learning_rate": 0.0001, + "loss": 1.4996, + "step": 2641 + }, + { + "epoch": 0.047555257870060284, + "grad_norm": 0.13752375543117523, + "learning_rate": 0.0001, + "loss": 1.4959, + "step": 2642 + }, + { + "epoch": 0.047890154052243804, + "grad_norm": 0.14346663653850555, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 2643 + }, + { + "epoch": 0.04822505023442733, + "grad_norm": 0.14569582045078278, + "learning_rate": 0.0001, + "loss": 1.4791, + "step": 2644 + }, + { + "epoch": 0.04855994641661085, + "grad_norm": 0.1401805430650711, + "learning_rate": 0.0001, + "loss": 1.446, + "step": 2645 + }, + { + "epoch": 0.048894842598794376, + "grad_norm": 0.14419414103031158, + "learning_rate": 0.0001, + "loss": 1.4684, + "step": 2646 + }, + { + "epoch": 0.049229738780977896, + "grad_norm": 0.15199334919452667, + "learning_rate": 0.0001, + "loss": 1.4544, + "step": 2647 + }, + { + "epoch": 0.04956463496316142, + "grad_norm": 0.13346783816814423, + "learning_rate": 0.0001, + "loss": 1.4106, + "step": 2648 + }, + { + "epoch": 0.04989953114534494, + "grad_norm": 0.14965283870697021, + "learning_rate": 0.0001, + "loss": 1.4939, + "step": 2649 + }, + { + "epoch": 0.05023442732752847, + "grad_norm": 0.15457184612751007, + "learning_rate": 0.0001, + "loss": 1.5083, + "step": 2650 + }, + { + "epoch": 0.05056932350971199, + "grad_norm": 0.13663454353809357, + "learning_rate": 0.0001, + "loss": 1.5019, + "step": 2651 + }, + { + "epoch": 0.050904219691895514, + "grad_norm": 0.14018338918685913, + "learning_rate": 0.0001, + "loss": 1.4765, + "step": 2652 + }, + { + "epoch": 0.05123911587407903, + "grad_norm": 0.14040859043598175, + "learning_rate": 0.0001, + "loss": 1.4675, + "step": 2653 + }, + { + "epoch": 0.05157401205626256, + "grad_norm": 0.14722508192062378, + "learning_rate": 0.0001, + "loss": 1.4704, + "step": 2654 + }, + { + "epoch": 0.05190890823844608, + "grad_norm": 0.1608511507511139, + "learning_rate": 0.0001, + "loss": 1.5832, + "step": 2655 + }, + { + "epoch": 0.052243804420629605, + "grad_norm": 0.14078736305236816, + "learning_rate": 0.0001, + "loss": 1.4143, + "step": 2656 + }, + { + "epoch": 0.052578700602813125, + "grad_norm": 0.14444111287593842, + "learning_rate": 0.0001, + "loss": 1.5433, + "step": 2657 + }, + { + "epoch": 0.05291359678499665, + "grad_norm": 0.1437847912311554, + "learning_rate": 0.0001, + "loss": 1.5246, + "step": 2658 + }, + { + "epoch": 0.05324849296718017, + "grad_norm": 0.13621410727500916, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 2659 + }, + { + "epoch": 0.0535833891493637, + "grad_norm": 0.1466796100139618, + "learning_rate": 0.0001, + "loss": 1.4839, + "step": 2660 + }, + { + "epoch": 0.053918285331547224, + "grad_norm": 0.14355647563934326, + "learning_rate": 0.0001, + "loss": 1.5311, + "step": 2661 + }, + { + "epoch": 0.05425318151373074, + "grad_norm": 0.14155465364456177, + "learning_rate": 0.0001, + "loss": 1.4321, + "step": 2662 + }, + { + "epoch": 0.05458807769591427, + "grad_norm": 0.13682101666927338, + "learning_rate": 0.0001, + "loss": 1.436, + "step": 2663 + }, + { + "epoch": 0.05492297387809779, + "grad_norm": 0.14461514353752136, + "learning_rate": 0.0001, + "loss": 1.491, + "step": 2664 + }, + { + "epoch": 0.055257870060281315, + "grad_norm": 0.14589102566242218, + "learning_rate": 0.0001, + "loss": 1.452, + "step": 2665 + }, + { + "epoch": 0.055592766242464835, + "grad_norm": 0.13621005415916443, + "learning_rate": 0.0001, + "loss": 1.4085, + "step": 2666 + }, + { + "epoch": 0.05592766242464836, + "grad_norm": 0.13911636173725128, + "learning_rate": 0.0001, + "loss": 1.5222, + "step": 2667 + }, + { + "epoch": 0.05626255860683188, + "grad_norm": 0.14434079825878143, + "learning_rate": 0.0001, + "loss": 1.5304, + "step": 2668 + }, + { + "epoch": 0.05659745478901541, + "grad_norm": 0.1485336571931839, + "learning_rate": 0.0001, + "loss": 1.5097, + "step": 2669 + }, + { + "epoch": 0.056932350971198926, + "grad_norm": 0.14089222252368927, + "learning_rate": 0.0001, + "loss": 1.4742, + "step": 2670 + }, + { + "epoch": 0.05726724715338245, + "grad_norm": 0.1406714767217636, + "learning_rate": 0.0001, + "loss": 1.4688, + "step": 2671 + }, + { + "epoch": 0.05760214333556597, + "grad_norm": 0.13756093382835388, + "learning_rate": 0.0001, + "loss": 1.4576, + "step": 2672 + }, + { + "epoch": 0.0579370395177495, + "grad_norm": 0.13884751498699188, + "learning_rate": 0.0001, + "loss": 1.5325, + "step": 2673 + }, + { + "epoch": 0.05827193569993302, + "grad_norm": 0.14231450855731964, + "learning_rate": 0.0001, + "loss": 1.4978, + "step": 2674 + }, + { + "epoch": 0.058606831882116545, + "grad_norm": 0.14200443029403687, + "learning_rate": 0.0001, + "loss": 1.5245, + "step": 2675 + }, + { + "epoch": 0.058941728064300064, + "grad_norm": 0.1436898410320282, + "learning_rate": 0.0001, + "loss": 1.4993, + "step": 2676 + }, + { + "epoch": 0.05927662424648359, + "grad_norm": 0.1421404331922531, + "learning_rate": 0.0001, + "loss": 1.4759, + "step": 2677 + }, + { + "epoch": 0.05961152042866711, + "grad_norm": 0.14410701394081116, + "learning_rate": 0.0001, + "loss": 1.4643, + "step": 2678 + }, + { + "epoch": 0.059946416610850636, + "grad_norm": 0.14235490560531616, + "learning_rate": 0.0001, + "loss": 1.5351, + "step": 2679 + }, + { + "epoch": 0.06028131279303416, + "grad_norm": 0.1442299336194992, + "learning_rate": 0.0001, + "loss": 1.5266, + "step": 2680 + }, + { + "epoch": 0.06061620897521768, + "grad_norm": 0.1418464481830597, + "learning_rate": 0.0001, + "loss": 1.5767, + "step": 2681 + }, + { + "epoch": 0.06095110515740121, + "grad_norm": 0.14591242372989655, + "learning_rate": 0.0001, + "loss": 1.4796, + "step": 2682 + }, + { + "epoch": 0.06128600133958473, + "grad_norm": 0.14850719273090363, + "learning_rate": 0.0001, + "loss": 1.4836, + "step": 2683 + }, + { + "epoch": 0.061620897521768254, + "grad_norm": 0.14169856905937195, + "learning_rate": 0.0001, + "loss": 1.4769, + "step": 2684 + }, + { + "epoch": 0.061955793703951774, + "grad_norm": 0.14849288761615753, + "learning_rate": 0.0001, + "loss": 1.5431, + "step": 2685 + }, + { + "epoch": 0.0622906898861353, + "grad_norm": 0.14057648181915283, + "learning_rate": 0.0001, + "loss": 1.4802, + "step": 2686 + }, + { + "epoch": 0.06262558606831882, + "grad_norm": 0.15280254185199738, + "learning_rate": 0.0001, + "loss": 1.4881, + "step": 2687 + }, + { + "epoch": 0.06296048225050234, + "grad_norm": 0.13690420985221863, + "learning_rate": 0.0001, + "loss": 1.374, + "step": 2688 + }, + { + "epoch": 0.06329537843268587, + "grad_norm": 0.14229778945446014, + "learning_rate": 0.0001, + "loss": 1.4981, + "step": 2689 + }, + { + "epoch": 0.06363027461486939, + "grad_norm": 0.14442794024944305, + "learning_rate": 0.0001, + "loss": 1.4799, + "step": 2690 + }, + { + "epoch": 0.06396517079705291, + "grad_norm": 0.15053285658359528, + "learning_rate": 0.0001, + "loss": 1.566, + "step": 2691 + }, + { + "epoch": 0.06430006697923643, + "grad_norm": 0.1395815759897232, + "learning_rate": 0.0001, + "loss": 1.4605, + "step": 2692 + }, + { + "epoch": 0.06463496316141996, + "grad_norm": 0.15144221484661102, + "learning_rate": 0.0001, + "loss": 1.4642, + "step": 2693 + }, + { + "epoch": 0.06496985934360348, + "grad_norm": 0.14683520793914795, + "learning_rate": 0.0001, + "loss": 1.4647, + "step": 2694 + }, + { + "epoch": 0.065304755525787, + "grad_norm": 0.1422366201877594, + "learning_rate": 0.0001, + "loss": 1.5096, + "step": 2695 + }, + { + "epoch": 0.06563965170797052, + "grad_norm": 0.14606067538261414, + "learning_rate": 0.0001, + "loss": 1.5396, + "step": 2696 + }, + { + "epoch": 0.06597454789015406, + "grad_norm": 0.13931868970394135, + "learning_rate": 0.0001, + "loss": 1.4693, + "step": 2697 + }, + { + "epoch": 0.06630944407233758, + "grad_norm": 0.14257977902889252, + "learning_rate": 0.0001, + "loss": 1.5156, + "step": 2698 + }, + { + "epoch": 0.0666443402545211, + "grad_norm": 0.14814600348472595, + "learning_rate": 0.0001, + "loss": 1.5187, + "step": 2699 + }, + { + "epoch": 0.06697923643670463, + "grad_norm": 0.14432038366794586, + "learning_rate": 0.0001, + "loss": 1.4105, + "step": 2700 + }, + { + "epoch": 0.06731413261888815, + "grad_norm": 0.14682446420192719, + "learning_rate": 0.0001, + "loss": 1.5516, + "step": 2701 + }, + { + "epoch": 0.06764902880107167, + "grad_norm": 0.14887043833732605, + "learning_rate": 0.0001, + "loss": 1.5903, + "step": 2702 + }, + { + "epoch": 0.06798392498325519, + "grad_norm": 0.14733371138572693, + "learning_rate": 0.0001, + "loss": 1.5192, + "step": 2703 + }, + { + "epoch": 0.06831882116543872, + "grad_norm": 0.14175830781459808, + "learning_rate": 0.0001, + "loss": 1.468, + "step": 2704 + }, + { + "epoch": 0.06865371734762224, + "grad_norm": 0.13812625408172607, + "learning_rate": 0.0001, + "loss": 1.4148, + "step": 2705 + }, + { + "epoch": 0.06898861352980576, + "grad_norm": 0.14035607874393463, + "learning_rate": 0.0001, + "loss": 1.4531, + "step": 2706 + }, + { + "epoch": 0.06932350971198928, + "grad_norm": 0.14815551042556763, + "learning_rate": 0.0001, + "loss": 1.5359, + "step": 2707 + }, + { + "epoch": 0.06965840589417281, + "grad_norm": 0.14825250208377838, + "learning_rate": 0.0001, + "loss": 1.4737, + "step": 2708 + }, + { + "epoch": 0.06999330207635633, + "grad_norm": 0.14412347972393036, + "learning_rate": 0.0001, + "loss": 1.4155, + "step": 2709 + }, + { + "epoch": 0.07032819825853985, + "grad_norm": 0.14373992383480072, + "learning_rate": 0.0001, + "loss": 1.5142, + "step": 2710 + }, + { + "epoch": 0.07066309444072337, + "grad_norm": 0.14845426380634308, + "learning_rate": 0.0001, + "loss": 1.6058, + "step": 2711 + }, + { + "epoch": 0.0709979906229069, + "grad_norm": 0.15275675058364868, + "learning_rate": 0.0001, + "loss": 1.5081, + "step": 2712 + }, + { + "epoch": 0.07133288680509042, + "grad_norm": 0.1481783539056778, + "learning_rate": 0.0001, + "loss": 1.5469, + "step": 2713 + }, + { + "epoch": 0.07166778298727394, + "grad_norm": 0.14404213428497314, + "learning_rate": 0.0001, + "loss": 1.6133, + "step": 2714 + }, + { + "epoch": 0.07200267916945746, + "grad_norm": 0.138307586312294, + "learning_rate": 0.0001, + "loss": 1.5124, + "step": 2715 + }, + { + "epoch": 0.072337575351641, + "grad_norm": 0.14422239363193512, + "learning_rate": 0.0001, + "loss": 1.4536, + "step": 2716 + }, + { + "epoch": 0.07267247153382451, + "grad_norm": 0.14184726774692535, + "learning_rate": 0.0001, + "loss": 1.4547, + "step": 2717 + }, + { + "epoch": 0.07300736771600803, + "grad_norm": 0.1343645304441452, + "learning_rate": 0.0001, + "loss": 1.4741, + "step": 2718 + }, + { + "epoch": 0.07334226389819157, + "grad_norm": 0.14284183084964752, + "learning_rate": 0.0001, + "loss": 1.5052, + "step": 2719 + }, + { + "epoch": 0.07367716008037509, + "grad_norm": 0.14086835086345673, + "learning_rate": 0.0001, + "loss": 1.5035, + "step": 2720 + }, + { + "epoch": 0.0740120562625586, + "grad_norm": 0.146831676363945, + "learning_rate": 0.0001, + "loss": 1.4494, + "step": 2721 + }, + { + "epoch": 0.07434695244474213, + "grad_norm": 0.14096811413764954, + "learning_rate": 0.0001, + "loss": 1.3873, + "step": 2722 + }, + { + "epoch": 0.07468184862692566, + "grad_norm": 0.1556435525417328, + "learning_rate": 0.0001, + "loss": 1.5451, + "step": 2723 + }, + { + "epoch": 0.07501674480910918, + "grad_norm": 0.1460019052028656, + "learning_rate": 0.0001, + "loss": 1.573, + "step": 2724 + }, + { + "epoch": 0.0753516409912927, + "grad_norm": 0.14590851962566376, + "learning_rate": 0.0001, + "loss": 1.4472, + "step": 2725 + }, + { + "epoch": 0.07568653717347622, + "grad_norm": 0.14320936799049377, + "learning_rate": 0.0001, + "loss": 1.5049, + "step": 2726 + }, + { + "epoch": 0.07602143335565975, + "grad_norm": 0.15445268154144287, + "learning_rate": 0.0001, + "loss": 1.5577, + "step": 2727 + }, + { + "epoch": 0.07635632953784327, + "grad_norm": 0.1448332965373993, + "learning_rate": 0.0001, + "loss": 1.5076, + "step": 2728 + }, + { + "epoch": 0.07669122572002679, + "grad_norm": 0.1451125591993332, + "learning_rate": 0.0001, + "loss": 1.5353, + "step": 2729 + }, + { + "epoch": 0.07702612190221031, + "grad_norm": 0.1508006453514099, + "learning_rate": 0.0001, + "loss": 1.653, + "step": 2730 + }, + { + "epoch": 0.07736101808439384, + "grad_norm": 0.13818366825580597, + "learning_rate": 0.0001, + "loss": 1.3885, + "step": 2731 + }, + { + "epoch": 0.07769591426657736, + "grad_norm": 0.13172827661037445, + "learning_rate": 0.0001, + "loss": 1.3705, + "step": 2732 + }, + { + "epoch": 0.07803081044876088, + "grad_norm": 0.1504860669374466, + "learning_rate": 0.0001, + "loss": 1.5108, + "step": 2733 + }, + { + "epoch": 0.0783657066309444, + "grad_norm": 0.1418769508600235, + "learning_rate": 0.0001, + "loss": 1.4078, + "step": 2734 + }, + { + "epoch": 0.07870060281312793, + "grad_norm": 0.14419272541999817, + "learning_rate": 0.0001, + "loss": 1.4953, + "step": 2735 + }, + { + "epoch": 0.07903549899531145, + "grad_norm": 0.14663195610046387, + "learning_rate": 0.0001, + "loss": 1.4626, + "step": 2736 + }, + { + "epoch": 0.07937039517749497, + "grad_norm": 0.13987226784229279, + "learning_rate": 0.0001, + "loss": 1.5421, + "step": 2737 + }, + { + "epoch": 0.07970529135967849, + "grad_norm": 0.1439124494791031, + "learning_rate": 0.0001, + "loss": 1.5178, + "step": 2738 + }, + { + "epoch": 0.08004018754186203, + "grad_norm": 0.1464778035879135, + "learning_rate": 0.0001, + "loss": 1.4798, + "step": 2739 + }, + { + "epoch": 0.08037508372404555, + "grad_norm": 0.13677044212818146, + "learning_rate": 0.0001, + "loss": 1.449, + "step": 2740 + }, + { + "epoch": 0.08070997990622907, + "grad_norm": 0.1404646635055542, + "learning_rate": 0.0001, + "loss": 1.5006, + "step": 2741 + }, + { + "epoch": 0.0810448760884126, + "grad_norm": 0.14357681572437286, + "learning_rate": 0.0001, + "loss": 1.5219, + "step": 2742 + }, + { + "epoch": 0.08137977227059612, + "grad_norm": 0.14373867213726044, + "learning_rate": 0.0001, + "loss": 1.4428, + "step": 2743 + }, + { + "epoch": 0.08171466845277964, + "grad_norm": 0.1410626769065857, + "learning_rate": 0.0001, + "loss": 1.4882, + "step": 2744 + }, + { + "epoch": 0.08204956463496316, + "grad_norm": 0.13724640011787415, + "learning_rate": 0.0001, + "loss": 1.4109, + "step": 2745 + }, + { + "epoch": 0.08238446081714669, + "grad_norm": 0.1481046825647354, + "learning_rate": 0.0001, + "loss": 1.614, + "step": 2746 + }, + { + "epoch": 0.08271935699933021, + "grad_norm": 0.1424095779657364, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 2747 + }, + { + "epoch": 0.08305425318151373, + "grad_norm": 0.14285503327846527, + "learning_rate": 0.0001, + "loss": 1.3974, + "step": 2748 + }, + { + "epoch": 0.08338914936369725, + "grad_norm": 0.1423054337501526, + "learning_rate": 0.0001, + "loss": 1.5738, + "step": 2749 + }, + { + "epoch": 0.08372404554588078, + "grad_norm": 0.14483210444450378, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 2750 + }, + { + "epoch": 0.0840589417280643, + "grad_norm": 0.14319851994514465, + "learning_rate": 0.0001, + "loss": 1.4642, + "step": 2751 + }, + { + "epoch": 0.08439383791024782, + "grad_norm": 0.13955651223659515, + "learning_rate": 0.0001, + "loss": 1.3657, + "step": 2752 + }, + { + "epoch": 0.08472873409243134, + "grad_norm": 0.1478879302740097, + "learning_rate": 0.0001, + "loss": 1.6474, + "step": 2753 + }, + { + "epoch": 0.08506363027461487, + "grad_norm": 0.14244432747364044, + "learning_rate": 0.0001, + "loss": 1.5459, + "step": 2754 + }, + { + "epoch": 0.0853985264567984, + "grad_norm": 0.14115175604820251, + "learning_rate": 0.0001, + "loss": 1.4614, + "step": 2755 + }, + { + "epoch": 0.08573342263898191, + "grad_norm": 0.13785597681999207, + "learning_rate": 0.0001, + "loss": 1.4257, + "step": 2756 + }, + { + "epoch": 0.08606831882116543, + "grad_norm": 0.14487232267856598, + "learning_rate": 0.0001, + "loss": 1.4983, + "step": 2757 + }, + { + "epoch": 0.08640321500334897, + "grad_norm": 0.14556938409805298, + "learning_rate": 0.0001, + "loss": 1.4263, + "step": 2758 + }, + { + "epoch": 0.08673811118553248, + "grad_norm": 0.1452976018190384, + "learning_rate": 0.0001, + "loss": 1.5328, + "step": 2759 + }, + { + "epoch": 0.087073007367716, + "grad_norm": 0.14344626665115356, + "learning_rate": 0.0001, + "loss": 1.4995, + "step": 2760 + }, + { + "epoch": 0.08740790354989954, + "grad_norm": 0.14486606419086456, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 2761 + }, + { + "epoch": 0.08774279973208306, + "grad_norm": 0.1462741494178772, + "learning_rate": 0.0001, + "loss": 1.4929, + "step": 2762 + }, + { + "epoch": 0.08807769591426658, + "grad_norm": 0.14603829383850098, + "learning_rate": 0.0001, + "loss": 1.5696, + "step": 2763 + }, + { + "epoch": 0.0884125920964501, + "grad_norm": 0.14310294389724731, + "learning_rate": 0.0001, + "loss": 1.4989, + "step": 2764 + }, + { + "epoch": 0.08874748827863363, + "grad_norm": 0.14336185157299042, + "learning_rate": 0.0001, + "loss": 1.4546, + "step": 2765 + }, + { + "epoch": 0.08908238446081715, + "grad_norm": 0.15111830830574036, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 2766 + }, + { + "epoch": 0.08941728064300067, + "grad_norm": 0.1415042281150818, + "learning_rate": 0.0001, + "loss": 1.4944, + "step": 2767 + }, + { + "epoch": 0.08975217682518419, + "grad_norm": 0.14492426812648773, + "learning_rate": 0.0001, + "loss": 1.4775, + "step": 2768 + }, + { + "epoch": 0.09008707300736772, + "grad_norm": 0.14594174921512604, + "learning_rate": 0.0001, + "loss": 1.4341, + "step": 2769 + }, + { + "epoch": 0.09042196918955124, + "grad_norm": 0.1443568468093872, + "learning_rate": 0.0001, + "loss": 1.4261, + "step": 2770 + }, + { + "epoch": 0.09075686537173476, + "grad_norm": 0.14747354388237, + "learning_rate": 0.0001, + "loss": 1.4934, + "step": 2771 + }, + { + "epoch": 0.09109176155391828, + "grad_norm": 0.14336217939853668, + "learning_rate": 0.0001, + "loss": 1.5243, + "step": 2772 + }, + { + "epoch": 0.09142665773610181, + "grad_norm": 0.1489473432302475, + "learning_rate": 0.0001, + "loss": 1.4997, + "step": 2773 + }, + { + "epoch": 0.09176155391828533, + "grad_norm": 0.140666201710701, + "learning_rate": 0.0001, + "loss": 1.4384, + "step": 2774 + }, + { + "epoch": 0.09209645010046885, + "grad_norm": 0.14660325646400452, + "learning_rate": 0.0001, + "loss": 1.4997, + "step": 2775 + }, + { + "epoch": 0.09243134628265237, + "grad_norm": 0.14790892601013184, + "learning_rate": 0.0001, + "loss": 1.4342, + "step": 2776 + }, + { + "epoch": 0.0927662424648359, + "grad_norm": 0.14560778439044952, + "learning_rate": 0.0001, + "loss": 1.5703, + "step": 2777 + }, + { + "epoch": 0.09310113864701942, + "grad_norm": 0.14003904163837433, + "learning_rate": 0.0001, + "loss": 1.5091, + "step": 2778 + }, + { + "epoch": 0.09343603482920294, + "grad_norm": 0.14353299140930176, + "learning_rate": 0.0001, + "loss": 1.5347, + "step": 2779 + }, + { + "epoch": 0.09377093101138648, + "grad_norm": 0.14216402173042297, + "learning_rate": 0.0001, + "loss": 1.4911, + "step": 2780 + }, + { + "epoch": 0.09410582719357, + "grad_norm": 0.14118923246860504, + "learning_rate": 0.0001, + "loss": 1.454, + "step": 2781 + }, + { + "epoch": 0.09444072337575352, + "grad_norm": 0.1451854258775711, + "learning_rate": 0.0001, + "loss": 1.5258, + "step": 2782 + }, + { + "epoch": 0.09477561955793704, + "grad_norm": 0.1455250382423401, + "learning_rate": 0.0001, + "loss": 1.508, + "step": 2783 + }, + { + "epoch": 0.09511051574012057, + "grad_norm": 0.1398870199918747, + "learning_rate": 0.0001, + "loss": 1.4157, + "step": 2784 + }, + { + "epoch": 0.09544541192230409, + "grad_norm": 0.1541321873664856, + "learning_rate": 0.0001, + "loss": 1.5076, + "step": 2785 + }, + { + "epoch": 0.09578030810448761, + "grad_norm": 0.14037346839904785, + "learning_rate": 0.0001, + "loss": 1.4294, + "step": 2786 + }, + { + "epoch": 0.09611520428667113, + "grad_norm": 0.15060150623321533, + "learning_rate": 0.0001, + "loss": 1.4742, + "step": 2787 + }, + { + "epoch": 0.09645010046885466, + "grad_norm": 0.1445634961128235, + "learning_rate": 0.0001, + "loss": 1.4149, + "step": 2788 + }, + { + "epoch": 0.09678499665103818, + "grad_norm": 0.1478995382785797, + "learning_rate": 0.0001, + "loss": 1.5271, + "step": 2789 + }, + { + "epoch": 0.0971198928332217, + "grad_norm": 0.1673794984817505, + "learning_rate": 0.0001, + "loss": 1.5274, + "step": 2790 + }, + { + "epoch": 0.09745478901540522, + "grad_norm": 0.14559200406074524, + "learning_rate": 0.0001, + "loss": 1.4834, + "step": 2791 + }, + { + "epoch": 0.09778968519758875, + "grad_norm": 0.14184166491031647, + "learning_rate": 0.0001, + "loss": 1.5399, + "step": 2792 + }, + { + "epoch": 0.09812458137977227, + "grad_norm": 0.14942292869091034, + "learning_rate": 0.0001, + "loss": 1.3988, + "step": 2793 + }, + { + "epoch": 0.09845947756195579, + "grad_norm": 0.14354535937309265, + "learning_rate": 0.0001, + "loss": 1.4625, + "step": 2794 + }, + { + "epoch": 0.09879437374413931, + "grad_norm": 0.14293980598449707, + "learning_rate": 0.0001, + "loss": 1.5965, + "step": 2795 + }, + { + "epoch": 0.09912926992632284, + "grad_norm": 0.1488644778728485, + "learning_rate": 0.0001, + "loss": 1.4126, + "step": 2796 + }, + { + "epoch": 0.09946416610850636, + "grad_norm": 0.13816925883293152, + "learning_rate": 0.0001, + "loss": 1.4674, + "step": 2797 + }, + { + "epoch": 0.09979906229068988, + "grad_norm": 0.1448117345571518, + "learning_rate": 0.0001, + "loss": 1.5199, + "step": 2798 + }, + { + "epoch": 0.10013395847287342, + "grad_norm": 0.14358732104301453, + "learning_rate": 0.0001, + "loss": 1.3845, + "step": 2799 + }, + { + "epoch": 0.10046885465505694, + "grad_norm": 0.1465434730052948, + "learning_rate": 0.0001, + "loss": 1.4358, + "step": 2800 + }, + { + "epoch": 0.10080375083724046, + "grad_norm": 0.14081235229969025, + "learning_rate": 0.0001, + "loss": 1.4383, + "step": 2801 + }, + { + "epoch": 0.10113864701942397, + "grad_norm": 0.14658094942569733, + "learning_rate": 0.0001, + "loss": 1.4229, + "step": 2802 + }, + { + "epoch": 0.10147354320160751, + "grad_norm": 0.14403977990150452, + "learning_rate": 0.0001, + "loss": 1.5351, + "step": 2803 + }, + { + "epoch": 0.10180843938379103, + "grad_norm": 0.1446755975484848, + "learning_rate": 0.0001, + "loss": 1.4485, + "step": 2804 + }, + { + "epoch": 0.10214333556597455, + "grad_norm": 0.14856703579425812, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 2805 + }, + { + "epoch": 0.10247823174815807, + "grad_norm": 0.14590945839881897, + "learning_rate": 0.0001, + "loss": 1.4851, + "step": 2806 + }, + { + "epoch": 0.1028131279303416, + "grad_norm": 0.14451883733272552, + "learning_rate": 0.0001, + "loss": 1.46, + "step": 2807 + }, + { + "epoch": 0.10314802411252512, + "grad_norm": 0.15517374873161316, + "learning_rate": 0.0001, + "loss": 1.5061, + "step": 2808 + }, + { + "epoch": 0.10348292029470864, + "grad_norm": 0.14735953509807587, + "learning_rate": 0.0001, + "loss": 1.514, + "step": 2809 + }, + { + "epoch": 0.10381781647689216, + "grad_norm": 0.1464541256427765, + "learning_rate": 0.0001, + "loss": 1.5269, + "step": 2810 + }, + { + "epoch": 0.10415271265907569, + "grad_norm": 0.15126463770866394, + "learning_rate": 0.0001, + "loss": 1.411, + "step": 2811 + }, + { + "epoch": 0.10448760884125921, + "grad_norm": 0.1394396275281906, + "learning_rate": 0.0001, + "loss": 1.5222, + "step": 2812 + }, + { + "epoch": 0.10482250502344273, + "grad_norm": 0.14099687337875366, + "learning_rate": 0.0001, + "loss": 1.4384, + "step": 2813 + }, + { + "epoch": 0.10515740120562625, + "grad_norm": 0.15170758962631226, + "learning_rate": 0.0001, + "loss": 1.5148, + "step": 2814 + }, + { + "epoch": 0.10549229738780978, + "grad_norm": 0.14324727654457092, + "learning_rate": 0.0001, + "loss": 1.4224, + "step": 2815 + }, + { + "epoch": 0.1058271935699933, + "grad_norm": 0.1460292786359787, + "learning_rate": 0.0001, + "loss": 1.558, + "step": 2816 + }, + { + "epoch": 0.10616208975217682, + "grad_norm": 0.1621580272912979, + "learning_rate": 0.0001, + "loss": 1.5001, + "step": 2817 + }, + { + "epoch": 0.10649698593436034, + "grad_norm": 0.14594002068042755, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 2818 + }, + { + "epoch": 0.10683188211654387, + "grad_norm": 0.1518278867006302, + "learning_rate": 0.0001, + "loss": 1.5166, + "step": 2819 + }, + { + "epoch": 0.1071667782987274, + "grad_norm": 0.1512691229581833, + "learning_rate": 0.0001, + "loss": 1.3993, + "step": 2820 + }, + { + "epoch": 0.10750167448091091, + "grad_norm": 0.14213557541370392, + "learning_rate": 0.0001, + "loss": 1.3772, + "step": 2821 + }, + { + "epoch": 0.10783657066309445, + "grad_norm": 0.14900755882263184, + "learning_rate": 0.0001, + "loss": 1.5243, + "step": 2822 + }, + { + "epoch": 0.10817146684527797, + "grad_norm": 0.15196503698825836, + "learning_rate": 0.0001, + "loss": 1.5012, + "step": 2823 + }, + { + "epoch": 0.10850636302746149, + "grad_norm": 0.14796164631843567, + "learning_rate": 0.0001, + "loss": 1.4849, + "step": 2824 + }, + { + "epoch": 0.108841259209645, + "grad_norm": 0.15282899141311646, + "learning_rate": 0.0001, + "loss": 1.5008, + "step": 2825 + }, + { + "epoch": 0.10917615539182854, + "grad_norm": 0.14410486817359924, + "learning_rate": 0.0001, + "loss": 1.4907, + "step": 2826 + }, + { + "epoch": 0.10951105157401206, + "grad_norm": 0.14892512559890747, + "learning_rate": 0.0001, + "loss": 1.4086, + "step": 2827 + }, + { + "epoch": 0.10984594775619558, + "grad_norm": 0.14852510392665863, + "learning_rate": 0.0001, + "loss": 1.5008, + "step": 2828 + }, + { + "epoch": 0.1101808439383791, + "grad_norm": 0.14441753923892975, + "learning_rate": 0.0001, + "loss": 1.4408, + "step": 2829 + }, + { + "epoch": 0.11051574012056263, + "grad_norm": 0.14280498027801514, + "learning_rate": 0.0001, + "loss": 1.5208, + "step": 2830 + }, + { + "epoch": 0.11085063630274615, + "grad_norm": 0.15437792241573334, + "learning_rate": 0.0001, + "loss": 1.5327, + "step": 2831 + }, + { + "epoch": 0.11118553248492967, + "grad_norm": 0.1393211930990219, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 2832 + }, + { + "epoch": 0.11152042866711319, + "grad_norm": 0.15891778469085693, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 2833 + }, + { + "epoch": 0.11185532484929672, + "grad_norm": 0.15602029860019684, + "learning_rate": 0.0001, + "loss": 1.4942, + "step": 2834 + }, + { + "epoch": 0.11219022103148024, + "grad_norm": 0.15723031759262085, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 2835 + }, + { + "epoch": 0.11252511721366376, + "grad_norm": 0.1397925466299057, + "learning_rate": 0.0001, + "loss": 1.3486, + "step": 2836 + }, + { + "epoch": 0.11286001339584728, + "grad_norm": 0.16877742111682892, + "learning_rate": 0.0001, + "loss": 1.551, + "step": 2837 + }, + { + "epoch": 0.11319490957803081, + "grad_norm": 0.1419568657875061, + "learning_rate": 0.0001, + "loss": 1.4831, + "step": 2838 + }, + { + "epoch": 0.11352980576021433, + "grad_norm": 0.14331306517124176, + "learning_rate": 0.0001, + "loss": 1.3837, + "step": 2839 + }, + { + "epoch": 0.11386470194239785, + "grad_norm": 0.1620616614818573, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 2840 + }, + { + "epoch": 0.11419959812458139, + "grad_norm": 0.14861570298671722, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 2841 + }, + { + "epoch": 0.1145344943067649, + "grad_norm": 0.15495343506336212, + "learning_rate": 0.0001, + "loss": 1.5645, + "step": 2842 + }, + { + "epoch": 0.11486939048894843, + "grad_norm": 0.15632815659046173, + "learning_rate": 0.0001, + "loss": 1.4557, + "step": 2843 + }, + { + "epoch": 0.11520428667113194, + "grad_norm": 0.14908194541931152, + "learning_rate": 0.0001, + "loss": 1.5085, + "step": 2844 + }, + { + "epoch": 0.11553918285331548, + "grad_norm": 0.14396606385707855, + "learning_rate": 0.0001, + "loss": 1.4502, + "step": 2845 + }, + { + "epoch": 0.115874079035499, + "grad_norm": 0.16483518481254578, + "learning_rate": 0.0001, + "loss": 1.5383, + "step": 2846 + }, + { + "epoch": 0.11620897521768252, + "grad_norm": 0.14482854306697845, + "learning_rate": 0.0001, + "loss": 1.4928, + "step": 2847 + }, + { + "epoch": 0.11654387139986604, + "grad_norm": 0.15183790028095245, + "learning_rate": 0.0001, + "loss": 1.551, + "step": 2848 + }, + { + "epoch": 0.11687876758204957, + "grad_norm": 0.1584623008966446, + "learning_rate": 0.0001, + "loss": 1.5034, + "step": 2849 + }, + { + "epoch": 0.11721366376423309, + "grad_norm": 0.15252350270748138, + "learning_rate": 0.0001, + "loss": 1.6486, + "step": 2850 + }, + { + "epoch": 0.11754855994641661, + "grad_norm": 0.15280024707317352, + "learning_rate": 0.0001, + "loss": 1.5209, + "step": 2851 + }, + { + "epoch": 0.11788345612860013, + "grad_norm": 0.13550280034542084, + "learning_rate": 0.0001, + "loss": 1.372, + "step": 2852 + }, + { + "epoch": 0.11821835231078366, + "grad_norm": 0.14424198865890503, + "learning_rate": 0.0001, + "loss": 1.4848, + "step": 2853 + }, + { + "epoch": 0.11855324849296718, + "grad_norm": 0.14939643442630768, + "learning_rate": 0.0001, + "loss": 1.4816, + "step": 2854 + }, + { + "epoch": 0.1188881446751507, + "grad_norm": 0.15128619968891144, + "learning_rate": 0.0001, + "loss": 1.4966, + "step": 2855 + }, + { + "epoch": 0.11922304085733422, + "grad_norm": 0.1442454755306244, + "learning_rate": 0.0001, + "loss": 1.5416, + "step": 2856 + }, + { + "epoch": 0.11955793703951775, + "grad_norm": 0.14224649965763092, + "learning_rate": 0.0001, + "loss": 1.545, + "step": 2857 + }, + { + "epoch": 0.11989283322170127, + "grad_norm": 0.14138370752334595, + "learning_rate": 0.0001, + "loss": 1.5045, + "step": 2858 + }, + { + "epoch": 0.12022772940388479, + "grad_norm": 0.15062503516674042, + "learning_rate": 0.0001, + "loss": 1.4984, + "step": 2859 + }, + { + "epoch": 0.12056262558606833, + "grad_norm": 0.15108346939086914, + "learning_rate": 0.0001, + "loss": 1.5688, + "step": 2860 + }, + { + "epoch": 0.12089752176825184, + "grad_norm": 0.1456029713153839, + "learning_rate": 0.0001, + "loss": 1.4947, + "step": 2861 + }, + { + "epoch": 0.12123241795043536, + "grad_norm": 0.14550332725048065, + "learning_rate": 0.0001, + "loss": 1.3615, + "step": 2862 + }, + { + "epoch": 0.12156731413261888, + "grad_norm": 0.14609716832637787, + "learning_rate": 0.0001, + "loss": 1.4911, + "step": 2863 + }, + { + "epoch": 0.12190221031480242, + "grad_norm": 0.14730757474899292, + "learning_rate": 0.0001, + "loss": 1.4903, + "step": 2864 + }, + { + "epoch": 0.12223710649698594, + "grad_norm": 0.1497606337070465, + "learning_rate": 0.0001, + "loss": 1.4883, + "step": 2865 + }, + { + "epoch": 0.12257200267916946, + "grad_norm": 0.14975416660308838, + "learning_rate": 0.0001, + "loss": 1.5021, + "step": 2866 + }, + { + "epoch": 0.12290689886135298, + "grad_norm": 0.14952072501182556, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 2867 + }, + { + "epoch": 0.12324179504353651, + "grad_norm": 0.15092499554157257, + "learning_rate": 0.0001, + "loss": 1.459, + "step": 2868 + }, + { + "epoch": 0.12357669122572003, + "grad_norm": 0.14270053803920746, + "learning_rate": 0.0001, + "loss": 1.4523, + "step": 2869 + }, + { + "epoch": 0.12391158740790355, + "grad_norm": 0.14697770774364471, + "learning_rate": 0.0001, + "loss": 1.5208, + "step": 2870 + }, + { + "epoch": 0.12424648359008707, + "grad_norm": 0.1468277871608734, + "learning_rate": 0.0001, + "loss": 1.5413, + "step": 2871 + }, + { + "epoch": 0.1245813797722706, + "grad_norm": 0.1417912393808365, + "learning_rate": 0.0001, + "loss": 1.4422, + "step": 2872 + }, + { + "epoch": 0.12491627595445412, + "grad_norm": 0.1466628909111023, + "learning_rate": 0.0001, + "loss": 1.5536, + "step": 2873 + }, + { + "epoch": 0.12525117213663764, + "grad_norm": 0.15047575533390045, + "learning_rate": 0.0001, + "loss": 1.5134, + "step": 2874 + }, + { + "epoch": 0.12558606831882116, + "grad_norm": 0.14367663860321045, + "learning_rate": 0.0001, + "loss": 1.5137, + "step": 2875 + }, + { + "epoch": 0.12592096450100468, + "grad_norm": 0.14631038904190063, + "learning_rate": 0.0001, + "loss": 1.5526, + "step": 2876 + }, + { + "epoch": 0.1262558606831882, + "grad_norm": 0.1465481072664261, + "learning_rate": 0.0001, + "loss": 1.5348, + "step": 2877 + }, + { + "epoch": 0.12659075686537175, + "grad_norm": 0.141470804810524, + "learning_rate": 0.0001, + "loss": 1.4179, + "step": 2878 + }, + { + "epoch": 0.12692565304755526, + "grad_norm": 0.14299000799655914, + "learning_rate": 0.0001, + "loss": 1.447, + "step": 2879 + }, + { + "epoch": 0.12726054922973878, + "grad_norm": 0.13756218552589417, + "learning_rate": 0.0001, + "loss": 1.3867, + "step": 2880 + }, + { + "epoch": 0.1275954454119223, + "grad_norm": 0.141182541847229, + "learning_rate": 0.0001, + "loss": 1.4574, + "step": 2881 + }, + { + "epoch": 0.12793034159410582, + "grad_norm": 0.1464800387620926, + "learning_rate": 0.0001, + "loss": 1.4442, + "step": 2882 + }, + { + "epoch": 0.12826523777628934, + "grad_norm": 0.14528337121009827, + "learning_rate": 0.0001, + "loss": 1.4991, + "step": 2883 + }, + { + "epoch": 0.12860013395847286, + "grad_norm": 0.14793065190315247, + "learning_rate": 0.0001, + "loss": 1.5269, + "step": 2884 + }, + { + "epoch": 0.1289350301406564, + "grad_norm": 0.1435883641242981, + "learning_rate": 0.0001, + "loss": 1.4521, + "step": 2885 + }, + { + "epoch": 0.12926992632283993, + "grad_norm": 0.14673016965389252, + "learning_rate": 0.0001, + "loss": 1.5129, + "step": 2886 + }, + { + "epoch": 0.12960482250502345, + "grad_norm": 0.13962426781654358, + "learning_rate": 0.0001, + "loss": 1.467, + "step": 2887 + }, + { + "epoch": 0.12993971868720697, + "grad_norm": 0.1506863832473755, + "learning_rate": 0.0001, + "loss": 1.5761, + "step": 2888 + }, + { + "epoch": 0.1302746148693905, + "grad_norm": 0.14275865256786346, + "learning_rate": 0.0001, + "loss": 1.4139, + "step": 2889 + }, + { + "epoch": 0.130609511051574, + "grad_norm": 0.14221537113189697, + "learning_rate": 0.0001, + "loss": 1.4269, + "step": 2890 + }, + { + "epoch": 0.13094440723375753, + "grad_norm": 0.1372966170310974, + "learning_rate": 0.0001, + "loss": 1.4217, + "step": 2891 + }, + { + "epoch": 0.13127930341594105, + "grad_norm": 0.1523875892162323, + "learning_rate": 0.0001, + "loss": 1.528, + "step": 2892 + }, + { + "epoch": 0.1316141995981246, + "grad_norm": 0.13655850291252136, + "learning_rate": 0.0001, + "loss": 1.4749, + "step": 2893 + }, + { + "epoch": 0.1319490957803081, + "grad_norm": 0.14285436272621155, + "learning_rate": 0.0001, + "loss": 1.416, + "step": 2894 + }, + { + "epoch": 0.13228399196249163, + "grad_norm": 0.1540876179933548, + "learning_rate": 0.0001, + "loss": 1.604, + "step": 2895 + }, + { + "epoch": 0.13261888814467515, + "grad_norm": 0.14926448464393616, + "learning_rate": 0.0001, + "loss": 1.5095, + "step": 2896 + }, + { + "epoch": 0.13295378432685867, + "grad_norm": 0.15166886150836945, + "learning_rate": 0.0001, + "loss": 1.5766, + "step": 2897 + }, + { + "epoch": 0.1332886805090422, + "grad_norm": 0.15036708116531372, + "learning_rate": 0.0001, + "loss": 1.4603, + "step": 2898 + }, + { + "epoch": 0.1336235766912257, + "grad_norm": 0.14928917586803436, + "learning_rate": 0.0001, + "loss": 1.5102, + "step": 2899 + }, + { + "epoch": 0.13395847287340926, + "grad_norm": 0.14783331751823425, + "learning_rate": 0.0001, + "loss": 1.4389, + "step": 2900 + }, + { + "epoch": 0.13429336905559278, + "grad_norm": 0.1541386842727661, + "learning_rate": 0.0001, + "loss": 1.643, + "step": 2901 + }, + { + "epoch": 0.1346282652377763, + "grad_norm": 0.15611632168293, + "learning_rate": 0.0001, + "loss": 1.5295, + "step": 2902 + }, + { + "epoch": 0.13496316141995982, + "grad_norm": 0.1471196413040161, + "learning_rate": 0.0001, + "loss": 1.4305, + "step": 2903 + }, + { + "epoch": 0.13529805760214333, + "grad_norm": 0.15436099469661713, + "learning_rate": 0.0001, + "loss": 1.5258, + "step": 2904 + }, + { + "epoch": 0.13563295378432685, + "grad_norm": 0.14856241643428802, + "learning_rate": 0.0001, + "loss": 1.432, + "step": 2905 + }, + { + "epoch": 0.13596784996651037, + "grad_norm": 0.1435495764017105, + "learning_rate": 0.0001, + "loss": 1.4063, + "step": 2906 + }, + { + "epoch": 0.1363027461486939, + "grad_norm": 0.14317458868026733, + "learning_rate": 0.0001, + "loss": 1.4238, + "step": 2907 + }, + { + "epoch": 0.13663764233087744, + "grad_norm": 0.14765316247940063, + "learning_rate": 0.0001, + "loss": 1.5453, + "step": 2908 + }, + { + "epoch": 0.13697253851306096, + "grad_norm": 0.15258818864822388, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 2909 + }, + { + "epoch": 0.13730743469524448, + "grad_norm": 0.14648139476776123, + "learning_rate": 0.0001, + "loss": 1.4419, + "step": 2910 + }, + { + "epoch": 0.137642330877428, + "grad_norm": 0.14375437796115875, + "learning_rate": 0.0001, + "loss": 1.4301, + "step": 2911 + }, + { + "epoch": 0.13797722705961152, + "grad_norm": 0.1457594335079193, + "learning_rate": 0.0001, + "loss": 1.4593, + "step": 2912 + }, + { + "epoch": 0.13831212324179504, + "grad_norm": 0.14920419454574585, + "learning_rate": 0.0001, + "loss": 1.5514, + "step": 2913 + }, + { + "epoch": 0.13864701942397856, + "grad_norm": 0.1460302621126175, + "learning_rate": 0.0001, + "loss": 1.47, + "step": 2914 + }, + { + "epoch": 0.13898191560616208, + "grad_norm": 0.1491982638835907, + "learning_rate": 0.0001, + "loss": 1.4894, + "step": 2915 + }, + { + "epoch": 0.13931681178834562, + "grad_norm": 0.14638325572013855, + "learning_rate": 0.0001, + "loss": 1.4824, + "step": 2916 + }, + { + "epoch": 0.13965170797052914, + "grad_norm": 0.14499424397945404, + "learning_rate": 0.0001, + "loss": 1.4292, + "step": 2917 + }, + { + "epoch": 0.13998660415271266, + "grad_norm": 0.14576716721057892, + "learning_rate": 0.0001, + "loss": 1.5192, + "step": 2918 + }, + { + "epoch": 0.14032150033489618, + "grad_norm": 0.1488197296857834, + "learning_rate": 0.0001, + "loss": 1.442, + "step": 2919 + }, + { + "epoch": 0.1406563965170797, + "grad_norm": 0.14574909210205078, + "learning_rate": 0.0001, + "loss": 1.4224, + "step": 2920 + }, + { + "epoch": 0.14099129269926322, + "grad_norm": 0.15139609575271606, + "learning_rate": 0.0001, + "loss": 1.5193, + "step": 2921 + }, + { + "epoch": 0.14132618888144674, + "grad_norm": 0.15167635679244995, + "learning_rate": 0.0001, + "loss": 1.5037, + "step": 2922 + }, + { + "epoch": 0.1416610850636303, + "grad_norm": 0.14516854286193848, + "learning_rate": 0.0001, + "loss": 1.4547, + "step": 2923 + }, + { + "epoch": 0.1419959812458138, + "grad_norm": 0.14109300076961517, + "learning_rate": 0.0001, + "loss": 1.4066, + "step": 2924 + }, + { + "epoch": 0.14233087742799733, + "grad_norm": 0.1450345367193222, + "learning_rate": 0.0001, + "loss": 1.512, + "step": 2925 + }, + { + "epoch": 0.14266577361018085, + "grad_norm": 0.14865948259830475, + "learning_rate": 0.0001, + "loss": 1.4758, + "step": 2926 + }, + { + "epoch": 0.14300066979236437, + "grad_norm": 0.1497637927532196, + "learning_rate": 0.0001, + "loss": 1.4643, + "step": 2927 + }, + { + "epoch": 0.14333556597454788, + "grad_norm": 0.1432199478149414, + "learning_rate": 0.0001, + "loss": 1.467, + "step": 2928 + }, + { + "epoch": 0.1436704621567314, + "grad_norm": 0.15253181755542755, + "learning_rate": 0.0001, + "loss": 1.5088, + "step": 2929 + }, + { + "epoch": 0.14400535833891492, + "grad_norm": 0.1489313691854477, + "learning_rate": 0.0001, + "loss": 1.475, + "step": 2930 + }, + { + "epoch": 0.14434025452109847, + "grad_norm": 0.15359234809875488, + "learning_rate": 0.0001, + "loss": 1.5527, + "step": 2931 + }, + { + "epoch": 0.144675150703282, + "grad_norm": 0.14580143988132477, + "learning_rate": 0.0001, + "loss": 1.4946, + "step": 2932 + }, + { + "epoch": 0.1450100468854655, + "grad_norm": 0.14856304228305817, + "learning_rate": 0.0001, + "loss": 1.4547, + "step": 2933 + }, + { + "epoch": 0.14534494306764903, + "grad_norm": 0.14780816435813904, + "learning_rate": 0.0001, + "loss": 1.4839, + "step": 2934 + }, + { + "epoch": 0.14567983924983255, + "grad_norm": 0.15428078174591064, + "learning_rate": 0.0001, + "loss": 1.5118, + "step": 2935 + }, + { + "epoch": 0.14601473543201607, + "grad_norm": 0.13885167241096497, + "learning_rate": 0.0001, + "loss": 1.4458, + "step": 2936 + }, + { + "epoch": 0.1463496316141996, + "grad_norm": 0.14830608665943146, + "learning_rate": 0.0001, + "loss": 1.441, + "step": 2937 + }, + { + "epoch": 0.14668452779638314, + "grad_norm": 0.14985600113868713, + "learning_rate": 0.0001, + "loss": 1.4694, + "step": 2938 + }, + { + "epoch": 0.14701942397856665, + "grad_norm": 0.14177678525447845, + "learning_rate": 0.0001, + "loss": 1.5314, + "step": 2939 + }, + { + "epoch": 0.14735432016075017, + "grad_norm": 0.1454205960035324, + "learning_rate": 0.0001, + "loss": 1.4811, + "step": 2940 + }, + { + "epoch": 0.1476892163429337, + "grad_norm": 0.14319349825382233, + "learning_rate": 0.0001, + "loss": 1.4508, + "step": 2941 + }, + { + "epoch": 0.1480241125251172, + "grad_norm": 0.14416146278381348, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 2942 + }, + { + "epoch": 0.14835900870730073, + "grad_norm": 0.14271238446235657, + "learning_rate": 0.0001, + "loss": 1.4124, + "step": 2943 + }, + { + "epoch": 0.14869390488948425, + "grad_norm": 0.149638369679451, + "learning_rate": 0.0001, + "loss": 1.5879, + "step": 2944 + }, + { + "epoch": 0.14902880107166777, + "grad_norm": 0.15497125685214996, + "learning_rate": 0.0001, + "loss": 1.5543, + "step": 2945 + }, + { + "epoch": 0.14936369725385132, + "grad_norm": 0.14638566970825195, + "learning_rate": 0.0001, + "loss": 1.52, + "step": 2946 + }, + { + "epoch": 0.14969859343603484, + "grad_norm": 0.13843445479869843, + "learning_rate": 0.0001, + "loss": 1.4464, + "step": 2947 + }, + { + "epoch": 0.15003348961821836, + "grad_norm": 0.1452428102493286, + "learning_rate": 0.0001, + "loss": 1.5037, + "step": 2948 + }, + { + "epoch": 0.15036838580040188, + "grad_norm": 0.1429130882024765, + "learning_rate": 0.0001, + "loss": 1.4431, + "step": 2949 + }, + { + "epoch": 0.1507032819825854, + "grad_norm": 0.14541609585285187, + "learning_rate": 0.0001, + "loss": 1.502, + "step": 2950 + }, + { + "epoch": 0.15103817816476892, + "grad_norm": 0.1436072587966919, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 2951 + }, + { + "epoch": 0.15137307434695244, + "grad_norm": 0.1478150635957718, + "learning_rate": 0.0001, + "loss": 1.4515, + "step": 2952 + }, + { + "epoch": 0.15170797052913595, + "grad_norm": 0.14313754439353943, + "learning_rate": 0.0001, + "loss": 1.4284, + "step": 2953 + }, + { + "epoch": 0.1520428667113195, + "grad_norm": 0.14626973867416382, + "learning_rate": 0.0001, + "loss": 1.4427, + "step": 2954 + }, + { + "epoch": 0.15237776289350302, + "grad_norm": 0.14305603504180908, + "learning_rate": 0.0001, + "loss": 1.4067, + "step": 2955 + }, + { + "epoch": 0.15271265907568654, + "grad_norm": 0.15285871922969818, + "learning_rate": 0.0001, + "loss": 1.4773, + "step": 2956 + }, + { + "epoch": 0.15304755525787006, + "grad_norm": 0.14073546230793, + "learning_rate": 0.0001, + "loss": 1.394, + "step": 2957 + }, + { + "epoch": 0.15338245144005358, + "grad_norm": 0.15410828590393066, + "learning_rate": 0.0001, + "loss": 1.5485, + "step": 2958 + }, + { + "epoch": 0.1537173476222371, + "grad_norm": 0.162978857755661, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 2959 + }, + { + "epoch": 0.15405224380442062, + "grad_norm": 0.14464987814426422, + "learning_rate": 0.0001, + "loss": 1.5635, + "step": 2960 + }, + { + "epoch": 0.15438713998660417, + "grad_norm": 0.1483234167098999, + "learning_rate": 0.0001, + "loss": 1.481, + "step": 2961 + }, + { + "epoch": 0.15472203616878769, + "grad_norm": 0.151525616645813, + "learning_rate": 0.0001, + "loss": 1.4573, + "step": 2962 + }, + { + "epoch": 0.1550569323509712, + "grad_norm": 0.14589788019657135, + "learning_rate": 0.0001, + "loss": 1.5513, + "step": 2963 + }, + { + "epoch": 0.15539182853315472, + "grad_norm": 0.15947841107845306, + "learning_rate": 0.0001, + "loss": 1.5102, + "step": 2964 + }, + { + "epoch": 0.15572672471533824, + "grad_norm": 0.14565446972846985, + "learning_rate": 0.0001, + "loss": 1.4366, + "step": 2965 + }, + { + "epoch": 0.15606162089752176, + "grad_norm": 0.1432366818189621, + "learning_rate": 0.0001, + "loss": 1.5625, + "step": 2966 + }, + { + "epoch": 0.15639651707970528, + "grad_norm": 0.15333931148052216, + "learning_rate": 0.0001, + "loss": 1.458, + "step": 2967 + }, + { + "epoch": 0.1567314132618888, + "grad_norm": 0.15091878175735474, + "learning_rate": 0.0001, + "loss": 1.4619, + "step": 2968 + }, + { + "epoch": 0.15706630944407235, + "grad_norm": 0.1437712013721466, + "learning_rate": 0.0001, + "loss": 1.5207, + "step": 2969 + }, + { + "epoch": 0.15740120562625587, + "grad_norm": 0.16013203561306, + "learning_rate": 0.0001, + "loss": 1.5625, + "step": 2970 + }, + { + "epoch": 0.1577361018084394, + "grad_norm": 0.1535797417163849, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 2971 + }, + { + "epoch": 0.1580709979906229, + "grad_norm": 0.14695370197296143, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 2972 + }, + { + "epoch": 0.15840589417280643, + "grad_norm": 0.15556497871875763, + "learning_rate": 0.0001, + "loss": 1.5317, + "step": 2973 + }, + { + "epoch": 0.15874079035498995, + "grad_norm": 0.15451493859291077, + "learning_rate": 0.0001, + "loss": 1.5248, + "step": 2974 + }, + { + "epoch": 0.15907568653717347, + "grad_norm": 0.14186540246009827, + "learning_rate": 0.0001, + "loss": 1.4655, + "step": 2975 + }, + { + "epoch": 0.15941058271935699, + "grad_norm": 0.15657803416252136, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 2976 + }, + { + "epoch": 0.15974547890154053, + "grad_norm": 0.15531529486179352, + "learning_rate": 0.0001, + "loss": 1.512, + "step": 2977 + }, + { + "epoch": 0.16008037508372405, + "grad_norm": 0.14492064714431763, + "learning_rate": 0.0001, + "loss": 1.4434, + "step": 2978 + }, + { + "epoch": 0.16041527126590757, + "grad_norm": 0.1516708880662918, + "learning_rate": 0.0001, + "loss": 1.4656, + "step": 2979 + }, + { + "epoch": 0.1607501674480911, + "grad_norm": 0.14958037436008453, + "learning_rate": 0.0001, + "loss": 1.4626, + "step": 2980 + }, + { + "epoch": 0.1610850636302746, + "grad_norm": 0.1415179967880249, + "learning_rate": 0.0001, + "loss": 1.4767, + "step": 2981 + }, + { + "epoch": 0.16141995981245813, + "grad_norm": 0.1595795750617981, + "learning_rate": 0.0001, + "loss": 1.5476, + "step": 2982 + }, + { + "epoch": 0.16175485599464165, + "grad_norm": 0.1559695601463318, + "learning_rate": 0.0001, + "loss": 1.5846, + "step": 2983 + }, + { + "epoch": 0.1620897521768252, + "grad_norm": 0.14177894592285156, + "learning_rate": 0.0001, + "loss": 1.3714, + "step": 2984 + }, + { + "epoch": 0.16242464835900872, + "grad_norm": 0.15120206773281097, + "learning_rate": 0.0001, + "loss": 1.4896, + "step": 2985 + }, + { + "epoch": 0.16275954454119224, + "grad_norm": 0.1491135209798813, + "learning_rate": 0.0001, + "loss": 1.4239, + "step": 2986 + }, + { + "epoch": 0.16309444072337576, + "grad_norm": 0.15394911170005798, + "learning_rate": 0.0001, + "loss": 1.5397, + "step": 2987 + }, + { + "epoch": 0.16342933690555927, + "grad_norm": 0.14349506795406342, + "learning_rate": 0.0001, + "loss": 1.4447, + "step": 2988 + }, + { + "epoch": 0.1637642330877428, + "grad_norm": 0.15076670050621033, + "learning_rate": 0.0001, + "loss": 1.4206, + "step": 2989 + }, + { + "epoch": 0.1640991292699263, + "grad_norm": 0.14125433564186096, + "learning_rate": 0.0001, + "loss": 1.426, + "step": 2990 + }, + { + "epoch": 0.16443402545210983, + "grad_norm": 0.15920014679431915, + "learning_rate": 0.0001, + "loss": 1.5571, + "step": 2991 + }, + { + "epoch": 0.16476892163429338, + "grad_norm": 0.1530846655368805, + "learning_rate": 0.0001, + "loss": 1.4773, + "step": 2992 + }, + { + "epoch": 0.1651038178164769, + "grad_norm": 0.14886631071567535, + "learning_rate": 0.0001, + "loss": 1.5835, + "step": 2993 + }, + { + "epoch": 0.16543871399866042, + "grad_norm": 0.1467156857252121, + "learning_rate": 0.0001, + "loss": 1.5853, + "step": 2994 + }, + { + "epoch": 0.16577361018084394, + "grad_norm": 0.14700840413570404, + "learning_rate": 0.0001, + "loss": 1.5046, + "step": 2995 + }, + { + "epoch": 0.16610850636302746, + "grad_norm": 0.1469178944826126, + "learning_rate": 0.0001, + "loss": 1.5363, + "step": 2996 + }, + { + "epoch": 0.16644340254521098, + "grad_norm": 0.15248429775238037, + "learning_rate": 0.0001, + "loss": 1.4008, + "step": 2997 + }, + { + "epoch": 0.1667782987273945, + "grad_norm": 0.15299245715141296, + "learning_rate": 0.0001, + "loss": 1.5311, + "step": 2998 + }, + { + "epoch": 0.16711319490957804, + "grad_norm": 0.1501973271369934, + "learning_rate": 0.0001, + "loss": 1.481, + "step": 2999 + }, + { + "epoch": 0.16744809109176156, + "grad_norm": 0.14169913530349731, + "learning_rate": 0.0001, + "loss": 1.3979, + "step": 3000 + }, + { + "epoch": 0.16778298727394508, + "grad_norm": 0.14365142583847046, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 3001 + }, + { + "epoch": 0.1681178834561286, + "grad_norm": 0.1459331214427948, + "learning_rate": 0.0001, + "loss": 1.4924, + "step": 3002 + }, + { + "epoch": 0.16845277963831212, + "grad_norm": 0.1534721553325653, + "learning_rate": 0.0001, + "loss": 1.5352, + "step": 3003 + }, + { + "epoch": 0.16878767582049564, + "grad_norm": 0.14717139303684235, + "learning_rate": 0.0001, + "loss": 1.5159, + "step": 3004 + }, + { + "epoch": 0.16912257200267916, + "grad_norm": 0.14475423097610474, + "learning_rate": 0.0001, + "loss": 1.4526, + "step": 3005 + }, + { + "epoch": 0.16945746818486268, + "grad_norm": 0.14285942912101746, + "learning_rate": 0.0001, + "loss": 1.4722, + "step": 3006 + }, + { + "epoch": 0.16979236436704623, + "grad_norm": 0.14912879467010498, + "learning_rate": 0.0001, + "loss": 1.4364, + "step": 3007 + }, + { + "epoch": 0.17012726054922975, + "grad_norm": 0.15310518443584442, + "learning_rate": 0.0001, + "loss": 1.541, + "step": 3008 + }, + { + "epoch": 0.17046215673141327, + "grad_norm": 0.14872018992900848, + "learning_rate": 0.0001, + "loss": 1.5321, + "step": 3009 + }, + { + "epoch": 0.1707970529135968, + "grad_norm": 0.1429416537284851, + "learning_rate": 0.0001, + "loss": 1.4555, + "step": 3010 + }, + { + "epoch": 0.1711319490957803, + "grad_norm": 0.14461266994476318, + "learning_rate": 0.0001, + "loss": 1.4324, + "step": 3011 + }, + { + "epoch": 0.17146684527796383, + "grad_norm": 0.15209822356700897, + "learning_rate": 0.0001, + "loss": 1.5429, + "step": 3012 + }, + { + "epoch": 0.17180174146014734, + "grad_norm": 0.148372620344162, + "learning_rate": 0.0001, + "loss": 1.4373, + "step": 3013 + }, + { + "epoch": 0.17213663764233086, + "grad_norm": 0.1498633325099945, + "learning_rate": 0.0001, + "loss": 1.5929, + "step": 3014 + }, + { + "epoch": 0.1724715338245144, + "grad_norm": 0.15261657536029816, + "learning_rate": 0.0001, + "loss": 1.5334, + "step": 3015 + }, + { + "epoch": 0.17280643000669793, + "grad_norm": 0.14222516119480133, + "learning_rate": 0.0001, + "loss": 1.4574, + "step": 3016 + }, + { + "epoch": 0.17314132618888145, + "grad_norm": 0.14974059164524078, + "learning_rate": 0.0001, + "loss": 1.4512, + "step": 3017 + }, + { + "epoch": 0.17347622237106497, + "grad_norm": 0.14610859751701355, + "learning_rate": 0.0001, + "loss": 1.4675, + "step": 3018 + }, + { + "epoch": 0.1738111185532485, + "grad_norm": 0.14841997623443604, + "learning_rate": 0.0001, + "loss": 1.399, + "step": 3019 + }, + { + "epoch": 0.174146014735432, + "grad_norm": 0.14367492496967316, + "learning_rate": 0.0001, + "loss": 1.4205, + "step": 3020 + }, + { + "epoch": 0.17448091091761553, + "grad_norm": 0.14827755093574524, + "learning_rate": 0.0001, + "loss": 1.4087, + "step": 3021 + }, + { + "epoch": 0.17481580709979908, + "grad_norm": 0.1429951936006546, + "learning_rate": 0.0001, + "loss": 1.4383, + "step": 3022 + }, + { + "epoch": 0.1751507032819826, + "grad_norm": 0.14675314724445343, + "learning_rate": 0.0001, + "loss": 1.4825, + "step": 3023 + }, + { + "epoch": 0.17548559946416611, + "grad_norm": 0.1578107476234436, + "learning_rate": 0.0001, + "loss": 1.5891, + "step": 3024 + }, + { + "epoch": 0.17582049564634963, + "grad_norm": 0.15326331555843353, + "learning_rate": 0.0001, + "loss": 1.5382, + "step": 3025 + }, + { + "epoch": 0.17615539182853315, + "grad_norm": 0.15587475895881653, + "learning_rate": 0.0001, + "loss": 1.4613, + "step": 3026 + }, + { + "epoch": 0.17649028801071667, + "grad_norm": 0.14984256029129028, + "learning_rate": 0.0001, + "loss": 1.4881, + "step": 3027 + }, + { + "epoch": 0.1768251841929002, + "grad_norm": 0.15645815432071686, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 3028 + }, + { + "epoch": 0.1771600803750837, + "grad_norm": 0.14840419590473175, + "learning_rate": 0.0001, + "loss": 1.5134, + "step": 3029 + }, + { + "epoch": 0.17749497655726726, + "grad_norm": 0.1488848477602005, + "learning_rate": 0.0001, + "loss": 1.3932, + "step": 3030 + }, + { + "epoch": 0.17782987273945078, + "grad_norm": 0.14206485450267792, + "learning_rate": 0.0001, + "loss": 1.4531, + "step": 3031 + }, + { + "epoch": 0.1781647689216343, + "grad_norm": 0.14872042834758759, + "learning_rate": 0.0001, + "loss": 1.4467, + "step": 3032 + }, + { + "epoch": 0.17849966510381782, + "grad_norm": 0.1473948210477829, + "learning_rate": 0.0001, + "loss": 1.5119, + "step": 3033 + }, + { + "epoch": 0.17883456128600134, + "grad_norm": 0.14164240658283234, + "learning_rate": 0.0001, + "loss": 1.4372, + "step": 3034 + }, + { + "epoch": 0.17916945746818486, + "grad_norm": 0.15156961977481842, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 3035 + }, + { + "epoch": 0.17950435365036838, + "grad_norm": 0.14617635309696198, + "learning_rate": 0.0001, + "loss": 1.4933, + "step": 3036 + }, + { + "epoch": 0.1798392498325519, + "grad_norm": 0.14830727875232697, + "learning_rate": 0.0001, + "loss": 1.5332, + "step": 3037 + }, + { + "epoch": 0.18017414601473544, + "grad_norm": 0.1488112509250641, + "learning_rate": 0.0001, + "loss": 1.4318, + "step": 3038 + }, + { + "epoch": 0.18050904219691896, + "grad_norm": 0.14796082675457, + "learning_rate": 0.0001, + "loss": 1.6183, + "step": 3039 + }, + { + "epoch": 0.18084393837910248, + "grad_norm": 0.15212929248809814, + "learning_rate": 0.0001, + "loss": 1.554, + "step": 3040 + }, + { + "epoch": 0.181178834561286, + "grad_norm": 0.1485069841146469, + "learning_rate": 0.0001, + "loss": 1.4418, + "step": 3041 + }, + { + "epoch": 0.18151373074346952, + "grad_norm": 0.1428901106119156, + "learning_rate": 0.0001, + "loss": 1.4409, + "step": 3042 + }, + { + "epoch": 0.18184862692565304, + "grad_norm": 0.1394181102514267, + "learning_rate": 0.0001, + "loss": 1.4253, + "step": 3043 + }, + { + "epoch": 0.18218352310783656, + "grad_norm": 0.15848679840564728, + "learning_rate": 0.0001, + "loss": 1.4266, + "step": 3044 + }, + { + "epoch": 0.1825184192900201, + "grad_norm": 0.15975135564804077, + "learning_rate": 0.0001, + "loss": 1.5339, + "step": 3045 + }, + { + "epoch": 0.18285331547220363, + "grad_norm": 0.1499025970697403, + "learning_rate": 0.0001, + "loss": 1.4731, + "step": 3046 + }, + { + "epoch": 0.18318821165438715, + "grad_norm": 0.1490570306777954, + "learning_rate": 0.0001, + "loss": 1.5787, + "step": 3047 + }, + { + "epoch": 0.18352310783657066, + "grad_norm": 0.16007208824157715, + "learning_rate": 0.0001, + "loss": 1.5333, + "step": 3048 + }, + { + "epoch": 0.18385800401875418, + "grad_norm": 0.14733049273490906, + "learning_rate": 0.0001, + "loss": 1.4289, + "step": 3049 + }, + { + "epoch": 0.1841929002009377, + "grad_norm": 0.14483436942100525, + "learning_rate": 0.0001, + "loss": 1.4398, + "step": 3050 + }, + { + "epoch": 0.18452779638312122, + "grad_norm": 0.1574944406747818, + "learning_rate": 0.0001, + "loss": 1.5116, + "step": 3051 + }, + { + "epoch": 0.18486269256530474, + "grad_norm": 0.14573010802268982, + "learning_rate": 0.0001, + "loss": 1.4275, + "step": 3052 + }, + { + "epoch": 0.1851975887474883, + "grad_norm": 0.14518269896507263, + "learning_rate": 0.0001, + "loss": 1.4883, + "step": 3053 + }, + { + "epoch": 0.1855324849296718, + "grad_norm": 0.14543338119983673, + "learning_rate": 0.0001, + "loss": 1.442, + "step": 3054 + }, + { + "epoch": 0.18586738111185533, + "grad_norm": 0.157647043466568, + "learning_rate": 0.0001, + "loss": 1.3932, + "step": 3055 + }, + { + "epoch": 0.18620227729403885, + "grad_norm": 0.15597252547740936, + "learning_rate": 0.0001, + "loss": 1.5283, + "step": 3056 + }, + { + "epoch": 0.18653717347622237, + "grad_norm": 0.14394451677799225, + "learning_rate": 0.0001, + "loss": 1.4146, + "step": 3057 + }, + { + "epoch": 0.1868720696584059, + "grad_norm": 0.14798322319984436, + "learning_rate": 0.0001, + "loss": 1.4653, + "step": 3058 + }, + { + "epoch": 0.1872069658405894, + "grad_norm": 0.1463840901851654, + "learning_rate": 0.0001, + "loss": 1.5469, + "step": 3059 + }, + { + "epoch": 0.18754186202277295, + "grad_norm": 0.14234453439712524, + "learning_rate": 0.0001, + "loss": 1.4405, + "step": 3060 + }, + { + "epoch": 0.18787675820495647, + "grad_norm": 0.1502765715122223, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 3061 + }, + { + "epoch": 0.18821165438714, + "grad_norm": 0.14024700224399567, + "learning_rate": 0.0001, + "loss": 1.4284, + "step": 3062 + }, + { + "epoch": 0.1885465505693235, + "grad_norm": 0.1412835270166397, + "learning_rate": 0.0001, + "loss": 1.4675, + "step": 3063 + }, + { + "epoch": 0.18888144675150703, + "grad_norm": 0.1488562822341919, + "learning_rate": 0.0001, + "loss": 1.5071, + "step": 3064 + }, + { + "epoch": 0.18921634293369055, + "grad_norm": 0.14808917045593262, + "learning_rate": 0.0001, + "loss": 1.4587, + "step": 3065 + }, + { + "epoch": 0.18955123911587407, + "grad_norm": 0.14762601256370544, + "learning_rate": 0.0001, + "loss": 1.5168, + "step": 3066 + }, + { + "epoch": 0.1898861352980576, + "grad_norm": 0.14939825236797333, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 3067 + }, + { + "epoch": 0.19022103148024114, + "grad_norm": 0.1474798023700714, + "learning_rate": 0.0001, + "loss": 1.4373, + "step": 3068 + }, + { + "epoch": 0.19055592766242466, + "grad_norm": 0.1497553139925003, + "learning_rate": 0.0001, + "loss": 1.4871, + "step": 3069 + }, + { + "epoch": 0.19089082384460818, + "grad_norm": 0.1531815230846405, + "learning_rate": 0.0001, + "loss": 1.4986, + "step": 3070 + }, + { + "epoch": 0.1912257200267917, + "grad_norm": 0.1441792994737625, + "learning_rate": 0.0001, + "loss": 1.4386, + "step": 3071 + }, + { + "epoch": 0.19156061620897522, + "grad_norm": 0.14361928403377533, + "learning_rate": 0.0001, + "loss": 1.3961, + "step": 3072 + }, + { + "epoch": 0.19189551239115873, + "grad_norm": 0.15087366104125977, + "learning_rate": 0.0001, + "loss": 1.5987, + "step": 3073 + }, + { + "epoch": 0.19223040857334225, + "grad_norm": 0.15212655067443848, + "learning_rate": 0.0001, + "loss": 1.5697, + "step": 3074 + }, + { + "epoch": 0.19256530475552577, + "grad_norm": 0.15329968929290771, + "learning_rate": 0.0001, + "loss": 1.5129, + "step": 3075 + }, + { + "epoch": 0.19290020093770932, + "grad_norm": 0.14272186160087585, + "learning_rate": 0.0001, + "loss": 1.431, + "step": 3076 + }, + { + "epoch": 0.19323509711989284, + "grad_norm": 0.14385369420051575, + "learning_rate": 0.0001, + "loss": 1.5065, + "step": 3077 + }, + { + "epoch": 0.19356999330207636, + "grad_norm": 0.14664167165756226, + "learning_rate": 0.0001, + "loss": 1.472, + "step": 3078 + }, + { + "epoch": 0.19390488948425988, + "grad_norm": 0.15277238190174103, + "learning_rate": 0.0001, + "loss": 1.4597, + "step": 3079 + }, + { + "epoch": 0.1942397856664434, + "grad_norm": 0.15171590447425842, + "learning_rate": 0.0001, + "loss": 1.5161, + "step": 3080 + }, + { + "epoch": 0.19457468184862692, + "grad_norm": 0.14549346268177032, + "learning_rate": 0.0001, + "loss": 1.4613, + "step": 3081 + }, + { + "epoch": 0.19490957803081044, + "grad_norm": 0.14628693461418152, + "learning_rate": 0.0001, + "loss": 1.4514, + "step": 3082 + }, + { + "epoch": 0.19524447421299398, + "grad_norm": 0.14419186115264893, + "learning_rate": 0.0001, + "loss": 1.4797, + "step": 3083 + }, + { + "epoch": 0.1955793703951775, + "grad_norm": 0.14945529401302338, + "learning_rate": 0.0001, + "loss": 1.452, + "step": 3084 + }, + { + "epoch": 0.19591426657736102, + "grad_norm": 0.1542719602584839, + "learning_rate": 0.0001, + "loss": 1.5125, + "step": 3085 + }, + { + "epoch": 0.19624916275954454, + "grad_norm": 0.15316370129585266, + "learning_rate": 0.0001, + "loss": 1.597, + "step": 3086 + }, + { + "epoch": 0.19658405894172806, + "grad_norm": 0.15390264987945557, + "learning_rate": 0.0001, + "loss": 1.4559, + "step": 3087 + }, + { + "epoch": 0.19691895512391158, + "grad_norm": 0.15577276051044464, + "learning_rate": 0.0001, + "loss": 1.5045, + "step": 3088 + }, + { + "epoch": 0.1972538513060951, + "grad_norm": 0.14395619928836823, + "learning_rate": 0.0001, + "loss": 1.485, + "step": 3089 + }, + { + "epoch": 0.19758874748827862, + "grad_norm": 0.14563584327697754, + "learning_rate": 0.0001, + "loss": 1.4736, + "step": 3090 + }, + { + "epoch": 0.19792364367046217, + "grad_norm": 0.15247657895088196, + "learning_rate": 0.0001, + "loss": 1.4362, + "step": 3091 + }, + { + "epoch": 0.1982585398526457, + "grad_norm": 0.1521885246038437, + "learning_rate": 0.0001, + "loss": 1.4417, + "step": 3092 + }, + { + "epoch": 0.1985934360348292, + "grad_norm": 0.14599932730197906, + "learning_rate": 0.0001, + "loss": 1.4764, + "step": 3093 + }, + { + "epoch": 0.19892833221701273, + "grad_norm": 0.14040043950080872, + "learning_rate": 0.0001, + "loss": 1.3132, + "step": 3094 + }, + { + "epoch": 0.19926322839919625, + "grad_norm": 0.15043559670448303, + "learning_rate": 0.0001, + "loss": 1.4917, + "step": 3095 + }, + { + "epoch": 0.19959812458137977, + "grad_norm": 0.15356433391571045, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 3096 + }, + { + "epoch": 0.19993302076356329, + "grad_norm": 0.15018680691719055, + "learning_rate": 0.0001, + "loss": 1.5324, + "step": 3097 + }, + { + "epoch": 0.20026791694574683, + "grad_norm": 0.15176072716712952, + "learning_rate": 0.0001, + "loss": 1.4686, + "step": 3098 + }, + { + "epoch": 0.20060281312793035, + "grad_norm": 0.15239951014518738, + "learning_rate": 0.0001, + "loss": 1.5726, + "step": 3099 + }, + { + "epoch": 0.20093770931011387, + "grad_norm": 0.1486234813928604, + "learning_rate": 0.0001, + "loss": 1.4154, + "step": 3100 + }, + { + "epoch": 0.2012726054922974, + "grad_norm": 0.1507226973772049, + "learning_rate": 0.0001, + "loss": 1.5301, + "step": 3101 + }, + { + "epoch": 0.2016075016744809, + "grad_norm": 0.14854057133197784, + "learning_rate": 0.0001, + "loss": 1.3898, + "step": 3102 + }, + { + "epoch": 0.20194239785666443, + "grad_norm": 0.15393656492233276, + "learning_rate": 0.0001, + "loss": 1.4901, + "step": 3103 + }, + { + "epoch": 0.20227729403884795, + "grad_norm": 0.15347276628017426, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 3104 + }, + { + "epoch": 0.20261219022103147, + "grad_norm": 0.16099943220615387, + "learning_rate": 0.0001, + "loss": 1.524, + "step": 3105 + }, + { + "epoch": 0.20294708640321502, + "grad_norm": 0.14733991026878357, + "learning_rate": 0.0001, + "loss": 1.4457, + "step": 3106 + }, + { + "epoch": 0.20328198258539854, + "grad_norm": 0.147960364818573, + "learning_rate": 0.0001, + "loss": 1.4295, + "step": 3107 + }, + { + "epoch": 0.20361687876758205, + "grad_norm": 0.15491634607315063, + "learning_rate": 0.0001, + "loss": 1.4399, + "step": 3108 + }, + { + "epoch": 0.20395177494976557, + "grad_norm": 0.14620526134967804, + "learning_rate": 0.0001, + "loss": 1.4536, + "step": 3109 + }, + { + "epoch": 0.2042866711319491, + "grad_norm": 0.14189660549163818, + "learning_rate": 0.0001, + "loss": 1.3921, + "step": 3110 + }, + { + "epoch": 0.2046215673141326, + "grad_norm": 0.14943283796310425, + "learning_rate": 0.0001, + "loss": 1.342, + "step": 3111 + }, + { + "epoch": 0.20495646349631613, + "grad_norm": 0.1576606184244156, + "learning_rate": 0.0001, + "loss": 1.4832, + "step": 3112 + }, + { + "epoch": 0.20529135967849965, + "grad_norm": 0.1504100263118744, + "learning_rate": 0.0001, + "loss": 1.4987, + "step": 3113 + }, + { + "epoch": 0.2056262558606832, + "grad_norm": 0.15109190344810486, + "learning_rate": 0.0001, + "loss": 1.4591, + "step": 3114 + }, + { + "epoch": 0.20596115204286672, + "grad_norm": 0.15667004883289337, + "learning_rate": 0.0001, + "loss": 1.3823, + "step": 3115 + }, + { + "epoch": 0.20629604822505024, + "grad_norm": 0.14729659259319305, + "learning_rate": 0.0001, + "loss": 1.4394, + "step": 3116 + }, + { + "epoch": 0.20663094440723376, + "grad_norm": 0.1476004719734192, + "learning_rate": 0.0001, + "loss": 1.443, + "step": 3117 + }, + { + "epoch": 0.20696584058941728, + "grad_norm": 0.17168119549751282, + "learning_rate": 0.0001, + "loss": 1.5484, + "step": 3118 + }, + { + "epoch": 0.2073007367716008, + "grad_norm": 0.14790430665016174, + "learning_rate": 0.0001, + "loss": 1.469, + "step": 3119 + }, + { + "epoch": 0.20763563295378432, + "grad_norm": 0.16001714766025543, + "learning_rate": 0.0001, + "loss": 1.4979, + "step": 3120 + }, + { + "epoch": 0.20797052913596786, + "grad_norm": 0.15084713697433472, + "learning_rate": 0.0001, + "loss": 1.3852, + "step": 3121 + }, + { + "epoch": 0.20830542531815138, + "grad_norm": 0.14759619534015656, + "learning_rate": 0.0001, + "loss": 1.5474, + "step": 3122 + }, + { + "epoch": 0.2086403215003349, + "grad_norm": 0.15545868873596191, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 3123 + }, + { + "epoch": 0.20897521768251842, + "grad_norm": 0.15997780859470367, + "learning_rate": 0.0001, + "loss": 1.4449, + "step": 3124 + }, + { + "epoch": 0.20931011386470194, + "grad_norm": 0.14409230649471283, + "learning_rate": 0.0001, + "loss": 1.5113, + "step": 3125 + }, + { + "epoch": 0.20964501004688546, + "grad_norm": 0.14863921701908112, + "learning_rate": 0.0001, + "loss": 1.5332, + "step": 3126 + }, + { + "epoch": 0.20997990622906898, + "grad_norm": 0.14341822266578674, + "learning_rate": 0.0001, + "loss": 1.4174, + "step": 3127 + }, + { + "epoch": 0.2103148024112525, + "grad_norm": 0.14988672733306885, + "learning_rate": 0.0001, + "loss": 1.4759, + "step": 3128 + }, + { + "epoch": 0.21064969859343605, + "grad_norm": 0.14667005836963654, + "learning_rate": 0.0001, + "loss": 1.4981, + "step": 3129 + }, + { + "epoch": 0.21098459477561957, + "grad_norm": 0.14991022646427155, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 3130 + }, + { + "epoch": 0.21131949095780309, + "grad_norm": 0.14525125920772552, + "learning_rate": 0.0001, + "loss": 1.3783, + "step": 3131 + }, + { + "epoch": 0.2116543871399866, + "grad_norm": 0.15049485862255096, + "learning_rate": 0.0001, + "loss": 1.4224, + "step": 3132 + }, + { + "epoch": 0.21198928332217012, + "grad_norm": 0.15043260157108307, + "learning_rate": 0.0001, + "loss": 1.4528, + "step": 3133 + }, + { + "epoch": 0.21232417950435364, + "grad_norm": 0.15994007885456085, + "learning_rate": 0.0001, + "loss": 1.5085, + "step": 3134 + }, + { + "epoch": 0.21265907568653716, + "grad_norm": 0.14406290650367737, + "learning_rate": 0.0001, + "loss": 1.464, + "step": 3135 + }, + { + "epoch": 0.21299397186872068, + "grad_norm": 0.15055160224437714, + "learning_rate": 0.0001, + "loss": 1.4788, + "step": 3136 + }, + { + "epoch": 0.21332886805090423, + "grad_norm": 0.150338813662529, + "learning_rate": 0.0001, + "loss": 1.5285, + "step": 3137 + }, + { + "epoch": 0.21366376423308775, + "grad_norm": 0.1500924527645111, + "learning_rate": 0.0001, + "loss": 1.4794, + "step": 3138 + }, + { + "epoch": 0.21399866041527127, + "grad_norm": 0.14780132472515106, + "learning_rate": 0.0001, + "loss": 1.5283, + "step": 3139 + }, + { + "epoch": 0.2143335565974548, + "grad_norm": 0.14827805757522583, + "learning_rate": 0.0001, + "loss": 1.4136, + "step": 3140 + }, + { + "epoch": 0.2146684527796383, + "grad_norm": 0.14717240631580353, + "learning_rate": 0.0001, + "loss": 1.4812, + "step": 3141 + }, + { + "epoch": 0.21500334896182183, + "grad_norm": 0.15452304482460022, + "learning_rate": 0.0001, + "loss": 1.492, + "step": 3142 + }, + { + "epoch": 0.21533824514400535, + "grad_norm": 0.17032486200332642, + "learning_rate": 0.0001, + "loss": 1.5352, + "step": 3143 + }, + { + "epoch": 0.2156731413261889, + "grad_norm": 0.15467940270900726, + "learning_rate": 0.0001, + "loss": 1.5039, + "step": 3144 + }, + { + "epoch": 0.2160080375083724, + "grad_norm": 0.15712405741214752, + "learning_rate": 0.0001, + "loss": 1.507, + "step": 3145 + }, + { + "epoch": 0.21634293369055593, + "grad_norm": 0.158954918384552, + "learning_rate": 0.0001, + "loss": 1.5357, + "step": 3146 + }, + { + "epoch": 0.21667782987273945, + "grad_norm": 0.14702315628528595, + "learning_rate": 0.0001, + "loss": 1.5295, + "step": 3147 + }, + { + "epoch": 0.21701272605492297, + "grad_norm": 0.14634455740451813, + "learning_rate": 0.0001, + "loss": 1.4175, + "step": 3148 + }, + { + "epoch": 0.2173476222371065, + "grad_norm": 0.15392786264419556, + "learning_rate": 0.0001, + "loss": 1.5544, + "step": 3149 + }, + { + "epoch": 0.21768251841929, + "grad_norm": 0.16009333729743958, + "learning_rate": 0.0001, + "loss": 1.63, + "step": 3150 + }, + { + "epoch": 0.21801741460147353, + "grad_norm": 0.14513467252254486, + "learning_rate": 0.0001, + "loss": 1.4659, + "step": 3151 + }, + { + "epoch": 0.21835231078365708, + "grad_norm": 0.14836783707141876, + "learning_rate": 0.0001, + "loss": 1.4878, + "step": 3152 + }, + { + "epoch": 0.2186872069658406, + "grad_norm": 0.15242692828178406, + "learning_rate": 0.0001, + "loss": 1.4305, + "step": 3153 + }, + { + "epoch": 0.21902210314802412, + "grad_norm": 0.14825765788555145, + "learning_rate": 0.0001, + "loss": 1.4887, + "step": 3154 + }, + { + "epoch": 0.21935699933020764, + "grad_norm": 0.15620997548103333, + "learning_rate": 0.0001, + "loss": 1.5047, + "step": 3155 + }, + { + "epoch": 0.21969189551239116, + "grad_norm": 0.15094633400440216, + "learning_rate": 0.0001, + "loss": 1.4292, + "step": 3156 + }, + { + "epoch": 0.22002679169457467, + "grad_norm": 0.1550116240978241, + "learning_rate": 0.0001, + "loss": 1.4542, + "step": 3157 + }, + { + "epoch": 0.2203616878767582, + "grad_norm": 0.15815100073814392, + "learning_rate": 0.0001, + "loss": 1.5237, + "step": 3158 + }, + { + "epoch": 0.22069658405894174, + "grad_norm": 0.1517287939786911, + "learning_rate": 0.0001, + "loss": 1.4601, + "step": 3159 + }, + { + "epoch": 0.22103148024112526, + "grad_norm": 0.15398970246315002, + "learning_rate": 0.0001, + "loss": 1.4871, + "step": 3160 + }, + { + "epoch": 0.22136637642330878, + "grad_norm": 0.1565081775188446, + "learning_rate": 0.0001, + "loss": 1.4783, + "step": 3161 + }, + { + "epoch": 0.2217012726054923, + "grad_norm": 0.14842985570430756, + "learning_rate": 0.0001, + "loss": 1.3362, + "step": 3162 + }, + { + "epoch": 0.22203616878767582, + "grad_norm": 0.15767332911491394, + "learning_rate": 0.0001, + "loss": 1.4659, + "step": 3163 + }, + { + "epoch": 0.22237106496985934, + "grad_norm": 0.15569786727428436, + "learning_rate": 0.0001, + "loss": 1.5224, + "step": 3164 + }, + { + "epoch": 0.22270596115204286, + "grad_norm": 0.15435588359832764, + "learning_rate": 0.0001, + "loss": 1.5205, + "step": 3165 + }, + { + "epoch": 0.22304085733422638, + "grad_norm": 0.15310215950012207, + "learning_rate": 0.0001, + "loss": 1.4772, + "step": 3166 + }, + { + "epoch": 0.22337575351640993, + "grad_norm": 0.16117234528064728, + "learning_rate": 0.0001, + "loss": 1.4884, + "step": 3167 + }, + { + "epoch": 0.22371064969859344, + "grad_norm": 0.15367351472377777, + "learning_rate": 0.0001, + "loss": 1.5431, + "step": 3168 + }, + { + "epoch": 0.22404554588077696, + "grad_norm": 0.15488213300704956, + "learning_rate": 0.0001, + "loss": 1.4126, + "step": 3169 + }, + { + "epoch": 0.22438044206296048, + "grad_norm": 0.15359318256378174, + "learning_rate": 0.0001, + "loss": 1.5055, + "step": 3170 + }, + { + "epoch": 0.224715338245144, + "grad_norm": 0.14719818532466888, + "learning_rate": 0.0001, + "loss": 1.4483, + "step": 3171 + }, + { + "epoch": 0.22505023442732752, + "grad_norm": 0.15634144842624664, + "learning_rate": 0.0001, + "loss": 1.57, + "step": 3172 + }, + { + "epoch": 0.22538513060951104, + "grad_norm": 0.1440373957157135, + "learning_rate": 0.0001, + "loss": 1.4468, + "step": 3173 + }, + { + "epoch": 0.22572002679169456, + "grad_norm": 0.14679275453090668, + "learning_rate": 0.0001, + "loss": 1.4967, + "step": 3174 + }, + { + "epoch": 0.2260549229738781, + "grad_norm": 0.1491180658340454, + "learning_rate": 0.0001, + "loss": 1.5348, + "step": 3175 + }, + { + "epoch": 0.22638981915606163, + "grad_norm": 0.16220654547214508, + "learning_rate": 0.0001, + "loss": 1.51, + "step": 3176 + }, + { + "epoch": 0.22672471533824515, + "grad_norm": 0.14397957921028137, + "learning_rate": 0.0001, + "loss": 1.3946, + "step": 3177 + }, + { + "epoch": 0.22705961152042867, + "grad_norm": 0.1529865860939026, + "learning_rate": 0.0001, + "loss": 1.5672, + "step": 3178 + }, + { + "epoch": 0.2273945077026122, + "grad_norm": 0.14720121026039124, + "learning_rate": 0.0001, + "loss": 1.4336, + "step": 3179 + }, + { + "epoch": 0.2277294038847957, + "grad_norm": 0.14547964930534363, + "learning_rate": 0.0001, + "loss": 1.5099, + "step": 3180 + }, + { + "epoch": 0.22806430006697923, + "grad_norm": 0.15165194869041443, + "learning_rate": 0.0001, + "loss": 1.4584, + "step": 3181 + }, + { + "epoch": 0.22839919624916277, + "grad_norm": 0.14752137660980225, + "learning_rate": 0.0001, + "loss": 1.4914, + "step": 3182 + }, + { + "epoch": 0.2287340924313463, + "grad_norm": 0.142874076962471, + "learning_rate": 0.0001, + "loss": 1.4233, + "step": 3183 + }, + { + "epoch": 0.2290689886135298, + "grad_norm": 0.1451757699251175, + "learning_rate": 0.0001, + "loss": 1.5243, + "step": 3184 + }, + { + "epoch": 0.22940388479571333, + "grad_norm": 0.14648723602294922, + "learning_rate": 0.0001, + "loss": 1.4939, + "step": 3185 + }, + { + "epoch": 0.22973878097789685, + "grad_norm": 0.15457214415073395, + "learning_rate": 0.0001, + "loss": 1.5448, + "step": 3186 + }, + { + "epoch": 0.23007367716008037, + "grad_norm": 0.14498350024223328, + "learning_rate": 0.0001, + "loss": 1.4727, + "step": 3187 + }, + { + "epoch": 0.2304085733422639, + "grad_norm": 0.13952688872814178, + "learning_rate": 0.0001, + "loss": 1.3022, + "step": 3188 + }, + { + "epoch": 0.2307434695244474, + "grad_norm": 0.15119028091430664, + "learning_rate": 0.0001, + "loss": 1.5231, + "step": 3189 + }, + { + "epoch": 0.23107836570663096, + "grad_norm": 0.15895050764083862, + "learning_rate": 0.0001, + "loss": 1.5578, + "step": 3190 + }, + { + "epoch": 0.23141326188881448, + "grad_norm": 0.15064969658851624, + "learning_rate": 0.0001, + "loss": 1.4316, + "step": 3191 + }, + { + "epoch": 0.231748158070998, + "grad_norm": 0.14713798463344574, + "learning_rate": 0.0001, + "loss": 1.4909, + "step": 3192 + }, + { + "epoch": 0.23208305425318151, + "grad_norm": 0.17914965748786926, + "learning_rate": 0.0001, + "loss": 1.47, + "step": 3193 + }, + { + "epoch": 0.23241795043536503, + "grad_norm": 0.1502309888601303, + "learning_rate": 0.0001, + "loss": 1.4022, + "step": 3194 + }, + { + "epoch": 0.23275284661754855, + "grad_norm": 0.14691807329654694, + "learning_rate": 0.0001, + "loss": 1.4684, + "step": 3195 + }, + { + "epoch": 0.23308774279973207, + "grad_norm": 0.1478799283504486, + "learning_rate": 0.0001, + "loss": 1.4575, + "step": 3196 + }, + { + "epoch": 0.23342263898191562, + "grad_norm": 0.14784233272075653, + "learning_rate": 0.0001, + "loss": 1.3907, + "step": 3197 + }, + { + "epoch": 0.23375753516409914, + "grad_norm": 0.15338735282421112, + "learning_rate": 0.0001, + "loss": 1.5106, + "step": 3198 + }, + { + "epoch": 0.23409243134628266, + "grad_norm": 0.15106314420700073, + "learning_rate": 0.0001, + "loss": 1.5257, + "step": 3199 + }, + { + "epoch": 0.23442732752846618, + "grad_norm": 0.1492011845111847, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 3200 + }, + { + "epoch": 0.2347622237106497, + "grad_norm": 0.14995889365673065, + "learning_rate": 0.0001, + "loss": 1.4645, + "step": 3201 + }, + { + "epoch": 0.23509711989283322, + "grad_norm": 0.15579792857170105, + "learning_rate": 0.0001, + "loss": 1.5242, + "step": 3202 + }, + { + "epoch": 0.23543201607501674, + "grad_norm": 0.1529867798089981, + "learning_rate": 0.0001, + "loss": 1.3896, + "step": 3203 + }, + { + "epoch": 0.23576691225720026, + "grad_norm": 0.1536225974559784, + "learning_rate": 0.0001, + "loss": 1.4913, + "step": 3204 + }, + { + "epoch": 0.2361018084393838, + "grad_norm": 0.14973454177379608, + "learning_rate": 0.0001, + "loss": 1.4563, + "step": 3205 + }, + { + "epoch": 0.23643670462156732, + "grad_norm": 0.16134217381477356, + "learning_rate": 0.0001, + "loss": 1.5128, + "step": 3206 + }, + { + "epoch": 0.23677160080375084, + "grad_norm": 0.1515876054763794, + "learning_rate": 0.0001, + "loss": 1.5311, + "step": 3207 + }, + { + "epoch": 0.23710649698593436, + "grad_norm": 0.14970046281814575, + "learning_rate": 0.0001, + "loss": 1.5453, + "step": 3208 + }, + { + "epoch": 0.23744139316811788, + "grad_norm": 0.16384047269821167, + "learning_rate": 0.0001, + "loss": 1.4953, + "step": 3209 + }, + { + "epoch": 0.2377762893503014, + "grad_norm": 0.1614055335521698, + "learning_rate": 0.0001, + "loss": 1.3486, + "step": 3210 + }, + { + "epoch": 0.23811118553248492, + "grad_norm": 0.15903745591640472, + "learning_rate": 0.0001, + "loss": 1.5484, + "step": 3211 + }, + { + "epoch": 0.23844608171466844, + "grad_norm": 0.15176984667778015, + "learning_rate": 0.0001, + "loss": 1.4096, + "step": 3212 + }, + { + "epoch": 0.238780977896852, + "grad_norm": 0.1553134322166443, + "learning_rate": 0.0001, + "loss": 1.5071, + "step": 3213 + }, + { + "epoch": 0.2391158740790355, + "grad_norm": 0.15207169950008392, + "learning_rate": 0.0001, + "loss": 1.4846, + "step": 3214 + }, + { + "epoch": 0.23945077026121903, + "grad_norm": 0.14777179062366486, + "learning_rate": 0.0001, + "loss": 1.5013, + "step": 3215 + }, + { + "epoch": 0.23978566644340255, + "grad_norm": 0.15911854803562164, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 3216 + }, + { + "epoch": 0.24012056262558606, + "grad_norm": 0.16325682401657104, + "learning_rate": 0.0001, + "loss": 1.5035, + "step": 3217 + }, + { + "epoch": 0.24045545880776958, + "grad_norm": 0.1513979583978653, + "learning_rate": 0.0001, + "loss": 1.4521, + "step": 3218 + }, + { + "epoch": 0.2407903549899531, + "grad_norm": 0.16634339094161987, + "learning_rate": 0.0001, + "loss": 1.5492, + "step": 3219 + }, + { + "epoch": 0.24112525117213665, + "grad_norm": 0.15186385810375214, + "learning_rate": 0.0001, + "loss": 1.4537, + "step": 3220 + }, + { + "epoch": 0.24146014735432017, + "grad_norm": 0.15369641780853271, + "learning_rate": 0.0001, + "loss": 1.4837, + "step": 3221 + }, + { + "epoch": 0.2417950435365037, + "grad_norm": 0.15841279923915863, + "learning_rate": 0.0001, + "loss": 1.4912, + "step": 3222 + }, + { + "epoch": 0.2421299397186872, + "grad_norm": 0.15733489394187927, + "learning_rate": 0.0001, + "loss": 1.4796, + "step": 3223 + }, + { + "epoch": 0.24246483590087073, + "grad_norm": 0.14990831911563873, + "learning_rate": 0.0001, + "loss": 1.5174, + "step": 3224 + }, + { + "epoch": 0.24279973208305425, + "grad_norm": 0.1500232070684433, + "learning_rate": 0.0001, + "loss": 1.479, + "step": 3225 + }, + { + "epoch": 0.24313462826523777, + "grad_norm": 0.14930838346481323, + "learning_rate": 0.0001, + "loss": 1.4311, + "step": 3226 + }, + { + "epoch": 0.2434695244474213, + "grad_norm": 0.14843516051769257, + "learning_rate": 0.0001, + "loss": 1.4714, + "step": 3227 + }, + { + "epoch": 0.24380442062960483, + "grad_norm": 0.14314772188663483, + "learning_rate": 0.0001, + "loss": 1.4189, + "step": 3228 + }, + { + "epoch": 0.24413931681178835, + "grad_norm": 0.15701232850551605, + "learning_rate": 0.0001, + "loss": 1.5089, + "step": 3229 + }, + { + "epoch": 0.24447421299397187, + "grad_norm": 0.14976859092712402, + "learning_rate": 0.0001, + "loss": 1.4099, + "step": 3230 + }, + { + "epoch": 0.2448091091761554, + "grad_norm": 0.14823557436466217, + "learning_rate": 0.0001, + "loss": 1.5241, + "step": 3231 + }, + { + "epoch": 0.2451440053583389, + "grad_norm": 0.1463557332754135, + "learning_rate": 0.0001, + "loss": 1.4538, + "step": 3232 + }, + { + "epoch": 0.24547890154052243, + "grad_norm": 0.15305958688259125, + "learning_rate": 0.0001, + "loss": 1.5595, + "step": 3233 + }, + { + "epoch": 0.24581379772270595, + "grad_norm": 0.15024501085281372, + "learning_rate": 0.0001, + "loss": 1.4731, + "step": 3234 + }, + { + "epoch": 0.24614869390488947, + "grad_norm": 0.15059971809387207, + "learning_rate": 0.0001, + "loss": 1.5031, + "step": 3235 + }, + { + "epoch": 0.24648359008707302, + "grad_norm": 0.15700224041938782, + "learning_rate": 0.0001, + "loss": 1.518, + "step": 3236 + }, + { + "epoch": 0.24681848626925654, + "grad_norm": 0.1432359218597412, + "learning_rate": 0.0001, + "loss": 1.472, + "step": 3237 + }, + { + "epoch": 0.24715338245144006, + "grad_norm": 0.14348803460597992, + "learning_rate": 0.0001, + "loss": 1.4363, + "step": 3238 + }, + { + "epoch": 0.24748827863362358, + "grad_norm": 0.15466760098934174, + "learning_rate": 0.0001, + "loss": 1.4862, + "step": 3239 + }, + { + "epoch": 0.2478231748158071, + "grad_norm": 0.1510024070739746, + "learning_rate": 0.0001, + "loss": 1.5311, + "step": 3240 + }, + { + "epoch": 0.24815807099799062, + "grad_norm": 0.15169547498226166, + "learning_rate": 0.0001, + "loss": 1.5277, + "step": 3241 + }, + { + "epoch": 0.24849296718017413, + "grad_norm": 0.1525082141160965, + "learning_rate": 0.0001, + "loss": 1.4847, + "step": 3242 + }, + { + "epoch": 0.24882786336235768, + "grad_norm": 0.1485402137041092, + "learning_rate": 0.0001, + "loss": 1.5632, + "step": 3243 + }, + { + "epoch": 0.2491627595445412, + "grad_norm": 0.1473839432001114, + "learning_rate": 0.0001, + "loss": 1.4764, + "step": 3244 + }, + { + "epoch": 0.24949765572672472, + "grad_norm": 0.1522548496723175, + "learning_rate": 0.0001, + "loss": 1.6454, + "step": 3245 + }, + { + "epoch": 0.24983255190890824, + "grad_norm": 0.1536744087934494, + "learning_rate": 0.0001, + "loss": 1.479, + "step": 3246 + }, + { + "epoch": 0.25016744809109176, + "grad_norm": 0.14527705311775208, + "learning_rate": 0.0001, + "loss": 1.5183, + "step": 3247 + }, + { + "epoch": 0.2505023442732753, + "grad_norm": 0.1515124887228012, + "learning_rate": 0.0001, + "loss": 1.477, + "step": 3248 + }, + { + "epoch": 0.2508372404554588, + "grad_norm": 0.1502838134765625, + "learning_rate": 0.0001, + "loss": 1.4847, + "step": 3249 + }, + { + "epoch": 0.2511721366376423, + "grad_norm": 0.14173923432826996, + "learning_rate": 0.0001, + "loss": 1.4254, + "step": 3250 + }, + { + "epoch": 0.25150703281982584, + "grad_norm": 0.14899542927742004, + "learning_rate": 0.0001, + "loss": 1.4805, + "step": 3251 + }, + { + "epoch": 0.25184192900200936, + "grad_norm": 0.1492302268743515, + "learning_rate": 0.0001, + "loss": 1.5133, + "step": 3252 + }, + { + "epoch": 0.2521768251841929, + "grad_norm": 0.15040142834186554, + "learning_rate": 0.0001, + "loss": 1.4799, + "step": 3253 + }, + { + "epoch": 0.2525117213663764, + "grad_norm": 0.14641860127449036, + "learning_rate": 0.0001, + "loss": 1.4095, + "step": 3254 + }, + { + "epoch": 0.25284661754855997, + "grad_norm": 0.15745055675506592, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 3255 + }, + { + "epoch": 0.2531815137307435, + "grad_norm": 0.1484472006559372, + "learning_rate": 0.0001, + "loss": 1.5026, + "step": 3256 + }, + { + "epoch": 0.253516409912927, + "grad_norm": 0.14549176394939423, + "learning_rate": 0.0001, + "loss": 1.4881, + "step": 3257 + }, + { + "epoch": 0.25385130609511053, + "grad_norm": 0.15266196429729462, + "learning_rate": 0.0001, + "loss": 1.595, + "step": 3258 + }, + { + "epoch": 0.25418620227729405, + "grad_norm": 0.149305522441864, + "learning_rate": 0.0001, + "loss": 1.4982, + "step": 3259 + }, + { + "epoch": 0.25452109845947757, + "grad_norm": 0.14763863384723663, + "learning_rate": 0.0001, + "loss": 1.4299, + "step": 3260 + }, + { + "epoch": 0.2548559946416611, + "grad_norm": 0.15491554141044617, + "learning_rate": 0.0001, + "loss": 1.4403, + "step": 3261 + }, + { + "epoch": 0.2551908908238446, + "grad_norm": 0.14249491691589355, + "learning_rate": 0.0001, + "loss": 1.469, + "step": 3262 + }, + { + "epoch": 0.2555257870060281, + "grad_norm": 0.14917422831058502, + "learning_rate": 0.0001, + "loss": 1.401, + "step": 3263 + }, + { + "epoch": 0.25586068318821165, + "grad_norm": 0.14510908722877502, + "learning_rate": 0.0001, + "loss": 1.4544, + "step": 3264 + }, + { + "epoch": 0.25619557937039517, + "grad_norm": 0.15895402431488037, + "learning_rate": 0.0001, + "loss": 1.5992, + "step": 3265 + }, + { + "epoch": 0.2565304755525787, + "grad_norm": 0.15244704484939575, + "learning_rate": 0.0001, + "loss": 1.5201, + "step": 3266 + }, + { + "epoch": 0.2568653717347622, + "grad_norm": 0.15109996497631073, + "learning_rate": 0.0001, + "loss": 1.3982, + "step": 3267 + }, + { + "epoch": 0.2572002679169457, + "grad_norm": 0.15099941194057465, + "learning_rate": 0.0001, + "loss": 1.4949, + "step": 3268 + }, + { + "epoch": 0.25753516409912924, + "grad_norm": 0.15113183856010437, + "learning_rate": 0.0001, + "loss": 1.4185, + "step": 3269 + }, + { + "epoch": 0.2578700602813128, + "grad_norm": 0.15029966831207275, + "learning_rate": 0.0001, + "loss": 1.4401, + "step": 3270 + }, + { + "epoch": 0.25820495646349634, + "grad_norm": 0.15110653638839722, + "learning_rate": 0.0001, + "loss": 1.4606, + "step": 3271 + }, + { + "epoch": 0.25853985264567986, + "grad_norm": 0.14699125289916992, + "learning_rate": 0.0001, + "loss": 1.3752, + "step": 3272 + }, + { + "epoch": 0.2588747488278634, + "grad_norm": 0.1514248549938202, + "learning_rate": 0.0001, + "loss": 1.4327, + "step": 3273 + }, + { + "epoch": 0.2592096450100469, + "grad_norm": 0.14915835857391357, + "learning_rate": 0.0001, + "loss": 1.4541, + "step": 3274 + }, + { + "epoch": 0.2595445411922304, + "grad_norm": 0.1568090319633484, + "learning_rate": 0.0001, + "loss": 1.5556, + "step": 3275 + }, + { + "epoch": 0.25987943737441394, + "grad_norm": 0.15016591548919678, + "learning_rate": 0.0001, + "loss": 1.4523, + "step": 3276 + }, + { + "epoch": 0.26021433355659745, + "grad_norm": 0.14889879524707794, + "learning_rate": 0.0001, + "loss": 1.5291, + "step": 3277 + }, + { + "epoch": 0.260549229738781, + "grad_norm": 0.15212146937847137, + "learning_rate": 0.0001, + "loss": 1.4512, + "step": 3278 + }, + { + "epoch": 0.2608841259209645, + "grad_norm": 0.1520780622959137, + "learning_rate": 0.0001, + "loss": 1.4806, + "step": 3279 + }, + { + "epoch": 0.261219022103148, + "grad_norm": 0.14372298121452332, + "learning_rate": 0.0001, + "loss": 1.4424, + "step": 3280 + }, + { + "epoch": 0.26155391828533153, + "grad_norm": 0.14275674521923065, + "learning_rate": 0.0001, + "loss": 1.399, + "step": 3281 + }, + { + "epoch": 0.26188881446751505, + "grad_norm": 0.16535142064094543, + "learning_rate": 0.0001, + "loss": 1.6046, + "step": 3282 + }, + { + "epoch": 0.26222371064969857, + "grad_norm": 0.15277263522148132, + "learning_rate": 0.0001, + "loss": 1.4929, + "step": 3283 + }, + { + "epoch": 0.2625586068318821, + "grad_norm": 0.15697571635246277, + "learning_rate": 0.0001, + "loss": 1.5432, + "step": 3284 + }, + { + "epoch": 0.26289350301406567, + "grad_norm": 0.1490306705236435, + "learning_rate": 0.0001, + "loss": 1.5217, + "step": 3285 + }, + { + "epoch": 0.2632283991962492, + "grad_norm": 0.144150048494339, + "learning_rate": 0.0001, + "loss": 1.3888, + "step": 3286 + }, + { + "epoch": 0.2635632953784327, + "grad_norm": 0.14768853783607483, + "learning_rate": 0.0001, + "loss": 1.434, + "step": 3287 + }, + { + "epoch": 0.2638981915606162, + "grad_norm": 0.15402550995349884, + "learning_rate": 0.0001, + "loss": 1.4649, + "step": 3288 + }, + { + "epoch": 0.26423308774279974, + "grad_norm": 0.14135251939296722, + "learning_rate": 0.0001, + "loss": 1.3352, + "step": 3289 + }, + { + "epoch": 0.26456798392498326, + "grad_norm": 0.14637348055839539, + "learning_rate": 0.0001, + "loss": 1.4109, + "step": 3290 + }, + { + "epoch": 0.2649028801071668, + "grad_norm": 0.16085872054100037, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 3291 + }, + { + "epoch": 0.2652377762893503, + "grad_norm": 0.1754620373249054, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 3292 + }, + { + "epoch": 0.2655726724715338, + "grad_norm": 0.15829911828041077, + "learning_rate": 0.0001, + "loss": 1.5336, + "step": 3293 + }, + { + "epoch": 0.26590756865371734, + "grad_norm": 0.15245473384857178, + "learning_rate": 0.0001, + "loss": 1.3699, + "step": 3294 + }, + { + "epoch": 0.26624246483590086, + "grad_norm": 0.16027551889419556, + "learning_rate": 0.0001, + "loss": 1.5331, + "step": 3295 + }, + { + "epoch": 0.2665773610180844, + "grad_norm": 0.1470411866903305, + "learning_rate": 0.0001, + "loss": 1.447, + "step": 3296 + }, + { + "epoch": 0.2669122572002679, + "grad_norm": 0.15567657351493835, + "learning_rate": 0.0001, + "loss": 1.4364, + "step": 3297 + }, + { + "epoch": 0.2672471533824514, + "grad_norm": 0.15527381002902985, + "learning_rate": 0.0001, + "loss": 1.5166, + "step": 3298 + }, + { + "epoch": 0.26758204956463494, + "grad_norm": 0.14668625593185425, + "learning_rate": 0.0001, + "loss": 1.5355, + "step": 3299 + }, + { + "epoch": 0.2679169457468185, + "grad_norm": 0.15117013454437256, + "learning_rate": 0.0001, + "loss": 1.4292, + "step": 3300 + }, + { + "epoch": 0.26825184192900203, + "grad_norm": 0.15294520556926727, + "learning_rate": 0.0001, + "loss": 1.4845, + "step": 3301 + }, + { + "epoch": 0.26858673811118555, + "grad_norm": 0.15113379061222076, + "learning_rate": 0.0001, + "loss": 1.4522, + "step": 3302 + }, + { + "epoch": 0.26892163429336907, + "grad_norm": 0.14972607791423798, + "learning_rate": 0.0001, + "loss": 1.5287, + "step": 3303 + }, + { + "epoch": 0.2692565304755526, + "grad_norm": 0.15561367571353912, + "learning_rate": 0.0001, + "loss": 1.5022, + "step": 3304 + }, + { + "epoch": 0.2695914266577361, + "grad_norm": 0.15228398144245148, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 3305 + }, + { + "epoch": 0.26992632283991963, + "grad_norm": 0.15206193923950195, + "learning_rate": 0.0001, + "loss": 1.4691, + "step": 3306 + }, + { + "epoch": 0.27026121902210315, + "grad_norm": 0.15127262473106384, + "learning_rate": 0.0001, + "loss": 1.4826, + "step": 3307 + }, + { + "epoch": 0.27059611520428667, + "grad_norm": 0.14557790756225586, + "learning_rate": 0.0001, + "loss": 1.5215, + "step": 3308 + }, + { + "epoch": 0.2709310113864702, + "grad_norm": 0.15763740241527557, + "learning_rate": 0.0001, + "loss": 1.5212, + "step": 3309 + }, + { + "epoch": 0.2712659075686537, + "grad_norm": 0.1479531228542328, + "learning_rate": 0.0001, + "loss": 1.4819, + "step": 3310 + }, + { + "epoch": 0.2716008037508372, + "grad_norm": 0.14689850807189941, + "learning_rate": 0.0001, + "loss": 1.4905, + "step": 3311 + }, + { + "epoch": 0.27193569993302075, + "grad_norm": 0.15508733689785004, + "learning_rate": 0.0001, + "loss": 1.5279, + "step": 3312 + }, + { + "epoch": 0.27227059611520427, + "grad_norm": 0.1534416526556015, + "learning_rate": 0.0001, + "loss": 1.4621, + "step": 3313 + }, + { + "epoch": 0.2726054922973878, + "grad_norm": 0.14797250926494598, + "learning_rate": 0.0001, + "loss": 1.429, + "step": 3314 + }, + { + "epoch": 0.2729403884795713, + "grad_norm": 0.15446336567401886, + "learning_rate": 0.0001, + "loss": 1.4702, + "step": 3315 + }, + { + "epoch": 0.2732752846617549, + "grad_norm": 0.1508505940437317, + "learning_rate": 0.0001, + "loss": 1.5452, + "step": 3316 + }, + { + "epoch": 0.2736101808439384, + "grad_norm": 0.15143181383609772, + "learning_rate": 0.0001, + "loss": 1.4926, + "step": 3317 + }, + { + "epoch": 0.2739450770261219, + "grad_norm": 0.15116530656814575, + "learning_rate": 0.0001, + "loss": 1.4111, + "step": 3318 + }, + { + "epoch": 0.27427997320830544, + "grad_norm": 0.1512419730424881, + "learning_rate": 0.0001, + "loss": 1.4943, + "step": 3319 + }, + { + "epoch": 0.27461486939048896, + "grad_norm": 0.1462402641773224, + "learning_rate": 0.0001, + "loss": 1.5575, + "step": 3320 + }, + { + "epoch": 0.2749497655726725, + "grad_norm": 0.1448834389448166, + "learning_rate": 0.0001, + "loss": 1.423, + "step": 3321 + }, + { + "epoch": 0.275284661754856, + "grad_norm": 0.1444164365530014, + "learning_rate": 0.0001, + "loss": 1.4009, + "step": 3322 + }, + { + "epoch": 0.2756195579370395, + "grad_norm": 0.14190183579921722, + "learning_rate": 0.0001, + "loss": 1.4412, + "step": 3323 + }, + { + "epoch": 0.27595445411922304, + "grad_norm": 0.144227534532547, + "learning_rate": 0.0001, + "loss": 1.4297, + "step": 3324 + }, + { + "epoch": 0.27628935030140656, + "grad_norm": 0.146895632147789, + "learning_rate": 0.0001, + "loss": 1.4689, + "step": 3325 + }, + { + "epoch": 0.2766242464835901, + "grad_norm": 0.15591838955879211, + "learning_rate": 0.0001, + "loss": 1.4893, + "step": 3326 + }, + { + "epoch": 0.2769591426657736, + "grad_norm": 0.15250559151172638, + "learning_rate": 0.0001, + "loss": 1.519, + "step": 3327 + }, + { + "epoch": 0.2772940388479571, + "grad_norm": 0.14862549304962158, + "learning_rate": 0.0001, + "loss": 1.5158, + "step": 3328 + }, + { + "epoch": 0.27762893503014063, + "grad_norm": 0.1499813199043274, + "learning_rate": 0.0001, + "loss": 1.4556, + "step": 3329 + }, + { + "epoch": 0.27796383121232415, + "grad_norm": 0.15060973167419434, + "learning_rate": 0.0001, + "loss": 1.4599, + "step": 3330 + }, + { + "epoch": 0.27829872739450773, + "grad_norm": 0.14788423478603363, + "learning_rate": 0.0001, + "loss": 1.4697, + "step": 3331 + }, + { + "epoch": 0.27863362357669125, + "grad_norm": 0.14984434843063354, + "learning_rate": 0.0001, + "loss": 1.4535, + "step": 3332 + }, + { + "epoch": 0.27896851975887477, + "grad_norm": 0.15621770918369293, + "learning_rate": 0.0001, + "loss": 1.6125, + "step": 3333 + }, + { + "epoch": 0.2793034159410583, + "grad_norm": 0.15792222321033478, + "learning_rate": 0.0001, + "loss": 1.5877, + "step": 3334 + }, + { + "epoch": 0.2796383121232418, + "grad_norm": 0.15121862292289734, + "learning_rate": 0.0001, + "loss": 1.4266, + "step": 3335 + }, + { + "epoch": 0.2799732083054253, + "grad_norm": 0.15701161324977875, + "learning_rate": 0.0001, + "loss": 1.4681, + "step": 3336 + }, + { + "epoch": 0.28030810448760884, + "grad_norm": 0.14958228170871735, + "learning_rate": 0.0001, + "loss": 1.4815, + "step": 3337 + }, + { + "epoch": 0.28064300066979236, + "grad_norm": 0.14307574927806854, + "learning_rate": 0.0001, + "loss": 1.4184, + "step": 3338 + }, + { + "epoch": 0.2809778968519759, + "grad_norm": 0.1685991883277893, + "learning_rate": 0.0001, + "loss": 1.5861, + "step": 3339 + }, + { + "epoch": 0.2813127930341594, + "grad_norm": 0.144222691655159, + "learning_rate": 0.0001, + "loss": 1.4452, + "step": 3340 + }, + { + "epoch": 0.2816476892163429, + "grad_norm": 0.14686745405197144, + "learning_rate": 0.0001, + "loss": 1.4747, + "step": 3341 + }, + { + "epoch": 0.28198258539852644, + "grad_norm": 0.15471439063549042, + "learning_rate": 0.0001, + "loss": 1.4458, + "step": 3342 + }, + { + "epoch": 0.28231748158070996, + "grad_norm": 0.1510482132434845, + "learning_rate": 0.0001, + "loss": 1.4489, + "step": 3343 + }, + { + "epoch": 0.2826523777628935, + "grad_norm": 0.14976145327091217, + "learning_rate": 0.0001, + "loss": 1.3924, + "step": 3344 + }, + { + "epoch": 0.282987273945077, + "grad_norm": 0.15701331198215485, + "learning_rate": 0.0001, + "loss": 1.5704, + "step": 3345 + }, + { + "epoch": 0.2833221701272606, + "grad_norm": 0.14849096536636353, + "learning_rate": 0.0001, + "loss": 1.4541, + "step": 3346 + }, + { + "epoch": 0.2836570663094441, + "grad_norm": 0.1467583030462265, + "learning_rate": 0.0001, + "loss": 1.4102, + "step": 3347 + }, + { + "epoch": 0.2839919624916276, + "grad_norm": 0.1470215618610382, + "learning_rate": 0.0001, + "loss": 1.5082, + "step": 3348 + }, + { + "epoch": 0.28432685867381113, + "grad_norm": 0.14759418368339539, + "learning_rate": 0.0001, + "loss": 1.5136, + "step": 3349 + }, + { + "epoch": 0.28466175485599465, + "grad_norm": 0.15090933442115784, + "learning_rate": 0.0001, + "loss": 1.4585, + "step": 3350 + }, + { + "epoch": 0.2849966510381782, + "grad_norm": 0.15021245181560516, + "learning_rate": 0.0001, + "loss": 1.5347, + "step": 3351 + }, + { + "epoch": 0.2853315472203617, + "grad_norm": 0.14365991950035095, + "learning_rate": 0.0001, + "loss": 1.4526, + "step": 3352 + }, + { + "epoch": 0.2856664434025452, + "grad_norm": 0.14594535529613495, + "learning_rate": 0.0001, + "loss": 1.4779, + "step": 3353 + }, + { + "epoch": 0.28600133958472873, + "grad_norm": 0.15110114216804504, + "learning_rate": 0.0001, + "loss": 1.5095, + "step": 3354 + }, + { + "epoch": 0.28633623576691225, + "grad_norm": 0.1482629030942917, + "learning_rate": 0.0001, + "loss": 1.5056, + "step": 3355 + }, + { + "epoch": 0.28667113194909577, + "grad_norm": 0.1559736132621765, + "learning_rate": 0.0001, + "loss": 1.4846, + "step": 3356 + }, + { + "epoch": 0.2870060281312793, + "grad_norm": 0.14480571448802948, + "learning_rate": 0.0001, + "loss": 1.423, + "step": 3357 + }, + { + "epoch": 0.2873409243134628, + "grad_norm": 0.14984071254730225, + "learning_rate": 0.0001, + "loss": 1.4767, + "step": 3358 + }, + { + "epoch": 0.28767582049564633, + "grad_norm": 0.15247821807861328, + "learning_rate": 0.0001, + "loss": 1.5178, + "step": 3359 + }, + { + "epoch": 0.28801071667782985, + "grad_norm": 0.1461106687784195, + "learning_rate": 0.0001, + "loss": 1.4228, + "step": 3360 + }, + { + "epoch": 0.2883456128600134, + "grad_norm": 0.1496150642633438, + "learning_rate": 0.0001, + "loss": 1.4189, + "step": 3361 + }, + { + "epoch": 0.28868050904219694, + "grad_norm": 0.16044890880584717, + "learning_rate": 0.0001, + "loss": 1.5286, + "step": 3362 + }, + { + "epoch": 0.28901540522438046, + "grad_norm": 0.14375263452529907, + "learning_rate": 0.0001, + "loss": 1.4536, + "step": 3363 + }, + { + "epoch": 0.289350301406564, + "grad_norm": 0.14571574330329895, + "learning_rate": 0.0001, + "loss": 1.5264, + "step": 3364 + }, + { + "epoch": 0.2896851975887475, + "grad_norm": 0.14890584349632263, + "learning_rate": 0.0001, + "loss": 1.4577, + "step": 3365 + }, + { + "epoch": 0.290020093770931, + "grad_norm": 0.1530643105506897, + "learning_rate": 0.0001, + "loss": 1.5208, + "step": 3366 + }, + { + "epoch": 0.29035498995311454, + "grad_norm": 0.15293872356414795, + "learning_rate": 0.0001, + "loss": 1.4503, + "step": 3367 + }, + { + "epoch": 0.29068988613529806, + "grad_norm": 0.15317441523075104, + "learning_rate": 0.0001, + "loss": 1.5716, + "step": 3368 + }, + { + "epoch": 0.2910247823174816, + "grad_norm": 0.1484214812517166, + "learning_rate": 0.0001, + "loss": 1.4718, + "step": 3369 + }, + { + "epoch": 0.2913596784996651, + "grad_norm": 0.15597650408744812, + "learning_rate": 0.0001, + "loss": 1.4813, + "step": 3370 + }, + { + "epoch": 0.2916945746818486, + "grad_norm": 0.14535312354564667, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 3371 + }, + { + "epoch": 0.29202947086403214, + "grad_norm": 0.15586082637310028, + "learning_rate": 0.0001, + "loss": 1.5989, + "step": 3372 + }, + { + "epoch": 0.29236436704621566, + "grad_norm": 0.1462761014699936, + "learning_rate": 0.0001, + "loss": 1.3777, + "step": 3373 + }, + { + "epoch": 0.2926992632283992, + "grad_norm": 0.15477129817008972, + "learning_rate": 0.0001, + "loss": 1.5025, + "step": 3374 + }, + { + "epoch": 0.2930341594105827, + "grad_norm": 0.15040694177150726, + "learning_rate": 0.0001, + "loss": 1.4232, + "step": 3375 + }, + { + "epoch": 0.29336905559276627, + "grad_norm": 0.14610585570335388, + "learning_rate": 0.0001, + "loss": 1.4791, + "step": 3376 + }, + { + "epoch": 0.2937039517749498, + "grad_norm": 0.1511322408914566, + "learning_rate": 0.0001, + "loss": 1.5075, + "step": 3377 + }, + { + "epoch": 0.2940388479571333, + "grad_norm": 0.15154308080673218, + "learning_rate": 0.0001, + "loss": 1.5467, + "step": 3378 + }, + { + "epoch": 0.29437374413931683, + "grad_norm": 0.1455857902765274, + "learning_rate": 0.0001, + "loss": 1.5215, + "step": 3379 + }, + { + "epoch": 0.29470864032150035, + "grad_norm": 0.15827228128910065, + "learning_rate": 0.0001, + "loss": 1.5196, + "step": 3380 + }, + { + "epoch": 0.29504353650368387, + "grad_norm": 0.15383900701999664, + "learning_rate": 0.0001, + "loss": 1.561, + "step": 3381 + }, + { + "epoch": 0.2953784326858674, + "grad_norm": 0.14216770231723785, + "learning_rate": 0.0001, + "loss": 1.3841, + "step": 3382 + }, + { + "epoch": 0.2957133288680509, + "grad_norm": 0.15554247796535492, + "learning_rate": 0.0001, + "loss": 1.3912, + "step": 3383 + }, + { + "epoch": 0.2960482250502344, + "grad_norm": 0.15362423658370972, + "learning_rate": 0.0001, + "loss": 1.5188, + "step": 3384 + }, + { + "epoch": 0.29638312123241795, + "grad_norm": 0.15400175750255585, + "learning_rate": 0.0001, + "loss": 1.4673, + "step": 3385 + }, + { + "epoch": 0.29671801741460146, + "grad_norm": 0.15115594863891602, + "learning_rate": 0.0001, + "loss": 1.4775, + "step": 3386 + }, + { + "epoch": 0.297052913596785, + "grad_norm": 0.14465996623039246, + "learning_rate": 0.0001, + "loss": 1.4302, + "step": 3387 + }, + { + "epoch": 0.2973878097789685, + "grad_norm": 0.1533147394657135, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 3388 + }, + { + "epoch": 0.297722705961152, + "grad_norm": 0.1591259390115738, + "learning_rate": 0.0001, + "loss": 1.5124, + "step": 3389 + }, + { + "epoch": 0.29805760214333554, + "grad_norm": 0.15579280257225037, + "learning_rate": 0.0001, + "loss": 1.5061, + "step": 3390 + }, + { + "epoch": 0.29839249832551906, + "grad_norm": 0.15362292528152466, + "learning_rate": 0.0001, + "loss": 1.5147, + "step": 3391 + }, + { + "epoch": 0.29872739450770264, + "grad_norm": 0.16088083386421204, + "learning_rate": 0.0001, + "loss": 1.4932, + "step": 3392 + }, + { + "epoch": 0.29906229068988616, + "grad_norm": 0.15710797905921936, + "learning_rate": 0.0001, + "loss": 1.4535, + "step": 3393 + }, + { + "epoch": 0.2993971868720697, + "grad_norm": 0.14893610775470734, + "learning_rate": 0.0001, + "loss": 1.4464, + "step": 3394 + }, + { + "epoch": 0.2997320830542532, + "grad_norm": 0.1565815806388855, + "learning_rate": 0.0001, + "loss": 1.4875, + "step": 3395 + }, + { + "epoch": 0.3000669792364367, + "grad_norm": 0.14962182939052582, + "learning_rate": 0.0001, + "loss": 1.4647, + "step": 3396 + }, + { + "epoch": 0.30040187541862023, + "grad_norm": 0.14253853261470795, + "learning_rate": 0.0001, + "loss": 1.4166, + "step": 3397 + }, + { + "epoch": 0.30073677160080375, + "grad_norm": 0.13929632306098938, + "learning_rate": 0.0001, + "loss": 1.3655, + "step": 3398 + }, + { + "epoch": 0.3010716677829873, + "grad_norm": 0.1565397083759308, + "learning_rate": 0.0001, + "loss": 1.5822, + "step": 3399 + }, + { + "epoch": 0.3014065639651708, + "grad_norm": 0.15752102434635162, + "learning_rate": 0.0001, + "loss": 1.5227, + "step": 3400 + }, + { + "epoch": 0.3017414601473543, + "grad_norm": 0.15279462933540344, + "learning_rate": 0.0001, + "loss": 1.4461, + "step": 3401 + }, + { + "epoch": 0.30207635632953783, + "grad_norm": 0.14432330429553986, + "learning_rate": 0.0001, + "loss": 1.5224, + "step": 3402 + }, + { + "epoch": 0.30241125251172135, + "grad_norm": 0.1433674395084381, + "learning_rate": 0.0001, + "loss": 1.364, + "step": 3403 + }, + { + "epoch": 0.30274614869390487, + "grad_norm": 0.14757895469665527, + "learning_rate": 0.0001, + "loss": 1.5272, + "step": 3404 + }, + { + "epoch": 0.3030810448760884, + "grad_norm": 0.14946860074996948, + "learning_rate": 0.0001, + "loss": 1.4806, + "step": 3405 + }, + { + "epoch": 0.3034159410582719, + "grad_norm": 0.146403968334198, + "learning_rate": 0.0001, + "loss": 1.4675, + "step": 3406 + }, + { + "epoch": 0.3037508372404555, + "grad_norm": 0.15197888016700745, + "learning_rate": 0.0001, + "loss": 1.3588, + "step": 3407 + }, + { + "epoch": 0.304085733422639, + "grad_norm": 0.14806897938251495, + "learning_rate": 0.0001, + "loss": 1.4443, + "step": 3408 + }, + { + "epoch": 0.3044206296048225, + "grad_norm": 0.14273184537887573, + "learning_rate": 0.0001, + "loss": 1.4078, + "step": 3409 + }, + { + "epoch": 0.30475552578700604, + "grad_norm": 0.1511409729719162, + "learning_rate": 0.0001, + "loss": 1.4823, + "step": 3410 + }, + { + "epoch": 0.30509042196918956, + "grad_norm": 0.15107282996177673, + "learning_rate": 0.0001, + "loss": 1.4685, + "step": 3411 + }, + { + "epoch": 0.3054253181513731, + "grad_norm": 0.14973436295986176, + "learning_rate": 0.0001, + "loss": 1.5428, + "step": 3412 + }, + { + "epoch": 0.3057602143335566, + "grad_norm": 0.15601438283920288, + "learning_rate": 0.0001, + "loss": 1.5471, + "step": 3413 + }, + { + "epoch": 0.3060951105157401, + "grad_norm": 0.15377770364284515, + "learning_rate": 0.0001, + "loss": 1.4574, + "step": 3414 + }, + { + "epoch": 0.30643000669792364, + "grad_norm": 0.14803405106067657, + "learning_rate": 0.0001, + "loss": 1.4975, + "step": 3415 + }, + { + "epoch": 0.30676490288010716, + "grad_norm": 0.15249978005886078, + "learning_rate": 0.0001, + "loss": 1.3705, + "step": 3416 + }, + { + "epoch": 0.3070997990622907, + "grad_norm": 0.15126726031303406, + "learning_rate": 0.0001, + "loss": 1.4297, + "step": 3417 + }, + { + "epoch": 0.3074346952444742, + "grad_norm": 0.14738833904266357, + "learning_rate": 0.0001, + "loss": 1.3943, + "step": 3418 + }, + { + "epoch": 0.3077695914266577, + "grad_norm": 0.1609349548816681, + "learning_rate": 0.0001, + "loss": 1.491, + "step": 3419 + }, + { + "epoch": 0.30810448760884124, + "grad_norm": 0.14774426817893982, + "learning_rate": 0.0001, + "loss": 1.4311, + "step": 3420 + }, + { + "epoch": 0.30843938379102476, + "grad_norm": 0.1546633392572403, + "learning_rate": 0.0001, + "loss": 1.4578, + "step": 3421 + }, + { + "epoch": 0.30877427997320833, + "grad_norm": 0.15193051099777222, + "learning_rate": 0.0001, + "loss": 1.4649, + "step": 3422 + }, + { + "epoch": 0.30910917615539185, + "grad_norm": 0.15336847305297852, + "learning_rate": 0.0001, + "loss": 1.6061, + "step": 3423 + }, + { + "epoch": 0.30944407233757537, + "grad_norm": 0.14489242434501648, + "learning_rate": 0.0001, + "loss": 1.468, + "step": 3424 + }, + { + "epoch": 0.3097789685197589, + "grad_norm": 0.15843623876571655, + "learning_rate": 0.0001, + "loss": 1.4897, + "step": 3425 + }, + { + "epoch": 0.3101138647019424, + "grad_norm": 0.15312214195728302, + "learning_rate": 0.0001, + "loss": 1.4869, + "step": 3426 + }, + { + "epoch": 0.31044876088412593, + "grad_norm": 0.15313860774040222, + "learning_rate": 0.0001, + "loss": 1.4169, + "step": 3427 + }, + { + "epoch": 0.31078365706630945, + "grad_norm": 0.15683285892009735, + "learning_rate": 0.0001, + "loss": 1.4916, + "step": 3428 + }, + { + "epoch": 0.31111855324849297, + "grad_norm": 0.15352609753608704, + "learning_rate": 0.0001, + "loss": 1.4138, + "step": 3429 + }, + { + "epoch": 0.3114534494306765, + "grad_norm": 0.16370314359664917, + "learning_rate": 0.0001, + "loss": 1.5103, + "step": 3430 + }, + { + "epoch": 0.31178834561286, + "grad_norm": 0.15114983916282654, + "learning_rate": 0.0001, + "loss": 1.4491, + "step": 3431 + }, + { + "epoch": 0.3121232417950435, + "grad_norm": 0.15531513094902039, + "learning_rate": 0.0001, + "loss": 1.5423, + "step": 3432 + }, + { + "epoch": 0.31245813797722705, + "grad_norm": 0.14949560165405273, + "learning_rate": 0.0001, + "loss": 1.4947, + "step": 3433 + }, + { + "epoch": 0.31279303415941057, + "grad_norm": 0.14659979939460754, + "learning_rate": 0.0001, + "loss": 1.4453, + "step": 3434 + }, + { + "epoch": 0.3131279303415941, + "grad_norm": 0.147125706076622, + "learning_rate": 0.0001, + "loss": 1.474, + "step": 3435 + }, + { + "epoch": 0.3134628265237776, + "grad_norm": 0.1630190759897232, + "learning_rate": 0.0001, + "loss": 1.5404, + "step": 3436 + }, + { + "epoch": 0.3137977227059612, + "grad_norm": 0.1427680253982544, + "learning_rate": 0.0001, + "loss": 1.3876, + "step": 3437 + }, + { + "epoch": 0.3141326188881447, + "grad_norm": 0.15487343072891235, + "learning_rate": 0.0001, + "loss": 1.4852, + "step": 3438 + }, + { + "epoch": 0.3144675150703282, + "grad_norm": 0.15448251366615295, + "learning_rate": 0.0001, + "loss": 1.4936, + "step": 3439 + }, + { + "epoch": 0.31480241125251174, + "grad_norm": 0.14923791587352753, + "learning_rate": 0.0001, + "loss": 1.5198, + "step": 3440 + }, + { + "epoch": 0.31513730743469526, + "grad_norm": 0.15218065679073334, + "learning_rate": 0.0001, + "loss": 1.4689, + "step": 3441 + }, + { + "epoch": 0.3154722036168788, + "grad_norm": 0.15223555266857147, + "learning_rate": 0.0001, + "loss": 1.5374, + "step": 3442 + }, + { + "epoch": 0.3158070997990623, + "grad_norm": 0.14683569967746735, + "learning_rate": 0.0001, + "loss": 1.5073, + "step": 3443 + }, + { + "epoch": 0.3161419959812458, + "grad_norm": 0.152296245098114, + "learning_rate": 0.0001, + "loss": 1.3803, + "step": 3444 + }, + { + "epoch": 0.31647689216342934, + "grad_norm": 0.149747833609581, + "learning_rate": 0.0001, + "loss": 1.4896, + "step": 3445 + }, + { + "epoch": 0.31681178834561285, + "grad_norm": 0.153005912899971, + "learning_rate": 0.0001, + "loss": 1.5106, + "step": 3446 + }, + { + "epoch": 0.3171466845277964, + "grad_norm": 0.15335361659526825, + "learning_rate": 0.0001, + "loss": 1.4979, + "step": 3447 + }, + { + "epoch": 0.3174815807099799, + "grad_norm": 0.15364506840705872, + "learning_rate": 0.0001, + "loss": 1.5059, + "step": 3448 + }, + { + "epoch": 0.3178164768921634, + "grad_norm": 0.15296977758407593, + "learning_rate": 0.0001, + "loss": 1.5079, + "step": 3449 + }, + { + "epoch": 0.31815137307434693, + "grad_norm": 0.1484413892030716, + "learning_rate": 0.0001, + "loss": 1.3828, + "step": 3450 + }, + { + "epoch": 0.31848626925653045, + "grad_norm": 0.16037903726100922, + "learning_rate": 0.0001, + "loss": 1.4062, + "step": 3451 + }, + { + "epoch": 0.31882116543871397, + "grad_norm": 0.15443116426467896, + "learning_rate": 0.0001, + "loss": 1.4373, + "step": 3452 + }, + { + "epoch": 0.31915606162089755, + "grad_norm": 0.1552223265171051, + "learning_rate": 0.0001, + "loss": 1.5142, + "step": 3453 + }, + { + "epoch": 0.31949095780308107, + "grad_norm": 0.14638473093509674, + "learning_rate": 0.0001, + "loss": 1.439, + "step": 3454 + }, + { + "epoch": 0.3198258539852646, + "grad_norm": 0.14468824863433838, + "learning_rate": 0.0001, + "loss": 1.401, + "step": 3455 + }, + { + "epoch": 0.3201607501674481, + "grad_norm": 0.15405170619487762, + "learning_rate": 0.0001, + "loss": 1.4824, + "step": 3456 + }, + { + "epoch": 0.3204956463496316, + "grad_norm": 0.15160967409610748, + "learning_rate": 0.0001, + "loss": 1.4575, + "step": 3457 + }, + { + "epoch": 0.32083054253181514, + "grad_norm": 0.1501069813966751, + "learning_rate": 0.0001, + "loss": 1.4433, + "step": 3458 + }, + { + "epoch": 0.32116543871399866, + "grad_norm": 0.15485960245132446, + "learning_rate": 0.0001, + "loss": 1.4768, + "step": 3459 + }, + { + "epoch": 0.3215003348961822, + "grad_norm": 0.16001489758491516, + "learning_rate": 0.0001, + "loss": 1.456, + "step": 3460 + }, + { + "epoch": 0.3218352310783657, + "grad_norm": 0.15336276590824127, + "learning_rate": 0.0001, + "loss": 1.5219, + "step": 3461 + }, + { + "epoch": 0.3221701272605492, + "grad_norm": 0.15447048842906952, + "learning_rate": 0.0001, + "loss": 1.4807, + "step": 3462 + }, + { + "epoch": 0.32250502344273274, + "grad_norm": 0.15134736895561218, + "learning_rate": 0.0001, + "loss": 1.4368, + "step": 3463 + }, + { + "epoch": 0.32283991962491626, + "grad_norm": 0.1590096652507782, + "learning_rate": 0.0001, + "loss": 1.4353, + "step": 3464 + }, + { + "epoch": 0.3231748158070998, + "grad_norm": 0.15310245752334595, + "learning_rate": 0.0001, + "loss": 1.5494, + "step": 3465 + }, + { + "epoch": 0.3235097119892833, + "grad_norm": 0.14869531989097595, + "learning_rate": 0.0001, + "loss": 1.4673, + "step": 3466 + }, + { + "epoch": 0.3238446081714668, + "grad_norm": 0.15101325511932373, + "learning_rate": 0.0001, + "loss": 1.4633, + "step": 3467 + }, + { + "epoch": 0.3241795043536504, + "grad_norm": 0.1566777378320694, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 3468 + }, + { + "epoch": 0.3245144005358339, + "grad_norm": 0.1482858955860138, + "learning_rate": 0.0001, + "loss": 1.3693, + "step": 3469 + }, + { + "epoch": 0.32484929671801743, + "grad_norm": 0.15713654458522797, + "learning_rate": 0.0001, + "loss": 1.5532, + "step": 3470 + }, + { + "epoch": 0.32518419290020095, + "grad_norm": 0.15125976502895355, + "learning_rate": 0.0001, + "loss": 1.5104, + "step": 3471 + }, + { + "epoch": 0.32551908908238447, + "grad_norm": 0.1499818116426468, + "learning_rate": 0.0001, + "loss": 1.378, + "step": 3472 + }, + { + "epoch": 0.325853985264568, + "grad_norm": 0.16104479134082794, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 3473 + }, + { + "epoch": 0.3261888814467515, + "grad_norm": 0.15305879712104797, + "learning_rate": 0.0001, + "loss": 1.4882, + "step": 3474 + }, + { + "epoch": 0.32652377762893503, + "grad_norm": 0.14504723250865936, + "learning_rate": 0.0001, + "loss": 1.4882, + "step": 3475 + }, + { + "epoch": 0.32685867381111855, + "grad_norm": 0.15625393390655518, + "learning_rate": 0.0001, + "loss": 1.546, + "step": 3476 + }, + { + "epoch": 0.32719356999330207, + "grad_norm": 0.1602199822664261, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 3477 + }, + { + "epoch": 0.3275284661754856, + "grad_norm": 0.14354944229125977, + "learning_rate": 0.0001, + "loss": 1.4286, + "step": 3478 + }, + { + "epoch": 0.3278633623576691, + "grad_norm": 0.15154190361499786, + "learning_rate": 0.0001, + "loss": 1.4966, + "step": 3479 + }, + { + "epoch": 0.3281982585398526, + "grad_norm": 0.15154610574245453, + "learning_rate": 0.0001, + "loss": 1.5694, + "step": 3480 + }, + { + "epoch": 0.32853315472203615, + "grad_norm": 0.15371978282928467, + "learning_rate": 0.0001, + "loss": 1.4273, + "step": 3481 + }, + { + "epoch": 0.32886805090421967, + "grad_norm": 0.15090040862560272, + "learning_rate": 0.0001, + "loss": 1.4088, + "step": 3482 + }, + { + "epoch": 0.32920294708640324, + "grad_norm": 0.15387658774852753, + "learning_rate": 0.0001, + "loss": 1.5402, + "step": 3483 + }, + { + "epoch": 0.32953784326858676, + "grad_norm": 0.15785494446754456, + "learning_rate": 0.0001, + "loss": 1.5964, + "step": 3484 + }, + { + "epoch": 0.3298727394507703, + "grad_norm": 0.15188738703727722, + "learning_rate": 0.0001, + "loss": 1.527, + "step": 3485 + }, + { + "epoch": 0.3302076356329538, + "grad_norm": 0.14585669338703156, + "learning_rate": 0.0001, + "loss": 1.3841, + "step": 3486 + }, + { + "epoch": 0.3305425318151373, + "grad_norm": 0.15196645259857178, + "learning_rate": 0.0001, + "loss": 1.3851, + "step": 3487 + }, + { + "epoch": 0.33087742799732084, + "grad_norm": 0.1498725563287735, + "learning_rate": 0.0001, + "loss": 1.5218, + "step": 3488 + }, + { + "epoch": 0.33121232417950436, + "grad_norm": 0.15197834372520447, + "learning_rate": 0.0001, + "loss": 1.4579, + "step": 3489 + }, + { + "epoch": 0.3315472203616879, + "grad_norm": 0.15742060542106628, + "learning_rate": 0.0001, + "loss": 1.5561, + "step": 3490 + }, + { + "epoch": 0.3318821165438714, + "grad_norm": 0.15078939497470856, + "learning_rate": 0.0001, + "loss": 1.4181, + "step": 3491 + }, + { + "epoch": 0.3322170127260549, + "grad_norm": 0.1436023712158203, + "learning_rate": 0.0001, + "loss": 1.4586, + "step": 3492 + }, + { + "epoch": 0.33255190890823844, + "grad_norm": 0.14628535509109497, + "learning_rate": 0.0001, + "loss": 1.4207, + "step": 3493 + }, + { + "epoch": 0.33288680509042196, + "grad_norm": 0.16081663966178894, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 3494 + }, + { + "epoch": 0.3332217012726055, + "grad_norm": 0.1588294953107834, + "learning_rate": 0.0001, + "loss": 1.5263, + "step": 3495 + }, + { + "epoch": 0.333556597454789, + "grad_norm": 0.14988917112350464, + "learning_rate": 0.0001, + "loss": 1.4401, + "step": 3496 + }, + { + "epoch": 0.3338914936369725, + "grad_norm": 0.17191949486732483, + "learning_rate": 0.0001, + "loss": 1.6032, + "step": 3497 + }, + { + "epoch": 0.3342263898191561, + "grad_norm": 0.14784646034240723, + "learning_rate": 0.0001, + "loss": 1.4044, + "step": 3498 + }, + { + "epoch": 0.3345612860013396, + "grad_norm": 0.1579027622938156, + "learning_rate": 0.0001, + "loss": 1.4487, + "step": 3499 + }, + { + "epoch": 0.33489618218352313, + "grad_norm": 0.15323099493980408, + "learning_rate": 0.0001, + "loss": 1.4331, + "step": 3500 + } + ], + "logging_steps": 1, + "max_steps": 5972, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.986928858418381e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}