diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14137 @@ +{ + "best_metric": 1.6265630154840407, + "best_model_checkpoint": "train/20241111-Compress:256x-Lr:5e-5-Llama3-8B-instruct-GPT2-Large-RAG-no-ft_token-onlySquad-everymem/checkpoint-2000", + "epoch": 2.9482218536944904, + "eval_steps": 250, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014741109268472453, + "grad_norm": 10.542601492422415, + "learning_rate": 5.000000000000001e-07, + "loss": 4.9379, + "step": 1 + }, + { + "epoch": 0.0029482218536944905, + "grad_norm": 14.75158889747135, + "learning_rate": 1.0000000000000002e-06, + "loss": 5.1736, + "step": 2 + }, + { + "epoch": 0.004422332780541736, + "grad_norm": 20.766239292080247, + "learning_rate": 1.5e-06, + "loss": 4.988, + "step": 3 + }, + { + "epoch": 0.005896443707388981, + "grad_norm": 22.19482163527959, + "learning_rate": 2.0000000000000003e-06, + "loss": 5.1685, + "step": 4 + }, + { + "epoch": 0.0073705546342362266, + "grad_norm": 11.413242986921624, + "learning_rate": 2.5e-06, + "loss": 5.2236, + "step": 5 + }, + { + "epoch": 0.008844665561083471, + "grad_norm": 31.813613722216566, + "learning_rate": 3e-06, + "loss": 5.0986, + "step": 6 + }, + { + "epoch": 0.010318776487930717, + "grad_norm": 11.019348121373584, + "learning_rate": 3.5000000000000004e-06, + "loss": 5.3992, + "step": 7 + }, + { + "epoch": 0.011792887414777962, + "grad_norm": 13.5247891619426, + "learning_rate": 4.000000000000001e-06, + "loss": 5.0985, + "step": 8 + }, + { + "epoch": 0.013266998341625208, + "grad_norm": 14.064692754390002, + "learning_rate": 4.5e-06, + "loss": 5.1665, + "step": 9 + }, + { + "epoch": 0.014741109268472453, + "grad_norm": 19.06002641481889, + "learning_rate": 5e-06, + "loss": 4.9754, + "step": 10 + }, + { + "epoch": 0.016215220195319697, + "grad_norm": 28.46141572477718, + "learning_rate": 5.500000000000001e-06, + "loss": 4.9661, + "step": 11 + }, + { + "epoch": 0.017689331122166942, + "grad_norm": 14.223456574227844, + "learning_rate": 6e-06, + "loss": 5.0477, + "step": 12 + }, + { + "epoch": 0.019163442049014188, + "grad_norm": 10.541714955899053, + "learning_rate": 6.5000000000000004e-06, + "loss": 4.9983, + "step": 13 + }, + { + "epoch": 0.020637552975861433, + "grad_norm": 10.403206961696505, + "learning_rate": 7.000000000000001e-06, + "loss": 4.538, + "step": 14 + }, + { + "epoch": 0.02211166390270868, + "grad_norm": 9.667787640349095, + "learning_rate": 7.5e-06, + "loss": 4.3433, + "step": 15 + }, + { + "epoch": 0.023585774829555924, + "grad_norm": 34.74159281731877, + "learning_rate": 8.000000000000001e-06, + "loss": 4.2695, + "step": 16 + }, + { + "epoch": 0.02505988575640317, + "grad_norm": 8.544322523623235, + "learning_rate": 8.500000000000002e-06, + "loss": 4.1162, + "step": 17 + }, + { + "epoch": 0.026533996683250415, + "grad_norm": 12.138609494258457, + "learning_rate": 9e-06, + "loss": 3.7167, + "step": 18 + }, + { + "epoch": 0.02800810761009766, + "grad_norm": 7.109522555989333, + "learning_rate": 9.5e-06, + "loss": 3.8958, + "step": 19 + }, + { + "epoch": 0.029482218536944906, + "grad_norm": 12.117353722264145, + "learning_rate": 1e-05, + "loss": 3.6133, + "step": 20 + }, + { + "epoch": 0.03095632946379215, + "grad_norm": 8.98839965901471, + "learning_rate": 1.05e-05, + "loss": 3.7009, + "step": 21 + }, + { + "epoch": 0.032430440390639394, + "grad_norm": 27.510298586535054, + "learning_rate": 1.1000000000000001e-05, + "loss": 3.5887, + "step": 22 + }, + { + "epoch": 0.03390455131748664, + "grad_norm": 9.121228418438392, + "learning_rate": 1.1500000000000002e-05, + "loss": 3.5404, + "step": 23 + }, + { + "epoch": 0.035378662244333885, + "grad_norm": 30.216555431881524, + "learning_rate": 1.2e-05, + "loss": 3.5466, + "step": 24 + }, + { + "epoch": 0.03685277317118113, + "grad_norm": 40.91363510016404, + "learning_rate": 1.25e-05, + "loss": 3.2227, + "step": 25 + }, + { + "epoch": 0.038326884098028376, + "grad_norm": 14.123070509116236, + "learning_rate": 1.3000000000000001e-05, + "loss": 3.2812, + "step": 26 + }, + { + "epoch": 0.03980099502487562, + "grad_norm": 28.9019232403691, + "learning_rate": 1.3500000000000001e-05, + "loss": 3.257, + "step": 27 + }, + { + "epoch": 0.04127510595172287, + "grad_norm": 8.806093741682783, + "learning_rate": 1.4000000000000001e-05, + "loss": 3.2704, + "step": 28 + }, + { + "epoch": 0.04274921687857011, + "grad_norm": 10.635808495985684, + "learning_rate": 1.45e-05, + "loss": 3.1303, + "step": 29 + }, + { + "epoch": 0.04422332780541736, + "grad_norm": 12.7978741787882, + "learning_rate": 1.5e-05, + "loss": 2.8083, + "step": 30 + }, + { + "epoch": 0.0456974387322646, + "grad_norm": 7.577938242705195, + "learning_rate": 1.55e-05, + "loss": 2.7483, + "step": 31 + }, + { + "epoch": 0.04717154965911185, + "grad_norm": 5.686714537323279, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.8328, + "step": 32 + }, + { + "epoch": 0.048645660585959094, + "grad_norm": 5.251588853465192, + "learning_rate": 1.65e-05, + "loss": 2.4911, + "step": 33 + }, + { + "epoch": 0.05011977151280634, + "grad_norm": 5.799491518027747, + "learning_rate": 1.7000000000000003e-05, + "loss": 2.6598, + "step": 34 + }, + { + "epoch": 0.051593882439653585, + "grad_norm": 49.994274574538785, + "learning_rate": 1.75e-05, + "loss": 2.5952, + "step": 35 + }, + { + "epoch": 0.05306799336650083, + "grad_norm": 3.706841386569642, + "learning_rate": 1.8e-05, + "loss": 2.3438, + "step": 36 + }, + { + "epoch": 0.054542104293348076, + "grad_norm": 3.9279312850784254, + "learning_rate": 1.85e-05, + "loss": 2.2941, + "step": 37 + }, + { + "epoch": 0.05601621522019532, + "grad_norm": 3.305465137801589, + "learning_rate": 1.9e-05, + "loss": 2.1947, + "step": 38 + }, + { + "epoch": 0.05749032614704257, + "grad_norm": 4.047149295421852, + "learning_rate": 1.9500000000000003e-05, + "loss": 2.2379, + "step": 39 + }, + { + "epoch": 0.05896443707388981, + "grad_norm": 4.849153126192219, + "learning_rate": 2e-05, + "loss": 1.9444, + "step": 40 + }, + { + "epoch": 0.06043854800073706, + "grad_norm": 3.9283157290189785, + "learning_rate": 2.05e-05, + "loss": 1.9679, + "step": 41 + }, + { + "epoch": 0.0619126589275843, + "grad_norm": 3.55358453763254, + "learning_rate": 2.1e-05, + "loss": 2.0119, + "step": 42 + }, + { + "epoch": 0.06338676985443155, + "grad_norm": 3.1384102523002255, + "learning_rate": 2.15e-05, + "loss": 1.9532, + "step": 43 + }, + { + "epoch": 0.06486088078127879, + "grad_norm": 3.728170691671425, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.9814, + "step": 44 + }, + { + "epoch": 0.06633499170812604, + "grad_norm": 2.759380467583325, + "learning_rate": 2.25e-05, + "loss": 1.8839, + "step": 45 + }, + { + "epoch": 0.06780910263497328, + "grad_norm": 2.6363244786239415, + "learning_rate": 2.3000000000000003e-05, + "loss": 1.8761, + "step": 46 + }, + { + "epoch": 0.06928321356182053, + "grad_norm": 2.371999752900238, + "learning_rate": 2.35e-05, + "loss": 1.8118, + "step": 47 + }, + { + "epoch": 0.07075732448866777, + "grad_norm": 3.5700949304177927, + "learning_rate": 2.4e-05, + "loss": 1.7577, + "step": 48 + }, + { + "epoch": 0.07223143541551502, + "grad_norm": 2.9866840157969983, + "learning_rate": 2.45e-05, + "loss": 1.9868, + "step": 49 + }, + { + "epoch": 0.07370554634236226, + "grad_norm": 2.7323796430573015, + "learning_rate": 2.5e-05, + "loss": 1.9494, + "step": 50 + }, + { + "epoch": 0.07517965726920951, + "grad_norm": 2.4284875979500526, + "learning_rate": 2.5500000000000003e-05, + "loss": 1.871, + "step": 51 + }, + { + "epoch": 0.07665376819605675, + "grad_norm": 3.2890906253436727, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.8587, + "step": 52 + }, + { + "epoch": 0.078127879122904, + "grad_norm": 2.312546497599205, + "learning_rate": 2.6500000000000004e-05, + "loss": 1.6874, + "step": 53 + }, + { + "epoch": 0.07960199004975124, + "grad_norm": 2.6813059554230123, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.904, + "step": 54 + }, + { + "epoch": 0.0810761009765985, + "grad_norm": 2.080791397660076, + "learning_rate": 2.7500000000000004e-05, + "loss": 1.8092, + "step": 55 + }, + { + "epoch": 0.08255021190344573, + "grad_norm": 2.350351522411094, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.8377, + "step": 56 + }, + { + "epoch": 0.08402432283029299, + "grad_norm": 2.374558759157098, + "learning_rate": 2.8499999999999998e-05, + "loss": 1.9329, + "step": 57 + }, + { + "epoch": 0.08549843375714022, + "grad_norm": 2.447574532049086, + "learning_rate": 2.9e-05, + "loss": 1.7067, + "step": 58 + }, + { + "epoch": 0.08697254468398748, + "grad_norm": 2.2704482729088618, + "learning_rate": 2.95e-05, + "loss": 1.6713, + "step": 59 + }, + { + "epoch": 0.08844665561083472, + "grad_norm": 2.0926121139115943, + "learning_rate": 3e-05, + "loss": 1.5644, + "step": 60 + }, + { + "epoch": 0.08992076653768195, + "grad_norm": 2.3963622020865016, + "learning_rate": 3.05e-05, + "loss": 1.8546, + "step": 61 + }, + { + "epoch": 0.0913948774645292, + "grad_norm": 2.196258124052599, + "learning_rate": 3.1e-05, + "loss": 1.841, + "step": 62 + }, + { + "epoch": 0.09286898839137644, + "grad_norm": 2.270958876566279, + "learning_rate": 3.15e-05, + "loss": 1.6501, + "step": 63 + }, + { + "epoch": 0.0943430993182237, + "grad_norm": 2.601987603541407, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.532, + "step": 64 + }, + { + "epoch": 0.09581721024507094, + "grad_norm": 2.068880327158985, + "learning_rate": 3.2500000000000004e-05, + "loss": 1.7656, + "step": 65 + }, + { + "epoch": 0.09729132117191819, + "grad_norm": 2.3192701470932087, + "learning_rate": 3.3e-05, + "loss": 1.8471, + "step": 66 + }, + { + "epoch": 0.09876543209876543, + "grad_norm": 2.412193389541428, + "learning_rate": 3.35e-05, + "loss": 1.7321, + "step": 67 + }, + { + "epoch": 0.10023954302561268, + "grad_norm": 3.005819715767446, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.6546, + "step": 68 + }, + { + "epoch": 0.10171365395245992, + "grad_norm": 2.144864082565192, + "learning_rate": 3.45e-05, + "loss": 1.6561, + "step": 69 + }, + { + "epoch": 0.10318776487930717, + "grad_norm": 2.423850158542077, + "learning_rate": 3.5e-05, + "loss": 1.8283, + "step": 70 + }, + { + "epoch": 0.10466187580615441, + "grad_norm": 2.319801145778886, + "learning_rate": 3.55e-05, + "loss": 1.7837, + "step": 71 + }, + { + "epoch": 0.10613598673300166, + "grad_norm": 2.06578138166346, + "learning_rate": 3.6e-05, + "loss": 1.7814, + "step": 72 + }, + { + "epoch": 0.1076100976598489, + "grad_norm": 1.9773213735314357, + "learning_rate": 3.65e-05, + "loss": 1.777, + "step": 73 + }, + { + "epoch": 0.10908420858669615, + "grad_norm": 1.8238764897198516, + "learning_rate": 3.7e-05, + "loss": 1.6219, + "step": 74 + }, + { + "epoch": 0.11055831951354339, + "grad_norm": 2.1520342318708696, + "learning_rate": 3.7500000000000003e-05, + "loss": 1.7022, + "step": 75 + }, + { + "epoch": 0.11203243044039064, + "grad_norm": 1.9571210341882685, + "learning_rate": 3.8e-05, + "loss": 1.6719, + "step": 76 + }, + { + "epoch": 0.11350654136723788, + "grad_norm": 2.258248890873436, + "learning_rate": 3.85e-05, + "loss": 1.6456, + "step": 77 + }, + { + "epoch": 0.11498065229408513, + "grad_norm": 1.9365640040624643, + "learning_rate": 3.9000000000000006e-05, + "loss": 1.5789, + "step": 78 + }, + { + "epoch": 0.11645476322093237, + "grad_norm": 2.4701209788483003, + "learning_rate": 3.9500000000000005e-05, + "loss": 1.9057, + "step": 79 + }, + { + "epoch": 0.11792887414777962, + "grad_norm": 2.4384415593077433, + "learning_rate": 4e-05, + "loss": 1.6999, + "step": 80 + }, + { + "epoch": 0.11940298507462686, + "grad_norm": 2.487784966483082, + "learning_rate": 4.05e-05, + "loss": 1.8003, + "step": 81 + }, + { + "epoch": 0.12087709600147412, + "grad_norm": 2.184200086671485, + "learning_rate": 4.1e-05, + "loss": 1.7446, + "step": 82 + }, + { + "epoch": 0.12235120692832135, + "grad_norm": 2.1855229844329296, + "learning_rate": 4.15e-05, + "loss": 1.8108, + "step": 83 + }, + { + "epoch": 0.1238253178551686, + "grad_norm": 2.1906026362911493, + "learning_rate": 4.2e-05, + "loss": 1.6437, + "step": 84 + }, + { + "epoch": 0.12529942878201586, + "grad_norm": 2.3199766795860777, + "learning_rate": 4.25e-05, + "loss": 1.6714, + "step": 85 + }, + { + "epoch": 0.1267735397088631, + "grad_norm": 2.343977141499948, + "learning_rate": 4.3e-05, + "loss": 1.6882, + "step": 86 + }, + { + "epoch": 0.12824765063571034, + "grad_norm": 2.201401814430496, + "learning_rate": 4.35e-05, + "loss": 1.6077, + "step": 87 + }, + { + "epoch": 0.12972176156255757, + "grad_norm": 1.8836813540439896, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.4839, + "step": 88 + }, + { + "epoch": 0.13119587248940484, + "grad_norm": 2.071602243274418, + "learning_rate": 4.4500000000000004e-05, + "loss": 1.6349, + "step": 89 + }, + { + "epoch": 0.13266998341625208, + "grad_norm": 2.178031812122008, + "learning_rate": 4.5e-05, + "loss": 1.508, + "step": 90 + }, + { + "epoch": 0.13414409434309932, + "grad_norm": 2.4650430009352697, + "learning_rate": 4.55e-05, + "loss": 1.5345, + "step": 91 + }, + { + "epoch": 0.13561820526994656, + "grad_norm": 2.2376626110126336, + "learning_rate": 4.600000000000001e-05, + "loss": 1.8138, + "step": 92 + }, + { + "epoch": 0.1370923161967938, + "grad_norm": 1.848529568956504, + "learning_rate": 4.6500000000000005e-05, + "loss": 1.7583, + "step": 93 + }, + { + "epoch": 0.13856642712364106, + "grad_norm": 2.3004335616761007, + "learning_rate": 4.7e-05, + "loss": 1.6402, + "step": 94 + }, + { + "epoch": 0.1400405380504883, + "grad_norm": 2.048771918355873, + "learning_rate": 4.75e-05, + "loss": 1.5049, + "step": 95 + }, + { + "epoch": 0.14151464897733554, + "grad_norm": 2.212046853767831, + "learning_rate": 4.8e-05, + "loss": 1.709, + "step": 96 + }, + { + "epoch": 0.14298875990418278, + "grad_norm": 2.131450287924872, + "learning_rate": 4.85e-05, + "loss": 1.5567, + "step": 97 + }, + { + "epoch": 0.14446287083103004, + "grad_norm": 2.298086249237732, + "learning_rate": 4.9e-05, + "loss": 1.63, + "step": 98 + }, + { + "epoch": 0.14593698175787728, + "grad_norm": 2.1815599718613368, + "learning_rate": 4.9500000000000004e-05, + "loss": 1.8168, + "step": 99 + }, + { + "epoch": 0.14741109268472452, + "grad_norm": 1.8790682843739575, + "learning_rate": 5e-05, + "loss": 1.6481, + "step": 100 + }, + { + "epoch": 0.14888520361157176, + "grad_norm": 1.8522093081747166, + "learning_rate": 4.999999216450553e-05, + "loss": 1.689, + "step": 101 + }, + { + "epoch": 0.15035931453841903, + "grad_norm": 2.536841445178404, + "learning_rate": 4.9999968658027006e-05, + "loss": 1.614, + "step": 102 + }, + { + "epoch": 0.15183342546526626, + "grad_norm": 2.3043916156729503, + "learning_rate": 4.999992948057919e-05, + "loss": 1.6039, + "step": 103 + }, + { + "epoch": 0.1533075363921135, + "grad_norm": 2.1947569228942054, + "learning_rate": 4.999987463218663e-05, + "loss": 1.6672, + "step": 104 + }, + { + "epoch": 0.15478164731896074, + "grad_norm": 2.1080649369677635, + "learning_rate": 4.9999804112883694e-05, + "loss": 1.6218, + "step": 105 + }, + { + "epoch": 0.156255758245808, + "grad_norm": 2.2353676678305248, + "learning_rate": 4.99997179227146e-05, + "loss": 1.6265, + "step": 106 + }, + { + "epoch": 0.15772986917265525, + "grad_norm": 2.1160965815101758, + "learning_rate": 4.999961606173337e-05, + "loss": 1.6612, + "step": 107 + }, + { + "epoch": 0.15920398009950248, + "grad_norm": 1.8458162787735273, + "learning_rate": 4.9999498530003866e-05, + "loss": 1.5433, + "step": 108 + }, + { + "epoch": 0.16067809102634972, + "grad_norm": 2.0363236664899396, + "learning_rate": 4.999936532759974e-05, + "loss": 1.7881, + "step": 109 + }, + { + "epoch": 0.162152201953197, + "grad_norm": 1.852527801366834, + "learning_rate": 4.9999216454604505e-05, + "loss": 1.5593, + "step": 110 + }, + { + "epoch": 0.16362631288004423, + "grad_norm": 2.230435078862805, + "learning_rate": 4.9999051911111484e-05, + "loss": 1.5841, + "step": 111 + }, + { + "epoch": 0.16510042380689147, + "grad_norm": 2.1978031026660187, + "learning_rate": 4.99988716972238e-05, + "loss": 1.5882, + "step": 112 + }, + { + "epoch": 0.1665745347337387, + "grad_norm": 1.9271774956319956, + "learning_rate": 4.999867581305444e-05, + "loss": 1.5627, + "step": 113 + }, + { + "epoch": 0.16804864566058597, + "grad_norm": 2.062486937510423, + "learning_rate": 4.9998464258726174e-05, + "loss": 1.7414, + "step": 114 + }, + { + "epoch": 0.1695227565874332, + "grad_norm": 1.8279360159980338, + "learning_rate": 4.999823703437162e-05, + "loss": 1.5979, + "step": 115 + }, + { + "epoch": 0.17099686751428045, + "grad_norm": 2.039134407967887, + "learning_rate": 4.999799414013322e-05, + "loss": 1.6692, + "step": 116 + }, + { + "epoch": 0.1724709784411277, + "grad_norm": 1.8610128674110982, + "learning_rate": 4.9997735576163215e-05, + "loss": 1.4458, + "step": 117 + }, + { + "epoch": 0.17394508936797495, + "grad_norm": 2.033024878113718, + "learning_rate": 4.9997461342623686e-05, + "loss": 1.5339, + "step": 118 + }, + { + "epoch": 0.1754192002948222, + "grad_norm": 2.2969587661104565, + "learning_rate": 4.999717143968654e-05, + "loss": 1.4748, + "step": 119 + }, + { + "epoch": 0.17689331122166943, + "grad_norm": 2.0766775496898267, + "learning_rate": 4.9996865867533496e-05, + "loss": 1.4835, + "step": 120 + }, + { + "epoch": 0.17836742214851667, + "grad_norm": 1.9920823611770595, + "learning_rate": 4.99965446263561e-05, + "loss": 1.5707, + "step": 121 + }, + { + "epoch": 0.1798415330753639, + "grad_norm": 1.8183865816598839, + "learning_rate": 4.9996207716355726e-05, + "loss": 1.5999, + "step": 122 + }, + { + "epoch": 0.18131564400221117, + "grad_norm": 2.044071753136424, + "learning_rate": 4.999585513774354e-05, + "loss": 1.7161, + "step": 123 + }, + { + "epoch": 0.1827897549290584, + "grad_norm": 1.9475531485360498, + "learning_rate": 4.9995486890740573e-05, + "loss": 1.5239, + "step": 124 + }, + { + "epoch": 0.18426386585590565, + "grad_norm": 1.8695379171798558, + "learning_rate": 4.9995102975577655e-05, + "loss": 1.496, + "step": 125 + }, + { + "epoch": 0.1857379767827529, + "grad_norm": 2.07468536933046, + "learning_rate": 4.999470339249543e-05, + "loss": 1.4883, + "step": 126 + }, + { + "epoch": 0.18721208770960016, + "grad_norm": 2.057945192732373, + "learning_rate": 4.9994288141744374e-05, + "loss": 1.4464, + "step": 127 + }, + { + "epoch": 0.1886861986364474, + "grad_norm": 2.2233408324512176, + "learning_rate": 4.999385722358479e-05, + "loss": 1.7203, + "step": 128 + }, + { + "epoch": 0.19016030956329463, + "grad_norm": 1.8766122879678175, + "learning_rate": 4.999341063828679e-05, + "loss": 1.5164, + "step": 129 + }, + { + "epoch": 0.19163442049014187, + "grad_norm": 2.0143590692499154, + "learning_rate": 4.9992948386130315e-05, + "loss": 1.5847, + "step": 130 + }, + { + "epoch": 0.19310853141698914, + "grad_norm": 2.2516484050259615, + "learning_rate": 4.9992470467405104e-05, + "loss": 1.5919, + "step": 131 + }, + { + "epoch": 0.19458264234383638, + "grad_norm": 1.8031131763830892, + "learning_rate": 4.999197688241076e-05, + "loss": 1.6906, + "step": 132 + }, + { + "epoch": 0.19605675327068361, + "grad_norm": 2.029461939306091, + "learning_rate": 4.999146763145668e-05, + "loss": 1.4806, + "step": 133 + }, + { + "epoch": 0.19753086419753085, + "grad_norm": 2.2177739750178236, + "learning_rate": 4.9990942714862066e-05, + "loss": 1.5786, + "step": 134 + }, + { + "epoch": 0.19900497512437812, + "grad_norm": 1.8300716110976656, + "learning_rate": 4.999040213295597e-05, + "loss": 1.4284, + "step": 135 + }, + { + "epoch": 0.20047908605122536, + "grad_norm": 2.1550993129566107, + "learning_rate": 4.9989845886077246e-05, + "loss": 1.6816, + "step": 136 + }, + { + "epoch": 0.2019531969780726, + "grad_norm": 1.8417871663069272, + "learning_rate": 4.9989273974574566e-05, + "loss": 1.3767, + "step": 137 + }, + { + "epoch": 0.20342730790491984, + "grad_norm": 2.192083189610869, + "learning_rate": 4.998868639880644e-05, + "loss": 1.6876, + "step": 138 + }, + { + "epoch": 0.2049014188317671, + "grad_norm": 2.3202672883326696, + "learning_rate": 4.998808315914117e-05, + "loss": 1.6607, + "step": 139 + }, + { + "epoch": 0.20637552975861434, + "grad_norm": 2.1634097440339533, + "learning_rate": 4.9987464255956894e-05, + "loss": 1.4282, + "step": 140 + }, + { + "epoch": 0.20784964068546158, + "grad_norm": 2.060724939544029, + "learning_rate": 4.9986829689641574e-05, + "loss": 1.4832, + "step": 141 + }, + { + "epoch": 0.20932375161230882, + "grad_norm": 2.0560524471658854, + "learning_rate": 4.998617946059297e-05, + "loss": 1.6022, + "step": 142 + }, + { + "epoch": 0.21079786253915608, + "grad_norm": 2.5000191687802715, + "learning_rate": 4.998551356921868e-05, + "loss": 1.7717, + "step": 143 + }, + { + "epoch": 0.21227197346600332, + "grad_norm": 2.3369231425139625, + "learning_rate": 4.99848320159361e-05, + "loss": 1.5764, + "step": 144 + }, + { + "epoch": 0.21374608439285056, + "grad_norm": 2.1573902377147407, + "learning_rate": 4.9984134801172464e-05, + "loss": 1.5507, + "step": 145 + }, + { + "epoch": 0.2152201953196978, + "grad_norm": 2.00797826179197, + "learning_rate": 4.998342192536482e-05, + "loss": 1.422, + "step": 146 + }, + { + "epoch": 0.21669430624654507, + "grad_norm": 2.3310063089017783, + "learning_rate": 4.998269338896e-05, + "loss": 1.6247, + "step": 147 + }, + { + "epoch": 0.2181684171733923, + "grad_norm": 1.9432428833058097, + "learning_rate": 4.998194919241471e-05, + "loss": 1.5054, + "step": 148 + }, + { + "epoch": 0.21964252810023954, + "grad_norm": 1.9152441620114702, + "learning_rate": 4.9981189336195425e-05, + "loss": 1.49, + "step": 149 + }, + { + "epoch": 0.22111663902708678, + "grad_norm": 2.026637547852652, + "learning_rate": 4.998041382077846e-05, + "loss": 1.6851, + "step": 150 + }, + { + "epoch": 0.22259074995393405, + "grad_norm": 1.7738780008145096, + "learning_rate": 4.9979622646649935e-05, + "loss": 1.4543, + "step": 151 + }, + { + "epoch": 0.22406486088078129, + "grad_norm": 2.0729515673384666, + "learning_rate": 4.997881581430579e-05, + "loss": 1.6819, + "step": 152 + }, + { + "epoch": 0.22553897180762852, + "grad_norm": 1.7773047138999618, + "learning_rate": 4.997799332425178e-05, + "loss": 1.6475, + "step": 153 + }, + { + "epoch": 0.22701308273447576, + "grad_norm": 1.5539139399810504, + "learning_rate": 4.997715517700347e-05, + "loss": 1.4423, + "step": 154 + }, + { + "epoch": 0.228487193661323, + "grad_norm": 1.920641123104727, + "learning_rate": 4.9976301373086254e-05, + "loss": 1.5398, + "step": 155 + }, + { + "epoch": 0.22996130458817027, + "grad_norm": 2.0395080547318667, + "learning_rate": 4.997543191303532e-05, + "loss": 1.6276, + "step": 156 + }, + { + "epoch": 0.2314354155150175, + "grad_norm": 2.092941925343976, + "learning_rate": 4.9974546797395685e-05, + "loss": 1.6567, + "step": 157 + }, + { + "epoch": 0.23290952644186474, + "grad_norm": 1.9710414206702205, + "learning_rate": 4.9973646026722166e-05, + "loss": 1.3883, + "step": 158 + }, + { + "epoch": 0.23438363736871198, + "grad_norm": 1.964366999096162, + "learning_rate": 4.997272960157942e-05, + "loss": 1.4694, + "step": 159 + }, + { + "epoch": 0.23585774829555925, + "grad_norm": 2.0563750208731255, + "learning_rate": 4.997179752254188e-05, + "loss": 1.5317, + "step": 160 + }, + { + "epoch": 0.2373318592224065, + "grad_norm": 2.0723551336259245, + "learning_rate": 4.997084979019382e-05, + "loss": 1.6037, + "step": 161 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 1.9570127322601956, + "learning_rate": 4.996988640512931e-05, + "loss": 1.5421, + "step": 162 + }, + { + "epoch": 0.24028008107610097, + "grad_norm": 1.898225309342246, + "learning_rate": 4.9968907367952245e-05, + "loss": 1.6293, + "step": 163 + }, + { + "epoch": 0.24175419200294823, + "grad_norm": 1.909058362613203, + "learning_rate": 4.9967912679276316e-05, + "loss": 1.5222, + "step": 164 + }, + { + "epoch": 0.24322830292979547, + "grad_norm": 2.51349298393404, + "learning_rate": 4.996690233972505e-05, + "loss": 1.6348, + "step": 165 + }, + { + "epoch": 0.2447024138566427, + "grad_norm": 2.0716379205494966, + "learning_rate": 4.996587634993175e-05, + "loss": 1.5544, + "step": 166 + }, + { + "epoch": 0.24617652478348995, + "grad_norm": 2.0953407507577113, + "learning_rate": 4.996483471053955e-05, + "loss": 1.4252, + "step": 167 + }, + { + "epoch": 0.2476506357103372, + "grad_norm": 1.858270782079412, + "learning_rate": 4.996377742220139e-05, + "loss": 1.578, + "step": 168 + }, + { + "epoch": 0.24912474663718445, + "grad_norm": 2.348591597546374, + "learning_rate": 4.9962704485580034e-05, + "loss": 1.5462, + "step": 169 + }, + { + "epoch": 0.2505988575640317, + "grad_norm": 1.7413280745047648, + "learning_rate": 4.996161590134802e-05, + "loss": 1.3747, + "step": 170 + }, + { + "epoch": 0.25207296849087896, + "grad_norm": 1.9123069964988997, + "learning_rate": 4.996051167018773e-05, + "loss": 1.5259, + "step": 171 + }, + { + "epoch": 0.2535470794177262, + "grad_norm": 1.9249726429766465, + "learning_rate": 4.995939179279134e-05, + "loss": 1.4706, + "step": 172 + }, + { + "epoch": 0.25502119034457343, + "grad_norm": 1.8405394969827769, + "learning_rate": 4.9958256269860826e-05, + "loss": 1.7181, + "step": 173 + }, + { + "epoch": 0.2564953012714207, + "grad_norm": 2.1502327371646563, + "learning_rate": 4.995710510210798e-05, + "loss": 1.6783, + "step": 174 + }, + { + "epoch": 0.2579694121982679, + "grad_norm": 1.8132622530039457, + "learning_rate": 4.9955938290254404e-05, + "loss": 1.6482, + "step": 175 + }, + { + "epoch": 0.25944352312511515, + "grad_norm": 1.8339603897817565, + "learning_rate": 4.99547558350315e-05, + "loss": 1.5723, + "step": 176 + }, + { + "epoch": 0.2609176340519624, + "grad_norm": 1.804856858006492, + "learning_rate": 4.9953557737180477e-05, + "loss": 1.4287, + "step": 177 + }, + { + "epoch": 0.2623917449788097, + "grad_norm": 1.8453585186936599, + "learning_rate": 4.9952343997452355e-05, + "loss": 1.4236, + "step": 178 + }, + { + "epoch": 0.2638658559056569, + "grad_norm": 2.002195583640961, + "learning_rate": 4.995111461660794e-05, + "loss": 1.5233, + "step": 179 + }, + { + "epoch": 0.26533996683250416, + "grad_norm": 1.8855827497250792, + "learning_rate": 4.9949869595417876e-05, + "loss": 1.6272, + "step": 180 + }, + { + "epoch": 0.2668140777593514, + "grad_norm": 2.0602540011427846, + "learning_rate": 4.994860893466258e-05, + "loss": 1.4663, + "step": 181 + }, + { + "epoch": 0.26828818868619864, + "grad_norm": 1.6948356836843923, + "learning_rate": 4.994733263513228e-05, + "loss": 1.55, + "step": 182 + }, + { + "epoch": 0.2697622996130459, + "grad_norm": 1.83476674512537, + "learning_rate": 4.994604069762702e-05, + "loss": 1.6474, + "step": 183 + }, + { + "epoch": 0.2712364105398931, + "grad_norm": 1.894212644863446, + "learning_rate": 4.994473312295663e-05, + "loss": 1.667, + "step": 184 + }, + { + "epoch": 0.27271052146674035, + "grad_norm": 1.9069100784881219, + "learning_rate": 4.994340991194076e-05, + "loss": 1.4898, + "step": 185 + }, + { + "epoch": 0.2741846323935876, + "grad_norm": 1.8500035801414696, + "learning_rate": 4.994207106540884e-05, + "loss": 1.7226, + "step": 186 + }, + { + "epoch": 0.2756587433204349, + "grad_norm": 2.0568917058720078, + "learning_rate": 4.994071658420012e-05, + "loss": 1.6814, + "step": 187 + }, + { + "epoch": 0.2771328542472821, + "grad_norm": 1.737312166777921, + "learning_rate": 4.993934646916364e-05, + "loss": 1.6241, + "step": 188 + }, + { + "epoch": 0.27860696517412936, + "grad_norm": 1.8974867586822286, + "learning_rate": 4.993796072115824e-05, + "loss": 1.5502, + "step": 189 + }, + { + "epoch": 0.2800810761009766, + "grad_norm": 1.627286109874131, + "learning_rate": 4.993655934105256e-05, + "loss": 1.512, + "step": 190 + }, + { + "epoch": 0.28155518702782384, + "grad_norm": 1.9361902547623564, + "learning_rate": 4.993514232972504e-05, + "loss": 1.5987, + "step": 191 + }, + { + "epoch": 0.2830292979546711, + "grad_norm": 1.8497831913485774, + "learning_rate": 4.9933709688063935e-05, + "loss": 1.5047, + "step": 192 + }, + { + "epoch": 0.2845034088815183, + "grad_norm": 1.9299011112237805, + "learning_rate": 4.993226141696726e-05, + "loss": 1.4832, + "step": 193 + }, + { + "epoch": 0.28597751980836555, + "grad_norm": 1.8794470502453384, + "learning_rate": 4.9930797517342853e-05, + "loss": 1.5571, + "step": 194 + }, + { + "epoch": 0.28745163073521285, + "grad_norm": 1.9405317274889466, + "learning_rate": 4.992931799010836e-05, + "loss": 1.4643, + "step": 195 + }, + { + "epoch": 0.2889257416620601, + "grad_norm": 1.6917059018450584, + "learning_rate": 4.992782283619118e-05, + "loss": 1.5118, + "step": 196 + }, + { + "epoch": 0.2903998525889073, + "grad_norm": 1.7850001828276383, + "learning_rate": 4.992631205652857e-05, + "loss": 1.6505, + "step": 197 + }, + { + "epoch": 0.29187396351575456, + "grad_norm": 2.048264708749373, + "learning_rate": 4.992478565206752e-05, + "loss": 1.5746, + "step": 198 + }, + { + "epoch": 0.2933480744426018, + "grad_norm": 1.8559218967627773, + "learning_rate": 4.992324362376484e-05, + "loss": 1.7085, + "step": 199 + }, + { + "epoch": 0.29482218536944904, + "grad_norm": 1.988831449575493, + "learning_rate": 4.992168597258715e-05, + "loss": 1.5021, + "step": 200 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 1.6964017120185386, + "learning_rate": 4.992011269951083e-05, + "loss": 1.4622, + "step": 201 + }, + { + "epoch": 0.2977704072231435, + "grad_norm": 1.7264181223438744, + "learning_rate": 4.991852380552209e-05, + "loss": 1.443, + "step": 202 + }, + { + "epoch": 0.2992445181499908, + "grad_norm": 1.5860528763017867, + "learning_rate": 4.99169192916169e-05, + "loss": 1.5066, + "step": 203 + }, + { + "epoch": 0.30071862907683805, + "grad_norm": 1.6375511802071567, + "learning_rate": 4.991529915880103e-05, + "loss": 1.3621, + "step": 204 + }, + { + "epoch": 0.3021927400036853, + "grad_norm": 1.620046622111707, + "learning_rate": 4.991366340809005e-05, + "loss": 1.7273, + "step": 205 + }, + { + "epoch": 0.30366685093053253, + "grad_norm": 1.7310077180960322, + "learning_rate": 4.99120120405093e-05, + "loss": 1.6215, + "step": 206 + }, + { + "epoch": 0.30514096185737977, + "grad_norm": 1.7646692440493075, + "learning_rate": 4.9910345057093936e-05, + "loss": 1.4291, + "step": 207 + }, + { + "epoch": 0.306615072784227, + "grad_norm": 1.7385663773792093, + "learning_rate": 4.990866245888889e-05, + "loss": 1.6184, + "step": 208 + }, + { + "epoch": 0.30808918371107424, + "grad_norm": 1.881623965382252, + "learning_rate": 4.9906964246948874e-05, + "loss": 1.4605, + "step": 209 + }, + { + "epoch": 0.3095632946379215, + "grad_norm": 1.6654262774873103, + "learning_rate": 4.99052504223384e-05, + "loss": 1.5176, + "step": 210 + }, + { + "epoch": 0.3110374055647688, + "grad_norm": 1.6322070737859578, + "learning_rate": 4.990352098613176e-05, + "loss": 1.5362, + "step": 211 + }, + { + "epoch": 0.312511516491616, + "grad_norm": 1.5849406299242006, + "learning_rate": 4.9901775939413026e-05, + "loss": 1.4416, + "step": 212 + }, + { + "epoch": 0.31398562741846325, + "grad_norm": 2.052211763910315, + "learning_rate": 4.990001528327607e-05, + "loss": 1.49, + "step": 213 + }, + { + "epoch": 0.3154597383453105, + "grad_norm": 1.7744427491274635, + "learning_rate": 4.989823901882454e-05, + "loss": 1.3264, + "step": 214 + }, + { + "epoch": 0.31693384927215773, + "grad_norm": 2.0492089629439487, + "learning_rate": 4.989644714717187e-05, + "loss": 1.515, + "step": 215 + }, + { + "epoch": 0.31840796019900497, + "grad_norm": 1.8718381925858396, + "learning_rate": 4.989463966944127e-05, + "loss": 1.5779, + "step": 216 + }, + { + "epoch": 0.3198820711258522, + "grad_norm": 1.7506940691450417, + "learning_rate": 4.989281658676573e-05, + "loss": 1.5214, + "step": 217 + }, + { + "epoch": 0.32135618205269945, + "grad_norm": 1.9412881673289646, + "learning_rate": 4.989097790028806e-05, + "loss": 1.5771, + "step": 218 + }, + { + "epoch": 0.3228302929795467, + "grad_norm": 1.8586373669195384, + "learning_rate": 4.98891236111608e-05, + "loss": 1.595, + "step": 219 + }, + { + "epoch": 0.324304403906394, + "grad_norm": 1.9537625302274773, + "learning_rate": 4.988725372054629e-05, + "loss": 1.5517, + "step": 220 + }, + { + "epoch": 0.3257785148332412, + "grad_norm": 1.7472069795055158, + "learning_rate": 4.988536822961666e-05, + "loss": 1.537, + "step": 221 + }, + { + "epoch": 0.32725262576008846, + "grad_norm": 1.9921702664695777, + "learning_rate": 4.988346713955381e-05, + "loss": 1.5504, + "step": 222 + }, + { + "epoch": 0.3287267366869357, + "grad_norm": 2.0329267183860518, + "learning_rate": 4.9881550451549405e-05, + "loss": 1.6402, + "step": 223 + }, + { + "epoch": 0.33020084761378293, + "grad_norm": 1.9075573674845931, + "learning_rate": 4.987961816680492e-05, + "loss": 1.7153, + "step": 224 + }, + { + "epoch": 0.33167495854063017, + "grad_norm": 2.169518598959896, + "learning_rate": 4.9877670286531585e-05, + "loss": 1.6808, + "step": 225 + }, + { + "epoch": 0.3331490694674774, + "grad_norm": 1.6680833120557446, + "learning_rate": 4.98757068119504e-05, + "loss": 1.5796, + "step": 226 + }, + { + "epoch": 0.33462318039432465, + "grad_norm": 1.5567522762754262, + "learning_rate": 4.9873727744292144e-05, + "loss": 1.6217, + "step": 227 + }, + { + "epoch": 0.33609729132117194, + "grad_norm": 2.3513075985056755, + "learning_rate": 4.987173308479738e-05, + "loss": 1.7312, + "step": 228 + }, + { + "epoch": 0.3375714022480192, + "grad_norm": 1.7241615413824978, + "learning_rate": 4.9869722834716446e-05, + "loss": 1.4205, + "step": 229 + }, + { + "epoch": 0.3390455131748664, + "grad_norm": 1.8826174813096306, + "learning_rate": 4.9867696995309445e-05, + "loss": 1.3508, + "step": 230 + }, + { + "epoch": 0.34051962410171366, + "grad_norm": 1.9308716387921656, + "learning_rate": 4.986565556784625e-05, + "loss": 1.6173, + "step": 231 + }, + { + "epoch": 0.3419937350285609, + "grad_norm": 1.879170421024246, + "learning_rate": 4.98635985536065e-05, + "loss": 1.6777, + "step": 232 + }, + { + "epoch": 0.34346784595540814, + "grad_norm": 1.821979790580715, + "learning_rate": 4.986152595387963e-05, + "loss": 1.4362, + "step": 233 + }, + { + "epoch": 0.3449419568822554, + "grad_norm": 1.9441473355444765, + "learning_rate": 4.9859437769964815e-05, + "loss": 1.6138, + "step": 234 + }, + { + "epoch": 0.3464160678091026, + "grad_norm": 1.8543749988685765, + "learning_rate": 4.985733400317101e-05, + "loss": 1.5919, + "step": 235 + }, + { + "epoch": 0.3478901787359499, + "grad_norm": 1.7113855202111685, + "learning_rate": 4.985521465481695e-05, + "loss": 1.5299, + "step": 236 + }, + { + "epoch": 0.34936428966279715, + "grad_norm": 1.9334724157006715, + "learning_rate": 4.985307972623112e-05, + "loss": 1.5473, + "step": 237 + }, + { + "epoch": 0.3508384005896444, + "grad_norm": 1.6254158221590336, + "learning_rate": 4.985092921875178e-05, + "loss": 1.5182, + "step": 238 + }, + { + "epoch": 0.3523125115164916, + "grad_norm": 1.7504192939901426, + "learning_rate": 4.984876313372695e-05, + "loss": 1.5105, + "step": 239 + }, + { + "epoch": 0.35378662244333886, + "grad_norm": 1.8750132242372193, + "learning_rate": 4.984658147251442e-05, + "loss": 1.5587, + "step": 240 + }, + { + "epoch": 0.3552607333701861, + "grad_norm": 1.950406066478905, + "learning_rate": 4.984438423648174e-05, + "loss": 1.4449, + "step": 241 + }, + { + "epoch": 0.35673484429703334, + "grad_norm": 1.9475823453797563, + "learning_rate": 4.9842171427006225e-05, + "loss": 1.5364, + "step": 242 + }, + { + "epoch": 0.3582089552238806, + "grad_norm": 1.6947296124965352, + "learning_rate": 4.983994304547495e-05, + "loss": 1.5385, + "step": 243 + }, + { + "epoch": 0.3596830661507278, + "grad_norm": 1.83443633046863, + "learning_rate": 4.9837699093284765e-05, + "loss": 1.4704, + "step": 244 + }, + { + "epoch": 0.3611571770775751, + "grad_norm": 1.9536703340246153, + "learning_rate": 4.983543957184224e-05, + "loss": 1.6683, + "step": 245 + }, + { + "epoch": 0.36263128800442235, + "grad_norm": 1.8570997277690837, + "learning_rate": 4.983316448256377e-05, + "loss": 1.2434, + "step": 246 + }, + { + "epoch": 0.3641053989312696, + "grad_norm": 1.8087943771180426, + "learning_rate": 4.983087382687544e-05, + "loss": 1.649, + "step": 247 + }, + { + "epoch": 0.3655795098581168, + "grad_norm": 1.9300428673326977, + "learning_rate": 4.982856760621313e-05, + "loss": 1.4011, + "step": 248 + }, + { + "epoch": 0.36705362078496406, + "grad_norm": 1.7496896877367119, + "learning_rate": 4.9826245822022474e-05, + "loss": 1.5923, + "step": 249 + }, + { + "epoch": 0.3685277317118113, + "grad_norm": 1.799511244433208, + "learning_rate": 4.9823908475758875e-05, + "loss": 1.6522, + "step": 250 + }, + { + "epoch": 0.3685277317118113, + "eval_bleu": 0.04824338198619645, + "eval_bleu_1gram": 0.34108276434595103, + "eval_bleu_2gram": 0.11484718043335776, + "eval_bleu_3gram": 0.04694283612862056, + "eval_bleu_4gram": 0.022946329938694693, + "eval_rag_val_loss": 1.5737228063485955, + "eval_rouge1": 0.32701224706717685, + "eval_rouge2": 0.10811216474930627, + "eval_rougeL": 0.3077781137710209, + "step": 250 + }, + { + "epoch": 0.37000184263865854, + "grad_norm": 1.6572600829421518, + "learning_rate": 4.982155556888745e-05, + "loss": 1.5452, + "step": 251 + }, + { + "epoch": 0.3714759535655058, + "grad_norm": 1.930245940296186, + "learning_rate": 4.981918710288309e-05, + "loss": 1.4397, + "step": 252 + }, + { + "epoch": 0.3729500644923531, + "grad_norm": 1.938358977856933, + "learning_rate": 4.981680307923047e-05, + "loss": 1.6566, + "step": 253 + }, + { + "epoch": 0.3744241754192003, + "grad_norm": 1.8774043879437272, + "learning_rate": 4.981440349942397e-05, + "loss": 1.6621, + "step": 254 + }, + { + "epoch": 0.37589828634604755, + "grad_norm": 1.861090886146892, + "learning_rate": 4.981198836496775e-05, + "loss": 1.551, + "step": 255 + }, + { + "epoch": 0.3773723972728948, + "grad_norm": 1.802861334234698, + "learning_rate": 4.9809557677375704e-05, + "loss": 1.5642, + "step": 256 + }, + { + "epoch": 0.378846508199742, + "grad_norm": 2.2210855146396966, + "learning_rate": 4.98071114381715e-05, + "loss": 1.5815, + "step": 257 + }, + { + "epoch": 0.38032061912658927, + "grad_norm": 1.818147739177719, + "learning_rate": 4.980464964888852e-05, + "loss": 1.5253, + "step": 258 + }, + { + "epoch": 0.3817947300534365, + "grad_norm": 1.9086978392103475, + "learning_rate": 4.980217231106991e-05, + "loss": 1.6457, + "step": 259 + }, + { + "epoch": 0.38326884098028374, + "grad_norm": 1.8975392166930145, + "learning_rate": 4.979967942626858e-05, + "loss": 1.5502, + "step": 260 + }, + { + "epoch": 0.38474295190713104, + "grad_norm": 2.20955374721703, + "learning_rate": 4.979717099604715e-05, + "loss": 1.597, + "step": 261 + }, + { + "epoch": 0.3862170628339783, + "grad_norm": 1.8116599800082387, + "learning_rate": 4.979464702197801e-05, + "loss": 1.6264, + "step": 262 + }, + { + "epoch": 0.3876911737608255, + "grad_norm": 2.0297578466146042, + "learning_rate": 4.9792107505643304e-05, + "loss": 1.5653, + "step": 263 + }, + { + "epoch": 0.38916528468767275, + "grad_norm": 1.947739768198093, + "learning_rate": 4.9789552448634874e-05, + "loss": 1.6046, + "step": 264 + }, + { + "epoch": 0.39063939561452, + "grad_norm": 1.7200403398396384, + "learning_rate": 4.9786981852554346e-05, + "loss": 1.4561, + "step": 265 + }, + { + "epoch": 0.39211350654136723, + "grad_norm": 1.9104408099008892, + "learning_rate": 4.978439571901307e-05, + "loss": 1.7935, + "step": 266 + }, + { + "epoch": 0.39358761746821447, + "grad_norm": 1.7448903829253266, + "learning_rate": 4.9781794049632135e-05, + "loss": 1.537, + "step": 267 + }, + { + "epoch": 0.3950617283950617, + "grad_norm": 1.6825972792233093, + "learning_rate": 4.9779176846042366e-05, + "loss": 1.4845, + "step": 268 + }, + { + "epoch": 0.396535839321909, + "grad_norm": 1.8188593880703856, + "learning_rate": 4.977654410988434e-05, + "loss": 1.4426, + "step": 269 + }, + { + "epoch": 0.39800995024875624, + "grad_norm": 1.6547712526387357, + "learning_rate": 4.977389584280835e-05, + "loss": 1.5164, + "step": 270 + }, + { + "epoch": 0.3994840611756035, + "grad_norm": 1.6810958281099189, + "learning_rate": 4.9771232046474444e-05, + "loss": 1.448, + "step": 271 + }, + { + "epoch": 0.4009581721024507, + "grad_norm": 1.8850363739511167, + "learning_rate": 4.976855272255239e-05, + "loss": 1.5656, + "step": 272 + }, + { + "epoch": 0.40243228302929795, + "grad_norm": 1.9444085284352308, + "learning_rate": 4.976585787272168e-05, + "loss": 1.7654, + "step": 273 + }, + { + "epoch": 0.4039063939561452, + "grad_norm": 1.8866578402273446, + "learning_rate": 4.976314749867158e-05, + "loss": 1.6431, + "step": 274 + }, + { + "epoch": 0.40538050488299243, + "grad_norm": 1.652201833559983, + "learning_rate": 4.976042160210104e-05, + "loss": 1.5283, + "step": 275 + }, + { + "epoch": 0.40685461580983967, + "grad_norm": 1.7229485296265201, + "learning_rate": 4.975768018471877e-05, + "loss": 1.4934, + "step": 276 + }, + { + "epoch": 0.4083287267366869, + "grad_norm": 1.8622273597605954, + "learning_rate": 4.9754923248243195e-05, + "loss": 1.6194, + "step": 277 + }, + { + "epoch": 0.4098028376635342, + "grad_norm": 1.9324903634486195, + "learning_rate": 4.975215079440247e-05, + "loss": 1.8159, + "step": 278 + }, + { + "epoch": 0.41127694859038144, + "grad_norm": 1.9146934870543342, + "learning_rate": 4.974936282493448e-05, + "loss": 1.5907, + "step": 279 + }, + { + "epoch": 0.4127510595172287, + "grad_norm": 1.7951085571573018, + "learning_rate": 4.974655934158684e-05, + "loss": 1.5056, + "step": 280 + }, + { + "epoch": 0.4142251704440759, + "grad_norm": 1.8439588913338272, + "learning_rate": 4.974374034611687e-05, + "loss": 1.541, + "step": 281 + }, + { + "epoch": 0.41569928137092316, + "grad_norm": 1.7446759161819978, + "learning_rate": 4.9740905840291646e-05, + "loss": 1.5647, + "step": 282 + }, + { + "epoch": 0.4171733922977704, + "grad_norm": 1.893249266743817, + "learning_rate": 4.9738055825887936e-05, + "loss": 1.6144, + "step": 283 + }, + { + "epoch": 0.41864750322461763, + "grad_norm": 1.7681815618999355, + "learning_rate": 4.973519030469225e-05, + "loss": 1.4294, + "step": 284 + }, + { + "epoch": 0.4201216141514649, + "grad_norm": 2.0851913432044213, + "learning_rate": 4.97323092785008e-05, + "loss": 1.4536, + "step": 285 + }, + { + "epoch": 0.42159572507831217, + "grad_norm": 1.984376922366195, + "learning_rate": 4.972941274911953e-05, + "loss": 1.6432, + "step": 286 + }, + { + "epoch": 0.4230698360051594, + "grad_norm": 1.9614436495638687, + "learning_rate": 4.97265007183641e-05, + "loss": 1.3757, + "step": 287 + }, + { + "epoch": 0.42454394693200664, + "grad_norm": 1.7788729282989244, + "learning_rate": 4.9723573188059894e-05, + "loss": 1.4745, + "step": 288 + }, + { + "epoch": 0.4260180578588539, + "grad_norm": 1.825779706491545, + "learning_rate": 4.972063016004199e-05, + "loss": 1.4818, + "step": 289 + }, + { + "epoch": 0.4274921687857011, + "grad_norm": 1.8536216134738772, + "learning_rate": 4.971767163615522e-05, + "loss": 1.5649, + "step": 290 + }, + { + "epoch": 0.42896627971254836, + "grad_norm": 1.70620204592658, + "learning_rate": 4.971469761825407e-05, + "loss": 1.7124, + "step": 291 + }, + { + "epoch": 0.4304403906393956, + "grad_norm": 1.9209948129142063, + "learning_rate": 4.971170810820279e-05, + "loss": 1.6574, + "step": 292 + }, + { + "epoch": 0.43191450156624284, + "grad_norm": 1.8584657898413137, + "learning_rate": 4.970870310787532e-05, + "loss": 1.5529, + "step": 293 + }, + { + "epoch": 0.43338861249309013, + "grad_norm": 1.9514525920867223, + "learning_rate": 4.970568261915531e-05, + "loss": 1.5712, + "step": 294 + }, + { + "epoch": 0.43486272341993737, + "grad_norm": 1.8046863770584642, + "learning_rate": 4.970264664393614e-05, + "loss": 1.4221, + "step": 295 + }, + { + "epoch": 0.4363368343467846, + "grad_norm": 1.594034954850858, + "learning_rate": 4.9699595184120853e-05, + "loss": 1.5871, + "step": 296 + }, + { + "epoch": 0.43781094527363185, + "grad_norm": 1.7385663773792093, + "learning_rate": 4.9696528241622244e-05, + "loss": 1.6057, + "step": 297 + }, + { + "epoch": 0.4392850562004791, + "grad_norm": 1.8332213743298447, + "learning_rate": 4.9693445818362783e-05, + "loss": 1.4573, + "step": 298 + }, + { + "epoch": 0.4407591671273263, + "grad_norm": 1.89901624107547, + "learning_rate": 4.969034791627466e-05, + "loss": 1.5596, + "step": 299 + }, + { + "epoch": 0.44223327805417356, + "grad_norm": 2.0104218978682007, + "learning_rate": 4.9687234537299765e-05, + "loss": 1.3984, + "step": 300 + }, + { + "epoch": 0.4437073889810208, + "grad_norm": 1.843432770933328, + "learning_rate": 4.968410568338967e-05, + "loss": 1.515, + "step": 301 + }, + { + "epoch": 0.4451814999078681, + "grad_norm": 1.819186341159947, + "learning_rate": 4.968096135650569e-05, + "loss": 1.5026, + "step": 302 + }, + { + "epoch": 0.44665561083471533, + "grad_norm": 1.7817738749784682, + "learning_rate": 4.9677801558618795e-05, + "loss": 1.589, + "step": 303 + }, + { + "epoch": 0.44812972176156257, + "grad_norm": 1.8840459052282146, + "learning_rate": 4.967462629170969e-05, + "loss": 1.5816, + "step": 304 + }, + { + "epoch": 0.4496038326884098, + "grad_norm": 1.8997332385669479, + "learning_rate": 4.967143555776873e-05, + "loss": 1.5773, + "step": 305 + }, + { + "epoch": 0.45107794361525705, + "grad_norm": 2.0175375920495884, + "learning_rate": 4.9668229358796014e-05, + "loss": 1.4526, + "step": 306 + }, + { + "epoch": 0.4525520545421043, + "grad_norm": 1.9467586672393884, + "learning_rate": 4.966500769680131e-05, + "loss": 1.3903, + "step": 307 + }, + { + "epoch": 0.4540261654689515, + "grad_norm": 1.8750722871197363, + "learning_rate": 4.966177057380409e-05, + "loss": 1.5008, + "step": 308 + }, + { + "epoch": 0.45550027639579876, + "grad_norm": 1.8761046652318696, + "learning_rate": 4.965851799183349e-05, + "loss": 1.5039, + "step": 309 + }, + { + "epoch": 0.456974387322646, + "grad_norm": 1.8297284259339959, + "learning_rate": 4.9655249952928375e-05, + "loss": 1.3733, + "step": 310 + }, + { + "epoch": 0.4584484982494933, + "grad_norm": 1.609772420163937, + "learning_rate": 4.965196645913728e-05, + "loss": 1.5001, + "step": 311 + }, + { + "epoch": 0.45992260917634054, + "grad_norm": 1.8387292292580308, + "learning_rate": 4.964866751251842e-05, + "loss": 1.74, + "step": 312 + }, + { + "epoch": 0.4613967201031878, + "grad_norm": 1.784231468079857, + "learning_rate": 4.964535311513971e-05, + "loss": 1.539, + "step": 313 + }, + { + "epoch": 0.462870831030035, + "grad_norm": 1.7147122204505345, + "learning_rate": 4.9642023269078745e-05, + "loss": 1.5192, + "step": 314 + }, + { + "epoch": 0.46434494195688225, + "grad_norm": 1.6686955818132867, + "learning_rate": 4.963867797642281e-05, + "loss": 1.4396, + "step": 315 + }, + { + "epoch": 0.4658190528837295, + "grad_norm": 1.7852084696872015, + "learning_rate": 4.963531723926885e-05, + "loss": 1.6137, + "step": 316 + }, + { + "epoch": 0.46729316381057673, + "grad_norm": 1.5856517665890846, + "learning_rate": 4.963194105972353e-05, + "loss": 1.6777, + "step": 317 + }, + { + "epoch": 0.46876727473742397, + "grad_norm": 1.6801835857854193, + "learning_rate": 4.962854943990316e-05, + "loss": 1.4395, + "step": 318 + }, + { + "epoch": 0.47024138566427126, + "grad_norm": 1.7172009510072304, + "learning_rate": 4.962514238193375e-05, + "loss": 1.2703, + "step": 319 + }, + { + "epoch": 0.4717154965911185, + "grad_norm": 1.8608378578241502, + "learning_rate": 4.9621719887950966e-05, + "loss": 1.711, + "step": 320 + }, + { + "epoch": 0.47318960751796574, + "grad_norm": 1.8098349215642842, + "learning_rate": 4.9618281960100164e-05, + "loss": 1.273, + "step": 321 + }, + { + "epoch": 0.474663718444813, + "grad_norm": 1.8268931876760737, + "learning_rate": 4.9614828600536386e-05, + "loss": 1.5861, + "step": 322 + }, + { + "epoch": 0.4761378293716602, + "grad_norm": 1.8848761807763157, + "learning_rate": 4.9611359811424324e-05, + "loss": 1.6667, + "step": 323 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 1.7224359695869051, + "learning_rate": 4.960787559493836e-05, + "loss": 1.4574, + "step": 324 + }, + { + "epoch": 0.4790860512253547, + "grad_norm": 1.8962990604458858, + "learning_rate": 4.960437595326253e-05, + "loss": 1.5485, + "step": 325 + }, + { + "epoch": 0.48056016215220193, + "grad_norm": 1.6479640479656865, + "learning_rate": 4.960086088859055e-05, + "loss": 1.3061, + "step": 326 + }, + { + "epoch": 0.4820342730790492, + "grad_norm": 1.693045229520003, + "learning_rate": 4.95973304031258e-05, + "loss": 1.5577, + "step": 327 + }, + { + "epoch": 0.48350838400589646, + "grad_norm": 1.7807957254002036, + "learning_rate": 4.9593784499081336e-05, + "loss": 1.5978, + "step": 328 + }, + { + "epoch": 0.4849824949327437, + "grad_norm": 1.7486963184507502, + "learning_rate": 4.959022317867986e-05, + "loss": 1.5676, + "step": 329 + }, + { + "epoch": 0.48645660585959094, + "grad_norm": 1.7930202102756085, + "learning_rate": 4.9586646444153764e-05, + "loss": 1.4347, + "step": 330 + }, + { + "epoch": 0.4879307167864382, + "grad_norm": 1.6959663220246723, + "learning_rate": 4.958305429774507e-05, + "loss": 1.4695, + "step": 331 + }, + { + "epoch": 0.4894048277132854, + "grad_norm": 1.7847901350756699, + "learning_rate": 4.9579446741705485e-05, + "loss": 1.5542, + "step": 332 + }, + { + "epoch": 0.49087893864013266, + "grad_norm": 1.736363619773062, + "learning_rate": 4.957582377829637e-05, + "loss": 1.5498, + "step": 333 + }, + { + "epoch": 0.4923530495669799, + "grad_norm": 1.7008940757858666, + "learning_rate": 4.957218540978874e-05, + "loss": 1.5284, + "step": 334 + }, + { + "epoch": 0.49382716049382713, + "grad_norm": 1.5816515808404354, + "learning_rate": 4.9568531638463264e-05, + "loss": 1.414, + "step": 335 + }, + { + "epoch": 0.4953012714206744, + "grad_norm": 1.7616933072234282, + "learning_rate": 4.9564862466610284e-05, + "loss": 1.5051, + "step": 336 + }, + { + "epoch": 0.49677538234752167, + "grad_norm": 1.7645106892530869, + "learning_rate": 4.9561177896529764e-05, + "loss": 1.5276, + "step": 337 + }, + { + "epoch": 0.4982494932743689, + "grad_norm": 1.6571621812083663, + "learning_rate": 4.9557477930531346e-05, + "loss": 1.6586, + "step": 338 + }, + { + "epoch": 0.49972360420121614, + "grad_norm": 1.8880966557152847, + "learning_rate": 4.9553762570934314e-05, + "loss": 1.5564, + "step": 339 + }, + { + "epoch": 0.5011977151280634, + "grad_norm": 1.8661837892738515, + "learning_rate": 4.955003182006761e-05, + "loss": 1.4869, + "step": 340 + }, + { + "epoch": 0.5026718260549107, + "grad_norm": 1.7412277794246471, + "learning_rate": 4.954628568026981e-05, + "loss": 1.336, + "step": 341 + }, + { + "epoch": 0.5041459369817579, + "grad_norm": 1.6618906094555774, + "learning_rate": 4.954252415388914e-05, + "loss": 1.5603, + "step": 342 + }, + { + "epoch": 0.5056200479086052, + "grad_norm": 1.6000290391194125, + "learning_rate": 4.953874724328347e-05, + "loss": 1.4097, + "step": 343 + }, + { + "epoch": 0.5070941588354524, + "grad_norm": 1.5390411220556837, + "learning_rate": 4.953495495082032e-05, + "loss": 1.4327, + "step": 344 + }, + { + "epoch": 0.5085682697622996, + "grad_norm": 1.8537928672356163, + "learning_rate": 4.953114727887686e-05, + "loss": 1.4635, + "step": 345 + }, + { + "epoch": 0.5100423806891469, + "grad_norm": 1.9248772718271365, + "learning_rate": 4.952732422983989e-05, + "loss": 1.6069, + "step": 346 + }, + { + "epoch": 0.5115164916159941, + "grad_norm": 1.8327897378560116, + "learning_rate": 4.9523485806105826e-05, + "loss": 1.506, + "step": 347 + }, + { + "epoch": 0.5129906025428413, + "grad_norm": 1.6867867834621388, + "learning_rate": 4.951963201008076e-05, + "loss": 1.5001, + "step": 348 + }, + { + "epoch": 0.5144647134696886, + "grad_norm": 1.727999818210239, + "learning_rate": 4.9515762844180405e-05, + "loss": 1.5658, + "step": 349 + }, + { + "epoch": 0.5159388243965358, + "grad_norm": 1.792338607523979, + "learning_rate": 4.9511878310830106e-05, + "loss": 1.4469, + "step": 350 + }, + { + "epoch": 0.5174129353233831, + "grad_norm": 1.665252689741385, + "learning_rate": 4.950797841246484e-05, + "loss": 1.5282, + "step": 351 + }, + { + "epoch": 0.5188870462502303, + "grad_norm": 1.7260577124166554, + "learning_rate": 4.950406315152921e-05, + "loss": 1.4513, + "step": 352 + }, + { + "epoch": 0.5203611571770775, + "grad_norm": 1.6363130125730734, + "learning_rate": 4.9500132530477475e-05, + "loss": 1.5438, + "step": 353 + }, + { + "epoch": 0.5218352681039248, + "grad_norm": 1.8963660724800133, + "learning_rate": 4.949618655177348e-05, + "loss": 1.5857, + "step": 354 + }, + { + "epoch": 0.523309379030772, + "grad_norm": 1.8304866957588684, + "learning_rate": 4.949222521789074e-05, + "loss": 1.5886, + "step": 355 + }, + { + "epoch": 0.5247834899576194, + "grad_norm": 1.7552667111914908, + "learning_rate": 4.948824853131236e-05, + "loss": 1.4921, + "step": 356 + }, + { + "epoch": 0.5262576008844666, + "grad_norm": 1.7218471019444106, + "learning_rate": 4.948425649453111e-05, + "loss": 1.3544, + "step": 357 + }, + { + "epoch": 0.5277317118113138, + "grad_norm": 1.8410092041887887, + "learning_rate": 4.948024911004933e-05, + "loss": 1.5307, + "step": 358 + }, + { + "epoch": 0.5292058227381611, + "grad_norm": 1.84916087012853, + "learning_rate": 4.9476226380379014e-05, + "loss": 1.6957, + "step": 359 + }, + { + "epoch": 0.5306799336650083, + "grad_norm": 1.7845316990126243, + "learning_rate": 4.947218830804178e-05, + "loss": 1.5464, + "step": 360 + }, + { + "epoch": 0.5321540445918556, + "grad_norm": 1.9532893607600796, + "learning_rate": 4.946813489556883e-05, + "loss": 1.54, + "step": 361 + }, + { + "epoch": 0.5336281555187028, + "grad_norm": 1.7193653045680455, + "learning_rate": 4.946406614550103e-05, + "loss": 1.5283, + "step": 362 + }, + { + "epoch": 0.53510226644555, + "grad_norm": 1.7851546473255935, + "learning_rate": 4.945998206038881e-05, + "loss": 1.5738, + "step": 363 + }, + { + "epoch": 0.5365763773723973, + "grad_norm": 1.809106082730639, + "learning_rate": 4.945588264279225e-05, + "loss": 1.6902, + "step": 364 + }, + { + "epoch": 0.5380504882992445, + "grad_norm": 1.8137877920828727, + "learning_rate": 4.945176789528102e-05, + "loss": 1.5106, + "step": 365 + }, + { + "epoch": 0.5395245992260918, + "grad_norm": 1.683959213206895, + "learning_rate": 4.944763782043441e-05, + "loss": 1.4803, + "step": 366 + }, + { + "epoch": 0.540998710152939, + "grad_norm": 1.6836850170348148, + "learning_rate": 4.944349242084131e-05, + "loss": 1.5327, + "step": 367 + }, + { + "epoch": 0.5424728210797862, + "grad_norm": 1.7955376333670994, + "learning_rate": 4.943933169910023e-05, + "loss": 1.6079, + "step": 368 + }, + { + "epoch": 0.5439469320066335, + "grad_norm": 1.6751273064131693, + "learning_rate": 4.9435155657819266e-05, + "loss": 1.3816, + "step": 369 + }, + { + "epoch": 0.5454210429334807, + "grad_norm": 1.771809036130391, + "learning_rate": 4.9430964299616136e-05, + "loss": 1.4085, + "step": 370 + }, + { + "epoch": 0.5468951538603279, + "grad_norm": 1.8674027705894036, + "learning_rate": 4.942675762711813e-05, + "loss": 1.466, + "step": 371 + }, + { + "epoch": 0.5483692647871752, + "grad_norm": 1.7510025013982573, + "learning_rate": 4.942253564296218e-05, + "loss": 1.5686, + "step": 372 + }, + { + "epoch": 0.5498433757140225, + "grad_norm": 1.7751545261226611, + "learning_rate": 4.9418298349794767e-05, + "loss": 1.7306, + "step": 373 + }, + { + "epoch": 0.5513174866408698, + "grad_norm": 1.7897797192193394, + "learning_rate": 4.941404575027202e-05, + "loss": 1.4025, + "step": 374 + }, + { + "epoch": 0.552791597567717, + "grad_norm": 1.5912111292883768, + "learning_rate": 4.9409777847059625e-05, + "loss": 1.5827, + "step": 375 + }, + { + "epoch": 0.5542657084945642, + "grad_norm": 1.7964859914554656, + "learning_rate": 4.940549464283287e-05, + "loss": 1.3956, + "step": 376 + }, + { + "epoch": 0.5557398194214115, + "grad_norm": 1.6214447744482985, + "learning_rate": 4.940119614027663e-05, + "loss": 1.4579, + "step": 377 + }, + { + "epoch": 0.5572139303482587, + "grad_norm": 1.7065987813879293, + "learning_rate": 4.93968823420854e-05, + "loss": 1.6299, + "step": 378 + }, + { + "epoch": 0.558688041275106, + "grad_norm": 1.8918716482051499, + "learning_rate": 4.9392553250963215e-05, + "loss": 1.4715, + "step": 379 + }, + { + "epoch": 0.5601621522019532, + "grad_norm": 1.8153975116080192, + "learning_rate": 4.9388208869623734e-05, + "loss": 1.6462, + "step": 380 + }, + { + "epoch": 0.5616362631288004, + "grad_norm": 1.787132388107462, + "learning_rate": 4.938384920079019e-05, + "loss": 1.2408, + "step": 381 + }, + { + "epoch": 0.5631103740556477, + "grad_norm": 1.803564077035902, + "learning_rate": 4.937947424719538e-05, + "loss": 1.4267, + "step": 382 + }, + { + "epoch": 0.5645844849824949, + "grad_norm": 1.9134134866072703, + "learning_rate": 4.937508401158171e-05, + "loss": 1.5415, + "step": 383 + }, + { + "epoch": 0.5660585959093422, + "grad_norm": 1.920379801378615, + "learning_rate": 4.937067849670115e-05, + "loss": 1.4923, + "step": 384 + }, + { + "epoch": 0.5675327068361894, + "grad_norm": 1.7576246373137066, + "learning_rate": 4.936625770531525e-05, + "loss": 1.4902, + "step": 385 + }, + { + "epoch": 0.5690068177630366, + "grad_norm": 1.7083754495530958, + "learning_rate": 4.936182164019515e-05, + "loss": 1.4744, + "step": 386 + }, + { + "epoch": 0.5704809286898839, + "grad_norm": 1.7812743603563133, + "learning_rate": 4.935737030412153e-05, + "loss": 1.3867, + "step": 387 + }, + { + "epoch": 0.5719550396167311, + "grad_norm": 1.7760903434757664, + "learning_rate": 4.935290369988468e-05, + "loss": 1.5744, + "step": 388 + }, + { + "epoch": 0.5734291505435785, + "grad_norm": 1.6242917791685432, + "learning_rate": 4.934842183028443e-05, + "loss": 1.3365, + "step": 389 + }, + { + "epoch": 0.5749032614704257, + "grad_norm": 1.6025642053653, + "learning_rate": 4.9343924698130206e-05, + "loss": 1.426, + "step": 390 + }, + { + "epoch": 0.5763773723972729, + "grad_norm": 1.7101751571314143, + "learning_rate": 4.9339412306240984e-05, + "loss": 1.6726, + "step": 391 + }, + { + "epoch": 0.5778514833241202, + "grad_norm": 1.6439576949688486, + "learning_rate": 4.933488465744531e-05, + "loss": 1.4645, + "step": 392 + }, + { + "epoch": 0.5793255942509674, + "grad_norm": 1.8196544849532825, + "learning_rate": 4.933034175458129e-05, + "loss": 1.4769, + "step": 393 + }, + { + "epoch": 0.5807997051778147, + "grad_norm": 2.076573072052071, + "learning_rate": 4.9325783600496596e-05, + "loss": 1.4516, + "step": 394 + }, + { + "epoch": 0.5822738161046619, + "grad_norm": 1.8776443907458529, + "learning_rate": 4.9321210198048465e-05, + "loss": 1.4449, + "step": 395 + }, + { + "epoch": 0.5837479270315091, + "grad_norm": 1.8768191731394743, + "learning_rate": 4.931662155010367e-05, + "loss": 1.4863, + "step": 396 + }, + { + "epoch": 0.5852220379583564, + "grad_norm": 1.774138795100083, + "learning_rate": 4.931201765953858e-05, + "loss": 1.4567, + "step": 397 + }, + { + "epoch": 0.5866961488852036, + "grad_norm": 2.095018258937029, + "learning_rate": 4.9307398529239083e-05, + "loss": 1.5368, + "step": 398 + }, + { + "epoch": 0.5881702598120508, + "grad_norm": 1.7918840838262935, + "learning_rate": 4.930276416210063e-05, + "loss": 1.4562, + "step": 399 + }, + { + "epoch": 0.5896443707388981, + "grad_norm": 1.8635535850875868, + "learning_rate": 4.929811456102824e-05, + "loss": 1.6778, + "step": 400 + }, + { + "epoch": 0.5911184816657453, + "grad_norm": 1.617357991395137, + "learning_rate": 4.929344972893646e-05, + "loss": 1.563, + "step": 401 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 1.7088147934835842, + "learning_rate": 4.928876966874938e-05, + "loss": 1.4051, + "step": 402 + }, + { + "epoch": 0.5940667035194398, + "grad_norm": 1.6753560129131577, + "learning_rate": 4.9284074383400655e-05, + "loss": 1.4051, + "step": 403 + }, + { + "epoch": 0.595540814446287, + "grad_norm": 1.6029978205332274, + "learning_rate": 4.927936387583348e-05, + "loss": 1.3202, + "step": 404 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 1.7688631055274433, + "learning_rate": 4.9274638149000585e-05, + "loss": 1.5041, + "step": 405 + }, + { + "epoch": 0.5984890362999816, + "grad_norm": 1.5799863809891932, + "learning_rate": 4.9269897205864235e-05, + "loss": 1.4954, + "step": 406 + }, + { + "epoch": 0.5999631472268289, + "grad_norm": 1.6553242093291975, + "learning_rate": 4.926514104939625e-05, + "loss": 1.3095, + "step": 407 + }, + { + "epoch": 0.6014372581536761, + "grad_norm": 1.716742643816914, + "learning_rate": 4.9260369682577965e-05, + "loss": 1.6561, + "step": 408 + }, + { + "epoch": 0.6029113690805233, + "grad_norm": 1.6326547208784874, + "learning_rate": 4.9255583108400285e-05, + "loss": 1.5655, + "step": 409 + }, + { + "epoch": 0.6043854800073706, + "grad_norm": 1.5109955390899672, + "learning_rate": 4.9250781329863606e-05, + "loss": 1.4361, + "step": 410 + }, + { + "epoch": 0.6058595909342178, + "grad_norm": 1.6255240695688515, + "learning_rate": 4.924596434997787e-05, + "loss": 1.4307, + "step": 411 + }, + { + "epoch": 0.6073337018610651, + "grad_norm": 1.5463493301694566, + "learning_rate": 4.924113217176256e-05, + "loss": 1.5367, + "step": 412 + }, + { + "epoch": 0.6088078127879123, + "grad_norm": 1.6327332837906885, + "learning_rate": 4.9236284798246666e-05, + "loss": 1.5593, + "step": 413 + }, + { + "epoch": 0.6102819237147595, + "grad_norm": 2.0993855485710746, + "learning_rate": 4.923142223246873e-05, + "loss": 1.5256, + "step": 414 + }, + { + "epoch": 0.6117560346416068, + "grad_norm": 1.825255269004198, + "learning_rate": 4.922654447747679e-05, + "loss": 1.5226, + "step": 415 + }, + { + "epoch": 0.613230145568454, + "grad_norm": 1.8755328374971283, + "learning_rate": 4.922165153632842e-05, + "loss": 1.6397, + "step": 416 + }, + { + "epoch": 0.6147042564953012, + "grad_norm": 1.8949169277613347, + "learning_rate": 4.9216743412090694e-05, + "loss": 1.5982, + "step": 417 + }, + { + "epoch": 0.6161783674221485, + "grad_norm": 1.8352227950423996, + "learning_rate": 4.9211820107840234e-05, + "loss": 1.3602, + "step": 418 + }, + { + "epoch": 0.6176524783489957, + "grad_norm": 1.9041189570331363, + "learning_rate": 4.920688162666316e-05, + "loss": 1.5457, + "step": 419 + }, + { + "epoch": 0.619126589275843, + "grad_norm": 1.7763910108481864, + "learning_rate": 4.920192797165511e-05, + "loss": 1.4164, + "step": 420 + }, + { + "epoch": 0.6206007002026902, + "grad_norm": 1.7419113829898394, + "learning_rate": 4.919695914592122e-05, + "loss": 1.5608, + "step": 421 + }, + { + "epoch": 0.6220748111295376, + "grad_norm": 1.6244963085382704, + "learning_rate": 4.919197515257616e-05, + "loss": 1.2675, + "step": 422 + }, + { + "epoch": 0.6235489220563848, + "grad_norm": 1.7962890996639524, + "learning_rate": 4.9186975994744075e-05, + "loss": 1.5134, + "step": 423 + }, + { + "epoch": 0.625023032983232, + "grad_norm": 1.7239569426777772, + "learning_rate": 4.918196167555866e-05, + "loss": 1.5654, + "step": 424 + }, + { + "epoch": 0.6264971439100793, + "grad_norm": 1.8657507056948368, + "learning_rate": 4.9176932198163074e-05, + "loss": 1.3322, + "step": 425 + }, + { + "epoch": 0.6279712548369265, + "grad_norm": 1.6874855888599058, + "learning_rate": 4.917188756570999e-05, + "loss": 1.5147, + "step": 426 + }, + { + "epoch": 0.6294453657637737, + "grad_norm": 1.5605202148604673, + "learning_rate": 4.9166827781361594e-05, + "loss": 1.3247, + "step": 427 + }, + { + "epoch": 0.630919476690621, + "grad_norm": 1.7854848346434706, + "learning_rate": 4.916175284828955e-05, + "loss": 1.5542, + "step": 428 + }, + { + "epoch": 0.6323935876174682, + "grad_norm": 1.7684688115453004, + "learning_rate": 4.915666276967501e-05, + "loss": 1.5501, + "step": 429 + }, + { + "epoch": 0.6338676985443155, + "grad_norm": 1.6717944972541314, + "learning_rate": 4.9151557548708676e-05, + "loss": 1.4231, + "step": 430 + }, + { + "epoch": 0.6353418094711627, + "grad_norm": 1.8571704650231153, + "learning_rate": 4.9146437188590675e-05, + "loss": 1.6042, + "step": 431 + }, + { + "epoch": 0.6368159203980099, + "grad_norm": 1.8889238885678714, + "learning_rate": 4.914130169253066e-05, + "loss": 1.4622, + "step": 432 + }, + { + "epoch": 0.6382900313248572, + "grad_norm": 1.8406117126296115, + "learning_rate": 4.913615106374777e-05, + "loss": 1.6079, + "step": 433 + }, + { + "epoch": 0.6397641422517044, + "grad_norm": 1.7939840529593174, + "learning_rate": 4.91309853054706e-05, + "loss": 1.513, + "step": 434 + }, + { + "epoch": 0.6412382531785517, + "grad_norm": 1.7370053527169775, + "learning_rate": 4.912580442093727e-05, + "loss": 1.5361, + "step": 435 + }, + { + "epoch": 0.6427123641053989, + "grad_norm": 1.772219067088753, + "learning_rate": 4.9120608413395366e-05, + "loss": 1.5908, + "step": 436 + }, + { + "epoch": 0.6441864750322461, + "grad_norm": 1.725629114065271, + "learning_rate": 4.911539728610194e-05, + "loss": 1.4334, + "step": 437 + }, + { + "epoch": 0.6456605859590934, + "grad_norm": 1.661961550027967, + "learning_rate": 4.9110171042323536e-05, + "loss": 1.5997, + "step": 438 + }, + { + "epoch": 0.6471346968859407, + "grad_norm": 1.6235969795559844, + "learning_rate": 4.910492968533618e-05, + "loss": 1.5554, + "step": 439 + }, + { + "epoch": 0.648608807812788, + "grad_norm": 2.0796905586340557, + "learning_rate": 4.909967321842535e-05, + "loss": 1.4449, + "step": 440 + }, + { + "epoch": 0.6500829187396352, + "grad_norm": 1.5206619409980802, + "learning_rate": 4.9094401644886e-05, + "loss": 1.2062, + "step": 441 + }, + { + "epoch": 0.6515570296664824, + "grad_norm": 1.8492072211012145, + "learning_rate": 4.908911496802257e-05, + "loss": 1.5708, + "step": 442 + }, + { + "epoch": 0.6530311405933297, + "grad_norm": 1.90030844844969, + "learning_rate": 4.908381319114898e-05, + "loss": 1.4393, + "step": 443 + }, + { + "epoch": 0.6545052515201769, + "grad_norm": 2.1027586982138056, + "learning_rate": 4.9078496317588556e-05, + "loss": 1.5348, + "step": 444 + }, + { + "epoch": 0.6559793624470242, + "grad_norm": 1.774498710229856, + "learning_rate": 4.907316435067415e-05, + "loss": 1.507, + "step": 445 + }, + { + "epoch": 0.6574534733738714, + "grad_norm": 2.028774337387536, + "learning_rate": 4.906781729374804e-05, + "loss": 1.5131, + "step": 446 + }, + { + "epoch": 0.6589275843007186, + "grad_norm": 1.8197254986261917, + "learning_rate": 4.906245515016197e-05, + "loss": 1.5178, + "step": 447 + }, + { + "epoch": 0.6604016952275659, + "grad_norm": 1.821641821358822, + "learning_rate": 4.905707792327715e-05, + "loss": 1.6951, + "step": 448 + }, + { + "epoch": 0.6618758061544131, + "grad_norm": 1.7083524222199116, + "learning_rate": 4.9051685616464246e-05, + "loss": 1.4012, + "step": 449 + }, + { + "epoch": 0.6633499170812603, + "grad_norm": 1.689141428865083, + "learning_rate": 4.904627823310335e-05, + "loss": 1.3242, + "step": 450 + }, + { + "epoch": 0.6648240280081076, + "grad_norm": 1.6191904683833345, + "learning_rate": 4.9040855776584035e-05, + "loss": 1.438, + "step": 451 + }, + { + "epoch": 0.6662981389349548, + "grad_norm": 1.7294841229828566, + "learning_rate": 4.9035418250305314e-05, + "loss": 1.4644, + "step": 452 + }, + { + "epoch": 0.6677722498618021, + "grad_norm": 1.842108432150388, + "learning_rate": 4.9029965657675636e-05, + "loss": 1.492, + "step": 453 + }, + { + "epoch": 0.6692463607886493, + "grad_norm": 1.806699745654287, + "learning_rate": 4.9024498002112906e-05, + "loss": 1.4552, + "step": 454 + }, + { + "epoch": 0.6707204717154965, + "grad_norm": 1.6157211068527249, + "learning_rate": 4.901901528704447e-05, + "loss": 1.4178, + "step": 455 + }, + { + "epoch": 0.6721945826423439, + "grad_norm": 1.6756986584351095, + "learning_rate": 4.90135175159071e-05, + "loss": 1.4597, + "step": 456 + }, + { + "epoch": 0.6736686935691911, + "grad_norm": 1.7297972329884908, + "learning_rate": 4.900800469214703e-05, + "loss": 1.5601, + "step": 457 + }, + { + "epoch": 0.6751428044960384, + "grad_norm": 1.6800806339531662, + "learning_rate": 4.900247681921991e-05, + "loss": 1.5899, + "step": 458 + }, + { + "epoch": 0.6766169154228856, + "grad_norm": 1.6239636858088693, + "learning_rate": 4.899693390059082e-05, + "loss": 1.513, + "step": 459 + }, + { + "epoch": 0.6780910263497328, + "grad_norm": 1.7149385673708424, + "learning_rate": 4.89913759397343e-05, + "loss": 1.5596, + "step": 460 + }, + { + "epoch": 0.6795651372765801, + "grad_norm": 1.7551631374558498, + "learning_rate": 4.898580294013428e-05, + "loss": 1.5048, + "step": 461 + }, + { + "epoch": 0.6810392482034273, + "grad_norm": 1.790992861471002, + "learning_rate": 4.898021490528415e-05, + "loss": 1.4767, + "step": 462 + }, + { + "epoch": 0.6825133591302746, + "grad_norm": 1.832508992775453, + "learning_rate": 4.89746118386867e-05, + "loss": 1.318, + "step": 463 + }, + { + "epoch": 0.6839874700571218, + "grad_norm": 1.5878711056418666, + "learning_rate": 4.8968993743854176e-05, + "loss": 1.5015, + "step": 464 + }, + { + "epoch": 0.685461580983969, + "grad_norm": 1.7137147841115068, + "learning_rate": 4.89633606243082e-05, + "loss": 1.3526, + "step": 465 + }, + { + "epoch": 0.6869356919108163, + "grad_norm": 1.627229628130214, + "learning_rate": 4.895771248357983e-05, + "loss": 1.4907, + "step": 466 + }, + { + "epoch": 0.6884098028376635, + "grad_norm": 1.7391438947550655, + "learning_rate": 4.895204932520957e-05, + "loss": 1.4798, + "step": 467 + }, + { + "epoch": 0.6898839137645107, + "grad_norm": 1.7775534705209048, + "learning_rate": 4.8946371152747285e-05, + "loss": 1.3824, + "step": 468 + }, + { + "epoch": 0.691358024691358, + "grad_norm": 1.8921013734527743, + "learning_rate": 4.8940677969752295e-05, + "loss": 1.5791, + "step": 469 + }, + { + "epoch": 0.6928321356182052, + "grad_norm": 1.8413598683940766, + "learning_rate": 4.893496977979331e-05, + "loss": 1.6194, + "step": 470 + }, + { + "epoch": 0.6943062465450525, + "grad_norm": 1.8377235328533845, + "learning_rate": 4.892924658644844e-05, + "loss": 1.5055, + "step": 471 + }, + { + "epoch": 0.6957803574718998, + "grad_norm": 1.7725471580699306, + "learning_rate": 4.892350839330522e-05, + "loss": 1.4665, + "step": 472 + }, + { + "epoch": 0.697254468398747, + "grad_norm": 1.9568221233659708, + "learning_rate": 4.891775520396057e-05, + "loss": 1.4196, + "step": 473 + }, + { + "epoch": 0.6987285793255943, + "grad_norm": 1.7010672500969553, + "learning_rate": 4.8911987022020823e-05, + "loss": 1.5434, + "step": 474 + }, + { + "epoch": 0.7002026902524415, + "grad_norm": 1.812554523042569, + "learning_rate": 4.89062038511017e-05, + "loss": 1.4831, + "step": 475 + }, + { + "epoch": 0.7016768011792888, + "grad_norm": 1.7774638041098487, + "learning_rate": 4.8900405694828313e-05, + "loss": 1.3765, + "step": 476 + }, + { + "epoch": 0.703150912106136, + "grad_norm": 1.6492734118446895, + "learning_rate": 4.8894592556835186e-05, + "loss": 1.3158, + "step": 477 + }, + { + "epoch": 0.7046250230329832, + "grad_norm": 1.7110334395591669, + "learning_rate": 4.8888764440766225e-05, + "loss": 1.4758, + "step": 478 + }, + { + "epoch": 0.7060991339598305, + "grad_norm": 1.7046997069875438, + "learning_rate": 4.888292135027472e-05, + "loss": 1.4444, + "step": 479 + }, + { + "epoch": 0.7075732448866777, + "grad_norm": 1.7047225039460836, + "learning_rate": 4.887706328902335e-05, + "loss": 1.5892, + "step": 480 + }, + { + "epoch": 0.709047355813525, + "grad_norm": 1.7277817369697133, + "learning_rate": 4.8871190260684174e-05, + "loss": 1.4221, + "step": 481 + }, + { + "epoch": 0.7105214667403722, + "grad_norm": 1.592764905278726, + "learning_rate": 4.886530226893865e-05, + "loss": 1.4581, + "step": 482 + }, + { + "epoch": 0.7119955776672194, + "grad_norm": 2.1578804154588025, + "learning_rate": 4.88593993174776e-05, + "loss": 1.6157, + "step": 483 + }, + { + "epoch": 0.7134696885940667, + "grad_norm": 1.8208716192810515, + "learning_rate": 4.885348141000122e-05, + "loss": 1.4467, + "step": 484 + }, + { + "epoch": 0.7149437995209139, + "grad_norm": 1.7555074540656064, + "learning_rate": 4.8847548550219105e-05, + "loss": 1.5836, + "step": 485 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 1.5560618704334026, + "learning_rate": 4.884160074185019e-05, + "loss": 1.3614, + "step": 486 + }, + { + "epoch": 0.7178920213746084, + "grad_norm": 1.8615176893895358, + "learning_rate": 4.8835637988622804e-05, + "loss": 1.3898, + "step": 487 + }, + { + "epoch": 0.7193661323014556, + "grad_norm": 1.7605236931343822, + "learning_rate": 4.8829660294274636e-05, + "loss": 1.4536, + "step": 488 + }, + { + "epoch": 0.720840243228303, + "grad_norm": 1.6492765198767378, + "learning_rate": 4.8823667662552744e-05, + "loss": 1.3982, + "step": 489 + }, + { + "epoch": 0.7223143541551502, + "grad_norm": 1.8220000732832222, + "learning_rate": 4.881766009721354e-05, + "loss": 1.4368, + "step": 490 + }, + { + "epoch": 0.7237884650819975, + "grad_norm": 1.5817778207747901, + "learning_rate": 4.8811637602022806e-05, + "loss": 1.5633, + "step": 491 + }, + { + "epoch": 0.7252625760088447, + "grad_norm": 1.673540230811404, + "learning_rate": 4.8805600180755685e-05, + "loss": 1.6391, + "step": 492 + }, + { + "epoch": 0.7267366869356919, + "grad_norm": 1.6729552442283937, + "learning_rate": 4.8799547837196667e-05, + "loss": 1.4103, + "step": 493 + }, + { + "epoch": 0.7282107978625392, + "grad_norm": 1.8937503424414397, + "learning_rate": 4.87934805751396e-05, + "loss": 1.492, + "step": 494 + }, + { + "epoch": 0.7296849087893864, + "grad_norm": 1.6432538723385648, + "learning_rate": 4.8787398398387684e-05, + "loss": 1.5379, + "step": 495 + }, + { + "epoch": 0.7311590197162336, + "grad_norm": 1.6910717217086466, + "learning_rate": 4.878130131075347e-05, + "loss": 1.5918, + "step": 496 + }, + { + "epoch": 0.7326331306430809, + "grad_norm": 1.553604975934806, + "learning_rate": 4.877518931605885e-05, + "loss": 1.32, + "step": 497 + }, + { + "epoch": 0.7341072415699281, + "grad_norm": 1.5701799479095186, + "learning_rate": 4.8769062418135066e-05, + "loss": 1.4937, + "step": 498 + }, + { + "epoch": 0.7355813524967754, + "grad_norm": 1.6125995960084167, + "learning_rate": 4.8762920620822704e-05, + "loss": 1.6703, + "step": 499 + }, + { + "epoch": 0.7370554634236226, + "grad_norm": 1.815907859107379, + "learning_rate": 4.875676392797168e-05, + "loss": 1.5666, + "step": 500 + }, + { + "epoch": 0.7370554634236226, + "eval_bleu": 0.049864103853823394, + "eval_bleu_1gram": 0.3517426238755257, + "eval_bleu_2gram": 0.12146521375505184, + "eval_bleu_3gram": 0.04852393719199864, + "eval_bleu_4gram": 0.02316378746484796, + "eval_rag_val_loss": 1.525708330254401, + "eval_rouge1": 0.3319672816480387, + "eval_rouge2": 0.11286492939453996, + "eval_rougeL": 0.31334997990494057, + "step": 500 + }, + { + "epoch": 0.7385295743504698, + "grad_norm": 1.8611478928431302, + "learning_rate": 4.875059234344126e-05, + "loss": 1.5724, + "step": 501 + }, + { + "epoch": 0.7400036852773171, + "grad_norm": 1.6171364154210777, + "learning_rate": 4.874440587110003e-05, + "loss": 1.5886, + "step": 502 + }, + { + "epoch": 0.7414777962041643, + "grad_norm": 1.7119811894105343, + "learning_rate": 4.873820451482592e-05, + "loss": 1.461, + "step": 503 + }, + { + "epoch": 0.7429519071310116, + "grad_norm": 1.6702272370204336, + "learning_rate": 4.873198827850618e-05, + "loss": 1.5485, + "step": 504 + }, + { + "epoch": 0.7444260180578589, + "grad_norm": 1.867967259220181, + "learning_rate": 4.872575716603739e-05, + "loss": 1.5593, + "step": 505 + }, + { + "epoch": 0.7459001289847061, + "grad_norm": 1.8501421152120512, + "learning_rate": 4.871951118132547e-05, + "loss": 1.4647, + "step": 506 + }, + { + "epoch": 0.7473742399115534, + "grad_norm": 1.6515600632114533, + "learning_rate": 4.8713250328285654e-05, + "loss": 1.5828, + "step": 507 + }, + { + "epoch": 0.7488483508384006, + "grad_norm": 1.7049863250944728, + "learning_rate": 4.8706974610842474e-05, + "loss": 1.4158, + "step": 508 + }, + { + "epoch": 0.7503224617652479, + "grad_norm": 1.7348378569063014, + "learning_rate": 4.87006840329298e-05, + "loss": 1.3832, + "step": 509 + }, + { + "epoch": 0.7517965726920951, + "grad_norm": 1.6345373652899626, + "learning_rate": 4.8694378598490826e-05, + "loss": 1.5045, + "step": 510 + }, + { + "epoch": 0.7532706836189423, + "grad_norm": 1.9749966174712368, + "learning_rate": 4.868805831147805e-05, + "loss": 1.6251, + "step": 511 + }, + { + "epoch": 0.7547447945457896, + "grad_norm": 1.646313134607012, + "learning_rate": 4.868172317585326e-05, + "loss": 1.554, + "step": 512 + }, + { + "epoch": 0.7562189054726368, + "grad_norm": 1.6642882144775883, + "learning_rate": 4.867537319558758e-05, + "loss": 1.4583, + "step": 513 + }, + { + "epoch": 0.757693016399484, + "grad_norm": 1.7284106551371836, + "learning_rate": 4.866900837466144e-05, + "loss": 1.5583, + "step": 514 + }, + { + "epoch": 0.7591671273263313, + "grad_norm": 1.6809198870838025, + "learning_rate": 4.8662628717064544e-05, + "loss": 1.5409, + "step": 515 + }, + { + "epoch": 0.7606412382531785, + "grad_norm": 1.562017976796623, + "learning_rate": 4.865623422679593e-05, + "loss": 1.5286, + "step": 516 + }, + { + "epoch": 0.7621153491800258, + "grad_norm": 1.75968284285208, + "learning_rate": 4.8649824907863894e-05, + "loss": 1.521, + "step": 517 + }, + { + "epoch": 0.763589460106873, + "grad_norm": 1.5779583531384793, + "learning_rate": 4.864340076428607e-05, + "loss": 1.3209, + "step": 518 + }, + { + "epoch": 0.7650635710337202, + "grad_norm": 1.7439921658716193, + "learning_rate": 4.863696180008937e-05, + "loss": 1.5267, + "step": 519 + }, + { + "epoch": 0.7665376819605675, + "grad_norm": 1.7213756965833582, + "learning_rate": 4.8630508019309976e-05, + "loss": 1.51, + "step": 520 + }, + { + "epoch": 0.7680117928874147, + "grad_norm": 1.7651257779743919, + "learning_rate": 4.8624039425993375e-05, + "loss": 1.5558, + "step": 521 + }, + { + "epoch": 0.7694859038142621, + "grad_norm": 1.6010956618817953, + "learning_rate": 4.861755602419434e-05, + "loss": 1.5821, + "step": 522 + }, + { + "epoch": 0.7709600147411093, + "grad_norm": 1.4458724303369055, + "learning_rate": 4.861105781797692e-05, + "loss": 1.4634, + "step": 523 + }, + { + "epoch": 0.7724341256679566, + "grad_norm": 1.6344709233276349, + "learning_rate": 4.8604544811414465e-05, + "loss": 1.4633, + "step": 524 + }, + { + "epoch": 0.7739082365948038, + "grad_norm": 1.5475520184082878, + "learning_rate": 4.859801700858957e-05, + "loss": 1.4808, + "step": 525 + }, + { + "epoch": 0.775382347521651, + "grad_norm": 1.7307446262045316, + "learning_rate": 4.859147441359412e-05, + "loss": 1.576, + "step": 526 + }, + { + "epoch": 0.7768564584484983, + "grad_norm": 1.7574041273683978, + "learning_rate": 4.858491703052927e-05, + "loss": 1.4243, + "step": 527 + }, + { + "epoch": 0.7783305693753455, + "grad_norm": 1.5035501430054596, + "learning_rate": 4.8578344863505464e-05, + "loss": 1.4507, + "step": 528 + }, + { + "epoch": 0.7798046803021927, + "grad_norm": 1.5982972144115446, + "learning_rate": 4.857175791664238e-05, + "loss": 1.3406, + "step": 529 + }, + { + "epoch": 0.78127879122904, + "grad_norm": 1.7771944423464143, + "learning_rate": 4.856515619406898e-05, + "loss": 1.6275, + "step": 530 + }, + { + "epoch": 0.7827529021558872, + "grad_norm": 1.6675574624199347, + "learning_rate": 4.855853969992349e-05, + "loss": 1.4599, + "step": 531 + }, + { + "epoch": 0.7842270130827345, + "grad_norm": 1.7613711807223646, + "learning_rate": 4.8551908438353374e-05, + "loss": 1.4255, + "step": 532 + }, + { + "epoch": 0.7857011240095817, + "grad_norm": 1.7896445714252482, + "learning_rate": 4.854526241351539e-05, + "loss": 1.5616, + "step": 533 + }, + { + "epoch": 0.7871752349364289, + "grad_norm": 1.8317942516864372, + "learning_rate": 4.853860162957552e-05, + "loss": 1.3952, + "step": 534 + }, + { + "epoch": 0.7886493458632762, + "grad_norm": 1.8120388233539366, + "learning_rate": 4.8531926090709016e-05, + "loss": 1.3347, + "step": 535 + }, + { + "epoch": 0.7901234567901234, + "grad_norm": 1.7306410314079854, + "learning_rate": 4.8525235801100346e-05, + "loss": 1.455, + "step": 536 + }, + { + "epoch": 0.7915975677169707, + "grad_norm": 1.7714094758103223, + "learning_rate": 4.851853076494327e-05, + "loss": 1.188, + "step": 537 + }, + { + "epoch": 0.793071678643818, + "grad_norm": 1.6633401655134548, + "learning_rate": 4.8511810986440766e-05, + "loss": 1.563, + "step": 538 + }, + { + "epoch": 0.7945457895706652, + "grad_norm": 1.6108013609850818, + "learning_rate": 4.8505076469805054e-05, + "loss": 1.512, + "step": 539 + }, + { + "epoch": 0.7960199004975125, + "grad_norm": 1.76443008904392, + "learning_rate": 4.849832721925759e-05, + "loss": 1.681, + "step": 540 + }, + { + "epoch": 0.7974940114243597, + "grad_norm": 1.8573863189535162, + "learning_rate": 4.849156323902908e-05, + "loss": 1.4304, + "step": 541 + }, + { + "epoch": 0.798968122351207, + "grad_norm": 1.8755339180206456, + "learning_rate": 4.848478453335946e-05, + "loss": 1.398, + "step": 542 + }, + { + "epoch": 0.8004422332780542, + "grad_norm": 1.7021103382497447, + "learning_rate": 4.8477991106497874e-05, + "loss": 1.5522, + "step": 543 + }, + { + "epoch": 0.8019163442049014, + "grad_norm": 1.6021965446536812, + "learning_rate": 4.847118296270272e-05, + "loss": 1.4176, + "step": 544 + }, + { + "epoch": 0.8033904551317487, + "grad_norm": 1.6114137199093335, + "learning_rate": 4.8464360106241615e-05, + "loss": 1.5274, + "step": 545 + }, + { + "epoch": 0.8048645660585959, + "grad_norm": 1.7533287677898846, + "learning_rate": 4.845752254139139e-05, + "loss": 1.4279, + "step": 546 + }, + { + "epoch": 0.8063386769854431, + "grad_norm": 1.923612909817304, + "learning_rate": 4.845067027243809e-05, + "loss": 1.5484, + "step": 547 + }, + { + "epoch": 0.8078127879122904, + "grad_norm": 1.8260488866506253, + "learning_rate": 4.844380330367701e-05, + "loss": 1.4365, + "step": 548 + }, + { + "epoch": 0.8092868988391376, + "grad_norm": 1.7314524728419343, + "learning_rate": 4.843692163941264e-05, + "loss": 1.6001, + "step": 549 + }, + { + "epoch": 0.8107610097659849, + "grad_norm": 1.5860084555401721, + "learning_rate": 4.8430025283958645e-05, + "loss": 1.564, + "step": 550 + }, + { + "epoch": 0.8122351206928321, + "grad_norm": 1.6769805387028527, + "learning_rate": 4.842311424163797e-05, + "loss": 1.4055, + "step": 551 + }, + { + "epoch": 0.8137092316196793, + "grad_norm": 1.6973788417899383, + "learning_rate": 4.8416188516782715e-05, + "loss": 1.5738, + "step": 552 + }, + { + "epoch": 0.8151833425465266, + "grad_norm": 2.0282141667354168, + "learning_rate": 4.84092481137342e-05, + "loss": 1.5318, + "step": 553 + }, + { + "epoch": 0.8166574534733738, + "grad_norm": 1.5529009651691543, + "learning_rate": 4.840229303684294e-05, + "loss": 1.2693, + "step": 554 + }, + { + "epoch": 0.8181315644002212, + "grad_norm": 1.606619466600095, + "learning_rate": 4.8395323290468655e-05, + "loss": 1.4098, + "step": 555 + }, + { + "epoch": 0.8196056753270684, + "grad_norm": 1.6385132050108706, + "learning_rate": 4.838833887898026e-05, + "loss": 1.5526, + "step": 556 + }, + { + "epoch": 0.8210797862539156, + "grad_norm": 1.611953891274566, + "learning_rate": 4.838133980675586e-05, + "loss": 1.3188, + "step": 557 + }, + { + "epoch": 0.8225538971807629, + "grad_norm": 1.697205150655051, + "learning_rate": 4.837432607818275e-05, + "loss": 1.4741, + "step": 558 + }, + { + "epoch": 0.8240280081076101, + "grad_norm": 1.8683644818860858, + "learning_rate": 4.836729769765741e-05, + "loss": 1.5286, + "step": 559 + }, + { + "epoch": 0.8255021190344574, + "grad_norm": 1.6028531712224687, + "learning_rate": 4.83602546695855e-05, + "loss": 1.4908, + "step": 560 + }, + { + "epoch": 0.8269762299613046, + "grad_norm": 1.9011592741517895, + "learning_rate": 4.835319699838189e-05, + "loss": 1.7057, + "step": 561 + }, + { + "epoch": 0.8284503408881518, + "grad_norm": 1.7526635608178835, + "learning_rate": 4.834612468847058e-05, + "loss": 1.5934, + "step": 562 + }, + { + "epoch": 0.8299244518149991, + "grad_norm": 1.7057506375502405, + "learning_rate": 4.833903774428481e-05, + "loss": 1.3791, + "step": 563 + }, + { + "epoch": 0.8313985627418463, + "grad_norm": 1.7633940519083953, + "learning_rate": 4.833193617026692e-05, + "loss": 1.5183, + "step": 564 + }, + { + "epoch": 0.8328726736686936, + "grad_norm": 1.6557604228098732, + "learning_rate": 4.8324819970868473e-05, + "loss": 1.342, + "step": 565 + }, + { + "epoch": 0.8343467845955408, + "grad_norm": 1.6511287325296256, + "learning_rate": 4.831768915055019e-05, + "loss": 1.5193, + "step": 566 + }, + { + "epoch": 0.835820895522388, + "grad_norm": 1.885977399983924, + "learning_rate": 4.831054371378194e-05, + "loss": 1.4996, + "step": 567 + }, + { + "epoch": 0.8372950064492353, + "grad_norm": 1.9637028015020381, + "learning_rate": 4.830338366504277e-05, + "loss": 1.4268, + "step": 568 + }, + { + "epoch": 0.8387691173760825, + "grad_norm": 1.7081776137053164, + "learning_rate": 4.829620900882089e-05, + "loss": 1.2281, + "step": 569 + }, + { + "epoch": 0.8402432283029297, + "grad_norm": 1.642584221246862, + "learning_rate": 4.8289019749613645e-05, + "loss": 1.4689, + "step": 570 + }, + { + "epoch": 0.8417173392297771, + "grad_norm": 1.7788218629411756, + "learning_rate": 4.8281815891927554e-05, + "loss": 1.4177, + "step": 571 + }, + { + "epoch": 0.8431914501566243, + "grad_norm": 1.7466419562686073, + "learning_rate": 4.827459744027828e-05, + "loss": 1.4491, + "step": 572 + }, + { + "epoch": 0.8446655610834716, + "grad_norm": 1.795111279876853, + "learning_rate": 4.826736439919063e-05, + "loss": 1.4244, + "step": 573 + }, + { + "epoch": 0.8461396720103188, + "grad_norm": 1.8952211991700518, + "learning_rate": 4.826011677319857e-05, + "loss": 1.432, + "step": 574 + }, + { + "epoch": 0.847613782937166, + "grad_norm": 1.7548746608742227, + "learning_rate": 4.825285456684518e-05, + "loss": 1.5606, + "step": 575 + }, + { + "epoch": 0.8490878938640133, + "grad_norm": 1.6553174398419468, + "learning_rate": 4.824557778468272e-05, + "loss": 1.3863, + "step": 576 + }, + { + "epoch": 0.8505620047908605, + "grad_norm": 1.7501735601189816, + "learning_rate": 4.823828643127255e-05, + "loss": 1.5383, + "step": 577 + }, + { + "epoch": 0.8520361157177078, + "grad_norm": 1.7159971212305138, + "learning_rate": 4.823098051118519e-05, + "loss": 1.5674, + "step": 578 + }, + { + "epoch": 0.853510226644555, + "grad_norm": 2.047117379084849, + "learning_rate": 4.822366002900027e-05, + "loss": 1.646, + "step": 579 + }, + { + "epoch": 0.8549843375714022, + "grad_norm": 1.8469605376030016, + "learning_rate": 4.821632498930656e-05, + "loss": 1.5541, + "step": 580 + }, + { + "epoch": 0.8564584484982495, + "grad_norm": 1.7193612138978247, + "learning_rate": 4.820897539670195e-05, + "loss": 1.4, + "step": 581 + }, + { + "epoch": 0.8579325594250967, + "grad_norm": 1.6259148296745782, + "learning_rate": 4.820161125579347e-05, + "loss": 1.5393, + "step": 582 + }, + { + "epoch": 0.859406670351944, + "grad_norm": 1.5843744703062477, + "learning_rate": 4.819423257119723e-05, + "loss": 1.4826, + "step": 583 + }, + { + "epoch": 0.8608807812787912, + "grad_norm": 1.8474203527727162, + "learning_rate": 4.818683934753851e-05, + "loss": 1.6756, + "step": 584 + }, + { + "epoch": 0.8623548922056384, + "grad_norm": 1.7197620533319622, + "learning_rate": 4.817943158945166e-05, + "loss": 1.5244, + "step": 585 + }, + { + "epoch": 0.8638290031324857, + "grad_norm": 1.7459539233722814, + "learning_rate": 4.817200930158015e-05, + "loss": 1.4336, + "step": 586 + }, + { + "epoch": 0.8653031140593329, + "grad_norm": 1.606937896435291, + "learning_rate": 4.816457248857657e-05, + "loss": 1.4565, + "step": 587 + }, + { + "epoch": 0.8667772249861803, + "grad_norm": 1.7682005739966837, + "learning_rate": 4.815712115510261e-05, + "loss": 1.3508, + "step": 588 + }, + { + "epoch": 0.8682513359130275, + "grad_norm": 1.4921241467193749, + "learning_rate": 4.8149655305829066e-05, + "loss": 1.4753, + "step": 589 + }, + { + "epoch": 0.8697254468398747, + "grad_norm": 1.633669176736577, + "learning_rate": 4.814217494543581e-05, + "loss": 1.2383, + "step": 590 + }, + { + "epoch": 0.871199557766722, + "grad_norm": 1.488404596779175, + "learning_rate": 4.813468007861185e-05, + "loss": 1.4448, + "step": 591 + }, + { + "epoch": 0.8726736686935692, + "grad_norm": 2.0575587875848593, + "learning_rate": 4.812717071005525e-05, + "loss": 1.5027, + "step": 592 + }, + { + "epoch": 0.8741477796204165, + "grad_norm": 2.0227995938473233, + "learning_rate": 4.8119646844473185e-05, + "loss": 1.5384, + "step": 593 + }, + { + "epoch": 0.8756218905472637, + "grad_norm": 1.9656235226381509, + "learning_rate": 4.811210848658191e-05, + "loss": 1.3056, + "step": 594 + }, + { + "epoch": 0.8770960014741109, + "grad_norm": 1.5471618078198595, + "learning_rate": 4.8104555641106766e-05, + "loss": 1.4889, + "step": 595 + }, + { + "epoch": 0.8785701124009582, + "grad_norm": 1.8372806262418375, + "learning_rate": 4.8096988312782174e-05, + "loss": 1.6928, + "step": 596 + }, + { + "epoch": 0.8800442233278054, + "grad_norm": 1.7147729811262298, + "learning_rate": 4.808940650635163e-05, + "loss": 1.4582, + "step": 597 + }, + { + "epoch": 0.8815183342546526, + "grad_norm": 1.7960392874432884, + "learning_rate": 4.8081810226567725e-05, + "loss": 1.5862, + "step": 598 + }, + { + "epoch": 0.8829924451814999, + "grad_norm": 1.711820331103058, + "learning_rate": 4.8074199478192097e-05, + "loss": 1.498, + "step": 599 + }, + { + "epoch": 0.8844665561083471, + "grad_norm": 1.7276360121758914, + "learning_rate": 4.8066574265995464e-05, + "loss": 1.473, + "step": 600 + }, + { + "epoch": 0.8859406670351944, + "grad_norm": 2.0970992444538403, + "learning_rate": 4.805893459475761e-05, + "loss": 1.3368, + "step": 601 + }, + { + "epoch": 0.8874147779620416, + "grad_norm": 2.1279253178377004, + "learning_rate": 4.805128046926739e-05, + "loss": 1.6721, + "step": 602 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.9262093770312738, + "learning_rate": 4.804361189432271e-05, + "loss": 1.6607, + "step": 603 + }, + { + "epoch": 0.8903629998157362, + "grad_norm": 1.8930343534694576, + "learning_rate": 4.803592887473053e-05, + "loss": 1.7086, + "step": 604 + }, + { + "epoch": 0.8918371107425834, + "grad_norm": 1.632568122061679, + "learning_rate": 4.802823141530687e-05, + "loss": 1.4234, + "step": 605 + }, + { + "epoch": 0.8933112216694307, + "grad_norm": 1.9635123563513197, + "learning_rate": 4.8020519520876816e-05, + "loss": 1.5608, + "step": 606 + }, + { + "epoch": 0.8947853325962779, + "grad_norm": 1.5958031537793698, + "learning_rate": 4.801279319627448e-05, + "loss": 1.4355, + "step": 607 + }, + { + "epoch": 0.8962594435231251, + "grad_norm": 1.5734508525202275, + "learning_rate": 4.8005052446343016e-05, + "loss": 1.5025, + "step": 608 + }, + { + "epoch": 0.8977335544499724, + "grad_norm": 1.590357741890919, + "learning_rate": 4.799729727593466e-05, + "loss": 1.6171, + "step": 609 + }, + { + "epoch": 0.8992076653768196, + "grad_norm": 1.7315146426430756, + "learning_rate": 4.798952768991063e-05, + "loss": 1.5196, + "step": 610 + }, + { + "epoch": 0.9006817763036669, + "grad_norm": 1.573508582834522, + "learning_rate": 4.798174369314123e-05, + "loss": 1.4365, + "step": 611 + }, + { + "epoch": 0.9021558872305141, + "grad_norm": 1.6606401523620964, + "learning_rate": 4.7973945290505766e-05, + "loss": 1.3367, + "step": 612 + }, + { + "epoch": 0.9036299981573613, + "grad_norm": 1.7051436334110197, + "learning_rate": 4.796613248689259e-05, + "loss": 1.401, + "step": 613 + }, + { + "epoch": 0.9051041090842086, + "grad_norm": 1.697160899769141, + "learning_rate": 4.795830528719908e-05, + "loss": 1.4064, + "step": 614 + }, + { + "epoch": 0.9065782200110558, + "grad_norm": 1.6792637490099682, + "learning_rate": 4.795046369633163e-05, + "loss": 1.4698, + "step": 615 + }, + { + "epoch": 0.908052330937903, + "grad_norm": 1.898250366532436, + "learning_rate": 4.7942607719205663e-05, + "loss": 1.4675, + "step": 616 + }, + { + "epoch": 0.9095264418647503, + "grad_norm": 1.6596374419127309, + "learning_rate": 4.793473736074561e-05, + "loss": 1.3765, + "step": 617 + }, + { + "epoch": 0.9110005527915975, + "grad_norm": 1.598639822019987, + "learning_rate": 4.792685262588492e-05, + "loss": 1.6055, + "step": 618 + }, + { + "epoch": 0.9124746637184448, + "grad_norm": 1.7802517419310218, + "learning_rate": 4.791895351956607e-05, + "loss": 1.6189, + "step": 619 + }, + { + "epoch": 0.913948774645292, + "grad_norm": 1.5977536943560384, + "learning_rate": 4.791104004674052e-05, + "loss": 1.3756, + "step": 620 + }, + { + "epoch": 0.9154228855721394, + "grad_norm": 1.5973833592535913, + "learning_rate": 4.7903112212368756e-05, + "loss": 1.4855, + "step": 621 + }, + { + "epoch": 0.9168969964989866, + "grad_norm": 1.5772981649370836, + "learning_rate": 4.789517002142026e-05, + "loss": 1.2613, + "step": 622 + }, + { + "epoch": 0.9183711074258338, + "grad_norm": 1.5425538096633087, + "learning_rate": 4.788721347887349e-05, + "loss": 1.4042, + "step": 623 + }, + { + "epoch": 0.9198452183526811, + "grad_norm": 1.6595327844694368, + "learning_rate": 4.7879242589715955e-05, + "loss": 1.5202, + "step": 624 + }, + { + "epoch": 0.9213193292795283, + "grad_norm": 1.882409966837432, + "learning_rate": 4.78712573589441e-05, + "loss": 1.8385, + "step": 625 + }, + { + "epoch": 0.9227934402063755, + "grad_norm": 1.5409680493339222, + "learning_rate": 4.7863257791563384e-05, + "loss": 1.5849, + "step": 626 + }, + { + "epoch": 0.9242675511332228, + "grad_norm": 1.8439664551999941, + "learning_rate": 4.785524389258827e-05, + "loss": 1.4637, + "step": 627 + }, + { + "epoch": 0.92574166206007, + "grad_norm": 1.6828936827335954, + "learning_rate": 4.7847215667042165e-05, + "loss": 1.6227, + "step": 628 + }, + { + "epoch": 0.9272157729869173, + "grad_norm": 1.6952506199104134, + "learning_rate": 4.78391731199575e-05, + "loss": 1.5701, + "step": 629 + }, + { + "epoch": 0.9286898839137645, + "grad_norm": 1.547078205979934, + "learning_rate": 4.7831116256375644e-05, + "loss": 1.4696, + "step": 630 + }, + { + "epoch": 0.9301639948406117, + "grad_norm": 1.600661531423974, + "learning_rate": 4.782304508134696e-05, + "loss": 1.5449, + "step": 631 + }, + { + "epoch": 0.931638105767459, + "grad_norm": 1.588480371845649, + "learning_rate": 4.7814959599930794e-05, + "loss": 1.2881, + "step": 632 + }, + { + "epoch": 0.9331122166943062, + "grad_norm": 1.7806309075891864, + "learning_rate": 4.7806859817195425e-05, + "loss": 1.5511, + "step": 633 + }, + { + "epoch": 0.9345863276211535, + "grad_norm": 1.5774950337520721, + "learning_rate": 4.779874573821814e-05, + "loss": 1.3597, + "step": 634 + }, + { + "epoch": 0.9360604385480007, + "grad_norm": 1.6792695700996014, + "learning_rate": 4.779061736808514e-05, + "loss": 1.4927, + "step": 635 + }, + { + "epoch": 0.9375345494748479, + "grad_norm": 1.6897696314344495, + "learning_rate": 4.778247471189163e-05, + "loss": 1.3959, + "step": 636 + }, + { + "epoch": 0.9390086604016952, + "grad_norm": 1.6187650201166919, + "learning_rate": 4.777431777474174e-05, + "loss": 1.4868, + "step": 637 + }, + { + "epoch": 0.9404827713285425, + "grad_norm": 1.505513944477025, + "learning_rate": 4.776614656174856e-05, + "loss": 1.3534, + "step": 638 + }, + { + "epoch": 0.9419568822553898, + "grad_norm": 1.4920532805310136, + "learning_rate": 4.775796107803413e-05, + "loss": 1.4877, + "step": 639 + }, + { + "epoch": 0.943430993182237, + "grad_norm": 1.789707650546946, + "learning_rate": 4.7749761328729436e-05, + "loss": 1.632, + "step": 640 + }, + { + "epoch": 0.9449051041090842, + "grad_norm": 1.5988219096316705, + "learning_rate": 4.77415473189744e-05, + "loss": 1.4742, + "step": 641 + }, + { + "epoch": 0.9463792150359315, + "grad_norm": 1.5586449942135832, + "learning_rate": 4.77333190539179e-05, + "loss": 1.4758, + "step": 642 + }, + { + "epoch": 0.9478533259627787, + "grad_norm": 1.5988251902993467, + "learning_rate": 4.772507653871773e-05, + "loss": 1.3524, + "step": 643 + }, + { + "epoch": 0.949327436889626, + "grad_norm": 1.7705209419112953, + "learning_rate": 4.7716819778540625e-05, + "loss": 1.6272, + "step": 644 + }, + { + "epoch": 0.9508015478164732, + "grad_norm": 1.660636706671694, + "learning_rate": 4.770854877856225e-05, + "loss": 1.4897, + "step": 645 + }, + { + "epoch": 0.9522756587433204, + "grad_norm": 1.9015811590127132, + "learning_rate": 4.7700263543967195e-05, + "loss": 1.4256, + "step": 646 + }, + { + "epoch": 0.9537497696701677, + "grad_norm": 1.7306241553384767, + "learning_rate": 4.769196407994898e-05, + "loss": 1.4095, + "step": 647 + }, + { + "epoch": 0.9552238805970149, + "grad_norm": 1.6024022575718655, + "learning_rate": 4.768365039171002e-05, + "loss": 1.5108, + "step": 648 + }, + { + "epoch": 0.9566979915238621, + "grad_norm": 1.8686845595725392, + "learning_rate": 4.7675322484461674e-05, + "loss": 1.5385, + "step": 649 + }, + { + "epoch": 0.9581721024507094, + "grad_norm": 1.5783638348530795, + "learning_rate": 4.766698036342421e-05, + "loss": 1.3522, + "step": 650 + }, + { + "epoch": 0.9596462133775566, + "grad_norm": 1.7358905940409899, + "learning_rate": 4.765862403382678e-05, + "loss": 1.4563, + "step": 651 + }, + { + "epoch": 0.9611203243044039, + "grad_norm": 1.6868556169864075, + "learning_rate": 4.7650253500907494e-05, + "loss": 1.5699, + "step": 652 + }, + { + "epoch": 0.9625944352312511, + "grad_norm": 1.576506209337351, + "learning_rate": 4.76418687699133e-05, + "loss": 1.6541, + "step": 653 + }, + { + "epoch": 0.9640685461580984, + "grad_norm": 1.970339239606081, + "learning_rate": 4.76334698461001e-05, + "loss": 1.4497, + "step": 654 + }, + { + "epoch": 0.9655426570849457, + "grad_norm": 1.7663451894931055, + "learning_rate": 4.7625056734732654e-05, + "loss": 1.417, + "step": 655 + }, + { + "epoch": 0.9670167680117929, + "grad_norm": 1.8093780069540335, + "learning_rate": 4.7616629441084655e-05, + "loss": 1.468, + "step": 656 + }, + { + "epoch": 0.9684908789386402, + "grad_norm": 1.506355886575976, + "learning_rate": 4.760818797043864e-05, + "loss": 1.5133, + "step": 657 + }, + { + "epoch": 0.9699649898654874, + "grad_norm": 1.665298432763018, + "learning_rate": 4.759973232808609e-05, + "loss": 1.474, + "step": 658 + }, + { + "epoch": 0.9714391007923346, + "grad_norm": 1.5021839138152469, + "learning_rate": 4.75912625193273e-05, + "loss": 1.4184, + "step": 659 + }, + { + "epoch": 0.9729132117191819, + "grad_norm": 1.7473590541859465, + "learning_rate": 4.7582778549471494e-05, + "loss": 1.449, + "step": 660 + }, + { + "epoch": 0.9743873226460291, + "grad_norm": 1.5233425012948403, + "learning_rate": 4.7574280423836776e-05, + "loss": 1.2742, + "step": 661 + }, + { + "epoch": 0.9758614335728764, + "grad_norm": 1.7013333196929006, + "learning_rate": 4.756576814775009e-05, + "loss": 1.5258, + "step": 662 + }, + { + "epoch": 0.9773355444997236, + "grad_norm": 1.6033550372541576, + "learning_rate": 4.7557241726547266e-05, + "loss": 1.4548, + "step": 663 + }, + { + "epoch": 0.9788096554265708, + "grad_norm": 1.6487359956344683, + "learning_rate": 4.7548701165573003e-05, + "loss": 1.3467, + "step": 664 + }, + { + "epoch": 0.9802837663534181, + "grad_norm": 1.648531364191147, + "learning_rate": 4.754014647018088e-05, + "loss": 1.5249, + "step": 665 + }, + { + "epoch": 0.9817578772802653, + "grad_norm": 1.527489228405722, + "learning_rate": 4.75315776457333e-05, + "loss": 1.2307, + "step": 666 + }, + { + "epoch": 0.9832319882071126, + "grad_norm": 1.6423498623305697, + "learning_rate": 4.752299469760154e-05, + "loss": 1.4412, + "step": 667 + }, + { + "epoch": 0.9847060991339598, + "grad_norm": 1.608062579370135, + "learning_rate": 4.751439763116575e-05, + "loss": 1.5007, + "step": 668 + }, + { + "epoch": 0.986180210060807, + "grad_norm": 1.693547609068286, + "learning_rate": 4.750578645181489e-05, + "loss": 1.523, + "step": 669 + }, + { + "epoch": 0.9876543209876543, + "grad_norm": 1.6393169456496408, + "learning_rate": 4.74971611649468e-05, + "loss": 1.5944, + "step": 670 + }, + { + "epoch": 0.9891284319145016, + "grad_norm": 1.5490394630897009, + "learning_rate": 4.748852177596815e-05, + "loss": 1.4744, + "step": 671 + }, + { + "epoch": 0.9906025428413489, + "grad_norm": 1.7242987105613135, + "learning_rate": 4.747986829029445e-05, + "loss": 1.6405, + "step": 672 + }, + { + "epoch": 0.9920766537681961, + "grad_norm": 1.6003581659453179, + "learning_rate": 4.747120071335004e-05, + "loss": 1.5092, + "step": 673 + }, + { + "epoch": 0.9935507646950433, + "grad_norm": 1.7075237899173015, + "learning_rate": 4.746251905056811e-05, + "loss": 1.4837, + "step": 674 + }, + { + "epoch": 0.9950248756218906, + "grad_norm": 1.6254956149860846, + "learning_rate": 4.745382330739067e-05, + "loss": 1.5699, + "step": 675 + }, + { + "epoch": 0.9964989865487378, + "grad_norm": 1.6152010156583876, + "learning_rate": 4.7445113489268544e-05, + "loss": 1.3392, + "step": 676 + }, + { + "epoch": 0.997973097475585, + "grad_norm": 1.4897348109055284, + "learning_rate": 4.74363896016614e-05, + "loss": 1.4951, + "step": 677 + }, + { + "epoch": 0.9994472084024323, + "grad_norm": 1.6969560663519077, + "learning_rate": 4.742765165003772e-05, + "loss": 1.4857, + "step": 678 + }, + { + "epoch": 1.0009213193292796, + "grad_norm": 1.6645701175908278, + "learning_rate": 4.741889963987478e-05, + "loss": 1.4045, + "step": 679 + }, + { + "epoch": 1.0023954302561269, + "grad_norm": 1.6543320400999817, + "learning_rate": 4.741013357665871e-05, + "loss": 1.1458, + "step": 680 + }, + { + "epoch": 1.0038695411829741, + "grad_norm": 1.8272377534665492, + "learning_rate": 4.7401353465884406e-05, + "loss": 1.3011, + "step": 681 + }, + { + "epoch": 1.0053436521098214, + "grad_norm": 1.554451162499311, + "learning_rate": 4.73925593130556e-05, + "loss": 1.1387, + "step": 682 + }, + { + "epoch": 1.0068177630366686, + "grad_norm": 2.033797910907898, + "learning_rate": 4.7383751123684806e-05, + "loss": 1.3314, + "step": 683 + }, + { + "epoch": 1.0082918739635158, + "grad_norm": 1.7324135488763774, + "learning_rate": 4.737492890329335e-05, + "loss": 1.1564, + "step": 684 + }, + { + "epoch": 1.009765984890363, + "grad_norm": 1.9036313816118642, + "learning_rate": 4.736609265741135e-05, + "loss": 1.2494, + "step": 685 + }, + { + "epoch": 1.0112400958172103, + "grad_norm": 1.6847992937671599, + "learning_rate": 4.7357242391577724e-05, + "loss": 1.3732, + "step": 686 + }, + { + "epoch": 1.0127142067440575, + "grad_norm": 2.0434491557307917, + "learning_rate": 4.7348378111340145e-05, + "loss": 1.1515, + "step": 687 + }, + { + "epoch": 1.0141883176709048, + "grad_norm": 1.8929767956407253, + "learning_rate": 4.733949982225511e-05, + "loss": 1.184, + "step": 688 + }, + { + "epoch": 1.015662428597752, + "grad_norm": 1.8249296645105988, + "learning_rate": 4.7330607529887884e-05, + "loss": 1.0749, + "step": 689 + }, + { + "epoch": 1.0171365395245993, + "grad_norm": 1.8259922857257385, + "learning_rate": 4.73217012398125e-05, + "loss": 1.2582, + "step": 690 + }, + { + "epoch": 1.0186106504514465, + "grad_norm": 1.9125887432577295, + "learning_rate": 4.731278095761178e-05, + "loss": 1.2178, + "step": 691 + }, + { + "epoch": 1.0200847613782937, + "grad_norm": 1.9581689021561286, + "learning_rate": 4.73038466888773e-05, + "loss": 1.3432, + "step": 692 + }, + { + "epoch": 1.021558872305141, + "grad_norm": 2.0120926296608945, + "learning_rate": 4.729489843920942e-05, + "loss": 1.1121, + "step": 693 + }, + { + "epoch": 1.0230329832319882, + "grad_norm": 1.7744387182476278, + "learning_rate": 4.728593621421726e-05, + "loss": 1.0938, + "step": 694 + }, + { + "epoch": 1.0245070941588355, + "grad_norm": 1.9230047487506763, + "learning_rate": 4.727696001951869e-05, + "loss": 1.0658, + "step": 695 + }, + { + "epoch": 1.0259812050856827, + "grad_norm": 1.7635808268519948, + "learning_rate": 4.726796986074034e-05, + "loss": 1.0238, + "step": 696 + }, + { + "epoch": 1.02745531601253, + "grad_norm": 1.8424304831031615, + "learning_rate": 4.725896574351763e-05, + "loss": 1.1836, + "step": 697 + }, + { + "epoch": 1.0289294269393772, + "grad_norm": 1.9853839862238, + "learning_rate": 4.7249947673494645e-05, + "loss": 1.2393, + "step": 698 + }, + { + "epoch": 1.0304035378662244, + "grad_norm": 1.8546668317457866, + "learning_rate": 4.72409156563243e-05, + "loss": 1.068, + "step": 699 + }, + { + "epoch": 1.0318776487930716, + "grad_norm": 1.9263237427019861, + "learning_rate": 4.7231869697668214e-05, + "loss": 1.2192, + "step": 700 + }, + { + "epoch": 1.0333517597199189, + "grad_norm": 1.8961694929400221, + "learning_rate": 4.722280980319675e-05, + "loss": 1.1651, + "step": 701 + }, + { + "epoch": 1.0348258706467661, + "grad_norm": 1.8958722274758995, + "learning_rate": 4.7213735978589016e-05, + "loss": 1.086, + "step": 702 + }, + { + "epoch": 1.0362999815736134, + "grad_norm": 1.920707968581033, + "learning_rate": 4.720464822953284e-05, + "loss": 1.3001, + "step": 703 + }, + { + "epoch": 1.0377740925004606, + "grad_norm": 2.033207463709648, + "learning_rate": 4.719554656172478e-05, + "loss": 1.3512, + "step": 704 + }, + { + "epoch": 1.0392482034273078, + "grad_norm": 1.893497334359798, + "learning_rate": 4.7186430980870124e-05, + "loss": 1.2213, + "step": 705 + }, + { + "epoch": 1.040722314354155, + "grad_norm": 1.9565062894303316, + "learning_rate": 4.717730149268287e-05, + "loss": 1.115, + "step": 706 + }, + { + "epoch": 1.0421964252810023, + "grad_norm": 1.9193517356463834, + "learning_rate": 4.716815810288575e-05, + "loss": 1.2553, + "step": 707 + }, + { + "epoch": 1.0436705362078496, + "grad_norm": 2.0806191757088546, + "learning_rate": 4.7159000817210205e-05, + "loss": 1.2919, + "step": 708 + }, + { + "epoch": 1.0451446471346968, + "grad_norm": 1.9243055651615135, + "learning_rate": 4.714982964139639e-05, + "loss": 1.1681, + "step": 709 + }, + { + "epoch": 1.046618758061544, + "grad_norm": 2.02960584562886, + "learning_rate": 4.714064458119314e-05, + "loss": 1.1467, + "step": 710 + }, + { + "epoch": 1.0480928689883915, + "grad_norm": 2.3131263503261645, + "learning_rate": 4.713144564235803e-05, + "loss": 1.1478, + "step": 711 + }, + { + "epoch": 1.0495669799152387, + "grad_norm": 1.804795034928628, + "learning_rate": 4.7122232830657315e-05, + "loss": 1.1921, + "step": 712 + }, + { + "epoch": 1.051041090842086, + "grad_norm": 1.7319013120978317, + "learning_rate": 4.7113006151865944e-05, + "loss": 1.1302, + "step": 713 + }, + { + "epoch": 1.0525152017689332, + "grad_norm": 1.7901973539915657, + "learning_rate": 4.710376561176758e-05, + "loss": 1.1353, + "step": 714 + }, + { + "epoch": 1.0539893126957804, + "grad_norm": 1.87534379986065, + "learning_rate": 4.7094511216154546e-05, + "loss": 1.1556, + "step": 715 + }, + { + "epoch": 1.0554634236226277, + "grad_norm": 1.8132493015878601, + "learning_rate": 4.708524297082786e-05, + "loss": 1.1175, + "step": 716 + }, + { + "epoch": 1.056937534549475, + "grad_norm": 1.9548926327386598, + "learning_rate": 4.7075960881597236e-05, + "loss": 1.3925, + "step": 717 + }, + { + "epoch": 1.0584116454763222, + "grad_norm": 1.9201881014553503, + "learning_rate": 4.706666495428105e-05, + "loss": 1.1691, + "step": 718 + }, + { + "epoch": 1.0598857564031694, + "grad_norm": 2.100252899200614, + "learning_rate": 4.705735519470636e-05, + "loss": 1.3896, + "step": 719 + }, + { + "epoch": 1.0613598673300166, + "grad_norm": 1.897210057940877, + "learning_rate": 4.7048031608708876e-05, + "loss": 1.2165, + "step": 720 + }, + { + "epoch": 1.0628339782568639, + "grad_norm": 1.797016702123261, + "learning_rate": 4.703869420213301e-05, + "loss": 1.0157, + "step": 721 + }, + { + "epoch": 1.0643080891837111, + "grad_norm": 1.954950075024271, + "learning_rate": 4.702934298083181e-05, + "loss": 1.0934, + "step": 722 + }, + { + "epoch": 1.0657822001105584, + "grad_norm": 1.8716187825564312, + "learning_rate": 4.701997795066699e-05, + "loss": 1.4161, + "step": 723 + }, + { + "epoch": 1.0672563110374056, + "grad_norm": 1.891808194763231, + "learning_rate": 4.701059911750893e-05, + "loss": 1.1178, + "step": 724 + }, + { + "epoch": 1.0687304219642528, + "grad_norm": 1.7294349079350462, + "learning_rate": 4.7001206487236644e-05, + "loss": 1.0219, + "step": 725 + }, + { + "epoch": 1.0702045328911, + "grad_norm": 1.9835866127922441, + "learning_rate": 4.69918000657378e-05, + "loss": 1.1748, + "step": 726 + }, + { + "epoch": 1.0716786438179473, + "grad_norm": 1.8108946168971511, + "learning_rate": 4.698237985890873e-05, + "loss": 1.2905, + "step": 727 + }, + { + "epoch": 1.0731527547447945, + "grad_norm": 2.110975322483375, + "learning_rate": 4.697294587265438e-05, + "loss": 1.1964, + "step": 728 + }, + { + "epoch": 1.0746268656716418, + "grad_norm": 1.8274964776376994, + "learning_rate": 4.696349811288836e-05, + "loss": 1.0506, + "step": 729 + }, + { + "epoch": 1.076100976598489, + "grad_norm": 1.78960567041504, + "learning_rate": 4.695403658553288e-05, + "loss": 1.3826, + "step": 730 + }, + { + "epoch": 1.0775750875253363, + "grad_norm": 1.7964804838220931, + "learning_rate": 4.6944561296518816e-05, + "loss": 1.0888, + "step": 731 + }, + { + "epoch": 1.0790491984521835, + "grad_norm": 2.012961113271312, + "learning_rate": 4.693507225178564e-05, + "loss": 1.175, + "step": 732 + }, + { + "epoch": 1.0805233093790307, + "grad_norm": 1.7533405300664726, + "learning_rate": 4.692556945728147e-05, + "loss": 1.0948, + "step": 733 + }, + { + "epoch": 1.081997420305878, + "grad_norm": 1.969127679353585, + "learning_rate": 4.691605291896304e-05, + "loss": 1.1976, + "step": 734 + }, + { + "epoch": 1.0834715312327252, + "grad_norm": 1.9246303996197247, + "learning_rate": 4.690652264279567e-05, + "loss": 1.3948, + "step": 735 + }, + { + "epoch": 1.0849456421595725, + "grad_norm": 1.9228475947457793, + "learning_rate": 4.689697863475334e-05, + "loss": 1.209, + "step": 736 + }, + { + "epoch": 1.0864197530864197, + "grad_norm": 1.8442747371545725, + "learning_rate": 4.688742090081859e-05, + "loss": 1.2617, + "step": 737 + }, + { + "epoch": 1.087893864013267, + "grad_norm": 1.7129984248655685, + "learning_rate": 4.68778494469826e-05, + "loss": 1.0915, + "step": 738 + }, + { + "epoch": 1.0893679749401142, + "grad_norm": 1.6886787536184504, + "learning_rate": 4.686826427924514e-05, + "loss": 1.3351, + "step": 739 + }, + { + "epoch": 1.0908420858669614, + "grad_norm": 1.9253643582812818, + "learning_rate": 4.685866540361456e-05, + "loss": 1.2054, + "step": 740 + }, + { + "epoch": 1.0923161967938086, + "grad_norm": 1.833177090258177, + "learning_rate": 4.684905282610781e-05, + "loss": 1.1527, + "step": 741 + }, + { + "epoch": 1.0937903077206559, + "grad_norm": 1.878769962816404, + "learning_rate": 4.6839426552750454e-05, + "loss": 1.2708, + "step": 742 + }, + { + "epoch": 1.0952644186475031, + "grad_norm": 1.9389312748919887, + "learning_rate": 4.6829786589576604e-05, + "loss": 1.1904, + "step": 743 + }, + { + "epoch": 1.0967385295743504, + "grad_norm": 1.7420598141532473, + "learning_rate": 4.6820132942628974e-05, + "loss": 1.186, + "step": 744 + }, + { + "epoch": 1.0982126405011976, + "grad_norm": 1.8555039573139989, + "learning_rate": 4.6810465617958856e-05, + "loss": 1.1122, + "step": 745 + }, + { + "epoch": 1.099686751428045, + "grad_norm": 1.8615491961941468, + "learning_rate": 4.680078462162611e-05, + "loss": 1.2685, + "step": 746 + }, + { + "epoch": 1.1011608623548923, + "grad_norm": 2.0443377894878516, + "learning_rate": 4.679108995969917e-05, + "loss": 1.3216, + "step": 747 + }, + { + "epoch": 1.1026349732817395, + "grad_norm": 1.821300320424321, + "learning_rate": 4.678138163825503e-05, + "loss": 1.2258, + "step": 748 + }, + { + "epoch": 1.1041090842085868, + "grad_norm": 1.9161189029090677, + "learning_rate": 4.677165966337924e-05, + "loss": 1.3147, + "step": 749 + }, + { + "epoch": 1.105583195135434, + "grad_norm": 1.9361667968328764, + "learning_rate": 4.676192404116594e-05, + "loss": 1.1274, + "step": 750 + }, + { + "epoch": 1.105583195135434, + "eval_bleu": 0.04832359604321219, + "eval_bleu_1gram": 0.3489221720041889, + "eval_bleu_2gram": 0.1226059538293485, + "eval_bleu_3gram": 0.048325292345045893, + "eval_bleu_4gram": 0.022483994131201883, + "eval_rag_val_loss": 1.5553608203446994, + "eval_rouge1": 0.3373041450590792, + "eval_rouge2": 0.11489670979442308, + "eval_rougeL": 0.31697945556853585, + "step": 750 + }, + { + "epoch": 1.1070573060622813, + "grad_norm": 2.059973469720756, + "learning_rate": 4.6752174777717786e-05, + "loss": 1.2282, + "step": 751 + }, + { + "epoch": 1.1085314169891285, + "grad_norm": 1.9499485889406722, + "learning_rate": 4.674241187914601e-05, + "loss": 0.99, + "step": 752 + }, + { + "epoch": 1.1100055279159757, + "grad_norm": 1.960007283528088, + "learning_rate": 4.673263535157038e-05, + "loss": 1.1945, + "step": 753 + }, + { + "epoch": 1.111479638842823, + "grad_norm": 2.15022541904847, + "learning_rate": 4.6722845201119214e-05, + "loss": 1.2765, + "step": 754 + }, + { + "epoch": 1.1129537497696702, + "grad_norm": 2.098606228382695, + "learning_rate": 4.671304143392936e-05, + "loss": 1.2809, + "step": 755 + }, + { + "epoch": 1.1144278606965174, + "grad_norm": 2.1353745495124086, + "learning_rate": 4.670322405614621e-05, + "loss": 1.2348, + "step": 756 + }, + { + "epoch": 1.1159019716233647, + "grad_norm": 1.8928509054055485, + "learning_rate": 4.6693393073923686e-05, + "loss": 1.2209, + "step": 757 + }, + { + "epoch": 1.117376082550212, + "grad_norm": 1.8769153984160498, + "learning_rate": 4.6683548493424236e-05, + "loss": 1.1185, + "step": 758 + }, + { + "epoch": 1.1188501934770592, + "grad_norm": 1.861674257580004, + "learning_rate": 4.667369032081883e-05, + "loss": 1.2134, + "step": 759 + }, + { + "epoch": 1.1203243044039064, + "grad_norm": 1.9099952302244056, + "learning_rate": 4.666381856228697e-05, + "loss": 1.2536, + "step": 760 + }, + { + "epoch": 1.1217984153307536, + "grad_norm": 1.9062018466558288, + "learning_rate": 4.665393322401664e-05, + "loss": 1.411, + "step": 761 + }, + { + "epoch": 1.1232725262576009, + "grad_norm": 1.9084961587109184, + "learning_rate": 4.6644034312204387e-05, + "loss": 1.0589, + "step": 762 + }, + { + "epoch": 1.1247466371844481, + "grad_norm": 1.9482587082241267, + "learning_rate": 4.6634121833055235e-05, + "loss": 1.1527, + "step": 763 + }, + { + "epoch": 1.1262207481112954, + "grad_norm": 1.9062931493659574, + "learning_rate": 4.662419579278271e-05, + "loss": 1.2479, + "step": 764 + }, + { + "epoch": 1.1276948590381426, + "grad_norm": 1.9283854304040413, + "learning_rate": 4.6614256197608855e-05, + "loss": 1.2503, + "step": 765 + }, + { + "epoch": 1.1291689699649898, + "grad_norm": 1.9222112842143262, + "learning_rate": 4.660430305376419e-05, + "loss": 1.323, + "step": 766 + }, + { + "epoch": 1.130643080891837, + "grad_norm": 1.8120554674850413, + "learning_rate": 4.659433636748775e-05, + "loss": 1.3234, + "step": 767 + }, + { + "epoch": 1.1321171918186843, + "grad_norm": 1.71110610479528, + "learning_rate": 4.658435614502705e-05, + "loss": 1.0505, + "step": 768 + }, + { + "epoch": 1.1335913027455315, + "grad_norm": 2.0124255670086892, + "learning_rate": 4.657436239263808e-05, + "loss": 1.177, + "step": 769 + }, + { + "epoch": 1.1350654136723788, + "grad_norm": 1.8902943219499422, + "learning_rate": 4.6564355116585325e-05, + "loss": 1.3793, + "step": 770 + }, + { + "epoch": 1.136539524599226, + "grad_norm": 1.7362883727390206, + "learning_rate": 4.655433432314174e-05, + "loss": 1.0214, + "step": 771 + }, + { + "epoch": 1.1380136355260733, + "grad_norm": 1.8006550047472214, + "learning_rate": 4.654430001858874e-05, + "loss": 1.1815, + "step": 772 + }, + { + "epoch": 1.1394877464529205, + "grad_norm": 1.9270018139342167, + "learning_rate": 4.653425220921626e-05, + "loss": 1.3309, + "step": 773 + }, + { + "epoch": 1.1409618573797677, + "grad_norm": 2.1138547447054794, + "learning_rate": 4.6524190901322626e-05, + "loss": 1.2242, + "step": 774 + }, + { + "epoch": 1.142435968306615, + "grad_norm": 2.1271683344980046, + "learning_rate": 4.651411610121469e-05, + "loss": 1.2117, + "step": 775 + }, + { + "epoch": 1.1439100792334624, + "grad_norm": 2.04938114873831, + "learning_rate": 4.650402781520772e-05, + "loss": 1.3106, + "step": 776 + }, + { + "epoch": 1.1453841901603097, + "grad_norm": 1.8633284652773754, + "learning_rate": 4.649392604962546e-05, + "loss": 1.2068, + "step": 777 + }, + { + "epoch": 1.146858301087157, + "grad_norm": 2.0148619161010983, + "learning_rate": 4.648381081080009e-05, + "loss": 1.1792, + "step": 778 + }, + { + "epoch": 1.1483324120140042, + "grad_norm": 1.9219231948392115, + "learning_rate": 4.647368210507225e-05, + "loss": 1.319, + "step": 779 + }, + { + "epoch": 1.1498065229408514, + "grad_norm": 2.0746685912426517, + "learning_rate": 4.6463539938791e-05, + "loss": 1.1628, + "step": 780 + }, + { + "epoch": 1.1512806338676986, + "grad_norm": 1.9249777829758352, + "learning_rate": 4.645338431831388e-05, + "loss": 1.1656, + "step": 781 + }, + { + "epoch": 1.1527547447945459, + "grad_norm": 1.9588541461106448, + "learning_rate": 4.6443215250006806e-05, + "loss": 1.1347, + "step": 782 + }, + { + "epoch": 1.154228855721393, + "grad_norm": 2.213609462522597, + "learning_rate": 4.643303274024416e-05, + "loss": 1.241, + "step": 783 + }, + { + "epoch": 1.1557029666482403, + "grad_norm": 2.24719720686905, + "learning_rate": 4.642283679540874e-05, + "loss": 1.3281, + "step": 784 + }, + { + "epoch": 1.1571770775750876, + "grad_norm": 2.119060968404964, + "learning_rate": 4.641262742189178e-05, + "loss": 1.2635, + "step": 785 + }, + { + "epoch": 1.1586511885019348, + "grad_norm": 2.0360543581258246, + "learning_rate": 4.640240462609291e-05, + "loss": 1.3259, + "step": 786 + }, + { + "epoch": 1.160125299428782, + "grad_norm": 2.0091196992569613, + "learning_rate": 4.639216841442018e-05, + "loss": 1.295, + "step": 787 + }, + { + "epoch": 1.1615994103556293, + "grad_norm": 2.04665589069839, + "learning_rate": 4.6381918793290055e-05, + "loss": 1.4083, + "step": 788 + }, + { + "epoch": 1.1630735212824765, + "grad_norm": 1.9770604889665038, + "learning_rate": 4.6371655769127396e-05, + "loss": 1.3114, + "step": 789 + }, + { + "epoch": 1.1645476322093238, + "grad_norm": 2.0549801230107407, + "learning_rate": 4.63613793483655e-05, + "loss": 1.2201, + "step": 790 + }, + { + "epoch": 1.166021743136171, + "grad_norm": 2.0593650574509255, + "learning_rate": 4.6351089537446e-05, + "loss": 1.3657, + "step": 791 + }, + { + "epoch": 1.1674958540630183, + "grad_norm": 2.0118875080915544, + "learning_rate": 4.6340786342818964e-05, + "loss": 1.3112, + "step": 792 + }, + { + "epoch": 1.1689699649898655, + "grad_norm": 1.852055994896926, + "learning_rate": 4.633046977094286e-05, + "loss": 1.232, + "step": 793 + }, + { + "epoch": 1.1704440759167127, + "grad_norm": 1.8902511227002372, + "learning_rate": 4.632013982828451e-05, + "loss": 1.3732, + "step": 794 + }, + { + "epoch": 1.17191818684356, + "grad_norm": 1.8677073756518854, + "learning_rate": 4.630979652131913e-05, + "loss": 1.1752, + "step": 795 + }, + { + "epoch": 1.1733922977704072, + "grad_norm": 1.7973951830697428, + "learning_rate": 4.629943985653032e-05, + "loss": 1.3248, + "step": 796 + }, + { + "epoch": 1.1748664086972544, + "grad_norm": 1.838999171693998, + "learning_rate": 4.6289069840410036e-05, + "loss": 1.2396, + "step": 797 + }, + { + "epoch": 1.1763405196241017, + "grad_norm": 1.9341370715115471, + "learning_rate": 4.627868647945863e-05, + "loss": 1.0613, + "step": 798 + }, + { + "epoch": 1.177814630550949, + "grad_norm": 1.7367682226413044, + "learning_rate": 4.62682897801848e-05, + "loss": 0.9865, + "step": 799 + }, + { + "epoch": 1.1792887414777962, + "grad_norm": 2.019427593824958, + "learning_rate": 4.625787974910559e-05, + "loss": 1.196, + "step": 800 + }, + { + "epoch": 1.1807628524046434, + "grad_norm": 1.945706614820701, + "learning_rate": 4.6247456392746444e-05, + "loss": 1.2637, + "step": 801 + }, + { + "epoch": 1.1822369633314906, + "grad_norm": 1.7717963872354054, + "learning_rate": 4.623701971764112e-05, + "loss": 1.1619, + "step": 802 + }, + { + "epoch": 1.1837110742583379, + "grad_norm": 1.92765521937285, + "learning_rate": 4.622656973033174e-05, + "loss": 1.0535, + "step": 803 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 1.7733314247168879, + "learning_rate": 4.621610643736878e-05, + "loss": 1.146, + "step": 804 + }, + { + "epoch": 1.1866592961120324, + "grad_norm": 1.7575950657622053, + "learning_rate": 4.620562984531103e-05, + "loss": 1.1746, + "step": 805 + }, + { + "epoch": 1.1881334070388796, + "grad_norm": 1.8443727249860549, + "learning_rate": 4.619513996072564e-05, + "loss": 1.2879, + "step": 806 + }, + { + "epoch": 1.1896075179657268, + "grad_norm": 1.8136864429277426, + "learning_rate": 4.618463679018808e-05, + "loss": 1.3109, + "step": 807 + }, + { + "epoch": 1.191081628892574, + "grad_norm": 1.9022187303511535, + "learning_rate": 4.617412034028217e-05, + "loss": 1.242, + "step": 808 + }, + { + "epoch": 1.1925557398194213, + "grad_norm": 1.7985484416293065, + "learning_rate": 4.616359061760001e-05, + "loss": 1.2317, + "step": 809 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 1.964728168098802, + "learning_rate": 4.6153047628742066e-05, + "loss": 1.1521, + "step": 810 + }, + { + "epoch": 1.1955039616731158, + "grad_norm": 1.9894042675131691, + "learning_rate": 4.61424913803171e-05, + "loss": 1.216, + "step": 811 + }, + { + "epoch": 1.196978072599963, + "grad_norm": 1.9904342416241996, + "learning_rate": 4.613192187894218e-05, + "loss": 1.3258, + "step": 812 + }, + { + "epoch": 1.1984521835268105, + "grad_norm": 2.06252612473097, + "learning_rate": 4.612133913124268e-05, + "loss": 1.2279, + "step": 813 + }, + { + "epoch": 1.1999262944536577, + "grad_norm": 2.0443174968342244, + "learning_rate": 4.61107431438523e-05, + "loss": 1.3923, + "step": 814 + }, + { + "epoch": 1.201400405380505, + "grad_norm": 2.037292647781029, + "learning_rate": 4.610013392341301e-05, + "loss": 1.1695, + "step": 815 + }, + { + "epoch": 1.2028745163073522, + "grad_norm": 1.9747739396914579, + "learning_rate": 4.608951147657511e-05, + "loss": 1.2224, + "step": 816 + }, + { + "epoch": 1.2043486272341994, + "grad_norm": 1.928814648121447, + "learning_rate": 4.607887580999715e-05, + "loss": 1.3066, + "step": 817 + }, + { + "epoch": 1.2058227381610467, + "grad_norm": 1.7628306328987635, + "learning_rate": 4.6068226930345995e-05, + "loss": 1.0662, + "step": 818 + }, + { + "epoch": 1.207296849087894, + "grad_norm": 1.8021481305712592, + "learning_rate": 4.605756484429678e-05, + "loss": 1.2606, + "step": 819 + }, + { + "epoch": 1.2087709600147412, + "grad_norm": 2.047069744135142, + "learning_rate": 4.604688955853293e-05, + "loss": 1.1361, + "step": 820 + }, + { + "epoch": 1.2102450709415884, + "grad_norm": 1.8754182349097253, + "learning_rate": 4.603620107974612e-05, + "loss": 1.1022, + "step": 821 + }, + { + "epoch": 1.2117191818684356, + "grad_norm": 1.8822805832531442, + "learning_rate": 4.602549941463633e-05, + "loss": 1.2389, + "step": 822 + }, + { + "epoch": 1.2131932927952829, + "grad_norm": 1.8911851415713685, + "learning_rate": 4.601478456991178e-05, + "loss": 1.257, + "step": 823 + }, + { + "epoch": 1.2146674037221301, + "grad_norm": 1.8257667785776173, + "learning_rate": 4.6004056552288956e-05, + "loss": 1.0839, + "step": 824 + }, + { + "epoch": 1.2161415146489774, + "grad_norm": 1.9537110937031328, + "learning_rate": 4.5993315368492603e-05, + "loss": 1.0454, + "step": 825 + }, + { + "epoch": 1.2176156255758246, + "grad_norm": 1.7445804600964923, + "learning_rate": 4.5982561025255726e-05, + "loss": 1.2837, + "step": 826 + }, + { + "epoch": 1.2190897365026718, + "grad_norm": 1.9185719211708803, + "learning_rate": 4.5971793529319576e-05, + "loss": 1.159, + "step": 827 + }, + { + "epoch": 1.220563847429519, + "grad_norm": 1.8274022165170416, + "learning_rate": 4.596101288743362e-05, + "loss": 1.2292, + "step": 828 + }, + { + "epoch": 1.2220379583563663, + "grad_norm": 2.229485985057623, + "learning_rate": 4.595021910635563e-05, + "loss": 1.3312, + "step": 829 + }, + { + "epoch": 1.2235120692832135, + "grad_norm": 2.0491974445941397, + "learning_rate": 4.5939412192851535e-05, + "loss": 1.2012, + "step": 830 + }, + { + "epoch": 1.2249861802100608, + "grad_norm": 2.0764561886408117, + "learning_rate": 4.592859215369557e-05, + "loss": 1.2366, + "step": 831 + }, + { + "epoch": 1.226460291136908, + "grad_norm": 2.0749286363578348, + "learning_rate": 4.591775899567015e-05, + "loss": 1.3437, + "step": 832 + }, + { + "epoch": 1.2279344020637553, + "grad_norm": 2.0806212383315326, + "learning_rate": 4.590691272556592e-05, + "loss": 1.0536, + "step": 833 + }, + { + "epoch": 1.2294085129906025, + "grad_norm": 2.138625052997736, + "learning_rate": 4.589605335018176e-05, + "loss": 1.2359, + "step": 834 + }, + { + "epoch": 1.2308826239174497, + "grad_norm": 2.017168267451644, + "learning_rate": 4.588518087632475e-05, + "loss": 1.3051, + "step": 835 + }, + { + "epoch": 1.232356734844297, + "grad_norm": 1.9112500138716264, + "learning_rate": 4.587429531081019e-05, + "loss": 1.1611, + "step": 836 + }, + { + "epoch": 1.2338308457711442, + "grad_norm": 1.9438705922095745, + "learning_rate": 4.5863396660461575e-05, + "loss": 1.2222, + "step": 837 + }, + { + "epoch": 1.2353049566979915, + "grad_norm": 1.9247965742353426, + "learning_rate": 4.585248493211063e-05, + "loss": 1.1865, + "step": 838 + }, + { + "epoch": 1.2367790676248387, + "grad_norm": 1.938434560018029, + "learning_rate": 4.5841560132597244e-05, + "loss": 1.0139, + "step": 839 + }, + { + "epoch": 1.238253178551686, + "grad_norm": 1.9023332223463831, + "learning_rate": 4.583062226876952e-05, + "loss": 1.3509, + "step": 840 + }, + { + "epoch": 1.2397272894785332, + "grad_norm": 1.8888812259755496, + "learning_rate": 4.5819671347483725e-05, + "loss": 1.0203, + "step": 841 + }, + { + "epoch": 1.2412014004053806, + "grad_norm": 2.008767935976047, + "learning_rate": 4.580870737560435e-05, + "loss": 1.3465, + "step": 842 + }, + { + "epoch": 1.2426755113322279, + "grad_norm": 1.771548706367048, + "learning_rate": 4.579773036000405e-05, + "loss": 1.4631, + "step": 843 + }, + { + "epoch": 1.244149622259075, + "grad_norm": 2.1056041780134684, + "learning_rate": 4.5786740307563636e-05, + "loss": 1.1632, + "step": 844 + }, + { + "epoch": 1.2456237331859223, + "grad_norm": 1.7614772318121938, + "learning_rate": 4.577573722517211e-05, + "loss": 1.2406, + "step": 845 + }, + { + "epoch": 1.2470978441127696, + "grad_norm": 1.941060824145296, + "learning_rate": 4.5764721119726653e-05, + "loss": 1.1045, + "step": 846 + }, + { + "epoch": 1.2485719550396168, + "grad_norm": 1.9177119749394835, + "learning_rate": 4.575369199813258e-05, + "loss": 1.3186, + "step": 847 + }, + { + "epoch": 1.250046065966464, + "grad_norm": 1.9702454594120657, + "learning_rate": 4.5742649867303386e-05, + "loss": 1.0918, + "step": 848 + }, + { + "epoch": 1.2515201768933113, + "grad_norm": 2.2033457543989172, + "learning_rate": 4.573159473416072e-05, + "loss": 1.1995, + "step": 849 + }, + { + "epoch": 1.2529942878201585, + "grad_norm": 1.9168505234923359, + "learning_rate": 4.572052660563437e-05, + "loss": 1.2621, + "step": 850 + }, + { + "epoch": 1.2544683987470058, + "grad_norm": 1.8475782446494462, + "learning_rate": 4.570944548866228e-05, + "loss": 1.2181, + "step": 851 + }, + { + "epoch": 1.255942509673853, + "grad_norm": 1.8534563254064578, + "learning_rate": 4.569835139019054e-05, + "loss": 1.2162, + "step": 852 + }, + { + "epoch": 1.2574166206007003, + "grad_norm": 1.688457641142536, + "learning_rate": 4.5687244317173356e-05, + "loss": 1.1438, + "step": 853 + }, + { + "epoch": 1.2588907315275475, + "grad_norm": 1.9298995669822576, + "learning_rate": 4.567612427657308e-05, + "loss": 1.3113, + "step": 854 + }, + { + "epoch": 1.2603648424543947, + "grad_norm": 1.877831482761959, + "learning_rate": 4.566499127536021e-05, + "loss": 1.2634, + "step": 855 + }, + { + "epoch": 1.261838953381242, + "grad_norm": 2.169264287736473, + "learning_rate": 4.565384532051335e-05, + "loss": 1.3957, + "step": 856 + }, + { + "epoch": 1.2633130643080892, + "grad_norm": 1.7663846702556634, + "learning_rate": 4.56426864190192e-05, + "loss": 1.1191, + "step": 857 + }, + { + "epoch": 1.2647871752349364, + "grad_norm": 1.8045734848403157, + "learning_rate": 4.563151457787263e-05, + "loss": 1.4029, + "step": 858 + }, + { + "epoch": 1.2662612861617837, + "grad_norm": 2.0040378341453624, + "learning_rate": 4.562032980407658e-05, + "loss": 1.2542, + "step": 859 + }, + { + "epoch": 1.267735397088631, + "grad_norm": 1.7774041134151946, + "learning_rate": 4.56091321046421e-05, + "loss": 1.0325, + "step": 860 + }, + { + "epoch": 1.2692095080154782, + "grad_norm": 1.9178436922219209, + "learning_rate": 4.5597921486588366e-05, + "loss": 1.2749, + "step": 861 + }, + { + "epoch": 1.2706836189423254, + "grad_norm": 2.0261190313034447, + "learning_rate": 4.558669795694263e-05, + "loss": 1.3292, + "step": 862 + }, + { + "epoch": 1.2721577298691726, + "grad_norm": 1.8865470097253247, + "learning_rate": 4.557546152274025e-05, + "loss": 1.2121, + "step": 863 + }, + { + "epoch": 1.2736318407960199, + "grad_norm": 2.0123974886547016, + "learning_rate": 4.556421219102466e-05, + "loss": 1.1027, + "step": 864 + }, + { + "epoch": 1.2751059517228671, + "grad_norm": 1.8472406996692792, + "learning_rate": 4.555294996884738e-05, + "loss": 1.1813, + "step": 865 + }, + { + "epoch": 1.2765800626497144, + "grad_norm": 1.8678430021136798, + "learning_rate": 4.5541674863268035e-05, + "loss": 1.2206, + "step": 866 + }, + { + "epoch": 1.2780541735765616, + "grad_norm": 1.7360457887016547, + "learning_rate": 4.553038688135429e-05, + "loss": 1.3746, + "step": 867 + }, + { + "epoch": 1.2795282845034088, + "grad_norm": 1.8732700632594796, + "learning_rate": 4.551908603018191e-05, + "loss": 1.1941, + "step": 868 + }, + { + "epoch": 1.281002395430256, + "grad_norm": 1.8819306393813404, + "learning_rate": 4.5507772316834715e-05, + "loss": 1.1475, + "step": 869 + }, + { + "epoch": 1.2824765063571033, + "grad_norm": 1.9574279383114594, + "learning_rate": 4.549644574840458e-05, + "loss": 1.2588, + "step": 870 + }, + { + "epoch": 1.2839506172839505, + "grad_norm": 1.7583345443335727, + "learning_rate": 4.5485106331991446e-05, + "loss": 1.0202, + "step": 871 + }, + { + "epoch": 1.2854247282107978, + "grad_norm": 1.8985904584888604, + "learning_rate": 4.5473754074703324e-05, + "loss": 1.2881, + "step": 872 + }, + { + "epoch": 1.286898839137645, + "grad_norm": 1.9169067425955675, + "learning_rate": 4.546238898365623e-05, + "loss": 1.1355, + "step": 873 + }, + { + "epoch": 1.2883729500644923, + "grad_norm": 1.8695511162894336, + "learning_rate": 4.545101106597428e-05, + "loss": 1.2505, + "step": 874 + }, + { + "epoch": 1.2898470609913395, + "grad_norm": 2.2799967628589872, + "learning_rate": 4.5439620328789593e-05, + "loss": 1.2138, + "step": 875 + }, + { + "epoch": 1.2913211719181867, + "grad_norm": 1.8974143202665794, + "learning_rate": 4.5428216779242336e-05, + "loss": 1.1693, + "step": 876 + }, + { + "epoch": 1.292795282845034, + "grad_norm": 1.8682219381186351, + "learning_rate": 4.541680042448069e-05, + "loss": 1.155, + "step": 877 + }, + { + "epoch": 1.2942693937718812, + "grad_norm": 1.9171227727905555, + "learning_rate": 4.540537127166089e-05, + "loss": 1.1816, + "step": 878 + }, + { + "epoch": 1.2957435046987285, + "grad_norm": 2.1015021446665445, + "learning_rate": 4.5393929327947195e-05, + "loss": 1.1996, + "step": 879 + }, + { + "epoch": 1.2972176156255757, + "grad_norm": 1.8735547535706223, + "learning_rate": 4.538247460051184e-05, + "loss": 1.1889, + "step": 880 + }, + { + "epoch": 1.298691726552423, + "grad_norm": 2.198880916686168, + "learning_rate": 4.537100709653512e-05, + "loss": 1.2868, + "step": 881 + }, + { + "epoch": 1.3001658374792704, + "grad_norm": 1.9357337284066887, + "learning_rate": 4.535952682320531e-05, + "loss": 1.3053, + "step": 882 + }, + { + "epoch": 1.3016399484061176, + "grad_norm": 1.9950794248139392, + "learning_rate": 4.534803378771871e-05, + "loss": 1.1592, + "step": 883 + }, + { + "epoch": 1.3031140593329649, + "grad_norm": 2.0115906313496157, + "learning_rate": 4.53365279972796e-05, + "loss": 1.2114, + "step": 884 + }, + { + "epoch": 1.304588170259812, + "grad_norm": 1.9877671811917321, + "learning_rate": 4.532500945910026e-05, + "loss": 1.3775, + "step": 885 + }, + { + "epoch": 1.3060622811866593, + "grad_norm": 2.00708410209064, + "learning_rate": 4.5313478180400995e-05, + "loss": 1.3223, + "step": 886 + }, + { + "epoch": 1.3075363921135066, + "grad_norm": 1.9213141886921883, + "learning_rate": 4.530193416841003e-05, + "loss": 1.26, + "step": 887 + }, + { + "epoch": 1.3090105030403538, + "grad_norm": 2.202929548968211, + "learning_rate": 4.529037743036362e-05, + "loss": 1.4045, + "step": 888 + }, + { + "epoch": 1.310484613967201, + "grad_norm": 2.1333296711214738, + "learning_rate": 4.5278807973506e-05, + "loss": 1.2379, + "step": 889 + }, + { + "epoch": 1.3119587248940483, + "grad_norm": 1.930311835599059, + "learning_rate": 4.526722580508934e-05, + "loss": 1.1965, + "step": 890 + }, + { + "epoch": 1.3134328358208955, + "grad_norm": 2.031047047966508, + "learning_rate": 4.525563093237383e-05, + "loss": 1.2721, + "step": 891 + }, + { + "epoch": 1.3149069467477428, + "grad_norm": 1.9557458002337205, + "learning_rate": 4.524402336262756e-05, + "loss": 1.188, + "step": 892 + }, + { + "epoch": 1.31638105767459, + "grad_norm": 1.8648089338104161, + "learning_rate": 4.523240310312664e-05, + "loss": 1.2553, + "step": 893 + }, + { + "epoch": 1.3178551686014373, + "grad_norm": 1.9969554616271883, + "learning_rate": 4.522077016115511e-05, + "loss": 1.2092, + "step": 894 + }, + { + "epoch": 1.3193292795282845, + "grad_norm": 2.0407491555642694, + "learning_rate": 4.520912454400494e-05, + "loss": 1.3227, + "step": 895 + }, + { + "epoch": 1.3208033904551317, + "grad_norm": 1.833479644254606, + "learning_rate": 4.519746625897607e-05, + "loss": 1.2718, + "step": 896 + }, + { + "epoch": 1.322277501381979, + "grad_norm": 1.9786800455534768, + "learning_rate": 4.518579531337638e-05, + "loss": 1.4308, + "step": 897 + }, + { + "epoch": 1.3237516123088262, + "grad_norm": 2.0547855484344177, + "learning_rate": 4.5174111714521685e-05, + "loss": 1.212, + "step": 898 + }, + { + "epoch": 1.3252257232356734, + "grad_norm": 1.9233808743084766, + "learning_rate": 4.516241546973571e-05, + "loss": 1.219, + "step": 899 + }, + { + "epoch": 1.3266998341625207, + "grad_norm": 1.7080239349180115, + "learning_rate": 4.515070658635013e-05, + "loss": 1.0944, + "step": 900 + }, + { + "epoch": 1.328173945089368, + "grad_norm": 2.0443339408966077, + "learning_rate": 4.5138985071704546e-05, + "loss": 1.2423, + "step": 901 + }, + { + "epoch": 1.3296480560162152, + "grad_norm": 1.7154840130017959, + "learning_rate": 4.512725093314645e-05, + "loss": 1.1359, + "step": 902 + }, + { + "epoch": 1.3311221669430624, + "grad_norm": 2.2840776033328805, + "learning_rate": 4.5115504178031285e-05, + "loss": 1.4354, + "step": 903 + }, + { + "epoch": 1.3325962778699096, + "grad_norm": 2.1892841147970574, + "learning_rate": 4.5103744813722374e-05, + "loss": 1.294, + "step": 904 + }, + { + "epoch": 1.3340703887967569, + "grad_norm": 1.9591072325437795, + "learning_rate": 4.509197284759094e-05, + "loss": 1.2564, + "step": 905 + }, + { + "epoch": 1.3355444997236043, + "grad_norm": 1.9277819904912872, + "learning_rate": 4.508018828701612e-05, + "loss": 1.1511, + "step": 906 + }, + { + "epoch": 1.3370186106504516, + "grad_norm": 1.898052726025306, + "learning_rate": 4.506839113938496e-05, + "loss": 1.2168, + "step": 907 + }, + { + "epoch": 1.3384927215772988, + "grad_norm": 1.8898282224309781, + "learning_rate": 4.505658141209237e-05, + "loss": 1.209, + "step": 908 + }, + { + "epoch": 1.339966832504146, + "grad_norm": 2.1069520951715197, + "learning_rate": 4.504475911254115e-05, + "loss": 1.3438, + "step": 909 + }, + { + "epoch": 1.3414409434309933, + "grad_norm": 2.0220314354495197, + "learning_rate": 4.503292424814198e-05, + "loss": 1.2104, + "step": 910 + }, + { + "epoch": 1.3429150543578405, + "grad_norm": 2.1640977546767326, + "learning_rate": 4.502107682631343e-05, + "loss": 1.2863, + "step": 911 + }, + { + "epoch": 1.3443891652846878, + "grad_norm": 1.9003034299192694, + "learning_rate": 4.500921685448193e-05, + "loss": 1.2467, + "step": 912 + }, + { + "epoch": 1.345863276211535, + "grad_norm": 1.9945559914380864, + "learning_rate": 4.499734434008178e-05, + "loss": 1.3747, + "step": 913 + }, + { + "epoch": 1.3473373871383822, + "grad_norm": 2.2528308438302878, + "learning_rate": 4.498545929055515e-05, + "loss": 1.2159, + "step": 914 + }, + { + "epoch": 1.3488114980652295, + "grad_norm": 2.368546803800168, + "learning_rate": 4.497356171335204e-05, + "loss": 1.3446, + "step": 915 + }, + { + "epoch": 1.3502856089920767, + "grad_norm": 1.9593938701883025, + "learning_rate": 4.496165161593035e-05, + "loss": 1.3796, + "step": 916 + }, + { + "epoch": 1.351759719918924, + "grad_norm": 1.903468745214265, + "learning_rate": 4.4949729005755765e-05, + "loss": 1.2523, + "step": 917 + }, + { + "epoch": 1.3532338308457712, + "grad_norm": 2.134018330074337, + "learning_rate": 4.493779389030187e-05, + "loss": 1.3163, + "step": 918 + }, + { + "epoch": 1.3547079417726184, + "grad_norm": 1.8943798515939838, + "learning_rate": 4.492584627705008e-05, + "loss": 1.2499, + "step": 919 + }, + { + "epoch": 1.3561820526994657, + "grad_norm": 2.088743572670381, + "learning_rate": 4.491388617348959e-05, + "loss": 1.4958, + "step": 920 + }, + { + "epoch": 1.357656163626313, + "grad_norm": 1.8504505279038082, + "learning_rate": 4.490191358711751e-05, + "loss": 1.2194, + "step": 921 + }, + { + "epoch": 1.3591302745531602, + "grad_norm": 2.0754973355586097, + "learning_rate": 4.488992852543871e-05, + "loss": 1.3853, + "step": 922 + }, + { + "epoch": 1.3606043854800074, + "grad_norm": 1.9777146551093516, + "learning_rate": 4.4877930995965905e-05, + "loss": 1.1802, + "step": 923 + }, + { + "epoch": 1.3620784964068546, + "grad_norm": 1.8829521032472305, + "learning_rate": 4.486592100621961e-05, + "loss": 1.3003, + "step": 924 + }, + { + "epoch": 1.3635526073337019, + "grad_norm": 1.8847925688245744, + "learning_rate": 4.4853898563728184e-05, + "loss": 1.2712, + "step": 925 + }, + { + "epoch": 1.365026718260549, + "grad_norm": 1.8728280521372789, + "learning_rate": 4.484186367602775e-05, + "loss": 1.2012, + "step": 926 + }, + { + "epoch": 1.3665008291873963, + "grad_norm": 1.6873998965489518, + "learning_rate": 4.482981635066227e-05, + "loss": 1.1333, + "step": 927 + }, + { + "epoch": 1.3679749401142436, + "grad_norm": 1.948134371037688, + "learning_rate": 4.481775659518346e-05, + "loss": 1.1469, + "step": 928 + }, + { + "epoch": 1.3694490510410908, + "grad_norm": 1.850175426516826, + "learning_rate": 4.480568441715086e-05, + "loss": 1.2145, + "step": 929 + }, + { + "epoch": 1.370923161967938, + "grad_norm": 2.0001653364505714, + "learning_rate": 4.479359982413181e-05, + "loss": 1.384, + "step": 930 + }, + { + "epoch": 1.3723972728947853, + "grad_norm": 1.9367454659303158, + "learning_rate": 4.478150282370138e-05, + "loss": 1.1829, + "step": 931 + }, + { + "epoch": 1.3738713838216325, + "grad_norm": 1.901426372184927, + "learning_rate": 4.476939342344246e-05, + "loss": 1.1795, + "step": 932 + }, + { + "epoch": 1.3753454947484798, + "grad_norm": 1.7712354390495477, + "learning_rate": 4.475727163094572e-05, + "loss": 1.2497, + "step": 933 + }, + { + "epoch": 1.376819605675327, + "grad_norm": 2.003916719940972, + "learning_rate": 4.474513745380955e-05, + "loss": 1.2073, + "step": 934 + }, + { + "epoch": 1.3782937166021743, + "grad_norm": 1.8773015202071874, + "learning_rate": 4.473299089964015e-05, + "loss": 1.1917, + "step": 935 + }, + { + "epoch": 1.3797678275290215, + "grad_norm": 1.894054738318195, + "learning_rate": 4.472083197605146e-05, + "loss": 1.1564, + "step": 936 + }, + { + "epoch": 1.3812419384558687, + "grad_norm": 1.970212968071443, + "learning_rate": 4.470866069066516e-05, + "loss": 1.3188, + "step": 937 + }, + { + "epoch": 1.382716049382716, + "grad_norm": 1.682057965780056, + "learning_rate": 4.4696477051110705e-05, + "loss": 1.1682, + "step": 938 + }, + { + "epoch": 1.3841901603095632, + "grad_norm": 2.0254033381652596, + "learning_rate": 4.468428106502528e-05, + "loss": 1.1606, + "step": 939 + }, + { + "epoch": 1.3856642712364104, + "grad_norm": 2.0392637190063274, + "learning_rate": 4.4672072740053816e-05, + "loss": 1.2856, + "step": 940 + }, + { + "epoch": 1.3871383821632577, + "grad_norm": 1.9208541888292932, + "learning_rate": 4.4659852083848975e-05, + "loss": 1.1485, + "step": 941 + }, + { + "epoch": 1.388612493090105, + "grad_norm": 2.2490284199562156, + "learning_rate": 4.464761910407113e-05, + "loss": 1.2585, + "step": 942 + }, + { + "epoch": 1.3900866040169522, + "grad_norm": 1.7282960913951813, + "learning_rate": 4.463537380838841e-05, + "loss": 1.1456, + "step": 943 + }, + { + "epoch": 1.3915607149437994, + "grad_norm": 1.7542988565009476, + "learning_rate": 4.462311620447666e-05, + "loss": 1.2246, + "step": 944 + }, + { + "epoch": 1.3930348258706466, + "grad_norm": 1.9156444629470781, + "learning_rate": 4.461084630001942e-05, + "loss": 1.2611, + "step": 945 + }, + { + "epoch": 1.3945089367974939, + "grad_norm": 1.8049839321277235, + "learning_rate": 4.459856410270795e-05, + "loss": 1.2373, + "step": 946 + }, + { + "epoch": 1.3959830477243411, + "grad_norm": 1.9517555013104875, + "learning_rate": 4.4586269620241216e-05, + "loss": 1.1427, + "step": 947 + }, + { + "epoch": 1.3974571586511886, + "grad_norm": 1.7879347932646847, + "learning_rate": 4.457396286032589e-05, + "loss": 1.1145, + "step": 948 + }, + { + "epoch": 1.3989312695780358, + "grad_norm": 1.886347321463787, + "learning_rate": 4.4561643830676336e-05, + "loss": 1.3078, + "step": 949 + }, + { + "epoch": 1.400405380504883, + "grad_norm": 2.0960460997236554, + "learning_rate": 4.454931253901461e-05, + "loss": 1.2028, + "step": 950 + }, + { + "epoch": 1.4018794914317303, + "grad_norm": 2.0038057119866637, + "learning_rate": 4.453696899307045e-05, + "loss": 1.3936, + "step": 951 + }, + { + "epoch": 1.4033536023585775, + "grad_norm": 1.9078422525321308, + "learning_rate": 4.4524613200581284e-05, + "loss": 1.1678, + "step": 952 + }, + { + "epoch": 1.4048277132854248, + "grad_norm": 1.9148368670348805, + "learning_rate": 4.4512245169292206e-05, + "loss": 1.2803, + "step": 953 + }, + { + "epoch": 1.406301824212272, + "grad_norm": 1.8665223792487269, + "learning_rate": 4.449986490695599e-05, + "loss": 1.2145, + "step": 954 + }, + { + "epoch": 1.4077759351391192, + "grad_norm": 1.9907902623538376, + "learning_rate": 4.4487472421333074e-05, + "loss": 1.2263, + "step": 955 + }, + { + "epoch": 1.4092500460659665, + "grad_norm": 2.0034240975450066, + "learning_rate": 4.447506772019155e-05, + "loss": 1.1615, + "step": 956 + }, + { + "epoch": 1.4107241569928137, + "grad_norm": 2.0719946595126815, + "learning_rate": 4.44626508113072e-05, + "loss": 1.4633, + "step": 957 + }, + { + "epoch": 1.412198267919661, + "grad_norm": 1.9689547114435981, + "learning_rate": 4.445022170246341e-05, + "loss": 1.3024, + "step": 958 + }, + { + "epoch": 1.4136723788465082, + "grad_norm": 2.102384385276534, + "learning_rate": 4.443778040145124e-05, + "loss": 1.1678, + "step": 959 + }, + { + "epoch": 1.4151464897733554, + "grad_norm": 1.9068690061021774, + "learning_rate": 4.44253269160694e-05, + "loss": 1.1425, + "step": 960 + }, + { + "epoch": 1.4166206007002027, + "grad_norm": 1.8930244667459024, + "learning_rate": 4.441286125412422e-05, + "loss": 1.1605, + "step": 961 + }, + { + "epoch": 1.41809471162705, + "grad_norm": 2.028689017140227, + "learning_rate": 4.440038342342967e-05, + "loss": 1.2434, + "step": 962 + }, + { + "epoch": 1.4195688225538972, + "grad_norm": 1.911879809181173, + "learning_rate": 4.4387893431807344e-05, + "loss": 1.2782, + "step": 963 + }, + { + "epoch": 1.4210429334807444, + "grad_norm": 1.7739733588014863, + "learning_rate": 4.437539128708647e-05, + "loss": 1.1386, + "step": 964 + }, + { + "epoch": 1.4225170444075916, + "grad_norm": 2.116439405641339, + "learning_rate": 4.4362876997103885e-05, + "loss": 1.2379, + "step": 965 + }, + { + "epoch": 1.4239911553344389, + "grad_norm": 1.8639994582637933, + "learning_rate": 4.4350350569704045e-05, + "loss": 1.2856, + "step": 966 + }, + { + "epoch": 1.4254652662612861, + "grad_norm": 2.031336151643573, + "learning_rate": 4.4337812012738996e-05, + "loss": 1.1877, + "step": 967 + }, + { + "epoch": 1.4269393771881334, + "grad_norm": 2.046614535700179, + "learning_rate": 4.4325261334068426e-05, + "loss": 1.2791, + "step": 968 + }, + { + "epoch": 1.4284134881149806, + "grad_norm": 1.9761423259938107, + "learning_rate": 4.431269854155957e-05, + "loss": 1.1401, + "step": 969 + }, + { + "epoch": 1.4298875990418278, + "grad_norm": 1.893309334828956, + "learning_rate": 4.4300123643087304e-05, + "loss": 1.3055, + "step": 970 + }, + { + "epoch": 1.431361709968675, + "grad_norm": 1.7685626412628297, + "learning_rate": 4.428753664653406e-05, + "loss": 1.3326, + "step": 971 + }, + { + "epoch": 1.4328358208955223, + "grad_norm": 1.8284935212994555, + "learning_rate": 4.427493755978987e-05, + "loss": 1.2479, + "step": 972 + }, + { + "epoch": 1.4343099318223698, + "grad_norm": 2.0735294324535105, + "learning_rate": 4.426232639075234e-05, + "loss": 1.2229, + "step": 973 + }, + { + "epoch": 1.435784042749217, + "grad_norm": 1.8559715473156344, + "learning_rate": 4.424970314732664e-05, + "loss": 1.1806, + "step": 974 + }, + { + "epoch": 1.4372581536760642, + "grad_norm": 1.9585716291957957, + "learning_rate": 4.423706783742554e-05, + "loss": 1.3054, + "step": 975 + }, + { + "epoch": 1.4387322646029115, + "grad_norm": 1.8091937196671901, + "learning_rate": 4.422442046896933e-05, + "loss": 1.1424, + "step": 976 + }, + { + "epoch": 1.4402063755297587, + "grad_norm": 1.7330097633126418, + "learning_rate": 4.421176104988589e-05, + "loss": 1.1876, + "step": 977 + }, + { + "epoch": 1.441680486456606, + "grad_norm": 1.7391180531693347, + "learning_rate": 4.419908958811064e-05, + "loss": 1.1685, + "step": 978 + }, + { + "epoch": 1.4431545973834532, + "grad_norm": 1.7289370955082806, + "learning_rate": 4.418640609158656e-05, + "loss": 1.3844, + "step": 979 + }, + { + "epoch": 1.4446287083103004, + "grad_norm": 1.666172892223675, + "learning_rate": 4.417371056826417e-05, + "loss": 1.2546, + "step": 980 + }, + { + "epoch": 1.4461028192371477, + "grad_norm": 1.9808952880444375, + "learning_rate": 4.4161003026101525e-05, + "loss": 1.1787, + "step": 981 + }, + { + "epoch": 1.447576930163995, + "grad_norm": 2.0488065967574265, + "learning_rate": 4.41482834730642e-05, + "loss": 1.2623, + "step": 982 + }, + { + "epoch": 1.4490510410908422, + "grad_norm": 1.7999039041511262, + "learning_rate": 4.4135551917125334e-05, + "loss": 1.0902, + "step": 983 + }, + { + "epoch": 1.4505251520176894, + "grad_norm": 1.8675776756300442, + "learning_rate": 4.4122808366265556e-05, + "loss": 1.1667, + "step": 984 + }, + { + "epoch": 1.4519992629445366, + "grad_norm": 1.7077469013036894, + "learning_rate": 4.411005282847304e-05, + "loss": 1.2685, + "step": 985 + }, + { + "epoch": 1.4534733738713839, + "grad_norm": 1.7941980739359924, + "learning_rate": 4.409728531174345e-05, + "loss": 1.0802, + "step": 986 + }, + { + "epoch": 1.454947484798231, + "grad_norm": 1.9702490896961755, + "learning_rate": 4.4084505824079975e-05, + "loss": 1.3053, + "step": 987 + }, + { + "epoch": 1.4564215957250783, + "grad_norm": 1.8792361408030633, + "learning_rate": 4.40717143734933e-05, + "loss": 1.1681, + "step": 988 + }, + { + "epoch": 1.4578957066519256, + "grad_norm": 1.7822119808906913, + "learning_rate": 4.405891096800162e-05, + "loss": 1.1079, + "step": 989 + }, + { + "epoch": 1.4593698175787728, + "grad_norm": 1.8977830802839433, + "learning_rate": 4.404609561563062e-05, + "loss": 1.0291, + "step": 990 + }, + { + "epoch": 1.46084392850562, + "grad_norm": 1.9218700998135256, + "learning_rate": 4.403326832441345e-05, + "loss": 1.233, + "step": 991 + }, + { + "epoch": 1.4623180394324673, + "grad_norm": 2.093780517355723, + "learning_rate": 4.402042910239078e-05, + "loss": 1.0966, + "step": 992 + }, + { + "epoch": 1.4637921503593145, + "grad_norm": 1.9256528614260946, + "learning_rate": 4.400757795761074e-05, + "loss": 1.3607, + "step": 993 + }, + { + "epoch": 1.4652662612861618, + "grad_norm": 1.9386013961300534, + "learning_rate": 4.399471489812893e-05, + "loss": 1.2029, + "step": 994 + }, + { + "epoch": 1.466740372213009, + "grad_norm": 2.0512529684413745, + "learning_rate": 4.398183993200843e-05, + "loss": 1.1685, + "step": 995 + }, + { + "epoch": 1.4682144831398563, + "grad_norm": 2.0936053781238435, + "learning_rate": 4.3968953067319777e-05, + "loss": 1.2021, + "step": 996 + }, + { + "epoch": 1.4696885940667035, + "grad_norm": 1.8396869438718744, + "learning_rate": 4.395605431214096e-05, + "loss": 1.3155, + "step": 997 + }, + { + "epoch": 1.4711627049935507, + "grad_norm": 1.9608838199395964, + "learning_rate": 4.394314367455744e-05, + "loss": 1.0922, + "step": 998 + }, + { + "epoch": 1.472636815920398, + "grad_norm": 1.7586355359448314, + "learning_rate": 4.393022116266212e-05, + "loss": 1.2193, + "step": 999 + }, + { + "epoch": 1.4741109268472452, + "grad_norm": 2.2619578180806923, + "learning_rate": 4.3917286784555325e-05, + "loss": 1.4428, + "step": 1000 + }, + { + "epoch": 1.4741109268472452, + "eval_bleu": 0.04710557283526075, + "eval_bleu_1gram": 0.3537460085813653, + "eval_bleu_2gram": 0.11992925427956323, + "eval_bleu_3gram": 0.04622731006726815, + "eval_bleu_4gram": 0.021123928680788814, + "eval_rag_val_loss": 1.5481538125263747, + "eval_rouge1": 0.33311218076912447, + "eval_rouge2": 0.11163593556247434, + "eval_rougeL": 0.3133783621457725, + "step": 1000 + }, + { + "epoch": 1.4755850377740924, + "grad_norm": 1.7852993496691465, + "learning_rate": 4.390434054834483e-05, + "loss": 1.2443, + "step": 1001 + }, + { + "epoch": 1.4770591487009397, + "grad_norm": 1.9408911898702972, + "learning_rate": 4.389138246214588e-05, + "loss": 1.2617, + "step": 1002 + }, + { + "epoch": 1.478533259627787, + "grad_norm": 1.8804633336609262, + "learning_rate": 4.387841253408109e-05, + "loss": 1.2545, + "step": 1003 + }, + { + "epoch": 1.4800073705546342, + "grad_norm": 1.807517800672155, + "learning_rate": 4.386543077228053e-05, + "loss": 1.0795, + "step": 1004 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 1.8226115888805021, + "learning_rate": 4.3852437184881687e-05, + "loss": 1.2368, + "step": 1005 + }, + { + "epoch": 1.4829555924083286, + "grad_norm": 1.7759731499515286, + "learning_rate": 4.383943178002944e-05, + "loss": 1.1635, + "step": 1006 + }, + { + "epoch": 1.4844297033351759, + "grad_norm": 2.0251827298830354, + "learning_rate": 4.382641456587611e-05, + "loss": 1.2174, + "step": 1007 + }, + { + "epoch": 1.4859038142620231, + "grad_norm": 1.8529512372175647, + "learning_rate": 4.38133855505814e-05, + "loss": 1.2467, + "step": 1008 + }, + { + "epoch": 1.4873779251888704, + "grad_norm": 1.8583765635002416, + "learning_rate": 4.3800344742312396e-05, + "loss": 1.2368, + "step": 1009 + }, + { + "epoch": 1.4888520361157176, + "grad_norm": 1.6766595561721727, + "learning_rate": 4.3787292149243605e-05, + "loss": 1.2179, + "step": 1010 + }, + { + "epoch": 1.4903261470425648, + "grad_norm": 1.887057887965862, + "learning_rate": 4.3774227779556906e-05, + "loss": 1.0748, + "step": 1011 + }, + { + "epoch": 1.491800257969412, + "grad_norm": 1.8740406125043485, + "learning_rate": 4.376115164144157e-05, + "loss": 1.2463, + "step": 1012 + }, + { + "epoch": 1.4932743688962593, + "grad_norm": 1.8362573872541625, + "learning_rate": 4.374806374309421e-05, + "loss": 1.0845, + "step": 1013 + }, + { + "epoch": 1.4947484798231068, + "grad_norm": 1.9110518464182196, + "learning_rate": 4.3734964092718885e-05, + "loss": 1.2809, + "step": 1014 + }, + { + "epoch": 1.496222590749954, + "grad_norm": 2.0270872449951485, + "learning_rate": 4.372185269852693e-05, + "loss": 1.3253, + "step": 1015 + }, + { + "epoch": 1.4976967016768012, + "grad_norm": 1.8056851886608907, + "learning_rate": 4.370872956873712e-05, + "loss": 1.2459, + "step": 1016 + }, + { + "epoch": 1.4991708126036485, + "grad_norm": 1.7809119322012972, + "learning_rate": 4.369559471157552e-05, + "loss": 1.2797, + "step": 1017 + }, + { + "epoch": 1.5006449235304957, + "grad_norm": 2.111677368526342, + "learning_rate": 4.36824481352756e-05, + "loss": 1.3684, + "step": 1018 + }, + { + "epoch": 1.502119034457343, + "grad_norm": 1.8136400386358484, + "learning_rate": 4.366928984807815e-05, + "loss": 1.3088, + "step": 1019 + }, + { + "epoch": 1.5035931453841902, + "grad_norm": 1.6579132365865314, + "learning_rate": 4.36561198582313e-05, + "loss": 1.1212, + "step": 1020 + }, + { + "epoch": 1.5050672563110374, + "grad_norm": 1.9304753598663904, + "learning_rate": 4.364293817399052e-05, + "loss": 1.2989, + "step": 1021 + }, + { + "epoch": 1.5065413672378847, + "grad_norm": 1.8308767491666493, + "learning_rate": 4.362974480361862e-05, + "loss": 1.1554, + "step": 1022 + }, + { + "epoch": 1.508015478164732, + "grad_norm": 1.964352980613792, + "learning_rate": 4.361653975538572e-05, + "loss": 1.3509, + "step": 1023 + }, + { + "epoch": 1.5094895890915792, + "grad_norm": 2.082338693331692, + "learning_rate": 4.3603323037569265e-05, + "loss": 1.279, + "step": 1024 + }, + { + "epoch": 1.5109637000184264, + "grad_norm": 1.8059765724276355, + "learning_rate": 4.359009465845402e-05, + "loss": 1.0633, + "step": 1025 + }, + { + "epoch": 1.5124378109452736, + "grad_norm": 1.895122129183627, + "learning_rate": 4.3576854626332055e-05, + "loss": 1.1434, + "step": 1026 + }, + { + "epoch": 1.5139119218721209, + "grad_norm": 1.837701801974174, + "learning_rate": 4.356360294950275e-05, + "loss": 1.2943, + "step": 1027 + }, + { + "epoch": 1.515386032798968, + "grad_norm": 2.002332519309243, + "learning_rate": 4.3550339636272775e-05, + "loss": 1.3521, + "step": 1028 + }, + { + "epoch": 1.5168601437258153, + "grad_norm": 1.7799747653844067, + "learning_rate": 4.35370646949561e-05, + "loss": 1.2239, + "step": 1029 + }, + { + "epoch": 1.5183342546526626, + "grad_norm": 1.8763600820718749, + "learning_rate": 4.352377813387398e-05, + "loss": 1.2009, + "step": 1030 + }, + { + "epoch": 1.5198083655795098, + "grad_norm": 1.873681558551299, + "learning_rate": 4.3510479961354964e-05, + "loss": 1.2526, + "step": 1031 + }, + { + "epoch": 1.521282476506357, + "grad_norm": 1.6504819946277676, + "learning_rate": 4.349717018573487e-05, + "loss": 1.1681, + "step": 1032 + }, + { + "epoch": 1.5227565874332043, + "grad_norm": 1.856955805727659, + "learning_rate": 4.348384881535679e-05, + "loss": 1.2538, + "step": 1033 + }, + { + "epoch": 1.5242306983600515, + "grad_norm": 1.8931735807616588, + "learning_rate": 4.347051585857109e-05, + "loss": 1.3238, + "step": 1034 + }, + { + "epoch": 1.525704809286899, + "grad_norm": 1.830682123555617, + "learning_rate": 4.34571713237354e-05, + "loss": 1.1462, + "step": 1035 + }, + { + "epoch": 1.5271789202137462, + "grad_norm": 1.9411429336017156, + "learning_rate": 4.344381521921458e-05, + "loss": 1.2485, + "step": 1036 + }, + { + "epoch": 1.5286530311405935, + "grad_norm": 1.7587618828367326, + "learning_rate": 4.3430447553380785e-05, + "loss": 1.074, + "step": 1037 + }, + { + "epoch": 1.5301271420674407, + "grad_norm": 1.9504948721678683, + "learning_rate": 4.34170683346134e-05, + "loss": 1.207, + "step": 1038 + }, + { + "epoch": 1.531601252994288, + "grad_norm": 1.8973504867251907, + "learning_rate": 4.3403677571299026e-05, + "loss": 1.4281, + "step": 1039 + }, + { + "epoch": 1.5330753639211352, + "grad_norm": 1.9216689332726449, + "learning_rate": 4.339027527183154e-05, + "loss": 1.1606, + "step": 1040 + }, + { + "epoch": 1.5345494748479824, + "grad_norm": 1.757229788941443, + "learning_rate": 4.337686144461204e-05, + "loss": 1.1878, + "step": 1041 + }, + { + "epoch": 1.5360235857748297, + "grad_norm": 1.802586641385353, + "learning_rate": 4.3363436098048825e-05, + "loss": 1.3005, + "step": 1042 + }, + { + "epoch": 1.537497696701677, + "grad_norm": 1.9067754802909465, + "learning_rate": 4.3349999240557446e-05, + "loss": 1.1313, + "step": 1043 + }, + { + "epoch": 1.5389718076285241, + "grad_norm": 2.126324633455197, + "learning_rate": 4.333655088056065e-05, + "loss": 1.4061, + "step": 1044 + }, + { + "epoch": 1.5404459185553714, + "grad_norm": 1.9076362947139993, + "learning_rate": 4.332309102648841e-05, + "loss": 1.3356, + "step": 1045 + }, + { + "epoch": 1.5419200294822186, + "grad_norm": 1.771936461982236, + "learning_rate": 4.330961968677788e-05, + "loss": 1.1578, + "step": 1046 + }, + { + "epoch": 1.5433941404090659, + "grad_norm": 1.9262885301859323, + "learning_rate": 4.329613686987344e-05, + "loss": 1.3629, + "step": 1047 + }, + { + "epoch": 1.544868251335913, + "grad_norm": 1.8219757994416996, + "learning_rate": 4.328264258422665e-05, + "loss": 1.2104, + "step": 1048 + }, + { + "epoch": 1.5463423622627603, + "grad_norm": 1.9569555941822794, + "learning_rate": 4.3269136838296264e-05, + "loss": 1.345, + "step": 1049 + }, + { + "epoch": 1.5478164731896076, + "grad_norm": 1.819906230334432, + "learning_rate": 4.325561964054822e-05, + "loss": 1.4101, + "step": 1050 + }, + { + "epoch": 1.5492905841164548, + "grad_norm": 2.082157896960155, + "learning_rate": 4.324209099945563e-05, + "loss": 1.1818, + "step": 1051 + }, + { + "epoch": 1.550764695043302, + "grad_norm": 1.86058710164103, + "learning_rate": 4.322855092349878e-05, + "loss": 1.3853, + "step": 1052 + }, + { + "epoch": 1.5522388059701493, + "grad_norm": 1.836146111506626, + "learning_rate": 4.321499942116511e-05, + "loss": 1.1401, + "step": 1053 + }, + { + "epoch": 1.5537129168969965, + "grad_norm": 1.8128654506179072, + "learning_rate": 4.320143650094927e-05, + "loss": 1.2487, + "step": 1054 + }, + { + "epoch": 1.5551870278238438, + "grad_norm": 1.8970593131952311, + "learning_rate": 4.318786217135301e-05, + "loss": 1.3857, + "step": 1055 + }, + { + "epoch": 1.556661138750691, + "grad_norm": 1.9615967998632555, + "learning_rate": 4.3174276440885276e-05, + "loss": 1.185, + "step": 1056 + }, + { + "epoch": 1.5581352496775382, + "grad_norm": 1.7684784508908382, + "learning_rate": 4.316067931806212e-05, + "loss": 1.0542, + "step": 1057 + }, + { + "epoch": 1.5596093606043855, + "grad_norm": 1.9063932646399908, + "learning_rate": 4.3147070811406765e-05, + "loss": 1.2336, + "step": 1058 + }, + { + "epoch": 1.5610834715312327, + "grad_norm": 1.863391417119542, + "learning_rate": 4.313345092944957e-05, + "loss": 1.2063, + "step": 1059 + }, + { + "epoch": 1.56255758245808, + "grad_norm": 1.8144653451300685, + "learning_rate": 4.3119819680728e-05, + "loss": 1.3092, + "step": 1060 + }, + { + "epoch": 1.5640316933849272, + "grad_norm": 1.905964939503467, + "learning_rate": 4.310617707378668e-05, + "loss": 1.2017, + "step": 1061 + }, + { + "epoch": 1.5655058043117744, + "grad_norm": 1.9321008332675937, + "learning_rate": 4.309252311717732e-05, + "loss": 1.2136, + "step": 1062 + }, + { + "epoch": 1.5669799152386217, + "grad_norm": 2.0171368274189496, + "learning_rate": 4.307885781945876e-05, + "loss": 1.2324, + "step": 1063 + }, + { + "epoch": 1.568454026165469, + "grad_norm": 1.8587233739919908, + "learning_rate": 4.3065181189196956e-05, + "loss": 1.3017, + "step": 1064 + }, + { + "epoch": 1.5699281370923162, + "grad_norm": 1.9124230662311437, + "learning_rate": 4.305149323496497e-05, + "loss": 1.3507, + "step": 1065 + }, + { + "epoch": 1.5714022480191634, + "grad_norm": 1.8777700942570117, + "learning_rate": 4.303779396534293e-05, + "loss": 1.2357, + "step": 1066 + }, + { + "epoch": 1.5728763589460106, + "grad_norm": 1.7811982665998625, + "learning_rate": 4.30240833889181e-05, + "loss": 1.2835, + "step": 1067 + }, + { + "epoch": 1.5743504698728579, + "grad_norm": 1.9165017291986042, + "learning_rate": 4.30103615142848e-05, + "loss": 1.258, + "step": 1068 + }, + { + "epoch": 1.575824580799705, + "grad_norm": 1.7652063464180527, + "learning_rate": 4.2996628350044454e-05, + "loss": 1.2886, + "step": 1069 + }, + { + "epoch": 1.5772986917265523, + "grad_norm": 1.8500412755949607, + "learning_rate": 4.298288390480554e-05, + "loss": 1.1388, + "step": 1070 + }, + { + "epoch": 1.5787728026533996, + "grad_norm": 1.9184986011936476, + "learning_rate": 4.296912818718363e-05, + "loss": 1.1086, + "step": 1071 + }, + { + "epoch": 1.5802469135802468, + "grad_norm": 1.915202645916812, + "learning_rate": 4.295536120580135e-05, + "loss": 1.3577, + "step": 1072 + }, + { + "epoch": 1.581721024507094, + "grad_norm": 1.985155687928824, + "learning_rate": 4.2941582969288384e-05, + "loss": 1.3213, + "step": 1073 + }, + { + "epoch": 1.5831951354339413, + "grad_norm": 1.9104833030380386, + "learning_rate": 4.292779348628148e-05, + "loss": 1.3851, + "step": 1074 + }, + { + "epoch": 1.5846692463607885, + "grad_norm": 1.91145109197785, + "learning_rate": 4.2913992765424434e-05, + "loss": 1.2434, + "step": 1075 + }, + { + "epoch": 1.5861433572876358, + "grad_norm": 1.730167405733206, + "learning_rate": 4.2900180815368076e-05, + "loss": 1.2735, + "step": 1076 + }, + { + "epoch": 1.587617468214483, + "grad_norm": 1.8422407665106706, + "learning_rate": 4.2886357644770294e-05, + "loss": 1.213, + "step": 1077 + }, + { + "epoch": 1.5890915791413303, + "grad_norm": 2.0168250008631006, + "learning_rate": 4.287252326229598e-05, + "loss": 1.1869, + "step": 1078 + }, + { + "epoch": 1.5905656900681775, + "grad_norm": 1.8473379494054016, + "learning_rate": 4.285867767661709e-05, + "loss": 1.219, + "step": 1079 + }, + { + "epoch": 1.5920398009950247, + "grad_norm": 1.7956929838679612, + "learning_rate": 4.284482089641257e-05, + "loss": 1.1474, + "step": 1080 + }, + { + "epoch": 1.593513911921872, + "grad_norm": 1.8855936870054948, + "learning_rate": 4.283095293036842e-05, + "loss": 1.1987, + "step": 1081 + }, + { + "epoch": 1.5949880228487192, + "grad_norm": 1.7377318467712326, + "learning_rate": 4.281707378717761e-05, + "loss": 1.2977, + "step": 1082 + }, + { + "epoch": 1.5964621337755664, + "grad_norm": 1.941549253665688, + "learning_rate": 4.280318347554013e-05, + "loss": 1.3362, + "step": 1083 + }, + { + "epoch": 1.597936244702414, + "grad_norm": 1.79061648914385, + "learning_rate": 4.2789282004163e-05, + "loss": 1.2801, + "step": 1084 + }, + { + "epoch": 1.5994103556292611, + "grad_norm": 2.1294220185862045, + "learning_rate": 4.27753693817602e-05, + "loss": 1.1791, + "step": 1085 + }, + { + "epoch": 1.6008844665561084, + "grad_norm": 1.789910261366166, + "learning_rate": 4.276144561705271e-05, + "loss": 1.0298, + "step": 1086 + }, + { + "epoch": 1.6023585774829556, + "grad_norm": 1.83935987584918, + "learning_rate": 4.27475107187685e-05, + "loss": 1.1299, + "step": 1087 + }, + { + "epoch": 1.6038326884098029, + "grad_norm": 1.8073147232681004, + "learning_rate": 4.273356469564251e-05, + "loss": 1.2355, + "step": 1088 + }, + { + "epoch": 1.60530679933665, + "grad_norm": 1.9020461333342862, + "learning_rate": 4.271960755641668e-05, + "loss": 1.14, + "step": 1089 + }, + { + "epoch": 1.6067809102634973, + "grad_norm": 1.90104953990682, + "learning_rate": 4.270563930983986e-05, + "loss": 1.2043, + "step": 1090 + }, + { + "epoch": 1.6082550211903446, + "grad_norm": 1.8690935565439237, + "learning_rate": 4.269165996466793e-05, + "loss": 1.3084, + "step": 1091 + }, + { + "epoch": 1.6097291321171918, + "grad_norm": 1.8732310533714696, + "learning_rate": 4.267766952966369e-05, + "loss": 1.2108, + "step": 1092 + }, + { + "epoch": 1.611203243044039, + "grad_norm": 1.8326481347884551, + "learning_rate": 4.266366801359689e-05, + "loss": 1.2946, + "step": 1093 + }, + { + "epoch": 1.6126773539708863, + "grad_norm": 1.7119660791218507, + "learning_rate": 4.264965542524424e-05, + "loss": 0.9716, + "step": 1094 + }, + { + "epoch": 1.6141514648977335, + "grad_norm": 2.006191564166435, + "learning_rate": 4.263563177338938e-05, + "loss": 1.2558, + "step": 1095 + }, + { + "epoch": 1.6156255758245808, + "grad_norm": 1.756098271667975, + "learning_rate": 4.262159706682291e-05, + "loss": 1.2415, + "step": 1096 + }, + { + "epoch": 1.617099686751428, + "grad_norm": 1.8145735488201515, + "learning_rate": 4.2607551314342297e-05, + "loss": 1.196, + "step": 1097 + }, + { + "epoch": 1.6185737976782753, + "grad_norm": 1.9683976236492993, + "learning_rate": 4.259349452475202e-05, + "loss": 1.1848, + "step": 1098 + }, + { + "epoch": 1.6200479086051225, + "grad_norm": 1.9825803195417955, + "learning_rate": 4.25794267068634e-05, + "loss": 1.2926, + "step": 1099 + }, + { + "epoch": 1.6215220195319697, + "grad_norm": 2.0098460545586327, + "learning_rate": 4.256534786949472e-05, + "loss": 1.3416, + "step": 1100 + }, + { + "epoch": 1.6229961304588172, + "grad_norm": 2.1685209653552024, + "learning_rate": 4.255125802147114e-05, + "loss": 1.3728, + "step": 1101 + }, + { + "epoch": 1.6244702413856644, + "grad_norm": 1.938307686086913, + "learning_rate": 4.253715717162474e-05, + "loss": 1.1128, + "step": 1102 + }, + { + "epoch": 1.6259443523125117, + "grad_norm": 1.8195898889287117, + "learning_rate": 4.252304532879449e-05, + "loss": 1.2526, + "step": 1103 + }, + { + "epoch": 1.627418463239359, + "grad_norm": 2.042627017905458, + "learning_rate": 4.2508922501826244e-05, + "loss": 1.2557, + "step": 1104 + }, + { + "epoch": 1.6288925741662061, + "grad_norm": 1.9459736018571157, + "learning_rate": 4.249478869957276e-05, + "loss": 1.2338, + "step": 1105 + }, + { + "epoch": 1.6303666850930534, + "grad_norm": 2.0336472670217516, + "learning_rate": 4.248064393089366e-05, + "loss": 1.2837, + "step": 1106 + }, + { + "epoch": 1.6318407960199006, + "grad_norm": 1.8672951383968779, + "learning_rate": 4.246648820465544e-05, + "loss": 1.2686, + "step": 1107 + }, + { + "epoch": 1.6333149069467479, + "grad_norm": 1.9262759673926062, + "learning_rate": 4.2452321529731475e-05, + "loss": 1.1191, + "step": 1108 + }, + { + "epoch": 1.634789017873595, + "grad_norm": 1.9928605681710756, + "learning_rate": 4.2438143915002e-05, + "loss": 1.2318, + "step": 1109 + }, + { + "epoch": 1.6362631288004423, + "grad_norm": 1.7655129186113852, + "learning_rate": 4.242395536935409e-05, + "loss": 1.387, + "step": 1110 + }, + { + "epoch": 1.6377372397272896, + "grad_norm": 2.0112439234080486, + "learning_rate": 4.2409755901681716e-05, + "loss": 1.3361, + "step": 1111 + }, + { + "epoch": 1.6392113506541368, + "grad_norm": 1.787524299189659, + "learning_rate": 4.239554552088563e-05, + "loss": 1.2788, + "step": 1112 + }, + { + "epoch": 1.640685461580984, + "grad_norm": 1.8300088810314366, + "learning_rate": 4.238132423587349e-05, + "loss": 1.3415, + "step": 1113 + }, + { + "epoch": 1.6421595725078313, + "grad_norm": 1.9643185711874585, + "learning_rate": 4.236709205555973e-05, + "loss": 1.2452, + "step": 1114 + }, + { + "epoch": 1.6436336834346785, + "grad_norm": 1.8802687053198865, + "learning_rate": 4.235284898886568e-05, + "loss": 1.4005, + "step": 1115 + }, + { + "epoch": 1.6451077943615258, + "grad_norm": 1.8477788323996884, + "learning_rate": 4.233859504471943e-05, + "loss": 1.1429, + "step": 1116 + }, + { + "epoch": 1.646581905288373, + "grad_norm": 1.964663548482063, + "learning_rate": 4.2324330232055924e-05, + "loss": 1.1998, + "step": 1117 + }, + { + "epoch": 1.6480560162152202, + "grad_norm": 1.9047670067707487, + "learning_rate": 4.231005455981692e-05, + "loss": 1.1231, + "step": 1118 + }, + { + "epoch": 1.6495301271420675, + "grad_norm": 1.845622905622793, + "learning_rate": 4.2295768036950953e-05, + "loss": 1.2158, + "step": 1119 + }, + { + "epoch": 1.6510042380689147, + "grad_norm": 1.7227520213244676, + "learning_rate": 4.22814706724134e-05, + "loss": 1.226, + "step": 1120 + }, + { + "epoch": 1.652478348995762, + "grad_norm": 1.9877921291603484, + "learning_rate": 4.226716247516641e-05, + "loss": 1.2949, + "step": 1121 + }, + { + "epoch": 1.6539524599226092, + "grad_norm": 1.810466678164202, + "learning_rate": 4.2252843454178925e-05, + "loss": 1.3307, + "step": 1122 + }, + { + "epoch": 1.6554265708494564, + "grad_norm": 1.7627726107315673, + "learning_rate": 4.223851361842668e-05, + "loss": 1.1535, + "step": 1123 + }, + { + "epoch": 1.6569006817763037, + "grad_norm": 1.8907676832095228, + "learning_rate": 4.222417297689217e-05, + "loss": 1.2935, + "step": 1124 + }, + { + "epoch": 1.658374792703151, + "grad_norm": 1.7970703018863439, + "learning_rate": 4.2209821538564684e-05, + "loss": 1.2032, + "step": 1125 + }, + { + "epoch": 1.6598489036299982, + "grad_norm": 1.8964828664064546, + "learning_rate": 4.219545931244027e-05, + "loss": 1.3649, + "step": 1126 + }, + { + "epoch": 1.6613230145568454, + "grad_norm": 1.824122685715049, + "learning_rate": 4.218108630752174e-05, + "loss": 1.4419, + "step": 1127 + }, + { + "epoch": 1.6627971254836926, + "grad_norm": 1.912200894371983, + "learning_rate": 4.2166702532818665e-05, + "loss": 1.297, + "step": 1128 + }, + { + "epoch": 1.6642712364105399, + "grad_norm": 1.9770491532431862, + "learning_rate": 4.2152307997347365e-05, + "loss": 1.2846, + "step": 1129 + }, + { + "epoch": 1.665745347337387, + "grad_norm": 1.9258810330861753, + "learning_rate": 4.213790271013089e-05, + "loss": 1.2353, + "step": 1130 + }, + { + "epoch": 1.6672194582642343, + "grad_norm": 2.0050289585752474, + "learning_rate": 4.212348668019906e-05, + "loss": 1.195, + "step": 1131 + }, + { + "epoch": 1.6686935691910816, + "grad_norm": 1.830339313380367, + "learning_rate": 4.2109059916588414e-05, + "loss": 1.0695, + "step": 1132 + }, + { + "epoch": 1.6701676801179288, + "grad_norm": 1.833505001122671, + "learning_rate": 4.20946224283422e-05, + "loss": 1.0125, + "step": 1133 + }, + { + "epoch": 1.671641791044776, + "grad_norm": 1.9804112051705174, + "learning_rate": 4.2080174224510426e-05, + "loss": 1.324, + "step": 1134 + }, + { + "epoch": 1.6731159019716233, + "grad_norm": 1.710219628914054, + "learning_rate": 4.2065715314149775e-05, + "loss": 1.0566, + "step": 1135 + }, + { + "epoch": 1.6745900128984705, + "grad_norm": 1.64057282864407, + "learning_rate": 4.2051245706323696e-05, + "loss": 1.1222, + "step": 1136 + }, + { + "epoch": 1.6760641238253178, + "grad_norm": 1.9637349149147647, + "learning_rate": 4.2036765410102285e-05, + "loss": 1.0906, + "step": 1137 + }, + { + "epoch": 1.677538234752165, + "grad_norm": 1.8240049837320405, + "learning_rate": 4.202227443456238e-05, + "loss": 1.2323, + "step": 1138 + }, + { + "epoch": 1.6790123456790123, + "grad_norm": 1.9480210410593393, + "learning_rate": 4.200777278878749e-05, + "loss": 1.2079, + "step": 1139 + }, + { + "epoch": 1.6804864566058595, + "grad_norm": 1.9074388142743126, + "learning_rate": 4.199326048186782e-05, + "loss": 1.2942, + "step": 1140 + }, + { + "epoch": 1.6819605675327067, + "grad_norm": 1.8919630753904124, + "learning_rate": 4.197873752290027e-05, + "loss": 1.2149, + "step": 1141 + }, + { + "epoch": 1.683434678459554, + "grad_norm": 1.7600493715687344, + "learning_rate": 4.1964203920988385e-05, + "loss": 1.0952, + "step": 1142 + }, + { + "epoch": 1.6849087893864012, + "grad_norm": 1.9261659930388288, + "learning_rate": 4.19496596852424e-05, + "loss": 1.3109, + "step": 1143 + }, + { + "epoch": 1.6863829003132484, + "grad_norm": 1.8768810373489273, + "learning_rate": 4.1935104824779246e-05, + "loss": 1.294, + "step": 1144 + }, + { + "epoch": 1.6878570112400957, + "grad_norm": 1.9677935502228958, + "learning_rate": 4.192053934872247e-05, + "loss": 1.2749, + "step": 1145 + }, + { + "epoch": 1.689331122166943, + "grad_norm": 2.037120142439185, + "learning_rate": 4.1905963266202276e-05, + "loss": 1.4639, + "step": 1146 + }, + { + "epoch": 1.6908052330937902, + "grad_norm": 1.861661642962001, + "learning_rate": 4.189137658635555e-05, + "loss": 1.3886, + "step": 1147 + }, + { + "epoch": 1.6922793440206374, + "grad_norm": 2.3855936798206425, + "learning_rate": 4.187677931832578e-05, + "loss": 1.3504, + "step": 1148 + }, + { + "epoch": 1.6937534549474846, + "grad_norm": 2.1097919193374524, + "learning_rate": 4.1862171471263126e-05, + "loss": 1.3451, + "step": 1149 + }, + { + "epoch": 1.695227565874332, + "grad_norm": 1.9216781143195996, + "learning_rate": 4.184755305432436e-05, + "loss": 1.3488, + "step": 1150 + }, + { + "epoch": 1.6967016768011793, + "grad_norm": 1.9479156602935095, + "learning_rate": 4.1832924076672876e-05, + "loss": 1.2722, + "step": 1151 + }, + { + "epoch": 1.6981757877280266, + "grad_norm": 1.8703568506678965, + "learning_rate": 4.181828454747872e-05, + "loss": 1.2535, + "step": 1152 + }, + { + "epoch": 1.6996498986548738, + "grad_norm": 1.873169386817977, + "learning_rate": 4.180363447591849e-05, + "loss": 1.4875, + "step": 1153 + }, + { + "epoch": 1.701124009581721, + "grad_norm": 1.9670314857239748, + "learning_rate": 4.178897387117546e-05, + "loss": 1.4447, + "step": 1154 + }, + { + "epoch": 1.7025981205085683, + "grad_norm": 1.727028831232438, + "learning_rate": 4.177430274243947e-05, + "loss": 1.2714, + "step": 1155 + }, + { + "epoch": 1.7040722314354155, + "grad_norm": 1.7399758297787102, + "learning_rate": 4.175962109890696e-05, + "loss": 1.2887, + "step": 1156 + }, + { + "epoch": 1.7055463423622628, + "grad_norm": 1.9946159969364994, + "learning_rate": 4.1744928949780975e-05, + "loss": 1.332, + "step": 1157 + }, + { + "epoch": 1.70702045328911, + "grad_norm": 1.8748412382940545, + "learning_rate": 4.173022630427113e-05, + "loss": 1.1747, + "step": 1158 + }, + { + "epoch": 1.7084945642159572, + "grad_norm": 1.801780307701713, + "learning_rate": 4.1715513171593614e-05, + "loss": 1.3079, + "step": 1159 + }, + { + "epoch": 1.7099686751428045, + "grad_norm": 1.8248668230141263, + "learning_rate": 4.170078956097121e-05, + "loss": 1.3446, + "step": 1160 + }, + { + "epoch": 1.7114427860696517, + "grad_norm": 1.8900217402181687, + "learning_rate": 4.168605548163326e-05, + "loss": 1.2939, + "step": 1161 + }, + { + "epoch": 1.712916896996499, + "grad_norm": 1.7064373458461526, + "learning_rate": 4.167131094281565e-05, + "loss": 1.0821, + "step": 1162 + }, + { + "epoch": 1.7143910079233462, + "grad_norm": 1.7611034189905972, + "learning_rate": 4.165655595376088e-05, + "loss": 1.3065, + "step": 1163 + }, + { + "epoch": 1.7158651188501934, + "grad_norm": 1.805938353266847, + "learning_rate": 4.1641790523717935e-05, + "loss": 1.1311, + "step": 1164 + }, + { + "epoch": 1.7173392297770407, + "grad_norm": 1.9556661935175772, + "learning_rate": 4.162701466194237e-05, + "loss": 1.3561, + "step": 1165 + }, + { + "epoch": 1.718813340703888, + "grad_norm": 1.7340302683202522, + "learning_rate": 4.161222837769627e-05, + "loss": 1.0482, + "step": 1166 + }, + { + "epoch": 1.7202874516307354, + "grad_norm": 1.8453258956435885, + "learning_rate": 4.159743168024829e-05, + "loss": 1.337, + "step": 1167 + }, + { + "epoch": 1.7217615625575826, + "grad_norm": 1.8831672809056383, + "learning_rate": 4.158262457887356e-05, + "loss": 1.1831, + "step": 1168 + }, + { + "epoch": 1.7232356734844299, + "grad_norm": 1.9917243568758662, + "learning_rate": 4.156780708285378e-05, + "loss": 1.2676, + "step": 1169 + }, + { + "epoch": 1.724709784411277, + "grad_norm": 1.8132331286236758, + "learning_rate": 4.155297920147713e-05, + "loss": 1.1476, + "step": 1170 + }, + { + "epoch": 1.7261838953381243, + "grad_norm": 1.8651559543601395, + "learning_rate": 4.153814094403831e-05, + "loss": 1.2898, + "step": 1171 + }, + { + "epoch": 1.7276580062649716, + "grad_norm": 1.9299369372802475, + "learning_rate": 4.1523292319838524e-05, + "loss": 1.0423, + "step": 1172 + }, + { + "epoch": 1.7291321171918188, + "grad_norm": 2.0821933933697365, + "learning_rate": 4.150843333818549e-05, + "loss": 1.3333, + "step": 1173 + }, + { + "epoch": 1.730606228118666, + "grad_norm": 1.7358736316595578, + "learning_rate": 4.149356400839339e-05, + "loss": 1.047, + "step": 1174 + }, + { + "epoch": 1.7320803390455133, + "grad_norm": 2.31799046088717, + "learning_rate": 4.1478684339782926e-05, + "loss": 1.3923, + "step": 1175 + }, + { + "epoch": 1.7335544499723605, + "grad_norm": 1.8034789427227818, + "learning_rate": 4.1463794341681244e-05, + "loss": 1.2442, + "step": 1176 + }, + { + "epoch": 1.7350285608992078, + "grad_norm": 1.9622704584282626, + "learning_rate": 4.1448894023422005e-05, + "loss": 1.2256, + "step": 1177 + }, + { + "epoch": 1.736502671826055, + "grad_norm": 2.0212032988843904, + "learning_rate": 4.143398339434529e-05, + "loss": 1.3743, + "step": 1178 + }, + { + "epoch": 1.7379767827529022, + "grad_norm": 1.9532401089126716, + "learning_rate": 4.1419062463797695e-05, + "loss": 1.2189, + "step": 1179 + }, + { + "epoch": 1.7394508936797495, + "grad_norm": 1.9884353308191478, + "learning_rate": 4.140413124113225e-05, + "loss": 1.4263, + "step": 1180 + }, + { + "epoch": 1.7409250046065967, + "grad_norm": 2.0396914626524727, + "learning_rate": 4.138918973570842e-05, + "loss": 1.1713, + "step": 1181 + }, + { + "epoch": 1.742399115533444, + "grad_norm": 1.89297005735597, + "learning_rate": 4.1374237956892133e-05, + "loss": 1.2778, + "step": 1182 + }, + { + "epoch": 1.7438732264602912, + "grad_norm": 2.0251309293640483, + "learning_rate": 4.135927591405577e-05, + "loss": 1.295, + "step": 1183 + }, + { + "epoch": 1.7453473373871384, + "grad_norm": 2.0096657360228924, + "learning_rate": 4.134430361657813e-05, + "loss": 1.1209, + "step": 1184 + }, + { + "epoch": 1.7468214483139857, + "grad_norm": 1.9954076733814325, + "learning_rate": 4.1329321073844415e-05, + "loss": 1.201, + "step": 1185 + }, + { + "epoch": 1.748295559240833, + "grad_norm": 2.01562937654715, + "learning_rate": 4.131432829524631e-05, + "loss": 1.1608, + "step": 1186 + }, + { + "epoch": 1.7497696701676801, + "grad_norm": 2.01413476546496, + "learning_rate": 4.129932529018187e-05, + "loss": 1.2642, + "step": 1187 + }, + { + "epoch": 1.7512437810945274, + "grad_norm": 2.1457401613420806, + "learning_rate": 4.128431206805557e-05, + "loss": 1.433, + "step": 1188 + }, + { + "epoch": 1.7527178920213746, + "grad_norm": 1.8965854480340019, + "learning_rate": 4.126928863827827e-05, + "loss": 1.2827, + "step": 1189 + }, + { + "epoch": 1.7541920029482219, + "grad_norm": 1.752547861661921, + "learning_rate": 4.1254255010267285e-05, + "loss": 1.284, + "step": 1190 + }, + { + "epoch": 1.755666113875069, + "grad_norm": 1.8389301988175208, + "learning_rate": 4.123921119344627e-05, + "loss": 1.2685, + "step": 1191 + }, + { + "epoch": 1.7571402248019163, + "grad_norm": 1.8339557097025971, + "learning_rate": 4.122415719724528e-05, + "loss": 1.4762, + "step": 1192 + }, + { + "epoch": 1.7586143357287636, + "grad_norm": 1.865088715782675, + "learning_rate": 4.120909303110078e-05, + "loss": 1.2256, + "step": 1193 + }, + { + "epoch": 1.7600884466556108, + "grad_norm": 1.6230941012749778, + "learning_rate": 4.119401870445555e-05, + "loss": 1.1583, + "step": 1194 + }, + { + "epoch": 1.761562557582458, + "grad_norm": 1.9337570583853347, + "learning_rate": 4.1178934226758803e-05, + "loss": 1.1924, + "step": 1195 + }, + { + "epoch": 1.7630366685093053, + "grad_norm": 1.8186642629858298, + "learning_rate": 4.1163839607466084e-05, + "loss": 1.1415, + "step": 1196 + }, + { + "epoch": 1.7645107794361525, + "grad_norm": 1.7903219395963126, + "learning_rate": 4.114873485603927e-05, + "loss": 1.2286, + "step": 1197 + }, + { + "epoch": 1.7659848903629998, + "grad_norm": 1.6935997674622014, + "learning_rate": 4.113361998194665e-05, + "loss": 1.1085, + "step": 1198 + }, + { + "epoch": 1.767459001289847, + "grad_norm": 1.9573374982593792, + "learning_rate": 4.111849499466281e-05, + "loss": 1.4939, + "step": 1199 + }, + { + "epoch": 1.7689331122166942, + "grad_norm": 1.7747274404452258, + "learning_rate": 4.110335990366868e-05, + "loss": 1.1663, + "step": 1200 + }, + { + "epoch": 1.7704072231435415, + "grad_norm": 1.739624190469356, + "learning_rate": 4.108821471845155e-05, + "loss": 1.2981, + "step": 1201 + }, + { + "epoch": 1.7718813340703887, + "grad_norm": 1.8854769772601188, + "learning_rate": 4.107305944850502e-05, + "loss": 1.3968, + "step": 1202 + }, + { + "epoch": 1.773355444997236, + "grad_norm": 1.7286527934848495, + "learning_rate": 4.105789410332901e-05, + "loss": 1.0797, + "step": 1203 + }, + { + "epoch": 1.7748295559240832, + "grad_norm": 1.7856795798062597, + "learning_rate": 4.104271869242975e-05, + "loss": 1.1614, + "step": 1204 + }, + { + "epoch": 1.7763036668509304, + "grad_norm": 1.7031263998883295, + "learning_rate": 4.10275332253198e-05, + "loss": 1.1268, + "step": 1205 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 1.7717712910221877, + "learning_rate": 4.1012337711518e-05, + "loss": 1.2461, + "step": 1206 + }, + { + "epoch": 1.779251888704625, + "grad_norm": 1.9373306692638488, + "learning_rate": 4.099713216054952e-05, + "loss": 1.3029, + "step": 1207 + }, + { + "epoch": 1.7807259996314722, + "grad_norm": 1.8833718636321377, + "learning_rate": 4.098191658194578e-05, + "loss": 1.3794, + "step": 1208 + }, + { + "epoch": 1.7822001105583194, + "grad_norm": 1.921731214649604, + "learning_rate": 4.096669098524451e-05, + "loss": 1.2974, + "step": 1209 + }, + { + "epoch": 1.7836742214851666, + "grad_norm": 1.6931901295000011, + "learning_rate": 4.095145537998972e-05, + "loss": 1.0482, + "step": 1210 + }, + { + "epoch": 1.7851483324120139, + "grad_norm": 1.7316180474900313, + "learning_rate": 4.0936209775731686e-05, + "loss": 1.1968, + "step": 1211 + }, + { + "epoch": 1.786622443338861, + "grad_norm": 2.0403582084538323, + "learning_rate": 4.0920954182026965e-05, + "loss": 1.3086, + "step": 1212 + }, + { + "epoch": 1.7880965542657083, + "grad_norm": 1.9066816372828312, + "learning_rate": 4.090568860843836e-05, + "loss": 1.1558, + "step": 1213 + }, + { + "epoch": 1.7895706651925556, + "grad_norm": 2.127502426522616, + "learning_rate": 4.089041306453494e-05, + "loss": 1.1972, + "step": 1214 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 2.0013660295769196, + "learning_rate": 4.0875127559892015e-05, + "loss": 1.2047, + "step": 1215 + }, + { + "epoch": 1.79251888704625, + "grad_norm": 1.9904459203537852, + "learning_rate": 4.085983210409114e-05, + "loss": 1.2446, + "step": 1216 + }, + { + "epoch": 1.7939929979730975, + "grad_norm": 1.814407791540448, + "learning_rate": 4.084452670672012e-05, + "loss": 1.3015, + "step": 1217 + }, + { + "epoch": 1.7954671088999448, + "grad_norm": 1.8119198100722682, + "learning_rate": 4.082921137737299e-05, + "loss": 1.3315, + "step": 1218 + }, + { + "epoch": 1.796941219826792, + "grad_norm": 2.0153242966254457, + "learning_rate": 4.081388612564999e-05, + "loss": 1.2139, + "step": 1219 + }, + { + "epoch": 1.7984153307536392, + "grad_norm": 1.837036193648923, + "learning_rate": 4.07985509611576e-05, + "loss": 1.2622, + "step": 1220 + }, + { + "epoch": 1.7998894416804865, + "grad_norm": 1.8516436208989888, + "learning_rate": 4.078320589350851e-05, + "loss": 1.306, + "step": 1221 + }, + { + "epoch": 1.8013635526073337, + "grad_norm": 1.7578483747424278, + "learning_rate": 4.076785093232162e-05, + "loss": 1.2373, + "step": 1222 + }, + { + "epoch": 1.802837663534181, + "grad_norm": 1.8275956259573485, + "learning_rate": 4.0752486087222006e-05, + "loss": 1.2986, + "step": 1223 + }, + { + "epoch": 1.8043117744610282, + "grad_norm": 2.0905801491708997, + "learning_rate": 4.073711136784099e-05, + "loss": 1.2647, + "step": 1224 + }, + { + "epoch": 1.8057858853878754, + "grad_norm": 2.008867157496913, + "learning_rate": 4.072172678381603e-05, + "loss": 1.3697, + "step": 1225 + }, + { + "epoch": 1.8072599963147227, + "grad_norm": 1.840144560048702, + "learning_rate": 4.07063323447908e-05, + "loss": 1.1796, + "step": 1226 + }, + { + "epoch": 1.80873410724157, + "grad_norm": 1.7366482382740513, + "learning_rate": 4.0690928060415144e-05, + "loss": 1.1902, + "step": 1227 + }, + { + "epoch": 1.8102082181684171, + "grad_norm": 1.951229121348806, + "learning_rate": 4.067551394034508e-05, + "loss": 1.2459, + "step": 1228 + }, + { + "epoch": 1.8116823290952644, + "grad_norm": 1.9515362190327847, + "learning_rate": 4.066008999424279e-05, + "loss": 1.4361, + "step": 1229 + }, + { + "epoch": 1.8131564400221116, + "grad_norm": 1.804719404458753, + "learning_rate": 4.06446562317766e-05, + "loss": 1.0962, + "step": 1230 + }, + { + "epoch": 1.8146305509489589, + "grad_norm": 1.9655295780957824, + "learning_rate": 4.062921266262102e-05, + "loss": 1.3538, + "step": 1231 + }, + { + "epoch": 1.816104661875806, + "grad_norm": 1.8144401820843206, + "learning_rate": 4.0613759296456675e-05, + "loss": 1.2675, + "step": 1232 + }, + { + "epoch": 1.8175787728026536, + "grad_norm": 1.837893803235994, + "learning_rate": 4.059829614297036e-05, + "loss": 1.2578, + "step": 1233 + }, + { + "epoch": 1.8190528837295008, + "grad_norm": 1.8601099693440113, + "learning_rate": 4.058282321185498e-05, + "loss": 1.2072, + "step": 1234 + }, + { + "epoch": 1.820526994656348, + "grad_norm": 1.9126803643440953, + "learning_rate": 4.0567340512809586e-05, + "loss": 1.3911, + "step": 1235 + }, + { + "epoch": 1.8220011055831953, + "grad_norm": 1.836488941245533, + "learning_rate": 4.0551848055539345e-05, + "loss": 1.1442, + "step": 1236 + }, + { + "epoch": 1.8234752165100425, + "grad_norm": 2.200785592006832, + "learning_rate": 4.0536345849755545e-05, + "loss": 1.4227, + "step": 1237 + }, + { + "epoch": 1.8249493274368898, + "grad_norm": 2.0864498352192595, + "learning_rate": 4.0520833905175576e-05, + "loss": 1.3014, + "step": 1238 + }, + { + "epoch": 1.826423438363737, + "grad_norm": 2.0893770926850452, + "learning_rate": 4.0505312231522944e-05, + "loss": 1.2053, + "step": 1239 + }, + { + "epoch": 1.8278975492905842, + "grad_norm": 1.9714247683770754, + "learning_rate": 4.048978083852724e-05, + "loss": 1.3151, + "step": 1240 + }, + { + "epoch": 1.8293716602174315, + "grad_norm": 1.887759788268554, + "learning_rate": 4.0474239735924166e-05, + "loss": 1.1569, + "step": 1241 + }, + { + "epoch": 1.8308457711442787, + "grad_norm": 1.8476009562245232, + "learning_rate": 4.045868893345549e-05, + "loss": 1.2439, + "step": 1242 + }, + { + "epoch": 1.832319882071126, + "grad_norm": 2.0706931861821705, + "learning_rate": 4.0443128440869084e-05, + "loss": 1.3339, + "step": 1243 + }, + { + "epoch": 1.8337939929979732, + "grad_norm": 1.7172358692635747, + "learning_rate": 4.042755826791886e-05, + "loss": 1.0632, + "step": 1244 + }, + { + "epoch": 1.8352681039248204, + "grad_norm": 1.9403614325918679, + "learning_rate": 4.041197842436484e-05, + "loss": 1.2979, + "step": 1245 + }, + { + "epoch": 1.8367422148516677, + "grad_norm": 1.9239620258961594, + "learning_rate": 4.0396388919973074e-05, + "loss": 1.1112, + "step": 1246 + }, + { + "epoch": 1.838216325778515, + "grad_norm": 1.864200900904161, + "learning_rate": 4.038078976451567e-05, + "loss": 1.2435, + "step": 1247 + }, + { + "epoch": 1.8396904367053621, + "grad_norm": 1.7706432764124702, + "learning_rate": 4.036518096777082e-05, + "loss": 1.1653, + "step": 1248 + }, + { + "epoch": 1.8411645476322094, + "grad_norm": 1.8064599519033935, + "learning_rate": 4.0349562539522725e-05, + "loss": 1.2137, + "step": 1249 + }, + { + "epoch": 1.8426386585590566, + "grad_norm": 1.8003111782244594, + "learning_rate": 4.033393448956162e-05, + "loss": 1.1869, + "step": 1250 + }, + { + "epoch": 1.8426386585590566, + "eval_bleu": 0.048702265004150425, + "eval_bleu_1gram": 0.34895182098107314, + "eval_bleu_2gram": 0.12077243209842681, + "eval_bleu_3gram": 0.04874536480753008, + "eval_bleu_4gram": 0.022646375855692985, + "eval_rag_val_loss": 1.5452378831243003, + "eval_rouge1": 0.3332082488018953, + "eval_rouge2": 0.11316398251026843, + "eval_rougeL": 0.3126958399847377, + "step": 1250 + }, + { + "epoch": 1.8441127694859039, + "grad_norm": 1.948282081750063, + "learning_rate": 4.03182968276838e-05, + "loss": 1.1966, + "step": 1251 + }, + { + "epoch": 1.845586880412751, + "grad_norm": 1.8194222954460737, + "learning_rate": 4.030264956369157e-05, + "loss": 1.1786, + "step": 1252 + }, + { + "epoch": 1.8470609913395983, + "grad_norm": 1.7283532017835233, + "learning_rate": 4.028699270739326e-05, + "loss": 1.1096, + "step": 1253 + }, + { + "epoch": 1.8485351022664456, + "grad_norm": 1.8168961059268236, + "learning_rate": 4.027132626860318e-05, + "loss": 1.1748, + "step": 1254 + }, + { + "epoch": 1.8500092131932928, + "grad_norm": 1.9992881938743918, + "learning_rate": 4.02556502571417e-05, + "loss": 1.3077, + "step": 1255 + }, + { + "epoch": 1.85148332412014, + "grad_norm": 2.0348938170527933, + "learning_rate": 4.023996468283515e-05, + "loss": 1.1406, + "step": 1256 + }, + { + "epoch": 1.8529574350469873, + "grad_norm": 1.8747836941881846, + "learning_rate": 4.022426955551588e-05, + "loss": 1.1618, + "step": 1257 + }, + { + "epoch": 1.8544315459738345, + "grad_norm": 1.7492383934509335, + "learning_rate": 4.020856488502221e-05, + "loss": 1.1624, + "step": 1258 + }, + { + "epoch": 1.8559056569006818, + "grad_norm": 1.8520217519104791, + "learning_rate": 4.019285068119845e-05, + "loss": 1.3805, + "step": 1259 + }, + { + "epoch": 1.857379767827529, + "grad_norm": 2.0601375806105042, + "learning_rate": 4.017712695389487e-05, + "loss": 1.2948, + "step": 1260 + }, + { + "epoch": 1.8588538787543762, + "grad_norm": 1.7210131224154699, + "learning_rate": 4.0161393712967756e-05, + "loss": 1.1627, + "step": 1261 + }, + { + "epoch": 1.8603279896812235, + "grad_norm": 1.803741206982409, + "learning_rate": 4.01456509682793e-05, + "loss": 1.2001, + "step": 1262 + }, + { + "epoch": 1.8618021006080707, + "grad_norm": 1.871970526752597, + "learning_rate": 4.012989872969768e-05, + "loss": 1.2769, + "step": 1263 + }, + { + "epoch": 1.863276211534918, + "grad_norm": 1.8615618115741301, + "learning_rate": 4.011413700709703e-05, + "loss": 1.1997, + "step": 1264 + }, + { + "epoch": 1.8647503224617652, + "grad_norm": 1.801478054585274, + "learning_rate": 4.009836581035742e-05, + "loss": 1.1766, + "step": 1265 + }, + { + "epoch": 1.8662244333886124, + "grad_norm": 1.940315416001795, + "learning_rate": 4.008258514936486e-05, + "loss": 1.2656, + "step": 1266 + }, + { + "epoch": 1.8676985443154597, + "grad_norm": 1.84795430462972, + "learning_rate": 4.006679503401129e-05, + "loss": 1.2304, + "step": 1267 + }, + { + "epoch": 1.869172655242307, + "grad_norm": 2.0210821516355213, + "learning_rate": 4.0050995474194576e-05, + "loss": 1.387, + "step": 1268 + }, + { + "epoch": 1.8706467661691542, + "grad_norm": 1.8351643983961288, + "learning_rate": 4.003518647981852e-05, + "loss": 1.3058, + "step": 1269 + }, + { + "epoch": 1.8721208770960014, + "grad_norm": 1.8960338807841222, + "learning_rate": 4.0019368060792806e-05, + "loss": 1.0698, + "step": 1270 + }, + { + "epoch": 1.8735949880228486, + "grad_norm": 1.7714793952779375, + "learning_rate": 4.000354022703306e-05, + "loss": 1.2642, + "step": 1271 + }, + { + "epoch": 1.8750690989496959, + "grad_norm": 1.9631888547347802, + "learning_rate": 3.998770298846079e-05, + "loss": 1.1407, + "step": 1272 + }, + { + "epoch": 1.876543209876543, + "grad_norm": 1.8885628669784618, + "learning_rate": 3.9971856355003396e-05, + "loss": 1.2952, + "step": 1273 + }, + { + "epoch": 1.8780173208033903, + "grad_norm": 1.9533328746799457, + "learning_rate": 3.9956000336594185e-05, + "loss": 1.2955, + "step": 1274 + }, + { + "epoch": 1.8794914317302376, + "grad_norm": 1.9145949265464923, + "learning_rate": 3.994013494317233e-05, + "loss": 1.3606, + "step": 1275 + }, + { + "epoch": 1.8809655426570848, + "grad_norm": 2.015132873546277, + "learning_rate": 3.9924260184682894e-05, + "loss": 1.282, + "step": 1276 + }, + { + "epoch": 1.882439653583932, + "grad_norm": 2.19542131799087, + "learning_rate": 3.9908376071076805e-05, + "loss": 1.2211, + "step": 1277 + }, + { + "epoch": 1.8839137645107793, + "grad_norm": 1.7760036236966599, + "learning_rate": 3.9892482612310836e-05, + "loss": 1.0788, + "step": 1278 + }, + { + "epoch": 1.8853878754376265, + "grad_norm": 1.8199160557555802, + "learning_rate": 3.9876579818347654e-05, + "loss": 1.1686, + "step": 1279 + }, + { + "epoch": 1.8868619863644738, + "grad_norm": 2.018995438999959, + "learning_rate": 3.986066769915575e-05, + "loss": 1.2312, + "step": 1280 + }, + { + "epoch": 1.888336097291321, + "grad_norm": 1.8573979357150452, + "learning_rate": 3.984474626470948e-05, + "loss": 1.3486, + "step": 1281 + }, + { + "epoch": 1.8898102082181683, + "grad_norm": 2.038570183847055, + "learning_rate": 3.982881552498902e-05, + "loss": 1.2178, + "step": 1282 + }, + { + "epoch": 1.8912843191450157, + "grad_norm": 1.7797308349692056, + "learning_rate": 3.981287548998039e-05, + "loss": 1.3454, + "step": 1283 + }, + { + "epoch": 1.892758430071863, + "grad_norm": 1.8065937098004767, + "learning_rate": 3.979692616967543e-05, + "loss": 1.1505, + "step": 1284 + }, + { + "epoch": 1.8942325409987102, + "grad_norm": 1.9516186818327408, + "learning_rate": 3.978096757407182e-05, + "loss": 1.3331, + "step": 1285 + }, + { + "epoch": 1.8957066519255574, + "grad_norm": 2.1568726456086833, + "learning_rate": 3.976499971317302e-05, + "loss": 1.2342, + "step": 1286 + }, + { + "epoch": 1.8971807628524047, + "grad_norm": 1.8644989321340673, + "learning_rate": 3.974902259698833e-05, + "loss": 1.4244, + "step": 1287 + }, + { + "epoch": 1.898654873779252, + "grad_norm": 1.8607895543298092, + "learning_rate": 3.973303623553283e-05, + "loss": 1.231, + "step": 1288 + }, + { + "epoch": 1.9001289847060991, + "grad_norm": 1.7732760990319096, + "learning_rate": 3.9717040638827406e-05, + "loss": 1.212, + "step": 1289 + }, + { + "epoch": 1.9016030956329464, + "grad_norm": 1.6088192174525329, + "learning_rate": 3.9701035816898734e-05, + "loss": 1.167, + "step": 1290 + }, + { + "epoch": 1.9030772065597936, + "grad_norm": 1.783607328465992, + "learning_rate": 3.9685021779779264e-05, + "loss": 1.3402, + "step": 1291 + }, + { + "epoch": 1.9045513174866409, + "grad_norm": 1.715458648899289, + "learning_rate": 3.966899853750724e-05, + "loss": 1.3654, + "step": 1292 + }, + { + "epoch": 1.906025428413488, + "grad_norm": 1.7972168970332127, + "learning_rate": 3.9652966100126655e-05, + "loss": 1.1601, + "step": 1293 + }, + { + "epoch": 1.9074995393403353, + "grad_norm": 1.7842336060843975, + "learning_rate": 3.9636924477687265e-05, + "loss": 1.2925, + "step": 1294 + }, + { + "epoch": 1.9089736502671826, + "grad_norm": 1.8458450827931445, + "learning_rate": 3.9620873680244616e-05, + "loss": 1.3226, + "step": 1295 + }, + { + "epoch": 1.9104477611940298, + "grad_norm": 1.8390891437010344, + "learning_rate": 3.960481371785997e-05, + "loss": 1.1871, + "step": 1296 + }, + { + "epoch": 1.911921872120877, + "grad_norm": 1.7721933041959994, + "learning_rate": 3.958874460060035e-05, + "loss": 1.1808, + "step": 1297 + }, + { + "epoch": 1.9133959830477243, + "grad_norm": 1.7477844382259737, + "learning_rate": 3.95726663385385e-05, + "loss": 1.1591, + "step": 1298 + }, + { + "epoch": 1.9148700939745718, + "grad_norm": 1.9245669113067494, + "learning_rate": 3.955657894175293e-05, + "loss": 1.3242, + "step": 1299 + }, + { + "epoch": 1.916344204901419, + "grad_norm": 1.8511991506492163, + "learning_rate": 3.9540482420327845e-05, + "loss": 1.1912, + "step": 1300 + }, + { + "epoch": 1.9178183158282662, + "grad_norm": 2.2430300309258033, + "learning_rate": 3.952437678435319e-05, + "loss": 1.2118, + "step": 1301 + }, + { + "epoch": 1.9192924267551135, + "grad_norm": 1.9149109497480785, + "learning_rate": 3.950826204392461e-05, + "loss": 1.2133, + "step": 1302 + }, + { + "epoch": 1.9207665376819607, + "grad_norm": 1.6859895516068424, + "learning_rate": 3.949213820914347e-05, + "loss": 1.2436, + "step": 1303 + }, + { + "epoch": 1.922240648608808, + "grad_norm": 2.03249063018103, + "learning_rate": 3.9476005290116814e-05, + "loss": 1.2645, + "step": 1304 + }, + { + "epoch": 1.9237147595356552, + "grad_norm": 1.812264328466209, + "learning_rate": 3.94598632969574e-05, + "loss": 1.1948, + "step": 1305 + }, + { + "epoch": 1.9251888704625024, + "grad_norm": 1.8221580089262344, + "learning_rate": 3.944371223978366e-05, + "loss": 1.1905, + "step": 1306 + }, + { + "epoch": 1.9266629813893497, + "grad_norm": 2.0170351758394625, + "learning_rate": 3.942755212871973e-05, + "loss": 1.2373, + "step": 1307 + }, + { + "epoch": 1.928137092316197, + "grad_norm": 1.8732271714295254, + "learning_rate": 3.94113829738954e-05, + "loss": 1.1487, + "step": 1308 + }, + { + "epoch": 1.9296112032430441, + "grad_norm": 1.818437519299654, + "learning_rate": 3.939520478544614e-05, + "loss": 1.3074, + "step": 1309 + }, + { + "epoch": 1.9310853141698914, + "grad_norm": 1.803439811317709, + "learning_rate": 3.937901757351307e-05, + "loss": 1.3, + "step": 1310 + }, + { + "epoch": 1.9325594250967386, + "grad_norm": 1.9745017908145668, + "learning_rate": 3.936282134824297e-05, + "loss": 1.1545, + "step": 1311 + }, + { + "epoch": 1.9340335360235859, + "grad_norm": 1.8393019346138344, + "learning_rate": 3.93466161197883e-05, + "loss": 1.3737, + "step": 1312 + }, + { + "epoch": 1.935507646950433, + "grad_norm": 1.7163009534858393, + "learning_rate": 3.933040189830711e-05, + "loss": 1.0954, + "step": 1313 + }, + { + "epoch": 1.9369817578772803, + "grad_norm": 1.8541571227553122, + "learning_rate": 3.931417869396313e-05, + "loss": 1.2255, + "step": 1314 + }, + { + "epoch": 1.9384558688041276, + "grad_norm": 1.7203077712933845, + "learning_rate": 3.929794651692571e-05, + "loss": 1.1313, + "step": 1315 + }, + { + "epoch": 1.9399299797309748, + "grad_norm": 1.8044675882947914, + "learning_rate": 3.928170537736981e-05, + "loss": 1.1159, + "step": 1316 + }, + { + "epoch": 1.941404090657822, + "grad_norm": 1.6383659431857382, + "learning_rate": 3.9265455285476025e-05, + "loss": 1.1734, + "step": 1317 + }, + { + "epoch": 1.9428782015846693, + "grad_norm": 1.7968349949903029, + "learning_rate": 3.9249196251430556e-05, + "loss": 1.3347, + "step": 1318 + }, + { + "epoch": 1.9443523125115165, + "grad_norm": 1.9054955412172647, + "learning_rate": 3.92329282854252e-05, + "loss": 1.4535, + "step": 1319 + }, + { + "epoch": 1.9458264234383638, + "grad_norm": 2.0137488334742364, + "learning_rate": 3.9216651397657364e-05, + "loss": 1.4461, + "step": 1320 + }, + { + "epoch": 1.947300534365211, + "grad_norm": 2.0266086076066325, + "learning_rate": 3.9200365598330056e-05, + "loss": 1.3215, + "step": 1321 + }, + { + "epoch": 1.9487746452920582, + "grad_norm": 1.943006934023513, + "learning_rate": 3.9184070897651854e-05, + "loss": 1.383, + "step": 1322 + }, + { + "epoch": 1.9502487562189055, + "grad_norm": 1.9190659502076364, + "learning_rate": 3.916776730583691e-05, + "loss": 1.1812, + "step": 1323 + }, + { + "epoch": 1.9517228671457527, + "grad_norm": 2.111401072326197, + "learning_rate": 3.915145483310498e-05, + "loss": 1.322, + "step": 1324 + }, + { + "epoch": 1.9531969780726, + "grad_norm": 1.9477670037476407, + "learning_rate": 3.9135133489681356e-05, + "loss": 1.2421, + "step": 1325 + }, + { + "epoch": 1.9546710889994472, + "grad_norm": 1.8113719455271258, + "learning_rate": 3.91188032857969e-05, + "loss": 1.1733, + "step": 1326 + }, + { + "epoch": 1.9561451999262944, + "grad_norm": 1.8227026968220015, + "learning_rate": 3.910246423168803e-05, + "loss": 1.2431, + "step": 1327 + }, + { + "epoch": 1.9576193108531417, + "grad_norm": 2.092538853401969, + "learning_rate": 3.908611633759672e-05, + "loss": 1.3122, + "step": 1328 + }, + { + "epoch": 1.959093421779989, + "grad_norm": 1.8688696143612555, + "learning_rate": 3.906975961377046e-05, + "loss": 1.3971, + "step": 1329 + }, + { + "epoch": 1.9605675327068361, + "grad_norm": 1.9890412262814996, + "learning_rate": 3.905339407046231e-05, + "loss": 1.4756, + "step": 1330 + }, + { + "epoch": 1.9620416436336834, + "grad_norm": 2.012499019077607, + "learning_rate": 3.9037019717930826e-05, + "loss": 1.4202, + "step": 1331 + }, + { + "epoch": 1.9635157545605306, + "grad_norm": 1.8199782821910526, + "learning_rate": 3.902063656644012e-05, + "loss": 1.2034, + "step": 1332 + }, + { + "epoch": 1.9649898654873779, + "grad_norm": 1.7866698011090887, + "learning_rate": 3.900424462625977e-05, + "loss": 1.293, + "step": 1333 + }, + { + "epoch": 1.966463976414225, + "grad_norm": 1.7856965363846546, + "learning_rate": 3.898784390766491e-05, + "loss": 1.2813, + "step": 1334 + }, + { + "epoch": 1.9679380873410723, + "grad_norm": 1.7445537423897435, + "learning_rate": 3.897143442093616e-05, + "loss": 1.2391, + "step": 1335 + }, + { + "epoch": 1.9694121982679196, + "grad_norm": 1.7176120805886745, + "learning_rate": 3.895501617635964e-05, + "loss": 1.0774, + "step": 1336 + }, + { + "epoch": 1.9708863091947668, + "grad_norm": 1.802678959781068, + "learning_rate": 3.893858918422693e-05, + "loss": 1.241, + "step": 1337 + }, + { + "epoch": 1.972360420121614, + "grad_norm": 1.7748554627577509, + "learning_rate": 3.892215345483515e-05, + "loss": 1.1883, + "step": 1338 + }, + { + "epoch": 1.9738345310484613, + "grad_norm": 1.9104180965882762, + "learning_rate": 3.890570899848685e-05, + "loss": 1.4005, + "step": 1339 + }, + { + "epoch": 1.9753086419753085, + "grad_norm": 1.7871510651942184, + "learning_rate": 3.888925582549006e-05, + "loss": 1.2861, + "step": 1340 + }, + { + "epoch": 1.9767827529021558, + "grad_norm": 1.7658735792330242, + "learning_rate": 3.887279394615829e-05, + "loss": 1.2319, + "step": 1341 + }, + { + "epoch": 1.978256863829003, + "grad_norm": 1.840935385266865, + "learning_rate": 3.885632337081049e-05, + "loss": 1.277, + "step": 1342 + }, + { + "epoch": 1.9797309747558502, + "grad_norm": 1.7053744658315741, + "learning_rate": 3.8839844109771086e-05, + "loss": 1.1554, + "step": 1343 + }, + { + "epoch": 1.9812050856826975, + "grad_norm": 1.8866519639730726, + "learning_rate": 3.8823356173369895e-05, + "loss": 1.403, + "step": 1344 + }, + { + "epoch": 1.9826791966095447, + "grad_norm": 1.626864610624901, + "learning_rate": 3.8806859571942244e-05, + "loss": 1.2483, + "step": 1345 + }, + { + "epoch": 1.984153307536392, + "grad_norm": 1.8965975789502902, + "learning_rate": 3.8790354315828846e-05, + "loss": 1.2473, + "step": 1346 + }, + { + "epoch": 1.9856274184632392, + "grad_norm": 1.7827270139614713, + "learning_rate": 3.877384041537584e-05, + "loss": 1.4432, + "step": 1347 + }, + { + "epoch": 1.9871015293900864, + "grad_norm": 2.042235029348997, + "learning_rate": 3.8757317880934786e-05, + "loss": 1.3813, + "step": 1348 + }, + { + "epoch": 1.988575640316934, + "grad_norm": 1.7606918827091844, + "learning_rate": 3.8740786722862676e-05, + "loss": 1.1214, + "step": 1349 + }, + { + "epoch": 1.9900497512437811, + "grad_norm": 1.8107840208854808, + "learning_rate": 3.872424695152189e-05, + "loss": 1.2968, + "step": 1350 + }, + { + "epoch": 1.9915238621706284, + "grad_norm": 1.9953451227136578, + "learning_rate": 3.870769857728022e-05, + "loss": 1.2203, + "step": 1351 + }, + { + "epoch": 1.9929979730974756, + "grad_norm": 1.9744930365124012, + "learning_rate": 3.869114161051082e-05, + "loss": 1.2328, + "step": 1352 + }, + { + "epoch": 1.9944720840243229, + "grad_norm": 1.8419812657276473, + "learning_rate": 3.867457606159226e-05, + "loss": 1.1942, + "step": 1353 + }, + { + "epoch": 1.99594619495117, + "grad_norm": 1.7611947305859108, + "learning_rate": 3.86580019409085e-05, + "loss": 1.1931, + "step": 1354 + }, + { + "epoch": 1.9974203058780173, + "grad_norm": 1.6344202330768596, + "learning_rate": 3.8641419258848835e-05, + "loss": 1.0649, + "step": 1355 + }, + { + "epoch": 1.9988944168048646, + "grad_norm": 1.9735311550892811, + "learning_rate": 3.862482802580795e-05, + "loss": 1.2363, + "step": 1356 + }, + { + "epoch": 2.000368527731712, + "grad_norm": 1.8442219275693046, + "learning_rate": 3.860822825218588e-05, + "loss": 1.1441, + "step": 1357 + }, + { + "epoch": 2.0018426386585593, + "grad_norm": 1.7857747748892803, + "learning_rate": 3.859161994838803e-05, + "loss": 1.0668, + "step": 1358 + }, + { + "epoch": 2.0033167495854065, + "grad_norm": 1.7010962626036672, + "learning_rate": 3.8575003124825135e-05, + "loss": 0.8501, + "step": 1359 + }, + { + "epoch": 2.0047908605122537, + "grad_norm": 1.7876735442953533, + "learning_rate": 3.855837779191329e-05, + "loss": 0.7953, + "step": 1360 + }, + { + "epoch": 2.006264971439101, + "grad_norm": 1.7058387624073088, + "learning_rate": 3.8541743960073893e-05, + "loss": 0.9207, + "step": 1361 + }, + { + "epoch": 2.0077390823659482, + "grad_norm": 1.6804007501823424, + "learning_rate": 3.8525101639733706e-05, + "loss": 0.9173, + "step": 1362 + }, + { + "epoch": 2.0092131932927955, + "grad_norm": 1.8163975212953913, + "learning_rate": 3.850845084132478e-05, + "loss": 0.8751, + "step": 1363 + }, + { + "epoch": 2.0106873042196427, + "grad_norm": 1.5901131366568777, + "learning_rate": 3.84917915752845e-05, + "loss": 1.0104, + "step": 1364 + }, + { + "epoch": 2.01216141514649, + "grad_norm": 1.655024020778441, + "learning_rate": 3.847512385205556e-05, + "loss": 0.7875, + "step": 1365 + }, + { + "epoch": 2.013635526073337, + "grad_norm": 1.7146582709522675, + "learning_rate": 3.845844768208593e-05, + "loss": 0.9048, + "step": 1366 + }, + { + "epoch": 2.0151096370001844, + "grad_norm": 1.9761531843276894, + "learning_rate": 3.8441763075828904e-05, + "loss": 0.8797, + "step": 1367 + }, + { + "epoch": 2.0165837479270317, + "grad_norm": 1.972628693340942, + "learning_rate": 3.842507004374304e-05, + "loss": 0.8751, + "step": 1368 + }, + { + "epoch": 2.018057858853879, + "grad_norm": 1.9838013908782093, + "learning_rate": 3.8408368596292224e-05, + "loss": 0.9271, + "step": 1369 + }, + { + "epoch": 2.019531969780726, + "grad_norm": 2.0103643802570343, + "learning_rate": 3.839165874394555e-05, + "loss": 0.8759, + "step": 1370 + }, + { + "epoch": 2.0210060807075734, + "grad_norm": 2.082068924267785, + "learning_rate": 3.8374940497177434e-05, + "loss": 0.9728, + "step": 1371 + }, + { + "epoch": 2.0224801916344206, + "grad_norm": 2.1116217057104105, + "learning_rate": 3.835821386646753e-05, + "loss": 0.9845, + "step": 1372 + }, + { + "epoch": 2.023954302561268, + "grad_norm": 2.0508596060179722, + "learning_rate": 3.834147886230074e-05, + "loss": 0.9927, + "step": 1373 + }, + { + "epoch": 2.025428413488115, + "grad_norm": 1.997013842989832, + "learning_rate": 3.8324735495167246e-05, + "loss": 1.0003, + "step": 1374 + }, + { + "epoch": 2.0269025244149623, + "grad_norm": 2.1340776540754716, + "learning_rate": 3.8307983775562435e-05, + "loss": 0.891, + "step": 1375 + }, + { + "epoch": 2.0283766353418096, + "grad_norm": 2.1687888853781043, + "learning_rate": 3.8291223713986955e-05, + "loss": 0.8061, + "step": 1376 + }, + { + "epoch": 2.029850746268657, + "grad_norm": 2.18468446735995, + "learning_rate": 3.827445532094669e-05, + "loss": 0.9903, + "step": 1377 + }, + { + "epoch": 2.031324857195504, + "grad_norm": 2.0818416086827014, + "learning_rate": 3.8257678606952705e-05, + "loss": 0.8366, + "step": 1378 + }, + { + "epoch": 2.0327989681223513, + "grad_norm": 2.0400775122408774, + "learning_rate": 3.824089358252133e-05, + "loss": 0.9875, + "step": 1379 + }, + { + "epoch": 2.0342730790491985, + "grad_norm": 2.183074834920526, + "learning_rate": 3.822410025817406e-05, + "loss": 0.8794, + "step": 1380 + }, + { + "epoch": 2.0357471899760458, + "grad_norm": 2.0378284447053066, + "learning_rate": 3.820729864443764e-05, + "loss": 0.789, + "step": 1381 + }, + { + "epoch": 2.037221300902893, + "grad_norm": 1.9510525503701919, + "learning_rate": 3.819048875184398e-05, + "loss": 0.8298, + "step": 1382 + }, + { + "epoch": 2.0386954118297402, + "grad_norm": 2.022089328637764, + "learning_rate": 3.8173670590930165e-05, + "loss": 0.7973, + "step": 1383 + }, + { + "epoch": 2.0401695227565875, + "grad_norm": 2.1023363014749834, + "learning_rate": 3.815684417223851e-05, + "loss": 0.9196, + "step": 1384 + }, + { + "epoch": 2.0416436336834347, + "grad_norm": 2.0829569921718964, + "learning_rate": 3.814000950631647e-05, + "loss": 0.9855, + "step": 1385 + }, + { + "epoch": 2.043117744610282, + "grad_norm": 2.1110524607202072, + "learning_rate": 3.812316660371666e-05, + "loss": 0.856, + "step": 1386 + }, + { + "epoch": 2.044591855537129, + "grad_norm": 2.163798953097036, + "learning_rate": 3.810631547499692e-05, + "loss": 0.8027, + "step": 1387 + }, + { + "epoch": 2.0460659664639764, + "grad_norm": 1.881829349365738, + "learning_rate": 3.808945613072017e-05, + "loss": 0.7646, + "step": 1388 + }, + { + "epoch": 2.0475400773908237, + "grad_norm": 1.986875025535312, + "learning_rate": 3.807258858145453e-05, + "loss": 0.9371, + "step": 1389 + }, + { + "epoch": 2.049014188317671, + "grad_norm": 1.9492742801928584, + "learning_rate": 3.8055712837773225e-05, + "loss": 0.9211, + "step": 1390 + }, + { + "epoch": 2.050488299244518, + "grad_norm": 2.028289985705516, + "learning_rate": 3.803882891025466e-05, + "loss": 0.8203, + "step": 1391 + }, + { + "epoch": 2.0519624101713654, + "grad_norm": 1.9081633923863692, + "learning_rate": 3.802193680948236e-05, + "loss": 0.8979, + "step": 1392 + }, + { + "epoch": 2.0534365210982126, + "grad_norm": 2.1828756226333, + "learning_rate": 3.800503654604493e-05, + "loss": 0.82, + "step": 1393 + }, + { + "epoch": 2.05491063202506, + "grad_norm": 1.972326209531169, + "learning_rate": 3.798812813053615e-05, + "loss": 0.7363, + "step": 1394 + }, + { + "epoch": 2.056384742951907, + "grad_norm": 1.788951753745132, + "learning_rate": 3.7971211573554865e-05, + "loss": 0.9638, + "step": 1395 + }, + { + "epoch": 2.0578588538787543, + "grad_norm": 2.0160018213808826, + "learning_rate": 3.795428688570505e-05, + "loss": 0.8725, + "step": 1396 + }, + { + "epoch": 2.0593329648056016, + "grad_norm": 2.041376197030208, + "learning_rate": 3.793735407759577e-05, + "loss": 0.8207, + "step": 1397 + }, + { + "epoch": 2.060807075732449, + "grad_norm": 2.18431229617341, + "learning_rate": 3.792041315984118e-05, + "loss": 0.9525, + "step": 1398 + }, + { + "epoch": 2.062281186659296, + "grad_norm": 2.0891993019203627, + "learning_rate": 3.7903464143060506e-05, + "loss": 0.8858, + "step": 1399 + }, + { + "epoch": 2.0637552975861433, + "grad_norm": 2.1831610017085556, + "learning_rate": 3.788650703787808e-05, + "loss": 1.0065, + "step": 1400 + }, + { + "epoch": 2.0652294085129905, + "grad_norm": 1.8813815555642048, + "learning_rate": 3.7869541854923275e-05, + "loss": 0.8553, + "step": 1401 + }, + { + "epoch": 2.0667035194398378, + "grad_norm": 2.028008794761714, + "learning_rate": 3.785256860483054e-05, + "loss": 0.8109, + "step": 1402 + }, + { + "epoch": 2.068177630366685, + "grad_norm": 2.3541636734214815, + "learning_rate": 3.783558729823939e-05, + "loss": 1.0432, + "step": 1403 + }, + { + "epoch": 2.0696517412935322, + "grad_norm": 2.077943492792988, + "learning_rate": 3.781859794579436e-05, + "loss": 0.8413, + "step": 1404 + }, + { + "epoch": 2.0711258522203795, + "grad_norm": 2.1728780063348743, + "learning_rate": 3.780160055814507e-05, + "loss": 0.9425, + "step": 1405 + }, + { + "epoch": 2.0725999631472267, + "grad_norm": 2.134764954897539, + "learning_rate": 3.778459514594613e-05, + "loss": 0.8866, + "step": 1406 + }, + { + "epoch": 2.074074074074074, + "grad_norm": 2.074411616724855, + "learning_rate": 3.776758171985723e-05, + "loss": 0.781, + "step": 1407 + }, + { + "epoch": 2.075548185000921, + "grad_norm": 2.194043861986983, + "learning_rate": 3.775056029054304e-05, + "loss": 0.9234, + "step": 1408 + }, + { + "epoch": 2.0770222959277684, + "grad_norm": 2.157607604843163, + "learning_rate": 3.773353086867328e-05, + "loss": 0.8635, + "step": 1409 + }, + { + "epoch": 2.0784964068546157, + "grad_norm": 1.9892615279405041, + "learning_rate": 3.7716493464922654e-05, + "loss": 0.9011, + "step": 1410 + }, + { + "epoch": 2.079970517781463, + "grad_norm": 1.9791073907374646, + "learning_rate": 3.769944808997088e-05, + "loss": 0.8467, + "step": 1411 + }, + { + "epoch": 2.08144462870831, + "grad_norm": 2.168802187069107, + "learning_rate": 3.768239475450269e-05, + "loss": 0.915, + "step": 1412 + }, + { + "epoch": 2.0829187396351574, + "grad_norm": 2.2209268556218733, + "learning_rate": 3.7665333469207766e-05, + "loss": 0.8558, + "step": 1413 + }, + { + "epoch": 2.0843928505620046, + "grad_norm": 1.8506138942891583, + "learning_rate": 3.7648264244780804e-05, + "loss": 0.8275, + "step": 1414 + }, + { + "epoch": 2.085866961488852, + "grad_norm": 1.9264525197264786, + "learning_rate": 3.7631187091921483e-05, + "loss": 0.8782, + "step": 1415 + }, + { + "epoch": 2.087341072415699, + "grad_norm": 2.1009224409760647, + "learning_rate": 3.761410202133443e-05, + "loss": 0.9072, + "step": 1416 + }, + { + "epoch": 2.0888151833425463, + "grad_norm": 1.9073569413881535, + "learning_rate": 3.759700904372924e-05, + "loss": 0.8315, + "step": 1417 + }, + { + "epoch": 2.0902892942693936, + "grad_norm": 1.9217323312295775, + "learning_rate": 3.757990816982046e-05, + "loss": 0.8848, + "step": 1418 + }, + { + "epoch": 2.091763405196241, + "grad_norm": 2.090187115848783, + "learning_rate": 3.756279941032761e-05, + "loss": 0.9534, + "step": 1419 + }, + { + "epoch": 2.093237516123088, + "grad_norm": 2.096881858484732, + "learning_rate": 3.754568277597512e-05, + "loss": 0.9431, + "step": 1420 + }, + { + "epoch": 2.0947116270499353, + "grad_norm": 1.8975055432144436, + "learning_rate": 3.7528558277492395e-05, + "loss": 0.8695, + "step": 1421 + }, + { + "epoch": 2.096185737976783, + "grad_norm": 2.009660160126084, + "learning_rate": 3.751142592561373e-05, + "loss": 0.8803, + "step": 1422 + }, + { + "epoch": 2.09765984890363, + "grad_norm": 2.173873969431794, + "learning_rate": 3.749428573107837e-05, + "loss": 0.9142, + "step": 1423 + }, + { + "epoch": 2.0991339598304775, + "grad_norm": 2.20462688360011, + "learning_rate": 3.747713770463046e-05, + "loss": 0.9121, + "step": 1424 + }, + { + "epoch": 2.1006080707573247, + "grad_norm": 2.2194274083155934, + "learning_rate": 3.7459981857019064e-05, + "loss": 0.9857, + "step": 1425 + }, + { + "epoch": 2.102082181684172, + "grad_norm": 2.261097877155455, + "learning_rate": 3.7442818198998156e-05, + "loss": 0.9549, + "step": 1426 + }, + { + "epoch": 2.103556292611019, + "grad_norm": 2.0146018100063494, + "learning_rate": 3.7425646741326585e-05, + "loss": 0.937, + "step": 1427 + }, + { + "epoch": 2.1050304035378664, + "grad_norm": 2.0066025944024872, + "learning_rate": 3.74084674947681e-05, + "loss": 0.8741, + "step": 1428 + }, + { + "epoch": 2.1065045144647137, + "grad_norm": 2.0212309010393055, + "learning_rate": 3.739128047009134e-05, + "loss": 0.8549, + "step": 1429 + }, + { + "epoch": 2.107978625391561, + "grad_norm": 1.8842845559754433, + "learning_rate": 3.7374085678069794e-05, + "loss": 0.9084, + "step": 1430 + }, + { + "epoch": 2.109452736318408, + "grad_norm": 1.9773793097124184, + "learning_rate": 3.735688312948186e-05, + "loss": 0.9287, + "step": 1431 + }, + { + "epoch": 2.1109268472452554, + "grad_norm": 2.3539347801669837, + "learning_rate": 3.733967283511077e-05, + "loss": 1.05, + "step": 1432 + }, + { + "epoch": 2.1124009581721026, + "grad_norm": 2.3238116765365935, + "learning_rate": 3.7322454805744605e-05, + "loss": 1.0657, + "step": 1433 + }, + { + "epoch": 2.11387506909895, + "grad_norm": 1.961026132715875, + "learning_rate": 3.730522905217632e-05, + "loss": 0.8128, + "step": 1434 + }, + { + "epoch": 2.115349180025797, + "grad_norm": 2.0635096072987067, + "learning_rate": 3.728799558520369e-05, + "loss": 1.0626, + "step": 1435 + }, + { + "epoch": 2.1168232909526443, + "grad_norm": 2.263305529094511, + "learning_rate": 3.7270754415629346e-05, + "loss": 0.9281, + "step": 1436 + }, + { + "epoch": 2.1182974018794916, + "grad_norm": 2.2399624407548515, + "learning_rate": 3.725350555426072e-05, + "loss": 0.9108, + "step": 1437 + }, + { + "epoch": 2.119771512806339, + "grad_norm": 2.0148110335653224, + "learning_rate": 3.7236249011910085e-05, + "loss": 0.7875, + "step": 1438 + }, + { + "epoch": 2.121245623733186, + "grad_norm": 1.9612332307233578, + "learning_rate": 3.7218984799394534e-05, + "loss": 0.9921, + "step": 1439 + }, + { + "epoch": 2.1227197346600333, + "grad_norm": 2.170199179935063, + "learning_rate": 3.7201712927535954e-05, + "loss": 0.9712, + "step": 1440 + }, + { + "epoch": 2.1241938455868805, + "grad_norm": 1.964319056686202, + "learning_rate": 3.7184433407161026e-05, + "loss": 0.9123, + "step": 1441 + }, + { + "epoch": 2.1256679565137278, + "grad_norm": 2.232532728751973, + "learning_rate": 3.716714624910126e-05, + "loss": 0.8909, + "step": 1442 + }, + { + "epoch": 2.127142067440575, + "grad_norm": 2.162264741658338, + "learning_rate": 3.714985146419291e-05, + "loss": 0.8618, + "step": 1443 + }, + { + "epoch": 2.1286161783674222, + "grad_norm": 2.112909816740118, + "learning_rate": 3.713254906327703e-05, + "loss": 0.8012, + "step": 1444 + }, + { + "epoch": 2.1300902892942695, + "grad_norm": 2.01443209575205, + "learning_rate": 3.711523905719946e-05, + "loss": 0.9044, + "step": 1445 + }, + { + "epoch": 2.1315644002211167, + "grad_norm": 1.9147663924156424, + "learning_rate": 3.70979214568108e-05, + "loss": 0.8598, + "step": 1446 + }, + { + "epoch": 2.133038511147964, + "grad_norm": 2.4083290531248007, + "learning_rate": 3.70805962729664e-05, + "loss": 0.9821, + "step": 1447 + }, + { + "epoch": 2.134512622074811, + "grad_norm": 2.0323565476810055, + "learning_rate": 3.706326351652636e-05, + "loss": 0.9298, + "step": 1448 + }, + { + "epoch": 2.1359867330016584, + "grad_norm": 2.289939634202132, + "learning_rate": 3.704592319835557e-05, + "loss": 0.7555, + "step": 1449 + }, + { + "epoch": 2.1374608439285057, + "grad_norm": 2.0825533614815175, + "learning_rate": 3.702857532932359e-05, + "loss": 0.9834, + "step": 1450 + }, + { + "epoch": 2.138934954855353, + "grad_norm": 2.1134694239523424, + "learning_rate": 3.7011219920304774e-05, + "loss": 0.9361, + "step": 1451 + }, + { + "epoch": 2.1404090657822, + "grad_norm": 2.081678750721065, + "learning_rate": 3.699385698217816e-05, + "loss": 1.0017, + "step": 1452 + }, + { + "epoch": 2.1418831767090474, + "grad_norm": 2.1914111458626793, + "learning_rate": 3.6976486525827546e-05, + "loss": 0.8345, + "step": 1453 + }, + { + "epoch": 2.1433572876358946, + "grad_norm": 1.9090090936411204, + "learning_rate": 3.695910856214141e-05, + "loss": 0.9289, + "step": 1454 + }, + { + "epoch": 2.144831398562742, + "grad_norm": 2.1081727169007447, + "learning_rate": 3.694172310201295e-05, + "loss": 1.0178, + "step": 1455 + }, + { + "epoch": 2.146305509489589, + "grad_norm": 1.8817820914957895, + "learning_rate": 3.692433015634005e-05, + "loss": 0.8786, + "step": 1456 + }, + { + "epoch": 2.1477796204164363, + "grad_norm": 2.0570426178533685, + "learning_rate": 3.690692973602532e-05, + "loss": 0.9622, + "step": 1457 + }, + { + "epoch": 2.1492537313432836, + "grad_norm": 2.24391644638823, + "learning_rate": 3.6889521851976005e-05, + "loss": 1.0092, + "step": 1458 + }, + { + "epoch": 2.150727842270131, + "grad_norm": 2.246333950301708, + "learning_rate": 3.6872106515104065e-05, + "loss": 0.846, + "step": 1459 + }, + { + "epoch": 2.152201953196978, + "grad_norm": 2.0396881897429697, + "learning_rate": 3.6854683736326125e-05, + "loss": 0.9784, + "step": 1460 + }, + { + "epoch": 2.1536760641238253, + "grad_norm": 2.2976278511479196, + "learning_rate": 3.683725352656348e-05, + "loss": 0.9658, + "step": 1461 + }, + { + "epoch": 2.1551501750506725, + "grad_norm": 1.977616522884941, + "learning_rate": 3.681981589674206e-05, + "loss": 0.8477, + "step": 1462 + }, + { + "epoch": 2.1566242859775198, + "grad_norm": 1.9743275427118583, + "learning_rate": 3.6802370857792464e-05, + "loss": 0.8927, + "step": 1463 + }, + { + "epoch": 2.158098396904367, + "grad_norm": 2.0488298705232673, + "learning_rate": 3.678491842064995e-05, + "loss": 0.9523, + "step": 1464 + }, + { + "epoch": 2.1595725078312142, + "grad_norm": 2.046360912264953, + "learning_rate": 3.6767458596254364e-05, + "loss": 0.8787, + "step": 1465 + }, + { + "epoch": 2.1610466187580615, + "grad_norm": 2.1274202812849783, + "learning_rate": 3.674999139555024e-05, + "loss": 0.8368, + "step": 1466 + }, + { + "epoch": 2.1625207296849087, + "grad_norm": 2.2073350005301484, + "learning_rate": 3.67325168294867e-05, + "loss": 0.8863, + "step": 1467 + }, + { + "epoch": 2.163994840611756, + "grad_norm": 1.8879472669979658, + "learning_rate": 3.67150349090175e-05, + "loss": 0.8869, + "step": 1468 + }, + { + "epoch": 2.165468951538603, + "grad_norm": 2.039205261143766, + "learning_rate": 3.669754564510099e-05, + "loss": 0.9612, + "step": 1469 + }, + { + "epoch": 2.1669430624654504, + "grad_norm": 2.2291650385865536, + "learning_rate": 3.668004904870014e-05, + "loss": 0.8797, + "step": 1470 + }, + { + "epoch": 2.1684171733922977, + "grad_norm": 2.11697510502034, + "learning_rate": 3.666254513078251e-05, + "loss": 0.8437, + "step": 1471 + }, + { + "epoch": 2.169891284319145, + "grad_norm": 2.0246922200214605, + "learning_rate": 3.664503390232024e-05, + "loss": 0.8228, + "step": 1472 + }, + { + "epoch": 2.171365395245992, + "grad_norm": 2.1023154345774095, + "learning_rate": 3.6627515374290065e-05, + "loss": 0.992, + "step": 1473 + }, + { + "epoch": 2.1728395061728394, + "grad_norm": 1.8848701092314866, + "learning_rate": 3.66099895576733e-05, + "loss": 0.7264, + "step": 1474 + }, + { + "epoch": 2.1743136170996866, + "grad_norm": 1.9998146209634018, + "learning_rate": 3.6592456463455804e-05, + "loss": 0.9129, + "step": 1475 + }, + { + "epoch": 2.175787728026534, + "grad_norm": 2.2937966822725357, + "learning_rate": 3.657491610262802e-05, + "loss": 0.9663, + "step": 1476 + }, + { + "epoch": 2.177261838953381, + "grad_norm": 2.331690573289138, + "learning_rate": 3.655736848618495e-05, + "loss": 0.9636, + "step": 1477 + }, + { + "epoch": 2.1787359498802283, + "grad_norm": 2.111940646182858, + "learning_rate": 3.653981362512612e-05, + "loss": 0.8608, + "step": 1478 + }, + { + "epoch": 2.1802100608070756, + "grad_norm": 2.032584236341984, + "learning_rate": 3.652225153045562e-05, + "loss": 0.8771, + "step": 1479 + }, + { + "epoch": 2.181684171733923, + "grad_norm": 1.8614791376551973, + "learning_rate": 3.650468221318206e-05, + "loss": 0.7011, + "step": 1480 + }, + { + "epoch": 2.18315828266077, + "grad_norm": 2.0351159499968645, + "learning_rate": 3.648710568431859e-05, + "loss": 0.8497, + "step": 1481 + }, + { + "epoch": 2.1846323935876173, + "grad_norm": 1.992118924026525, + "learning_rate": 3.6469521954882865e-05, + "loss": 0.8525, + "step": 1482 + }, + { + "epoch": 2.1861065045144645, + "grad_norm": 2.098903406022416, + "learning_rate": 3.645193103589707e-05, + "loss": 1.0211, + "step": 1483 + }, + { + "epoch": 2.1875806154413118, + "grad_norm": 2.184934146683098, + "learning_rate": 3.6434332938387875e-05, + "loss": 0.8939, + "step": 1484 + }, + { + "epoch": 2.189054726368159, + "grad_norm": 2.1576725786657422, + "learning_rate": 3.6416727673386484e-05, + "loss": 0.8568, + "step": 1485 + }, + { + "epoch": 2.1905288372950062, + "grad_norm": 2.1528091619794893, + "learning_rate": 3.639911525192857e-05, + "loss": 1.0871, + "step": 1486 + }, + { + "epoch": 2.1920029482218535, + "grad_norm": 2.1805461983233605, + "learning_rate": 3.638149568505428e-05, + "loss": 0.9945, + "step": 1487 + }, + { + "epoch": 2.1934770591487007, + "grad_norm": 2.1263172330548277, + "learning_rate": 3.636386898380827e-05, + "loss": 1.0502, + "step": 1488 + }, + { + "epoch": 2.194951170075548, + "grad_norm": 2.2224662712979213, + "learning_rate": 3.634623515923965e-05, + "loss": 0.9194, + "step": 1489 + }, + { + "epoch": 2.196425281002395, + "grad_norm": 2.141030154903602, + "learning_rate": 3.632859422240199e-05, + "loss": 0.9408, + "step": 1490 + }, + { + "epoch": 2.1978993919292424, + "grad_norm": 1.97506916794195, + "learning_rate": 3.631094618435334e-05, + "loss": 0.9132, + "step": 1491 + }, + { + "epoch": 2.19937350285609, + "grad_norm": 2.024728723887344, + "learning_rate": 3.629329105615617e-05, + "loss": 0.7939, + "step": 1492 + }, + { + "epoch": 2.2008476137829374, + "grad_norm": 2.006878706204851, + "learning_rate": 3.6275628848877445e-05, + "loss": 0.8811, + "step": 1493 + }, + { + "epoch": 2.2023217247097846, + "grad_norm": 2.128366832464544, + "learning_rate": 3.6257959573588505e-05, + "loss": 0.9491, + "step": 1494 + }, + { + "epoch": 2.203795835636632, + "grad_norm": 2.001973727980873, + "learning_rate": 3.624028324136517e-05, + "loss": 0.8953, + "step": 1495 + }, + { + "epoch": 2.205269946563479, + "grad_norm": 1.9872733148821282, + "learning_rate": 3.622259986328765e-05, + "loss": 0.7757, + "step": 1496 + }, + { + "epoch": 2.2067440574903263, + "grad_norm": 2.011522361280237, + "learning_rate": 3.620490945044059e-05, + "loss": 0.8904, + "step": 1497 + }, + { + "epoch": 2.2082181684171736, + "grad_norm": 2.0440218310151446, + "learning_rate": 3.618721201391304e-05, + "loss": 1.0982, + "step": 1498 + }, + { + "epoch": 2.209692279344021, + "grad_norm": 2.3159360360123054, + "learning_rate": 3.616950756479846e-05, + "loss": 0.9941, + "step": 1499 + }, + { + "epoch": 2.211166390270868, + "grad_norm": 1.991417290891538, + "learning_rate": 3.615179611419469e-05, + "loss": 0.8157, + "step": 1500 + }, + { + "epoch": 2.211166390270868, + "eval_bleu": 0.04873016621100759, + "eval_bleu_1gram": 0.34546108364969297, + "eval_bleu_2gram": 0.12150810336143084, + "eval_bleu_3gram": 0.04877699663074917, + "eval_bleu_4gram": 0.02300488317527046, + "eval_rag_val_loss": 1.6415073826748838, + "eval_rouge1": 0.33328230586389573, + "eval_rouge2": 0.11506413960285228, + "eval_rougeL": 0.313148189190874, + "step": 1500 + }, + { + "epoch": 2.2126405011977153, + "grad_norm": 2.105556280974188, + "learning_rate": 3.613407767320398e-05, + "loss": 0.7993, + "step": 1501 + }, + { + "epoch": 2.2141146121245625, + "grad_norm": 2.1702186251084985, + "learning_rate": 3.6116352252932936e-05, + "loss": 0.9826, + "step": 1502 + }, + { + "epoch": 2.2155887230514097, + "grad_norm": 2.110317104610807, + "learning_rate": 3.609861986449256e-05, + "loss": 0.7707, + "step": 1503 + }, + { + "epoch": 2.217062833978257, + "grad_norm": 2.236017543764845, + "learning_rate": 3.6080880518998216e-05, + "loss": 0.8911, + "step": 1504 + }, + { + "epoch": 2.2185369449051042, + "grad_norm": 2.155143675119247, + "learning_rate": 3.606313422756962e-05, + "loss": 1.0983, + "step": 1505 + }, + { + "epoch": 2.2200110558319515, + "grad_norm": 2.051700640267227, + "learning_rate": 3.604538100133086e-05, + "loss": 1.0635, + "step": 1506 + }, + { + "epoch": 2.2214851667587987, + "grad_norm": 1.9430346040260391, + "learning_rate": 3.602762085141035e-05, + "loss": 0.8652, + "step": 1507 + }, + { + "epoch": 2.222959277685646, + "grad_norm": 2.1888095750440337, + "learning_rate": 3.600985378894086e-05, + "loss": 0.9552, + "step": 1508 + }, + { + "epoch": 2.224433388612493, + "grad_norm": 2.1659819425779454, + "learning_rate": 3.599207982505949e-05, + "loss": 0.8315, + "step": 1509 + }, + { + "epoch": 2.2259074995393404, + "grad_norm": 2.3219815297800914, + "learning_rate": 3.597429897090765e-05, + "loss": 1.0268, + "step": 1510 + }, + { + "epoch": 2.2273816104661877, + "grad_norm": 2.4124955211854777, + "learning_rate": 3.5956511237631106e-05, + "loss": 0.8832, + "step": 1511 + }, + { + "epoch": 2.228855721393035, + "grad_norm": 2.1960766097152726, + "learning_rate": 3.59387166363799e-05, + "loss": 0.9568, + "step": 1512 + }, + { + "epoch": 2.230329832319882, + "grad_norm": 1.9597512718923582, + "learning_rate": 3.592091517830838e-05, + "loss": 0.8691, + "step": 1513 + }, + { + "epoch": 2.2318039432467294, + "grad_norm": 2.086257281419265, + "learning_rate": 3.5903106874575235e-05, + "loss": 0.9168, + "step": 1514 + }, + { + "epoch": 2.2332780541735766, + "grad_norm": 2.133130842771444, + "learning_rate": 3.5885291736343375e-05, + "loss": 0.9482, + "step": 1515 + }, + { + "epoch": 2.234752165100424, + "grad_norm": 2.0770376702291715, + "learning_rate": 3.586746977478006e-05, + "loss": 0.9055, + "step": 1516 + }, + { + "epoch": 2.236226276027271, + "grad_norm": 2.050196856839468, + "learning_rate": 3.58496410010568e-05, + "loss": 0.9004, + "step": 1517 + }, + { + "epoch": 2.2377003869541183, + "grad_norm": 2.2821588665345036, + "learning_rate": 3.583180542634937e-05, + "loss": 0.9628, + "step": 1518 + }, + { + "epoch": 2.2391744978809656, + "grad_norm": 2.1189697196708783, + "learning_rate": 3.5813963061837815e-05, + "loss": 1.1239, + "step": 1519 + }, + { + "epoch": 2.240648608807813, + "grad_norm": 2.0305813202250422, + "learning_rate": 3.5796113918706426e-05, + "loss": 1.1239, + "step": 1520 + }, + { + "epoch": 2.24212271973466, + "grad_norm": 2.0927562348000968, + "learning_rate": 3.577825800814376e-05, + "loss": 0.9268, + "step": 1521 + }, + { + "epoch": 2.2435968306615073, + "grad_norm": 2.0088263300960847, + "learning_rate": 3.576039534134262e-05, + "loss": 0.9577, + "step": 1522 + }, + { + "epoch": 2.2450709415883545, + "grad_norm": 2.0326487493570893, + "learning_rate": 3.57425259295e-05, + "loss": 0.8378, + "step": 1523 + }, + { + "epoch": 2.2465450525152018, + "grad_norm": 2.04329397264233, + "learning_rate": 3.5724649783817185e-05, + "loss": 0.9176, + "step": 1524 + }, + { + "epoch": 2.248019163442049, + "grad_norm": 2.0977054639900152, + "learning_rate": 3.5706766915499646e-05, + "loss": 0.9114, + "step": 1525 + }, + { + "epoch": 2.2494932743688962, + "grad_norm": 2.089667253973781, + "learning_rate": 3.568887733575706e-05, + "loss": 0.9543, + "step": 1526 + }, + { + "epoch": 2.2509673852957435, + "grad_norm": 2.130977525940725, + "learning_rate": 3.567098105580333e-05, + "loss": 0.8649, + "step": 1527 + }, + { + "epoch": 2.2524414962225907, + "grad_norm": 2.1621832554986837, + "learning_rate": 3.5653078086856546e-05, + "loss": 0.8767, + "step": 1528 + }, + { + "epoch": 2.253915607149438, + "grad_norm": 2.0102921547817134, + "learning_rate": 3.563516844013901e-05, + "loss": 0.9834, + "step": 1529 + }, + { + "epoch": 2.255389718076285, + "grad_norm": 1.9456040497119693, + "learning_rate": 3.561725212687718e-05, + "loss": 0.9435, + "step": 1530 + }, + { + "epoch": 2.2568638290031324, + "grad_norm": 2.13741044420168, + "learning_rate": 3.559932915830172e-05, + "loss": 0.9592, + "step": 1531 + }, + { + "epoch": 2.2583379399299797, + "grad_norm": 2.0989162418761205, + "learning_rate": 3.558139954564746e-05, + "loss": 0.8713, + "step": 1532 + }, + { + "epoch": 2.259812050856827, + "grad_norm": 1.9467204563774225, + "learning_rate": 3.556346330015338e-05, + "loss": 0.7914, + "step": 1533 + }, + { + "epoch": 2.261286161783674, + "grad_norm": 2.111455385958536, + "learning_rate": 3.554552043306264e-05, + "loss": 0.9559, + "step": 1534 + }, + { + "epoch": 2.2627602727105214, + "grad_norm": 2.314534400768346, + "learning_rate": 3.552757095562253e-05, + "loss": 1.0114, + "step": 1535 + }, + { + "epoch": 2.2642343836373686, + "grad_norm": 2.132745091976501, + "learning_rate": 3.55096148790845e-05, + "loss": 0.9328, + "step": 1536 + }, + { + "epoch": 2.265708494564216, + "grad_norm": 2.2102806814096696, + "learning_rate": 3.5491652214704115e-05, + "loss": 0.9129, + "step": 1537 + }, + { + "epoch": 2.267182605491063, + "grad_norm": 2.1735395461039544, + "learning_rate": 3.547368297374109e-05, + "loss": 0.9542, + "step": 1538 + }, + { + "epoch": 2.2686567164179103, + "grad_norm": 2.142869649578109, + "learning_rate": 3.545570716745927e-05, + "loss": 0.8873, + "step": 1539 + }, + { + "epoch": 2.2701308273447576, + "grad_norm": 2.31773402739152, + "learning_rate": 3.543772480712658e-05, + "loss": 1.0277, + "step": 1540 + }, + { + "epoch": 2.271604938271605, + "grad_norm": 2.048202200846928, + "learning_rate": 3.5419735904015095e-05, + "loss": 0.9348, + "step": 1541 + }, + { + "epoch": 2.273079049198452, + "grad_norm": 2.072265625, + "learning_rate": 3.540174046940096e-05, + "loss": 0.9552, + "step": 1542 + }, + { + "epoch": 2.2745531601252993, + "grad_norm": 2.1244810256020905, + "learning_rate": 3.538373851456442e-05, + "loss": 1.0018, + "step": 1543 + }, + { + "epoch": 2.2760272710521465, + "grad_norm": 2.412079820953815, + "learning_rate": 3.536573005078981e-05, + "loss": 0.972, + "step": 1544 + }, + { + "epoch": 2.2775013819789938, + "grad_norm": 2.118810953259364, + "learning_rate": 3.5347715089365576e-05, + "loss": 0.9555, + "step": 1545 + }, + { + "epoch": 2.278975492905841, + "grad_norm": 2.2370507284729766, + "learning_rate": 3.532969364158417e-05, + "loss": 0.915, + "step": 1546 + }, + { + "epoch": 2.2804496038326882, + "grad_norm": 2.1738247250155913, + "learning_rate": 3.5311665718742184e-05, + "loss": 1.0054, + "step": 1547 + }, + { + "epoch": 2.2819237147595355, + "grad_norm": 2.0754793004212426, + "learning_rate": 3.529363133214021e-05, + "loss": 0.8591, + "step": 1548 + }, + { + "epoch": 2.2833978256863827, + "grad_norm": 2.163074474433509, + "learning_rate": 3.527559049308291e-05, + "loss": 0.9694, + "step": 1549 + }, + { + "epoch": 2.28487193661323, + "grad_norm": 2.188200048419773, + "learning_rate": 3.525754321287902e-05, + "loss": 1.1303, + "step": 1550 + }, + { + "epoch": 2.286346047540077, + "grad_norm": 2.187646915406885, + "learning_rate": 3.523948950284127e-05, + "loss": 0.9452, + "step": 1551 + }, + { + "epoch": 2.287820158466925, + "grad_norm": 2.2667391438210025, + "learning_rate": 3.522142937428645e-05, + "loss": 0.8507, + "step": 1552 + }, + { + "epoch": 2.289294269393772, + "grad_norm": 2.059186527964843, + "learning_rate": 3.5203362838535355e-05, + "loss": 0.9941, + "step": 1553 + }, + { + "epoch": 2.2907683803206194, + "grad_norm": 2.1223300541394625, + "learning_rate": 3.518528990691281e-05, + "loss": 0.9625, + "step": 1554 + }, + { + "epoch": 2.2922424912474666, + "grad_norm": 2.0428141139716907, + "learning_rate": 3.516721059074764e-05, + "loss": 0.8442, + "step": 1555 + }, + { + "epoch": 2.293716602174314, + "grad_norm": 2.2479874829252657, + "learning_rate": 3.5149124901372677e-05, + "loss": 1.0568, + "step": 1556 + }, + { + "epoch": 2.295190713101161, + "grad_norm": 2.2184364204710407, + "learning_rate": 3.513103285012475e-05, + "loss": 1.0052, + "step": 1557 + }, + { + "epoch": 2.2966648240280083, + "grad_norm": 1.987701391417144, + "learning_rate": 3.511293444834466e-05, + "loss": 0.9989, + "step": 1558 + }, + { + "epoch": 2.2981389349548556, + "grad_norm": 2.0599799510757935, + "learning_rate": 3.509482970737722e-05, + "loss": 1.0301, + "step": 1559 + }, + { + "epoch": 2.299613045881703, + "grad_norm": 2.140687704908184, + "learning_rate": 3.5076718638571185e-05, + "loss": 0.8967, + "step": 1560 + }, + { + "epoch": 2.30108715680855, + "grad_norm": 2.3877592175752818, + "learning_rate": 3.505860125327928e-05, + "loss": 0.9361, + "step": 1561 + }, + { + "epoch": 2.3025612677353973, + "grad_norm": 2.125644137423384, + "learning_rate": 3.504047756285822e-05, + "loss": 0.8999, + "step": 1562 + }, + { + "epoch": 2.3040353786622445, + "grad_norm": 1.9433866105974673, + "learning_rate": 3.5022347578668644e-05, + "loss": 0.8091, + "step": 1563 + }, + { + "epoch": 2.3055094895890917, + "grad_norm": 2.0982249250982865, + "learning_rate": 3.5004211312075143e-05, + "loss": 0.9879, + "step": 1564 + }, + { + "epoch": 2.306983600515939, + "grad_norm": 2.43759438747665, + "learning_rate": 3.498606877444625e-05, + "loss": 0.9294, + "step": 1565 + }, + { + "epoch": 2.308457711442786, + "grad_norm": 2.1661650981873426, + "learning_rate": 3.4967919977154406e-05, + "loss": 0.9955, + "step": 1566 + }, + { + "epoch": 2.3099318223696335, + "grad_norm": 2.0028949290011826, + "learning_rate": 3.4949764931576014e-05, + "loss": 0.8, + "step": 1567 + }, + { + "epoch": 2.3114059332964807, + "grad_norm": 2.009714850700947, + "learning_rate": 3.4931603649091374e-05, + "loss": 0.8421, + "step": 1568 + }, + { + "epoch": 2.312880044223328, + "grad_norm": 2.417448465454824, + "learning_rate": 3.4913436141084676e-05, + "loss": 1.1608, + "step": 1569 + }, + { + "epoch": 2.314354155150175, + "grad_norm": 2.1768051286075494, + "learning_rate": 3.489526241894406e-05, + "loss": 0.8538, + "step": 1570 + }, + { + "epoch": 2.3158282660770224, + "grad_norm": 2.07055547476035, + "learning_rate": 3.487708249406153e-05, + "loss": 0.8612, + "step": 1571 + }, + { + "epoch": 2.3173023770038697, + "grad_norm": 2.353764311128565, + "learning_rate": 3.4858896377832966e-05, + "loss": 1.0192, + "step": 1572 + }, + { + "epoch": 2.318776487930717, + "grad_norm": 2.2784707894075784, + "learning_rate": 3.4840704081658155e-05, + "loss": 0.9219, + "step": 1573 + }, + { + "epoch": 2.320250598857564, + "grad_norm": 2.2050579059981468, + "learning_rate": 3.482250561694075e-05, + "loss": 0.9986, + "step": 1574 + }, + { + "epoch": 2.3217247097844114, + "grad_norm": 2.136927620618377, + "learning_rate": 3.4804300995088264e-05, + "loss": 0.9289, + "step": 1575 + }, + { + "epoch": 2.3231988207112586, + "grad_norm": 2.180358674002088, + "learning_rate": 3.478609022751207e-05, + "loss": 0.8347, + "step": 1576 + }, + { + "epoch": 2.324672931638106, + "grad_norm": 2.2248727912113884, + "learning_rate": 3.4767873325627406e-05, + "loss": 0.9333, + "step": 1577 + }, + { + "epoch": 2.326147042564953, + "grad_norm": 2.162521309279041, + "learning_rate": 3.4749650300853343e-05, + "loss": 0.949, + "step": 1578 + }, + { + "epoch": 2.3276211534918003, + "grad_norm": 2.082663148531871, + "learning_rate": 3.473142116461279e-05, + "loss": 0.9482, + "step": 1579 + }, + { + "epoch": 2.3290952644186476, + "grad_norm": 2.085001989313361, + "learning_rate": 3.47131859283325e-05, + "loss": 0.8233, + "step": 1580 + }, + { + "epoch": 2.330569375345495, + "grad_norm": 2.176165178761462, + "learning_rate": 3.469494460344304e-05, + "loss": 0.871, + "step": 1581 + }, + { + "epoch": 2.332043486272342, + "grad_norm": 2.0867394895104945, + "learning_rate": 3.467669720137879e-05, + "loss": 0.9044, + "step": 1582 + }, + { + "epoch": 2.3335175971991893, + "grad_norm": 2.152126848261575, + "learning_rate": 3.465844373357794e-05, + "loss": 0.9604, + "step": 1583 + }, + { + "epoch": 2.3349917081260365, + "grad_norm": 2.1952117136562106, + "learning_rate": 3.464018421148249e-05, + "loss": 0.9005, + "step": 1584 + }, + { + "epoch": 2.3364658190528838, + "grad_norm": 2.1958414438328324, + "learning_rate": 3.462191864653821e-05, + "loss": 0.9823, + "step": 1585 + }, + { + "epoch": 2.337939929979731, + "grad_norm": 2.0407326826159697, + "learning_rate": 3.460364705019472e-05, + "loss": 0.8598, + "step": 1586 + }, + { + "epoch": 2.3394140409065782, + "grad_norm": 2.0465947316023176, + "learning_rate": 3.458536943390536e-05, + "loss": 0.9172, + "step": 1587 + }, + { + "epoch": 2.3408881518334255, + "grad_norm": 2.39592658704524, + "learning_rate": 3.456708580912725e-05, + "loss": 0.9101, + "step": 1588 + }, + { + "epoch": 2.3423622627602727, + "grad_norm": 2.0897879619817177, + "learning_rate": 3.4548796187321295e-05, + "loss": 0.8849, + "step": 1589 + }, + { + "epoch": 2.34383637368712, + "grad_norm": 2.07649890121993, + "learning_rate": 3.453050057995217e-05, + "loss": 0.9358, + "step": 1590 + }, + { + "epoch": 2.345310484613967, + "grad_norm": 2.0713856344987676, + "learning_rate": 3.451219899848827e-05, + "loss": 0.9659, + "step": 1591 + }, + { + "epoch": 2.3467845955408144, + "grad_norm": 2.2172347791602873, + "learning_rate": 3.449389145440175e-05, + "loss": 0.9438, + "step": 1592 + }, + { + "epoch": 2.3482587064676617, + "grad_norm": 2.1269267827584213, + "learning_rate": 3.4475577959168505e-05, + "loss": 0.9422, + "step": 1593 + }, + { + "epoch": 2.349732817394509, + "grad_norm": 2.182007460800828, + "learning_rate": 3.445725852426817e-05, + "loss": 0.9615, + "step": 1594 + }, + { + "epoch": 2.351206928321356, + "grad_norm": 2.151156726435392, + "learning_rate": 3.443893316118407e-05, + "loss": 1.0295, + "step": 1595 + }, + { + "epoch": 2.3526810392482034, + "grad_norm": 2.3361724429169253, + "learning_rate": 3.4420601881403284e-05, + "loss": 1.0373, + "step": 1596 + }, + { + "epoch": 2.3541551501750506, + "grad_norm": 2.106883067633596, + "learning_rate": 3.440226469641658e-05, + "loss": 0.8609, + "step": 1597 + }, + { + "epoch": 2.355629261101898, + "grad_norm": 1.9681753500648527, + "learning_rate": 3.4383921617718427e-05, + "loss": 0.7817, + "step": 1598 + }, + { + "epoch": 2.357103372028745, + "grad_norm": 2.025628041962785, + "learning_rate": 3.4365572656807e-05, + "loss": 0.9318, + "step": 1599 + }, + { + "epoch": 2.3585774829555923, + "grad_norm": 1.9314713977707498, + "learning_rate": 3.4347217825184134e-05, + "loss": 0.7432, + "step": 1600 + }, + { + "epoch": 2.3600515938824396, + "grad_norm": 2.006976120408059, + "learning_rate": 3.432885713435539e-05, + "loss": 0.7835, + "step": 1601 + }, + { + "epoch": 2.361525704809287, + "grad_norm": 1.988069354384554, + "learning_rate": 3.431049059582996e-05, + "loss": 1.1013, + "step": 1602 + }, + { + "epoch": 2.362999815736134, + "grad_norm": 1.8315693287184354, + "learning_rate": 3.4292118221120715e-05, + "loss": 0.8786, + "step": 1603 + }, + { + "epoch": 2.3644739266629813, + "grad_norm": 1.9643463051108345, + "learning_rate": 3.42737400217442e-05, + "loss": 0.966, + "step": 1604 + }, + { + "epoch": 2.3659480375898285, + "grad_norm": 2.0706976766178204, + "learning_rate": 3.425535600922059e-05, + "loss": 0.9188, + "step": 1605 + }, + { + "epoch": 2.3674221485166758, + "grad_norm": 2.2718845582364806, + "learning_rate": 3.423696619507369e-05, + "loss": 1.0676, + "step": 1606 + }, + { + "epoch": 2.368896259443523, + "grad_norm": 2.1186866098109087, + "learning_rate": 3.4218570590831e-05, + "loss": 0.8887, + "step": 1607 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 2.15284748032915, + "learning_rate": 3.4200169208023594e-05, + "loss": 0.9728, + "step": 1608 + }, + { + "epoch": 2.3718444812972175, + "grad_norm": 2.0876352780304197, + "learning_rate": 3.418176205818618e-05, + "loss": 0.9943, + "step": 1609 + }, + { + "epoch": 2.3733185922240647, + "grad_norm": 2.09037280649979, + "learning_rate": 3.4163349152857096e-05, + "loss": 0.9014, + "step": 1610 + }, + { + "epoch": 2.374792703150912, + "grad_norm": 2.0884968918513978, + "learning_rate": 3.4144930503578286e-05, + "loss": 0.9424, + "step": 1611 + }, + { + "epoch": 2.376266814077759, + "grad_norm": 2.1697687048634884, + "learning_rate": 3.412650612189528e-05, + "loss": 0.8936, + "step": 1612 + }, + { + "epoch": 2.3777409250046064, + "grad_norm": 2.573344381717162, + "learning_rate": 3.4108076019357204e-05, + "loss": 0.9592, + "step": 1613 + }, + { + "epoch": 2.3792150359314537, + "grad_norm": 2.2670968372139333, + "learning_rate": 3.4089640207516786e-05, + "loss": 0.9989, + "step": 1614 + }, + { + "epoch": 2.380689146858301, + "grad_norm": 2.131349725711338, + "learning_rate": 3.4071198697930315e-05, + "loss": 0.9266, + "step": 1615 + }, + { + "epoch": 2.382163257785148, + "grad_norm": 2.427159898939151, + "learning_rate": 3.405275150215766e-05, + "loss": 0.9548, + "step": 1616 + }, + { + "epoch": 2.3836373687119954, + "grad_norm": 2.084556207972814, + "learning_rate": 3.403429863176226e-05, + "loss": 0.9502, + "step": 1617 + }, + { + "epoch": 2.3851114796388426, + "grad_norm": 2.096098991327774, + "learning_rate": 3.40158400983111e-05, + "loss": 0.7499, + "step": 1618 + }, + { + "epoch": 2.38658559056569, + "grad_norm": 2.0713654917240403, + "learning_rate": 3.399737591337471e-05, + "loss": 1.0064, + "step": 1619 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 2.069673835562989, + "learning_rate": 3.397890608852718e-05, + "loss": 0.9333, + "step": 1620 + }, + { + "epoch": 2.3895338124193843, + "grad_norm": 1.9691526818582668, + "learning_rate": 3.396043063534613e-05, + "loss": 0.8549, + "step": 1621 + }, + { + "epoch": 2.3910079233462316, + "grad_norm": 2.3543610506629316, + "learning_rate": 3.39419495654127e-05, + "loss": 0.9269, + "step": 1622 + }, + { + "epoch": 2.392482034273079, + "grad_norm": 2.1781065755363147, + "learning_rate": 3.3923462890311544e-05, + "loss": 0.9849, + "step": 1623 + }, + { + "epoch": 2.393956145199926, + "grad_norm": 2.092803855172961, + "learning_rate": 3.3904970621630866e-05, + "loss": 0.8728, + "step": 1624 + }, + { + "epoch": 2.3954302561267733, + "grad_norm": 2.165155678629464, + "learning_rate": 3.388647277096234e-05, + "loss": 0.9673, + "step": 1625 + }, + { + "epoch": 2.396904367053621, + "grad_norm": 2.1029362496051953, + "learning_rate": 3.386796934990115e-05, + "loss": 1.0627, + "step": 1626 + }, + { + "epoch": 2.398378477980468, + "grad_norm": 1.859216763471053, + "learning_rate": 3.3849460370045966e-05, + "loss": 0.7436, + "step": 1627 + }, + { + "epoch": 2.3998525889073155, + "grad_norm": 2.0326469899383826, + "learning_rate": 3.3830945842998954e-05, + "loss": 0.9289, + "step": 1628 + }, + { + "epoch": 2.4013266998341627, + "grad_norm": 2.190326064061408, + "learning_rate": 3.381242578036576e-05, + "loss": 0.9337, + "step": 1629 + }, + { + "epoch": 2.40280081076101, + "grad_norm": 2.3612744094128177, + "learning_rate": 3.379390019375548e-05, + "loss": 0.9605, + "step": 1630 + }, + { + "epoch": 2.404274921687857, + "grad_norm": 2.2711833252512696, + "learning_rate": 3.377536909478069e-05, + "loss": 0.9584, + "step": 1631 + }, + { + "epoch": 2.4057490326147044, + "grad_norm": 1.9657601557176625, + "learning_rate": 3.3756832495057414e-05, + "loss": 0.8141, + "step": 1632 + }, + { + "epoch": 2.4072231435415516, + "grad_norm": 2.380512565908135, + "learning_rate": 3.373829040620513e-05, + "loss": 0.8898, + "step": 1633 + }, + { + "epoch": 2.408697254468399, + "grad_norm": 2.1856932672011813, + "learning_rate": 3.3719742839846743e-05, + "loss": 0.8777, + "step": 1634 + }, + { + "epoch": 2.410171365395246, + "grad_norm": 1.946063896204426, + "learning_rate": 3.370118980760861e-05, + "loss": 0.8107, + "step": 1635 + }, + { + "epoch": 2.4116454763220934, + "grad_norm": 2.0184666910170606, + "learning_rate": 3.3682631321120504e-05, + "loss": 0.8594, + "step": 1636 + }, + { + "epoch": 2.4131195872489406, + "grad_norm": 2.3139880779490065, + "learning_rate": 3.366406739201562e-05, + "loss": 1.0516, + "step": 1637 + }, + { + "epoch": 2.414593698175788, + "grad_norm": 2.170370335422805, + "learning_rate": 3.364549803193057e-05, + "loss": 0.9245, + "step": 1638 + }, + { + "epoch": 2.416067809102635, + "grad_norm": 2.193439376880252, + "learning_rate": 3.362692325250534e-05, + "loss": 0.909, + "step": 1639 + }, + { + "epoch": 2.4175419200294823, + "grad_norm": 2.27545747454107, + "learning_rate": 3.360834306538336e-05, + "loss": 1.0398, + "step": 1640 + }, + { + "epoch": 2.4190160309563296, + "grad_norm": 2.1886718880719593, + "learning_rate": 3.3589757482211416e-05, + "loss": 1.0136, + "step": 1641 + }, + { + "epoch": 2.420490141883177, + "grad_norm": 2.114314871848405, + "learning_rate": 3.3571166514639684e-05, + "loss": 0.8983, + "step": 1642 + }, + { + "epoch": 2.421964252810024, + "grad_norm": 2.212360476819845, + "learning_rate": 3.3552570174321724e-05, + "loss": 0.9351, + "step": 1643 + }, + { + "epoch": 2.4234383637368713, + "grad_norm": 2.1295423764707793, + "learning_rate": 3.353396847291446e-05, + "loss": 0.8284, + "step": 1644 + }, + { + "epoch": 2.4249124746637185, + "grad_norm": 2.221608107535075, + "learning_rate": 3.3515361422078165e-05, + "loss": 1.0301, + "step": 1645 + }, + { + "epoch": 2.4263865855905657, + "grad_norm": 2.2171231605275685, + "learning_rate": 3.3496749033476485e-05, + "loss": 1.0551, + "step": 1646 + }, + { + "epoch": 2.427860696517413, + "grad_norm": 1.8549922291060805, + "learning_rate": 3.347813131877638e-05, + "loss": 0.7652, + "step": 1647 + }, + { + "epoch": 2.4293348074442602, + "grad_norm": 2.36213694209873, + "learning_rate": 3.34595082896482e-05, + "loss": 1.0077, + "step": 1648 + }, + { + "epoch": 2.4308089183711075, + "grad_norm": 1.9535966227459283, + "learning_rate": 3.344087995776558e-05, + "loss": 0.8663, + "step": 1649 + }, + { + "epoch": 2.4322830292979547, + "grad_norm": 2.3428508814119944, + "learning_rate": 3.34222463348055e-05, + "loss": 1.135, + "step": 1650 + }, + { + "epoch": 2.433757140224802, + "grad_norm": 2.383183084655947, + "learning_rate": 3.340360743244825e-05, + "loss": 0.7825, + "step": 1651 + }, + { + "epoch": 2.435231251151649, + "grad_norm": 2.3485923081551503, + "learning_rate": 3.338496326237743e-05, + "loss": 0.9195, + "step": 1652 + }, + { + "epoch": 2.4367053620784964, + "grad_norm": 2.155528956945027, + "learning_rate": 3.336631383627995e-05, + "loss": 1.0636, + "step": 1653 + }, + { + "epoch": 2.4381794730053437, + "grad_norm": 2.195288824313503, + "learning_rate": 3.334765916584599e-05, + "loss": 0.8705, + "step": 1654 + }, + { + "epoch": 2.439653583932191, + "grad_norm": 2.3171733338739053, + "learning_rate": 3.332899926276905e-05, + "loss": 0.9485, + "step": 1655 + }, + { + "epoch": 2.441127694859038, + "grad_norm": 2.1115926882042584, + "learning_rate": 3.33103341387459e-05, + "loss": 0.8916, + "step": 1656 + }, + { + "epoch": 2.4426018057858854, + "grad_norm": 1.9914602111473398, + "learning_rate": 3.3291663805476566e-05, + "loss": 1.026, + "step": 1657 + }, + { + "epoch": 2.4440759167127326, + "grad_norm": 2.040109416794873, + "learning_rate": 3.3272988274664364e-05, + "loss": 0.9017, + "step": 1658 + }, + { + "epoch": 2.44555002763958, + "grad_norm": 1.9436320818002248, + "learning_rate": 3.325430755801584e-05, + "loss": 0.876, + "step": 1659 + }, + { + "epoch": 2.447024138566427, + "grad_norm": 2.1072596362915634, + "learning_rate": 3.323562166724082e-05, + "loss": 0.961, + "step": 1660 + }, + { + "epoch": 2.4484982494932743, + "grad_norm": 2.115524708837307, + "learning_rate": 3.321693061405235e-05, + "loss": 1.0675, + "step": 1661 + }, + { + "epoch": 2.4499723604201216, + "grad_norm": 2.1230358695523166, + "learning_rate": 3.319823441016673e-05, + "loss": 0.9437, + "step": 1662 + }, + { + "epoch": 2.451446471346969, + "grad_norm": 2.119031152666345, + "learning_rate": 3.317953306730347e-05, + "loss": 0.9348, + "step": 1663 + }, + { + "epoch": 2.452920582273816, + "grad_norm": 2.071947251292825, + "learning_rate": 3.316082659718532e-05, + "loss": 0.9308, + "step": 1664 + }, + { + "epoch": 2.4543946932006633, + "grad_norm": 2.2747578670216, + "learning_rate": 3.314211501153823e-05, + "loss": 0.8905, + "step": 1665 + }, + { + "epoch": 2.4558688041275105, + "grad_norm": 2.1415525779566638, + "learning_rate": 3.312339832209137e-05, + "loss": 0.8951, + "step": 1666 + }, + { + "epoch": 2.4573429150543578, + "grad_norm": 2.0920363288263304, + "learning_rate": 3.3104676540577094e-05, + "loss": 0.9556, + "step": 1667 + }, + { + "epoch": 2.458817025981205, + "grad_norm": 2.2609223065396336, + "learning_rate": 3.308594967873095e-05, + "loss": 1.0335, + "step": 1668 + }, + { + "epoch": 2.4602911369080522, + "grad_norm": 2.191359249161743, + "learning_rate": 3.3067217748291695e-05, + "loss": 1.0301, + "step": 1669 + }, + { + "epoch": 2.4617652478348995, + "grad_norm": 2.0595600097167033, + "learning_rate": 3.304848076100122e-05, + "loss": 1.074, + "step": 1670 + }, + { + "epoch": 2.4632393587617467, + "grad_norm": 2.1670555597144157, + "learning_rate": 3.302973872860463e-05, + "loss": 0.8694, + "step": 1671 + }, + { + "epoch": 2.464713469688594, + "grad_norm": 2.1512601308899946, + "learning_rate": 3.301099166285017e-05, + "loss": 0.9075, + "step": 1672 + }, + { + "epoch": 2.466187580615441, + "grad_norm": 2.138781791196569, + "learning_rate": 3.299223957548923e-05, + "loss": 0.9028, + "step": 1673 + }, + { + "epoch": 2.4676616915422884, + "grad_norm": 2.040666672659351, + "learning_rate": 3.2973482478276364e-05, + "loss": 1.003, + "step": 1674 + }, + { + "epoch": 2.4691358024691357, + "grad_norm": 2.184293522222493, + "learning_rate": 3.2954720382969263e-05, + "loss": 0.9832, + "step": 1675 + }, + { + "epoch": 2.470609913395983, + "grad_norm": 2.21155272641892, + "learning_rate": 3.293595330132876e-05, + "loss": 0.951, + "step": 1676 + }, + { + "epoch": 2.47208402432283, + "grad_norm": 2.103558014820677, + "learning_rate": 3.291718124511879e-05, + "loss": 0.9952, + "step": 1677 + }, + { + "epoch": 2.4735581352496774, + "grad_norm": 2.3208753115169474, + "learning_rate": 3.289840422610643e-05, + "loss": 1.0309, + "step": 1678 + }, + { + "epoch": 2.4750322461765246, + "grad_norm": 1.9706674956322985, + "learning_rate": 3.287962225606185e-05, + "loss": 0.9542, + "step": 1679 + }, + { + "epoch": 2.476506357103372, + "grad_norm": 2.089495193077924, + "learning_rate": 3.286083534675835e-05, + "loss": 1.0185, + "step": 1680 + }, + { + "epoch": 2.477980468030219, + "grad_norm": 2.145761383726894, + "learning_rate": 3.284204350997229e-05, + "loss": 0.9694, + "step": 1681 + }, + { + "epoch": 2.4794545789570663, + "grad_norm": 2.1918397634440576, + "learning_rate": 3.282324675748314e-05, + "loss": 0.8156, + "step": 1682 + }, + { + "epoch": 2.4809286898839136, + "grad_norm": 2.0903991531612696, + "learning_rate": 3.280444510107346e-05, + "loss": 0.9545, + "step": 1683 + }, + { + "epoch": 2.4824028008107613, + "grad_norm": 2.218925254240221, + "learning_rate": 3.278563855252885e-05, + "loss": 0.9508, + "step": 1684 + }, + { + "epoch": 2.4838769117376085, + "grad_norm": 2.1670144119945287, + "learning_rate": 3.276682712363801e-05, + "loss": 1.2227, + "step": 1685 + }, + { + "epoch": 2.4853510226644557, + "grad_norm": 2.1599083425360295, + "learning_rate": 3.274801082619269e-05, + "loss": 0.9885, + "step": 1686 + }, + { + "epoch": 2.486825133591303, + "grad_norm": 2.1722997894920595, + "learning_rate": 3.2729189671987695e-05, + "loss": 0.8845, + "step": 1687 + }, + { + "epoch": 2.48829924451815, + "grad_norm": 2.1762462509447262, + "learning_rate": 3.271036367282085e-05, + "loss": 0.9866, + "step": 1688 + }, + { + "epoch": 2.4897733554449974, + "grad_norm": 2.1546321825834034, + "learning_rate": 3.269153284049306e-05, + "loss": 0.8365, + "step": 1689 + }, + { + "epoch": 2.4912474663718447, + "grad_norm": 2.1551317272949717, + "learning_rate": 3.267269718680822e-05, + "loss": 0.9565, + "step": 1690 + }, + { + "epoch": 2.492721577298692, + "grad_norm": 2.079049958627438, + "learning_rate": 3.265385672357327e-05, + "loss": 1.0143, + "step": 1691 + }, + { + "epoch": 2.494195688225539, + "grad_norm": 2.0610591595121157, + "learning_rate": 3.2635011462598145e-05, + "loss": 0.9417, + "step": 1692 + }, + { + "epoch": 2.4956697991523864, + "grad_norm": 1.949729409150316, + "learning_rate": 3.261616141569581e-05, + "loss": 0.9508, + "step": 1693 + }, + { + "epoch": 2.4971439100792336, + "grad_norm": 2.2995523431754945, + "learning_rate": 3.2597306594682225e-05, + "loss": 0.9785, + "step": 1694 + }, + { + "epoch": 2.498618021006081, + "grad_norm": 2.102322806059451, + "learning_rate": 3.257844701137633e-05, + "loss": 1.1082, + "step": 1695 + }, + { + "epoch": 2.500092131932928, + "grad_norm": 2.1459871502408916, + "learning_rate": 3.255958267760006e-05, + "loss": 1.0246, + "step": 1696 + }, + { + "epoch": 2.5015662428597754, + "grad_norm": 2.095364417927057, + "learning_rate": 3.254071360517833e-05, + "loss": 1.1134, + "step": 1697 + }, + { + "epoch": 2.5030403537866226, + "grad_norm": 1.9574809823506967, + "learning_rate": 3.252183980593901e-05, + "loss": 0.9581, + "step": 1698 + }, + { + "epoch": 2.50451446471347, + "grad_norm": 2.0274580553939545, + "learning_rate": 3.250296129171295e-05, + "loss": 0.9124, + "step": 1699 + }, + { + "epoch": 2.505988575640317, + "grad_norm": 2.241454320254508, + "learning_rate": 3.2484078074333954e-05, + "loss": 0.8277, + "step": 1700 + }, + { + "epoch": 2.5074626865671643, + "grad_norm": 1.9206174131939369, + "learning_rate": 3.246519016563876e-05, + "loss": 0.9121, + "step": 1701 + }, + { + "epoch": 2.5089367974940116, + "grad_norm": 2.098198790333586, + "learning_rate": 3.244629757746706e-05, + "loss": 0.8616, + "step": 1702 + }, + { + "epoch": 2.510410908420859, + "grad_norm": 1.9350718387802128, + "learning_rate": 3.242740032166149e-05, + "loss": 0.979, + "step": 1703 + }, + { + "epoch": 2.511885019347706, + "grad_norm": 2.2151144625880947, + "learning_rate": 3.240849841006758e-05, + "loss": 0.9639, + "step": 1704 + }, + { + "epoch": 2.5133591302745533, + "grad_norm": 2.1544258030677392, + "learning_rate": 3.23895918545338e-05, + "loss": 0.851, + "step": 1705 + }, + { + "epoch": 2.5148332412014005, + "grad_norm": 2.156089223866205, + "learning_rate": 3.237068066691152e-05, + "loss": 1.1099, + "step": 1706 + }, + { + "epoch": 2.5163073521282477, + "grad_norm": 2.150213554776883, + "learning_rate": 3.2351764859055034e-05, + "loss": 1.1039, + "step": 1707 + }, + { + "epoch": 2.517781463055095, + "grad_norm": 1.9318008896364856, + "learning_rate": 3.233284444282152e-05, + "loss": 0.8274, + "step": 1708 + }, + { + "epoch": 2.519255573981942, + "grad_norm": 2.0140289374870353, + "learning_rate": 3.2313919430071026e-05, + "loss": 0.9672, + "step": 1709 + }, + { + "epoch": 2.5207296849087895, + "grad_norm": 1.9871690558561252, + "learning_rate": 3.2294989832666514e-05, + "loss": 0.9749, + "step": 1710 + }, + { + "epoch": 2.5222037958356367, + "grad_norm": 1.900264786791097, + "learning_rate": 3.22760556624738e-05, + "loss": 0.9303, + "step": 1711 + }, + { + "epoch": 2.523677906762484, + "grad_norm": 2.2311870534015847, + "learning_rate": 3.225711693136156e-05, + "loss": 1.0793, + "step": 1712 + }, + { + "epoch": 2.525152017689331, + "grad_norm": 2.1005053502527504, + "learning_rate": 3.223817365120136e-05, + "loss": 0.8998, + "step": 1713 + }, + { + "epoch": 2.5266261286161784, + "grad_norm": 2.121638837433541, + "learning_rate": 3.221922583386758e-05, + "loss": 0.9425, + "step": 1714 + }, + { + "epoch": 2.5281002395430257, + "grad_norm": 2.2763327289527804, + "learning_rate": 3.220027349123748e-05, + "loss": 1.0774, + "step": 1715 + }, + { + "epoch": 2.529574350469873, + "grad_norm": 2.189693441142029, + "learning_rate": 3.2181316635191125e-05, + "loss": 0.9742, + "step": 1716 + }, + { + "epoch": 2.53104846139672, + "grad_norm": 2.0344893228546495, + "learning_rate": 3.2162355277611416e-05, + "loss": 0.8612, + "step": 1717 + }, + { + "epoch": 2.5325225723235674, + "grad_norm": 1.99004281481974, + "learning_rate": 3.214338943038409e-05, + "loss": 0.9278, + "step": 1718 + }, + { + "epoch": 2.5339966832504146, + "grad_norm": 1.8806389256599139, + "learning_rate": 3.21244191053977e-05, + "loss": 0.8617, + "step": 1719 + }, + { + "epoch": 2.535470794177262, + "grad_norm": 2.126287743324814, + "learning_rate": 3.2105444314543584e-05, + "loss": 0.9358, + "step": 1720 + }, + { + "epoch": 2.536944905104109, + "grad_norm": 2.4017904993658874, + "learning_rate": 3.208646506971589e-05, + "loss": 1.2069, + "step": 1721 + }, + { + "epoch": 2.5384190160309563, + "grad_norm": 2.0375225931798906, + "learning_rate": 3.206748138281157e-05, + "loss": 1.0304, + "step": 1722 + }, + { + "epoch": 2.5398931269578036, + "grad_norm": 2.0652366746399387, + "learning_rate": 3.204849326573034e-05, + "loss": 0.9532, + "step": 1723 + }, + { + "epoch": 2.541367237884651, + "grad_norm": 2.0550017025979965, + "learning_rate": 3.20295007303747e-05, + "loss": 0.856, + "step": 1724 + }, + { + "epoch": 2.542841348811498, + "grad_norm": 2.160482040588414, + "learning_rate": 3.201050378864994e-05, + "loss": 0.9894, + "step": 1725 + }, + { + "epoch": 2.5443154597383453, + "grad_norm": 2.0283493458667747, + "learning_rate": 3.1991502452464074e-05, + "loss": 0.87, + "step": 1726 + }, + { + "epoch": 2.5457895706651925, + "grad_norm": 2.017427451756768, + "learning_rate": 3.1972496733727906e-05, + "loss": 1.1002, + "step": 1727 + }, + { + "epoch": 2.5472636815920398, + "grad_norm": 2.1190578180825916, + "learning_rate": 3.195348664435497e-05, + "loss": 0.7762, + "step": 1728 + }, + { + "epoch": 2.548737792518887, + "grad_norm": 2.109400883268571, + "learning_rate": 3.193447219626153e-05, + "loss": 0.9088, + "step": 1729 + }, + { + "epoch": 2.5502119034457342, + "grad_norm": 1.9671511742932197, + "learning_rate": 3.191545340136661e-05, + "loss": 0.8853, + "step": 1730 + }, + { + "epoch": 2.5516860143725815, + "grad_norm": 2.164148211933585, + "learning_rate": 3.1896430271591937e-05, + "loss": 1.076, + "step": 1731 + }, + { + "epoch": 2.5531601252994287, + "grad_norm": 2.2774694810449305, + "learning_rate": 3.187740281886195e-05, + "loss": 0.9779, + "step": 1732 + }, + { + "epoch": 2.554634236226276, + "grad_norm": 2.188016449045282, + "learning_rate": 3.185837105510383e-05, + "loss": 0.8944, + "step": 1733 + }, + { + "epoch": 2.556108347153123, + "grad_norm": 2.145532037514011, + "learning_rate": 3.183933499224743e-05, + "loss": 1.0104, + "step": 1734 + }, + { + "epoch": 2.5575824580799704, + "grad_norm": 2.2024180042189125, + "learning_rate": 3.18202946422253e-05, + "loss": 0.9971, + "step": 1735 + }, + { + "epoch": 2.5590565690068177, + "grad_norm": 2.019033935231454, + "learning_rate": 3.18012500169727e-05, + "loss": 0.9574, + "step": 1736 + }, + { + "epoch": 2.560530679933665, + "grad_norm": 2.1676034980535923, + "learning_rate": 3.178220112842753e-05, + "loss": 0.9253, + "step": 1737 + }, + { + "epoch": 2.562004790860512, + "grad_norm": 2.108396288970941, + "learning_rate": 3.176314798853042e-05, + "loss": 0.8269, + "step": 1738 + }, + { + "epoch": 2.5634789017873594, + "grad_norm": 2.118128606095879, + "learning_rate": 3.17440906092246e-05, + "loss": 0.9522, + "step": 1739 + }, + { + "epoch": 2.5649530127142066, + "grad_norm": 2.2306281198194924, + "learning_rate": 3.1725029002456e-05, + "loss": 0.8739, + "step": 1740 + }, + { + "epoch": 2.566427123641054, + "grad_norm": 2.1995455836112026, + "learning_rate": 3.17059631801732e-05, + "loss": 0.9796, + "step": 1741 + }, + { + "epoch": 2.567901234567901, + "grad_norm": 2.0279094516847094, + "learning_rate": 3.168689315432741e-05, + "loss": 0.7954, + "step": 1742 + }, + { + "epoch": 2.5693753454947483, + "grad_norm": 2.15323483390432, + "learning_rate": 3.1667818936872465e-05, + "loss": 0.9948, + "step": 1743 + }, + { + "epoch": 2.5708494564215956, + "grad_norm": 2.214436811574459, + "learning_rate": 3.1648740539764844e-05, + "loss": 0.9673, + "step": 1744 + }, + { + "epoch": 2.572323567348443, + "grad_norm": 2.2543091306483807, + "learning_rate": 3.162965797496364e-05, + "loss": 1.0563, + "step": 1745 + }, + { + "epoch": 2.57379767827529, + "grad_norm": 1.859509631499959, + "learning_rate": 3.161057125443056e-05, + "loss": 0.7734, + "step": 1746 + }, + { + "epoch": 2.5752717892021373, + "grad_norm": 2.0305983451768137, + "learning_rate": 3.1591480390129914e-05, + "loss": 0.7847, + "step": 1747 + }, + { + "epoch": 2.5767459001289845, + "grad_norm": 2.242334553751078, + "learning_rate": 3.157238539402862e-05, + "loss": 0.9543, + "step": 1748 + }, + { + "epoch": 2.5782200110558318, + "grad_norm": 2.32862920229735, + "learning_rate": 3.155328627809617e-05, + "loss": 1.0283, + "step": 1749 + }, + { + "epoch": 2.579694121982679, + "grad_norm": 2.031294602124497, + "learning_rate": 3.1534183054304645e-05, + "loss": 1.0185, + "step": 1750 + }, + { + "epoch": 2.579694121982679, + "eval_bleu": 0.055573090242479375, + "eval_bleu_1gram": 0.34188218281027344, + "eval_bleu_2gram": 0.12114871340479708, + "eval_bleu_3gram": 0.0520342290224777, + "eval_bleu_4gram": 0.026826966821431385, + "eval_rag_val_loss": 1.643004084466606, + "eval_rouge1": 0.33405043556066294, + "eval_rouge2": 0.11513784981928654, + "eval_rougeL": 0.3135612357194884, + "step": 1750 + }, + { + "epoch": 2.5811682329095262, + "grad_norm": 2.1984838202937182, + "learning_rate": 3.1515075734628705e-05, + "loss": 0.8624, + "step": 1751 + }, + { + "epoch": 2.5826423438363735, + "grad_norm": 2.2407784636865675, + "learning_rate": 3.149596433104556e-05, + "loss": 1.0364, + "step": 1752 + }, + { + "epoch": 2.5841164547632207, + "grad_norm": 2.2441090713553575, + "learning_rate": 3.147684885553502e-05, + "loss": 0.9345, + "step": 1753 + }, + { + "epoch": 2.585590565690068, + "grad_norm": 2.416179728504946, + "learning_rate": 3.145772932007939e-05, + "loss": 0.9889, + "step": 1754 + }, + { + "epoch": 2.587064676616915, + "grad_norm": 2.243337834577121, + "learning_rate": 3.143860573666357e-05, + "loss": 0.7987, + "step": 1755 + }, + { + "epoch": 2.5885387875437624, + "grad_norm": 2.083553022881279, + "learning_rate": 3.1419478117274984e-05, + "loss": 1.007, + "step": 1756 + }, + { + "epoch": 2.5900128984706097, + "grad_norm": 2.3068097187438785, + "learning_rate": 3.140034647390357e-05, + "loss": 1.1439, + "step": 1757 + }, + { + "epoch": 2.591487009397457, + "grad_norm": 2.0400043519291553, + "learning_rate": 3.13812108185418e-05, + "loss": 0.9338, + "step": 1758 + }, + { + "epoch": 2.592961120324304, + "grad_norm": 2.1313970431169587, + "learning_rate": 3.136207116318466e-05, + "loss": 0.9051, + "step": 1759 + }, + { + "epoch": 2.5944352312511514, + "grad_norm": 2.5250123014244843, + "learning_rate": 3.1342927519829644e-05, + "loss": 1.1853, + "step": 1760 + }, + { + "epoch": 2.5959093421779986, + "grad_norm": 1.9879246599938454, + "learning_rate": 3.1323779900476744e-05, + "loss": 0.9466, + "step": 1761 + }, + { + "epoch": 2.597383453104846, + "grad_norm": 2.1238132977945967, + "learning_rate": 3.1304628317128446e-05, + "loss": 0.9648, + "step": 1762 + }, + { + "epoch": 2.5988575640316935, + "grad_norm": 2.0564555984255772, + "learning_rate": 3.128547278178972e-05, + "loss": 1.0392, + "step": 1763 + }, + { + "epoch": 2.600331674958541, + "grad_norm": 2.1716716863301433, + "learning_rate": 3.126631330646802e-05, + "loss": 0.9093, + "step": 1764 + }, + { + "epoch": 2.601805785885388, + "grad_norm": 2.081155617896485, + "learning_rate": 3.124714990317324e-05, + "loss": 1.0173, + "step": 1765 + }, + { + "epoch": 2.6032798968122353, + "grad_norm": 1.979675435042255, + "learning_rate": 3.122798258391779e-05, + "loss": 1.1098, + "step": 1766 + }, + { + "epoch": 2.6047540077390825, + "grad_norm": 1.9700741629026115, + "learning_rate": 3.120881136071649e-05, + "loss": 0.9434, + "step": 1767 + }, + { + "epoch": 2.6062281186659297, + "grad_norm": 1.9446365836746766, + "learning_rate": 3.118963624558662e-05, + "loss": 0.8653, + "step": 1768 + }, + { + "epoch": 2.607702229592777, + "grad_norm": 2.196249330940644, + "learning_rate": 3.11704572505479e-05, + "loss": 0.9166, + "step": 1769 + }, + { + "epoch": 2.609176340519624, + "grad_norm": 2.1288570853339617, + "learning_rate": 3.115127438762247e-05, + "loss": 1.0654, + "step": 1770 + }, + { + "epoch": 2.6106504514464715, + "grad_norm": 2.022968134698401, + "learning_rate": 3.113208766883494e-05, + "loss": 0.9494, + "step": 1771 + }, + { + "epoch": 2.6121245623733187, + "grad_norm": 1.8923497175365587, + "learning_rate": 3.111289710621228e-05, + "loss": 0.9244, + "step": 1772 + }, + { + "epoch": 2.613598673300166, + "grad_norm": 2.191640368788528, + "learning_rate": 3.109370271178389e-05, + "loss": 1.067, + "step": 1773 + }, + { + "epoch": 2.615072784227013, + "grad_norm": 2.0718999570606376, + "learning_rate": 3.10745044975816e-05, + "loss": 0.8979, + "step": 1774 + }, + { + "epoch": 2.6165468951538604, + "grad_norm": 2.192556368408781, + "learning_rate": 3.1055302475639594e-05, + "loss": 1.0057, + "step": 1775 + }, + { + "epoch": 2.6180210060807076, + "grad_norm": 2.0596661606431126, + "learning_rate": 3.103609665799445e-05, + "loss": 0.9229, + "step": 1776 + }, + { + "epoch": 2.619495117007555, + "grad_norm": 2.084100721806418, + "learning_rate": 3.1016887056685155e-05, + "loss": 0.9164, + "step": 1777 + }, + { + "epoch": 2.620969227934402, + "grad_norm": 2.1277173563537644, + "learning_rate": 3.0997673683753024e-05, + "loss": 0.9373, + "step": 1778 + }, + { + "epoch": 2.6224433388612494, + "grad_norm": 2.196438755007362, + "learning_rate": 3.0978456551241786e-05, + "loss": 0.868, + "step": 1779 + }, + { + "epoch": 2.6239174497880966, + "grad_norm": 2.0005884496942516, + "learning_rate": 3.095923567119748e-05, + "loss": 0.8726, + "step": 1780 + }, + { + "epoch": 2.625391560714944, + "grad_norm": 2.08627545194374, + "learning_rate": 3.094001105566852e-05, + "loss": 0.9243, + "step": 1781 + }, + { + "epoch": 2.626865671641791, + "grad_norm": 2.1493808963891095, + "learning_rate": 3.0920782716705654e-05, + "loss": 0.8746, + "step": 1782 + }, + { + "epoch": 2.6283397825686383, + "grad_norm": 2.2030218654868663, + "learning_rate": 3.0901550666361964e-05, + "loss": 1.0587, + "step": 1783 + }, + { + "epoch": 2.6298138934954856, + "grad_norm": 2.2500929283449036, + "learning_rate": 3.088231491669287e-05, + "loss": 1.1282, + "step": 1784 + }, + { + "epoch": 2.631288004422333, + "grad_norm": 2.0809582205028807, + "learning_rate": 3.0863075479756084e-05, + "loss": 0.9671, + "step": 1785 + }, + { + "epoch": 2.63276211534918, + "grad_norm": 2.061466304795124, + "learning_rate": 3.084383236761166e-05, + "loss": 0.9586, + "step": 1786 + }, + { + "epoch": 2.6342362262760273, + "grad_norm": 2.4975333442791374, + "learning_rate": 3.0824585592321936e-05, + "loss": 1.0932, + "step": 1787 + }, + { + "epoch": 2.6357103372028745, + "grad_norm": 2.258729952885257, + "learning_rate": 3.080533516595155e-05, + "loss": 1.0924, + "step": 1788 + }, + { + "epoch": 2.6371844481297217, + "grad_norm": 2.154876271388944, + "learning_rate": 3.078608110056745e-05, + "loss": 1.0354, + "step": 1789 + }, + { + "epoch": 2.638658559056569, + "grad_norm": 2.024379672886872, + "learning_rate": 3.076682340823882e-05, + "loss": 1.0128, + "step": 1790 + }, + { + "epoch": 2.6401326699834162, + "grad_norm": 2.168539436078495, + "learning_rate": 3.074756210103715e-05, + "loss": 0.9384, + "step": 1791 + }, + { + "epoch": 2.6416067809102635, + "grad_norm": 2.062797640507273, + "learning_rate": 3.072829719103619e-05, + "loss": 0.9524, + "step": 1792 + }, + { + "epoch": 2.6430808918371107, + "grad_norm": 2.2362241762481614, + "learning_rate": 3.070902869031196e-05, + "loss": 0.8175, + "step": 1793 + }, + { + "epoch": 2.644555002763958, + "grad_norm": 1.9729043634575534, + "learning_rate": 3.0689756610942705e-05, + "loss": 0.8934, + "step": 1794 + }, + { + "epoch": 2.646029113690805, + "grad_norm": 2.049923323150976, + "learning_rate": 3.067048096500893e-05, + "loss": 0.829, + "step": 1795 + }, + { + "epoch": 2.6475032246176524, + "grad_norm": 2.059977520570044, + "learning_rate": 3.065120176459338e-05, + "loss": 0.9228, + "step": 1796 + }, + { + "epoch": 2.6489773355444997, + "grad_norm": 2.12026022591524, + "learning_rate": 3.0631919021781e-05, + "loss": 0.985, + "step": 1797 + }, + { + "epoch": 2.650451446471347, + "grad_norm": 2.1315320541473604, + "learning_rate": 3.0612632748659e-05, + "loss": 0.9291, + "step": 1798 + }, + { + "epoch": 2.651925557398194, + "grad_norm": 2.2521885718201107, + "learning_rate": 3.0593342957316765e-05, + "loss": 0.9138, + "step": 1799 + }, + { + "epoch": 2.6533996683250414, + "grad_norm": 2.2148548374874046, + "learning_rate": 3.05740496598459e-05, + "loss": 0.9376, + "step": 1800 + }, + { + "epoch": 2.6548737792518886, + "grad_norm": 2.041306353583633, + "learning_rate": 3.055475286834021e-05, + "loss": 0.9404, + "step": 1801 + }, + { + "epoch": 2.656347890178736, + "grad_norm": 2.1500605330706204, + "learning_rate": 3.053545259489569e-05, + "loss": 0.924, + "step": 1802 + }, + { + "epoch": 2.657822001105583, + "grad_norm": 2.1694027668619222, + "learning_rate": 3.051614885161051e-05, + "loss": 0.888, + "step": 1803 + }, + { + "epoch": 2.6592961120324303, + "grad_norm": 2.186691025602148, + "learning_rate": 3.0496841650585022e-05, + "loss": 0.9828, + "step": 1804 + }, + { + "epoch": 2.6607702229592776, + "grad_norm": 2.3808591747205616, + "learning_rate": 3.0477531003921745e-05, + "loss": 1.0999, + "step": 1805 + }, + { + "epoch": 2.662244333886125, + "grad_norm": 2.0975944183158957, + "learning_rate": 3.0458216923725356e-05, + "loss": 0.9419, + "step": 1806 + }, + { + "epoch": 2.663718444812972, + "grad_norm": 2.0947238308548624, + "learning_rate": 3.043889942210268e-05, + "loss": 0.8704, + "step": 1807 + }, + { + "epoch": 2.6651925557398193, + "grad_norm": 1.972787018020252, + "learning_rate": 3.0419578511162695e-05, + "loss": 0.9647, + "step": 1808 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 2.068045241984393, + "learning_rate": 3.0400254203016503e-05, + "loss": 0.9381, + "step": 1809 + }, + { + "epoch": 2.6681407775935138, + "grad_norm": 2.1525107871038958, + "learning_rate": 3.0380926509777364e-05, + "loss": 1.1002, + "step": 1810 + }, + { + "epoch": 2.669614888520361, + "grad_norm": 1.995996401926899, + "learning_rate": 3.0361595443560624e-05, + "loss": 1.0872, + "step": 1811 + }, + { + "epoch": 2.6710889994472087, + "grad_norm": 2.041661385769405, + "learning_rate": 3.034226101648377e-05, + "loss": 1.0054, + "step": 1812 + }, + { + "epoch": 2.672563110374056, + "grad_norm": 2.2057666497646826, + "learning_rate": 3.0322923240666377e-05, + "loss": 0.9795, + "step": 1813 + }, + { + "epoch": 2.674037221300903, + "grad_norm": 2.050299538849172, + "learning_rate": 3.030358212823014e-05, + "loss": 0.9672, + "step": 1814 + }, + { + "epoch": 2.6755113322277504, + "grad_norm": 2.051742589967626, + "learning_rate": 3.0284237691298823e-05, + "loss": 0.9873, + "step": 1815 + }, + { + "epoch": 2.6769854431545976, + "grad_norm": 1.9295027795016872, + "learning_rate": 3.0264889941998285e-05, + "loss": 0.8804, + "step": 1816 + }, + { + "epoch": 2.678459554081445, + "grad_norm": 2.180346755001575, + "learning_rate": 3.0245538892456455e-05, + "loss": 1.0665, + "step": 1817 + }, + { + "epoch": 2.679933665008292, + "grad_norm": 2.0464484083288443, + "learning_rate": 3.0226184554803357e-05, + "loss": 0.9575, + "step": 1818 + }, + { + "epoch": 2.6814077759351393, + "grad_norm": 2.08952953792962, + "learning_rate": 3.0206826941171035e-05, + "loss": 0.9013, + "step": 1819 + }, + { + "epoch": 2.6828818868619866, + "grad_norm": 2.3457320923807248, + "learning_rate": 3.0187466063693614e-05, + "loss": 0.9603, + "step": 1820 + }, + { + "epoch": 2.684355997788834, + "grad_norm": 2.047790439113738, + "learning_rate": 3.0168101934507266e-05, + "loss": 1.0001, + "step": 1821 + }, + { + "epoch": 2.685830108715681, + "grad_norm": 2.1518726975746922, + "learning_rate": 3.0148734565750176e-05, + "loss": 0.9236, + "step": 1822 + }, + { + "epoch": 2.6873042196425283, + "grad_norm": 2.2367997251103877, + "learning_rate": 3.012936396956259e-05, + "loss": 0.9916, + "step": 1823 + }, + { + "epoch": 2.6887783305693755, + "grad_norm": 2.3769347692453224, + "learning_rate": 3.0109990158086764e-05, + "loss": 0.8328, + "step": 1824 + }, + { + "epoch": 2.690252441496223, + "grad_norm": 2.0610036334691046, + "learning_rate": 3.0090613143466956e-05, + "loss": 0.9659, + "step": 1825 + }, + { + "epoch": 2.69172655242307, + "grad_norm": 2.233069405214683, + "learning_rate": 3.0071232937849457e-05, + "loss": 0.9709, + "step": 1826 + }, + { + "epoch": 2.6932006633499173, + "grad_norm": 2.154831903721188, + "learning_rate": 3.0051849553382555e-05, + "loss": 0.9363, + "step": 1827 + }, + { + "epoch": 2.6946747742767645, + "grad_norm": 2.0612165908465574, + "learning_rate": 3.0032463002216505e-05, + "loss": 0.925, + "step": 1828 + }, + { + "epoch": 2.6961488852036117, + "grad_norm": 2.233269905024685, + "learning_rate": 3.001307329650357e-05, + "loss": 0.9821, + "step": 1829 + }, + { + "epoch": 2.697622996130459, + "grad_norm": 2.1985717689055693, + "learning_rate": 2.9993680448397988e-05, + "loss": 1.0333, + "step": 1830 + }, + { + "epoch": 2.699097107057306, + "grad_norm": 2.065792230482113, + "learning_rate": 2.997428447005596e-05, + "loss": 0.9076, + "step": 1831 + }, + { + "epoch": 2.7005712179841534, + "grad_norm": 2.164533763485579, + "learning_rate": 2.9954885373635655e-05, + "loss": 1.1866, + "step": 1832 + }, + { + "epoch": 2.7020453289110007, + "grad_norm": 2.258621546054044, + "learning_rate": 2.9935483171297186e-05, + "loss": 1.0448, + "step": 1833 + }, + { + "epoch": 2.703519439837848, + "grad_norm": 2.198199237419151, + "learning_rate": 2.991607787520263e-05, + "loss": 0.9786, + "step": 1834 + }, + { + "epoch": 2.704993550764695, + "grad_norm": 1.947070082294161, + "learning_rate": 2.989666949751599e-05, + "loss": 0.98, + "step": 1835 + }, + { + "epoch": 2.7064676616915424, + "grad_norm": 2.1596233131299214, + "learning_rate": 2.9877258050403212e-05, + "loss": 0.9382, + "step": 1836 + }, + { + "epoch": 2.7079417726183896, + "grad_norm": 1.9660084724864615, + "learning_rate": 2.985784354603215e-05, + "loss": 0.8798, + "step": 1837 + }, + { + "epoch": 2.709415883545237, + "grad_norm": 2.124654854064859, + "learning_rate": 2.9838425996572583e-05, + "loss": 0.9097, + "step": 1838 + }, + { + "epoch": 2.710889994472084, + "grad_norm": 2.0373399262063634, + "learning_rate": 2.981900541419621e-05, + "loss": 1.073, + "step": 1839 + }, + { + "epoch": 2.7123641053989314, + "grad_norm": 2.366214262566019, + "learning_rate": 2.9799581811076605e-05, + "loss": 1.0424, + "step": 1840 + }, + { + "epoch": 2.7138382163257786, + "grad_norm": 2.2303612143653795, + "learning_rate": 2.978015519938926e-05, + "loss": 0.9905, + "step": 1841 + }, + { + "epoch": 2.715312327252626, + "grad_norm": 1.9686715852168089, + "learning_rate": 2.9760725591311545e-05, + "loss": 0.8199, + "step": 1842 + }, + { + "epoch": 2.716786438179473, + "grad_norm": 2.122109859991592, + "learning_rate": 2.9741292999022707e-05, + "loss": 0.9643, + "step": 1843 + }, + { + "epoch": 2.7182605491063203, + "grad_norm": 2.139339534410948, + "learning_rate": 2.9721857434703858e-05, + "loss": 0.8441, + "step": 1844 + }, + { + "epoch": 2.7197346600331676, + "grad_norm": 2.169012914546158, + "learning_rate": 2.9702418910537983e-05, + "loss": 0.9663, + "step": 1845 + }, + { + "epoch": 2.721208770960015, + "grad_norm": 2.1349947877056454, + "learning_rate": 2.9682977438709914e-05, + "loss": 0.9757, + "step": 1846 + }, + { + "epoch": 2.722682881886862, + "grad_norm": 2.2382260726864955, + "learning_rate": 2.9663533031406344e-05, + "loss": 1.0964, + "step": 1847 + }, + { + "epoch": 2.7241569928137093, + "grad_norm": 2.1530888923972586, + "learning_rate": 2.9644085700815777e-05, + "loss": 0.9852, + "step": 1848 + }, + { + "epoch": 2.7256311037405565, + "grad_norm": 2.013907595434308, + "learning_rate": 2.9624635459128585e-05, + "loss": 0.7825, + "step": 1849 + }, + { + "epoch": 2.7271052146674037, + "grad_norm": 2.343103039776737, + "learning_rate": 2.960518231853695e-05, + "loss": 0.9298, + "step": 1850 + }, + { + "epoch": 2.728579325594251, + "grad_norm": 2.180973999575077, + "learning_rate": 2.9585726291234872e-05, + "loss": 0.9289, + "step": 1851 + }, + { + "epoch": 2.730053436521098, + "grad_norm": 2.0294113053486362, + "learning_rate": 2.9566267389418144e-05, + "loss": 1.0403, + "step": 1852 + }, + { + "epoch": 2.7315275474479455, + "grad_norm": 1.9998146805735708, + "learning_rate": 2.9546805625284384e-05, + "loss": 0.9106, + "step": 1853 + }, + { + "epoch": 2.7330016583747927, + "grad_norm": 2.129740644255068, + "learning_rate": 2.9527341011033e-05, + "loss": 1.0296, + "step": 1854 + }, + { + "epoch": 2.73447576930164, + "grad_norm": 2.1004496183902512, + "learning_rate": 2.9507873558865175e-05, + "loss": 1.0393, + "step": 1855 + }, + { + "epoch": 2.735949880228487, + "grad_norm": 2.2503740211725822, + "learning_rate": 2.9488403280983873e-05, + "loss": 1.0947, + "step": 1856 + }, + { + "epoch": 2.7374239911553344, + "grad_norm": 2.1085446454125645, + "learning_rate": 2.9468930189593845e-05, + "loss": 0.9323, + "step": 1857 + }, + { + "epoch": 2.7388981020821817, + "grad_norm": 2.296346383021392, + "learning_rate": 2.9449454296901603e-05, + "loss": 0.8923, + "step": 1858 + }, + { + "epoch": 2.740372213009029, + "grad_norm": 2.020647754865544, + "learning_rate": 2.9429975615115383e-05, + "loss": 1.1155, + "step": 1859 + }, + { + "epoch": 2.741846323935876, + "grad_norm": 2.0743597812444876, + "learning_rate": 2.9410494156445216e-05, + "loss": 1.0538, + "step": 1860 + }, + { + "epoch": 2.7433204348627234, + "grad_norm": 2.1427226682884144, + "learning_rate": 2.9391009933102836e-05, + "loss": 0.9989, + "step": 1861 + }, + { + "epoch": 2.7447945457895706, + "grad_norm": 1.99638361608485, + "learning_rate": 2.9371522957301734e-05, + "loss": 0.9557, + "step": 1862 + }, + { + "epoch": 2.746268656716418, + "grad_norm": 2.0440211311636567, + "learning_rate": 2.935203324125711e-05, + "loss": 0.9788, + "step": 1863 + }, + { + "epoch": 2.747742767643265, + "grad_norm": 2.3555224967632062, + "learning_rate": 2.9332540797185892e-05, + "loss": 1.141, + "step": 1864 + }, + { + "epoch": 2.7492168785701123, + "grad_norm": 2.146658643012522, + "learning_rate": 2.9313045637306714e-05, + "loss": 0.9468, + "step": 1865 + }, + { + "epoch": 2.7506909894969596, + "grad_norm": 2.0289027810755624, + "learning_rate": 2.9293547773839917e-05, + "loss": 1.1054, + "step": 1866 + }, + { + "epoch": 2.752165100423807, + "grad_norm": 2.0019487185546017, + "learning_rate": 2.9274047219007534e-05, + "loss": 0.7559, + "step": 1867 + }, + { + "epoch": 2.753639211350654, + "grad_norm": 2.1149406193903797, + "learning_rate": 2.925454398503328e-05, + "loss": 0.9985, + "step": 1868 + }, + { + "epoch": 2.7551133222775013, + "grad_norm": 1.9260872680921393, + "learning_rate": 2.9235038084142557e-05, + "loss": 0.8982, + "step": 1869 + }, + { + "epoch": 2.7565874332043485, + "grad_norm": 2.2864158354150113, + "learning_rate": 2.921552952856243e-05, + "loss": 1.016, + "step": 1870 + }, + { + "epoch": 2.7580615441311958, + "grad_norm": 1.9241097336126731, + "learning_rate": 2.919601833052163e-05, + "loss": 0.925, + "step": 1871 + }, + { + "epoch": 2.759535655058043, + "grad_norm": 2.2882905771697715, + "learning_rate": 2.9176504502250563e-05, + "loss": 0.9278, + "step": 1872 + }, + { + "epoch": 2.7610097659848902, + "grad_norm": 2.051962782776982, + "learning_rate": 2.9156988055981254e-05, + "loss": 0.9639, + "step": 1873 + }, + { + "epoch": 2.7624838769117375, + "grad_norm": 2.086700185723885, + "learning_rate": 2.9137469003947392e-05, + "loss": 0.9043, + "step": 1874 + }, + { + "epoch": 2.7639579878385847, + "grad_norm": 2.15568745227044, + "learning_rate": 2.9117947358384288e-05, + "loss": 0.9606, + "step": 1875 + }, + { + "epoch": 2.765432098765432, + "grad_norm": 2.1569483635053412, + "learning_rate": 2.909842313152888e-05, + "loss": 0.9616, + "step": 1876 + }, + { + "epoch": 2.766906209692279, + "grad_norm": 2.0924481572117615, + "learning_rate": 2.9078896335619732e-05, + "loss": 1.0455, + "step": 1877 + }, + { + "epoch": 2.7683803206191264, + "grad_norm": 2.1992132210534963, + "learning_rate": 2.9059366982897007e-05, + "loss": 0.9652, + "step": 1878 + }, + { + "epoch": 2.7698544315459737, + "grad_norm": 2.3053370708010936, + "learning_rate": 2.9039835085602473e-05, + "loss": 1.0063, + "step": 1879 + }, + { + "epoch": 2.771328542472821, + "grad_norm": 2.198036865515296, + "learning_rate": 2.9020300655979503e-05, + "loss": 1.1366, + "step": 1880 + }, + { + "epoch": 2.772802653399668, + "grad_norm": 2.2998020957721597, + "learning_rate": 2.9000763706273036e-05, + "loss": 1.0675, + "step": 1881 + }, + { + "epoch": 2.7742767643265154, + "grad_norm": 2.180870254820166, + "learning_rate": 2.8981224248729628e-05, + "loss": 1.113, + "step": 1882 + }, + { + "epoch": 2.7757508752533626, + "grad_norm": 1.93734082214149, + "learning_rate": 2.896168229559737e-05, + "loss": 0.8801, + "step": 1883 + }, + { + "epoch": 2.77722498618021, + "grad_norm": 2.0103423215201, + "learning_rate": 2.8942137859125928e-05, + "loss": 1.0129, + "step": 1884 + }, + { + "epoch": 2.778699097107057, + "grad_norm": 2.185063340165934, + "learning_rate": 2.8922590951566536e-05, + "loss": 0.9029, + "step": 1885 + }, + { + "epoch": 2.7801732080339043, + "grad_norm": 2.2095920526269057, + "learning_rate": 2.8903041585171963e-05, + "loss": 0.8379, + "step": 1886 + }, + { + "epoch": 2.7816473189607516, + "grad_norm": 2.1660106717354886, + "learning_rate": 2.8883489772196525e-05, + "loss": 1.0209, + "step": 1887 + }, + { + "epoch": 2.783121429887599, + "grad_norm": 2.167633635593321, + "learning_rate": 2.886393552489608e-05, + "loss": 1.1411, + "step": 1888 + }, + { + "epoch": 2.784595540814446, + "grad_norm": 2.3025102061357146, + "learning_rate": 2.8844378855527998e-05, + "loss": 0.9594, + "step": 1889 + }, + { + "epoch": 2.7860696517412933, + "grad_norm": 2.1065194475465554, + "learning_rate": 2.8824819776351176e-05, + "loss": 1.0102, + "step": 1890 + }, + { + "epoch": 2.7875437626681405, + "grad_norm": 1.9284450840310214, + "learning_rate": 2.8805258299626015e-05, + "loss": 0.9375, + "step": 1891 + }, + { + "epoch": 2.7890178735949878, + "grad_norm": 1.8859508523413062, + "learning_rate": 2.878569443761442e-05, + "loss": 0.8936, + "step": 1892 + }, + { + "epoch": 2.790491984521835, + "grad_norm": 2.1042186869117554, + "learning_rate": 2.8766128202579797e-05, + "loss": 0.9601, + "step": 1893 + }, + { + "epoch": 2.7919660954486822, + "grad_norm": 2.088913983449061, + "learning_rate": 2.874655960678704e-05, + "loss": 1.0908, + "step": 1894 + }, + { + "epoch": 2.79344020637553, + "grad_norm": 2.132596183119328, + "learning_rate": 2.87269886625025e-05, + "loss": 0.8279, + "step": 1895 + }, + { + "epoch": 2.794914317302377, + "grad_norm": 2.0647120317930434, + "learning_rate": 2.870741538199405e-05, + "loss": 0.9965, + "step": 1896 + }, + { + "epoch": 2.7963884282292244, + "grad_norm": 2.2337437518288055, + "learning_rate": 2.8687839777530977e-05, + "loss": 0.946, + "step": 1897 + }, + { + "epoch": 2.7978625391560716, + "grad_norm": 2.1167733888929954, + "learning_rate": 2.8668261861384045e-05, + "loss": 1.0438, + "step": 1898 + }, + { + "epoch": 2.799336650082919, + "grad_norm": 2.0151452964857115, + "learning_rate": 2.8648681645825472e-05, + "loss": 0.9852, + "step": 1899 + }, + { + "epoch": 2.800810761009766, + "grad_norm": 2.059927868182048, + "learning_rate": 2.8629099143128907e-05, + "loss": 1.0796, + "step": 1900 + }, + { + "epoch": 2.8022848719366134, + "grad_norm": 2.203255182843403, + "learning_rate": 2.860951436556944e-05, + "loss": 0.8487, + "step": 1901 + }, + { + "epoch": 2.8037589828634606, + "grad_norm": 2.261651072700207, + "learning_rate": 2.8589927325423576e-05, + "loss": 0.9975, + "step": 1902 + }, + { + "epoch": 2.805233093790308, + "grad_norm": 2.1972534721886143, + "learning_rate": 2.8570338034969264e-05, + "loss": 0.8591, + "step": 1903 + }, + { + "epoch": 2.806707204717155, + "grad_norm": 2.0294720424503625, + "learning_rate": 2.855074650648583e-05, + "loss": 0.9723, + "step": 1904 + }, + { + "epoch": 2.8081813156440023, + "grad_norm": 2.012433741645712, + "learning_rate": 2.853115275225403e-05, + "loss": 1.0298, + "step": 1905 + }, + { + "epoch": 2.8096554265708495, + "grad_norm": 2.0880314179099306, + "learning_rate": 2.8511556784556e-05, + "loss": 0.9103, + "step": 1906 + }, + { + "epoch": 2.811129537497697, + "grad_norm": 2.161099605589604, + "learning_rate": 2.8491958615675262e-05, + "loss": 0.8923, + "step": 1907 + }, + { + "epoch": 2.812603648424544, + "grad_norm": 2.071033855940649, + "learning_rate": 2.8472358257896732e-05, + "loss": 0.989, + "step": 1908 + }, + { + "epoch": 2.8140777593513913, + "grad_norm": 2.0227639981092995, + "learning_rate": 2.8452755723506687e-05, + "loss": 1.0256, + "step": 1909 + }, + { + "epoch": 2.8155518702782385, + "grad_norm": 1.95844058155714, + "learning_rate": 2.843315102479276e-05, + "loss": 0.9416, + "step": 1910 + }, + { + "epoch": 2.8170259812050857, + "grad_norm": 2.118830532467706, + "learning_rate": 2.841354417404397e-05, + "loss": 0.8657, + "step": 1911 + }, + { + "epoch": 2.818500092131933, + "grad_norm": 2.1501538997837173, + "learning_rate": 2.8393935183550662e-05, + "loss": 1.0455, + "step": 1912 + }, + { + "epoch": 2.81997420305878, + "grad_norm": 2.123405082404658, + "learning_rate": 2.8374324065604517e-05, + "loss": 0.9056, + "step": 1913 + }, + { + "epoch": 2.8214483139856275, + "grad_norm": 2.0136367099181594, + "learning_rate": 2.8354710832498576e-05, + "loss": 1.0075, + "step": 1914 + }, + { + "epoch": 2.8229224249124747, + "grad_norm": 2.067461692519748, + "learning_rate": 2.833509549652717e-05, + "loss": 1.0024, + "step": 1915 + }, + { + "epoch": 2.824396535839322, + "grad_norm": 2.123742123971472, + "learning_rate": 2.831547806998598e-05, + "loss": 0.9694, + "step": 1916 + }, + { + "epoch": 2.825870646766169, + "grad_norm": 2.2401391219464584, + "learning_rate": 2.8295858565171983e-05, + "loss": 1.0497, + "step": 1917 + }, + { + "epoch": 2.8273447576930164, + "grad_norm": 1.9842866292184558, + "learning_rate": 2.8276236994383453e-05, + "loss": 0.904, + "step": 1918 + }, + { + "epoch": 2.8288188686198636, + "grad_norm": 1.9829123806277538, + "learning_rate": 2.825661336991998e-05, + "loss": 0.936, + "step": 1919 + }, + { + "epoch": 2.830292979546711, + "grad_norm": 2.0137752354731964, + "learning_rate": 2.8236987704082417e-05, + "loss": 0.9222, + "step": 1920 + }, + { + "epoch": 2.831767090473558, + "grad_norm": 1.9893906052265287, + "learning_rate": 2.8217360009172922e-05, + "loss": 0.8904, + "step": 1921 + }, + { + "epoch": 2.8332412014004054, + "grad_norm": 2.0695718844072566, + "learning_rate": 2.8197730297494896e-05, + "loss": 0.9024, + "step": 1922 + }, + { + "epoch": 2.8347153123272526, + "grad_norm": 2.0782482139371394, + "learning_rate": 2.8178098581353018e-05, + "loss": 0.8734, + "step": 1923 + }, + { + "epoch": 2.8361894232541, + "grad_norm": 2.074784196409333, + "learning_rate": 2.8158464873053237e-05, + "loss": 0.9932, + "step": 1924 + }, + { + "epoch": 2.837663534180947, + "grad_norm": 1.9607349306320534, + "learning_rate": 2.8138829184902727e-05, + "loss": 0.9758, + "step": 1925 + }, + { + "epoch": 2.8391376451077943, + "grad_norm": 2.1966187198752976, + "learning_rate": 2.811919152920991e-05, + "loss": 0.9256, + "step": 1926 + }, + { + "epoch": 2.8406117560346416, + "grad_norm": 2.149744142788989, + "learning_rate": 2.8099551918284468e-05, + "loss": 0.9008, + "step": 1927 + }, + { + "epoch": 2.842085866961489, + "grad_norm": 2.1642825018262246, + "learning_rate": 2.8079910364437263e-05, + "loss": 1.0446, + "step": 1928 + }, + { + "epoch": 2.843559977888336, + "grad_norm": 2.1750647085802366, + "learning_rate": 2.8060266879980408e-05, + "loss": 1.0718, + "step": 1929 + }, + { + "epoch": 2.8450340888151833, + "grad_norm": 2.0072262869301407, + "learning_rate": 2.8040621477227214e-05, + "loss": 1.0321, + "step": 1930 + }, + { + "epoch": 2.8465081997420305, + "grad_norm": 1.954013530801488, + "learning_rate": 2.8020974168492197e-05, + "loss": 0.9447, + "step": 1931 + }, + { + "epoch": 2.8479823106688777, + "grad_norm": 1.9962274614852062, + "learning_rate": 2.8001324966091076e-05, + "loss": 0.7935, + "step": 1932 + }, + { + "epoch": 2.849456421595725, + "grad_norm": 2.3085786718918557, + "learning_rate": 2.7981673882340726e-05, + "loss": 1.0391, + "step": 1933 + }, + { + "epoch": 2.8509305325225722, + "grad_norm": 2.093838590199608, + "learning_rate": 2.796202092955924e-05, + "loss": 0.8771, + "step": 1934 + }, + { + "epoch": 2.8524046434494195, + "grad_norm": 2.226016007666598, + "learning_rate": 2.7942366120065872e-05, + "loss": 0.8799, + "step": 1935 + }, + { + "epoch": 2.8538787543762667, + "grad_norm": 2.0024517052550013, + "learning_rate": 2.792270946618102e-05, + "loss": 0.8739, + "step": 1936 + }, + { + "epoch": 2.855352865303114, + "grad_norm": 2.22901358628443, + "learning_rate": 2.790305098022626e-05, + "loss": 0.997, + "step": 1937 + }, + { + "epoch": 2.856826976229961, + "grad_norm": 2.372418757696739, + "learning_rate": 2.78833906745243e-05, + "loss": 0.9276, + "step": 1938 + }, + { + "epoch": 2.8583010871568084, + "grad_norm": 2.2132512040392966, + "learning_rate": 2.7863728561399016e-05, + "loss": 0.9028, + "step": 1939 + }, + { + "epoch": 2.8597751980836557, + "grad_norm": 2.0593539432265318, + "learning_rate": 2.7844064653175378e-05, + "loss": 0.8346, + "step": 1940 + }, + { + "epoch": 2.861249309010503, + "grad_norm": 2.1577616382435725, + "learning_rate": 2.7824398962179503e-05, + "loss": 1.0189, + "step": 1941 + }, + { + "epoch": 2.86272341993735, + "grad_norm": 2.0930959334680383, + "learning_rate": 2.780473150073864e-05, + "loss": 0.9341, + "step": 1942 + }, + { + "epoch": 2.8641975308641974, + "grad_norm": 2.1796446218340813, + "learning_rate": 2.7785062281181124e-05, + "loss": 1.0079, + "step": 1943 + }, + { + "epoch": 2.8656716417910446, + "grad_norm": 2.2649931980393023, + "learning_rate": 2.7765391315836396e-05, + "loss": 1.0087, + "step": 1944 + }, + { + "epoch": 2.8671457527178923, + "grad_norm": 2.2403913474524915, + "learning_rate": 2.7745718617034998e-05, + "loss": 0.8887, + "step": 1945 + }, + { + "epoch": 2.8686198636447395, + "grad_norm": 2.066933577232033, + "learning_rate": 2.7726044197108557e-05, + "loss": 1.0993, + "step": 1946 + }, + { + "epoch": 2.8700939745715868, + "grad_norm": 2.176907095703998, + "learning_rate": 2.7706368068389778e-05, + "loss": 0.7714, + "step": 1947 + }, + { + "epoch": 2.871568085498434, + "grad_norm": 2.324074312738905, + "learning_rate": 2.7686690243212432e-05, + "loss": 1.1721, + "step": 1948 + }, + { + "epoch": 2.8730421964252812, + "grad_norm": 2.0571385837692198, + "learning_rate": 2.7667010733911354e-05, + "loss": 0.965, + "step": 1949 + }, + { + "epoch": 2.8745163073521285, + "grad_norm": 2.135276628570073, + "learning_rate": 2.7647329552822455e-05, + "loss": 0.9041, + "step": 1950 + }, + { + "epoch": 2.8759904182789757, + "grad_norm": 2.164665936780751, + "learning_rate": 2.762764671228267e-05, + "loss": 0.8907, + "step": 1951 + }, + { + "epoch": 2.877464529205823, + "grad_norm": 2.239392815108347, + "learning_rate": 2.760796222462998e-05, + "loss": 0.9501, + "step": 1952 + }, + { + "epoch": 2.87893864013267, + "grad_norm": 2.0317727223296007, + "learning_rate": 2.7588276102203398e-05, + "loss": 0.8825, + "step": 1953 + }, + { + "epoch": 2.8804127510595174, + "grad_norm": 2.1517928122791745, + "learning_rate": 2.7568588357342973e-05, + "loss": 0.9281, + "step": 1954 + }, + { + "epoch": 2.8818868619863647, + "grad_norm": 2.173488319620076, + "learning_rate": 2.754889900238975e-05, + "loss": 0.9885, + "step": 1955 + }, + { + "epoch": 2.883360972913212, + "grad_norm": 2.3318064212248997, + "learning_rate": 2.7529208049685807e-05, + "loss": 1.049, + "step": 1956 + }, + { + "epoch": 2.884835083840059, + "grad_norm": 2.189165843454745, + "learning_rate": 2.7509515511574208e-05, + "loss": 0.9454, + "step": 1957 + }, + { + "epoch": 2.8863091947669064, + "grad_norm": 2.0281886580106177, + "learning_rate": 2.748982140039902e-05, + "loss": 0.896, + "step": 1958 + }, + { + "epoch": 2.8877833056937536, + "grad_norm": 2.0649616694183437, + "learning_rate": 2.747012572850528e-05, + "loss": 0.8909, + "step": 1959 + }, + { + "epoch": 2.889257416620601, + "grad_norm": 1.9376862959358307, + "learning_rate": 2.7450428508239024e-05, + "loss": 0.9517, + "step": 1960 + }, + { + "epoch": 2.890731527547448, + "grad_norm": 2.024468472269905, + "learning_rate": 2.743072975194723e-05, + "loss": 0.9304, + "step": 1961 + }, + { + "epoch": 2.8922056384742953, + "grad_norm": 2.197833693928408, + "learning_rate": 2.741102947197789e-05, + "loss": 0.9326, + "step": 1962 + }, + { + "epoch": 2.8936797494011426, + "grad_norm": 2.205512842757133, + "learning_rate": 2.7391327680679895e-05, + "loss": 0.9714, + "step": 1963 + }, + { + "epoch": 2.89515386032799, + "grad_norm": 2.124966452838655, + "learning_rate": 2.7371624390403116e-05, + "loss": 1.0, + "step": 1964 + }, + { + "epoch": 2.896627971254837, + "grad_norm": 2.164118686869307, + "learning_rate": 2.735191961349835e-05, + "loss": 0.9994, + "step": 1965 + }, + { + "epoch": 2.8981020821816843, + "grad_norm": 2.190263691708167, + "learning_rate": 2.7332213362317328e-05, + "loss": 0.9316, + "step": 1966 + }, + { + "epoch": 2.8995761931085315, + "grad_norm": 1.948747535885078, + "learning_rate": 2.7312505649212722e-05, + "loss": 0.8285, + "step": 1967 + }, + { + "epoch": 2.901050304035379, + "grad_norm": 2.1955565184596666, + "learning_rate": 2.7292796486538093e-05, + "loss": 1.0584, + "step": 1968 + }, + { + "epoch": 2.902524414962226, + "grad_norm": 1.936813048128675, + "learning_rate": 2.727308588664793e-05, + "loss": 0.7911, + "step": 1969 + }, + { + "epoch": 2.9039985258890733, + "grad_norm": 2.351214513029234, + "learning_rate": 2.725337386189761e-05, + "loss": 1.0626, + "step": 1970 + }, + { + "epoch": 2.9054726368159205, + "grad_norm": 2.072243764979541, + "learning_rate": 2.723366042464342e-05, + "loss": 0.9979, + "step": 1971 + }, + { + "epoch": 2.9069467477427677, + "grad_norm": 2.144502010736748, + "learning_rate": 2.7213945587242508e-05, + "loss": 1.1117, + "step": 1972 + }, + { + "epoch": 2.908420858669615, + "grad_norm": 2.3090684535013946, + "learning_rate": 2.7194229362052924e-05, + "loss": 0.9736, + "step": 1973 + }, + { + "epoch": 2.909894969596462, + "grad_norm": 2.0226378757177077, + "learning_rate": 2.7174511761433585e-05, + "loss": 1.1498, + "step": 1974 + }, + { + "epoch": 2.9113690805233094, + "grad_norm": 2.114863284767324, + "learning_rate": 2.715479279774425e-05, + "loss": 0.9978, + "step": 1975 + }, + { + "epoch": 2.9128431914501567, + "grad_norm": 2.0441151424010013, + "learning_rate": 2.7135072483345552e-05, + "loss": 0.8907, + "step": 1976 + }, + { + "epoch": 2.914317302377004, + "grad_norm": 2.159377663109734, + "learning_rate": 2.7115350830598958e-05, + "loss": 0.948, + "step": 1977 + }, + { + "epoch": 2.915791413303851, + "grad_norm": 2.074035751402493, + "learning_rate": 2.709562785186679e-05, + "loss": 0.9448, + "step": 1978 + }, + { + "epoch": 2.9172655242306984, + "grad_norm": 2.4391524753433584, + "learning_rate": 2.7075903559512178e-05, + "loss": 0.9926, + "step": 1979 + }, + { + "epoch": 2.9187396351575456, + "grad_norm": 2.25316259852769, + "learning_rate": 2.7056177965899097e-05, + "loss": 0.9782, + "step": 1980 + }, + { + "epoch": 2.920213746084393, + "grad_norm": 2.237264512099727, + "learning_rate": 2.7036451083392332e-05, + "loss": 0.9497, + "step": 1981 + }, + { + "epoch": 2.92168785701124, + "grad_norm": 2.232577367725665, + "learning_rate": 2.701672292435747e-05, + "loss": 0.9958, + "step": 1982 + }, + { + "epoch": 2.9231619679380874, + "grad_norm": 2.2715650890176575, + "learning_rate": 2.69969935011609e-05, + "loss": 1.0359, + "step": 1983 + }, + { + "epoch": 2.9246360788649346, + "grad_norm": 2.1787683880326028, + "learning_rate": 2.6977262826169807e-05, + "loss": 1.0161, + "step": 1984 + }, + { + "epoch": 2.926110189791782, + "grad_norm": 2.195982155531071, + "learning_rate": 2.695753091175216e-05, + "loss": 0.9379, + "step": 1985 + }, + { + "epoch": 2.927584300718629, + "grad_norm": 2.138687593585216, + "learning_rate": 2.6937797770276702e-05, + "loss": 1.0092, + "step": 1986 + }, + { + "epoch": 2.9290584116454763, + "grad_norm": 2.00690222860402, + "learning_rate": 2.6918063414112942e-05, + "loss": 0.8235, + "step": 1987 + }, + { + "epoch": 2.9305325225723236, + "grad_norm": 2.1371470689804912, + "learning_rate": 2.6898327855631155e-05, + "loss": 1.0929, + "step": 1988 + }, + { + "epoch": 2.932006633499171, + "grad_norm": 2.004136100213434, + "learning_rate": 2.6878591107202383e-05, + "loss": 0.9106, + "step": 1989 + }, + { + "epoch": 2.933480744426018, + "grad_norm": 2.084505539771222, + "learning_rate": 2.685885318119839e-05, + "loss": 0.9831, + "step": 1990 + }, + { + "epoch": 2.9349548553528653, + "grad_norm": 2.2400347114325823, + "learning_rate": 2.683911408999169e-05, + "loss": 0.9796, + "step": 1991 + }, + { + "epoch": 2.9364289662797125, + "grad_norm": 2.0879904256579245, + "learning_rate": 2.6819373845955527e-05, + "loss": 0.9937, + "step": 1992 + }, + { + "epoch": 2.9379030772065597, + "grad_norm": 2.097440172144683, + "learning_rate": 2.6799632461463862e-05, + "loss": 1.0235, + "step": 1993 + }, + { + "epoch": 2.939377188133407, + "grad_norm": 2.122542924211053, + "learning_rate": 2.6779889948891384e-05, + "loss": 1.132, + "step": 1994 + }, + { + "epoch": 2.940851299060254, + "grad_norm": 2.0540627809864143, + "learning_rate": 2.676014632061347e-05, + "loss": 0.8272, + "step": 1995 + }, + { + "epoch": 2.9423254099871015, + "grad_norm": 2.1484950942209196, + "learning_rate": 2.674040158900622e-05, + "loss": 0.8859, + "step": 1996 + }, + { + "epoch": 2.9437995209139487, + "grad_norm": 2.2527703078784116, + "learning_rate": 2.6720655766446412e-05, + "loss": 1.0119, + "step": 1997 + }, + { + "epoch": 2.945273631840796, + "grad_norm": 2.24887438274793, + "learning_rate": 2.6700908865311497e-05, + "loss": 1.1303, + "step": 1998 + }, + { + "epoch": 2.946747742767643, + "grad_norm": 1.9380307854883236, + "learning_rate": 2.6681160897979623e-05, + "loss": 0.8704, + "step": 1999 + }, + { + "epoch": 2.9482218536944904, + "grad_norm": 2.116668524954073, + "learning_rate": 2.6661411876829596e-05, + "loss": 0.9759, + "step": 2000 + }, + { + "epoch": 2.9482218536944904, + "eval_bleu": 0.04787357154563729, + "eval_bleu_1gram": 0.3421104118417729, + "eval_bleu_2gram": 0.11816282651259664, + "eval_bleu_3gram": 0.04741820328686351, + "eval_bleu_4gram": 0.022146725466342863, + "eval_rag_val_loss": 1.6265630154840407, + "eval_rouge1": 0.33128632764598065, + "eval_rouge2": 0.11283687684156013, + "eval_rougeL": 0.31066768988855775, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 4068, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}