{ "best_metric": 1.0981753053524161, "best_model_checkpoint": "train/20241110-Compress:64x-Lr:5e-5-Llama3-8B-instruct-GPT2-Large-RAG-no-ft_token-onlySquad-everymem/checkpoint-2000", "epoch": 2.9482218536944904, "eval_steps": 250, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014741109268472453, "grad_norm": 10.61697156179873, "learning_rate": 5.000000000000001e-07, "loss": 5.3633, "step": 1 }, { "epoch": 0.0029482218536944905, "grad_norm": 224.56039520305, "learning_rate": 1.0000000000000002e-06, "loss": 5.6658, "step": 2 }, { "epoch": 0.004422332780541736, "grad_norm": 15.08603156528157, "learning_rate": 1.5e-06, "loss": 5.3766, "step": 3 }, { "epoch": 0.005896443707388981, "grad_norm": 18.645343499879953, "learning_rate": 2.0000000000000003e-06, "loss": 5.671, "step": 4 }, { "epoch": 0.0073705546342362266, "grad_norm": 9.084548962177012, "learning_rate": 2.5e-06, "loss": 5.6932, "step": 5 }, { "epoch": 0.008844665561083471, "grad_norm": 15.283537461398927, "learning_rate": 3e-06, "loss": 5.7208, "step": 6 }, { "epoch": 0.010318776487930717, "grad_norm": 10.915661442802572, "learning_rate": 3.5000000000000004e-06, "loss": 5.8001, "step": 7 }, { "epoch": 0.011792887414777962, "grad_norm": 11.023433347359246, "learning_rate": 4.000000000000001e-06, "loss": 5.4991, "step": 8 }, { "epoch": 0.013266998341625208, "grad_norm": 10.747539149885442, "learning_rate": 4.5e-06, "loss": 5.5819, "step": 9 }, { "epoch": 0.014741109268472453, "grad_norm": 9.884743434758239, "learning_rate": 5e-06, "loss": 5.5065, "step": 10 }, { "epoch": 0.016215220195319697, "grad_norm": 14.361978753494252, "learning_rate": 5.500000000000001e-06, "loss": 5.4653, "step": 11 }, { "epoch": 0.017689331122166942, "grad_norm": 39.781871374271255, "learning_rate": 6e-06, "loss": 5.6215, "step": 12 }, { "epoch": 0.019163442049014188, "grad_norm": 9.949821462898452, "learning_rate": 6.5000000000000004e-06, "loss": 5.5279, "step": 13 }, { "epoch": 0.020637552975861433, "grad_norm": 18.7383858886749, "learning_rate": 7.000000000000001e-06, "loss": 5.0728, "step": 14 }, { "epoch": 0.02211166390270868, "grad_norm": 12.231148051839435, "learning_rate": 7.5e-06, "loss": 4.8488, "step": 15 }, { "epoch": 0.023585774829555924, "grad_norm": 10.114834427131333, "learning_rate": 8.000000000000001e-06, "loss": 4.7566, "step": 16 }, { "epoch": 0.02505988575640317, "grad_norm": 10.61995155222846, "learning_rate": 8.500000000000002e-06, "loss": 4.5164, "step": 17 }, { "epoch": 0.026533996683250415, "grad_norm": 8.191474241193744, "learning_rate": 9e-06, "loss": 3.8504, "step": 18 }, { "epoch": 0.02800810761009766, "grad_norm": 8.415295580490994, "learning_rate": 9.5e-06, "loss": 4.0015, "step": 19 }, { "epoch": 0.029482218536944906, "grad_norm": 8.010409258841937, "learning_rate": 1e-05, "loss": 3.6088, "step": 20 }, { "epoch": 0.03095632946379215, "grad_norm": 8.680559418401918, "learning_rate": 1.05e-05, "loss": 3.742, "step": 21 }, { "epoch": 0.032430440390639394, "grad_norm": 7.291808499364629, "learning_rate": 1.1000000000000001e-05, "loss": 3.562, "step": 22 }, { "epoch": 0.03390455131748664, "grad_norm": 8.470053934873613, "learning_rate": 1.1500000000000002e-05, "loss": 3.4202, "step": 23 }, { "epoch": 0.035378662244333885, "grad_norm": 5.5291357795103675, "learning_rate": 1.2e-05, "loss": 3.3375, "step": 24 }, { "epoch": 0.03685277317118113, "grad_norm": 7.774255021395639, "learning_rate": 1.25e-05, "loss": 2.9184, "step": 25 }, { "epoch": 0.038326884098028376, "grad_norm": 5.968444177272398, "learning_rate": 1.3000000000000001e-05, "loss": 2.9923, "step": 26 }, { "epoch": 0.03980099502487562, "grad_norm": 6.11696767594729, "learning_rate": 1.3500000000000001e-05, "loss": 2.8022, "step": 27 }, { "epoch": 0.04127510595172287, "grad_norm": 12.695955512561987, "learning_rate": 1.4000000000000001e-05, "loss": 2.7045, "step": 28 }, { "epoch": 0.04274921687857011, "grad_norm": 5.9081030466730535, "learning_rate": 1.45e-05, "loss": 2.6329, "step": 29 }, { "epoch": 0.04422332780541736, "grad_norm": 7.4316778395093195, "learning_rate": 1.5e-05, "loss": 2.2996, "step": 30 }, { "epoch": 0.0456974387322646, "grad_norm": 7.974483327362275, "learning_rate": 1.55e-05, "loss": 2.2619, "step": 31 }, { "epoch": 0.04717154965911185, "grad_norm": 5.284389668629848, "learning_rate": 1.6000000000000003e-05, "loss": 2.3616, "step": 32 }, { "epoch": 0.048645660585959094, "grad_norm": 4.358278854545061, "learning_rate": 1.65e-05, "loss": 2.1357, "step": 33 }, { "epoch": 0.05011977151280634, "grad_norm": 30.165881517284454, "learning_rate": 1.7000000000000003e-05, "loss": 2.2622, "step": 34 }, { "epoch": 0.051593882439653585, "grad_norm": 9.057439403999837, "learning_rate": 1.75e-05, "loss": 2.2881, "step": 35 }, { "epoch": 0.05306799336650083, "grad_norm": 4.739699991767717, "learning_rate": 1.8e-05, "loss": 2.0831, "step": 36 }, { "epoch": 0.054542104293348076, "grad_norm": 4.295736932027445, "learning_rate": 1.85e-05, "loss": 2.0218, "step": 37 }, { "epoch": 0.05601621522019532, "grad_norm": 7.3604427138154795, "learning_rate": 1.9e-05, "loss": 2.0171, "step": 38 }, { "epoch": 0.05749032614704257, "grad_norm": 3.8708303845041554, "learning_rate": 1.9500000000000003e-05, "loss": 1.9946, "step": 39 }, { "epoch": 0.05896443707388981, "grad_norm": 5.668513202873602, "learning_rate": 2e-05, "loss": 1.8514, "step": 40 }, { "epoch": 0.06043854800073706, "grad_norm": 3.8249670588252958, "learning_rate": 2.05e-05, "loss": 1.7843, "step": 41 }, { "epoch": 0.0619126589275843, "grad_norm": 5.094941731158118, "learning_rate": 2.1e-05, "loss": 1.8313, "step": 42 }, { "epoch": 0.06338676985443155, "grad_norm": 3.548917879471766, "learning_rate": 2.15e-05, "loss": 1.8234, "step": 43 }, { "epoch": 0.06486088078127879, "grad_norm": 8.040992616329117, "learning_rate": 2.2000000000000003e-05, "loss": 1.7645, "step": 44 }, { "epoch": 0.06633499170812604, "grad_norm": 9.843767051076503, "learning_rate": 2.25e-05, "loss": 1.763, "step": 45 }, { "epoch": 0.06780910263497328, "grad_norm": 4.052267009949107, "learning_rate": 2.3000000000000003e-05, "loss": 1.6715, "step": 46 }, { "epoch": 0.06928321356182053, "grad_norm": 4.571191739010181, "learning_rate": 2.35e-05, "loss": 1.7366, "step": 47 }, { "epoch": 0.07075732448866777, "grad_norm": 3.787808114288387, "learning_rate": 2.4e-05, "loss": 1.5895, "step": 48 }, { "epoch": 0.07223143541551502, "grad_norm": 3.3567836969374274, "learning_rate": 2.45e-05, "loss": 1.8296, "step": 49 }, { "epoch": 0.07370554634236226, "grad_norm": 3.9961612878426402, "learning_rate": 2.5e-05, "loss": 1.734, "step": 50 }, { "epoch": 0.07517965726920951, "grad_norm": 3.020795589518156, "learning_rate": 2.5500000000000003e-05, "loss": 1.6975, "step": 51 }, { "epoch": 0.07665376819605675, "grad_norm": 2.630653832101553, "learning_rate": 2.6000000000000002e-05, "loss": 1.6182, "step": 52 }, { "epoch": 0.078127879122904, "grad_norm": 2.6069766232371983, "learning_rate": 2.6500000000000004e-05, "loss": 1.584, "step": 53 }, { "epoch": 0.07960199004975124, "grad_norm": 2.616702908874404, "learning_rate": 2.7000000000000002e-05, "loss": 1.6532, "step": 54 }, { "epoch": 0.0810761009765985, "grad_norm": 2.643252899113491, "learning_rate": 2.7500000000000004e-05, "loss": 1.6537, "step": 55 }, { "epoch": 0.08255021190344573, "grad_norm": 2.6651239601778403, "learning_rate": 2.8000000000000003e-05, "loss": 1.6673, "step": 56 }, { "epoch": 0.08402432283029299, "grad_norm": 2.872628934272325, "learning_rate": 2.8499999999999998e-05, "loss": 1.687, "step": 57 }, { "epoch": 0.08549843375714022, "grad_norm": 2.8071359981673547, "learning_rate": 2.9e-05, "loss": 1.4657, "step": 58 }, { "epoch": 0.08697254468398748, "grad_norm": 3.0532730613267995, "learning_rate": 2.95e-05, "loss": 1.5616, "step": 59 }, { "epoch": 0.08844665561083472, "grad_norm": 2.8058193122672814, "learning_rate": 3e-05, "loss": 1.4097, "step": 60 }, { "epoch": 0.08992076653768195, "grad_norm": 3.362794634875531, "learning_rate": 3.05e-05, "loss": 1.6319, "step": 61 }, { "epoch": 0.0913948774645292, "grad_norm": 2.802536588595021, "learning_rate": 3.1e-05, "loss": 1.5441, "step": 62 }, { "epoch": 0.09286898839137644, "grad_norm": 2.762948330363296, "learning_rate": 3.15e-05, "loss": 1.4919, "step": 63 }, { "epoch": 0.0943430993182237, "grad_norm": 2.7457759667879023, "learning_rate": 3.2000000000000005e-05, "loss": 1.3134, "step": 64 }, { "epoch": 0.09581721024507094, "grad_norm": 2.883746897708595, "learning_rate": 3.2500000000000004e-05, "loss": 1.5037, "step": 65 }, { "epoch": 0.09729132117191819, "grad_norm": 2.6798251369706922, "learning_rate": 3.3e-05, "loss": 1.5308, "step": 66 }, { "epoch": 0.09876543209876543, "grad_norm": 2.968570904599923, "learning_rate": 3.35e-05, "loss": 1.4532, "step": 67 }, { "epoch": 0.10023954302561268, "grad_norm": 2.8889245882844423, "learning_rate": 3.4000000000000007e-05, "loss": 1.379, "step": 68 }, { "epoch": 0.10171365395245992, "grad_norm": 2.8759515265782105, "learning_rate": 3.45e-05, "loss": 1.4253, "step": 69 }, { "epoch": 0.10318776487930717, "grad_norm": 2.922180975738543, "learning_rate": 3.5e-05, "loss": 1.5606, "step": 70 }, { "epoch": 0.10466187580615441, "grad_norm": 2.874893020629797, "learning_rate": 3.55e-05, "loss": 1.5087, "step": 71 }, { "epoch": 0.10613598673300166, "grad_norm": 3.1463819282282386, "learning_rate": 3.6e-05, "loss": 1.5685, "step": 72 }, { "epoch": 0.1076100976598489, "grad_norm": 2.998953954797914, "learning_rate": 3.65e-05, "loss": 1.5271, "step": 73 }, { "epoch": 0.10908420858669615, "grad_norm": 2.635127196164508, "learning_rate": 3.7e-05, "loss": 1.397, "step": 74 }, { "epoch": 0.11055831951354339, "grad_norm": 2.6610494898079273, "learning_rate": 3.7500000000000003e-05, "loss": 1.45, "step": 75 }, { "epoch": 0.11203243044039064, "grad_norm": 2.526749931748358, "learning_rate": 3.8e-05, "loss": 1.4083, "step": 76 }, { "epoch": 0.11350654136723788, "grad_norm": 2.760989598730958, "learning_rate": 3.85e-05, "loss": 1.3441, "step": 77 }, { "epoch": 0.11498065229408513, "grad_norm": 2.5333761069384355, "learning_rate": 3.9000000000000006e-05, "loss": 1.2841, "step": 78 }, { "epoch": 0.11645476322093237, "grad_norm": 3.141754099255769, "learning_rate": 3.9500000000000005e-05, "loss": 1.5838, "step": 79 }, { "epoch": 0.11792887414777962, "grad_norm": 2.5431364739534894, "learning_rate": 4e-05, "loss": 1.4258, "step": 80 }, { "epoch": 0.11940298507462686, "grad_norm": 3.232445920180582, "learning_rate": 4.05e-05, "loss": 1.6264, "step": 81 }, { "epoch": 0.12087709600147412, "grad_norm": 2.7624127542518906, "learning_rate": 4.1e-05, "loss": 1.4753, "step": 82 }, { "epoch": 0.12235120692832135, "grad_norm": 2.5812944754770655, "learning_rate": 4.15e-05, "loss": 1.515, "step": 83 }, { "epoch": 0.1238253178551686, "grad_norm": 2.5897304228186213, "learning_rate": 4.2e-05, "loss": 1.3401, "step": 84 }, { "epoch": 0.12529942878201586, "grad_norm": 2.9717454755698416, "learning_rate": 4.25e-05, "loss": 1.4418, "step": 85 }, { "epoch": 0.1267735397088631, "grad_norm": 2.6008414337445998, "learning_rate": 4.3e-05, "loss": 1.3529, "step": 86 }, { "epoch": 0.12824765063571034, "grad_norm": 2.674881698987202, "learning_rate": 4.35e-05, "loss": 1.3422, "step": 87 }, { "epoch": 0.12972176156255757, "grad_norm": 2.6284419609107257, "learning_rate": 4.4000000000000006e-05, "loss": 1.2314, "step": 88 }, { "epoch": 0.13119587248940484, "grad_norm": 3.0221881503010883, "learning_rate": 4.4500000000000004e-05, "loss": 1.3267, "step": 89 }, { "epoch": 0.13266998341625208, "grad_norm": 2.5827139040018188, "learning_rate": 4.5e-05, "loss": 1.2807, "step": 90 }, { "epoch": 0.13414409434309932, "grad_norm": 3.0792619958005747, "learning_rate": 4.55e-05, "loss": 1.1765, "step": 91 }, { "epoch": 0.13561820526994656, "grad_norm": 2.971480709839415, "learning_rate": 4.600000000000001e-05, "loss": 1.5149, "step": 92 }, { "epoch": 0.1370923161967938, "grad_norm": 2.7912378883355293, "learning_rate": 4.6500000000000005e-05, "loss": 1.4913, "step": 93 }, { "epoch": 0.13856642712364106, "grad_norm": 2.9691198821284877, "learning_rate": 4.7e-05, "loss": 1.3014, "step": 94 }, { "epoch": 0.1400405380504883, "grad_norm": 3.0151415817901066, "learning_rate": 4.75e-05, "loss": 1.3729, "step": 95 }, { "epoch": 0.14151464897733554, "grad_norm": 2.5810563502051966, "learning_rate": 4.8e-05, "loss": 1.3562, "step": 96 }, { "epoch": 0.14298875990418278, "grad_norm": 2.8521803095185567, "learning_rate": 4.85e-05, "loss": 1.2032, "step": 97 }, { "epoch": 0.14446287083103004, "grad_norm": 2.6382007192352033, "learning_rate": 4.9e-05, "loss": 1.3329, "step": 98 }, { "epoch": 0.14593698175787728, "grad_norm": 2.6292649997835733, "learning_rate": 4.9500000000000004e-05, "loss": 1.4653, "step": 99 }, { "epoch": 0.14741109268472452, "grad_norm": 2.741970564474851, "learning_rate": 5e-05, "loss": 1.352, "step": 100 }, { "epoch": 0.14888520361157176, "grad_norm": 2.388201613466443, "learning_rate": 4.999999216450553e-05, "loss": 1.3584, "step": 101 }, { "epoch": 0.15035931453841903, "grad_norm": 2.6931267320437473, "learning_rate": 4.9999968658027006e-05, "loss": 1.3016, "step": 102 }, { "epoch": 0.15183342546526626, "grad_norm": 2.3918684175399263, "learning_rate": 4.999992948057919e-05, "loss": 1.2581, "step": 103 }, { "epoch": 0.1533075363921135, "grad_norm": 3.4190918913612522, "learning_rate": 4.999987463218663e-05, "loss": 1.3818, "step": 104 }, { "epoch": 0.15478164731896074, "grad_norm": 2.6129959475299347, "learning_rate": 4.9999804112883694e-05, "loss": 1.3279, "step": 105 }, { "epoch": 0.156255758245808, "grad_norm": 2.7158528819457968, "learning_rate": 4.99997179227146e-05, "loss": 1.2847, "step": 106 }, { "epoch": 0.15772986917265525, "grad_norm": 2.588405480182749, "learning_rate": 4.999961606173337e-05, "loss": 1.3243, "step": 107 }, { "epoch": 0.15920398009950248, "grad_norm": 2.4731689692925123, "learning_rate": 4.9999498530003866e-05, "loss": 1.1902, "step": 108 }, { "epoch": 0.16067809102634972, "grad_norm": 3.1335588360526927, "learning_rate": 4.999936532759974e-05, "loss": 1.4942, "step": 109 }, { "epoch": 0.162152201953197, "grad_norm": 2.441183778926256, "learning_rate": 4.9999216454604505e-05, "loss": 1.2513, "step": 110 }, { "epoch": 0.16362631288004423, "grad_norm": 2.882690757930752, "learning_rate": 4.9999051911111484e-05, "loss": 1.2638, "step": 111 }, { "epoch": 0.16510042380689147, "grad_norm": 2.5888491370551416, "learning_rate": 4.99988716972238e-05, "loss": 1.1867, "step": 112 }, { "epoch": 0.1665745347337387, "grad_norm": 2.580021293759263, "learning_rate": 4.999867581305444e-05, "loss": 1.2432, "step": 113 }, { "epoch": 0.16804864566058597, "grad_norm": 4.01611348902925, "learning_rate": 4.9998464258726174e-05, "loss": 1.4301, "step": 114 }, { "epoch": 0.1695227565874332, "grad_norm": 2.80917696143427, "learning_rate": 4.999823703437162e-05, "loss": 1.2639, "step": 115 }, { "epoch": 0.17099686751428045, "grad_norm": 2.6949986371831334, "learning_rate": 4.999799414013322e-05, "loss": 1.4286, "step": 116 }, { "epoch": 0.1724709784411277, "grad_norm": 2.8850236388614827, "learning_rate": 4.9997735576163215e-05, "loss": 1.1949, "step": 117 }, { "epoch": 0.17394508936797495, "grad_norm": 2.526128509182295, "learning_rate": 4.9997461342623686e-05, "loss": 1.1675, "step": 118 }, { "epoch": 0.1754192002948222, "grad_norm": 2.7933005395965718, "learning_rate": 4.999717143968654e-05, "loss": 1.0644, "step": 119 }, { "epoch": 0.17689331122166943, "grad_norm": 3.744406533984798, "learning_rate": 4.9996865867533496e-05, "loss": 1.2864, "step": 120 }, { "epoch": 0.17836742214851667, "grad_norm": 2.999463987149217, "learning_rate": 4.99965446263561e-05, "loss": 1.2394, "step": 121 }, { "epoch": 0.1798415330753639, "grad_norm": 2.5701727089942494, "learning_rate": 4.9996207716355726e-05, "loss": 1.3617, "step": 122 }, { "epoch": 0.18131564400221117, "grad_norm": 2.758865458013036, "learning_rate": 4.999585513774354e-05, "loss": 1.3225, "step": 123 }, { "epoch": 0.1827897549290584, "grad_norm": 2.559691681325797, "learning_rate": 4.9995486890740573e-05, "loss": 1.192, "step": 124 }, { "epoch": 0.18426386585590565, "grad_norm": 2.6347457172385416, "learning_rate": 4.9995102975577655e-05, "loss": 1.2576, "step": 125 }, { "epoch": 0.1857379767827529, "grad_norm": 2.2560155773166137, "learning_rate": 4.999470339249543e-05, "loss": 1.2124, "step": 126 }, { "epoch": 0.18721208770960016, "grad_norm": 2.647366467531351, "learning_rate": 4.9994288141744374e-05, "loss": 1.1136, "step": 127 }, { "epoch": 0.1886861986364474, "grad_norm": 2.834643864943964, "learning_rate": 4.999385722358479e-05, "loss": 1.4052, "step": 128 }, { "epoch": 0.19016030956329463, "grad_norm": 2.458628706779015, "learning_rate": 4.999341063828679e-05, "loss": 1.177, "step": 129 }, { "epoch": 0.19163442049014187, "grad_norm": 2.751484903501078, "learning_rate": 4.9992948386130315e-05, "loss": 1.3057, "step": 130 }, { "epoch": 0.19310853141698914, "grad_norm": 2.5093800051818564, "learning_rate": 4.9992470467405104e-05, "loss": 1.275, "step": 131 }, { "epoch": 0.19458264234383638, "grad_norm": 2.7990683368001483, "learning_rate": 4.999197688241076e-05, "loss": 1.3966, "step": 132 }, { "epoch": 0.19605675327068361, "grad_norm": 2.6884193622172816, "learning_rate": 4.999146763145668e-05, "loss": 1.2051, "step": 133 }, { "epoch": 0.19753086419753085, "grad_norm": 2.5776831970822682, "learning_rate": 4.9990942714862066e-05, "loss": 1.2228, "step": 134 }, { "epoch": 0.19900497512437812, "grad_norm": 2.9822808552736015, "learning_rate": 4.999040213295597e-05, "loss": 1.1476, "step": 135 }, { "epoch": 0.20047908605122536, "grad_norm": 3.14454050003492, "learning_rate": 4.9989845886077246e-05, "loss": 1.3674, "step": 136 }, { "epoch": 0.2019531969780726, "grad_norm": 2.9668872008345017, "learning_rate": 4.9989273974574566e-05, "loss": 1.1274, "step": 137 }, { "epoch": 0.20342730790491984, "grad_norm": 3.174729739724011, "learning_rate": 4.998868639880644e-05, "loss": 1.4377, "step": 138 }, { "epoch": 0.2049014188317671, "grad_norm": 3.066470783307803, "learning_rate": 4.998808315914117e-05, "loss": 1.3482, "step": 139 }, { "epoch": 0.20637552975861434, "grad_norm": 2.6062675749920294, "learning_rate": 4.9987464255956894e-05, "loss": 1.1379, "step": 140 }, { "epoch": 0.20784964068546158, "grad_norm": 2.1219995296380674, "learning_rate": 4.9986829689641574e-05, "loss": 1.1073, "step": 141 }, { "epoch": 0.20932375161230882, "grad_norm": 2.485235581005737, "learning_rate": 4.998617946059297e-05, "loss": 1.2949, "step": 142 }, { "epoch": 0.21079786253915608, "grad_norm": 2.9615579322456895, "learning_rate": 4.998551356921868e-05, "loss": 1.4175, "step": 143 }, { "epoch": 0.21227197346600332, "grad_norm": 2.812857202522849, "learning_rate": 4.99848320159361e-05, "loss": 1.2366, "step": 144 }, { "epoch": 0.21374608439285056, "grad_norm": 2.9395300161247278, "learning_rate": 4.9984134801172464e-05, "loss": 1.2099, "step": 145 }, { "epoch": 0.2152201953196978, "grad_norm": 2.428532660198918, "learning_rate": 4.998342192536482e-05, "loss": 1.0984, "step": 146 }, { "epoch": 0.21669430624654507, "grad_norm": 2.7115883169674024, "learning_rate": 4.998269338896e-05, "loss": 1.3003, "step": 147 }, { "epoch": 0.2181684171733923, "grad_norm": 2.47349700374202, "learning_rate": 4.998194919241471e-05, "loss": 1.1756, "step": 148 }, { "epoch": 0.21964252810023954, "grad_norm": 2.5628669289980612, "learning_rate": 4.9981189336195425e-05, "loss": 1.0473, "step": 149 }, { "epoch": 0.22111663902708678, "grad_norm": 2.8927861563233153, "learning_rate": 4.998041382077846e-05, "loss": 1.2406, "step": 150 }, { "epoch": 0.22259074995393405, "grad_norm": 2.3343715968994503, "learning_rate": 4.9979622646649935e-05, "loss": 1.1461, "step": 151 }, { "epoch": 0.22406486088078129, "grad_norm": 2.9411082731082017, "learning_rate": 4.997881581430579e-05, "loss": 1.2911, "step": 152 }, { "epoch": 0.22553897180762852, "grad_norm": 2.7831746846134617, "learning_rate": 4.997799332425178e-05, "loss": 1.3143, "step": 153 }, { "epoch": 0.22701308273447576, "grad_norm": 2.271918454600752, "learning_rate": 4.997715517700347e-05, "loss": 1.1463, "step": 154 }, { "epoch": 0.228487193661323, "grad_norm": 2.4575881690724035, "learning_rate": 4.9976301373086254e-05, "loss": 1.1363, "step": 155 }, { "epoch": 0.22996130458817027, "grad_norm": 2.469231474076661, "learning_rate": 4.997543191303532e-05, "loss": 1.2225, "step": 156 }, { "epoch": 0.2314354155150175, "grad_norm": 2.666362943836904, "learning_rate": 4.9974546797395685e-05, "loss": 1.3384, "step": 157 }, { "epoch": 0.23290952644186474, "grad_norm": 2.4321610104567775, "learning_rate": 4.9973646026722166e-05, "loss": 1.0816, "step": 158 }, { "epoch": 0.23438363736871198, "grad_norm": 2.573292404929827, "learning_rate": 4.997272960157942e-05, "loss": 1.2069, "step": 159 }, { "epoch": 0.23585774829555925, "grad_norm": 2.44890508514067, "learning_rate": 4.997179752254188e-05, "loss": 1.1721, "step": 160 }, { "epoch": 0.2373318592224065, "grad_norm": 2.6551187237992733, "learning_rate": 4.997084979019382e-05, "loss": 1.291, "step": 161 }, { "epoch": 0.23880597014925373, "grad_norm": 2.376373094865058, "learning_rate": 4.996988640512931e-05, "loss": 1.2192, "step": 162 }, { "epoch": 0.24028008107610097, "grad_norm": 2.4300351400150086, "learning_rate": 4.9968907367952245e-05, "loss": 1.2309, "step": 163 }, { "epoch": 0.24175419200294823, "grad_norm": 2.259331426556457, "learning_rate": 4.9967912679276316e-05, "loss": 1.1891, "step": 164 }, { "epoch": 0.24322830292979547, "grad_norm": 2.5094337808035574, "learning_rate": 4.996690233972505e-05, "loss": 1.2203, "step": 165 }, { "epoch": 0.2447024138566427, "grad_norm": 2.569055870390044, "learning_rate": 4.996587634993175e-05, "loss": 1.1868, "step": 166 }, { "epoch": 0.24617652478348995, "grad_norm": 2.5428395509494024, "learning_rate": 4.996483471053955e-05, "loss": 1.0991, "step": 167 }, { "epoch": 0.2476506357103372, "grad_norm": 2.5207541166947434, "learning_rate": 4.996377742220139e-05, "loss": 1.2177, "step": 168 }, { "epoch": 0.24912474663718445, "grad_norm": 3.0450906859675957, "learning_rate": 4.9962704485580034e-05, "loss": 1.1947, "step": 169 }, { "epoch": 0.2505988575640317, "grad_norm": 2.2842141320484393, "learning_rate": 4.996161590134802e-05, "loss": 1.0642, "step": 170 }, { "epoch": 0.25207296849087896, "grad_norm": 2.5143749851318784, "learning_rate": 4.996051167018773e-05, "loss": 1.2072, "step": 171 }, { "epoch": 0.2535470794177262, "grad_norm": 2.4371171308254334, "learning_rate": 4.995939179279134e-05, "loss": 1.164, "step": 172 }, { "epoch": 0.25502119034457343, "grad_norm": 3.801411873388641, "learning_rate": 4.9958256269860826e-05, "loss": 1.2979, "step": 173 }, { "epoch": 0.2564953012714207, "grad_norm": 3.0327959346171434, "learning_rate": 4.995710510210798e-05, "loss": 1.3545, "step": 174 }, { "epoch": 0.2579694121982679, "grad_norm": 2.430653959431846, "learning_rate": 4.9955938290254404e-05, "loss": 1.2238, "step": 175 }, { "epoch": 0.25944352312511515, "grad_norm": 2.800925789778478, "learning_rate": 4.99547558350315e-05, "loss": 1.2779, "step": 176 }, { "epoch": 0.2609176340519624, "grad_norm": 2.153076047299329, "learning_rate": 4.9953557737180477e-05, "loss": 1.0717, "step": 177 }, { "epoch": 0.2623917449788097, "grad_norm": 2.719867432813831, "learning_rate": 4.9952343997452355e-05, "loss": 1.0264, "step": 178 }, { "epoch": 0.2638658559056569, "grad_norm": 2.7520072720509194, "learning_rate": 4.995111461660794e-05, "loss": 1.1534, "step": 179 }, { "epoch": 0.26533996683250416, "grad_norm": 2.746800381945662, "learning_rate": 4.9949869595417876e-05, "loss": 1.3368, "step": 180 }, { "epoch": 0.2668140777593514, "grad_norm": 2.9133587925872955, "learning_rate": 4.994860893466258e-05, "loss": 1.1338, "step": 181 }, { "epoch": 0.26828818868619864, "grad_norm": 2.3364475675741043, "learning_rate": 4.994733263513228e-05, "loss": 1.1898, "step": 182 }, { "epoch": 0.2697622996130459, "grad_norm": 2.8507798048262987, "learning_rate": 4.994604069762702e-05, "loss": 1.2877, "step": 183 }, { "epoch": 0.2712364105398931, "grad_norm": 2.6185781854972388, "learning_rate": 4.994473312295663e-05, "loss": 1.3653, "step": 184 }, { "epoch": 0.27271052146674035, "grad_norm": 2.600030121262007, "learning_rate": 4.994340991194076e-05, "loss": 1.1204, "step": 185 }, { "epoch": 0.2741846323935876, "grad_norm": 2.5929487898271844, "learning_rate": 4.994207106540884e-05, "loss": 1.3522, "step": 186 }, { "epoch": 0.2756587433204349, "grad_norm": 2.598500233694055, "learning_rate": 4.994071658420012e-05, "loss": 1.3748, "step": 187 }, { "epoch": 0.2771328542472821, "grad_norm": 2.3823643997847634, "learning_rate": 4.993934646916364e-05, "loss": 1.2487, "step": 188 }, { "epoch": 0.27860696517412936, "grad_norm": 2.213214254661336, "learning_rate": 4.993796072115824e-05, "loss": 1.1578, "step": 189 }, { "epoch": 0.2800810761009766, "grad_norm": 2.520426462509367, "learning_rate": 4.993655934105256e-05, "loss": 1.1448, "step": 190 }, { "epoch": 0.28155518702782384, "grad_norm": 2.3621492559448565, "learning_rate": 4.993514232972504e-05, "loss": 1.2412, "step": 191 }, { "epoch": 0.2830292979546711, "grad_norm": 2.311433804075154, "learning_rate": 4.9933709688063935e-05, "loss": 1.1895, "step": 192 }, { "epoch": 0.2845034088815183, "grad_norm": 2.534655881620308, "learning_rate": 4.993226141696726e-05, "loss": 1.159, "step": 193 }, { "epoch": 0.28597751980836555, "grad_norm": 2.6335128622383737, "learning_rate": 4.9930797517342853e-05, "loss": 1.2183, "step": 194 }, { "epoch": 0.28745163073521285, "grad_norm": 2.5631942738917917, "learning_rate": 4.992931799010836e-05, "loss": 1.106, "step": 195 }, { "epoch": 0.2889257416620601, "grad_norm": 2.7367253256894934, "learning_rate": 4.992782283619118e-05, "loss": 1.2009, "step": 196 }, { "epoch": 0.2903998525889073, "grad_norm": 2.438065781097837, "learning_rate": 4.992631205652857e-05, "loss": 1.2107, "step": 197 }, { "epoch": 0.29187396351575456, "grad_norm": 2.6497460117621396, "learning_rate": 4.992478565206752e-05, "loss": 1.2392, "step": 198 }, { "epoch": 0.2933480744426018, "grad_norm": 2.5775763650380057, "learning_rate": 4.992324362376484e-05, "loss": 1.3215, "step": 199 }, { "epoch": 0.29482218536944904, "grad_norm": 2.4910592422148206, "learning_rate": 4.992168597258715e-05, "loss": 1.1189, "step": 200 }, { "epoch": 0.2962962962962963, "grad_norm": 2.440007411757922, "learning_rate": 4.992011269951083e-05, "loss": 1.1458, "step": 201 }, { "epoch": 0.2977704072231435, "grad_norm": 2.5018042729310683, "learning_rate": 4.991852380552209e-05, "loss": 1.1312, "step": 202 }, { "epoch": 0.2992445181499908, "grad_norm": 2.483478696353504, "learning_rate": 4.99169192916169e-05, "loss": 1.1624, "step": 203 }, { "epoch": 0.30071862907683805, "grad_norm": 2.1804066773903883, "learning_rate": 4.991529915880103e-05, "loss": 1.022, "step": 204 }, { "epoch": 0.3021927400036853, "grad_norm": 2.4654430984679907, "learning_rate": 4.991366340809005e-05, "loss": 1.3237, "step": 205 }, { "epoch": 0.30366685093053253, "grad_norm": 2.309610262438573, "learning_rate": 4.99120120405093e-05, "loss": 1.3506, "step": 206 }, { "epoch": 0.30514096185737977, "grad_norm": 2.0380169171800104, "learning_rate": 4.9910345057093936e-05, "loss": 1.0345, "step": 207 }, { "epoch": 0.306615072784227, "grad_norm": 2.5171050462531936, "learning_rate": 4.990866245888889e-05, "loss": 1.2814, "step": 208 }, { "epoch": 0.30808918371107424, "grad_norm": 2.586295664897165, "learning_rate": 4.9906964246948874e-05, "loss": 1.2166, "step": 209 }, { "epoch": 0.3095632946379215, "grad_norm": 2.484484112340805, "learning_rate": 4.99052504223384e-05, "loss": 1.167, "step": 210 }, { "epoch": 0.3110374055647688, "grad_norm": 2.251380708702999, "learning_rate": 4.990352098613176e-05, "loss": 1.1655, "step": 211 }, { "epoch": 0.312511516491616, "grad_norm": 2.09931161587771, "learning_rate": 4.9901775939413026e-05, "loss": 1.1222, "step": 212 }, { "epoch": 0.31398562741846325, "grad_norm": 2.5440053863168473, "learning_rate": 4.990001528327607e-05, "loss": 1.1737, "step": 213 }, { "epoch": 0.3154597383453105, "grad_norm": 2.185788602304634, "learning_rate": 4.989823901882454e-05, "loss": 1.0243, "step": 214 }, { "epoch": 0.31693384927215773, "grad_norm": 2.3766954293278393, "learning_rate": 4.989644714717187e-05, "loss": 1.1168, "step": 215 }, { "epoch": 0.31840796019900497, "grad_norm": 2.44845681149402, "learning_rate": 4.989463966944127e-05, "loss": 1.1837, "step": 216 }, { "epoch": 0.3198820711258522, "grad_norm": 2.37183178848489, "learning_rate": 4.989281658676573e-05, "loss": 1.0781, "step": 217 }, { "epoch": 0.32135618205269945, "grad_norm": 2.54934666061642, "learning_rate": 4.989097790028806e-05, "loss": 1.2591, "step": 218 }, { "epoch": 0.3228302929795467, "grad_norm": 2.4959485604291998, "learning_rate": 4.98891236111608e-05, "loss": 1.1265, "step": 219 }, { "epoch": 0.324304403906394, "grad_norm": 2.474835392265723, "learning_rate": 4.988725372054629e-05, "loss": 1.2131, "step": 220 }, { "epoch": 0.3257785148332412, "grad_norm": 2.432955883311319, "learning_rate": 4.988536822961666e-05, "loss": 1.2803, "step": 221 }, { "epoch": 0.32725262576008846, "grad_norm": 2.6380975127804653, "learning_rate": 4.988346713955381e-05, "loss": 1.2412, "step": 222 }, { "epoch": 0.3287267366869357, "grad_norm": 2.5825642599731897, "learning_rate": 4.9881550451549405e-05, "loss": 1.1386, "step": 223 }, { "epoch": 0.33020084761378293, "grad_norm": 2.7065798340416216, "learning_rate": 4.987961816680492e-05, "loss": 1.3316, "step": 224 }, { "epoch": 0.33167495854063017, "grad_norm": 2.745105636176114, "learning_rate": 4.9877670286531585e-05, "loss": 1.3274, "step": 225 }, { "epoch": 0.3331490694674774, "grad_norm": 2.465784247106301, "learning_rate": 4.98757068119504e-05, "loss": 1.2512, "step": 226 }, { "epoch": 0.33462318039432465, "grad_norm": 2.2387644727144322, "learning_rate": 4.9873727744292144e-05, "loss": 1.2975, "step": 227 }, { "epoch": 0.33609729132117194, "grad_norm": 2.6891713822446306, "learning_rate": 4.987173308479738e-05, "loss": 1.3597, "step": 228 }, { "epoch": 0.3375714022480192, "grad_norm": 2.4096156770700263, "learning_rate": 4.9869722834716446e-05, "loss": 1.1279, "step": 229 }, { "epoch": 0.3390455131748664, "grad_norm": 2.5007229713291905, "learning_rate": 4.9867696995309445e-05, "loss": 1.0225, "step": 230 }, { "epoch": 0.34051962410171366, "grad_norm": 2.8904507971157156, "learning_rate": 4.986565556784625e-05, "loss": 1.2372, "step": 231 }, { "epoch": 0.3419937350285609, "grad_norm": 2.471575218956074, "learning_rate": 4.98635985536065e-05, "loss": 1.3066, "step": 232 }, { "epoch": 0.34346784595540814, "grad_norm": 2.9593176122174634, "learning_rate": 4.986152595387963e-05, "loss": 1.1745, "step": 233 }, { "epoch": 0.3449419568822554, "grad_norm": 2.289187170806402, "learning_rate": 4.9859437769964815e-05, "loss": 1.1151, "step": 234 }, { "epoch": 0.3464160678091026, "grad_norm": 2.6048711408788527, "learning_rate": 4.985733400317101e-05, "loss": 1.2614, "step": 235 }, { "epoch": 0.3478901787359499, "grad_norm": 2.5174008374349843, "learning_rate": 4.985521465481695e-05, "loss": 1.1495, "step": 236 }, { "epoch": 0.34936428966279715, "grad_norm": 2.5851742635164467, "learning_rate": 4.985307972623112e-05, "loss": 1.1895, "step": 237 }, { "epoch": 0.3508384005896444, "grad_norm": 2.3105944437529917, "learning_rate": 4.985092921875178e-05, "loss": 1.1287, "step": 238 }, { "epoch": 0.3523125115164916, "grad_norm": 2.3700906306454073, "learning_rate": 4.984876313372695e-05, "loss": 1.083, "step": 239 }, { "epoch": 0.35378662244333886, "grad_norm": 2.366936698695663, "learning_rate": 4.984658147251442e-05, "loss": 1.1453, "step": 240 }, { "epoch": 0.3552607333701861, "grad_norm": 2.535644014102465, "learning_rate": 4.984438423648174e-05, "loss": 1.1245, "step": 241 }, { "epoch": 0.35673484429703334, "grad_norm": 2.2704845008883927, "learning_rate": 4.9842171427006225e-05, "loss": 1.1442, "step": 242 }, { "epoch": 0.3582089552238806, "grad_norm": 2.189057476901078, "learning_rate": 4.983994304547495e-05, "loss": 1.1465, "step": 243 }, { "epoch": 0.3596830661507278, "grad_norm": 2.314889858583567, "learning_rate": 4.9837699093284765e-05, "loss": 1.0968, "step": 244 }, { "epoch": 0.3611571770775751, "grad_norm": 3.006652449794858, "learning_rate": 4.983543957184224e-05, "loss": 1.3805, "step": 245 }, { "epoch": 0.36263128800442235, "grad_norm": 2.4730773856414388, "learning_rate": 4.983316448256377e-05, "loss": 0.9247, "step": 246 }, { "epoch": 0.3641053989312696, "grad_norm": 2.6499892108625627, "learning_rate": 4.983087382687544e-05, "loss": 1.3119, "step": 247 }, { "epoch": 0.3655795098581168, "grad_norm": 2.1620065997196782, "learning_rate": 4.982856760621313e-05, "loss": 0.972, "step": 248 }, { "epoch": 0.36705362078496406, "grad_norm": 2.569414161526586, "learning_rate": 4.9826245822022474e-05, "loss": 1.23, "step": 249 }, { "epoch": 0.3685277317118113, "grad_norm": 2.416190878849829, "learning_rate": 4.9823908475758875e-05, "loss": 1.3144, "step": 250 }, { "epoch": 0.3685277317118113, "eval_bleu": 0.07815206714351704, "eval_bleu_1gram": 0.3882564630984295, "eval_bleu_2gram": 0.15862507008677815, "eval_bleu_3gram": 0.07262837239089334, "eval_bleu_4gram": 0.03454950679573124, "eval_rag_val_loss": 1.2124991666886114, "eval_rouge1": 0.3826912907804564, "eval_rouge2": 0.15044444078158314, "eval_rougeL": 0.36413418589775887, "step": 250 }, { "epoch": 0.37000184263865854, "grad_norm": 2.255903235335249, "learning_rate": 4.982155556888745e-05, "loss": 1.1667, "step": 251 }, { "epoch": 0.3714759535655058, "grad_norm": 2.2837381250065425, "learning_rate": 4.981918710288309e-05, "loss": 1.1563, "step": 252 }, { "epoch": 0.3729500644923531, "grad_norm": 2.5546386900739035, "learning_rate": 4.981680307923047e-05, "loss": 1.2122, "step": 253 }, { "epoch": 0.3744241754192003, "grad_norm": 2.488150648585196, "learning_rate": 4.981440349942397e-05, "loss": 1.154, "step": 254 }, { "epoch": 0.37589828634604755, "grad_norm": 2.3938026004121538, "learning_rate": 4.981198836496775e-05, "loss": 1.1162, "step": 255 }, { "epoch": 0.3773723972728948, "grad_norm": 2.444997748937526, "learning_rate": 4.9809557677375704e-05, "loss": 1.2319, "step": 256 }, { "epoch": 0.378846508199742, "grad_norm": 2.323748680471422, "learning_rate": 4.98071114381715e-05, "loss": 1.0432, "step": 257 }, { "epoch": 0.38032061912658927, "grad_norm": 2.370670890494676, "learning_rate": 4.980464964888852e-05, "loss": 1.1801, "step": 258 }, { "epoch": 0.3817947300534365, "grad_norm": 2.594772596887698, "learning_rate": 4.980217231106991e-05, "loss": 1.2053, "step": 259 }, { "epoch": 0.38326884098028374, "grad_norm": 2.271965153097835, "learning_rate": 4.979967942626858e-05, "loss": 1.1525, "step": 260 }, { "epoch": 0.38474295190713104, "grad_norm": 2.854903597030878, "learning_rate": 4.979717099604715e-05, "loss": 1.1399, "step": 261 }, { "epoch": 0.3862170628339783, "grad_norm": 2.4574360476115316, "learning_rate": 4.979464702197801e-05, "loss": 1.2395, "step": 262 }, { "epoch": 0.3876911737608255, "grad_norm": 2.5222345561464903, "learning_rate": 4.9792107505643304e-05, "loss": 1.0665, "step": 263 }, { "epoch": 0.38916528468767275, "grad_norm": 2.8030832957351786, "learning_rate": 4.9789552448634874e-05, "loss": 1.2191, "step": 264 }, { "epoch": 0.39063939561452, "grad_norm": 2.5021311259621846, "learning_rate": 4.9786981852554346e-05, "loss": 1.1302, "step": 265 }, { "epoch": 0.39211350654136723, "grad_norm": 2.734298530599257, "learning_rate": 4.978439571901307e-05, "loss": 1.3491, "step": 266 }, { "epoch": 0.39358761746821447, "grad_norm": 2.7591787093473967, "learning_rate": 4.9781794049632135e-05, "loss": 1.1957, "step": 267 }, { "epoch": 0.3950617283950617, "grad_norm": 2.519668551703401, "learning_rate": 4.9779176846042366e-05, "loss": 1.1163, "step": 268 }, { "epoch": 0.396535839321909, "grad_norm": 2.4799679228031004, "learning_rate": 4.977654410988434e-05, "loss": 1.0915, "step": 269 }, { "epoch": 0.39800995024875624, "grad_norm": 2.7518975040400213, "learning_rate": 4.977389584280835e-05, "loss": 1.208, "step": 270 }, { "epoch": 0.3994840611756035, "grad_norm": 2.4662945281937807, "learning_rate": 4.9771232046474444e-05, "loss": 1.0541, "step": 271 }, { "epoch": 0.4009581721024507, "grad_norm": 2.316218196446748, "learning_rate": 4.976855272255239e-05, "loss": 1.1703, "step": 272 }, { "epoch": 0.40243228302929795, "grad_norm": 2.5952426338322767, "learning_rate": 4.976585787272168e-05, "loss": 1.2896, "step": 273 }, { "epoch": 0.4039063939561452, "grad_norm": 2.9181230088074166, "learning_rate": 4.976314749867158e-05, "loss": 1.293, "step": 274 }, { "epoch": 0.40538050488299243, "grad_norm": 2.415639713928496, "learning_rate": 4.976042160210104e-05, "loss": 1.2577, "step": 275 }, { "epoch": 0.40685461580983967, "grad_norm": 2.938724019314222, "learning_rate": 4.975768018471877e-05, "loss": 1.2693, "step": 276 }, { "epoch": 0.4083287267366869, "grad_norm": 2.7375577876989574, "learning_rate": 4.9754923248243195e-05, "loss": 1.1946, "step": 277 }, { "epoch": 0.4098028376635342, "grad_norm": 2.5431120051335325, "learning_rate": 4.975215079440247e-05, "loss": 1.3376, "step": 278 }, { "epoch": 0.41127694859038144, "grad_norm": 2.2796214313097405, "learning_rate": 4.974936282493448e-05, "loss": 1.1643, "step": 279 }, { "epoch": 0.4127510595172287, "grad_norm": 2.459177217201275, "learning_rate": 4.974655934158684e-05, "loss": 1.2067, "step": 280 }, { "epoch": 0.4142251704440759, "grad_norm": 2.2634356213332345, "learning_rate": 4.974374034611687e-05, "loss": 1.1517, "step": 281 }, { "epoch": 0.41569928137092316, "grad_norm": 2.273934640963388, "learning_rate": 4.9740905840291646e-05, "loss": 1.1407, "step": 282 }, { "epoch": 0.4171733922977704, "grad_norm": 2.182613145794636, "learning_rate": 4.9738055825887936e-05, "loss": 1.1754, "step": 283 }, { "epoch": 0.41864750322461763, "grad_norm": 2.285799584755562, "learning_rate": 4.973519030469225e-05, "loss": 0.9708, "step": 284 }, { "epoch": 0.4201216141514649, "grad_norm": 2.251712888821597, "learning_rate": 4.97323092785008e-05, "loss": 1.1466, "step": 285 }, { "epoch": 0.42159572507831217, "grad_norm": 2.3149267299583554, "learning_rate": 4.972941274911953e-05, "loss": 1.0894, "step": 286 }, { "epoch": 0.4230698360051594, "grad_norm": 2.727383377980554, "learning_rate": 4.97265007183641e-05, "loss": 1.0578, "step": 287 }, { "epoch": 0.42454394693200664, "grad_norm": 2.446817148433334, "learning_rate": 4.9723573188059894e-05, "loss": 1.1423, "step": 288 }, { "epoch": 0.4260180578588539, "grad_norm": 2.5856516700274654, "learning_rate": 4.972063016004199e-05, "loss": 1.0924, "step": 289 }, { "epoch": 0.4274921687857011, "grad_norm": 2.468863665704367, "learning_rate": 4.971767163615522e-05, "loss": 1.15, "step": 290 }, { "epoch": 0.42896627971254836, "grad_norm": 2.4806044654063544, "learning_rate": 4.971469761825407e-05, "loss": 1.2418, "step": 291 }, { "epoch": 0.4304403906393956, "grad_norm": 2.857628627762282, "learning_rate": 4.971170810820279e-05, "loss": 1.2869, "step": 292 }, { "epoch": 0.43191450156624284, "grad_norm": 2.7645877250707414, "learning_rate": 4.970870310787532e-05, "loss": 1.1219, "step": 293 }, { "epoch": 0.43338861249309013, "grad_norm": 2.730170630687477, "learning_rate": 4.970568261915531e-05, "loss": 1.1517, "step": 294 }, { "epoch": 0.43486272341993737, "grad_norm": 2.6722836544441853, "learning_rate": 4.970264664393614e-05, "loss": 1.0833, "step": 295 }, { "epoch": 0.4363368343467846, "grad_norm": 2.339549254669979, "learning_rate": 4.9699595184120853e-05, "loss": 1.2227, "step": 296 }, { "epoch": 0.43781094527363185, "grad_norm": 2.4958410000776206, "learning_rate": 4.9696528241622244e-05, "loss": 1.2216, "step": 297 }, { "epoch": 0.4392850562004791, "grad_norm": 2.621274029462891, "learning_rate": 4.9693445818362783e-05, "loss": 1.0675, "step": 298 }, { "epoch": 0.4407591671273263, "grad_norm": 2.4933516793757553, "learning_rate": 4.969034791627466e-05, "loss": 1.1887, "step": 299 }, { "epoch": 0.44223327805417356, "grad_norm": 2.179667811150996, "learning_rate": 4.9687234537299765e-05, "loss": 1.0745, "step": 300 }, { "epoch": 0.4437073889810208, "grad_norm": 2.3432803891505887, "learning_rate": 4.968410568338967e-05, "loss": 1.0867, "step": 301 }, { "epoch": 0.4451814999078681, "grad_norm": 2.5637933212034967, "learning_rate": 4.968096135650569e-05, "loss": 1.1198, "step": 302 }, { "epoch": 0.44665561083471533, "grad_norm": 2.321850610453477, "learning_rate": 4.9677801558618795e-05, "loss": 1.237, "step": 303 }, { "epoch": 0.44812972176156257, "grad_norm": 2.6350456750664266, "learning_rate": 4.967462629170969e-05, "loss": 1.1513, "step": 304 }, { "epoch": 0.4496038326884098, "grad_norm": 2.3256915386864345, "learning_rate": 4.967143555776873e-05, "loss": 1.1888, "step": 305 }, { "epoch": 0.45107794361525705, "grad_norm": 2.5263481242636976, "learning_rate": 4.9668229358796014e-05, "loss": 1.1145, "step": 306 }, { "epoch": 0.4525520545421043, "grad_norm": 2.6909976643552533, "learning_rate": 4.966500769680131e-05, "loss": 1.1084, "step": 307 }, { "epoch": 0.4540261654689515, "grad_norm": 2.5321836280345913, "learning_rate": 4.966177057380409e-05, "loss": 1.1325, "step": 308 }, { "epoch": 0.45550027639579876, "grad_norm": 2.5570902592941938, "learning_rate": 4.965851799183349e-05, "loss": 1.142, "step": 309 }, { "epoch": 0.456974387322646, "grad_norm": 2.5304502942883533, "learning_rate": 4.9655249952928375e-05, "loss": 0.9499, "step": 310 }, { "epoch": 0.4584484982494933, "grad_norm": 2.333557549557578, "learning_rate": 4.965196645913728e-05, "loss": 1.1373, "step": 311 }, { "epoch": 0.45992260917634054, "grad_norm": 2.5656706570130097, "learning_rate": 4.964866751251842e-05, "loss": 1.3193, "step": 312 }, { "epoch": 0.4613967201031878, "grad_norm": 2.538673065837975, "learning_rate": 4.964535311513971e-05, "loss": 1.121, "step": 313 }, { "epoch": 0.462870831030035, "grad_norm": 2.2788872174974437, "learning_rate": 4.9642023269078745e-05, "loss": 1.1868, "step": 314 }, { "epoch": 0.46434494195688225, "grad_norm": 2.4322194341244012, "learning_rate": 4.963867797642281e-05, "loss": 1.0922, "step": 315 }, { "epoch": 0.4658190528837295, "grad_norm": 2.6826905975462556, "learning_rate": 4.963531723926885e-05, "loss": 1.2137, "step": 316 }, { "epoch": 0.46729316381057673, "grad_norm": 2.5418501824170354, "learning_rate": 4.963194105972353e-05, "loss": 1.1589, "step": 317 }, { "epoch": 0.46876727473742397, "grad_norm": 2.256525431805178, "learning_rate": 4.962854943990316e-05, "loss": 1.067, "step": 318 }, { "epoch": 0.47024138566427126, "grad_norm": 2.048532411891992, "learning_rate": 4.962514238193375e-05, "loss": 0.9739, "step": 319 }, { "epoch": 0.4717154965911185, "grad_norm": 2.6291381370902878, "learning_rate": 4.9621719887950966e-05, "loss": 1.2654, "step": 320 }, { "epoch": 0.47318960751796574, "grad_norm": 2.3398087997246266, "learning_rate": 4.9618281960100164e-05, "loss": 0.9409, "step": 321 }, { "epoch": 0.474663718444813, "grad_norm": 2.6227852697287206, "learning_rate": 4.9614828600536386e-05, "loss": 1.1949, "step": 322 }, { "epoch": 0.4761378293716602, "grad_norm": 2.2732751532723334, "learning_rate": 4.9611359811424324e-05, "loss": 1.2392, "step": 323 }, { "epoch": 0.47761194029850745, "grad_norm": 2.1897575718025504, "learning_rate": 4.960787559493836e-05, "loss": 1.1166, "step": 324 }, { "epoch": 0.4790860512253547, "grad_norm": 2.484538138929235, "learning_rate": 4.960437595326253e-05, "loss": 1.1452, "step": 325 }, { "epoch": 0.48056016215220193, "grad_norm": 2.6613915441883127, "learning_rate": 4.960086088859055e-05, "loss": 1.0231, "step": 326 }, { "epoch": 0.4820342730790492, "grad_norm": 3.0368229868992636, "learning_rate": 4.95973304031258e-05, "loss": 1.1134, "step": 327 }, { "epoch": 0.48350838400589646, "grad_norm": 2.6159320610043615, "learning_rate": 4.9593784499081336e-05, "loss": 1.198, "step": 328 }, { "epoch": 0.4849824949327437, "grad_norm": 2.603475331411752, "learning_rate": 4.959022317867986e-05, "loss": 1.2378, "step": 329 }, { "epoch": 0.48645660585959094, "grad_norm": 2.411836851223093, "learning_rate": 4.9586646444153764e-05, "loss": 1.0323, "step": 330 }, { "epoch": 0.4879307167864382, "grad_norm": 2.4997597578964434, "learning_rate": 4.958305429774507e-05, "loss": 1.1211, "step": 331 }, { "epoch": 0.4894048277132854, "grad_norm": 2.5251019070144687, "learning_rate": 4.9579446741705485e-05, "loss": 1.1321, "step": 332 }, { "epoch": 0.49087893864013266, "grad_norm": 2.437933858748795, "learning_rate": 4.957582377829637e-05, "loss": 1.0483, "step": 333 }, { "epoch": 0.4923530495669799, "grad_norm": 2.661897019713721, "learning_rate": 4.957218540978874e-05, "loss": 1.1199, "step": 334 }, { "epoch": 0.49382716049382713, "grad_norm": 2.159281382760316, "learning_rate": 4.9568531638463264e-05, "loss": 1.0652, "step": 335 }, { "epoch": 0.4953012714206744, "grad_norm": 2.6552931015672665, "learning_rate": 4.9564862466610284e-05, "loss": 1.1349, "step": 336 }, { "epoch": 0.49677538234752167, "grad_norm": 2.485648542546389, "learning_rate": 4.9561177896529764e-05, "loss": 1.1319, "step": 337 }, { "epoch": 0.4982494932743689, "grad_norm": 2.3293398561275334, "learning_rate": 4.9557477930531346e-05, "loss": 1.2263, "step": 338 }, { "epoch": 0.49972360420121614, "grad_norm": 2.5398639998483836, "learning_rate": 4.9553762570934314e-05, "loss": 1.135, "step": 339 }, { "epoch": 0.5011977151280634, "grad_norm": 2.4180374752008764, "learning_rate": 4.955003182006761e-05, "loss": 1.0835, "step": 340 }, { "epoch": 0.5026718260549107, "grad_norm": 2.7376022041088977, "learning_rate": 4.954628568026981e-05, "loss": 0.9648, "step": 341 }, { "epoch": 0.5041459369817579, "grad_norm": 2.521984709630562, "learning_rate": 4.954252415388914e-05, "loss": 1.0727, "step": 342 }, { "epoch": 0.5056200479086052, "grad_norm": 2.4545860070433263, "learning_rate": 4.953874724328347e-05, "loss": 1.1183, "step": 343 }, { "epoch": 0.5070941588354524, "grad_norm": 2.3119466996038387, "learning_rate": 4.953495495082032e-05, "loss": 1.1129, "step": 344 }, { "epoch": 0.5085682697622996, "grad_norm": 2.5333184162336493, "learning_rate": 4.953114727887686e-05, "loss": 1.1062, "step": 345 }, { "epoch": 0.5100423806891469, "grad_norm": 2.386491081005336, "learning_rate": 4.952732422983989e-05, "loss": 1.0694, "step": 346 }, { "epoch": 0.5115164916159941, "grad_norm": 2.570044135371525, "learning_rate": 4.9523485806105826e-05, "loss": 1.1043, "step": 347 }, { "epoch": 0.5129906025428413, "grad_norm": 2.2792850549647996, "learning_rate": 4.951963201008076e-05, "loss": 1.049, "step": 348 }, { "epoch": 0.5144647134696886, "grad_norm": 2.428926698359969, "learning_rate": 4.9515762844180405e-05, "loss": 1.195, "step": 349 }, { "epoch": 0.5159388243965358, "grad_norm": 2.451893102310182, "learning_rate": 4.9511878310830106e-05, "loss": 1.0035, "step": 350 }, { "epoch": 0.5174129353233831, "grad_norm": 2.445473175132569, "learning_rate": 4.950797841246484e-05, "loss": 1.1586, "step": 351 }, { "epoch": 0.5188870462502303, "grad_norm": 2.274513226088038, "learning_rate": 4.950406315152921e-05, "loss": 1.0219, "step": 352 }, { "epoch": 0.5203611571770775, "grad_norm": 2.601625917137152, "learning_rate": 4.9500132530477475e-05, "loss": 1.0236, "step": 353 }, { "epoch": 0.5218352681039248, "grad_norm": 2.69571972410572, "learning_rate": 4.949618655177348e-05, "loss": 1.1867, "step": 354 }, { "epoch": 0.523309379030772, "grad_norm": 2.3638731664465205, "learning_rate": 4.949222521789074e-05, "loss": 1.1309, "step": 355 }, { "epoch": 0.5247834899576194, "grad_norm": 2.8005434837132417, "learning_rate": 4.948824853131236e-05, "loss": 1.1781, "step": 356 }, { "epoch": 0.5262576008844666, "grad_norm": 2.7329149980554974, "learning_rate": 4.948425649453111e-05, "loss": 1.0813, "step": 357 }, { "epoch": 0.5277317118113138, "grad_norm": 2.3615754836451055, "learning_rate": 4.948024911004933e-05, "loss": 1.0914, "step": 358 }, { "epoch": 0.5292058227381611, "grad_norm": 2.435454561859465, "learning_rate": 4.9476226380379014e-05, "loss": 1.1995, "step": 359 }, { "epoch": 0.5306799336650083, "grad_norm": 2.6266452537993477, "learning_rate": 4.947218830804178e-05, "loss": 1.1697, "step": 360 }, { "epoch": 0.5321540445918556, "grad_norm": 2.4004405849090427, "learning_rate": 4.946813489556883e-05, "loss": 1.0914, "step": 361 }, { "epoch": 0.5336281555187028, "grad_norm": 2.3577942257849855, "learning_rate": 4.946406614550103e-05, "loss": 1.1861, "step": 362 }, { "epoch": 0.53510226644555, "grad_norm": 2.5944606313080234, "learning_rate": 4.945998206038881e-05, "loss": 1.1674, "step": 363 }, { "epoch": 0.5365763773723973, "grad_norm": 2.7624439976051613, "learning_rate": 4.945588264279225e-05, "loss": 1.4151, "step": 364 }, { "epoch": 0.5380504882992445, "grad_norm": 2.5743380390133943, "learning_rate": 4.945176789528102e-05, "loss": 1.2019, "step": 365 }, { "epoch": 0.5395245992260918, "grad_norm": 2.554428787430511, "learning_rate": 4.944763782043441e-05, "loss": 1.0723, "step": 366 }, { "epoch": 0.540998710152939, "grad_norm": 2.50704659147909, "learning_rate": 4.944349242084131e-05, "loss": 1.0544, "step": 367 }, { "epoch": 0.5424728210797862, "grad_norm": 2.541567649371743, "learning_rate": 4.943933169910023e-05, "loss": 1.1449, "step": 368 }, { "epoch": 0.5439469320066335, "grad_norm": 2.633614166189683, "learning_rate": 4.9435155657819266e-05, "loss": 1.0859, "step": 369 }, { "epoch": 0.5454210429334807, "grad_norm": 2.3190709141346146, "learning_rate": 4.9430964299616136e-05, "loss": 0.9999, "step": 370 }, { "epoch": 0.5468951538603279, "grad_norm": 2.3944097740139543, "learning_rate": 4.942675762711813e-05, "loss": 1.0884, "step": 371 }, { "epoch": 0.5483692647871752, "grad_norm": 2.3864400298206063, "learning_rate": 4.942253564296218e-05, "loss": 1.1064, "step": 372 }, { "epoch": 0.5498433757140225, "grad_norm": 2.535426426672559, "learning_rate": 4.9418298349794767e-05, "loss": 1.313, "step": 373 }, { "epoch": 0.5513174866408698, "grad_norm": 3.3892089418154407, "learning_rate": 4.941404575027202e-05, "loss": 1.0725, "step": 374 }, { "epoch": 0.552791597567717, "grad_norm": 2.570361290558482, "learning_rate": 4.9409777847059625e-05, "loss": 1.2509, "step": 375 }, { "epoch": 0.5542657084945642, "grad_norm": 2.6259842798382786, "learning_rate": 4.940549464283287e-05, "loss": 1.0221, "step": 376 }, { "epoch": 0.5557398194214115, "grad_norm": 2.644171235274648, "learning_rate": 4.940119614027663e-05, "loss": 1.085, "step": 377 }, { "epoch": 0.5572139303482587, "grad_norm": 2.4278795194318157, "learning_rate": 4.93968823420854e-05, "loss": 1.1924, "step": 378 }, { "epoch": 0.558688041275106, "grad_norm": 2.7610566075209872, "learning_rate": 4.9392553250963215e-05, "loss": 1.0511, "step": 379 }, { "epoch": 0.5601621522019532, "grad_norm": 2.6034133330136857, "learning_rate": 4.9388208869623734e-05, "loss": 1.2462, "step": 380 }, { "epoch": 0.5616362631288004, "grad_norm": 2.2781817561276, "learning_rate": 4.938384920079019e-05, "loss": 0.9203, "step": 381 }, { "epoch": 0.5631103740556477, "grad_norm": 2.1761080977503124, "learning_rate": 4.937947424719538e-05, "loss": 0.9668, "step": 382 }, { "epoch": 0.5645844849824949, "grad_norm": 2.5863474725467146, "learning_rate": 4.937508401158171e-05, "loss": 1.1776, "step": 383 }, { "epoch": 0.5660585959093422, "grad_norm": 3.420953495890425, "learning_rate": 4.937067849670115e-05, "loss": 1.1095, "step": 384 }, { "epoch": 0.5675327068361894, "grad_norm": 2.6642104300765657, "learning_rate": 4.936625770531525e-05, "loss": 1.1496, "step": 385 }, { "epoch": 0.5690068177630366, "grad_norm": 2.2941024551061195, "learning_rate": 4.936182164019515e-05, "loss": 1.037, "step": 386 }, { "epoch": 0.5704809286898839, "grad_norm": 2.5790165140038157, "learning_rate": 4.935737030412153e-05, "loss": 1.0119, "step": 387 }, { "epoch": 0.5719550396167311, "grad_norm": 2.538186259560203, "learning_rate": 4.935290369988468e-05, "loss": 1.2501, "step": 388 }, { "epoch": 0.5734291505435785, "grad_norm": 2.634755490158417, "learning_rate": 4.934842183028443e-05, "loss": 0.9358, "step": 389 }, { "epoch": 0.5749032614704257, "grad_norm": 2.575509333930397, "learning_rate": 4.9343924698130206e-05, "loss": 1.0798, "step": 390 }, { "epoch": 0.5763773723972729, "grad_norm": 2.669244642671942, "learning_rate": 4.9339412306240984e-05, "loss": 1.2203, "step": 391 }, { "epoch": 0.5778514833241202, "grad_norm": 2.163303283380819, "learning_rate": 4.933488465744531e-05, "loss": 1.0623, "step": 392 }, { "epoch": 0.5793255942509674, "grad_norm": 2.7073457433376524, "learning_rate": 4.933034175458129e-05, "loss": 1.1835, "step": 393 }, { "epoch": 0.5807997051778147, "grad_norm": 11.052469479894704, "learning_rate": 4.9325783600496596e-05, "loss": 1.0518, "step": 394 }, { "epoch": 0.5822738161046619, "grad_norm": 4.028143817156276, "learning_rate": 4.9321210198048465e-05, "loss": 0.972, "step": 395 }, { "epoch": 0.5837479270315091, "grad_norm": 3.058573638732971, "learning_rate": 4.931662155010367e-05, "loss": 1.2169, "step": 396 }, { "epoch": 0.5852220379583564, "grad_norm": 3.35271016280725, "learning_rate": 4.931201765953858e-05, "loss": 1.1632, "step": 397 }, { "epoch": 0.5866961488852036, "grad_norm": 2.380009537846022, "learning_rate": 4.9307398529239083e-05, "loss": 1.2864, "step": 398 }, { "epoch": 0.5881702598120508, "grad_norm": 3.1607983741935657, "learning_rate": 4.930276416210063e-05, "loss": 1.2766, "step": 399 }, { "epoch": 0.5896443707388981, "grad_norm": 5.937420011282751, "learning_rate": 4.929811456102824e-05, "loss": 1.3274, "step": 400 }, { "epoch": 0.5911184816657453, "grad_norm": 3.147759224880039, "learning_rate": 4.929344972893646e-05, "loss": 1.3701, "step": 401 }, { "epoch": 0.5925925925925926, "grad_norm": 2.5177716875502996, "learning_rate": 4.928876966874938e-05, "loss": 1.0886, "step": 402 }, { "epoch": 0.5940667035194398, "grad_norm": 2.8877895986150275, "learning_rate": 4.9284074383400655e-05, "loss": 1.1047, "step": 403 }, { "epoch": 0.595540814446287, "grad_norm": 2.299122547621843, "learning_rate": 4.927936387583348e-05, "loss": 0.9753, "step": 404 }, { "epoch": 0.5970149253731343, "grad_norm": 2.675600276869333, "learning_rate": 4.9274638149000585e-05, "loss": 1.1432, "step": 405 }, { "epoch": 0.5984890362999816, "grad_norm": 2.255788562538474, "learning_rate": 4.9269897205864235e-05, "loss": 1.0741, "step": 406 }, { "epoch": 0.5999631472268289, "grad_norm": 1.941318563710785, "learning_rate": 4.926514104939625e-05, "loss": 0.9092, "step": 407 }, { "epoch": 0.6014372581536761, "grad_norm": 2.3060640006799322, "learning_rate": 4.9260369682577965e-05, "loss": 1.1613, "step": 408 }, { "epoch": 0.6029113690805233, "grad_norm": 2.4558776184928597, "learning_rate": 4.9255583108400285e-05, "loss": 1.1613, "step": 409 }, { "epoch": 0.6043854800073706, "grad_norm": 2.4603954868767057, "learning_rate": 4.9250781329863606e-05, "loss": 1.0553, "step": 410 }, { "epoch": 0.6058595909342178, "grad_norm": 2.6720075964791135, "learning_rate": 4.924596434997787e-05, "loss": 1.0238, "step": 411 }, { "epoch": 0.6073337018610651, "grad_norm": 2.4715496558052075, "learning_rate": 4.924113217176256e-05, "loss": 1.1351, "step": 412 }, { "epoch": 0.6088078127879123, "grad_norm": 2.5852733116356794, "learning_rate": 4.9236284798246666e-05, "loss": 1.087, "step": 413 }, { "epoch": 0.6102819237147595, "grad_norm": 2.714809763304574, "learning_rate": 4.923142223246873e-05, "loss": 1.1858, "step": 414 }, { "epoch": 0.6117560346416068, "grad_norm": 2.511722262129397, "learning_rate": 4.922654447747679e-05, "loss": 1.099, "step": 415 }, { "epoch": 0.613230145568454, "grad_norm": 2.472644004369608, "learning_rate": 4.922165153632842e-05, "loss": 1.1593, "step": 416 }, { "epoch": 0.6147042564953012, "grad_norm": 2.571275803024348, "learning_rate": 4.9216743412090694e-05, "loss": 1.0795, "step": 417 }, { "epoch": 0.6161783674221485, "grad_norm": 2.7528276644473197, "learning_rate": 4.9211820107840234e-05, "loss": 1.1083, "step": 418 }, { "epoch": 0.6176524783489957, "grad_norm": 2.321104383364023, "learning_rate": 4.920688162666316e-05, "loss": 1.1086, "step": 419 }, { "epoch": 0.619126589275843, "grad_norm": 2.21038347723298, "learning_rate": 4.920192797165511e-05, "loss": 0.998, "step": 420 }, { "epoch": 0.6206007002026902, "grad_norm": 2.440627512521464, "learning_rate": 4.919695914592122e-05, "loss": 1.1883, "step": 421 }, { "epoch": 0.6220748111295376, "grad_norm": 2.5649510268505327, "learning_rate": 4.919197515257616e-05, "loss": 0.9101, "step": 422 }, { "epoch": 0.6235489220563848, "grad_norm": 2.4517184556184497, "learning_rate": 4.9186975994744075e-05, "loss": 1.1502, "step": 423 }, { "epoch": 0.625023032983232, "grad_norm": 2.807598760129745, "learning_rate": 4.918196167555866e-05, "loss": 1.1422, "step": 424 }, { "epoch": 0.6264971439100793, "grad_norm": 2.4500758178812956, "learning_rate": 4.9176932198163074e-05, "loss": 1.0435, "step": 425 }, { "epoch": 0.6279712548369265, "grad_norm": 2.251341419894887, "learning_rate": 4.917188756570999e-05, "loss": 1.0394, "step": 426 }, { "epoch": 0.6294453657637737, "grad_norm": 2.2146990693485735, "learning_rate": 4.9166827781361594e-05, "loss": 1.013, "step": 427 }, { "epoch": 0.630919476690621, "grad_norm": 2.2478022968469555, "learning_rate": 4.916175284828955e-05, "loss": 1.0792, "step": 428 }, { "epoch": 0.6323935876174682, "grad_norm": 2.7787502070198205, "learning_rate": 4.915666276967501e-05, "loss": 1.2358, "step": 429 }, { "epoch": 0.6338676985443155, "grad_norm": 2.020643507179203, "learning_rate": 4.9151557548708676e-05, "loss": 0.9638, "step": 430 }, { "epoch": 0.6353418094711627, "grad_norm": 2.232651158904404, "learning_rate": 4.9146437188590675e-05, "loss": 1.1217, "step": 431 }, { "epoch": 0.6368159203980099, "grad_norm": 2.4749860589761843, "learning_rate": 4.914130169253066e-05, "loss": 1.0364, "step": 432 }, { "epoch": 0.6382900313248572, "grad_norm": 2.588227517134156, "learning_rate": 4.913615106374777e-05, "loss": 1.1168, "step": 433 }, { "epoch": 0.6397641422517044, "grad_norm": 2.536225973422506, "learning_rate": 4.91309853054706e-05, "loss": 1.2094, "step": 434 }, { "epoch": 0.6412382531785517, "grad_norm": 2.179535234937034, "learning_rate": 4.912580442093727e-05, "loss": 1.2088, "step": 435 }, { "epoch": 0.6427123641053989, "grad_norm": 2.456892485710421, "learning_rate": 4.9120608413395366e-05, "loss": 1.093, "step": 436 }, { "epoch": 0.6441864750322461, "grad_norm": 2.50254863527885, "learning_rate": 4.911539728610194e-05, "loss": 1.082, "step": 437 }, { "epoch": 0.6456605859590934, "grad_norm": 2.3706569112112845, "learning_rate": 4.9110171042323536e-05, "loss": 1.193, "step": 438 }, { "epoch": 0.6471346968859407, "grad_norm": 2.5151859634842517, "learning_rate": 4.910492968533618e-05, "loss": 1.0582, "step": 439 }, { "epoch": 0.648608807812788, "grad_norm": 2.522107036367225, "learning_rate": 4.909967321842535e-05, "loss": 1.0046, "step": 440 }, { "epoch": 0.6500829187396352, "grad_norm": 2.095444520209058, "learning_rate": 4.9094401644886e-05, "loss": 0.9006, "step": 441 }, { "epoch": 0.6515570296664824, "grad_norm": 2.513793562972105, "learning_rate": 4.908911496802257e-05, "loss": 1.1406, "step": 442 }, { "epoch": 0.6530311405933297, "grad_norm": 2.4391109327337688, "learning_rate": 4.908381319114898e-05, "loss": 1.0968, "step": 443 }, { "epoch": 0.6545052515201769, "grad_norm": 2.77328654133619, "learning_rate": 4.9078496317588556e-05, "loss": 1.1016, "step": 444 }, { "epoch": 0.6559793624470242, "grad_norm": 2.378168702695207, "learning_rate": 4.907316435067415e-05, "loss": 1.1423, "step": 445 }, { "epoch": 0.6574534733738714, "grad_norm": 2.548452906559088, "learning_rate": 4.906781729374804e-05, "loss": 1.1674, "step": 446 }, { "epoch": 0.6589275843007186, "grad_norm": 2.848014400863671, "learning_rate": 4.906245515016197e-05, "loss": 1.1753, "step": 447 }, { "epoch": 0.6604016952275659, "grad_norm": 2.363664781959796, "learning_rate": 4.905707792327715e-05, "loss": 1.2826, "step": 448 }, { "epoch": 0.6618758061544131, "grad_norm": 2.3478395514248027, "learning_rate": 4.9051685616464246e-05, "loss": 1.0876, "step": 449 }, { "epoch": 0.6633499170812603, "grad_norm": 2.2575227228079324, "learning_rate": 4.904627823310335e-05, "loss": 0.9487, "step": 450 }, { "epoch": 0.6648240280081076, "grad_norm": 2.9830191363322847, "learning_rate": 4.9040855776584035e-05, "loss": 1.0073, "step": 451 }, { "epoch": 0.6662981389349548, "grad_norm": 2.157233525304234, "learning_rate": 4.9035418250305314e-05, "loss": 1.0566, "step": 452 }, { "epoch": 0.6677722498618021, "grad_norm": 2.4469090329040877, "learning_rate": 4.9029965657675636e-05, "loss": 1.1033, "step": 453 }, { "epoch": 0.6692463607886493, "grad_norm": 2.3402231765349244, "learning_rate": 4.9024498002112906e-05, "loss": 1.0281, "step": 454 }, { "epoch": 0.6707204717154965, "grad_norm": 2.037502817725357, "learning_rate": 4.901901528704447e-05, "loss": 1.074, "step": 455 }, { "epoch": 0.6721945826423439, "grad_norm": 2.3486124081429147, "learning_rate": 4.90135175159071e-05, "loss": 1.0809, "step": 456 }, { "epoch": 0.6736686935691911, "grad_norm": 2.388794341421867, "learning_rate": 4.900800469214703e-05, "loss": 1.1057, "step": 457 }, { "epoch": 0.6751428044960384, "grad_norm": 2.5541183350944268, "learning_rate": 4.900247681921991e-05, "loss": 1.1774, "step": 458 }, { "epoch": 0.6766169154228856, "grad_norm": 2.2561510566298595, "learning_rate": 4.899693390059082e-05, "loss": 1.087, "step": 459 }, { "epoch": 0.6780910263497328, "grad_norm": 2.209918538831851, "learning_rate": 4.89913759397343e-05, "loss": 1.1562, "step": 460 }, { "epoch": 0.6795651372765801, "grad_norm": 2.4421205009891134, "learning_rate": 4.898580294013428e-05, "loss": 1.104, "step": 461 }, { "epoch": 0.6810392482034273, "grad_norm": 2.4912262500754405, "learning_rate": 4.898021490528415e-05, "loss": 1.1259, "step": 462 }, { "epoch": 0.6825133591302746, "grad_norm": 2.683688810350101, "learning_rate": 4.89746118386867e-05, "loss": 1.0323, "step": 463 }, { "epoch": 0.6839874700571218, "grad_norm": 2.2435021351208357, "learning_rate": 4.8968993743854176e-05, "loss": 1.0833, "step": 464 }, { "epoch": 0.685461580983969, "grad_norm": 2.2918659008002575, "learning_rate": 4.89633606243082e-05, "loss": 0.9603, "step": 465 }, { "epoch": 0.6869356919108163, "grad_norm": 2.135301862902604, "learning_rate": 4.895771248357983e-05, "loss": 1.103, "step": 466 }, { "epoch": 0.6884098028376635, "grad_norm": 2.228472830439236, "learning_rate": 4.895204932520957e-05, "loss": 1.0745, "step": 467 }, { "epoch": 0.6898839137645107, "grad_norm": 2.1618906959830726, "learning_rate": 4.8946371152747285e-05, "loss": 1.0309, "step": 468 }, { "epoch": 0.691358024691358, "grad_norm": 2.6033714809708064, "learning_rate": 4.8940677969752295e-05, "loss": 1.1183, "step": 469 }, { "epoch": 0.6928321356182052, "grad_norm": 2.4367197206133846, "learning_rate": 4.893496977979331e-05, "loss": 1.0945, "step": 470 }, { "epoch": 0.6943062465450525, "grad_norm": 2.246835072126815, "learning_rate": 4.892924658644844e-05, "loss": 1.0247, "step": 471 }, { "epoch": 0.6957803574718998, "grad_norm": 2.774371868216854, "learning_rate": 4.892350839330522e-05, "loss": 1.0317, "step": 472 }, { "epoch": 0.697254468398747, "grad_norm": 2.3021542867597704, "learning_rate": 4.891775520396057e-05, "loss": 1.02, "step": 473 }, { "epoch": 0.6987285793255943, "grad_norm": 2.68051686256255, "learning_rate": 4.8911987022020823e-05, "loss": 1.2132, "step": 474 }, { "epoch": 0.7002026902524415, "grad_norm": 2.5372771104158653, "learning_rate": 4.89062038511017e-05, "loss": 1.0496, "step": 475 }, { "epoch": 0.7016768011792888, "grad_norm": 2.43779625681725, "learning_rate": 4.8900405694828313e-05, "loss": 1.0717, "step": 476 }, { "epoch": 0.703150912106136, "grad_norm": 2.340302844216732, "learning_rate": 4.8894592556835186e-05, "loss": 1.0392, "step": 477 }, { "epoch": 0.7046250230329832, "grad_norm": 2.5041836065733896, "learning_rate": 4.8888764440766225e-05, "loss": 1.0993, "step": 478 }, { "epoch": 0.7060991339598305, "grad_norm": 2.0689907245086983, "learning_rate": 4.888292135027472e-05, "loss": 0.9906, "step": 479 }, { "epoch": 0.7075732448866777, "grad_norm": 2.412823108916075, "learning_rate": 4.887706328902335e-05, "loss": 1.1485, "step": 480 }, { "epoch": 0.709047355813525, "grad_norm": 2.31121532738087, "learning_rate": 4.8871190260684174e-05, "loss": 1.0083, "step": 481 }, { "epoch": 0.7105214667403722, "grad_norm": 2.4187530083847064, "learning_rate": 4.886530226893865e-05, "loss": 1.0307, "step": 482 }, { "epoch": 0.7119955776672194, "grad_norm": 2.521111517655327, "learning_rate": 4.88593993174776e-05, "loss": 1.2399, "step": 483 }, { "epoch": 0.7134696885940667, "grad_norm": 2.880375025491574, "learning_rate": 4.885348141000122e-05, "loss": 1.1084, "step": 484 }, { "epoch": 0.7149437995209139, "grad_norm": 2.3693107927538573, "learning_rate": 4.8847548550219105e-05, "loss": 1.113, "step": 485 }, { "epoch": 0.7164179104477612, "grad_norm": 2.679809923406124, "learning_rate": 4.884160074185019e-05, "loss": 1.0661, "step": 486 }, { "epoch": 0.7178920213746084, "grad_norm": 2.5462555863436664, "learning_rate": 4.8835637988622804e-05, "loss": 0.9768, "step": 487 }, { "epoch": 0.7193661323014556, "grad_norm": 2.4057134054762903, "learning_rate": 4.8829660294274636e-05, "loss": 1.0729, "step": 488 }, { "epoch": 0.720840243228303, "grad_norm": 2.2833381397540142, "learning_rate": 4.8823667662552744e-05, "loss": 1.0111, "step": 489 }, { "epoch": 0.7223143541551502, "grad_norm": 2.5534669719447605, "learning_rate": 4.881766009721354e-05, "loss": 0.992, "step": 490 }, { "epoch": 0.7237884650819975, "grad_norm": 2.571125493278198, "learning_rate": 4.8811637602022806e-05, "loss": 1.0927, "step": 491 }, { "epoch": 0.7252625760088447, "grad_norm": 2.5163393131935794, "learning_rate": 4.8805600180755685e-05, "loss": 1.178, "step": 492 }, { "epoch": 0.7267366869356919, "grad_norm": 2.5241781275747086, "learning_rate": 4.8799547837196667e-05, "loss": 1.0517, "step": 493 }, { "epoch": 0.7282107978625392, "grad_norm": 2.4274420957406644, "learning_rate": 4.87934805751396e-05, "loss": 1.0329, "step": 494 }, { "epoch": 0.7296849087893864, "grad_norm": 2.463323302534618, "learning_rate": 4.8787398398387684e-05, "loss": 1.1532, "step": 495 }, { "epoch": 0.7311590197162336, "grad_norm": 2.6877918528506792, "learning_rate": 4.878130131075347e-05, "loss": 1.1154, "step": 496 }, { "epoch": 0.7326331306430809, "grad_norm": 2.485642691538013, "learning_rate": 4.877518931605885e-05, "loss": 0.9265, "step": 497 }, { "epoch": 0.7341072415699281, "grad_norm": 2.79917523294107, "learning_rate": 4.8769062418135066e-05, "loss": 1.1418, "step": 498 }, { "epoch": 0.7355813524967754, "grad_norm": 2.3812882347741895, "learning_rate": 4.8762920620822704e-05, "loss": 1.2198, "step": 499 }, { "epoch": 0.7370554634236226, "grad_norm": 2.3764612822136604, "learning_rate": 4.875676392797168e-05, "loss": 1.1654, "step": 500 }, { "epoch": 0.7370554634236226, "eval_bleu": 0.08618230962663566, "eval_bleu_1gram": 0.4071017529012301, "eval_bleu_2gram": 0.17382510346548008, "eval_bleu_3gram": 0.08273989186412671, "eval_bleu_4gram": 0.04223346869740902, "eval_rag_val_loss": 1.1140535358459718, "eval_rouge1": 0.3987255593176895, "eval_rouge2": 0.16200492135523356, "eval_rougeL": 0.38186554194607036, "step": 500 }, { "epoch": 0.7385295743504698, "grad_norm": 2.679191877168386, "learning_rate": 4.875059234344126e-05, "loss": 1.2013, "step": 501 }, { "epoch": 0.7400036852773171, "grad_norm": 2.5057098986314945, "learning_rate": 4.874440587110003e-05, "loss": 1.1513, "step": 502 }, { "epoch": 0.7414777962041643, "grad_norm": 2.4974323438429225, "learning_rate": 4.873820451482592e-05, "loss": 1.1625, "step": 503 }, { "epoch": 0.7429519071310116, "grad_norm": 2.305922768800774, "learning_rate": 4.873198827850618e-05, "loss": 1.1537, "step": 504 }, { "epoch": 0.7444260180578589, "grad_norm": 2.521163719111806, "learning_rate": 4.872575716603739e-05, "loss": 1.1002, "step": 505 }, { "epoch": 0.7459001289847061, "grad_norm": 2.5989984930958037, "learning_rate": 4.871951118132547e-05, "loss": 1.1024, "step": 506 }, { "epoch": 0.7473742399115534, "grad_norm": 2.1893729774581185, "learning_rate": 4.8713250328285654e-05, "loss": 1.2329, "step": 507 }, { "epoch": 0.7488483508384006, "grad_norm": 2.3810328106692267, "learning_rate": 4.8706974610842474e-05, "loss": 1.037, "step": 508 }, { "epoch": 0.7503224617652479, "grad_norm": 2.2199555466471295, "learning_rate": 4.87006840329298e-05, "loss": 1.0413, "step": 509 }, { "epoch": 0.7517965726920951, "grad_norm": 2.1520614854332023, "learning_rate": 4.8694378598490826e-05, "loss": 1.0277, "step": 510 }, { "epoch": 0.7532706836189423, "grad_norm": 2.7073600976643823, "learning_rate": 4.868805831147805e-05, "loss": 1.0748, "step": 511 }, { "epoch": 0.7547447945457896, "grad_norm": 2.217211875216997, "learning_rate": 4.868172317585326e-05, "loss": 1.2129, "step": 512 }, { "epoch": 0.7562189054726368, "grad_norm": 2.2442841512999467, "learning_rate": 4.867537319558758e-05, "loss": 0.9933, "step": 513 }, { "epoch": 0.757693016399484, "grad_norm": 2.4609970812169903, "learning_rate": 4.866900837466144e-05, "loss": 1.037, "step": 514 }, { "epoch": 0.7591671273263313, "grad_norm": 2.372697215751879, "learning_rate": 4.8662628717064544e-05, "loss": 1.1038, "step": 515 }, { "epoch": 0.7606412382531785, "grad_norm": 2.7355830548465128, "learning_rate": 4.865623422679593e-05, "loss": 1.2237, "step": 516 }, { "epoch": 0.7621153491800258, "grad_norm": 2.6228618087869524, "learning_rate": 4.8649824907863894e-05, "loss": 1.073, "step": 517 }, { "epoch": 0.763589460106873, "grad_norm": 2.3005759596112365, "learning_rate": 4.864340076428607e-05, "loss": 0.8816, "step": 518 }, { "epoch": 0.7650635710337202, "grad_norm": 2.428831679560508, "learning_rate": 4.863696180008937e-05, "loss": 1.1048, "step": 519 }, { "epoch": 0.7665376819605675, "grad_norm": 2.7008627537442527, "learning_rate": 4.8630508019309976e-05, "loss": 1.1239, "step": 520 }, { "epoch": 0.7680117928874147, "grad_norm": 2.710627205539727, "learning_rate": 4.8624039425993375e-05, "loss": 1.0796, "step": 521 }, { "epoch": 0.7694859038142621, "grad_norm": 2.561087055971382, "learning_rate": 4.861755602419434e-05, "loss": 1.2087, "step": 522 }, { "epoch": 0.7709600147411093, "grad_norm": 2.4725146983821595, "learning_rate": 4.861105781797692e-05, "loss": 1.0612, "step": 523 }, { "epoch": 0.7724341256679566, "grad_norm": 2.2730382195081917, "learning_rate": 4.8604544811414465e-05, "loss": 1.0887, "step": 524 }, { "epoch": 0.7739082365948038, "grad_norm": 2.780501018357292, "learning_rate": 4.859801700858957e-05, "loss": 1.0419, "step": 525 }, { "epoch": 0.775382347521651, "grad_norm": 2.4315321810030435, "learning_rate": 4.859147441359412e-05, "loss": 1.0981, "step": 526 }, { "epoch": 0.7768564584484983, "grad_norm": 2.191732943249507, "learning_rate": 4.858491703052927e-05, "loss": 0.9575, "step": 527 }, { "epoch": 0.7783305693753455, "grad_norm": 2.350434904234574, "learning_rate": 4.8578344863505464e-05, "loss": 1.075, "step": 528 }, { "epoch": 0.7798046803021927, "grad_norm": 2.3495407792096192, "learning_rate": 4.857175791664238e-05, "loss": 1.0473, "step": 529 }, { "epoch": 0.78127879122904, "grad_norm": 2.6049320977965738, "learning_rate": 4.856515619406898e-05, "loss": 1.1154, "step": 530 }, { "epoch": 0.7827529021558872, "grad_norm": 2.0392837112107696, "learning_rate": 4.855853969992349e-05, "loss": 0.9944, "step": 531 }, { "epoch": 0.7842270130827345, "grad_norm": 2.8718337162939935, "learning_rate": 4.8551908438353374e-05, "loss": 0.9558, "step": 532 }, { "epoch": 0.7857011240095817, "grad_norm": 2.533678091164546, "learning_rate": 4.854526241351539e-05, "loss": 1.1722, "step": 533 }, { "epoch": 0.7871752349364289, "grad_norm": 2.4355808430056602, "learning_rate": 4.853860162957552e-05, "loss": 0.9838, "step": 534 }, { "epoch": 0.7886493458632762, "grad_norm": 2.5809648998287344, "learning_rate": 4.8531926090709016e-05, "loss": 0.9728, "step": 535 }, { "epoch": 0.7901234567901234, "grad_norm": 2.356050688310044, "learning_rate": 4.8525235801100346e-05, "loss": 1.0757, "step": 536 }, { "epoch": 0.7915975677169707, "grad_norm": 2.2803767374888673, "learning_rate": 4.851853076494327e-05, "loss": 0.8872, "step": 537 }, { "epoch": 0.793071678643818, "grad_norm": 2.351761717611314, "learning_rate": 4.8511810986440766e-05, "loss": 1.0906, "step": 538 }, { "epoch": 0.7945457895706652, "grad_norm": 2.416332967234632, "learning_rate": 4.8505076469805054e-05, "loss": 1.0797, "step": 539 }, { "epoch": 0.7960199004975125, "grad_norm": 2.357332975391142, "learning_rate": 4.849832721925759e-05, "loss": 1.1471, "step": 540 }, { "epoch": 0.7974940114243597, "grad_norm": 2.341218929542858, "learning_rate": 4.849156323902908e-05, "loss": 1.014, "step": 541 }, { "epoch": 0.798968122351207, "grad_norm": 2.7540731609436375, "learning_rate": 4.848478453335946e-05, "loss": 1.0237, "step": 542 }, { "epoch": 0.8004422332780542, "grad_norm": 2.4591359159209585, "learning_rate": 4.8477991106497874e-05, "loss": 1.1157, "step": 543 }, { "epoch": 0.8019163442049014, "grad_norm": 2.0393503504767754, "learning_rate": 4.847118296270272e-05, "loss": 0.9425, "step": 544 }, { "epoch": 0.8033904551317487, "grad_norm": 2.373270710251096, "learning_rate": 4.8464360106241615e-05, "loss": 1.1121, "step": 545 }, { "epoch": 0.8048645660585959, "grad_norm": 2.6561409422982023, "learning_rate": 4.845752254139139e-05, "loss": 0.9459, "step": 546 }, { "epoch": 0.8063386769854431, "grad_norm": 2.4768845983488204, "learning_rate": 4.845067027243809e-05, "loss": 1.0821, "step": 547 }, { "epoch": 0.8078127879122904, "grad_norm": 2.1916860582003896, "learning_rate": 4.844380330367701e-05, "loss": 0.9764, "step": 548 }, { "epoch": 0.8092868988391376, "grad_norm": 2.553643436159745, "learning_rate": 4.843692163941264e-05, "loss": 1.234, "step": 549 }, { "epoch": 0.8107610097659849, "grad_norm": 2.1849150507145834, "learning_rate": 4.8430025283958645e-05, "loss": 1.1618, "step": 550 }, { "epoch": 0.8122351206928321, "grad_norm": 2.5259149159095897, "learning_rate": 4.842311424163797e-05, "loss": 1.0168, "step": 551 }, { "epoch": 0.8137092316196793, "grad_norm": 2.566078199906123, "learning_rate": 4.8416188516782715e-05, "loss": 1.0166, "step": 552 }, { "epoch": 0.8151833425465266, "grad_norm": 5.817736819979309, "learning_rate": 4.84092481137342e-05, "loss": 1.0621, "step": 553 }, { "epoch": 0.8166574534733738, "grad_norm": 11.5623818881214, "learning_rate": 4.840229303684294e-05, "loss": 0.9443, "step": 554 }, { "epoch": 0.8181315644002212, "grad_norm": 3.4392742779842527, "learning_rate": 4.8395323290468655e-05, "loss": 1.0647, "step": 555 }, { "epoch": 0.8196056753270684, "grad_norm": 2.8914213166234717, "learning_rate": 4.838833887898026e-05, "loss": 1.1415, "step": 556 }, { "epoch": 0.8210797862539156, "grad_norm": 2.8752474471043494, "learning_rate": 4.838133980675586e-05, "loss": 0.9507, "step": 557 }, { "epoch": 0.8225538971807629, "grad_norm": 3.203371512998917, "learning_rate": 4.837432607818275e-05, "loss": 1.0353, "step": 558 }, { "epoch": 0.8240280081076101, "grad_norm": 37.39437925155831, "learning_rate": 4.836729769765741e-05, "loss": 1.1006, "step": 559 }, { "epoch": 0.8255021190344574, "grad_norm": 2.8111453714741517, "learning_rate": 4.83602546695855e-05, "loss": 1.1267, "step": 560 }, { "epoch": 0.8269762299613046, "grad_norm": 3.850009442602691, "learning_rate": 4.835319699838189e-05, "loss": 1.347, "step": 561 }, { "epoch": 0.8284503408881518, "grad_norm": 32.86173575387966, "learning_rate": 4.834612468847058e-05, "loss": 1.2305, "step": 562 }, { "epoch": 0.8299244518149991, "grad_norm": 3.0781996664682927, "learning_rate": 4.833903774428481e-05, "loss": 1.0612, "step": 563 }, { "epoch": 0.8313985627418463, "grad_norm": 3.3475058309263535, "learning_rate": 4.833193617026692e-05, "loss": 1.0611, "step": 564 }, { "epoch": 0.8328726736686936, "grad_norm": 13.361420484852907, "learning_rate": 4.8324819970868473e-05, "loss": 1.0252, "step": 565 }, { "epoch": 0.8343467845955408, "grad_norm": 3.7734410382682686, "learning_rate": 4.831768915055019e-05, "loss": 1.1175, "step": 566 }, { "epoch": 0.835820895522388, "grad_norm": 2.944638769666949, "learning_rate": 4.831054371378194e-05, "loss": 1.1, "step": 567 }, { "epoch": 0.8372950064492353, "grad_norm": 2.6776819314419065, "learning_rate": 4.830338366504277e-05, "loss": 1.0123, "step": 568 }, { "epoch": 0.8387691173760825, "grad_norm": 19.48581512194953, "learning_rate": 4.829620900882089e-05, "loss": 0.9367, "step": 569 }, { "epoch": 0.8402432283029297, "grad_norm": 3.34170212825944, "learning_rate": 4.8289019749613645e-05, "loss": 1.1025, "step": 570 }, { "epoch": 0.8417173392297771, "grad_norm": 3.951359652452222, "learning_rate": 4.8281815891927554e-05, "loss": 1.1161, "step": 571 }, { "epoch": 0.8431914501566243, "grad_norm": 2.620396710420563, "learning_rate": 4.827459744027828e-05, "loss": 1.0991, "step": 572 }, { "epoch": 0.8446655610834716, "grad_norm": 2.7396323099339503, "learning_rate": 4.826736439919063e-05, "loss": 1.0199, "step": 573 }, { "epoch": 0.8461396720103188, "grad_norm": 5.120104801178275, "learning_rate": 4.826011677319857e-05, "loss": 1.008, "step": 574 }, { "epoch": 0.847613782937166, "grad_norm": 2.3651039345870037, "learning_rate": 4.825285456684518e-05, "loss": 1.2252, "step": 575 }, { "epoch": 0.8490878938640133, "grad_norm": 5.012866441074875, "learning_rate": 4.824557778468272e-05, "loss": 1.0741, "step": 576 }, { "epoch": 0.8505620047908605, "grad_norm": 2.9448630397276383, "learning_rate": 4.823828643127255e-05, "loss": 1.1238, "step": 577 }, { "epoch": 0.8520361157177078, "grad_norm": 2.6591650730926446, "learning_rate": 4.823098051118519e-05, "loss": 1.1401, "step": 578 }, { "epoch": 0.853510226644555, "grad_norm": 2.451527166263905, "learning_rate": 4.822366002900027e-05, "loss": 1.1852, "step": 579 }, { "epoch": 0.8549843375714022, "grad_norm": 2.5981162216115306, "learning_rate": 4.821632498930656e-05, "loss": 1.1217, "step": 580 }, { "epoch": 0.8564584484982495, "grad_norm": 5.014218902763818, "learning_rate": 4.820897539670195e-05, "loss": 1.0832, "step": 581 }, { "epoch": 0.8579325594250967, "grad_norm": 4.70523257257404, "learning_rate": 4.820161125579347e-05, "loss": 1.1313, "step": 582 }, { "epoch": 0.859406670351944, "grad_norm": 2.380021057997435, "learning_rate": 4.819423257119723e-05, "loss": 1.1533, "step": 583 }, { "epoch": 0.8608807812787912, "grad_norm": 2.589171263321634, "learning_rate": 4.818683934753851e-05, "loss": 1.1713, "step": 584 }, { "epoch": 0.8623548922056384, "grad_norm": 2.593308376724988, "learning_rate": 4.817943158945166e-05, "loss": 1.0659, "step": 585 }, { "epoch": 0.8638290031324857, "grad_norm": 4.3895662056049085, "learning_rate": 4.817200930158015e-05, "loss": 1.0613, "step": 586 }, { "epoch": 0.8653031140593329, "grad_norm": 3.937591309472654, "learning_rate": 4.816457248857657e-05, "loss": 0.981, "step": 587 }, { "epoch": 0.8667772249861803, "grad_norm": 2.2825960342577334, "learning_rate": 4.815712115510261e-05, "loss": 0.9993, "step": 588 }, { "epoch": 0.8682513359130275, "grad_norm": 2.5802037961445357, "learning_rate": 4.8149655305829066e-05, "loss": 1.0946, "step": 589 }, { "epoch": 0.8697254468398747, "grad_norm": 2.336446955314843, "learning_rate": 4.814217494543581e-05, "loss": 0.8739, "step": 590 }, { "epoch": 0.871199557766722, "grad_norm": 2.876967047445946, "learning_rate": 4.813468007861185e-05, "loss": 0.9984, "step": 591 }, { "epoch": 0.8726736686935692, "grad_norm": 5.6206076533108105, "learning_rate": 4.812717071005525e-05, "loss": 1.1301, "step": 592 }, { "epoch": 0.8741477796204165, "grad_norm": 2.8908392749445637, "learning_rate": 4.8119646844473185e-05, "loss": 1.1381, "step": 593 }, { "epoch": 0.8756218905472637, "grad_norm": 2.843037610374493, "learning_rate": 4.811210848658191e-05, "loss": 0.9114, "step": 594 }, { "epoch": 0.8770960014741109, "grad_norm": 4.180846664978228, "learning_rate": 4.8104555641106766e-05, "loss": 1.0981, "step": 595 }, { "epoch": 0.8785701124009582, "grad_norm": 3.346943551295582, "learning_rate": 4.8096988312782174e-05, "loss": 1.1436, "step": 596 }, { "epoch": 0.8800442233278054, "grad_norm": 8.943989138601733, "learning_rate": 4.808940650635163e-05, "loss": 1.212, "step": 597 }, { "epoch": 0.8815183342546526, "grad_norm": 3.1099064818424464, "learning_rate": 4.8081810226567725e-05, "loss": 1.0646, "step": 598 }, { "epoch": 0.8829924451814999, "grad_norm": 5.3122714722791065, "learning_rate": 4.8074199478192097e-05, "loss": 1.0915, "step": 599 }, { "epoch": 0.8844665561083471, "grad_norm": 2.6563699134238363, "learning_rate": 4.8066574265995464e-05, "loss": 1.1747, "step": 600 }, { "epoch": 0.8859406670351944, "grad_norm": 3.1058802302076187, "learning_rate": 4.805893459475761e-05, "loss": 0.9584, "step": 601 }, { "epoch": 0.8874147779620416, "grad_norm": 10.962642349484673, "learning_rate": 4.805128046926739e-05, "loss": 1.1711, "step": 602 }, { "epoch": 0.8888888888888888, "grad_norm": 61.078896452022406, "learning_rate": 4.804361189432271e-05, "loss": 1.1649, "step": 603 }, { "epoch": 0.8903629998157362, "grad_norm": 8.959192373798135, "learning_rate": 4.803592887473053e-05, "loss": 1.1951, "step": 604 }, { "epoch": 0.8918371107425834, "grad_norm": 20.379766175910007, "learning_rate": 4.802823141530687e-05, "loss": 0.9762, "step": 605 }, { "epoch": 0.8933112216694307, "grad_norm": 3.2431353918391697, "learning_rate": 4.8020519520876816e-05, "loss": 1.2783, "step": 606 }, { "epoch": 0.8947853325962779, "grad_norm": 2.5103437062236047, "learning_rate": 4.801279319627448e-05, "loss": 1.1141, "step": 607 }, { "epoch": 0.8962594435231251, "grad_norm": 2.9516807940371357, "learning_rate": 4.8005052446343016e-05, "loss": 1.085, "step": 608 }, { "epoch": 0.8977335544499724, "grad_norm": 3.610059532879287, "learning_rate": 4.799729727593466e-05, "loss": 1.2029, "step": 609 }, { "epoch": 0.8992076653768196, "grad_norm": 7.071710842383281, "learning_rate": 4.798952768991063e-05, "loss": 1.2103, "step": 610 }, { "epoch": 0.9006817763036669, "grad_norm": 4.179927754850148, "learning_rate": 4.798174369314123e-05, "loss": 1.0257, "step": 611 }, { "epoch": 0.9021558872305141, "grad_norm": 6.53268314681492, "learning_rate": 4.7973945290505766e-05, "loss": 1.044, "step": 612 }, { "epoch": 0.9036299981573613, "grad_norm": 3.7138072271569778, "learning_rate": 4.796613248689259e-05, "loss": 1.0603, "step": 613 }, { "epoch": 0.9051041090842086, "grad_norm": 20.776824893056176, "learning_rate": 4.795830528719908e-05, "loss": 1.0653, "step": 614 }, { "epoch": 0.9065782200110558, "grad_norm": 7.841398213325313, "learning_rate": 4.795046369633163e-05, "loss": 1.1548, "step": 615 }, { "epoch": 0.908052330937903, "grad_norm": 45.6396300387955, "learning_rate": 4.7942607719205663e-05, "loss": 1.1506, "step": 616 }, { "epoch": 0.9095264418647503, "grad_norm": 10.381944285675921, "learning_rate": 4.793473736074561e-05, "loss": 1.1513, "step": 617 }, { "epoch": 0.9110005527915975, "grad_norm": 10.886899554472784, "learning_rate": 4.792685262588492e-05, "loss": 1.3653, "step": 618 }, { "epoch": 0.9124746637184448, "grad_norm": 14.6739766948505, "learning_rate": 4.791895351956607e-05, "loss": 1.5354, "step": 619 }, { "epoch": 0.913948774645292, "grad_norm": 10.782336194151569, "learning_rate": 4.791104004674052e-05, "loss": 1.3222, "step": 620 }, { "epoch": 0.9154228855721394, "grad_norm": 48.77635255284445, "learning_rate": 4.7903112212368756e-05, "loss": 1.8102, "step": 621 }, { "epoch": 0.9168969964989866, "grad_norm": 6.71068710197512, "learning_rate": 4.789517002142026e-05, "loss": 1.6358, "step": 622 }, { "epoch": 0.9183711074258338, "grad_norm": 20.254981664434123, "learning_rate": 4.788721347887349e-05, "loss": 1.7302, "step": 623 }, { "epoch": 0.9198452183526811, "grad_norm": 7.686918407242877, "learning_rate": 4.7879242589715955e-05, "loss": 1.8535, "step": 624 }, { "epoch": 0.9213193292795283, "grad_norm": 24.342140311030068, "learning_rate": 4.78712573589441e-05, "loss": 2.1503, "step": 625 }, { "epoch": 0.9227934402063755, "grad_norm": 5.574779714842422, "learning_rate": 4.7863257791563384e-05, "loss": 1.8339, "step": 626 }, { "epoch": 0.9242675511332228, "grad_norm": 17.59968296112266, "learning_rate": 4.785524389258827e-05, "loss": 1.6685, "step": 627 }, { "epoch": 0.92574166206007, "grad_norm": 22.674268158497146, "learning_rate": 4.7847215667042165e-05, "loss": 1.8529, "step": 628 }, { "epoch": 0.9272157729869173, "grad_norm": 59.424177824823545, "learning_rate": 4.78391731199575e-05, "loss": 1.7303, "step": 629 }, { "epoch": 0.9286898839137645, "grad_norm": 24.25870821615212, "learning_rate": 4.7831116256375644e-05, "loss": 1.7657, "step": 630 }, { "epoch": 0.9301639948406117, "grad_norm": 14.282858993743384, "learning_rate": 4.782304508134696e-05, "loss": 1.8043, "step": 631 }, { "epoch": 0.931638105767459, "grad_norm": 12.945611889265798, "learning_rate": 4.7814959599930794e-05, "loss": 1.5911, "step": 632 }, { "epoch": 0.9331122166943062, "grad_norm": 11.751178438869372, "learning_rate": 4.7806859817195425e-05, "loss": 1.9664, "step": 633 }, { "epoch": 0.9345863276211535, "grad_norm": 42.990392276937875, "learning_rate": 4.779874573821814e-05, "loss": 1.7651, "step": 634 }, { "epoch": 0.9360604385480007, "grad_norm": 20.408740605809914, "learning_rate": 4.779061736808514e-05, "loss": 2.027, "step": 635 }, { "epoch": 0.9375345494748479, "grad_norm": 2.9811943640805745, "learning_rate": 4.778247471189163e-05, "loss": 1.8471, "step": 636 }, { "epoch": 0.9390086604016952, "grad_norm": 9.939957224947653, "learning_rate": 4.777431777474174e-05, "loss": 2.0527, "step": 637 }, { "epoch": 0.9404827713285425, "grad_norm": 3.6046107171144066, "learning_rate": 4.776614656174856e-05, "loss": 1.8561, "step": 638 }, { "epoch": 0.9419568822553898, "grad_norm": 2.5357383212633238, "learning_rate": 4.775796107803413e-05, "loss": 1.9289, "step": 639 }, { "epoch": 0.943430993182237, "grad_norm": 2.0777844601737336, "learning_rate": 4.7749761328729436e-05, "loss": 2.1665, "step": 640 }, { "epoch": 0.9449051041090842, "grad_norm": 2.290885745998849, "learning_rate": 4.77415473189744e-05, "loss": 2.0061, "step": 641 }, { "epoch": 0.9463792150359315, "grad_norm": 2.684261588741759, "learning_rate": 4.77333190539179e-05, "loss": 1.9624, "step": 642 }, { "epoch": 0.9478533259627787, "grad_norm": 1.274070020985153, "learning_rate": 4.772507653871773e-05, "loss": 1.7978, "step": 643 }, { "epoch": 0.949327436889626, "grad_norm": 1.309722686709963, "learning_rate": 4.7716819778540625e-05, "loss": 2.0861, "step": 644 }, { "epoch": 0.9508015478164732, "grad_norm": 1.6511872846084235, "learning_rate": 4.770854877856225e-05, "loss": 2.0014, "step": 645 }, { "epoch": 0.9522756587433204, "grad_norm": 2.093962587410378, "learning_rate": 4.7700263543967195e-05, "loss": 1.9755, "step": 646 }, { "epoch": 0.9537497696701677, "grad_norm": 1.4516938247651279, "learning_rate": 4.769196407994898e-05, "loss": 1.8842, "step": 647 }, { "epoch": 0.9552238805970149, "grad_norm": 1.596567244463875, "learning_rate": 4.768365039171002e-05, "loss": 1.9751, "step": 648 }, { "epoch": 0.9566979915238621, "grad_norm": 1.2335191487904593, "learning_rate": 4.7675322484461674e-05, "loss": 2.1441, "step": 649 }, { "epoch": 0.9581721024507094, "grad_norm": 1.0325144904830905, "learning_rate": 4.766698036342421e-05, "loss": 1.8034, "step": 650 }, { "epoch": 0.9596462133775566, "grad_norm": 0.9263493850866569, "learning_rate": 4.765862403382678e-05, "loss": 1.954, "step": 651 }, { "epoch": 0.9611203243044039, "grad_norm": 1.1823189798076608, "learning_rate": 4.7650253500907494e-05, "loss": 2.0566, "step": 652 }, { "epoch": 0.9625944352312511, "grad_norm": 1.128775618759864, "learning_rate": 4.76418687699133e-05, "loss": 2.1813, "step": 653 }, { "epoch": 0.9640685461580984, "grad_norm": 1.5139946574944552, "learning_rate": 4.76334698461001e-05, "loss": 2.001, "step": 654 }, { "epoch": 0.9655426570849457, "grad_norm": 3.4259080504522896, "learning_rate": 4.7625056734732654e-05, "loss": 1.7788, "step": 655 }, { "epoch": 0.9670167680117929, "grad_norm": 1.7685170077475945, "learning_rate": 4.7616629441084655e-05, "loss": 2.001, "step": 656 }, { "epoch": 0.9684908789386402, "grad_norm": 1.2578734389049688, "learning_rate": 4.760818797043864e-05, "loss": 1.8793, "step": 657 }, { "epoch": 0.9699649898654874, "grad_norm": 0.9545808523393678, "learning_rate": 4.759973232808609e-05, "loss": 1.9361, "step": 658 }, { "epoch": 0.9714391007923346, "grad_norm": 1.0200703682655323, "learning_rate": 4.75912625193273e-05, "loss": 1.8802, "step": 659 }, { "epoch": 0.9729132117191819, "grad_norm": 1.4111108481205838, "learning_rate": 4.7582778549471494e-05, "loss": 1.9786, "step": 660 }, { "epoch": 0.9743873226460291, "grad_norm": 0.9514290777165798, "learning_rate": 4.7574280423836776e-05, "loss": 1.7297, "step": 661 }, { "epoch": 0.9758614335728764, "grad_norm": 1.293884782634935, "learning_rate": 4.756576814775009e-05, "loss": 1.9037, "step": 662 }, { "epoch": 0.9773355444997236, "grad_norm": 1.2334320716168234, "learning_rate": 4.7557241726547266e-05, "loss": 1.8917, "step": 663 }, { "epoch": 0.9788096554265708, "grad_norm": 2.1795423452521665, "learning_rate": 4.7548701165573003e-05, "loss": 1.7803, "step": 664 }, { "epoch": 0.9802837663534181, "grad_norm": 1.8964455283337973, "learning_rate": 4.754014647018088e-05, "loss": 2.067, "step": 665 }, { "epoch": 0.9817578772802653, "grad_norm": 1.1862353567781279, "learning_rate": 4.75315776457333e-05, "loss": 1.6299, "step": 666 }, { "epoch": 0.9832319882071126, "grad_norm": 1.7687941609723192, "learning_rate": 4.752299469760154e-05, "loss": 1.8733, "step": 667 }, { "epoch": 0.9847060991339598, "grad_norm": 1.2127206778519215, "learning_rate": 4.751439763116575e-05, "loss": 2.0071, "step": 668 }, { "epoch": 0.986180210060807, "grad_norm": 1.1123856539307686, "learning_rate": 4.750578645181489e-05, "loss": 1.9389, "step": 669 }, { "epoch": 0.9876543209876543, "grad_norm": 1.260258115564091, "learning_rate": 4.74971611649468e-05, "loss": 2.0584, "step": 670 }, { "epoch": 0.9891284319145016, "grad_norm": 1.099642060303938, "learning_rate": 4.748852177596815e-05, "loss": 1.8937, "step": 671 }, { "epoch": 0.9906025428413489, "grad_norm": 1.2815316053609214, "learning_rate": 4.747986829029445e-05, "loss": 2.0495, "step": 672 }, { "epoch": 0.9920766537681961, "grad_norm": 1.4285928673497919, "learning_rate": 4.747120071335004e-05, "loss": 1.8627, "step": 673 }, { "epoch": 0.9935507646950433, "grad_norm": 1.8606591158178005, "learning_rate": 4.746251905056811e-05, "loss": 1.912, "step": 674 }, { "epoch": 0.9950248756218906, "grad_norm": 1.0559402937229811, "learning_rate": 4.745382330739067e-05, "loss": 1.946, "step": 675 }, { "epoch": 0.9964989865487378, "grad_norm": 1.164534863377192, "learning_rate": 4.7445113489268544e-05, "loss": 1.6793, "step": 676 }, { "epoch": 0.997973097475585, "grad_norm": 1.3656418264648706, "learning_rate": 4.74363896016614e-05, "loss": 1.8892, "step": 677 }, { "epoch": 0.9994472084024323, "grad_norm": 1.2796702997451355, "learning_rate": 4.742765165003772e-05, "loss": 1.785, "step": 678 }, { "epoch": 1.0009213193292796, "grad_norm": 1.4475803213284533, "learning_rate": 4.741889963987478e-05, "loss": 1.9179, "step": 679 }, { "epoch": 1.0023954302561269, "grad_norm": 1.3014839725687257, "learning_rate": 4.741013357665871e-05, "loss": 1.6659, "step": 680 }, { "epoch": 1.0038695411829741, "grad_norm": 1.7505883181622912, "learning_rate": 4.7401353465884406e-05, "loss": 1.8979, "step": 681 }, { "epoch": 1.0053436521098214, "grad_norm": 2.5765248274492527, "learning_rate": 4.73925593130556e-05, "loss": 1.6705, "step": 682 }, { "epoch": 1.0068177630366686, "grad_norm": 2.5389053061316442, "learning_rate": 4.7383751123684806e-05, "loss": 1.9121, "step": 683 }, { "epoch": 1.0082918739635158, "grad_norm": 2.478009399950427, "learning_rate": 4.737492890329335e-05, "loss": 1.6073, "step": 684 }, { "epoch": 1.009765984890363, "grad_norm": 4.672826153437971, "learning_rate": 4.736609265741135e-05, "loss": 1.6052, "step": 685 }, { "epoch": 1.0112400958172103, "grad_norm": 2.2850047525903205, "learning_rate": 4.7357242391577724e-05, "loss": 1.8208, "step": 686 }, { "epoch": 1.0127142067440575, "grad_norm": 2.9851204305664707, "learning_rate": 4.7348378111340145e-05, "loss": 1.4295, "step": 687 }, { "epoch": 1.0141883176709048, "grad_norm": 5.209790425249793, "learning_rate": 4.733949982225511e-05, "loss": 1.6002, "step": 688 }, { "epoch": 1.015662428597752, "grad_norm": 9.465133530309123, "learning_rate": 4.7330607529887884e-05, "loss": 1.3735, "step": 689 }, { "epoch": 1.0171365395245993, "grad_norm": 3.2623625159983396, "learning_rate": 4.73217012398125e-05, "loss": 1.55, "step": 690 }, { "epoch": 1.0186106504514465, "grad_norm": 2.618453172417601, "learning_rate": 4.731278095761178e-05, "loss": 1.5206, "step": 691 }, { "epoch": 1.0200847613782937, "grad_norm": 4.240730948871066, "learning_rate": 4.73038466888773e-05, "loss": 1.6005, "step": 692 }, { "epoch": 1.021558872305141, "grad_norm": 2.379449390864728, "learning_rate": 4.729489843920942e-05, "loss": 1.3142, "step": 693 }, { "epoch": 1.0230329832319882, "grad_norm": 2.2825990633238065, "learning_rate": 4.728593621421726e-05, "loss": 1.3224, "step": 694 }, { "epoch": 1.0245070941588355, "grad_norm": 2.5374754655525735, "learning_rate": 4.727696001951869e-05, "loss": 1.2925, "step": 695 }, { "epoch": 1.0259812050856827, "grad_norm": 3.6079510939301, "learning_rate": 4.726796986074034e-05, "loss": 1.2146, "step": 696 }, { "epoch": 1.02745531601253, "grad_norm": 2.4539293076173663, "learning_rate": 4.725896574351763e-05, "loss": 1.2892, "step": 697 }, { "epoch": 1.0289294269393772, "grad_norm": 2.482872082503682, "learning_rate": 4.7249947673494645e-05, "loss": 1.2706, "step": 698 }, { "epoch": 1.0304035378662244, "grad_norm": 2.2366603021583265, "learning_rate": 4.72409156563243e-05, "loss": 1.1393, "step": 699 }, { "epoch": 1.0318776487930716, "grad_norm": 2.361876721387823, "learning_rate": 4.7231869697668214e-05, "loss": 1.2278, "step": 700 }, { "epoch": 1.0333517597199189, "grad_norm": 2.2613964740639654, "learning_rate": 4.722280980319675e-05, "loss": 1.1068, "step": 701 }, { "epoch": 1.0348258706467661, "grad_norm": 2.457797611722578, "learning_rate": 4.7213735978589016e-05, "loss": 1.0204, "step": 702 }, { "epoch": 1.0362999815736134, "grad_norm": 2.486295137988166, "learning_rate": 4.720464822953284e-05, "loss": 1.1741, "step": 703 }, { "epoch": 1.0377740925004606, "grad_norm": 2.8030436594376953, "learning_rate": 4.719554656172478e-05, "loss": 1.1314, "step": 704 }, { "epoch": 1.0392482034273078, "grad_norm": 2.7325375732183983, "learning_rate": 4.7186430980870124e-05, "loss": 1.0538, "step": 705 }, { "epoch": 1.040722314354155, "grad_norm": 2.678354893969303, "learning_rate": 4.717730149268287e-05, "loss": 0.8759, "step": 706 }, { "epoch": 1.0421964252810023, "grad_norm": 2.6927068084377987, "learning_rate": 4.716815810288575e-05, "loss": 0.9579, "step": 707 }, { "epoch": 1.0436705362078496, "grad_norm": 2.692769761310231, "learning_rate": 4.7159000817210205e-05, "loss": 1.0493, "step": 708 }, { "epoch": 1.0451446471346968, "grad_norm": 2.359144389016912, "learning_rate": 4.714982964139639e-05, "loss": 0.8443, "step": 709 }, { "epoch": 1.046618758061544, "grad_norm": 3.395964044656913, "learning_rate": 4.714064458119314e-05, "loss": 0.8777, "step": 710 }, { "epoch": 1.0480928689883915, "grad_norm": 2.3856801272052466, "learning_rate": 4.713144564235803e-05, "loss": 0.8705, "step": 711 }, { "epoch": 1.0495669799152387, "grad_norm": 2.45173022229532, "learning_rate": 4.7122232830657315e-05, "loss": 0.9057, "step": 712 }, { "epoch": 1.051041090842086, "grad_norm": 3.3700208315785583, "learning_rate": 4.7113006151865944e-05, "loss": 0.7717, "step": 713 }, { "epoch": 1.0525152017689332, "grad_norm": 2.5737120805552807, "learning_rate": 4.710376561176758e-05, "loss": 0.8243, "step": 714 }, { "epoch": 1.0539893126957804, "grad_norm": 2.648440650780528, "learning_rate": 4.7094511216154546e-05, "loss": 0.8902, "step": 715 }, { "epoch": 1.0554634236226277, "grad_norm": 2.4840352527862724, "learning_rate": 4.708524297082786e-05, "loss": 0.791, "step": 716 }, { "epoch": 1.056937534549475, "grad_norm": 2.7977660054545046, "learning_rate": 4.7075960881597236e-05, "loss": 1.033, "step": 717 }, { "epoch": 1.0584116454763222, "grad_norm": 2.260459958085286, "learning_rate": 4.706666495428105e-05, "loss": 0.7694, "step": 718 }, { "epoch": 1.0598857564031694, "grad_norm": 2.4586578952311506, "learning_rate": 4.705735519470636e-05, "loss": 0.9615, "step": 719 }, { "epoch": 1.0613598673300166, "grad_norm": 2.5995880974429753, "learning_rate": 4.7048031608708876e-05, "loss": 0.8581, "step": 720 }, { "epoch": 1.0628339782568639, "grad_norm": 2.087093647221645, "learning_rate": 4.703869420213301e-05, "loss": 0.7168, "step": 721 }, { "epoch": 1.0643080891837111, "grad_norm": 2.1483939981391296, "learning_rate": 4.702934298083181e-05, "loss": 0.8436, "step": 722 }, { "epoch": 1.0657822001105584, "grad_norm": 3.034583549522201, "learning_rate": 4.701997795066699e-05, "loss": 1.02, "step": 723 }, { "epoch": 1.0672563110374056, "grad_norm": 2.4088258703820875, "learning_rate": 4.701059911750893e-05, "loss": 0.8257, "step": 724 }, { "epoch": 1.0687304219642528, "grad_norm": 2.3208462393474316, "learning_rate": 4.7001206487236644e-05, "loss": 0.825, "step": 725 }, { "epoch": 1.0702045328911, "grad_norm": 2.264900459982277, "learning_rate": 4.69918000657378e-05, "loss": 0.82, "step": 726 }, { "epoch": 1.0716786438179473, "grad_norm": 2.239340859231949, "learning_rate": 4.698237985890873e-05, "loss": 0.9044, "step": 727 }, { "epoch": 1.0731527547447945, "grad_norm": 2.5059707866588643, "learning_rate": 4.697294587265438e-05, "loss": 0.7921, "step": 728 }, { "epoch": 1.0746268656716418, "grad_norm": 2.119190352599152, "learning_rate": 4.696349811288836e-05, "loss": 0.7279, "step": 729 }, { "epoch": 1.076100976598489, "grad_norm": 2.578683965949751, "learning_rate": 4.695403658553288e-05, "loss": 1.0579, "step": 730 }, { "epoch": 1.0775750875253363, "grad_norm": 2.3334059249848482, "learning_rate": 4.6944561296518816e-05, "loss": 0.8225, "step": 731 }, { "epoch": 1.0790491984521835, "grad_norm": 2.392859517891362, "learning_rate": 4.693507225178564e-05, "loss": 0.7913, "step": 732 }, { "epoch": 1.0805233093790307, "grad_norm": 2.2907462843860973, "learning_rate": 4.692556945728147e-05, "loss": 0.8355, "step": 733 }, { "epoch": 1.081997420305878, "grad_norm": 2.3693387671225685, "learning_rate": 4.691605291896304e-05, "loss": 0.8877, "step": 734 }, { "epoch": 1.0834715312327252, "grad_norm": 2.5390324517452756, "learning_rate": 4.690652264279567e-05, "loss": 0.9675, "step": 735 }, { "epoch": 1.0849456421595725, "grad_norm": 2.1871462944717783, "learning_rate": 4.689697863475334e-05, "loss": 0.8036, "step": 736 }, { "epoch": 1.0864197530864197, "grad_norm": 2.316065230846043, "learning_rate": 4.688742090081859e-05, "loss": 0.8461, "step": 737 }, { "epoch": 1.087893864013267, "grad_norm": 2.054642245298327, "learning_rate": 4.68778494469826e-05, "loss": 0.8055, "step": 738 }, { "epoch": 1.0893679749401142, "grad_norm": 2.2286409013525255, "learning_rate": 4.686826427924514e-05, "loss": 0.9437, "step": 739 }, { "epoch": 1.0908420858669614, "grad_norm": 2.2614160839218336, "learning_rate": 4.685866540361456e-05, "loss": 0.7841, "step": 740 }, { "epoch": 1.0923161967938086, "grad_norm": 2.4061217831226425, "learning_rate": 4.684905282610781e-05, "loss": 0.8496, "step": 741 }, { "epoch": 1.0937903077206559, "grad_norm": 2.525519772654792, "learning_rate": 4.6839426552750454e-05, "loss": 0.8416, "step": 742 }, { "epoch": 1.0952644186475031, "grad_norm": 2.7532760873097453, "learning_rate": 4.6829786589576604e-05, "loss": 0.886, "step": 743 }, { "epoch": 1.0967385295743504, "grad_norm": 2.5842406822642565, "learning_rate": 4.6820132942628974e-05, "loss": 0.7743, "step": 744 }, { "epoch": 1.0982126405011976, "grad_norm": 2.411156148631502, "learning_rate": 4.6810465617958856e-05, "loss": 0.7514, "step": 745 }, { "epoch": 1.099686751428045, "grad_norm": 2.2904592165210858, "learning_rate": 4.680078462162611e-05, "loss": 0.835, "step": 746 }, { "epoch": 1.1011608623548923, "grad_norm": 2.321791155239546, "learning_rate": 4.679108995969917e-05, "loss": 0.802, "step": 747 }, { "epoch": 1.1026349732817395, "grad_norm": 2.2091700090545885, "learning_rate": 4.678138163825503e-05, "loss": 0.8985, "step": 748 }, { "epoch": 1.1041090842085868, "grad_norm": 2.4437822422720163, "learning_rate": 4.677165966337924e-05, "loss": 0.8313, "step": 749 }, { "epoch": 1.105583195135434, "grad_norm": 2.0564354253490116, "learning_rate": 4.676192404116594e-05, "loss": 0.7442, "step": 750 }, { "epoch": 1.105583195135434, "eval_bleu": 0.08292343322990926, "eval_bleu_1gram": 0.40388178354228776, "eval_bleu_2gram": 0.16988408226460286, "eval_bleu_3gram": 0.0781712046594634, "eval_bleu_4gram": 0.038341580023742564, "eval_rag_val_loss": 1.1190900478952674, "eval_rouge1": 0.40014123658572504, "eval_rouge2": 0.16459563476986266, "eval_rougeL": 0.38317503573235917, "step": 750 }, { "epoch": 1.1070573060622813, "grad_norm": 2.293199777743337, "learning_rate": 4.6752174777717786e-05, "loss": 0.7952, "step": 751 }, { "epoch": 1.1085314169891285, "grad_norm": 2.5362549268769623, "learning_rate": 4.674241187914601e-05, "loss": 0.709, "step": 752 }, { "epoch": 1.1100055279159757, "grad_norm": 3.3617617622760347, "learning_rate": 4.673263535157038e-05, "loss": 0.9157, "step": 753 }, { "epoch": 1.111479638842823, "grad_norm": 2.3033379175533417, "learning_rate": 4.6722845201119214e-05, "loss": 0.8301, "step": 754 }, { "epoch": 1.1129537497696702, "grad_norm": 2.545868938174176, "learning_rate": 4.671304143392936e-05, "loss": 0.8996, "step": 755 }, { "epoch": 1.1144278606965174, "grad_norm": 2.4083779573981223, "learning_rate": 4.670322405614621e-05, "loss": 0.8203, "step": 756 }, { "epoch": 1.1159019716233647, "grad_norm": 2.49922883537621, "learning_rate": 4.6693393073923686e-05, "loss": 0.8249, "step": 757 }, { "epoch": 1.117376082550212, "grad_norm": 2.0291276848382296, "learning_rate": 4.6683548493424236e-05, "loss": 0.7952, "step": 758 }, { "epoch": 1.1188501934770592, "grad_norm": 2.389587177072863, "learning_rate": 4.667369032081883e-05, "loss": 0.7874, "step": 759 }, { "epoch": 1.1203243044039064, "grad_norm": 2.400985408187307, "learning_rate": 4.666381856228697e-05, "loss": 0.8999, "step": 760 }, { "epoch": 1.1217984153307536, "grad_norm": 2.8013357893043196, "learning_rate": 4.665393322401664e-05, "loss": 1.0759, "step": 761 }, { "epoch": 1.1232725262576009, "grad_norm": 2.373567148953153, "learning_rate": 4.6644034312204387e-05, "loss": 0.7489, "step": 762 }, { "epoch": 1.1247466371844481, "grad_norm": 2.111416203499304, "learning_rate": 4.6634121833055235e-05, "loss": 0.7707, "step": 763 }, { "epoch": 1.1262207481112954, "grad_norm": 2.70247990843356, "learning_rate": 4.662419579278271e-05, "loss": 0.8293, "step": 764 }, { "epoch": 1.1276948590381426, "grad_norm": 2.5788566707085256, "learning_rate": 4.6614256197608855e-05, "loss": 0.8545, "step": 765 }, { "epoch": 1.1291689699649898, "grad_norm": 2.1957018089595883, "learning_rate": 4.660430305376419e-05, "loss": 0.874, "step": 766 }, { "epoch": 1.130643080891837, "grad_norm": 3.084388054198803, "learning_rate": 4.659433636748775e-05, "loss": 0.9721, "step": 767 }, { "epoch": 1.1321171918186843, "grad_norm": 2.2942672769877612, "learning_rate": 4.658435614502705e-05, "loss": 0.7703, "step": 768 }, { "epoch": 1.1335913027455315, "grad_norm": 2.482505430770727, "learning_rate": 4.657436239263808e-05, "loss": 0.821, "step": 769 }, { "epoch": 1.1350654136723788, "grad_norm": 2.482982509162742, "learning_rate": 4.6564355116585325e-05, "loss": 0.8797, "step": 770 }, { "epoch": 1.136539524599226, "grad_norm": 2.5359962141628465, "learning_rate": 4.655433432314174e-05, "loss": 0.7386, "step": 771 }, { "epoch": 1.1380136355260733, "grad_norm": 2.4201100471002728, "learning_rate": 4.654430001858874e-05, "loss": 0.8604, "step": 772 }, { "epoch": 1.1394877464529205, "grad_norm": 2.312822783341243, "learning_rate": 4.653425220921626e-05, "loss": 0.8591, "step": 773 }, { "epoch": 1.1409618573797677, "grad_norm": 2.9081326201367967, "learning_rate": 4.6524190901322626e-05, "loss": 0.8528, "step": 774 }, { "epoch": 1.142435968306615, "grad_norm": 2.592461783146748, "learning_rate": 4.651411610121469e-05, "loss": 0.8673, "step": 775 }, { "epoch": 1.1439100792334624, "grad_norm": 2.4452477589772377, "learning_rate": 4.650402781520772e-05, "loss": 0.8516, "step": 776 }, { "epoch": 1.1453841901603097, "grad_norm": 2.4657401557206806, "learning_rate": 4.649392604962546e-05, "loss": 0.9178, "step": 777 }, { "epoch": 1.146858301087157, "grad_norm": 2.6651271806898276, "learning_rate": 4.648381081080009e-05, "loss": 0.8589, "step": 778 }, { "epoch": 1.1483324120140042, "grad_norm": 2.328442033520868, "learning_rate": 4.647368210507225e-05, "loss": 0.8805, "step": 779 }, { "epoch": 1.1498065229408514, "grad_norm": 2.3410924467648253, "learning_rate": 4.6463539938791e-05, "loss": 0.7942, "step": 780 }, { "epoch": 1.1512806338676986, "grad_norm": 2.533882938128876, "learning_rate": 4.645338431831388e-05, "loss": 0.7567, "step": 781 }, { "epoch": 1.1527547447945459, "grad_norm": 2.23125063257716, "learning_rate": 4.6443215250006806e-05, "loss": 0.7193, "step": 782 }, { "epoch": 1.154228855721393, "grad_norm": 2.4971688452157403, "learning_rate": 4.643303274024416e-05, "loss": 0.8776, "step": 783 }, { "epoch": 1.1557029666482403, "grad_norm": 3.2206917248414824, "learning_rate": 4.642283679540874e-05, "loss": 0.9035, "step": 784 }, { "epoch": 1.1571770775750876, "grad_norm": 2.665899722755361, "learning_rate": 4.641262742189178e-05, "loss": 0.9324, "step": 785 }, { "epoch": 1.1586511885019348, "grad_norm": 2.3827116397711205, "learning_rate": 4.640240462609291e-05, "loss": 0.924, "step": 786 }, { "epoch": 1.160125299428782, "grad_norm": 2.372539651296192, "learning_rate": 4.639216841442018e-05, "loss": 0.8651, "step": 787 }, { "epoch": 1.1615994103556293, "grad_norm": 2.518613757668083, "learning_rate": 4.6381918793290055e-05, "loss": 1.0386, "step": 788 }, { "epoch": 1.1630735212824765, "grad_norm": 2.4674536283052846, "learning_rate": 4.6371655769127396e-05, "loss": 0.8576, "step": 789 }, { "epoch": 1.1645476322093238, "grad_norm": 2.714565432985493, "learning_rate": 4.63613793483655e-05, "loss": 0.8011, "step": 790 }, { "epoch": 1.166021743136171, "grad_norm": 2.473984396278559, "learning_rate": 4.6351089537446e-05, "loss": 0.9507, "step": 791 }, { "epoch": 1.1674958540630183, "grad_norm": 2.180178897936602, "learning_rate": 4.6340786342818964e-05, "loss": 0.8685, "step": 792 }, { "epoch": 1.1689699649898655, "grad_norm": 2.2181550423701646, "learning_rate": 4.633046977094286e-05, "loss": 0.8922, "step": 793 }, { "epoch": 1.1704440759167127, "grad_norm": 2.667900475540213, "learning_rate": 4.632013982828451e-05, "loss": 0.9013, "step": 794 }, { "epoch": 1.17191818684356, "grad_norm": 2.552505728092174, "learning_rate": 4.630979652131913e-05, "loss": 0.8824, "step": 795 }, { "epoch": 1.1733922977704072, "grad_norm": 2.164514157092737, "learning_rate": 4.629943985653032e-05, "loss": 0.92, "step": 796 }, { "epoch": 1.1748664086972544, "grad_norm": 2.790939767819731, "learning_rate": 4.6289069840410036e-05, "loss": 0.8572, "step": 797 }, { "epoch": 1.1763405196241017, "grad_norm": 2.5025651169878484, "learning_rate": 4.627868647945863e-05, "loss": 0.7466, "step": 798 }, { "epoch": 1.177814630550949, "grad_norm": 2.0716448257575752, "learning_rate": 4.62682897801848e-05, "loss": 0.7518, "step": 799 }, { "epoch": 1.1792887414777962, "grad_norm": 2.4165900865018872, "learning_rate": 4.625787974910559e-05, "loss": 0.7699, "step": 800 }, { "epoch": 1.1807628524046434, "grad_norm": 2.6272299241367865, "learning_rate": 4.6247456392746444e-05, "loss": 0.7614, "step": 801 }, { "epoch": 1.1822369633314906, "grad_norm": 2.2130788401519434, "learning_rate": 4.623701971764112e-05, "loss": 0.8002, "step": 802 }, { "epoch": 1.1837110742583379, "grad_norm": 2.4534662094448034, "learning_rate": 4.622656973033174e-05, "loss": 0.6557, "step": 803 }, { "epoch": 1.1851851851851851, "grad_norm": 2.3786743251452016, "learning_rate": 4.621610643736878e-05, "loss": 0.8378, "step": 804 }, { "epoch": 1.1866592961120324, "grad_norm": 2.496942557899795, "learning_rate": 4.620562984531103e-05, "loss": 0.8238, "step": 805 }, { "epoch": 1.1881334070388796, "grad_norm": 2.4279556234976933, "learning_rate": 4.619513996072564e-05, "loss": 0.8793, "step": 806 }, { "epoch": 1.1896075179657268, "grad_norm": 2.36178162970895, "learning_rate": 4.618463679018808e-05, "loss": 0.9319, "step": 807 }, { "epoch": 1.191081628892574, "grad_norm": 2.1724634265345246, "learning_rate": 4.617412034028217e-05, "loss": 0.8022, "step": 808 }, { "epoch": 1.1925557398194213, "grad_norm": 2.339502376560896, "learning_rate": 4.616359061760001e-05, "loss": 0.8927, "step": 809 }, { "epoch": 1.1940298507462686, "grad_norm": 2.5577972803146576, "learning_rate": 4.6153047628742066e-05, "loss": 0.813, "step": 810 }, { "epoch": 1.1955039616731158, "grad_norm": 2.332840617700921, "learning_rate": 4.61424913803171e-05, "loss": 0.8248, "step": 811 }, { "epoch": 1.196978072599963, "grad_norm": 2.288492906549023, "learning_rate": 4.613192187894218e-05, "loss": 0.8591, "step": 812 }, { "epoch": 1.1984521835268105, "grad_norm": 2.1997156479610647, "learning_rate": 4.612133913124268e-05, "loss": 0.8382, "step": 813 }, { "epoch": 1.1999262944536577, "grad_norm": 2.310201791644019, "learning_rate": 4.61107431438523e-05, "loss": 0.97, "step": 814 }, { "epoch": 1.201400405380505, "grad_norm": 2.3776206565750275, "learning_rate": 4.610013392341301e-05, "loss": 0.8326, "step": 815 }, { "epoch": 1.2028745163073522, "grad_norm": 2.3781523613978597, "learning_rate": 4.608951147657511e-05, "loss": 0.815, "step": 816 }, { "epoch": 1.2043486272341994, "grad_norm": 2.1641668301438934, "learning_rate": 4.607887580999715e-05, "loss": 0.8612, "step": 817 }, { "epoch": 1.2058227381610467, "grad_norm": 2.5554133792617413, "learning_rate": 4.6068226930345995e-05, "loss": 0.8113, "step": 818 }, { "epoch": 1.207296849087894, "grad_norm": 2.274560814689121, "learning_rate": 4.605756484429678e-05, "loss": 0.8522, "step": 819 }, { "epoch": 1.2087709600147412, "grad_norm": 2.255916129054745, "learning_rate": 4.604688955853293e-05, "loss": 0.7605, "step": 820 }, { "epoch": 1.2102450709415884, "grad_norm": 2.503181531174904, "learning_rate": 4.603620107974612e-05, "loss": 0.8171, "step": 821 }, { "epoch": 1.2117191818684356, "grad_norm": 2.5108956372059636, "learning_rate": 4.602549941463633e-05, "loss": 0.9439, "step": 822 }, { "epoch": 1.2131932927952829, "grad_norm": 2.6321332377882634, "learning_rate": 4.601478456991178e-05, "loss": 0.891, "step": 823 }, { "epoch": 1.2146674037221301, "grad_norm": 3.2747456342617225, "learning_rate": 4.6004056552288956e-05, "loss": 0.7429, "step": 824 }, { "epoch": 1.2161415146489774, "grad_norm": 2.4564611989627023, "learning_rate": 4.5993315368492603e-05, "loss": 0.788, "step": 825 }, { "epoch": 1.2176156255758246, "grad_norm": 2.1552819553028897, "learning_rate": 4.5982561025255726e-05, "loss": 0.7747, "step": 826 }, { "epoch": 1.2190897365026718, "grad_norm": 2.0848864932918745, "learning_rate": 4.5971793529319576e-05, "loss": 0.7032, "step": 827 }, { "epoch": 1.220563847429519, "grad_norm": 2.3348198197501757, "learning_rate": 4.596101288743362e-05, "loss": 0.8262, "step": 828 }, { "epoch": 1.2220379583563663, "grad_norm": 3.0461700406418397, "learning_rate": 4.595021910635563e-05, "loss": 0.9109, "step": 829 }, { "epoch": 1.2235120692832135, "grad_norm": 2.77456083517085, "learning_rate": 4.5939412192851535e-05, "loss": 0.9004, "step": 830 }, { "epoch": 1.2249861802100608, "grad_norm": 2.5153826484277646, "learning_rate": 4.592859215369557e-05, "loss": 0.8417, "step": 831 }, { "epoch": 1.226460291136908, "grad_norm": 2.6889174515894685, "learning_rate": 4.591775899567015e-05, "loss": 0.9771, "step": 832 }, { "epoch": 1.2279344020637553, "grad_norm": 2.2809759524096678, "learning_rate": 4.590691272556592e-05, "loss": 0.7003, "step": 833 }, { "epoch": 1.2294085129906025, "grad_norm": 2.3585271259209284, "learning_rate": 4.589605335018176e-05, "loss": 0.785, "step": 834 }, { "epoch": 1.2308826239174497, "grad_norm": 2.448902456494452, "learning_rate": 4.588518087632475e-05, "loss": 0.9048, "step": 835 }, { "epoch": 1.232356734844297, "grad_norm": 2.3362636784969, "learning_rate": 4.587429531081019e-05, "loss": 0.8234, "step": 836 }, { "epoch": 1.2338308457711442, "grad_norm": 2.6736503190857923, "learning_rate": 4.5863396660461575e-05, "loss": 0.8626, "step": 837 }, { "epoch": 1.2353049566979915, "grad_norm": 2.439505607828402, "learning_rate": 4.585248493211063e-05, "loss": 0.8044, "step": 838 }, { "epoch": 1.2367790676248387, "grad_norm": 2.52149288492222, "learning_rate": 4.5841560132597244e-05, "loss": 0.7156, "step": 839 }, { "epoch": 1.238253178551686, "grad_norm": 2.8335428347455576, "learning_rate": 4.583062226876952e-05, "loss": 0.9819, "step": 840 }, { "epoch": 1.2397272894785332, "grad_norm": 2.2203757548868626, "learning_rate": 4.5819671347483725e-05, "loss": 0.6908, "step": 841 }, { "epoch": 1.2412014004053806, "grad_norm": 2.2251434622815616, "learning_rate": 4.580870737560435e-05, "loss": 0.8373, "step": 842 }, { "epoch": 1.2426755113322279, "grad_norm": 2.3563523288028967, "learning_rate": 4.579773036000405e-05, "loss": 1.0025, "step": 843 }, { "epoch": 1.244149622259075, "grad_norm": 2.2443699864772633, "learning_rate": 4.5786740307563636e-05, "loss": 0.7333, "step": 844 }, { "epoch": 1.2456237331859223, "grad_norm": 2.023506428490851, "learning_rate": 4.577573722517211e-05, "loss": 0.8088, "step": 845 }, { "epoch": 1.2470978441127696, "grad_norm": 2.3109518487883327, "learning_rate": 4.5764721119726653e-05, "loss": 0.8105, "step": 846 }, { "epoch": 1.2485719550396168, "grad_norm": 2.164532662007543, "learning_rate": 4.575369199813258e-05, "loss": 0.8183, "step": 847 }, { "epoch": 1.250046065966464, "grad_norm": 2.2728237096960346, "learning_rate": 4.5742649867303386e-05, "loss": 0.666, "step": 848 }, { "epoch": 1.2515201768933113, "grad_norm": 2.3791537852921203, "learning_rate": 4.573159473416072e-05, "loss": 0.7859, "step": 849 }, { "epoch": 1.2529942878201585, "grad_norm": 2.461477165994964, "learning_rate": 4.572052660563437e-05, "loss": 0.8129, "step": 850 }, { "epoch": 1.2544683987470058, "grad_norm": 2.372570200324692, "learning_rate": 4.570944548866228e-05, "loss": 0.826, "step": 851 }, { "epoch": 1.255942509673853, "grad_norm": 2.1840147318919563, "learning_rate": 4.569835139019054e-05, "loss": 0.7885, "step": 852 }, { "epoch": 1.2574166206007003, "grad_norm": 2.1937091443205246, "learning_rate": 4.5687244317173356e-05, "loss": 0.8087, "step": 853 }, { "epoch": 1.2588907315275475, "grad_norm": 2.3425556445764553, "learning_rate": 4.567612427657308e-05, "loss": 0.9755, "step": 854 }, { "epoch": 1.2603648424543947, "grad_norm": 2.317335383094229, "learning_rate": 4.566499127536021e-05, "loss": 0.8566, "step": 855 }, { "epoch": 1.261838953381242, "grad_norm": 2.694578828333525, "learning_rate": 4.565384532051335e-05, "loss": 0.9246, "step": 856 }, { "epoch": 1.2633130643080892, "grad_norm": 2.6199264542252547, "learning_rate": 4.56426864190192e-05, "loss": 0.8442, "step": 857 }, { "epoch": 1.2647871752349364, "grad_norm": 2.2234223290875503, "learning_rate": 4.563151457787263e-05, "loss": 0.7931, "step": 858 }, { "epoch": 1.2662612861617837, "grad_norm": 2.6271965282798253, "learning_rate": 4.562032980407658e-05, "loss": 0.8958, "step": 859 }, { "epoch": 1.267735397088631, "grad_norm": 2.0805938511192976, "learning_rate": 4.56091321046421e-05, "loss": 0.6799, "step": 860 }, { "epoch": 1.2692095080154782, "grad_norm": 2.4828648805944527, "learning_rate": 4.5597921486588366e-05, "loss": 0.8776, "step": 861 }, { "epoch": 1.2706836189423254, "grad_norm": 2.1290896835394695, "learning_rate": 4.558669795694263e-05, "loss": 0.8578, "step": 862 }, { "epoch": 1.2721577298691726, "grad_norm": 2.6366854743270656, "learning_rate": 4.557546152274025e-05, "loss": 0.8835, "step": 863 }, { "epoch": 1.2736318407960199, "grad_norm": 2.449576952474396, "learning_rate": 4.556421219102466e-05, "loss": 0.7346, "step": 864 }, { "epoch": 1.2751059517228671, "grad_norm": 2.314669236020597, "learning_rate": 4.555294996884738e-05, "loss": 0.7785, "step": 865 }, { "epoch": 1.2765800626497144, "grad_norm": 2.0648533659590935, "learning_rate": 4.5541674863268035e-05, "loss": 0.8054, "step": 866 }, { "epoch": 1.2780541735765616, "grad_norm": 2.3816160105965722, "learning_rate": 4.553038688135429e-05, "loss": 0.9817, "step": 867 }, { "epoch": 1.2795282845034088, "grad_norm": 2.345645595737605, "learning_rate": 4.551908603018191e-05, "loss": 0.8724, "step": 868 }, { "epoch": 1.281002395430256, "grad_norm": 2.196301437756931, "learning_rate": 4.5507772316834715e-05, "loss": 0.836, "step": 869 }, { "epoch": 1.2824765063571033, "grad_norm": 2.4068744394147683, "learning_rate": 4.549644574840458e-05, "loss": 0.9897, "step": 870 }, { "epoch": 1.2839506172839505, "grad_norm": 1.9572776289093554, "learning_rate": 4.5485106331991446e-05, "loss": 0.7574, "step": 871 }, { "epoch": 1.2854247282107978, "grad_norm": 2.196187126809912, "learning_rate": 4.5473754074703324e-05, "loss": 0.7494, "step": 872 }, { "epoch": 1.286898839137645, "grad_norm": 2.2234169675585362, "learning_rate": 4.546238898365623e-05, "loss": 0.8459, "step": 873 }, { "epoch": 1.2883729500644923, "grad_norm": 2.1768600008609624, "learning_rate": 4.545101106597428e-05, "loss": 0.9016, "step": 874 }, { "epoch": 1.2898470609913395, "grad_norm": 2.359894651065941, "learning_rate": 4.5439620328789593e-05, "loss": 0.825, "step": 875 }, { "epoch": 1.2913211719181867, "grad_norm": 2.7216626054866255, "learning_rate": 4.5428216779242336e-05, "loss": 0.8504, "step": 876 }, { "epoch": 1.292795282845034, "grad_norm": 2.3518071348367395, "learning_rate": 4.541680042448069e-05, "loss": 0.8343, "step": 877 }, { "epoch": 1.2942693937718812, "grad_norm": 2.297116740474122, "learning_rate": 4.540537127166089e-05, "loss": 0.8241, "step": 878 }, { "epoch": 1.2957435046987285, "grad_norm": 2.226216071768296, "learning_rate": 4.5393929327947195e-05, "loss": 0.8368, "step": 879 }, { "epoch": 1.2972176156255757, "grad_norm": 2.5852968280897755, "learning_rate": 4.538247460051184e-05, "loss": 0.8389, "step": 880 }, { "epoch": 1.298691726552423, "grad_norm": 2.486196270223549, "learning_rate": 4.537100709653512e-05, "loss": 0.7643, "step": 881 }, { "epoch": 1.3001658374792704, "grad_norm": 2.6160883630941916, "learning_rate": 4.535952682320531e-05, "loss": 0.9142, "step": 882 }, { "epoch": 1.3016399484061176, "grad_norm": 2.513485680051602, "learning_rate": 4.534803378771871e-05, "loss": 0.8208, "step": 883 }, { "epoch": 1.3031140593329649, "grad_norm": 2.239548994880436, "learning_rate": 4.53365279972796e-05, "loss": 0.8456, "step": 884 }, { "epoch": 1.304588170259812, "grad_norm": 2.868298224486053, "learning_rate": 4.532500945910026e-05, "loss": 0.9507, "step": 885 }, { "epoch": 1.3060622811866593, "grad_norm": 2.320034125997865, "learning_rate": 4.5313478180400995e-05, "loss": 0.9082, "step": 886 }, { "epoch": 1.3075363921135066, "grad_norm": 2.4658586978757326, "learning_rate": 4.530193416841003e-05, "loss": 0.8473, "step": 887 }, { "epoch": 1.3090105030403538, "grad_norm": 2.531049838217102, "learning_rate": 4.529037743036362e-05, "loss": 0.9711, "step": 888 }, { "epoch": 1.310484613967201, "grad_norm": 2.247798054146351, "learning_rate": 4.5278807973506e-05, "loss": 0.7747, "step": 889 }, { "epoch": 1.3119587248940483, "grad_norm": 2.3362300013486816, "learning_rate": 4.526722580508934e-05, "loss": 0.7627, "step": 890 }, { "epoch": 1.3134328358208955, "grad_norm": 2.4619621928865767, "learning_rate": 4.525563093237383e-05, "loss": 0.8652, "step": 891 }, { "epoch": 1.3149069467477428, "grad_norm": 2.4526525151759624, "learning_rate": 4.524402336262756e-05, "loss": 0.8451, "step": 892 }, { "epoch": 1.31638105767459, "grad_norm": 2.6461569720797424, "learning_rate": 4.523240310312664e-05, "loss": 0.8932, "step": 893 }, { "epoch": 1.3178551686014373, "grad_norm": 2.4686371982734157, "learning_rate": 4.522077016115511e-05, "loss": 0.8441, "step": 894 }, { "epoch": 1.3193292795282845, "grad_norm": 2.7497753571743084, "learning_rate": 4.520912454400494e-05, "loss": 0.9259, "step": 895 }, { "epoch": 1.3208033904551317, "grad_norm": 2.3121278824269313, "learning_rate": 4.519746625897607e-05, "loss": 0.8893, "step": 896 }, { "epoch": 1.322277501381979, "grad_norm": 2.7354664395392794, "learning_rate": 4.518579531337638e-05, "loss": 0.9888, "step": 897 }, { "epoch": 1.3237516123088262, "grad_norm": 2.354596484171223, "learning_rate": 4.5174111714521685e-05, "loss": 0.7556, "step": 898 }, { "epoch": 1.3252257232356734, "grad_norm": 2.2428647390901673, "learning_rate": 4.516241546973571e-05, "loss": 0.8873, "step": 899 }, { "epoch": 1.3266998341625207, "grad_norm": 2.0372398678467607, "learning_rate": 4.515070658635013e-05, "loss": 0.7688, "step": 900 }, { "epoch": 1.328173945089368, "grad_norm": 2.5089236260866223, "learning_rate": 4.5138985071704546e-05, "loss": 0.8411, "step": 901 }, { "epoch": 1.3296480560162152, "grad_norm": 2.084225069570223, "learning_rate": 4.512725093314645e-05, "loss": 0.7, "step": 902 }, { "epoch": 1.3311221669430624, "grad_norm": 2.5799733327625916, "learning_rate": 4.5115504178031285e-05, "loss": 0.9132, "step": 903 }, { "epoch": 1.3325962778699096, "grad_norm": 2.5294349662170665, "learning_rate": 4.5103744813722374e-05, "loss": 0.9491, "step": 904 }, { "epoch": 1.3340703887967569, "grad_norm": 2.418511892028051, "learning_rate": 4.509197284759094e-05, "loss": 0.8336, "step": 905 }, { "epoch": 1.3355444997236043, "grad_norm": 2.518406437995757, "learning_rate": 4.508018828701612e-05, "loss": 0.7807, "step": 906 }, { "epoch": 1.3370186106504516, "grad_norm": 2.1784320893943487, "learning_rate": 4.506839113938496e-05, "loss": 0.8636, "step": 907 }, { "epoch": 1.3384927215772988, "grad_norm": 2.455829077610056, "learning_rate": 4.505658141209237e-05, "loss": 0.8382, "step": 908 }, { "epoch": 1.339966832504146, "grad_norm": 2.499296470833561, "learning_rate": 4.504475911254115e-05, "loss": 0.9979, "step": 909 }, { "epoch": 1.3414409434309933, "grad_norm": 2.1589529814766246, "learning_rate": 4.503292424814198e-05, "loss": 0.7633, "step": 910 }, { "epoch": 1.3429150543578405, "grad_norm": 2.5665213502478554, "learning_rate": 4.502107682631343e-05, "loss": 0.9805, "step": 911 }, { "epoch": 1.3443891652846878, "grad_norm": 2.6706858883377245, "learning_rate": 4.500921685448193e-05, "loss": 0.8444, "step": 912 }, { "epoch": 1.345863276211535, "grad_norm": 2.333291791364441, "learning_rate": 4.499734434008178e-05, "loss": 0.8954, "step": 913 }, { "epoch": 1.3473373871383822, "grad_norm": 2.3843426903451697, "learning_rate": 4.498545929055515e-05, "loss": 0.7607, "step": 914 }, { "epoch": 1.3488114980652295, "grad_norm": 2.245926348211485, "learning_rate": 4.497356171335204e-05, "loss": 0.8586, "step": 915 }, { "epoch": 1.3502856089920767, "grad_norm": 2.6560331368275096, "learning_rate": 4.496165161593035e-05, "loss": 0.9519, "step": 916 }, { "epoch": 1.351759719918924, "grad_norm": 2.877085219780158, "learning_rate": 4.4949729005755765e-05, "loss": 0.9, "step": 917 }, { "epoch": 1.3532338308457712, "grad_norm": 2.4363896702070367, "learning_rate": 4.493779389030187e-05, "loss": 0.8379, "step": 918 }, { "epoch": 1.3547079417726184, "grad_norm": 2.293084682672657, "learning_rate": 4.492584627705008e-05, "loss": 0.8403, "step": 919 }, { "epoch": 1.3561820526994657, "grad_norm": 2.3165223476913654, "learning_rate": 4.491388617348959e-05, "loss": 0.9033, "step": 920 }, { "epoch": 1.357656163626313, "grad_norm": 2.4118183655427163, "learning_rate": 4.490191358711751e-05, "loss": 0.7305, "step": 921 }, { "epoch": 1.3591302745531602, "grad_norm": 2.5354581162369643, "learning_rate": 4.488992852543871e-05, "loss": 0.8474, "step": 922 }, { "epoch": 1.3606043854800074, "grad_norm": 2.741705523353726, "learning_rate": 4.4877930995965905e-05, "loss": 0.8356, "step": 923 }, { "epoch": 1.3620784964068546, "grad_norm": 2.253735831708587, "learning_rate": 4.486592100621961e-05, "loss": 0.7795, "step": 924 }, { "epoch": 1.3635526073337019, "grad_norm": 2.3218679641062536, "learning_rate": 4.4853898563728184e-05, "loss": 0.8719, "step": 925 }, { "epoch": 1.365026718260549, "grad_norm": 2.5949077550115422, "learning_rate": 4.484186367602775e-05, "loss": 0.8517, "step": 926 }, { "epoch": 1.3665008291873963, "grad_norm": 2.150126289384587, "learning_rate": 4.482981635066227e-05, "loss": 0.8055, "step": 927 }, { "epoch": 1.3679749401142436, "grad_norm": 2.308131860784296, "learning_rate": 4.481775659518346e-05, "loss": 0.8018, "step": 928 }, { "epoch": 1.3694490510410908, "grad_norm": 2.2224809681406854, "learning_rate": 4.480568441715086e-05, "loss": 0.7956, "step": 929 }, { "epoch": 1.370923161967938, "grad_norm": 2.609376187809657, "learning_rate": 4.479359982413181e-05, "loss": 0.9657, "step": 930 }, { "epoch": 1.3723972728947853, "grad_norm": 2.595035280491313, "learning_rate": 4.478150282370138e-05, "loss": 0.9011, "step": 931 }, { "epoch": 1.3738713838216325, "grad_norm": 2.383137865209381, "learning_rate": 4.476939342344246e-05, "loss": 0.8034, "step": 932 }, { "epoch": 1.3753454947484798, "grad_norm": 2.248625759490426, "learning_rate": 4.475727163094572e-05, "loss": 0.8022, "step": 933 }, { "epoch": 1.376819605675327, "grad_norm": 2.1691994412975237, "learning_rate": 4.474513745380955e-05, "loss": 0.7787, "step": 934 }, { "epoch": 1.3782937166021743, "grad_norm": 2.5495466016216164, "learning_rate": 4.473299089964015e-05, "loss": 0.8059, "step": 935 }, { "epoch": 1.3797678275290215, "grad_norm": 2.4248369378392978, "learning_rate": 4.472083197605146e-05, "loss": 0.8201, "step": 936 }, { "epoch": 1.3812419384558687, "grad_norm": 2.5784910000031354, "learning_rate": 4.470866069066516e-05, "loss": 0.9597, "step": 937 }, { "epoch": 1.382716049382716, "grad_norm": 2.1359997085185336, "learning_rate": 4.4696477051110705e-05, "loss": 0.7734, "step": 938 }, { "epoch": 1.3841901603095632, "grad_norm": 2.613386348783473, "learning_rate": 4.468428106502528e-05, "loss": 0.868, "step": 939 }, { "epoch": 1.3856642712364104, "grad_norm": 2.2979833303300112, "learning_rate": 4.4672072740053816e-05, "loss": 0.8802, "step": 940 }, { "epoch": 1.3871383821632577, "grad_norm": 2.5541047064363727, "learning_rate": 4.4659852083848975e-05, "loss": 0.7974, "step": 941 }, { "epoch": 1.388612493090105, "grad_norm": 2.405059123432734, "learning_rate": 4.464761910407113e-05, "loss": 0.8732, "step": 942 }, { "epoch": 1.3900866040169522, "grad_norm": 2.1344459622428635, "learning_rate": 4.463537380838841e-05, "loss": 0.8276, "step": 943 }, { "epoch": 1.3915607149437994, "grad_norm": 2.3898657297381285, "learning_rate": 4.462311620447666e-05, "loss": 0.8905, "step": 944 }, { "epoch": 1.3930348258706466, "grad_norm": 2.2123365525040284, "learning_rate": 4.461084630001942e-05, "loss": 0.9081, "step": 945 }, { "epoch": 1.3945089367974939, "grad_norm": 2.322873740947156, "learning_rate": 4.459856410270795e-05, "loss": 0.8648, "step": 946 }, { "epoch": 1.3959830477243411, "grad_norm": 2.38677049446517, "learning_rate": 4.4586269620241216e-05, "loss": 0.8257, "step": 947 }, { "epoch": 1.3974571586511886, "grad_norm": 2.3009327448399826, "learning_rate": 4.457396286032589e-05, "loss": 0.7025, "step": 948 }, { "epoch": 1.3989312695780358, "grad_norm": 2.2345356916785315, "learning_rate": 4.4561643830676336e-05, "loss": 0.892, "step": 949 }, { "epoch": 1.400405380504883, "grad_norm": 2.3015736379567566, "learning_rate": 4.454931253901461e-05, "loss": 0.7764, "step": 950 }, { "epoch": 1.4018794914317303, "grad_norm": 2.434991866613195, "learning_rate": 4.453696899307045e-05, "loss": 0.9819, "step": 951 }, { "epoch": 1.4033536023585775, "grad_norm": 2.8286498199268095, "learning_rate": 4.4524613200581284e-05, "loss": 0.8002, "step": 952 }, { "epoch": 1.4048277132854248, "grad_norm": 2.2202079173832194, "learning_rate": 4.4512245169292206e-05, "loss": 0.8279, "step": 953 }, { "epoch": 1.406301824212272, "grad_norm": 2.244857952322315, "learning_rate": 4.449986490695599e-05, "loss": 0.7671, "step": 954 }, { "epoch": 1.4077759351391192, "grad_norm": 2.3508797601791693, "learning_rate": 4.4487472421333074e-05, "loss": 0.8704, "step": 955 }, { "epoch": 1.4092500460659665, "grad_norm": 2.3308901143400504, "learning_rate": 4.447506772019155e-05, "loss": 0.801, "step": 956 }, { "epoch": 1.4107241569928137, "grad_norm": 2.4810315552457207, "learning_rate": 4.44626508113072e-05, "loss": 0.9633, "step": 957 }, { "epoch": 1.412198267919661, "grad_norm": 2.6125740040787995, "learning_rate": 4.445022170246341e-05, "loss": 0.8837, "step": 958 }, { "epoch": 1.4136723788465082, "grad_norm": 2.538496030559821, "learning_rate": 4.443778040145124e-05, "loss": 0.7288, "step": 959 }, { "epoch": 1.4151464897733554, "grad_norm": 2.2954443803454, "learning_rate": 4.44253269160694e-05, "loss": 0.7505, "step": 960 }, { "epoch": 1.4166206007002027, "grad_norm": 2.0659143334692374, "learning_rate": 4.441286125412422e-05, "loss": 0.7311, "step": 961 }, { "epoch": 1.41809471162705, "grad_norm": 2.3357003898590554, "learning_rate": 4.440038342342967e-05, "loss": 0.8306, "step": 962 }, { "epoch": 1.4195688225538972, "grad_norm": 2.2427456789027187, "learning_rate": 4.4387893431807344e-05, "loss": 0.8796, "step": 963 }, { "epoch": 1.4210429334807444, "grad_norm": 2.4995645143775835, "learning_rate": 4.437539128708647e-05, "loss": 0.8316, "step": 964 }, { "epoch": 1.4225170444075916, "grad_norm": 2.470047428741386, "learning_rate": 4.4362876997103885e-05, "loss": 0.8093, "step": 965 }, { "epoch": 1.4239911553344389, "grad_norm": 2.249532015133231, "learning_rate": 4.4350350569704045e-05, "loss": 0.8599, "step": 966 }, { "epoch": 1.4254652662612861, "grad_norm": 2.2356510386711577, "learning_rate": 4.4337812012738996e-05, "loss": 0.7694, "step": 967 }, { "epoch": 1.4269393771881334, "grad_norm": 2.5671837962815167, "learning_rate": 4.4325261334068426e-05, "loss": 0.8278, "step": 968 }, { "epoch": 1.4284134881149806, "grad_norm": 2.087459280564859, "learning_rate": 4.431269854155957e-05, "loss": 0.7308, "step": 969 }, { "epoch": 1.4298875990418278, "grad_norm": 2.19418751410309, "learning_rate": 4.4300123643087304e-05, "loss": 0.8475, "step": 970 }, { "epoch": 1.431361709968675, "grad_norm": 2.787299103618014, "learning_rate": 4.428753664653406e-05, "loss": 0.9414, "step": 971 }, { "epoch": 1.4328358208955223, "grad_norm": 2.2559351524407396, "learning_rate": 4.427493755978987e-05, "loss": 0.8605, "step": 972 }, { "epoch": 1.4343099318223698, "grad_norm": 2.9750213237607315, "learning_rate": 4.426232639075234e-05, "loss": 0.873, "step": 973 }, { "epoch": 1.435784042749217, "grad_norm": 2.6621228985292507, "learning_rate": 4.424970314732664e-05, "loss": 0.8129, "step": 974 }, { "epoch": 1.4372581536760642, "grad_norm": 2.2916509685556674, "learning_rate": 4.423706783742554e-05, "loss": 0.8673, "step": 975 }, { "epoch": 1.4387322646029115, "grad_norm": 2.268245260456742, "learning_rate": 4.422442046896933e-05, "loss": 0.8359, "step": 976 }, { "epoch": 1.4402063755297587, "grad_norm": 2.1538608971028292, "learning_rate": 4.421176104988589e-05, "loss": 0.7812, "step": 977 }, { "epoch": 1.441680486456606, "grad_norm": 2.394422320179857, "learning_rate": 4.419908958811064e-05, "loss": 0.8077, "step": 978 }, { "epoch": 1.4431545973834532, "grad_norm": 2.0540987629013086, "learning_rate": 4.418640609158656e-05, "loss": 0.9148, "step": 979 }, { "epoch": 1.4446287083103004, "grad_norm": 2.250160635406245, "learning_rate": 4.417371056826417e-05, "loss": 0.8863, "step": 980 }, { "epoch": 1.4461028192371477, "grad_norm": 2.177610768458181, "learning_rate": 4.4161003026101525e-05, "loss": 0.7901, "step": 981 }, { "epoch": 1.447576930163995, "grad_norm": 2.472662421003358, "learning_rate": 4.41482834730642e-05, "loss": 0.8543, "step": 982 }, { "epoch": 1.4490510410908422, "grad_norm": 2.173025800598308, "learning_rate": 4.4135551917125334e-05, "loss": 0.7738, "step": 983 }, { "epoch": 1.4505251520176894, "grad_norm": 2.167371072949643, "learning_rate": 4.4122808366265556e-05, "loss": 0.7463, "step": 984 }, { "epoch": 1.4519992629445366, "grad_norm": 2.2546019645639723, "learning_rate": 4.411005282847304e-05, "loss": 0.9093, "step": 985 }, { "epoch": 1.4534733738713839, "grad_norm": 2.3920721523349413, "learning_rate": 4.409728531174345e-05, "loss": 0.7281, "step": 986 }, { "epoch": 1.454947484798231, "grad_norm": 2.2569988328950137, "learning_rate": 4.4084505824079975e-05, "loss": 0.8429, "step": 987 }, { "epoch": 1.4564215957250783, "grad_norm": 2.297923465095025, "learning_rate": 4.40717143734933e-05, "loss": 0.8092, "step": 988 }, { "epoch": 1.4578957066519256, "grad_norm": 2.6960292581844003, "learning_rate": 4.405891096800162e-05, "loss": 0.8862, "step": 989 }, { "epoch": 1.4593698175787728, "grad_norm": 2.5340177686410033, "learning_rate": 4.404609561563062e-05, "loss": 0.7592, "step": 990 }, { "epoch": 1.46084392850562, "grad_norm": 2.404666131651526, "learning_rate": 4.403326832441345e-05, "loss": 0.8122, "step": 991 }, { "epoch": 1.4623180394324673, "grad_norm": 2.6439680798633756, "learning_rate": 4.402042910239078e-05, "loss": 0.7565, "step": 992 }, { "epoch": 1.4637921503593145, "grad_norm": 2.4933608590440905, "learning_rate": 4.400757795761074e-05, "loss": 0.9728, "step": 993 }, { "epoch": 1.4652662612861618, "grad_norm": 2.7400197212177786, "learning_rate": 4.399471489812893e-05, "loss": 0.813, "step": 994 }, { "epoch": 1.466740372213009, "grad_norm": 2.195651533887871, "learning_rate": 4.398183993200843e-05, "loss": 0.751, "step": 995 }, { "epoch": 1.4682144831398563, "grad_norm": 2.457753959099246, "learning_rate": 4.3968953067319777e-05, "loss": 0.8101, "step": 996 }, { "epoch": 1.4696885940667035, "grad_norm": 2.341331251112346, "learning_rate": 4.395605431214096e-05, "loss": 0.8153, "step": 997 }, { "epoch": 1.4711627049935507, "grad_norm": 2.5224062108428886, "learning_rate": 4.394314367455744e-05, "loss": 0.7541, "step": 998 }, { "epoch": 1.472636815920398, "grad_norm": 2.4913002276297878, "learning_rate": 4.393022116266212e-05, "loss": 0.8827, "step": 999 }, { "epoch": 1.4741109268472452, "grad_norm": 2.8267119022864415, "learning_rate": 4.3917286784555325e-05, "loss": 1.0432, "step": 1000 }, { "epoch": 1.4741109268472452, "eval_bleu": 0.08465239136028488, "eval_bleu_1gram": 0.41068053398600124, "eval_bleu_2gram": 0.17267559830542173, "eval_bleu_3gram": 0.08067446786981494, "eval_bleu_4gram": 0.040015689589235764, "eval_rag_val_loss": 1.092801840395056, "eval_rouge1": 0.4042905592541274, "eval_rouge2": 0.1642443570944988, "eval_rougeL": 0.3861227616631128, "step": 1000 }, { "epoch": 1.4755850377740924, "grad_norm": 2.2393605557613876, "learning_rate": 4.390434054834483e-05, "loss": 0.7663, "step": 1001 }, { "epoch": 1.4770591487009397, "grad_norm": 3.3655859021366017, "learning_rate": 4.389138246214588e-05, "loss": 0.8594, "step": 1002 }, { "epoch": 1.478533259627787, "grad_norm": 2.32937875053937, "learning_rate": 4.387841253408109e-05, "loss": 0.9097, "step": 1003 }, { "epoch": 1.4800073705546342, "grad_norm": 2.1608439725218442, "learning_rate": 4.386543077228053e-05, "loss": 0.7668, "step": 1004 }, { "epoch": 1.4814814814814814, "grad_norm": 2.35789867974208, "learning_rate": 4.3852437184881687e-05, "loss": 0.9858, "step": 1005 }, { "epoch": 1.4829555924083286, "grad_norm": 2.4081881755479317, "learning_rate": 4.383943178002944e-05, "loss": 0.8611, "step": 1006 }, { "epoch": 1.4844297033351759, "grad_norm": 2.511173645955046, "learning_rate": 4.382641456587611e-05, "loss": 0.8066, "step": 1007 }, { "epoch": 1.4859038142620231, "grad_norm": 1.9736846358315527, "learning_rate": 4.38133855505814e-05, "loss": 0.8293, "step": 1008 }, { "epoch": 1.4873779251888704, "grad_norm": 2.336277761524023, "learning_rate": 4.3800344742312396e-05, "loss": 0.8379, "step": 1009 }, { "epoch": 1.4888520361157176, "grad_norm": 1.8707725551845718, "learning_rate": 4.3787292149243605e-05, "loss": 0.7497, "step": 1010 }, { "epoch": 1.4903261470425648, "grad_norm": 2.07975280796746, "learning_rate": 4.3774227779556906e-05, "loss": 0.6969, "step": 1011 }, { "epoch": 1.491800257969412, "grad_norm": 2.484974051173103, "learning_rate": 4.376115164144157e-05, "loss": 0.9435, "step": 1012 }, { "epoch": 1.4932743688962593, "grad_norm": 2.052530527201482, "learning_rate": 4.374806374309421e-05, "loss": 0.7343, "step": 1013 }, { "epoch": 1.4947484798231068, "grad_norm": 2.0996686492359866, "learning_rate": 4.3734964092718885e-05, "loss": 0.8912, "step": 1014 }, { "epoch": 1.496222590749954, "grad_norm": 2.500282653088829, "learning_rate": 4.372185269852693e-05, "loss": 0.9469, "step": 1015 }, { "epoch": 1.4976967016768012, "grad_norm": 2.566147232339172, "learning_rate": 4.370872956873712e-05, "loss": 0.8438, "step": 1016 }, { "epoch": 1.4991708126036485, "grad_norm": 2.286740945252829, "learning_rate": 4.369559471157552e-05, "loss": 0.9009, "step": 1017 }, { "epoch": 1.5006449235304957, "grad_norm": 2.6931380636730977, "learning_rate": 4.36824481352756e-05, "loss": 1.0038, "step": 1018 }, { "epoch": 1.502119034457343, "grad_norm": 2.200400541429812, "learning_rate": 4.366928984807815e-05, "loss": 0.8597, "step": 1019 }, { "epoch": 1.5035931453841902, "grad_norm": 2.4205831712516166, "learning_rate": 4.36561198582313e-05, "loss": 0.8339, "step": 1020 }, { "epoch": 1.5050672563110374, "grad_norm": 2.1643313024054947, "learning_rate": 4.364293817399052e-05, "loss": 0.8952, "step": 1021 }, { "epoch": 1.5065413672378847, "grad_norm": 1.9254241055073362, "learning_rate": 4.362974480361862e-05, "loss": 0.7146, "step": 1022 }, { "epoch": 1.508015478164732, "grad_norm": 2.4090171855030067, "learning_rate": 4.361653975538572e-05, "loss": 0.914, "step": 1023 }, { "epoch": 1.5094895890915792, "grad_norm": 2.564965899205468, "learning_rate": 4.3603323037569265e-05, "loss": 0.8906, "step": 1024 }, { "epoch": 1.5109637000184264, "grad_norm": 2.097915491320354, "learning_rate": 4.359009465845402e-05, "loss": 0.6926, "step": 1025 }, { "epoch": 1.5124378109452736, "grad_norm": 2.2607188806506486, "learning_rate": 4.3576854626332055e-05, "loss": 0.8381, "step": 1026 }, { "epoch": 1.5139119218721209, "grad_norm": 2.757411852038998, "learning_rate": 4.356360294950275e-05, "loss": 0.9277, "step": 1027 }, { "epoch": 1.515386032798968, "grad_norm": 2.84733891648579, "learning_rate": 4.3550339636272775e-05, "loss": 0.9046, "step": 1028 }, { "epoch": 1.5168601437258153, "grad_norm": 2.1863146158727607, "learning_rate": 4.35370646949561e-05, "loss": 0.8048, "step": 1029 }, { "epoch": 1.5183342546526626, "grad_norm": 2.2501399738429577, "learning_rate": 4.352377813387398e-05, "loss": 0.8545, "step": 1030 }, { "epoch": 1.5198083655795098, "grad_norm": 2.3784779631335793, "learning_rate": 4.3510479961354964e-05, "loss": 0.7993, "step": 1031 }, { "epoch": 1.521282476506357, "grad_norm": 2.063738942108012, "learning_rate": 4.349717018573487e-05, "loss": 0.7562, "step": 1032 }, { "epoch": 1.5227565874332043, "grad_norm": 2.489684854350187, "learning_rate": 4.348384881535679e-05, "loss": 0.8269, "step": 1033 }, { "epoch": 1.5242306983600515, "grad_norm": 2.265253181201843, "learning_rate": 4.347051585857109e-05, "loss": 0.8508, "step": 1034 }, { "epoch": 1.525704809286899, "grad_norm": 2.4765018491809334, "learning_rate": 4.34571713237354e-05, "loss": 0.8819, "step": 1035 }, { "epoch": 1.5271789202137462, "grad_norm": 2.217446925099021, "learning_rate": 4.344381521921458e-05, "loss": 0.7809, "step": 1036 }, { "epoch": 1.5286530311405935, "grad_norm": 2.5967951903490807, "learning_rate": 4.3430447553380785e-05, "loss": 0.7755, "step": 1037 }, { "epoch": 1.5301271420674407, "grad_norm": 2.412414284403893, "learning_rate": 4.34170683346134e-05, "loss": 0.8705, "step": 1038 }, { "epoch": 1.531601252994288, "grad_norm": 2.239252914776044, "learning_rate": 4.3403677571299026e-05, "loss": 0.9809, "step": 1039 }, { "epoch": 1.5330753639211352, "grad_norm": 2.0510680370530983, "learning_rate": 4.339027527183154e-05, "loss": 0.7398, "step": 1040 }, { "epoch": 1.5345494748479824, "grad_norm": 2.16988847296524, "learning_rate": 4.337686144461204e-05, "loss": 0.8284, "step": 1041 }, { "epoch": 1.5360235857748297, "grad_norm": 2.0353722631582136, "learning_rate": 4.3363436098048825e-05, "loss": 0.8518, "step": 1042 }, { "epoch": 1.537497696701677, "grad_norm": 2.714431841222411, "learning_rate": 4.3349999240557446e-05, "loss": 0.8001, "step": 1043 }, { "epoch": 1.5389718076285241, "grad_norm": 2.60391916751948, "learning_rate": 4.333655088056065e-05, "loss": 0.8707, "step": 1044 }, { "epoch": 1.5404459185553714, "grad_norm": 2.4701277353099695, "learning_rate": 4.332309102648841e-05, "loss": 0.9418, "step": 1045 }, { "epoch": 1.5419200294822186, "grad_norm": 2.708914493905359, "learning_rate": 4.330961968677788e-05, "loss": 0.83, "step": 1046 }, { "epoch": 1.5433941404090659, "grad_norm": 2.555301604115331, "learning_rate": 4.329613686987344e-05, "loss": 0.9156, "step": 1047 }, { "epoch": 1.544868251335913, "grad_norm": 2.302389570439769, "learning_rate": 4.328264258422665e-05, "loss": 0.8853, "step": 1048 }, { "epoch": 1.5463423622627603, "grad_norm": 2.5182524991984496, "learning_rate": 4.3269136838296264e-05, "loss": 1.0045, "step": 1049 }, { "epoch": 1.5478164731896076, "grad_norm": 2.4339481787576007, "learning_rate": 4.325561964054822e-05, "loss": 1.0217, "step": 1050 }, { "epoch": 1.5492905841164548, "grad_norm": 2.4938223806790516, "learning_rate": 4.324209099945563e-05, "loss": 0.8326, "step": 1051 }, { "epoch": 1.550764695043302, "grad_norm": 2.236496138612214, "learning_rate": 4.322855092349878e-05, "loss": 0.9053, "step": 1052 }, { "epoch": 1.5522388059701493, "grad_norm": 2.2959875351570793, "learning_rate": 4.321499942116511e-05, "loss": 0.7389, "step": 1053 }, { "epoch": 1.5537129168969965, "grad_norm": 1.9294228932201354, "learning_rate": 4.320143650094927e-05, "loss": 0.841, "step": 1054 }, { "epoch": 1.5551870278238438, "grad_norm": 2.867880589338934, "learning_rate": 4.318786217135301e-05, "loss": 1.0595, "step": 1055 }, { "epoch": 1.556661138750691, "grad_norm": 2.1280020657292718, "learning_rate": 4.3174276440885276e-05, "loss": 0.7803, "step": 1056 }, { "epoch": 1.5581352496775382, "grad_norm": 2.1762870050096397, "learning_rate": 4.316067931806212e-05, "loss": 0.779, "step": 1057 }, { "epoch": 1.5596093606043855, "grad_norm": 2.0188617592692055, "learning_rate": 4.3147070811406765e-05, "loss": 0.8226, "step": 1058 }, { "epoch": 1.5610834715312327, "grad_norm": 2.3007011464395526, "learning_rate": 4.313345092944957e-05, "loss": 0.7602, "step": 1059 }, { "epoch": 1.56255758245808, "grad_norm": 2.0969512152429504, "learning_rate": 4.3119819680728e-05, "loss": 0.9392, "step": 1060 }, { "epoch": 1.5640316933849272, "grad_norm": 2.3994395992548787, "learning_rate": 4.310617707378668e-05, "loss": 0.7885, "step": 1061 }, { "epoch": 1.5655058043117744, "grad_norm": 2.0922869224941496, "learning_rate": 4.309252311717732e-05, "loss": 0.7758, "step": 1062 }, { "epoch": 1.5669799152386217, "grad_norm": 2.2830171403279054, "learning_rate": 4.307885781945876e-05, "loss": 0.8269, "step": 1063 }, { "epoch": 1.568454026165469, "grad_norm": 3.100648067783021, "learning_rate": 4.3065181189196956e-05, "loss": 0.8948, "step": 1064 }, { "epoch": 1.5699281370923162, "grad_norm": 2.529198085667267, "learning_rate": 4.305149323496497e-05, "loss": 0.8522, "step": 1065 }, { "epoch": 1.5714022480191634, "grad_norm": 2.276567120570283, "learning_rate": 4.303779396534293e-05, "loss": 0.7179, "step": 1066 }, { "epoch": 1.5728763589460106, "grad_norm": 2.2193749850463007, "learning_rate": 4.30240833889181e-05, "loss": 0.8843, "step": 1067 }, { "epoch": 1.5743504698728579, "grad_norm": 2.3209481444345226, "learning_rate": 4.30103615142848e-05, "loss": 0.9034, "step": 1068 }, { "epoch": 1.575824580799705, "grad_norm": 2.3276731321392634, "learning_rate": 4.2996628350044454e-05, "loss": 0.8512, "step": 1069 }, { "epoch": 1.5772986917265523, "grad_norm": 2.4283152934049914, "learning_rate": 4.298288390480554e-05, "loss": 0.806, "step": 1070 }, { "epoch": 1.5787728026533996, "grad_norm": 2.604611829220298, "learning_rate": 4.296912818718363e-05, "loss": 0.7494, "step": 1071 }, { "epoch": 1.5802469135802468, "grad_norm": 2.618782855353542, "learning_rate": 4.295536120580135e-05, "loss": 0.9868, "step": 1072 }, { "epoch": 1.581721024507094, "grad_norm": 2.6282834906761563, "learning_rate": 4.2941582969288384e-05, "loss": 0.8754, "step": 1073 }, { "epoch": 1.5831951354339413, "grad_norm": 2.573114971801993, "learning_rate": 4.292779348628148e-05, "loss": 0.8676, "step": 1074 }, { "epoch": 1.5846692463607885, "grad_norm": 2.2734381292283277, "learning_rate": 4.2913992765424434e-05, "loss": 0.8184, "step": 1075 }, { "epoch": 1.5861433572876358, "grad_norm": 2.0649555500779204, "learning_rate": 4.2900180815368076e-05, "loss": 0.7775, "step": 1076 }, { "epoch": 1.587617468214483, "grad_norm": 2.499768437151706, "learning_rate": 4.2886357644770294e-05, "loss": 0.8026, "step": 1077 }, { "epoch": 1.5890915791413303, "grad_norm": 2.7682630812588145, "learning_rate": 4.287252326229598e-05, "loss": 0.8096, "step": 1078 }, { "epoch": 1.5905656900681775, "grad_norm": 2.344941408608827, "learning_rate": 4.285867767661709e-05, "loss": 0.9124, "step": 1079 }, { "epoch": 1.5920398009950247, "grad_norm": 2.1010988993328783, "learning_rate": 4.284482089641257e-05, "loss": 0.7515, "step": 1080 }, { "epoch": 1.593513911921872, "grad_norm": 2.2674446953839373, "learning_rate": 4.283095293036842e-05, "loss": 0.8668, "step": 1081 }, { "epoch": 1.5949880228487192, "grad_norm": 2.15273905756868, "learning_rate": 4.281707378717761e-05, "loss": 0.8208, "step": 1082 }, { "epoch": 1.5964621337755664, "grad_norm": 2.4070411471442297, "learning_rate": 4.280318347554013e-05, "loss": 0.8804, "step": 1083 }, { "epoch": 1.597936244702414, "grad_norm": 2.2562165741280507, "learning_rate": 4.2789282004163e-05, "loss": 0.9519, "step": 1084 }, { "epoch": 1.5994103556292611, "grad_norm": 2.798689269769537, "learning_rate": 4.27753693817602e-05, "loss": 0.827, "step": 1085 }, { "epoch": 1.6008844665561084, "grad_norm": 2.4575375276418603, "learning_rate": 4.276144561705271e-05, "loss": 0.7425, "step": 1086 }, { "epoch": 1.6023585774829556, "grad_norm": 2.1386661895258334, "learning_rate": 4.27475107187685e-05, "loss": 0.7458, "step": 1087 }, { "epoch": 1.6038326884098029, "grad_norm": 2.3753848516955207, "learning_rate": 4.273356469564251e-05, "loss": 0.8408, "step": 1088 }, { "epoch": 1.60530679933665, "grad_norm": 2.155743525634585, "learning_rate": 4.271960755641668e-05, "loss": 0.7801, "step": 1089 }, { "epoch": 1.6067809102634973, "grad_norm": 2.4322750137270917, "learning_rate": 4.270563930983986e-05, "loss": 0.7674, "step": 1090 }, { "epoch": 1.6082550211903446, "grad_norm": 2.2923824464034386, "learning_rate": 4.269165996466793e-05, "loss": 0.9165, "step": 1091 }, { "epoch": 1.6097291321171918, "grad_norm": 2.570410451211112, "learning_rate": 4.267766952966369e-05, "loss": 0.7623, "step": 1092 }, { "epoch": 1.611203243044039, "grad_norm": 2.6592517721133127, "learning_rate": 4.266366801359689e-05, "loss": 0.9384, "step": 1093 }, { "epoch": 1.6126773539708863, "grad_norm": 2.022271958404403, "learning_rate": 4.264965542524424e-05, "loss": 0.6907, "step": 1094 }, { "epoch": 1.6141514648977335, "grad_norm": 2.676618147236494, "learning_rate": 4.263563177338938e-05, "loss": 0.8169, "step": 1095 }, { "epoch": 1.6156255758245808, "grad_norm": 2.1700052679674746, "learning_rate": 4.262159706682291e-05, "loss": 0.7787, "step": 1096 }, { "epoch": 1.617099686751428, "grad_norm": 2.20900984664249, "learning_rate": 4.2607551314342297e-05, "loss": 0.819, "step": 1097 }, { "epoch": 1.6185737976782753, "grad_norm": 2.465547149937651, "learning_rate": 4.259349452475202e-05, "loss": 0.8196, "step": 1098 }, { "epoch": 1.6200479086051225, "grad_norm": 2.319099083300777, "learning_rate": 4.25794267068634e-05, "loss": 0.8053, "step": 1099 }, { "epoch": 1.6215220195319697, "grad_norm": 2.943930547081578, "learning_rate": 4.256534786949472e-05, "loss": 0.9498, "step": 1100 }, { "epoch": 1.6229961304588172, "grad_norm": 2.3355466585849856, "learning_rate": 4.255125802147114e-05, "loss": 0.8354, "step": 1101 }, { "epoch": 1.6244702413856644, "grad_norm": 2.380572657732054, "learning_rate": 4.253715717162474e-05, "loss": 0.7942, "step": 1102 }, { "epoch": 1.6259443523125117, "grad_norm": 2.3894488863605736, "learning_rate": 4.252304532879449e-05, "loss": 0.919, "step": 1103 }, { "epoch": 1.627418463239359, "grad_norm": 2.5779125704089747, "learning_rate": 4.2508922501826244e-05, "loss": 0.8795, "step": 1104 }, { "epoch": 1.6288925741662061, "grad_norm": 2.308216561165201, "learning_rate": 4.249478869957276e-05, "loss": 0.9082, "step": 1105 }, { "epoch": 1.6303666850930534, "grad_norm": 2.6690002502750008, "learning_rate": 4.248064393089366e-05, "loss": 0.9364, "step": 1106 }, { "epoch": 1.6318407960199006, "grad_norm": 2.9299557168888555, "learning_rate": 4.246648820465544e-05, "loss": 0.8824, "step": 1107 }, { "epoch": 1.6333149069467479, "grad_norm": 2.904129074780148, "learning_rate": 4.2452321529731475e-05, "loss": 0.8309, "step": 1108 }, { "epoch": 1.634789017873595, "grad_norm": 2.6378891897289427, "learning_rate": 4.2438143915002e-05, "loss": 0.8281, "step": 1109 }, { "epoch": 1.6362631288004423, "grad_norm": 2.3128462867243047, "learning_rate": 4.242395536935409e-05, "loss": 0.8917, "step": 1110 }, { "epoch": 1.6377372397272896, "grad_norm": 2.5832666162879194, "learning_rate": 4.2409755901681716e-05, "loss": 0.858, "step": 1111 }, { "epoch": 1.6392113506541368, "grad_norm": 2.4782204839295017, "learning_rate": 4.239554552088563e-05, "loss": 0.9661, "step": 1112 }, { "epoch": 1.640685461580984, "grad_norm": 2.296964682560376, "learning_rate": 4.238132423587349e-05, "loss": 0.8241, "step": 1113 }, { "epoch": 1.6421595725078313, "grad_norm": 2.514517972950582, "learning_rate": 4.236709205555973e-05, "loss": 0.8948, "step": 1114 }, { "epoch": 1.6436336834346785, "grad_norm": 2.4318270079485784, "learning_rate": 4.235284898886568e-05, "loss": 0.9444, "step": 1115 }, { "epoch": 1.6451077943615258, "grad_norm": 2.4089669086088463, "learning_rate": 4.233859504471943e-05, "loss": 0.8169, "step": 1116 }, { "epoch": 1.646581905288373, "grad_norm": 2.1758978382699614, "learning_rate": 4.2324330232055924e-05, "loss": 0.8022, "step": 1117 }, { "epoch": 1.6480560162152202, "grad_norm": 2.21769130324572, "learning_rate": 4.231005455981692e-05, "loss": 0.7939, "step": 1118 }, { "epoch": 1.6495301271420675, "grad_norm": 2.2451995927406267, "learning_rate": 4.2295768036950953e-05, "loss": 0.8379, "step": 1119 }, { "epoch": 1.6510042380689147, "grad_norm": 2.2701130579215323, "learning_rate": 4.22814706724134e-05, "loss": 0.8388, "step": 1120 }, { "epoch": 1.652478348995762, "grad_norm": 2.5055046514324433, "learning_rate": 4.226716247516641e-05, "loss": 0.9294, "step": 1121 }, { "epoch": 1.6539524599226092, "grad_norm": 2.2543984971436037, "learning_rate": 4.2252843454178925e-05, "loss": 0.9486, "step": 1122 }, { "epoch": 1.6554265708494564, "grad_norm": 2.331832187150169, "learning_rate": 4.223851361842668e-05, "loss": 0.7699, "step": 1123 }, { "epoch": 1.6569006817763037, "grad_norm": 2.434549453223666, "learning_rate": 4.222417297689217e-05, "loss": 0.9163, "step": 1124 }, { "epoch": 1.658374792703151, "grad_norm": 2.612344662369375, "learning_rate": 4.2209821538564684e-05, "loss": 0.6954, "step": 1125 }, { "epoch": 1.6598489036299982, "grad_norm": 2.42631478208897, "learning_rate": 4.219545931244027e-05, "loss": 0.9488, "step": 1126 }, { "epoch": 1.6613230145568454, "grad_norm": 2.475240360495142, "learning_rate": 4.218108630752174e-05, "loss": 1.0345, "step": 1127 }, { "epoch": 1.6627971254836926, "grad_norm": 2.291409067645781, "learning_rate": 4.2166702532818665e-05, "loss": 0.8037, "step": 1128 }, { "epoch": 1.6642712364105399, "grad_norm": 2.289706090864927, "learning_rate": 4.2152307997347365e-05, "loss": 0.7953, "step": 1129 }, { "epoch": 1.665745347337387, "grad_norm": 2.4758968963047696, "learning_rate": 4.213790271013089e-05, "loss": 0.8744, "step": 1130 }, { "epoch": 1.6672194582642343, "grad_norm": 2.366294667285352, "learning_rate": 4.212348668019906e-05, "loss": 0.891, "step": 1131 }, { "epoch": 1.6686935691910816, "grad_norm": 1.945386218776855, "learning_rate": 4.2109059916588414e-05, "loss": 0.621, "step": 1132 }, { "epoch": 1.6701676801179288, "grad_norm": 2.311608632447217, "learning_rate": 4.20946224283422e-05, "loss": 0.7607, "step": 1133 }, { "epoch": 1.671641791044776, "grad_norm": 2.4513930505915957, "learning_rate": 4.2080174224510426e-05, "loss": 0.8549, "step": 1134 }, { "epoch": 1.6731159019716233, "grad_norm": 2.1938973749547848, "learning_rate": 4.2065715314149775e-05, "loss": 0.7629, "step": 1135 }, { "epoch": 1.6745900128984705, "grad_norm": 2.434676858503911, "learning_rate": 4.2051245706323696e-05, "loss": 0.8175, "step": 1136 }, { "epoch": 1.6760641238253178, "grad_norm": 2.55374557442452, "learning_rate": 4.2036765410102285e-05, "loss": 0.8369, "step": 1137 }, { "epoch": 1.677538234752165, "grad_norm": 2.5509575261520787, "learning_rate": 4.202227443456238e-05, "loss": 0.8373, "step": 1138 }, { "epoch": 1.6790123456790123, "grad_norm": 2.29704024570334, "learning_rate": 4.200777278878749e-05, "loss": 0.8072, "step": 1139 }, { "epoch": 1.6804864566058595, "grad_norm": 2.0727487876298696, "learning_rate": 4.199326048186782e-05, "loss": 0.826, "step": 1140 }, { "epoch": 1.6819605675327067, "grad_norm": 2.286297374670409, "learning_rate": 4.197873752290027e-05, "loss": 0.8638, "step": 1141 }, { "epoch": 1.683434678459554, "grad_norm": 2.5212980005009964, "learning_rate": 4.1964203920988385e-05, "loss": 0.7384, "step": 1142 }, { "epoch": 1.6849087893864012, "grad_norm": 2.542700406168565, "learning_rate": 4.19496596852424e-05, "loss": 0.9088, "step": 1143 }, { "epoch": 1.6863829003132484, "grad_norm": 2.1673147503844215, "learning_rate": 4.1935104824779246e-05, "loss": 0.8434, "step": 1144 }, { "epoch": 1.6878570112400957, "grad_norm": 2.460245670984237, "learning_rate": 4.192053934872247e-05, "loss": 0.8097, "step": 1145 }, { "epoch": 1.689331122166943, "grad_norm": 2.341771625958546, "learning_rate": 4.1905963266202276e-05, "loss": 1.0546, "step": 1146 }, { "epoch": 1.6908052330937902, "grad_norm": 2.5000154494761886, "learning_rate": 4.189137658635555e-05, "loss": 0.9775, "step": 1147 }, { "epoch": 1.6922793440206374, "grad_norm": 3.1469645876666346, "learning_rate": 4.187677931832578e-05, "loss": 0.8802, "step": 1148 }, { "epoch": 1.6937534549474846, "grad_norm": 2.2473474337485233, "learning_rate": 4.1862171471263126e-05, "loss": 0.8966, "step": 1149 }, { "epoch": 1.695227565874332, "grad_norm": 2.5068101632695745, "learning_rate": 4.184755305432436e-05, "loss": 0.9859, "step": 1150 }, { "epoch": 1.6967016768011793, "grad_norm": 2.545988150809524, "learning_rate": 4.1832924076672876e-05, "loss": 0.8884, "step": 1151 }, { "epoch": 1.6981757877280266, "grad_norm": 2.5083187935070854, "learning_rate": 4.181828454747872e-05, "loss": 0.8061, "step": 1152 }, { "epoch": 1.6996498986548738, "grad_norm": 2.3524854502699246, "learning_rate": 4.180363447591849e-05, "loss": 1.0069, "step": 1153 }, { "epoch": 1.701124009581721, "grad_norm": 2.309907645819277, "learning_rate": 4.178897387117546e-05, "loss": 1.0078, "step": 1154 }, { "epoch": 1.7025981205085683, "grad_norm": 2.332644690209578, "learning_rate": 4.177430274243947e-05, "loss": 0.9441, "step": 1155 }, { "epoch": 1.7040722314354155, "grad_norm": 2.1173818576466807, "learning_rate": 4.175962109890696e-05, "loss": 0.8137, "step": 1156 }, { "epoch": 1.7055463423622628, "grad_norm": 2.338935994388463, "learning_rate": 4.1744928949780975e-05, "loss": 0.7975, "step": 1157 }, { "epoch": 1.70702045328911, "grad_norm": 2.413033374073407, "learning_rate": 4.173022630427113e-05, "loss": 0.8574, "step": 1158 }, { "epoch": 1.7084945642159572, "grad_norm": 2.4765585529283904, "learning_rate": 4.1715513171593614e-05, "loss": 0.7874, "step": 1159 }, { "epoch": 1.7099686751428045, "grad_norm": 2.191039790827687, "learning_rate": 4.170078956097121e-05, "loss": 0.8332, "step": 1160 }, { "epoch": 1.7114427860696517, "grad_norm": 2.590234787362356, "learning_rate": 4.168605548163326e-05, "loss": 0.8398, "step": 1161 }, { "epoch": 1.712916896996499, "grad_norm": 2.418251822176045, "learning_rate": 4.167131094281565e-05, "loss": 0.7469, "step": 1162 }, { "epoch": 1.7143910079233462, "grad_norm": 2.5133797239412012, "learning_rate": 4.165655595376088e-05, "loss": 0.9621, "step": 1163 }, { "epoch": 1.7158651188501934, "grad_norm": 2.3365403229988897, "learning_rate": 4.1641790523717935e-05, "loss": 0.763, "step": 1164 }, { "epoch": 1.7173392297770407, "grad_norm": 2.2389719165439548, "learning_rate": 4.162701466194237e-05, "loss": 0.9137, "step": 1165 }, { "epoch": 1.718813340703888, "grad_norm": 2.007148246874261, "learning_rate": 4.161222837769627e-05, "loss": 0.6595, "step": 1166 }, { "epoch": 1.7202874516307354, "grad_norm": 2.197765134194964, "learning_rate": 4.159743168024829e-05, "loss": 0.8206, "step": 1167 }, { "epoch": 1.7217615625575826, "grad_norm": 2.2424199319449696, "learning_rate": 4.158262457887356e-05, "loss": 0.8538, "step": 1168 }, { "epoch": 1.7232356734844299, "grad_norm": 2.2370231248106265, "learning_rate": 4.156780708285378e-05, "loss": 0.8327, "step": 1169 }, { "epoch": 1.724709784411277, "grad_norm": 2.0302566446891497, "learning_rate": 4.155297920147713e-05, "loss": 0.7529, "step": 1170 }, { "epoch": 1.7261838953381243, "grad_norm": 2.3367312304419507, "learning_rate": 4.153814094403831e-05, "loss": 0.8109, "step": 1171 }, { "epoch": 1.7276580062649716, "grad_norm": 2.267664655534349, "learning_rate": 4.1523292319838524e-05, "loss": 0.6821, "step": 1172 }, { "epoch": 1.7291321171918188, "grad_norm": 2.3797410524370193, "learning_rate": 4.150843333818549e-05, "loss": 0.9202, "step": 1173 }, { "epoch": 1.730606228118666, "grad_norm": 2.2768682961753983, "learning_rate": 4.149356400839339e-05, "loss": 0.7448, "step": 1174 }, { "epoch": 1.7320803390455133, "grad_norm": 2.6324780531907583, "learning_rate": 4.1478684339782926e-05, "loss": 0.8915, "step": 1175 }, { "epoch": 1.7335544499723605, "grad_norm": 2.3149990291057883, "learning_rate": 4.1463794341681244e-05, "loss": 0.8538, "step": 1176 }, { "epoch": 1.7350285608992078, "grad_norm": 2.44193314626849, "learning_rate": 4.1448894023422005e-05, "loss": 0.7683, "step": 1177 }, { "epoch": 1.736502671826055, "grad_norm": 2.9174678428727843, "learning_rate": 4.143398339434529e-05, "loss": 0.8883, "step": 1178 }, { "epoch": 1.7379767827529022, "grad_norm": 2.146415619100414, "learning_rate": 4.1419062463797695e-05, "loss": 0.7531, "step": 1179 }, { "epoch": 1.7394508936797495, "grad_norm": 2.314682214400523, "learning_rate": 4.140413124113225e-05, "loss": 0.8579, "step": 1180 }, { "epoch": 1.7409250046065967, "grad_norm": 2.1775043451592135, "learning_rate": 4.138918973570842e-05, "loss": 0.7602, "step": 1181 }, { "epoch": 1.742399115533444, "grad_norm": 2.3598333254648747, "learning_rate": 4.1374237956892133e-05, "loss": 0.7735, "step": 1182 }, { "epoch": 1.7438732264602912, "grad_norm": 2.6920748075432783, "learning_rate": 4.135927591405577e-05, "loss": 0.8564, "step": 1183 }, { "epoch": 1.7453473373871384, "grad_norm": 2.02203296828442, "learning_rate": 4.134430361657813e-05, "loss": 0.7045, "step": 1184 }, { "epoch": 1.7468214483139857, "grad_norm": 2.66211779362893, "learning_rate": 4.1329321073844415e-05, "loss": 0.8595, "step": 1185 }, { "epoch": 1.748295559240833, "grad_norm": 2.5003567441084957, "learning_rate": 4.131432829524631e-05, "loss": 0.7883, "step": 1186 }, { "epoch": 1.7497696701676801, "grad_norm": 2.7494657170859855, "learning_rate": 4.129932529018187e-05, "loss": 0.9403, "step": 1187 }, { "epoch": 1.7512437810945274, "grad_norm": 2.61320242313849, "learning_rate": 4.128431206805557e-05, "loss": 0.9175, "step": 1188 }, { "epoch": 1.7527178920213746, "grad_norm": 2.3441273703356935, "learning_rate": 4.126928863827827e-05, "loss": 0.8023, "step": 1189 }, { "epoch": 1.7541920029482219, "grad_norm": 2.370445200475179, "learning_rate": 4.1254255010267285e-05, "loss": 0.852, "step": 1190 }, { "epoch": 1.755666113875069, "grad_norm": 2.14619545211303, "learning_rate": 4.123921119344627e-05, "loss": 0.838, "step": 1191 }, { "epoch": 1.7571402248019163, "grad_norm": 2.4851988380138383, "learning_rate": 4.122415719724528e-05, "loss": 0.9861, "step": 1192 }, { "epoch": 1.7586143357287636, "grad_norm": 2.4458648740260833, "learning_rate": 4.120909303110078e-05, "loss": 0.8315, "step": 1193 }, { "epoch": 1.7600884466556108, "grad_norm": 2.3232979112366734, "learning_rate": 4.119401870445555e-05, "loss": 0.9282, "step": 1194 }, { "epoch": 1.761562557582458, "grad_norm": 2.559275761917248, "learning_rate": 4.1178934226758803e-05, "loss": 0.7737, "step": 1195 }, { "epoch": 1.7630366685093053, "grad_norm": 2.081756630942049, "learning_rate": 4.1163839607466084e-05, "loss": 0.8065, "step": 1196 }, { "epoch": 1.7645107794361525, "grad_norm": 2.5025659744148263, "learning_rate": 4.114873485603927e-05, "loss": 0.8473, "step": 1197 }, { "epoch": 1.7659848903629998, "grad_norm": 2.0037432449997987, "learning_rate": 4.113361998194665e-05, "loss": 0.7346, "step": 1198 }, { "epoch": 1.767459001289847, "grad_norm": 3.0960265751765146, "learning_rate": 4.111849499466281e-05, "loss": 1.0223, "step": 1199 }, { "epoch": 1.7689331122166942, "grad_norm": 2.3097331015898424, "learning_rate": 4.110335990366868e-05, "loss": 0.8678, "step": 1200 }, { "epoch": 1.7704072231435415, "grad_norm": 2.896910725427802, "learning_rate": 4.108821471845155e-05, "loss": 0.8781, "step": 1201 }, { "epoch": 1.7718813340703887, "grad_norm": 2.508469254960538, "learning_rate": 4.107305944850502e-05, "loss": 0.904, "step": 1202 }, { "epoch": 1.773355444997236, "grad_norm": 2.1213999153172436, "learning_rate": 4.105789410332901e-05, "loss": 0.7272, "step": 1203 }, { "epoch": 1.7748295559240832, "grad_norm": 2.0615901674198245, "learning_rate": 4.104271869242975e-05, "loss": 0.761, "step": 1204 }, { "epoch": 1.7763036668509304, "grad_norm": 2.239390046997792, "learning_rate": 4.10275332253198e-05, "loss": 0.7912, "step": 1205 }, { "epoch": 1.7777777777777777, "grad_norm": 2.231824259227946, "learning_rate": 4.1012337711518e-05, "loss": 0.8034, "step": 1206 }, { "epoch": 1.779251888704625, "grad_norm": 2.536049707437003, "learning_rate": 4.099713216054952e-05, "loss": 0.7827, "step": 1207 }, { "epoch": 1.7807259996314722, "grad_norm": 2.7240519948189856, "learning_rate": 4.098191658194578e-05, "loss": 0.9151, "step": 1208 }, { "epoch": 1.7822001105583194, "grad_norm": 2.969981048729027, "learning_rate": 4.096669098524451e-05, "loss": 0.9468, "step": 1209 }, { "epoch": 1.7836742214851666, "grad_norm": 2.8050842854557403, "learning_rate": 4.095145537998972e-05, "loss": 0.7517, "step": 1210 }, { "epoch": 1.7851483324120139, "grad_norm": 2.6798081440362056, "learning_rate": 4.0936209775731686e-05, "loss": 0.896, "step": 1211 }, { "epoch": 1.786622443338861, "grad_norm": 2.3640686236779374, "learning_rate": 4.0920954182026965e-05, "loss": 0.8499, "step": 1212 }, { "epoch": 1.7880965542657083, "grad_norm": 2.345277822341064, "learning_rate": 4.090568860843836e-05, "loss": 0.7925, "step": 1213 }, { "epoch": 1.7895706651925556, "grad_norm": 2.5350369982304914, "learning_rate": 4.089041306453494e-05, "loss": 0.8141, "step": 1214 }, { "epoch": 1.7910447761194028, "grad_norm": 2.3679056117023323, "learning_rate": 4.0875127559892015e-05, "loss": 0.816, "step": 1215 }, { "epoch": 1.79251888704625, "grad_norm": 2.33897360796511, "learning_rate": 4.085983210409114e-05, "loss": 0.738, "step": 1216 }, { "epoch": 1.7939929979730975, "grad_norm": 2.2505632331336543, "learning_rate": 4.084452670672012e-05, "loss": 0.8953, "step": 1217 }, { "epoch": 1.7954671088999448, "grad_norm": 2.2525387322655397, "learning_rate": 4.082921137737299e-05, "loss": 0.8698, "step": 1218 }, { "epoch": 1.796941219826792, "grad_norm": 2.2606448455352877, "learning_rate": 4.081388612564999e-05, "loss": 0.7913, "step": 1219 }, { "epoch": 1.7984153307536392, "grad_norm": 2.3018404687243272, "learning_rate": 4.07985509611576e-05, "loss": 0.8639, "step": 1220 }, { "epoch": 1.7998894416804865, "grad_norm": 2.311400899793861, "learning_rate": 4.078320589350851e-05, "loss": 0.821, "step": 1221 }, { "epoch": 1.8013635526073337, "grad_norm": 2.151897737292629, "learning_rate": 4.076785093232162e-05, "loss": 0.8024, "step": 1222 }, { "epoch": 1.802837663534181, "grad_norm": 2.190748146502982, "learning_rate": 4.0752486087222006e-05, "loss": 0.8455, "step": 1223 }, { "epoch": 1.8043117744610282, "grad_norm": 2.2798137585383165, "learning_rate": 4.073711136784099e-05, "loss": 0.7947, "step": 1224 }, { "epoch": 1.8057858853878754, "grad_norm": 2.6432979981929727, "learning_rate": 4.072172678381603e-05, "loss": 0.9126, "step": 1225 }, { "epoch": 1.8072599963147227, "grad_norm": 2.2286489247901433, "learning_rate": 4.07063323447908e-05, "loss": 0.8185, "step": 1226 }, { "epoch": 1.80873410724157, "grad_norm": 2.128751360657056, "learning_rate": 4.0690928060415144e-05, "loss": 0.8542, "step": 1227 }, { "epoch": 1.8102082181684171, "grad_norm": 2.116130607287941, "learning_rate": 4.067551394034508e-05, "loss": 0.8125, "step": 1228 }, { "epoch": 1.8116823290952644, "grad_norm": 2.2894774181337825, "learning_rate": 4.066008999424279e-05, "loss": 0.9707, "step": 1229 }, { "epoch": 1.8131564400221116, "grad_norm": 2.254424407476795, "learning_rate": 4.06446562317766e-05, "loss": 0.781, "step": 1230 }, { "epoch": 1.8146305509489589, "grad_norm": 2.2278470800987864, "learning_rate": 4.062921266262102e-05, "loss": 0.9449, "step": 1231 }, { "epoch": 1.816104661875806, "grad_norm": 2.2864147926535425, "learning_rate": 4.0613759296456675e-05, "loss": 0.9126, "step": 1232 }, { "epoch": 1.8175787728026536, "grad_norm": 2.5869127722132492, "learning_rate": 4.059829614297036e-05, "loss": 0.9003, "step": 1233 }, { "epoch": 1.8190528837295008, "grad_norm": 2.4062671413058796, "learning_rate": 4.058282321185498e-05, "loss": 0.781, "step": 1234 }, { "epoch": 1.820526994656348, "grad_norm": 2.243131432235734, "learning_rate": 4.0567340512809586e-05, "loss": 0.9684, "step": 1235 }, { "epoch": 1.8220011055831953, "grad_norm": 1.9856583057620516, "learning_rate": 4.0551848055539345e-05, "loss": 0.7376, "step": 1236 }, { "epoch": 1.8234752165100425, "grad_norm": 2.646612568245542, "learning_rate": 4.0536345849755545e-05, "loss": 0.9889, "step": 1237 }, { "epoch": 1.8249493274368898, "grad_norm": 2.3520720173845397, "learning_rate": 4.0520833905175576e-05, "loss": 0.9032, "step": 1238 }, { "epoch": 1.826423438363737, "grad_norm": 2.580217286947704, "learning_rate": 4.0505312231522944e-05, "loss": 0.7817, "step": 1239 }, { "epoch": 1.8278975492905842, "grad_norm": 2.60841452459004, "learning_rate": 4.048978083852724e-05, "loss": 0.8569, "step": 1240 }, { "epoch": 1.8293716602174315, "grad_norm": 2.2984535865039226, "learning_rate": 4.0474239735924166e-05, "loss": 0.7828, "step": 1241 }, { "epoch": 1.8308457711442787, "grad_norm": 2.1593360378432562, "learning_rate": 4.045868893345549e-05, "loss": 0.8557, "step": 1242 }, { "epoch": 1.832319882071126, "grad_norm": 2.3947515837467708, "learning_rate": 4.0443128440869084e-05, "loss": 1.0002, "step": 1243 }, { "epoch": 1.8337939929979732, "grad_norm": 2.1737029801340166, "learning_rate": 4.042755826791886e-05, "loss": 0.7401, "step": 1244 }, { "epoch": 1.8352681039248204, "grad_norm": 2.647979066495179, "learning_rate": 4.041197842436484e-05, "loss": 0.8275, "step": 1245 }, { "epoch": 1.8367422148516677, "grad_norm": 2.4761693022594695, "learning_rate": 4.0396388919973074e-05, "loss": 0.7417, "step": 1246 }, { "epoch": 1.838216325778515, "grad_norm": 2.322124454452024, "learning_rate": 4.038078976451567e-05, "loss": 0.8267, "step": 1247 }, { "epoch": 1.8396904367053621, "grad_norm": 2.4489911474065567, "learning_rate": 4.036518096777082e-05, "loss": 0.8601, "step": 1248 }, { "epoch": 1.8411645476322094, "grad_norm": 2.519114757090933, "learning_rate": 4.0349562539522725e-05, "loss": 0.7996, "step": 1249 }, { "epoch": 1.8426386585590566, "grad_norm": 2.6157927027529597, "learning_rate": 4.033393448956162e-05, "loss": 0.8367, "step": 1250 }, { "epoch": 1.8426386585590566, "eval_bleu": 0.0866447891472186, "eval_bleu_1gram": 0.4074652317203469, "eval_bleu_2gram": 0.17676197050404116, "eval_bleu_3gram": 0.08336166007636095, "eval_bleu_4gram": 0.0414309302336783, "eval_rag_val_loss": 1.0817895477176995, "eval_rouge1": 0.39905996367705476, "eval_rouge2": 0.16983091758140165, "eval_rougeL": 0.3845041470468088, "step": 1250 }, { "epoch": 1.8441127694859039, "grad_norm": 2.4255586236346116, "learning_rate": 4.03182968276838e-05, "loss": 0.8383, "step": 1251 }, { "epoch": 1.845586880412751, "grad_norm": 2.3322140415955457, "learning_rate": 4.030264956369157e-05, "loss": 0.6783, "step": 1252 }, { "epoch": 1.8470609913395983, "grad_norm": 2.0004667691095306, "learning_rate": 4.028699270739326e-05, "loss": 0.7331, "step": 1253 }, { "epoch": 1.8485351022664456, "grad_norm": 2.143070133159923, "learning_rate": 4.027132626860318e-05, "loss": 0.7656, "step": 1254 }, { "epoch": 1.8500092131932928, "grad_norm": 2.548297227545918, "learning_rate": 4.02556502571417e-05, "loss": 0.8982, "step": 1255 }, { "epoch": 1.85148332412014, "grad_norm": 2.25873860831837, "learning_rate": 4.023996468283515e-05, "loss": 0.739, "step": 1256 }, { "epoch": 1.8529574350469873, "grad_norm": 2.3811629792580478, "learning_rate": 4.022426955551588e-05, "loss": 0.8699, "step": 1257 }, { "epoch": 1.8544315459738345, "grad_norm": 2.1861935665291057, "learning_rate": 4.020856488502221e-05, "loss": 0.7938, "step": 1258 }, { "epoch": 1.8559056569006818, "grad_norm": 2.475148275538433, "learning_rate": 4.019285068119845e-05, "loss": 0.9446, "step": 1259 }, { "epoch": 1.857379767827529, "grad_norm": 2.2802605769511537, "learning_rate": 4.017712695389487e-05, "loss": 0.9349, "step": 1260 }, { "epoch": 1.8588538787543762, "grad_norm": 2.101890226428654, "learning_rate": 4.0161393712967756e-05, "loss": 0.7166, "step": 1261 }, { "epoch": 1.8603279896812235, "grad_norm": 1.9444619276380954, "learning_rate": 4.01456509682793e-05, "loss": 0.7436, "step": 1262 }, { "epoch": 1.8618021006080707, "grad_norm": 2.501142908155266, "learning_rate": 4.012989872969768e-05, "loss": 0.9994, "step": 1263 }, { "epoch": 1.863276211534918, "grad_norm": 2.3113114679228564, "learning_rate": 4.011413700709703e-05, "loss": 0.8235, "step": 1264 }, { "epoch": 1.8647503224617652, "grad_norm": 2.6253449349612814, "learning_rate": 4.009836581035742e-05, "loss": 0.831, "step": 1265 }, { "epoch": 1.8662244333886124, "grad_norm": 2.6129735015623377, "learning_rate": 4.008258514936486e-05, "loss": 0.9367, "step": 1266 }, { "epoch": 1.8676985443154597, "grad_norm": 2.2707725190118375, "learning_rate": 4.006679503401129e-05, "loss": 0.8039, "step": 1267 }, { "epoch": 1.869172655242307, "grad_norm": 2.361550244154052, "learning_rate": 4.0050995474194576e-05, "loss": 0.9105, "step": 1268 }, { "epoch": 1.8706467661691542, "grad_norm": 2.180037603666427, "learning_rate": 4.003518647981852e-05, "loss": 0.8265, "step": 1269 }, { "epoch": 1.8721208770960014, "grad_norm": 2.209026575739413, "learning_rate": 4.0019368060792806e-05, "loss": 0.7316, "step": 1270 }, { "epoch": 1.8735949880228486, "grad_norm": 2.2523326438182756, "learning_rate": 4.000354022703306e-05, "loss": 0.7854, "step": 1271 }, { "epoch": 1.8750690989496959, "grad_norm": 2.5707967470142847, "learning_rate": 3.998770298846079e-05, "loss": 0.817, "step": 1272 }, { "epoch": 1.876543209876543, "grad_norm": 2.4394189177070214, "learning_rate": 3.9971856355003396e-05, "loss": 0.9236, "step": 1273 }, { "epoch": 1.8780173208033903, "grad_norm": 2.34463819840132, "learning_rate": 3.9956000336594185e-05, "loss": 0.8483, "step": 1274 }, { "epoch": 1.8794914317302376, "grad_norm": 2.178049107547044, "learning_rate": 3.994013494317233e-05, "loss": 0.8454, "step": 1275 }, { "epoch": 1.8809655426570848, "grad_norm": 2.541705542928914, "learning_rate": 3.9924260184682894e-05, "loss": 0.7941, "step": 1276 }, { "epoch": 1.882439653583932, "grad_norm": 2.63686034776044, "learning_rate": 3.9908376071076805e-05, "loss": 0.8575, "step": 1277 }, { "epoch": 1.8839137645107793, "grad_norm": 1.9590351254097287, "learning_rate": 3.9892482612310836e-05, "loss": 0.7581, "step": 1278 }, { "epoch": 1.8853878754376265, "grad_norm": 2.2457442832754264, "learning_rate": 3.9876579818347654e-05, "loss": 0.7347, "step": 1279 }, { "epoch": 1.8868619863644738, "grad_norm": 2.270698811690865, "learning_rate": 3.986066769915575e-05, "loss": 0.7957, "step": 1280 }, { "epoch": 1.888336097291321, "grad_norm": 2.177927927217139, "learning_rate": 3.984474626470948e-05, "loss": 0.8964, "step": 1281 }, { "epoch": 1.8898102082181683, "grad_norm": 1.9601843858131045, "learning_rate": 3.982881552498902e-05, "loss": 0.7727, "step": 1282 }, { "epoch": 1.8912843191450157, "grad_norm": 2.6806155896972217, "learning_rate": 3.981287548998039e-05, "loss": 0.9373, "step": 1283 }, { "epoch": 1.892758430071863, "grad_norm": 2.1807375330491556, "learning_rate": 3.979692616967543e-05, "loss": 0.7096, "step": 1284 }, { "epoch": 1.8942325409987102, "grad_norm": 2.2566407010694336, "learning_rate": 3.978096757407182e-05, "loss": 0.8466, "step": 1285 }, { "epoch": 1.8957066519255574, "grad_norm": 2.3823634990965155, "learning_rate": 3.976499971317302e-05, "loss": 0.7773, "step": 1286 }, { "epoch": 1.8971807628524047, "grad_norm": 2.458199761420361, "learning_rate": 3.974902259698833e-05, "loss": 1.0092, "step": 1287 }, { "epoch": 1.898654873779252, "grad_norm": 2.30162978271425, "learning_rate": 3.973303623553283e-05, "loss": 0.8268, "step": 1288 }, { "epoch": 1.9001289847060991, "grad_norm": 2.5050909658579066, "learning_rate": 3.9717040638827406e-05, "loss": 0.8123, "step": 1289 }, { "epoch": 1.9016030956329464, "grad_norm": 2.122612228751408, "learning_rate": 3.9701035816898734e-05, "loss": 0.7941, "step": 1290 }, { "epoch": 1.9030772065597936, "grad_norm": 1.9624657014413007, "learning_rate": 3.9685021779779264e-05, "loss": 0.7809, "step": 1291 }, { "epoch": 1.9045513174866409, "grad_norm": 2.0801219093836663, "learning_rate": 3.966899853750724e-05, "loss": 0.9862, "step": 1292 }, { "epoch": 1.906025428413488, "grad_norm": 2.6548843463485507, "learning_rate": 3.9652966100126655e-05, "loss": 0.8271, "step": 1293 }, { "epoch": 1.9074995393403353, "grad_norm": 2.507085391726311, "learning_rate": 3.9636924477687265e-05, "loss": 0.8517, "step": 1294 }, { "epoch": 1.9089736502671826, "grad_norm": 2.2571898133013724, "learning_rate": 3.9620873680244616e-05, "loss": 0.9, "step": 1295 }, { "epoch": 1.9104477611940298, "grad_norm": 2.1361054092858054, "learning_rate": 3.960481371785997e-05, "loss": 0.8455, "step": 1296 }, { "epoch": 1.911921872120877, "grad_norm": 2.2047869318567894, "learning_rate": 3.958874460060035e-05, "loss": 0.8002, "step": 1297 }, { "epoch": 1.9133959830477243, "grad_norm": 2.139508589714621, "learning_rate": 3.95726663385385e-05, "loss": 0.801, "step": 1298 }, { "epoch": 1.9148700939745718, "grad_norm": 2.465259547158712, "learning_rate": 3.955657894175293e-05, "loss": 0.9259, "step": 1299 }, { "epoch": 1.916344204901419, "grad_norm": 2.5465397526371887, "learning_rate": 3.9540482420327845e-05, "loss": 0.8865, "step": 1300 }, { "epoch": 1.9178183158282662, "grad_norm": 2.2071755691199377, "learning_rate": 3.952437678435319e-05, "loss": 0.774, "step": 1301 }, { "epoch": 1.9192924267551135, "grad_norm": 2.572136599839212, "learning_rate": 3.950826204392461e-05, "loss": 0.8731, "step": 1302 }, { "epoch": 1.9207665376819607, "grad_norm": 2.0013919992957327, "learning_rate": 3.949213820914347e-05, "loss": 0.7638, "step": 1303 }, { "epoch": 1.922240648608808, "grad_norm": 2.4999244678531194, "learning_rate": 3.9476005290116814e-05, "loss": 0.8378, "step": 1304 }, { "epoch": 1.9237147595356552, "grad_norm": 2.0155587592070856, "learning_rate": 3.94598632969574e-05, "loss": 0.7599, "step": 1305 }, { "epoch": 1.9251888704625024, "grad_norm": 2.161216434286506, "learning_rate": 3.944371223978366e-05, "loss": 0.771, "step": 1306 }, { "epoch": 1.9266629813893497, "grad_norm": 2.384656049130828, "learning_rate": 3.942755212871973e-05, "loss": 0.8817, "step": 1307 }, { "epoch": 1.928137092316197, "grad_norm": 2.5951831029779338, "learning_rate": 3.94113829738954e-05, "loss": 0.819, "step": 1308 }, { "epoch": 1.9296112032430441, "grad_norm": 2.266845584801592, "learning_rate": 3.939520478544614e-05, "loss": 0.8631, "step": 1309 }, { "epoch": 1.9310853141698914, "grad_norm": 2.3507056211740833, "learning_rate": 3.937901757351307e-05, "loss": 0.9727, "step": 1310 }, { "epoch": 1.9325594250967386, "grad_norm": 2.3005399982280825, "learning_rate": 3.936282134824297e-05, "loss": 0.7919, "step": 1311 }, { "epoch": 1.9340335360235859, "grad_norm": 2.0743171395567708, "learning_rate": 3.93466161197883e-05, "loss": 0.9792, "step": 1312 }, { "epoch": 1.935507646950433, "grad_norm": 1.8801311219935357, "learning_rate": 3.933040189830711e-05, "loss": 0.7572, "step": 1313 }, { "epoch": 1.9369817578772803, "grad_norm": 2.1593301859510112, "learning_rate": 3.931417869396313e-05, "loss": 0.9333, "step": 1314 }, { "epoch": 1.9384558688041276, "grad_norm": 2.268175465371104, "learning_rate": 3.929794651692571e-05, "loss": 0.7224, "step": 1315 }, { "epoch": 1.9399299797309748, "grad_norm": 2.196683950785518, "learning_rate": 3.928170537736981e-05, "loss": 0.7558, "step": 1316 }, { "epoch": 1.941404090657822, "grad_norm": 2.2953196339142563, "learning_rate": 3.9265455285476025e-05, "loss": 0.9085, "step": 1317 }, { "epoch": 1.9428782015846693, "grad_norm": 2.731397828175925, "learning_rate": 3.9249196251430556e-05, "loss": 0.953, "step": 1318 }, { "epoch": 1.9443523125115165, "grad_norm": 2.4439793081578274, "learning_rate": 3.92329282854252e-05, "loss": 0.8752, "step": 1319 }, { "epoch": 1.9458264234383638, "grad_norm": 2.4186696159943444, "learning_rate": 3.9216651397657364e-05, "loss": 0.9532, "step": 1320 }, { "epoch": 1.947300534365211, "grad_norm": 2.3569440633992915, "learning_rate": 3.9200365598330056e-05, "loss": 0.9256, "step": 1321 }, { "epoch": 1.9487746452920582, "grad_norm": 2.2447342183018972, "learning_rate": 3.9184070897651854e-05, "loss": 0.854, "step": 1322 }, { "epoch": 1.9502487562189055, "grad_norm": 2.463882766777192, "learning_rate": 3.916776730583691e-05, "loss": 0.8531, "step": 1323 }, { "epoch": 1.9517228671457527, "grad_norm": 2.343699543727722, "learning_rate": 3.915145483310498e-05, "loss": 0.8824, "step": 1324 }, { "epoch": 1.9531969780726, "grad_norm": 2.248371488597586, "learning_rate": 3.9135133489681356e-05, "loss": 0.8101, "step": 1325 }, { "epoch": 1.9546710889994472, "grad_norm": 2.251610709209662, "learning_rate": 3.91188032857969e-05, "loss": 0.7876, "step": 1326 }, { "epoch": 1.9561451999262944, "grad_norm": 2.186565417508181, "learning_rate": 3.910246423168803e-05, "loss": 0.891, "step": 1327 }, { "epoch": 1.9576193108531417, "grad_norm": 2.7654374053608137, "learning_rate": 3.908611633759672e-05, "loss": 0.8799, "step": 1328 }, { "epoch": 1.959093421779989, "grad_norm": 2.373231229175229, "learning_rate": 3.906975961377046e-05, "loss": 0.9267, "step": 1329 }, { "epoch": 1.9605675327068361, "grad_norm": 2.6755454746146548, "learning_rate": 3.905339407046231e-05, "loss": 1.0072, "step": 1330 }, { "epoch": 1.9620416436336834, "grad_norm": 2.6293807940536, "learning_rate": 3.9037019717930826e-05, "loss": 0.8695, "step": 1331 }, { "epoch": 1.9635157545605306, "grad_norm": 2.0892015843110245, "learning_rate": 3.902063656644012e-05, "loss": 0.7408, "step": 1332 }, { "epoch": 1.9649898654873779, "grad_norm": 2.237045506184602, "learning_rate": 3.900424462625977e-05, "loss": 0.8893, "step": 1333 }, { "epoch": 1.966463976414225, "grad_norm": 2.333099887159908, "learning_rate": 3.898784390766491e-05, "loss": 0.8341, "step": 1334 }, { "epoch": 1.9679380873410723, "grad_norm": 2.4655658129802998, "learning_rate": 3.897143442093616e-05, "loss": 0.8601, "step": 1335 }, { "epoch": 1.9694121982679196, "grad_norm": 2.0801260356131417, "learning_rate": 3.895501617635964e-05, "loss": 0.752, "step": 1336 }, { "epoch": 1.9708863091947668, "grad_norm": 2.3160177744577015, "learning_rate": 3.893858918422693e-05, "loss": 0.8127, "step": 1337 }, { "epoch": 1.972360420121614, "grad_norm": 2.4868530774067006, "learning_rate": 3.892215345483515e-05, "loss": 0.7764, "step": 1338 }, { "epoch": 1.9738345310484613, "grad_norm": 2.583948851649009, "learning_rate": 3.890570899848685e-05, "loss": 0.9719, "step": 1339 }, { "epoch": 1.9753086419753085, "grad_norm": 2.2934642561794067, "learning_rate": 3.888925582549006e-05, "loss": 0.843, "step": 1340 }, { "epoch": 1.9767827529021558, "grad_norm": 2.285163657673624, "learning_rate": 3.887279394615829e-05, "loss": 0.8049, "step": 1341 }, { "epoch": 1.978256863829003, "grad_norm": 2.13735098960772, "learning_rate": 3.885632337081049e-05, "loss": 0.8558, "step": 1342 }, { "epoch": 1.9797309747558502, "grad_norm": 1.946995508863758, "learning_rate": 3.8839844109771086e-05, "loss": 0.7331, "step": 1343 }, { "epoch": 1.9812050856826975, "grad_norm": 2.442825271204133, "learning_rate": 3.8823356173369895e-05, "loss": 0.8929, "step": 1344 }, { "epoch": 1.9826791966095447, "grad_norm": 2.013779142466062, "learning_rate": 3.8806859571942244e-05, "loss": 0.849, "step": 1345 }, { "epoch": 1.984153307536392, "grad_norm": 2.392690925274638, "learning_rate": 3.8790354315828846e-05, "loss": 0.8224, "step": 1346 }, { "epoch": 1.9856274184632392, "grad_norm": 2.147027902280589, "learning_rate": 3.877384041537584e-05, "loss": 0.9389, "step": 1347 }, { "epoch": 1.9871015293900864, "grad_norm": 2.460037212090375, "learning_rate": 3.8757317880934786e-05, "loss": 0.9005, "step": 1348 }, { "epoch": 1.988575640316934, "grad_norm": 2.3402555736943604, "learning_rate": 3.8740786722862676e-05, "loss": 0.7802, "step": 1349 }, { "epoch": 1.9900497512437811, "grad_norm": 2.406664032428318, "learning_rate": 3.872424695152189e-05, "loss": 0.8624, "step": 1350 }, { "epoch": 1.9915238621706284, "grad_norm": 2.7285802856735244, "learning_rate": 3.870769857728022e-05, "loss": 0.8986, "step": 1351 }, { "epoch": 1.9929979730974756, "grad_norm": 2.532491120664615, "learning_rate": 3.869114161051082e-05, "loss": 0.8404, "step": 1352 }, { "epoch": 1.9944720840243229, "grad_norm": 2.3913114472234835, "learning_rate": 3.867457606159226e-05, "loss": 0.8092, "step": 1353 }, { "epoch": 1.99594619495117, "grad_norm": 2.6395174672119137, "learning_rate": 3.86580019409085e-05, "loss": 0.8251, "step": 1354 }, { "epoch": 1.9974203058780173, "grad_norm": 2.1697665072218753, "learning_rate": 3.8641419258848835e-05, "loss": 0.7892, "step": 1355 }, { "epoch": 1.9988944168048646, "grad_norm": 2.63612261779582, "learning_rate": 3.862482802580795e-05, "loss": 0.8748, "step": 1356 }, { "epoch": 2.000368527731712, "grad_norm": 2.341090308108381, "learning_rate": 3.860822825218588e-05, "loss": 0.7335, "step": 1357 }, { "epoch": 2.0018426386585593, "grad_norm": 1.8818496204743858, "learning_rate": 3.859161994838803e-05, "loss": 0.5982, "step": 1358 }, { "epoch": 2.0033167495854065, "grad_norm": 1.645501002628713, "learning_rate": 3.8575003124825135e-05, "loss": 0.4988, "step": 1359 }, { "epoch": 2.0047908605122537, "grad_norm": 2.189053773828577, "learning_rate": 3.855837779191329e-05, "loss": 0.513, "step": 1360 }, { "epoch": 2.006264971439101, "grad_norm": 1.7232334960386264, "learning_rate": 3.8541743960073893e-05, "loss": 0.4685, "step": 1361 }, { "epoch": 2.0077390823659482, "grad_norm": 1.8895910288840834, "learning_rate": 3.8525101639733706e-05, "loss": 0.5615, "step": 1362 }, { "epoch": 2.0092131932927955, "grad_norm": 1.835299506834009, "learning_rate": 3.850845084132478e-05, "loss": 0.6161, "step": 1363 }, { "epoch": 2.0106873042196427, "grad_norm": 1.86733669818156, "learning_rate": 3.84917915752845e-05, "loss": 0.5945, "step": 1364 }, { "epoch": 2.01216141514649, "grad_norm": 1.9557108126886837, "learning_rate": 3.847512385205556e-05, "loss": 0.4988, "step": 1365 }, { "epoch": 2.013635526073337, "grad_norm": 1.936182127637862, "learning_rate": 3.845844768208593e-05, "loss": 0.5214, "step": 1366 }, { "epoch": 2.0151096370001844, "grad_norm": 1.9459478727143575, "learning_rate": 3.8441763075828904e-05, "loss": 0.43, "step": 1367 }, { "epoch": 2.0165837479270317, "grad_norm": 1.9806621991242714, "learning_rate": 3.842507004374304e-05, "loss": 0.4856, "step": 1368 }, { "epoch": 2.018057858853879, "grad_norm": 2.0824047562341526, "learning_rate": 3.8408368596292224e-05, "loss": 0.5605, "step": 1369 }, { "epoch": 2.019531969780726, "grad_norm": 2.4634970297341607, "learning_rate": 3.839165874394555e-05, "loss": 0.5492, "step": 1370 }, { "epoch": 2.0210060807075734, "grad_norm": 2.181753403251077, "learning_rate": 3.8374940497177434e-05, "loss": 0.5119, "step": 1371 }, { "epoch": 2.0224801916344206, "grad_norm": 2.4016126062828573, "learning_rate": 3.835821386646753e-05, "loss": 0.5971, "step": 1372 }, { "epoch": 2.023954302561268, "grad_norm": 2.2079069577668515, "learning_rate": 3.834147886230074e-05, "loss": 0.5472, "step": 1373 }, { "epoch": 2.025428413488115, "grad_norm": 2.281774800173868, "learning_rate": 3.8324735495167246e-05, "loss": 0.6055, "step": 1374 }, { "epoch": 2.0269025244149623, "grad_norm": 2.4018120402120497, "learning_rate": 3.8307983775562435e-05, "loss": 0.6832, "step": 1375 }, { "epoch": 2.0283766353418096, "grad_norm": 2.1542703137528485, "learning_rate": 3.8291223713986955e-05, "loss": 0.4563, "step": 1376 }, { "epoch": 2.029850746268657, "grad_norm": 2.3393711128579655, "learning_rate": 3.827445532094669e-05, "loss": 0.5772, "step": 1377 }, { "epoch": 2.031324857195504, "grad_norm": 2.006379205934255, "learning_rate": 3.8257678606952705e-05, "loss": 0.4661, "step": 1378 }, { "epoch": 2.0327989681223513, "grad_norm": 2.157891906116965, "learning_rate": 3.824089358252133e-05, "loss": 0.5757, "step": 1379 }, { "epoch": 2.0342730790491985, "grad_norm": 2.4590905418838025, "learning_rate": 3.822410025817406e-05, "loss": 0.5311, "step": 1380 }, { "epoch": 2.0357471899760458, "grad_norm": 2.3496032866972243, "learning_rate": 3.820729864443764e-05, "loss": 0.4819, "step": 1381 }, { "epoch": 2.037221300902893, "grad_norm": 2.046997765506779, "learning_rate": 3.819048875184398e-05, "loss": 0.5009, "step": 1382 }, { "epoch": 2.0386954118297402, "grad_norm": 2.0910632986944053, "learning_rate": 3.8173670590930165e-05, "loss": 0.4596, "step": 1383 }, { "epoch": 2.0401695227565875, "grad_norm": 2.580671867301639, "learning_rate": 3.815684417223851e-05, "loss": 0.6052, "step": 1384 }, { "epoch": 2.0416436336834347, "grad_norm": 2.0414066797920998, "learning_rate": 3.814000950631647e-05, "loss": 0.5653, "step": 1385 }, { "epoch": 2.043117744610282, "grad_norm": 2.2955907228456875, "learning_rate": 3.812316660371666e-05, "loss": 0.5642, "step": 1386 }, { "epoch": 2.044591855537129, "grad_norm": 2.369253132311586, "learning_rate": 3.810631547499692e-05, "loss": 0.5133, "step": 1387 }, { "epoch": 2.0460659664639764, "grad_norm": 2.0155324988616, "learning_rate": 3.808945613072017e-05, "loss": 0.4975, "step": 1388 }, { "epoch": 2.0475400773908237, "grad_norm": 2.2831268950464136, "learning_rate": 3.807258858145453e-05, "loss": 0.5301, "step": 1389 }, { "epoch": 2.049014188317671, "grad_norm": 1.7843771805891886, "learning_rate": 3.8055712837773225e-05, "loss": 0.4953, "step": 1390 }, { "epoch": 2.050488299244518, "grad_norm": 1.9212912936913635, "learning_rate": 3.803882891025466e-05, "loss": 0.4519, "step": 1391 }, { "epoch": 2.0519624101713654, "grad_norm": 2.207293415533948, "learning_rate": 3.802193680948236e-05, "loss": 0.5104, "step": 1392 }, { "epoch": 2.0534365210982126, "grad_norm": 2.1161034543107062, "learning_rate": 3.800503654604493e-05, "loss": 0.492, "step": 1393 }, { "epoch": 2.05491063202506, "grad_norm": 2.3323809746468895, "learning_rate": 3.798812813053615e-05, "loss": 0.4275, "step": 1394 }, { "epoch": 2.056384742951907, "grad_norm": 2.043037136238906, "learning_rate": 3.7971211573554865e-05, "loss": 0.5917, "step": 1395 }, { "epoch": 2.0578588538787543, "grad_norm": 2.038084533720168, "learning_rate": 3.795428688570505e-05, "loss": 0.5229, "step": 1396 }, { "epoch": 2.0593329648056016, "grad_norm": 2.0377399935120395, "learning_rate": 3.793735407759577e-05, "loss": 0.4804, "step": 1397 }, { "epoch": 2.060807075732449, "grad_norm": 2.5097055865873306, "learning_rate": 3.792041315984118e-05, "loss": 0.5754, "step": 1398 }, { "epoch": 2.062281186659296, "grad_norm": 2.1659529928881476, "learning_rate": 3.7903464143060506e-05, "loss": 0.4912, "step": 1399 }, { "epoch": 2.0637552975861433, "grad_norm": 2.2606967336480803, "learning_rate": 3.788650703787808e-05, "loss": 0.5563, "step": 1400 }, { "epoch": 2.0652294085129905, "grad_norm": 2.1097466035524675, "learning_rate": 3.7869541854923275e-05, "loss": 0.4867, "step": 1401 }, { "epoch": 2.0667035194398378, "grad_norm": 2.1220737954052757, "learning_rate": 3.785256860483054e-05, "loss": 0.5008, "step": 1402 }, { "epoch": 2.068177630366685, "grad_norm": 2.3715072849394225, "learning_rate": 3.783558729823939e-05, "loss": 0.6234, "step": 1403 }, { "epoch": 2.0696517412935322, "grad_norm": 2.1305860139520068, "learning_rate": 3.781859794579436e-05, "loss": 0.5148, "step": 1404 }, { "epoch": 2.0711258522203795, "grad_norm": 1.9711761058244746, "learning_rate": 3.780160055814507e-05, "loss": 0.5439, "step": 1405 }, { "epoch": 2.0725999631472267, "grad_norm": 2.5833278984094687, "learning_rate": 3.778459514594613e-05, "loss": 0.4717, "step": 1406 }, { "epoch": 2.074074074074074, "grad_norm": 2.104031158716954, "learning_rate": 3.776758171985723e-05, "loss": 0.4525, "step": 1407 }, { "epoch": 2.075548185000921, "grad_norm": 2.109966392972617, "learning_rate": 3.775056029054304e-05, "loss": 0.4978, "step": 1408 }, { "epoch": 2.0770222959277684, "grad_norm": 2.7590293901128695, "learning_rate": 3.773353086867328e-05, "loss": 0.5459, "step": 1409 }, { "epoch": 2.0784964068546157, "grad_norm": 2.3328207906280594, "learning_rate": 3.7716493464922654e-05, "loss": 0.5567, "step": 1410 }, { "epoch": 2.079970517781463, "grad_norm": 2.1413306862081187, "learning_rate": 3.769944808997088e-05, "loss": 0.4665, "step": 1411 }, { "epoch": 2.08144462870831, "grad_norm": 2.4847957806555696, "learning_rate": 3.768239475450269e-05, "loss": 0.5465, "step": 1412 }, { "epoch": 2.0829187396351574, "grad_norm": 2.3934574675533336, "learning_rate": 3.7665333469207766e-05, "loss": 0.5143, "step": 1413 }, { "epoch": 2.0843928505620046, "grad_norm": 2.589173104980374, "learning_rate": 3.7648264244780804e-05, "loss": 0.4889, "step": 1414 }, { "epoch": 2.085866961488852, "grad_norm": 2.0268260022739133, "learning_rate": 3.7631187091921483e-05, "loss": 0.4997, "step": 1415 }, { "epoch": 2.087341072415699, "grad_norm": 2.537783726206257, "learning_rate": 3.761410202133443e-05, "loss": 0.5274, "step": 1416 }, { "epoch": 2.0888151833425463, "grad_norm": 1.9284485457392537, "learning_rate": 3.759700904372924e-05, "loss": 0.4449, "step": 1417 }, { "epoch": 2.0902892942693936, "grad_norm": 2.4258454294360696, "learning_rate": 3.757990816982046e-05, "loss": 0.5869, "step": 1418 }, { "epoch": 2.091763405196241, "grad_norm": 2.42343178212852, "learning_rate": 3.756279941032761e-05, "loss": 0.561, "step": 1419 }, { "epoch": 2.093237516123088, "grad_norm": 2.184466301970479, "learning_rate": 3.754568277597512e-05, "loss": 0.5628, "step": 1420 }, { "epoch": 2.0947116270499353, "grad_norm": 2.1174201415224845, "learning_rate": 3.7528558277492395e-05, "loss": 0.5541, "step": 1421 }, { "epoch": 2.096185737976783, "grad_norm": 2.2984025507797603, "learning_rate": 3.751142592561373e-05, "loss": 0.5708, "step": 1422 }, { "epoch": 2.09765984890363, "grad_norm": 2.1895489497317673, "learning_rate": 3.749428573107837e-05, "loss": 0.5397, "step": 1423 }, { "epoch": 2.0991339598304775, "grad_norm": 2.1811674829303067, "learning_rate": 3.747713770463046e-05, "loss": 0.5573, "step": 1424 }, { "epoch": 2.1006080707573247, "grad_norm": 2.1449396686027793, "learning_rate": 3.7459981857019064e-05, "loss": 0.558, "step": 1425 }, { "epoch": 2.102082181684172, "grad_norm": 2.180678713074946, "learning_rate": 3.7442818198998156e-05, "loss": 0.5616, "step": 1426 }, { "epoch": 2.103556292611019, "grad_norm": 2.121380472211347, "learning_rate": 3.7425646741326585e-05, "loss": 0.5092, "step": 1427 }, { "epoch": 2.1050304035378664, "grad_norm": 1.9584265206532758, "learning_rate": 3.74084674947681e-05, "loss": 0.5539, "step": 1428 }, { "epoch": 2.1065045144647137, "grad_norm": 2.0422528910948574, "learning_rate": 3.739128047009134e-05, "loss": 0.4525, "step": 1429 }, { "epoch": 2.107978625391561, "grad_norm": 2.077366740776311, "learning_rate": 3.7374085678069794e-05, "loss": 0.5339, "step": 1430 }, { "epoch": 2.109452736318408, "grad_norm": 1.8726792278002058, "learning_rate": 3.735688312948186e-05, "loss": 0.5466, "step": 1431 }, { "epoch": 2.1109268472452554, "grad_norm": 2.1561899591462472, "learning_rate": 3.733967283511077e-05, "loss": 0.5931, "step": 1432 }, { "epoch": 2.1124009581721026, "grad_norm": 2.2071740568427556, "learning_rate": 3.7322454805744605e-05, "loss": 0.5384, "step": 1433 }, { "epoch": 2.11387506909895, "grad_norm": 1.8543079479554712, "learning_rate": 3.730522905217632e-05, "loss": 0.4607, "step": 1434 }, { "epoch": 2.115349180025797, "grad_norm": 2.2325490680014037, "learning_rate": 3.728799558520369e-05, "loss": 0.6546, "step": 1435 }, { "epoch": 2.1168232909526443, "grad_norm": 2.8451601661153645, "learning_rate": 3.7270754415629346e-05, "loss": 0.5539, "step": 1436 }, { "epoch": 2.1182974018794916, "grad_norm": 2.3965153151973264, "learning_rate": 3.725350555426072e-05, "loss": 0.5143, "step": 1437 }, { "epoch": 2.119771512806339, "grad_norm": 2.2983550408960864, "learning_rate": 3.7236249011910085e-05, "loss": 0.5304, "step": 1438 }, { "epoch": 2.121245623733186, "grad_norm": 2.0254840884444154, "learning_rate": 3.7218984799394534e-05, "loss": 0.5311, "step": 1439 }, { "epoch": 2.1227197346600333, "grad_norm": 2.3088949817021867, "learning_rate": 3.7201712927535954e-05, "loss": 0.61, "step": 1440 }, { "epoch": 2.1241938455868805, "grad_norm": 2.313914819882239, "learning_rate": 3.7184433407161026e-05, "loss": 0.6603, "step": 1441 }, { "epoch": 2.1256679565137278, "grad_norm": 2.110534123437441, "learning_rate": 3.716714624910126e-05, "loss": 0.5402, "step": 1442 }, { "epoch": 2.127142067440575, "grad_norm": 1.9180153648787301, "learning_rate": 3.714985146419291e-05, "loss": 0.4866, "step": 1443 }, { "epoch": 2.1286161783674222, "grad_norm": 2.117345149515717, "learning_rate": 3.713254906327703e-05, "loss": 0.4833, "step": 1444 }, { "epoch": 2.1300902892942695, "grad_norm": 2.1217990777389706, "learning_rate": 3.711523905719946e-05, "loss": 0.6021, "step": 1445 }, { "epoch": 2.1315644002211167, "grad_norm": 1.9436110443785928, "learning_rate": 3.70979214568108e-05, "loss": 0.5012, "step": 1446 }, { "epoch": 2.133038511147964, "grad_norm": 2.2196465413631072, "learning_rate": 3.70805962729664e-05, "loss": 0.5747, "step": 1447 }, { "epoch": 2.134512622074811, "grad_norm": 2.1018938562033433, "learning_rate": 3.706326351652636e-05, "loss": 0.5345, "step": 1448 }, { "epoch": 2.1359867330016584, "grad_norm": 2.128587723447435, "learning_rate": 3.704592319835557e-05, "loss": 0.5081, "step": 1449 }, { "epoch": 2.1374608439285057, "grad_norm": 2.1859964925837323, "learning_rate": 3.702857532932359e-05, "loss": 0.6091, "step": 1450 }, { "epoch": 2.138934954855353, "grad_norm": 2.0448100615995606, "learning_rate": 3.7011219920304774e-05, "loss": 0.5487, "step": 1451 }, { "epoch": 2.1404090657822, "grad_norm": 2.3488925718813753, "learning_rate": 3.699385698217816e-05, "loss": 0.6826, "step": 1452 }, { "epoch": 2.1418831767090474, "grad_norm": 2.1918950207276966, "learning_rate": 3.6976486525827546e-05, "loss": 0.4891, "step": 1453 }, { "epoch": 2.1433572876358946, "grad_norm": 2.0749442633071613, "learning_rate": 3.695910856214141e-05, "loss": 0.5251, "step": 1454 }, { "epoch": 2.144831398562742, "grad_norm": 2.4151318598224796, "learning_rate": 3.694172310201295e-05, "loss": 0.6599, "step": 1455 }, { "epoch": 2.146305509489589, "grad_norm": 1.9204289648559296, "learning_rate": 3.692433015634005e-05, "loss": 0.4745, "step": 1456 }, { "epoch": 2.1477796204164363, "grad_norm": 1.9946027887098232, "learning_rate": 3.690692973602532e-05, "loss": 0.5379, "step": 1457 }, { "epoch": 2.1492537313432836, "grad_norm": 2.110913655042624, "learning_rate": 3.6889521851976005e-05, "loss": 0.5164, "step": 1458 }, { "epoch": 2.150727842270131, "grad_norm": 2.0631548679966754, "learning_rate": 3.6872106515104065e-05, "loss": 0.4185, "step": 1459 }, { "epoch": 2.152201953196978, "grad_norm": 2.124435237559094, "learning_rate": 3.6854683736326125e-05, "loss": 0.5315, "step": 1460 }, { "epoch": 2.1536760641238253, "grad_norm": 2.382490893063347, "learning_rate": 3.683725352656348e-05, "loss": 0.5644, "step": 1461 }, { "epoch": 2.1551501750506725, "grad_norm": 2.1046388933421354, "learning_rate": 3.681981589674206e-05, "loss": 0.4336, "step": 1462 }, { "epoch": 2.1566242859775198, "grad_norm": 3.0168399241170523, "learning_rate": 3.6802370857792464e-05, "loss": 0.4932, "step": 1463 }, { "epoch": 2.158098396904367, "grad_norm": 2.718791303649751, "learning_rate": 3.678491842064995e-05, "loss": 0.6427, "step": 1464 }, { "epoch": 2.1595725078312142, "grad_norm": 2.0677465119828837, "learning_rate": 3.6767458596254364e-05, "loss": 0.5141, "step": 1465 }, { "epoch": 2.1610466187580615, "grad_norm": 2.1572807170516977, "learning_rate": 3.674999139555024e-05, "loss": 0.4685, "step": 1466 }, { "epoch": 2.1625207296849087, "grad_norm": 2.587888598171376, "learning_rate": 3.67325168294867e-05, "loss": 0.5342, "step": 1467 }, { "epoch": 2.163994840611756, "grad_norm": 1.9705224913731811, "learning_rate": 3.67150349090175e-05, "loss": 0.509, "step": 1468 }, { "epoch": 2.165468951538603, "grad_norm": 2.233812701557234, "learning_rate": 3.669754564510099e-05, "loss": 0.5623, "step": 1469 }, { "epoch": 2.1669430624654504, "grad_norm": 2.4166621065644835, "learning_rate": 3.668004904870014e-05, "loss": 0.5233, "step": 1470 }, { "epoch": 2.1684171733922977, "grad_norm": 2.3610824570755904, "learning_rate": 3.666254513078251e-05, "loss": 0.4924, "step": 1471 }, { "epoch": 2.169891284319145, "grad_norm": 2.099423252106539, "learning_rate": 3.664503390232024e-05, "loss": 0.499, "step": 1472 }, { "epoch": 2.171365395245992, "grad_norm": 2.380543313080882, "learning_rate": 3.6627515374290065e-05, "loss": 0.6254, "step": 1473 }, { "epoch": 2.1728395061728394, "grad_norm": 1.8601828991784641, "learning_rate": 3.66099895576733e-05, "loss": 0.4151, "step": 1474 }, { "epoch": 2.1743136170996866, "grad_norm": 2.6961841887540805, "learning_rate": 3.6592456463455804e-05, "loss": 0.4848, "step": 1475 }, { "epoch": 2.175787728026534, "grad_norm": 2.413307047045674, "learning_rate": 3.657491610262802e-05, "loss": 0.6347, "step": 1476 }, { "epoch": 2.177261838953381, "grad_norm": 3.4118884254718314, "learning_rate": 3.655736848618495e-05, "loss": 0.6054, "step": 1477 }, { "epoch": 2.1787359498802283, "grad_norm": 2.3539963607176566, "learning_rate": 3.653981362512612e-05, "loss": 0.5248, "step": 1478 }, { "epoch": 2.1802100608070756, "grad_norm": 2.160511725703111, "learning_rate": 3.652225153045562e-05, "loss": 0.5607, "step": 1479 }, { "epoch": 2.181684171733923, "grad_norm": 2.252045019943115, "learning_rate": 3.650468221318206e-05, "loss": 0.4319, "step": 1480 }, { "epoch": 2.18315828266077, "grad_norm": 2.0688978436243572, "learning_rate": 3.648710568431859e-05, "loss": 0.5759, "step": 1481 }, { "epoch": 2.1846323935876173, "grad_norm": 1.8175094043850437, "learning_rate": 3.6469521954882865e-05, "loss": 0.455, "step": 1482 }, { "epoch": 2.1861065045144645, "grad_norm": 2.0752320768802215, "learning_rate": 3.645193103589707e-05, "loss": 0.5448, "step": 1483 }, { "epoch": 2.1875806154413118, "grad_norm": 2.0480891696099524, "learning_rate": 3.6434332938387875e-05, "loss": 0.5034, "step": 1484 }, { "epoch": 2.189054726368159, "grad_norm": 2.0272557822015767, "learning_rate": 3.6416727673386484e-05, "loss": 0.4791, "step": 1485 }, { "epoch": 2.1905288372950062, "grad_norm": 2.4209928813453083, "learning_rate": 3.639911525192857e-05, "loss": 0.7028, "step": 1486 }, { "epoch": 2.1920029482218535, "grad_norm": 2.3096935666768097, "learning_rate": 3.638149568505428e-05, "loss": 0.5871, "step": 1487 }, { "epoch": 2.1934770591487007, "grad_norm": 2.397557199569495, "learning_rate": 3.636386898380827e-05, "loss": 0.6218, "step": 1488 }, { "epoch": 2.194951170075548, "grad_norm": 2.3411870548974267, "learning_rate": 3.634623515923965e-05, "loss": 0.5684, "step": 1489 }, { "epoch": 2.196425281002395, "grad_norm": 2.300596582744494, "learning_rate": 3.632859422240199e-05, "loss": 0.5051, "step": 1490 }, { "epoch": 2.1978993919292424, "grad_norm": 2.107109492132461, "learning_rate": 3.631094618435334e-05, "loss": 0.5596, "step": 1491 }, { "epoch": 2.19937350285609, "grad_norm": 2.4525139892054715, "learning_rate": 3.629329105615617e-05, "loss": 0.5253, "step": 1492 }, { "epoch": 2.2008476137829374, "grad_norm": 2.2133807912970407, "learning_rate": 3.6275628848877445e-05, "loss": 0.4897, "step": 1493 }, { "epoch": 2.2023217247097846, "grad_norm": 2.0998464619048267, "learning_rate": 3.6257959573588505e-05, "loss": 0.5323, "step": 1494 }, { "epoch": 2.203795835636632, "grad_norm": 2.3220235250827237, "learning_rate": 3.624028324136517e-05, "loss": 0.6126, "step": 1495 }, { "epoch": 2.205269946563479, "grad_norm": 2.128504611984608, "learning_rate": 3.622259986328765e-05, "loss": 0.5231, "step": 1496 }, { "epoch": 2.2067440574903263, "grad_norm": 2.3912398601023863, "learning_rate": 3.620490945044059e-05, "loss": 0.5111, "step": 1497 }, { "epoch": 2.2082181684171736, "grad_norm": 2.804919730005203, "learning_rate": 3.618721201391304e-05, "loss": 0.643, "step": 1498 }, { "epoch": 2.209692279344021, "grad_norm": 2.6670377790981363, "learning_rate": 3.616950756479846e-05, "loss": 0.6354, "step": 1499 }, { "epoch": 2.211166390270868, "grad_norm": 2.495983807884172, "learning_rate": 3.615179611419469e-05, "loss": 0.5019, "step": 1500 }, { "epoch": 2.211166390270868, "eval_bleu": 0.0880829636205845, "eval_bleu_1gram": 0.4057079193225461, "eval_bleu_2gram": 0.1736825154776357, "eval_bleu_3gram": 0.08219142186608647, "eval_bleu_4gram": 0.04208497606656399, "eval_rag_val_loss": 1.1340778583800921, "eval_rouge1": 0.4051959610976157, "eval_rouge2": 0.16975153904742107, "eval_rougeL": 0.38898831165092607, "step": 1500 }, { "epoch": 2.2126405011977153, "grad_norm": 2.1669344247526223, "learning_rate": 3.613407767320398e-05, "loss": 0.4803, "step": 1501 }, { "epoch": 2.2141146121245625, "grad_norm": 2.4356075667808947, "learning_rate": 3.6116352252932936e-05, "loss": 0.5646, "step": 1502 }, { "epoch": 2.2155887230514097, "grad_norm": 2.147688745590713, "learning_rate": 3.609861986449256e-05, "loss": 0.4608, "step": 1503 }, { "epoch": 2.217062833978257, "grad_norm": 2.106404904971441, "learning_rate": 3.6080880518998216e-05, "loss": 0.5541, "step": 1504 }, { "epoch": 2.2185369449051042, "grad_norm": 2.235575426856866, "learning_rate": 3.606313422756962e-05, "loss": 0.6292, "step": 1505 }, { "epoch": 2.2200110558319515, "grad_norm": 2.201212059270235, "learning_rate": 3.604538100133086e-05, "loss": 0.5771, "step": 1506 }, { "epoch": 2.2214851667587987, "grad_norm": 2.2361102002170306, "learning_rate": 3.602762085141035e-05, "loss": 0.4897, "step": 1507 }, { "epoch": 2.222959277685646, "grad_norm": 2.532252925407504, "learning_rate": 3.600985378894086e-05, "loss": 0.5181, "step": 1508 }, { "epoch": 2.224433388612493, "grad_norm": 2.218899144277537, "learning_rate": 3.599207982505949e-05, "loss": 0.5404, "step": 1509 }, { "epoch": 2.2259074995393404, "grad_norm": 2.3734325456598944, "learning_rate": 3.597429897090765e-05, "loss": 0.5977, "step": 1510 }, { "epoch": 2.2273816104661877, "grad_norm": 2.092366003123696, "learning_rate": 3.5956511237631106e-05, "loss": 0.5422, "step": 1511 }, { "epoch": 2.228855721393035, "grad_norm": 3.8444927707444894, "learning_rate": 3.59387166363799e-05, "loss": 0.5886, "step": 1512 }, { "epoch": 2.230329832319882, "grad_norm": 2.1348666963793814, "learning_rate": 3.592091517830838e-05, "loss": 0.4851, "step": 1513 }, { "epoch": 2.2318039432467294, "grad_norm": 2.2916457666542223, "learning_rate": 3.5903106874575235e-05, "loss": 0.554, "step": 1514 }, { "epoch": 2.2332780541735766, "grad_norm": 2.1826163136183934, "learning_rate": 3.5885291736343375e-05, "loss": 0.5708, "step": 1515 }, { "epoch": 2.234752165100424, "grad_norm": 2.208128853693865, "learning_rate": 3.586746977478006e-05, "loss": 0.6019, "step": 1516 }, { "epoch": 2.236226276027271, "grad_norm": 2.231265912680423, "learning_rate": 3.58496410010568e-05, "loss": 0.5928, "step": 1517 }, { "epoch": 2.2377003869541183, "grad_norm": 2.17537401863948, "learning_rate": 3.583180542634937e-05, "loss": 0.5447, "step": 1518 }, { "epoch": 2.2391744978809656, "grad_norm": 2.0012187820468164, "learning_rate": 3.5813963061837815e-05, "loss": 0.5791, "step": 1519 }, { "epoch": 2.240648608807813, "grad_norm": 2.0769294225155654, "learning_rate": 3.5796113918706426e-05, "loss": 0.6301, "step": 1520 }, { "epoch": 2.24212271973466, "grad_norm": 2.1098392682076277, "learning_rate": 3.577825800814376e-05, "loss": 0.5198, "step": 1521 }, { "epoch": 2.2435968306615073, "grad_norm": 2.0696934188258194, "learning_rate": 3.576039534134262e-05, "loss": 0.5609, "step": 1522 }, { "epoch": 2.2450709415883545, "grad_norm": 2.0997918479937776, "learning_rate": 3.57425259295e-05, "loss": 0.4785, "step": 1523 }, { "epoch": 2.2465450525152018, "grad_norm": 2.618279163942316, "learning_rate": 3.5724649783817185e-05, "loss": 0.6204, "step": 1524 }, { "epoch": 2.248019163442049, "grad_norm": 2.378676229546801, "learning_rate": 3.5706766915499646e-05, "loss": 0.558, "step": 1525 }, { "epoch": 2.2494932743688962, "grad_norm": 2.209462135247741, "learning_rate": 3.568887733575706e-05, "loss": 0.6144, "step": 1526 }, { "epoch": 2.2509673852957435, "grad_norm": 2.181568496317724, "learning_rate": 3.567098105580333e-05, "loss": 0.4763, "step": 1527 }, { "epoch": 2.2524414962225907, "grad_norm": 2.7341291044570055, "learning_rate": 3.5653078086856546e-05, "loss": 0.5178, "step": 1528 }, { "epoch": 2.253915607149438, "grad_norm": 2.44491612936071, "learning_rate": 3.563516844013901e-05, "loss": 0.6611, "step": 1529 }, { "epoch": 2.255389718076285, "grad_norm": 2.335591982801305, "learning_rate": 3.561725212687718e-05, "loss": 0.5123, "step": 1530 }, { "epoch": 2.2568638290031324, "grad_norm": 2.5438954529662885, "learning_rate": 3.559932915830172e-05, "loss": 0.7924, "step": 1531 }, { "epoch": 2.2583379399299797, "grad_norm": 3.979800361917081, "learning_rate": 3.558139954564746e-05, "loss": 0.5209, "step": 1532 }, { "epoch": 2.259812050856827, "grad_norm": 1.8226238851212226, "learning_rate": 3.556346330015338e-05, "loss": 0.4187, "step": 1533 }, { "epoch": 2.261286161783674, "grad_norm": 2.238142132388591, "learning_rate": 3.554552043306264e-05, "loss": 0.5998, "step": 1534 }, { "epoch": 2.2627602727105214, "grad_norm": 2.5595929473376797, "learning_rate": 3.552757095562253e-05, "loss": 0.5838, "step": 1535 }, { "epoch": 2.2642343836373686, "grad_norm": 2.0033224642305885, "learning_rate": 3.55096148790845e-05, "loss": 0.5607, "step": 1536 }, { "epoch": 2.265708494564216, "grad_norm": 1.9506215645449085, "learning_rate": 3.5491652214704115e-05, "loss": 0.4861, "step": 1537 }, { "epoch": 2.267182605491063, "grad_norm": 2.2092197606476773, "learning_rate": 3.547368297374109e-05, "loss": 0.6269, "step": 1538 }, { "epoch": 2.2686567164179103, "grad_norm": 1.9654472743773983, "learning_rate": 3.545570716745927e-05, "loss": 0.5227, "step": 1539 }, { "epoch": 2.2701308273447576, "grad_norm": 2.441149986550588, "learning_rate": 3.543772480712658e-05, "loss": 0.6398, "step": 1540 }, { "epoch": 2.271604938271605, "grad_norm": 1.9619791979916736, "learning_rate": 3.5419735904015095e-05, "loss": 0.4954, "step": 1541 }, { "epoch": 2.273079049198452, "grad_norm": 2.122349488546121, "learning_rate": 3.540174046940096e-05, "loss": 0.5793, "step": 1542 }, { "epoch": 2.2745531601252993, "grad_norm": 2.071032014010856, "learning_rate": 3.538373851456442e-05, "loss": 0.5988, "step": 1543 }, { "epoch": 2.2760272710521465, "grad_norm": 2.326423586559757, "learning_rate": 3.536573005078981e-05, "loss": 0.5886, "step": 1544 }, { "epoch": 2.2775013819789938, "grad_norm": 2.3713290303623533, "learning_rate": 3.5347715089365576e-05, "loss": 0.5944, "step": 1545 }, { "epoch": 2.278975492905841, "grad_norm": 2.164632123214316, "learning_rate": 3.532969364158417e-05, "loss": 0.5457, "step": 1546 }, { "epoch": 2.2804496038326882, "grad_norm": 2.132251036803067, "learning_rate": 3.5311665718742184e-05, "loss": 0.5708, "step": 1547 }, { "epoch": 2.2819237147595355, "grad_norm": 2.5506020639290807, "learning_rate": 3.529363133214021e-05, "loss": 0.5256, "step": 1548 }, { "epoch": 2.2833978256863827, "grad_norm": 2.1693269341500843, "learning_rate": 3.527559049308291e-05, "loss": 0.5808, "step": 1549 }, { "epoch": 2.28487193661323, "grad_norm": 2.653487171125551, "learning_rate": 3.525754321287902e-05, "loss": 0.7098, "step": 1550 }, { "epoch": 2.286346047540077, "grad_norm": 2.430297774919829, "learning_rate": 3.523948950284127e-05, "loss": 0.5795, "step": 1551 }, { "epoch": 2.287820158466925, "grad_norm": 2.1367362682454285, "learning_rate": 3.522142937428645e-05, "loss": 0.4675, "step": 1552 }, { "epoch": 2.289294269393772, "grad_norm": 2.5742708933155485, "learning_rate": 3.5203362838535355e-05, "loss": 0.632, "step": 1553 }, { "epoch": 2.2907683803206194, "grad_norm": 1.9752717157156816, "learning_rate": 3.518528990691281e-05, "loss": 0.5689, "step": 1554 }, { "epoch": 2.2922424912474666, "grad_norm": 2.4463105038980504, "learning_rate": 3.516721059074764e-05, "loss": 0.5618, "step": 1555 }, { "epoch": 2.293716602174314, "grad_norm": 2.584082915142063, "learning_rate": 3.5149124901372677e-05, "loss": 0.6833, "step": 1556 }, { "epoch": 2.295190713101161, "grad_norm": 2.5317012243400585, "learning_rate": 3.513103285012475e-05, "loss": 0.674, "step": 1557 }, { "epoch": 2.2966648240280083, "grad_norm": 2.0663365602910644, "learning_rate": 3.511293444834466e-05, "loss": 0.5726, "step": 1558 }, { "epoch": 2.2981389349548556, "grad_norm": 2.0245975424118825, "learning_rate": 3.509482970737722e-05, "loss": 0.5899, "step": 1559 }, { "epoch": 2.299613045881703, "grad_norm": 2.346827305132605, "learning_rate": 3.5076718638571185e-05, "loss": 0.4565, "step": 1560 }, { "epoch": 2.30108715680855, "grad_norm": 2.3758451062457904, "learning_rate": 3.505860125327928e-05, "loss": 0.5855, "step": 1561 }, { "epoch": 2.3025612677353973, "grad_norm": 1.9787504728970953, "learning_rate": 3.504047756285822e-05, "loss": 0.4764, "step": 1562 }, { "epoch": 2.3040353786622445, "grad_norm": 1.9253414495267132, "learning_rate": 3.5022347578668644e-05, "loss": 0.4622, "step": 1563 }, { "epoch": 2.3055094895890917, "grad_norm": 2.1777163107084303, "learning_rate": 3.5004211312075143e-05, "loss": 0.6399, "step": 1564 }, { "epoch": 2.306983600515939, "grad_norm": 2.466210133171842, "learning_rate": 3.498606877444625e-05, "loss": 0.5659, "step": 1565 }, { "epoch": 2.308457711442786, "grad_norm": 3.0228379712332565, "learning_rate": 3.4967919977154406e-05, "loss": 0.5265, "step": 1566 }, { "epoch": 2.3099318223696335, "grad_norm": 2.085413263288294, "learning_rate": 3.4949764931576014e-05, "loss": 0.4714, "step": 1567 }, { "epoch": 2.3114059332964807, "grad_norm": 1.9608688038499325, "learning_rate": 3.4931603649091374e-05, "loss": 0.5134, "step": 1568 }, { "epoch": 2.312880044223328, "grad_norm": 2.3025103096829533, "learning_rate": 3.4913436141084676e-05, "loss": 0.6803, "step": 1569 }, { "epoch": 2.314354155150175, "grad_norm": 2.0299006748799138, "learning_rate": 3.489526241894406e-05, "loss": 0.571, "step": 1570 }, { "epoch": 2.3158282660770224, "grad_norm": 1.906832371538136, "learning_rate": 3.487708249406153e-05, "loss": 0.5025, "step": 1571 }, { "epoch": 2.3173023770038697, "grad_norm": 2.454001592018463, "learning_rate": 3.4858896377832966e-05, "loss": 0.6708, "step": 1572 }, { "epoch": 2.318776487930717, "grad_norm": 2.1427889835780167, "learning_rate": 3.4840704081658155e-05, "loss": 0.5231, "step": 1573 }, { "epoch": 2.320250598857564, "grad_norm": 2.531715161945428, "learning_rate": 3.482250561694075e-05, "loss": 0.5613, "step": 1574 }, { "epoch": 2.3217247097844114, "grad_norm": 2.375588896170206, "learning_rate": 3.4804300995088264e-05, "loss": 0.4769, "step": 1575 }, { "epoch": 2.3231988207112586, "grad_norm": 2.3944626466964483, "learning_rate": 3.478609022751207e-05, "loss": 0.5448, "step": 1576 }, { "epoch": 2.324672931638106, "grad_norm": 2.13152131621771, "learning_rate": 3.4767873325627406e-05, "loss": 0.439, "step": 1577 }, { "epoch": 2.326147042564953, "grad_norm": 2.471312147023792, "learning_rate": 3.4749650300853343e-05, "loss": 0.5826, "step": 1578 }, { "epoch": 2.3276211534918003, "grad_norm": 2.03774572657611, "learning_rate": 3.473142116461279e-05, "loss": 0.5169, "step": 1579 }, { "epoch": 2.3290952644186476, "grad_norm": 2.0903050564168466, "learning_rate": 3.47131859283325e-05, "loss": 0.4755, "step": 1580 }, { "epoch": 2.330569375345495, "grad_norm": 2.275549048937661, "learning_rate": 3.469494460344304e-05, "loss": 0.4169, "step": 1581 }, { "epoch": 2.332043486272342, "grad_norm": 2.5565358046455304, "learning_rate": 3.467669720137879e-05, "loss": 0.4558, "step": 1582 }, { "epoch": 2.3335175971991893, "grad_norm": 2.1304732129253425, "learning_rate": 3.465844373357794e-05, "loss": 0.5342, "step": 1583 }, { "epoch": 2.3349917081260365, "grad_norm": 2.4271670696783234, "learning_rate": 3.464018421148249e-05, "loss": 0.5411, "step": 1584 }, { "epoch": 2.3364658190528838, "grad_norm": 2.33380274365184, "learning_rate": 3.462191864653821e-05, "loss": 0.5699, "step": 1585 }, { "epoch": 2.337939929979731, "grad_norm": 2.578083014868834, "learning_rate": 3.460364705019472e-05, "loss": 0.5649, "step": 1586 }, { "epoch": 2.3394140409065782, "grad_norm": 2.420064828020123, "learning_rate": 3.458536943390536e-05, "loss": 0.4895, "step": 1587 }, { "epoch": 2.3408881518334255, "grad_norm": 1.9704561258313502, "learning_rate": 3.456708580912725e-05, "loss": 0.5387, "step": 1588 }, { "epoch": 2.3423622627602727, "grad_norm": 2.717723170208643, "learning_rate": 3.4548796187321295e-05, "loss": 0.5837, "step": 1589 }, { "epoch": 2.34383637368712, "grad_norm": 2.1402813712201354, "learning_rate": 3.453050057995217e-05, "loss": 0.4959, "step": 1590 }, { "epoch": 2.345310484613967, "grad_norm": 2.798516926365124, "learning_rate": 3.451219899848827e-05, "loss": 0.5494, "step": 1591 }, { "epoch": 2.3467845955408144, "grad_norm": 2.3717639359315767, "learning_rate": 3.449389145440175e-05, "loss": 0.5025, "step": 1592 }, { "epoch": 2.3482587064676617, "grad_norm": 2.1205390781003044, "learning_rate": 3.4475577959168505e-05, "loss": 0.5605, "step": 1593 }, { "epoch": 2.349732817394509, "grad_norm": 2.499063984168183, "learning_rate": 3.445725852426817e-05, "loss": 0.5301, "step": 1594 }, { "epoch": 2.351206928321356, "grad_norm": 1.9723537099762507, "learning_rate": 3.443893316118407e-05, "loss": 0.5615, "step": 1595 }, { "epoch": 2.3526810392482034, "grad_norm": 2.1756291498957205, "learning_rate": 3.4420601881403284e-05, "loss": 0.6517, "step": 1596 }, { "epoch": 2.3541551501750506, "grad_norm": 2.2232343461602913, "learning_rate": 3.440226469641658e-05, "loss": 0.4818, "step": 1597 }, { "epoch": 2.355629261101898, "grad_norm": 2.3884374046500354, "learning_rate": 3.4383921617718427e-05, "loss": 0.51, "step": 1598 }, { "epoch": 2.357103372028745, "grad_norm": 2.3062012954159115, "learning_rate": 3.4365572656807e-05, "loss": 0.5376, "step": 1599 }, { "epoch": 2.3585774829555923, "grad_norm": 2.1297291136585184, "learning_rate": 3.4347217825184134e-05, "loss": 0.4449, "step": 1600 }, { "epoch": 2.3600515938824396, "grad_norm": 2.3671236438762446, "learning_rate": 3.432885713435539e-05, "loss": 0.5064, "step": 1601 }, { "epoch": 2.361525704809287, "grad_norm": 2.3098892733635945, "learning_rate": 3.431049059582996e-05, "loss": 0.7278, "step": 1602 }, { "epoch": 2.362999815736134, "grad_norm": 2.351264503835817, "learning_rate": 3.4292118221120715e-05, "loss": 0.5353, "step": 1603 }, { "epoch": 2.3644739266629813, "grad_norm": 2.086497942325867, "learning_rate": 3.42737400217442e-05, "loss": 0.5054, "step": 1604 }, { "epoch": 2.3659480375898285, "grad_norm": 2.4085272379850147, "learning_rate": 3.425535600922059e-05, "loss": 0.5703, "step": 1605 }, { "epoch": 2.3674221485166758, "grad_norm": 2.4412257745793817, "learning_rate": 3.423696619507369e-05, "loss": 0.6947, "step": 1606 }, { "epoch": 2.368896259443523, "grad_norm": 2.4091984903122152, "learning_rate": 3.4218570590831e-05, "loss": 0.5247, "step": 1607 }, { "epoch": 2.3703703703703702, "grad_norm": 2.7744550530729843, "learning_rate": 3.4200169208023594e-05, "loss": 0.5361, "step": 1608 }, { "epoch": 2.3718444812972175, "grad_norm": 2.043892821008365, "learning_rate": 3.418176205818618e-05, "loss": 0.5267, "step": 1609 }, { "epoch": 2.3733185922240647, "grad_norm": 2.0542924740576076, "learning_rate": 3.4163349152857096e-05, "loss": 0.4901, "step": 1610 }, { "epoch": 2.374792703150912, "grad_norm": 2.660795742700856, "learning_rate": 3.4144930503578286e-05, "loss": 0.5907, "step": 1611 }, { "epoch": 2.376266814077759, "grad_norm": 2.06282572630647, "learning_rate": 3.412650612189528e-05, "loss": 0.5321, "step": 1612 }, { "epoch": 2.3777409250046064, "grad_norm": 2.47633587013875, "learning_rate": 3.4108076019357204e-05, "loss": 0.6292, "step": 1613 }, { "epoch": 2.3792150359314537, "grad_norm": 2.0750611169845556, "learning_rate": 3.4089640207516786e-05, "loss": 0.5566, "step": 1614 }, { "epoch": 2.380689146858301, "grad_norm": 2.201474267993319, "learning_rate": 3.4071198697930315e-05, "loss": 0.5736, "step": 1615 }, { "epoch": 2.382163257785148, "grad_norm": 2.0581224397839732, "learning_rate": 3.405275150215766e-05, "loss": 0.5235, "step": 1616 }, { "epoch": 2.3836373687119954, "grad_norm": 2.1825474943431304, "learning_rate": 3.403429863176226e-05, "loss": 0.5177, "step": 1617 }, { "epoch": 2.3851114796388426, "grad_norm": 2.0111531175379507, "learning_rate": 3.40158400983111e-05, "loss": 0.433, "step": 1618 }, { "epoch": 2.38658559056569, "grad_norm": 2.3277530244994646, "learning_rate": 3.399737591337471e-05, "loss": 0.5769, "step": 1619 }, { "epoch": 2.388059701492537, "grad_norm": 2.0830210896787356, "learning_rate": 3.397890608852718e-05, "loss": 0.5718, "step": 1620 }, { "epoch": 2.3895338124193843, "grad_norm": 2.3291102635123115, "learning_rate": 3.396043063534613e-05, "loss": 0.5188, "step": 1621 }, { "epoch": 2.3910079233462316, "grad_norm": 2.3271394217985244, "learning_rate": 3.39419495654127e-05, "loss": 0.5519, "step": 1622 }, { "epoch": 2.392482034273079, "grad_norm": 2.71673123179593, "learning_rate": 3.3923462890311544e-05, "loss": 0.6268, "step": 1623 }, { "epoch": 2.393956145199926, "grad_norm": 2.133287425838688, "learning_rate": 3.3904970621630866e-05, "loss": 0.5046, "step": 1624 }, { "epoch": 2.3954302561267733, "grad_norm": 2.2433063759206715, "learning_rate": 3.388647277096234e-05, "loss": 0.5234, "step": 1625 }, { "epoch": 2.396904367053621, "grad_norm": 2.4937165451673935, "learning_rate": 3.386796934990115e-05, "loss": 0.6253, "step": 1626 }, { "epoch": 2.398378477980468, "grad_norm": 1.974774664083847, "learning_rate": 3.3849460370045966e-05, "loss": 0.4572, "step": 1627 }, { "epoch": 2.3998525889073155, "grad_norm": 2.6317188017474336, "learning_rate": 3.3830945842998954e-05, "loss": 0.5168, "step": 1628 }, { "epoch": 2.4013266998341627, "grad_norm": 2.391688091789095, "learning_rate": 3.381242578036576e-05, "loss": 0.578, "step": 1629 }, { "epoch": 2.40280081076101, "grad_norm": 2.448196027249819, "learning_rate": 3.379390019375548e-05, "loss": 0.5182, "step": 1630 }, { "epoch": 2.404274921687857, "grad_norm": 2.5472951905164374, "learning_rate": 3.377536909478069e-05, "loss": 0.5108, "step": 1631 }, { "epoch": 2.4057490326147044, "grad_norm": 2.2129560226918024, "learning_rate": 3.3756832495057414e-05, "loss": 0.5549, "step": 1632 }, { "epoch": 2.4072231435415516, "grad_norm": 2.2954432378193403, "learning_rate": 3.373829040620513e-05, "loss": 0.5139, "step": 1633 }, { "epoch": 2.408697254468399, "grad_norm": 2.404078705322835, "learning_rate": 3.3719742839846743e-05, "loss": 0.4729, "step": 1634 }, { "epoch": 2.410171365395246, "grad_norm": 1.9763576718073126, "learning_rate": 3.370118980760861e-05, "loss": 0.4737, "step": 1635 }, { "epoch": 2.4116454763220934, "grad_norm": 2.4285108654901486, "learning_rate": 3.3682631321120504e-05, "loss": 0.5197, "step": 1636 }, { "epoch": 2.4131195872489406, "grad_norm": 2.60722518350472, "learning_rate": 3.366406739201562e-05, "loss": 0.6889, "step": 1637 }, { "epoch": 2.414593698175788, "grad_norm": 2.2878879489137285, "learning_rate": 3.364549803193057e-05, "loss": 0.6644, "step": 1638 }, { "epoch": 2.416067809102635, "grad_norm": 2.0208509254345284, "learning_rate": 3.362692325250534e-05, "loss": 0.5244, "step": 1639 }, { "epoch": 2.4175419200294823, "grad_norm": 2.263615315399695, "learning_rate": 3.360834306538336e-05, "loss": 0.5563, "step": 1640 }, { "epoch": 2.4190160309563296, "grad_norm": 2.7594884691420845, "learning_rate": 3.3589757482211416e-05, "loss": 0.6263, "step": 1641 }, { "epoch": 2.420490141883177, "grad_norm": 2.1048941031524224, "learning_rate": 3.3571166514639684e-05, "loss": 0.5343, "step": 1642 }, { "epoch": 2.421964252810024, "grad_norm": 2.5056075150185886, "learning_rate": 3.3552570174321724e-05, "loss": 0.52, "step": 1643 }, { "epoch": 2.4234383637368713, "grad_norm": 2.4932651402162835, "learning_rate": 3.353396847291446e-05, "loss": 0.5476, "step": 1644 }, { "epoch": 2.4249124746637185, "grad_norm": 2.4396808356116746, "learning_rate": 3.3515361422078165e-05, "loss": 0.6409, "step": 1645 }, { "epoch": 2.4263865855905657, "grad_norm": 2.1406829157881524, "learning_rate": 3.3496749033476485e-05, "loss": 0.5191, "step": 1646 }, { "epoch": 2.427860696517413, "grad_norm": 2.0671076314408348, "learning_rate": 3.347813131877638e-05, "loss": 0.4745, "step": 1647 }, { "epoch": 2.4293348074442602, "grad_norm": 2.3138852481067964, "learning_rate": 3.34595082896482e-05, "loss": 0.5776, "step": 1648 }, { "epoch": 2.4308089183711075, "grad_norm": 2.001155162043779, "learning_rate": 3.344087995776558e-05, "loss": 0.4828, "step": 1649 }, { "epoch": 2.4322830292979547, "grad_norm": 2.474818340532095, "learning_rate": 3.34222463348055e-05, "loss": 0.596, "step": 1650 }, { "epoch": 2.433757140224802, "grad_norm": 2.5193168134885444, "learning_rate": 3.340360743244825e-05, "loss": 0.5025, "step": 1651 }, { "epoch": 2.435231251151649, "grad_norm": 2.114987852995059, "learning_rate": 3.338496326237743e-05, "loss": 0.5267, "step": 1652 }, { "epoch": 2.4367053620784964, "grad_norm": 2.3804283346483013, "learning_rate": 3.336631383627995e-05, "loss": 0.698, "step": 1653 }, { "epoch": 2.4381794730053437, "grad_norm": 2.2744176265939227, "learning_rate": 3.334765916584599e-05, "loss": 0.561, "step": 1654 }, { "epoch": 2.439653583932191, "grad_norm": 2.1900627383298965, "learning_rate": 3.332899926276905e-05, "loss": 0.5286, "step": 1655 }, { "epoch": 2.441127694859038, "grad_norm": 2.3272064240179113, "learning_rate": 3.33103341387459e-05, "loss": 0.5332, "step": 1656 }, { "epoch": 2.4426018057858854, "grad_norm": 1.9900813918984859, "learning_rate": 3.3291663805476566e-05, "loss": 0.5401, "step": 1657 }, { "epoch": 2.4440759167127326, "grad_norm": 2.1468005793525733, "learning_rate": 3.3272988274664364e-05, "loss": 0.4796, "step": 1658 }, { "epoch": 2.44555002763958, "grad_norm": 2.342517681401146, "learning_rate": 3.325430755801584e-05, "loss": 0.5643, "step": 1659 }, { "epoch": 2.447024138566427, "grad_norm": 2.260054165182624, "learning_rate": 3.323562166724082e-05, "loss": 0.6021, "step": 1660 }, { "epoch": 2.4484982494932743, "grad_norm": 2.0695052966473804, "learning_rate": 3.321693061405235e-05, "loss": 0.5514, "step": 1661 }, { "epoch": 2.4499723604201216, "grad_norm": 2.022093927007322, "learning_rate": 3.319823441016673e-05, "loss": 0.5765, "step": 1662 }, { "epoch": 2.451446471346969, "grad_norm": 2.2387566985217, "learning_rate": 3.317953306730347e-05, "loss": 0.4954, "step": 1663 }, { "epoch": 2.452920582273816, "grad_norm": 2.1858993123381554, "learning_rate": 3.316082659718532e-05, "loss": 0.546, "step": 1664 }, { "epoch": 2.4543946932006633, "grad_norm": 2.674771974773721, "learning_rate": 3.314211501153823e-05, "loss": 0.4728, "step": 1665 }, { "epoch": 2.4558688041275105, "grad_norm": 2.27028319201419, "learning_rate": 3.312339832209137e-05, "loss": 0.5211, "step": 1666 }, { "epoch": 2.4573429150543578, "grad_norm": 2.291223540957152, "learning_rate": 3.3104676540577094e-05, "loss": 0.5814, "step": 1667 }, { "epoch": 2.458817025981205, "grad_norm": 2.3683685277390456, "learning_rate": 3.308594967873095e-05, "loss": 0.617, "step": 1668 }, { "epoch": 2.4602911369080522, "grad_norm": 2.3513711743765, "learning_rate": 3.3067217748291695e-05, "loss": 0.5935, "step": 1669 }, { "epoch": 2.4617652478348995, "grad_norm": 2.1571717434467517, "learning_rate": 3.304848076100122e-05, "loss": 0.6507, "step": 1670 }, { "epoch": 2.4632393587617467, "grad_norm": 2.078045836353433, "learning_rate": 3.302973872860463e-05, "loss": 0.4961, "step": 1671 }, { "epoch": 2.464713469688594, "grad_norm": 2.1799092060395378, "learning_rate": 3.301099166285017e-05, "loss": 0.555, "step": 1672 }, { "epoch": 2.466187580615441, "grad_norm": 2.202389641723773, "learning_rate": 3.299223957548923e-05, "loss": 0.4699, "step": 1673 }, { "epoch": 2.4676616915422884, "grad_norm": 1.9901765734483983, "learning_rate": 3.2973482478276364e-05, "loss": 0.5506, "step": 1674 }, { "epoch": 2.4691358024691357, "grad_norm": 2.2519188222398268, "learning_rate": 3.2954720382969263e-05, "loss": 0.6497, "step": 1675 }, { "epoch": 2.470609913395983, "grad_norm": 2.2158449523820085, "learning_rate": 3.293595330132876e-05, "loss": 0.5115, "step": 1676 }, { "epoch": 2.47208402432283, "grad_norm": 2.3472542569333843, "learning_rate": 3.291718124511879e-05, "loss": 0.58, "step": 1677 }, { "epoch": 2.4735581352496774, "grad_norm": 2.287030147805808, "learning_rate": 3.289840422610643e-05, "loss": 0.5494, "step": 1678 }, { "epoch": 2.4750322461765246, "grad_norm": 2.779939149698315, "learning_rate": 3.287962225606185e-05, "loss": 0.6372, "step": 1679 }, { "epoch": 2.476506357103372, "grad_norm": 2.4892172976609266, "learning_rate": 3.286083534675835e-05, "loss": 0.6564, "step": 1680 }, { "epoch": 2.477980468030219, "grad_norm": 2.3002700108139797, "learning_rate": 3.284204350997229e-05, "loss": 0.5375, "step": 1681 }, { "epoch": 2.4794545789570663, "grad_norm": 1.9906629525823027, "learning_rate": 3.282324675748314e-05, "loss": 0.4185, "step": 1682 }, { "epoch": 2.4809286898839136, "grad_norm": 2.540156483823544, "learning_rate": 3.280444510107346e-05, "loss": 0.5918, "step": 1683 }, { "epoch": 2.4824028008107613, "grad_norm": 2.185785984462579, "learning_rate": 3.278563855252885e-05, "loss": 0.5755, "step": 1684 }, { "epoch": 2.4838769117376085, "grad_norm": 2.617083282673149, "learning_rate": 3.276682712363801e-05, "loss": 0.6874, "step": 1685 }, { "epoch": 2.4853510226644557, "grad_norm": 2.65760194369388, "learning_rate": 3.274801082619269e-05, "loss": 0.6154, "step": 1686 }, { "epoch": 2.486825133591303, "grad_norm": 2.40590566177796, "learning_rate": 3.2729189671987695e-05, "loss": 0.4829, "step": 1687 }, { "epoch": 2.48829924451815, "grad_norm": 2.4159607568799992, "learning_rate": 3.271036367282085e-05, "loss": 0.5429, "step": 1688 }, { "epoch": 2.4897733554449974, "grad_norm": 2.3884120497295296, "learning_rate": 3.269153284049306e-05, "loss": 0.4552, "step": 1689 }, { "epoch": 2.4912474663718447, "grad_norm": 2.15586838593339, "learning_rate": 3.267269718680822e-05, "loss": 0.536, "step": 1690 }, { "epoch": 2.492721577298692, "grad_norm": 2.102131592795502, "learning_rate": 3.265385672357327e-05, "loss": 0.5532, "step": 1691 }, { "epoch": 2.494195688225539, "grad_norm": 2.8650952430858427, "learning_rate": 3.2635011462598145e-05, "loss": 0.5573, "step": 1692 }, { "epoch": 2.4956697991523864, "grad_norm": 2.5127939915767077, "learning_rate": 3.261616141569581e-05, "loss": 0.5044, "step": 1693 }, { "epoch": 2.4971439100792336, "grad_norm": 2.1850550475704464, "learning_rate": 3.2597306594682225e-05, "loss": 0.507, "step": 1694 }, { "epoch": 2.498618021006081, "grad_norm": 2.394403003039964, "learning_rate": 3.257844701137633e-05, "loss": 0.6079, "step": 1695 }, { "epoch": 2.500092131932928, "grad_norm": 2.0415416860928133, "learning_rate": 3.255958267760006e-05, "loss": 0.5567, "step": 1696 }, { "epoch": 2.5015662428597754, "grad_norm": 2.1928631024925367, "learning_rate": 3.254071360517833e-05, "loss": 0.6651, "step": 1697 }, { "epoch": 2.5030403537866226, "grad_norm": 1.9826156746981902, "learning_rate": 3.252183980593901e-05, "loss": 0.5787, "step": 1698 }, { "epoch": 2.50451446471347, "grad_norm": 2.2797969214028986, "learning_rate": 3.250296129171295e-05, "loss": 0.5995, "step": 1699 }, { "epoch": 2.505988575640317, "grad_norm": 2.0993565890705015, "learning_rate": 3.2484078074333954e-05, "loss": 0.5144, "step": 1700 }, { "epoch": 2.5074626865671643, "grad_norm": 1.9367162903727384, "learning_rate": 3.246519016563876e-05, "loss": 0.5175, "step": 1701 }, { "epoch": 2.5089367974940116, "grad_norm": 2.303348682571403, "learning_rate": 3.244629757746706e-05, "loss": 0.5396, "step": 1702 }, { "epoch": 2.510410908420859, "grad_norm": 2.7011751407981435, "learning_rate": 3.242740032166149e-05, "loss": 0.4966, "step": 1703 }, { "epoch": 2.511885019347706, "grad_norm": 2.072770872372407, "learning_rate": 3.240849841006758e-05, "loss": 0.5754, "step": 1704 }, { "epoch": 2.5133591302745533, "grad_norm": 2.0312218297325457, "learning_rate": 3.23895918545338e-05, "loss": 0.4923, "step": 1705 }, { "epoch": 2.5148332412014005, "grad_norm": 2.294072316154843, "learning_rate": 3.237068066691152e-05, "loss": 0.5815, "step": 1706 }, { "epoch": 2.5163073521282477, "grad_norm": 2.3207160778547795, "learning_rate": 3.2351764859055034e-05, "loss": 0.6646, "step": 1707 }, { "epoch": 2.517781463055095, "grad_norm": 2.452247316834822, "learning_rate": 3.233284444282152e-05, "loss": 0.4754, "step": 1708 }, { "epoch": 2.519255573981942, "grad_norm": 1.9811927329851466, "learning_rate": 3.2313919430071026e-05, "loss": 0.5197, "step": 1709 }, { "epoch": 2.5207296849087895, "grad_norm": 2.035697411106317, "learning_rate": 3.2294989832666514e-05, "loss": 0.5193, "step": 1710 }, { "epoch": 2.5222037958356367, "grad_norm": 1.9552860359058089, "learning_rate": 3.22760556624738e-05, "loss": 0.4463, "step": 1711 }, { "epoch": 2.523677906762484, "grad_norm": 2.2813382066369563, "learning_rate": 3.225711693136156e-05, "loss": 0.6232, "step": 1712 }, { "epoch": 2.525152017689331, "grad_norm": 2.187030305845189, "learning_rate": 3.223817365120136e-05, "loss": 0.4425, "step": 1713 }, { "epoch": 2.5266261286161784, "grad_norm": 3.220993445308332, "learning_rate": 3.221922583386758e-05, "loss": 0.5537, "step": 1714 }, { "epoch": 2.5281002395430257, "grad_norm": 2.3665908716269937, "learning_rate": 3.220027349123748e-05, "loss": 0.4989, "step": 1715 }, { "epoch": 2.529574350469873, "grad_norm": 2.5953584762258157, "learning_rate": 3.2181316635191125e-05, "loss": 0.5963, "step": 1716 }, { "epoch": 2.53104846139672, "grad_norm": 2.14329050995251, "learning_rate": 3.2162355277611416e-05, "loss": 0.4523, "step": 1717 }, { "epoch": 2.5325225723235674, "grad_norm": 2.3677458151924435, "learning_rate": 3.214338943038409e-05, "loss": 0.5398, "step": 1718 }, { "epoch": 2.5339966832504146, "grad_norm": 2.468915909612515, "learning_rate": 3.21244191053977e-05, "loss": 0.5525, "step": 1719 }, { "epoch": 2.535470794177262, "grad_norm": 2.4398517512014077, "learning_rate": 3.2105444314543584e-05, "loss": 0.5419, "step": 1720 }, { "epoch": 2.536944905104109, "grad_norm": 2.6714904686916445, "learning_rate": 3.208646506971589e-05, "loss": 0.6568, "step": 1721 }, { "epoch": 2.5384190160309563, "grad_norm": 2.175086740978398, "learning_rate": 3.206748138281157e-05, "loss": 0.5289, "step": 1722 }, { "epoch": 2.5398931269578036, "grad_norm": 2.9811123094411363, "learning_rate": 3.204849326573034e-05, "loss": 0.6019, "step": 1723 }, { "epoch": 2.541367237884651, "grad_norm": 2.5569624264805593, "learning_rate": 3.20295007303747e-05, "loss": 0.5269, "step": 1724 }, { "epoch": 2.542841348811498, "grad_norm": 2.3226918566476913, "learning_rate": 3.201050378864994e-05, "loss": 0.5366, "step": 1725 }, { "epoch": 2.5443154597383453, "grad_norm": 2.4308627803336664, "learning_rate": 3.1991502452464074e-05, "loss": 0.5929, "step": 1726 }, { "epoch": 2.5457895706651925, "grad_norm": 2.2750130663486057, "learning_rate": 3.1972496733727906e-05, "loss": 0.6262, "step": 1727 }, { "epoch": 2.5472636815920398, "grad_norm": 2.5790135557447798, "learning_rate": 3.195348664435497e-05, "loss": 0.5228, "step": 1728 }, { "epoch": 2.548737792518887, "grad_norm": 2.7549170837109704, "learning_rate": 3.193447219626153e-05, "loss": 0.6045, "step": 1729 }, { "epoch": 2.5502119034457342, "grad_norm": 2.369435366719626, "learning_rate": 3.191545340136661e-05, "loss": 0.5677, "step": 1730 }, { "epoch": 2.5516860143725815, "grad_norm": 2.3828062964186665, "learning_rate": 3.1896430271591937e-05, "loss": 0.5579, "step": 1731 }, { "epoch": 2.5531601252994287, "grad_norm": 2.4299365342254866, "learning_rate": 3.187740281886195e-05, "loss": 0.571, "step": 1732 }, { "epoch": 2.554634236226276, "grad_norm": 3.020210061504463, "learning_rate": 3.185837105510383e-05, "loss": 0.5458, "step": 1733 }, { "epoch": 2.556108347153123, "grad_norm": 2.159813741671295, "learning_rate": 3.183933499224743e-05, "loss": 0.6026, "step": 1734 }, { "epoch": 2.5575824580799704, "grad_norm": 2.1519131376824037, "learning_rate": 3.18202946422253e-05, "loss": 0.5951, "step": 1735 }, { "epoch": 2.5590565690068177, "grad_norm": 2.070807286283552, "learning_rate": 3.18012500169727e-05, "loss": 0.5253, "step": 1736 }, { "epoch": 2.560530679933665, "grad_norm": 2.3074476363005076, "learning_rate": 3.178220112842753e-05, "loss": 0.4982, "step": 1737 }, { "epoch": 2.562004790860512, "grad_norm": 1.9955308572155954, "learning_rate": 3.176314798853042e-05, "loss": 0.4829, "step": 1738 }, { "epoch": 2.5634789017873594, "grad_norm": 2.2370532863240586, "learning_rate": 3.17440906092246e-05, "loss": 0.6553, "step": 1739 }, { "epoch": 2.5649530127142066, "grad_norm": 2.493323183940011, "learning_rate": 3.1725029002456e-05, "loss": 0.5468, "step": 1740 }, { "epoch": 2.566427123641054, "grad_norm": 2.098761070454012, "learning_rate": 3.17059631801732e-05, "loss": 0.5783, "step": 1741 }, { "epoch": 2.567901234567901, "grad_norm": 2.6016710046817146, "learning_rate": 3.168689315432741e-05, "loss": 0.4837, "step": 1742 }, { "epoch": 2.5693753454947483, "grad_norm": 2.3021350239147487, "learning_rate": 3.1667818936872465e-05, "loss": 0.5691, "step": 1743 }, { "epoch": 2.5708494564215956, "grad_norm": 2.049139968224474, "learning_rate": 3.1648740539764844e-05, "loss": 0.508, "step": 1744 }, { "epoch": 2.572323567348443, "grad_norm": 2.566784974463786, "learning_rate": 3.162965797496364e-05, "loss": 0.6035, "step": 1745 }, { "epoch": 2.57379767827529, "grad_norm": 2.093285124683261, "learning_rate": 3.161057125443056e-05, "loss": 0.4992, "step": 1746 }, { "epoch": 2.5752717892021373, "grad_norm": 2.084223582474285, "learning_rate": 3.1591480390129914e-05, "loss": 0.4767, "step": 1747 }, { "epoch": 2.5767459001289845, "grad_norm": 2.2145353233705283, "learning_rate": 3.157238539402862e-05, "loss": 0.5006, "step": 1748 }, { "epoch": 2.5782200110558318, "grad_norm": 2.5718384824735088, "learning_rate": 3.155328627809617e-05, "loss": 0.6074, "step": 1749 }, { "epoch": 2.579694121982679, "grad_norm": 2.1763990747532196, "learning_rate": 3.1534183054304645e-05, "loss": 0.6135, "step": 1750 }, { "epoch": 2.579694121982679, "eval_bleu": 0.09022869002605914, "eval_bleu_1gram": 0.4083196112421035, "eval_bleu_2gram": 0.1750883236889509, "eval_bleu_3gram": 0.08452598047485654, "eval_bleu_4gram": 0.04279504499903709, "eval_rag_val_loss": 1.1402124568659773, "eval_rouge1": 0.40823468913669525, "eval_rouge2": 0.1714998681697217, "eval_rougeL": 0.39170235533018055, "step": 1750 }, { "epoch": 2.5811682329095262, "grad_norm": 2.469767879745337, "learning_rate": 3.1515075734628705e-05, "loss": 0.5064, "step": 1751 }, { "epoch": 2.5826423438363735, "grad_norm": 2.22089336187695, "learning_rate": 3.149596433104556e-05, "loss": 0.6392, "step": 1752 }, { "epoch": 2.5841164547632207, "grad_norm": 2.853502590768486, "learning_rate": 3.147684885553502e-05, "loss": 0.6013, "step": 1753 }, { "epoch": 2.585590565690068, "grad_norm": 2.6755113451636428, "learning_rate": 3.145772932007939e-05, "loss": 0.5119, "step": 1754 }, { "epoch": 2.587064676616915, "grad_norm": 2.2853209868171396, "learning_rate": 3.143860573666357e-05, "loss": 0.5169, "step": 1755 }, { "epoch": 2.5885387875437624, "grad_norm": 2.4450168613718346, "learning_rate": 3.1419478117274984e-05, "loss": 0.5548, "step": 1756 }, { "epoch": 2.5900128984706097, "grad_norm": 2.8754338476152985, "learning_rate": 3.140034647390357e-05, "loss": 0.7737, "step": 1757 }, { "epoch": 2.591487009397457, "grad_norm": 2.132319243128068, "learning_rate": 3.13812108185418e-05, "loss": 0.4719, "step": 1758 }, { "epoch": 2.592961120324304, "grad_norm": 1.9315767499431702, "learning_rate": 3.136207116318466e-05, "loss": 0.4738, "step": 1759 }, { "epoch": 2.5944352312511514, "grad_norm": 2.466920391986322, "learning_rate": 3.1342927519829644e-05, "loss": 0.6942, "step": 1760 }, { "epoch": 2.5959093421779986, "grad_norm": 2.185655742862117, "learning_rate": 3.1323779900476744e-05, "loss": 0.5675, "step": 1761 }, { "epoch": 2.597383453104846, "grad_norm": 2.1619512401943823, "learning_rate": 3.1304628317128446e-05, "loss": 0.5904, "step": 1762 }, { "epoch": 2.5988575640316935, "grad_norm": 2.101589613955841, "learning_rate": 3.128547278178972e-05, "loss": 0.5869, "step": 1763 }, { "epoch": 2.600331674958541, "grad_norm": 2.169807163231413, "learning_rate": 3.126631330646802e-05, "loss": 0.5257, "step": 1764 }, { "epoch": 2.601805785885388, "grad_norm": 2.248841623302539, "learning_rate": 3.124714990317324e-05, "loss": 0.5784, "step": 1765 }, { "epoch": 2.6032798968122353, "grad_norm": 2.1283555184654133, "learning_rate": 3.122798258391779e-05, "loss": 0.634, "step": 1766 }, { "epoch": 2.6047540077390825, "grad_norm": 2.329000014164664, "learning_rate": 3.120881136071649e-05, "loss": 0.6349, "step": 1767 }, { "epoch": 2.6062281186659297, "grad_norm": 1.8242999759210068, "learning_rate": 3.118963624558662e-05, "loss": 0.4249, "step": 1768 }, { "epoch": 2.607702229592777, "grad_norm": 2.2723773175351605, "learning_rate": 3.11704572505479e-05, "loss": 0.558, "step": 1769 }, { "epoch": 2.609176340519624, "grad_norm": 2.383220300016993, "learning_rate": 3.115127438762247e-05, "loss": 0.6408, "step": 1770 }, { "epoch": 2.6106504514464715, "grad_norm": 2.5416800284946444, "learning_rate": 3.113208766883494e-05, "loss": 0.5696, "step": 1771 }, { "epoch": 2.6121245623733187, "grad_norm": 2.6155080388579512, "learning_rate": 3.111289710621228e-05, "loss": 0.6234, "step": 1772 }, { "epoch": 2.613598673300166, "grad_norm": 2.2156903298973187, "learning_rate": 3.109370271178389e-05, "loss": 0.6239, "step": 1773 }, { "epoch": 2.615072784227013, "grad_norm": 2.0218115205542766, "learning_rate": 3.10745044975816e-05, "loss": 0.5312, "step": 1774 }, { "epoch": 2.6165468951538604, "grad_norm": 2.1127455168037663, "learning_rate": 3.1055302475639594e-05, "loss": 0.5309, "step": 1775 }, { "epoch": 2.6180210060807076, "grad_norm": 2.51654680316103, "learning_rate": 3.103609665799445e-05, "loss": 0.6734, "step": 1776 }, { "epoch": 2.619495117007555, "grad_norm": 2.1313916738188037, "learning_rate": 3.1016887056685155e-05, "loss": 0.5434, "step": 1777 }, { "epoch": 2.620969227934402, "grad_norm": 2.2443541582075186, "learning_rate": 3.0997673683753024e-05, "loss": 0.5446, "step": 1778 }, { "epoch": 2.6224433388612494, "grad_norm": 2.3473336859828025, "learning_rate": 3.0978456551241786e-05, "loss": 0.533, "step": 1779 }, { "epoch": 2.6239174497880966, "grad_norm": 2.059349428055738, "learning_rate": 3.095923567119748e-05, "loss": 0.5262, "step": 1780 }, { "epoch": 2.625391560714944, "grad_norm": 2.319400286936267, "learning_rate": 3.094001105566852e-05, "loss": 0.5522, "step": 1781 }, { "epoch": 2.626865671641791, "grad_norm": 2.409657629236303, "learning_rate": 3.0920782716705654e-05, "loss": 0.5618, "step": 1782 }, { "epoch": 2.6283397825686383, "grad_norm": 2.50316457729031, "learning_rate": 3.0901550666361964e-05, "loss": 0.64, "step": 1783 }, { "epoch": 2.6298138934954856, "grad_norm": 2.253394110027405, "learning_rate": 3.088231491669287e-05, "loss": 0.6423, "step": 1784 }, { "epoch": 2.631288004422333, "grad_norm": 1.9035380098532455, "learning_rate": 3.0863075479756084e-05, "loss": 0.4694, "step": 1785 }, { "epoch": 2.63276211534918, "grad_norm": 2.278303150385425, "learning_rate": 3.084383236761166e-05, "loss": 0.5682, "step": 1786 }, { "epoch": 2.6342362262760273, "grad_norm": 2.747226964211258, "learning_rate": 3.0824585592321936e-05, "loss": 0.6068, "step": 1787 }, { "epoch": 2.6357103372028745, "grad_norm": 2.477852181694092, "learning_rate": 3.080533516595155e-05, "loss": 0.7103, "step": 1788 }, { "epoch": 2.6371844481297217, "grad_norm": 2.325658015996106, "learning_rate": 3.078608110056745e-05, "loss": 0.6227, "step": 1789 }, { "epoch": 2.638658559056569, "grad_norm": 2.2511968608305626, "learning_rate": 3.076682340823882e-05, "loss": 0.6039, "step": 1790 }, { "epoch": 2.6401326699834162, "grad_norm": 2.147074318952075, "learning_rate": 3.074756210103715e-05, "loss": 0.5699, "step": 1791 }, { "epoch": 2.6416067809102635, "grad_norm": 2.09676758537955, "learning_rate": 3.072829719103619e-05, "loss": 0.5732, "step": 1792 }, { "epoch": 2.6430808918371107, "grad_norm": 2.2113878849528104, "learning_rate": 3.070902869031196e-05, "loss": 0.4507, "step": 1793 }, { "epoch": 2.644555002763958, "grad_norm": 1.9082843368881373, "learning_rate": 3.0689756610942705e-05, "loss": 0.4711, "step": 1794 }, { "epoch": 2.646029113690805, "grad_norm": 2.412310016642563, "learning_rate": 3.067048096500893e-05, "loss": 0.4987, "step": 1795 }, { "epoch": 2.6475032246176524, "grad_norm": 2.3841905955029716, "learning_rate": 3.065120176459338e-05, "loss": 0.5569, "step": 1796 }, { "epoch": 2.6489773355444997, "grad_norm": 2.309907749034908, "learning_rate": 3.0631919021781e-05, "loss": 0.5587, "step": 1797 }, { "epoch": 2.650451446471347, "grad_norm": 2.1562304288556446, "learning_rate": 3.0612632748659e-05, "loss": 0.5118, "step": 1798 }, { "epoch": 2.651925557398194, "grad_norm": 2.3582578123511815, "learning_rate": 3.0593342957316765e-05, "loss": 0.5427, "step": 1799 }, { "epoch": 2.6533996683250414, "grad_norm": 2.417368184138094, "learning_rate": 3.05740496598459e-05, "loss": 0.5661, "step": 1800 }, { "epoch": 2.6548737792518886, "grad_norm": 2.0091071203904516, "learning_rate": 3.055475286834021e-05, "loss": 0.5807, "step": 1801 }, { "epoch": 2.656347890178736, "grad_norm": 2.2187516118433295, "learning_rate": 3.053545259489569e-05, "loss": 0.5667, "step": 1802 }, { "epoch": 2.657822001105583, "grad_norm": 2.087050351809939, "learning_rate": 3.051614885161051e-05, "loss": 0.5247, "step": 1803 }, { "epoch": 2.6592961120324303, "grad_norm": 2.415600530542403, "learning_rate": 3.0496841650585022e-05, "loss": 0.6302, "step": 1804 }, { "epoch": 2.6607702229592776, "grad_norm": 2.5458916948270898, "learning_rate": 3.0477531003921745e-05, "loss": 0.5644, "step": 1805 }, { "epoch": 2.662244333886125, "grad_norm": 2.161562691126417, "learning_rate": 3.0458216923725356e-05, "loss": 0.5555, "step": 1806 }, { "epoch": 2.663718444812972, "grad_norm": 2.3073967995095606, "learning_rate": 3.043889942210268e-05, "loss": 0.5339, "step": 1807 }, { "epoch": 2.6651925557398193, "grad_norm": 2.2093120301592633, "learning_rate": 3.0419578511162695e-05, "loss": 0.557, "step": 1808 }, { "epoch": 2.6666666666666665, "grad_norm": 2.198136654612232, "learning_rate": 3.0400254203016503e-05, "loss": 0.5498, "step": 1809 }, { "epoch": 2.6681407775935138, "grad_norm": 2.146631765121367, "learning_rate": 3.0380926509777364e-05, "loss": 0.6948, "step": 1810 }, { "epoch": 2.669614888520361, "grad_norm": 2.358845430272195, "learning_rate": 3.0361595443560624e-05, "loss": 0.6861, "step": 1811 }, { "epoch": 2.6710889994472087, "grad_norm": 2.478910085613218, "learning_rate": 3.034226101648377e-05, "loss": 0.5569, "step": 1812 }, { "epoch": 2.672563110374056, "grad_norm": 2.3887875545310373, "learning_rate": 3.0322923240666377e-05, "loss": 0.6214, "step": 1813 }, { "epoch": 2.674037221300903, "grad_norm": 1.9900059742095064, "learning_rate": 3.030358212823014e-05, "loss": 0.5516, "step": 1814 }, { "epoch": 2.6755113322277504, "grad_norm": 2.0859457294430364, "learning_rate": 3.0284237691298823e-05, "loss": 0.6141, "step": 1815 }, { "epoch": 2.6769854431545976, "grad_norm": 2.1877140485398705, "learning_rate": 3.0264889941998285e-05, "loss": 0.5454, "step": 1816 }, { "epoch": 2.678459554081445, "grad_norm": 2.2590028997219433, "learning_rate": 3.0245538892456455e-05, "loss": 0.6063, "step": 1817 }, { "epoch": 2.679933665008292, "grad_norm": 2.409884791489409, "learning_rate": 3.0226184554803357e-05, "loss": 0.6019, "step": 1818 }, { "epoch": 2.6814077759351393, "grad_norm": 2.125710312551017, "learning_rate": 3.0206826941171035e-05, "loss": 0.5264, "step": 1819 }, { "epoch": 2.6828818868619866, "grad_norm": 2.389523320884273, "learning_rate": 3.0187466063693614e-05, "loss": 0.5853, "step": 1820 }, { "epoch": 2.684355997788834, "grad_norm": 2.4687209308699476, "learning_rate": 3.0168101934507266e-05, "loss": 0.53, "step": 1821 }, { "epoch": 2.685830108715681, "grad_norm": 2.3762344364657917, "learning_rate": 3.0148734565750176e-05, "loss": 0.6224, "step": 1822 }, { "epoch": 2.6873042196425283, "grad_norm": 2.0872854385209445, "learning_rate": 3.012936396956259e-05, "loss": 0.5498, "step": 1823 }, { "epoch": 2.6887783305693755, "grad_norm": 2.041523584555213, "learning_rate": 3.0109990158086764e-05, "loss": 0.485, "step": 1824 }, { "epoch": 2.690252441496223, "grad_norm": 1.9986800964422566, "learning_rate": 3.0090613143466956e-05, "loss": 0.5293, "step": 1825 }, { "epoch": 2.69172655242307, "grad_norm": 1.8776711828059103, "learning_rate": 3.0071232937849457e-05, "loss": 0.5548, "step": 1826 }, { "epoch": 2.6932006633499173, "grad_norm": 2.595268081018513, "learning_rate": 3.0051849553382555e-05, "loss": 0.5659, "step": 1827 }, { "epoch": 2.6946747742767645, "grad_norm": 2.09350288413145, "learning_rate": 3.0032463002216505e-05, "loss": 0.5129, "step": 1828 }, { "epoch": 2.6961488852036117, "grad_norm": 2.784773491323798, "learning_rate": 3.001307329650357e-05, "loss": 0.6633, "step": 1829 }, { "epoch": 2.697622996130459, "grad_norm": 2.171837017358407, "learning_rate": 2.9993680448397988e-05, "loss": 0.5506, "step": 1830 }, { "epoch": 2.699097107057306, "grad_norm": 2.6056938474494165, "learning_rate": 2.997428447005596e-05, "loss": 0.5917, "step": 1831 }, { "epoch": 2.7005712179841534, "grad_norm": 2.646934330262911, "learning_rate": 2.9954885373635655e-05, "loss": 0.6825, "step": 1832 }, { "epoch": 2.7020453289110007, "grad_norm": 2.338123842110101, "learning_rate": 2.9935483171297186e-05, "loss": 0.6268, "step": 1833 }, { "epoch": 2.703519439837848, "grad_norm": 2.1162917152975536, "learning_rate": 2.991607787520263e-05, "loss": 0.5458, "step": 1834 }, { "epoch": 2.704993550764695, "grad_norm": 1.941056340881259, "learning_rate": 2.989666949751599e-05, "loss": 0.5336, "step": 1835 }, { "epoch": 2.7064676616915424, "grad_norm": 2.597723617253803, "learning_rate": 2.9877258050403212e-05, "loss": 0.6088, "step": 1836 }, { "epoch": 2.7079417726183896, "grad_norm": 2.5854426253309457, "learning_rate": 2.985784354603215e-05, "loss": 0.5155, "step": 1837 }, { "epoch": 2.709415883545237, "grad_norm": 2.2840179999355392, "learning_rate": 2.9838425996572583e-05, "loss": 0.5031, "step": 1838 }, { "epoch": 2.710889994472084, "grad_norm": 2.472577954057587, "learning_rate": 2.981900541419621e-05, "loss": 0.6911, "step": 1839 }, { "epoch": 2.7123641053989314, "grad_norm": 2.347067963937932, "learning_rate": 2.9799581811076605e-05, "loss": 0.5842, "step": 1840 }, { "epoch": 2.7138382163257786, "grad_norm": 2.9291516437027703, "learning_rate": 2.978015519938926e-05, "loss": 0.5634, "step": 1841 }, { "epoch": 2.715312327252626, "grad_norm": 2.385505830078421, "learning_rate": 2.9760725591311545e-05, "loss": 0.5489, "step": 1842 }, { "epoch": 2.716786438179473, "grad_norm": 2.151756358780257, "learning_rate": 2.9741292999022707e-05, "loss": 0.5556, "step": 1843 }, { "epoch": 2.7182605491063203, "grad_norm": 2.6977724211044523, "learning_rate": 2.9721857434703858e-05, "loss": 0.5099, "step": 1844 }, { "epoch": 2.7197346600331676, "grad_norm": 2.603357743829764, "learning_rate": 2.9702418910537983e-05, "loss": 0.6032, "step": 1845 }, { "epoch": 2.721208770960015, "grad_norm": 2.080681855607178, "learning_rate": 2.9682977438709914e-05, "loss": 0.5504, "step": 1846 }, { "epoch": 2.722682881886862, "grad_norm": 2.1641227631148348, "learning_rate": 2.9663533031406344e-05, "loss": 0.571, "step": 1847 }, { "epoch": 2.7241569928137093, "grad_norm": 2.123785232691764, "learning_rate": 2.9644085700815777e-05, "loss": 0.6089, "step": 1848 }, { "epoch": 2.7256311037405565, "grad_norm": 2.009953644820545, "learning_rate": 2.9624635459128585e-05, "loss": 0.4615, "step": 1849 }, { "epoch": 2.7271052146674037, "grad_norm": 2.3231477722250555, "learning_rate": 2.960518231853695e-05, "loss": 0.4632, "step": 1850 }, { "epoch": 2.728579325594251, "grad_norm": 2.039542773749094, "learning_rate": 2.9585726291234872e-05, "loss": 0.3987, "step": 1851 }, { "epoch": 2.730053436521098, "grad_norm": 2.3701673831661876, "learning_rate": 2.9566267389418144e-05, "loss": 0.6166, "step": 1852 }, { "epoch": 2.7315275474479455, "grad_norm": 2.258388775734976, "learning_rate": 2.9546805625284384e-05, "loss": 0.5271, "step": 1853 }, { "epoch": 2.7330016583747927, "grad_norm": 2.2740213489989767, "learning_rate": 2.9527341011033e-05, "loss": 0.5607, "step": 1854 }, { "epoch": 2.73447576930164, "grad_norm": 2.476348290069321, "learning_rate": 2.9507873558865175e-05, "loss": 0.6205, "step": 1855 }, { "epoch": 2.735949880228487, "grad_norm": 2.4897858817943797, "learning_rate": 2.9488403280983873e-05, "loss": 0.6656, "step": 1856 }, { "epoch": 2.7374239911553344, "grad_norm": 2.1495381812968066, "learning_rate": 2.9468930189593845e-05, "loss": 0.5407, "step": 1857 }, { "epoch": 2.7388981020821817, "grad_norm": 2.3246672894727074, "learning_rate": 2.9449454296901603e-05, "loss": 0.5717, "step": 1858 }, { "epoch": 2.740372213009029, "grad_norm": 2.2523340199207755, "learning_rate": 2.9429975615115383e-05, "loss": 0.6988, "step": 1859 }, { "epoch": 2.741846323935876, "grad_norm": 2.4460954966458153, "learning_rate": 2.9410494156445216e-05, "loss": 0.5891, "step": 1860 }, { "epoch": 2.7433204348627234, "grad_norm": 2.518506691968281, "learning_rate": 2.9391009933102836e-05, "loss": 0.6087, "step": 1861 }, { "epoch": 2.7447945457895706, "grad_norm": 2.473084808863674, "learning_rate": 2.9371522957301734e-05, "loss": 0.5554, "step": 1862 }, { "epoch": 2.746268656716418, "grad_norm": 2.3585004385615593, "learning_rate": 2.935203324125711e-05, "loss": 0.6011, "step": 1863 }, { "epoch": 2.747742767643265, "grad_norm": 2.2982570097571657, "learning_rate": 2.9332540797185892e-05, "loss": 0.671, "step": 1864 }, { "epoch": 2.7492168785701123, "grad_norm": 2.446058165718744, "learning_rate": 2.9313045637306714e-05, "loss": 0.5933, "step": 1865 }, { "epoch": 2.7506909894969596, "grad_norm": 2.3131555195206857, "learning_rate": 2.9293547773839917e-05, "loss": 0.6432, "step": 1866 }, { "epoch": 2.752165100423807, "grad_norm": 2.123678581918487, "learning_rate": 2.9274047219007534e-05, "loss": 0.4655, "step": 1867 }, { "epoch": 2.753639211350654, "grad_norm": 2.484888851358784, "learning_rate": 2.925454398503328e-05, "loss": 0.5807, "step": 1868 }, { "epoch": 2.7551133222775013, "grad_norm": 2.198583155336676, "learning_rate": 2.9235038084142557e-05, "loss": 0.5425, "step": 1869 }, { "epoch": 2.7565874332043485, "grad_norm": 2.7018944028413148, "learning_rate": 2.921552952856243e-05, "loss": 0.5816, "step": 1870 }, { "epoch": 2.7580615441311958, "grad_norm": 2.184295268643339, "learning_rate": 2.919601833052163e-05, "loss": 0.6152, "step": 1871 }, { "epoch": 2.759535655058043, "grad_norm": 2.788791589909849, "learning_rate": 2.9176504502250563e-05, "loss": 0.6246, "step": 1872 }, { "epoch": 2.7610097659848902, "grad_norm": 2.037392937591376, "learning_rate": 2.9156988055981254e-05, "loss": 0.5256, "step": 1873 }, { "epoch": 2.7624838769117375, "grad_norm": 2.2283023929468913, "learning_rate": 2.9137469003947392e-05, "loss": 0.5402, "step": 1874 }, { "epoch": 2.7639579878385847, "grad_norm": 2.055966519465382, "learning_rate": 2.9117947358384288e-05, "loss": 0.5415, "step": 1875 }, { "epoch": 2.765432098765432, "grad_norm": 2.0949663642969067, "learning_rate": 2.909842313152888e-05, "loss": 0.588, "step": 1876 }, { "epoch": 2.766906209692279, "grad_norm": 2.184349843591172, "learning_rate": 2.9078896335619732e-05, "loss": 0.6297, "step": 1877 }, { "epoch": 2.7683803206191264, "grad_norm": 2.12789641061492, "learning_rate": 2.9059366982897007e-05, "loss": 0.5146, "step": 1878 }, { "epoch": 2.7698544315459737, "grad_norm": 2.3271148333442424, "learning_rate": 2.9039835085602473e-05, "loss": 0.5227, "step": 1879 }, { "epoch": 2.771328542472821, "grad_norm": 2.3370571961428785, "learning_rate": 2.9020300655979503e-05, "loss": 0.7404, "step": 1880 }, { "epoch": 2.772802653399668, "grad_norm": 2.2341656353369608, "learning_rate": 2.9000763706273036e-05, "loss": 0.6312, "step": 1881 }, { "epoch": 2.7742767643265154, "grad_norm": 2.0322136573803364, "learning_rate": 2.8981224248729628e-05, "loss": 0.5203, "step": 1882 }, { "epoch": 2.7757508752533626, "grad_norm": 2.466396997076156, "learning_rate": 2.896168229559737e-05, "loss": 0.4911, "step": 1883 }, { "epoch": 2.77722498618021, "grad_norm": 2.1379058720250432, "learning_rate": 2.8942137859125928e-05, "loss": 0.5255, "step": 1884 }, { "epoch": 2.778699097107057, "grad_norm": 2.346586418441449, "learning_rate": 2.8922590951566536e-05, "loss": 0.5484, "step": 1885 }, { "epoch": 2.7801732080339043, "grad_norm": 2.126718387313654, "learning_rate": 2.8903041585171963e-05, "loss": 0.4612, "step": 1886 }, { "epoch": 2.7816473189607516, "grad_norm": 2.3226309858655303, "learning_rate": 2.8883489772196525e-05, "loss": 0.5809, "step": 1887 }, { "epoch": 2.783121429887599, "grad_norm": 2.4675387537733404, "learning_rate": 2.886393552489608e-05, "loss": 0.6873, "step": 1888 }, { "epoch": 2.784595540814446, "grad_norm": 2.6472641587868986, "learning_rate": 2.8844378855527998e-05, "loss": 0.5228, "step": 1889 }, { "epoch": 2.7860696517412933, "grad_norm": 2.167217392498703, "learning_rate": 2.8824819776351176e-05, "loss": 0.5456, "step": 1890 }, { "epoch": 2.7875437626681405, "grad_norm": 1.884623309677205, "learning_rate": 2.8805258299626015e-05, "loss": 0.4701, "step": 1891 }, { "epoch": 2.7890178735949878, "grad_norm": 2.493542819869983, "learning_rate": 2.878569443761442e-05, "loss": 0.5065, "step": 1892 }, { "epoch": 2.790491984521835, "grad_norm": 2.3564440983730814, "learning_rate": 2.8766128202579797e-05, "loss": 0.5645, "step": 1893 }, { "epoch": 2.7919660954486822, "grad_norm": 2.651158738065102, "learning_rate": 2.874655960678704e-05, "loss": 0.6441, "step": 1894 }, { "epoch": 2.79344020637553, "grad_norm": 2.1697613427553226, "learning_rate": 2.87269886625025e-05, "loss": 0.4987, "step": 1895 }, { "epoch": 2.794914317302377, "grad_norm": 2.461708940966454, "learning_rate": 2.870741538199405e-05, "loss": 0.6243, "step": 1896 }, { "epoch": 2.7963884282292244, "grad_norm": 2.6038489389507604, "learning_rate": 2.8687839777530977e-05, "loss": 0.5617, "step": 1897 }, { "epoch": 2.7978625391560716, "grad_norm": 2.272573405172421, "learning_rate": 2.8668261861384045e-05, "loss": 0.5737, "step": 1898 }, { "epoch": 2.799336650082919, "grad_norm": 2.12847526457422, "learning_rate": 2.8648681645825472e-05, "loss": 0.6208, "step": 1899 }, { "epoch": 2.800810761009766, "grad_norm": 2.3160778925814207, "learning_rate": 2.8629099143128907e-05, "loss": 0.6102, "step": 1900 }, { "epoch": 2.8022848719366134, "grad_norm": 2.131983445674045, "learning_rate": 2.860951436556944e-05, "loss": 0.4298, "step": 1901 }, { "epoch": 2.8037589828634606, "grad_norm": 2.9952615828109055, "learning_rate": 2.8589927325423576e-05, "loss": 0.6029, "step": 1902 }, { "epoch": 2.805233093790308, "grad_norm": 2.1028384054501275, "learning_rate": 2.8570338034969264e-05, "loss": 0.4768, "step": 1903 }, { "epoch": 2.806707204717155, "grad_norm": 2.072249172479752, "learning_rate": 2.855074650648583e-05, "loss": 0.4987, "step": 1904 }, { "epoch": 2.8081813156440023, "grad_norm": 2.4801043385430006, "learning_rate": 2.853115275225403e-05, "loss": 0.6726, "step": 1905 }, { "epoch": 2.8096554265708495, "grad_norm": 1.9964032017270377, "learning_rate": 2.8511556784556e-05, "loss": 0.4815, "step": 1906 }, { "epoch": 2.811129537497697, "grad_norm": 2.049877498036897, "learning_rate": 2.8491958615675262e-05, "loss": 0.5357, "step": 1907 }, { "epoch": 2.812603648424544, "grad_norm": 2.3786662063633823, "learning_rate": 2.8472358257896732e-05, "loss": 0.5494, "step": 1908 }, { "epoch": 2.8140777593513913, "grad_norm": 2.1551161286469234, "learning_rate": 2.8452755723506687e-05, "loss": 0.5683, "step": 1909 }, { "epoch": 2.8155518702782385, "grad_norm": 2.158758832408726, "learning_rate": 2.843315102479276e-05, "loss": 0.4939, "step": 1910 }, { "epoch": 2.8170259812050857, "grad_norm": 2.1670001091281526, "learning_rate": 2.841354417404397e-05, "loss": 0.4101, "step": 1911 }, { "epoch": 2.818500092131933, "grad_norm": 2.2716968072956707, "learning_rate": 2.8393935183550662e-05, "loss": 0.5541, "step": 1912 }, { "epoch": 2.81997420305878, "grad_norm": 2.3496953198576382, "learning_rate": 2.8374324065604517e-05, "loss": 0.5337, "step": 1913 }, { "epoch": 2.8214483139856275, "grad_norm": 2.1364687925095436, "learning_rate": 2.8354710832498576e-05, "loss": 0.5676, "step": 1914 }, { "epoch": 2.8229224249124747, "grad_norm": 2.4142762123494905, "learning_rate": 2.833509549652717e-05, "loss": 0.6571, "step": 1915 }, { "epoch": 2.824396535839322, "grad_norm": 2.2040734380128555, "learning_rate": 2.831547806998598e-05, "loss": 0.5674, "step": 1916 }, { "epoch": 2.825870646766169, "grad_norm": 2.3259417646667013, "learning_rate": 2.8295858565171983e-05, "loss": 0.6222, "step": 1917 }, { "epoch": 2.8273447576930164, "grad_norm": 2.478301872493263, "learning_rate": 2.8276236994383453e-05, "loss": 0.6426, "step": 1918 }, { "epoch": 2.8288188686198636, "grad_norm": 2.438069888276554, "learning_rate": 2.825661336991998e-05, "loss": 0.573, "step": 1919 }, { "epoch": 2.830292979546711, "grad_norm": 2.7449241958462864, "learning_rate": 2.8236987704082417e-05, "loss": 0.5369, "step": 1920 }, { "epoch": 2.831767090473558, "grad_norm": 2.0323371912088684, "learning_rate": 2.8217360009172922e-05, "loss": 0.489, "step": 1921 }, { "epoch": 2.8332412014004054, "grad_norm": 2.154123778319003, "learning_rate": 2.8197730297494896e-05, "loss": 0.5302, "step": 1922 }, { "epoch": 2.8347153123272526, "grad_norm": 2.9382126937856863, "learning_rate": 2.8178098581353018e-05, "loss": 0.5556, "step": 1923 }, { "epoch": 2.8361894232541, "grad_norm": 2.4783303482389996, "learning_rate": 2.8158464873053237e-05, "loss": 0.5982, "step": 1924 }, { "epoch": 2.837663534180947, "grad_norm": 2.457063756585197, "learning_rate": 2.8138829184902727e-05, "loss": 0.6586, "step": 1925 }, { "epoch": 2.8391376451077943, "grad_norm": 2.1874895368053333, "learning_rate": 2.811919152920991e-05, "loss": 0.4695, "step": 1926 }, { "epoch": 2.8406117560346416, "grad_norm": 2.176482548172905, "learning_rate": 2.8099551918284468e-05, "loss": 0.503, "step": 1927 }, { "epoch": 2.842085866961489, "grad_norm": 2.006338090240185, "learning_rate": 2.8079910364437263e-05, "loss": 0.5807, "step": 1928 }, { "epoch": 2.843559977888336, "grad_norm": 2.0026321971371823, "learning_rate": 2.8060266879980408e-05, "loss": 0.5881, "step": 1929 }, { "epoch": 2.8450340888151833, "grad_norm": 2.0404522716097246, "learning_rate": 2.8040621477227214e-05, "loss": 0.5792, "step": 1930 }, { "epoch": 2.8465081997420305, "grad_norm": 2.094255187165821, "learning_rate": 2.8020974168492197e-05, "loss": 0.5321, "step": 1931 }, { "epoch": 2.8479823106688777, "grad_norm": 2.238807922500025, "learning_rate": 2.8001324966091076e-05, "loss": 0.52, "step": 1932 }, { "epoch": 2.849456421595725, "grad_norm": 2.396248082183564, "learning_rate": 2.7981673882340726e-05, "loss": 0.5895, "step": 1933 }, { "epoch": 2.8509305325225722, "grad_norm": 2.7381868659063557, "learning_rate": 2.796202092955924e-05, "loss": 0.4688, "step": 1934 }, { "epoch": 2.8524046434494195, "grad_norm": 2.1569984353404235, "learning_rate": 2.7942366120065872e-05, "loss": 0.4963, "step": 1935 }, { "epoch": 2.8538787543762667, "grad_norm": 2.144618854227302, "learning_rate": 2.792270946618102e-05, "loss": 0.5253, "step": 1936 }, { "epoch": 2.855352865303114, "grad_norm": 2.7113123917600626, "learning_rate": 2.790305098022626e-05, "loss": 0.6852, "step": 1937 }, { "epoch": 2.856826976229961, "grad_norm": 2.2340455779409756, "learning_rate": 2.78833906745243e-05, "loss": 0.4881, "step": 1938 }, { "epoch": 2.8583010871568084, "grad_norm": 2.5352822670673287, "learning_rate": 2.7863728561399016e-05, "loss": 0.5039, "step": 1939 }, { "epoch": 2.8597751980836557, "grad_norm": 2.002607791204525, "learning_rate": 2.7844064653175378e-05, "loss": 0.434, "step": 1940 }, { "epoch": 2.861249309010503, "grad_norm": 2.0047018096722327, "learning_rate": 2.7824398962179503e-05, "loss": 0.5778, "step": 1941 }, { "epoch": 2.86272341993735, "grad_norm": 2.2721671523944735, "learning_rate": 2.780473150073864e-05, "loss": 0.5414, "step": 1942 }, { "epoch": 2.8641975308641974, "grad_norm": 2.3440461035285822, "learning_rate": 2.7785062281181124e-05, "loss": 0.6213, "step": 1943 }, { "epoch": 2.8656716417910446, "grad_norm": 2.984463835187976, "learning_rate": 2.7765391315836396e-05, "loss": 0.6308, "step": 1944 }, { "epoch": 2.8671457527178923, "grad_norm": 2.4621196511181416, "learning_rate": 2.7745718617034998e-05, "loss": 0.5632, "step": 1945 }, { "epoch": 2.8686198636447395, "grad_norm": 2.1745257770469184, "learning_rate": 2.7726044197108557e-05, "loss": 0.6319, "step": 1946 }, { "epoch": 2.8700939745715868, "grad_norm": 2.2874440796067304, "learning_rate": 2.7706368068389778e-05, "loss": 0.4737, "step": 1947 }, { "epoch": 2.871568085498434, "grad_norm": 2.1183855671699257, "learning_rate": 2.7686690243212432e-05, "loss": 0.6328, "step": 1948 }, { "epoch": 2.8730421964252812, "grad_norm": 2.037229218071175, "learning_rate": 2.7667010733911354e-05, "loss": 0.5009, "step": 1949 }, { "epoch": 2.8745163073521285, "grad_norm": 2.1013973128620647, "learning_rate": 2.7647329552822455e-05, "loss": 0.5337, "step": 1950 }, { "epoch": 2.8759904182789757, "grad_norm": 2.0251720167025584, "learning_rate": 2.762764671228267e-05, "loss": 0.5413, "step": 1951 }, { "epoch": 2.877464529205823, "grad_norm": 2.1259899077655904, "learning_rate": 2.760796222462998e-05, "loss": 0.572, "step": 1952 }, { "epoch": 2.87893864013267, "grad_norm": 3.432199587862759, "learning_rate": 2.7588276102203398e-05, "loss": 0.4838, "step": 1953 }, { "epoch": 2.8804127510595174, "grad_norm": 2.064948160661541, "learning_rate": 2.7568588357342973e-05, "loss": 0.5258, "step": 1954 }, { "epoch": 2.8818868619863647, "grad_norm": 2.0731161459272838, "learning_rate": 2.754889900238975e-05, "loss": 0.5856, "step": 1955 }, { "epoch": 2.883360972913212, "grad_norm": 2.287338076004067, "learning_rate": 2.7529208049685807e-05, "loss": 0.6324, "step": 1956 }, { "epoch": 2.884835083840059, "grad_norm": 2.1287259366730034, "learning_rate": 2.7509515511574208e-05, "loss": 0.5225, "step": 1957 }, { "epoch": 2.8863091947669064, "grad_norm": 2.4158145015947983, "learning_rate": 2.748982140039902e-05, "loss": 0.5675, "step": 1958 }, { "epoch": 2.8877833056937536, "grad_norm": 2.2114296085781904, "learning_rate": 2.747012572850528e-05, "loss": 0.5371, "step": 1959 }, { "epoch": 2.889257416620601, "grad_norm": 2.350068691954147, "learning_rate": 2.7450428508239024e-05, "loss": 0.622, "step": 1960 }, { "epoch": 2.890731527547448, "grad_norm": 2.06161931048532, "learning_rate": 2.743072975194723e-05, "loss": 0.5153, "step": 1961 }, { "epoch": 2.8922056384742953, "grad_norm": 2.232333871570325, "learning_rate": 2.741102947197789e-05, "loss": 0.4967, "step": 1962 }, { "epoch": 2.8936797494011426, "grad_norm": 2.24554213674895, "learning_rate": 2.7391327680679895e-05, "loss": 0.5023, "step": 1963 }, { "epoch": 2.89515386032799, "grad_norm": 2.103417921143768, "learning_rate": 2.7371624390403116e-05, "loss": 0.6149, "step": 1964 }, { "epoch": 2.896627971254837, "grad_norm": 2.564498773126426, "learning_rate": 2.735191961349835e-05, "loss": 0.5698, "step": 1965 }, { "epoch": 2.8981020821816843, "grad_norm": 2.1554490965458437, "learning_rate": 2.7332213362317328e-05, "loss": 0.5334, "step": 1966 }, { "epoch": 2.8995761931085315, "grad_norm": 2.1718686330139363, "learning_rate": 2.7312505649212722e-05, "loss": 0.4688, "step": 1967 }, { "epoch": 2.901050304035379, "grad_norm": 2.40738740271492, "learning_rate": 2.7292796486538093e-05, "loss": 0.6264, "step": 1968 }, { "epoch": 2.902524414962226, "grad_norm": 2.081839890838199, "learning_rate": 2.727308588664793e-05, "loss": 0.4554, "step": 1969 }, { "epoch": 2.9039985258890733, "grad_norm": 2.5706254484518016, "learning_rate": 2.725337386189761e-05, "loss": 0.5592, "step": 1970 }, { "epoch": 2.9054726368159205, "grad_norm": 2.214804674237806, "learning_rate": 2.723366042464342e-05, "loss": 0.5521, "step": 1971 }, { "epoch": 2.9069467477427677, "grad_norm": 2.2730225908706667, "learning_rate": 2.7213945587242508e-05, "loss": 0.6343, "step": 1972 }, { "epoch": 2.908420858669615, "grad_norm": 2.210280573541657, "learning_rate": 2.7194229362052924e-05, "loss": 0.4998, "step": 1973 }, { "epoch": 2.909894969596462, "grad_norm": 2.448578624638461, "learning_rate": 2.7174511761433585e-05, "loss": 0.7806, "step": 1974 }, { "epoch": 2.9113690805233094, "grad_norm": 2.0271421710809783, "learning_rate": 2.715479279774425e-05, "loss": 0.5058, "step": 1975 }, { "epoch": 2.9128431914501567, "grad_norm": 2.921372967189868, "learning_rate": 2.7135072483345552e-05, "loss": 0.4976, "step": 1976 }, { "epoch": 2.914317302377004, "grad_norm": 2.184231414223348, "learning_rate": 2.7115350830598958e-05, "loss": 0.4947, "step": 1977 }, { "epoch": 2.915791413303851, "grad_norm": 2.259461220010254, "learning_rate": 2.709562785186679e-05, "loss": 0.5505, "step": 1978 }, { "epoch": 2.9172655242306984, "grad_norm": 2.2100825189918076, "learning_rate": 2.7075903559512178e-05, "loss": 0.5601, "step": 1979 }, { "epoch": 2.9187396351575456, "grad_norm": 2.4376645154943075, "learning_rate": 2.7056177965899097e-05, "loss": 0.5549, "step": 1980 }, { "epoch": 2.920213746084393, "grad_norm": 2.344639723701695, "learning_rate": 2.7036451083392332e-05, "loss": 0.6275, "step": 1981 }, { "epoch": 2.92168785701124, "grad_norm": 2.2534842534095927, "learning_rate": 2.701672292435747e-05, "loss": 0.5928, "step": 1982 }, { "epoch": 2.9231619679380874, "grad_norm": 2.3689462907531063, "learning_rate": 2.69969935011609e-05, "loss": 0.6031, "step": 1983 }, { "epoch": 2.9246360788649346, "grad_norm": 2.010034897154963, "learning_rate": 2.6977262826169807e-05, "loss": 0.5303, "step": 1984 }, { "epoch": 2.926110189791782, "grad_norm": 2.12789641061492, "learning_rate": 2.695753091175216e-05, "loss": 0.5125, "step": 1985 }, { "epoch": 2.927584300718629, "grad_norm": 2.373292911788957, "learning_rate": 2.6937797770276702e-05, "loss": 0.6126, "step": 1986 }, { "epoch": 2.9290584116454763, "grad_norm": 2.5364977280666556, "learning_rate": 2.6918063414112942e-05, "loss": 0.4558, "step": 1987 }, { "epoch": 2.9305325225723236, "grad_norm": 2.2284593499823875, "learning_rate": 2.6898327855631155e-05, "loss": 0.6221, "step": 1988 }, { "epoch": 2.932006633499171, "grad_norm": 2.006557205780179, "learning_rate": 2.6878591107202383e-05, "loss": 0.5321, "step": 1989 }, { "epoch": 2.933480744426018, "grad_norm": 2.114483672801747, "learning_rate": 2.685885318119839e-05, "loss": 0.5772, "step": 1990 }, { "epoch": 2.9349548553528653, "grad_norm": 2.3554889937459276, "learning_rate": 2.683911408999169e-05, "loss": 0.6094, "step": 1991 }, { "epoch": 2.9364289662797125, "grad_norm": 2.6639748835768646, "learning_rate": 2.6819373845955527e-05, "loss": 0.7046, "step": 1992 }, { "epoch": 2.9379030772065597, "grad_norm": 2.1663902791119294, "learning_rate": 2.6799632461463862e-05, "loss": 0.542, "step": 1993 }, { "epoch": 2.939377188133407, "grad_norm": 2.268524629522672, "learning_rate": 2.6779889948891384e-05, "loss": 0.6225, "step": 1994 }, { "epoch": 2.940851299060254, "grad_norm": 2.7759761742935334, "learning_rate": 2.676014632061347e-05, "loss": 0.5287, "step": 1995 }, { "epoch": 2.9423254099871015, "grad_norm": 2.8271541720580875, "learning_rate": 2.674040158900622e-05, "loss": 0.5578, "step": 1996 }, { "epoch": 2.9437995209139487, "grad_norm": 2.5541831168992797, "learning_rate": 2.6720655766446412e-05, "loss": 0.5739, "step": 1997 }, { "epoch": 2.945273631840796, "grad_norm": 2.1679002355798116, "learning_rate": 2.6700908865311497e-05, "loss": 0.6205, "step": 1998 }, { "epoch": 2.946747742767643, "grad_norm": 1.9961114274229474, "learning_rate": 2.6681160897979623e-05, "loss": 0.5097, "step": 1999 }, { "epoch": 2.9482218536944904, "grad_norm": 2.0998873362519173, "learning_rate": 2.6661411876829596e-05, "loss": 0.5321, "step": 2000 }, { "epoch": 2.9482218536944904, "eval_bleu": 0.09335297725294989, "eval_bleu_1gram": 0.41463546806489926, "eval_bleu_2gram": 0.17857581838255815, "eval_bleu_3gram": 0.08679246062521935, "eval_bleu_4gram": 0.04509221762981915, "eval_rag_val_loss": 1.0981753053524161, "eval_rouge1": 0.4102120101462044, "eval_rouge2": 0.17363659495175307, "eval_rougeL": 0.39351070263564747, "step": 2000 } ], "logging_steps": 1, "max_steps": 4068, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }