diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,54774 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 7820, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00128, + "grad_norm": 13.502479553222656, + "learning_rate": 0.0, + "loss": 10.9785, + "step": 1 + }, + { + "epoch": 0.00256, + "grad_norm": 13.624500274658203, + "learning_rate": 7.672634271099745e-07, + "loss": 10.98, + "step": 2 + }, + { + "epoch": 0.00384, + "grad_norm": 13.133468627929688, + "learning_rate": 1.534526854219949e-06, + "loss": 10.9409, + "step": 3 + }, + { + "epoch": 0.00512, + "grad_norm": 13.427982330322266, + "learning_rate": 2.301790281329923e-06, + "loss": 10.8518, + "step": 4 + }, + { + "epoch": 0.0064, + "grad_norm": 12.44352912902832, + "learning_rate": 3.069053708439898e-06, + "loss": 10.738, + "step": 5 + }, + { + "epoch": 0.00768, + "grad_norm": 10.861345291137695, + "learning_rate": 3.836317135549872e-06, + "loss": 10.6095, + "step": 6 + }, + { + "epoch": 0.00896, + "grad_norm": 9.260466575622559, + "learning_rate": 4.603580562659846e-06, + "loss": 10.4604, + "step": 7 + }, + { + "epoch": 0.01024, + "grad_norm": 7.927541255950928, + "learning_rate": 5.3708439897698205e-06, + "loss": 10.3281, + "step": 8 + }, + { + "epoch": 0.01152, + "grad_norm": 6.898108959197998, + "learning_rate": 6.138107416879796e-06, + "loss": 10.1985, + "step": 9 + }, + { + "epoch": 0.0128, + "grad_norm": 5.914000988006592, + "learning_rate": 6.905370843989769e-06, + "loss": 10.0951, + "step": 10 + }, + { + "epoch": 0.01408, + "grad_norm": 5.054530143737793, + "learning_rate": 7.672634271099744e-06, + "loss": 10.012, + "step": 11 + }, + { + "epoch": 0.01536, + "grad_norm": 4.351410865783691, + "learning_rate": 8.439897698209718e-06, + "loss": 9.9117, + "step": 12 + }, + { + "epoch": 0.01664, + "grad_norm": 3.752089500427246, + "learning_rate": 9.207161125319692e-06, + "loss": 9.8225, + "step": 13 + }, + { + "epoch": 0.01792, + "grad_norm": 3.252004623413086, + "learning_rate": 9.974424552429668e-06, + "loss": 9.7603, + "step": 14 + }, + { + "epoch": 0.0192, + "grad_norm": 2.8720457553863525, + "learning_rate": 1.0741687979539641e-05, + "loss": 9.7285, + "step": 15 + }, + { + "epoch": 0.02048, + "grad_norm": 2.666605234146118, + "learning_rate": 1.1508951406649615e-05, + "loss": 9.6632, + "step": 16 + }, + { + "epoch": 0.02176, + "grad_norm": 2.497063398361206, + "learning_rate": 1.2276214833759591e-05, + "loss": 9.6141, + "step": 17 + }, + { + "epoch": 0.02304, + "grad_norm": 2.3066694736480713, + "learning_rate": 1.3043478260869564e-05, + "loss": 9.6125, + "step": 18 + }, + { + "epoch": 0.02432, + "grad_norm": 2.267719268798828, + "learning_rate": 1.3810741687979538e-05, + "loss": 9.5742, + "step": 19 + }, + { + "epoch": 0.0256, + "grad_norm": 2.268841505050659, + "learning_rate": 1.4578005115089511e-05, + "loss": 9.5505, + "step": 20 + }, + { + "epoch": 0.02688, + "grad_norm": 2.2032310962677, + "learning_rate": 1.5345268542199487e-05, + "loss": 9.5406, + "step": 21 + }, + { + "epoch": 0.02816, + "grad_norm": 2.1540212631225586, + "learning_rate": 1.611253196930946e-05, + "loss": 9.5205, + "step": 22 + }, + { + "epoch": 0.02944, + "grad_norm": 2.1648716926574707, + "learning_rate": 1.6879795396419436e-05, + "loss": 9.5023, + "step": 23 + }, + { + "epoch": 0.03072, + "grad_norm": 2.1777994632720947, + "learning_rate": 1.764705882352941e-05, + "loss": 9.4566, + "step": 24 + }, + { + "epoch": 0.032, + "grad_norm": 2.093733310699463, + "learning_rate": 1.8414322250639385e-05, + "loss": 9.4797, + "step": 25 + }, + { + "epoch": 0.03328, + "grad_norm": 2.14025616645813, + "learning_rate": 1.918158567774936e-05, + "loss": 9.4326, + "step": 26 + }, + { + "epoch": 0.03456, + "grad_norm": 2.243187665939331, + "learning_rate": 1.9948849104859337e-05, + "loss": 9.3569, + "step": 27 + }, + { + "epoch": 0.03584, + "grad_norm": 2.1062655448913574, + "learning_rate": 2.0716112531969308e-05, + "loss": 9.3849, + "step": 28 + }, + { + "epoch": 0.03712, + "grad_norm": 2.0766758918762207, + "learning_rate": 2.1483375959079282e-05, + "loss": 9.351, + "step": 29 + }, + { + "epoch": 0.0384, + "grad_norm": 2.101067304611206, + "learning_rate": 2.2250639386189256e-05, + "loss": 9.3139, + "step": 30 + }, + { + "epoch": 0.03968, + "grad_norm": 2.0506746768951416, + "learning_rate": 2.301790281329923e-05, + "loss": 9.2755, + "step": 31 + }, + { + "epoch": 0.04096, + "grad_norm": 2.015533924102783, + "learning_rate": 2.3785166240409205e-05, + "loss": 9.2446, + "step": 32 + }, + { + "epoch": 0.04224, + "grad_norm": 1.9934455156326294, + "learning_rate": 2.4552429667519183e-05, + "loss": 9.2283, + "step": 33 + }, + { + "epoch": 0.04352, + "grad_norm": 1.9839218854904175, + "learning_rate": 2.5319693094629154e-05, + "loss": 9.1785, + "step": 34 + }, + { + "epoch": 0.0448, + "grad_norm": 2.0591323375701904, + "learning_rate": 2.6086956521739128e-05, + "loss": 9.1733, + "step": 35 + }, + { + "epoch": 0.04608, + "grad_norm": 1.8989123106002808, + "learning_rate": 2.6854219948849103e-05, + "loss": 9.1775, + "step": 36 + }, + { + "epoch": 0.04736, + "grad_norm": 2.2588517665863037, + "learning_rate": 2.7621483375959077e-05, + "loss": 9.1317, + "step": 37 + }, + { + "epoch": 0.04864, + "grad_norm": 1.869653582572937, + "learning_rate": 2.838874680306905e-05, + "loss": 9.0794, + "step": 38 + }, + { + "epoch": 0.04992, + "grad_norm": 2.098316192626953, + "learning_rate": 2.9156010230179022e-05, + "loss": 9.0853, + "step": 39 + }, + { + "epoch": 0.0512, + "grad_norm": 1.8606550693511963, + "learning_rate": 2.9923273657289e-05, + "loss": 9.0243, + "step": 40 + }, + { + "epoch": 0.05248, + "grad_norm": 1.9760617017745972, + "learning_rate": 3.0690537084398974e-05, + "loss": 8.9874, + "step": 41 + }, + { + "epoch": 0.05376, + "grad_norm": 1.8577723503112793, + "learning_rate": 3.145780051150895e-05, + "loss": 8.9687, + "step": 42 + }, + { + "epoch": 0.05504, + "grad_norm": 1.7823419570922852, + "learning_rate": 3.222506393861892e-05, + "loss": 8.9302, + "step": 43 + }, + { + "epoch": 0.05632, + "grad_norm": 1.8468880653381348, + "learning_rate": 3.29923273657289e-05, + "loss": 8.8693, + "step": 44 + }, + { + "epoch": 0.0576, + "grad_norm": 1.7553874254226685, + "learning_rate": 3.375959079283887e-05, + "loss": 8.8835, + "step": 45 + }, + { + "epoch": 0.05888, + "grad_norm": 1.8048585653305054, + "learning_rate": 3.4526854219948846e-05, + "loss": 8.8015, + "step": 46 + }, + { + "epoch": 0.06016, + "grad_norm": 1.8145036697387695, + "learning_rate": 3.529411764705882e-05, + "loss": 8.7847, + "step": 47 + }, + { + "epoch": 0.06144, + "grad_norm": 1.6603283882141113, + "learning_rate": 3.6061381074168795e-05, + "loss": 8.7738, + "step": 48 + }, + { + "epoch": 0.06272, + "grad_norm": 1.6837772130966187, + "learning_rate": 3.682864450127877e-05, + "loss": 8.7088, + "step": 49 + }, + { + "epoch": 0.064, + "grad_norm": 1.686160683631897, + "learning_rate": 3.7595907928388744e-05, + "loss": 8.6635, + "step": 50 + }, + { + "epoch": 0.06528, + "grad_norm": 1.7713919878005981, + "learning_rate": 3.836317135549872e-05, + "loss": 8.6265, + "step": 51 + }, + { + "epoch": 0.06656, + "grad_norm": 1.8491101264953613, + "learning_rate": 3.913043478260869e-05, + "loss": 8.5384, + "step": 52 + }, + { + "epoch": 0.06784, + "grad_norm": 1.6337801218032837, + "learning_rate": 3.989769820971867e-05, + "loss": 8.5446, + "step": 53 + }, + { + "epoch": 0.06912, + "grad_norm": 1.5614173412322998, + "learning_rate": 4.066496163682864e-05, + "loss": 8.5252, + "step": 54 + }, + { + "epoch": 0.0704, + "grad_norm": 1.5374433994293213, + "learning_rate": 4.1432225063938615e-05, + "loss": 8.4633, + "step": 55 + }, + { + "epoch": 0.07168, + "grad_norm": 1.6636866331100464, + "learning_rate": 4.219948849104859e-05, + "loss": 8.4685, + "step": 56 + }, + { + "epoch": 0.07296, + "grad_norm": 1.6900804042816162, + "learning_rate": 4.2966751918158564e-05, + "loss": 8.4324, + "step": 57 + }, + { + "epoch": 0.07424, + "grad_norm": 1.5862479209899902, + "learning_rate": 4.373401534526854e-05, + "loss": 8.359, + "step": 58 + }, + { + "epoch": 0.07552, + "grad_norm": 1.5928101539611816, + "learning_rate": 4.450127877237851e-05, + "loss": 8.3375, + "step": 59 + }, + { + "epoch": 0.0768, + "grad_norm": 1.5766217708587646, + "learning_rate": 4.526854219948848e-05, + "loss": 8.2851, + "step": 60 + }, + { + "epoch": 0.07808, + "grad_norm": 1.5101016759872437, + "learning_rate": 4.603580562659846e-05, + "loss": 8.2516, + "step": 61 + }, + { + "epoch": 0.07936, + "grad_norm": 1.3688981533050537, + "learning_rate": 4.6803069053708436e-05, + "loss": 8.268, + "step": 62 + }, + { + "epoch": 0.08064, + "grad_norm": 1.4474924802780151, + "learning_rate": 4.757033248081841e-05, + "loss": 8.2099, + "step": 63 + }, + { + "epoch": 0.08192, + "grad_norm": 1.4008480310440063, + "learning_rate": 4.8337595907928385e-05, + "loss": 8.1661, + "step": 64 + }, + { + "epoch": 0.0832, + "grad_norm": 1.4576811790466309, + "learning_rate": 4.9104859335038366e-05, + "loss": 8.1582, + "step": 65 + }, + { + "epoch": 0.08448, + "grad_norm": 1.4062941074371338, + "learning_rate": 4.987212276214833e-05, + "loss": 8.0852, + "step": 66 + }, + { + "epoch": 0.08576, + "grad_norm": 1.3305187225341797, + "learning_rate": 5.063938618925831e-05, + "loss": 8.0746, + "step": 67 + }, + { + "epoch": 0.08704, + "grad_norm": 1.3217337131500244, + "learning_rate": 5.140664961636828e-05, + "loss": 8.0454, + "step": 68 + }, + { + "epoch": 0.08832, + "grad_norm": 1.3334351778030396, + "learning_rate": 5.2173913043478256e-05, + "loss": 7.9983, + "step": 69 + }, + { + "epoch": 0.0896, + "grad_norm": 1.2380847930908203, + "learning_rate": 5.294117647058824e-05, + "loss": 7.9684, + "step": 70 + }, + { + "epoch": 0.09088, + "grad_norm": 1.3053793907165527, + "learning_rate": 5.3708439897698205e-05, + "loss": 7.9135, + "step": 71 + }, + { + "epoch": 0.09216, + "grad_norm": 1.3208932876586914, + "learning_rate": 5.447570332480817e-05, + "loss": 7.9277, + "step": 72 + }, + { + "epoch": 0.09344, + "grad_norm": 1.3776912689208984, + "learning_rate": 5.5242966751918154e-05, + "loss": 7.861, + "step": 73 + }, + { + "epoch": 0.09472, + "grad_norm": 1.1503986120224, + "learning_rate": 5.601023017902813e-05, + "loss": 7.8479, + "step": 74 + }, + { + "epoch": 0.096, + "grad_norm": 1.2197959423065186, + "learning_rate": 5.67774936061381e-05, + "loss": 7.7553, + "step": 75 + }, + { + "epoch": 0.09728, + "grad_norm": 1.0848344564437866, + "learning_rate": 5.754475703324808e-05, + "loss": 7.7622, + "step": 76 + }, + { + "epoch": 0.09856, + "grad_norm": 1.0727125406265259, + "learning_rate": 5.8312020460358044e-05, + "loss": 7.6749, + "step": 77 + }, + { + "epoch": 0.09984, + "grad_norm": 1.0996843576431274, + "learning_rate": 5.9079283887468026e-05, + "loss": 7.7144, + "step": 78 + }, + { + "epoch": 0.10112, + "grad_norm": 1.086287498474121, + "learning_rate": 5.9846547314578e-05, + "loss": 7.644, + "step": 79 + }, + { + "epoch": 0.1024, + "grad_norm": 0.9513853192329407, + "learning_rate": 6.0613810741687974e-05, + "loss": 7.6277, + "step": 80 + }, + { + "epoch": 0.10368, + "grad_norm": 1.0651500225067139, + "learning_rate": 6.138107416879795e-05, + "loss": 7.6083, + "step": 81 + }, + { + "epoch": 0.10496, + "grad_norm": 1.1370134353637695, + "learning_rate": 6.214833759590792e-05, + "loss": 7.5705, + "step": 82 + }, + { + "epoch": 0.10624, + "grad_norm": 1.0704236030578613, + "learning_rate": 6.29156010230179e-05, + "loss": 7.5459, + "step": 83 + }, + { + "epoch": 0.10752, + "grad_norm": 1.046200156211853, + "learning_rate": 6.368286445012787e-05, + "loss": 7.5118, + "step": 84 + }, + { + "epoch": 0.1088, + "grad_norm": 0.8698644638061523, + "learning_rate": 6.445012787723785e-05, + "loss": 7.4862, + "step": 85 + }, + { + "epoch": 0.11008, + "grad_norm": 1.1209050416946411, + "learning_rate": 6.521739130434782e-05, + "loss": 7.485, + "step": 86 + }, + { + "epoch": 0.11136, + "grad_norm": 1.3683184385299683, + "learning_rate": 6.59846547314578e-05, + "loss": 7.4883, + "step": 87 + }, + { + "epoch": 0.11264, + "grad_norm": 0.9742879867553711, + "learning_rate": 6.675191815856777e-05, + "loss": 7.3843, + "step": 88 + }, + { + "epoch": 0.11392, + "grad_norm": 1.3988070487976074, + "learning_rate": 6.751918158567774e-05, + "loss": 7.394, + "step": 89 + }, + { + "epoch": 0.1152, + "grad_norm": 1.1061723232269287, + "learning_rate": 6.828644501278772e-05, + "loss": 7.3124, + "step": 90 + }, + { + "epoch": 0.11648, + "grad_norm": 1.0231846570968628, + "learning_rate": 6.905370843989769e-05, + "loss": 7.3708, + "step": 91 + }, + { + "epoch": 0.11776, + "grad_norm": 1.3509619235992432, + "learning_rate": 6.982097186700767e-05, + "loss": 7.3974, + "step": 92 + }, + { + "epoch": 0.11904, + "grad_norm": 0.8934109807014465, + "learning_rate": 7.058823529411764e-05, + "loss": 7.3336, + "step": 93 + }, + { + "epoch": 0.12032, + "grad_norm": 0.7775417566299438, + "learning_rate": 7.135549872122762e-05, + "loss": 7.3351, + "step": 94 + }, + { + "epoch": 0.1216, + "grad_norm": 1.0462260246276855, + "learning_rate": 7.212276214833759e-05, + "loss": 7.3454, + "step": 95 + }, + { + "epoch": 0.12288, + "grad_norm": 0.9035119414329529, + "learning_rate": 7.289002557544756e-05, + "loss": 7.2667, + "step": 96 + }, + { + "epoch": 0.12416, + "grad_norm": 0.8907987475395203, + "learning_rate": 7.365728900255754e-05, + "loss": 7.2154, + "step": 97 + }, + { + "epoch": 0.12544, + "grad_norm": 0.9043131470680237, + "learning_rate": 7.442455242966751e-05, + "loss": 7.2902, + "step": 98 + }, + { + "epoch": 0.12672, + "grad_norm": 0.9289432764053345, + "learning_rate": 7.519181585677749e-05, + "loss": 7.2135, + "step": 99 + }, + { + "epoch": 0.128, + "grad_norm": 0.6835809350013733, + "learning_rate": 7.595907928388747e-05, + "loss": 7.1967, + "step": 100 + }, + { + "epoch": 0.12928, + "grad_norm": 1.0327857732772827, + "learning_rate": 7.672634271099744e-05, + "loss": 7.1981, + "step": 101 + }, + { + "epoch": 0.13056, + "grad_norm": 0.8814536929130554, + "learning_rate": 7.749360613810741e-05, + "loss": 7.1923, + "step": 102 + }, + { + "epoch": 0.13184, + "grad_norm": 0.8455113172531128, + "learning_rate": 7.826086956521738e-05, + "loss": 7.2318, + "step": 103 + }, + { + "epoch": 0.13312, + "grad_norm": 1.2715808153152466, + "learning_rate": 7.902813299232736e-05, + "loss": 7.1884, + "step": 104 + }, + { + "epoch": 0.1344, + "grad_norm": 0.904243528842926, + "learning_rate": 7.979539641943735e-05, + "loss": 7.1839, + "step": 105 + }, + { + "epoch": 0.13568, + "grad_norm": 0.9426031112670898, + "learning_rate": 8.056265984654731e-05, + "loss": 7.1849, + "step": 106 + }, + { + "epoch": 0.13696, + "grad_norm": 0.7906021475791931, + "learning_rate": 8.132992327365728e-05, + "loss": 7.1946, + "step": 107 + }, + { + "epoch": 0.13824, + "grad_norm": 0.8392283320426941, + "learning_rate": 8.209718670076726e-05, + "loss": 7.1438, + "step": 108 + }, + { + "epoch": 0.13952, + "grad_norm": 0.7050894498825073, + "learning_rate": 8.286445012787723e-05, + "loss": 7.1547, + "step": 109 + }, + { + "epoch": 0.1408, + "grad_norm": 0.8787212371826172, + "learning_rate": 8.363171355498722e-05, + "loss": 7.1193, + "step": 110 + }, + { + "epoch": 0.14208, + "grad_norm": 0.5892102122306824, + "learning_rate": 8.439897698209718e-05, + "loss": 7.0926, + "step": 111 + }, + { + "epoch": 0.14336, + "grad_norm": 1.2569445371627808, + "learning_rate": 8.516624040920715e-05, + "loss": 7.1459, + "step": 112 + }, + { + "epoch": 0.14464, + "grad_norm": 0.6565424799919128, + "learning_rate": 8.593350383631713e-05, + "loss": 7.0536, + "step": 113 + }, + { + "epoch": 0.14592, + "grad_norm": 0.875479519367218, + "learning_rate": 8.670076726342709e-05, + "loss": 7.1119, + "step": 114 + }, + { + "epoch": 0.1472, + "grad_norm": 0.6395849585533142, + "learning_rate": 8.746803069053708e-05, + "loss": 7.0775, + "step": 115 + }, + { + "epoch": 0.14848, + "grad_norm": 0.7110955715179443, + "learning_rate": 8.823529411764705e-05, + "loss": 7.0741, + "step": 116 + }, + { + "epoch": 0.14976, + "grad_norm": 0.6394131779670715, + "learning_rate": 8.900255754475703e-05, + "loss": 7.0442, + "step": 117 + }, + { + "epoch": 0.15104, + "grad_norm": 0.9272814393043518, + "learning_rate": 8.9769820971867e-05, + "loss": 7.0344, + "step": 118 + }, + { + "epoch": 0.15232, + "grad_norm": 0.858862578868866, + "learning_rate": 9.053708439897696e-05, + "loss": 7.0646, + "step": 119 + }, + { + "epoch": 0.1536, + "grad_norm": 1.1841069459915161, + "learning_rate": 9.130434782608695e-05, + "loss": 7.0339, + "step": 120 + }, + { + "epoch": 0.15488, + "grad_norm": 0.9319404363632202, + "learning_rate": 9.207161125319692e-05, + "loss": 7.0008, + "step": 121 + }, + { + "epoch": 0.15616, + "grad_norm": 0.8237971067428589, + "learning_rate": 9.28388746803069e-05, + "loss": 7.0618, + "step": 122 + }, + { + "epoch": 0.15744, + "grad_norm": 1.379276156425476, + "learning_rate": 9.360613810741687e-05, + "loss": 6.9875, + "step": 123 + }, + { + "epoch": 0.15872, + "grad_norm": 0.9294237494468689, + "learning_rate": 9.437340153452683e-05, + "loss": 6.9856, + "step": 124 + }, + { + "epoch": 0.16, + "grad_norm": 0.9710653424263, + "learning_rate": 9.514066496163682e-05, + "loss": 6.9368, + "step": 125 + }, + { + "epoch": 0.16128, + "grad_norm": 1.0589849948883057, + "learning_rate": 9.59079283887468e-05, + "loss": 6.9884, + "step": 126 + }, + { + "epoch": 0.16256, + "grad_norm": 0.6673716902732849, + "learning_rate": 9.667519181585677e-05, + "loss": 6.9719, + "step": 127 + }, + { + "epoch": 0.16384, + "grad_norm": 0.7187034487724304, + "learning_rate": 9.744245524296674e-05, + "loss": 6.9662, + "step": 128 + }, + { + "epoch": 0.16512, + "grad_norm": 0.7513381242752075, + "learning_rate": 9.820971867007673e-05, + "loss": 6.8949, + "step": 129 + }, + { + "epoch": 0.1664, + "grad_norm": 0.6384944319725037, + "learning_rate": 9.897698209718669e-05, + "loss": 6.9296, + "step": 130 + }, + { + "epoch": 0.16768, + "grad_norm": 0.6383824348449707, + "learning_rate": 9.974424552429667e-05, + "loss": 6.956, + "step": 131 + }, + { + "epoch": 0.16896, + "grad_norm": 0.8002307415008545, + "learning_rate": 0.00010051150895140664, + "loss": 6.9135, + "step": 132 + }, + { + "epoch": 0.17024, + "grad_norm": 0.6587149500846863, + "learning_rate": 0.00010127877237851662, + "loss": 6.8886, + "step": 133 + }, + { + "epoch": 0.17152, + "grad_norm": 0.7700603604316711, + "learning_rate": 0.0001020460358056266, + "loss": 6.9318, + "step": 134 + }, + { + "epoch": 0.1728, + "grad_norm": 0.7758870720863342, + "learning_rate": 0.00010281329923273656, + "loss": 6.9266, + "step": 135 + }, + { + "epoch": 0.17408, + "grad_norm": 0.818088948726654, + "learning_rate": 0.00010358056265984654, + "loss": 6.8917, + "step": 136 + }, + { + "epoch": 0.17536, + "grad_norm": 1.1521923542022705, + "learning_rate": 0.00010434782608695651, + "loss": 6.9264, + "step": 137 + }, + { + "epoch": 0.17664, + "grad_norm": 0.7598437070846558, + "learning_rate": 0.00010511508951406649, + "loss": 6.8756, + "step": 138 + }, + { + "epoch": 0.17792, + "grad_norm": 0.8913196921348572, + "learning_rate": 0.00010588235294117647, + "loss": 6.8535, + "step": 139 + }, + { + "epoch": 0.1792, + "grad_norm": 1.0058693885803223, + "learning_rate": 0.00010664961636828644, + "loss": 6.88, + "step": 140 + }, + { + "epoch": 0.18048, + "grad_norm": 0.9338283538818359, + "learning_rate": 0.00010741687979539641, + "loss": 6.8383, + "step": 141 + }, + { + "epoch": 0.18176, + "grad_norm": 0.6229548454284668, + "learning_rate": 0.00010818414322250638, + "loss": 6.7874, + "step": 142 + }, + { + "epoch": 0.18304, + "grad_norm": 0.7639989256858826, + "learning_rate": 0.00010895140664961635, + "loss": 6.7625, + "step": 143 + }, + { + "epoch": 0.18432, + "grad_norm": 0.6291145086288452, + "learning_rate": 0.00010971867007672633, + "loss": 6.7944, + "step": 144 + }, + { + "epoch": 0.1856, + "grad_norm": 0.8313521146774292, + "learning_rate": 0.00011048593350383631, + "loss": 6.859, + "step": 145 + }, + { + "epoch": 0.18688, + "grad_norm": 1.0272287130355835, + "learning_rate": 0.00011125319693094628, + "loss": 6.8568, + "step": 146 + }, + { + "epoch": 0.18816, + "grad_norm": 1.1838618516921997, + "learning_rate": 0.00011202046035805626, + "loss": 6.7789, + "step": 147 + }, + { + "epoch": 0.18944, + "grad_norm": 0.5190697312355042, + "learning_rate": 0.00011278772378516622, + "loss": 6.8148, + "step": 148 + }, + { + "epoch": 0.19072, + "grad_norm": 0.7801781892776489, + "learning_rate": 0.0001135549872122762, + "loss": 6.8162, + "step": 149 + }, + { + "epoch": 0.192, + "grad_norm": 0.8845781087875366, + "learning_rate": 0.00011432225063938618, + "loss": 6.7817, + "step": 150 + }, + { + "epoch": 0.19328, + "grad_norm": 0.7235129475593567, + "learning_rate": 0.00011508951406649615, + "loss": 6.8149, + "step": 151 + }, + { + "epoch": 0.19456, + "grad_norm": 0.827165961265564, + "learning_rate": 0.00011585677749360613, + "loss": 6.6936, + "step": 152 + }, + { + "epoch": 0.19584, + "grad_norm": 0.6322879791259766, + "learning_rate": 0.00011662404092071609, + "loss": 6.7753, + "step": 153 + }, + { + "epoch": 0.19712, + "grad_norm": 0.6085387468338013, + "learning_rate": 0.00011739130434782608, + "loss": 6.7522, + "step": 154 + }, + { + "epoch": 0.1984, + "grad_norm": 0.5771993398666382, + "learning_rate": 0.00011815856777493605, + "loss": 6.7441, + "step": 155 + }, + { + "epoch": 0.19968, + "grad_norm": 0.6479660272598267, + "learning_rate": 0.00011892583120204603, + "loss": 6.795, + "step": 156 + }, + { + "epoch": 0.20096, + "grad_norm": 0.7639020085334778, + "learning_rate": 0.000119693094629156, + "loss": 6.7089, + "step": 157 + }, + { + "epoch": 0.20224, + "grad_norm": 0.7963739037513733, + "learning_rate": 0.00012046035805626599, + "loss": 6.7321, + "step": 158 + }, + { + "epoch": 0.20352, + "grad_norm": 0.8144316077232361, + "learning_rate": 0.00012122762148337595, + "loss": 6.7284, + "step": 159 + }, + { + "epoch": 0.2048, + "grad_norm": 0.8595559000968933, + "learning_rate": 0.00012199488491048592, + "loss": 6.7329, + "step": 160 + }, + { + "epoch": 0.20608, + "grad_norm": 0.8563242554664612, + "learning_rate": 0.0001227621483375959, + "loss": 6.7072, + "step": 161 + }, + { + "epoch": 0.20736, + "grad_norm": 0.9249243140220642, + "learning_rate": 0.00012352941176470587, + "loss": 6.7389, + "step": 162 + }, + { + "epoch": 0.20864, + "grad_norm": 0.7681375741958618, + "learning_rate": 0.00012429667519181585, + "loss": 6.6333, + "step": 163 + }, + { + "epoch": 0.20992, + "grad_norm": 0.8071373105049133, + "learning_rate": 0.00012506393861892582, + "loss": 6.7132, + "step": 164 + }, + { + "epoch": 0.2112, + "grad_norm": 0.7506120204925537, + "learning_rate": 0.0001258312020460358, + "loss": 6.6808, + "step": 165 + }, + { + "epoch": 0.21248, + "grad_norm": 0.8336671590805054, + "learning_rate": 0.00012659846547314577, + "loss": 6.7015, + "step": 166 + }, + { + "epoch": 0.21376, + "grad_norm": 0.9540615677833557, + "learning_rate": 0.00012736572890025574, + "loss": 6.6975, + "step": 167 + }, + { + "epoch": 0.21504, + "grad_norm": 0.8073275089263916, + "learning_rate": 0.00012813299232736572, + "loss": 6.6884, + "step": 168 + }, + { + "epoch": 0.21632, + "grad_norm": 0.6824653148651123, + "learning_rate": 0.0001289002557544757, + "loss": 6.6723, + "step": 169 + }, + { + "epoch": 0.2176, + "grad_norm": 0.5908713340759277, + "learning_rate": 0.00012966751918158567, + "loss": 6.6807, + "step": 170 + }, + { + "epoch": 0.21888, + "grad_norm": 0.7622809410095215, + "learning_rate": 0.00013043478260869564, + "loss": 6.6472, + "step": 171 + }, + { + "epoch": 0.22016, + "grad_norm": 0.6623175740242004, + "learning_rate": 0.00013120204603580562, + "loss": 6.6533, + "step": 172 + }, + { + "epoch": 0.22144, + "grad_norm": 0.7238216400146484, + "learning_rate": 0.0001319693094629156, + "loss": 6.654, + "step": 173 + }, + { + "epoch": 0.22272, + "grad_norm": 0.6670112013816833, + "learning_rate": 0.00013273657289002556, + "loss": 6.6262, + "step": 174 + }, + { + "epoch": 0.224, + "grad_norm": 0.6878064274787903, + "learning_rate": 0.00013350383631713554, + "loss": 6.6101, + "step": 175 + }, + { + "epoch": 0.22528, + "grad_norm": 0.7931588292121887, + "learning_rate": 0.0001342710997442455, + "loss": 6.6616, + "step": 176 + }, + { + "epoch": 0.22656, + "grad_norm": 1.0632978677749634, + "learning_rate": 0.0001350383631713555, + "loss": 6.6203, + "step": 177 + }, + { + "epoch": 0.22784, + "grad_norm": 0.9125176668167114, + "learning_rate": 0.00013580562659846546, + "loss": 6.5786, + "step": 178 + }, + { + "epoch": 0.22912, + "grad_norm": 0.6771340370178223, + "learning_rate": 0.00013657289002557544, + "loss": 6.6114, + "step": 179 + }, + { + "epoch": 0.2304, + "grad_norm": 0.7708578109741211, + "learning_rate": 0.0001373401534526854, + "loss": 6.663, + "step": 180 + }, + { + "epoch": 0.23168, + "grad_norm": 0.7952269315719604, + "learning_rate": 0.00013810741687979538, + "loss": 6.5768, + "step": 181 + }, + { + "epoch": 0.23296, + "grad_norm": 0.9929698705673218, + "learning_rate": 0.00013887468030690536, + "loss": 6.5578, + "step": 182 + }, + { + "epoch": 0.23424, + "grad_norm": 1.016129493713379, + "learning_rate": 0.00013964194373401533, + "loss": 6.6065, + "step": 183 + }, + { + "epoch": 0.23552, + "grad_norm": 1.0715256929397583, + "learning_rate": 0.0001404092071611253, + "loss": 6.6292, + "step": 184 + }, + { + "epoch": 0.2368, + "grad_norm": 1.1549087762832642, + "learning_rate": 0.00014117647058823528, + "loss": 6.5751, + "step": 185 + }, + { + "epoch": 0.23808, + "grad_norm": 0.736225962638855, + "learning_rate": 0.00014194373401534526, + "loss": 6.5302, + "step": 186 + }, + { + "epoch": 0.23936, + "grad_norm": 0.6689443588256836, + "learning_rate": 0.00014271099744245523, + "loss": 6.6393, + "step": 187 + }, + { + "epoch": 0.24064, + "grad_norm": 0.9276636838912964, + "learning_rate": 0.0001434782608695652, + "loss": 6.5304, + "step": 188 + }, + { + "epoch": 0.24192, + "grad_norm": 1.093260645866394, + "learning_rate": 0.00014424552429667518, + "loss": 6.5826, + "step": 189 + }, + { + "epoch": 0.2432, + "grad_norm": 0.9400092959403992, + "learning_rate": 0.00014501278772378515, + "loss": 6.5214, + "step": 190 + }, + { + "epoch": 0.24448, + "grad_norm": 0.9401909708976746, + "learning_rate": 0.00014578005115089513, + "loss": 6.5743, + "step": 191 + }, + { + "epoch": 0.24576, + "grad_norm": 0.8417365550994873, + "learning_rate": 0.0001465473145780051, + "loss": 6.5711, + "step": 192 + }, + { + "epoch": 0.24704, + "grad_norm": 0.8696411848068237, + "learning_rate": 0.00014731457800511508, + "loss": 6.5372, + "step": 193 + }, + { + "epoch": 0.24832, + "grad_norm": 0.696698784828186, + "learning_rate": 0.00014808184143222505, + "loss": 6.5471, + "step": 194 + }, + { + "epoch": 0.2496, + "grad_norm": 0.8627312779426575, + "learning_rate": 0.00014884910485933503, + "loss": 6.495, + "step": 195 + }, + { + "epoch": 0.25088, + "grad_norm": 0.8102883100509644, + "learning_rate": 0.000149616368286445, + "loss": 6.5341, + "step": 196 + }, + { + "epoch": 0.25216, + "grad_norm": 0.8614912629127502, + "learning_rate": 0.00015038363171355497, + "loss": 6.5825, + "step": 197 + }, + { + "epoch": 0.25344, + "grad_norm": 0.6828194260597229, + "learning_rate": 0.00015115089514066495, + "loss": 6.5386, + "step": 198 + }, + { + "epoch": 0.25472, + "grad_norm": 0.6624521613121033, + "learning_rate": 0.00015191815856777495, + "loss": 6.4857, + "step": 199 + }, + { + "epoch": 0.256, + "grad_norm": 0.6489097476005554, + "learning_rate": 0.0001526854219948849, + "loss": 6.5139, + "step": 200 + }, + { + "epoch": 0.25728, + "grad_norm": 0.6855762004852295, + "learning_rate": 0.00015345268542199487, + "loss": 6.4764, + "step": 201 + }, + { + "epoch": 0.25856, + "grad_norm": 0.8213603496551514, + "learning_rate": 0.00015421994884910485, + "loss": 6.5242, + "step": 202 + }, + { + "epoch": 0.25984, + "grad_norm": 0.824782133102417, + "learning_rate": 0.00015498721227621482, + "loss": 6.5478, + "step": 203 + }, + { + "epoch": 0.26112, + "grad_norm": 0.9565497636795044, + "learning_rate": 0.00015575447570332482, + "loss": 6.5082, + "step": 204 + }, + { + "epoch": 0.2624, + "grad_norm": 0.9202224612236023, + "learning_rate": 0.00015652173913043477, + "loss": 6.4114, + "step": 205 + }, + { + "epoch": 0.26368, + "grad_norm": 0.992260217666626, + "learning_rate": 0.00015728900255754474, + "loss": 6.4625, + "step": 206 + }, + { + "epoch": 0.26496, + "grad_norm": 0.8465138077735901, + "learning_rate": 0.00015805626598465472, + "loss": 6.439, + "step": 207 + }, + { + "epoch": 0.26624, + "grad_norm": 0.7151897549629211, + "learning_rate": 0.0001588235294117647, + "loss": 6.4956, + "step": 208 + }, + { + "epoch": 0.26752, + "grad_norm": 0.8685120940208435, + "learning_rate": 0.0001595907928388747, + "loss": 6.4803, + "step": 209 + }, + { + "epoch": 0.2688, + "grad_norm": 0.81340491771698, + "learning_rate": 0.00016035805626598464, + "loss": 6.4193, + "step": 210 + }, + { + "epoch": 0.27008, + "grad_norm": 0.7921631932258606, + "learning_rate": 0.00016112531969309462, + "loss": 6.4279, + "step": 211 + }, + { + "epoch": 0.27136, + "grad_norm": 0.6297151446342468, + "learning_rate": 0.0001618925831202046, + "loss": 6.5347, + "step": 212 + }, + { + "epoch": 0.27264, + "grad_norm": 0.6811320781707764, + "learning_rate": 0.00016265984654731456, + "loss": 6.4414, + "step": 213 + }, + { + "epoch": 0.27392, + "grad_norm": 0.6986665725708008, + "learning_rate": 0.00016342710997442457, + "loss": 6.4284, + "step": 214 + }, + { + "epoch": 0.2752, + "grad_norm": 0.6655412316322327, + "learning_rate": 0.0001641943734015345, + "loss": 6.4398, + "step": 215 + }, + { + "epoch": 0.27648, + "grad_norm": 0.6471274495124817, + "learning_rate": 0.0001649616368286445, + "loss": 6.4229, + "step": 216 + }, + { + "epoch": 0.27776, + "grad_norm": 0.7184582948684692, + "learning_rate": 0.00016572890025575446, + "loss": 6.4255, + "step": 217 + }, + { + "epoch": 0.27904, + "grad_norm": 0.7616591453552246, + "learning_rate": 0.00016649616368286444, + "loss": 6.4472, + "step": 218 + }, + { + "epoch": 0.28032, + "grad_norm": 0.6204221248626709, + "learning_rate": 0.00016726342710997444, + "loss": 6.3404, + "step": 219 + }, + { + "epoch": 0.2816, + "grad_norm": 0.7307862639427185, + "learning_rate": 0.00016803069053708438, + "loss": 6.4221, + "step": 220 + }, + { + "epoch": 0.28288, + "grad_norm": 0.6093372702598572, + "learning_rate": 0.00016879795396419436, + "loss": 6.3769, + "step": 221 + }, + { + "epoch": 0.28416, + "grad_norm": 0.7051405906677246, + "learning_rate": 0.00016956521739130433, + "loss": 6.4473, + "step": 222 + }, + { + "epoch": 0.28544, + "grad_norm": 0.6887394785881042, + "learning_rate": 0.0001703324808184143, + "loss": 6.4528, + "step": 223 + }, + { + "epoch": 0.28672, + "grad_norm": 0.7406412959098816, + "learning_rate": 0.00017109974424552428, + "loss": 6.375, + "step": 224 + }, + { + "epoch": 0.288, + "grad_norm": 0.6635006666183472, + "learning_rate": 0.00017186700767263426, + "loss": 6.4649, + "step": 225 + }, + { + "epoch": 0.28928, + "grad_norm": 0.7995834946632385, + "learning_rate": 0.00017263427109974423, + "loss": 6.3737, + "step": 226 + }, + { + "epoch": 0.29056, + "grad_norm": 0.9514179825782776, + "learning_rate": 0.00017340153452685418, + "loss": 6.3449, + "step": 227 + }, + { + "epoch": 0.29184, + "grad_norm": 1.0529365539550781, + "learning_rate": 0.00017416879795396418, + "loss": 6.3666, + "step": 228 + }, + { + "epoch": 0.29312, + "grad_norm": 1.0295612812042236, + "learning_rate": 0.00017493606138107415, + "loss": 6.4025, + "step": 229 + }, + { + "epoch": 0.2944, + "grad_norm": 1.07936692237854, + "learning_rate": 0.00017570332480818413, + "loss": 6.3812, + "step": 230 + }, + { + "epoch": 0.29568, + "grad_norm": 0.9472998976707458, + "learning_rate": 0.0001764705882352941, + "loss": 6.3867, + "step": 231 + }, + { + "epoch": 0.29696, + "grad_norm": 0.6875970959663391, + "learning_rate": 0.00017723785166240405, + "loss": 6.4047, + "step": 232 + }, + { + "epoch": 0.29824, + "grad_norm": 0.8679647445678711, + "learning_rate": 0.00017800511508951405, + "loss": 6.3743, + "step": 233 + }, + { + "epoch": 0.29952, + "grad_norm": 0.9203290939331055, + "learning_rate": 0.00017877237851662403, + "loss": 6.3606, + "step": 234 + }, + { + "epoch": 0.3008, + "grad_norm": 0.8777357935905457, + "learning_rate": 0.000179539641943734, + "loss": 6.3564, + "step": 235 + }, + { + "epoch": 0.30208, + "grad_norm": 0.8760378956794739, + "learning_rate": 0.00018030690537084397, + "loss": 6.3576, + "step": 236 + }, + { + "epoch": 0.30336, + "grad_norm": 0.7797200083732605, + "learning_rate": 0.00018107416879795392, + "loss": 6.2981, + "step": 237 + }, + { + "epoch": 0.30464, + "grad_norm": 0.7287745475769043, + "learning_rate": 0.00018184143222506392, + "loss": 6.342, + "step": 238 + }, + { + "epoch": 0.30592, + "grad_norm": 0.7142657041549683, + "learning_rate": 0.0001826086956521739, + "loss": 6.4097, + "step": 239 + }, + { + "epoch": 0.3072, + "grad_norm": 0.8947247266769409, + "learning_rate": 0.00018337595907928387, + "loss": 6.4198, + "step": 240 + }, + { + "epoch": 0.30848, + "grad_norm": 0.7610101103782654, + "learning_rate": 0.00018414322250639385, + "loss": 6.3638, + "step": 241 + }, + { + "epoch": 0.30976, + "grad_norm": 0.7212010622024536, + "learning_rate": 0.0001849104859335038, + "loss": 6.3509, + "step": 242 + }, + { + "epoch": 0.31104, + "grad_norm": 0.711169421672821, + "learning_rate": 0.0001856777493606138, + "loss": 6.2656, + "step": 243 + }, + { + "epoch": 0.31232, + "grad_norm": 0.6636462807655334, + "learning_rate": 0.00018644501278772377, + "loss": 6.3338, + "step": 244 + }, + { + "epoch": 0.3136, + "grad_norm": 0.6644899249076843, + "learning_rate": 0.00018721227621483374, + "loss": 6.3888, + "step": 245 + }, + { + "epoch": 0.31488, + "grad_norm": 0.8558899760246277, + "learning_rate": 0.00018797953964194372, + "loss": 6.383, + "step": 246 + }, + { + "epoch": 0.31616, + "grad_norm": 0.8236832618713379, + "learning_rate": 0.00018874680306905366, + "loss": 6.3142, + "step": 247 + }, + { + "epoch": 0.31744, + "grad_norm": 0.6856957674026489, + "learning_rate": 0.00018951406649616367, + "loss": 6.3231, + "step": 248 + }, + { + "epoch": 0.31872, + "grad_norm": 0.8850679993629456, + "learning_rate": 0.00019028132992327364, + "loss": 6.334, + "step": 249 + }, + { + "epoch": 0.32, + "grad_norm": 0.9059402942657471, + "learning_rate": 0.00019104859335038361, + "loss": 6.3826, + "step": 250 + }, + { + "epoch": 0.32128, + "grad_norm": 0.7600975036621094, + "learning_rate": 0.0001918158567774936, + "loss": 6.3067, + "step": 251 + }, + { + "epoch": 0.32256, + "grad_norm": 0.73809814453125, + "learning_rate": 0.00019258312020460354, + "loss": 6.3276, + "step": 252 + }, + { + "epoch": 0.32384, + "grad_norm": 0.7253278493881226, + "learning_rate": 0.00019335038363171354, + "loss": 6.282, + "step": 253 + }, + { + "epoch": 0.32512, + "grad_norm": 0.6514863967895508, + "learning_rate": 0.0001941176470588235, + "loss": 6.2885, + "step": 254 + }, + { + "epoch": 0.3264, + "grad_norm": 0.8405912518501282, + "learning_rate": 0.0001948849104859335, + "loss": 6.3166, + "step": 255 + }, + { + "epoch": 0.32768, + "grad_norm": 1.0162445306777954, + "learning_rate": 0.00019565217391304346, + "loss": 6.2927, + "step": 256 + }, + { + "epoch": 0.32896, + "grad_norm": 1.0011142492294312, + "learning_rate": 0.00019641943734015346, + "loss": 6.2595, + "step": 257 + }, + { + "epoch": 0.33024, + "grad_norm": 0.8047503232955933, + "learning_rate": 0.0001971867007672634, + "loss": 6.333, + "step": 258 + }, + { + "epoch": 0.33152, + "grad_norm": 0.7660624384880066, + "learning_rate": 0.00019795396419437338, + "loss": 6.2664, + "step": 259 + }, + { + "epoch": 0.3328, + "grad_norm": 0.7502520680427551, + "learning_rate": 0.00019872122762148336, + "loss": 6.2721, + "step": 260 + }, + { + "epoch": 0.33408, + "grad_norm": 0.9337821006774902, + "learning_rate": 0.00019948849104859333, + "loss": 6.3069, + "step": 261 + }, + { + "epoch": 0.33536, + "grad_norm": 0.6733538508415222, + "learning_rate": 0.00020025575447570333, + "loss": 6.221, + "step": 262 + }, + { + "epoch": 0.33664, + "grad_norm": 0.7917484045028687, + "learning_rate": 0.00020102301790281328, + "loss": 6.2338, + "step": 263 + }, + { + "epoch": 0.33792, + "grad_norm": 0.7609951496124268, + "learning_rate": 0.00020179028132992326, + "loss": 6.3031, + "step": 264 + }, + { + "epoch": 0.3392, + "grad_norm": 0.6764228940010071, + "learning_rate": 0.00020255754475703323, + "loss": 6.2349, + "step": 265 + }, + { + "epoch": 0.34048, + "grad_norm": 0.7090582847595215, + "learning_rate": 0.0002033248081841432, + "loss": 6.244, + "step": 266 + }, + { + "epoch": 0.34176, + "grad_norm": 0.7496779561042786, + "learning_rate": 0.0002040920716112532, + "loss": 6.2315, + "step": 267 + }, + { + "epoch": 0.34304, + "grad_norm": 0.987930417060852, + "learning_rate": 0.00020485933503836315, + "loss": 6.2223, + "step": 268 + }, + { + "epoch": 0.34432, + "grad_norm": 0.8948729634284973, + "learning_rate": 0.00020562659846547313, + "loss": 6.2722, + "step": 269 + }, + { + "epoch": 0.3456, + "grad_norm": 0.7875823974609375, + "learning_rate": 0.0002063938618925831, + "loss": 6.228, + "step": 270 + }, + { + "epoch": 0.34688, + "grad_norm": 0.7225289940834045, + "learning_rate": 0.00020716112531969308, + "loss": 6.1819, + "step": 271 + }, + { + "epoch": 0.34816, + "grad_norm": 0.9579319953918457, + "learning_rate": 0.00020792838874680308, + "loss": 6.2365, + "step": 272 + }, + { + "epoch": 0.34944, + "grad_norm": 1.02460777759552, + "learning_rate": 0.00020869565217391303, + "loss": 6.2907, + "step": 273 + }, + { + "epoch": 0.35072, + "grad_norm": 1.275744915008545, + "learning_rate": 0.000209462915601023, + "loss": 6.259, + "step": 274 + }, + { + "epoch": 0.352, + "grad_norm": 0.8093612194061279, + "learning_rate": 0.00021023017902813297, + "loss": 6.1701, + "step": 275 + }, + { + "epoch": 0.35328, + "grad_norm": 0.9731954336166382, + "learning_rate": 0.00021099744245524295, + "loss": 6.259, + "step": 276 + }, + { + "epoch": 0.35456, + "grad_norm": 1.0986355543136597, + "learning_rate": 0.00021176470588235295, + "loss": 6.1786, + "step": 277 + }, + { + "epoch": 0.35584, + "grad_norm": 1.1325825452804565, + "learning_rate": 0.0002125319693094629, + "loss": 6.253, + "step": 278 + }, + { + "epoch": 0.35712, + "grad_norm": 0.7239522337913513, + "learning_rate": 0.00021329923273657287, + "loss": 6.2677, + "step": 279 + }, + { + "epoch": 0.3584, + "grad_norm": 0.9211587905883789, + "learning_rate": 0.00021406649616368285, + "loss": 6.1859, + "step": 280 + }, + { + "epoch": 0.35968, + "grad_norm": 0.8542487621307373, + "learning_rate": 0.00021483375959079282, + "loss": 6.1534, + "step": 281 + }, + { + "epoch": 0.36096, + "grad_norm": 0.7646416425704956, + "learning_rate": 0.0002156010230179028, + "loss": 6.1181, + "step": 282 + }, + { + "epoch": 0.36224, + "grad_norm": 0.7755529880523682, + "learning_rate": 0.00021636828644501277, + "loss": 6.1676, + "step": 283 + }, + { + "epoch": 0.36352, + "grad_norm": 0.6631358861923218, + "learning_rate": 0.00021713554987212274, + "loss": 6.2041, + "step": 284 + }, + { + "epoch": 0.3648, + "grad_norm": 0.676986038684845, + "learning_rate": 0.0002179028132992327, + "loss": 6.178, + "step": 285 + }, + { + "epoch": 0.36608, + "grad_norm": 0.7331904172897339, + "learning_rate": 0.0002186700767263427, + "loss": 6.1775, + "step": 286 + }, + { + "epoch": 0.36736, + "grad_norm": 0.7227631211280823, + "learning_rate": 0.00021943734015345267, + "loss": 6.1277, + "step": 287 + }, + { + "epoch": 0.36864, + "grad_norm": 0.7674806118011475, + "learning_rate": 0.00022020460358056264, + "loss": 6.1654, + "step": 288 + }, + { + "epoch": 0.36992, + "grad_norm": 0.7369276881217957, + "learning_rate": 0.00022097186700767261, + "loss": 6.1768, + "step": 289 + }, + { + "epoch": 0.3712, + "grad_norm": 0.7614607214927673, + "learning_rate": 0.00022173913043478256, + "loss": 6.175, + "step": 290 + }, + { + "epoch": 0.37248, + "grad_norm": 0.6262795329093933, + "learning_rate": 0.00022250639386189256, + "loss": 6.1554, + "step": 291 + }, + { + "epoch": 0.37376, + "grad_norm": 0.7323906421661377, + "learning_rate": 0.00022327365728900254, + "loss": 6.1748, + "step": 292 + }, + { + "epoch": 0.37504, + "grad_norm": 0.6152296662330627, + "learning_rate": 0.0002240409207161125, + "loss": 6.1391, + "step": 293 + }, + { + "epoch": 0.37632, + "grad_norm": 0.672359824180603, + "learning_rate": 0.0002248081841432225, + "loss": 6.1529, + "step": 294 + }, + { + "epoch": 0.3776, + "grad_norm": 0.8039406538009644, + "learning_rate": 0.00022557544757033243, + "loss": 6.1391, + "step": 295 + }, + { + "epoch": 0.37888, + "grad_norm": 0.9391714334487915, + "learning_rate": 0.00022634271099744244, + "loss": 6.1234, + "step": 296 + }, + { + "epoch": 0.38016, + "grad_norm": 0.6732305884361267, + "learning_rate": 0.0002271099744245524, + "loss": 6.1335, + "step": 297 + }, + { + "epoch": 0.38144, + "grad_norm": 0.8230670094490051, + "learning_rate": 0.00022787723785166238, + "loss": 6.136, + "step": 298 + }, + { + "epoch": 0.38272, + "grad_norm": 0.5934028625488281, + "learning_rate": 0.00022864450127877236, + "loss": 6.1398, + "step": 299 + }, + { + "epoch": 0.384, + "grad_norm": 0.8017310500144958, + "learning_rate": 0.0002294117647058823, + "loss": 6.1337, + "step": 300 + }, + { + "epoch": 0.38528, + "grad_norm": 0.7498769760131836, + "learning_rate": 0.0002301790281329923, + "loss": 6.1117, + "step": 301 + }, + { + "epoch": 0.38656, + "grad_norm": 0.727734386920929, + "learning_rate": 0.00023094629156010228, + "loss": 6.1219, + "step": 302 + }, + { + "epoch": 0.38784, + "grad_norm": 0.7669496536254883, + "learning_rate": 0.00023171355498721226, + "loss": 6.1272, + "step": 303 + }, + { + "epoch": 0.38912, + "grad_norm": 0.7476457357406616, + "learning_rate": 0.00023248081841432223, + "loss": 6.0301, + "step": 304 + }, + { + "epoch": 0.3904, + "grad_norm": 0.7540501952171326, + "learning_rate": 0.00023324808184143218, + "loss": 6.1143, + "step": 305 + }, + { + "epoch": 0.39168, + "grad_norm": 1.032804012298584, + "learning_rate": 0.00023401534526854218, + "loss": 6.1371, + "step": 306 + }, + { + "epoch": 0.39296, + "grad_norm": 1.1408971548080444, + "learning_rate": 0.00023478260869565215, + "loss": 6.0916, + "step": 307 + }, + { + "epoch": 0.39424, + "grad_norm": 1.027319312095642, + "learning_rate": 0.00023554987212276213, + "loss": 6.1407, + "step": 308 + }, + { + "epoch": 0.39552, + "grad_norm": 0.8574281930923462, + "learning_rate": 0.0002363171355498721, + "loss": 6.0626, + "step": 309 + }, + { + "epoch": 0.3968, + "grad_norm": 0.937127411365509, + "learning_rate": 0.0002370843989769821, + "loss": 6.1034, + "step": 310 + }, + { + "epoch": 0.39808, + "grad_norm": 0.7400676012039185, + "learning_rate": 0.00023785166240409205, + "loss": 6.0996, + "step": 311 + }, + { + "epoch": 0.39936, + "grad_norm": 0.7361345291137695, + "learning_rate": 0.00023861892583120203, + "loss": 6.047, + "step": 312 + }, + { + "epoch": 0.40064, + "grad_norm": 0.7439408898353577, + "learning_rate": 0.000239386189258312, + "loss": 6.0906, + "step": 313 + }, + { + "epoch": 0.40192, + "grad_norm": 0.7513951659202576, + "learning_rate": 0.00024015345268542197, + "loss": 6.0956, + "step": 314 + }, + { + "epoch": 0.4032, + "grad_norm": 0.7911155223846436, + "learning_rate": 0.00024092071611253198, + "loss": 6.0411, + "step": 315 + }, + { + "epoch": 0.40448, + "grad_norm": 0.7218300700187683, + "learning_rate": 0.00024168797953964192, + "loss": 6.1029, + "step": 316 + }, + { + "epoch": 0.40576, + "grad_norm": 0.8403254151344299, + "learning_rate": 0.0002424552429667519, + "loss": 6.0944, + "step": 317 + }, + { + "epoch": 0.40704, + "grad_norm": 0.676937460899353, + "learning_rate": 0.00024322250639386187, + "loss": 6.037, + "step": 318 + }, + { + "epoch": 0.40832, + "grad_norm": 0.8323513865470886, + "learning_rate": 0.00024398976982097185, + "loss": 6.0856, + "step": 319 + }, + { + "epoch": 0.4096, + "grad_norm": 0.9101656675338745, + "learning_rate": 0.0002447570332480818, + "loss": 6.089, + "step": 320 + }, + { + "epoch": 0.41088, + "grad_norm": 0.7529237270355225, + "learning_rate": 0.0002455242966751918, + "loss": 6.0525, + "step": 321 + }, + { + "epoch": 0.41216, + "grad_norm": 0.6977245211601257, + "learning_rate": 0.00024629156010230177, + "loss": 6.0561, + "step": 322 + }, + { + "epoch": 0.41344, + "grad_norm": 0.9390827417373657, + "learning_rate": 0.00024705882352941174, + "loss": 6.0536, + "step": 323 + }, + { + "epoch": 0.41472, + "grad_norm": 0.895576536655426, + "learning_rate": 0.0002478260869565217, + "loss": 6.0667, + "step": 324 + }, + { + "epoch": 0.416, + "grad_norm": 0.9296811819076538, + "learning_rate": 0.0002485933503836317, + "loss": 6.0086, + "step": 325 + }, + { + "epoch": 0.41728, + "grad_norm": 0.9841163754463196, + "learning_rate": 0.00024936061381074167, + "loss": 6.0587, + "step": 326 + }, + { + "epoch": 0.41856, + "grad_norm": 0.7630248665809631, + "learning_rate": 0.00025012787723785164, + "loss": 5.9773, + "step": 327 + }, + { + "epoch": 0.41984, + "grad_norm": 0.8875618577003479, + "learning_rate": 0.0002508951406649616, + "loss": 6.0148, + "step": 328 + }, + { + "epoch": 0.42112, + "grad_norm": 0.8923934102058411, + "learning_rate": 0.0002516624040920716, + "loss": 6.0467, + "step": 329 + }, + { + "epoch": 0.4224, + "grad_norm": 0.8564996719360352, + "learning_rate": 0.00025242966751918156, + "loss": 6.0135, + "step": 330 + }, + { + "epoch": 0.42368, + "grad_norm": 0.803805410861969, + "learning_rate": 0.00025319693094629154, + "loss": 5.9922, + "step": 331 + }, + { + "epoch": 0.42496, + "grad_norm": 0.6739373803138733, + "learning_rate": 0.0002539641943734015, + "loss": 6.0465, + "step": 332 + }, + { + "epoch": 0.42624, + "grad_norm": 0.7415125966072083, + "learning_rate": 0.0002547314578005115, + "loss": 6.0081, + "step": 333 + }, + { + "epoch": 0.42752, + "grad_norm": 0.6618335247039795, + "learning_rate": 0.00025549872122762146, + "loss": 5.9794, + "step": 334 + }, + { + "epoch": 0.4288, + "grad_norm": 0.7585832476615906, + "learning_rate": 0.00025626598465473144, + "loss": 5.9738, + "step": 335 + }, + { + "epoch": 0.43008, + "grad_norm": 0.7203150987625122, + "learning_rate": 0.0002570332480818414, + "loss": 6.0329, + "step": 336 + }, + { + "epoch": 0.43136, + "grad_norm": 0.7902894020080566, + "learning_rate": 0.0002578005115089514, + "loss": 5.9299, + "step": 337 + }, + { + "epoch": 0.43264, + "grad_norm": 0.725581705570221, + "learning_rate": 0.00025856777493606136, + "loss": 5.9855, + "step": 338 + }, + { + "epoch": 0.43392, + "grad_norm": 0.8299371004104614, + "learning_rate": 0.00025933503836317133, + "loss": 6.0628, + "step": 339 + }, + { + "epoch": 0.4352, + "grad_norm": 0.7858306765556335, + "learning_rate": 0.0002601023017902813, + "loss": 6.0055, + "step": 340 + }, + { + "epoch": 0.43648, + "grad_norm": 0.8693034052848816, + "learning_rate": 0.0002608695652173913, + "loss": 6.032, + "step": 341 + }, + { + "epoch": 0.43776, + "grad_norm": 0.8078804612159729, + "learning_rate": 0.00026163682864450126, + "loss": 5.9117, + "step": 342 + }, + { + "epoch": 0.43904, + "grad_norm": 0.6977505683898926, + "learning_rate": 0.00026240409207161123, + "loss": 6.0052, + "step": 343 + }, + { + "epoch": 0.44032, + "grad_norm": 0.7459436058998108, + "learning_rate": 0.0002631713554987212, + "loss": 6.0064, + "step": 344 + }, + { + "epoch": 0.4416, + "grad_norm": 0.8525255918502808, + "learning_rate": 0.0002639386189258312, + "loss": 5.9702, + "step": 345 + }, + { + "epoch": 0.44288, + "grad_norm": 0.8468485474586487, + "learning_rate": 0.00026470588235294115, + "loss": 6.0015, + "step": 346 + }, + { + "epoch": 0.44416, + "grad_norm": 0.8215516805648804, + "learning_rate": 0.00026547314578005113, + "loss": 5.9744, + "step": 347 + }, + { + "epoch": 0.44544, + "grad_norm": 0.7234410643577576, + "learning_rate": 0.0002662404092071611, + "loss": 5.9851, + "step": 348 + }, + { + "epoch": 0.44672, + "grad_norm": 0.7426914572715759, + "learning_rate": 0.0002670076726342711, + "loss": 5.9737, + "step": 349 + }, + { + "epoch": 0.448, + "grad_norm": 0.7684586048126221, + "learning_rate": 0.00026777493606138105, + "loss": 5.9324, + "step": 350 + }, + { + "epoch": 0.44928, + "grad_norm": 0.8180778622627258, + "learning_rate": 0.000268542199488491, + "loss": 5.8851, + "step": 351 + }, + { + "epoch": 0.45056, + "grad_norm": 0.7742370367050171, + "learning_rate": 0.000269309462915601, + "loss": 5.9474, + "step": 352 + }, + { + "epoch": 0.45184, + "grad_norm": 0.9160585403442383, + "learning_rate": 0.000270076726342711, + "loss": 5.958, + "step": 353 + }, + { + "epoch": 0.45312, + "grad_norm": 0.9427086710929871, + "learning_rate": 0.00027084398976982095, + "loss": 5.9811, + "step": 354 + }, + { + "epoch": 0.4544, + "grad_norm": 1.0058586597442627, + "learning_rate": 0.0002716112531969309, + "loss": 5.9625, + "step": 355 + }, + { + "epoch": 0.45568, + "grad_norm": 1.0322132110595703, + "learning_rate": 0.0002723785166240409, + "loss": 5.8985, + "step": 356 + }, + { + "epoch": 0.45696, + "grad_norm": 0.8881152868270874, + "learning_rate": 0.00027314578005115087, + "loss": 6.0119, + "step": 357 + }, + { + "epoch": 0.45824, + "grad_norm": 0.7439785599708557, + "learning_rate": 0.00027391304347826085, + "loss": 5.9378, + "step": 358 + }, + { + "epoch": 0.45952, + "grad_norm": 0.8855685591697693, + "learning_rate": 0.0002746803069053708, + "loss": 5.9244, + "step": 359 + }, + { + "epoch": 0.4608, + "grad_norm": 0.6977657079696655, + "learning_rate": 0.0002754475703324808, + "loss": 5.9455, + "step": 360 + }, + { + "epoch": 0.46208, + "grad_norm": 0.6967872381210327, + "learning_rate": 0.00027621483375959077, + "loss": 5.9463, + "step": 361 + }, + { + "epoch": 0.46336, + "grad_norm": 0.6706664562225342, + "learning_rate": 0.00027698209718670074, + "loss": 5.9024, + "step": 362 + }, + { + "epoch": 0.46464, + "grad_norm": 1.084336757659912, + "learning_rate": 0.0002777493606138107, + "loss": 5.864, + "step": 363 + }, + { + "epoch": 0.46592, + "grad_norm": 1.0996636152267456, + "learning_rate": 0.0002785166240409207, + "loss": 5.9009, + "step": 364 + }, + { + "epoch": 0.4672, + "grad_norm": 0.9186033606529236, + "learning_rate": 0.00027928388746803067, + "loss": 5.8365, + "step": 365 + }, + { + "epoch": 0.46848, + "grad_norm": 0.8409336805343628, + "learning_rate": 0.00028005115089514064, + "loss": 5.908, + "step": 366 + }, + { + "epoch": 0.46976, + "grad_norm": 0.8734238147735596, + "learning_rate": 0.0002808184143222506, + "loss": 5.8922, + "step": 367 + }, + { + "epoch": 0.47104, + "grad_norm": 1.0424097776412964, + "learning_rate": 0.0002815856777493606, + "loss": 5.975, + "step": 368 + }, + { + "epoch": 0.47232, + "grad_norm": 1.0325592756271362, + "learning_rate": 0.00028235294117647056, + "loss": 5.8882, + "step": 369 + }, + { + "epoch": 0.4736, + "grad_norm": 0.8004043102264404, + "learning_rate": 0.00028312020460358054, + "loss": 5.8663, + "step": 370 + }, + { + "epoch": 0.47488, + "grad_norm": 0.6563780307769775, + "learning_rate": 0.0002838874680306905, + "loss": 5.8704, + "step": 371 + }, + { + "epoch": 0.47616, + "grad_norm": 0.7133100032806396, + "learning_rate": 0.0002846547314578005, + "loss": 5.8959, + "step": 372 + }, + { + "epoch": 0.47744, + "grad_norm": 0.656963050365448, + "learning_rate": 0.00028542199488491046, + "loss": 5.866, + "step": 373 + }, + { + "epoch": 0.47872, + "grad_norm": 0.761816143989563, + "learning_rate": 0.00028618925831202044, + "loss": 5.8776, + "step": 374 + }, + { + "epoch": 0.48, + "grad_norm": 0.900486171245575, + "learning_rate": 0.0002869565217391304, + "loss": 5.9083, + "step": 375 + }, + { + "epoch": 0.48128, + "grad_norm": 1.0122328996658325, + "learning_rate": 0.0002877237851662404, + "loss": 5.8945, + "step": 376 + }, + { + "epoch": 0.48256, + "grad_norm": 0.9674193859100342, + "learning_rate": 0.00028849104859335036, + "loss": 5.8972, + "step": 377 + }, + { + "epoch": 0.48384, + "grad_norm": 0.7003923058509827, + "learning_rate": 0.00028925831202046033, + "loss": 5.8883, + "step": 378 + }, + { + "epoch": 0.48512, + "grad_norm": 0.7299737930297852, + "learning_rate": 0.0002900255754475703, + "loss": 5.8642, + "step": 379 + }, + { + "epoch": 0.4864, + "grad_norm": 0.68450927734375, + "learning_rate": 0.0002907928388746803, + "loss": 5.8051, + "step": 380 + }, + { + "epoch": 0.48768, + "grad_norm": 0.744691789150238, + "learning_rate": 0.00029156010230179026, + "loss": 5.8283, + "step": 381 + }, + { + "epoch": 0.48896, + "grad_norm": 0.8168903589248657, + "learning_rate": 0.00029232736572890023, + "loss": 5.8316, + "step": 382 + }, + { + "epoch": 0.49024, + "grad_norm": 1.2333744764328003, + "learning_rate": 0.0002930946291560102, + "loss": 5.8244, + "step": 383 + }, + { + "epoch": 0.49152, + "grad_norm": 0.9197435975074768, + "learning_rate": 0.0002938618925831202, + "loss": 5.8464, + "step": 384 + }, + { + "epoch": 0.4928, + "grad_norm": 1.0070711374282837, + "learning_rate": 0.00029462915601023015, + "loss": 5.8809, + "step": 385 + }, + { + "epoch": 0.49408, + "grad_norm": 0.9315182566642761, + "learning_rate": 0.00029539641943734013, + "loss": 5.8207, + "step": 386 + }, + { + "epoch": 0.49536, + "grad_norm": 0.7878130078315735, + "learning_rate": 0.0002961636828644501, + "loss": 5.8888, + "step": 387 + }, + { + "epoch": 0.49664, + "grad_norm": 0.8333479762077332, + "learning_rate": 0.0002969309462915601, + "loss": 5.8616, + "step": 388 + }, + { + "epoch": 0.49792, + "grad_norm": 0.8158892393112183, + "learning_rate": 0.00029769820971867005, + "loss": 5.8188, + "step": 389 + }, + { + "epoch": 0.4992, + "grad_norm": 0.6871780157089233, + "learning_rate": 0.00029846547314578, + "loss": 5.8525, + "step": 390 + }, + { + "epoch": 0.50048, + "grad_norm": 0.7237544059753418, + "learning_rate": 0.00029923273657289, + "loss": 5.8758, + "step": 391 + }, + { + "epoch": 0.50176, + "grad_norm": 0.6453325152397156, + "learning_rate": 0.0003, + "loss": 5.7895, + "step": 392 + }, + { + "epoch": 0.50304, + "grad_norm": 0.5974975824356079, + "learning_rate": 0.0002999596177143626, + "loss": 5.8014, + "step": 393 + }, + { + "epoch": 0.50432, + "grad_norm": 0.6222047209739685, + "learning_rate": 0.00029991923542872523, + "loss": 5.7825, + "step": 394 + }, + { + "epoch": 0.5056, + "grad_norm": 0.6509471535682678, + "learning_rate": 0.00029987885314308786, + "loss": 5.806, + "step": 395 + }, + { + "epoch": 0.50688, + "grad_norm": 0.6112677454948425, + "learning_rate": 0.0002998384708574505, + "loss": 5.7888, + "step": 396 + }, + { + "epoch": 0.50816, + "grad_norm": 0.67631596326828, + "learning_rate": 0.0002997980885718131, + "loss": 5.7826, + "step": 397 + }, + { + "epoch": 0.50944, + "grad_norm": 0.6522372961044312, + "learning_rate": 0.00029975770628617575, + "loss": 5.7987, + "step": 398 + }, + { + "epoch": 0.51072, + "grad_norm": 0.5814371109008789, + "learning_rate": 0.00029971732400053844, + "loss": 5.7732, + "step": 399 + }, + { + "epoch": 0.512, + "grad_norm": 0.7316078543663025, + "learning_rate": 0.00029967694171490107, + "loss": 5.7616, + "step": 400 + }, + { + "epoch": 0.51328, + "grad_norm": 0.817568302154541, + "learning_rate": 0.00029963655942926364, + "loss": 5.8326, + "step": 401 + }, + { + "epoch": 0.51456, + "grad_norm": 0.6369678378105164, + "learning_rate": 0.0002995961771436263, + "loss": 5.7762, + "step": 402 + }, + { + "epoch": 0.51584, + "grad_norm": 0.6471033096313477, + "learning_rate": 0.00029955579485798896, + "loss": 5.7027, + "step": 403 + }, + { + "epoch": 0.51712, + "grad_norm": 0.6128319501876831, + "learning_rate": 0.0002995154125723516, + "loss": 5.754, + "step": 404 + }, + { + "epoch": 0.5184, + "grad_norm": 0.7475748062133789, + "learning_rate": 0.0002994750302867142, + "loss": 5.7301, + "step": 405 + }, + { + "epoch": 0.51968, + "grad_norm": 0.6675015091896057, + "learning_rate": 0.00029943464800107685, + "loss": 5.7793, + "step": 406 + }, + { + "epoch": 0.52096, + "grad_norm": 0.640458345413208, + "learning_rate": 0.0002993942657154395, + "loss": 5.735, + "step": 407 + }, + { + "epoch": 0.52224, + "grad_norm": 0.6345043778419495, + "learning_rate": 0.0002993538834298021, + "loss": 5.6872, + "step": 408 + }, + { + "epoch": 0.52352, + "grad_norm": 0.6826564073562622, + "learning_rate": 0.00029931350114416474, + "loss": 5.691, + "step": 409 + }, + { + "epoch": 0.5248, + "grad_norm": 0.576991856098175, + "learning_rate": 0.00029927311885852737, + "loss": 5.7254, + "step": 410 + }, + { + "epoch": 0.52608, + "grad_norm": 0.572139322757721, + "learning_rate": 0.00029923273657289, + "loss": 5.7618, + "step": 411 + }, + { + "epoch": 0.52736, + "grad_norm": 0.7463775277137756, + "learning_rate": 0.00029919235428725263, + "loss": 5.7217, + "step": 412 + }, + { + "epoch": 0.52864, + "grad_norm": 0.7591479420661926, + "learning_rate": 0.00029915197200161526, + "loss": 5.745, + "step": 413 + }, + { + "epoch": 0.52992, + "grad_norm": 0.7695474624633789, + "learning_rate": 0.0002991115897159779, + "loss": 5.7294, + "step": 414 + }, + { + "epoch": 0.5312, + "grad_norm": 0.704517126083374, + "learning_rate": 0.0002990712074303405, + "loss": 5.7123, + "step": 415 + }, + { + "epoch": 0.53248, + "grad_norm": 0.7229870557785034, + "learning_rate": 0.0002990308251447032, + "loss": 5.6996, + "step": 416 + }, + { + "epoch": 0.53376, + "grad_norm": 0.7659012079238892, + "learning_rate": 0.0002989904428590658, + "loss": 5.7188, + "step": 417 + }, + { + "epoch": 0.53504, + "grad_norm": 0.8571960926055908, + "learning_rate": 0.0002989500605734284, + "loss": 5.7277, + "step": 418 + }, + { + "epoch": 0.53632, + "grad_norm": 0.9707953929901123, + "learning_rate": 0.00029890967828779104, + "loss": 5.668, + "step": 419 + }, + { + "epoch": 0.5376, + "grad_norm": 0.9887204766273499, + "learning_rate": 0.00029886929600215367, + "loss": 5.7147, + "step": 420 + }, + { + "epoch": 0.53888, + "grad_norm": 0.8890637159347534, + "learning_rate": 0.00029882891371651635, + "loss": 5.6983, + "step": 421 + }, + { + "epoch": 0.54016, + "grad_norm": 0.9526740908622742, + "learning_rate": 0.000298788531430879, + "loss": 5.7069, + "step": 422 + }, + { + "epoch": 0.54144, + "grad_norm": 1.1278778314590454, + "learning_rate": 0.0002987481491452416, + "loss": 5.6393, + "step": 423 + }, + { + "epoch": 0.54272, + "grad_norm": 1.0044233798980713, + "learning_rate": 0.0002987077668596042, + "loss": 5.7143, + "step": 424 + }, + { + "epoch": 0.544, + "grad_norm": 0.9827817678451538, + "learning_rate": 0.0002986673845739669, + "loss": 5.7981, + "step": 425 + }, + { + "epoch": 0.54528, + "grad_norm": 0.8233329057693481, + "learning_rate": 0.0002986270022883295, + "loss": 5.7221, + "step": 426 + }, + { + "epoch": 0.54656, + "grad_norm": 0.7597877383232117, + "learning_rate": 0.00029858662000269213, + "loss": 5.698, + "step": 427 + }, + { + "epoch": 0.54784, + "grad_norm": 0.8371834754943848, + "learning_rate": 0.00029854623771705476, + "loss": 5.6721, + "step": 428 + }, + { + "epoch": 0.54912, + "grad_norm": 0.6564275026321411, + "learning_rate": 0.0002985058554314174, + "loss": 5.6973, + "step": 429 + }, + { + "epoch": 0.5504, + "grad_norm": 0.7076083421707153, + "learning_rate": 0.00029846547314578, + "loss": 5.6448, + "step": 430 + }, + { + "epoch": 0.55168, + "grad_norm": 0.7080224752426147, + "learning_rate": 0.00029842509086014266, + "loss": 5.6874, + "step": 431 + }, + { + "epoch": 0.55296, + "grad_norm": 0.792473554611206, + "learning_rate": 0.0002983847085745053, + "loss": 5.7225, + "step": 432 + }, + { + "epoch": 0.55424, + "grad_norm": 1.0753412246704102, + "learning_rate": 0.0002983443262888679, + "loss": 5.6673, + "step": 433 + }, + { + "epoch": 0.55552, + "grad_norm": 1.0193465948104858, + "learning_rate": 0.00029830394400323055, + "loss": 5.7159, + "step": 434 + }, + { + "epoch": 0.5568, + "grad_norm": 1.0127590894699097, + "learning_rate": 0.0002982635617175932, + "loss": 5.6806, + "step": 435 + }, + { + "epoch": 0.55808, + "grad_norm": 1.2227166891098022, + "learning_rate": 0.0002982231794319558, + "loss": 5.66, + "step": 436 + }, + { + "epoch": 0.55936, + "grad_norm": 0.7891542315483093, + "learning_rate": 0.00029818279714631844, + "loss": 5.6076, + "step": 437 + }, + { + "epoch": 0.56064, + "grad_norm": 0.7889094948768616, + "learning_rate": 0.0002981424148606811, + "loss": 5.6662, + "step": 438 + }, + { + "epoch": 0.56192, + "grad_norm": 0.7303580045700073, + "learning_rate": 0.00029810203257504375, + "loss": 5.6123, + "step": 439 + }, + { + "epoch": 0.5632, + "grad_norm": 0.7027185559272766, + "learning_rate": 0.0002980616502894063, + "loss": 5.6627, + "step": 440 + }, + { + "epoch": 0.56448, + "grad_norm": 0.6738660931587219, + "learning_rate": 0.00029802126800376896, + "loss": 5.6118, + "step": 441 + }, + { + "epoch": 0.56576, + "grad_norm": 0.7443022727966309, + "learning_rate": 0.00029798088571813164, + "loss": 5.629, + "step": 442 + }, + { + "epoch": 0.56704, + "grad_norm": 0.685570478439331, + "learning_rate": 0.00029794050343249427, + "loss": 5.6223, + "step": 443 + }, + { + "epoch": 0.56832, + "grad_norm": 0.672305703163147, + "learning_rate": 0.0002979001211468569, + "loss": 5.6039, + "step": 444 + }, + { + "epoch": 0.5696, + "grad_norm": 0.7415111064910889, + "learning_rate": 0.00029785973886121953, + "loss": 5.6167, + "step": 445 + }, + { + "epoch": 0.57088, + "grad_norm": 0.7740527987480164, + "learning_rate": 0.00029781935657558216, + "loss": 5.5913, + "step": 446 + }, + { + "epoch": 0.57216, + "grad_norm": 0.7420479655265808, + "learning_rate": 0.0002977789742899448, + "loss": 5.6269, + "step": 447 + }, + { + "epoch": 0.57344, + "grad_norm": 0.7080816626548767, + "learning_rate": 0.0002977385920043074, + "loss": 5.7041, + "step": 448 + }, + { + "epoch": 0.57472, + "grad_norm": 0.7158259153366089, + "learning_rate": 0.00029769820971867005, + "loss": 5.6587, + "step": 449 + }, + { + "epoch": 0.576, + "grad_norm": 0.7243630290031433, + "learning_rate": 0.0002976578274330327, + "loss": 5.6178, + "step": 450 + }, + { + "epoch": 0.57728, + "grad_norm": 0.6919866800308228, + "learning_rate": 0.0002976174451473953, + "loss": 5.6674, + "step": 451 + }, + { + "epoch": 0.57856, + "grad_norm": 0.7445707321166992, + "learning_rate": 0.00029757706286175794, + "loss": 5.6193, + "step": 452 + }, + { + "epoch": 0.57984, + "grad_norm": 0.7470800876617432, + "learning_rate": 0.00029753668057612057, + "loss": 5.6655, + "step": 453 + }, + { + "epoch": 0.58112, + "grad_norm": 0.7181954383850098, + "learning_rate": 0.0002974962982904832, + "loss": 5.607, + "step": 454 + }, + { + "epoch": 0.5824, + "grad_norm": 0.7144619822502136, + "learning_rate": 0.0002974559160048459, + "loss": 5.6176, + "step": 455 + }, + { + "epoch": 0.58368, + "grad_norm": 0.6728463768959045, + "learning_rate": 0.00029741553371920846, + "loss": 5.5554, + "step": 456 + }, + { + "epoch": 0.58496, + "grad_norm": 0.7108640074729919, + "learning_rate": 0.0002973751514335711, + "loss": 5.5815, + "step": 457 + }, + { + "epoch": 0.58624, + "grad_norm": 0.5689119696617126, + "learning_rate": 0.0002973347691479337, + "loss": 5.5097, + "step": 458 + }, + { + "epoch": 0.58752, + "grad_norm": 0.6072651743888855, + "learning_rate": 0.0002972943868622964, + "loss": 5.5147, + "step": 459 + }, + { + "epoch": 0.5888, + "grad_norm": 0.5563207268714905, + "learning_rate": 0.00029725400457665904, + "loss": 5.5191, + "step": 460 + }, + { + "epoch": 0.59008, + "grad_norm": 0.6030489802360535, + "learning_rate": 0.00029721362229102167, + "loss": 5.5896, + "step": 461 + }, + { + "epoch": 0.59136, + "grad_norm": 0.6141897439956665, + "learning_rate": 0.0002971732400053843, + "loss": 5.6277, + "step": 462 + }, + { + "epoch": 0.59264, + "grad_norm": 0.6721770167350769, + "learning_rate": 0.00029713285771974687, + "loss": 5.5537, + "step": 463 + }, + { + "epoch": 0.59392, + "grad_norm": 0.6960611343383789, + "learning_rate": 0.00029709247543410956, + "loss": 5.569, + "step": 464 + }, + { + "epoch": 0.5952, + "grad_norm": 0.6458708047866821, + "learning_rate": 0.0002970520931484722, + "loss": 5.5418, + "step": 465 + }, + { + "epoch": 0.59648, + "grad_norm": 0.7103924751281738, + "learning_rate": 0.0002970117108628348, + "loss": 5.5574, + "step": 466 + }, + { + "epoch": 0.59776, + "grad_norm": 0.677963376045227, + "learning_rate": 0.00029697132857719745, + "loss": 5.5791, + "step": 467 + }, + { + "epoch": 0.59904, + "grad_norm": 0.6947761178016663, + "learning_rate": 0.0002969309462915601, + "loss": 5.5745, + "step": 468 + }, + { + "epoch": 0.60032, + "grad_norm": 0.6401717662811279, + "learning_rate": 0.0002968905640059227, + "loss": 5.5222, + "step": 469 + }, + { + "epoch": 0.6016, + "grad_norm": 0.6908291578292847, + "learning_rate": 0.00029685018172028534, + "loss": 5.5118, + "step": 470 + }, + { + "epoch": 0.60288, + "grad_norm": 0.7326011061668396, + "learning_rate": 0.00029680979943464797, + "loss": 5.5127, + "step": 471 + }, + { + "epoch": 0.60416, + "grad_norm": 0.8172990679740906, + "learning_rate": 0.00029676941714901065, + "loss": 5.5237, + "step": 472 + }, + { + "epoch": 0.60544, + "grad_norm": 0.7394744157791138, + "learning_rate": 0.0002967290348633732, + "loss": 5.4849, + "step": 473 + }, + { + "epoch": 0.60672, + "grad_norm": 0.6962910294532776, + "learning_rate": 0.00029668865257773586, + "loss": 5.5167, + "step": 474 + }, + { + "epoch": 0.608, + "grad_norm": 0.6705038547515869, + "learning_rate": 0.0002966482702920985, + "loss": 5.5084, + "step": 475 + }, + { + "epoch": 0.60928, + "grad_norm": 0.8199055790901184, + "learning_rate": 0.0002966078880064611, + "loss": 5.5338, + "step": 476 + }, + { + "epoch": 0.61056, + "grad_norm": 0.833018958568573, + "learning_rate": 0.0002965675057208238, + "loss": 5.5654, + "step": 477 + }, + { + "epoch": 0.61184, + "grad_norm": 0.7582036852836609, + "learning_rate": 0.00029652712343518643, + "loss": 5.5814, + "step": 478 + }, + { + "epoch": 0.61312, + "grad_norm": 0.8647993803024292, + "learning_rate": 0.000296486741149549, + "loss": 5.5089, + "step": 479 + }, + { + "epoch": 0.6144, + "grad_norm": 1.1151838302612305, + "learning_rate": 0.00029644635886391164, + "loss": 5.5816, + "step": 480 + }, + { + "epoch": 0.61568, + "grad_norm": 0.8047844767570496, + "learning_rate": 0.0002964059765782743, + "loss": 5.5261, + "step": 481 + }, + { + "epoch": 0.61696, + "grad_norm": 0.7434077858924866, + "learning_rate": 0.00029636559429263695, + "loss": 5.5063, + "step": 482 + }, + { + "epoch": 0.61824, + "grad_norm": 0.7304471135139465, + "learning_rate": 0.0002963252120069996, + "loss": 5.5246, + "step": 483 + }, + { + "epoch": 0.61952, + "grad_norm": 0.686316728591919, + "learning_rate": 0.0002962848297213622, + "loss": 5.4783, + "step": 484 + }, + { + "epoch": 0.6208, + "grad_norm": 0.6505491733551025, + "learning_rate": 0.00029624444743572484, + "loss": 5.4989, + "step": 485 + }, + { + "epoch": 0.62208, + "grad_norm": 0.6548778414726257, + "learning_rate": 0.00029620406515008747, + "loss": 5.4819, + "step": 486 + }, + { + "epoch": 0.62336, + "grad_norm": 0.6001530289649963, + "learning_rate": 0.0002961636828644501, + "loss": 5.4859, + "step": 487 + }, + { + "epoch": 0.62464, + "grad_norm": 0.6683222055435181, + "learning_rate": 0.00029612330057881273, + "loss": 5.4872, + "step": 488 + }, + { + "epoch": 0.62592, + "grad_norm": 0.7493375539779663, + "learning_rate": 0.00029608291829317536, + "loss": 5.5236, + "step": 489 + }, + { + "epoch": 0.6272, + "grad_norm": 0.7252801060676575, + "learning_rate": 0.000296042536007538, + "loss": 5.4432, + "step": 490 + }, + { + "epoch": 0.62848, + "grad_norm": 0.6479570269584656, + "learning_rate": 0.0002960021537219006, + "loss": 5.4957, + "step": 491 + }, + { + "epoch": 0.62976, + "grad_norm": 0.6929423809051514, + "learning_rate": 0.00029596177143626325, + "loss": 5.4412, + "step": 492 + }, + { + "epoch": 0.63104, + "grad_norm": 0.6217756271362305, + "learning_rate": 0.0002959213891506259, + "loss": 5.4922, + "step": 493 + }, + { + "epoch": 0.63232, + "grad_norm": 0.7749668955802917, + "learning_rate": 0.00029588100686498857, + "loss": 5.5026, + "step": 494 + }, + { + "epoch": 0.6336, + "grad_norm": 0.7131249308586121, + "learning_rate": 0.0002958406245793512, + "loss": 5.4615, + "step": 495 + }, + { + "epoch": 0.63488, + "grad_norm": 0.6029372215270996, + "learning_rate": 0.00029580024229371377, + "loss": 5.5491, + "step": 496 + }, + { + "epoch": 0.63616, + "grad_norm": 0.8320589661598206, + "learning_rate": 0.0002957598600080764, + "loss": 5.4513, + "step": 497 + }, + { + "epoch": 0.63744, + "grad_norm": 0.8968410491943359, + "learning_rate": 0.0002957194777224391, + "loss": 5.5575, + "step": 498 + }, + { + "epoch": 0.63872, + "grad_norm": 0.8323341012001038, + "learning_rate": 0.0002956790954368017, + "loss": 5.4464, + "step": 499 + }, + { + "epoch": 0.64, + "grad_norm": 0.7660730481147766, + "learning_rate": 0.00029563871315116435, + "loss": 5.5199, + "step": 500 + }, + { + "epoch": 0.64128, + "grad_norm": 0.8838502764701843, + "learning_rate": 0.000295598330865527, + "loss": 5.4954, + "step": 501 + }, + { + "epoch": 0.64256, + "grad_norm": 0.8735381960868835, + "learning_rate": 0.0002955579485798896, + "loss": 5.4527, + "step": 502 + }, + { + "epoch": 0.64384, + "grad_norm": 0.7608069777488708, + "learning_rate": 0.00029551756629425224, + "loss": 5.4398, + "step": 503 + }, + { + "epoch": 0.64512, + "grad_norm": 0.7376892566680908, + "learning_rate": 0.00029547718400861487, + "loss": 5.4521, + "step": 504 + }, + { + "epoch": 0.6464, + "grad_norm": 0.6925538778305054, + "learning_rate": 0.0002954368017229775, + "loss": 5.504, + "step": 505 + }, + { + "epoch": 0.64768, + "grad_norm": 0.6140308380126953, + "learning_rate": 0.00029539641943734013, + "loss": 5.4147, + "step": 506 + }, + { + "epoch": 0.64896, + "grad_norm": 0.6824973225593567, + "learning_rate": 0.00029535603715170276, + "loss": 5.4963, + "step": 507 + }, + { + "epoch": 0.65024, + "grad_norm": 0.736219584941864, + "learning_rate": 0.0002953156548660654, + "loss": 5.4566, + "step": 508 + }, + { + "epoch": 0.65152, + "grad_norm": 0.6932424902915955, + "learning_rate": 0.000295275272580428, + "loss": 5.3826, + "step": 509 + }, + { + "epoch": 0.6528, + "grad_norm": 0.6957204341888428, + "learning_rate": 0.00029523489029479065, + "loss": 5.4521, + "step": 510 + }, + { + "epoch": 0.65408, + "grad_norm": 0.5883374810218811, + "learning_rate": 0.00029519450800915333, + "loss": 5.4681, + "step": 511 + }, + { + "epoch": 0.65536, + "grad_norm": 0.6524292826652527, + "learning_rate": 0.0002951541257235159, + "loss": 5.3794, + "step": 512 + }, + { + "epoch": 0.65664, + "grad_norm": 0.7906764149665833, + "learning_rate": 0.00029511374343787854, + "loss": 5.443, + "step": 513 + }, + { + "epoch": 0.65792, + "grad_norm": 0.7420827746391296, + "learning_rate": 0.00029507336115224117, + "loss": 5.4216, + "step": 514 + }, + { + "epoch": 0.6592, + "grad_norm": 0.7884359955787659, + "learning_rate": 0.0002950329788666038, + "loss": 5.4701, + "step": 515 + }, + { + "epoch": 0.66048, + "grad_norm": 0.7779830694198608, + "learning_rate": 0.0002949925965809665, + "loss": 5.4779, + "step": 516 + }, + { + "epoch": 0.66176, + "grad_norm": 0.7066437602043152, + "learning_rate": 0.0002949522142953291, + "loss": 5.4453, + "step": 517 + }, + { + "epoch": 0.66304, + "grad_norm": 0.8100753426551819, + "learning_rate": 0.00029491183200969174, + "loss": 5.4251, + "step": 518 + }, + { + "epoch": 0.66432, + "grad_norm": 0.7143018245697021, + "learning_rate": 0.0002948714497240543, + "loss": 5.4786, + "step": 519 + }, + { + "epoch": 0.6656, + "grad_norm": 0.7291150689125061, + "learning_rate": 0.000294831067438417, + "loss": 5.4265, + "step": 520 + }, + { + "epoch": 0.66688, + "grad_norm": 0.6859623193740845, + "learning_rate": 0.00029479068515277963, + "loss": 5.4724, + "step": 521 + }, + { + "epoch": 0.66816, + "grad_norm": 0.6909458637237549, + "learning_rate": 0.00029475030286714226, + "loss": 5.3719, + "step": 522 + }, + { + "epoch": 0.66944, + "grad_norm": 0.6806208491325378, + "learning_rate": 0.0002947099205815049, + "loss": 5.3872, + "step": 523 + }, + { + "epoch": 0.67072, + "grad_norm": 0.6667022109031677, + "learning_rate": 0.0002946695382958675, + "loss": 5.3614, + "step": 524 + }, + { + "epoch": 0.672, + "grad_norm": 0.6555402278900146, + "learning_rate": 0.00029462915601023015, + "loss": 5.4534, + "step": 525 + }, + { + "epoch": 0.67328, + "grad_norm": 0.685303270816803, + "learning_rate": 0.0002945887737245928, + "loss": 5.426, + "step": 526 + }, + { + "epoch": 0.67456, + "grad_norm": 0.7246394753456116, + "learning_rate": 0.0002945483914389554, + "loss": 5.4041, + "step": 527 + }, + { + "epoch": 0.67584, + "grad_norm": 0.765101969242096, + "learning_rate": 0.00029450800915331804, + "loss": 5.386, + "step": 528 + }, + { + "epoch": 0.67712, + "grad_norm": 0.7084211111068726, + "learning_rate": 0.0002944676268676807, + "loss": 5.3894, + "step": 529 + }, + { + "epoch": 0.6784, + "grad_norm": 0.6788263916969299, + "learning_rate": 0.0002944272445820433, + "loss": 5.375, + "step": 530 + }, + { + "epoch": 0.67968, + "grad_norm": 0.7256125807762146, + "learning_rate": 0.00029438686229640593, + "loss": 5.3544, + "step": 531 + }, + { + "epoch": 0.68096, + "grad_norm": 0.8374799489974976, + "learning_rate": 0.00029434648001076856, + "loss": 5.3758, + "step": 532 + }, + { + "epoch": 0.68224, + "grad_norm": 0.8988571166992188, + "learning_rate": 0.00029430609772513125, + "loss": 5.4003, + "step": 533 + }, + { + "epoch": 0.68352, + "grad_norm": 0.7412604093551636, + "learning_rate": 0.0002942657154394939, + "loss": 5.3558, + "step": 534 + }, + { + "epoch": 0.6848, + "grad_norm": 0.7604132890701294, + "learning_rate": 0.00029422533315385645, + "loss": 5.4506, + "step": 535 + }, + { + "epoch": 0.68608, + "grad_norm": 0.9959605932235718, + "learning_rate": 0.0002941849508682191, + "loss": 5.3877, + "step": 536 + }, + { + "epoch": 0.68736, + "grad_norm": 0.834484875202179, + "learning_rate": 0.00029414456858258177, + "loss": 5.3407, + "step": 537 + }, + { + "epoch": 0.68864, + "grad_norm": 0.6473014950752258, + "learning_rate": 0.0002941041862969444, + "loss": 5.3553, + "step": 538 + }, + { + "epoch": 0.68992, + "grad_norm": 0.8236899375915527, + "learning_rate": 0.00029406380401130703, + "loss": 5.3946, + "step": 539 + }, + { + "epoch": 0.6912, + "grad_norm": 0.7361433506011963, + "learning_rate": 0.00029402342172566966, + "loss": 5.3825, + "step": 540 + }, + { + "epoch": 0.69248, + "grad_norm": 0.5942022800445557, + "learning_rate": 0.0002939830394400323, + "loss": 5.3754, + "step": 541 + }, + { + "epoch": 0.69376, + "grad_norm": 0.6257084608078003, + "learning_rate": 0.0002939426571543949, + "loss": 5.3383, + "step": 542 + }, + { + "epoch": 0.69504, + "grad_norm": 0.5751544237136841, + "learning_rate": 0.00029390227486875755, + "loss": 5.3087, + "step": 543 + }, + { + "epoch": 0.69632, + "grad_norm": 0.6479839086532593, + "learning_rate": 0.0002938618925831202, + "loss": 5.4463, + "step": 544 + }, + { + "epoch": 0.6976, + "grad_norm": 0.6127558350563049, + "learning_rate": 0.0002938215102974828, + "loss": 5.4066, + "step": 545 + }, + { + "epoch": 0.69888, + "grad_norm": 0.6831675171852112, + "learning_rate": 0.00029378112801184544, + "loss": 5.3043, + "step": 546 + }, + { + "epoch": 0.70016, + "grad_norm": 0.69155353307724, + "learning_rate": 0.00029374074572620807, + "loss": 5.3776, + "step": 547 + }, + { + "epoch": 0.70144, + "grad_norm": 0.6620556116104126, + "learning_rate": 0.0002937003634405707, + "loss": 5.3669, + "step": 548 + }, + { + "epoch": 0.70272, + "grad_norm": 0.5906518697738647, + "learning_rate": 0.00029365998115493333, + "loss": 5.3356, + "step": 549 + }, + { + "epoch": 0.704, + "grad_norm": 0.6667118668556213, + "learning_rate": 0.000293619598869296, + "loss": 5.327, + "step": 550 + }, + { + "epoch": 0.70528, + "grad_norm": 0.7207005023956299, + "learning_rate": 0.0002935792165836586, + "loss": 5.3675, + "step": 551 + }, + { + "epoch": 0.70656, + "grad_norm": 0.8886672854423523, + "learning_rate": 0.0002935388342980212, + "loss": 5.3172, + "step": 552 + }, + { + "epoch": 0.70784, + "grad_norm": 0.9381269216537476, + "learning_rate": 0.00029349845201238385, + "loss": 5.3404, + "step": 553 + }, + { + "epoch": 0.70912, + "grad_norm": 0.790340006351471, + "learning_rate": 0.0002934580697267465, + "loss": 5.3174, + "step": 554 + }, + { + "epoch": 0.7104, + "grad_norm": 0.636177659034729, + "learning_rate": 0.00029341768744110916, + "loss": 5.3767, + "step": 555 + }, + { + "epoch": 0.71168, + "grad_norm": 0.7448100447654724, + "learning_rate": 0.0002933773051554718, + "loss": 5.3201, + "step": 556 + }, + { + "epoch": 0.71296, + "grad_norm": 0.7365016341209412, + "learning_rate": 0.0002933369228698344, + "loss": 5.318, + "step": 557 + }, + { + "epoch": 0.71424, + "grad_norm": 0.6620506048202515, + "learning_rate": 0.000293296540584197, + "loss": 5.3417, + "step": 558 + }, + { + "epoch": 0.71552, + "grad_norm": 0.5934759378433228, + "learning_rate": 0.0002932561582985597, + "loss": 5.3751, + "step": 559 + }, + { + "epoch": 0.7168, + "grad_norm": 0.6874362230300903, + "learning_rate": 0.0002932157760129223, + "loss": 5.3089, + "step": 560 + }, + { + "epoch": 0.71808, + "grad_norm": 0.6296939253807068, + "learning_rate": 0.00029317539372728494, + "loss": 5.3057, + "step": 561 + }, + { + "epoch": 0.71936, + "grad_norm": 0.5342876315116882, + "learning_rate": 0.0002931350114416476, + "loss": 5.3055, + "step": 562 + }, + { + "epoch": 0.72064, + "grad_norm": 0.7313459515571594, + "learning_rate": 0.0002930946291560102, + "loss": 5.3002, + "step": 563 + }, + { + "epoch": 0.72192, + "grad_norm": 0.7934847474098206, + "learning_rate": 0.00029305424687037283, + "loss": 5.394, + "step": 564 + }, + { + "epoch": 0.7232, + "grad_norm": 0.6675227284431458, + "learning_rate": 0.00029301386458473546, + "loss": 5.2447, + "step": 565 + }, + { + "epoch": 0.72448, + "grad_norm": 0.6422837972640991, + "learning_rate": 0.0002929734822990981, + "loss": 5.3177, + "step": 566 + }, + { + "epoch": 0.72576, + "grad_norm": 0.6145399212837219, + "learning_rate": 0.0002929331000134607, + "loss": 5.3537, + "step": 567 + }, + { + "epoch": 0.72704, + "grad_norm": 0.5638757944107056, + "learning_rate": 0.00029289271772782335, + "loss": 5.2738, + "step": 568 + }, + { + "epoch": 0.72832, + "grad_norm": 0.6802723407745361, + "learning_rate": 0.000292852335442186, + "loss": 5.3411, + "step": 569 + }, + { + "epoch": 0.7296, + "grad_norm": 0.8496166467666626, + "learning_rate": 0.0002928119531565486, + "loss": 5.3137, + "step": 570 + }, + { + "epoch": 0.73088, + "grad_norm": 0.8245171904563904, + "learning_rate": 0.00029277157087091125, + "loss": 5.3141, + "step": 571 + }, + { + "epoch": 0.73216, + "grad_norm": 0.6584154963493347, + "learning_rate": 0.00029273118858527393, + "loss": 5.3264, + "step": 572 + }, + { + "epoch": 0.73344, + "grad_norm": 0.6775988936424255, + "learning_rate": 0.00029269080629963656, + "loss": 5.2959, + "step": 573 + }, + { + "epoch": 0.73472, + "grad_norm": 0.6198107600212097, + "learning_rate": 0.0002926504240139992, + "loss": 5.2696, + "step": 574 + }, + { + "epoch": 0.736, + "grad_norm": 0.7053045034408569, + "learning_rate": 0.00029261004172836177, + "loss": 5.3243, + "step": 575 + }, + { + "epoch": 0.73728, + "grad_norm": 0.7388597130775452, + "learning_rate": 0.00029256965944272445, + "loss": 5.3298, + "step": 576 + }, + { + "epoch": 0.73856, + "grad_norm": 0.7207250595092773, + "learning_rate": 0.0002925292771570871, + "loss": 5.3113, + "step": 577 + }, + { + "epoch": 0.73984, + "grad_norm": 0.7020829319953918, + "learning_rate": 0.0002924888948714497, + "loss": 5.2813, + "step": 578 + }, + { + "epoch": 0.74112, + "grad_norm": 0.8059638738632202, + "learning_rate": 0.00029244851258581234, + "loss": 5.2436, + "step": 579 + }, + { + "epoch": 0.7424, + "grad_norm": 1.0253353118896484, + "learning_rate": 0.00029240813030017497, + "loss": 5.2739, + "step": 580 + }, + { + "epoch": 0.74368, + "grad_norm": 0.963039755821228, + "learning_rate": 0.0002923677480145376, + "loss": 5.2639, + "step": 581 + }, + { + "epoch": 0.74496, + "grad_norm": 0.7356200814247131, + "learning_rate": 0.00029232736572890023, + "loss": 5.2874, + "step": 582 + }, + { + "epoch": 0.74624, + "grad_norm": 0.7162445187568665, + "learning_rate": 0.00029228698344326286, + "loss": 5.3408, + "step": 583 + }, + { + "epoch": 0.74752, + "grad_norm": 0.708344042301178, + "learning_rate": 0.0002922466011576255, + "loss": 5.2798, + "step": 584 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5786714553833008, + "learning_rate": 0.0002922062188719881, + "loss": 5.285, + "step": 585 + }, + { + "epoch": 0.75008, + "grad_norm": 0.6987271904945374, + "learning_rate": 0.00029216583658635075, + "loss": 5.2781, + "step": 586 + }, + { + "epoch": 0.75136, + "grad_norm": 0.6231822967529297, + "learning_rate": 0.0002921254543007134, + "loss": 5.2242, + "step": 587 + }, + { + "epoch": 0.75264, + "grad_norm": 0.679735004901886, + "learning_rate": 0.000292085072015076, + "loss": 5.3149, + "step": 588 + }, + { + "epoch": 0.75392, + "grad_norm": 0.6461718678474426, + "learning_rate": 0.0002920446897294387, + "loss": 5.2661, + "step": 589 + }, + { + "epoch": 0.7552, + "grad_norm": 0.6281011700630188, + "learning_rate": 0.0002920043074438013, + "loss": 5.1899, + "step": 590 + }, + { + "epoch": 0.75648, + "grad_norm": 0.6488833427429199, + "learning_rate": 0.0002919639251581639, + "loss": 5.2635, + "step": 591 + }, + { + "epoch": 0.75776, + "grad_norm": 0.7168968319892883, + "learning_rate": 0.00029192354287252653, + "loss": 5.1908, + "step": 592 + }, + { + "epoch": 0.75904, + "grad_norm": 0.7377144694328308, + "learning_rate": 0.0002918831605868892, + "loss": 5.2676, + "step": 593 + }, + { + "epoch": 0.76032, + "grad_norm": 0.5983268022537231, + "learning_rate": 0.00029184277830125185, + "loss": 5.26, + "step": 594 + }, + { + "epoch": 0.7616, + "grad_norm": 0.6470866203308105, + "learning_rate": 0.0002918023960156145, + "loss": 5.228, + "step": 595 + }, + { + "epoch": 0.76288, + "grad_norm": 0.7147027850151062, + "learning_rate": 0.0002917620137299771, + "loss": 5.2475, + "step": 596 + }, + { + "epoch": 0.76416, + "grad_norm": 0.686434268951416, + "learning_rate": 0.00029172163144433974, + "loss": 5.2174, + "step": 597 + }, + { + "epoch": 0.76544, + "grad_norm": 0.7053865194320679, + "learning_rate": 0.00029168124915870237, + "loss": 5.2822, + "step": 598 + }, + { + "epoch": 0.76672, + "grad_norm": 0.8161548972129822, + "learning_rate": 0.000291640866873065, + "loss": 5.1864, + "step": 599 + }, + { + "epoch": 0.768, + "grad_norm": 0.6877032518386841, + "learning_rate": 0.0002916004845874276, + "loss": 5.281, + "step": 600 + }, + { + "epoch": 0.76928, + "grad_norm": 0.5830395817756653, + "learning_rate": 0.00029156010230179026, + "loss": 5.2073, + "step": 601 + }, + { + "epoch": 0.77056, + "grad_norm": 0.7007346749305725, + "learning_rate": 0.0002915197200161529, + "loss": 5.3173, + "step": 602 + }, + { + "epoch": 0.77184, + "grad_norm": 0.7271237969398499, + "learning_rate": 0.0002914793377305155, + "loss": 5.235, + "step": 603 + }, + { + "epoch": 0.77312, + "grad_norm": 0.5913999080657959, + "learning_rate": 0.00029143895544487815, + "loss": 5.2552, + "step": 604 + }, + { + "epoch": 0.7744, + "grad_norm": 0.6722851991653442, + "learning_rate": 0.0002913985731592408, + "loss": 5.2112, + "step": 605 + }, + { + "epoch": 0.77568, + "grad_norm": 0.6968867182731628, + "learning_rate": 0.00029135819087360346, + "loss": 5.2505, + "step": 606 + }, + { + "epoch": 0.77696, + "grad_norm": 0.6645686030387878, + "learning_rate": 0.00029131780858796604, + "loss": 5.1846, + "step": 607 + }, + { + "epoch": 0.77824, + "grad_norm": 0.6209181547164917, + "learning_rate": 0.00029127742630232867, + "loss": 5.2269, + "step": 608 + }, + { + "epoch": 0.77952, + "grad_norm": 0.7518525123596191, + "learning_rate": 0.0002912370440166913, + "loss": 5.1761, + "step": 609 + }, + { + "epoch": 0.7808, + "grad_norm": 0.6878477334976196, + "learning_rate": 0.0002911966617310539, + "loss": 5.2348, + "step": 610 + }, + { + "epoch": 0.78208, + "grad_norm": 0.7975960969924927, + "learning_rate": 0.0002911562794454166, + "loss": 5.2519, + "step": 611 + }, + { + "epoch": 0.78336, + "grad_norm": 0.7792649269104004, + "learning_rate": 0.00029111589715977924, + "loss": 5.1812, + "step": 612 + }, + { + "epoch": 0.78464, + "grad_norm": 0.7435747385025024, + "learning_rate": 0.00029107551487414187, + "loss": 5.1505, + "step": 613 + }, + { + "epoch": 0.78592, + "grad_norm": 0.7414288520812988, + "learning_rate": 0.00029103513258850445, + "loss": 5.1972, + "step": 614 + }, + { + "epoch": 0.7872, + "grad_norm": 0.7069399356842041, + "learning_rate": 0.00029099475030286713, + "loss": 5.1766, + "step": 615 + }, + { + "epoch": 0.78848, + "grad_norm": 0.5848967432975769, + "learning_rate": 0.00029095436801722976, + "loss": 5.1791, + "step": 616 + }, + { + "epoch": 0.78976, + "grad_norm": 0.6382315158843994, + "learning_rate": 0.0002909139857315924, + "loss": 5.2266, + "step": 617 + }, + { + "epoch": 0.79104, + "grad_norm": 0.6295589208602905, + "learning_rate": 0.000290873603445955, + "loss": 5.2489, + "step": 618 + }, + { + "epoch": 0.79232, + "grad_norm": 0.6399298310279846, + "learning_rate": 0.00029083322116031765, + "loss": 5.2295, + "step": 619 + }, + { + "epoch": 0.7936, + "grad_norm": 0.6969920992851257, + "learning_rate": 0.0002907928388746803, + "loss": 5.1619, + "step": 620 + }, + { + "epoch": 0.79488, + "grad_norm": 0.6793097853660583, + "learning_rate": 0.0002907524565890429, + "loss": 5.2213, + "step": 621 + }, + { + "epoch": 0.79616, + "grad_norm": 0.6430673003196716, + "learning_rate": 0.00029071207430340554, + "loss": 5.2848, + "step": 622 + }, + { + "epoch": 0.79744, + "grad_norm": 0.7029622793197632, + "learning_rate": 0.00029067169201776817, + "loss": 5.2335, + "step": 623 + }, + { + "epoch": 0.79872, + "grad_norm": 0.5936407446861267, + "learning_rate": 0.0002906313097321308, + "loss": 5.2245, + "step": 624 + }, + { + "epoch": 0.8, + "grad_norm": 0.7349820733070374, + "learning_rate": 0.00029059092744649343, + "loss": 5.2356, + "step": 625 + }, + { + "epoch": 0.80128, + "grad_norm": 0.747024655342102, + "learning_rate": 0.00029055054516085606, + "loss": 5.1537, + "step": 626 + }, + { + "epoch": 0.80256, + "grad_norm": 0.5645571351051331, + "learning_rate": 0.0002905101628752187, + "loss": 5.1635, + "step": 627 + }, + { + "epoch": 0.80384, + "grad_norm": 0.7009657621383667, + "learning_rate": 0.0002904697805895814, + "loss": 5.2236, + "step": 628 + }, + { + "epoch": 0.80512, + "grad_norm": 0.7518696188926697, + "learning_rate": 0.000290429398303944, + "loss": 5.132, + "step": 629 + }, + { + "epoch": 0.8064, + "grad_norm": 0.7385541796684265, + "learning_rate": 0.0002903890160183066, + "loss": 5.184, + "step": 630 + }, + { + "epoch": 0.80768, + "grad_norm": 0.6494706869125366, + "learning_rate": 0.0002903486337326692, + "loss": 5.2385, + "step": 631 + }, + { + "epoch": 0.80896, + "grad_norm": 0.7356922626495361, + "learning_rate": 0.0002903082514470319, + "loss": 5.1194, + "step": 632 + }, + { + "epoch": 0.81024, + "grad_norm": 0.5700814723968506, + "learning_rate": 0.0002902678691613945, + "loss": 5.1652, + "step": 633 + }, + { + "epoch": 0.81152, + "grad_norm": 0.5895199775695801, + "learning_rate": 0.00029022748687575716, + "loss": 5.1429, + "step": 634 + }, + { + "epoch": 0.8128, + "grad_norm": 0.5886778831481934, + "learning_rate": 0.0002901871045901198, + "loss": 5.1932, + "step": 635 + }, + { + "epoch": 0.81408, + "grad_norm": 0.6429824829101562, + "learning_rate": 0.0002901467223044824, + "loss": 5.1488, + "step": 636 + }, + { + "epoch": 0.81536, + "grad_norm": 0.5817999243736267, + "learning_rate": 0.00029010634001884505, + "loss": 5.1306, + "step": 637 + }, + { + "epoch": 0.81664, + "grad_norm": 0.6249139308929443, + "learning_rate": 0.0002900659577332077, + "loss": 5.1539, + "step": 638 + }, + { + "epoch": 0.81792, + "grad_norm": 0.6901640295982361, + "learning_rate": 0.0002900255754475703, + "loss": 5.1751, + "step": 639 + }, + { + "epoch": 0.8192, + "grad_norm": 0.7093073725700378, + "learning_rate": 0.00028998519316193294, + "loss": 5.1692, + "step": 640 + }, + { + "epoch": 0.82048, + "grad_norm": 0.6962242722511292, + "learning_rate": 0.00028994481087629557, + "loss": 5.1768, + "step": 641 + }, + { + "epoch": 0.82176, + "grad_norm": 0.712927520275116, + "learning_rate": 0.0002899044285906582, + "loss": 5.1236, + "step": 642 + }, + { + "epoch": 0.82304, + "grad_norm": 0.600985586643219, + "learning_rate": 0.00028986404630502083, + "loss": 5.2116, + "step": 643 + }, + { + "epoch": 0.82432, + "grad_norm": 0.6715438365936279, + "learning_rate": 0.00028982366401938346, + "loss": 5.1825, + "step": 644 + }, + { + "epoch": 0.8256, + "grad_norm": 0.854724645614624, + "learning_rate": 0.00028978328173374614, + "loss": 5.1579, + "step": 645 + }, + { + "epoch": 0.82688, + "grad_norm": 0.8737854957580566, + "learning_rate": 0.00028974289944810877, + "loss": 5.1176, + "step": 646 + }, + { + "epoch": 0.82816, + "grad_norm": 0.8141425251960754, + "learning_rate": 0.00028970251716247135, + "loss": 5.172, + "step": 647 + }, + { + "epoch": 0.82944, + "grad_norm": 0.6891692280769348, + "learning_rate": 0.000289662134876834, + "loss": 5.2035, + "step": 648 + }, + { + "epoch": 0.83072, + "grad_norm": 0.6787595748901367, + "learning_rate": 0.0002896217525911966, + "loss": 5.1818, + "step": 649 + }, + { + "epoch": 0.832, + "grad_norm": 0.8094693422317505, + "learning_rate": 0.0002895813703055593, + "loss": 5.0946, + "step": 650 + }, + { + "epoch": 0.83328, + "grad_norm": 0.6562574505805969, + "learning_rate": 0.0002895409880199219, + "loss": 5.093, + "step": 651 + }, + { + "epoch": 0.83456, + "grad_norm": 0.5972064733505249, + "learning_rate": 0.00028950060573428455, + "loss": 5.2078, + "step": 652 + }, + { + "epoch": 0.83584, + "grad_norm": 0.6744362115859985, + "learning_rate": 0.00028946022344864713, + "loss": 5.0914, + "step": 653 + }, + { + "epoch": 0.83712, + "grad_norm": 0.7293115258216858, + "learning_rate": 0.0002894198411630098, + "loss": 5.0954, + "step": 654 + }, + { + "epoch": 0.8384, + "grad_norm": 0.6300185918807983, + "learning_rate": 0.00028937945887737244, + "loss": 5.2061, + "step": 655 + }, + { + "epoch": 0.83968, + "grad_norm": 0.729161262512207, + "learning_rate": 0.00028933907659173507, + "loss": 5.1129, + "step": 656 + }, + { + "epoch": 0.84096, + "grad_norm": 0.7147690653800964, + "learning_rate": 0.0002892986943060977, + "loss": 5.1773, + "step": 657 + }, + { + "epoch": 0.84224, + "grad_norm": 0.6832327842712402, + "learning_rate": 0.00028925831202046033, + "loss": 5.1379, + "step": 658 + }, + { + "epoch": 0.84352, + "grad_norm": 0.7403486371040344, + "learning_rate": 0.00028921792973482296, + "loss": 5.1094, + "step": 659 + }, + { + "epoch": 0.8448, + "grad_norm": 0.7538831830024719, + "learning_rate": 0.0002891775474491856, + "loss": 5.0802, + "step": 660 + }, + { + "epoch": 0.84608, + "grad_norm": 0.7270022630691528, + "learning_rate": 0.0002891371651635482, + "loss": 5.1342, + "step": 661 + }, + { + "epoch": 0.84736, + "grad_norm": 0.6888076066970825, + "learning_rate": 0.00028909678287791085, + "loss": 5.118, + "step": 662 + }, + { + "epoch": 0.84864, + "grad_norm": 0.6924152374267578, + "learning_rate": 0.0002890564005922735, + "loss": 5.1728, + "step": 663 + }, + { + "epoch": 0.84992, + "grad_norm": 0.6232258081436157, + "learning_rate": 0.0002890160183066361, + "loss": 5.1357, + "step": 664 + }, + { + "epoch": 0.8512, + "grad_norm": 0.7234542369842529, + "learning_rate": 0.00028897563602099874, + "loss": 5.0607, + "step": 665 + }, + { + "epoch": 0.85248, + "grad_norm": 0.7320601940155029, + "learning_rate": 0.0002889352537353614, + "loss": 5.1074, + "step": 666 + }, + { + "epoch": 0.85376, + "grad_norm": 0.6856744289398193, + "learning_rate": 0.00028889487144972406, + "loss": 5.0398, + "step": 667 + }, + { + "epoch": 0.85504, + "grad_norm": 0.6123003363609314, + "learning_rate": 0.0002888544891640867, + "loss": 5.178, + "step": 668 + }, + { + "epoch": 0.85632, + "grad_norm": 0.6712738871574402, + "learning_rate": 0.0002888141068784493, + "loss": 5.1151, + "step": 669 + }, + { + "epoch": 0.8576, + "grad_norm": 0.6770631670951843, + "learning_rate": 0.0002887737245928119, + "loss": 5.1691, + "step": 670 + }, + { + "epoch": 0.85888, + "grad_norm": 0.603068470954895, + "learning_rate": 0.0002887333423071746, + "loss": 5.0652, + "step": 671 + }, + { + "epoch": 0.86016, + "grad_norm": 0.7057216167449951, + "learning_rate": 0.0002886929600215372, + "loss": 5.1038, + "step": 672 + }, + { + "epoch": 0.86144, + "grad_norm": 0.708411693572998, + "learning_rate": 0.00028865257773589984, + "loss": 5.1423, + "step": 673 + }, + { + "epoch": 0.86272, + "grad_norm": 0.634673535823822, + "learning_rate": 0.00028861219545026247, + "loss": 5.0765, + "step": 674 + }, + { + "epoch": 0.864, + "grad_norm": 0.6029425263404846, + "learning_rate": 0.0002885718131646251, + "loss": 5.0668, + "step": 675 + }, + { + "epoch": 0.86528, + "grad_norm": 0.6047025918960571, + "learning_rate": 0.00028853143087898773, + "loss": 5.0427, + "step": 676 + }, + { + "epoch": 0.86656, + "grad_norm": 0.6101155281066895, + "learning_rate": 0.00028849104859335036, + "loss": 5.1077, + "step": 677 + }, + { + "epoch": 0.86784, + "grad_norm": 0.6415575742721558, + "learning_rate": 0.000288450666307713, + "loss": 5.1074, + "step": 678 + }, + { + "epoch": 0.86912, + "grad_norm": 0.6284694671630859, + "learning_rate": 0.0002884102840220756, + "loss": 5.1415, + "step": 679 + }, + { + "epoch": 0.8704, + "grad_norm": 0.6168042421340942, + "learning_rate": 0.00028836990173643825, + "loss": 5.06, + "step": 680 + }, + { + "epoch": 0.87168, + "grad_norm": 0.6429285407066345, + "learning_rate": 0.0002883295194508009, + "loss": 5.1369, + "step": 681 + }, + { + "epoch": 0.87296, + "grad_norm": 0.6893006563186646, + "learning_rate": 0.0002882891371651635, + "loss": 5.0639, + "step": 682 + }, + { + "epoch": 0.87424, + "grad_norm": 0.5782660841941833, + "learning_rate": 0.00028824875487952614, + "loss": 5.0679, + "step": 683 + }, + { + "epoch": 0.87552, + "grad_norm": 0.6747514009475708, + "learning_rate": 0.0002882083725938888, + "loss": 5.097, + "step": 684 + }, + { + "epoch": 0.8768, + "grad_norm": 0.7318249940872192, + "learning_rate": 0.00028816799030825145, + "loss": 5.1338, + "step": 685 + }, + { + "epoch": 0.87808, + "grad_norm": 0.6769218444824219, + "learning_rate": 0.00028812760802261403, + "loss": 5.1185, + "step": 686 + }, + { + "epoch": 0.87936, + "grad_norm": 0.6341337561607361, + "learning_rate": 0.00028808722573697666, + "loss": 5.1188, + "step": 687 + }, + { + "epoch": 0.88064, + "grad_norm": 0.6396981477737427, + "learning_rate": 0.0002880468434513393, + "loss": 5.0718, + "step": 688 + }, + { + "epoch": 0.88192, + "grad_norm": 0.703349232673645, + "learning_rate": 0.000288006461165702, + "loss": 5.1783, + "step": 689 + }, + { + "epoch": 0.8832, + "grad_norm": 0.7219293713569641, + "learning_rate": 0.0002879660788800646, + "loss": 5.0716, + "step": 690 + }, + { + "epoch": 0.88448, + "grad_norm": 0.6729134321212769, + "learning_rate": 0.00028792569659442723, + "loss": 5.0406, + "step": 691 + }, + { + "epoch": 0.88576, + "grad_norm": 0.6320032477378845, + "learning_rate": 0.00028788531430878986, + "loss": 5.0302, + "step": 692 + }, + { + "epoch": 0.88704, + "grad_norm": 0.662146806716919, + "learning_rate": 0.0002878449320231525, + "loss": 5.0877, + "step": 693 + }, + { + "epoch": 0.88832, + "grad_norm": 0.6499378085136414, + "learning_rate": 0.0002878045497375151, + "loss": 4.983, + "step": 694 + }, + { + "epoch": 0.8896, + "grad_norm": 0.600250780582428, + "learning_rate": 0.00028776416745187775, + "loss": 5.0135, + "step": 695 + }, + { + "epoch": 0.89088, + "grad_norm": 0.6061322689056396, + "learning_rate": 0.0002877237851662404, + "loss": 5.0913, + "step": 696 + }, + { + "epoch": 0.89216, + "grad_norm": 0.6367159485816956, + "learning_rate": 0.000287683402880603, + "loss": 5.0589, + "step": 697 + }, + { + "epoch": 0.89344, + "grad_norm": 0.6972000598907471, + "learning_rate": 0.00028764302059496564, + "loss": 5.057, + "step": 698 + }, + { + "epoch": 0.89472, + "grad_norm": 0.7460048794746399, + "learning_rate": 0.0002876026383093283, + "loss": 4.9804, + "step": 699 + }, + { + "epoch": 0.896, + "grad_norm": 0.7880580425262451, + "learning_rate": 0.0002875622560236909, + "loss": 5.0366, + "step": 700 + }, + { + "epoch": 0.89728, + "grad_norm": 0.857755720615387, + "learning_rate": 0.00028752187373805353, + "loss": 5.1399, + "step": 701 + }, + { + "epoch": 0.89856, + "grad_norm": 0.8949594497680664, + "learning_rate": 0.00028748149145241616, + "loss": 5.0537, + "step": 702 + }, + { + "epoch": 0.89984, + "grad_norm": 0.6883854866027832, + "learning_rate": 0.0002874411091667788, + "loss": 5.0771, + "step": 703 + }, + { + "epoch": 0.90112, + "grad_norm": 0.7915716171264648, + "learning_rate": 0.0002874007268811414, + "loss": 5.1028, + "step": 704 + }, + { + "epoch": 0.9024, + "grad_norm": 0.6309002041816711, + "learning_rate": 0.00028736034459550405, + "loss": 5.079, + "step": 705 + }, + { + "epoch": 0.90368, + "grad_norm": 0.639817476272583, + "learning_rate": 0.00028731996230986674, + "loss": 5.1016, + "step": 706 + }, + { + "epoch": 0.90496, + "grad_norm": 0.6250396966934204, + "learning_rate": 0.00028727958002422937, + "loss": 4.9772, + "step": 707 + }, + { + "epoch": 0.90624, + "grad_norm": 0.5922684669494629, + "learning_rate": 0.000287239197738592, + "loss": 5.0849, + "step": 708 + }, + { + "epoch": 0.90752, + "grad_norm": 0.6167359352111816, + "learning_rate": 0.0002871988154529546, + "loss": 5.0408, + "step": 709 + }, + { + "epoch": 0.9088, + "grad_norm": 0.6019271612167358, + "learning_rate": 0.00028715843316731726, + "loss": 5.0312, + "step": 710 + }, + { + "epoch": 0.91008, + "grad_norm": 0.6929312348365784, + "learning_rate": 0.0002871180508816799, + "loss": 5.0345, + "step": 711 + }, + { + "epoch": 0.91136, + "grad_norm": 0.6549073457717896, + "learning_rate": 0.0002870776685960425, + "loss": 5.0344, + "step": 712 + }, + { + "epoch": 0.91264, + "grad_norm": 0.6558994054794312, + "learning_rate": 0.00028703728631040515, + "loss": 4.9554, + "step": 713 + }, + { + "epoch": 0.91392, + "grad_norm": 0.5860869884490967, + "learning_rate": 0.0002869969040247678, + "loss": 4.9838, + "step": 714 + }, + { + "epoch": 0.9152, + "grad_norm": 0.6493767499923706, + "learning_rate": 0.0002869565217391304, + "loss": 5.0245, + "step": 715 + }, + { + "epoch": 0.91648, + "grad_norm": 0.6401461958885193, + "learning_rate": 0.00028691613945349304, + "loss": 5.0256, + "step": 716 + }, + { + "epoch": 0.91776, + "grad_norm": 0.8375266194343567, + "learning_rate": 0.00028687575716785567, + "loss": 4.9672, + "step": 717 + }, + { + "epoch": 0.91904, + "grad_norm": 0.7304956912994385, + "learning_rate": 0.0002868353748822183, + "loss": 5.024, + "step": 718 + }, + { + "epoch": 0.92032, + "grad_norm": 0.7382602095603943, + "learning_rate": 0.00028679499259658093, + "loss": 5.0483, + "step": 719 + }, + { + "epoch": 0.9216, + "grad_norm": 0.7157966494560242, + "learning_rate": 0.00028675461031094356, + "loss": 4.9645, + "step": 720 + }, + { + "epoch": 0.92288, + "grad_norm": 0.7719393372535706, + "learning_rate": 0.0002867142280253062, + "loss": 5.0018, + "step": 721 + }, + { + "epoch": 0.92416, + "grad_norm": 0.6828843355178833, + "learning_rate": 0.0002866738457396688, + "loss": 4.9844, + "step": 722 + }, + { + "epoch": 0.92544, + "grad_norm": 0.6229972243309021, + "learning_rate": 0.0002866334634540315, + "loss": 4.9949, + "step": 723 + }, + { + "epoch": 0.92672, + "grad_norm": 0.6336907744407654, + "learning_rate": 0.00028659308116839413, + "loss": 5.0498, + "step": 724 + }, + { + "epoch": 0.928, + "grad_norm": 0.6664116978645325, + "learning_rate": 0.0002865526988827567, + "loss": 5.0202, + "step": 725 + }, + { + "epoch": 0.92928, + "grad_norm": 0.6601686477661133, + "learning_rate": 0.00028651231659711934, + "loss": 4.892, + "step": 726 + }, + { + "epoch": 0.93056, + "grad_norm": 0.713094174861908, + "learning_rate": 0.000286471934311482, + "loss": 4.9817, + "step": 727 + }, + { + "epoch": 0.93184, + "grad_norm": 0.5439857840538025, + "learning_rate": 0.00028643155202584465, + "loss": 5.0493, + "step": 728 + }, + { + "epoch": 0.93312, + "grad_norm": 0.6119688749313354, + "learning_rate": 0.0002863911697402073, + "loss": 5.0149, + "step": 729 + }, + { + "epoch": 0.9344, + "grad_norm": 0.6848023533821106, + "learning_rate": 0.0002863507874545699, + "loss": 5.0056, + "step": 730 + }, + { + "epoch": 0.93568, + "grad_norm": 0.6950737833976746, + "learning_rate": 0.00028631040516893255, + "loss": 5.0121, + "step": 731 + }, + { + "epoch": 0.93696, + "grad_norm": 0.825659453868866, + "learning_rate": 0.0002862700228832952, + "loss": 4.9852, + "step": 732 + }, + { + "epoch": 0.93824, + "grad_norm": 0.7373353242874146, + "learning_rate": 0.0002862296405976578, + "loss": 4.943, + "step": 733 + }, + { + "epoch": 0.93952, + "grad_norm": 0.6202784776687622, + "learning_rate": 0.00028618925831202044, + "loss": 5.0669, + "step": 734 + }, + { + "epoch": 0.9408, + "grad_norm": 0.6890573501586914, + "learning_rate": 0.00028614887602638307, + "loss": 4.9571, + "step": 735 + }, + { + "epoch": 0.94208, + "grad_norm": 0.6824938654899597, + "learning_rate": 0.0002861084937407457, + "loss": 4.9799, + "step": 736 + }, + { + "epoch": 0.94336, + "grad_norm": 0.693109393119812, + "learning_rate": 0.0002860681114551083, + "loss": 5.0417, + "step": 737 + }, + { + "epoch": 0.94464, + "grad_norm": 0.5777782201766968, + "learning_rate": 0.00028602772916947096, + "loss": 5.0035, + "step": 738 + }, + { + "epoch": 0.94592, + "grad_norm": 0.6748936772346497, + "learning_rate": 0.0002859873468838336, + "loss": 4.9764, + "step": 739 + }, + { + "epoch": 0.9472, + "grad_norm": 0.7863640785217285, + "learning_rate": 0.00028594696459819627, + "loss": 4.9554, + "step": 740 + }, + { + "epoch": 0.94848, + "grad_norm": 0.7226070761680603, + "learning_rate": 0.0002859065823125589, + "loss": 4.9605, + "step": 741 + }, + { + "epoch": 0.94976, + "grad_norm": 0.6418464779853821, + "learning_rate": 0.0002858662000269215, + "loss": 4.9418, + "step": 742 + }, + { + "epoch": 0.95104, + "grad_norm": 0.8660367727279663, + "learning_rate": 0.0002858258177412841, + "loss": 5.0473, + "step": 743 + }, + { + "epoch": 0.95232, + "grad_norm": 0.7218934297561646, + "learning_rate": 0.00028578543545564674, + "loss": 5.0287, + "step": 744 + }, + { + "epoch": 0.9536, + "grad_norm": 0.6135517954826355, + "learning_rate": 0.0002857450531700094, + "loss": 4.9141, + "step": 745 + }, + { + "epoch": 0.95488, + "grad_norm": 0.7062588334083557, + "learning_rate": 0.00028570467088437205, + "loss": 4.974, + "step": 746 + }, + { + "epoch": 0.95616, + "grad_norm": 0.6527339220046997, + "learning_rate": 0.0002856642885987347, + "loss": 4.9275, + "step": 747 + }, + { + "epoch": 0.95744, + "grad_norm": 0.601574718952179, + "learning_rate": 0.0002856239063130973, + "loss": 5.0131, + "step": 748 + }, + { + "epoch": 0.95872, + "grad_norm": 0.6972563862800598, + "learning_rate": 0.00028558352402745994, + "loss": 5.0037, + "step": 749 + }, + { + "epoch": 0.96, + "grad_norm": 0.6463833451271057, + "learning_rate": 0.00028554314174182257, + "loss": 5.0173, + "step": 750 + }, + { + "epoch": 0.96128, + "grad_norm": 0.7086387872695923, + "learning_rate": 0.0002855027594561852, + "loss": 5.0418, + "step": 751 + }, + { + "epoch": 0.96256, + "grad_norm": 0.6756019592285156, + "learning_rate": 0.00028546237717054783, + "loss": 4.9527, + "step": 752 + }, + { + "epoch": 0.96384, + "grad_norm": 0.7218304872512817, + "learning_rate": 0.00028542199488491046, + "loss": 4.99, + "step": 753 + }, + { + "epoch": 0.96512, + "grad_norm": 0.6159854531288147, + "learning_rate": 0.0002853816125992731, + "loss": 4.9612, + "step": 754 + }, + { + "epoch": 0.9664, + "grad_norm": 0.7188109159469604, + "learning_rate": 0.0002853412303136357, + "loss": 4.957, + "step": 755 + }, + { + "epoch": 0.96768, + "grad_norm": 0.7729025483131409, + "learning_rate": 0.00028530084802799835, + "loss": 5.03, + "step": 756 + }, + { + "epoch": 0.96896, + "grad_norm": 0.6761441230773926, + "learning_rate": 0.000285260465742361, + "loss": 4.9148, + "step": 757 + }, + { + "epoch": 0.97024, + "grad_norm": 0.6675111055374146, + "learning_rate": 0.0002852200834567236, + "loss": 4.9832, + "step": 758 + }, + { + "epoch": 0.97152, + "grad_norm": 0.6335251927375793, + "learning_rate": 0.00028517970117108624, + "loss": 4.9268, + "step": 759 + }, + { + "epoch": 0.9728, + "grad_norm": 0.633287787437439, + "learning_rate": 0.00028513931888544887, + "loss": 4.9717, + "step": 760 + }, + { + "epoch": 0.97408, + "grad_norm": 0.5922399163246155, + "learning_rate": 0.0002850989365998115, + "loss": 4.9945, + "step": 761 + }, + { + "epoch": 0.97536, + "grad_norm": 0.5817521214485168, + "learning_rate": 0.0002850585543141742, + "loss": 4.9261, + "step": 762 + }, + { + "epoch": 0.97664, + "grad_norm": 0.6280835270881653, + "learning_rate": 0.0002850181720285368, + "loss": 4.9935, + "step": 763 + }, + { + "epoch": 0.97792, + "grad_norm": 0.6975805163383484, + "learning_rate": 0.00028497778974289945, + "loss": 4.9161, + "step": 764 + }, + { + "epoch": 0.9792, + "grad_norm": 0.63359135389328, + "learning_rate": 0.000284937407457262, + "loss": 4.9866, + "step": 765 + }, + { + "epoch": 0.98048, + "grad_norm": 0.5709059238433838, + "learning_rate": 0.0002848970251716247, + "loss": 5.0262, + "step": 766 + }, + { + "epoch": 0.98176, + "grad_norm": 0.5760507583618164, + "learning_rate": 0.00028485664288598734, + "loss": 4.9359, + "step": 767 + }, + { + "epoch": 0.98304, + "grad_norm": 0.5946138501167297, + "learning_rate": 0.00028481626060034997, + "loss": 4.9843, + "step": 768 + }, + { + "epoch": 0.98432, + "grad_norm": 0.6102321743965149, + "learning_rate": 0.0002847758783147126, + "loss": 4.9541, + "step": 769 + }, + { + "epoch": 0.9856, + "grad_norm": 0.7592500448226929, + "learning_rate": 0.0002847354960290752, + "loss": 4.9617, + "step": 770 + }, + { + "epoch": 0.98688, + "grad_norm": 0.7633534669876099, + "learning_rate": 0.00028469511374343786, + "loss": 4.9587, + "step": 771 + }, + { + "epoch": 0.98816, + "grad_norm": 0.778260350227356, + "learning_rate": 0.0002846547314578005, + "loss": 4.9701, + "step": 772 + }, + { + "epoch": 0.98944, + "grad_norm": 0.6937357187271118, + "learning_rate": 0.0002846143491721631, + "loss": 4.8862, + "step": 773 + }, + { + "epoch": 0.99072, + "grad_norm": 0.7559827566146851, + "learning_rate": 0.00028457396688652575, + "loss": 4.9598, + "step": 774 + }, + { + "epoch": 0.992, + "grad_norm": 0.699320375919342, + "learning_rate": 0.0002845335846008884, + "loss": 4.9923, + "step": 775 + }, + { + "epoch": 0.99328, + "grad_norm": 0.6152313947677612, + "learning_rate": 0.000284493202315251, + "loss": 4.9211, + "step": 776 + }, + { + "epoch": 0.99456, + "grad_norm": 0.6833812594413757, + "learning_rate": 0.00028445282002961364, + "loss": 4.9163, + "step": 777 + }, + { + "epoch": 0.99584, + "grad_norm": 0.6250735521316528, + "learning_rate": 0.00028441243774397627, + "loss": 4.8906, + "step": 778 + }, + { + "epoch": 0.99712, + "grad_norm": 0.592545747756958, + "learning_rate": 0.00028437205545833895, + "loss": 4.8642, + "step": 779 + }, + { + "epoch": 0.9984, + "grad_norm": 0.6293628811836243, + "learning_rate": 0.0002843316731727016, + "loss": 4.899, + "step": 780 + }, + { + "epoch": 0.99968, + "grad_norm": 0.8118374943733215, + "learning_rate": 0.00028429129088706416, + "loss": 4.9125, + "step": 781 + }, + { + "epoch": 1.0, + "grad_norm": 1.0649632215499878, + "learning_rate": 0.0002842509086014268, + "loss": 4.8757, + "step": 782 + }, + { + "epoch": 1.00128, + "grad_norm": 0.8830782175064087, + "learning_rate": 0.0002842105263157894, + "loss": 4.9041, + "step": 783 + }, + { + "epoch": 1.00256, + "grad_norm": 0.9660894870758057, + "learning_rate": 0.0002841701440301521, + "loss": 4.9646, + "step": 784 + }, + { + "epoch": 1.00384, + "grad_norm": 0.8279880285263062, + "learning_rate": 0.00028412976174451473, + "loss": 4.9266, + "step": 785 + }, + { + "epoch": 1.00512, + "grad_norm": 0.8470652103424072, + "learning_rate": 0.00028408937945887736, + "loss": 4.8514, + "step": 786 + }, + { + "epoch": 1.0064, + "grad_norm": 0.7699068784713745, + "learning_rate": 0.00028404899717324, + "loss": 4.809, + "step": 787 + }, + { + "epoch": 1.00768, + "grad_norm": 0.7924918532371521, + "learning_rate": 0.0002840086148876026, + "loss": 4.8487, + "step": 788 + }, + { + "epoch": 1.00896, + "grad_norm": 0.7347840070724487, + "learning_rate": 0.00028396823260196525, + "loss": 4.8208, + "step": 789 + }, + { + "epoch": 1.01024, + "grad_norm": 0.6861421465873718, + "learning_rate": 0.0002839278503163279, + "loss": 4.8625, + "step": 790 + }, + { + "epoch": 1.01152, + "grad_norm": 0.6445014476776123, + "learning_rate": 0.0002838874680306905, + "loss": 4.916, + "step": 791 + }, + { + "epoch": 1.0128, + "grad_norm": 0.6200584769248962, + "learning_rate": 0.00028384708574505314, + "loss": 4.955, + "step": 792 + }, + { + "epoch": 1.01408, + "grad_norm": 0.5568555593490601, + "learning_rate": 0.00028380670345941577, + "loss": 4.9134, + "step": 793 + }, + { + "epoch": 1.01536, + "grad_norm": 0.6153519749641418, + "learning_rate": 0.0002837663211737784, + "loss": 4.8438, + "step": 794 + }, + { + "epoch": 1.01664, + "grad_norm": 0.7319142818450928, + "learning_rate": 0.00028372593888814103, + "loss": 4.919, + "step": 795 + }, + { + "epoch": 1.01792, + "grad_norm": 0.7483083605766296, + "learning_rate": 0.00028368555660250366, + "loss": 4.8444, + "step": 796 + }, + { + "epoch": 1.0192, + "grad_norm": 0.7571709156036377, + "learning_rate": 0.0002836451743168663, + "loss": 4.907, + "step": 797 + }, + { + "epoch": 1.02048, + "grad_norm": 0.7250765562057495, + "learning_rate": 0.0002836047920312289, + "loss": 4.9101, + "step": 798 + }, + { + "epoch": 1.02176, + "grad_norm": 0.7267122864723206, + "learning_rate": 0.00028356440974559155, + "loss": 4.8398, + "step": 799 + }, + { + "epoch": 1.02304, + "grad_norm": 0.7116459608078003, + "learning_rate": 0.0002835240274599542, + "loss": 4.8418, + "step": 800 + }, + { + "epoch": 1.02432, + "grad_norm": 0.6556388735771179, + "learning_rate": 0.00028348364517431687, + "loss": 4.8538, + "step": 801 + }, + { + "epoch": 1.0256, + "grad_norm": 0.6552951335906982, + "learning_rate": 0.0002834432628886795, + "loss": 4.8909, + "step": 802 + }, + { + "epoch": 1.02688, + "grad_norm": 0.7316877841949463, + "learning_rate": 0.00028340288060304213, + "loss": 4.8016, + "step": 803 + }, + { + "epoch": 1.02816, + "grad_norm": 0.7442478537559509, + "learning_rate": 0.0002833624983174047, + "loss": 4.8387, + "step": 804 + }, + { + "epoch": 1.02944, + "grad_norm": 0.6588963866233826, + "learning_rate": 0.0002833221160317674, + "loss": 4.8561, + "step": 805 + }, + { + "epoch": 1.03072, + "grad_norm": 0.6899812817573547, + "learning_rate": 0.00028328173374613, + "loss": 4.914, + "step": 806 + }, + { + "epoch": 1.032, + "grad_norm": 0.7177838087081909, + "learning_rate": 0.00028324135146049265, + "loss": 4.7727, + "step": 807 + }, + { + "epoch": 1.03328, + "grad_norm": 0.7071174383163452, + "learning_rate": 0.0002832009691748553, + "loss": 4.9105, + "step": 808 + }, + { + "epoch": 1.03456, + "grad_norm": 0.6587151288986206, + "learning_rate": 0.0002831605868892179, + "loss": 4.8851, + "step": 809 + }, + { + "epoch": 1.03584, + "grad_norm": 0.6153483390808105, + "learning_rate": 0.00028312020460358054, + "loss": 4.8638, + "step": 810 + }, + { + "epoch": 1.03712, + "grad_norm": 0.594373345375061, + "learning_rate": 0.00028307982231794317, + "loss": 4.865, + "step": 811 + }, + { + "epoch": 1.0384, + "grad_norm": 0.6585094332695007, + "learning_rate": 0.0002830394400323058, + "loss": 4.8542, + "step": 812 + }, + { + "epoch": 1.03968, + "grad_norm": 0.6621379852294922, + "learning_rate": 0.00028299905774666843, + "loss": 4.8349, + "step": 813 + }, + { + "epoch": 1.04096, + "grad_norm": 0.5880494713783264, + "learning_rate": 0.00028295867546103106, + "loss": 4.8509, + "step": 814 + }, + { + "epoch": 1.04224, + "grad_norm": 0.6204310059547424, + "learning_rate": 0.0002829182931753937, + "loss": 4.8359, + "step": 815 + }, + { + "epoch": 1.04352, + "grad_norm": 0.6215956211090088, + "learning_rate": 0.0002828779108897563, + "loss": 4.8127, + "step": 816 + }, + { + "epoch": 1.0448, + "grad_norm": 0.6148485541343689, + "learning_rate": 0.00028283752860411895, + "loss": 4.8714, + "step": 817 + }, + { + "epoch": 1.04608, + "grad_norm": 0.6803502440452576, + "learning_rate": 0.00028279714631848163, + "loss": 4.8163, + "step": 818 + }, + { + "epoch": 1.04736, + "grad_norm": 0.6470843553543091, + "learning_rate": 0.00028275676403284426, + "loss": 4.8039, + "step": 819 + }, + { + "epoch": 1.04864, + "grad_norm": 0.5897905230522156, + "learning_rate": 0.0002827163817472069, + "loss": 4.8399, + "step": 820 + }, + { + "epoch": 1.04992, + "grad_norm": 0.6008836030960083, + "learning_rate": 0.00028267599946156947, + "loss": 4.8523, + "step": 821 + }, + { + "epoch": 1.0512, + "grad_norm": 0.5722599029541016, + "learning_rate": 0.0002826356171759321, + "loss": 4.8057, + "step": 822 + }, + { + "epoch": 1.05248, + "grad_norm": 0.5825332403182983, + "learning_rate": 0.0002825952348902948, + "loss": 4.8341, + "step": 823 + }, + { + "epoch": 1.05376, + "grad_norm": 0.6503397822380066, + "learning_rate": 0.0002825548526046574, + "loss": 4.8925, + "step": 824 + }, + { + "epoch": 1.05504, + "grad_norm": 0.6947764158248901, + "learning_rate": 0.00028251447031902004, + "loss": 4.7832, + "step": 825 + }, + { + "epoch": 1.05632, + "grad_norm": 0.6167904734611511, + "learning_rate": 0.0002824740880333827, + "loss": 4.7814, + "step": 826 + }, + { + "epoch": 1.0576, + "grad_norm": 0.624954342842102, + "learning_rate": 0.0002824337057477453, + "loss": 4.7072, + "step": 827 + }, + { + "epoch": 1.05888, + "grad_norm": 0.7460287809371948, + "learning_rate": 0.00028239332346210793, + "loss": 4.7652, + "step": 828 + }, + { + "epoch": 1.06016, + "grad_norm": 0.6951721906661987, + "learning_rate": 0.00028235294117647056, + "loss": 4.8089, + "step": 829 + }, + { + "epoch": 1.06144, + "grad_norm": 0.7647305130958557, + "learning_rate": 0.0002823125588908332, + "loss": 4.8414, + "step": 830 + }, + { + "epoch": 1.06272, + "grad_norm": 0.7322429418563843, + "learning_rate": 0.0002822721766051958, + "loss": 4.787, + "step": 831 + }, + { + "epoch": 1.064, + "grad_norm": 0.6130174398422241, + "learning_rate": 0.00028223179431955845, + "loss": 4.8232, + "step": 832 + }, + { + "epoch": 1.06528, + "grad_norm": 0.7291455268859863, + "learning_rate": 0.0002821914120339211, + "loss": 4.9358, + "step": 833 + }, + { + "epoch": 1.06656, + "grad_norm": 0.7091724872589111, + "learning_rate": 0.0002821510297482837, + "loss": 4.7688, + "step": 834 + }, + { + "epoch": 1.06784, + "grad_norm": 0.6038631200790405, + "learning_rate": 0.0002821106474626464, + "loss": 4.8498, + "step": 835 + }, + { + "epoch": 1.06912, + "grad_norm": 0.5576080083847046, + "learning_rate": 0.00028207026517700903, + "loss": 4.7541, + "step": 836 + }, + { + "epoch": 1.0704, + "grad_norm": 0.6607750058174133, + "learning_rate": 0.0002820298828913716, + "loss": 4.7774, + "step": 837 + }, + { + "epoch": 1.07168, + "grad_norm": 0.6005157232284546, + "learning_rate": 0.00028198950060573423, + "loss": 4.8224, + "step": 838 + }, + { + "epoch": 1.07296, + "grad_norm": 0.7134353518486023, + "learning_rate": 0.00028194911832009686, + "loss": 4.7846, + "step": 839 + }, + { + "epoch": 1.07424, + "grad_norm": 0.8062997460365295, + "learning_rate": 0.00028190873603445955, + "loss": 4.8548, + "step": 840 + }, + { + "epoch": 1.07552, + "grad_norm": 0.7981602549552917, + "learning_rate": 0.0002818683537488222, + "loss": 4.8362, + "step": 841 + }, + { + "epoch": 1.0768, + "grad_norm": 0.7718906998634338, + "learning_rate": 0.0002818279714631848, + "loss": 4.8407, + "step": 842 + }, + { + "epoch": 1.07808, + "grad_norm": 0.7952736020088196, + "learning_rate": 0.00028178758917754744, + "loss": 4.8551, + "step": 843 + }, + { + "epoch": 1.07936, + "grad_norm": 0.7064546346664429, + "learning_rate": 0.00028174720689191007, + "loss": 4.755, + "step": 844 + }, + { + "epoch": 1.08064, + "grad_norm": 0.6656110882759094, + "learning_rate": 0.0002817068246062727, + "loss": 4.7786, + "step": 845 + }, + { + "epoch": 1.08192, + "grad_norm": 0.6304865479469299, + "learning_rate": 0.00028166644232063533, + "loss": 4.8153, + "step": 846 + }, + { + "epoch": 1.0832, + "grad_norm": 0.6220903992652893, + "learning_rate": 0.00028162606003499796, + "loss": 4.8118, + "step": 847 + }, + { + "epoch": 1.08448, + "grad_norm": 0.7010989189147949, + "learning_rate": 0.0002815856777493606, + "loss": 4.8627, + "step": 848 + }, + { + "epoch": 1.08576, + "grad_norm": 0.6893423795700073, + "learning_rate": 0.0002815452954637232, + "loss": 4.7906, + "step": 849 + }, + { + "epoch": 1.08704, + "grad_norm": 0.6807447075843811, + "learning_rate": 0.00028150491317808585, + "loss": 4.7955, + "step": 850 + }, + { + "epoch": 1.08832, + "grad_norm": 0.6272568702697754, + "learning_rate": 0.0002814645308924485, + "loss": 4.6878, + "step": 851 + }, + { + "epoch": 1.0896, + "grad_norm": 0.6356240510940552, + "learning_rate": 0.0002814241486068111, + "loss": 4.7682, + "step": 852 + }, + { + "epoch": 1.09088, + "grad_norm": 0.7134535312652588, + "learning_rate": 0.00028138376632117374, + "loss": 4.7629, + "step": 853 + }, + { + "epoch": 1.09216, + "grad_norm": 0.6460220813751221, + "learning_rate": 0.00028134338403553637, + "loss": 4.7132, + "step": 854 + }, + { + "epoch": 1.09344, + "grad_norm": 0.746244490146637, + "learning_rate": 0.000281303001749899, + "loss": 4.8273, + "step": 855 + }, + { + "epoch": 1.09472, + "grad_norm": 0.7077825665473938, + "learning_rate": 0.00028126261946426163, + "loss": 4.7677, + "step": 856 + }, + { + "epoch": 1.096, + "grad_norm": 0.652271568775177, + "learning_rate": 0.0002812222371786243, + "loss": 4.812, + "step": 857 + }, + { + "epoch": 1.09728, + "grad_norm": 0.6102200150489807, + "learning_rate": 0.00028118185489298694, + "loss": 4.7964, + "step": 858 + }, + { + "epoch": 1.09856, + "grad_norm": 0.7953206896781921, + "learning_rate": 0.0002811414726073496, + "loss": 4.7903, + "step": 859 + }, + { + "epoch": 1.09984, + "grad_norm": 0.641959547996521, + "learning_rate": 0.00028110109032171215, + "loss": 4.8177, + "step": 860 + }, + { + "epoch": 1.10112, + "grad_norm": 0.7065843939781189, + "learning_rate": 0.00028106070803607483, + "loss": 4.7971, + "step": 861 + }, + { + "epoch": 1.1024, + "grad_norm": 0.6949476003646851, + "learning_rate": 0.00028102032575043746, + "loss": 4.836, + "step": 862 + }, + { + "epoch": 1.10368, + "grad_norm": 0.7160281538963318, + "learning_rate": 0.0002809799434648001, + "loss": 4.7041, + "step": 863 + }, + { + "epoch": 1.10496, + "grad_norm": 0.6318966150283813, + "learning_rate": 0.0002809395611791627, + "loss": 4.7332, + "step": 864 + }, + { + "epoch": 1.1062400000000001, + "grad_norm": 0.6665629148483276, + "learning_rate": 0.00028089917889352535, + "loss": 4.8316, + "step": 865 + }, + { + "epoch": 1.10752, + "grad_norm": 0.6924526691436768, + "learning_rate": 0.000280858796607888, + "loss": 4.7881, + "step": 866 + }, + { + "epoch": 1.1088, + "grad_norm": 0.6507744789123535, + "learning_rate": 0.0002808184143222506, + "loss": 4.8198, + "step": 867 + }, + { + "epoch": 1.11008, + "grad_norm": 0.7218488454818726, + "learning_rate": 0.00028077803203661324, + "loss": 4.8512, + "step": 868 + }, + { + "epoch": 1.11136, + "grad_norm": 0.7143361568450928, + "learning_rate": 0.0002807376497509759, + "loss": 4.795, + "step": 869 + }, + { + "epoch": 1.11264, + "grad_norm": 0.6624107956886292, + "learning_rate": 0.0002806972674653385, + "loss": 4.7502, + "step": 870 + }, + { + "epoch": 1.11392, + "grad_norm": 0.6978575587272644, + "learning_rate": 0.00028065688517970114, + "loss": 4.8148, + "step": 871 + }, + { + "epoch": 1.1152, + "grad_norm": 0.6641087532043457, + "learning_rate": 0.00028061650289406377, + "loss": 4.7369, + "step": 872 + }, + { + "epoch": 1.11648, + "grad_norm": 0.7032644748687744, + "learning_rate": 0.0002805761206084264, + "loss": 4.7203, + "step": 873 + }, + { + "epoch": 1.11776, + "grad_norm": 0.6251493096351624, + "learning_rate": 0.0002805357383227891, + "loss": 4.8037, + "step": 874 + }, + { + "epoch": 1.11904, + "grad_norm": 0.6594140529632568, + "learning_rate": 0.0002804953560371517, + "loss": 4.7187, + "step": 875 + }, + { + "epoch": 1.12032, + "grad_norm": 0.6820217370986938, + "learning_rate": 0.0002804549737515143, + "loss": 4.7954, + "step": 876 + }, + { + "epoch": 1.1216, + "grad_norm": 0.7188774347305298, + "learning_rate": 0.0002804145914658769, + "loss": 4.7619, + "step": 877 + }, + { + "epoch": 1.12288, + "grad_norm": 0.7069806456565857, + "learning_rate": 0.00028037420918023955, + "loss": 4.7879, + "step": 878 + }, + { + "epoch": 1.12416, + "grad_norm": 0.5844770669937134, + "learning_rate": 0.00028033382689460223, + "loss": 4.7458, + "step": 879 + }, + { + "epoch": 1.12544, + "grad_norm": 0.6429082751274109, + "learning_rate": 0.00028029344460896486, + "loss": 4.6233, + "step": 880 + }, + { + "epoch": 1.12672, + "grad_norm": 0.736958384513855, + "learning_rate": 0.0002802530623233275, + "loss": 4.745, + "step": 881 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.6151041984558105, + "learning_rate": 0.0002802126800376901, + "loss": 4.6771, + "step": 882 + }, + { + "epoch": 1.12928, + "grad_norm": 0.6079396605491638, + "learning_rate": 0.00028017229775205275, + "loss": 4.8183, + "step": 883 + }, + { + "epoch": 1.13056, + "grad_norm": 0.6472486853599548, + "learning_rate": 0.0002801319154664154, + "loss": 4.713, + "step": 884 + }, + { + "epoch": 1.13184, + "grad_norm": 0.6157152056694031, + "learning_rate": 0.000280091533180778, + "loss": 4.727, + "step": 885 + }, + { + "epoch": 1.13312, + "grad_norm": 0.5990360975265503, + "learning_rate": 0.00028005115089514064, + "loss": 4.8425, + "step": 886 + }, + { + "epoch": 1.1344, + "grad_norm": 0.6508432030677795, + "learning_rate": 0.00028001076860950327, + "loss": 4.7481, + "step": 887 + }, + { + "epoch": 1.13568, + "grad_norm": 0.6805848479270935, + "learning_rate": 0.0002799703863238659, + "loss": 4.7301, + "step": 888 + }, + { + "epoch": 1.13696, + "grad_norm": 0.6594050526618958, + "learning_rate": 0.00027993000403822853, + "loss": 4.726, + "step": 889 + }, + { + "epoch": 1.13824, + "grad_norm": 0.5740160942077637, + "learning_rate": 0.00027988962175259116, + "loss": 4.7209, + "step": 890 + }, + { + "epoch": 1.13952, + "grad_norm": 0.6788250207901001, + "learning_rate": 0.0002798492394669538, + "loss": 4.7201, + "step": 891 + }, + { + "epoch": 1.1408, + "grad_norm": 0.6584210395812988, + "learning_rate": 0.0002798088571813165, + "loss": 4.7145, + "step": 892 + }, + { + "epoch": 1.14208, + "grad_norm": 0.6468678116798401, + "learning_rate": 0.00027976847489567905, + "loss": 4.6967, + "step": 893 + }, + { + "epoch": 1.14336, + "grad_norm": 0.6821480989456177, + "learning_rate": 0.0002797280926100417, + "loss": 4.7826, + "step": 894 + }, + { + "epoch": 1.1446399999999999, + "grad_norm": 0.6273735165596008, + "learning_rate": 0.0002796877103244043, + "loss": 4.7037, + "step": 895 + }, + { + "epoch": 1.14592, + "grad_norm": 0.6939446926116943, + "learning_rate": 0.000279647328038767, + "loss": 4.7438, + "step": 896 + }, + { + "epoch": 1.1472, + "grad_norm": 0.7489894032478333, + "learning_rate": 0.0002796069457531296, + "loss": 4.6979, + "step": 897 + }, + { + "epoch": 1.14848, + "grad_norm": 0.7742366194725037, + "learning_rate": 0.00027956656346749226, + "loss": 4.691, + "step": 898 + }, + { + "epoch": 1.1497600000000001, + "grad_norm": 0.7484803199768066, + "learning_rate": 0.00027952618118185483, + "loss": 4.8254, + "step": 899 + }, + { + "epoch": 1.15104, + "grad_norm": 0.8308618068695068, + "learning_rate": 0.0002794857988962175, + "loss": 4.7098, + "step": 900 + }, + { + "epoch": 1.15232, + "grad_norm": 0.6708440780639648, + "learning_rate": 0.00027944541661058015, + "loss": 4.8018, + "step": 901 + }, + { + "epoch": 1.1536, + "grad_norm": 0.6915727853775024, + "learning_rate": 0.0002794050343249428, + "loss": 4.6875, + "step": 902 + }, + { + "epoch": 1.15488, + "grad_norm": 0.6630460619926453, + "learning_rate": 0.0002793646520393054, + "loss": 4.7815, + "step": 903 + }, + { + "epoch": 1.15616, + "grad_norm": 0.6760913133621216, + "learning_rate": 0.00027932426975366804, + "loss": 4.7237, + "step": 904 + }, + { + "epoch": 1.15744, + "grad_norm": 0.6769623160362244, + "learning_rate": 0.00027928388746803067, + "loss": 4.814, + "step": 905 + }, + { + "epoch": 1.15872, + "grad_norm": 0.6976802945137024, + "learning_rate": 0.0002792435051823933, + "loss": 4.7954, + "step": 906 + }, + { + "epoch": 1.16, + "grad_norm": 0.5799634456634521, + "learning_rate": 0.0002792031228967559, + "loss": 4.6929, + "step": 907 + }, + { + "epoch": 1.16128, + "grad_norm": 0.6125800609588623, + "learning_rate": 0.00027916274061111856, + "loss": 4.6535, + "step": 908 + }, + { + "epoch": 1.16256, + "grad_norm": 0.6879367828369141, + "learning_rate": 0.0002791223583254812, + "loss": 4.7521, + "step": 909 + }, + { + "epoch": 1.16384, + "grad_norm": 0.737657368183136, + "learning_rate": 0.0002790819760398438, + "loss": 4.7687, + "step": 910 + }, + { + "epoch": 1.16512, + "grad_norm": 0.6375575065612793, + "learning_rate": 0.00027904159375420645, + "loss": 4.8383, + "step": 911 + }, + { + "epoch": 1.1663999999999999, + "grad_norm": 0.5931581258773804, + "learning_rate": 0.0002790012114685691, + "loss": 4.7189, + "step": 912 + }, + { + "epoch": 1.16768, + "grad_norm": 0.7057203054428101, + "learning_rate": 0.00027896082918293176, + "loss": 4.6494, + "step": 913 + }, + { + "epoch": 1.16896, + "grad_norm": 0.7094667553901672, + "learning_rate": 0.0002789204468972944, + "loss": 4.7491, + "step": 914 + }, + { + "epoch": 1.17024, + "grad_norm": 0.6365569233894348, + "learning_rate": 0.000278880064611657, + "loss": 4.7078, + "step": 915 + }, + { + "epoch": 1.1715200000000001, + "grad_norm": 0.7238393425941467, + "learning_rate": 0.0002788396823260196, + "loss": 4.6681, + "step": 916 + }, + { + "epoch": 1.1728, + "grad_norm": 0.7257333993911743, + "learning_rate": 0.0002787993000403822, + "loss": 4.759, + "step": 917 + }, + { + "epoch": 1.17408, + "grad_norm": 0.644520103931427, + "learning_rate": 0.0002787589177547449, + "loss": 4.7318, + "step": 918 + }, + { + "epoch": 1.17536, + "grad_norm": 0.6309005618095398, + "learning_rate": 0.00027871853546910754, + "loss": 4.7544, + "step": 919 + }, + { + "epoch": 1.17664, + "grad_norm": 0.715380847454071, + "learning_rate": 0.00027867815318347017, + "loss": 4.7209, + "step": 920 + }, + { + "epoch": 1.17792, + "grad_norm": 0.7284442186355591, + "learning_rate": 0.0002786377708978328, + "loss": 4.7191, + "step": 921 + }, + { + "epoch": 1.1792, + "grad_norm": 0.6391010284423828, + "learning_rate": 0.00027859738861219543, + "loss": 4.7282, + "step": 922 + }, + { + "epoch": 1.18048, + "grad_norm": 0.6188117861747742, + "learning_rate": 0.00027855700632655806, + "loss": 4.6801, + "step": 923 + }, + { + "epoch": 1.18176, + "grad_norm": 0.6303733587265015, + "learning_rate": 0.0002785166240409207, + "loss": 4.7246, + "step": 924 + }, + { + "epoch": 1.18304, + "grad_norm": 0.5999323129653931, + "learning_rate": 0.0002784762417552833, + "loss": 4.749, + "step": 925 + }, + { + "epoch": 1.18432, + "grad_norm": 0.6512132883071899, + "learning_rate": 0.00027843585946964595, + "loss": 4.6749, + "step": 926 + }, + { + "epoch": 1.1856, + "grad_norm": 0.6530429124832153, + "learning_rate": 0.0002783954771840086, + "loss": 4.7483, + "step": 927 + }, + { + "epoch": 1.18688, + "grad_norm": 0.6326019763946533, + "learning_rate": 0.0002783550948983712, + "loss": 4.6818, + "step": 928 + }, + { + "epoch": 1.1881599999999999, + "grad_norm": 0.6044508218765259, + "learning_rate": 0.00027831471261273384, + "loss": 4.737, + "step": 929 + }, + { + "epoch": 1.18944, + "grad_norm": 0.6268877983093262, + "learning_rate": 0.00027827433032709647, + "loss": 4.6278, + "step": 930 + }, + { + "epoch": 1.19072, + "grad_norm": 0.6803573966026306, + "learning_rate": 0.00027823394804145916, + "loss": 4.6907, + "step": 931 + }, + { + "epoch": 1.192, + "grad_norm": 0.7021913528442383, + "learning_rate": 0.00027819356575582173, + "loss": 4.683, + "step": 932 + }, + { + "epoch": 1.1932800000000001, + "grad_norm": 0.6417155861854553, + "learning_rate": 0.00027815318347018436, + "loss": 4.7198, + "step": 933 + }, + { + "epoch": 1.19456, + "grad_norm": 0.6091512441635132, + "learning_rate": 0.000278112801184547, + "loss": 4.694, + "step": 934 + }, + { + "epoch": 1.19584, + "grad_norm": 0.7351169586181641, + "learning_rate": 0.0002780724188989097, + "loss": 4.697, + "step": 935 + }, + { + "epoch": 1.19712, + "grad_norm": 0.7421384453773499, + "learning_rate": 0.0002780320366132723, + "loss": 4.6373, + "step": 936 + }, + { + "epoch": 1.1984, + "grad_norm": 0.745575487613678, + "learning_rate": 0.00027799165432763494, + "loss": 4.6504, + "step": 937 + }, + { + "epoch": 1.19968, + "grad_norm": 0.650227963924408, + "learning_rate": 0.00027795127204199757, + "loss": 4.6666, + "step": 938 + }, + { + "epoch": 1.20096, + "grad_norm": 0.6333515048027039, + "learning_rate": 0.0002779108897563602, + "loss": 4.6865, + "step": 939 + }, + { + "epoch": 1.20224, + "grad_norm": 0.6844230890274048, + "learning_rate": 0.00027787050747072283, + "loss": 4.6709, + "step": 940 + }, + { + "epoch": 1.20352, + "grad_norm": 0.6432576775550842, + "learning_rate": 0.00027783012518508546, + "loss": 4.7449, + "step": 941 + }, + { + "epoch": 1.2048, + "grad_norm": 0.7197453379631042, + "learning_rate": 0.0002777897428994481, + "loss": 4.6855, + "step": 942 + }, + { + "epoch": 1.20608, + "grad_norm": 0.8134915232658386, + "learning_rate": 0.0002777493606138107, + "loss": 4.7429, + "step": 943 + }, + { + "epoch": 1.20736, + "grad_norm": 0.7857049107551575, + "learning_rate": 0.00027770897832817335, + "loss": 4.7009, + "step": 944 + }, + { + "epoch": 1.20864, + "grad_norm": 0.6796789765357971, + "learning_rate": 0.000277668596042536, + "loss": 4.7261, + "step": 945 + }, + { + "epoch": 1.2099199999999999, + "grad_norm": 0.7167194485664368, + "learning_rate": 0.0002776282137568986, + "loss": 4.6099, + "step": 946 + }, + { + "epoch": 1.2112, + "grad_norm": 0.7006614804267883, + "learning_rate": 0.00027758783147126124, + "loss": 4.6235, + "step": 947 + }, + { + "epoch": 1.21248, + "grad_norm": 0.7764577865600586, + "learning_rate": 0.00027754744918562387, + "loss": 4.667, + "step": 948 + }, + { + "epoch": 1.21376, + "grad_norm": 0.6722946166992188, + "learning_rate": 0.0002775070668999865, + "loss": 4.7208, + "step": 949 + }, + { + "epoch": 1.2150400000000001, + "grad_norm": 0.6938252449035645, + "learning_rate": 0.00027746668461434913, + "loss": 4.6394, + "step": 950 + }, + { + "epoch": 1.21632, + "grad_norm": 0.7050387859344482, + "learning_rate": 0.00027742630232871176, + "loss": 4.6333, + "step": 951 + }, + { + "epoch": 1.2176, + "grad_norm": 0.7311280965805054, + "learning_rate": 0.00027738592004307444, + "loss": 4.6614, + "step": 952 + }, + { + "epoch": 1.21888, + "grad_norm": 0.6674593091011047, + "learning_rate": 0.00027734553775743707, + "loss": 4.739, + "step": 953 + }, + { + "epoch": 1.22016, + "grad_norm": 0.698934018611908, + "learning_rate": 0.0002773051554717997, + "loss": 4.6763, + "step": 954 + }, + { + "epoch": 1.22144, + "grad_norm": 0.7208249568939209, + "learning_rate": 0.0002772647731861623, + "loss": 4.6478, + "step": 955 + }, + { + "epoch": 1.22272, + "grad_norm": 0.6473603248596191, + "learning_rate": 0.00027722439090052496, + "loss": 4.6642, + "step": 956 + }, + { + "epoch": 1.224, + "grad_norm": 0.7811574339866638, + "learning_rate": 0.0002771840086148876, + "loss": 4.7188, + "step": 957 + }, + { + "epoch": 1.22528, + "grad_norm": 0.7508848309516907, + "learning_rate": 0.0002771436263292502, + "loss": 4.6873, + "step": 958 + }, + { + "epoch": 1.22656, + "grad_norm": 0.6528134942054749, + "learning_rate": 0.00027710324404361285, + "loss": 4.6341, + "step": 959 + }, + { + "epoch": 1.22784, + "grad_norm": 0.7064421772956848, + "learning_rate": 0.0002770628617579755, + "loss": 4.6409, + "step": 960 + }, + { + "epoch": 1.22912, + "grad_norm": 0.7718419432640076, + "learning_rate": 0.0002770224794723381, + "loss": 4.7507, + "step": 961 + }, + { + "epoch": 1.2304, + "grad_norm": 0.6676574945449829, + "learning_rate": 0.00027698209718670074, + "loss": 4.6491, + "step": 962 + }, + { + "epoch": 1.2316799999999999, + "grad_norm": 0.7909667491912842, + "learning_rate": 0.0002769417149010634, + "loss": 4.6478, + "step": 963 + }, + { + "epoch": 1.23296, + "grad_norm": 0.8206857442855835, + "learning_rate": 0.000276901332615426, + "loss": 4.6306, + "step": 964 + }, + { + "epoch": 1.23424, + "grad_norm": 0.6872743964195251, + "learning_rate": 0.00027686095032978863, + "loss": 4.6992, + "step": 965 + }, + { + "epoch": 1.23552, + "grad_norm": 0.659874439239502, + "learning_rate": 0.00027682056804415126, + "loss": 4.6612, + "step": 966 + }, + { + "epoch": 1.2368000000000001, + "grad_norm": 0.7639236450195312, + "learning_rate": 0.0002767801857585139, + "loss": 4.6523, + "step": 967 + }, + { + "epoch": 1.23808, + "grad_norm": 0.6640061140060425, + "learning_rate": 0.0002767398034728765, + "loss": 4.6604, + "step": 968 + }, + { + "epoch": 1.23936, + "grad_norm": 0.6309702396392822, + "learning_rate": 0.0002766994211872392, + "loss": 4.6036, + "step": 969 + }, + { + "epoch": 1.24064, + "grad_norm": 0.6865962147712708, + "learning_rate": 0.00027665903890160184, + "loss": 4.6821, + "step": 970 + }, + { + "epoch": 1.24192, + "grad_norm": 0.7177154421806335, + "learning_rate": 0.0002766186566159644, + "loss": 4.5896, + "step": 971 + }, + { + "epoch": 1.2432, + "grad_norm": 0.7251400351524353, + "learning_rate": 0.00027657827433032704, + "loss": 4.567, + "step": 972 + }, + { + "epoch": 1.24448, + "grad_norm": 0.6386240124702454, + "learning_rate": 0.0002765378920446897, + "loss": 4.5937, + "step": 973 + }, + { + "epoch": 1.24576, + "grad_norm": 0.666797935962677, + "learning_rate": 0.00027649750975905236, + "loss": 4.6854, + "step": 974 + }, + { + "epoch": 1.24704, + "grad_norm": 0.6357764601707458, + "learning_rate": 0.000276457127473415, + "loss": 4.623, + "step": 975 + }, + { + "epoch": 1.24832, + "grad_norm": 0.6851757168769836, + "learning_rate": 0.0002764167451877776, + "loss": 4.6517, + "step": 976 + }, + { + "epoch": 1.2496, + "grad_norm": 0.6265616416931152, + "learning_rate": 0.00027637636290214025, + "loss": 4.6597, + "step": 977 + }, + { + "epoch": 1.25088, + "grad_norm": 0.698387861251831, + "learning_rate": 0.0002763359806165029, + "loss": 4.5611, + "step": 978 + }, + { + "epoch": 1.25216, + "grad_norm": 0.6262152194976807, + "learning_rate": 0.0002762955983308655, + "loss": 4.6248, + "step": 979 + }, + { + "epoch": 1.2534399999999999, + "grad_norm": 0.675564706325531, + "learning_rate": 0.00027625521604522814, + "loss": 4.6137, + "step": 980 + }, + { + "epoch": 1.25472, + "grad_norm": 0.5722922682762146, + "learning_rate": 0.00027621483375959077, + "loss": 4.6744, + "step": 981 + }, + { + "epoch": 1.256, + "grad_norm": 0.6412468552589417, + "learning_rate": 0.0002761744514739534, + "loss": 4.5666, + "step": 982 + }, + { + "epoch": 1.25728, + "grad_norm": 0.7287204265594482, + "learning_rate": 0.00027613406918831603, + "loss": 4.6355, + "step": 983 + }, + { + "epoch": 1.2585600000000001, + "grad_norm": 0.742250382900238, + "learning_rate": 0.00027609368690267866, + "loss": 4.667, + "step": 984 + }, + { + "epoch": 1.25984, + "grad_norm": 0.6279268860816956, + "learning_rate": 0.0002760533046170413, + "loss": 4.5517, + "step": 985 + }, + { + "epoch": 1.26112, + "grad_norm": 0.6083707213401794, + "learning_rate": 0.0002760129223314039, + "loss": 4.5803, + "step": 986 + }, + { + "epoch": 1.2624, + "grad_norm": 0.6249653697013855, + "learning_rate": 0.0002759725400457666, + "loss": 4.6421, + "step": 987 + }, + { + "epoch": 1.26368, + "grad_norm": 0.5808601975440979, + "learning_rate": 0.0002759321577601292, + "loss": 4.6447, + "step": 988 + }, + { + "epoch": 1.2649599999999999, + "grad_norm": 0.6520289778709412, + "learning_rate": 0.0002758917754744918, + "loss": 4.7274, + "step": 989 + }, + { + "epoch": 1.26624, + "grad_norm": 0.7377476692199707, + "learning_rate": 0.00027585139318885444, + "loss": 4.625, + "step": 990 + }, + { + "epoch": 1.26752, + "grad_norm": 0.6483809947967529, + "learning_rate": 0.0002758110109032171, + "loss": 4.6238, + "step": 991 + }, + { + "epoch": 1.2688, + "grad_norm": 0.625088632106781, + "learning_rate": 0.00027577062861757975, + "loss": 4.6473, + "step": 992 + }, + { + "epoch": 1.27008, + "grad_norm": 0.6513147950172424, + "learning_rate": 0.0002757302463319424, + "loss": 4.5971, + "step": 993 + }, + { + "epoch": 1.27136, + "grad_norm": 0.6792516112327576, + "learning_rate": 0.000275689864046305, + "loss": 4.6855, + "step": 994 + }, + { + "epoch": 1.27264, + "grad_norm": 0.6334472894668579, + "learning_rate": 0.00027564948176066764, + "loss": 4.6, + "step": 995 + }, + { + "epoch": 1.27392, + "grad_norm": 0.6187386512756348, + "learning_rate": 0.0002756090994750303, + "loss": 4.6733, + "step": 996 + }, + { + "epoch": 1.2752, + "grad_norm": 0.6294391751289368, + "learning_rate": 0.0002755687171893929, + "loss": 4.6067, + "step": 997 + }, + { + "epoch": 1.27648, + "grad_norm": 0.6575160026550293, + "learning_rate": 0.00027552833490375553, + "loss": 4.662, + "step": 998 + }, + { + "epoch": 1.27776, + "grad_norm": 0.6640904545783997, + "learning_rate": 0.00027548795261811816, + "loss": 4.6074, + "step": 999 + }, + { + "epoch": 1.27904, + "grad_norm": 0.630664050579071, + "learning_rate": 0.0002754475703324808, + "loss": 4.6227, + "step": 1000 + }, + { + "epoch": 1.2803200000000001, + "grad_norm": 0.7079930901527405, + "learning_rate": 0.0002754071880468434, + "loss": 4.6341, + "step": 1001 + }, + { + "epoch": 1.2816, + "grad_norm": 0.7556226253509521, + "learning_rate": 0.00027536680576120605, + "loss": 4.6352, + "step": 1002 + }, + { + "epoch": 1.28288, + "grad_norm": 0.5955154895782471, + "learning_rate": 0.0002753264234755687, + "loss": 4.5975, + "step": 1003 + }, + { + "epoch": 1.28416, + "grad_norm": 0.7106191515922546, + "learning_rate": 0.0002752860411899313, + "loss": 4.6302, + "step": 1004 + }, + { + "epoch": 1.28544, + "grad_norm": 0.7254298329353333, + "learning_rate": 0.00027524565890429394, + "loss": 4.6382, + "step": 1005 + }, + { + "epoch": 1.2867199999999999, + "grad_norm": 0.6350436210632324, + "learning_rate": 0.0002752052766186566, + "loss": 4.6083, + "step": 1006 + }, + { + "epoch": 1.288, + "grad_norm": 0.7333962321281433, + "learning_rate": 0.0002751648943330192, + "loss": 4.6096, + "step": 1007 + }, + { + "epoch": 1.28928, + "grad_norm": 0.5859498977661133, + "learning_rate": 0.0002751245120473819, + "loss": 4.6632, + "step": 1008 + }, + { + "epoch": 1.29056, + "grad_norm": 0.7196768522262573, + "learning_rate": 0.0002750841297617445, + "loss": 4.7089, + "step": 1009 + }, + { + "epoch": 1.29184, + "grad_norm": 0.6264676451683044, + "learning_rate": 0.00027504374747610715, + "loss": 4.609, + "step": 1010 + }, + { + "epoch": 1.29312, + "grad_norm": 0.6764931082725525, + "learning_rate": 0.0002750033651904697, + "loss": 4.5912, + "step": 1011 + }, + { + "epoch": 1.2944, + "grad_norm": 0.6790111064910889, + "learning_rate": 0.00027496298290483236, + "loss": 4.6433, + "step": 1012 + }, + { + "epoch": 1.29568, + "grad_norm": 0.6634168028831482, + "learning_rate": 0.00027492260061919504, + "loss": 4.6097, + "step": 1013 + }, + { + "epoch": 1.29696, + "grad_norm": 0.63548344373703, + "learning_rate": 0.00027488221833355767, + "loss": 4.5944, + "step": 1014 + }, + { + "epoch": 1.29824, + "grad_norm": 0.6369843482971191, + "learning_rate": 0.0002748418360479203, + "loss": 4.5632, + "step": 1015 + }, + { + "epoch": 1.29952, + "grad_norm": 0.6853580474853516, + "learning_rate": 0.00027480145376228293, + "loss": 4.6141, + "step": 1016 + }, + { + "epoch": 1.3008, + "grad_norm": 0.648949921131134, + "learning_rate": 0.00027476107147664556, + "loss": 4.5685, + "step": 1017 + }, + { + "epoch": 1.3020800000000001, + "grad_norm": 0.5883113145828247, + "learning_rate": 0.0002747206891910082, + "loss": 4.6188, + "step": 1018 + }, + { + "epoch": 1.30336, + "grad_norm": 0.6754709482192993, + "learning_rate": 0.0002746803069053708, + "loss": 4.6567, + "step": 1019 + }, + { + "epoch": 1.30464, + "grad_norm": 0.6051660776138306, + "learning_rate": 0.00027463992461973345, + "loss": 4.6101, + "step": 1020 + }, + { + "epoch": 1.30592, + "grad_norm": 0.609302282333374, + "learning_rate": 0.0002745995423340961, + "loss": 4.5448, + "step": 1021 + }, + { + "epoch": 1.3072, + "grad_norm": 0.6241468191146851, + "learning_rate": 0.0002745591600484587, + "loss": 4.6168, + "step": 1022 + }, + { + "epoch": 1.3084799999999999, + "grad_norm": 0.6044094562530518, + "learning_rate": 0.00027451877776282134, + "loss": 4.6363, + "step": 1023 + }, + { + "epoch": 1.30976, + "grad_norm": 0.6301230788230896, + "learning_rate": 0.00027447839547718397, + "loss": 4.5699, + "step": 1024 + }, + { + "epoch": 1.31104, + "grad_norm": 0.6775204539299011, + "learning_rate": 0.0002744380131915466, + "loss": 4.582, + "step": 1025 + }, + { + "epoch": 1.31232, + "grad_norm": 0.631717324256897, + "learning_rate": 0.0002743976309059093, + "loss": 4.559, + "step": 1026 + }, + { + "epoch": 1.3136, + "grad_norm": 0.631300687789917, + "learning_rate": 0.00027435724862027186, + "loss": 4.5394, + "step": 1027 + }, + { + "epoch": 1.31488, + "grad_norm": 0.6998112201690674, + "learning_rate": 0.0002743168663346345, + "loss": 4.5337, + "step": 1028 + }, + { + "epoch": 1.31616, + "grad_norm": 0.6501991748809814, + "learning_rate": 0.0002742764840489971, + "loss": 4.5545, + "step": 1029 + }, + { + "epoch": 1.31744, + "grad_norm": 0.6726256608963013, + "learning_rate": 0.0002742361017633598, + "loss": 4.5374, + "step": 1030 + }, + { + "epoch": 1.31872, + "grad_norm": 0.6180437803268433, + "learning_rate": 0.00027419571947772244, + "loss": 4.5881, + "step": 1031 + }, + { + "epoch": 1.32, + "grad_norm": 0.6200023889541626, + "learning_rate": 0.00027415533719208507, + "loss": 4.5298, + "step": 1032 + }, + { + "epoch": 1.32128, + "grad_norm": 0.6141354441642761, + "learning_rate": 0.0002741149549064477, + "loss": 4.4838, + "step": 1033 + }, + { + "epoch": 1.32256, + "grad_norm": 0.600661039352417, + "learning_rate": 0.0002740745726208103, + "loss": 4.5947, + "step": 1034 + }, + { + "epoch": 1.3238400000000001, + "grad_norm": 0.7037731409072876, + "learning_rate": 0.00027403419033517296, + "loss": 4.544, + "step": 1035 + }, + { + "epoch": 1.32512, + "grad_norm": 0.6471102237701416, + "learning_rate": 0.0002739938080495356, + "loss": 4.5557, + "step": 1036 + }, + { + "epoch": 1.3264, + "grad_norm": 0.6341505646705627, + "learning_rate": 0.0002739534257638982, + "loss": 4.662, + "step": 1037 + }, + { + "epoch": 1.32768, + "grad_norm": 0.660749614238739, + "learning_rate": 0.00027391304347826085, + "loss": 4.628, + "step": 1038 + }, + { + "epoch": 1.32896, + "grad_norm": 0.6301653385162354, + "learning_rate": 0.0002738726611926235, + "loss": 4.5595, + "step": 1039 + }, + { + "epoch": 1.3302399999999999, + "grad_norm": 0.6750319004058838, + "learning_rate": 0.0002738322789069861, + "loss": 4.6398, + "step": 1040 + }, + { + "epoch": 1.33152, + "grad_norm": 0.6259351372718811, + "learning_rate": 0.00027379189662134874, + "loss": 4.6368, + "step": 1041 + }, + { + "epoch": 1.3328, + "grad_norm": 0.6357682347297668, + "learning_rate": 0.00027375151433571137, + "loss": 4.5332, + "step": 1042 + }, + { + "epoch": 1.33408, + "grad_norm": 0.6520000100135803, + "learning_rate": 0.000273711132050074, + "loss": 4.6516, + "step": 1043 + }, + { + "epoch": 1.33536, + "grad_norm": 0.7159348130226135, + "learning_rate": 0.0002736707497644366, + "loss": 4.5656, + "step": 1044 + }, + { + "epoch": 1.33664, + "grad_norm": 0.7513425946235657, + "learning_rate": 0.00027363036747879926, + "loss": 4.6232, + "step": 1045 + }, + { + "epoch": 1.33792, + "grad_norm": 0.7019796967506409, + "learning_rate": 0.0002735899851931619, + "loss": 4.516, + "step": 1046 + }, + { + "epoch": 1.3392, + "grad_norm": 0.5741701722145081, + "learning_rate": 0.00027354960290752457, + "loss": 4.5341, + "step": 1047 + }, + { + "epoch": 1.34048, + "grad_norm": 0.6854413151741028, + "learning_rate": 0.0002735092206218872, + "loss": 4.5648, + "step": 1048 + }, + { + "epoch": 1.34176, + "grad_norm": 0.599707305431366, + "learning_rate": 0.00027346883833624983, + "loss": 4.5988, + "step": 1049 + }, + { + "epoch": 1.34304, + "grad_norm": 0.6394950151443481, + "learning_rate": 0.0002734284560506124, + "loss": 4.5769, + "step": 1050 + }, + { + "epoch": 1.34432, + "grad_norm": 0.5611634850502014, + "learning_rate": 0.00027338807376497504, + "loss": 4.5538, + "step": 1051 + }, + { + "epoch": 1.3456000000000001, + "grad_norm": 0.6633642315864563, + "learning_rate": 0.0002733476914793377, + "loss": 4.4661, + "step": 1052 + }, + { + "epoch": 1.34688, + "grad_norm": 0.6434425711631775, + "learning_rate": 0.00027330730919370035, + "loss": 4.5914, + "step": 1053 + }, + { + "epoch": 1.34816, + "grad_norm": 0.6126766204833984, + "learning_rate": 0.000273266926908063, + "loss": 4.5852, + "step": 1054 + }, + { + "epoch": 1.34944, + "grad_norm": 0.6653432250022888, + "learning_rate": 0.0002732265446224256, + "loss": 4.603, + "step": 1055 + }, + { + "epoch": 1.35072, + "grad_norm": 0.7112525701522827, + "learning_rate": 0.00027318616233678824, + "loss": 4.5842, + "step": 1056 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.6654365062713623, + "learning_rate": 0.00027314578005115087, + "loss": 4.5841, + "step": 1057 + }, + { + "epoch": 1.35328, + "grad_norm": 0.6465290784835815, + "learning_rate": 0.0002731053977655135, + "loss": 4.5549, + "step": 1058 + }, + { + "epoch": 1.35456, + "grad_norm": 0.6904200911521912, + "learning_rate": 0.00027306501547987613, + "loss": 4.6345, + "step": 1059 + }, + { + "epoch": 1.35584, + "grad_norm": 0.5807105302810669, + "learning_rate": 0.00027302463319423876, + "loss": 4.4827, + "step": 1060 + }, + { + "epoch": 1.35712, + "grad_norm": 0.6408985257148743, + "learning_rate": 0.0002729842509086014, + "loss": 4.5378, + "step": 1061 + }, + { + "epoch": 1.3584, + "grad_norm": 0.5737316608428955, + "learning_rate": 0.000272943868622964, + "loss": 4.5322, + "step": 1062 + }, + { + "epoch": 1.35968, + "grad_norm": 0.6886394023895264, + "learning_rate": 0.00027290348633732665, + "loss": 4.5859, + "step": 1063 + }, + { + "epoch": 1.36096, + "grad_norm": 0.6456865072250366, + "learning_rate": 0.0002728631040516893, + "loss": 4.6009, + "step": 1064 + }, + { + "epoch": 1.36224, + "grad_norm": 0.6285141110420227, + "learning_rate": 0.00027282272176605197, + "loss": 4.6286, + "step": 1065 + }, + { + "epoch": 1.36352, + "grad_norm": 0.5843459963798523, + "learning_rate": 0.00027278233948041454, + "loss": 4.5497, + "step": 1066 + }, + { + "epoch": 1.3648, + "grad_norm": 0.6061887145042419, + "learning_rate": 0.00027274195719477717, + "loss": 4.6066, + "step": 1067 + }, + { + "epoch": 1.36608, + "grad_norm": 0.668018639087677, + "learning_rate": 0.0002727015749091398, + "loss": 4.5314, + "step": 1068 + }, + { + "epoch": 1.3673600000000001, + "grad_norm": 0.6002050042152405, + "learning_rate": 0.0002726611926235025, + "loss": 4.5271, + "step": 1069 + }, + { + "epoch": 1.36864, + "grad_norm": 0.6185527443885803, + "learning_rate": 0.0002726208103378651, + "loss": 4.5401, + "step": 1070 + }, + { + "epoch": 1.36992, + "grad_norm": 0.6066594123840332, + "learning_rate": 0.00027258042805222775, + "loss": 4.5988, + "step": 1071 + }, + { + "epoch": 1.3712, + "grad_norm": 0.6096265912055969, + "learning_rate": 0.0002725400457665904, + "loss": 4.5043, + "step": 1072 + }, + { + "epoch": 1.37248, + "grad_norm": 0.6521106958389282, + "learning_rate": 0.000272499663480953, + "loss": 4.5546, + "step": 1073 + }, + { + "epoch": 1.3737599999999999, + "grad_norm": 0.582575798034668, + "learning_rate": 0.00027245928119531564, + "loss": 4.4859, + "step": 1074 + }, + { + "epoch": 1.37504, + "grad_norm": 0.5959002375602722, + "learning_rate": 0.00027241889890967827, + "loss": 4.5375, + "step": 1075 + }, + { + "epoch": 1.37632, + "grad_norm": 0.7106321454048157, + "learning_rate": 0.0002723785166240409, + "loss": 4.53, + "step": 1076 + }, + { + "epoch": 1.3776, + "grad_norm": 0.648299515247345, + "learning_rate": 0.0002723381343384035, + "loss": 4.6495, + "step": 1077 + }, + { + "epoch": 1.37888, + "grad_norm": 0.7060708403587341, + "learning_rate": 0.00027229775205276616, + "loss": 4.5456, + "step": 1078 + }, + { + "epoch": 1.38016, + "grad_norm": 0.655817985534668, + "learning_rate": 0.0002722573697671288, + "loss": 4.5239, + "step": 1079 + }, + { + "epoch": 1.38144, + "grad_norm": 0.625787079334259, + "learning_rate": 0.0002722169874814914, + "loss": 4.5517, + "step": 1080 + }, + { + "epoch": 1.38272, + "grad_norm": 0.6610313057899475, + "learning_rate": 0.00027217660519585405, + "loss": 4.4966, + "step": 1081 + }, + { + "epoch": 1.384, + "grad_norm": 0.6236038208007812, + "learning_rate": 0.00027213622291021673, + "loss": 4.5368, + "step": 1082 + }, + { + "epoch": 1.38528, + "grad_norm": 0.7284337878227234, + "learning_rate": 0.0002720958406245793, + "loss": 4.5835, + "step": 1083 + }, + { + "epoch": 1.38656, + "grad_norm": 0.6679523587226868, + "learning_rate": 0.00027205545833894194, + "loss": 4.4663, + "step": 1084 + }, + { + "epoch": 1.38784, + "grad_norm": 0.6354249715805054, + "learning_rate": 0.00027201507605330457, + "loss": 4.4995, + "step": 1085 + }, + { + "epoch": 1.3891200000000001, + "grad_norm": 0.5772947669029236, + "learning_rate": 0.00027197469376766725, + "loss": 4.4817, + "step": 1086 + }, + { + "epoch": 1.3904, + "grad_norm": 0.6011262536048889, + "learning_rate": 0.0002719343114820299, + "loss": 4.5965, + "step": 1087 + }, + { + "epoch": 1.39168, + "grad_norm": 0.6319146752357483, + "learning_rate": 0.0002718939291963925, + "loss": 4.4057, + "step": 1088 + }, + { + "epoch": 1.39296, + "grad_norm": 0.6585641503334045, + "learning_rate": 0.00027185354691075514, + "loss": 4.4783, + "step": 1089 + }, + { + "epoch": 1.39424, + "grad_norm": 0.6011718511581421, + "learning_rate": 0.00027181316462511777, + "loss": 4.5192, + "step": 1090 + }, + { + "epoch": 1.3955199999999999, + "grad_norm": 0.6210798621177673, + "learning_rate": 0.0002717727823394804, + "loss": 4.5047, + "step": 1091 + }, + { + "epoch": 1.3968, + "grad_norm": 0.591758131980896, + "learning_rate": 0.00027173240005384303, + "loss": 4.5768, + "step": 1092 + }, + { + "epoch": 1.39808, + "grad_norm": 0.636916995048523, + "learning_rate": 0.00027169201776820566, + "loss": 4.5244, + "step": 1093 + }, + { + "epoch": 1.39936, + "grad_norm": 0.697331428527832, + "learning_rate": 0.0002716516354825683, + "loss": 4.5199, + "step": 1094 + }, + { + "epoch": 1.40064, + "grad_norm": 0.6302866339683533, + "learning_rate": 0.0002716112531969309, + "loss": 4.5211, + "step": 1095 + }, + { + "epoch": 1.40192, + "grad_norm": 0.7289062738418579, + "learning_rate": 0.00027157087091129355, + "loss": 4.4961, + "step": 1096 + }, + { + "epoch": 1.4032, + "grad_norm": 0.6740280985832214, + "learning_rate": 0.0002715304886256562, + "loss": 4.5848, + "step": 1097 + }, + { + "epoch": 1.40448, + "grad_norm": 0.6113433241844177, + "learning_rate": 0.0002714901063400188, + "loss": 4.482, + "step": 1098 + }, + { + "epoch": 1.40576, + "grad_norm": 0.6321492195129395, + "learning_rate": 0.00027144972405438144, + "loss": 4.463, + "step": 1099 + }, + { + "epoch": 1.40704, + "grad_norm": 0.6910666227340698, + "learning_rate": 0.00027140934176874407, + "loss": 4.53, + "step": 1100 + }, + { + "epoch": 1.40832, + "grad_norm": 0.6177242994308472, + "learning_rate": 0.0002713689594831067, + "loss": 4.5152, + "step": 1101 + }, + { + "epoch": 1.4096, + "grad_norm": 0.6206321716308594, + "learning_rate": 0.00027132857719746933, + "loss": 4.5393, + "step": 1102 + }, + { + "epoch": 1.4108800000000001, + "grad_norm": 0.6209645867347717, + "learning_rate": 0.000271288194911832, + "loss": 4.4624, + "step": 1103 + }, + { + "epoch": 1.41216, + "grad_norm": 0.7263031601905823, + "learning_rate": 0.00027124781262619465, + "loss": 4.5301, + "step": 1104 + }, + { + "epoch": 1.41344, + "grad_norm": 0.6254947781562805, + "learning_rate": 0.0002712074303405573, + "loss": 4.5234, + "step": 1105 + }, + { + "epoch": 1.41472, + "grad_norm": 0.6119322180747986, + "learning_rate": 0.00027116704805491985, + "loss": 4.4959, + "step": 1106 + }, + { + "epoch": 1.416, + "grad_norm": 0.6006280183792114, + "learning_rate": 0.0002711266657692825, + "loss": 4.5361, + "step": 1107 + }, + { + "epoch": 1.4172799999999999, + "grad_norm": 0.6143077611923218, + "learning_rate": 0.00027108628348364517, + "loss": 4.4948, + "step": 1108 + }, + { + "epoch": 1.41856, + "grad_norm": 0.6156766414642334, + "learning_rate": 0.0002710459011980078, + "loss": 4.502, + "step": 1109 + }, + { + "epoch": 1.41984, + "grad_norm": 0.6217614412307739, + "learning_rate": 0.00027100551891237043, + "loss": 4.4685, + "step": 1110 + }, + { + "epoch": 1.42112, + "grad_norm": 0.592921793460846, + "learning_rate": 0.00027096513662673306, + "loss": 4.5341, + "step": 1111 + }, + { + "epoch": 1.4224, + "grad_norm": 0.5901917815208435, + "learning_rate": 0.0002709247543410957, + "loss": 4.4953, + "step": 1112 + }, + { + "epoch": 1.42368, + "grad_norm": 0.7119625210762024, + "learning_rate": 0.0002708843720554583, + "loss": 4.5269, + "step": 1113 + }, + { + "epoch": 1.42496, + "grad_norm": 0.7488095164299011, + "learning_rate": 0.00027084398976982095, + "loss": 4.5553, + "step": 1114 + }, + { + "epoch": 1.42624, + "grad_norm": 0.6430398225784302, + "learning_rate": 0.0002708036074841836, + "loss": 4.524, + "step": 1115 + }, + { + "epoch": 1.42752, + "grad_norm": 0.6690952777862549, + "learning_rate": 0.0002707632251985462, + "loss": 4.5503, + "step": 1116 + }, + { + "epoch": 1.4288, + "grad_norm": 0.7325178980827332, + "learning_rate": 0.00027072284291290884, + "loss": 4.4675, + "step": 1117 + }, + { + "epoch": 1.43008, + "grad_norm": 0.6277335286140442, + "learning_rate": 0.00027068246062727147, + "loss": 4.5683, + "step": 1118 + }, + { + "epoch": 1.43136, + "grad_norm": 0.6613750457763672, + "learning_rate": 0.0002706420783416341, + "loss": 4.5421, + "step": 1119 + }, + { + "epoch": 1.4326400000000001, + "grad_norm": 0.6405130624771118, + "learning_rate": 0.00027060169605599673, + "loss": 4.4973, + "step": 1120 + }, + { + "epoch": 1.43392, + "grad_norm": 0.7388433218002319, + "learning_rate": 0.0002705613137703594, + "loss": 4.4663, + "step": 1121 + }, + { + "epoch": 1.4352, + "grad_norm": 0.7195430397987366, + "learning_rate": 0.000270520931484722, + "loss": 4.4404, + "step": 1122 + }, + { + "epoch": 1.43648, + "grad_norm": 0.5988063812255859, + "learning_rate": 0.0002704805491990846, + "loss": 4.4924, + "step": 1123 + }, + { + "epoch": 1.43776, + "grad_norm": 0.6605175733566284, + "learning_rate": 0.00027044016691344725, + "loss": 4.4393, + "step": 1124 + }, + { + "epoch": 1.4390399999999999, + "grad_norm": 0.6142287850379944, + "learning_rate": 0.00027039978462780993, + "loss": 4.4968, + "step": 1125 + }, + { + "epoch": 1.44032, + "grad_norm": 0.6107270121574402, + "learning_rate": 0.00027035940234217256, + "loss": 4.5411, + "step": 1126 + }, + { + "epoch": 1.4416, + "grad_norm": 0.5510809421539307, + "learning_rate": 0.0002703190200565352, + "loss": 4.5815, + "step": 1127 + }, + { + "epoch": 1.44288, + "grad_norm": 0.6231082081794739, + "learning_rate": 0.0002702786377708978, + "loss": 4.4505, + "step": 1128 + }, + { + "epoch": 1.44416, + "grad_norm": 0.6307341456413269, + "learning_rate": 0.00027023825548526045, + "loss": 4.5774, + "step": 1129 + }, + { + "epoch": 1.44544, + "grad_norm": 0.5655580759048462, + "learning_rate": 0.0002701978731996231, + "loss": 4.451, + "step": 1130 + }, + { + "epoch": 1.44672, + "grad_norm": 0.6464011669158936, + "learning_rate": 0.0002701574909139857, + "loss": 4.446, + "step": 1131 + }, + { + "epoch": 1.448, + "grad_norm": 0.6590990424156189, + "learning_rate": 0.00027011710862834834, + "loss": 4.5092, + "step": 1132 + }, + { + "epoch": 1.44928, + "grad_norm": 0.5793240666389465, + "learning_rate": 0.000270076726342711, + "loss": 4.4777, + "step": 1133 + }, + { + "epoch": 1.45056, + "grad_norm": 0.6022570729255676, + "learning_rate": 0.0002700363440570736, + "loss": 4.4337, + "step": 1134 + }, + { + "epoch": 1.45184, + "grad_norm": 0.6188995838165283, + "learning_rate": 0.00026999596177143623, + "loss": 4.5174, + "step": 1135 + }, + { + "epoch": 1.45312, + "grad_norm": 0.6144684553146362, + "learning_rate": 0.00026995557948579886, + "loss": 4.4565, + "step": 1136 + }, + { + "epoch": 1.4544000000000001, + "grad_norm": 0.6763806939125061, + "learning_rate": 0.0002699151972001615, + "loss": 4.4995, + "step": 1137 + }, + { + "epoch": 1.45568, + "grad_norm": 0.7390356063842773, + "learning_rate": 0.0002698748149145241, + "loss": 4.5674, + "step": 1138 + }, + { + "epoch": 1.45696, + "grad_norm": 0.6634504199028015, + "learning_rate": 0.00026983443262888675, + "loss": 4.5301, + "step": 1139 + }, + { + "epoch": 1.45824, + "grad_norm": 0.6377415060997009, + "learning_rate": 0.0002697940503432494, + "loss": 4.4919, + "step": 1140 + }, + { + "epoch": 1.45952, + "grad_norm": 0.6241118907928467, + "learning_rate": 0.000269753668057612, + "loss": 4.5093, + "step": 1141 + }, + { + "epoch": 1.4607999999999999, + "grad_norm": 0.676541805267334, + "learning_rate": 0.0002697132857719747, + "loss": 4.5714, + "step": 1142 + }, + { + "epoch": 1.46208, + "grad_norm": 0.6231991648674011, + "learning_rate": 0.00026967290348633733, + "loss": 4.494, + "step": 1143 + }, + { + "epoch": 1.46336, + "grad_norm": 0.6391655802726746, + "learning_rate": 0.00026963252120069996, + "loss": 4.5113, + "step": 1144 + }, + { + "epoch": 1.46464, + "grad_norm": 0.6239742636680603, + "learning_rate": 0.00026959213891506253, + "loss": 4.4657, + "step": 1145 + }, + { + "epoch": 1.4659200000000001, + "grad_norm": 0.6046785116195679, + "learning_rate": 0.00026955175662942516, + "loss": 4.5132, + "step": 1146 + }, + { + "epoch": 1.4672, + "grad_norm": 0.663732647895813, + "learning_rate": 0.00026951137434378785, + "loss": 4.5261, + "step": 1147 + }, + { + "epoch": 1.46848, + "grad_norm": 0.6262268424034119, + "learning_rate": 0.0002694709920581505, + "loss": 4.5257, + "step": 1148 + }, + { + "epoch": 1.46976, + "grad_norm": 0.552179217338562, + "learning_rate": 0.0002694306097725131, + "loss": 4.5232, + "step": 1149 + }, + { + "epoch": 1.47104, + "grad_norm": 0.6553817987442017, + "learning_rate": 0.00026939022748687574, + "loss": 4.442, + "step": 1150 + }, + { + "epoch": 1.47232, + "grad_norm": 0.6651473641395569, + "learning_rate": 0.00026934984520123837, + "loss": 4.5369, + "step": 1151 + }, + { + "epoch": 1.4736, + "grad_norm": 0.6147942543029785, + "learning_rate": 0.000269309462915601, + "loss": 4.5399, + "step": 1152 + }, + { + "epoch": 1.47488, + "grad_norm": 0.662200927734375, + "learning_rate": 0.00026926908062996363, + "loss": 4.5464, + "step": 1153 + }, + { + "epoch": 1.4761600000000001, + "grad_norm": 0.6553072929382324, + "learning_rate": 0.00026922869834432626, + "loss": 4.5623, + "step": 1154 + }, + { + "epoch": 1.47744, + "grad_norm": 0.5930311679840088, + "learning_rate": 0.0002691883160586889, + "loss": 4.4286, + "step": 1155 + }, + { + "epoch": 1.47872, + "grad_norm": 0.6711944341659546, + "learning_rate": 0.0002691479337730515, + "loss": 4.4298, + "step": 1156 + }, + { + "epoch": 1.48, + "grad_norm": 0.5974631905555725, + "learning_rate": 0.00026910755148741415, + "loss": 4.4365, + "step": 1157 + }, + { + "epoch": 1.48128, + "grad_norm": 0.6224148273468018, + "learning_rate": 0.0002690671692017768, + "loss": 4.4888, + "step": 1158 + }, + { + "epoch": 1.4825599999999999, + "grad_norm": 0.5823886394500732, + "learning_rate": 0.0002690267869161394, + "loss": 4.48, + "step": 1159 + }, + { + "epoch": 1.48384, + "grad_norm": 0.6274778842926025, + "learning_rate": 0.0002689864046305021, + "loss": 4.5036, + "step": 1160 + }, + { + "epoch": 1.48512, + "grad_norm": 0.6259602904319763, + "learning_rate": 0.0002689460223448647, + "loss": 4.5225, + "step": 1161 + }, + { + "epoch": 1.4864, + "grad_norm": 0.5894110798835754, + "learning_rate": 0.0002689056400592273, + "loss": 4.5004, + "step": 1162 + }, + { + "epoch": 1.4876800000000001, + "grad_norm": 0.6239898204803467, + "learning_rate": 0.00026886525777358993, + "loss": 4.5111, + "step": 1163 + }, + { + "epoch": 1.48896, + "grad_norm": 0.6786986589431763, + "learning_rate": 0.0002688248754879526, + "loss": 4.5008, + "step": 1164 + }, + { + "epoch": 1.49024, + "grad_norm": 0.5871908664703369, + "learning_rate": 0.00026878449320231524, + "loss": 4.5222, + "step": 1165 + }, + { + "epoch": 1.49152, + "grad_norm": 0.7175498604774475, + "learning_rate": 0.0002687441109166779, + "loss": 4.5113, + "step": 1166 + }, + { + "epoch": 1.4928, + "grad_norm": 0.6610713005065918, + "learning_rate": 0.0002687037286310405, + "loss": 4.489, + "step": 1167 + }, + { + "epoch": 1.49408, + "grad_norm": 0.6492409110069275, + "learning_rate": 0.00026866334634540313, + "loss": 4.5002, + "step": 1168 + }, + { + "epoch": 1.49536, + "grad_norm": 0.637315034866333, + "learning_rate": 0.00026862296405976576, + "loss": 4.503, + "step": 1169 + }, + { + "epoch": 1.49664, + "grad_norm": 0.6214714646339417, + "learning_rate": 0.0002685825817741284, + "loss": 4.5694, + "step": 1170 + }, + { + "epoch": 1.49792, + "grad_norm": 0.6326239109039307, + "learning_rate": 0.000268542199488491, + "loss": 4.4157, + "step": 1171 + }, + { + "epoch": 1.4992, + "grad_norm": 0.726486086845398, + "learning_rate": 0.00026850181720285366, + "loss": 4.445, + "step": 1172 + }, + { + "epoch": 1.50048, + "grad_norm": 0.5670982599258423, + "learning_rate": 0.0002684614349172163, + "loss": 4.5223, + "step": 1173 + }, + { + "epoch": 1.50176, + "grad_norm": 0.6748657822608948, + "learning_rate": 0.0002684210526315789, + "loss": 4.4798, + "step": 1174 + }, + { + "epoch": 1.50304, + "grad_norm": 0.6664227843284607, + "learning_rate": 0.00026838067034594155, + "loss": 4.5394, + "step": 1175 + }, + { + "epoch": 1.5043199999999999, + "grad_norm": 0.6093580722808838, + "learning_rate": 0.0002683402880603042, + "loss": 4.4771, + "step": 1176 + }, + { + "epoch": 1.5056, + "grad_norm": 0.593794047832489, + "learning_rate": 0.00026829990577466686, + "loss": 4.4568, + "step": 1177 + }, + { + "epoch": 1.50688, + "grad_norm": 0.5962923765182495, + "learning_rate": 0.00026825952348902944, + "loss": 4.5111, + "step": 1178 + }, + { + "epoch": 1.50816, + "grad_norm": 0.5938080549240112, + "learning_rate": 0.00026821914120339207, + "loss": 4.4462, + "step": 1179 + }, + { + "epoch": 1.5094400000000001, + "grad_norm": 0.600837767124176, + "learning_rate": 0.0002681787589177547, + "loss": 4.5177, + "step": 1180 + }, + { + "epoch": 1.51072, + "grad_norm": 0.601696252822876, + "learning_rate": 0.0002681383766321174, + "loss": 4.4331, + "step": 1181 + }, + { + "epoch": 1.512, + "grad_norm": 0.5914311408996582, + "learning_rate": 0.00026809799434648, + "loss": 4.4859, + "step": 1182 + }, + { + "epoch": 1.51328, + "grad_norm": 0.6770086288452148, + "learning_rate": 0.00026805761206084264, + "loss": 4.4447, + "step": 1183 + }, + { + "epoch": 1.51456, + "grad_norm": 0.6460147500038147, + "learning_rate": 0.00026801722977520527, + "loss": 4.4399, + "step": 1184 + }, + { + "epoch": 1.5158399999999999, + "grad_norm": 0.5961685180664062, + "learning_rate": 0.00026797684748956785, + "loss": 4.4354, + "step": 1185 + }, + { + "epoch": 1.51712, + "grad_norm": 0.6054593324661255, + "learning_rate": 0.00026793646520393053, + "loss": 4.3969, + "step": 1186 + }, + { + "epoch": 1.5184, + "grad_norm": 0.6508775949478149, + "learning_rate": 0.00026789608291829316, + "loss": 4.4683, + "step": 1187 + }, + { + "epoch": 1.5196800000000001, + "grad_norm": 0.7032009363174438, + "learning_rate": 0.0002678557006326558, + "loss": 4.4864, + "step": 1188 + }, + { + "epoch": 1.52096, + "grad_norm": 0.6030545234680176, + "learning_rate": 0.0002678153183470184, + "loss": 4.389, + "step": 1189 + }, + { + "epoch": 1.52224, + "grad_norm": 0.5884782075881958, + "learning_rate": 0.00026777493606138105, + "loss": 4.5031, + "step": 1190 + }, + { + "epoch": 1.52352, + "grad_norm": 0.6099117994308472, + "learning_rate": 0.0002677345537757437, + "loss": 4.4668, + "step": 1191 + }, + { + "epoch": 1.5248, + "grad_norm": 0.5954748392105103, + "learning_rate": 0.0002676941714901063, + "loss": 4.4554, + "step": 1192 + }, + { + "epoch": 1.5260799999999999, + "grad_norm": 0.5383332371711731, + "learning_rate": 0.00026765378920446894, + "loss": 4.5648, + "step": 1193 + }, + { + "epoch": 1.52736, + "grad_norm": 0.5823171138763428, + "learning_rate": 0.00026761340691883157, + "loss": 4.5046, + "step": 1194 + }, + { + "epoch": 1.52864, + "grad_norm": 0.6584584712982178, + "learning_rate": 0.0002675730246331942, + "loss": 4.4529, + "step": 1195 + }, + { + "epoch": 1.52992, + "grad_norm": 0.5978401899337769, + "learning_rate": 0.00026753264234755683, + "loss": 4.4333, + "step": 1196 + }, + { + "epoch": 1.5312000000000001, + "grad_norm": 0.591083288192749, + "learning_rate": 0.00026749226006191946, + "loss": 4.4279, + "step": 1197 + }, + { + "epoch": 1.53248, + "grad_norm": 0.5882396101951599, + "learning_rate": 0.00026745187777628215, + "loss": 4.3951, + "step": 1198 + }, + { + "epoch": 1.53376, + "grad_norm": 0.571337103843689, + "learning_rate": 0.0002674114954906448, + "loss": 4.4579, + "step": 1199 + }, + { + "epoch": 1.53504, + "grad_norm": 0.5543642044067383, + "learning_rate": 0.0002673711132050074, + "loss": 4.4323, + "step": 1200 + }, + { + "epoch": 1.53632, + "grad_norm": 0.5652990341186523, + "learning_rate": 0.00026733073091937, + "loss": 4.4284, + "step": 1201 + }, + { + "epoch": 1.5375999999999999, + "grad_norm": 0.5995742678642273, + "learning_rate": 0.0002672903486337326, + "loss": 4.4789, + "step": 1202 + }, + { + "epoch": 1.53888, + "grad_norm": 0.5706351399421692, + "learning_rate": 0.0002672499663480953, + "loss": 4.4814, + "step": 1203 + }, + { + "epoch": 1.54016, + "grad_norm": 0.5830731391906738, + "learning_rate": 0.0002672095840624579, + "loss": 4.4525, + "step": 1204 + }, + { + "epoch": 1.5414400000000001, + "grad_norm": 0.6036829352378845, + "learning_rate": 0.00026716920177682056, + "loss": 4.3816, + "step": 1205 + }, + { + "epoch": 1.54272, + "grad_norm": 0.5913065671920776, + "learning_rate": 0.0002671288194911832, + "loss": 4.4997, + "step": 1206 + }, + { + "epoch": 1.544, + "grad_norm": 0.6033707857131958, + "learning_rate": 0.0002670884372055458, + "loss": 4.4725, + "step": 1207 + }, + { + "epoch": 1.54528, + "grad_norm": 0.6108120679855347, + "learning_rate": 0.00026704805491990845, + "loss": 4.4673, + "step": 1208 + }, + { + "epoch": 1.54656, + "grad_norm": 0.6634685397148132, + "learning_rate": 0.0002670076726342711, + "loss": 4.4737, + "step": 1209 + }, + { + "epoch": 1.5478399999999999, + "grad_norm": 0.5890428423881531, + "learning_rate": 0.0002669672903486337, + "loss": 4.4499, + "step": 1210 + }, + { + "epoch": 1.54912, + "grad_norm": 0.6377815008163452, + "learning_rate": 0.00026692690806299634, + "loss": 4.4546, + "step": 1211 + }, + { + "epoch": 1.5504, + "grad_norm": 0.6385360956192017, + "learning_rate": 0.00026688652577735897, + "loss": 4.4226, + "step": 1212 + }, + { + "epoch": 1.55168, + "grad_norm": 0.5952944159507751, + "learning_rate": 0.0002668461434917216, + "loss": 4.4762, + "step": 1213 + }, + { + "epoch": 1.5529600000000001, + "grad_norm": 0.5970460772514343, + "learning_rate": 0.0002668057612060842, + "loss": 4.4617, + "step": 1214 + }, + { + "epoch": 1.55424, + "grad_norm": 0.6275203227996826, + "learning_rate": 0.00026676537892044686, + "loss": 4.4252, + "step": 1215 + }, + { + "epoch": 1.55552, + "grad_norm": 0.6631754636764526, + "learning_rate": 0.00026672499663480954, + "loss": 4.5075, + "step": 1216 + }, + { + "epoch": 1.5568, + "grad_norm": 0.6074349880218506, + "learning_rate": 0.0002666846143491721, + "loss": 4.442, + "step": 1217 + }, + { + "epoch": 1.55808, + "grad_norm": 0.6577631235122681, + "learning_rate": 0.00026664423206353475, + "loss": 4.4871, + "step": 1218 + }, + { + "epoch": 1.5593599999999999, + "grad_norm": 0.6167892217636108, + "learning_rate": 0.0002666038497778974, + "loss": 4.4119, + "step": 1219 + }, + { + "epoch": 1.56064, + "grad_norm": 0.6109944581985474, + "learning_rate": 0.00026656346749226006, + "loss": 4.439, + "step": 1220 + }, + { + "epoch": 1.56192, + "grad_norm": 0.6446990966796875, + "learning_rate": 0.0002665230852066227, + "loss": 4.4472, + "step": 1221 + }, + { + "epoch": 1.5632000000000001, + "grad_norm": 0.5861752033233643, + "learning_rate": 0.0002664827029209853, + "loss": 4.4121, + "step": 1222 + }, + { + "epoch": 1.56448, + "grad_norm": 0.620345413684845, + "learning_rate": 0.00026644232063534795, + "loss": 4.5141, + "step": 1223 + }, + { + "epoch": 1.56576, + "grad_norm": 0.6211066842079163, + "learning_rate": 0.0002664019383497106, + "loss": 4.4676, + "step": 1224 + }, + { + "epoch": 1.56704, + "grad_norm": 0.5987910628318787, + "learning_rate": 0.0002663615560640732, + "loss": 4.4051, + "step": 1225 + }, + { + "epoch": 1.56832, + "grad_norm": 0.6385419368743896, + "learning_rate": 0.00026632117377843584, + "loss": 4.457, + "step": 1226 + }, + { + "epoch": 1.5695999999999999, + "grad_norm": 0.6010638475418091, + "learning_rate": 0.00026628079149279847, + "loss": 4.4322, + "step": 1227 + }, + { + "epoch": 1.57088, + "grad_norm": 0.6236041784286499, + "learning_rate": 0.0002662404092071611, + "loss": 4.4491, + "step": 1228 + }, + { + "epoch": 1.57216, + "grad_norm": 0.6467570066452026, + "learning_rate": 0.00026620002692152373, + "loss": 4.4418, + "step": 1229 + }, + { + "epoch": 1.57344, + "grad_norm": 0.5972265601158142, + "learning_rate": 0.00026615964463588636, + "loss": 4.4185, + "step": 1230 + }, + { + "epoch": 1.5747200000000001, + "grad_norm": 0.6430701017379761, + "learning_rate": 0.000266119262350249, + "loss": 4.4144, + "step": 1231 + }, + { + "epoch": 1.576, + "grad_norm": 0.5837542414665222, + "learning_rate": 0.0002660788800646116, + "loss": 4.4138, + "step": 1232 + }, + { + "epoch": 1.57728, + "grad_norm": 0.6243807077407837, + "learning_rate": 0.0002660384977789743, + "loss": 4.4507, + "step": 1233 + }, + { + "epoch": 1.57856, + "grad_norm": 0.5919942259788513, + "learning_rate": 0.0002659981154933369, + "loss": 4.4508, + "step": 1234 + }, + { + "epoch": 1.57984, + "grad_norm": 0.540010929107666, + "learning_rate": 0.0002659577332076995, + "loss": 4.3369, + "step": 1235 + }, + { + "epoch": 1.5811199999999999, + "grad_norm": 0.568360447883606, + "learning_rate": 0.00026591735092206214, + "loss": 4.4326, + "step": 1236 + }, + { + "epoch": 1.5824, + "grad_norm": 0.5942397117614746, + "learning_rate": 0.0002658769686364248, + "loss": 4.412, + "step": 1237 + }, + { + "epoch": 1.58368, + "grad_norm": 0.5995660424232483, + "learning_rate": 0.00026583658635078746, + "loss": 4.4086, + "step": 1238 + }, + { + "epoch": 1.5849600000000001, + "grad_norm": 0.620974063873291, + "learning_rate": 0.0002657962040651501, + "loss": 4.3333, + "step": 1239 + }, + { + "epoch": 1.58624, + "grad_norm": 0.6151325702667236, + "learning_rate": 0.00026575582177951266, + "loss": 4.4844, + "step": 1240 + }, + { + "epoch": 1.58752, + "grad_norm": 0.640434741973877, + "learning_rate": 0.0002657154394938753, + "loss": 4.3976, + "step": 1241 + }, + { + "epoch": 1.5888, + "grad_norm": 0.5995258688926697, + "learning_rate": 0.000265675057208238, + "loss": 4.3342, + "step": 1242 + }, + { + "epoch": 1.59008, + "grad_norm": 0.5680806636810303, + "learning_rate": 0.0002656346749226006, + "loss": 4.4304, + "step": 1243 + }, + { + "epoch": 1.5913599999999999, + "grad_norm": 0.6433593034744263, + "learning_rate": 0.00026559429263696324, + "loss": 4.375, + "step": 1244 + }, + { + "epoch": 1.5926399999999998, + "grad_norm": 0.5471886992454529, + "learning_rate": 0.00026555391035132587, + "loss": 4.3626, + "step": 1245 + }, + { + "epoch": 1.59392, + "grad_norm": 0.6657853126525879, + "learning_rate": 0.0002655135280656885, + "loss": 4.3994, + "step": 1246 + }, + { + "epoch": 1.5952, + "grad_norm": 0.6484869122505188, + "learning_rate": 0.00026547314578005113, + "loss": 4.4166, + "step": 1247 + }, + { + "epoch": 1.5964800000000001, + "grad_norm": 0.6245217323303223, + "learning_rate": 0.00026543276349441376, + "loss": 4.3975, + "step": 1248 + }, + { + "epoch": 1.59776, + "grad_norm": 0.6242031455039978, + "learning_rate": 0.0002653923812087764, + "loss": 4.4047, + "step": 1249 + }, + { + "epoch": 1.59904, + "grad_norm": 0.6540752649307251, + "learning_rate": 0.000265351998923139, + "loss": 4.3767, + "step": 1250 + }, + { + "epoch": 1.60032, + "grad_norm": 0.5985410213470459, + "learning_rate": 0.00026531161663750165, + "loss": 4.4277, + "step": 1251 + }, + { + "epoch": 1.6016, + "grad_norm": 0.7160242795944214, + "learning_rate": 0.0002652712343518643, + "loss": 4.3891, + "step": 1252 + }, + { + "epoch": 1.6028799999999999, + "grad_norm": 0.5682446360588074, + "learning_rate": 0.0002652308520662269, + "loss": 4.3845, + "step": 1253 + }, + { + "epoch": 1.60416, + "grad_norm": 0.6273159980773926, + "learning_rate": 0.00026519046978058954, + "loss": 4.3962, + "step": 1254 + }, + { + "epoch": 1.60544, + "grad_norm": 0.631803035736084, + "learning_rate": 0.0002651500874949522, + "loss": 4.4692, + "step": 1255 + }, + { + "epoch": 1.6067200000000001, + "grad_norm": 0.5755056142807007, + "learning_rate": 0.00026510970520931485, + "loss": 4.4311, + "step": 1256 + }, + { + "epoch": 1.608, + "grad_norm": 0.5929430723190308, + "learning_rate": 0.00026506932292367743, + "loss": 4.4624, + "step": 1257 + }, + { + "epoch": 1.60928, + "grad_norm": 0.6523311138153076, + "learning_rate": 0.00026502894063804006, + "loss": 4.417, + "step": 1258 + }, + { + "epoch": 1.61056, + "grad_norm": 0.5619366765022278, + "learning_rate": 0.00026498855835240274, + "loss": 4.3603, + "step": 1259 + }, + { + "epoch": 1.61184, + "grad_norm": 0.60685795545578, + "learning_rate": 0.00026494817606676537, + "loss": 4.4113, + "step": 1260 + }, + { + "epoch": 1.6131199999999999, + "grad_norm": 0.5795391201972961, + "learning_rate": 0.000264907793781128, + "loss": 4.3658, + "step": 1261 + }, + { + "epoch": 1.6143999999999998, + "grad_norm": 0.6668011546134949, + "learning_rate": 0.00026486741149549063, + "loss": 4.3773, + "step": 1262 + }, + { + "epoch": 1.61568, + "grad_norm": 0.7039081454277039, + "learning_rate": 0.00026482702920985326, + "loss": 4.4388, + "step": 1263 + }, + { + "epoch": 1.61696, + "grad_norm": 0.6980475187301636, + "learning_rate": 0.0002647866469242159, + "loss": 4.4047, + "step": 1264 + }, + { + "epoch": 1.6182400000000001, + "grad_norm": 0.6274135112762451, + "learning_rate": 0.0002647462646385785, + "loss": 4.3472, + "step": 1265 + }, + { + "epoch": 1.61952, + "grad_norm": 0.5991054773330688, + "learning_rate": 0.00026470588235294115, + "loss": 4.402, + "step": 1266 + }, + { + "epoch": 1.6208, + "grad_norm": 0.5963894724845886, + "learning_rate": 0.0002646655000673038, + "loss": 4.4285, + "step": 1267 + }, + { + "epoch": 1.62208, + "grad_norm": 0.6448376178741455, + "learning_rate": 0.0002646251177816664, + "loss": 4.3825, + "step": 1268 + }, + { + "epoch": 1.62336, + "grad_norm": 0.5770543217658997, + "learning_rate": 0.00026458473549602904, + "loss": 4.3484, + "step": 1269 + }, + { + "epoch": 1.6246399999999999, + "grad_norm": 0.6031239032745361, + "learning_rate": 0.0002645443532103917, + "loss": 4.3538, + "step": 1270 + }, + { + "epoch": 1.62592, + "grad_norm": 0.5922113060951233, + "learning_rate": 0.0002645039709247543, + "loss": 4.3903, + "step": 1271 + }, + { + "epoch": 1.6272, + "grad_norm": 0.5830681324005127, + "learning_rate": 0.000264463588639117, + "loss": 4.4233, + "step": 1272 + }, + { + "epoch": 1.6284800000000001, + "grad_norm": 0.5209598541259766, + "learning_rate": 0.00026442320635347956, + "loss": 4.404, + "step": 1273 + }, + { + "epoch": 1.62976, + "grad_norm": 0.6153559684753418, + "learning_rate": 0.0002643828240678422, + "loss": 4.3666, + "step": 1274 + }, + { + "epoch": 1.63104, + "grad_norm": 0.566469132900238, + "learning_rate": 0.0002643424417822048, + "loss": 4.3731, + "step": 1275 + }, + { + "epoch": 1.63232, + "grad_norm": 0.5851961374282837, + "learning_rate": 0.0002643020594965675, + "loss": 4.3827, + "step": 1276 + }, + { + "epoch": 1.6336, + "grad_norm": 0.5738173127174377, + "learning_rate": 0.00026426167721093014, + "loss": 4.3779, + "step": 1277 + }, + { + "epoch": 1.6348799999999999, + "grad_norm": 0.5467177629470825, + "learning_rate": 0.00026422129492529277, + "loss": 4.3595, + "step": 1278 + }, + { + "epoch": 1.6361599999999998, + "grad_norm": 0.5850498080253601, + "learning_rate": 0.0002641809126396554, + "loss": 4.3581, + "step": 1279 + }, + { + "epoch": 1.63744, + "grad_norm": 0.5361942052841187, + "learning_rate": 0.000264140530354018, + "loss": 4.4323, + "step": 1280 + }, + { + "epoch": 1.63872, + "grad_norm": 0.5595226883888245, + "learning_rate": 0.00026410014806838066, + "loss": 4.4528, + "step": 1281 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.6067069172859192, + "learning_rate": 0.0002640597657827433, + "loss": 4.3867, + "step": 1282 + }, + { + "epoch": 1.64128, + "grad_norm": 0.5635828971862793, + "learning_rate": 0.0002640193834971059, + "loss": 4.3671, + "step": 1283 + }, + { + "epoch": 1.64256, + "grad_norm": 0.6461728811264038, + "learning_rate": 0.00026397900121146855, + "loss": 4.3887, + "step": 1284 + }, + { + "epoch": 1.64384, + "grad_norm": 0.5938706398010254, + "learning_rate": 0.0002639386189258312, + "loss": 4.4002, + "step": 1285 + }, + { + "epoch": 1.64512, + "grad_norm": 0.6693472862243652, + "learning_rate": 0.0002638982366401938, + "loss": 4.3767, + "step": 1286 + }, + { + "epoch": 1.6463999999999999, + "grad_norm": 0.6100233197212219, + "learning_rate": 0.00026385785435455644, + "loss": 4.4038, + "step": 1287 + }, + { + "epoch": 1.64768, + "grad_norm": 0.5637868642807007, + "learning_rate": 0.00026381747206891907, + "loss": 4.3205, + "step": 1288 + }, + { + "epoch": 1.64896, + "grad_norm": 0.6211172342300415, + "learning_rate": 0.0002637770897832817, + "loss": 4.3709, + "step": 1289 + }, + { + "epoch": 1.6502400000000002, + "grad_norm": 0.5223026871681213, + "learning_rate": 0.00026373670749764433, + "loss": 4.381, + "step": 1290 + }, + { + "epoch": 1.65152, + "grad_norm": 0.5736677646636963, + "learning_rate": 0.00026369632521200696, + "loss": 4.3631, + "step": 1291 + }, + { + "epoch": 1.6528, + "grad_norm": 0.5660174489021301, + "learning_rate": 0.0002636559429263696, + "loss": 4.4151, + "step": 1292 + }, + { + "epoch": 1.65408, + "grad_norm": 0.5827959775924683, + "learning_rate": 0.0002636155606407322, + "loss": 4.2954, + "step": 1293 + }, + { + "epoch": 1.65536, + "grad_norm": 0.6205708384513855, + "learning_rate": 0.0002635751783550949, + "loss": 4.3476, + "step": 1294 + }, + { + "epoch": 1.65664, + "grad_norm": 0.599597692489624, + "learning_rate": 0.00026353479606945753, + "loss": 4.3624, + "step": 1295 + }, + { + "epoch": 1.6579199999999998, + "grad_norm": 0.6274026036262512, + "learning_rate": 0.0002634944137838201, + "loss": 4.4197, + "step": 1296 + }, + { + "epoch": 1.6592, + "grad_norm": 0.570179283618927, + "learning_rate": 0.00026345403149818274, + "loss": 4.4012, + "step": 1297 + }, + { + "epoch": 1.66048, + "grad_norm": 0.618751049041748, + "learning_rate": 0.0002634136492125454, + "loss": 4.3122, + "step": 1298 + }, + { + "epoch": 1.6617600000000001, + "grad_norm": 0.6431691646575928, + "learning_rate": 0.00026337326692690805, + "loss": 4.4175, + "step": 1299 + }, + { + "epoch": 1.66304, + "grad_norm": 0.6071829199790955, + "learning_rate": 0.0002633328846412707, + "loss": 4.394, + "step": 1300 + }, + { + "epoch": 1.66432, + "grad_norm": 0.5877499580383301, + "learning_rate": 0.0002632925023556333, + "loss": 4.3989, + "step": 1301 + }, + { + "epoch": 1.6656, + "grad_norm": 0.5928570628166199, + "learning_rate": 0.00026325212006999594, + "loss": 4.4153, + "step": 1302 + }, + { + "epoch": 1.66688, + "grad_norm": 0.5851948261260986, + "learning_rate": 0.0002632117377843586, + "loss": 4.4008, + "step": 1303 + }, + { + "epoch": 1.6681599999999999, + "grad_norm": 0.5977253317832947, + "learning_rate": 0.0002631713554987212, + "loss": 4.3256, + "step": 1304 + }, + { + "epoch": 1.66944, + "grad_norm": 0.5974704027175903, + "learning_rate": 0.00026313097321308383, + "loss": 4.3814, + "step": 1305 + }, + { + "epoch": 1.67072, + "grad_norm": 0.631009578704834, + "learning_rate": 0.00026309059092744646, + "loss": 4.3496, + "step": 1306 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.5566964149475098, + "learning_rate": 0.0002630502086418091, + "loss": 4.4002, + "step": 1307 + }, + { + "epoch": 1.67328, + "grad_norm": 0.5787959694862366, + "learning_rate": 0.0002630098263561717, + "loss": 4.3317, + "step": 1308 + }, + { + "epoch": 1.67456, + "grad_norm": 0.6714434623718262, + "learning_rate": 0.00026296944407053435, + "loss": 4.3374, + "step": 1309 + }, + { + "epoch": 1.67584, + "grad_norm": 0.6594877243041992, + "learning_rate": 0.000262929061784897, + "loss": 4.4183, + "step": 1310 + }, + { + "epoch": 1.67712, + "grad_norm": 0.5786787867546082, + "learning_rate": 0.00026288867949925967, + "loss": 4.3543, + "step": 1311 + }, + { + "epoch": 1.6784, + "grad_norm": 0.5720179677009583, + "learning_rate": 0.00026284829721362225, + "loss": 4.3702, + "step": 1312 + }, + { + "epoch": 1.6796799999999998, + "grad_norm": 0.6159886121749878, + "learning_rate": 0.0002628079149279849, + "loss": 4.3829, + "step": 1313 + }, + { + "epoch": 1.68096, + "grad_norm": 0.6541562080383301, + "learning_rate": 0.0002627675326423475, + "loss": 4.3588, + "step": 1314 + }, + { + "epoch": 1.68224, + "grad_norm": 0.6164893507957458, + "learning_rate": 0.0002627271503567102, + "loss": 4.3731, + "step": 1315 + }, + { + "epoch": 1.6835200000000001, + "grad_norm": 0.611280083656311, + "learning_rate": 0.0002626867680710728, + "loss": 4.454, + "step": 1316 + }, + { + "epoch": 1.6848, + "grad_norm": 0.6023463606834412, + "learning_rate": 0.00026264638578543545, + "loss": 4.3007, + "step": 1317 + }, + { + "epoch": 1.68608, + "grad_norm": 0.6427450180053711, + "learning_rate": 0.0002626060034997981, + "loss": 4.3692, + "step": 1318 + }, + { + "epoch": 1.68736, + "grad_norm": 0.5956165194511414, + "learning_rate": 0.00026256562121416066, + "loss": 4.352, + "step": 1319 + }, + { + "epoch": 1.68864, + "grad_norm": 0.5930037498474121, + "learning_rate": 0.00026252523892852334, + "loss": 4.3185, + "step": 1320 + }, + { + "epoch": 1.6899199999999999, + "grad_norm": 0.5842078328132629, + "learning_rate": 0.00026248485664288597, + "loss": 4.3912, + "step": 1321 + }, + { + "epoch": 1.6912, + "grad_norm": 0.5779310464859009, + "learning_rate": 0.0002624444743572486, + "loss": 4.4048, + "step": 1322 + }, + { + "epoch": 1.69248, + "grad_norm": 0.5934463739395142, + "learning_rate": 0.00026240409207161123, + "loss": 4.3657, + "step": 1323 + }, + { + "epoch": 1.6937600000000002, + "grad_norm": 0.636505663394928, + "learning_rate": 0.00026236370978597386, + "loss": 4.4074, + "step": 1324 + }, + { + "epoch": 1.69504, + "grad_norm": 0.6304707527160645, + "learning_rate": 0.0002623233275003365, + "loss": 4.4129, + "step": 1325 + }, + { + "epoch": 1.69632, + "grad_norm": 0.660848081111908, + "learning_rate": 0.0002622829452146991, + "loss": 4.3167, + "step": 1326 + }, + { + "epoch": 1.6976, + "grad_norm": 0.5828097462654114, + "learning_rate": 0.00026224256292906175, + "loss": 4.3415, + "step": 1327 + }, + { + "epoch": 1.69888, + "grad_norm": 0.6500957608222961, + "learning_rate": 0.00026220218064342443, + "loss": 4.3257, + "step": 1328 + }, + { + "epoch": 1.70016, + "grad_norm": 0.5696218013763428, + "learning_rate": 0.000262161798357787, + "loss": 4.423, + "step": 1329 + }, + { + "epoch": 1.7014399999999998, + "grad_norm": 0.6378457546234131, + "learning_rate": 0.00026212141607214964, + "loss": 4.371, + "step": 1330 + }, + { + "epoch": 1.70272, + "grad_norm": 0.5619131326675415, + "learning_rate": 0.00026208103378651227, + "loss": 4.3918, + "step": 1331 + }, + { + "epoch": 1.704, + "grad_norm": 0.607213020324707, + "learning_rate": 0.00026204065150087496, + "loss": 4.2955, + "step": 1332 + }, + { + "epoch": 1.7052800000000001, + "grad_norm": 0.6598789095878601, + "learning_rate": 0.0002620002692152376, + "loss": 4.4082, + "step": 1333 + }, + { + "epoch": 1.70656, + "grad_norm": 0.6173972487449646, + "learning_rate": 0.0002619598869296002, + "loss": 4.3991, + "step": 1334 + }, + { + "epoch": 1.70784, + "grad_norm": 0.709816575050354, + "learning_rate": 0.00026191950464396285, + "loss": 4.3563, + "step": 1335 + }, + { + "epoch": 1.70912, + "grad_norm": 0.5555682182312012, + "learning_rate": 0.0002618791223583254, + "loss": 4.386, + "step": 1336 + }, + { + "epoch": 1.7104, + "grad_norm": 0.6640469431877136, + "learning_rate": 0.0002618387400726881, + "loss": 4.3813, + "step": 1337 + }, + { + "epoch": 1.7116799999999999, + "grad_norm": 0.5649694800376892, + "learning_rate": 0.00026179835778705074, + "loss": 4.3737, + "step": 1338 + }, + { + "epoch": 1.71296, + "grad_norm": 0.5696978569030762, + "learning_rate": 0.00026175797550141337, + "loss": 4.3254, + "step": 1339 + }, + { + "epoch": 1.71424, + "grad_norm": 0.5677264928817749, + "learning_rate": 0.000261717593215776, + "loss": 4.3328, + "step": 1340 + }, + { + "epoch": 1.7155200000000002, + "grad_norm": 0.5705046057701111, + "learning_rate": 0.0002616772109301386, + "loss": 4.2976, + "step": 1341 + }, + { + "epoch": 1.7168, + "grad_norm": 0.5434849858283997, + "learning_rate": 0.00026163682864450126, + "loss": 4.4098, + "step": 1342 + }, + { + "epoch": 1.71808, + "grad_norm": 0.6120153665542603, + "learning_rate": 0.0002615964463588639, + "loss": 4.3524, + "step": 1343 + }, + { + "epoch": 1.71936, + "grad_norm": 0.5820379853248596, + "learning_rate": 0.0002615560640732265, + "loss": 4.3321, + "step": 1344 + }, + { + "epoch": 1.72064, + "grad_norm": 0.586432695388794, + "learning_rate": 0.00026151568178758915, + "loss": 4.3806, + "step": 1345 + }, + { + "epoch": 1.72192, + "grad_norm": 0.6412893533706665, + "learning_rate": 0.0002614752995019518, + "loss": 4.3795, + "step": 1346 + }, + { + "epoch": 1.7231999999999998, + "grad_norm": 0.6333811283111572, + "learning_rate": 0.0002614349172163144, + "loss": 4.3555, + "step": 1347 + }, + { + "epoch": 1.72448, + "grad_norm": 0.5543333292007446, + "learning_rate": 0.00026139453493067704, + "loss": 4.3285, + "step": 1348 + }, + { + "epoch": 1.72576, + "grad_norm": 0.5972537398338318, + "learning_rate": 0.00026135415264503967, + "loss": 4.3328, + "step": 1349 + }, + { + "epoch": 1.7270400000000001, + "grad_norm": 0.5956273078918457, + "learning_rate": 0.00026131377035940235, + "loss": 4.3246, + "step": 1350 + }, + { + "epoch": 1.72832, + "grad_norm": 0.6021464467048645, + "learning_rate": 0.000261273388073765, + "loss": 4.4009, + "step": 1351 + }, + { + "epoch": 1.7296, + "grad_norm": 0.6117057800292969, + "learning_rate": 0.00026123300578812756, + "loss": 4.3637, + "step": 1352 + }, + { + "epoch": 1.73088, + "grad_norm": 0.5697115659713745, + "learning_rate": 0.0002611926235024902, + "loss": 4.3032, + "step": 1353 + }, + { + "epoch": 1.73216, + "grad_norm": 0.6455910205841064, + "learning_rate": 0.00026115224121685287, + "loss": 4.2995, + "step": 1354 + }, + { + "epoch": 1.7334399999999999, + "grad_norm": 0.5969467759132385, + "learning_rate": 0.0002611118589312155, + "loss": 4.3408, + "step": 1355 + }, + { + "epoch": 1.73472, + "grad_norm": 0.6078411340713501, + "learning_rate": 0.00026107147664557813, + "loss": 4.3799, + "step": 1356 + }, + { + "epoch": 1.736, + "grad_norm": 0.5551348328590393, + "learning_rate": 0.00026103109435994076, + "loss": 4.3688, + "step": 1357 + }, + { + "epoch": 1.7372800000000002, + "grad_norm": 0.6095216870307922, + "learning_rate": 0.0002609907120743034, + "loss": 4.3867, + "step": 1358 + }, + { + "epoch": 1.73856, + "grad_norm": 0.5706983208656311, + "learning_rate": 0.000260950329788666, + "loss": 4.3618, + "step": 1359 + }, + { + "epoch": 1.73984, + "grad_norm": 0.5468519926071167, + "learning_rate": 0.00026090994750302865, + "loss": 4.361, + "step": 1360 + }, + { + "epoch": 1.74112, + "grad_norm": 0.5678277611732483, + "learning_rate": 0.0002608695652173913, + "loss": 4.3545, + "step": 1361 + }, + { + "epoch": 1.7424, + "grad_norm": 0.5359609723091125, + "learning_rate": 0.0002608291829317539, + "loss": 4.4574, + "step": 1362 + }, + { + "epoch": 1.74368, + "grad_norm": 0.5697743892669678, + "learning_rate": 0.00026078880064611654, + "loss": 4.295, + "step": 1363 + }, + { + "epoch": 1.7449599999999998, + "grad_norm": 0.605635404586792, + "learning_rate": 0.00026074841836047917, + "loss": 4.3309, + "step": 1364 + }, + { + "epoch": 1.74624, + "grad_norm": 0.5920169353485107, + "learning_rate": 0.0002607080360748418, + "loss": 4.3717, + "step": 1365 + }, + { + "epoch": 1.74752, + "grad_norm": 0.6027970910072327, + "learning_rate": 0.00026066765378920443, + "loss": 4.3445, + "step": 1366 + }, + { + "epoch": 1.7488000000000001, + "grad_norm": 0.5497733354568481, + "learning_rate": 0.0002606272715035671, + "loss": 4.3752, + "step": 1367 + }, + { + "epoch": 1.75008, + "grad_norm": 0.6217249631881714, + "learning_rate": 0.0002605868892179297, + "loss": 4.3425, + "step": 1368 + }, + { + "epoch": 1.75136, + "grad_norm": 0.5949388742446899, + "learning_rate": 0.0002605465069322923, + "loss": 4.3937, + "step": 1369 + }, + { + "epoch": 1.75264, + "grad_norm": 0.5935553908348083, + "learning_rate": 0.00026050612464665495, + "loss": 4.318, + "step": 1370 + }, + { + "epoch": 1.75392, + "grad_norm": 0.5247051119804382, + "learning_rate": 0.00026046574236101764, + "loss": 4.305, + "step": 1371 + }, + { + "epoch": 1.7551999999999999, + "grad_norm": 0.5983899235725403, + "learning_rate": 0.00026042536007538027, + "loss": 4.3383, + "step": 1372 + }, + { + "epoch": 1.75648, + "grad_norm": 0.5618011951446533, + "learning_rate": 0.0002603849777897429, + "loss": 4.3788, + "step": 1373 + }, + { + "epoch": 1.75776, + "grad_norm": 0.5628573298454285, + "learning_rate": 0.0002603445955041055, + "loss": 4.2755, + "step": 1374 + }, + { + "epoch": 1.7590400000000002, + "grad_norm": 0.580451488494873, + "learning_rate": 0.0002603042132184681, + "loss": 4.3533, + "step": 1375 + }, + { + "epoch": 1.76032, + "grad_norm": 0.5849772691726685, + "learning_rate": 0.0002602638309328308, + "loss": 4.3453, + "step": 1376 + }, + { + "epoch": 1.7616, + "grad_norm": 0.6029077172279358, + "learning_rate": 0.0002602234486471934, + "loss": 4.3619, + "step": 1377 + }, + { + "epoch": 1.76288, + "grad_norm": 0.5800113677978516, + "learning_rate": 0.00026018306636155605, + "loss": 4.3204, + "step": 1378 + }, + { + "epoch": 1.76416, + "grad_norm": 0.5955139994621277, + "learning_rate": 0.0002601426840759187, + "loss": 4.3915, + "step": 1379 + }, + { + "epoch": 1.76544, + "grad_norm": 0.5562100410461426, + "learning_rate": 0.0002601023017902813, + "loss": 4.2759, + "step": 1380 + }, + { + "epoch": 1.7667199999999998, + "grad_norm": 0.61802738904953, + "learning_rate": 0.00026006191950464394, + "loss": 4.2538, + "step": 1381 + }, + { + "epoch": 1.768, + "grad_norm": 0.5850224494934082, + "learning_rate": 0.00026002153721900657, + "loss": 4.3395, + "step": 1382 + }, + { + "epoch": 1.76928, + "grad_norm": 0.5995482802391052, + "learning_rate": 0.0002599811549333692, + "loss": 4.2534, + "step": 1383 + }, + { + "epoch": 1.7705600000000001, + "grad_norm": 0.5256519913673401, + "learning_rate": 0.00025994077264773183, + "loss": 4.2427, + "step": 1384 + }, + { + "epoch": 1.77184, + "grad_norm": 0.5336461067199707, + "learning_rate": 0.00025990039036209446, + "loss": 4.3061, + "step": 1385 + }, + { + "epoch": 1.77312, + "grad_norm": 0.5198113322257996, + "learning_rate": 0.0002598600080764571, + "loss": 4.3705, + "step": 1386 + }, + { + "epoch": 1.7744, + "grad_norm": 0.5634206533432007, + "learning_rate": 0.0002598196257908197, + "loss": 4.3167, + "step": 1387 + }, + { + "epoch": 1.77568, + "grad_norm": 0.598160445690155, + "learning_rate": 0.00025977924350518235, + "loss": 4.2889, + "step": 1388 + }, + { + "epoch": 1.7769599999999999, + "grad_norm": 0.6371156573295593, + "learning_rate": 0.00025973886121954503, + "loss": 4.3066, + "step": 1389 + }, + { + "epoch": 1.77824, + "grad_norm": 0.5342087149620056, + "learning_rate": 0.00025969847893390766, + "loss": 4.2872, + "step": 1390 + }, + { + "epoch": 1.77952, + "grad_norm": 0.6202256083488464, + "learning_rate": 0.00025965809664827024, + "loss": 4.3316, + "step": 1391 + }, + { + "epoch": 1.7808000000000002, + "grad_norm": 0.5649963021278381, + "learning_rate": 0.00025961771436263287, + "loss": 4.3691, + "step": 1392 + }, + { + "epoch": 1.78208, + "grad_norm": 0.5771881341934204, + "learning_rate": 0.00025957733207699555, + "loss": 4.3228, + "step": 1393 + }, + { + "epoch": 1.78336, + "grad_norm": 0.5943997502326965, + "learning_rate": 0.0002595369497913582, + "loss": 4.4159, + "step": 1394 + }, + { + "epoch": 1.78464, + "grad_norm": 0.5805171132087708, + "learning_rate": 0.0002594965675057208, + "loss": 4.287, + "step": 1395 + }, + { + "epoch": 1.78592, + "grad_norm": 0.5803609490394592, + "learning_rate": 0.00025945618522008344, + "loss": 4.2801, + "step": 1396 + }, + { + "epoch": 1.7872, + "grad_norm": 0.5380381345748901, + "learning_rate": 0.00025941580293444607, + "loss": 4.3378, + "step": 1397 + }, + { + "epoch": 1.7884799999999998, + "grad_norm": 0.5976554155349731, + "learning_rate": 0.0002593754206488087, + "loss": 4.3923, + "step": 1398 + }, + { + "epoch": 1.78976, + "grad_norm": 0.6023809909820557, + "learning_rate": 0.00025933503836317133, + "loss": 4.3821, + "step": 1399 + }, + { + "epoch": 1.79104, + "grad_norm": 0.4953809976577759, + "learning_rate": 0.00025929465607753396, + "loss": 4.3558, + "step": 1400 + }, + { + "epoch": 1.7923200000000001, + "grad_norm": 0.6279889345169067, + "learning_rate": 0.0002592542737918966, + "loss": 4.319, + "step": 1401 + }, + { + "epoch": 1.7936, + "grad_norm": 0.5510340929031372, + "learning_rate": 0.0002592138915062592, + "loss": 4.3578, + "step": 1402 + }, + { + "epoch": 1.79488, + "grad_norm": 0.5811865925788879, + "learning_rate": 0.00025917350922062185, + "loss": 4.3256, + "step": 1403 + }, + { + "epoch": 1.79616, + "grad_norm": 0.5557198524475098, + "learning_rate": 0.0002591331269349845, + "loss": 4.2975, + "step": 1404 + }, + { + "epoch": 1.79744, + "grad_norm": 0.5800006985664368, + "learning_rate": 0.0002590927446493471, + "loss": 4.3556, + "step": 1405 + }, + { + "epoch": 1.7987199999999999, + "grad_norm": 0.5096418857574463, + "learning_rate": 0.0002590523623637098, + "loss": 4.2946, + "step": 1406 + }, + { + "epoch": 1.8, + "grad_norm": 0.5896133780479431, + "learning_rate": 0.00025901198007807243, + "loss": 4.3412, + "step": 1407 + }, + { + "epoch": 1.80128, + "grad_norm": 0.5542340874671936, + "learning_rate": 0.000258971597792435, + "loss": 4.3673, + "step": 1408 + }, + { + "epoch": 1.8025600000000002, + "grad_norm": 0.5914231538772583, + "learning_rate": 0.00025893121550679763, + "loss": 4.3599, + "step": 1409 + }, + { + "epoch": 1.80384, + "grad_norm": 0.6253953576087952, + "learning_rate": 0.0002588908332211603, + "loss": 4.2901, + "step": 1410 + }, + { + "epoch": 1.80512, + "grad_norm": 0.5754669904708862, + "learning_rate": 0.00025885045093552295, + "loss": 4.3587, + "step": 1411 + }, + { + "epoch": 1.8064, + "grad_norm": 0.5589056015014648, + "learning_rate": 0.0002588100686498856, + "loss": 4.2848, + "step": 1412 + }, + { + "epoch": 1.80768, + "grad_norm": 0.6159608960151672, + "learning_rate": 0.0002587696863642482, + "loss": 4.2993, + "step": 1413 + }, + { + "epoch": 1.80896, + "grad_norm": 0.5682271718978882, + "learning_rate": 0.0002587293040786108, + "loss": 4.3867, + "step": 1414 + }, + { + "epoch": 1.8102399999999998, + "grad_norm": 0.5960492491722107, + "learning_rate": 0.00025868892179297347, + "loss": 4.2558, + "step": 1415 + }, + { + "epoch": 1.81152, + "grad_norm": 0.6439085006713867, + "learning_rate": 0.0002586485395073361, + "loss": 4.3216, + "step": 1416 + }, + { + "epoch": 1.8128, + "grad_norm": 0.5476023554801941, + "learning_rate": 0.00025860815722169873, + "loss": 4.34, + "step": 1417 + }, + { + "epoch": 1.8140800000000001, + "grad_norm": 0.6070132851600647, + "learning_rate": 0.00025856777493606136, + "loss": 4.3153, + "step": 1418 + }, + { + "epoch": 1.81536, + "grad_norm": 0.6208356618881226, + "learning_rate": 0.000258527392650424, + "loss": 4.4003, + "step": 1419 + }, + { + "epoch": 1.81664, + "grad_norm": 0.553810715675354, + "learning_rate": 0.0002584870103647866, + "loss": 4.321, + "step": 1420 + }, + { + "epoch": 1.81792, + "grad_norm": 0.6032571196556091, + "learning_rate": 0.00025844662807914925, + "loss": 4.3312, + "step": 1421 + }, + { + "epoch": 1.8192, + "grad_norm": 0.5861594676971436, + "learning_rate": 0.0002584062457935119, + "loss": 4.3138, + "step": 1422 + }, + { + "epoch": 1.8204799999999999, + "grad_norm": 0.5738282203674316, + "learning_rate": 0.00025836586350787456, + "loss": 4.2894, + "step": 1423 + }, + { + "epoch": 1.82176, + "grad_norm": 0.5541486144065857, + "learning_rate": 0.00025832548122223714, + "loss": 4.3835, + "step": 1424 + }, + { + "epoch": 1.82304, + "grad_norm": 0.5869741439819336, + "learning_rate": 0.00025828509893659977, + "loss": 4.2246, + "step": 1425 + }, + { + "epoch": 1.8243200000000002, + "grad_norm": 0.5501008033752441, + "learning_rate": 0.0002582447166509624, + "loss": 4.3119, + "step": 1426 + }, + { + "epoch": 1.8256000000000001, + "grad_norm": 0.5797874927520752, + "learning_rate": 0.00025820433436532503, + "loss": 4.2885, + "step": 1427 + }, + { + "epoch": 1.82688, + "grad_norm": 0.5502588748931885, + "learning_rate": 0.0002581639520796877, + "loss": 4.2423, + "step": 1428 + }, + { + "epoch": 1.82816, + "grad_norm": 0.6173840761184692, + "learning_rate": 0.00025812356979405034, + "loss": 4.259, + "step": 1429 + }, + { + "epoch": 1.82944, + "grad_norm": 0.5949298143386841, + "learning_rate": 0.000258083187508413, + "loss": 4.3307, + "step": 1430 + }, + { + "epoch": 1.83072, + "grad_norm": 0.5942235589027405, + "learning_rate": 0.00025804280522277555, + "loss": 4.279, + "step": 1431 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.6103411912918091, + "learning_rate": 0.00025800242293713823, + "loss": 4.2624, + "step": 1432 + }, + { + "epoch": 1.83328, + "grad_norm": 0.5956135392189026, + "learning_rate": 0.00025796204065150086, + "loss": 4.3028, + "step": 1433 + }, + { + "epoch": 1.83456, + "grad_norm": 0.5850538015365601, + "learning_rate": 0.0002579216583658635, + "loss": 4.2829, + "step": 1434 + }, + { + "epoch": 1.8358400000000001, + "grad_norm": 0.5809298157691956, + "learning_rate": 0.0002578812760802261, + "loss": 4.332, + "step": 1435 + }, + { + "epoch": 1.83712, + "grad_norm": 0.8085721135139465, + "learning_rate": 0.00025784089379458875, + "loss": 4.3069, + "step": 1436 + }, + { + "epoch": 1.8384, + "grad_norm": 0.5805991888046265, + "learning_rate": 0.0002578005115089514, + "loss": 4.3018, + "step": 1437 + }, + { + "epoch": 1.83968, + "grad_norm": 0.5863122344017029, + "learning_rate": 0.000257760129223314, + "loss": 4.275, + "step": 1438 + }, + { + "epoch": 1.84096, + "grad_norm": 0.573593258857727, + "learning_rate": 0.00025771974693767664, + "loss": 4.2953, + "step": 1439 + }, + { + "epoch": 1.8422399999999999, + "grad_norm": 0.6307305097579956, + "learning_rate": 0.0002576793646520393, + "loss": 4.3356, + "step": 1440 + }, + { + "epoch": 1.84352, + "grad_norm": 0.5647289752960205, + "learning_rate": 0.0002576389823664019, + "loss": 4.3151, + "step": 1441 + }, + { + "epoch": 1.8448, + "grad_norm": 0.640838086605072, + "learning_rate": 0.00025759860008076453, + "loss": 4.3466, + "step": 1442 + }, + { + "epoch": 1.8460800000000002, + "grad_norm": 0.5609426498413086, + "learning_rate": 0.00025755821779512716, + "loss": 4.3062, + "step": 1443 + }, + { + "epoch": 1.8473600000000001, + "grad_norm": 0.6041327118873596, + "learning_rate": 0.0002575178355094898, + "loss": 4.3406, + "step": 1444 + }, + { + "epoch": 1.84864, + "grad_norm": 0.5379366874694824, + "learning_rate": 0.0002574774532238525, + "loss": 4.2807, + "step": 1445 + }, + { + "epoch": 1.84992, + "grad_norm": 0.582633376121521, + "learning_rate": 0.0002574370709382151, + "loss": 4.316, + "step": 1446 + }, + { + "epoch": 1.8512, + "grad_norm": 0.6131902933120728, + "learning_rate": 0.0002573966886525777, + "loss": 4.2406, + "step": 1447 + }, + { + "epoch": 1.85248, + "grad_norm": 0.6240634322166443, + "learning_rate": 0.0002573563063669403, + "loss": 4.307, + "step": 1448 + }, + { + "epoch": 1.8537599999999999, + "grad_norm": 0.5122919678688049, + "learning_rate": 0.000257315924081303, + "loss": 4.253, + "step": 1449 + }, + { + "epoch": 1.85504, + "grad_norm": 0.6418374180793762, + "learning_rate": 0.00025727554179566563, + "loss": 4.3222, + "step": 1450 + }, + { + "epoch": 1.85632, + "grad_norm": 0.5671072602272034, + "learning_rate": 0.00025723515951002826, + "loss": 4.3006, + "step": 1451 + }, + { + "epoch": 1.8576000000000001, + "grad_norm": 0.5645801424980164, + "learning_rate": 0.0002571947772243909, + "loss": 4.3623, + "step": 1452 + }, + { + "epoch": 1.85888, + "grad_norm": 0.5495478510856628, + "learning_rate": 0.0002571543949387535, + "loss": 4.2478, + "step": 1453 + }, + { + "epoch": 1.86016, + "grad_norm": 0.5628487467765808, + "learning_rate": 0.00025711401265311615, + "loss": 4.2659, + "step": 1454 + }, + { + "epoch": 1.86144, + "grad_norm": 0.5409569144248962, + "learning_rate": 0.0002570736303674788, + "loss": 4.2707, + "step": 1455 + }, + { + "epoch": 1.86272, + "grad_norm": 0.5503236651420593, + "learning_rate": 0.0002570332480818414, + "loss": 4.2997, + "step": 1456 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.5647298693656921, + "learning_rate": 0.00025699286579620404, + "loss": 4.3171, + "step": 1457 + }, + { + "epoch": 1.86528, + "grad_norm": 0.5661958456039429, + "learning_rate": 0.00025695248351056667, + "loss": 4.2884, + "step": 1458 + }, + { + "epoch": 1.86656, + "grad_norm": 0.5522183775901794, + "learning_rate": 0.0002569121012249293, + "loss": 4.1818, + "step": 1459 + }, + { + "epoch": 1.86784, + "grad_norm": 0.5519942045211792, + "learning_rate": 0.00025687171893929193, + "loss": 4.3356, + "step": 1460 + }, + { + "epoch": 1.8691200000000001, + "grad_norm": 0.5674711465835571, + "learning_rate": 0.00025683133665365456, + "loss": 4.3287, + "step": 1461 + }, + { + "epoch": 1.8704, + "grad_norm": 0.6130083799362183, + "learning_rate": 0.00025679095436801724, + "loss": 4.3046, + "step": 1462 + }, + { + "epoch": 1.87168, + "grad_norm": 0.5580954551696777, + "learning_rate": 0.0002567505720823798, + "loss": 4.2846, + "step": 1463 + }, + { + "epoch": 1.87296, + "grad_norm": 0.6269332766532898, + "learning_rate": 0.00025671018979674245, + "loss": 4.3118, + "step": 1464 + }, + { + "epoch": 1.87424, + "grad_norm": 0.5813364386558533, + "learning_rate": 0.0002566698075111051, + "loss": 4.2828, + "step": 1465 + }, + { + "epoch": 1.8755199999999999, + "grad_norm": 0.584498941898346, + "learning_rate": 0.00025662942522546776, + "loss": 4.2625, + "step": 1466 + }, + { + "epoch": 1.8768, + "grad_norm": 0.6057561635971069, + "learning_rate": 0.0002565890429398304, + "loss": 4.3507, + "step": 1467 + }, + { + "epoch": 1.87808, + "grad_norm": 0.5964322686195374, + "learning_rate": 0.000256548660654193, + "loss": 4.3682, + "step": 1468 + }, + { + "epoch": 1.8793600000000001, + "grad_norm": 0.6251275539398193, + "learning_rate": 0.00025650827836855565, + "loss": 4.3605, + "step": 1469 + }, + { + "epoch": 1.88064, + "grad_norm": 0.6410034894943237, + "learning_rate": 0.00025646789608291823, + "loss": 4.3105, + "step": 1470 + }, + { + "epoch": 1.88192, + "grad_norm": 0.6155179738998413, + "learning_rate": 0.0002564275137972809, + "loss": 4.2392, + "step": 1471 + }, + { + "epoch": 1.8832, + "grad_norm": 0.5981379747390747, + "learning_rate": 0.00025638713151164355, + "loss": 4.3841, + "step": 1472 + }, + { + "epoch": 1.88448, + "grad_norm": 0.637157142162323, + "learning_rate": 0.0002563467492260062, + "loss": 4.2601, + "step": 1473 + }, + { + "epoch": 1.8857599999999999, + "grad_norm": 0.5592889189720154, + "learning_rate": 0.0002563063669403688, + "loss": 4.3205, + "step": 1474 + }, + { + "epoch": 1.88704, + "grad_norm": 0.6076372265815735, + "learning_rate": 0.00025626598465473144, + "loss": 4.2291, + "step": 1475 + }, + { + "epoch": 1.88832, + "grad_norm": 0.5905406475067139, + "learning_rate": 0.00025622560236909407, + "loss": 4.2718, + "step": 1476 + }, + { + "epoch": 1.8896, + "grad_norm": 0.5950998663902283, + "learning_rate": 0.0002561852200834567, + "loss": 4.3082, + "step": 1477 + }, + { + "epoch": 1.8908800000000001, + "grad_norm": 0.6349307298660278, + "learning_rate": 0.0002561448377978193, + "loss": 4.3038, + "step": 1478 + }, + { + "epoch": 1.89216, + "grad_norm": 0.5779610276222229, + "learning_rate": 0.000256104455512182, + "loss": 4.2575, + "step": 1479 + }, + { + "epoch": 1.89344, + "grad_norm": 0.5922044515609741, + "learning_rate": 0.0002560640732265446, + "loss": 4.2998, + "step": 1480 + }, + { + "epoch": 1.89472, + "grad_norm": 0.6293798089027405, + "learning_rate": 0.0002560236909409072, + "loss": 4.2816, + "step": 1481 + }, + { + "epoch": 1.896, + "grad_norm": 0.542755126953125, + "learning_rate": 0.00025598330865526985, + "loss": 4.3644, + "step": 1482 + }, + { + "epoch": 1.8972799999999999, + "grad_norm": 0.591120719909668, + "learning_rate": 0.0002559429263696325, + "loss": 4.277, + "step": 1483 + }, + { + "epoch": 1.89856, + "grad_norm": 0.5428628325462341, + "learning_rate": 0.00025590254408399516, + "loss": 4.1826, + "step": 1484 + }, + { + "epoch": 1.89984, + "grad_norm": 0.5935090780258179, + "learning_rate": 0.0002558621617983578, + "loss": 4.265, + "step": 1485 + }, + { + "epoch": 1.9011200000000001, + "grad_norm": 0.532969057559967, + "learning_rate": 0.00025582177951272037, + "loss": 4.3217, + "step": 1486 + }, + { + "epoch": 1.9024, + "grad_norm": 0.5747160315513611, + "learning_rate": 0.000255781397227083, + "loss": 4.2958, + "step": 1487 + }, + { + "epoch": 1.90368, + "grad_norm": 0.5506500005722046, + "learning_rate": 0.0002557410149414457, + "loss": 4.3303, + "step": 1488 + }, + { + "epoch": 1.90496, + "grad_norm": 0.5252817869186401, + "learning_rate": 0.0002557006326558083, + "loss": 4.2845, + "step": 1489 + }, + { + "epoch": 1.90624, + "grad_norm": 0.5319386124610901, + "learning_rate": 0.00025566025037017094, + "loss": 4.2375, + "step": 1490 + }, + { + "epoch": 1.9075199999999999, + "grad_norm": 0.5286985635757446, + "learning_rate": 0.00025561986808453357, + "loss": 4.3033, + "step": 1491 + }, + { + "epoch": 1.9088, + "grad_norm": 0.6041566133499146, + "learning_rate": 0.0002555794857988962, + "loss": 4.2996, + "step": 1492 + }, + { + "epoch": 1.91008, + "grad_norm": 0.563486635684967, + "learning_rate": 0.00025553910351325883, + "loss": 4.3267, + "step": 1493 + }, + { + "epoch": 1.91136, + "grad_norm": 0.5622044801712036, + "learning_rate": 0.00025549872122762146, + "loss": 4.2366, + "step": 1494 + }, + { + "epoch": 1.9126400000000001, + "grad_norm": 0.5659383535385132, + "learning_rate": 0.0002554583389419841, + "loss": 4.3028, + "step": 1495 + }, + { + "epoch": 1.91392, + "grad_norm": 0.6131373047828674, + "learning_rate": 0.0002554179566563467, + "loss": 4.2772, + "step": 1496 + }, + { + "epoch": 1.9152, + "grad_norm": 0.5460529923439026, + "learning_rate": 0.00025537757437070935, + "loss": 4.2903, + "step": 1497 + }, + { + "epoch": 1.91648, + "grad_norm": 0.6126199960708618, + "learning_rate": 0.000255337192085072, + "loss": 4.3442, + "step": 1498 + }, + { + "epoch": 1.91776, + "grad_norm": 0.5673563480377197, + "learning_rate": 0.0002552968097994346, + "loss": 4.2208, + "step": 1499 + }, + { + "epoch": 1.9190399999999999, + "grad_norm": 0.5467031002044678, + "learning_rate": 0.00025525642751379724, + "loss": 4.287, + "step": 1500 + }, + { + "epoch": 1.92032, + "grad_norm": 0.5507038235664368, + "learning_rate": 0.0002552160452281599, + "loss": 4.2693, + "step": 1501 + }, + { + "epoch": 1.9216, + "grad_norm": 0.5434231758117676, + "learning_rate": 0.00025517566294252256, + "loss": 4.2913, + "step": 1502 + }, + { + "epoch": 1.9228800000000001, + "grad_norm": 0.5921497941017151, + "learning_rate": 0.00025513528065688513, + "loss": 4.3204, + "step": 1503 + }, + { + "epoch": 1.92416, + "grad_norm": 0.5684370398521423, + "learning_rate": 0.00025509489837124776, + "loss": 4.2581, + "step": 1504 + }, + { + "epoch": 1.92544, + "grad_norm": 0.5858400464057922, + "learning_rate": 0.00025505451608561045, + "loss": 4.2407, + "step": 1505 + }, + { + "epoch": 1.92672, + "grad_norm": 0.5300204157829285, + "learning_rate": 0.0002550141337999731, + "loss": 4.3043, + "step": 1506 + }, + { + "epoch": 1.928, + "grad_norm": 0.5366335511207581, + "learning_rate": 0.0002549737515143357, + "loss": 4.2903, + "step": 1507 + }, + { + "epoch": 1.9292799999999999, + "grad_norm": 0.5699880719184875, + "learning_rate": 0.00025493336922869834, + "loss": 4.2495, + "step": 1508 + }, + { + "epoch": 1.93056, + "grad_norm": 0.5679522752761841, + "learning_rate": 0.00025489298694306097, + "loss": 4.2577, + "step": 1509 + }, + { + "epoch": 1.93184, + "grad_norm": 0.5857032537460327, + "learning_rate": 0.0002548526046574236, + "loss": 4.273, + "step": 1510 + }, + { + "epoch": 1.93312, + "grad_norm": 0.574712872505188, + "learning_rate": 0.0002548122223717862, + "loss": 4.3117, + "step": 1511 + }, + { + "epoch": 1.9344000000000001, + "grad_norm": 0.5547901391983032, + "learning_rate": 0.00025477184008614886, + "loss": 4.3168, + "step": 1512 + }, + { + "epoch": 1.93568, + "grad_norm": 0.557061493396759, + "learning_rate": 0.0002547314578005115, + "loss": 4.2803, + "step": 1513 + }, + { + "epoch": 1.93696, + "grad_norm": 0.5565205216407776, + "learning_rate": 0.0002546910755148741, + "loss": 4.2377, + "step": 1514 + }, + { + "epoch": 1.93824, + "grad_norm": 0.5655121803283691, + "learning_rate": 0.00025465069322923675, + "loss": 4.2526, + "step": 1515 + }, + { + "epoch": 1.93952, + "grad_norm": 0.6136040091514587, + "learning_rate": 0.0002546103109435994, + "loss": 4.3138, + "step": 1516 + }, + { + "epoch": 1.9407999999999999, + "grad_norm": 0.5623380541801453, + "learning_rate": 0.000254569928657962, + "loss": 4.1775, + "step": 1517 + }, + { + "epoch": 1.94208, + "grad_norm": 0.5378047823905945, + "learning_rate": 0.0002545295463723247, + "loss": 4.2809, + "step": 1518 + }, + { + "epoch": 1.94336, + "grad_norm": 0.6011268496513367, + "learning_rate": 0.00025448916408668727, + "loss": 4.2017, + "step": 1519 + }, + { + "epoch": 1.9446400000000001, + "grad_norm": 0.5707088708877563, + "learning_rate": 0.0002544487818010499, + "loss": 4.22, + "step": 1520 + }, + { + "epoch": 1.94592, + "grad_norm": 0.5897967219352722, + "learning_rate": 0.00025440839951541253, + "loss": 4.2578, + "step": 1521 + }, + { + "epoch": 1.9472, + "grad_norm": 0.5613348484039307, + "learning_rate": 0.00025436801722977516, + "loss": 4.2774, + "step": 1522 + }, + { + "epoch": 1.94848, + "grad_norm": 0.5693078637123108, + "learning_rate": 0.00025432763494413784, + "loss": 4.2862, + "step": 1523 + }, + { + "epoch": 1.94976, + "grad_norm": 0.6159646511077881, + "learning_rate": 0.00025428725265850047, + "loss": 4.3357, + "step": 1524 + }, + { + "epoch": 1.9510399999999999, + "grad_norm": 0.5490036010742188, + "learning_rate": 0.0002542468703728631, + "loss": 4.2645, + "step": 1525 + }, + { + "epoch": 1.95232, + "grad_norm": 0.6109630465507507, + "learning_rate": 0.0002542064880872257, + "loss": 4.2504, + "step": 1526 + }, + { + "epoch": 1.9536, + "grad_norm": 0.5659199953079224, + "learning_rate": 0.00025416610580158836, + "loss": 4.2713, + "step": 1527 + }, + { + "epoch": 1.95488, + "grad_norm": 0.5810695886611938, + "learning_rate": 0.000254125723515951, + "loss": 4.2639, + "step": 1528 + }, + { + "epoch": 1.9561600000000001, + "grad_norm": 0.5711521506309509, + "learning_rate": 0.0002540853412303136, + "loss": 4.2478, + "step": 1529 + }, + { + "epoch": 1.95744, + "grad_norm": 0.571063756942749, + "learning_rate": 0.00025404495894467625, + "loss": 4.2389, + "step": 1530 + }, + { + "epoch": 1.95872, + "grad_norm": 0.5734847784042358, + "learning_rate": 0.0002540045766590389, + "loss": 4.216, + "step": 1531 + }, + { + "epoch": 1.96, + "grad_norm": 0.5617849230766296, + "learning_rate": 0.0002539641943734015, + "loss": 4.2903, + "step": 1532 + }, + { + "epoch": 1.96128, + "grad_norm": 0.5797948241233826, + "learning_rate": 0.00025392381208776414, + "loss": 4.2572, + "step": 1533 + }, + { + "epoch": 1.9625599999999999, + "grad_norm": 0.5373856425285339, + "learning_rate": 0.00025388342980212677, + "loss": 4.2827, + "step": 1534 + }, + { + "epoch": 1.96384, + "grad_norm": 0.5615948438644409, + "learning_rate": 0.0002538430475164894, + "loss": 4.2297, + "step": 1535 + }, + { + "epoch": 1.96512, + "grad_norm": 0.5433390140533447, + "learning_rate": 0.00025380266523085203, + "loss": 4.3284, + "step": 1536 + }, + { + "epoch": 1.9664000000000001, + "grad_norm": 0.604446530342102, + "learning_rate": 0.00025376228294521466, + "loss": 4.2762, + "step": 1537 + }, + { + "epoch": 1.96768, + "grad_norm": 0.5676113367080688, + "learning_rate": 0.0002537219006595773, + "loss": 4.174, + "step": 1538 + }, + { + "epoch": 1.96896, + "grad_norm": 0.5773786902427673, + "learning_rate": 0.0002536815183739399, + "loss": 4.227, + "step": 1539 + }, + { + "epoch": 1.97024, + "grad_norm": 0.5377183556556702, + "learning_rate": 0.0002536411360883026, + "loss": 4.232, + "step": 1540 + }, + { + "epoch": 1.97152, + "grad_norm": 0.5951273441314697, + "learning_rate": 0.00025360075380266524, + "loss": 4.2514, + "step": 1541 + }, + { + "epoch": 1.9727999999999999, + "grad_norm": 0.5194257497787476, + "learning_rate": 0.0002535603715170278, + "loss": 4.2341, + "step": 1542 + }, + { + "epoch": 1.9740799999999998, + "grad_norm": 0.5489000678062439, + "learning_rate": 0.00025351998923139044, + "loss": 4.2023, + "step": 1543 + }, + { + "epoch": 1.97536, + "grad_norm": 0.5540454983711243, + "learning_rate": 0.00025347960694575313, + "loss": 4.2174, + "step": 1544 + }, + { + "epoch": 1.97664, + "grad_norm": 0.5399606823921204, + "learning_rate": 0.00025343922466011576, + "loss": 4.335, + "step": 1545 + }, + { + "epoch": 1.9779200000000001, + "grad_norm": 0.5686156749725342, + "learning_rate": 0.0002533988423744784, + "loss": 4.2687, + "step": 1546 + }, + { + "epoch": 1.9792, + "grad_norm": 0.528323769569397, + "learning_rate": 0.000253358460088841, + "loss": 4.3328, + "step": 1547 + }, + { + "epoch": 1.98048, + "grad_norm": 0.5400658845901489, + "learning_rate": 0.00025331807780320365, + "loss": 4.2065, + "step": 1548 + }, + { + "epoch": 1.98176, + "grad_norm": 0.5493596792221069, + "learning_rate": 0.0002532776955175663, + "loss": 4.2173, + "step": 1549 + }, + { + "epoch": 1.98304, + "grad_norm": 0.5500946044921875, + "learning_rate": 0.0002532373132319289, + "loss": 4.2876, + "step": 1550 + }, + { + "epoch": 1.9843199999999999, + "grad_norm": 0.5746954679489136, + "learning_rate": 0.00025319693094629154, + "loss": 4.3206, + "step": 1551 + }, + { + "epoch": 1.9856, + "grad_norm": 0.5554184913635254, + "learning_rate": 0.00025315654866065417, + "loss": 4.2171, + "step": 1552 + }, + { + "epoch": 1.98688, + "grad_norm": 0.5702386498451233, + "learning_rate": 0.0002531161663750168, + "loss": 4.2557, + "step": 1553 + }, + { + "epoch": 1.9881600000000001, + "grad_norm": 0.5968025922775269, + "learning_rate": 0.00025307578408937943, + "loss": 4.2913, + "step": 1554 + }, + { + "epoch": 1.98944, + "grad_norm": 0.5453073382377625, + "learning_rate": 0.00025303540180374206, + "loss": 4.2114, + "step": 1555 + }, + { + "epoch": 1.99072, + "grad_norm": 0.5790675282478333, + "learning_rate": 0.0002529950195181047, + "loss": 4.2186, + "step": 1556 + }, + { + "epoch": 1.992, + "grad_norm": 0.5652514100074768, + "learning_rate": 0.00025295463723246737, + "loss": 4.2595, + "step": 1557 + }, + { + "epoch": 1.99328, + "grad_norm": 0.5572407245635986, + "learning_rate": 0.00025291425494682995, + "loss": 4.2544, + "step": 1558 + }, + { + "epoch": 1.9945599999999999, + "grad_norm": 0.5312286019325256, + "learning_rate": 0.0002528738726611926, + "loss": 4.2369, + "step": 1559 + }, + { + "epoch": 1.9958399999999998, + "grad_norm": 0.5568103194236755, + "learning_rate": 0.0002528334903755552, + "loss": 4.3173, + "step": 1560 + }, + { + "epoch": 1.99712, + "grad_norm": 0.5829612612724304, + "learning_rate": 0.00025279310808991784, + "loss": 4.2247, + "step": 1561 + }, + { + "epoch": 1.9984, + "grad_norm": 0.5639691352844238, + "learning_rate": 0.0002527527258042805, + "loss": 4.2355, + "step": 1562 + }, + { + "epoch": 1.9996800000000001, + "grad_norm": 0.5624785423278809, + "learning_rate": 0.00025271234351864315, + "loss": 4.2893, + "step": 1563 + }, + { + "epoch": 2.0, + "grad_norm": 0.9181849360466003, + "learning_rate": 0.0002526719612330058, + "loss": 4.2339, + "step": 1564 + }, + { + "epoch": 2.00128, + "grad_norm": 0.6469334959983826, + "learning_rate": 0.00025263157894736836, + "loss": 4.1552, + "step": 1565 + }, + { + "epoch": 2.00256, + "grad_norm": 0.607097327709198, + "learning_rate": 0.00025259119666173104, + "loss": 4.1764, + "step": 1566 + }, + { + "epoch": 2.00384, + "grad_norm": 0.5565678477287292, + "learning_rate": 0.0002525508143760937, + "loss": 4.1168, + "step": 1567 + }, + { + "epoch": 2.00512, + "grad_norm": 0.5955238342285156, + "learning_rate": 0.0002525104320904563, + "loss": 4.1852, + "step": 1568 + }, + { + "epoch": 2.0064, + "grad_norm": 0.5889109969139099, + "learning_rate": 0.00025247004980481893, + "loss": 4.1784, + "step": 1569 + }, + { + "epoch": 2.00768, + "grad_norm": 0.6503939032554626, + "learning_rate": 0.00025242966751918156, + "loss": 4.1564, + "step": 1570 + }, + { + "epoch": 2.00896, + "grad_norm": 0.5937455296516418, + "learning_rate": 0.0002523892852335442, + "loss": 4.2143, + "step": 1571 + }, + { + "epoch": 2.01024, + "grad_norm": 0.6147340536117554, + "learning_rate": 0.0002523489029479068, + "loss": 4.1123, + "step": 1572 + }, + { + "epoch": 2.01152, + "grad_norm": 0.5893435478210449, + "learning_rate": 0.00025230852066226945, + "loss": 4.0739, + "step": 1573 + }, + { + "epoch": 2.0128, + "grad_norm": 0.584883451461792, + "learning_rate": 0.00025226813837663214, + "loss": 4.2348, + "step": 1574 + }, + { + "epoch": 2.01408, + "grad_norm": 0.5527332425117493, + "learning_rate": 0.0002522277560909947, + "loss": 4.205, + "step": 1575 + }, + { + "epoch": 2.01536, + "grad_norm": 0.5769275426864624, + "learning_rate": 0.00025218737380535734, + "loss": 4.1406, + "step": 1576 + }, + { + "epoch": 2.01664, + "grad_norm": 0.5587900280952454, + "learning_rate": 0.00025214699151972, + "loss": 4.1601, + "step": 1577 + }, + { + "epoch": 2.01792, + "grad_norm": 0.5883750319480896, + "learning_rate": 0.0002521066092340826, + "loss": 4.1505, + "step": 1578 + }, + { + "epoch": 2.0192, + "grad_norm": 0.5383011102676392, + "learning_rate": 0.0002520662269484453, + "loss": 4.1801, + "step": 1579 + }, + { + "epoch": 2.02048, + "grad_norm": 0.5970503687858582, + "learning_rate": 0.0002520258446628079, + "loss": 4.1597, + "step": 1580 + }, + { + "epoch": 2.02176, + "grad_norm": 0.5630191564559937, + "learning_rate": 0.00025198546237717055, + "loss": 4.1925, + "step": 1581 + }, + { + "epoch": 2.02304, + "grad_norm": 0.5271741151809692, + "learning_rate": 0.0002519450800915331, + "loss": 4.2025, + "step": 1582 + }, + { + "epoch": 2.02432, + "grad_norm": 0.5406798124313354, + "learning_rate": 0.0002519046978058958, + "loss": 4.1666, + "step": 1583 + }, + { + "epoch": 2.0256, + "grad_norm": 0.5316964387893677, + "learning_rate": 0.00025186431552025844, + "loss": 4.1439, + "step": 1584 + }, + { + "epoch": 2.02688, + "grad_norm": 0.5242227911949158, + "learning_rate": 0.00025182393323462107, + "loss": 4.1849, + "step": 1585 + }, + { + "epoch": 2.02816, + "grad_norm": 0.516384482383728, + "learning_rate": 0.0002517835509489837, + "loss": 4.1493, + "step": 1586 + }, + { + "epoch": 2.02944, + "grad_norm": 0.5419257879257202, + "learning_rate": 0.00025174316866334633, + "loss": 4.1466, + "step": 1587 + }, + { + "epoch": 2.03072, + "grad_norm": 0.5190739631652832, + "learning_rate": 0.00025170278637770896, + "loss": 4.2076, + "step": 1588 + }, + { + "epoch": 2.032, + "grad_norm": 0.5474783778190613, + "learning_rate": 0.0002516624040920716, + "loss": 4.1463, + "step": 1589 + }, + { + "epoch": 2.03328, + "grad_norm": 0.5086979269981384, + "learning_rate": 0.0002516220218064342, + "loss": 4.1831, + "step": 1590 + }, + { + "epoch": 2.03456, + "grad_norm": 0.5824074149131775, + "learning_rate": 0.00025158163952079685, + "loss": 4.1523, + "step": 1591 + }, + { + "epoch": 2.03584, + "grad_norm": 0.523669958114624, + "learning_rate": 0.0002515412572351595, + "loss": 4.1735, + "step": 1592 + }, + { + "epoch": 2.03712, + "grad_norm": 0.5100351572036743, + "learning_rate": 0.0002515008749495221, + "loss": 4.1265, + "step": 1593 + }, + { + "epoch": 2.0384, + "grad_norm": 0.5400765538215637, + "learning_rate": 0.00025146049266388474, + "loss": 4.1038, + "step": 1594 + }, + { + "epoch": 2.03968, + "grad_norm": 0.5364828705787659, + "learning_rate": 0.00025142011037824737, + "loss": 4.1539, + "step": 1595 + }, + { + "epoch": 2.04096, + "grad_norm": 0.5193732976913452, + "learning_rate": 0.00025137972809261005, + "loss": 4.1766, + "step": 1596 + }, + { + "epoch": 2.04224, + "grad_norm": 0.5233401656150818, + "learning_rate": 0.0002513393458069727, + "loss": 4.1535, + "step": 1597 + }, + { + "epoch": 2.04352, + "grad_norm": 0.5625421404838562, + "learning_rate": 0.00025129896352133526, + "loss": 4.1345, + "step": 1598 + }, + { + "epoch": 2.0448, + "grad_norm": 0.5566902756690979, + "learning_rate": 0.0002512585812356979, + "loss": 4.1438, + "step": 1599 + }, + { + "epoch": 2.04608, + "grad_norm": 0.5623239874839783, + "learning_rate": 0.0002512181989500606, + "loss": 4.1251, + "step": 1600 + }, + { + "epoch": 2.04736, + "grad_norm": 0.578046441078186, + "learning_rate": 0.0002511778166644232, + "loss": 4.0756, + "step": 1601 + }, + { + "epoch": 2.04864, + "grad_norm": 0.5461819171905518, + "learning_rate": 0.00025113743437878583, + "loss": 4.1906, + "step": 1602 + }, + { + "epoch": 2.04992, + "grad_norm": 0.5973131656646729, + "learning_rate": 0.00025109705209314846, + "loss": 4.1781, + "step": 1603 + }, + { + "epoch": 2.0512, + "grad_norm": 0.5380710959434509, + "learning_rate": 0.0002510566698075111, + "loss": 4.1321, + "step": 1604 + }, + { + "epoch": 2.05248, + "grad_norm": 0.5838267207145691, + "learning_rate": 0.0002510162875218737, + "loss": 4.1575, + "step": 1605 + }, + { + "epoch": 2.05376, + "grad_norm": 0.5240848660469055, + "learning_rate": 0.00025097590523623635, + "loss": 4.0436, + "step": 1606 + }, + { + "epoch": 2.05504, + "grad_norm": 0.5829635858535767, + "learning_rate": 0.000250935522950599, + "loss": 4.1312, + "step": 1607 + }, + { + "epoch": 2.05632, + "grad_norm": 0.5419280529022217, + "learning_rate": 0.0002508951406649616, + "loss": 4.1518, + "step": 1608 + }, + { + "epoch": 2.0576, + "grad_norm": 0.5682242512702942, + "learning_rate": 0.00025085475837932424, + "loss": 4.2161, + "step": 1609 + }, + { + "epoch": 2.05888, + "grad_norm": 0.5263741612434387, + "learning_rate": 0.0002508143760936869, + "loss": 4.1819, + "step": 1610 + }, + { + "epoch": 2.06016, + "grad_norm": 0.5568137168884277, + "learning_rate": 0.0002507739938080495, + "loss": 4.0907, + "step": 1611 + }, + { + "epoch": 2.06144, + "grad_norm": 0.5448589324951172, + "learning_rate": 0.00025073361152241214, + "loss": 4.171, + "step": 1612 + }, + { + "epoch": 2.06272, + "grad_norm": 0.5299394726753235, + "learning_rate": 0.0002506932292367748, + "loss": 4.1997, + "step": 1613 + }, + { + "epoch": 2.064, + "grad_norm": 0.5272650122642517, + "learning_rate": 0.0002506528469511374, + "loss": 4.0966, + "step": 1614 + }, + { + "epoch": 2.06528, + "grad_norm": 0.5520989894866943, + "learning_rate": 0.0002506124646655, + "loss": 4.1154, + "step": 1615 + }, + { + "epoch": 2.06656, + "grad_norm": 0.5448061227798462, + "learning_rate": 0.00025057208237986266, + "loss": 4.096, + "step": 1616 + }, + { + "epoch": 2.06784, + "grad_norm": 0.5410912036895752, + "learning_rate": 0.0002505317000942253, + "loss": 4.1071, + "step": 1617 + }, + { + "epoch": 2.06912, + "grad_norm": 0.5440801978111267, + "learning_rate": 0.00025049131780858797, + "loss": 4.2481, + "step": 1618 + }, + { + "epoch": 2.0704, + "grad_norm": 0.5587443709373474, + "learning_rate": 0.0002504509355229506, + "loss": 4.1345, + "step": 1619 + }, + { + "epoch": 2.07168, + "grad_norm": 0.5543466210365295, + "learning_rate": 0.00025041055323731323, + "loss": 4.2023, + "step": 1620 + }, + { + "epoch": 2.07296, + "grad_norm": 0.5711597800254822, + "learning_rate": 0.0002503701709516758, + "loss": 4.1222, + "step": 1621 + }, + { + "epoch": 2.07424, + "grad_norm": 0.5779085159301758, + "learning_rate": 0.0002503297886660385, + "loss": 4.1628, + "step": 1622 + }, + { + "epoch": 2.07552, + "grad_norm": 0.5895453691482544, + "learning_rate": 0.0002502894063804011, + "loss": 4.1175, + "step": 1623 + }, + { + "epoch": 2.0768, + "grad_norm": 0.5698716640472412, + "learning_rate": 0.00025024902409476375, + "loss": 4.0683, + "step": 1624 + }, + { + "epoch": 2.07808, + "grad_norm": 0.6177151799201965, + "learning_rate": 0.0002502086418091264, + "loss": 4.2269, + "step": 1625 + }, + { + "epoch": 2.07936, + "grad_norm": 0.5702541470527649, + "learning_rate": 0.000250168259523489, + "loss": 4.1249, + "step": 1626 + }, + { + "epoch": 2.08064, + "grad_norm": 0.5631515383720398, + "learning_rate": 0.00025012787723785164, + "loss": 4.0755, + "step": 1627 + }, + { + "epoch": 2.08192, + "grad_norm": 0.5638665556907654, + "learning_rate": 0.00025008749495221427, + "loss": 4.1912, + "step": 1628 + }, + { + "epoch": 2.0832, + "grad_norm": 0.5572354793548584, + "learning_rate": 0.0002500471126665769, + "loss": 4.1421, + "step": 1629 + }, + { + "epoch": 2.08448, + "grad_norm": 0.5218389630317688, + "learning_rate": 0.00025000673038093953, + "loss": 4.1606, + "step": 1630 + }, + { + "epoch": 2.08576, + "grad_norm": 0.543541669845581, + "learning_rate": 0.00024996634809530216, + "loss": 4.2001, + "step": 1631 + }, + { + "epoch": 2.08704, + "grad_norm": 0.5522968173027039, + "learning_rate": 0.0002499259658096648, + "loss": 4.1279, + "step": 1632 + }, + { + "epoch": 2.08832, + "grad_norm": 0.5614569783210754, + "learning_rate": 0.0002498855835240274, + "loss": 4.057, + "step": 1633 + }, + { + "epoch": 2.0896, + "grad_norm": 0.5415524244308472, + "learning_rate": 0.00024984520123839005, + "loss": 4.2062, + "step": 1634 + }, + { + "epoch": 2.09088, + "grad_norm": 0.5276180505752563, + "learning_rate": 0.00024980481895275274, + "loss": 4.1219, + "step": 1635 + }, + { + "epoch": 2.09216, + "grad_norm": 0.5661503076553345, + "learning_rate": 0.00024976443666711537, + "loss": 4.0845, + "step": 1636 + }, + { + "epoch": 2.09344, + "grad_norm": 0.5094005465507507, + "learning_rate": 0.00024972405438147794, + "loss": 4.1161, + "step": 1637 + }, + { + "epoch": 2.09472, + "grad_norm": 0.5565866231918335, + "learning_rate": 0.00024968367209584057, + "loss": 4.1302, + "step": 1638 + }, + { + "epoch": 2.096, + "grad_norm": 0.5100904107093811, + "learning_rate": 0.00024964328981020326, + "loss": 4.189, + "step": 1639 + }, + { + "epoch": 2.09728, + "grad_norm": 0.5307742953300476, + "learning_rate": 0.0002496029075245659, + "loss": 4.0909, + "step": 1640 + }, + { + "epoch": 2.09856, + "grad_norm": 0.5522825717926025, + "learning_rate": 0.0002495625252389285, + "loss": 4.086, + "step": 1641 + }, + { + "epoch": 2.09984, + "grad_norm": 0.5281661152839661, + "learning_rate": 0.00024952214295329115, + "loss": 4.0938, + "step": 1642 + }, + { + "epoch": 2.10112, + "grad_norm": 0.5300430655479431, + "learning_rate": 0.0002494817606676538, + "loss": 4.2113, + "step": 1643 + }, + { + "epoch": 2.1024, + "grad_norm": 0.5836507081985474, + "learning_rate": 0.0002494413783820164, + "loss": 4.1544, + "step": 1644 + }, + { + "epoch": 2.1036799999999998, + "grad_norm": 0.5055437088012695, + "learning_rate": 0.00024940099609637904, + "loss": 4.1888, + "step": 1645 + }, + { + "epoch": 2.10496, + "grad_norm": 0.5788672566413879, + "learning_rate": 0.00024936061381074167, + "loss": 4.1254, + "step": 1646 + }, + { + "epoch": 2.10624, + "grad_norm": 0.5161530375480652, + "learning_rate": 0.0002493202315251043, + "loss": 4.1339, + "step": 1647 + }, + { + "epoch": 2.10752, + "grad_norm": 0.526464581489563, + "learning_rate": 0.0002492798492394669, + "loss": 4.0877, + "step": 1648 + }, + { + "epoch": 2.1088, + "grad_norm": 0.5108904242515564, + "learning_rate": 0.00024923946695382956, + "loss": 4.1818, + "step": 1649 + }, + { + "epoch": 2.11008, + "grad_norm": 0.5350533723831177, + "learning_rate": 0.0002491990846681922, + "loss": 4.1912, + "step": 1650 + }, + { + "epoch": 2.11136, + "grad_norm": 0.552305281162262, + "learning_rate": 0.0002491587023825548, + "loss": 4.0919, + "step": 1651 + }, + { + "epoch": 2.11264, + "grad_norm": 0.5402622818946838, + "learning_rate": 0.0002491183200969175, + "loss": 4.1347, + "step": 1652 + }, + { + "epoch": 2.11392, + "grad_norm": 0.5491945147514343, + "learning_rate": 0.00024907793781128013, + "loss": 4.1458, + "step": 1653 + }, + { + "epoch": 2.1152, + "grad_norm": 0.5369943380355835, + "learning_rate": 0.0002490375555256427, + "loss": 4.1845, + "step": 1654 + }, + { + "epoch": 2.11648, + "grad_norm": 0.5386082530021667, + "learning_rate": 0.00024899717324000534, + "loss": 4.1702, + "step": 1655 + }, + { + "epoch": 2.11776, + "grad_norm": 0.5493932366371155, + "learning_rate": 0.00024895679095436797, + "loss": 4.1743, + "step": 1656 + }, + { + "epoch": 2.11904, + "grad_norm": 0.5019392967224121, + "learning_rate": 0.00024891640866873065, + "loss": 4.0747, + "step": 1657 + }, + { + "epoch": 2.12032, + "grad_norm": 0.556978166103363, + "learning_rate": 0.0002488760263830933, + "loss": 4.1214, + "step": 1658 + }, + { + "epoch": 2.1216, + "grad_norm": 0.5479657053947449, + "learning_rate": 0.0002488356440974559, + "loss": 4.1133, + "step": 1659 + }, + { + "epoch": 2.12288, + "grad_norm": 0.5306862592697144, + "learning_rate": 0.0002487952618118185, + "loss": 4.1571, + "step": 1660 + }, + { + "epoch": 2.12416, + "grad_norm": 0.5123226046562195, + "learning_rate": 0.00024875487952618117, + "loss": 4.1351, + "step": 1661 + }, + { + "epoch": 2.12544, + "grad_norm": 0.5251739621162415, + "learning_rate": 0.0002487144972405438, + "loss": 4.1093, + "step": 1662 + }, + { + "epoch": 2.12672, + "grad_norm": 0.5215139389038086, + "learning_rate": 0.00024867411495490643, + "loss": 4.1659, + "step": 1663 + }, + { + "epoch": 2.128, + "grad_norm": 0.579883873462677, + "learning_rate": 0.00024863373266926906, + "loss": 4.153, + "step": 1664 + }, + { + "epoch": 2.12928, + "grad_norm": 0.5173061490058899, + "learning_rate": 0.0002485933503836317, + "loss": 4.1132, + "step": 1665 + }, + { + "epoch": 2.13056, + "grad_norm": 0.5922846794128418, + "learning_rate": 0.0002485529680979943, + "loss": 4.149, + "step": 1666 + }, + { + "epoch": 2.13184, + "grad_norm": 0.5453259348869324, + "learning_rate": 0.00024851258581235695, + "loss": 4.1019, + "step": 1667 + }, + { + "epoch": 2.13312, + "grad_norm": 0.5758420825004578, + "learning_rate": 0.0002484722035267196, + "loss": 4.1267, + "step": 1668 + }, + { + "epoch": 2.1344, + "grad_norm": 0.5560485124588013, + "learning_rate": 0.0002484318212410822, + "loss": 4.1299, + "step": 1669 + }, + { + "epoch": 2.13568, + "grad_norm": 0.5476799011230469, + "learning_rate": 0.00024839143895544484, + "loss": 4.0798, + "step": 1670 + }, + { + "epoch": 2.13696, + "grad_norm": 0.5895335078239441, + "learning_rate": 0.00024835105666980747, + "loss": 4.1217, + "step": 1671 + }, + { + "epoch": 2.13824, + "grad_norm": 0.5773458480834961, + "learning_rate": 0.0002483106743841701, + "loss": 4.1645, + "step": 1672 + }, + { + "epoch": 2.13952, + "grad_norm": 0.5620774626731873, + "learning_rate": 0.00024827029209853273, + "loss": 4.1691, + "step": 1673 + }, + { + "epoch": 2.1408, + "grad_norm": 0.5326398015022278, + "learning_rate": 0.0002482299098128954, + "loss": 4.1493, + "step": 1674 + }, + { + "epoch": 2.14208, + "grad_norm": 0.5294104218482971, + "learning_rate": 0.00024818952752725805, + "loss": 4.0135, + "step": 1675 + }, + { + "epoch": 2.14336, + "grad_norm": 0.6189349293708801, + "learning_rate": 0.0002481491452416207, + "loss": 4.1086, + "step": 1676 + }, + { + "epoch": 2.14464, + "grad_norm": 0.5574718713760376, + "learning_rate": 0.00024810876295598325, + "loss": 4.0684, + "step": 1677 + }, + { + "epoch": 2.14592, + "grad_norm": 0.5645029544830322, + "learning_rate": 0.00024806838067034594, + "loss": 4.0929, + "step": 1678 + }, + { + "epoch": 2.1471999999999998, + "grad_norm": 0.5400739312171936, + "learning_rate": 0.00024802799838470857, + "loss": 4.0444, + "step": 1679 + }, + { + "epoch": 2.14848, + "grad_norm": 0.5390148162841797, + "learning_rate": 0.0002479876160990712, + "loss": 4.1553, + "step": 1680 + }, + { + "epoch": 2.14976, + "grad_norm": 0.520582377910614, + "learning_rate": 0.00024794723381343383, + "loss": 4.0592, + "step": 1681 + }, + { + "epoch": 2.15104, + "grad_norm": 0.5372869968414307, + "learning_rate": 0.00024790685152779646, + "loss": 4.1065, + "step": 1682 + }, + { + "epoch": 2.15232, + "grad_norm": 0.510082483291626, + "learning_rate": 0.0002478664692421591, + "loss": 4.1625, + "step": 1683 + }, + { + "epoch": 2.1536, + "grad_norm": 0.5741236805915833, + "learning_rate": 0.0002478260869565217, + "loss": 4.0973, + "step": 1684 + }, + { + "epoch": 2.15488, + "grad_norm": 0.5604896545410156, + "learning_rate": 0.00024778570467088435, + "loss": 4.0886, + "step": 1685 + }, + { + "epoch": 2.15616, + "grad_norm": 0.5670208930969238, + "learning_rate": 0.000247745322385247, + "loss": 4.1643, + "step": 1686 + }, + { + "epoch": 2.15744, + "grad_norm": 0.5674409866333008, + "learning_rate": 0.0002477049400996096, + "loss": 4.1747, + "step": 1687 + }, + { + "epoch": 2.15872, + "grad_norm": 0.5394083857536316, + "learning_rate": 0.00024766455781397224, + "loss": 4.1235, + "step": 1688 + }, + { + "epoch": 2.16, + "grad_norm": 0.5426709651947021, + "learning_rate": 0.00024762417552833487, + "loss": 4.1005, + "step": 1689 + }, + { + "epoch": 2.16128, + "grad_norm": 0.5474469661712646, + "learning_rate": 0.0002475837932426975, + "loss": 4.1152, + "step": 1690 + }, + { + "epoch": 2.16256, + "grad_norm": 0.6519491076469421, + "learning_rate": 0.0002475434109570602, + "loss": 4.0955, + "step": 1691 + }, + { + "epoch": 2.16384, + "grad_norm": 0.5159724354743958, + "learning_rate": 0.0002475030286714228, + "loss": 4.1734, + "step": 1692 + }, + { + "epoch": 2.16512, + "grad_norm": 0.5642454028129578, + "learning_rate": 0.0002474626463857854, + "loss": 4.0931, + "step": 1693 + }, + { + "epoch": 2.1664, + "grad_norm": 0.5765841603279114, + "learning_rate": 0.000247422264100148, + "loss": 4.1322, + "step": 1694 + }, + { + "epoch": 2.16768, + "grad_norm": 0.5622481107711792, + "learning_rate": 0.0002473818818145107, + "loss": 4.1613, + "step": 1695 + }, + { + "epoch": 2.16896, + "grad_norm": 0.5255911350250244, + "learning_rate": 0.00024734149952887333, + "loss": 4.1129, + "step": 1696 + }, + { + "epoch": 2.17024, + "grad_norm": 0.5299469232559204, + "learning_rate": 0.00024730111724323596, + "loss": 4.0843, + "step": 1697 + }, + { + "epoch": 2.17152, + "grad_norm": 0.5808050632476807, + "learning_rate": 0.0002472607349575986, + "loss": 4.1286, + "step": 1698 + }, + { + "epoch": 2.1728, + "grad_norm": 0.563703179359436, + "learning_rate": 0.0002472203526719612, + "loss": 4.1928, + "step": 1699 + }, + { + "epoch": 2.17408, + "grad_norm": 0.5682193636894226, + "learning_rate": 0.00024717997038632385, + "loss": 4.1669, + "step": 1700 + }, + { + "epoch": 2.17536, + "grad_norm": 0.5745776295661926, + "learning_rate": 0.0002471395881006865, + "loss": 4.1129, + "step": 1701 + }, + { + "epoch": 2.17664, + "grad_norm": 0.5572461485862732, + "learning_rate": 0.0002470992058150491, + "loss": 4.1546, + "step": 1702 + }, + { + "epoch": 2.17792, + "grad_norm": 0.5437954664230347, + "learning_rate": 0.00024705882352941174, + "loss": 4.159, + "step": 1703 + }, + { + "epoch": 2.1792, + "grad_norm": 0.536284327507019, + "learning_rate": 0.0002470184412437744, + "loss": 4.1498, + "step": 1704 + }, + { + "epoch": 2.18048, + "grad_norm": 0.5438897609710693, + "learning_rate": 0.000246978058958137, + "loss": 4.0687, + "step": 1705 + }, + { + "epoch": 2.18176, + "grad_norm": 0.5237944722175598, + "learning_rate": 0.00024693767667249963, + "loss": 4.1393, + "step": 1706 + }, + { + "epoch": 2.18304, + "grad_norm": 0.5314120054244995, + "learning_rate": 0.00024689729438686226, + "loss": 4.1041, + "step": 1707 + }, + { + "epoch": 2.18432, + "grad_norm": 0.5380268096923828, + "learning_rate": 0.00024685691210122495, + "loss": 4.1046, + "step": 1708 + }, + { + "epoch": 2.1856, + "grad_norm": 0.5044991970062256, + "learning_rate": 0.0002468165298155875, + "loss": 4.0293, + "step": 1709 + }, + { + "epoch": 2.18688, + "grad_norm": 0.5603588223457336, + "learning_rate": 0.00024677614752995015, + "loss": 4.1426, + "step": 1710 + }, + { + "epoch": 2.18816, + "grad_norm": 0.5293896794319153, + "learning_rate": 0.0002467357652443128, + "loss": 4.0825, + "step": 1711 + }, + { + "epoch": 2.18944, + "grad_norm": 0.5398738980293274, + "learning_rate": 0.0002466953829586754, + "loss": 4.0774, + "step": 1712 + }, + { + "epoch": 2.19072, + "grad_norm": 0.5111704468727112, + "learning_rate": 0.0002466550006730381, + "loss": 4.0992, + "step": 1713 + }, + { + "epoch": 2.192, + "grad_norm": 0.5483059287071228, + "learning_rate": 0.00024661461838740073, + "loss": 4.0721, + "step": 1714 + }, + { + "epoch": 2.19328, + "grad_norm": 0.5132558345794678, + "learning_rate": 0.00024657423610176336, + "loss": 4.1077, + "step": 1715 + }, + { + "epoch": 2.19456, + "grad_norm": 0.5515888929367065, + "learning_rate": 0.00024653385381612593, + "loss": 4.0439, + "step": 1716 + }, + { + "epoch": 2.19584, + "grad_norm": 0.5087373852729797, + "learning_rate": 0.0002464934715304886, + "loss": 4.1587, + "step": 1717 + }, + { + "epoch": 2.19712, + "grad_norm": 0.5192369222640991, + "learning_rate": 0.00024645308924485125, + "loss": 4.0419, + "step": 1718 + }, + { + "epoch": 2.1984, + "grad_norm": 0.5411146283149719, + "learning_rate": 0.0002464127069592139, + "loss": 4.1674, + "step": 1719 + }, + { + "epoch": 2.19968, + "grad_norm": 0.5327039957046509, + "learning_rate": 0.0002463723246735765, + "loss": 4.1486, + "step": 1720 + }, + { + "epoch": 2.20096, + "grad_norm": 0.5581017732620239, + "learning_rate": 0.00024633194238793914, + "loss": 4.1585, + "step": 1721 + }, + { + "epoch": 2.20224, + "grad_norm": 0.5399036407470703, + "learning_rate": 0.00024629156010230177, + "loss": 4.1631, + "step": 1722 + }, + { + "epoch": 2.20352, + "grad_norm": 0.5582420229911804, + "learning_rate": 0.0002462511778166644, + "loss": 4.0919, + "step": 1723 + }, + { + "epoch": 2.2048, + "grad_norm": 0.5208436846733093, + "learning_rate": 0.00024621079553102703, + "loss": 4.1291, + "step": 1724 + }, + { + "epoch": 2.20608, + "grad_norm": 0.5868030190467834, + "learning_rate": 0.00024617041324538966, + "loss": 4.0925, + "step": 1725 + }, + { + "epoch": 2.20736, + "grad_norm": 0.5274373292922974, + "learning_rate": 0.0002461300309597523, + "loss": 4.1289, + "step": 1726 + }, + { + "epoch": 2.20864, + "grad_norm": 0.5727094411849976, + "learning_rate": 0.0002460896486741149, + "loss": 4.1407, + "step": 1727 + }, + { + "epoch": 2.20992, + "grad_norm": 0.5434361100196838, + "learning_rate": 0.00024604926638847755, + "loss": 4.1471, + "step": 1728 + }, + { + "epoch": 2.2112, + "grad_norm": 0.5813825726509094, + "learning_rate": 0.0002460088841028402, + "loss": 4.1329, + "step": 1729 + }, + { + "epoch": 2.2124800000000002, + "grad_norm": 0.5424474477767944, + "learning_rate": 0.00024596850181720286, + "loss": 4.0815, + "step": 1730 + }, + { + "epoch": 2.21376, + "grad_norm": 0.546615481376648, + "learning_rate": 0.0002459281195315655, + "loss": 4.1526, + "step": 1731 + }, + { + "epoch": 2.21504, + "grad_norm": 0.5413877964019775, + "learning_rate": 0.00024588773724592807, + "loss": 4.104, + "step": 1732 + }, + { + "epoch": 2.21632, + "grad_norm": 0.5641849040985107, + "learning_rate": 0.0002458473549602907, + "loss": 4.0903, + "step": 1733 + }, + { + "epoch": 2.2176, + "grad_norm": 0.6042072176933289, + "learning_rate": 0.0002458069726746534, + "loss": 4.1078, + "step": 1734 + }, + { + "epoch": 2.21888, + "grad_norm": 0.5650026202201843, + "learning_rate": 0.000245766590389016, + "loss": 4.1688, + "step": 1735 + }, + { + "epoch": 2.22016, + "grad_norm": 0.5781028866767883, + "learning_rate": 0.00024572620810337864, + "loss": 4.1266, + "step": 1736 + }, + { + "epoch": 2.22144, + "grad_norm": 0.5524300932884216, + "learning_rate": 0.0002456858258177413, + "loss": 4.1045, + "step": 1737 + }, + { + "epoch": 2.22272, + "grad_norm": 0.5217704772949219, + "learning_rate": 0.0002456454435321039, + "loss": 4.1377, + "step": 1738 + }, + { + "epoch": 2.224, + "grad_norm": 0.5524937510490417, + "learning_rate": 0.00024560506124646653, + "loss": 4.1003, + "step": 1739 + }, + { + "epoch": 2.22528, + "grad_norm": 0.5031275153160095, + "learning_rate": 0.00024556467896082916, + "loss": 4.1322, + "step": 1740 + }, + { + "epoch": 2.22656, + "grad_norm": 0.5842592716217041, + "learning_rate": 0.0002455242966751918, + "loss": 4.1308, + "step": 1741 + }, + { + "epoch": 2.22784, + "grad_norm": 0.5045456290245056, + "learning_rate": 0.0002454839143895544, + "loss": 4.1172, + "step": 1742 + }, + { + "epoch": 2.22912, + "grad_norm": 0.5572609901428223, + "learning_rate": 0.00024544353210391705, + "loss": 4.0625, + "step": 1743 + }, + { + "epoch": 2.2304, + "grad_norm": 0.5589746236801147, + "learning_rate": 0.0002454031498182797, + "loss": 4.0665, + "step": 1744 + }, + { + "epoch": 2.23168, + "grad_norm": 0.5695862174034119, + "learning_rate": 0.0002453627675326423, + "loss": 4.0537, + "step": 1745 + }, + { + "epoch": 2.23296, + "grad_norm": 0.5234766006469727, + "learning_rate": 0.00024532238524700494, + "loss": 4.1759, + "step": 1746 + }, + { + "epoch": 2.23424, + "grad_norm": 0.5754182934761047, + "learning_rate": 0.00024528200296136763, + "loss": 4.1404, + "step": 1747 + }, + { + "epoch": 2.23552, + "grad_norm": 0.5176681280136108, + "learning_rate": 0.00024524162067573026, + "loss": 4.1085, + "step": 1748 + }, + { + "epoch": 2.2368, + "grad_norm": 0.5456019043922424, + "learning_rate": 0.00024520123839009283, + "loss": 4.1433, + "step": 1749 + }, + { + "epoch": 2.23808, + "grad_norm": 0.5013742446899414, + "learning_rate": 0.00024516085610445547, + "loss": 4.1624, + "step": 1750 + }, + { + "epoch": 2.23936, + "grad_norm": 0.5228570103645325, + "learning_rate": 0.0002451204738188181, + "loss": 4.0994, + "step": 1751 + }, + { + "epoch": 2.24064, + "grad_norm": 0.5208329558372498, + "learning_rate": 0.0002450800915331808, + "loss": 4.1814, + "step": 1752 + }, + { + "epoch": 2.24192, + "grad_norm": 0.5297362804412842, + "learning_rate": 0.0002450397092475434, + "loss": 4.147, + "step": 1753 + }, + { + "epoch": 2.2432, + "grad_norm": 0.5192136764526367, + "learning_rate": 0.00024499932696190604, + "loss": 4.175, + "step": 1754 + }, + { + "epoch": 2.24448, + "grad_norm": 0.5594687461853027, + "learning_rate": 0.00024495894467626867, + "loss": 4.1391, + "step": 1755 + }, + { + "epoch": 2.24576, + "grad_norm": 0.5374172329902649, + "learning_rate": 0.0002449185623906313, + "loss": 4.1404, + "step": 1756 + }, + { + "epoch": 2.24704, + "grad_norm": 0.5575613975524902, + "learning_rate": 0.00024487818010499393, + "loss": 4.0406, + "step": 1757 + }, + { + "epoch": 2.24832, + "grad_norm": 0.507201075553894, + "learning_rate": 0.00024483779781935656, + "loss": 4.1444, + "step": 1758 + }, + { + "epoch": 2.2496, + "grad_norm": 0.5673658847808838, + "learning_rate": 0.0002447974155337192, + "loss": 4.0135, + "step": 1759 + }, + { + "epoch": 2.25088, + "grad_norm": 0.5208572149276733, + "learning_rate": 0.0002447570332480818, + "loss": 4.1486, + "step": 1760 + }, + { + "epoch": 2.25216, + "grad_norm": 0.5534422397613525, + "learning_rate": 0.00024471665096244445, + "loss": 4.0913, + "step": 1761 + }, + { + "epoch": 2.25344, + "grad_norm": 0.5177516937255859, + "learning_rate": 0.0002446762686768071, + "loss": 4.131, + "step": 1762 + }, + { + "epoch": 2.25472, + "grad_norm": 0.5717141032218933, + "learning_rate": 0.0002446358863911697, + "loss": 4.0954, + "step": 1763 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 0.5581514835357666, + "learning_rate": 0.00024459550410553234, + "loss": 4.1365, + "step": 1764 + }, + { + "epoch": 2.25728, + "grad_norm": 0.5282269716262817, + "learning_rate": 0.00024455512181989497, + "loss": 4.0941, + "step": 1765 + }, + { + "epoch": 2.25856, + "grad_norm": 0.5452956557273865, + "learning_rate": 0.0002445147395342576, + "loss": 4.1019, + "step": 1766 + }, + { + "epoch": 2.25984, + "grad_norm": 0.5826310515403748, + "learning_rate": 0.00024447435724862023, + "loss": 4.0641, + "step": 1767 + }, + { + "epoch": 2.26112, + "grad_norm": 0.5480028986930847, + "learning_rate": 0.00024443397496298286, + "loss": 4.096, + "step": 1768 + }, + { + "epoch": 2.2624, + "grad_norm": 0.5349941849708557, + "learning_rate": 0.00024439359267734554, + "loss": 4.088, + "step": 1769 + }, + { + "epoch": 2.26368, + "grad_norm": 0.5315197706222534, + "learning_rate": 0.0002443532103917082, + "loss": 4.1428, + "step": 1770 + }, + { + "epoch": 2.26496, + "grad_norm": 0.5485361218452454, + "learning_rate": 0.0002443128281060708, + "loss": 4.0701, + "step": 1771 + }, + { + "epoch": 2.26624, + "grad_norm": 0.49806830286979675, + "learning_rate": 0.0002442724458204334, + "loss": 4.0907, + "step": 1772 + }, + { + "epoch": 2.26752, + "grad_norm": 0.5614639520645142, + "learning_rate": 0.00024423206353479607, + "loss": 4.1116, + "step": 1773 + }, + { + "epoch": 2.2688, + "grad_norm": 0.5616679191589355, + "learning_rate": 0.0002441916812491587, + "loss": 4.1148, + "step": 1774 + }, + { + "epoch": 2.27008, + "grad_norm": 0.5605160593986511, + "learning_rate": 0.0002441512989635213, + "loss": 4.0161, + "step": 1775 + }, + { + "epoch": 2.27136, + "grad_norm": 0.5229089856147766, + "learning_rate": 0.00024411091667788393, + "loss": 4.1171, + "step": 1776 + }, + { + "epoch": 2.27264, + "grad_norm": 0.5960175395011902, + "learning_rate": 0.00024407053439224656, + "loss": 4.1207, + "step": 1777 + }, + { + "epoch": 2.27392, + "grad_norm": 0.5144199132919312, + "learning_rate": 0.00024403015210660922, + "loss": 4.162, + "step": 1778 + }, + { + "epoch": 2.2752, + "grad_norm": 0.5757052302360535, + "learning_rate": 0.00024398976982097185, + "loss": 4.129, + "step": 1779 + }, + { + "epoch": 2.27648, + "grad_norm": 0.5588791966438293, + "learning_rate": 0.00024394938753533448, + "loss": 4.0564, + "step": 1780 + }, + { + "epoch": 2.27776, + "grad_norm": 0.5604344606399536, + "learning_rate": 0.0002439090052496971, + "loss": 4.0195, + "step": 1781 + }, + { + "epoch": 2.27904, + "grad_norm": 0.5186293721199036, + "learning_rate": 0.00024386862296405976, + "loss": 4.0465, + "step": 1782 + }, + { + "epoch": 2.28032, + "grad_norm": 0.5566515922546387, + "learning_rate": 0.0002438282406784224, + "loss": 4.1026, + "step": 1783 + }, + { + "epoch": 2.2816, + "grad_norm": 0.5155622363090515, + "learning_rate": 0.000243787858392785, + "loss": 4.096, + "step": 1784 + }, + { + "epoch": 2.28288, + "grad_norm": 0.5197412967681885, + "learning_rate": 0.00024374747610714763, + "loss": 4.0739, + "step": 1785 + }, + { + "epoch": 2.28416, + "grad_norm": 0.5129163861274719, + "learning_rate": 0.00024370709382151028, + "loss": 4.054, + "step": 1786 + }, + { + "epoch": 2.28544, + "grad_norm": 0.4941207468509674, + "learning_rate": 0.0002436667115358729, + "loss": 4.0845, + "step": 1787 + }, + { + "epoch": 2.28672, + "grad_norm": 0.5023811459541321, + "learning_rate": 0.00024362632925023554, + "loss": 4.1381, + "step": 1788 + }, + { + "epoch": 2.288, + "grad_norm": 0.507765531539917, + "learning_rate": 0.00024358594696459817, + "loss": 4.0451, + "step": 1789 + }, + { + "epoch": 2.2892799999999998, + "grad_norm": 0.4993181526660919, + "learning_rate": 0.0002435455646789608, + "loss": 4.0173, + "step": 1790 + }, + { + "epoch": 2.29056, + "grad_norm": 0.5167925953865051, + "learning_rate": 0.00024350518239332346, + "loss": 4.1821, + "step": 1791 + }, + { + "epoch": 2.29184, + "grad_norm": 0.4934568703174591, + "learning_rate": 0.00024346480010768606, + "loss": 4.1292, + "step": 1792 + }, + { + "epoch": 2.29312, + "grad_norm": 0.5314911007881165, + "learning_rate": 0.0002434244178220487, + "loss": 4.1175, + "step": 1793 + }, + { + "epoch": 2.2944, + "grad_norm": 0.5263243913650513, + "learning_rate": 0.00024338403553641132, + "loss": 4.0788, + "step": 1794 + }, + { + "epoch": 2.29568, + "grad_norm": 0.5136982202529907, + "learning_rate": 0.00024334365325077398, + "loss": 4.0407, + "step": 1795 + }, + { + "epoch": 2.29696, + "grad_norm": 0.53715580701828, + "learning_rate": 0.0002433032709651366, + "loss": 4.0982, + "step": 1796 + }, + { + "epoch": 2.29824, + "grad_norm": 0.537679135799408, + "learning_rate": 0.00024326288867949924, + "loss": 4.0186, + "step": 1797 + }, + { + "epoch": 2.2995200000000002, + "grad_norm": 0.5566760301589966, + "learning_rate": 0.00024322250639386187, + "loss": 4.0081, + "step": 1798 + }, + { + "epoch": 2.3008, + "grad_norm": 0.5105803608894348, + "learning_rate": 0.00024318212410822453, + "loss": 4.1103, + "step": 1799 + }, + { + "epoch": 2.30208, + "grad_norm": 0.5302501916885376, + "learning_rate": 0.00024314174182258713, + "loss": 4.0884, + "step": 1800 + }, + { + "epoch": 2.30336, + "grad_norm": 0.5275909900665283, + "learning_rate": 0.00024310135953694976, + "loss": 4.1443, + "step": 1801 + }, + { + "epoch": 2.30464, + "grad_norm": 0.529685378074646, + "learning_rate": 0.0002430609772513124, + "loss": 4.0931, + "step": 1802 + }, + { + "epoch": 2.30592, + "grad_norm": 0.5178261995315552, + "learning_rate": 0.00024302059496567502, + "loss": 4.0161, + "step": 1803 + }, + { + "epoch": 2.3072, + "grad_norm": 0.5614741444587708, + "learning_rate": 0.00024298021268003768, + "loss": 4.0476, + "step": 1804 + }, + { + "epoch": 2.30848, + "grad_norm": 0.5194922685623169, + "learning_rate": 0.0002429398303944003, + "loss": 4.0889, + "step": 1805 + }, + { + "epoch": 2.30976, + "grad_norm": 0.5637295842170715, + "learning_rate": 0.00024289944810876294, + "loss": 4.1037, + "step": 1806 + }, + { + "epoch": 2.31104, + "grad_norm": 0.5420023202896118, + "learning_rate": 0.00024285906582312554, + "loss": 4.1075, + "step": 1807 + }, + { + "epoch": 2.31232, + "grad_norm": 0.5864022374153137, + "learning_rate": 0.0002428186835374882, + "loss": 4.0976, + "step": 1808 + }, + { + "epoch": 2.3136, + "grad_norm": 0.5702569484710693, + "learning_rate": 0.00024277830125185083, + "loss": 4.0542, + "step": 1809 + }, + { + "epoch": 2.31488, + "grad_norm": 0.5524796843528748, + "learning_rate": 0.00024273791896621346, + "loss": 4.0888, + "step": 1810 + }, + { + "epoch": 2.31616, + "grad_norm": 0.5307409167289734, + "learning_rate": 0.0002426975366805761, + "loss": 4.1252, + "step": 1811 + }, + { + "epoch": 2.31744, + "grad_norm": 0.543082594871521, + "learning_rate": 0.00024265715439493875, + "loss": 4.1532, + "step": 1812 + }, + { + "epoch": 2.31872, + "grad_norm": 0.5330712199211121, + "learning_rate": 0.00024261677210930138, + "loss": 4.1552, + "step": 1813 + }, + { + "epoch": 2.32, + "grad_norm": 0.5282273292541504, + "learning_rate": 0.000242576389823664, + "loss": 4.1571, + "step": 1814 + }, + { + "epoch": 2.32128, + "grad_norm": 0.5447540283203125, + "learning_rate": 0.0002425360075380266, + "loss": 4.1582, + "step": 1815 + }, + { + "epoch": 2.32256, + "grad_norm": 0.5365498065948486, + "learning_rate": 0.0002424956252523893, + "loss": 4.1033, + "step": 1816 + }, + { + "epoch": 2.32384, + "grad_norm": 0.5478320121765137, + "learning_rate": 0.0002424552429667519, + "loss": 4.1227, + "step": 1817 + }, + { + "epoch": 2.32512, + "grad_norm": 0.5289247035980225, + "learning_rate": 0.00024241486068111453, + "loss": 4.1199, + "step": 1818 + }, + { + "epoch": 2.3264, + "grad_norm": 0.5543386340141296, + "learning_rate": 0.00024237447839547716, + "loss": 4.0492, + "step": 1819 + }, + { + "epoch": 2.32768, + "grad_norm": 0.5105769634246826, + "learning_rate": 0.0002423340961098398, + "loss": 4.0887, + "step": 1820 + }, + { + "epoch": 2.32896, + "grad_norm": 0.5353228449821472, + "learning_rate": 0.00024229371382420244, + "loss": 3.9982, + "step": 1821 + }, + { + "epoch": 2.33024, + "grad_norm": 0.4944192171096802, + "learning_rate": 0.00024225333153856507, + "loss": 4.0972, + "step": 1822 + }, + { + "epoch": 2.33152, + "grad_norm": 0.5060207843780518, + "learning_rate": 0.00024221294925292768, + "loss": 4.0681, + "step": 1823 + }, + { + "epoch": 2.3327999999999998, + "grad_norm": 0.503966212272644, + "learning_rate": 0.0002421725669672903, + "loss": 4.095, + "step": 1824 + }, + { + "epoch": 2.33408, + "grad_norm": 0.5177589654922485, + "learning_rate": 0.00024213218468165296, + "loss": 4.048, + "step": 1825 + }, + { + "epoch": 2.33536, + "grad_norm": 0.513486921787262, + "learning_rate": 0.0002420918023960156, + "loss": 4.1617, + "step": 1826 + }, + { + "epoch": 2.33664, + "grad_norm": 0.5109826326370239, + "learning_rate": 0.00024205142011037822, + "loss": 4.1553, + "step": 1827 + }, + { + "epoch": 2.33792, + "grad_norm": 0.5415542125701904, + "learning_rate": 0.00024201103782474085, + "loss": 4.0252, + "step": 1828 + }, + { + "epoch": 2.3392, + "grad_norm": 0.5250722169876099, + "learning_rate": 0.0002419706555391035, + "loss": 4.1262, + "step": 1829 + }, + { + "epoch": 2.34048, + "grad_norm": 0.490961492061615, + "learning_rate": 0.00024193027325346614, + "loss": 4.1085, + "step": 1830 + }, + { + "epoch": 2.34176, + "grad_norm": 0.5075300931930542, + "learning_rate": 0.00024188989096782877, + "loss": 4.0679, + "step": 1831 + }, + { + "epoch": 2.3430400000000002, + "grad_norm": 0.5052638053894043, + "learning_rate": 0.00024184950868219138, + "loss": 4.0838, + "step": 1832 + }, + { + "epoch": 2.34432, + "grad_norm": 0.5473994016647339, + "learning_rate": 0.000241809126396554, + "loss": 4.0862, + "step": 1833 + }, + { + "epoch": 2.3456, + "grad_norm": 0.502493143081665, + "learning_rate": 0.00024176874411091666, + "loss": 4.0654, + "step": 1834 + }, + { + "epoch": 2.34688, + "grad_norm": 0.5568253397941589, + "learning_rate": 0.0002417283618252793, + "loss": 4.072, + "step": 1835 + }, + { + "epoch": 2.34816, + "grad_norm": 0.535667359828949, + "learning_rate": 0.00024168797953964192, + "loss": 4.1611, + "step": 1836 + }, + { + "epoch": 2.34944, + "grad_norm": 0.5129615068435669, + "learning_rate": 0.00024164759725400455, + "loss": 4.0566, + "step": 1837 + }, + { + "epoch": 2.35072, + "grad_norm": 0.5144100785255432, + "learning_rate": 0.0002416072149683672, + "loss": 4.2091, + "step": 1838 + }, + { + "epoch": 2.352, + "grad_norm": 0.500097393989563, + "learning_rate": 0.00024156683268272984, + "loss": 4.1066, + "step": 1839 + }, + { + "epoch": 2.35328, + "grad_norm": 0.520469069480896, + "learning_rate": 0.00024152645039709244, + "loss": 4.0778, + "step": 1840 + }, + { + "epoch": 2.35456, + "grad_norm": 0.5127030611038208, + "learning_rate": 0.00024148606811145507, + "loss": 4.1135, + "step": 1841 + }, + { + "epoch": 2.35584, + "grad_norm": 0.540470540523529, + "learning_rate": 0.00024144568582581773, + "loss": 4.0279, + "step": 1842 + }, + { + "epoch": 2.35712, + "grad_norm": 0.5427557826042175, + "learning_rate": 0.00024140530354018036, + "loss": 4.0734, + "step": 1843 + }, + { + "epoch": 2.3584, + "grad_norm": 0.5099870562553406, + "learning_rate": 0.000241364921254543, + "loss": 4.0705, + "step": 1844 + }, + { + "epoch": 2.35968, + "grad_norm": 0.5234814882278442, + "learning_rate": 0.00024132453896890562, + "loss": 4.0138, + "step": 1845 + }, + { + "epoch": 2.36096, + "grad_norm": 0.5056787729263306, + "learning_rate": 0.00024128415668326825, + "loss": 4.1004, + "step": 1846 + }, + { + "epoch": 2.36224, + "grad_norm": 0.5264895558357239, + "learning_rate": 0.0002412437743976309, + "loss": 4.0975, + "step": 1847 + }, + { + "epoch": 2.36352, + "grad_norm": 0.5280450582504272, + "learning_rate": 0.0002412033921119935, + "loss": 4.0805, + "step": 1848 + }, + { + "epoch": 2.3648, + "grad_norm": 0.509524941444397, + "learning_rate": 0.00024116300982635614, + "loss": 4.0108, + "step": 1849 + }, + { + "epoch": 2.36608, + "grad_norm": 0.5179848074913025, + "learning_rate": 0.00024112262754071877, + "loss": 4.0785, + "step": 1850 + }, + { + "epoch": 2.36736, + "grad_norm": 0.5557752847671509, + "learning_rate": 0.00024108224525508143, + "loss": 4.1493, + "step": 1851 + }, + { + "epoch": 2.36864, + "grad_norm": 0.5200245976448059, + "learning_rate": 0.00024104186296944406, + "loss": 4.0801, + "step": 1852 + }, + { + "epoch": 2.36992, + "grad_norm": 0.5246480703353882, + "learning_rate": 0.0002410014806838067, + "loss": 4.0722, + "step": 1853 + }, + { + "epoch": 2.3712, + "grad_norm": 0.5433456301689148, + "learning_rate": 0.00024096109839816932, + "loss": 4.1251, + "step": 1854 + }, + { + "epoch": 2.37248, + "grad_norm": 0.5170203447341919, + "learning_rate": 0.00024092071611253198, + "loss": 4.0856, + "step": 1855 + }, + { + "epoch": 2.37376, + "grad_norm": 0.5600740313529968, + "learning_rate": 0.00024088033382689458, + "loss": 4.0519, + "step": 1856 + }, + { + "epoch": 2.37504, + "grad_norm": 0.549588680267334, + "learning_rate": 0.0002408399515412572, + "loss": 4.0641, + "step": 1857 + }, + { + "epoch": 2.3763199999999998, + "grad_norm": 0.5169657468795776, + "learning_rate": 0.00024079956925561984, + "loss": 4.0794, + "step": 1858 + }, + { + "epoch": 2.3776, + "grad_norm": 0.5614109039306641, + "learning_rate": 0.00024075918696998247, + "loss": 4.1284, + "step": 1859 + }, + { + "epoch": 2.37888, + "grad_norm": 0.5183959603309631, + "learning_rate": 0.00024071880468434513, + "loss": 4.133, + "step": 1860 + }, + { + "epoch": 2.38016, + "grad_norm": 0.584951639175415, + "learning_rate": 0.00024067842239870776, + "loss": 4.0667, + "step": 1861 + }, + { + "epoch": 2.38144, + "grad_norm": 0.5184288620948792, + "learning_rate": 0.00024063804011307039, + "loss": 4.0535, + "step": 1862 + }, + { + "epoch": 2.38272, + "grad_norm": 0.558650016784668, + "learning_rate": 0.000240597657827433, + "loss": 4.0802, + "step": 1863 + }, + { + "epoch": 2.384, + "grad_norm": 0.5584487318992615, + "learning_rate": 0.00024055727554179565, + "loss": 4.1414, + "step": 1864 + }, + { + "epoch": 2.38528, + "grad_norm": 0.5305871963500977, + "learning_rate": 0.00024051689325615828, + "loss": 4.0945, + "step": 1865 + }, + { + "epoch": 2.3865600000000002, + "grad_norm": 0.5361427664756775, + "learning_rate": 0.0002404765109705209, + "loss": 4.0935, + "step": 1866 + }, + { + "epoch": 2.38784, + "grad_norm": 0.49007391929626465, + "learning_rate": 0.00024043612868488354, + "loss": 4.1032, + "step": 1867 + }, + { + "epoch": 2.38912, + "grad_norm": 0.5409270524978638, + "learning_rate": 0.0002403957463992462, + "loss": 4.0758, + "step": 1868 + }, + { + "epoch": 2.3904, + "grad_norm": 0.5215279459953308, + "learning_rate": 0.00024035536411360882, + "loss": 4.0599, + "step": 1869 + }, + { + "epoch": 2.39168, + "grad_norm": 0.5199880599975586, + "learning_rate": 0.00024031498182797145, + "loss": 4.0554, + "step": 1870 + }, + { + "epoch": 2.39296, + "grad_norm": 0.5253363251686096, + "learning_rate": 0.00024027459954233406, + "loss": 4.1072, + "step": 1871 + }, + { + "epoch": 2.39424, + "grad_norm": 0.5469797253608704, + "learning_rate": 0.0002402342172566967, + "loss": 4.0496, + "step": 1872 + }, + { + "epoch": 2.39552, + "grad_norm": 0.5029854774475098, + "learning_rate": 0.00024019383497105934, + "loss": 4.0267, + "step": 1873 + }, + { + "epoch": 2.3968, + "grad_norm": 0.5808607339859009, + "learning_rate": 0.00024015345268542197, + "loss": 4.0903, + "step": 1874 + }, + { + "epoch": 2.39808, + "grad_norm": 0.4929969906806946, + "learning_rate": 0.0002401130703997846, + "loss": 4.0763, + "step": 1875 + }, + { + "epoch": 2.39936, + "grad_norm": 0.5704619288444519, + "learning_rate": 0.00024007268811414723, + "loss": 4.0923, + "step": 1876 + }, + { + "epoch": 2.40064, + "grad_norm": 0.5273481607437134, + "learning_rate": 0.0002400323058285099, + "loss": 4.0345, + "step": 1877 + }, + { + "epoch": 2.40192, + "grad_norm": 0.5336358547210693, + "learning_rate": 0.00023999192354287252, + "loss": 4.1127, + "step": 1878 + }, + { + "epoch": 2.4032, + "grad_norm": 0.5384494662284851, + "learning_rate": 0.00023995154125723512, + "loss": 4.131, + "step": 1879 + }, + { + "epoch": 2.40448, + "grad_norm": 0.5182468891143799, + "learning_rate": 0.00023991115897159775, + "loss": 4.088, + "step": 1880 + }, + { + "epoch": 2.40576, + "grad_norm": 0.54859459400177, + "learning_rate": 0.0002398707766859604, + "loss": 4.0991, + "step": 1881 + }, + { + "epoch": 2.40704, + "grad_norm": 0.5565007925033569, + "learning_rate": 0.00023983039440032304, + "loss": 4.0784, + "step": 1882 + }, + { + "epoch": 2.40832, + "grad_norm": 0.5592086315155029, + "learning_rate": 0.00023979001211468567, + "loss": 4.126, + "step": 1883 + }, + { + "epoch": 2.4096, + "grad_norm": 0.5493931174278259, + "learning_rate": 0.0002397496298290483, + "loss": 4.1278, + "step": 1884 + }, + { + "epoch": 2.41088, + "grad_norm": 0.5680795311927795, + "learning_rate": 0.00023970924754341093, + "loss": 4.1366, + "step": 1885 + }, + { + "epoch": 2.41216, + "grad_norm": 0.5174584984779358, + "learning_rate": 0.0002396688652577736, + "loss": 4.0928, + "step": 1886 + }, + { + "epoch": 2.41344, + "grad_norm": 0.5714235305786133, + "learning_rate": 0.0002396284829721362, + "loss": 4.1146, + "step": 1887 + }, + { + "epoch": 2.41472, + "grad_norm": 0.5287262201309204, + "learning_rate": 0.00023958810068649882, + "loss": 4.1175, + "step": 1888 + }, + { + "epoch": 2.416, + "grad_norm": 0.5177103877067566, + "learning_rate": 0.00023954771840086145, + "loss": 4.0145, + "step": 1889 + }, + { + "epoch": 2.41728, + "grad_norm": 0.5252452492713928, + "learning_rate": 0.0002395073361152241, + "loss": 4.0529, + "step": 1890 + }, + { + "epoch": 2.41856, + "grad_norm": 0.5262645483016968, + "learning_rate": 0.00023946695382958674, + "loss": 4.0403, + "step": 1891 + }, + { + "epoch": 2.4198399999999998, + "grad_norm": 0.5197089910507202, + "learning_rate": 0.00023942657154394937, + "loss": 4.0744, + "step": 1892 + }, + { + "epoch": 2.42112, + "grad_norm": 0.512195885181427, + "learning_rate": 0.000239386189258312, + "loss": 4.0726, + "step": 1893 + }, + { + "epoch": 2.4224, + "grad_norm": 0.5221033692359924, + "learning_rate": 0.00023934580697267466, + "loss": 4.1091, + "step": 1894 + }, + { + "epoch": 2.42368, + "grad_norm": 0.5204744338989258, + "learning_rate": 0.00023930542468703726, + "loss": 4.0276, + "step": 1895 + }, + { + "epoch": 2.42496, + "grad_norm": 0.5527137517929077, + "learning_rate": 0.0002392650424013999, + "loss": 4.0472, + "step": 1896 + }, + { + "epoch": 2.42624, + "grad_norm": 0.5223075151443481, + "learning_rate": 0.00023922466011576252, + "loss": 4.0678, + "step": 1897 + }, + { + "epoch": 2.42752, + "grad_norm": 0.5136579871177673, + "learning_rate": 0.00023918427783012515, + "loss": 3.968, + "step": 1898 + }, + { + "epoch": 2.4288, + "grad_norm": 0.5277352929115295, + "learning_rate": 0.0002391438955444878, + "loss": 4.0798, + "step": 1899 + }, + { + "epoch": 2.4300800000000002, + "grad_norm": 0.524479329586029, + "learning_rate": 0.00023910351325885044, + "loss": 4.0096, + "step": 1900 + }, + { + "epoch": 2.43136, + "grad_norm": 0.5202826261520386, + "learning_rate": 0.00023906313097321307, + "loss": 4.0303, + "step": 1901 + }, + { + "epoch": 2.43264, + "grad_norm": 0.5311558842658997, + "learning_rate": 0.00023902274868757567, + "loss": 4.1015, + "step": 1902 + }, + { + "epoch": 2.43392, + "grad_norm": 0.5515623092651367, + "learning_rate": 0.00023898236640193835, + "loss": 4.0423, + "step": 1903 + }, + { + "epoch": 2.4352, + "grad_norm": 0.55174320936203, + "learning_rate": 0.00023894198411630096, + "loss": 4.0474, + "step": 1904 + }, + { + "epoch": 2.43648, + "grad_norm": 0.5991604924201965, + "learning_rate": 0.0002389016018306636, + "loss": 4.1282, + "step": 1905 + }, + { + "epoch": 2.43776, + "grad_norm": 0.5626751780509949, + "learning_rate": 0.00023886121954502622, + "loss": 4.0855, + "step": 1906 + }, + { + "epoch": 2.43904, + "grad_norm": 0.5392490029335022, + "learning_rate": 0.00023882083725938887, + "loss": 4.1173, + "step": 1907 + }, + { + "epoch": 2.44032, + "grad_norm": 0.5552417635917664, + "learning_rate": 0.0002387804549737515, + "loss": 4.123, + "step": 1908 + }, + { + "epoch": 2.4416, + "grad_norm": 0.5797650218009949, + "learning_rate": 0.00023874007268811413, + "loss": 4.0547, + "step": 1909 + }, + { + "epoch": 2.44288, + "grad_norm": 0.5046030282974243, + "learning_rate": 0.00023869969040247674, + "loss": 4.1031, + "step": 1910 + }, + { + "epoch": 2.44416, + "grad_norm": 0.6118117570877075, + "learning_rate": 0.00023865930811683937, + "loss": 4.0797, + "step": 1911 + }, + { + "epoch": 2.44544, + "grad_norm": 0.5165206789970398, + "learning_rate": 0.00023861892583120203, + "loss": 4.0243, + "step": 1912 + }, + { + "epoch": 2.44672, + "grad_norm": 0.5483618378639221, + "learning_rate": 0.00023857854354556466, + "loss": 4.0234, + "step": 1913 + }, + { + "epoch": 2.448, + "grad_norm": 0.5128400325775146, + "learning_rate": 0.00023853816125992729, + "loss": 3.9816, + "step": 1914 + }, + { + "epoch": 2.44928, + "grad_norm": 0.5472554564476013, + "learning_rate": 0.00023849777897428992, + "loss": 4.0127, + "step": 1915 + }, + { + "epoch": 2.45056, + "grad_norm": 0.5363740921020508, + "learning_rate": 0.00023845739668865257, + "loss": 4.1075, + "step": 1916 + }, + { + "epoch": 2.45184, + "grad_norm": 0.5734500288963318, + "learning_rate": 0.0002384170144030152, + "loss": 4.0291, + "step": 1917 + }, + { + "epoch": 2.45312, + "grad_norm": 0.5291422009468079, + "learning_rate": 0.00023837663211737783, + "loss": 4.119, + "step": 1918 + }, + { + "epoch": 2.4544, + "grad_norm": 0.541768491268158, + "learning_rate": 0.00023833624983174044, + "loss": 4.0881, + "step": 1919 + }, + { + "epoch": 2.45568, + "grad_norm": 0.5272345542907715, + "learning_rate": 0.0002382958675461031, + "loss": 4.0666, + "step": 1920 + }, + { + "epoch": 2.45696, + "grad_norm": 0.5372911095619202, + "learning_rate": 0.00023825548526046572, + "loss": 4.0557, + "step": 1921 + }, + { + "epoch": 2.45824, + "grad_norm": 0.5213048458099365, + "learning_rate": 0.00023821510297482835, + "loss": 4.0735, + "step": 1922 + }, + { + "epoch": 2.45952, + "grad_norm": 0.5202589631080627, + "learning_rate": 0.00023817472068919098, + "loss": 4.0209, + "step": 1923 + }, + { + "epoch": 2.4608, + "grad_norm": 0.4860571026802063, + "learning_rate": 0.0002381343384035536, + "loss": 4.1663, + "step": 1924 + }, + { + "epoch": 2.46208, + "grad_norm": 0.5281109809875488, + "learning_rate": 0.00023809395611791627, + "loss": 4.0593, + "step": 1925 + }, + { + "epoch": 2.4633599999999998, + "grad_norm": 0.508752167224884, + "learning_rate": 0.0002380535738322789, + "loss": 4.0568, + "step": 1926 + }, + { + "epoch": 2.46464, + "grad_norm": 0.5302470326423645, + "learning_rate": 0.0002380131915466415, + "loss": 4.0717, + "step": 1927 + }, + { + "epoch": 2.46592, + "grad_norm": 0.5096002817153931, + "learning_rate": 0.00023797280926100413, + "loss": 4.0652, + "step": 1928 + }, + { + "epoch": 2.4672, + "grad_norm": 0.5053885579109192, + "learning_rate": 0.0002379324269753668, + "loss": 4.1062, + "step": 1929 + }, + { + "epoch": 2.46848, + "grad_norm": 0.5277340412139893, + "learning_rate": 0.00023789204468972942, + "loss": 4.1013, + "step": 1930 + }, + { + "epoch": 2.46976, + "grad_norm": 0.5312148928642273, + "learning_rate": 0.00023785166240409205, + "loss": 4.0802, + "step": 1931 + }, + { + "epoch": 2.47104, + "grad_norm": 0.5116835832595825, + "learning_rate": 0.00023781128011845468, + "loss": 3.9968, + "step": 1932 + }, + { + "epoch": 2.47232, + "grad_norm": 0.5169728398323059, + "learning_rate": 0.00023777089783281734, + "loss": 4.0566, + "step": 1933 + }, + { + "epoch": 2.4736000000000002, + "grad_norm": 0.5306958556175232, + "learning_rate": 0.00023773051554717997, + "loss": 4.0178, + "step": 1934 + }, + { + "epoch": 2.47488, + "grad_norm": 0.5176283717155457, + "learning_rate": 0.00023769013326154257, + "loss": 4.0897, + "step": 1935 + }, + { + "epoch": 2.47616, + "grad_norm": 0.5138524174690247, + "learning_rate": 0.0002376497509759052, + "loss": 4.0544, + "step": 1936 + }, + { + "epoch": 2.47744, + "grad_norm": 0.5273920297622681, + "learning_rate": 0.00023760936869026783, + "loss": 4.0539, + "step": 1937 + }, + { + "epoch": 2.47872, + "grad_norm": 0.5098421573638916, + "learning_rate": 0.0002375689864046305, + "loss": 4.0386, + "step": 1938 + }, + { + "epoch": 2.48, + "grad_norm": 0.5550551414489746, + "learning_rate": 0.00023752860411899312, + "loss": 4.0598, + "step": 1939 + }, + { + "epoch": 2.48128, + "grad_norm": 0.5031646490097046, + "learning_rate": 0.00023748822183335575, + "loss": 4.1084, + "step": 1940 + }, + { + "epoch": 2.48256, + "grad_norm": 0.5071708559989929, + "learning_rate": 0.00023744783954771838, + "loss": 4.07, + "step": 1941 + }, + { + "epoch": 2.48384, + "grad_norm": 0.5383012890815735, + "learning_rate": 0.00023740745726208104, + "loss": 4.0044, + "step": 1942 + }, + { + "epoch": 2.48512, + "grad_norm": 0.5164763927459717, + "learning_rate": 0.00023736707497644364, + "loss": 4.1217, + "step": 1943 + }, + { + "epoch": 2.4864, + "grad_norm": 0.531086266040802, + "learning_rate": 0.00023732669269080627, + "loss": 4.055, + "step": 1944 + }, + { + "epoch": 2.48768, + "grad_norm": 0.5626212358474731, + "learning_rate": 0.0002372863104051689, + "loss": 3.9839, + "step": 1945 + }, + { + "epoch": 2.48896, + "grad_norm": 0.48967134952545166, + "learning_rate": 0.00023724592811953156, + "loss": 4.0099, + "step": 1946 + }, + { + "epoch": 2.49024, + "grad_norm": 0.4974314272403717, + "learning_rate": 0.00023720554583389419, + "loss": 4.0544, + "step": 1947 + }, + { + "epoch": 2.49152, + "grad_norm": 0.5294041037559509, + "learning_rate": 0.00023716516354825682, + "loss": 4.0783, + "step": 1948 + }, + { + "epoch": 2.4928, + "grad_norm": 0.5068480372428894, + "learning_rate": 0.00023712478126261945, + "loss": 4.0757, + "step": 1949 + }, + { + "epoch": 2.49408, + "grad_norm": 0.4977387487888336, + "learning_rate": 0.0002370843989769821, + "loss": 4.1196, + "step": 1950 + }, + { + "epoch": 2.49536, + "grad_norm": 0.4864615797996521, + "learning_rate": 0.0002370440166913447, + "loss": 4.0737, + "step": 1951 + }, + { + "epoch": 2.49664, + "grad_norm": 0.5214549899101257, + "learning_rate": 0.00023700363440570734, + "loss": 4.0846, + "step": 1952 + }, + { + "epoch": 2.49792, + "grad_norm": 0.538516640663147, + "learning_rate": 0.00023696325212006997, + "loss": 4.0236, + "step": 1953 + }, + { + "epoch": 2.4992, + "grad_norm": 0.5184758901596069, + "learning_rate": 0.0002369228698344326, + "loss": 4.0424, + "step": 1954 + }, + { + "epoch": 2.50048, + "grad_norm": 0.512209415435791, + "learning_rate": 0.00023688248754879525, + "loss": 4.0921, + "step": 1955 + }, + { + "epoch": 2.50176, + "grad_norm": 0.5255154371261597, + "learning_rate": 0.00023684210526315788, + "loss": 4.1313, + "step": 1956 + }, + { + "epoch": 2.50304, + "grad_norm": 0.5283589959144592, + "learning_rate": 0.00023680172297752051, + "loss": 4.0266, + "step": 1957 + }, + { + "epoch": 2.50432, + "grad_norm": 0.5340486764907837, + "learning_rate": 0.00023676134069188312, + "loss": 4.1258, + "step": 1958 + }, + { + "epoch": 2.5056000000000003, + "grad_norm": 0.5446499586105347, + "learning_rate": 0.00023672095840624577, + "loss": 4.0665, + "step": 1959 + }, + { + "epoch": 2.5068799999999998, + "grad_norm": 0.4942466914653778, + "learning_rate": 0.0002366805761206084, + "loss": 4.0474, + "step": 1960 + }, + { + "epoch": 2.50816, + "grad_norm": 0.5173544883728027, + "learning_rate": 0.00023664019383497103, + "loss": 3.9846, + "step": 1961 + }, + { + "epoch": 2.50944, + "grad_norm": 0.5310766100883484, + "learning_rate": 0.00023659981154933366, + "loss": 4.0044, + "step": 1962 + }, + { + "epoch": 2.51072, + "grad_norm": 0.5134363770484924, + "learning_rate": 0.00023655942926369632, + "loss": 4.0734, + "step": 1963 + }, + { + "epoch": 2.512, + "grad_norm": 0.5086606740951538, + "learning_rate": 0.00023651904697805895, + "loss": 4.0758, + "step": 1964 + }, + { + "epoch": 2.51328, + "grad_norm": 0.5497402548789978, + "learning_rate": 0.00023647866469242158, + "loss": 4.075, + "step": 1965 + }, + { + "epoch": 2.51456, + "grad_norm": 0.5068183541297913, + "learning_rate": 0.00023643828240678418, + "loss": 4.0666, + "step": 1966 + }, + { + "epoch": 2.51584, + "grad_norm": 0.5550326704978943, + "learning_rate": 0.00023639790012114681, + "loss": 4.0539, + "step": 1967 + }, + { + "epoch": 2.5171200000000002, + "grad_norm": 0.49992436170578003, + "learning_rate": 0.00023635751783550947, + "loss": 4.0962, + "step": 1968 + }, + { + "epoch": 2.5183999999999997, + "grad_norm": 0.5428693294525146, + "learning_rate": 0.0002363171355498721, + "loss": 3.9748, + "step": 1969 + }, + { + "epoch": 2.51968, + "grad_norm": 0.5194831490516663, + "learning_rate": 0.00023627675326423473, + "loss": 4.1121, + "step": 1970 + }, + { + "epoch": 2.52096, + "grad_norm": 0.5650715827941895, + "learning_rate": 0.00023623637097859736, + "loss": 4.0938, + "step": 1971 + }, + { + "epoch": 2.52224, + "grad_norm": 0.5177878141403198, + "learning_rate": 0.00023619598869296002, + "loss": 4.0946, + "step": 1972 + }, + { + "epoch": 2.52352, + "grad_norm": 0.5797203183174133, + "learning_rate": 0.00023615560640732265, + "loss": 4.0559, + "step": 1973 + }, + { + "epoch": 2.5248, + "grad_norm": 0.5099804997444153, + "learning_rate": 0.00023611522412168525, + "loss": 4.0196, + "step": 1974 + }, + { + "epoch": 2.52608, + "grad_norm": 0.5663504004478455, + "learning_rate": 0.00023607484183604788, + "loss": 4.0268, + "step": 1975 + }, + { + "epoch": 2.52736, + "grad_norm": 0.4963954985141754, + "learning_rate": 0.00023603445955041054, + "loss": 4.0723, + "step": 1976 + }, + { + "epoch": 2.52864, + "grad_norm": 0.5366328954696655, + "learning_rate": 0.00023599407726477317, + "loss": 4.0549, + "step": 1977 + }, + { + "epoch": 2.5299199999999997, + "grad_norm": 0.515116274356842, + "learning_rate": 0.0002359536949791358, + "loss": 4.0863, + "step": 1978 + }, + { + "epoch": 2.5312, + "grad_norm": 0.5633923411369324, + "learning_rate": 0.00023591331269349843, + "loss": 4.0656, + "step": 1979 + }, + { + "epoch": 2.53248, + "grad_norm": 0.515352725982666, + "learning_rate": 0.00023587293040786106, + "loss": 4.0995, + "step": 1980 + }, + { + "epoch": 2.53376, + "grad_norm": 0.5454360842704773, + "learning_rate": 0.00023583254812222372, + "loss": 4.0187, + "step": 1981 + }, + { + "epoch": 2.53504, + "grad_norm": 0.5400582551956177, + "learning_rate": 0.00023579216583658632, + "loss": 4.0149, + "step": 1982 + }, + { + "epoch": 2.53632, + "grad_norm": 0.5306532382965088, + "learning_rate": 0.00023575178355094895, + "loss": 4.0082, + "step": 1983 + }, + { + "epoch": 2.5376, + "grad_norm": 0.535999059677124, + "learning_rate": 0.00023571140126531158, + "loss": 4.0181, + "step": 1984 + }, + { + "epoch": 2.53888, + "grad_norm": 0.5338032245635986, + "learning_rate": 0.00023567101897967424, + "loss": 3.9664, + "step": 1985 + }, + { + "epoch": 2.54016, + "grad_norm": 0.5602062344551086, + "learning_rate": 0.00023563063669403687, + "loss": 4.0308, + "step": 1986 + }, + { + "epoch": 2.54144, + "grad_norm": 0.5280570387840271, + "learning_rate": 0.0002355902544083995, + "loss": 4.0465, + "step": 1987 + }, + { + "epoch": 2.54272, + "grad_norm": 0.5154047608375549, + "learning_rate": 0.00023554987212276213, + "loss": 4.0587, + "step": 1988 + }, + { + "epoch": 2.544, + "grad_norm": 0.551994800567627, + "learning_rate": 0.00023550948983712478, + "loss": 4.0683, + "step": 1989 + }, + { + "epoch": 2.54528, + "grad_norm": 0.5081275105476379, + "learning_rate": 0.00023546910755148741, + "loss": 4.0972, + "step": 1990 + }, + { + "epoch": 2.54656, + "grad_norm": 0.5598514676094055, + "learning_rate": 0.00023542872526585002, + "loss": 4.0323, + "step": 1991 + }, + { + "epoch": 2.54784, + "grad_norm": 0.48178520798683167, + "learning_rate": 0.00023538834298021265, + "loss": 3.9906, + "step": 1992 + }, + { + "epoch": 2.5491200000000003, + "grad_norm": 0.558735191822052, + "learning_rate": 0.00023534796069457528, + "loss": 4.0245, + "step": 1993 + }, + { + "epoch": 2.5504, + "grad_norm": 0.540560781955719, + "learning_rate": 0.00023530757840893794, + "loss": 4.1077, + "step": 1994 + }, + { + "epoch": 2.55168, + "grad_norm": 0.5557510852813721, + "learning_rate": 0.00023526719612330057, + "loss": 4.0786, + "step": 1995 + }, + { + "epoch": 2.55296, + "grad_norm": 0.5289320945739746, + "learning_rate": 0.0002352268138376632, + "loss": 4.0445, + "step": 1996 + }, + { + "epoch": 2.55424, + "grad_norm": 0.5117323398590088, + "learning_rate": 0.0002351864315520258, + "loss": 4.0146, + "step": 1997 + }, + { + "epoch": 2.55552, + "grad_norm": 0.525393009185791, + "learning_rate": 0.00023514604926638848, + "loss": 4.1132, + "step": 1998 + }, + { + "epoch": 2.5568, + "grad_norm": 0.5251143574714661, + "learning_rate": 0.00023510566698075109, + "loss": 4.0507, + "step": 1999 + }, + { + "epoch": 2.55808, + "grad_norm": 0.5183683633804321, + "learning_rate": 0.00023506528469511372, + "loss": 4.0713, + "step": 2000 + }, + { + "epoch": 2.55936, + "grad_norm": 0.5445199012756348, + "learning_rate": 0.00023502490240947635, + "loss": 4.0467, + "step": 2001 + }, + { + "epoch": 2.5606400000000002, + "grad_norm": 0.5163613557815552, + "learning_rate": 0.000234984520123839, + "loss": 4.0313, + "step": 2002 + }, + { + "epoch": 2.5619199999999998, + "grad_norm": 0.5514837503433228, + "learning_rate": 0.00023494413783820163, + "loss": 4.03, + "step": 2003 + }, + { + "epoch": 2.5632, + "grad_norm": 0.527066707611084, + "learning_rate": 0.00023490375555256426, + "loss": 4.097, + "step": 2004 + }, + { + "epoch": 2.56448, + "grad_norm": 0.5349636673927307, + "learning_rate": 0.0002348633732669269, + "loss": 4.019, + "step": 2005 + }, + { + "epoch": 2.56576, + "grad_norm": 0.5117310285568237, + "learning_rate": 0.0002348229909812895, + "loss": 3.9889, + "step": 2006 + }, + { + "epoch": 2.56704, + "grad_norm": 0.5110161900520325, + "learning_rate": 0.00023478260869565215, + "loss": 4.0419, + "step": 2007 + }, + { + "epoch": 2.56832, + "grad_norm": 0.5449533462524414, + "learning_rate": 0.00023474222641001478, + "loss": 4.1393, + "step": 2008 + }, + { + "epoch": 2.5696, + "grad_norm": 0.5280770063400269, + "learning_rate": 0.0002347018441243774, + "loss": 4.0109, + "step": 2009 + }, + { + "epoch": 2.57088, + "grad_norm": 0.5067214369773865, + "learning_rate": 0.00023466146183874004, + "loss": 4.0415, + "step": 2010 + }, + { + "epoch": 2.5721600000000002, + "grad_norm": 0.5153365135192871, + "learning_rate": 0.0002346210795531027, + "loss": 4.0349, + "step": 2011 + }, + { + "epoch": 2.5734399999999997, + "grad_norm": 0.509453296661377, + "learning_rate": 0.00023458069726746533, + "loss": 4.0768, + "step": 2012 + }, + { + "epoch": 2.57472, + "grad_norm": 0.5531152486801147, + "learning_rate": 0.00023454031498182796, + "loss": 4.0643, + "step": 2013 + }, + { + "epoch": 2.576, + "grad_norm": 0.5219248533248901, + "learning_rate": 0.00023449993269619056, + "loss": 4.0365, + "step": 2014 + }, + { + "epoch": 2.57728, + "grad_norm": 0.5232629776000977, + "learning_rate": 0.00023445955041055322, + "loss": 4.0989, + "step": 2015 + }, + { + "epoch": 2.57856, + "grad_norm": 0.5008588433265686, + "learning_rate": 0.00023441916812491585, + "loss": 3.9987, + "step": 2016 + }, + { + "epoch": 2.57984, + "grad_norm": 0.5122629404067993, + "learning_rate": 0.00023437878583927848, + "loss": 4.0479, + "step": 2017 + }, + { + "epoch": 2.58112, + "grad_norm": 0.48956623673439026, + "learning_rate": 0.0002343384035536411, + "loss": 4.1008, + "step": 2018 + }, + { + "epoch": 2.5824, + "grad_norm": 0.5428857207298279, + "learning_rate": 0.00023429802126800374, + "loss": 4.0692, + "step": 2019 + }, + { + "epoch": 2.58368, + "grad_norm": 0.5066841840744019, + "learning_rate": 0.0002342576389823664, + "loss": 4.0896, + "step": 2020 + }, + { + "epoch": 2.58496, + "grad_norm": 0.5314661860466003, + "learning_rate": 0.00023421725669672903, + "loss": 3.9449, + "step": 2021 + }, + { + "epoch": 2.58624, + "grad_norm": 0.5035257935523987, + "learning_rate": 0.00023417687441109163, + "loss": 4.0175, + "step": 2022 + }, + { + "epoch": 2.58752, + "grad_norm": 0.4997243583202362, + "learning_rate": 0.00023413649212545426, + "loss": 4.1032, + "step": 2023 + }, + { + "epoch": 2.5888, + "grad_norm": 0.4923264980316162, + "learning_rate": 0.00023409610983981692, + "loss": 4.0363, + "step": 2024 + }, + { + "epoch": 2.59008, + "grad_norm": 0.5122047662734985, + "learning_rate": 0.00023405572755417955, + "loss": 4.0297, + "step": 2025 + }, + { + "epoch": 2.59136, + "grad_norm": 0.5043531060218811, + "learning_rate": 0.00023401534526854218, + "loss": 4.0732, + "step": 2026 + }, + { + "epoch": 2.59264, + "grad_norm": 0.5018818974494934, + "learning_rate": 0.0002339749629829048, + "loss": 4.0188, + "step": 2027 + }, + { + "epoch": 2.59392, + "grad_norm": 0.4968951940536499, + "learning_rate": 0.00023393458069726747, + "loss": 4.0574, + "step": 2028 + }, + { + "epoch": 2.5952, + "grad_norm": 0.5588295459747314, + "learning_rate": 0.0002338941984116301, + "loss": 4.0049, + "step": 2029 + }, + { + "epoch": 2.59648, + "grad_norm": 0.5243644714355469, + "learning_rate": 0.0002338538161259927, + "loss": 3.8995, + "step": 2030 + }, + { + "epoch": 2.59776, + "grad_norm": 0.5623689293861389, + "learning_rate": 0.00023381343384035533, + "loss": 4.0505, + "step": 2031 + }, + { + "epoch": 2.59904, + "grad_norm": 0.5318677425384521, + "learning_rate": 0.00023377305155471796, + "loss": 3.9892, + "step": 2032 + }, + { + "epoch": 2.60032, + "grad_norm": 0.5533915758132935, + "learning_rate": 0.00023373266926908062, + "loss": 4.0111, + "step": 2033 + }, + { + "epoch": 2.6016, + "grad_norm": 0.5214724540710449, + "learning_rate": 0.00023369228698344325, + "loss": 4.0176, + "step": 2034 + }, + { + "epoch": 2.60288, + "grad_norm": 0.550571620464325, + "learning_rate": 0.00023365190469780588, + "loss": 4.0881, + "step": 2035 + }, + { + "epoch": 2.6041600000000003, + "grad_norm": 0.5152320265769958, + "learning_rate": 0.0002336115224121685, + "loss": 4.043, + "step": 2036 + }, + { + "epoch": 2.6054399999999998, + "grad_norm": 0.5005114078521729, + "learning_rate": 0.00023357114012653116, + "loss": 3.9851, + "step": 2037 + }, + { + "epoch": 2.60672, + "grad_norm": 0.5171644687652588, + "learning_rate": 0.00023353075784089377, + "loss": 3.9649, + "step": 2038 + }, + { + "epoch": 2.608, + "grad_norm": 0.5362170338630676, + "learning_rate": 0.0002334903755552564, + "loss": 4.0048, + "step": 2039 + }, + { + "epoch": 2.60928, + "grad_norm": 0.5056260824203491, + "learning_rate": 0.00023344999326961903, + "loss": 4.056, + "step": 2040 + }, + { + "epoch": 2.61056, + "grad_norm": 0.5133628249168396, + "learning_rate": 0.00023340961098398168, + "loss": 4.0981, + "step": 2041 + }, + { + "epoch": 2.61184, + "grad_norm": 0.5374043583869934, + "learning_rate": 0.00023336922869834431, + "loss": 3.9999, + "step": 2042 + }, + { + "epoch": 2.61312, + "grad_norm": 0.4886053800582886, + "learning_rate": 0.00023332884641270694, + "loss": 4.1064, + "step": 2043 + }, + { + "epoch": 2.6144, + "grad_norm": 0.5053584575653076, + "learning_rate": 0.00023328846412706957, + "loss": 3.9454, + "step": 2044 + }, + { + "epoch": 2.6156800000000002, + "grad_norm": 0.4969666004180908, + "learning_rate": 0.00023324808184143218, + "loss": 4.146, + "step": 2045 + }, + { + "epoch": 2.6169599999999997, + "grad_norm": 0.5076320767402649, + "learning_rate": 0.00023320769955579483, + "loss": 4.0381, + "step": 2046 + }, + { + "epoch": 2.61824, + "grad_norm": 0.4875280261039734, + "learning_rate": 0.00023316731727015746, + "loss": 4.0423, + "step": 2047 + }, + { + "epoch": 2.61952, + "grad_norm": 0.5112111568450928, + "learning_rate": 0.0002331269349845201, + "loss": 4.0637, + "step": 2048 + }, + { + "epoch": 2.6208, + "grad_norm": 0.5074511170387268, + "learning_rate": 0.00023308655269888272, + "loss": 3.9859, + "step": 2049 + }, + { + "epoch": 2.62208, + "grad_norm": 0.5409026145935059, + "learning_rate": 0.00023304617041324538, + "loss": 4.1341, + "step": 2050 + }, + { + "epoch": 2.62336, + "grad_norm": 0.5368846654891968, + "learning_rate": 0.000233005788127608, + "loss": 3.9908, + "step": 2051 + }, + { + "epoch": 2.62464, + "grad_norm": 0.5240885615348816, + "learning_rate": 0.00023296540584197064, + "loss": 4.0777, + "step": 2052 + }, + { + "epoch": 2.62592, + "grad_norm": 0.5103102326393127, + "learning_rate": 0.00023292502355633325, + "loss": 4.0188, + "step": 2053 + }, + { + "epoch": 2.6272, + "grad_norm": 0.5250452756881714, + "learning_rate": 0.0002328846412706959, + "loss": 4.0489, + "step": 2054 + }, + { + "epoch": 2.62848, + "grad_norm": 0.5160833597183228, + "learning_rate": 0.00023284425898505853, + "loss": 4.0773, + "step": 2055 + }, + { + "epoch": 2.62976, + "grad_norm": 0.5143999457359314, + "learning_rate": 0.00023280387669942116, + "loss": 4.0191, + "step": 2056 + }, + { + "epoch": 2.63104, + "grad_norm": 0.5295541286468506, + "learning_rate": 0.0002327634944137838, + "loss": 4.0769, + "step": 2057 + }, + { + "epoch": 2.63232, + "grad_norm": 0.5218490958213806, + "learning_rate": 0.00023272311212814642, + "loss": 3.9913, + "step": 2058 + }, + { + "epoch": 2.6336, + "grad_norm": 0.488608717918396, + "learning_rate": 0.00023268272984250908, + "loss": 4.0126, + "step": 2059 + }, + { + "epoch": 2.63488, + "grad_norm": 0.5450034141540527, + "learning_rate": 0.0002326423475568717, + "loss": 4.0593, + "step": 2060 + }, + { + "epoch": 2.63616, + "grad_norm": 0.4820656478404999, + "learning_rate": 0.0002326019652712343, + "loss": 4.0532, + "step": 2061 + }, + { + "epoch": 2.63744, + "grad_norm": 0.5224117636680603, + "learning_rate": 0.00023256158298559694, + "loss": 4.0053, + "step": 2062 + }, + { + "epoch": 2.63872, + "grad_norm": 0.4709533154964447, + "learning_rate": 0.0002325212006999596, + "loss": 3.9811, + "step": 2063 + }, + { + "epoch": 2.64, + "grad_norm": 0.5281003713607788, + "learning_rate": 0.00023248081841432223, + "loss": 4.0737, + "step": 2064 + }, + { + "epoch": 2.64128, + "grad_norm": 0.49592503905296326, + "learning_rate": 0.00023244043612868486, + "loss": 4.0537, + "step": 2065 + }, + { + "epoch": 2.64256, + "grad_norm": 0.5004292726516724, + "learning_rate": 0.0002324000538430475, + "loss": 4.0797, + "step": 2066 + }, + { + "epoch": 2.64384, + "grad_norm": 0.5057252049446106, + "learning_rate": 0.00023235967155741015, + "loss": 4.0881, + "step": 2067 + }, + { + "epoch": 2.64512, + "grad_norm": 0.511310875415802, + "learning_rate": 0.00023231928927177278, + "loss": 4.0383, + "step": 2068 + }, + { + "epoch": 2.6464, + "grad_norm": 0.5321905016899109, + "learning_rate": 0.00023227890698613538, + "loss": 4.0814, + "step": 2069 + }, + { + "epoch": 2.6476800000000003, + "grad_norm": 0.5654548406600952, + "learning_rate": 0.000232238524700498, + "loss": 4.1224, + "step": 2070 + }, + { + "epoch": 2.6489599999999998, + "grad_norm": 0.527220606803894, + "learning_rate": 0.00023219814241486067, + "loss": 4.0622, + "step": 2071 + }, + { + "epoch": 2.65024, + "grad_norm": 0.5580291152000427, + "learning_rate": 0.0002321577601292233, + "loss": 4.0797, + "step": 2072 + }, + { + "epoch": 2.65152, + "grad_norm": 0.545524001121521, + "learning_rate": 0.00023211737784358593, + "loss": 4.0666, + "step": 2073 + }, + { + "epoch": 2.6528, + "grad_norm": 0.5084550380706787, + "learning_rate": 0.00023207699555794856, + "loss": 4.0051, + "step": 2074 + }, + { + "epoch": 2.65408, + "grad_norm": 0.5336723923683167, + "learning_rate": 0.0002320366132723112, + "loss": 4.0471, + "step": 2075 + }, + { + "epoch": 2.65536, + "grad_norm": 0.5522205829620361, + "learning_rate": 0.00023199623098667385, + "loss": 4.0846, + "step": 2076 + }, + { + "epoch": 2.65664, + "grad_norm": 0.5465154051780701, + "learning_rate": 0.00023195584870103645, + "loss": 3.9559, + "step": 2077 + }, + { + "epoch": 2.65792, + "grad_norm": 0.5187036395072937, + "learning_rate": 0.00023191546641539908, + "loss": 3.9902, + "step": 2078 + }, + { + "epoch": 2.6592000000000002, + "grad_norm": 0.5408727526664734, + "learning_rate": 0.0002318750841297617, + "loss": 4.0133, + "step": 2079 + }, + { + "epoch": 2.6604799999999997, + "grad_norm": 0.5757235288619995, + "learning_rate": 0.00023183470184412437, + "loss": 4.0698, + "step": 2080 + }, + { + "epoch": 2.66176, + "grad_norm": 0.5221036076545715, + "learning_rate": 0.000231794319558487, + "loss": 4.1101, + "step": 2081 + }, + { + "epoch": 2.66304, + "grad_norm": 0.5259095430374146, + "learning_rate": 0.00023175393727284963, + "loss": 3.9804, + "step": 2082 + }, + { + "epoch": 2.66432, + "grad_norm": 0.520976185798645, + "learning_rate": 0.00023171355498721226, + "loss": 4.02, + "step": 2083 + }, + { + "epoch": 2.6656, + "grad_norm": 0.5386399626731873, + "learning_rate": 0.0002316731727015749, + "loss": 4.1377, + "step": 2084 + }, + { + "epoch": 2.66688, + "grad_norm": 0.5121170282363892, + "learning_rate": 0.00023163279041593754, + "loss": 4.0627, + "step": 2085 + }, + { + "epoch": 2.66816, + "grad_norm": 0.5606420636177063, + "learning_rate": 0.00023159240813030015, + "loss": 4.0281, + "step": 2086 + }, + { + "epoch": 2.66944, + "grad_norm": 0.5451529622077942, + "learning_rate": 0.00023155202584466278, + "loss": 3.9724, + "step": 2087 + }, + { + "epoch": 2.67072, + "grad_norm": 0.5385946035385132, + "learning_rate": 0.0002315116435590254, + "loss": 4.0462, + "step": 2088 + }, + { + "epoch": 2.672, + "grad_norm": 0.5405113101005554, + "learning_rate": 0.00023147126127338806, + "loss": 3.9622, + "step": 2089 + }, + { + "epoch": 2.67328, + "grad_norm": 0.5446247458457947, + "learning_rate": 0.0002314308789877507, + "loss": 4.0747, + "step": 2090 + }, + { + "epoch": 2.67456, + "grad_norm": 0.5345394611358643, + "learning_rate": 0.00023139049670211332, + "loss": 3.9045, + "step": 2091 + }, + { + "epoch": 2.67584, + "grad_norm": 0.5287266969680786, + "learning_rate": 0.00023135011441647595, + "loss": 4.0296, + "step": 2092 + }, + { + "epoch": 2.67712, + "grad_norm": 0.4985159933567047, + "learning_rate": 0.0002313097321308386, + "loss": 4.0478, + "step": 2093 + }, + { + "epoch": 2.6784, + "grad_norm": 0.5355061292648315, + "learning_rate": 0.00023126934984520121, + "loss": 4.0843, + "step": 2094 + }, + { + "epoch": 2.67968, + "grad_norm": 0.4830259382724762, + "learning_rate": 0.00023122896755956384, + "loss": 4.0583, + "step": 2095 + }, + { + "epoch": 2.68096, + "grad_norm": 0.5162714719772339, + "learning_rate": 0.00023118858527392647, + "loss": 3.967, + "step": 2096 + }, + { + "epoch": 2.68224, + "grad_norm": 0.5130695700645447, + "learning_rate": 0.00023114820298828913, + "loss": 4.0423, + "step": 2097 + }, + { + "epoch": 2.68352, + "grad_norm": 0.507447361946106, + "learning_rate": 0.00023110782070265176, + "loss": 4.0212, + "step": 2098 + }, + { + "epoch": 2.6848, + "grad_norm": 0.6723561882972717, + "learning_rate": 0.0002310674384170144, + "loss": 4.0209, + "step": 2099 + }, + { + "epoch": 2.68608, + "grad_norm": 0.49744725227355957, + "learning_rate": 0.00023102705613137702, + "loss": 3.9865, + "step": 2100 + }, + { + "epoch": 2.68736, + "grad_norm": 0.5632631778717041, + "learning_rate": 0.00023098667384573962, + "loss": 4.0426, + "step": 2101 + }, + { + "epoch": 2.68864, + "grad_norm": 0.5552526712417603, + "learning_rate": 0.00023094629156010228, + "loss": 4.0527, + "step": 2102 + }, + { + "epoch": 2.68992, + "grad_norm": 0.5476700663566589, + "learning_rate": 0.0002309059092744649, + "loss": 4.0647, + "step": 2103 + }, + { + "epoch": 2.6912000000000003, + "grad_norm": 0.545758068561554, + "learning_rate": 0.00023086552698882754, + "loss": 4.0247, + "step": 2104 + }, + { + "epoch": 2.6924799999999998, + "grad_norm": 0.5172504186630249, + "learning_rate": 0.00023082514470319017, + "loss": 4.0629, + "step": 2105 + }, + { + "epoch": 2.69376, + "grad_norm": 0.5142618417739868, + "learning_rate": 0.00023078476241755283, + "loss": 3.9725, + "step": 2106 + }, + { + "epoch": 2.69504, + "grad_norm": 0.49800601601600647, + "learning_rate": 0.00023074438013191546, + "loss": 3.9891, + "step": 2107 + }, + { + "epoch": 2.69632, + "grad_norm": 0.5199533700942993, + "learning_rate": 0.0002307039978462781, + "loss": 4.0479, + "step": 2108 + }, + { + "epoch": 2.6976, + "grad_norm": 0.4997861981391907, + "learning_rate": 0.0002306636155606407, + "loss": 3.9831, + "step": 2109 + }, + { + "epoch": 2.69888, + "grad_norm": 0.5149181485176086, + "learning_rate": 0.00023062323327500335, + "loss": 4.0014, + "step": 2110 + }, + { + "epoch": 2.70016, + "grad_norm": 0.5101584792137146, + "learning_rate": 0.00023058285098936598, + "loss": 4.0213, + "step": 2111 + }, + { + "epoch": 2.70144, + "grad_norm": 0.4832100570201874, + "learning_rate": 0.0002305424687037286, + "loss": 4.0012, + "step": 2112 + }, + { + "epoch": 2.7027200000000002, + "grad_norm": 0.527621328830719, + "learning_rate": 0.00023050208641809124, + "loss": 4.0418, + "step": 2113 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 0.4823199510574341, + "learning_rate": 0.00023046170413245387, + "loss": 4.0175, + "step": 2114 + }, + { + "epoch": 2.70528, + "grad_norm": 0.5674161911010742, + "learning_rate": 0.00023042132184681653, + "loss": 4.0703, + "step": 2115 + }, + { + "epoch": 2.70656, + "grad_norm": 0.5498639345169067, + "learning_rate": 0.00023038093956117916, + "loss": 4.0389, + "step": 2116 + }, + { + "epoch": 2.70784, + "grad_norm": 0.5504740476608276, + "learning_rate": 0.00023034055727554176, + "loss": 4.0138, + "step": 2117 + }, + { + "epoch": 2.70912, + "grad_norm": 0.5479692816734314, + "learning_rate": 0.0002303001749899044, + "loss": 4.0722, + "step": 2118 + }, + { + "epoch": 2.7104, + "grad_norm": 0.4745061993598938, + "learning_rate": 0.00023025979270426705, + "loss": 4.0672, + "step": 2119 + }, + { + "epoch": 2.71168, + "grad_norm": 0.5192096829414368, + "learning_rate": 0.00023021941041862968, + "loss": 3.9691, + "step": 2120 + }, + { + "epoch": 2.71296, + "grad_norm": 0.5102022290229797, + "learning_rate": 0.0002301790281329923, + "loss": 4.0599, + "step": 2121 + }, + { + "epoch": 2.71424, + "grad_norm": 0.49399974942207336, + "learning_rate": 0.00023013864584735494, + "loss": 3.9948, + "step": 2122 + }, + { + "epoch": 2.71552, + "grad_norm": 0.5214704871177673, + "learning_rate": 0.0002300982635617176, + "loss": 4.0376, + "step": 2123 + }, + { + "epoch": 2.7168, + "grad_norm": 0.5198361873626709, + "learning_rate": 0.00023005788127608022, + "loss": 4.0227, + "step": 2124 + }, + { + "epoch": 2.71808, + "grad_norm": 0.5009457468986511, + "learning_rate": 0.00023001749899044283, + "loss": 4.0152, + "step": 2125 + }, + { + "epoch": 2.71936, + "grad_norm": 0.5245611667633057, + "learning_rate": 0.00022997711670480546, + "loss": 4.056, + "step": 2126 + }, + { + "epoch": 2.72064, + "grad_norm": 0.4951135814189911, + "learning_rate": 0.0002299367344191681, + "loss": 4.0599, + "step": 2127 + }, + { + "epoch": 2.72192, + "grad_norm": 0.5234236717224121, + "learning_rate": 0.00022989635213353074, + "loss": 4.0412, + "step": 2128 + }, + { + "epoch": 2.7232, + "grad_norm": 0.5335028171539307, + "learning_rate": 0.00022985596984789337, + "loss": 3.9789, + "step": 2129 + }, + { + "epoch": 2.72448, + "grad_norm": 0.527600109577179, + "learning_rate": 0.000229815587562256, + "loss": 4.0591, + "step": 2130 + }, + { + "epoch": 2.72576, + "grad_norm": 0.5804891586303711, + "learning_rate": 0.00022977520527661864, + "loss": 4.029, + "step": 2131 + }, + { + "epoch": 2.72704, + "grad_norm": 0.5268876552581787, + "learning_rate": 0.0002297348229909813, + "loss": 3.9655, + "step": 2132 + }, + { + "epoch": 2.72832, + "grad_norm": 0.5200521349906921, + "learning_rate": 0.0002296944407053439, + "loss": 3.9752, + "step": 2133 + }, + { + "epoch": 2.7296, + "grad_norm": 0.5403656363487244, + "learning_rate": 0.00022965405841970653, + "loss": 4.0228, + "step": 2134 + }, + { + "epoch": 2.73088, + "grad_norm": 0.528051495552063, + "learning_rate": 0.00022961367613406916, + "loss": 3.9931, + "step": 2135 + }, + { + "epoch": 2.73216, + "grad_norm": 0.5134397745132446, + "learning_rate": 0.0002295732938484318, + "loss": 4.0045, + "step": 2136 + }, + { + "epoch": 2.73344, + "grad_norm": 0.5310769081115723, + "learning_rate": 0.00022953291156279444, + "loss": 4.0, + "step": 2137 + }, + { + "epoch": 2.7347200000000003, + "grad_norm": 0.5168265700340271, + "learning_rate": 0.00022949252927715707, + "loss": 4.0438, + "step": 2138 + }, + { + "epoch": 2.7359999999999998, + "grad_norm": 0.5118173360824585, + "learning_rate": 0.0002294521469915197, + "loss": 4.0294, + "step": 2139 + }, + { + "epoch": 2.73728, + "grad_norm": 0.5167641043663025, + "learning_rate": 0.0002294117647058823, + "loss": 3.9984, + "step": 2140 + }, + { + "epoch": 2.73856, + "grad_norm": 0.48330435156822205, + "learning_rate": 0.00022937138242024496, + "loss": 4.076, + "step": 2141 + }, + { + "epoch": 2.73984, + "grad_norm": 0.5215317606925964, + "learning_rate": 0.0002293310001346076, + "loss": 4.0358, + "step": 2142 + }, + { + "epoch": 2.74112, + "grad_norm": 0.5229592323303223, + "learning_rate": 0.00022929061784897022, + "loss": 3.9641, + "step": 2143 + }, + { + "epoch": 2.7424, + "grad_norm": 0.49107488989830017, + "learning_rate": 0.00022925023556333285, + "loss": 3.9869, + "step": 2144 + }, + { + "epoch": 2.74368, + "grad_norm": 0.5166721343994141, + "learning_rate": 0.0002292098532776955, + "loss": 3.9876, + "step": 2145 + }, + { + "epoch": 2.74496, + "grad_norm": 0.5030099749565125, + "learning_rate": 0.00022916947099205814, + "loss": 3.9987, + "step": 2146 + }, + { + "epoch": 2.7462400000000002, + "grad_norm": 0.5043811202049255, + "learning_rate": 0.00022912908870642077, + "loss": 4.0329, + "step": 2147 + }, + { + "epoch": 2.7475199999999997, + "grad_norm": 0.5001031756401062, + "learning_rate": 0.00022908870642078337, + "loss": 4.0079, + "step": 2148 + }, + { + "epoch": 2.7488, + "grad_norm": 0.5411626100540161, + "learning_rate": 0.00022904832413514603, + "loss": 4.0499, + "step": 2149 + }, + { + "epoch": 2.75008, + "grad_norm": 0.529230535030365, + "learning_rate": 0.00022900794184950866, + "loss": 4.0719, + "step": 2150 + }, + { + "epoch": 2.75136, + "grad_norm": 0.5309909582138062, + "learning_rate": 0.0002289675595638713, + "loss": 3.9798, + "step": 2151 + }, + { + "epoch": 2.75264, + "grad_norm": 0.5642079710960388, + "learning_rate": 0.00022892717727823392, + "loss": 4.0106, + "step": 2152 + }, + { + "epoch": 2.75392, + "grad_norm": 0.5178594589233398, + "learning_rate": 0.00022888679499259655, + "loss": 4.057, + "step": 2153 + }, + { + "epoch": 2.7552, + "grad_norm": 0.5369943380355835, + "learning_rate": 0.0002288464127069592, + "loss": 3.996, + "step": 2154 + }, + { + "epoch": 2.75648, + "grad_norm": 0.5157840251922607, + "learning_rate": 0.00022880603042132184, + "loss": 4.038, + "step": 2155 + }, + { + "epoch": 2.75776, + "grad_norm": 0.5400508642196655, + "learning_rate": 0.00022876564813568444, + "loss": 3.9986, + "step": 2156 + }, + { + "epoch": 2.75904, + "grad_norm": 0.49471211433410645, + "learning_rate": 0.00022872526585004707, + "loss": 4.0016, + "step": 2157 + }, + { + "epoch": 2.76032, + "grad_norm": 0.5369953513145447, + "learning_rate": 0.00022868488356440973, + "loss": 4.0411, + "step": 2158 + }, + { + "epoch": 2.7616, + "grad_norm": 0.5281562209129333, + "learning_rate": 0.00022864450127877236, + "loss": 4.0303, + "step": 2159 + }, + { + "epoch": 2.76288, + "grad_norm": 0.5228774547576904, + "learning_rate": 0.000228604118993135, + "loss": 4.0643, + "step": 2160 + }, + { + "epoch": 2.76416, + "grad_norm": 0.5000044107437134, + "learning_rate": 0.00022856373670749762, + "loss": 4.0538, + "step": 2161 + }, + { + "epoch": 2.76544, + "grad_norm": 0.505082905292511, + "learning_rate": 0.00022852335442186028, + "loss": 4.0106, + "step": 2162 + }, + { + "epoch": 2.76672, + "grad_norm": 0.5144949555397034, + "learning_rate": 0.0002284829721362229, + "loss": 3.9746, + "step": 2163 + }, + { + "epoch": 2.768, + "grad_norm": 0.5088104605674744, + "learning_rate": 0.0002284425898505855, + "loss": 3.9718, + "step": 2164 + }, + { + "epoch": 2.76928, + "grad_norm": 0.5328196287155151, + "learning_rate": 0.00022840220756494814, + "loss": 4.0054, + "step": 2165 + }, + { + "epoch": 2.77056, + "grad_norm": 0.5345327854156494, + "learning_rate": 0.00022836182527931077, + "loss": 4.0231, + "step": 2166 + }, + { + "epoch": 2.77184, + "grad_norm": 0.49523457884788513, + "learning_rate": 0.00022832144299367343, + "loss": 3.9831, + "step": 2167 + }, + { + "epoch": 2.77312, + "grad_norm": 0.5508283376693726, + "learning_rate": 0.00022828106070803606, + "loss": 3.9685, + "step": 2168 + }, + { + "epoch": 2.7744, + "grad_norm": 0.5366639494895935, + "learning_rate": 0.00022824067842239869, + "loss": 4.0006, + "step": 2169 + }, + { + "epoch": 2.77568, + "grad_norm": 0.5522692799568176, + "learning_rate": 0.00022820029613676132, + "loss": 3.9667, + "step": 2170 + }, + { + "epoch": 2.77696, + "grad_norm": 0.4968757927417755, + "learning_rate": 0.00022815991385112397, + "loss": 4.0107, + "step": 2171 + }, + { + "epoch": 2.7782400000000003, + "grad_norm": 0.5238058567047119, + "learning_rate": 0.0002281195315654866, + "loss": 3.9716, + "step": 2172 + }, + { + "epoch": 2.7795199999999998, + "grad_norm": 0.5009492039680481, + "learning_rate": 0.0002280791492798492, + "loss": 4.0209, + "step": 2173 + }, + { + "epoch": 2.7808, + "grad_norm": 0.5323073863983154, + "learning_rate": 0.00022803876699421184, + "loss": 4.0487, + "step": 2174 + }, + { + "epoch": 2.78208, + "grad_norm": 0.5168647170066833, + "learning_rate": 0.0002279983847085745, + "loss": 4.0385, + "step": 2175 + }, + { + "epoch": 2.78336, + "grad_norm": 0.5620954632759094, + "learning_rate": 0.00022795800242293712, + "loss": 4.0084, + "step": 2176 + }, + { + "epoch": 2.78464, + "grad_norm": 0.5150801539421082, + "learning_rate": 0.00022791762013729975, + "loss": 3.9903, + "step": 2177 + }, + { + "epoch": 2.78592, + "grad_norm": 0.5225955843925476, + "learning_rate": 0.00022787723785166238, + "loss": 3.9493, + "step": 2178 + }, + { + "epoch": 2.7872, + "grad_norm": 0.5136468410491943, + "learning_rate": 0.000227836855566025, + "loss": 4.0297, + "step": 2179 + }, + { + "epoch": 2.78848, + "grad_norm": 0.5070092082023621, + "learning_rate": 0.00022779647328038767, + "loss": 3.9916, + "step": 2180 + }, + { + "epoch": 2.7897600000000002, + "grad_norm": 0.48506057262420654, + "learning_rate": 0.00022775609099475027, + "loss": 4.049, + "step": 2181 + }, + { + "epoch": 2.7910399999999997, + "grad_norm": 0.5056405067443848, + "learning_rate": 0.0002277157087091129, + "loss": 4.0344, + "step": 2182 + }, + { + "epoch": 2.79232, + "grad_norm": 0.5065452456474304, + "learning_rate": 0.00022767532642347553, + "loss": 4.0575, + "step": 2183 + }, + { + "epoch": 2.7936, + "grad_norm": 0.4931611716747284, + "learning_rate": 0.0002276349441378382, + "loss": 3.9662, + "step": 2184 + }, + { + "epoch": 2.79488, + "grad_norm": 0.5401006937026978, + "learning_rate": 0.00022759456185220082, + "loss": 3.9968, + "step": 2185 + }, + { + "epoch": 2.79616, + "grad_norm": 0.48308640718460083, + "learning_rate": 0.00022755417956656345, + "loss": 3.9867, + "step": 2186 + }, + { + "epoch": 2.79744, + "grad_norm": 0.5414093732833862, + "learning_rate": 0.00022751379728092608, + "loss": 3.938, + "step": 2187 + }, + { + "epoch": 2.79872, + "grad_norm": 0.47990182042121887, + "learning_rate": 0.00022747341499528874, + "loss": 4.1135, + "step": 2188 + }, + { + "epoch": 2.8, + "grad_norm": 0.5447584986686707, + "learning_rate": 0.00022743303270965134, + "loss": 4.1674, + "step": 2189 + }, + { + "epoch": 2.80128, + "grad_norm": 0.4829598665237427, + "learning_rate": 0.00022739265042401397, + "loss": 4.0011, + "step": 2190 + }, + { + "epoch": 2.80256, + "grad_norm": 0.5174542665481567, + "learning_rate": 0.0002273522681383766, + "loss": 4.0109, + "step": 2191 + }, + { + "epoch": 2.80384, + "grad_norm": 0.5266432166099548, + "learning_rate": 0.00022731188585273926, + "loss": 3.9463, + "step": 2192 + }, + { + "epoch": 2.80512, + "grad_norm": 0.5114743709564209, + "learning_rate": 0.0002272715035671019, + "loss": 4.0119, + "step": 2193 + }, + { + "epoch": 2.8064, + "grad_norm": 0.5330787301063538, + "learning_rate": 0.00022723112128146452, + "loss": 4.0184, + "step": 2194 + }, + { + "epoch": 2.80768, + "grad_norm": 0.49550044536590576, + "learning_rate": 0.00022719073899582715, + "loss": 4.0103, + "step": 2195 + }, + { + "epoch": 2.80896, + "grad_norm": 0.5248696804046631, + "learning_rate": 0.00022715035671018975, + "loss": 4.0211, + "step": 2196 + }, + { + "epoch": 2.81024, + "grad_norm": 0.528124213218689, + "learning_rate": 0.0002271099744245524, + "loss": 3.9816, + "step": 2197 + }, + { + "epoch": 2.81152, + "grad_norm": 0.498807817697525, + "learning_rate": 0.00022706959213891504, + "loss": 3.9876, + "step": 2198 + }, + { + "epoch": 2.8128, + "grad_norm": 0.5002254247665405, + "learning_rate": 0.00022702920985327767, + "loss": 3.9442, + "step": 2199 + }, + { + "epoch": 2.81408, + "grad_norm": 0.5136492848396301, + "learning_rate": 0.0002269888275676403, + "loss": 4.0369, + "step": 2200 + }, + { + "epoch": 2.81536, + "grad_norm": 0.5032553672790527, + "learning_rate": 0.00022694844528200296, + "loss": 4.0838, + "step": 2201 + }, + { + "epoch": 2.81664, + "grad_norm": 0.4921877086162567, + "learning_rate": 0.0002269080629963656, + "loss": 4.0399, + "step": 2202 + }, + { + "epoch": 2.81792, + "grad_norm": 0.48802658915519714, + "learning_rate": 0.00022686768071072822, + "loss": 3.9631, + "step": 2203 + }, + { + "epoch": 2.8192, + "grad_norm": 0.5272952914237976, + "learning_rate": 0.00022682729842509082, + "loss": 4.0163, + "step": 2204 + }, + { + "epoch": 2.82048, + "grad_norm": 0.5049785375595093, + "learning_rate": 0.00022678691613945348, + "loss": 3.9296, + "step": 2205 + }, + { + "epoch": 2.8217600000000003, + "grad_norm": 0.512064516544342, + "learning_rate": 0.0002267465338538161, + "loss": 3.9826, + "step": 2206 + }, + { + "epoch": 2.8230399999999998, + "grad_norm": 0.49538886547088623, + "learning_rate": 0.00022670615156817874, + "loss": 3.9296, + "step": 2207 + }, + { + "epoch": 2.82432, + "grad_norm": 0.5314378142356873, + "learning_rate": 0.00022666576928254137, + "loss": 4.0114, + "step": 2208 + }, + { + "epoch": 2.8256, + "grad_norm": 0.5182934403419495, + "learning_rate": 0.000226625386996904, + "loss": 4.0499, + "step": 2209 + }, + { + "epoch": 2.82688, + "grad_norm": 0.5282546877861023, + "learning_rate": 0.00022658500471126665, + "loss": 4.0127, + "step": 2210 + }, + { + "epoch": 2.82816, + "grad_norm": 0.49509358406066895, + "learning_rate": 0.00022654462242562929, + "loss": 3.9841, + "step": 2211 + }, + { + "epoch": 2.82944, + "grad_norm": 0.5452840328216553, + "learning_rate": 0.0002265042401399919, + "loss": 4.0112, + "step": 2212 + }, + { + "epoch": 2.83072, + "grad_norm": 0.5101568102836609, + "learning_rate": 0.00022646385785435452, + "loss": 3.9766, + "step": 2213 + }, + { + "epoch": 2.832, + "grad_norm": 0.5204622149467468, + "learning_rate": 0.00022642347556871718, + "loss": 4.0391, + "step": 2214 + }, + { + "epoch": 2.8332800000000002, + "grad_norm": 0.49549055099487305, + "learning_rate": 0.0002263830932830798, + "loss": 3.9876, + "step": 2215 + }, + { + "epoch": 2.8345599999999997, + "grad_norm": 0.5057578682899475, + "learning_rate": 0.00022634271099744244, + "loss": 4.0252, + "step": 2216 + }, + { + "epoch": 2.83584, + "grad_norm": 0.5117456912994385, + "learning_rate": 0.00022630232871180507, + "loss": 4.0438, + "step": 2217 + }, + { + "epoch": 2.83712, + "grad_norm": 0.49402594566345215, + "learning_rate": 0.00022626194642616772, + "loss": 4.0064, + "step": 2218 + }, + { + "epoch": 2.8384, + "grad_norm": 0.49484384059906006, + "learning_rate": 0.00022622156414053035, + "loss": 3.9281, + "step": 2219 + }, + { + "epoch": 2.83968, + "grad_norm": 0.49207979440689087, + "learning_rate": 0.00022618118185489296, + "loss": 4.0203, + "step": 2220 + }, + { + "epoch": 2.84096, + "grad_norm": 0.5029460191726685, + "learning_rate": 0.00022614079956925559, + "loss": 4.0214, + "step": 2221 + }, + { + "epoch": 2.84224, + "grad_norm": 0.49335840344429016, + "learning_rate": 0.00022610041728361822, + "loss": 4.0127, + "step": 2222 + }, + { + "epoch": 2.84352, + "grad_norm": 0.5236724019050598, + "learning_rate": 0.00022606003499798087, + "loss": 4.0602, + "step": 2223 + }, + { + "epoch": 2.8448, + "grad_norm": 0.5051988363265991, + "learning_rate": 0.0002260196527123435, + "loss": 3.9739, + "step": 2224 + }, + { + "epoch": 2.84608, + "grad_norm": 0.48310649394989014, + "learning_rate": 0.00022597927042670613, + "loss": 4.0336, + "step": 2225 + }, + { + "epoch": 2.84736, + "grad_norm": 0.49196314811706543, + "learning_rate": 0.00022593888814106876, + "loss": 4.068, + "step": 2226 + }, + { + "epoch": 2.84864, + "grad_norm": 0.48503825068473816, + "learning_rate": 0.00022589850585543142, + "loss": 4.0305, + "step": 2227 + }, + { + "epoch": 2.84992, + "grad_norm": 0.5028344988822937, + "learning_rate": 0.00022585812356979402, + "loss": 3.9993, + "step": 2228 + }, + { + "epoch": 2.8512, + "grad_norm": 0.4741714298725128, + "learning_rate": 0.00022581774128415665, + "loss": 4.0975, + "step": 2229 + }, + { + "epoch": 2.85248, + "grad_norm": 0.5247095227241516, + "learning_rate": 0.00022577735899851928, + "loss": 4.0058, + "step": 2230 + }, + { + "epoch": 2.85376, + "grad_norm": 0.5030754804611206, + "learning_rate": 0.00022573697671288194, + "loss": 3.968, + "step": 2231 + }, + { + "epoch": 2.85504, + "grad_norm": 0.5085668563842773, + "learning_rate": 0.00022569659442724457, + "loss": 4.0359, + "step": 2232 + }, + { + "epoch": 2.85632, + "grad_norm": 0.5207344889640808, + "learning_rate": 0.0002256562121416072, + "loss": 3.9844, + "step": 2233 + }, + { + "epoch": 2.8576, + "grad_norm": 0.5112427473068237, + "learning_rate": 0.00022561582985596983, + "loss": 4.0718, + "step": 2234 + }, + { + "epoch": 2.85888, + "grad_norm": 0.49677979946136475, + "learning_rate": 0.00022557544757033243, + "loss": 3.9815, + "step": 2235 + }, + { + "epoch": 2.86016, + "grad_norm": 0.5005208253860474, + "learning_rate": 0.0002255350652846951, + "loss": 4.0531, + "step": 2236 + }, + { + "epoch": 2.86144, + "grad_norm": 0.4932452142238617, + "learning_rate": 0.00022549468299905772, + "loss": 4.0379, + "step": 2237 + }, + { + "epoch": 2.86272, + "grad_norm": 0.510642945766449, + "learning_rate": 0.00022545430071342035, + "loss": 4.0408, + "step": 2238 + }, + { + "epoch": 2.864, + "grad_norm": 0.49646908044815063, + "learning_rate": 0.00022541391842778298, + "loss": 4.0044, + "step": 2239 + }, + { + "epoch": 2.8652800000000003, + "grad_norm": 0.5001757144927979, + "learning_rate": 0.00022537353614214564, + "loss": 4.0446, + "step": 2240 + }, + { + "epoch": 2.8665599999999998, + "grad_norm": 0.5238614678382874, + "learning_rate": 0.00022533315385650827, + "loss": 3.9808, + "step": 2241 + }, + { + "epoch": 2.86784, + "grad_norm": 0.5164071917533875, + "learning_rate": 0.0002252927715708709, + "loss": 4.0207, + "step": 2242 + }, + { + "epoch": 2.86912, + "grad_norm": 0.5213832855224609, + "learning_rate": 0.0002252523892852335, + "loss": 4.0449, + "step": 2243 + }, + { + "epoch": 2.8704, + "grad_norm": 0.4973851144313812, + "learning_rate": 0.00022521200699959619, + "loss": 3.9746, + "step": 2244 + }, + { + "epoch": 2.87168, + "grad_norm": 0.479270875453949, + "learning_rate": 0.0002251716247139588, + "loss": 3.9801, + "step": 2245 + }, + { + "epoch": 2.87296, + "grad_norm": 0.48888400197029114, + "learning_rate": 0.00022513124242832142, + "loss": 4.0178, + "step": 2246 + }, + { + "epoch": 2.87424, + "grad_norm": 0.48972752690315247, + "learning_rate": 0.00022509086014268405, + "loss": 4.0051, + "step": 2247 + }, + { + "epoch": 2.87552, + "grad_norm": 0.4787149429321289, + "learning_rate": 0.00022505047785704668, + "loss": 3.8699, + "step": 2248 + }, + { + "epoch": 2.8768000000000002, + "grad_norm": 0.49900394678115845, + "learning_rate": 0.00022501009557140934, + "loss": 3.9537, + "step": 2249 + }, + { + "epoch": 2.8780799999999997, + "grad_norm": 0.48665642738342285, + "learning_rate": 0.00022496971328577197, + "loss": 3.9841, + "step": 2250 + }, + { + "epoch": 2.87936, + "grad_norm": 0.48270756006240845, + "learning_rate": 0.00022492933100013457, + "loss": 4.0203, + "step": 2251 + }, + { + "epoch": 2.88064, + "grad_norm": 0.48800358176231384, + "learning_rate": 0.0002248889487144972, + "loss": 4.0187, + "step": 2252 + }, + { + "epoch": 2.88192, + "grad_norm": 0.5024043321609497, + "learning_rate": 0.00022484856642885986, + "loss": 3.968, + "step": 2253 + }, + { + "epoch": 2.8832, + "grad_norm": 0.5062508583068848, + "learning_rate": 0.0002248081841432225, + "loss": 4.0056, + "step": 2254 + }, + { + "epoch": 2.88448, + "grad_norm": 0.4831932485103607, + "learning_rate": 0.00022476780185758512, + "loss": 4.0342, + "step": 2255 + }, + { + "epoch": 2.88576, + "grad_norm": 0.4784030020236969, + "learning_rate": 0.00022472741957194775, + "loss": 4.0064, + "step": 2256 + }, + { + "epoch": 2.88704, + "grad_norm": 0.49033915996551514, + "learning_rate": 0.0002246870372863104, + "loss": 4.0344, + "step": 2257 + }, + { + "epoch": 2.88832, + "grad_norm": 0.505535364151001, + "learning_rate": 0.00022464665500067303, + "loss": 3.9952, + "step": 2258 + }, + { + "epoch": 2.8895999999999997, + "grad_norm": 0.5267712473869324, + "learning_rate": 0.00022460627271503566, + "loss": 3.9862, + "step": 2259 + }, + { + "epoch": 2.89088, + "grad_norm": 0.5031368136405945, + "learning_rate": 0.00022456589042939827, + "loss": 4.0297, + "step": 2260 + }, + { + "epoch": 2.89216, + "grad_norm": 0.5366697907447815, + "learning_rate": 0.0002245255081437609, + "loss": 4.037, + "step": 2261 + }, + { + "epoch": 2.89344, + "grad_norm": 0.5078505873680115, + "learning_rate": 0.00022448512585812355, + "loss": 3.9513, + "step": 2262 + }, + { + "epoch": 2.89472, + "grad_norm": 0.5012052655220032, + "learning_rate": 0.00022444474357248618, + "loss": 3.9266, + "step": 2263 + }, + { + "epoch": 2.896, + "grad_norm": 0.5153883695602417, + "learning_rate": 0.00022440436128684881, + "loss": 3.9948, + "step": 2264 + }, + { + "epoch": 2.89728, + "grad_norm": 0.5152749419212341, + "learning_rate": 0.00022436397900121144, + "loss": 3.9879, + "step": 2265 + }, + { + "epoch": 2.89856, + "grad_norm": 0.5094638466835022, + "learning_rate": 0.0002243235967155741, + "loss": 3.9949, + "step": 2266 + }, + { + "epoch": 2.89984, + "grad_norm": 0.489305317401886, + "learning_rate": 0.00022428321442993673, + "loss": 3.9818, + "step": 2267 + }, + { + "epoch": 2.90112, + "grad_norm": 0.4954513907432556, + "learning_rate": 0.00022424283214429933, + "loss": 3.9221, + "step": 2268 + }, + { + "epoch": 2.9024, + "grad_norm": 0.5154023170471191, + "learning_rate": 0.00022420244985866196, + "loss": 4.0049, + "step": 2269 + }, + { + "epoch": 2.90368, + "grad_norm": 0.5139758586883545, + "learning_rate": 0.00022416206757302462, + "loss": 3.9232, + "step": 2270 + }, + { + "epoch": 2.90496, + "grad_norm": 0.5243357419967651, + "learning_rate": 0.00022412168528738725, + "loss": 4.0215, + "step": 2271 + }, + { + "epoch": 2.90624, + "grad_norm": 0.5350946187973022, + "learning_rate": 0.00022408130300174988, + "loss": 3.9502, + "step": 2272 + }, + { + "epoch": 2.90752, + "grad_norm": 0.5469184517860413, + "learning_rate": 0.0002240409207161125, + "loss": 3.9617, + "step": 2273 + }, + { + "epoch": 2.9088000000000003, + "grad_norm": 0.5040557980537415, + "learning_rate": 0.00022400053843047514, + "loss": 3.9336, + "step": 2274 + }, + { + "epoch": 2.91008, + "grad_norm": 0.5210381150245667, + "learning_rate": 0.0002239601561448378, + "loss": 3.9707, + "step": 2275 + }, + { + "epoch": 2.91136, + "grad_norm": 0.5240786075592041, + "learning_rate": 0.0002239197738592004, + "loss": 3.9758, + "step": 2276 + }, + { + "epoch": 2.91264, + "grad_norm": 0.5405933260917664, + "learning_rate": 0.00022387939157356303, + "loss": 4.036, + "step": 2277 + }, + { + "epoch": 2.91392, + "grad_norm": 0.5046157240867615, + "learning_rate": 0.00022383900928792566, + "loss": 3.9727, + "step": 2278 + }, + { + "epoch": 2.9152, + "grad_norm": 0.49960970878601074, + "learning_rate": 0.00022379862700228832, + "loss": 4.0635, + "step": 2279 + }, + { + "epoch": 2.91648, + "grad_norm": 0.514678418636322, + "learning_rate": 0.00022375824471665095, + "loss": 4.01, + "step": 2280 + }, + { + "epoch": 2.91776, + "grad_norm": 0.5296341180801392, + "learning_rate": 0.00022371786243101358, + "loss": 3.9588, + "step": 2281 + }, + { + "epoch": 2.91904, + "grad_norm": 0.49709784984588623, + "learning_rate": 0.0002236774801453762, + "loss": 3.9068, + "step": 2282 + }, + { + "epoch": 2.9203200000000002, + "grad_norm": 0.5464451909065247, + "learning_rate": 0.00022363709785973887, + "loss": 4.0025, + "step": 2283 + }, + { + "epoch": 2.9215999999999998, + "grad_norm": 0.5068522095680237, + "learning_rate": 0.00022359671557410147, + "loss": 3.9793, + "step": 2284 + }, + { + "epoch": 2.92288, + "grad_norm": 0.5234406590461731, + "learning_rate": 0.0002235563332884641, + "loss": 3.9384, + "step": 2285 + }, + { + "epoch": 2.92416, + "grad_norm": 0.5124624967575073, + "learning_rate": 0.00022351595100282673, + "loss": 3.9735, + "step": 2286 + }, + { + "epoch": 2.92544, + "grad_norm": 0.5246328115463257, + "learning_rate": 0.00022347556871718936, + "loss": 4.0739, + "step": 2287 + }, + { + "epoch": 2.92672, + "grad_norm": 0.5275157690048218, + "learning_rate": 0.00022343518643155202, + "loss": 3.9339, + "step": 2288 + }, + { + "epoch": 2.928, + "grad_norm": 0.5206854939460754, + "learning_rate": 0.00022339480414591465, + "loss": 3.9908, + "step": 2289 + }, + { + "epoch": 2.92928, + "grad_norm": 0.50496506690979, + "learning_rate": 0.00022335442186027728, + "loss": 4.0184, + "step": 2290 + }, + { + "epoch": 2.93056, + "grad_norm": 0.5281472206115723, + "learning_rate": 0.00022331403957463988, + "loss": 4.0205, + "step": 2291 + }, + { + "epoch": 2.9318400000000002, + "grad_norm": 0.5035421252250671, + "learning_rate": 0.00022327365728900254, + "loss": 4.016, + "step": 2292 + }, + { + "epoch": 2.9331199999999997, + "grad_norm": 0.5101969242095947, + "learning_rate": 0.00022323327500336517, + "loss": 3.9902, + "step": 2293 + }, + { + "epoch": 2.9344, + "grad_norm": 0.49704429507255554, + "learning_rate": 0.0002231928927177278, + "loss": 3.9911, + "step": 2294 + }, + { + "epoch": 2.93568, + "grad_norm": 0.5134301781654358, + "learning_rate": 0.00022315251043209043, + "loss": 3.9861, + "step": 2295 + }, + { + "epoch": 2.93696, + "grad_norm": 0.5616884231567383, + "learning_rate": 0.00022311212814645309, + "loss": 4.0493, + "step": 2296 + }, + { + "epoch": 2.93824, + "grad_norm": 0.4961213767528534, + "learning_rate": 0.00022307174586081572, + "loss": 3.9555, + "step": 2297 + }, + { + "epoch": 2.93952, + "grad_norm": 0.5437341928482056, + "learning_rate": 0.00022303136357517835, + "loss": 3.9674, + "step": 2298 + }, + { + "epoch": 2.9408, + "grad_norm": 0.5309537053108215, + "learning_rate": 0.00022299098128954095, + "loss": 4.0694, + "step": 2299 + }, + { + "epoch": 2.94208, + "grad_norm": 0.5304505825042725, + "learning_rate": 0.00022295059900390358, + "loss": 3.9685, + "step": 2300 + }, + { + "epoch": 2.94336, + "grad_norm": 0.5457247495651245, + "learning_rate": 0.00022291021671826624, + "loss": 3.9616, + "step": 2301 + }, + { + "epoch": 2.94464, + "grad_norm": 0.5207186341285706, + "learning_rate": 0.00022286983443262887, + "loss": 3.9636, + "step": 2302 + }, + { + "epoch": 2.94592, + "grad_norm": 0.511628270149231, + "learning_rate": 0.0002228294521469915, + "loss": 4.0361, + "step": 2303 + }, + { + "epoch": 2.9472, + "grad_norm": 0.5089471340179443, + "learning_rate": 0.00022278906986135413, + "loss": 3.9737, + "step": 2304 + }, + { + "epoch": 2.94848, + "grad_norm": 0.5251920819282532, + "learning_rate": 0.00022274868757571678, + "loss": 4.0431, + "step": 2305 + }, + { + "epoch": 2.94976, + "grad_norm": 0.528783917427063, + "learning_rate": 0.0002227083052900794, + "loss": 3.9471, + "step": 2306 + }, + { + "epoch": 2.95104, + "grad_norm": 0.507645845413208, + "learning_rate": 0.00022266792300444202, + "loss": 4.0272, + "step": 2307 + }, + { + "epoch": 2.9523200000000003, + "grad_norm": 0.5167163014411926, + "learning_rate": 0.00022262754071880465, + "loss": 3.9864, + "step": 2308 + }, + { + "epoch": 2.9536, + "grad_norm": 0.48903942108154297, + "learning_rate": 0.0002225871584331673, + "loss": 4.0455, + "step": 2309 + }, + { + "epoch": 2.95488, + "grad_norm": 0.5110606551170349, + "learning_rate": 0.00022254677614752993, + "loss": 3.9581, + "step": 2310 + }, + { + "epoch": 2.95616, + "grad_norm": 0.5128504633903503, + "learning_rate": 0.00022250639386189256, + "loss": 3.9707, + "step": 2311 + }, + { + "epoch": 2.95744, + "grad_norm": 0.5019478797912598, + "learning_rate": 0.0002224660115762552, + "loss": 4.0202, + "step": 2312 + }, + { + "epoch": 2.95872, + "grad_norm": 0.4747721254825592, + "learning_rate": 0.00022242562929061785, + "loss": 4.0257, + "step": 2313 + }, + { + "epoch": 2.96, + "grad_norm": 0.4858660399913788, + "learning_rate": 0.00022238524700498048, + "loss": 3.9312, + "step": 2314 + }, + { + "epoch": 2.96128, + "grad_norm": 0.49668505787849426, + "learning_rate": 0.00022234486471934308, + "loss": 4.0515, + "step": 2315 + }, + { + "epoch": 2.96256, + "grad_norm": 0.5372991561889648, + "learning_rate": 0.00022230448243370571, + "loss": 3.8684, + "step": 2316 + }, + { + "epoch": 2.9638400000000003, + "grad_norm": 0.49556052684783936, + "learning_rate": 0.00022226410014806834, + "loss": 4.0073, + "step": 2317 + }, + { + "epoch": 2.9651199999999998, + "grad_norm": 0.5234814882278442, + "learning_rate": 0.000222223717862431, + "loss": 3.9878, + "step": 2318 + }, + { + "epoch": 2.9664, + "grad_norm": 0.49563363194465637, + "learning_rate": 0.00022218333557679363, + "loss": 4.0238, + "step": 2319 + }, + { + "epoch": 2.96768, + "grad_norm": 0.5069983005523682, + "learning_rate": 0.00022214295329115626, + "loss": 3.9518, + "step": 2320 + }, + { + "epoch": 2.96896, + "grad_norm": 0.5253263711929321, + "learning_rate": 0.0002221025710055189, + "loss": 4.012, + "step": 2321 + }, + { + "epoch": 2.97024, + "grad_norm": 0.5037320256233215, + "learning_rate": 0.00022206218871988155, + "loss": 3.965, + "step": 2322 + }, + { + "epoch": 2.97152, + "grad_norm": 0.5014638900756836, + "learning_rate": 0.00022202180643424415, + "loss": 3.9803, + "step": 2323 + }, + { + "epoch": 2.9728, + "grad_norm": 0.5158342719078064, + "learning_rate": 0.00022198142414860678, + "loss": 3.9949, + "step": 2324 + }, + { + "epoch": 2.97408, + "grad_norm": 0.5213910937309265, + "learning_rate": 0.0002219410418629694, + "loss": 3.9926, + "step": 2325 + }, + { + "epoch": 2.9753600000000002, + "grad_norm": 0.5123266577720642, + "learning_rate": 0.00022190065957733207, + "loss": 3.957, + "step": 2326 + }, + { + "epoch": 2.9766399999999997, + "grad_norm": 0.5103859305381775, + "learning_rate": 0.0002218602772916947, + "loss": 4.0097, + "step": 2327 + }, + { + "epoch": 2.97792, + "grad_norm": 0.5117716789245605, + "learning_rate": 0.00022181989500605733, + "loss": 3.9832, + "step": 2328 + }, + { + "epoch": 2.9792, + "grad_norm": 0.48839670419692993, + "learning_rate": 0.00022177951272041996, + "loss": 4.0224, + "step": 2329 + }, + { + "epoch": 2.98048, + "grad_norm": 0.5360726714134216, + "learning_rate": 0.00022173913043478256, + "loss": 3.9817, + "step": 2330 + }, + { + "epoch": 2.98176, + "grad_norm": 0.4825299382209778, + "learning_rate": 0.00022169874814914525, + "loss": 3.9989, + "step": 2331 + }, + { + "epoch": 2.98304, + "grad_norm": 0.5100297331809998, + "learning_rate": 0.00022165836586350785, + "loss": 3.9841, + "step": 2332 + }, + { + "epoch": 2.98432, + "grad_norm": 0.4855094850063324, + "learning_rate": 0.00022161798357787048, + "loss": 3.963, + "step": 2333 + }, + { + "epoch": 2.9856, + "grad_norm": 0.48381808400154114, + "learning_rate": 0.0002215776012922331, + "loss": 3.9805, + "step": 2334 + }, + { + "epoch": 2.98688, + "grad_norm": 0.4909382462501526, + "learning_rate": 0.00022153721900659577, + "loss": 3.9692, + "step": 2335 + }, + { + "epoch": 2.98816, + "grad_norm": 0.4872201085090637, + "learning_rate": 0.0002214968367209584, + "loss": 3.9689, + "step": 2336 + }, + { + "epoch": 2.98944, + "grad_norm": 0.470969557762146, + "learning_rate": 0.00022145645443532103, + "loss": 3.9348, + "step": 2337 + }, + { + "epoch": 2.99072, + "grad_norm": 0.4978778064250946, + "learning_rate": 0.00022141607214968363, + "loss": 3.9148, + "step": 2338 + }, + { + "epoch": 2.992, + "grad_norm": 0.48822498321533203, + "learning_rate": 0.00022137568986404631, + "loss": 3.858, + "step": 2339 + }, + { + "epoch": 2.99328, + "grad_norm": 0.4724681079387665, + "learning_rate": 0.00022133530757840892, + "loss": 3.9882, + "step": 2340 + }, + { + "epoch": 2.99456, + "grad_norm": 0.5068411231040955, + "learning_rate": 0.00022129492529277155, + "loss": 3.9354, + "step": 2341 + }, + { + "epoch": 2.99584, + "grad_norm": 0.49231788516044617, + "learning_rate": 0.00022125454300713418, + "loss": 3.9973, + "step": 2342 + }, + { + "epoch": 2.99712, + "grad_norm": 0.5040780901908875, + "learning_rate": 0.0002212141607214968, + "loss": 3.9624, + "step": 2343 + }, + { + "epoch": 2.9984, + "grad_norm": 0.5048054456710815, + "learning_rate": 0.00022117377843585946, + "loss": 4.0559, + "step": 2344 + }, + { + "epoch": 2.99968, + "grad_norm": 0.5000688433647156, + "learning_rate": 0.0002211333961502221, + "loss": 4.0228, + "step": 2345 + }, + { + "epoch": 3.0, + "grad_norm": 0.9295437932014465, + "learning_rate": 0.00022109301386458472, + "loss": 3.9067, + "step": 2346 + }, + { + "epoch": 3.00128, + "grad_norm": 0.5954660177230835, + "learning_rate": 0.00022105263157894733, + "loss": 3.7776, + "step": 2347 + }, + { + "epoch": 3.00256, + "grad_norm": 0.5306954979896545, + "learning_rate": 0.00022101224929330998, + "loss": 3.8486, + "step": 2348 + }, + { + "epoch": 3.00384, + "grad_norm": 0.5421395897865295, + "learning_rate": 0.00022097186700767261, + "loss": 3.8924, + "step": 2349 + }, + { + "epoch": 3.00512, + "grad_norm": 0.5273167490959167, + "learning_rate": 0.00022093148472203524, + "loss": 3.8585, + "step": 2350 + }, + { + "epoch": 3.0064, + "grad_norm": 0.5256027579307556, + "learning_rate": 0.00022089110243639788, + "loss": 3.8755, + "step": 2351 + }, + { + "epoch": 3.00768, + "grad_norm": 0.5198791027069092, + "learning_rate": 0.00022085072015076053, + "loss": 3.8962, + "step": 2352 + }, + { + "epoch": 3.00896, + "grad_norm": 0.5103099346160889, + "learning_rate": 0.00022081033786512316, + "loss": 3.8984, + "step": 2353 + }, + { + "epoch": 3.01024, + "grad_norm": 0.48842713236808777, + "learning_rate": 0.0002207699555794858, + "loss": 3.9612, + "step": 2354 + }, + { + "epoch": 3.01152, + "grad_norm": 0.5094363689422607, + "learning_rate": 0.0002207295732938484, + "loss": 3.8044, + "step": 2355 + }, + { + "epoch": 3.0128, + "grad_norm": 0.537132740020752, + "learning_rate": 0.00022068919100821103, + "loss": 3.9496, + "step": 2356 + }, + { + "epoch": 3.01408, + "grad_norm": 0.5116694569587708, + "learning_rate": 0.00022064880872257368, + "loss": 3.8491, + "step": 2357 + }, + { + "epoch": 3.01536, + "grad_norm": 0.5091819763183594, + "learning_rate": 0.0002206084264369363, + "loss": 3.8916, + "step": 2358 + }, + { + "epoch": 3.01664, + "grad_norm": 0.5170096158981323, + "learning_rate": 0.00022056804415129894, + "loss": 3.9251, + "step": 2359 + }, + { + "epoch": 3.01792, + "grad_norm": 0.4943462014198303, + "learning_rate": 0.00022052766186566157, + "loss": 3.9045, + "step": 2360 + }, + { + "epoch": 3.0192, + "grad_norm": 0.4762275815010071, + "learning_rate": 0.00022048727958002423, + "loss": 3.8835, + "step": 2361 + }, + { + "epoch": 3.02048, + "grad_norm": 0.5517547130584717, + "learning_rate": 0.00022044689729438686, + "loss": 3.8825, + "step": 2362 + }, + { + "epoch": 3.02176, + "grad_norm": 0.5428676605224609, + "learning_rate": 0.00022040651500874946, + "loss": 3.8153, + "step": 2363 + }, + { + "epoch": 3.02304, + "grad_norm": 0.5272209644317627, + "learning_rate": 0.0002203661327231121, + "loss": 3.8748, + "step": 2364 + }, + { + "epoch": 3.02432, + "grad_norm": 0.5124238729476929, + "learning_rate": 0.00022032575043747475, + "loss": 3.9815, + "step": 2365 + }, + { + "epoch": 3.0256, + "grad_norm": 0.5289603471755981, + "learning_rate": 0.00022028536815183738, + "loss": 3.8756, + "step": 2366 + }, + { + "epoch": 3.02688, + "grad_norm": 0.5256252884864807, + "learning_rate": 0.0002202449858662, + "loss": 3.8492, + "step": 2367 + }, + { + "epoch": 3.02816, + "grad_norm": 0.5126022696495056, + "learning_rate": 0.00022020460358056264, + "loss": 3.814, + "step": 2368 + }, + { + "epoch": 3.02944, + "grad_norm": 0.5239128470420837, + "learning_rate": 0.00022016422129492527, + "loss": 3.8932, + "step": 2369 + }, + { + "epoch": 3.03072, + "grad_norm": 0.49754399061203003, + "learning_rate": 0.00022012383900928793, + "loss": 3.8759, + "step": 2370 + }, + { + "epoch": 3.032, + "grad_norm": 0.49979376792907715, + "learning_rate": 0.00022008345672365053, + "loss": 3.9152, + "step": 2371 + }, + { + "epoch": 3.03328, + "grad_norm": 0.48074454069137573, + "learning_rate": 0.00022004307443801316, + "loss": 3.8479, + "step": 2372 + }, + { + "epoch": 3.03456, + "grad_norm": 0.5178848505020142, + "learning_rate": 0.0002200026921523758, + "loss": 3.8689, + "step": 2373 + }, + { + "epoch": 3.03584, + "grad_norm": 0.5065923929214478, + "learning_rate": 0.00021996230986673845, + "loss": 3.868, + "step": 2374 + }, + { + "epoch": 3.03712, + "grad_norm": 0.4748089015483856, + "learning_rate": 0.00021992192758110108, + "loss": 3.8268, + "step": 2375 + }, + { + "epoch": 3.0384, + "grad_norm": 0.4894319474697113, + "learning_rate": 0.0002198815452954637, + "loss": 3.7976, + "step": 2376 + }, + { + "epoch": 3.03968, + "grad_norm": 0.5088182687759399, + "learning_rate": 0.00021984116300982634, + "loss": 3.8785, + "step": 2377 + }, + { + "epoch": 3.04096, + "grad_norm": 0.48660656809806824, + "learning_rate": 0.000219800780724189, + "loss": 3.8136, + "step": 2378 + }, + { + "epoch": 3.04224, + "grad_norm": 0.5052852034568787, + "learning_rate": 0.0002197603984385516, + "loss": 3.8922, + "step": 2379 + }, + { + "epoch": 3.04352, + "grad_norm": 0.49681487679481506, + "learning_rate": 0.00021972001615291423, + "loss": 3.8612, + "step": 2380 + }, + { + "epoch": 3.0448, + "grad_norm": 0.4781433045864105, + "learning_rate": 0.00021967963386727686, + "loss": 3.844, + "step": 2381 + }, + { + "epoch": 3.04608, + "grad_norm": 0.5147704482078552, + "learning_rate": 0.0002196392515816395, + "loss": 3.8779, + "step": 2382 + }, + { + "epoch": 3.04736, + "grad_norm": 0.49647682905197144, + "learning_rate": 0.00021959886929600215, + "loss": 3.8326, + "step": 2383 + }, + { + "epoch": 3.04864, + "grad_norm": 0.4942707121372223, + "learning_rate": 0.00021955848701036478, + "loss": 3.8618, + "step": 2384 + }, + { + "epoch": 3.04992, + "grad_norm": 0.5220282077789307, + "learning_rate": 0.0002195181047247274, + "loss": 3.9195, + "step": 2385 + }, + { + "epoch": 3.0512, + "grad_norm": 0.4788862466812134, + "learning_rate": 0.00021947772243909, + "loss": 3.8758, + "step": 2386 + }, + { + "epoch": 3.05248, + "grad_norm": 0.4899081289768219, + "learning_rate": 0.00021943734015345267, + "loss": 3.8233, + "step": 2387 + }, + { + "epoch": 3.05376, + "grad_norm": 0.4979023337364197, + "learning_rate": 0.0002193969578678153, + "loss": 3.8196, + "step": 2388 + }, + { + "epoch": 3.05504, + "grad_norm": 0.48769399523735046, + "learning_rate": 0.00021935657558217793, + "loss": 3.85, + "step": 2389 + }, + { + "epoch": 3.05632, + "grad_norm": 0.5051479935646057, + "learning_rate": 0.00021931619329654056, + "loss": 3.8659, + "step": 2390 + }, + { + "epoch": 3.0576, + "grad_norm": 0.508083701133728, + "learning_rate": 0.00021927581101090321, + "loss": 3.8542, + "step": 2391 + }, + { + "epoch": 3.05888, + "grad_norm": 0.5128031373023987, + "learning_rate": 0.00021923542872526584, + "loss": 3.9097, + "step": 2392 + }, + { + "epoch": 3.06016, + "grad_norm": 0.49602705240249634, + "learning_rate": 0.00021919504643962847, + "loss": 3.8487, + "step": 2393 + }, + { + "epoch": 3.06144, + "grad_norm": 0.5159900188446045, + "learning_rate": 0.00021915466415399108, + "loss": 3.7835, + "step": 2394 + }, + { + "epoch": 3.06272, + "grad_norm": 0.4902365803718567, + "learning_rate": 0.0002191142818683537, + "loss": 3.8462, + "step": 2395 + }, + { + "epoch": 3.064, + "grad_norm": 0.5062222480773926, + "learning_rate": 0.00021907389958271636, + "loss": 3.8156, + "step": 2396 + }, + { + "epoch": 3.06528, + "grad_norm": 0.5004411339759827, + "learning_rate": 0.000219033517297079, + "loss": 3.7797, + "step": 2397 + }, + { + "epoch": 3.06656, + "grad_norm": 0.5420650839805603, + "learning_rate": 0.00021899313501144162, + "loss": 3.9092, + "step": 2398 + }, + { + "epoch": 3.06784, + "grad_norm": 0.5353574752807617, + "learning_rate": 0.00021895275272580425, + "loss": 3.8596, + "step": 2399 + }, + { + "epoch": 3.06912, + "grad_norm": 0.4925222396850586, + "learning_rate": 0.0002189123704401669, + "loss": 3.8678, + "step": 2400 + }, + { + "epoch": 3.0704, + "grad_norm": 0.54100501537323, + "learning_rate": 0.00021887198815452954, + "loss": 3.8494, + "step": 2401 + }, + { + "epoch": 3.07168, + "grad_norm": 0.5081486105918884, + "learning_rate": 0.00021883160586889214, + "loss": 3.8965, + "step": 2402 + }, + { + "epoch": 3.07296, + "grad_norm": 0.5015511512756348, + "learning_rate": 0.00021879122358325477, + "loss": 3.9272, + "step": 2403 + }, + { + "epoch": 3.07424, + "grad_norm": 0.5019526481628418, + "learning_rate": 0.00021875084129761743, + "loss": 3.8459, + "step": 2404 + }, + { + "epoch": 3.07552, + "grad_norm": 0.491059273481369, + "learning_rate": 0.00021871045901198006, + "loss": 3.8815, + "step": 2405 + }, + { + "epoch": 3.0768, + "grad_norm": 0.5351020097732544, + "learning_rate": 0.0002186700767263427, + "loss": 3.8489, + "step": 2406 + }, + { + "epoch": 3.07808, + "grad_norm": 0.5229183435440063, + "learning_rate": 0.00021862969444070532, + "loss": 3.8421, + "step": 2407 + }, + { + "epoch": 3.07936, + "grad_norm": 0.507849395275116, + "learning_rate": 0.00021858931215506795, + "loss": 3.8573, + "step": 2408 + }, + { + "epoch": 3.08064, + "grad_norm": 0.5348861813545227, + "learning_rate": 0.0002185489298694306, + "loss": 3.8958, + "step": 2409 + }, + { + "epoch": 3.08192, + "grad_norm": 0.5243686437606812, + "learning_rate": 0.0002185085475837932, + "loss": 3.8338, + "step": 2410 + }, + { + "epoch": 3.0832, + "grad_norm": 0.5294802784919739, + "learning_rate": 0.00021846816529815584, + "loss": 3.9149, + "step": 2411 + }, + { + "epoch": 3.08448, + "grad_norm": 0.5236945748329163, + "learning_rate": 0.00021842778301251847, + "loss": 3.848, + "step": 2412 + }, + { + "epoch": 3.08576, + "grad_norm": 0.5086913704872131, + "learning_rate": 0.00021838740072688113, + "loss": 3.794, + "step": 2413 + }, + { + "epoch": 3.08704, + "grad_norm": 0.5362405776977539, + "learning_rate": 0.00021834701844124376, + "loss": 3.9174, + "step": 2414 + }, + { + "epoch": 3.08832, + "grad_norm": 0.5077964067459106, + "learning_rate": 0.0002183066361556064, + "loss": 3.8253, + "step": 2415 + }, + { + "epoch": 3.0896, + "grad_norm": 0.5096865296363831, + "learning_rate": 0.00021826625386996902, + "loss": 3.8887, + "step": 2416 + }, + { + "epoch": 3.09088, + "grad_norm": 0.5185666084289551, + "learning_rate": 0.00021822587158433168, + "loss": 3.8292, + "step": 2417 + }, + { + "epoch": 3.09216, + "grad_norm": 0.4858289659023285, + "learning_rate": 0.0002181854892986943, + "loss": 3.8525, + "step": 2418 + }, + { + "epoch": 3.09344, + "grad_norm": 0.5283902287483215, + "learning_rate": 0.0002181451070130569, + "loss": 3.8476, + "step": 2419 + }, + { + "epoch": 3.09472, + "grad_norm": 0.5762797594070435, + "learning_rate": 0.00021810472472741954, + "loss": 3.8591, + "step": 2420 + }, + { + "epoch": 3.096, + "grad_norm": 0.5110985636711121, + "learning_rate": 0.00021806434244178217, + "loss": 3.7624, + "step": 2421 + }, + { + "epoch": 3.09728, + "grad_norm": 0.5029181241989136, + "learning_rate": 0.00021802396015614483, + "loss": 3.8236, + "step": 2422 + }, + { + "epoch": 3.09856, + "grad_norm": 0.5185674428939819, + "learning_rate": 0.00021798357787050746, + "loss": 3.8156, + "step": 2423 + }, + { + "epoch": 3.09984, + "grad_norm": 0.5150001645088196, + "learning_rate": 0.0002179431955848701, + "loss": 3.8182, + "step": 2424 + }, + { + "epoch": 3.10112, + "grad_norm": 0.4945181608200073, + "learning_rate": 0.0002179028132992327, + "loss": 3.8906, + "step": 2425 + }, + { + "epoch": 3.1024, + "grad_norm": 0.5029820799827576, + "learning_rate": 0.00021786243101359537, + "loss": 3.8907, + "step": 2426 + }, + { + "epoch": 3.1036799999999998, + "grad_norm": 0.5240580439567566, + "learning_rate": 0.00021782204872795798, + "loss": 3.911, + "step": 2427 + }, + { + "epoch": 3.10496, + "grad_norm": 0.526394248008728, + "learning_rate": 0.0002177816664423206, + "loss": 3.928, + "step": 2428 + }, + { + "epoch": 3.10624, + "grad_norm": 0.517666757106781, + "learning_rate": 0.00021774128415668324, + "loss": 3.8165, + "step": 2429 + }, + { + "epoch": 3.10752, + "grad_norm": 0.5141002535820007, + "learning_rate": 0.0002177009018710459, + "loss": 3.7981, + "step": 2430 + }, + { + "epoch": 3.1088, + "grad_norm": 0.4856560230255127, + "learning_rate": 0.00021766051958540853, + "loss": 3.8666, + "step": 2431 + }, + { + "epoch": 3.11008, + "grad_norm": 0.5310648083686829, + "learning_rate": 0.00021762013729977116, + "loss": 3.8851, + "step": 2432 + }, + { + "epoch": 3.11136, + "grad_norm": 0.5003724098205566, + "learning_rate": 0.00021757975501413379, + "loss": 3.8379, + "step": 2433 + }, + { + "epoch": 3.11264, + "grad_norm": 0.5054803490638733, + "learning_rate": 0.0002175393727284964, + "loss": 3.7736, + "step": 2434 + }, + { + "epoch": 3.11392, + "grad_norm": 0.4943746328353882, + "learning_rate": 0.00021749899044285905, + "loss": 3.8826, + "step": 2435 + }, + { + "epoch": 3.1152, + "grad_norm": 0.49656054377555847, + "learning_rate": 0.00021745860815722168, + "loss": 3.838, + "step": 2436 + }, + { + "epoch": 3.11648, + "grad_norm": 0.5456673502922058, + "learning_rate": 0.0002174182258715843, + "loss": 3.9265, + "step": 2437 + }, + { + "epoch": 3.11776, + "grad_norm": 0.4923163056373596, + "learning_rate": 0.00021737784358594694, + "loss": 3.9262, + "step": 2438 + }, + { + "epoch": 3.11904, + "grad_norm": 0.5190243124961853, + "learning_rate": 0.0002173374613003096, + "loss": 3.8364, + "step": 2439 + }, + { + "epoch": 3.12032, + "grad_norm": 0.47707489132881165, + "learning_rate": 0.00021729707901467222, + "loss": 3.8502, + "step": 2440 + }, + { + "epoch": 3.1216, + "grad_norm": 0.5326244235038757, + "learning_rate": 0.00021725669672903485, + "loss": 3.8078, + "step": 2441 + }, + { + "epoch": 3.12288, + "grad_norm": 0.5211552381515503, + "learning_rate": 0.00021721631444339746, + "loss": 3.8304, + "step": 2442 + }, + { + "epoch": 3.12416, + "grad_norm": 0.524936854839325, + "learning_rate": 0.0002171759321577601, + "loss": 3.7785, + "step": 2443 + }, + { + "epoch": 3.12544, + "grad_norm": 0.5139725804328918, + "learning_rate": 0.00021713554987212274, + "loss": 3.8666, + "step": 2444 + }, + { + "epoch": 3.12672, + "grad_norm": 0.5270793437957764, + "learning_rate": 0.00021709516758648537, + "loss": 3.8508, + "step": 2445 + }, + { + "epoch": 3.128, + "grad_norm": 0.5018444657325745, + "learning_rate": 0.000217054785300848, + "loss": 3.8951, + "step": 2446 + }, + { + "epoch": 3.12928, + "grad_norm": 0.5170176029205322, + "learning_rate": 0.00021701440301521066, + "loss": 3.8977, + "step": 2447 + }, + { + "epoch": 3.13056, + "grad_norm": 0.49703750014305115, + "learning_rate": 0.0002169740207295733, + "loss": 3.7843, + "step": 2448 + }, + { + "epoch": 3.13184, + "grad_norm": 0.5318528413772583, + "learning_rate": 0.00021693363844393592, + "loss": 3.9147, + "step": 2449 + }, + { + "epoch": 3.13312, + "grad_norm": 0.5124686360359192, + "learning_rate": 0.00021689325615829852, + "loss": 3.8174, + "step": 2450 + }, + { + "epoch": 3.1344, + "grad_norm": 0.5229965448379517, + "learning_rate": 0.00021685287387266115, + "loss": 3.9171, + "step": 2451 + }, + { + "epoch": 3.13568, + "grad_norm": 0.5190241932868958, + "learning_rate": 0.0002168124915870238, + "loss": 3.8315, + "step": 2452 + }, + { + "epoch": 3.13696, + "grad_norm": 0.5506916642189026, + "learning_rate": 0.00021677210930138644, + "loss": 3.8351, + "step": 2453 + }, + { + "epoch": 3.13824, + "grad_norm": 0.5469547510147095, + "learning_rate": 0.00021673172701574907, + "loss": 3.9098, + "step": 2454 + }, + { + "epoch": 3.13952, + "grad_norm": 0.5336132049560547, + "learning_rate": 0.0002166913447301117, + "loss": 3.8963, + "step": 2455 + }, + { + "epoch": 3.1408, + "grad_norm": 0.549925684928894, + "learning_rate": 0.00021665096244447436, + "loss": 3.8541, + "step": 2456 + }, + { + "epoch": 3.14208, + "grad_norm": 0.5148488283157349, + "learning_rate": 0.000216610580158837, + "loss": 3.8654, + "step": 2457 + }, + { + "epoch": 3.14336, + "grad_norm": 0.5574371218681335, + "learning_rate": 0.0002165701978731996, + "loss": 3.9102, + "step": 2458 + }, + { + "epoch": 3.14464, + "grad_norm": 0.5164331793785095, + "learning_rate": 0.00021652981558756222, + "loss": 3.7154, + "step": 2459 + }, + { + "epoch": 3.14592, + "grad_norm": 0.5179708003997803, + "learning_rate": 0.00021648943330192488, + "loss": 3.9097, + "step": 2460 + }, + { + "epoch": 3.1471999999999998, + "grad_norm": 0.5360696315765381, + "learning_rate": 0.0002164490510162875, + "loss": 3.8359, + "step": 2461 + }, + { + "epoch": 3.14848, + "grad_norm": 0.5113580822944641, + "learning_rate": 0.00021640866873065014, + "loss": 3.911, + "step": 2462 + }, + { + "epoch": 3.14976, + "grad_norm": 0.5048407316207886, + "learning_rate": 0.00021636828644501277, + "loss": 3.8197, + "step": 2463 + }, + { + "epoch": 3.15104, + "grad_norm": 0.5186136960983276, + "learning_rate": 0.0002163279041593754, + "loss": 3.8374, + "step": 2464 + }, + { + "epoch": 3.15232, + "grad_norm": 0.4925592243671417, + "learning_rate": 0.00021628752187373806, + "loss": 3.8343, + "step": 2465 + }, + { + "epoch": 3.1536, + "grad_norm": 0.49732711911201477, + "learning_rate": 0.00021624713958810066, + "loss": 3.8915, + "step": 2466 + }, + { + "epoch": 3.15488, + "grad_norm": 0.5022438764572144, + "learning_rate": 0.0002162067573024633, + "loss": 3.8251, + "step": 2467 + }, + { + "epoch": 3.15616, + "grad_norm": 0.5367256999015808, + "learning_rate": 0.00021616637501682592, + "loss": 3.8802, + "step": 2468 + }, + { + "epoch": 3.15744, + "grad_norm": 0.4930475950241089, + "learning_rate": 0.00021612599273118858, + "loss": 3.8364, + "step": 2469 + }, + { + "epoch": 3.15872, + "grad_norm": 0.5071558356285095, + "learning_rate": 0.0002160856104455512, + "loss": 3.828, + "step": 2470 + }, + { + "epoch": 3.16, + "grad_norm": 0.5163344740867615, + "learning_rate": 0.00021604522815991384, + "loss": 3.7945, + "step": 2471 + }, + { + "epoch": 3.16128, + "grad_norm": 0.5042238831520081, + "learning_rate": 0.00021600484587427647, + "loss": 3.8724, + "step": 2472 + }, + { + "epoch": 3.16256, + "grad_norm": 0.5206556916236877, + "learning_rate": 0.00021596446358863912, + "loss": 3.8445, + "step": 2473 + }, + { + "epoch": 3.16384, + "grad_norm": 0.4869347810745239, + "learning_rate": 0.00021592408130300173, + "loss": 3.8982, + "step": 2474 + }, + { + "epoch": 3.16512, + "grad_norm": 0.4939110279083252, + "learning_rate": 0.00021588369901736436, + "loss": 3.8215, + "step": 2475 + }, + { + "epoch": 3.1664, + "grad_norm": 0.4792523980140686, + "learning_rate": 0.000215843316731727, + "loss": 3.8195, + "step": 2476 + }, + { + "epoch": 3.16768, + "grad_norm": 0.4833280146121979, + "learning_rate": 0.00021580293444608962, + "loss": 3.878, + "step": 2477 + }, + { + "epoch": 3.16896, + "grad_norm": 0.48897233605384827, + "learning_rate": 0.00021576255216045227, + "loss": 3.8418, + "step": 2478 + }, + { + "epoch": 3.17024, + "grad_norm": 0.5081768035888672, + "learning_rate": 0.0002157221698748149, + "loss": 3.8296, + "step": 2479 + }, + { + "epoch": 3.17152, + "grad_norm": 0.499881386756897, + "learning_rate": 0.00021568178758917753, + "loss": 3.9065, + "step": 2480 + }, + { + "epoch": 3.1728, + "grad_norm": 0.5089867115020752, + "learning_rate": 0.00021564140530354014, + "loss": 3.9007, + "step": 2481 + }, + { + "epoch": 3.17408, + "grad_norm": 0.490253746509552, + "learning_rate": 0.0002156010230179028, + "loss": 3.9089, + "step": 2482 + }, + { + "epoch": 3.17536, + "grad_norm": 0.49601680040359497, + "learning_rate": 0.00021556064073226542, + "loss": 3.8876, + "step": 2483 + }, + { + "epoch": 3.17664, + "grad_norm": 0.5113343000411987, + "learning_rate": 0.00021552025844662805, + "loss": 3.8393, + "step": 2484 + }, + { + "epoch": 3.17792, + "grad_norm": 0.47494059801101685, + "learning_rate": 0.00021547987616099068, + "loss": 3.8461, + "step": 2485 + }, + { + "epoch": 3.1792, + "grad_norm": 0.5105437636375427, + "learning_rate": 0.00021543949387535334, + "loss": 3.8837, + "step": 2486 + }, + { + "epoch": 3.18048, + "grad_norm": 0.5032381415367126, + "learning_rate": 0.00021539911158971597, + "loss": 3.8457, + "step": 2487 + }, + { + "epoch": 3.18176, + "grad_norm": 0.49342939257621765, + "learning_rate": 0.0002153587293040786, + "loss": 3.819, + "step": 2488 + }, + { + "epoch": 3.18304, + "grad_norm": 0.49922072887420654, + "learning_rate": 0.0002153183470184412, + "loss": 3.792, + "step": 2489 + }, + { + "epoch": 3.18432, + "grad_norm": 0.49167224764823914, + "learning_rate": 0.00021527796473280384, + "loss": 3.8155, + "step": 2490 + }, + { + "epoch": 3.1856, + "grad_norm": 0.4955747425556183, + "learning_rate": 0.0002152375824471665, + "loss": 3.9052, + "step": 2491 + }, + { + "epoch": 3.18688, + "grad_norm": 0.513163149356842, + "learning_rate": 0.00021519720016152912, + "loss": 3.946, + "step": 2492 + }, + { + "epoch": 3.18816, + "grad_norm": 0.4796956181526184, + "learning_rate": 0.00021515681787589175, + "loss": 3.8418, + "step": 2493 + }, + { + "epoch": 3.18944, + "grad_norm": 0.5129488110542297, + "learning_rate": 0.00021511643559025438, + "loss": 3.9219, + "step": 2494 + }, + { + "epoch": 3.19072, + "grad_norm": 0.49274012446403503, + "learning_rate": 0.00021507605330461704, + "loss": 3.9179, + "step": 2495 + }, + { + "epoch": 3.192, + "grad_norm": 0.5269827246665955, + "learning_rate": 0.00021503567101897967, + "loss": 3.8845, + "step": 2496 + }, + { + "epoch": 3.19328, + "grad_norm": 0.5039204955101013, + "learning_rate": 0.00021499528873334227, + "loss": 3.8432, + "step": 2497 + }, + { + "epoch": 3.19456, + "grad_norm": 0.5350068807601929, + "learning_rate": 0.0002149549064477049, + "loss": 3.7202, + "step": 2498 + }, + { + "epoch": 3.19584, + "grad_norm": 0.49248334765434265, + "learning_rate": 0.00021491452416206756, + "loss": 3.848, + "step": 2499 + }, + { + "epoch": 3.19712, + "grad_norm": 0.5159714221954346, + "learning_rate": 0.0002148741418764302, + "loss": 3.8145, + "step": 2500 + }, + { + "epoch": 3.1984, + "grad_norm": 0.4729326367378235, + "learning_rate": 0.00021483375959079282, + "loss": 3.9304, + "step": 2501 + }, + { + "epoch": 3.19968, + "grad_norm": 0.5187593698501587, + "learning_rate": 0.00021479337730515545, + "loss": 3.8462, + "step": 2502 + }, + { + "epoch": 3.20096, + "grad_norm": 0.521251380443573, + "learning_rate": 0.00021475299501951808, + "loss": 3.8268, + "step": 2503 + }, + { + "epoch": 3.20224, + "grad_norm": 0.4812679886817932, + "learning_rate": 0.00021471261273388074, + "loss": 3.8409, + "step": 2504 + }, + { + "epoch": 3.20352, + "grad_norm": 0.4940718114376068, + "learning_rate": 0.00021467223044824337, + "loss": 3.8029, + "step": 2505 + }, + { + "epoch": 3.2048, + "grad_norm": 0.5103253126144409, + "learning_rate": 0.00021463184816260597, + "loss": 3.8354, + "step": 2506 + }, + { + "epoch": 3.20608, + "grad_norm": 0.5237296223640442, + "learning_rate": 0.0002145914658769686, + "loss": 3.8567, + "step": 2507 + }, + { + "epoch": 3.20736, + "grad_norm": 0.5030867457389832, + "learning_rate": 0.00021455108359133126, + "loss": 3.8715, + "step": 2508 + }, + { + "epoch": 3.20864, + "grad_norm": 0.49784404039382935, + "learning_rate": 0.0002145107013056939, + "loss": 3.7889, + "step": 2509 + }, + { + "epoch": 3.20992, + "grad_norm": 0.5086187124252319, + "learning_rate": 0.00021447031902005652, + "loss": 3.9156, + "step": 2510 + }, + { + "epoch": 3.2112, + "grad_norm": 0.5107704401016235, + "learning_rate": 0.00021442993673441915, + "loss": 3.7905, + "step": 2511 + }, + { + "epoch": 3.2124800000000002, + "grad_norm": 0.4935329258441925, + "learning_rate": 0.0002143895544487818, + "loss": 3.8257, + "step": 2512 + }, + { + "epoch": 3.21376, + "grad_norm": 0.5265173316001892, + "learning_rate": 0.00021434917216314444, + "loss": 3.8725, + "step": 2513 + }, + { + "epoch": 3.21504, + "grad_norm": 0.5278881192207336, + "learning_rate": 0.00021430878987750704, + "loss": 3.8997, + "step": 2514 + }, + { + "epoch": 3.21632, + "grad_norm": 0.5073565244674683, + "learning_rate": 0.00021426840759186967, + "loss": 3.8788, + "step": 2515 + }, + { + "epoch": 3.2176, + "grad_norm": 0.5007202625274658, + "learning_rate": 0.0002142280253062323, + "loss": 3.8289, + "step": 2516 + }, + { + "epoch": 3.21888, + "grad_norm": 0.5268493294715881, + "learning_rate": 0.00021418764302059496, + "loss": 3.9057, + "step": 2517 + }, + { + "epoch": 3.22016, + "grad_norm": 0.5133515000343323, + "learning_rate": 0.00021414726073495759, + "loss": 3.9227, + "step": 2518 + }, + { + "epoch": 3.22144, + "grad_norm": 0.5341963171958923, + "learning_rate": 0.00021410687844932022, + "loss": 3.8918, + "step": 2519 + }, + { + "epoch": 3.22272, + "grad_norm": 0.488156795501709, + "learning_rate": 0.00021406649616368285, + "loss": 3.9192, + "step": 2520 + }, + { + "epoch": 3.224, + "grad_norm": 0.5290320515632629, + "learning_rate": 0.0002140261138780455, + "loss": 3.7908, + "step": 2521 + }, + { + "epoch": 3.22528, + "grad_norm": 0.510662853717804, + "learning_rate": 0.0002139857315924081, + "loss": 3.837, + "step": 2522 + }, + { + "epoch": 3.22656, + "grad_norm": 0.5320321321487427, + "learning_rate": 0.00021394534930677074, + "loss": 3.8889, + "step": 2523 + }, + { + "epoch": 3.22784, + "grad_norm": 0.5298739075660706, + "learning_rate": 0.00021390496702113337, + "loss": 3.9002, + "step": 2524 + }, + { + "epoch": 3.22912, + "grad_norm": 0.5062413215637207, + "learning_rate": 0.00021386458473549602, + "loss": 3.8557, + "step": 2525 + }, + { + "epoch": 3.2304, + "grad_norm": 0.5112326145172119, + "learning_rate": 0.00021382420244985865, + "loss": 3.8719, + "step": 2526 + }, + { + "epoch": 3.23168, + "grad_norm": 0.48387032747268677, + "learning_rate": 0.00021378382016422128, + "loss": 3.8151, + "step": 2527 + }, + { + "epoch": 3.23296, + "grad_norm": 0.49385952949523926, + "learning_rate": 0.0002137434378785839, + "loss": 3.8642, + "step": 2528 + }, + { + "epoch": 3.23424, + "grad_norm": 0.5138410925865173, + "learning_rate": 0.00021370305559294652, + "loss": 3.9133, + "step": 2529 + }, + { + "epoch": 3.23552, + "grad_norm": 0.5019318461418152, + "learning_rate": 0.00021366267330730917, + "loss": 3.8908, + "step": 2530 + }, + { + "epoch": 3.2368, + "grad_norm": 0.4930397868156433, + "learning_rate": 0.0002136222910216718, + "loss": 3.8062, + "step": 2531 + }, + { + "epoch": 3.23808, + "grad_norm": 0.5121169090270996, + "learning_rate": 0.00021358190873603443, + "loss": 3.7471, + "step": 2532 + }, + { + "epoch": 3.23936, + "grad_norm": 0.4978577494621277, + "learning_rate": 0.00021354152645039706, + "loss": 3.9114, + "step": 2533 + }, + { + "epoch": 3.24064, + "grad_norm": 0.5160923004150391, + "learning_rate": 0.00021350114416475972, + "loss": 3.832, + "step": 2534 + }, + { + "epoch": 3.24192, + "grad_norm": 0.49270156025886536, + "learning_rate": 0.00021346076187912235, + "loss": 3.8092, + "step": 2535 + }, + { + "epoch": 3.2432, + "grad_norm": 0.5121193528175354, + "learning_rate": 0.00021342037959348498, + "loss": 3.7557, + "step": 2536 + }, + { + "epoch": 3.24448, + "grad_norm": 0.518182098865509, + "learning_rate": 0.00021337999730784758, + "loss": 3.8582, + "step": 2537 + }, + { + "epoch": 3.24576, + "grad_norm": 0.5109360814094543, + "learning_rate": 0.00021333961502221024, + "loss": 3.8122, + "step": 2538 + }, + { + "epoch": 3.24704, + "grad_norm": 0.5071660280227661, + "learning_rate": 0.00021329923273657287, + "loss": 3.8727, + "step": 2539 + }, + { + "epoch": 3.24832, + "grad_norm": 0.5436646342277527, + "learning_rate": 0.0002132588504509355, + "loss": 3.903, + "step": 2540 + }, + { + "epoch": 3.2496, + "grad_norm": 0.5585988163948059, + "learning_rate": 0.00021321846816529813, + "loss": 3.8499, + "step": 2541 + }, + { + "epoch": 3.25088, + "grad_norm": 0.5021224617958069, + "learning_rate": 0.00021317808587966076, + "loss": 3.8658, + "step": 2542 + }, + { + "epoch": 3.25216, + "grad_norm": 0.532137393951416, + "learning_rate": 0.00021313770359402342, + "loss": 3.8951, + "step": 2543 + }, + { + "epoch": 3.25344, + "grad_norm": 0.5324034690856934, + "learning_rate": 0.00021309732130838605, + "loss": 3.8472, + "step": 2544 + }, + { + "epoch": 3.25472, + "grad_norm": 0.5090122222900391, + "learning_rate": 0.00021305693902274865, + "loss": 3.8011, + "step": 2545 + }, + { + "epoch": 3.2560000000000002, + "grad_norm": 0.5446542501449585, + "learning_rate": 0.00021301655673711128, + "loss": 3.8029, + "step": 2546 + }, + { + "epoch": 3.25728, + "grad_norm": 0.5329357385635376, + "learning_rate": 0.00021297617445147394, + "loss": 3.7843, + "step": 2547 + }, + { + "epoch": 3.25856, + "grad_norm": 0.552749752998352, + "learning_rate": 0.00021293579216583657, + "loss": 3.8847, + "step": 2548 + }, + { + "epoch": 3.25984, + "grad_norm": 0.529541552066803, + "learning_rate": 0.0002128954098801992, + "loss": 3.9156, + "step": 2549 + }, + { + "epoch": 3.26112, + "grad_norm": 0.5321997404098511, + "learning_rate": 0.00021285502759456183, + "loss": 3.8645, + "step": 2550 + }, + { + "epoch": 3.2624, + "grad_norm": 0.5043097138404846, + "learning_rate": 0.0002128146453089245, + "loss": 3.8073, + "step": 2551 + }, + { + "epoch": 3.26368, + "grad_norm": 0.5317241549491882, + "learning_rate": 0.00021277426302328712, + "loss": 3.8946, + "step": 2552 + }, + { + "epoch": 3.26496, + "grad_norm": 0.5115827918052673, + "learning_rate": 0.00021273388073764972, + "loss": 3.8361, + "step": 2553 + }, + { + "epoch": 3.26624, + "grad_norm": 0.5349282622337341, + "learning_rate": 0.00021269349845201235, + "loss": 3.8482, + "step": 2554 + }, + { + "epoch": 3.26752, + "grad_norm": 0.5121451616287231, + "learning_rate": 0.00021265311616637498, + "loss": 3.8416, + "step": 2555 + }, + { + "epoch": 3.2688, + "grad_norm": 0.4995376765727997, + "learning_rate": 0.00021261273388073764, + "loss": 3.848, + "step": 2556 + }, + { + "epoch": 3.27008, + "grad_norm": 0.5239928960800171, + "learning_rate": 0.00021257235159510027, + "loss": 3.8595, + "step": 2557 + }, + { + "epoch": 3.27136, + "grad_norm": 0.5018921494483948, + "learning_rate": 0.0002125319693094629, + "loss": 3.8748, + "step": 2558 + }, + { + "epoch": 3.27264, + "grad_norm": 0.5001853108406067, + "learning_rate": 0.00021249158702382553, + "loss": 3.7935, + "step": 2559 + }, + { + "epoch": 3.27392, + "grad_norm": 0.5090051889419556, + "learning_rate": 0.00021245120473818818, + "loss": 3.8594, + "step": 2560 + }, + { + "epoch": 3.2752, + "grad_norm": 0.491327702999115, + "learning_rate": 0.0002124108224525508, + "loss": 3.8575, + "step": 2561 + }, + { + "epoch": 3.27648, + "grad_norm": 0.516203761100769, + "learning_rate": 0.00021237044016691342, + "loss": 3.8331, + "step": 2562 + }, + { + "epoch": 3.27776, + "grad_norm": 0.5098548531532288, + "learning_rate": 0.00021233005788127605, + "loss": 3.8881, + "step": 2563 + }, + { + "epoch": 3.27904, + "grad_norm": 0.5232280492782593, + "learning_rate": 0.0002122896755956387, + "loss": 3.8788, + "step": 2564 + }, + { + "epoch": 3.28032, + "grad_norm": 0.5226551294326782, + "learning_rate": 0.00021224929331000133, + "loss": 3.7725, + "step": 2565 + }, + { + "epoch": 3.2816, + "grad_norm": 0.5192921161651611, + "learning_rate": 0.00021220891102436396, + "loss": 3.9056, + "step": 2566 + }, + { + "epoch": 3.28288, + "grad_norm": 0.5096680521965027, + "learning_rate": 0.0002121685287387266, + "loss": 3.8403, + "step": 2567 + }, + { + "epoch": 3.28416, + "grad_norm": 0.4832972586154938, + "learning_rate": 0.00021212814645308925, + "loss": 3.8747, + "step": 2568 + }, + { + "epoch": 3.28544, + "grad_norm": 0.5360098481178284, + "learning_rate": 0.00021208776416745185, + "loss": 3.8489, + "step": 2569 + }, + { + "epoch": 3.28672, + "grad_norm": 0.5098423361778259, + "learning_rate": 0.00021204738188181448, + "loss": 3.8063, + "step": 2570 + }, + { + "epoch": 3.288, + "grad_norm": 0.5014640092849731, + "learning_rate": 0.00021200699959617712, + "loss": 3.8698, + "step": 2571 + }, + { + "epoch": 3.2892799999999998, + "grad_norm": 0.531622052192688, + "learning_rate": 0.00021196661731053975, + "loss": 4.001, + "step": 2572 + }, + { + "epoch": 3.29056, + "grad_norm": 0.5045267939567566, + "learning_rate": 0.0002119262350249024, + "loss": 3.8148, + "step": 2573 + }, + { + "epoch": 3.29184, + "grad_norm": 0.5068945288658142, + "learning_rate": 0.00021188585273926503, + "loss": 3.8387, + "step": 2574 + }, + { + "epoch": 3.29312, + "grad_norm": 0.5441416501998901, + "learning_rate": 0.00021184547045362766, + "loss": 3.8937, + "step": 2575 + }, + { + "epoch": 3.2944, + "grad_norm": 0.5135897397994995, + "learning_rate": 0.00021180508816799027, + "loss": 3.8456, + "step": 2576 + }, + { + "epoch": 3.29568, + "grad_norm": 0.5423394441604614, + "learning_rate": 0.00021176470588235295, + "loss": 3.9034, + "step": 2577 + }, + { + "epoch": 3.29696, + "grad_norm": 0.5269873738288879, + "learning_rate": 0.00021172432359671555, + "loss": 3.8488, + "step": 2578 + }, + { + "epoch": 3.29824, + "grad_norm": 0.57085782289505, + "learning_rate": 0.00021168394131107818, + "loss": 3.8606, + "step": 2579 + }, + { + "epoch": 3.2995200000000002, + "grad_norm": 0.5277544260025024, + "learning_rate": 0.0002116435590254408, + "loss": 3.8475, + "step": 2580 + }, + { + "epoch": 3.3008, + "grad_norm": 0.5475723743438721, + "learning_rate": 0.00021160317673980347, + "loss": 3.8538, + "step": 2581 + }, + { + "epoch": 3.30208, + "grad_norm": 0.5339505076408386, + "learning_rate": 0.0002115627944541661, + "loss": 3.8168, + "step": 2582 + }, + { + "epoch": 3.30336, + "grad_norm": 0.561708927154541, + "learning_rate": 0.00021152241216852873, + "loss": 3.8303, + "step": 2583 + }, + { + "epoch": 3.30464, + "grad_norm": 0.507378339767456, + "learning_rate": 0.00021148202988289133, + "loss": 3.8375, + "step": 2584 + }, + { + "epoch": 3.30592, + "grad_norm": 0.5297276377677917, + "learning_rate": 0.00021144164759725396, + "loss": 3.8747, + "step": 2585 + }, + { + "epoch": 3.3072, + "grad_norm": 0.5358178615570068, + "learning_rate": 0.00021140126531161662, + "loss": 3.8599, + "step": 2586 + }, + { + "epoch": 3.30848, + "grad_norm": 0.5373551845550537, + "learning_rate": 0.00021136088302597925, + "loss": 3.8097, + "step": 2587 + }, + { + "epoch": 3.30976, + "grad_norm": 0.5402244925498962, + "learning_rate": 0.00021132050074034188, + "loss": 3.8853, + "step": 2588 + }, + { + "epoch": 3.31104, + "grad_norm": 0.5271474123001099, + "learning_rate": 0.0002112801184547045, + "loss": 3.7922, + "step": 2589 + }, + { + "epoch": 3.31232, + "grad_norm": 0.5495550632476807, + "learning_rate": 0.00021123973616906717, + "loss": 3.8268, + "step": 2590 + }, + { + "epoch": 3.3136, + "grad_norm": 0.5179243683815002, + "learning_rate": 0.0002111993538834298, + "loss": 3.9258, + "step": 2591 + }, + { + "epoch": 3.31488, + "grad_norm": 0.5357790589332581, + "learning_rate": 0.00021115897159779243, + "loss": 3.8901, + "step": 2592 + }, + { + "epoch": 3.31616, + "grad_norm": 0.4818624258041382, + "learning_rate": 0.00021111858931215503, + "loss": 3.8431, + "step": 2593 + }, + { + "epoch": 3.31744, + "grad_norm": 0.5085586905479431, + "learning_rate": 0.0002110782070265177, + "loss": 3.8345, + "step": 2594 + }, + { + "epoch": 3.31872, + "grad_norm": 0.518665611743927, + "learning_rate": 0.00021103782474088032, + "loss": 3.7475, + "step": 2595 + }, + { + "epoch": 3.32, + "grad_norm": 0.5138918161392212, + "learning_rate": 0.00021099744245524295, + "loss": 3.8736, + "step": 2596 + }, + { + "epoch": 3.32128, + "grad_norm": 0.5055665373802185, + "learning_rate": 0.00021095706016960558, + "loss": 3.8456, + "step": 2597 + }, + { + "epoch": 3.32256, + "grad_norm": 0.4695318937301636, + "learning_rate": 0.0002109166778839682, + "loss": 3.8635, + "step": 2598 + }, + { + "epoch": 3.32384, + "grad_norm": 0.47867250442504883, + "learning_rate": 0.00021087629559833087, + "loss": 3.8418, + "step": 2599 + }, + { + "epoch": 3.32512, + "grad_norm": 0.5042849779129028, + "learning_rate": 0.0002108359133126935, + "loss": 3.8351, + "step": 2600 + }, + { + "epoch": 3.3264, + "grad_norm": 0.5028448700904846, + "learning_rate": 0.0002107955310270561, + "loss": 3.8349, + "step": 2601 + }, + { + "epoch": 3.32768, + "grad_norm": 0.5034447312355042, + "learning_rate": 0.00021075514874141873, + "loss": 3.8095, + "step": 2602 + }, + { + "epoch": 3.32896, + "grad_norm": 0.4787198007106781, + "learning_rate": 0.00021071476645578139, + "loss": 3.7805, + "step": 2603 + }, + { + "epoch": 3.33024, + "grad_norm": 0.535429835319519, + "learning_rate": 0.00021067438417014402, + "loss": 3.8945, + "step": 2604 + }, + { + "epoch": 3.33152, + "grad_norm": 0.5033429265022278, + "learning_rate": 0.00021063400188450665, + "loss": 3.7983, + "step": 2605 + }, + { + "epoch": 3.3327999999999998, + "grad_norm": 0.4938407242298126, + "learning_rate": 0.00021059361959886928, + "loss": 3.8798, + "step": 2606 + }, + { + "epoch": 3.33408, + "grad_norm": 0.5288311839103699, + "learning_rate": 0.00021055323731323193, + "loss": 3.833, + "step": 2607 + }, + { + "epoch": 3.33536, + "grad_norm": 0.5072165131568909, + "learning_rate": 0.00021051285502759456, + "loss": 3.8302, + "step": 2608 + }, + { + "epoch": 3.33664, + "grad_norm": 0.5025217533111572, + "learning_rate": 0.00021047247274195717, + "loss": 3.8571, + "step": 2609 + }, + { + "epoch": 3.33792, + "grad_norm": 0.5042093396186829, + "learning_rate": 0.0002104320904563198, + "loss": 3.8965, + "step": 2610 + }, + { + "epoch": 3.3392, + "grad_norm": 0.5024438500404358, + "learning_rate": 0.00021039170817068243, + "loss": 3.8112, + "step": 2611 + }, + { + "epoch": 3.34048, + "grad_norm": 0.4909932613372803, + "learning_rate": 0.00021035132588504508, + "loss": 3.8429, + "step": 2612 + }, + { + "epoch": 3.34176, + "grad_norm": 0.505427896976471, + "learning_rate": 0.00021031094359940771, + "loss": 3.9004, + "step": 2613 + }, + { + "epoch": 3.3430400000000002, + "grad_norm": 0.5072449445724487, + "learning_rate": 0.00021027056131377034, + "loss": 3.824, + "step": 2614 + }, + { + "epoch": 3.34432, + "grad_norm": 0.5105025172233582, + "learning_rate": 0.00021023017902813297, + "loss": 3.8643, + "step": 2615 + }, + { + "epoch": 3.3456, + "grad_norm": 0.5096602439880371, + "learning_rate": 0.00021018979674249563, + "loss": 3.8363, + "step": 2616 + }, + { + "epoch": 3.34688, + "grad_norm": 0.4964136779308319, + "learning_rate": 0.00021014941445685823, + "loss": 3.8228, + "step": 2617 + }, + { + "epoch": 3.34816, + "grad_norm": 0.510119616985321, + "learning_rate": 0.00021010903217122086, + "loss": 3.8846, + "step": 2618 + }, + { + "epoch": 3.34944, + "grad_norm": 0.504747211933136, + "learning_rate": 0.0002100686498855835, + "loss": 3.8775, + "step": 2619 + }, + { + "epoch": 3.35072, + "grad_norm": 0.496619313955307, + "learning_rate": 0.00021002826759994615, + "loss": 3.8405, + "step": 2620 + }, + { + "epoch": 3.352, + "grad_norm": 0.5143598914146423, + "learning_rate": 0.00020998788531430878, + "loss": 3.8538, + "step": 2621 + }, + { + "epoch": 3.35328, + "grad_norm": 0.49931085109710693, + "learning_rate": 0.0002099475030286714, + "loss": 3.8484, + "step": 2622 + }, + { + "epoch": 3.35456, + "grad_norm": 0.49670740962028503, + "learning_rate": 0.00020990712074303404, + "loss": 3.8549, + "step": 2623 + }, + { + "epoch": 3.35584, + "grad_norm": 0.5194510221481323, + "learning_rate": 0.00020986673845739664, + "loss": 3.9321, + "step": 2624 + }, + { + "epoch": 3.35712, + "grad_norm": 0.4933635890483856, + "learning_rate": 0.0002098263561717593, + "loss": 3.9203, + "step": 2625 + }, + { + "epoch": 3.3584, + "grad_norm": 0.5265275835990906, + "learning_rate": 0.00020978597388612193, + "loss": 3.7854, + "step": 2626 + }, + { + "epoch": 3.35968, + "grad_norm": 0.5067412853240967, + "learning_rate": 0.00020974559160048456, + "loss": 3.8297, + "step": 2627 + }, + { + "epoch": 3.36096, + "grad_norm": 0.5101473927497864, + "learning_rate": 0.0002097052093148472, + "loss": 3.8358, + "step": 2628 + }, + { + "epoch": 3.36224, + "grad_norm": 0.5028688311576843, + "learning_rate": 0.00020966482702920985, + "loss": 3.8475, + "step": 2629 + }, + { + "epoch": 3.36352, + "grad_norm": 0.5051252245903015, + "learning_rate": 0.00020962444474357248, + "loss": 3.8307, + "step": 2630 + }, + { + "epoch": 3.3648, + "grad_norm": 0.490156888961792, + "learning_rate": 0.0002095840624579351, + "loss": 3.843, + "step": 2631 + }, + { + "epoch": 3.36608, + "grad_norm": 0.49856409430503845, + "learning_rate": 0.0002095436801722977, + "loss": 3.8606, + "step": 2632 + }, + { + "epoch": 3.36736, + "grad_norm": 0.47945141792297363, + "learning_rate": 0.00020950329788666037, + "loss": 3.8623, + "step": 2633 + }, + { + "epoch": 3.36864, + "grad_norm": 0.5224535465240479, + "learning_rate": 0.000209462915601023, + "loss": 3.8351, + "step": 2634 + }, + { + "epoch": 3.36992, + "grad_norm": 0.49152296781539917, + "learning_rate": 0.00020942253331538563, + "loss": 3.812, + "step": 2635 + }, + { + "epoch": 3.3712, + "grad_norm": 0.4920552968978882, + "learning_rate": 0.00020938215102974826, + "loss": 3.8568, + "step": 2636 + }, + { + "epoch": 3.37248, + "grad_norm": 0.4960016906261444, + "learning_rate": 0.0002093417687441109, + "loss": 3.8487, + "step": 2637 + }, + { + "epoch": 3.37376, + "grad_norm": 0.5091875195503235, + "learning_rate": 0.00020930138645847355, + "loss": 3.8627, + "step": 2638 + }, + { + "epoch": 3.37504, + "grad_norm": 0.4994186758995056, + "learning_rate": 0.00020926100417283618, + "loss": 3.8548, + "step": 2639 + }, + { + "epoch": 3.3763199999999998, + "grad_norm": 0.49067172408103943, + "learning_rate": 0.00020922062188719878, + "loss": 3.8619, + "step": 2640 + }, + { + "epoch": 3.3776, + "grad_norm": 0.4815825819969177, + "learning_rate": 0.0002091802396015614, + "loss": 3.8202, + "step": 2641 + }, + { + "epoch": 3.37888, + "grad_norm": 0.501998245716095, + "learning_rate": 0.00020913985731592407, + "loss": 3.8243, + "step": 2642 + }, + { + "epoch": 3.38016, + "grad_norm": 0.49832189083099365, + "learning_rate": 0.0002090994750302867, + "loss": 3.8018, + "step": 2643 + }, + { + "epoch": 3.38144, + "grad_norm": 0.5382563471794128, + "learning_rate": 0.00020905909274464933, + "loss": 3.8121, + "step": 2644 + }, + { + "epoch": 3.38272, + "grad_norm": 0.5151340961456299, + "learning_rate": 0.00020901871045901196, + "loss": 3.8461, + "step": 2645 + }, + { + "epoch": 3.384, + "grad_norm": 0.5148212909698486, + "learning_rate": 0.00020897832817337461, + "loss": 3.9021, + "step": 2646 + }, + { + "epoch": 3.38528, + "grad_norm": 0.49888908863067627, + "learning_rate": 0.00020893794588773724, + "loss": 3.9323, + "step": 2647 + }, + { + "epoch": 3.3865600000000002, + "grad_norm": 0.5344513058662415, + "learning_rate": 0.00020889756360209985, + "loss": 3.8421, + "step": 2648 + }, + { + "epoch": 3.38784, + "grad_norm": 0.5026503205299377, + "learning_rate": 0.00020885718131646248, + "loss": 3.7643, + "step": 2649 + }, + { + "epoch": 3.38912, + "grad_norm": 0.4816747009754181, + "learning_rate": 0.0002088167990308251, + "loss": 3.7415, + "step": 2650 + }, + { + "epoch": 3.3904, + "grad_norm": 0.5043998956680298, + "learning_rate": 0.00020877641674518777, + "loss": 3.8523, + "step": 2651 + }, + { + "epoch": 3.39168, + "grad_norm": 0.5302547812461853, + "learning_rate": 0.0002087360344595504, + "loss": 3.8868, + "step": 2652 + }, + { + "epoch": 3.39296, + "grad_norm": 0.47889089584350586, + "learning_rate": 0.00020869565217391303, + "loss": 3.7999, + "step": 2653 + }, + { + "epoch": 3.39424, + "grad_norm": 0.5710118412971497, + "learning_rate": 0.00020865526988827566, + "loss": 3.8922, + "step": 2654 + }, + { + "epoch": 3.39552, + "grad_norm": 0.504432201385498, + "learning_rate": 0.0002086148876026383, + "loss": 3.8663, + "step": 2655 + }, + { + "epoch": 3.3968, + "grad_norm": 0.5317485928535461, + "learning_rate": 0.00020857450531700092, + "loss": 3.8964, + "step": 2656 + }, + { + "epoch": 3.39808, + "grad_norm": 0.5189566612243652, + "learning_rate": 0.00020853412303136355, + "loss": 3.8165, + "step": 2657 + }, + { + "epoch": 3.39936, + "grad_norm": 0.5216357111930847, + "learning_rate": 0.00020849374074572618, + "loss": 3.8114, + "step": 2658 + }, + { + "epoch": 3.40064, + "grad_norm": 0.517992377281189, + "learning_rate": 0.00020845335846008883, + "loss": 3.8755, + "step": 2659 + }, + { + "epoch": 3.40192, + "grad_norm": 0.5476480722427368, + "learning_rate": 0.00020841297617445146, + "loss": 3.8006, + "step": 2660 + }, + { + "epoch": 3.4032, + "grad_norm": 0.5155659914016724, + "learning_rate": 0.0002083725938888141, + "loss": 3.7955, + "step": 2661 + }, + { + "epoch": 3.40448, + "grad_norm": 0.5250527262687683, + "learning_rate": 0.00020833221160317672, + "loss": 3.9009, + "step": 2662 + }, + { + "epoch": 3.40576, + "grad_norm": 0.5005857944488525, + "learning_rate": 0.00020829182931753933, + "loss": 3.8746, + "step": 2663 + }, + { + "epoch": 3.40704, + "grad_norm": 0.5127919316291809, + "learning_rate": 0.000208251447031902, + "loss": 3.8685, + "step": 2664 + }, + { + "epoch": 3.40832, + "grad_norm": 0.49947747588157654, + "learning_rate": 0.0002082110647462646, + "loss": 3.8362, + "step": 2665 + }, + { + "epoch": 3.4096, + "grad_norm": 0.48780733346939087, + "learning_rate": 0.00020817068246062724, + "loss": 3.821, + "step": 2666 + }, + { + "epoch": 3.41088, + "grad_norm": 0.5010076761245728, + "learning_rate": 0.00020813030017498987, + "loss": 3.7895, + "step": 2667 + }, + { + "epoch": 3.41216, + "grad_norm": 0.5000413060188293, + "learning_rate": 0.00020808991788935253, + "loss": 3.8729, + "step": 2668 + }, + { + "epoch": 3.41344, + "grad_norm": 0.5171912312507629, + "learning_rate": 0.00020804953560371516, + "loss": 3.8008, + "step": 2669 + }, + { + "epoch": 3.41472, + "grad_norm": 0.49489569664001465, + "learning_rate": 0.0002080091533180778, + "loss": 3.8762, + "step": 2670 + }, + { + "epoch": 3.416, + "grad_norm": 0.49014776945114136, + "learning_rate": 0.0002079687710324404, + "loss": 3.7536, + "step": 2671 + }, + { + "epoch": 3.41728, + "grad_norm": 0.5062114596366882, + "learning_rate": 0.00020792838874680308, + "loss": 3.8479, + "step": 2672 + }, + { + "epoch": 3.41856, + "grad_norm": 0.49555784463882446, + "learning_rate": 0.00020788800646116568, + "loss": 3.851, + "step": 2673 + }, + { + "epoch": 3.4198399999999998, + "grad_norm": 0.48227742314338684, + "learning_rate": 0.0002078476241755283, + "loss": 3.8026, + "step": 2674 + }, + { + "epoch": 3.42112, + "grad_norm": 0.49328896403312683, + "learning_rate": 0.00020780724188989094, + "loss": 3.8565, + "step": 2675 + }, + { + "epoch": 3.4224, + "grad_norm": 0.4922971725463867, + "learning_rate": 0.00020776685960425357, + "loss": 3.8081, + "step": 2676 + }, + { + "epoch": 3.42368, + "grad_norm": 0.5213683843612671, + "learning_rate": 0.00020772647731861623, + "loss": 3.7913, + "step": 2677 + }, + { + "epoch": 3.42496, + "grad_norm": 0.4958494007587433, + "learning_rate": 0.00020768609503297886, + "loss": 3.8678, + "step": 2678 + }, + { + "epoch": 3.42624, + "grad_norm": 0.4969683885574341, + "learning_rate": 0.0002076457127473415, + "loss": 3.7767, + "step": 2679 + }, + { + "epoch": 3.42752, + "grad_norm": 0.5043670535087585, + "learning_rate": 0.0002076053304617041, + "loss": 3.8217, + "step": 2680 + }, + { + "epoch": 3.4288, + "grad_norm": 0.4834843575954437, + "learning_rate": 0.00020756494817606675, + "loss": 3.7935, + "step": 2681 + }, + { + "epoch": 3.4300800000000002, + "grad_norm": 0.5033546686172485, + "learning_rate": 0.00020752456589042938, + "loss": 3.8507, + "step": 2682 + }, + { + "epoch": 3.43136, + "grad_norm": 0.4869885742664337, + "learning_rate": 0.000207484183604792, + "loss": 3.7742, + "step": 2683 + }, + { + "epoch": 3.43264, + "grad_norm": 0.5016533732414246, + "learning_rate": 0.00020744380131915464, + "loss": 3.8161, + "step": 2684 + }, + { + "epoch": 3.43392, + "grad_norm": 0.4721163809299469, + "learning_rate": 0.0002074034190335173, + "loss": 3.8743, + "step": 2685 + }, + { + "epoch": 3.4352, + "grad_norm": 0.5129477381706238, + "learning_rate": 0.00020736303674787993, + "loss": 3.9012, + "step": 2686 + }, + { + "epoch": 3.43648, + "grad_norm": 0.5066404938697815, + "learning_rate": 0.00020732265446224256, + "loss": 3.8459, + "step": 2687 + }, + { + "epoch": 3.43776, + "grad_norm": 0.5318638682365417, + "learning_rate": 0.00020728227217660516, + "loss": 3.744, + "step": 2688 + }, + { + "epoch": 3.43904, + "grad_norm": 0.497341513633728, + "learning_rate": 0.00020724188989096782, + "loss": 3.8856, + "step": 2689 + }, + { + "epoch": 3.44032, + "grad_norm": 0.5572753548622131, + "learning_rate": 0.00020720150760533045, + "loss": 3.7982, + "step": 2690 + }, + { + "epoch": 3.4416, + "grad_norm": 0.4999849498271942, + "learning_rate": 0.00020716112531969308, + "loss": 3.7728, + "step": 2691 + }, + { + "epoch": 3.44288, + "grad_norm": 0.5322354435920715, + "learning_rate": 0.0002071207430340557, + "loss": 3.9, + "step": 2692 + }, + { + "epoch": 3.44416, + "grad_norm": 0.5226873755455017, + "learning_rate": 0.00020708036074841834, + "loss": 3.8428, + "step": 2693 + }, + { + "epoch": 3.44544, + "grad_norm": 0.5086526870727539, + "learning_rate": 0.000207039978462781, + "loss": 3.8667, + "step": 2694 + }, + { + "epoch": 3.44672, + "grad_norm": 0.4990271329879761, + "learning_rate": 0.00020699959617714362, + "loss": 3.8685, + "step": 2695 + }, + { + "epoch": 3.448, + "grad_norm": 0.5120396018028259, + "learning_rate": 0.00020695921389150623, + "loss": 3.8694, + "step": 2696 + }, + { + "epoch": 3.44928, + "grad_norm": 0.5174608826637268, + "learning_rate": 0.00020691883160586886, + "loss": 3.8495, + "step": 2697 + }, + { + "epoch": 3.45056, + "grad_norm": 0.5169567465782166, + "learning_rate": 0.00020687844932023151, + "loss": 3.8418, + "step": 2698 + }, + { + "epoch": 3.45184, + "grad_norm": 0.5117883682250977, + "learning_rate": 0.00020683806703459414, + "loss": 3.8707, + "step": 2699 + }, + { + "epoch": 3.45312, + "grad_norm": 0.5186121463775635, + "learning_rate": 0.00020679768474895677, + "loss": 3.8601, + "step": 2700 + }, + { + "epoch": 3.4544, + "grad_norm": 0.5014731884002686, + "learning_rate": 0.0002067573024633194, + "loss": 3.8657, + "step": 2701 + }, + { + "epoch": 3.45568, + "grad_norm": 0.4928348958492279, + "learning_rate": 0.00020671692017768206, + "loss": 3.7956, + "step": 2702 + }, + { + "epoch": 3.45696, + "grad_norm": 0.4971044063568115, + "learning_rate": 0.0002066765378920447, + "loss": 3.9533, + "step": 2703 + }, + { + "epoch": 3.45824, + "grad_norm": 0.5187544226646423, + "learning_rate": 0.0002066361556064073, + "loss": 3.847, + "step": 2704 + }, + { + "epoch": 3.45952, + "grad_norm": 0.4827615022659302, + "learning_rate": 0.00020659577332076992, + "loss": 3.8399, + "step": 2705 + }, + { + "epoch": 3.4608, + "grad_norm": 0.5062766671180725, + "learning_rate": 0.00020655539103513255, + "loss": 3.9054, + "step": 2706 + }, + { + "epoch": 3.46208, + "grad_norm": 0.49423202872276306, + "learning_rate": 0.0002065150087494952, + "loss": 3.8656, + "step": 2707 + }, + { + "epoch": 3.4633599999999998, + "grad_norm": 0.5079230666160583, + "learning_rate": 0.00020647462646385784, + "loss": 3.7847, + "step": 2708 + }, + { + "epoch": 3.46464, + "grad_norm": 0.5124461650848389, + "learning_rate": 0.00020643424417822047, + "loss": 3.7327, + "step": 2709 + }, + { + "epoch": 3.46592, + "grad_norm": 0.5108417868614197, + "learning_rate": 0.0002063938618925831, + "loss": 3.848, + "step": 2710 + }, + { + "epoch": 3.4672, + "grad_norm": 0.5090741515159607, + "learning_rate": 0.00020635347960694576, + "loss": 3.7348, + "step": 2711 + }, + { + "epoch": 3.46848, + "grad_norm": 0.4850289523601532, + "learning_rate": 0.00020631309732130836, + "loss": 3.8364, + "step": 2712 + }, + { + "epoch": 3.46976, + "grad_norm": 0.5068538188934326, + "learning_rate": 0.000206272715035671, + "loss": 3.8469, + "step": 2713 + }, + { + "epoch": 3.47104, + "grad_norm": 0.5119503140449524, + "learning_rate": 0.00020623233275003362, + "loss": 3.9369, + "step": 2714 + }, + { + "epoch": 3.47232, + "grad_norm": 0.49882522225379944, + "learning_rate": 0.00020619195046439628, + "loss": 3.8124, + "step": 2715 + }, + { + "epoch": 3.4736000000000002, + "grad_norm": 0.5284246802330017, + "learning_rate": 0.0002061515681787589, + "loss": 3.7402, + "step": 2716 + }, + { + "epoch": 3.47488, + "grad_norm": 0.5059674382209778, + "learning_rate": 0.00020611118589312154, + "loss": 3.8813, + "step": 2717 + }, + { + "epoch": 3.47616, + "grad_norm": 0.5152750015258789, + "learning_rate": 0.00020607080360748417, + "loss": 3.8808, + "step": 2718 + }, + { + "epoch": 3.47744, + "grad_norm": 0.5104813575744629, + "learning_rate": 0.00020603042132184677, + "loss": 3.7924, + "step": 2719 + }, + { + "epoch": 3.47872, + "grad_norm": 0.500645637512207, + "learning_rate": 0.00020599003903620943, + "loss": 3.8189, + "step": 2720 + }, + { + "epoch": 3.48, + "grad_norm": 0.511396050453186, + "learning_rate": 0.00020594965675057206, + "loss": 3.8472, + "step": 2721 + }, + { + "epoch": 3.48128, + "grad_norm": 0.5173667073249817, + "learning_rate": 0.0002059092744649347, + "loss": 3.8637, + "step": 2722 + }, + { + "epoch": 3.48256, + "grad_norm": 0.49966612458229065, + "learning_rate": 0.00020586889217929732, + "loss": 3.8889, + "step": 2723 + }, + { + "epoch": 3.48384, + "grad_norm": 0.4898408055305481, + "learning_rate": 0.00020582850989365998, + "loss": 3.8718, + "step": 2724 + }, + { + "epoch": 3.48512, + "grad_norm": 0.5065445303916931, + "learning_rate": 0.0002057881276080226, + "loss": 3.8627, + "step": 2725 + }, + { + "epoch": 3.4864, + "grad_norm": 0.485765278339386, + "learning_rate": 0.00020574774532238524, + "loss": 3.7733, + "step": 2726 + }, + { + "epoch": 3.48768, + "grad_norm": 0.5096103549003601, + "learning_rate": 0.00020570736303674784, + "loss": 3.7589, + "step": 2727 + }, + { + "epoch": 3.48896, + "grad_norm": 0.5073655843734741, + "learning_rate": 0.0002056669807511105, + "loss": 3.8593, + "step": 2728 + }, + { + "epoch": 3.49024, + "grad_norm": 0.48420706391334534, + "learning_rate": 0.00020562659846547313, + "loss": 3.7242, + "step": 2729 + }, + { + "epoch": 3.49152, + "grad_norm": 0.5061640739440918, + "learning_rate": 0.00020558621617983576, + "loss": 3.9017, + "step": 2730 + }, + { + "epoch": 3.4928, + "grad_norm": 0.5040037035942078, + "learning_rate": 0.0002055458338941984, + "loss": 3.8566, + "step": 2731 + }, + { + "epoch": 3.49408, + "grad_norm": 0.4816730320453644, + "learning_rate": 0.00020550545160856102, + "loss": 3.8164, + "step": 2732 + }, + { + "epoch": 3.49536, + "grad_norm": 0.5050880908966064, + "learning_rate": 0.00020546506932292368, + "loss": 3.899, + "step": 2733 + }, + { + "epoch": 3.49664, + "grad_norm": 0.47876450419425964, + "learning_rate": 0.0002054246870372863, + "loss": 3.8351, + "step": 2734 + }, + { + "epoch": 3.49792, + "grad_norm": 0.5011029243469238, + "learning_rate": 0.0002053843047516489, + "loss": 3.7863, + "step": 2735 + }, + { + "epoch": 3.4992, + "grad_norm": 0.509458065032959, + "learning_rate": 0.00020534392246601154, + "loss": 3.889, + "step": 2736 + }, + { + "epoch": 3.50048, + "grad_norm": 0.4975719749927521, + "learning_rate": 0.0002053035401803742, + "loss": 3.9014, + "step": 2737 + }, + { + "epoch": 3.50176, + "grad_norm": 0.49265387654304504, + "learning_rate": 0.00020526315789473683, + "loss": 3.839, + "step": 2738 + }, + { + "epoch": 3.50304, + "grad_norm": 0.5019585490226746, + "learning_rate": 0.00020522277560909946, + "loss": 3.8466, + "step": 2739 + }, + { + "epoch": 3.50432, + "grad_norm": 0.49570876359939575, + "learning_rate": 0.00020518239332346209, + "loss": 3.8707, + "step": 2740 + }, + { + "epoch": 3.5056000000000003, + "grad_norm": 0.5000244975090027, + "learning_rate": 0.00020514201103782474, + "loss": 3.7987, + "step": 2741 + }, + { + "epoch": 3.5068799999999998, + "grad_norm": 0.5056788325309753, + "learning_rate": 0.00020510162875218737, + "loss": 3.8735, + "step": 2742 + }, + { + "epoch": 3.50816, + "grad_norm": 0.5059694647789001, + "learning_rate": 0.00020506124646654998, + "loss": 3.8705, + "step": 2743 + }, + { + "epoch": 3.50944, + "grad_norm": 0.47248658537864685, + "learning_rate": 0.0002050208641809126, + "loss": 3.8288, + "step": 2744 + }, + { + "epoch": 3.51072, + "grad_norm": 0.5001699328422546, + "learning_rate": 0.00020498048189527524, + "loss": 3.8767, + "step": 2745 + }, + { + "epoch": 3.512, + "grad_norm": 0.5233591794967651, + "learning_rate": 0.0002049400996096379, + "loss": 3.8299, + "step": 2746 + }, + { + "epoch": 3.51328, + "grad_norm": 0.5596291422843933, + "learning_rate": 0.00020489971732400052, + "loss": 3.9301, + "step": 2747 + }, + { + "epoch": 3.51456, + "grad_norm": 0.5503137111663818, + "learning_rate": 0.00020485933503836315, + "loss": 3.8602, + "step": 2748 + }, + { + "epoch": 3.51584, + "grad_norm": 0.48577359318733215, + "learning_rate": 0.00020481895275272578, + "loss": 3.7755, + "step": 2749 + }, + { + "epoch": 3.5171200000000002, + "grad_norm": 0.5109073519706726, + "learning_rate": 0.00020477857046708844, + "loss": 3.8155, + "step": 2750 + }, + { + "epoch": 3.5183999999999997, + "grad_norm": 0.5062686800956726, + "learning_rate": 0.00020473818818145104, + "loss": 3.819, + "step": 2751 + }, + { + "epoch": 3.51968, + "grad_norm": 0.5061745047569275, + "learning_rate": 0.00020469780589581367, + "loss": 3.892, + "step": 2752 + }, + { + "epoch": 3.52096, + "grad_norm": 0.49665218591690063, + "learning_rate": 0.0002046574236101763, + "loss": 3.8608, + "step": 2753 + }, + { + "epoch": 3.52224, + "grad_norm": 0.511926531791687, + "learning_rate": 0.00020461704132453896, + "loss": 3.7848, + "step": 2754 + }, + { + "epoch": 3.52352, + "grad_norm": 0.4888019859790802, + "learning_rate": 0.0002045766590389016, + "loss": 3.7409, + "step": 2755 + }, + { + "epoch": 3.5248, + "grad_norm": 0.4757629334926605, + "learning_rate": 0.00020453627675326422, + "loss": 3.8083, + "step": 2756 + }, + { + "epoch": 3.52608, + "grad_norm": 0.48579922318458557, + "learning_rate": 0.00020449589446762685, + "loss": 3.8436, + "step": 2757 + }, + { + "epoch": 3.52736, + "grad_norm": 0.47741061449050903, + "learning_rate": 0.00020445551218198945, + "loss": 3.7938, + "step": 2758 + }, + { + "epoch": 3.52864, + "grad_norm": 0.5141867995262146, + "learning_rate": 0.00020441512989635214, + "loss": 3.8169, + "step": 2759 + }, + { + "epoch": 3.5299199999999997, + "grad_norm": 0.4805345833301544, + "learning_rate": 0.00020437474761071474, + "loss": 3.8487, + "step": 2760 + }, + { + "epoch": 3.5312, + "grad_norm": 0.5061795711517334, + "learning_rate": 0.00020433436532507737, + "loss": 3.811, + "step": 2761 + }, + { + "epoch": 3.53248, + "grad_norm": 0.48661288619041443, + "learning_rate": 0.00020429398303944, + "loss": 3.7974, + "step": 2762 + }, + { + "epoch": 3.53376, + "grad_norm": 0.5097336173057556, + "learning_rate": 0.00020425360075380266, + "loss": 3.8154, + "step": 2763 + }, + { + "epoch": 3.53504, + "grad_norm": 0.4881674349308014, + "learning_rate": 0.0002042132184681653, + "loss": 3.8714, + "step": 2764 + }, + { + "epoch": 3.53632, + "grad_norm": 0.5308511853218079, + "learning_rate": 0.00020417283618252792, + "loss": 3.786, + "step": 2765 + }, + { + "epoch": 3.5376, + "grad_norm": 0.48169809579849243, + "learning_rate": 0.00020413245389689055, + "loss": 3.8127, + "step": 2766 + }, + { + "epoch": 3.53888, + "grad_norm": 0.5140031576156616, + "learning_rate": 0.0002040920716112532, + "loss": 3.8552, + "step": 2767 + }, + { + "epoch": 3.54016, + "grad_norm": 0.4895153343677521, + "learning_rate": 0.0002040516893256158, + "loss": 3.8319, + "step": 2768 + }, + { + "epoch": 3.54144, + "grad_norm": 0.5244706869125366, + "learning_rate": 0.00020401130703997844, + "loss": 3.7663, + "step": 2769 + }, + { + "epoch": 3.54272, + "grad_norm": 0.5048360228538513, + "learning_rate": 0.00020397092475434107, + "loss": 3.8417, + "step": 2770 + }, + { + "epoch": 3.544, + "grad_norm": 0.515129029750824, + "learning_rate": 0.0002039305424687037, + "loss": 3.9146, + "step": 2771 + }, + { + "epoch": 3.54528, + "grad_norm": 0.5083556771278381, + "learning_rate": 0.00020389016018306636, + "loss": 3.8344, + "step": 2772 + }, + { + "epoch": 3.54656, + "grad_norm": 0.4877746105194092, + "learning_rate": 0.000203849777897429, + "loss": 3.8346, + "step": 2773 + }, + { + "epoch": 3.54784, + "grad_norm": 0.508374035358429, + "learning_rate": 0.00020380939561179162, + "loss": 3.7713, + "step": 2774 + }, + { + "epoch": 3.5491200000000003, + "grad_norm": 0.496378093957901, + "learning_rate": 0.00020376901332615422, + "loss": 3.8809, + "step": 2775 + }, + { + "epoch": 3.5504, + "grad_norm": 0.5039408206939697, + "learning_rate": 0.00020372863104051688, + "loss": 3.8309, + "step": 2776 + }, + { + "epoch": 3.55168, + "grad_norm": 0.496198445558548, + "learning_rate": 0.0002036882487548795, + "loss": 3.8634, + "step": 2777 + }, + { + "epoch": 3.55296, + "grad_norm": 0.48462727665901184, + "learning_rate": 0.00020364786646924214, + "loss": 3.8348, + "step": 2778 + }, + { + "epoch": 3.55424, + "grad_norm": 0.48560038208961487, + "learning_rate": 0.00020360748418360477, + "loss": 3.7579, + "step": 2779 + }, + { + "epoch": 3.55552, + "grad_norm": 0.502811074256897, + "learning_rate": 0.00020356710189796742, + "loss": 3.8567, + "step": 2780 + }, + { + "epoch": 3.5568, + "grad_norm": 0.4968845546245575, + "learning_rate": 0.00020352671961233005, + "loss": 3.8431, + "step": 2781 + }, + { + "epoch": 3.55808, + "grad_norm": 0.48642468452453613, + "learning_rate": 0.00020348633732669268, + "loss": 3.7647, + "step": 2782 + }, + { + "epoch": 3.55936, + "grad_norm": 0.4881822168827057, + "learning_rate": 0.0002034459550410553, + "loss": 3.792, + "step": 2783 + }, + { + "epoch": 3.5606400000000002, + "grad_norm": 0.49411314725875854, + "learning_rate": 0.00020340557275541792, + "loss": 3.794, + "step": 2784 + }, + { + "epoch": 3.5619199999999998, + "grad_norm": 0.48321542143821716, + "learning_rate": 0.00020336519046978057, + "loss": 3.8337, + "step": 2785 + }, + { + "epoch": 3.5632, + "grad_norm": 0.4860212504863739, + "learning_rate": 0.0002033248081841432, + "loss": 3.8145, + "step": 2786 + }, + { + "epoch": 3.56448, + "grad_norm": 0.4979793429374695, + "learning_rate": 0.00020328442589850583, + "loss": 3.8073, + "step": 2787 + }, + { + "epoch": 3.56576, + "grad_norm": 0.4866310954093933, + "learning_rate": 0.00020324404361286846, + "loss": 3.7945, + "step": 2788 + }, + { + "epoch": 3.56704, + "grad_norm": 0.516071617603302, + "learning_rate": 0.00020320366132723112, + "loss": 3.8237, + "step": 2789 + }, + { + "epoch": 3.56832, + "grad_norm": 0.5022051930427551, + "learning_rate": 0.00020316327904159375, + "loss": 3.8451, + "step": 2790 + }, + { + "epoch": 3.5696, + "grad_norm": 0.5036980509757996, + "learning_rate": 0.00020312289675595636, + "loss": 3.7898, + "step": 2791 + }, + { + "epoch": 3.57088, + "grad_norm": 0.5079161524772644, + "learning_rate": 0.00020308251447031899, + "loss": 3.7985, + "step": 2792 + }, + { + "epoch": 3.5721600000000002, + "grad_norm": 0.48699140548706055, + "learning_rate": 0.00020304213218468164, + "loss": 3.8116, + "step": 2793 + }, + { + "epoch": 3.5734399999999997, + "grad_norm": 0.5011927485466003, + "learning_rate": 0.00020300174989904427, + "loss": 3.9555, + "step": 2794 + }, + { + "epoch": 3.57472, + "grad_norm": 0.5062665939331055, + "learning_rate": 0.0002029613676134069, + "loss": 3.8183, + "step": 2795 + }, + { + "epoch": 3.576, + "grad_norm": 0.49101898074150085, + "learning_rate": 0.00020292098532776953, + "loss": 3.8428, + "step": 2796 + }, + { + "epoch": 3.57728, + "grad_norm": 0.5135859251022339, + "learning_rate": 0.00020288060304213216, + "loss": 3.8469, + "step": 2797 + }, + { + "epoch": 3.57856, + "grad_norm": 0.5152158141136169, + "learning_rate": 0.00020284022075649482, + "loss": 3.8385, + "step": 2798 + }, + { + "epoch": 3.57984, + "grad_norm": 0.4942494034767151, + "learning_rate": 0.00020279983847085742, + "loss": 3.9062, + "step": 2799 + }, + { + "epoch": 3.58112, + "grad_norm": 0.5052478313446045, + "learning_rate": 0.00020275945618522005, + "loss": 3.7949, + "step": 2800 + }, + { + "epoch": 3.5824, + "grad_norm": 0.5218126177787781, + "learning_rate": 0.00020271907389958268, + "loss": 3.8128, + "step": 2801 + }, + { + "epoch": 3.58368, + "grad_norm": 0.5123757719993591, + "learning_rate": 0.00020267869161394534, + "loss": 3.8098, + "step": 2802 + }, + { + "epoch": 3.58496, + "grad_norm": 0.48139896988868713, + "learning_rate": 0.00020263830932830797, + "loss": 3.8333, + "step": 2803 + }, + { + "epoch": 3.58624, + "grad_norm": 0.494536817073822, + "learning_rate": 0.0002025979270426706, + "loss": 3.7192, + "step": 2804 + }, + { + "epoch": 3.58752, + "grad_norm": 0.4789034128189087, + "learning_rate": 0.00020255754475703323, + "loss": 3.7481, + "step": 2805 + }, + { + "epoch": 3.5888, + "grad_norm": 0.4984082877635956, + "learning_rate": 0.0002025171624713959, + "loss": 3.8158, + "step": 2806 + }, + { + "epoch": 3.59008, + "grad_norm": 0.48809587955474854, + "learning_rate": 0.0002024767801857585, + "loss": 3.8401, + "step": 2807 + }, + { + "epoch": 3.59136, + "grad_norm": 0.5058606863021851, + "learning_rate": 0.00020243639790012112, + "loss": 3.8788, + "step": 2808 + }, + { + "epoch": 3.59264, + "grad_norm": 0.5134665369987488, + "learning_rate": 0.00020239601561448375, + "loss": 3.8255, + "step": 2809 + }, + { + "epoch": 3.59392, + "grad_norm": 0.4707070589065552, + "learning_rate": 0.0002023556333288464, + "loss": 3.7997, + "step": 2810 + }, + { + "epoch": 3.5952, + "grad_norm": 0.4775642454624176, + "learning_rate": 0.00020231525104320904, + "loss": 3.8048, + "step": 2811 + }, + { + "epoch": 3.59648, + "grad_norm": 0.5038156509399414, + "learning_rate": 0.00020227486875757167, + "loss": 3.8541, + "step": 2812 + }, + { + "epoch": 3.59776, + "grad_norm": 0.5030291676521301, + "learning_rate": 0.0002022344864719343, + "loss": 3.8111, + "step": 2813 + }, + { + "epoch": 3.59904, + "grad_norm": 0.5007965564727783, + "learning_rate": 0.0002021941041862969, + "loss": 3.7762, + "step": 2814 + }, + { + "epoch": 3.60032, + "grad_norm": 0.49890565872192383, + "learning_rate": 0.00020215372190065956, + "loss": 3.7919, + "step": 2815 + }, + { + "epoch": 3.6016, + "grad_norm": 0.48519954085350037, + "learning_rate": 0.0002021133396150222, + "loss": 3.8374, + "step": 2816 + }, + { + "epoch": 3.60288, + "grad_norm": 0.5019754767417908, + "learning_rate": 0.00020207295732938482, + "loss": 3.7988, + "step": 2817 + }, + { + "epoch": 3.6041600000000003, + "grad_norm": 0.4958573877811432, + "learning_rate": 0.00020203257504374745, + "loss": 3.8074, + "step": 2818 + }, + { + "epoch": 3.6054399999999998, + "grad_norm": 0.5017514228820801, + "learning_rate": 0.0002019921927581101, + "loss": 3.819, + "step": 2819 + }, + { + "epoch": 3.60672, + "grad_norm": 0.5000422596931458, + "learning_rate": 0.00020195181047247274, + "loss": 3.8453, + "step": 2820 + }, + { + "epoch": 3.608, + "grad_norm": 0.4973451793193817, + "learning_rate": 0.00020191142818683537, + "loss": 3.7835, + "step": 2821 + }, + { + "epoch": 3.60928, + "grad_norm": 0.4728100001811981, + "learning_rate": 0.00020187104590119797, + "loss": 3.8969, + "step": 2822 + }, + { + "epoch": 3.61056, + "grad_norm": 0.5011451840400696, + "learning_rate": 0.00020183066361556063, + "loss": 3.8835, + "step": 2823 + }, + { + "epoch": 3.61184, + "grad_norm": 0.5048329830169678, + "learning_rate": 0.00020179028132992326, + "loss": 3.8225, + "step": 2824 + }, + { + "epoch": 3.61312, + "grad_norm": 0.5037258863449097, + "learning_rate": 0.00020174989904428589, + "loss": 3.7936, + "step": 2825 + }, + { + "epoch": 3.6144, + "grad_norm": 0.4970305263996124, + "learning_rate": 0.00020170951675864852, + "loss": 3.8651, + "step": 2826 + }, + { + "epoch": 3.6156800000000002, + "grad_norm": 0.5086222290992737, + "learning_rate": 0.00020166913447301115, + "loss": 3.7828, + "step": 2827 + }, + { + "epoch": 3.6169599999999997, + "grad_norm": 0.4862697720527649, + "learning_rate": 0.0002016287521873738, + "loss": 3.7793, + "step": 2828 + }, + { + "epoch": 3.61824, + "grad_norm": 0.5067240595817566, + "learning_rate": 0.00020158836990173643, + "loss": 3.8363, + "step": 2829 + }, + { + "epoch": 3.61952, + "grad_norm": 0.5017095804214478, + "learning_rate": 0.00020154798761609904, + "loss": 3.8429, + "step": 2830 + }, + { + "epoch": 3.6208, + "grad_norm": 0.4909166991710663, + "learning_rate": 0.00020150760533046167, + "loss": 3.8017, + "step": 2831 + }, + { + "epoch": 3.62208, + "grad_norm": 0.5125616192817688, + "learning_rate": 0.00020146722304482432, + "loss": 3.7553, + "step": 2832 + }, + { + "epoch": 3.62336, + "grad_norm": 0.48391321301460266, + "learning_rate": 0.00020142684075918695, + "loss": 3.8041, + "step": 2833 + }, + { + "epoch": 3.62464, + "grad_norm": 0.4965222477912903, + "learning_rate": 0.00020138645847354958, + "loss": 3.8016, + "step": 2834 + }, + { + "epoch": 3.62592, + "grad_norm": 0.4799792170524597, + "learning_rate": 0.00020134607618791221, + "loss": 3.8301, + "step": 2835 + }, + { + "epoch": 3.6272, + "grad_norm": 0.5186593532562256, + "learning_rate": 0.00020130569390227487, + "loss": 3.7594, + "step": 2836 + }, + { + "epoch": 3.62848, + "grad_norm": 0.4939570426940918, + "learning_rate": 0.0002012653116166375, + "loss": 3.8343, + "step": 2837 + }, + { + "epoch": 3.62976, + "grad_norm": 0.4945909082889557, + "learning_rate": 0.0002012249293310001, + "loss": 3.8158, + "step": 2838 + }, + { + "epoch": 3.63104, + "grad_norm": 0.5133264660835266, + "learning_rate": 0.00020118454704536273, + "loss": 3.8507, + "step": 2839 + }, + { + "epoch": 3.63232, + "grad_norm": 0.4962891936302185, + "learning_rate": 0.00020114416475972536, + "loss": 3.8135, + "step": 2840 + }, + { + "epoch": 3.6336, + "grad_norm": 0.5103831887245178, + "learning_rate": 0.00020110378247408802, + "loss": 3.7948, + "step": 2841 + }, + { + "epoch": 3.63488, + "grad_norm": 0.5109889507293701, + "learning_rate": 0.00020106340018845065, + "loss": 3.8751, + "step": 2842 + }, + { + "epoch": 3.63616, + "grad_norm": 0.4803555905818939, + "learning_rate": 0.00020102301790281328, + "loss": 3.7989, + "step": 2843 + }, + { + "epoch": 3.63744, + "grad_norm": 0.494994193315506, + "learning_rate": 0.0002009826356171759, + "loss": 3.8933, + "step": 2844 + }, + { + "epoch": 3.63872, + "grad_norm": 0.4815881848335266, + "learning_rate": 0.00020094225333153857, + "loss": 3.7667, + "step": 2845 + }, + { + "epoch": 3.64, + "grad_norm": 0.5187839865684509, + "learning_rate": 0.0002009018710459012, + "loss": 3.8322, + "step": 2846 + }, + { + "epoch": 3.64128, + "grad_norm": 0.5170251727104187, + "learning_rate": 0.0002008614887602638, + "loss": 3.8624, + "step": 2847 + }, + { + "epoch": 3.64256, + "grad_norm": 0.4987940490245819, + "learning_rate": 0.00020082110647462643, + "loss": 3.8021, + "step": 2848 + }, + { + "epoch": 3.64384, + "grad_norm": 0.5137864351272583, + "learning_rate": 0.0002007807241889891, + "loss": 3.8342, + "step": 2849 + }, + { + "epoch": 3.64512, + "grad_norm": 0.4848984479904175, + "learning_rate": 0.00020074034190335172, + "loss": 3.7829, + "step": 2850 + }, + { + "epoch": 3.6464, + "grad_norm": 0.5238636136054993, + "learning_rate": 0.00020069995961771435, + "loss": 3.8299, + "step": 2851 + }, + { + "epoch": 3.6476800000000003, + "grad_norm": 0.49876147508621216, + "learning_rate": 0.00020065957733207698, + "loss": 3.7739, + "step": 2852 + }, + { + "epoch": 3.6489599999999998, + "grad_norm": 0.5346967577934265, + "learning_rate": 0.00020061919504643958, + "loss": 3.8934, + "step": 2853 + }, + { + "epoch": 3.65024, + "grad_norm": 0.4966239631175995, + "learning_rate": 0.00020057881276080227, + "loss": 3.9085, + "step": 2854 + }, + { + "epoch": 3.65152, + "grad_norm": 0.5036845207214355, + "learning_rate": 0.00020053843047516487, + "loss": 3.7837, + "step": 2855 + }, + { + "epoch": 3.6528, + "grad_norm": 0.49921709299087524, + "learning_rate": 0.0002004980481895275, + "loss": 3.8029, + "step": 2856 + }, + { + "epoch": 3.65408, + "grad_norm": 0.5502278804779053, + "learning_rate": 0.00020045766590389013, + "loss": 3.8748, + "step": 2857 + }, + { + "epoch": 3.65536, + "grad_norm": 0.4963222146034241, + "learning_rate": 0.0002004172836182528, + "loss": 3.7893, + "step": 2858 + }, + { + "epoch": 3.65664, + "grad_norm": 0.5209027528762817, + "learning_rate": 0.00020037690133261542, + "loss": 3.7979, + "step": 2859 + }, + { + "epoch": 3.65792, + "grad_norm": 0.4983651340007782, + "learning_rate": 0.00020033651904697805, + "loss": 3.8321, + "step": 2860 + }, + { + "epoch": 3.6592000000000002, + "grad_norm": 0.6087284088134766, + "learning_rate": 0.00020029613676134068, + "loss": 3.8507, + "step": 2861 + }, + { + "epoch": 3.6604799999999997, + "grad_norm": 0.5027658343315125, + "learning_rate": 0.00020025575447570333, + "loss": 3.8565, + "step": 2862 + }, + { + "epoch": 3.66176, + "grad_norm": 0.5117068886756897, + "learning_rate": 0.00020021537219006594, + "loss": 3.825, + "step": 2863 + }, + { + "epoch": 3.66304, + "grad_norm": 0.5000763535499573, + "learning_rate": 0.00020017498990442857, + "loss": 3.8242, + "step": 2864 + }, + { + "epoch": 3.66432, + "grad_norm": 0.5197402834892273, + "learning_rate": 0.0002001346076187912, + "loss": 3.8651, + "step": 2865 + }, + { + "epoch": 3.6656, + "grad_norm": 0.5055534839630127, + "learning_rate": 0.00020009422533315383, + "loss": 3.8355, + "step": 2866 + }, + { + "epoch": 3.66688, + "grad_norm": 0.5054891705513, + "learning_rate": 0.00020005384304751648, + "loss": 3.8619, + "step": 2867 + }, + { + "epoch": 3.66816, + "grad_norm": 0.47256743907928467, + "learning_rate": 0.00020001346076187911, + "loss": 3.7752, + "step": 2868 + }, + { + "epoch": 3.66944, + "grad_norm": 0.48075199127197266, + "learning_rate": 0.00019997307847624174, + "loss": 3.8054, + "step": 2869 + }, + { + "epoch": 3.67072, + "grad_norm": 0.4974772036075592, + "learning_rate": 0.00019993269619060435, + "loss": 3.7665, + "step": 2870 + }, + { + "epoch": 3.672, + "grad_norm": 0.48151201009750366, + "learning_rate": 0.000199892313904967, + "loss": 3.8581, + "step": 2871 + }, + { + "epoch": 3.67328, + "grad_norm": 0.5086482763290405, + "learning_rate": 0.00019985193161932964, + "loss": 3.8571, + "step": 2872 + }, + { + "epoch": 3.67456, + "grad_norm": 0.4769311845302582, + "learning_rate": 0.00019981154933369227, + "loss": 3.8258, + "step": 2873 + }, + { + "epoch": 3.67584, + "grad_norm": 0.5033696889877319, + "learning_rate": 0.0001997711670480549, + "loss": 3.8217, + "step": 2874 + }, + { + "epoch": 3.67712, + "grad_norm": 0.4959774613380432, + "learning_rate": 0.00019973078476241755, + "loss": 3.8151, + "step": 2875 + }, + { + "epoch": 3.6784, + "grad_norm": 0.4797199070453644, + "learning_rate": 0.00019969040247678018, + "loss": 3.8164, + "step": 2876 + }, + { + "epoch": 3.67968, + "grad_norm": 0.5271828770637512, + "learning_rate": 0.0001996500201911428, + "loss": 3.7692, + "step": 2877 + }, + { + "epoch": 3.68096, + "grad_norm": 0.489942729473114, + "learning_rate": 0.00019960963790550542, + "loss": 3.8024, + "step": 2878 + }, + { + "epoch": 3.68224, + "grad_norm": 0.5125555992126465, + "learning_rate": 0.00019956925561986805, + "loss": 3.7912, + "step": 2879 + }, + { + "epoch": 3.68352, + "grad_norm": 0.4915968179702759, + "learning_rate": 0.0001995288733342307, + "loss": 3.7815, + "step": 2880 + }, + { + "epoch": 3.6848, + "grad_norm": 0.49331218004226685, + "learning_rate": 0.00019948849104859333, + "loss": 3.878, + "step": 2881 + }, + { + "epoch": 3.68608, + "grad_norm": 0.5256069898605347, + "learning_rate": 0.00019944810876295596, + "loss": 3.8137, + "step": 2882 + }, + { + "epoch": 3.68736, + "grad_norm": 0.4750528335571289, + "learning_rate": 0.0001994077264773186, + "loss": 3.7832, + "step": 2883 + }, + { + "epoch": 3.68864, + "grad_norm": 0.5187920928001404, + "learning_rate": 0.00019936734419168125, + "loss": 3.8111, + "step": 2884 + }, + { + "epoch": 3.68992, + "grad_norm": 0.4905349016189575, + "learning_rate": 0.00019932696190604388, + "loss": 3.8054, + "step": 2885 + }, + { + "epoch": 3.6912000000000003, + "grad_norm": 0.5346660614013672, + "learning_rate": 0.00019928657962040648, + "loss": 3.8365, + "step": 2886 + }, + { + "epoch": 3.6924799999999998, + "grad_norm": 0.5014554262161255, + "learning_rate": 0.0001992461973347691, + "loss": 3.8105, + "step": 2887 + }, + { + "epoch": 3.69376, + "grad_norm": 0.4906976521015167, + "learning_rate": 0.00019920581504913177, + "loss": 3.7636, + "step": 2888 + }, + { + "epoch": 3.69504, + "grad_norm": 0.5077499747276306, + "learning_rate": 0.0001991654327634944, + "loss": 3.7638, + "step": 2889 + }, + { + "epoch": 3.69632, + "grad_norm": 0.48475882411003113, + "learning_rate": 0.00019912505047785703, + "loss": 3.8723, + "step": 2890 + }, + { + "epoch": 3.6976, + "grad_norm": 0.509909451007843, + "learning_rate": 0.00019908466819221966, + "loss": 3.8396, + "step": 2891 + }, + { + "epoch": 3.69888, + "grad_norm": 0.49699631333351135, + "learning_rate": 0.0001990442859065823, + "loss": 3.7865, + "step": 2892 + }, + { + "epoch": 3.70016, + "grad_norm": 0.4960545301437378, + "learning_rate": 0.00019900390362094495, + "loss": 3.819, + "step": 2893 + }, + { + "epoch": 3.70144, + "grad_norm": 0.5059762597084045, + "learning_rate": 0.00019896352133530755, + "loss": 3.8588, + "step": 2894 + }, + { + "epoch": 3.7027200000000002, + "grad_norm": 0.503450870513916, + "learning_rate": 0.00019892313904967018, + "loss": 3.8109, + "step": 2895 + }, + { + "epoch": 3.7039999999999997, + "grad_norm": 0.505183219909668, + "learning_rate": 0.0001988827567640328, + "loss": 3.7826, + "step": 2896 + }, + { + "epoch": 3.70528, + "grad_norm": 0.5047440528869629, + "learning_rate": 0.00019884237447839547, + "loss": 3.8386, + "step": 2897 + }, + { + "epoch": 3.70656, + "grad_norm": 0.5242089033126831, + "learning_rate": 0.0001988019921927581, + "loss": 3.7865, + "step": 2898 + }, + { + "epoch": 3.70784, + "grad_norm": 0.48960578441619873, + "learning_rate": 0.00019876160990712073, + "loss": 3.8107, + "step": 2899 + }, + { + "epoch": 3.70912, + "grad_norm": 0.5148612260818481, + "learning_rate": 0.00019872122762148336, + "loss": 3.7782, + "step": 2900 + }, + { + "epoch": 3.7104, + "grad_norm": 0.4915827810764313, + "learning_rate": 0.00019868084533584602, + "loss": 3.8166, + "step": 2901 + }, + { + "epoch": 3.71168, + "grad_norm": 0.4962410032749176, + "learning_rate": 0.00019864046305020862, + "loss": 3.7864, + "step": 2902 + }, + { + "epoch": 3.71296, + "grad_norm": 0.50472092628479, + "learning_rate": 0.00019860008076457125, + "loss": 3.7607, + "step": 2903 + }, + { + "epoch": 3.71424, + "grad_norm": 0.5142050981521606, + "learning_rate": 0.00019855969847893388, + "loss": 3.835, + "step": 2904 + }, + { + "epoch": 3.71552, + "grad_norm": 0.5082858204841614, + "learning_rate": 0.0001985193161932965, + "loss": 3.903, + "step": 2905 + }, + { + "epoch": 3.7168, + "grad_norm": 0.529228150844574, + "learning_rate": 0.00019847893390765917, + "loss": 3.7935, + "step": 2906 + }, + { + "epoch": 3.71808, + "grad_norm": 0.515741765499115, + "learning_rate": 0.0001984385516220218, + "loss": 3.863, + "step": 2907 + }, + { + "epoch": 3.71936, + "grad_norm": 0.5094571113586426, + "learning_rate": 0.00019839816933638443, + "loss": 3.8557, + "step": 2908 + }, + { + "epoch": 3.72064, + "grad_norm": 0.5035596489906311, + "learning_rate": 0.00019835778705074703, + "loss": 3.8019, + "step": 2909 + }, + { + "epoch": 3.72192, + "grad_norm": 0.5284061431884766, + "learning_rate": 0.00019831740476510969, + "loss": 3.8547, + "step": 2910 + }, + { + "epoch": 3.7232, + "grad_norm": 0.5245686173439026, + "learning_rate": 0.00019827702247947232, + "loss": 3.7454, + "step": 2911 + }, + { + "epoch": 3.72448, + "grad_norm": 0.514725923538208, + "learning_rate": 0.00019823664019383495, + "loss": 3.8237, + "step": 2912 + }, + { + "epoch": 3.72576, + "grad_norm": 0.5170477628707886, + "learning_rate": 0.00019819625790819758, + "loss": 3.8264, + "step": 2913 + }, + { + "epoch": 3.72704, + "grad_norm": 0.5244784951210022, + "learning_rate": 0.00019815587562256023, + "loss": 3.7872, + "step": 2914 + }, + { + "epoch": 3.72832, + "grad_norm": 0.510220468044281, + "learning_rate": 0.00019811549333692286, + "loss": 3.8574, + "step": 2915 + }, + { + "epoch": 3.7296, + "grad_norm": 0.5242193937301636, + "learning_rate": 0.0001980751110512855, + "loss": 3.8547, + "step": 2916 + }, + { + "epoch": 3.73088, + "grad_norm": 0.48602530360221863, + "learning_rate": 0.0001980347287656481, + "loss": 3.7948, + "step": 2917 + }, + { + "epoch": 3.73216, + "grad_norm": 0.4796367585659027, + "learning_rate": 0.00019799434648001073, + "loss": 3.793, + "step": 2918 + }, + { + "epoch": 3.73344, + "grad_norm": 0.4868846535682678, + "learning_rate": 0.00019795396419437338, + "loss": 3.8138, + "step": 2919 + }, + { + "epoch": 3.7347200000000003, + "grad_norm": 0.48275643587112427, + "learning_rate": 0.00019791358190873601, + "loss": 3.8185, + "step": 2920 + }, + { + "epoch": 3.7359999999999998, + "grad_norm": 0.4956841468811035, + "learning_rate": 0.00019787319962309864, + "loss": 3.8374, + "step": 2921 + }, + { + "epoch": 3.73728, + "grad_norm": 0.5024325251579285, + "learning_rate": 0.00019783281733746127, + "loss": 3.9032, + "step": 2922 + }, + { + "epoch": 3.73856, + "grad_norm": 0.4799942970275879, + "learning_rate": 0.00019779243505182393, + "loss": 3.8317, + "step": 2923 + }, + { + "epoch": 3.73984, + "grad_norm": 0.5100151300430298, + "learning_rate": 0.00019775205276618656, + "loss": 3.8394, + "step": 2924 + }, + { + "epoch": 3.74112, + "grad_norm": 0.4840579926967621, + "learning_rate": 0.00019771167048054916, + "loss": 3.7803, + "step": 2925 + }, + { + "epoch": 3.7424, + "grad_norm": 0.495714008808136, + "learning_rate": 0.0001976712881949118, + "loss": 3.7849, + "step": 2926 + }, + { + "epoch": 3.74368, + "grad_norm": 0.49256429076194763, + "learning_rate": 0.00019763090590927445, + "loss": 3.7815, + "step": 2927 + }, + { + "epoch": 3.74496, + "grad_norm": 0.4639676809310913, + "learning_rate": 0.00019759052362363708, + "loss": 3.8497, + "step": 2928 + }, + { + "epoch": 3.7462400000000002, + "grad_norm": 0.486234575510025, + "learning_rate": 0.0001975501413379997, + "loss": 3.866, + "step": 2929 + }, + { + "epoch": 3.7475199999999997, + "grad_norm": 0.4823131263256073, + "learning_rate": 0.00019750975905236234, + "loss": 3.7977, + "step": 2930 + }, + { + "epoch": 3.7488, + "grad_norm": 0.4761921465396881, + "learning_rate": 0.000197469376766725, + "loss": 3.7816, + "step": 2931 + }, + { + "epoch": 3.75008, + "grad_norm": 0.4939184784889221, + "learning_rate": 0.00019742899448108763, + "loss": 3.8268, + "step": 2932 + }, + { + "epoch": 3.75136, + "grad_norm": 0.4557759761810303, + "learning_rate": 0.00019738861219545026, + "loss": 3.7831, + "step": 2933 + }, + { + "epoch": 3.75264, + "grad_norm": 0.4687483012676239, + "learning_rate": 0.00019734822990981286, + "loss": 3.8796, + "step": 2934 + }, + { + "epoch": 3.75392, + "grad_norm": 0.4676748514175415, + "learning_rate": 0.0001973078476241755, + "loss": 3.8334, + "step": 2935 + }, + { + "epoch": 3.7552, + "grad_norm": 0.47445160150527954, + "learning_rate": 0.00019726746533853815, + "loss": 3.7532, + "step": 2936 + }, + { + "epoch": 3.75648, + "grad_norm": 0.4851106107234955, + "learning_rate": 0.00019722708305290078, + "loss": 3.7783, + "step": 2937 + }, + { + "epoch": 3.75776, + "grad_norm": 0.4669123888015747, + "learning_rate": 0.0001971867007672634, + "loss": 3.8144, + "step": 2938 + }, + { + "epoch": 3.75904, + "grad_norm": 0.4675630033016205, + "learning_rate": 0.00019714631848162604, + "loss": 3.7909, + "step": 2939 + }, + { + "epoch": 3.76032, + "grad_norm": 0.4890853762626648, + "learning_rate": 0.0001971059361959887, + "loss": 3.803, + "step": 2940 + }, + { + "epoch": 3.7616, + "grad_norm": 0.4596586227416992, + "learning_rate": 0.00019706555391035133, + "loss": 3.7987, + "step": 2941 + }, + { + "epoch": 3.76288, + "grad_norm": 0.5085187554359436, + "learning_rate": 0.00019702517162471393, + "loss": 3.7938, + "step": 2942 + }, + { + "epoch": 3.76416, + "grad_norm": 0.4674232006072998, + "learning_rate": 0.00019698478933907656, + "loss": 3.9139, + "step": 2943 + }, + { + "epoch": 3.76544, + "grad_norm": 0.4964526891708374, + "learning_rate": 0.00019694440705343922, + "loss": 3.805, + "step": 2944 + }, + { + "epoch": 3.76672, + "grad_norm": 0.4870041310787201, + "learning_rate": 0.00019690402476780185, + "loss": 3.7573, + "step": 2945 + }, + { + "epoch": 3.768, + "grad_norm": 0.4788266718387604, + "learning_rate": 0.00019686364248216448, + "loss": 3.8082, + "step": 2946 + }, + { + "epoch": 3.76928, + "grad_norm": 0.5005785822868347, + "learning_rate": 0.0001968232601965271, + "loss": 3.7935, + "step": 2947 + }, + { + "epoch": 3.77056, + "grad_norm": 0.48432236909866333, + "learning_rate": 0.00019678287791088974, + "loss": 3.8643, + "step": 2948 + }, + { + "epoch": 3.77184, + "grad_norm": 0.4844403862953186, + "learning_rate": 0.0001967424956252524, + "loss": 3.8046, + "step": 2949 + }, + { + "epoch": 3.77312, + "grad_norm": 0.49328166246414185, + "learning_rate": 0.000196702113339615, + "loss": 3.8041, + "step": 2950 + }, + { + "epoch": 3.7744, + "grad_norm": 0.48709914088249207, + "learning_rate": 0.00019666173105397763, + "loss": 3.7822, + "step": 2951 + }, + { + "epoch": 3.77568, + "grad_norm": 0.4914034903049469, + "learning_rate": 0.00019662134876834026, + "loss": 3.866, + "step": 2952 + }, + { + "epoch": 3.77696, + "grad_norm": 0.5005134344100952, + "learning_rate": 0.00019658096648270292, + "loss": 3.7386, + "step": 2953 + }, + { + "epoch": 3.7782400000000003, + "grad_norm": 0.4764963388442993, + "learning_rate": 0.00019654058419706555, + "loss": 3.8049, + "step": 2954 + }, + { + "epoch": 3.7795199999999998, + "grad_norm": 0.4586517810821533, + "learning_rate": 0.00019650020191142818, + "loss": 3.8072, + "step": 2955 + }, + { + "epoch": 3.7808, + "grad_norm": 0.4838949739933014, + "learning_rate": 0.0001964598196257908, + "loss": 3.8139, + "step": 2956 + }, + { + "epoch": 3.78208, + "grad_norm": 0.48450782895088196, + "learning_rate": 0.00019641943734015346, + "loss": 3.8376, + "step": 2957 + }, + { + "epoch": 3.78336, + "grad_norm": 0.5044925808906555, + "learning_rate": 0.00019637905505451607, + "loss": 3.7702, + "step": 2958 + }, + { + "epoch": 3.78464, + "grad_norm": 0.4797843396663666, + "learning_rate": 0.0001963386727688787, + "loss": 3.7628, + "step": 2959 + }, + { + "epoch": 3.78592, + "grad_norm": 0.4827995300292969, + "learning_rate": 0.00019629829048324133, + "loss": 3.7901, + "step": 2960 + }, + { + "epoch": 3.7872, + "grad_norm": 0.4926610291004181, + "learning_rate": 0.00019625790819760396, + "loss": 3.8232, + "step": 2961 + }, + { + "epoch": 3.78848, + "grad_norm": 0.46942880749702454, + "learning_rate": 0.0001962175259119666, + "loss": 3.7819, + "step": 2962 + }, + { + "epoch": 3.7897600000000002, + "grad_norm": 0.5101065039634705, + "learning_rate": 0.00019617714362632924, + "loss": 3.8026, + "step": 2963 + }, + { + "epoch": 3.7910399999999997, + "grad_norm": 0.4860995411872864, + "learning_rate": 0.00019613676134069187, + "loss": 3.8669, + "step": 2964 + }, + { + "epoch": 3.79232, + "grad_norm": 0.49931204319000244, + "learning_rate": 0.00019609637905505448, + "loss": 3.8031, + "step": 2965 + }, + { + "epoch": 3.7936, + "grad_norm": 0.4776234030723572, + "learning_rate": 0.00019605599676941713, + "loss": 3.8091, + "step": 2966 + }, + { + "epoch": 3.79488, + "grad_norm": 0.5147337317466736, + "learning_rate": 0.00019601561448377976, + "loss": 3.8127, + "step": 2967 + }, + { + "epoch": 3.79616, + "grad_norm": 0.49513697624206543, + "learning_rate": 0.0001959752321981424, + "loss": 3.9379, + "step": 2968 + }, + { + "epoch": 3.79744, + "grad_norm": 0.5062114596366882, + "learning_rate": 0.00019593484991250502, + "loss": 3.833, + "step": 2969 + }, + { + "epoch": 3.79872, + "grad_norm": 0.48605409264564514, + "learning_rate": 0.00019589446762686768, + "loss": 3.8156, + "step": 2970 + }, + { + "epoch": 3.8, + "grad_norm": 0.48607662320137024, + "learning_rate": 0.0001958540853412303, + "loss": 3.8235, + "step": 2971 + }, + { + "epoch": 3.80128, + "grad_norm": 0.48669230937957764, + "learning_rate": 0.00019581370305559294, + "loss": 3.7372, + "step": 2972 + }, + { + "epoch": 3.80256, + "grad_norm": 0.48195433616638184, + "learning_rate": 0.00019577332076995554, + "loss": 3.798, + "step": 2973 + }, + { + "epoch": 3.80384, + "grad_norm": 0.4917437434196472, + "learning_rate": 0.00019573293848431817, + "loss": 3.8529, + "step": 2974 + }, + { + "epoch": 3.80512, + "grad_norm": 0.5031919479370117, + "learning_rate": 0.00019569255619868083, + "loss": 3.7061, + "step": 2975 + }, + { + "epoch": 3.8064, + "grad_norm": 0.5153105854988098, + "learning_rate": 0.00019565217391304346, + "loss": 3.7935, + "step": 2976 + }, + { + "epoch": 3.80768, + "grad_norm": 0.5166071057319641, + "learning_rate": 0.0001956117916274061, + "loss": 3.8835, + "step": 2977 + }, + { + "epoch": 3.80896, + "grad_norm": 0.47903141379356384, + "learning_rate": 0.00019557140934176872, + "loss": 3.7476, + "step": 2978 + }, + { + "epoch": 3.81024, + "grad_norm": 0.4941449761390686, + "learning_rate": 0.00019553102705613138, + "loss": 3.7796, + "step": 2979 + }, + { + "epoch": 3.81152, + "grad_norm": 0.5048912763595581, + "learning_rate": 0.000195490644770494, + "loss": 3.7708, + "step": 2980 + }, + { + "epoch": 3.8128, + "grad_norm": 0.4860134422779083, + "learning_rate": 0.0001954502624848566, + "loss": 3.8097, + "step": 2981 + }, + { + "epoch": 3.81408, + "grad_norm": 0.48185980319976807, + "learning_rate": 0.00019540988019921924, + "loss": 3.8444, + "step": 2982 + }, + { + "epoch": 3.81536, + "grad_norm": 0.4978376030921936, + "learning_rate": 0.0001953694979135819, + "loss": 3.7791, + "step": 2983 + }, + { + "epoch": 3.81664, + "grad_norm": 0.5320491194725037, + "learning_rate": 0.00019532911562794453, + "loss": 3.7774, + "step": 2984 + }, + { + "epoch": 3.81792, + "grad_norm": 0.4956910014152527, + "learning_rate": 0.00019528873334230716, + "loss": 3.8242, + "step": 2985 + }, + { + "epoch": 3.8192, + "grad_norm": 0.4940122663974762, + "learning_rate": 0.0001952483510566698, + "loss": 3.7981, + "step": 2986 + }, + { + "epoch": 3.82048, + "grad_norm": 0.49585092067718506, + "learning_rate": 0.00019520796877103242, + "loss": 3.7541, + "step": 2987 + }, + { + "epoch": 3.8217600000000003, + "grad_norm": 0.49807533621788025, + "learning_rate": 0.00019516758648539508, + "loss": 3.7866, + "step": 2988 + }, + { + "epoch": 3.8230399999999998, + "grad_norm": 0.5053405165672302, + "learning_rate": 0.00019512720419975768, + "loss": 3.8625, + "step": 2989 + }, + { + "epoch": 3.82432, + "grad_norm": 0.5178548693656921, + "learning_rate": 0.0001950868219141203, + "loss": 3.7998, + "step": 2990 + }, + { + "epoch": 3.8256, + "grad_norm": 0.5026320815086365, + "learning_rate": 0.00019504643962848294, + "loss": 3.815, + "step": 2991 + }, + { + "epoch": 3.82688, + "grad_norm": 0.5041228532791138, + "learning_rate": 0.0001950060573428456, + "loss": 3.7673, + "step": 2992 + }, + { + "epoch": 3.82816, + "grad_norm": 0.5110290050506592, + "learning_rate": 0.00019496567505720823, + "loss": 3.8342, + "step": 2993 + }, + { + "epoch": 3.82944, + "grad_norm": 0.5149263739585876, + "learning_rate": 0.00019492529277157086, + "loss": 3.8922, + "step": 2994 + }, + { + "epoch": 3.83072, + "grad_norm": 0.49787768721580505, + "learning_rate": 0.0001948849104859335, + "loss": 3.8788, + "step": 2995 + }, + { + "epoch": 3.832, + "grad_norm": 0.48449409008026123, + "learning_rate": 0.00019484452820029614, + "loss": 3.7207, + "step": 2996 + }, + { + "epoch": 3.8332800000000002, + "grad_norm": 0.49689674377441406, + "learning_rate": 0.00019480414591465875, + "loss": 3.7686, + "step": 2997 + }, + { + "epoch": 3.8345599999999997, + "grad_norm": 0.533508837223053, + "learning_rate": 0.00019476376362902138, + "loss": 3.8948, + "step": 2998 + }, + { + "epoch": 3.83584, + "grad_norm": 0.48604533076286316, + "learning_rate": 0.000194723381343384, + "loss": 3.7144, + "step": 2999 + }, + { + "epoch": 3.83712, + "grad_norm": 0.5123369693756104, + "learning_rate": 0.00019468299905774664, + "loss": 3.7716, + "step": 3000 + }, + { + "epoch": 3.8384, + "grad_norm": 0.4925461709499359, + "learning_rate": 0.0001946426167721093, + "loss": 3.862, + "step": 3001 + }, + { + "epoch": 3.83968, + "grad_norm": 0.5041381120681763, + "learning_rate": 0.00019460223448647192, + "loss": 3.7986, + "step": 3002 + }, + { + "epoch": 3.84096, + "grad_norm": 0.4902060031890869, + "learning_rate": 0.00019456185220083455, + "loss": 3.8608, + "step": 3003 + }, + { + "epoch": 3.84224, + "grad_norm": 0.5032182931900024, + "learning_rate": 0.00019452146991519716, + "loss": 3.8192, + "step": 3004 + }, + { + "epoch": 3.84352, + "grad_norm": 0.5030463337898254, + "learning_rate": 0.00019448108762955984, + "loss": 3.8128, + "step": 3005 + }, + { + "epoch": 3.8448, + "grad_norm": 0.49824386835098267, + "learning_rate": 0.00019444070534392244, + "loss": 3.7264, + "step": 3006 + }, + { + "epoch": 3.84608, + "grad_norm": 0.5038865208625793, + "learning_rate": 0.00019440032305828507, + "loss": 3.8521, + "step": 3007 + }, + { + "epoch": 3.84736, + "grad_norm": 0.5093699097633362, + "learning_rate": 0.0001943599407726477, + "loss": 3.7095, + "step": 3008 + }, + { + "epoch": 3.84864, + "grad_norm": 0.49875956773757935, + "learning_rate": 0.00019431955848701036, + "loss": 3.868, + "step": 3009 + }, + { + "epoch": 3.84992, + "grad_norm": 0.5082076787948608, + "learning_rate": 0.000194279176201373, + "loss": 3.8264, + "step": 3010 + }, + { + "epoch": 3.8512, + "grad_norm": 0.5088693499565125, + "learning_rate": 0.00019423879391573562, + "loss": 3.8094, + "step": 3011 + }, + { + "epoch": 3.85248, + "grad_norm": 0.49538499116897583, + "learning_rate": 0.00019419841163009823, + "loss": 3.8007, + "step": 3012 + }, + { + "epoch": 3.85376, + "grad_norm": 0.5211043953895569, + "learning_rate": 0.00019415802934446086, + "loss": 3.7566, + "step": 3013 + }, + { + "epoch": 3.85504, + "grad_norm": 0.4987725019454956, + "learning_rate": 0.0001941176470588235, + "loss": 3.8489, + "step": 3014 + }, + { + "epoch": 3.85632, + "grad_norm": 0.4974265992641449, + "learning_rate": 0.00019407726477318614, + "loss": 3.8121, + "step": 3015 + }, + { + "epoch": 3.8576, + "grad_norm": 0.5079092383384705, + "learning_rate": 0.00019403688248754877, + "loss": 3.8206, + "step": 3016 + }, + { + "epoch": 3.85888, + "grad_norm": 0.4906683564186096, + "learning_rate": 0.0001939965002019114, + "loss": 3.7863, + "step": 3017 + }, + { + "epoch": 3.86016, + "grad_norm": 0.5003571510314941, + "learning_rate": 0.00019395611791627406, + "loss": 3.7905, + "step": 3018 + }, + { + "epoch": 3.86144, + "grad_norm": 0.5029097199440002, + "learning_rate": 0.0001939157356306367, + "loss": 3.8112, + "step": 3019 + }, + { + "epoch": 3.86272, + "grad_norm": 0.494642972946167, + "learning_rate": 0.00019387535334499932, + "loss": 3.7935, + "step": 3020 + }, + { + "epoch": 3.864, + "grad_norm": 0.48063820600509644, + "learning_rate": 0.00019383497105936192, + "loss": 3.8126, + "step": 3021 + }, + { + "epoch": 3.8652800000000003, + "grad_norm": 0.4792153835296631, + "learning_rate": 0.00019379458877372458, + "loss": 3.7361, + "step": 3022 + }, + { + "epoch": 3.8665599999999998, + "grad_norm": 0.46592941880226135, + "learning_rate": 0.0001937542064880872, + "loss": 3.8774, + "step": 3023 + }, + { + "epoch": 3.86784, + "grad_norm": 0.46901074051856995, + "learning_rate": 0.00019371382420244984, + "loss": 3.8471, + "step": 3024 + }, + { + "epoch": 3.86912, + "grad_norm": 0.4709526300430298, + "learning_rate": 0.00019367344191681247, + "loss": 3.8575, + "step": 3025 + }, + { + "epoch": 3.8704, + "grad_norm": 0.4765014946460724, + "learning_rate": 0.0001936330596311751, + "loss": 3.7684, + "step": 3026 + }, + { + "epoch": 3.87168, + "grad_norm": 0.48558369278907776, + "learning_rate": 0.00019359267734553776, + "loss": 3.8208, + "step": 3027 + }, + { + "epoch": 3.87296, + "grad_norm": 0.47557520866394043, + "learning_rate": 0.0001935522950599004, + "loss": 3.755, + "step": 3028 + }, + { + "epoch": 3.87424, + "grad_norm": 0.4816844165325165, + "learning_rate": 0.000193511912774263, + "loss": 3.8242, + "step": 3029 + }, + { + "epoch": 3.87552, + "grad_norm": 0.464916467666626, + "learning_rate": 0.00019347153048862562, + "loss": 3.8323, + "step": 3030 + }, + { + "epoch": 3.8768000000000002, + "grad_norm": 0.4912608563899994, + "learning_rate": 0.00019343114820298828, + "loss": 3.833, + "step": 3031 + }, + { + "epoch": 3.8780799999999997, + "grad_norm": 0.4688814580440521, + "learning_rate": 0.0001933907659173509, + "loss": 3.8551, + "step": 3032 + }, + { + "epoch": 3.87936, + "grad_norm": 0.4833512306213379, + "learning_rate": 0.00019335038363171354, + "loss": 3.8419, + "step": 3033 + }, + { + "epoch": 3.88064, + "grad_norm": 0.49540048837661743, + "learning_rate": 0.00019331000134607617, + "loss": 3.8368, + "step": 3034 + }, + { + "epoch": 3.88192, + "grad_norm": 0.4662631154060364, + "learning_rate": 0.00019326961906043883, + "loss": 3.9145, + "step": 3035 + }, + { + "epoch": 3.8832, + "grad_norm": 0.4789687693119049, + "learning_rate": 0.00019322923677480146, + "loss": 3.809, + "step": 3036 + }, + { + "epoch": 3.88448, + "grad_norm": 0.47663864493370056, + "learning_rate": 0.00019318885448916406, + "loss": 3.7629, + "step": 3037 + }, + { + "epoch": 3.88576, + "grad_norm": 0.46691396832466125, + "learning_rate": 0.0001931484722035267, + "loss": 3.7381, + "step": 3038 + }, + { + "epoch": 3.88704, + "grad_norm": 0.5005871057510376, + "learning_rate": 0.00019310808991788932, + "loss": 3.8393, + "step": 3039 + }, + { + "epoch": 3.88832, + "grad_norm": 0.4863739311695099, + "learning_rate": 0.00019306770763225198, + "loss": 3.7096, + "step": 3040 + }, + { + "epoch": 3.8895999999999997, + "grad_norm": 0.49686795473098755, + "learning_rate": 0.0001930273253466146, + "loss": 3.74, + "step": 3041 + }, + { + "epoch": 3.89088, + "grad_norm": 0.4751260578632355, + "learning_rate": 0.00019298694306097724, + "loss": 3.8243, + "step": 3042 + }, + { + "epoch": 3.89216, + "grad_norm": 0.4915493428707123, + "learning_rate": 0.00019294656077533987, + "loss": 3.8116, + "step": 3043 + }, + { + "epoch": 3.89344, + "grad_norm": 0.4819605350494385, + "learning_rate": 0.00019290617848970252, + "loss": 3.843, + "step": 3044 + }, + { + "epoch": 3.89472, + "grad_norm": 0.4787873923778534, + "learning_rate": 0.00019286579620406513, + "loss": 3.7245, + "step": 3045 + }, + { + "epoch": 3.896, + "grad_norm": 0.4805007874965668, + "learning_rate": 0.00019282541391842776, + "loss": 3.7573, + "step": 3046 + }, + { + "epoch": 3.89728, + "grad_norm": 0.4945317804813385, + "learning_rate": 0.00019278503163279039, + "loss": 3.9226, + "step": 3047 + }, + { + "epoch": 3.89856, + "grad_norm": 0.49421226978302, + "learning_rate": 0.00019274464934715304, + "loss": 3.7792, + "step": 3048 + }, + { + "epoch": 3.89984, + "grad_norm": 0.47549110651016235, + "learning_rate": 0.00019270426706151567, + "loss": 3.8229, + "step": 3049 + }, + { + "epoch": 3.90112, + "grad_norm": 0.5038889050483704, + "learning_rate": 0.0001926638847758783, + "loss": 3.8079, + "step": 3050 + }, + { + "epoch": 3.9024, + "grad_norm": 0.4997271001338959, + "learning_rate": 0.00019262350249024093, + "loss": 3.8135, + "step": 3051 + }, + { + "epoch": 3.90368, + "grad_norm": 0.4927690625190735, + "learning_rate": 0.00019258312020460354, + "loss": 3.8534, + "step": 3052 + }, + { + "epoch": 3.90496, + "grad_norm": 0.4786875545978546, + "learning_rate": 0.0001925427379189662, + "loss": 3.7229, + "step": 3053 + }, + { + "epoch": 3.90624, + "grad_norm": 0.5047836899757385, + "learning_rate": 0.00019250235563332882, + "loss": 3.8432, + "step": 3054 + }, + { + "epoch": 3.90752, + "grad_norm": 0.5120929479598999, + "learning_rate": 0.00019246197334769145, + "loss": 3.777, + "step": 3055 + }, + { + "epoch": 3.9088000000000003, + "grad_norm": 0.49564608931541443, + "learning_rate": 0.00019242159106205408, + "loss": 3.8209, + "step": 3056 + }, + { + "epoch": 3.91008, + "grad_norm": 0.4866943657398224, + "learning_rate": 0.00019238120877641674, + "loss": 3.7899, + "step": 3057 + }, + { + "epoch": 3.91136, + "grad_norm": 0.487051784992218, + "learning_rate": 0.00019234082649077937, + "loss": 3.799, + "step": 3058 + }, + { + "epoch": 3.91264, + "grad_norm": 0.4829874634742737, + "learning_rate": 0.000192300444205142, + "loss": 3.7346, + "step": 3059 + }, + { + "epoch": 3.91392, + "grad_norm": 0.4827929735183716, + "learning_rate": 0.0001922600619195046, + "loss": 3.7442, + "step": 3060 + }, + { + "epoch": 3.9152, + "grad_norm": 0.4668867588043213, + "learning_rate": 0.00019221967963386726, + "loss": 3.7373, + "step": 3061 + }, + { + "epoch": 3.91648, + "grad_norm": 0.4852744936943054, + "learning_rate": 0.0001921792973482299, + "loss": 3.7558, + "step": 3062 + }, + { + "epoch": 3.91776, + "grad_norm": 0.4621356129646301, + "learning_rate": 0.00019213891506259252, + "loss": 3.7306, + "step": 3063 + }, + { + "epoch": 3.91904, + "grad_norm": 0.47092723846435547, + "learning_rate": 0.00019209853277695515, + "loss": 3.7813, + "step": 3064 + }, + { + "epoch": 3.9203200000000002, + "grad_norm": 0.48093530535697937, + "learning_rate": 0.0001920581504913178, + "loss": 3.7827, + "step": 3065 + }, + { + "epoch": 3.9215999999999998, + "grad_norm": 0.4793148338794708, + "learning_rate": 0.00019201776820568044, + "loss": 3.7267, + "step": 3066 + }, + { + "epoch": 3.92288, + "grad_norm": 0.4646039605140686, + "learning_rate": 0.00019197738592004307, + "loss": 3.7712, + "step": 3067 + }, + { + "epoch": 3.92416, + "grad_norm": 0.47865545749664307, + "learning_rate": 0.00019193700363440567, + "loss": 3.7797, + "step": 3068 + }, + { + "epoch": 3.92544, + "grad_norm": 0.4747565984725952, + "learning_rate": 0.0001918966213487683, + "loss": 3.7688, + "step": 3069 + }, + { + "epoch": 3.92672, + "grad_norm": 0.4655320346355438, + "learning_rate": 0.00019185623906313096, + "loss": 3.8159, + "step": 3070 + }, + { + "epoch": 3.928, + "grad_norm": 0.4775558114051819, + "learning_rate": 0.0001918158567774936, + "loss": 3.7575, + "step": 3071 + }, + { + "epoch": 3.92928, + "grad_norm": 0.4783431589603424, + "learning_rate": 0.00019177547449185622, + "loss": 3.6739, + "step": 3072 + }, + { + "epoch": 3.93056, + "grad_norm": 0.48486626148223877, + "learning_rate": 0.00019173509220621885, + "loss": 3.7762, + "step": 3073 + }, + { + "epoch": 3.9318400000000002, + "grad_norm": 0.4990038275718689, + "learning_rate": 0.0001916947099205815, + "loss": 3.821, + "step": 3074 + }, + { + "epoch": 3.9331199999999997, + "grad_norm": 0.4997273087501526, + "learning_rate": 0.00019165432763494414, + "loss": 3.8262, + "step": 3075 + }, + { + "epoch": 3.9344, + "grad_norm": 0.48248130083084106, + "learning_rate": 0.00019161394534930674, + "loss": 3.8229, + "step": 3076 + }, + { + "epoch": 3.93568, + "grad_norm": 0.4947473406791687, + "learning_rate": 0.00019157356306366937, + "loss": 3.8035, + "step": 3077 + }, + { + "epoch": 3.93696, + "grad_norm": 0.49929845333099365, + "learning_rate": 0.00019153318077803203, + "loss": 3.7907, + "step": 3078 + }, + { + "epoch": 3.93824, + "grad_norm": 0.5050824880599976, + "learning_rate": 0.00019149279849239466, + "loss": 3.7588, + "step": 3079 + }, + { + "epoch": 3.93952, + "grad_norm": 0.5087640881538391, + "learning_rate": 0.0001914524162067573, + "loss": 3.8331, + "step": 3080 + }, + { + "epoch": 3.9408, + "grad_norm": 0.4996209442615509, + "learning_rate": 0.00019141203392111992, + "loss": 3.7626, + "step": 3081 + }, + { + "epoch": 3.94208, + "grad_norm": 0.49905964732170105, + "learning_rate": 0.00019137165163548255, + "loss": 3.7356, + "step": 3082 + }, + { + "epoch": 3.94336, + "grad_norm": 0.5181260704994202, + "learning_rate": 0.0001913312693498452, + "loss": 3.8077, + "step": 3083 + }, + { + "epoch": 3.94464, + "grad_norm": 0.5053640604019165, + "learning_rate": 0.0001912908870642078, + "loss": 3.8205, + "step": 3084 + }, + { + "epoch": 3.94592, + "grad_norm": 0.5048377513885498, + "learning_rate": 0.00019125050477857044, + "loss": 3.7736, + "step": 3085 + }, + { + "epoch": 3.9472, + "grad_norm": 0.4769393503665924, + "learning_rate": 0.00019121012249293307, + "loss": 3.7224, + "step": 3086 + }, + { + "epoch": 3.94848, + "grad_norm": 0.5042276382446289, + "learning_rate": 0.00019116974020729572, + "loss": 3.7301, + "step": 3087 + }, + { + "epoch": 3.94976, + "grad_norm": 0.4883654713630676, + "learning_rate": 0.00019112935792165835, + "loss": 3.7873, + "step": 3088 + }, + { + "epoch": 3.95104, + "grad_norm": 0.46280887722969055, + "learning_rate": 0.00019108897563602098, + "loss": 3.8232, + "step": 3089 + }, + { + "epoch": 3.9523200000000003, + "grad_norm": 0.47715169191360474, + "learning_rate": 0.00019104859335038361, + "loss": 3.8285, + "step": 3090 + }, + { + "epoch": 3.9536, + "grad_norm": 0.4663173258304596, + "learning_rate": 0.00019100821106474627, + "loss": 3.7407, + "step": 3091 + }, + { + "epoch": 3.95488, + "grad_norm": 0.4840726852416992, + "learning_rate": 0.0001909678287791089, + "loss": 3.7961, + "step": 3092 + }, + { + "epoch": 3.95616, + "grad_norm": 0.4775456190109253, + "learning_rate": 0.0001909274464934715, + "loss": 3.7243, + "step": 3093 + }, + { + "epoch": 3.95744, + "grad_norm": 0.506077229976654, + "learning_rate": 0.00019088706420783414, + "loss": 3.832, + "step": 3094 + }, + { + "epoch": 3.95872, + "grad_norm": 0.4873894155025482, + "learning_rate": 0.00019084668192219677, + "loss": 3.8327, + "step": 3095 + }, + { + "epoch": 3.96, + "grad_norm": 0.4892643690109253, + "learning_rate": 0.00019080629963655942, + "loss": 3.8131, + "step": 3096 + }, + { + "epoch": 3.96128, + "grad_norm": 0.503563642501831, + "learning_rate": 0.00019076591735092205, + "loss": 3.8291, + "step": 3097 + }, + { + "epoch": 3.96256, + "grad_norm": 0.4653085768222809, + "learning_rate": 0.00019072553506528468, + "loss": 3.7782, + "step": 3098 + }, + { + "epoch": 3.9638400000000003, + "grad_norm": 0.46706753969192505, + "learning_rate": 0.00019068515277964729, + "loss": 3.7926, + "step": 3099 + }, + { + "epoch": 3.9651199999999998, + "grad_norm": 0.4738360345363617, + "learning_rate": 0.00019064477049400997, + "loss": 3.7901, + "step": 3100 + }, + { + "epoch": 3.9664, + "grad_norm": 0.48858094215393066, + "learning_rate": 0.00019060438820837257, + "loss": 3.7761, + "step": 3101 + }, + { + "epoch": 3.96768, + "grad_norm": 0.4789169430732727, + "learning_rate": 0.0001905640059227352, + "loss": 3.8362, + "step": 3102 + }, + { + "epoch": 3.96896, + "grad_norm": 0.4818405508995056, + "learning_rate": 0.00019052362363709783, + "loss": 3.7566, + "step": 3103 + }, + { + "epoch": 3.97024, + "grad_norm": 0.4910163879394531, + "learning_rate": 0.0001904832413514605, + "loss": 3.8183, + "step": 3104 + }, + { + "epoch": 3.97152, + "grad_norm": 0.47668853402137756, + "learning_rate": 0.00019044285906582312, + "loss": 3.7694, + "step": 3105 + }, + { + "epoch": 3.9728, + "grad_norm": 0.4871366024017334, + "learning_rate": 0.00019040247678018575, + "loss": 3.8093, + "step": 3106 + }, + { + "epoch": 3.97408, + "grad_norm": 0.4852496087551117, + "learning_rate": 0.00019036209449454838, + "loss": 3.8424, + "step": 3107 + }, + { + "epoch": 3.9753600000000002, + "grad_norm": 0.49890244007110596, + "learning_rate": 0.00019032171220891098, + "loss": 3.7696, + "step": 3108 + }, + { + "epoch": 3.9766399999999997, + "grad_norm": 0.49646714329719543, + "learning_rate": 0.00019028132992327364, + "loss": 3.8346, + "step": 3109 + }, + { + "epoch": 3.97792, + "grad_norm": 0.48989424109458923, + "learning_rate": 0.00019024094763763627, + "loss": 3.7555, + "step": 3110 + }, + { + "epoch": 3.9792, + "grad_norm": 0.4914839267730713, + "learning_rate": 0.0001902005653519989, + "loss": 3.8055, + "step": 3111 + }, + { + "epoch": 3.98048, + "grad_norm": 0.48345229029655457, + "learning_rate": 0.00019016018306636153, + "loss": 3.8296, + "step": 3112 + }, + { + "epoch": 3.98176, + "grad_norm": 0.4965314269065857, + "learning_rate": 0.0001901198007807242, + "loss": 3.8099, + "step": 3113 + }, + { + "epoch": 3.98304, + "grad_norm": 0.47324666380882263, + "learning_rate": 0.00019007941849508682, + "loss": 3.8194, + "step": 3114 + }, + { + "epoch": 3.98432, + "grad_norm": 0.48489847779273987, + "learning_rate": 0.00019003903620944945, + "loss": 3.792, + "step": 3115 + }, + { + "epoch": 3.9856, + "grad_norm": 0.4774338901042938, + "learning_rate": 0.00018999865392381205, + "loss": 3.8118, + "step": 3116 + }, + { + "epoch": 3.98688, + "grad_norm": 0.4876146912574768, + "learning_rate": 0.0001899582716381747, + "loss": 3.8224, + "step": 3117 + }, + { + "epoch": 3.98816, + "grad_norm": 0.4737136662006378, + "learning_rate": 0.00018991788935253734, + "loss": 3.8342, + "step": 3118 + }, + { + "epoch": 3.98944, + "grad_norm": 0.46729955077171326, + "learning_rate": 0.00018987750706689997, + "loss": 3.7307, + "step": 3119 + }, + { + "epoch": 3.99072, + "grad_norm": 0.46377983689308167, + "learning_rate": 0.0001898371247812626, + "loss": 3.8138, + "step": 3120 + }, + { + "epoch": 3.992, + "grad_norm": 0.4843078851699829, + "learning_rate": 0.00018979674249562523, + "loss": 3.8357, + "step": 3121 + }, + { + "epoch": 3.99328, + "grad_norm": 0.47113025188446045, + "learning_rate": 0.00018975636020998789, + "loss": 3.7578, + "step": 3122 + }, + { + "epoch": 3.99456, + "grad_norm": 0.5183811187744141, + "learning_rate": 0.00018971597792435052, + "loss": 3.7808, + "step": 3123 + }, + { + "epoch": 3.99584, + "grad_norm": 0.4774457812309265, + "learning_rate": 0.00018967559563871312, + "loss": 3.7492, + "step": 3124 + }, + { + "epoch": 3.99712, + "grad_norm": 0.5077742338180542, + "learning_rate": 0.00018963521335307575, + "loss": 3.6986, + "step": 3125 + }, + { + "epoch": 3.9984, + "grad_norm": 0.489953875541687, + "learning_rate": 0.0001895948310674384, + "loss": 3.7662, + "step": 3126 + }, + { + "epoch": 3.99968, + "grad_norm": 0.5075393319129944, + "learning_rate": 0.00018955444878180104, + "loss": 3.7844, + "step": 3127 + }, + { + "epoch": 4.0, + "grad_norm": 0.8455784916877747, + "learning_rate": 0.00018951406649616367, + "loss": 3.6669, + "step": 3128 + }, + { + "epoch": 4.00128, + "grad_norm": 0.5457375645637512, + "learning_rate": 0.0001894736842105263, + "loss": 3.696, + "step": 3129 + }, + { + "epoch": 4.00256, + "grad_norm": 0.5012986063957214, + "learning_rate": 0.00018943330192488895, + "loss": 3.7483, + "step": 3130 + }, + { + "epoch": 4.00384, + "grad_norm": 0.5168102979660034, + "learning_rate": 0.00018939291963925158, + "loss": 3.6833, + "step": 3131 + }, + { + "epoch": 4.00512, + "grad_norm": 0.5085991024971008, + "learning_rate": 0.0001893525373536142, + "loss": 3.7756, + "step": 3132 + }, + { + "epoch": 4.0064, + "grad_norm": 0.5034124255180359, + "learning_rate": 0.00018931215506797682, + "loss": 3.6705, + "step": 3133 + }, + { + "epoch": 4.00768, + "grad_norm": 0.5143512487411499, + "learning_rate": 0.00018927177278233945, + "loss": 3.7017, + "step": 3134 + }, + { + "epoch": 4.00896, + "grad_norm": 0.49328911304473877, + "learning_rate": 0.0001892313904967021, + "loss": 3.618, + "step": 3135 + }, + { + "epoch": 4.01024, + "grad_norm": 0.49207302927970886, + "learning_rate": 0.00018919100821106473, + "loss": 3.6588, + "step": 3136 + }, + { + "epoch": 4.01152, + "grad_norm": 0.5474900603294373, + "learning_rate": 0.00018915062592542736, + "loss": 3.7265, + "step": 3137 + }, + { + "epoch": 4.0128, + "grad_norm": 0.48699188232421875, + "learning_rate": 0.00018911024363979, + "loss": 3.7186, + "step": 3138 + }, + { + "epoch": 4.01408, + "grad_norm": 0.5149625539779663, + "learning_rate": 0.00018906986135415265, + "loss": 3.6609, + "step": 3139 + }, + { + "epoch": 4.01536, + "grad_norm": 0.5144675374031067, + "learning_rate": 0.00018902947906851525, + "loss": 3.6982, + "step": 3140 + }, + { + "epoch": 4.01664, + "grad_norm": 0.4917832612991333, + "learning_rate": 0.00018898909678287788, + "loss": 3.6243, + "step": 3141 + }, + { + "epoch": 4.01792, + "grad_norm": 0.5480942726135254, + "learning_rate": 0.00018894871449724051, + "loss": 3.7372, + "step": 3142 + }, + { + "epoch": 4.0192, + "grad_norm": 0.49381646513938904, + "learning_rate": 0.00018890833221160317, + "loss": 3.6939, + "step": 3143 + }, + { + "epoch": 4.02048, + "grad_norm": 0.5019636750221252, + "learning_rate": 0.0001888679499259658, + "loss": 3.7502, + "step": 3144 + }, + { + "epoch": 4.0217600000000004, + "grad_norm": 0.49695998430252075, + "learning_rate": 0.00018882756764032843, + "loss": 3.7423, + "step": 3145 + }, + { + "epoch": 4.02304, + "grad_norm": 0.5167843103408813, + "learning_rate": 0.00018878718535469106, + "loss": 3.7029, + "step": 3146 + }, + { + "epoch": 4.02432, + "grad_norm": 0.49102476239204407, + "learning_rate": 0.00018874680306905366, + "loss": 3.6505, + "step": 3147 + }, + { + "epoch": 4.0256, + "grad_norm": 0.5086076855659485, + "learning_rate": 0.00018870642078341632, + "loss": 3.654, + "step": 3148 + }, + { + "epoch": 4.02688, + "grad_norm": 0.5053331851959229, + "learning_rate": 0.00018866603849777895, + "loss": 3.6908, + "step": 3149 + }, + { + "epoch": 4.02816, + "grad_norm": 0.5181117653846741, + "learning_rate": 0.00018862565621214158, + "loss": 3.6742, + "step": 3150 + }, + { + "epoch": 4.02944, + "grad_norm": 0.4862511157989502, + "learning_rate": 0.0001885852739265042, + "loss": 3.6927, + "step": 3151 + }, + { + "epoch": 4.03072, + "grad_norm": 0.5201453566551208, + "learning_rate": 0.00018854489164086687, + "loss": 3.7295, + "step": 3152 + }, + { + "epoch": 4.032, + "grad_norm": 0.4780125319957733, + "learning_rate": 0.0001885045093552295, + "loss": 3.6679, + "step": 3153 + }, + { + "epoch": 4.03328, + "grad_norm": 0.5376641750335693, + "learning_rate": 0.00018846412706959213, + "loss": 3.7114, + "step": 3154 + }, + { + "epoch": 4.03456, + "grad_norm": 0.4939403235912323, + "learning_rate": 0.00018842374478395473, + "loss": 3.6443, + "step": 3155 + }, + { + "epoch": 4.03584, + "grad_norm": 0.502256453037262, + "learning_rate": 0.0001883833624983174, + "loss": 3.7188, + "step": 3156 + }, + { + "epoch": 4.03712, + "grad_norm": 0.49722784757614136, + "learning_rate": 0.00018834298021268002, + "loss": 3.6703, + "step": 3157 + }, + { + "epoch": 4.0384, + "grad_norm": 0.4928574562072754, + "learning_rate": 0.00018830259792704265, + "loss": 3.6398, + "step": 3158 + }, + { + "epoch": 4.03968, + "grad_norm": 0.5021003484725952, + "learning_rate": 0.00018826221564140528, + "loss": 3.6147, + "step": 3159 + }, + { + "epoch": 4.04096, + "grad_norm": 0.49602240324020386, + "learning_rate": 0.0001882218333557679, + "loss": 3.6448, + "step": 3160 + }, + { + "epoch": 4.04224, + "grad_norm": 0.5034336447715759, + "learning_rate": 0.00018818145107013057, + "loss": 3.7424, + "step": 3161 + }, + { + "epoch": 4.04352, + "grad_norm": 0.4804299473762512, + "learning_rate": 0.0001881410687844932, + "loss": 3.6461, + "step": 3162 + }, + { + "epoch": 4.0448, + "grad_norm": 0.509303092956543, + "learning_rate": 0.0001881006864988558, + "loss": 3.6735, + "step": 3163 + }, + { + "epoch": 4.04608, + "grad_norm": 0.5042724609375, + "learning_rate": 0.00018806030421321843, + "loss": 3.6822, + "step": 3164 + }, + { + "epoch": 4.04736, + "grad_norm": 0.49135807156562805, + "learning_rate": 0.0001880199219275811, + "loss": 3.6458, + "step": 3165 + }, + { + "epoch": 4.04864, + "grad_norm": 0.486905038356781, + "learning_rate": 0.00018797953964194372, + "loss": 3.6209, + "step": 3166 + }, + { + "epoch": 4.04992, + "grad_norm": 0.4827145040035248, + "learning_rate": 0.00018793915735630635, + "loss": 3.6738, + "step": 3167 + }, + { + "epoch": 4.0512, + "grad_norm": 0.4779025912284851, + "learning_rate": 0.00018789877507066898, + "loss": 3.6753, + "step": 3168 + }, + { + "epoch": 4.05248, + "grad_norm": 0.49523454904556274, + "learning_rate": 0.00018785839278503163, + "loss": 3.6973, + "step": 3169 + }, + { + "epoch": 4.05376, + "grad_norm": 0.49853113293647766, + "learning_rate": 0.00018781801049939426, + "loss": 3.7392, + "step": 3170 + }, + { + "epoch": 4.05504, + "grad_norm": 0.500675618648529, + "learning_rate": 0.00018777762821375687, + "loss": 3.6922, + "step": 3171 + }, + { + "epoch": 4.05632, + "grad_norm": 0.49067091941833496, + "learning_rate": 0.0001877372459281195, + "loss": 3.6531, + "step": 3172 + }, + { + "epoch": 4.0576, + "grad_norm": 0.4818088710308075, + "learning_rate": 0.00018769686364248213, + "loss": 3.7128, + "step": 3173 + }, + { + "epoch": 4.05888, + "grad_norm": 0.5128709673881531, + "learning_rate": 0.00018765648135684479, + "loss": 3.7044, + "step": 3174 + }, + { + "epoch": 4.06016, + "grad_norm": 0.5045835375785828, + "learning_rate": 0.00018761609907120742, + "loss": 3.6572, + "step": 3175 + }, + { + "epoch": 4.06144, + "grad_norm": 0.4782851040363312, + "learning_rate": 0.00018757571678557005, + "loss": 3.7127, + "step": 3176 + }, + { + "epoch": 4.06272, + "grad_norm": 0.4840090274810791, + "learning_rate": 0.00018753533449993268, + "loss": 3.6702, + "step": 3177 + }, + { + "epoch": 4.064, + "grad_norm": 0.49672845005989075, + "learning_rate": 0.00018749495221429533, + "loss": 3.6537, + "step": 3178 + }, + { + "epoch": 4.06528, + "grad_norm": 0.4983557462692261, + "learning_rate": 0.00018745456992865796, + "loss": 3.6794, + "step": 3179 + }, + { + "epoch": 4.06656, + "grad_norm": 0.5004916191101074, + "learning_rate": 0.00018741418764302057, + "loss": 3.6771, + "step": 3180 + }, + { + "epoch": 4.06784, + "grad_norm": 0.5043333768844604, + "learning_rate": 0.0001873738053573832, + "loss": 3.7056, + "step": 3181 + }, + { + "epoch": 4.06912, + "grad_norm": 0.4966271221637726, + "learning_rate": 0.00018733342307174585, + "loss": 3.6367, + "step": 3182 + }, + { + "epoch": 4.0704, + "grad_norm": 0.5312650203704834, + "learning_rate": 0.00018729304078610848, + "loss": 3.6806, + "step": 3183 + }, + { + "epoch": 4.07168, + "grad_norm": 0.48910966515541077, + "learning_rate": 0.0001872526585004711, + "loss": 3.726, + "step": 3184 + }, + { + "epoch": 4.07296, + "grad_norm": 0.5237113237380981, + "learning_rate": 0.00018721227621483374, + "loss": 3.6479, + "step": 3185 + }, + { + "epoch": 4.07424, + "grad_norm": 0.48384127020835876, + "learning_rate": 0.0001871718939291964, + "loss": 3.7127, + "step": 3186 + }, + { + "epoch": 4.07552, + "grad_norm": 0.5084925889968872, + "learning_rate": 0.00018713151164355903, + "loss": 3.6814, + "step": 3187 + }, + { + "epoch": 4.0768, + "grad_norm": 0.5011091828346252, + "learning_rate": 0.00018709112935792163, + "loss": 3.6603, + "step": 3188 + }, + { + "epoch": 4.07808, + "grad_norm": 0.4861987829208374, + "learning_rate": 0.00018705074707228426, + "loss": 3.7693, + "step": 3189 + }, + { + "epoch": 4.07936, + "grad_norm": 0.5180191993713379, + "learning_rate": 0.0001870103647866469, + "loss": 3.6892, + "step": 3190 + }, + { + "epoch": 4.08064, + "grad_norm": 0.48718804121017456, + "learning_rate": 0.00018696998250100955, + "loss": 3.6065, + "step": 3191 + }, + { + "epoch": 4.08192, + "grad_norm": 0.520859956741333, + "learning_rate": 0.00018692960021537218, + "loss": 3.656, + "step": 3192 + }, + { + "epoch": 4.0832, + "grad_norm": 0.49004432559013367, + "learning_rate": 0.0001868892179297348, + "loss": 3.6136, + "step": 3193 + }, + { + "epoch": 4.08448, + "grad_norm": 0.5408604741096497, + "learning_rate": 0.00018684883564409744, + "loss": 3.6482, + "step": 3194 + }, + { + "epoch": 4.08576, + "grad_norm": 0.4856225252151489, + "learning_rate": 0.0001868084533584601, + "loss": 3.6174, + "step": 3195 + }, + { + "epoch": 4.08704, + "grad_norm": 0.5170305371284485, + "learning_rate": 0.0001867680710728227, + "loss": 3.7183, + "step": 3196 + }, + { + "epoch": 4.08832, + "grad_norm": 0.5067542791366577, + "learning_rate": 0.00018672768878718533, + "loss": 3.6856, + "step": 3197 + }, + { + "epoch": 4.0896, + "grad_norm": 0.4944978952407837, + "learning_rate": 0.00018668730650154796, + "loss": 3.6034, + "step": 3198 + }, + { + "epoch": 4.09088, + "grad_norm": 0.5182904601097107, + "learning_rate": 0.00018664692421591062, + "loss": 3.7634, + "step": 3199 + }, + { + "epoch": 4.09216, + "grad_norm": 0.5076016187667847, + "learning_rate": 0.00018660654193027325, + "loss": 3.6408, + "step": 3200 + }, + { + "epoch": 4.09344, + "grad_norm": 0.4935440123081207, + "learning_rate": 0.00018656615964463588, + "loss": 3.6573, + "step": 3201 + }, + { + "epoch": 4.09472, + "grad_norm": 0.5223529934883118, + "learning_rate": 0.0001865257773589985, + "loss": 3.6398, + "step": 3202 + }, + { + "epoch": 4.096, + "grad_norm": 0.4967701733112335, + "learning_rate": 0.0001864853950733611, + "loss": 3.6774, + "step": 3203 + }, + { + "epoch": 4.09728, + "grad_norm": 0.5154727697372437, + "learning_rate": 0.00018644501278772377, + "loss": 3.7467, + "step": 3204 + }, + { + "epoch": 4.09856, + "grad_norm": 0.5095906257629395, + "learning_rate": 0.0001864046305020864, + "loss": 3.7121, + "step": 3205 + }, + { + "epoch": 4.09984, + "grad_norm": 0.5027210116386414, + "learning_rate": 0.00018636424821644903, + "loss": 3.671, + "step": 3206 + }, + { + "epoch": 4.10112, + "grad_norm": 0.537636399269104, + "learning_rate": 0.00018632386593081166, + "loss": 3.7246, + "step": 3207 + }, + { + "epoch": 4.1024, + "grad_norm": 0.4799419641494751, + "learning_rate": 0.00018628348364517432, + "loss": 3.7003, + "step": 3208 + }, + { + "epoch": 4.10368, + "grad_norm": 0.5242260098457336, + "learning_rate": 0.00018624310135953695, + "loss": 3.6924, + "step": 3209 + }, + { + "epoch": 4.10496, + "grad_norm": 0.5360248684883118, + "learning_rate": 0.00018620271907389958, + "loss": 3.6041, + "step": 3210 + }, + { + "epoch": 4.10624, + "grad_norm": 0.5119033455848694, + "learning_rate": 0.00018616233678826218, + "loss": 3.6326, + "step": 3211 + }, + { + "epoch": 4.10752, + "grad_norm": 0.5700018405914307, + "learning_rate": 0.00018612195450262484, + "loss": 3.7075, + "step": 3212 + }, + { + "epoch": 4.1088, + "grad_norm": 0.5085955262184143, + "learning_rate": 0.00018608157221698747, + "loss": 3.6672, + "step": 3213 + }, + { + "epoch": 4.11008, + "grad_norm": 0.5086491703987122, + "learning_rate": 0.0001860411899313501, + "loss": 3.6875, + "step": 3214 + }, + { + "epoch": 4.11136, + "grad_norm": 0.5247465968132019, + "learning_rate": 0.00018600080764571273, + "loss": 3.6448, + "step": 3215 + }, + { + "epoch": 4.11264, + "grad_norm": 0.5041964054107666, + "learning_rate": 0.00018596042536007536, + "loss": 3.6232, + "step": 3216 + }, + { + "epoch": 4.11392, + "grad_norm": 0.5098519325256348, + "learning_rate": 0.00018592004307443801, + "loss": 3.6918, + "step": 3217 + }, + { + "epoch": 4.1152, + "grad_norm": 0.5236632227897644, + "learning_rate": 0.00018587966078880064, + "loss": 3.6658, + "step": 3218 + }, + { + "epoch": 4.11648, + "grad_norm": 0.4816301465034485, + "learning_rate": 0.00018583927850316325, + "loss": 3.7479, + "step": 3219 + }, + { + "epoch": 4.11776, + "grad_norm": 0.5393995642662048, + "learning_rate": 0.00018579889621752588, + "loss": 3.6477, + "step": 3220 + }, + { + "epoch": 4.11904, + "grad_norm": 0.4957987666130066, + "learning_rate": 0.00018575851393188853, + "loss": 3.7106, + "step": 3221 + }, + { + "epoch": 4.12032, + "grad_norm": 0.5068425536155701, + "learning_rate": 0.00018571813164625116, + "loss": 3.6857, + "step": 3222 + }, + { + "epoch": 4.1216, + "grad_norm": 0.5265698432922363, + "learning_rate": 0.0001856777493606138, + "loss": 3.7724, + "step": 3223 + }, + { + "epoch": 4.12288, + "grad_norm": 0.5055711269378662, + "learning_rate": 0.00018563736707497642, + "loss": 3.7152, + "step": 3224 + }, + { + "epoch": 4.12416, + "grad_norm": 0.5185781717300415, + "learning_rate": 0.00018559698478933908, + "loss": 3.729, + "step": 3225 + }, + { + "epoch": 4.12544, + "grad_norm": 0.5067470669746399, + "learning_rate": 0.0001855566025037017, + "loss": 3.7011, + "step": 3226 + }, + { + "epoch": 4.12672, + "grad_norm": 0.5095682144165039, + "learning_rate": 0.00018551622021806431, + "loss": 3.6616, + "step": 3227 + }, + { + "epoch": 4.128, + "grad_norm": 0.5128963589668274, + "learning_rate": 0.00018547583793242694, + "loss": 3.6418, + "step": 3228 + }, + { + "epoch": 4.12928, + "grad_norm": 0.5066363215446472, + "learning_rate": 0.00018543545564678957, + "loss": 3.6522, + "step": 3229 + }, + { + "epoch": 4.13056, + "grad_norm": 0.5100614428520203, + "learning_rate": 0.00018539507336115223, + "loss": 3.6805, + "step": 3230 + }, + { + "epoch": 4.13184, + "grad_norm": 0.5063032507896423, + "learning_rate": 0.00018535469107551486, + "loss": 3.654, + "step": 3231 + }, + { + "epoch": 4.13312, + "grad_norm": 0.5069268345832825, + "learning_rate": 0.0001853143087898775, + "loss": 3.7731, + "step": 3232 + }, + { + "epoch": 4.1344, + "grad_norm": 0.5150942206382751, + "learning_rate": 0.00018527392650424012, + "loss": 3.7019, + "step": 3233 + }, + { + "epoch": 4.13568, + "grad_norm": 0.4924696087837219, + "learning_rate": 0.00018523354421860278, + "loss": 3.7097, + "step": 3234 + }, + { + "epoch": 4.13696, + "grad_norm": 0.5000399351119995, + "learning_rate": 0.00018519316193296538, + "loss": 3.6958, + "step": 3235 + }, + { + "epoch": 4.13824, + "grad_norm": 0.468902051448822, + "learning_rate": 0.000185152779647328, + "loss": 3.6186, + "step": 3236 + }, + { + "epoch": 4.13952, + "grad_norm": 0.49955734610557556, + "learning_rate": 0.00018511239736169064, + "loss": 3.7507, + "step": 3237 + }, + { + "epoch": 4.1408, + "grad_norm": 0.5048097372055054, + "learning_rate": 0.0001850720150760533, + "loss": 3.6257, + "step": 3238 + }, + { + "epoch": 4.14208, + "grad_norm": 0.4985654056072235, + "learning_rate": 0.00018503163279041593, + "loss": 3.6897, + "step": 3239 + }, + { + "epoch": 4.14336, + "grad_norm": 0.4974263310432434, + "learning_rate": 0.00018499125050477856, + "loss": 3.7192, + "step": 3240 + }, + { + "epoch": 4.14464, + "grad_norm": 0.5002862215042114, + "learning_rate": 0.0001849508682191412, + "loss": 3.694, + "step": 3241 + }, + { + "epoch": 4.14592, + "grad_norm": 0.49395427107810974, + "learning_rate": 0.0001849104859335038, + "loss": 3.7136, + "step": 3242 + }, + { + "epoch": 4.1472, + "grad_norm": 0.5043503046035767, + "learning_rate": 0.00018487010364786645, + "loss": 3.6868, + "step": 3243 + }, + { + "epoch": 4.14848, + "grad_norm": 0.4990045726299286, + "learning_rate": 0.00018482972136222908, + "loss": 3.6802, + "step": 3244 + }, + { + "epoch": 4.14976, + "grad_norm": 0.5021066069602966, + "learning_rate": 0.0001847893390765917, + "loss": 3.6486, + "step": 3245 + }, + { + "epoch": 4.15104, + "grad_norm": 0.48476442694664, + "learning_rate": 0.00018474895679095434, + "loss": 3.6201, + "step": 3246 + }, + { + "epoch": 4.15232, + "grad_norm": 0.5046278238296509, + "learning_rate": 0.000184708574505317, + "loss": 3.7327, + "step": 3247 + }, + { + "epoch": 4.1536, + "grad_norm": 0.49205055832862854, + "learning_rate": 0.00018466819221967963, + "loss": 3.7463, + "step": 3248 + }, + { + "epoch": 4.15488, + "grad_norm": 0.4804619252681732, + "learning_rate": 0.00018462780993404226, + "loss": 3.5683, + "step": 3249 + }, + { + "epoch": 4.15616, + "grad_norm": 0.4981594979763031, + "learning_rate": 0.00018458742764840486, + "loss": 3.6402, + "step": 3250 + }, + { + "epoch": 4.15744, + "grad_norm": 0.4988977313041687, + "learning_rate": 0.00018454704536276755, + "loss": 3.6497, + "step": 3251 + }, + { + "epoch": 4.15872, + "grad_norm": 0.5172099471092224, + "learning_rate": 0.00018450666307713015, + "loss": 3.6574, + "step": 3252 + }, + { + "epoch": 4.16, + "grad_norm": 0.5064343810081482, + "learning_rate": 0.00018446628079149278, + "loss": 3.6283, + "step": 3253 + }, + { + "epoch": 4.16128, + "grad_norm": 0.47614553570747375, + "learning_rate": 0.0001844258985058554, + "loss": 3.6493, + "step": 3254 + }, + { + "epoch": 4.16256, + "grad_norm": 0.48512086272239685, + "learning_rate": 0.00018438551622021804, + "loss": 3.6168, + "step": 3255 + }, + { + "epoch": 4.16384, + "grad_norm": 0.5282117128372192, + "learning_rate": 0.0001843451339345807, + "loss": 3.7312, + "step": 3256 + }, + { + "epoch": 4.16512, + "grad_norm": 0.4799935817718506, + "learning_rate": 0.00018430475164894333, + "loss": 3.6321, + "step": 3257 + }, + { + "epoch": 4.1664, + "grad_norm": 0.5284485816955566, + "learning_rate": 0.00018426436936330593, + "loss": 3.6389, + "step": 3258 + }, + { + "epoch": 4.16768, + "grad_norm": 0.49284806847572327, + "learning_rate": 0.00018422398707766856, + "loss": 3.7423, + "step": 3259 + }, + { + "epoch": 4.16896, + "grad_norm": 0.5154812335968018, + "learning_rate": 0.00018418360479203122, + "loss": 3.6436, + "step": 3260 + }, + { + "epoch": 4.17024, + "grad_norm": 0.49612143635749817, + "learning_rate": 0.00018414322250639385, + "loss": 3.6282, + "step": 3261 + }, + { + "epoch": 4.17152, + "grad_norm": 0.5098751783370972, + "learning_rate": 0.00018410284022075648, + "loss": 3.7001, + "step": 3262 + }, + { + "epoch": 4.1728, + "grad_norm": 0.5058940649032593, + "learning_rate": 0.0001840624579351191, + "loss": 3.7325, + "step": 3263 + }, + { + "epoch": 4.17408, + "grad_norm": 0.4903821051120758, + "learning_rate": 0.00018402207564948176, + "loss": 3.7294, + "step": 3264 + }, + { + "epoch": 4.17536, + "grad_norm": 0.5205444693565369, + "learning_rate": 0.0001839816933638444, + "loss": 3.748, + "step": 3265 + }, + { + "epoch": 4.17664, + "grad_norm": 0.5151527523994446, + "learning_rate": 0.00018394131107820702, + "loss": 3.7253, + "step": 3266 + }, + { + "epoch": 4.17792, + "grad_norm": 0.5021133422851562, + "learning_rate": 0.00018390092879256963, + "loss": 3.7138, + "step": 3267 + }, + { + "epoch": 4.1792, + "grad_norm": 0.5128991007804871, + "learning_rate": 0.00018386054650693226, + "loss": 3.695, + "step": 3268 + }, + { + "epoch": 4.18048, + "grad_norm": 0.4980866014957428, + "learning_rate": 0.0001838201642212949, + "loss": 3.6914, + "step": 3269 + }, + { + "epoch": 4.18176, + "grad_norm": 0.4788151681423187, + "learning_rate": 0.00018377978193565754, + "loss": 3.7036, + "step": 3270 + }, + { + "epoch": 4.18304, + "grad_norm": 0.5027557611465454, + "learning_rate": 0.00018373939965002017, + "loss": 3.6309, + "step": 3271 + }, + { + "epoch": 4.18432, + "grad_norm": 0.4961322247982025, + "learning_rate": 0.0001836990173643828, + "loss": 3.7176, + "step": 3272 + }, + { + "epoch": 4.1856, + "grad_norm": 0.48880982398986816, + "learning_rate": 0.00018365863507874546, + "loss": 3.6192, + "step": 3273 + }, + { + "epoch": 4.18688, + "grad_norm": 0.5047913193702698, + "learning_rate": 0.0001836182527931081, + "loss": 3.6584, + "step": 3274 + }, + { + "epoch": 4.18816, + "grad_norm": 0.5131552815437317, + "learning_rate": 0.0001835778705074707, + "loss": 3.6554, + "step": 3275 + }, + { + "epoch": 4.18944, + "grad_norm": 0.4939292073249817, + "learning_rate": 0.00018353748822183332, + "loss": 3.6149, + "step": 3276 + }, + { + "epoch": 4.19072, + "grad_norm": 0.5033706426620483, + "learning_rate": 0.00018349710593619598, + "loss": 3.704, + "step": 3277 + }, + { + "epoch": 4.192, + "grad_norm": 0.4993079602718353, + "learning_rate": 0.0001834567236505586, + "loss": 3.7489, + "step": 3278 + }, + { + "epoch": 4.19328, + "grad_norm": 0.49270492792129517, + "learning_rate": 0.00018341634136492124, + "loss": 3.6576, + "step": 3279 + }, + { + "epoch": 4.19456, + "grad_norm": 0.513554036617279, + "learning_rate": 0.00018337595907928387, + "loss": 3.6928, + "step": 3280 + }, + { + "epoch": 4.19584, + "grad_norm": 0.4733978509902954, + "learning_rate": 0.0001833355767936465, + "loss": 3.6538, + "step": 3281 + }, + { + "epoch": 4.19712, + "grad_norm": 0.5104435086250305, + "learning_rate": 0.00018329519450800916, + "loss": 3.637, + "step": 3282 + }, + { + "epoch": 4.1984, + "grad_norm": 0.49731189012527466, + "learning_rate": 0.00018325481222237176, + "loss": 3.7184, + "step": 3283 + }, + { + "epoch": 4.19968, + "grad_norm": 0.5013545155525208, + "learning_rate": 0.0001832144299367344, + "loss": 3.6749, + "step": 3284 + }, + { + "epoch": 4.20096, + "grad_norm": 0.5214062929153442, + "learning_rate": 0.00018317404765109702, + "loss": 3.6357, + "step": 3285 + }, + { + "epoch": 4.20224, + "grad_norm": 0.4991886615753174, + "learning_rate": 0.00018313366536545968, + "loss": 3.6431, + "step": 3286 + }, + { + "epoch": 4.20352, + "grad_norm": 0.5190424919128418, + "learning_rate": 0.0001830932830798223, + "loss": 3.6487, + "step": 3287 + }, + { + "epoch": 4.2048, + "grad_norm": 0.4960169792175293, + "learning_rate": 0.00018305290079418494, + "loss": 3.7144, + "step": 3288 + }, + { + "epoch": 4.20608, + "grad_norm": 0.49390795826911926, + "learning_rate": 0.00018301251850854757, + "loss": 3.6899, + "step": 3289 + }, + { + "epoch": 4.2073599999999995, + "grad_norm": 0.5029337406158447, + "learning_rate": 0.00018297213622291023, + "loss": 3.7521, + "step": 3290 + }, + { + "epoch": 4.20864, + "grad_norm": 0.48957720398902893, + "learning_rate": 0.00018293175393727283, + "loss": 3.7307, + "step": 3291 + }, + { + "epoch": 4.20992, + "grad_norm": 0.48470643162727356, + "learning_rate": 0.00018289137165163546, + "loss": 3.6277, + "step": 3292 + }, + { + "epoch": 4.2112, + "grad_norm": 0.4815894663333893, + "learning_rate": 0.0001828509893659981, + "loss": 3.7091, + "step": 3293 + }, + { + "epoch": 4.21248, + "grad_norm": 0.49646440148353577, + "learning_rate": 0.00018281060708036072, + "loss": 3.7575, + "step": 3294 + }, + { + "epoch": 4.21376, + "grad_norm": 0.4831089675426483, + "learning_rate": 0.00018277022479472338, + "loss": 3.745, + "step": 3295 + }, + { + "epoch": 4.21504, + "grad_norm": 0.5089125633239746, + "learning_rate": 0.000182729842509086, + "loss": 3.7263, + "step": 3296 + }, + { + "epoch": 4.21632, + "grad_norm": 0.4908083975315094, + "learning_rate": 0.00018268946022344864, + "loss": 3.6735, + "step": 3297 + }, + { + "epoch": 4.2176, + "grad_norm": 0.46788400411605835, + "learning_rate": 0.00018264907793781124, + "loss": 3.6856, + "step": 3298 + }, + { + "epoch": 4.21888, + "grad_norm": 0.4864565432071686, + "learning_rate": 0.0001826086956521739, + "loss": 3.6435, + "step": 3299 + }, + { + "epoch": 4.22016, + "grad_norm": 0.5052434802055359, + "learning_rate": 0.00018256831336653653, + "loss": 3.6445, + "step": 3300 + }, + { + "epoch": 4.22144, + "grad_norm": 0.47929632663726807, + "learning_rate": 0.00018252793108089916, + "loss": 3.6773, + "step": 3301 + }, + { + "epoch": 4.22272, + "grad_norm": 0.522189736366272, + "learning_rate": 0.0001824875487952618, + "loss": 3.6605, + "step": 3302 + }, + { + "epoch": 4.224, + "grad_norm": 0.5014151334762573, + "learning_rate": 0.00018244716650962444, + "loss": 3.6866, + "step": 3303 + }, + { + "epoch": 4.22528, + "grad_norm": 0.49791914224624634, + "learning_rate": 0.00018240678422398707, + "loss": 3.7027, + "step": 3304 + }, + { + "epoch": 4.22656, + "grad_norm": 0.5259125828742981, + "learning_rate": 0.0001823664019383497, + "loss": 3.7465, + "step": 3305 + }, + { + "epoch": 4.22784, + "grad_norm": 0.49660128355026245, + "learning_rate": 0.0001823260196527123, + "loss": 3.7207, + "step": 3306 + }, + { + "epoch": 4.22912, + "grad_norm": 0.5191150307655334, + "learning_rate": 0.00018228563736707496, + "loss": 3.8187, + "step": 3307 + }, + { + "epoch": 4.2304, + "grad_norm": 0.5008911490440369, + "learning_rate": 0.0001822452550814376, + "loss": 3.6294, + "step": 3308 + }, + { + "epoch": 4.23168, + "grad_norm": 0.4917088449001312, + "learning_rate": 0.00018220487279580022, + "loss": 3.7032, + "step": 3309 + }, + { + "epoch": 4.23296, + "grad_norm": 0.5173110961914062, + "learning_rate": 0.00018216449051016285, + "loss": 3.7054, + "step": 3310 + }, + { + "epoch": 4.23424, + "grad_norm": 0.5040363073348999, + "learning_rate": 0.00018212410822452549, + "loss": 3.7053, + "step": 3311 + }, + { + "epoch": 4.23552, + "grad_norm": 0.48391327261924744, + "learning_rate": 0.00018208372593888814, + "loss": 3.6792, + "step": 3312 + }, + { + "epoch": 4.2368, + "grad_norm": 0.5244686603546143, + "learning_rate": 0.00018204334365325077, + "loss": 3.6416, + "step": 3313 + }, + { + "epoch": 4.23808, + "grad_norm": 0.5149886608123779, + "learning_rate": 0.00018200296136761338, + "loss": 3.7686, + "step": 3314 + }, + { + "epoch": 4.23936, + "grad_norm": 0.5035302042961121, + "learning_rate": 0.000181962579081976, + "loss": 3.7187, + "step": 3315 + }, + { + "epoch": 4.24064, + "grad_norm": 0.5213099122047424, + "learning_rate": 0.00018192219679633866, + "loss": 3.7372, + "step": 3316 + }, + { + "epoch": 4.24192, + "grad_norm": 0.49687349796295166, + "learning_rate": 0.0001818818145107013, + "loss": 3.705, + "step": 3317 + }, + { + "epoch": 4.2432, + "grad_norm": 0.49955183267593384, + "learning_rate": 0.00018184143222506392, + "loss": 3.6738, + "step": 3318 + }, + { + "epoch": 4.24448, + "grad_norm": 0.49271631240844727, + "learning_rate": 0.00018180104993942655, + "loss": 3.671, + "step": 3319 + }, + { + "epoch": 4.24576, + "grad_norm": 0.4815945625305176, + "learning_rate": 0.0001817606676537892, + "loss": 3.7131, + "step": 3320 + }, + { + "epoch": 4.24704, + "grad_norm": 0.5155565142631531, + "learning_rate": 0.00018172028536815184, + "loss": 3.6281, + "step": 3321 + }, + { + "epoch": 4.24832, + "grad_norm": 0.48677995800971985, + "learning_rate": 0.00018167990308251444, + "loss": 3.6267, + "step": 3322 + }, + { + "epoch": 4.2496, + "grad_norm": 0.5277391076087952, + "learning_rate": 0.00018163952079687707, + "loss": 3.6703, + "step": 3323 + }, + { + "epoch": 4.25088, + "grad_norm": 0.4805748164653778, + "learning_rate": 0.0001815991385112397, + "loss": 3.6601, + "step": 3324 + }, + { + "epoch": 4.25216, + "grad_norm": 0.527442216873169, + "learning_rate": 0.00018155875622560236, + "loss": 3.7159, + "step": 3325 + }, + { + "epoch": 4.25344, + "grad_norm": 0.48253729939460754, + "learning_rate": 0.000181518373939965, + "loss": 3.6895, + "step": 3326 + }, + { + "epoch": 4.25472, + "grad_norm": 0.49663540720939636, + "learning_rate": 0.00018147799165432762, + "loss": 3.7617, + "step": 3327 + }, + { + "epoch": 4.256, + "grad_norm": 0.48462435603141785, + "learning_rate": 0.00018143760936869025, + "loss": 3.6659, + "step": 3328 + }, + { + "epoch": 4.25728, + "grad_norm": 0.4957021474838257, + "learning_rate": 0.0001813972270830529, + "loss": 3.6396, + "step": 3329 + }, + { + "epoch": 4.25856, + "grad_norm": 0.4980667233467102, + "learning_rate": 0.0001813568447974155, + "loss": 3.6964, + "step": 3330 + }, + { + "epoch": 4.25984, + "grad_norm": 0.47956860065460205, + "learning_rate": 0.00018131646251177814, + "loss": 3.6877, + "step": 3331 + }, + { + "epoch": 4.26112, + "grad_norm": 0.4983195960521698, + "learning_rate": 0.00018127608022614077, + "loss": 3.6595, + "step": 3332 + }, + { + "epoch": 4.2624, + "grad_norm": 0.5003688335418701, + "learning_rate": 0.00018123569794050343, + "loss": 3.7105, + "step": 3333 + }, + { + "epoch": 4.26368, + "grad_norm": 0.4797384738922119, + "learning_rate": 0.00018119531565486606, + "loss": 3.7471, + "step": 3334 + }, + { + "epoch": 4.26496, + "grad_norm": 0.5109444856643677, + "learning_rate": 0.0001811549333692287, + "loss": 3.6958, + "step": 3335 + }, + { + "epoch": 4.26624, + "grad_norm": 0.4882170557975769, + "learning_rate": 0.00018111455108359132, + "loss": 3.6789, + "step": 3336 + }, + { + "epoch": 4.26752, + "grad_norm": 0.5124632120132446, + "learning_rate": 0.00018107416879795392, + "loss": 3.702, + "step": 3337 + }, + { + "epoch": 4.2688, + "grad_norm": 0.500720739364624, + "learning_rate": 0.0001810337865123166, + "loss": 3.6423, + "step": 3338 + }, + { + "epoch": 4.27008, + "grad_norm": 0.49544718861579895, + "learning_rate": 0.0001809934042266792, + "loss": 3.7047, + "step": 3339 + }, + { + "epoch": 4.27136, + "grad_norm": 0.5047063231468201, + "learning_rate": 0.00018095302194104184, + "loss": 3.7542, + "step": 3340 + }, + { + "epoch": 4.27264, + "grad_norm": 0.4655781388282776, + "learning_rate": 0.00018091263965540447, + "loss": 3.7705, + "step": 3341 + }, + { + "epoch": 4.27392, + "grad_norm": 0.4966285228729248, + "learning_rate": 0.00018087225736976713, + "loss": 3.6768, + "step": 3342 + }, + { + "epoch": 4.2752, + "grad_norm": 0.4942556321620941, + "learning_rate": 0.00018083187508412976, + "loss": 3.593, + "step": 3343 + }, + { + "epoch": 4.27648, + "grad_norm": 0.517810046672821, + "learning_rate": 0.00018079149279849239, + "loss": 3.6881, + "step": 3344 + }, + { + "epoch": 4.27776, + "grad_norm": 0.4855895936489105, + "learning_rate": 0.000180751110512855, + "loss": 3.6054, + "step": 3345 + }, + { + "epoch": 4.27904, + "grad_norm": 0.5209439396858215, + "learning_rate": 0.00018071072822721767, + "loss": 3.6611, + "step": 3346 + }, + { + "epoch": 4.28032, + "grad_norm": 0.5114684700965881, + "learning_rate": 0.00018067034594158028, + "loss": 3.6896, + "step": 3347 + }, + { + "epoch": 4.2816, + "grad_norm": 0.49628081917762756, + "learning_rate": 0.0001806299636559429, + "loss": 3.7266, + "step": 3348 + }, + { + "epoch": 4.2828800000000005, + "grad_norm": 0.4987524747848511, + "learning_rate": 0.00018058958137030554, + "loss": 3.6737, + "step": 3349 + }, + { + "epoch": 4.28416, + "grad_norm": 0.4812726676464081, + "learning_rate": 0.00018054919908466817, + "loss": 3.6078, + "step": 3350 + }, + { + "epoch": 4.28544, + "grad_norm": 0.5009292960166931, + "learning_rate": 0.00018050881679903082, + "loss": 3.7269, + "step": 3351 + }, + { + "epoch": 4.28672, + "grad_norm": 0.5155140161514282, + "learning_rate": 0.00018046843451339345, + "loss": 3.6942, + "step": 3352 + }, + { + "epoch": 4.288, + "grad_norm": 0.5101702809333801, + "learning_rate": 0.00018042805222775608, + "loss": 3.7346, + "step": 3353 + }, + { + "epoch": 4.28928, + "grad_norm": 0.49652808904647827, + "learning_rate": 0.0001803876699421187, + "loss": 3.6923, + "step": 3354 + }, + { + "epoch": 4.29056, + "grad_norm": 0.5246519446372986, + "learning_rate": 0.00018034728765648134, + "loss": 3.6841, + "step": 3355 + }, + { + "epoch": 4.29184, + "grad_norm": 0.49491071701049805, + "learning_rate": 0.00018030690537084397, + "loss": 3.649, + "step": 3356 + }, + { + "epoch": 4.29312, + "grad_norm": 0.5071698427200317, + "learning_rate": 0.0001802665230852066, + "loss": 3.7119, + "step": 3357 + }, + { + "epoch": 4.2943999999999996, + "grad_norm": 0.48967668414115906, + "learning_rate": 0.00018022614079956923, + "loss": 3.6469, + "step": 3358 + }, + { + "epoch": 4.29568, + "grad_norm": 0.502606987953186, + "learning_rate": 0.0001801857585139319, + "loss": 3.7297, + "step": 3359 + }, + { + "epoch": 4.29696, + "grad_norm": 0.5163201689720154, + "learning_rate": 0.00018014537622829452, + "loss": 3.7605, + "step": 3360 + }, + { + "epoch": 4.29824, + "grad_norm": 0.500606894493103, + "learning_rate": 0.00018010499394265715, + "loss": 3.7258, + "step": 3361 + }, + { + "epoch": 4.29952, + "grad_norm": 0.4911852180957794, + "learning_rate": 0.00018006461165701975, + "loss": 3.7299, + "step": 3362 + }, + { + "epoch": 4.3008, + "grad_norm": 0.494475781917572, + "learning_rate": 0.00018002422937138238, + "loss": 3.6581, + "step": 3363 + }, + { + "epoch": 4.30208, + "grad_norm": 0.49037453532218933, + "learning_rate": 0.00017998384708574504, + "loss": 3.7097, + "step": 3364 + }, + { + "epoch": 4.30336, + "grad_norm": 0.4940754175186157, + "learning_rate": 0.00017994346480010767, + "loss": 3.7747, + "step": 3365 + }, + { + "epoch": 4.30464, + "grad_norm": 0.4911755323410034, + "learning_rate": 0.0001799030825144703, + "loss": 3.7565, + "step": 3366 + }, + { + "epoch": 4.30592, + "grad_norm": 0.473459392786026, + "learning_rate": 0.00017986270022883293, + "loss": 3.7478, + "step": 3367 + }, + { + "epoch": 4.3072, + "grad_norm": 0.4972091317176819, + "learning_rate": 0.0001798223179431956, + "loss": 3.5687, + "step": 3368 + }, + { + "epoch": 4.30848, + "grad_norm": 0.475479394197464, + "learning_rate": 0.00017978193565755822, + "loss": 3.6893, + "step": 3369 + }, + { + "epoch": 4.30976, + "grad_norm": 0.4981231987476349, + "learning_rate": 0.00017974155337192082, + "loss": 3.6605, + "step": 3370 + }, + { + "epoch": 4.31104, + "grad_norm": 0.4935183823108673, + "learning_rate": 0.00017970117108628345, + "loss": 3.6837, + "step": 3371 + }, + { + "epoch": 4.31232, + "grad_norm": 0.491040974855423, + "learning_rate": 0.0001796607888006461, + "loss": 3.6831, + "step": 3372 + }, + { + "epoch": 4.3136, + "grad_norm": 0.48840728402137756, + "learning_rate": 0.00017962040651500874, + "loss": 3.7186, + "step": 3373 + }, + { + "epoch": 4.31488, + "grad_norm": 0.49155256152153015, + "learning_rate": 0.00017958002422937137, + "loss": 3.7328, + "step": 3374 + }, + { + "epoch": 4.31616, + "grad_norm": 0.4962817132472992, + "learning_rate": 0.000179539641943734, + "loss": 3.7796, + "step": 3375 + }, + { + "epoch": 4.31744, + "grad_norm": 0.47991764545440674, + "learning_rate": 0.00017949925965809663, + "loss": 3.6687, + "step": 3376 + }, + { + "epoch": 4.31872, + "grad_norm": 0.4865191876888275, + "learning_rate": 0.0001794588773724593, + "loss": 3.6614, + "step": 3377 + }, + { + "epoch": 4.32, + "grad_norm": 0.4849129617214203, + "learning_rate": 0.0001794184950868219, + "loss": 3.6637, + "step": 3378 + }, + { + "epoch": 4.32128, + "grad_norm": 0.49385684728622437, + "learning_rate": 0.00017937811280118452, + "loss": 3.6891, + "step": 3379 + }, + { + "epoch": 4.32256, + "grad_norm": 0.5029520988464355, + "learning_rate": 0.00017933773051554715, + "loss": 3.6901, + "step": 3380 + }, + { + "epoch": 4.32384, + "grad_norm": 0.4933001399040222, + "learning_rate": 0.0001792973482299098, + "loss": 3.6471, + "step": 3381 + }, + { + "epoch": 4.32512, + "grad_norm": 0.5136417746543884, + "learning_rate": 0.00017925696594427244, + "loss": 3.6926, + "step": 3382 + }, + { + "epoch": 4.3264, + "grad_norm": 0.5117876529693604, + "learning_rate": 0.00017921658365863507, + "loss": 3.7587, + "step": 3383 + }, + { + "epoch": 4.32768, + "grad_norm": 0.5118081569671631, + "learning_rate": 0.0001791762013729977, + "loss": 3.7112, + "step": 3384 + }, + { + "epoch": 4.32896, + "grad_norm": 0.5017783641815186, + "learning_rate": 0.00017913581908736035, + "loss": 3.6705, + "step": 3385 + }, + { + "epoch": 4.33024, + "grad_norm": 0.5022070407867432, + "learning_rate": 0.00017909543680172296, + "loss": 3.7245, + "step": 3386 + }, + { + "epoch": 4.33152, + "grad_norm": 0.4982999265193939, + "learning_rate": 0.0001790550545160856, + "loss": 3.6153, + "step": 3387 + }, + { + "epoch": 4.3328, + "grad_norm": 0.49681100249290466, + "learning_rate": 0.00017901467223044822, + "loss": 3.7025, + "step": 3388 + }, + { + "epoch": 4.33408, + "grad_norm": 0.5171028971672058, + "learning_rate": 0.00017897428994481085, + "loss": 3.7245, + "step": 3389 + }, + { + "epoch": 4.33536, + "grad_norm": 0.5349324345588684, + "learning_rate": 0.0001789339076591735, + "loss": 3.6531, + "step": 3390 + }, + { + "epoch": 4.33664, + "grad_norm": 0.512718677520752, + "learning_rate": 0.00017889352537353614, + "loss": 3.709, + "step": 3391 + }, + { + "epoch": 4.33792, + "grad_norm": 0.4975050687789917, + "learning_rate": 0.00017885314308789877, + "loss": 3.7275, + "step": 3392 + }, + { + "epoch": 4.3392, + "grad_norm": 0.49178048968315125, + "learning_rate": 0.00017881276080226137, + "loss": 3.721, + "step": 3393 + }, + { + "epoch": 4.34048, + "grad_norm": 0.5306468605995178, + "learning_rate": 0.00017877237851662403, + "loss": 3.7299, + "step": 3394 + }, + { + "epoch": 4.34176, + "grad_norm": 0.5229880213737488, + "learning_rate": 0.00017873199623098666, + "loss": 3.694, + "step": 3395 + }, + { + "epoch": 4.34304, + "grad_norm": 0.4882638156414032, + "learning_rate": 0.00017869161394534929, + "loss": 3.6635, + "step": 3396 + }, + { + "epoch": 4.34432, + "grad_norm": 0.5192221403121948, + "learning_rate": 0.00017865123165971192, + "loss": 3.6414, + "step": 3397 + }, + { + "epoch": 4.3456, + "grad_norm": 0.5011906027793884, + "learning_rate": 0.00017861084937407457, + "loss": 3.7079, + "step": 3398 + }, + { + "epoch": 4.34688, + "grad_norm": 0.49621590971946716, + "learning_rate": 0.0001785704670884372, + "loss": 3.7085, + "step": 3399 + }, + { + "epoch": 4.34816, + "grad_norm": 0.5203524827957153, + "learning_rate": 0.00017853008480279983, + "loss": 3.6853, + "step": 3400 + }, + { + "epoch": 4.3494399999999995, + "grad_norm": 0.5072999596595764, + "learning_rate": 0.00017848970251716244, + "loss": 3.6291, + "step": 3401 + }, + { + "epoch": 4.35072, + "grad_norm": 0.4849739670753479, + "learning_rate": 0.00017844932023152507, + "loss": 3.6109, + "step": 3402 + }, + { + "epoch": 4.352, + "grad_norm": 0.510128915309906, + "learning_rate": 0.00017840893794588772, + "loss": 3.7104, + "step": 3403 + }, + { + "epoch": 4.35328, + "grad_norm": 0.4891580045223236, + "learning_rate": 0.00017836855566025035, + "loss": 3.6278, + "step": 3404 + }, + { + "epoch": 4.35456, + "grad_norm": 0.5085670948028564, + "learning_rate": 0.00017832817337461298, + "loss": 3.7082, + "step": 3405 + }, + { + "epoch": 4.35584, + "grad_norm": 0.49909740686416626, + "learning_rate": 0.0001782877910889756, + "loss": 3.6073, + "step": 3406 + }, + { + "epoch": 4.35712, + "grad_norm": 0.49702468514442444, + "learning_rate": 0.00017824740880333827, + "loss": 3.6564, + "step": 3407 + }, + { + "epoch": 4.3584, + "grad_norm": 0.5047982335090637, + "learning_rate": 0.0001782070265177009, + "loss": 3.7321, + "step": 3408 + }, + { + "epoch": 4.35968, + "grad_norm": 0.4968804717063904, + "learning_rate": 0.0001781666442320635, + "loss": 3.6783, + "step": 3409 + }, + { + "epoch": 4.36096, + "grad_norm": 0.5167127251625061, + "learning_rate": 0.00017812626194642613, + "loss": 3.6759, + "step": 3410 + }, + { + "epoch": 4.36224, + "grad_norm": 0.4954107403755188, + "learning_rate": 0.0001780858796607888, + "loss": 3.7047, + "step": 3411 + }, + { + "epoch": 4.36352, + "grad_norm": 0.519900381565094, + "learning_rate": 0.00017804549737515142, + "loss": 3.629, + "step": 3412 + }, + { + "epoch": 4.3648, + "grad_norm": 0.5000714659690857, + "learning_rate": 0.00017800511508951405, + "loss": 3.6723, + "step": 3413 + }, + { + "epoch": 4.36608, + "grad_norm": 0.49087822437286377, + "learning_rate": 0.00017796473280387668, + "loss": 3.7102, + "step": 3414 + }, + { + "epoch": 4.36736, + "grad_norm": 0.5202825665473938, + "learning_rate": 0.0001779243505182393, + "loss": 3.6964, + "step": 3415 + }, + { + "epoch": 4.36864, + "grad_norm": 0.5085675716400146, + "learning_rate": 0.00017788396823260197, + "loss": 3.7475, + "step": 3416 + }, + { + "epoch": 4.3699200000000005, + "grad_norm": 0.5006064176559448, + "learning_rate": 0.00017784358594696457, + "loss": 3.7466, + "step": 3417 + }, + { + "epoch": 4.3712, + "grad_norm": 0.4939427375793457, + "learning_rate": 0.0001778032036613272, + "loss": 3.7078, + "step": 3418 + }, + { + "epoch": 4.37248, + "grad_norm": 0.4983762204647064, + "learning_rate": 0.00017776282137568983, + "loss": 3.728, + "step": 3419 + }, + { + "epoch": 4.37376, + "grad_norm": 0.48928341269493103, + "learning_rate": 0.0001777224390900525, + "loss": 3.6844, + "step": 3420 + }, + { + "epoch": 4.37504, + "grad_norm": 0.47871965169906616, + "learning_rate": 0.00017768205680441512, + "loss": 3.7157, + "step": 3421 + }, + { + "epoch": 4.37632, + "grad_norm": 0.5062506794929504, + "learning_rate": 0.00017764167451877775, + "loss": 3.6605, + "step": 3422 + }, + { + "epoch": 4.3776, + "grad_norm": 0.47811704874038696, + "learning_rate": 0.00017760129223314038, + "loss": 3.6821, + "step": 3423 + }, + { + "epoch": 4.37888, + "grad_norm": 0.510657787322998, + "learning_rate": 0.00017756090994750304, + "loss": 3.6808, + "step": 3424 + }, + { + "epoch": 4.38016, + "grad_norm": 0.4923449754714966, + "learning_rate": 0.00017752052766186564, + "loss": 3.7051, + "step": 3425 + }, + { + "epoch": 4.38144, + "grad_norm": 0.5173118710517883, + "learning_rate": 0.00017748014537622827, + "loss": 3.7312, + "step": 3426 + }, + { + "epoch": 4.38272, + "grad_norm": 0.4835803508758545, + "learning_rate": 0.0001774397630905909, + "loss": 3.5541, + "step": 3427 + }, + { + "epoch": 4.384, + "grad_norm": 0.5001106262207031, + "learning_rate": 0.00017739938080495356, + "loss": 3.6304, + "step": 3428 + }, + { + "epoch": 4.38528, + "grad_norm": 0.4953722059726715, + "learning_rate": 0.00017735899851931619, + "loss": 3.7689, + "step": 3429 + }, + { + "epoch": 4.38656, + "grad_norm": 0.4921940863132477, + "learning_rate": 0.00017731861623367882, + "loss": 3.7265, + "step": 3430 + }, + { + "epoch": 4.38784, + "grad_norm": 0.5121296048164368, + "learning_rate": 0.00017727823394804145, + "loss": 3.7272, + "step": 3431 + }, + { + "epoch": 4.38912, + "grad_norm": 0.48463988304138184, + "learning_rate": 0.00017723785166240405, + "loss": 3.6316, + "step": 3432 + }, + { + "epoch": 4.3904, + "grad_norm": 0.48769861459732056, + "learning_rate": 0.00017719746937676673, + "loss": 3.6833, + "step": 3433 + }, + { + "epoch": 4.39168, + "grad_norm": 0.5136172771453857, + "learning_rate": 0.00017715708709112934, + "loss": 3.6876, + "step": 3434 + }, + { + "epoch": 4.39296, + "grad_norm": 0.4963926672935486, + "learning_rate": 0.00017711670480549197, + "loss": 3.7335, + "step": 3435 + }, + { + "epoch": 4.39424, + "grad_norm": 0.49306657910346985, + "learning_rate": 0.0001770763225198546, + "loss": 3.7481, + "step": 3436 + }, + { + "epoch": 4.39552, + "grad_norm": 0.5056197643280029, + "learning_rate": 0.00017703594023421725, + "loss": 3.6287, + "step": 3437 + }, + { + "epoch": 4.3968, + "grad_norm": 0.47748926281929016, + "learning_rate": 0.00017699555794857988, + "loss": 3.6423, + "step": 3438 + }, + { + "epoch": 4.39808, + "grad_norm": 0.5189681053161621, + "learning_rate": 0.00017695517566294251, + "loss": 3.6958, + "step": 3439 + }, + { + "epoch": 4.39936, + "grad_norm": 0.48089975118637085, + "learning_rate": 0.00017691479337730514, + "loss": 3.6822, + "step": 3440 + }, + { + "epoch": 4.40064, + "grad_norm": 0.4775058925151825, + "learning_rate": 0.0001768744110916678, + "loss": 3.7351, + "step": 3441 + }, + { + "epoch": 4.40192, + "grad_norm": 0.5030752420425415, + "learning_rate": 0.0001768340288060304, + "loss": 3.6806, + "step": 3442 + }, + { + "epoch": 4.4032, + "grad_norm": 0.4860950708389282, + "learning_rate": 0.00017679364652039303, + "loss": 3.6407, + "step": 3443 + }, + { + "epoch": 4.40448, + "grad_norm": 0.4938212037086487, + "learning_rate": 0.00017675326423475566, + "loss": 3.6792, + "step": 3444 + }, + { + "epoch": 4.40576, + "grad_norm": 0.5040499567985535, + "learning_rate": 0.0001767128819491183, + "loss": 3.6686, + "step": 3445 + }, + { + "epoch": 4.40704, + "grad_norm": 0.4846273362636566, + "learning_rate": 0.00017667249966348095, + "loss": 3.6957, + "step": 3446 + }, + { + "epoch": 4.40832, + "grad_norm": 0.49627983570098877, + "learning_rate": 0.00017663211737784358, + "loss": 3.7428, + "step": 3447 + }, + { + "epoch": 4.4096, + "grad_norm": 0.4737420380115509, + "learning_rate": 0.0001765917350922062, + "loss": 3.6932, + "step": 3448 + }, + { + "epoch": 4.41088, + "grad_norm": 0.5013241171836853, + "learning_rate": 0.00017655135280656881, + "loss": 3.7328, + "step": 3449 + }, + { + "epoch": 4.41216, + "grad_norm": 0.46999281644821167, + "learning_rate": 0.00017651097052093147, + "loss": 3.7462, + "step": 3450 + }, + { + "epoch": 4.41344, + "grad_norm": 0.5018439888954163, + "learning_rate": 0.0001764705882352941, + "loss": 3.684, + "step": 3451 + }, + { + "epoch": 4.41472, + "grad_norm": 0.49331605434417725, + "learning_rate": 0.00017643020594965673, + "loss": 3.7176, + "step": 3452 + }, + { + "epoch": 4.416, + "grad_norm": 0.48275622725486755, + "learning_rate": 0.00017638982366401936, + "loss": 3.6986, + "step": 3453 + }, + { + "epoch": 4.41728, + "grad_norm": 0.48251670598983765, + "learning_rate": 0.00017634944137838202, + "loss": 3.6331, + "step": 3454 + }, + { + "epoch": 4.41856, + "grad_norm": 0.487105131149292, + "learning_rate": 0.00017630905909274465, + "loss": 3.5988, + "step": 3455 + }, + { + "epoch": 4.41984, + "grad_norm": 0.4950862526893616, + "learning_rate": 0.00017626867680710728, + "loss": 3.6718, + "step": 3456 + }, + { + "epoch": 4.42112, + "grad_norm": 0.4823586344718933, + "learning_rate": 0.00017622829452146988, + "loss": 3.6558, + "step": 3457 + }, + { + "epoch": 4.4224, + "grad_norm": 0.49396684765815735, + "learning_rate": 0.0001761879122358325, + "loss": 3.6592, + "step": 3458 + }, + { + "epoch": 4.42368, + "grad_norm": 0.5019631385803223, + "learning_rate": 0.00017614752995019517, + "loss": 3.683, + "step": 3459 + }, + { + "epoch": 4.4249600000000004, + "grad_norm": 0.4832041263580322, + "learning_rate": 0.0001761071476645578, + "loss": 3.6948, + "step": 3460 + }, + { + "epoch": 4.42624, + "grad_norm": 0.5026285648345947, + "learning_rate": 0.00017606676537892043, + "loss": 3.7162, + "step": 3461 + }, + { + "epoch": 4.42752, + "grad_norm": 0.478025883436203, + "learning_rate": 0.00017602638309328306, + "loss": 3.6677, + "step": 3462 + }, + { + "epoch": 4.4288, + "grad_norm": 0.4910399913787842, + "learning_rate": 0.00017598600080764572, + "loss": 3.7119, + "step": 3463 + }, + { + "epoch": 4.43008, + "grad_norm": 0.4960950016975403, + "learning_rate": 0.00017594561852200835, + "loss": 3.7209, + "step": 3464 + }, + { + "epoch": 4.43136, + "grad_norm": 0.4814871549606323, + "learning_rate": 0.00017590523623637095, + "loss": 3.6908, + "step": 3465 + }, + { + "epoch": 4.43264, + "grad_norm": 0.48105382919311523, + "learning_rate": 0.00017586485395073358, + "loss": 3.742, + "step": 3466 + }, + { + "epoch": 4.43392, + "grad_norm": 0.4949689209461212, + "learning_rate": 0.00017582447166509624, + "loss": 3.6408, + "step": 3467 + }, + { + "epoch": 4.4352, + "grad_norm": 0.4880666732788086, + "learning_rate": 0.00017578408937945887, + "loss": 3.6666, + "step": 3468 + }, + { + "epoch": 4.4364799999999995, + "grad_norm": 0.4796549081802368, + "learning_rate": 0.0001757437070938215, + "loss": 3.6802, + "step": 3469 + }, + { + "epoch": 4.43776, + "grad_norm": 0.5090510845184326, + "learning_rate": 0.00017570332480818413, + "loss": 3.727, + "step": 3470 + }, + { + "epoch": 4.43904, + "grad_norm": 0.48752468824386597, + "learning_rate": 0.00017566294252254676, + "loss": 3.694, + "step": 3471 + }, + { + "epoch": 4.44032, + "grad_norm": 0.49315810203552246, + "learning_rate": 0.00017562256023690942, + "loss": 3.6637, + "step": 3472 + }, + { + "epoch": 4.4416, + "grad_norm": 0.4910982549190521, + "learning_rate": 0.00017558217795127202, + "loss": 3.697, + "step": 3473 + }, + { + "epoch": 4.44288, + "grad_norm": 0.48540931940078735, + "learning_rate": 0.00017554179566563465, + "loss": 3.6457, + "step": 3474 + }, + { + "epoch": 4.44416, + "grad_norm": 0.5137031674385071, + "learning_rate": 0.00017550141337999728, + "loss": 3.6628, + "step": 3475 + }, + { + "epoch": 4.44544, + "grad_norm": 0.4871312379837036, + "learning_rate": 0.00017546103109435994, + "loss": 3.685, + "step": 3476 + }, + { + "epoch": 4.44672, + "grad_norm": 0.48285388946533203, + "learning_rate": 0.00017542064880872257, + "loss": 3.655, + "step": 3477 + }, + { + "epoch": 4.448, + "grad_norm": 0.49935588240623474, + "learning_rate": 0.0001753802665230852, + "loss": 3.6451, + "step": 3478 + }, + { + "epoch": 4.44928, + "grad_norm": 0.49114716053009033, + "learning_rate": 0.00017533988423744783, + "loss": 3.6701, + "step": 3479 + }, + { + "epoch": 4.45056, + "grad_norm": 0.4787088632583618, + "learning_rate": 0.00017529950195181048, + "loss": 3.6956, + "step": 3480 + }, + { + "epoch": 4.45184, + "grad_norm": 0.48162198066711426, + "learning_rate": 0.00017525911966617309, + "loss": 3.6084, + "step": 3481 + }, + { + "epoch": 4.45312, + "grad_norm": 0.49390703439712524, + "learning_rate": 0.00017521873738053572, + "loss": 3.6884, + "step": 3482 + }, + { + "epoch": 4.4544, + "grad_norm": 0.4952353835105896, + "learning_rate": 0.00017517835509489835, + "loss": 3.6476, + "step": 3483 + }, + { + "epoch": 4.45568, + "grad_norm": 0.48062050342559814, + "learning_rate": 0.00017513797280926098, + "loss": 3.6257, + "step": 3484 + }, + { + "epoch": 4.45696, + "grad_norm": 0.5192123055458069, + "learning_rate": 0.00017509759052362363, + "loss": 3.6835, + "step": 3485 + }, + { + "epoch": 4.45824, + "grad_norm": 0.4845411777496338, + "learning_rate": 0.00017505720823798626, + "loss": 3.6836, + "step": 3486 + }, + { + "epoch": 4.45952, + "grad_norm": 0.47390422224998474, + "learning_rate": 0.0001750168259523489, + "loss": 3.6174, + "step": 3487 + }, + { + "epoch": 4.4608, + "grad_norm": 0.5086140036582947, + "learning_rate": 0.0001749764436667115, + "loss": 3.7345, + "step": 3488 + }, + { + "epoch": 4.46208, + "grad_norm": 0.49887514114379883, + "learning_rate": 0.00017493606138107415, + "loss": 3.7634, + "step": 3489 + }, + { + "epoch": 4.46336, + "grad_norm": 0.5016082525253296, + "learning_rate": 0.00017489567909543678, + "loss": 3.753, + "step": 3490 + }, + { + "epoch": 4.46464, + "grad_norm": 0.5016016960144043, + "learning_rate": 0.00017485529680979941, + "loss": 3.649, + "step": 3491 + }, + { + "epoch": 4.46592, + "grad_norm": 0.5083345174789429, + "learning_rate": 0.00017481491452416204, + "loss": 3.7257, + "step": 3492 + }, + { + "epoch": 4.4672, + "grad_norm": 0.4881742298603058, + "learning_rate": 0.0001747745322385247, + "loss": 3.7086, + "step": 3493 + }, + { + "epoch": 4.46848, + "grad_norm": 0.5043129920959473, + "learning_rate": 0.00017473414995288733, + "loss": 3.7117, + "step": 3494 + }, + { + "epoch": 4.46976, + "grad_norm": 0.5088573098182678, + "learning_rate": 0.00017469376766724996, + "loss": 3.6652, + "step": 3495 + }, + { + "epoch": 4.47104, + "grad_norm": 0.5027455687522888, + "learning_rate": 0.00017465338538161256, + "loss": 3.7306, + "step": 3496 + }, + { + "epoch": 4.47232, + "grad_norm": 0.49364176392555237, + "learning_rate": 0.0001746130030959752, + "loss": 3.7809, + "step": 3497 + }, + { + "epoch": 4.4736, + "grad_norm": 0.4937020242214203, + "learning_rate": 0.00017457262081033785, + "loss": 3.7053, + "step": 3498 + }, + { + "epoch": 4.47488, + "grad_norm": 0.47408318519592285, + "learning_rate": 0.00017453223852470048, + "loss": 3.6676, + "step": 3499 + }, + { + "epoch": 4.47616, + "grad_norm": 0.4941287338733673, + "learning_rate": 0.0001744918562390631, + "loss": 3.6673, + "step": 3500 + }, + { + "epoch": 4.47744, + "grad_norm": 0.4902247190475464, + "learning_rate": 0.00017445147395342574, + "loss": 3.6797, + "step": 3501 + }, + { + "epoch": 4.47872, + "grad_norm": 0.4781825840473175, + "learning_rate": 0.0001744110916677884, + "loss": 3.767, + "step": 3502 + }, + { + "epoch": 4.48, + "grad_norm": 0.5159146189689636, + "learning_rate": 0.00017437070938215103, + "loss": 3.6403, + "step": 3503 + }, + { + "epoch": 4.48128, + "grad_norm": 0.49446627497673035, + "learning_rate": 0.00017433032709651363, + "loss": 3.6994, + "step": 3504 + }, + { + "epoch": 4.48256, + "grad_norm": 0.49100860953330994, + "learning_rate": 0.00017428994481087626, + "loss": 3.6639, + "step": 3505 + }, + { + "epoch": 4.48384, + "grad_norm": 0.495257705450058, + "learning_rate": 0.00017424956252523892, + "loss": 3.6325, + "step": 3506 + }, + { + "epoch": 4.48512, + "grad_norm": 0.4940941333770752, + "learning_rate": 0.00017420918023960155, + "loss": 3.7, + "step": 3507 + }, + { + "epoch": 4.4864, + "grad_norm": 0.4889221489429474, + "learning_rate": 0.00017416879795396418, + "loss": 3.6322, + "step": 3508 + }, + { + "epoch": 4.48768, + "grad_norm": 0.49470090866088867, + "learning_rate": 0.0001741284156683268, + "loss": 3.6749, + "step": 3509 + }, + { + "epoch": 4.48896, + "grad_norm": 0.48783206939697266, + "learning_rate": 0.00017408803338268944, + "loss": 3.6823, + "step": 3510 + }, + { + "epoch": 4.49024, + "grad_norm": 0.47950899600982666, + "learning_rate": 0.0001740476510970521, + "loss": 3.7092, + "step": 3511 + }, + { + "epoch": 4.49152, + "grad_norm": 0.49544283747673035, + "learning_rate": 0.0001740072688114147, + "loss": 3.6669, + "step": 3512 + }, + { + "epoch": 4.4928, + "grad_norm": 0.5024276375770569, + "learning_rate": 0.00017396688652577733, + "loss": 3.7114, + "step": 3513 + }, + { + "epoch": 4.49408, + "grad_norm": 0.49346235394477844, + "learning_rate": 0.00017392650424013996, + "loss": 3.684, + "step": 3514 + }, + { + "epoch": 4.49536, + "grad_norm": 0.4775454103946686, + "learning_rate": 0.00017388612195450262, + "loss": 3.6177, + "step": 3515 + }, + { + "epoch": 4.49664, + "grad_norm": 0.5153406858444214, + "learning_rate": 0.00017384573966886525, + "loss": 3.6474, + "step": 3516 + }, + { + "epoch": 4.49792, + "grad_norm": 0.5022422075271606, + "learning_rate": 0.00017380535738322788, + "loss": 3.6711, + "step": 3517 + }, + { + "epoch": 4.4992, + "grad_norm": 0.49757120013237, + "learning_rate": 0.0001737649750975905, + "loss": 3.6771, + "step": 3518 + }, + { + "epoch": 4.50048, + "grad_norm": 0.49992772936820984, + "learning_rate": 0.00017372459281195316, + "loss": 3.5962, + "step": 3519 + }, + { + "epoch": 4.50176, + "grad_norm": 0.5268425345420837, + "learning_rate": 0.0001736842105263158, + "loss": 3.7249, + "step": 3520 + }, + { + "epoch": 4.50304, + "grad_norm": 0.4990698993206024, + "learning_rate": 0.0001736438282406784, + "loss": 3.6477, + "step": 3521 + }, + { + "epoch": 4.50432, + "grad_norm": 0.49010327458381653, + "learning_rate": 0.00017360344595504103, + "loss": 3.6864, + "step": 3522 + }, + { + "epoch": 4.5056, + "grad_norm": 0.5165103673934937, + "learning_rate": 0.00017356306366940366, + "loss": 3.8092, + "step": 3523 + }, + { + "epoch": 4.50688, + "grad_norm": 0.48552533984184265, + "learning_rate": 0.00017352268138376631, + "loss": 3.6701, + "step": 3524 + }, + { + "epoch": 4.50816, + "grad_norm": 0.501692533493042, + "learning_rate": 0.00017348229909812894, + "loss": 3.6105, + "step": 3525 + }, + { + "epoch": 4.50944, + "grad_norm": 0.5025333762168884, + "learning_rate": 0.00017344191681249157, + "loss": 3.6874, + "step": 3526 + }, + { + "epoch": 4.51072, + "grad_norm": 0.4903654456138611, + "learning_rate": 0.00017340153452685418, + "loss": 3.664, + "step": 3527 + }, + { + "epoch": 4.5120000000000005, + "grad_norm": 0.4915325343608856, + "learning_rate": 0.00017336115224121686, + "loss": 3.7199, + "step": 3528 + }, + { + "epoch": 4.51328, + "grad_norm": 0.5007879734039307, + "learning_rate": 0.00017332076995557946, + "loss": 3.6746, + "step": 3529 + }, + { + "epoch": 4.51456, + "grad_norm": 0.48502570390701294, + "learning_rate": 0.0001732803876699421, + "loss": 3.6661, + "step": 3530 + }, + { + "epoch": 4.51584, + "grad_norm": 0.5233677625656128, + "learning_rate": 0.00017324000538430473, + "loss": 3.6716, + "step": 3531 + }, + { + "epoch": 4.51712, + "grad_norm": 0.5040356516838074, + "learning_rate": 0.00017319962309866738, + "loss": 3.656, + "step": 3532 + }, + { + "epoch": 4.5184, + "grad_norm": 0.5123605132102966, + "learning_rate": 0.00017315924081303, + "loss": 3.6736, + "step": 3533 + }, + { + "epoch": 4.51968, + "grad_norm": 0.48310691118240356, + "learning_rate": 0.00017311885852739264, + "loss": 3.667, + "step": 3534 + }, + { + "epoch": 4.52096, + "grad_norm": 0.48786744475364685, + "learning_rate": 0.00017307847624175527, + "loss": 3.7278, + "step": 3535 + }, + { + "epoch": 4.52224, + "grad_norm": 0.4786374270915985, + "learning_rate": 0.00017303809395611788, + "loss": 3.6351, + "step": 3536 + }, + { + "epoch": 4.5235199999999995, + "grad_norm": 0.49251246452331543, + "learning_rate": 0.00017299771167048053, + "loss": 3.6494, + "step": 3537 + }, + { + "epoch": 4.5248, + "grad_norm": 0.4909958243370056, + "learning_rate": 0.00017295732938484316, + "loss": 3.6081, + "step": 3538 + }, + { + "epoch": 4.52608, + "grad_norm": 0.49989423155784607, + "learning_rate": 0.0001729169470992058, + "loss": 3.6365, + "step": 3539 + }, + { + "epoch": 4.52736, + "grad_norm": 0.5115346312522888, + "learning_rate": 0.00017287656481356842, + "loss": 3.6791, + "step": 3540 + }, + { + "epoch": 4.52864, + "grad_norm": 0.49784228205680847, + "learning_rate": 0.00017283618252793108, + "loss": 3.6783, + "step": 3541 + }, + { + "epoch": 4.52992, + "grad_norm": 0.5055098533630371, + "learning_rate": 0.0001727958002422937, + "loss": 3.7022, + "step": 3542 + }, + { + "epoch": 4.5312, + "grad_norm": 0.5355752110481262, + "learning_rate": 0.00017275541795665634, + "loss": 3.7565, + "step": 3543 + }, + { + "epoch": 4.53248, + "grad_norm": 0.5136594176292419, + "learning_rate": 0.00017271503567101894, + "loss": 3.6853, + "step": 3544 + }, + { + "epoch": 4.53376, + "grad_norm": 0.502863347530365, + "learning_rate": 0.0001726746533853816, + "loss": 3.7711, + "step": 3545 + }, + { + "epoch": 4.53504, + "grad_norm": 0.5095115303993225, + "learning_rate": 0.00017263427109974423, + "loss": 3.6232, + "step": 3546 + }, + { + "epoch": 4.53632, + "grad_norm": 0.5266593098640442, + "learning_rate": 0.00017259388881410686, + "loss": 3.7058, + "step": 3547 + }, + { + "epoch": 4.5376, + "grad_norm": 0.5122666358947754, + "learning_rate": 0.0001725535065284695, + "loss": 3.6821, + "step": 3548 + }, + { + "epoch": 4.53888, + "grad_norm": 0.5304042100906372, + "learning_rate": 0.00017251312424283212, + "loss": 3.7056, + "step": 3549 + }, + { + "epoch": 4.54016, + "grad_norm": 0.495039165019989, + "learning_rate": 0.00017247274195719478, + "loss": 3.64, + "step": 3550 + }, + { + "epoch": 4.54144, + "grad_norm": 0.5002861618995667, + "learning_rate": 0.0001724323596715574, + "loss": 3.628, + "step": 3551 + }, + { + "epoch": 4.54272, + "grad_norm": 0.47562626004219055, + "learning_rate": 0.00017239197738592, + "loss": 3.6679, + "step": 3552 + }, + { + "epoch": 4.5440000000000005, + "grad_norm": 0.4953966438770294, + "learning_rate": 0.00017235159510028264, + "loss": 3.6984, + "step": 3553 + }, + { + "epoch": 4.54528, + "grad_norm": 0.4846183955669403, + "learning_rate": 0.0001723112128146453, + "loss": 3.7035, + "step": 3554 + }, + { + "epoch": 4.54656, + "grad_norm": 0.4903351068496704, + "learning_rate": 0.00017227083052900793, + "loss": 3.6948, + "step": 3555 + }, + { + "epoch": 4.54784, + "grad_norm": 0.5202940702438354, + "learning_rate": 0.00017223044824337056, + "loss": 3.7083, + "step": 3556 + }, + { + "epoch": 4.54912, + "grad_norm": 0.47886836528778076, + "learning_rate": 0.0001721900659577332, + "loss": 3.7083, + "step": 3557 + }, + { + "epoch": 4.5504, + "grad_norm": 0.47739362716674805, + "learning_rate": 0.00017214968367209585, + "loss": 3.6878, + "step": 3558 + }, + { + "epoch": 4.55168, + "grad_norm": 0.4860458970069885, + "learning_rate": 0.00017210930138645848, + "loss": 3.6605, + "step": 3559 + }, + { + "epoch": 4.55296, + "grad_norm": 0.48289328813552856, + "learning_rate": 0.00017206891910082108, + "loss": 3.681, + "step": 3560 + }, + { + "epoch": 4.55424, + "grad_norm": 0.48003146052360535, + "learning_rate": 0.0001720285368151837, + "loss": 3.5914, + "step": 3561 + }, + { + "epoch": 4.55552, + "grad_norm": 0.4859391152858734, + "learning_rate": 0.00017198815452954637, + "loss": 3.6338, + "step": 3562 + }, + { + "epoch": 4.5568, + "grad_norm": 0.5082067251205444, + "learning_rate": 0.000171947772243909, + "loss": 3.6709, + "step": 3563 + }, + { + "epoch": 4.55808, + "grad_norm": 0.5058284401893616, + "learning_rate": 0.00017190738995827163, + "loss": 3.7295, + "step": 3564 + }, + { + "epoch": 4.55936, + "grad_norm": 0.4948490560054779, + "learning_rate": 0.00017186700767263426, + "loss": 3.7362, + "step": 3565 + }, + { + "epoch": 4.56064, + "grad_norm": 0.48526522517204285, + "learning_rate": 0.00017182662538699689, + "loss": 3.6609, + "step": 3566 + }, + { + "epoch": 4.56192, + "grad_norm": 0.49817943572998047, + "learning_rate": 0.00017178624310135954, + "loss": 3.7036, + "step": 3567 + }, + { + "epoch": 4.5632, + "grad_norm": 0.4790593981742859, + "learning_rate": 0.00017174586081572215, + "loss": 3.6634, + "step": 3568 + }, + { + "epoch": 4.56448, + "grad_norm": 0.47792062163352966, + "learning_rate": 0.00017170547853008478, + "loss": 3.688, + "step": 3569 + }, + { + "epoch": 4.56576, + "grad_norm": 0.5085933208465576, + "learning_rate": 0.0001716650962444474, + "loss": 3.6739, + "step": 3570 + }, + { + "epoch": 4.56704, + "grad_norm": 0.4828382730484009, + "learning_rate": 0.00017162471395881006, + "loss": 3.6292, + "step": 3571 + }, + { + "epoch": 4.56832, + "grad_norm": 0.47902053594589233, + "learning_rate": 0.0001715843316731727, + "loss": 3.642, + "step": 3572 + }, + { + "epoch": 4.5696, + "grad_norm": 0.48181986808776855, + "learning_rate": 0.00017154394938753532, + "loss": 3.7166, + "step": 3573 + }, + { + "epoch": 4.57088, + "grad_norm": 0.49542367458343506, + "learning_rate": 0.00017150356710189795, + "loss": 3.6691, + "step": 3574 + }, + { + "epoch": 4.57216, + "grad_norm": 0.48702624440193176, + "learning_rate": 0.0001714631848162606, + "loss": 3.6509, + "step": 3575 + }, + { + "epoch": 4.57344, + "grad_norm": 0.4805475175380707, + "learning_rate": 0.00017142280253062321, + "loss": 3.7377, + "step": 3576 + }, + { + "epoch": 4.57472, + "grad_norm": 0.4737749695777893, + "learning_rate": 0.00017138242024498584, + "loss": 3.7663, + "step": 3577 + }, + { + "epoch": 4.576, + "grad_norm": 0.46877261996269226, + "learning_rate": 0.00017134203795934847, + "loss": 3.7753, + "step": 3578 + }, + { + "epoch": 4.57728, + "grad_norm": 0.4928486943244934, + "learning_rate": 0.0001713016556737111, + "loss": 3.7, + "step": 3579 + }, + { + "epoch": 4.5785599999999995, + "grad_norm": 0.4829467833042145, + "learning_rate": 0.00017126127338807376, + "loss": 3.6577, + "step": 3580 + }, + { + "epoch": 4.57984, + "grad_norm": 0.4743864834308624, + "learning_rate": 0.0001712208911024364, + "loss": 3.7088, + "step": 3581 + }, + { + "epoch": 4.58112, + "grad_norm": 0.4854520261287689, + "learning_rate": 0.00017118050881679902, + "loss": 3.6397, + "step": 3582 + }, + { + "epoch": 4.5824, + "grad_norm": 0.48954036831855774, + "learning_rate": 0.00017114012653116162, + "loss": 3.7009, + "step": 3583 + }, + { + "epoch": 4.58368, + "grad_norm": 0.4740862250328064, + "learning_rate": 0.00017109974424552428, + "loss": 3.6539, + "step": 3584 + }, + { + "epoch": 4.58496, + "grad_norm": 0.5085862278938293, + "learning_rate": 0.0001710593619598869, + "loss": 3.7184, + "step": 3585 + }, + { + "epoch": 4.58624, + "grad_norm": 0.48374059796333313, + "learning_rate": 0.00017101897967424954, + "loss": 3.6622, + "step": 3586 + }, + { + "epoch": 4.58752, + "grad_norm": 0.49264445900917053, + "learning_rate": 0.00017097859738861217, + "loss": 3.7557, + "step": 3587 + }, + { + "epoch": 4.5888, + "grad_norm": 0.5074787735939026, + "learning_rate": 0.00017093821510297483, + "loss": 3.7262, + "step": 3588 + }, + { + "epoch": 4.59008, + "grad_norm": 0.4799599051475525, + "learning_rate": 0.00017089783281733746, + "loss": 3.642, + "step": 3589 + }, + { + "epoch": 4.59136, + "grad_norm": 0.49617815017700195, + "learning_rate": 0.0001708574505317001, + "loss": 3.7215, + "step": 3590 + }, + { + "epoch": 4.59264, + "grad_norm": 0.48311010003089905, + "learning_rate": 0.0001708170682460627, + "loss": 3.683, + "step": 3591 + }, + { + "epoch": 4.59392, + "grad_norm": 0.48525270819664, + "learning_rate": 0.00017077668596042532, + "loss": 3.6323, + "step": 3592 + }, + { + "epoch": 4.5952, + "grad_norm": 0.48173174262046814, + "learning_rate": 0.00017073630367478798, + "loss": 3.7176, + "step": 3593 + }, + { + "epoch": 4.59648, + "grad_norm": 0.4832940101623535, + "learning_rate": 0.0001706959213891506, + "loss": 3.6404, + "step": 3594 + }, + { + "epoch": 4.59776, + "grad_norm": 0.48117658495903015, + "learning_rate": 0.00017065553910351324, + "loss": 3.6617, + "step": 3595 + }, + { + "epoch": 4.5990400000000005, + "grad_norm": 0.4833637475967407, + "learning_rate": 0.00017061515681787587, + "loss": 3.7209, + "step": 3596 + }, + { + "epoch": 4.60032, + "grad_norm": 0.48473599553108215, + "learning_rate": 0.00017057477453223853, + "loss": 3.6349, + "step": 3597 + }, + { + "epoch": 4.6016, + "grad_norm": 0.475893497467041, + "learning_rate": 0.00017053439224660116, + "loss": 3.6932, + "step": 3598 + }, + { + "epoch": 4.60288, + "grad_norm": 0.49287232756614685, + "learning_rate": 0.00017049400996096376, + "loss": 3.6374, + "step": 3599 + }, + { + "epoch": 4.60416, + "grad_norm": 0.47559598088264465, + "learning_rate": 0.0001704536276753264, + "loss": 3.6659, + "step": 3600 + }, + { + "epoch": 4.60544, + "grad_norm": 0.4943821430206299, + "learning_rate": 0.00017041324538968905, + "loss": 3.6761, + "step": 3601 + }, + { + "epoch": 4.60672, + "grad_norm": 0.5159769058227539, + "learning_rate": 0.00017037286310405168, + "loss": 3.6661, + "step": 3602 + }, + { + "epoch": 4.608, + "grad_norm": 0.5051320791244507, + "learning_rate": 0.0001703324808184143, + "loss": 3.7369, + "step": 3603 + }, + { + "epoch": 4.60928, + "grad_norm": 0.48964956402778625, + "learning_rate": 0.00017029209853277694, + "loss": 3.6809, + "step": 3604 + }, + { + "epoch": 4.6105599999999995, + "grad_norm": 0.4939142167568207, + "learning_rate": 0.00017025171624713957, + "loss": 3.694, + "step": 3605 + }, + { + "epoch": 4.61184, + "grad_norm": 0.5127921104431152, + "learning_rate": 0.00017021133396150222, + "loss": 3.7422, + "step": 3606 + }, + { + "epoch": 4.61312, + "grad_norm": 0.5049188733100891, + "learning_rate": 0.00017017095167586485, + "loss": 3.6797, + "step": 3607 + }, + { + "epoch": 4.6144, + "grad_norm": 0.5061559081077576, + "learning_rate": 0.00017013056939022746, + "loss": 3.6901, + "step": 3608 + }, + { + "epoch": 4.61568, + "grad_norm": 0.48725587129592896, + "learning_rate": 0.0001700901871045901, + "loss": 3.7239, + "step": 3609 + }, + { + "epoch": 4.61696, + "grad_norm": 0.48895296454429626, + "learning_rate": 0.00017004980481895274, + "loss": 3.73, + "step": 3610 + }, + { + "epoch": 4.61824, + "grad_norm": 0.5009008646011353, + "learning_rate": 0.00017000942253331538, + "loss": 3.636, + "step": 3611 + }, + { + "epoch": 4.61952, + "grad_norm": 0.4935823678970337, + "learning_rate": 0.000169969040247678, + "loss": 3.623, + "step": 3612 + }, + { + "epoch": 4.6208, + "grad_norm": 0.4923403263092041, + "learning_rate": 0.00016992865796204064, + "loss": 3.7194, + "step": 3613 + }, + { + "epoch": 4.62208, + "grad_norm": 0.4789236783981323, + "learning_rate": 0.0001698882756764033, + "loss": 3.7151, + "step": 3614 + }, + { + "epoch": 4.62336, + "grad_norm": 0.5072107911109924, + "learning_rate": 0.00016984789339076592, + "loss": 3.7177, + "step": 3615 + }, + { + "epoch": 4.62464, + "grad_norm": 0.4821150302886963, + "learning_rate": 0.00016980751110512853, + "loss": 3.6819, + "step": 3616 + }, + { + "epoch": 4.62592, + "grad_norm": 0.48691630363464355, + "learning_rate": 0.00016976712881949116, + "loss": 3.6939, + "step": 3617 + }, + { + "epoch": 4.6272, + "grad_norm": 0.4875396490097046, + "learning_rate": 0.00016972674653385379, + "loss": 3.7381, + "step": 3618 + }, + { + "epoch": 4.62848, + "grad_norm": 0.4814916253089905, + "learning_rate": 0.00016968636424821644, + "loss": 3.6668, + "step": 3619 + }, + { + "epoch": 4.62976, + "grad_norm": 0.5000630021095276, + "learning_rate": 0.00016964598196257907, + "loss": 3.7047, + "step": 3620 + }, + { + "epoch": 4.6310400000000005, + "grad_norm": 0.4792519211769104, + "learning_rate": 0.0001696055996769417, + "loss": 3.6889, + "step": 3621 + }, + { + "epoch": 4.63232, + "grad_norm": 0.4922233819961548, + "learning_rate": 0.00016956521739130433, + "loss": 3.6608, + "step": 3622 + }, + { + "epoch": 4.6336, + "grad_norm": 0.49414339661598206, + "learning_rate": 0.000169524835105667, + "loss": 3.6623, + "step": 3623 + }, + { + "epoch": 4.63488, + "grad_norm": 0.48598942160606384, + "learning_rate": 0.0001694844528200296, + "loss": 3.6867, + "step": 3624 + }, + { + "epoch": 4.63616, + "grad_norm": 0.49352526664733887, + "learning_rate": 0.00016944407053439222, + "loss": 3.6493, + "step": 3625 + }, + { + "epoch": 4.63744, + "grad_norm": 0.5058410167694092, + "learning_rate": 0.00016940368824875485, + "loss": 3.6692, + "step": 3626 + }, + { + "epoch": 4.63872, + "grad_norm": 0.4912284016609192, + "learning_rate": 0.0001693633059631175, + "loss": 3.7055, + "step": 3627 + }, + { + "epoch": 4.64, + "grad_norm": 0.4703737199306488, + "learning_rate": 0.00016932292367748014, + "loss": 3.6885, + "step": 3628 + }, + { + "epoch": 4.64128, + "grad_norm": 0.4926270842552185, + "learning_rate": 0.00016928254139184277, + "loss": 3.6547, + "step": 3629 + }, + { + "epoch": 4.64256, + "grad_norm": 0.48712384700775146, + "learning_rate": 0.0001692421591062054, + "loss": 3.7905, + "step": 3630 + }, + { + "epoch": 4.64384, + "grad_norm": 0.5001680850982666, + "learning_rate": 0.000169201776820568, + "loss": 3.6893, + "step": 3631 + }, + { + "epoch": 4.64512, + "grad_norm": 0.4809771180152893, + "learning_rate": 0.00016916139453493066, + "loss": 3.738, + "step": 3632 + }, + { + "epoch": 4.6464, + "grad_norm": 0.4865325093269348, + "learning_rate": 0.0001691210122492933, + "loss": 3.6982, + "step": 3633 + }, + { + "epoch": 4.64768, + "grad_norm": 0.48700863122940063, + "learning_rate": 0.00016908062996365592, + "loss": 3.681, + "step": 3634 + }, + { + "epoch": 4.64896, + "grad_norm": 0.4818150997161865, + "learning_rate": 0.00016904024767801855, + "loss": 3.6595, + "step": 3635 + }, + { + "epoch": 4.65024, + "grad_norm": 0.4853065609931946, + "learning_rate": 0.0001689998653923812, + "loss": 3.6553, + "step": 3636 + }, + { + "epoch": 4.65152, + "grad_norm": 0.47990745306015015, + "learning_rate": 0.00016895948310674384, + "loss": 3.7326, + "step": 3637 + }, + { + "epoch": 4.6528, + "grad_norm": 0.4638812839984894, + "learning_rate": 0.00016891910082110647, + "loss": 3.6394, + "step": 3638 + }, + { + "epoch": 4.65408, + "grad_norm": 0.48503032326698303, + "learning_rate": 0.00016887871853546907, + "loss": 3.6976, + "step": 3639 + }, + { + "epoch": 4.65536, + "grad_norm": 0.48162126541137695, + "learning_rate": 0.00016883833624983173, + "loss": 3.6568, + "step": 3640 + }, + { + "epoch": 4.65664, + "grad_norm": 0.5152369737625122, + "learning_rate": 0.00016879795396419436, + "loss": 3.6476, + "step": 3641 + }, + { + "epoch": 4.65792, + "grad_norm": 0.4773014783859253, + "learning_rate": 0.000168757571678557, + "loss": 3.6812, + "step": 3642 + }, + { + "epoch": 4.6592, + "grad_norm": 0.5280107855796814, + "learning_rate": 0.00016871718939291962, + "loss": 3.6671, + "step": 3643 + }, + { + "epoch": 4.66048, + "grad_norm": 0.49400994181632996, + "learning_rate": 0.00016867680710728225, + "loss": 3.6199, + "step": 3644 + }, + { + "epoch": 4.66176, + "grad_norm": 0.5342155694961548, + "learning_rate": 0.0001686364248216449, + "loss": 3.6443, + "step": 3645 + }, + { + "epoch": 4.66304, + "grad_norm": 0.500808835029602, + "learning_rate": 0.00016859604253600754, + "loss": 3.6912, + "step": 3646 + }, + { + "epoch": 4.66432, + "grad_norm": 0.511306643486023, + "learning_rate": 0.00016855566025037014, + "loss": 3.6898, + "step": 3647 + }, + { + "epoch": 4.6655999999999995, + "grad_norm": 0.48556506633758545, + "learning_rate": 0.00016851527796473277, + "loss": 3.6869, + "step": 3648 + }, + { + "epoch": 4.66688, + "grad_norm": 0.48522791266441345, + "learning_rate": 0.00016847489567909543, + "loss": 3.7425, + "step": 3649 + }, + { + "epoch": 4.66816, + "grad_norm": 0.4987458288669586, + "learning_rate": 0.00016843451339345806, + "loss": 3.6531, + "step": 3650 + }, + { + "epoch": 4.66944, + "grad_norm": 0.5028944611549377, + "learning_rate": 0.0001683941311078207, + "loss": 3.6606, + "step": 3651 + }, + { + "epoch": 4.67072, + "grad_norm": 0.4878036081790924, + "learning_rate": 0.00016835374882218332, + "loss": 3.6368, + "step": 3652 + }, + { + "epoch": 4.672, + "grad_norm": 0.5253325700759888, + "learning_rate": 0.00016831336653654597, + "loss": 3.7309, + "step": 3653 + }, + { + "epoch": 4.67328, + "grad_norm": 0.48149093985557556, + "learning_rate": 0.0001682729842509086, + "loss": 3.7106, + "step": 3654 + }, + { + "epoch": 4.67456, + "grad_norm": 0.5140679478645325, + "learning_rate": 0.0001682326019652712, + "loss": 3.721, + "step": 3655 + }, + { + "epoch": 4.67584, + "grad_norm": 0.49887320399284363, + "learning_rate": 0.00016819221967963384, + "loss": 3.6681, + "step": 3656 + }, + { + "epoch": 4.67712, + "grad_norm": 0.47831347584724426, + "learning_rate": 0.00016815183739399647, + "loss": 3.6551, + "step": 3657 + }, + { + "epoch": 4.6784, + "grad_norm": 0.5047725439071655, + "learning_rate": 0.00016811145510835912, + "loss": 3.6415, + "step": 3658 + }, + { + "epoch": 4.67968, + "grad_norm": 0.4860306978225708, + "learning_rate": 0.00016807107282272175, + "loss": 3.6685, + "step": 3659 + }, + { + "epoch": 4.68096, + "grad_norm": 0.4931819438934326, + "learning_rate": 0.00016803069053708438, + "loss": 3.6904, + "step": 3660 + }, + { + "epoch": 4.68224, + "grad_norm": 0.5166349411010742, + "learning_rate": 0.00016799030825144701, + "loss": 3.727, + "step": 3661 + }, + { + "epoch": 4.68352, + "grad_norm": 0.5045530796051025, + "learning_rate": 0.00016794992596580967, + "loss": 3.691, + "step": 3662 + }, + { + "epoch": 4.6848, + "grad_norm": 0.49202415347099304, + "learning_rate": 0.00016790954368017227, + "loss": 3.6231, + "step": 3663 + }, + { + "epoch": 4.6860800000000005, + "grad_norm": 0.4810471832752228, + "learning_rate": 0.0001678691613945349, + "loss": 3.6767, + "step": 3664 + }, + { + "epoch": 4.68736, + "grad_norm": 0.514689028263092, + "learning_rate": 0.00016782877910889753, + "loss": 3.6977, + "step": 3665 + }, + { + "epoch": 4.68864, + "grad_norm": 0.4772043526172638, + "learning_rate": 0.0001677883968232602, + "loss": 3.6731, + "step": 3666 + }, + { + "epoch": 4.68992, + "grad_norm": 0.5057224035263062, + "learning_rate": 0.00016774801453762282, + "loss": 3.7433, + "step": 3667 + }, + { + "epoch": 4.6912, + "grad_norm": 0.4916319251060486, + "learning_rate": 0.00016770763225198545, + "loss": 3.6379, + "step": 3668 + }, + { + "epoch": 4.69248, + "grad_norm": 0.5023067593574524, + "learning_rate": 0.00016766724996634808, + "loss": 3.7392, + "step": 3669 + }, + { + "epoch": 4.69376, + "grad_norm": 0.49110403656959534, + "learning_rate": 0.00016762686768071069, + "loss": 3.6819, + "step": 3670 + }, + { + "epoch": 4.69504, + "grad_norm": 0.47794216871261597, + "learning_rate": 0.00016758648539507334, + "loss": 3.6164, + "step": 3671 + }, + { + "epoch": 4.69632, + "grad_norm": 0.5044650435447693, + "learning_rate": 0.00016754610310943597, + "loss": 3.6418, + "step": 3672 + }, + { + "epoch": 4.6975999999999996, + "grad_norm": 0.5018360614776611, + "learning_rate": 0.0001675057208237986, + "loss": 3.6579, + "step": 3673 + }, + { + "epoch": 4.69888, + "grad_norm": 0.513712465763092, + "learning_rate": 0.00016746533853816123, + "loss": 3.7271, + "step": 3674 + }, + { + "epoch": 4.70016, + "grad_norm": 0.5256522297859192, + "learning_rate": 0.0001674249562525239, + "loss": 3.6982, + "step": 3675 + }, + { + "epoch": 4.70144, + "grad_norm": 0.5181000232696533, + "learning_rate": 0.00016738457396688652, + "loss": 3.6359, + "step": 3676 + }, + { + "epoch": 4.70272, + "grad_norm": 0.5020421147346497, + "learning_rate": 0.00016734419168124915, + "loss": 3.7344, + "step": 3677 + }, + { + "epoch": 4.704, + "grad_norm": 0.5058776140213013, + "learning_rate": 0.00016730380939561175, + "loss": 3.6141, + "step": 3678 + }, + { + "epoch": 4.70528, + "grad_norm": 0.5099439024925232, + "learning_rate": 0.00016726342710997444, + "loss": 3.6955, + "step": 3679 + }, + { + "epoch": 4.70656, + "grad_norm": 0.5178961753845215, + "learning_rate": 0.00016722304482433704, + "loss": 3.6717, + "step": 3680 + }, + { + "epoch": 4.70784, + "grad_norm": 0.5064387917518616, + "learning_rate": 0.00016718266253869967, + "loss": 3.6438, + "step": 3681 + }, + { + "epoch": 4.70912, + "grad_norm": 0.48872536420822144, + "learning_rate": 0.0001671422802530623, + "loss": 3.6893, + "step": 3682 + }, + { + "epoch": 4.7104, + "grad_norm": 0.486896276473999, + "learning_rate": 0.00016710189796742496, + "loss": 3.6217, + "step": 3683 + }, + { + "epoch": 4.71168, + "grad_norm": 0.49064919352531433, + "learning_rate": 0.0001670615156817876, + "loss": 3.693, + "step": 3684 + }, + { + "epoch": 4.71296, + "grad_norm": 0.5264923572540283, + "learning_rate": 0.00016702113339615022, + "loss": 3.7152, + "step": 3685 + }, + { + "epoch": 4.71424, + "grad_norm": 0.5014359354972839, + "learning_rate": 0.00016698075111051282, + "loss": 3.7828, + "step": 3686 + }, + { + "epoch": 4.71552, + "grad_norm": 0.4952867329120636, + "learning_rate": 0.00016694036882487545, + "loss": 3.7426, + "step": 3687 + }, + { + "epoch": 4.7168, + "grad_norm": 0.49848371744155884, + "learning_rate": 0.0001668999865392381, + "loss": 3.6818, + "step": 3688 + }, + { + "epoch": 4.7180800000000005, + "grad_norm": 0.4835038185119629, + "learning_rate": 0.00016685960425360074, + "loss": 3.6319, + "step": 3689 + }, + { + "epoch": 4.71936, + "grad_norm": 0.5079094171524048, + "learning_rate": 0.00016681922196796337, + "loss": 3.6938, + "step": 3690 + }, + { + "epoch": 4.7206399999999995, + "grad_norm": 0.5054168701171875, + "learning_rate": 0.000166778839682326, + "loss": 3.7248, + "step": 3691 + }, + { + "epoch": 4.72192, + "grad_norm": 0.4924614727497101, + "learning_rate": 0.00016673845739668866, + "loss": 3.7232, + "step": 3692 + }, + { + "epoch": 4.7232, + "grad_norm": 0.5226231813430786, + "learning_rate": 0.00016669807511105129, + "loss": 3.6702, + "step": 3693 + }, + { + "epoch": 4.72448, + "grad_norm": 0.5076504945755005, + "learning_rate": 0.00016665769282541392, + "loss": 3.7035, + "step": 3694 + }, + { + "epoch": 4.72576, + "grad_norm": 0.5075111985206604, + "learning_rate": 0.00016661731053977652, + "loss": 3.6875, + "step": 3695 + }, + { + "epoch": 4.72704, + "grad_norm": 0.5135008096694946, + "learning_rate": 0.00016657692825413918, + "loss": 3.6545, + "step": 3696 + }, + { + "epoch": 4.72832, + "grad_norm": 0.5121733546257019, + "learning_rate": 0.0001665365459685018, + "loss": 3.6973, + "step": 3697 + }, + { + "epoch": 4.7296, + "grad_norm": 0.4708007574081421, + "learning_rate": 0.00016649616368286444, + "loss": 3.6359, + "step": 3698 + }, + { + "epoch": 4.73088, + "grad_norm": 0.5306479334831238, + "learning_rate": 0.00016645578139722707, + "loss": 3.7159, + "step": 3699 + }, + { + "epoch": 4.73216, + "grad_norm": 0.4930853545665741, + "learning_rate": 0.0001664153991115897, + "loss": 3.6615, + "step": 3700 + }, + { + "epoch": 4.73344, + "grad_norm": 0.5061611533164978, + "learning_rate": 0.00016637501682595235, + "loss": 3.7783, + "step": 3701 + }, + { + "epoch": 4.73472, + "grad_norm": 0.4978872239589691, + "learning_rate": 0.00016633463454031498, + "loss": 3.6809, + "step": 3702 + }, + { + "epoch": 4.736, + "grad_norm": 0.5093433856964111, + "learning_rate": 0.00016629425225467759, + "loss": 3.6233, + "step": 3703 + }, + { + "epoch": 4.73728, + "grad_norm": 0.4769956171512604, + "learning_rate": 0.00016625386996904022, + "loss": 3.5826, + "step": 3704 + }, + { + "epoch": 4.73856, + "grad_norm": 0.4818449020385742, + "learning_rate": 0.00016621348768340287, + "loss": 3.6416, + "step": 3705 + }, + { + "epoch": 4.73984, + "grad_norm": 0.4861428439617157, + "learning_rate": 0.0001661731053977655, + "loss": 3.6334, + "step": 3706 + }, + { + "epoch": 4.7411200000000004, + "grad_norm": 0.48706743121147156, + "learning_rate": 0.00016613272311212813, + "loss": 3.6829, + "step": 3707 + }, + { + "epoch": 4.7424, + "grad_norm": 0.48839133977890015, + "learning_rate": 0.00016609234082649076, + "loss": 3.6995, + "step": 3708 + }, + { + "epoch": 4.74368, + "grad_norm": 0.49105438590049744, + "learning_rate": 0.00016605195854085342, + "loss": 3.6665, + "step": 3709 + }, + { + "epoch": 4.74496, + "grad_norm": 0.503555417060852, + "learning_rate": 0.00016601157625521605, + "loss": 3.7246, + "step": 3710 + }, + { + "epoch": 4.74624, + "grad_norm": 0.4822165071964264, + "learning_rate": 0.00016597119396957865, + "loss": 3.6801, + "step": 3711 + }, + { + "epoch": 4.74752, + "grad_norm": 0.4817500114440918, + "learning_rate": 0.00016593081168394128, + "loss": 3.575, + "step": 3712 + }, + { + "epoch": 4.7488, + "grad_norm": 0.48095250129699707, + "learning_rate": 0.00016589042939830391, + "loss": 3.6716, + "step": 3713 + }, + { + "epoch": 4.75008, + "grad_norm": 0.47289326786994934, + "learning_rate": 0.00016585004711266657, + "loss": 3.7431, + "step": 3714 + }, + { + "epoch": 4.75136, + "grad_norm": 0.46789786219596863, + "learning_rate": 0.0001658096648270292, + "loss": 3.6603, + "step": 3715 + }, + { + "epoch": 4.7526399999999995, + "grad_norm": 0.47495347261428833, + "learning_rate": 0.00016576928254139183, + "loss": 3.6394, + "step": 3716 + }, + { + "epoch": 4.75392, + "grad_norm": 0.47441309690475464, + "learning_rate": 0.00016572890025575446, + "loss": 3.7065, + "step": 3717 + }, + { + "epoch": 4.7552, + "grad_norm": 0.48996591567993164, + "learning_rate": 0.00016568851797011712, + "loss": 3.7313, + "step": 3718 + }, + { + "epoch": 4.75648, + "grad_norm": 0.47536635398864746, + "learning_rate": 0.00016564813568447972, + "loss": 3.7061, + "step": 3719 + }, + { + "epoch": 4.75776, + "grad_norm": 0.508357048034668, + "learning_rate": 0.00016560775339884235, + "loss": 3.7152, + "step": 3720 + }, + { + "epoch": 4.75904, + "grad_norm": 0.4819643795490265, + "learning_rate": 0.00016556737111320498, + "loss": 3.6843, + "step": 3721 + }, + { + "epoch": 4.76032, + "grad_norm": 0.48836493492126465, + "learning_rate": 0.00016552698882756764, + "loss": 3.6814, + "step": 3722 + }, + { + "epoch": 4.7616, + "grad_norm": 0.4822399616241455, + "learning_rate": 0.00016548660654193027, + "loss": 3.6709, + "step": 3723 + }, + { + "epoch": 4.76288, + "grad_norm": 0.4872921407222748, + "learning_rate": 0.0001654462242562929, + "loss": 3.6587, + "step": 3724 + }, + { + "epoch": 4.76416, + "grad_norm": 0.486806184053421, + "learning_rate": 0.00016540584197065553, + "loss": 3.669, + "step": 3725 + }, + { + "epoch": 4.76544, + "grad_norm": 0.49304312467575073, + "learning_rate": 0.00016536545968501813, + "loss": 3.7521, + "step": 3726 + }, + { + "epoch": 4.76672, + "grad_norm": 0.4882862865924835, + "learning_rate": 0.0001653250773993808, + "loss": 3.7018, + "step": 3727 + }, + { + "epoch": 4.768, + "grad_norm": 0.4865057170391083, + "learning_rate": 0.00016528469511374342, + "loss": 3.6966, + "step": 3728 + }, + { + "epoch": 4.76928, + "grad_norm": 0.5051146745681763, + "learning_rate": 0.00016524431282810605, + "loss": 3.7802, + "step": 3729 + }, + { + "epoch": 4.77056, + "grad_norm": 0.5136163830757141, + "learning_rate": 0.00016520393054246868, + "loss": 3.6855, + "step": 3730 + }, + { + "epoch": 4.77184, + "grad_norm": 0.4789270758628845, + "learning_rate": 0.00016516354825683134, + "loss": 3.6787, + "step": 3731 + }, + { + "epoch": 4.7731200000000005, + "grad_norm": 0.5074111223220825, + "learning_rate": 0.00016512316597119397, + "loss": 3.6951, + "step": 3732 + }, + { + "epoch": 4.7744, + "grad_norm": 0.5275261998176575, + "learning_rate": 0.0001650827836855566, + "loss": 3.6443, + "step": 3733 + }, + { + "epoch": 4.77568, + "grad_norm": 0.49672234058380127, + "learning_rate": 0.0001650424013999192, + "loss": 3.6661, + "step": 3734 + }, + { + "epoch": 4.77696, + "grad_norm": 0.4923790693283081, + "learning_rate": 0.00016500201911428186, + "loss": 3.6796, + "step": 3735 + }, + { + "epoch": 4.77824, + "grad_norm": 0.4953721761703491, + "learning_rate": 0.0001649616368286445, + "loss": 3.6743, + "step": 3736 + }, + { + "epoch": 4.77952, + "grad_norm": 0.4764186441898346, + "learning_rate": 0.00016492125454300712, + "loss": 3.6672, + "step": 3737 + }, + { + "epoch": 4.7808, + "grad_norm": 0.5113012790679932, + "learning_rate": 0.00016488087225736975, + "loss": 3.629, + "step": 3738 + }, + { + "epoch": 4.78208, + "grad_norm": 0.48934149742126465, + "learning_rate": 0.00016484048997173238, + "loss": 3.7223, + "step": 3739 + }, + { + "epoch": 4.78336, + "grad_norm": 0.5140074491500854, + "learning_rate": 0.00016480010768609503, + "loss": 3.7186, + "step": 3740 + }, + { + "epoch": 4.78464, + "grad_norm": 0.4866582751274109, + "learning_rate": 0.00016475972540045766, + "loss": 3.62, + "step": 3741 + }, + { + "epoch": 4.78592, + "grad_norm": 0.48126623034477234, + "learning_rate": 0.00016471934311482027, + "loss": 3.608, + "step": 3742 + }, + { + "epoch": 4.7872, + "grad_norm": 0.48048514127731323, + "learning_rate": 0.0001646789608291829, + "loss": 3.6883, + "step": 3743 + }, + { + "epoch": 4.78848, + "grad_norm": 0.4765174984931946, + "learning_rate": 0.00016463857854354555, + "loss": 3.7108, + "step": 3744 + }, + { + "epoch": 4.78976, + "grad_norm": 0.4911894202232361, + "learning_rate": 0.00016459819625790818, + "loss": 3.6355, + "step": 3745 + }, + { + "epoch": 4.79104, + "grad_norm": 0.5015087723731995, + "learning_rate": 0.00016455781397227081, + "loss": 3.6922, + "step": 3746 + }, + { + "epoch": 4.79232, + "grad_norm": 0.4827320873737335, + "learning_rate": 0.00016451743168663344, + "loss": 3.6264, + "step": 3747 + }, + { + "epoch": 4.7936, + "grad_norm": 0.4948306083679199, + "learning_rate": 0.0001644770494009961, + "loss": 3.7218, + "step": 3748 + }, + { + "epoch": 4.79488, + "grad_norm": 0.48731568455696106, + "learning_rate": 0.00016443666711535873, + "loss": 3.6451, + "step": 3749 + }, + { + "epoch": 4.79616, + "grad_norm": 0.4918150305747986, + "learning_rate": 0.00016439628482972133, + "loss": 3.7111, + "step": 3750 + }, + { + "epoch": 4.79744, + "grad_norm": 0.4810066521167755, + "learning_rate": 0.00016435590254408397, + "loss": 3.6458, + "step": 3751 + }, + { + "epoch": 4.79872, + "grad_norm": 0.49135395884513855, + "learning_rate": 0.0001643155202584466, + "loss": 3.6688, + "step": 3752 + }, + { + "epoch": 4.8, + "grad_norm": 0.4967387020587921, + "learning_rate": 0.00016427513797280925, + "loss": 3.6786, + "step": 3753 + }, + { + "epoch": 4.80128, + "grad_norm": 0.48914146423339844, + "learning_rate": 0.00016423475568717188, + "loss": 3.7037, + "step": 3754 + }, + { + "epoch": 4.80256, + "grad_norm": 0.5048929452896118, + "learning_rate": 0.0001641943734015345, + "loss": 3.6746, + "step": 3755 + }, + { + "epoch": 4.80384, + "grad_norm": 0.4980829954147339, + "learning_rate": 0.00016415399111589714, + "loss": 3.6226, + "step": 3756 + }, + { + "epoch": 4.80512, + "grad_norm": 0.49464380741119385, + "learning_rate": 0.0001641136088302598, + "loss": 3.6642, + "step": 3757 + }, + { + "epoch": 4.8064, + "grad_norm": 0.4930364489555359, + "learning_rate": 0.0001640732265446224, + "loss": 3.7118, + "step": 3758 + }, + { + "epoch": 4.8076799999999995, + "grad_norm": 0.5031747221946716, + "learning_rate": 0.00016403284425898503, + "loss": 3.6796, + "step": 3759 + }, + { + "epoch": 4.80896, + "grad_norm": 0.4856695830821991, + "learning_rate": 0.00016399246197334766, + "loss": 3.6577, + "step": 3760 + }, + { + "epoch": 4.81024, + "grad_norm": 0.5000268816947937, + "learning_rate": 0.00016395207968771032, + "loss": 3.638, + "step": 3761 + }, + { + "epoch": 4.81152, + "grad_norm": 0.5013075470924377, + "learning_rate": 0.00016391169740207295, + "loss": 3.7025, + "step": 3762 + }, + { + "epoch": 4.8128, + "grad_norm": 0.49570322036743164, + "learning_rate": 0.00016387131511643558, + "loss": 3.6357, + "step": 3763 + }, + { + "epoch": 4.81408, + "grad_norm": 0.4977611005306244, + "learning_rate": 0.0001638309328307982, + "loss": 3.671, + "step": 3764 + }, + { + "epoch": 4.81536, + "grad_norm": 0.5036880970001221, + "learning_rate": 0.0001637905505451608, + "loss": 3.6263, + "step": 3765 + }, + { + "epoch": 4.81664, + "grad_norm": 0.4825877249240875, + "learning_rate": 0.0001637501682595235, + "loss": 3.6794, + "step": 3766 + }, + { + "epoch": 4.81792, + "grad_norm": 0.4834206998348236, + "learning_rate": 0.0001637097859738861, + "loss": 3.6166, + "step": 3767 + }, + { + "epoch": 4.8192, + "grad_norm": 0.5228777527809143, + "learning_rate": 0.00016366940368824873, + "loss": 3.7228, + "step": 3768 + }, + { + "epoch": 4.82048, + "grad_norm": 0.4963701367378235, + "learning_rate": 0.00016362902140261136, + "loss": 3.6723, + "step": 3769 + }, + { + "epoch": 4.82176, + "grad_norm": 0.5132946968078613, + "learning_rate": 0.00016358863911697402, + "loss": 3.7093, + "step": 3770 + }, + { + "epoch": 4.82304, + "grad_norm": 0.5134023427963257, + "learning_rate": 0.00016354825683133665, + "loss": 3.6616, + "step": 3771 + }, + { + "epoch": 4.82432, + "grad_norm": 0.4977473318576813, + "learning_rate": 0.00016350787454569928, + "loss": 3.6783, + "step": 3772 + }, + { + "epoch": 4.8256, + "grad_norm": 0.49358707666397095, + "learning_rate": 0.00016346749226006188, + "loss": 3.6623, + "step": 3773 + }, + { + "epoch": 4.82688, + "grad_norm": 0.5281035304069519, + "learning_rate": 0.00016342710997442457, + "loss": 3.7313, + "step": 3774 + }, + { + "epoch": 4.8281600000000005, + "grad_norm": 0.4807729125022888, + "learning_rate": 0.00016338672768878717, + "loss": 3.6286, + "step": 3775 + }, + { + "epoch": 4.82944, + "grad_norm": 0.4929489195346832, + "learning_rate": 0.0001633463454031498, + "loss": 3.7261, + "step": 3776 + }, + { + "epoch": 4.83072, + "grad_norm": 0.493528813123703, + "learning_rate": 0.00016330596311751243, + "loss": 3.6753, + "step": 3777 + }, + { + "epoch": 4.832, + "grad_norm": 0.49640294909477234, + "learning_rate": 0.00016326558083187506, + "loss": 3.6256, + "step": 3778 + }, + { + "epoch": 4.83328, + "grad_norm": 0.48569512367248535, + "learning_rate": 0.00016322519854623772, + "loss": 3.6442, + "step": 3779 + }, + { + "epoch": 4.83456, + "grad_norm": 0.498219758272171, + "learning_rate": 0.00016318481626060035, + "loss": 3.6997, + "step": 3780 + }, + { + "epoch": 4.83584, + "grad_norm": 0.5126128792762756, + "learning_rate": 0.00016314443397496298, + "loss": 3.7796, + "step": 3781 + }, + { + "epoch": 4.83712, + "grad_norm": 0.4836713671684265, + "learning_rate": 0.00016310405168932558, + "loss": 3.6532, + "step": 3782 + }, + { + "epoch": 4.8384, + "grad_norm": 0.5176852345466614, + "learning_rate": 0.00016306366940368824, + "loss": 3.6887, + "step": 3783 + }, + { + "epoch": 4.8396799999999995, + "grad_norm": 0.4919340908527374, + "learning_rate": 0.00016302328711805087, + "loss": 3.7374, + "step": 3784 + }, + { + "epoch": 4.84096, + "grad_norm": 0.5173082947731018, + "learning_rate": 0.0001629829048324135, + "loss": 3.6584, + "step": 3785 + }, + { + "epoch": 4.84224, + "grad_norm": 0.4877385199069977, + "learning_rate": 0.00016294252254677613, + "loss": 3.7104, + "step": 3786 + }, + { + "epoch": 4.84352, + "grad_norm": 0.5282792448997498, + "learning_rate": 0.00016290214026113878, + "loss": 3.7029, + "step": 3787 + }, + { + "epoch": 4.8448, + "grad_norm": 0.4718897044658661, + "learning_rate": 0.0001628617579755014, + "loss": 3.5776, + "step": 3788 + }, + { + "epoch": 4.84608, + "grad_norm": 0.4864949584007263, + "learning_rate": 0.00016282137568986404, + "loss": 3.6709, + "step": 3789 + }, + { + "epoch": 4.84736, + "grad_norm": 0.4912480115890503, + "learning_rate": 0.00016278099340422665, + "loss": 3.6211, + "step": 3790 + }, + { + "epoch": 4.84864, + "grad_norm": 0.5093181729316711, + "learning_rate": 0.00016274061111858928, + "loss": 3.8126, + "step": 3791 + }, + { + "epoch": 4.84992, + "grad_norm": 0.4893922209739685, + "learning_rate": 0.00016270022883295193, + "loss": 3.6797, + "step": 3792 + }, + { + "epoch": 4.8512, + "grad_norm": 0.4943704903125763, + "learning_rate": 0.00016265984654731456, + "loss": 3.6392, + "step": 3793 + }, + { + "epoch": 4.85248, + "grad_norm": 0.5246706604957581, + "learning_rate": 0.0001626194642616772, + "loss": 3.685, + "step": 3794 + }, + { + "epoch": 4.85376, + "grad_norm": 0.4894337058067322, + "learning_rate": 0.00016257908197603982, + "loss": 3.6819, + "step": 3795 + }, + { + "epoch": 4.85504, + "grad_norm": 0.49564680457115173, + "learning_rate": 0.00016253869969040248, + "loss": 3.6284, + "step": 3796 + }, + { + "epoch": 4.85632, + "grad_norm": 0.49830925464630127, + "learning_rate": 0.0001624983174047651, + "loss": 3.6549, + "step": 3797 + }, + { + "epoch": 4.8576, + "grad_norm": 0.47998228669166565, + "learning_rate": 0.00016245793511912771, + "loss": 3.6778, + "step": 3798 + }, + { + "epoch": 4.85888, + "grad_norm": 0.4880746006965637, + "learning_rate": 0.00016241755283349034, + "loss": 3.6366, + "step": 3799 + }, + { + "epoch": 4.8601600000000005, + "grad_norm": 0.4946172535419464, + "learning_rate": 0.000162377170547853, + "loss": 3.7328, + "step": 3800 + }, + { + "epoch": 4.86144, + "grad_norm": 0.48107582330703735, + "learning_rate": 0.00016233678826221563, + "loss": 3.6619, + "step": 3801 + }, + { + "epoch": 4.86272, + "grad_norm": 0.47235268354415894, + "learning_rate": 0.00016229640597657826, + "loss": 3.6913, + "step": 3802 + }, + { + "epoch": 4.864, + "grad_norm": 0.4809187948703766, + "learning_rate": 0.0001622560236909409, + "loss": 3.7307, + "step": 3803 + }, + { + "epoch": 4.86528, + "grad_norm": 0.4874497950077057, + "learning_rate": 0.00016221564140530355, + "loss": 3.7355, + "step": 3804 + }, + { + "epoch": 4.86656, + "grad_norm": 0.49071377515792847, + "learning_rate": 0.00016217525911966618, + "loss": 3.579, + "step": 3805 + }, + { + "epoch": 4.86784, + "grad_norm": 0.4813006818294525, + "learning_rate": 0.00016213487683402878, + "loss": 3.6804, + "step": 3806 + }, + { + "epoch": 4.86912, + "grad_norm": 0.4884811341762543, + "learning_rate": 0.0001620944945483914, + "loss": 3.7033, + "step": 3807 + }, + { + "epoch": 4.8704, + "grad_norm": 0.5217158198356628, + "learning_rate": 0.00016205411226275404, + "loss": 3.6843, + "step": 3808 + }, + { + "epoch": 4.87168, + "grad_norm": 0.48379671573638916, + "learning_rate": 0.0001620137299771167, + "loss": 3.7468, + "step": 3809 + }, + { + "epoch": 4.87296, + "grad_norm": 0.484479159116745, + "learning_rate": 0.00016197334769147933, + "loss": 3.7231, + "step": 3810 + }, + { + "epoch": 4.87424, + "grad_norm": 0.5106998085975647, + "learning_rate": 0.00016193296540584196, + "loss": 3.6457, + "step": 3811 + }, + { + "epoch": 4.87552, + "grad_norm": 0.4962330162525177, + "learning_rate": 0.0001618925831202046, + "loss": 3.6255, + "step": 3812 + }, + { + "epoch": 4.8768, + "grad_norm": 0.5050908923149109, + "learning_rate": 0.00016185220083456725, + "loss": 3.7097, + "step": 3813 + }, + { + "epoch": 4.87808, + "grad_norm": 0.495358407497406, + "learning_rate": 0.00016181181854892985, + "loss": 3.7255, + "step": 3814 + }, + { + "epoch": 4.87936, + "grad_norm": 0.495797336101532, + "learning_rate": 0.00016177143626329248, + "loss": 3.6743, + "step": 3815 + }, + { + "epoch": 4.88064, + "grad_norm": 0.49159398674964905, + "learning_rate": 0.0001617310539776551, + "loss": 3.6507, + "step": 3816 + }, + { + "epoch": 4.88192, + "grad_norm": 0.4823581874370575, + "learning_rate": 0.00016169067169201777, + "loss": 3.7253, + "step": 3817 + }, + { + "epoch": 4.8832, + "grad_norm": 0.49354973435401917, + "learning_rate": 0.0001616502894063804, + "loss": 3.6406, + "step": 3818 + }, + { + "epoch": 4.88448, + "grad_norm": 0.4922613799571991, + "learning_rate": 0.00016160990712074303, + "loss": 3.6345, + "step": 3819 + }, + { + "epoch": 4.88576, + "grad_norm": 0.5070391893386841, + "learning_rate": 0.00016156952483510566, + "loss": 3.6943, + "step": 3820 + }, + { + "epoch": 4.88704, + "grad_norm": 0.4859599173069, + "learning_rate": 0.00016152914254946826, + "loss": 3.6488, + "step": 3821 + }, + { + "epoch": 4.88832, + "grad_norm": 0.4835364520549774, + "learning_rate": 0.00016148876026383092, + "loss": 3.6938, + "step": 3822 + }, + { + "epoch": 4.8896, + "grad_norm": 0.4754778742790222, + "learning_rate": 0.00016144837797819355, + "loss": 3.6802, + "step": 3823 + }, + { + "epoch": 4.89088, + "grad_norm": 0.508832573890686, + "learning_rate": 0.00016140799569255618, + "loss": 3.6323, + "step": 3824 + }, + { + "epoch": 4.89216, + "grad_norm": 0.4894411861896515, + "learning_rate": 0.0001613676134069188, + "loss": 3.7234, + "step": 3825 + }, + { + "epoch": 4.89344, + "grad_norm": 0.5034093260765076, + "learning_rate": 0.00016132723112128146, + "loss": 3.6997, + "step": 3826 + }, + { + "epoch": 4.8947199999999995, + "grad_norm": 0.4902549684047699, + "learning_rate": 0.0001612868488356441, + "loss": 3.6539, + "step": 3827 + }, + { + "epoch": 4.896, + "grad_norm": 0.49553415179252625, + "learning_rate": 0.00016124646655000672, + "loss": 3.6314, + "step": 3828 + }, + { + "epoch": 4.89728, + "grad_norm": 0.5032349824905396, + "learning_rate": 0.00016120608426436933, + "loss": 3.6966, + "step": 3829 + }, + { + "epoch": 4.89856, + "grad_norm": 0.5162962079048157, + "learning_rate": 0.00016116570197873198, + "loss": 3.7436, + "step": 3830 + }, + { + "epoch": 4.89984, + "grad_norm": 0.49726447463035583, + "learning_rate": 0.00016112531969309462, + "loss": 3.6775, + "step": 3831 + }, + { + "epoch": 4.90112, + "grad_norm": 0.47404661774635315, + "learning_rate": 0.00016108493740745725, + "loss": 3.7073, + "step": 3832 + }, + { + "epoch": 4.9024, + "grad_norm": 0.48400676250457764, + "learning_rate": 0.00016104455512181988, + "loss": 3.6737, + "step": 3833 + }, + { + "epoch": 4.90368, + "grad_norm": 0.48407965898513794, + "learning_rate": 0.0001610041728361825, + "loss": 3.6579, + "step": 3834 + }, + { + "epoch": 4.90496, + "grad_norm": 0.47430333495140076, + "learning_rate": 0.00016096379055054516, + "loss": 3.6103, + "step": 3835 + }, + { + "epoch": 4.90624, + "grad_norm": 0.49005916714668274, + "learning_rate": 0.0001609234082649078, + "loss": 3.5885, + "step": 3836 + }, + { + "epoch": 4.90752, + "grad_norm": 0.472774863243103, + "learning_rate": 0.0001608830259792704, + "loss": 3.7037, + "step": 3837 + }, + { + "epoch": 4.9088, + "grad_norm": 0.48804715275764465, + "learning_rate": 0.00016084264369363303, + "loss": 3.7332, + "step": 3838 + }, + { + "epoch": 4.91008, + "grad_norm": 0.4966636002063751, + "learning_rate": 0.00016080226140799568, + "loss": 3.6174, + "step": 3839 + }, + { + "epoch": 4.91136, + "grad_norm": 0.4942854642868042, + "learning_rate": 0.0001607618791223583, + "loss": 3.6865, + "step": 3840 + }, + { + "epoch": 4.91264, + "grad_norm": 0.48685768246650696, + "learning_rate": 0.00016072149683672094, + "loss": 3.6556, + "step": 3841 + }, + { + "epoch": 4.91392, + "grad_norm": 0.5361490845680237, + "learning_rate": 0.00016068111455108357, + "loss": 3.7219, + "step": 3842 + }, + { + "epoch": 4.9152000000000005, + "grad_norm": 0.47551262378692627, + "learning_rate": 0.00016064073226544623, + "loss": 3.7257, + "step": 3843 + }, + { + "epoch": 4.91648, + "grad_norm": 0.48748570680618286, + "learning_rate": 0.00016060034997980886, + "loss": 3.7161, + "step": 3844 + }, + { + "epoch": 4.91776, + "grad_norm": 0.48473209142684937, + "learning_rate": 0.00016055996769417146, + "loss": 3.6119, + "step": 3845 + }, + { + "epoch": 4.91904, + "grad_norm": 0.4792821407318115, + "learning_rate": 0.0001605195854085341, + "loss": 3.628, + "step": 3846 + }, + { + "epoch": 4.92032, + "grad_norm": 0.5115758180618286, + "learning_rate": 0.00016047920312289672, + "loss": 3.7057, + "step": 3847 + }, + { + "epoch": 4.9216, + "grad_norm": 0.4800952970981598, + "learning_rate": 0.00016043882083725938, + "loss": 3.6729, + "step": 3848 + }, + { + "epoch": 4.92288, + "grad_norm": 0.48038557171821594, + "learning_rate": 0.000160398438551622, + "loss": 3.6507, + "step": 3849 + }, + { + "epoch": 4.92416, + "grad_norm": 0.49345579743385315, + "learning_rate": 0.00016035805626598464, + "loss": 3.6492, + "step": 3850 + }, + { + "epoch": 4.92544, + "grad_norm": 0.4855850338935852, + "learning_rate": 0.00016031767398034727, + "loss": 3.7056, + "step": 3851 + }, + { + "epoch": 4.9267199999999995, + "grad_norm": 0.47242411971092224, + "learning_rate": 0.00016027729169470993, + "loss": 3.7455, + "step": 3852 + }, + { + "epoch": 4.928, + "grad_norm": 0.4987708330154419, + "learning_rate": 0.00016023690940907256, + "loss": 3.7361, + "step": 3853 + }, + { + "epoch": 4.92928, + "grad_norm": 0.4940935969352722, + "learning_rate": 0.00016019652712343516, + "loss": 3.6286, + "step": 3854 + }, + { + "epoch": 4.93056, + "grad_norm": 0.4955693483352661, + "learning_rate": 0.0001601561448377978, + "loss": 3.6897, + "step": 3855 + }, + { + "epoch": 4.93184, + "grad_norm": 0.4994462728500366, + "learning_rate": 0.00016011576255216045, + "loss": 3.6699, + "step": 3856 + }, + { + "epoch": 4.93312, + "grad_norm": 0.5072110295295715, + "learning_rate": 0.00016007538026652308, + "loss": 3.7264, + "step": 3857 + }, + { + "epoch": 4.9344, + "grad_norm": 0.49533572793006897, + "learning_rate": 0.0001600349979808857, + "loss": 3.6975, + "step": 3858 + }, + { + "epoch": 4.93568, + "grad_norm": 0.512359619140625, + "learning_rate": 0.00015999461569524834, + "loss": 3.7374, + "step": 3859 + }, + { + "epoch": 4.93696, + "grad_norm": 0.4889034628868103, + "learning_rate": 0.00015995423340961094, + "loss": 3.6486, + "step": 3860 + }, + { + "epoch": 4.93824, + "grad_norm": 0.4879732131958008, + "learning_rate": 0.00015991385112397363, + "loss": 3.6722, + "step": 3861 + }, + { + "epoch": 4.93952, + "grad_norm": 0.5002577900886536, + "learning_rate": 0.00015987346883833623, + "loss": 3.7448, + "step": 3862 + }, + { + "epoch": 4.9408, + "grad_norm": 0.5065283179283142, + "learning_rate": 0.00015983308655269886, + "loss": 3.7383, + "step": 3863 + }, + { + "epoch": 4.94208, + "grad_norm": 0.48911115527153015, + "learning_rate": 0.0001597927042670615, + "loss": 3.6939, + "step": 3864 + }, + { + "epoch": 4.94336, + "grad_norm": 0.500379741191864, + "learning_rate": 0.00015975232198142415, + "loss": 3.6542, + "step": 3865 + }, + { + "epoch": 4.94464, + "grad_norm": 0.47929057478904724, + "learning_rate": 0.00015971193969578678, + "loss": 3.7623, + "step": 3866 + }, + { + "epoch": 4.94592, + "grad_norm": 0.4697692394256592, + "learning_rate": 0.0001596715574101494, + "loss": 3.6306, + "step": 3867 + }, + { + "epoch": 4.9472000000000005, + "grad_norm": 0.4873841106891632, + "learning_rate": 0.00015963117512451204, + "loss": 3.5985, + "step": 3868 + }, + { + "epoch": 4.94848, + "grad_norm": 0.49631455540657043, + "learning_rate": 0.0001595907928388747, + "loss": 3.7473, + "step": 3869 + }, + { + "epoch": 4.94976, + "grad_norm": 0.48060086369514465, + "learning_rate": 0.0001595504105532373, + "loss": 3.667, + "step": 3870 + }, + { + "epoch": 4.95104, + "grad_norm": 0.4914751350879669, + "learning_rate": 0.00015951002826759993, + "loss": 3.6433, + "step": 3871 + }, + { + "epoch": 4.95232, + "grad_norm": 0.5151807069778442, + "learning_rate": 0.00015946964598196256, + "loss": 3.6871, + "step": 3872 + }, + { + "epoch": 4.9536, + "grad_norm": 0.48747456073760986, + "learning_rate": 0.0001594292636963252, + "loss": 3.6983, + "step": 3873 + }, + { + "epoch": 4.95488, + "grad_norm": 0.49215272068977356, + "learning_rate": 0.00015938888141068784, + "loss": 3.7184, + "step": 3874 + }, + { + "epoch": 4.95616, + "grad_norm": 0.47915881872177124, + "learning_rate": 0.00015934849912505047, + "loss": 3.6691, + "step": 3875 + }, + { + "epoch": 4.95744, + "grad_norm": 0.4974246323108673, + "learning_rate": 0.0001593081168394131, + "loss": 3.6795, + "step": 3876 + }, + { + "epoch": 4.95872, + "grad_norm": 0.47061559557914734, + "learning_rate": 0.0001592677345537757, + "loss": 3.6472, + "step": 3877 + }, + { + "epoch": 4.96, + "grad_norm": 0.4836273193359375, + "learning_rate": 0.00015922735226813836, + "loss": 3.6728, + "step": 3878 + }, + { + "epoch": 4.96128, + "grad_norm": 0.5134618878364563, + "learning_rate": 0.000159186969982501, + "loss": 3.6932, + "step": 3879 + }, + { + "epoch": 4.96256, + "grad_norm": 0.49473533034324646, + "learning_rate": 0.00015914658769686362, + "loss": 3.6023, + "step": 3880 + }, + { + "epoch": 4.96384, + "grad_norm": 0.5070651173591614, + "learning_rate": 0.00015910620541122625, + "loss": 3.6855, + "step": 3881 + }, + { + "epoch": 4.96512, + "grad_norm": 0.4944706857204437, + "learning_rate": 0.0001590658231255889, + "loss": 3.6878, + "step": 3882 + }, + { + "epoch": 4.9664, + "grad_norm": 0.4829122722148895, + "learning_rate": 0.00015902544083995154, + "loss": 3.6283, + "step": 3883 + }, + { + "epoch": 4.96768, + "grad_norm": 0.47128134965896606, + "learning_rate": 0.00015898505855431417, + "loss": 3.7395, + "step": 3884 + }, + { + "epoch": 4.96896, + "grad_norm": 0.5024461150169373, + "learning_rate": 0.00015894467626867677, + "loss": 3.7182, + "step": 3885 + }, + { + "epoch": 4.97024, + "grad_norm": 0.4943237602710724, + "learning_rate": 0.0001589042939830394, + "loss": 3.705, + "step": 3886 + }, + { + "epoch": 4.97152, + "grad_norm": 0.49064311385154724, + "learning_rate": 0.00015886391169740206, + "loss": 3.6011, + "step": 3887 + }, + { + "epoch": 4.9728, + "grad_norm": 0.4886166751384735, + "learning_rate": 0.0001588235294117647, + "loss": 3.6503, + "step": 3888 + }, + { + "epoch": 4.97408, + "grad_norm": 0.4722157120704651, + "learning_rate": 0.00015878314712612732, + "loss": 3.5743, + "step": 3889 + }, + { + "epoch": 4.97536, + "grad_norm": 0.49199846386909485, + "learning_rate": 0.00015874276484048995, + "loss": 3.6846, + "step": 3890 + }, + { + "epoch": 4.97664, + "grad_norm": 0.504956066608429, + "learning_rate": 0.0001587023825548526, + "loss": 3.6009, + "step": 3891 + }, + { + "epoch": 4.97792, + "grad_norm": 0.49585065245628357, + "learning_rate": 0.00015866200026921524, + "loss": 3.6558, + "step": 3892 + }, + { + "epoch": 4.9792, + "grad_norm": 0.5018090605735779, + "learning_rate": 0.00015862161798357784, + "loss": 3.6642, + "step": 3893 + }, + { + "epoch": 4.98048, + "grad_norm": 0.49588721990585327, + "learning_rate": 0.00015858123569794047, + "loss": 3.6914, + "step": 3894 + }, + { + "epoch": 4.9817599999999995, + "grad_norm": 0.4850863516330719, + "learning_rate": 0.00015854085341230313, + "loss": 3.6784, + "step": 3895 + }, + { + "epoch": 4.98304, + "grad_norm": 0.4720815122127533, + "learning_rate": 0.00015850047112666576, + "loss": 3.6412, + "step": 3896 + }, + { + "epoch": 4.98432, + "grad_norm": 0.49307361245155334, + "learning_rate": 0.0001584600888410284, + "loss": 3.6723, + "step": 3897 + }, + { + "epoch": 4.9856, + "grad_norm": 0.48994728922843933, + "learning_rate": 0.00015841970655539102, + "loss": 3.6367, + "step": 3898 + }, + { + "epoch": 4.98688, + "grad_norm": 0.47764644026756287, + "learning_rate": 0.00015837932426975365, + "loss": 3.6513, + "step": 3899 + }, + { + "epoch": 4.98816, + "grad_norm": 0.4945572316646576, + "learning_rate": 0.0001583389419841163, + "loss": 3.6575, + "step": 3900 + }, + { + "epoch": 4.98944, + "grad_norm": 0.4753789007663727, + "learning_rate": 0.0001582985596984789, + "loss": 3.63, + "step": 3901 + }, + { + "epoch": 4.99072, + "grad_norm": 0.49885818362236023, + "learning_rate": 0.00015825817741284154, + "loss": 3.7375, + "step": 3902 + }, + { + "epoch": 4.992, + "grad_norm": 0.4667688310146332, + "learning_rate": 0.00015821779512720417, + "loss": 3.636, + "step": 3903 + }, + { + "epoch": 4.99328, + "grad_norm": 0.46380728483200073, + "learning_rate": 0.00015817741284156683, + "loss": 3.5965, + "step": 3904 + }, + { + "epoch": 4.99456, + "grad_norm": 0.5012388229370117, + "learning_rate": 0.00015813703055592946, + "loss": 3.7338, + "step": 3905 + }, + { + "epoch": 4.99584, + "grad_norm": 0.5032443404197693, + "learning_rate": 0.0001580966482702921, + "loss": 3.6502, + "step": 3906 + }, + { + "epoch": 4.99712, + "grad_norm": 0.46951258182525635, + "learning_rate": 0.00015805626598465472, + "loss": 3.6379, + "step": 3907 + }, + { + "epoch": 4.9984, + "grad_norm": 0.502934992313385, + "learning_rate": 0.00015801588369901737, + "loss": 3.6745, + "step": 3908 + }, + { + "epoch": 4.99968, + "grad_norm": 0.46453621983528137, + "learning_rate": 0.00015797550141337998, + "loss": 3.6497, + "step": 3909 + }, + { + "epoch": 5.0, + "grad_norm": 0.9125704169273376, + "learning_rate": 0.0001579351191277426, + "loss": 3.7091, + "step": 3910 + }, + { + "epoch": 5.00128, + "grad_norm": 0.5202215909957886, + "learning_rate": 0.00015789473684210524, + "loss": 3.5637, + "step": 3911 + }, + { + "epoch": 5.00256, + "grad_norm": 0.5078494548797607, + "learning_rate": 0.00015785435455646787, + "loss": 3.5149, + "step": 3912 + }, + { + "epoch": 5.00384, + "grad_norm": 0.463044673204422, + "learning_rate": 0.00015781397227083053, + "loss": 3.5784, + "step": 3913 + }, + { + "epoch": 5.00512, + "grad_norm": 0.5001516342163086, + "learning_rate": 0.00015777358998519316, + "loss": 3.5504, + "step": 3914 + }, + { + "epoch": 5.0064, + "grad_norm": 0.4898684620857239, + "learning_rate": 0.00015773320769955579, + "loss": 3.5146, + "step": 3915 + }, + { + "epoch": 5.00768, + "grad_norm": 0.5046645998954773, + "learning_rate": 0.0001576928254139184, + "loss": 3.5652, + "step": 3916 + }, + { + "epoch": 5.00896, + "grad_norm": 0.5051593780517578, + "learning_rate": 0.00015765244312828105, + "loss": 3.4985, + "step": 3917 + }, + { + "epoch": 5.01024, + "grad_norm": 0.49468353390693665, + "learning_rate": 0.00015761206084264368, + "loss": 3.5268, + "step": 3918 + }, + { + "epoch": 5.01152, + "grad_norm": 0.48806026577949524, + "learning_rate": 0.0001575716785570063, + "loss": 3.5752, + "step": 3919 + }, + { + "epoch": 5.0128, + "grad_norm": 0.48567843437194824, + "learning_rate": 0.00015753129627136894, + "loss": 3.5006, + "step": 3920 + }, + { + "epoch": 5.01408, + "grad_norm": 0.4762195348739624, + "learning_rate": 0.0001574909139857316, + "loss": 3.523, + "step": 3921 + }, + { + "epoch": 5.01536, + "grad_norm": 0.4921858012676239, + "learning_rate": 0.00015745053170009422, + "loss": 3.501, + "step": 3922 + }, + { + "epoch": 5.01664, + "grad_norm": 0.49300310015678406, + "learning_rate": 0.00015741014941445685, + "loss": 3.4951, + "step": 3923 + }, + { + "epoch": 5.01792, + "grad_norm": 0.5041300058364868, + "learning_rate": 0.00015736976712881946, + "loss": 3.5221, + "step": 3924 + }, + { + "epoch": 5.0192, + "grad_norm": 0.4857078790664673, + "learning_rate": 0.00015732938484318214, + "loss": 3.5616, + "step": 3925 + }, + { + "epoch": 5.02048, + "grad_norm": 0.49047553539276123, + "learning_rate": 0.00015728900255754474, + "loss": 3.4968, + "step": 3926 + }, + { + "epoch": 5.0217600000000004, + "grad_norm": 0.510486900806427, + "learning_rate": 0.00015724862027190737, + "loss": 3.5057, + "step": 3927 + }, + { + "epoch": 5.02304, + "grad_norm": 0.5110107064247131, + "learning_rate": 0.00015720823798627, + "loss": 3.5816, + "step": 3928 + }, + { + "epoch": 5.02432, + "grad_norm": 0.4852614104747772, + "learning_rate": 0.00015716785570063263, + "loss": 3.5548, + "step": 3929 + }, + { + "epoch": 5.0256, + "grad_norm": 0.4855445921421051, + "learning_rate": 0.0001571274734149953, + "loss": 3.4835, + "step": 3930 + }, + { + "epoch": 5.02688, + "grad_norm": 0.5051552057266235, + "learning_rate": 0.00015708709112935792, + "loss": 3.5332, + "step": 3931 + }, + { + "epoch": 5.02816, + "grad_norm": 0.4770444631576538, + "learning_rate": 0.00015704670884372052, + "loss": 3.5793, + "step": 3932 + }, + { + "epoch": 5.02944, + "grad_norm": 0.49611613154411316, + "learning_rate": 0.00015700632655808315, + "loss": 3.5763, + "step": 3933 + }, + { + "epoch": 5.03072, + "grad_norm": 0.5175138711929321, + "learning_rate": 0.0001569659442724458, + "loss": 3.5942, + "step": 3934 + }, + { + "epoch": 5.032, + "grad_norm": 0.5066589117050171, + "learning_rate": 0.00015692556198680844, + "loss": 3.5047, + "step": 3935 + }, + { + "epoch": 5.03328, + "grad_norm": 0.5006115436553955, + "learning_rate": 0.00015688517970117107, + "loss": 3.44, + "step": 3936 + }, + { + "epoch": 5.03456, + "grad_norm": 0.493075430393219, + "learning_rate": 0.0001568447974155337, + "loss": 3.591, + "step": 3937 + }, + { + "epoch": 5.03584, + "grad_norm": 0.5144281387329102, + "learning_rate": 0.00015680441512989636, + "loss": 3.6073, + "step": 3938 + }, + { + "epoch": 5.03712, + "grad_norm": 0.5123122930526733, + "learning_rate": 0.000156764032844259, + "loss": 3.5315, + "step": 3939 + }, + { + "epoch": 5.0384, + "grad_norm": 0.5100513696670532, + "learning_rate": 0.00015672365055862162, + "loss": 3.4975, + "step": 3940 + }, + { + "epoch": 5.03968, + "grad_norm": 0.5032030940055847, + "learning_rate": 0.00015668326827298422, + "loss": 3.58, + "step": 3941 + }, + { + "epoch": 5.04096, + "grad_norm": 0.4905944764614105, + "learning_rate": 0.00015664288598734685, + "loss": 3.6283, + "step": 3942 + }, + { + "epoch": 5.04224, + "grad_norm": 0.5140021443367004, + "learning_rate": 0.0001566025037017095, + "loss": 3.5188, + "step": 3943 + }, + { + "epoch": 5.04352, + "grad_norm": 0.4894171953201294, + "learning_rate": 0.00015656212141607214, + "loss": 3.4722, + "step": 3944 + }, + { + "epoch": 5.0448, + "grad_norm": 0.4924975335597992, + "learning_rate": 0.00015652173913043477, + "loss": 3.582, + "step": 3945 + }, + { + "epoch": 5.04608, + "grad_norm": 0.5047486424446106, + "learning_rate": 0.0001564813568447974, + "loss": 3.5844, + "step": 3946 + }, + { + "epoch": 5.04736, + "grad_norm": 0.5076228380203247, + "learning_rate": 0.00015644097455916006, + "loss": 3.4754, + "step": 3947 + }, + { + "epoch": 5.04864, + "grad_norm": 0.48156487941741943, + "learning_rate": 0.00015640059227352269, + "loss": 3.5801, + "step": 3948 + }, + { + "epoch": 5.04992, + "grad_norm": 0.52396559715271, + "learning_rate": 0.0001563602099878853, + "loss": 3.5727, + "step": 3949 + }, + { + "epoch": 5.0512, + "grad_norm": 0.5130919814109802, + "learning_rate": 0.00015631982770224792, + "loss": 3.5455, + "step": 3950 + }, + { + "epoch": 5.05248, + "grad_norm": 0.5077847242355347, + "learning_rate": 0.00015627944541661058, + "loss": 3.574, + "step": 3951 + }, + { + "epoch": 5.05376, + "grad_norm": 0.5045159459114075, + "learning_rate": 0.0001562390631309732, + "loss": 3.5511, + "step": 3952 + }, + { + "epoch": 5.05504, + "grad_norm": 0.5126574635505676, + "learning_rate": 0.00015619868084533584, + "loss": 3.6289, + "step": 3953 + }, + { + "epoch": 5.05632, + "grad_norm": 0.4960666000843048, + "learning_rate": 0.00015615829855969847, + "loss": 3.5108, + "step": 3954 + }, + { + "epoch": 5.0576, + "grad_norm": 0.49016058444976807, + "learning_rate": 0.0001561179162740611, + "loss": 3.5506, + "step": 3955 + }, + { + "epoch": 5.05888, + "grad_norm": 0.5145988464355469, + "learning_rate": 0.00015607753398842375, + "loss": 3.5351, + "step": 3956 + }, + { + "epoch": 5.06016, + "grad_norm": 0.495882123708725, + "learning_rate": 0.00015603715170278636, + "loss": 3.5469, + "step": 3957 + }, + { + "epoch": 5.06144, + "grad_norm": 0.514789879322052, + "learning_rate": 0.000155996769417149, + "loss": 3.5172, + "step": 3958 + }, + { + "epoch": 5.06272, + "grad_norm": 0.4931298494338989, + "learning_rate": 0.00015595638713151162, + "loss": 3.5787, + "step": 3959 + }, + { + "epoch": 5.064, + "grad_norm": 0.47730547189712524, + "learning_rate": 0.00015591600484587427, + "loss": 3.5478, + "step": 3960 + }, + { + "epoch": 5.06528, + "grad_norm": 0.534153938293457, + "learning_rate": 0.0001558756225602369, + "loss": 3.5449, + "step": 3961 + }, + { + "epoch": 5.06656, + "grad_norm": 0.4863246977329254, + "learning_rate": 0.00015583524027459953, + "loss": 3.4829, + "step": 3962 + }, + { + "epoch": 5.06784, + "grad_norm": 0.5165325999259949, + "learning_rate": 0.00015579485798896216, + "loss": 3.5866, + "step": 3963 + }, + { + "epoch": 5.06912, + "grad_norm": 0.5204668641090393, + "learning_rate": 0.00015575447570332482, + "loss": 3.5346, + "step": 3964 + }, + { + "epoch": 5.0704, + "grad_norm": 0.5140581130981445, + "learning_rate": 0.00015571409341768742, + "loss": 3.5652, + "step": 3965 + }, + { + "epoch": 5.07168, + "grad_norm": 0.5002838373184204, + "learning_rate": 0.00015567371113205005, + "loss": 3.5275, + "step": 3966 + }, + { + "epoch": 5.07296, + "grad_norm": 0.48074761033058167, + "learning_rate": 0.00015563332884641268, + "loss": 3.5435, + "step": 3967 + }, + { + "epoch": 5.07424, + "grad_norm": 0.49693623185157776, + "learning_rate": 0.00015559294656077531, + "loss": 3.5735, + "step": 3968 + }, + { + "epoch": 5.07552, + "grad_norm": 0.5019185543060303, + "learning_rate": 0.00015555256427513797, + "loss": 3.5511, + "step": 3969 + }, + { + "epoch": 5.0768, + "grad_norm": 0.5159568190574646, + "learning_rate": 0.0001555121819895006, + "loss": 3.5598, + "step": 3970 + }, + { + "epoch": 5.07808, + "grad_norm": 0.49248459935188293, + "learning_rate": 0.00015547179970386323, + "loss": 3.594, + "step": 3971 + }, + { + "epoch": 5.07936, + "grad_norm": 0.5076256990432739, + "learning_rate": 0.00015543141741822584, + "loss": 3.586, + "step": 3972 + }, + { + "epoch": 5.08064, + "grad_norm": 0.5257125496864319, + "learning_rate": 0.0001553910351325885, + "loss": 3.5359, + "step": 3973 + }, + { + "epoch": 5.08192, + "grad_norm": 0.48996734619140625, + "learning_rate": 0.00015535065284695112, + "loss": 3.5319, + "step": 3974 + }, + { + "epoch": 5.0832, + "grad_norm": 0.510186493396759, + "learning_rate": 0.00015531027056131375, + "loss": 3.5718, + "step": 3975 + }, + { + "epoch": 5.08448, + "grad_norm": 0.49666544795036316, + "learning_rate": 0.00015526988827567638, + "loss": 3.5401, + "step": 3976 + }, + { + "epoch": 5.08576, + "grad_norm": 0.5227068066596985, + "learning_rate": 0.00015522950599003904, + "loss": 3.6226, + "step": 3977 + }, + { + "epoch": 5.08704, + "grad_norm": 0.5159752368927002, + "learning_rate": 0.00015518912370440167, + "loss": 3.5353, + "step": 3978 + }, + { + "epoch": 5.08832, + "grad_norm": 0.49110716581344604, + "learning_rate": 0.0001551487414187643, + "loss": 3.4429, + "step": 3979 + }, + { + "epoch": 5.0896, + "grad_norm": 0.5033590793609619, + "learning_rate": 0.0001551083591331269, + "loss": 3.5877, + "step": 3980 + }, + { + "epoch": 5.09088, + "grad_norm": 0.5176244974136353, + "learning_rate": 0.00015506797684748953, + "loss": 3.5388, + "step": 3981 + }, + { + "epoch": 5.09216, + "grad_norm": 0.5007118582725525, + "learning_rate": 0.0001550275945618522, + "loss": 3.5527, + "step": 3982 + }, + { + "epoch": 5.09344, + "grad_norm": 0.4941405653953552, + "learning_rate": 0.00015498721227621482, + "loss": 3.5253, + "step": 3983 + }, + { + "epoch": 5.09472, + "grad_norm": 0.49819836020469666, + "learning_rate": 0.00015494682999057745, + "loss": 3.5868, + "step": 3984 + }, + { + "epoch": 5.096, + "grad_norm": 0.4690377712249756, + "learning_rate": 0.00015490644770494008, + "loss": 3.521, + "step": 3985 + }, + { + "epoch": 5.09728, + "grad_norm": 0.5075995326042175, + "learning_rate": 0.00015486606541930274, + "loss": 3.4984, + "step": 3986 + }, + { + "epoch": 5.09856, + "grad_norm": 0.4840170741081238, + "learning_rate": 0.00015482568313366537, + "loss": 3.5351, + "step": 3987 + }, + { + "epoch": 5.09984, + "grad_norm": 0.5013239979743958, + "learning_rate": 0.00015478530084802797, + "loss": 3.5958, + "step": 3988 + }, + { + "epoch": 5.10112, + "grad_norm": 0.500015377998352, + "learning_rate": 0.0001547449185623906, + "loss": 3.4832, + "step": 3989 + }, + { + "epoch": 5.1024, + "grad_norm": 0.5128130316734314, + "learning_rate": 0.00015470453627675326, + "loss": 3.5816, + "step": 3990 + }, + { + "epoch": 5.10368, + "grad_norm": 0.5172329545021057, + "learning_rate": 0.0001546641539911159, + "loss": 3.474, + "step": 3991 + }, + { + "epoch": 5.10496, + "grad_norm": 0.4870198369026184, + "learning_rate": 0.00015462377170547852, + "loss": 3.5239, + "step": 3992 + }, + { + "epoch": 5.10624, + "grad_norm": 0.5066513419151306, + "learning_rate": 0.00015458338941984115, + "loss": 3.6001, + "step": 3993 + }, + { + "epoch": 5.10752, + "grad_norm": 0.5186413526535034, + "learning_rate": 0.00015454300713420378, + "loss": 3.5248, + "step": 3994 + }, + { + "epoch": 5.1088, + "grad_norm": 0.4880644679069519, + "learning_rate": 0.00015450262484856644, + "loss": 3.5703, + "step": 3995 + }, + { + "epoch": 5.11008, + "grad_norm": 0.5023115277290344, + "learning_rate": 0.00015446224256292904, + "loss": 3.6021, + "step": 3996 + }, + { + "epoch": 5.11136, + "grad_norm": 0.490438312292099, + "learning_rate": 0.00015442186027729167, + "loss": 3.4991, + "step": 3997 + }, + { + "epoch": 5.11264, + "grad_norm": 0.48022663593292236, + "learning_rate": 0.0001543814779916543, + "loss": 3.411, + "step": 3998 + }, + { + "epoch": 5.11392, + "grad_norm": 0.4824524521827698, + "learning_rate": 0.00015434109570601696, + "loss": 3.5548, + "step": 3999 + }, + { + "epoch": 5.1152, + "grad_norm": 0.48679637908935547, + "learning_rate": 0.00015430071342037959, + "loss": 3.4643, + "step": 4000 + }, + { + "epoch": 5.11648, + "grad_norm": 0.5010484457015991, + "learning_rate": 0.00015426033113474222, + "loss": 3.5182, + "step": 4001 + }, + { + "epoch": 5.11776, + "grad_norm": 0.49517738819122314, + "learning_rate": 0.00015421994884910485, + "loss": 3.5063, + "step": 4002 + }, + { + "epoch": 5.11904, + "grad_norm": 0.5106388926506042, + "learning_rate": 0.0001541795665634675, + "loss": 3.6582, + "step": 4003 + }, + { + "epoch": 5.12032, + "grad_norm": 0.4887462556362152, + "learning_rate": 0.0001541391842778301, + "loss": 3.6125, + "step": 4004 + }, + { + "epoch": 5.1216, + "grad_norm": 0.4951592981815338, + "learning_rate": 0.00015409880199219274, + "loss": 3.5286, + "step": 4005 + }, + { + "epoch": 5.12288, + "grad_norm": 0.5019987225532532, + "learning_rate": 0.00015405841970655537, + "loss": 3.5732, + "step": 4006 + }, + { + "epoch": 5.12416, + "grad_norm": 0.48524710536003113, + "learning_rate": 0.000154018037420918, + "loss": 3.5385, + "step": 4007 + }, + { + "epoch": 5.12544, + "grad_norm": 0.5041416883468628, + "learning_rate": 0.00015397765513528065, + "loss": 3.6019, + "step": 4008 + }, + { + "epoch": 5.12672, + "grad_norm": 0.48453205823898315, + "learning_rate": 0.00015393727284964328, + "loss": 3.5418, + "step": 4009 + }, + { + "epoch": 5.128, + "grad_norm": 0.5173502564430237, + "learning_rate": 0.0001538968905640059, + "loss": 3.597, + "step": 4010 + }, + { + "epoch": 5.12928, + "grad_norm": 0.5132037997245789, + "learning_rate": 0.00015385650827836852, + "loss": 3.4569, + "step": 4011 + }, + { + "epoch": 5.13056, + "grad_norm": 0.5146015286445618, + "learning_rate": 0.0001538161259927312, + "loss": 3.5441, + "step": 4012 + }, + { + "epoch": 5.13184, + "grad_norm": 0.5107573866844177, + "learning_rate": 0.0001537757437070938, + "loss": 3.569, + "step": 4013 + }, + { + "epoch": 5.13312, + "grad_norm": 0.5163334012031555, + "learning_rate": 0.00015373536142145643, + "loss": 3.6092, + "step": 4014 + }, + { + "epoch": 5.1344, + "grad_norm": 0.512444019317627, + "learning_rate": 0.00015369497913581906, + "loss": 3.6463, + "step": 4015 + }, + { + "epoch": 5.13568, + "grad_norm": 0.5129110813140869, + "learning_rate": 0.00015365459685018172, + "loss": 3.5649, + "step": 4016 + }, + { + "epoch": 5.13696, + "grad_norm": 0.52217698097229, + "learning_rate": 0.00015361421456454435, + "loss": 3.5541, + "step": 4017 + }, + { + "epoch": 5.13824, + "grad_norm": 0.5022246241569519, + "learning_rate": 0.00015357383227890698, + "loss": 3.4864, + "step": 4018 + }, + { + "epoch": 5.13952, + "grad_norm": 0.5264055132865906, + "learning_rate": 0.00015353344999326958, + "loss": 3.5228, + "step": 4019 + }, + { + "epoch": 5.1408, + "grad_norm": 0.5241397023200989, + "learning_rate": 0.00015349306770763221, + "loss": 3.5233, + "step": 4020 + }, + { + "epoch": 5.14208, + "grad_norm": 0.6818204522132874, + "learning_rate": 0.00015345268542199487, + "loss": 3.639, + "step": 4021 + }, + { + "epoch": 5.14336, + "grad_norm": 0.4898463785648346, + "learning_rate": 0.0001534123031363575, + "loss": 3.4437, + "step": 4022 + }, + { + "epoch": 5.14464, + "grad_norm": 0.521457314491272, + "learning_rate": 0.00015337192085072013, + "loss": 3.5246, + "step": 4023 + }, + { + "epoch": 5.14592, + "grad_norm": 0.5136117935180664, + "learning_rate": 0.00015333153856508276, + "loss": 3.5557, + "step": 4024 + }, + { + "epoch": 5.1472, + "grad_norm": 0.49397048354148865, + "learning_rate": 0.00015329115627944542, + "loss": 3.4808, + "step": 4025 + }, + { + "epoch": 5.14848, + "grad_norm": 0.5146060585975647, + "learning_rate": 0.00015325077399380805, + "loss": 3.518, + "step": 4026 + }, + { + "epoch": 5.14976, + "grad_norm": 0.5058059692382812, + "learning_rate": 0.00015321039170817068, + "loss": 3.5399, + "step": 4027 + }, + { + "epoch": 5.15104, + "grad_norm": 0.4919331967830658, + "learning_rate": 0.00015317000942253328, + "loss": 3.5817, + "step": 4028 + }, + { + "epoch": 5.15232, + "grad_norm": 0.4798501133918762, + "learning_rate": 0.00015312962713689594, + "loss": 3.5386, + "step": 4029 + }, + { + "epoch": 5.1536, + "grad_norm": 0.49712446331977844, + "learning_rate": 0.00015308924485125857, + "loss": 3.532, + "step": 4030 + }, + { + "epoch": 5.15488, + "grad_norm": 0.5078490376472473, + "learning_rate": 0.0001530488625656212, + "loss": 3.5602, + "step": 4031 + }, + { + "epoch": 5.15616, + "grad_norm": 0.49252021312713623, + "learning_rate": 0.00015300848027998383, + "loss": 3.5433, + "step": 4032 + }, + { + "epoch": 5.15744, + "grad_norm": 0.5331763029098511, + "learning_rate": 0.00015296809799434646, + "loss": 3.5577, + "step": 4033 + }, + { + "epoch": 5.15872, + "grad_norm": 0.4990587830543518, + "learning_rate": 0.00015292771570870912, + "loss": 3.559, + "step": 4034 + }, + { + "epoch": 5.16, + "grad_norm": 0.5130864977836609, + "learning_rate": 0.00015288733342307175, + "loss": 3.5403, + "step": 4035 + }, + { + "epoch": 5.16128, + "grad_norm": 0.4923115372657776, + "learning_rate": 0.00015284695113743435, + "loss": 3.4671, + "step": 4036 + }, + { + "epoch": 5.16256, + "grad_norm": 0.5185816287994385, + "learning_rate": 0.00015280656885179698, + "loss": 3.6086, + "step": 4037 + }, + { + "epoch": 5.16384, + "grad_norm": 0.5011401176452637, + "learning_rate": 0.00015276618656615964, + "loss": 3.5315, + "step": 4038 + }, + { + "epoch": 5.16512, + "grad_norm": 0.5076486468315125, + "learning_rate": 0.00015272580428052227, + "loss": 3.5743, + "step": 4039 + }, + { + "epoch": 5.1664, + "grad_norm": 0.49623793363571167, + "learning_rate": 0.0001526854219948849, + "loss": 3.6119, + "step": 4040 + }, + { + "epoch": 5.16768, + "grad_norm": 0.5004082918167114, + "learning_rate": 0.00015264503970924753, + "loss": 3.4876, + "step": 4041 + }, + { + "epoch": 5.16896, + "grad_norm": 0.4831116199493408, + "learning_rate": 0.00015260465742361018, + "loss": 3.5508, + "step": 4042 + }, + { + "epoch": 5.17024, + "grad_norm": 0.5067176818847656, + "learning_rate": 0.00015256427513797281, + "loss": 3.56, + "step": 4043 + }, + { + "epoch": 5.17152, + "grad_norm": 0.5060960650444031, + "learning_rate": 0.00015252389285233542, + "loss": 3.5146, + "step": 4044 + }, + { + "epoch": 5.1728, + "grad_norm": 0.5012076497077942, + "learning_rate": 0.00015248351056669805, + "loss": 3.5358, + "step": 4045 + }, + { + "epoch": 5.17408, + "grad_norm": 0.48700445890426636, + "learning_rate": 0.0001524431282810607, + "loss": 3.6407, + "step": 4046 + }, + { + "epoch": 5.17536, + "grad_norm": 0.49862363934516907, + "learning_rate": 0.00015240274599542333, + "loss": 3.5476, + "step": 4047 + }, + { + "epoch": 5.17664, + "grad_norm": 0.5184906721115112, + "learning_rate": 0.00015236236370978596, + "loss": 3.6116, + "step": 4048 + }, + { + "epoch": 5.17792, + "grad_norm": 0.4845800995826721, + "learning_rate": 0.0001523219814241486, + "loss": 3.4903, + "step": 4049 + }, + { + "epoch": 5.1792, + "grad_norm": 0.5097942352294922, + "learning_rate": 0.00015228159913851122, + "loss": 3.5238, + "step": 4050 + }, + { + "epoch": 5.18048, + "grad_norm": 0.5073378086090088, + "learning_rate": 0.00015224121685287388, + "loss": 3.6045, + "step": 4051 + }, + { + "epoch": 5.18176, + "grad_norm": 0.4886874258518219, + "learning_rate": 0.00015220083456723649, + "loss": 3.5069, + "step": 4052 + }, + { + "epoch": 5.18304, + "grad_norm": 0.49647676944732666, + "learning_rate": 0.00015216045228159912, + "loss": 3.5864, + "step": 4053 + }, + { + "epoch": 5.18432, + "grad_norm": 0.4973941445350647, + "learning_rate": 0.00015212006999596175, + "loss": 3.5571, + "step": 4054 + }, + { + "epoch": 5.1856, + "grad_norm": 0.5050147175788879, + "learning_rate": 0.0001520796877103244, + "loss": 3.5203, + "step": 4055 + }, + { + "epoch": 5.18688, + "grad_norm": 0.48217907547950745, + "learning_rate": 0.00015203930542468703, + "loss": 3.5327, + "step": 4056 + }, + { + "epoch": 5.18816, + "grad_norm": 0.5200319290161133, + "learning_rate": 0.00015199892313904966, + "loss": 3.6177, + "step": 4057 + }, + { + "epoch": 5.18944, + "grad_norm": 0.4934719204902649, + "learning_rate": 0.0001519585408534123, + "loss": 3.5692, + "step": 4058 + }, + { + "epoch": 5.19072, + "grad_norm": 0.5016127824783325, + "learning_rate": 0.00015191815856777495, + "loss": 3.5915, + "step": 4059 + }, + { + "epoch": 5.192, + "grad_norm": 0.502185046672821, + "learning_rate": 0.00015187777628213755, + "loss": 3.5411, + "step": 4060 + }, + { + "epoch": 5.19328, + "grad_norm": 0.489409863948822, + "learning_rate": 0.00015183739399650018, + "loss": 3.5464, + "step": 4061 + }, + { + "epoch": 5.19456, + "grad_norm": 0.5016448497772217, + "learning_rate": 0.0001517970117108628, + "loss": 3.5695, + "step": 4062 + }, + { + "epoch": 5.19584, + "grad_norm": 0.49411389231681824, + "learning_rate": 0.00015175662942522544, + "loss": 3.5293, + "step": 4063 + }, + { + "epoch": 5.19712, + "grad_norm": 0.5000140070915222, + "learning_rate": 0.0001517162471395881, + "loss": 3.5094, + "step": 4064 + }, + { + "epoch": 5.1984, + "grad_norm": 0.49250856041908264, + "learning_rate": 0.00015167586485395073, + "loss": 3.5954, + "step": 4065 + }, + { + "epoch": 5.19968, + "grad_norm": 0.48768413066864014, + "learning_rate": 0.00015163548256831336, + "loss": 3.4345, + "step": 4066 + }, + { + "epoch": 5.20096, + "grad_norm": 0.506270170211792, + "learning_rate": 0.00015159510028267596, + "loss": 3.5516, + "step": 4067 + }, + { + "epoch": 5.20224, + "grad_norm": 0.5164130926132202, + "learning_rate": 0.00015155471799703862, + "loss": 3.558, + "step": 4068 + }, + { + "epoch": 5.20352, + "grad_norm": 0.5122447609901428, + "learning_rate": 0.00015151433571140125, + "loss": 3.596, + "step": 4069 + }, + { + "epoch": 5.2048, + "grad_norm": 0.5188588500022888, + "learning_rate": 0.00015147395342576388, + "loss": 3.5264, + "step": 4070 + }, + { + "epoch": 5.20608, + "grad_norm": 0.49674516916275024, + "learning_rate": 0.0001514335711401265, + "loss": 3.5941, + "step": 4071 + }, + { + "epoch": 5.2073599999999995, + "grad_norm": 0.5053087472915649, + "learning_rate": 0.00015139318885448917, + "loss": 3.5922, + "step": 4072 + }, + { + "epoch": 5.20864, + "grad_norm": 0.4910738468170166, + "learning_rate": 0.0001513528065688518, + "loss": 3.516, + "step": 4073 + }, + { + "epoch": 5.20992, + "grad_norm": 0.48027095198631287, + "learning_rate": 0.00015131242428321443, + "loss": 3.4961, + "step": 4074 + }, + { + "epoch": 5.2112, + "grad_norm": 0.5024387240409851, + "learning_rate": 0.00015127204199757703, + "loss": 3.4995, + "step": 4075 + }, + { + "epoch": 5.21248, + "grad_norm": 0.5177842378616333, + "learning_rate": 0.00015123165971193966, + "loss": 3.5783, + "step": 4076 + }, + { + "epoch": 5.21376, + "grad_norm": 0.4885920584201813, + "learning_rate": 0.00015119127742630232, + "loss": 3.6155, + "step": 4077 + }, + { + "epoch": 5.21504, + "grad_norm": 0.5077448487281799, + "learning_rate": 0.00015115089514066495, + "loss": 3.581, + "step": 4078 + }, + { + "epoch": 5.21632, + "grad_norm": 0.48989179730415344, + "learning_rate": 0.00015111051285502758, + "loss": 3.5222, + "step": 4079 + }, + { + "epoch": 5.2176, + "grad_norm": 0.4760512113571167, + "learning_rate": 0.0001510701305693902, + "loss": 3.5481, + "step": 4080 + }, + { + "epoch": 5.21888, + "grad_norm": 0.49710988998413086, + "learning_rate": 0.00015102974828375287, + "loss": 3.5808, + "step": 4081 + }, + { + "epoch": 5.22016, + "grad_norm": 0.49187228083610535, + "learning_rate": 0.0001509893659981155, + "loss": 3.5191, + "step": 4082 + }, + { + "epoch": 5.22144, + "grad_norm": 0.5187274813652039, + "learning_rate": 0.0001509489837124781, + "loss": 3.6189, + "step": 4083 + }, + { + "epoch": 5.22272, + "grad_norm": 0.523126482963562, + "learning_rate": 0.00015090860142684073, + "loss": 3.6136, + "step": 4084 + }, + { + "epoch": 5.224, + "grad_norm": 0.49129125475883484, + "learning_rate": 0.00015086821914120339, + "loss": 3.5228, + "step": 4085 + }, + { + "epoch": 5.22528, + "grad_norm": 0.5184364318847656, + "learning_rate": 0.00015082783685556602, + "loss": 3.5247, + "step": 4086 + }, + { + "epoch": 5.22656, + "grad_norm": 0.5139320492744446, + "learning_rate": 0.00015078745456992865, + "loss": 3.5408, + "step": 4087 + }, + { + "epoch": 5.22784, + "grad_norm": 0.5081346035003662, + "learning_rate": 0.00015074707228429128, + "loss": 3.6262, + "step": 4088 + }, + { + "epoch": 5.22912, + "grad_norm": 0.5397776365280151, + "learning_rate": 0.0001507066899986539, + "loss": 3.5904, + "step": 4089 + }, + { + "epoch": 5.2304, + "grad_norm": 0.5284587740898132, + "learning_rate": 0.00015066630771301656, + "loss": 3.5313, + "step": 4090 + }, + { + "epoch": 5.23168, + "grad_norm": 0.4830555319786072, + "learning_rate": 0.00015062592542737917, + "loss": 3.5611, + "step": 4091 + }, + { + "epoch": 5.23296, + "grad_norm": 0.5219169855117798, + "learning_rate": 0.0001505855431417418, + "loss": 3.6486, + "step": 4092 + }, + { + "epoch": 5.23424, + "grad_norm": 0.5159496068954468, + "learning_rate": 0.00015054516085610443, + "loss": 3.4704, + "step": 4093 + }, + { + "epoch": 5.23552, + "grad_norm": 0.530392587184906, + "learning_rate": 0.00015050477857046708, + "loss": 3.517, + "step": 4094 + }, + { + "epoch": 5.2368, + "grad_norm": 0.5039182901382446, + "learning_rate": 0.00015046439628482971, + "loss": 3.5843, + "step": 4095 + }, + { + "epoch": 5.23808, + "grad_norm": 0.5051393508911133, + "learning_rate": 0.00015042401399919234, + "loss": 3.5637, + "step": 4096 + }, + { + "epoch": 5.23936, + "grad_norm": 0.5062666535377502, + "learning_rate": 0.00015038363171355497, + "loss": 3.5598, + "step": 4097 + }, + { + "epoch": 5.24064, + "grad_norm": 0.5078728199005127, + "learning_rate": 0.00015034324942791763, + "loss": 3.5031, + "step": 4098 + }, + { + "epoch": 5.24192, + "grad_norm": 0.5176907181739807, + "learning_rate": 0.00015030286714228023, + "loss": 3.5606, + "step": 4099 + }, + { + "epoch": 5.2432, + "grad_norm": 0.5005961060523987, + "learning_rate": 0.00015026248485664286, + "loss": 3.658, + "step": 4100 + }, + { + "epoch": 5.24448, + "grad_norm": 0.5151212811470032, + "learning_rate": 0.0001502221025710055, + "loss": 3.6295, + "step": 4101 + }, + { + "epoch": 5.24576, + "grad_norm": 0.5224319100379944, + "learning_rate": 0.00015018172028536812, + "loss": 3.5873, + "step": 4102 + }, + { + "epoch": 5.24704, + "grad_norm": 0.5269538164138794, + "learning_rate": 0.00015014133799973078, + "loss": 3.5697, + "step": 4103 + }, + { + "epoch": 5.24832, + "grad_norm": 0.4943110942840576, + "learning_rate": 0.0001501009557140934, + "loss": 3.4694, + "step": 4104 + }, + { + "epoch": 5.2496, + "grad_norm": 0.5208888649940491, + "learning_rate": 0.00015006057342845604, + "loss": 3.5701, + "step": 4105 + }, + { + "epoch": 5.25088, + "grad_norm": 0.5066584944725037, + "learning_rate": 0.00015002019114281864, + "loss": 3.6059, + "step": 4106 + }, + { + "epoch": 5.25216, + "grad_norm": 0.5034709572792053, + "learning_rate": 0.0001499798088571813, + "loss": 3.5807, + "step": 4107 + }, + { + "epoch": 5.25344, + "grad_norm": 0.49684327840805054, + "learning_rate": 0.00014993942657154393, + "loss": 3.5518, + "step": 4108 + }, + { + "epoch": 5.25472, + "grad_norm": 0.5177910327911377, + "learning_rate": 0.00014989904428590656, + "loss": 3.5615, + "step": 4109 + }, + { + "epoch": 5.256, + "grad_norm": 0.5041977167129517, + "learning_rate": 0.00014985866200026922, + "loss": 3.5278, + "step": 4110 + }, + { + "epoch": 5.25728, + "grad_norm": 0.5060576796531677, + "learning_rate": 0.00014981827971463182, + "loss": 3.5217, + "step": 4111 + }, + { + "epoch": 5.25856, + "grad_norm": 0.5150559544563293, + "learning_rate": 0.00014977789742899448, + "loss": 3.5016, + "step": 4112 + }, + { + "epoch": 5.25984, + "grad_norm": 0.48947054147720337, + "learning_rate": 0.0001497375151433571, + "loss": 3.5498, + "step": 4113 + }, + { + "epoch": 5.26112, + "grad_norm": 0.5176462531089783, + "learning_rate": 0.00014969713285771974, + "loss": 3.5463, + "step": 4114 + }, + { + "epoch": 5.2624, + "grad_norm": 0.501726508140564, + "learning_rate": 0.00014965675057208237, + "loss": 3.5083, + "step": 4115 + }, + { + "epoch": 5.26368, + "grad_norm": 0.5025134086608887, + "learning_rate": 0.000149616368286445, + "loss": 3.5474, + "step": 4116 + }, + { + "epoch": 5.26496, + "grad_norm": 0.4934270679950714, + "learning_rate": 0.00014957598600080763, + "loss": 3.5807, + "step": 4117 + }, + { + "epoch": 5.26624, + "grad_norm": 0.49150562286376953, + "learning_rate": 0.00014953560371517026, + "loss": 3.5932, + "step": 4118 + }, + { + "epoch": 5.26752, + "grad_norm": 0.4923061728477478, + "learning_rate": 0.0001494952214295329, + "loss": 3.5125, + "step": 4119 + }, + { + "epoch": 5.2688, + "grad_norm": 0.506479024887085, + "learning_rate": 0.00014945483914389552, + "loss": 3.6409, + "step": 4120 + }, + { + "epoch": 5.27008, + "grad_norm": 0.47946614027023315, + "learning_rate": 0.00014941445685825818, + "loss": 3.5563, + "step": 4121 + }, + { + "epoch": 5.27136, + "grad_norm": 0.5117473006248474, + "learning_rate": 0.0001493740745726208, + "loss": 3.5387, + "step": 4122 + }, + { + "epoch": 5.27264, + "grad_norm": 0.49644434452056885, + "learning_rate": 0.00014933369228698344, + "loss": 3.6186, + "step": 4123 + }, + { + "epoch": 5.27392, + "grad_norm": 0.5136969685554504, + "learning_rate": 0.00014929331000134607, + "loss": 3.6033, + "step": 4124 + }, + { + "epoch": 5.2752, + "grad_norm": 0.5244076251983643, + "learning_rate": 0.0001492529277157087, + "loss": 3.6165, + "step": 4125 + }, + { + "epoch": 5.27648, + "grad_norm": 0.49152183532714844, + "learning_rate": 0.00014921254543007133, + "loss": 3.4846, + "step": 4126 + }, + { + "epoch": 5.27776, + "grad_norm": 0.49941569566726685, + "learning_rate": 0.00014917216314443396, + "loss": 3.6035, + "step": 4127 + }, + { + "epoch": 5.27904, + "grad_norm": 0.502313494682312, + "learning_rate": 0.0001491317808587966, + "loss": 3.603, + "step": 4128 + }, + { + "epoch": 5.28032, + "grad_norm": 0.5102902054786682, + "learning_rate": 0.00014909139857315922, + "loss": 3.6138, + "step": 4129 + }, + { + "epoch": 5.2816, + "grad_norm": 0.5135358572006226, + "learning_rate": 0.00014905101628752187, + "loss": 3.5688, + "step": 4130 + }, + { + "epoch": 5.2828800000000005, + "grad_norm": 0.4999293386936188, + "learning_rate": 0.00014901063400188448, + "loss": 3.536, + "step": 4131 + }, + { + "epoch": 5.28416, + "grad_norm": 0.49614301323890686, + "learning_rate": 0.00014897025171624714, + "loss": 3.5474, + "step": 4132 + }, + { + "epoch": 5.28544, + "grad_norm": 0.5085309147834778, + "learning_rate": 0.00014892986943060977, + "loss": 3.6116, + "step": 4133 + }, + { + "epoch": 5.28672, + "grad_norm": 0.479676753282547, + "learning_rate": 0.0001488894871449724, + "loss": 3.5612, + "step": 4134 + }, + { + "epoch": 5.288, + "grad_norm": 0.4859643280506134, + "learning_rate": 0.00014884910485933503, + "loss": 3.5404, + "step": 4135 + }, + { + "epoch": 5.28928, + "grad_norm": 0.49102458357810974, + "learning_rate": 0.00014880872257369766, + "loss": 3.567, + "step": 4136 + }, + { + "epoch": 5.29056, + "grad_norm": 0.4906257688999176, + "learning_rate": 0.00014876834028806029, + "loss": 3.57, + "step": 4137 + }, + { + "epoch": 5.29184, + "grad_norm": 0.48179754614830017, + "learning_rate": 0.00014872795800242294, + "loss": 3.5837, + "step": 4138 + }, + { + "epoch": 5.29312, + "grad_norm": 0.48550358414649963, + "learning_rate": 0.00014868757571678555, + "loss": 3.4787, + "step": 4139 + }, + { + "epoch": 5.2943999999999996, + "grad_norm": 0.49071556329727173, + "learning_rate": 0.0001486471934311482, + "loss": 3.5332, + "step": 4140 + }, + { + "epoch": 5.29568, + "grad_norm": 0.4878070652484894, + "learning_rate": 0.00014860681114551083, + "loss": 3.5856, + "step": 4141 + }, + { + "epoch": 5.29696, + "grad_norm": 0.4919761121273041, + "learning_rate": 0.00014856642885987344, + "loss": 3.5238, + "step": 4142 + }, + { + "epoch": 5.29824, + "grad_norm": 0.502760648727417, + "learning_rate": 0.0001485260465742361, + "loss": 3.5677, + "step": 4143 + }, + { + "epoch": 5.29952, + "grad_norm": 0.5055625438690186, + "learning_rate": 0.00014848566428859872, + "loss": 3.6354, + "step": 4144 + }, + { + "epoch": 5.3008, + "grad_norm": 0.4847923517227173, + "learning_rate": 0.00014844528200296135, + "loss": 3.5031, + "step": 4145 + }, + { + "epoch": 5.30208, + "grad_norm": 0.4974401593208313, + "learning_rate": 0.00014840489971732398, + "loss": 3.5579, + "step": 4146 + }, + { + "epoch": 5.30336, + "grad_norm": 0.4971027076244354, + "learning_rate": 0.0001483645174316866, + "loss": 3.5142, + "step": 4147 + }, + { + "epoch": 5.30464, + "grad_norm": 0.5040155649185181, + "learning_rate": 0.00014832413514604924, + "loss": 3.629, + "step": 4148 + }, + { + "epoch": 5.30592, + "grad_norm": 0.5108280181884766, + "learning_rate": 0.0001482837528604119, + "loss": 3.5502, + "step": 4149 + }, + { + "epoch": 5.3072, + "grad_norm": 0.5081576704978943, + "learning_rate": 0.0001482433705747745, + "loss": 3.5522, + "step": 4150 + }, + { + "epoch": 5.30848, + "grad_norm": 0.49711495637893677, + "learning_rate": 0.00014820298828913716, + "loss": 3.5817, + "step": 4151 + }, + { + "epoch": 5.30976, + "grad_norm": 0.4980352520942688, + "learning_rate": 0.0001481626060034998, + "loss": 3.5515, + "step": 4152 + }, + { + "epoch": 5.31104, + "grad_norm": 0.5169786214828491, + "learning_rate": 0.00014812222371786242, + "loss": 3.5684, + "step": 4153 + }, + { + "epoch": 5.31232, + "grad_norm": 0.500923752784729, + "learning_rate": 0.00014808184143222505, + "loss": 3.5626, + "step": 4154 + }, + { + "epoch": 5.3136, + "grad_norm": 0.4993918836116791, + "learning_rate": 0.00014804145914658768, + "loss": 3.5241, + "step": 4155 + }, + { + "epoch": 5.31488, + "grad_norm": 0.5014583468437195, + "learning_rate": 0.0001480010768609503, + "loss": 3.5245, + "step": 4156 + }, + { + "epoch": 5.31616, + "grad_norm": 0.5105462670326233, + "learning_rate": 0.00014796069457531294, + "loss": 3.5115, + "step": 4157 + }, + { + "epoch": 5.31744, + "grad_norm": 0.506993293762207, + "learning_rate": 0.0001479203122896756, + "loss": 3.5967, + "step": 4158 + }, + { + "epoch": 5.31872, + "grad_norm": 0.4856204688549042, + "learning_rate": 0.0001478799300040382, + "loss": 3.5997, + "step": 4159 + }, + { + "epoch": 5.32, + "grad_norm": 0.5217317938804626, + "learning_rate": 0.00014783954771840086, + "loss": 3.5199, + "step": 4160 + }, + { + "epoch": 5.32128, + "grad_norm": 0.4968002140522003, + "learning_rate": 0.0001477991654327635, + "loss": 3.5319, + "step": 4161 + }, + { + "epoch": 5.32256, + "grad_norm": 0.49874651432037354, + "learning_rate": 0.00014775878314712612, + "loss": 3.5956, + "step": 4162 + }, + { + "epoch": 5.32384, + "grad_norm": 0.5208538770675659, + "learning_rate": 0.00014771840086148875, + "loss": 3.5383, + "step": 4163 + }, + { + "epoch": 5.32512, + "grad_norm": 0.48870334029197693, + "learning_rate": 0.00014767801857585138, + "loss": 3.526, + "step": 4164 + }, + { + "epoch": 5.3264, + "grad_norm": 0.47953054308891296, + "learning_rate": 0.000147637636290214, + "loss": 3.5108, + "step": 4165 + }, + { + "epoch": 5.32768, + "grad_norm": 0.5066089034080505, + "learning_rate": 0.00014759725400457667, + "loss": 3.5234, + "step": 4166 + }, + { + "epoch": 5.32896, + "grad_norm": 0.5130050778388977, + "learning_rate": 0.00014755687171893927, + "loss": 3.6001, + "step": 4167 + }, + { + "epoch": 5.33024, + "grad_norm": 0.5042834281921387, + "learning_rate": 0.0001475164894333019, + "loss": 3.6354, + "step": 4168 + }, + { + "epoch": 5.33152, + "grad_norm": 0.5093918442726135, + "learning_rate": 0.00014747610714766456, + "loss": 3.5279, + "step": 4169 + }, + { + "epoch": 5.3328, + "grad_norm": 0.5074060559272766, + "learning_rate": 0.00014743572486202716, + "loss": 3.5477, + "step": 4170 + }, + { + "epoch": 5.33408, + "grad_norm": 0.5181780457496643, + "learning_rate": 0.00014739534257638982, + "loss": 3.5895, + "step": 4171 + }, + { + "epoch": 5.33536, + "grad_norm": 0.4882206618785858, + "learning_rate": 0.00014735496029075245, + "loss": 3.5639, + "step": 4172 + }, + { + "epoch": 5.33664, + "grad_norm": 0.4917638599872589, + "learning_rate": 0.00014731457800511508, + "loss": 3.5782, + "step": 4173 + }, + { + "epoch": 5.33792, + "grad_norm": 0.5091182589530945, + "learning_rate": 0.0001472741957194777, + "loss": 3.565, + "step": 4174 + }, + { + "epoch": 5.3392, + "grad_norm": 0.5003569722175598, + "learning_rate": 0.00014723381343384034, + "loss": 3.5255, + "step": 4175 + }, + { + "epoch": 5.34048, + "grad_norm": 0.5167132019996643, + "learning_rate": 0.00014719343114820297, + "loss": 3.5591, + "step": 4176 + }, + { + "epoch": 5.34176, + "grad_norm": 0.521111011505127, + "learning_rate": 0.00014715304886256562, + "loss": 3.6051, + "step": 4177 + }, + { + "epoch": 5.34304, + "grad_norm": 0.5027885437011719, + "learning_rate": 0.00014711266657692823, + "loss": 3.5387, + "step": 4178 + }, + { + "epoch": 5.34432, + "grad_norm": 0.5094881057739258, + "learning_rate": 0.00014707228429129088, + "loss": 3.4888, + "step": 4179 + }, + { + "epoch": 5.3456, + "grad_norm": 0.5104243755340576, + "learning_rate": 0.00014703190200565351, + "loss": 3.6305, + "step": 4180 + }, + { + "epoch": 5.34688, + "grad_norm": 0.5000970363616943, + "learning_rate": 0.00014699151972001614, + "loss": 3.5628, + "step": 4181 + }, + { + "epoch": 5.34816, + "grad_norm": 0.5333858132362366, + "learning_rate": 0.00014695113743437877, + "loss": 3.553, + "step": 4182 + }, + { + "epoch": 5.3494399999999995, + "grad_norm": 0.4926629364490509, + "learning_rate": 0.0001469107551487414, + "loss": 3.5841, + "step": 4183 + }, + { + "epoch": 5.35072, + "grad_norm": 0.5149261951446533, + "learning_rate": 0.00014687037286310403, + "loss": 3.5844, + "step": 4184 + }, + { + "epoch": 5.352, + "grad_norm": 0.48902156949043274, + "learning_rate": 0.00014682999057746666, + "loss": 3.5553, + "step": 4185 + }, + { + "epoch": 5.35328, + "grad_norm": 0.5008950233459473, + "learning_rate": 0.0001467896082918293, + "loss": 3.5949, + "step": 4186 + }, + { + "epoch": 5.35456, + "grad_norm": 0.5129325985908508, + "learning_rate": 0.00014674922600619192, + "loss": 3.5755, + "step": 4187 + }, + { + "epoch": 5.35584, + "grad_norm": 0.5095540881156921, + "learning_rate": 0.00014670884372055458, + "loss": 3.6056, + "step": 4188 + }, + { + "epoch": 5.35712, + "grad_norm": 0.497380793094635, + "learning_rate": 0.0001466684614349172, + "loss": 3.5998, + "step": 4189 + }, + { + "epoch": 5.3584, + "grad_norm": 0.5010343194007874, + "learning_rate": 0.00014662807914927984, + "loss": 3.5809, + "step": 4190 + }, + { + "epoch": 5.35968, + "grad_norm": 0.498839795589447, + "learning_rate": 0.00014658769686364247, + "loss": 3.6314, + "step": 4191 + }, + { + "epoch": 5.36096, + "grad_norm": 0.5233437418937683, + "learning_rate": 0.0001465473145780051, + "loss": 3.5928, + "step": 4192 + }, + { + "epoch": 5.36224, + "grad_norm": 0.4920739233493805, + "learning_rate": 0.00014650693229236773, + "loss": 3.5803, + "step": 4193 + }, + { + "epoch": 5.36352, + "grad_norm": 0.5078881978988647, + "learning_rate": 0.00014646655000673036, + "loss": 3.6682, + "step": 4194 + }, + { + "epoch": 5.3648, + "grad_norm": 0.48847028613090515, + "learning_rate": 0.000146426167721093, + "loss": 3.5167, + "step": 4195 + }, + { + "epoch": 5.36608, + "grad_norm": 0.49251747131347656, + "learning_rate": 0.00014638578543545562, + "loss": 3.6127, + "step": 4196 + }, + { + "epoch": 5.36736, + "grad_norm": 0.491445928812027, + "learning_rate": 0.00014634540314981828, + "loss": 3.6277, + "step": 4197 + }, + { + "epoch": 5.36864, + "grad_norm": 0.5016657710075378, + "learning_rate": 0.00014630502086418088, + "loss": 3.5437, + "step": 4198 + }, + { + "epoch": 5.3699200000000005, + "grad_norm": 0.4860299825668335, + "learning_rate": 0.00014626463857854354, + "loss": 3.5576, + "step": 4199 + }, + { + "epoch": 5.3712, + "grad_norm": 0.5150024890899658, + "learning_rate": 0.00014622425629290617, + "loss": 3.5746, + "step": 4200 + }, + { + "epoch": 5.37248, + "grad_norm": 0.49160733819007874, + "learning_rate": 0.0001461838740072688, + "loss": 3.5343, + "step": 4201 + }, + { + "epoch": 5.37376, + "grad_norm": 0.5088993906974792, + "learning_rate": 0.00014614349172163143, + "loss": 3.6182, + "step": 4202 + }, + { + "epoch": 5.37504, + "grad_norm": 0.4988862872123718, + "learning_rate": 0.00014610310943599406, + "loss": 3.5938, + "step": 4203 + }, + { + "epoch": 5.37632, + "grad_norm": 0.49565553665161133, + "learning_rate": 0.0001460627271503567, + "loss": 3.5703, + "step": 4204 + }, + { + "epoch": 5.3776, + "grad_norm": 0.5155361294746399, + "learning_rate": 0.00014602234486471935, + "loss": 3.5962, + "step": 4205 + }, + { + "epoch": 5.37888, + "grad_norm": 0.49401092529296875, + "learning_rate": 0.00014598196257908195, + "loss": 3.571, + "step": 4206 + }, + { + "epoch": 5.38016, + "grad_norm": 0.4905524253845215, + "learning_rate": 0.0001459415802934446, + "loss": 3.5594, + "step": 4207 + }, + { + "epoch": 5.38144, + "grad_norm": 0.5025436878204346, + "learning_rate": 0.00014590119800780724, + "loss": 3.5889, + "step": 4208 + }, + { + "epoch": 5.38272, + "grad_norm": 0.5062000155448914, + "learning_rate": 0.00014586081572216987, + "loss": 3.5641, + "step": 4209 + }, + { + "epoch": 5.384, + "grad_norm": 0.4925670027732849, + "learning_rate": 0.0001458204334365325, + "loss": 3.5346, + "step": 4210 + }, + { + "epoch": 5.38528, + "grad_norm": 0.5170202255249023, + "learning_rate": 0.00014578005115089513, + "loss": 3.5173, + "step": 4211 + }, + { + "epoch": 5.38656, + "grad_norm": 0.4998747706413269, + "learning_rate": 0.00014573966886525776, + "loss": 3.4946, + "step": 4212 + }, + { + "epoch": 5.38784, + "grad_norm": 0.5495707392692566, + "learning_rate": 0.0001456992865796204, + "loss": 3.6223, + "step": 4213 + }, + { + "epoch": 5.38912, + "grad_norm": 0.5236058831214905, + "learning_rate": 0.00014565890429398302, + "loss": 3.5723, + "step": 4214 + }, + { + "epoch": 5.3904, + "grad_norm": 0.49637115001678467, + "learning_rate": 0.00014561852200834565, + "loss": 3.5521, + "step": 4215 + }, + { + "epoch": 5.39168, + "grad_norm": 0.5067302584648132, + "learning_rate": 0.0001455781397227083, + "loss": 3.6365, + "step": 4216 + }, + { + "epoch": 5.39296, + "grad_norm": 0.5126550197601318, + "learning_rate": 0.00014553775743707094, + "loss": 3.5732, + "step": 4217 + }, + { + "epoch": 5.39424, + "grad_norm": 0.4854673743247986, + "learning_rate": 0.00014549737515143357, + "loss": 3.5872, + "step": 4218 + }, + { + "epoch": 5.39552, + "grad_norm": 0.5130731463432312, + "learning_rate": 0.0001454569928657962, + "loss": 3.5871, + "step": 4219 + }, + { + "epoch": 5.3968, + "grad_norm": 0.5053207278251648, + "learning_rate": 0.00014541661058015883, + "loss": 3.5029, + "step": 4220 + }, + { + "epoch": 5.39808, + "grad_norm": 0.5023342370986938, + "learning_rate": 0.00014537622829452146, + "loss": 3.633, + "step": 4221 + }, + { + "epoch": 5.39936, + "grad_norm": 0.4882518947124481, + "learning_rate": 0.00014533584600888409, + "loss": 3.5118, + "step": 4222 + }, + { + "epoch": 5.40064, + "grad_norm": 0.4898047149181366, + "learning_rate": 0.00014529546372324672, + "loss": 3.5909, + "step": 4223 + }, + { + "epoch": 5.40192, + "grad_norm": 0.4916389286518097, + "learning_rate": 0.00014525508143760935, + "loss": 3.5209, + "step": 4224 + }, + { + "epoch": 5.4032, + "grad_norm": 0.4937368631362915, + "learning_rate": 0.000145214699151972, + "loss": 3.5068, + "step": 4225 + }, + { + "epoch": 5.40448, + "grad_norm": 0.49129781126976013, + "learning_rate": 0.0001451743168663346, + "loss": 3.6266, + "step": 4226 + }, + { + "epoch": 5.40576, + "grad_norm": 0.5058757066726685, + "learning_rate": 0.00014513393458069726, + "loss": 3.5565, + "step": 4227 + }, + { + "epoch": 5.40704, + "grad_norm": 0.5102645754814148, + "learning_rate": 0.0001450935522950599, + "loss": 3.6193, + "step": 4228 + }, + { + "epoch": 5.40832, + "grad_norm": 0.5095773935317993, + "learning_rate": 0.00014505317000942252, + "loss": 3.5692, + "step": 4229 + }, + { + "epoch": 5.4096, + "grad_norm": 0.5054805278778076, + "learning_rate": 0.00014501278772378515, + "loss": 3.5361, + "step": 4230 + }, + { + "epoch": 5.41088, + "grad_norm": 0.4920346438884735, + "learning_rate": 0.00014497240543814778, + "loss": 3.5224, + "step": 4231 + }, + { + "epoch": 5.41216, + "grad_norm": 0.4970761239528656, + "learning_rate": 0.00014493202315251041, + "loss": 3.5793, + "step": 4232 + }, + { + "epoch": 5.41344, + "grad_norm": 0.502128005027771, + "learning_rate": 0.00014489164086687307, + "loss": 3.5549, + "step": 4233 + }, + { + "epoch": 5.41472, + "grad_norm": 0.4969402551651001, + "learning_rate": 0.00014485125858123567, + "loss": 3.5693, + "step": 4234 + }, + { + "epoch": 5.416, + "grad_norm": 0.4911673665046692, + "learning_rate": 0.0001448108762955983, + "loss": 3.5352, + "step": 4235 + }, + { + "epoch": 5.41728, + "grad_norm": 0.5134766101837158, + "learning_rate": 0.00014477049400996096, + "loss": 3.5124, + "step": 4236 + }, + { + "epoch": 5.41856, + "grad_norm": 0.5096125602722168, + "learning_rate": 0.00014473011172432356, + "loss": 3.5739, + "step": 4237 + }, + { + "epoch": 5.41984, + "grad_norm": 0.5185458064079285, + "learning_rate": 0.00014468972943868622, + "loss": 3.5937, + "step": 4238 + }, + { + "epoch": 5.42112, + "grad_norm": 0.5176596641540527, + "learning_rate": 0.00014464934715304885, + "loss": 3.5228, + "step": 4239 + }, + { + "epoch": 5.4224, + "grad_norm": 0.5246290564537048, + "learning_rate": 0.00014460896486741148, + "loss": 3.5686, + "step": 4240 + }, + { + "epoch": 5.42368, + "grad_norm": 0.5069420337677002, + "learning_rate": 0.0001445685825817741, + "loss": 3.5579, + "step": 4241 + }, + { + "epoch": 5.4249600000000004, + "grad_norm": 0.5116468667984009, + "learning_rate": 0.00014452820029613674, + "loss": 3.5462, + "step": 4242 + }, + { + "epoch": 5.42624, + "grad_norm": 0.5116104483604431, + "learning_rate": 0.00014448781801049937, + "loss": 3.587, + "step": 4243 + }, + { + "epoch": 5.42752, + "grad_norm": 0.5079525113105774, + "learning_rate": 0.00014444743572486203, + "loss": 3.5666, + "step": 4244 + }, + { + "epoch": 5.4288, + "grad_norm": 0.49015963077545166, + "learning_rate": 0.00014440705343922466, + "loss": 3.5803, + "step": 4245 + }, + { + "epoch": 5.43008, + "grad_norm": 0.5075419545173645, + "learning_rate": 0.0001443666711535873, + "loss": 3.5527, + "step": 4246 + }, + { + "epoch": 5.43136, + "grad_norm": 0.4922216534614563, + "learning_rate": 0.00014432628886794992, + "loss": 3.5723, + "step": 4247 + }, + { + "epoch": 5.43264, + "grad_norm": 0.5076229572296143, + "learning_rate": 0.00014428590658231255, + "loss": 3.5729, + "step": 4248 + }, + { + "epoch": 5.43392, + "grad_norm": 0.4939456284046173, + "learning_rate": 0.00014424552429667518, + "loss": 3.4065, + "step": 4249 + }, + { + "epoch": 5.4352, + "grad_norm": 0.5079296827316284, + "learning_rate": 0.0001442051420110378, + "loss": 3.6077, + "step": 4250 + }, + { + "epoch": 5.4364799999999995, + "grad_norm": 0.4930500090122223, + "learning_rate": 0.00014416475972540044, + "loss": 3.554, + "step": 4251 + }, + { + "epoch": 5.43776, + "grad_norm": 0.5013241767883301, + "learning_rate": 0.00014412437743976307, + "loss": 3.5551, + "step": 4252 + }, + { + "epoch": 5.43904, + "grad_norm": 0.5022532939910889, + "learning_rate": 0.00014408399515412573, + "loss": 3.5839, + "step": 4253 + }, + { + "epoch": 5.44032, + "grad_norm": 0.5082839727401733, + "learning_rate": 0.00014404361286848833, + "loss": 3.6063, + "step": 4254 + }, + { + "epoch": 5.4416, + "grad_norm": 0.5208956599235535, + "learning_rate": 0.000144003230582851, + "loss": 3.4991, + "step": 4255 + }, + { + "epoch": 5.44288, + "grad_norm": 0.5048259496688843, + "learning_rate": 0.00014396284829721362, + "loss": 3.5349, + "step": 4256 + }, + { + "epoch": 5.44416, + "grad_norm": 0.49307942390441895, + "learning_rate": 0.00014392246601157625, + "loss": 3.5114, + "step": 4257 + }, + { + "epoch": 5.44544, + "grad_norm": 0.5296189188957214, + "learning_rate": 0.00014388208372593888, + "loss": 3.5802, + "step": 4258 + }, + { + "epoch": 5.44672, + "grad_norm": 0.5138427019119263, + "learning_rate": 0.0001438417014403015, + "loss": 3.5841, + "step": 4259 + }, + { + "epoch": 5.448, + "grad_norm": 0.4858483672142029, + "learning_rate": 0.00014380131915466414, + "loss": 3.5527, + "step": 4260 + }, + { + "epoch": 5.44928, + "grad_norm": 0.5035324692726135, + "learning_rate": 0.00014376093686902677, + "loss": 3.4928, + "step": 4261 + }, + { + "epoch": 5.45056, + "grad_norm": 0.49650654196739197, + "learning_rate": 0.0001437205545833894, + "loss": 3.631, + "step": 4262 + }, + { + "epoch": 5.45184, + "grad_norm": 0.5007782578468323, + "learning_rate": 0.00014368017229775203, + "loss": 3.5251, + "step": 4263 + }, + { + "epoch": 5.45312, + "grad_norm": 0.4857237637042999, + "learning_rate": 0.00014363979001211468, + "loss": 3.5328, + "step": 4264 + }, + { + "epoch": 5.4544, + "grad_norm": 0.5186595320701599, + "learning_rate": 0.0001435994077264773, + "loss": 3.5839, + "step": 4265 + }, + { + "epoch": 5.45568, + "grad_norm": 0.48634448647499084, + "learning_rate": 0.00014355902544083994, + "loss": 3.5082, + "step": 4266 + }, + { + "epoch": 5.45696, + "grad_norm": 0.5036529302597046, + "learning_rate": 0.00014351864315520257, + "loss": 3.6215, + "step": 4267 + }, + { + "epoch": 5.45824, + "grad_norm": 0.49025043845176697, + "learning_rate": 0.0001434782608695652, + "loss": 3.5371, + "step": 4268 + }, + { + "epoch": 5.45952, + "grad_norm": 0.5002211928367615, + "learning_rate": 0.00014343787858392783, + "loss": 3.5434, + "step": 4269 + }, + { + "epoch": 5.4608, + "grad_norm": 0.48219621181488037, + "learning_rate": 0.00014339749629829046, + "loss": 3.567, + "step": 4270 + }, + { + "epoch": 5.46208, + "grad_norm": 0.4965188801288605, + "learning_rate": 0.0001433571140126531, + "loss": 3.5367, + "step": 4271 + }, + { + "epoch": 5.46336, + "grad_norm": 0.4985227584838867, + "learning_rate": 0.00014331673172701575, + "loss": 3.4608, + "step": 4272 + }, + { + "epoch": 5.46464, + "grad_norm": 0.4829920828342438, + "learning_rate": 0.00014327634944137836, + "loss": 3.5967, + "step": 4273 + }, + { + "epoch": 5.46592, + "grad_norm": 0.49400630593299866, + "learning_rate": 0.000143235967155741, + "loss": 3.5266, + "step": 4274 + }, + { + "epoch": 5.4672, + "grad_norm": 0.5178740620613098, + "learning_rate": 0.00014319558487010364, + "loss": 3.5871, + "step": 4275 + }, + { + "epoch": 5.46848, + "grad_norm": 0.49614524841308594, + "learning_rate": 0.00014315520258446627, + "loss": 3.6358, + "step": 4276 + }, + { + "epoch": 5.46976, + "grad_norm": 0.48628175258636475, + "learning_rate": 0.0001431148202988289, + "loss": 3.4888, + "step": 4277 + }, + { + "epoch": 5.47104, + "grad_norm": 0.5142548680305481, + "learning_rate": 0.00014307443801319153, + "loss": 3.5692, + "step": 4278 + }, + { + "epoch": 5.47232, + "grad_norm": 0.5002409219741821, + "learning_rate": 0.00014303405572755416, + "loss": 3.6088, + "step": 4279 + }, + { + "epoch": 5.4736, + "grad_norm": 0.5171060562133789, + "learning_rate": 0.0001429936734419168, + "loss": 3.5909, + "step": 4280 + }, + { + "epoch": 5.47488, + "grad_norm": 0.5077352523803711, + "learning_rate": 0.00014295329115627945, + "loss": 3.655, + "step": 4281 + }, + { + "epoch": 5.47616, + "grad_norm": 0.4823010563850403, + "learning_rate": 0.00014291290887064205, + "loss": 3.5184, + "step": 4282 + }, + { + "epoch": 5.47744, + "grad_norm": 0.5208946466445923, + "learning_rate": 0.0001428725265850047, + "loss": 3.6297, + "step": 4283 + }, + { + "epoch": 5.47872, + "grad_norm": 0.5040287375450134, + "learning_rate": 0.00014283214429936734, + "loss": 3.5483, + "step": 4284 + }, + { + "epoch": 5.48, + "grad_norm": 0.5379658341407776, + "learning_rate": 0.00014279176201372997, + "loss": 3.5732, + "step": 4285 + }, + { + "epoch": 5.48128, + "grad_norm": 0.4982692003250122, + "learning_rate": 0.0001427513797280926, + "loss": 3.5691, + "step": 4286 + }, + { + "epoch": 5.48256, + "grad_norm": 0.5001688003540039, + "learning_rate": 0.00014271099744245523, + "loss": 3.6635, + "step": 4287 + }, + { + "epoch": 5.48384, + "grad_norm": 0.4979035258293152, + "learning_rate": 0.00014267061515681786, + "loss": 3.582, + "step": 4288 + }, + { + "epoch": 5.48512, + "grad_norm": 0.49051418900489807, + "learning_rate": 0.0001426302328711805, + "loss": 3.5999, + "step": 4289 + }, + { + "epoch": 5.4864, + "grad_norm": 0.4869391918182373, + "learning_rate": 0.00014258985058554312, + "loss": 3.5503, + "step": 4290 + }, + { + "epoch": 5.48768, + "grad_norm": 0.5105782747268677, + "learning_rate": 0.00014254946829990575, + "loss": 3.6143, + "step": 4291 + }, + { + "epoch": 5.48896, + "grad_norm": 0.4911419451236725, + "learning_rate": 0.0001425090860142684, + "loss": 3.5467, + "step": 4292 + }, + { + "epoch": 5.49024, + "grad_norm": 0.4994243085384369, + "learning_rate": 0.000142468703728631, + "loss": 3.6287, + "step": 4293 + }, + { + "epoch": 5.49152, + "grad_norm": 0.4881093502044678, + "learning_rate": 0.00014242832144299367, + "loss": 3.5324, + "step": 4294 + }, + { + "epoch": 5.4928, + "grad_norm": 0.4933335781097412, + "learning_rate": 0.0001423879391573563, + "loss": 3.5793, + "step": 4295 + }, + { + "epoch": 5.49408, + "grad_norm": 0.4933607876300812, + "learning_rate": 0.00014234755687171893, + "loss": 3.5533, + "step": 4296 + }, + { + "epoch": 5.49536, + "grad_norm": 0.4941942095756531, + "learning_rate": 0.00014230717458608156, + "loss": 3.5883, + "step": 4297 + }, + { + "epoch": 5.49664, + "grad_norm": 0.5198619961738586, + "learning_rate": 0.0001422667923004442, + "loss": 3.667, + "step": 4298 + }, + { + "epoch": 5.49792, + "grad_norm": 0.4889330267906189, + "learning_rate": 0.00014222641001480682, + "loss": 3.5658, + "step": 4299 + }, + { + "epoch": 5.4992, + "grad_norm": 0.48626118898391724, + "learning_rate": 0.00014218602772916948, + "loss": 3.5525, + "step": 4300 + }, + { + "epoch": 5.50048, + "grad_norm": 0.4914554953575134, + "learning_rate": 0.00014214564544353208, + "loss": 3.5968, + "step": 4301 + }, + { + "epoch": 5.50176, + "grad_norm": 0.49524810910224915, + "learning_rate": 0.0001421052631578947, + "loss": 3.5555, + "step": 4302 + }, + { + "epoch": 5.50304, + "grad_norm": 0.5117614269256592, + "learning_rate": 0.00014206488087225737, + "loss": 3.6015, + "step": 4303 + }, + { + "epoch": 5.50432, + "grad_norm": 0.4925834834575653, + "learning_rate": 0.00014202449858662, + "loss": 3.5378, + "step": 4304 + }, + { + "epoch": 5.5056, + "grad_norm": 0.494954913854599, + "learning_rate": 0.00014198411630098263, + "loss": 3.5463, + "step": 4305 + }, + { + "epoch": 5.50688, + "grad_norm": 0.5056374073028564, + "learning_rate": 0.00014194373401534526, + "loss": 3.6499, + "step": 4306 + }, + { + "epoch": 5.50816, + "grad_norm": 0.495236873626709, + "learning_rate": 0.00014190335172970789, + "loss": 3.5903, + "step": 4307 + }, + { + "epoch": 5.50944, + "grad_norm": 0.48822644352912903, + "learning_rate": 0.00014186296944407052, + "loss": 3.5119, + "step": 4308 + }, + { + "epoch": 5.51072, + "grad_norm": 0.5125917792320251, + "learning_rate": 0.00014182258715843315, + "loss": 3.5481, + "step": 4309 + }, + { + "epoch": 5.5120000000000005, + "grad_norm": 0.5041484832763672, + "learning_rate": 0.00014178220487279578, + "loss": 3.542, + "step": 4310 + }, + { + "epoch": 5.51328, + "grad_norm": 0.48397096991539, + "learning_rate": 0.00014174182258715843, + "loss": 3.5553, + "step": 4311 + }, + { + "epoch": 5.51456, + "grad_norm": 0.49939045310020447, + "learning_rate": 0.00014170144030152106, + "loss": 3.6071, + "step": 4312 + }, + { + "epoch": 5.51584, + "grad_norm": 0.4981886148452759, + "learning_rate": 0.0001416610580158837, + "loss": 3.6154, + "step": 4313 + }, + { + "epoch": 5.51712, + "grad_norm": 0.4908769130706787, + "learning_rate": 0.00014162067573024632, + "loss": 3.5811, + "step": 4314 + }, + { + "epoch": 5.5184, + "grad_norm": 0.4873631000518799, + "learning_rate": 0.00014158029344460895, + "loss": 3.5451, + "step": 4315 + }, + { + "epoch": 5.51968, + "grad_norm": 0.5031121969223022, + "learning_rate": 0.00014153991115897158, + "loss": 3.4982, + "step": 4316 + }, + { + "epoch": 5.52096, + "grad_norm": 0.5029070973396301, + "learning_rate": 0.00014149952887333421, + "loss": 3.5159, + "step": 4317 + }, + { + "epoch": 5.52224, + "grad_norm": 0.5104166865348816, + "learning_rate": 0.00014145914658769684, + "loss": 3.5713, + "step": 4318 + }, + { + "epoch": 5.5235199999999995, + "grad_norm": 0.512605607509613, + "learning_rate": 0.00014141876430205947, + "loss": 3.5637, + "step": 4319 + }, + { + "epoch": 5.5248, + "grad_norm": 0.5072982311248779, + "learning_rate": 0.00014137838201642213, + "loss": 3.5527, + "step": 4320 + }, + { + "epoch": 5.52608, + "grad_norm": 0.5111182332038879, + "learning_rate": 0.00014133799973078473, + "loss": 3.6156, + "step": 4321 + }, + { + "epoch": 5.52736, + "grad_norm": 0.5078898668289185, + "learning_rate": 0.0001412976174451474, + "loss": 3.5136, + "step": 4322 + }, + { + "epoch": 5.52864, + "grad_norm": 0.49358004331588745, + "learning_rate": 0.00014125723515951002, + "loss": 3.481, + "step": 4323 + }, + { + "epoch": 5.52992, + "grad_norm": 0.4940800070762634, + "learning_rate": 0.00014121685287387265, + "loss": 3.5878, + "step": 4324 + }, + { + "epoch": 5.5312, + "grad_norm": 0.4859899580478668, + "learning_rate": 0.00014117647058823528, + "loss": 3.5579, + "step": 4325 + }, + { + "epoch": 5.53248, + "grad_norm": 0.5090684294700623, + "learning_rate": 0.0001411360883025979, + "loss": 3.5731, + "step": 4326 + }, + { + "epoch": 5.53376, + "grad_norm": 0.5048773884773254, + "learning_rate": 0.00014109570601696054, + "loss": 3.6074, + "step": 4327 + }, + { + "epoch": 5.53504, + "grad_norm": 0.5031253099441528, + "learning_rate": 0.0001410553237313232, + "loss": 3.554, + "step": 4328 + }, + { + "epoch": 5.53632, + "grad_norm": 0.48305338621139526, + "learning_rate": 0.0001410149414456858, + "loss": 3.5785, + "step": 4329 + }, + { + "epoch": 5.5376, + "grad_norm": 0.5137234330177307, + "learning_rate": 0.00014097455916004843, + "loss": 3.5755, + "step": 4330 + }, + { + "epoch": 5.53888, + "grad_norm": 0.4992203414440155, + "learning_rate": 0.0001409341768744111, + "loss": 3.5376, + "step": 4331 + }, + { + "epoch": 5.54016, + "grad_norm": 0.5001314878463745, + "learning_rate": 0.00014089379458877372, + "loss": 3.6757, + "step": 4332 + }, + { + "epoch": 5.54144, + "grad_norm": 0.4925899803638458, + "learning_rate": 0.00014085341230313635, + "loss": 3.5308, + "step": 4333 + }, + { + "epoch": 5.54272, + "grad_norm": 0.5240253210067749, + "learning_rate": 0.00014081303001749898, + "loss": 3.539, + "step": 4334 + }, + { + "epoch": 5.5440000000000005, + "grad_norm": 0.5012069940567017, + "learning_rate": 0.0001407726477318616, + "loss": 3.5401, + "step": 4335 + }, + { + "epoch": 5.54528, + "grad_norm": 0.4972357451915741, + "learning_rate": 0.00014073226544622424, + "loss": 3.5214, + "step": 4336 + }, + { + "epoch": 5.54656, + "grad_norm": 0.5054651498794556, + "learning_rate": 0.00014069188316058687, + "loss": 3.4729, + "step": 4337 + }, + { + "epoch": 5.54784, + "grad_norm": 0.4873622953891754, + "learning_rate": 0.0001406515008749495, + "loss": 3.5886, + "step": 4338 + }, + { + "epoch": 5.54912, + "grad_norm": 0.5032194256782532, + "learning_rate": 0.00014061111858931216, + "loss": 3.4718, + "step": 4339 + }, + { + "epoch": 5.5504, + "grad_norm": 0.5260450839996338, + "learning_rate": 0.0001405707363036748, + "loss": 3.5285, + "step": 4340 + }, + { + "epoch": 5.55168, + "grad_norm": 0.4904164671897888, + "learning_rate": 0.00014053035401803742, + "loss": 3.5785, + "step": 4341 + }, + { + "epoch": 5.55296, + "grad_norm": 0.49703943729400635, + "learning_rate": 0.00014048997173240005, + "loss": 3.5338, + "step": 4342 + }, + { + "epoch": 5.55424, + "grad_norm": 0.50665283203125, + "learning_rate": 0.00014044958944676268, + "loss": 3.5355, + "step": 4343 + }, + { + "epoch": 5.55552, + "grad_norm": 0.5014834403991699, + "learning_rate": 0.0001404092071611253, + "loss": 3.5493, + "step": 4344 + }, + { + "epoch": 5.5568, + "grad_norm": 0.5009499788284302, + "learning_rate": 0.00014036882487548794, + "loss": 3.6308, + "step": 4345 + }, + { + "epoch": 5.55808, + "grad_norm": 0.4850377142429352, + "learning_rate": 0.00014032844258985057, + "loss": 3.6086, + "step": 4346 + }, + { + "epoch": 5.55936, + "grad_norm": 0.5173080563545227, + "learning_rate": 0.0001402880603042132, + "loss": 3.5599, + "step": 4347 + }, + { + "epoch": 5.56064, + "grad_norm": 0.5030738711357117, + "learning_rate": 0.00014024767801857585, + "loss": 3.618, + "step": 4348 + }, + { + "epoch": 5.56192, + "grad_norm": 0.5130670666694641, + "learning_rate": 0.00014020729573293846, + "loss": 3.5968, + "step": 4349 + }, + { + "epoch": 5.5632, + "grad_norm": 0.5004739761352539, + "learning_rate": 0.00014016691344730111, + "loss": 3.5825, + "step": 4350 + }, + { + "epoch": 5.56448, + "grad_norm": 0.5110647082328796, + "learning_rate": 0.00014012653116166375, + "loss": 3.5971, + "step": 4351 + }, + { + "epoch": 5.56576, + "grad_norm": 0.4908808767795563, + "learning_rate": 0.00014008614887602638, + "loss": 3.5662, + "step": 4352 + }, + { + "epoch": 5.56704, + "grad_norm": 0.49879395961761475, + "learning_rate": 0.000140045766590389, + "loss": 3.5705, + "step": 4353 + }, + { + "epoch": 5.56832, + "grad_norm": 0.5124648809432983, + "learning_rate": 0.00014000538430475164, + "loss": 3.618, + "step": 4354 + }, + { + "epoch": 5.5696, + "grad_norm": 0.48847079277038574, + "learning_rate": 0.00013996500201911427, + "loss": 3.5621, + "step": 4355 + }, + { + "epoch": 5.57088, + "grad_norm": 0.5030008554458618, + "learning_rate": 0.0001399246197334769, + "loss": 3.4727, + "step": 4356 + }, + { + "epoch": 5.57216, + "grad_norm": 0.5004321932792664, + "learning_rate": 0.00013988423744783953, + "loss": 3.5718, + "step": 4357 + }, + { + "epoch": 5.57344, + "grad_norm": 0.49990010261535645, + "learning_rate": 0.00013984385516220216, + "loss": 3.5607, + "step": 4358 + }, + { + "epoch": 5.57472, + "grad_norm": 0.4781895577907562, + "learning_rate": 0.0001398034728765648, + "loss": 3.5735, + "step": 4359 + }, + { + "epoch": 5.576, + "grad_norm": 0.4892998933792114, + "learning_rate": 0.00013976309059092742, + "loss": 3.6314, + "step": 4360 + }, + { + "epoch": 5.57728, + "grad_norm": 0.5173603296279907, + "learning_rate": 0.00013972270830529007, + "loss": 3.5335, + "step": 4361 + }, + { + "epoch": 5.5785599999999995, + "grad_norm": 0.4709049463272095, + "learning_rate": 0.0001396823260196527, + "loss": 3.5218, + "step": 4362 + }, + { + "epoch": 5.57984, + "grad_norm": 0.5281604528427124, + "learning_rate": 0.00013964194373401533, + "loss": 3.589, + "step": 4363 + }, + { + "epoch": 5.58112, + "grad_norm": 0.49531951546669006, + "learning_rate": 0.00013960156144837796, + "loss": 3.5535, + "step": 4364 + }, + { + "epoch": 5.5824, + "grad_norm": 0.5021586418151855, + "learning_rate": 0.0001395611791627406, + "loss": 3.461, + "step": 4365 + }, + { + "epoch": 5.58368, + "grad_norm": 0.5123895406723022, + "learning_rate": 0.00013952079687710322, + "loss": 3.4411, + "step": 4366 + }, + { + "epoch": 5.58496, + "grad_norm": 0.515090823173523, + "learning_rate": 0.00013948041459146588, + "loss": 3.6073, + "step": 4367 + }, + { + "epoch": 5.58624, + "grad_norm": 0.5333694815635681, + "learning_rate": 0.0001394400323058285, + "loss": 3.6164, + "step": 4368 + }, + { + "epoch": 5.58752, + "grad_norm": 0.48728322982788086, + "learning_rate": 0.0001393996500201911, + "loss": 3.529, + "step": 4369 + }, + { + "epoch": 5.5888, + "grad_norm": 0.5210532546043396, + "learning_rate": 0.00013935926773455377, + "loss": 3.455, + "step": 4370 + }, + { + "epoch": 5.59008, + "grad_norm": 0.5110505223274231, + "learning_rate": 0.0001393188854489164, + "loss": 3.5412, + "step": 4371 + }, + { + "epoch": 5.59136, + "grad_norm": 0.4856546223163605, + "learning_rate": 0.00013927850316327903, + "loss": 3.5445, + "step": 4372 + }, + { + "epoch": 5.59264, + "grad_norm": 0.5165418386459351, + "learning_rate": 0.00013923812087764166, + "loss": 3.5215, + "step": 4373 + }, + { + "epoch": 5.59392, + "grad_norm": 0.524249255657196, + "learning_rate": 0.0001391977385920043, + "loss": 3.579, + "step": 4374 + }, + { + "epoch": 5.5952, + "grad_norm": 0.5186324715614319, + "learning_rate": 0.00013915735630636692, + "loss": 3.5448, + "step": 4375 + }, + { + "epoch": 5.59648, + "grad_norm": 0.4949183166027069, + "learning_rate": 0.00013911697402072958, + "loss": 3.5652, + "step": 4376 + }, + { + "epoch": 5.59776, + "grad_norm": 0.5009962916374207, + "learning_rate": 0.00013907659173509218, + "loss": 3.6081, + "step": 4377 + }, + { + "epoch": 5.5990400000000005, + "grad_norm": 0.4920154809951782, + "learning_rate": 0.00013903620944945484, + "loss": 3.5249, + "step": 4378 + }, + { + "epoch": 5.60032, + "grad_norm": 0.5058812499046326, + "learning_rate": 0.00013899582716381747, + "loss": 3.612, + "step": 4379 + }, + { + "epoch": 5.6016, + "grad_norm": 0.5027937889099121, + "learning_rate": 0.0001389554448781801, + "loss": 3.5013, + "step": 4380 + }, + { + "epoch": 5.60288, + "grad_norm": 0.5067436099052429, + "learning_rate": 0.00013891506259254273, + "loss": 3.5146, + "step": 4381 + }, + { + "epoch": 5.60416, + "grad_norm": 0.49127253890037537, + "learning_rate": 0.00013887468030690536, + "loss": 3.6259, + "step": 4382 + }, + { + "epoch": 5.60544, + "grad_norm": 0.5145688056945801, + "learning_rate": 0.000138834298021268, + "loss": 3.6815, + "step": 4383 + }, + { + "epoch": 5.60672, + "grad_norm": 0.5067407488822937, + "learning_rate": 0.00013879391573563062, + "loss": 3.6327, + "step": 4384 + }, + { + "epoch": 5.608, + "grad_norm": 0.5062004923820496, + "learning_rate": 0.00013875353344999325, + "loss": 3.5035, + "step": 4385 + }, + { + "epoch": 5.60928, + "grad_norm": 0.483181893825531, + "learning_rate": 0.00013871315116435588, + "loss": 3.5152, + "step": 4386 + }, + { + "epoch": 5.6105599999999995, + "grad_norm": 0.5145739912986755, + "learning_rate": 0.00013867276887871854, + "loss": 3.572, + "step": 4387 + }, + { + "epoch": 5.61184, + "grad_norm": 0.4981113374233246, + "learning_rate": 0.00013863238659308114, + "loss": 3.5428, + "step": 4388 + }, + { + "epoch": 5.61312, + "grad_norm": 0.4911549389362335, + "learning_rate": 0.0001385920043074438, + "loss": 3.5261, + "step": 4389 + }, + { + "epoch": 5.6144, + "grad_norm": 0.5160556435585022, + "learning_rate": 0.00013855162202180643, + "loss": 3.5217, + "step": 4390 + }, + { + "epoch": 5.61568, + "grad_norm": 0.4965737760066986, + "learning_rate": 0.00013851123973616906, + "loss": 3.5228, + "step": 4391 + }, + { + "epoch": 5.61696, + "grad_norm": 0.49209117889404297, + "learning_rate": 0.0001384708574505317, + "loss": 3.5664, + "step": 4392 + }, + { + "epoch": 5.61824, + "grad_norm": 0.4959764778614044, + "learning_rate": 0.00013843047516489432, + "loss": 3.6293, + "step": 4393 + }, + { + "epoch": 5.61952, + "grad_norm": 0.5043753981590271, + "learning_rate": 0.00013839009287925695, + "loss": 3.4969, + "step": 4394 + }, + { + "epoch": 5.6208, + "grad_norm": 0.5141623020172119, + "learning_rate": 0.0001383497105936196, + "loss": 3.5033, + "step": 4395 + }, + { + "epoch": 5.62208, + "grad_norm": 0.4955049753189087, + "learning_rate": 0.0001383093283079822, + "loss": 3.5877, + "step": 4396 + }, + { + "epoch": 5.62336, + "grad_norm": 0.4988468289375305, + "learning_rate": 0.00013826894602234484, + "loss": 3.5641, + "step": 4397 + }, + { + "epoch": 5.62464, + "grad_norm": 0.49312546849250793, + "learning_rate": 0.0001382285637367075, + "loss": 3.5163, + "step": 4398 + }, + { + "epoch": 5.62592, + "grad_norm": 0.49335968494415283, + "learning_rate": 0.00013818818145107012, + "loss": 3.5558, + "step": 4399 + }, + { + "epoch": 5.6272, + "grad_norm": 0.49063315987586975, + "learning_rate": 0.00013814779916543275, + "loss": 3.6151, + "step": 4400 + }, + { + "epoch": 5.62848, + "grad_norm": 0.48111316561698914, + "learning_rate": 0.00013810741687979538, + "loss": 3.5394, + "step": 4401 + }, + { + "epoch": 5.62976, + "grad_norm": 0.4941888153553009, + "learning_rate": 0.00013806703459415801, + "loss": 3.5944, + "step": 4402 + }, + { + "epoch": 5.6310400000000005, + "grad_norm": 0.5081154704093933, + "learning_rate": 0.00013802665230852064, + "loss": 3.546, + "step": 4403 + }, + { + "epoch": 5.63232, + "grad_norm": 0.49409082531929016, + "learning_rate": 0.0001379862700228833, + "loss": 3.6016, + "step": 4404 + }, + { + "epoch": 5.6336, + "grad_norm": 0.4921821355819702, + "learning_rate": 0.0001379458877372459, + "loss": 3.56, + "step": 4405 + }, + { + "epoch": 5.63488, + "grad_norm": 0.5267976522445679, + "learning_rate": 0.00013790550545160856, + "loss": 3.5694, + "step": 4406 + }, + { + "epoch": 5.63616, + "grad_norm": 0.4959132671356201, + "learning_rate": 0.0001378651231659712, + "loss": 3.5944, + "step": 4407 + }, + { + "epoch": 5.63744, + "grad_norm": 0.4983578622341156, + "learning_rate": 0.00013782474088033382, + "loss": 3.514, + "step": 4408 + }, + { + "epoch": 5.63872, + "grad_norm": 0.5071173310279846, + "learning_rate": 0.00013778435859469645, + "loss": 3.5079, + "step": 4409 + }, + { + "epoch": 5.64, + "grad_norm": 0.4979630410671234, + "learning_rate": 0.00013774397630905908, + "loss": 3.5649, + "step": 4410 + }, + { + "epoch": 5.64128, + "grad_norm": 0.48886778950691223, + "learning_rate": 0.0001377035940234217, + "loss": 3.5758, + "step": 4411 + }, + { + "epoch": 5.64256, + "grad_norm": 0.5049862265586853, + "learning_rate": 0.00013766321173778434, + "loss": 3.5894, + "step": 4412 + }, + { + "epoch": 5.64384, + "grad_norm": 0.4826545715332031, + "learning_rate": 0.00013762282945214697, + "loss": 3.6285, + "step": 4413 + }, + { + "epoch": 5.64512, + "grad_norm": 0.5009142160415649, + "learning_rate": 0.0001375824471665096, + "loss": 3.5911, + "step": 4414 + }, + { + "epoch": 5.6464, + "grad_norm": 0.49813807010650635, + "learning_rate": 0.00013754206488087226, + "loss": 3.5388, + "step": 4415 + }, + { + "epoch": 5.64768, + "grad_norm": 0.48648300766944885, + "learning_rate": 0.00013750168259523486, + "loss": 3.5231, + "step": 4416 + }, + { + "epoch": 5.64896, + "grad_norm": 0.5059703588485718, + "learning_rate": 0.00013746130030959752, + "loss": 3.5606, + "step": 4417 + }, + { + "epoch": 5.65024, + "grad_norm": 0.49821940064430237, + "learning_rate": 0.00013742091802396015, + "loss": 3.5415, + "step": 4418 + }, + { + "epoch": 5.65152, + "grad_norm": 0.5076292157173157, + "learning_rate": 0.00013738053573832278, + "loss": 3.5893, + "step": 4419 + }, + { + "epoch": 5.6528, + "grad_norm": 0.5033696293830872, + "learning_rate": 0.0001373401534526854, + "loss": 3.5241, + "step": 4420 + }, + { + "epoch": 5.65408, + "grad_norm": 0.49491533637046814, + "learning_rate": 0.00013729977116704804, + "loss": 3.5782, + "step": 4421 + }, + { + "epoch": 5.65536, + "grad_norm": 0.5117203593254089, + "learning_rate": 0.00013725938888141067, + "loss": 3.5175, + "step": 4422 + }, + { + "epoch": 5.65664, + "grad_norm": 0.5070961117744446, + "learning_rate": 0.0001372190065957733, + "loss": 3.5301, + "step": 4423 + }, + { + "epoch": 5.65792, + "grad_norm": 0.5147043466567993, + "learning_rate": 0.00013717862431013593, + "loss": 3.5733, + "step": 4424 + }, + { + "epoch": 5.6592, + "grad_norm": 0.5103664398193359, + "learning_rate": 0.00013713824202449856, + "loss": 3.6548, + "step": 4425 + }, + { + "epoch": 5.66048, + "grad_norm": 0.4940325915813446, + "learning_rate": 0.00013709785973886122, + "loss": 3.6025, + "step": 4426 + }, + { + "epoch": 5.66176, + "grad_norm": 0.49647340178489685, + "learning_rate": 0.00013705747745322385, + "loss": 3.4952, + "step": 4427 + }, + { + "epoch": 5.66304, + "grad_norm": 0.5108480453491211, + "learning_rate": 0.00013701709516758648, + "loss": 3.6175, + "step": 4428 + }, + { + "epoch": 5.66432, + "grad_norm": 0.5075586438179016, + "learning_rate": 0.0001369767128819491, + "loss": 3.5372, + "step": 4429 + }, + { + "epoch": 5.6655999999999995, + "grad_norm": 0.49585655331611633, + "learning_rate": 0.00013693633059631174, + "loss": 3.5006, + "step": 4430 + }, + { + "epoch": 5.66688, + "grad_norm": 0.5268029570579529, + "learning_rate": 0.00013689594831067437, + "loss": 3.5752, + "step": 4431 + }, + { + "epoch": 5.66816, + "grad_norm": 0.5017437934875488, + "learning_rate": 0.000136855566025037, + "loss": 3.5115, + "step": 4432 + }, + { + "epoch": 5.66944, + "grad_norm": 0.5259951949119568, + "learning_rate": 0.00013681518373939963, + "loss": 3.5269, + "step": 4433 + }, + { + "epoch": 5.67072, + "grad_norm": 0.49649110436439514, + "learning_rate": 0.00013677480145376229, + "loss": 3.5358, + "step": 4434 + }, + { + "epoch": 5.672, + "grad_norm": 0.5151202082633972, + "learning_rate": 0.00013673441916812492, + "loss": 3.5663, + "step": 4435 + }, + { + "epoch": 5.67328, + "grad_norm": 0.5322958827018738, + "learning_rate": 0.00013669403688248752, + "loss": 3.5762, + "step": 4436 + }, + { + "epoch": 5.67456, + "grad_norm": 0.505379319190979, + "learning_rate": 0.00013665365459685018, + "loss": 3.5684, + "step": 4437 + }, + { + "epoch": 5.67584, + "grad_norm": 0.5019539594650269, + "learning_rate": 0.0001366132723112128, + "loss": 3.5522, + "step": 4438 + }, + { + "epoch": 5.67712, + "grad_norm": 0.5176437497138977, + "learning_rate": 0.00013657289002557544, + "loss": 3.5629, + "step": 4439 + }, + { + "epoch": 5.6784, + "grad_norm": 0.4964632987976074, + "learning_rate": 0.00013653250773993807, + "loss": 3.6072, + "step": 4440 + }, + { + "epoch": 5.67968, + "grad_norm": 0.49535518884658813, + "learning_rate": 0.0001364921254543007, + "loss": 3.6126, + "step": 4441 + }, + { + "epoch": 5.68096, + "grad_norm": 0.4958983063697815, + "learning_rate": 0.00013645174316866333, + "loss": 3.5765, + "step": 4442 + }, + { + "epoch": 5.68224, + "grad_norm": 0.4932209253311157, + "learning_rate": 0.00013641136088302598, + "loss": 3.5613, + "step": 4443 + }, + { + "epoch": 5.68352, + "grad_norm": 0.48881053924560547, + "learning_rate": 0.00013637097859738859, + "loss": 3.5207, + "step": 4444 + }, + { + "epoch": 5.6848, + "grad_norm": 0.4943525791168213, + "learning_rate": 0.00013633059631175124, + "loss": 3.616, + "step": 4445 + }, + { + "epoch": 5.6860800000000005, + "grad_norm": 0.504132091999054, + "learning_rate": 0.00013629021402611387, + "loss": 3.5652, + "step": 4446 + }, + { + "epoch": 5.68736, + "grad_norm": 0.5116013288497925, + "learning_rate": 0.0001362498317404765, + "loss": 3.5694, + "step": 4447 + }, + { + "epoch": 5.68864, + "grad_norm": 0.49192342162132263, + "learning_rate": 0.00013620944945483913, + "loss": 3.5206, + "step": 4448 + }, + { + "epoch": 5.68992, + "grad_norm": 0.49895188212394714, + "learning_rate": 0.00013616906716920176, + "loss": 3.5208, + "step": 4449 + }, + { + "epoch": 5.6912, + "grad_norm": 0.5205003619194031, + "learning_rate": 0.0001361286848835644, + "loss": 3.6103, + "step": 4450 + }, + { + "epoch": 5.69248, + "grad_norm": 0.5131163001060486, + "learning_rate": 0.00013608830259792702, + "loss": 3.5963, + "step": 4451 + }, + { + "epoch": 5.69376, + "grad_norm": 0.4999935030937195, + "learning_rate": 0.00013604792031228965, + "loss": 3.5195, + "step": 4452 + }, + { + "epoch": 5.69504, + "grad_norm": 0.5080338716506958, + "learning_rate": 0.00013600753802665228, + "loss": 3.5515, + "step": 4453 + }, + { + "epoch": 5.69632, + "grad_norm": 0.5050440430641174, + "learning_rate": 0.00013596715574101494, + "loss": 3.5676, + "step": 4454 + }, + { + "epoch": 5.6975999999999996, + "grad_norm": 0.4881718158721924, + "learning_rate": 0.00013592677345537757, + "loss": 3.5505, + "step": 4455 + }, + { + "epoch": 5.69888, + "grad_norm": 0.5083419680595398, + "learning_rate": 0.0001358863911697402, + "loss": 3.5002, + "step": 4456 + }, + { + "epoch": 5.70016, + "grad_norm": 0.5217411518096924, + "learning_rate": 0.00013584600888410283, + "loss": 3.5094, + "step": 4457 + }, + { + "epoch": 5.70144, + "grad_norm": 0.4996829628944397, + "learning_rate": 0.00013580562659846546, + "loss": 3.6286, + "step": 4458 + }, + { + "epoch": 5.70272, + "grad_norm": 0.5101032853126526, + "learning_rate": 0.0001357652443128281, + "loss": 3.5412, + "step": 4459 + }, + { + "epoch": 5.704, + "grad_norm": 0.5019393563270569, + "learning_rate": 0.00013572486202719072, + "loss": 3.5845, + "step": 4460 + }, + { + "epoch": 5.70528, + "grad_norm": 0.5044521689414978, + "learning_rate": 0.00013568447974155335, + "loss": 3.633, + "step": 4461 + }, + { + "epoch": 5.70656, + "grad_norm": 0.5104800462722778, + "learning_rate": 0.000135644097455916, + "loss": 3.5554, + "step": 4462 + }, + { + "epoch": 5.70784, + "grad_norm": 0.4980311989784241, + "learning_rate": 0.00013560371517027864, + "loss": 3.5151, + "step": 4463 + }, + { + "epoch": 5.70912, + "grad_norm": 0.501975417137146, + "learning_rate": 0.00013556333288464124, + "loss": 3.5999, + "step": 4464 + }, + { + "epoch": 5.7104, + "grad_norm": 0.4794447422027588, + "learning_rate": 0.0001355229505990039, + "loss": 3.5524, + "step": 4465 + }, + { + "epoch": 5.71168, + "grad_norm": 0.5068936944007874, + "learning_rate": 0.00013548256831336653, + "loss": 3.5434, + "step": 4466 + }, + { + "epoch": 5.71296, + "grad_norm": 0.5029814839363098, + "learning_rate": 0.00013544218602772916, + "loss": 3.531, + "step": 4467 + }, + { + "epoch": 5.71424, + "grad_norm": 0.48336413502693176, + "learning_rate": 0.0001354018037420918, + "loss": 3.5022, + "step": 4468 + }, + { + "epoch": 5.71552, + "grad_norm": 0.4998653829097748, + "learning_rate": 0.00013536142145645442, + "loss": 3.6426, + "step": 4469 + }, + { + "epoch": 5.7168, + "grad_norm": 0.5224689841270447, + "learning_rate": 0.00013532103917081705, + "loss": 3.6244, + "step": 4470 + }, + { + "epoch": 5.7180800000000005, + "grad_norm": 0.49238261580467224, + "learning_rate": 0.0001352806568851797, + "loss": 3.5419, + "step": 4471 + }, + { + "epoch": 5.71936, + "grad_norm": 0.5162872672080994, + "learning_rate": 0.0001352402745995423, + "loss": 3.5593, + "step": 4472 + }, + { + "epoch": 5.7206399999999995, + "grad_norm": 0.4984869658946991, + "learning_rate": 0.00013519989231390497, + "loss": 3.5436, + "step": 4473 + }, + { + "epoch": 5.72192, + "grad_norm": 0.49168428778648376, + "learning_rate": 0.0001351595100282676, + "loss": 3.5536, + "step": 4474 + }, + { + "epoch": 5.7232, + "grad_norm": 0.47858181595802307, + "learning_rate": 0.00013511912774263023, + "loss": 3.5043, + "step": 4475 + }, + { + "epoch": 5.72448, + "grad_norm": 0.5016847848892212, + "learning_rate": 0.00013507874545699286, + "loss": 3.6073, + "step": 4476 + }, + { + "epoch": 5.72576, + "grad_norm": 0.5029941201210022, + "learning_rate": 0.0001350383631713555, + "loss": 3.6127, + "step": 4477 + }, + { + "epoch": 5.72704, + "grad_norm": 0.48141196370124817, + "learning_rate": 0.00013499798088571812, + "loss": 3.574, + "step": 4478 + }, + { + "epoch": 5.72832, + "grad_norm": 0.4969176948070526, + "learning_rate": 0.00013495759860008075, + "loss": 3.5789, + "step": 4479 + }, + { + "epoch": 5.7296, + "grad_norm": 0.48834389448165894, + "learning_rate": 0.00013491721631444338, + "loss": 3.5863, + "step": 4480 + }, + { + "epoch": 5.73088, + "grad_norm": 0.49428460001945496, + "learning_rate": 0.000134876834028806, + "loss": 3.5619, + "step": 4481 + }, + { + "epoch": 5.73216, + "grad_norm": 0.49237480759620667, + "learning_rate": 0.00013483645174316866, + "loss": 3.5791, + "step": 4482 + }, + { + "epoch": 5.73344, + "grad_norm": 0.5003536343574524, + "learning_rate": 0.00013479606945753127, + "loss": 3.595, + "step": 4483 + }, + { + "epoch": 5.73472, + "grad_norm": 0.4975607395172119, + "learning_rate": 0.00013475568717189392, + "loss": 3.5885, + "step": 4484 + }, + { + "epoch": 5.736, + "grad_norm": 0.5209734439849854, + "learning_rate": 0.00013471530488625655, + "loss": 3.5795, + "step": 4485 + }, + { + "epoch": 5.73728, + "grad_norm": 0.49605241417884827, + "learning_rate": 0.00013467492260061918, + "loss": 3.5311, + "step": 4486 + }, + { + "epoch": 5.73856, + "grad_norm": 0.4768206775188446, + "learning_rate": 0.00013463454031498181, + "loss": 3.556, + "step": 4487 + }, + { + "epoch": 5.73984, + "grad_norm": 0.4915754199028015, + "learning_rate": 0.00013459415802934444, + "loss": 3.6062, + "step": 4488 + }, + { + "epoch": 5.7411200000000004, + "grad_norm": 0.4907207489013672, + "learning_rate": 0.00013455377574370707, + "loss": 3.5279, + "step": 4489 + }, + { + "epoch": 5.7424, + "grad_norm": 0.4975474178791046, + "learning_rate": 0.0001345133934580697, + "loss": 3.5248, + "step": 4490 + }, + { + "epoch": 5.74368, + "grad_norm": 0.481803834438324, + "learning_rate": 0.00013447301117243236, + "loss": 3.578, + "step": 4491 + }, + { + "epoch": 5.74496, + "grad_norm": 0.48836055397987366, + "learning_rate": 0.00013443262888679497, + "loss": 3.5618, + "step": 4492 + }, + { + "epoch": 5.74624, + "grad_norm": 0.48223766684532166, + "learning_rate": 0.00013439224660115762, + "loss": 3.5529, + "step": 4493 + }, + { + "epoch": 5.74752, + "grad_norm": 0.47637295722961426, + "learning_rate": 0.00013435186431552025, + "loss": 3.5735, + "step": 4494 + }, + { + "epoch": 5.7488, + "grad_norm": 0.48114699125289917, + "learning_rate": 0.00013431148202988288, + "loss": 3.51, + "step": 4495 + }, + { + "epoch": 5.75008, + "grad_norm": 0.49171772599220276, + "learning_rate": 0.0001342710997442455, + "loss": 3.6133, + "step": 4496 + }, + { + "epoch": 5.75136, + "grad_norm": 0.49519824981689453, + "learning_rate": 0.00013423071745860814, + "loss": 3.5715, + "step": 4497 + }, + { + "epoch": 5.7526399999999995, + "grad_norm": 0.4867844879627228, + "learning_rate": 0.00013419033517297077, + "loss": 3.5635, + "step": 4498 + }, + { + "epoch": 5.75392, + "grad_norm": 0.4920177161693573, + "learning_rate": 0.00013414995288733343, + "loss": 3.5152, + "step": 4499 + }, + { + "epoch": 5.7552, + "grad_norm": 0.4885466396808624, + "learning_rate": 0.00013410957060169603, + "loss": 3.5387, + "step": 4500 + }, + { + "epoch": 5.75648, + "grad_norm": 0.49109700322151184, + "learning_rate": 0.0001340691883160587, + "loss": 3.4059, + "step": 4501 + }, + { + "epoch": 5.75776, + "grad_norm": 0.4847142696380615, + "learning_rate": 0.00013402880603042132, + "loss": 3.4758, + "step": 4502 + }, + { + "epoch": 5.75904, + "grad_norm": 0.48357903957366943, + "learning_rate": 0.00013398842374478392, + "loss": 3.4209, + "step": 4503 + }, + { + "epoch": 5.76032, + "grad_norm": 0.4984717071056366, + "learning_rate": 0.00013394804145914658, + "loss": 3.4487, + "step": 4504 + }, + { + "epoch": 5.7616, + "grad_norm": 0.4831765592098236, + "learning_rate": 0.0001339076591735092, + "loss": 3.4414, + "step": 4505 + }, + { + "epoch": 5.76288, + "grad_norm": 0.5067592263221741, + "learning_rate": 0.00013386727688787184, + "loss": 3.4324, + "step": 4506 + }, + { + "epoch": 5.76416, + "grad_norm": 0.496747225522995, + "learning_rate": 0.00013382689460223447, + "loss": 3.5664, + "step": 4507 + }, + { + "epoch": 5.76544, + "grad_norm": 0.4953940808773041, + "learning_rate": 0.0001337865123165971, + "loss": 3.425, + "step": 4508 + }, + { + "epoch": 5.76672, + "grad_norm": 0.5052168369293213, + "learning_rate": 0.00013374613003095973, + "loss": 3.3923, + "step": 4509 + }, + { + "epoch": 5.768, + "grad_norm": 0.49418163299560547, + "learning_rate": 0.0001337057477453224, + "loss": 3.435, + "step": 4510 + }, + { + "epoch": 5.76928, + "grad_norm": 0.5005443096160889, + "learning_rate": 0.000133665365459685, + "loss": 3.4357, + "step": 4511 + }, + { + "epoch": 5.77056, + "grad_norm": 0.4933774471282959, + "learning_rate": 0.00013362498317404765, + "loss": 3.4941, + "step": 4512 + }, + { + "epoch": 5.77184, + "grad_norm": 0.49201011657714844, + "learning_rate": 0.00013358460088841028, + "loss": 3.4382, + "step": 4513 + }, + { + "epoch": 5.7731200000000005, + "grad_norm": 0.4911024570465088, + "learning_rate": 0.0001335442186027729, + "loss": 3.4343, + "step": 4514 + }, + { + "epoch": 5.7744, + "grad_norm": 0.4980873167514801, + "learning_rate": 0.00013350383631713554, + "loss": 3.4233, + "step": 4515 + }, + { + "epoch": 5.77568, + "grad_norm": 0.4919237792491913, + "learning_rate": 0.00013346345403149817, + "loss": 3.5077, + "step": 4516 + }, + { + "epoch": 5.77696, + "grad_norm": 0.5091717839241028, + "learning_rate": 0.0001334230717458608, + "loss": 3.3742, + "step": 4517 + }, + { + "epoch": 5.77824, + "grad_norm": 0.49231696128845215, + "learning_rate": 0.00013338268946022343, + "loss": 3.4706, + "step": 4518 + }, + { + "epoch": 5.77952, + "grad_norm": 0.4917304813861847, + "learning_rate": 0.00013334230717458606, + "loss": 3.4684, + "step": 4519 + }, + { + "epoch": 5.7808, + "grad_norm": 0.5030657649040222, + "learning_rate": 0.0001333019248889487, + "loss": 3.4561, + "step": 4520 + }, + { + "epoch": 5.78208, + "grad_norm": 0.5002293586730957, + "learning_rate": 0.00013326154260331135, + "loss": 3.4811, + "step": 4521 + }, + { + "epoch": 5.78336, + "grad_norm": 0.49681586027145386, + "learning_rate": 0.00013322116031767398, + "loss": 3.4188, + "step": 4522 + }, + { + "epoch": 5.78464, + "grad_norm": 0.4821189045906067, + "learning_rate": 0.0001331807780320366, + "loss": 3.3997, + "step": 4523 + }, + { + "epoch": 5.78592, + "grad_norm": 0.4861787259578705, + "learning_rate": 0.00013314039574639924, + "loss": 3.4388, + "step": 4524 + }, + { + "epoch": 5.7872, + "grad_norm": 0.48813995718955994, + "learning_rate": 0.00013310001346076187, + "loss": 3.4686, + "step": 4525 + }, + { + "epoch": 5.78848, + "grad_norm": 0.4976069927215576, + "learning_rate": 0.0001330596311751245, + "loss": 3.4327, + "step": 4526 + }, + { + "epoch": 5.78976, + "grad_norm": 0.5135701298713684, + "learning_rate": 0.00013301924888948715, + "loss": 3.4294, + "step": 4527 + }, + { + "epoch": 5.79104, + "grad_norm": 0.5241384506225586, + "learning_rate": 0.00013297886660384976, + "loss": 3.5, + "step": 4528 + }, + { + "epoch": 5.79232, + "grad_norm": 0.523514449596405, + "learning_rate": 0.0001329384843182124, + "loss": 3.4332, + "step": 4529 + }, + { + "epoch": 5.7936, + "grad_norm": 0.49370086193084717, + "learning_rate": 0.00013289810203257504, + "loss": 3.4522, + "step": 4530 + }, + { + "epoch": 5.79488, + "grad_norm": 0.5221703052520752, + "learning_rate": 0.00013285771974693765, + "loss": 3.4623, + "step": 4531 + }, + { + "epoch": 5.79616, + "grad_norm": 0.5171593427658081, + "learning_rate": 0.0001328173374613003, + "loss": 3.5817, + "step": 4532 + }, + { + "epoch": 5.79744, + "grad_norm": 0.5053053498268127, + "learning_rate": 0.00013277695517566293, + "loss": 3.4717, + "step": 4533 + }, + { + "epoch": 5.79872, + "grad_norm": 0.5156946182250977, + "learning_rate": 0.00013273657289002556, + "loss": 3.4493, + "step": 4534 + }, + { + "epoch": 5.8, + "grad_norm": 0.5175525546073914, + "learning_rate": 0.0001326961906043882, + "loss": 3.4689, + "step": 4535 + }, + { + "epoch": 5.80128, + "grad_norm": 0.4784032106399536, + "learning_rate": 0.00013265580831875082, + "loss": 3.3858, + "step": 4536 + }, + { + "epoch": 5.80256, + "grad_norm": 0.49603456258773804, + "learning_rate": 0.00013261542603311345, + "loss": 3.4567, + "step": 4537 + }, + { + "epoch": 5.80384, + "grad_norm": 0.512304425239563, + "learning_rate": 0.0001325750437474761, + "loss": 3.4817, + "step": 4538 + }, + { + "epoch": 5.80512, + "grad_norm": 0.5141502618789673, + "learning_rate": 0.00013253466146183871, + "loss": 3.3569, + "step": 4539 + }, + { + "epoch": 5.8064, + "grad_norm": 0.5332363843917847, + "learning_rate": 0.00013249427917620137, + "loss": 3.4232, + "step": 4540 + }, + { + "epoch": 5.8076799999999995, + "grad_norm": 0.5129743814468384, + "learning_rate": 0.000132453896890564, + "loss": 3.5049, + "step": 4541 + }, + { + "epoch": 5.80896, + "grad_norm": 0.5121658444404602, + "learning_rate": 0.00013241351460492663, + "loss": 3.3905, + "step": 4542 + }, + { + "epoch": 5.81024, + "grad_norm": 0.5085654854774475, + "learning_rate": 0.00013237313231928926, + "loss": 3.4257, + "step": 4543 + }, + { + "epoch": 5.81152, + "grad_norm": 0.5045654773712158, + "learning_rate": 0.0001323327500336519, + "loss": 3.4154, + "step": 4544 + }, + { + "epoch": 5.8128, + "grad_norm": 0.49208593368530273, + "learning_rate": 0.00013229236774801452, + "loss": 3.4489, + "step": 4545 + }, + { + "epoch": 5.81408, + "grad_norm": 0.5095329880714417, + "learning_rate": 0.00013225198546237715, + "loss": 3.4906, + "step": 4546 + }, + { + "epoch": 5.81536, + "grad_norm": 0.5090464353561401, + "learning_rate": 0.00013221160317673978, + "loss": 3.4252, + "step": 4547 + }, + { + "epoch": 5.81664, + "grad_norm": 0.5093858242034912, + "learning_rate": 0.0001321712208911024, + "loss": 3.4324, + "step": 4548 + }, + { + "epoch": 5.81792, + "grad_norm": 0.5184779167175293, + "learning_rate": 0.00013213083860546507, + "loss": 3.4721, + "step": 4549 + }, + { + "epoch": 5.8192, + "grad_norm": 0.5006645321846008, + "learning_rate": 0.0001320904563198277, + "loss": 3.4371, + "step": 4550 + }, + { + "epoch": 5.82048, + "grad_norm": 0.5141642689704895, + "learning_rate": 0.00013205007403419033, + "loss": 3.3902, + "step": 4551 + }, + { + "epoch": 5.82176, + "grad_norm": 0.5118807554244995, + "learning_rate": 0.00013200969174855296, + "loss": 3.4246, + "step": 4552 + }, + { + "epoch": 5.82304, + "grad_norm": 0.5281966924667358, + "learning_rate": 0.0001319693094629156, + "loss": 3.5053, + "step": 4553 + }, + { + "epoch": 5.82432, + "grad_norm": 0.5333678126335144, + "learning_rate": 0.00013192892717727822, + "loss": 3.4423, + "step": 4554 + }, + { + "epoch": 5.8256, + "grad_norm": 0.5082951188087463, + "learning_rate": 0.00013188854489164085, + "loss": 3.4716, + "step": 4555 + }, + { + "epoch": 5.82688, + "grad_norm": 0.5443100929260254, + "learning_rate": 0.00013184816260600348, + "loss": 3.4103, + "step": 4556 + }, + { + "epoch": 5.8281600000000005, + "grad_norm": 0.5330334901809692, + "learning_rate": 0.0001318077803203661, + "loss": 3.4818, + "step": 4557 + }, + { + "epoch": 5.82944, + "grad_norm": 0.525781512260437, + "learning_rate": 0.00013176739803472877, + "loss": 3.5291, + "step": 4558 + }, + { + "epoch": 5.83072, + "grad_norm": 0.5201729536056519, + "learning_rate": 0.00013172701574909137, + "loss": 3.5319, + "step": 4559 + }, + { + "epoch": 5.832, + "grad_norm": 0.5168731212615967, + "learning_rate": 0.00013168663346345403, + "loss": 3.3785, + "step": 4560 + }, + { + "epoch": 5.83328, + "grad_norm": 0.5290531516075134, + "learning_rate": 0.00013164625117781666, + "loss": 3.4085, + "step": 4561 + }, + { + "epoch": 5.83456, + "grad_norm": 0.5436719059944153, + "learning_rate": 0.0001316058688921793, + "loss": 3.5384, + "step": 4562 + }, + { + "epoch": 5.83584, + "grad_norm": 0.5178592801094055, + "learning_rate": 0.00013156548660654192, + "loss": 3.3646, + "step": 4563 + }, + { + "epoch": 5.83712, + "grad_norm": 0.5080958604812622, + "learning_rate": 0.00013152510432090455, + "loss": 3.4061, + "step": 4564 + }, + { + "epoch": 5.8384, + "grad_norm": 0.52479487657547, + "learning_rate": 0.00013148472203526718, + "loss": 3.5095, + "step": 4565 + }, + { + "epoch": 5.8396799999999995, + "grad_norm": 0.531376302242279, + "learning_rate": 0.00013144433974962983, + "loss": 3.4566, + "step": 4566 + }, + { + "epoch": 5.84096, + "grad_norm": 0.5431889891624451, + "learning_rate": 0.00013140395746399244, + "loss": 3.508, + "step": 4567 + }, + { + "epoch": 5.84224, + "grad_norm": 0.5298098921775818, + "learning_rate": 0.0001313635751783551, + "loss": 3.4721, + "step": 4568 + }, + { + "epoch": 5.84352, + "grad_norm": 0.5101543664932251, + "learning_rate": 0.00013132319289271772, + "loss": 3.4789, + "step": 4569 + }, + { + "epoch": 5.8448, + "grad_norm": 0.5432095527648926, + "learning_rate": 0.00013128281060708033, + "loss": 3.3721, + "step": 4570 + }, + { + "epoch": 5.84608, + "grad_norm": 0.5078821778297424, + "learning_rate": 0.00013124242832144299, + "loss": 3.4997, + "step": 4571 + }, + { + "epoch": 5.84736, + "grad_norm": 0.5241081118583679, + "learning_rate": 0.00013120204603580562, + "loss": 3.3372, + "step": 4572 + }, + { + "epoch": 5.84864, + "grad_norm": 0.5384151935577393, + "learning_rate": 0.00013116166375016825, + "loss": 3.5231, + "step": 4573 + }, + { + "epoch": 5.84992, + "grad_norm": 0.5285201072692871, + "learning_rate": 0.00013112128146453088, + "loss": 3.4693, + "step": 4574 + }, + { + "epoch": 5.8512, + "grad_norm": 0.5212448239326477, + "learning_rate": 0.0001310808991788935, + "loss": 3.4739, + "step": 4575 + }, + { + "epoch": 5.85248, + "grad_norm": 0.5330734252929688, + "learning_rate": 0.00013104051689325614, + "loss": 3.4567, + "step": 4576 + }, + { + "epoch": 5.85376, + "grad_norm": 0.5170498490333557, + "learning_rate": 0.0001310001346076188, + "loss": 3.4205, + "step": 4577 + }, + { + "epoch": 5.85504, + "grad_norm": 0.5200040340423584, + "learning_rate": 0.00013095975232198142, + "loss": 3.5097, + "step": 4578 + }, + { + "epoch": 5.85632, + "grad_norm": 0.5016375184059143, + "learning_rate": 0.00013091937003634405, + "loss": 3.4698, + "step": 4579 + }, + { + "epoch": 5.8576, + "grad_norm": 0.5077860951423645, + "learning_rate": 0.00013087898775070668, + "loss": 3.4589, + "step": 4580 + }, + { + "epoch": 5.85888, + "grad_norm": 0.4943268597126007, + "learning_rate": 0.0001308386054650693, + "loss": 3.4539, + "step": 4581 + }, + { + "epoch": 5.8601600000000005, + "grad_norm": 0.5271481871604919, + "learning_rate": 0.00013079822317943194, + "loss": 3.4382, + "step": 4582 + }, + { + "epoch": 5.86144, + "grad_norm": 0.5146161913871765, + "learning_rate": 0.00013075784089379457, + "loss": 3.4512, + "step": 4583 + }, + { + "epoch": 5.86272, + "grad_norm": 0.4986138343811035, + "learning_rate": 0.0001307174586081572, + "loss": 3.4464, + "step": 4584 + }, + { + "epoch": 5.864, + "grad_norm": 0.5161074995994568, + "learning_rate": 0.00013067707632251983, + "loss": 3.4749, + "step": 4585 + }, + { + "epoch": 5.86528, + "grad_norm": 0.5155913829803467, + "learning_rate": 0.0001306366940368825, + "loss": 3.3951, + "step": 4586 + }, + { + "epoch": 5.86656, + "grad_norm": 0.511010468006134, + "learning_rate": 0.0001305963117512451, + "loss": 3.5491, + "step": 4587 + }, + { + "epoch": 5.86784, + "grad_norm": 0.5245579481124878, + "learning_rate": 0.00013055592946560775, + "loss": 3.4911, + "step": 4588 + }, + { + "epoch": 5.86912, + "grad_norm": 0.5299880504608154, + "learning_rate": 0.00013051554717997038, + "loss": 3.5118, + "step": 4589 + }, + { + "epoch": 5.8704, + "grad_norm": 0.5251659154891968, + "learning_rate": 0.000130475164894333, + "loss": 3.4269, + "step": 4590 + }, + { + "epoch": 5.87168, + "grad_norm": 0.5328729748725891, + "learning_rate": 0.00013043478260869564, + "loss": 3.4694, + "step": 4591 + }, + { + "epoch": 5.87296, + "grad_norm": 0.5197456479072571, + "learning_rate": 0.00013039440032305827, + "loss": 3.4146, + "step": 4592 + }, + { + "epoch": 5.87424, + "grad_norm": 0.5089783668518066, + "learning_rate": 0.0001303540180374209, + "loss": 3.4786, + "step": 4593 + }, + { + "epoch": 5.87552, + "grad_norm": 0.5036661624908447, + "learning_rate": 0.00013031363575178356, + "loss": 3.494, + "step": 4594 + }, + { + "epoch": 5.8768, + "grad_norm": 0.5152899622917175, + "learning_rate": 0.00013027325346614616, + "loss": 3.4804, + "step": 4595 + }, + { + "epoch": 5.87808, + "grad_norm": 0.5099830031394958, + "learning_rate": 0.00013023287118050882, + "loss": 3.5013, + "step": 4596 + }, + { + "epoch": 5.87936, + "grad_norm": 0.5049211382865906, + "learning_rate": 0.00013019248889487145, + "loss": 3.4966, + "step": 4597 + }, + { + "epoch": 5.88064, + "grad_norm": 0.5128461122512817, + "learning_rate": 0.00013015210660923405, + "loss": 3.5043, + "step": 4598 + }, + { + "epoch": 5.88192, + "grad_norm": 0.5299509763717651, + "learning_rate": 0.0001301117243235967, + "loss": 3.5566, + "step": 4599 + }, + { + "epoch": 5.8832, + "grad_norm": 0.49836984276771545, + "learning_rate": 0.00013007134203795934, + "loss": 3.4689, + "step": 4600 + }, + { + "epoch": 5.88448, + "grad_norm": 0.5294039845466614, + "learning_rate": 0.00013003095975232197, + "loss": 3.417, + "step": 4601 + }, + { + "epoch": 5.88576, + "grad_norm": 0.5025843381881714, + "learning_rate": 0.0001299905774666846, + "loss": 3.4296, + "step": 4602 + }, + { + "epoch": 5.88704, + "grad_norm": 0.5195445418357849, + "learning_rate": 0.00012995019518104723, + "loss": 3.4922, + "step": 4603 + }, + { + "epoch": 5.88832, + "grad_norm": 0.5041271448135376, + "learning_rate": 0.00012990981289540986, + "loss": 3.3655, + "step": 4604 + }, + { + "epoch": 5.8896, + "grad_norm": 0.5099116563796997, + "learning_rate": 0.00012986943060977252, + "loss": 3.3968, + "step": 4605 + }, + { + "epoch": 5.89088, + "grad_norm": 0.5230572819709778, + "learning_rate": 0.00012982904832413512, + "loss": 3.4775, + "step": 4606 + }, + { + "epoch": 5.89216, + "grad_norm": 0.4989769458770752, + "learning_rate": 0.00012978866603849778, + "loss": 3.4765, + "step": 4607 + }, + { + "epoch": 5.89344, + "grad_norm": 0.5254843831062317, + "learning_rate": 0.0001297482837528604, + "loss": 3.4976, + "step": 4608 + }, + { + "epoch": 5.8947199999999995, + "grad_norm": 0.501315176486969, + "learning_rate": 0.00012970790146722304, + "loss": 3.3925, + "step": 4609 + }, + { + "epoch": 5.896, + "grad_norm": 0.5095430016517639, + "learning_rate": 0.00012966751918158567, + "loss": 3.4135, + "step": 4610 + }, + { + "epoch": 5.89728, + "grad_norm": 0.5203284025192261, + "learning_rate": 0.0001296271368959483, + "loss": 3.5653, + "step": 4611 + }, + { + "epoch": 5.89856, + "grad_norm": 0.524929940700531, + "learning_rate": 0.00012958675461031093, + "loss": 3.4164, + "step": 4612 + }, + { + "epoch": 5.89984, + "grad_norm": 0.49622589349746704, + "learning_rate": 0.00012954637232467356, + "loss": 3.4858, + "step": 4613 + }, + { + "epoch": 5.90112, + "grad_norm": 0.5328811407089233, + "learning_rate": 0.00012950599003903621, + "loss": 3.4519, + "step": 4614 + }, + { + "epoch": 5.9024, + "grad_norm": 0.5233384370803833, + "learning_rate": 0.00012946560775339882, + "loss": 3.4802, + "step": 4615 + }, + { + "epoch": 5.90368, + "grad_norm": 0.5104706287384033, + "learning_rate": 0.00012942522546776147, + "loss": 3.503, + "step": 4616 + }, + { + "epoch": 5.90496, + "grad_norm": 0.5111770033836365, + "learning_rate": 0.0001293848431821241, + "loss": 3.3882, + "step": 4617 + }, + { + "epoch": 5.90624, + "grad_norm": 0.5340044498443604, + "learning_rate": 0.00012934446089648673, + "loss": 3.4865, + "step": 4618 + }, + { + "epoch": 5.90752, + "grad_norm": 0.5179080367088318, + "learning_rate": 0.00012930407861084936, + "loss": 3.448, + "step": 4619 + }, + { + "epoch": 5.9088, + "grad_norm": 0.5176510214805603, + "learning_rate": 0.000129263696325212, + "loss": 3.4914, + "step": 4620 + }, + { + "epoch": 5.91008, + "grad_norm": 0.5105175375938416, + "learning_rate": 0.00012922331403957462, + "loss": 3.4427, + "step": 4621 + }, + { + "epoch": 5.91136, + "grad_norm": 0.4958018362522125, + "learning_rate": 0.00012918293175393728, + "loss": 3.4691, + "step": 4622 + }, + { + "epoch": 5.91264, + "grad_norm": 0.5175390839576721, + "learning_rate": 0.00012914254946829988, + "loss": 3.4176, + "step": 4623 + }, + { + "epoch": 5.91392, + "grad_norm": 0.49366575479507446, + "learning_rate": 0.00012910216718266251, + "loss": 3.4056, + "step": 4624 + }, + { + "epoch": 5.9152000000000005, + "grad_norm": 0.49612316489219666, + "learning_rate": 0.00012906178489702517, + "loss": 3.4146, + "step": 4625 + }, + { + "epoch": 5.91648, + "grad_norm": 0.5153398513793945, + "learning_rate": 0.00012902140261138777, + "loss": 3.4182, + "step": 4626 + }, + { + "epoch": 5.91776, + "grad_norm": 0.5095019936561584, + "learning_rate": 0.00012898102032575043, + "loss": 3.4108, + "step": 4627 + }, + { + "epoch": 5.91904, + "grad_norm": 0.5022311210632324, + "learning_rate": 0.00012894063804011306, + "loss": 3.4513, + "step": 4628 + }, + { + "epoch": 5.92032, + "grad_norm": 0.5156865119934082, + "learning_rate": 0.0001289002557544757, + "loss": 3.4506, + "step": 4629 + }, + { + "epoch": 5.9216, + "grad_norm": 0.4920865595340729, + "learning_rate": 0.00012885987346883832, + "loss": 3.3831, + "step": 4630 + }, + { + "epoch": 5.92288, + "grad_norm": 0.5026581883430481, + "learning_rate": 0.00012881949118320095, + "loss": 3.4406, + "step": 4631 + }, + { + "epoch": 5.92416, + "grad_norm": 0.5237811803817749, + "learning_rate": 0.00012877910889756358, + "loss": 3.441, + "step": 4632 + }, + { + "epoch": 5.92544, + "grad_norm": 0.5131553411483765, + "learning_rate": 0.00012873872661192624, + "loss": 3.4405, + "step": 4633 + }, + { + "epoch": 5.9267199999999995, + "grad_norm": 0.5147265791893005, + "learning_rate": 0.00012869834432628884, + "loss": 3.4757, + "step": 4634 + }, + { + "epoch": 5.928, + "grad_norm": 0.5250530242919922, + "learning_rate": 0.0001286579620406515, + "loss": 3.4194, + "step": 4635 + }, + { + "epoch": 5.92928, + "grad_norm": 0.5036343336105347, + "learning_rate": 0.00012861757975501413, + "loss": 3.3476, + "step": 4636 + }, + { + "epoch": 5.93056, + "grad_norm": 0.5144938826560974, + "learning_rate": 0.00012857719746937676, + "loss": 3.4407, + "step": 4637 + }, + { + "epoch": 5.93184, + "grad_norm": 0.5358858704566956, + "learning_rate": 0.0001285368151837394, + "loss": 3.4875, + "step": 4638 + }, + { + "epoch": 5.93312, + "grad_norm": 0.503612220287323, + "learning_rate": 0.00012849643289810202, + "loss": 3.4862, + "step": 4639 + }, + { + "epoch": 5.9344, + "grad_norm": 0.5070048570632935, + "learning_rate": 0.00012845605061246465, + "loss": 3.4842, + "step": 4640 + }, + { + "epoch": 5.93568, + "grad_norm": 0.5292816758155823, + "learning_rate": 0.00012841566832682728, + "loss": 3.4764, + "step": 4641 + }, + { + "epoch": 5.93696, + "grad_norm": 0.5156516432762146, + "learning_rate": 0.0001283752860411899, + "loss": 3.4486, + "step": 4642 + }, + { + "epoch": 5.93824, + "grad_norm": 0.5131589770317078, + "learning_rate": 0.00012833490375555254, + "loss": 3.4249, + "step": 4643 + }, + { + "epoch": 5.93952, + "grad_norm": 0.5197781324386597, + "learning_rate": 0.0001282945214699152, + "loss": 3.4929, + "step": 4644 + }, + { + "epoch": 5.9408, + "grad_norm": 0.5134233236312866, + "learning_rate": 0.00012825413918427783, + "loss": 3.445, + "step": 4645 + }, + { + "epoch": 5.94208, + "grad_norm": 0.5193766355514526, + "learning_rate": 0.00012821375689864046, + "loss": 3.4006, + "step": 4646 + }, + { + "epoch": 5.94336, + "grad_norm": 0.5153205990791321, + "learning_rate": 0.0001281733746130031, + "loss": 3.4529, + "step": 4647 + }, + { + "epoch": 5.94464, + "grad_norm": 0.5201491713523865, + "learning_rate": 0.00012813299232736572, + "loss": 3.4742, + "step": 4648 + }, + { + "epoch": 5.94592, + "grad_norm": 0.5122271180152893, + "learning_rate": 0.00012809261004172835, + "loss": 3.4414, + "step": 4649 + }, + { + "epoch": 5.9472000000000005, + "grad_norm": 0.5078105926513672, + "learning_rate": 0.000128052227756091, + "loss": 3.3802, + "step": 4650 + }, + { + "epoch": 5.94848, + "grad_norm": 0.5191426873207092, + "learning_rate": 0.0001280118454704536, + "loss": 3.3924, + "step": 4651 + }, + { + "epoch": 5.94976, + "grad_norm": 0.511929452419281, + "learning_rate": 0.00012797146318481624, + "loss": 3.4573, + "step": 4652 + }, + { + "epoch": 5.95104, + "grad_norm": 0.512454628944397, + "learning_rate": 0.0001279310808991789, + "loss": 3.4888, + "step": 4653 + }, + { + "epoch": 5.95232, + "grad_norm": 0.5255744457244873, + "learning_rate": 0.0001278906986135415, + "loss": 3.5094, + "step": 4654 + }, + { + "epoch": 5.9536, + "grad_norm": 0.5149945616722107, + "learning_rate": 0.00012785031632790416, + "loss": 3.4169, + "step": 4655 + }, + { + "epoch": 5.95488, + "grad_norm": 0.507247269153595, + "learning_rate": 0.00012780993404226679, + "loss": 3.4484, + "step": 4656 + }, + { + "epoch": 5.95616, + "grad_norm": 0.5141899585723877, + "learning_rate": 0.00012776955175662942, + "loss": 3.4004, + "step": 4657 + }, + { + "epoch": 5.95744, + "grad_norm": 0.525027334690094, + "learning_rate": 0.00012772916947099205, + "loss": 3.5179, + "step": 4658 + }, + { + "epoch": 5.95872, + "grad_norm": 0.5136873126029968, + "learning_rate": 0.00012768878718535468, + "loss": 3.4939, + "step": 4659 + }, + { + "epoch": 5.96, + "grad_norm": 0.5179911255836487, + "learning_rate": 0.0001276484048997173, + "loss": 3.4725, + "step": 4660 + }, + { + "epoch": 5.96128, + "grad_norm": 0.5283648371696472, + "learning_rate": 0.00012760802261407996, + "loss": 3.4843, + "step": 4661 + }, + { + "epoch": 5.96256, + "grad_norm": 0.5253385305404663, + "learning_rate": 0.00012756764032844257, + "loss": 3.4472, + "step": 4662 + }, + { + "epoch": 5.96384, + "grad_norm": 0.5267717242240906, + "learning_rate": 0.00012752725804280522, + "loss": 3.4637, + "step": 4663 + }, + { + "epoch": 5.96512, + "grad_norm": 0.5325003266334534, + "learning_rate": 0.00012748687575716785, + "loss": 3.4575, + "step": 4664 + }, + { + "epoch": 5.9664, + "grad_norm": 0.51334148645401, + "learning_rate": 0.00012744649347153048, + "loss": 3.4401, + "step": 4665 + }, + { + "epoch": 5.96768, + "grad_norm": 0.5157260298728943, + "learning_rate": 0.0001274061111858931, + "loss": 3.4944, + "step": 4666 + }, + { + "epoch": 5.96896, + "grad_norm": 0.5126465559005737, + "learning_rate": 0.00012736572890025574, + "loss": 3.4313, + "step": 4667 + }, + { + "epoch": 5.97024, + "grad_norm": 0.5327089428901672, + "learning_rate": 0.00012732534661461837, + "loss": 3.4903, + "step": 4668 + }, + { + "epoch": 5.97152, + "grad_norm": 0.5237742066383362, + "learning_rate": 0.000127284964328981, + "loss": 3.4409, + "step": 4669 + }, + { + "epoch": 5.9728, + "grad_norm": 0.5220187306404114, + "learning_rate": 0.00012724458204334363, + "loss": 3.4712, + "step": 4670 + }, + { + "epoch": 5.97408, + "grad_norm": 0.5200389623641968, + "learning_rate": 0.00012720419975770626, + "loss": 3.5174, + "step": 4671 + }, + { + "epoch": 5.97536, + "grad_norm": 0.5262846946716309, + "learning_rate": 0.00012716381747206892, + "loss": 3.4454, + "step": 4672 + }, + { + "epoch": 5.97664, + "grad_norm": 0.5050617456436157, + "learning_rate": 0.00012712343518643155, + "loss": 3.4899, + "step": 4673 + }, + { + "epoch": 5.97792, + "grad_norm": 0.503463089466095, + "learning_rate": 0.00012708305290079418, + "loss": 3.4283, + "step": 4674 + }, + { + "epoch": 5.9792, + "grad_norm": 0.5130547881126404, + "learning_rate": 0.0001270426706151568, + "loss": 3.4625, + "step": 4675 + }, + { + "epoch": 5.98048, + "grad_norm": 0.5259498953819275, + "learning_rate": 0.00012700228832951944, + "loss": 3.491, + "step": 4676 + }, + { + "epoch": 5.9817599999999995, + "grad_norm": 0.4981289803981781, + "learning_rate": 0.00012696190604388207, + "loss": 3.4928, + "step": 4677 + }, + { + "epoch": 5.98304, + "grad_norm": 0.520211398601532, + "learning_rate": 0.0001269215237582447, + "loss": 3.4893, + "step": 4678 + }, + { + "epoch": 5.98432, + "grad_norm": 0.5219488739967346, + "learning_rate": 0.00012688114147260733, + "loss": 3.4604, + "step": 4679 + }, + { + "epoch": 5.9856, + "grad_norm": 0.5058456063270569, + "learning_rate": 0.00012684075918696996, + "loss": 3.4903, + "step": 4680 + }, + { + "epoch": 5.98688, + "grad_norm": 0.5197397470474243, + "learning_rate": 0.00012680037690133262, + "loss": 3.493, + "step": 4681 + }, + { + "epoch": 5.98816, + "grad_norm": 0.5335855484008789, + "learning_rate": 0.00012675999461569522, + "loss": 3.5079, + "step": 4682 + }, + { + "epoch": 5.98944, + "grad_norm": 0.5145869255065918, + "learning_rate": 0.00012671961233005788, + "loss": 3.4, + "step": 4683 + }, + { + "epoch": 5.99072, + "grad_norm": 0.5230121612548828, + "learning_rate": 0.0001266792300444205, + "loss": 3.4968, + "step": 4684 + }, + { + "epoch": 5.992, + "grad_norm": 0.5136299729347229, + "learning_rate": 0.00012663884775878314, + "loss": 3.4992, + "step": 4685 + }, + { + "epoch": 5.99328, + "grad_norm": 0.5114295482635498, + "learning_rate": 0.00012659846547314577, + "loss": 3.4265, + "step": 4686 + }, + { + "epoch": 5.99456, + "grad_norm": 0.511169970035553, + "learning_rate": 0.0001265580831875084, + "loss": 3.4609, + "step": 4687 + }, + { + "epoch": 5.99584, + "grad_norm": 0.5237197279930115, + "learning_rate": 0.00012651770090187103, + "loss": 3.4253, + "step": 4688 + }, + { + "epoch": 5.99712, + "grad_norm": 0.5328317880630493, + "learning_rate": 0.00012647731861623369, + "loss": 3.3671, + "step": 4689 + }, + { + "epoch": 5.9984, + "grad_norm": 0.5123398303985596, + "learning_rate": 0.0001264369363305963, + "loss": 3.4323, + "step": 4690 + }, + { + "epoch": 5.99968, + "grad_norm": 0.5049529671669006, + "learning_rate": 0.00012639655404495892, + "loss": 3.4654, + "step": 4691 + }, + { + "epoch": 6.0, + "grad_norm": 0.8919804692268372, + "learning_rate": 0.00012635617175932158, + "loss": 3.1534, + "step": 4692 + }, + { + "epoch": 6.00128, + "grad_norm": 0.5235251188278198, + "learning_rate": 0.00012631578947368418, + "loss": 3.476, + "step": 4693 + }, + { + "epoch": 6.00256, + "grad_norm": 0.5221917629241943, + "learning_rate": 0.00012627540718804684, + "loss": 3.4639, + "step": 4694 + }, + { + "epoch": 6.00384, + "grad_norm": 0.5179303288459778, + "learning_rate": 0.00012623502490240947, + "loss": 3.4869, + "step": 4695 + }, + { + "epoch": 6.00512, + "grad_norm": 0.509280264377594, + "learning_rate": 0.0001261946426167721, + "loss": 3.5512, + "step": 4696 + }, + { + "epoch": 6.0064, + "grad_norm": 0.5337740182876587, + "learning_rate": 0.00012615426033113473, + "loss": 3.4508, + "step": 4697 + }, + { + "epoch": 6.00768, + "grad_norm": 0.511637806892395, + "learning_rate": 0.00012611387804549736, + "loss": 3.4186, + "step": 4698 + }, + { + "epoch": 6.00896, + "grad_norm": 0.5246609449386597, + "learning_rate": 0.00012607349575986, + "loss": 3.4857, + "step": 4699 + }, + { + "epoch": 6.01024, + "grad_norm": 0.5246115922927856, + "learning_rate": 0.00012603311347422264, + "loss": 3.4306, + "step": 4700 + }, + { + "epoch": 6.01152, + "grad_norm": 0.5044547915458679, + "learning_rate": 0.00012599273118858527, + "loss": 3.4457, + "step": 4701 + }, + { + "epoch": 6.0128, + "grad_norm": 0.524234414100647, + "learning_rate": 0.0001259523489029479, + "loss": 3.4756, + "step": 4702 + }, + { + "epoch": 6.01408, + "grad_norm": 0.5325637459754944, + "learning_rate": 0.00012591196661731053, + "loss": 3.4793, + "step": 4703 + }, + { + "epoch": 6.01536, + "grad_norm": 0.5177270174026489, + "learning_rate": 0.00012587158433167316, + "loss": 3.4615, + "step": 4704 + }, + { + "epoch": 6.01664, + "grad_norm": 0.5080181360244751, + "learning_rate": 0.0001258312020460358, + "loss": 3.4701, + "step": 4705 + }, + { + "epoch": 6.01792, + "grad_norm": 0.5270411968231201, + "learning_rate": 0.00012579081976039842, + "loss": 3.4342, + "step": 4706 + }, + { + "epoch": 6.0192, + "grad_norm": 0.526719868183136, + "learning_rate": 0.00012575043747476105, + "loss": 3.4343, + "step": 4707 + }, + { + "epoch": 6.02048, + "grad_norm": 0.5216172933578491, + "learning_rate": 0.00012571005518912368, + "loss": 3.4759, + "step": 4708 + }, + { + "epoch": 6.0217600000000004, + "grad_norm": 0.5187539458274841, + "learning_rate": 0.00012566967290348634, + "loss": 3.5129, + "step": 4709 + }, + { + "epoch": 6.02304, + "grad_norm": 0.5328104496002197, + "learning_rate": 0.00012562929061784894, + "loss": 3.4285, + "step": 4710 + }, + { + "epoch": 6.02432, + "grad_norm": 0.5354236364364624, + "learning_rate": 0.0001255889083322116, + "loss": 3.4189, + "step": 4711 + }, + { + "epoch": 6.0256, + "grad_norm": 0.5221328139305115, + "learning_rate": 0.00012554852604657423, + "loss": 3.338, + "step": 4712 + }, + { + "epoch": 6.02688, + "grad_norm": 0.5087037682533264, + "learning_rate": 0.00012550814376093686, + "loss": 3.4602, + "step": 4713 + }, + { + "epoch": 6.02816, + "grad_norm": 0.5138629674911499, + "learning_rate": 0.0001254677614752995, + "loss": 3.4034, + "step": 4714 + }, + { + "epoch": 6.02944, + "grad_norm": 0.5265618562698364, + "learning_rate": 0.00012542737918966212, + "loss": 3.4126, + "step": 4715 + }, + { + "epoch": 6.03072, + "grad_norm": 0.5210886597633362, + "learning_rate": 0.00012538699690402475, + "loss": 3.4671, + "step": 4716 + }, + { + "epoch": 6.032, + "grad_norm": 0.5192465782165527, + "learning_rate": 0.0001253466146183874, + "loss": 3.3955, + "step": 4717 + }, + { + "epoch": 6.03328, + "grad_norm": 0.5116483569145203, + "learning_rate": 0.00012530623233275, + "loss": 3.4776, + "step": 4718 + }, + { + "epoch": 6.03456, + "grad_norm": 0.5206137299537659, + "learning_rate": 0.00012526585004711264, + "loss": 3.4733, + "step": 4719 + }, + { + "epoch": 6.03584, + "grad_norm": 0.523862361907959, + "learning_rate": 0.0001252254677614753, + "loss": 3.3922, + "step": 4720 + }, + { + "epoch": 6.03712, + "grad_norm": 0.5030114054679871, + "learning_rate": 0.0001251850854758379, + "loss": 3.4266, + "step": 4721 + }, + { + "epoch": 6.0384, + "grad_norm": 0.5357561111450195, + "learning_rate": 0.00012514470319020056, + "loss": 3.4608, + "step": 4722 + }, + { + "epoch": 6.03968, + "grad_norm": 0.5137104392051697, + "learning_rate": 0.0001251043209045632, + "loss": 3.4066, + "step": 4723 + }, + { + "epoch": 6.04096, + "grad_norm": 0.49622246623039246, + "learning_rate": 0.00012506393861892582, + "loss": 3.4098, + "step": 4724 + }, + { + "epoch": 6.04224, + "grad_norm": 0.5154051780700684, + "learning_rate": 0.00012502355633328845, + "loss": 3.4739, + "step": 4725 + }, + { + "epoch": 6.04352, + "grad_norm": 0.5417183041572571, + "learning_rate": 0.00012498317404765108, + "loss": 3.4566, + "step": 4726 + }, + { + "epoch": 6.0448, + "grad_norm": 0.5043236613273621, + "learning_rate": 0.0001249427917620137, + "loss": 3.5022, + "step": 4727 + }, + { + "epoch": 6.04608, + "grad_norm": 0.5146865248680115, + "learning_rate": 0.00012490240947637637, + "loss": 3.5059, + "step": 4728 + }, + { + "epoch": 6.04736, + "grad_norm": 0.5209075212478638, + "learning_rate": 0.00012486202719073897, + "loss": 3.5078, + "step": 4729 + }, + { + "epoch": 6.04864, + "grad_norm": 0.5176512002944946, + "learning_rate": 0.00012482164490510163, + "loss": 3.4633, + "step": 4730 + }, + { + "epoch": 6.04992, + "grad_norm": 0.5079405903816223, + "learning_rate": 0.00012478126261946426, + "loss": 3.4449, + "step": 4731 + }, + { + "epoch": 6.0512, + "grad_norm": 0.5069144368171692, + "learning_rate": 0.0001247408803338269, + "loss": 3.5183, + "step": 4732 + }, + { + "epoch": 6.05248, + "grad_norm": 0.5130882263183594, + "learning_rate": 0.00012470049804818952, + "loss": 3.4276, + "step": 4733 + }, + { + "epoch": 6.05376, + "grad_norm": 0.5110294222831726, + "learning_rate": 0.00012466011576255215, + "loss": 3.4404, + "step": 4734 + }, + { + "epoch": 6.05504, + "grad_norm": 0.513787031173706, + "learning_rate": 0.00012461973347691478, + "loss": 3.4809, + "step": 4735 + }, + { + "epoch": 6.05632, + "grad_norm": 0.4997158348560333, + "learning_rate": 0.0001245793511912774, + "loss": 3.4271, + "step": 4736 + }, + { + "epoch": 6.0576, + "grad_norm": 0.5131158828735352, + "learning_rate": 0.00012453896890564007, + "loss": 3.4951, + "step": 4737 + }, + { + "epoch": 6.05888, + "grad_norm": 0.511933445930481, + "learning_rate": 0.00012449858662000267, + "loss": 3.4584, + "step": 4738 + }, + { + "epoch": 6.06016, + "grad_norm": 0.5187560319900513, + "learning_rate": 0.00012445820433436533, + "loss": 3.4039, + "step": 4739 + }, + { + "epoch": 6.06144, + "grad_norm": 0.5163741111755371, + "learning_rate": 0.00012441782204872796, + "loss": 3.4948, + "step": 4740 + }, + { + "epoch": 6.06272, + "grad_norm": 0.5106804370880127, + "learning_rate": 0.00012437743976309059, + "loss": 3.4327, + "step": 4741 + }, + { + "epoch": 6.064, + "grad_norm": 0.5465947985649109, + "learning_rate": 0.00012433705747745322, + "loss": 3.5185, + "step": 4742 + }, + { + "epoch": 6.06528, + "grad_norm": 0.49931657314300537, + "learning_rate": 0.00012429667519181585, + "loss": 3.3982, + "step": 4743 + }, + { + "epoch": 6.06656, + "grad_norm": 0.510944664478302, + "learning_rate": 0.00012425629290617848, + "loss": 3.4117, + "step": 4744 + }, + { + "epoch": 6.06784, + "grad_norm": 0.5148774981498718, + "learning_rate": 0.0001242159106205411, + "loss": 3.4909, + "step": 4745 + }, + { + "epoch": 6.06912, + "grad_norm": 0.5062376260757446, + "learning_rate": 0.00012417552833490374, + "loss": 3.4504, + "step": 4746 + }, + { + "epoch": 6.0704, + "grad_norm": 0.5255651473999023, + "learning_rate": 0.00012413514604926637, + "loss": 3.5011, + "step": 4747 + }, + { + "epoch": 6.07168, + "grad_norm": 0.5072139501571655, + "learning_rate": 0.00012409476376362902, + "loss": 3.5092, + "step": 4748 + }, + { + "epoch": 6.07296, + "grad_norm": 0.5306870341300964, + "learning_rate": 0.00012405438147799163, + "loss": 3.4533, + "step": 4749 + }, + { + "epoch": 6.07424, + "grad_norm": 0.5114386677742004, + "learning_rate": 0.00012401399919235428, + "loss": 3.4855, + "step": 4750 + }, + { + "epoch": 6.07552, + "grad_norm": 0.5131011605262756, + "learning_rate": 0.00012397361690671691, + "loss": 3.4171, + "step": 4751 + }, + { + "epoch": 6.0768, + "grad_norm": 0.5139480233192444, + "learning_rate": 0.00012393323462107954, + "loss": 3.3778, + "step": 4752 + }, + { + "epoch": 6.07808, + "grad_norm": 0.5153706073760986, + "learning_rate": 0.00012389285233544217, + "loss": 3.4194, + "step": 4753 + }, + { + "epoch": 6.07936, + "grad_norm": 0.5061531066894531, + "learning_rate": 0.0001238524700498048, + "loss": 3.4817, + "step": 4754 + }, + { + "epoch": 6.08064, + "grad_norm": 0.5213887691497803, + "learning_rate": 0.00012381208776416743, + "loss": 3.4902, + "step": 4755 + }, + { + "epoch": 6.08192, + "grad_norm": 0.5053319931030273, + "learning_rate": 0.0001237717054785301, + "loss": 3.4497, + "step": 4756 + }, + { + "epoch": 6.0832, + "grad_norm": 0.5132631659507751, + "learning_rate": 0.0001237313231928927, + "loss": 3.4278, + "step": 4757 + }, + { + "epoch": 6.08448, + "grad_norm": 0.5187950730323792, + "learning_rate": 0.00012369094090725535, + "loss": 3.4636, + "step": 4758 + }, + { + "epoch": 6.08576, + "grad_norm": 0.5095174908638, + "learning_rate": 0.00012365055862161798, + "loss": 3.5092, + "step": 4759 + }, + { + "epoch": 6.08704, + "grad_norm": 0.5095679759979248, + "learning_rate": 0.0001236101763359806, + "loss": 3.4271, + "step": 4760 + }, + { + "epoch": 6.08832, + "grad_norm": 0.5283337831497192, + "learning_rate": 0.00012356979405034324, + "loss": 3.4452, + "step": 4761 + }, + { + "epoch": 6.0896, + "grad_norm": 0.5230772495269775, + "learning_rate": 0.00012352941176470587, + "loss": 3.4283, + "step": 4762 + }, + { + "epoch": 6.09088, + "grad_norm": 0.5018015503883362, + "learning_rate": 0.0001234890294790685, + "loss": 3.5196, + "step": 4763 + }, + { + "epoch": 6.09216, + "grad_norm": 0.5031048655509949, + "learning_rate": 0.00012344864719343113, + "loss": 3.4459, + "step": 4764 + }, + { + "epoch": 6.09344, + "grad_norm": 0.49931055307388306, + "learning_rate": 0.00012340826490779376, + "loss": 3.4384, + "step": 4765 + }, + { + "epoch": 6.09472, + "grad_norm": 0.5125550627708435, + "learning_rate": 0.0001233678826221564, + "loss": 3.5177, + "step": 4766 + }, + { + "epoch": 6.096, + "grad_norm": 0.5063710808753967, + "learning_rate": 0.00012332750033651905, + "loss": 3.3931, + "step": 4767 + }, + { + "epoch": 6.09728, + "grad_norm": 0.5210559368133545, + "learning_rate": 0.00012328711805088168, + "loss": 3.4212, + "step": 4768 + }, + { + "epoch": 6.09856, + "grad_norm": 0.5136427879333496, + "learning_rate": 0.0001232467357652443, + "loss": 3.513, + "step": 4769 + }, + { + "epoch": 6.09984, + "grad_norm": 0.5130379796028137, + "learning_rate": 0.00012320635347960694, + "loss": 3.4504, + "step": 4770 + }, + { + "epoch": 6.10112, + "grad_norm": 0.5283026099205017, + "learning_rate": 0.00012316597119396957, + "loss": 3.4788, + "step": 4771 + }, + { + "epoch": 6.1024, + "grad_norm": 0.5066593289375305, + "learning_rate": 0.0001231255889083322, + "loss": 3.4462, + "step": 4772 + }, + { + "epoch": 6.10368, + "grad_norm": 0.5073418021202087, + "learning_rate": 0.00012308520662269483, + "loss": 3.4944, + "step": 4773 + }, + { + "epoch": 6.10496, + "grad_norm": 0.5373370051383972, + "learning_rate": 0.00012304482433705746, + "loss": 3.4202, + "step": 4774 + }, + { + "epoch": 6.10624, + "grad_norm": 0.5177907347679138, + "learning_rate": 0.0001230044420514201, + "loss": 3.455, + "step": 4775 + }, + { + "epoch": 6.10752, + "grad_norm": 0.515060305595398, + "learning_rate": 0.00012296405976578275, + "loss": 3.3564, + "step": 4776 + }, + { + "epoch": 6.1088, + "grad_norm": 0.5271644592285156, + "learning_rate": 0.00012292367748014535, + "loss": 3.5478, + "step": 4777 + }, + { + "epoch": 6.11008, + "grad_norm": 0.5147733688354492, + "learning_rate": 0.000122883295194508, + "loss": 3.4428, + "step": 4778 + }, + { + "epoch": 6.11136, + "grad_norm": 0.5317122340202332, + "learning_rate": 0.00012284291290887064, + "loss": 3.4613, + "step": 4779 + }, + { + "epoch": 6.11264, + "grad_norm": 0.5279807448387146, + "learning_rate": 0.00012280253062323327, + "loss": 3.3885, + "step": 4780 + }, + { + "epoch": 6.11392, + "grad_norm": 0.5295637249946594, + "learning_rate": 0.0001227621483375959, + "loss": 3.4598, + "step": 4781 + }, + { + "epoch": 6.1152, + "grad_norm": 0.5437308549880981, + "learning_rate": 0.00012272176605195853, + "loss": 3.4348, + "step": 4782 + }, + { + "epoch": 6.11648, + "grad_norm": 0.5441926717758179, + "learning_rate": 0.00012268138376632116, + "loss": 3.4985, + "step": 4783 + }, + { + "epoch": 6.11776, + "grad_norm": 0.5059524774551392, + "learning_rate": 0.00012264100148068381, + "loss": 3.3963, + "step": 4784 + }, + { + "epoch": 6.11904, + "grad_norm": 0.5069935917854309, + "learning_rate": 0.00012260061919504642, + "loss": 3.4357, + "step": 4785 + }, + { + "epoch": 6.12032, + "grad_norm": 0.5232740044593811, + "learning_rate": 0.00012256023690940905, + "loss": 3.417, + "step": 4786 + }, + { + "epoch": 6.1216, + "grad_norm": 0.5216338634490967, + "learning_rate": 0.0001225198546237717, + "loss": 3.5042, + "step": 4787 + }, + { + "epoch": 6.12288, + "grad_norm": 0.5343948006629944, + "learning_rate": 0.00012247947233813433, + "loss": 3.4608, + "step": 4788 + }, + { + "epoch": 6.12416, + "grad_norm": 0.5192834138870239, + "learning_rate": 0.00012243909005249696, + "loss": 3.4506, + "step": 4789 + }, + { + "epoch": 6.12544, + "grad_norm": 0.5342220067977905, + "learning_rate": 0.0001223987077668596, + "loss": 3.5125, + "step": 4790 + }, + { + "epoch": 6.12672, + "grad_norm": 0.5249612331390381, + "learning_rate": 0.00012235832548122223, + "loss": 3.4471, + "step": 4791 + }, + { + "epoch": 6.128, + "grad_norm": 0.521793782711029, + "learning_rate": 0.00012231794319558486, + "loss": 3.4716, + "step": 4792 + }, + { + "epoch": 6.12928, + "grad_norm": 0.5028202533721924, + "learning_rate": 0.00012227756090994749, + "loss": 3.482, + "step": 4793 + }, + { + "epoch": 6.13056, + "grad_norm": 0.5394800901412964, + "learning_rate": 0.00012223717862431012, + "loss": 3.4238, + "step": 4794 + }, + { + "epoch": 6.13184, + "grad_norm": 0.5089391469955444, + "learning_rate": 0.00012219679633867277, + "loss": 3.4219, + "step": 4795 + }, + { + "epoch": 6.13312, + "grad_norm": 0.5048559904098511, + "learning_rate": 0.0001221564140530354, + "loss": 3.4201, + "step": 4796 + }, + { + "epoch": 6.1344, + "grad_norm": 0.5039951801300049, + "learning_rate": 0.00012211603176739803, + "loss": 3.492, + "step": 4797 + }, + { + "epoch": 6.13568, + "grad_norm": 0.5205702185630798, + "learning_rate": 0.00012207564948176066, + "loss": 3.4307, + "step": 4798 + }, + { + "epoch": 6.13696, + "grad_norm": 0.5324816107749939, + "learning_rate": 0.00012203526719612328, + "loss": 3.4693, + "step": 4799 + }, + { + "epoch": 6.13824, + "grad_norm": 0.4904349446296692, + "learning_rate": 0.00012199488491048592, + "loss": 3.3403, + "step": 4800 + }, + { + "epoch": 6.13952, + "grad_norm": 0.5032051801681519, + "learning_rate": 0.00012195450262484855, + "loss": 3.3592, + "step": 4801 + }, + { + "epoch": 6.1408, + "grad_norm": 0.5135837197303772, + "learning_rate": 0.0001219141203392112, + "loss": 3.4193, + "step": 4802 + }, + { + "epoch": 6.14208, + "grad_norm": 0.49779608845710754, + "learning_rate": 0.00012187373805357381, + "loss": 3.4737, + "step": 4803 + }, + { + "epoch": 6.14336, + "grad_norm": 0.5166708827018738, + "learning_rate": 0.00012183335576793646, + "loss": 3.4728, + "step": 4804 + }, + { + "epoch": 6.14464, + "grad_norm": 0.524376392364502, + "learning_rate": 0.00012179297348229909, + "loss": 3.5283, + "step": 4805 + }, + { + "epoch": 6.14592, + "grad_norm": 0.5073275566101074, + "learning_rate": 0.00012175259119666173, + "loss": 3.4763, + "step": 4806 + }, + { + "epoch": 6.1472, + "grad_norm": 0.5140321850776672, + "learning_rate": 0.00012171220891102435, + "loss": 3.5098, + "step": 4807 + }, + { + "epoch": 6.14848, + "grad_norm": 0.5006900429725647, + "learning_rate": 0.00012167182662538699, + "loss": 3.4547, + "step": 4808 + }, + { + "epoch": 6.14976, + "grad_norm": 0.5014587640762329, + "learning_rate": 0.00012163144433974962, + "loss": 3.457, + "step": 4809 + }, + { + "epoch": 6.15104, + "grad_norm": 0.5160591006278992, + "learning_rate": 0.00012159106205411226, + "loss": 3.4686, + "step": 4810 + }, + { + "epoch": 6.15232, + "grad_norm": 0.5109474658966064, + "learning_rate": 0.00012155067976847488, + "loss": 3.4054, + "step": 4811 + }, + { + "epoch": 6.1536, + "grad_norm": 0.49859675765037537, + "learning_rate": 0.00012151029748283751, + "loss": 3.4768, + "step": 4812 + }, + { + "epoch": 6.15488, + "grad_norm": 0.5176275372505188, + "learning_rate": 0.00012146991519720015, + "loss": 3.4349, + "step": 4813 + }, + { + "epoch": 6.15616, + "grad_norm": 0.5007618069648743, + "learning_rate": 0.00012142953291156277, + "loss": 3.3093, + "step": 4814 + }, + { + "epoch": 6.15744, + "grad_norm": 0.5273295044898987, + "learning_rate": 0.00012138915062592541, + "loss": 3.4368, + "step": 4815 + }, + { + "epoch": 6.15872, + "grad_norm": 0.49494895339012146, + "learning_rate": 0.00012134876834028804, + "loss": 3.4666, + "step": 4816 + }, + { + "epoch": 6.16, + "grad_norm": 0.5105710625648499, + "learning_rate": 0.00012130838605465069, + "loss": 3.4899, + "step": 4817 + }, + { + "epoch": 6.16128, + "grad_norm": 0.5119180083274841, + "learning_rate": 0.0001212680037690133, + "loss": 3.4086, + "step": 4818 + }, + { + "epoch": 6.16256, + "grad_norm": 0.5197157263755798, + "learning_rate": 0.00012122762148337595, + "loss": 3.4382, + "step": 4819 + }, + { + "epoch": 6.16384, + "grad_norm": 0.5209784507751465, + "learning_rate": 0.00012118723919773858, + "loss": 3.4292, + "step": 4820 + }, + { + "epoch": 6.16512, + "grad_norm": 0.5159290432929993, + "learning_rate": 0.00012114685691210122, + "loss": 3.48, + "step": 4821 + }, + { + "epoch": 6.1664, + "grad_norm": 0.5117263793945312, + "learning_rate": 0.00012110647462646384, + "loss": 3.4611, + "step": 4822 + }, + { + "epoch": 6.16768, + "grad_norm": 0.5287443995475769, + "learning_rate": 0.00012106609234082648, + "loss": 3.4852, + "step": 4823 + }, + { + "epoch": 6.16896, + "grad_norm": 0.5062198042869568, + "learning_rate": 0.00012102571005518911, + "loss": 3.3852, + "step": 4824 + }, + { + "epoch": 6.17024, + "grad_norm": 0.5322122573852539, + "learning_rate": 0.00012098532776955176, + "loss": 3.5064, + "step": 4825 + }, + { + "epoch": 6.17152, + "grad_norm": 0.525723397731781, + "learning_rate": 0.00012094494548391439, + "loss": 3.486, + "step": 4826 + }, + { + "epoch": 6.1728, + "grad_norm": 0.5092231035232544, + "learning_rate": 0.000120904563198277, + "loss": 3.4674, + "step": 4827 + }, + { + "epoch": 6.17408, + "grad_norm": 0.5155490636825562, + "learning_rate": 0.00012086418091263965, + "loss": 3.4667, + "step": 4828 + }, + { + "epoch": 6.17536, + "grad_norm": 0.5157750248908997, + "learning_rate": 0.00012082379862700228, + "loss": 3.4403, + "step": 4829 + }, + { + "epoch": 6.17664, + "grad_norm": 0.49893108010292053, + "learning_rate": 0.00012078341634136492, + "loss": 3.4855, + "step": 4830 + }, + { + "epoch": 6.17792, + "grad_norm": 0.5173389911651611, + "learning_rate": 0.00012074303405572754, + "loss": 3.5103, + "step": 4831 + }, + { + "epoch": 6.1792, + "grad_norm": 0.515521228313446, + "learning_rate": 0.00012070265177009018, + "loss": 3.5259, + "step": 4832 + }, + { + "epoch": 6.18048, + "grad_norm": 0.5073827505111694, + "learning_rate": 0.00012066226948445281, + "loss": 3.46, + "step": 4833 + }, + { + "epoch": 6.18176, + "grad_norm": 0.5405731797218323, + "learning_rate": 0.00012062188719881545, + "loss": 3.4988, + "step": 4834 + }, + { + "epoch": 6.18304, + "grad_norm": 0.5233080387115479, + "learning_rate": 0.00012058150491317807, + "loss": 3.4526, + "step": 4835 + }, + { + "epoch": 6.18432, + "grad_norm": 0.5276393294334412, + "learning_rate": 0.00012054112262754071, + "loss": 3.4405, + "step": 4836 + }, + { + "epoch": 6.1856, + "grad_norm": 0.5196059346199036, + "learning_rate": 0.00012050074034190334, + "loss": 3.4805, + "step": 4837 + }, + { + "epoch": 6.18688, + "grad_norm": 0.5034788250923157, + "learning_rate": 0.00012046035805626599, + "loss": 3.5371, + "step": 4838 + }, + { + "epoch": 6.18816, + "grad_norm": 0.516704797744751, + "learning_rate": 0.0001204199757706286, + "loss": 3.4467, + "step": 4839 + }, + { + "epoch": 6.18944, + "grad_norm": 0.512168288230896, + "learning_rate": 0.00012037959348499123, + "loss": 3.4096, + "step": 4840 + }, + { + "epoch": 6.19072, + "grad_norm": 0.5281592607498169, + "learning_rate": 0.00012033921119935388, + "loss": 3.5391, + "step": 4841 + }, + { + "epoch": 6.192, + "grad_norm": 0.49945464730262756, + "learning_rate": 0.0001202988289137165, + "loss": 3.478, + "step": 4842 + }, + { + "epoch": 6.19328, + "grad_norm": 0.5051952004432678, + "learning_rate": 0.00012025844662807914, + "loss": 3.4026, + "step": 4843 + }, + { + "epoch": 6.19456, + "grad_norm": 0.5274873971939087, + "learning_rate": 0.00012021806434244177, + "loss": 3.5439, + "step": 4844 + }, + { + "epoch": 6.19584, + "grad_norm": 0.5015951991081238, + "learning_rate": 0.00012017768205680441, + "loss": 3.4883, + "step": 4845 + }, + { + "epoch": 6.19712, + "grad_norm": 0.5088681578636169, + "learning_rate": 0.00012013729977116703, + "loss": 3.4458, + "step": 4846 + }, + { + "epoch": 6.1984, + "grad_norm": 0.49841299653053284, + "learning_rate": 0.00012009691748552967, + "loss": 3.4178, + "step": 4847 + }, + { + "epoch": 6.19968, + "grad_norm": 0.5172284245491028, + "learning_rate": 0.0001200565351998923, + "loss": 3.4052, + "step": 4848 + }, + { + "epoch": 6.20096, + "grad_norm": 0.5176679491996765, + "learning_rate": 0.00012001615291425495, + "loss": 3.4595, + "step": 4849 + }, + { + "epoch": 6.20224, + "grad_norm": 0.5075709819793701, + "learning_rate": 0.00011997577062861756, + "loss": 3.439, + "step": 4850 + }, + { + "epoch": 6.20352, + "grad_norm": 0.5104464888572693, + "learning_rate": 0.0001199353883429802, + "loss": 3.4479, + "step": 4851 + }, + { + "epoch": 6.2048, + "grad_norm": 0.504559338092804, + "learning_rate": 0.00011989500605734284, + "loss": 3.4244, + "step": 4852 + }, + { + "epoch": 6.20608, + "grad_norm": 0.5160381197929382, + "learning_rate": 0.00011985462377170547, + "loss": 3.4028, + "step": 4853 + }, + { + "epoch": 6.2073599999999995, + "grad_norm": 0.532819390296936, + "learning_rate": 0.0001198142414860681, + "loss": 3.4141, + "step": 4854 + }, + { + "epoch": 6.20864, + "grad_norm": 0.5191538333892822, + "learning_rate": 0.00011977385920043073, + "loss": 3.4793, + "step": 4855 + }, + { + "epoch": 6.20992, + "grad_norm": 0.5210806727409363, + "learning_rate": 0.00011973347691479337, + "loss": 3.4534, + "step": 4856 + }, + { + "epoch": 6.2112, + "grad_norm": 0.5080645680427551, + "learning_rate": 0.000119693094629156, + "loss": 3.447, + "step": 4857 + }, + { + "epoch": 6.21248, + "grad_norm": 0.5201761722564697, + "learning_rate": 0.00011965271234351863, + "loss": 3.481, + "step": 4858 + }, + { + "epoch": 6.21376, + "grad_norm": 0.5427380800247192, + "learning_rate": 0.00011961233005788126, + "loss": 3.462, + "step": 4859 + }, + { + "epoch": 6.21504, + "grad_norm": 0.5049818158149719, + "learning_rate": 0.0001195719477722439, + "loss": 3.4419, + "step": 4860 + }, + { + "epoch": 6.21632, + "grad_norm": 0.5197364687919617, + "learning_rate": 0.00011953156548660653, + "loss": 3.4193, + "step": 4861 + }, + { + "epoch": 6.2176, + "grad_norm": 0.5341804623603821, + "learning_rate": 0.00011949118320096918, + "loss": 3.4411, + "step": 4862 + }, + { + "epoch": 6.21888, + "grad_norm": 0.5286295413970947, + "learning_rate": 0.0001194508009153318, + "loss": 3.4425, + "step": 4863 + }, + { + "epoch": 6.22016, + "grad_norm": 0.5018596053123474, + "learning_rate": 0.00011941041862969444, + "loss": 3.4148, + "step": 4864 + }, + { + "epoch": 6.22144, + "grad_norm": 0.5178782939910889, + "learning_rate": 0.00011937003634405707, + "loss": 3.4321, + "step": 4865 + }, + { + "epoch": 6.22272, + "grad_norm": 0.5278565287590027, + "learning_rate": 0.00011932965405841968, + "loss": 3.4431, + "step": 4866 + }, + { + "epoch": 6.224, + "grad_norm": 0.5157015919685364, + "learning_rate": 0.00011928927177278233, + "loss": 3.5341, + "step": 4867 + }, + { + "epoch": 6.22528, + "grad_norm": 0.5221292972564697, + "learning_rate": 0.00011924888948714496, + "loss": 3.4478, + "step": 4868 + }, + { + "epoch": 6.22656, + "grad_norm": 0.5204195976257324, + "learning_rate": 0.0001192085072015076, + "loss": 3.4702, + "step": 4869 + }, + { + "epoch": 6.22784, + "grad_norm": 0.5190892219543457, + "learning_rate": 0.00011916812491587022, + "loss": 3.553, + "step": 4870 + }, + { + "epoch": 6.22912, + "grad_norm": 0.52687007188797, + "learning_rate": 0.00011912774263023286, + "loss": 3.4113, + "step": 4871 + }, + { + "epoch": 6.2304, + "grad_norm": 0.5159428715705872, + "learning_rate": 0.00011908736034459549, + "loss": 3.3968, + "step": 4872 + }, + { + "epoch": 6.23168, + "grad_norm": 0.5180546641349792, + "learning_rate": 0.00011904697805895814, + "loss": 3.4233, + "step": 4873 + }, + { + "epoch": 6.23296, + "grad_norm": 0.5099559426307678, + "learning_rate": 0.00011900659577332075, + "loss": 3.419, + "step": 4874 + }, + { + "epoch": 6.23424, + "grad_norm": 0.5090861916542053, + "learning_rate": 0.0001189662134876834, + "loss": 3.4224, + "step": 4875 + }, + { + "epoch": 6.23552, + "grad_norm": 0.5132479667663574, + "learning_rate": 0.00011892583120204603, + "loss": 3.4735, + "step": 4876 + }, + { + "epoch": 6.2368, + "grad_norm": 0.5198119878768921, + "learning_rate": 0.00011888544891640867, + "loss": 3.4072, + "step": 4877 + }, + { + "epoch": 6.23808, + "grad_norm": 0.5327208042144775, + "learning_rate": 0.00011884506663077129, + "loss": 3.4841, + "step": 4878 + }, + { + "epoch": 6.23936, + "grad_norm": 0.5163946747779846, + "learning_rate": 0.00011880468434513392, + "loss": 3.483, + "step": 4879 + }, + { + "epoch": 6.24064, + "grad_norm": 0.4975844621658325, + "learning_rate": 0.00011876430205949656, + "loss": 3.5362, + "step": 4880 + }, + { + "epoch": 6.24192, + "grad_norm": 0.5258901715278625, + "learning_rate": 0.00011872391977385919, + "loss": 3.449, + "step": 4881 + }, + { + "epoch": 6.2432, + "grad_norm": 0.5171861052513123, + "learning_rate": 0.00011868353748822182, + "loss": 3.3967, + "step": 4882 + }, + { + "epoch": 6.24448, + "grad_norm": 0.5271828174591064, + "learning_rate": 0.00011864315520258445, + "loss": 3.4485, + "step": 4883 + }, + { + "epoch": 6.24576, + "grad_norm": 0.5008876323699951, + "learning_rate": 0.00011860277291694709, + "loss": 3.507, + "step": 4884 + }, + { + "epoch": 6.24704, + "grad_norm": 0.5155926942825317, + "learning_rate": 0.00011856239063130972, + "loss": 3.4627, + "step": 4885 + }, + { + "epoch": 6.24832, + "grad_norm": 0.5127007365226746, + "learning_rate": 0.00011852200834567235, + "loss": 3.4644, + "step": 4886 + }, + { + "epoch": 6.2496, + "grad_norm": 0.5121738314628601, + "learning_rate": 0.00011848162606003498, + "loss": 3.4467, + "step": 4887 + }, + { + "epoch": 6.25088, + "grad_norm": 0.5105927586555481, + "learning_rate": 0.00011844124377439763, + "loss": 3.4391, + "step": 4888 + }, + { + "epoch": 6.25216, + "grad_norm": 0.5314432382583618, + "learning_rate": 0.00011840086148876026, + "loss": 3.5517, + "step": 4889 + }, + { + "epoch": 6.25344, + "grad_norm": 0.5190565586090088, + "learning_rate": 0.00011836047920312289, + "loss": 3.41, + "step": 4890 + }, + { + "epoch": 6.25472, + "grad_norm": 0.5455322861671448, + "learning_rate": 0.00011832009691748552, + "loss": 3.4535, + "step": 4891 + }, + { + "epoch": 6.256, + "grad_norm": 0.5055553913116455, + "learning_rate": 0.00011827971463184816, + "loss": 3.5182, + "step": 4892 + }, + { + "epoch": 6.25728, + "grad_norm": 0.5350437760353088, + "learning_rate": 0.00011823933234621079, + "loss": 3.4637, + "step": 4893 + }, + { + "epoch": 6.25856, + "grad_norm": 0.5320977568626404, + "learning_rate": 0.00011819895006057341, + "loss": 3.4818, + "step": 4894 + }, + { + "epoch": 6.25984, + "grad_norm": 0.5101127624511719, + "learning_rate": 0.00011815856777493605, + "loss": 3.4325, + "step": 4895 + }, + { + "epoch": 6.26112, + "grad_norm": 0.5186399817466736, + "learning_rate": 0.00011811818548929868, + "loss": 3.4622, + "step": 4896 + }, + { + "epoch": 6.2624, + "grad_norm": 0.5287879109382629, + "learning_rate": 0.00011807780320366132, + "loss": 3.4531, + "step": 4897 + }, + { + "epoch": 6.26368, + "grad_norm": 0.5211589932441711, + "learning_rate": 0.00011803742091802394, + "loss": 3.5668, + "step": 4898 + }, + { + "epoch": 6.26496, + "grad_norm": 0.5217843651771545, + "learning_rate": 0.00011799703863238658, + "loss": 3.4539, + "step": 4899 + }, + { + "epoch": 6.26624, + "grad_norm": 0.5402528047561646, + "learning_rate": 0.00011795665634674921, + "loss": 3.4953, + "step": 4900 + }, + { + "epoch": 6.26752, + "grad_norm": 0.5263811945915222, + "learning_rate": 0.00011791627406111186, + "loss": 3.4311, + "step": 4901 + }, + { + "epoch": 6.2688, + "grad_norm": 0.5283783674240112, + "learning_rate": 0.00011787589177547448, + "loss": 3.4235, + "step": 4902 + }, + { + "epoch": 6.27008, + "grad_norm": 0.5406724214553833, + "learning_rate": 0.00011783550948983712, + "loss": 3.4439, + "step": 4903 + }, + { + "epoch": 6.27136, + "grad_norm": 0.5088178515434265, + "learning_rate": 0.00011779512720419975, + "loss": 3.4806, + "step": 4904 + }, + { + "epoch": 6.27264, + "grad_norm": 0.5212264060974121, + "learning_rate": 0.00011775474491856239, + "loss": 3.3909, + "step": 4905 + }, + { + "epoch": 6.27392, + "grad_norm": 0.5153950452804565, + "learning_rate": 0.00011771436263292501, + "loss": 3.3559, + "step": 4906 + }, + { + "epoch": 6.2752, + "grad_norm": 0.5014641880989075, + "learning_rate": 0.00011767398034728764, + "loss": 3.4949, + "step": 4907 + }, + { + "epoch": 6.27648, + "grad_norm": 0.5058307647705078, + "learning_rate": 0.00011763359806165028, + "loss": 3.4748, + "step": 4908 + }, + { + "epoch": 6.27776, + "grad_norm": 0.5182768106460571, + "learning_rate": 0.0001175932157760129, + "loss": 3.5126, + "step": 4909 + }, + { + "epoch": 6.27904, + "grad_norm": 0.50447678565979, + "learning_rate": 0.00011755283349037554, + "loss": 3.4261, + "step": 4910 + }, + { + "epoch": 6.28032, + "grad_norm": 0.5356603860855103, + "learning_rate": 0.00011751245120473817, + "loss": 3.489, + "step": 4911 + }, + { + "epoch": 6.2816, + "grad_norm": 0.5243074893951416, + "learning_rate": 0.00011747206891910082, + "loss": 3.4682, + "step": 4912 + }, + { + "epoch": 6.2828800000000005, + "grad_norm": 0.5267544984817505, + "learning_rate": 0.00011743168663346345, + "loss": 3.4675, + "step": 4913 + }, + { + "epoch": 6.28416, + "grad_norm": 0.5158196687698364, + "learning_rate": 0.00011739130434782608, + "loss": 3.4589, + "step": 4914 + }, + { + "epoch": 6.28544, + "grad_norm": 0.5127290487289429, + "learning_rate": 0.0001173509220621887, + "loss": 3.4685, + "step": 4915 + }, + { + "epoch": 6.28672, + "grad_norm": 0.5056057572364807, + "learning_rate": 0.00011731053977655135, + "loss": 3.4143, + "step": 4916 + }, + { + "epoch": 6.288, + "grad_norm": 0.5179975032806396, + "learning_rate": 0.00011727015749091398, + "loss": 3.4381, + "step": 4917 + }, + { + "epoch": 6.28928, + "grad_norm": 0.5318068265914917, + "learning_rate": 0.00011722977520527661, + "loss": 3.4787, + "step": 4918 + }, + { + "epoch": 6.29056, + "grad_norm": 0.5078819990158081, + "learning_rate": 0.00011718939291963924, + "loss": 3.4634, + "step": 4919 + }, + { + "epoch": 6.29184, + "grad_norm": 0.5218449831008911, + "learning_rate": 0.00011714901063400187, + "loss": 3.4495, + "step": 4920 + }, + { + "epoch": 6.29312, + "grad_norm": 0.5181546211242676, + "learning_rate": 0.00011710862834836451, + "loss": 3.5156, + "step": 4921 + }, + { + "epoch": 6.2943999999999996, + "grad_norm": 0.5055398344993591, + "learning_rate": 0.00011706824606272713, + "loss": 3.4951, + "step": 4922 + }, + { + "epoch": 6.29568, + "grad_norm": 0.5111787915229797, + "learning_rate": 0.00011702786377708977, + "loss": 3.4858, + "step": 4923 + }, + { + "epoch": 6.29696, + "grad_norm": 0.5231371521949768, + "learning_rate": 0.0001169874814914524, + "loss": 3.4905, + "step": 4924 + }, + { + "epoch": 6.29824, + "grad_norm": 0.5168370008468628, + "learning_rate": 0.00011694709920581505, + "loss": 3.4655, + "step": 4925 + }, + { + "epoch": 6.29952, + "grad_norm": 0.5131970047950745, + "learning_rate": 0.00011690671692017766, + "loss": 3.4392, + "step": 4926 + }, + { + "epoch": 6.3008, + "grad_norm": 0.5175589323043823, + "learning_rate": 0.00011686633463454031, + "loss": 3.4428, + "step": 4927 + }, + { + "epoch": 6.30208, + "grad_norm": 0.5290152430534363, + "learning_rate": 0.00011682595234890294, + "loss": 3.4106, + "step": 4928 + }, + { + "epoch": 6.30336, + "grad_norm": 0.5033940672874451, + "learning_rate": 0.00011678557006326558, + "loss": 3.4195, + "step": 4929 + }, + { + "epoch": 6.30464, + "grad_norm": 0.5039850473403931, + "learning_rate": 0.0001167451877776282, + "loss": 3.4448, + "step": 4930 + }, + { + "epoch": 6.30592, + "grad_norm": 0.5078590512275696, + "learning_rate": 0.00011670480549199084, + "loss": 3.4318, + "step": 4931 + }, + { + "epoch": 6.3072, + "grad_norm": 0.5021501779556274, + "learning_rate": 0.00011666442320635347, + "loss": 3.4763, + "step": 4932 + }, + { + "epoch": 6.30848, + "grad_norm": 0.5251853466033936, + "learning_rate": 0.00011662404092071609, + "loss": 3.4158, + "step": 4933 + }, + { + "epoch": 6.30976, + "grad_norm": 0.5062684416770935, + "learning_rate": 0.00011658365863507873, + "loss": 3.4314, + "step": 4934 + }, + { + "epoch": 6.31104, + "grad_norm": 0.5202977061271667, + "learning_rate": 0.00011654327634944136, + "loss": 3.4411, + "step": 4935 + }, + { + "epoch": 6.31232, + "grad_norm": 0.5242408514022827, + "learning_rate": 0.000116502894063804, + "loss": 3.5071, + "step": 4936 + }, + { + "epoch": 6.3136, + "grad_norm": 0.5003301501274109, + "learning_rate": 0.00011646251177816662, + "loss": 3.4682, + "step": 4937 + }, + { + "epoch": 6.31488, + "grad_norm": 0.5161333680152893, + "learning_rate": 0.00011642212949252927, + "loss": 3.4328, + "step": 4938 + }, + { + "epoch": 6.31616, + "grad_norm": 0.4897507429122925, + "learning_rate": 0.0001163817472068919, + "loss": 3.5237, + "step": 4939 + }, + { + "epoch": 6.31744, + "grad_norm": 0.5239113569259644, + "learning_rate": 0.00011634136492125454, + "loss": 3.4244, + "step": 4940 + }, + { + "epoch": 6.31872, + "grad_norm": 0.5347805023193359, + "learning_rate": 0.00011630098263561716, + "loss": 3.4078, + "step": 4941 + }, + { + "epoch": 6.32, + "grad_norm": 0.49066007137298584, + "learning_rate": 0.0001162606003499798, + "loss": 3.474, + "step": 4942 + }, + { + "epoch": 6.32128, + "grad_norm": 0.5044615268707275, + "learning_rate": 0.00011622021806434243, + "loss": 3.5421, + "step": 4943 + }, + { + "epoch": 6.32256, + "grad_norm": 0.5164585113525391, + "learning_rate": 0.00011617983577870507, + "loss": 3.4537, + "step": 4944 + }, + { + "epoch": 6.32384, + "grad_norm": 0.5173794627189636, + "learning_rate": 0.00011613945349306769, + "loss": 3.4225, + "step": 4945 + }, + { + "epoch": 6.32512, + "grad_norm": 0.5148087739944458, + "learning_rate": 0.00011609907120743033, + "loss": 3.4548, + "step": 4946 + }, + { + "epoch": 6.3264, + "grad_norm": 0.533028781414032, + "learning_rate": 0.00011605868892179296, + "loss": 3.5002, + "step": 4947 + }, + { + "epoch": 6.32768, + "grad_norm": 0.5113101005554199, + "learning_rate": 0.0001160183066361556, + "loss": 3.425, + "step": 4948 + }, + { + "epoch": 6.32896, + "grad_norm": 0.5407309532165527, + "learning_rate": 0.00011597792435051822, + "loss": 3.52, + "step": 4949 + }, + { + "epoch": 6.33024, + "grad_norm": 0.5209356546401978, + "learning_rate": 0.00011593754206488085, + "loss": 3.4593, + "step": 4950 + }, + { + "epoch": 6.33152, + "grad_norm": 0.516190230846405, + "learning_rate": 0.0001158971597792435, + "loss": 3.4489, + "step": 4951 + }, + { + "epoch": 6.3328, + "grad_norm": 0.503466784954071, + "learning_rate": 0.00011585677749360613, + "loss": 3.3637, + "step": 4952 + }, + { + "epoch": 6.33408, + "grad_norm": 0.534572422504425, + "learning_rate": 0.00011581639520796877, + "loss": 3.4754, + "step": 4953 + }, + { + "epoch": 6.33536, + "grad_norm": 0.5381052494049072, + "learning_rate": 0.00011577601292233139, + "loss": 3.4082, + "step": 4954 + }, + { + "epoch": 6.33664, + "grad_norm": 0.5160523653030396, + "learning_rate": 0.00011573563063669403, + "loss": 3.4497, + "step": 4955 + }, + { + "epoch": 6.33792, + "grad_norm": 0.5168169140815735, + "learning_rate": 0.00011569524835105666, + "loss": 3.4778, + "step": 4956 + }, + { + "epoch": 6.3392, + "grad_norm": 0.5144256949424744, + "learning_rate": 0.0001156548660654193, + "loss": 3.5237, + "step": 4957 + }, + { + "epoch": 6.34048, + "grad_norm": 0.529626190662384, + "learning_rate": 0.00011561448377978192, + "loss": 3.4262, + "step": 4958 + }, + { + "epoch": 6.34176, + "grad_norm": 0.5221841335296631, + "learning_rate": 0.00011557410149414457, + "loss": 3.4335, + "step": 4959 + }, + { + "epoch": 6.34304, + "grad_norm": 0.5182304382324219, + "learning_rate": 0.0001155337192085072, + "loss": 3.3717, + "step": 4960 + }, + { + "epoch": 6.34432, + "grad_norm": 0.5277256369590759, + "learning_rate": 0.00011549333692286981, + "loss": 3.4908, + "step": 4961 + }, + { + "epoch": 6.3456, + "grad_norm": 0.5436074733734131, + "learning_rate": 0.00011545295463723246, + "loss": 3.5252, + "step": 4962 + }, + { + "epoch": 6.34688, + "grad_norm": 0.5004885196685791, + "learning_rate": 0.00011541257235159509, + "loss": 3.4502, + "step": 4963 + }, + { + "epoch": 6.34816, + "grad_norm": 0.5198299884796143, + "learning_rate": 0.00011537219006595773, + "loss": 3.4588, + "step": 4964 + }, + { + "epoch": 6.3494399999999995, + "grad_norm": 0.5136741399765015, + "learning_rate": 0.00011533180778032035, + "loss": 3.4659, + "step": 4965 + }, + { + "epoch": 6.35072, + "grad_norm": 0.5468453764915466, + "learning_rate": 0.00011529142549468299, + "loss": 3.4652, + "step": 4966 + }, + { + "epoch": 6.352, + "grad_norm": 0.4965207874774933, + "learning_rate": 0.00011525104320904562, + "loss": 3.4365, + "step": 4967 + }, + { + "epoch": 6.35328, + "grad_norm": 0.5069467425346375, + "learning_rate": 0.00011521066092340826, + "loss": 3.4168, + "step": 4968 + }, + { + "epoch": 6.35456, + "grad_norm": 0.5168161988258362, + "learning_rate": 0.00011517027863777088, + "loss": 3.519, + "step": 4969 + }, + { + "epoch": 6.35584, + "grad_norm": 0.5197111368179321, + "learning_rate": 0.00011512989635213352, + "loss": 3.4787, + "step": 4970 + }, + { + "epoch": 6.35712, + "grad_norm": 0.5127108097076416, + "learning_rate": 0.00011508951406649615, + "loss": 3.5048, + "step": 4971 + }, + { + "epoch": 6.3584, + "grad_norm": 0.49976083636283875, + "learning_rate": 0.0001150491317808588, + "loss": 3.3816, + "step": 4972 + }, + { + "epoch": 6.35968, + "grad_norm": 0.5142550468444824, + "learning_rate": 0.00011500874949522141, + "loss": 3.4407, + "step": 4973 + }, + { + "epoch": 6.36096, + "grad_norm": 0.5232443809509277, + "learning_rate": 0.00011496836720958404, + "loss": 3.4582, + "step": 4974 + }, + { + "epoch": 6.36224, + "grad_norm": 0.5081977844238281, + "learning_rate": 0.00011492798492394669, + "loss": 3.4799, + "step": 4975 + }, + { + "epoch": 6.36352, + "grad_norm": 0.49449968338012695, + "learning_rate": 0.00011488760263830932, + "loss": 3.4118, + "step": 4976 + }, + { + "epoch": 6.3648, + "grad_norm": 0.5099364519119263, + "learning_rate": 0.00011484722035267195, + "loss": 3.4475, + "step": 4977 + }, + { + "epoch": 6.36608, + "grad_norm": 0.5084264278411865, + "learning_rate": 0.00011480683806703458, + "loss": 3.5485, + "step": 4978 + }, + { + "epoch": 6.36736, + "grad_norm": 0.5079696178436279, + "learning_rate": 0.00011476645578139722, + "loss": 3.4334, + "step": 4979 + }, + { + "epoch": 6.36864, + "grad_norm": 0.5050063729286194, + "learning_rate": 0.00011472607349575985, + "loss": 3.4747, + "step": 4980 + }, + { + "epoch": 6.3699200000000005, + "grad_norm": 0.5045068264007568, + "learning_rate": 0.00011468569121012248, + "loss": 3.4687, + "step": 4981 + }, + { + "epoch": 6.3712, + "grad_norm": 0.5139387845993042, + "learning_rate": 0.00011464530892448511, + "loss": 3.4448, + "step": 4982 + }, + { + "epoch": 6.37248, + "grad_norm": 0.5034843683242798, + "learning_rate": 0.00011460492663884776, + "loss": 3.4381, + "step": 4983 + }, + { + "epoch": 6.37376, + "grad_norm": 0.5243584513664246, + "learning_rate": 0.00011456454435321039, + "loss": 3.5206, + "step": 4984 + }, + { + "epoch": 6.37504, + "grad_norm": 0.5162044167518616, + "learning_rate": 0.00011452416206757302, + "loss": 3.5582, + "step": 4985 + }, + { + "epoch": 6.37632, + "grad_norm": 0.49788084626197815, + "learning_rate": 0.00011448377978193565, + "loss": 3.4328, + "step": 4986 + }, + { + "epoch": 6.3776, + "grad_norm": 0.5192862153053284, + "learning_rate": 0.00011444339749629828, + "loss": 3.4449, + "step": 4987 + }, + { + "epoch": 6.37888, + "grad_norm": 0.5089589357376099, + "learning_rate": 0.00011440301521066092, + "loss": 3.4463, + "step": 4988 + }, + { + "epoch": 6.38016, + "grad_norm": 0.5230059623718262, + "learning_rate": 0.00011436263292502354, + "loss": 3.4992, + "step": 4989 + }, + { + "epoch": 6.38144, + "grad_norm": 0.5021970868110657, + "learning_rate": 0.00011432225063938618, + "loss": 3.461, + "step": 4990 + }, + { + "epoch": 6.38272, + "grad_norm": 0.5131734013557434, + "learning_rate": 0.00011428186835374881, + "loss": 3.4653, + "step": 4991 + }, + { + "epoch": 6.384, + "grad_norm": 0.49421629309654236, + "learning_rate": 0.00011424148606811145, + "loss": 3.4365, + "step": 4992 + }, + { + "epoch": 6.38528, + "grad_norm": 0.5171757340431213, + "learning_rate": 0.00011420110378247407, + "loss": 3.4412, + "step": 4993 + }, + { + "epoch": 6.38656, + "grad_norm": 0.530982494354248, + "learning_rate": 0.00011416072149683671, + "loss": 3.4623, + "step": 4994 + }, + { + "epoch": 6.38784, + "grad_norm": 0.5211870670318604, + "learning_rate": 0.00011412033921119934, + "loss": 3.4505, + "step": 4995 + }, + { + "epoch": 6.38912, + "grad_norm": 0.5216143131256104, + "learning_rate": 0.00011407995692556199, + "loss": 3.4959, + "step": 4996 + }, + { + "epoch": 6.3904, + "grad_norm": 0.5288113355636597, + "learning_rate": 0.0001140395746399246, + "loss": 3.4404, + "step": 4997 + }, + { + "epoch": 6.39168, + "grad_norm": 0.5123031735420227, + "learning_rate": 0.00011399919235428725, + "loss": 3.449, + "step": 4998 + }, + { + "epoch": 6.39296, + "grad_norm": 0.49802166223526, + "learning_rate": 0.00011395881006864988, + "loss": 3.3765, + "step": 4999 + }, + { + "epoch": 6.39424, + "grad_norm": 0.5179408192634583, + "learning_rate": 0.0001139184277830125, + "loss": 3.4644, + "step": 5000 + }, + { + "epoch": 6.39552, + "grad_norm": 0.5292978286743164, + "learning_rate": 0.00011387804549737514, + "loss": 3.489, + "step": 5001 + }, + { + "epoch": 6.3968, + "grad_norm": 0.5212100744247437, + "learning_rate": 0.00011383766321173777, + "loss": 3.4714, + "step": 5002 + }, + { + "epoch": 6.39808, + "grad_norm": 0.5056613087654114, + "learning_rate": 0.00011379728092610041, + "loss": 3.4111, + "step": 5003 + }, + { + "epoch": 6.39936, + "grad_norm": 0.5097994804382324, + "learning_rate": 0.00011375689864046304, + "loss": 3.4453, + "step": 5004 + }, + { + "epoch": 6.40064, + "grad_norm": 0.5117822885513306, + "learning_rate": 0.00011371651635482567, + "loss": 3.483, + "step": 5005 + }, + { + "epoch": 6.40192, + "grad_norm": 0.51289302110672, + "learning_rate": 0.0001136761340691883, + "loss": 3.5011, + "step": 5006 + }, + { + "epoch": 6.4032, + "grad_norm": 0.5105553865432739, + "learning_rate": 0.00011363575178355094, + "loss": 3.4876, + "step": 5007 + }, + { + "epoch": 6.40448, + "grad_norm": 0.5061222314834595, + "learning_rate": 0.00011359536949791357, + "loss": 3.4529, + "step": 5008 + }, + { + "epoch": 6.40576, + "grad_norm": 0.5081943869590759, + "learning_rate": 0.0001135549872122762, + "loss": 3.5208, + "step": 5009 + }, + { + "epoch": 6.40704, + "grad_norm": 0.5123940706253052, + "learning_rate": 0.00011351460492663883, + "loss": 3.4691, + "step": 5010 + }, + { + "epoch": 6.40832, + "grad_norm": 0.5150337219238281, + "learning_rate": 0.00011347422264100148, + "loss": 3.5093, + "step": 5011 + }, + { + "epoch": 6.4096, + "grad_norm": 0.5125681161880493, + "learning_rate": 0.00011343384035536411, + "loss": 3.4547, + "step": 5012 + }, + { + "epoch": 6.41088, + "grad_norm": 0.509105384349823, + "learning_rate": 0.00011339345806972674, + "loss": 3.4142, + "step": 5013 + }, + { + "epoch": 6.41216, + "grad_norm": 0.5203231573104858, + "learning_rate": 0.00011335307578408937, + "loss": 3.4432, + "step": 5014 + }, + { + "epoch": 6.41344, + "grad_norm": 0.5590437650680542, + "learning_rate": 0.000113312693498452, + "loss": 3.4524, + "step": 5015 + }, + { + "epoch": 6.41472, + "grad_norm": 0.5066128969192505, + "learning_rate": 0.00011327231121281464, + "loss": 3.5085, + "step": 5016 + }, + { + "epoch": 6.416, + "grad_norm": 0.5234223008155823, + "learning_rate": 0.00011323192892717726, + "loss": 3.5202, + "step": 5017 + }, + { + "epoch": 6.41728, + "grad_norm": 0.49386492371559143, + "learning_rate": 0.0001131915466415399, + "loss": 3.4921, + "step": 5018 + }, + { + "epoch": 6.41856, + "grad_norm": 0.5142998695373535, + "learning_rate": 0.00011315116435590253, + "loss": 3.4908, + "step": 5019 + }, + { + "epoch": 6.41984, + "grad_norm": 0.5108798742294312, + "learning_rate": 0.00011311078207026518, + "loss": 3.3707, + "step": 5020 + }, + { + "epoch": 6.42112, + "grad_norm": 0.5349586009979248, + "learning_rate": 0.00011307039978462779, + "loss": 3.5232, + "step": 5021 + }, + { + "epoch": 6.4224, + "grad_norm": 0.5002762675285339, + "learning_rate": 0.00011303001749899044, + "loss": 3.4531, + "step": 5022 + }, + { + "epoch": 6.42368, + "grad_norm": 0.5073869228363037, + "learning_rate": 0.00011298963521335307, + "loss": 3.4888, + "step": 5023 + }, + { + "epoch": 6.4249600000000004, + "grad_norm": 0.5094159245491028, + "learning_rate": 0.00011294925292771571, + "loss": 3.4729, + "step": 5024 + }, + { + "epoch": 6.42624, + "grad_norm": 0.502030074596405, + "learning_rate": 0.00011290887064207833, + "loss": 3.4268, + "step": 5025 + }, + { + "epoch": 6.42752, + "grad_norm": 0.5096508264541626, + "learning_rate": 0.00011286848835644097, + "loss": 3.4771, + "step": 5026 + }, + { + "epoch": 6.4288, + "grad_norm": 0.5121338367462158, + "learning_rate": 0.0001128281060708036, + "loss": 3.5188, + "step": 5027 + }, + { + "epoch": 6.43008, + "grad_norm": 0.5106524229049683, + "learning_rate": 0.00011278772378516622, + "loss": 3.487, + "step": 5028 + }, + { + "epoch": 6.43136, + "grad_norm": 0.5135172605514526, + "learning_rate": 0.00011274734149952886, + "loss": 3.4631, + "step": 5029 + }, + { + "epoch": 6.43264, + "grad_norm": 0.5284242033958435, + "learning_rate": 0.00011270695921389149, + "loss": 3.5371, + "step": 5030 + }, + { + "epoch": 6.43392, + "grad_norm": 0.49686378240585327, + "learning_rate": 0.00011266657692825413, + "loss": 3.4035, + "step": 5031 + }, + { + "epoch": 6.4352, + "grad_norm": 0.5379998683929443, + "learning_rate": 0.00011262619464261675, + "loss": 3.4359, + "step": 5032 + }, + { + "epoch": 6.4364799999999995, + "grad_norm": 0.5207564830780029, + "learning_rate": 0.0001125858123569794, + "loss": 3.433, + "step": 5033 + }, + { + "epoch": 6.43776, + "grad_norm": 0.49627432227134705, + "learning_rate": 0.00011254543007134202, + "loss": 3.4353, + "step": 5034 + }, + { + "epoch": 6.43904, + "grad_norm": 0.5221447944641113, + "learning_rate": 0.00011250504778570467, + "loss": 3.4024, + "step": 5035 + }, + { + "epoch": 6.44032, + "grad_norm": 0.5198942422866821, + "learning_rate": 0.00011246466550006728, + "loss": 3.4782, + "step": 5036 + }, + { + "epoch": 6.4416, + "grad_norm": 0.5337004661560059, + "learning_rate": 0.00011242428321442993, + "loss": 3.5193, + "step": 5037 + }, + { + "epoch": 6.44288, + "grad_norm": 0.5141910314559937, + "learning_rate": 0.00011238390092879256, + "loss": 3.5241, + "step": 5038 + }, + { + "epoch": 6.44416, + "grad_norm": 0.5050771832466125, + "learning_rate": 0.0001123435186431552, + "loss": 3.4287, + "step": 5039 + }, + { + "epoch": 6.44544, + "grad_norm": 0.5285760164260864, + "learning_rate": 0.00011230313635751783, + "loss": 3.4792, + "step": 5040 + }, + { + "epoch": 6.44672, + "grad_norm": 0.5192365646362305, + "learning_rate": 0.00011226275407188045, + "loss": 3.4925, + "step": 5041 + }, + { + "epoch": 6.448, + "grad_norm": 0.5274104475975037, + "learning_rate": 0.00011222237178624309, + "loss": 3.444, + "step": 5042 + }, + { + "epoch": 6.44928, + "grad_norm": 0.5355402231216431, + "learning_rate": 0.00011218198950060572, + "loss": 3.5535, + "step": 5043 + }, + { + "epoch": 6.45056, + "grad_norm": 0.5089603662490845, + "learning_rate": 0.00011214160721496837, + "loss": 3.4749, + "step": 5044 + }, + { + "epoch": 6.45184, + "grad_norm": 0.5029155611991882, + "learning_rate": 0.00011210122492933098, + "loss": 3.3917, + "step": 5045 + }, + { + "epoch": 6.45312, + "grad_norm": 0.5419932007789612, + "learning_rate": 0.00011206084264369363, + "loss": 3.4852, + "step": 5046 + }, + { + "epoch": 6.4544, + "grad_norm": 0.5222985148429871, + "learning_rate": 0.00011202046035805626, + "loss": 3.4503, + "step": 5047 + }, + { + "epoch": 6.45568, + "grad_norm": 0.5103903412818909, + "learning_rate": 0.0001119800780724189, + "loss": 3.3846, + "step": 5048 + }, + { + "epoch": 6.45696, + "grad_norm": 0.5334930419921875, + "learning_rate": 0.00011193969578678152, + "loss": 3.4451, + "step": 5049 + }, + { + "epoch": 6.45824, + "grad_norm": 0.506489098072052, + "learning_rate": 0.00011189931350114416, + "loss": 3.4588, + "step": 5050 + }, + { + "epoch": 6.45952, + "grad_norm": 0.5101386308670044, + "learning_rate": 0.00011185893121550679, + "loss": 3.5153, + "step": 5051 + }, + { + "epoch": 6.4608, + "grad_norm": 0.5294933915138245, + "learning_rate": 0.00011181854892986943, + "loss": 3.4905, + "step": 5052 + }, + { + "epoch": 6.46208, + "grad_norm": 0.5064314603805542, + "learning_rate": 0.00011177816664423205, + "loss": 3.4571, + "step": 5053 + }, + { + "epoch": 6.46336, + "grad_norm": 0.526623010635376, + "learning_rate": 0.00011173778435859468, + "loss": 3.4502, + "step": 5054 + }, + { + "epoch": 6.46464, + "grad_norm": 0.5297154784202576, + "learning_rate": 0.00011169740207295732, + "loss": 3.5096, + "step": 5055 + }, + { + "epoch": 6.46592, + "grad_norm": 0.5068486332893372, + "learning_rate": 0.00011165701978731994, + "loss": 3.4319, + "step": 5056 + }, + { + "epoch": 6.4672, + "grad_norm": 0.5235540866851807, + "learning_rate": 0.00011161663750168258, + "loss": 3.4706, + "step": 5057 + }, + { + "epoch": 6.46848, + "grad_norm": 0.5198733806610107, + "learning_rate": 0.00011157625521604521, + "loss": 3.487, + "step": 5058 + }, + { + "epoch": 6.46976, + "grad_norm": 0.5035125613212585, + "learning_rate": 0.00011153587293040786, + "loss": 3.4629, + "step": 5059 + }, + { + "epoch": 6.47104, + "grad_norm": 0.5046782493591309, + "learning_rate": 0.00011149549064477047, + "loss": 3.4192, + "step": 5060 + }, + { + "epoch": 6.47232, + "grad_norm": 0.50188809633255, + "learning_rate": 0.00011145510835913312, + "loss": 3.4571, + "step": 5061 + }, + { + "epoch": 6.4736, + "grad_norm": 0.5252716541290283, + "learning_rate": 0.00011141472607349575, + "loss": 3.4554, + "step": 5062 + }, + { + "epoch": 6.47488, + "grad_norm": 0.4985215365886688, + "learning_rate": 0.00011137434378785839, + "loss": 3.4759, + "step": 5063 + }, + { + "epoch": 6.47616, + "grad_norm": 0.5182859897613525, + "learning_rate": 0.00011133396150222101, + "loss": 3.4293, + "step": 5064 + }, + { + "epoch": 6.47744, + "grad_norm": 0.5109648108482361, + "learning_rate": 0.00011129357921658365, + "loss": 3.449, + "step": 5065 + }, + { + "epoch": 6.47872, + "grad_norm": 0.4976121485233307, + "learning_rate": 0.00011125319693094628, + "loss": 3.4391, + "step": 5066 + }, + { + "epoch": 6.48, + "grad_norm": 0.501159131526947, + "learning_rate": 0.00011121281464530893, + "loss": 3.4683, + "step": 5067 + }, + { + "epoch": 6.48128, + "grad_norm": 0.5295117497444153, + "learning_rate": 0.00011117243235967154, + "loss": 3.4945, + "step": 5068 + }, + { + "epoch": 6.48256, + "grad_norm": 0.5260968804359436, + "learning_rate": 0.00011113205007403417, + "loss": 3.4764, + "step": 5069 + }, + { + "epoch": 6.48384, + "grad_norm": 0.5069438219070435, + "learning_rate": 0.00011109166778839682, + "loss": 3.4699, + "step": 5070 + }, + { + "epoch": 6.48512, + "grad_norm": 0.5271986126899719, + "learning_rate": 0.00011105128550275945, + "loss": 3.5268, + "step": 5071 + }, + { + "epoch": 6.4864, + "grad_norm": 0.5156261324882507, + "learning_rate": 0.00011101090321712208, + "loss": 3.4877, + "step": 5072 + }, + { + "epoch": 6.48768, + "grad_norm": 0.5246565937995911, + "learning_rate": 0.0001109705209314847, + "loss": 3.4697, + "step": 5073 + }, + { + "epoch": 6.48896, + "grad_norm": 0.5298268795013428, + "learning_rate": 0.00011093013864584735, + "loss": 3.4842, + "step": 5074 + }, + { + "epoch": 6.49024, + "grad_norm": 0.5147030353546143, + "learning_rate": 0.00011088975636020998, + "loss": 3.5063, + "step": 5075 + }, + { + "epoch": 6.49152, + "grad_norm": 0.5172019004821777, + "learning_rate": 0.00011084937407457262, + "loss": 3.4344, + "step": 5076 + }, + { + "epoch": 6.4928, + "grad_norm": 0.5194494724273682, + "learning_rate": 0.00011080899178893524, + "loss": 3.4808, + "step": 5077 + }, + { + "epoch": 6.49408, + "grad_norm": 0.5049477815628052, + "learning_rate": 0.00011076860950329788, + "loss": 3.4354, + "step": 5078 + }, + { + "epoch": 6.49536, + "grad_norm": 0.5107006430625916, + "learning_rate": 0.00011072822721766051, + "loss": 3.4447, + "step": 5079 + }, + { + "epoch": 6.49664, + "grad_norm": 0.5118290185928345, + "learning_rate": 0.00011068784493202316, + "loss": 3.4469, + "step": 5080 + }, + { + "epoch": 6.49792, + "grad_norm": 0.5051876306533813, + "learning_rate": 0.00011064746264638577, + "loss": 3.4521, + "step": 5081 + }, + { + "epoch": 6.4992, + "grad_norm": 0.4978385865688324, + "learning_rate": 0.0001106070803607484, + "loss": 3.3929, + "step": 5082 + }, + { + "epoch": 6.50048, + "grad_norm": 0.5098656415939331, + "learning_rate": 0.00011056669807511105, + "loss": 3.4236, + "step": 5083 + }, + { + "epoch": 6.50176, + "grad_norm": 0.5078462362289429, + "learning_rate": 0.00011052631578947366, + "loss": 3.4781, + "step": 5084 + }, + { + "epoch": 6.50304, + "grad_norm": 0.5070273876190186, + "learning_rate": 0.00011048593350383631, + "loss": 3.4513, + "step": 5085 + }, + { + "epoch": 6.50432, + "grad_norm": 0.4959174394607544, + "learning_rate": 0.00011044555121819894, + "loss": 3.4036, + "step": 5086 + }, + { + "epoch": 6.5056, + "grad_norm": 0.5386160016059875, + "learning_rate": 0.00011040516893256158, + "loss": 3.5321, + "step": 5087 + }, + { + "epoch": 6.50688, + "grad_norm": 0.518519937992096, + "learning_rate": 0.0001103647866469242, + "loss": 3.4958, + "step": 5088 + }, + { + "epoch": 6.50816, + "grad_norm": 0.5082813501358032, + "learning_rate": 0.00011032440436128684, + "loss": 3.4061, + "step": 5089 + }, + { + "epoch": 6.50944, + "grad_norm": 0.5264861583709717, + "learning_rate": 0.00011028402207564947, + "loss": 3.4816, + "step": 5090 + }, + { + "epoch": 6.51072, + "grad_norm": 0.5055463910102844, + "learning_rate": 0.00011024363979001212, + "loss": 3.5055, + "step": 5091 + }, + { + "epoch": 6.5120000000000005, + "grad_norm": 0.5075529217720032, + "learning_rate": 0.00011020325750437473, + "loss": 3.4639, + "step": 5092 + }, + { + "epoch": 6.51328, + "grad_norm": 0.5140883326530457, + "learning_rate": 0.00011016287521873738, + "loss": 3.4615, + "step": 5093 + }, + { + "epoch": 6.51456, + "grad_norm": 0.5120643377304077, + "learning_rate": 0.0001101224929331, + "loss": 3.4331, + "step": 5094 + }, + { + "epoch": 6.51584, + "grad_norm": 0.501859724521637, + "learning_rate": 0.00011008211064746264, + "loss": 3.4614, + "step": 5095 + }, + { + "epoch": 6.51712, + "grad_norm": 0.514984667301178, + "learning_rate": 0.00011004172836182527, + "loss": 3.4491, + "step": 5096 + }, + { + "epoch": 6.5184, + "grad_norm": 0.5052075386047363, + "learning_rate": 0.0001100013460761879, + "loss": 3.5753, + "step": 5097 + }, + { + "epoch": 6.51968, + "grad_norm": 0.5124235153198242, + "learning_rate": 0.00010996096379055054, + "loss": 3.4338, + "step": 5098 + }, + { + "epoch": 6.52096, + "grad_norm": 0.48975175619125366, + "learning_rate": 0.00010992058150491317, + "loss": 3.4508, + "step": 5099 + }, + { + "epoch": 6.52224, + "grad_norm": 0.49977898597717285, + "learning_rate": 0.0001098801992192758, + "loss": 3.3994, + "step": 5100 + }, + { + "epoch": 6.5235199999999995, + "grad_norm": 0.4916556775569916, + "learning_rate": 0.00010983981693363843, + "loss": 3.4289, + "step": 5101 + }, + { + "epoch": 6.5248, + "grad_norm": 0.5177762508392334, + "learning_rate": 0.00010979943464800107, + "loss": 3.5018, + "step": 5102 + }, + { + "epoch": 6.52608, + "grad_norm": 0.5140178799629211, + "learning_rate": 0.0001097590523623637, + "loss": 3.4727, + "step": 5103 + }, + { + "epoch": 6.52736, + "grad_norm": 0.4911567270755768, + "learning_rate": 0.00010971867007672633, + "loss": 3.5407, + "step": 5104 + }, + { + "epoch": 6.52864, + "grad_norm": 0.505172610282898, + "learning_rate": 0.00010967828779108896, + "loss": 3.3872, + "step": 5105 + }, + { + "epoch": 6.52992, + "grad_norm": 0.5191845297813416, + "learning_rate": 0.00010963790550545161, + "loss": 3.482, + "step": 5106 + }, + { + "epoch": 6.5312, + "grad_norm": 0.5074360370635986, + "learning_rate": 0.00010959752321981424, + "loss": 3.4597, + "step": 5107 + }, + { + "epoch": 6.53248, + "grad_norm": 0.5102279782295227, + "learning_rate": 0.00010955714093417685, + "loss": 3.3886, + "step": 5108 + }, + { + "epoch": 6.53376, + "grad_norm": 0.5071060061454773, + "learning_rate": 0.0001095167586485395, + "loss": 3.4508, + "step": 5109 + }, + { + "epoch": 6.53504, + "grad_norm": 0.5164380669593811, + "learning_rate": 0.00010947637636290213, + "loss": 3.446, + "step": 5110 + }, + { + "epoch": 6.53632, + "grad_norm": 0.5028985142707825, + "learning_rate": 0.00010943599407726477, + "loss": 3.5139, + "step": 5111 + }, + { + "epoch": 6.5376, + "grad_norm": 0.49001002311706543, + "learning_rate": 0.00010939561179162739, + "loss": 3.3891, + "step": 5112 + }, + { + "epoch": 6.53888, + "grad_norm": 0.5156526565551758, + "learning_rate": 0.00010935522950599003, + "loss": 3.4913, + "step": 5113 + }, + { + "epoch": 6.54016, + "grad_norm": 0.4975212514400482, + "learning_rate": 0.00010931484722035266, + "loss": 3.4462, + "step": 5114 + }, + { + "epoch": 6.54144, + "grad_norm": 0.5028743147850037, + "learning_rate": 0.0001092744649347153, + "loss": 3.396, + "step": 5115 + }, + { + "epoch": 6.54272, + "grad_norm": 0.5105941295623779, + "learning_rate": 0.00010923408264907792, + "loss": 3.5302, + "step": 5116 + }, + { + "epoch": 6.5440000000000005, + "grad_norm": 0.5078137516975403, + "learning_rate": 0.00010919370036344056, + "loss": 3.4376, + "step": 5117 + }, + { + "epoch": 6.54528, + "grad_norm": 0.4959731101989746, + "learning_rate": 0.0001091533180778032, + "loss": 3.4079, + "step": 5118 + }, + { + "epoch": 6.54656, + "grad_norm": 0.5086926817893982, + "learning_rate": 0.00010911293579216584, + "loss": 3.4873, + "step": 5119 + }, + { + "epoch": 6.54784, + "grad_norm": 0.5077411532402039, + "learning_rate": 0.00010907255350652845, + "loss": 3.4758, + "step": 5120 + }, + { + "epoch": 6.54912, + "grad_norm": 0.5054405331611633, + "learning_rate": 0.00010903217122089109, + "loss": 3.4363, + "step": 5121 + }, + { + "epoch": 6.5504, + "grad_norm": 0.5057424902915955, + "learning_rate": 0.00010899178893525373, + "loss": 3.5458, + "step": 5122 + }, + { + "epoch": 6.55168, + "grad_norm": 0.5203006267547607, + "learning_rate": 0.00010895140664961635, + "loss": 3.5374, + "step": 5123 + }, + { + "epoch": 6.55296, + "grad_norm": 0.5345029830932617, + "learning_rate": 0.00010891102436397899, + "loss": 3.4367, + "step": 5124 + }, + { + "epoch": 6.55424, + "grad_norm": 0.5119213461875916, + "learning_rate": 0.00010887064207834162, + "loss": 3.4897, + "step": 5125 + }, + { + "epoch": 6.55552, + "grad_norm": 0.5316451787948608, + "learning_rate": 0.00010883025979270426, + "loss": 3.4209, + "step": 5126 + }, + { + "epoch": 6.5568, + "grad_norm": 0.5099210739135742, + "learning_rate": 0.00010878987750706689, + "loss": 3.418, + "step": 5127 + }, + { + "epoch": 6.55808, + "grad_norm": 0.5128821730613708, + "learning_rate": 0.00010874949522142952, + "loss": 3.4507, + "step": 5128 + }, + { + "epoch": 6.55936, + "grad_norm": 0.5047348141670227, + "learning_rate": 0.00010870911293579215, + "loss": 3.4232, + "step": 5129 + }, + { + "epoch": 6.56064, + "grad_norm": 0.514415979385376, + "learning_rate": 0.0001086687306501548, + "loss": 3.5885, + "step": 5130 + }, + { + "epoch": 6.56192, + "grad_norm": 0.5269995927810669, + "learning_rate": 0.00010862834836451743, + "loss": 3.5069, + "step": 5131 + }, + { + "epoch": 6.5632, + "grad_norm": 0.5068910717964172, + "learning_rate": 0.00010858796607888006, + "loss": 3.427, + "step": 5132 + }, + { + "epoch": 6.56448, + "grad_norm": 0.514909565448761, + "learning_rate": 0.00010854758379324269, + "loss": 3.5029, + "step": 5133 + }, + { + "epoch": 6.56576, + "grad_norm": 0.5140718817710876, + "learning_rate": 0.00010850720150760533, + "loss": 3.4053, + "step": 5134 + }, + { + "epoch": 6.56704, + "grad_norm": 0.524402379989624, + "learning_rate": 0.00010846681922196796, + "loss": 3.523, + "step": 5135 + }, + { + "epoch": 6.56832, + "grad_norm": 0.519688069820404, + "learning_rate": 0.00010842643693633058, + "loss": 3.4752, + "step": 5136 + }, + { + "epoch": 6.5696, + "grad_norm": 0.5132143497467041, + "learning_rate": 0.00010838605465069322, + "loss": 3.4902, + "step": 5137 + }, + { + "epoch": 6.57088, + "grad_norm": 0.5264431834220886, + "learning_rate": 0.00010834567236505585, + "loss": 3.5565, + "step": 5138 + }, + { + "epoch": 6.57216, + "grad_norm": 0.5338211059570312, + "learning_rate": 0.0001083052900794185, + "loss": 3.4898, + "step": 5139 + }, + { + "epoch": 6.57344, + "grad_norm": 0.5249161720275879, + "learning_rate": 0.00010826490779378111, + "loss": 3.5718, + "step": 5140 + }, + { + "epoch": 6.57472, + "grad_norm": 0.5375153422355652, + "learning_rate": 0.00010822452550814375, + "loss": 3.3902, + "step": 5141 + }, + { + "epoch": 6.576, + "grad_norm": 0.5202714204788208, + "learning_rate": 0.00010818414322250638, + "loss": 3.4019, + "step": 5142 + }, + { + "epoch": 6.57728, + "grad_norm": 0.5213868618011475, + "learning_rate": 0.00010814376093686903, + "loss": 3.5119, + "step": 5143 + }, + { + "epoch": 6.5785599999999995, + "grad_norm": 0.5079566240310669, + "learning_rate": 0.00010810337865123164, + "loss": 3.4668, + "step": 5144 + }, + { + "epoch": 6.57984, + "grad_norm": 0.5234741568565369, + "learning_rate": 0.00010806299636559429, + "loss": 3.4789, + "step": 5145 + }, + { + "epoch": 6.58112, + "grad_norm": 0.5120872259140015, + "learning_rate": 0.00010802261407995692, + "loss": 3.4737, + "step": 5146 + }, + { + "epoch": 6.5824, + "grad_norm": 0.5126521587371826, + "learning_rate": 0.00010798223179431956, + "loss": 3.4625, + "step": 5147 + }, + { + "epoch": 6.58368, + "grad_norm": 0.5083807110786438, + "learning_rate": 0.00010794184950868218, + "loss": 3.5056, + "step": 5148 + }, + { + "epoch": 6.58496, + "grad_norm": 0.5182278752326965, + "learning_rate": 0.00010790146722304481, + "loss": 3.4856, + "step": 5149 + }, + { + "epoch": 6.58624, + "grad_norm": 0.5055496096611023, + "learning_rate": 0.00010786108493740745, + "loss": 3.4205, + "step": 5150 + }, + { + "epoch": 6.58752, + "grad_norm": 0.5119015574455261, + "learning_rate": 0.00010782070265177007, + "loss": 3.4647, + "step": 5151 + }, + { + "epoch": 6.5888, + "grad_norm": 0.5228475332260132, + "learning_rate": 0.00010778032036613271, + "loss": 3.4117, + "step": 5152 + }, + { + "epoch": 6.59008, + "grad_norm": 0.5190859436988831, + "learning_rate": 0.00010773993808049534, + "loss": 3.4451, + "step": 5153 + }, + { + "epoch": 6.59136, + "grad_norm": 0.509190022945404, + "learning_rate": 0.00010769955579485799, + "loss": 3.4443, + "step": 5154 + }, + { + "epoch": 6.59264, + "grad_norm": 0.4972391724586487, + "learning_rate": 0.0001076591735092206, + "loss": 3.4357, + "step": 5155 + }, + { + "epoch": 6.59392, + "grad_norm": 0.5034371614456177, + "learning_rate": 0.00010761879122358325, + "loss": 3.4543, + "step": 5156 + }, + { + "epoch": 6.5952, + "grad_norm": 0.5267638564109802, + "learning_rate": 0.00010757840893794588, + "loss": 3.4192, + "step": 5157 + }, + { + "epoch": 6.59648, + "grad_norm": 0.4983810484409332, + "learning_rate": 0.00010753802665230852, + "loss": 3.3902, + "step": 5158 + }, + { + "epoch": 6.59776, + "grad_norm": 0.49326276779174805, + "learning_rate": 0.00010749764436667114, + "loss": 3.4052, + "step": 5159 + }, + { + "epoch": 6.5990400000000005, + "grad_norm": 0.526161789894104, + "learning_rate": 0.00010745726208103378, + "loss": 3.4829, + "step": 5160 + }, + { + "epoch": 6.60032, + "grad_norm": 0.49957576394081116, + "learning_rate": 0.00010741687979539641, + "loss": 3.3863, + "step": 5161 + }, + { + "epoch": 6.6016, + "grad_norm": 0.5143072009086609, + "learning_rate": 0.00010737649750975904, + "loss": 3.502, + "step": 5162 + }, + { + "epoch": 6.60288, + "grad_norm": 0.5053294897079468, + "learning_rate": 0.00010733611522412168, + "loss": 3.4453, + "step": 5163 + }, + { + "epoch": 6.60416, + "grad_norm": 0.49979978799819946, + "learning_rate": 0.0001072957329384843, + "loss": 3.4394, + "step": 5164 + }, + { + "epoch": 6.60544, + "grad_norm": 0.5157962441444397, + "learning_rate": 0.00010725535065284694, + "loss": 3.4921, + "step": 5165 + }, + { + "epoch": 6.60672, + "grad_norm": 0.5077204704284668, + "learning_rate": 0.00010721496836720957, + "loss": 3.4838, + "step": 5166 + }, + { + "epoch": 6.608, + "grad_norm": 0.5278066992759705, + "learning_rate": 0.00010717458608157222, + "loss": 3.4713, + "step": 5167 + }, + { + "epoch": 6.60928, + "grad_norm": 0.506314218044281, + "learning_rate": 0.00010713420379593483, + "loss": 3.5477, + "step": 5168 + }, + { + "epoch": 6.6105599999999995, + "grad_norm": 0.5169037580490112, + "learning_rate": 0.00010709382151029748, + "loss": 3.5052, + "step": 5169 + }, + { + "epoch": 6.61184, + "grad_norm": 0.4995163679122925, + "learning_rate": 0.00010705343922466011, + "loss": 3.4526, + "step": 5170 + }, + { + "epoch": 6.61312, + "grad_norm": 0.516046404838562, + "learning_rate": 0.00010701305693902275, + "loss": 3.4841, + "step": 5171 + }, + { + "epoch": 6.6144, + "grad_norm": 0.5097321271896362, + "learning_rate": 0.00010697267465338537, + "loss": 3.502, + "step": 5172 + }, + { + "epoch": 6.61568, + "grad_norm": 0.5002099871635437, + "learning_rate": 0.00010693229236774801, + "loss": 3.4349, + "step": 5173 + }, + { + "epoch": 6.61696, + "grad_norm": 0.5027880668640137, + "learning_rate": 0.00010689191008211064, + "loss": 3.4844, + "step": 5174 + }, + { + "epoch": 6.61824, + "grad_norm": 0.5286634564399719, + "learning_rate": 0.00010685152779647326, + "loss": 3.4879, + "step": 5175 + }, + { + "epoch": 6.61952, + "grad_norm": 0.5041953325271606, + "learning_rate": 0.0001068111455108359, + "loss": 3.4475, + "step": 5176 + }, + { + "epoch": 6.6208, + "grad_norm": 0.5107352137565613, + "learning_rate": 0.00010677076322519853, + "loss": 3.3829, + "step": 5177 + }, + { + "epoch": 6.62208, + "grad_norm": 0.52018803358078, + "learning_rate": 0.00010673038093956118, + "loss": 3.4578, + "step": 5178 + }, + { + "epoch": 6.62336, + "grad_norm": 0.5006090998649597, + "learning_rate": 0.00010668999865392379, + "loss": 3.4867, + "step": 5179 + }, + { + "epoch": 6.62464, + "grad_norm": 0.5116918683052063, + "learning_rate": 0.00010664961636828644, + "loss": 3.3972, + "step": 5180 + }, + { + "epoch": 6.62592, + "grad_norm": 0.5370351076126099, + "learning_rate": 0.00010660923408264907, + "loss": 3.4639, + "step": 5181 + }, + { + "epoch": 6.6272, + "grad_norm": 0.5006325244903564, + "learning_rate": 0.00010656885179701171, + "loss": 3.5232, + "step": 5182 + }, + { + "epoch": 6.62848, + "grad_norm": 0.5160982608795166, + "learning_rate": 0.00010652846951137433, + "loss": 3.4108, + "step": 5183 + }, + { + "epoch": 6.62976, + "grad_norm": 0.5148961544036865, + "learning_rate": 0.00010648808722573697, + "loss": 3.5224, + "step": 5184 + }, + { + "epoch": 6.6310400000000005, + "grad_norm": 0.4979293644428253, + "learning_rate": 0.0001064477049400996, + "loss": 3.4324, + "step": 5185 + }, + { + "epoch": 6.63232, + "grad_norm": 0.5204696655273438, + "learning_rate": 0.00010640732265446224, + "loss": 3.4534, + "step": 5186 + }, + { + "epoch": 6.6336, + "grad_norm": 0.5214214324951172, + "learning_rate": 0.00010636694036882486, + "loss": 3.5314, + "step": 5187 + }, + { + "epoch": 6.63488, + "grad_norm": 0.5053163766860962, + "learning_rate": 0.00010632655808318749, + "loss": 3.3898, + "step": 5188 + }, + { + "epoch": 6.63616, + "grad_norm": 0.5069863200187683, + "learning_rate": 0.00010628617579755013, + "loss": 3.4414, + "step": 5189 + }, + { + "epoch": 6.63744, + "grad_norm": 0.5157361030578613, + "learning_rate": 0.00010624579351191276, + "loss": 3.5128, + "step": 5190 + }, + { + "epoch": 6.63872, + "grad_norm": 0.500819206237793, + "learning_rate": 0.0001062054112262754, + "loss": 3.4923, + "step": 5191 + }, + { + "epoch": 6.64, + "grad_norm": 0.5084678530693054, + "learning_rate": 0.00010616502894063802, + "loss": 3.4921, + "step": 5192 + }, + { + "epoch": 6.64128, + "grad_norm": 0.520411491394043, + "learning_rate": 0.00010612464665500067, + "loss": 3.4166, + "step": 5193 + }, + { + "epoch": 6.64256, + "grad_norm": 0.5096445083618164, + "learning_rate": 0.0001060842643693633, + "loss": 3.4323, + "step": 5194 + }, + { + "epoch": 6.64384, + "grad_norm": 0.503450870513916, + "learning_rate": 0.00010604388208372593, + "loss": 3.3905, + "step": 5195 + }, + { + "epoch": 6.64512, + "grad_norm": 0.5244500041007996, + "learning_rate": 0.00010600349979808856, + "loss": 3.5189, + "step": 5196 + }, + { + "epoch": 6.6464, + "grad_norm": 0.5261669158935547, + "learning_rate": 0.0001059631175124512, + "loss": 3.4409, + "step": 5197 + }, + { + "epoch": 6.64768, + "grad_norm": 0.5130569338798523, + "learning_rate": 0.00010592273522681383, + "loss": 3.4309, + "step": 5198 + }, + { + "epoch": 6.64896, + "grad_norm": 0.5225698947906494, + "learning_rate": 0.00010588235294117647, + "loss": 3.4626, + "step": 5199 + }, + { + "epoch": 6.65024, + "grad_norm": 0.5169934034347534, + "learning_rate": 0.00010584197065553909, + "loss": 3.465, + "step": 5200 + }, + { + "epoch": 6.65152, + "grad_norm": 0.49897530674934387, + "learning_rate": 0.00010580158836990174, + "loss": 3.4524, + "step": 5201 + }, + { + "epoch": 6.6528, + "grad_norm": 0.5103302001953125, + "learning_rate": 0.00010576120608426437, + "loss": 3.4629, + "step": 5202 + }, + { + "epoch": 6.65408, + "grad_norm": 0.5289749503135681, + "learning_rate": 0.00010572082379862698, + "loss": 3.4733, + "step": 5203 + }, + { + "epoch": 6.65536, + "grad_norm": 0.5066429376602173, + "learning_rate": 0.00010568044151298963, + "loss": 3.4266, + "step": 5204 + }, + { + "epoch": 6.65664, + "grad_norm": 0.5125609636306763, + "learning_rate": 0.00010564005922735226, + "loss": 3.403, + "step": 5205 + }, + { + "epoch": 6.65792, + "grad_norm": 0.5179828405380249, + "learning_rate": 0.0001055996769417149, + "loss": 3.4441, + "step": 5206 + }, + { + "epoch": 6.6592, + "grad_norm": 0.5193979740142822, + "learning_rate": 0.00010555929465607752, + "loss": 3.5631, + "step": 5207 + }, + { + "epoch": 6.66048, + "grad_norm": 0.5336554050445557, + "learning_rate": 0.00010551891237044016, + "loss": 3.5912, + "step": 5208 + }, + { + "epoch": 6.66176, + "grad_norm": 0.5226610898971558, + "learning_rate": 0.00010547853008480279, + "loss": 3.461, + "step": 5209 + }, + { + "epoch": 6.66304, + "grad_norm": 0.523395836353302, + "learning_rate": 0.00010543814779916543, + "loss": 3.4231, + "step": 5210 + }, + { + "epoch": 6.66432, + "grad_norm": 0.5272761583328247, + "learning_rate": 0.00010539776551352805, + "loss": 3.4297, + "step": 5211 + }, + { + "epoch": 6.6655999999999995, + "grad_norm": 0.5125918984413147, + "learning_rate": 0.00010535738322789069, + "loss": 3.4043, + "step": 5212 + }, + { + "epoch": 6.66688, + "grad_norm": 0.4985739290714264, + "learning_rate": 0.00010531700094225332, + "loss": 3.5057, + "step": 5213 + }, + { + "epoch": 6.66816, + "grad_norm": 0.5222551226615906, + "learning_rate": 0.00010527661865661597, + "loss": 3.4881, + "step": 5214 + }, + { + "epoch": 6.66944, + "grad_norm": 0.5044218301773071, + "learning_rate": 0.00010523623637097858, + "loss": 3.4112, + "step": 5215 + }, + { + "epoch": 6.67072, + "grad_norm": 0.5089839696884155, + "learning_rate": 0.00010519585408534121, + "loss": 3.4329, + "step": 5216 + }, + { + "epoch": 6.672, + "grad_norm": 0.5202680826187134, + "learning_rate": 0.00010515547179970386, + "loss": 3.5092, + "step": 5217 + }, + { + "epoch": 6.67328, + "grad_norm": 0.5112069845199585, + "learning_rate": 0.00010511508951406649, + "loss": 3.4684, + "step": 5218 + }, + { + "epoch": 6.67456, + "grad_norm": 0.5102248787879944, + "learning_rate": 0.00010507470722842912, + "loss": 3.5571, + "step": 5219 + }, + { + "epoch": 6.67584, + "grad_norm": 0.5237480998039246, + "learning_rate": 0.00010503432494279175, + "loss": 3.5169, + "step": 5220 + }, + { + "epoch": 6.67712, + "grad_norm": 0.5136720538139343, + "learning_rate": 0.00010499394265715439, + "loss": 3.4614, + "step": 5221 + }, + { + "epoch": 6.6784, + "grad_norm": 0.5107023119926453, + "learning_rate": 0.00010495356037151702, + "loss": 3.4888, + "step": 5222 + }, + { + "epoch": 6.67968, + "grad_norm": 0.5016779899597168, + "learning_rate": 0.00010491317808587965, + "loss": 3.4266, + "step": 5223 + }, + { + "epoch": 6.68096, + "grad_norm": 0.5184606909751892, + "learning_rate": 0.00010487279580024228, + "loss": 3.4299, + "step": 5224 + }, + { + "epoch": 6.68224, + "grad_norm": 0.5030345320701599, + "learning_rate": 0.00010483241351460492, + "loss": 3.4783, + "step": 5225 + }, + { + "epoch": 6.68352, + "grad_norm": 0.5136353373527527, + "learning_rate": 0.00010479203122896755, + "loss": 3.5147, + "step": 5226 + }, + { + "epoch": 6.6848, + "grad_norm": 0.5140992999076843, + "learning_rate": 0.00010475164894333018, + "loss": 3.4818, + "step": 5227 + }, + { + "epoch": 6.6860800000000005, + "grad_norm": 0.5170648694038391, + "learning_rate": 0.00010471126665769281, + "loss": 3.4642, + "step": 5228 + }, + { + "epoch": 6.68736, + "grad_norm": 0.5128257274627686, + "learning_rate": 0.00010467088437205544, + "loss": 3.5038, + "step": 5229 + }, + { + "epoch": 6.68864, + "grad_norm": 0.5143338441848755, + "learning_rate": 0.00010463050208641809, + "loss": 3.435, + "step": 5230 + }, + { + "epoch": 6.68992, + "grad_norm": 0.5160678029060364, + "learning_rate": 0.0001045901198007807, + "loss": 3.4402, + "step": 5231 + }, + { + "epoch": 6.6912, + "grad_norm": 0.4955798387527466, + "learning_rate": 0.00010454973751514335, + "loss": 3.4803, + "step": 5232 + }, + { + "epoch": 6.69248, + "grad_norm": 0.5133627653121948, + "learning_rate": 0.00010450935522950598, + "loss": 3.5029, + "step": 5233 + }, + { + "epoch": 6.69376, + "grad_norm": 0.5118188261985779, + "learning_rate": 0.00010446897294386862, + "loss": 3.4282, + "step": 5234 + }, + { + "epoch": 6.69504, + "grad_norm": 0.5236147046089172, + "learning_rate": 0.00010442859065823124, + "loss": 3.5077, + "step": 5235 + }, + { + "epoch": 6.69632, + "grad_norm": 0.510515034198761, + "learning_rate": 0.00010438820837259388, + "loss": 3.6193, + "step": 5236 + }, + { + "epoch": 6.6975999999999996, + "grad_norm": 0.51131272315979, + "learning_rate": 0.00010434782608695651, + "loss": 3.5257, + "step": 5237 + }, + { + "epoch": 6.69888, + "grad_norm": 0.5147497653961182, + "learning_rate": 0.00010430744380131916, + "loss": 3.4566, + "step": 5238 + }, + { + "epoch": 6.70016, + "grad_norm": 0.4983549416065216, + "learning_rate": 0.00010426706151568177, + "loss": 3.5215, + "step": 5239 + }, + { + "epoch": 6.70144, + "grad_norm": 0.49420973658561707, + "learning_rate": 0.00010422667923004442, + "loss": 3.4522, + "step": 5240 + }, + { + "epoch": 6.70272, + "grad_norm": 0.5193718075752258, + "learning_rate": 0.00010418629694440705, + "loss": 3.5033, + "step": 5241 + }, + { + "epoch": 6.704, + "grad_norm": 0.5157377123832703, + "learning_rate": 0.00010414591465876966, + "loss": 3.4045, + "step": 5242 + }, + { + "epoch": 6.70528, + "grad_norm": 0.4960803985595703, + "learning_rate": 0.0001041055323731323, + "loss": 3.4672, + "step": 5243 + }, + { + "epoch": 6.70656, + "grad_norm": 0.5239757299423218, + "learning_rate": 0.00010406515008749494, + "loss": 3.57, + "step": 5244 + }, + { + "epoch": 6.70784, + "grad_norm": 0.5048173069953918, + "learning_rate": 0.00010402476780185758, + "loss": 3.4302, + "step": 5245 + }, + { + "epoch": 6.70912, + "grad_norm": 0.5351990461349487, + "learning_rate": 0.0001039843855162202, + "loss": 3.5531, + "step": 5246 + }, + { + "epoch": 6.7104, + "grad_norm": 0.49860915541648865, + "learning_rate": 0.00010394400323058284, + "loss": 3.3879, + "step": 5247 + }, + { + "epoch": 6.71168, + "grad_norm": 0.5203588604927063, + "learning_rate": 0.00010390362094494547, + "loss": 3.4665, + "step": 5248 + }, + { + "epoch": 6.71296, + "grad_norm": 0.5277810096740723, + "learning_rate": 0.00010386323865930811, + "loss": 3.4588, + "step": 5249 + }, + { + "epoch": 6.71424, + "grad_norm": 0.5041863918304443, + "learning_rate": 0.00010382285637367074, + "loss": 3.4085, + "step": 5250 + }, + { + "epoch": 6.71552, + "grad_norm": 0.5177097916603088, + "learning_rate": 0.00010378247408803337, + "loss": 3.4501, + "step": 5251 + }, + { + "epoch": 6.7168, + "grad_norm": 0.5193495750427246, + "learning_rate": 0.000103742091802396, + "loss": 3.4086, + "step": 5252 + }, + { + "epoch": 6.7180800000000005, + "grad_norm": 0.5192511081695557, + "learning_rate": 0.00010370170951675865, + "loss": 3.4711, + "step": 5253 + }, + { + "epoch": 6.71936, + "grad_norm": 0.5242570638656616, + "learning_rate": 0.00010366132723112128, + "loss": 3.4035, + "step": 5254 + }, + { + "epoch": 6.7206399999999995, + "grad_norm": 0.5168814063072205, + "learning_rate": 0.00010362094494548391, + "loss": 3.4503, + "step": 5255 + }, + { + "epoch": 6.72192, + "grad_norm": 0.506004810333252, + "learning_rate": 0.00010358056265984654, + "loss": 3.456, + "step": 5256 + }, + { + "epoch": 6.7232, + "grad_norm": 0.5204073786735535, + "learning_rate": 0.00010354018037420917, + "loss": 3.4142, + "step": 5257 + }, + { + "epoch": 6.72448, + "grad_norm": 0.4996756315231323, + "learning_rate": 0.00010349979808857181, + "loss": 3.4942, + "step": 5258 + }, + { + "epoch": 6.72576, + "grad_norm": 0.5085786581039429, + "learning_rate": 0.00010345941580293443, + "loss": 3.4686, + "step": 5259 + }, + { + "epoch": 6.72704, + "grad_norm": 0.5232467651367188, + "learning_rate": 0.00010341903351729707, + "loss": 3.5521, + "step": 5260 + }, + { + "epoch": 6.72832, + "grad_norm": 0.49586203694343567, + "learning_rate": 0.0001033786512316597, + "loss": 3.4765, + "step": 5261 + }, + { + "epoch": 6.7296, + "grad_norm": 0.5204891562461853, + "learning_rate": 0.00010333826894602235, + "loss": 3.4617, + "step": 5262 + }, + { + "epoch": 6.73088, + "grad_norm": 0.5140677094459534, + "learning_rate": 0.00010329788666038496, + "loss": 3.4447, + "step": 5263 + }, + { + "epoch": 6.73216, + "grad_norm": 0.501307487487793, + "learning_rate": 0.0001032575043747476, + "loss": 3.4758, + "step": 5264 + }, + { + "epoch": 6.73344, + "grad_norm": 0.5081751346588135, + "learning_rate": 0.00010321712208911024, + "loss": 3.4248, + "step": 5265 + }, + { + "epoch": 6.73472, + "grad_norm": 0.5316794514656067, + "learning_rate": 0.00010317673980347288, + "loss": 3.4928, + "step": 5266 + }, + { + "epoch": 6.736, + "grad_norm": 0.5068027377128601, + "learning_rate": 0.0001031363575178355, + "loss": 3.4352, + "step": 5267 + }, + { + "epoch": 6.73728, + "grad_norm": 0.5198256373405457, + "learning_rate": 0.00010309597523219814, + "loss": 3.5322, + "step": 5268 + }, + { + "epoch": 6.73856, + "grad_norm": 0.5113339424133301, + "learning_rate": 0.00010305559294656077, + "loss": 3.4244, + "step": 5269 + }, + { + "epoch": 6.73984, + "grad_norm": 0.5019405484199524, + "learning_rate": 0.00010301521066092339, + "loss": 3.5323, + "step": 5270 + }, + { + "epoch": 6.7411200000000004, + "grad_norm": 0.5108396410942078, + "learning_rate": 0.00010297482837528603, + "loss": 3.5005, + "step": 5271 + }, + { + "epoch": 6.7424, + "grad_norm": 0.5199779868125916, + "learning_rate": 0.00010293444608964866, + "loss": 3.519, + "step": 5272 + }, + { + "epoch": 6.74368, + "grad_norm": 0.5146390795707703, + "learning_rate": 0.0001028940638040113, + "loss": 3.4188, + "step": 5273 + }, + { + "epoch": 6.74496, + "grad_norm": 0.5107973217964172, + "learning_rate": 0.00010285368151837392, + "loss": 3.4814, + "step": 5274 + }, + { + "epoch": 6.74624, + "grad_norm": 0.5244498252868652, + "learning_rate": 0.00010281329923273656, + "loss": 3.4791, + "step": 5275 + }, + { + "epoch": 6.74752, + "grad_norm": 0.5274310111999512, + "learning_rate": 0.0001027729169470992, + "loss": 3.5072, + "step": 5276 + }, + { + "epoch": 6.7488, + "grad_norm": 0.528631329536438, + "learning_rate": 0.00010273253466146184, + "loss": 3.5248, + "step": 5277 + }, + { + "epoch": 6.75008, + "grad_norm": 0.5297229886054993, + "learning_rate": 0.00010269215237582445, + "loss": 3.4491, + "step": 5278 + }, + { + "epoch": 6.75136, + "grad_norm": 0.507632315158844, + "learning_rate": 0.0001026517700901871, + "loss": 3.4365, + "step": 5279 + }, + { + "epoch": 6.7526399999999995, + "grad_norm": 0.5038201808929443, + "learning_rate": 0.00010261138780454973, + "loss": 3.5131, + "step": 5280 + }, + { + "epoch": 6.75392, + "grad_norm": 0.5153136253356934, + "learning_rate": 0.00010257100551891237, + "loss": 3.4924, + "step": 5281 + }, + { + "epoch": 6.7552, + "grad_norm": 0.4991972744464874, + "learning_rate": 0.00010253062323327499, + "loss": 3.4242, + "step": 5282 + }, + { + "epoch": 6.75648, + "grad_norm": 0.4989522099494934, + "learning_rate": 0.00010249024094763762, + "loss": 3.5012, + "step": 5283 + }, + { + "epoch": 6.75776, + "grad_norm": 0.49682870507240295, + "learning_rate": 0.00010244985866200026, + "loss": 3.4454, + "step": 5284 + }, + { + "epoch": 6.75904, + "grad_norm": 0.49864479899406433, + "learning_rate": 0.00010240947637636289, + "loss": 3.443, + "step": 5285 + }, + { + "epoch": 6.76032, + "grad_norm": 0.5117205381393433, + "learning_rate": 0.00010236909409072552, + "loss": 3.4396, + "step": 5286 + }, + { + "epoch": 6.7616, + "grad_norm": 0.5127941966056824, + "learning_rate": 0.00010232871180508815, + "loss": 3.496, + "step": 5287 + }, + { + "epoch": 6.76288, + "grad_norm": 0.5159007906913757, + "learning_rate": 0.0001022883295194508, + "loss": 3.4693, + "step": 5288 + }, + { + "epoch": 6.76416, + "grad_norm": 0.5161288976669312, + "learning_rate": 0.00010224794723381343, + "loss": 3.4029, + "step": 5289 + }, + { + "epoch": 6.76544, + "grad_norm": 0.4983842670917511, + "learning_rate": 0.00010220756494817607, + "loss": 3.4671, + "step": 5290 + }, + { + "epoch": 6.76672, + "grad_norm": 0.5110688209533691, + "learning_rate": 0.00010216718266253869, + "loss": 3.435, + "step": 5291 + }, + { + "epoch": 6.768, + "grad_norm": 0.524663507938385, + "learning_rate": 0.00010212680037690133, + "loss": 3.5548, + "step": 5292 + }, + { + "epoch": 6.76928, + "grad_norm": 0.49911680817604065, + "learning_rate": 0.00010208641809126396, + "loss": 3.4678, + "step": 5293 + }, + { + "epoch": 6.77056, + "grad_norm": 0.5108251571655273, + "learning_rate": 0.0001020460358056266, + "loss": 3.4816, + "step": 5294 + }, + { + "epoch": 6.77184, + "grad_norm": 0.5047867298126221, + "learning_rate": 0.00010200565351998922, + "loss": 3.4348, + "step": 5295 + }, + { + "epoch": 6.7731200000000005, + "grad_norm": 0.5126866698265076, + "learning_rate": 0.00010196527123435185, + "loss": 3.4728, + "step": 5296 + }, + { + "epoch": 6.7744, + "grad_norm": 0.5074558258056641, + "learning_rate": 0.0001019248889487145, + "loss": 3.4696, + "step": 5297 + }, + { + "epoch": 6.77568, + "grad_norm": 0.5098884105682373, + "learning_rate": 0.00010188450666307711, + "loss": 3.4211, + "step": 5298 + }, + { + "epoch": 6.77696, + "grad_norm": 0.5483927726745605, + "learning_rate": 0.00010184412437743975, + "loss": 3.427, + "step": 5299 + }, + { + "epoch": 6.77824, + "grad_norm": 0.49813124537467957, + "learning_rate": 0.00010180374209180238, + "loss": 3.4238, + "step": 5300 + }, + { + "epoch": 6.77952, + "grad_norm": 0.5243797898292542, + "learning_rate": 0.00010176335980616503, + "loss": 3.5178, + "step": 5301 + }, + { + "epoch": 6.7808, + "grad_norm": 0.5047301054000854, + "learning_rate": 0.00010172297752052764, + "loss": 3.3934, + "step": 5302 + }, + { + "epoch": 6.78208, + "grad_norm": 0.4998399019241333, + "learning_rate": 0.00010168259523489029, + "loss": 3.4589, + "step": 5303 + }, + { + "epoch": 6.78336, + "grad_norm": 0.5172979831695557, + "learning_rate": 0.00010164221294925292, + "loss": 3.4315, + "step": 5304 + }, + { + "epoch": 6.78464, + "grad_norm": 0.5025821328163147, + "learning_rate": 0.00010160183066361556, + "loss": 3.4013, + "step": 5305 + }, + { + "epoch": 6.78592, + "grad_norm": 0.49646785855293274, + "learning_rate": 0.00010156144837797818, + "loss": 3.4618, + "step": 5306 + }, + { + "epoch": 6.7872, + "grad_norm": 0.49145373702049255, + "learning_rate": 0.00010152106609234082, + "loss": 3.4746, + "step": 5307 + }, + { + "epoch": 6.78848, + "grad_norm": 0.5159743428230286, + "learning_rate": 0.00010148068380670345, + "loss": 3.4769, + "step": 5308 + }, + { + "epoch": 6.78976, + "grad_norm": 0.4938885271549225, + "learning_rate": 0.00010144030152106608, + "loss": 3.3877, + "step": 5309 + }, + { + "epoch": 6.79104, + "grad_norm": 0.5240892767906189, + "learning_rate": 0.00010139991923542871, + "loss": 3.5444, + "step": 5310 + }, + { + "epoch": 6.79232, + "grad_norm": 0.5187338590621948, + "learning_rate": 0.00010135953694979134, + "loss": 3.5475, + "step": 5311 + }, + { + "epoch": 6.7936, + "grad_norm": 0.5051224231719971, + "learning_rate": 0.00010131915466415399, + "loss": 3.4844, + "step": 5312 + }, + { + "epoch": 6.79488, + "grad_norm": 0.5185756683349609, + "learning_rate": 0.00010127877237851662, + "loss": 3.5207, + "step": 5313 + }, + { + "epoch": 6.79616, + "grad_norm": 0.5253816843032837, + "learning_rate": 0.00010123839009287925, + "loss": 3.4069, + "step": 5314 + }, + { + "epoch": 6.79744, + "grad_norm": 0.511680543422699, + "learning_rate": 0.00010119800780724188, + "loss": 3.4456, + "step": 5315 + }, + { + "epoch": 6.79872, + "grad_norm": 0.5037193298339844, + "learning_rate": 0.00010115762552160452, + "loss": 3.5139, + "step": 5316 + }, + { + "epoch": 6.8, + "grad_norm": 0.5148517489433289, + "learning_rate": 0.00010111724323596715, + "loss": 3.4717, + "step": 5317 + }, + { + "epoch": 6.80128, + "grad_norm": 0.4965799152851105, + "learning_rate": 0.00010107686095032978, + "loss": 3.4902, + "step": 5318 + }, + { + "epoch": 6.80256, + "grad_norm": 0.5093517899513245, + "learning_rate": 0.00010103647866469241, + "loss": 3.5074, + "step": 5319 + }, + { + "epoch": 6.80384, + "grad_norm": 0.5288740396499634, + "learning_rate": 0.00010099609637905505, + "loss": 3.4673, + "step": 5320 + }, + { + "epoch": 6.80512, + "grad_norm": 0.5054797530174255, + "learning_rate": 0.00010095571409341768, + "loss": 3.5055, + "step": 5321 + }, + { + "epoch": 6.8064, + "grad_norm": 0.5090057253837585, + "learning_rate": 0.00010091533180778031, + "loss": 3.5057, + "step": 5322 + }, + { + "epoch": 6.8076799999999995, + "grad_norm": 0.5045148730278015, + "learning_rate": 0.00010087494952214294, + "loss": 3.4295, + "step": 5323 + }, + { + "epoch": 6.80896, + "grad_norm": 0.5039887428283691, + "learning_rate": 0.00010083456723650557, + "loss": 3.4735, + "step": 5324 + }, + { + "epoch": 6.81024, + "grad_norm": 0.4997764229774475, + "learning_rate": 0.00010079418495086822, + "loss": 3.4807, + "step": 5325 + }, + { + "epoch": 6.81152, + "grad_norm": 0.5063294768333435, + "learning_rate": 0.00010075380266523083, + "loss": 3.446, + "step": 5326 + }, + { + "epoch": 6.8128, + "grad_norm": 0.5159018039703369, + "learning_rate": 0.00010071342037959348, + "loss": 3.4766, + "step": 5327 + }, + { + "epoch": 6.81408, + "grad_norm": 0.5194793343544006, + "learning_rate": 0.00010067303809395611, + "loss": 3.5094, + "step": 5328 + }, + { + "epoch": 6.81536, + "grad_norm": 0.5233056545257568, + "learning_rate": 0.00010063265580831875, + "loss": 3.5093, + "step": 5329 + }, + { + "epoch": 6.81664, + "grad_norm": 0.5220425128936768, + "learning_rate": 0.00010059227352268137, + "loss": 3.5481, + "step": 5330 + }, + { + "epoch": 6.81792, + "grad_norm": 0.4991253912448883, + "learning_rate": 0.00010055189123704401, + "loss": 3.4397, + "step": 5331 + }, + { + "epoch": 6.8192, + "grad_norm": 0.5196431279182434, + "learning_rate": 0.00010051150895140664, + "loss": 3.429, + "step": 5332 + }, + { + "epoch": 6.82048, + "grad_norm": 0.510365903377533, + "learning_rate": 0.00010047112666576928, + "loss": 3.4965, + "step": 5333 + }, + { + "epoch": 6.82176, + "grad_norm": 0.5005152821540833, + "learning_rate": 0.0001004307443801319, + "loss": 3.4985, + "step": 5334 + }, + { + "epoch": 6.82304, + "grad_norm": 0.508727490901947, + "learning_rate": 0.00010039036209449454, + "loss": 3.3829, + "step": 5335 + }, + { + "epoch": 6.82432, + "grad_norm": 0.5061600208282471, + "learning_rate": 0.00010034997980885717, + "loss": 3.4547, + "step": 5336 + }, + { + "epoch": 6.8256, + "grad_norm": 0.5172441601753235, + "learning_rate": 0.00010030959752321979, + "loss": 3.3778, + "step": 5337 + }, + { + "epoch": 6.82688, + "grad_norm": 0.4944312572479248, + "learning_rate": 0.00010026921523758243, + "loss": 3.4106, + "step": 5338 + }, + { + "epoch": 6.8281600000000005, + "grad_norm": 0.5171020030975342, + "learning_rate": 0.00010022883295194506, + "loss": 3.4994, + "step": 5339 + }, + { + "epoch": 6.82944, + "grad_norm": 0.5048860907554626, + "learning_rate": 0.00010018845066630771, + "loss": 3.4575, + "step": 5340 + }, + { + "epoch": 6.83072, + "grad_norm": 0.5153966546058655, + "learning_rate": 0.00010014806838067034, + "loss": 3.4745, + "step": 5341 + }, + { + "epoch": 6.832, + "grad_norm": 0.5013549327850342, + "learning_rate": 0.00010010768609503297, + "loss": 3.4541, + "step": 5342 + }, + { + "epoch": 6.83328, + "grad_norm": 0.520453929901123, + "learning_rate": 0.0001000673038093956, + "loss": 3.4551, + "step": 5343 + }, + { + "epoch": 6.83456, + "grad_norm": 0.5147715210914612, + "learning_rate": 0.00010002692152375824, + "loss": 3.4985, + "step": 5344 + }, + { + "epoch": 6.83584, + "grad_norm": 0.5121247172355652, + "learning_rate": 9.998653923812087e-05, + "loss": 3.4887, + "step": 5345 + }, + { + "epoch": 6.83712, + "grad_norm": 0.5188484787940979, + "learning_rate": 9.99461569524835e-05, + "loss": 3.4733, + "step": 5346 + }, + { + "epoch": 6.8384, + "grad_norm": 0.5076323747634888, + "learning_rate": 9.990577466684613e-05, + "loss": 3.4318, + "step": 5347 + }, + { + "epoch": 6.8396799999999995, + "grad_norm": 0.5205701589584351, + "learning_rate": 9.986539238120878e-05, + "loss": 3.4467, + "step": 5348 + }, + { + "epoch": 6.84096, + "grad_norm": 0.5038503408432007, + "learning_rate": 9.98250100955714e-05, + "loss": 3.3956, + "step": 5349 + }, + { + "epoch": 6.84224, + "grad_norm": 0.4975230395793915, + "learning_rate": 9.978462780993402e-05, + "loss": 3.4585, + "step": 5350 + }, + { + "epoch": 6.84352, + "grad_norm": 0.5168045163154602, + "learning_rate": 9.974424552429667e-05, + "loss": 3.4813, + "step": 5351 + }, + { + "epoch": 6.8448, + "grad_norm": 0.5278074145317078, + "learning_rate": 9.97038632386593e-05, + "loss": 3.5233, + "step": 5352 + }, + { + "epoch": 6.84608, + "grad_norm": 0.5026320219039917, + "learning_rate": 9.966348095302194e-05, + "loss": 3.3954, + "step": 5353 + }, + { + "epoch": 6.84736, + "grad_norm": 0.5222524404525757, + "learning_rate": 9.962309866738456e-05, + "loss": 3.5426, + "step": 5354 + }, + { + "epoch": 6.84864, + "grad_norm": 0.49351999163627625, + "learning_rate": 9.95827163817472e-05, + "loss": 3.5092, + "step": 5355 + }, + { + "epoch": 6.84992, + "grad_norm": 0.5170454978942871, + "learning_rate": 9.954233409610983e-05, + "loss": 3.5309, + "step": 5356 + }, + { + "epoch": 6.8512, + "grad_norm": 0.49120914936065674, + "learning_rate": 9.950195181047247e-05, + "loss": 3.4582, + "step": 5357 + }, + { + "epoch": 6.85248, + "grad_norm": 0.5041413903236389, + "learning_rate": 9.946156952483509e-05, + "loss": 3.5076, + "step": 5358 + }, + { + "epoch": 6.85376, + "grad_norm": 0.5105189085006714, + "learning_rate": 9.942118723919773e-05, + "loss": 3.5075, + "step": 5359 + }, + { + "epoch": 6.85504, + "grad_norm": 0.4975397288799286, + "learning_rate": 9.938080495356036e-05, + "loss": 3.4318, + "step": 5360 + }, + { + "epoch": 6.85632, + "grad_norm": 0.49615657329559326, + "learning_rate": 9.934042266792301e-05, + "loss": 3.4473, + "step": 5361 + }, + { + "epoch": 6.8576, + "grad_norm": 0.4980040490627289, + "learning_rate": 9.930004038228562e-05, + "loss": 3.4313, + "step": 5362 + }, + { + "epoch": 6.85888, + "grad_norm": 0.4903441071510315, + "learning_rate": 9.925965809664825e-05, + "loss": 3.4639, + "step": 5363 + }, + { + "epoch": 6.8601600000000005, + "grad_norm": 0.5194531679153442, + "learning_rate": 9.92192758110109e-05, + "loss": 3.5111, + "step": 5364 + }, + { + "epoch": 6.86144, + "grad_norm": 0.5149429440498352, + "learning_rate": 9.917889352537351e-05, + "loss": 3.4815, + "step": 5365 + }, + { + "epoch": 6.86272, + "grad_norm": 0.49559643864631653, + "learning_rate": 9.913851123973616e-05, + "loss": 3.4756, + "step": 5366 + }, + { + "epoch": 6.864, + "grad_norm": 0.5016883015632629, + "learning_rate": 9.909812895409879e-05, + "loss": 3.4241, + "step": 5367 + }, + { + "epoch": 6.86528, + "grad_norm": 0.5163260102272034, + "learning_rate": 9.905774666846143e-05, + "loss": 3.4777, + "step": 5368 + }, + { + "epoch": 6.86656, + "grad_norm": 0.5174842476844788, + "learning_rate": 9.901736438282405e-05, + "loss": 3.534, + "step": 5369 + }, + { + "epoch": 6.86784, + "grad_norm": 0.49278444051742554, + "learning_rate": 9.897698209718669e-05, + "loss": 3.453, + "step": 5370 + }, + { + "epoch": 6.86912, + "grad_norm": 0.5156992673873901, + "learning_rate": 9.893659981154932e-05, + "loss": 3.4667, + "step": 5371 + }, + { + "epoch": 6.8704, + "grad_norm": 0.5196961164474487, + "learning_rate": 9.889621752591197e-05, + "loss": 3.5111, + "step": 5372 + }, + { + "epoch": 6.87168, + "grad_norm": 0.5395616888999939, + "learning_rate": 9.885583524027458e-05, + "loss": 3.4736, + "step": 5373 + }, + { + "epoch": 6.87296, + "grad_norm": 0.49754607677459717, + "learning_rate": 9.881545295463723e-05, + "loss": 3.4057, + "step": 5374 + }, + { + "epoch": 6.87424, + "grad_norm": 0.5215137004852295, + "learning_rate": 9.877507066899986e-05, + "loss": 3.4588, + "step": 5375 + }, + { + "epoch": 6.87552, + "grad_norm": 0.5358836650848389, + "learning_rate": 9.87346883833625e-05, + "loss": 3.542, + "step": 5376 + }, + { + "epoch": 6.8768, + "grad_norm": 0.5070124268531799, + "learning_rate": 9.869430609772513e-05, + "loss": 3.4484, + "step": 5377 + }, + { + "epoch": 6.87808, + "grad_norm": 0.5137901902198792, + "learning_rate": 9.865392381208775e-05, + "loss": 3.447, + "step": 5378 + }, + { + "epoch": 6.87936, + "grad_norm": 0.5140761137008667, + "learning_rate": 9.861354152645039e-05, + "loss": 3.4393, + "step": 5379 + }, + { + "epoch": 6.88064, + "grad_norm": 0.5206302404403687, + "learning_rate": 9.857315924081302e-05, + "loss": 3.4926, + "step": 5380 + }, + { + "epoch": 6.88192, + "grad_norm": 0.5339552760124207, + "learning_rate": 9.853277695517566e-05, + "loss": 3.5058, + "step": 5381 + }, + { + "epoch": 6.8832, + "grad_norm": 0.5175068974494934, + "learning_rate": 9.849239466953828e-05, + "loss": 3.4939, + "step": 5382 + }, + { + "epoch": 6.88448, + "grad_norm": 0.4954850673675537, + "learning_rate": 9.845201238390092e-05, + "loss": 3.4226, + "step": 5383 + }, + { + "epoch": 6.88576, + "grad_norm": 0.5178061127662659, + "learning_rate": 9.841163009826355e-05, + "loss": 3.4637, + "step": 5384 + }, + { + "epoch": 6.88704, + "grad_norm": 0.5334754586219788, + "learning_rate": 9.83712478126262e-05, + "loss": 3.4926, + "step": 5385 + }, + { + "epoch": 6.88832, + "grad_norm": 0.5084394216537476, + "learning_rate": 9.833086552698881e-05, + "loss": 3.413, + "step": 5386 + }, + { + "epoch": 6.8896, + "grad_norm": 0.5201271772384644, + "learning_rate": 9.829048324135146e-05, + "loss": 3.4399, + "step": 5387 + }, + { + "epoch": 6.89088, + "grad_norm": 0.5018450021743774, + "learning_rate": 9.825010095571409e-05, + "loss": 3.4989, + "step": 5388 + }, + { + "epoch": 6.89216, + "grad_norm": 0.524716854095459, + "learning_rate": 9.820971867007673e-05, + "loss": 3.4713, + "step": 5389 + }, + { + "epoch": 6.89344, + "grad_norm": 0.5103277564048767, + "learning_rate": 9.816933638443935e-05, + "loss": 3.4293, + "step": 5390 + }, + { + "epoch": 6.8947199999999995, + "grad_norm": 0.5264323353767395, + "learning_rate": 9.812895409880198e-05, + "loss": 3.5141, + "step": 5391 + }, + { + "epoch": 6.896, + "grad_norm": 0.5120908617973328, + "learning_rate": 9.808857181316462e-05, + "loss": 3.5326, + "step": 5392 + }, + { + "epoch": 6.89728, + "grad_norm": 0.510351300239563, + "learning_rate": 9.804818952752724e-05, + "loss": 3.3871, + "step": 5393 + }, + { + "epoch": 6.89856, + "grad_norm": 0.5060296058654785, + "learning_rate": 9.800780724188988e-05, + "loss": 3.4455, + "step": 5394 + }, + { + "epoch": 6.89984, + "grad_norm": 0.5129922032356262, + "learning_rate": 9.796742495625251e-05, + "loss": 3.5359, + "step": 5395 + }, + { + "epoch": 6.90112, + "grad_norm": 0.5016512274742126, + "learning_rate": 9.792704267061516e-05, + "loss": 3.457, + "step": 5396 + }, + { + "epoch": 6.9024, + "grad_norm": 0.50436931848526, + "learning_rate": 9.788666038497777e-05, + "loss": 3.5133, + "step": 5397 + }, + { + "epoch": 6.90368, + "grad_norm": 0.5089812874794006, + "learning_rate": 9.784627809934042e-05, + "loss": 3.4324, + "step": 5398 + }, + { + "epoch": 6.90496, + "grad_norm": 0.503563642501831, + "learning_rate": 9.780589581370305e-05, + "loss": 3.4798, + "step": 5399 + }, + { + "epoch": 6.90624, + "grad_norm": 0.49585676193237305, + "learning_rate": 9.776551352806569e-05, + "loss": 3.4961, + "step": 5400 + }, + { + "epoch": 6.90752, + "grad_norm": 0.5292691588401794, + "learning_rate": 9.77251312424283e-05, + "loss": 3.4779, + "step": 5401 + }, + { + "epoch": 6.9088, + "grad_norm": 0.4982292056083679, + "learning_rate": 9.768474895679095e-05, + "loss": 3.5183, + "step": 5402 + }, + { + "epoch": 6.91008, + "grad_norm": 0.5063697099685669, + "learning_rate": 9.764436667115358e-05, + "loss": 3.5731, + "step": 5403 + }, + { + "epoch": 6.91136, + "grad_norm": 0.5067952871322632, + "learning_rate": 9.760398438551621e-05, + "loss": 3.4067, + "step": 5404 + }, + { + "epoch": 6.91264, + "grad_norm": 0.5001175403594971, + "learning_rate": 9.756360209987884e-05, + "loss": 3.4509, + "step": 5405 + }, + { + "epoch": 6.91392, + "grad_norm": 0.5022578239440918, + "learning_rate": 9.752321981424147e-05, + "loss": 3.5178, + "step": 5406 + }, + { + "epoch": 6.9152000000000005, + "grad_norm": 0.49592211842536926, + "learning_rate": 9.748283752860411e-05, + "loss": 3.4671, + "step": 5407 + }, + { + "epoch": 6.91648, + "grad_norm": 0.5008174777030945, + "learning_rate": 9.744245524296674e-05, + "loss": 3.463, + "step": 5408 + }, + { + "epoch": 6.91776, + "grad_norm": 0.49405258893966675, + "learning_rate": 9.740207295732937e-05, + "loss": 3.4687, + "step": 5409 + }, + { + "epoch": 6.91904, + "grad_norm": 0.5040400624275208, + "learning_rate": 9.7361690671692e-05, + "loss": 3.4612, + "step": 5410 + }, + { + "epoch": 6.92032, + "grad_norm": 0.5239134430885315, + "learning_rate": 9.732130838605465e-05, + "loss": 3.4698, + "step": 5411 + }, + { + "epoch": 6.9216, + "grad_norm": 0.5057811737060547, + "learning_rate": 9.728092610041728e-05, + "loss": 3.4569, + "step": 5412 + }, + { + "epoch": 6.92288, + "grad_norm": 0.49874454736709595, + "learning_rate": 9.724054381477992e-05, + "loss": 3.4118, + "step": 5413 + }, + { + "epoch": 6.92416, + "grad_norm": 0.5236087441444397, + "learning_rate": 9.720016152914254e-05, + "loss": 3.5346, + "step": 5414 + }, + { + "epoch": 6.92544, + "grad_norm": 0.5117159485816956, + "learning_rate": 9.715977924350518e-05, + "loss": 3.5194, + "step": 5415 + }, + { + "epoch": 6.9267199999999995, + "grad_norm": 0.5199319124221802, + "learning_rate": 9.711939695786781e-05, + "loss": 3.4622, + "step": 5416 + }, + { + "epoch": 6.928, + "grad_norm": 0.5007228255271912, + "learning_rate": 9.707901467223043e-05, + "loss": 3.4657, + "step": 5417 + }, + { + "epoch": 6.92928, + "grad_norm": 0.4935159683227539, + "learning_rate": 9.703863238659307e-05, + "loss": 3.473, + "step": 5418 + }, + { + "epoch": 6.93056, + "grad_norm": 0.5131574273109436, + "learning_rate": 9.69982501009557e-05, + "loss": 3.4111, + "step": 5419 + }, + { + "epoch": 6.93184, + "grad_norm": 0.4788220226764679, + "learning_rate": 9.695786781531834e-05, + "loss": 3.4845, + "step": 5420 + }, + { + "epoch": 6.93312, + "grad_norm": 0.5057500600814819, + "learning_rate": 9.691748552968096e-05, + "loss": 3.505, + "step": 5421 + }, + { + "epoch": 6.9344, + "grad_norm": 0.5060935020446777, + "learning_rate": 9.68771032440436e-05, + "loss": 3.4094, + "step": 5422 + }, + { + "epoch": 6.93568, + "grad_norm": 0.4928586184978485, + "learning_rate": 9.683672095840624e-05, + "loss": 3.4754, + "step": 5423 + }, + { + "epoch": 6.93696, + "grad_norm": 0.5091453790664673, + "learning_rate": 9.679633867276888e-05, + "loss": 3.4907, + "step": 5424 + }, + { + "epoch": 6.93824, + "grad_norm": 0.5088662505149841, + "learning_rate": 9.67559563871315e-05, + "loss": 3.4644, + "step": 5425 + }, + { + "epoch": 6.93952, + "grad_norm": 0.50926274061203, + "learning_rate": 9.671557410149414e-05, + "loss": 3.4988, + "step": 5426 + }, + { + "epoch": 6.9408, + "grad_norm": 0.5116270780563354, + "learning_rate": 9.667519181585677e-05, + "loss": 3.4536, + "step": 5427 + }, + { + "epoch": 6.94208, + "grad_norm": 0.4971657395362854, + "learning_rate": 9.663480953021941e-05, + "loss": 3.472, + "step": 5428 + }, + { + "epoch": 6.94336, + "grad_norm": 0.5055294632911682, + "learning_rate": 9.659442724458203e-05, + "loss": 3.4944, + "step": 5429 + }, + { + "epoch": 6.94464, + "grad_norm": 0.5095595121383667, + "learning_rate": 9.655404495894466e-05, + "loss": 3.4677, + "step": 5430 + }, + { + "epoch": 6.94592, + "grad_norm": 0.4956967830657959, + "learning_rate": 9.65136626733073e-05, + "loss": 3.4282, + "step": 5431 + }, + { + "epoch": 6.9472000000000005, + "grad_norm": 0.5110241174697876, + "learning_rate": 9.647328038766993e-05, + "loss": 3.4498, + "step": 5432 + }, + { + "epoch": 6.94848, + "grad_norm": 0.4950363039970398, + "learning_rate": 9.643289810203256e-05, + "loss": 3.4326, + "step": 5433 + }, + { + "epoch": 6.94976, + "grad_norm": 0.5211616158485413, + "learning_rate": 9.639251581639519e-05, + "loss": 3.512, + "step": 5434 + }, + { + "epoch": 6.95104, + "grad_norm": 0.5052586793899536, + "learning_rate": 9.635213353075784e-05, + "loss": 3.4677, + "step": 5435 + }, + { + "epoch": 6.95232, + "grad_norm": 0.509345293045044, + "learning_rate": 9.631175124512047e-05, + "loss": 3.4526, + "step": 5436 + }, + { + "epoch": 6.9536, + "grad_norm": 0.5046288967132568, + "learning_rate": 9.62713689594831e-05, + "loss": 3.4675, + "step": 5437 + }, + { + "epoch": 6.95488, + "grad_norm": 0.511081337928772, + "learning_rate": 9.623098667384573e-05, + "loss": 3.4853, + "step": 5438 + }, + { + "epoch": 6.95616, + "grad_norm": 0.512008011341095, + "learning_rate": 9.619060438820837e-05, + "loss": 3.4673, + "step": 5439 + }, + { + "epoch": 6.95744, + "grad_norm": 0.5058484077453613, + "learning_rate": 9.6150222102571e-05, + "loss": 3.4265, + "step": 5440 + }, + { + "epoch": 6.95872, + "grad_norm": 0.5225197076797485, + "learning_rate": 9.610983981693363e-05, + "loss": 3.495, + "step": 5441 + }, + { + "epoch": 6.96, + "grad_norm": 0.5050714015960693, + "learning_rate": 9.606945753129626e-05, + "loss": 3.4189, + "step": 5442 + }, + { + "epoch": 6.96128, + "grad_norm": 0.5137665271759033, + "learning_rate": 9.60290752456589e-05, + "loss": 3.4752, + "step": 5443 + }, + { + "epoch": 6.96256, + "grad_norm": 0.5180103778839111, + "learning_rate": 9.598869296002153e-05, + "loss": 3.4813, + "step": 5444 + }, + { + "epoch": 6.96384, + "grad_norm": 0.5160430073738098, + "learning_rate": 9.594831067438415e-05, + "loss": 3.4638, + "step": 5445 + }, + { + "epoch": 6.96512, + "grad_norm": 0.511764407157898, + "learning_rate": 9.59079283887468e-05, + "loss": 3.4263, + "step": 5446 + }, + { + "epoch": 6.9664, + "grad_norm": 0.5118876099586487, + "learning_rate": 9.586754610310942e-05, + "loss": 3.4935, + "step": 5447 + }, + { + "epoch": 6.96768, + "grad_norm": 0.5115414261817932, + "learning_rate": 9.582716381747207e-05, + "loss": 3.4721, + "step": 5448 + }, + { + "epoch": 6.96896, + "grad_norm": 0.5058209896087646, + "learning_rate": 9.578678153183468e-05, + "loss": 3.464, + "step": 5449 + }, + { + "epoch": 6.97024, + "grad_norm": 0.4981795847415924, + "learning_rate": 9.574639924619733e-05, + "loss": 3.473, + "step": 5450 + }, + { + "epoch": 6.97152, + "grad_norm": 0.5040668249130249, + "learning_rate": 9.570601696055996e-05, + "loss": 3.3973, + "step": 5451 + }, + { + "epoch": 6.9728, + "grad_norm": 0.49496889114379883, + "learning_rate": 9.56656346749226e-05, + "loss": 3.372, + "step": 5452 + }, + { + "epoch": 6.97408, + "grad_norm": 0.5042381882667542, + "learning_rate": 9.562525238928522e-05, + "loss": 3.4545, + "step": 5453 + }, + { + "epoch": 6.97536, + "grad_norm": 0.5015059113502502, + "learning_rate": 9.558487010364786e-05, + "loss": 3.4237, + "step": 5454 + }, + { + "epoch": 6.97664, + "grad_norm": 0.5201092958450317, + "learning_rate": 9.554448781801049e-05, + "loss": 3.4955, + "step": 5455 + }, + { + "epoch": 6.97792, + "grad_norm": 0.5009555220603943, + "learning_rate": 9.550410553237314e-05, + "loss": 3.4304, + "step": 5456 + }, + { + "epoch": 6.9792, + "grad_norm": 0.49957409501075745, + "learning_rate": 9.546372324673575e-05, + "loss": 3.435, + "step": 5457 + }, + { + "epoch": 6.98048, + "grad_norm": 0.512995719909668, + "learning_rate": 9.542334096109838e-05, + "loss": 3.5062, + "step": 5458 + }, + { + "epoch": 6.9817599999999995, + "grad_norm": 0.5111671090126038, + "learning_rate": 9.538295867546103e-05, + "loss": 3.4766, + "step": 5459 + }, + { + "epoch": 6.98304, + "grad_norm": 0.5001948475837708, + "learning_rate": 9.534257638982364e-05, + "loss": 3.4496, + "step": 5460 + }, + { + "epoch": 6.98432, + "grad_norm": 0.5309041738510132, + "learning_rate": 9.530219410418629e-05, + "loss": 3.5057, + "step": 5461 + }, + { + "epoch": 6.9856, + "grad_norm": 0.5163221955299377, + "learning_rate": 9.526181181854892e-05, + "loss": 3.4416, + "step": 5462 + }, + { + "epoch": 6.98688, + "grad_norm": 0.4984453022480011, + "learning_rate": 9.522142953291156e-05, + "loss": 3.46, + "step": 5463 + }, + { + "epoch": 6.98816, + "grad_norm": 0.5184572339057922, + "learning_rate": 9.518104724727419e-05, + "loss": 3.4491, + "step": 5464 + }, + { + "epoch": 6.98944, + "grad_norm": 0.5225724577903748, + "learning_rate": 9.514066496163682e-05, + "loss": 3.4354, + "step": 5465 + }, + { + "epoch": 6.99072, + "grad_norm": 0.507361114025116, + "learning_rate": 9.510028267599945e-05, + "loss": 3.4749, + "step": 5466 + }, + { + "epoch": 6.992, + "grad_norm": 0.5018404126167297, + "learning_rate": 9.50599003903621e-05, + "loss": 3.4538, + "step": 5467 + }, + { + "epoch": 6.99328, + "grad_norm": 0.5017131567001343, + "learning_rate": 9.501951810472472e-05, + "loss": 3.4954, + "step": 5468 + }, + { + "epoch": 6.99456, + "grad_norm": 0.5111985802650452, + "learning_rate": 9.497913581908735e-05, + "loss": 3.4424, + "step": 5469 + }, + { + "epoch": 6.99584, + "grad_norm": 0.5290077328681946, + "learning_rate": 9.493875353344998e-05, + "loss": 3.4288, + "step": 5470 + }, + { + "epoch": 6.99712, + "grad_norm": 0.5016758441925049, + "learning_rate": 9.489837124781261e-05, + "loss": 3.4364, + "step": 5471 + }, + { + "epoch": 6.9984, + "grad_norm": 0.491237074136734, + "learning_rate": 9.485798896217526e-05, + "loss": 3.4449, + "step": 5472 + }, + { + "epoch": 6.99968, + "grad_norm": 0.5003752112388611, + "learning_rate": 9.481760667653787e-05, + "loss": 3.4199, + "step": 5473 + }, + { + "epoch": 7.0, + "grad_norm": 0.9872975945472717, + "learning_rate": 9.477722439090052e-05, + "loss": 3.5161, + "step": 5474 + }, + { + "epoch": 7.00128, + "grad_norm": 0.5078200101852417, + "learning_rate": 9.473684210526315e-05, + "loss": 3.33, + "step": 5475 + }, + { + "epoch": 7.00256, + "grad_norm": 0.5141358971595764, + "learning_rate": 9.469645981962579e-05, + "loss": 3.333, + "step": 5476 + }, + { + "epoch": 7.00384, + "grad_norm": 0.5374777913093567, + "learning_rate": 9.465607753398841e-05, + "loss": 3.3227, + "step": 5477 + }, + { + "epoch": 7.00512, + "grad_norm": 0.5191708207130432, + "learning_rate": 9.461569524835105e-05, + "loss": 3.4003, + "step": 5478 + }, + { + "epoch": 7.0064, + "grad_norm": 0.5081069469451904, + "learning_rate": 9.457531296271368e-05, + "loss": 3.3985, + "step": 5479 + }, + { + "epoch": 7.00768, + "grad_norm": 0.5270673036575317, + "learning_rate": 9.453493067707633e-05, + "loss": 3.3482, + "step": 5480 + }, + { + "epoch": 7.00896, + "grad_norm": 0.5204823017120361, + "learning_rate": 9.449454839143894e-05, + "loss": 3.3075, + "step": 5481 + }, + { + "epoch": 7.01024, + "grad_norm": 0.5076407194137573, + "learning_rate": 9.445416610580159e-05, + "loss": 3.2981, + "step": 5482 + }, + { + "epoch": 7.01152, + "grad_norm": 0.5210766196250916, + "learning_rate": 9.441378382016422e-05, + "loss": 3.4319, + "step": 5483 + }, + { + "epoch": 7.0128, + "grad_norm": 0.5091572403907776, + "learning_rate": 9.437340153452683e-05, + "loss": 3.3706, + "step": 5484 + }, + { + "epoch": 7.01408, + "grad_norm": 0.5025621056556702, + "learning_rate": 9.433301924888948e-05, + "loss": 3.274, + "step": 5485 + }, + { + "epoch": 7.01536, + "grad_norm": 0.5144043564796448, + "learning_rate": 9.42926369632521e-05, + "loss": 3.2991, + "step": 5486 + }, + { + "epoch": 7.01664, + "grad_norm": 0.5310031175613403, + "learning_rate": 9.425225467761475e-05, + "loss": 3.3532, + "step": 5487 + }, + { + "epoch": 7.01792, + "grad_norm": 0.506899893283844, + "learning_rate": 9.421187239197737e-05, + "loss": 3.3682, + "step": 5488 + }, + { + "epoch": 7.0192, + "grad_norm": 0.5226014256477356, + "learning_rate": 9.417149010634001e-05, + "loss": 3.3653, + "step": 5489 + }, + { + "epoch": 7.02048, + "grad_norm": 0.5002744197845459, + "learning_rate": 9.413110782070264e-05, + "loss": 3.3786, + "step": 5490 + }, + { + "epoch": 7.0217600000000004, + "grad_norm": 0.5125488638877869, + "learning_rate": 9.409072553506528e-05, + "loss": 3.3929, + "step": 5491 + }, + { + "epoch": 7.02304, + "grad_norm": 0.5321469902992249, + "learning_rate": 9.40503432494279e-05, + "loss": 3.3667, + "step": 5492 + }, + { + "epoch": 7.02432, + "grad_norm": 0.5275914072990417, + "learning_rate": 9.400996096379054e-05, + "loss": 3.3928, + "step": 5493 + }, + { + "epoch": 7.0256, + "grad_norm": 0.5226854681968689, + "learning_rate": 9.396957867815317e-05, + "loss": 3.3962, + "step": 5494 + }, + { + "epoch": 7.02688, + "grad_norm": 0.5123041272163391, + "learning_rate": 9.392919639251582e-05, + "loss": 3.2311, + "step": 5495 + }, + { + "epoch": 7.02816, + "grad_norm": 0.49747586250305176, + "learning_rate": 9.388881410687843e-05, + "loss": 3.3694, + "step": 5496 + }, + { + "epoch": 7.02944, + "grad_norm": 0.5055866241455078, + "learning_rate": 9.384843182124106e-05, + "loss": 3.3472, + "step": 5497 + }, + { + "epoch": 7.03072, + "grad_norm": 0.5089617371559143, + "learning_rate": 9.380804953560371e-05, + "loss": 3.3635, + "step": 5498 + }, + { + "epoch": 7.032, + "grad_norm": 0.5178612470626831, + "learning_rate": 9.376766724996634e-05, + "loss": 3.3485, + "step": 5499 + }, + { + "epoch": 7.03328, + "grad_norm": 0.5137771964073181, + "learning_rate": 9.372728496432898e-05, + "loss": 3.3414, + "step": 5500 + }, + { + "epoch": 7.03456, + "grad_norm": 0.5222017765045166, + "learning_rate": 9.36869026786916e-05, + "loss": 3.3504, + "step": 5501 + }, + { + "epoch": 7.03584, + "grad_norm": 0.506715714931488, + "learning_rate": 9.364652039305424e-05, + "loss": 3.342, + "step": 5502 + }, + { + "epoch": 7.03712, + "grad_norm": 0.5213739275932312, + "learning_rate": 9.360613810741687e-05, + "loss": 3.3492, + "step": 5503 + }, + { + "epoch": 7.0384, + "grad_norm": 0.5213996767997742, + "learning_rate": 9.356575582177952e-05, + "loss": 3.3364, + "step": 5504 + }, + { + "epoch": 7.03968, + "grad_norm": 0.5308609008789062, + "learning_rate": 9.352537353614213e-05, + "loss": 3.2428, + "step": 5505 + }, + { + "epoch": 7.04096, + "grad_norm": 0.520922064781189, + "learning_rate": 9.348499125050478e-05, + "loss": 3.3741, + "step": 5506 + }, + { + "epoch": 7.04224, + "grad_norm": 0.49800974130630493, + "learning_rate": 9.34446089648674e-05, + "loss": 3.35, + "step": 5507 + }, + { + "epoch": 7.04352, + "grad_norm": 0.53228360414505, + "learning_rate": 9.340422667923005e-05, + "loss": 3.3215, + "step": 5508 + }, + { + "epoch": 7.0448, + "grad_norm": 0.5500988960266113, + "learning_rate": 9.336384439359267e-05, + "loss": 3.4593, + "step": 5509 + }, + { + "epoch": 7.04608, + "grad_norm": 0.5122029185295105, + "learning_rate": 9.332346210795531e-05, + "loss": 3.3557, + "step": 5510 + }, + { + "epoch": 7.04736, + "grad_norm": 0.4989195764064789, + "learning_rate": 9.328307982231794e-05, + "loss": 3.3231, + "step": 5511 + }, + { + "epoch": 7.04864, + "grad_norm": 0.521932065486908, + "learning_rate": 9.324269753668056e-05, + "loss": 3.3911, + "step": 5512 + }, + { + "epoch": 7.04992, + "grad_norm": 0.5335235595703125, + "learning_rate": 9.32023152510432e-05, + "loss": 3.3212, + "step": 5513 + }, + { + "epoch": 7.0512, + "grad_norm": 0.5094990730285645, + "learning_rate": 9.316193296540583e-05, + "loss": 3.2888, + "step": 5514 + }, + { + "epoch": 7.05248, + "grad_norm": 0.5203416347503662, + "learning_rate": 9.312155067976847e-05, + "loss": 3.4157, + "step": 5515 + }, + { + "epoch": 7.05376, + "grad_norm": 0.5284749865531921, + "learning_rate": 9.308116839413109e-05, + "loss": 3.3612, + "step": 5516 + }, + { + "epoch": 7.05504, + "grad_norm": 0.5061756372451782, + "learning_rate": 9.304078610849373e-05, + "loss": 3.4084, + "step": 5517 + }, + { + "epoch": 7.05632, + "grad_norm": 0.5212084054946899, + "learning_rate": 9.300040382285636e-05, + "loss": 3.2901, + "step": 5518 + }, + { + "epoch": 7.0576, + "grad_norm": 0.528948187828064, + "learning_rate": 9.296002153721901e-05, + "loss": 3.3865, + "step": 5519 + }, + { + "epoch": 7.05888, + "grad_norm": 0.507111132144928, + "learning_rate": 9.291963925158162e-05, + "loss": 3.4119, + "step": 5520 + }, + { + "epoch": 7.06016, + "grad_norm": 0.5240997672080994, + "learning_rate": 9.287925696594427e-05, + "loss": 3.4243, + "step": 5521 + }, + { + "epoch": 7.06144, + "grad_norm": 0.5230147242546082, + "learning_rate": 9.28388746803069e-05, + "loss": 3.376, + "step": 5522 + }, + { + "epoch": 7.06272, + "grad_norm": 0.526611864566803, + "learning_rate": 9.279849239466954e-05, + "loss": 3.3711, + "step": 5523 + }, + { + "epoch": 7.064, + "grad_norm": 0.5132043957710266, + "learning_rate": 9.275811010903216e-05, + "loss": 3.2992, + "step": 5524 + }, + { + "epoch": 7.06528, + "grad_norm": 0.4980516731739044, + "learning_rate": 9.271772782339479e-05, + "loss": 3.2729, + "step": 5525 + }, + { + "epoch": 7.06656, + "grad_norm": 0.5084584355354309, + "learning_rate": 9.267734553775743e-05, + "loss": 3.2491, + "step": 5526 + }, + { + "epoch": 7.06784, + "grad_norm": 0.515275776386261, + "learning_rate": 9.263696325212006e-05, + "loss": 3.3712, + "step": 5527 + }, + { + "epoch": 7.06912, + "grad_norm": 0.5161235928535461, + "learning_rate": 9.259658096648269e-05, + "loss": 3.4019, + "step": 5528 + }, + { + "epoch": 7.0704, + "grad_norm": 0.5367205142974854, + "learning_rate": 9.255619868084532e-05, + "loss": 3.4287, + "step": 5529 + }, + { + "epoch": 7.07168, + "grad_norm": 0.5217522978782654, + "learning_rate": 9.251581639520796e-05, + "loss": 3.4843, + "step": 5530 + }, + { + "epoch": 7.07296, + "grad_norm": 0.5207579135894775, + "learning_rate": 9.24754341095706e-05, + "loss": 3.3275, + "step": 5531 + }, + { + "epoch": 7.07424, + "grad_norm": 0.5166409611701965, + "learning_rate": 9.243505182393323e-05, + "loss": 3.293, + "step": 5532 + }, + { + "epoch": 7.07552, + "grad_norm": 0.5201752185821533, + "learning_rate": 9.239466953829586e-05, + "loss": 3.3349, + "step": 5533 + }, + { + "epoch": 7.0768, + "grad_norm": 0.5157431364059448, + "learning_rate": 9.23542872526585e-05, + "loss": 3.3274, + "step": 5534 + }, + { + "epoch": 7.07808, + "grad_norm": 0.5315886735916138, + "learning_rate": 9.231390496702113e-05, + "loss": 3.3802, + "step": 5535 + }, + { + "epoch": 7.07936, + "grad_norm": 0.5255934000015259, + "learning_rate": 9.227352268138377e-05, + "loss": 3.3859, + "step": 5536 + }, + { + "epoch": 7.08064, + "grad_norm": 0.5185539126396179, + "learning_rate": 9.223314039574639e-05, + "loss": 3.3037, + "step": 5537 + }, + { + "epoch": 7.08192, + "grad_norm": 0.5157800912857056, + "learning_rate": 9.219275811010902e-05, + "loss": 3.4169, + "step": 5538 + }, + { + "epoch": 7.0832, + "grad_norm": 0.5315351486206055, + "learning_rate": 9.215237582447166e-05, + "loss": 3.3251, + "step": 5539 + }, + { + "epoch": 7.08448, + "grad_norm": 0.5046893358230591, + "learning_rate": 9.211199353883428e-05, + "loss": 3.3554, + "step": 5540 + }, + { + "epoch": 7.08576, + "grad_norm": 0.5140607357025146, + "learning_rate": 9.207161125319692e-05, + "loss": 3.3262, + "step": 5541 + }, + { + "epoch": 7.08704, + "grad_norm": 0.508526086807251, + "learning_rate": 9.203122896755955e-05, + "loss": 3.2896, + "step": 5542 + }, + { + "epoch": 7.08832, + "grad_norm": 0.5273757576942444, + "learning_rate": 9.19908466819222e-05, + "loss": 3.4774, + "step": 5543 + }, + { + "epoch": 7.0896, + "grad_norm": 0.5205854773521423, + "learning_rate": 9.195046439628481e-05, + "loss": 3.3347, + "step": 5544 + }, + { + "epoch": 7.09088, + "grad_norm": 0.5332198739051819, + "learning_rate": 9.191008211064746e-05, + "loss": 3.3342, + "step": 5545 + }, + { + "epoch": 7.09216, + "grad_norm": 0.5330145955085754, + "learning_rate": 9.186969982501009e-05, + "loss": 3.3769, + "step": 5546 + }, + { + "epoch": 7.09344, + "grad_norm": 0.5021401047706604, + "learning_rate": 9.182931753937273e-05, + "loss": 3.3476, + "step": 5547 + }, + { + "epoch": 7.09472, + "grad_norm": 0.5273832678794861, + "learning_rate": 9.178893525373535e-05, + "loss": 3.333, + "step": 5548 + }, + { + "epoch": 7.096, + "grad_norm": 0.5313226580619812, + "learning_rate": 9.174855296809799e-05, + "loss": 3.4208, + "step": 5549 + }, + { + "epoch": 7.09728, + "grad_norm": 0.5218507051467896, + "learning_rate": 9.170817068246062e-05, + "loss": 3.3408, + "step": 5550 + }, + { + "epoch": 7.09856, + "grad_norm": 0.5151420831680298, + "learning_rate": 9.166778839682325e-05, + "loss": 3.3944, + "step": 5551 + }, + { + "epoch": 7.09984, + "grad_norm": 0.5464621186256409, + "learning_rate": 9.162740611118588e-05, + "loss": 3.3923, + "step": 5552 + }, + { + "epoch": 7.10112, + "grad_norm": 0.5159791707992554, + "learning_rate": 9.158702382554851e-05, + "loss": 3.379, + "step": 5553 + }, + { + "epoch": 7.1024, + "grad_norm": 0.526742160320282, + "learning_rate": 9.154664153991115e-05, + "loss": 3.3621, + "step": 5554 + }, + { + "epoch": 7.10368, + "grad_norm": 0.5101780891418457, + "learning_rate": 9.150625925427378e-05, + "loss": 3.3674, + "step": 5555 + }, + { + "epoch": 7.10496, + "grad_norm": 0.5260143876075745, + "learning_rate": 9.146587696863641e-05, + "loss": 3.3164, + "step": 5556 + }, + { + "epoch": 7.10624, + "grad_norm": 0.5367382168769836, + "learning_rate": 9.142549468299904e-05, + "loss": 3.3863, + "step": 5557 + }, + { + "epoch": 7.10752, + "grad_norm": 0.5072177648544312, + "learning_rate": 9.138511239736169e-05, + "loss": 3.335, + "step": 5558 + }, + { + "epoch": 7.1088, + "grad_norm": 0.5360726118087769, + "learning_rate": 9.134473011172432e-05, + "loss": 3.3996, + "step": 5559 + }, + { + "epoch": 7.11008, + "grad_norm": 0.5281416177749634, + "learning_rate": 9.130434782608695e-05, + "loss": 3.3302, + "step": 5560 + }, + { + "epoch": 7.11136, + "grad_norm": 0.526565432548523, + "learning_rate": 9.126396554044958e-05, + "loss": 3.3779, + "step": 5561 + }, + { + "epoch": 7.11264, + "grad_norm": 0.5221660733222961, + "learning_rate": 9.122358325481222e-05, + "loss": 3.3879, + "step": 5562 + }, + { + "epoch": 7.11392, + "grad_norm": 0.5545408129692078, + "learning_rate": 9.118320096917485e-05, + "loss": 3.3599, + "step": 5563 + }, + { + "epoch": 7.1152, + "grad_norm": 0.5166964530944824, + "learning_rate": 9.114281868353748e-05, + "loss": 3.3786, + "step": 5564 + }, + { + "epoch": 7.11648, + "grad_norm": 0.5115810632705688, + "learning_rate": 9.110243639790011e-05, + "loss": 3.3983, + "step": 5565 + }, + { + "epoch": 7.11776, + "grad_norm": 0.527052640914917, + "learning_rate": 9.106205411226274e-05, + "loss": 3.3886, + "step": 5566 + }, + { + "epoch": 7.11904, + "grad_norm": 0.5167108774185181, + "learning_rate": 9.102167182662539e-05, + "loss": 3.3837, + "step": 5567 + }, + { + "epoch": 7.12032, + "grad_norm": 0.5196815729141235, + "learning_rate": 9.0981289540988e-05, + "loss": 3.4007, + "step": 5568 + }, + { + "epoch": 7.1216, + "grad_norm": 0.5209911465644836, + "learning_rate": 9.094090725535065e-05, + "loss": 3.3465, + "step": 5569 + }, + { + "epoch": 7.12288, + "grad_norm": 0.5412890315055847, + "learning_rate": 9.090052496971328e-05, + "loss": 3.3554, + "step": 5570 + }, + { + "epoch": 7.12416, + "grad_norm": 0.5221083164215088, + "learning_rate": 9.086014268407592e-05, + "loss": 3.3713, + "step": 5571 + }, + { + "epoch": 7.12544, + "grad_norm": 0.5288161039352417, + "learning_rate": 9.081976039843854e-05, + "loss": 3.3445, + "step": 5572 + }, + { + "epoch": 7.12672, + "grad_norm": 0.5313680171966553, + "learning_rate": 9.077937811280118e-05, + "loss": 3.3708, + "step": 5573 + }, + { + "epoch": 7.128, + "grad_norm": 0.5292986035346985, + "learning_rate": 9.073899582716381e-05, + "loss": 3.3137, + "step": 5574 + }, + { + "epoch": 7.12928, + "grad_norm": 0.5138509273529053, + "learning_rate": 9.069861354152645e-05, + "loss": 3.3887, + "step": 5575 + }, + { + "epoch": 7.13056, + "grad_norm": 0.540255606174469, + "learning_rate": 9.065823125588907e-05, + "loss": 3.3745, + "step": 5576 + }, + { + "epoch": 7.13184, + "grad_norm": 0.5347320437431335, + "learning_rate": 9.061784897025171e-05, + "loss": 3.3579, + "step": 5577 + }, + { + "epoch": 7.13312, + "grad_norm": 0.49910813570022583, + "learning_rate": 9.057746668461434e-05, + "loss": 3.3225, + "step": 5578 + }, + { + "epoch": 7.1344, + "grad_norm": 0.5152397751808167, + "learning_rate": 9.053708439897696e-05, + "loss": 3.3496, + "step": 5579 + }, + { + "epoch": 7.13568, + "grad_norm": 0.5248477458953857, + "learning_rate": 9.04967021133396e-05, + "loss": 3.3574, + "step": 5580 + }, + { + "epoch": 7.13696, + "grad_norm": 0.5127560496330261, + "learning_rate": 9.045631982770223e-05, + "loss": 3.3692, + "step": 5581 + }, + { + "epoch": 7.13824, + "grad_norm": 0.5316688418388367, + "learning_rate": 9.041593754206488e-05, + "loss": 3.3383, + "step": 5582 + }, + { + "epoch": 7.13952, + "grad_norm": 0.5397014021873474, + "learning_rate": 9.03755552564275e-05, + "loss": 3.3939, + "step": 5583 + }, + { + "epoch": 7.1408, + "grad_norm": 0.5297314524650574, + "learning_rate": 9.033517297079014e-05, + "loss": 3.3205, + "step": 5584 + }, + { + "epoch": 7.14208, + "grad_norm": 0.5164687633514404, + "learning_rate": 9.029479068515277e-05, + "loss": 3.311, + "step": 5585 + }, + { + "epoch": 7.14336, + "grad_norm": 0.5183284878730774, + "learning_rate": 9.025440839951541e-05, + "loss": 3.3063, + "step": 5586 + }, + { + "epoch": 7.14464, + "grad_norm": 0.508436918258667, + "learning_rate": 9.021402611387804e-05, + "loss": 3.3496, + "step": 5587 + }, + { + "epoch": 7.14592, + "grad_norm": 0.5277997851371765, + "learning_rate": 9.017364382824067e-05, + "loss": 3.3782, + "step": 5588 + }, + { + "epoch": 7.1472, + "grad_norm": 0.5223129987716675, + "learning_rate": 9.01332615426033e-05, + "loss": 3.3175, + "step": 5589 + }, + { + "epoch": 7.14848, + "grad_norm": 0.4987069070339203, + "learning_rate": 9.009287925696595e-05, + "loss": 3.3346, + "step": 5590 + }, + { + "epoch": 7.14976, + "grad_norm": 0.5348743796348572, + "learning_rate": 9.005249697132858e-05, + "loss": 3.3543, + "step": 5591 + }, + { + "epoch": 7.15104, + "grad_norm": 0.5277005434036255, + "learning_rate": 9.001211468569119e-05, + "loss": 3.4288, + "step": 5592 + }, + { + "epoch": 7.15232, + "grad_norm": 0.5248242020606995, + "learning_rate": 8.997173240005384e-05, + "loss": 3.3001, + "step": 5593 + }, + { + "epoch": 7.1536, + "grad_norm": 0.5100671052932739, + "learning_rate": 8.993135011441647e-05, + "loss": 3.3571, + "step": 5594 + }, + { + "epoch": 7.15488, + "grad_norm": 0.5024705529212952, + "learning_rate": 8.989096782877911e-05, + "loss": 3.35, + "step": 5595 + }, + { + "epoch": 7.15616, + "grad_norm": 0.5296070575714111, + "learning_rate": 8.985058554314173e-05, + "loss": 3.3975, + "step": 5596 + }, + { + "epoch": 7.15744, + "grad_norm": 0.535521388053894, + "learning_rate": 8.981020325750437e-05, + "loss": 3.4045, + "step": 5597 + }, + { + "epoch": 7.15872, + "grad_norm": 0.5225574374198914, + "learning_rate": 8.9769820971867e-05, + "loss": 3.3518, + "step": 5598 + }, + { + "epoch": 7.16, + "grad_norm": 0.509151816368103, + "learning_rate": 8.972943868622964e-05, + "loss": 3.438, + "step": 5599 + }, + { + "epoch": 7.16128, + "grad_norm": 0.5296933054924011, + "learning_rate": 8.968905640059226e-05, + "loss": 3.4388, + "step": 5600 + }, + { + "epoch": 7.16256, + "grad_norm": 0.527597963809967, + "learning_rate": 8.96486741149549e-05, + "loss": 3.36, + "step": 5601 + }, + { + "epoch": 7.16384, + "grad_norm": 0.5163085460662842, + "learning_rate": 8.960829182931753e-05, + "loss": 3.3173, + "step": 5602 + }, + { + "epoch": 7.16512, + "grad_norm": 0.5081968307495117, + "learning_rate": 8.956790954368018e-05, + "loss": 3.3379, + "step": 5603 + }, + { + "epoch": 7.1664, + "grad_norm": 0.5179334878921509, + "learning_rate": 8.95275272580428e-05, + "loss": 3.3674, + "step": 5604 + }, + { + "epoch": 7.16768, + "grad_norm": 0.5077155828475952, + "learning_rate": 8.948714497240542e-05, + "loss": 3.3791, + "step": 5605 + }, + { + "epoch": 7.16896, + "grad_norm": 0.5360821485519409, + "learning_rate": 8.944676268676807e-05, + "loss": 3.3437, + "step": 5606 + }, + { + "epoch": 7.17024, + "grad_norm": 0.5339843034744263, + "learning_rate": 8.940638040113068e-05, + "loss": 3.4416, + "step": 5607 + }, + { + "epoch": 7.17152, + "grad_norm": 0.5199359059333801, + "learning_rate": 8.936599811549333e-05, + "loss": 3.4527, + "step": 5608 + }, + { + "epoch": 7.1728, + "grad_norm": 0.5207464098930359, + "learning_rate": 8.932561582985596e-05, + "loss": 3.3807, + "step": 5609 + }, + { + "epoch": 7.17408, + "grad_norm": 0.5150240659713745, + "learning_rate": 8.92852335442186e-05, + "loss": 3.3255, + "step": 5610 + }, + { + "epoch": 7.17536, + "grad_norm": 0.5204200744628906, + "learning_rate": 8.924485125858122e-05, + "loss": 3.3301, + "step": 5611 + }, + { + "epoch": 7.17664, + "grad_norm": 0.5184539556503296, + "learning_rate": 8.920446897294386e-05, + "loss": 3.3256, + "step": 5612 + }, + { + "epoch": 7.17792, + "grad_norm": 0.5265978574752808, + "learning_rate": 8.916408668730649e-05, + "loss": 3.3752, + "step": 5613 + }, + { + "epoch": 7.1792, + "grad_norm": 0.5245677828788757, + "learning_rate": 8.912370440166914e-05, + "loss": 3.3356, + "step": 5614 + }, + { + "epoch": 7.18048, + "grad_norm": 0.5343059301376343, + "learning_rate": 8.908332211603175e-05, + "loss": 3.3374, + "step": 5615 + }, + { + "epoch": 7.18176, + "grad_norm": 0.5347888469696045, + "learning_rate": 8.90429398303944e-05, + "loss": 3.3665, + "step": 5616 + }, + { + "epoch": 7.18304, + "grad_norm": 0.5191536545753479, + "learning_rate": 8.900255754475703e-05, + "loss": 3.3387, + "step": 5617 + }, + { + "epoch": 7.18432, + "grad_norm": 0.5158880949020386, + "learning_rate": 8.896217525911966e-05, + "loss": 3.312, + "step": 5618 + }, + { + "epoch": 7.1856, + "grad_norm": 0.5173276662826538, + "learning_rate": 8.892179297348229e-05, + "loss": 3.352, + "step": 5619 + }, + { + "epoch": 7.18688, + "grad_norm": 0.515876829624176, + "learning_rate": 8.888141068784492e-05, + "loss": 3.3369, + "step": 5620 + }, + { + "epoch": 7.18816, + "grad_norm": 0.5200967788696289, + "learning_rate": 8.884102840220756e-05, + "loss": 3.3315, + "step": 5621 + }, + { + "epoch": 7.18944, + "grad_norm": 0.5234977602958679, + "learning_rate": 8.880064611657019e-05, + "loss": 3.3648, + "step": 5622 + }, + { + "epoch": 7.19072, + "grad_norm": 0.5436128377914429, + "learning_rate": 8.876026383093282e-05, + "loss": 3.3157, + "step": 5623 + }, + { + "epoch": 7.192, + "grad_norm": 0.5315131545066833, + "learning_rate": 8.871988154529545e-05, + "loss": 3.3696, + "step": 5624 + }, + { + "epoch": 7.19328, + "grad_norm": 0.5198819637298584, + "learning_rate": 8.867949925965809e-05, + "loss": 3.3841, + "step": 5625 + }, + { + "epoch": 7.19456, + "grad_norm": 0.5186936259269714, + "learning_rate": 8.863911697402072e-05, + "loss": 3.3401, + "step": 5626 + }, + { + "epoch": 7.19584, + "grad_norm": 0.5158634781837463, + "learning_rate": 8.859873468838337e-05, + "loss": 3.3416, + "step": 5627 + }, + { + "epoch": 7.19712, + "grad_norm": 0.5173478722572327, + "learning_rate": 8.855835240274598e-05, + "loss": 3.3616, + "step": 5628 + }, + { + "epoch": 7.1984, + "grad_norm": 0.5127228498458862, + "learning_rate": 8.851797011710863e-05, + "loss": 3.4288, + "step": 5629 + }, + { + "epoch": 7.19968, + "grad_norm": 0.5157126188278198, + "learning_rate": 8.847758783147126e-05, + "loss": 3.3122, + "step": 5630 + }, + { + "epoch": 7.20096, + "grad_norm": 0.5168270468711853, + "learning_rate": 8.84372055458339e-05, + "loss": 3.3206, + "step": 5631 + }, + { + "epoch": 7.20224, + "grad_norm": 0.5436537861824036, + "learning_rate": 8.839682326019652e-05, + "loss": 3.415, + "step": 5632 + }, + { + "epoch": 7.20352, + "grad_norm": 0.5260339975357056, + "learning_rate": 8.835644097455915e-05, + "loss": 3.3191, + "step": 5633 + }, + { + "epoch": 7.2048, + "grad_norm": 0.5202707648277283, + "learning_rate": 8.831605868892179e-05, + "loss": 3.3391, + "step": 5634 + }, + { + "epoch": 7.20608, + "grad_norm": 0.5285276174545288, + "learning_rate": 8.827567640328441e-05, + "loss": 3.3521, + "step": 5635 + }, + { + "epoch": 7.2073599999999995, + "grad_norm": 0.5395596027374268, + "learning_rate": 8.823529411764705e-05, + "loss": 3.3539, + "step": 5636 + }, + { + "epoch": 7.20864, + "grad_norm": 0.543609619140625, + "learning_rate": 8.819491183200968e-05, + "loss": 3.3652, + "step": 5637 + }, + { + "epoch": 7.20992, + "grad_norm": 0.5348373651504517, + "learning_rate": 8.815452954637232e-05, + "loss": 3.3671, + "step": 5638 + }, + { + "epoch": 7.2112, + "grad_norm": 0.5275009274482727, + "learning_rate": 8.811414726073494e-05, + "loss": 3.3737, + "step": 5639 + }, + { + "epoch": 7.21248, + "grad_norm": 0.5249333381652832, + "learning_rate": 8.807376497509758e-05, + "loss": 3.4308, + "step": 5640 + }, + { + "epoch": 7.21376, + "grad_norm": 0.5271218419075012, + "learning_rate": 8.803338268946022e-05, + "loss": 3.3363, + "step": 5641 + }, + { + "epoch": 7.21504, + "grad_norm": 0.5315967798233032, + "learning_rate": 8.799300040382286e-05, + "loss": 3.4132, + "step": 5642 + }, + { + "epoch": 7.21632, + "grad_norm": 0.5222828984260559, + "learning_rate": 8.795261811818548e-05, + "loss": 3.3832, + "step": 5643 + }, + { + "epoch": 7.2176, + "grad_norm": 0.5247751474380493, + "learning_rate": 8.791223583254812e-05, + "loss": 3.255, + "step": 5644 + }, + { + "epoch": 7.21888, + "grad_norm": 0.5310077667236328, + "learning_rate": 8.787185354691075e-05, + "loss": 3.3478, + "step": 5645 + }, + { + "epoch": 7.22016, + "grad_norm": 0.5162127614021301, + "learning_rate": 8.783147126127338e-05, + "loss": 3.3882, + "step": 5646 + }, + { + "epoch": 7.22144, + "grad_norm": 0.5231219530105591, + "learning_rate": 8.779108897563601e-05, + "loss": 3.3114, + "step": 5647 + }, + { + "epoch": 7.22272, + "grad_norm": 0.5083349943161011, + "learning_rate": 8.775070668999864e-05, + "loss": 3.2972, + "step": 5648 + }, + { + "epoch": 7.224, + "grad_norm": 0.5252796411514282, + "learning_rate": 8.771032440436128e-05, + "loss": 3.2982, + "step": 5649 + }, + { + "epoch": 7.22528, + "grad_norm": 0.5133306980133057, + "learning_rate": 8.766994211872391e-05, + "loss": 3.3952, + "step": 5650 + }, + { + "epoch": 7.22656, + "grad_norm": 0.5313632488250732, + "learning_rate": 8.762955983308654e-05, + "loss": 3.4222, + "step": 5651 + }, + { + "epoch": 7.22784, + "grad_norm": 0.5258992910385132, + "learning_rate": 8.758917754744917e-05, + "loss": 3.3971, + "step": 5652 + }, + { + "epoch": 7.22912, + "grad_norm": 0.5275927186012268, + "learning_rate": 8.754879526181182e-05, + "loss": 3.3187, + "step": 5653 + }, + { + "epoch": 7.2304, + "grad_norm": 0.5042412877082825, + "learning_rate": 8.750841297617445e-05, + "loss": 3.3668, + "step": 5654 + }, + { + "epoch": 7.23168, + "grad_norm": 0.528590202331543, + "learning_rate": 8.746803069053708e-05, + "loss": 3.4068, + "step": 5655 + }, + { + "epoch": 7.23296, + "grad_norm": 0.5280982255935669, + "learning_rate": 8.742764840489971e-05, + "loss": 3.4039, + "step": 5656 + }, + { + "epoch": 7.23424, + "grad_norm": 0.5211199522018433, + "learning_rate": 8.738726611926235e-05, + "loss": 3.3706, + "step": 5657 + }, + { + "epoch": 7.23552, + "grad_norm": 0.5214046239852905, + "learning_rate": 8.734688383362498e-05, + "loss": 3.3703, + "step": 5658 + }, + { + "epoch": 7.2368, + "grad_norm": 0.5262265205383301, + "learning_rate": 8.73065015479876e-05, + "loss": 3.2918, + "step": 5659 + }, + { + "epoch": 7.23808, + "grad_norm": 0.534403920173645, + "learning_rate": 8.726611926235024e-05, + "loss": 3.3363, + "step": 5660 + }, + { + "epoch": 7.23936, + "grad_norm": 0.5397913455963135, + "learning_rate": 8.722573697671287e-05, + "loss": 3.3867, + "step": 5661 + }, + { + "epoch": 7.24064, + "grad_norm": 0.5387073755264282, + "learning_rate": 8.718535469107551e-05, + "loss": 3.4315, + "step": 5662 + }, + { + "epoch": 7.24192, + "grad_norm": 0.5351658463478088, + "learning_rate": 8.714497240543813e-05, + "loss": 3.4319, + "step": 5663 + }, + { + "epoch": 7.2432, + "grad_norm": 0.5469005107879639, + "learning_rate": 8.710459011980077e-05, + "loss": 3.4048, + "step": 5664 + }, + { + "epoch": 7.24448, + "grad_norm": 0.539505660533905, + "learning_rate": 8.70642078341634e-05, + "loss": 3.4154, + "step": 5665 + }, + { + "epoch": 7.24576, + "grad_norm": 0.543161928653717, + "learning_rate": 8.702382554852605e-05, + "loss": 3.4247, + "step": 5666 + }, + { + "epoch": 7.24704, + "grad_norm": 0.5213647484779358, + "learning_rate": 8.698344326288866e-05, + "loss": 3.3403, + "step": 5667 + }, + { + "epoch": 7.24832, + "grad_norm": 0.5247822403907776, + "learning_rate": 8.694306097725131e-05, + "loss": 3.3623, + "step": 5668 + }, + { + "epoch": 7.2496, + "grad_norm": 0.5359296202659607, + "learning_rate": 8.690267869161394e-05, + "loss": 3.3437, + "step": 5669 + }, + { + "epoch": 7.25088, + "grad_norm": 0.5125774145126343, + "learning_rate": 8.686229640597658e-05, + "loss": 3.4178, + "step": 5670 + }, + { + "epoch": 7.25216, + "grad_norm": 0.5092083215713501, + "learning_rate": 8.68219141203392e-05, + "loss": 3.3376, + "step": 5671 + }, + { + "epoch": 7.25344, + "grad_norm": 0.5361836552619934, + "learning_rate": 8.678153183470183e-05, + "loss": 3.2985, + "step": 5672 + }, + { + "epoch": 7.25472, + "grad_norm": 0.525852382183075, + "learning_rate": 8.674114954906447e-05, + "loss": 3.3765, + "step": 5673 + }, + { + "epoch": 7.256, + "grad_norm": 0.5261217355728149, + "learning_rate": 8.670076726342709e-05, + "loss": 3.3436, + "step": 5674 + }, + { + "epoch": 7.25728, + "grad_norm": 0.5285188555717468, + "learning_rate": 8.666038497778973e-05, + "loss": 3.4639, + "step": 5675 + }, + { + "epoch": 7.25856, + "grad_norm": 0.5249500274658203, + "learning_rate": 8.662000269215236e-05, + "loss": 3.3356, + "step": 5676 + }, + { + "epoch": 7.25984, + "grad_norm": 0.5341816544532776, + "learning_rate": 8.6579620406515e-05, + "loss": 3.3849, + "step": 5677 + }, + { + "epoch": 7.26112, + "grad_norm": 0.538826048374176, + "learning_rate": 8.653923812087764e-05, + "loss": 3.4035, + "step": 5678 + }, + { + "epoch": 7.2624, + "grad_norm": 0.5222164392471313, + "learning_rate": 8.649885583524027e-05, + "loss": 3.3679, + "step": 5679 + }, + { + "epoch": 7.26368, + "grad_norm": 0.5226588249206543, + "learning_rate": 8.64584735496029e-05, + "loss": 3.4012, + "step": 5680 + }, + { + "epoch": 7.26496, + "grad_norm": 0.5181993246078491, + "learning_rate": 8.641809126396554e-05, + "loss": 3.3866, + "step": 5681 + }, + { + "epoch": 7.26624, + "grad_norm": 0.5294241905212402, + "learning_rate": 8.637770897832817e-05, + "loss": 3.4343, + "step": 5682 + }, + { + "epoch": 7.26752, + "grad_norm": 0.5253875851631165, + "learning_rate": 8.63373266926908e-05, + "loss": 3.3867, + "step": 5683 + }, + { + "epoch": 7.2688, + "grad_norm": 0.5317622423171997, + "learning_rate": 8.629694440705343e-05, + "loss": 3.3478, + "step": 5684 + }, + { + "epoch": 7.27008, + "grad_norm": 0.5254741907119751, + "learning_rate": 8.625656212141606e-05, + "loss": 3.4261, + "step": 5685 + }, + { + "epoch": 7.27136, + "grad_norm": 0.5245876312255859, + "learning_rate": 8.62161798357787e-05, + "loss": 3.3631, + "step": 5686 + }, + { + "epoch": 7.27264, + "grad_norm": 0.5217685699462891, + "learning_rate": 8.617579755014132e-05, + "loss": 3.3236, + "step": 5687 + }, + { + "epoch": 7.27392, + "grad_norm": 0.5240602493286133, + "learning_rate": 8.613541526450396e-05, + "loss": 3.3932, + "step": 5688 + }, + { + "epoch": 7.2752, + "grad_norm": 0.5256776809692383, + "learning_rate": 8.60950329788666e-05, + "loss": 3.4072, + "step": 5689 + }, + { + "epoch": 7.27648, + "grad_norm": 0.5401302576065063, + "learning_rate": 8.605465069322924e-05, + "loss": 3.3593, + "step": 5690 + }, + { + "epoch": 7.27776, + "grad_norm": 0.5390993356704712, + "learning_rate": 8.601426840759185e-05, + "loss": 3.3646, + "step": 5691 + }, + { + "epoch": 7.27904, + "grad_norm": 0.5312697887420654, + "learning_rate": 8.59738861219545e-05, + "loss": 3.4073, + "step": 5692 + }, + { + "epoch": 7.28032, + "grad_norm": 0.5275899171829224, + "learning_rate": 8.593350383631713e-05, + "loss": 3.3039, + "step": 5693 + }, + { + "epoch": 7.2816, + "grad_norm": 0.5374751687049866, + "learning_rate": 8.589312155067977e-05, + "loss": 3.3517, + "step": 5694 + }, + { + "epoch": 7.2828800000000005, + "grad_norm": 0.5126158595085144, + "learning_rate": 8.585273926504239e-05, + "loss": 3.3677, + "step": 5695 + }, + { + "epoch": 7.28416, + "grad_norm": 0.5134846568107605, + "learning_rate": 8.581235697940503e-05, + "loss": 3.2869, + "step": 5696 + }, + { + "epoch": 7.28544, + "grad_norm": 0.5251624584197998, + "learning_rate": 8.577197469376766e-05, + "loss": 3.3785, + "step": 5697 + }, + { + "epoch": 7.28672, + "grad_norm": 0.5131708979606628, + "learning_rate": 8.57315924081303e-05, + "loss": 3.3422, + "step": 5698 + }, + { + "epoch": 7.288, + "grad_norm": 0.5143713355064392, + "learning_rate": 8.569121012249292e-05, + "loss": 3.3171, + "step": 5699 + }, + { + "epoch": 7.28928, + "grad_norm": 0.5281633734703064, + "learning_rate": 8.565082783685555e-05, + "loss": 3.324, + "step": 5700 + }, + { + "epoch": 7.29056, + "grad_norm": 0.5322510600090027, + "learning_rate": 8.56104455512182e-05, + "loss": 3.3639, + "step": 5701 + }, + { + "epoch": 7.29184, + "grad_norm": 0.5279068946838379, + "learning_rate": 8.557006326558081e-05, + "loss": 3.3392, + "step": 5702 + }, + { + "epoch": 7.29312, + "grad_norm": 0.5267346501350403, + "learning_rate": 8.552968097994346e-05, + "loss": 3.4094, + "step": 5703 + }, + { + "epoch": 7.2943999999999996, + "grad_norm": 0.512809693813324, + "learning_rate": 8.548929869430609e-05, + "loss": 3.3388, + "step": 5704 + }, + { + "epoch": 7.29568, + "grad_norm": 0.5133838057518005, + "learning_rate": 8.544891640866873e-05, + "loss": 3.3506, + "step": 5705 + }, + { + "epoch": 7.29696, + "grad_norm": 0.5310163497924805, + "learning_rate": 8.540853412303135e-05, + "loss": 3.3633, + "step": 5706 + }, + { + "epoch": 7.29824, + "grad_norm": 0.5210456848144531, + "learning_rate": 8.536815183739399e-05, + "loss": 3.3905, + "step": 5707 + }, + { + "epoch": 7.29952, + "grad_norm": 0.5178525447845459, + "learning_rate": 8.532776955175662e-05, + "loss": 3.3811, + "step": 5708 + }, + { + "epoch": 7.3008, + "grad_norm": 0.5075005292892456, + "learning_rate": 8.528738726611926e-05, + "loss": 3.4205, + "step": 5709 + }, + { + "epoch": 7.30208, + "grad_norm": 0.5155579447746277, + "learning_rate": 8.524700498048188e-05, + "loss": 3.4302, + "step": 5710 + }, + { + "epoch": 7.30336, + "grad_norm": 0.5333580374717712, + "learning_rate": 8.520662269484452e-05, + "loss": 3.4141, + "step": 5711 + }, + { + "epoch": 7.30464, + "grad_norm": 0.520826518535614, + "learning_rate": 8.516624040920715e-05, + "loss": 3.3472, + "step": 5712 + }, + { + "epoch": 7.30592, + "grad_norm": 0.5248406529426575, + "learning_rate": 8.512585812356978e-05, + "loss": 3.3902, + "step": 5713 + }, + { + "epoch": 7.3072, + "grad_norm": 0.5231212973594666, + "learning_rate": 8.508547583793243e-05, + "loss": 3.376, + "step": 5714 + }, + { + "epoch": 7.30848, + "grad_norm": 0.5204564929008484, + "learning_rate": 8.504509355229504e-05, + "loss": 3.3655, + "step": 5715 + }, + { + "epoch": 7.30976, + "grad_norm": 0.5213709473609924, + "learning_rate": 8.500471126665769e-05, + "loss": 3.3372, + "step": 5716 + }, + { + "epoch": 7.31104, + "grad_norm": 0.5297428369522095, + "learning_rate": 8.496432898102032e-05, + "loss": 3.3941, + "step": 5717 + }, + { + "epoch": 7.31232, + "grad_norm": 0.5178290605545044, + "learning_rate": 8.492394669538296e-05, + "loss": 3.3335, + "step": 5718 + }, + { + "epoch": 7.3136, + "grad_norm": 0.537998616695404, + "learning_rate": 8.488356440974558e-05, + "loss": 3.4031, + "step": 5719 + }, + { + "epoch": 7.31488, + "grad_norm": 0.5190197825431824, + "learning_rate": 8.484318212410822e-05, + "loss": 3.435, + "step": 5720 + }, + { + "epoch": 7.31616, + "grad_norm": 0.5143560767173767, + "learning_rate": 8.480279983847085e-05, + "loss": 3.3524, + "step": 5721 + }, + { + "epoch": 7.31744, + "grad_norm": 0.5168914198875427, + "learning_rate": 8.47624175528335e-05, + "loss": 3.365, + "step": 5722 + }, + { + "epoch": 7.31872, + "grad_norm": 0.5233278870582581, + "learning_rate": 8.472203526719611e-05, + "loss": 3.3914, + "step": 5723 + }, + { + "epoch": 7.32, + "grad_norm": 0.5222857594490051, + "learning_rate": 8.468165298155876e-05, + "loss": 3.445, + "step": 5724 + }, + { + "epoch": 7.32128, + "grad_norm": 0.5074923634529114, + "learning_rate": 8.464127069592139e-05, + "loss": 3.3527, + "step": 5725 + }, + { + "epoch": 7.32256, + "grad_norm": 0.5240098834037781, + "learning_rate": 8.4600888410284e-05, + "loss": 3.33, + "step": 5726 + }, + { + "epoch": 7.32384, + "grad_norm": 0.513421893119812, + "learning_rate": 8.456050612464665e-05, + "loss": 3.3835, + "step": 5727 + }, + { + "epoch": 7.32512, + "grad_norm": 0.524813175201416, + "learning_rate": 8.452012383900928e-05, + "loss": 3.3284, + "step": 5728 + }, + { + "epoch": 7.3264, + "grad_norm": 0.5159763693809509, + "learning_rate": 8.447974155337192e-05, + "loss": 3.3274, + "step": 5729 + }, + { + "epoch": 7.32768, + "grad_norm": 0.5181670784950256, + "learning_rate": 8.443935926773454e-05, + "loss": 3.4098, + "step": 5730 + }, + { + "epoch": 7.32896, + "grad_norm": 0.5126925706863403, + "learning_rate": 8.439897698209718e-05, + "loss": 3.2956, + "step": 5731 + }, + { + "epoch": 7.33024, + "grad_norm": 0.531437873840332, + "learning_rate": 8.435859469645981e-05, + "loss": 3.395, + "step": 5732 + }, + { + "epoch": 7.33152, + "grad_norm": 0.5225799083709717, + "learning_rate": 8.431821241082245e-05, + "loss": 3.3421, + "step": 5733 + }, + { + "epoch": 7.3328, + "grad_norm": 0.5173592567443848, + "learning_rate": 8.427783012518507e-05, + "loss": 3.3345, + "step": 5734 + }, + { + "epoch": 7.33408, + "grad_norm": 0.525583028793335, + "learning_rate": 8.423744783954771e-05, + "loss": 3.4155, + "step": 5735 + }, + { + "epoch": 7.33536, + "grad_norm": 0.5271779894828796, + "learning_rate": 8.419706555391034e-05, + "loss": 3.4121, + "step": 5736 + }, + { + "epoch": 7.33664, + "grad_norm": 0.5142373442649841, + "learning_rate": 8.415668326827299e-05, + "loss": 3.38, + "step": 5737 + }, + { + "epoch": 7.33792, + "grad_norm": 0.5095165967941284, + "learning_rate": 8.41163009826356e-05, + "loss": 3.3846, + "step": 5738 + }, + { + "epoch": 7.3392, + "grad_norm": 0.5181048512458801, + "learning_rate": 8.407591869699823e-05, + "loss": 3.4532, + "step": 5739 + }, + { + "epoch": 7.34048, + "grad_norm": 0.5084232091903687, + "learning_rate": 8.403553641136088e-05, + "loss": 3.3129, + "step": 5740 + }, + { + "epoch": 7.34176, + "grad_norm": 0.5222226977348328, + "learning_rate": 8.399515412572351e-05, + "loss": 3.3317, + "step": 5741 + }, + { + "epoch": 7.34304, + "grad_norm": 0.5268555283546448, + "learning_rate": 8.395477184008614e-05, + "loss": 3.3111, + "step": 5742 + }, + { + "epoch": 7.34432, + "grad_norm": 0.5401803851127625, + "learning_rate": 8.391438955444877e-05, + "loss": 3.3389, + "step": 5743 + }, + { + "epoch": 7.3456, + "grad_norm": 0.49834394454956055, + "learning_rate": 8.387400726881141e-05, + "loss": 3.3859, + "step": 5744 + }, + { + "epoch": 7.34688, + "grad_norm": 0.5210273265838623, + "learning_rate": 8.383362498317404e-05, + "loss": 3.371, + "step": 5745 + }, + { + "epoch": 7.34816, + "grad_norm": 0.5339854955673218, + "learning_rate": 8.379324269753667e-05, + "loss": 3.3539, + "step": 5746 + }, + { + "epoch": 7.3494399999999995, + "grad_norm": 0.5159910917282104, + "learning_rate": 8.37528604118993e-05, + "loss": 3.3968, + "step": 5747 + }, + { + "epoch": 7.35072, + "grad_norm": 0.514874279499054, + "learning_rate": 8.371247812626194e-05, + "loss": 3.4268, + "step": 5748 + }, + { + "epoch": 7.352, + "grad_norm": 0.5367988348007202, + "learning_rate": 8.367209584062457e-05, + "loss": 3.4254, + "step": 5749 + }, + { + "epoch": 7.35328, + "grad_norm": 0.5135504603385925, + "learning_rate": 8.363171355498722e-05, + "loss": 3.4124, + "step": 5750 + }, + { + "epoch": 7.35456, + "grad_norm": 0.5267705321311951, + "learning_rate": 8.359133126934984e-05, + "loss": 3.4418, + "step": 5751 + }, + { + "epoch": 7.35584, + "grad_norm": 0.5178395509719849, + "learning_rate": 8.355094898371248e-05, + "loss": 3.3394, + "step": 5752 + }, + { + "epoch": 7.35712, + "grad_norm": 0.5211403369903564, + "learning_rate": 8.351056669807511e-05, + "loss": 3.3646, + "step": 5753 + }, + { + "epoch": 7.3584, + "grad_norm": 0.5357264876365662, + "learning_rate": 8.347018441243773e-05, + "loss": 3.4218, + "step": 5754 + }, + { + "epoch": 7.35968, + "grad_norm": 0.5282320976257324, + "learning_rate": 8.342980212680037e-05, + "loss": 3.3026, + "step": 5755 + }, + { + "epoch": 7.36096, + "grad_norm": 0.5219427347183228, + "learning_rate": 8.3389419841163e-05, + "loss": 3.3327, + "step": 5756 + }, + { + "epoch": 7.36224, + "grad_norm": 0.5319931507110596, + "learning_rate": 8.334903755552564e-05, + "loss": 3.4432, + "step": 5757 + }, + { + "epoch": 7.36352, + "grad_norm": 0.5045892000198364, + "learning_rate": 8.330865526988826e-05, + "loss": 3.2485, + "step": 5758 + }, + { + "epoch": 7.3648, + "grad_norm": 0.5378910303115845, + "learning_rate": 8.32682729842509e-05, + "loss": 3.3374, + "step": 5759 + }, + { + "epoch": 7.36608, + "grad_norm": 0.5356464982032776, + "learning_rate": 8.322789069861353e-05, + "loss": 3.3517, + "step": 5760 + }, + { + "epoch": 7.36736, + "grad_norm": 0.5344239473342896, + "learning_rate": 8.318750841297618e-05, + "loss": 3.4034, + "step": 5761 + }, + { + "epoch": 7.36864, + "grad_norm": 0.530697226524353, + "learning_rate": 8.314712612733879e-05, + "loss": 3.3704, + "step": 5762 + }, + { + "epoch": 7.3699200000000005, + "grad_norm": 0.5439151525497437, + "learning_rate": 8.310674384170144e-05, + "loss": 3.3844, + "step": 5763 + }, + { + "epoch": 7.3712, + "grad_norm": 0.5485351085662842, + "learning_rate": 8.306636155606407e-05, + "loss": 3.3354, + "step": 5764 + }, + { + "epoch": 7.37248, + "grad_norm": 0.5290150046348572, + "learning_rate": 8.302597927042671e-05, + "loss": 3.3475, + "step": 5765 + }, + { + "epoch": 7.37376, + "grad_norm": 0.5363977551460266, + "learning_rate": 8.298559698478933e-05, + "loss": 3.3585, + "step": 5766 + }, + { + "epoch": 7.37504, + "grad_norm": 0.5304407477378845, + "learning_rate": 8.294521469915196e-05, + "loss": 3.3573, + "step": 5767 + }, + { + "epoch": 7.37632, + "grad_norm": 0.5171595215797424, + "learning_rate": 8.29048324135146e-05, + "loss": 3.3317, + "step": 5768 + }, + { + "epoch": 7.3776, + "grad_norm": 0.5493373870849609, + "learning_rate": 8.286445012787723e-05, + "loss": 3.4143, + "step": 5769 + }, + { + "epoch": 7.37888, + "grad_norm": 0.5111392736434937, + "learning_rate": 8.282406784223986e-05, + "loss": 3.3583, + "step": 5770 + }, + { + "epoch": 7.38016, + "grad_norm": 0.5278791189193726, + "learning_rate": 8.278368555660249e-05, + "loss": 3.3444, + "step": 5771 + }, + { + "epoch": 7.38144, + "grad_norm": 0.5462014079093933, + "learning_rate": 8.274330327096513e-05, + "loss": 3.3613, + "step": 5772 + }, + { + "epoch": 7.38272, + "grad_norm": 0.5223116874694824, + "learning_rate": 8.270292098532776e-05, + "loss": 3.3495, + "step": 5773 + }, + { + "epoch": 7.384, + "grad_norm": 0.5300905108451843, + "learning_rate": 8.26625386996904e-05, + "loss": 3.351, + "step": 5774 + }, + { + "epoch": 7.38528, + "grad_norm": 0.5270613431930542, + "learning_rate": 8.262215641405302e-05, + "loss": 3.3506, + "step": 5775 + }, + { + "epoch": 7.38656, + "grad_norm": 0.5194157958030701, + "learning_rate": 8.258177412841567e-05, + "loss": 3.3194, + "step": 5776 + }, + { + "epoch": 7.38784, + "grad_norm": 0.5169289708137512, + "learning_rate": 8.25413918427783e-05, + "loss": 3.3789, + "step": 5777 + }, + { + "epoch": 7.38912, + "grad_norm": 0.5390181541442871, + "learning_rate": 8.250100955714093e-05, + "loss": 3.3735, + "step": 5778 + }, + { + "epoch": 7.3904, + "grad_norm": 0.5157946348190308, + "learning_rate": 8.246062727150356e-05, + "loss": 3.3887, + "step": 5779 + }, + { + "epoch": 7.39168, + "grad_norm": 0.5124242901802063, + "learning_rate": 8.242024498586619e-05, + "loss": 3.3066, + "step": 5780 + }, + { + "epoch": 7.39296, + "grad_norm": 0.523602306842804, + "learning_rate": 8.237986270022883e-05, + "loss": 3.4837, + "step": 5781 + }, + { + "epoch": 7.39424, + "grad_norm": 0.525345504283905, + "learning_rate": 8.233948041459145e-05, + "loss": 3.343, + "step": 5782 + }, + { + "epoch": 7.39552, + "grad_norm": 0.5302904844284058, + "learning_rate": 8.229909812895409e-05, + "loss": 3.358, + "step": 5783 + }, + { + "epoch": 7.3968, + "grad_norm": 0.5098133087158203, + "learning_rate": 8.225871584331672e-05, + "loss": 3.3791, + "step": 5784 + }, + { + "epoch": 7.39808, + "grad_norm": 0.5192782282829285, + "learning_rate": 8.221833355767937e-05, + "loss": 3.4295, + "step": 5785 + }, + { + "epoch": 7.39936, + "grad_norm": 0.5070282816886902, + "learning_rate": 8.217795127204198e-05, + "loss": 3.3926, + "step": 5786 + }, + { + "epoch": 7.40064, + "grad_norm": 0.5000091791152954, + "learning_rate": 8.213756898640463e-05, + "loss": 3.3154, + "step": 5787 + }, + { + "epoch": 7.40192, + "grad_norm": 0.5293865203857422, + "learning_rate": 8.209718670076726e-05, + "loss": 3.3813, + "step": 5788 + }, + { + "epoch": 7.4032, + "grad_norm": 0.5276476144790649, + "learning_rate": 8.20568044151299e-05, + "loss": 3.3801, + "step": 5789 + }, + { + "epoch": 7.40448, + "grad_norm": 0.5471776127815247, + "learning_rate": 8.201642212949252e-05, + "loss": 3.4044, + "step": 5790 + }, + { + "epoch": 7.40576, + "grad_norm": 0.5328643918037415, + "learning_rate": 8.197603984385516e-05, + "loss": 3.3514, + "step": 5791 + }, + { + "epoch": 7.40704, + "grad_norm": 0.5258887410163879, + "learning_rate": 8.193565755821779e-05, + "loss": 3.3467, + "step": 5792 + }, + { + "epoch": 7.40832, + "grad_norm": 0.5234854221343994, + "learning_rate": 8.18952752725804e-05, + "loss": 3.4287, + "step": 5793 + }, + { + "epoch": 7.4096, + "grad_norm": 0.516001284122467, + "learning_rate": 8.185489298694305e-05, + "loss": 3.3589, + "step": 5794 + }, + { + "epoch": 7.41088, + "grad_norm": 0.5244291424751282, + "learning_rate": 8.181451070130568e-05, + "loss": 3.4387, + "step": 5795 + }, + { + "epoch": 7.41216, + "grad_norm": 0.5443776249885559, + "learning_rate": 8.177412841566832e-05, + "loss": 3.3916, + "step": 5796 + }, + { + "epoch": 7.41344, + "grad_norm": 0.5309216380119324, + "learning_rate": 8.173374613003094e-05, + "loss": 3.3954, + "step": 5797 + }, + { + "epoch": 7.41472, + "grad_norm": 0.5038948059082031, + "learning_rate": 8.169336384439358e-05, + "loss": 3.343, + "step": 5798 + }, + { + "epoch": 7.416, + "grad_norm": 0.52728271484375, + "learning_rate": 8.165298155875621e-05, + "loss": 3.3784, + "step": 5799 + }, + { + "epoch": 7.41728, + "grad_norm": 0.5411390066146851, + "learning_rate": 8.161259927311886e-05, + "loss": 3.4319, + "step": 5800 + }, + { + "epoch": 7.41856, + "grad_norm": 0.5185636878013611, + "learning_rate": 8.157221698748149e-05, + "loss": 3.4297, + "step": 5801 + }, + { + "epoch": 7.41984, + "grad_norm": 0.5342210531234741, + "learning_rate": 8.153183470184412e-05, + "loss": 3.3683, + "step": 5802 + }, + { + "epoch": 7.42112, + "grad_norm": 0.5324079394340515, + "learning_rate": 8.149145241620675e-05, + "loss": 3.3805, + "step": 5803 + }, + { + "epoch": 7.4224, + "grad_norm": 0.520026445388794, + "learning_rate": 8.145107013056939e-05, + "loss": 3.3222, + "step": 5804 + }, + { + "epoch": 7.42368, + "grad_norm": 0.5147387981414795, + "learning_rate": 8.141068784493202e-05, + "loss": 3.3722, + "step": 5805 + }, + { + "epoch": 7.4249600000000004, + "grad_norm": 0.5206732749938965, + "learning_rate": 8.137030555929464e-05, + "loss": 3.4011, + "step": 5806 + }, + { + "epoch": 7.42624, + "grad_norm": 0.5213762521743774, + "learning_rate": 8.132992327365728e-05, + "loss": 3.3676, + "step": 5807 + }, + { + "epoch": 7.42752, + "grad_norm": 0.5201925039291382, + "learning_rate": 8.128954098801991e-05, + "loss": 3.3553, + "step": 5808 + }, + { + "epoch": 7.4288, + "grad_norm": 0.5245599150657654, + "learning_rate": 8.124915870238256e-05, + "loss": 3.4287, + "step": 5809 + }, + { + "epoch": 7.43008, + "grad_norm": 0.5151320695877075, + "learning_rate": 8.120877641674517e-05, + "loss": 3.3672, + "step": 5810 + }, + { + "epoch": 7.43136, + "grad_norm": 0.5449185967445374, + "learning_rate": 8.116839413110782e-05, + "loss": 3.4269, + "step": 5811 + }, + { + "epoch": 7.43264, + "grad_norm": 0.5150814056396484, + "learning_rate": 8.112801184547045e-05, + "loss": 3.4574, + "step": 5812 + }, + { + "epoch": 7.43392, + "grad_norm": 0.5201151371002197, + "learning_rate": 8.108762955983309e-05, + "loss": 3.3769, + "step": 5813 + }, + { + "epoch": 7.4352, + "grad_norm": 0.5087258219718933, + "learning_rate": 8.10472472741957e-05, + "loss": 3.4079, + "step": 5814 + }, + { + "epoch": 7.4364799999999995, + "grad_norm": 0.5286344289779663, + "learning_rate": 8.100686498855835e-05, + "loss": 3.4387, + "step": 5815 + }, + { + "epoch": 7.43776, + "grad_norm": 0.5217651724815369, + "learning_rate": 8.096648270292098e-05, + "loss": 3.3826, + "step": 5816 + }, + { + "epoch": 7.43904, + "grad_norm": 0.5195396542549133, + "learning_rate": 8.092610041728362e-05, + "loss": 3.3419, + "step": 5817 + }, + { + "epoch": 7.44032, + "grad_norm": 0.5105847716331482, + "learning_rate": 8.088571813164624e-05, + "loss": 3.3915, + "step": 5818 + }, + { + "epoch": 7.4416, + "grad_norm": 0.5170384645462036, + "learning_rate": 8.084533584600888e-05, + "loss": 3.3614, + "step": 5819 + }, + { + "epoch": 7.44288, + "grad_norm": 0.5139781832695007, + "learning_rate": 8.080495356037151e-05, + "loss": 3.427, + "step": 5820 + }, + { + "epoch": 7.44416, + "grad_norm": 0.5179570317268372, + "learning_rate": 8.076457127473413e-05, + "loss": 3.4167, + "step": 5821 + }, + { + "epoch": 7.44544, + "grad_norm": 0.5289480090141296, + "learning_rate": 8.072418898909677e-05, + "loss": 3.3394, + "step": 5822 + }, + { + "epoch": 7.44672, + "grad_norm": 0.5202321410179138, + "learning_rate": 8.06838067034594e-05, + "loss": 3.3681, + "step": 5823 + }, + { + "epoch": 7.448, + "grad_norm": 0.5124974846839905, + "learning_rate": 8.064342441782205e-05, + "loss": 3.3676, + "step": 5824 + }, + { + "epoch": 7.44928, + "grad_norm": 0.5194815993309021, + "learning_rate": 8.060304213218466e-05, + "loss": 3.3824, + "step": 5825 + }, + { + "epoch": 7.45056, + "grad_norm": 0.5214881896972656, + "learning_rate": 8.056265984654731e-05, + "loss": 3.3963, + "step": 5826 + }, + { + "epoch": 7.45184, + "grad_norm": 0.5211467742919922, + "learning_rate": 8.052227756090994e-05, + "loss": 3.3754, + "step": 5827 + }, + { + "epoch": 7.45312, + "grad_norm": 0.5255600214004517, + "learning_rate": 8.048189527527258e-05, + "loss": 3.4049, + "step": 5828 + }, + { + "epoch": 7.4544, + "grad_norm": 0.5339398980140686, + "learning_rate": 8.04415129896352e-05, + "loss": 3.3593, + "step": 5829 + }, + { + "epoch": 7.45568, + "grad_norm": 0.5176902413368225, + "learning_rate": 8.040113070399784e-05, + "loss": 3.3843, + "step": 5830 + }, + { + "epoch": 7.45696, + "grad_norm": 0.5297396779060364, + "learning_rate": 8.036074841836047e-05, + "loss": 3.3186, + "step": 5831 + }, + { + "epoch": 7.45824, + "grad_norm": 0.531353235244751, + "learning_rate": 8.032036613272312e-05, + "loss": 3.3528, + "step": 5832 + }, + { + "epoch": 7.45952, + "grad_norm": 0.5194066166877747, + "learning_rate": 8.027998384708573e-05, + "loss": 3.4387, + "step": 5833 + }, + { + "epoch": 7.4608, + "grad_norm": 0.5318957567214966, + "learning_rate": 8.023960156144836e-05, + "loss": 3.3612, + "step": 5834 + }, + { + "epoch": 7.46208, + "grad_norm": 0.5308012366294861, + "learning_rate": 8.0199219275811e-05, + "loss": 3.4369, + "step": 5835 + }, + { + "epoch": 7.46336, + "grad_norm": 0.5104942321777344, + "learning_rate": 8.015883699017364e-05, + "loss": 3.4146, + "step": 5836 + }, + { + "epoch": 7.46464, + "grad_norm": 0.5228284597396851, + "learning_rate": 8.011845470453628e-05, + "loss": 3.3655, + "step": 5837 + }, + { + "epoch": 7.46592, + "grad_norm": 0.5318959951400757, + "learning_rate": 8.00780724188989e-05, + "loss": 3.3857, + "step": 5838 + }, + { + "epoch": 7.4672, + "grad_norm": 0.5241490006446838, + "learning_rate": 8.003769013326154e-05, + "loss": 3.4119, + "step": 5839 + }, + { + "epoch": 7.46848, + "grad_norm": 0.5517506003379822, + "learning_rate": 7.999730784762417e-05, + "loss": 3.395, + "step": 5840 + }, + { + "epoch": 7.46976, + "grad_norm": 0.529964804649353, + "learning_rate": 7.995692556198681e-05, + "loss": 3.3613, + "step": 5841 + }, + { + "epoch": 7.47104, + "grad_norm": 0.5344815850257874, + "learning_rate": 7.991654327634943e-05, + "loss": 3.3307, + "step": 5842 + }, + { + "epoch": 7.47232, + "grad_norm": 0.5124824643135071, + "learning_rate": 7.987616099071207e-05, + "loss": 3.3268, + "step": 5843 + }, + { + "epoch": 7.4736, + "grad_norm": 0.5115407705307007, + "learning_rate": 7.98357787050747e-05, + "loss": 3.3561, + "step": 5844 + }, + { + "epoch": 7.47488, + "grad_norm": 0.5252261757850647, + "learning_rate": 7.979539641943735e-05, + "loss": 3.3923, + "step": 5845 + }, + { + "epoch": 7.47616, + "grad_norm": 0.521026611328125, + "learning_rate": 7.975501413379996e-05, + "loss": 3.3531, + "step": 5846 + }, + { + "epoch": 7.47744, + "grad_norm": 0.5293687582015991, + "learning_rate": 7.97146318481626e-05, + "loss": 3.4022, + "step": 5847 + }, + { + "epoch": 7.47872, + "grad_norm": 0.5290361642837524, + "learning_rate": 7.967424956252524e-05, + "loss": 3.3675, + "step": 5848 + }, + { + "epoch": 7.48, + "grad_norm": 0.5217385292053223, + "learning_rate": 7.963386727688785e-05, + "loss": 3.3527, + "step": 5849 + }, + { + "epoch": 7.48128, + "grad_norm": 0.5071624517440796, + "learning_rate": 7.95934849912505e-05, + "loss": 3.3443, + "step": 5850 + }, + { + "epoch": 7.48256, + "grad_norm": 0.5297966599464417, + "learning_rate": 7.955310270561313e-05, + "loss": 3.3398, + "step": 5851 + }, + { + "epoch": 7.48384, + "grad_norm": 0.5170907378196716, + "learning_rate": 7.951272041997577e-05, + "loss": 3.3573, + "step": 5852 + }, + { + "epoch": 7.48512, + "grad_norm": 0.5123015642166138, + "learning_rate": 7.947233813433839e-05, + "loss": 3.3852, + "step": 5853 + }, + { + "epoch": 7.4864, + "grad_norm": 0.5429021120071411, + "learning_rate": 7.943195584870103e-05, + "loss": 3.435, + "step": 5854 + }, + { + "epoch": 7.48768, + "grad_norm": 0.5190651416778564, + "learning_rate": 7.939157356306366e-05, + "loss": 3.4174, + "step": 5855 + }, + { + "epoch": 7.48896, + "grad_norm": 0.5276857614517212, + "learning_rate": 7.93511912774263e-05, + "loss": 3.368, + "step": 5856 + }, + { + "epoch": 7.49024, + "grad_norm": 0.5189977288246155, + "learning_rate": 7.931080899178892e-05, + "loss": 3.3205, + "step": 5857 + }, + { + "epoch": 7.49152, + "grad_norm": 0.5297253131866455, + "learning_rate": 7.927042670615156e-05, + "loss": 3.3809, + "step": 5858 + }, + { + "epoch": 7.4928, + "grad_norm": 0.513972818851471, + "learning_rate": 7.92300444205142e-05, + "loss": 3.3473, + "step": 5859 + }, + { + "epoch": 7.49408, + "grad_norm": 0.539738118648529, + "learning_rate": 7.918966213487682e-05, + "loss": 3.3157, + "step": 5860 + }, + { + "epoch": 7.49536, + "grad_norm": 0.5355290174484253, + "learning_rate": 7.914927984923946e-05, + "loss": 3.4172, + "step": 5861 + }, + { + "epoch": 7.49664, + "grad_norm": 0.5001344084739685, + "learning_rate": 7.910889756360209e-05, + "loss": 3.2782, + "step": 5862 + }, + { + "epoch": 7.49792, + "grad_norm": 0.5256953835487366, + "learning_rate": 7.906851527796473e-05, + "loss": 3.3973, + "step": 5863 + }, + { + "epoch": 7.4992, + "grad_norm": 0.526117205619812, + "learning_rate": 7.902813299232736e-05, + "loss": 3.3484, + "step": 5864 + }, + { + "epoch": 7.50048, + "grad_norm": 0.5159139037132263, + "learning_rate": 7.898775070668999e-05, + "loss": 3.2791, + "step": 5865 + }, + { + "epoch": 7.50176, + "grad_norm": 0.5434691309928894, + "learning_rate": 7.894736842105262e-05, + "loss": 3.3503, + "step": 5866 + }, + { + "epoch": 7.50304, + "grad_norm": 0.5286847949028015, + "learning_rate": 7.890698613541526e-05, + "loss": 3.3695, + "step": 5867 + }, + { + "epoch": 7.50432, + "grad_norm": 0.5285123586654663, + "learning_rate": 7.886660384977789e-05, + "loss": 3.4208, + "step": 5868 + }, + { + "epoch": 7.5056, + "grad_norm": 0.5370778441429138, + "learning_rate": 7.882622156414052e-05, + "loss": 3.4278, + "step": 5869 + }, + { + "epoch": 7.50688, + "grad_norm": 0.5134227275848389, + "learning_rate": 7.878583927850315e-05, + "loss": 3.3727, + "step": 5870 + }, + { + "epoch": 7.50816, + "grad_norm": 0.5174378156661987, + "learning_rate": 7.87454569928658e-05, + "loss": 3.3265, + "step": 5871 + }, + { + "epoch": 7.50944, + "grad_norm": 0.5034676194190979, + "learning_rate": 7.870507470722843e-05, + "loss": 3.3688, + "step": 5872 + }, + { + "epoch": 7.51072, + "grad_norm": 0.5375760197639465, + "learning_rate": 7.866469242159107e-05, + "loss": 3.3616, + "step": 5873 + }, + { + "epoch": 7.5120000000000005, + "grad_norm": 0.5142561197280884, + "learning_rate": 7.862431013595369e-05, + "loss": 3.3702, + "step": 5874 + }, + { + "epoch": 7.51328, + "grad_norm": 0.5261555314064026, + "learning_rate": 7.858392785031632e-05, + "loss": 3.372, + "step": 5875 + }, + { + "epoch": 7.51456, + "grad_norm": 0.5355465412139893, + "learning_rate": 7.854354556467896e-05, + "loss": 3.3866, + "step": 5876 + }, + { + "epoch": 7.51584, + "grad_norm": 0.5262174010276794, + "learning_rate": 7.850316327904158e-05, + "loss": 3.4209, + "step": 5877 + }, + { + "epoch": 7.51712, + "grad_norm": 0.5196813344955444, + "learning_rate": 7.846278099340422e-05, + "loss": 3.4005, + "step": 5878 + }, + { + "epoch": 7.5184, + "grad_norm": 0.5177279114723206, + "learning_rate": 7.842239870776685e-05, + "loss": 3.364, + "step": 5879 + }, + { + "epoch": 7.51968, + "grad_norm": 0.5248557329177856, + "learning_rate": 7.83820164221295e-05, + "loss": 3.3458, + "step": 5880 + }, + { + "epoch": 7.52096, + "grad_norm": 0.5306951999664307, + "learning_rate": 7.834163413649211e-05, + "loss": 3.3503, + "step": 5881 + }, + { + "epoch": 7.52224, + "grad_norm": 0.5404332876205444, + "learning_rate": 7.830125185085475e-05, + "loss": 3.2522, + "step": 5882 + }, + { + "epoch": 7.5235199999999995, + "grad_norm": 0.5501363277435303, + "learning_rate": 7.826086956521738e-05, + "loss": 3.3744, + "step": 5883 + }, + { + "epoch": 7.5248, + "grad_norm": 0.5222617387771606, + "learning_rate": 7.822048727958003e-05, + "loss": 3.3439, + "step": 5884 + }, + { + "epoch": 7.52608, + "grad_norm": 0.5240084528923035, + "learning_rate": 7.818010499394264e-05, + "loss": 3.3858, + "step": 5885 + }, + { + "epoch": 7.52736, + "grad_norm": 0.5329712629318237, + "learning_rate": 7.813972270830529e-05, + "loss": 3.3657, + "step": 5886 + }, + { + "epoch": 7.52864, + "grad_norm": 0.5383173823356628, + "learning_rate": 7.809934042266792e-05, + "loss": 3.4141, + "step": 5887 + }, + { + "epoch": 7.52992, + "grad_norm": 0.5195038318634033, + "learning_rate": 7.805895813703055e-05, + "loss": 3.43, + "step": 5888 + }, + { + "epoch": 7.5312, + "grad_norm": 0.5309056043624878, + "learning_rate": 7.801857585139318e-05, + "loss": 3.3459, + "step": 5889 + }, + { + "epoch": 7.53248, + "grad_norm": 0.5201624631881714, + "learning_rate": 7.797819356575581e-05, + "loss": 3.3979, + "step": 5890 + }, + { + "epoch": 7.53376, + "grad_norm": 0.520959198474884, + "learning_rate": 7.793781128011845e-05, + "loss": 3.3557, + "step": 5891 + }, + { + "epoch": 7.53504, + "grad_norm": 0.5142325758934021, + "learning_rate": 7.789742899448108e-05, + "loss": 3.4258, + "step": 5892 + }, + { + "epoch": 7.53632, + "grad_norm": 0.5339785814285278, + "learning_rate": 7.785704670884371e-05, + "loss": 3.3424, + "step": 5893 + }, + { + "epoch": 7.5376, + "grad_norm": 0.5324097275733948, + "learning_rate": 7.781666442320634e-05, + "loss": 3.3253, + "step": 5894 + }, + { + "epoch": 7.53888, + "grad_norm": 0.5203735828399658, + "learning_rate": 7.777628213756899e-05, + "loss": 3.422, + "step": 5895 + }, + { + "epoch": 7.54016, + "grad_norm": 0.5267855525016785, + "learning_rate": 7.773589985193162e-05, + "loss": 3.3743, + "step": 5896 + }, + { + "epoch": 7.54144, + "grad_norm": 0.51844722032547, + "learning_rate": 7.769551756629425e-05, + "loss": 3.2686, + "step": 5897 + }, + { + "epoch": 7.54272, + "grad_norm": 0.5209645628929138, + "learning_rate": 7.765513528065688e-05, + "loss": 3.3467, + "step": 5898 + }, + { + "epoch": 7.5440000000000005, + "grad_norm": 0.530583918094635, + "learning_rate": 7.761475299501952e-05, + "loss": 3.437, + "step": 5899 + }, + { + "epoch": 7.54528, + "grad_norm": 0.5081465840339661, + "learning_rate": 7.757437070938215e-05, + "loss": 3.3694, + "step": 5900 + }, + { + "epoch": 7.54656, + "grad_norm": 0.5179983973503113, + "learning_rate": 7.753398842374477e-05, + "loss": 3.3602, + "step": 5901 + }, + { + "epoch": 7.54784, + "grad_norm": 0.5412976145744324, + "learning_rate": 7.749360613810741e-05, + "loss": 3.3402, + "step": 5902 + }, + { + "epoch": 7.54912, + "grad_norm": 0.5343058705329895, + "learning_rate": 7.745322385247004e-05, + "loss": 3.4374, + "step": 5903 + }, + { + "epoch": 7.5504, + "grad_norm": 0.5409019589424133, + "learning_rate": 7.741284156683268e-05, + "loss": 3.323, + "step": 5904 + }, + { + "epoch": 7.55168, + "grad_norm": 0.5322821736335754, + "learning_rate": 7.73724592811953e-05, + "loss": 3.4257, + "step": 5905 + }, + { + "epoch": 7.55296, + "grad_norm": 0.5157960653305054, + "learning_rate": 7.733207699555794e-05, + "loss": 3.3649, + "step": 5906 + }, + { + "epoch": 7.55424, + "grad_norm": 0.5299696922302246, + "learning_rate": 7.729169470992057e-05, + "loss": 3.3594, + "step": 5907 + }, + { + "epoch": 7.55552, + "grad_norm": 0.5254634618759155, + "learning_rate": 7.725131242428322e-05, + "loss": 3.4228, + "step": 5908 + }, + { + "epoch": 7.5568, + "grad_norm": 0.5235505104064941, + "learning_rate": 7.721093013864583e-05, + "loss": 3.357, + "step": 5909 + }, + { + "epoch": 7.55808, + "grad_norm": 0.5225314497947693, + "learning_rate": 7.717054785300848e-05, + "loss": 3.3528, + "step": 5910 + }, + { + "epoch": 7.55936, + "grad_norm": 0.51315838098526, + "learning_rate": 7.713016556737111e-05, + "loss": 3.4245, + "step": 5911 + }, + { + "epoch": 7.56064, + "grad_norm": 0.536595344543457, + "learning_rate": 7.708978328173375e-05, + "loss": 3.3816, + "step": 5912 + }, + { + "epoch": 7.56192, + "grad_norm": 0.5261996984481812, + "learning_rate": 7.704940099609637e-05, + "loss": 3.3648, + "step": 5913 + }, + { + "epoch": 7.5632, + "grad_norm": 0.5061861276626587, + "learning_rate": 7.7009018710459e-05, + "loss": 3.4095, + "step": 5914 + }, + { + "epoch": 7.56448, + "grad_norm": 0.52558434009552, + "learning_rate": 7.696863642482164e-05, + "loss": 3.4055, + "step": 5915 + }, + { + "epoch": 7.56576, + "grad_norm": 0.5246904492378235, + "learning_rate": 7.692825413918426e-05, + "loss": 3.341, + "step": 5916 + }, + { + "epoch": 7.56704, + "grad_norm": 0.5209987759590149, + "learning_rate": 7.68878718535469e-05, + "loss": 3.3632, + "step": 5917 + }, + { + "epoch": 7.56832, + "grad_norm": 0.5232482552528381, + "learning_rate": 7.684748956790953e-05, + "loss": 3.3235, + "step": 5918 + }, + { + "epoch": 7.5696, + "grad_norm": 0.5276979207992554, + "learning_rate": 7.680710728227218e-05, + "loss": 3.4077, + "step": 5919 + }, + { + "epoch": 7.57088, + "grad_norm": 0.5282018184661865, + "learning_rate": 7.676672499663479e-05, + "loss": 3.2798, + "step": 5920 + }, + { + "epoch": 7.57216, + "grad_norm": 0.5135851502418518, + "learning_rate": 7.672634271099744e-05, + "loss": 3.4252, + "step": 5921 + }, + { + "epoch": 7.57344, + "grad_norm": 0.5193025469779968, + "learning_rate": 7.668596042536007e-05, + "loss": 3.2833, + "step": 5922 + }, + { + "epoch": 7.57472, + "grad_norm": 0.5133464336395264, + "learning_rate": 7.664557813972271e-05, + "loss": 3.3827, + "step": 5923 + }, + { + "epoch": 7.576, + "grad_norm": 0.5185580253601074, + "learning_rate": 7.660519585408534e-05, + "loss": 3.3816, + "step": 5924 + }, + { + "epoch": 7.57728, + "grad_norm": 0.5164629220962524, + "learning_rate": 7.656481356844797e-05, + "loss": 3.4205, + "step": 5925 + }, + { + "epoch": 7.5785599999999995, + "grad_norm": 0.5288913249969482, + "learning_rate": 7.65244312828106e-05, + "loss": 3.3824, + "step": 5926 + }, + { + "epoch": 7.57984, + "grad_norm": 0.518733024597168, + "learning_rate": 7.648404899717323e-05, + "loss": 3.3578, + "step": 5927 + }, + { + "epoch": 7.58112, + "grad_norm": 0.5246930718421936, + "learning_rate": 7.644366671153587e-05, + "loss": 3.3632, + "step": 5928 + }, + { + "epoch": 7.5824, + "grad_norm": 0.5315341949462891, + "learning_rate": 7.640328442589849e-05, + "loss": 3.3564, + "step": 5929 + }, + { + "epoch": 7.58368, + "grad_norm": 0.5214976668357849, + "learning_rate": 7.636290214026113e-05, + "loss": 3.3396, + "step": 5930 + }, + { + "epoch": 7.58496, + "grad_norm": 0.5225616097450256, + "learning_rate": 7.632251985462376e-05, + "loss": 3.4089, + "step": 5931 + }, + { + "epoch": 7.58624, + "grad_norm": 0.5472208857536316, + "learning_rate": 7.628213756898641e-05, + "loss": 3.397, + "step": 5932 + }, + { + "epoch": 7.58752, + "grad_norm": 0.5345306396484375, + "learning_rate": 7.624175528334902e-05, + "loss": 3.395, + "step": 5933 + }, + { + "epoch": 7.5888, + "grad_norm": 0.5214905142784119, + "learning_rate": 7.620137299771167e-05, + "loss": 3.3558, + "step": 5934 + }, + { + "epoch": 7.59008, + "grad_norm": 0.53025221824646, + "learning_rate": 7.61609907120743e-05, + "loss": 3.3575, + "step": 5935 + }, + { + "epoch": 7.59136, + "grad_norm": 0.5355333685874939, + "learning_rate": 7.612060842643694e-05, + "loss": 3.3473, + "step": 5936 + }, + { + "epoch": 7.59264, + "grad_norm": 0.5342869758605957, + "learning_rate": 7.608022614079956e-05, + "loss": 3.3687, + "step": 5937 + }, + { + "epoch": 7.59392, + "grad_norm": 0.5135994553565979, + "learning_rate": 7.60398438551622e-05, + "loss": 3.3285, + "step": 5938 + }, + { + "epoch": 7.5952, + "grad_norm": 0.523440957069397, + "learning_rate": 7.599946156952483e-05, + "loss": 3.3989, + "step": 5939 + }, + { + "epoch": 7.59648, + "grad_norm": 0.5180022716522217, + "learning_rate": 7.595907928388747e-05, + "loss": 3.3353, + "step": 5940 + }, + { + "epoch": 7.59776, + "grad_norm": 0.5203397870063782, + "learning_rate": 7.591869699825009e-05, + "loss": 3.3903, + "step": 5941 + }, + { + "epoch": 7.5990400000000005, + "grad_norm": 0.5081101059913635, + "learning_rate": 7.587831471261272e-05, + "loss": 3.3339, + "step": 5942 + }, + { + "epoch": 7.60032, + "grad_norm": 0.5131347179412842, + "learning_rate": 7.583793242697537e-05, + "loss": 3.4133, + "step": 5943 + }, + { + "epoch": 7.6016, + "grad_norm": 0.5185050964355469, + "learning_rate": 7.579755014133798e-05, + "loss": 3.2982, + "step": 5944 + }, + { + "epoch": 7.60288, + "grad_norm": 0.5262473821640015, + "learning_rate": 7.575716785570063e-05, + "loss": 3.4203, + "step": 5945 + }, + { + "epoch": 7.60416, + "grad_norm": 0.5225714445114136, + "learning_rate": 7.571678557006326e-05, + "loss": 3.4279, + "step": 5946 + }, + { + "epoch": 7.60544, + "grad_norm": 0.5234572887420654, + "learning_rate": 7.56764032844259e-05, + "loss": 3.4388, + "step": 5947 + }, + { + "epoch": 7.60672, + "grad_norm": 0.51933753490448, + "learning_rate": 7.563602099878852e-05, + "loss": 3.4044, + "step": 5948 + }, + { + "epoch": 7.608, + "grad_norm": 0.5347757339477539, + "learning_rate": 7.559563871315116e-05, + "loss": 3.4157, + "step": 5949 + }, + { + "epoch": 7.60928, + "grad_norm": 0.5258873105049133, + "learning_rate": 7.555525642751379e-05, + "loss": 3.4091, + "step": 5950 + }, + { + "epoch": 7.6105599999999995, + "grad_norm": 0.5191522836685181, + "learning_rate": 7.551487414187643e-05, + "loss": 3.2995, + "step": 5951 + }, + { + "epoch": 7.61184, + "grad_norm": 0.5296809673309326, + "learning_rate": 7.547449185623905e-05, + "loss": 3.4695, + "step": 5952 + }, + { + "epoch": 7.61312, + "grad_norm": 0.5205982327461243, + "learning_rate": 7.543410957060169e-05, + "loss": 3.3205, + "step": 5953 + }, + { + "epoch": 7.6144, + "grad_norm": 0.5297776460647583, + "learning_rate": 7.539372728496432e-05, + "loss": 3.4047, + "step": 5954 + }, + { + "epoch": 7.61568, + "grad_norm": 0.5211127996444702, + "learning_rate": 7.535334499932695e-05, + "loss": 3.2961, + "step": 5955 + }, + { + "epoch": 7.61696, + "grad_norm": 0.5078245401382446, + "learning_rate": 7.531296271368958e-05, + "loss": 3.3458, + "step": 5956 + }, + { + "epoch": 7.61824, + "grad_norm": 0.5190492272377014, + "learning_rate": 7.527258042805221e-05, + "loss": 3.3822, + "step": 5957 + }, + { + "epoch": 7.61952, + "grad_norm": 0.506162703037262, + "learning_rate": 7.523219814241486e-05, + "loss": 3.3511, + "step": 5958 + }, + { + "epoch": 7.6208, + "grad_norm": 0.5184814929962158, + "learning_rate": 7.519181585677749e-05, + "loss": 3.3438, + "step": 5959 + }, + { + "epoch": 7.62208, + "grad_norm": 0.516089916229248, + "learning_rate": 7.515143357114012e-05, + "loss": 3.4503, + "step": 5960 + }, + { + "epoch": 7.62336, + "grad_norm": 0.5319845676422119, + "learning_rate": 7.511105128550275e-05, + "loss": 3.3838, + "step": 5961 + }, + { + "epoch": 7.62464, + "grad_norm": 0.5250706076622009, + "learning_rate": 7.507066899986539e-05, + "loss": 3.4235, + "step": 5962 + }, + { + "epoch": 7.62592, + "grad_norm": 0.512394368648529, + "learning_rate": 7.503028671422802e-05, + "loss": 3.3163, + "step": 5963 + }, + { + "epoch": 7.6272, + "grad_norm": 0.5351422429084778, + "learning_rate": 7.498990442859065e-05, + "loss": 3.4371, + "step": 5964 + }, + { + "epoch": 7.62848, + "grad_norm": 0.5451759696006775, + "learning_rate": 7.494952214295328e-05, + "loss": 3.3743, + "step": 5965 + }, + { + "epoch": 7.62976, + "grad_norm": 0.5345867872238159, + "learning_rate": 7.490913985731591e-05, + "loss": 3.3628, + "step": 5966 + }, + { + "epoch": 7.6310400000000005, + "grad_norm": 0.5288296341896057, + "learning_rate": 7.486875757167855e-05, + "loss": 3.3488, + "step": 5967 + }, + { + "epoch": 7.63232, + "grad_norm": 0.5270797610282898, + "learning_rate": 7.482837528604118e-05, + "loss": 3.3402, + "step": 5968 + }, + { + "epoch": 7.6336, + "grad_norm": 0.5452633500099182, + "learning_rate": 7.478799300040381e-05, + "loss": 3.3927, + "step": 5969 + }, + { + "epoch": 7.63488, + "grad_norm": 0.5138344764709473, + "learning_rate": 7.474761071476644e-05, + "loss": 3.3503, + "step": 5970 + }, + { + "epoch": 7.63616, + "grad_norm": 0.5245453715324402, + "learning_rate": 7.470722842912909e-05, + "loss": 3.3953, + "step": 5971 + }, + { + "epoch": 7.63744, + "grad_norm": 0.5355793237686157, + "learning_rate": 7.466684614349172e-05, + "loss": 3.3593, + "step": 5972 + }, + { + "epoch": 7.63872, + "grad_norm": 0.5104573369026184, + "learning_rate": 7.462646385785435e-05, + "loss": 3.347, + "step": 5973 + }, + { + "epoch": 7.64, + "grad_norm": 0.5135825276374817, + "learning_rate": 7.458608157221698e-05, + "loss": 3.3998, + "step": 5974 + }, + { + "epoch": 7.64128, + "grad_norm": 0.5211784243583679, + "learning_rate": 7.454569928657961e-05, + "loss": 3.3404, + "step": 5975 + }, + { + "epoch": 7.64256, + "grad_norm": 0.5213080644607544, + "learning_rate": 7.450531700094224e-05, + "loss": 3.41, + "step": 5976 + }, + { + "epoch": 7.64384, + "grad_norm": 0.518886923789978, + "learning_rate": 7.446493471530488e-05, + "loss": 3.41, + "step": 5977 + }, + { + "epoch": 7.64512, + "grad_norm": 0.5232623815536499, + "learning_rate": 7.442455242966751e-05, + "loss": 3.324, + "step": 5978 + }, + { + "epoch": 7.6464, + "grad_norm": 0.5307015776634216, + "learning_rate": 7.438417014403014e-05, + "loss": 3.3834, + "step": 5979 + }, + { + "epoch": 7.64768, + "grad_norm": 0.5195972323417664, + "learning_rate": 7.434378785839277e-05, + "loss": 3.4509, + "step": 5980 + }, + { + "epoch": 7.64896, + "grad_norm": 0.5420621037483215, + "learning_rate": 7.430340557275542e-05, + "loss": 3.3845, + "step": 5981 + }, + { + "epoch": 7.65024, + "grad_norm": 0.5237659215927124, + "learning_rate": 7.426302328711805e-05, + "loss": 3.3393, + "step": 5982 + }, + { + "epoch": 7.65152, + "grad_norm": 0.5215779542922974, + "learning_rate": 7.422264100148068e-05, + "loss": 3.3277, + "step": 5983 + }, + { + "epoch": 7.6528, + "grad_norm": 0.5218585729598999, + "learning_rate": 7.41822587158433e-05, + "loss": 3.3724, + "step": 5984 + }, + { + "epoch": 7.65408, + "grad_norm": 0.5390816926956177, + "learning_rate": 7.414187643020595e-05, + "loss": 3.3104, + "step": 5985 + }, + { + "epoch": 7.65536, + "grad_norm": 0.5215725302696228, + "learning_rate": 7.410149414456858e-05, + "loss": 3.3578, + "step": 5986 + }, + { + "epoch": 7.65664, + "grad_norm": 0.52614825963974, + "learning_rate": 7.406111185893121e-05, + "loss": 3.3742, + "step": 5987 + }, + { + "epoch": 7.65792, + "grad_norm": 0.5331546664237976, + "learning_rate": 7.402072957329384e-05, + "loss": 3.4524, + "step": 5988 + }, + { + "epoch": 7.6592, + "grad_norm": 0.5238944888114929, + "learning_rate": 7.398034728765647e-05, + "loss": 3.3707, + "step": 5989 + }, + { + "epoch": 7.66048, + "grad_norm": 0.5260823965072632, + "learning_rate": 7.39399650020191e-05, + "loss": 3.4557, + "step": 5990 + }, + { + "epoch": 7.66176, + "grad_norm": 0.5286345481872559, + "learning_rate": 7.389958271638174e-05, + "loss": 3.3191, + "step": 5991 + }, + { + "epoch": 7.66304, + "grad_norm": 0.5183789134025574, + "learning_rate": 7.385920043074437e-05, + "loss": 3.3766, + "step": 5992 + }, + { + "epoch": 7.66432, + "grad_norm": 0.5158443450927734, + "learning_rate": 7.3818818145107e-05, + "loss": 3.3405, + "step": 5993 + }, + { + "epoch": 7.6655999999999995, + "grad_norm": 0.5058552026748657, + "learning_rate": 7.377843585946963e-05, + "loss": 3.3266, + "step": 5994 + }, + { + "epoch": 7.66688, + "grad_norm": 0.5316519737243652, + "learning_rate": 7.373805357383228e-05, + "loss": 3.4087, + "step": 5995 + }, + { + "epoch": 7.66816, + "grad_norm": 0.5409033298492432, + "learning_rate": 7.369767128819491e-05, + "loss": 3.3138, + "step": 5996 + }, + { + "epoch": 7.66944, + "grad_norm": 0.5183295607566833, + "learning_rate": 7.365728900255754e-05, + "loss": 3.3506, + "step": 5997 + }, + { + "epoch": 7.67072, + "grad_norm": 0.5203860998153687, + "learning_rate": 7.361690671692017e-05, + "loss": 3.3259, + "step": 5998 + }, + { + "epoch": 7.672, + "grad_norm": 0.5194439888000488, + "learning_rate": 7.357652443128281e-05, + "loss": 3.3583, + "step": 5999 + }, + { + "epoch": 7.67328, + "grad_norm": 0.5380210280418396, + "learning_rate": 7.353614214564544e-05, + "loss": 3.3976, + "step": 6000 + }, + { + "epoch": 7.67456, + "grad_norm": 0.5133769512176514, + "learning_rate": 7.349575986000807e-05, + "loss": 3.3776, + "step": 6001 + }, + { + "epoch": 7.67584, + "grad_norm": 0.5389159321784973, + "learning_rate": 7.34553775743707e-05, + "loss": 3.3787, + "step": 6002 + }, + { + "epoch": 7.67712, + "grad_norm": 0.5545948147773743, + "learning_rate": 7.341499528873333e-05, + "loss": 3.3445, + "step": 6003 + }, + { + "epoch": 7.6784, + "grad_norm": 0.5201895236968994, + "learning_rate": 7.337461300309596e-05, + "loss": 3.2763, + "step": 6004 + }, + { + "epoch": 7.67968, + "grad_norm": 0.538422167301178, + "learning_rate": 7.33342307174586e-05, + "loss": 3.3572, + "step": 6005 + }, + { + "epoch": 7.68096, + "grad_norm": 0.5424717664718628, + "learning_rate": 7.329384843182124e-05, + "loss": 3.3556, + "step": 6006 + }, + { + "epoch": 7.68224, + "grad_norm": 0.5273032784461975, + "learning_rate": 7.325346614618387e-05, + "loss": 3.3478, + "step": 6007 + }, + { + "epoch": 7.68352, + "grad_norm": 0.5107418298721313, + "learning_rate": 7.32130838605465e-05, + "loss": 3.4369, + "step": 6008 + }, + { + "epoch": 7.6848, + "grad_norm": 0.5244372487068176, + "learning_rate": 7.317270157490914e-05, + "loss": 3.3214, + "step": 6009 + }, + { + "epoch": 7.6860800000000005, + "grad_norm": 0.5354893207550049, + "learning_rate": 7.313231928927177e-05, + "loss": 3.3396, + "step": 6010 + }, + { + "epoch": 7.68736, + "grad_norm": 0.5245274901390076, + "learning_rate": 7.30919370036344e-05, + "loss": 3.423, + "step": 6011 + }, + { + "epoch": 7.68864, + "grad_norm": 0.528121829032898, + "learning_rate": 7.305155471799703e-05, + "loss": 3.4042, + "step": 6012 + }, + { + "epoch": 7.68992, + "grad_norm": 0.5323206782341003, + "learning_rate": 7.301117243235967e-05, + "loss": 3.4324, + "step": 6013 + }, + { + "epoch": 7.6912, + "grad_norm": 0.5250928401947021, + "learning_rate": 7.29707901467223e-05, + "loss": 3.3946, + "step": 6014 + }, + { + "epoch": 7.69248, + "grad_norm": 0.5359845757484436, + "learning_rate": 7.293040786108493e-05, + "loss": 3.3705, + "step": 6015 + }, + { + "epoch": 7.69376, + "grad_norm": 0.5194463729858398, + "learning_rate": 7.289002557544756e-05, + "loss": 3.3554, + "step": 6016 + }, + { + "epoch": 7.69504, + "grad_norm": 0.5283463597297668, + "learning_rate": 7.28496432898102e-05, + "loss": 3.3502, + "step": 6017 + }, + { + "epoch": 7.69632, + "grad_norm": 0.5234790444374084, + "learning_rate": 7.280926100417282e-05, + "loss": 3.3667, + "step": 6018 + }, + { + "epoch": 7.6975999999999996, + "grad_norm": 0.5353789329528809, + "learning_rate": 7.276887871853547e-05, + "loss": 3.4113, + "step": 6019 + }, + { + "epoch": 7.69888, + "grad_norm": 0.5060636401176453, + "learning_rate": 7.27284964328981e-05, + "loss": 3.4047, + "step": 6020 + }, + { + "epoch": 7.70016, + "grad_norm": 0.5186939239501953, + "learning_rate": 7.268811414726073e-05, + "loss": 3.3966, + "step": 6021 + }, + { + "epoch": 7.70144, + "grad_norm": 0.5250869989395142, + "learning_rate": 7.264773186162336e-05, + "loss": 3.4134, + "step": 6022 + }, + { + "epoch": 7.70272, + "grad_norm": 0.5262473821640015, + "learning_rate": 7.2607349575986e-05, + "loss": 3.3685, + "step": 6023 + }, + { + "epoch": 7.704, + "grad_norm": 0.5311341881752014, + "learning_rate": 7.256696729034863e-05, + "loss": 3.3809, + "step": 6024 + }, + { + "epoch": 7.70528, + "grad_norm": 0.522276759147644, + "learning_rate": 7.252658500471126e-05, + "loss": 3.3186, + "step": 6025 + }, + { + "epoch": 7.70656, + "grad_norm": 0.5295814871788025, + "learning_rate": 7.248620271907389e-05, + "loss": 3.3868, + "step": 6026 + }, + { + "epoch": 7.70784, + "grad_norm": 0.5124343037605286, + "learning_rate": 7.244582043343654e-05, + "loss": 3.4317, + "step": 6027 + }, + { + "epoch": 7.70912, + "grad_norm": 0.5283524990081787, + "learning_rate": 7.240543814779915e-05, + "loss": 3.3263, + "step": 6028 + }, + { + "epoch": 7.7104, + "grad_norm": 0.5237511992454529, + "learning_rate": 7.236505586216178e-05, + "loss": 3.3889, + "step": 6029 + }, + { + "epoch": 7.71168, + "grad_norm": 0.5321406126022339, + "learning_rate": 7.232467357652443e-05, + "loss": 3.3616, + "step": 6030 + }, + { + "epoch": 7.71296, + "grad_norm": 0.5302574634552002, + "learning_rate": 7.228429129088706e-05, + "loss": 3.4753, + "step": 6031 + }, + { + "epoch": 7.71424, + "grad_norm": 0.5124385356903076, + "learning_rate": 7.224390900524969e-05, + "loss": 3.3905, + "step": 6032 + }, + { + "epoch": 7.71552, + "grad_norm": 0.5328637361526489, + "learning_rate": 7.220352671961233e-05, + "loss": 3.4005, + "step": 6033 + }, + { + "epoch": 7.7168, + "grad_norm": 0.5470924377441406, + "learning_rate": 7.216314443397496e-05, + "loss": 3.3782, + "step": 6034 + }, + { + "epoch": 7.7180800000000005, + "grad_norm": 0.553785502910614, + "learning_rate": 7.212276214833759e-05, + "loss": 3.3522, + "step": 6035 + }, + { + "epoch": 7.71936, + "grad_norm": 0.533015251159668, + "learning_rate": 7.208237986270022e-05, + "loss": 3.3962, + "step": 6036 + }, + { + "epoch": 7.7206399999999995, + "grad_norm": 0.5438454747200012, + "learning_rate": 7.204199757706286e-05, + "loss": 3.3756, + "step": 6037 + }, + { + "epoch": 7.72192, + "grad_norm": 0.5408405661582947, + "learning_rate": 7.20016152914255e-05, + "loss": 3.3191, + "step": 6038 + }, + { + "epoch": 7.7232, + "grad_norm": 0.543483316898346, + "learning_rate": 7.196123300578812e-05, + "loss": 3.4062, + "step": 6039 + }, + { + "epoch": 7.72448, + "grad_norm": 0.49515143036842346, + "learning_rate": 7.192085072015075e-05, + "loss": 3.3466, + "step": 6040 + }, + { + "epoch": 7.72576, + "grad_norm": 0.5400490164756775, + "learning_rate": 7.188046843451338e-05, + "loss": 3.3837, + "step": 6041 + }, + { + "epoch": 7.72704, + "grad_norm": 0.5192726850509644, + "learning_rate": 7.184008614887601e-05, + "loss": 3.3819, + "step": 6042 + }, + { + "epoch": 7.72832, + "grad_norm": 0.5206571221351624, + "learning_rate": 7.179970386323864e-05, + "loss": 3.4132, + "step": 6043 + }, + { + "epoch": 7.7296, + "grad_norm": 0.5262637138366699, + "learning_rate": 7.175932157760129e-05, + "loss": 3.4121, + "step": 6044 + }, + { + "epoch": 7.73088, + "grad_norm": 0.5163931846618652, + "learning_rate": 7.171893929196392e-05, + "loss": 3.3883, + "step": 6045 + }, + { + "epoch": 7.73216, + "grad_norm": 0.5316890478134155, + "learning_rate": 7.167855700632655e-05, + "loss": 3.3518, + "step": 6046 + }, + { + "epoch": 7.73344, + "grad_norm": 0.5298048853874207, + "learning_rate": 7.163817472068918e-05, + "loss": 3.4238, + "step": 6047 + }, + { + "epoch": 7.73472, + "grad_norm": 0.5294202566146851, + "learning_rate": 7.159779243505182e-05, + "loss": 3.3889, + "step": 6048 + }, + { + "epoch": 7.736, + "grad_norm": 0.5132120847702026, + "learning_rate": 7.155741014941445e-05, + "loss": 3.365, + "step": 6049 + }, + { + "epoch": 7.73728, + "grad_norm": 0.5124995708465576, + "learning_rate": 7.151702786377708e-05, + "loss": 3.3348, + "step": 6050 + }, + { + "epoch": 7.73856, + "grad_norm": 0.5307555794715881, + "learning_rate": 7.147664557813973e-05, + "loss": 3.3825, + "step": 6051 + }, + { + "epoch": 7.73984, + "grad_norm": 0.5224402546882629, + "learning_rate": 7.143626329250236e-05, + "loss": 3.3489, + "step": 6052 + }, + { + "epoch": 7.7411200000000004, + "grad_norm": 0.5100345015525818, + "learning_rate": 7.139588100686499e-05, + "loss": 3.4441, + "step": 6053 + }, + { + "epoch": 7.7424, + "grad_norm": 0.5374388694763184, + "learning_rate": 7.135549872122762e-05, + "loss": 3.3539, + "step": 6054 + }, + { + "epoch": 7.74368, + "grad_norm": 0.5344933867454529, + "learning_rate": 7.131511643559025e-05, + "loss": 3.3818, + "step": 6055 + }, + { + "epoch": 7.74496, + "grad_norm": 0.5270034670829773, + "learning_rate": 7.127473414995288e-05, + "loss": 3.3496, + "step": 6056 + }, + { + "epoch": 7.74624, + "grad_norm": 0.5249508619308472, + "learning_rate": 7.12343518643155e-05, + "loss": 3.4065, + "step": 6057 + }, + { + "epoch": 7.74752, + "grad_norm": 0.5132743716239929, + "learning_rate": 7.119396957867815e-05, + "loss": 3.3701, + "step": 6058 + }, + { + "epoch": 7.7488, + "grad_norm": 0.5167114734649658, + "learning_rate": 7.115358729304078e-05, + "loss": 3.385, + "step": 6059 + }, + { + "epoch": 7.75008, + "grad_norm": 0.5301070809364319, + "learning_rate": 7.111320500740341e-05, + "loss": 3.3957, + "step": 6060 + }, + { + "epoch": 7.75136, + "grad_norm": 0.5281143188476562, + "learning_rate": 7.107282272176604e-05, + "loss": 3.4052, + "step": 6061 + }, + { + "epoch": 7.7526399999999995, + "grad_norm": 0.5261697173118591, + "learning_rate": 7.103244043612868e-05, + "loss": 3.4694, + "step": 6062 + }, + { + "epoch": 7.75392, + "grad_norm": 0.5167369246482849, + "learning_rate": 7.099205815049131e-05, + "loss": 3.378, + "step": 6063 + }, + { + "epoch": 7.7552, + "grad_norm": 0.5419825911521912, + "learning_rate": 7.095167586485394e-05, + "loss": 3.4047, + "step": 6064 + }, + { + "epoch": 7.75648, + "grad_norm": 0.5268843173980713, + "learning_rate": 7.091129357921657e-05, + "loss": 3.3606, + "step": 6065 + }, + { + "epoch": 7.75776, + "grad_norm": 0.5122654438018799, + "learning_rate": 7.087091129357922e-05, + "loss": 3.3676, + "step": 6066 + }, + { + "epoch": 7.75904, + "grad_norm": 0.5293072462081909, + "learning_rate": 7.083052900794185e-05, + "loss": 3.3321, + "step": 6067 + }, + { + "epoch": 7.76032, + "grad_norm": 0.5248163342475891, + "learning_rate": 7.079014672230448e-05, + "loss": 3.348, + "step": 6068 + }, + { + "epoch": 7.7616, + "grad_norm": 0.5331951975822449, + "learning_rate": 7.074976443666711e-05, + "loss": 3.3557, + "step": 6069 + }, + { + "epoch": 7.76288, + "grad_norm": 0.5159996151924133, + "learning_rate": 7.070938215102974e-05, + "loss": 3.3483, + "step": 6070 + }, + { + "epoch": 7.76416, + "grad_norm": 0.519347071647644, + "learning_rate": 7.066899986539237e-05, + "loss": 3.3843, + "step": 6071 + }, + { + "epoch": 7.76544, + "grad_norm": 0.5147475004196167, + "learning_rate": 7.062861757975501e-05, + "loss": 3.3476, + "step": 6072 + }, + { + "epoch": 7.76672, + "grad_norm": 0.5350569486618042, + "learning_rate": 7.058823529411764e-05, + "loss": 3.4024, + "step": 6073 + }, + { + "epoch": 7.768, + "grad_norm": 0.5119604468345642, + "learning_rate": 7.054785300848027e-05, + "loss": 3.3335, + "step": 6074 + }, + { + "epoch": 7.76928, + "grad_norm": 0.5323851108551025, + "learning_rate": 7.05074707228429e-05, + "loss": 3.4564, + "step": 6075 + }, + { + "epoch": 7.77056, + "grad_norm": 0.5254625678062439, + "learning_rate": 7.046708843720554e-05, + "loss": 3.4561, + "step": 6076 + }, + { + "epoch": 7.77184, + "grad_norm": 0.5181812644004822, + "learning_rate": 7.042670615156817e-05, + "loss": 3.3622, + "step": 6077 + }, + { + "epoch": 7.7731200000000005, + "grad_norm": 0.5225897431373596, + "learning_rate": 7.03863238659308e-05, + "loss": 3.4193, + "step": 6078 + }, + { + "epoch": 7.7744, + "grad_norm": 0.5444203615188599, + "learning_rate": 7.034594158029343e-05, + "loss": 3.4026, + "step": 6079 + }, + { + "epoch": 7.77568, + "grad_norm": 0.5264379978179932, + "learning_rate": 7.030555929465608e-05, + "loss": 3.448, + "step": 6080 + }, + { + "epoch": 7.77696, + "grad_norm": 0.5221927762031555, + "learning_rate": 7.026517700901871e-05, + "loss": 3.3665, + "step": 6081 + }, + { + "epoch": 7.77824, + "grad_norm": 0.5278477072715759, + "learning_rate": 7.022479472338134e-05, + "loss": 3.3952, + "step": 6082 + }, + { + "epoch": 7.77952, + "grad_norm": 0.5419744849205017, + "learning_rate": 7.018441243774397e-05, + "loss": 3.4158, + "step": 6083 + }, + { + "epoch": 7.7808, + "grad_norm": 0.5510439276695251, + "learning_rate": 7.01440301521066e-05, + "loss": 3.3966, + "step": 6084 + }, + { + "epoch": 7.78208, + "grad_norm": 0.5095901489257812, + "learning_rate": 7.010364786646923e-05, + "loss": 3.3745, + "step": 6085 + }, + { + "epoch": 7.78336, + "grad_norm": 0.5376297831535339, + "learning_rate": 7.006326558083187e-05, + "loss": 3.3438, + "step": 6086 + }, + { + "epoch": 7.78464, + "grad_norm": 0.5214537382125854, + "learning_rate": 7.00228832951945e-05, + "loss": 3.4108, + "step": 6087 + }, + { + "epoch": 7.78592, + "grad_norm": 0.5194911956787109, + "learning_rate": 6.998250100955713e-05, + "loss": 3.4137, + "step": 6088 + }, + { + "epoch": 7.7872, + "grad_norm": 0.528439462184906, + "learning_rate": 6.994211872391976e-05, + "loss": 3.3615, + "step": 6089 + }, + { + "epoch": 7.78848, + "grad_norm": 0.5447664856910706, + "learning_rate": 6.99017364382824e-05, + "loss": 3.3933, + "step": 6090 + }, + { + "epoch": 7.78976, + "grad_norm": 0.5379071235656738, + "learning_rate": 6.986135415264504e-05, + "loss": 3.4011, + "step": 6091 + }, + { + "epoch": 7.79104, + "grad_norm": 0.5375171303749084, + "learning_rate": 6.982097186700767e-05, + "loss": 3.4155, + "step": 6092 + }, + { + "epoch": 7.79232, + "grad_norm": 0.5257745385169983, + "learning_rate": 6.97805895813703e-05, + "loss": 3.4416, + "step": 6093 + }, + { + "epoch": 7.7936, + "grad_norm": 0.5261110663414001, + "learning_rate": 6.974020729573294e-05, + "loss": 3.4095, + "step": 6094 + }, + { + "epoch": 7.79488, + "grad_norm": 0.5397837162017822, + "learning_rate": 6.969982501009556e-05, + "loss": 3.2971, + "step": 6095 + }, + { + "epoch": 7.79616, + "grad_norm": 0.5269252061843872, + "learning_rate": 6.96594427244582e-05, + "loss": 3.3842, + "step": 6096 + }, + { + "epoch": 7.79744, + "grad_norm": 0.514562726020813, + "learning_rate": 6.961906043882083e-05, + "loss": 3.4729, + "step": 6097 + }, + { + "epoch": 7.79872, + "grad_norm": 0.5287720561027527, + "learning_rate": 6.957867815318346e-05, + "loss": 3.3533, + "step": 6098 + }, + { + "epoch": 7.8, + "grad_norm": 0.5171098709106445, + "learning_rate": 6.953829586754609e-05, + "loss": 3.356, + "step": 6099 + }, + { + "epoch": 7.80128, + "grad_norm": 0.5358878374099731, + "learning_rate": 6.949791358190873e-05, + "loss": 3.3566, + "step": 6100 + }, + { + "epoch": 7.80256, + "grad_norm": 0.5136488080024719, + "learning_rate": 6.945753129627136e-05, + "loss": 3.4024, + "step": 6101 + }, + { + "epoch": 7.80384, + "grad_norm": 0.5352822542190552, + "learning_rate": 6.9417149010634e-05, + "loss": 3.4127, + "step": 6102 + }, + { + "epoch": 7.80512, + "grad_norm": 0.5107256174087524, + "learning_rate": 6.937676672499662e-05, + "loss": 3.3742, + "step": 6103 + }, + { + "epoch": 7.8064, + "grad_norm": 0.527970552444458, + "learning_rate": 6.933638443935927e-05, + "loss": 3.4109, + "step": 6104 + }, + { + "epoch": 7.8076799999999995, + "grad_norm": 0.5363277196884155, + "learning_rate": 6.92960021537219e-05, + "loss": 3.3898, + "step": 6105 + }, + { + "epoch": 7.80896, + "grad_norm": 0.5282407999038696, + "learning_rate": 6.925561986808453e-05, + "loss": 3.3657, + "step": 6106 + }, + { + "epoch": 7.81024, + "grad_norm": 0.5155186057090759, + "learning_rate": 6.921523758244716e-05, + "loss": 3.3866, + "step": 6107 + }, + { + "epoch": 7.81152, + "grad_norm": 0.5109987258911133, + "learning_rate": 6.91748552968098e-05, + "loss": 3.3924, + "step": 6108 + }, + { + "epoch": 7.8128, + "grad_norm": 0.5373751521110535, + "learning_rate": 6.913447301117242e-05, + "loss": 3.3803, + "step": 6109 + }, + { + "epoch": 7.81408, + "grad_norm": 0.5129071474075317, + "learning_rate": 6.909409072553506e-05, + "loss": 3.3581, + "step": 6110 + }, + { + "epoch": 7.81536, + "grad_norm": 0.5419071316719055, + "learning_rate": 6.905370843989769e-05, + "loss": 3.367, + "step": 6111 + }, + { + "epoch": 7.81664, + "grad_norm": 0.5342729687690735, + "learning_rate": 6.901332615426032e-05, + "loss": 3.5005, + "step": 6112 + }, + { + "epoch": 7.81792, + "grad_norm": 0.5251415371894836, + "learning_rate": 6.897294386862295e-05, + "loss": 3.3677, + "step": 6113 + }, + { + "epoch": 7.8192, + "grad_norm": 0.5298883318901062, + "learning_rate": 6.89325615829856e-05, + "loss": 3.4143, + "step": 6114 + }, + { + "epoch": 7.82048, + "grad_norm": 0.5260635018348694, + "learning_rate": 6.889217929734823e-05, + "loss": 3.4216, + "step": 6115 + }, + { + "epoch": 7.82176, + "grad_norm": 0.5399680733680725, + "learning_rate": 6.885179701171086e-05, + "loss": 3.3927, + "step": 6116 + }, + { + "epoch": 7.82304, + "grad_norm": 0.5211296081542969, + "learning_rate": 6.881141472607349e-05, + "loss": 3.4147, + "step": 6117 + }, + { + "epoch": 7.82432, + "grad_norm": 0.5149162411689758, + "learning_rate": 6.877103244043613e-05, + "loss": 3.4004, + "step": 6118 + }, + { + "epoch": 7.8256, + "grad_norm": 0.5208600759506226, + "learning_rate": 6.873065015479876e-05, + "loss": 3.305, + "step": 6119 + }, + { + "epoch": 7.82688, + "grad_norm": 0.5258411765098572, + "learning_rate": 6.869026786916139e-05, + "loss": 3.3602, + "step": 6120 + }, + { + "epoch": 7.8281600000000005, + "grad_norm": 0.5164110660552979, + "learning_rate": 6.864988558352402e-05, + "loss": 3.295, + "step": 6121 + }, + { + "epoch": 7.82944, + "grad_norm": 0.539932131767273, + "learning_rate": 6.860950329788665e-05, + "loss": 3.3754, + "step": 6122 + }, + { + "epoch": 7.83072, + "grad_norm": 0.5369081497192383, + "learning_rate": 6.856912101224928e-05, + "loss": 3.3797, + "step": 6123 + }, + { + "epoch": 7.832, + "grad_norm": 0.5187609791755676, + "learning_rate": 6.852873872661192e-05, + "loss": 3.3654, + "step": 6124 + }, + { + "epoch": 7.83328, + "grad_norm": 0.5491527915000916, + "learning_rate": 6.848835644097455e-05, + "loss": 3.4431, + "step": 6125 + }, + { + "epoch": 7.83456, + "grad_norm": 0.5081817507743835, + "learning_rate": 6.844797415533718e-05, + "loss": 3.4171, + "step": 6126 + }, + { + "epoch": 7.83584, + "grad_norm": 0.5206778645515442, + "learning_rate": 6.840759186969981e-05, + "loss": 3.38, + "step": 6127 + }, + { + "epoch": 7.83712, + "grad_norm": 0.5148397088050842, + "learning_rate": 6.836720958406246e-05, + "loss": 3.3939, + "step": 6128 + }, + { + "epoch": 7.8384, + "grad_norm": 0.5235511660575867, + "learning_rate": 6.832682729842509e-05, + "loss": 3.3396, + "step": 6129 + }, + { + "epoch": 7.8396799999999995, + "grad_norm": 0.5200296640396118, + "learning_rate": 6.828644501278772e-05, + "loss": 3.3397, + "step": 6130 + }, + { + "epoch": 7.84096, + "grad_norm": 0.5297372341156006, + "learning_rate": 6.824606272715035e-05, + "loss": 3.4623, + "step": 6131 + }, + { + "epoch": 7.84224, + "grad_norm": 0.5361821055412292, + "learning_rate": 6.820568044151299e-05, + "loss": 3.381, + "step": 6132 + }, + { + "epoch": 7.84352, + "grad_norm": 0.5185701847076416, + "learning_rate": 6.816529815587562e-05, + "loss": 3.3566, + "step": 6133 + }, + { + "epoch": 7.8448, + "grad_norm": 0.5242226123809814, + "learning_rate": 6.812491587023825e-05, + "loss": 3.3903, + "step": 6134 + }, + { + "epoch": 7.84608, + "grad_norm": 0.532424807548523, + "learning_rate": 6.808453358460088e-05, + "loss": 3.4556, + "step": 6135 + }, + { + "epoch": 7.84736, + "grad_norm": 0.5150805115699768, + "learning_rate": 6.804415129896351e-05, + "loss": 3.3373, + "step": 6136 + }, + { + "epoch": 7.84864, + "grad_norm": 0.5233043432235718, + "learning_rate": 6.800376901332614e-05, + "loss": 3.4448, + "step": 6137 + }, + { + "epoch": 7.84992, + "grad_norm": 0.517623245716095, + "learning_rate": 6.796338672768879e-05, + "loss": 3.3695, + "step": 6138 + }, + { + "epoch": 7.8512, + "grad_norm": 0.51863032579422, + "learning_rate": 6.792300444205142e-05, + "loss": 3.3963, + "step": 6139 + }, + { + "epoch": 7.85248, + "grad_norm": 0.52508544921875, + "learning_rate": 6.788262215641405e-05, + "loss": 3.3959, + "step": 6140 + }, + { + "epoch": 7.85376, + "grad_norm": 0.525643527507782, + "learning_rate": 6.784223987077668e-05, + "loss": 3.3873, + "step": 6141 + }, + { + "epoch": 7.85504, + "grad_norm": 0.5046698451042175, + "learning_rate": 6.780185758513932e-05, + "loss": 3.4145, + "step": 6142 + }, + { + "epoch": 7.85632, + "grad_norm": 0.5362650156021118, + "learning_rate": 6.776147529950195e-05, + "loss": 3.4785, + "step": 6143 + }, + { + "epoch": 7.8576, + "grad_norm": 0.5106462240219116, + "learning_rate": 6.772109301386458e-05, + "loss": 3.4154, + "step": 6144 + }, + { + "epoch": 7.85888, + "grad_norm": 0.5172496438026428, + "learning_rate": 6.768071072822721e-05, + "loss": 3.4247, + "step": 6145 + }, + { + "epoch": 7.8601600000000005, + "grad_norm": 0.5204678773880005, + "learning_rate": 6.764032844258985e-05, + "loss": 3.4397, + "step": 6146 + }, + { + "epoch": 7.86144, + "grad_norm": 0.5216001868247986, + "learning_rate": 6.759994615695248e-05, + "loss": 3.4111, + "step": 6147 + }, + { + "epoch": 7.86272, + "grad_norm": 0.5186067819595337, + "learning_rate": 6.755956387131511e-05, + "loss": 3.3803, + "step": 6148 + }, + { + "epoch": 7.864, + "grad_norm": 0.5313200950622559, + "learning_rate": 6.751918158567774e-05, + "loss": 3.3722, + "step": 6149 + }, + { + "epoch": 7.86528, + "grad_norm": 0.530690610408783, + "learning_rate": 6.747879930004037e-05, + "loss": 3.3698, + "step": 6150 + }, + { + "epoch": 7.86656, + "grad_norm": 0.5053249001502991, + "learning_rate": 6.7438417014403e-05, + "loss": 3.2738, + "step": 6151 + }, + { + "epoch": 7.86784, + "grad_norm": 0.5100921988487244, + "learning_rate": 6.739803472876563e-05, + "loss": 3.3805, + "step": 6152 + }, + { + "epoch": 7.86912, + "grad_norm": 0.5200990438461304, + "learning_rate": 6.735765244312828e-05, + "loss": 3.3805, + "step": 6153 + }, + { + "epoch": 7.8704, + "grad_norm": 0.5229262113571167, + "learning_rate": 6.731727015749091e-05, + "loss": 3.3675, + "step": 6154 + }, + { + "epoch": 7.87168, + "grad_norm": 0.536566436290741, + "learning_rate": 6.727688787185354e-05, + "loss": 3.4692, + "step": 6155 + }, + { + "epoch": 7.87296, + "grad_norm": 0.5252888798713684, + "learning_rate": 6.723650558621618e-05, + "loss": 3.375, + "step": 6156 + }, + { + "epoch": 7.87424, + "grad_norm": 0.5295806527137756, + "learning_rate": 6.719612330057881e-05, + "loss": 3.3758, + "step": 6157 + }, + { + "epoch": 7.87552, + "grad_norm": 0.5360879302024841, + "learning_rate": 6.715574101494144e-05, + "loss": 3.394, + "step": 6158 + }, + { + "epoch": 7.8768, + "grad_norm": 0.5231373310089111, + "learning_rate": 6.711535872930407e-05, + "loss": 3.3922, + "step": 6159 + }, + { + "epoch": 7.87808, + "grad_norm": 0.5426596999168396, + "learning_rate": 6.707497644366671e-05, + "loss": 3.4305, + "step": 6160 + }, + { + "epoch": 7.87936, + "grad_norm": 0.5190793871879578, + "learning_rate": 6.703459415802935e-05, + "loss": 3.3645, + "step": 6161 + }, + { + "epoch": 7.88064, + "grad_norm": 0.5220685005187988, + "learning_rate": 6.699421187239196e-05, + "loss": 3.3472, + "step": 6162 + }, + { + "epoch": 7.88192, + "grad_norm": 0.5117445588111877, + "learning_rate": 6.69538295867546e-05, + "loss": 3.3592, + "step": 6163 + }, + { + "epoch": 7.8832, + "grad_norm": 0.520155668258667, + "learning_rate": 6.691344730111724e-05, + "loss": 3.4109, + "step": 6164 + }, + { + "epoch": 7.88448, + "grad_norm": 0.5211771726608276, + "learning_rate": 6.687306501547987e-05, + "loss": 3.359, + "step": 6165 + }, + { + "epoch": 7.88576, + "grad_norm": 0.5182546973228455, + "learning_rate": 6.68326827298425e-05, + "loss": 3.3939, + "step": 6166 + }, + { + "epoch": 7.88704, + "grad_norm": 0.5254797339439392, + "learning_rate": 6.679230044420514e-05, + "loss": 3.359, + "step": 6167 + }, + { + "epoch": 7.88832, + "grad_norm": 0.5227944254875183, + "learning_rate": 6.675191815856777e-05, + "loss": 3.4111, + "step": 6168 + }, + { + "epoch": 7.8896, + "grad_norm": 0.5160208940505981, + "learning_rate": 6.67115358729304e-05, + "loss": 3.3495, + "step": 6169 + }, + { + "epoch": 7.89088, + "grad_norm": 0.5098345279693604, + "learning_rate": 6.667115358729303e-05, + "loss": 3.329, + "step": 6170 + }, + { + "epoch": 7.89216, + "grad_norm": 0.5212696194648743, + "learning_rate": 6.663077130165567e-05, + "loss": 3.4255, + "step": 6171 + }, + { + "epoch": 7.89344, + "grad_norm": 0.5201159715652466, + "learning_rate": 6.65903890160183e-05, + "loss": 3.3864, + "step": 6172 + }, + { + "epoch": 7.8947199999999995, + "grad_norm": 0.5263365507125854, + "learning_rate": 6.655000673038093e-05, + "loss": 3.3921, + "step": 6173 + }, + { + "epoch": 7.896, + "grad_norm": 0.5230242013931274, + "learning_rate": 6.650962444474358e-05, + "loss": 3.3675, + "step": 6174 + }, + { + "epoch": 7.89728, + "grad_norm": 0.5143063068389893, + "learning_rate": 6.64692421591062e-05, + "loss": 3.3708, + "step": 6175 + }, + { + "epoch": 7.89856, + "grad_norm": 0.5212923288345337, + "learning_rate": 6.642885987346882e-05, + "loss": 3.4216, + "step": 6176 + }, + { + "epoch": 7.89984, + "grad_norm": 0.5219258069992065, + "learning_rate": 6.638847758783147e-05, + "loss": 3.3633, + "step": 6177 + }, + { + "epoch": 7.90112, + "grad_norm": 0.5209653377532959, + "learning_rate": 6.63480953021941e-05, + "loss": 3.3671, + "step": 6178 + }, + { + "epoch": 7.9024, + "grad_norm": 0.5172528624534607, + "learning_rate": 6.630771301655673e-05, + "loss": 3.398, + "step": 6179 + }, + { + "epoch": 7.90368, + "grad_norm": 0.5204746127128601, + "learning_rate": 6.626733073091936e-05, + "loss": 3.3294, + "step": 6180 + }, + { + "epoch": 7.90496, + "grad_norm": 0.5181044936180115, + "learning_rate": 6.6226948445282e-05, + "loss": 3.2986, + "step": 6181 + }, + { + "epoch": 7.90624, + "grad_norm": 0.503993034362793, + "learning_rate": 6.618656615964463e-05, + "loss": 3.2642, + "step": 6182 + }, + { + "epoch": 7.90752, + "grad_norm": 0.5091139674186707, + "learning_rate": 6.614618387400726e-05, + "loss": 3.3122, + "step": 6183 + }, + { + "epoch": 7.9088, + "grad_norm": 0.5170398950576782, + "learning_rate": 6.610580158836989e-05, + "loss": 3.3814, + "step": 6184 + }, + { + "epoch": 7.91008, + "grad_norm": 0.5216673016548157, + "learning_rate": 6.606541930273253e-05, + "loss": 3.3738, + "step": 6185 + }, + { + "epoch": 7.91136, + "grad_norm": 0.5129463076591492, + "learning_rate": 6.602503701709516e-05, + "loss": 3.285, + "step": 6186 + }, + { + "epoch": 7.91264, + "grad_norm": 0.5073363184928894, + "learning_rate": 6.59846547314578e-05, + "loss": 3.3271, + "step": 6187 + }, + { + "epoch": 7.91392, + "grad_norm": 0.5254284143447876, + "learning_rate": 6.594427244582042e-05, + "loss": 3.3854, + "step": 6188 + }, + { + "epoch": 7.9152000000000005, + "grad_norm": 0.5197746157646179, + "learning_rate": 6.590389016018305e-05, + "loss": 3.4062, + "step": 6189 + }, + { + "epoch": 7.91648, + "grad_norm": 0.5055711269378662, + "learning_rate": 6.586350787454568e-05, + "loss": 3.3037, + "step": 6190 + }, + { + "epoch": 7.91776, + "grad_norm": 0.523854672908783, + "learning_rate": 6.582312558890833e-05, + "loss": 3.3311, + "step": 6191 + }, + { + "epoch": 7.91904, + "grad_norm": 0.5296027064323425, + "learning_rate": 6.578274330327096e-05, + "loss": 3.4084, + "step": 6192 + }, + { + "epoch": 7.92032, + "grad_norm": 0.5208480954170227, + "learning_rate": 6.574236101763359e-05, + "loss": 3.4572, + "step": 6193 + }, + { + "epoch": 7.9216, + "grad_norm": 0.5234736800193787, + "learning_rate": 6.570197873199622e-05, + "loss": 3.4112, + "step": 6194 + }, + { + "epoch": 7.92288, + "grad_norm": 0.5259544849395752, + "learning_rate": 6.566159644635886e-05, + "loss": 3.4369, + "step": 6195 + }, + { + "epoch": 7.92416, + "grad_norm": 0.5302821397781372, + "learning_rate": 6.562121416072149e-05, + "loss": 3.3975, + "step": 6196 + }, + { + "epoch": 7.92544, + "grad_norm": 0.5262648463249207, + "learning_rate": 6.558083187508412e-05, + "loss": 3.4066, + "step": 6197 + }, + { + "epoch": 7.9267199999999995, + "grad_norm": 0.5254402756690979, + "learning_rate": 6.554044958944675e-05, + "loss": 3.3711, + "step": 6198 + }, + { + "epoch": 7.928, + "grad_norm": 0.5265082120895386, + "learning_rate": 6.55000673038094e-05, + "loss": 3.367, + "step": 6199 + }, + { + "epoch": 7.92928, + "grad_norm": 0.5290309190750122, + "learning_rate": 6.545968501817203e-05, + "loss": 3.3912, + "step": 6200 + }, + { + "epoch": 7.93056, + "grad_norm": 0.5364521145820618, + "learning_rate": 6.541930273253466e-05, + "loss": 3.466, + "step": 6201 + }, + { + "epoch": 7.93184, + "grad_norm": 0.5207248330116272, + "learning_rate": 6.537892044689729e-05, + "loss": 3.3728, + "step": 6202 + }, + { + "epoch": 7.93312, + "grad_norm": 0.533179759979248, + "learning_rate": 6.533853816125992e-05, + "loss": 3.354, + "step": 6203 + }, + { + "epoch": 7.9344, + "grad_norm": 0.5137979388237, + "learning_rate": 6.529815587562255e-05, + "loss": 3.3751, + "step": 6204 + }, + { + "epoch": 7.93568, + "grad_norm": 0.5287781953811646, + "learning_rate": 6.525777358998519e-05, + "loss": 3.3851, + "step": 6205 + }, + { + "epoch": 7.93696, + "grad_norm": 0.5349233746528625, + "learning_rate": 6.521739130434782e-05, + "loss": 3.4174, + "step": 6206 + }, + { + "epoch": 7.93824, + "grad_norm": 0.5217055082321167, + "learning_rate": 6.517700901871045e-05, + "loss": 3.3981, + "step": 6207 + }, + { + "epoch": 7.93952, + "grad_norm": 0.5294625759124756, + "learning_rate": 6.513662673307308e-05, + "loss": 3.4748, + "step": 6208 + }, + { + "epoch": 7.9408, + "grad_norm": 0.5194503664970398, + "learning_rate": 6.509624444743572e-05, + "loss": 3.3751, + "step": 6209 + }, + { + "epoch": 7.94208, + "grad_norm": 0.528069257736206, + "learning_rate": 6.505586216179835e-05, + "loss": 3.3722, + "step": 6210 + }, + { + "epoch": 7.94336, + "grad_norm": 0.5228124856948853, + "learning_rate": 6.501547987616098e-05, + "loss": 3.3791, + "step": 6211 + }, + { + "epoch": 7.94464, + "grad_norm": 0.5172755718231201, + "learning_rate": 6.497509759052361e-05, + "loss": 3.4098, + "step": 6212 + }, + { + "epoch": 7.94592, + "grad_norm": 0.5152941346168518, + "learning_rate": 6.493471530488626e-05, + "loss": 3.4003, + "step": 6213 + }, + { + "epoch": 7.9472000000000005, + "grad_norm": 0.53346186876297, + "learning_rate": 6.489433301924889e-05, + "loss": 3.3025, + "step": 6214 + }, + { + "epoch": 7.94848, + "grad_norm": 0.5327422022819519, + "learning_rate": 6.485395073361152e-05, + "loss": 3.422, + "step": 6215 + }, + { + "epoch": 7.94976, + "grad_norm": 0.5168991088867188, + "learning_rate": 6.481356844797415e-05, + "loss": 3.3234, + "step": 6216 + }, + { + "epoch": 7.95104, + "grad_norm": 0.5401486754417419, + "learning_rate": 6.477318616233678e-05, + "loss": 3.4159, + "step": 6217 + }, + { + "epoch": 7.95232, + "grad_norm": 0.5209543704986572, + "learning_rate": 6.473280387669941e-05, + "loss": 3.3988, + "step": 6218 + }, + { + "epoch": 7.9536, + "grad_norm": 0.5165715217590332, + "learning_rate": 6.469242159106205e-05, + "loss": 3.3658, + "step": 6219 + }, + { + "epoch": 7.95488, + "grad_norm": 0.5266816020011902, + "learning_rate": 6.465203930542468e-05, + "loss": 3.4037, + "step": 6220 + }, + { + "epoch": 7.95616, + "grad_norm": 0.5167005658149719, + "learning_rate": 6.461165701978731e-05, + "loss": 3.343, + "step": 6221 + }, + { + "epoch": 7.95744, + "grad_norm": 0.5324759483337402, + "learning_rate": 6.457127473414994e-05, + "loss": 3.3873, + "step": 6222 + }, + { + "epoch": 7.95872, + "grad_norm": 0.5235298871994019, + "learning_rate": 6.453089244851259e-05, + "loss": 3.4334, + "step": 6223 + }, + { + "epoch": 7.96, + "grad_norm": 0.5192801356315613, + "learning_rate": 6.449051016287522e-05, + "loss": 3.3825, + "step": 6224 + }, + { + "epoch": 7.96128, + "grad_norm": 0.5173263549804688, + "learning_rate": 6.445012787723785e-05, + "loss": 3.4143, + "step": 6225 + }, + { + "epoch": 7.96256, + "grad_norm": 0.5249988436698914, + "learning_rate": 6.440974559160048e-05, + "loss": 3.3573, + "step": 6226 + }, + { + "epoch": 7.96384, + "grad_norm": 0.5259899497032166, + "learning_rate": 6.436936330596312e-05, + "loss": 3.3955, + "step": 6227 + }, + { + "epoch": 7.96512, + "grad_norm": 0.5165438055992126, + "learning_rate": 6.432898102032575e-05, + "loss": 3.4257, + "step": 6228 + }, + { + "epoch": 7.9664, + "grad_norm": 0.5043834447860718, + "learning_rate": 6.428859873468838e-05, + "loss": 3.3238, + "step": 6229 + }, + { + "epoch": 7.96768, + "grad_norm": 0.5150227546691895, + "learning_rate": 6.424821644905101e-05, + "loss": 3.3857, + "step": 6230 + }, + { + "epoch": 7.96896, + "grad_norm": 0.5162477493286133, + "learning_rate": 6.420783416341364e-05, + "loss": 3.351, + "step": 6231 + }, + { + "epoch": 7.97024, + "grad_norm": 0.5149223804473877, + "learning_rate": 6.416745187777627e-05, + "loss": 3.3808, + "step": 6232 + }, + { + "epoch": 7.97152, + "grad_norm": 0.5162880420684814, + "learning_rate": 6.412706959213891e-05, + "loss": 3.3048, + "step": 6233 + }, + { + "epoch": 7.9728, + "grad_norm": 0.5108062624931335, + "learning_rate": 6.408668730650154e-05, + "loss": 3.442, + "step": 6234 + }, + { + "epoch": 7.97408, + "grad_norm": 0.5145354270935059, + "learning_rate": 6.404630502086417e-05, + "loss": 3.3521, + "step": 6235 + }, + { + "epoch": 7.97536, + "grad_norm": 0.5244855284690857, + "learning_rate": 6.40059227352268e-05, + "loss": 3.376, + "step": 6236 + }, + { + "epoch": 7.97664, + "grad_norm": 0.5116865038871765, + "learning_rate": 6.396554044958945e-05, + "loss": 3.3633, + "step": 6237 + }, + { + "epoch": 7.97792, + "grad_norm": 0.5205397009849548, + "learning_rate": 6.392515816395208e-05, + "loss": 3.4108, + "step": 6238 + }, + { + "epoch": 7.9792, + "grad_norm": 0.5130834579467773, + "learning_rate": 6.388477587831471e-05, + "loss": 3.3463, + "step": 6239 + }, + { + "epoch": 7.98048, + "grad_norm": 0.5261620283126831, + "learning_rate": 6.384439359267734e-05, + "loss": 3.4818, + "step": 6240 + }, + { + "epoch": 7.9817599999999995, + "grad_norm": 0.5290659070014954, + "learning_rate": 6.380401130703998e-05, + "loss": 3.4639, + "step": 6241 + }, + { + "epoch": 7.98304, + "grad_norm": 0.5278947353363037, + "learning_rate": 6.376362902140261e-05, + "loss": 3.4368, + "step": 6242 + }, + { + "epoch": 7.98432, + "grad_norm": 0.5171101093292236, + "learning_rate": 6.372324673576524e-05, + "loss": 3.3449, + "step": 6243 + }, + { + "epoch": 7.9856, + "grad_norm": 0.5271726250648499, + "learning_rate": 6.368286445012787e-05, + "loss": 3.3427, + "step": 6244 + }, + { + "epoch": 7.98688, + "grad_norm": 0.5079402923583984, + "learning_rate": 6.36424821644905e-05, + "loss": 3.3444, + "step": 6245 + }, + { + "epoch": 7.98816, + "grad_norm": 0.5159564018249512, + "learning_rate": 6.360209987885313e-05, + "loss": 3.3274, + "step": 6246 + }, + { + "epoch": 7.98944, + "grad_norm": 0.5179126858711243, + "learning_rate": 6.356171759321578e-05, + "loss": 3.2998, + "step": 6247 + }, + { + "epoch": 7.99072, + "grad_norm": 0.5274621248245239, + "learning_rate": 6.35213353075784e-05, + "loss": 3.2867, + "step": 6248 + }, + { + "epoch": 7.992, + "grad_norm": 0.5217450857162476, + "learning_rate": 6.348095302194104e-05, + "loss": 3.4232, + "step": 6249 + }, + { + "epoch": 7.99328, + "grad_norm": 0.5240947604179382, + "learning_rate": 6.344057073630367e-05, + "loss": 3.4344, + "step": 6250 + }, + { + "epoch": 7.99456, + "grad_norm": 0.5134342312812805, + "learning_rate": 6.340018845066631e-05, + "loss": 3.365, + "step": 6251 + }, + { + "epoch": 7.99584, + "grad_norm": 0.5243435502052307, + "learning_rate": 6.335980616502894e-05, + "loss": 3.3951, + "step": 6252 + }, + { + "epoch": 7.99712, + "grad_norm": 0.5112805366516113, + "learning_rate": 6.331942387939157e-05, + "loss": 3.3713, + "step": 6253 + }, + { + "epoch": 7.9984, + "grad_norm": 0.532479465007782, + "learning_rate": 6.32790415937542e-05, + "loss": 3.4131, + "step": 6254 + }, + { + "epoch": 7.99968, + "grad_norm": 0.5240690112113953, + "learning_rate": 6.323865930811684e-05, + "loss": 3.3779, + "step": 6255 + }, + { + "epoch": 8.0, + "grad_norm": 1.0870201587677002, + "learning_rate": 6.319827702247946e-05, + "loss": 3.4098, + "step": 6256 + }, + { + "epoch": 8.00128, + "grad_norm": 0.5369760394096375, + "learning_rate": 6.315789473684209e-05, + "loss": 3.3303, + "step": 6257 + }, + { + "epoch": 8.00256, + "grad_norm": 0.5155583620071411, + "learning_rate": 6.311751245120473e-05, + "loss": 3.3235, + "step": 6258 + }, + { + "epoch": 8.00384, + "grad_norm": 0.50521320104599, + "learning_rate": 6.307713016556736e-05, + "loss": 3.3097, + "step": 6259 + }, + { + "epoch": 8.00512, + "grad_norm": 0.5237175226211548, + "learning_rate": 6.303674787993e-05, + "loss": 3.3051, + "step": 6260 + }, + { + "epoch": 8.0064, + "grad_norm": 0.521220862865448, + "learning_rate": 6.299636559429264e-05, + "loss": 3.2319, + "step": 6261 + }, + { + "epoch": 8.00768, + "grad_norm": 0.5294133424758911, + "learning_rate": 6.295598330865527e-05, + "loss": 3.2934, + "step": 6262 + }, + { + "epoch": 8.00896, + "grad_norm": 0.5125667452812195, + "learning_rate": 6.29156010230179e-05, + "loss": 3.1766, + "step": 6263 + }, + { + "epoch": 8.01024, + "grad_norm": 0.5298846364021301, + "learning_rate": 6.287521873738053e-05, + "loss": 3.2827, + "step": 6264 + }, + { + "epoch": 8.01152, + "grad_norm": 0.5239753127098083, + "learning_rate": 6.283483645174317e-05, + "loss": 3.256, + "step": 6265 + }, + { + "epoch": 8.0128, + "grad_norm": 0.5199098587036133, + "learning_rate": 6.27944541661058e-05, + "loss": 3.2353, + "step": 6266 + }, + { + "epoch": 8.01408, + "grad_norm": 0.5292956233024597, + "learning_rate": 6.275407188046843e-05, + "loss": 3.2311, + "step": 6267 + }, + { + "epoch": 8.01536, + "grad_norm": 0.5216575860977173, + "learning_rate": 6.271368959483106e-05, + "loss": 3.3264, + "step": 6268 + }, + { + "epoch": 8.01664, + "grad_norm": 0.514209508895874, + "learning_rate": 6.26733073091937e-05, + "loss": 3.2135, + "step": 6269 + }, + { + "epoch": 8.01792, + "grad_norm": 0.5441368818283081, + "learning_rate": 6.263292502355632e-05, + "loss": 3.3422, + "step": 6270 + }, + { + "epoch": 8.0192, + "grad_norm": 0.5475019216537476, + "learning_rate": 6.259254273791895e-05, + "loss": 3.2504, + "step": 6271 + }, + { + "epoch": 8.02048, + "grad_norm": 0.5108786821365356, + "learning_rate": 6.25521604522816e-05, + "loss": 3.291, + "step": 6272 + }, + { + "epoch": 8.02176, + "grad_norm": 0.532529890537262, + "learning_rate": 6.251177816664423e-05, + "loss": 3.3117, + "step": 6273 + }, + { + "epoch": 8.02304, + "grad_norm": 0.5336194634437561, + "learning_rate": 6.247139588100686e-05, + "loss": 3.2797, + "step": 6274 + }, + { + "epoch": 8.02432, + "grad_norm": 0.5180751085281372, + "learning_rate": 6.243101359536949e-05, + "loss": 3.2857, + "step": 6275 + }, + { + "epoch": 8.0256, + "grad_norm": 0.5190191864967346, + "learning_rate": 6.239063130973213e-05, + "loss": 3.2449, + "step": 6276 + }, + { + "epoch": 8.02688, + "grad_norm": 0.533854603767395, + "learning_rate": 6.235024902409476e-05, + "loss": 3.3588, + "step": 6277 + }, + { + "epoch": 8.02816, + "grad_norm": 0.5298550724983215, + "learning_rate": 6.230986673845739e-05, + "loss": 3.2968, + "step": 6278 + }, + { + "epoch": 8.02944, + "grad_norm": 0.5271071791648865, + "learning_rate": 6.226948445282003e-05, + "loss": 3.18, + "step": 6279 + }, + { + "epoch": 8.03072, + "grad_norm": 0.5124052166938782, + "learning_rate": 6.222910216718266e-05, + "loss": 3.3069, + "step": 6280 + }, + { + "epoch": 8.032, + "grad_norm": 0.5358180403709412, + "learning_rate": 6.218871988154529e-05, + "loss": 3.2968, + "step": 6281 + }, + { + "epoch": 8.03328, + "grad_norm": 0.5310022234916687, + "learning_rate": 6.214833759590792e-05, + "loss": 3.3983, + "step": 6282 + }, + { + "epoch": 8.03456, + "grad_norm": 0.5346135497093201, + "learning_rate": 6.210795531027055e-05, + "loss": 3.2622, + "step": 6283 + }, + { + "epoch": 8.03584, + "grad_norm": 0.5122181177139282, + "learning_rate": 6.206757302463318e-05, + "loss": 3.3501, + "step": 6284 + }, + { + "epoch": 8.03712, + "grad_norm": 0.5202990770339966, + "learning_rate": 6.202719073899581e-05, + "loss": 3.2425, + "step": 6285 + }, + { + "epoch": 8.0384, + "grad_norm": 0.5240968465805054, + "learning_rate": 6.198680845335846e-05, + "loss": 3.2669, + "step": 6286 + }, + { + "epoch": 8.03968, + "grad_norm": 0.5298383831977844, + "learning_rate": 6.194642616772109e-05, + "loss": 3.3057, + "step": 6287 + }, + { + "epoch": 8.04096, + "grad_norm": 0.5407096743583679, + "learning_rate": 6.190604388208372e-05, + "loss": 3.2781, + "step": 6288 + }, + { + "epoch": 8.04224, + "grad_norm": 0.5193719267845154, + "learning_rate": 6.186566159644635e-05, + "loss": 3.3438, + "step": 6289 + }, + { + "epoch": 8.043520000000001, + "grad_norm": 0.5216055512428284, + "learning_rate": 6.182527931080899e-05, + "loss": 3.2728, + "step": 6290 + }, + { + "epoch": 8.0448, + "grad_norm": 0.5255246758460999, + "learning_rate": 6.178489702517162e-05, + "loss": 3.2996, + "step": 6291 + }, + { + "epoch": 8.04608, + "grad_norm": 0.5254226326942444, + "learning_rate": 6.174451473953425e-05, + "loss": 3.293, + "step": 6292 + }, + { + "epoch": 8.04736, + "grad_norm": 0.5267927050590515, + "learning_rate": 6.170413245389688e-05, + "loss": 3.2104, + "step": 6293 + }, + { + "epoch": 8.04864, + "grad_norm": 0.5204477906227112, + "learning_rate": 6.166375016825952e-05, + "loss": 3.297, + "step": 6294 + }, + { + "epoch": 8.04992, + "grad_norm": 0.5418780446052551, + "learning_rate": 6.162336788262215e-05, + "loss": 3.3136, + "step": 6295 + }, + { + "epoch": 8.0512, + "grad_norm": 0.5511976480484009, + "learning_rate": 6.158298559698478e-05, + "loss": 3.332, + "step": 6296 + }, + { + "epoch": 8.05248, + "grad_norm": 0.5536786317825317, + "learning_rate": 6.154260331134741e-05, + "loss": 3.3544, + "step": 6297 + }, + { + "epoch": 8.05376, + "grad_norm": 0.5378971099853516, + "learning_rate": 6.150222102571004e-05, + "loss": 3.2811, + "step": 6298 + }, + { + "epoch": 8.05504, + "grad_norm": 0.5422542095184326, + "learning_rate": 6.146183874007267e-05, + "loss": 3.2559, + "step": 6299 + }, + { + "epoch": 8.05632, + "grad_norm": 0.5333569049835205, + "learning_rate": 6.142145645443532e-05, + "loss": 3.3139, + "step": 6300 + }, + { + "epoch": 8.0576, + "grad_norm": 0.5348880290985107, + "learning_rate": 6.138107416879795e-05, + "loss": 3.3467, + "step": 6301 + }, + { + "epoch": 8.05888, + "grad_norm": 0.5167289972305298, + "learning_rate": 6.134069188316058e-05, + "loss": 3.2289, + "step": 6302 + }, + { + "epoch": 8.06016, + "grad_norm": 0.5306516289710999, + "learning_rate": 6.130030959752321e-05, + "loss": 3.3242, + "step": 6303 + }, + { + "epoch": 8.06144, + "grad_norm": 0.5348290801048279, + "learning_rate": 6.125992731188585e-05, + "loss": 3.36, + "step": 6304 + }, + { + "epoch": 8.06272, + "grad_norm": 0.5243771076202393, + "learning_rate": 6.121954502624848e-05, + "loss": 3.2549, + "step": 6305 + }, + { + "epoch": 8.064, + "grad_norm": 0.5279974341392517, + "learning_rate": 6.117916274061111e-05, + "loss": 3.3017, + "step": 6306 + }, + { + "epoch": 8.06528, + "grad_norm": 0.539170503616333, + "learning_rate": 6.113878045497374e-05, + "loss": 3.2932, + "step": 6307 + }, + { + "epoch": 8.06656, + "grad_norm": 0.5232791304588318, + "learning_rate": 6.109839816933639e-05, + "loss": 3.2744, + "step": 6308 + }, + { + "epoch": 8.06784, + "grad_norm": 0.5283025503158569, + "learning_rate": 6.105801588369902e-05, + "loss": 3.3529, + "step": 6309 + }, + { + "epoch": 8.06912, + "grad_norm": 0.5246297121047974, + "learning_rate": 6.101763359806164e-05, + "loss": 3.2392, + "step": 6310 + }, + { + "epoch": 8.0704, + "grad_norm": 0.5458753108978271, + "learning_rate": 6.0977251312424276e-05, + "loss": 3.2406, + "step": 6311 + }, + { + "epoch": 8.07168, + "grad_norm": 0.5204522609710693, + "learning_rate": 6.0936869026786907e-05, + "loss": 3.3418, + "step": 6312 + }, + { + "epoch": 8.07296, + "grad_norm": 0.5313632488250732, + "learning_rate": 6.089648674114954e-05, + "loss": 3.3164, + "step": 6313 + }, + { + "epoch": 8.07424, + "grad_norm": 0.5227934122085571, + "learning_rate": 6.0856104455512173e-05, + "loss": 3.2728, + "step": 6314 + }, + { + "epoch": 8.07552, + "grad_norm": 0.5291623473167419, + "learning_rate": 6.081572216987481e-05, + "loss": 3.307, + "step": 6315 + }, + { + "epoch": 8.0768, + "grad_norm": 0.5221740007400513, + "learning_rate": 6.077533988423744e-05, + "loss": 3.301, + "step": 6316 + }, + { + "epoch": 8.07808, + "grad_norm": 0.528735339641571, + "learning_rate": 6.073495759860008e-05, + "loss": 3.2217, + "step": 6317 + }, + { + "epoch": 8.07936, + "grad_norm": 0.5461089611053467, + "learning_rate": 6.069457531296271e-05, + "loss": 3.2215, + "step": 6318 + }, + { + "epoch": 8.08064, + "grad_norm": 0.5363235473632812, + "learning_rate": 6.0654193027325344e-05, + "loss": 3.2331, + "step": 6319 + }, + { + "epoch": 8.08192, + "grad_norm": 0.5322678089141846, + "learning_rate": 6.0613810741687974e-05, + "loss": 3.2927, + "step": 6320 + }, + { + "epoch": 8.0832, + "grad_norm": 0.5260446071624756, + "learning_rate": 6.057342845605061e-05, + "loss": 3.2021, + "step": 6321 + }, + { + "epoch": 8.08448, + "grad_norm": 0.5323824286460876, + "learning_rate": 6.053304617041324e-05, + "loss": 3.3024, + "step": 6322 + }, + { + "epoch": 8.08576, + "grad_norm": 0.5511333346366882, + "learning_rate": 6.049266388477588e-05, + "loss": 3.2838, + "step": 6323 + }, + { + "epoch": 8.08704, + "grad_norm": 0.5331417322158813, + "learning_rate": 6.04522815991385e-05, + "loss": 3.3127, + "step": 6324 + }, + { + "epoch": 8.08832, + "grad_norm": 0.5371091961860657, + "learning_rate": 6.041189931350114e-05, + "loss": 3.3004, + "step": 6325 + }, + { + "epoch": 8.0896, + "grad_norm": 0.5571906566619873, + "learning_rate": 6.037151702786377e-05, + "loss": 3.2509, + "step": 6326 + }, + { + "epoch": 8.09088, + "grad_norm": 0.5286134481430054, + "learning_rate": 6.0331134742226405e-05, + "loss": 3.3488, + "step": 6327 + }, + { + "epoch": 8.09216, + "grad_norm": 0.5509176254272461, + "learning_rate": 6.0290752456589035e-05, + "loss": 3.3017, + "step": 6328 + }, + { + "epoch": 8.09344, + "grad_norm": 0.5480915307998657, + "learning_rate": 6.025037017095167e-05, + "loss": 3.2503, + "step": 6329 + }, + { + "epoch": 8.09472, + "grad_norm": 0.5310816764831543, + "learning_rate": 6.02099878853143e-05, + "loss": 3.309, + "step": 6330 + }, + { + "epoch": 8.096, + "grad_norm": 0.5437869429588318, + "learning_rate": 6.016960559967694e-05, + "loss": 3.2537, + "step": 6331 + }, + { + "epoch": 8.09728, + "grad_norm": 0.5316812992095947, + "learning_rate": 6.012922331403957e-05, + "loss": 3.2957, + "step": 6332 + }, + { + "epoch": 8.09856, + "grad_norm": 0.5401766896247864, + "learning_rate": 6.0088841028402206e-05, + "loss": 3.2455, + "step": 6333 + }, + { + "epoch": 8.09984, + "grad_norm": 0.518835723400116, + "learning_rate": 6.0048458742764836e-05, + "loss": 3.298, + "step": 6334 + }, + { + "epoch": 8.10112, + "grad_norm": 0.531012237071991, + "learning_rate": 6.000807645712747e-05, + "loss": 3.2884, + "step": 6335 + }, + { + "epoch": 8.1024, + "grad_norm": 0.5368117094039917, + "learning_rate": 5.99676941714901e-05, + "loss": 3.2962, + "step": 6336 + }, + { + "epoch": 8.10368, + "grad_norm": 0.5301125049591064, + "learning_rate": 5.992731188585273e-05, + "loss": 3.366, + "step": 6337 + }, + { + "epoch": 8.10496, + "grad_norm": 0.5062065720558167, + "learning_rate": 5.988692960021536e-05, + "loss": 3.2497, + "step": 6338 + }, + { + "epoch": 8.10624, + "grad_norm": 0.5318953990936279, + "learning_rate": 5.9846547314578e-05, + "loss": 3.3522, + "step": 6339 + }, + { + "epoch": 8.10752, + "grad_norm": 0.5129379034042358, + "learning_rate": 5.980616502894063e-05, + "loss": 3.2826, + "step": 6340 + }, + { + "epoch": 8.1088, + "grad_norm": 0.5241764187812805, + "learning_rate": 5.976578274330327e-05, + "loss": 3.295, + "step": 6341 + }, + { + "epoch": 8.11008, + "grad_norm": 0.5228458046913147, + "learning_rate": 5.97254004576659e-05, + "loss": 3.2485, + "step": 6342 + }, + { + "epoch": 8.11136, + "grad_norm": 0.523200273513794, + "learning_rate": 5.9685018172028534e-05, + "loss": 3.306, + "step": 6343 + }, + { + "epoch": 8.11264, + "grad_norm": 0.5235135555267334, + "learning_rate": 5.9644635886391164e-05, + "loss": 3.2701, + "step": 6344 + }, + { + "epoch": 8.11392, + "grad_norm": 0.5147677063941956, + "learning_rate": 5.96042536007538e-05, + "loss": 3.2849, + "step": 6345 + }, + { + "epoch": 8.1152, + "grad_norm": 0.5381238460540771, + "learning_rate": 5.956387131511643e-05, + "loss": 3.3421, + "step": 6346 + }, + { + "epoch": 8.11648, + "grad_norm": 0.5454177260398865, + "learning_rate": 5.952348902947907e-05, + "loss": 3.3036, + "step": 6347 + }, + { + "epoch": 8.11776, + "grad_norm": 0.5195689797401428, + "learning_rate": 5.94831067438417e-05, + "loss": 3.2856, + "step": 6348 + }, + { + "epoch": 8.11904, + "grad_norm": 0.5343729853630066, + "learning_rate": 5.9442724458204335e-05, + "loss": 3.2902, + "step": 6349 + }, + { + "epoch": 8.12032, + "grad_norm": 0.5302938222885132, + "learning_rate": 5.940234217256696e-05, + "loss": 3.2755, + "step": 6350 + }, + { + "epoch": 8.1216, + "grad_norm": 0.5336959958076477, + "learning_rate": 5.9361959886929595e-05, + "loss": 3.2973, + "step": 6351 + }, + { + "epoch": 8.12288, + "grad_norm": 0.525851845741272, + "learning_rate": 5.9321577601292225e-05, + "loss": 3.2702, + "step": 6352 + }, + { + "epoch": 8.12416, + "grad_norm": 0.5346911549568176, + "learning_rate": 5.928119531565486e-05, + "loss": 3.3683, + "step": 6353 + }, + { + "epoch": 8.12544, + "grad_norm": 0.5326193571090698, + "learning_rate": 5.924081303001749e-05, + "loss": 3.3173, + "step": 6354 + }, + { + "epoch": 8.12672, + "grad_norm": 0.5400025844573975, + "learning_rate": 5.920043074438013e-05, + "loss": 3.279, + "step": 6355 + }, + { + "epoch": 8.128, + "grad_norm": 0.5323216915130615, + "learning_rate": 5.916004845874276e-05, + "loss": 3.2818, + "step": 6356 + }, + { + "epoch": 8.12928, + "grad_norm": 0.5249941945075989, + "learning_rate": 5.9119666173105395e-05, + "loss": 3.2571, + "step": 6357 + }, + { + "epoch": 8.13056, + "grad_norm": 0.5245072245597839, + "learning_rate": 5.9079283887468026e-05, + "loss": 3.2323, + "step": 6358 + }, + { + "epoch": 8.13184, + "grad_norm": 0.5230678915977478, + "learning_rate": 5.903890160183066e-05, + "loss": 3.2932, + "step": 6359 + }, + { + "epoch": 8.13312, + "grad_norm": 0.5385542511940002, + "learning_rate": 5.899851931619329e-05, + "loss": 3.3357, + "step": 6360 + }, + { + "epoch": 8.1344, + "grad_norm": 0.5333831310272217, + "learning_rate": 5.895813703055593e-05, + "loss": 3.301, + "step": 6361 + }, + { + "epoch": 8.13568, + "grad_norm": 0.5369766354560852, + "learning_rate": 5.891775474491856e-05, + "loss": 3.2642, + "step": 6362 + }, + { + "epoch": 8.13696, + "grad_norm": 0.5322566032409668, + "learning_rate": 5.8877372459281196e-05, + "loss": 3.2542, + "step": 6363 + }, + { + "epoch": 8.13824, + "grad_norm": 0.5302499532699585, + "learning_rate": 5.883699017364382e-05, + "loss": 3.2494, + "step": 6364 + }, + { + "epoch": 8.13952, + "grad_norm": 0.5144988298416138, + "learning_rate": 5.879660788800645e-05, + "loss": 3.1926, + "step": 6365 + }, + { + "epoch": 8.1408, + "grad_norm": 0.5225915908813477, + "learning_rate": 5.8756225602369086e-05, + "loss": 3.3514, + "step": 6366 + }, + { + "epoch": 8.14208, + "grad_norm": 0.5414659976959229, + "learning_rate": 5.871584331673172e-05, + "loss": 3.3684, + "step": 6367 + }, + { + "epoch": 8.14336, + "grad_norm": 0.5391842722892761, + "learning_rate": 5.867546103109435e-05, + "loss": 3.3141, + "step": 6368 + }, + { + "epoch": 8.14464, + "grad_norm": 0.5198903679847717, + "learning_rate": 5.863507874545699e-05, + "loss": 3.284, + "step": 6369 + }, + { + "epoch": 8.14592, + "grad_norm": 0.5301628112792969, + "learning_rate": 5.859469645981962e-05, + "loss": 3.2225, + "step": 6370 + }, + { + "epoch": 8.1472, + "grad_norm": 0.5428996682167053, + "learning_rate": 5.855431417418226e-05, + "loss": 3.3104, + "step": 6371 + }, + { + "epoch": 8.14848, + "grad_norm": 0.5339773297309875, + "learning_rate": 5.851393188854489e-05, + "loss": 3.3018, + "step": 6372 + }, + { + "epoch": 8.14976, + "grad_norm": 0.52174973487854, + "learning_rate": 5.8473549602907524e-05, + "loss": 3.2703, + "step": 6373 + }, + { + "epoch": 8.15104, + "grad_norm": 0.5308415293693542, + "learning_rate": 5.8433167317270154e-05, + "loss": 3.2691, + "step": 6374 + }, + { + "epoch": 8.15232, + "grad_norm": 0.5573276877403259, + "learning_rate": 5.839278503163279e-05, + "loss": 3.34, + "step": 6375 + }, + { + "epoch": 8.1536, + "grad_norm": 0.5412933230400085, + "learning_rate": 5.835240274599542e-05, + "loss": 3.286, + "step": 6376 + }, + { + "epoch": 8.15488, + "grad_norm": 0.5277814269065857, + "learning_rate": 5.8312020460358044e-05, + "loss": 3.3651, + "step": 6377 + }, + { + "epoch": 8.15616, + "grad_norm": 0.5270769596099854, + "learning_rate": 5.827163817472068e-05, + "loss": 3.3258, + "step": 6378 + }, + { + "epoch": 8.15744, + "grad_norm": 0.5277435779571533, + "learning_rate": 5.823125588908331e-05, + "loss": 3.1801, + "step": 6379 + }, + { + "epoch": 8.15872, + "grad_norm": 0.5209450125694275, + "learning_rate": 5.819087360344595e-05, + "loss": 3.2954, + "step": 6380 + }, + { + "epoch": 8.16, + "grad_norm": 0.5338136553764343, + "learning_rate": 5.815049131780858e-05, + "loss": 3.2964, + "step": 6381 + }, + { + "epoch": 8.16128, + "grad_norm": 0.547455370426178, + "learning_rate": 5.8110109032171215e-05, + "loss": 3.3137, + "step": 6382 + }, + { + "epoch": 8.16256, + "grad_norm": 0.5422819256782532, + "learning_rate": 5.8069726746533845e-05, + "loss": 3.2683, + "step": 6383 + }, + { + "epoch": 8.16384, + "grad_norm": 0.536233127117157, + "learning_rate": 5.802934446089648e-05, + "loss": 3.3036, + "step": 6384 + }, + { + "epoch": 8.16512, + "grad_norm": 0.5339389443397522, + "learning_rate": 5.798896217525911e-05, + "loss": 3.3953, + "step": 6385 + }, + { + "epoch": 8.1664, + "grad_norm": 0.5347949266433716, + "learning_rate": 5.794857988962175e-05, + "loss": 3.3728, + "step": 6386 + }, + { + "epoch": 8.16768, + "grad_norm": 0.5407658219337463, + "learning_rate": 5.7908197603984386e-05, + "loss": 3.3447, + "step": 6387 + }, + { + "epoch": 8.16896, + "grad_norm": 0.5457347631454468, + "learning_rate": 5.7867815318347016e-05, + "loss": 3.3002, + "step": 6388 + }, + { + "epoch": 8.17024, + "grad_norm": 0.5405694842338562, + "learning_rate": 5.782743303270965e-05, + "loss": 3.2992, + "step": 6389 + }, + { + "epoch": 8.17152, + "grad_norm": 0.5365909934043884, + "learning_rate": 5.778705074707228e-05, + "loss": 3.3, + "step": 6390 + }, + { + "epoch": 8.1728, + "grad_norm": 0.5313602089881897, + "learning_rate": 5.7746668461434906e-05, + "loss": 3.2851, + "step": 6391 + }, + { + "epoch": 8.17408, + "grad_norm": 0.5397671461105347, + "learning_rate": 5.770628617579754e-05, + "loss": 3.3395, + "step": 6392 + }, + { + "epoch": 8.17536, + "grad_norm": 0.5422872304916382, + "learning_rate": 5.766590389016017e-05, + "loss": 3.2891, + "step": 6393 + }, + { + "epoch": 8.17664, + "grad_norm": 0.5259747505187988, + "learning_rate": 5.762552160452281e-05, + "loss": 3.2912, + "step": 6394 + }, + { + "epoch": 8.17792, + "grad_norm": 0.5186034440994263, + "learning_rate": 5.758513931888544e-05, + "loss": 3.2599, + "step": 6395 + }, + { + "epoch": 8.1792, + "grad_norm": 0.5253992080688477, + "learning_rate": 5.754475703324808e-05, + "loss": 3.2672, + "step": 6396 + }, + { + "epoch": 8.18048, + "grad_norm": 0.5303462743759155, + "learning_rate": 5.750437474761071e-05, + "loss": 3.3237, + "step": 6397 + }, + { + "epoch": 8.18176, + "grad_norm": 0.5276507139205933, + "learning_rate": 5.7463992461973344e-05, + "loss": 3.2787, + "step": 6398 + }, + { + "epoch": 8.18304, + "grad_norm": 0.5117890238761902, + "learning_rate": 5.7423610176335974e-05, + "loss": 3.2535, + "step": 6399 + }, + { + "epoch": 8.18432, + "grad_norm": 0.5390210151672363, + "learning_rate": 5.738322789069861e-05, + "loss": 3.2985, + "step": 6400 + }, + { + "epoch": 8.1856, + "grad_norm": 0.5502768754959106, + "learning_rate": 5.734284560506124e-05, + "loss": 3.2703, + "step": 6401 + }, + { + "epoch": 8.18688, + "grad_norm": 0.533101499080658, + "learning_rate": 5.730246331942388e-05, + "loss": 3.3369, + "step": 6402 + }, + { + "epoch": 8.18816, + "grad_norm": 0.5256611108779907, + "learning_rate": 5.726208103378651e-05, + "loss": 3.286, + "step": 6403 + }, + { + "epoch": 8.18944, + "grad_norm": 0.5344136357307434, + "learning_rate": 5.722169874814914e-05, + "loss": 3.2811, + "step": 6404 + }, + { + "epoch": 8.19072, + "grad_norm": 0.53043532371521, + "learning_rate": 5.718131646251177e-05, + "loss": 3.1944, + "step": 6405 + }, + { + "epoch": 8.192, + "grad_norm": 0.5280171036720276, + "learning_rate": 5.7140934176874405e-05, + "loss": 3.3055, + "step": 6406 + }, + { + "epoch": 8.19328, + "grad_norm": 0.5500734448432922, + "learning_rate": 5.7100551891237035e-05, + "loss": 3.3447, + "step": 6407 + }, + { + "epoch": 8.19456, + "grad_norm": 0.5249525308609009, + "learning_rate": 5.706016960559967e-05, + "loss": 3.2479, + "step": 6408 + }, + { + "epoch": 8.19584, + "grad_norm": 0.5362176895141602, + "learning_rate": 5.70197873199623e-05, + "loss": 3.3179, + "step": 6409 + }, + { + "epoch": 8.19712, + "grad_norm": 0.53925621509552, + "learning_rate": 5.697940503432494e-05, + "loss": 3.3497, + "step": 6410 + }, + { + "epoch": 8.1984, + "grad_norm": 0.5505580306053162, + "learning_rate": 5.693902274868757e-05, + "loss": 3.3724, + "step": 6411 + }, + { + "epoch": 8.19968, + "grad_norm": 0.5253210067749023, + "learning_rate": 5.6898640463050205e-05, + "loss": 3.2298, + "step": 6412 + }, + { + "epoch": 8.20096, + "grad_norm": 0.5256701707839966, + "learning_rate": 5.6858258177412836e-05, + "loss": 3.3066, + "step": 6413 + }, + { + "epoch": 8.20224, + "grad_norm": 0.5349451899528503, + "learning_rate": 5.681787589177547e-05, + "loss": 3.3191, + "step": 6414 + }, + { + "epoch": 8.20352, + "grad_norm": 0.5368797779083252, + "learning_rate": 5.67774936061381e-05, + "loss": 3.2995, + "step": 6415 + }, + { + "epoch": 8.2048, + "grad_norm": 0.5200009942054749, + "learning_rate": 5.673711132050074e-05, + "loss": 3.3066, + "step": 6416 + }, + { + "epoch": 8.20608, + "grad_norm": 0.5384882092475891, + "learning_rate": 5.669672903486337e-05, + "loss": 3.2356, + "step": 6417 + }, + { + "epoch": 8.20736, + "grad_norm": 0.5600325465202332, + "learning_rate": 5.6656346749226e-05, + "loss": 3.341, + "step": 6418 + }, + { + "epoch": 8.20864, + "grad_norm": 0.5298986434936523, + "learning_rate": 5.661596446358863e-05, + "loss": 3.278, + "step": 6419 + }, + { + "epoch": 8.20992, + "grad_norm": 0.5479633808135986, + "learning_rate": 5.6575582177951266e-05, + "loss": 3.313, + "step": 6420 + }, + { + "epoch": 8.2112, + "grad_norm": 0.5331804156303406, + "learning_rate": 5.6535199892313896e-05, + "loss": 3.2668, + "step": 6421 + }, + { + "epoch": 8.21248, + "grad_norm": 0.5298319458961487, + "learning_rate": 5.649481760667653e-05, + "loss": 3.2764, + "step": 6422 + }, + { + "epoch": 8.21376, + "grad_norm": 0.5214959383010864, + "learning_rate": 5.645443532103916e-05, + "loss": 3.2612, + "step": 6423 + }, + { + "epoch": 8.21504, + "grad_norm": 0.5243988633155823, + "learning_rate": 5.64140530354018e-05, + "loss": 3.2165, + "step": 6424 + }, + { + "epoch": 8.21632, + "grad_norm": 0.5409027934074402, + "learning_rate": 5.637367074976443e-05, + "loss": 3.3805, + "step": 6425 + }, + { + "epoch": 8.2176, + "grad_norm": 0.5333059430122375, + "learning_rate": 5.633328846412707e-05, + "loss": 3.3277, + "step": 6426 + }, + { + "epoch": 8.21888, + "grad_norm": 0.5418739914894104, + "learning_rate": 5.62929061784897e-05, + "loss": 3.3094, + "step": 6427 + }, + { + "epoch": 8.22016, + "grad_norm": 0.544068455696106, + "learning_rate": 5.6252523892852334e-05, + "loss": 3.3722, + "step": 6428 + }, + { + "epoch": 8.22144, + "grad_norm": 0.5321391820907593, + "learning_rate": 5.6212141607214964e-05, + "loss": 3.2523, + "step": 6429 + }, + { + "epoch": 8.22272, + "grad_norm": 0.538314163684845, + "learning_rate": 5.61717593215776e-05, + "loss": 3.3437, + "step": 6430 + }, + { + "epoch": 8.224, + "grad_norm": 0.519304096698761, + "learning_rate": 5.6131377035940224e-05, + "loss": 3.3433, + "step": 6431 + }, + { + "epoch": 8.22528, + "grad_norm": 0.5347556471824646, + "learning_rate": 5.609099475030286e-05, + "loss": 3.2909, + "step": 6432 + }, + { + "epoch": 8.22656, + "grad_norm": 0.5274844169616699, + "learning_rate": 5.605061246466549e-05, + "loss": 3.31, + "step": 6433 + }, + { + "epoch": 8.22784, + "grad_norm": 0.516024112701416, + "learning_rate": 5.601023017902813e-05, + "loss": 3.2947, + "step": 6434 + }, + { + "epoch": 8.22912, + "grad_norm": 0.5266866087913513, + "learning_rate": 5.596984789339076e-05, + "loss": 3.2854, + "step": 6435 + }, + { + "epoch": 8.2304, + "grad_norm": 0.537972629070282, + "learning_rate": 5.5929465607753395e-05, + "loss": 3.3274, + "step": 6436 + }, + { + "epoch": 8.23168, + "grad_norm": 0.5341406464576721, + "learning_rate": 5.5889083322116025e-05, + "loss": 3.2714, + "step": 6437 + }, + { + "epoch": 8.23296, + "grad_norm": 0.5305014252662659, + "learning_rate": 5.584870103647866e-05, + "loss": 3.2615, + "step": 6438 + }, + { + "epoch": 8.23424, + "grad_norm": 0.5244127511978149, + "learning_rate": 5.580831875084129e-05, + "loss": 3.328, + "step": 6439 + }, + { + "epoch": 8.23552, + "grad_norm": 0.54160475730896, + "learning_rate": 5.576793646520393e-05, + "loss": 3.258, + "step": 6440 + }, + { + "epoch": 8.2368, + "grad_norm": 0.5337183475494385, + "learning_rate": 5.572755417956656e-05, + "loss": 3.279, + "step": 6441 + }, + { + "epoch": 8.23808, + "grad_norm": 0.5248258709907532, + "learning_rate": 5.5687171893929196e-05, + "loss": 3.2938, + "step": 6442 + }, + { + "epoch": 8.23936, + "grad_norm": 0.5294501185417175, + "learning_rate": 5.5646789608291826e-05, + "loss": 3.2288, + "step": 6443 + }, + { + "epoch": 8.24064, + "grad_norm": 0.5414791703224182, + "learning_rate": 5.560640732265446e-05, + "loss": 3.2559, + "step": 6444 + }, + { + "epoch": 8.24192, + "grad_norm": 0.5217268466949463, + "learning_rate": 5.5566025037017086e-05, + "loss": 3.341, + "step": 6445 + }, + { + "epoch": 8.2432, + "grad_norm": 0.5463381409645081, + "learning_rate": 5.552564275137972e-05, + "loss": 3.3459, + "step": 6446 + }, + { + "epoch": 8.24448, + "grad_norm": 0.5450925827026367, + "learning_rate": 5.548526046574235e-05, + "loss": 3.287, + "step": 6447 + }, + { + "epoch": 8.24576, + "grad_norm": 0.5325472950935364, + "learning_rate": 5.544487818010499e-05, + "loss": 3.285, + "step": 6448 + }, + { + "epoch": 8.24704, + "grad_norm": 0.5504860877990723, + "learning_rate": 5.540449589446762e-05, + "loss": 3.3032, + "step": 6449 + }, + { + "epoch": 8.24832, + "grad_norm": 0.5341525077819824, + "learning_rate": 5.536411360883026e-05, + "loss": 3.2986, + "step": 6450 + }, + { + "epoch": 8.2496, + "grad_norm": 0.5327278971672058, + "learning_rate": 5.532373132319289e-05, + "loss": 3.2495, + "step": 6451 + }, + { + "epoch": 8.25088, + "grad_norm": 0.5406553745269775, + "learning_rate": 5.5283349037555524e-05, + "loss": 3.3444, + "step": 6452 + }, + { + "epoch": 8.25216, + "grad_norm": 0.5242081880569458, + "learning_rate": 5.5242966751918154e-05, + "loss": 3.264, + "step": 6453 + }, + { + "epoch": 8.25344, + "grad_norm": 0.534663200378418, + "learning_rate": 5.520258446628079e-05, + "loss": 3.3117, + "step": 6454 + }, + { + "epoch": 8.25472, + "grad_norm": 0.5145909190177917, + "learning_rate": 5.516220218064342e-05, + "loss": 3.255, + "step": 6455 + }, + { + "epoch": 8.256, + "grad_norm": 0.5177584886550903, + "learning_rate": 5.512181989500606e-05, + "loss": 3.2581, + "step": 6456 + }, + { + "epoch": 8.25728, + "grad_norm": 0.5454705357551575, + "learning_rate": 5.508143760936869e-05, + "loss": 3.2777, + "step": 6457 + }, + { + "epoch": 8.25856, + "grad_norm": 0.5337566137313843, + "learning_rate": 5.504105532373132e-05, + "loss": 3.2425, + "step": 6458 + }, + { + "epoch": 8.25984, + "grad_norm": 0.5299128293991089, + "learning_rate": 5.500067303809395e-05, + "loss": 3.3318, + "step": 6459 + }, + { + "epoch": 8.26112, + "grad_norm": 0.5477068424224854, + "learning_rate": 5.4960290752456585e-05, + "loss": 3.3498, + "step": 6460 + }, + { + "epoch": 8.2624, + "grad_norm": 0.5416424870491028, + "learning_rate": 5.4919908466819215e-05, + "loss": 3.3292, + "step": 6461 + }, + { + "epoch": 8.26368, + "grad_norm": 0.5232448577880859, + "learning_rate": 5.487952618118185e-05, + "loss": 3.282, + "step": 6462 + }, + { + "epoch": 8.26496, + "grad_norm": 0.5199217796325684, + "learning_rate": 5.483914389554448e-05, + "loss": 3.3211, + "step": 6463 + }, + { + "epoch": 8.26624, + "grad_norm": 0.5238966941833496, + "learning_rate": 5.479876160990712e-05, + "loss": 3.2954, + "step": 6464 + }, + { + "epoch": 8.26752, + "grad_norm": 0.5222386121749878, + "learning_rate": 5.475837932426975e-05, + "loss": 3.3141, + "step": 6465 + }, + { + "epoch": 8.2688, + "grad_norm": 0.5436253547668457, + "learning_rate": 5.4717997038632385e-05, + "loss": 3.3027, + "step": 6466 + }, + { + "epoch": 8.27008, + "grad_norm": 0.5344957709312439, + "learning_rate": 5.4677614752995015e-05, + "loss": 3.3603, + "step": 6467 + }, + { + "epoch": 8.27136, + "grad_norm": 0.5461180806159973, + "learning_rate": 5.463723246735765e-05, + "loss": 3.3247, + "step": 6468 + }, + { + "epoch": 8.272639999999999, + "grad_norm": 0.5331898927688599, + "learning_rate": 5.459685018172028e-05, + "loss": 3.2901, + "step": 6469 + }, + { + "epoch": 8.27392, + "grad_norm": 0.5247555375099182, + "learning_rate": 5.455646789608292e-05, + "loss": 3.2163, + "step": 6470 + }, + { + "epoch": 8.2752, + "grad_norm": 0.5188766121864319, + "learning_rate": 5.451608561044554e-05, + "loss": 3.2779, + "step": 6471 + }, + { + "epoch": 8.27648, + "grad_norm": 0.5405585169792175, + "learning_rate": 5.447570332480817e-05, + "loss": 3.2365, + "step": 6472 + }, + { + "epoch": 8.27776, + "grad_norm": 0.529931366443634, + "learning_rate": 5.443532103917081e-05, + "loss": 3.3514, + "step": 6473 + }, + { + "epoch": 8.27904, + "grad_norm": 0.5279687643051147, + "learning_rate": 5.4394938753533446e-05, + "loss": 3.2361, + "step": 6474 + }, + { + "epoch": 8.28032, + "grad_norm": 0.537344753742218, + "learning_rate": 5.4354556467896076e-05, + "loss": 3.2811, + "step": 6475 + }, + { + "epoch": 8.2816, + "grad_norm": 0.5340175628662109, + "learning_rate": 5.431417418225871e-05, + "loss": 3.2844, + "step": 6476 + }, + { + "epoch": 8.28288, + "grad_norm": 0.5298900604248047, + "learning_rate": 5.427379189662134e-05, + "loss": 3.3131, + "step": 6477 + }, + { + "epoch": 8.28416, + "grad_norm": 0.5220656394958496, + "learning_rate": 5.423340961098398e-05, + "loss": 3.2696, + "step": 6478 + }, + { + "epoch": 8.28544, + "grad_norm": 0.5230215191841125, + "learning_rate": 5.419302732534661e-05, + "loss": 3.2052, + "step": 6479 + }, + { + "epoch": 8.28672, + "grad_norm": 0.5259697437286377, + "learning_rate": 5.415264503970925e-05, + "loss": 3.2847, + "step": 6480 + }, + { + "epoch": 8.288, + "grad_norm": 0.5399904251098633, + "learning_rate": 5.411226275407188e-05, + "loss": 3.3553, + "step": 6481 + }, + { + "epoch": 8.28928, + "grad_norm": 0.5356093645095825, + "learning_rate": 5.4071880468434514e-05, + "loss": 3.2998, + "step": 6482 + }, + { + "epoch": 8.29056, + "grad_norm": 0.5491902232170105, + "learning_rate": 5.4031498182797144e-05, + "loss": 3.3246, + "step": 6483 + }, + { + "epoch": 8.29184, + "grad_norm": 0.5284157991409302, + "learning_rate": 5.399111589715978e-05, + "loss": 3.3414, + "step": 6484 + }, + { + "epoch": 8.29312, + "grad_norm": 0.5413424372673035, + "learning_rate": 5.3950733611522404e-05, + "loss": 3.3315, + "step": 6485 + }, + { + "epoch": 8.2944, + "grad_norm": 0.5393568873405457, + "learning_rate": 5.3910351325885034e-05, + "loss": 3.3165, + "step": 6486 + }, + { + "epoch": 8.29568, + "grad_norm": 0.5338280200958252, + "learning_rate": 5.386996904024767e-05, + "loss": 3.2115, + "step": 6487 + }, + { + "epoch": 8.29696, + "grad_norm": 0.5379251837730408, + "learning_rate": 5.38295867546103e-05, + "loss": 3.2415, + "step": 6488 + }, + { + "epoch": 8.29824, + "grad_norm": 0.5346460938453674, + "learning_rate": 5.378920446897294e-05, + "loss": 3.2432, + "step": 6489 + }, + { + "epoch": 8.29952, + "grad_norm": 0.5359485149383545, + "learning_rate": 5.374882218333557e-05, + "loss": 3.285, + "step": 6490 + }, + { + "epoch": 8.3008, + "grad_norm": 0.5321866869926453, + "learning_rate": 5.3708439897698205e-05, + "loss": 3.2818, + "step": 6491 + }, + { + "epoch": 8.30208, + "grad_norm": 0.5524401068687439, + "learning_rate": 5.366805761206084e-05, + "loss": 3.317, + "step": 6492 + }, + { + "epoch": 8.30336, + "grad_norm": 0.5479095578193665, + "learning_rate": 5.362767532642347e-05, + "loss": 3.3955, + "step": 6493 + }, + { + "epoch": 8.30464, + "grad_norm": 0.5439134836196899, + "learning_rate": 5.358729304078611e-05, + "loss": 3.3354, + "step": 6494 + }, + { + "epoch": 8.30592, + "grad_norm": 0.5380696058273315, + "learning_rate": 5.354691075514874e-05, + "loss": 3.2792, + "step": 6495 + }, + { + "epoch": 8.3072, + "grad_norm": 0.537322998046875, + "learning_rate": 5.3506528469511376e-05, + "loss": 3.3121, + "step": 6496 + }, + { + "epoch": 8.30848, + "grad_norm": 0.5562627911567688, + "learning_rate": 5.3466146183874006e-05, + "loss": 3.3261, + "step": 6497 + }, + { + "epoch": 8.30976, + "grad_norm": 0.5425490140914917, + "learning_rate": 5.342576389823663e-05, + "loss": 3.3206, + "step": 6498 + }, + { + "epoch": 8.31104, + "grad_norm": 0.5292763113975525, + "learning_rate": 5.3385381612599266e-05, + "loss": 3.283, + "step": 6499 + }, + { + "epoch": 8.31232, + "grad_norm": 0.5407996773719788, + "learning_rate": 5.3344999326961896e-05, + "loss": 3.2909, + "step": 6500 + }, + { + "epoch": 8.3136, + "grad_norm": 0.547214925289154, + "learning_rate": 5.330461704132453e-05, + "loss": 3.3512, + "step": 6501 + }, + { + "epoch": 8.31488, + "grad_norm": 0.5352652668952942, + "learning_rate": 5.326423475568716e-05, + "loss": 3.337, + "step": 6502 + }, + { + "epoch": 8.31616, + "grad_norm": 0.5354547500610352, + "learning_rate": 5.32238524700498e-05, + "loss": 3.2523, + "step": 6503 + }, + { + "epoch": 8.31744, + "grad_norm": 0.5254250168800354, + "learning_rate": 5.318347018441243e-05, + "loss": 3.3103, + "step": 6504 + }, + { + "epoch": 8.31872, + "grad_norm": 0.5346370935440063, + "learning_rate": 5.314308789877507e-05, + "loss": 3.2874, + "step": 6505 + }, + { + "epoch": 8.32, + "grad_norm": 0.5400400161743164, + "learning_rate": 5.31027056131377e-05, + "loss": 3.3643, + "step": 6506 + }, + { + "epoch": 8.32128, + "grad_norm": 0.5396955609321594, + "learning_rate": 5.3062323327500334e-05, + "loss": 3.2664, + "step": 6507 + }, + { + "epoch": 8.32256, + "grad_norm": 0.5290980339050293, + "learning_rate": 5.3021941041862964e-05, + "loss": 3.2937, + "step": 6508 + }, + { + "epoch": 8.32384, + "grad_norm": 0.5446150898933411, + "learning_rate": 5.29815587562256e-05, + "loss": 3.3385, + "step": 6509 + }, + { + "epoch": 8.32512, + "grad_norm": 0.5492910146713257, + "learning_rate": 5.294117647058824e-05, + "loss": 3.2376, + "step": 6510 + }, + { + "epoch": 8.3264, + "grad_norm": 0.518017053604126, + "learning_rate": 5.290079418495087e-05, + "loss": 3.2401, + "step": 6511 + }, + { + "epoch": 8.32768, + "grad_norm": 0.527169406414032, + "learning_rate": 5.286041189931349e-05, + "loss": 3.2828, + "step": 6512 + }, + { + "epoch": 8.32896, + "grad_norm": 0.5413787364959717, + "learning_rate": 5.282002961367613e-05, + "loss": 3.338, + "step": 6513 + }, + { + "epoch": 8.33024, + "grad_norm": 0.5491598844528198, + "learning_rate": 5.277964732803876e-05, + "loss": 3.3348, + "step": 6514 + }, + { + "epoch": 8.33152, + "grad_norm": 0.529373049736023, + "learning_rate": 5.2739265042401395e-05, + "loss": 3.3055, + "step": 6515 + }, + { + "epoch": 8.3328, + "grad_norm": 0.5360579490661621, + "learning_rate": 5.2698882756764025e-05, + "loss": 3.2354, + "step": 6516 + }, + { + "epoch": 8.33408, + "grad_norm": 0.5181642770767212, + "learning_rate": 5.265850047112666e-05, + "loss": 3.2713, + "step": 6517 + }, + { + "epoch": 8.33536, + "grad_norm": 0.5401854515075684, + "learning_rate": 5.261811818548929e-05, + "loss": 3.314, + "step": 6518 + }, + { + "epoch": 8.33664, + "grad_norm": 0.5461652874946594, + "learning_rate": 5.257773589985193e-05, + "loss": 3.2553, + "step": 6519 + }, + { + "epoch": 8.33792, + "grad_norm": 0.5387153625488281, + "learning_rate": 5.253735361421456e-05, + "loss": 3.3327, + "step": 6520 + }, + { + "epoch": 8.3392, + "grad_norm": 0.5197851657867432, + "learning_rate": 5.2496971328577195e-05, + "loss": 3.3562, + "step": 6521 + }, + { + "epoch": 8.34048, + "grad_norm": 0.5335418581962585, + "learning_rate": 5.2456589042939825e-05, + "loss": 3.3083, + "step": 6522 + }, + { + "epoch": 8.34176, + "grad_norm": 0.5208085775375366, + "learning_rate": 5.241620675730246e-05, + "loss": 3.2347, + "step": 6523 + }, + { + "epoch": 8.34304, + "grad_norm": 0.5384852290153503, + "learning_rate": 5.237582447166509e-05, + "loss": 3.285, + "step": 6524 + }, + { + "epoch": 8.34432, + "grad_norm": 0.5363668203353882, + "learning_rate": 5.233544218602772e-05, + "loss": 3.3247, + "step": 6525 + }, + { + "epoch": 8.3456, + "grad_norm": 0.5379889011383057, + "learning_rate": 5.229505990039035e-05, + "loss": 3.3292, + "step": 6526 + }, + { + "epoch": 8.34688, + "grad_norm": 0.5510496497154236, + "learning_rate": 5.225467761475299e-05, + "loss": 3.3561, + "step": 6527 + }, + { + "epoch": 8.34816, + "grad_norm": 0.5368717908859253, + "learning_rate": 5.221429532911562e-05, + "loss": 3.2615, + "step": 6528 + }, + { + "epoch": 8.34944, + "grad_norm": 0.5422154068946838, + "learning_rate": 5.2173913043478256e-05, + "loss": 3.2918, + "step": 6529 + }, + { + "epoch": 8.35072, + "grad_norm": 0.5283968448638916, + "learning_rate": 5.2133530757840886e-05, + "loss": 3.2817, + "step": 6530 + }, + { + "epoch": 8.352, + "grad_norm": 0.5295470952987671, + "learning_rate": 5.209314847220352e-05, + "loss": 3.2432, + "step": 6531 + }, + { + "epoch": 8.35328, + "grad_norm": 0.5349634885787964, + "learning_rate": 5.205276618656615e-05, + "loss": 3.2913, + "step": 6532 + }, + { + "epoch": 8.35456, + "grad_norm": 0.5370112657546997, + "learning_rate": 5.201238390092879e-05, + "loss": 3.2895, + "step": 6533 + }, + { + "epoch": 8.35584, + "grad_norm": 0.5439594984054565, + "learning_rate": 5.197200161529142e-05, + "loss": 3.3035, + "step": 6534 + }, + { + "epoch": 8.35712, + "grad_norm": 0.5383702516555786, + "learning_rate": 5.193161932965406e-05, + "loss": 3.3121, + "step": 6535 + }, + { + "epoch": 8.3584, + "grad_norm": 0.5429950952529907, + "learning_rate": 5.189123704401669e-05, + "loss": 3.3598, + "step": 6536 + }, + { + "epoch": 8.35968, + "grad_norm": 0.5377165675163269, + "learning_rate": 5.1850854758379324e-05, + "loss": 3.2763, + "step": 6537 + }, + { + "epoch": 8.36096, + "grad_norm": 0.5333889126777649, + "learning_rate": 5.1810472472741954e-05, + "loss": 3.2872, + "step": 6538 + }, + { + "epoch": 8.36224, + "grad_norm": 0.5367612242698669, + "learning_rate": 5.1770090187104584e-05, + "loss": 3.2694, + "step": 6539 + }, + { + "epoch": 8.36352, + "grad_norm": 0.5359439253807068, + "learning_rate": 5.1729707901467214e-05, + "loss": 3.2822, + "step": 6540 + }, + { + "epoch": 8.3648, + "grad_norm": 0.531587541103363, + "learning_rate": 5.168932561582985e-05, + "loss": 3.3343, + "step": 6541 + }, + { + "epoch": 8.36608, + "grad_norm": 0.5479581952095032, + "learning_rate": 5.164894333019248e-05, + "loss": 3.2928, + "step": 6542 + }, + { + "epoch": 8.36736, + "grad_norm": 0.5197001099586487, + "learning_rate": 5.160856104455512e-05, + "loss": 3.3011, + "step": 6543 + }, + { + "epoch": 8.36864, + "grad_norm": 0.5361149311065674, + "learning_rate": 5.156817875891775e-05, + "loss": 3.2673, + "step": 6544 + }, + { + "epoch": 8.36992, + "grad_norm": 0.5395957827568054, + "learning_rate": 5.1527796473280385e-05, + "loss": 3.3285, + "step": 6545 + }, + { + "epoch": 8.3712, + "grad_norm": 0.538285493850708, + "learning_rate": 5.1487414187643015e-05, + "loss": 3.3397, + "step": 6546 + }, + { + "epoch": 8.37248, + "grad_norm": 0.5274959206581116, + "learning_rate": 5.144703190200565e-05, + "loss": 3.2938, + "step": 6547 + }, + { + "epoch": 8.37376, + "grad_norm": 0.5292668342590332, + "learning_rate": 5.140664961636828e-05, + "loss": 3.2236, + "step": 6548 + }, + { + "epoch": 8.37504, + "grad_norm": 0.5355281233787537, + "learning_rate": 5.136626733073092e-05, + "loss": 3.3204, + "step": 6549 + }, + { + "epoch": 8.37632, + "grad_norm": 0.5475792288780212, + "learning_rate": 5.132588504509355e-05, + "loss": 3.2975, + "step": 6550 + }, + { + "epoch": 8.3776, + "grad_norm": 0.5429788827896118, + "learning_rate": 5.1285502759456186e-05, + "loss": 3.2851, + "step": 6551 + }, + { + "epoch": 8.37888, + "grad_norm": 0.5370329022407532, + "learning_rate": 5.124512047381881e-05, + "loss": 3.32, + "step": 6552 + }, + { + "epoch": 8.38016, + "grad_norm": 0.5378831028938293, + "learning_rate": 5.1204738188181446e-05, + "loss": 3.2567, + "step": 6553 + }, + { + "epoch": 8.38144, + "grad_norm": 0.5425311326980591, + "learning_rate": 5.1164355902544076e-05, + "loss": 3.2856, + "step": 6554 + }, + { + "epoch": 8.38272, + "grad_norm": 0.5385202765464783, + "learning_rate": 5.112397361690671e-05, + "loss": 3.3203, + "step": 6555 + }, + { + "epoch": 8.384, + "grad_norm": 0.5249854922294617, + "learning_rate": 5.108359133126934e-05, + "loss": 3.2775, + "step": 6556 + }, + { + "epoch": 8.38528, + "grad_norm": 0.5497921705245972, + "learning_rate": 5.104320904563198e-05, + "loss": 3.375, + "step": 6557 + }, + { + "epoch": 8.38656, + "grad_norm": 0.5290823578834534, + "learning_rate": 5.100282675999461e-05, + "loss": 3.4186, + "step": 6558 + }, + { + "epoch": 8.38784, + "grad_norm": 0.5338512659072876, + "learning_rate": 5.096244447435725e-05, + "loss": 3.3006, + "step": 6559 + }, + { + "epoch": 8.38912, + "grad_norm": 0.5281364321708679, + "learning_rate": 5.092206218871988e-05, + "loss": 3.3595, + "step": 6560 + }, + { + "epoch": 8.3904, + "grad_norm": 0.5322358012199402, + "learning_rate": 5.0881679903082514e-05, + "loss": 3.2731, + "step": 6561 + }, + { + "epoch": 8.39168, + "grad_norm": 0.5335200428962708, + "learning_rate": 5.0841297617445144e-05, + "loss": 3.2838, + "step": 6562 + }, + { + "epoch": 8.39296, + "grad_norm": 0.5297821760177612, + "learning_rate": 5.080091533180778e-05, + "loss": 3.3354, + "step": 6563 + }, + { + "epoch": 8.39424, + "grad_norm": 0.5368463397026062, + "learning_rate": 5.076053304617041e-05, + "loss": 3.3192, + "step": 6564 + }, + { + "epoch": 8.39552, + "grad_norm": 0.5247287154197693, + "learning_rate": 5.072015076053304e-05, + "loss": 3.3062, + "step": 6565 + }, + { + "epoch": 8.3968, + "grad_norm": 0.540341317653656, + "learning_rate": 5.067976847489567e-05, + "loss": 3.3337, + "step": 6566 + }, + { + "epoch": 8.39808, + "grad_norm": 0.5336794853210449, + "learning_rate": 5.063938618925831e-05, + "loss": 3.2744, + "step": 6567 + }, + { + "epoch": 8.39936, + "grad_norm": 0.5233598351478577, + "learning_rate": 5.059900390362094e-05, + "loss": 3.2749, + "step": 6568 + }, + { + "epoch": 8.40064, + "grad_norm": 0.5248898267745972, + "learning_rate": 5.0558621617983575e-05, + "loss": 3.2279, + "step": 6569 + }, + { + "epoch": 8.40192, + "grad_norm": 0.5336484313011169, + "learning_rate": 5.0518239332346205e-05, + "loss": 3.2535, + "step": 6570 + }, + { + "epoch": 8.4032, + "grad_norm": 0.550398051738739, + "learning_rate": 5.047785704670884e-05, + "loss": 3.3059, + "step": 6571 + }, + { + "epoch": 8.40448, + "grad_norm": 0.5399869084358215, + "learning_rate": 5.043747476107147e-05, + "loss": 3.3524, + "step": 6572 + }, + { + "epoch": 8.40576, + "grad_norm": 0.5220796465873718, + "learning_rate": 5.039709247543411e-05, + "loss": 3.2817, + "step": 6573 + }, + { + "epoch": 8.40704, + "grad_norm": 0.5332768559455872, + "learning_rate": 5.035671018979674e-05, + "loss": 3.2817, + "step": 6574 + }, + { + "epoch": 8.40832, + "grad_norm": 0.5253673195838928, + "learning_rate": 5.0316327904159375e-05, + "loss": 3.2993, + "step": 6575 + }, + { + "epoch": 8.4096, + "grad_norm": 0.5454734563827515, + "learning_rate": 5.0275945618522005e-05, + "loss": 3.2808, + "step": 6576 + }, + { + "epoch": 8.41088, + "grad_norm": 0.5498039126396179, + "learning_rate": 5.023556333288464e-05, + "loss": 3.3277, + "step": 6577 + }, + { + "epoch": 8.41216, + "grad_norm": 0.52730792760849, + "learning_rate": 5.019518104724727e-05, + "loss": 3.2521, + "step": 6578 + }, + { + "epoch": 8.41344, + "grad_norm": 0.5146779417991638, + "learning_rate": 5.0154798761609896e-05, + "loss": 3.3197, + "step": 6579 + }, + { + "epoch": 8.414719999999999, + "grad_norm": 0.5284165740013123, + "learning_rate": 5.011441647597253e-05, + "loss": 3.3388, + "step": 6580 + }, + { + "epoch": 8.416, + "grad_norm": 0.5253511667251587, + "learning_rate": 5.007403419033517e-05, + "loss": 3.324, + "step": 6581 + }, + { + "epoch": 8.41728, + "grad_norm": 0.53730309009552, + "learning_rate": 5.00336519046978e-05, + "loss": 3.3254, + "step": 6582 + }, + { + "epoch": 8.41856, + "grad_norm": 0.5318365693092346, + "learning_rate": 4.9993269619060436e-05, + "loss": 3.3386, + "step": 6583 + }, + { + "epoch": 8.41984, + "grad_norm": 0.522120475769043, + "learning_rate": 4.9952887333423066e-05, + "loss": 3.2756, + "step": 6584 + }, + { + "epoch": 8.42112, + "grad_norm": 0.5341548919677734, + "learning_rate": 4.99125050477857e-05, + "loss": 3.2815, + "step": 6585 + }, + { + "epoch": 8.4224, + "grad_norm": 0.5254530310630798, + "learning_rate": 4.987212276214833e-05, + "loss": 3.3148, + "step": 6586 + }, + { + "epoch": 8.42368, + "grad_norm": 0.516562283039093, + "learning_rate": 4.983174047651097e-05, + "loss": 3.3143, + "step": 6587 + }, + { + "epoch": 8.42496, + "grad_norm": 0.5405133366584778, + "learning_rate": 4.97913581908736e-05, + "loss": 3.2954, + "step": 6588 + }, + { + "epoch": 8.42624, + "grad_norm": 0.5392136573791504, + "learning_rate": 4.975097590523624e-05, + "loss": 3.2129, + "step": 6589 + }, + { + "epoch": 8.42752, + "grad_norm": 0.5335177779197693, + "learning_rate": 4.971059361959887e-05, + "loss": 3.2511, + "step": 6590 + }, + { + "epoch": 8.4288, + "grad_norm": 0.5130221843719482, + "learning_rate": 4.9670211333961504e-05, + "loss": 3.2803, + "step": 6591 + }, + { + "epoch": 8.43008, + "grad_norm": 0.5377788543701172, + "learning_rate": 4.962982904832413e-05, + "loss": 3.3044, + "step": 6592 + }, + { + "epoch": 8.43136, + "grad_norm": 0.535243034362793, + "learning_rate": 4.958944676268676e-05, + "loss": 3.3071, + "step": 6593 + }, + { + "epoch": 8.43264, + "grad_norm": 0.5345697999000549, + "learning_rate": 4.9549064477049394e-05, + "loss": 3.3025, + "step": 6594 + }, + { + "epoch": 8.43392, + "grad_norm": 0.5292974710464478, + "learning_rate": 4.9508682191412024e-05, + "loss": 3.3006, + "step": 6595 + }, + { + "epoch": 8.4352, + "grad_norm": 0.5351077318191528, + "learning_rate": 4.946829990577466e-05, + "loss": 3.3633, + "step": 6596 + }, + { + "epoch": 8.43648, + "grad_norm": 0.546981155872345, + "learning_rate": 4.942791762013729e-05, + "loss": 3.3898, + "step": 6597 + }, + { + "epoch": 8.43776, + "grad_norm": 0.5353581309318542, + "learning_rate": 4.938753533449993e-05, + "loss": 3.2787, + "step": 6598 + }, + { + "epoch": 8.43904, + "grad_norm": 0.5259009003639221, + "learning_rate": 4.9347153048862565e-05, + "loss": 3.204, + "step": 6599 + }, + { + "epoch": 8.44032, + "grad_norm": 0.5294379591941833, + "learning_rate": 4.9306770763225195e-05, + "loss": 3.3074, + "step": 6600 + }, + { + "epoch": 8.4416, + "grad_norm": 0.5439087748527527, + "learning_rate": 4.926638847758783e-05, + "loss": 3.2858, + "step": 6601 + }, + { + "epoch": 8.44288, + "grad_norm": 0.5535233020782471, + "learning_rate": 4.922600619195046e-05, + "loss": 3.2854, + "step": 6602 + }, + { + "epoch": 8.44416, + "grad_norm": 0.5227919220924377, + "learning_rate": 4.91856239063131e-05, + "loss": 3.2956, + "step": 6603 + }, + { + "epoch": 8.44544, + "grad_norm": 0.525883674621582, + "learning_rate": 4.914524162067573e-05, + "loss": 3.3417, + "step": 6604 + }, + { + "epoch": 8.44672, + "grad_norm": 0.5234944224357605, + "learning_rate": 4.9104859335038366e-05, + "loss": 3.2156, + "step": 6605 + }, + { + "epoch": 8.448, + "grad_norm": 0.5246995091438293, + "learning_rate": 4.906447704940099e-05, + "loss": 3.2844, + "step": 6606 + }, + { + "epoch": 8.44928, + "grad_norm": 0.5292709469795227, + "learning_rate": 4.902409476376362e-05, + "loss": 3.3131, + "step": 6607 + }, + { + "epoch": 8.45056, + "grad_norm": 0.5411702394485474, + "learning_rate": 4.8983712478126256e-05, + "loss": 3.2209, + "step": 6608 + }, + { + "epoch": 8.45184, + "grad_norm": 0.5438616871833801, + "learning_rate": 4.8943330192488886e-05, + "loss": 3.3588, + "step": 6609 + }, + { + "epoch": 8.45312, + "grad_norm": 0.5124935507774353, + "learning_rate": 4.890294790685152e-05, + "loss": 3.2722, + "step": 6610 + }, + { + "epoch": 8.4544, + "grad_norm": 0.5373281240463257, + "learning_rate": 4.886256562121415e-05, + "loss": 3.2317, + "step": 6611 + }, + { + "epoch": 8.45568, + "grad_norm": 0.5373220443725586, + "learning_rate": 4.882218333557679e-05, + "loss": 3.2904, + "step": 6612 + }, + { + "epoch": 8.45696, + "grad_norm": 0.526901125907898, + "learning_rate": 4.878180104993942e-05, + "loss": 3.3668, + "step": 6613 + }, + { + "epoch": 8.45824, + "grad_norm": 0.5479453802108765, + "learning_rate": 4.874141876430206e-05, + "loss": 3.3688, + "step": 6614 + }, + { + "epoch": 8.45952, + "grad_norm": 0.5343961715698242, + "learning_rate": 4.870103647866469e-05, + "loss": 3.3287, + "step": 6615 + }, + { + "epoch": 8.4608, + "grad_norm": 0.5295287370681763, + "learning_rate": 4.8660654193027324e-05, + "loss": 3.3605, + "step": 6616 + }, + { + "epoch": 8.46208, + "grad_norm": 0.5407337546348572, + "learning_rate": 4.862027190738996e-05, + "loss": 3.2565, + "step": 6617 + }, + { + "epoch": 8.46336, + "grad_norm": 0.5319713950157166, + "learning_rate": 4.857988962175259e-05, + "loss": 3.2649, + "step": 6618 + }, + { + "epoch": 8.46464, + "grad_norm": 0.5339657664299011, + "learning_rate": 4.8539507336115214e-05, + "loss": 3.2722, + "step": 6619 + }, + { + "epoch": 8.46592, + "grad_norm": 0.5170288681983948, + "learning_rate": 4.849912505047785e-05, + "loss": 3.2732, + "step": 6620 + }, + { + "epoch": 8.4672, + "grad_norm": 0.5304208397865295, + "learning_rate": 4.845874276484048e-05, + "loss": 3.2746, + "step": 6621 + }, + { + "epoch": 8.46848, + "grad_norm": 0.5211367011070251, + "learning_rate": 4.841836047920312e-05, + "loss": 3.2367, + "step": 6622 + }, + { + "epoch": 8.46976, + "grad_norm": 0.5377680659294128, + "learning_rate": 4.837797819356575e-05, + "loss": 3.3002, + "step": 6623 + }, + { + "epoch": 8.47104, + "grad_norm": 0.532772421836853, + "learning_rate": 4.8337595907928385e-05, + "loss": 3.3249, + "step": 6624 + }, + { + "epoch": 8.47232, + "grad_norm": 0.5326812863349915, + "learning_rate": 4.8297213622291015e-05, + "loss": 3.277, + "step": 6625 + }, + { + "epoch": 8.4736, + "grad_norm": 0.532048761844635, + "learning_rate": 4.825683133665365e-05, + "loss": 3.3765, + "step": 6626 + }, + { + "epoch": 8.47488, + "grad_norm": 0.5287336707115173, + "learning_rate": 4.821644905101628e-05, + "loss": 3.3689, + "step": 6627 + }, + { + "epoch": 8.47616, + "grad_norm": 0.5333081483840942, + "learning_rate": 4.817606676537892e-05, + "loss": 3.3058, + "step": 6628 + }, + { + "epoch": 8.47744, + "grad_norm": 0.5384047031402588, + "learning_rate": 4.813568447974155e-05, + "loss": 3.2469, + "step": 6629 + }, + { + "epoch": 8.47872, + "grad_norm": 0.5469292998313904, + "learning_rate": 4.8095302194104185e-05, + "loss": 3.2969, + "step": 6630 + }, + { + "epoch": 8.48, + "grad_norm": 0.5489159226417542, + "learning_rate": 4.8054919908466815e-05, + "loss": 3.3041, + "step": 6631 + }, + { + "epoch": 8.48128, + "grad_norm": 0.5341904759407043, + "learning_rate": 4.801453762282945e-05, + "loss": 3.2527, + "step": 6632 + }, + { + "epoch": 8.48256, + "grad_norm": 0.5368700623512268, + "learning_rate": 4.7974155337192076e-05, + "loss": 3.2925, + "step": 6633 + }, + { + "epoch": 8.48384, + "grad_norm": 0.5526073575019836, + "learning_rate": 4.793377305155471e-05, + "loss": 3.2979, + "step": 6634 + }, + { + "epoch": 8.48512, + "grad_norm": 0.5394701361656189, + "learning_rate": 4.789339076591734e-05, + "loss": 3.3142, + "step": 6635 + }, + { + "epoch": 8.4864, + "grad_norm": 0.5406348705291748, + "learning_rate": 4.785300848027998e-05, + "loss": 3.277, + "step": 6636 + }, + { + "epoch": 8.48768, + "grad_norm": 0.5281654000282288, + "learning_rate": 4.781262619464261e-05, + "loss": 3.2689, + "step": 6637 + }, + { + "epoch": 8.48896, + "grad_norm": 0.519383430480957, + "learning_rate": 4.7772243909005246e-05, + "loss": 3.2823, + "step": 6638 + }, + { + "epoch": 8.49024, + "grad_norm": 0.5329931974411011, + "learning_rate": 4.7731861623367876e-05, + "loss": 3.3037, + "step": 6639 + }, + { + "epoch": 8.49152, + "grad_norm": 0.5432103872299194, + "learning_rate": 4.769147933773051e-05, + "loss": 3.3509, + "step": 6640 + }, + { + "epoch": 8.4928, + "grad_norm": 0.523330807685852, + "learning_rate": 4.765109705209314e-05, + "loss": 3.3401, + "step": 6641 + }, + { + "epoch": 8.49408, + "grad_norm": 0.5295400619506836, + "learning_rate": 4.761071476645578e-05, + "loss": 3.2433, + "step": 6642 + }, + { + "epoch": 8.49536, + "grad_norm": 0.5371250510215759, + "learning_rate": 4.757033248081841e-05, + "loss": 3.2771, + "step": 6643 + }, + { + "epoch": 8.49664, + "grad_norm": 0.5461635589599609, + "learning_rate": 4.752995019518105e-05, + "loss": 3.3759, + "step": 6644 + }, + { + "epoch": 8.49792, + "grad_norm": 0.5435198545455933, + "learning_rate": 4.748956790954368e-05, + "loss": 3.343, + "step": 6645 + }, + { + "epoch": 8.4992, + "grad_norm": 0.5389981269836426, + "learning_rate": 4.744918562390631e-05, + "loss": 3.2387, + "step": 6646 + }, + { + "epoch": 8.50048, + "grad_norm": 0.5361154675483704, + "learning_rate": 4.740880333826894e-05, + "loss": 3.312, + "step": 6647 + }, + { + "epoch": 8.50176, + "grad_norm": 0.5247445702552795, + "learning_rate": 4.7368421052631574e-05, + "loss": 3.2876, + "step": 6648 + }, + { + "epoch": 8.50304, + "grad_norm": 0.5444439053535461, + "learning_rate": 4.7328038766994204e-05, + "loss": 3.2557, + "step": 6649 + }, + { + "epoch": 8.50432, + "grad_norm": 0.549291729927063, + "learning_rate": 4.728765648135684e-05, + "loss": 3.291, + "step": 6650 + }, + { + "epoch": 8.5056, + "grad_norm": 0.5521546602249146, + "learning_rate": 4.724727419571947e-05, + "loss": 3.3253, + "step": 6651 + }, + { + "epoch": 8.50688, + "grad_norm": 0.5204415917396545, + "learning_rate": 4.720689191008211e-05, + "loss": 3.2887, + "step": 6652 + }, + { + "epoch": 8.50816, + "grad_norm": 0.5449312329292297, + "learning_rate": 4.716650962444474e-05, + "loss": 3.2632, + "step": 6653 + }, + { + "epoch": 8.50944, + "grad_norm": 0.558323323726654, + "learning_rate": 4.7126127338807375e-05, + "loss": 3.356, + "step": 6654 + }, + { + "epoch": 8.51072, + "grad_norm": 0.5528385639190674, + "learning_rate": 4.7085745053170005e-05, + "loss": 3.3238, + "step": 6655 + }, + { + "epoch": 8.512, + "grad_norm": 0.5442092418670654, + "learning_rate": 4.704536276753264e-05, + "loss": 3.1951, + "step": 6656 + }, + { + "epoch": 8.51328, + "grad_norm": 0.5273060202598572, + "learning_rate": 4.700498048189527e-05, + "loss": 3.2714, + "step": 6657 + }, + { + "epoch": 8.51456, + "grad_norm": 0.5554311871528625, + "learning_rate": 4.696459819625791e-05, + "loss": 3.3199, + "step": 6658 + }, + { + "epoch": 8.51584, + "grad_norm": 0.540524423122406, + "learning_rate": 4.692421591062053e-05, + "loss": 3.3171, + "step": 6659 + }, + { + "epoch": 8.51712, + "grad_norm": 0.543006420135498, + "learning_rate": 4.688383362498317e-05, + "loss": 3.3168, + "step": 6660 + }, + { + "epoch": 8.5184, + "grad_norm": 0.552527666091919, + "learning_rate": 4.68434513393458e-05, + "loss": 3.3671, + "step": 6661 + }, + { + "epoch": 8.51968, + "grad_norm": 0.5287879109382629, + "learning_rate": 4.6803069053708436e-05, + "loss": 3.2603, + "step": 6662 + }, + { + "epoch": 8.52096, + "grad_norm": 0.5465594530105591, + "learning_rate": 4.6762686768071066e-05, + "loss": 3.287, + "step": 6663 + }, + { + "epoch": 8.52224, + "grad_norm": 0.5492598414421082, + "learning_rate": 4.67223044824337e-05, + "loss": 3.299, + "step": 6664 + }, + { + "epoch": 8.52352, + "grad_norm": 0.534814178943634, + "learning_rate": 4.668192219679633e-05, + "loss": 3.2982, + "step": 6665 + }, + { + "epoch": 8.5248, + "grad_norm": 0.5302172303199768, + "learning_rate": 4.664153991115897e-05, + "loss": 3.2304, + "step": 6666 + }, + { + "epoch": 8.52608, + "grad_norm": 0.5542375445365906, + "learning_rate": 4.66011576255216e-05, + "loss": 3.3307, + "step": 6667 + }, + { + "epoch": 8.52736, + "grad_norm": 0.5232375860214233, + "learning_rate": 4.6560775339884237e-05, + "loss": 3.2925, + "step": 6668 + }, + { + "epoch": 8.52864, + "grad_norm": 0.5362772941589355, + "learning_rate": 4.652039305424687e-05, + "loss": 3.3613, + "step": 6669 + }, + { + "epoch": 8.52992, + "grad_norm": 0.5412434935569763, + "learning_rate": 4.6480010768609504e-05, + "loss": 3.3056, + "step": 6670 + }, + { + "epoch": 8.5312, + "grad_norm": 0.5264190435409546, + "learning_rate": 4.6439628482972134e-05, + "loss": 3.2676, + "step": 6671 + }, + { + "epoch": 8.53248, + "grad_norm": 0.5494565367698669, + "learning_rate": 4.639924619733477e-05, + "loss": 3.3138, + "step": 6672 + }, + { + "epoch": 8.533760000000001, + "grad_norm": 0.5212265253067017, + "learning_rate": 4.6358863911697394e-05, + "loss": 3.3393, + "step": 6673 + }, + { + "epoch": 8.53504, + "grad_norm": 0.5425857901573181, + "learning_rate": 4.631848162606003e-05, + "loss": 3.3338, + "step": 6674 + }, + { + "epoch": 8.53632, + "grad_norm": 0.5471029877662659, + "learning_rate": 4.627809934042266e-05, + "loss": 3.2664, + "step": 6675 + }, + { + "epoch": 8.5376, + "grad_norm": 0.5544648170471191, + "learning_rate": 4.62377170547853e-05, + "loss": 3.2952, + "step": 6676 + }, + { + "epoch": 8.53888, + "grad_norm": 0.5421332716941833, + "learning_rate": 4.619733476914793e-05, + "loss": 3.3276, + "step": 6677 + }, + { + "epoch": 8.54016, + "grad_norm": 0.5286380052566528, + "learning_rate": 4.6156952483510564e-05, + "loss": 3.2644, + "step": 6678 + }, + { + "epoch": 8.54144, + "grad_norm": 0.5330486297607422, + "learning_rate": 4.6116570197873195e-05, + "loss": 3.3331, + "step": 6679 + }, + { + "epoch": 8.54272, + "grad_norm": 0.5347878932952881, + "learning_rate": 4.607618791223583e-05, + "loss": 3.2441, + "step": 6680 + }, + { + "epoch": 8.544, + "grad_norm": 0.5383884906768799, + "learning_rate": 4.603580562659846e-05, + "loss": 3.3088, + "step": 6681 + }, + { + "epoch": 8.54528, + "grad_norm": 0.5470399856567383, + "learning_rate": 4.59954233409611e-05, + "loss": 3.2866, + "step": 6682 + }, + { + "epoch": 8.54656, + "grad_norm": 0.5385159254074097, + "learning_rate": 4.595504105532373e-05, + "loss": 3.248, + "step": 6683 + }, + { + "epoch": 8.54784, + "grad_norm": 0.5333342552185059, + "learning_rate": 4.5914658769686365e-05, + "loss": 3.3123, + "step": 6684 + }, + { + "epoch": 8.54912, + "grad_norm": 0.5450199842453003, + "learning_rate": 4.5874276484048995e-05, + "loss": 3.2946, + "step": 6685 + }, + { + "epoch": 8.5504, + "grad_norm": 0.552905797958374, + "learning_rate": 4.5833894198411625e-05, + "loss": 3.3255, + "step": 6686 + }, + { + "epoch": 8.55168, + "grad_norm": 0.5411385297775269, + "learning_rate": 4.5793511912774255e-05, + "loss": 3.3116, + "step": 6687 + }, + { + "epoch": 8.55296, + "grad_norm": 0.5458307266235352, + "learning_rate": 4.575312962713689e-05, + "loss": 3.3065, + "step": 6688 + }, + { + "epoch": 8.55424, + "grad_norm": 0.526465892791748, + "learning_rate": 4.571274734149952e-05, + "loss": 3.3649, + "step": 6689 + }, + { + "epoch": 8.55552, + "grad_norm": 0.5384424924850464, + "learning_rate": 4.567236505586216e-05, + "loss": 3.3591, + "step": 6690 + }, + { + "epoch": 8.556799999999999, + "grad_norm": 0.5338189601898193, + "learning_rate": 4.563198277022479e-05, + "loss": 3.2857, + "step": 6691 + }, + { + "epoch": 8.55808, + "grad_norm": 0.5296444296836853, + "learning_rate": 4.5591600484587426e-05, + "loss": 3.2774, + "step": 6692 + }, + { + "epoch": 8.55936, + "grad_norm": 0.5314897894859314, + "learning_rate": 4.5551218198950056e-05, + "loss": 3.3251, + "step": 6693 + }, + { + "epoch": 8.56064, + "grad_norm": 0.5258769989013672, + "learning_rate": 4.551083591331269e-05, + "loss": 3.285, + "step": 6694 + }, + { + "epoch": 8.56192, + "grad_norm": 0.5228273868560791, + "learning_rate": 4.547045362767532e-05, + "loss": 3.327, + "step": 6695 + }, + { + "epoch": 8.5632, + "grad_norm": 0.5333476662635803, + "learning_rate": 4.543007134203796e-05, + "loss": 3.2938, + "step": 6696 + }, + { + "epoch": 8.56448, + "grad_norm": 0.5421110987663269, + "learning_rate": 4.538968905640059e-05, + "loss": 3.3763, + "step": 6697 + }, + { + "epoch": 8.565760000000001, + "grad_norm": 0.526771605014801, + "learning_rate": 4.534930677076323e-05, + "loss": 3.2587, + "step": 6698 + }, + { + "epoch": 8.56704, + "grad_norm": 0.5357040762901306, + "learning_rate": 4.530892448512586e-05, + "loss": 3.3598, + "step": 6699 + }, + { + "epoch": 8.56832, + "grad_norm": 0.5350444316864014, + "learning_rate": 4.526854219948848e-05, + "loss": 3.2902, + "step": 6700 + }, + { + "epoch": 8.5696, + "grad_norm": 0.524205207824707, + "learning_rate": 4.522815991385112e-05, + "loss": 3.3145, + "step": 6701 + }, + { + "epoch": 8.57088, + "grad_norm": 0.5205404758453369, + "learning_rate": 4.518777762821375e-05, + "loss": 3.2893, + "step": 6702 + }, + { + "epoch": 8.57216, + "grad_norm": 0.5430803894996643, + "learning_rate": 4.5147395342576384e-05, + "loss": 3.3511, + "step": 6703 + }, + { + "epoch": 8.57344, + "grad_norm": 0.5332580804824829, + "learning_rate": 4.510701305693902e-05, + "loss": 3.3326, + "step": 6704 + }, + { + "epoch": 8.57472, + "grad_norm": 0.5290676951408386, + "learning_rate": 4.506663077130165e-05, + "loss": 3.2984, + "step": 6705 + }, + { + "epoch": 8.576, + "grad_norm": 0.5440507531166077, + "learning_rate": 4.502624848566429e-05, + "loss": 3.3492, + "step": 6706 + }, + { + "epoch": 8.57728, + "grad_norm": 0.5364778637886047, + "learning_rate": 4.498586620002692e-05, + "loss": 3.2265, + "step": 6707 + }, + { + "epoch": 8.57856, + "grad_norm": 0.5364824533462524, + "learning_rate": 4.4945483914389555e-05, + "loss": 3.291, + "step": 6708 + }, + { + "epoch": 8.57984, + "grad_norm": 0.5298408269882202, + "learning_rate": 4.4905101628752185e-05, + "loss": 3.3376, + "step": 6709 + }, + { + "epoch": 8.58112, + "grad_norm": 0.5235030651092529, + "learning_rate": 4.486471934311482e-05, + "loss": 3.3014, + "step": 6710 + }, + { + "epoch": 8.5824, + "grad_norm": 0.5457448959350586, + "learning_rate": 4.482433705747745e-05, + "loss": 3.3132, + "step": 6711 + }, + { + "epoch": 8.58368, + "grad_norm": 0.5244759321212769, + "learning_rate": 4.478395477184009e-05, + "loss": 3.3322, + "step": 6712 + }, + { + "epoch": 8.58496, + "grad_norm": 0.5247620344161987, + "learning_rate": 4.474357248620271e-05, + "loss": 3.3435, + "step": 6713 + }, + { + "epoch": 8.58624, + "grad_norm": 0.5359330177307129, + "learning_rate": 4.470319020056534e-05, + "loss": 3.2807, + "step": 6714 + }, + { + "epoch": 8.58752, + "grad_norm": 0.5311152338981628, + "learning_rate": 4.466280791492798e-05, + "loss": 3.2728, + "step": 6715 + }, + { + "epoch": 8.588799999999999, + "grad_norm": 0.5377166867256165, + "learning_rate": 4.462242562929061e-05, + "loss": 3.2665, + "step": 6716 + }, + { + "epoch": 8.59008, + "grad_norm": 0.5189564824104309, + "learning_rate": 4.4582043343653246e-05, + "loss": 3.2582, + "step": 6717 + }, + { + "epoch": 8.59136, + "grad_norm": 0.5309081673622131, + "learning_rate": 4.4541661058015876e-05, + "loss": 3.3129, + "step": 6718 + }, + { + "epoch": 8.59264, + "grad_norm": 0.536917507648468, + "learning_rate": 4.450127877237851e-05, + "loss": 3.3365, + "step": 6719 + }, + { + "epoch": 8.59392, + "grad_norm": 0.5495189428329468, + "learning_rate": 4.446089648674114e-05, + "loss": 3.2733, + "step": 6720 + }, + { + "epoch": 8.5952, + "grad_norm": 0.5371974110603333, + "learning_rate": 4.442051420110378e-05, + "loss": 3.2837, + "step": 6721 + }, + { + "epoch": 8.59648, + "grad_norm": 0.5370466113090515, + "learning_rate": 4.438013191546641e-05, + "loss": 3.3062, + "step": 6722 + }, + { + "epoch": 8.59776, + "grad_norm": 0.5342411398887634, + "learning_rate": 4.4339749629829047e-05, + "loss": 3.3036, + "step": 6723 + }, + { + "epoch": 8.59904, + "grad_norm": 0.5245928764343262, + "learning_rate": 4.4299367344191683e-05, + "loss": 3.2969, + "step": 6724 + }, + { + "epoch": 8.60032, + "grad_norm": 0.5341728329658508, + "learning_rate": 4.4258985058554314e-05, + "loss": 3.3438, + "step": 6725 + }, + { + "epoch": 8.6016, + "grad_norm": 0.5377516150474548, + "learning_rate": 4.421860277291695e-05, + "loss": 3.2733, + "step": 6726 + }, + { + "epoch": 8.60288, + "grad_norm": 0.5328512191772461, + "learning_rate": 4.4178220487279574e-05, + "loss": 3.3902, + "step": 6727 + }, + { + "epoch": 8.60416, + "grad_norm": 0.5404960513114929, + "learning_rate": 4.4137838201642204e-05, + "loss": 3.3356, + "step": 6728 + }, + { + "epoch": 8.60544, + "grad_norm": 0.5246291160583496, + "learning_rate": 4.409745591600484e-05, + "loss": 3.2186, + "step": 6729 + }, + { + "epoch": 8.60672, + "grad_norm": 0.5350708961486816, + "learning_rate": 4.405707363036747e-05, + "loss": 3.3441, + "step": 6730 + }, + { + "epoch": 8.608, + "grad_norm": 0.525844395160675, + "learning_rate": 4.401669134473011e-05, + "loss": 3.3156, + "step": 6731 + }, + { + "epoch": 8.60928, + "grad_norm": 0.5407573580741882, + "learning_rate": 4.397630905909274e-05, + "loss": 3.2902, + "step": 6732 + }, + { + "epoch": 8.61056, + "grad_norm": 0.5487748384475708, + "learning_rate": 4.3935926773455374e-05, + "loss": 3.2765, + "step": 6733 + }, + { + "epoch": 8.61184, + "grad_norm": 0.56423020362854, + "learning_rate": 4.3895544487818005e-05, + "loss": 3.2918, + "step": 6734 + }, + { + "epoch": 8.61312, + "grad_norm": 0.5294039249420166, + "learning_rate": 4.385516220218064e-05, + "loss": 3.3598, + "step": 6735 + }, + { + "epoch": 8.6144, + "grad_norm": 0.5347636342048645, + "learning_rate": 4.381477991654327e-05, + "loss": 3.2215, + "step": 6736 + }, + { + "epoch": 8.61568, + "grad_norm": 0.5291707515716553, + "learning_rate": 4.377439763090591e-05, + "loss": 3.3315, + "step": 6737 + }, + { + "epoch": 8.61696, + "grad_norm": 0.5520625710487366, + "learning_rate": 4.373401534526854e-05, + "loss": 3.3297, + "step": 6738 + }, + { + "epoch": 8.61824, + "grad_norm": 0.5564367771148682, + "learning_rate": 4.3693633059631175e-05, + "loss": 3.3697, + "step": 6739 + }, + { + "epoch": 8.61952, + "grad_norm": 0.5352851748466492, + "learning_rate": 4.36532507739938e-05, + "loss": 3.322, + "step": 6740 + }, + { + "epoch": 8.6208, + "grad_norm": 0.5492033362388611, + "learning_rate": 4.3612868488356435e-05, + "loss": 3.3269, + "step": 6741 + }, + { + "epoch": 8.62208, + "grad_norm": 0.5320399403572083, + "learning_rate": 4.3572486202719065e-05, + "loss": 3.3004, + "step": 6742 + }, + { + "epoch": 8.62336, + "grad_norm": 0.5452033877372742, + "learning_rate": 4.35321039170817e-05, + "loss": 3.3551, + "step": 6743 + }, + { + "epoch": 8.62464, + "grad_norm": 0.5418961644172668, + "learning_rate": 4.349172163144433e-05, + "loss": 3.3384, + "step": 6744 + }, + { + "epoch": 8.62592, + "grad_norm": 0.5382152199745178, + "learning_rate": 4.345133934580697e-05, + "loss": 3.3212, + "step": 6745 + }, + { + "epoch": 8.6272, + "grad_norm": 0.5336072444915771, + "learning_rate": 4.34109570601696e-05, + "loss": 3.2777, + "step": 6746 + }, + { + "epoch": 8.62848, + "grad_norm": 0.5362292528152466, + "learning_rate": 4.3370574774532236e-05, + "loss": 3.3193, + "step": 6747 + }, + { + "epoch": 8.62976, + "grad_norm": 0.5334807634353638, + "learning_rate": 4.3330192488894866e-05, + "loss": 3.2778, + "step": 6748 + }, + { + "epoch": 8.63104, + "grad_norm": 0.5414783954620361, + "learning_rate": 4.32898102032575e-05, + "loss": 3.3341, + "step": 6749 + }, + { + "epoch": 8.63232, + "grad_norm": 0.5455512404441833, + "learning_rate": 4.324942791762013e-05, + "loss": 3.2904, + "step": 6750 + }, + { + "epoch": 8.6336, + "grad_norm": 0.5280025601387024, + "learning_rate": 4.320904563198277e-05, + "loss": 3.2243, + "step": 6751 + }, + { + "epoch": 8.63488, + "grad_norm": 0.5390767455101013, + "learning_rate": 4.31686633463454e-05, + "loss": 3.3436, + "step": 6752 + }, + { + "epoch": 8.63616, + "grad_norm": 0.5390846133232117, + "learning_rate": 4.312828106070803e-05, + "loss": 3.261, + "step": 6753 + }, + { + "epoch": 8.63744, + "grad_norm": 0.5264686942100525, + "learning_rate": 4.308789877507066e-05, + "loss": 3.3319, + "step": 6754 + }, + { + "epoch": 8.63872, + "grad_norm": 0.5262269377708435, + "learning_rate": 4.30475164894333e-05, + "loss": 3.2829, + "step": 6755 + }, + { + "epoch": 8.64, + "grad_norm": 0.5335661768913269, + "learning_rate": 4.300713420379593e-05, + "loss": 3.2751, + "step": 6756 + }, + { + "epoch": 8.64128, + "grad_norm": 0.5311907529830933, + "learning_rate": 4.2966751918158564e-05, + "loss": 3.3261, + "step": 6757 + }, + { + "epoch": 8.64256, + "grad_norm": 0.5352111458778381, + "learning_rate": 4.2926369632521194e-05, + "loss": 3.2754, + "step": 6758 + }, + { + "epoch": 8.64384, + "grad_norm": 0.5307966470718384, + "learning_rate": 4.288598734688383e-05, + "loss": 3.3116, + "step": 6759 + }, + { + "epoch": 8.64512, + "grad_norm": 0.536453127861023, + "learning_rate": 4.284560506124646e-05, + "loss": 3.3126, + "step": 6760 + }, + { + "epoch": 8.6464, + "grad_norm": 0.5201568007469177, + "learning_rate": 4.28052227756091e-05, + "loss": 3.2846, + "step": 6761 + }, + { + "epoch": 8.64768, + "grad_norm": 0.5415171980857849, + "learning_rate": 4.276484048997173e-05, + "loss": 3.3514, + "step": 6762 + }, + { + "epoch": 8.64896, + "grad_norm": 0.5305605530738831, + "learning_rate": 4.2724458204334365e-05, + "loss": 3.2887, + "step": 6763 + }, + { + "epoch": 8.65024, + "grad_norm": 0.5250943303108215, + "learning_rate": 4.2684075918696995e-05, + "loss": 3.3334, + "step": 6764 + }, + { + "epoch": 8.65152, + "grad_norm": 0.5324791073799133, + "learning_rate": 4.264369363305963e-05, + "loss": 3.246, + "step": 6765 + }, + { + "epoch": 8.6528, + "grad_norm": 0.5431379675865173, + "learning_rate": 4.260331134742226e-05, + "loss": 3.2801, + "step": 6766 + }, + { + "epoch": 8.65408, + "grad_norm": 0.5285992622375488, + "learning_rate": 4.256292906178489e-05, + "loss": 3.2725, + "step": 6767 + }, + { + "epoch": 8.65536, + "grad_norm": 0.5373889803886414, + "learning_rate": 4.252254677614752e-05, + "loss": 3.3643, + "step": 6768 + }, + { + "epoch": 8.65664, + "grad_norm": 0.5431947708129883, + "learning_rate": 4.248216449051016e-05, + "loss": 3.3618, + "step": 6769 + }, + { + "epoch": 8.65792, + "grad_norm": 0.5260449647903442, + "learning_rate": 4.244178220487279e-05, + "loss": 3.265, + "step": 6770 + }, + { + "epoch": 8.6592, + "grad_norm": 0.5484868288040161, + "learning_rate": 4.2401399919235426e-05, + "loss": 3.3274, + "step": 6771 + }, + { + "epoch": 8.66048, + "grad_norm": 0.5380119681358337, + "learning_rate": 4.2361017633598056e-05, + "loss": 3.2953, + "step": 6772 + }, + { + "epoch": 8.66176, + "grad_norm": 0.539348840713501, + "learning_rate": 4.232063534796069e-05, + "loss": 3.2966, + "step": 6773 + }, + { + "epoch": 8.66304, + "grad_norm": 0.528852641582489, + "learning_rate": 4.228025306232332e-05, + "loss": 3.3094, + "step": 6774 + }, + { + "epoch": 8.66432, + "grad_norm": 0.5262014269828796, + "learning_rate": 4.223987077668596e-05, + "loss": 3.237, + "step": 6775 + }, + { + "epoch": 8.6656, + "grad_norm": 0.5200825333595276, + "learning_rate": 4.219948849104859e-05, + "loss": 3.2816, + "step": 6776 + }, + { + "epoch": 8.66688, + "grad_norm": 0.5474440455436707, + "learning_rate": 4.2159106205411227e-05, + "loss": 3.3406, + "step": 6777 + }, + { + "epoch": 8.66816, + "grad_norm": 0.5352291464805603, + "learning_rate": 4.2118723919773857e-05, + "loss": 3.279, + "step": 6778 + }, + { + "epoch": 8.66944, + "grad_norm": 0.5264368653297424, + "learning_rate": 4.2078341634136493e-05, + "loss": 3.2559, + "step": 6779 + }, + { + "epoch": 8.67072, + "grad_norm": 0.5307362675666809, + "learning_rate": 4.203795934849912e-05, + "loss": 3.2877, + "step": 6780 + }, + { + "epoch": 8.672, + "grad_norm": 0.5339285135269165, + "learning_rate": 4.1997577062861754e-05, + "loss": 3.3039, + "step": 6781 + }, + { + "epoch": 8.67328, + "grad_norm": 0.5298980474472046, + "learning_rate": 4.1957194777224384e-05, + "loss": 3.2824, + "step": 6782 + }, + { + "epoch": 8.67456, + "grad_norm": 0.5294161438941956, + "learning_rate": 4.191681249158702e-05, + "loss": 3.3326, + "step": 6783 + }, + { + "epoch": 8.67584, + "grad_norm": 0.5197745561599731, + "learning_rate": 4.187643020594965e-05, + "loss": 3.2917, + "step": 6784 + }, + { + "epoch": 8.67712, + "grad_norm": 0.5530399680137634, + "learning_rate": 4.183604792031229e-05, + "loss": 3.2526, + "step": 6785 + }, + { + "epoch": 8.6784, + "grad_norm": 0.534653902053833, + "learning_rate": 4.179566563467492e-05, + "loss": 3.3122, + "step": 6786 + }, + { + "epoch": 8.67968, + "grad_norm": 0.5447145700454712, + "learning_rate": 4.1755283349037554e-05, + "loss": 3.3016, + "step": 6787 + }, + { + "epoch": 8.68096, + "grad_norm": 0.5226910710334778, + "learning_rate": 4.1714901063400184e-05, + "loss": 3.2551, + "step": 6788 + }, + { + "epoch": 8.68224, + "grad_norm": 0.5207378268241882, + "learning_rate": 4.167451877776282e-05, + "loss": 3.247, + "step": 6789 + }, + { + "epoch": 8.68352, + "grad_norm": 0.5464200973510742, + "learning_rate": 4.163413649212545e-05, + "loss": 3.2968, + "step": 6790 + }, + { + "epoch": 8.6848, + "grad_norm": 0.5588473081588745, + "learning_rate": 4.159375420648809e-05, + "loss": 3.301, + "step": 6791 + }, + { + "epoch": 8.68608, + "grad_norm": 0.5364888906478882, + "learning_rate": 4.155337192085072e-05, + "loss": 3.2832, + "step": 6792 + }, + { + "epoch": 8.68736, + "grad_norm": 0.5507677793502808, + "learning_rate": 4.1512989635213355e-05, + "loss": 3.3312, + "step": 6793 + }, + { + "epoch": 8.68864, + "grad_norm": 0.5295298099517822, + "learning_rate": 4.147260734957598e-05, + "loss": 3.3066, + "step": 6794 + }, + { + "epoch": 8.68992, + "grad_norm": 0.5314168334007263, + "learning_rate": 4.1432225063938615e-05, + "loss": 3.2516, + "step": 6795 + }, + { + "epoch": 8.6912, + "grad_norm": 0.5450182557106018, + "learning_rate": 4.1391842778301245e-05, + "loss": 3.309, + "step": 6796 + }, + { + "epoch": 8.69248, + "grad_norm": 0.5385010242462158, + "learning_rate": 4.135146049266388e-05, + "loss": 3.1977, + "step": 6797 + }, + { + "epoch": 8.69376, + "grad_norm": 0.5317445397377014, + "learning_rate": 4.131107820702651e-05, + "loss": 3.2691, + "step": 6798 + }, + { + "epoch": 8.69504, + "grad_norm": 0.536864697933197, + "learning_rate": 4.127069592138915e-05, + "loss": 3.2888, + "step": 6799 + }, + { + "epoch": 8.69632, + "grad_norm": 0.5189259648323059, + "learning_rate": 4.123031363575178e-05, + "loss": 3.2824, + "step": 6800 + }, + { + "epoch": 8.6976, + "grad_norm": 0.5167062282562256, + "learning_rate": 4.1189931350114416e-05, + "loss": 3.2385, + "step": 6801 + }, + { + "epoch": 8.698879999999999, + "grad_norm": 0.5420131683349609, + "learning_rate": 4.1149549064477046e-05, + "loss": 3.4164, + "step": 6802 + }, + { + "epoch": 8.70016, + "grad_norm": 0.533211886882782, + "learning_rate": 4.110916677883968e-05, + "loss": 3.2774, + "step": 6803 + }, + { + "epoch": 8.70144, + "grad_norm": 0.5351250767707825, + "learning_rate": 4.106878449320231e-05, + "loss": 3.3219, + "step": 6804 + }, + { + "epoch": 8.70272, + "grad_norm": 0.5335774421691895, + "learning_rate": 4.102840220756495e-05, + "loss": 3.272, + "step": 6805 + }, + { + "epoch": 8.704, + "grad_norm": 0.5470417737960815, + "learning_rate": 4.098801992192758e-05, + "loss": 3.265, + "step": 6806 + }, + { + "epoch": 8.70528, + "grad_norm": 0.5419219732284546, + "learning_rate": 4.09476376362902e-05, + "loss": 3.3382, + "step": 6807 + }, + { + "epoch": 8.70656, + "grad_norm": 0.5278633832931519, + "learning_rate": 4.090725535065284e-05, + "loss": 3.341, + "step": 6808 + }, + { + "epoch": 8.707840000000001, + "grad_norm": 0.5203990936279297, + "learning_rate": 4.086687306501547e-05, + "loss": 3.3018, + "step": 6809 + }, + { + "epoch": 8.70912, + "grad_norm": 0.530105710029602, + "learning_rate": 4.082649077937811e-05, + "loss": 3.3272, + "step": 6810 + }, + { + "epoch": 8.7104, + "grad_norm": 0.5211985111236572, + "learning_rate": 4.0786108493740744e-05, + "loss": 3.2695, + "step": 6811 + }, + { + "epoch": 8.71168, + "grad_norm": 0.5373910665512085, + "learning_rate": 4.0745726208103374e-05, + "loss": 3.3031, + "step": 6812 + }, + { + "epoch": 8.71296, + "grad_norm": 0.5470989942550659, + "learning_rate": 4.070534392246601e-05, + "loss": 3.3386, + "step": 6813 + }, + { + "epoch": 8.71424, + "grad_norm": 0.5321961045265198, + "learning_rate": 4.066496163682864e-05, + "loss": 3.2882, + "step": 6814 + }, + { + "epoch": 8.71552, + "grad_norm": 0.5434589385986328, + "learning_rate": 4.062457935119128e-05, + "loss": 3.3023, + "step": 6815 + }, + { + "epoch": 8.7168, + "grad_norm": 0.5319420099258423, + "learning_rate": 4.058419706555391e-05, + "loss": 3.3207, + "step": 6816 + }, + { + "epoch": 8.71808, + "grad_norm": 0.5350143313407898, + "learning_rate": 4.0543814779916545e-05, + "loss": 3.3401, + "step": 6817 + }, + { + "epoch": 8.71936, + "grad_norm": 0.5368631482124329, + "learning_rate": 4.0503432494279175e-05, + "loss": 3.2471, + "step": 6818 + }, + { + "epoch": 8.72064, + "grad_norm": 0.541717529296875, + "learning_rate": 4.046305020864181e-05, + "loss": 3.3548, + "step": 6819 + }, + { + "epoch": 8.72192, + "grad_norm": 0.5320764183998108, + "learning_rate": 4.042266792300444e-05, + "loss": 3.2743, + "step": 6820 + }, + { + "epoch": 8.7232, + "grad_norm": 0.536125659942627, + "learning_rate": 4.0382285637367065e-05, + "loss": 3.3071, + "step": 6821 + }, + { + "epoch": 8.72448, + "grad_norm": 0.5400855541229248, + "learning_rate": 4.03419033517297e-05, + "loss": 3.3063, + "step": 6822 + }, + { + "epoch": 8.72576, + "grad_norm": 0.5443190336227417, + "learning_rate": 4.030152106609233e-05, + "loss": 3.3044, + "step": 6823 + }, + { + "epoch": 8.72704, + "grad_norm": 0.5323216319084167, + "learning_rate": 4.026113878045497e-05, + "loss": 3.2328, + "step": 6824 + }, + { + "epoch": 8.72832, + "grad_norm": 0.5174821615219116, + "learning_rate": 4.02207564948176e-05, + "loss": 3.3146, + "step": 6825 + }, + { + "epoch": 8.7296, + "grad_norm": 0.5285962224006653, + "learning_rate": 4.0180374209180236e-05, + "loss": 3.3673, + "step": 6826 + }, + { + "epoch": 8.730879999999999, + "grad_norm": 0.525948703289032, + "learning_rate": 4.0139991923542866e-05, + "loss": 3.3524, + "step": 6827 + }, + { + "epoch": 8.73216, + "grad_norm": 0.526118814945221, + "learning_rate": 4.00996096379055e-05, + "loss": 3.3361, + "step": 6828 + }, + { + "epoch": 8.73344, + "grad_norm": 0.5256155729293823, + "learning_rate": 4.005922735226814e-05, + "loss": 3.35, + "step": 6829 + }, + { + "epoch": 8.73472, + "grad_norm": 0.5380839705467224, + "learning_rate": 4.001884506663077e-05, + "loss": 3.3115, + "step": 6830 + }, + { + "epoch": 8.736, + "grad_norm": 0.5296849012374878, + "learning_rate": 3.9978462780993406e-05, + "loss": 3.2831, + "step": 6831 + }, + { + "epoch": 8.73728, + "grad_norm": 0.5297383069992065, + "learning_rate": 3.9938080495356037e-05, + "loss": 3.2457, + "step": 6832 + }, + { + "epoch": 8.73856, + "grad_norm": 0.528474748134613, + "learning_rate": 3.989769820971867e-05, + "loss": 3.2796, + "step": 6833 + }, + { + "epoch": 8.739840000000001, + "grad_norm": 0.5146127939224243, + "learning_rate": 3.98573159240813e-05, + "loss": 3.2998, + "step": 6834 + }, + { + "epoch": 8.74112, + "grad_norm": 0.5278341174125671, + "learning_rate": 3.981693363844393e-05, + "loss": 3.2445, + "step": 6835 + }, + { + "epoch": 8.7424, + "grad_norm": 0.5160564184188843, + "learning_rate": 3.9776551352806564e-05, + "loss": 3.307, + "step": 6836 + }, + { + "epoch": 8.74368, + "grad_norm": 0.5266233682632446, + "learning_rate": 3.9736169067169194e-05, + "loss": 3.2835, + "step": 6837 + }, + { + "epoch": 8.74496, + "grad_norm": 0.5305745601654053, + "learning_rate": 3.969578678153183e-05, + "loss": 3.3538, + "step": 6838 + }, + { + "epoch": 8.74624, + "grad_norm": 0.5437188744544983, + "learning_rate": 3.965540449589446e-05, + "loss": 3.3189, + "step": 6839 + }, + { + "epoch": 8.74752, + "grad_norm": 0.5442191958427429, + "learning_rate": 3.96150222102571e-05, + "loss": 3.3207, + "step": 6840 + }, + { + "epoch": 8.7488, + "grad_norm": 0.5521953701972961, + "learning_rate": 3.957463992461973e-05, + "loss": 3.3707, + "step": 6841 + }, + { + "epoch": 8.75008, + "grad_norm": 0.5367425084114075, + "learning_rate": 3.9534257638982364e-05, + "loss": 3.3494, + "step": 6842 + }, + { + "epoch": 8.75136, + "grad_norm": 0.5323737263679504, + "learning_rate": 3.9493875353344994e-05, + "loss": 3.3304, + "step": 6843 + }, + { + "epoch": 8.75264, + "grad_norm": 0.5559576749801636, + "learning_rate": 3.945349306770763e-05, + "loss": 3.2795, + "step": 6844 + }, + { + "epoch": 8.75392, + "grad_norm": 0.5402018427848816, + "learning_rate": 3.941311078207026e-05, + "loss": 3.2885, + "step": 6845 + }, + { + "epoch": 8.7552, + "grad_norm": 0.5311586856842041, + "learning_rate": 3.93727284964329e-05, + "loss": 3.2718, + "step": 6846 + }, + { + "epoch": 8.75648, + "grad_norm": 0.5346354246139526, + "learning_rate": 3.9332346210795535e-05, + "loss": 3.2709, + "step": 6847 + }, + { + "epoch": 8.75776, + "grad_norm": 0.5428527593612671, + "learning_rate": 3.929196392515816e-05, + "loss": 3.296, + "step": 6848 + }, + { + "epoch": 8.75904, + "grad_norm": 0.5390904545783997, + "learning_rate": 3.925158163952079e-05, + "loss": 3.2799, + "step": 6849 + }, + { + "epoch": 8.76032, + "grad_norm": 0.5223475694656372, + "learning_rate": 3.9211199353883425e-05, + "loss": 3.3201, + "step": 6850 + }, + { + "epoch": 8.7616, + "grad_norm": 0.5381412506103516, + "learning_rate": 3.9170817068246055e-05, + "loss": 3.3655, + "step": 6851 + }, + { + "epoch": 8.76288, + "grad_norm": 0.5343440175056458, + "learning_rate": 3.913043478260869e-05, + "loss": 3.264, + "step": 6852 + }, + { + "epoch": 8.76416, + "grad_norm": 0.5274061560630798, + "learning_rate": 3.909005249697132e-05, + "loss": 3.35, + "step": 6853 + }, + { + "epoch": 8.76544, + "grad_norm": 0.5369771718978882, + "learning_rate": 3.904967021133396e-05, + "loss": 3.3448, + "step": 6854 + }, + { + "epoch": 8.76672, + "grad_norm": 0.5344805717468262, + "learning_rate": 3.900928792569659e-05, + "loss": 3.3441, + "step": 6855 + }, + { + "epoch": 8.768, + "grad_norm": 0.5286930203437805, + "learning_rate": 3.8968905640059226e-05, + "loss": 3.3262, + "step": 6856 + }, + { + "epoch": 8.76928, + "grad_norm": 0.5351747870445251, + "learning_rate": 3.8928523354421856e-05, + "loss": 3.3277, + "step": 6857 + }, + { + "epoch": 8.77056, + "grad_norm": 0.5334113836288452, + "learning_rate": 3.888814106878449e-05, + "loss": 3.3462, + "step": 6858 + }, + { + "epoch": 8.77184, + "grad_norm": 0.5328750014305115, + "learning_rate": 3.884775878314712e-05, + "loss": 3.257, + "step": 6859 + }, + { + "epoch": 8.77312, + "grad_norm": 0.5237380266189575, + "learning_rate": 3.880737649750976e-05, + "loss": 3.2508, + "step": 6860 + }, + { + "epoch": 8.7744, + "grad_norm": 0.542219877243042, + "learning_rate": 3.876699421187238e-05, + "loss": 3.3518, + "step": 6861 + }, + { + "epoch": 8.77568, + "grad_norm": 0.5408708453178406, + "learning_rate": 3.872661192623502e-05, + "loss": 3.4064, + "step": 6862 + }, + { + "epoch": 8.77696, + "grad_norm": 0.543810248374939, + "learning_rate": 3.868622964059765e-05, + "loss": 3.3481, + "step": 6863 + }, + { + "epoch": 8.77824, + "grad_norm": 0.5442402958869934, + "learning_rate": 3.864584735496029e-05, + "loss": 3.3938, + "step": 6864 + }, + { + "epoch": 8.77952, + "grad_norm": 0.5541428923606873, + "learning_rate": 3.860546506932292e-05, + "loss": 3.3481, + "step": 6865 + }, + { + "epoch": 8.7808, + "grad_norm": 0.5401941537857056, + "learning_rate": 3.8565082783685554e-05, + "loss": 3.3197, + "step": 6866 + }, + { + "epoch": 8.78208, + "grad_norm": 0.5313037633895874, + "learning_rate": 3.8524700498048184e-05, + "loss": 3.2591, + "step": 6867 + }, + { + "epoch": 8.78336, + "grad_norm": 0.5377905368804932, + "learning_rate": 3.848431821241082e-05, + "loss": 3.3435, + "step": 6868 + }, + { + "epoch": 8.78464, + "grad_norm": 0.5325095057487488, + "learning_rate": 3.844393592677345e-05, + "loss": 3.3114, + "step": 6869 + }, + { + "epoch": 8.78592, + "grad_norm": 0.540044367313385, + "learning_rate": 3.840355364113609e-05, + "loss": 3.3397, + "step": 6870 + }, + { + "epoch": 8.7872, + "grad_norm": 0.531450092792511, + "learning_rate": 3.836317135549872e-05, + "loss": 3.2683, + "step": 6871 + }, + { + "epoch": 8.78848, + "grad_norm": 0.5396316647529602, + "learning_rate": 3.8322789069861355e-05, + "loss": 3.2794, + "step": 6872 + }, + { + "epoch": 8.78976, + "grad_norm": 0.5314230918884277, + "learning_rate": 3.8282406784223985e-05, + "loss": 3.305, + "step": 6873 + }, + { + "epoch": 8.79104, + "grad_norm": 0.5359961986541748, + "learning_rate": 3.8242024498586615e-05, + "loss": 3.3323, + "step": 6874 + }, + { + "epoch": 8.79232, + "grad_norm": 0.5214905738830566, + "learning_rate": 3.8201642212949245e-05, + "loss": 3.2747, + "step": 6875 + }, + { + "epoch": 8.7936, + "grad_norm": 0.5209900140762329, + "learning_rate": 3.816125992731188e-05, + "loss": 3.2886, + "step": 6876 + }, + { + "epoch": 8.79488, + "grad_norm": 0.5249080061912537, + "learning_rate": 3.812087764167451e-05, + "loss": 3.3115, + "step": 6877 + }, + { + "epoch": 8.79616, + "grad_norm": 0.561204195022583, + "learning_rate": 3.808049535603715e-05, + "loss": 3.3694, + "step": 6878 + }, + { + "epoch": 8.79744, + "grad_norm": 0.5340297222137451, + "learning_rate": 3.804011307039978e-05, + "loss": 3.2916, + "step": 6879 + }, + { + "epoch": 8.79872, + "grad_norm": 0.5248726010322571, + "learning_rate": 3.7999730784762416e-05, + "loss": 3.3112, + "step": 6880 + }, + { + "epoch": 8.8, + "grad_norm": 0.5359779596328735, + "learning_rate": 3.7959348499125046e-05, + "loss": 3.3402, + "step": 6881 + }, + { + "epoch": 8.80128, + "grad_norm": 0.5329644083976746, + "learning_rate": 3.791896621348768e-05, + "loss": 3.3107, + "step": 6882 + }, + { + "epoch": 8.80256, + "grad_norm": 0.5371171236038208, + "learning_rate": 3.787858392785031e-05, + "loss": 3.3409, + "step": 6883 + }, + { + "epoch": 8.80384, + "grad_norm": 0.5442470908164978, + "learning_rate": 3.783820164221295e-05, + "loss": 3.3277, + "step": 6884 + }, + { + "epoch": 8.80512, + "grad_norm": 0.5545779466629028, + "learning_rate": 3.779781935657558e-05, + "loss": 3.3126, + "step": 6885 + }, + { + "epoch": 8.8064, + "grad_norm": 0.5447943210601807, + "learning_rate": 3.7757437070938216e-05, + "loss": 3.2873, + "step": 6886 + }, + { + "epoch": 8.80768, + "grad_norm": 0.5211206674575806, + "learning_rate": 3.7717054785300847e-05, + "loss": 3.3024, + "step": 6887 + }, + { + "epoch": 8.80896, + "grad_norm": 0.5305944681167603, + "learning_rate": 3.7676672499663477e-05, + "loss": 3.3325, + "step": 6888 + }, + { + "epoch": 8.81024, + "grad_norm": 0.5540784001350403, + "learning_rate": 3.763629021402611e-05, + "loss": 3.3269, + "step": 6889 + }, + { + "epoch": 8.81152, + "grad_norm": 0.5730512738227844, + "learning_rate": 3.7595907928388744e-05, + "loss": 3.3965, + "step": 6890 + }, + { + "epoch": 8.8128, + "grad_norm": 0.5502961874008179, + "learning_rate": 3.7555525642751374e-05, + "loss": 3.2786, + "step": 6891 + }, + { + "epoch": 8.81408, + "grad_norm": 0.5448852777481079, + "learning_rate": 3.751514335711401e-05, + "loss": 3.3031, + "step": 6892 + }, + { + "epoch": 8.81536, + "grad_norm": 0.5359817743301392, + "learning_rate": 3.747476107147664e-05, + "loss": 3.318, + "step": 6893 + }, + { + "epoch": 8.81664, + "grad_norm": 0.5279159545898438, + "learning_rate": 3.743437878583928e-05, + "loss": 3.3592, + "step": 6894 + }, + { + "epoch": 8.81792, + "grad_norm": 0.5456517338752747, + "learning_rate": 3.739399650020191e-05, + "loss": 3.3106, + "step": 6895 + }, + { + "epoch": 8.8192, + "grad_norm": 0.5353114604949951, + "learning_rate": 3.7353614214564544e-05, + "loss": 3.3135, + "step": 6896 + }, + { + "epoch": 8.82048, + "grad_norm": 0.5308430194854736, + "learning_rate": 3.7313231928927174e-05, + "loss": 3.2659, + "step": 6897 + }, + { + "epoch": 8.82176, + "grad_norm": 0.5371167659759521, + "learning_rate": 3.7272849643289804e-05, + "loss": 3.244, + "step": 6898 + }, + { + "epoch": 8.82304, + "grad_norm": 0.5514832735061646, + "learning_rate": 3.723246735765244e-05, + "loss": 3.3401, + "step": 6899 + }, + { + "epoch": 8.82432, + "grad_norm": 0.536336362361908, + "learning_rate": 3.719208507201507e-05, + "loss": 3.3118, + "step": 6900 + }, + { + "epoch": 8.8256, + "grad_norm": 0.5184128284454346, + "learning_rate": 3.715170278637771e-05, + "loss": 3.1998, + "step": 6901 + }, + { + "epoch": 8.82688, + "grad_norm": 0.5253881812095642, + "learning_rate": 3.711132050074034e-05, + "loss": 3.1437, + "step": 6902 + }, + { + "epoch": 8.82816, + "grad_norm": 0.5361356735229492, + "learning_rate": 3.7070938215102975e-05, + "loss": 3.1978, + "step": 6903 + }, + { + "epoch": 8.82944, + "grad_norm": 0.533018946647644, + "learning_rate": 3.7030555929465605e-05, + "loss": 3.2561, + "step": 6904 + }, + { + "epoch": 8.83072, + "grad_norm": 0.5180771350860596, + "learning_rate": 3.6990173643828235e-05, + "loss": 3.2524, + "step": 6905 + }, + { + "epoch": 8.832, + "grad_norm": 0.5097092986106873, + "learning_rate": 3.694979135819087e-05, + "loss": 3.1094, + "step": 6906 + }, + { + "epoch": 8.83328, + "grad_norm": 0.5372468829154968, + "learning_rate": 3.69094090725535e-05, + "loss": 3.1413, + "step": 6907 + }, + { + "epoch": 8.83456, + "grad_norm": 0.5473780632019043, + "learning_rate": 3.686902678691614e-05, + "loss": 3.2555, + "step": 6908 + }, + { + "epoch": 8.83584, + "grad_norm": 0.537969172000885, + "learning_rate": 3.682864450127877e-05, + "loss": 3.0862, + "step": 6909 + }, + { + "epoch": 8.83712, + "grad_norm": 0.5439502596855164, + "learning_rate": 3.6788262215641406e-05, + "loss": 3.1283, + "step": 6910 + }, + { + "epoch": 8.8384, + "grad_norm": 0.5461182594299316, + "learning_rate": 3.6747879930004036e-05, + "loss": 3.2333, + "step": 6911 + }, + { + "epoch": 8.83968, + "grad_norm": 0.5343428254127502, + "learning_rate": 3.6707497644366666e-05, + "loss": 3.1861, + "step": 6912 + }, + { + "epoch": 8.84096, + "grad_norm": 0.5404158234596252, + "learning_rate": 3.66671153587293e-05, + "loss": 3.2274, + "step": 6913 + }, + { + "epoch": 8.84224, + "grad_norm": 0.5451922416687012, + "learning_rate": 3.662673307309193e-05, + "loss": 3.1915, + "step": 6914 + }, + { + "epoch": 8.84352, + "grad_norm": 0.5269055366516113, + "learning_rate": 3.658635078745457e-05, + "loss": 3.2036, + "step": 6915 + }, + { + "epoch": 8.8448, + "grad_norm": 0.5376463532447815, + "learning_rate": 3.65459685018172e-05, + "loss": 3.0966, + "step": 6916 + }, + { + "epoch": 8.84608, + "grad_norm": 0.5396373271942139, + "learning_rate": 3.650558621617984e-05, + "loss": 3.2148, + "step": 6917 + }, + { + "epoch": 8.84736, + "grad_norm": 0.5474383234977722, + "learning_rate": 3.646520393054247e-05, + "loss": 3.0508, + "step": 6918 + }, + { + "epoch": 8.84864, + "grad_norm": 0.5413127541542053, + "learning_rate": 3.64248216449051e-05, + "loss": 3.2371, + "step": 6919 + }, + { + "epoch": 8.849920000000001, + "grad_norm": 0.5485610365867615, + "learning_rate": 3.6384439359267734e-05, + "loss": 3.183, + "step": 6920 + }, + { + "epoch": 8.8512, + "grad_norm": 0.551889955997467, + "learning_rate": 3.6344057073630364e-05, + "loss": 3.2028, + "step": 6921 + }, + { + "epoch": 8.85248, + "grad_norm": 0.5297598242759705, + "learning_rate": 3.6303674787993e-05, + "loss": 3.1678, + "step": 6922 + }, + { + "epoch": 8.85376, + "grad_norm": 0.5409109592437744, + "learning_rate": 3.626329250235563e-05, + "loss": 3.1554, + "step": 6923 + }, + { + "epoch": 8.85504, + "grad_norm": 0.5404353737831116, + "learning_rate": 3.622291021671827e-05, + "loss": 3.2286, + "step": 6924 + }, + { + "epoch": 8.85632, + "grad_norm": 0.5331123471260071, + "learning_rate": 3.618252793108089e-05, + "loss": 3.1951, + "step": 6925 + }, + { + "epoch": 8.8576, + "grad_norm": 0.53252112865448, + "learning_rate": 3.614214564544353e-05, + "loss": 3.1818, + "step": 6926 + }, + { + "epoch": 8.85888, + "grad_norm": 0.5230981707572937, + "learning_rate": 3.6101763359806165e-05, + "loss": 3.1821, + "step": 6927 + }, + { + "epoch": 8.86016, + "grad_norm": 0.5399007797241211, + "learning_rate": 3.6061381074168795e-05, + "loss": 3.1559, + "step": 6928 + }, + { + "epoch": 8.86144, + "grad_norm": 0.5414989590644836, + "learning_rate": 3.602099878853143e-05, + "loss": 3.1675, + "step": 6929 + }, + { + "epoch": 8.86272, + "grad_norm": 0.5386769771575928, + "learning_rate": 3.598061650289406e-05, + "loss": 3.1707, + "step": 6930 + }, + { + "epoch": 8.864, + "grad_norm": 0.5298625230789185, + "learning_rate": 3.594023421725669e-05, + "loss": 3.2093, + "step": 6931 + }, + { + "epoch": 8.86528, + "grad_norm": 0.5370173454284668, + "learning_rate": 3.589985193161932e-05, + "loss": 3.1237, + "step": 6932 + }, + { + "epoch": 8.86656, + "grad_norm": 0.5402517914772034, + "learning_rate": 3.585946964598196e-05, + "loss": 3.2736, + "step": 6933 + }, + { + "epoch": 8.86784, + "grad_norm": 0.5565125346183777, + "learning_rate": 3.581908736034459e-05, + "loss": 3.2115, + "step": 6934 + }, + { + "epoch": 8.86912, + "grad_norm": 0.5417731404304504, + "learning_rate": 3.5778705074707226e-05, + "loss": 3.2296, + "step": 6935 + }, + { + "epoch": 8.8704, + "grad_norm": 0.5352853536605835, + "learning_rate": 3.573832278906986e-05, + "loss": 3.1484, + "step": 6936 + }, + { + "epoch": 8.87168, + "grad_norm": 0.5440704226493835, + "learning_rate": 3.569794050343249e-05, + "loss": 3.1939, + "step": 6937 + }, + { + "epoch": 8.872959999999999, + "grad_norm": 0.553403913974762, + "learning_rate": 3.565755821779512e-05, + "loss": 3.1381, + "step": 6938 + }, + { + "epoch": 8.87424, + "grad_norm": 0.5392211079597473, + "learning_rate": 3.561717593215775e-05, + "loss": 3.2004, + "step": 6939 + }, + { + "epoch": 8.87552, + "grad_norm": 0.5346492528915405, + "learning_rate": 3.557679364652039e-05, + "loss": 3.2119, + "step": 6940 + }, + { + "epoch": 8.8768, + "grad_norm": 0.5390061140060425, + "learning_rate": 3.553641136088302e-05, + "loss": 3.2046, + "step": 6941 + }, + { + "epoch": 8.87808, + "grad_norm": 0.5476921796798706, + "learning_rate": 3.5496029075245657e-05, + "loss": 3.2266, + "step": 6942 + }, + { + "epoch": 8.87936, + "grad_norm": 0.5377036333084106, + "learning_rate": 3.5455646789608287e-05, + "loss": 3.2189, + "step": 6943 + }, + { + "epoch": 8.88064, + "grad_norm": 0.5338594317436218, + "learning_rate": 3.5415264503970923e-05, + "loss": 3.2326, + "step": 6944 + }, + { + "epoch": 8.881920000000001, + "grad_norm": 0.5427373647689819, + "learning_rate": 3.5374882218333554e-05, + "loss": 3.2784, + "step": 6945 + }, + { + "epoch": 8.8832, + "grad_norm": 0.5387793779373169, + "learning_rate": 3.5334499932696184e-05, + "loss": 3.1931, + "step": 6946 + }, + { + "epoch": 8.88448, + "grad_norm": 0.5435247421264648, + "learning_rate": 3.529411764705882e-05, + "loss": 3.1342, + "step": 6947 + }, + { + "epoch": 8.88576, + "grad_norm": 0.5299807786941528, + "learning_rate": 3.525373536142145e-05, + "loss": 3.1364, + "step": 6948 + }, + { + "epoch": 8.88704, + "grad_norm": 0.5515544414520264, + "learning_rate": 3.521335307578409e-05, + "loss": 3.2109, + "step": 6949 + }, + { + "epoch": 8.88832, + "grad_norm": 0.5315101146697998, + "learning_rate": 3.517297079014672e-05, + "loss": 3.0987, + "step": 6950 + }, + { + "epoch": 8.8896, + "grad_norm": 0.5465390682220459, + "learning_rate": 3.5132588504509354e-05, + "loss": 3.1256, + "step": 6951 + }, + { + "epoch": 8.89088, + "grad_norm": 0.5575345158576965, + "learning_rate": 3.5092206218871984e-05, + "loss": 3.194, + "step": 6952 + }, + { + "epoch": 8.89216, + "grad_norm": 0.5439948439598083, + "learning_rate": 3.5051823933234614e-05, + "loss": 3.2022, + "step": 6953 + }, + { + "epoch": 8.89344, + "grad_norm": 0.5493448972702026, + "learning_rate": 3.501144164759725e-05, + "loss": 3.2155, + "step": 6954 + }, + { + "epoch": 8.89472, + "grad_norm": 0.5295661091804504, + "learning_rate": 3.497105936195988e-05, + "loss": 3.1321, + "step": 6955 + }, + { + "epoch": 8.896, + "grad_norm": 0.5430306792259216, + "learning_rate": 3.493067707632252e-05, + "loss": 3.1363, + "step": 6956 + }, + { + "epoch": 8.89728, + "grad_norm": 0.5532288551330566, + "learning_rate": 3.489029479068515e-05, + "loss": 3.2848, + "step": 6957 + }, + { + "epoch": 8.89856, + "grad_norm": 0.5648680329322815, + "learning_rate": 3.484991250504778e-05, + "loss": 3.1427, + "step": 6958 + }, + { + "epoch": 8.89984, + "grad_norm": 0.5307405591011047, + "learning_rate": 3.4809530219410415e-05, + "loss": 3.2077, + "step": 6959 + }, + { + "epoch": 8.90112, + "grad_norm": 0.5643923282623291, + "learning_rate": 3.4769147933773045e-05, + "loss": 3.1602, + "step": 6960 + }, + { + "epoch": 8.9024, + "grad_norm": 0.5337897539138794, + "learning_rate": 3.472876564813568e-05, + "loss": 3.186, + "step": 6961 + }, + { + "epoch": 8.90368, + "grad_norm": 0.5449624061584473, + "learning_rate": 3.468838336249831e-05, + "loss": 3.223, + "step": 6962 + }, + { + "epoch": 8.904959999999999, + "grad_norm": 0.5331872701644897, + "learning_rate": 3.464800107686095e-05, + "loss": 3.1097, + "step": 6963 + }, + { + "epoch": 8.90624, + "grad_norm": 0.5646026134490967, + "learning_rate": 3.460761879122358e-05, + "loss": 3.203, + "step": 6964 + }, + { + "epoch": 8.90752, + "grad_norm": 0.554621696472168, + "learning_rate": 3.456723650558621e-05, + "loss": 3.1597, + "step": 6965 + }, + { + "epoch": 8.9088, + "grad_norm": 0.5355739593505859, + "learning_rate": 3.4526854219948846e-05, + "loss": 3.2187, + "step": 6966 + }, + { + "epoch": 8.91008, + "grad_norm": 0.5411005020141602, + "learning_rate": 3.4486471934311476e-05, + "loss": 3.1622, + "step": 6967 + }, + { + "epoch": 8.91136, + "grad_norm": 0.5598929524421692, + "learning_rate": 3.444608964867411e-05, + "loss": 3.1857, + "step": 6968 + }, + { + "epoch": 8.91264, + "grad_norm": 0.5395918488502502, + "learning_rate": 3.440570736303674e-05, + "loss": 3.1413, + "step": 6969 + }, + { + "epoch": 8.91392, + "grad_norm": 0.5290459394454956, + "learning_rate": 3.436532507739938e-05, + "loss": 3.1358, + "step": 6970 + }, + { + "epoch": 8.9152, + "grad_norm": 0.5369711518287659, + "learning_rate": 3.432494279176201e-05, + "loss": 3.1303, + "step": 6971 + }, + { + "epoch": 8.91648, + "grad_norm": 0.5442224740982056, + "learning_rate": 3.428456050612464e-05, + "loss": 3.1369, + "step": 6972 + }, + { + "epoch": 8.91776, + "grad_norm": 0.5366105437278748, + "learning_rate": 3.424417822048728e-05, + "loss": 3.1323, + "step": 6973 + }, + { + "epoch": 8.91904, + "grad_norm": 0.5568270087242126, + "learning_rate": 3.420379593484991e-05, + "loss": 3.1774, + "step": 6974 + }, + { + "epoch": 8.92032, + "grad_norm": 0.5447999835014343, + "learning_rate": 3.4163413649212544e-05, + "loss": 3.1728, + "step": 6975 + }, + { + "epoch": 8.9216, + "grad_norm": 0.5445603728294373, + "learning_rate": 3.4123031363575174e-05, + "loss": 3.1144, + "step": 6976 + }, + { + "epoch": 8.92288, + "grad_norm": 0.5516496896743774, + "learning_rate": 3.408264907793781e-05, + "loss": 3.1628, + "step": 6977 + }, + { + "epoch": 8.92416, + "grad_norm": 0.5592992901802063, + "learning_rate": 3.404226679230044e-05, + "loss": 3.1663, + "step": 6978 + }, + { + "epoch": 8.92544, + "grad_norm": 0.5563246607780457, + "learning_rate": 3.400188450666307e-05, + "loss": 3.1543, + "step": 6979 + }, + { + "epoch": 8.92672, + "grad_norm": 0.5511194467544556, + "learning_rate": 3.396150222102571e-05, + "loss": 3.2003, + "step": 6980 + }, + { + "epoch": 8.928, + "grad_norm": 0.5466040372848511, + "learning_rate": 3.392111993538834e-05, + "loss": 3.1386, + "step": 6981 + }, + { + "epoch": 8.92928, + "grad_norm": 0.5524888038635254, + "learning_rate": 3.3880737649750975e-05, + "loss": 3.0766, + "step": 6982 + }, + { + "epoch": 8.93056, + "grad_norm": 0.5536906123161316, + "learning_rate": 3.3840355364113605e-05, + "loss": 3.1638, + "step": 6983 + }, + { + "epoch": 8.93184, + "grad_norm": 0.5587788224220276, + "learning_rate": 3.379997307847624e-05, + "loss": 3.2024, + "step": 6984 + }, + { + "epoch": 8.93312, + "grad_norm": 0.5503929257392883, + "learning_rate": 3.375959079283887e-05, + "loss": 3.2101, + "step": 6985 + }, + { + "epoch": 8.9344, + "grad_norm": 0.5357226729393005, + "learning_rate": 3.37192085072015e-05, + "loss": 3.2078, + "step": 6986 + }, + { + "epoch": 8.93568, + "grad_norm": 0.5584990382194519, + "learning_rate": 3.367882622156414e-05, + "loss": 3.186, + "step": 6987 + }, + { + "epoch": 8.93696, + "grad_norm": 0.5508469939231873, + "learning_rate": 3.363844393592677e-05, + "loss": 3.1558, + "step": 6988 + }, + { + "epoch": 8.93824, + "grad_norm": 0.5476589202880859, + "learning_rate": 3.3598061650289406e-05, + "loss": 3.1501, + "step": 6989 + }, + { + "epoch": 8.93952, + "grad_norm": 0.5468451380729675, + "learning_rate": 3.3557679364652036e-05, + "loss": 3.211, + "step": 6990 + }, + { + "epoch": 8.9408, + "grad_norm": 0.5536555647850037, + "learning_rate": 3.351729707901467e-05, + "loss": 3.1646, + "step": 6991 + }, + { + "epoch": 8.94208, + "grad_norm": 0.5505756735801697, + "learning_rate": 3.34769147933773e-05, + "loss": 3.1244, + "step": 6992 + }, + { + "epoch": 8.94336, + "grad_norm": 0.560417890548706, + "learning_rate": 3.343653250773993e-05, + "loss": 3.1672, + "step": 6993 + }, + { + "epoch": 8.94464, + "grad_norm": 0.5642468929290771, + "learning_rate": 3.339615022210257e-05, + "loss": 3.1955, + "step": 6994 + }, + { + "epoch": 8.94592, + "grad_norm": 0.5576351881027222, + "learning_rate": 3.33557679364652e-05, + "loss": 3.1591, + "step": 6995 + }, + { + "epoch": 8.9472, + "grad_norm": 0.543550968170166, + "learning_rate": 3.3315385650827836e-05, + "loss": 3.1024, + "step": 6996 + }, + { + "epoch": 8.94848, + "grad_norm": 0.5292938947677612, + "learning_rate": 3.3275003365190467e-05, + "loss": 3.1107, + "step": 6997 + }, + { + "epoch": 8.94976, + "grad_norm": 0.5450161695480347, + "learning_rate": 3.32346210795531e-05, + "loss": 3.1862, + "step": 6998 + }, + { + "epoch": 8.95104, + "grad_norm": 0.5542306303977966, + "learning_rate": 3.3194238793915733e-05, + "loss": 3.209, + "step": 6999 + }, + { + "epoch": 8.95232, + "grad_norm": 0.552590548992157, + "learning_rate": 3.3153856508278364e-05, + "loss": 3.2279, + "step": 7000 + }, + { + "epoch": 8.9536, + "grad_norm": 0.548201858997345, + "learning_rate": 3.3113474222641e-05, + "loss": 3.1338, + "step": 7001 + }, + { + "epoch": 8.95488, + "grad_norm": 0.5407779216766357, + "learning_rate": 3.307309193700363e-05, + "loss": 3.1704, + "step": 7002 + }, + { + "epoch": 8.95616, + "grad_norm": 0.5337063074111938, + "learning_rate": 3.303270965136627e-05, + "loss": 3.1294, + "step": 7003 + }, + { + "epoch": 8.95744, + "grad_norm": 0.5435624718666077, + "learning_rate": 3.29923273657289e-05, + "loss": 3.2299, + "step": 7004 + }, + { + "epoch": 8.95872, + "grad_norm": 0.5566171407699585, + "learning_rate": 3.295194508009153e-05, + "loss": 3.2128, + "step": 7005 + }, + { + "epoch": 8.96, + "grad_norm": 0.5542343854904175, + "learning_rate": 3.2911562794454164e-05, + "loss": 3.1888, + "step": 7006 + }, + { + "epoch": 8.96128, + "grad_norm": 0.5571079254150391, + "learning_rate": 3.2871180508816794e-05, + "loss": 3.1994, + "step": 7007 + }, + { + "epoch": 8.96256, + "grad_norm": 0.5469478964805603, + "learning_rate": 3.283079822317943e-05, + "loss": 3.1812, + "step": 7008 + }, + { + "epoch": 8.96384, + "grad_norm": 0.5388615727424622, + "learning_rate": 3.279041593754206e-05, + "loss": 3.1808, + "step": 7009 + }, + { + "epoch": 8.96512, + "grad_norm": 0.5441920161247253, + "learning_rate": 3.27500336519047e-05, + "loss": 3.1783, + "step": 7010 + }, + { + "epoch": 8.9664, + "grad_norm": 0.5513103604316711, + "learning_rate": 3.270965136626733e-05, + "loss": 3.1525, + "step": 7011 + }, + { + "epoch": 8.96768, + "grad_norm": 0.5584927201271057, + "learning_rate": 3.266926908062996e-05, + "loss": 3.207, + "step": 7012 + }, + { + "epoch": 8.96896, + "grad_norm": 0.5390376448631287, + "learning_rate": 3.2628886794992595e-05, + "loss": 3.1688, + "step": 7013 + }, + { + "epoch": 8.97024, + "grad_norm": 0.5638677477836609, + "learning_rate": 3.2588504509355225e-05, + "loss": 3.2089, + "step": 7014 + }, + { + "epoch": 8.97152, + "grad_norm": 0.5538638830184937, + "learning_rate": 3.254812222371786e-05, + "loss": 3.1597, + "step": 7015 + }, + { + "epoch": 8.9728, + "grad_norm": 0.5503858923912048, + "learning_rate": 3.250773993808049e-05, + "loss": 3.1927, + "step": 7016 + }, + { + "epoch": 8.97408, + "grad_norm": 0.5431429147720337, + "learning_rate": 3.246735765244313e-05, + "loss": 3.2362, + "step": 7017 + }, + { + "epoch": 8.97536, + "grad_norm": 0.5614336133003235, + "learning_rate": 3.242697536680576e-05, + "loss": 3.1585, + "step": 7018 + }, + { + "epoch": 8.97664, + "grad_norm": 0.540317714214325, + "learning_rate": 3.238659308116839e-05, + "loss": 3.2157, + "step": 7019 + }, + { + "epoch": 8.97792, + "grad_norm": 0.5396054983139038, + "learning_rate": 3.2346210795531026e-05, + "loss": 3.1543, + "step": 7020 + }, + { + "epoch": 8.9792, + "grad_norm": 0.541448712348938, + "learning_rate": 3.2305828509893656e-05, + "loss": 3.1709, + "step": 7021 + }, + { + "epoch": 8.98048, + "grad_norm": 0.553905725479126, + "learning_rate": 3.226544622425629e-05, + "loss": 3.1969, + "step": 7022 + }, + { + "epoch": 8.98176, + "grad_norm": 0.5426627397537231, + "learning_rate": 3.222506393861892e-05, + "loss": 3.2246, + "step": 7023 + }, + { + "epoch": 8.98304, + "grad_norm": 0.5612876415252686, + "learning_rate": 3.218468165298156e-05, + "loss": 3.2059, + "step": 7024 + }, + { + "epoch": 8.98432, + "grad_norm": 0.5525398850440979, + "learning_rate": 3.214429936734419e-05, + "loss": 3.1842, + "step": 7025 + }, + { + "epoch": 8.9856, + "grad_norm": 0.5441053509712219, + "learning_rate": 3.210391708170682e-05, + "loss": 3.2081, + "step": 7026 + }, + { + "epoch": 8.98688, + "grad_norm": 0.5565687417984009, + "learning_rate": 3.206353479606946e-05, + "loss": 3.2089, + "step": 7027 + }, + { + "epoch": 8.98816, + "grad_norm": 0.553962230682373, + "learning_rate": 3.202315251043209e-05, + "loss": 3.2249, + "step": 7028 + }, + { + "epoch": 8.98944, + "grad_norm": 0.5383572578430176, + "learning_rate": 3.1982770224794724e-05, + "loss": 3.1249, + "step": 7029 + }, + { + "epoch": 8.99072, + "grad_norm": 0.5608500242233276, + "learning_rate": 3.1942387939157354e-05, + "loss": 3.2204, + "step": 7030 + }, + { + "epoch": 8.992, + "grad_norm": 0.5564176440238953, + "learning_rate": 3.190200565351999e-05, + "loss": 3.222, + "step": 7031 + }, + { + "epoch": 8.99328, + "grad_norm": 0.5383712649345398, + "learning_rate": 3.186162336788262e-05, + "loss": 3.1404, + "step": 7032 + }, + { + "epoch": 8.99456, + "grad_norm": 0.5401707291603088, + "learning_rate": 3.182124108224525e-05, + "loss": 3.1736, + "step": 7033 + }, + { + "epoch": 8.99584, + "grad_norm": 0.5486315488815308, + "learning_rate": 3.178085879660789e-05, + "loss": 3.1421, + "step": 7034 + }, + { + "epoch": 8.99712, + "grad_norm": 0.5637720227241516, + "learning_rate": 3.174047651097052e-05, + "loss": 3.0799, + "step": 7035 + }, + { + "epoch": 8.9984, + "grad_norm": 0.5368375182151794, + "learning_rate": 3.1700094225333155e-05, + "loss": 3.1562, + "step": 7036 + }, + { + "epoch": 8.99968, + "grad_norm": 0.5359020233154297, + "learning_rate": 3.1659711939695785e-05, + "loss": 3.1867, + "step": 7037 + }, + { + "epoch": 9.0, + "grad_norm": 0.9711993932723999, + "learning_rate": 3.161932965405842e-05, + "loss": 2.7479, + "step": 7038 + }, + { + "epoch": 9.00128, + "grad_norm": 0.5583388209342957, + "learning_rate": 3.1578947368421045e-05, + "loss": 3.2842, + "step": 7039 + }, + { + "epoch": 9.00256, + "grad_norm": 0.5551260113716125, + "learning_rate": 3.153856508278368e-05, + "loss": 3.2511, + "step": 7040 + }, + { + "epoch": 9.00384, + "grad_norm": 0.5703940987586975, + "learning_rate": 3.149818279714632e-05, + "loss": 3.2821, + "step": 7041 + }, + { + "epoch": 9.00512, + "grad_norm": 0.5474501848220825, + "learning_rate": 3.145780051150895e-05, + "loss": 3.2544, + "step": 7042 + }, + { + "epoch": 9.0064, + "grad_norm": 0.5608713030815125, + "learning_rate": 3.1417418225871586e-05, + "loss": 3.3234, + "step": 7043 + }, + { + "epoch": 9.00768, + "grad_norm": 0.5595000386238098, + "learning_rate": 3.1377035940234216e-05, + "loss": 3.2516, + "step": 7044 + }, + { + "epoch": 9.00896, + "grad_norm": 0.5540786385536194, + "learning_rate": 3.133665365459685e-05, + "loss": 3.3641, + "step": 7045 + }, + { + "epoch": 9.01024, + "grad_norm": 0.5481628775596619, + "learning_rate": 3.1296271368959476e-05, + "loss": 3.1462, + "step": 7046 + }, + { + "epoch": 9.01152, + "grad_norm": 0.545194685459137, + "learning_rate": 3.125588908332211e-05, + "loss": 3.2292, + "step": 7047 + }, + { + "epoch": 9.0128, + "grad_norm": 0.5309725403785706, + "learning_rate": 3.121550679768474e-05, + "loss": 3.244, + "step": 7048 + }, + { + "epoch": 9.01408, + "grad_norm": 0.5396099090576172, + "learning_rate": 3.117512451204738e-05, + "loss": 3.3037, + "step": 7049 + }, + { + "epoch": 9.01536, + "grad_norm": 0.5477572083473206, + "learning_rate": 3.1134742226410016e-05, + "loss": 3.1592, + "step": 7050 + }, + { + "epoch": 9.01664, + "grad_norm": 0.5614184737205505, + "learning_rate": 3.1094359940772646e-05, + "loss": 3.1785, + "step": 7051 + }, + { + "epoch": 9.01792, + "grad_norm": 0.5425687432289124, + "learning_rate": 3.1053977655135277e-05, + "loss": 3.2016, + "step": 7052 + }, + { + "epoch": 9.0192, + "grad_norm": 0.5536109209060669, + "learning_rate": 3.1013595369497907e-05, + "loss": 3.247, + "step": 7053 + }, + { + "epoch": 9.02048, + "grad_norm": 0.5437664985656738, + "learning_rate": 3.0973213083860543e-05, + "loss": 3.2428, + "step": 7054 + }, + { + "epoch": 9.02176, + "grad_norm": 0.5411404371261597, + "learning_rate": 3.0932830798223174e-05, + "loss": 3.2555, + "step": 7055 + }, + { + "epoch": 9.02304, + "grad_norm": 0.5475705862045288, + "learning_rate": 3.089244851258581e-05, + "loss": 3.2672, + "step": 7056 + }, + { + "epoch": 9.02432, + "grad_norm": 0.5438557863235474, + "learning_rate": 3.085206622694844e-05, + "loss": 3.1994, + "step": 7057 + }, + { + "epoch": 9.0256, + "grad_norm": 0.54364413022995, + "learning_rate": 3.081168394131108e-05, + "loss": 3.249, + "step": 7058 + }, + { + "epoch": 9.02688, + "grad_norm": 0.5349510908126831, + "learning_rate": 3.077130165567371e-05, + "loss": 3.2396, + "step": 7059 + }, + { + "epoch": 9.02816, + "grad_norm": 0.5500809550285339, + "learning_rate": 3.073091937003634e-05, + "loss": 3.2324, + "step": 7060 + }, + { + "epoch": 9.02944, + "grad_norm": 0.5365533232688904, + "learning_rate": 3.0690537084398974e-05, + "loss": 3.1999, + "step": 7061 + }, + { + "epoch": 9.03072, + "grad_norm": 0.5576481819152832, + "learning_rate": 3.0650154798761604e-05, + "loss": 3.3374, + "step": 7062 + }, + { + "epoch": 9.032, + "grad_norm": 0.5433608293533325, + "learning_rate": 3.060977251312424e-05, + "loss": 3.2957, + "step": 7063 + }, + { + "epoch": 9.03328, + "grad_norm": 0.5531395673751831, + "learning_rate": 3.056939022748687e-05, + "loss": 3.2484, + "step": 7064 + }, + { + "epoch": 9.03456, + "grad_norm": 0.5414525866508484, + "learning_rate": 3.052900794184951e-05, + "loss": 3.218, + "step": 7065 + }, + { + "epoch": 9.03584, + "grad_norm": 0.5335577726364136, + "learning_rate": 3.0488625656212138e-05, + "loss": 3.1524, + "step": 7066 + }, + { + "epoch": 9.03712, + "grad_norm": 0.5346528887748718, + "learning_rate": 3.044824337057477e-05, + "loss": 3.172, + "step": 7067 + }, + { + "epoch": 9.0384, + "grad_norm": 0.5395179390907288, + "learning_rate": 3.0407861084937405e-05, + "loss": 3.1621, + "step": 7068 + }, + { + "epoch": 9.03968, + "grad_norm": 0.544449508190155, + "learning_rate": 3.036747879930004e-05, + "loss": 3.2452, + "step": 7069 + }, + { + "epoch": 9.04096, + "grad_norm": 0.5622183680534363, + "learning_rate": 3.0327096513662672e-05, + "loss": 3.2826, + "step": 7070 + }, + { + "epoch": 9.04224, + "grad_norm": 0.543907880783081, + "learning_rate": 3.0286714228025306e-05, + "loss": 3.2684, + "step": 7071 + }, + { + "epoch": 9.043520000000001, + "grad_norm": 0.5473181009292603, + "learning_rate": 3.024633194238794e-05, + "loss": 3.2676, + "step": 7072 + }, + { + "epoch": 9.0448, + "grad_norm": 0.5329403877258301, + "learning_rate": 3.020594965675057e-05, + "loss": 3.2042, + "step": 7073 + }, + { + "epoch": 9.04608, + "grad_norm": 0.519966185092926, + "learning_rate": 3.0165567371113203e-05, + "loss": 3.1961, + "step": 7074 + }, + { + "epoch": 9.04736, + "grad_norm": 0.5505262017250061, + "learning_rate": 3.0125185085475836e-05, + "loss": 3.2669, + "step": 7075 + }, + { + "epoch": 9.04864, + "grad_norm": 0.5391356945037842, + "learning_rate": 3.008480279983847e-05, + "loss": 3.2735, + "step": 7076 + }, + { + "epoch": 9.04992, + "grad_norm": 0.5384522676467896, + "learning_rate": 3.0044420514201103e-05, + "loss": 3.3018, + "step": 7077 + }, + { + "epoch": 9.0512, + "grad_norm": 0.5468092560768127, + "learning_rate": 3.0004038228563736e-05, + "loss": 3.2427, + "step": 7078 + }, + { + "epoch": 9.05248, + "grad_norm": 0.5462827682495117, + "learning_rate": 2.9963655942926366e-05, + "loss": 3.2236, + "step": 7079 + }, + { + "epoch": 9.05376, + "grad_norm": 0.5457635521888733, + "learning_rate": 2.9923273657289e-05, + "loss": 3.1881, + "step": 7080 + }, + { + "epoch": 9.05504, + "grad_norm": 0.5283154249191284, + "learning_rate": 2.9882891371651633e-05, + "loss": 3.2305, + "step": 7081 + }, + { + "epoch": 9.05632, + "grad_norm": 0.533507227897644, + "learning_rate": 2.9842509086014267e-05, + "loss": 3.2302, + "step": 7082 + }, + { + "epoch": 9.0576, + "grad_norm": 0.5426484942436218, + "learning_rate": 2.98021268003769e-05, + "loss": 3.2233, + "step": 7083 + }, + { + "epoch": 9.05888, + "grad_norm": 0.5597418546676636, + "learning_rate": 2.9761744514739534e-05, + "loss": 3.2381, + "step": 7084 + }, + { + "epoch": 9.06016, + "grad_norm": 0.5480257868766785, + "learning_rate": 2.9721362229102167e-05, + "loss": 3.2169, + "step": 7085 + }, + { + "epoch": 9.06144, + "grad_norm": 0.5565598011016846, + "learning_rate": 2.9680979943464797e-05, + "loss": 3.2586, + "step": 7086 + }, + { + "epoch": 9.06272, + "grad_norm": 0.5538652539253235, + "learning_rate": 2.964059765782743e-05, + "loss": 3.211, + "step": 7087 + }, + { + "epoch": 9.064, + "grad_norm": 0.5413698554039001, + "learning_rate": 2.9600215372190064e-05, + "loss": 3.2349, + "step": 7088 + }, + { + "epoch": 9.06528, + "grad_norm": 0.5424516797065735, + "learning_rate": 2.9559833086552698e-05, + "loss": 3.2359, + "step": 7089 + }, + { + "epoch": 9.06656, + "grad_norm": 0.5624558925628662, + "learning_rate": 2.951945080091533e-05, + "loss": 3.2008, + "step": 7090 + }, + { + "epoch": 9.06784, + "grad_norm": 0.5545729398727417, + "learning_rate": 2.9479068515277965e-05, + "loss": 3.2058, + "step": 7091 + }, + { + "epoch": 9.06912, + "grad_norm": 0.5462035536766052, + "learning_rate": 2.9438686229640598e-05, + "loss": 3.2632, + "step": 7092 + }, + { + "epoch": 9.0704, + "grad_norm": 0.5403680801391602, + "learning_rate": 2.9398303944003225e-05, + "loss": 3.2468, + "step": 7093 + }, + { + "epoch": 9.07168, + "grad_norm": 0.5381686091423035, + "learning_rate": 2.935792165836586e-05, + "loss": 3.2539, + "step": 7094 + }, + { + "epoch": 9.07296, + "grad_norm": 0.5552157759666443, + "learning_rate": 2.9317539372728495e-05, + "loss": 3.2792, + "step": 7095 + }, + { + "epoch": 9.07424, + "grad_norm": 0.5554720759391785, + "learning_rate": 2.927715708709113e-05, + "loss": 3.1714, + "step": 7096 + }, + { + "epoch": 9.07552, + "grad_norm": 0.5406906604766846, + "learning_rate": 2.9236774801453762e-05, + "loss": 3.1821, + "step": 7097 + }, + { + "epoch": 9.0768, + "grad_norm": 0.5409752130508423, + "learning_rate": 2.9196392515816396e-05, + "loss": 3.2008, + "step": 7098 + }, + { + "epoch": 9.07808, + "grad_norm": 0.5609580874443054, + "learning_rate": 2.9156010230179022e-05, + "loss": 3.2278, + "step": 7099 + }, + { + "epoch": 9.07936, + "grad_norm": 0.5392387509346008, + "learning_rate": 2.9115627944541656e-05, + "loss": 3.2108, + "step": 7100 + }, + { + "epoch": 9.08064, + "grad_norm": 0.5388525128364563, + "learning_rate": 2.907524565890429e-05, + "loss": 3.2953, + "step": 7101 + }, + { + "epoch": 9.08192, + "grad_norm": 0.544260561466217, + "learning_rate": 2.9034863373266923e-05, + "loss": 3.2398, + "step": 7102 + }, + { + "epoch": 9.0832, + "grad_norm": 0.5434393882751465, + "learning_rate": 2.8994481087629556e-05, + "loss": 3.2731, + "step": 7103 + }, + { + "epoch": 9.08448, + "grad_norm": 0.5499718189239502, + "learning_rate": 2.8954098801992193e-05, + "loss": 3.1798, + "step": 7104 + }, + { + "epoch": 9.08576, + "grad_norm": 0.53836590051651, + "learning_rate": 2.8913716516354826e-05, + "loss": 3.2279, + "step": 7105 + }, + { + "epoch": 9.08704, + "grad_norm": 0.5385804772377014, + "learning_rate": 2.8873334230717453e-05, + "loss": 3.2543, + "step": 7106 + }, + { + "epoch": 9.08832, + "grad_norm": 0.5505902767181396, + "learning_rate": 2.8832951945080087e-05, + "loss": 3.1762, + "step": 7107 + }, + { + "epoch": 9.0896, + "grad_norm": 0.5531958937644958, + "learning_rate": 2.879256965944272e-05, + "loss": 3.197, + "step": 7108 + }, + { + "epoch": 9.09088, + "grad_norm": 0.5575078725814819, + "learning_rate": 2.8752187373805353e-05, + "loss": 3.2469, + "step": 7109 + }, + { + "epoch": 9.09216, + "grad_norm": 0.5386976003646851, + "learning_rate": 2.8711805088167987e-05, + "loss": 3.2092, + "step": 7110 + }, + { + "epoch": 9.09344, + "grad_norm": 0.5384331345558167, + "learning_rate": 2.867142280253062e-05, + "loss": 3.2562, + "step": 7111 + }, + { + "epoch": 9.09472, + "grad_norm": 0.549574077129364, + "learning_rate": 2.8631040516893254e-05, + "loss": 3.2311, + "step": 7112 + }, + { + "epoch": 9.096, + "grad_norm": 0.5679108500480652, + "learning_rate": 2.8590658231255884e-05, + "loss": 3.3449, + "step": 7113 + }, + { + "epoch": 9.09728, + "grad_norm": 0.5485793948173523, + "learning_rate": 2.8550275945618517e-05, + "loss": 3.2654, + "step": 7114 + }, + { + "epoch": 9.09856, + "grad_norm": 0.5479905605316162, + "learning_rate": 2.850989365998115e-05, + "loss": 3.252, + "step": 7115 + }, + { + "epoch": 9.09984, + "grad_norm": 0.5610731840133667, + "learning_rate": 2.8469511374343784e-05, + "loss": 3.2753, + "step": 7116 + }, + { + "epoch": 9.10112, + "grad_norm": 0.5504294633865356, + "learning_rate": 2.8429129088706418e-05, + "loss": 3.2685, + "step": 7117 + }, + { + "epoch": 9.1024, + "grad_norm": 0.5451478958129883, + "learning_rate": 2.838874680306905e-05, + "loss": 3.1972, + "step": 7118 + }, + { + "epoch": 9.10368, + "grad_norm": 0.5582462549209595, + "learning_rate": 2.8348364517431685e-05, + "loss": 3.2463, + "step": 7119 + }, + { + "epoch": 9.10496, + "grad_norm": 0.5593787431716919, + "learning_rate": 2.8307982231794315e-05, + "loss": 3.304, + "step": 7120 + }, + { + "epoch": 9.10624, + "grad_norm": 0.5443241596221924, + "learning_rate": 2.8267599946156948e-05, + "loss": 3.1912, + "step": 7121 + }, + { + "epoch": 9.10752, + "grad_norm": 0.52846759557724, + "learning_rate": 2.822721766051958e-05, + "loss": 3.1861, + "step": 7122 + }, + { + "epoch": 9.1088, + "grad_norm": 0.5365118384361267, + "learning_rate": 2.8186835374882215e-05, + "loss": 3.227, + "step": 7123 + }, + { + "epoch": 9.11008, + "grad_norm": 0.5490257740020752, + "learning_rate": 2.814645308924485e-05, + "loss": 3.2294, + "step": 7124 + }, + { + "epoch": 9.11136, + "grad_norm": 0.5475121140480042, + "learning_rate": 2.8106070803607482e-05, + "loss": 3.2356, + "step": 7125 + }, + { + "epoch": 9.11264, + "grad_norm": 0.5491126775741577, + "learning_rate": 2.8065688517970112e-05, + "loss": 3.2843, + "step": 7126 + }, + { + "epoch": 9.11392, + "grad_norm": 0.5404393076896667, + "learning_rate": 2.8025306232332746e-05, + "loss": 3.2315, + "step": 7127 + }, + { + "epoch": 9.1152, + "grad_norm": 0.5436875224113464, + "learning_rate": 2.798492394669538e-05, + "loss": 3.2436, + "step": 7128 + }, + { + "epoch": 9.11648, + "grad_norm": 0.5332518219947815, + "learning_rate": 2.7944541661058013e-05, + "loss": 3.2059, + "step": 7129 + }, + { + "epoch": 9.11776, + "grad_norm": 0.5293616056442261, + "learning_rate": 2.7904159375420646e-05, + "loss": 3.2773, + "step": 7130 + }, + { + "epoch": 9.11904, + "grad_norm": 0.5454232692718506, + "learning_rate": 2.786377708978328e-05, + "loss": 3.2034, + "step": 7131 + }, + { + "epoch": 9.12032, + "grad_norm": 0.5401408076286316, + "learning_rate": 2.7823394804145913e-05, + "loss": 3.1661, + "step": 7132 + }, + { + "epoch": 9.1216, + "grad_norm": 0.5544297695159912, + "learning_rate": 2.7783012518508543e-05, + "loss": 3.2872, + "step": 7133 + }, + { + "epoch": 9.12288, + "grad_norm": 0.5589653253555298, + "learning_rate": 2.7742630232871176e-05, + "loss": 3.3172, + "step": 7134 + }, + { + "epoch": 9.12416, + "grad_norm": 0.5525814294815063, + "learning_rate": 2.770224794723381e-05, + "loss": 3.2713, + "step": 7135 + }, + { + "epoch": 9.12544, + "grad_norm": 0.5349526405334473, + "learning_rate": 2.7661865661596443e-05, + "loss": 3.2127, + "step": 7136 + }, + { + "epoch": 9.12672, + "grad_norm": 0.5305134654045105, + "learning_rate": 2.7621483375959077e-05, + "loss": 3.1684, + "step": 7137 + }, + { + "epoch": 9.128, + "grad_norm": 0.5401234030723572, + "learning_rate": 2.758110109032171e-05, + "loss": 3.3105, + "step": 7138 + }, + { + "epoch": 9.12928, + "grad_norm": 0.5535856485366821, + "learning_rate": 2.7540718804684344e-05, + "loss": 3.2225, + "step": 7139 + }, + { + "epoch": 9.13056, + "grad_norm": 0.5476309061050415, + "learning_rate": 2.7500336519046974e-05, + "loss": 3.2879, + "step": 7140 + }, + { + "epoch": 9.13184, + "grad_norm": 0.5413089394569397, + "learning_rate": 2.7459954233409607e-05, + "loss": 3.1868, + "step": 7141 + }, + { + "epoch": 9.13312, + "grad_norm": 0.548028290271759, + "learning_rate": 2.741957194777224e-05, + "loss": 3.2499, + "step": 7142 + }, + { + "epoch": 9.1344, + "grad_norm": 0.5540904402732849, + "learning_rate": 2.7379189662134874e-05, + "loss": 3.2888, + "step": 7143 + }, + { + "epoch": 9.13568, + "grad_norm": 0.5426977872848511, + "learning_rate": 2.7338807376497508e-05, + "loss": 3.227, + "step": 7144 + }, + { + "epoch": 9.13696, + "grad_norm": 0.5503297448158264, + "learning_rate": 2.729842509086014e-05, + "loss": 3.1835, + "step": 7145 + }, + { + "epoch": 9.13824, + "grad_norm": 0.5338129997253418, + "learning_rate": 2.725804280522277e-05, + "loss": 3.257, + "step": 7146 + }, + { + "epoch": 9.13952, + "grad_norm": 0.5652978420257568, + "learning_rate": 2.7217660519585405e-05, + "loss": 3.2995, + "step": 7147 + }, + { + "epoch": 9.1408, + "grad_norm": 0.5556164979934692, + "learning_rate": 2.7177278233948038e-05, + "loss": 3.2905, + "step": 7148 + }, + { + "epoch": 9.14208, + "grad_norm": 0.5410715341567993, + "learning_rate": 2.713689594831067e-05, + "loss": 3.3382, + "step": 7149 + }, + { + "epoch": 9.14336, + "grad_norm": 0.546202540397644, + "learning_rate": 2.7096513662673305e-05, + "loss": 3.321, + "step": 7150 + }, + { + "epoch": 9.14464, + "grad_norm": 0.5364200472831726, + "learning_rate": 2.705613137703594e-05, + "loss": 3.2471, + "step": 7151 + }, + { + "epoch": 9.14592, + "grad_norm": 0.5380752086639404, + "learning_rate": 2.7015749091398572e-05, + "loss": 3.3008, + "step": 7152 + }, + { + "epoch": 9.1472, + "grad_norm": 0.538922131061554, + "learning_rate": 2.6975366805761202e-05, + "loss": 3.2591, + "step": 7153 + }, + { + "epoch": 9.14848, + "grad_norm": 0.5580816864967346, + "learning_rate": 2.6934984520123836e-05, + "loss": 3.2522, + "step": 7154 + }, + { + "epoch": 9.14976, + "grad_norm": 0.5336586236953735, + "learning_rate": 2.689460223448647e-05, + "loss": 3.1542, + "step": 7155 + }, + { + "epoch": 9.15104, + "grad_norm": 0.5401532649993896, + "learning_rate": 2.6854219948849103e-05, + "loss": 3.1945, + "step": 7156 + }, + { + "epoch": 9.15232, + "grad_norm": 0.5493281483650208, + "learning_rate": 2.6813837663211736e-05, + "loss": 3.3135, + "step": 7157 + }, + { + "epoch": 9.1536, + "grad_norm": 0.545318603515625, + "learning_rate": 2.677345537757437e-05, + "loss": 3.2359, + "step": 7158 + }, + { + "epoch": 9.15488, + "grad_norm": 0.5534164309501648, + "learning_rate": 2.6733073091937003e-05, + "loss": 3.2863, + "step": 7159 + }, + { + "epoch": 9.15616, + "grad_norm": 0.547370970249176, + "learning_rate": 2.6692690806299633e-05, + "loss": 3.2479, + "step": 7160 + }, + { + "epoch": 9.15744, + "grad_norm": 0.5454353094100952, + "learning_rate": 2.6652308520662266e-05, + "loss": 3.2092, + "step": 7161 + }, + { + "epoch": 9.15872, + "grad_norm": 0.5439633727073669, + "learning_rate": 2.66119262350249e-05, + "loss": 3.2338, + "step": 7162 + }, + { + "epoch": 9.16, + "grad_norm": 0.5375586152076721, + "learning_rate": 2.6571543949387533e-05, + "loss": 3.2332, + "step": 7163 + }, + { + "epoch": 9.16128, + "grad_norm": 0.5413592457771301, + "learning_rate": 2.6531161663750167e-05, + "loss": 3.1831, + "step": 7164 + }, + { + "epoch": 9.16256, + "grad_norm": 0.5374849438667297, + "learning_rate": 2.64907793781128e-05, + "loss": 3.2544, + "step": 7165 + }, + { + "epoch": 9.16384, + "grad_norm": 0.5411040782928467, + "learning_rate": 2.6450397092475434e-05, + "loss": 3.2554, + "step": 7166 + }, + { + "epoch": 9.16512, + "grad_norm": 0.5422086119651794, + "learning_rate": 2.6410014806838064e-05, + "loss": 3.1815, + "step": 7167 + }, + { + "epoch": 9.1664, + "grad_norm": 0.5530745983123779, + "learning_rate": 2.6369632521200697e-05, + "loss": 3.329, + "step": 7168 + }, + { + "epoch": 9.16768, + "grad_norm": 0.535277247428894, + "learning_rate": 2.632925023556333e-05, + "loss": 3.1514, + "step": 7169 + }, + { + "epoch": 9.16896, + "grad_norm": 0.5430591106414795, + "learning_rate": 2.6288867949925964e-05, + "loss": 3.2385, + "step": 7170 + }, + { + "epoch": 9.17024, + "grad_norm": 0.5327318906784058, + "learning_rate": 2.6248485664288598e-05, + "loss": 3.2665, + "step": 7171 + }, + { + "epoch": 9.17152, + "grad_norm": 0.5375852584838867, + "learning_rate": 2.620810337865123e-05, + "loss": 3.1724, + "step": 7172 + }, + { + "epoch": 9.1728, + "grad_norm": 0.5394271612167358, + "learning_rate": 2.616772109301386e-05, + "loss": 3.1937, + "step": 7173 + }, + { + "epoch": 9.17408, + "grad_norm": 0.5466704368591309, + "learning_rate": 2.6127338807376495e-05, + "loss": 3.1974, + "step": 7174 + }, + { + "epoch": 9.17536, + "grad_norm": 0.5488426685333252, + "learning_rate": 2.6086956521739128e-05, + "loss": 3.2674, + "step": 7175 + }, + { + "epoch": 9.17664, + "grad_norm": 0.5353536009788513, + "learning_rate": 2.604657423610176e-05, + "loss": 3.2576, + "step": 7176 + }, + { + "epoch": 9.17792, + "grad_norm": 0.5377178192138672, + "learning_rate": 2.6006191950464395e-05, + "loss": 3.2777, + "step": 7177 + }, + { + "epoch": 9.1792, + "grad_norm": 0.5338087677955627, + "learning_rate": 2.596580966482703e-05, + "loss": 3.2165, + "step": 7178 + }, + { + "epoch": 9.18048, + "grad_norm": 0.5402014851570129, + "learning_rate": 2.5925427379189662e-05, + "loss": 3.2682, + "step": 7179 + }, + { + "epoch": 9.18176, + "grad_norm": 0.5575425028800964, + "learning_rate": 2.5885045093552292e-05, + "loss": 3.2025, + "step": 7180 + }, + { + "epoch": 9.18304, + "grad_norm": 0.5500099658966064, + "learning_rate": 2.5844662807914926e-05, + "loss": 3.2428, + "step": 7181 + }, + { + "epoch": 9.18432, + "grad_norm": 0.5393756628036499, + "learning_rate": 2.580428052227756e-05, + "loss": 3.3224, + "step": 7182 + }, + { + "epoch": 9.1856, + "grad_norm": 0.5401633381843567, + "learning_rate": 2.5763898236640192e-05, + "loss": 3.1761, + "step": 7183 + }, + { + "epoch": 9.18688, + "grad_norm": 0.5423944592475891, + "learning_rate": 2.5723515951002826e-05, + "loss": 3.2521, + "step": 7184 + }, + { + "epoch": 9.18816, + "grad_norm": 0.5446013808250427, + "learning_rate": 2.568313366536546e-05, + "loss": 3.2659, + "step": 7185 + }, + { + "epoch": 9.18944, + "grad_norm": 0.5417162775993347, + "learning_rate": 2.5642751379728093e-05, + "loss": 3.2457, + "step": 7186 + }, + { + "epoch": 9.19072, + "grad_norm": 0.5435442328453064, + "learning_rate": 2.5602369094090723e-05, + "loss": 3.2624, + "step": 7187 + }, + { + "epoch": 9.192, + "grad_norm": 0.5303104519844055, + "learning_rate": 2.5561986808453356e-05, + "loss": 3.2494, + "step": 7188 + }, + { + "epoch": 9.19328, + "grad_norm": 0.5464146137237549, + "learning_rate": 2.552160452281599e-05, + "loss": 3.2261, + "step": 7189 + }, + { + "epoch": 9.19456, + "grad_norm": 0.5271885395050049, + "learning_rate": 2.5481222237178623e-05, + "loss": 3.2895, + "step": 7190 + }, + { + "epoch": 9.19584, + "grad_norm": 0.5422387719154358, + "learning_rate": 2.5440839951541257e-05, + "loss": 3.2853, + "step": 7191 + }, + { + "epoch": 9.19712, + "grad_norm": 0.533074676990509, + "learning_rate": 2.540045766590389e-05, + "loss": 3.2502, + "step": 7192 + }, + { + "epoch": 9.1984, + "grad_norm": 0.5399423837661743, + "learning_rate": 2.536007538026652e-05, + "loss": 3.2562, + "step": 7193 + }, + { + "epoch": 9.19968, + "grad_norm": 0.5299935936927795, + "learning_rate": 2.5319693094629154e-05, + "loss": 3.1815, + "step": 7194 + }, + { + "epoch": 9.20096, + "grad_norm": 0.538884162902832, + "learning_rate": 2.5279310808991787e-05, + "loss": 3.1976, + "step": 7195 + }, + { + "epoch": 9.20224, + "grad_norm": 0.551018238067627, + "learning_rate": 2.523892852335442e-05, + "loss": 3.2776, + "step": 7196 + }, + { + "epoch": 9.20352, + "grad_norm": 0.5342896580696106, + "learning_rate": 2.5198546237717054e-05, + "loss": 3.1877, + "step": 7197 + }, + { + "epoch": 9.2048, + "grad_norm": 0.5489623546600342, + "learning_rate": 2.5158163952079688e-05, + "loss": 3.2537, + "step": 7198 + }, + { + "epoch": 9.20608, + "grad_norm": 0.5403774380683899, + "learning_rate": 2.511778166644232e-05, + "loss": 3.2378, + "step": 7199 + }, + { + "epoch": 9.20736, + "grad_norm": 0.5302898287773132, + "learning_rate": 2.5077399380804948e-05, + "loss": 3.1942, + "step": 7200 + }, + { + "epoch": 9.20864, + "grad_norm": 0.5362948179244995, + "learning_rate": 2.5037017095167585e-05, + "loss": 3.2845, + "step": 7201 + }, + { + "epoch": 9.20992, + "grad_norm": 0.5381386280059814, + "learning_rate": 2.4996634809530218e-05, + "loss": 3.2257, + "step": 7202 + }, + { + "epoch": 9.2112, + "grad_norm": 0.5591046214103699, + "learning_rate": 2.495625252389285e-05, + "loss": 3.2319, + "step": 7203 + }, + { + "epoch": 9.21248, + "grad_norm": 0.5458407998085022, + "learning_rate": 2.4915870238255485e-05, + "loss": 3.24, + "step": 7204 + }, + { + "epoch": 9.21376, + "grad_norm": 0.5337607860565186, + "learning_rate": 2.487548795261812e-05, + "loss": 3.3132, + "step": 7205 + }, + { + "epoch": 9.21504, + "grad_norm": 0.5361645817756653, + "learning_rate": 2.4835105666980752e-05, + "loss": 3.2828, + "step": 7206 + }, + { + "epoch": 9.21632, + "grad_norm": 0.5330992937088013, + "learning_rate": 2.479472338134338e-05, + "loss": 3.2427, + "step": 7207 + }, + { + "epoch": 9.2176, + "grad_norm": 0.5395091772079468, + "learning_rate": 2.4754341095706012e-05, + "loss": 3.2042, + "step": 7208 + }, + { + "epoch": 9.21888, + "grad_norm": 0.5380380749702454, + "learning_rate": 2.4713958810068646e-05, + "loss": 3.2095, + "step": 7209 + }, + { + "epoch": 9.22016, + "grad_norm": 0.5278270244598389, + "learning_rate": 2.4673576524431282e-05, + "loss": 3.2077, + "step": 7210 + }, + { + "epoch": 9.22144, + "grad_norm": 0.5470373034477234, + "learning_rate": 2.4633194238793916e-05, + "loss": 3.1724, + "step": 7211 + }, + { + "epoch": 9.22272, + "grad_norm": 0.5408812165260315, + "learning_rate": 2.459281195315655e-05, + "loss": 3.3655, + "step": 7212 + }, + { + "epoch": 9.224, + "grad_norm": 0.5187749266624451, + "learning_rate": 2.4552429667519183e-05, + "loss": 3.2324, + "step": 7213 + }, + { + "epoch": 9.22528, + "grad_norm": 0.5471019148826599, + "learning_rate": 2.451204738188181e-05, + "loss": 3.227, + "step": 7214 + }, + { + "epoch": 9.22656, + "grad_norm": 0.5407528281211853, + "learning_rate": 2.4471665096244443e-05, + "loss": 3.2282, + "step": 7215 + }, + { + "epoch": 9.22784, + "grad_norm": 0.5427590608596802, + "learning_rate": 2.4431282810607076e-05, + "loss": 3.2541, + "step": 7216 + }, + { + "epoch": 9.22912, + "grad_norm": 0.5527137517929077, + "learning_rate": 2.439090052496971e-05, + "loss": 3.3002, + "step": 7217 + }, + { + "epoch": 9.2304, + "grad_norm": 0.5542083978652954, + "learning_rate": 2.4350518239332343e-05, + "loss": 3.2398, + "step": 7218 + }, + { + "epoch": 9.23168, + "grad_norm": 0.5390406250953674, + "learning_rate": 2.431013595369498e-05, + "loss": 3.2449, + "step": 7219 + }, + { + "epoch": 9.23296, + "grad_norm": 0.5343725681304932, + "learning_rate": 2.4269753668057607e-05, + "loss": 3.205, + "step": 7220 + }, + { + "epoch": 9.23424, + "grad_norm": 0.53874671459198, + "learning_rate": 2.422937138242024e-05, + "loss": 3.2123, + "step": 7221 + }, + { + "epoch": 9.23552, + "grad_norm": 0.5471300482749939, + "learning_rate": 2.4188989096782874e-05, + "loss": 3.2171, + "step": 7222 + }, + { + "epoch": 9.2368, + "grad_norm": 0.5401138067245483, + "learning_rate": 2.4148606811145507e-05, + "loss": 3.2517, + "step": 7223 + }, + { + "epoch": 9.23808, + "grad_norm": 0.5442971587181091, + "learning_rate": 2.410822452550814e-05, + "loss": 3.226, + "step": 7224 + }, + { + "epoch": 9.23936, + "grad_norm": 0.5361701846122742, + "learning_rate": 2.4067842239870774e-05, + "loss": 3.2387, + "step": 7225 + }, + { + "epoch": 9.24064, + "grad_norm": 0.5273582935333252, + "learning_rate": 2.4027459954233408e-05, + "loss": 3.2343, + "step": 7226 + }, + { + "epoch": 9.24192, + "grad_norm": 0.5558703541755676, + "learning_rate": 2.3987077668596038e-05, + "loss": 3.291, + "step": 7227 + }, + { + "epoch": 9.2432, + "grad_norm": 0.5563510656356812, + "learning_rate": 2.394669538295867e-05, + "loss": 3.2446, + "step": 7228 + }, + { + "epoch": 9.24448, + "grad_norm": 0.5549482107162476, + "learning_rate": 2.3906313097321305e-05, + "loss": 3.1956, + "step": 7229 + }, + { + "epoch": 9.24576, + "grad_norm": 0.5281810760498047, + "learning_rate": 2.3865930811683938e-05, + "loss": 3.3095, + "step": 7230 + }, + { + "epoch": 9.24704, + "grad_norm": 0.5421765446662903, + "learning_rate": 2.382554852604657e-05, + "loss": 3.2243, + "step": 7231 + }, + { + "epoch": 9.24832, + "grad_norm": 0.5279957056045532, + "learning_rate": 2.3785166240409205e-05, + "loss": 3.2774, + "step": 7232 + }, + { + "epoch": 9.2496, + "grad_norm": 0.5412895679473877, + "learning_rate": 2.374478395477184e-05, + "loss": 3.195, + "step": 7233 + }, + { + "epoch": 9.25088, + "grad_norm": 0.5304553508758545, + "learning_rate": 2.370440166913447e-05, + "loss": 3.2945, + "step": 7234 + }, + { + "epoch": 9.25216, + "grad_norm": 0.536499559879303, + "learning_rate": 2.3664019383497102e-05, + "loss": 3.1877, + "step": 7235 + }, + { + "epoch": 9.25344, + "grad_norm": 0.5327797532081604, + "learning_rate": 2.3623637097859736e-05, + "loss": 3.186, + "step": 7236 + }, + { + "epoch": 9.25472, + "grad_norm": 0.5580500960350037, + "learning_rate": 2.358325481222237e-05, + "loss": 3.2472, + "step": 7237 + }, + { + "epoch": 9.256, + "grad_norm": 0.5418148040771484, + "learning_rate": 2.3542872526585002e-05, + "loss": 3.293, + "step": 7238 + }, + { + "epoch": 9.25728, + "grad_norm": 0.5535669922828674, + "learning_rate": 2.3502490240947636e-05, + "loss": 3.247, + "step": 7239 + }, + { + "epoch": 9.25856, + "grad_norm": 0.5439664721488953, + "learning_rate": 2.3462107955310266e-05, + "loss": 3.2334, + "step": 7240 + }, + { + "epoch": 9.25984, + "grad_norm": 0.5313609838485718, + "learning_rate": 2.34217256696729e-05, + "loss": 3.1706, + "step": 7241 + }, + { + "epoch": 9.26112, + "grad_norm": 0.5202425122261047, + "learning_rate": 2.3381343384035533e-05, + "loss": 3.3, + "step": 7242 + }, + { + "epoch": 9.2624, + "grad_norm": 0.5400013327598572, + "learning_rate": 2.3340961098398166e-05, + "loss": 3.2212, + "step": 7243 + }, + { + "epoch": 9.26368, + "grad_norm": 0.5324732661247253, + "learning_rate": 2.33005788127608e-05, + "loss": 3.1796, + "step": 7244 + }, + { + "epoch": 9.26496, + "grad_norm": 0.5566893815994263, + "learning_rate": 2.3260196527123433e-05, + "loss": 3.2211, + "step": 7245 + }, + { + "epoch": 9.26624, + "grad_norm": 0.5362138152122498, + "learning_rate": 2.3219814241486067e-05, + "loss": 3.2751, + "step": 7246 + }, + { + "epoch": 9.26752, + "grad_norm": 0.5280265808105469, + "learning_rate": 2.3179431955848697e-05, + "loss": 3.3033, + "step": 7247 + }, + { + "epoch": 9.2688, + "grad_norm": 0.5292081236839294, + "learning_rate": 2.313904967021133e-05, + "loss": 3.2031, + "step": 7248 + }, + { + "epoch": 9.27008, + "grad_norm": 0.529188334941864, + "learning_rate": 2.3098667384573964e-05, + "loss": 3.1846, + "step": 7249 + }, + { + "epoch": 9.27136, + "grad_norm": 0.5365098118782043, + "learning_rate": 2.3058285098936597e-05, + "loss": 3.2667, + "step": 7250 + }, + { + "epoch": 9.272639999999999, + "grad_norm": 0.5411892533302307, + "learning_rate": 2.301790281329923e-05, + "loss": 3.2621, + "step": 7251 + }, + { + "epoch": 9.27392, + "grad_norm": 0.5581626296043396, + "learning_rate": 2.2977520527661864e-05, + "loss": 3.2695, + "step": 7252 + }, + { + "epoch": 9.2752, + "grad_norm": 0.5382561087608337, + "learning_rate": 2.2937138242024498e-05, + "loss": 3.1889, + "step": 7253 + }, + { + "epoch": 9.27648, + "grad_norm": 0.5616065859794617, + "learning_rate": 2.2896755956387128e-05, + "loss": 3.3357, + "step": 7254 + }, + { + "epoch": 9.27776, + "grad_norm": 0.5404537320137024, + "learning_rate": 2.285637367074976e-05, + "loss": 3.2573, + "step": 7255 + }, + { + "epoch": 9.27904, + "grad_norm": 0.5330450534820557, + "learning_rate": 2.2815991385112395e-05, + "loss": 3.1843, + "step": 7256 + }, + { + "epoch": 9.28032, + "grad_norm": 0.5430814027786255, + "learning_rate": 2.2775609099475028e-05, + "loss": 3.1601, + "step": 7257 + }, + { + "epoch": 9.2816, + "grad_norm": 0.5439402461051941, + "learning_rate": 2.273522681383766e-05, + "loss": 3.2283, + "step": 7258 + }, + { + "epoch": 9.28288, + "grad_norm": 0.5354278087615967, + "learning_rate": 2.2694844528200295e-05, + "loss": 3.1896, + "step": 7259 + }, + { + "epoch": 9.28416, + "grad_norm": 0.5329548120498657, + "learning_rate": 2.265446224256293e-05, + "loss": 3.278, + "step": 7260 + }, + { + "epoch": 9.28544, + "grad_norm": 0.5459061861038208, + "learning_rate": 2.261407995692556e-05, + "loss": 3.2793, + "step": 7261 + }, + { + "epoch": 9.28672, + "grad_norm": 0.5458545684814453, + "learning_rate": 2.2573697671288192e-05, + "loss": 3.2462, + "step": 7262 + }, + { + "epoch": 9.288, + "grad_norm": 0.5448905229568481, + "learning_rate": 2.2533315385650826e-05, + "loss": 3.1968, + "step": 7263 + }, + { + "epoch": 9.28928, + "grad_norm": 0.5559900403022766, + "learning_rate": 2.249293310001346e-05, + "loss": 3.21, + "step": 7264 + }, + { + "epoch": 9.29056, + "grad_norm": 0.5367174744606018, + "learning_rate": 2.2452550814376092e-05, + "loss": 3.137, + "step": 7265 + }, + { + "epoch": 9.29184, + "grad_norm": 0.5573728084564209, + "learning_rate": 2.2412168528738726e-05, + "loss": 3.2597, + "step": 7266 + }, + { + "epoch": 9.29312, + "grad_norm": 0.5518680214881897, + "learning_rate": 2.2371786243101356e-05, + "loss": 3.3036, + "step": 7267 + }, + { + "epoch": 9.2944, + "grad_norm": 0.5395819544792175, + "learning_rate": 2.233140395746399e-05, + "loss": 3.2503, + "step": 7268 + }, + { + "epoch": 9.29568, + "grad_norm": 0.528838038444519, + "learning_rate": 2.2291021671826623e-05, + "loss": 3.2374, + "step": 7269 + }, + { + "epoch": 9.29696, + "grad_norm": 0.5213932394981384, + "learning_rate": 2.2250639386189256e-05, + "loss": 3.2252, + "step": 7270 + }, + { + "epoch": 9.29824, + "grad_norm": 0.5346904993057251, + "learning_rate": 2.221025710055189e-05, + "loss": 3.2775, + "step": 7271 + }, + { + "epoch": 9.29952, + "grad_norm": 0.5518351197242737, + "learning_rate": 2.2169874814914523e-05, + "loss": 3.2446, + "step": 7272 + }, + { + "epoch": 9.3008, + "grad_norm": 0.5426953434944153, + "learning_rate": 2.2129492529277157e-05, + "loss": 3.2718, + "step": 7273 + }, + { + "epoch": 9.30208, + "grad_norm": 0.5330793261528015, + "learning_rate": 2.2089110243639787e-05, + "loss": 3.2078, + "step": 7274 + }, + { + "epoch": 9.30336, + "grad_norm": 0.5355795621871948, + "learning_rate": 2.204872795800242e-05, + "loss": 3.2268, + "step": 7275 + }, + { + "epoch": 9.30464, + "grad_norm": 0.5403215885162354, + "learning_rate": 2.2008345672365054e-05, + "loss": 3.2271, + "step": 7276 + }, + { + "epoch": 9.30592, + "grad_norm": 0.5434122085571289, + "learning_rate": 2.1967963386727687e-05, + "loss": 3.2366, + "step": 7277 + }, + { + "epoch": 9.3072, + "grad_norm": 0.5365995168685913, + "learning_rate": 2.192758110109032e-05, + "loss": 3.2595, + "step": 7278 + }, + { + "epoch": 9.30848, + "grad_norm": 0.537118136882782, + "learning_rate": 2.1887198815452954e-05, + "loss": 3.2239, + "step": 7279 + }, + { + "epoch": 9.30976, + "grad_norm": 0.5411548614501953, + "learning_rate": 2.1846816529815588e-05, + "loss": 3.3102, + "step": 7280 + }, + { + "epoch": 9.31104, + "grad_norm": 0.5356780290603638, + "learning_rate": 2.1806434244178218e-05, + "loss": 3.3315, + "step": 7281 + }, + { + "epoch": 9.31232, + "grad_norm": 0.5491350889205933, + "learning_rate": 2.176605195854085e-05, + "loss": 3.3068, + "step": 7282 + }, + { + "epoch": 9.3136, + "grad_norm": 0.5464149713516235, + "learning_rate": 2.1725669672903485e-05, + "loss": 3.2792, + "step": 7283 + }, + { + "epoch": 9.31488, + "grad_norm": 0.5239377617835999, + "learning_rate": 2.1685287387266118e-05, + "loss": 3.1874, + "step": 7284 + }, + { + "epoch": 9.31616, + "grad_norm": 0.5476245284080505, + "learning_rate": 2.164490510162875e-05, + "loss": 3.2341, + "step": 7285 + }, + { + "epoch": 9.31744, + "grad_norm": 0.546082079410553, + "learning_rate": 2.1604522815991385e-05, + "loss": 3.237, + "step": 7286 + }, + { + "epoch": 9.31872, + "grad_norm": 0.5425671339035034, + "learning_rate": 2.1564140530354015e-05, + "loss": 3.2401, + "step": 7287 + }, + { + "epoch": 9.32, + "grad_norm": 0.5253366827964783, + "learning_rate": 2.152375824471665e-05, + "loss": 3.2288, + "step": 7288 + }, + { + "epoch": 9.32128, + "grad_norm": 0.5394009351730347, + "learning_rate": 2.1483375959079282e-05, + "loss": 3.2209, + "step": 7289 + }, + { + "epoch": 9.32256, + "grad_norm": 0.5472865104675293, + "learning_rate": 2.1442993673441915e-05, + "loss": 3.2802, + "step": 7290 + }, + { + "epoch": 9.32384, + "grad_norm": 0.5333254933357239, + "learning_rate": 2.140261138780455e-05, + "loss": 3.2069, + "step": 7291 + }, + { + "epoch": 9.32512, + "grad_norm": 0.5474021434783936, + "learning_rate": 2.1362229102167182e-05, + "loss": 3.2606, + "step": 7292 + }, + { + "epoch": 9.3264, + "grad_norm": 0.5382992625236511, + "learning_rate": 2.1321846816529816e-05, + "loss": 3.2327, + "step": 7293 + }, + { + "epoch": 9.32768, + "grad_norm": 0.533406674861908, + "learning_rate": 2.1281464530892446e-05, + "loss": 3.2464, + "step": 7294 + }, + { + "epoch": 9.32896, + "grad_norm": 0.5368064641952515, + "learning_rate": 2.124108224525508e-05, + "loss": 3.2933, + "step": 7295 + }, + { + "epoch": 9.33024, + "grad_norm": 0.5427320599555969, + "learning_rate": 2.1200699959617713e-05, + "loss": 3.2846, + "step": 7296 + }, + { + "epoch": 9.33152, + "grad_norm": 0.5371754765510559, + "learning_rate": 2.1160317673980346e-05, + "loss": 3.3316, + "step": 7297 + }, + { + "epoch": 9.3328, + "grad_norm": 0.5448208451271057, + "learning_rate": 2.111993538834298e-05, + "loss": 3.2638, + "step": 7298 + }, + { + "epoch": 9.33408, + "grad_norm": 0.5369369983673096, + "learning_rate": 2.1079553102705613e-05, + "loss": 3.172, + "step": 7299 + }, + { + "epoch": 9.33536, + "grad_norm": 0.5618038773536682, + "learning_rate": 2.1039170817068247e-05, + "loss": 3.2398, + "step": 7300 + }, + { + "epoch": 9.33664, + "grad_norm": 0.5481551289558411, + "learning_rate": 2.0998788531430877e-05, + "loss": 3.25, + "step": 7301 + }, + { + "epoch": 9.33792, + "grad_norm": 0.5384693741798401, + "learning_rate": 2.095840624579351e-05, + "loss": 3.2027, + "step": 7302 + }, + { + "epoch": 9.3392, + "grad_norm": 0.5455001592636108, + "learning_rate": 2.0918023960156144e-05, + "loss": 3.2312, + "step": 7303 + }, + { + "epoch": 9.34048, + "grad_norm": 0.5411433577537537, + "learning_rate": 2.0877641674518777e-05, + "loss": 3.2789, + "step": 7304 + }, + { + "epoch": 9.34176, + "grad_norm": 0.5493059158325195, + "learning_rate": 2.083725938888141e-05, + "loss": 3.2225, + "step": 7305 + }, + { + "epoch": 9.34304, + "grad_norm": 0.5578423738479614, + "learning_rate": 2.0796877103244044e-05, + "loss": 3.2215, + "step": 7306 + }, + { + "epoch": 9.34432, + "grad_norm": 0.5362764596939087, + "learning_rate": 2.0756494817606678e-05, + "loss": 3.1921, + "step": 7307 + }, + { + "epoch": 9.3456, + "grad_norm": 0.5485124588012695, + "learning_rate": 2.0716112531969308e-05, + "loss": 3.2242, + "step": 7308 + }, + { + "epoch": 9.34688, + "grad_norm": 0.5391074419021606, + "learning_rate": 2.067573024633194e-05, + "loss": 3.2832, + "step": 7309 + }, + { + "epoch": 9.34816, + "grad_norm": 0.5347970128059387, + "learning_rate": 2.0635347960694575e-05, + "loss": 3.2889, + "step": 7310 + }, + { + "epoch": 9.34944, + "grad_norm": 0.5423051714897156, + "learning_rate": 2.0594965675057208e-05, + "loss": 3.2891, + "step": 7311 + }, + { + "epoch": 9.35072, + "grad_norm": 0.5293038487434387, + "learning_rate": 2.055458338941984e-05, + "loss": 3.224, + "step": 7312 + }, + { + "epoch": 9.352, + "grad_norm": 0.5548036098480225, + "learning_rate": 2.0514201103782475e-05, + "loss": 3.2699, + "step": 7313 + }, + { + "epoch": 9.35328, + "grad_norm": 0.534817636013031, + "learning_rate": 2.04738188181451e-05, + "loss": 3.1907, + "step": 7314 + }, + { + "epoch": 9.35456, + "grad_norm": 0.5452345013618469, + "learning_rate": 2.0433436532507735e-05, + "loss": 3.2226, + "step": 7315 + }, + { + "epoch": 9.35584, + "grad_norm": 0.5356956124305725, + "learning_rate": 2.0393054246870372e-05, + "loss": 3.2319, + "step": 7316 + }, + { + "epoch": 9.35712, + "grad_norm": 0.5347426533699036, + "learning_rate": 2.0352671961233005e-05, + "loss": 3.2123, + "step": 7317 + }, + { + "epoch": 9.3584, + "grad_norm": 0.533959686756134, + "learning_rate": 2.031228967559564e-05, + "loss": 3.213, + "step": 7318 + }, + { + "epoch": 9.35968, + "grad_norm": 0.549071729183197, + "learning_rate": 2.0271907389958272e-05, + "loss": 3.3334, + "step": 7319 + }, + { + "epoch": 9.36096, + "grad_norm": 0.5512160658836365, + "learning_rate": 2.0231525104320906e-05, + "loss": 3.2198, + "step": 7320 + }, + { + "epoch": 9.36224, + "grad_norm": 0.5341069102287292, + "learning_rate": 2.0191142818683533e-05, + "loss": 3.1991, + "step": 7321 + }, + { + "epoch": 9.36352, + "grad_norm": 0.536259114742279, + "learning_rate": 2.0150760533046166e-05, + "loss": 3.2656, + "step": 7322 + }, + { + "epoch": 9.3648, + "grad_norm": 0.5421985387802124, + "learning_rate": 2.01103782474088e-05, + "loss": 3.1539, + "step": 7323 + }, + { + "epoch": 9.36608, + "grad_norm": 0.5535771250724792, + "learning_rate": 2.0069995961771433e-05, + "loss": 3.2589, + "step": 7324 + }, + { + "epoch": 9.36736, + "grad_norm": 0.5412119626998901, + "learning_rate": 2.002961367613407e-05, + "loss": 3.3689, + "step": 7325 + }, + { + "epoch": 9.36864, + "grad_norm": 0.5178043842315674, + "learning_rate": 1.9989231390496703e-05, + "loss": 3.177, + "step": 7326 + }, + { + "epoch": 9.36992, + "grad_norm": 0.5386024117469788, + "learning_rate": 1.9948849104859337e-05, + "loss": 3.23, + "step": 7327 + }, + { + "epoch": 9.3712, + "grad_norm": 0.5399037599563599, + "learning_rate": 1.9908466819221963e-05, + "loss": 3.274, + "step": 7328 + }, + { + "epoch": 9.37248, + "grad_norm": 0.5423693060874939, + "learning_rate": 1.9868084533584597e-05, + "loss": 3.1935, + "step": 7329 + }, + { + "epoch": 9.37376, + "grad_norm": 0.5459950566291809, + "learning_rate": 1.982770224794723e-05, + "loss": 3.1973, + "step": 7330 + }, + { + "epoch": 9.37504, + "grad_norm": 0.5486062169075012, + "learning_rate": 1.9787319962309864e-05, + "loss": 3.1877, + "step": 7331 + }, + { + "epoch": 9.37632, + "grad_norm": 0.552658200263977, + "learning_rate": 1.9746937676672497e-05, + "loss": 3.2007, + "step": 7332 + }, + { + "epoch": 9.3776, + "grad_norm": 0.5424798130989075, + "learning_rate": 1.970655539103513e-05, + "loss": 3.2671, + "step": 7333 + }, + { + "epoch": 9.37888, + "grad_norm": 0.5408488512039185, + "learning_rate": 1.9666173105397768e-05, + "loss": 3.222, + "step": 7334 + }, + { + "epoch": 9.38016, + "grad_norm": 0.537208080291748, + "learning_rate": 1.9625790819760394e-05, + "loss": 3.3263, + "step": 7335 + }, + { + "epoch": 9.38144, + "grad_norm": 0.5427209734916687, + "learning_rate": 1.9585408534123028e-05, + "loss": 3.2509, + "step": 7336 + }, + { + "epoch": 9.38272, + "grad_norm": 0.5269715189933777, + "learning_rate": 1.954502624848566e-05, + "loss": 3.1778, + "step": 7337 + }, + { + "epoch": 9.384, + "grad_norm": 0.5249371528625488, + "learning_rate": 1.9504643962848295e-05, + "loss": 3.1646, + "step": 7338 + }, + { + "epoch": 9.38528, + "grad_norm": 0.5350185632705688, + "learning_rate": 1.9464261677210928e-05, + "loss": 3.1933, + "step": 7339 + }, + { + "epoch": 9.38656, + "grad_norm": 0.5533794164657593, + "learning_rate": 1.942387939157356e-05, + "loss": 3.3036, + "step": 7340 + }, + { + "epoch": 9.38784, + "grad_norm": 0.537314236164093, + "learning_rate": 1.938349710593619e-05, + "loss": 3.266, + "step": 7341 + }, + { + "epoch": 9.38912, + "grad_norm": 0.5408215522766113, + "learning_rate": 1.9343114820298825e-05, + "loss": 3.2319, + "step": 7342 + }, + { + "epoch": 9.3904, + "grad_norm": 0.5351347923278809, + "learning_rate": 1.930273253466146e-05, + "loss": 3.2069, + "step": 7343 + }, + { + "epoch": 9.39168, + "grad_norm": 0.56521075963974, + "learning_rate": 1.9262350249024092e-05, + "loss": 3.2888, + "step": 7344 + }, + { + "epoch": 9.39296, + "grad_norm": 0.5516117811203003, + "learning_rate": 1.9221967963386725e-05, + "loss": 3.2223, + "step": 7345 + }, + { + "epoch": 9.39424, + "grad_norm": 0.5425495505332947, + "learning_rate": 1.918158567774936e-05, + "loss": 3.2373, + "step": 7346 + }, + { + "epoch": 9.39552, + "grad_norm": 0.5260109901428223, + "learning_rate": 1.9141203392111992e-05, + "loss": 3.2283, + "step": 7347 + }, + { + "epoch": 9.3968, + "grad_norm": 0.530563473701477, + "learning_rate": 1.9100821106474622e-05, + "loss": 3.2014, + "step": 7348 + }, + { + "epoch": 9.39808, + "grad_norm": 0.5480346083641052, + "learning_rate": 1.9060438820837256e-05, + "loss": 3.3066, + "step": 7349 + }, + { + "epoch": 9.39936, + "grad_norm": 0.5326750874519348, + "learning_rate": 1.902005653519989e-05, + "loss": 3.2109, + "step": 7350 + }, + { + "epoch": 9.40064, + "grad_norm": 0.5301714539527893, + "learning_rate": 1.8979674249562523e-05, + "loss": 3.2464, + "step": 7351 + }, + { + "epoch": 9.40192, + "grad_norm": 0.5453739166259766, + "learning_rate": 1.8939291963925156e-05, + "loss": 3.2734, + "step": 7352 + }, + { + "epoch": 9.4032, + "grad_norm": 0.5450626611709595, + "learning_rate": 1.889890967828779e-05, + "loss": 3.2435, + "step": 7353 + }, + { + "epoch": 9.40448, + "grad_norm": 0.5502852201461792, + "learning_rate": 1.8858527392650423e-05, + "loss": 3.3081, + "step": 7354 + }, + { + "epoch": 9.40576, + "grad_norm": 0.5369656682014465, + "learning_rate": 1.8818145107013053e-05, + "loss": 3.2784, + "step": 7355 + }, + { + "epoch": 9.40704, + "grad_norm": 0.5568560361862183, + "learning_rate": 1.8777762821375687e-05, + "loss": 3.3034, + "step": 7356 + }, + { + "epoch": 9.40832, + "grad_norm": 0.541373074054718, + "learning_rate": 1.873738053573832e-05, + "loss": 3.2594, + "step": 7357 + }, + { + "epoch": 9.4096, + "grad_norm": 0.5414525866508484, + "learning_rate": 1.8696998250100954e-05, + "loss": 3.2231, + "step": 7358 + }, + { + "epoch": 9.41088, + "grad_norm": 0.535603940486908, + "learning_rate": 1.8656615964463587e-05, + "loss": 3.2663, + "step": 7359 + }, + { + "epoch": 9.41216, + "grad_norm": 0.5469928979873657, + "learning_rate": 1.861623367882622e-05, + "loss": 3.2296, + "step": 7360 + }, + { + "epoch": 9.41344, + "grad_norm": 0.5573539733886719, + "learning_rate": 1.8575851393188854e-05, + "loss": 3.2841, + "step": 7361 + }, + { + "epoch": 9.414719999999999, + "grad_norm": 0.5386171340942383, + "learning_rate": 1.8535469107551488e-05, + "loss": 3.2654, + "step": 7362 + }, + { + "epoch": 9.416, + "grad_norm": 0.5473566651344299, + "learning_rate": 1.8495086821914118e-05, + "loss": 3.206, + "step": 7363 + }, + { + "epoch": 9.41728, + "grad_norm": 0.54440838098526, + "learning_rate": 1.845470453627675e-05, + "loss": 3.2718, + "step": 7364 + }, + { + "epoch": 9.41856, + "grad_norm": 0.5463802814483643, + "learning_rate": 1.8414322250639385e-05, + "loss": 3.268, + "step": 7365 + }, + { + "epoch": 9.41984, + "grad_norm": 0.5378354787826538, + "learning_rate": 1.8373939965002018e-05, + "loss": 3.2652, + "step": 7366 + }, + { + "epoch": 9.42112, + "grad_norm": 0.5412339568138123, + "learning_rate": 1.833355767936465e-05, + "loss": 3.1854, + "step": 7367 + }, + { + "epoch": 9.4224, + "grad_norm": 0.5401472449302673, + "learning_rate": 1.8293175393727285e-05, + "loss": 3.3367, + "step": 7368 + }, + { + "epoch": 9.42368, + "grad_norm": 0.5603813529014587, + "learning_rate": 1.825279310808992e-05, + "loss": 3.2289, + "step": 7369 + }, + { + "epoch": 9.42496, + "grad_norm": 0.5412847399711609, + "learning_rate": 1.821241082245255e-05, + "loss": 3.2606, + "step": 7370 + }, + { + "epoch": 9.42624, + "grad_norm": 0.5373712778091431, + "learning_rate": 1.8172028536815182e-05, + "loss": 3.2097, + "step": 7371 + }, + { + "epoch": 9.42752, + "grad_norm": 0.5573616623878479, + "learning_rate": 1.8131646251177815e-05, + "loss": 3.2397, + "step": 7372 + }, + { + "epoch": 9.4288, + "grad_norm": 0.5405897498130798, + "learning_rate": 1.8091263965540446e-05, + "loss": 3.259, + "step": 7373 + }, + { + "epoch": 9.43008, + "grad_norm": 0.5547781586647034, + "learning_rate": 1.8050881679903082e-05, + "loss": 3.247, + "step": 7374 + }, + { + "epoch": 9.43136, + "grad_norm": 0.5367437601089478, + "learning_rate": 1.8010499394265716e-05, + "loss": 3.25, + "step": 7375 + }, + { + "epoch": 9.43264, + "grad_norm": 0.5429502725601196, + "learning_rate": 1.7970117108628346e-05, + "loss": 3.2392, + "step": 7376 + }, + { + "epoch": 9.43392, + "grad_norm": 0.5472865700721741, + "learning_rate": 1.792973482299098e-05, + "loss": 3.2329, + "step": 7377 + }, + { + "epoch": 9.4352, + "grad_norm": 0.542381227016449, + "learning_rate": 1.7889352537353613e-05, + "loss": 3.2845, + "step": 7378 + }, + { + "epoch": 9.43648, + "grad_norm": 0.5476444959640503, + "learning_rate": 1.7848970251716246e-05, + "loss": 3.2889, + "step": 7379 + }, + { + "epoch": 9.43776, + "grad_norm": 0.5548306107521057, + "learning_rate": 1.7808587966078876e-05, + "loss": 3.2567, + "step": 7380 + }, + { + "epoch": 9.43904, + "grad_norm": 0.5338513851165771, + "learning_rate": 1.776820568044151e-05, + "loss": 3.1739, + "step": 7381 + }, + { + "epoch": 9.44032, + "grad_norm": 0.5605148673057556, + "learning_rate": 1.7727823394804143e-05, + "loss": 3.2783, + "step": 7382 + }, + { + "epoch": 9.4416, + "grad_norm": 0.5437026619911194, + "learning_rate": 1.7687441109166777e-05, + "loss": 3.1916, + "step": 7383 + }, + { + "epoch": 9.44288, + "grad_norm": 0.5323542952537537, + "learning_rate": 1.764705882352941e-05, + "loss": 3.2599, + "step": 7384 + }, + { + "epoch": 9.44416, + "grad_norm": 0.5453789830207825, + "learning_rate": 1.7606676537892044e-05, + "loss": 3.2619, + "step": 7385 + }, + { + "epoch": 9.44544, + "grad_norm": 0.5387235283851624, + "learning_rate": 1.7566294252254677e-05, + "loss": 3.2936, + "step": 7386 + }, + { + "epoch": 9.44672, + "grad_norm": 0.5475628972053528, + "learning_rate": 1.7525911966617307e-05, + "loss": 3.244, + "step": 7387 + }, + { + "epoch": 9.448, + "grad_norm": 0.5442694425582886, + "learning_rate": 1.748552968097994e-05, + "loss": 3.2876, + "step": 7388 + }, + { + "epoch": 9.44928, + "grad_norm": 0.5453842282295227, + "learning_rate": 1.7445147395342574e-05, + "loss": 3.2533, + "step": 7389 + }, + { + "epoch": 9.45056, + "grad_norm": 0.5407310128211975, + "learning_rate": 1.7404765109705208e-05, + "loss": 3.2382, + "step": 7390 + }, + { + "epoch": 9.45184, + "grad_norm": 0.531166672706604, + "learning_rate": 1.736438282406784e-05, + "loss": 3.2376, + "step": 7391 + }, + { + "epoch": 9.45312, + "grad_norm": 0.5313123464584351, + "learning_rate": 1.7324000538430475e-05, + "loss": 3.2362, + "step": 7392 + }, + { + "epoch": 9.4544, + "grad_norm": 0.5363158583641052, + "learning_rate": 1.7283618252793105e-05, + "loss": 3.2088, + "step": 7393 + }, + { + "epoch": 9.45568, + "grad_norm": 0.5450273752212524, + "learning_rate": 1.7243235967155738e-05, + "loss": 3.26, + "step": 7394 + }, + { + "epoch": 9.45696, + "grad_norm": 0.57196044921875, + "learning_rate": 1.720285368151837e-05, + "loss": 3.2556, + "step": 7395 + }, + { + "epoch": 9.45824, + "grad_norm": 0.5576905012130737, + "learning_rate": 1.7162471395881005e-05, + "loss": 3.2441, + "step": 7396 + }, + { + "epoch": 9.45952, + "grad_norm": 0.5438629388809204, + "learning_rate": 1.712208911024364e-05, + "loss": 3.1963, + "step": 7397 + }, + { + "epoch": 9.4608, + "grad_norm": 0.5366012454032898, + "learning_rate": 1.7081706824606272e-05, + "loss": 3.2966, + "step": 7398 + }, + { + "epoch": 9.46208, + "grad_norm": 0.5356854796409607, + "learning_rate": 1.7041324538968905e-05, + "loss": 3.22, + "step": 7399 + }, + { + "epoch": 9.46336, + "grad_norm": 0.5522456169128418, + "learning_rate": 1.7000942253331535e-05, + "loss": 3.2367, + "step": 7400 + }, + { + "epoch": 9.46464, + "grad_norm": 0.5475519299507141, + "learning_rate": 1.696055996769417e-05, + "loss": 3.2766, + "step": 7401 + }, + { + "epoch": 9.46592, + "grad_norm": 0.5508553385734558, + "learning_rate": 1.6920177682056802e-05, + "loss": 3.2885, + "step": 7402 + }, + { + "epoch": 9.4672, + "grad_norm": 0.5404077172279358, + "learning_rate": 1.6879795396419436e-05, + "loss": 3.2627, + "step": 7403 + }, + { + "epoch": 9.46848, + "grad_norm": 0.5281044840812683, + "learning_rate": 1.683941311078207e-05, + "loss": 3.2113, + "step": 7404 + }, + { + "epoch": 9.46976, + "grad_norm": 0.5426114201545715, + "learning_rate": 1.6799030825144703e-05, + "loss": 3.2066, + "step": 7405 + }, + { + "epoch": 9.47104, + "grad_norm": 0.5397725701332092, + "learning_rate": 1.6758648539507336e-05, + "loss": 3.2879, + "step": 7406 + }, + { + "epoch": 9.47232, + "grad_norm": 0.5343126058578491, + "learning_rate": 1.6718266253869966e-05, + "loss": 3.21, + "step": 7407 + }, + { + "epoch": 9.4736, + "grad_norm": 0.527040421962738, + "learning_rate": 1.66778839682326e-05, + "loss": 3.2397, + "step": 7408 + }, + { + "epoch": 9.47488, + "grad_norm": 0.5362902879714966, + "learning_rate": 1.6637501682595233e-05, + "loss": 3.2548, + "step": 7409 + }, + { + "epoch": 9.47616, + "grad_norm": 0.5329915285110474, + "learning_rate": 1.6597119396957867e-05, + "loss": 3.2235, + "step": 7410 + }, + { + "epoch": 9.47744, + "grad_norm": 0.5368509292602539, + "learning_rate": 1.65567371113205e-05, + "loss": 3.1814, + "step": 7411 + }, + { + "epoch": 9.47872, + "grad_norm": 0.5509939193725586, + "learning_rate": 1.6516354825683134e-05, + "loss": 3.2374, + "step": 7412 + }, + { + "epoch": 9.48, + "grad_norm": 0.5431427359580994, + "learning_rate": 1.6475972540045764e-05, + "loss": 3.1822, + "step": 7413 + }, + { + "epoch": 9.48128, + "grad_norm": 0.5423570275306702, + "learning_rate": 1.6435590254408397e-05, + "loss": 3.2381, + "step": 7414 + }, + { + "epoch": 9.48256, + "grad_norm": 0.5203372240066528, + "learning_rate": 1.639520796877103e-05, + "loss": 3.238, + "step": 7415 + }, + { + "epoch": 9.48384, + "grad_norm": 0.5459218621253967, + "learning_rate": 1.6354825683133664e-05, + "loss": 3.2343, + "step": 7416 + }, + { + "epoch": 9.48512, + "grad_norm": 0.5473526120185852, + "learning_rate": 1.6314443397496298e-05, + "loss": 3.263, + "step": 7417 + }, + { + "epoch": 9.4864, + "grad_norm": 0.5523941516876221, + "learning_rate": 1.627406111185893e-05, + "loss": 3.2685, + "step": 7418 + }, + { + "epoch": 9.48768, + "grad_norm": 0.541957437992096, + "learning_rate": 1.6233678826221565e-05, + "loss": 3.3066, + "step": 7419 + }, + { + "epoch": 9.48896, + "grad_norm": 0.5321203470230103, + "learning_rate": 1.6193296540584195e-05, + "loss": 3.2308, + "step": 7420 + }, + { + "epoch": 9.49024, + "grad_norm": 0.5349709391593933, + "learning_rate": 1.6152914254946828e-05, + "loss": 3.2747, + "step": 7421 + }, + { + "epoch": 9.49152, + "grad_norm": 0.5336790084838867, + "learning_rate": 1.611253196930946e-05, + "loss": 3.2135, + "step": 7422 + }, + { + "epoch": 9.4928, + "grad_norm": 0.551705539226532, + "learning_rate": 1.6072149683672095e-05, + "loss": 3.2396, + "step": 7423 + }, + { + "epoch": 9.49408, + "grad_norm": 0.547311007976532, + "learning_rate": 1.603176739803473e-05, + "loss": 3.3115, + "step": 7424 + }, + { + "epoch": 9.49536, + "grad_norm": 0.558070182800293, + "learning_rate": 1.5991385112397362e-05, + "loss": 3.2538, + "step": 7425 + }, + { + "epoch": 9.49664, + "grad_norm": 0.5429165363311768, + "learning_rate": 1.5951002826759995e-05, + "loss": 3.1978, + "step": 7426 + }, + { + "epoch": 9.49792, + "grad_norm": 0.5304774045944214, + "learning_rate": 1.5910620541122625e-05, + "loss": 3.2507, + "step": 7427 + }, + { + "epoch": 9.4992, + "grad_norm": 0.5391626358032227, + "learning_rate": 1.587023825548526e-05, + "loss": 3.2713, + "step": 7428 + }, + { + "epoch": 9.50048, + "grad_norm": 0.5373072028160095, + "learning_rate": 1.5829855969847892e-05, + "loss": 3.2066, + "step": 7429 + }, + { + "epoch": 9.50176, + "grad_norm": 0.52620929479599, + "learning_rate": 1.5789473684210522e-05, + "loss": 3.2278, + "step": 7430 + }, + { + "epoch": 9.50304, + "grad_norm": 0.5313057899475098, + "learning_rate": 1.574909139857316e-05, + "loss": 3.2863, + "step": 7431 + }, + { + "epoch": 9.50432, + "grad_norm": 0.555780291557312, + "learning_rate": 1.5708709112935793e-05, + "loss": 3.3081, + "step": 7432 + }, + { + "epoch": 9.5056, + "grad_norm": 0.5542112588882446, + "learning_rate": 1.5668326827298426e-05, + "loss": 3.2342, + "step": 7433 + }, + { + "epoch": 9.50688, + "grad_norm": 0.5538204312324524, + "learning_rate": 1.5627944541661056e-05, + "loss": 3.2699, + "step": 7434 + }, + { + "epoch": 9.50816, + "grad_norm": 0.5440041422843933, + "learning_rate": 1.558756225602369e-05, + "loss": 3.195, + "step": 7435 + }, + { + "epoch": 9.50944, + "grad_norm": 0.529220700263977, + "learning_rate": 1.5547179970386323e-05, + "loss": 3.2317, + "step": 7436 + }, + { + "epoch": 9.51072, + "grad_norm": 0.5430290102958679, + "learning_rate": 1.5506797684748953e-05, + "loss": 3.2327, + "step": 7437 + }, + { + "epoch": 9.512, + "grad_norm": 0.5630149245262146, + "learning_rate": 1.5466415399111587e-05, + "loss": 3.3014, + "step": 7438 + }, + { + "epoch": 9.51328, + "grad_norm": 0.5409126877784729, + "learning_rate": 1.542603311347422e-05, + "loss": 3.2794, + "step": 7439 + }, + { + "epoch": 9.51456, + "grad_norm": 0.5274693369865417, + "learning_rate": 1.5385650827836854e-05, + "loss": 3.2504, + "step": 7440 + }, + { + "epoch": 9.51584, + "grad_norm": 0.5387745499610901, + "learning_rate": 1.5345268542199487e-05, + "loss": 3.1959, + "step": 7441 + }, + { + "epoch": 9.51712, + "grad_norm": 0.5413409471511841, + "learning_rate": 1.530488625656212e-05, + "loss": 3.208, + "step": 7442 + }, + { + "epoch": 9.5184, + "grad_norm": 0.5244672298431396, + "learning_rate": 1.5264503970924754e-05, + "loss": 3.1847, + "step": 7443 + }, + { + "epoch": 9.51968, + "grad_norm": 0.5482396483421326, + "learning_rate": 1.5224121685287386e-05, + "loss": 3.2254, + "step": 7444 + }, + { + "epoch": 9.52096, + "grad_norm": 0.5391595363616943, + "learning_rate": 1.518373939965002e-05, + "loss": 3.174, + "step": 7445 + }, + { + "epoch": 9.52224, + "grad_norm": 0.5382583737373352, + "learning_rate": 1.5143357114012653e-05, + "loss": 3.2514, + "step": 7446 + }, + { + "epoch": 9.52352, + "grad_norm": 0.5451508164405823, + "learning_rate": 1.5102974828375285e-05, + "loss": 3.3266, + "step": 7447 + }, + { + "epoch": 9.5248, + "grad_norm": 0.5408745408058167, + "learning_rate": 1.5062592542737918e-05, + "loss": 3.2239, + "step": 7448 + }, + { + "epoch": 9.52608, + "grad_norm": 0.5346300601959229, + "learning_rate": 1.5022210257100551e-05, + "loss": 3.1678, + "step": 7449 + }, + { + "epoch": 9.52736, + "grad_norm": 0.54719078540802, + "learning_rate": 1.4981827971463183e-05, + "loss": 3.3131, + "step": 7450 + }, + { + "epoch": 9.52864, + "grad_norm": 0.5220864415168762, + "learning_rate": 1.4941445685825817e-05, + "loss": 3.2922, + "step": 7451 + }, + { + "epoch": 9.52992, + "grad_norm": 0.538643479347229, + "learning_rate": 1.490106340018845e-05, + "loss": 3.2359, + "step": 7452 + }, + { + "epoch": 9.5312, + "grad_norm": 0.5405166745185852, + "learning_rate": 1.4860681114551084e-05, + "loss": 3.1694, + "step": 7453 + }, + { + "epoch": 9.53248, + "grad_norm": 0.5491073131561279, + "learning_rate": 1.4820298828913715e-05, + "loss": 3.2585, + "step": 7454 + }, + { + "epoch": 9.533760000000001, + "grad_norm": 0.5413920879364014, + "learning_rate": 1.4779916543276349e-05, + "loss": 3.1619, + "step": 7455 + }, + { + "epoch": 9.53504, + "grad_norm": 0.5279734134674072, + "learning_rate": 1.4739534257638982e-05, + "loss": 3.1632, + "step": 7456 + }, + { + "epoch": 9.53632, + "grad_norm": 0.5433555245399475, + "learning_rate": 1.4699151972001612e-05, + "loss": 3.2692, + "step": 7457 + }, + { + "epoch": 9.5376, + "grad_norm": 0.5342807173728943, + "learning_rate": 1.4658769686364248e-05, + "loss": 3.2832, + "step": 7458 + }, + { + "epoch": 9.53888, + "grad_norm": 0.5404999256134033, + "learning_rate": 1.4618387400726881e-05, + "loss": 3.2091, + "step": 7459 + }, + { + "epoch": 9.54016, + "grad_norm": 0.5424739718437195, + "learning_rate": 1.4578005115089511e-05, + "loss": 3.192, + "step": 7460 + }, + { + "epoch": 9.54144, + "grad_norm": 0.5365021228790283, + "learning_rate": 1.4537622829452145e-05, + "loss": 3.2372, + "step": 7461 + }, + { + "epoch": 9.54272, + "grad_norm": 0.5258615016937256, + "learning_rate": 1.4497240543814778e-05, + "loss": 3.2472, + "step": 7462 + }, + { + "epoch": 9.544, + "grad_norm": 0.543051540851593, + "learning_rate": 1.4456858258177413e-05, + "loss": 3.2782, + "step": 7463 + }, + { + "epoch": 9.54528, + "grad_norm": 0.5488424897193909, + "learning_rate": 1.4416475972540043e-05, + "loss": 3.2517, + "step": 7464 + }, + { + "epoch": 9.54656, + "grad_norm": 0.5342796444892883, + "learning_rate": 1.4376093686902677e-05, + "loss": 3.2171, + "step": 7465 + }, + { + "epoch": 9.54784, + "grad_norm": 0.5366568565368652, + "learning_rate": 1.433571140126531e-05, + "loss": 3.2127, + "step": 7466 + }, + { + "epoch": 9.54912, + "grad_norm": 0.5328598022460938, + "learning_rate": 1.4295329115627942e-05, + "loss": 3.1996, + "step": 7467 + }, + { + "epoch": 9.5504, + "grad_norm": 0.5342961549758911, + "learning_rate": 1.4254946829990575e-05, + "loss": 3.2439, + "step": 7468 + }, + { + "epoch": 9.55168, + "grad_norm": 0.5400911569595337, + "learning_rate": 1.4214564544353209e-05, + "loss": 3.2113, + "step": 7469 + }, + { + "epoch": 9.55296, + "grad_norm": 0.536578893661499, + "learning_rate": 1.4174182258715842e-05, + "loss": 3.2135, + "step": 7470 + }, + { + "epoch": 9.55424, + "grad_norm": 0.5431190729141235, + "learning_rate": 1.4133799973078474e-05, + "loss": 3.2889, + "step": 7471 + }, + { + "epoch": 9.55552, + "grad_norm": 0.554277241230011, + "learning_rate": 1.4093417687441108e-05, + "loss": 3.2935, + "step": 7472 + }, + { + "epoch": 9.556799999999999, + "grad_norm": 0.5410692095756531, + "learning_rate": 1.4053035401803741e-05, + "loss": 3.2235, + "step": 7473 + }, + { + "epoch": 9.55808, + "grad_norm": 0.5578665733337402, + "learning_rate": 1.4012653116166373e-05, + "loss": 3.246, + "step": 7474 + }, + { + "epoch": 9.55936, + "grad_norm": 0.5440622568130493, + "learning_rate": 1.3972270830529006e-05, + "loss": 3.2371, + "step": 7475 + }, + { + "epoch": 9.56064, + "grad_norm": 0.5429546236991882, + "learning_rate": 1.393188854489164e-05, + "loss": 3.1835, + "step": 7476 + }, + { + "epoch": 9.56192, + "grad_norm": 0.5361295342445374, + "learning_rate": 1.3891506259254272e-05, + "loss": 3.1896, + "step": 7477 + }, + { + "epoch": 9.5632, + "grad_norm": 0.5314315557479858, + "learning_rate": 1.3851123973616905e-05, + "loss": 3.279, + "step": 7478 + }, + { + "epoch": 9.56448, + "grad_norm": 0.5387744903564453, + "learning_rate": 1.3810741687979538e-05, + "loss": 3.2884, + "step": 7479 + }, + { + "epoch": 9.565760000000001, + "grad_norm": 0.5472334027290344, + "learning_rate": 1.3770359402342172e-05, + "loss": 3.2379, + "step": 7480 + }, + { + "epoch": 9.56704, + "grad_norm": 0.5438302755355835, + "learning_rate": 1.3729977116704804e-05, + "loss": 3.2577, + "step": 7481 + }, + { + "epoch": 9.56832, + "grad_norm": 0.5295417904853821, + "learning_rate": 1.3689594831067437e-05, + "loss": 3.3273, + "step": 7482 + }, + { + "epoch": 9.5696, + "grad_norm": 0.5329756140708923, + "learning_rate": 1.364921254543007e-05, + "loss": 3.1549, + "step": 7483 + }, + { + "epoch": 9.57088, + "grad_norm": 0.5324873328208923, + "learning_rate": 1.3608830259792702e-05, + "loss": 3.245, + "step": 7484 + }, + { + "epoch": 9.57216, + "grad_norm": 0.5475909113883972, + "learning_rate": 1.3568447974155336e-05, + "loss": 3.3146, + "step": 7485 + }, + { + "epoch": 9.57344, + "grad_norm": 0.552155077457428, + "learning_rate": 1.352806568851797e-05, + "loss": 3.22, + "step": 7486 + }, + { + "epoch": 9.57472, + "grad_norm": 0.5365369915962219, + "learning_rate": 1.3487683402880601e-05, + "loss": 3.2338, + "step": 7487 + }, + { + "epoch": 9.576, + "grad_norm": 0.5464675426483154, + "learning_rate": 1.3447301117243235e-05, + "loss": 3.3097, + "step": 7488 + }, + { + "epoch": 9.57728, + "grad_norm": 0.5443135499954224, + "learning_rate": 1.3406918831605868e-05, + "loss": 3.2514, + "step": 7489 + }, + { + "epoch": 9.57856, + "grad_norm": 0.5426671504974365, + "learning_rate": 1.3366536545968501e-05, + "loss": 3.2772, + "step": 7490 + }, + { + "epoch": 9.57984, + "grad_norm": 0.5442038774490356, + "learning_rate": 1.3326154260331133e-05, + "loss": 3.2219, + "step": 7491 + }, + { + "epoch": 9.58112, + "grad_norm": 0.5287876129150391, + "learning_rate": 1.3285771974693767e-05, + "loss": 3.246, + "step": 7492 + }, + { + "epoch": 9.5824, + "grad_norm": 0.5401532649993896, + "learning_rate": 1.32453896890564e-05, + "loss": 3.2587, + "step": 7493 + }, + { + "epoch": 9.58368, + "grad_norm": 0.538408637046814, + "learning_rate": 1.3205007403419032e-05, + "loss": 3.2394, + "step": 7494 + }, + { + "epoch": 9.58496, + "grad_norm": 0.5716665387153625, + "learning_rate": 1.3164625117781665e-05, + "loss": 3.2705, + "step": 7495 + }, + { + "epoch": 9.58624, + "grad_norm": 0.5246546864509583, + "learning_rate": 1.3124242832144299e-05, + "loss": 3.2281, + "step": 7496 + }, + { + "epoch": 9.58752, + "grad_norm": 0.5313910245895386, + "learning_rate": 1.308386054650693e-05, + "loss": 3.2713, + "step": 7497 + }, + { + "epoch": 9.588799999999999, + "grad_norm": 0.5274931788444519, + "learning_rate": 1.3043478260869564e-05, + "loss": 3.2255, + "step": 7498 + }, + { + "epoch": 9.59008, + "grad_norm": 0.5328890085220337, + "learning_rate": 1.3003095975232198e-05, + "loss": 3.2064, + "step": 7499 + }, + { + "epoch": 9.59136, + "grad_norm": 0.5413006544113159, + "learning_rate": 1.2962713689594831e-05, + "loss": 3.2733, + "step": 7500 + }, + { + "epoch": 9.59264, + "grad_norm": 0.5474052429199219, + "learning_rate": 1.2922331403957463e-05, + "loss": 3.2836, + "step": 7501 + }, + { + "epoch": 9.59392, + "grad_norm": 0.5338591933250427, + "learning_rate": 1.2881949118320096e-05, + "loss": 3.2265, + "step": 7502 + }, + { + "epoch": 9.5952, + "grad_norm": 0.533933699131012, + "learning_rate": 1.284156683268273e-05, + "loss": 3.2688, + "step": 7503 + }, + { + "epoch": 9.59648, + "grad_norm": 0.5405238270759583, + "learning_rate": 1.2801184547045361e-05, + "loss": 3.3086, + "step": 7504 + }, + { + "epoch": 9.59776, + "grad_norm": 0.5274151563644409, + "learning_rate": 1.2760802261407995e-05, + "loss": 3.2516, + "step": 7505 + }, + { + "epoch": 9.59904, + "grad_norm": 0.533254861831665, + "learning_rate": 1.2720419975770628e-05, + "loss": 3.2087, + "step": 7506 + }, + { + "epoch": 9.60032, + "grad_norm": 0.5390244722366333, + "learning_rate": 1.268003769013326e-05, + "loss": 3.1609, + "step": 7507 + }, + { + "epoch": 9.6016, + "grad_norm": 0.5491555333137512, + "learning_rate": 1.2639655404495894e-05, + "loss": 3.261, + "step": 7508 + }, + { + "epoch": 9.60288, + "grad_norm": 0.5407535433769226, + "learning_rate": 1.2599273118858527e-05, + "loss": 3.2147, + "step": 7509 + }, + { + "epoch": 9.60416, + "grad_norm": 0.549811065196991, + "learning_rate": 1.255889083322116e-05, + "loss": 3.2559, + "step": 7510 + }, + { + "epoch": 9.60544, + "grad_norm": 0.5414533019065857, + "learning_rate": 1.2518508547583792e-05, + "loss": 3.2521, + "step": 7511 + }, + { + "epoch": 9.60672, + "grad_norm": 0.5452784895896912, + "learning_rate": 1.2478126261946426e-05, + "loss": 3.2758, + "step": 7512 + }, + { + "epoch": 9.608, + "grad_norm": 0.5487382411956787, + "learning_rate": 1.243774397630906e-05, + "loss": 3.2842, + "step": 7513 + }, + { + "epoch": 9.60928, + "grad_norm": 0.5465408563613892, + "learning_rate": 1.239736169067169e-05, + "loss": 3.242, + "step": 7514 + }, + { + "epoch": 9.61056, + "grad_norm": 0.5443041920661926, + "learning_rate": 1.2356979405034323e-05, + "loss": 3.2918, + "step": 7515 + }, + { + "epoch": 9.61184, + "grad_norm": 0.5370469689369202, + "learning_rate": 1.2316597119396958e-05, + "loss": 3.2174, + "step": 7516 + }, + { + "epoch": 9.61312, + "grad_norm": 0.5332541465759277, + "learning_rate": 1.2276214833759591e-05, + "loss": 3.2014, + "step": 7517 + }, + { + "epoch": 9.6144, + "grad_norm": 0.5249244570732117, + "learning_rate": 1.2235832548122221e-05, + "loss": 3.2625, + "step": 7518 + }, + { + "epoch": 9.61568, + "grad_norm": 0.5379579663276672, + "learning_rate": 1.2195450262484855e-05, + "loss": 3.2321, + "step": 7519 + }, + { + "epoch": 9.61696, + "grad_norm": 0.5287061333656311, + "learning_rate": 1.215506797684749e-05, + "loss": 3.2617, + "step": 7520 + }, + { + "epoch": 9.61824, + "grad_norm": 0.5358967185020447, + "learning_rate": 1.211468569121012e-05, + "loss": 3.2692, + "step": 7521 + }, + { + "epoch": 9.61952, + "grad_norm": 0.5491884350776672, + "learning_rate": 1.2074303405572754e-05, + "loss": 3.208, + "step": 7522 + }, + { + "epoch": 9.6208, + "grad_norm": 0.5433226227760315, + "learning_rate": 1.2033921119935387e-05, + "loss": 3.2463, + "step": 7523 + }, + { + "epoch": 9.62208, + "grad_norm": 0.548875093460083, + "learning_rate": 1.1993538834298019e-05, + "loss": 3.2841, + "step": 7524 + }, + { + "epoch": 9.62336, + "grad_norm": 0.5372313261032104, + "learning_rate": 1.1953156548660652e-05, + "loss": 3.2922, + "step": 7525 + }, + { + "epoch": 9.62464, + "grad_norm": 0.5375744700431824, + "learning_rate": 1.1912774263023286e-05, + "loss": 3.2362, + "step": 7526 + }, + { + "epoch": 9.62592, + "grad_norm": 0.5384906530380249, + "learning_rate": 1.187239197738592e-05, + "loss": 3.3354, + "step": 7527 + }, + { + "epoch": 9.6272, + "grad_norm": 0.532680869102478, + "learning_rate": 1.1832009691748551e-05, + "loss": 3.2397, + "step": 7528 + }, + { + "epoch": 9.62848, + "grad_norm": 0.5564149618148804, + "learning_rate": 1.1791627406111185e-05, + "loss": 3.2319, + "step": 7529 + }, + { + "epoch": 9.62976, + "grad_norm": 0.5441488027572632, + "learning_rate": 1.1751245120473818e-05, + "loss": 3.2088, + "step": 7530 + }, + { + "epoch": 9.63104, + "grad_norm": 0.5175425410270691, + "learning_rate": 1.171086283483645e-05, + "loss": 3.2514, + "step": 7531 + }, + { + "epoch": 9.63232, + "grad_norm": 0.5204359292984009, + "learning_rate": 1.1670480549199083e-05, + "loss": 3.2264, + "step": 7532 + }, + { + "epoch": 9.6336, + "grad_norm": 0.5370702743530273, + "learning_rate": 1.1630098263561717e-05, + "loss": 3.2164, + "step": 7533 + }, + { + "epoch": 9.63488, + "grad_norm": 0.562423050403595, + "learning_rate": 1.1589715977924348e-05, + "loss": 3.2344, + "step": 7534 + }, + { + "epoch": 9.63616, + "grad_norm": 0.5344887375831604, + "learning_rate": 1.1549333692286982e-05, + "loss": 3.1734, + "step": 7535 + }, + { + "epoch": 9.63744, + "grad_norm": 0.5403535962104797, + "learning_rate": 1.1508951406649615e-05, + "loss": 3.2056, + "step": 7536 + }, + { + "epoch": 9.63872, + "grad_norm": 0.5409168004989624, + "learning_rate": 1.1468569121012249e-05, + "loss": 3.2126, + "step": 7537 + }, + { + "epoch": 9.64, + "grad_norm": 0.5526571273803711, + "learning_rate": 1.142818683537488e-05, + "loss": 3.2779, + "step": 7538 + }, + { + "epoch": 9.64128, + "grad_norm": 0.5267760753631592, + "learning_rate": 1.1387804549737514e-05, + "loss": 3.2548, + "step": 7539 + }, + { + "epoch": 9.64256, + "grad_norm": 0.5352032780647278, + "learning_rate": 1.1347422264100148e-05, + "loss": 3.2621, + "step": 7540 + }, + { + "epoch": 9.64384, + "grad_norm": 0.5344724059104919, + "learning_rate": 1.130703997846278e-05, + "loss": 3.1993, + "step": 7541 + }, + { + "epoch": 9.64512, + "grad_norm": 0.5381539463996887, + "learning_rate": 1.1266657692825413e-05, + "loss": 3.2864, + "step": 7542 + }, + { + "epoch": 9.6464, + "grad_norm": 0.5417296886444092, + "learning_rate": 1.1226275407188046e-05, + "loss": 3.2113, + "step": 7543 + }, + { + "epoch": 9.64768, + "grad_norm": 0.5437041521072388, + "learning_rate": 1.1185893121550678e-05, + "loss": 3.1934, + "step": 7544 + }, + { + "epoch": 9.64896, + "grad_norm": 0.5453625321388245, + "learning_rate": 1.1145510835913311e-05, + "loss": 3.2435, + "step": 7545 + }, + { + "epoch": 9.65024, + "grad_norm": 0.5361076593399048, + "learning_rate": 1.1105128550275945e-05, + "loss": 3.2032, + "step": 7546 + }, + { + "epoch": 9.65152, + "grad_norm": 0.5454299449920654, + "learning_rate": 1.1064746264638578e-05, + "loss": 3.3266, + "step": 7547 + }, + { + "epoch": 9.6528, + "grad_norm": 0.5517051219940186, + "learning_rate": 1.102436397900121e-05, + "loss": 3.2157, + "step": 7548 + }, + { + "epoch": 9.65408, + "grad_norm": 0.5611749291419983, + "learning_rate": 1.0983981693363844e-05, + "loss": 3.2855, + "step": 7549 + }, + { + "epoch": 9.65536, + "grad_norm": 0.5386665463447571, + "learning_rate": 1.0943599407726477e-05, + "loss": 3.2633, + "step": 7550 + }, + { + "epoch": 9.65664, + "grad_norm": 0.545172929763794, + "learning_rate": 1.0903217122089109e-05, + "loss": 3.2414, + "step": 7551 + }, + { + "epoch": 9.65792, + "grad_norm": 0.5379675030708313, + "learning_rate": 1.0862834836451742e-05, + "loss": 3.2504, + "step": 7552 + }, + { + "epoch": 9.6592, + "grad_norm": 0.545353353023529, + "learning_rate": 1.0822452550814376e-05, + "loss": 3.2483, + "step": 7553 + }, + { + "epoch": 9.66048, + "grad_norm": 0.5340268015861511, + "learning_rate": 1.0782070265177008e-05, + "loss": 3.2026, + "step": 7554 + }, + { + "epoch": 9.66176, + "grad_norm": 0.5404039621353149, + "learning_rate": 1.0741687979539641e-05, + "loss": 3.2664, + "step": 7555 + }, + { + "epoch": 9.66304, + "grad_norm": 0.5477861762046814, + "learning_rate": 1.0701305693902274e-05, + "loss": 3.2691, + "step": 7556 + }, + { + "epoch": 9.66432, + "grad_norm": 0.5429335832595825, + "learning_rate": 1.0660923408264908e-05, + "loss": 3.2404, + "step": 7557 + }, + { + "epoch": 9.6656, + "grad_norm": 0.5497188568115234, + "learning_rate": 1.062054112262754e-05, + "loss": 3.1482, + "step": 7558 + }, + { + "epoch": 9.66688, + "grad_norm": 0.5364313721656799, + "learning_rate": 1.0580158836990173e-05, + "loss": 3.3282, + "step": 7559 + }, + { + "epoch": 9.66816, + "grad_norm": 0.5295746326446533, + "learning_rate": 1.0539776551352807e-05, + "loss": 3.2642, + "step": 7560 + }, + { + "epoch": 9.66944, + "grad_norm": 0.5566303133964539, + "learning_rate": 1.0499394265715438e-05, + "loss": 3.3054, + "step": 7561 + }, + { + "epoch": 9.67072, + "grad_norm": 0.5400784611701965, + "learning_rate": 1.0459011980078072e-05, + "loss": 3.2225, + "step": 7562 + }, + { + "epoch": 9.672, + "grad_norm": 0.546852707862854, + "learning_rate": 1.0418629694440705e-05, + "loss": 3.2187, + "step": 7563 + }, + { + "epoch": 9.67328, + "grad_norm": 0.5458729267120361, + "learning_rate": 1.0378247408803339e-05, + "loss": 3.265, + "step": 7564 + }, + { + "epoch": 9.67456, + "grad_norm": 0.5483717322349548, + "learning_rate": 1.033786512316597e-05, + "loss": 3.2788, + "step": 7565 + }, + { + "epoch": 9.67584, + "grad_norm": 0.5389772057533264, + "learning_rate": 1.0297482837528604e-05, + "loss": 3.2733, + "step": 7566 + }, + { + "epoch": 9.67712, + "grad_norm": 0.5320543646812439, + "learning_rate": 1.0257100551891237e-05, + "loss": 3.2597, + "step": 7567 + }, + { + "epoch": 9.6784, + "grad_norm": 0.5460016131401062, + "learning_rate": 1.0216718266253868e-05, + "loss": 3.226, + "step": 7568 + }, + { + "epoch": 9.67968, + "grad_norm": 0.5420553684234619, + "learning_rate": 1.0176335980616503e-05, + "loss": 3.2611, + "step": 7569 + }, + { + "epoch": 9.68096, + "grad_norm": 0.5544966459274292, + "learning_rate": 1.0135953694979136e-05, + "loss": 3.2986, + "step": 7570 + }, + { + "epoch": 9.68224, + "grad_norm": 0.5373578071594238, + "learning_rate": 1.0095571409341766e-05, + "loss": 3.2232, + "step": 7571 + }, + { + "epoch": 9.68352, + "grad_norm": 0.5499645471572876, + "learning_rate": 1.00551891237044e-05, + "loss": 3.2186, + "step": 7572 + }, + { + "epoch": 9.6848, + "grad_norm": 0.5397508144378662, + "learning_rate": 1.0014806838067035e-05, + "loss": 3.284, + "step": 7573 + }, + { + "epoch": 9.68608, + "grad_norm": 0.5304678082466125, + "learning_rate": 9.974424552429668e-06, + "loss": 3.2274, + "step": 7574 + }, + { + "epoch": 9.68736, + "grad_norm": 0.5234605073928833, + "learning_rate": 9.934042266792298e-06, + "loss": 3.216, + "step": 7575 + }, + { + "epoch": 9.68864, + "grad_norm": 0.5570342540740967, + "learning_rate": 9.893659981154932e-06, + "loss": 3.2837, + "step": 7576 + }, + { + "epoch": 9.68992, + "grad_norm": 0.5394232273101807, + "learning_rate": 9.853277695517565e-06, + "loss": 3.1624, + "step": 7577 + }, + { + "epoch": 9.6912, + "grad_norm": 0.5388901829719543, + "learning_rate": 9.812895409880197e-06, + "loss": 3.2089, + "step": 7578 + }, + { + "epoch": 9.69248, + "grad_norm": 0.5200719833374023, + "learning_rate": 9.77251312424283e-06, + "loss": 3.198, + "step": 7579 + }, + { + "epoch": 9.69376, + "grad_norm": 0.550902247428894, + "learning_rate": 9.732130838605464e-06, + "loss": 3.2799, + "step": 7580 + }, + { + "epoch": 9.69504, + "grad_norm": 0.531865656375885, + "learning_rate": 9.691748552968096e-06, + "loss": 3.242, + "step": 7581 + }, + { + "epoch": 9.69632, + "grad_norm": 0.533011257648468, + "learning_rate": 9.65136626733073e-06, + "loss": 3.2183, + "step": 7582 + }, + { + "epoch": 9.6976, + "grad_norm": 0.538938581943512, + "learning_rate": 9.610983981693363e-06, + "loss": 3.2257, + "step": 7583 + }, + { + "epoch": 9.698879999999999, + "grad_norm": 0.5364640951156616, + "learning_rate": 9.570601696055996e-06, + "loss": 3.2082, + "step": 7584 + }, + { + "epoch": 9.70016, + "grad_norm": 0.5598052740097046, + "learning_rate": 9.530219410418628e-06, + "loss": 3.3729, + "step": 7585 + }, + { + "epoch": 9.70144, + "grad_norm": 0.5431958436965942, + "learning_rate": 9.489837124781261e-06, + "loss": 3.2299, + "step": 7586 + }, + { + "epoch": 9.70272, + "grad_norm": 0.5261279344558716, + "learning_rate": 9.449454839143895e-06, + "loss": 3.2334, + "step": 7587 + }, + { + "epoch": 9.704, + "grad_norm": 0.5326124429702759, + "learning_rate": 9.409072553506527e-06, + "loss": 3.2051, + "step": 7588 + }, + { + "epoch": 9.70528, + "grad_norm": 0.551518440246582, + "learning_rate": 9.36869026786916e-06, + "loss": 3.1614, + "step": 7589 + }, + { + "epoch": 9.70656, + "grad_norm": 0.5355722308158875, + "learning_rate": 9.328307982231794e-06, + "loss": 3.2427, + "step": 7590 + }, + { + "epoch": 9.707840000000001, + "grad_norm": 0.5317171812057495, + "learning_rate": 9.287925696594427e-06, + "loss": 3.1419, + "step": 7591 + }, + { + "epoch": 9.70912, + "grad_norm": 0.5412980914115906, + "learning_rate": 9.247543410957059e-06, + "loss": 3.2759, + "step": 7592 + }, + { + "epoch": 9.7104, + "grad_norm": 0.546062171459198, + "learning_rate": 9.207161125319692e-06, + "loss": 3.2773, + "step": 7593 + }, + { + "epoch": 9.71168, + "grad_norm": 0.5412468314170837, + "learning_rate": 9.166778839682326e-06, + "loss": 3.247, + "step": 7594 + }, + { + "epoch": 9.71296, + "grad_norm": 0.5379220247268677, + "learning_rate": 9.12639655404496e-06, + "loss": 3.2106, + "step": 7595 + }, + { + "epoch": 9.71424, + "grad_norm": 0.5455538034439087, + "learning_rate": 9.086014268407591e-06, + "loss": 3.2229, + "step": 7596 + }, + { + "epoch": 9.71552, + "grad_norm": 0.5402326583862305, + "learning_rate": 9.045631982770223e-06, + "loss": 3.2599, + "step": 7597 + }, + { + "epoch": 9.7168, + "grad_norm": 0.5508577823638916, + "learning_rate": 9.005249697132858e-06, + "loss": 3.272, + "step": 7598 + }, + { + "epoch": 9.71808, + "grad_norm": 0.5405802726745605, + "learning_rate": 8.96486741149549e-06, + "loss": 3.2912, + "step": 7599 + }, + { + "epoch": 9.71936, + "grad_norm": 0.5341350436210632, + "learning_rate": 8.924485125858123e-06, + "loss": 3.2955, + "step": 7600 + }, + { + "epoch": 9.72064, + "grad_norm": 0.5520155429840088, + "learning_rate": 8.884102840220755e-06, + "loss": 3.272, + "step": 7601 + }, + { + "epoch": 9.72192, + "grad_norm": 0.5376763939857483, + "learning_rate": 8.843720554583388e-06, + "loss": 3.215, + "step": 7602 + }, + { + "epoch": 9.7232, + "grad_norm": 0.5320829749107361, + "learning_rate": 8.803338268946022e-06, + "loss": 3.2852, + "step": 7603 + }, + { + "epoch": 9.72448, + "grad_norm": 0.5295441746711731, + "learning_rate": 8.762955983308654e-06, + "loss": 3.1911, + "step": 7604 + }, + { + "epoch": 9.72576, + "grad_norm": 0.5424656271934509, + "learning_rate": 8.722573697671287e-06, + "loss": 3.2145, + "step": 7605 + }, + { + "epoch": 9.72704, + "grad_norm": 0.5340449810028076, + "learning_rate": 8.68219141203392e-06, + "loss": 3.275, + "step": 7606 + }, + { + "epoch": 9.72832, + "grad_norm": 0.532345175743103, + "learning_rate": 8.641809126396552e-06, + "loss": 3.1572, + "step": 7607 + }, + { + "epoch": 9.7296, + "grad_norm": 0.5268622636795044, + "learning_rate": 8.601426840759186e-06, + "loss": 3.2741, + "step": 7608 + }, + { + "epoch": 9.730879999999999, + "grad_norm": 0.5382117629051208, + "learning_rate": 8.56104455512182e-06, + "loss": 3.2102, + "step": 7609 + }, + { + "epoch": 9.73216, + "grad_norm": 0.5525277256965637, + "learning_rate": 8.520662269484453e-06, + "loss": 3.3048, + "step": 7610 + }, + { + "epoch": 9.73344, + "grad_norm": 0.5279812216758728, + "learning_rate": 8.480279983847084e-06, + "loss": 3.256, + "step": 7611 + }, + { + "epoch": 9.73472, + "grad_norm": 0.5394153594970703, + "learning_rate": 8.439897698209718e-06, + "loss": 3.2207, + "step": 7612 + }, + { + "epoch": 9.736, + "grad_norm": 0.533865213394165, + "learning_rate": 8.399515412572351e-06, + "loss": 3.1895, + "step": 7613 + }, + { + "epoch": 9.73728, + "grad_norm": 0.5335356593132019, + "learning_rate": 8.359133126934983e-06, + "loss": 3.2414, + "step": 7614 + }, + { + "epoch": 9.73856, + "grad_norm": 0.5455939173698425, + "learning_rate": 8.318750841297617e-06, + "loss": 3.2906, + "step": 7615 + }, + { + "epoch": 9.739840000000001, + "grad_norm": 0.5451316237449646, + "learning_rate": 8.27836855566025e-06, + "loss": 3.2626, + "step": 7616 + }, + { + "epoch": 9.74112, + "grad_norm": 0.5487070679664612, + "learning_rate": 8.237986270022882e-06, + "loss": 3.2446, + "step": 7617 + }, + { + "epoch": 9.7424, + "grad_norm": 0.5351135730743408, + "learning_rate": 8.197603984385515e-06, + "loss": 3.2191, + "step": 7618 + }, + { + "epoch": 9.74368, + "grad_norm": 0.5554419755935669, + "learning_rate": 8.157221698748149e-06, + "loss": 3.272, + "step": 7619 + }, + { + "epoch": 9.74496, + "grad_norm": 0.5496061444282532, + "learning_rate": 8.116839413110782e-06, + "loss": 3.2893, + "step": 7620 + }, + { + "epoch": 9.74624, + "grad_norm": 0.5537092685699463, + "learning_rate": 8.076457127473414e-06, + "loss": 3.3493, + "step": 7621 + }, + { + "epoch": 9.74752, + "grad_norm": 0.5267013907432556, + "learning_rate": 8.036074841836047e-06, + "loss": 3.2105, + "step": 7622 + }, + { + "epoch": 9.7488, + "grad_norm": 0.537086009979248, + "learning_rate": 7.995692556198681e-06, + "loss": 3.2392, + "step": 7623 + }, + { + "epoch": 9.75008, + "grad_norm": 0.5526744723320007, + "learning_rate": 7.955310270561313e-06, + "loss": 3.2584, + "step": 7624 + }, + { + "epoch": 9.75136, + "grad_norm": 0.5359621047973633, + "learning_rate": 7.914927984923946e-06, + "loss": 3.2106, + "step": 7625 + }, + { + "epoch": 9.75264, + "grad_norm": 0.5292626619338989, + "learning_rate": 7.87454569928658e-06, + "loss": 3.2777, + "step": 7626 + }, + { + "epoch": 9.75392, + "grad_norm": 0.5385765433311462, + "learning_rate": 7.834163413649213e-06, + "loss": 3.2815, + "step": 7627 + }, + { + "epoch": 9.7552, + "grad_norm": 0.5359296798706055, + "learning_rate": 7.793781128011845e-06, + "loss": 3.2214, + "step": 7628 + }, + { + "epoch": 9.75648, + "grad_norm": 0.5338163375854492, + "learning_rate": 7.753398842374477e-06, + "loss": 3.2129, + "step": 7629 + }, + { + "epoch": 9.75776, + "grad_norm": 0.5449706315994263, + "learning_rate": 7.71301655673711e-06, + "loss": 3.2514, + "step": 7630 + }, + { + "epoch": 9.75904, + "grad_norm": 0.5432578325271606, + "learning_rate": 7.672634271099744e-06, + "loss": 3.2642, + "step": 7631 + }, + { + "epoch": 9.76032, + "grad_norm": 0.5408291816711426, + "learning_rate": 7.632251985462377e-06, + "loss": 3.2562, + "step": 7632 + }, + { + "epoch": 9.7616, + "grad_norm": 0.5344693064689636, + "learning_rate": 7.59186969982501e-06, + "loss": 3.2389, + "step": 7633 + }, + { + "epoch": 9.76288, + "grad_norm": 0.5311311483383179, + "learning_rate": 7.551487414187642e-06, + "loss": 3.2397, + "step": 7634 + }, + { + "epoch": 9.76416, + "grad_norm": 0.5357692241668701, + "learning_rate": 7.511105128550276e-06, + "loss": 3.2205, + "step": 7635 + }, + { + "epoch": 9.76544, + "grad_norm": 0.5423558950424194, + "learning_rate": 7.470722842912908e-06, + "loss": 3.2897, + "step": 7636 + }, + { + "epoch": 9.76672, + "grad_norm": 0.5260913372039795, + "learning_rate": 7.430340557275542e-06, + "loss": 3.1949, + "step": 7637 + }, + { + "epoch": 9.768, + "grad_norm": 0.5533113479614258, + "learning_rate": 7.3899582716381744e-06, + "loss": 3.2199, + "step": 7638 + }, + { + "epoch": 9.76928, + "grad_norm": 0.5424081087112427, + "learning_rate": 7.349575986000806e-06, + "loss": 3.3368, + "step": 7639 + }, + { + "epoch": 9.77056, + "grad_norm": 0.5297884941101074, + "learning_rate": 7.3091937003634405e-06, + "loss": 3.1818, + "step": 7640 + }, + { + "epoch": 9.77184, + "grad_norm": 0.5412716865539551, + "learning_rate": 7.268811414726072e-06, + "loss": 3.219, + "step": 7641 + }, + { + "epoch": 9.77312, + "grad_norm": 0.5297017693519592, + "learning_rate": 7.228429129088707e-06, + "loss": 3.2751, + "step": 7642 + }, + { + "epoch": 9.7744, + "grad_norm": 0.5230127573013306, + "learning_rate": 7.188046843451338e-06, + "loss": 3.2886, + "step": 7643 + }, + { + "epoch": 9.77568, + "grad_norm": 0.5418264865875244, + "learning_rate": 7.147664557813971e-06, + "loss": 3.2826, + "step": 7644 + }, + { + "epoch": 9.77696, + "grad_norm": 0.5246114134788513, + "learning_rate": 7.1072822721766044e-06, + "loss": 3.2256, + "step": 7645 + }, + { + "epoch": 9.77824, + "grad_norm": 0.5591127872467041, + "learning_rate": 7.066899986539237e-06, + "loss": 3.2657, + "step": 7646 + }, + { + "epoch": 9.77952, + "grad_norm": 0.5307941436767578, + "learning_rate": 7.0265177009018705e-06, + "loss": 3.2904, + "step": 7647 + }, + { + "epoch": 9.7808, + "grad_norm": 0.5432814359664917, + "learning_rate": 6.986135415264503e-06, + "loss": 3.3016, + "step": 7648 + }, + { + "epoch": 9.78208, + "grad_norm": 0.53673255443573, + "learning_rate": 6.945753129627136e-06, + "loss": 3.1894, + "step": 7649 + }, + { + "epoch": 9.78336, + "grad_norm": 0.5456019043922424, + "learning_rate": 6.905370843989769e-06, + "loss": 3.251, + "step": 7650 + }, + { + "epoch": 9.78464, + "grad_norm": 0.5382410287857056, + "learning_rate": 6.864988558352402e-06, + "loss": 3.2033, + "step": 7651 + }, + { + "epoch": 9.78592, + "grad_norm": 0.5357476472854614, + "learning_rate": 6.824606272715035e-06, + "loss": 3.2886, + "step": 7652 + }, + { + "epoch": 9.7872, + "grad_norm": 0.5439364314079285, + "learning_rate": 6.784223987077668e-06, + "loss": 3.314, + "step": 7653 + }, + { + "epoch": 9.78848, + "grad_norm": 0.5283563137054443, + "learning_rate": 6.7438417014403005e-06, + "loss": 3.2845, + "step": 7654 + }, + { + "epoch": 9.78976, + "grad_norm": 0.5235844254493713, + "learning_rate": 6.703459415802934e-06, + "loss": 3.2595, + "step": 7655 + }, + { + "epoch": 9.79104, + "grad_norm": 0.5161533951759338, + "learning_rate": 6.663077130165567e-06, + "loss": 3.2127, + "step": 7656 + }, + { + "epoch": 9.79232, + "grad_norm": 0.5272259712219238, + "learning_rate": 6.6226948445282e-06, + "loss": 3.2172, + "step": 7657 + }, + { + "epoch": 9.7936, + "grad_norm": 0.5525578260421753, + "learning_rate": 6.582312558890833e-06, + "loss": 3.2411, + "step": 7658 + }, + { + "epoch": 9.79488, + "grad_norm": 0.5354979634284973, + "learning_rate": 6.541930273253465e-06, + "loss": 3.1923, + "step": 7659 + }, + { + "epoch": 9.79616, + "grad_norm": 0.5332154035568237, + "learning_rate": 6.501547987616099e-06, + "loss": 3.204, + "step": 7660 + }, + { + "epoch": 9.79744, + "grad_norm": 0.5478113889694214, + "learning_rate": 6.461165701978731e-06, + "loss": 3.2707, + "step": 7661 + }, + { + "epoch": 9.79872, + "grad_norm": 0.5272064208984375, + "learning_rate": 6.420783416341365e-06, + "loss": 3.2359, + "step": 7662 + }, + { + "epoch": 9.8, + "grad_norm": 0.5392633676528931, + "learning_rate": 6.3804011307039975e-06, + "loss": 3.1821, + "step": 7663 + }, + { + "epoch": 9.80128, + "grad_norm": 0.5203908681869507, + "learning_rate": 6.34001884506663e-06, + "loss": 3.2749, + "step": 7664 + }, + { + "epoch": 9.80256, + "grad_norm": 0.5322408080101013, + "learning_rate": 6.2996365594292635e-06, + "loss": 3.2694, + "step": 7665 + }, + { + "epoch": 9.80384, + "grad_norm": 0.5292795896530151, + "learning_rate": 6.259254273791896e-06, + "loss": 3.2264, + "step": 7666 + }, + { + "epoch": 9.80512, + "grad_norm": 0.5364894866943359, + "learning_rate": 6.21887198815453e-06, + "loss": 3.1952, + "step": 7667 + }, + { + "epoch": 9.8064, + "grad_norm": 0.5564066171646118, + "learning_rate": 6.178489702517161e-06, + "loss": 3.3316, + "step": 7668 + }, + { + "epoch": 9.80768, + "grad_norm": 0.5362501740455627, + "learning_rate": 6.138107416879796e-06, + "loss": 3.2155, + "step": 7669 + }, + { + "epoch": 9.80896, + "grad_norm": 0.5605980753898621, + "learning_rate": 6.0977251312424275e-06, + "loss": 3.2895, + "step": 7670 + }, + { + "epoch": 9.81024, + "grad_norm": 0.538100004196167, + "learning_rate": 6.05734284560506e-06, + "loss": 3.2186, + "step": 7671 + }, + { + "epoch": 9.81152, + "grad_norm": 0.5335180163383484, + "learning_rate": 6.0169605599676936e-06, + "loss": 3.2623, + "step": 7672 + }, + { + "epoch": 9.8128, + "grad_norm": 0.535961389541626, + "learning_rate": 5.976578274330326e-06, + "loss": 3.2338, + "step": 7673 + }, + { + "epoch": 9.81408, + "grad_norm": 0.5322915315628052, + "learning_rate": 5.93619598869296e-06, + "loss": 3.1992, + "step": 7674 + }, + { + "epoch": 9.81536, + "grad_norm": 0.52436763048172, + "learning_rate": 5.895813703055592e-06, + "loss": 3.257, + "step": 7675 + }, + { + "epoch": 9.81664, + "grad_norm": 0.5428990125656128, + "learning_rate": 5.855431417418225e-06, + "loss": 3.1951, + "step": 7676 + }, + { + "epoch": 9.81792, + "grad_norm": 0.5562485456466675, + "learning_rate": 5.815049131780858e-06, + "loss": 3.303, + "step": 7677 + }, + { + "epoch": 9.8192, + "grad_norm": 0.5489171743392944, + "learning_rate": 5.774666846143491e-06, + "loss": 3.2214, + "step": 7678 + }, + { + "epoch": 9.82048, + "grad_norm": 0.5357405543327332, + "learning_rate": 5.734284560506124e-06, + "loss": 3.2282, + "step": 7679 + }, + { + "epoch": 9.82176, + "grad_norm": 0.5341373682022095, + "learning_rate": 5.693902274868757e-06, + "loss": 3.2049, + "step": 7680 + }, + { + "epoch": 9.82304, + "grad_norm": 0.5386019349098206, + "learning_rate": 5.65351998923139e-06, + "loss": 3.2545, + "step": 7681 + }, + { + "epoch": 9.82432, + "grad_norm": 0.5332474708557129, + "learning_rate": 5.613137703594023e-06, + "loss": 3.207, + "step": 7682 + }, + { + "epoch": 9.8256, + "grad_norm": 0.5416889786720276, + "learning_rate": 5.572755417956656e-06, + "loss": 3.2406, + "step": 7683 + }, + { + "epoch": 9.82688, + "grad_norm": 0.5246055722236633, + "learning_rate": 5.532373132319289e-06, + "loss": 3.3026, + "step": 7684 + }, + { + "epoch": 9.82816, + "grad_norm": 0.5286515951156616, + "learning_rate": 5.491990846681922e-06, + "loss": 3.2483, + "step": 7685 + }, + { + "epoch": 9.82944, + "grad_norm": 0.5369952917098999, + "learning_rate": 5.451608561044554e-06, + "loss": 3.2452, + "step": 7686 + }, + { + "epoch": 9.83072, + "grad_norm": 0.5319638252258301, + "learning_rate": 5.411226275407188e-06, + "loss": 3.2861, + "step": 7687 + }, + { + "epoch": 9.832, + "grad_norm": 0.5259891748428345, + "learning_rate": 5.3708439897698205e-06, + "loss": 3.2605, + "step": 7688 + }, + { + "epoch": 9.83328, + "grad_norm": 0.5339581966400146, + "learning_rate": 5.330461704132454e-06, + "loss": 3.2469, + "step": 7689 + }, + { + "epoch": 9.83456, + "grad_norm": 0.5350625514984131, + "learning_rate": 5.290079418495087e-06, + "loss": 3.2347, + "step": 7690 + }, + { + "epoch": 9.83584, + "grad_norm": 0.5377787947654724, + "learning_rate": 5.249697132857719e-06, + "loss": 3.1947, + "step": 7691 + }, + { + "epoch": 9.83712, + "grad_norm": 0.5324146747589111, + "learning_rate": 5.209314847220353e-06, + "loss": 3.2842, + "step": 7692 + }, + { + "epoch": 9.8384, + "grad_norm": 0.5433526635169983, + "learning_rate": 5.168932561582985e-06, + "loss": 3.1448, + "step": 7693 + }, + { + "epoch": 9.83968, + "grad_norm": 0.5497397184371948, + "learning_rate": 5.128550275945619e-06, + "loss": 3.2761, + "step": 7694 + }, + { + "epoch": 9.84096, + "grad_norm": 0.5405837893486023, + "learning_rate": 5.088167990308251e-06, + "loss": 3.2843, + "step": 7695 + }, + { + "epoch": 9.84224, + "grad_norm": 0.5413019061088562, + "learning_rate": 5.047785704670883e-06, + "loss": 3.2202, + "step": 7696 + }, + { + "epoch": 9.84352, + "grad_norm": 0.5248864889144897, + "learning_rate": 5.0074034190335174e-06, + "loss": 3.234, + "step": 7697 + }, + { + "epoch": 9.8448, + "grad_norm": 0.5380563139915466, + "learning_rate": 4.967021133396149e-06, + "loss": 3.2331, + "step": 7698 + }, + { + "epoch": 9.84608, + "grad_norm": 0.5344839096069336, + "learning_rate": 4.926638847758783e-06, + "loss": 3.3208, + "step": 7699 + }, + { + "epoch": 9.84736, + "grad_norm": 0.544310450553894, + "learning_rate": 4.886256562121415e-06, + "loss": 3.242, + "step": 7700 + }, + { + "epoch": 9.84864, + "grad_norm": 0.5436192750930786, + "learning_rate": 4.845874276484048e-06, + "loss": 3.2463, + "step": 7701 + }, + { + "epoch": 9.849920000000001, + "grad_norm": 0.5517845749855042, + "learning_rate": 4.805491990846681e-06, + "loss": 3.2587, + "step": 7702 + }, + { + "epoch": 9.8512, + "grad_norm": 0.5308093428611755, + "learning_rate": 4.765109705209314e-06, + "loss": 3.238, + "step": 7703 + }, + { + "epoch": 9.85248, + "grad_norm": 0.5324238538742065, + "learning_rate": 4.7247274195719474e-06, + "loss": 3.1847, + "step": 7704 + }, + { + "epoch": 9.85376, + "grad_norm": 0.5277994871139526, + "learning_rate": 4.68434513393458e-06, + "loss": 3.1705, + "step": 7705 + }, + { + "epoch": 9.85504, + "grad_norm": 0.5271157026290894, + "learning_rate": 4.6439628482972135e-06, + "loss": 3.1905, + "step": 7706 + }, + { + "epoch": 9.85632, + "grad_norm": 0.531658947467804, + "learning_rate": 4.603580562659846e-06, + "loss": 3.2695, + "step": 7707 + }, + { + "epoch": 9.8576, + "grad_norm": 0.5260719060897827, + "learning_rate": 4.56319827702248e-06, + "loss": 3.2373, + "step": 7708 + }, + { + "epoch": 9.85888, + "grad_norm": 0.5422466993331909, + "learning_rate": 4.522815991385111e-06, + "loss": 3.2401, + "step": 7709 + }, + { + "epoch": 9.86016, + "grad_norm": 0.5454497337341309, + "learning_rate": 4.482433705747745e-06, + "loss": 3.2724, + "step": 7710 + }, + { + "epoch": 9.86144, + "grad_norm": 0.5371284484863281, + "learning_rate": 4.4420514201103775e-06, + "loss": 3.267, + "step": 7711 + }, + { + "epoch": 9.86272, + "grad_norm": 0.5461208820343018, + "learning_rate": 4.401669134473011e-06, + "loss": 3.2026, + "step": 7712 + }, + { + "epoch": 9.864, + "grad_norm": 0.5462895631790161, + "learning_rate": 4.3612868488356435e-06, + "loss": 3.1647, + "step": 7713 + }, + { + "epoch": 9.86528, + "grad_norm": 0.5370737314224243, + "learning_rate": 4.320904563198276e-06, + "loss": 3.2734, + "step": 7714 + }, + { + "epoch": 9.86656, + "grad_norm": 0.5331903100013733, + "learning_rate": 4.28052227756091e-06, + "loss": 3.1304, + "step": 7715 + }, + { + "epoch": 9.86784, + "grad_norm": 0.5273880958557129, + "learning_rate": 4.240139991923542e-06, + "loss": 3.2982, + "step": 7716 + }, + { + "epoch": 9.86912, + "grad_norm": 0.5413378477096558, + "learning_rate": 4.199757706286176e-06, + "loss": 3.1814, + "step": 7717 + }, + { + "epoch": 9.8704, + "grad_norm": 0.5351929664611816, + "learning_rate": 4.159375420648808e-06, + "loss": 3.2605, + "step": 7718 + }, + { + "epoch": 9.87168, + "grad_norm": 0.545540452003479, + "learning_rate": 4.118993135011441e-06, + "loss": 3.2398, + "step": 7719 + }, + { + "epoch": 9.872959999999999, + "grad_norm": 0.5313085913658142, + "learning_rate": 4.078610849374074e-06, + "loss": 3.2208, + "step": 7720 + }, + { + "epoch": 9.87424, + "grad_norm": 0.5450189709663391, + "learning_rate": 4.038228563736707e-06, + "loss": 3.2366, + "step": 7721 + }, + { + "epoch": 9.87552, + "grad_norm": 0.5320997834205627, + "learning_rate": 3.9978462780993405e-06, + "loss": 3.2865, + "step": 7722 + }, + { + "epoch": 9.8768, + "grad_norm": 0.5449676513671875, + "learning_rate": 3.957463992461973e-06, + "loss": 3.3148, + "step": 7723 + }, + { + "epoch": 9.87808, + "grad_norm": 0.525892972946167, + "learning_rate": 3.9170817068246066e-06, + "loss": 3.1522, + "step": 7724 + }, + { + "epoch": 9.87936, + "grad_norm": 0.5414352416992188, + "learning_rate": 3.876699421187238e-06, + "loss": 3.2166, + "step": 7725 + }, + { + "epoch": 9.88064, + "grad_norm": 0.5205792188644409, + "learning_rate": 3.836317135549872e-06, + "loss": 3.2689, + "step": 7726 + }, + { + "epoch": 9.881920000000001, + "grad_norm": 0.5402050614356995, + "learning_rate": 3.795934849912505e-06, + "loss": 3.2732, + "step": 7727 + }, + { + "epoch": 9.8832, + "grad_norm": 0.5389068722724915, + "learning_rate": 3.755552564275138e-06, + "loss": 3.2579, + "step": 7728 + }, + { + "epoch": 9.88448, + "grad_norm": 0.5307827591896057, + "learning_rate": 3.715170278637771e-06, + "loss": 3.2471, + "step": 7729 + }, + { + "epoch": 9.88576, + "grad_norm": 0.5479539036750793, + "learning_rate": 3.674787993000403e-06, + "loss": 3.2576, + "step": 7730 + }, + { + "epoch": 9.88704, + "grad_norm": 0.5323429703712463, + "learning_rate": 3.634405707363036e-06, + "loss": 3.2332, + "step": 7731 + }, + { + "epoch": 9.88832, + "grad_norm": 0.5382372736930847, + "learning_rate": 3.594023421725669e-06, + "loss": 3.2954, + "step": 7732 + }, + { + "epoch": 9.8896, + "grad_norm": 0.5464025735855103, + "learning_rate": 3.5536411360883022e-06, + "loss": 3.2248, + "step": 7733 + }, + { + "epoch": 9.89088, + "grad_norm": 0.5193501114845276, + "learning_rate": 3.5132588504509353e-06, + "loss": 3.2544, + "step": 7734 + }, + { + "epoch": 9.89216, + "grad_norm": 0.5276758074760437, + "learning_rate": 3.472876564813568e-06, + "loss": 3.1927, + "step": 7735 + }, + { + "epoch": 9.89344, + "grad_norm": 0.5477931499481201, + "learning_rate": 3.432494279176201e-06, + "loss": 3.2572, + "step": 7736 + }, + { + "epoch": 9.89472, + "grad_norm": 0.5434842705726624, + "learning_rate": 3.392111993538834e-06, + "loss": 3.3412, + "step": 7737 + }, + { + "epoch": 9.896, + "grad_norm": 0.5318534970283508, + "learning_rate": 3.351729707901467e-06, + "loss": 3.2676, + "step": 7738 + }, + { + "epoch": 9.89728, + "grad_norm": 0.5352578163146973, + "learning_rate": 3.3113474222641e-06, + "loss": 3.2455, + "step": 7739 + }, + { + "epoch": 9.89856, + "grad_norm": 0.5339382290840149, + "learning_rate": 3.2709651366267327e-06, + "loss": 3.2947, + "step": 7740 + }, + { + "epoch": 9.89984, + "grad_norm": 0.5460047721862793, + "learning_rate": 3.2305828509893657e-06, + "loss": 3.3013, + "step": 7741 + }, + { + "epoch": 9.90112, + "grad_norm": 0.5352959632873535, + "learning_rate": 3.1902005653519987e-06, + "loss": 3.2407, + "step": 7742 + }, + { + "epoch": 9.9024, + "grad_norm": 0.5354017019271851, + "learning_rate": 3.1498182797146318e-06, + "loss": 3.1999, + "step": 7743 + }, + { + "epoch": 9.90368, + "grad_norm": 0.5433909296989441, + "learning_rate": 3.109435994077265e-06, + "loss": 3.1953, + "step": 7744 + }, + { + "epoch": 9.904959999999999, + "grad_norm": 0.5294229388237, + "learning_rate": 3.069053708439898e-06, + "loss": 3.19, + "step": 7745 + }, + { + "epoch": 9.90624, + "grad_norm": 0.5373355150222778, + "learning_rate": 3.02867142280253e-06, + "loss": 3.2349, + "step": 7746 + }, + { + "epoch": 9.90752, + "grad_norm": 0.5573491454124451, + "learning_rate": 2.988289137165163e-06, + "loss": 3.1777, + "step": 7747 + }, + { + "epoch": 9.9088, + "grad_norm": 0.54659104347229, + "learning_rate": 2.947906851527796e-06, + "loss": 3.2335, + "step": 7748 + }, + { + "epoch": 9.91008, + "grad_norm": 0.5471231341362, + "learning_rate": 2.907524565890429e-06, + "loss": 3.2676, + "step": 7749 + }, + { + "epoch": 9.91136, + "grad_norm": 0.5398557186126709, + "learning_rate": 2.867142280253062e-06, + "loss": 3.2567, + "step": 7750 + }, + { + "epoch": 9.91264, + "grad_norm": 0.5324544310569763, + "learning_rate": 2.826759994615695e-06, + "loss": 3.2461, + "step": 7751 + }, + { + "epoch": 9.91392, + "grad_norm": 0.5316013693809509, + "learning_rate": 2.786377708978328e-06, + "loss": 3.2755, + "step": 7752 + }, + { + "epoch": 9.9152, + "grad_norm": 0.5322742462158203, + "learning_rate": 2.745995423340961e-06, + "loss": 3.2667, + "step": 7753 + }, + { + "epoch": 9.91648, + "grad_norm": 0.5327667593955994, + "learning_rate": 2.705613137703594e-06, + "loss": 3.278, + "step": 7754 + }, + { + "epoch": 9.91776, + "grad_norm": 0.5432612895965576, + "learning_rate": 2.665230852066227e-06, + "loss": 3.2554, + "step": 7755 + }, + { + "epoch": 9.91904, + "grad_norm": 0.5206037163734436, + "learning_rate": 2.6248485664288596e-06, + "loss": 3.2419, + "step": 7756 + }, + { + "epoch": 9.92032, + "grad_norm": 0.5430627465248108, + "learning_rate": 2.5844662807914926e-06, + "loss": 3.2313, + "step": 7757 + }, + { + "epoch": 9.9216, + "grad_norm": 0.527530312538147, + "learning_rate": 2.5440839951541257e-06, + "loss": 3.2726, + "step": 7758 + }, + { + "epoch": 9.92288, + "grad_norm": 0.5246833562850952, + "learning_rate": 2.5037017095167587e-06, + "loss": 3.1722, + "step": 7759 + }, + { + "epoch": 9.92416, + "grad_norm": 0.5283114314079285, + "learning_rate": 2.4633194238793913e-06, + "loss": 3.2337, + "step": 7760 + }, + { + "epoch": 9.92544, + "grad_norm": 0.5455198884010315, + "learning_rate": 2.422937138242024e-06, + "loss": 3.2814, + "step": 7761 + }, + { + "epoch": 9.92672, + "grad_norm": 0.5268814563751221, + "learning_rate": 2.382554852604657e-06, + "loss": 3.2407, + "step": 7762 + }, + { + "epoch": 9.928, + "grad_norm": 0.5490137338638306, + "learning_rate": 2.34217256696729e-06, + "loss": 3.2843, + "step": 7763 + }, + { + "epoch": 9.92928, + "grad_norm": 0.525651216506958, + "learning_rate": 2.301790281329923e-06, + "loss": 3.2615, + "step": 7764 + }, + { + "epoch": 9.93056, + "grad_norm": 0.5373636484146118, + "learning_rate": 2.2614079956925557e-06, + "loss": 3.2135, + "step": 7765 + }, + { + "epoch": 9.93184, + "grad_norm": 0.5554747581481934, + "learning_rate": 2.2210257100551887e-06, + "loss": 3.2698, + "step": 7766 + }, + { + "epoch": 9.93312, + "grad_norm": 0.5517007112503052, + "learning_rate": 2.1806434244178218e-06, + "loss": 3.3233, + "step": 7767 + }, + { + "epoch": 9.9344, + "grad_norm": 0.5402884483337402, + "learning_rate": 2.140261138780455e-06, + "loss": 3.2164, + "step": 7768 + }, + { + "epoch": 9.93568, + "grad_norm": 0.5363087058067322, + "learning_rate": 2.099878853143088e-06, + "loss": 3.3029, + "step": 7769 + }, + { + "epoch": 9.93696, + "grad_norm": 0.547885537147522, + "learning_rate": 2.0594965675057205e-06, + "loss": 3.2803, + "step": 7770 + }, + { + "epoch": 9.93824, + "grad_norm": 0.5402307510375977, + "learning_rate": 2.0191142818683535e-06, + "loss": 3.2742, + "step": 7771 + }, + { + "epoch": 9.93952, + "grad_norm": 0.5349573493003845, + "learning_rate": 1.9787319962309865e-06, + "loss": 3.2538, + "step": 7772 + }, + { + "epoch": 9.9408, + "grad_norm": 0.5216020345687866, + "learning_rate": 1.938349710593619e-06, + "loss": 3.1519, + "step": 7773 + }, + { + "epoch": 9.94208, + "grad_norm": 0.5340332388877869, + "learning_rate": 1.8979674249562524e-06, + "loss": 3.2481, + "step": 7774 + }, + { + "epoch": 9.94336, + "grad_norm": 0.5314613580703735, + "learning_rate": 1.8575851393188855e-06, + "loss": 3.2759, + "step": 7775 + }, + { + "epoch": 9.94464, + "grad_norm": 0.5448631048202515, + "learning_rate": 1.817202853681518e-06, + "loss": 3.2906, + "step": 7776 + }, + { + "epoch": 9.94592, + "grad_norm": 0.5468592047691345, + "learning_rate": 1.7768205680441511e-06, + "loss": 3.2446, + "step": 7777 + }, + { + "epoch": 9.9472, + "grad_norm": 0.5331081748008728, + "learning_rate": 1.736438282406784e-06, + "loss": 3.2681, + "step": 7778 + }, + { + "epoch": 9.94848, + "grad_norm": 0.5275120139122009, + "learning_rate": 1.696055996769417e-06, + "loss": 3.2593, + "step": 7779 + }, + { + "epoch": 9.94976, + "grad_norm": 0.5483401417732239, + "learning_rate": 1.65567371113205e-06, + "loss": 3.2909, + "step": 7780 + }, + { + "epoch": 9.95104, + "grad_norm": 0.5375556349754333, + "learning_rate": 1.6152914254946828e-06, + "loss": 3.219, + "step": 7781 + }, + { + "epoch": 9.95232, + "grad_norm": 0.532954752445221, + "learning_rate": 1.5749091398573159e-06, + "loss": 3.2613, + "step": 7782 + }, + { + "epoch": 9.9536, + "grad_norm": 0.534887969493866, + "learning_rate": 1.534526854219949e-06, + "loss": 3.2319, + "step": 7783 + }, + { + "epoch": 9.95488, + "grad_norm": 0.5328458547592163, + "learning_rate": 1.4941445685825815e-06, + "loss": 3.2237, + "step": 7784 + }, + { + "epoch": 9.95616, + "grad_norm": 0.530035674571991, + "learning_rate": 1.4537622829452146e-06, + "loss": 3.2497, + "step": 7785 + }, + { + "epoch": 9.95744, + "grad_norm": 0.5311540365219116, + "learning_rate": 1.4133799973078474e-06, + "loss": 3.1767, + "step": 7786 + }, + { + "epoch": 9.95872, + "grad_norm": 0.5378389358520508, + "learning_rate": 1.3729977116704805e-06, + "loss": 3.253, + "step": 7787 + }, + { + "epoch": 9.96, + "grad_norm": 0.5423367619514465, + "learning_rate": 1.3326154260331135e-06, + "loss": 3.2811, + "step": 7788 + }, + { + "epoch": 9.96128, + "grad_norm": 0.5347367525100708, + "learning_rate": 1.2922331403957463e-06, + "loss": 3.2527, + "step": 7789 + }, + { + "epoch": 9.96256, + "grad_norm": 0.547399640083313, + "learning_rate": 1.2518508547583794e-06, + "loss": 3.2159, + "step": 7790 + }, + { + "epoch": 9.96384, + "grad_norm": 0.5291503667831421, + "learning_rate": 1.211468569121012e-06, + "loss": 3.1828, + "step": 7791 + }, + { + "epoch": 9.96512, + "grad_norm": 0.5376866459846497, + "learning_rate": 1.171086283483645e-06, + "loss": 3.2118, + "step": 7792 + }, + { + "epoch": 9.9664, + "grad_norm": 0.5406926274299622, + "learning_rate": 1.1307039978462778e-06, + "loss": 3.1903, + "step": 7793 + }, + { + "epoch": 9.96768, + "grad_norm": 0.5449708104133606, + "learning_rate": 1.0903217122089109e-06, + "loss": 3.2392, + "step": 7794 + }, + { + "epoch": 9.96896, + "grad_norm": 0.5226523876190186, + "learning_rate": 1.049939426571544e-06, + "loss": 3.2205, + "step": 7795 + }, + { + "epoch": 9.97024, + "grad_norm": 0.5401772260665894, + "learning_rate": 1.0095571409341768e-06, + "loss": 3.2496, + "step": 7796 + }, + { + "epoch": 9.97152, + "grad_norm": 0.5378510355949402, + "learning_rate": 9.691748552968096e-07, + "loss": 3.1916, + "step": 7797 + }, + { + "epoch": 9.9728, + "grad_norm": 0.5274825096130371, + "learning_rate": 9.287925696594427e-07, + "loss": 3.2405, + "step": 7798 + }, + { + "epoch": 9.97408, + "grad_norm": 0.5408015251159668, + "learning_rate": 8.884102840220756e-07, + "loss": 3.2419, + "step": 7799 + }, + { + "epoch": 9.97536, + "grad_norm": 0.5279608964920044, + "learning_rate": 8.480279983847085e-07, + "loss": 3.2378, + "step": 7800 + }, + { + "epoch": 9.97664, + "grad_norm": 0.5284964442253113, + "learning_rate": 8.076457127473414e-07, + "loss": 3.2899, + "step": 7801 + }, + { + "epoch": 9.97792, + "grad_norm": 0.537851095199585, + "learning_rate": 7.672634271099745e-07, + "loss": 3.308, + "step": 7802 + }, + { + "epoch": 9.9792, + "grad_norm": 0.5350565314292908, + "learning_rate": 7.268811414726073e-07, + "loss": 3.1784, + "step": 7803 + }, + { + "epoch": 9.98048, + "grad_norm": 0.5333359837532043, + "learning_rate": 6.864988558352402e-07, + "loss": 3.2109, + "step": 7804 + }, + { + "epoch": 9.98176, + "grad_norm": 0.5407792925834656, + "learning_rate": 6.461165701978732e-07, + "loss": 3.2474, + "step": 7805 + }, + { + "epoch": 9.98304, + "grad_norm": 0.545315146446228, + "learning_rate": 6.05734284560506e-07, + "loss": 3.2454, + "step": 7806 + }, + { + "epoch": 9.98432, + "grad_norm": 0.5328691601753235, + "learning_rate": 5.653519989231389e-07, + "loss": 3.2104, + "step": 7807 + }, + { + "epoch": 9.9856, + "grad_norm": 0.5297470092773438, + "learning_rate": 5.24969713285772e-07, + "loss": 3.2225, + "step": 7808 + }, + { + "epoch": 9.98688, + "grad_norm": 0.5331732630729675, + "learning_rate": 4.845874276484048e-07, + "loss": 3.2141, + "step": 7809 + }, + { + "epoch": 9.98816, + "grad_norm": 0.5387186408042908, + "learning_rate": 4.442051420110378e-07, + "loss": 3.2828, + "step": 7810 + }, + { + "epoch": 9.98944, + "grad_norm": 0.5281053781509399, + "learning_rate": 4.038228563736707e-07, + "loss": 3.2234, + "step": 7811 + }, + { + "epoch": 9.99072, + "grad_norm": 0.5405586957931519, + "learning_rate": 3.6344057073630365e-07, + "loss": 3.3145, + "step": 7812 + }, + { + "epoch": 9.992, + "grad_norm": 0.5372722744941711, + "learning_rate": 3.230582850989366e-07, + "loss": 3.2676, + "step": 7813 + }, + { + "epoch": 9.99328, + "grad_norm": 0.5271605253219604, + "learning_rate": 2.8267599946156946e-07, + "loss": 3.2855, + "step": 7814 + }, + { + "epoch": 9.99456, + "grad_norm": 0.5379496812820435, + "learning_rate": 2.422937138242024e-07, + "loss": 3.1735, + "step": 7815 + }, + { + "epoch": 9.99584, + "grad_norm": 0.5309686660766602, + "learning_rate": 2.0191142818683536e-07, + "loss": 3.2678, + "step": 7816 + }, + { + "epoch": 9.99712, + "grad_norm": 0.5377652645111084, + "learning_rate": 1.615291425494683e-07, + "loss": 3.3243, + "step": 7817 + }, + { + "epoch": 9.9984, + "grad_norm": 0.5389779806137085, + "learning_rate": 1.211468569121012e-07, + "loss": 3.2626, + "step": 7818 + }, + { + "epoch": 9.99968, + "grad_norm": 0.54366135597229, + "learning_rate": 8.076457127473414e-08, + "loss": 3.2713, + "step": 7819 + }, + { + "epoch": 10.0, + "grad_norm": 1.0505841970443726, + "learning_rate": 4.038228563736707e-08, + "loss": 3.2862, + "step": 7820 + } + ], + "logging_steps": 1, + "max_steps": 7820, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.5963011260928e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}