{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 490, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01020408163265306, "grad_norm": 1.5778175592422485, "learning_rate": 0.0, "loss": 0.1624, "step": 1 }, { "epoch": 0.02040816326530612, "grad_norm": 1.3363069295883179, "learning_rate": 4.0816326530612243e-07, "loss": 0.1369, "step": 2 }, { "epoch": 0.030612244897959183, "grad_norm": 1.248914361000061, "learning_rate": 8.163265306122449e-07, "loss": 0.1151, "step": 3 }, { "epoch": 0.04081632653061224, "grad_norm": 1.4152894020080566, "learning_rate": 1.2244897959183673e-06, "loss": 0.1031, "step": 4 }, { "epoch": 0.05102040816326531, "grad_norm": 1.2651602029800415, "learning_rate": 1.6326530612244897e-06, "loss": 0.097, "step": 5 }, { "epoch": 0.061224489795918366, "grad_norm": 1.289127230644226, "learning_rate": 2.0408163265306125e-06, "loss": 0.095, "step": 6 }, { "epoch": 0.07142857142857142, "grad_norm": 1.1998013257980347, "learning_rate": 2.4489795918367347e-06, "loss": 0.1236, "step": 7 }, { "epoch": 0.08163265306122448, "grad_norm": 1.538970947265625, "learning_rate": 2.8571428571428573e-06, "loss": 0.1177, "step": 8 }, { "epoch": 0.09183673469387756, "grad_norm": 1.2269916534423828, "learning_rate": 3.2653061224489794e-06, "loss": 0.0931, "step": 9 }, { "epoch": 0.10204081632653061, "grad_norm": 1.2894030809402466, "learning_rate": 3.6734693877551024e-06, "loss": 0.1049, "step": 10 }, { "epoch": 0.11224489795918367, "grad_norm": 0.9413341283798218, "learning_rate": 4.081632653061225e-06, "loss": 0.0757, "step": 11 }, { "epoch": 0.12244897959183673, "grad_norm": 1.1163856983184814, "learning_rate": 4.489795918367348e-06, "loss": 0.0936, "step": 12 }, { "epoch": 0.1326530612244898, "grad_norm": 0.997565507888794, "learning_rate": 4.897959183673469e-06, "loss": 0.0797, "step": 13 }, { "epoch": 0.14285714285714285, "grad_norm": 1.0046685934066772, "learning_rate": 5.306122448979593e-06, "loss": 0.0855, "step": 14 }, { "epoch": 0.15306122448979592, "grad_norm": 0.9205936789512634, "learning_rate": 5.7142857142857145e-06, "loss": 0.079, "step": 15 }, { "epoch": 0.16326530612244897, "grad_norm": 0.8712719678878784, "learning_rate": 6.122448979591837e-06, "loss": 0.0666, "step": 16 }, { "epoch": 0.17346938775510204, "grad_norm": 0.9426755905151367, "learning_rate": 6.530612244897959e-06, "loss": 0.073, "step": 17 }, { "epoch": 0.1836734693877551, "grad_norm": 0.8015092611312866, "learning_rate": 6.938775510204082e-06, "loss": 0.0669, "step": 18 }, { "epoch": 0.19387755102040816, "grad_norm": 0.7584081292152405, "learning_rate": 7.346938775510205e-06, "loss": 0.0517, "step": 19 }, { "epoch": 0.20408163265306123, "grad_norm": 0.7984261512756348, "learning_rate": 7.755102040816327e-06, "loss": 0.0667, "step": 20 }, { "epoch": 0.21428571428571427, "grad_norm": 0.7533179521560669, "learning_rate": 8.16326530612245e-06, "loss": 0.0639, "step": 21 }, { "epoch": 0.22448979591836735, "grad_norm": 0.9156713485717773, "learning_rate": 8.571428571428571e-06, "loss": 0.0729, "step": 22 }, { "epoch": 0.23469387755102042, "grad_norm": 0.6727928519248962, "learning_rate": 8.979591836734695e-06, "loss": 0.0565, "step": 23 }, { "epoch": 0.24489795918367346, "grad_norm": 0.5908196568489075, "learning_rate": 9.387755102040818e-06, "loss": 0.0501, "step": 24 }, { "epoch": 0.25510204081632654, "grad_norm": 0.5994157195091248, "learning_rate": 9.795918367346939e-06, "loss": 0.0596, "step": 25 }, { "epoch": 0.2653061224489796, "grad_norm": 0.5699151754379272, "learning_rate": 1.0204081632653063e-05, "loss": 0.0478, "step": 26 }, { "epoch": 0.2755102040816326, "grad_norm": 0.41071373224258423, "learning_rate": 1.0612244897959186e-05, "loss": 0.0306, "step": 27 }, { "epoch": 0.2857142857142857, "grad_norm": 0.6520228981971741, "learning_rate": 1.1020408163265306e-05, "loss": 0.0509, "step": 28 }, { "epoch": 0.29591836734693877, "grad_norm": 0.5062035918235779, "learning_rate": 1.1428571428571429e-05, "loss": 0.0415, "step": 29 }, { "epoch": 0.30612244897959184, "grad_norm": 0.5349479913711548, "learning_rate": 1.1836734693877552e-05, "loss": 0.0396, "step": 30 }, { "epoch": 0.3163265306122449, "grad_norm": 0.29080551862716675, "learning_rate": 1.2244897959183674e-05, "loss": 0.0215, "step": 31 }, { "epoch": 0.32653061224489793, "grad_norm": 0.5437124371528625, "learning_rate": 1.2653061224489798e-05, "loss": 0.0402, "step": 32 }, { "epoch": 0.336734693877551, "grad_norm": 0.7592443823814392, "learning_rate": 1.3061224489795918e-05, "loss": 0.0692, "step": 33 }, { "epoch": 0.3469387755102041, "grad_norm": 0.6122593283653259, "learning_rate": 1.3469387755102042e-05, "loss": 0.0602, "step": 34 }, { "epoch": 0.35714285714285715, "grad_norm": 0.31229618191719055, "learning_rate": 1.3877551020408165e-05, "loss": 0.0215, "step": 35 }, { "epoch": 0.3673469387755102, "grad_norm": 0.37940043210983276, "learning_rate": 1.4285714285714287e-05, "loss": 0.0274, "step": 36 }, { "epoch": 0.37755102040816324, "grad_norm": 0.2848958969116211, "learning_rate": 1.469387755102041e-05, "loss": 0.0212, "step": 37 }, { "epoch": 0.3877551020408163, "grad_norm": 0.3313491940498352, "learning_rate": 1.510204081632653e-05, "loss": 0.0231, "step": 38 }, { "epoch": 0.3979591836734694, "grad_norm": 0.24816128611564636, "learning_rate": 1.5510204081632655e-05, "loss": 0.0159, "step": 39 }, { "epoch": 0.40816326530612246, "grad_norm": 0.2411227524280548, "learning_rate": 1.5918367346938776e-05, "loss": 0.0154, "step": 40 }, { "epoch": 0.41836734693877553, "grad_norm": 0.20029953122138977, "learning_rate": 1.63265306122449e-05, "loss": 0.013, "step": 41 }, { "epoch": 0.42857142857142855, "grad_norm": 0.23513872921466827, "learning_rate": 1.673469387755102e-05, "loss": 0.0144, "step": 42 }, { "epoch": 0.4387755102040816, "grad_norm": 0.4002116918563843, "learning_rate": 1.7142857142857142e-05, "loss": 0.0353, "step": 43 }, { "epoch": 0.4489795918367347, "grad_norm": 0.27058476209640503, "learning_rate": 1.7551020408163266e-05, "loss": 0.0169, "step": 44 }, { "epoch": 0.45918367346938777, "grad_norm": 0.09177622944116592, "learning_rate": 1.795918367346939e-05, "loss": 0.0055, "step": 45 }, { "epoch": 0.46938775510204084, "grad_norm": 0.16117192804813385, "learning_rate": 1.836734693877551e-05, "loss": 0.0098, "step": 46 }, { "epoch": 0.47959183673469385, "grad_norm": 0.1534506231546402, "learning_rate": 1.8775510204081636e-05, "loss": 0.0071, "step": 47 }, { "epoch": 0.4897959183673469, "grad_norm": 0.2407277524471283, "learning_rate": 1.9183673469387756e-05, "loss": 0.0167, "step": 48 }, { "epoch": 0.5, "grad_norm": 0.1129893809556961, "learning_rate": 1.9591836734693877e-05, "loss": 0.0062, "step": 49 }, { "epoch": 0.5102040816326531, "grad_norm": 0.15642525255680084, "learning_rate": 2e-05, "loss": 0.0064, "step": 50 }, { "epoch": 0.5204081632653061, "grad_norm": 0.30184754729270935, "learning_rate": 1.9954648526077098e-05, "loss": 0.0125, "step": 51 }, { "epoch": 0.5306122448979592, "grad_norm": 0.07106052339076996, "learning_rate": 1.9909297052154198e-05, "loss": 0.0044, "step": 52 }, { "epoch": 0.5408163265306123, "grad_norm": 0.26723626255989075, "learning_rate": 1.9863945578231295e-05, "loss": 0.0193, "step": 53 }, { "epoch": 0.5510204081632653, "grad_norm": 0.09843797981739044, "learning_rate": 1.981859410430839e-05, "loss": 0.0058, "step": 54 }, { "epoch": 0.5612244897959183, "grad_norm": 0.07936914265155792, "learning_rate": 1.977324263038549e-05, "loss": 0.0043, "step": 55 }, { "epoch": 0.5714285714285714, "grad_norm": 0.06574582308530807, "learning_rate": 1.9727891156462588e-05, "loss": 0.0036, "step": 56 }, { "epoch": 0.5816326530612245, "grad_norm": 0.030592354014515877, "learning_rate": 1.9682539682539684e-05, "loss": 0.0018, "step": 57 }, { "epoch": 0.5918367346938775, "grad_norm": 0.07422778010368347, "learning_rate": 1.963718820861678e-05, "loss": 0.0039, "step": 58 }, { "epoch": 0.6020408163265306, "grad_norm": 0.05910489708185196, "learning_rate": 1.9591836734693877e-05, "loss": 0.0031, "step": 59 }, { "epoch": 0.6122448979591837, "grad_norm": 0.035012971609830856, "learning_rate": 1.9546485260770977e-05, "loss": 0.0019, "step": 60 }, { "epoch": 0.6224489795918368, "grad_norm": 0.05377289652824402, "learning_rate": 1.9501133786848074e-05, "loss": 0.003, "step": 61 }, { "epoch": 0.6326530612244898, "grad_norm": 0.059411946684122086, "learning_rate": 1.945578231292517e-05, "loss": 0.003, "step": 62 }, { "epoch": 0.6428571428571429, "grad_norm": 0.07829850167036057, "learning_rate": 1.941043083900227e-05, "loss": 0.0039, "step": 63 }, { "epoch": 0.6530612244897959, "grad_norm": 0.1004122868180275, "learning_rate": 1.9365079365079367e-05, "loss": 0.0048, "step": 64 }, { "epoch": 0.6632653061224489, "grad_norm": 0.022585352882742882, "learning_rate": 1.9319727891156463e-05, "loss": 0.0013, "step": 65 }, { "epoch": 0.673469387755102, "grad_norm": 0.08342932909727097, "learning_rate": 1.9274376417233563e-05, "loss": 0.0039, "step": 66 }, { "epoch": 0.6836734693877551, "grad_norm": 0.3428645730018616, "learning_rate": 1.922902494331066e-05, "loss": 0.0113, "step": 67 }, { "epoch": 0.6938775510204082, "grad_norm": 0.08267664909362793, "learning_rate": 1.9183673469387756e-05, "loss": 0.0042, "step": 68 }, { "epoch": 0.7040816326530612, "grad_norm": 0.07195252925157547, "learning_rate": 1.9138321995464853e-05, "loss": 0.0029, "step": 69 }, { "epoch": 0.7142857142857143, "grad_norm": 0.024874132126569748, "learning_rate": 1.9092970521541953e-05, "loss": 0.0014, "step": 70 }, { "epoch": 0.7244897959183674, "grad_norm": 0.03532341867685318, "learning_rate": 1.904761904761905e-05, "loss": 0.0012, "step": 71 }, { "epoch": 0.7346938775510204, "grad_norm": 0.01858861930668354, "learning_rate": 1.9002267573696146e-05, "loss": 0.001, "step": 72 }, { "epoch": 0.7448979591836735, "grad_norm": 0.36321982741355896, "learning_rate": 1.8956916099773243e-05, "loss": 0.0128, "step": 73 }, { "epoch": 0.7551020408163265, "grad_norm": 0.20222659409046173, "learning_rate": 1.8911564625850343e-05, "loss": 0.0076, "step": 74 }, { "epoch": 0.7653061224489796, "grad_norm": 0.07980707287788391, "learning_rate": 1.886621315192744e-05, "loss": 0.0031, "step": 75 }, { "epoch": 0.7755102040816326, "grad_norm": 0.020555464550852776, "learning_rate": 1.8820861678004536e-05, "loss": 0.0012, "step": 76 }, { "epoch": 0.7857142857142857, "grad_norm": 0.02769128419458866, "learning_rate": 1.8775510204081636e-05, "loss": 0.0014, "step": 77 }, { "epoch": 0.7959183673469388, "grad_norm": 0.030886279419064522, "learning_rate": 1.8730158730158732e-05, "loss": 0.0015, "step": 78 }, { "epoch": 0.8061224489795918, "grad_norm": 0.04239689186215401, "learning_rate": 1.868480725623583e-05, "loss": 0.0017, "step": 79 }, { "epoch": 0.8163265306122449, "grad_norm": 0.03217000514268875, "learning_rate": 1.863945578231293e-05, "loss": 0.0014, "step": 80 }, { "epoch": 0.826530612244898, "grad_norm": 0.029874522238969803, "learning_rate": 1.8594104308390025e-05, "loss": 0.0015, "step": 81 }, { "epoch": 0.8367346938775511, "grad_norm": 0.02627841755747795, "learning_rate": 1.8548752834467122e-05, "loss": 0.0013, "step": 82 }, { "epoch": 0.8469387755102041, "grad_norm": 0.019378235563635826, "learning_rate": 1.8503401360544218e-05, "loss": 0.001, "step": 83 }, { "epoch": 0.8571428571428571, "grad_norm": 0.08690612763166428, "learning_rate": 1.8458049886621315e-05, "loss": 0.0021, "step": 84 }, { "epoch": 0.8673469387755102, "grad_norm": 0.014376318082213402, "learning_rate": 1.8412698412698415e-05, "loss": 0.0008, "step": 85 }, { "epoch": 0.8775510204081632, "grad_norm": 0.01724099926650524, "learning_rate": 1.836734693877551e-05, "loss": 0.0009, "step": 86 }, { "epoch": 0.8877551020408163, "grad_norm": 0.3265489339828491, "learning_rate": 1.8321995464852608e-05, "loss": 0.0117, "step": 87 }, { "epoch": 0.8979591836734694, "grad_norm": 0.09740184992551804, "learning_rate": 1.8276643990929708e-05, "loss": 0.003, "step": 88 }, { "epoch": 0.9081632653061225, "grad_norm": 0.015478034503757954, "learning_rate": 1.8231292517006804e-05, "loss": 0.0008, "step": 89 }, { "epoch": 0.9183673469387755, "grad_norm": 0.18761862814426422, "learning_rate": 1.81859410430839e-05, "loss": 0.0068, "step": 90 }, { "epoch": 0.9285714285714286, "grad_norm": 0.038408756256103516, "learning_rate": 1.8140589569161e-05, "loss": 0.0014, "step": 91 }, { "epoch": 0.9387755102040817, "grad_norm": 0.03130817040801048, "learning_rate": 1.8095238095238097e-05, "loss": 0.0014, "step": 92 }, { "epoch": 0.9489795918367347, "grad_norm": 0.014020106755197048, "learning_rate": 1.8049886621315194e-05, "loss": 0.0007, "step": 93 }, { "epoch": 0.9591836734693877, "grad_norm": 0.02029995806515217, "learning_rate": 1.8004535147392294e-05, "loss": 0.0011, "step": 94 }, { "epoch": 0.9693877551020408, "grad_norm": 0.021185798570513725, "learning_rate": 1.795918367346939e-05, "loss": 0.0009, "step": 95 }, { "epoch": 0.9795918367346939, "grad_norm": 0.014589856378734112, "learning_rate": 1.7913832199546487e-05, "loss": 0.0008, "step": 96 }, { "epoch": 0.9897959183673469, "grad_norm": 0.022265039384365082, "learning_rate": 1.7868480725623583e-05, "loss": 0.0011, "step": 97 }, { "epoch": 1.0, "grad_norm": 0.0408700592815876, "learning_rate": 1.782312925170068e-05, "loss": 0.0011, "step": 98 }, { "epoch": 1.010204081632653, "grad_norm": 0.009026318788528442, "learning_rate": 1.7777777777777777e-05, "loss": 0.0005, "step": 99 }, { "epoch": 1.0204081632653061, "grad_norm": 0.007882497273385525, "learning_rate": 1.7732426303854877e-05, "loss": 0.0005, "step": 100 }, { "epoch": 1.030612244897959, "grad_norm": 0.025666292756795883, "learning_rate": 1.7687074829931973e-05, "loss": 0.0012, "step": 101 }, { "epoch": 1.0408163265306123, "grad_norm": 0.01795661635696888, "learning_rate": 1.7641723356009073e-05, "loss": 0.0008, "step": 102 }, { "epoch": 1.0510204081632653, "grad_norm": 0.04071149602532387, "learning_rate": 1.759637188208617e-05, "loss": 0.0016, "step": 103 }, { "epoch": 1.0612244897959184, "grad_norm": 0.007932674139738083, "learning_rate": 1.7551020408163266e-05, "loss": 0.0005, "step": 104 }, { "epoch": 1.0714285714285714, "grad_norm": 0.03695099800825119, "learning_rate": 1.7505668934240366e-05, "loss": 0.0015, "step": 105 }, { "epoch": 1.0816326530612246, "grad_norm": 0.008060461841523647, "learning_rate": 1.7460317460317463e-05, "loss": 0.0005, "step": 106 }, { "epoch": 1.0918367346938775, "grad_norm": 0.04425932839512825, "learning_rate": 1.741496598639456e-05, "loss": 0.0018, "step": 107 }, { "epoch": 1.1020408163265305, "grad_norm": 0.010241498239338398, "learning_rate": 1.736961451247166e-05, "loss": 0.0006, "step": 108 }, { "epoch": 1.1122448979591837, "grad_norm": 0.010430874302983284, "learning_rate": 1.7324263038548756e-05, "loss": 0.0006, "step": 109 }, { "epoch": 1.1224489795918366, "grad_norm": 0.16115950047969818, "learning_rate": 1.7278911564625852e-05, "loss": 0.0043, "step": 110 }, { "epoch": 1.1326530612244898, "grad_norm": 0.018837768584489822, "learning_rate": 1.723356009070295e-05, "loss": 0.0007, "step": 111 }, { "epoch": 1.1428571428571428, "grad_norm": 0.016730893403291702, "learning_rate": 1.7188208616780045e-05, "loss": 0.0009, "step": 112 }, { "epoch": 1.153061224489796, "grad_norm": 0.011841993778944016, "learning_rate": 1.7142857142857142e-05, "loss": 0.0007, "step": 113 }, { "epoch": 1.163265306122449, "grad_norm": 0.045097168534994125, "learning_rate": 1.7097505668934242e-05, "loss": 0.0019, "step": 114 }, { "epoch": 1.1734693877551021, "grad_norm": 0.09953276813030243, "learning_rate": 1.705215419501134e-05, "loss": 0.0032, "step": 115 }, { "epoch": 1.183673469387755, "grad_norm": 0.007014868780970573, "learning_rate": 1.7006802721088435e-05, "loss": 0.0004, "step": 116 }, { "epoch": 1.193877551020408, "grad_norm": 0.009045367129147053, "learning_rate": 1.6961451247165535e-05, "loss": 0.0005, "step": 117 }, { "epoch": 1.2040816326530612, "grad_norm": 0.007859342731535435, "learning_rate": 1.691609977324263e-05, "loss": 0.0005, "step": 118 }, { "epoch": 1.2142857142857142, "grad_norm": 0.032524097710847855, "learning_rate": 1.687074829931973e-05, "loss": 0.0009, "step": 119 }, { "epoch": 1.2244897959183674, "grad_norm": 0.05151795968413353, "learning_rate": 1.6825396825396828e-05, "loss": 0.0018, "step": 120 }, { "epoch": 1.2346938775510203, "grad_norm": 0.010988794267177582, "learning_rate": 1.6780045351473924e-05, "loss": 0.0006, "step": 121 }, { "epoch": 1.2448979591836735, "grad_norm": 0.006904716603457928, "learning_rate": 1.673469387755102e-05, "loss": 0.0004, "step": 122 }, { "epoch": 1.2551020408163265, "grad_norm": 0.006797518581151962, "learning_rate": 1.668934240362812e-05, "loss": 0.0004, "step": 123 }, { "epoch": 1.2653061224489797, "grad_norm": 0.01896447129547596, "learning_rate": 1.6643990929705217e-05, "loss": 0.0008, "step": 124 }, { "epoch": 1.2755102040816326, "grad_norm": 0.01258290559053421, "learning_rate": 1.6598639455782314e-05, "loss": 0.0007, "step": 125 }, { "epoch": 1.2857142857142856, "grad_norm": 0.015102504752576351, "learning_rate": 1.655328798185941e-05, "loss": 0.0006, "step": 126 }, { "epoch": 1.2959183673469388, "grad_norm": 0.005591754335910082, "learning_rate": 1.6507936507936507e-05, "loss": 0.0004, "step": 127 }, { "epoch": 1.306122448979592, "grad_norm": 0.10714168101549149, "learning_rate": 1.6462585034013607e-05, "loss": 0.0032, "step": 128 }, { "epoch": 1.316326530612245, "grad_norm": 0.030577057972550392, "learning_rate": 1.6417233560090704e-05, "loss": 0.0011, "step": 129 }, { "epoch": 1.3265306122448979, "grad_norm": 0.017115091904997826, "learning_rate": 1.63718820861678e-05, "loss": 0.0008, "step": 130 }, { "epoch": 1.336734693877551, "grad_norm": 0.011700804345309734, "learning_rate": 1.63265306122449e-05, "loss": 0.0006, "step": 131 }, { "epoch": 1.346938775510204, "grad_norm": 0.008858302608132362, "learning_rate": 1.6281179138321997e-05, "loss": 0.0004, "step": 132 }, { "epoch": 1.3571428571428572, "grad_norm": 0.00907884445041418, "learning_rate": 1.6235827664399097e-05, "loss": 0.0005, "step": 133 }, { "epoch": 1.3673469387755102, "grad_norm": 0.00645515276119113, "learning_rate": 1.6190476190476193e-05, "loss": 0.0003, "step": 134 }, { "epoch": 1.3775510204081631, "grad_norm": 0.01644102856516838, "learning_rate": 1.614512471655329e-05, "loss": 0.0006, "step": 135 }, { "epoch": 1.3877551020408163, "grad_norm": 0.023088015615940094, "learning_rate": 1.6099773242630386e-05, "loss": 0.0009, "step": 136 }, { "epoch": 1.3979591836734695, "grad_norm": 0.004741874989122152, "learning_rate": 1.6054421768707483e-05, "loss": 0.0003, "step": 137 }, { "epoch": 1.4081632653061225, "grad_norm": 0.005127857904881239, "learning_rate": 1.6009070294784583e-05, "loss": 0.0003, "step": 138 }, { "epoch": 1.4183673469387754, "grad_norm": 0.009942681528627872, "learning_rate": 1.596371882086168e-05, "loss": 0.0005, "step": 139 }, { "epoch": 1.4285714285714286, "grad_norm": 0.009013745002448559, "learning_rate": 1.5918367346938776e-05, "loss": 0.0005, "step": 140 }, { "epoch": 1.4387755102040816, "grad_norm": 0.008382913656532764, "learning_rate": 1.5873015873015872e-05, "loss": 0.0005, "step": 141 }, { "epoch": 1.4489795918367347, "grad_norm": 0.01376293320208788, "learning_rate": 1.5827664399092972e-05, "loss": 0.0006, "step": 142 }, { "epoch": 1.4591836734693877, "grad_norm": 0.07127456367015839, "learning_rate": 1.578231292517007e-05, "loss": 0.0022, "step": 143 }, { "epoch": 1.469387755102041, "grad_norm": 0.006247013341635466, "learning_rate": 1.5736961451247165e-05, "loss": 0.0004, "step": 144 }, { "epoch": 1.4795918367346939, "grad_norm": 0.03836556524038315, "learning_rate": 1.5691609977324265e-05, "loss": 0.0012, "step": 145 }, { "epoch": 1.489795918367347, "grad_norm": 0.011062193661928177, "learning_rate": 1.5646258503401362e-05, "loss": 0.0006, "step": 146 }, { "epoch": 1.5, "grad_norm": 0.005953874904662371, "learning_rate": 1.5600907029478462e-05, "loss": 0.0003, "step": 147 }, { "epoch": 1.510204081632653, "grad_norm": 0.014096422120928764, "learning_rate": 1.555555555555556e-05, "loss": 0.0008, "step": 148 }, { "epoch": 1.5204081632653061, "grad_norm": 0.0064276340417563915, "learning_rate": 1.5510204081632655e-05, "loss": 0.0004, "step": 149 }, { "epoch": 1.5306122448979593, "grad_norm": 0.007169738411903381, "learning_rate": 1.546485260770975e-05, "loss": 0.0004, "step": 150 }, { "epoch": 1.5408163265306123, "grad_norm": 0.005434677470475435, "learning_rate": 1.5419501133786848e-05, "loss": 0.0004, "step": 151 }, { "epoch": 1.5510204081632653, "grad_norm": 0.008770623244345188, "learning_rate": 1.5374149659863945e-05, "loss": 0.0004, "step": 152 }, { "epoch": 1.5612244897959182, "grad_norm": 0.011159502901136875, "learning_rate": 1.5328798185941044e-05, "loss": 0.0007, "step": 153 }, { "epoch": 1.5714285714285714, "grad_norm": 0.10471421480178833, "learning_rate": 1.528344671201814e-05, "loss": 0.0022, "step": 154 }, { "epoch": 1.5816326530612246, "grad_norm": 0.008612933568656445, "learning_rate": 1.523809523809524e-05, "loss": 0.0005, "step": 155 }, { "epoch": 1.5918367346938775, "grad_norm": 0.006375262048095465, "learning_rate": 1.5192743764172338e-05, "loss": 0.0003, "step": 156 }, { "epoch": 1.6020408163265305, "grad_norm": 0.00903844740241766, "learning_rate": 1.5147392290249434e-05, "loss": 0.0005, "step": 157 }, { "epoch": 1.6122448979591837, "grad_norm": 0.005267101805657148, "learning_rate": 1.510204081632653e-05, "loss": 0.0003, "step": 158 }, { "epoch": 1.6224489795918369, "grad_norm": 0.006081985309720039, "learning_rate": 1.505668934240363e-05, "loss": 0.0004, "step": 159 }, { "epoch": 1.6326530612244898, "grad_norm": 0.0072037833742797375, "learning_rate": 1.5011337868480727e-05, "loss": 0.0004, "step": 160 }, { "epoch": 1.6428571428571428, "grad_norm": 0.0033731532748788595, "learning_rate": 1.4965986394557825e-05, "loss": 0.0002, "step": 161 }, { "epoch": 1.6530612244897958, "grad_norm": 0.0077390824444592, "learning_rate": 1.4920634920634922e-05, "loss": 0.0005, "step": 162 }, { "epoch": 1.663265306122449, "grad_norm": 0.009692452847957611, "learning_rate": 1.4875283446712018e-05, "loss": 0.0005, "step": 163 }, { "epoch": 1.6734693877551021, "grad_norm": 0.006450532004237175, "learning_rate": 1.4829931972789118e-05, "loss": 0.0003, "step": 164 }, { "epoch": 1.683673469387755, "grad_norm": 0.009719816036522388, "learning_rate": 1.4784580498866215e-05, "loss": 0.0005, "step": 165 }, { "epoch": 1.693877551020408, "grad_norm": 0.010457034222781658, "learning_rate": 1.4739229024943311e-05, "loss": 0.0005, "step": 166 }, { "epoch": 1.7040816326530612, "grad_norm": 0.008015105500817299, "learning_rate": 1.469387755102041e-05, "loss": 0.0004, "step": 167 }, { "epoch": 1.7142857142857144, "grad_norm": 0.006307144183665514, "learning_rate": 1.4648526077097506e-05, "loss": 0.0003, "step": 168 }, { "epoch": 1.7244897959183674, "grad_norm": 0.005334992427378893, "learning_rate": 1.4603174603174603e-05, "loss": 0.0003, "step": 169 }, { "epoch": 1.7346938775510203, "grad_norm": 0.008067265152931213, "learning_rate": 1.4557823129251703e-05, "loss": 0.0003, "step": 170 }, { "epoch": 1.7448979591836735, "grad_norm": 0.012681787833571434, "learning_rate": 1.45124716553288e-05, "loss": 0.0005, "step": 171 }, { "epoch": 1.7551020408163265, "grad_norm": 0.01536930724978447, "learning_rate": 1.4467120181405896e-05, "loss": 0.0005, "step": 172 }, { "epoch": 1.7653061224489797, "grad_norm": 0.0037332891952246428, "learning_rate": 1.4421768707482994e-05, "loss": 0.0002, "step": 173 }, { "epoch": 1.7755102040816326, "grad_norm": 0.010341755114495754, "learning_rate": 1.4376417233560092e-05, "loss": 0.0005, "step": 174 }, { "epoch": 1.7857142857142856, "grad_norm": 0.0045587471686303616, "learning_rate": 1.433106575963719e-05, "loss": 0.0003, "step": 175 }, { "epoch": 1.7959183673469388, "grad_norm": 0.016639186069369316, "learning_rate": 1.4285714285714287e-05, "loss": 0.0006, "step": 176 }, { "epoch": 1.806122448979592, "grad_norm": 0.005003046710044146, "learning_rate": 1.4240362811791384e-05, "loss": 0.0003, "step": 177 }, { "epoch": 1.816326530612245, "grad_norm": 0.009210484102368355, "learning_rate": 1.4195011337868484e-05, "loss": 0.0004, "step": 178 }, { "epoch": 1.8265306122448979, "grad_norm": 0.007876208052039146, "learning_rate": 1.414965986394558e-05, "loss": 0.0004, "step": 179 }, { "epoch": 1.836734693877551, "grad_norm": 0.0038002703804522753, "learning_rate": 1.4104308390022677e-05, "loss": 0.0002, "step": 180 }, { "epoch": 1.8469387755102042, "grad_norm": 0.00423433817923069, "learning_rate": 1.4058956916099775e-05, "loss": 0.0002, "step": 181 }, { "epoch": 1.8571428571428572, "grad_norm": 0.008944015018641949, "learning_rate": 1.4013605442176872e-05, "loss": 0.0005, "step": 182 }, { "epoch": 1.8673469387755102, "grad_norm": 0.004832221195101738, "learning_rate": 1.3968253968253968e-05, "loss": 0.0003, "step": 183 }, { "epoch": 1.8775510204081631, "grad_norm": 0.005358612630516291, "learning_rate": 1.3922902494331068e-05, "loss": 0.0003, "step": 184 }, { "epoch": 1.8877551020408163, "grad_norm": 0.004266591276973486, "learning_rate": 1.3877551020408165e-05, "loss": 0.0002, "step": 185 }, { "epoch": 1.8979591836734695, "grad_norm": 0.004511923063546419, "learning_rate": 1.3832199546485261e-05, "loss": 0.0003, "step": 186 }, { "epoch": 1.9081632653061225, "grad_norm": 0.12353862076997757, "learning_rate": 1.378684807256236e-05, "loss": 0.0032, "step": 187 }, { "epoch": 1.9183673469387754, "grad_norm": 0.009472350589931011, "learning_rate": 1.3741496598639456e-05, "loss": 0.0006, "step": 188 }, { "epoch": 1.9285714285714286, "grad_norm": 0.005253692157566547, "learning_rate": 1.3696145124716554e-05, "loss": 0.0003, "step": 189 }, { "epoch": 1.9387755102040818, "grad_norm": 0.01199701614677906, "learning_rate": 1.3650793650793652e-05, "loss": 0.0005, "step": 190 }, { "epoch": 1.9489795918367347, "grad_norm": 0.006006367038935423, "learning_rate": 1.3605442176870749e-05, "loss": 0.0003, "step": 191 }, { "epoch": 1.9591836734693877, "grad_norm": 0.010423636995255947, "learning_rate": 1.3560090702947847e-05, "loss": 0.0004, "step": 192 }, { "epoch": 1.9693877551020407, "grad_norm": 0.006484678015112877, "learning_rate": 1.3514739229024945e-05, "loss": 0.0004, "step": 193 }, { "epoch": 1.9795918367346939, "grad_norm": 0.007823942229151726, "learning_rate": 1.3469387755102042e-05, "loss": 0.0004, "step": 194 }, { "epoch": 1.989795918367347, "grad_norm": 0.005013170652091503, "learning_rate": 1.342403628117914e-05, "loss": 0.0003, "step": 195 }, { "epoch": 2.0, "grad_norm": 0.003286719787865877, "learning_rate": 1.3378684807256237e-05, "loss": 0.0001, "step": 196 }, { "epoch": 2.010204081632653, "grad_norm": 0.007182662840932608, "learning_rate": 1.3333333333333333e-05, "loss": 0.0003, "step": 197 }, { "epoch": 2.020408163265306, "grad_norm": 0.007124132476747036, "learning_rate": 1.3287981859410433e-05, "loss": 0.0003, "step": 198 }, { "epoch": 2.0306122448979593, "grad_norm": 0.0033109758514910936, "learning_rate": 1.324263038548753e-05, "loss": 0.0002, "step": 199 }, { "epoch": 2.0408163265306123, "grad_norm": 0.0030752080492675304, "learning_rate": 1.3197278911564626e-05, "loss": 0.0002, "step": 200 }, { "epoch": 2.0510204081632653, "grad_norm": 0.003937916364520788, "learning_rate": 1.3151927437641725e-05, "loss": 0.0003, "step": 201 }, { "epoch": 2.061224489795918, "grad_norm": 0.003902744734659791, "learning_rate": 1.3106575963718821e-05, "loss": 0.0002, "step": 202 }, { "epoch": 2.0714285714285716, "grad_norm": 0.003365420503541827, "learning_rate": 1.3061224489795918e-05, "loss": 0.0002, "step": 203 }, { "epoch": 2.0816326530612246, "grad_norm": 0.005177025683224201, "learning_rate": 1.3015873015873018e-05, "loss": 0.0003, "step": 204 }, { "epoch": 2.0918367346938775, "grad_norm": 0.004482835531234741, "learning_rate": 1.2970521541950114e-05, "loss": 0.0003, "step": 205 }, { "epoch": 2.1020408163265305, "grad_norm": 0.03137246519327164, "learning_rate": 1.2925170068027212e-05, "loss": 0.0008, "step": 206 }, { "epoch": 2.1122448979591835, "grad_norm": 0.0061664879322052, "learning_rate": 1.2879818594104309e-05, "loss": 0.0004, "step": 207 }, { "epoch": 2.122448979591837, "grad_norm": 0.007974425330758095, "learning_rate": 1.2834467120181407e-05, "loss": 0.0004, "step": 208 }, { "epoch": 2.13265306122449, "grad_norm": 0.008234084583818913, "learning_rate": 1.2789115646258505e-05, "loss": 0.0004, "step": 209 }, { "epoch": 2.142857142857143, "grad_norm": 0.005270775873214006, "learning_rate": 1.2743764172335602e-05, "loss": 0.0003, "step": 210 }, { "epoch": 2.1530612244897958, "grad_norm": 0.008359711617231369, "learning_rate": 1.2698412698412699e-05, "loss": 0.0004, "step": 211 }, { "epoch": 2.163265306122449, "grad_norm": 0.004007325973361731, "learning_rate": 1.2653061224489798e-05, "loss": 0.0002, "step": 212 }, { "epoch": 2.173469387755102, "grad_norm": 0.004752746783196926, "learning_rate": 1.2607709750566895e-05, "loss": 0.0002, "step": 213 }, { "epoch": 2.183673469387755, "grad_norm": 0.0031563639640808105, "learning_rate": 1.2562358276643992e-05, "loss": 0.0002, "step": 214 }, { "epoch": 2.193877551020408, "grad_norm": 0.003636228386312723, "learning_rate": 1.251700680272109e-05, "loss": 0.0002, "step": 215 }, { "epoch": 2.204081632653061, "grad_norm": 0.0034094173461198807, "learning_rate": 1.2471655328798186e-05, "loss": 0.0002, "step": 216 }, { "epoch": 2.2142857142857144, "grad_norm": 0.004791253712028265, "learning_rate": 1.2426303854875283e-05, "loss": 0.0003, "step": 217 }, { "epoch": 2.2244897959183674, "grad_norm": 0.010279831476509571, "learning_rate": 1.2380952380952383e-05, "loss": 0.0004, "step": 218 }, { "epoch": 2.2346938775510203, "grad_norm": 0.006269859150052071, "learning_rate": 1.233560090702948e-05, "loss": 0.0003, "step": 219 }, { "epoch": 2.2448979591836733, "grad_norm": 0.003878034185618162, "learning_rate": 1.2290249433106578e-05, "loss": 0.0002, "step": 220 }, { "epoch": 2.2551020408163267, "grad_norm": 0.0031356397084891796, "learning_rate": 1.2244897959183674e-05, "loss": 0.0002, "step": 221 }, { "epoch": 2.2653061224489797, "grad_norm": 0.004956800024956465, "learning_rate": 1.219954648526077e-05, "loss": 0.0003, "step": 222 }, { "epoch": 2.2755102040816326, "grad_norm": 0.0036491460632532835, "learning_rate": 1.215419501133787e-05, "loss": 0.0002, "step": 223 }, { "epoch": 2.2857142857142856, "grad_norm": 0.005171376280486584, "learning_rate": 1.2108843537414967e-05, "loss": 0.0003, "step": 224 }, { "epoch": 2.295918367346939, "grad_norm": 0.0029648093041032553, "learning_rate": 1.2063492063492064e-05, "loss": 0.0002, "step": 225 }, { "epoch": 2.306122448979592, "grad_norm": 0.006329487543553114, "learning_rate": 1.2018140589569162e-05, "loss": 0.0003, "step": 226 }, { "epoch": 2.316326530612245, "grad_norm": 0.0031556261237710714, "learning_rate": 1.197278911564626e-05, "loss": 0.0002, "step": 227 }, { "epoch": 2.326530612244898, "grad_norm": 0.009794807992875576, "learning_rate": 1.1927437641723357e-05, "loss": 0.0004, "step": 228 }, { "epoch": 2.336734693877551, "grad_norm": 0.003714526304975152, "learning_rate": 1.1882086167800455e-05, "loss": 0.0002, "step": 229 }, { "epoch": 2.3469387755102042, "grad_norm": 0.0031528149265795946, "learning_rate": 1.1836734693877552e-05, "loss": 0.0002, "step": 230 }, { "epoch": 2.357142857142857, "grad_norm": 0.024612465873360634, "learning_rate": 1.1791383219954648e-05, "loss": 0.001, "step": 231 }, { "epoch": 2.36734693877551, "grad_norm": 0.00424389261752367, "learning_rate": 1.1746031746031748e-05, "loss": 0.0002, "step": 232 }, { "epoch": 2.377551020408163, "grad_norm": 0.01282750815153122, "learning_rate": 1.1700680272108845e-05, "loss": 0.0006, "step": 233 }, { "epoch": 2.387755102040816, "grad_norm": 0.006169433705508709, "learning_rate": 1.1655328798185943e-05, "loss": 0.0003, "step": 234 }, { "epoch": 2.3979591836734695, "grad_norm": 0.005562425125390291, "learning_rate": 1.160997732426304e-05, "loss": 0.0003, "step": 235 }, { "epoch": 2.4081632653061225, "grad_norm": 0.01002059318125248, "learning_rate": 1.1564625850340136e-05, "loss": 0.0005, "step": 236 }, { "epoch": 2.4183673469387754, "grad_norm": 0.007645392790436745, "learning_rate": 1.1519274376417236e-05, "loss": 0.0004, "step": 237 }, { "epoch": 2.4285714285714284, "grad_norm": 0.03314538300037384, "learning_rate": 1.1473922902494332e-05, "loss": 0.0011, "step": 238 }, { "epoch": 2.438775510204082, "grad_norm": 0.04838201776146889, "learning_rate": 1.1428571428571429e-05, "loss": 0.0009, "step": 239 }, { "epoch": 2.4489795918367347, "grad_norm": 0.006126615218818188, "learning_rate": 1.1383219954648527e-05, "loss": 0.0004, "step": 240 }, { "epoch": 2.4591836734693877, "grad_norm": 0.005240059457719326, "learning_rate": 1.1337868480725624e-05, "loss": 0.0003, "step": 241 }, { "epoch": 2.4693877551020407, "grad_norm": 0.006122751161456108, "learning_rate": 1.1292517006802722e-05, "loss": 0.0003, "step": 242 }, { "epoch": 2.479591836734694, "grad_norm": 0.0024781699758023024, "learning_rate": 1.124716553287982e-05, "loss": 0.0002, "step": 243 }, { "epoch": 2.489795918367347, "grad_norm": 0.00649678660556674, "learning_rate": 1.1201814058956917e-05, "loss": 0.0004, "step": 244 }, { "epoch": 2.5, "grad_norm": 0.003478443017229438, "learning_rate": 1.1156462585034013e-05, "loss": 0.0002, "step": 245 }, { "epoch": 2.510204081632653, "grad_norm": 0.003858257783576846, "learning_rate": 1.1111111111111113e-05, "loss": 0.0002, "step": 246 }, { "epoch": 2.520408163265306, "grad_norm": 0.006577960215508938, "learning_rate": 1.106575963718821e-05, "loss": 0.0004, "step": 247 }, { "epoch": 2.5306122448979593, "grad_norm": 0.004543396644294262, "learning_rate": 1.1020408163265306e-05, "loss": 0.0003, "step": 248 }, { "epoch": 2.5408163265306123, "grad_norm": 0.0032837800681591034, "learning_rate": 1.0975056689342405e-05, "loss": 0.0002, "step": 249 }, { "epoch": 2.5510204081632653, "grad_norm": 0.012741784565150738, "learning_rate": 1.0929705215419501e-05, "loss": 0.0006, "step": 250 }, { "epoch": 2.561224489795918, "grad_norm": 0.002741026459261775, "learning_rate": 1.0884353741496601e-05, "loss": 0.0002, "step": 251 }, { "epoch": 2.571428571428571, "grad_norm": 0.0027930692303925753, "learning_rate": 1.0839002267573698e-05, "loss": 0.0002, "step": 252 }, { "epoch": 2.5816326530612246, "grad_norm": 0.003156407503411174, "learning_rate": 1.0793650793650794e-05, "loss": 0.0002, "step": 253 }, { "epoch": 2.5918367346938775, "grad_norm": 0.0036412908229976892, "learning_rate": 1.0748299319727893e-05, "loss": 0.0002, "step": 254 }, { "epoch": 2.6020408163265305, "grad_norm": 0.04034988954663277, "learning_rate": 1.0702947845804989e-05, "loss": 0.0013, "step": 255 }, { "epoch": 2.612244897959184, "grad_norm": 0.0034895159769803286, "learning_rate": 1.0657596371882086e-05, "loss": 0.0002, "step": 256 }, { "epoch": 2.622448979591837, "grad_norm": 0.04325950890779495, "learning_rate": 1.0612244897959186e-05, "loss": 0.0012, "step": 257 }, { "epoch": 2.63265306122449, "grad_norm": 0.004671269562095404, "learning_rate": 1.0566893424036282e-05, "loss": 0.0003, "step": 258 }, { "epoch": 2.642857142857143, "grad_norm": 0.002534637926146388, "learning_rate": 1.0521541950113379e-05, "loss": 0.0002, "step": 259 }, { "epoch": 2.6530612244897958, "grad_norm": 0.004457306116819382, "learning_rate": 1.0476190476190477e-05, "loss": 0.0003, "step": 260 }, { "epoch": 2.663265306122449, "grad_norm": 0.004050545394420624, "learning_rate": 1.0430839002267575e-05, "loss": 0.0002, "step": 261 }, { "epoch": 2.673469387755102, "grad_norm": 0.04582836106419563, "learning_rate": 1.0385487528344672e-05, "loss": 0.0011, "step": 262 }, { "epoch": 2.683673469387755, "grad_norm": 0.004835136700421572, "learning_rate": 1.034013605442177e-05, "loss": 0.0003, "step": 263 }, { "epoch": 2.693877551020408, "grad_norm": 0.008025884628295898, "learning_rate": 1.0294784580498866e-05, "loss": 0.0003, "step": 264 }, { "epoch": 2.704081632653061, "grad_norm": 0.007876653224229813, "learning_rate": 1.0249433106575966e-05, "loss": 0.0004, "step": 265 }, { "epoch": 2.7142857142857144, "grad_norm": 0.006527318619191647, "learning_rate": 1.0204081632653063e-05, "loss": 0.0003, "step": 266 }, { "epoch": 2.7244897959183674, "grad_norm": 0.002143925055861473, "learning_rate": 1.015873015873016e-05, "loss": 0.0001, "step": 267 }, { "epoch": 2.7346938775510203, "grad_norm": 0.003183850785717368, "learning_rate": 1.0113378684807258e-05, "loss": 0.0002, "step": 268 }, { "epoch": 2.7448979591836737, "grad_norm": 0.003816920565441251, "learning_rate": 1.0068027210884354e-05, "loss": 0.0002, "step": 269 }, { "epoch": 2.7551020408163263, "grad_norm": 0.005489765666425228, "learning_rate": 1.0022675736961451e-05, "loss": 0.0003, "step": 270 }, { "epoch": 2.7653061224489797, "grad_norm": 0.002469045575708151, "learning_rate": 9.977324263038549e-06, "loss": 0.0002, "step": 271 }, { "epoch": 2.7755102040816326, "grad_norm": 0.0031796926632523537, "learning_rate": 9.931972789115647e-06, "loss": 0.0002, "step": 272 }, { "epoch": 2.7857142857142856, "grad_norm": 0.003679267829284072, "learning_rate": 9.886621315192746e-06, "loss": 0.0002, "step": 273 }, { "epoch": 2.795918367346939, "grad_norm": 0.010035431012511253, "learning_rate": 9.841269841269842e-06, "loss": 0.0004, "step": 274 }, { "epoch": 2.806122448979592, "grad_norm": 0.0031564754899591208, "learning_rate": 9.795918367346939e-06, "loss": 0.0002, "step": 275 }, { "epoch": 2.816326530612245, "grad_norm": 0.005581808276474476, "learning_rate": 9.750566893424037e-06, "loss": 0.0003, "step": 276 }, { "epoch": 2.826530612244898, "grad_norm": 0.002813218394294381, "learning_rate": 9.705215419501135e-06, "loss": 0.0002, "step": 277 }, { "epoch": 2.836734693877551, "grad_norm": 0.003005703678354621, "learning_rate": 9.659863945578232e-06, "loss": 0.0002, "step": 278 }, { "epoch": 2.8469387755102042, "grad_norm": 0.0068191043101251125, "learning_rate": 9.61451247165533e-06, "loss": 0.0004, "step": 279 }, { "epoch": 2.857142857142857, "grad_norm": 0.008098878897726536, "learning_rate": 9.569160997732427e-06, "loss": 0.0004, "step": 280 }, { "epoch": 2.86734693877551, "grad_norm": 0.014086649753153324, "learning_rate": 9.523809523809525e-06, "loss": 0.0004, "step": 281 }, { "epoch": 2.877551020408163, "grad_norm": 0.004192018415778875, "learning_rate": 9.478458049886621e-06, "loss": 0.0002, "step": 282 }, { "epoch": 2.887755102040816, "grad_norm": 0.0025980896316468716, "learning_rate": 9.43310657596372e-06, "loss": 0.0002, "step": 283 }, { "epoch": 2.8979591836734695, "grad_norm": 0.010852901265025139, "learning_rate": 9.387755102040818e-06, "loss": 0.0004, "step": 284 }, { "epoch": 2.9081632653061225, "grad_norm": 0.003038214286789298, "learning_rate": 9.342403628117914e-06, "loss": 0.0002, "step": 285 }, { "epoch": 2.9183673469387754, "grad_norm": 0.0038609837647527456, "learning_rate": 9.297052154195013e-06, "loss": 0.0002, "step": 286 }, { "epoch": 2.928571428571429, "grad_norm": 0.0073219058103859425, "learning_rate": 9.251700680272109e-06, "loss": 0.0004, "step": 287 }, { "epoch": 2.938775510204082, "grad_norm": 0.004274952691048384, "learning_rate": 9.206349206349207e-06, "loss": 0.0003, "step": 288 }, { "epoch": 2.9489795918367347, "grad_norm": 0.003952549304813147, "learning_rate": 9.160997732426304e-06, "loss": 0.0002, "step": 289 }, { "epoch": 2.9591836734693877, "grad_norm": 0.03005502186715603, "learning_rate": 9.115646258503402e-06, "loss": 0.0006, "step": 290 }, { "epoch": 2.9693877551020407, "grad_norm": 0.0023858973290771246, "learning_rate": 9.0702947845805e-06, "loss": 0.0002, "step": 291 }, { "epoch": 2.979591836734694, "grad_norm": 0.004339877981692553, "learning_rate": 9.024943310657597e-06, "loss": 0.0003, "step": 292 }, { "epoch": 2.989795918367347, "grad_norm": 0.006001957226544619, "learning_rate": 8.979591836734695e-06, "loss": 0.0003, "step": 293 }, { "epoch": 3.0, "grad_norm": 0.007597202900797129, "learning_rate": 8.934240362811792e-06, "loss": 0.0002, "step": 294 }, { "epoch": 3.010204081632653, "grad_norm": 0.003272986738011241, "learning_rate": 8.888888888888888e-06, "loss": 0.0002, "step": 295 }, { "epoch": 3.020408163265306, "grad_norm": 0.0023373092990368605, "learning_rate": 8.843537414965987e-06, "loss": 0.0001, "step": 296 }, { "epoch": 3.0306122448979593, "grad_norm": 0.0037628381978720427, "learning_rate": 8.798185941043085e-06, "loss": 0.0002, "step": 297 }, { "epoch": 3.0408163265306123, "grad_norm": 0.011344632133841515, "learning_rate": 8.752834467120183e-06, "loss": 0.0005, "step": 298 }, { "epoch": 3.0510204081632653, "grad_norm": 0.009169838391244411, "learning_rate": 8.70748299319728e-06, "loss": 0.0004, "step": 299 }, { "epoch": 3.061224489795918, "grad_norm": 0.019571438431739807, "learning_rate": 8.662131519274378e-06, "loss": 0.0005, "step": 300 }, { "epoch": 3.0714285714285716, "grad_norm": 0.0035050984006375074, "learning_rate": 8.616780045351474e-06, "loss": 0.0002, "step": 301 }, { "epoch": 3.0816326530612246, "grad_norm": 0.004051654599606991, "learning_rate": 8.571428571428571e-06, "loss": 0.0002, "step": 302 }, { "epoch": 3.0918367346938775, "grad_norm": 0.002926639514043927, "learning_rate": 8.52607709750567e-06, "loss": 0.0002, "step": 303 }, { "epoch": 3.1020408163265305, "grad_norm": 0.013055351562798023, "learning_rate": 8.480725623582767e-06, "loss": 0.0004, "step": 304 }, { "epoch": 3.1122448979591835, "grad_norm": 0.004692048765718937, "learning_rate": 8.435374149659866e-06, "loss": 0.0002, "step": 305 }, { "epoch": 3.122448979591837, "grad_norm": 0.0025202229153364897, "learning_rate": 8.390022675736962e-06, "loss": 0.0002, "step": 306 }, { "epoch": 3.13265306122449, "grad_norm": 0.005598872900009155, "learning_rate": 8.34467120181406e-06, "loss": 0.0002, "step": 307 }, { "epoch": 3.142857142857143, "grad_norm": 0.0032469748985022306, "learning_rate": 8.299319727891157e-06, "loss": 0.0002, "step": 308 }, { "epoch": 3.1530612244897958, "grad_norm": 0.004803687799721956, "learning_rate": 8.253968253968254e-06, "loss": 0.0003, "step": 309 }, { "epoch": 3.163265306122449, "grad_norm": 0.0046676271595060825, "learning_rate": 8.208616780045352e-06, "loss": 0.0003, "step": 310 }, { "epoch": 3.173469387755102, "grad_norm": 0.002468443475663662, "learning_rate": 8.16326530612245e-06, "loss": 0.0002, "step": 311 }, { "epoch": 3.183673469387755, "grad_norm": 0.006342902779579163, "learning_rate": 8.117913832199548e-06, "loss": 0.0004, "step": 312 }, { "epoch": 3.193877551020408, "grad_norm": 0.0023443913087248802, "learning_rate": 8.072562358276645e-06, "loss": 0.0002, "step": 313 }, { "epoch": 3.204081632653061, "grad_norm": 0.0020017994102090597, "learning_rate": 8.027210884353741e-06, "loss": 0.0001, "step": 314 }, { "epoch": 3.2142857142857144, "grad_norm": 0.0026365304365754128, "learning_rate": 7.98185941043084e-06, "loss": 0.0002, "step": 315 }, { "epoch": 3.2244897959183674, "grad_norm": 0.0056705656461417675, "learning_rate": 7.936507936507936e-06, "loss": 0.0004, "step": 316 }, { "epoch": 3.2346938775510203, "grad_norm": 0.009689562022686005, "learning_rate": 7.891156462585034e-06, "loss": 0.0004, "step": 317 }, { "epoch": 3.2448979591836733, "grad_norm": 0.008008199743926525, "learning_rate": 7.845804988662133e-06, "loss": 0.0003, "step": 318 }, { "epoch": 3.2551020408163267, "grad_norm": 0.0026869464199990034, "learning_rate": 7.800453514739231e-06, "loss": 0.0002, "step": 319 }, { "epoch": 3.2653061224489797, "grad_norm": 0.0031625712290406227, "learning_rate": 7.755102040816327e-06, "loss": 0.0002, "step": 320 }, { "epoch": 3.2755102040816326, "grad_norm": 0.0028163609094917774, "learning_rate": 7.709750566893424e-06, "loss": 0.0002, "step": 321 }, { "epoch": 3.2857142857142856, "grad_norm": 0.005660755559802055, "learning_rate": 7.664399092970522e-06, "loss": 0.0003, "step": 322 }, { "epoch": 3.295918367346939, "grad_norm": 0.0039995694532990456, "learning_rate": 7.61904761904762e-06, "loss": 0.0003, "step": 323 }, { "epoch": 3.306122448979592, "grad_norm": 0.005670357029885054, "learning_rate": 7.573696145124717e-06, "loss": 0.0003, "step": 324 }, { "epoch": 3.316326530612245, "grad_norm": 0.0025813328102231026, "learning_rate": 7.528344671201815e-06, "loss": 0.0002, "step": 325 }, { "epoch": 3.326530612244898, "grad_norm": 0.0030447246972471476, "learning_rate": 7.482993197278913e-06, "loss": 0.0002, "step": 326 }, { "epoch": 3.336734693877551, "grad_norm": 0.0019142641685903072, "learning_rate": 7.437641723356009e-06, "loss": 0.0001, "step": 327 }, { "epoch": 3.3469387755102042, "grad_norm": 0.004176548682153225, "learning_rate": 7.3922902494331075e-06, "loss": 0.0002, "step": 328 }, { "epoch": 3.357142857142857, "grad_norm": 0.009249200113117695, "learning_rate": 7.346938775510205e-06, "loss": 0.0004, "step": 329 }, { "epoch": 3.36734693877551, "grad_norm": 0.005048077553510666, "learning_rate": 7.301587301587301e-06, "loss": 0.0002, "step": 330 }, { "epoch": 3.377551020408163, "grad_norm": 0.0024696551263332367, "learning_rate": 7.2562358276644e-06, "loss": 0.0002, "step": 331 }, { "epoch": 3.387755102040816, "grad_norm": 0.002270912518724799, "learning_rate": 7.210884353741497e-06, "loss": 0.0002, "step": 332 }, { "epoch": 3.3979591836734695, "grad_norm": 0.0020916208159178495, "learning_rate": 7.165532879818595e-06, "loss": 0.0001, "step": 333 }, { "epoch": 3.4081632653061225, "grad_norm": 0.002925699343904853, "learning_rate": 7.120181405895692e-06, "loss": 0.0002, "step": 334 }, { "epoch": 3.4183673469387754, "grad_norm": 0.003520503407344222, "learning_rate": 7.07482993197279e-06, "loss": 0.0002, "step": 335 }, { "epoch": 3.4285714285714284, "grad_norm": 0.0024117021821439266, "learning_rate": 7.0294784580498875e-06, "loss": 0.0001, "step": 336 }, { "epoch": 3.438775510204082, "grad_norm": 0.011653084307909012, "learning_rate": 6.984126984126984e-06, "loss": 0.0005, "step": 337 }, { "epoch": 3.4489795918367347, "grad_norm": 0.0021838736720383167, "learning_rate": 6.938775510204082e-06, "loss": 0.0001, "step": 338 }, { "epoch": 3.4591836734693877, "grad_norm": 0.005142625421285629, "learning_rate": 6.89342403628118e-06, "loss": 0.0003, "step": 339 }, { "epoch": 3.4693877551020407, "grad_norm": 0.009561867453157902, "learning_rate": 6.848072562358277e-06, "loss": 0.0003, "step": 340 }, { "epoch": 3.479591836734694, "grad_norm": 0.0029086614958941936, "learning_rate": 6.8027210884353745e-06, "loss": 0.0002, "step": 341 }, { "epoch": 3.489795918367347, "grad_norm": 0.004574024584144354, "learning_rate": 6.757369614512473e-06, "loss": 0.0002, "step": 342 }, { "epoch": 3.5, "grad_norm": 0.002138937823474407, "learning_rate": 6.71201814058957e-06, "loss": 0.0001, "step": 343 }, { "epoch": 3.510204081632653, "grad_norm": 0.003517791396006942, "learning_rate": 6.666666666666667e-06, "loss": 0.0002, "step": 344 }, { "epoch": 3.520408163265306, "grad_norm": 0.03443054482340813, "learning_rate": 6.621315192743765e-06, "loss": 0.0008, "step": 345 }, { "epoch": 3.5306122448979593, "grad_norm": 0.008042026311159134, "learning_rate": 6.575963718820862e-06, "loss": 0.0002, "step": 346 }, { "epoch": 3.5408163265306123, "grad_norm": 0.0047872308641672134, "learning_rate": 6.530612244897959e-06, "loss": 0.0003, "step": 347 }, { "epoch": 3.5510204081632653, "grad_norm": 0.010120042599737644, "learning_rate": 6.485260770975057e-06, "loss": 0.0003, "step": 348 }, { "epoch": 3.561224489795918, "grad_norm": 0.004412388429045677, "learning_rate": 6.4399092970521545e-06, "loss": 0.0003, "step": 349 }, { "epoch": 3.571428571428571, "grad_norm": 0.003939002752304077, "learning_rate": 6.394557823129253e-06, "loss": 0.0002, "step": 350 }, { "epoch": 3.5816326530612246, "grad_norm": 0.003072823630645871, "learning_rate": 6.349206349206349e-06, "loss": 0.0002, "step": 351 }, { "epoch": 3.5918367346938775, "grad_norm": 0.002594695193693042, "learning_rate": 6.3038548752834475e-06, "loss": 0.0002, "step": 352 }, { "epoch": 3.6020408163265305, "grad_norm": 0.0021781930699944496, "learning_rate": 6.258503401360545e-06, "loss": 0.0001, "step": 353 }, { "epoch": 3.612244897959184, "grad_norm": 0.004957903642207384, "learning_rate": 6.2131519274376415e-06, "loss": 0.0002, "step": 354 }, { "epoch": 3.622448979591837, "grad_norm": 0.0020664699841290712, "learning_rate": 6.16780045351474e-06, "loss": 0.0001, "step": 355 }, { "epoch": 3.63265306122449, "grad_norm": 0.00455419672653079, "learning_rate": 6.122448979591837e-06, "loss": 0.0002, "step": 356 }, { "epoch": 3.642857142857143, "grad_norm": 0.001891249674372375, "learning_rate": 6.077097505668935e-06, "loss": 0.0001, "step": 357 }, { "epoch": 3.6530612244897958, "grad_norm": 0.0015174165600910783, "learning_rate": 6.031746031746032e-06, "loss": 0.0001, "step": 358 }, { "epoch": 3.663265306122449, "grad_norm": 0.008895975537598133, "learning_rate": 5.98639455782313e-06, "loss": 0.0003, "step": 359 }, { "epoch": 3.673469387755102, "grad_norm": 0.010570279322564602, "learning_rate": 5.9410430839002275e-06, "loss": 0.0003, "step": 360 }, { "epoch": 3.683673469387755, "grad_norm": 0.005755205638706684, "learning_rate": 5.895691609977324e-06, "loss": 0.0002, "step": 361 }, { "epoch": 3.693877551020408, "grad_norm": 0.00319477915763855, "learning_rate": 5.850340136054422e-06, "loss": 0.0002, "step": 362 }, { "epoch": 3.704081632653061, "grad_norm": 0.0023295124992728233, "learning_rate": 5.80498866213152e-06, "loss": 0.0001, "step": 363 }, { "epoch": 3.7142857142857144, "grad_norm": 0.0038169752806425095, "learning_rate": 5.759637188208618e-06, "loss": 0.0003, "step": 364 }, { "epoch": 3.7244897959183674, "grad_norm": 0.007799374870955944, "learning_rate": 5.7142857142857145e-06, "loss": 0.0003, "step": 365 }, { "epoch": 3.7346938775510203, "grad_norm": 0.002488058526068926, "learning_rate": 5.668934240362812e-06, "loss": 0.0002, "step": 366 }, { "epoch": 3.7448979591836737, "grad_norm": 0.01512609887868166, "learning_rate": 5.62358276643991e-06, "loss": 0.0006, "step": 367 }, { "epoch": 3.7551020408163263, "grad_norm": 0.004572188016027212, "learning_rate": 5.578231292517007e-06, "loss": 0.0003, "step": 368 }, { "epoch": 3.7653061224489797, "grad_norm": 0.0024051195941865444, "learning_rate": 5.532879818594105e-06, "loss": 0.0002, "step": 369 }, { "epoch": 3.7755102040816326, "grad_norm": 0.0032509195152670145, "learning_rate": 5.487528344671202e-06, "loss": 0.0002, "step": 370 }, { "epoch": 3.7857142857142856, "grad_norm": 0.0019066549139097333, "learning_rate": 5.442176870748301e-06, "loss": 0.0001, "step": 371 }, { "epoch": 3.795918367346939, "grad_norm": 0.004059778060764074, "learning_rate": 5.396825396825397e-06, "loss": 0.0002, "step": 372 }, { "epoch": 3.806122448979592, "grad_norm": 0.003823889186605811, "learning_rate": 5.3514739229024945e-06, "loss": 0.0002, "step": 373 }, { "epoch": 3.816326530612245, "grad_norm": 0.005696111358702183, "learning_rate": 5.306122448979593e-06, "loss": 0.0003, "step": 374 }, { "epoch": 3.826530612244898, "grad_norm": 0.002276304177939892, "learning_rate": 5.260770975056689e-06, "loss": 0.0001, "step": 375 }, { "epoch": 3.836734693877551, "grad_norm": 0.003423569956794381, "learning_rate": 5.2154195011337876e-06, "loss": 0.0002, "step": 376 }, { "epoch": 3.8469387755102042, "grad_norm": 0.009261609055101871, "learning_rate": 5.170068027210885e-06, "loss": 0.0004, "step": 377 }, { "epoch": 3.857142857142857, "grad_norm": 0.0026830616407096386, "learning_rate": 5.124716553287983e-06, "loss": 0.0002, "step": 378 }, { "epoch": 3.86734693877551, "grad_norm": 0.007292145863175392, "learning_rate": 5.07936507936508e-06, "loss": 0.0003, "step": 379 }, { "epoch": 3.877551020408163, "grad_norm": 0.001463556895032525, "learning_rate": 5.034013605442177e-06, "loss": 0.0001, "step": 380 }, { "epoch": 3.887755102040816, "grad_norm": 0.0077773998491466045, "learning_rate": 4.9886621315192745e-06, "loss": 0.0003, "step": 381 }, { "epoch": 3.8979591836734695, "grad_norm": 0.001333568710833788, "learning_rate": 4.943310657596373e-06, "loss": 0.0001, "step": 382 }, { "epoch": 3.9081632653061225, "grad_norm": 0.0033744387328624725, "learning_rate": 4.897959183673469e-06, "loss": 0.0002, "step": 383 }, { "epoch": 3.9183673469387754, "grad_norm": 0.0031404553446918726, "learning_rate": 4.852607709750568e-06, "loss": 0.0002, "step": 384 }, { "epoch": 3.928571428571429, "grad_norm": 0.002246819669380784, "learning_rate": 4.807256235827665e-06, "loss": 0.0002, "step": 385 }, { "epoch": 3.938775510204082, "grad_norm": 0.006392229348421097, "learning_rate": 4.761904761904762e-06, "loss": 0.0003, "step": 386 }, { "epoch": 3.9489795918367347, "grad_norm": 0.002482037292793393, "learning_rate": 4.71655328798186e-06, "loss": 0.0002, "step": 387 }, { "epoch": 3.9591836734693877, "grad_norm": 0.0029472103342413902, "learning_rate": 4.671201814058957e-06, "loss": 0.0002, "step": 388 }, { "epoch": 3.9693877551020407, "grad_norm": 0.001341557246632874, "learning_rate": 4.6258503401360546e-06, "loss": 0.0001, "step": 389 }, { "epoch": 3.979591836734694, "grad_norm": 0.003789098234847188, "learning_rate": 4.580498866213152e-06, "loss": 0.0002, "step": 390 }, { "epoch": 3.989795918367347, "grad_norm": 0.0022696068044751883, "learning_rate": 4.53514739229025e-06, "loss": 0.0001, "step": 391 }, { "epoch": 4.0, "grad_norm": 0.0036536771804094315, "learning_rate": 4.489795918367348e-06, "loss": 0.0001, "step": 392 }, { "epoch": 4.010204081632653, "grad_norm": 0.0017391832079738379, "learning_rate": 4.444444444444444e-06, "loss": 0.0001, "step": 393 }, { "epoch": 4.020408163265306, "grad_norm": 0.003093178616836667, "learning_rate": 4.399092970521542e-06, "loss": 0.0002, "step": 394 }, { "epoch": 4.030612244897959, "grad_norm": 0.0017501730471849442, "learning_rate": 4.35374149659864e-06, "loss": 0.0001, "step": 395 }, { "epoch": 4.040816326530612, "grad_norm": 0.03281351551413536, "learning_rate": 4.308390022675737e-06, "loss": 0.0007, "step": 396 }, { "epoch": 4.051020408163265, "grad_norm": 0.002625512657687068, "learning_rate": 4.263038548752835e-06, "loss": 0.0002, "step": 397 }, { "epoch": 4.061224489795919, "grad_norm": 0.003758464241400361, "learning_rate": 4.217687074829933e-06, "loss": 0.0002, "step": 398 }, { "epoch": 4.071428571428571, "grad_norm": 0.0021065385080873966, "learning_rate": 4.17233560090703e-06, "loss": 0.0001, "step": 399 }, { "epoch": 4.081632653061225, "grad_norm": 0.0022317173425108194, "learning_rate": 4.126984126984127e-06, "loss": 0.0001, "step": 400 }, { "epoch": 4.091836734693878, "grad_norm": 0.0025758843403309584, "learning_rate": 4.081632653061225e-06, "loss": 0.0002, "step": 401 }, { "epoch": 4.1020408163265305, "grad_norm": 0.003262228099629283, "learning_rate": 4.036281179138322e-06, "loss": 0.0002, "step": 402 }, { "epoch": 4.112244897959184, "grad_norm": 0.002355805365368724, "learning_rate": 3.99092970521542e-06, "loss": 0.0001, "step": 403 }, { "epoch": 4.122448979591836, "grad_norm": 0.002239174908027053, "learning_rate": 3.945578231292517e-06, "loss": 0.0001, "step": 404 }, { "epoch": 4.13265306122449, "grad_norm": 0.003173491917550564, "learning_rate": 3.9002267573696154e-06, "loss": 0.0002, "step": 405 }, { "epoch": 4.142857142857143, "grad_norm": 0.009472887963056564, "learning_rate": 3.854875283446712e-06, "loss": 0.0004, "step": 406 }, { "epoch": 4.153061224489796, "grad_norm": 0.010682443156838417, "learning_rate": 3.80952380952381e-06, "loss": 0.0004, "step": 407 }, { "epoch": 4.163265306122449, "grad_norm": 0.01789182610809803, "learning_rate": 3.7641723356009076e-06, "loss": 0.0006, "step": 408 }, { "epoch": 4.173469387755102, "grad_norm": 0.002530967351049185, "learning_rate": 3.7188208616780046e-06, "loss": 0.0001, "step": 409 }, { "epoch": 4.183673469387755, "grad_norm": 0.0029371839482337236, "learning_rate": 3.6734693877551024e-06, "loss": 0.0002, "step": 410 }, { "epoch": 4.1938775510204085, "grad_norm": 0.004367890767753124, "learning_rate": 3.6281179138322e-06, "loss": 0.0002, "step": 411 }, { "epoch": 4.204081632653061, "grad_norm": 0.0021538427099585533, "learning_rate": 3.5827664399092976e-06, "loss": 0.0001, "step": 412 }, { "epoch": 4.214285714285714, "grad_norm": 0.0021221789065748453, "learning_rate": 3.537414965986395e-06, "loss": 0.0001, "step": 413 }, { "epoch": 4.224489795918367, "grad_norm": 0.0021122246980667114, "learning_rate": 3.492063492063492e-06, "loss": 0.0001, "step": 414 }, { "epoch": 4.23469387755102, "grad_norm": 0.0025512792635709047, "learning_rate": 3.44671201814059e-06, "loss": 0.0001, "step": 415 }, { "epoch": 4.244897959183674, "grad_norm": 0.0041538686491549015, "learning_rate": 3.4013605442176872e-06, "loss": 0.0003, "step": 416 }, { "epoch": 4.255102040816326, "grad_norm": 0.0017510091420263052, "learning_rate": 3.356009070294785e-06, "loss": 0.0001, "step": 417 }, { "epoch": 4.26530612244898, "grad_norm": 0.0024814323987811804, "learning_rate": 3.3106575963718824e-06, "loss": 0.0002, "step": 418 }, { "epoch": 4.275510204081632, "grad_norm": 0.00235186074860394, "learning_rate": 3.2653061224489794e-06, "loss": 0.0001, "step": 419 }, { "epoch": 4.285714285714286, "grad_norm": 0.0030012091156095266, "learning_rate": 3.2199546485260772e-06, "loss": 0.0002, "step": 420 }, { "epoch": 4.295918367346939, "grad_norm": 0.004921985324472189, "learning_rate": 3.1746031746031746e-06, "loss": 0.0003, "step": 421 }, { "epoch": 4.3061224489795915, "grad_norm": 0.009844960644841194, "learning_rate": 3.1292517006802725e-06, "loss": 0.0004, "step": 422 }, { "epoch": 4.316326530612245, "grad_norm": 0.003105542156845331, "learning_rate": 3.08390022675737e-06, "loss": 0.0002, "step": 423 }, { "epoch": 4.326530612244898, "grad_norm": 0.005888419691473246, "learning_rate": 3.0385487528344677e-06, "loss": 0.0003, "step": 424 }, { "epoch": 4.336734693877551, "grad_norm": 0.002076453994959593, "learning_rate": 2.993197278911565e-06, "loss": 0.0001, "step": 425 }, { "epoch": 4.346938775510204, "grad_norm": 0.0016607132274657488, "learning_rate": 2.947845804988662e-06, "loss": 0.0001, "step": 426 }, { "epoch": 4.357142857142857, "grad_norm": 0.0024275570176541805, "learning_rate": 2.90249433106576e-06, "loss": 0.0002, "step": 427 }, { "epoch": 4.36734693877551, "grad_norm": 0.003902298165485263, "learning_rate": 2.8571428571428573e-06, "loss": 0.0002, "step": 428 }, { "epoch": 4.377551020408164, "grad_norm": 0.0023378883488476276, "learning_rate": 2.811791383219955e-06, "loss": 0.0002, "step": 429 }, { "epoch": 4.387755102040816, "grad_norm": 0.0051103937439620495, "learning_rate": 2.7664399092970525e-06, "loss": 0.0002, "step": 430 }, { "epoch": 4.3979591836734695, "grad_norm": 0.0026863350067287683, "learning_rate": 2.7210884353741503e-06, "loss": 0.0002, "step": 431 }, { "epoch": 4.408163265306122, "grad_norm": 0.002001287881284952, "learning_rate": 2.6757369614512473e-06, "loss": 0.0001, "step": 432 }, { "epoch": 4.418367346938775, "grad_norm": 0.008789247833192348, "learning_rate": 2.6303854875283447e-06, "loss": 0.0003, "step": 433 }, { "epoch": 4.428571428571429, "grad_norm": 0.004610543139278889, "learning_rate": 2.5850340136054425e-06, "loss": 0.0002, "step": 434 }, { "epoch": 4.438775510204081, "grad_norm": 0.00599480327218771, "learning_rate": 2.53968253968254e-06, "loss": 0.0003, "step": 435 }, { "epoch": 4.448979591836735, "grad_norm": 0.00554778054356575, "learning_rate": 2.4943310657596373e-06, "loss": 0.0003, "step": 436 }, { "epoch": 4.459183673469388, "grad_norm": 0.00669802725315094, "learning_rate": 2.4489795918367347e-06, "loss": 0.0003, "step": 437 }, { "epoch": 4.469387755102041, "grad_norm": 0.0016791113885119557, "learning_rate": 2.4036281179138325e-06, "loss": 0.0001, "step": 438 }, { "epoch": 4.479591836734694, "grad_norm": 0.0024550866801291704, "learning_rate": 2.35827664399093e-06, "loss": 0.0002, "step": 439 }, { "epoch": 4.489795918367347, "grad_norm": 0.0024079831782728434, "learning_rate": 2.3129251700680273e-06, "loss": 0.0002, "step": 440 }, { "epoch": 4.5, "grad_norm": 0.0028663375414907932, "learning_rate": 2.267573696145125e-06, "loss": 0.0002, "step": 441 }, { "epoch": 4.510204081632653, "grad_norm": 0.008847690187394619, "learning_rate": 2.222222222222222e-06, "loss": 0.0003, "step": 442 }, { "epoch": 4.520408163265306, "grad_norm": 0.005266358610242605, "learning_rate": 2.17687074829932e-06, "loss": 0.0003, "step": 443 }, { "epoch": 4.530612244897959, "grad_norm": 0.0024195676669478416, "learning_rate": 2.1315192743764173e-06, "loss": 0.0002, "step": 444 }, { "epoch": 4.540816326530612, "grad_norm": 0.00421124929562211, "learning_rate": 2.086167800453515e-06, "loss": 0.0002, "step": 445 }, { "epoch": 4.551020408163265, "grad_norm": 0.0020824300590902567, "learning_rate": 2.0408163265306125e-06, "loss": 0.0001, "step": 446 }, { "epoch": 4.561224489795919, "grad_norm": 0.005051845218986273, "learning_rate": 1.99546485260771e-06, "loss": 0.0002, "step": 447 }, { "epoch": 4.571428571428571, "grad_norm": 0.0022977020125836134, "learning_rate": 1.9501133786848077e-06, "loss": 0.0002, "step": 448 }, { "epoch": 4.581632653061225, "grad_norm": 0.001990046352148056, "learning_rate": 1.904761904761905e-06, "loss": 0.0001, "step": 449 }, { "epoch": 4.591836734693878, "grad_norm": 0.0031708430033177137, "learning_rate": 1.8594104308390023e-06, "loss": 0.0002, "step": 450 }, { "epoch": 4.6020408163265305, "grad_norm": 0.0034788285847753286, "learning_rate": 1.8140589569161e-06, "loss": 0.0002, "step": 451 }, { "epoch": 4.612244897959184, "grad_norm": 0.0018601809861138463, "learning_rate": 1.7687074829931975e-06, "loss": 0.0001, "step": 452 }, { "epoch": 4.622448979591836, "grad_norm": 0.016590220853686333, "learning_rate": 1.723356009070295e-06, "loss": 0.0003, "step": 453 }, { "epoch": 4.63265306122449, "grad_norm": 0.003050972009077668, "learning_rate": 1.6780045351473925e-06, "loss": 0.0002, "step": 454 }, { "epoch": 4.642857142857143, "grad_norm": 0.002196480752900243, "learning_rate": 1.6326530612244897e-06, "loss": 0.0001, "step": 455 }, { "epoch": 4.653061224489796, "grad_norm": 0.0025891121476888657, "learning_rate": 1.5873015873015873e-06, "loss": 0.0001, "step": 456 }, { "epoch": 4.663265306122449, "grad_norm": 0.002245939103886485, "learning_rate": 1.541950113378685e-06, "loss": 0.0001, "step": 457 }, { "epoch": 4.673469387755102, "grad_norm": 0.0021706093102693558, "learning_rate": 1.4965986394557825e-06, "loss": 0.0001, "step": 458 }, { "epoch": 4.683673469387755, "grad_norm": 0.0034395295660942793, "learning_rate": 1.45124716553288e-06, "loss": 0.0002, "step": 459 }, { "epoch": 4.6938775510204085, "grad_norm": 0.0023007583804428577, "learning_rate": 1.4058956916099775e-06, "loss": 0.0001, "step": 460 }, { "epoch": 4.704081632653061, "grad_norm": 0.004494468215852976, "learning_rate": 1.3605442176870751e-06, "loss": 0.0002, "step": 461 }, { "epoch": 4.714285714285714, "grad_norm": 0.0017052018083631992, "learning_rate": 1.3151927437641723e-06, "loss": 0.0001, "step": 462 }, { "epoch": 4.724489795918368, "grad_norm": 0.0016002283664420247, "learning_rate": 1.26984126984127e-06, "loss": 0.0001, "step": 463 }, { "epoch": 4.73469387755102, "grad_norm": 0.0033771556336432695, "learning_rate": 1.2244897959183673e-06, "loss": 0.0002, "step": 464 }, { "epoch": 4.744897959183674, "grad_norm": 0.002580232685431838, "learning_rate": 1.179138321995465e-06, "loss": 0.0002, "step": 465 }, { "epoch": 4.755102040816326, "grad_norm": 0.0020106956362724304, "learning_rate": 1.1337868480725626e-06, "loss": 0.0001, "step": 466 }, { "epoch": 4.76530612244898, "grad_norm": 0.0036402051337063313, "learning_rate": 1.08843537414966e-06, "loss": 0.0002, "step": 467 }, { "epoch": 4.775510204081632, "grad_norm": 0.004601712804287672, "learning_rate": 1.0430839002267576e-06, "loss": 0.0002, "step": 468 }, { "epoch": 4.785714285714286, "grad_norm": 0.002733904868364334, "learning_rate": 9.97732426303855e-07, "loss": 0.0002, "step": 469 }, { "epoch": 4.795918367346939, "grad_norm": 0.0027253010775893927, "learning_rate": 9.523809523809525e-07, "loss": 0.0002, "step": 470 }, { "epoch": 4.8061224489795915, "grad_norm": 0.02664267271757126, "learning_rate": 9.0702947845805e-07, "loss": 0.0007, "step": 471 }, { "epoch": 4.816326530612245, "grad_norm": 0.002349977381527424, "learning_rate": 8.616780045351475e-07, "loss": 0.0002, "step": 472 }, { "epoch": 4.826530612244898, "grad_norm": 0.01735026389360428, "learning_rate": 8.163265306122449e-07, "loss": 0.0006, "step": 473 }, { "epoch": 4.836734693877551, "grad_norm": 0.003476122161373496, "learning_rate": 7.709750566893425e-07, "loss": 0.0002, "step": 474 }, { "epoch": 4.846938775510204, "grad_norm": 0.0016622812254354358, "learning_rate": 7.2562358276644e-07, "loss": 0.0001, "step": 475 }, { "epoch": 4.857142857142857, "grad_norm": 0.0042258999310433865, "learning_rate": 6.802721088435376e-07, "loss": 0.0002, "step": 476 }, { "epoch": 4.86734693877551, "grad_norm": 0.0023899853695183992, "learning_rate": 6.34920634920635e-07, "loss": 0.0001, "step": 477 }, { "epoch": 4.877551020408164, "grad_norm": 0.0037495435681194067, "learning_rate": 5.895691609977325e-07, "loss": 0.0002, "step": 478 }, { "epoch": 4.887755102040816, "grad_norm": 0.00362688978202641, "learning_rate": 5.4421768707483e-07, "loss": 0.0002, "step": 479 }, { "epoch": 4.8979591836734695, "grad_norm": 0.005572126246988773, "learning_rate": 4.988662131519275e-07, "loss": 0.0003, "step": 480 }, { "epoch": 4.908163265306122, "grad_norm": 0.0033278209157288074, "learning_rate": 4.53514739229025e-07, "loss": 0.0002, "step": 481 }, { "epoch": 4.918367346938775, "grad_norm": 0.0017971718916669488, "learning_rate": 4.0816326530612243e-07, "loss": 0.0001, "step": 482 }, { "epoch": 4.928571428571429, "grad_norm": 0.003246983280405402, "learning_rate": 3.6281179138322e-07, "loss": 0.0002, "step": 483 }, { "epoch": 4.938775510204081, "grad_norm": 0.002570765558630228, "learning_rate": 3.174603174603175e-07, "loss": 0.0002, "step": 484 }, { "epoch": 4.948979591836735, "grad_norm": 0.003006896935403347, "learning_rate": 2.72108843537415e-07, "loss": 0.0002, "step": 485 }, { "epoch": 4.959183673469388, "grad_norm": 0.003843962447717786, "learning_rate": 2.267573696145125e-07, "loss": 0.0002, "step": 486 }, { "epoch": 4.969387755102041, "grad_norm": 0.005520265083760023, "learning_rate": 1.8140589569161e-07, "loss": 0.0002, "step": 487 }, { "epoch": 4.979591836734694, "grad_norm": 0.0035644182935357094, "learning_rate": 1.360544217687075e-07, "loss": 0.0002, "step": 488 }, { "epoch": 4.989795918367347, "grad_norm": 0.009897944517433643, "learning_rate": 9.0702947845805e-08, "loss": 0.0004, "step": 489 }, { "epoch": 5.0, "grad_norm": 0.005771194584667683, "learning_rate": 4.53514739229025e-08, "loss": 0.0002, "step": 490 } ], "logging_steps": 1, "max_steps": 490, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }