{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999803343166175, "eval_steps": 500, "global_step": 2542, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00039331366764995085, "grad_norm": 7.214968204498291, "learning_rate": 1.9607843137254902e-08, "loss": 0.1717, "step": 1 }, { "epoch": 0.0007866273352999017, "grad_norm": 8.45617961883545, "learning_rate": 3.9215686274509804e-08, "loss": 0.1394, "step": 2 }, { "epoch": 0.0011799410029498525, "grad_norm": 9.644225120544434, "learning_rate": 5.882352941176471e-08, "loss": 0.1416, "step": 3 }, { "epoch": 0.0015732546705998034, "grad_norm": 6.772904872894287, "learning_rate": 7.843137254901961e-08, "loss": 0.1696, "step": 4 }, { "epoch": 0.0019665683382497543, "grad_norm": 11.89709758758545, "learning_rate": 9.803921568627452e-08, "loss": 0.2043, "step": 5 }, { "epoch": 0.002359882005899705, "grad_norm": 30.768009185791016, "learning_rate": 1.1764705882352942e-07, "loss": 0.1557, "step": 6 }, { "epoch": 0.0027531956735496557, "grad_norm": 7.8864569664001465, "learning_rate": 1.3725490196078432e-07, "loss": 0.1478, "step": 7 }, { "epoch": 0.003146509341199607, "grad_norm": 10.4628267288208, "learning_rate": 1.5686274509803921e-07, "loss": 0.162, "step": 8 }, { "epoch": 0.0035398230088495575, "grad_norm": 8.983762741088867, "learning_rate": 1.7647058823529414e-07, "loss": 0.1482, "step": 9 }, { "epoch": 0.003933136676499509, "grad_norm": 9.961833953857422, "learning_rate": 1.9607843137254904e-07, "loss": 0.1851, "step": 10 }, { "epoch": 0.004326450344149459, "grad_norm": 7.383552074432373, "learning_rate": 2.1568627450980394e-07, "loss": 0.1483, "step": 11 }, { "epoch": 0.00471976401179941, "grad_norm": 10.243701934814453, "learning_rate": 2.3529411764705883e-07, "loss": 0.1457, "step": 12 }, { "epoch": 0.005113077679449361, "grad_norm": 9.73193645477295, "learning_rate": 2.5490196078431376e-07, "loss": 0.1623, "step": 13 }, { "epoch": 0.005506391347099311, "grad_norm": 6.044100284576416, "learning_rate": 2.7450980392156863e-07, "loss": 0.1346, "step": 14 }, { "epoch": 0.0058997050147492625, "grad_norm": 28.241085052490234, "learning_rate": 2.9411764705882356e-07, "loss": 0.1583, "step": 15 }, { "epoch": 0.006293018682399214, "grad_norm": 11.225924491882324, "learning_rate": 3.1372549019607843e-07, "loss": 0.1781, "step": 16 }, { "epoch": 0.006686332350049164, "grad_norm": 9.774815559387207, "learning_rate": 3.3333333333333335e-07, "loss": 0.1567, "step": 17 }, { "epoch": 0.007079646017699115, "grad_norm": 10.569445610046387, "learning_rate": 3.529411764705883e-07, "loss": 0.1362, "step": 18 }, { "epoch": 0.007472959685349066, "grad_norm": 6.202274322509766, "learning_rate": 3.7254901960784315e-07, "loss": 0.1493, "step": 19 }, { "epoch": 0.007866273352999017, "grad_norm": 9.480630874633789, "learning_rate": 3.921568627450981e-07, "loss": 0.1528, "step": 20 }, { "epoch": 0.008259587020648967, "grad_norm": 13.586874008178711, "learning_rate": 4.1176470588235295e-07, "loss": 0.1266, "step": 21 }, { "epoch": 0.008652900688298918, "grad_norm": 11.455598831176758, "learning_rate": 4.3137254901960787e-07, "loss": 0.1225, "step": 22 }, { "epoch": 0.00904621435594887, "grad_norm": 12.348589897155762, "learning_rate": 4.509803921568628e-07, "loss": 0.1863, "step": 23 }, { "epoch": 0.00943952802359882, "grad_norm": 7.493137836456299, "learning_rate": 4.7058823529411767e-07, "loss": 0.1214, "step": 24 }, { "epoch": 0.00983284169124877, "grad_norm": 11.203600883483887, "learning_rate": 4.901960784313725e-07, "loss": 0.1511, "step": 25 }, { "epoch": 0.010226155358898722, "grad_norm": 10.017373085021973, "learning_rate": 5.098039215686275e-07, "loss": 0.1464, "step": 26 }, { "epoch": 0.010619469026548672, "grad_norm": 7.930361270904541, "learning_rate": 5.294117647058824e-07, "loss": 0.1716, "step": 27 }, { "epoch": 0.011012782694198623, "grad_norm": 6.609414577484131, "learning_rate": 5.490196078431373e-07, "loss": 0.1499, "step": 28 }, { "epoch": 0.011406096361848575, "grad_norm": 9.198175430297852, "learning_rate": 5.686274509803922e-07, "loss": 0.1513, "step": 29 }, { "epoch": 0.011799410029498525, "grad_norm": 7.527069091796875, "learning_rate": 5.882352941176471e-07, "loss": 0.1344, "step": 30 }, { "epoch": 0.012192723697148475, "grad_norm": 25.97745704650879, "learning_rate": 6.07843137254902e-07, "loss": 0.1025, "step": 31 }, { "epoch": 0.012586037364798427, "grad_norm": 6.214263916015625, "learning_rate": 6.274509803921569e-07, "loss": 0.1387, "step": 32 }, { "epoch": 0.012979351032448377, "grad_norm": 7.101906776428223, "learning_rate": 6.470588235294118e-07, "loss": 0.1335, "step": 33 }, { "epoch": 0.013372664700098328, "grad_norm": 7.696187496185303, "learning_rate": 6.666666666666667e-07, "loss": 0.1277, "step": 34 }, { "epoch": 0.01376597836774828, "grad_norm": 9.324244499206543, "learning_rate": 6.862745098039217e-07, "loss": 0.1512, "step": 35 }, { "epoch": 0.01415929203539823, "grad_norm": 3.9664223194122314, "learning_rate": 7.058823529411766e-07, "loss": 0.0816, "step": 36 }, { "epoch": 0.01455260570304818, "grad_norm": 4.77344274520874, "learning_rate": 7.254901960784315e-07, "loss": 0.1036, "step": 37 }, { "epoch": 0.014945919370698132, "grad_norm": 5.8425612449646, "learning_rate": 7.450980392156863e-07, "loss": 0.0857, "step": 38 }, { "epoch": 0.015339233038348082, "grad_norm": 4.707705020904541, "learning_rate": 7.647058823529413e-07, "loss": 0.0905, "step": 39 }, { "epoch": 0.015732546705998034, "grad_norm": 8.28884220123291, "learning_rate": 7.843137254901962e-07, "loss": 0.1273, "step": 40 }, { "epoch": 0.016125860373647983, "grad_norm": 5.381669998168945, "learning_rate": 8.039215686274511e-07, "loss": 0.0938, "step": 41 }, { "epoch": 0.016519174041297935, "grad_norm": 4.281416893005371, "learning_rate": 8.235294117647059e-07, "loss": 0.0935, "step": 42 }, { "epoch": 0.016912487708947887, "grad_norm": 6.621143817901611, "learning_rate": 8.431372549019609e-07, "loss": 0.1002, "step": 43 }, { "epoch": 0.017305801376597835, "grad_norm": 4.4914350509643555, "learning_rate": 8.627450980392157e-07, "loss": 0.097, "step": 44 }, { "epoch": 0.017699115044247787, "grad_norm": 3.7035109996795654, "learning_rate": 8.823529411764707e-07, "loss": 0.0887, "step": 45 }, { "epoch": 0.01809242871189774, "grad_norm": 4.306455612182617, "learning_rate": 9.019607843137256e-07, "loss": 0.1027, "step": 46 }, { "epoch": 0.018485742379547688, "grad_norm": 5.768416881561279, "learning_rate": 9.215686274509806e-07, "loss": 0.1006, "step": 47 }, { "epoch": 0.01887905604719764, "grad_norm": 19.471040725708008, "learning_rate": 9.411764705882353e-07, "loss": 0.1178, "step": 48 }, { "epoch": 0.019272369714847592, "grad_norm": 6.249476432800293, "learning_rate": 9.607843137254904e-07, "loss": 0.1, "step": 49 }, { "epoch": 0.01966568338249754, "grad_norm": 5.785927772521973, "learning_rate": 9.80392156862745e-07, "loss": 0.078, "step": 50 }, { "epoch": 0.020058997050147492, "grad_norm": 6.312557220458984, "learning_rate": 1.0000000000000002e-06, "loss": 0.1117, "step": 51 }, { "epoch": 0.020452310717797444, "grad_norm": 3.5102477073669434, "learning_rate": 1.019607843137255e-06, "loss": 0.0913, "step": 52 }, { "epoch": 0.020845624385447393, "grad_norm": 6.845943450927734, "learning_rate": 1.03921568627451e-06, "loss": 0.1353, "step": 53 }, { "epoch": 0.021238938053097345, "grad_norm": 5.505466461181641, "learning_rate": 1.0588235294117648e-06, "loss": 0.0965, "step": 54 }, { "epoch": 0.021632251720747297, "grad_norm": 4.362204551696777, "learning_rate": 1.0784313725490197e-06, "loss": 0.0844, "step": 55 }, { "epoch": 0.022025565388397245, "grad_norm": 4.358127117156982, "learning_rate": 1.0980392156862745e-06, "loss": 0.1155, "step": 56 }, { "epoch": 0.022418879056047197, "grad_norm": 7.55561637878418, "learning_rate": 1.1176470588235296e-06, "loss": 0.0742, "step": 57 }, { "epoch": 0.02281219272369715, "grad_norm": 5.882073879241943, "learning_rate": 1.1372549019607845e-06, "loss": 0.1112, "step": 58 }, { "epoch": 0.023205506391347098, "grad_norm": 2.456120491027832, "learning_rate": 1.1568627450980394e-06, "loss": 0.0605, "step": 59 }, { "epoch": 0.02359882005899705, "grad_norm": 19.60419273376465, "learning_rate": 1.1764705882352942e-06, "loss": 0.1267, "step": 60 }, { "epoch": 0.023992133726647002, "grad_norm": 3.074788808822632, "learning_rate": 1.196078431372549e-06, "loss": 0.0821, "step": 61 }, { "epoch": 0.02438544739429695, "grad_norm": 3.561314344406128, "learning_rate": 1.215686274509804e-06, "loss": 0.0572, "step": 62 }, { "epoch": 0.024778761061946902, "grad_norm": 13.668036460876465, "learning_rate": 1.235294117647059e-06, "loss": 0.1268, "step": 63 }, { "epoch": 0.025172074729596854, "grad_norm": 3.8883397579193115, "learning_rate": 1.2549019607843137e-06, "loss": 0.0849, "step": 64 }, { "epoch": 0.025565388397246803, "grad_norm": 4.154886245727539, "learning_rate": 1.2745098039215686e-06, "loss": 0.1071, "step": 65 }, { "epoch": 0.025958702064896755, "grad_norm": 5.3974127769470215, "learning_rate": 1.2941176470588237e-06, "loss": 0.0749, "step": 66 }, { "epoch": 0.026352015732546707, "grad_norm": 3.088780164718628, "learning_rate": 1.3137254901960785e-06, "loss": 0.0768, "step": 67 }, { "epoch": 0.026745329400196655, "grad_norm": 3.2044262886047363, "learning_rate": 1.3333333333333334e-06, "loss": 0.0641, "step": 68 }, { "epoch": 0.027138643067846607, "grad_norm": 5.424925327301025, "learning_rate": 1.3529411764705883e-06, "loss": 0.061, "step": 69 }, { "epoch": 0.02753195673549656, "grad_norm": 4.061574935913086, "learning_rate": 1.3725490196078434e-06, "loss": 0.0851, "step": 70 }, { "epoch": 0.027925270403146508, "grad_norm": 5.696750164031982, "learning_rate": 1.3921568627450982e-06, "loss": 0.1107, "step": 71 }, { "epoch": 0.02831858407079646, "grad_norm": 4.410640716552734, "learning_rate": 1.4117647058823531e-06, "loss": 0.0714, "step": 72 }, { "epoch": 0.028711897738446412, "grad_norm": 6.307974815368652, "learning_rate": 1.4313725490196078e-06, "loss": 0.0866, "step": 73 }, { "epoch": 0.02910521140609636, "grad_norm": 2.53486967086792, "learning_rate": 1.450980392156863e-06, "loss": 0.0613, "step": 74 }, { "epoch": 0.029498525073746312, "grad_norm": 6.9410881996154785, "learning_rate": 1.4705882352941177e-06, "loss": 0.086, "step": 75 }, { "epoch": 0.029891838741396264, "grad_norm": 2.5871775150299072, "learning_rate": 1.4901960784313726e-06, "loss": 0.0507, "step": 76 }, { "epoch": 0.030285152409046213, "grad_norm": 2.2673654556274414, "learning_rate": 1.5098039215686275e-06, "loss": 0.0676, "step": 77 }, { "epoch": 0.030678466076696165, "grad_norm": 2.789076805114746, "learning_rate": 1.5294117647058826e-06, "loss": 0.0632, "step": 78 }, { "epoch": 0.031071779744346117, "grad_norm": 6.127337455749512, "learning_rate": 1.5490196078431374e-06, "loss": 0.0498, "step": 79 }, { "epoch": 0.03146509341199607, "grad_norm": 2.758253574371338, "learning_rate": 1.5686274509803923e-06, "loss": 0.0706, "step": 80 }, { "epoch": 0.03185840707964602, "grad_norm": 6.687328815460205, "learning_rate": 1.5882352941176472e-06, "loss": 0.0961, "step": 81 }, { "epoch": 0.032251720747295966, "grad_norm": 7.499604225158691, "learning_rate": 1.6078431372549023e-06, "loss": 0.0715, "step": 82 }, { "epoch": 0.03264503441494592, "grad_norm": 6.008899211883545, "learning_rate": 1.6274509803921571e-06, "loss": 0.123, "step": 83 }, { "epoch": 0.03303834808259587, "grad_norm": 4.841026306152344, "learning_rate": 1.6470588235294118e-06, "loss": 0.0647, "step": 84 }, { "epoch": 0.03343166175024582, "grad_norm": 3.0710766315460205, "learning_rate": 1.6666666666666667e-06, "loss": 0.0372, "step": 85 }, { "epoch": 0.033824975417895774, "grad_norm": 3.3783321380615234, "learning_rate": 1.6862745098039217e-06, "loss": 0.0843, "step": 86 }, { "epoch": 0.03421828908554572, "grad_norm": 2.6547350883483887, "learning_rate": 1.7058823529411766e-06, "loss": 0.0589, "step": 87 }, { "epoch": 0.03461160275319567, "grad_norm": 3.6741859912872314, "learning_rate": 1.7254901960784315e-06, "loss": 0.0308, "step": 88 }, { "epoch": 0.035004916420845626, "grad_norm": 3.555490493774414, "learning_rate": 1.7450980392156864e-06, "loss": 0.0497, "step": 89 }, { "epoch": 0.035398230088495575, "grad_norm": 3.1174697875976562, "learning_rate": 1.7647058823529414e-06, "loss": 0.063, "step": 90 }, { "epoch": 0.03579154375614552, "grad_norm": 4.790848255157471, "learning_rate": 1.7843137254901963e-06, "loss": 0.0834, "step": 91 }, { "epoch": 0.03618485742379548, "grad_norm": 3.2931265830993652, "learning_rate": 1.8039215686274512e-06, "loss": 0.0531, "step": 92 }, { "epoch": 0.03657817109144543, "grad_norm": 13.777477264404297, "learning_rate": 1.8235294117647058e-06, "loss": 0.0786, "step": 93 }, { "epoch": 0.036971484759095376, "grad_norm": 4.943524360656738, "learning_rate": 1.8431372549019611e-06, "loss": 0.0602, "step": 94 }, { "epoch": 0.03736479842674533, "grad_norm": 6.189723014831543, "learning_rate": 1.8627450980392158e-06, "loss": 0.0697, "step": 95 }, { "epoch": 0.03775811209439528, "grad_norm": 3.5542352199554443, "learning_rate": 1.8823529411764707e-06, "loss": 0.0863, "step": 96 }, { "epoch": 0.03815142576204523, "grad_norm": 5.407109260559082, "learning_rate": 1.9019607843137255e-06, "loss": 0.088, "step": 97 }, { "epoch": 0.038544739429695184, "grad_norm": 3.3334732055664062, "learning_rate": 1.921568627450981e-06, "loss": 0.0889, "step": 98 }, { "epoch": 0.03893805309734513, "grad_norm": 2.48398756980896, "learning_rate": 1.9411764705882353e-06, "loss": 0.0483, "step": 99 }, { "epoch": 0.03933136676499508, "grad_norm": 2.3380913734436035, "learning_rate": 1.96078431372549e-06, "loss": 0.0707, "step": 100 }, { "epoch": 0.039724680432645036, "grad_norm": 4.355076789855957, "learning_rate": 1.980392156862745e-06, "loss": 0.0639, "step": 101 }, { "epoch": 0.040117994100294985, "grad_norm": 4.081620693206787, "learning_rate": 2.0000000000000003e-06, "loss": 0.06, "step": 102 }, { "epoch": 0.04051130776794493, "grad_norm": 4.437114715576172, "learning_rate": 2.019607843137255e-06, "loss": 0.1017, "step": 103 }, { "epoch": 0.04090462143559489, "grad_norm": 4.925793647766113, "learning_rate": 2.03921568627451e-06, "loss": 0.0934, "step": 104 }, { "epoch": 0.04129793510324484, "grad_norm": 2.085400104522705, "learning_rate": 2.058823529411765e-06, "loss": 0.058, "step": 105 }, { "epoch": 0.041691248770894786, "grad_norm": 2.8664395809173584, "learning_rate": 2.07843137254902e-06, "loss": 0.0709, "step": 106 }, { "epoch": 0.04208456243854474, "grad_norm": 1.7521601915359497, "learning_rate": 2.0980392156862747e-06, "loss": 0.031, "step": 107 }, { "epoch": 0.04247787610619469, "grad_norm": 3.7575159072875977, "learning_rate": 2.1176470588235296e-06, "loss": 0.0777, "step": 108 }, { "epoch": 0.04287118977384464, "grad_norm": 4.240278720855713, "learning_rate": 2.1372549019607844e-06, "loss": 0.0965, "step": 109 }, { "epoch": 0.043264503441494594, "grad_norm": 3.841932773590088, "learning_rate": 2.1568627450980393e-06, "loss": 0.0844, "step": 110 }, { "epoch": 0.04365781710914454, "grad_norm": 4.4334397315979, "learning_rate": 2.176470588235294e-06, "loss": 0.0956, "step": 111 }, { "epoch": 0.04405113077679449, "grad_norm": 4.255678653717041, "learning_rate": 2.196078431372549e-06, "loss": 0.0855, "step": 112 }, { "epoch": 0.044444444444444446, "grad_norm": 2.3486170768737793, "learning_rate": 2.215686274509804e-06, "loss": 0.0417, "step": 113 }, { "epoch": 0.044837758112094395, "grad_norm": 2.222768783569336, "learning_rate": 2.2352941176470592e-06, "loss": 0.0556, "step": 114 }, { "epoch": 0.04523107177974434, "grad_norm": 2.750119686126709, "learning_rate": 2.254901960784314e-06, "loss": 0.0481, "step": 115 }, { "epoch": 0.0456243854473943, "grad_norm": 4.375302314758301, "learning_rate": 2.274509803921569e-06, "loss": 0.098, "step": 116 }, { "epoch": 0.04601769911504425, "grad_norm": 3.7654221057891846, "learning_rate": 2.2941176470588234e-06, "loss": 0.1025, "step": 117 }, { "epoch": 0.046411012782694196, "grad_norm": 2.422442674636841, "learning_rate": 2.3137254901960787e-06, "loss": 0.0675, "step": 118 }, { "epoch": 0.04680432645034415, "grad_norm": 3.3458054065704346, "learning_rate": 2.3333333333333336e-06, "loss": 0.067, "step": 119 }, { "epoch": 0.0471976401179941, "grad_norm": 2.7424211502075195, "learning_rate": 2.3529411764705885e-06, "loss": 0.0774, "step": 120 }, { "epoch": 0.04759095378564405, "grad_norm": 3.4825127124786377, "learning_rate": 2.3725490196078433e-06, "loss": 0.086, "step": 121 }, { "epoch": 0.047984267453294004, "grad_norm": 55.36836242675781, "learning_rate": 2.392156862745098e-06, "loss": 0.0938, "step": 122 }, { "epoch": 0.04837758112094395, "grad_norm": 2.256223201751709, "learning_rate": 2.411764705882353e-06, "loss": 0.0673, "step": 123 }, { "epoch": 0.0487708947885939, "grad_norm": 3.8095710277557373, "learning_rate": 2.431372549019608e-06, "loss": 0.0728, "step": 124 }, { "epoch": 0.049164208456243856, "grad_norm": 1.8562949895858765, "learning_rate": 2.450980392156863e-06, "loss": 0.0629, "step": 125 }, { "epoch": 0.049557522123893805, "grad_norm": 4.999472618103027, "learning_rate": 2.470588235294118e-06, "loss": 0.059, "step": 126 }, { "epoch": 0.04995083579154375, "grad_norm": 3.9088096618652344, "learning_rate": 2.490196078431373e-06, "loss": 0.0662, "step": 127 }, { "epoch": 0.05034414945919371, "grad_norm": 4.975748062133789, "learning_rate": 2.5098039215686274e-06, "loss": 0.0688, "step": 128 }, { "epoch": 0.05073746312684366, "grad_norm": 2.183948516845703, "learning_rate": 2.5294117647058823e-06, "loss": 0.0477, "step": 129 }, { "epoch": 0.051130776794493606, "grad_norm": 4.890422821044922, "learning_rate": 2.549019607843137e-06, "loss": 0.0793, "step": 130 }, { "epoch": 0.05152409046214356, "grad_norm": 4.04612398147583, "learning_rate": 2.568627450980392e-06, "loss": 0.0705, "step": 131 }, { "epoch": 0.05191740412979351, "grad_norm": 2.8650074005126953, "learning_rate": 2.5882352941176473e-06, "loss": 0.0777, "step": 132 }, { "epoch": 0.05231071779744346, "grad_norm": 3.9029088020324707, "learning_rate": 2.6078431372549022e-06, "loss": 0.0766, "step": 133 }, { "epoch": 0.052704031465093414, "grad_norm": 2.4210422039031982, "learning_rate": 2.627450980392157e-06, "loss": 0.0663, "step": 134 }, { "epoch": 0.05309734513274336, "grad_norm": 3.0176892280578613, "learning_rate": 2.647058823529412e-06, "loss": 0.0703, "step": 135 }, { "epoch": 0.05349065880039331, "grad_norm": 13.886055946350098, "learning_rate": 2.666666666666667e-06, "loss": 0.064, "step": 136 }, { "epoch": 0.053883972468043266, "grad_norm": 2.40460205078125, "learning_rate": 2.6862745098039217e-06, "loss": 0.0492, "step": 137 }, { "epoch": 0.054277286135693215, "grad_norm": 3.829288959503174, "learning_rate": 2.7058823529411766e-06, "loss": 0.0564, "step": 138 }, { "epoch": 0.05467059980334316, "grad_norm": 2.2005629539489746, "learning_rate": 2.7254901960784314e-06, "loss": 0.0483, "step": 139 }, { "epoch": 0.05506391347099312, "grad_norm": 14.79651927947998, "learning_rate": 2.7450980392156867e-06, "loss": 0.0937, "step": 140 }, { "epoch": 0.05545722713864307, "grad_norm": 1.6898876428604126, "learning_rate": 2.7647058823529416e-06, "loss": 0.0693, "step": 141 }, { "epoch": 0.055850540806293016, "grad_norm": 3.5447332859039307, "learning_rate": 2.7843137254901965e-06, "loss": 0.1311, "step": 142 }, { "epoch": 0.05624385447394297, "grad_norm": 2.291607618331909, "learning_rate": 2.8039215686274514e-06, "loss": 0.061, "step": 143 }, { "epoch": 0.05663716814159292, "grad_norm": 4.079521656036377, "learning_rate": 2.8235294117647062e-06, "loss": 0.1169, "step": 144 }, { "epoch": 0.05703048180924287, "grad_norm": 5.1168012619018555, "learning_rate": 2.843137254901961e-06, "loss": 0.0436, "step": 145 }, { "epoch": 0.057423795476892824, "grad_norm": 4.056823253631592, "learning_rate": 2.8627450980392155e-06, "loss": 0.09, "step": 146 }, { "epoch": 0.05781710914454277, "grad_norm": 2.1756484508514404, "learning_rate": 2.8823529411764704e-06, "loss": 0.0747, "step": 147 }, { "epoch": 0.05821042281219272, "grad_norm": 2.8064467906951904, "learning_rate": 2.901960784313726e-06, "loss": 0.0261, "step": 148 }, { "epoch": 0.058603736479842676, "grad_norm": 2.9834907054901123, "learning_rate": 2.9215686274509806e-06, "loss": 0.0735, "step": 149 }, { "epoch": 0.058997050147492625, "grad_norm": 15.821993827819824, "learning_rate": 2.9411764705882355e-06, "loss": 0.0835, "step": 150 }, { "epoch": 0.05939036381514257, "grad_norm": 6.1172709465026855, "learning_rate": 2.9607843137254903e-06, "loss": 0.0621, "step": 151 }, { "epoch": 0.05978367748279253, "grad_norm": 3.961477041244507, "learning_rate": 2.980392156862745e-06, "loss": 0.0777, "step": 152 }, { "epoch": 0.06017699115044248, "grad_norm": 3.682879686355591, "learning_rate": 3e-06, "loss": 0.0836, "step": 153 }, { "epoch": 0.060570304818092426, "grad_norm": 1.2253718376159668, "learning_rate": 3.019607843137255e-06, "loss": 0.0255, "step": 154 }, { "epoch": 0.06096361848574238, "grad_norm": 2.107466220855713, "learning_rate": 3.03921568627451e-06, "loss": 0.0698, "step": 155 }, { "epoch": 0.06135693215339233, "grad_norm": 2.720797061920166, "learning_rate": 3.058823529411765e-06, "loss": 0.0683, "step": 156 }, { "epoch": 0.06175024582104228, "grad_norm": 2.0135252475738525, "learning_rate": 3.07843137254902e-06, "loss": 0.0594, "step": 157 }, { "epoch": 0.062143559488692234, "grad_norm": 2.011382579803467, "learning_rate": 3.098039215686275e-06, "loss": 0.0643, "step": 158 }, { "epoch": 0.06253687315634218, "grad_norm": 3.047201156616211, "learning_rate": 3.1176470588235297e-06, "loss": 0.0564, "step": 159 }, { "epoch": 0.06293018682399214, "grad_norm": 2.3302555084228516, "learning_rate": 3.1372549019607846e-06, "loss": 0.0404, "step": 160 }, { "epoch": 0.06332350049164208, "grad_norm": 2.7288010120391846, "learning_rate": 3.1568627450980395e-06, "loss": 0.1009, "step": 161 }, { "epoch": 0.06371681415929203, "grad_norm": 2.852647304534912, "learning_rate": 3.1764705882352943e-06, "loss": 0.0508, "step": 162 }, { "epoch": 0.06411012782694199, "grad_norm": 2.101698637008667, "learning_rate": 3.1960784313725492e-06, "loss": 0.0814, "step": 163 }, { "epoch": 0.06450344149459193, "grad_norm": 2.864086151123047, "learning_rate": 3.2156862745098045e-06, "loss": 0.0543, "step": 164 }, { "epoch": 0.06489675516224189, "grad_norm": 2.587751865386963, "learning_rate": 3.2352941176470594e-06, "loss": 0.0753, "step": 165 }, { "epoch": 0.06529006882989184, "grad_norm": 1.5767340660095215, "learning_rate": 3.2549019607843143e-06, "loss": 0.0399, "step": 166 }, { "epoch": 0.06568338249754178, "grad_norm": 3.7279415130615234, "learning_rate": 3.2745098039215687e-06, "loss": 0.0804, "step": 167 }, { "epoch": 0.06607669616519174, "grad_norm": 2.9727795124053955, "learning_rate": 3.2941176470588236e-06, "loss": 0.0548, "step": 168 }, { "epoch": 0.0664700098328417, "grad_norm": 2.0582468509674072, "learning_rate": 3.3137254901960785e-06, "loss": 0.0656, "step": 169 }, { "epoch": 0.06686332350049164, "grad_norm": 7.246119499206543, "learning_rate": 3.3333333333333333e-06, "loss": 0.0499, "step": 170 }, { "epoch": 0.06725663716814159, "grad_norm": 70.4866714477539, "learning_rate": 3.352941176470588e-06, "loss": 0.0764, "step": 171 }, { "epoch": 0.06764995083579155, "grad_norm": 1.8262776136398315, "learning_rate": 3.3725490196078435e-06, "loss": 0.0497, "step": 172 }, { "epoch": 0.06804326450344149, "grad_norm": 2.6392412185668945, "learning_rate": 3.3921568627450984e-06, "loss": 0.072, "step": 173 }, { "epoch": 0.06843657817109144, "grad_norm": 1.2957279682159424, "learning_rate": 3.4117647058823532e-06, "loss": 0.0749, "step": 174 }, { "epoch": 0.0688298918387414, "grad_norm": 1.5801424980163574, "learning_rate": 3.431372549019608e-06, "loss": 0.0504, "step": 175 }, { "epoch": 0.06922320550639134, "grad_norm": 1.6194735765457153, "learning_rate": 3.450980392156863e-06, "loss": 0.0396, "step": 176 }, { "epoch": 0.0696165191740413, "grad_norm": 3.31343674659729, "learning_rate": 3.470588235294118e-06, "loss": 0.0624, "step": 177 }, { "epoch": 0.07000983284169125, "grad_norm": 2.1785762310028076, "learning_rate": 3.4901960784313727e-06, "loss": 0.0548, "step": 178 }, { "epoch": 0.0704031465093412, "grad_norm": 1.3683737516403198, "learning_rate": 3.5098039215686276e-06, "loss": 0.0274, "step": 179 }, { "epoch": 0.07079646017699115, "grad_norm": 3.2981035709381104, "learning_rate": 3.529411764705883e-06, "loss": 0.0816, "step": 180 }, { "epoch": 0.0711897738446411, "grad_norm": 2.3660190105438232, "learning_rate": 3.5490196078431378e-06, "loss": 0.0445, "step": 181 }, { "epoch": 0.07158308751229105, "grad_norm": 3.4103376865386963, "learning_rate": 3.5686274509803926e-06, "loss": 0.0959, "step": 182 }, { "epoch": 0.071976401179941, "grad_norm": 2.7939486503601074, "learning_rate": 3.5882352941176475e-06, "loss": 0.096, "step": 183 }, { "epoch": 0.07236971484759096, "grad_norm": 2.009209632873535, "learning_rate": 3.6078431372549024e-06, "loss": 0.0548, "step": 184 }, { "epoch": 0.0727630285152409, "grad_norm": 1.9003010988235474, "learning_rate": 3.6274509803921573e-06, "loss": 0.058, "step": 185 }, { "epoch": 0.07315634218289085, "grad_norm": 2.788331985473633, "learning_rate": 3.6470588235294117e-06, "loss": 0.0828, "step": 186 }, { "epoch": 0.07354965585054081, "grad_norm": 2.2508130073547363, "learning_rate": 3.6666666666666666e-06, "loss": 0.089, "step": 187 }, { "epoch": 0.07394296951819075, "grad_norm": 14.532478332519531, "learning_rate": 3.6862745098039223e-06, "loss": 0.0878, "step": 188 }, { "epoch": 0.0743362831858407, "grad_norm": 1.3768811225891113, "learning_rate": 3.7058823529411767e-06, "loss": 0.0534, "step": 189 }, { "epoch": 0.07472959685349066, "grad_norm": 2.9948389530181885, "learning_rate": 3.7254901960784316e-06, "loss": 0.0704, "step": 190 }, { "epoch": 0.0751229105211406, "grad_norm": 1.4626399278640747, "learning_rate": 3.7450980392156865e-06, "loss": 0.0306, "step": 191 }, { "epoch": 0.07551622418879056, "grad_norm": 3.062840700149536, "learning_rate": 3.7647058823529414e-06, "loss": 0.0802, "step": 192 }, { "epoch": 0.07590953785644051, "grad_norm": 5.729097843170166, "learning_rate": 3.7843137254901962e-06, "loss": 0.1013, "step": 193 }, { "epoch": 0.07630285152409046, "grad_norm": 1.8716782331466675, "learning_rate": 3.803921568627451e-06, "loss": 0.0664, "step": 194 }, { "epoch": 0.07669616519174041, "grad_norm": 2.058469533920288, "learning_rate": 3.8235294117647055e-06, "loss": 0.0683, "step": 195 }, { "epoch": 0.07708947885939037, "grad_norm": 12.551715850830078, "learning_rate": 3.843137254901962e-06, "loss": 0.09, "step": 196 }, { "epoch": 0.07748279252704031, "grad_norm": 2.2984426021575928, "learning_rate": 3.862745098039216e-06, "loss": 0.0672, "step": 197 }, { "epoch": 0.07787610619469026, "grad_norm": 4.480764865875244, "learning_rate": 3.882352941176471e-06, "loss": 0.051, "step": 198 }, { "epoch": 0.07826941986234022, "grad_norm": 1.4032012224197388, "learning_rate": 3.901960784313726e-06, "loss": 0.0289, "step": 199 }, { "epoch": 0.07866273352999016, "grad_norm": 3.133589029312134, "learning_rate": 3.92156862745098e-06, "loss": 0.0807, "step": 200 }, { "epoch": 0.07905604719764012, "grad_norm": 4.1782307624816895, "learning_rate": 3.941176470588236e-06, "loss": 0.0683, "step": 201 }, { "epoch": 0.07944936086529007, "grad_norm": 11.163358688354492, "learning_rate": 3.96078431372549e-06, "loss": 0.0421, "step": 202 }, { "epoch": 0.07984267453294001, "grad_norm": 1.3736735582351685, "learning_rate": 3.980392156862745e-06, "loss": 0.0339, "step": 203 }, { "epoch": 0.08023598820058997, "grad_norm": 6.474332332611084, "learning_rate": 4.000000000000001e-06, "loss": 0.0606, "step": 204 }, { "epoch": 0.08062930186823992, "grad_norm": 2.8827829360961914, "learning_rate": 4.019607843137255e-06, "loss": 0.1104, "step": 205 }, { "epoch": 0.08102261553588987, "grad_norm": 1.8476606607437134, "learning_rate": 4.03921568627451e-06, "loss": 0.0479, "step": 206 }, { "epoch": 0.08141592920353982, "grad_norm": 3.2202746868133545, "learning_rate": 4.058823529411765e-06, "loss": 0.088, "step": 207 }, { "epoch": 0.08180924287118978, "grad_norm": 3.4121432304382324, "learning_rate": 4.07843137254902e-06, "loss": 0.1051, "step": 208 }, { "epoch": 0.08220255653883972, "grad_norm": 2.4771883487701416, "learning_rate": 4.098039215686275e-06, "loss": 0.0477, "step": 209 }, { "epoch": 0.08259587020648967, "grad_norm": 2.9881558418273926, "learning_rate": 4.11764705882353e-06, "loss": 0.0472, "step": 210 }, { "epoch": 0.08298918387413963, "grad_norm": 2.8722712993621826, "learning_rate": 4.137254901960784e-06, "loss": 0.0856, "step": 211 }, { "epoch": 0.08338249754178957, "grad_norm": 1.9073129892349243, "learning_rate": 4.15686274509804e-06, "loss": 0.0542, "step": 212 }, { "epoch": 0.08377581120943953, "grad_norm": 3.5067648887634277, "learning_rate": 4.176470588235295e-06, "loss": 0.0567, "step": 213 }, { "epoch": 0.08416912487708948, "grad_norm": 2.5827410221099854, "learning_rate": 4.196078431372549e-06, "loss": 0.1062, "step": 214 }, { "epoch": 0.08456243854473942, "grad_norm": 1.8257296085357666, "learning_rate": 4.215686274509805e-06, "loss": 0.0821, "step": 215 }, { "epoch": 0.08495575221238938, "grad_norm": 3.9571404457092285, "learning_rate": 4.235294117647059e-06, "loss": 0.1143, "step": 216 }, { "epoch": 0.08534906588003933, "grad_norm": 2.6589484214782715, "learning_rate": 4.254901960784314e-06, "loss": 0.0814, "step": 217 }, { "epoch": 0.08574237954768928, "grad_norm": 0.915239155292511, "learning_rate": 4.274509803921569e-06, "loss": 0.0355, "step": 218 }, { "epoch": 0.08613569321533923, "grad_norm": 2.9066381454467773, "learning_rate": 4.294117647058823e-06, "loss": 0.0783, "step": 219 }, { "epoch": 0.08652900688298919, "grad_norm": 1.581722378730774, "learning_rate": 4.313725490196079e-06, "loss": 0.0589, "step": 220 }, { "epoch": 0.08692232055063913, "grad_norm": 2.2173354625701904, "learning_rate": 4.333333333333334e-06, "loss": 0.0791, "step": 221 }, { "epoch": 0.08731563421828908, "grad_norm": 1.784740686416626, "learning_rate": 4.352941176470588e-06, "loss": 0.0616, "step": 222 }, { "epoch": 0.08770894788593904, "grad_norm": 1.9993363618850708, "learning_rate": 4.372549019607844e-06, "loss": 0.0864, "step": 223 }, { "epoch": 0.08810226155358898, "grad_norm": 4.089532375335693, "learning_rate": 4.392156862745098e-06, "loss": 0.0982, "step": 224 }, { "epoch": 0.08849557522123894, "grad_norm": 2.5914440155029297, "learning_rate": 4.411764705882353e-06, "loss": 0.0702, "step": 225 }, { "epoch": 0.08888888888888889, "grad_norm": 2.555253028869629, "learning_rate": 4.431372549019608e-06, "loss": 0.0831, "step": 226 }, { "epoch": 0.08928220255653883, "grad_norm": 2.2960548400878906, "learning_rate": 4.450980392156863e-06, "loss": 0.0641, "step": 227 }, { "epoch": 0.08967551622418879, "grad_norm": 1.402106761932373, "learning_rate": 4.4705882352941184e-06, "loss": 0.0594, "step": 228 }, { "epoch": 0.09006882989183874, "grad_norm": 3.1225955486297607, "learning_rate": 4.490196078431373e-06, "loss": 0.1042, "step": 229 }, { "epoch": 0.09046214355948869, "grad_norm": 1.7568937540054321, "learning_rate": 4.509803921568628e-06, "loss": 0.0689, "step": 230 }, { "epoch": 0.09085545722713864, "grad_norm": 2.8846213817596436, "learning_rate": 4.529411764705883e-06, "loss": 0.0955, "step": 231 }, { "epoch": 0.0912487708947886, "grad_norm": 4.436802387237549, "learning_rate": 4.549019607843138e-06, "loss": 0.1668, "step": 232 }, { "epoch": 0.09164208456243854, "grad_norm": 2.784074068069458, "learning_rate": 4.568627450980392e-06, "loss": 0.083, "step": 233 }, { "epoch": 0.0920353982300885, "grad_norm": 2.276759147644043, "learning_rate": 4.588235294117647e-06, "loss": 0.0725, "step": 234 }, { "epoch": 0.09242871189773845, "grad_norm": 2.5278875827789307, "learning_rate": 4.607843137254902e-06, "loss": 0.0744, "step": 235 }, { "epoch": 0.09282202556538839, "grad_norm": 1.711602807044983, "learning_rate": 4.627450980392157e-06, "loss": 0.0749, "step": 236 }, { "epoch": 0.09321533923303835, "grad_norm": 1.4517807960510254, "learning_rate": 4.647058823529412e-06, "loss": 0.0587, "step": 237 }, { "epoch": 0.0936086529006883, "grad_norm": 1.090840220451355, "learning_rate": 4.666666666666667e-06, "loss": 0.0719, "step": 238 }, { "epoch": 0.09400196656833824, "grad_norm": 1.8589414358139038, "learning_rate": 4.686274509803922e-06, "loss": 0.0563, "step": 239 }, { "epoch": 0.0943952802359882, "grad_norm": 2.264702081680298, "learning_rate": 4.705882352941177e-06, "loss": 0.0648, "step": 240 }, { "epoch": 0.09478859390363815, "grad_norm": 1.4464210271835327, "learning_rate": 4.725490196078431e-06, "loss": 0.0238, "step": 241 }, { "epoch": 0.0951819075712881, "grad_norm": 1.9937217235565186, "learning_rate": 4.745098039215687e-06, "loss": 0.0493, "step": 242 }, { "epoch": 0.09557522123893805, "grad_norm": 2.2047340869903564, "learning_rate": 4.764705882352941e-06, "loss": 0.091, "step": 243 }, { "epoch": 0.09596853490658801, "grad_norm": 4.057810306549072, "learning_rate": 4.784313725490196e-06, "loss": 0.0938, "step": 244 }, { "epoch": 0.09636184857423795, "grad_norm": 1.6187644004821777, "learning_rate": 4.803921568627452e-06, "loss": 0.0673, "step": 245 }, { "epoch": 0.0967551622418879, "grad_norm": 2.7249605655670166, "learning_rate": 4.823529411764706e-06, "loss": 0.0848, "step": 246 }, { "epoch": 0.09714847590953786, "grad_norm": 1.7594577074050903, "learning_rate": 4.8431372549019614e-06, "loss": 0.0594, "step": 247 }, { "epoch": 0.0975417895771878, "grad_norm": 2.6266980171203613, "learning_rate": 4.862745098039216e-06, "loss": 0.0866, "step": 248 }, { "epoch": 0.09793510324483776, "grad_norm": 3.3526737689971924, "learning_rate": 4.882352941176471e-06, "loss": 0.1115, "step": 249 }, { "epoch": 0.09832841691248771, "grad_norm": 2.7514872550964355, "learning_rate": 4.901960784313726e-06, "loss": 0.0694, "step": 250 }, { "epoch": 0.09872173058013765, "grad_norm": 2.44143009185791, "learning_rate": 4.921568627450981e-06, "loss": 0.0715, "step": 251 }, { "epoch": 0.09911504424778761, "grad_norm": 2.214268207550049, "learning_rate": 4.941176470588236e-06, "loss": 0.0576, "step": 252 }, { "epoch": 0.09950835791543756, "grad_norm": 1.7012481689453125, "learning_rate": 4.960784313725491e-06, "loss": 0.0754, "step": 253 }, { "epoch": 0.0999016715830875, "grad_norm": 1.8335487842559814, "learning_rate": 4.980392156862746e-06, "loss": 0.0617, "step": 254 }, { "epoch": 0.10029498525073746, "grad_norm": 2.3848774433135986, "learning_rate": 5e-06, "loss": 0.1011, "step": 255 }, { "epoch": 0.10068829891838742, "grad_norm": 2.1847634315490723, "learning_rate": 4.999997641274725e-06, "loss": 0.0793, "step": 256 }, { "epoch": 0.10108161258603736, "grad_norm": 1.5467146635055542, "learning_rate": 4.999990565103349e-06, "loss": 0.0685, "step": 257 }, { "epoch": 0.10147492625368731, "grad_norm": 1.5211800336837769, "learning_rate": 4.999978771499224e-06, "loss": 0.0453, "step": 258 }, { "epoch": 0.10186823992133727, "grad_norm": 1.944356918334961, "learning_rate": 4.999962260484607e-06, "loss": 0.0726, "step": 259 }, { "epoch": 0.10226155358898721, "grad_norm": 2.206536054611206, "learning_rate": 4.999941032090652e-06, "loss": 0.0963, "step": 260 }, { "epoch": 0.10265486725663717, "grad_norm": 0.9998722076416016, "learning_rate": 4.999915086357417e-06, "loss": 0.0425, "step": 261 }, { "epoch": 0.10304818092428712, "grad_norm": 2.102257013320923, "learning_rate": 4.99988442333386e-06, "loss": 0.0857, "step": 262 }, { "epoch": 0.10344149459193706, "grad_norm": 2.055304765701294, "learning_rate": 4.999849043077843e-06, "loss": 0.058, "step": 263 }, { "epoch": 0.10383480825958702, "grad_norm": 2.11883544921875, "learning_rate": 4.999808945656128e-06, "loss": 0.1135, "step": 264 }, { "epoch": 0.10422812192723697, "grad_norm": 1.4651076793670654, "learning_rate": 4.999764131144377e-06, "loss": 0.0609, "step": 265 }, { "epoch": 0.10462143559488692, "grad_norm": 1.3278563022613525, "learning_rate": 4.999714599627155e-06, "loss": 0.0506, "step": 266 }, { "epoch": 0.10501474926253687, "grad_norm": 3.376959800720215, "learning_rate": 4.999660351197926e-06, "loss": 0.0505, "step": 267 }, { "epoch": 0.10540806293018683, "grad_norm": 14.901459693908691, "learning_rate": 4.999601385959056e-06, "loss": 0.0717, "step": 268 }, { "epoch": 0.10580137659783677, "grad_norm": 1.7644176483154297, "learning_rate": 4.999537704021812e-06, "loss": 0.1109, "step": 269 }, { "epoch": 0.10619469026548672, "grad_norm": 1.3101154565811157, "learning_rate": 4.99946930550636e-06, "loss": 0.0433, "step": 270 }, { "epoch": 0.10658800393313668, "grad_norm": 3.403160572052002, "learning_rate": 4.999396190541766e-06, "loss": 0.1082, "step": 271 }, { "epoch": 0.10698131760078662, "grad_norm": 2.1354033946990967, "learning_rate": 4.999318359265998e-06, "loss": 0.0698, "step": 272 }, { "epoch": 0.10737463126843658, "grad_norm": 1.1540406942367554, "learning_rate": 4.999235811825921e-06, "loss": 0.0857, "step": 273 }, { "epoch": 0.10776794493608653, "grad_norm": 1.4908989667892456, "learning_rate": 4.9991485483773e-06, "loss": 0.0627, "step": 274 }, { "epoch": 0.10816125860373647, "grad_norm": 1.5307058095932007, "learning_rate": 4.999056569084801e-06, "loss": 0.0555, "step": 275 }, { "epoch": 0.10855457227138643, "grad_norm": 2.4000704288482666, "learning_rate": 4.998959874121986e-06, "loss": 0.068, "step": 276 }, { "epoch": 0.10894788593903638, "grad_norm": 1.2169445753097534, "learning_rate": 4.998858463671316e-06, "loss": 0.0716, "step": 277 }, { "epoch": 0.10934119960668633, "grad_norm": 1.496738076210022, "learning_rate": 4.998752337924152e-06, "loss": 0.063, "step": 278 }, { "epoch": 0.10973451327433628, "grad_norm": 1.3070656061172485, "learning_rate": 4.998641497080749e-06, "loss": 0.0444, "step": 279 }, { "epoch": 0.11012782694198624, "grad_norm": 3.1283788681030273, "learning_rate": 4.998525941350264e-06, "loss": 0.1097, "step": 280 }, { "epoch": 0.11052114060963618, "grad_norm": 2.3517940044403076, "learning_rate": 4.998405670950747e-06, "loss": 0.0778, "step": 281 }, { "epoch": 0.11091445427728613, "grad_norm": 1.4366756677627563, "learning_rate": 4.998280686109146e-06, "loss": 0.0645, "step": 282 }, { "epoch": 0.11130776794493609, "grad_norm": 1.5536798238754272, "learning_rate": 4.998150987061304e-06, "loss": 0.0483, "step": 283 }, { "epoch": 0.11170108161258603, "grad_norm": 2.191906690597534, "learning_rate": 4.9980165740519625e-06, "loss": 0.061, "step": 284 }, { "epoch": 0.11209439528023599, "grad_norm": 2.2331135272979736, "learning_rate": 4.997877447334754e-06, "loss": 0.073, "step": 285 }, { "epoch": 0.11248770894788594, "grad_norm": 2.7030222415924072, "learning_rate": 4.99773360717221e-06, "loss": 0.0924, "step": 286 }, { "epoch": 0.11288102261553588, "grad_norm": 1.2399053573608398, "learning_rate": 4.997585053835754e-06, "loss": 0.0603, "step": 287 }, { "epoch": 0.11327433628318584, "grad_norm": 1.5186935663223267, "learning_rate": 4.997431787605701e-06, "loss": 0.0733, "step": 288 }, { "epoch": 0.1136676499508358, "grad_norm": 5.53955078125, "learning_rate": 4.997273808771263e-06, "loss": 0.0735, "step": 289 }, { "epoch": 0.11406096361848574, "grad_norm": 1.861646294593811, "learning_rate": 4.997111117630543e-06, "loss": 0.0365, "step": 290 }, { "epoch": 0.11445427728613569, "grad_norm": 1.5158923864364624, "learning_rate": 4.996943714490535e-06, "loss": 0.0598, "step": 291 }, { "epoch": 0.11484759095378565, "grad_norm": 3.7808361053466797, "learning_rate": 4.996771599667126e-06, "loss": 0.09, "step": 292 }, { "epoch": 0.11524090462143559, "grad_norm": 1.3470269441604614, "learning_rate": 4.996594773485093e-06, "loss": 0.0304, "step": 293 }, { "epoch": 0.11563421828908554, "grad_norm": 2.0843825340270996, "learning_rate": 4.996413236278104e-06, "loss": 0.0556, "step": 294 }, { "epoch": 0.1160275319567355, "grad_norm": 1.6657154560089111, "learning_rate": 4.996226988388716e-06, "loss": 0.0628, "step": 295 }, { "epoch": 0.11642084562438544, "grad_norm": 1.9300707578659058, "learning_rate": 4.9960360301683755e-06, "loss": 0.0701, "step": 296 }, { "epoch": 0.1168141592920354, "grad_norm": 1.6507627964019775, "learning_rate": 4.995840361977416e-06, "loss": 0.0783, "step": 297 }, { "epoch": 0.11720747295968535, "grad_norm": 1.9679419994354248, "learning_rate": 4.995639984185059e-06, "loss": 0.0714, "step": 298 }, { "epoch": 0.1176007866273353, "grad_norm": 1.7199714183807373, "learning_rate": 4.9954348971694146e-06, "loss": 0.046, "step": 299 }, { "epoch": 0.11799410029498525, "grad_norm": 1.3099826574325562, "learning_rate": 4.995225101317478e-06, "loss": 0.0542, "step": 300 }, { "epoch": 0.1183874139626352, "grad_norm": 1.4102526903152466, "learning_rate": 4.99501059702513e-06, "loss": 0.07, "step": 301 }, { "epoch": 0.11878072763028515, "grad_norm": 2.6054928302764893, "learning_rate": 4.9947913846971345e-06, "loss": 0.0753, "step": 302 }, { "epoch": 0.1191740412979351, "grad_norm": 2.4399526119232178, "learning_rate": 4.994567464747141e-06, "loss": 0.1051, "step": 303 }, { "epoch": 0.11956735496558506, "grad_norm": 3.065548896789551, "learning_rate": 4.994338837597683e-06, "loss": 0.0955, "step": 304 }, { "epoch": 0.119960668633235, "grad_norm": 1.3317792415618896, "learning_rate": 4.994105503680176e-06, "loss": 0.0595, "step": 305 }, { "epoch": 0.12035398230088495, "grad_norm": 1.5237491130828857, "learning_rate": 4.993867463434916e-06, "loss": 0.0909, "step": 306 }, { "epoch": 0.12074729596853491, "grad_norm": 0.8940740823745728, "learning_rate": 4.9936247173110785e-06, "loss": 0.0628, "step": 307 }, { "epoch": 0.12114060963618485, "grad_norm": 2.6642251014709473, "learning_rate": 4.993377265766723e-06, "loss": 0.0679, "step": 308 }, { "epoch": 0.1215339233038348, "grad_norm": 2.868943452835083, "learning_rate": 4.993125109268784e-06, "loss": 0.047, "step": 309 }, { "epoch": 0.12192723697148476, "grad_norm": 1.1550475358963013, "learning_rate": 4.992868248293077e-06, "loss": 0.0771, "step": 310 }, { "epoch": 0.1223205506391347, "grad_norm": 1.7380859851837158, "learning_rate": 4.9926066833242926e-06, "loss": 0.0573, "step": 311 }, { "epoch": 0.12271386430678466, "grad_norm": 1.8886913061141968, "learning_rate": 4.9923404148559995e-06, "loss": 0.1034, "step": 312 }, { "epoch": 0.12310717797443461, "grad_norm": 1.5682885646820068, "learning_rate": 4.992069443390641e-06, "loss": 0.0595, "step": 313 }, { "epoch": 0.12350049164208456, "grad_norm": 2.2674522399902344, "learning_rate": 4.991793769439534e-06, "loss": 0.0855, "step": 314 }, { "epoch": 0.12389380530973451, "grad_norm": 1.3800448179244995, "learning_rate": 4.991513393522871e-06, "loss": 0.0537, "step": 315 }, { "epoch": 0.12428711897738447, "grad_norm": 1.9727108478546143, "learning_rate": 4.991228316169715e-06, "loss": 0.0698, "step": 316 }, { "epoch": 0.12468043264503441, "grad_norm": 1.1997886896133423, "learning_rate": 4.990938537918001e-06, "loss": 0.0513, "step": 317 }, { "epoch": 0.12507374631268436, "grad_norm": 1.0357115268707275, "learning_rate": 4.990644059314536e-06, "loss": 0.0537, "step": 318 }, { "epoch": 0.1254670599803343, "grad_norm": 2.9861936569213867, "learning_rate": 4.990344880914994e-06, "loss": 0.0836, "step": 319 }, { "epoch": 0.12586037364798427, "grad_norm": 1.0183316469192505, "learning_rate": 4.990041003283921e-06, "loss": 0.0595, "step": 320 }, { "epoch": 0.12625368731563422, "grad_norm": 3.085170269012451, "learning_rate": 4.989732426994725e-06, "loss": 0.1097, "step": 321 }, { "epoch": 0.12664700098328416, "grad_norm": 1.6864210367202759, "learning_rate": 4.989419152629685e-06, "loss": 0.0546, "step": 322 }, { "epoch": 0.12704031465093413, "grad_norm": 1.678736686706543, "learning_rate": 4.9891011807799435e-06, "loss": 0.0436, "step": 323 }, { "epoch": 0.12743362831858407, "grad_norm": 1.6153947114944458, "learning_rate": 4.988778512045507e-06, "loss": 0.0885, "step": 324 }, { "epoch": 0.127826941986234, "grad_norm": 2.239644765853882, "learning_rate": 4.9884511470352456e-06, "loss": 0.0841, "step": 325 }, { "epoch": 0.12822025565388398, "grad_norm": 2.258629560470581, "learning_rate": 4.9881190863668895e-06, "loss": 0.0547, "step": 326 }, { "epoch": 0.12861356932153392, "grad_norm": 1.519643783569336, "learning_rate": 4.98778233066703e-06, "loss": 0.076, "step": 327 }, { "epoch": 0.12900688298918386, "grad_norm": 2.382768154144287, "learning_rate": 4.987440880571121e-06, "loss": 0.0754, "step": 328 }, { "epoch": 0.12940019665683383, "grad_norm": 1.1717922687530518, "learning_rate": 4.98709473672347e-06, "loss": 0.0431, "step": 329 }, { "epoch": 0.12979351032448377, "grad_norm": 2.597674608230591, "learning_rate": 4.986743899777244e-06, "loss": 0.0831, "step": 330 }, { "epoch": 0.13018682399213372, "grad_norm": 2.2018444538116455, "learning_rate": 4.986388370394466e-06, "loss": 0.0967, "step": 331 }, { "epoch": 0.13058013765978368, "grad_norm": 2.4188756942749023, "learning_rate": 4.986028149246013e-06, "loss": 0.0706, "step": 332 }, { "epoch": 0.13097345132743363, "grad_norm": 1.3178000450134277, "learning_rate": 4.985663237011614e-06, "loss": 0.0814, "step": 333 }, { "epoch": 0.13136676499508357, "grad_norm": 1.007521390914917, "learning_rate": 4.985293634379852e-06, "loss": 0.0518, "step": 334 }, { "epoch": 0.13176007866273354, "grad_norm": 2.3999087810516357, "learning_rate": 4.984919342048159e-06, "loss": 0.0526, "step": 335 }, { "epoch": 0.13215339233038348, "grad_norm": 2.07135272026062, "learning_rate": 4.984540360722819e-06, "loss": 0.0493, "step": 336 }, { "epoch": 0.13254670599803342, "grad_norm": 1.2785420417785645, "learning_rate": 4.98415669111896e-06, "loss": 0.0671, "step": 337 }, { "epoch": 0.1329400196656834, "grad_norm": 1.264936089515686, "learning_rate": 4.9837683339605615e-06, "loss": 0.0619, "step": 338 }, { "epoch": 0.13333333333333333, "grad_norm": 2.3385870456695557, "learning_rate": 4.983375289980443e-06, "loss": 0.1164, "step": 339 }, { "epoch": 0.13372664700098327, "grad_norm": 2.5312047004699707, "learning_rate": 4.982977559920273e-06, "loss": 0.1017, "step": 340 }, { "epoch": 0.13411996066863324, "grad_norm": 1.6104050874710083, "learning_rate": 4.982575144530559e-06, "loss": 0.0647, "step": 341 }, { "epoch": 0.13451327433628318, "grad_norm": 1.557822346687317, "learning_rate": 4.982168044570652e-06, "loss": 0.0546, "step": 342 }, { "epoch": 0.13490658800393313, "grad_norm": 1.430794596672058, "learning_rate": 4.981756260808741e-06, "loss": 0.0553, "step": 343 }, { "epoch": 0.1352999016715831, "grad_norm": 1.718525767326355, "learning_rate": 4.981339794021853e-06, "loss": 0.0633, "step": 344 }, { "epoch": 0.13569321533923304, "grad_norm": 0.9465076327323914, "learning_rate": 4.9809186449958536e-06, "loss": 0.0468, "step": 345 }, { "epoch": 0.13608652900688298, "grad_norm": 1.7588387727737427, "learning_rate": 4.980492814525442e-06, "loss": 0.0687, "step": 346 }, { "epoch": 0.13647984267453295, "grad_norm": 1.392269492149353, "learning_rate": 4.980062303414152e-06, "loss": 0.0363, "step": 347 }, { "epoch": 0.1368731563421829, "grad_norm": 2.146742582321167, "learning_rate": 4.97962711247435e-06, "loss": 0.0604, "step": 348 }, { "epoch": 0.13726647000983283, "grad_norm": 2.926267385482788, "learning_rate": 4.979187242527233e-06, "loss": 0.086, "step": 349 }, { "epoch": 0.1376597836774828, "grad_norm": 1.9409819841384888, "learning_rate": 4.978742694402825e-06, "loss": 0.0588, "step": 350 }, { "epoch": 0.13805309734513274, "grad_norm": 1.8433561325073242, "learning_rate": 4.978293468939982e-06, "loss": 0.0676, "step": 351 }, { "epoch": 0.13844641101278268, "grad_norm": 2.0934383869171143, "learning_rate": 4.977839566986382e-06, "loss": 0.0713, "step": 352 }, { "epoch": 0.13883972468043265, "grad_norm": 1.8030976057052612, "learning_rate": 4.977380989398529e-06, "loss": 0.1169, "step": 353 }, { "epoch": 0.1392330383480826, "grad_norm": 2.014277935028076, "learning_rate": 4.976917737041751e-06, "loss": 0.0376, "step": 354 }, { "epoch": 0.13962635201573254, "grad_norm": 1.3366997241973877, "learning_rate": 4.976449810790196e-06, "loss": 0.0644, "step": 355 }, { "epoch": 0.1400196656833825, "grad_norm": 1.63720703125, "learning_rate": 4.97597721152683e-06, "loss": 0.067, "step": 356 }, { "epoch": 0.14041297935103245, "grad_norm": 2.317793846130371, "learning_rate": 4.975499940143439e-06, "loss": 0.0732, "step": 357 }, { "epoch": 0.1408062930186824, "grad_norm": 1.352824330329895, "learning_rate": 4.975017997540625e-06, "loss": 0.0721, "step": 358 }, { "epoch": 0.14119960668633236, "grad_norm": 1.2860400676727295, "learning_rate": 4.974531384627805e-06, "loss": 0.0604, "step": 359 }, { "epoch": 0.1415929203539823, "grad_norm": 2.315216064453125, "learning_rate": 4.974040102323207e-06, "loss": 0.0492, "step": 360 }, { "epoch": 0.14198623402163224, "grad_norm": 1.771453857421875, "learning_rate": 4.973544151553869e-06, "loss": 0.0554, "step": 361 }, { "epoch": 0.1423795476892822, "grad_norm": 0.9052230715751648, "learning_rate": 4.973043533255645e-06, "loss": 0.0524, "step": 362 }, { "epoch": 0.14277286135693215, "grad_norm": 2.327606439590454, "learning_rate": 4.972538248373188e-06, "loss": 0.0583, "step": 363 }, { "epoch": 0.1431661750245821, "grad_norm": 2.986643075942993, "learning_rate": 4.9720282978599625e-06, "loss": 0.0726, "step": 364 }, { "epoch": 0.14355948869223206, "grad_norm": 1.1824491024017334, "learning_rate": 4.971513682678234e-06, "loss": 0.0749, "step": 365 }, { "epoch": 0.143952802359882, "grad_norm": 3.0968868732452393, "learning_rate": 4.970994403799072e-06, "loss": 0.0547, "step": 366 }, { "epoch": 0.14434611602753195, "grad_norm": 1.2194032669067383, "learning_rate": 4.970470462202343e-06, "loss": 0.0651, "step": 367 }, { "epoch": 0.14473942969518191, "grad_norm": 1.3438714742660522, "learning_rate": 4.969941858876719e-06, "loss": 0.0416, "step": 368 }, { "epoch": 0.14513274336283186, "grad_norm": 1.4193546772003174, "learning_rate": 4.96940859481966e-06, "loss": 0.06, "step": 369 }, { "epoch": 0.1455260570304818, "grad_norm": 1.2842000722885132, "learning_rate": 4.968870671037427e-06, "loss": 0.0598, "step": 370 }, { "epoch": 0.14591937069813177, "grad_norm": 2.3905892372131348, "learning_rate": 4.96832808854507e-06, "loss": 0.0652, "step": 371 }, { "epoch": 0.1463126843657817, "grad_norm": 1.5380994081497192, "learning_rate": 4.967780848366432e-06, "loss": 0.1034, "step": 372 }, { "epoch": 0.14670599803343165, "grad_norm": 1.3698018789291382, "learning_rate": 4.967228951534144e-06, "loss": 0.0695, "step": 373 }, { "epoch": 0.14709931170108162, "grad_norm": 1.6553199291229248, "learning_rate": 4.966672399089626e-06, "loss": 0.0358, "step": 374 }, { "epoch": 0.14749262536873156, "grad_norm": 1.966484546661377, "learning_rate": 4.966111192083081e-06, "loss": 0.0396, "step": 375 }, { "epoch": 0.1478859390363815, "grad_norm": 1.1057041883468628, "learning_rate": 4.965545331573493e-06, "loss": 0.0294, "step": 376 }, { "epoch": 0.14827925270403147, "grad_norm": 1.3603320121765137, "learning_rate": 4.964974818628633e-06, "loss": 0.0431, "step": 377 }, { "epoch": 0.1486725663716814, "grad_norm": 3.8050637245178223, "learning_rate": 4.964399654325045e-06, "loss": 0.063, "step": 378 }, { "epoch": 0.14906588003933136, "grad_norm": 1.361873984336853, "learning_rate": 4.963819839748055e-06, "loss": 0.0258, "step": 379 }, { "epoch": 0.14945919370698132, "grad_norm": 1.0739333629608154, "learning_rate": 4.96323537599176e-06, "loss": 0.0553, "step": 380 }, { "epoch": 0.14985250737463127, "grad_norm": 1.5606439113616943, "learning_rate": 4.962646264159031e-06, "loss": 0.0341, "step": 381 }, { "epoch": 0.1502458210422812, "grad_norm": 1.526953101158142, "learning_rate": 4.962052505361512e-06, "loss": 0.0693, "step": 382 }, { "epoch": 0.15063913470993118, "grad_norm": 3.761380195617676, "learning_rate": 4.9614541007196136e-06, "loss": 0.0685, "step": 383 }, { "epoch": 0.15103244837758112, "grad_norm": 2.7432498931884766, "learning_rate": 4.960851051362514e-06, "loss": 0.0501, "step": 384 }, { "epoch": 0.15142576204523106, "grad_norm": 2.669240951538086, "learning_rate": 4.960243358428154e-06, "loss": 0.1198, "step": 385 }, { "epoch": 0.15181907571288103, "grad_norm": 1.5905970335006714, "learning_rate": 4.959631023063238e-06, "loss": 0.0803, "step": 386 }, { "epoch": 0.15221238938053097, "grad_norm": 1.1858878135681152, "learning_rate": 4.959014046423233e-06, "loss": 0.0654, "step": 387 }, { "epoch": 0.1526057030481809, "grad_norm": 1.7795485258102417, "learning_rate": 4.9583924296723606e-06, "loss": 0.0598, "step": 388 }, { "epoch": 0.15299901671583088, "grad_norm": 1.2830811738967896, "learning_rate": 4.957766173983598e-06, "loss": 0.0437, "step": 389 }, { "epoch": 0.15339233038348082, "grad_norm": 0.8960599303245544, "learning_rate": 4.9571352805386795e-06, "loss": 0.0455, "step": 390 }, { "epoch": 0.15378564405113077, "grad_norm": 2.005126714706421, "learning_rate": 4.956499750528086e-06, "loss": 0.0755, "step": 391 }, { "epoch": 0.15417895771878073, "grad_norm": 1.5545151233673096, "learning_rate": 4.955859585151054e-06, "loss": 0.0449, "step": 392 }, { "epoch": 0.15457227138643068, "grad_norm": 1.0876412391662598, "learning_rate": 4.955214785615558e-06, "loss": 0.0718, "step": 393 }, { "epoch": 0.15496558505408062, "grad_norm": 1.9705466032028198, "learning_rate": 4.9545653531383255e-06, "loss": 0.0612, "step": 394 }, { "epoch": 0.1553588987217306, "grad_norm": 1.3790346384048462, "learning_rate": 4.953911288944821e-06, "loss": 0.0371, "step": 395 }, { "epoch": 0.15575221238938053, "grad_norm": 1.0736052989959717, "learning_rate": 4.953252594269252e-06, "loss": 0.056, "step": 396 }, { "epoch": 0.15614552605703047, "grad_norm": 1.919756531715393, "learning_rate": 4.9525892703545604e-06, "loss": 0.0737, "step": 397 }, { "epoch": 0.15653883972468044, "grad_norm": 1.333601713180542, "learning_rate": 4.951921318452428e-06, "loss": 0.0628, "step": 398 }, { "epoch": 0.15693215339233038, "grad_norm": 1.5093313455581665, "learning_rate": 4.951248739823264e-06, "loss": 0.0677, "step": 399 }, { "epoch": 0.15732546705998032, "grad_norm": 1.5697554349899292, "learning_rate": 4.950571535736214e-06, "loss": 0.0672, "step": 400 }, { "epoch": 0.1577187807276303, "grad_norm": 1.4692028760910034, "learning_rate": 4.949889707469145e-06, "loss": 0.0472, "step": 401 }, { "epoch": 0.15811209439528023, "grad_norm": 0.9199762940406799, "learning_rate": 4.949203256308658e-06, "loss": 0.0661, "step": 402 }, { "epoch": 0.15850540806293018, "grad_norm": 1.4585742950439453, "learning_rate": 4.948512183550068e-06, "loss": 0.0776, "step": 403 }, { "epoch": 0.15889872173058014, "grad_norm": 1.2560405731201172, "learning_rate": 4.947816490497419e-06, "loss": 0.0932, "step": 404 }, { "epoch": 0.1592920353982301, "grad_norm": 1.6395833492279053, "learning_rate": 4.947116178463469e-06, "loss": 0.0399, "step": 405 }, { "epoch": 0.15968534906588003, "grad_norm": 0.8655360341072083, "learning_rate": 4.946411248769693e-06, "loss": 0.0421, "step": 406 }, { "epoch": 0.16007866273353, "grad_norm": 0.9741353392601013, "learning_rate": 4.945701702746279e-06, "loss": 0.0469, "step": 407 }, { "epoch": 0.16047197640117994, "grad_norm": 0.9401141405105591, "learning_rate": 4.944987541732126e-06, "loss": 0.0668, "step": 408 }, { "epoch": 0.16086529006882988, "grad_norm": 0.8718335032463074, "learning_rate": 4.944268767074842e-06, "loss": 0.0597, "step": 409 }, { "epoch": 0.16125860373647985, "grad_norm": 1.3456203937530518, "learning_rate": 4.943545380130742e-06, "loss": 0.0755, "step": 410 }, { "epoch": 0.1616519174041298, "grad_norm": 1.1579302549362183, "learning_rate": 4.942817382264842e-06, "loss": 0.0583, "step": 411 }, { "epoch": 0.16204523107177973, "grad_norm": 1.664872169494629, "learning_rate": 4.942084774850858e-06, "loss": 0.0777, "step": 412 }, { "epoch": 0.1624385447394297, "grad_norm": 2.256772518157959, "learning_rate": 4.941347559271208e-06, "loss": 0.0734, "step": 413 }, { "epoch": 0.16283185840707964, "grad_norm": 1.235349416732788, "learning_rate": 4.9406057369170015e-06, "loss": 0.051, "step": 414 }, { "epoch": 0.16322517207472959, "grad_norm": 1.6716983318328857, "learning_rate": 4.939859309188044e-06, "loss": 0.0728, "step": 415 }, { "epoch": 0.16361848574237955, "grad_norm": 1.3591656684875488, "learning_rate": 4.939108277492829e-06, "loss": 0.0725, "step": 416 }, { "epoch": 0.1640117994100295, "grad_norm": 0.6709238886833191, "learning_rate": 4.9383526432485375e-06, "loss": 0.0452, "step": 417 }, { "epoch": 0.16440511307767944, "grad_norm": 1.2356040477752686, "learning_rate": 4.937592407881039e-06, "loss": 0.0682, "step": 418 }, { "epoch": 0.1647984267453294, "grad_norm": 1.0750470161437988, "learning_rate": 4.93682757282488e-06, "loss": 0.0383, "step": 419 }, { "epoch": 0.16519174041297935, "grad_norm": 1.5483283996582031, "learning_rate": 4.936058139523291e-06, "loss": 0.0645, "step": 420 }, { "epoch": 0.1655850540806293, "grad_norm": 2.0328383445739746, "learning_rate": 4.935284109428177e-06, "loss": 0.0623, "step": 421 }, { "epoch": 0.16597836774827926, "grad_norm": 1.5979444980621338, "learning_rate": 4.934505484000116e-06, "loss": 0.0751, "step": 422 }, { "epoch": 0.1663716814159292, "grad_norm": 1.1430745124816895, "learning_rate": 4.93372226470836e-06, "loss": 0.0542, "step": 423 }, { "epoch": 0.16676499508357914, "grad_norm": 2.062899112701416, "learning_rate": 4.932934453030829e-06, "loss": 0.0873, "step": 424 }, { "epoch": 0.1671583087512291, "grad_norm": 3.2697086334228516, "learning_rate": 4.932142050454107e-06, "loss": 0.0733, "step": 425 }, { "epoch": 0.16755162241887905, "grad_norm": 1.2826026678085327, "learning_rate": 4.931345058473443e-06, "loss": 0.0497, "step": 426 }, { "epoch": 0.167944936086529, "grad_norm": 2.3819937705993652, "learning_rate": 4.930543478592743e-06, "loss": 0.0789, "step": 427 }, { "epoch": 0.16833824975417896, "grad_norm": 2.840121030807495, "learning_rate": 4.929737312324574e-06, "loss": 0.054, "step": 428 }, { "epoch": 0.1687315634218289, "grad_norm": 0.6918103098869324, "learning_rate": 4.928926561190155e-06, "loss": 0.0448, "step": 429 }, { "epoch": 0.16912487708947885, "grad_norm": 0.8336203694343567, "learning_rate": 4.928111226719359e-06, "loss": 0.0629, "step": 430 }, { "epoch": 0.16951819075712882, "grad_norm": 1.9415661096572876, "learning_rate": 4.927291310450705e-06, "loss": 0.0731, "step": 431 }, { "epoch": 0.16991150442477876, "grad_norm": 1.3499138355255127, "learning_rate": 4.926466813931358e-06, "loss": 0.0562, "step": 432 }, { "epoch": 0.1703048180924287, "grad_norm": 1.0689488649368286, "learning_rate": 4.925637738717127e-06, "loss": 0.0706, "step": 433 }, { "epoch": 0.17069813176007867, "grad_norm": 2.7924535274505615, "learning_rate": 4.924804086372462e-06, "loss": 0.0671, "step": 434 }, { "epoch": 0.1710914454277286, "grad_norm": 0.8586186170578003, "learning_rate": 4.9239658584704466e-06, "loss": 0.049, "step": 435 }, { "epoch": 0.17148475909537855, "grad_norm": 1.8235011100769043, "learning_rate": 4.923123056592801e-06, "loss": 0.0715, "step": 436 }, { "epoch": 0.17187807276302852, "grad_norm": 1.1591852903366089, "learning_rate": 4.922275682329876e-06, "loss": 0.0799, "step": 437 }, { "epoch": 0.17227138643067846, "grad_norm": 1.2786961793899536, "learning_rate": 4.921423737280649e-06, "loss": 0.0561, "step": 438 }, { "epoch": 0.1726647000983284, "grad_norm": 1.602005958557129, "learning_rate": 4.9205672230527254e-06, "loss": 0.0517, "step": 439 }, { "epoch": 0.17305801376597837, "grad_norm": 1.3069565296173096, "learning_rate": 4.919706141262329e-06, "loss": 0.063, "step": 440 }, { "epoch": 0.17345132743362832, "grad_norm": 1.4721592664718628, "learning_rate": 4.918840493534305e-06, "loss": 0.0789, "step": 441 }, { "epoch": 0.17384464110127826, "grad_norm": 2.0551934242248535, "learning_rate": 4.917970281502112e-06, "loss": 0.0711, "step": 442 }, { "epoch": 0.17423795476892823, "grad_norm": 1.175560474395752, "learning_rate": 4.917095506807824e-06, "loss": 0.0646, "step": 443 }, { "epoch": 0.17463126843657817, "grad_norm": 1.3429381847381592, "learning_rate": 4.916216171102124e-06, "loss": 0.0609, "step": 444 }, { "epoch": 0.1750245821042281, "grad_norm": 1.306825041770935, "learning_rate": 4.9153322760443015e-06, "loss": 0.0529, "step": 445 }, { "epoch": 0.17541789577187808, "grad_norm": 1.4618321657180786, "learning_rate": 4.914443823302246e-06, "loss": 0.0509, "step": 446 }, { "epoch": 0.17581120943952802, "grad_norm": 1.054541826248169, "learning_rate": 4.913550814552454e-06, "loss": 0.0613, "step": 447 }, { "epoch": 0.17620452310717796, "grad_norm": 0.9349273443222046, "learning_rate": 4.912653251480013e-06, "loss": 0.0531, "step": 448 }, { "epoch": 0.17659783677482793, "grad_norm": 1.302675724029541, "learning_rate": 4.9117511357786075e-06, "loss": 0.0661, "step": 449 }, { "epoch": 0.17699115044247787, "grad_norm": 2.327521562576294, "learning_rate": 4.910844469150512e-06, "loss": 0.08, "step": 450 }, { "epoch": 0.17738446411012782, "grad_norm": 1.7499988079071045, "learning_rate": 4.909933253306588e-06, "loss": 0.0368, "step": 451 }, { "epoch": 0.17777777777777778, "grad_norm": 1.1263257265090942, "learning_rate": 4.909017489966283e-06, "loss": 0.0322, "step": 452 }, { "epoch": 0.17817109144542773, "grad_norm": 2.8002772331237793, "learning_rate": 4.9080971808576226e-06, "loss": 0.0597, "step": 453 }, { "epoch": 0.17856440511307767, "grad_norm": 2.0555684566497803, "learning_rate": 4.907172327717214e-06, "loss": 0.0754, "step": 454 }, { "epoch": 0.17895771878072764, "grad_norm": 2.3041601181030273, "learning_rate": 4.906242932290234e-06, "loss": 0.0838, "step": 455 }, { "epoch": 0.17935103244837758, "grad_norm": 2.3882484436035156, "learning_rate": 4.905308996330437e-06, "loss": 0.063, "step": 456 }, { "epoch": 0.17974434611602752, "grad_norm": 1.4339286088943481, "learning_rate": 4.904370521600138e-06, "loss": 0.0723, "step": 457 }, { "epoch": 0.1801376597836775, "grad_norm": 1.387052059173584, "learning_rate": 4.903427509870222e-06, "loss": 0.0708, "step": 458 }, { "epoch": 0.18053097345132743, "grad_norm": 0.8694115877151489, "learning_rate": 4.902479962920134e-06, "loss": 0.0519, "step": 459 }, { "epoch": 0.18092428711897737, "grad_norm": 1.0308964252471924, "learning_rate": 4.901527882537876e-06, "loss": 0.054, "step": 460 }, { "epoch": 0.18131760078662734, "grad_norm": 2.4914846420288086, "learning_rate": 4.900571270520004e-06, "loss": 0.115, "step": 461 }, { "epoch": 0.18171091445427728, "grad_norm": 2.637059450149536, "learning_rate": 4.899610128671626e-06, "loss": 0.0851, "step": 462 }, { "epoch": 0.18210422812192723, "grad_norm": 1.9722718000411987, "learning_rate": 4.898644458806398e-06, "loss": 0.0637, "step": 463 }, { "epoch": 0.1824975417895772, "grad_norm": 0.9795344471931458, "learning_rate": 4.897674262746522e-06, "loss": 0.0622, "step": 464 }, { "epoch": 0.18289085545722714, "grad_norm": 1.2904670238494873, "learning_rate": 4.896699542322736e-06, "loss": 0.0384, "step": 465 }, { "epoch": 0.18328416912487708, "grad_norm": 1.4417036771774292, "learning_rate": 4.895720299374319e-06, "loss": 0.1118, "step": 466 }, { "epoch": 0.18367748279252705, "grad_norm": 1.6243058443069458, "learning_rate": 4.894736535749083e-06, "loss": 0.0756, "step": 467 }, { "epoch": 0.184070796460177, "grad_norm": 1.0999799966812134, "learning_rate": 4.89374825330337e-06, "loss": 0.0525, "step": 468 }, { "epoch": 0.18446411012782693, "grad_norm": 1.9067320823669434, "learning_rate": 4.892755453902051e-06, "loss": 0.066, "step": 469 }, { "epoch": 0.1848574237954769, "grad_norm": 1.1623554229736328, "learning_rate": 4.8917581394185175e-06, "loss": 0.0547, "step": 470 }, { "epoch": 0.18525073746312684, "grad_norm": 1.2230125665664673, "learning_rate": 4.890756311734683e-06, "loss": 0.0753, "step": 471 }, { "epoch": 0.18564405113077678, "grad_norm": 1.376905083656311, "learning_rate": 4.8897499727409755e-06, "loss": 0.0637, "step": 472 }, { "epoch": 0.18603736479842675, "grad_norm": 2.381087064743042, "learning_rate": 4.888739124336338e-06, "loss": 0.0818, "step": 473 }, { "epoch": 0.1864306784660767, "grad_norm": 1.5327961444854736, "learning_rate": 4.8877237684282205e-06, "loss": 0.0689, "step": 474 }, { "epoch": 0.18682399213372664, "grad_norm": 1.7480573654174805, "learning_rate": 4.8867039069325804e-06, "loss": 0.0713, "step": 475 }, { "epoch": 0.1872173058013766, "grad_norm": 1.2657626867294312, "learning_rate": 4.8856795417738754e-06, "loss": 0.0742, "step": 476 }, { "epoch": 0.18761061946902655, "grad_norm": 1.0295419692993164, "learning_rate": 4.884650674885062e-06, "loss": 0.0448, "step": 477 }, { "epoch": 0.1880039331366765, "grad_norm": 1.9904601573944092, "learning_rate": 4.883617308207592e-06, "loss": 0.0801, "step": 478 }, { "epoch": 0.18839724680432646, "grad_norm": 1.4027286767959595, "learning_rate": 4.88257944369141e-06, "loss": 0.0502, "step": 479 }, { "epoch": 0.1887905604719764, "grad_norm": 2.087235689163208, "learning_rate": 4.8815370832949425e-06, "loss": 0.1021, "step": 480 }, { "epoch": 0.18918387413962634, "grad_norm": 0.8643338680267334, "learning_rate": 4.880490228985104e-06, "loss": 0.0732, "step": 481 }, { "epoch": 0.1895771878072763, "grad_norm": 1.4668515920639038, "learning_rate": 4.8794388827372884e-06, "loss": 0.0548, "step": 482 }, { "epoch": 0.18997050147492625, "grad_norm": 1.8225198984146118, "learning_rate": 4.878383046535366e-06, "loss": 0.0882, "step": 483 }, { "epoch": 0.1903638151425762, "grad_norm": 1.6394109725952148, "learning_rate": 4.877322722371677e-06, "loss": 0.1029, "step": 484 }, { "epoch": 0.19075712881022616, "grad_norm": 0.9612401723861694, "learning_rate": 4.876257912247033e-06, "loss": 0.0442, "step": 485 }, { "epoch": 0.1911504424778761, "grad_norm": 2.0715410709381104, "learning_rate": 4.8751886181707105e-06, "loss": 0.0793, "step": 486 }, { "epoch": 0.19154375614552605, "grad_norm": 1.14213228225708, "learning_rate": 4.874114842160445e-06, "loss": 0.0782, "step": 487 }, { "epoch": 0.19193706981317601, "grad_norm": 1.7314140796661377, "learning_rate": 4.873036586242431e-06, "loss": 0.0478, "step": 488 }, { "epoch": 0.19233038348082596, "grad_norm": 0.6948450803756714, "learning_rate": 4.871953852451316e-06, "loss": 0.0546, "step": 489 }, { "epoch": 0.1927236971484759, "grad_norm": 1.9421541690826416, "learning_rate": 4.8708666428301975e-06, "loss": 0.0793, "step": 490 }, { "epoch": 0.19311701081612587, "grad_norm": 0.5670569539070129, "learning_rate": 4.869774959430619e-06, "loss": 0.0506, "step": 491 }, { "epoch": 0.1935103244837758, "grad_norm": 1.437902808189392, "learning_rate": 4.868678804312565e-06, "loss": 0.0545, "step": 492 }, { "epoch": 0.19390363815142575, "grad_norm": 1.8984867334365845, "learning_rate": 4.867578179544457e-06, "loss": 0.0658, "step": 493 }, { "epoch": 0.19429695181907572, "grad_norm": 2.0684666633605957, "learning_rate": 4.866473087203154e-06, "loss": 0.0565, "step": 494 }, { "epoch": 0.19469026548672566, "grad_norm": 1.5473408699035645, "learning_rate": 4.865363529373944e-06, "loss": 0.0481, "step": 495 }, { "epoch": 0.1950835791543756, "grad_norm": 1.678281545639038, "learning_rate": 4.864249508150539e-06, "loss": 0.056, "step": 496 }, { "epoch": 0.19547689282202557, "grad_norm": 1.3713724613189697, "learning_rate": 4.863131025635076e-06, "loss": 0.0474, "step": 497 }, { "epoch": 0.1958702064896755, "grad_norm": 2.0483641624450684, "learning_rate": 4.862008083938109e-06, "loss": 0.0712, "step": 498 }, { "epoch": 0.19626352015732546, "grad_norm": 1.701915979385376, "learning_rate": 4.8608806851786075e-06, "loss": 0.0642, "step": 499 }, { "epoch": 0.19665683382497542, "grad_norm": 1.4159979820251465, "learning_rate": 4.859748831483949e-06, "loss": 0.0706, "step": 500 }, { "epoch": 0.19705014749262537, "grad_norm": 0.9921556711196899, "learning_rate": 4.858612524989921e-06, "loss": 0.0311, "step": 501 }, { "epoch": 0.1974434611602753, "grad_norm": 0.6453993320465088, "learning_rate": 4.857471767840709e-06, "loss": 0.0304, "step": 502 }, { "epoch": 0.19783677482792528, "grad_norm": 2.1691184043884277, "learning_rate": 4.856326562188902e-06, "loss": 0.0573, "step": 503 }, { "epoch": 0.19823008849557522, "grad_norm": 1.424170732498169, "learning_rate": 4.855176910195479e-06, "loss": 0.0371, "step": 504 }, { "epoch": 0.19862340216322516, "grad_norm": 2.0996835231781006, "learning_rate": 4.854022814029809e-06, "loss": 0.06, "step": 505 }, { "epoch": 0.19901671583087513, "grad_norm": 2.2325479984283447, "learning_rate": 4.852864275869652e-06, "loss": 0.0686, "step": 506 }, { "epoch": 0.19941002949852507, "grad_norm": 1.8133199214935303, "learning_rate": 4.851701297901144e-06, "loss": 0.0811, "step": 507 }, { "epoch": 0.199803343166175, "grad_norm": 1.4886740446090698, "learning_rate": 4.850533882318803e-06, "loss": 0.0516, "step": 508 }, { "epoch": 0.20019665683382498, "grad_norm": 1.685327172279358, "learning_rate": 4.849362031325518e-06, "loss": 0.0427, "step": 509 }, { "epoch": 0.20058997050147492, "grad_norm": 2.726207733154297, "learning_rate": 4.8481857471325485e-06, "loss": 0.0686, "step": 510 }, { "epoch": 0.20098328416912487, "grad_norm": 1.1494991779327393, "learning_rate": 4.847005031959521e-06, "loss": 0.0642, "step": 511 }, { "epoch": 0.20137659783677483, "grad_norm": 2.118980884552002, "learning_rate": 4.84581988803442e-06, "loss": 0.0504, "step": 512 }, { "epoch": 0.20176991150442478, "grad_norm": 1.4535127878189087, "learning_rate": 4.84463031759359e-06, "loss": 0.0482, "step": 513 }, { "epoch": 0.20216322517207472, "grad_norm": 0.8411951065063477, "learning_rate": 4.843436322881725e-06, "loss": 0.0491, "step": 514 }, { "epoch": 0.2025565388397247, "grad_norm": 0.9351110458374023, "learning_rate": 4.8422379061518705e-06, "loss": 0.0278, "step": 515 }, { "epoch": 0.20294985250737463, "grad_norm": 1.2653199434280396, "learning_rate": 4.841035069665416e-06, "loss": 0.0494, "step": 516 }, { "epoch": 0.20334316617502457, "grad_norm": 2.1194064617156982, "learning_rate": 4.83982781569209e-06, "loss": 0.0985, "step": 517 }, { "epoch": 0.20373647984267454, "grad_norm": 0.9621169567108154, "learning_rate": 4.838616146509956e-06, "loss": 0.0681, "step": 518 }, { "epoch": 0.20412979351032448, "grad_norm": 2.935671091079712, "learning_rate": 4.83740006440541e-06, "loss": 0.1056, "step": 519 }, { "epoch": 0.20452310717797442, "grad_norm": 1.5503019094467163, "learning_rate": 4.8361795716731744e-06, "loss": 0.0736, "step": 520 }, { "epoch": 0.2049164208456244, "grad_norm": 1.5426656007766724, "learning_rate": 4.8349546706162965e-06, "loss": 0.0768, "step": 521 }, { "epoch": 0.20530973451327433, "grad_norm": 1.788036823272705, "learning_rate": 4.833725363546139e-06, "loss": 0.0785, "step": 522 }, { "epoch": 0.20570304818092428, "grad_norm": 1.3642781972885132, "learning_rate": 4.8324916527823795e-06, "loss": 0.0582, "step": 523 }, { "epoch": 0.20609636184857424, "grad_norm": 2.6498544216156006, "learning_rate": 4.831253540653007e-06, "loss": 0.068, "step": 524 }, { "epoch": 0.20648967551622419, "grad_norm": 1.3358078002929688, "learning_rate": 4.8300110294943145e-06, "loss": 0.0689, "step": 525 }, { "epoch": 0.20688298918387413, "grad_norm": 2.4475595951080322, "learning_rate": 4.828764121650896e-06, "loss": 0.0685, "step": 526 }, { "epoch": 0.2072763028515241, "grad_norm": 1.8231087923049927, "learning_rate": 4.827512819475641e-06, "loss": 0.061, "step": 527 }, { "epoch": 0.20766961651917404, "grad_norm": 1.6098417043685913, "learning_rate": 4.826257125329733e-06, "loss": 0.0775, "step": 528 }, { "epoch": 0.20806293018682398, "grad_norm": 1.2955044507980347, "learning_rate": 4.824997041582641e-06, "loss": 0.0828, "step": 529 }, { "epoch": 0.20845624385447395, "grad_norm": 1.600419282913208, "learning_rate": 4.82373257061212e-06, "loss": 0.0868, "step": 530 }, { "epoch": 0.2088495575221239, "grad_norm": 1.2169928550720215, "learning_rate": 4.8224637148042e-06, "loss": 0.0543, "step": 531 }, { "epoch": 0.20924287118977383, "grad_norm": 1.6863512992858887, "learning_rate": 4.821190476553186e-06, "loss": 0.0703, "step": 532 }, { "epoch": 0.2096361848574238, "grad_norm": 1.9771099090576172, "learning_rate": 4.819912858261656e-06, "loss": 0.0799, "step": 533 }, { "epoch": 0.21002949852507374, "grad_norm": 1.276354432106018, "learning_rate": 4.818630862340449e-06, "loss": 0.0661, "step": 534 }, { "epoch": 0.21042281219272368, "grad_norm": 1.1068519353866577, "learning_rate": 4.817344491208665e-06, "loss": 0.0496, "step": 535 }, { "epoch": 0.21081612586037365, "grad_norm": 1.1699997186660767, "learning_rate": 4.816053747293663e-06, "loss": 0.0395, "step": 536 }, { "epoch": 0.2112094395280236, "grad_norm": 1.290640115737915, "learning_rate": 4.814758633031049e-06, "loss": 0.0526, "step": 537 }, { "epoch": 0.21160275319567354, "grad_norm": 1.8085367679595947, "learning_rate": 4.813459150864681e-06, "loss": 0.0593, "step": 538 }, { "epoch": 0.2119960668633235, "grad_norm": 1.6277810335159302, "learning_rate": 4.812155303246653e-06, "loss": 0.0645, "step": 539 }, { "epoch": 0.21238938053097345, "grad_norm": 0.9544056057929993, "learning_rate": 4.810847092637301e-06, "loss": 0.063, "step": 540 }, { "epoch": 0.2127826941986234, "grad_norm": 1.349601149559021, "learning_rate": 4.809534521505192e-06, "loss": 0.0877, "step": 541 }, { "epoch": 0.21317600786627336, "grad_norm": 1.6013360023498535, "learning_rate": 4.8082175923271235e-06, "loss": 0.0637, "step": 542 }, { "epoch": 0.2135693215339233, "grad_norm": 1.130764365196228, "learning_rate": 4.806896307588113e-06, "loss": 0.086, "step": 543 }, { "epoch": 0.21396263520157324, "grad_norm": 1.40028715133667, "learning_rate": 4.805570669781399e-06, "loss": 0.0876, "step": 544 }, { "epoch": 0.2143559488692232, "grad_norm": 1.7551463842391968, "learning_rate": 4.804240681408434e-06, "loss": 0.0593, "step": 545 }, { "epoch": 0.21474926253687315, "grad_norm": 1.648735523223877, "learning_rate": 4.802906344978881e-06, "loss": 0.0772, "step": 546 }, { "epoch": 0.2151425762045231, "grad_norm": 0.8385063409805298, "learning_rate": 4.801567663010605e-06, "loss": 0.0706, "step": 547 }, { "epoch": 0.21553588987217306, "grad_norm": 1.8120150566101074, "learning_rate": 4.800224638029672e-06, "loss": 0.0696, "step": 548 }, { "epoch": 0.215929203539823, "grad_norm": 0.5346795916557312, "learning_rate": 4.798877272570343e-06, "loss": 0.0494, "step": 549 }, { "epoch": 0.21632251720747295, "grad_norm": 1.4182865619659424, "learning_rate": 4.797525569175073e-06, "loss": 0.0711, "step": 550 }, { "epoch": 0.21671583087512292, "grad_norm": 0.9838932752609253, "learning_rate": 4.796169530394498e-06, "loss": 0.0843, "step": 551 }, { "epoch": 0.21710914454277286, "grad_norm": 1.5188270807266235, "learning_rate": 4.7948091587874355e-06, "loss": 0.0663, "step": 552 }, { "epoch": 0.2175024582104228, "grad_norm": 1.796202540397644, "learning_rate": 4.793444456920881e-06, "loss": 0.0655, "step": 553 }, { "epoch": 0.21789577187807277, "grad_norm": 1.4925826787948608, "learning_rate": 4.7920754273699985e-06, "loss": 0.0607, "step": 554 }, { "epoch": 0.2182890855457227, "grad_norm": 1.2840732336044312, "learning_rate": 4.790702072718121e-06, "loss": 0.0634, "step": 555 }, { "epoch": 0.21868239921337265, "grad_norm": 1.0566197633743286, "learning_rate": 4.789324395556741e-06, "loss": 0.0475, "step": 556 }, { "epoch": 0.21907571288102262, "grad_norm": 1.2299338579177856, "learning_rate": 4.7879423984855085e-06, "loss": 0.054, "step": 557 }, { "epoch": 0.21946902654867256, "grad_norm": 1.7808493375778198, "learning_rate": 4.786556084112224e-06, "loss": 0.0905, "step": 558 }, { "epoch": 0.2198623402163225, "grad_norm": 1.054694652557373, "learning_rate": 4.785165455052836e-06, "loss": 0.0561, "step": 559 }, { "epoch": 0.22025565388397247, "grad_norm": 2.180976629257202, "learning_rate": 4.783770513931433e-06, "loss": 0.0705, "step": 560 }, { "epoch": 0.22064896755162242, "grad_norm": 0.9467242956161499, "learning_rate": 4.782371263380242e-06, "loss": 0.0471, "step": 561 }, { "epoch": 0.22104228121927236, "grad_norm": 1.0072274208068848, "learning_rate": 4.780967706039622e-06, "loss": 0.0642, "step": 562 }, { "epoch": 0.22143559488692233, "grad_norm": 0.9987531304359436, "learning_rate": 4.779559844558056e-06, "loss": 0.0556, "step": 563 }, { "epoch": 0.22182890855457227, "grad_norm": 1.5135668516159058, "learning_rate": 4.778147681592152e-06, "loss": 0.051, "step": 564 }, { "epoch": 0.2222222222222222, "grad_norm": 1.6369942426681519, "learning_rate": 4.776731219806634e-06, "loss": 0.1089, "step": 565 }, { "epoch": 0.22261553588987218, "grad_norm": 1.8307068347930908, "learning_rate": 4.775310461874337e-06, "loss": 0.0555, "step": 566 }, { "epoch": 0.22300884955752212, "grad_norm": 1.2417643070220947, "learning_rate": 4.773885410476202e-06, "loss": 0.0356, "step": 567 }, { "epoch": 0.22340216322517206, "grad_norm": 0.8904944658279419, "learning_rate": 4.7724560683012735e-06, "loss": 0.0649, "step": 568 }, { "epoch": 0.22379547689282203, "grad_norm": 1.3853691816329956, "learning_rate": 4.771022438046693e-06, "loss": 0.0429, "step": 569 }, { "epoch": 0.22418879056047197, "grad_norm": 1.6937843561172485, "learning_rate": 4.769584522417691e-06, "loss": 0.0831, "step": 570 }, { "epoch": 0.22458210422812191, "grad_norm": 1.6160171031951904, "learning_rate": 4.768142324127586e-06, "loss": 0.0754, "step": 571 }, { "epoch": 0.22497541789577188, "grad_norm": 1.2548290491104126, "learning_rate": 4.766695845897778e-06, "loss": 0.073, "step": 572 }, { "epoch": 0.22536873156342183, "grad_norm": 2.645967483520508, "learning_rate": 4.765245090457744e-06, "loss": 0.1022, "step": 573 }, { "epoch": 0.22576204523107177, "grad_norm": 1.2090085744857788, "learning_rate": 4.763790060545028e-06, "loss": 0.0449, "step": 574 }, { "epoch": 0.22615535889872174, "grad_norm": 1.5384302139282227, "learning_rate": 4.762330758905246e-06, "loss": 0.0523, "step": 575 }, { "epoch": 0.22654867256637168, "grad_norm": 1.3840306997299194, "learning_rate": 4.760867188292068e-06, "loss": 0.0409, "step": 576 }, { "epoch": 0.22694198623402162, "grad_norm": 0.8169382214546204, "learning_rate": 4.7593993514672255e-06, "loss": 0.0526, "step": 577 }, { "epoch": 0.2273352999016716, "grad_norm": 0.6939831972122192, "learning_rate": 4.757927251200497e-06, "loss": 0.0497, "step": 578 }, { "epoch": 0.22772861356932153, "grad_norm": 2.4073455333709717, "learning_rate": 4.756450890269705e-06, "loss": 0.0703, "step": 579 }, { "epoch": 0.22812192723697147, "grad_norm": 1.4490169286727905, "learning_rate": 4.754970271460714e-06, "loss": 0.0429, "step": 580 }, { "epoch": 0.22851524090462144, "grad_norm": 0.8039276599884033, "learning_rate": 4.753485397567424e-06, "loss": 0.0525, "step": 581 }, { "epoch": 0.22890855457227138, "grad_norm": 0.9220805764198303, "learning_rate": 4.751996271391761e-06, "loss": 0.056, "step": 582 }, { "epoch": 0.22930186823992132, "grad_norm": 2.1960690021514893, "learning_rate": 4.750502895743677e-06, "loss": 0.0636, "step": 583 }, { "epoch": 0.2296951819075713, "grad_norm": 1.5164406299591064, "learning_rate": 4.749005273441143e-06, "loss": 0.0557, "step": 584 }, { "epoch": 0.23008849557522124, "grad_norm": 1.8541299104690552, "learning_rate": 4.747503407310142e-06, "loss": 0.0679, "step": 585 }, { "epoch": 0.23048180924287118, "grad_norm": 5.52957010269165, "learning_rate": 4.745997300184666e-06, "loss": 0.0805, "step": 586 }, { "epoch": 0.23087512291052115, "grad_norm": 1.318687915802002, "learning_rate": 4.744486954906709e-06, "loss": 0.0499, "step": 587 }, { "epoch": 0.2312684365781711, "grad_norm": 1.1736847162246704, "learning_rate": 4.742972374326262e-06, "loss": 0.0371, "step": 588 }, { "epoch": 0.23166175024582103, "grad_norm": 1.7209968566894531, "learning_rate": 4.74145356130131e-06, "loss": 0.0553, "step": 589 }, { "epoch": 0.232055063913471, "grad_norm": 1.392303228378296, "learning_rate": 4.739930518697823e-06, "loss": 0.0468, "step": 590 }, { "epoch": 0.23244837758112094, "grad_norm": 1.6198259592056274, "learning_rate": 4.738403249389752e-06, "loss": 0.0671, "step": 591 }, { "epoch": 0.23284169124877088, "grad_norm": 1.394888997077942, "learning_rate": 4.736871756259023e-06, "loss": 0.0851, "step": 592 }, { "epoch": 0.23323500491642085, "grad_norm": 1.2976491451263428, "learning_rate": 4.7353360421955345e-06, "loss": 0.0614, "step": 593 }, { "epoch": 0.2336283185840708, "grad_norm": 1.2485517263412476, "learning_rate": 4.733796110097148e-06, "loss": 0.0429, "step": 594 }, { "epoch": 0.23402163225172073, "grad_norm": 2.0384671688079834, "learning_rate": 4.732251962869685e-06, "loss": 0.0549, "step": 595 }, { "epoch": 0.2344149459193707, "grad_norm": 2.514827251434326, "learning_rate": 4.730703603426921e-06, "loss": 0.0934, "step": 596 }, { "epoch": 0.23480825958702065, "grad_norm": 1.5746873617172241, "learning_rate": 4.729151034690579e-06, "loss": 0.0797, "step": 597 }, { "epoch": 0.2352015732546706, "grad_norm": 1.458757996559143, "learning_rate": 4.727594259590326e-06, "loss": 0.07, "step": 598 }, { "epoch": 0.23559488692232056, "grad_norm": 1.9289155006408691, "learning_rate": 4.726033281063766e-06, "loss": 0.0447, "step": 599 }, { "epoch": 0.2359882005899705, "grad_norm": 2.641873359680176, "learning_rate": 4.724468102056434e-06, "loss": 0.1165, "step": 600 }, { "epoch": 0.23638151425762044, "grad_norm": 0.6296206116676331, "learning_rate": 4.722898725521793e-06, "loss": 0.0597, "step": 601 }, { "epoch": 0.2367748279252704, "grad_norm": 1.7393361330032349, "learning_rate": 4.721325154421224e-06, "loss": 0.0508, "step": 602 }, { "epoch": 0.23716814159292035, "grad_norm": 1.639045000076294, "learning_rate": 4.7197473917240255e-06, "loss": 0.0433, "step": 603 }, { "epoch": 0.2375614552605703, "grad_norm": 1.4411070346832275, "learning_rate": 4.718165440407404e-06, "loss": 0.0626, "step": 604 }, { "epoch": 0.23795476892822026, "grad_norm": 1.7141265869140625, "learning_rate": 4.716579303456471e-06, "loss": 0.0641, "step": 605 }, { "epoch": 0.2383480825958702, "grad_norm": 1.1153072118759155, "learning_rate": 4.714988983864235e-06, "loss": 0.0524, "step": 606 }, { "epoch": 0.23874139626352014, "grad_norm": 0.6169893741607666, "learning_rate": 4.713394484631598e-06, "loss": 0.0485, "step": 607 }, { "epoch": 0.23913470993117011, "grad_norm": 2.24593186378479, "learning_rate": 4.711795808767348e-06, "loss": 0.0767, "step": 608 }, { "epoch": 0.23952802359882006, "grad_norm": 0.8726077675819397, "learning_rate": 4.7101929592881545e-06, "loss": 0.0506, "step": 609 }, { "epoch": 0.23992133726647, "grad_norm": 1.0482176542282104, "learning_rate": 4.708585939218564e-06, "loss": 0.0374, "step": 610 }, { "epoch": 0.24031465093411997, "grad_norm": 1.031867265701294, "learning_rate": 4.7069747515909905e-06, "loss": 0.0513, "step": 611 }, { "epoch": 0.2407079646017699, "grad_norm": 1.548361897468567, "learning_rate": 4.7053593994457135e-06, "loss": 0.0524, "step": 612 }, { "epoch": 0.24110127826941985, "grad_norm": 2.367420196533203, "learning_rate": 4.70373988583087e-06, "loss": 0.0915, "step": 613 }, { "epoch": 0.24149459193706982, "grad_norm": 1.440256953239441, "learning_rate": 4.7021162138024524e-06, "loss": 0.0829, "step": 614 }, { "epoch": 0.24188790560471976, "grad_norm": 1.6830074787139893, "learning_rate": 4.700488386424294e-06, "loss": 0.0706, "step": 615 }, { "epoch": 0.2422812192723697, "grad_norm": 2.811821699142456, "learning_rate": 4.698856406768076e-06, "loss": 0.0531, "step": 616 }, { "epoch": 0.24267453294001967, "grad_norm": 2.031094551086426, "learning_rate": 4.697220277913311e-06, "loss": 0.0751, "step": 617 }, { "epoch": 0.2430678466076696, "grad_norm": 1.9269078969955444, "learning_rate": 4.695580002947341e-06, "loss": 0.0624, "step": 618 }, { "epoch": 0.24346116027531955, "grad_norm": 1.3828526735305786, "learning_rate": 4.6939355849653325e-06, "loss": 0.0776, "step": 619 }, { "epoch": 0.24385447394296952, "grad_norm": 1.0781844854354858, "learning_rate": 4.69228702707027e-06, "loss": 0.0477, "step": 620 }, { "epoch": 0.24424778761061947, "grad_norm": 1.0195046663284302, "learning_rate": 4.69063433237295e-06, "loss": 0.06, "step": 621 }, { "epoch": 0.2446411012782694, "grad_norm": 0.6686704158782959, "learning_rate": 4.688977503991975e-06, "loss": 0.0713, "step": 622 }, { "epoch": 0.24503441494591938, "grad_norm": 1.7740367650985718, "learning_rate": 4.687316545053746e-06, "loss": 0.092, "step": 623 }, { "epoch": 0.24542772861356932, "grad_norm": 1.1935254335403442, "learning_rate": 4.68565145869246e-06, "loss": 0.0697, "step": 624 }, { "epoch": 0.24582104228121926, "grad_norm": 0.7092412710189819, "learning_rate": 4.683982248050103e-06, "loss": 0.0647, "step": 625 }, { "epoch": 0.24621435594886923, "grad_norm": 2.2962708473205566, "learning_rate": 4.6823089162764425e-06, "loss": 0.07, "step": 626 }, { "epoch": 0.24660766961651917, "grad_norm": 1.1462363004684448, "learning_rate": 4.6806314665290205e-06, "loss": 0.0519, "step": 627 }, { "epoch": 0.2470009832841691, "grad_norm": 2.2198500633239746, "learning_rate": 4.678949901973154e-06, "loss": 0.0411, "step": 628 }, { "epoch": 0.24739429695181908, "grad_norm": 0.703561007976532, "learning_rate": 4.677264225781921e-06, "loss": 0.0505, "step": 629 }, { "epoch": 0.24778761061946902, "grad_norm": 1.4070128202438354, "learning_rate": 4.6755744411361585e-06, "loss": 0.0659, "step": 630 }, { "epoch": 0.24818092428711896, "grad_norm": 0.9832798838615417, "learning_rate": 4.6738805512244575e-06, "loss": 0.0917, "step": 631 }, { "epoch": 0.24857423795476893, "grad_norm": 0.9056950807571411, "learning_rate": 4.672182559243155e-06, "loss": 0.0484, "step": 632 }, { "epoch": 0.24896755162241888, "grad_norm": 2.0713984966278076, "learning_rate": 4.670480468396327e-06, "loss": 0.0729, "step": 633 }, { "epoch": 0.24936086529006882, "grad_norm": 0.9963469505310059, "learning_rate": 4.668774281895786e-06, "loss": 0.0507, "step": 634 }, { "epoch": 0.2497541789577188, "grad_norm": 0.9695498943328857, "learning_rate": 4.667064002961073e-06, "loss": 0.0538, "step": 635 }, { "epoch": 0.25014749262536873, "grad_norm": 1.3090274333953857, "learning_rate": 4.66534963481945e-06, "loss": 0.0931, "step": 636 }, { "epoch": 0.25054080629301867, "grad_norm": 1.2280491590499878, "learning_rate": 4.663631180705894e-06, "loss": 0.0488, "step": 637 }, { "epoch": 0.2509341199606686, "grad_norm": 1.050603985786438, "learning_rate": 4.661908643863096e-06, "loss": 0.0723, "step": 638 }, { "epoch": 0.2513274336283186, "grad_norm": 1.2820688486099243, "learning_rate": 4.66018202754145e-06, "loss": 0.0854, "step": 639 }, { "epoch": 0.25172074729596855, "grad_norm": 0.9909592866897583, "learning_rate": 4.658451334999043e-06, "loss": 0.0613, "step": 640 }, { "epoch": 0.2521140609636185, "grad_norm": 0.7117825746536255, "learning_rate": 4.656716569501661e-06, "loss": 0.0249, "step": 641 }, { "epoch": 0.25250737463126843, "grad_norm": 1.803819179534912, "learning_rate": 4.654977734322772e-06, "loss": 0.0744, "step": 642 }, { "epoch": 0.2529006882989184, "grad_norm": 1.2123903036117554, "learning_rate": 4.653234832743521e-06, "loss": 0.0893, "step": 643 }, { "epoch": 0.2532940019665683, "grad_norm": 1.3053680658340454, "learning_rate": 4.651487868052731e-06, "loss": 0.0794, "step": 644 }, { "epoch": 0.2536873156342183, "grad_norm": 1.5112253427505493, "learning_rate": 4.64973684354689e-06, "loss": 0.1139, "step": 645 }, { "epoch": 0.25408062930186825, "grad_norm": 0.4444582164287567, "learning_rate": 4.647981762530145e-06, "loss": 0.031, "step": 646 }, { "epoch": 0.2544739429695182, "grad_norm": 0.863317608833313, "learning_rate": 4.6462226283143e-06, "loss": 0.0336, "step": 647 }, { "epoch": 0.25486725663716814, "grad_norm": 2.007761001586914, "learning_rate": 4.644459444218807e-06, "loss": 0.0531, "step": 648 }, { "epoch": 0.2552605703048181, "grad_norm": 2.1189866065979004, "learning_rate": 4.642692213570759e-06, "loss": 0.0906, "step": 649 }, { "epoch": 0.255653883972468, "grad_norm": 0.7463569045066833, "learning_rate": 4.640920939704885e-06, "loss": 0.0449, "step": 650 }, { "epoch": 0.256047197640118, "grad_norm": 2.031602144241333, "learning_rate": 4.639145625963544e-06, "loss": 0.0673, "step": 651 }, { "epoch": 0.25644051130776796, "grad_norm": 2.0455472469329834, "learning_rate": 4.637366275696718e-06, "loss": 0.0495, "step": 652 }, { "epoch": 0.2568338249754179, "grad_norm": 1.2602909803390503, "learning_rate": 4.635582892262006e-06, "loss": 0.0442, "step": 653 }, { "epoch": 0.25722713864306784, "grad_norm": 1.3121466636657715, "learning_rate": 4.633795479024616e-06, "loss": 0.0404, "step": 654 }, { "epoch": 0.2576204523107178, "grad_norm": 1.028448224067688, "learning_rate": 4.632004039357364e-06, "loss": 0.0497, "step": 655 }, { "epoch": 0.2580137659783677, "grad_norm": 0.9586936235427856, "learning_rate": 4.630208576640659e-06, "loss": 0.0499, "step": 656 }, { "epoch": 0.2584070796460177, "grad_norm": 1.3646454811096191, "learning_rate": 4.628409094262504e-06, "loss": 0.0383, "step": 657 }, { "epoch": 0.25880039331366766, "grad_norm": 1.6489843130111694, "learning_rate": 4.6266055956184865e-06, "loss": 0.0458, "step": 658 }, { "epoch": 0.2591937069813176, "grad_norm": 1.8696314096450806, "learning_rate": 4.624798084111773e-06, "loss": 0.0783, "step": 659 }, { "epoch": 0.25958702064896755, "grad_norm": 1.5261452198028564, "learning_rate": 4.622986563153104e-06, "loss": 0.0465, "step": 660 }, { "epoch": 0.2599803343166175, "grad_norm": 1.8203606605529785, "learning_rate": 4.621171036160781e-06, "loss": 0.0767, "step": 661 }, { "epoch": 0.26037364798426743, "grad_norm": 1.3250322341918945, "learning_rate": 4.6193515065606675e-06, "loss": 0.0607, "step": 662 }, { "epoch": 0.26076696165191743, "grad_norm": 1.298017978668213, "learning_rate": 4.617527977786182e-06, "loss": 0.0619, "step": 663 }, { "epoch": 0.26116027531956737, "grad_norm": 1.0446304082870483, "learning_rate": 4.615700453278285e-06, "loss": 0.0268, "step": 664 }, { "epoch": 0.2615535889872173, "grad_norm": 1.0812922716140747, "learning_rate": 4.61386893648548e-06, "loss": 0.0519, "step": 665 }, { "epoch": 0.26194690265486725, "grad_norm": 1.8242236375808716, "learning_rate": 4.612033430863804e-06, "loss": 0.0565, "step": 666 }, { "epoch": 0.2623402163225172, "grad_norm": 1.567988634109497, "learning_rate": 4.610193939876818e-06, "loss": 0.0476, "step": 667 }, { "epoch": 0.26273352999016714, "grad_norm": 3.7344436645507812, "learning_rate": 4.608350466995606e-06, "loss": 0.0519, "step": 668 }, { "epoch": 0.26312684365781713, "grad_norm": 3.131584882736206, "learning_rate": 4.606503015698765e-06, "loss": 0.0696, "step": 669 }, { "epoch": 0.2635201573254671, "grad_norm": 1.2186100482940674, "learning_rate": 4.6046515894723985e-06, "loss": 0.0596, "step": 670 }, { "epoch": 0.263913470993117, "grad_norm": 0.8804354667663574, "learning_rate": 4.602796191810113e-06, "loss": 0.0465, "step": 671 }, { "epoch": 0.26430678466076696, "grad_norm": 1.961540937423706, "learning_rate": 4.600936826213004e-06, "loss": 0.0756, "step": 672 }, { "epoch": 0.2647000983284169, "grad_norm": 0.739213764667511, "learning_rate": 4.59907349618966e-06, "loss": 0.0475, "step": 673 }, { "epoch": 0.26509341199606684, "grad_norm": 0.8394540548324585, "learning_rate": 4.597206205256147e-06, "loss": 0.0538, "step": 674 }, { "epoch": 0.26548672566371684, "grad_norm": 1.5452135801315308, "learning_rate": 4.595334956936007e-06, "loss": 0.0664, "step": 675 }, { "epoch": 0.2658800393313668, "grad_norm": 1.613324522972107, "learning_rate": 4.593459754760248e-06, "loss": 0.0673, "step": 676 }, { "epoch": 0.2662733529990167, "grad_norm": 1.4427350759506226, "learning_rate": 4.591580602267338e-06, "loss": 0.0509, "step": 677 }, { "epoch": 0.26666666666666666, "grad_norm": 1.7156988382339478, "learning_rate": 4.589697503003203e-06, "loss": 0.0601, "step": 678 }, { "epoch": 0.2670599803343166, "grad_norm": 1.4072953462600708, "learning_rate": 4.587810460521213e-06, "loss": 0.0678, "step": 679 }, { "epoch": 0.26745329400196655, "grad_norm": 0.7101967930793762, "learning_rate": 4.585919478382178e-06, "loss": 0.0522, "step": 680 }, { "epoch": 0.26784660766961654, "grad_norm": 0.5038359761238098, "learning_rate": 4.584024560154348e-06, "loss": 0.0408, "step": 681 }, { "epoch": 0.2682399213372665, "grad_norm": 1.1651291847229004, "learning_rate": 4.582125709413392e-06, "loss": 0.0719, "step": 682 }, { "epoch": 0.2686332350049164, "grad_norm": 1.0390863418579102, "learning_rate": 4.580222929742407e-06, "loss": 0.0402, "step": 683 }, { "epoch": 0.26902654867256637, "grad_norm": 1.8808722496032715, "learning_rate": 4.5783162247318986e-06, "loss": 0.0612, "step": 684 }, { "epoch": 0.2694198623402163, "grad_norm": 1.4362890720367432, "learning_rate": 4.576405597979782e-06, "loss": 0.0367, "step": 685 }, { "epoch": 0.26981317600786625, "grad_norm": 0.9547756910324097, "learning_rate": 4.5744910530913725e-06, "loss": 0.0799, "step": 686 }, { "epoch": 0.27020648967551625, "grad_norm": 1.8914170265197754, "learning_rate": 4.572572593679379e-06, "loss": 0.048, "step": 687 }, { "epoch": 0.2705998033431662, "grad_norm": 1.460436224937439, "learning_rate": 4.5706502233638935e-06, "loss": 0.0633, "step": 688 }, { "epoch": 0.27099311701081613, "grad_norm": 1.7330501079559326, "learning_rate": 4.568723945772394e-06, "loss": 0.0332, "step": 689 }, { "epoch": 0.2713864306784661, "grad_norm": 1.1326316595077515, "learning_rate": 4.5667937645397276e-06, "loss": 0.0555, "step": 690 }, { "epoch": 0.271779744346116, "grad_norm": 0.8753216862678528, "learning_rate": 4.564859683308107e-06, "loss": 0.0416, "step": 691 }, { "epoch": 0.27217305801376596, "grad_norm": 0.8659785389900208, "learning_rate": 4.562921705727106e-06, "loss": 0.0551, "step": 692 }, { "epoch": 0.27256637168141595, "grad_norm": 0.502169668674469, "learning_rate": 4.5609798354536495e-06, "loss": 0.0284, "step": 693 }, { "epoch": 0.2729596853490659, "grad_norm": 2.1083321571350098, "learning_rate": 4.559034076152009e-06, "loss": 0.0779, "step": 694 }, { "epoch": 0.27335299901671584, "grad_norm": 1.5410869121551514, "learning_rate": 4.557084431493793e-06, "loss": 0.0788, "step": 695 }, { "epoch": 0.2737463126843658, "grad_norm": 1.707189679145813, "learning_rate": 4.555130905157943e-06, "loss": 0.0921, "step": 696 }, { "epoch": 0.2741396263520157, "grad_norm": 1.2371059656143188, "learning_rate": 4.553173500830724e-06, "loss": 0.0562, "step": 697 }, { "epoch": 0.27453294001966566, "grad_norm": 1.6234147548675537, "learning_rate": 4.55121222220572e-06, "loss": 0.0471, "step": 698 }, { "epoch": 0.27492625368731566, "grad_norm": 1.2629426717758179, "learning_rate": 4.549247072983825e-06, "loss": 0.0795, "step": 699 }, { "epoch": 0.2753195673549656, "grad_norm": 1.7955608367919922, "learning_rate": 4.5472780568732356e-06, "loss": 0.0468, "step": 700 }, { "epoch": 0.27571288102261554, "grad_norm": 7.252640724182129, "learning_rate": 4.545305177589448e-06, "loss": 0.0699, "step": 701 }, { "epoch": 0.2761061946902655, "grad_norm": 1.8121711015701294, "learning_rate": 4.5433284388552435e-06, "loss": 0.0718, "step": 702 }, { "epoch": 0.2764995083579154, "grad_norm": 0.901907742023468, "learning_rate": 4.541347844400692e-06, "loss": 0.0255, "step": 703 }, { "epoch": 0.27689282202556537, "grad_norm": 0.7126281261444092, "learning_rate": 4.539363397963134e-06, "loss": 0.0509, "step": 704 }, { "epoch": 0.27728613569321536, "grad_norm": 2.012707233428955, "learning_rate": 4.537375103287183e-06, "loss": 0.0904, "step": 705 }, { "epoch": 0.2776794493608653, "grad_norm": 1.7197178602218628, "learning_rate": 4.53538296412471e-06, "loss": 0.0617, "step": 706 }, { "epoch": 0.27807276302851525, "grad_norm": 2.5714545249938965, "learning_rate": 4.533386984234841e-06, "loss": 0.0825, "step": 707 }, { "epoch": 0.2784660766961652, "grad_norm": 1.3491824865341187, "learning_rate": 4.5313871673839525e-06, "loss": 0.0545, "step": 708 }, { "epoch": 0.27885939036381513, "grad_norm": 1.0081161260604858, "learning_rate": 4.52938351734566e-06, "loss": 0.046, "step": 709 }, { "epoch": 0.27925270403146507, "grad_norm": 1.3097039461135864, "learning_rate": 4.52737603790081e-06, "loss": 0.0678, "step": 710 }, { "epoch": 0.27964601769911507, "grad_norm": 1.264832615852356, "learning_rate": 4.525364732837476e-06, "loss": 0.0408, "step": 711 }, { "epoch": 0.280039331366765, "grad_norm": 1.6724627017974854, "learning_rate": 4.523349605950953e-06, "loss": 0.0583, "step": 712 }, { "epoch": 0.28043264503441495, "grad_norm": 1.2600414752960205, "learning_rate": 4.521330661043744e-06, "loss": 0.0762, "step": 713 }, { "epoch": 0.2808259587020649, "grad_norm": 0.8454362750053406, "learning_rate": 4.519307901925558e-06, "loss": 0.0433, "step": 714 }, { "epoch": 0.28121927236971483, "grad_norm": 2.131969451904297, "learning_rate": 4.517281332413302e-06, "loss": 0.0738, "step": 715 }, { "epoch": 0.2816125860373648, "grad_norm": 2.226288080215454, "learning_rate": 4.515250956331072e-06, "loss": 0.0892, "step": 716 }, { "epoch": 0.2820058997050148, "grad_norm": 1.6737391948699951, "learning_rate": 4.513216777510149e-06, "loss": 0.0556, "step": 717 }, { "epoch": 0.2823992133726647, "grad_norm": 1.5575467348098755, "learning_rate": 4.511178799788987e-06, "loss": 0.0561, "step": 718 }, { "epoch": 0.28279252704031466, "grad_norm": 1.7405011653900146, "learning_rate": 4.50913702701321e-06, "loss": 0.0653, "step": 719 }, { "epoch": 0.2831858407079646, "grad_norm": 1.097738265991211, "learning_rate": 4.507091463035601e-06, "loss": 0.0772, "step": 720 }, { "epoch": 0.28357915437561454, "grad_norm": 0.8409376740455627, "learning_rate": 4.505042111716103e-06, "loss": 0.0645, "step": 721 }, { "epoch": 0.2839724680432645, "grad_norm": 1.1851140260696411, "learning_rate": 4.502988976921797e-06, "loss": 0.0462, "step": 722 }, { "epoch": 0.2843657817109145, "grad_norm": 1.7740516662597656, "learning_rate": 4.50093206252691e-06, "loss": 0.0717, "step": 723 }, { "epoch": 0.2847590953785644, "grad_norm": 2.491065263748169, "learning_rate": 4.498871372412798e-06, "loss": 0.0575, "step": 724 }, { "epoch": 0.28515240904621436, "grad_norm": 1.446291446685791, "learning_rate": 4.496806910467944e-06, "loss": 0.0566, "step": 725 }, { "epoch": 0.2855457227138643, "grad_norm": 1.2584576606750488, "learning_rate": 4.494738680587946e-06, "loss": 0.053, "step": 726 }, { "epoch": 0.28593903638151424, "grad_norm": 1.188159704208374, "learning_rate": 4.492666686675511e-06, "loss": 0.0627, "step": 727 }, { "epoch": 0.2863323500491642, "grad_norm": 1.2687791585922241, "learning_rate": 4.490590932640453e-06, "loss": 0.0676, "step": 728 }, { "epoch": 0.2867256637168142, "grad_norm": 1.7722615003585815, "learning_rate": 4.488511422399677e-06, "loss": 0.0548, "step": 729 }, { "epoch": 0.2871189773844641, "grad_norm": 3.2244741916656494, "learning_rate": 4.48642815987718e-06, "loss": 0.0763, "step": 730 }, { "epoch": 0.28751229105211407, "grad_norm": 1.1106655597686768, "learning_rate": 4.484341149004035e-06, "loss": 0.0862, "step": 731 }, { "epoch": 0.287905604719764, "grad_norm": 0.6258023381233215, "learning_rate": 4.482250393718392e-06, "loss": 0.0526, "step": 732 }, { "epoch": 0.28829891838741395, "grad_norm": 0.7904531955718994, "learning_rate": 4.480155897965463e-06, "loss": 0.0367, "step": 733 }, { "epoch": 0.2886922320550639, "grad_norm": 1.5454163551330566, "learning_rate": 4.47805766569752e-06, "loss": 0.0747, "step": 734 }, { "epoch": 0.2890855457227139, "grad_norm": 2.1076667308807373, "learning_rate": 4.475955700873888e-06, "loss": 0.0939, "step": 735 }, { "epoch": 0.28947885939036383, "grad_norm": 1.407893419265747, "learning_rate": 4.473850007460932e-06, "loss": 0.0524, "step": 736 }, { "epoch": 0.28987217305801377, "grad_norm": 1.957629680633545, "learning_rate": 4.471740589432053e-06, "loss": 0.0541, "step": 737 }, { "epoch": 0.2902654867256637, "grad_norm": 1.0253725051879883, "learning_rate": 4.469627450767682e-06, "loss": 0.0478, "step": 738 }, { "epoch": 0.29065880039331365, "grad_norm": 1.5762360095977783, "learning_rate": 4.46751059545527e-06, "loss": 0.0936, "step": 739 }, { "epoch": 0.2910521140609636, "grad_norm": 1.2460707426071167, "learning_rate": 4.465390027489279e-06, "loss": 0.0596, "step": 740 }, { "epoch": 0.2914454277286136, "grad_norm": 1.042962670326233, "learning_rate": 4.463265750871182e-06, "loss": 0.0615, "step": 741 }, { "epoch": 0.29183874139626353, "grad_norm": 1.554513692855835, "learning_rate": 4.461137769609445e-06, "loss": 0.0562, "step": 742 }, { "epoch": 0.2922320550639135, "grad_norm": 1.5099841356277466, "learning_rate": 4.459006087719527e-06, "loss": 0.0462, "step": 743 }, { "epoch": 0.2926253687315634, "grad_norm": 0.8272073864936829, "learning_rate": 4.45687070922387e-06, "loss": 0.0311, "step": 744 }, { "epoch": 0.29301868239921336, "grad_norm": 1.1962639093399048, "learning_rate": 4.4547316381518905e-06, "loss": 0.054, "step": 745 }, { "epoch": 0.2934119960668633, "grad_norm": 0.7265387773513794, "learning_rate": 4.4525888785399725e-06, "loss": 0.0322, "step": 746 }, { "epoch": 0.2938053097345133, "grad_norm": 2.045783042907715, "learning_rate": 4.450442434431463e-06, "loss": 0.0668, "step": 747 }, { "epoch": 0.29419862340216324, "grad_norm": 1.417593240737915, "learning_rate": 4.448292309876657e-06, "loss": 0.0499, "step": 748 }, { "epoch": 0.2945919370698132, "grad_norm": 1.4235261678695679, "learning_rate": 4.4461385089328e-06, "loss": 0.0904, "step": 749 }, { "epoch": 0.2949852507374631, "grad_norm": 1.050933837890625, "learning_rate": 4.44398103566407e-06, "loss": 0.05, "step": 750 }, { "epoch": 0.29537856440511306, "grad_norm": 1.3113094568252563, "learning_rate": 4.4418198941415756e-06, "loss": 0.0717, "step": 751 }, { "epoch": 0.295771878072763, "grad_norm": 1.1153532266616821, "learning_rate": 4.4396550884433495e-06, "loss": 0.0613, "step": 752 }, { "epoch": 0.296165191740413, "grad_norm": 1.6574000120162964, "learning_rate": 4.437486622654337e-06, "loss": 0.08, "step": 753 }, { "epoch": 0.29655850540806294, "grad_norm": 1.037023901939392, "learning_rate": 4.43531450086639e-06, "loss": 0.059, "step": 754 }, { "epoch": 0.2969518190757129, "grad_norm": 1.3382397890090942, "learning_rate": 4.433138727178259e-06, "loss": 0.0504, "step": 755 }, { "epoch": 0.2973451327433628, "grad_norm": 2.023531198501587, "learning_rate": 4.4309593056955865e-06, "loss": 0.0682, "step": 756 }, { "epoch": 0.29773844641101277, "grad_norm": 1.3962974548339844, "learning_rate": 4.4287762405308974e-06, "loss": 0.0678, "step": 757 }, { "epoch": 0.2981317600786627, "grad_norm": 0.6099796295166016, "learning_rate": 4.426589535803593e-06, "loss": 0.0496, "step": 758 }, { "epoch": 0.2985250737463127, "grad_norm": 1.6071325540542603, "learning_rate": 4.424399195639941e-06, "loss": 0.0519, "step": 759 }, { "epoch": 0.29891838741396265, "grad_norm": 1.116490125656128, "learning_rate": 4.422205224173071e-06, "loss": 0.0651, "step": 760 }, { "epoch": 0.2993117010816126, "grad_norm": 1.163526177406311, "learning_rate": 4.420007625542963e-06, "loss": 0.042, "step": 761 }, { "epoch": 0.29970501474926253, "grad_norm": 0.6789044737815857, "learning_rate": 4.417806403896442e-06, "loss": 0.0652, "step": 762 }, { "epoch": 0.3000983284169125, "grad_norm": 1.6137206554412842, "learning_rate": 4.41560156338717e-06, "loss": 0.073, "step": 763 }, { "epoch": 0.3004916420845624, "grad_norm": 1.9308634996414185, "learning_rate": 4.413393108175637e-06, "loss": 0.0805, "step": 764 }, { "epoch": 0.3008849557522124, "grad_norm": 1.6792504787445068, "learning_rate": 4.411181042429156e-06, "loss": 0.0471, "step": 765 }, { "epoch": 0.30127826941986235, "grad_norm": 1.1271363496780396, "learning_rate": 4.40896537032185e-06, "loss": 0.0378, "step": 766 }, { "epoch": 0.3016715830875123, "grad_norm": 1.0671911239624023, "learning_rate": 4.406746096034647e-06, "loss": 0.0548, "step": 767 }, { "epoch": 0.30206489675516224, "grad_norm": 1.2227768898010254, "learning_rate": 4.4045232237552756e-06, "loss": 0.0701, "step": 768 }, { "epoch": 0.3024582104228122, "grad_norm": 1.471924901008606, "learning_rate": 4.4022967576782525e-06, "loss": 0.0568, "step": 769 }, { "epoch": 0.3028515240904621, "grad_norm": 1.6219385862350464, "learning_rate": 4.400066702004874e-06, "loss": 0.05, "step": 770 }, { "epoch": 0.3032448377581121, "grad_norm": 1.4471542835235596, "learning_rate": 4.39783306094321e-06, "loss": 0.0685, "step": 771 }, { "epoch": 0.30363815142576206, "grad_norm": 1.525600552558899, "learning_rate": 4.395595838708099e-06, "loss": 0.0513, "step": 772 }, { "epoch": 0.304031465093412, "grad_norm": 1.3881157636642456, "learning_rate": 4.393355039521134e-06, "loss": 0.0812, "step": 773 }, { "epoch": 0.30442477876106194, "grad_norm": 1.1738461256027222, "learning_rate": 4.391110667610658e-06, "loss": 0.0595, "step": 774 }, { "epoch": 0.3048180924287119, "grad_norm": 1.1576417684555054, "learning_rate": 4.388862727211759e-06, "loss": 0.0541, "step": 775 }, { "epoch": 0.3052114060963618, "grad_norm": 1.283400058746338, "learning_rate": 4.386611222566254e-06, "loss": 0.0505, "step": 776 }, { "epoch": 0.3056047197640118, "grad_norm": 1.4386646747589111, "learning_rate": 4.384356157922688e-06, "loss": 0.0706, "step": 777 }, { "epoch": 0.30599803343166176, "grad_norm": 2.0160024166107178, "learning_rate": 4.382097537536322e-06, "loss": 0.0596, "step": 778 }, { "epoch": 0.3063913470993117, "grad_norm": 1.3747514486312866, "learning_rate": 4.379835365669132e-06, "loss": 0.0561, "step": 779 }, { "epoch": 0.30678466076696165, "grad_norm": 1.5668084621429443, "learning_rate": 4.377569646589789e-06, "loss": 0.0522, "step": 780 }, { "epoch": 0.3071779744346116, "grad_norm": 1.6369160413742065, "learning_rate": 4.375300384573659e-06, "loss": 0.05, "step": 781 }, { "epoch": 0.30757128810226153, "grad_norm": 1.2633172273635864, "learning_rate": 4.373027583902796e-06, "loss": 0.0447, "step": 782 }, { "epoch": 0.30796460176991153, "grad_norm": 1.3119875192642212, "learning_rate": 4.370751248865929e-06, "loss": 0.062, "step": 783 }, { "epoch": 0.30835791543756147, "grad_norm": 2.1404073238372803, "learning_rate": 4.368471383758459e-06, "loss": 0.0446, "step": 784 }, { "epoch": 0.3087512291052114, "grad_norm": 0.7563901543617249, "learning_rate": 4.366187992882444e-06, "loss": 0.0429, "step": 785 }, { "epoch": 0.30914454277286135, "grad_norm": 0.7048685550689697, "learning_rate": 4.3639010805466e-06, "loss": 0.0299, "step": 786 }, { "epoch": 0.3095378564405113, "grad_norm": 0.7395270466804504, "learning_rate": 4.361610651066283e-06, "loss": 0.0334, "step": 787 }, { "epoch": 0.30993117010816124, "grad_norm": 1.2910830974578857, "learning_rate": 4.35931670876349e-06, "loss": 0.0666, "step": 788 }, { "epoch": 0.31032448377581123, "grad_norm": 3.32393217086792, "learning_rate": 4.357019257966844e-06, "loss": 0.0773, "step": 789 }, { "epoch": 0.3107177974434612, "grad_norm": 1.2098692655563354, "learning_rate": 4.354718303011588e-06, "loss": 0.0524, "step": 790 }, { "epoch": 0.3111111111111111, "grad_norm": 1.650527834892273, "learning_rate": 4.352413848239579e-06, "loss": 0.0518, "step": 791 }, { "epoch": 0.31150442477876106, "grad_norm": 0.8377374410629272, "learning_rate": 4.35010589799928e-06, "loss": 0.0482, "step": 792 }, { "epoch": 0.311897738446411, "grad_norm": 1.225882649421692, "learning_rate": 4.347794456645744e-06, "loss": 0.0405, "step": 793 }, { "epoch": 0.31229105211406094, "grad_norm": 2.0014147758483887, "learning_rate": 4.345479528540618e-06, "loss": 0.053, "step": 794 }, { "epoch": 0.31268436578171094, "grad_norm": 1.2061558961868286, "learning_rate": 4.343161118052123e-06, "loss": 0.045, "step": 795 }, { "epoch": 0.3130776794493609, "grad_norm": 0.8555061221122742, "learning_rate": 4.340839229555056e-06, "loss": 0.0673, "step": 796 }, { "epoch": 0.3134709931170108, "grad_norm": 1.4630858898162842, "learning_rate": 4.338513867430773e-06, "loss": 0.0414, "step": 797 }, { "epoch": 0.31386430678466076, "grad_norm": 1.101480484008789, "learning_rate": 4.336185036067187e-06, "loss": 0.0383, "step": 798 }, { "epoch": 0.3142576204523107, "grad_norm": 0.6861633658409119, "learning_rate": 4.3338527398587575e-06, "loss": 0.0393, "step": 799 }, { "epoch": 0.31465093411996065, "grad_norm": 1.0716795921325684, "learning_rate": 4.33151698320648e-06, "loss": 0.0407, "step": 800 }, { "epoch": 0.31504424778761064, "grad_norm": 1.0103176832199097, "learning_rate": 4.329177770517881e-06, "loss": 0.0467, "step": 801 }, { "epoch": 0.3154375614552606, "grad_norm": 1.1415047645568848, "learning_rate": 4.32683510620701e-06, "loss": 0.0518, "step": 802 }, { "epoch": 0.3158308751229105, "grad_norm": 1.0959949493408203, "learning_rate": 4.324488994694427e-06, "loss": 0.0447, "step": 803 }, { "epoch": 0.31622418879056047, "grad_norm": 3.7971184253692627, "learning_rate": 4.322139440407198e-06, "loss": 0.1218, "step": 804 }, { "epoch": 0.3166175024582104, "grad_norm": 1.0682744979858398, "learning_rate": 4.319786447778887e-06, "loss": 0.0271, "step": 805 }, { "epoch": 0.31701081612586035, "grad_norm": 0.7397903800010681, "learning_rate": 4.317430021249543e-06, "loss": 0.0313, "step": 806 }, { "epoch": 0.31740412979351035, "grad_norm": 1.9803013801574707, "learning_rate": 4.315070165265695e-06, "loss": 0.0832, "step": 807 }, { "epoch": 0.3177974434611603, "grad_norm": 0.9591525793075562, "learning_rate": 4.312706884280349e-06, "loss": 0.0611, "step": 808 }, { "epoch": 0.31819075712881023, "grad_norm": 0.7980911731719971, "learning_rate": 4.310340182752965e-06, "loss": 0.0163, "step": 809 }, { "epoch": 0.3185840707964602, "grad_norm": 0.8986029028892517, "learning_rate": 4.307970065149464e-06, "loss": 0.0382, "step": 810 }, { "epoch": 0.3189773844641101, "grad_norm": 0.9218258857727051, "learning_rate": 4.305596535942211e-06, "loss": 0.0362, "step": 811 }, { "epoch": 0.31937069813176006, "grad_norm": 1.9387575387954712, "learning_rate": 4.303219599610009e-06, "loss": 0.045, "step": 812 }, { "epoch": 0.31976401179941005, "grad_norm": 2.1032979488372803, "learning_rate": 4.300839260638089e-06, "loss": 0.0583, "step": 813 }, { "epoch": 0.32015732546706, "grad_norm": 0.8777870535850525, "learning_rate": 4.298455523518102e-06, "loss": 0.0611, "step": 814 }, { "epoch": 0.32055063913470994, "grad_norm": 1.7572643756866455, "learning_rate": 4.296068392748116e-06, "loss": 0.053, "step": 815 }, { "epoch": 0.3209439528023599, "grad_norm": 1.3729215860366821, "learning_rate": 4.293677872832599e-06, "loss": 0.1014, "step": 816 }, { "epoch": 0.3213372664700098, "grad_norm": 2.968247175216675, "learning_rate": 4.291283968282413e-06, "loss": 0.0422, "step": 817 }, { "epoch": 0.32173058013765976, "grad_norm": 1.2367733716964722, "learning_rate": 4.288886683614809e-06, "loss": 0.0598, "step": 818 }, { "epoch": 0.32212389380530976, "grad_norm": 2.149622678756714, "learning_rate": 4.286486023353417e-06, "loss": 0.0834, "step": 819 }, { "epoch": 0.3225172074729597, "grad_norm": 2.1104652881622314, "learning_rate": 4.284081992028235e-06, "loss": 0.0764, "step": 820 }, { "epoch": 0.32291052114060964, "grad_norm": 1.5311528444290161, "learning_rate": 4.281674594175621e-06, "loss": 0.0586, "step": 821 }, { "epoch": 0.3233038348082596, "grad_norm": 1.432000756263733, "learning_rate": 4.2792638343382894e-06, "loss": 0.0787, "step": 822 }, { "epoch": 0.3236971484759095, "grad_norm": 1.2007765769958496, "learning_rate": 4.276849717065295e-06, "loss": 0.0462, "step": 823 }, { "epoch": 0.32409046214355947, "grad_norm": 1.0811890363693237, "learning_rate": 4.2744322469120296e-06, "loss": 0.0624, "step": 824 }, { "epoch": 0.32448377581120946, "grad_norm": 1.440487265586853, "learning_rate": 4.272011428440212e-06, "loss": 0.0557, "step": 825 }, { "epoch": 0.3248770894788594, "grad_norm": 2.677267551422119, "learning_rate": 4.269587266217878e-06, "loss": 0.0804, "step": 826 }, { "epoch": 0.32527040314650935, "grad_norm": 1.07245671749115, "learning_rate": 4.2671597648193745e-06, "loss": 0.0542, "step": 827 }, { "epoch": 0.3256637168141593, "grad_norm": 1.0649880170822144, "learning_rate": 4.264728928825347e-06, "loss": 0.0573, "step": 828 }, { "epoch": 0.32605703048180923, "grad_norm": 1.880872130393982, "learning_rate": 4.262294762822738e-06, "loss": 0.0892, "step": 829 }, { "epoch": 0.32645034414945917, "grad_norm": 1.7007864713668823, "learning_rate": 4.259857271404767e-06, "loss": 0.097, "step": 830 }, { "epoch": 0.32684365781710917, "grad_norm": 0.9796857237815857, "learning_rate": 4.257416459170935e-06, "loss": 0.0372, "step": 831 }, { "epoch": 0.3272369714847591, "grad_norm": 1.3802924156188965, "learning_rate": 4.254972330727004e-06, "loss": 0.0388, "step": 832 }, { "epoch": 0.32763028515240905, "grad_norm": 1.8189585208892822, "learning_rate": 4.252524890685e-06, "loss": 0.0504, "step": 833 }, { "epoch": 0.328023598820059, "grad_norm": 1.2440087795257568, "learning_rate": 4.250074143663189e-06, "loss": 0.055, "step": 834 }, { "epoch": 0.32841691248770893, "grad_norm": 1.26856529712677, "learning_rate": 4.247620094286085e-06, "loss": 0.0528, "step": 835 }, { "epoch": 0.3288102261553589, "grad_norm": 1.8983615636825562, "learning_rate": 4.2451627471844305e-06, "loss": 0.0527, "step": 836 }, { "epoch": 0.3292035398230089, "grad_norm": 0.9810947179794312, "learning_rate": 4.24270210699519e-06, "loss": 0.04, "step": 837 }, { "epoch": 0.3295968534906588, "grad_norm": 1.2199605703353882, "learning_rate": 4.240238178361543e-06, "loss": 0.0443, "step": 838 }, { "epoch": 0.32999016715830876, "grad_norm": 0.5256842374801636, "learning_rate": 4.237770965932875e-06, "loss": 0.0267, "step": 839 }, { "epoch": 0.3303834808259587, "grad_norm": 1.456432819366455, "learning_rate": 4.235300474364766e-06, "loss": 0.0623, "step": 840 }, { "epoch": 0.33077679449360864, "grad_norm": 1.4406569004058838, "learning_rate": 4.232826708318985e-06, "loss": 0.0453, "step": 841 }, { "epoch": 0.3311701081612586, "grad_norm": 1.9302328824996948, "learning_rate": 4.230349672463481e-06, "loss": 0.0655, "step": 842 }, { "epoch": 0.3315634218289086, "grad_norm": 0.7055051922798157, "learning_rate": 4.22786937147237e-06, "loss": 0.0405, "step": 843 }, { "epoch": 0.3319567354965585, "grad_norm": 2.823591947555542, "learning_rate": 4.2253858100259304e-06, "loss": 0.1111, "step": 844 }, { "epoch": 0.33235004916420846, "grad_norm": 1.458694577217102, "learning_rate": 4.222898992810596e-06, "loss": 0.0688, "step": 845 }, { "epoch": 0.3327433628318584, "grad_norm": 1.3440479040145874, "learning_rate": 4.220408924518939e-06, "loss": 0.0654, "step": 846 }, { "epoch": 0.33313667649950834, "grad_norm": 1.2197304964065552, "learning_rate": 4.217915609849671e-06, "loss": 0.0269, "step": 847 }, { "epoch": 0.3335299901671583, "grad_norm": 1.0218877792358398, "learning_rate": 4.215419053507626e-06, "loss": 0.0525, "step": 848 }, { "epoch": 0.3339233038348083, "grad_norm": 1.4025174379348755, "learning_rate": 4.212919260203757e-06, "loss": 0.0947, "step": 849 }, { "epoch": 0.3343166175024582, "grad_norm": 0.7898326516151428, "learning_rate": 4.210416234655125e-06, "loss": 0.0337, "step": 850 }, { "epoch": 0.33470993117010817, "grad_norm": 1.196540355682373, "learning_rate": 4.207909981584889e-06, "loss": 0.0578, "step": 851 }, { "epoch": 0.3351032448377581, "grad_norm": 0.926796555519104, "learning_rate": 4.2054005057223e-06, "loss": 0.0672, "step": 852 }, { "epoch": 0.33549655850540805, "grad_norm": 1.2736568450927734, "learning_rate": 4.202887811802687e-06, "loss": 0.0484, "step": 853 }, { "epoch": 0.335889872173058, "grad_norm": 1.2440752983093262, "learning_rate": 4.200371904567457e-06, "loss": 0.0478, "step": 854 }, { "epoch": 0.336283185840708, "grad_norm": 1.4759784936904907, "learning_rate": 4.197852788764075e-06, "loss": 0.0458, "step": 855 }, { "epoch": 0.33667649950835793, "grad_norm": 0.7424830794334412, "learning_rate": 4.195330469146063e-06, "loss": 0.0327, "step": 856 }, { "epoch": 0.33706981317600787, "grad_norm": 1.2250968217849731, "learning_rate": 4.1928049504729886e-06, "loss": 0.0637, "step": 857 }, { "epoch": 0.3374631268436578, "grad_norm": 1.2263579368591309, "learning_rate": 4.1902762375104555e-06, "loss": 0.0733, "step": 858 }, { "epoch": 0.33785644051130775, "grad_norm": 0.5867930054664612, "learning_rate": 4.187744335030095e-06, "loss": 0.055, "step": 859 }, { "epoch": 0.3382497541789577, "grad_norm": 2.040759563446045, "learning_rate": 4.185209247809557e-06, "loss": 0.0664, "step": 860 }, { "epoch": 0.3386430678466077, "grad_norm": 2.09037709236145, "learning_rate": 4.182670980632501e-06, "loss": 0.0728, "step": 861 }, { "epoch": 0.33903638151425763, "grad_norm": 3.822634220123291, "learning_rate": 4.180129538288587e-06, "loss": 0.0912, "step": 862 }, { "epoch": 0.3394296951819076, "grad_norm": 1.7590773105621338, "learning_rate": 4.177584925573466e-06, "loss": 0.0623, "step": 863 }, { "epoch": 0.3398230088495575, "grad_norm": 1.2151440382003784, "learning_rate": 4.175037147288772e-06, "loss": 0.044, "step": 864 }, { "epoch": 0.34021632251720746, "grad_norm": 0.765602171421051, "learning_rate": 4.172486208242113e-06, "loss": 0.0811, "step": 865 }, { "epoch": 0.3406096361848574, "grad_norm": 0.9690750241279602, "learning_rate": 4.169932113247059e-06, "loss": 0.0587, "step": 866 }, { "epoch": 0.3410029498525074, "grad_norm": 0.6641612648963928, "learning_rate": 4.167374867123138e-06, "loss": 0.0336, "step": 867 }, { "epoch": 0.34139626352015734, "grad_norm": 0.9194386601448059, "learning_rate": 4.164814474695823e-06, "loss": 0.0566, "step": 868 }, { "epoch": 0.3417895771878073, "grad_norm": 2.2128334045410156, "learning_rate": 4.162250940796523e-06, "loss": 0.074, "step": 869 }, { "epoch": 0.3421828908554572, "grad_norm": 1.8464068174362183, "learning_rate": 4.159684270262576e-06, "loss": 0.0736, "step": 870 }, { "epoch": 0.34257620452310716, "grad_norm": 0.9694234728813171, "learning_rate": 4.157114467937239e-06, "loss": 0.0413, "step": 871 }, { "epoch": 0.3429695181907571, "grad_norm": 1.4554444551467896, "learning_rate": 4.154541538669677e-06, "loss": 0.0468, "step": 872 }, { "epoch": 0.3433628318584071, "grad_norm": 1.3524583578109741, "learning_rate": 4.151965487314959e-06, "loss": 0.049, "step": 873 }, { "epoch": 0.34375614552605704, "grad_norm": 1.6620694398880005, "learning_rate": 4.1493863187340415e-06, "loss": 0.0686, "step": 874 }, { "epoch": 0.344149459193707, "grad_norm": 0.8126603364944458, "learning_rate": 4.146804037793763e-06, "loss": 0.0335, "step": 875 }, { "epoch": 0.3445427728613569, "grad_norm": 1.852401852607727, "learning_rate": 4.144218649366839e-06, "loss": 0.0488, "step": 876 }, { "epoch": 0.34493608652900687, "grad_norm": 1.165703296661377, "learning_rate": 4.141630158331845e-06, "loss": 0.0464, "step": 877 }, { "epoch": 0.3453294001966568, "grad_norm": 2.391685962677002, "learning_rate": 4.139038569573213e-06, "loss": 0.0829, "step": 878 }, { "epoch": 0.3457227138643068, "grad_norm": 1.832273006439209, "learning_rate": 4.1364438879812194e-06, "loss": 0.0406, "step": 879 }, { "epoch": 0.34611602753195675, "grad_norm": 1.1527806520462036, "learning_rate": 4.1338461184519776e-06, "loss": 0.0682, "step": 880 }, { "epoch": 0.3465093411996067, "grad_norm": 1.8680974245071411, "learning_rate": 4.131245265887426e-06, "loss": 0.0847, "step": 881 }, { "epoch": 0.34690265486725663, "grad_norm": 1.7685651779174805, "learning_rate": 4.1286413351953235e-06, "loss": 0.0461, "step": 882 }, { "epoch": 0.3472959685349066, "grad_norm": 2.0602667331695557, "learning_rate": 4.126034331289235e-06, "loss": 0.0992, "step": 883 }, { "epoch": 0.3476892822025565, "grad_norm": 1.4323168992996216, "learning_rate": 4.123424259088525e-06, "loss": 0.0992, "step": 884 }, { "epoch": 0.3480825958702065, "grad_norm": 0.9091783165931702, "learning_rate": 4.120811123518349e-06, "loss": 0.0519, "step": 885 }, { "epoch": 0.34847590953785645, "grad_norm": 1.3111385107040405, "learning_rate": 4.1181949295096415e-06, "loss": 0.0811, "step": 886 }, { "epoch": 0.3488692232055064, "grad_norm": 2.218848705291748, "learning_rate": 4.11557568199911e-06, "loss": 0.0743, "step": 887 }, { "epoch": 0.34926253687315634, "grad_norm": 0.9991410970687866, "learning_rate": 4.112953385929221e-06, "loss": 0.0488, "step": 888 }, { "epoch": 0.3496558505408063, "grad_norm": 1.4411261081695557, "learning_rate": 4.110328046248196e-06, "loss": 0.0704, "step": 889 }, { "epoch": 0.3500491642084562, "grad_norm": 1.3707761764526367, "learning_rate": 4.107699667909999e-06, "loss": 0.0514, "step": 890 }, { "epoch": 0.3504424778761062, "grad_norm": 1.438081979751587, "learning_rate": 4.105068255874328e-06, "loss": 0.0622, "step": 891 }, { "epoch": 0.35083579154375616, "grad_norm": 1.0999984741210938, "learning_rate": 4.102433815106606e-06, "loss": 0.0423, "step": 892 }, { "epoch": 0.3512291052114061, "grad_norm": 1.6553218364715576, "learning_rate": 4.09979635057797e-06, "loss": 0.0621, "step": 893 }, { "epoch": 0.35162241887905604, "grad_norm": 2.6534736156463623, "learning_rate": 4.097155867265264e-06, "loss": 0.0956, "step": 894 }, { "epoch": 0.352015732546706, "grad_norm": 1.2164000272750854, "learning_rate": 4.094512370151027e-06, "loss": 0.064, "step": 895 }, { "epoch": 0.3524090462143559, "grad_norm": 1.4759900569915771, "learning_rate": 4.091865864223487e-06, "loss": 0.0496, "step": 896 }, { "epoch": 0.3528023598820059, "grad_norm": 1.3511669635772705, "learning_rate": 4.089216354476545e-06, "loss": 0.0662, "step": 897 }, { "epoch": 0.35319567354965586, "grad_norm": 1.4343103170394897, "learning_rate": 4.086563845909779e-06, "loss": 0.0543, "step": 898 }, { "epoch": 0.3535889872173058, "grad_norm": 0.5085878968238831, "learning_rate": 4.083908343528415e-06, "loss": 0.0457, "step": 899 }, { "epoch": 0.35398230088495575, "grad_norm": 0.9629530906677246, "learning_rate": 4.081249852343336e-06, "loss": 0.0422, "step": 900 }, { "epoch": 0.3543756145526057, "grad_norm": 1.697277307510376, "learning_rate": 4.078588377371062e-06, "loss": 0.0583, "step": 901 }, { "epoch": 0.35476892822025563, "grad_norm": 1.2820713520050049, "learning_rate": 4.075923923633745e-06, "loss": 0.0621, "step": 902 }, { "epoch": 0.3551622418879056, "grad_norm": 0.9127804636955261, "learning_rate": 4.073256496159153e-06, "loss": 0.0616, "step": 903 }, { "epoch": 0.35555555555555557, "grad_norm": 1.4303189516067505, "learning_rate": 4.070586099980672e-06, "loss": 0.0556, "step": 904 }, { "epoch": 0.3559488692232055, "grad_norm": 0.8110685348510742, "learning_rate": 4.067912740137285e-06, "loss": 0.0665, "step": 905 }, { "epoch": 0.35634218289085545, "grad_norm": 1.490004062652588, "learning_rate": 4.06523642167357e-06, "loss": 0.0771, "step": 906 }, { "epoch": 0.3567354965585054, "grad_norm": 1.763295292854309, "learning_rate": 4.062557149639688e-06, "loss": 0.0824, "step": 907 }, { "epoch": 0.35712881022615534, "grad_norm": 2.5675792694091797, "learning_rate": 4.059874929091369e-06, "loss": 0.0886, "step": 908 }, { "epoch": 0.35752212389380533, "grad_norm": 1.442456841468811, "learning_rate": 4.057189765089914e-06, "loss": 0.0507, "step": 909 }, { "epoch": 0.3579154375614553, "grad_norm": 1.2593395709991455, "learning_rate": 4.054501662702172e-06, "loss": 0.0555, "step": 910 }, { "epoch": 0.3583087512291052, "grad_norm": 1.1391284465789795, "learning_rate": 4.05181062700054e-06, "loss": 0.058, "step": 911 }, { "epoch": 0.35870206489675516, "grad_norm": 0.7833881378173828, "learning_rate": 4.049116663062949e-06, "loss": 0.0588, "step": 912 }, { "epoch": 0.3590953785644051, "grad_norm": 1.7920033931732178, "learning_rate": 4.046419775972855e-06, "loss": 0.1015, "step": 913 }, { "epoch": 0.35948869223205504, "grad_norm": 1.4693628549575806, "learning_rate": 4.043719970819231e-06, "loss": 0.0734, "step": 914 }, { "epoch": 0.35988200589970504, "grad_norm": 0.9692854285240173, "learning_rate": 4.041017252696556e-06, "loss": 0.0537, "step": 915 }, { "epoch": 0.360275319567355, "grad_norm": 0.9593791961669922, "learning_rate": 4.038311626704806e-06, "loss": 0.0599, "step": 916 }, { "epoch": 0.3606686332350049, "grad_norm": 1.1619371175765991, "learning_rate": 4.035603097949444e-06, "loss": 0.0597, "step": 917 }, { "epoch": 0.36106194690265486, "grad_norm": 1.3384184837341309, "learning_rate": 4.032891671541409e-06, "loss": 0.0513, "step": 918 }, { "epoch": 0.3614552605703048, "grad_norm": 0.7744063138961792, "learning_rate": 4.030177352597109e-06, "loss": 0.0428, "step": 919 }, { "epoch": 0.36184857423795475, "grad_norm": 1.1778054237365723, "learning_rate": 4.027460146238411e-06, "loss": 0.0733, "step": 920 }, { "epoch": 0.36224188790560474, "grad_norm": 1.161788821220398, "learning_rate": 4.02474005759263e-06, "loss": 0.0735, "step": 921 }, { "epoch": 0.3626352015732547, "grad_norm": 2.0623209476470947, "learning_rate": 4.022017091792518e-06, "loss": 0.065, "step": 922 }, { "epoch": 0.3630285152409046, "grad_norm": 1.3139375448226929, "learning_rate": 4.01929125397626e-06, "loss": 0.0582, "step": 923 }, { "epoch": 0.36342182890855457, "grad_norm": 2.0761849880218506, "learning_rate": 4.016562549287455e-06, "loss": 0.0557, "step": 924 }, { "epoch": 0.3638151425762045, "grad_norm": 1.474522352218628, "learning_rate": 4.013830982875117e-06, "loss": 0.0665, "step": 925 }, { "epoch": 0.36420845624385445, "grad_norm": 1.7274634838104248, "learning_rate": 4.0110965598936565e-06, "loss": 0.0735, "step": 926 }, { "epoch": 0.36460176991150445, "grad_norm": 0.7064616084098816, "learning_rate": 4.008359285502877e-06, "loss": 0.0449, "step": 927 }, { "epoch": 0.3649950835791544, "grad_norm": 0.8762916922569275, "learning_rate": 4.005619164867959e-06, "loss": 0.0582, "step": 928 }, { "epoch": 0.36538839724680433, "grad_norm": 1.2766094207763672, "learning_rate": 4.002876203159458e-06, "loss": 0.0467, "step": 929 }, { "epoch": 0.36578171091445427, "grad_norm": 1.4357662200927734, "learning_rate": 4.000130405553287e-06, "loss": 0.0676, "step": 930 }, { "epoch": 0.3661750245821042, "grad_norm": 1.755672574043274, "learning_rate": 3.997381777230714e-06, "loss": 0.0647, "step": 931 }, { "epoch": 0.36656833824975416, "grad_norm": 0.9483436942100525, "learning_rate": 3.994630323378344e-06, "loss": 0.0601, "step": 932 }, { "epoch": 0.36696165191740415, "grad_norm": 1.6659551858901978, "learning_rate": 3.991876049188116e-06, "loss": 0.0738, "step": 933 }, { "epoch": 0.3673549655850541, "grad_norm": 1.5737981796264648, "learning_rate": 3.989118959857293e-06, "loss": 0.0483, "step": 934 }, { "epoch": 0.36774827925270404, "grad_norm": 1.5014865398406982, "learning_rate": 3.986359060588446e-06, "loss": 0.0458, "step": 935 }, { "epoch": 0.368141592920354, "grad_norm": 1.5164520740509033, "learning_rate": 3.983596356589452e-06, "loss": 0.0617, "step": 936 }, { "epoch": 0.3685349065880039, "grad_norm": 2.2842421531677246, "learning_rate": 3.980830853073476e-06, "loss": 0.0816, "step": 937 }, { "epoch": 0.36892822025565386, "grad_norm": 1.5114701986312866, "learning_rate": 3.978062555258972e-06, "loss": 0.0355, "step": 938 }, { "epoch": 0.36932153392330386, "grad_norm": 1.2816709280014038, "learning_rate": 3.975291468369661e-06, "loss": 0.0556, "step": 939 }, { "epoch": 0.3697148475909538, "grad_norm": 2.0237350463867188, "learning_rate": 3.97251759763453e-06, "loss": 0.0622, "step": 940 }, { "epoch": 0.37010816125860374, "grad_norm": 1.3120791912078857, "learning_rate": 3.969740948287817e-06, "loss": 0.0414, "step": 941 }, { "epoch": 0.3705014749262537, "grad_norm": 1.3838061094284058, "learning_rate": 3.966961525569005e-06, "loss": 0.0653, "step": 942 }, { "epoch": 0.3708947885939036, "grad_norm": 0.6813984513282776, "learning_rate": 3.964179334722811e-06, "loss": 0.0345, "step": 943 }, { "epoch": 0.37128810226155357, "grad_norm": 0.8976694345474243, "learning_rate": 3.961394380999173e-06, "loss": 0.0314, "step": 944 }, { "epoch": 0.37168141592920356, "grad_norm": 0.9033572673797607, "learning_rate": 3.958606669653243e-06, "loss": 0.0542, "step": 945 }, { "epoch": 0.3720747295968535, "grad_norm": 0.901779055595398, "learning_rate": 3.955816205945378e-06, "loss": 0.0359, "step": 946 }, { "epoch": 0.37246804326450345, "grad_norm": 2.198181390762329, "learning_rate": 3.953022995141128e-06, "loss": 0.0473, "step": 947 }, { "epoch": 0.3728613569321534, "grad_norm": 1.4871481657028198, "learning_rate": 3.950227042511226e-06, "loss": 0.0888, "step": 948 }, { "epoch": 0.37325467059980333, "grad_norm": 1.3157522678375244, "learning_rate": 3.947428353331579e-06, "loss": 0.041, "step": 949 }, { "epoch": 0.37364798426745327, "grad_norm": 1.431186318397522, "learning_rate": 3.94462693288326e-06, "loss": 0.0799, "step": 950 }, { "epoch": 0.37404129793510327, "grad_norm": 1.389054775238037, "learning_rate": 3.941822786452491e-06, "loss": 0.0457, "step": 951 }, { "epoch": 0.3744346116027532, "grad_norm": 1.6102625131607056, "learning_rate": 3.939015919330643e-06, "loss": 0.0926, "step": 952 }, { "epoch": 0.37482792527040315, "grad_norm": 0.8472495675086975, "learning_rate": 3.936206336814219e-06, "loss": 0.0408, "step": 953 }, { "epoch": 0.3752212389380531, "grad_norm": 0.8631911873817444, "learning_rate": 3.933394044204843e-06, "loss": 0.0405, "step": 954 }, { "epoch": 0.37561455260570303, "grad_norm": 5.559257507324219, "learning_rate": 3.930579046809259e-06, "loss": 0.048, "step": 955 }, { "epoch": 0.376007866273353, "grad_norm": 1.6139276027679443, "learning_rate": 3.92776134993931e-06, "loss": 0.0596, "step": 956 }, { "epoch": 0.376401179941003, "grad_norm": 1.7035290002822876, "learning_rate": 3.924940958911933e-06, "loss": 0.061, "step": 957 }, { "epoch": 0.3767944936086529, "grad_norm": 0.8409842848777771, "learning_rate": 3.922117879049152e-06, "loss": 0.0416, "step": 958 }, { "epoch": 0.37718780727630286, "grad_norm": 1.9367414712905884, "learning_rate": 3.91929211567806e-06, "loss": 0.0617, "step": 959 }, { "epoch": 0.3775811209439528, "grad_norm": 1.0128939151763916, "learning_rate": 3.916463674130821e-06, "loss": 0.0477, "step": 960 }, { "epoch": 0.37797443461160274, "grad_norm": 1.9125791788101196, "learning_rate": 3.913632559744645e-06, "loss": 0.0571, "step": 961 }, { "epoch": 0.3783677482792527, "grad_norm": 1.4633182287216187, "learning_rate": 3.910798777861788e-06, "loss": 0.0511, "step": 962 }, { "epoch": 0.3787610619469027, "grad_norm": 0.9891822934150696, "learning_rate": 3.9079623338295436e-06, "loss": 0.0485, "step": 963 }, { "epoch": 0.3791543756145526, "grad_norm": 1.2277315855026245, "learning_rate": 3.9051232330002245e-06, "loss": 0.0449, "step": 964 }, { "epoch": 0.37954768928220256, "grad_norm": 0.49736377596855164, "learning_rate": 3.902281480731156e-06, "loss": 0.0213, "step": 965 }, { "epoch": 0.3799410029498525, "grad_norm": 0.982218861579895, "learning_rate": 3.899437082384671e-06, "loss": 0.0581, "step": 966 }, { "epoch": 0.38033431661750244, "grad_norm": 0.8971213102340698, "learning_rate": 3.89659004332809e-06, "loss": 0.0458, "step": 967 }, { "epoch": 0.3807276302851524, "grad_norm": 0.4127979874610901, "learning_rate": 3.893740368933722e-06, "loss": 0.0313, "step": 968 }, { "epoch": 0.3811209439528024, "grad_norm": 2.5857155323028564, "learning_rate": 3.8908880645788464e-06, "loss": 0.0711, "step": 969 }, { "epoch": 0.3815142576204523, "grad_norm": 1.2110406160354614, "learning_rate": 3.888033135645702e-06, "loss": 0.0508, "step": 970 }, { "epoch": 0.38190757128810227, "grad_norm": 1.58492112159729, "learning_rate": 3.885175587521486e-06, "loss": 0.0662, "step": 971 }, { "epoch": 0.3823008849557522, "grad_norm": 0.8792701363563538, "learning_rate": 3.882315425598334e-06, "loss": 0.0767, "step": 972 }, { "epoch": 0.38269419862340215, "grad_norm": 1.797515869140625, "learning_rate": 3.879452655273316e-06, "loss": 0.0585, "step": 973 }, { "epoch": 0.3830875122910521, "grad_norm": 1.6386829614639282, "learning_rate": 3.876587281948422e-06, "loss": 0.08, "step": 974 }, { "epoch": 0.3834808259587021, "grad_norm": 1.1229251623153687, "learning_rate": 3.873719311030556e-06, "loss": 0.0585, "step": 975 }, { "epoch": 0.38387413962635203, "grad_norm": 1.2260591983795166, "learning_rate": 3.8708487479315204e-06, "loss": 0.0647, "step": 976 }, { "epoch": 0.38426745329400197, "grad_norm": 1.565321683883667, "learning_rate": 3.867975598068012e-06, "loss": 0.067, "step": 977 }, { "epoch": 0.3846607669616519, "grad_norm": 1.4004123210906982, "learning_rate": 3.8650998668616085e-06, "loss": 0.0765, "step": 978 }, { "epoch": 0.38505408062930185, "grad_norm": 1.5652803182601929, "learning_rate": 3.862221559738757e-06, "loss": 0.0672, "step": 979 }, { "epoch": 0.3854473942969518, "grad_norm": 4.284322738647461, "learning_rate": 3.859340682130766e-06, "loss": 0.0692, "step": 980 }, { "epoch": 0.3858407079646018, "grad_norm": 1.21330988407135, "learning_rate": 3.856457239473795e-06, "loss": 0.0828, "step": 981 }, { "epoch": 0.38623402163225173, "grad_norm": 2.4526336193084717, "learning_rate": 3.853571237208843e-06, "loss": 0.0694, "step": 982 }, { "epoch": 0.3866273352999017, "grad_norm": 1.0117402076721191, "learning_rate": 3.8506826807817395e-06, "loss": 0.0362, "step": 983 }, { "epoch": 0.3870206489675516, "grad_norm": 1.1363615989685059, "learning_rate": 3.847791575643134e-06, "loss": 0.0543, "step": 984 }, { "epoch": 0.38741396263520156, "grad_norm": 1.1766973733901978, "learning_rate": 3.844897927248483e-06, "loss": 0.0488, "step": 985 }, { "epoch": 0.3878072763028515, "grad_norm": 0.8534460067749023, "learning_rate": 3.842001741058045e-06, "loss": 0.0603, "step": 986 }, { "epoch": 0.3882005899705015, "grad_norm": 1.5655368566513062, "learning_rate": 3.839103022536865e-06, "loss": 0.0713, "step": 987 }, { "epoch": 0.38859390363815144, "grad_norm": 0.6574957966804504, "learning_rate": 3.836201777154769e-06, "loss": 0.0583, "step": 988 }, { "epoch": 0.3889872173058014, "grad_norm": 0.8077657222747803, "learning_rate": 3.833298010386347e-06, "loss": 0.05, "step": 989 }, { "epoch": 0.3893805309734513, "grad_norm": 1.513853669166565, "learning_rate": 3.830391727710954e-06, "loss": 0.0502, "step": 990 }, { "epoch": 0.38977384464110126, "grad_norm": 2.019428253173828, "learning_rate": 3.827482934612684e-06, "loss": 0.0557, "step": 991 }, { "epoch": 0.3901671583087512, "grad_norm": 1.0257922410964966, "learning_rate": 3.824571636580372e-06, "loss": 0.0625, "step": 992 }, { "epoch": 0.3905604719764012, "grad_norm": 0.5803849697113037, "learning_rate": 3.821657839107583e-06, "loss": 0.0442, "step": 993 }, { "epoch": 0.39095378564405114, "grad_norm": 0.8499471545219421, "learning_rate": 3.818741547692593e-06, "loss": 0.0342, "step": 994 }, { "epoch": 0.3913470993117011, "grad_norm": 0.4951908588409424, "learning_rate": 3.815822767838386e-06, "loss": 0.0343, "step": 995 }, { "epoch": 0.391740412979351, "grad_norm": 1.5221655368804932, "learning_rate": 3.812901505052642e-06, "loss": 0.0465, "step": 996 }, { "epoch": 0.39213372664700097, "grad_norm": 1.7891956567764282, "learning_rate": 3.8099777648477264e-06, "loss": 0.0821, "step": 997 }, { "epoch": 0.3925270403146509, "grad_norm": 0.8419029116630554, "learning_rate": 3.8070515527406803e-06, "loss": 0.0546, "step": 998 }, { "epoch": 0.3929203539823009, "grad_norm": 0.9236086010932922, "learning_rate": 3.8041228742532064e-06, "loss": 0.0423, "step": 999 }, { "epoch": 0.39331366764995085, "grad_norm": 1.0892646312713623, "learning_rate": 3.8011917349116633e-06, "loss": 0.0531, "step": 1000 }, { "epoch": 0.3937069813176008, "grad_norm": 1.6544411182403564, "learning_rate": 3.7982581402470536e-06, "loss": 0.0404, "step": 1001 }, { "epoch": 0.39410029498525073, "grad_norm": 1.8338655233383179, "learning_rate": 3.795322095795012e-06, "loss": 0.0535, "step": 1002 }, { "epoch": 0.3944936086529007, "grad_norm": 1.4561970233917236, "learning_rate": 3.7923836070957963e-06, "loss": 0.0506, "step": 1003 }, { "epoch": 0.3948869223205506, "grad_norm": 1.1206718683242798, "learning_rate": 3.7894426796942773e-06, "loss": 0.07, "step": 1004 }, { "epoch": 0.3952802359882006, "grad_norm": 1.5864077806472778, "learning_rate": 3.786499319139926e-06, "loss": 0.0511, "step": 1005 }, { "epoch": 0.39567354965585055, "grad_norm": 1.6479477882385254, "learning_rate": 3.7835535309868055e-06, "loss": 0.1065, "step": 1006 }, { "epoch": 0.3960668633235005, "grad_norm": 1.173240303993225, "learning_rate": 3.78060532079356e-06, "loss": 0.0366, "step": 1007 }, { "epoch": 0.39646017699115044, "grad_norm": 1.512009859085083, "learning_rate": 3.777654694123404e-06, "loss": 0.0333, "step": 1008 }, { "epoch": 0.3968534906588004, "grad_norm": 0.7629926800727844, "learning_rate": 3.7747016565441112e-06, "loss": 0.0293, "step": 1009 }, { "epoch": 0.3972468043264503, "grad_norm": 1.325535774230957, "learning_rate": 3.771746213628006e-06, "loss": 0.0494, "step": 1010 }, { "epoch": 0.3976401179941003, "grad_norm": 0.9456796050071716, "learning_rate": 3.7687883709519496e-06, "loss": 0.0347, "step": 1011 }, { "epoch": 0.39803343166175026, "grad_norm": 1.6305729150772095, "learning_rate": 3.7658281340973336e-06, "loss": 0.0782, "step": 1012 }, { "epoch": 0.3984267453294002, "grad_norm": 2.3638815879821777, "learning_rate": 3.7628655086500654e-06, "loss": 0.0746, "step": 1013 }, { "epoch": 0.39882005899705014, "grad_norm": 1.1770771741867065, "learning_rate": 3.7599005002005616e-06, "loss": 0.0436, "step": 1014 }, { "epoch": 0.3992133726647001, "grad_norm": 1.2992199659347534, "learning_rate": 3.7569331143437336e-06, "loss": 0.0565, "step": 1015 }, { "epoch": 0.39960668633235, "grad_norm": 1.2094827890396118, "learning_rate": 3.7539633566789812e-06, "loss": 0.0536, "step": 1016 }, { "epoch": 0.4, "grad_norm": 1.641381859779358, "learning_rate": 3.750991232810177e-06, "loss": 0.0373, "step": 1017 }, { "epoch": 0.40039331366764996, "grad_norm": 0.7891103029251099, "learning_rate": 3.7480167483456603e-06, "loss": 0.0632, "step": 1018 }, { "epoch": 0.4007866273352999, "grad_norm": 0.7216825485229492, "learning_rate": 3.7450399088982247e-06, "loss": 0.0513, "step": 1019 }, { "epoch": 0.40117994100294985, "grad_norm": 0.7158090472221375, "learning_rate": 3.742060720085107e-06, "loss": 0.0456, "step": 1020 }, { "epoch": 0.4015732546705998, "grad_norm": 0.58232182264328, "learning_rate": 3.739079187527978e-06, "loss": 0.027, "step": 1021 }, { "epoch": 0.40196656833824973, "grad_norm": 1.546899437904358, "learning_rate": 3.73609531685293e-06, "loss": 0.1034, "step": 1022 }, { "epoch": 0.4023598820058997, "grad_norm": 1.1753488779067993, "learning_rate": 3.733109113690469e-06, "loss": 0.0609, "step": 1023 }, { "epoch": 0.40275319567354967, "grad_norm": 1.5217546224594116, "learning_rate": 3.7301205836755006e-06, "loss": 0.0853, "step": 1024 }, { "epoch": 0.4031465093411996, "grad_norm": 0.9366397857666016, "learning_rate": 3.727129732447322e-06, "loss": 0.0511, "step": 1025 }, { "epoch": 0.40353982300884955, "grad_norm": 0.8296689391136169, "learning_rate": 3.7241365656496103e-06, "loss": 0.0336, "step": 1026 }, { "epoch": 0.4039331366764995, "grad_norm": 0.8638429641723633, "learning_rate": 3.7211410889304117e-06, "loss": 0.0675, "step": 1027 }, { "epoch": 0.40432645034414944, "grad_norm": 0.6674923896789551, "learning_rate": 3.7181433079421316e-06, "loss": 0.0299, "step": 1028 }, { "epoch": 0.40471976401179943, "grad_norm": 1.5683988332748413, "learning_rate": 3.7151432283415244e-06, "loss": 0.0814, "step": 1029 }, { "epoch": 0.4051130776794494, "grad_norm": 0.6941884756088257, "learning_rate": 3.712140855789679e-06, "loss": 0.0428, "step": 1030 }, { "epoch": 0.4055063913470993, "grad_norm": 0.8299364447593689, "learning_rate": 3.709136195952015e-06, "loss": 0.0534, "step": 1031 }, { "epoch": 0.40589970501474926, "grad_norm": 1.065128207206726, "learning_rate": 3.706129254498266e-06, "loss": 0.0527, "step": 1032 }, { "epoch": 0.4062930186823992, "grad_norm": 1.3388938903808594, "learning_rate": 3.703120037102469e-06, "loss": 0.0619, "step": 1033 }, { "epoch": 0.40668633235004914, "grad_norm": 1.6854989528656006, "learning_rate": 3.7001085494429596e-06, "loss": 0.0605, "step": 1034 }, { "epoch": 0.40707964601769914, "grad_norm": 1.7878034114837646, "learning_rate": 3.697094797202355e-06, "loss": 0.0644, "step": 1035 }, { "epoch": 0.4074729596853491, "grad_norm": 0.7512350082397461, "learning_rate": 3.694078786067546e-06, "loss": 0.0561, "step": 1036 }, { "epoch": 0.407866273352999, "grad_norm": 0.5946680307388306, "learning_rate": 3.691060521729686e-06, "loss": 0.032, "step": 1037 }, { "epoch": 0.40825958702064896, "grad_norm": 0.7464413642883301, "learning_rate": 3.6880400098841794e-06, "loss": 0.0581, "step": 1038 }, { "epoch": 0.4086529006882989, "grad_norm": 1.3339935541152954, "learning_rate": 3.6850172562306735e-06, "loss": 0.065, "step": 1039 }, { "epoch": 0.40904621435594885, "grad_norm": 1.2734817266464233, "learning_rate": 3.681992266473044e-06, "loss": 0.0302, "step": 1040 }, { "epoch": 0.40943952802359884, "grad_norm": 1.6477503776550293, "learning_rate": 3.6789650463193864e-06, "loss": 0.0454, "step": 1041 }, { "epoch": 0.4098328416912488, "grad_norm": 1.9478659629821777, "learning_rate": 3.675935601482006e-06, "loss": 0.0906, "step": 1042 }, { "epoch": 0.4102261553588987, "grad_norm": 1.2177263498306274, "learning_rate": 3.6729039376774055e-06, "loss": 0.0708, "step": 1043 }, { "epoch": 0.41061946902654867, "grad_norm": 1.3361903429031372, "learning_rate": 3.6698700606262733e-06, "loss": 0.0542, "step": 1044 }, { "epoch": 0.4110127826941986, "grad_norm": 0.7786129117012024, "learning_rate": 3.6668339760534768e-06, "loss": 0.0666, "step": 1045 }, { "epoch": 0.41140609636184855, "grad_norm": 0.4651035964488983, "learning_rate": 3.6637956896880465e-06, "loss": 0.0442, "step": 1046 }, { "epoch": 0.41179941002949855, "grad_norm": 0.28553763031959534, "learning_rate": 3.6607552072631685e-06, "loss": 0.0266, "step": 1047 }, { "epoch": 0.4121927236971485, "grad_norm": 1.054947018623352, "learning_rate": 3.6577125345161748e-06, "loss": 0.0533, "step": 1048 }, { "epoch": 0.41258603736479843, "grad_norm": 0.6713748574256897, "learning_rate": 3.6546676771885257e-06, "loss": 0.0347, "step": 1049 }, { "epoch": 0.41297935103244837, "grad_norm": 1.4435083866119385, "learning_rate": 3.6516206410258092e-06, "loss": 0.0384, "step": 1050 }, { "epoch": 0.4133726647000983, "grad_norm": 1.4494538307189941, "learning_rate": 3.6485714317777223e-06, "loss": 0.068, "step": 1051 }, { "epoch": 0.41376597836774826, "grad_norm": 1.666913390159607, "learning_rate": 3.6455200551980605e-06, "loss": 0.0685, "step": 1052 }, { "epoch": 0.41415929203539825, "grad_norm": 2.99609375, "learning_rate": 3.642466517044713e-06, "loss": 0.1213, "step": 1053 }, { "epoch": 0.4145526057030482, "grad_norm": 1.6199326515197754, "learning_rate": 3.6394108230796455e-06, "loss": 0.0557, "step": 1054 }, { "epoch": 0.41494591937069814, "grad_norm": 0.6611631512641907, "learning_rate": 3.636352979068891e-06, "loss": 0.0333, "step": 1055 }, { "epoch": 0.4153392330383481, "grad_norm": 0.8349502086639404, "learning_rate": 3.6332929907825426e-06, "loss": 0.0285, "step": 1056 }, { "epoch": 0.415732546705998, "grad_norm": 1.6354492902755737, "learning_rate": 3.630230863994736e-06, "loss": 0.0808, "step": 1057 }, { "epoch": 0.41612586037364796, "grad_norm": 0.8214701414108276, "learning_rate": 3.6271666044836433e-06, "loss": 0.0355, "step": 1058 }, { "epoch": 0.41651917404129796, "grad_norm": 1.321581244468689, "learning_rate": 3.624100218031464e-06, "loss": 0.0444, "step": 1059 }, { "epoch": 0.4169124877089479, "grad_norm": 0.7428562641143799, "learning_rate": 3.621031710424407e-06, "loss": 0.0259, "step": 1060 }, { "epoch": 0.41730580137659784, "grad_norm": 0.7929845452308655, "learning_rate": 3.6179610874526856e-06, "loss": 0.0345, "step": 1061 }, { "epoch": 0.4176991150442478, "grad_norm": 0.6758319139480591, "learning_rate": 3.614888354910505e-06, "loss": 0.037, "step": 1062 }, { "epoch": 0.4180924287118977, "grad_norm": 1.5147916078567505, "learning_rate": 3.6118135185960507e-06, "loss": 0.0855, "step": 1063 }, { "epoch": 0.41848574237954766, "grad_norm": 1.0528610944747925, "learning_rate": 3.6087365843114773e-06, "loss": 0.0324, "step": 1064 }, { "epoch": 0.41887905604719766, "grad_norm": 1.3274002075195312, "learning_rate": 3.6056575578629006e-06, "loss": 0.0475, "step": 1065 }, { "epoch": 0.4192723697148476, "grad_norm": 0.5520153641700745, "learning_rate": 3.6025764450603808e-06, "loss": 0.022, "step": 1066 }, { "epoch": 0.41966568338249755, "grad_norm": 1.81023371219635, "learning_rate": 3.5994932517179182e-06, "loss": 0.043, "step": 1067 }, { "epoch": 0.4200589970501475, "grad_norm": 1.3602193593978882, "learning_rate": 3.596407983653436e-06, "loss": 0.073, "step": 1068 }, { "epoch": 0.42045231071779743, "grad_norm": 1.921582579612732, "learning_rate": 3.5933206466887755e-06, "loss": 0.0759, "step": 1069 }, { "epoch": 0.42084562438544737, "grad_norm": 0.8578033447265625, "learning_rate": 3.59023124664968e-06, "loss": 0.0249, "step": 1070 }, { "epoch": 0.42123893805309737, "grad_norm": 1.7219325304031372, "learning_rate": 3.5871397893657867e-06, "loss": 0.0596, "step": 1071 }, { "epoch": 0.4216322517207473, "grad_norm": 0.9463638663291931, "learning_rate": 3.5840462806706126e-06, "loss": 0.0454, "step": 1072 }, { "epoch": 0.42202556538839725, "grad_norm": 1.9718307256698608, "learning_rate": 3.5809507264015502e-06, "loss": 0.0623, "step": 1073 }, { "epoch": 0.4224188790560472, "grad_norm": 2.0382165908813477, "learning_rate": 3.5778531323998465e-06, "loss": 0.0497, "step": 1074 }, { "epoch": 0.42281219272369713, "grad_norm": 1.496324062347412, "learning_rate": 3.574753504510602e-06, "loss": 0.0826, "step": 1075 }, { "epoch": 0.4232055063913471, "grad_norm": 0.49463126063346863, "learning_rate": 3.571651848582753e-06, "loss": 0.0415, "step": 1076 }, { "epoch": 0.42359882005899707, "grad_norm": 1.1558905839920044, "learning_rate": 3.5685481704690617e-06, "loss": 0.0473, "step": 1077 }, { "epoch": 0.423992133726647, "grad_norm": 3.914982795715332, "learning_rate": 3.5654424760261082e-06, "loss": 0.0853, "step": 1078 }, { "epoch": 0.42438544739429696, "grad_norm": 1.7288295030593872, "learning_rate": 3.5623347711142764e-06, "loss": 0.0817, "step": 1079 }, { "epoch": 0.4247787610619469, "grad_norm": 1.0033987760543823, "learning_rate": 3.5592250615977434e-06, "loss": 0.0552, "step": 1080 }, { "epoch": 0.42517207472959684, "grad_norm": 1.461305856704712, "learning_rate": 3.5561133533444703e-06, "loss": 0.0659, "step": 1081 }, { "epoch": 0.4255653883972468, "grad_norm": 0.7007796168327332, "learning_rate": 3.552999652226189e-06, "loss": 0.0332, "step": 1082 }, { "epoch": 0.4259587020648968, "grad_norm": 0.7041943073272705, "learning_rate": 3.549883964118392e-06, "loss": 0.0205, "step": 1083 }, { "epoch": 0.4263520157325467, "grad_norm": 1.5797779560089111, "learning_rate": 3.54676629490032e-06, "loss": 0.0564, "step": 1084 }, { "epoch": 0.42674532940019666, "grad_norm": 1.4408408403396606, "learning_rate": 3.543646650454955e-06, "loss": 0.0347, "step": 1085 }, { "epoch": 0.4271386430678466, "grad_norm": 0.709080159664154, "learning_rate": 3.5405250366690023e-06, "loss": 0.0259, "step": 1086 }, { "epoch": 0.42753195673549654, "grad_norm": 1.4579590559005737, "learning_rate": 3.5374014594328877e-06, "loss": 0.0712, "step": 1087 }, { "epoch": 0.4279252704031465, "grad_norm": 0.9378184676170349, "learning_rate": 3.5342759246407378e-06, "loss": 0.0583, "step": 1088 }, { "epoch": 0.4283185840707965, "grad_norm": 0.9149574041366577, "learning_rate": 3.5311484381903754e-06, "loss": 0.0594, "step": 1089 }, { "epoch": 0.4287118977384464, "grad_norm": 1.2301528453826904, "learning_rate": 3.528019005983306e-06, "loss": 0.0603, "step": 1090 }, { "epoch": 0.42910521140609637, "grad_norm": 1.222373127937317, "learning_rate": 3.5248876339247053e-06, "loss": 0.0331, "step": 1091 }, { "epoch": 0.4294985250737463, "grad_norm": 1.5141066312789917, "learning_rate": 3.521754327923412e-06, "loss": 0.0662, "step": 1092 }, { "epoch": 0.42989183874139625, "grad_norm": 1.581040620803833, "learning_rate": 3.5186190938919106e-06, "loss": 0.0634, "step": 1093 }, { "epoch": 0.4302851524090462, "grad_norm": 1.1250847578048706, "learning_rate": 3.515481937746327e-06, "loss": 0.0428, "step": 1094 }, { "epoch": 0.4306784660766962, "grad_norm": 1.6886603832244873, "learning_rate": 3.5123428654064134e-06, "loss": 0.043, "step": 1095 }, { "epoch": 0.43107177974434613, "grad_norm": 2.050182819366455, "learning_rate": 3.509201882795536e-06, "loss": 0.1201, "step": 1096 }, { "epoch": 0.43146509341199607, "grad_norm": 1.2001996040344238, "learning_rate": 3.5060589958406677e-06, "loss": 0.0453, "step": 1097 }, { "epoch": 0.431858407079646, "grad_norm": 1.0683172941207886, "learning_rate": 3.5029142104723725e-06, "loss": 0.0331, "step": 1098 }, { "epoch": 0.43225172074729595, "grad_norm": 2.0737650394439697, "learning_rate": 3.4997675326247993e-06, "loss": 0.0526, "step": 1099 }, { "epoch": 0.4326450344149459, "grad_norm": 0.8983532190322876, "learning_rate": 3.4966189682356677e-06, "loss": 0.0532, "step": 1100 }, { "epoch": 0.4330383480825959, "grad_norm": 1.8358802795410156, "learning_rate": 3.493468523246255e-06, "loss": 0.0598, "step": 1101 }, { "epoch": 0.43343166175024583, "grad_norm": 2.076266050338745, "learning_rate": 3.4903162036013894e-06, "loss": 0.0836, "step": 1102 }, { "epoch": 0.4338249754178958, "grad_norm": 2.4419870376586914, "learning_rate": 3.487162015249436e-06, "loss": 0.0758, "step": 1103 }, { "epoch": 0.4342182890855457, "grad_norm": 1.3942052125930786, "learning_rate": 3.484005964142285e-06, "loss": 0.0803, "step": 1104 }, { "epoch": 0.43461160275319566, "grad_norm": 1.3950960636138916, "learning_rate": 3.4808480562353426e-06, "loss": 0.0675, "step": 1105 }, { "epoch": 0.4350049164208456, "grad_norm": 1.5000733137130737, "learning_rate": 3.477688297487519e-06, "loss": 0.0448, "step": 1106 }, { "epoch": 0.4353982300884956, "grad_norm": 1.5005849599838257, "learning_rate": 3.474526693861216e-06, "loss": 0.0729, "step": 1107 }, { "epoch": 0.43579154375614554, "grad_norm": 0.6299577951431274, "learning_rate": 3.4713632513223178e-06, "loss": 0.039, "step": 1108 }, { "epoch": 0.4361848574237955, "grad_norm": 0.8964212536811829, "learning_rate": 3.4681979758401767e-06, "loss": 0.0521, "step": 1109 }, { "epoch": 0.4365781710914454, "grad_norm": 1.3757152557373047, "learning_rate": 3.465030873387606e-06, "loss": 0.0598, "step": 1110 }, { "epoch": 0.43697148475909536, "grad_norm": 0.48663070797920227, "learning_rate": 3.461861949940865e-06, "loss": 0.0442, "step": 1111 }, { "epoch": 0.4373647984267453, "grad_norm": 0.8878856897354126, "learning_rate": 3.458691211479649e-06, "loss": 0.023, "step": 1112 }, { "epoch": 0.4377581120943953, "grad_norm": 1.1162179708480835, "learning_rate": 3.4555186639870795e-06, "loss": 0.0493, "step": 1113 }, { "epoch": 0.43815142576204524, "grad_norm": 1.1180258989334106, "learning_rate": 3.4523443134496916e-06, "loss": 0.0577, "step": 1114 }, { "epoch": 0.4385447394296952, "grad_norm": 0.6240465641021729, "learning_rate": 3.4491681658574205e-06, "loss": 0.0295, "step": 1115 }, { "epoch": 0.4389380530973451, "grad_norm": 2.439685106277466, "learning_rate": 3.445990227203594e-06, "loss": 0.0676, "step": 1116 }, { "epoch": 0.43933136676499507, "grad_norm": 1.1544771194458008, "learning_rate": 3.442810503484921e-06, "loss": 0.0487, "step": 1117 }, { "epoch": 0.439724680432645, "grad_norm": 1.794083833694458, "learning_rate": 3.4396290007014752e-06, "loss": 0.043, "step": 1118 }, { "epoch": 0.440117994100295, "grad_norm": 0.8073402643203735, "learning_rate": 3.4364457248566913e-06, "loss": 0.0404, "step": 1119 }, { "epoch": 0.44051130776794495, "grad_norm": 0.4391036331653595, "learning_rate": 3.433260681957346e-06, "loss": 0.0394, "step": 1120 }, { "epoch": 0.4409046214355949, "grad_norm": 1.0611299276351929, "learning_rate": 3.430073878013554e-06, "loss": 0.0263, "step": 1121 }, { "epoch": 0.44129793510324483, "grad_norm": 0.48767581582069397, "learning_rate": 3.4268853190387496e-06, "loss": 0.0341, "step": 1122 }, { "epoch": 0.4416912487708948, "grad_norm": 0.6423639059066772, "learning_rate": 3.423695011049683e-06, "loss": 0.0234, "step": 1123 }, { "epoch": 0.4420845624385447, "grad_norm": 1.0390664339065552, "learning_rate": 3.4205029600663996e-06, "loss": 0.0593, "step": 1124 }, { "epoch": 0.4424778761061947, "grad_norm": 1.2516858577728271, "learning_rate": 3.4173091721122375e-06, "loss": 0.0375, "step": 1125 }, { "epoch": 0.44287118977384465, "grad_norm": 1.670310139656067, "learning_rate": 3.414113653213812e-06, "loss": 0.0504, "step": 1126 }, { "epoch": 0.4432645034414946, "grad_norm": 2.317314624786377, "learning_rate": 3.410916409401004e-06, "loss": 0.0911, "step": 1127 }, { "epoch": 0.44365781710914454, "grad_norm": 1.418398141860962, "learning_rate": 3.407717446706948e-06, "loss": 0.0439, "step": 1128 }, { "epoch": 0.4440511307767945, "grad_norm": 1.1104565858840942, "learning_rate": 3.4045167711680244e-06, "loss": 0.0485, "step": 1129 }, { "epoch": 0.4444444444444444, "grad_norm": 1.8792333602905273, "learning_rate": 3.4013143888238455e-06, "loss": 0.064, "step": 1130 }, { "epoch": 0.4448377581120944, "grad_norm": 1.7921650409698486, "learning_rate": 3.398110305717241e-06, "loss": 0.0495, "step": 1131 }, { "epoch": 0.44523107177974436, "grad_norm": 1.4747095108032227, "learning_rate": 3.3949045278942545e-06, "loss": 0.0743, "step": 1132 }, { "epoch": 0.4456243854473943, "grad_norm": 0.6847875118255615, "learning_rate": 3.3916970614041244e-06, "loss": 0.0224, "step": 1133 }, { "epoch": 0.44601769911504424, "grad_norm": 0.7522935271263123, "learning_rate": 3.3884879122992762e-06, "loss": 0.0334, "step": 1134 }, { "epoch": 0.4464110127826942, "grad_norm": 1.5176104307174683, "learning_rate": 3.3852770866353125e-06, "loss": 0.0729, "step": 1135 }, { "epoch": 0.4468043264503441, "grad_norm": 1.188468337059021, "learning_rate": 3.382064590470996e-06, "loss": 0.0315, "step": 1136 }, { "epoch": 0.4471976401179941, "grad_norm": 0.5583229660987854, "learning_rate": 3.378850429868244e-06, "loss": 0.0292, "step": 1137 }, { "epoch": 0.44759095378564406, "grad_norm": 0.7804880738258362, "learning_rate": 3.3756346108921145e-06, "loss": 0.0378, "step": 1138 }, { "epoch": 0.447984267453294, "grad_norm": 1.090079426765442, "learning_rate": 3.372417139610793e-06, "loss": 0.0549, "step": 1139 }, { "epoch": 0.44837758112094395, "grad_norm": 1.363856554031372, "learning_rate": 3.369198022095585e-06, "loss": 0.0859, "step": 1140 }, { "epoch": 0.4487708947885939, "grad_norm": 1.162818431854248, "learning_rate": 3.3659772644209023e-06, "loss": 0.0292, "step": 1141 }, { "epoch": 0.44916420845624383, "grad_norm": 0.8213643431663513, "learning_rate": 3.36275487266425e-06, "loss": 0.0435, "step": 1142 }, { "epoch": 0.4495575221238938, "grad_norm": 0.8050291538238525, "learning_rate": 3.3595308529062176e-06, "loss": 0.0279, "step": 1143 }, { "epoch": 0.44995083579154377, "grad_norm": 1.1065354347229004, "learning_rate": 3.3563052112304674e-06, "loss": 0.0425, "step": 1144 }, { "epoch": 0.4503441494591937, "grad_norm": 0.9072518348693848, "learning_rate": 3.3530779537237194e-06, "loss": 0.0315, "step": 1145 }, { "epoch": 0.45073746312684365, "grad_norm": 0.8572150468826294, "learning_rate": 3.349849086475747e-06, "loss": 0.0306, "step": 1146 }, { "epoch": 0.4511307767944936, "grad_norm": 1.552173137664795, "learning_rate": 3.346618615579359e-06, "loss": 0.0671, "step": 1147 }, { "epoch": 0.45152409046214353, "grad_norm": 0.9978398084640503, "learning_rate": 3.3433865471303876e-06, "loss": 0.0667, "step": 1148 }, { "epoch": 0.45191740412979353, "grad_norm": 2.7961080074310303, "learning_rate": 3.3401528872276847e-06, "loss": 0.0696, "step": 1149 }, { "epoch": 0.4523107177974435, "grad_norm": 1.520912528038025, "learning_rate": 3.3369176419731004e-06, "loss": 0.0722, "step": 1150 }, { "epoch": 0.4527040314650934, "grad_norm": 0.8389769196510315, "learning_rate": 3.33368081747148e-06, "loss": 0.0444, "step": 1151 }, { "epoch": 0.45309734513274336, "grad_norm": 2.075424909591675, "learning_rate": 3.3304424198306464e-06, "loss": 0.0826, "step": 1152 }, { "epoch": 0.4534906588003933, "grad_norm": 0.7416201829910278, "learning_rate": 3.3272024551613926e-06, "loss": 0.0283, "step": 1153 }, { "epoch": 0.45388397246804324, "grad_norm": 1.0457786321640015, "learning_rate": 3.3239609295774667e-06, "loss": 0.0418, "step": 1154 }, { "epoch": 0.45427728613569324, "grad_norm": 0.9312077760696411, "learning_rate": 3.3207178491955656e-06, "loss": 0.0341, "step": 1155 }, { "epoch": 0.4546705998033432, "grad_norm": 0.9886119365692139, "learning_rate": 3.3174732201353155e-06, "loss": 0.0623, "step": 1156 }, { "epoch": 0.4550639134709931, "grad_norm": 1.2970693111419678, "learning_rate": 3.3142270485192683e-06, "loss": 0.087, "step": 1157 }, { "epoch": 0.45545722713864306, "grad_norm": 1.273305892944336, "learning_rate": 3.3109793404728855e-06, "loss": 0.0654, "step": 1158 }, { "epoch": 0.455850540806293, "grad_norm": 0.8121715188026428, "learning_rate": 3.3077301021245285e-06, "loss": 0.0257, "step": 1159 }, { "epoch": 0.45624385447394294, "grad_norm": 1.6593793630599976, "learning_rate": 3.3044793396054447e-06, "loss": 0.0679, "step": 1160 }, { "epoch": 0.45663716814159294, "grad_norm": 1.2623846530914307, "learning_rate": 3.3012270590497596e-06, "loss": 0.071, "step": 1161 }, { "epoch": 0.4570304818092429, "grad_norm": 0.9096400737762451, "learning_rate": 3.2979732665944615e-06, "loss": 0.067, "step": 1162 }, { "epoch": 0.4574237954768928, "grad_norm": 0.9472593069076538, "learning_rate": 3.2947179683793928e-06, "loss": 0.0395, "step": 1163 }, { "epoch": 0.45781710914454277, "grad_norm": 0.9576103091239929, "learning_rate": 3.291461170547237e-06, "loss": 0.049, "step": 1164 }, { "epoch": 0.4582104228121927, "grad_norm": 0.9918181300163269, "learning_rate": 3.2882028792435072e-06, "loss": 0.0318, "step": 1165 }, { "epoch": 0.45860373647984265, "grad_norm": 1.843493938446045, "learning_rate": 3.2849431006165343e-06, "loss": 0.0634, "step": 1166 }, { "epoch": 0.45899705014749265, "grad_norm": 0.8672575950622559, "learning_rate": 3.2816818408174567e-06, "loss": 0.0826, "step": 1167 }, { "epoch": 0.4593903638151426, "grad_norm": 1.5660734176635742, "learning_rate": 3.278419106000206e-06, "loss": 0.0695, "step": 1168 }, { "epoch": 0.45978367748279253, "grad_norm": 1.3234399557113647, "learning_rate": 3.2751549023214995e-06, "loss": 0.0381, "step": 1169 }, { "epoch": 0.46017699115044247, "grad_norm": 1.7596269845962524, "learning_rate": 3.2718892359408245e-06, "loss": 0.0438, "step": 1170 }, { "epoch": 0.4605703048180924, "grad_norm": 0.6878931522369385, "learning_rate": 3.2686221130204287e-06, "loss": 0.0347, "step": 1171 }, { "epoch": 0.46096361848574235, "grad_norm": 1.0857138633728027, "learning_rate": 3.265353539725309e-06, "loss": 0.0609, "step": 1172 }, { "epoch": 0.46135693215339235, "grad_norm": 0.777098536491394, "learning_rate": 3.2620835222231972e-06, "loss": 0.0597, "step": 1173 }, { "epoch": 0.4617502458210423, "grad_norm": 4.028940677642822, "learning_rate": 3.2588120666845534e-06, "loss": 0.0702, "step": 1174 }, { "epoch": 0.46214355948869223, "grad_norm": 1.3609766960144043, "learning_rate": 3.255539179282548e-06, "loss": 0.0478, "step": 1175 }, { "epoch": 0.4625368731563422, "grad_norm": 1.3808916807174683, "learning_rate": 3.2522648661930558e-06, "loss": 0.0787, "step": 1176 }, { "epoch": 0.4629301868239921, "grad_norm": 1.464201807975769, "learning_rate": 3.2489891335946413e-06, "loss": 0.0565, "step": 1177 }, { "epoch": 0.46332350049164206, "grad_norm": 1.4196548461914062, "learning_rate": 3.245711987668545e-06, "loss": 0.0747, "step": 1178 }, { "epoch": 0.46371681415929206, "grad_norm": 1.5526188611984253, "learning_rate": 3.2424334345986787e-06, "loss": 0.0384, "step": 1179 }, { "epoch": 0.464110127826942, "grad_norm": 1.4707880020141602, "learning_rate": 3.239153480571605e-06, "loss": 0.0669, "step": 1180 }, { "epoch": 0.46450344149459194, "grad_norm": 1.5997252464294434, "learning_rate": 3.2358721317765344e-06, "loss": 0.063, "step": 1181 }, { "epoch": 0.4648967551622419, "grad_norm": 0.7773184180259705, "learning_rate": 3.2325893944053066e-06, "loss": 0.0515, "step": 1182 }, { "epoch": 0.4652900688298918, "grad_norm": 1.1635929346084595, "learning_rate": 3.2293052746523814e-06, "loss": 0.0494, "step": 1183 }, { "epoch": 0.46568338249754176, "grad_norm": 0.9854192137718201, "learning_rate": 3.2260197787148277e-06, "loss": 0.0559, "step": 1184 }, { "epoch": 0.46607669616519176, "grad_norm": 1.9313583374023438, "learning_rate": 3.222732912792313e-06, "loss": 0.0447, "step": 1185 }, { "epoch": 0.4664700098328417, "grad_norm": 2.149656295776367, "learning_rate": 3.2194446830870865e-06, "loss": 0.0772, "step": 1186 }, { "epoch": 0.46686332350049164, "grad_norm": 1.784822940826416, "learning_rate": 3.2161550958039732e-06, "loss": 0.0746, "step": 1187 }, { "epoch": 0.4672566371681416, "grad_norm": 1.5821526050567627, "learning_rate": 3.2128641571503594e-06, "loss": 0.0613, "step": 1188 }, { "epoch": 0.46764995083579153, "grad_norm": 1.6123450994491577, "learning_rate": 3.2095718733361803e-06, "loss": 0.0419, "step": 1189 }, { "epoch": 0.46804326450344147, "grad_norm": 1.5458816289901733, "learning_rate": 3.2062782505739125e-06, "loss": 0.0854, "step": 1190 }, { "epoch": 0.46843657817109147, "grad_norm": 1.5308221578598022, "learning_rate": 3.202983295078555e-06, "loss": 0.063, "step": 1191 }, { "epoch": 0.4688298918387414, "grad_norm": 1.166703224182129, "learning_rate": 3.199687013067624e-06, "loss": 0.0759, "step": 1192 }, { "epoch": 0.46922320550639135, "grad_norm": 1.2040659189224243, "learning_rate": 3.1963894107611395e-06, "loss": 0.0648, "step": 1193 }, { "epoch": 0.4696165191740413, "grad_norm": 0.8159343004226685, "learning_rate": 3.1930904943816104e-06, "loss": 0.0252, "step": 1194 }, { "epoch": 0.47000983284169123, "grad_norm": 0.5714221596717834, "learning_rate": 3.189790270154028e-06, "loss": 0.0402, "step": 1195 }, { "epoch": 0.4704031465093412, "grad_norm": 1.1028029918670654, "learning_rate": 3.186488744305849e-06, "loss": 0.0358, "step": 1196 }, { "epoch": 0.47079646017699117, "grad_norm": 1.1706167459487915, "learning_rate": 3.183185923066988e-06, "loss": 0.0405, "step": 1197 }, { "epoch": 0.4711897738446411, "grad_norm": 2.2323551177978516, "learning_rate": 3.179881812669804e-06, "loss": 0.0626, "step": 1198 }, { "epoch": 0.47158308751229105, "grad_norm": 1.4933780431747437, "learning_rate": 3.1765764193490863e-06, "loss": 0.0421, "step": 1199 }, { "epoch": 0.471976401179941, "grad_norm": 1.759582281112671, "learning_rate": 3.173269749342047e-06, "loss": 0.0386, "step": 1200 }, { "epoch": 0.47236971484759094, "grad_norm": 0.9716536998748779, "learning_rate": 3.1699618088883094e-06, "loss": 0.0469, "step": 1201 }, { "epoch": 0.4727630285152409, "grad_norm": 1.4588727951049805, "learning_rate": 3.1666526042298883e-06, "loss": 0.062, "step": 1202 }, { "epoch": 0.4731563421828909, "grad_norm": 0.7807295918464661, "learning_rate": 3.16334214161119e-06, "loss": 0.0516, "step": 1203 }, { "epoch": 0.4735496558505408, "grad_norm": 0.9360034465789795, "learning_rate": 3.1600304272789904e-06, "loss": 0.0413, "step": 1204 }, { "epoch": 0.47394296951819076, "grad_norm": 3.0252861976623535, "learning_rate": 3.1567174674824303e-06, "loss": 0.0517, "step": 1205 }, { "epoch": 0.4743362831858407, "grad_norm": 1.2127926349639893, "learning_rate": 3.1534032684729978e-06, "loss": 0.0634, "step": 1206 }, { "epoch": 0.47472959685349064, "grad_norm": 1.008239984512329, "learning_rate": 3.1500878365045217e-06, "loss": 0.035, "step": 1207 }, { "epoch": 0.4751229105211406, "grad_norm": 0.8630732893943787, "learning_rate": 3.1467711778331573e-06, "loss": 0.0432, "step": 1208 }, { "epoch": 0.4755162241887906, "grad_norm": 0.5713632702827454, "learning_rate": 3.143453298717373e-06, "loss": 0.0293, "step": 1209 }, { "epoch": 0.4759095378564405, "grad_norm": 1.3503292798995972, "learning_rate": 3.14013420541794e-06, "loss": 0.0488, "step": 1210 }, { "epoch": 0.47630285152409046, "grad_norm": 0.6340729594230652, "learning_rate": 3.1368139041979235e-06, "loss": 0.0352, "step": 1211 }, { "epoch": 0.4766961651917404, "grad_norm": 2.0643789768218994, "learning_rate": 3.133492401322666e-06, "loss": 0.0602, "step": 1212 }, { "epoch": 0.47708947885939035, "grad_norm": 1.456824779510498, "learning_rate": 3.1301697030597772e-06, "loss": 0.0576, "step": 1213 }, { "epoch": 0.4774827925270403, "grad_norm": 1.6788169145584106, "learning_rate": 3.126845815679123e-06, "loss": 0.0473, "step": 1214 }, { "epoch": 0.4778761061946903, "grad_norm": 0.9894094467163086, "learning_rate": 3.1235207454528137e-06, "loss": 0.0486, "step": 1215 }, { "epoch": 0.47826941986234023, "grad_norm": 0.6644244194030762, "learning_rate": 3.12019449865519e-06, "loss": 0.0348, "step": 1216 }, { "epoch": 0.47866273352999017, "grad_norm": 1.8796205520629883, "learning_rate": 3.116867081562815e-06, "loss": 0.0711, "step": 1217 }, { "epoch": 0.4790560471976401, "grad_norm": 0.71921706199646, "learning_rate": 3.1135385004544584e-06, "loss": 0.0439, "step": 1218 }, { "epoch": 0.47944936086529005, "grad_norm": 1.4723786115646362, "learning_rate": 3.1102087616110866e-06, "loss": 0.0948, "step": 1219 }, { "epoch": 0.47984267453294, "grad_norm": 1.0385109186172485, "learning_rate": 3.1068778713158515e-06, "loss": 0.0481, "step": 1220 }, { "epoch": 0.48023598820059, "grad_norm": 1.8688119649887085, "learning_rate": 3.1035458358540764e-06, "loss": 0.0962, "step": 1221 }, { "epoch": 0.48062930186823993, "grad_norm": 0.988058865070343, "learning_rate": 3.100212661513247e-06, "loss": 0.0862, "step": 1222 }, { "epoch": 0.4810226155358899, "grad_norm": 0.7118948698043823, "learning_rate": 3.096878354582998e-06, "loss": 0.0492, "step": 1223 }, { "epoch": 0.4814159292035398, "grad_norm": 1.1759183406829834, "learning_rate": 3.093542921355099e-06, "loss": 0.0278, "step": 1224 }, { "epoch": 0.48180924287118976, "grad_norm": 0.8185058832168579, "learning_rate": 3.0902063681234473e-06, "loss": 0.0618, "step": 1225 }, { "epoch": 0.4822025565388397, "grad_norm": 1.0773781538009644, "learning_rate": 3.086868701184054e-06, "loss": 0.0393, "step": 1226 }, { "epoch": 0.4825958702064897, "grad_norm": 1.4859130382537842, "learning_rate": 3.083529926835028e-06, "loss": 0.0425, "step": 1227 }, { "epoch": 0.48298918387413964, "grad_norm": 0.8524113297462463, "learning_rate": 3.0801900513765732e-06, "loss": 0.0667, "step": 1228 }, { "epoch": 0.4833824975417896, "grad_norm": 1.2344658374786377, "learning_rate": 3.076849081110967e-06, "loss": 0.0469, "step": 1229 }, { "epoch": 0.4837758112094395, "grad_norm": 1.4112597703933716, "learning_rate": 3.073507022342554e-06, "loss": 0.0439, "step": 1230 }, { "epoch": 0.48416912487708946, "grad_norm": 1.0202746391296387, "learning_rate": 3.070163881377734e-06, "loss": 0.0953, "step": 1231 }, { "epoch": 0.4845624385447394, "grad_norm": 1.2902711629867554, "learning_rate": 3.066819664524947e-06, "loss": 0.0378, "step": 1232 }, { "epoch": 0.4849557522123894, "grad_norm": 0.8746582269668579, "learning_rate": 3.063474378094665e-06, "loss": 0.0404, "step": 1233 }, { "epoch": 0.48534906588003934, "grad_norm": 1.8847814798355103, "learning_rate": 3.060128028399376e-06, "loss": 0.0779, "step": 1234 }, { "epoch": 0.4857423795476893, "grad_norm": 1.2793282270431519, "learning_rate": 3.056780621753577e-06, "loss": 0.0433, "step": 1235 }, { "epoch": 0.4861356932153392, "grad_norm": 1.4302126169204712, "learning_rate": 3.0534321644737574e-06, "loss": 0.0565, "step": 1236 }, { "epoch": 0.48652900688298917, "grad_norm": 0.8506616353988647, "learning_rate": 3.0500826628783903e-06, "loss": 0.0448, "step": 1237 }, { "epoch": 0.4869223205506391, "grad_norm": 1.7796978950500488, "learning_rate": 3.046732123287918e-06, "loss": 0.0449, "step": 1238 }, { "epoch": 0.4873156342182891, "grad_norm": 1.4967756271362305, "learning_rate": 3.043380552024744e-06, "loss": 0.0409, "step": 1239 }, { "epoch": 0.48770894788593905, "grad_norm": 1.2920217514038086, "learning_rate": 3.0400279554132157e-06, "loss": 0.0465, "step": 1240 }, { "epoch": 0.488102261553589, "grad_norm": 1.9115070104599, "learning_rate": 3.0366743397796166e-06, "loss": 0.0591, "step": 1241 }, { "epoch": 0.48849557522123893, "grad_norm": 0.988409161567688, "learning_rate": 3.033319711452154e-06, "loss": 0.042, "step": 1242 }, { "epoch": 0.4888888888888889, "grad_norm": 2.1158268451690674, "learning_rate": 3.0299640767609447e-06, "loss": 0.0792, "step": 1243 }, { "epoch": 0.4892822025565388, "grad_norm": 1.1518357992172241, "learning_rate": 3.0266074420380043e-06, "loss": 0.0554, "step": 1244 }, { "epoch": 0.4896755162241888, "grad_norm": 1.3400568962097168, "learning_rate": 3.023249813617238e-06, "loss": 0.0545, "step": 1245 }, { "epoch": 0.49006882989183875, "grad_norm": 0.8380603790283203, "learning_rate": 3.0198911978344213e-06, "loss": 0.0377, "step": 1246 }, { "epoch": 0.4904621435594887, "grad_norm": 1.3251253366470337, "learning_rate": 3.0165316010271982e-06, "loss": 0.0419, "step": 1247 }, { "epoch": 0.49085545722713864, "grad_norm": 0.7429760098457336, "learning_rate": 3.0131710295350615e-06, "loss": 0.0487, "step": 1248 }, { "epoch": 0.4912487708947886, "grad_norm": 1.619492530822754, "learning_rate": 3.0098094896993413e-06, "loss": 0.0364, "step": 1249 }, { "epoch": 0.4916420845624385, "grad_norm": 1.8555465936660767, "learning_rate": 3.0064469878631986e-06, "loss": 0.0327, "step": 1250 }, { "epoch": 0.4920353982300885, "grad_norm": 2.1514008045196533, "learning_rate": 3.003083530371606e-06, "loss": 0.0961, "step": 1251 }, { "epoch": 0.49242871189773846, "grad_norm": 1.1894843578338623, "learning_rate": 2.9997191235713435e-06, "loss": 0.0773, "step": 1252 }, { "epoch": 0.4928220255653884, "grad_norm": 1.375878095626831, "learning_rate": 2.9963537738109783e-06, "loss": 0.0635, "step": 1253 }, { "epoch": 0.49321533923303834, "grad_norm": 0.9740056395530701, "learning_rate": 2.9929874874408595e-06, "loss": 0.0581, "step": 1254 }, { "epoch": 0.4936086529006883, "grad_norm": 1.21156907081604, "learning_rate": 2.9896202708131027e-06, "loss": 0.0524, "step": 1255 }, { "epoch": 0.4940019665683382, "grad_norm": 4.271803855895996, "learning_rate": 2.98625213028158e-06, "loss": 0.0437, "step": 1256 }, { "epoch": 0.4943952802359882, "grad_norm": 1.0697994232177734, "learning_rate": 2.9828830722019046e-06, "loss": 0.0693, "step": 1257 }, { "epoch": 0.49478859390363816, "grad_norm": 1.0657457113265991, "learning_rate": 2.979513102931424e-06, "loss": 0.0788, "step": 1258 }, { "epoch": 0.4951819075712881, "grad_norm": 1.6833268404006958, "learning_rate": 2.9761422288292017e-06, "loss": 0.0755, "step": 1259 }, { "epoch": 0.49557522123893805, "grad_norm": 0.7139087915420532, "learning_rate": 2.9727704562560124e-06, "loss": 0.0416, "step": 1260 }, { "epoch": 0.495968534906588, "grad_norm": 1.025672435760498, "learning_rate": 2.9693977915743227e-06, "loss": 0.057, "step": 1261 }, { "epoch": 0.49636184857423793, "grad_norm": 1.6005637645721436, "learning_rate": 2.9660242411482848e-06, "loss": 0.0694, "step": 1262 }, { "epoch": 0.4967551622418879, "grad_norm": 1.2426131963729858, "learning_rate": 2.9626498113437215e-06, "loss": 0.0443, "step": 1263 }, { "epoch": 0.49714847590953787, "grad_norm": 1.0461783409118652, "learning_rate": 2.9592745085281154e-06, "loss": 0.0449, "step": 1264 }, { "epoch": 0.4975417895771878, "grad_norm": 1.1440929174423218, "learning_rate": 2.955898339070596e-06, "loss": 0.0429, "step": 1265 }, { "epoch": 0.49793510324483775, "grad_norm": 1.5936861038208008, "learning_rate": 2.9525213093419275e-06, "loss": 0.0517, "step": 1266 }, { "epoch": 0.4983284169124877, "grad_norm": 0.9140682220458984, "learning_rate": 2.9491434257144995e-06, "loss": 0.0699, "step": 1267 }, { "epoch": 0.49872173058013763, "grad_norm": 0.6656792759895325, "learning_rate": 2.9457646945623107e-06, "loss": 0.023, "step": 1268 }, { "epoch": 0.49911504424778763, "grad_norm": 1.1062997579574585, "learning_rate": 2.9423851222609607e-06, "loss": 0.0801, "step": 1269 }, { "epoch": 0.4995083579154376, "grad_norm": 0.9155628085136414, "learning_rate": 2.939004715187635e-06, "loss": 0.0704, "step": 1270 }, { "epoch": 0.4999016715830875, "grad_norm": 0.8905113339424133, "learning_rate": 2.935623479721095e-06, "loss": 0.0442, "step": 1271 }, { "epoch": 0.5002949852507375, "grad_norm": 0.8276392817497253, "learning_rate": 2.932241422241665e-06, "loss": 0.0535, "step": 1272 }, { "epoch": 0.5006882989183874, "grad_norm": 0.5640360713005066, "learning_rate": 2.9288585491312206e-06, "loss": 0.0411, "step": 1273 }, { "epoch": 0.5010816125860373, "grad_norm": 1.5979022979736328, "learning_rate": 2.925474866773176e-06, "loss": 0.0703, "step": 1274 }, { "epoch": 0.5014749262536873, "grad_norm": 1.1477428674697876, "learning_rate": 2.922090381552475e-06, "loss": 0.0488, "step": 1275 }, { "epoch": 0.5018682399213372, "grad_norm": 1.544410228729248, "learning_rate": 2.9187050998555715e-06, "loss": 0.0689, "step": 1276 }, { "epoch": 0.5022615535889872, "grad_norm": 1.16623055934906, "learning_rate": 2.915319028070427e-06, "loss": 0.0681, "step": 1277 }, { "epoch": 0.5026548672566372, "grad_norm": 0.2639702558517456, "learning_rate": 2.9119321725864914e-06, "loss": 0.0321, "step": 1278 }, { "epoch": 0.5030481809242872, "grad_norm": 0.9400918483734131, "learning_rate": 2.908544539794693e-06, "loss": 0.0726, "step": 1279 }, { "epoch": 0.5034414945919371, "grad_norm": 2.083108425140381, "learning_rate": 2.9051561360874297e-06, "loss": 0.0567, "step": 1280 }, { "epoch": 0.503834808259587, "grad_norm": 0.9149637818336487, "learning_rate": 2.901766967858551e-06, "loss": 0.0626, "step": 1281 }, { "epoch": 0.504228121927237, "grad_norm": 0.6115841269493103, "learning_rate": 2.8983770415033507e-06, "loss": 0.0386, "step": 1282 }, { "epoch": 0.5046214355948869, "grad_norm": 1.530674695968628, "learning_rate": 2.8949863634185533e-06, "loss": 0.0743, "step": 1283 }, { "epoch": 0.5050147492625369, "grad_norm": 0.9860877990722656, "learning_rate": 2.8915949400022995e-06, "loss": 0.0397, "step": 1284 }, { "epoch": 0.5054080629301868, "grad_norm": 1.6740636825561523, "learning_rate": 2.8882027776541406e-06, "loss": 0.0997, "step": 1285 }, { "epoch": 0.5058013765978367, "grad_norm": 1.1494807004928589, "learning_rate": 2.8848098827750186e-06, "loss": 0.0639, "step": 1286 }, { "epoch": 0.5061946902654867, "grad_norm": 1.5039880275726318, "learning_rate": 2.8814162617672586e-06, "loss": 0.0615, "step": 1287 }, { "epoch": 0.5065880039331366, "grad_norm": 1.2192140817642212, "learning_rate": 2.8780219210345573e-06, "loss": 0.0543, "step": 1288 }, { "epoch": 0.5069813176007866, "grad_norm": 1.1865425109863281, "learning_rate": 2.8746268669819676e-06, "loss": 0.069, "step": 1289 }, { "epoch": 0.5073746312684366, "grad_norm": 1.6422653198242188, "learning_rate": 2.8712311060158904e-06, "loss": 0.0407, "step": 1290 }, { "epoch": 0.5077679449360866, "grad_norm": 1.0872414112091064, "learning_rate": 2.8678346445440588e-06, "loss": 0.0485, "step": 1291 }, { "epoch": 0.5081612586037365, "grad_norm": 1.3887152671813965, "learning_rate": 2.8644374889755284e-06, "loss": 0.0594, "step": 1292 }, { "epoch": 0.5085545722713865, "grad_norm": 0.9311152100563049, "learning_rate": 2.861039645720664e-06, "loss": 0.0558, "step": 1293 }, { "epoch": 0.5089478859390364, "grad_norm": 0.5611655116081238, "learning_rate": 2.85764112119113e-06, "loss": 0.0326, "step": 1294 }, { "epoch": 0.5093411996066863, "grad_norm": 0.6655589938163757, "learning_rate": 2.854241921799874e-06, "loss": 0.0608, "step": 1295 }, { "epoch": 0.5097345132743363, "grad_norm": 0.9743668437004089, "learning_rate": 2.850842053961119e-06, "loss": 0.0674, "step": 1296 }, { "epoch": 0.5101278269419862, "grad_norm": 0.3803253471851349, "learning_rate": 2.847441524090347e-06, "loss": 0.0318, "step": 1297 }, { "epoch": 0.5105211406096362, "grad_norm": 0.9651347398757935, "learning_rate": 2.844040338604291e-06, "loss": 0.0467, "step": 1298 }, { "epoch": 0.5109144542772861, "grad_norm": 1.3503124713897705, "learning_rate": 2.8406385039209217e-06, "loss": 0.0353, "step": 1299 }, { "epoch": 0.511307767944936, "grad_norm": 1.3085218667984009, "learning_rate": 2.837236026459432e-06, "loss": 0.0677, "step": 1300 }, { "epoch": 0.511701081612586, "grad_norm": 0.759332537651062, "learning_rate": 2.833832912640232e-06, "loss": 0.0399, "step": 1301 }, { "epoch": 0.512094395280236, "grad_norm": 1.254012107849121, "learning_rate": 2.8304291688849283e-06, "loss": 0.0469, "step": 1302 }, { "epoch": 0.512487708947886, "grad_norm": 1.6213202476501465, "learning_rate": 2.827024801616319e-06, "loss": 0.077, "step": 1303 }, { "epoch": 0.5128810226155359, "grad_norm": 0.751507580280304, "learning_rate": 2.8236198172583765e-06, "loss": 0.0499, "step": 1304 }, { "epoch": 0.5132743362831859, "grad_norm": 0.6438438296318054, "learning_rate": 2.820214222236241e-06, "loss": 0.0638, "step": 1305 }, { "epoch": 0.5136676499508358, "grad_norm": 0.8826209902763367, "learning_rate": 2.816808022976201e-06, "loss": 0.0422, "step": 1306 }, { "epoch": 0.5140609636184857, "grad_norm": 0.4389915466308594, "learning_rate": 2.813401225905688e-06, "loss": 0.0192, "step": 1307 }, { "epoch": 0.5144542772861357, "grad_norm": 0.7698509693145752, "learning_rate": 2.8099938374532615e-06, "loss": 0.043, "step": 1308 }, { "epoch": 0.5148475909537856, "grad_norm": 1.0304797887802124, "learning_rate": 2.806585864048594e-06, "loss": 0.0648, "step": 1309 }, { "epoch": 0.5152409046214356, "grad_norm": 0.9679722189903259, "learning_rate": 2.8031773121224665e-06, "loss": 0.0528, "step": 1310 }, { "epoch": 0.5156342182890855, "grad_norm": 0.8979973793029785, "learning_rate": 2.799768188106747e-06, "loss": 0.0493, "step": 1311 }, { "epoch": 0.5160275319567355, "grad_norm": 1.266461730003357, "learning_rate": 2.7963584984343856e-06, "loss": 0.0489, "step": 1312 }, { "epoch": 0.5164208456243854, "grad_norm": 1.1776021718978882, "learning_rate": 2.7929482495393995e-06, "loss": 0.0453, "step": 1313 }, { "epoch": 0.5168141592920354, "grad_norm": 0.89280104637146, "learning_rate": 2.7895374478568608e-06, "loss": 0.0506, "step": 1314 }, { "epoch": 0.5172074729596854, "grad_norm": 1.046673059463501, "learning_rate": 2.786126099822885e-06, "loss": 0.0812, "step": 1315 }, { "epoch": 0.5176007866273353, "grad_norm": 1.451196312904358, "learning_rate": 2.7827142118746187e-06, "loss": 0.0388, "step": 1316 }, { "epoch": 0.5179941002949853, "grad_norm": 0.9998504519462585, "learning_rate": 2.779301790450226e-06, "loss": 0.0505, "step": 1317 }, { "epoch": 0.5183874139626352, "grad_norm": 1.0535742044448853, "learning_rate": 2.7758888419888797e-06, "loss": 0.0377, "step": 1318 }, { "epoch": 0.5187807276302852, "grad_norm": 0.9973492622375488, "learning_rate": 2.7724753729307454e-06, "loss": 0.0512, "step": 1319 }, { "epoch": 0.5191740412979351, "grad_norm": 1.3732929229736328, "learning_rate": 2.769061389716971e-06, "loss": 0.0992, "step": 1320 }, { "epoch": 0.519567354965585, "grad_norm": 1.1079411506652832, "learning_rate": 2.765646898789677e-06, "loss": 0.0438, "step": 1321 }, { "epoch": 0.519960668633235, "grad_norm": 1.0692771673202515, "learning_rate": 2.762231906591939e-06, "loss": 0.0482, "step": 1322 }, { "epoch": 0.5203539823008849, "grad_norm": 0.773914098739624, "learning_rate": 2.75881641956778e-06, "loss": 0.0307, "step": 1323 }, { "epoch": 0.5207472959685349, "grad_norm": 0.8193982243537903, "learning_rate": 2.7554004441621562e-06, "loss": 0.0357, "step": 1324 }, { "epoch": 0.5211406096361848, "grad_norm": 1.0655934810638428, "learning_rate": 2.7519839868209462e-06, "loss": 0.0564, "step": 1325 }, { "epoch": 0.5215339233038349, "grad_norm": 0.668292760848999, "learning_rate": 2.748567053990937e-06, "loss": 0.0394, "step": 1326 }, { "epoch": 0.5219272369714848, "grad_norm": 1.5048760175704956, "learning_rate": 2.7451496521198144e-06, "loss": 0.0756, "step": 1327 }, { "epoch": 0.5223205506391347, "grad_norm": 1.869588017463684, "learning_rate": 2.741731787656146e-06, "loss": 0.08, "step": 1328 }, { "epoch": 0.5227138643067847, "grad_norm": 1.6091140508651733, "learning_rate": 2.7383134670493765e-06, "loss": 0.0618, "step": 1329 }, { "epoch": 0.5231071779744346, "grad_norm": 0.5614988207817078, "learning_rate": 2.734894696749808e-06, "loss": 0.022, "step": 1330 }, { "epoch": 0.5235004916420846, "grad_norm": 1.5846737623214722, "learning_rate": 2.7314754832085926e-06, "loss": 0.0617, "step": 1331 }, { "epoch": 0.5238938053097345, "grad_norm": 1.0142868757247925, "learning_rate": 2.728055832877719e-06, "loss": 0.1201, "step": 1332 }, { "epoch": 0.5242871189773844, "grad_norm": 0.9764862060546875, "learning_rate": 2.7246357522099996e-06, "loss": 0.0576, "step": 1333 }, { "epoch": 0.5246804326450344, "grad_norm": 0.7208642363548279, "learning_rate": 2.721215247659059e-06, "loss": 0.0165, "step": 1334 }, { "epoch": 0.5250737463126843, "grad_norm": 1.2766616344451904, "learning_rate": 2.7177943256793214e-06, "loss": 0.0589, "step": 1335 }, { "epoch": 0.5254670599803343, "grad_norm": 1.7238527536392212, "learning_rate": 2.7143729927259992e-06, "loss": 0.0415, "step": 1336 }, { "epoch": 0.5258603736479842, "grad_norm": 0.9424237608909607, "learning_rate": 2.7109512552550804e-06, "loss": 0.088, "step": 1337 }, { "epoch": 0.5262536873156343, "grad_norm": 0.8586751818656921, "learning_rate": 2.707529119723315e-06, "loss": 0.0621, "step": 1338 }, { "epoch": 0.5266470009832842, "grad_norm": 0.6910445690155029, "learning_rate": 2.7041065925882054e-06, "loss": 0.0473, "step": 1339 }, { "epoch": 0.5270403146509341, "grad_norm": 0.6774911880493164, "learning_rate": 2.7006836803079934e-06, "loss": 0.0401, "step": 1340 }, { "epoch": 0.5274336283185841, "grad_norm": 1.1810059547424316, "learning_rate": 2.697260389341645e-06, "loss": 0.0464, "step": 1341 }, { "epoch": 0.527826941986234, "grad_norm": 0.6813443303108215, "learning_rate": 2.693836726148844e-06, "loss": 0.0502, "step": 1342 }, { "epoch": 0.528220255653884, "grad_norm": 1.6458402872085571, "learning_rate": 2.6904126971899754e-06, "loss": 0.0644, "step": 1343 }, { "epoch": 0.5286135693215339, "grad_norm": 1.4540367126464844, "learning_rate": 2.686988308926112e-06, "loss": 0.0564, "step": 1344 }, { "epoch": 0.5290068829891839, "grad_norm": 0.6865090131759644, "learning_rate": 2.68356356781901e-06, "loss": 0.0448, "step": 1345 }, { "epoch": 0.5294001966568338, "grad_norm": 1.91966712474823, "learning_rate": 2.6801384803310855e-06, "loss": 0.0431, "step": 1346 }, { "epoch": 0.5297935103244837, "grad_norm": 0.6628435254096985, "learning_rate": 2.676713052925411e-06, "loss": 0.0513, "step": 1347 }, { "epoch": 0.5301868239921337, "grad_norm": 1.0600309371948242, "learning_rate": 2.6732872920657018e-06, "loss": 0.0321, "step": 1348 }, { "epoch": 0.5305801376597836, "grad_norm": 0.5295042991638184, "learning_rate": 2.6698612042162995e-06, "loss": 0.0299, "step": 1349 }, { "epoch": 0.5309734513274337, "grad_norm": 1.229316234588623, "learning_rate": 2.6664347958421647e-06, "loss": 0.0475, "step": 1350 }, { "epoch": 0.5313667649950836, "grad_norm": 0.8785441517829895, "learning_rate": 2.6630080734088625e-06, "loss": 0.0424, "step": 1351 }, { "epoch": 0.5317600786627336, "grad_norm": 1.3285952806472778, "learning_rate": 2.6595810433825496e-06, "loss": 0.0359, "step": 1352 }, { "epoch": 0.5321533923303835, "grad_norm": 0.8368435502052307, "learning_rate": 2.6561537122299647e-06, "loss": 0.0503, "step": 1353 }, { "epoch": 0.5325467059980334, "grad_norm": 0.790544331073761, "learning_rate": 2.6527260864184135e-06, "loss": 0.0321, "step": 1354 }, { "epoch": 0.5329400196656834, "grad_norm": 1.5722286701202393, "learning_rate": 2.6492981724157576e-06, "loss": 0.0765, "step": 1355 }, { "epoch": 0.5333333333333333, "grad_norm": 1.0913268327713013, "learning_rate": 2.6458699766904033e-06, "loss": 0.0526, "step": 1356 }, { "epoch": 0.5337266470009833, "grad_norm": 1.2754257917404175, "learning_rate": 2.6424415057112883e-06, "loss": 0.0585, "step": 1357 }, { "epoch": 0.5341199606686332, "grad_norm": 2.0785610675811768, "learning_rate": 2.6390127659478698e-06, "loss": 0.0995, "step": 1358 }, { "epoch": 0.5345132743362832, "grad_norm": 1.3484556674957275, "learning_rate": 2.6355837638701115e-06, "loss": 0.0462, "step": 1359 }, { "epoch": 0.5349065880039331, "grad_norm": 0.7563539147377014, "learning_rate": 2.632154505948472e-06, "loss": 0.0614, "step": 1360 }, { "epoch": 0.535299901671583, "grad_norm": 0.7201266288757324, "learning_rate": 2.6287249986538944e-06, "loss": 0.0449, "step": 1361 }, { "epoch": 0.5356932153392331, "grad_norm": 1.439516544342041, "learning_rate": 2.62529524845779e-06, "loss": 0.0694, "step": 1362 }, { "epoch": 0.536086529006883, "grad_norm": 0.6716679334640503, "learning_rate": 2.6218652618320306e-06, "loss": 0.0302, "step": 1363 }, { "epoch": 0.536479842674533, "grad_norm": 1.9574276208877563, "learning_rate": 2.6184350452489317e-06, "loss": 0.0708, "step": 1364 }, { "epoch": 0.5368731563421829, "grad_norm": 1.3900701999664307, "learning_rate": 2.615004605181246e-06, "loss": 0.0833, "step": 1365 }, { "epoch": 0.5372664700098329, "grad_norm": 0.9019057154655457, "learning_rate": 2.611573948102144e-06, "loss": 0.0625, "step": 1366 }, { "epoch": 0.5376597836774828, "grad_norm": 2.0217947959899902, "learning_rate": 2.6081430804852093e-06, "loss": 0.0837, "step": 1367 }, { "epoch": 0.5380530973451327, "grad_norm": 1.5341334342956543, "learning_rate": 2.604712008804421e-06, "loss": 0.0734, "step": 1368 }, { "epoch": 0.5384464110127827, "grad_norm": 1.3491941690444946, "learning_rate": 2.601280739534143e-06, "loss": 0.0631, "step": 1369 }, { "epoch": 0.5388397246804326, "grad_norm": 1.264906406402588, "learning_rate": 2.5978492791491126e-06, "loss": 0.0361, "step": 1370 }, { "epoch": 0.5392330383480826, "grad_norm": 1.567254900932312, "learning_rate": 2.594417634124428e-06, "loss": 0.0802, "step": 1371 }, { "epoch": 0.5396263520157325, "grad_norm": 0.912144124507904, "learning_rate": 2.590985810935535e-06, "loss": 0.0321, "step": 1372 }, { "epoch": 0.5400196656833824, "grad_norm": 0.7098456025123596, "learning_rate": 2.5875538160582176e-06, "loss": 0.0625, "step": 1373 }, { "epoch": 0.5404129793510325, "grad_norm": 1.4193458557128906, "learning_rate": 2.58412165596858e-06, "loss": 0.0518, "step": 1374 }, { "epoch": 0.5408062930186824, "grad_norm": 1.3003660440444946, "learning_rate": 2.5806893371430413e-06, "loss": 0.0625, "step": 1375 }, { "epoch": 0.5411996066863324, "grad_norm": 1.4275062084197998, "learning_rate": 2.57725686605832e-06, "loss": 0.0628, "step": 1376 }, { "epoch": 0.5415929203539823, "grad_norm": 1.3604398965835571, "learning_rate": 2.5738242491914206e-06, "loss": 0.0733, "step": 1377 }, { "epoch": 0.5419862340216323, "grad_norm": 2.859689235687256, "learning_rate": 2.5703914930196227e-06, "loss": 0.0547, "step": 1378 }, { "epoch": 0.5423795476892822, "grad_norm": 0.770262598991394, "learning_rate": 2.5669586040204697e-06, "loss": 0.0644, "step": 1379 }, { "epoch": 0.5427728613569321, "grad_norm": 0.7974931001663208, "learning_rate": 2.5635255886717553e-06, "loss": 0.0687, "step": 1380 }, { "epoch": 0.5431661750245821, "grad_norm": 0.9779230356216431, "learning_rate": 2.560092453451512e-06, "loss": 0.0586, "step": 1381 }, { "epoch": 0.543559488692232, "grad_norm": 2.3653101921081543, "learning_rate": 2.5566592048379975e-06, "loss": 0.0697, "step": 1382 }, { "epoch": 0.543952802359882, "grad_norm": 1.6566016674041748, "learning_rate": 2.553225849309684e-06, "loss": 0.104, "step": 1383 }, { "epoch": 0.5443461160275319, "grad_norm": 1.516684889793396, "learning_rate": 2.5497923933452464e-06, "loss": 0.0423, "step": 1384 }, { "epoch": 0.5447394296951819, "grad_norm": 1.3681788444519043, "learning_rate": 2.5463588434235463e-06, "loss": 0.052, "step": 1385 }, { "epoch": 0.5451327433628319, "grad_norm": 0.49628522992134094, "learning_rate": 2.542925206023626e-06, "loss": 0.0255, "step": 1386 }, { "epoch": 0.5455260570304818, "grad_norm": 0.9334824681282043, "learning_rate": 2.5394914876246916e-06, "loss": 0.0517, "step": 1387 }, { "epoch": 0.5459193706981318, "grad_norm": 1.3869428634643555, "learning_rate": 2.5360576947061004e-06, "loss": 0.051, "step": 1388 }, { "epoch": 0.5463126843657817, "grad_norm": 0.7261596918106079, "learning_rate": 2.5326238337473537e-06, "loss": 0.0349, "step": 1389 }, { "epoch": 0.5467059980334317, "grad_norm": 1.0270626544952393, "learning_rate": 2.5291899112280765e-06, "loss": 0.0574, "step": 1390 }, { "epoch": 0.5470993117010816, "grad_norm": 0.9097653031349182, "learning_rate": 2.5257559336280145e-06, "loss": 0.0434, "step": 1391 }, { "epoch": 0.5474926253687316, "grad_norm": 1.5684995651245117, "learning_rate": 2.522321907427016e-06, "loss": 0.0394, "step": 1392 }, { "epoch": 0.5478859390363815, "grad_norm": 0.5134732723236084, "learning_rate": 2.5188878391050187e-06, "loss": 0.0642, "step": 1393 }, { "epoch": 0.5482792527040314, "grad_norm": 1.6495331525802612, "learning_rate": 2.515453735142043e-06, "loss": 0.0335, "step": 1394 }, { "epoch": 0.5486725663716814, "grad_norm": 0.949030876159668, "learning_rate": 2.5120196020181752e-06, "loss": 0.069, "step": 1395 }, { "epoch": 0.5490658800393313, "grad_norm": 0.5853769183158875, "learning_rate": 2.5085854462135556e-06, "loss": 0.035, "step": 1396 }, { "epoch": 0.5494591937069813, "grad_norm": 1.0677484273910522, "learning_rate": 2.505151274208369e-06, "loss": 0.0511, "step": 1397 }, { "epoch": 0.5498525073746313, "grad_norm": 1.5644643306732178, "learning_rate": 2.50171709248283e-06, "loss": 0.0814, "step": 1398 }, { "epoch": 0.5502458210422813, "grad_norm": 0.736179769039154, "learning_rate": 2.4982829075171714e-06, "loss": 0.0452, "step": 1399 }, { "epoch": 0.5506391347099312, "grad_norm": 0.8911694288253784, "learning_rate": 2.494848725791632e-06, "loss": 0.0564, "step": 1400 }, { "epoch": 0.5510324483775811, "grad_norm": 1.9409581422805786, "learning_rate": 2.4914145537864453e-06, "loss": 0.0724, "step": 1401 }, { "epoch": 0.5514257620452311, "grad_norm": 1.1989744901657104, "learning_rate": 2.4879803979818256e-06, "loss": 0.0496, "step": 1402 }, { "epoch": 0.551819075712881, "grad_norm": 1.8545705080032349, "learning_rate": 2.4845462648579573e-06, "loss": 0.0527, "step": 1403 }, { "epoch": 0.552212389380531, "grad_norm": 1.8136131763458252, "learning_rate": 2.481112160894982e-06, "loss": 0.0601, "step": 1404 }, { "epoch": 0.5526057030481809, "grad_norm": 1.070971131324768, "learning_rate": 2.4776780925729853e-06, "loss": 0.0612, "step": 1405 }, { "epoch": 0.5529990167158308, "grad_norm": 1.127616047859192, "learning_rate": 2.474244066371986e-06, "loss": 0.0503, "step": 1406 }, { "epoch": 0.5533923303834808, "grad_norm": 1.5506644248962402, "learning_rate": 2.4708100887719243e-06, "loss": 0.0638, "step": 1407 }, { "epoch": 0.5537856440511307, "grad_norm": 1.5224863290786743, "learning_rate": 2.4673761662526475e-06, "loss": 0.0521, "step": 1408 }, { "epoch": 0.5541789577187807, "grad_norm": 1.2066714763641357, "learning_rate": 2.4639423052938995e-06, "loss": 0.0533, "step": 1409 }, { "epoch": 0.5545722713864307, "grad_norm": 1.389074683189392, "learning_rate": 2.4605085123753097e-06, "loss": 0.0809, "step": 1410 }, { "epoch": 0.5549655850540807, "grad_norm": 0.6731852293014526, "learning_rate": 2.4570747939763745e-06, "loss": 0.0249, "step": 1411 }, { "epoch": 0.5553588987217306, "grad_norm": 1.2953534126281738, "learning_rate": 2.453641156576454e-06, "loss": 0.0473, "step": 1412 }, { "epoch": 0.5557522123893806, "grad_norm": 0.9251944422721863, "learning_rate": 2.4502076066547545e-06, "loss": 0.0765, "step": 1413 }, { "epoch": 0.5561455260570305, "grad_norm": 1.831679344177246, "learning_rate": 2.4467741506903162e-06, "loss": 0.0798, "step": 1414 }, { "epoch": 0.5565388397246804, "grad_norm": 1.2218101024627686, "learning_rate": 2.443340795162003e-06, "loss": 0.0393, "step": 1415 }, { "epoch": 0.5569321533923304, "grad_norm": 1.164400577545166, "learning_rate": 2.4399075465484883e-06, "loss": 0.0681, "step": 1416 }, { "epoch": 0.5573254670599803, "grad_norm": 1.0514402389526367, "learning_rate": 2.4364744113282455e-06, "loss": 0.0593, "step": 1417 }, { "epoch": 0.5577187807276303, "grad_norm": 1.9647271633148193, "learning_rate": 2.433041395979531e-06, "loss": 0.0785, "step": 1418 }, { "epoch": 0.5581120943952802, "grad_norm": 0.7550022006034851, "learning_rate": 2.429608506980378e-06, "loss": 0.0443, "step": 1419 }, { "epoch": 0.5585054080629301, "grad_norm": 1.2886439561843872, "learning_rate": 2.4261757508085803e-06, "loss": 0.0625, "step": 1420 }, { "epoch": 0.5588987217305801, "grad_norm": 0.6531363129615784, "learning_rate": 2.422743133941681e-06, "loss": 0.0437, "step": 1421 }, { "epoch": 0.5592920353982301, "grad_norm": 1.3166404962539673, "learning_rate": 2.419310662856959e-06, "loss": 0.0363, "step": 1422 }, { "epoch": 0.5596853490658801, "grad_norm": 0.9738766551017761, "learning_rate": 2.415878344031421e-06, "loss": 0.0499, "step": 1423 }, { "epoch": 0.56007866273353, "grad_norm": 1.1199309825897217, "learning_rate": 2.4124461839417832e-06, "loss": 0.0638, "step": 1424 }, { "epoch": 0.56047197640118, "grad_norm": 0.7884669303894043, "learning_rate": 2.4090141890644654e-06, "loss": 0.0219, "step": 1425 }, { "epoch": 0.5608652900688299, "grad_norm": 1.508720874786377, "learning_rate": 2.405582365875573e-06, "loss": 0.0722, "step": 1426 }, { "epoch": 0.5612586037364798, "grad_norm": 0.9353559017181396, "learning_rate": 2.4021507208508882e-06, "loss": 0.0654, "step": 1427 }, { "epoch": 0.5616519174041298, "grad_norm": 1.9918673038482666, "learning_rate": 2.398719260465858e-06, "loss": 0.0741, "step": 1428 }, { "epoch": 0.5620452310717797, "grad_norm": 0.9243260622024536, "learning_rate": 2.3952879911955794e-06, "loss": 0.0369, "step": 1429 }, { "epoch": 0.5624385447394297, "grad_norm": 1.3456679582595825, "learning_rate": 2.391856919514791e-06, "loss": 0.0811, "step": 1430 }, { "epoch": 0.5628318584070796, "grad_norm": 1.5919969081878662, "learning_rate": 2.3884260518978562e-06, "loss": 0.0402, "step": 1431 }, { "epoch": 0.5632251720747296, "grad_norm": 0.5894349813461304, "learning_rate": 2.3849953948187552e-06, "loss": 0.0396, "step": 1432 }, { "epoch": 0.5636184857423795, "grad_norm": 1.708106517791748, "learning_rate": 2.3815649547510687e-06, "loss": 0.0575, "step": 1433 }, { "epoch": 0.5640117994100295, "grad_norm": 1.6241428852081299, "learning_rate": 2.37813473816797e-06, "loss": 0.047, "step": 1434 }, { "epoch": 0.5644051130776795, "grad_norm": 1.1760050058364868, "learning_rate": 2.3747047515422102e-06, "loss": 0.049, "step": 1435 }, { "epoch": 0.5647984267453294, "grad_norm": 0.6579201221466064, "learning_rate": 2.371275001346106e-06, "loss": 0.0569, "step": 1436 }, { "epoch": 0.5651917404129794, "grad_norm": 0.5577812194824219, "learning_rate": 2.367845494051529e-06, "loss": 0.0338, "step": 1437 }, { "epoch": 0.5655850540806293, "grad_norm": 0.9575706124305725, "learning_rate": 2.3644162361298897e-06, "loss": 0.0622, "step": 1438 }, { "epoch": 0.5659783677482793, "grad_norm": 0.6951814889907837, "learning_rate": 2.360987234052131e-06, "loss": 0.0329, "step": 1439 }, { "epoch": 0.5663716814159292, "grad_norm": 1.079609990119934, "learning_rate": 2.357558494288712e-06, "loss": 0.0672, "step": 1440 }, { "epoch": 0.5667649950835791, "grad_norm": 1.0509586334228516, "learning_rate": 2.354130023309597e-06, "loss": 0.0755, "step": 1441 }, { "epoch": 0.5671583087512291, "grad_norm": 0.9782833456993103, "learning_rate": 2.350701827584243e-06, "loss": 0.0319, "step": 1442 }, { "epoch": 0.567551622418879, "grad_norm": 1.019370675086975, "learning_rate": 2.3472739135815877e-06, "loss": 0.0696, "step": 1443 }, { "epoch": 0.567944936086529, "grad_norm": 1.419137716293335, "learning_rate": 2.343846287770036e-06, "loss": 0.0797, "step": 1444 }, { "epoch": 0.5683382497541789, "grad_norm": 1.8223907947540283, "learning_rate": 2.340418956617451e-06, "loss": 0.0462, "step": 1445 }, { "epoch": 0.568731563421829, "grad_norm": 1.1286693811416626, "learning_rate": 2.336991926591138e-06, "loss": 0.0735, "step": 1446 }, { "epoch": 0.5691248770894789, "grad_norm": 1.7998546361923218, "learning_rate": 2.3335652041578352e-06, "loss": 0.0964, "step": 1447 }, { "epoch": 0.5695181907571288, "grad_norm": 1.0016109943389893, "learning_rate": 2.3301387957837017e-06, "loss": 0.0631, "step": 1448 }, { "epoch": 0.5699115044247788, "grad_norm": 1.876328706741333, "learning_rate": 2.326712707934299e-06, "loss": 0.0683, "step": 1449 }, { "epoch": 0.5703048180924287, "grad_norm": 1.8099371194839478, "learning_rate": 2.3232869470745893e-06, "loss": 0.058, "step": 1450 }, { "epoch": 0.5706981317600787, "grad_norm": 0.8637019395828247, "learning_rate": 2.3198615196689153e-06, "loss": 0.0655, "step": 1451 }, { "epoch": 0.5710914454277286, "grad_norm": 2.1426312923431396, "learning_rate": 2.3164364321809906e-06, "loss": 0.0572, "step": 1452 }, { "epoch": 0.5714847590953785, "grad_norm": 1.6157870292663574, "learning_rate": 2.3130116910738874e-06, "loss": 0.0321, "step": 1453 }, { "epoch": 0.5718780727630285, "grad_norm": 0.8953425288200378, "learning_rate": 2.309587302810026e-06, "loss": 0.0292, "step": 1454 }, { "epoch": 0.5722713864306784, "grad_norm": 0.8132373094558716, "learning_rate": 2.306163273851157e-06, "loss": 0.0517, "step": 1455 }, { "epoch": 0.5726647000983284, "grad_norm": 0.8843181729316711, "learning_rate": 2.302739610658356e-06, "loss": 0.0389, "step": 1456 }, { "epoch": 0.5730580137659783, "grad_norm": 1.1060006618499756, "learning_rate": 2.2993163196920075e-06, "loss": 0.08, "step": 1457 }, { "epoch": 0.5734513274336284, "grad_norm": 1.1257623434066772, "learning_rate": 2.295893407411795e-06, "loss": 0.053, "step": 1458 }, { "epoch": 0.5738446411012783, "grad_norm": 1.0160799026489258, "learning_rate": 2.2924708802766857e-06, "loss": 0.0439, "step": 1459 }, { "epoch": 0.5742379547689282, "grad_norm": 1.231930136680603, "learning_rate": 2.2890487447449204e-06, "loss": 0.0569, "step": 1460 }, { "epoch": 0.5746312684365782, "grad_norm": 0.8130099177360535, "learning_rate": 2.285627007274001e-06, "loss": 0.0361, "step": 1461 }, { "epoch": 0.5750245821042281, "grad_norm": 0.6949229836463928, "learning_rate": 2.282205674320679e-06, "loss": 0.0598, "step": 1462 }, { "epoch": 0.5754178957718781, "grad_norm": 1.0386853218078613, "learning_rate": 2.2787847523409416e-06, "loss": 0.0601, "step": 1463 }, { "epoch": 0.575811209439528, "grad_norm": 0.48775455355644226, "learning_rate": 2.2753642477900012e-06, "loss": 0.0483, "step": 1464 }, { "epoch": 0.576204523107178, "grad_norm": 1.220493197441101, "learning_rate": 2.2719441671222815e-06, "loss": 0.0398, "step": 1465 }, { "epoch": 0.5765978367748279, "grad_norm": 0.747078537940979, "learning_rate": 2.268524516791408e-06, "loss": 0.0313, "step": 1466 }, { "epoch": 0.5769911504424778, "grad_norm": 0.7773571014404297, "learning_rate": 2.2651053032501928e-06, "loss": 0.0395, "step": 1467 }, { "epoch": 0.5773844641101278, "grad_norm": 0.4083022177219391, "learning_rate": 2.261686532950624e-06, "loss": 0.0255, "step": 1468 }, { "epoch": 0.5777777777777777, "grad_norm": 1.0136034488677979, "learning_rate": 2.2582682123438547e-06, "loss": 0.0499, "step": 1469 }, { "epoch": 0.5781710914454278, "grad_norm": 1.2290290594100952, "learning_rate": 2.254850347880187e-06, "loss": 0.0649, "step": 1470 }, { "epoch": 0.5785644051130777, "grad_norm": 1.4913883209228516, "learning_rate": 2.2514329460090633e-06, "loss": 0.0595, "step": 1471 }, { "epoch": 0.5789577187807277, "grad_norm": 1.210160732269287, "learning_rate": 2.248016013179054e-06, "loss": 0.0433, "step": 1472 }, { "epoch": 0.5793510324483776, "grad_norm": 0.757161557674408, "learning_rate": 2.244599555837844e-06, "loss": 0.035, "step": 1473 }, { "epoch": 0.5797443461160275, "grad_norm": 1.0250403881072998, "learning_rate": 2.2411835804322206e-06, "loss": 0.0375, "step": 1474 }, { "epoch": 0.5801376597836775, "grad_norm": 1.1955897808074951, "learning_rate": 2.2377680934080625e-06, "loss": 0.0449, "step": 1475 }, { "epoch": 0.5805309734513274, "grad_norm": 1.7066453695297241, "learning_rate": 2.2343531012103244e-06, "loss": 0.0722, "step": 1476 }, { "epoch": 0.5809242871189774, "grad_norm": 0.6709203720092773, "learning_rate": 2.2309386102830295e-06, "loss": 0.0354, "step": 1477 }, { "epoch": 0.5813176007866273, "grad_norm": 0.9403322339057922, "learning_rate": 2.227524627069256e-06, "loss": 0.039, "step": 1478 }, { "epoch": 0.5817109144542773, "grad_norm": 1.1907342672348022, "learning_rate": 2.2241111580111207e-06, "loss": 0.0894, "step": 1479 }, { "epoch": 0.5821042281219272, "grad_norm": 0.9678034782409668, "learning_rate": 2.220698209549774e-06, "loss": 0.0492, "step": 1480 }, { "epoch": 0.5824975417895771, "grad_norm": 0.5867919325828552, "learning_rate": 2.2172857881253825e-06, "loss": 0.0329, "step": 1481 }, { "epoch": 0.5828908554572272, "grad_norm": 0.9085230827331543, "learning_rate": 2.2138739001771157e-06, "loss": 0.0501, "step": 1482 }, { "epoch": 0.5832841691248771, "grad_norm": 1.015177845954895, "learning_rate": 2.2104625521431396e-06, "loss": 0.0297, "step": 1483 }, { "epoch": 0.5836774827925271, "grad_norm": 0.48682698607444763, "learning_rate": 2.207051750460601e-06, "loss": 0.0329, "step": 1484 }, { "epoch": 0.584070796460177, "grad_norm": 1.861662745475769, "learning_rate": 2.2036415015656148e-06, "loss": 0.0619, "step": 1485 }, { "epoch": 0.584464110127827, "grad_norm": 0.9373002648353577, "learning_rate": 2.2002318118932543e-06, "loss": 0.0563, "step": 1486 }, { "epoch": 0.5848574237954769, "grad_norm": 0.4820902943611145, "learning_rate": 2.1968226878775347e-06, "loss": 0.0206, "step": 1487 }, { "epoch": 0.5852507374631268, "grad_norm": 0.6255022287368774, "learning_rate": 2.1934141359514062e-06, "loss": 0.0319, "step": 1488 }, { "epoch": 0.5856440511307768, "grad_norm": 0.8468760848045349, "learning_rate": 2.1900061625467393e-06, "loss": 0.0574, "step": 1489 }, { "epoch": 0.5860373647984267, "grad_norm": 0.519826352596283, "learning_rate": 2.1865987740943116e-06, "loss": 0.0595, "step": 1490 }, { "epoch": 0.5864306784660767, "grad_norm": 1.6838140487670898, "learning_rate": 2.183191977023799e-06, "loss": 0.0549, "step": 1491 }, { "epoch": 0.5868239921337266, "grad_norm": 1.3588017225265503, "learning_rate": 2.17978577776376e-06, "loss": 0.058, "step": 1492 }, { "epoch": 0.5872173058013765, "grad_norm": 0.9913402199745178, "learning_rate": 2.176380182741624e-06, "loss": 0.021, "step": 1493 }, { "epoch": 0.5876106194690266, "grad_norm": 1.7032448053359985, "learning_rate": 2.172975198383682e-06, "loss": 0.0565, "step": 1494 }, { "epoch": 0.5880039331366765, "grad_norm": 0.9853689670562744, "learning_rate": 2.169570831115072e-06, "loss": 0.0532, "step": 1495 }, { "epoch": 0.5883972468043265, "grad_norm": 1.061571717262268, "learning_rate": 2.1661670873597686e-06, "loss": 0.042, "step": 1496 }, { "epoch": 0.5887905604719764, "grad_norm": 1.0780665874481201, "learning_rate": 2.1627639735405683e-06, "loss": 0.0412, "step": 1497 }, { "epoch": 0.5891838741396264, "grad_norm": 1.1072509288787842, "learning_rate": 2.1593614960790795e-06, "loss": 0.0369, "step": 1498 }, { "epoch": 0.5895771878072763, "grad_norm": 0.9231078028678894, "learning_rate": 2.15595966139571e-06, "loss": 0.0388, "step": 1499 }, { "epoch": 0.5899705014749262, "grad_norm": 0.8702555894851685, "learning_rate": 2.152558475909654e-06, "loss": 0.0719, "step": 1500 }, { "epoch": 0.5903638151425762, "grad_norm": 0.910358726978302, "learning_rate": 2.149157946038882e-06, "loss": 0.0468, "step": 1501 }, { "epoch": 0.5907571288102261, "grad_norm": 1.3807059526443481, "learning_rate": 2.145758078200126e-06, "loss": 0.0729, "step": 1502 }, { "epoch": 0.5911504424778761, "grad_norm": 0.9765854477882385, "learning_rate": 2.1423588788088704e-06, "loss": 0.0407, "step": 1503 }, { "epoch": 0.591543756145526, "grad_norm": 1.021924376487732, "learning_rate": 2.1389603542793364e-06, "loss": 0.0342, "step": 1504 }, { "epoch": 0.591937069813176, "grad_norm": 1.098352313041687, "learning_rate": 2.1355625110244725e-06, "loss": 0.0668, "step": 1505 }, { "epoch": 0.592330383480826, "grad_norm": 1.5986775159835815, "learning_rate": 2.1321653554559425e-06, "loss": 0.0673, "step": 1506 }, { "epoch": 0.592723697148476, "grad_norm": 1.2270184755325317, "learning_rate": 2.1287688939841104e-06, "loss": 0.0405, "step": 1507 }, { "epoch": 0.5931170108161259, "grad_norm": 0.6227984428405762, "learning_rate": 2.125373133018033e-06, "loss": 0.0362, "step": 1508 }, { "epoch": 0.5935103244837758, "grad_norm": 1.1838734149932861, "learning_rate": 2.1219780789654436e-06, "loss": 0.0705, "step": 1509 }, { "epoch": 0.5939036381514258, "grad_norm": 1.5811330080032349, "learning_rate": 2.1185837382327422e-06, "loss": 0.0811, "step": 1510 }, { "epoch": 0.5942969518190757, "grad_norm": 1.6723252534866333, "learning_rate": 2.1151901172249823e-06, "loss": 0.0711, "step": 1511 }, { "epoch": 0.5946902654867257, "grad_norm": 1.1075739860534668, "learning_rate": 2.1117972223458598e-06, "loss": 0.0365, "step": 1512 }, { "epoch": 0.5950835791543756, "grad_norm": 1.0250906944274902, "learning_rate": 2.108405059997701e-06, "loss": 0.0534, "step": 1513 }, { "epoch": 0.5954768928220255, "grad_norm": 1.4097585678100586, "learning_rate": 2.1050136365814484e-06, "loss": 0.0633, "step": 1514 }, { "epoch": 0.5958702064896755, "grad_norm": 1.0003234148025513, "learning_rate": 2.10162295849665e-06, "loss": 0.0331, "step": 1515 }, { "epoch": 0.5962635201573254, "grad_norm": 1.203927755355835, "learning_rate": 2.0982330321414495e-06, "loss": 0.0397, "step": 1516 }, { "epoch": 0.5966568338249754, "grad_norm": 1.1078671216964722, "learning_rate": 2.094843863912571e-06, "loss": 0.061, "step": 1517 }, { "epoch": 0.5970501474926254, "grad_norm": 0.9437456130981445, "learning_rate": 2.0914554602053072e-06, "loss": 0.0549, "step": 1518 }, { "epoch": 0.5974434611602754, "grad_norm": 0.34665971994400024, "learning_rate": 2.0880678274135103e-06, "loss": 0.0374, "step": 1519 }, { "epoch": 0.5978367748279253, "grad_norm": 1.6303670406341553, "learning_rate": 2.084680971929574e-06, "loss": 0.0729, "step": 1520 }, { "epoch": 0.5982300884955752, "grad_norm": 1.1011961698532104, "learning_rate": 2.0812949001444293e-06, "loss": 0.0399, "step": 1521 }, { "epoch": 0.5986234021632252, "grad_norm": 0.8066303730010986, "learning_rate": 2.077909618447526e-06, "loss": 0.05, "step": 1522 }, { "epoch": 0.5990167158308751, "grad_norm": 1.4448401927947998, "learning_rate": 2.0745251332268238e-06, "loss": 0.0616, "step": 1523 }, { "epoch": 0.5994100294985251, "grad_norm": 0.49370574951171875, "learning_rate": 2.07114145086878e-06, "loss": 0.0496, "step": 1524 }, { "epoch": 0.599803343166175, "grad_norm": 1.0275585651397705, "learning_rate": 2.0677585777583366e-06, "loss": 0.038, "step": 1525 }, { "epoch": 0.600196656833825, "grad_norm": 1.1347780227661133, "learning_rate": 2.0643765202789064e-06, "loss": 0.0324, "step": 1526 }, { "epoch": 0.6005899705014749, "grad_norm": 1.2602198123931885, "learning_rate": 2.060995284812366e-06, "loss": 0.0699, "step": 1527 }, { "epoch": 0.6009832841691248, "grad_norm": 1.4369268417358398, "learning_rate": 2.0576148777390397e-06, "loss": 0.0664, "step": 1528 }, { "epoch": 0.6013765978367748, "grad_norm": 1.8620692491531372, "learning_rate": 2.0542353054376893e-06, "loss": 0.0566, "step": 1529 }, { "epoch": 0.6017699115044248, "grad_norm": 1.026005506515503, "learning_rate": 2.0508565742855017e-06, "loss": 0.023, "step": 1530 }, { "epoch": 0.6021632251720748, "grad_norm": 0.8947687149047852, "learning_rate": 2.0474786906580733e-06, "loss": 0.0573, "step": 1531 }, { "epoch": 0.6025565388397247, "grad_norm": 1.1179437637329102, "learning_rate": 2.044101660929405e-06, "loss": 0.0551, "step": 1532 }, { "epoch": 0.6029498525073747, "grad_norm": 0.6822925806045532, "learning_rate": 2.040725491471885e-06, "loss": 0.0393, "step": 1533 }, { "epoch": 0.6033431661750246, "grad_norm": 1.8381119966506958, "learning_rate": 2.037350188656279e-06, "loss": 0.0502, "step": 1534 }, { "epoch": 0.6037364798426745, "grad_norm": 1.5118048191070557, "learning_rate": 2.0339757588517165e-06, "loss": 0.0403, "step": 1535 }, { "epoch": 0.6041297935103245, "grad_norm": 1.0197237730026245, "learning_rate": 2.0306022084256786e-06, "loss": 0.0651, "step": 1536 }, { "epoch": 0.6045231071779744, "grad_norm": 2.17777943611145, "learning_rate": 2.027229543743989e-06, "loss": 0.069, "step": 1537 }, { "epoch": 0.6049164208456244, "grad_norm": 1.1577013731002808, "learning_rate": 2.0238577711707987e-06, "loss": 0.0615, "step": 1538 }, { "epoch": 0.6053097345132743, "grad_norm": 1.1709601879119873, "learning_rate": 2.0204868970685764e-06, "loss": 0.0548, "step": 1539 }, { "epoch": 0.6057030481809242, "grad_norm": 0.8054937124252319, "learning_rate": 2.0171169277980954e-06, "loss": 0.0479, "step": 1540 }, { "epoch": 0.6060963618485742, "grad_norm": 0.9096735715866089, "learning_rate": 2.0137478697184205e-06, "loss": 0.0655, "step": 1541 }, { "epoch": 0.6064896755162242, "grad_norm": 0.9453304409980774, "learning_rate": 2.0103797291868977e-06, "loss": 0.0812, "step": 1542 }, { "epoch": 0.6068829891838742, "grad_norm": 0.8558923602104187, "learning_rate": 2.0070125125591414e-06, "loss": 0.0468, "step": 1543 }, { "epoch": 0.6072763028515241, "grad_norm": 1.2030149698257446, "learning_rate": 2.0036462261890225e-06, "loss": 0.0542, "step": 1544 }, { "epoch": 0.6076696165191741, "grad_norm": 0.9261341691017151, "learning_rate": 2.0002808764286573e-06, "loss": 0.0706, "step": 1545 }, { "epoch": 0.608062930186824, "grad_norm": 0.7496268153190613, "learning_rate": 1.9969164696283945e-06, "loss": 0.0298, "step": 1546 }, { "epoch": 0.6084562438544739, "grad_norm": 1.2815377712249756, "learning_rate": 1.9935530121368023e-06, "loss": 0.0555, "step": 1547 }, { "epoch": 0.6088495575221239, "grad_norm": 0.964885413646698, "learning_rate": 1.990190510300659e-06, "loss": 0.0211, "step": 1548 }, { "epoch": 0.6092428711897738, "grad_norm": 0.8117434978485107, "learning_rate": 1.986828970464939e-06, "loss": 0.0417, "step": 1549 }, { "epoch": 0.6096361848574238, "grad_norm": 0.4136671721935272, "learning_rate": 1.983468398972802e-06, "loss": 0.0177, "step": 1550 }, { "epoch": 0.6100294985250737, "grad_norm": 0.8469100594520569, "learning_rate": 1.980108802165579e-06, "loss": 0.0375, "step": 1551 }, { "epoch": 0.6104228121927237, "grad_norm": 0.8030047416687012, "learning_rate": 1.976750186382764e-06, "loss": 0.0237, "step": 1552 }, { "epoch": 0.6108161258603736, "grad_norm": 1.6747819185256958, "learning_rate": 1.9733925579619965e-06, "loss": 0.072, "step": 1553 }, { "epoch": 0.6112094395280236, "grad_norm": 0.8288264870643616, "learning_rate": 1.970035923239056e-06, "loss": 0.0347, "step": 1554 }, { "epoch": 0.6116027531956736, "grad_norm": 0.8544471859931946, "learning_rate": 1.9666802885478463e-06, "loss": 0.0445, "step": 1555 }, { "epoch": 0.6119960668633235, "grad_norm": 0.8386610150337219, "learning_rate": 1.963325660220384e-06, "loss": 0.0609, "step": 1556 }, { "epoch": 0.6123893805309735, "grad_norm": 1.3670865297317505, "learning_rate": 1.9599720445867856e-06, "loss": 0.0601, "step": 1557 }, { "epoch": 0.6127826941986234, "grad_norm": 1.0806509256362915, "learning_rate": 1.956619447975257e-06, "loss": 0.058, "step": 1558 }, { "epoch": 0.6131760078662734, "grad_norm": 0.9588520526885986, "learning_rate": 1.9532678767120827e-06, "loss": 0.0422, "step": 1559 }, { "epoch": 0.6135693215339233, "grad_norm": 1.370969295501709, "learning_rate": 1.9499173371216105e-06, "loss": 0.0646, "step": 1560 }, { "epoch": 0.6139626352015732, "grad_norm": 1.074244499206543, "learning_rate": 1.946567835526243e-06, "loss": 0.0613, "step": 1561 }, { "epoch": 0.6143559488692232, "grad_norm": 0.8812416195869446, "learning_rate": 1.943219378246423e-06, "loss": 0.0626, "step": 1562 }, { "epoch": 0.6147492625368731, "grad_norm": 1.3703498840332031, "learning_rate": 1.9398719716006246e-06, "loss": 0.0673, "step": 1563 }, { "epoch": 0.6151425762045231, "grad_norm": 1.3188180923461914, "learning_rate": 1.936525621905336e-06, "loss": 0.0711, "step": 1564 }, { "epoch": 0.615535889872173, "grad_norm": 0.5656819939613342, "learning_rate": 1.9331803354750537e-06, "loss": 0.0496, "step": 1565 }, { "epoch": 0.6159292035398231, "grad_norm": 1.2018178701400757, "learning_rate": 1.9298361186222665e-06, "loss": 0.052, "step": 1566 }, { "epoch": 0.616322517207473, "grad_norm": 1.197943091392517, "learning_rate": 1.926492977657446e-06, "loss": 0.0667, "step": 1567 }, { "epoch": 0.6167158308751229, "grad_norm": 0.6885368227958679, "learning_rate": 1.9231509188890345e-06, "loss": 0.0374, "step": 1568 }, { "epoch": 0.6171091445427729, "grad_norm": 0.8017690181732178, "learning_rate": 1.919809948623428e-06, "loss": 0.053, "step": 1569 }, { "epoch": 0.6175024582104228, "grad_norm": 1.5223562717437744, "learning_rate": 1.9164700731649723e-06, "loss": 0.0605, "step": 1570 }, { "epoch": 0.6178957718780728, "grad_norm": 1.8122631311416626, "learning_rate": 1.913131298815947e-06, "loss": 0.0719, "step": 1571 }, { "epoch": 0.6182890855457227, "grad_norm": 1.5113699436187744, "learning_rate": 1.9097936318765527e-06, "loss": 0.0547, "step": 1572 }, { "epoch": 0.6186823992133726, "grad_norm": 0.7732280492782593, "learning_rate": 1.906457078644901e-06, "loss": 0.0456, "step": 1573 }, { "epoch": 0.6190757128810226, "grad_norm": 1.347740650177002, "learning_rate": 1.903121645417003e-06, "loss": 0.0469, "step": 1574 }, { "epoch": 0.6194690265486725, "grad_norm": 0.6614682674407959, "learning_rate": 1.8997873384867534e-06, "loss": 0.0266, "step": 1575 }, { "epoch": 0.6198623402163225, "grad_norm": 1.1419849395751953, "learning_rate": 1.8964541641459242e-06, "loss": 0.0465, "step": 1576 }, { "epoch": 0.6202556538839724, "grad_norm": 0.9635249972343445, "learning_rate": 1.893122128684149e-06, "loss": 0.0482, "step": 1577 }, { "epoch": 0.6206489675516225, "grad_norm": 0.9544531106948853, "learning_rate": 1.8897912383889138e-06, "loss": 0.0689, "step": 1578 }, { "epoch": 0.6210422812192724, "grad_norm": 0.7220961451530457, "learning_rate": 1.886461499545543e-06, "loss": 0.0521, "step": 1579 }, { "epoch": 0.6214355948869223, "grad_norm": 2.5634989738464355, "learning_rate": 1.883132918437186e-06, "loss": 0.0702, "step": 1580 }, { "epoch": 0.6218289085545723, "grad_norm": 1.1183925867080688, "learning_rate": 1.8798055013448105e-06, "loss": 0.0623, "step": 1581 }, { "epoch": 0.6222222222222222, "grad_norm": 0.7888696193695068, "learning_rate": 1.8764792545471872e-06, "loss": 0.0452, "step": 1582 }, { "epoch": 0.6226155358898722, "grad_norm": 0.4925548732280731, "learning_rate": 1.8731541843208772e-06, "loss": 0.0481, "step": 1583 }, { "epoch": 0.6230088495575221, "grad_norm": 1.184525489807129, "learning_rate": 1.869830296940223e-06, "loss": 0.0947, "step": 1584 }, { "epoch": 0.6234021632251721, "grad_norm": 1.0969839096069336, "learning_rate": 1.8665075986773346e-06, "loss": 0.0786, "step": 1585 }, { "epoch": 0.623795476892822, "grad_norm": 1.2557084560394287, "learning_rate": 1.863186095802077e-06, "loss": 0.048, "step": 1586 }, { "epoch": 0.6241887905604719, "grad_norm": 0.9532119631767273, "learning_rate": 1.8598657945820605e-06, "loss": 0.0356, "step": 1587 }, { "epoch": 0.6245821042281219, "grad_norm": 0.6121819019317627, "learning_rate": 1.8565467012826282e-06, "loss": 0.0395, "step": 1588 }, { "epoch": 0.6249754178957718, "grad_norm": 0.9521839022636414, "learning_rate": 1.853228822166843e-06, "loss": 0.0417, "step": 1589 }, { "epoch": 0.6253687315634219, "grad_norm": 1.3007653951644897, "learning_rate": 1.849912163495479e-06, "loss": 0.0376, "step": 1590 }, { "epoch": 0.6257620452310718, "grad_norm": 1.0467530488967896, "learning_rate": 1.8465967315270029e-06, "loss": 0.0531, "step": 1591 }, { "epoch": 0.6261553588987218, "grad_norm": 0.8435487747192383, "learning_rate": 1.8432825325175707e-06, "loss": 0.0333, "step": 1592 }, { "epoch": 0.6265486725663717, "grad_norm": 1.2616933584213257, "learning_rate": 1.8399695727210098e-06, "loss": 0.0556, "step": 1593 }, { "epoch": 0.6269419862340216, "grad_norm": 1.1721434593200684, "learning_rate": 1.836657858388811e-06, "loss": 0.0658, "step": 1594 }, { "epoch": 0.6273352999016716, "grad_norm": 0.6084288954734802, "learning_rate": 1.8333473957701126e-06, "loss": 0.0385, "step": 1595 }, { "epoch": 0.6277286135693215, "grad_norm": 1.4398316144943237, "learning_rate": 1.830038191111692e-06, "loss": 0.0606, "step": 1596 }, { "epoch": 0.6281219272369715, "grad_norm": 1.9486684799194336, "learning_rate": 1.8267302506579532e-06, "loss": 0.0853, "step": 1597 }, { "epoch": 0.6285152409046214, "grad_norm": 0.7250006794929504, "learning_rate": 1.8234235806509145e-06, "loss": 0.0295, "step": 1598 }, { "epoch": 0.6289085545722713, "grad_norm": 1.2927533388137817, "learning_rate": 1.8201181873301967e-06, "loss": 0.046, "step": 1599 }, { "epoch": 0.6293018682399213, "grad_norm": 1.2859911918640137, "learning_rate": 1.816814076933012e-06, "loss": 0.0579, "step": 1600 }, { "epoch": 0.6296951819075712, "grad_norm": 1.900543451309204, "learning_rate": 1.813511255694152e-06, "loss": 0.0567, "step": 1601 }, { "epoch": 0.6300884955752213, "grad_norm": 2.090280532836914, "learning_rate": 1.8102097298459732e-06, "loss": 0.0865, "step": 1602 }, { "epoch": 0.6304818092428712, "grad_norm": 1.3595722913742065, "learning_rate": 1.80690950561839e-06, "loss": 0.0561, "step": 1603 }, { "epoch": 0.6308751229105212, "grad_norm": 1.022291660308838, "learning_rate": 1.8036105892388611e-06, "loss": 0.0382, "step": 1604 }, { "epoch": 0.6312684365781711, "grad_norm": 0.8052154779434204, "learning_rate": 1.800312986932376e-06, "loss": 0.0529, "step": 1605 }, { "epoch": 0.631661750245821, "grad_norm": 4.667014122009277, "learning_rate": 1.7970167049214466e-06, "loss": 0.0492, "step": 1606 }, { "epoch": 0.632055063913471, "grad_norm": 1.5009123086929321, "learning_rate": 1.7937217494260888e-06, "loss": 0.0779, "step": 1607 }, { "epoch": 0.6324483775811209, "grad_norm": 1.570505976676941, "learning_rate": 1.7904281266638201e-06, "loss": 0.0577, "step": 1608 }, { "epoch": 0.6328416912487709, "grad_norm": 1.3305639028549194, "learning_rate": 1.7871358428496416e-06, "loss": 0.0979, "step": 1609 }, { "epoch": 0.6332350049164208, "grad_norm": 0.6136133074760437, "learning_rate": 1.7838449041960276e-06, "loss": 0.0424, "step": 1610 }, { "epoch": 0.6336283185840708, "grad_norm": 0.7882452607154846, "learning_rate": 1.7805553169129142e-06, "loss": 0.0656, "step": 1611 }, { "epoch": 0.6340216322517207, "grad_norm": 2.1648337841033936, "learning_rate": 1.7772670872076883e-06, "loss": 0.0622, "step": 1612 }, { "epoch": 0.6344149459193706, "grad_norm": 0.5130072832107544, "learning_rate": 1.773980221285173e-06, "loss": 0.0394, "step": 1613 }, { "epoch": 0.6348082595870207, "grad_norm": 1.0151782035827637, "learning_rate": 1.7706947253476194e-06, "loss": 0.0424, "step": 1614 }, { "epoch": 0.6352015732546706, "grad_norm": 0.8527183532714844, "learning_rate": 1.767410605594694e-06, "loss": 0.0394, "step": 1615 }, { "epoch": 0.6355948869223206, "grad_norm": 1.3671120405197144, "learning_rate": 1.7641278682234658e-06, "loss": 0.0625, "step": 1616 }, { "epoch": 0.6359882005899705, "grad_norm": 0.8969728350639343, "learning_rate": 1.7608465194283958e-06, "loss": 0.0295, "step": 1617 }, { "epoch": 0.6363815142576205, "grad_norm": 0.7407302260398865, "learning_rate": 1.757566565401323e-06, "loss": 0.055, "step": 1618 }, { "epoch": 0.6367748279252704, "grad_norm": 1.153152346611023, "learning_rate": 1.7542880123314559e-06, "loss": 0.0945, "step": 1619 }, { "epoch": 0.6371681415929203, "grad_norm": 1.259879231452942, "learning_rate": 1.75101086640536e-06, "loss": 0.0537, "step": 1620 }, { "epoch": 0.6375614552605703, "grad_norm": 0.6502655744552612, "learning_rate": 1.7477351338069442e-06, "loss": 0.0443, "step": 1621 }, { "epoch": 0.6379547689282202, "grad_norm": 0.9160225987434387, "learning_rate": 1.7444608207174519e-06, "loss": 0.0494, "step": 1622 }, { "epoch": 0.6383480825958702, "grad_norm": 1.6503887176513672, "learning_rate": 1.741187933315448e-06, "loss": 0.0415, "step": 1623 }, { "epoch": 0.6387413962635201, "grad_norm": 1.2449769973754883, "learning_rate": 1.7379164777768038e-06, "loss": 0.0607, "step": 1624 }, { "epoch": 0.63913470993117, "grad_norm": 0.799196720123291, "learning_rate": 1.734646460274692e-06, "loss": 0.0404, "step": 1625 }, { "epoch": 0.6395280235988201, "grad_norm": 1.6735135316848755, "learning_rate": 1.7313778869795717e-06, "loss": 0.0626, "step": 1626 }, { "epoch": 0.63992133726647, "grad_norm": 1.090598702430725, "learning_rate": 1.728110764059176e-06, "loss": 0.0649, "step": 1627 }, { "epoch": 0.64031465093412, "grad_norm": 0.6586104035377502, "learning_rate": 1.7248450976785011e-06, "loss": 0.0501, "step": 1628 }, { "epoch": 0.6407079646017699, "grad_norm": 1.8684154748916626, "learning_rate": 1.7215808939997945e-06, "loss": 0.0653, "step": 1629 }, { "epoch": 0.6411012782694199, "grad_norm": 1.1549500226974487, "learning_rate": 1.7183181591825437e-06, "loss": 0.0332, "step": 1630 }, { "epoch": 0.6414945919370698, "grad_norm": 1.295351505279541, "learning_rate": 1.7150568993834666e-06, "loss": 0.0535, "step": 1631 }, { "epoch": 0.6418879056047198, "grad_norm": 0.8795567750930786, "learning_rate": 1.7117971207564934e-06, "loss": 0.0866, "step": 1632 }, { "epoch": 0.6422812192723697, "grad_norm": 0.6757074594497681, "learning_rate": 1.7085388294527632e-06, "loss": 0.0385, "step": 1633 }, { "epoch": 0.6426745329400196, "grad_norm": 0.9733456373214722, "learning_rate": 1.705282031620608e-06, "loss": 0.0923, "step": 1634 }, { "epoch": 0.6430678466076696, "grad_norm": 1.0591400861740112, "learning_rate": 1.7020267334055393e-06, "loss": 0.0492, "step": 1635 }, { "epoch": 0.6434611602753195, "grad_norm": 0.8595137596130371, "learning_rate": 1.6987729409502412e-06, "loss": 0.0411, "step": 1636 }, { "epoch": 0.6438544739429695, "grad_norm": 1.831631064414978, "learning_rate": 1.6955206603945557e-06, "loss": 0.0733, "step": 1637 }, { "epoch": 0.6442477876106195, "grad_norm": 0.5861109495162964, "learning_rate": 1.6922698978754726e-06, "loss": 0.045, "step": 1638 }, { "epoch": 0.6446411012782695, "grad_norm": 1.3072712421417236, "learning_rate": 1.6890206595271153e-06, "loss": 0.0713, "step": 1639 }, { "epoch": 0.6450344149459194, "grad_norm": 0.8035500049591064, "learning_rate": 1.6857729514807325e-06, "loss": 0.0379, "step": 1640 }, { "epoch": 0.6454277286135693, "grad_norm": 0.7814714312553406, "learning_rate": 1.6825267798646851e-06, "loss": 0.041, "step": 1641 }, { "epoch": 0.6458210422812193, "grad_norm": 1.3243709802627563, "learning_rate": 1.6792821508044352e-06, "loss": 0.0633, "step": 1642 }, { "epoch": 0.6462143559488692, "grad_norm": 0.8479057550430298, "learning_rate": 1.6760390704225333e-06, "loss": 0.0561, "step": 1643 }, { "epoch": 0.6466076696165192, "grad_norm": 1.0051478147506714, "learning_rate": 1.672797544838608e-06, "loss": 0.0372, "step": 1644 }, { "epoch": 0.6470009832841691, "grad_norm": 0.962547779083252, "learning_rate": 1.6695575801693549e-06, "loss": 0.0398, "step": 1645 }, { "epoch": 0.647394296951819, "grad_norm": 1.314014196395874, "learning_rate": 1.6663191825285214e-06, "loss": 0.0492, "step": 1646 }, { "epoch": 0.647787610619469, "grad_norm": 0.6934694647789001, "learning_rate": 1.6630823580269005e-06, "loss": 0.0367, "step": 1647 }, { "epoch": 0.6481809242871189, "grad_norm": 1.1256476640701294, "learning_rate": 1.6598471127723162e-06, "loss": 0.0476, "step": 1648 }, { "epoch": 0.6485742379547689, "grad_norm": 1.5946294069290161, "learning_rate": 1.6566134528696126e-06, "loss": 0.0484, "step": 1649 }, { "epoch": 0.6489675516224189, "grad_norm": 1.1677006483078003, "learning_rate": 1.6533813844206426e-06, "loss": 0.0443, "step": 1650 }, { "epoch": 0.6493608652900689, "grad_norm": 0.9727287292480469, "learning_rate": 1.6501509135242533e-06, "loss": 0.036, "step": 1651 }, { "epoch": 0.6497541789577188, "grad_norm": 1.6365562677383423, "learning_rate": 1.6469220462762807e-06, "loss": 0.0794, "step": 1652 }, { "epoch": 0.6501474926253688, "grad_norm": 0.9197725057601929, "learning_rate": 1.6436947887695336e-06, "loss": 0.0314, "step": 1653 }, { "epoch": 0.6505408062930187, "grad_norm": 0.9444229006767273, "learning_rate": 1.6404691470937829e-06, "loss": 0.017, "step": 1654 }, { "epoch": 0.6509341199606686, "grad_norm": 1.0287470817565918, "learning_rate": 1.6372451273357504e-06, "loss": 0.0674, "step": 1655 }, { "epoch": 0.6513274336283186, "grad_norm": 0.9683353900909424, "learning_rate": 1.6340227355790988e-06, "loss": 0.0727, "step": 1656 }, { "epoch": 0.6517207472959685, "grad_norm": 0.9869152903556824, "learning_rate": 1.6308019779044154e-06, "loss": 0.0526, "step": 1657 }, { "epoch": 0.6521140609636185, "grad_norm": 2.224297046661377, "learning_rate": 1.6275828603892078e-06, "loss": 0.0635, "step": 1658 }, { "epoch": 0.6525073746312684, "grad_norm": 0.8496151566505432, "learning_rate": 1.6243653891078864e-06, "loss": 0.0581, "step": 1659 }, { "epoch": 0.6529006882989183, "grad_norm": 1.2158007621765137, "learning_rate": 1.6211495701317565e-06, "loss": 0.0728, "step": 1660 }, { "epoch": 0.6532940019665683, "grad_norm": 0.48335015773773193, "learning_rate": 1.6179354095290051e-06, "loss": 0.0405, "step": 1661 }, { "epoch": 0.6536873156342183, "grad_norm": 0.679865300655365, "learning_rate": 1.6147229133646885e-06, "loss": 0.0497, "step": 1662 }, { "epoch": 0.6540806293018683, "grad_norm": 2.487617254257202, "learning_rate": 1.611512087700724e-06, "loss": 0.1029, "step": 1663 }, { "epoch": 0.6544739429695182, "grad_norm": 1.0901083946228027, "learning_rate": 1.6083029385958762e-06, "loss": 0.0706, "step": 1664 }, { "epoch": 0.6548672566371682, "grad_norm": 1.4582974910736084, "learning_rate": 1.6050954721057461e-06, "loss": 0.0651, "step": 1665 }, { "epoch": 0.6552605703048181, "grad_norm": 1.1469032764434814, "learning_rate": 1.6018896942827595e-06, "loss": 0.0533, "step": 1666 }, { "epoch": 0.655653883972468, "grad_norm": 1.5001522302627563, "learning_rate": 1.5986856111761562e-06, "loss": 0.0688, "step": 1667 }, { "epoch": 0.656047197640118, "grad_norm": 0.7778475880622864, "learning_rate": 1.595483228831976e-06, "loss": 0.0457, "step": 1668 }, { "epoch": 0.6564405113077679, "grad_norm": 0.910394549369812, "learning_rate": 1.5922825532930526e-06, "loss": 0.0295, "step": 1669 }, { "epoch": 0.6568338249754179, "grad_norm": 1.1938371658325195, "learning_rate": 1.5890835905989969e-06, "loss": 0.0533, "step": 1670 }, { "epoch": 0.6572271386430678, "grad_norm": 0.9362410306930542, "learning_rate": 1.5858863467861882e-06, "loss": 0.054, "step": 1671 }, { "epoch": 0.6576204523107178, "grad_norm": 0.5481738448143005, "learning_rate": 1.582690827887763e-06, "loss": 0.037, "step": 1672 }, { "epoch": 0.6580137659783677, "grad_norm": 0.8186729550361633, "learning_rate": 1.5794970399336012e-06, "loss": 0.0355, "step": 1673 }, { "epoch": 0.6584070796460177, "grad_norm": 0.885360598564148, "learning_rate": 1.576304988950318e-06, "loss": 0.0478, "step": 1674 }, { "epoch": 0.6588003933136677, "grad_norm": 1.0103771686553955, "learning_rate": 1.5731146809612508e-06, "loss": 0.0562, "step": 1675 }, { "epoch": 0.6591937069813176, "grad_norm": 0.9461012482643127, "learning_rate": 1.569926121986447e-06, "loss": 0.0301, "step": 1676 }, { "epoch": 0.6595870206489676, "grad_norm": 1.5684260129928589, "learning_rate": 1.566739318042655e-06, "loss": 0.0339, "step": 1677 }, { "epoch": 0.6599803343166175, "grad_norm": 0.7456137537956238, "learning_rate": 1.56355427514331e-06, "loss": 0.0592, "step": 1678 }, { "epoch": 0.6603736479842675, "grad_norm": 1.6279810667037964, "learning_rate": 1.5603709992985256e-06, "loss": 0.0452, "step": 1679 }, { "epoch": 0.6607669616519174, "grad_norm": 1.3496975898742676, "learning_rate": 1.5571894965150796e-06, "loss": 0.058, "step": 1680 }, { "epoch": 0.6611602753195673, "grad_norm": 1.0409663915634155, "learning_rate": 1.554009772796406e-06, "loss": 0.0635, "step": 1681 }, { "epoch": 0.6615535889872173, "grad_norm": 0.6893079876899719, "learning_rate": 1.55083183414258e-06, "loss": 0.042, "step": 1682 }, { "epoch": 0.6619469026548672, "grad_norm": 1.3735069036483765, "learning_rate": 1.5476556865503095e-06, "loss": 0.0418, "step": 1683 }, { "epoch": 0.6623402163225172, "grad_norm": 0.9965916275978088, "learning_rate": 1.5444813360129207e-06, "loss": 0.0436, "step": 1684 }, { "epoch": 0.6627335299901671, "grad_norm": 0.41811513900756836, "learning_rate": 1.5413087885203515e-06, "loss": 0.032, "step": 1685 }, { "epoch": 0.6631268436578172, "grad_norm": 1.2320137023925781, "learning_rate": 1.538138050059136e-06, "loss": 0.0588, "step": 1686 }, { "epoch": 0.6635201573254671, "grad_norm": 1.2540123462677002, "learning_rate": 1.5349691266123946e-06, "loss": 0.0527, "step": 1687 }, { "epoch": 0.663913470993117, "grad_norm": 0.8406708240509033, "learning_rate": 1.5318020241598248e-06, "loss": 0.0479, "step": 1688 }, { "epoch": 0.664306784660767, "grad_norm": 1.1033174991607666, "learning_rate": 1.5286367486776835e-06, "loss": 0.0566, "step": 1689 }, { "epoch": 0.6647000983284169, "grad_norm": 1.4875179529190063, "learning_rate": 1.5254733061387846e-06, "loss": 0.0566, "step": 1690 }, { "epoch": 0.6650934119960669, "grad_norm": 1.0827391147613525, "learning_rate": 1.5223117025124817e-06, "loss": 0.0333, "step": 1691 }, { "epoch": 0.6654867256637168, "grad_norm": 1.2373061180114746, "learning_rate": 1.5191519437646576e-06, "loss": 0.048, "step": 1692 }, { "epoch": 0.6658800393313667, "grad_norm": 0.9508680701255798, "learning_rate": 1.5159940358577151e-06, "loss": 0.0499, "step": 1693 }, { "epoch": 0.6662733529990167, "grad_norm": 0.4500909447669983, "learning_rate": 1.512837984750565e-06, "loss": 0.0207, "step": 1694 }, { "epoch": 0.6666666666666666, "grad_norm": 1.83719003200531, "learning_rate": 1.5096837963986112e-06, "loss": 0.0541, "step": 1695 }, { "epoch": 0.6670599803343166, "grad_norm": 1.0231764316558838, "learning_rate": 1.5065314767537453e-06, "loss": 0.0255, "step": 1696 }, { "epoch": 0.6674532940019666, "grad_norm": 0.8618975877761841, "learning_rate": 1.5033810317643327e-06, "loss": 0.0398, "step": 1697 }, { "epoch": 0.6678466076696166, "grad_norm": 0.40866029262542725, "learning_rate": 1.5002324673752006e-06, "loss": 0.031, "step": 1698 }, { "epoch": 0.6682399213372665, "grad_norm": 0.7475729584693909, "learning_rate": 1.4970857895276285e-06, "loss": 0.0534, "step": 1699 }, { "epoch": 0.6686332350049164, "grad_norm": 1.0545064210891724, "learning_rate": 1.4939410041593338e-06, "loss": 0.0451, "step": 1700 }, { "epoch": 0.6690265486725664, "grad_norm": 1.023006796836853, "learning_rate": 1.4907981172044647e-06, "loss": 0.0594, "step": 1701 }, { "epoch": 0.6694198623402163, "grad_norm": 0.9975923299789429, "learning_rate": 1.487657134593587e-06, "loss": 0.0634, "step": 1702 }, { "epoch": 0.6698131760078663, "grad_norm": 1.2105883359909058, "learning_rate": 1.4845180622536728e-06, "loss": 0.0482, "step": 1703 }, { "epoch": 0.6702064896755162, "grad_norm": 1.007332682609558, "learning_rate": 1.4813809061080893e-06, "loss": 0.0706, "step": 1704 }, { "epoch": 0.6705998033431662, "grad_norm": 0.7119497060775757, "learning_rate": 1.4782456720765895e-06, "loss": 0.0409, "step": 1705 }, { "epoch": 0.6709931170108161, "grad_norm": 1.0542527437210083, "learning_rate": 1.4751123660752955e-06, "loss": 0.0388, "step": 1706 }, { "epoch": 0.671386430678466, "grad_norm": 2.3204405307769775, "learning_rate": 1.4719809940166952e-06, "loss": 0.0724, "step": 1707 }, { "epoch": 0.671779744346116, "grad_norm": 0.5740649700164795, "learning_rate": 1.4688515618096252e-06, "loss": 0.0319, "step": 1708 }, { "epoch": 0.672173058013766, "grad_norm": 0.9803503155708313, "learning_rate": 1.4657240753592627e-06, "loss": 0.0504, "step": 1709 }, { "epoch": 0.672566371681416, "grad_norm": 0.8115725517272949, "learning_rate": 1.462598540567113e-06, "loss": 0.0605, "step": 1710 }, { "epoch": 0.6729596853490659, "grad_norm": 1.3304479122161865, "learning_rate": 1.4594749633309981e-06, "loss": 0.0758, "step": 1711 }, { "epoch": 0.6733529990167159, "grad_norm": 1.208067774772644, "learning_rate": 1.456353349545046e-06, "loss": 0.0706, "step": 1712 }, { "epoch": 0.6737463126843658, "grad_norm": 1.1107121706008911, "learning_rate": 1.4532337050996804e-06, "loss": 0.0468, "step": 1713 }, { "epoch": 0.6741396263520157, "grad_norm": 1.192116618156433, "learning_rate": 1.4501160358816085e-06, "loss": 0.0657, "step": 1714 }, { "epoch": 0.6745329400196657, "grad_norm": 1.0967481136322021, "learning_rate": 1.4470003477738111e-06, "loss": 0.0499, "step": 1715 }, { "epoch": 0.6749262536873156, "grad_norm": 1.3263583183288574, "learning_rate": 1.4438866466555308e-06, "loss": 0.0449, "step": 1716 }, { "epoch": 0.6753195673549656, "grad_norm": 1.5055456161499023, "learning_rate": 1.4407749384022576e-06, "loss": 0.0489, "step": 1717 }, { "epoch": 0.6757128810226155, "grad_norm": 1.5726017951965332, "learning_rate": 1.4376652288857249e-06, "loss": 0.0626, "step": 1718 }, { "epoch": 0.6761061946902654, "grad_norm": 1.6234389543533325, "learning_rate": 1.4345575239738928e-06, "loss": 0.0606, "step": 1719 }, { "epoch": 0.6764995083579154, "grad_norm": 1.7149680852890015, "learning_rate": 1.431451829530939e-06, "loss": 0.0527, "step": 1720 }, { "epoch": 0.6768928220255654, "grad_norm": 0.8043215870857239, "learning_rate": 1.4283481514172487e-06, "loss": 0.0454, "step": 1721 }, { "epoch": 0.6772861356932154, "grad_norm": 1.3794721364974976, "learning_rate": 1.425246495489399e-06, "loss": 0.0522, "step": 1722 }, { "epoch": 0.6776794493608653, "grad_norm": 0.7596322298049927, "learning_rate": 1.4221468676001544e-06, "loss": 0.0507, "step": 1723 }, { "epoch": 0.6780727630285153, "grad_norm": 0.9277907013893127, "learning_rate": 1.419049273598451e-06, "loss": 0.0406, "step": 1724 }, { "epoch": 0.6784660766961652, "grad_norm": 1.7175707817077637, "learning_rate": 1.4159537193293876e-06, "loss": 0.0477, "step": 1725 }, { "epoch": 0.6788593903638152, "grad_norm": 0.5326056480407715, "learning_rate": 1.4128602106342154e-06, "loss": 0.0248, "step": 1726 }, { "epoch": 0.6792527040314651, "grad_norm": 1.259993314743042, "learning_rate": 1.4097687533503213e-06, "loss": 0.05, "step": 1727 }, { "epoch": 0.679646017699115, "grad_norm": 0.9844882488250732, "learning_rate": 1.4066793533112255e-06, "loss": 0.0407, "step": 1728 }, { "epoch": 0.680039331366765, "grad_norm": 1.6221920251846313, "learning_rate": 1.4035920163465648e-06, "loss": 0.0589, "step": 1729 }, { "epoch": 0.6804326450344149, "grad_norm": 2.0537407398223877, "learning_rate": 1.400506748282083e-06, "loss": 0.0622, "step": 1730 }, { "epoch": 0.6808259587020649, "grad_norm": 1.1460561752319336, "learning_rate": 1.3974235549396198e-06, "loss": 0.0448, "step": 1731 }, { "epoch": 0.6812192723697148, "grad_norm": 1.2280306816101074, "learning_rate": 1.3943424421370998e-06, "loss": 0.0621, "step": 1732 }, { "epoch": 0.6816125860373649, "grad_norm": 1.9272797107696533, "learning_rate": 1.3912634156885235e-06, "loss": 0.0559, "step": 1733 }, { "epoch": 0.6820058997050148, "grad_norm": 0.8985779285430908, "learning_rate": 1.3881864814039503e-06, "loss": 0.0568, "step": 1734 }, { "epoch": 0.6823992133726647, "grad_norm": 0.5459672808647156, "learning_rate": 1.3851116450894959e-06, "loss": 0.03, "step": 1735 }, { "epoch": 0.6827925270403147, "grad_norm": 0.8683139085769653, "learning_rate": 1.382038912547315e-06, "loss": 0.0513, "step": 1736 }, { "epoch": 0.6831858407079646, "grad_norm": 0.7696962952613831, "learning_rate": 1.3789682895755935e-06, "loss": 0.0448, "step": 1737 }, { "epoch": 0.6835791543756146, "grad_norm": 1.2431952953338623, "learning_rate": 1.3758997819685366e-06, "loss": 0.0493, "step": 1738 }, { "epoch": 0.6839724680432645, "grad_norm": 0.9553192853927612, "learning_rate": 1.3728333955163565e-06, "loss": 0.0321, "step": 1739 }, { "epoch": 0.6843657817109144, "grad_norm": 1.2432819604873657, "learning_rate": 1.3697691360052646e-06, "loss": 0.0744, "step": 1740 }, { "epoch": 0.6847590953785644, "grad_norm": 0.6021830439567566, "learning_rate": 1.3667070092174587e-06, "loss": 0.0471, "step": 1741 }, { "epoch": 0.6851524090462143, "grad_norm": 1.0340098142623901, "learning_rate": 1.3636470209311093e-06, "loss": 0.0645, "step": 1742 }, { "epoch": 0.6855457227138643, "grad_norm": 1.2661107778549194, "learning_rate": 1.360589176920355e-06, "loss": 0.0314, "step": 1743 }, { "epoch": 0.6859390363815142, "grad_norm": 1.7685880661010742, "learning_rate": 1.357533482955287e-06, "loss": 0.0635, "step": 1744 }, { "epoch": 0.6863323500491643, "grad_norm": 1.249866008758545, "learning_rate": 1.354479944801939e-06, "loss": 0.0257, "step": 1745 }, { "epoch": 0.6867256637168142, "grad_norm": 0.8888324499130249, "learning_rate": 1.3514285682222777e-06, "loss": 0.0501, "step": 1746 }, { "epoch": 0.6871189773844641, "grad_norm": 0.9306212067604065, "learning_rate": 1.3483793589741901e-06, "loss": 0.0535, "step": 1747 }, { "epoch": 0.6875122910521141, "grad_norm": 1.239108920097351, "learning_rate": 1.3453323228114745e-06, "loss": 0.0645, "step": 1748 }, { "epoch": 0.687905604719764, "grad_norm": 1.971179723739624, "learning_rate": 1.3422874654838263e-06, "loss": 0.0617, "step": 1749 }, { "epoch": 0.688298918387414, "grad_norm": 0.8780958652496338, "learning_rate": 1.3392447927368315e-06, "loss": 0.0303, "step": 1750 }, { "epoch": 0.6886922320550639, "grad_norm": 0.5229460000991821, "learning_rate": 1.3362043103119537e-06, "loss": 0.0408, "step": 1751 }, { "epoch": 0.6890855457227139, "grad_norm": 1.0178303718566895, "learning_rate": 1.3331660239465232e-06, "loss": 0.0692, "step": 1752 }, { "epoch": 0.6894788593903638, "grad_norm": 1.1098684072494507, "learning_rate": 1.3301299393737262e-06, "loss": 0.0553, "step": 1753 }, { "epoch": 0.6898721730580137, "grad_norm": 0.9905382990837097, "learning_rate": 1.3270960623225953e-06, "loss": 0.0551, "step": 1754 }, { "epoch": 0.6902654867256637, "grad_norm": 1.15705406665802, "learning_rate": 1.324064398517994e-06, "loss": 0.0606, "step": 1755 }, { "epoch": 0.6906588003933136, "grad_norm": 0.7547001838684082, "learning_rate": 1.3210349536806138e-06, "loss": 0.0375, "step": 1756 }, { "epoch": 0.6910521140609637, "grad_norm": 0.9143390655517578, "learning_rate": 1.3180077335269565e-06, "loss": 0.0557, "step": 1757 }, { "epoch": 0.6914454277286136, "grad_norm": 1.5813028812408447, "learning_rate": 1.3149827437693267e-06, "loss": 0.0734, "step": 1758 }, { "epoch": 0.6918387413962636, "grad_norm": 1.3135156631469727, "learning_rate": 1.3119599901158214e-06, "loss": 0.0454, "step": 1759 }, { "epoch": 0.6922320550639135, "grad_norm": 1.3713979721069336, "learning_rate": 1.3089394782703152e-06, "loss": 0.0459, "step": 1760 }, { "epoch": 0.6926253687315634, "grad_norm": 1.0648804903030396, "learning_rate": 1.3059212139324548e-06, "loss": 0.0562, "step": 1761 }, { "epoch": 0.6930186823992134, "grad_norm": 0.8367137312889099, "learning_rate": 1.3029052027976457e-06, "loss": 0.0269, "step": 1762 }, { "epoch": 0.6934119960668633, "grad_norm": 1.1222723722457886, "learning_rate": 1.299891450557041e-06, "loss": 0.0458, "step": 1763 }, { "epoch": 0.6938053097345133, "grad_norm": 1.087550163269043, "learning_rate": 1.2968799628975311e-06, "loss": 0.0357, "step": 1764 }, { "epoch": 0.6941986234021632, "grad_norm": 0.8797011375427246, "learning_rate": 1.2938707455017358e-06, "loss": 0.0459, "step": 1765 }, { "epoch": 0.6945919370698131, "grad_norm": 1.4389101266860962, "learning_rate": 1.2908638040479855e-06, "loss": 0.0715, "step": 1766 }, { "epoch": 0.6949852507374631, "grad_norm": 0.826977014541626, "learning_rate": 1.2878591442103215e-06, "loss": 0.0498, "step": 1767 }, { "epoch": 0.695378564405113, "grad_norm": 1.2073124647140503, "learning_rate": 1.2848567716584764e-06, "loss": 0.0401, "step": 1768 }, { "epoch": 0.6957718780727631, "grad_norm": 1.2512377500534058, "learning_rate": 1.2818566920578684e-06, "loss": 0.0545, "step": 1769 }, { "epoch": 0.696165191740413, "grad_norm": 1.003304123878479, "learning_rate": 1.2788589110695896e-06, "loss": 0.0657, "step": 1770 }, { "epoch": 0.696558505408063, "grad_norm": 1.6829479932785034, "learning_rate": 1.275863434350391e-06, "loss": 0.0488, "step": 1771 }, { "epoch": 0.6969518190757129, "grad_norm": 1.0957913398742676, "learning_rate": 1.2728702675526788e-06, "loss": 0.0695, "step": 1772 }, { "epoch": 0.6973451327433628, "grad_norm": 1.2029186487197876, "learning_rate": 1.2698794163244998e-06, "loss": 0.0574, "step": 1773 }, { "epoch": 0.6977384464110128, "grad_norm": 0.8925944566726685, "learning_rate": 1.2668908863095311e-06, "loss": 0.0424, "step": 1774 }, { "epoch": 0.6981317600786627, "grad_norm": 0.8353788256645203, "learning_rate": 1.2639046831470697e-06, "loss": 0.038, "step": 1775 }, { "epoch": 0.6985250737463127, "grad_norm": 2.284682273864746, "learning_rate": 1.2609208124720228e-06, "loss": 0.0687, "step": 1776 }, { "epoch": 0.6989183874139626, "grad_norm": 0.9992805123329163, "learning_rate": 1.2579392799148938e-06, "loss": 0.0401, "step": 1777 }, { "epoch": 0.6993117010816126, "grad_norm": 1.329393744468689, "learning_rate": 1.2549600911017761e-06, "loss": 0.0768, "step": 1778 }, { "epoch": 0.6997050147492625, "grad_norm": 1.184579849243164, "learning_rate": 1.25198325165434e-06, "loss": 0.0467, "step": 1779 }, { "epoch": 0.7000983284169124, "grad_norm": 0.6934780478477478, "learning_rate": 1.2490087671898234e-06, "loss": 0.0454, "step": 1780 }, { "epoch": 0.7004916420845625, "grad_norm": 0.5612182021141052, "learning_rate": 1.24603664332102e-06, "loss": 0.0397, "step": 1781 }, { "epoch": 0.7008849557522124, "grad_norm": 1.493826985359192, "learning_rate": 1.243066885656267e-06, "loss": 0.0815, "step": 1782 }, { "epoch": 0.7012782694198624, "grad_norm": 0.7363511323928833, "learning_rate": 1.240099499799439e-06, "loss": 0.0496, "step": 1783 }, { "epoch": 0.7016715830875123, "grad_norm": 1.6472634077072144, "learning_rate": 1.237134491349935e-06, "loss": 0.0741, "step": 1784 }, { "epoch": 0.7020648967551623, "grad_norm": 1.3183567523956299, "learning_rate": 1.234171865902667e-06, "loss": 0.043, "step": 1785 }, { "epoch": 0.7024582104228122, "grad_norm": 1.0543493032455444, "learning_rate": 1.2312116290480506e-06, "loss": 0.0401, "step": 1786 }, { "epoch": 0.7028515240904621, "grad_norm": 0.8686029314994812, "learning_rate": 1.228253786371995e-06, "loss": 0.0335, "step": 1787 }, { "epoch": 0.7032448377581121, "grad_norm": 1.9254342317581177, "learning_rate": 1.2252983434558894e-06, "loss": 0.0361, "step": 1788 }, { "epoch": 0.703638151425762, "grad_norm": 0.8810344338417053, "learning_rate": 1.2223453058765966e-06, "loss": 0.0442, "step": 1789 }, { "epoch": 0.704031465093412, "grad_norm": 1.138178825378418, "learning_rate": 1.2193946792064403e-06, "loss": 0.0768, "step": 1790 }, { "epoch": 0.7044247787610619, "grad_norm": 0.7755922675132751, "learning_rate": 1.2164464690131947e-06, "loss": 0.0303, "step": 1791 }, { "epoch": 0.7048180924287119, "grad_norm": 1.5868074893951416, "learning_rate": 1.2135006808600752e-06, "loss": 0.052, "step": 1792 }, { "epoch": 0.7052114060963619, "grad_norm": 0.9672881364822388, "learning_rate": 1.2105573203057233e-06, "loss": 0.0432, "step": 1793 }, { "epoch": 0.7056047197640118, "grad_norm": 0.9986976981163025, "learning_rate": 1.207616392904204e-06, "loss": 0.0464, "step": 1794 }, { "epoch": 0.7059980334316618, "grad_norm": 0.646554708480835, "learning_rate": 1.2046779042049883e-06, "loss": 0.0268, "step": 1795 }, { "epoch": 0.7063913470993117, "grad_norm": 0.6818554997444153, "learning_rate": 1.2017418597529464e-06, "loss": 0.0521, "step": 1796 }, { "epoch": 0.7067846607669617, "grad_norm": 0.5991765260696411, "learning_rate": 1.1988082650883376e-06, "loss": 0.0538, "step": 1797 }, { "epoch": 0.7071779744346116, "grad_norm": 1.1525814533233643, "learning_rate": 1.1958771257467946e-06, "loss": 0.0451, "step": 1798 }, { "epoch": 0.7075712881022616, "grad_norm": 0.8486371040344238, "learning_rate": 1.1929484472593205e-06, "loss": 0.0514, "step": 1799 }, { "epoch": 0.7079646017699115, "grad_norm": 1.393419623374939, "learning_rate": 1.190022235152274e-06, "loss": 0.0609, "step": 1800 }, { "epoch": 0.7083579154375614, "grad_norm": 0.7574542760848999, "learning_rate": 1.1870984949473586e-06, "loss": 0.0604, "step": 1801 }, { "epoch": 0.7087512291052114, "grad_norm": 1.0601574182510376, "learning_rate": 1.184177232161615e-06, "loss": 0.0459, "step": 1802 }, { "epoch": 0.7091445427728613, "grad_norm": 0.7535306811332703, "learning_rate": 1.1812584523074089e-06, "loss": 0.0351, "step": 1803 }, { "epoch": 0.7095378564405113, "grad_norm": 1.3023512363433838, "learning_rate": 1.1783421608924183e-06, "loss": 0.0598, "step": 1804 }, { "epoch": 0.7099311701081613, "grad_norm": 1.1070560216903687, "learning_rate": 1.1754283634196285e-06, "loss": 0.0471, "step": 1805 }, { "epoch": 0.7103244837758113, "grad_norm": 0.9613627791404724, "learning_rate": 1.1725170653873174e-06, "loss": 0.0486, "step": 1806 }, { "epoch": 0.7107177974434612, "grad_norm": 0.7932494282722473, "learning_rate": 1.1696082722890474e-06, "loss": 0.0774, "step": 1807 }, { "epoch": 0.7111111111111111, "grad_norm": 0.684893786907196, "learning_rate": 1.1667019896136539e-06, "loss": 0.0454, "step": 1808 }, { "epoch": 0.7115044247787611, "grad_norm": 1.3207006454467773, "learning_rate": 1.1637982228452329e-06, "loss": 0.0473, "step": 1809 }, { "epoch": 0.711897738446411, "grad_norm": 1.3429388999938965, "learning_rate": 1.1608969774631366e-06, "loss": 0.0412, "step": 1810 }, { "epoch": 0.712291052114061, "grad_norm": 1.4132349491119385, "learning_rate": 1.1579982589419568e-06, "loss": 0.0549, "step": 1811 }, { "epoch": 0.7126843657817109, "grad_norm": 0.7561691999435425, "learning_rate": 1.155102072751518e-06, "loss": 0.0337, "step": 1812 }, { "epoch": 0.7130776794493608, "grad_norm": 0.7749929428100586, "learning_rate": 1.152208424356867e-06, "loss": 0.034, "step": 1813 }, { "epoch": 0.7134709931170108, "grad_norm": 1.1324396133422852, "learning_rate": 1.1493173192182613e-06, "loss": 0.032, "step": 1814 }, { "epoch": 0.7138643067846607, "grad_norm": 0.7702449560165405, "learning_rate": 1.1464287627911577e-06, "loss": 0.0451, "step": 1815 }, { "epoch": 0.7142576204523107, "grad_norm": 0.7402438521385193, "learning_rate": 1.1435427605262057e-06, "loss": 0.0489, "step": 1816 }, { "epoch": 0.7146509341199607, "grad_norm": 1.3986225128173828, "learning_rate": 1.1406593178692346e-06, "loss": 0.0463, "step": 1817 }, { "epoch": 0.7150442477876107, "grad_norm": 0.7235271334648132, "learning_rate": 1.1377784402612439e-06, "loss": 0.0519, "step": 1818 }, { "epoch": 0.7154375614552606, "grad_norm": 0.8625795841217041, "learning_rate": 1.1349001331383921e-06, "loss": 0.0375, "step": 1819 }, { "epoch": 0.7158308751229105, "grad_norm": 1.5163322687149048, "learning_rate": 1.132024401931988e-06, "loss": 0.0557, "step": 1820 }, { "epoch": 0.7162241887905605, "grad_norm": 0.6675801277160645, "learning_rate": 1.12915125206848e-06, "loss": 0.0261, "step": 1821 }, { "epoch": 0.7166175024582104, "grad_norm": 0.9029967188835144, "learning_rate": 1.1262806889694455e-06, "loss": 0.037, "step": 1822 }, { "epoch": 0.7170108161258604, "grad_norm": 0.716080367565155, "learning_rate": 1.1234127180515787e-06, "loss": 0.0559, "step": 1823 }, { "epoch": 0.7174041297935103, "grad_norm": 0.9414195418357849, "learning_rate": 1.1205473447266843e-06, "loss": 0.0466, "step": 1824 }, { "epoch": 0.7177974434611603, "grad_norm": 0.9414455890655518, "learning_rate": 1.117684574401666e-06, "loss": 0.0408, "step": 1825 }, { "epoch": 0.7181907571288102, "grad_norm": 0.6914128065109253, "learning_rate": 1.1148244124785143e-06, "loss": 0.0286, "step": 1826 }, { "epoch": 0.7185840707964601, "grad_norm": 1.238477349281311, "learning_rate": 1.111966864354298e-06, "loss": 0.0606, "step": 1827 }, { "epoch": 0.7189773844641101, "grad_norm": 1.5670506954193115, "learning_rate": 1.1091119354211544e-06, "loss": 0.045, "step": 1828 }, { "epoch": 0.7193706981317601, "grad_norm": 1.5129029750823975, "learning_rate": 1.1062596310662775e-06, "loss": 0.0352, "step": 1829 }, { "epoch": 0.7197640117994101, "grad_norm": 1.0257515907287598, "learning_rate": 1.1034099566719104e-06, "loss": 0.0267, "step": 1830 }, { "epoch": 0.72015732546706, "grad_norm": 0.8426341414451599, "learning_rate": 1.1005629176153302e-06, "loss": 0.0331, "step": 1831 }, { "epoch": 0.72055063913471, "grad_norm": 1.1478296518325806, "learning_rate": 1.097718519268844e-06, "loss": 0.0601, "step": 1832 }, { "epoch": 0.7209439528023599, "grad_norm": 1.6983435153961182, "learning_rate": 1.0948767669997762e-06, "loss": 0.0671, "step": 1833 }, { "epoch": 0.7213372664700098, "grad_norm": 0.992310643196106, "learning_rate": 1.092037666170456e-06, "loss": 0.0554, "step": 1834 }, { "epoch": 0.7217305801376598, "grad_norm": 1.258967399597168, "learning_rate": 1.0892012221382115e-06, "loss": 0.0423, "step": 1835 }, { "epoch": 0.7221238938053097, "grad_norm": 0.8152772188186646, "learning_rate": 1.0863674402553564e-06, "loss": 0.0638, "step": 1836 }, { "epoch": 0.7225172074729597, "grad_norm": 0.8680564165115356, "learning_rate": 1.08353632586918e-06, "loss": 0.0322, "step": 1837 }, { "epoch": 0.7229105211406096, "grad_norm": 0.4944194257259369, "learning_rate": 1.0807078843219395e-06, "loss": 0.0684, "step": 1838 }, { "epoch": 0.7233038348082595, "grad_norm": 1.0787291526794434, "learning_rate": 1.077882120950849e-06, "loss": 0.0355, "step": 1839 }, { "epoch": 0.7236971484759095, "grad_norm": 0.4451111853122711, "learning_rate": 1.0750590410880671e-06, "loss": 0.0291, "step": 1840 }, { "epoch": 0.7240904621435595, "grad_norm": 0.48384201526641846, "learning_rate": 1.072238650060691e-06, "loss": 0.0344, "step": 1841 }, { "epoch": 0.7244837758112095, "grad_norm": 1.1826977729797363, "learning_rate": 1.0694209531907412e-06, "loss": 0.0302, "step": 1842 }, { "epoch": 0.7248770894788594, "grad_norm": 0.5904631614685059, "learning_rate": 1.0666059557951566e-06, "loss": 0.0268, "step": 1843 }, { "epoch": 0.7252704031465094, "grad_norm": 0.7693639993667603, "learning_rate": 1.0637936631857815e-06, "loss": 0.0329, "step": 1844 }, { "epoch": 0.7256637168141593, "grad_norm": 1.1267420053482056, "learning_rate": 1.0609840806693567e-06, "loss": 0.0584, "step": 1845 }, { "epoch": 0.7260570304818093, "grad_norm": 0.8826761841773987, "learning_rate": 1.0581772135475089e-06, "loss": 0.0371, "step": 1846 }, { "epoch": 0.7264503441494592, "grad_norm": 0.9510964751243591, "learning_rate": 1.0553730671167412e-06, "loss": 0.0366, "step": 1847 }, { "epoch": 0.7268436578171091, "grad_norm": 1.4061312675476074, "learning_rate": 1.052571646668421e-06, "loss": 0.0548, "step": 1848 }, { "epoch": 0.7272369714847591, "grad_norm": 1.7235345840454102, "learning_rate": 1.0497729574887744e-06, "loss": 0.0729, "step": 1849 }, { "epoch": 0.727630285152409, "grad_norm": 1.10977041721344, "learning_rate": 1.0469770048588723e-06, "loss": 0.042, "step": 1850 }, { "epoch": 0.728023598820059, "grad_norm": 1.054607629776001, "learning_rate": 1.0441837940546217e-06, "loss": 0.0286, "step": 1851 }, { "epoch": 0.7284169124877089, "grad_norm": 1.315953016281128, "learning_rate": 1.0413933303467578e-06, "loss": 0.0415, "step": 1852 }, { "epoch": 0.728810226155359, "grad_norm": 1.4497429132461548, "learning_rate": 1.038605619000828e-06, "loss": 0.0566, "step": 1853 }, { "epoch": 0.7292035398230089, "grad_norm": 1.1214773654937744, "learning_rate": 1.0358206652771896e-06, "loss": 0.0388, "step": 1854 }, { "epoch": 0.7295968534906588, "grad_norm": 0.8499764204025269, "learning_rate": 1.033038474430995e-06, "loss": 0.022, "step": 1855 }, { "epoch": 0.7299901671583088, "grad_norm": 0.993175745010376, "learning_rate": 1.0302590517121835e-06, "loss": 0.0351, "step": 1856 }, { "epoch": 0.7303834808259587, "grad_norm": 1.3063788414001465, "learning_rate": 1.0274824023654717e-06, "loss": 0.049, "step": 1857 }, { "epoch": 0.7307767944936087, "grad_norm": 0.6438285112380981, "learning_rate": 1.0247085316303401e-06, "loss": 0.0322, "step": 1858 }, { "epoch": 0.7311701081612586, "grad_norm": 1.801291823387146, "learning_rate": 1.0219374447410289e-06, "loss": 0.0724, "step": 1859 }, { "epoch": 0.7315634218289085, "grad_norm": 1.5461159944534302, "learning_rate": 1.019169146926524e-06, "loss": 0.0466, "step": 1860 }, { "epoch": 0.7319567354965585, "grad_norm": 1.0814778804779053, "learning_rate": 1.016403643410549e-06, "loss": 0.0532, "step": 1861 }, { "epoch": 0.7323500491642084, "grad_norm": 1.1939774751663208, "learning_rate": 1.013640939411554e-06, "loss": 0.0349, "step": 1862 }, { "epoch": 0.7327433628318584, "grad_norm": 2.0183346271514893, "learning_rate": 1.010881040142708e-06, "loss": 0.0802, "step": 1863 }, { "epoch": 0.7331366764995083, "grad_norm": 1.4486076831817627, "learning_rate": 1.0081239508118842e-06, "loss": 0.0381, "step": 1864 }, { "epoch": 0.7335299901671584, "grad_norm": 0.7198472023010254, "learning_rate": 1.0053696766216566e-06, "loss": 0.0332, "step": 1865 }, { "epoch": 0.7339233038348083, "grad_norm": 1.0703610181808472, "learning_rate": 1.0026182227692865e-06, "loss": 0.0321, "step": 1866 }, { "epoch": 0.7343166175024582, "grad_norm": 0.9748527407646179, "learning_rate": 9.998695944467127e-07, "loss": 0.0312, "step": 1867 }, { "epoch": 0.7347099311701082, "grad_norm": 0.6599907279014587, "learning_rate": 9.97123796840543e-07, "loss": 0.05, "step": 1868 }, { "epoch": 0.7351032448377581, "grad_norm": 1.033435583114624, "learning_rate": 9.943808351320418e-07, "loss": 0.0482, "step": 1869 }, { "epoch": 0.7354965585054081, "grad_norm": 1.139096975326538, "learning_rate": 9.916407144971245e-07, "loss": 0.046, "step": 1870 }, { "epoch": 0.735889872173058, "grad_norm": 1.5064547061920166, "learning_rate": 9.889034401063443e-07, "loss": 0.0629, "step": 1871 }, { "epoch": 0.736283185840708, "grad_norm": 0.7273301482200623, "learning_rate": 9.861690171248841e-07, "loss": 0.0314, "step": 1872 }, { "epoch": 0.7366764995083579, "grad_norm": 0.579467236995697, "learning_rate": 9.834374507125458e-07, "loss": 0.0527, "step": 1873 }, { "epoch": 0.7370698131760078, "grad_norm": 0.8448885679244995, "learning_rate": 9.807087460237419e-07, "loss": 0.0326, "step": 1874 }, { "epoch": 0.7374631268436578, "grad_norm": 1.0001413822174072, "learning_rate": 9.779829082074827e-07, "loss": 0.0657, "step": 1875 }, { "epoch": 0.7378564405113077, "grad_norm": 1.2145143747329712, "learning_rate": 9.752599424073707e-07, "loss": 0.0339, "step": 1876 }, { "epoch": 0.7382497541789578, "grad_norm": 1.0525156259536743, "learning_rate": 9.725398537615894e-07, "loss": 0.0459, "step": 1877 }, { "epoch": 0.7386430678466077, "grad_norm": 1.2982537746429443, "learning_rate": 9.698226474028913e-07, "loss": 0.0744, "step": 1878 }, { "epoch": 0.7390363815142577, "grad_norm": 0.8789856433868408, "learning_rate": 9.671083284585925e-07, "loss": 0.0442, "step": 1879 }, { "epoch": 0.7394296951819076, "grad_norm": 2.672044515609741, "learning_rate": 9.643969020505573e-07, "loss": 0.0769, "step": 1880 }, { "epoch": 0.7398230088495575, "grad_norm": 1.0391490459442139, "learning_rate": 9.616883732951945e-07, "loss": 0.0721, "step": 1881 }, { "epoch": 0.7402163225172075, "grad_norm": 1.1753817796707153, "learning_rate": 9.589827473034443e-07, "loss": 0.0463, "step": 1882 }, { "epoch": 0.7406096361848574, "grad_norm": 1.260125994682312, "learning_rate": 9.562800291807695e-07, "loss": 0.0637, "step": 1883 }, { "epoch": 0.7410029498525074, "grad_norm": 0.9175117015838623, "learning_rate": 9.535802240271455e-07, "loss": 0.037, "step": 1884 }, { "epoch": 0.7413962635201573, "grad_norm": 0.9132412075996399, "learning_rate": 9.508833369370524e-07, "loss": 0.056, "step": 1885 }, { "epoch": 0.7417895771878072, "grad_norm": 1.965725302696228, "learning_rate": 9.481893729994609e-07, "loss": 0.0545, "step": 1886 }, { "epoch": 0.7421828908554572, "grad_norm": 2.073374032974243, "learning_rate": 9.454983372978288e-07, "loss": 0.0754, "step": 1887 }, { "epoch": 0.7425762045231071, "grad_norm": 1.0531790256500244, "learning_rate": 9.428102349100868e-07, "loss": 0.0459, "step": 1888 }, { "epoch": 0.7429695181907572, "grad_norm": 1.7750204801559448, "learning_rate": 9.40125070908631e-07, "loss": 0.061, "step": 1889 }, { "epoch": 0.7433628318584071, "grad_norm": 0.6801098585128784, "learning_rate": 9.374428503603139e-07, "loss": 0.0597, "step": 1890 }, { "epoch": 0.7437561455260571, "grad_norm": 0.6724294424057007, "learning_rate": 9.347635783264309e-07, "loss": 0.0302, "step": 1891 }, { "epoch": 0.744149459193707, "grad_norm": 0.7799742817878723, "learning_rate": 9.32087259862716e-07, "loss": 0.0679, "step": 1892 }, { "epoch": 0.744542772861357, "grad_norm": 1.623399257659912, "learning_rate": 9.294139000193292e-07, "loss": 0.0553, "step": 1893 }, { "epoch": 0.7449360865290069, "grad_norm": 0.8977343440055847, "learning_rate": 9.267435038408479e-07, "loss": 0.0284, "step": 1894 }, { "epoch": 0.7453294001966568, "grad_norm": 0.7733441591262817, "learning_rate": 9.240760763662562e-07, "loss": 0.0339, "step": 1895 }, { "epoch": 0.7457227138643068, "grad_norm": 1.5382790565490723, "learning_rate": 9.214116226289388e-07, "loss": 0.0746, "step": 1896 }, { "epoch": 0.7461160275319567, "grad_norm": 1.144547700881958, "learning_rate": 9.187501476566648e-07, "loss": 0.0351, "step": 1897 }, { "epoch": 0.7465093411996067, "grad_norm": 0.7251105904579163, "learning_rate": 9.16091656471586e-07, "loss": 0.0634, "step": 1898 }, { "epoch": 0.7469026548672566, "grad_norm": 0.999096155166626, "learning_rate": 9.134361540902225e-07, "loss": 0.0421, "step": 1899 }, { "epoch": 0.7472959685349065, "grad_norm": 0.830605685710907, "learning_rate": 9.10783645523455e-07, "loss": 0.0426, "step": 1900 }, { "epoch": 0.7476892822025566, "grad_norm": 1.5645976066589355, "learning_rate": 9.081341357765145e-07, "loss": 0.0416, "step": 1901 }, { "epoch": 0.7480825958702065, "grad_norm": 0.8770972490310669, "learning_rate": 9.054876298489742e-07, "loss": 0.0561, "step": 1902 }, { "epoch": 0.7484759095378565, "grad_norm": 1.5209007263183594, "learning_rate": 9.02844132734737e-07, "loss": 0.0419, "step": 1903 }, { "epoch": 0.7488692232055064, "grad_norm": 3.409085512161255, "learning_rate": 9.002036494220306e-07, "loss": 0.0752, "step": 1904 }, { "epoch": 0.7492625368731564, "grad_norm": 1.448819875717163, "learning_rate": 8.975661848933945e-07, "loss": 0.0523, "step": 1905 }, { "epoch": 0.7496558505408063, "grad_norm": 0.998282790184021, "learning_rate": 8.949317441256724e-07, "loss": 0.0733, "step": 1906 }, { "epoch": 0.7500491642084562, "grad_norm": 1.4408761262893677, "learning_rate": 8.923003320900014e-07, "loss": 0.0577, "step": 1907 }, { "epoch": 0.7504424778761062, "grad_norm": 0.9130271077156067, "learning_rate": 8.896719537518048e-07, "loss": 0.0317, "step": 1908 }, { "epoch": 0.7508357915437561, "grad_norm": 1.9195144176483154, "learning_rate": 8.870466140707795e-07, "loss": 0.0666, "step": 1909 }, { "epoch": 0.7512291052114061, "grad_norm": 1.457318902015686, "learning_rate": 8.844243180008913e-07, "loss": 0.0762, "step": 1910 }, { "epoch": 0.751622418879056, "grad_norm": 1.4528069496154785, "learning_rate": 8.818050704903589e-07, "loss": 0.0423, "step": 1911 }, { "epoch": 0.752015732546706, "grad_norm": 0.849536120891571, "learning_rate": 8.791888764816514e-07, "loss": 0.0289, "step": 1912 }, { "epoch": 0.752409046214356, "grad_norm": 1.4856075048446655, "learning_rate": 8.765757409114753e-07, "loss": 0.0665, "step": 1913 }, { "epoch": 0.752802359882006, "grad_norm": 0.8997237086296082, "learning_rate": 8.739656687107656e-07, "loss": 0.0619, "step": 1914 }, { "epoch": 0.7531956735496559, "grad_norm": 0.8566966652870178, "learning_rate": 8.713586648046768e-07, "loss": 0.0476, "step": 1915 }, { "epoch": 0.7535889872173058, "grad_norm": 0.9483917355537415, "learning_rate": 8.68754734112574e-07, "loss": 0.0486, "step": 1916 }, { "epoch": 0.7539823008849558, "grad_norm": 1.0472768545150757, "learning_rate": 8.661538815480228e-07, "loss": 0.0422, "step": 1917 }, { "epoch": 0.7543756145526057, "grad_norm": 1.4821901321411133, "learning_rate": 8.635561120187813e-07, "loss": 0.0408, "step": 1918 }, { "epoch": 0.7547689282202557, "grad_norm": 0.7954731583595276, "learning_rate": 8.609614304267877e-07, "loss": 0.059, "step": 1919 }, { "epoch": 0.7551622418879056, "grad_norm": 0.9966669082641602, "learning_rate": 8.583698416681555e-07, "loss": 0.0303, "step": 1920 }, { "epoch": 0.7555555555555555, "grad_norm": 0.39692261815071106, "learning_rate": 8.557813506331616e-07, "loss": 0.0324, "step": 1921 }, { "epoch": 0.7559488692232055, "grad_norm": 1.7129300832748413, "learning_rate": 8.531959622062372e-07, "loss": 0.0397, "step": 1922 }, { "epoch": 0.7563421828908554, "grad_norm": 1.0999704599380493, "learning_rate": 8.506136812659601e-07, "loss": 0.0455, "step": 1923 }, { "epoch": 0.7567354965585054, "grad_norm": 1.2547434568405151, "learning_rate": 8.480345126850414e-07, "loss": 0.0658, "step": 1924 }, { "epoch": 0.7571288102261554, "grad_norm": 1.1041603088378906, "learning_rate": 8.454584613303227e-07, "loss": 0.0339, "step": 1925 }, { "epoch": 0.7575221238938054, "grad_norm": 0.8621834516525269, "learning_rate": 8.428855320627613e-07, "loss": 0.0294, "step": 1926 }, { "epoch": 0.7579154375614553, "grad_norm": 0.7350767254829407, "learning_rate": 8.403157297374239e-07, "loss": 0.023, "step": 1927 }, { "epoch": 0.7583087512291052, "grad_norm": 0.9072149991989136, "learning_rate": 8.377490592034779e-07, "loss": 0.0704, "step": 1928 }, { "epoch": 0.7587020648967552, "grad_norm": 0.715020477771759, "learning_rate": 8.35185525304178e-07, "loss": 0.0321, "step": 1929 }, { "epoch": 0.7590953785644051, "grad_norm": 0.7303974032402039, "learning_rate": 8.326251328768626e-07, "loss": 0.0207, "step": 1930 }, { "epoch": 0.7594886922320551, "grad_norm": 1.534783124923706, "learning_rate": 8.300678867529415e-07, "loss": 0.0715, "step": 1931 }, { "epoch": 0.759882005899705, "grad_norm": 0.6678977012634277, "learning_rate": 8.275137917578879e-07, "loss": 0.0454, "step": 1932 }, { "epoch": 0.760275319567355, "grad_norm": 0.7839411497116089, "learning_rate": 8.249628527112282e-07, "loss": 0.053, "step": 1933 }, { "epoch": 0.7606686332350049, "grad_norm": 0.6599370241165161, "learning_rate": 8.224150744265352e-07, "loss": 0.0312, "step": 1934 }, { "epoch": 0.7610619469026548, "grad_norm": 0.8593689799308777, "learning_rate": 8.198704617114143e-07, "loss": 0.0219, "step": 1935 }, { "epoch": 0.7614552605703048, "grad_norm": 1.0792686939239502, "learning_rate": 8.173290193674996e-07, "loss": 0.0688, "step": 1936 }, { "epoch": 0.7618485742379548, "grad_norm": 1.1030522584915161, "learning_rate": 8.147907521904433e-07, "loss": 0.0598, "step": 1937 }, { "epoch": 0.7622418879056048, "grad_norm": 1.4342604875564575, "learning_rate": 8.122556649699051e-07, "loss": 0.072, "step": 1938 }, { "epoch": 0.7626352015732547, "grad_norm": 1.555779218673706, "learning_rate": 8.097237624895452e-07, "loss": 0.0875, "step": 1939 }, { "epoch": 0.7630285152409046, "grad_norm": 1.7069602012634277, "learning_rate": 8.07195049527012e-07, "loss": 0.0625, "step": 1940 }, { "epoch": 0.7634218289085546, "grad_norm": 1.4105464220046997, "learning_rate": 8.046695308539376e-07, "loss": 0.0302, "step": 1941 }, { "epoch": 0.7638151425762045, "grad_norm": 0.9220629930496216, "learning_rate": 8.021472112359255e-07, "loss": 0.0788, "step": 1942 }, { "epoch": 0.7642084562438545, "grad_norm": 1.7221704721450806, "learning_rate": 7.996280954325433e-07, "loss": 0.0701, "step": 1943 }, { "epoch": 0.7646017699115044, "grad_norm": 1.240715503692627, "learning_rate": 7.971121881973126e-07, "loss": 0.0605, "step": 1944 }, { "epoch": 0.7649950835791544, "grad_norm": 1.054165005683899, "learning_rate": 7.945994942777016e-07, "loss": 0.0278, "step": 1945 }, { "epoch": 0.7653883972468043, "grad_norm": 0.3918832242488861, "learning_rate": 7.92090018415112e-07, "loss": 0.0433, "step": 1946 }, { "epoch": 0.7657817109144542, "grad_norm": 1.2010436058044434, "learning_rate": 7.895837653448759e-07, "loss": 0.0645, "step": 1947 }, { "epoch": 0.7661750245821042, "grad_norm": 0.6880310773849487, "learning_rate": 7.870807397962438e-07, "loss": 0.0466, "step": 1948 }, { "epoch": 0.7665683382497542, "grad_norm": 0.8154659867286682, "learning_rate": 7.845809464923748e-07, "loss": 0.0478, "step": 1949 }, { "epoch": 0.7669616519174042, "grad_norm": 0.7172273397445679, "learning_rate": 7.820843901503308e-07, "loss": 0.0352, "step": 1950 }, { "epoch": 0.7673549655850541, "grad_norm": 1.7781319618225098, "learning_rate": 7.79591075481062e-07, "loss": 0.0732, "step": 1951 }, { "epoch": 0.7677482792527041, "grad_norm": 0.6639533638954163, "learning_rate": 7.771010071894052e-07, "loss": 0.0179, "step": 1952 }, { "epoch": 0.768141592920354, "grad_norm": 0.8761031627655029, "learning_rate": 7.7461418997407e-07, "loss": 0.0281, "step": 1953 }, { "epoch": 0.7685349065880039, "grad_norm": 0.7496312856674194, "learning_rate": 7.721306285276309e-07, "loss": 0.053, "step": 1954 }, { "epoch": 0.7689282202556539, "grad_norm": 0.46650174260139465, "learning_rate": 7.696503275365194e-07, "loss": 0.0513, "step": 1955 }, { "epoch": 0.7693215339233038, "grad_norm": 1.1080721616744995, "learning_rate": 7.671732916810154e-07, "loss": 0.0507, "step": 1956 }, { "epoch": 0.7697148475909538, "grad_norm": 0.6540339589118958, "learning_rate": 7.646995256352346e-07, "loss": 0.028, "step": 1957 }, { "epoch": 0.7701081612586037, "grad_norm": 1.099401593208313, "learning_rate": 7.622290340671256e-07, "loss": 0.0623, "step": 1958 }, { "epoch": 0.7705014749262536, "grad_norm": 0.9163020253181458, "learning_rate": 7.597618216384576e-07, "loss": 0.0251, "step": 1959 }, { "epoch": 0.7708947885939036, "grad_norm": 1.32003915309906, "learning_rate": 7.572978930048108e-07, "loss": 0.0467, "step": 1960 }, { "epoch": 0.7712881022615536, "grad_norm": 1.0354825258255005, "learning_rate": 7.54837252815571e-07, "loss": 0.0491, "step": 1961 }, { "epoch": 0.7716814159292036, "grad_norm": 1.0285413265228271, "learning_rate": 7.523799057139158e-07, "loss": 0.0598, "step": 1962 }, { "epoch": 0.7720747295968535, "grad_norm": 1.7109252214431763, "learning_rate": 7.49925856336812e-07, "loss": 0.058, "step": 1963 }, { "epoch": 0.7724680432645035, "grad_norm": 1.3561407327651978, "learning_rate": 7.474751093150015e-07, "loss": 0.0351, "step": 1964 }, { "epoch": 0.7728613569321534, "grad_norm": 0.4150741696357727, "learning_rate": 7.450276692729957e-07, "loss": 0.0181, "step": 1965 }, { "epoch": 0.7732546705998034, "grad_norm": 1.0091959238052368, "learning_rate": 7.425835408290655e-07, "loss": 0.0403, "step": 1966 }, { "epoch": 0.7736479842674533, "grad_norm": 2.851815938949585, "learning_rate": 7.40142728595234e-07, "loss": 0.0491, "step": 1967 }, { "epoch": 0.7740412979351032, "grad_norm": 1.306333303451538, "learning_rate": 7.377052371772637e-07, "loss": 0.058, "step": 1968 }, { "epoch": 0.7744346116027532, "grad_norm": 0.8560998439788818, "learning_rate": 7.352710711746536e-07, "loss": 0.0284, "step": 1969 }, { "epoch": 0.7748279252704031, "grad_norm": 1.8746119737625122, "learning_rate": 7.328402351806269e-07, "loss": 0.0654, "step": 1970 }, { "epoch": 0.7752212389380531, "grad_norm": 1.0875734090805054, "learning_rate": 7.304127337821229e-07, "loss": 0.0402, "step": 1971 }, { "epoch": 0.775614552605703, "grad_norm": 0.8440957069396973, "learning_rate": 7.279885715597896e-07, "loss": 0.0367, "step": 1972 }, { "epoch": 0.776007866273353, "grad_norm": 1.528245210647583, "learning_rate": 7.255677530879713e-07, "loss": 0.0336, "step": 1973 }, { "epoch": 0.776401179941003, "grad_norm": 1.6772621870040894, "learning_rate": 7.231502829347056e-07, "loss": 0.0388, "step": 1974 }, { "epoch": 0.7767944936086529, "grad_norm": 0.85129314661026, "learning_rate": 7.207361656617112e-07, "loss": 0.0521, "step": 1975 }, { "epoch": 0.7771878072763029, "grad_norm": 1.1908273696899414, "learning_rate": 7.183254058243791e-07, "loss": 0.0419, "step": 1976 }, { "epoch": 0.7775811209439528, "grad_norm": 1.2314374446868896, "learning_rate": 7.159180079717656e-07, "loss": 0.044, "step": 1977 }, { "epoch": 0.7779744346116028, "grad_norm": 1.7192610502243042, "learning_rate": 7.135139766465838e-07, "loss": 0.0663, "step": 1978 }, { "epoch": 0.7783677482792527, "grad_norm": 1.5432205200195312, "learning_rate": 7.111133163851916e-07, "loss": 0.0267, "step": 1979 }, { "epoch": 0.7787610619469026, "grad_norm": 0.759152352809906, "learning_rate": 7.087160317175881e-07, "loss": 0.0299, "step": 1980 }, { "epoch": 0.7791543756145526, "grad_norm": 0.9122269749641418, "learning_rate": 7.06322127167402e-07, "loss": 0.0301, "step": 1981 }, { "epoch": 0.7795476892822025, "grad_norm": 0.7516564130783081, "learning_rate": 7.03931607251884e-07, "loss": 0.0627, "step": 1982 }, { "epoch": 0.7799410029498525, "grad_norm": 1.2953605651855469, "learning_rate": 7.015444764818988e-07, "loss": 0.0571, "step": 1983 }, { "epoch": 0.7803343166175024, "grad_norm": 0.8770161271095276, "learning_rate": 6.991607393619129e-07, "loss": 0.0322, "step": 1984 }, { "epoch": 0.7807276302851525, "grad_norm": 0.8347287774085999, "learning_rate": 6.967804003899925e-07, "loss": 0.0497, "step": 1985 }, { "epoch": 0.7811209439528024, "grad_norm": 0.5185628533363342, "learning_rate": 6.944034640577896e-07, "loss": 0.0292, "step": 1986 }, { "epoch": 0.7815142576204523, "grad_norm": 0.9084299802780151, "learning_rate": 6.920299348505365e-07, "loss": 0.0343, "step": 1987 }, { "epoch": 0.7819075712881023, "grad_norm": 1.2148305177688599, "learning_rate": 6.896598172470356e-07, "loss": 0.07, "step": 1988 }, { "epoch": 0.7823008849557522, "grad_norm": 1.0693104267120361, "learning_rate": 6.872931157196519e-07, "loss": 0.0509, "step": 1989 }, { "epoch": 0.7826941986234022, "grad_norm": 0.5483916997909546, "learning_rate": 6.849298347343044e-07, "loss": 0.04, "step": 1990 }, { "epoch": 0.7830875122910521, "grad_norm": 0.9246038794517517, "learning_rate": 6.825699787504586e-07, "loss": 0.0602, "step": 1991 }, { "epoch": 0.783480825958702, "grad_norm": 0.7501392960548401, "learning_rate": 6.802135522211142e-07, "loss": 0.0331, "step": 1992 }, { "epoch": 0.783874139626352, "grad_norm": 0.8467764854431152, "learning_rate": 6.778605595928025e-07, "loss": 0.0325, "step": 1993 }, { "epoch": 0.7842674532940019, "grad_norm": 0.5727487206459045, "learning_rate": 6.755110053055738e-07, "loss": 0.0264, "step": 1994 }, { "epoch": 0.7846607669616519, "grad_norm": 1.1488757133483887, "learning_rate": 6.731648937929911e-07, "loss": 0.0548, "step": 1995 }, { "epoch": 0.7850540806293018, "grad_norm": 0.7147387862205505, "learning_rate": 6.708222294821196e-07, "loss": 0.0548, "step": 1996 }, { "epoch": 0.7854473942969519, "grad_norm": 1.0995930433273315, "learning_rate": 6.684830167935207e-07, "loss": 0.0476, "step": 1997 }, { "epoch": 0.7858407079646018, "grad_norm": 1.1355059146881104, "learning_rate": 6.66147260141243e-07, "loss": 0.0501, "step": 1998 }, { "epoch": 0.7862340216322518, "grad_norm": 0.7553796768188477, "learning_rate": 6.638149639328134e-07, "loss": 0.0686, "step": 1999 }, { "epoch": 0.7866273352999017, "grad_norm": 0.8902336359024048, "learning_rate": 6.614861325692277e-07, "loss": 0.0349, "step": 2000 }, { "epoch": 0.7870206489675516, "grad_norm": 1.090766429901123, "learning_rate": 6.591607704449446e-07, "loss": 0.0527, "step": 2001 }, { "epoch": 0.7874139626352016, "grad_norm": 1.142582654953003, "learning_rate": 6.568388819478769e-07, "loss": 0.0537, "step": 2002 }, { "epoch": 0.7878072763028515, "grad_norm": 1.449288010597229, "learning_rate": 6.545204714593825e-07, "loss": 0.0587, "step": 2003 }, { "epoch": 0.7882005899705015, "grad_norm": 1.7187999486923218, "learning_rate": 6.522055433542557e-07, "loss": 0.0624, "step": 2004 }, { "epoch": 0.7885939036381514, "grad_norm": 1.5539288520812988, "learning_rate": 6.49894102000721e-07, "loss": 0.0553, "step": 2005 }, { "epoch": 0.7889872173058013, "grad_norm": 1.4520833492279053, "learning_rate": 6.47586151760421e-07, "loss": 0.0297, "step": 2006 }, { "epoch": 0.7893805309734513, "grad_norm": 1.2936962842941284, "learning_rate": 6.452816969884127e-07, "loss": 0.0335, "step": 2007 }, { "epoch": 0.7897738446411012, "grad_norm": 1.2932931184768677, "learning_rate": 6.429807420331568e-07, "loss": 0.0622, "step": 2008 }, { "epoch": 0.7901671583087513, "grad_norm": 0.9521369934082031, "learning_rate": 6.406832912365101e-07, "loss": 0.0669, "step": 2009 }, { "epoch": 0.7905604719764012, "grad_norm": 0.9570633172988892, "learning_rate": 6.383893489337172e-07, "loss": 0.054, "step": 2010 }, { "epoch": 0.7909537856440512, "grad_norm": 0.7929260730743408, "learning_rate": 6.360989194534004e-07, "loss": 0.028, "step": 2011 }, { "epoch": 0.7913470993117011, "grad_norm": 1.2527369260787964, "learning_rate": 6.338120071175558e-07, "loss": 0.0631, "step": 2012 }, { "epoch": 0.791740412979351, "grad_norm": 0.9790352582931519, "learning_rate": 6.315286162415412e-07, "loss": 0.0485, "step": 2013 }, { "epoch": 0.792133726647001, "grad_norm": 1.417540431022644, "learning_rate": 6.292487511340709e-07, "loss": 0.0575, "step": 2014 }, { "epoch": 0.7925270403146509, "grad_norm": 1.3456201553344727, "learning_rate": 6.269724160972043e-07, "loss": 0.0709, "step": 2015 }, { "epoch": 0.7929203539823009, "grad_norm": 1.3013477325439453, "learning_rate": 6.246996154263421e-07, "loss": 0.0571, "step": 2016 }, { "epoch": 0.7933136676499508, "grad_norm": 1.0679081678390503, "learning_rate": 6.224303534102125e-07, "loss": 0.0395, "step": 2017 }, { "epoch": 0.7937069813176008, "grad_norm": 1.3359334468841553, "learning_rate": 6.201646343308685e-07, "loss": 0.0439, "step": 2018 }, { "epoch": 0.7941002949852507, "grad_norm": 1.4549192190170288, "learning_rate": 6.179024624636772e-07, "loss": 0.057, "step": 2019 }, { "epoch": 0.7944936086529006, "grad_norm": 0.8267070055007935, "learning_rate": 6.156438420773125e-07, "loss": 0.0207, "step": 2020 }, { "epoch": 0.7948869223205507, "grad_norm": 1.1873496770858765, "learning_rate": 6.133887774337471e-07, "loss": 0.0449, "step": 2021 }, { "epoch": 0.7952802359882006, "grad_norm": 1.971118450164795, "learning_rate": 6.111372727882417e-07, "loss": 0.0444, "step": 2022 }, { "epoch": 0.7956735496558506, "grad_norm": 0.5039023160934448, "learning_rate": 6.088893323893419e-07, "loss": 0.0165, "step": 2023 }, { "epoch": 0.7960668633235005, "grad_norm": 1.2124491930007935, "learning_rate": 6.066449604788666e-07, "loss": 0.0384, "step": 2024 }, { "epoch": 0.7964601769911505, "grad_norm": 1.4836233854293823, "learning_rate": 6.044041612919016e-07, "loss": 0.0711, "step": 2025 }, { "epoch": 0.7968534906588004, "grad_norm": 1.4890559911727905, "learning_rate": 6.021669390567902e-07, "loss": 0.048, "step": 2026 }, { "epoch": 0.7972468043264503, "grad_norm": 0.5430221557617188, "learning_rate": 5.999332979951272e-07, "loss": 0.049, "step": 2027 }, { "epoch": 0.7976401179941003, "grad_norm": 0.9645549654960632, "learning_rate": 5.977032423217482e-07, "loss": 0.0201, "step": 2028 }, { "epoch": 0.7980334316617502, "grad_norm": 1.7599254846572876, "learning_rate": 5.954767762447244e-07, "loss": 0.0524, "step": 2029 }, { "epoch": 0.7984267453294002, "grad_norm": 0.6832358241081238, "learning_rate": 5.932539039653535e-07, "loss": 0.0451, "step": 2030 }, { "epoch": 0.7988200589970501, "grad_norm": 0.5469837188720703, "learning_rate": 5.910346296781511e-07, "loss": 0.0342, "step": 2031 }, { "epoch": 0.7992133726647, "grad_norm": 1.466138482093811, "learning_rate": 5.888189575708453e-07, "loss": 0.0619, "step": 2032 }, { "epoch": 0.7996066863323501, "grad_norm": 1.1846930980682373, "learning_rate": 5.866068918243634e-07, "loss": 0.0527, "step": 2033 }, { "epoch": 0.8, "grad_norm": 0.8236525058746338, "learning_rate": 5.843984366128308e-07, "loss": 0.0427, "step": 2034 }, { "epoch": 0.80039331366765, "grad_norm": 0.8086917996406555, "learning_rate": 5.821935961035589e-07, "loss": 0.0743, "step": 2035 }, { "epoch": 0.8007866273352999, "grad_norm": 1.3642960786819458, "learning_rate": 5.799923744570376e-07, "loss": 0.0609, "step": 2036 }, { "epoch": 0.8011799410029499, "grad_norm": 1.4578794240951538, "learning_rate": 5.777947758269295e-07, "loss": 0.0828, "step": 2037 }, { "epoch": 0.8015732546705998, "grad_norm": 0.5745184421539307, "learning_rate": 5.756008043600594e-07, "loss": 0.0444, "step": 2038 }, { "epoch": 0.8019665683382498, "grad_norm": 2.3881709575653076, "learning_rate": 5.734104641964075e-07, "loss": 0.074, "step": 2039 }, { "epoch": 0.8023598820058997, "grad_norm": 1.0504474639892578, "learning_rate": 5.712237594691028e-07, "loss": 0.0573, "step": 2040 }, { "epoch": 0.8027531956735496, "grad_norm": 1.7040578126907349, "learning_rate": 5.690406943044138e-07, "loss": 0.0472, "step": 2041 }, { "epoch": 0.8031465093411996, "grad_norm": 0.9709568619728088, "learning_rate": 5.668612728217412e-07, "loss": 0.0305, "step": 2042 }, { "epoch": 0.8035398230088495, "grad_norm": 2.0475189685821533, "learning_rate": 5.646854991336112e-07, "loss": 0.0661, "step": 2043 }, { "epoch": 0.8039331366764995, "grad_norm": 1.4109443426132202, "learning_rate": 5.625133773456639e-07, "loss": 0.0698, "step": 2044 }, { "epoch": 0.8043264503441495, "grad_norm": 0.8161342740058899, "learning_rate": 5.603449115566511e-07, "loss": 0.0417, "step": 2045 }, { "epoch": 0.8047197640117995, "grad_norm": 1.1740028858184814, "learning_rate": 5.581801058584252e-07, "loss": 0.0444, "step": 2046 }, { "epoch": 0.8051130776794494, "grad_norm": 2.580334424972534, "learning_rate": 5.560189643359312e-07, "loss": 0.0988, "step": 2047 }, { "epoch": 0.8055063913470993, "grad_norm": 0.8429194092750549, "learning_rate": 5.538614910672005e-07, "loss": 0.0312, "step": 2048 }, { "epoch": 0.8058997050147493, "grad_norm": 0.8115060925483704, "learning_rate": 5.517076901233434e-07, "loss": 0.0561, "step": 2049 }, { "epoch": 0.8062930186823992, "grad_norm": 0.5982792377471924, "learning_rate": 5.495575655685382e-07, "loss": 0.0369, "step": 2050 }, { "epoch": 0.8066863323500492, "grad_norm": 1.5597193241119385, "learning_rate": 5.474111214600278e-07, "loss": 0.0701, "step": 2051 }, { "epoch": 0.8070796460176991, "grad_norm": 1.3873978853225708, "learning_rate": 5.452683618481103e-07, "loss": 0.0372, "step": 2052 }, { "epoch": 0.807472959685349, "grad_norm": 0.9317770004272461, "learning_rate": 5.431292907761305e-07, "loss": 0.0433, "step": 2053 }, { "epoch": 0.807866273352999, "grad_norm": 1.736678957939148, "learning_rate": 5.409939122804736e-07, "loss": 0.0562, "step": 2054 }, { "epoch": 0.8082595870206489, "grad_norm": 1.1516214609146118, "learning_rate": 5.388622303905558e-07, "loss": 0.0438, "step": 2055 }, { "epoch": 0.8086529006882989, "grad_norm": 0.855049192905426, "learning_rate": 5.367342491288186e-07, "loss": 0.0389, "step": 2056 }, { "epoch": 0.8090462143559489, "grad_norm": 0.8584917187690735, "learning_rate": 5.346099725107213e-07, "loss": 0.0686, "step": 2057 }, { "epoch": 0.8094395280235989, "grad_norm": 1.1630586385726929, "learning_rate": 5.324894045447312e-07, "loss": 0.0361, "step": 2058 }, { "epoch": 0.8098328416912488, "grad_norm": 1.2655314207077026, "learning_rate": 5.303725492323194e-07, "loss": 0.0284, "step": 2059 }, { "epoch": 0.8102261553588987, "grad_norm": 1.1947369575500488, "learning_rate": 5.282594105679481e-07, "loss": 0.0562, "step": 2060 }, { "epoch": 0.8106194690265487, "grad_norm": 0.7869384288787842, "learning_rate": 5.261499925390692e-07, "loss": 0.0407, "step": 2061 }, { "epoch": 0.8110127826941986, "grad_norm": 1.6076072454452515, "learning_rate": 5.240442991261127e-07, "loss": 0.0384, "step": 2062 }, { "epoch": 0.8114060963618486, "grad_norm": 2.237993001937866, "learning_rate": 5.219423343024804e-07, "loss": 0.0539, "step": 2063 }, { "epoch": 0.8117994100294985, "grad_norm": 0.8259546756744385, "learning_rate": 5.198441020345382e-07, "loss": 0.0436, "step": 2064 }, { "epoch": 0.8121927236971485, "grad_norm": 1.2509441375732422, "learning_rate": 5.177496062816101e-07, "loss": 0.0462, "step": 2065 }, { "epoch": 0.8125860373647984, "grad_norm": 1.06137216091156, "learning_rate": 5.156588509959659e-07, "loss": 0.0339, "step": 2066 }, { "epoch": 0.8129793510324483, "grad_norm": 0.7373847365379333, "learning_rate": 5.13571840122821e-07, "loss": 0.0301, "step": 2067 }, { "epoch": 0.8133726647000983, "grad_norm": 1.1653954982757568, "learning_rate": 5.114885776003234e-07, "loss": 0.0427, "step": 2068 }, { "epoch": 0.8137659783677483, "grad_norm": 1.518700122833252, "learning_rate": 5.094090673595478e-07, "loss": 0.0568, "step": 2069 }, { "epoch": 0.8141592920353983, "grad_norm": 0.9491556286811829, "learning_rate": 5.073333133244896e-07, "loss": 0.0296, "step": 2070 }, { "epoch": 0.8145526057030482, "grad_norm": 1.12187922000885, "learning_rate": 5.052613194120554e-07, "loss": 0.0625, "step": 2071 }, { "epoch": 0.8149459193706982, "grad_norm": 0.9381184577941895, "learning_rate": 5.031930895320569e-07, "loss": 0.0318, "step": 2072 }, { "epoch": 0.8153392330383481, "grad_norm": 0.8680362701416016, "learning_rate": 5.011286275872021e-07, "loss": 0.0631, "step": 2073 }, { "epoch": 0.815732546705998, "grad_norm": 1.5543493032455444, "learning_rate": 4.990679374730905e-07, "loss": 0.0754, "step": 2074 }, { "epoch": 0.816125860373648, "grad_norm": 1.3975200653076172, "learning_rate": 4.970110230782035e-07, "loss": 0.072, "step": 2075 }, { "epoch": 0.8165191740412979, "grad_norm": 0.8037746548652649, "learning_rate": 4.949578882838982e-07, "loss": 0.0385, "step": 2076 }, { "epoch": 0.8169124877089479, "grad_norm": 0.7833993434906006, "learning_rate": 4.929085369643988e-07, "loss": 0.0418, "step": 2077 }, { "epoch": 0.8173058013765978, "grad_norm": 0.8177001476287842, "learning_rate": 4.908629729867908e-07, "loss": 0.0485, "step": 2078 }, { "epoch": 0.8176991150442477, "grad_norm": 0.7933450937271118, "learning_rate": 4.88821200211014e-07, "loss": 0.0466, "step": 2079 }, { "epoch": 0.8180924287118977, "grad_norm": 0.5968790054321289, "learning_rate": 4.867832224898517e-07, "loss": 0.0253, "step": 2080 }, { "epoch": 0.8184857423795477, "grad_norm": 1.4022417068481445, "learning_rate": 4.847490436689281e-07, "loss": 0.0431, "step": 2081 }, { "epoch": 0.8188790560471977, "grad_norm": 2.319401264190674, "learning_rate": 4.827186675866985e-07, "loss": 0.0493, "step": 2082 }, { "epoch": 0.8192723697148476, "grad_norm": 1.0119627714157104, "learning_rate": 4.806920980744426e-07, "loss": 0.0606, "step": 2083 }, { "epoch": 0.8196656833824976, "grad_norm": 1.2110787630081177, "learning_rate": 4.786693389562566e-07, "loss": 0.0582, "step": 2084 }, { "epoch": 0.8200589970501475, "grad_norm": 0.7724167704582214, "learning_rate": 4.7665039404904747e-07, "loss": 0.0457, "step": 2085 }, { "epoch": 0.8204523107177975, "grad_norm": 1.5843499898910522, "learning_rate": 4.746352671625237e-07, "loss": 0.0482, "step": 2086 }, { "epoch": 0.8208456243854474, "grad_norm": 1.3220843076705933, "learning_rate": 4.72623962099191e-07, "loss": 0.0505, "step": 2087 }, { "epoch": 0.8212389380530973, "grad_norm": 1.6696242094039917, "learning_rate": 4.7061648265434053e-07, "loss": 0.0587, "step": 2088 }, { "epoch": 0.8216322517207473, "grad_norm": 1.341960072517395, "learning_rate": 4.6861283261604745e-07, "loss": 0.0781, "step": 2089 }, { "epoch": 0.8220255653883972, "grad_norm": 1.6525554656982422, "learning_rate": 4.666130157651594e-07, "loss": 0.052, "step": 2090 }, { "epoch": 0.8224188790560472, "grad_norm": 1.0084091424942017, "learning_rate": 4.6461703587529106e-07, "loss": 0.0354, "step": 2091 }, { "epoch": 0.8228121927236971, "grad_norm": 0.8987352848052979, "learning_rate": 4.62624896712818e-07, "loss": 0.0351, "step": 2092 }, { "epoch": 0.8232055063913472, "grad_norm": 1.0085314512252808, "learning_rate": 4.6063660203686635e-07, "loss": 0.0459, "step": 2093 }, { "epoch": 0.8235988200589971, "grad_norm": 1.4987783432006836, "learning_rate": 4.586521555993087e-07, "loss": 0.0771, "step": 2094 }, { "epoch": 0.823992133726647, "grad_norm": 1.5976486206054688, "learning_rate": 4.5667156114475695e-07, "loss": 0.0766, "step": 2095 }, { "epoch": 0.824385447394297, "grad_norm": 0.9721060395240784, "learning_rate": 4.5469482241055324e-07, "loss": 0.0514, "step": 2096 }, { "epoch": 0.8247787610619469, "grad_norm": 0.835397481918335, "learning_rate": 4.527219431267646e-07, "loss": 0.0352, "step": 2097 }, { "epoch": 0.8251720747295969, "grad_norm": 1.1280697584152222, "learning_rate": 4.507529270161759e-07, "loss": 0.0712, "step": 2098 }, { "epoch": 0.8255653883972468, "grad_norm": 1.8154939413070679, "learning_rate": 4.4878777779428034e-07, "loss": 0.0918, "step": 2099 }, { "epoch": 0.8259587020648967, "grad_norm": 1.067765474319458, "learning_rate": 4.4682649916927614e-07, "loss": 0.0357, "step": 2100 }, { "epoch": 0.8263520157325467, "grad_norm": 1.0095484256744385, "learning_rate": 4.4486909484205725e-07, "loss": 0.0315, "step": 2101 }, { "epoch": 0.8267453294001966, "grad_norm": 1.7903807163238525, "learning_rate": 4.429155685062073e-07, "loss": 0.0598, "step": 2102 }, { "epoch": 0.8271386430678466, "grad_norm": 1.5948070287704468, "learning_rate": 4.409659238479919e-07, "loss": 0.0408, "step": 2103 }, { "epoch": 0.8275319567354965, "grad_norm": 0.805156946182251, "learning_rate": 4.39020164546351e-07, "loss": 0.0448, "step": 2104 }, { "epoch": 0.8279252704031466, "grad_norm": 0.4440039098262787, "learning_rate": 4.370782942728946e-07, "loss": 0.0279, "step": 2105 }, { "epoch": 0.8283185840707965, "grad_norm": 0.9887676239013672, "learning_rate": 4.3514031669189325e-07, "loss": 0.0706, "step": 2106 }, { "epoch": 0.8287118977384464, "grad_norm": 1.1825933456420898, "learning_rate": 4.3320623546027283e-07, "loss": 0.0608, "step": 2107 }, { "epoch": 0.8291052114060964, "grad_norm": 1.8713337182998657, "learning_rate": 4.312760542276059e-07, "loss": 0.049, "step": 2108 }, { "epoch": 0.8294985250737463, "grad_norm": 0.9182631969451904, "learning_rate": 4.293497766361068e-07, "loss": 0.0436, "step": 2109 }, { "epoch": 0.8298918387413963, "grad_norm": 1.1083096265792847, "learning_rate": 4.2742740632062243e-07, "loss": 0.0483, "step": 2110 }, { "epoch": 0.8302851524090462, "grad_norm": 2.0837628841400146, "learning_rate": 4.255089469086279e-07, "loss": 0.0663, "step": 2111 }, { "epoch": 0.8306784660766962, "grad_norm": 1.2065215110778809, "learning_rate": 4.235944020202182e-07, "loss": 0.0673, "step": 2112 }, { "epoch": 0.8310717797443461, "grad_norm": 1.3495663404464722, "learning_rate": 4.216837752681019e-07, "loss": 0.0589, "step": 2113 }, { "epoch": 0.831465093411996, "grad_norm": 0.8407555818557739, "learning_rate": 4.19777070257594e-07, "loss": 0.0309, "step": 2114 }, { "epoch": 0.831858407079646, "grad_norm": 0.9763451814651489, "learning_rate": 4.1787429058660845e-07, "loss": 0.0231, "step": 2115 }, { "epoch": 0.8322517207472959, "grad_norm": 1.1487807035446167, "learning_rate": 4.159754398456531e-07, "loss": 0.0582, "step": 2116 }, { "epoch": 0.832645034414946, "grad_norm": 0.9778567552566528, "learning_rate": 4.14080521617822e-07, "loss": 0.0349, "step": 2117 }, { "epoch": 0.8330383480825959, "grad_norm": 1.1251294612884521, "learning_rate": 4.121895394787881e-07, "loss": 0.0608, "step": 2118 }, { "epoch": 0.8334316617502459, "grad_norm": 0.8375036716461182, "learning_rate": 4.103024969967981e-07, "loss": 0.0406, "step": 2119 }, { "epoch": 0.8338249754178958, "grad_norm": 1.1409391164779663, "learning_rate": 4.084193977326625e-07, "loss": 0.0545, "step": 2120 }, { "epoch": 0.8342182890855457, "grad_norm": 1.0144537687301636, "learning_rate": 4.0654024523975323e-07, "loss": 0.076, "step": 2121 }, { "epoch": 0.8346116027531957, "grad_norm": 1.7752301692962646, "learning_rate": 4.0466504306399366e-07, "loss": 0.0647, "step": 2122 }, { "epoch": 0.8350049164208456, "grad_norm": 1.1848422288894653, "learning_rate": 4.027937947438532e-07, "loss": 0.0642, "step": 2123 }, { "epoch": 0.8353982300884956, "grad_norm": 0.8530738353729248, "learning_rate": 4.009265038103402e-07, "loss": 0.0407, "step": 2124 }, { "epoch": 0.8357915437561455, "grad_norm": 0.9213998317718506, "learning_rate": 3.9906317378699684e-07, "loss": 0.0306, "step": 2125 }, { "epoch": 0.8361848574237954, "grad_norm": 0.8134070038795471, "learning_rate": 3.972038081898885e-07, "loss": 0.0378, "step": 2126 }, { "epoch": 0.8365781710914454, "grad_norm": 1.0904289484024048, "learning_rate": 3.9534841052760174e-07, "loss": 0.032, "step": 2127 }, { "epoch": 0.8369714847590953, "grad_norm": 2.0691423416137695, "learning_rate": 3.9349698430123566e-07, "loss": 0.0737, "step": 2128 }, { "epoch": 0.8373647984267454, "grad_norm": 1.1641324758529663, "learning_rate": 3.9164953300439456e-07, "loss": 0.0546, "step": 2129 }, { "epoch": 0.8377581120943953, "grad_norm": 0.9116164445877075, "learning_rate": 3.898060601231832e-07, "loss": 0.0533, "step": 2130 }, { "epoch": 0.8381514257620453, "grad_norm": 1.0761325359344482, "learning_rate": 3.879665691361975e-07, "loss": 0.0465, "step": 2131 }, { "epoch": 0.8385447394296952, "grad_norm": 1.2517597675323486, "learning_rate": 3.861310635145207e-07, "loss": 0.0509, "step": 2132 }, { "epoch": 0.8389380530973451, "grad_norm": 0.7470773458480835, "learning_rate": 3.8429954672171613e-07, "loss": 0.0452, "step": 2133 }, { "epoch": 0.8393313667649951, "grad_norm": 1.572190284729004, "learning_rate": 3.824720222138192e-07, "loss": 0.0388, "step": 2134 }, { "epoch": 0.839724680432645, "grad_norm": 1.1324615478515625, "learning_rate": 3.806484934393331e-07, "loss": 0.0696, "step": 2135 }, { "epoch": 0.840117994100295, "grad_norm": 1.03518807888031, "learning_rate": 3.788289638392206e-07, "loss": 0.0333, "step": 2136 }, { "epoch": 0.8405113077679449, "grad_norm": 1.2855054140090942, "learning_rate": 3.7701343684689725e-07, "loss": 0.0573, "step": 2137 }, { "epoch": 0.8409046214355949, "grad_norm": 1.5672320127487183, "learning_rate": 3.7520191588822695e-07, "loss": 0.0618, "step": 2138 }, { "epoch": 0.8412979351032448, "grad_norm": 1.3046908378601074, "learning_rate": 3.7339440438151383e-07, "loss": 0.0633, "step": 2139 }, { "epoch": 0.8416912487708947, "grad_norm": 0.9728895425796509, "learning_rate": 3.7159090573749693e-07, "loss": 0.0287, "step": 2140 }, { "epoch": 0.8420845624385448, "grad_norm": 1.4470866918563843, "learning_rate": 3.6979142335934246e-07, "loss": 0.0439, "step": 2141 }, { "epoch": 0.8424778761061947, "grad_norm": 0.802937924861908, "learning_rate": 3.67995960642637e-07, "loss": 0.0316, "step": 2142 }, { "epoch": 0.8428711897738447, "grad_norm": 0.8089593052864075, "learning_rate": 3.6620452097538424e-07, "loss": 0.0506, "step": 2143 }, { "epoch": 0.8432645034414946, "grad_norm": 0.9571702480316162, "learning_rate": 3.644171077379949e-07, "loss": 0.0273, "step": 2144 }, { "epoch": 0.8436578171091446, "grad_norm": 1.022767186164856, "learning_rate": 3.6263372430328266e-07, "loss": 0.0497, "step": 2145 }, { "epoch": 0.8440511307767945, "grad_norm": 1.133183479309082, "learning_rate": 3.6085437403645645e-07, "loss": 0.0375, "step": 2146 }, { "epoch": 0.8444444444444444, "grad_norm": 1.603365421295166, "learning_rate": 3.5907906029511606e-07, "loss": 0.0535, "step": 2147 }, { "epoch": 0.8448377581120944, "grad_norm": 1.052833080291748, "learning_rate": 3.573077864292421e-07, "loss": 0.0419, "step": 2148 }, { "epoch": 0.8452310717797443, "grad_norm": 0.8957949280738831, "learning_rate": 3.555405557811936e-07, "loss": 0.054, "step": 2149 }, { "epoch": 0.8456243854473943, "grad_norm": 1.3401049375534058, "learning_rate": 3.537773716857004e-07, "loss": 0.0558, "step": 2150 }, { "epoch": 0.8460176991150442, "grad_norm": 1.3811299800872803, "learning_rate": 3.5201823746985554e-07, "loss": 0.0436, "step": 2151 }, { "epoch": 0.8464110127826941, "grad_norm": 1.3221920728683472, "learning_rate": 3.5026315645311114e-07, "loss": 0.0679, "step": 2152 }, { "epoch": 0.8468043264503442, "grad_norm": 0.608182966709137, "learning_rate": 3.485121319472695e-07, "loss": 0.0624, "step": 2153 }, { "epoch": 0.8471976401179941, "grad_norm": 0.8964172601699829, "learning_rate": 3.4676516725647953e-07, "loss": 0.0394, "step": 2154 }, { "epoch": 0.8475909537856441, "grad_norm": 0.7584964632987976, "learning_rate": 3.450222656772292e-07, "loss": 0.0484, "step": 2155 }, { "epoch": 0.847984267453294, "grad_norm": 0.3789440095424652, "learning_rate": 3.43283430498339e-07, "loss": 0.0277, "step": 2156 }, { "epoch": 0.848377581120944, "grad_norm": 0.7871941924095154, "learning_rate": 3.4154866500095695e-07, "loss": 0.0493, "step": 2157 }, { "epoch": 0.8487708947885939, "grad_norm": 1.302708625793457, "learning_rate": 3.3981797245855096e-07, "loss": 0.0799, "step": 2158 }, { "epoch": 0.8491642084562439, "grad_norm": 0.7635212540626526, "learning_rate": 3.380913561369037e-07, "loss": 0.0427, "step": 2159 }, { "epoch": 0.8495575221238938, "grad_norm": 0.8605564832687378, "learning_rate": 3.363688192941067e-07, "loss": 0.0462, "step": 2160 }, { "epoch": 0.8499508357915437, "grad_norm": 0.9630613923072815, "learning_rate": 3.346503651805513e-07, "loss": 0.0637, "step": 2161 }, { "epoch": 0.8503441494591937, "grad_norm": 1.0170080661773682, "learning_rate": 3.329359970389279e-07, "loss": 0.061, "step": 2162 }, { "epoch": 0.8507374631268436, "grad_norm": 0.8377442359924316, "learning_rate": 3.312257181042142e-07, "loss": 0.0449, "step": 2163 }, { "epoch": 0.8511307767944936, "grad_norm": 0.9564546346664429, "learning_rate": 3.2951953160367365e-07, "loss": 0.0496, "step": 2164 }, { "epoch": 0.8515240904621436, "grad_norm": 0.5969823002815247, "learning_rate": 3.2781744075684576e-07, "loss": 0.0404, "step": 2165 }, { "epoch": 0.8519174041297936, "grad_norm": 1.0183027982711792, "learning_rate": 3.261194487755426e-07, "loss": 0.0563, "step": 2166 }, { "epoch": 0.8523107177974435, "grad_norm": 1.3610613346099854, "learning_rate": 3.2442555886384145e-07, "loss": 0.0791, "step": 2167 }, { "epoch": 0.8527040314650934, "grad_norm": 0.7566685080528259, "learning_rate": 3.2273577421807976e-07, "loss": 0.0415, "step": 2168 }, { "epoch": 0.8530973451327434, "grad_norm": 1.1211597919464111, "learning_rate": 3.2105009802684636e-07, "loss": 0.0874, "step": 2169 }, { "epoch": 0.8534906588003933, "grad_norm": 1.6669408082962036, "learning_rate": 3.1936853347097923e-07, "loss": 0.0521, "step": 2170 }, { "epoch": 0.8538839724680433, "grad_norm": 0.9726613163948059, "learning_rate": 3.1769108372355804e-07, "loss": 0.0457, "step": 2171 }, { "epoch": 0.8542772861356932, "grad_norm": 1.5157469511032104, "learning_rate": 3.1601775194989693e-07, "loss": 0.0574, "step": 2172 }, { "epoch": 0.8546705998033431, "grad_norm": 2.319978713989258, "learning_rate": 3.143485413075398e-07, "loss": 0.0604, "step": 2173 }, { "epoch": 0.8550639134709931, "grad_norm": 1.160510778427124, "learning_rate": 3.1268345494625486e-07, "loss": 0.0454, "step": 2174 }, { "epoch": 0.855457227138643, "grad_norm": 1.0284311771392822, "learning_rate": 3.1102249600802573e-07, "loss": 0.0375, "step": 2175 }, { "epoch": 0.855850540806293, "grad_norm": 0.7068095207214355, "learning_rate": 3.093656676270501e-07, "loss": 0.0409, "step": 2176 }, { "epoch": 0.856243854473943, "grad_norm": 0.8698954582214355, "learning_rate": 3.0771297292972986e-07, "loss": 0.0547, "step": 2177 }, { "epoch": 0.856637168141593, "grad_norm": 0.7371048331260681, "learning_rate": 3.0606441503466753e-07, "loss": 0.0661, "step": 2178 }, { "epoch": 0.8570304818092429, "grad_norm": 0.6116827726364136, "learning_rate": 3.044199970526593e-07, "loss": 0.0199, "step": 2179 }, { "epoch": 0.8574237954768928, "grad_norm": 0.9910300374031067, "learning_rate": 3.027797220866896e-07, "loss": 0.0454, "step": 2180 }, { "epoch": 0.8578171091445428, "grad_norm": 0.9253597855567932, "learning_rate": 3.01143593231924e-07, "loss": 0.0465, "step": 2181 }, { "epoch": 0.8582104228121927, "grad_norm": 0.6476548314094543, "learning_rate": 2.995116135757059e-07, "loss": 0.0385, "step": 2182 }, { "epoch": 0.8586037364798427, "grad_norm": 0.8749169707298279, "learning_rate": 2.978837861975484e-07, "loss": 0.0474, "step": 2183 }, { "epoch": 0.8589970501474926, "grad_norm": 1.4006898403167725, "learning_rate": 2.962601141691296e-07, "loss": 0.0511, "step": 2184 }, { "epoch": 0.8593903638151426, "grad_norm": 0.8508985638618469, "learning_rate": 2.9464060055428703e-07, "loss": 0.0549, "step": 2185 }, { "epoch": 0.8597836774827925, "grad_norm": 1.1002285480499268, "learning_rate": 2.930252484090101e-07, "loss": 0.0283, "step": 2186 }, { "epoch": 0.8601769911504424, "grad_norm": 0.8702027201652527, "learning_rate": 2.9141406078143644e-07, "loss": 0.0605, "step": 2187 }, { "epoch": 0.8605703048180924, "grad_norm": 0.79606693983078, "learning_rate": 2.8980704071184557e-07, "loss": 0.0598, "step": 2188 }, { "epoch": 0.8609636184857424, "grad_norm": 1.1964335441589355, "learning_rate": 2.882041912326525e-07, "loss": 0.046, "step": 2189 }, { "epoch": 0.8613569321533924, "grad_norm": 1.1686105728149414, "learning_rate": 2.8660551536840277e-07, "loss": 0.0329, "step": 2190 }, { "epoch": 0.8617502458210423, "grad_norm": 0.858632504940033, "learning_rate": 2.8501101613576526e-07, "loss": 0.0661, "step": 2191 }, { "epoch": 0.8621435594886923, "grad_norm": 0.984893262386322, "learning_rate": 2.834206965435293e-07, "loss": 0.0351, "step": 2192 }, { "epoch": 0.8625368731563422, "grad_norm": 1.3127596378326416, "learning_rate": 2.818345595925959e-07, "loss": 0.0387, "step": 2193 }, { "epoch": 0.8629301868239921, "grad_norm": 1.4564718008041382, "learning_rate": 2.8025260827597463e-07, "loss": 0.0424, "step": 2194 }, { "epoch": 0.8633235004916421, "grad_norm": 0.5872806310653687, "learning_rate": 2.7867484557877607e-07, "loss": 0.0414, "step": 2195 }, { "epoch": 0.863716814159292, "grad_norm": 1.0555849075317383, "learning_rate": 2.7710127447820783e-07, "loss": 0.0519, "step": 2196 }, { "epoch": 0.864110127826942, "grad_norm": 1.0422883033752441, "learning_rate": 2.7553189794356615e-07, "loss": 0.0562, "step": 2197 }, { "epoch": 0.8645034414945919, "grad_norm": 1.2551977634429932, "learning_rate": 2.739667189362347e-07, "loss": 0.0344, "step": 2198 }, { "epoch": 0.8648967551622418, "grad_norm": 1.0713584423065186, "learning_rate": 2.724057404096744e-07, "loss": 0.0385, "step": 2199 }, { "epoch": 0.8652900688298918, "grad_norm": 0.6667132377624512, "learning_rate": 2.708489653094218e-07, "loss": 0.0525, "step": 2200 }, { "epoch": 0.8656833824975418, "grad_norm": 0.9178755283355713, "learning_rate": 2.692963965730805e-07, "loss": 0.0722, "step": 2201 }, { "epoch": 0.8660766961651918, "grad_norm": 1.2695622444152832, "learning_rate": 2.677480371303162e-07, "loss": 0.0759, "step": 2202 }, { "epoch": 0.8664700098328417, "grad_norm": 1.1370331048965454, "learning_rate": 2.662038899028532e-07, "loss": 0.0396, "step": 2203 }, { "epoch": 0.8668633235004917, "grad_norm": 0.6956948041915894, "learning_rate": 2.6466395780446657e-07, "loss": 0.062, "step": 2204 }, { "epoch": 0.8672566371681416, "grad_norm": 0.5956060886383057, "learning_rate": 2.6312824374097794e-07, "loss": 0.049, "step": 2205 }, { "epoch": 0.8676499508357916, "grad_norm": 3.8347904682159424, "learning_rate": 2.6159675061024905e-07, "loss": 0.0654, "step": 2206 }, { "epoch": 0.8680432645034415, "grad_norm": 1.0327752828598022, "learning_rate": 2.6006948130217815e-07, "loss": 0.024, "step": 2207 }, { "epoch": 0.8684365781710914, "grad_norm": 1.1763917207717896, "learning_rate": 2.585464386986908e-07, "loss": 0.0487, "step": 2208 }, { "epoch": 0.8688298918387414, "grad_norm": 1.6335638761520386, "learning_rate": 2.570276256737386e-07, "loss": 0.0451, "step": 2209 }, { "epoch": 0.8692232055063913, "grad_norm": 1.1163750886917114, "learning_rate": 2.555130450932922e-07, "loss": 0.072, "step": 2210 }, { "epoch": 0.8696165191740413, "grad_norm": 1.2412861585617065, "learning_rate": 2.54002699815335e-07, "loss": 0.0541, "step": 2211 }, { "epoch": 0.8700098328416912, "grad_norm": 0.9547197222709656, "learning_rate": 2.52496592689859e-07, "loss": 0.04, "step": 2212 }, { "epoch": 0.8704031465093413, "grad_norm": 1.4851540327072144, "learning_rate": 2.5099472655885777e-07, "loss": 0.0602, "step": 2213 }, { "epoch": 0.8707964601769912, "grad_norm": 0.9040324687957764, "learning_rate": 2.4949710425632353e-07, "loss": 0.0395, "step": 2214 }, { "epoch": 0.8711897738446411, "grad_norm": 1.1058231592178345, "learning_rate": 2.4800372860823956e-07, "loss": 0.0472, "step": 2215 }, { "epoch": 0.8715830875122911, "grad_norm": 0.814282238483429, "learning_rate": 2.465146024325765e-07, "loss": 0.0541, "step": 2216 }, { "epoch": 0.871976401179941, "grad_norm": 0.9722008109092712, "learning_rate": 2.4502972853928606e-07, "loss": 0.0581, "step": 2217 }, { "epoch": 0.872369714847591, "grad_norm": 0.9943141341209412, "learning_rate": 2.435491097302961e-07, "loss": 0.0435, "step": 2218 }, { "epoch": 0.8727630285152409, "grad_norm": 1.2543455362319946, "learning_rate": 2.420727487995045e-07, "loss": 0.0613, "step": 2219 }, { "epoch": 0.8731563421828908, "grad_norm": 0.8473043441772461, "learning_rate": 2.40600648532775e-07, "loss": 0.0391, "step": 2220 }, { "epoch": 0.8735496558505408, "grad_norm": 1.0976766347885132, "learning_rate": 2.3913281170793196e-07, "loss": 0.0341, "step": 2221 }, { "epoch": 0.8739429695181907, "grad_norm": 0.765153169631958, "learning_rate": 2.376692410947548e-07, "loss": 0.0335, "step": 2222 }, { "epoch": 0.8743362831858407, "grad_norm": 1.2966009378433228, "learning_rate": 2.3620993945497217e-07, "loss": 0.0571, "step": 2223 }, { "epoch": 0.8747295968534906, "grad_norm": 1.0903987884521484, "learning_rate": 2.347549095422569e-07, "loss": 0.0602, "step": 2224 }, { "epoch": 0.8751229105211407, "grad_norm": 0.9129044413566589, "learning_rate": 2.3330415410222212e-07, "loss": 0.0508, "step": 2225 }, { "epoch": 0.8755162241887906, "grad_norm": 1.3771973848342896, "learning_rate": 2.3185767587241447e-07, "loss": 0.0282, "step": 2226 }, { "epoch": 0.8759095378564405, "grad_norm": 1.1595170497894287, "learning_rate": 2.3041547758230977e-07, "loss": 0.0768, "step": 2227 }, { "epoch": 0.8763028515240905, "grad_norm": 0.7576168775558472, "learning_rate": 2.2897756195330773e-07, "loss": 0.0296, "step": 2228 }, { "epoch": 0.8766961651917404, "grad_norm": 1.2020797729492188, "learning_rate": 2.2754393169872685e-07, "loss": 0.0392, "step": 2229 }, { "epoch": 0.8770894788593904, "grad_norm": 1.2221319675445557, "learning_rate": 2.2611458952379872e-07, "loss": 0.0319, "step": 2230 }, { "epoch": 0.8774827925270403, "grad_norm": 1.1023682355880737, "learning_rate": 2.246895381256639e-07, "loss": 0.0523, "step": 2231 }, { "epoch": 0.8778761061946903, "grad_norm": 1.0071845054626465, "learning_rate": 2.232687801933664e-07, "loss": 0.034, "step": 2232 }, { "epoch": 0.8782694198623402, "grad_norm": 0.8645428419113159, "learning_rate": 2.2185231840784778e-07, "loss": 0.0628, "step": 2233 }, { "epoch": 0.8786627335299901, "grad_norm": 0.6460661292076111, "learning_rate": 2.204401554419444e-07, "loss": 0.045, "step": 2234 }, { "epoch": 0.8790560471976401, "grad_norm": 1.7761812210083008, "learning_rate": 2.1903229396037896e-07, "loss": 0.0739, "step": 2235 }, { "epoch": 0.87944936086529, "grad_norm": 1.3595634698867798, "learning_rate": 2.1762873661975825e-07, "loss": 0.041, "step": 2236 }, { "epoch": 0.8798426745329401, "grad_norm": 0.8807711601257324, "learning_rate": 2.1622948606856765e-07, "loss": 0.0623, "step": 2237 }, { "epoch": 0.88023598820059, "grad_norm": 1.0638388395309448, "learning_rate": 2.1483454494716504e-07, "loss": 0.0337, "step": 2238 }, { "epoch": 0.88062930186824, "grad_norm": 0.9859362244606018, "learning_rate": 2.1344391588777658e-07, "loss": 0.0389, "step": 2239 }, { "epoch": 0.8810226155358899, "grad_norm": 1.0022567510604858, "learning_rate": 2.1205760151449206e-07, "loss": 0.0358, "step": 2240 }, { "epoch": 0.8814159292035398, "grad_norm": 0.8748469948768616, "learning_rate": 2.106756044432598e-07, "loss": 0.0367, "step": 2241 }, { "epoch": 0.8818092428711898, "grad_norm": 1.0613561868667603, "learning_rate": 2.0929792728187986e-07, "loss": 0.0608, "step": 2242 }, { "epoch": 0.8822025565388397, "grad_norm": 1.8184490203857422, "learning_rate": 2.079245726300022e-07, "loss": 0.0586, "step": 2243 }, { "epoch": 0.8825958702064897, "grad_norm": 1.0881813764572144, "learning_rate": 2.0655554307911997e-07, "loss": 0.0603, "step": 2244 }, { "epoch": 0.8829891838741396, "grad_norm": 1.0074139833450317, "learning_rate": 2.05190841212565e-07, "loss": 0.0666, "step": 2245 }, { "epoch": 0.8833824975417895, "grad_norm": 1.1435564756393433, "learning_rate": 2.038304696055024e-07, "loss": 0.0312, "step": 2246 }, { "epoch": 0.8837758112094395, "grad_norm": 0.6284701228141785, "learning_rate": 2.0247443082492686e-07, "loss": 0.0235, "step": 2247 }, { "epoch": 0.8841691248770894, "grad_norm": 1.6139885187149048, "learning_rate": 2.0112272742965678e-07, "loss": 0.0262, "step": 2248 }, { "epoch": 0.8845624385447395, "grad_norm": 0.8762457966804504, "learning_rate": 1.997753619703291e-07, "loss": 0.0431, "step": 2249 }, { "epoch": 0.8849557522123894, "grad_norm": 1.287406086921692, "learning_rate": 1.9843233698939617e-07, "loss": 0.0457, "step": 2250 }, { "epoch": 0.8853490658800394, "grad_norm": 1.3118491172790527, "learning_rate": 1.9709365502111944e-07, "loss": 0.0487, "step": 2251 }, { "epoch": 0.8857423795476893, "grad_norm": 0.8101546764373779, "learning_rate": 1.957593185915657e-07, "loss": 0.0458, "step": 2252 }, { "epoch": 0.8861356932153392, "grad_norm": 1.5364015102386475, "learning_rate": 1.9442933021860095e-07, "loss": 0.0407, "step": 2253 }, { "epoch": 0.8865290068829892, "grad_norm": 0.9168291091918945, "learning_rate": 1.9310369241188732e-07, "loss": 0.0474, "step": 2254 }, { "epoch": 0.8869223205506391, "grad_norm": 1.0423481464385986, "learning_rate": 1.9178240767287666e-07, "loss": 0.035, "step": 2255 }, { "epoch": 0.8873156342182891, "grad_norm": 0.995087742805481, "learning_rate": 1.904654784948079e-07, "loss": 0.0596, "step": 2256 }, { "epoch": 0.887708947885939, "grad_norm": 1.1472982168197632, "learning_rate": 1.8915290736269965e-07, "loss": 0.069, "step": 2257 }, { "epoch": 0.888102261553589, "grad_norm": 0.7572572231292725, "learning_rate": 1.878446967533476e-07, "loss": 0.061, "step": 2258 }, { "epoch": 0.8884955752212389, "grad_norm": 0.5118011832237244, "learning_rate": 1.865408491353199e-07, "loss": 0.0313, "step": 2259 }, { "epoch": 0.8888888888888888, "grad_norm": 0.8399426937103271, "learning_rate": 1.8524136696895068e-07, "loss": 0.0444, "step": 2260 }, { "epoch": 0.8892822025565389, "grad_norm": 0.8290569186210632, "learning_rate": 1.8394625270633793e-07, "loss": 0.0384, "step": 2261 }, { "epoch": 0.8896755162241888, "grad_norm": 1.0309621095657349, "learning_rate": 1.8265550879133538e-07, "loss": 0.0522, "step": 2262 }, { "epoch": 0.8900688298918388, "grad_norm": 2.102466583251953, "learning_rate": 1.8136913765955195e-07, "loss": 0.0684, "step": 2263 }, { "epoch": 0.8904621435594887, "grad_norm": 0.9560519456863403, "learning_rate": 1.8008714173834456e-07, "loss": 0.0411, "step": 2264 }, { "epoch": 0.8908554572271387, "grad_norm": 0.7714261412620544, "learning_rate": 1.7880952344681402e-07, "loss": 0.0393, "step": 2265 }, { "epoch": 0.8912487708947886, "grad_norm": 2.210777521133423, "learning_rate": 1.7753628519580097e-07, "loss": 0.0531, "step": 2266 }, { "epoch": 0.8916420845624385, "grad_norm": 1.3124444484710693, "learning_rate": 1.7626742938788105e-07, "loss": 0.0808, "step": 2267 }, { "epoch": 0.8920353982300885, "grad_norm": 0.8876106142997742, "learning_rate": 1.7500295841735905e-07, "loss": 0.0299, "step": 2268 }, { "epoch": 0.8924287118977384, "grad_norm": 0.9470813870429993, "learning_rate": 1.7374287467026767e-07, "loss": 0.0289, "step": 2269 }, { "epoch": 0.8928220255653884, "grad_norm": 1.1278401613235474, "learning_rate": 1.7248718052435942e-07, "loss": 0.0557, "step": 2270 }, { "epoch": 0.8932153392330383, "grad_norm": 1.0883233547210693, "learning_rate": 1.712358783491047e-07, "loss": 0.0493, "step": 2271 }, { "epoch": 0.8936086529006882, "grad_norm": 1.8595354557037354, "learning_rate": 1.6998897050568618e-07, "loss": 0.0583, "step": 2272 }, { "epoch": 0.8940019665683383, "grad_norm": 1.1858155727386475, "learning_rate": 1.6874645934699342e-07, "loss": 0.0406, "step": 2273 }, { "epoch": 0.8943952802359882, "grad_norm": 0.8429166674613953, "learning_rate": 1.6750834721762117e-07, "loss": 0.0575, "step": 2274 }, { "epoch": 0.8947885939036382, "grad_norm": 1.4577648639678955, "learning_rate": 1.6627463645386199e-07, "loss": 0.0412, "step": 2275 }, { "epoch": 0.8951819075712881, "grad_norm": 0.6947933435440063, "learning_rate": 1.6504532938370427e-07, "loss": 0.0465, "step": 2276 }, { "epoch": 0.8955752212389381, "grad_norm": 0.8350834846496582, "learning_rate": 1.6382042832682577e-07, "loss": 0.0438, "step": 2277 }, { "epoch": 0.895968534906588, "grad_norm": 1.2530003786087036, "learning_rate": 1.6259993559459091e-07, "loss": 0.0415, "step": 2278 }, { "epoch": 0.896361848574238, "grad_norm": 1.0597574710845947, "learning_rate": 1.613838534900447e-07, "loss": 0.0399, "step": 2279 }, { "epoch": 0.8967551622418879, "grad_norm": 0.8264654278755188, "learning_rate": 1.601721843079107e-07, "loss": 0.0348, "step": 2280 }, { "epoch": 0.8971484759095378, "grad_norm": 0.8567057251930237, "learning_rate": 1.5896493033458416e-07, "loss": 0.029, "step": 2281 }, { "epoch": 0.8975417895771878, "grad_norm": 1.390363335609436, "learning_rate": 1.5776209384812946e-07, "loss": 0.0815, "step": 2282 }, { "epoch": 0.8979351032448377, "grad_norm": 0.9575844407081604, "learning_rate": 1.5656367711827602e-07, "loss": 0.0526, "step": 2283 }, { "epoch": 0.8983284169124877, "grad_norm": 0.7833372950553894, "learning_rate": 1.553696824064116e-07, "loss": 0.0329, "step": 2284 }, { "epoch": 0.8987217305801377, "grad_norm": 0.8829760551452637, "learning_rate": 1.5418011196558085e-07, "loss": 0.0395, "step": 2285 }, { "epoch": 0.8991150442477877, "grad_norm": 1.0580815076828003, "learning_rate": 1.529949680404799e-07, "loss": 0.0648, "step": 2286 }, { "epoch": 0.8995083579154376, "grad_norm": 1.051527738571167, "learning_rate": 1.5181425286745155e-07, "loss": 0.0618, "step": 2287 }, { "epoch": 0.8999016715830875, "grad_norm": 1.5211282968521118, "learning_rate": 1.5063796867448243e-07, "loss": 0.047, "step": 2288 }, { "epoch": 0.9002949852507375, "grad_norm": 0.3931565582752228, "learning_rate": 1.4946611768119763e-07, "loss": 0.0371, "step": 2289 }, { "epoch": 0.9006882989183874, "grad_norm": 0.40819835662841797, "learning_rate": 1.4829870209885605e-07, "loss": 0.0399, "step": 2290 }, { "epoch": 0.9010816125860374, "grad_norm": 1.5606259107589722, "learning_rate": 1.471357241303481e-07, "loss": 0.0537, "step": 2291 }, { "epoch": 0.9014749262536873, "grad_norm": 0.4650862514972687, "learning_rate": 1.4597718597019055e-07, "loss": 0.0169, "step": 2292 }, { "epoch": 0.9018682399213372, "grad_norm": 0.8470922112464905, "learning_rate": 1.4482308980452164e-07, "loss": 0.0308, "step": 2293 }, { "epoch": 0.9022615535889872, "grad_norm": 1.1515922546386719, "learning_rate": 1.436734378110985e-07, "loss": 0.0459, "step": 2294 }, { "epoch": 0.9026548672566371, "grad_norm": 1.0158207416534424, "learning_rate": 1.425282321592908e-07, "loss": 0.0667, "step": 2295 }, { "epoch": 0.9030481809242871, "grad_norm": 0.6387980580329895, "learning_rate": 1.4138747501007966e-07, "loss": 0.0419, "step": 2296 }, { "epoch": 0.9034414945919371, "grad_norm": 1.8949992656707764, "learning_rate": 1.4025116851605125e-07, "loss": 0.0556, "step": 2297 }, { "epoch": 0.9038348082595871, "grad_norm": 0.8390710949897766, "learning_rate": 1.3911931482139317e-07, "loss": 0.0322, "step": 2298 }, { "epoch": 0.904228121927237, "grad_norm": 0.6234549880027771, "learning_rate": 1.379919160618909e-07, "loss": 0.0334, "step": 2299 }, { "epoch": 0.904621435594887, "grad_norm": 1.1114718914031982, "learning_rate": 1.368689743649243e-07, "loss": 0.0536, "step": 2300 }, { "epoch": 0.9050147492625369, "grad_norm": 0.7461351752281189, "learning_rate": 1.3575049184946122e-07, "loss": 0.0371, "step": 2301 }, { "epoch": 0.9054080629301868, "grad_norm": 0.9355785250663757, "learning_rate": 1.346364706260564e-07, "loss": 0.0296, "step": 2302 }, { "epoch": 0.9058013765978368, "grad_norm": 0.5872256755828857, "learning_rate": 1.3352691279684582e-07, "loss": 0.0281, "step": 2303 }, { "epoch": 0.9061946902654867, "grad_norm": 1.7544050216674805, "learning_rate": 1.324218204555433e-07, "loss": 0.056, "step": 2304 }, { "epoch": 0.9065880039331367, "grad_norm": 0.6219866871833801, "learning_rate": 1.3132119568743662e-07, "loss": 0.0288, "step": 2305 }, { "epoch": 0.9069813176007866, "grad_norm": 1.4340651035308838, "learning_rate": 1.3022504056938196e-07, "loss": 0.0504, "step": 2306 }, { "epoch": 0.9073746312684365, "grad_norm": 0.5100427269935608, "learning_rate": 1.2913335716980307e-07, "loss": 0.0473, "step": 2307 }, { "epoch": 0.9077679449360865, "grad_norm": 0.650513768196106, "learning_rate": 1.2804614754868466e-07, "loss": 0.0537, "step": 2308 }, { "epoch": 0.9081612586037365, "grad_norm": 1.4720587730407715, "learning_rate": 1.2696341375756982e-07, "loss": 0.043, "step": 2309 }, { "epoch": 0.9085545722713865, "grad_norm": 1.7473880052566528, "learning_rate": 1.2588515783955564e-07, "loss": 0.0551, "step": 2310 }, { "epoch": 0.9089478859390364, "grad_norm": 0.7824367880821228, "learning_rate": 1.2481138182929065e-07, "loss": 0.0299, "step": 2311 }, { "epoch": 0.9093411996066864, "grad_norm": 1.2818101644515991, "learning_rate": 1.2374208775296742e-07, "loss": 0.0664, "step": 2312 }, { "epoch": 0.9097345132743363, "grad_norm": 1.6559642553329468, "learning_rate": 1.2267727762832388e-07, "loss": 0.0667, "step": 2313 }, { "epoch": 0.9101278269419862, "grad_norm": 0.8255678415298462, "learning_rate": 1.2161695346463498e-07, "loss": 0.042, "step": 2314 }, { "epoch": 0.9105211406096362, "grad_norm": 0.7617945075035095, "learning_rate": 1.2056111726271192e-07, "loss": 0.0464, "step": 2315 }, { "epoch": 0.9109144542772861, "grad_norm": 1.3965145349502563, "learning_rate": 1.195097710148968e-07, "loss": 0.039, "step": 2316 }, { "epoch": 0.9113077679449361, "grad_norm": 1.3296297788619995, "learning_rate": 1.1846291670505855e-07, "loss": 0.0552, "step": 2317 }, { "epoch": 0.911701081612586, "grad_norm": 0.7849988341331482, "learning_rate": 1.1742055630859117e-07, "loss": 0.0338, "step": 2318 }, { "epoch": 0.912094395280236, "grad_norm": 2.0398993492126465, "learning_rate": 1.1638269179240796e-07, "loss": 0.0542, "step": 2319 }, { "epoch": 0.9124877089478859, "grad_norm": 0.7769688367843628, "learning_rate": 1.1534932511493846e-07, "loss": 0.0343, "step": 2320 }, { "epoch": 0.9128810226155359, "grad_norm": 0.6311588287353516, "learning_rate": 1.1432045822612564e-07, "loss": 0.0483, "step": 2321 }, { "epoch": 0.9132743362831859, "grad_norm": 0.9618848562240601, "learning_rate": 1.132960930674204e-07, "loss": 0.0498, "step": 2322 }, { "epoch": 0.9136676499508358, "grad_norm": 0.8956164121627808, "learning_rate": 1.1227623157177986e-07, "loss": 0.0316, "step": 2323 }, { "epoch": 0.9140609636184858, "grad_norm": 1.1387652158737183, "learning_rate": 1.1126087566366266e-07, "loss": 0.0669, "step": 2324 }, { "epoch": 0.9144542772861357, "grad_norm": 0.7763038277626038, "learning_rate": 1.1025002725902484e-07, "loss": 0.0512, "step": 2325 }, { "epoch": 0.9148475909537856, "grad_norm": 1.52693510055542, "learning_rate": 1.0924368826531751e-07, "loss": 0.0745, "step": 2326 }, { "epoch": 0.9152409046214356, "grad_norm": 1.1928157806396484, "learning_rate": 1.0824186058148278e-07, "loss": 0.047, "step": 2327 }, { "epoch": 0.9156342182890855, "grad_norm": 0.6993405818939209, "learning_rate": 1.0724454609794931e-07, "loss": 0.0258, "step": 2328 }, { "epoch": 0.9160275319567355, "grad_norm": 0.8654144406318665, "learning_rate": 1.0625174669663036e-07, "loss": 0.0493, "step": 2329 }, { "epoch": 0.9164208456243854, "grad_norm": 1.6443697214126587, "learning_rate": 1.0526346425091815e-07, "loss": 0.0641, "step": 2330 }, { "epoch": 0.9168141592920354, "grad_norm": 2.2090344429016113, "learning_rate": 1.042797006256821e-07, "loss": 0.0916, "step": 2331 }, { "epoch": 0.9172074729596853, "grad_norm": 1.2032400369644165, "learning_rate": 1.0330045767726504e-07, "loss": 0.043, "step": 2332 }, { "epoch": 0.9176007866273354, "grad_norm": 1.0382981300354004, "learning_rate": 1.023257372534786e-07, "loss": 0.0478, "step": 2333 }, { "epoch": 0.9179941002949853, "grad_norm": 1.3554562330245972, "learning_rate": 1.0135554119360153e-07, "loss": 0.076, "step": 2334 }, { "epoch": 0.9183874139626352, "grad_norm": 0.7670255899429321, "learning_rate": 1.0038987132837435e-07, "loss": 0.0666, "step": 2335 }, { "epoch": 0.9187807276302852, "grad_norm": 1.3432739973068237, "learning_rate": 9.942872947999672e-08, "loss": 0.0472, "step": 2336 }, { "epoch": 0.9191740412979351, "grad_norm": 0.7896971702575684, "learning_rate": 9.847211746212504e-08, "loss": 0.0636, "step": 2337 }, { "epoch": 0.9195673549655851, "grad_norm": 0.7464331388473511, "learning_rate": 9.752003707986652e-08, "loss": 0.036, "step": 2338 }, { "epoch": 0.919960668633235, "grad_norm": 1.4482289552688599, "learning_rate": 9.657249012977821e-08, "loss": 0.047, "step": 2339 }, { "epoch": 0.9203539823008849, "grad_norm": 0.7451487183570862, "learning_rate": 9.562947839986264e-08, "loss": 0.0516, "step": 2340 }, { "epoch": 0.9207472959685349, "grad_norm": 1.0219905376434326, "learning_rate": 9.469100366956391e-08, "loss": 0.0515, "step": 2341 }, { "epoch": 0.9211406096361848, "grad_norm": 0.776695966720581, "learning_rate": 9.375706770976573e-08, "loss": 0.0289, "step": 2342 }, { "epoch": 0.9215339233038348, "grad_norm": 0.9781972169876099, "learning_rate": 9.282767228278672e-08, "loss": 0.0767, "step": 2343 }, { "epoch": 0.9219272369714847, "grad_norm": 1.0278164148330688, "learning_rate": 9.190281914237736e-08, "loss": 0.0333, "step": 2344 }, { "epoch": 0.9223205506391348, "grad_norm": 1.5040227174758911, "learning_rate": 9.09825100337175e-08, "loss": 0.0788, "step": 2345 }, { "epoch": 0.9227138643067847, "grad_norm": 1.5312731266021729, "learning_rate": 9.006674669341214e-08, "loss": 0.0744, "step": 2346 }, { "epoch": 0.9231071779744346, "grad_norm": 1.6249146461486816, "learning_rate": 8.915553084948847e-08, "loss": 0.0442, "step": 2347 }, { "epoch": 0.9235004916420846, "grad_norm": 1.0247668027877808, "learning_rate": 8.824886422139273e-08, "loss": 0.0621, "step": 2348 }, { "epoch": 0.9238938053097345, "grad_norm": 1.506390929222107, "learning_rate": 8.734674851998748e-08, "loss": 0.0755, "step": 2349 }, { "epoch": 0.9242871189773845, "grad_norm": 0.8823897838592529, "learning_rate": 8.64491854475466e-08, "loss": 0.0637, "step": 2350 }, { "epoch": 0.9246804326450344, "grad_norm": 0.7110940217971802, "learning_rate": 8.55561766977539e-08, "loss": 0.0326, "step": 2351 }, { "epoch": 0.9250737463126844, "grad_norm": 0.5734057426452637, "learning_rate": 8.46677239556995e-08, "loss": 0.0305, "step": 2352 }, { "epoch": 0.9254670599803343, "grad_norm": 0.8686132431030273, "learning_rate": 8.378382889787596e-08, "loss": 0.0405, "step": 2353 }, { "epoch": 0.9258603736479842, "grad_norm": 1.6284774541854858, "learning_rate": 8.290449319217603e-08, "loss": 0.0583, "step": 2354 }, { "epoch": 0.9262536873156342, "grad_norm": 1.2678624391555786, "learning_rate": 8.202971849788854e-08, "loss": 0.0474, "step": 2355 }, { "epoch": 0.9266470009832841, "grad_norm": 1.2101284265518188, "learning_rate": 8.115950646569587e-08, "loss": 0.0391, "step": 2356 }, { "epoch": 0.9270403146509342, "grad_norm": 0.6382131576538086, "learning_rate": 8.029385873767115e-08, "loss": 0.0512, "step": 2357 }, { "epoch": 0.9274336283185841, "grad_norm": 1.0339092016220093, "learning_rate": 7.943277694727469e-08, "loss": 0.0528, "step": 2358 }, { "epoch": 0.927826941986234, "grad_norm": 0.7545960545539856, "learning_rate": 7.857626271935037e-08, "loss": 0.0418, "step": 2359 }, { "epoch": 0.928220255653884, "grad_norm": 0.9588167071342468, "learning_rate": 7.772431767012423e-08, "loss": 0.0552, "step": 2360 }, { "epoch": 0.9286135693215339, "grad_norm": 0.7952490448951721, "learning_rate": 7.68769434071992e-08, "loss": 0.0431, "step": 2361 }, { "epoch": 0.9290068829891839, "grad_norm": 1.0601327419281006, "learning_rate": 7.603414152955374e-08, "loss": 0.0262, "step": 2362 }, { "epoch": 0.9294001966568338, "grad_norm": 0.8356077075004578, "learning_rate": 7.519591362753848e-08, "loss": 0.0309, "step": 2363 }, { "epoch": 0.9297935103244838, "grad_norm": 1.068089246749878, "learning_rate": 7.436226128287288e-08, "loss": 0.0374, "step": 2364 }, { "epoch": 0.9301868239921337, "grad_norm": 1.1383631229400635, "learning_rate": 7.35331860686428e-08, "loss": 0.0515, "step": 2365 }, { "epoch": 0.9305801376597836, "grad_norm": 0.9927535653114319, "learning_rate": 7.270868954929595e-08, "loss": 0.056, "step": 2366 }, { "epoch": 0.9309734513274336, "grad_norm": 0.6153873801231384, "learning_rate": 7.188877328064142e-08, "loss": 0.0437, "step": 2367 }, { "epoch": 0.9313667649950835, "grad_norm": 0.8163816928863525, "learning_rate": 7.107343880984496e-08, "loss": 0.0541, "step": 2368 }, { "epoch": 0.9317600786627336, "grad_norm": 1.144721269607544, "learning_rate": 7.026268767542671e-08, "loss": 0.055, "step": 2369 }, { "epoch": 0.9321533923303835, "grad_norm": 0.9538362622261047, "learning_rate": 6.94565214072579e-08, "loss": 0.0845, "step": 2370 }, { "epoch": 0.9325467059980335, "grad_norm": 1.0417604446411133, "learning_rate": 6.86549415265586e-08, "loss": 0.054, "step": 2371 }, { "epoch": 0.9329400196656834, "grad_norm": 0.8085368275642395, "learning_rate": 6.785794954589365e-08, "loss": 0.0338, "step": 2372 }, { "epoch": 0.9333333333333333, "grad_norm": 0.6007797718048096, "learning_rate": 6.706554696917139e-08, "loss": 0.0314, "step": 2373 }, { "epoch": 0.9337266470009833, "grad_norm": 0.8648099303245544, "learning_rate": 6.627773529163994e-08, "loss": 0.0302, "step": 2374 }, { "epoch": 0.9341199606686332, "grad_norm": 0.5465229749679565, "learning_rate": 6.549451599988432e-08, "loss": 0.0359, "step": 2375 }, { "epoch": 0.9345132743362832, "grad_norm": 0.6655777096748352, "learning_rate": 6.471589057182398e-08, "loss": 0.0435, "step": 2376 }, { "epoch": 0.9349065880039331, "grad_norm": 1.1010547876358032, "learning_rate": 6.394186047670947e-08, "loss": 0.0377, "step": 2377 }, { "epoch": 0.9352999016715831, "grad_norm": 0.7519053816795349, "learning_rate": 6.317242717511995e-08, "loss": 0.033, "step": 2378 }, { "epoch": 0.935693215339233, "grad_norm": 0.8617828488349915, "learning_rate": 6.240759211896153e-08, "loss": 0.0434, "step": 2379 }, { "epoch": 0.9360865290068829, "grad_norm": 1.5556560754776, "learning_rate": 6.16473567514625e-08, "loss": 0.0893, "step": 2380 }, { "epoch": 0.936479842674533, "grad_norm": 1.6594090461730957, "learning_rate": 6.089172250717201e-08, "loss": 0.0667, "step": 2381 }, { "epoch": 0.9368731563421829, "grad_norm": 0.7117483019828796, "learning_rate": 6.014069081195673e-08, "loss": 0.0256, "step": 2382 }, { "epoch": 0.9372664700098329, "grad_norm": 0.8783112168312073, "learning_rate": 5.9394263082998836e-08, "loss": 0.0439, "step": 2383 }, { "epoch": 0.9376597836774828, "grad_norm": 0.73135906457901, "learning_rate": 5.8652440728792504e-08, "loss": 0.0514, "step": 2384 }, { "epoch": 0.9380530973451328, "grad_norm": 0.5708735585212708, "learning_rate": 5.791522514914216e-08, "loss": 0.0332, "step": 2385 }, { "epoch": 0.9384464110127827, "grad_norm": 1.1698683500289917, "learning_rate": 5.718261773515865e-08, "loss": 0.026, "step": 2386 }, { "epoch": 0.9388397246804326, "grad_norm": 0.8288942575454712, "learning_rate": 5.64546198692581e-08, "loss": 0.0401, "step": 2387 }, { "epoch": 0.9392330383480826, "grad_norm": 1.1005017757415771, "learning_rate": 5.573123292515775e-08, "loss": 0.0625, "step": 2388 }, { "epoch": 0.9396263520157325, "grad_norm": 1.4169667959213257, "learning_rate": 5.50124582678746e-08, "loss": 0.0561, "step": 2389 }, { "epoch": 0.9400196656833825, "grad_norm": 1.8534727096557617, "learning_rate": 5.429829725372204e-08, "loss": 0.0563, "step": 2390 }, { "epoch": 0.9404129793510324, "grad_norm": 0.49012327194213867, "learning_rate": 5.3588751230307935e-08, "loss": 0.0371, "step": 2391 }, { "epoch": 0.9408062930186823, "grad_norm": 1.5290131568908691, "learning_rate": 5.2883821536531545e-08, "loss": 0.0471, "step": 2392 }, { "epoch": 0.9411996066863324, "grad_norm": 0.37540706992149353, "learning_rate": 5.218350950258133e-08, "loss": 0.0224, "step": 2393 }, { "epoch": 0.9415929203539823, "grad_norm": 1.6441450119018555, "learning_rate": 5.1487816449932174e-08, "loss": 0.0545, "step": 2394 }, { "epoch": 0.9419862340216323, "grad_norm": 0.8181889057159424, "learning_rate": 5.079674369134313e-08, "loss": 0.0528, "step": 2395 }, { "epoch": 0.9423795476892822, "grad_norm": 1.6283776760101318, "learning_rate": 5.0110292530854696e-08, "loss": 0.0528, "step": 2396 }, { "epoch": 0.9427728613569322, "grad_norm": 4.418090343475342, "learning_rate": 4.942846426378683e-08, "loss": 0.052, "step": 2397 }, { "epoch": 0.9431661750245821, "grad_norm": 0.9668748378753662, "learning_rate": 4.875126017673593e-08, "loss": 0.0441, "step": 2398 }, { "epoch": 0.943559488692232, "grad_norm": 1.2723820209503174, "learning_rate": 4.807868154757284e-08, "loss": 0.0504, "step": 2399 }, { "epoch": 0.943952802359882, "grad_norm": 1.2000619173049927, "learning_rate": 4.741072964543958e-08, "loss": 0.0669, "step": 2400 }, { "epoch": 0.9443461160275319, "grad_norm": 1.4198737144470215, "learning_rate": 4.6747405730748765e-08, "loss": 0.0768, "step": 2401 }, { "epoch": 0.9447394296951819, "grad_norm": 0.5707858800888062, "learning_rate": 4.6088711055179426e-08, "loss": 0.0363, "step": 2402 }, { "epoch": 0.9451327433628318, "grad_norm": 0.9884591698646545, "learning_rate": 4.543464686167537e-08, "loss": 0.0617, "step": 2403 }, { "epoch": 0.9455260570304818, "grad_norm": 1.1140447854995728, "learning_rate": 4.478521438444267e-08, "loss": 0.0307, "step": 2404 }, { "epoch": 0.9459193706981318, "grad_norm": 1.7241696119308472, "learning_rate": 4.414041484894743e-08, "loss": 0.0468, "step": 2405 }, { "epoch": 0.9463126843657818, "grad_norm": 1.4963939189910889, "learning_rate": 4.3500249471913616e-08, "loss": 0.0424, "step": 2406 }, { "epoch": 0.9467059980334317, "grad_norm": 1.4940134286880493, "learning_rate": 4.2864719461321036e-08, "loss": 0.062, "step": 2407 }, { "epoch": 0.9470993117010816, "grad_norm": 1.2279117107391357, "learning_rate": 4.223382601640208e-08, "loss": 0.0557, "step": 2408 }, { "epoch": 0.9474926253687316, "grad_norm": 0.5514369606971741, "learning_rate": 4.160757032764001e-08, "loss": 0.0211, "step": 2409 }, { "epoch": 0.9478859390363815, "grad_norm": 1.1696200370788574, "learning_rate": 4.098595357676732e-08, "loss": 0.0525, "step": 2410 }, { "epoch": 0.9482792527040315, "grad_norm": 1.4047200679779053, "learning_rate": 4.036897693676184e-08, "loss": 0.0582, "step": 2411 }, { "epoch": 0.9486725663716814, "grad_norm": 0.9069812893867493, "learning_rate": 3.9756641571847e-08, "loss": 0.0451, "step": 2412 }, { "epoch": 0.9490658800393313, "grad_norm": 0.7696250677108765, "learning_rate": 3.914894863748714e-08, "loss": 0.0596, "step": 2413 }, { "epoch": 0.9494591937069813, "grad_norm": 1.0009849071502686, "learning_rate": 3.854589928038666e-08, "loss": 0.0531, "step": 2414 }, { "epoch": 0.9498525073746312, "grad_norm": 0.6316270232200623, "learning_rate": 3.794749463848835e-08, "loss": 0.0261, "step": 2415 }, { "epoch": 0.9502458210422812, "grad_norm": 1.1284974813461304, "learning_rate": 3.735373584096924e-08, "loss": 0.0485, "step": 2416 }, { "epoch": 0.9506391347099312, "grad_norm": 0.744842529296875, "learning_rate": 3.676462400824088e-08, "loss": 0.0437, "step": 2417 }, { "epoch": 0.9510324483775812, "grad_norm": 1.1578047275543213, "learning_rate": 3.618016025194598e-08, "loss": 0.0458, "step": 2418 }, { "epoch": 0.9514257620452311, "grad_norm": 1.029968023300171, "learning_rate": 3.560034567495513e-08, "loss": 0.063, "step": 2419 }, { "epoch": 0.951819075712881, "grad_norm": 0.8940306305885315, "learning_rate": 3.5025181371367844e-08, "loss": 0.0583, "step": 2420 }, { "epoch": 0.952212389380531, "grad_norm": 1.1246992349624634, "learning_rate": 3.4454668426507076e-08, "loss": 0.0446, "step": 2421 }, { "epoch": 0.9526057030481809, "grad_norm": 1.069629192352295, "learning_rate": 3.388880791692001e-08, "loss": 0.0422, "step": 2422 }, { "epoch": 0.9529990167158309, "grad_norm": 1.080478549003601, "learning_rate": 3.33276009103739e-08, "loss": 0.0547, "step": 2423 }, { "epoch": 0.9533923303834808, "grad_norm": 1.105726718902588, "learning_rate": 3.2771048465855546e-08, "loss": 0.0478, "step": 2424 }, { "epoch": 0.9537856440511308, "grad_norm": 0.9557194709777832, "learning_rate": 3.221915163356848e-08, "loss": 0.0454, "step": 2425 }, { "epoch": 0.9541789577187807, "grad_norm": 0.7306869626045227, "learning_rate": 3.167191145493076e-08, "loss": 0.0306, "step": 2426 }, { "epoch": 0.9545722713864306, "grad_norm": 0.9311756491661072, "learning_rate": 3.1129328962573865e-08, "loss": 0.0378, "step": 2427 }, { "epoch": 0.9549655850540806, "grad_norm": 1.6339657306671143, "learning_rate": 3.05914051803402e-08, "loss": 0.053, "step": 2428 }, { "epoch": 0.9553588987217306, "grad_norm": 1.5211260318756104, "learning_rate": 3.005814112328143e-08, "loss": 0.0408, "step": 2429 }, { "epoch": 0.9557522123893806, "grad_norm": 1.1606007814407349, "learning_rate": 2.9529537797656215e-08, "loss": 0.0531, "step": 2430 }, { "epoch": 0.9561455260570305, "grad_norm": 0.5916828513145447, "learning_rate": 2.900559620092891e-08, "loss": 0.0625, "step": 2431 }, { "epoch": 0.9565388397246805, "grad_norm": 0.49938130378723145, "learning_rate": 2.8486317321766432e-08, "loss": 0.0395, "step": 2432 }, { "epoch": 0.9569321533923304, "grad_norm": 1.587057113647461, "learning_rate": 2.797170214003775e-08, "loss": 0.1053, "step": 2433 }, { "epoch": 0.9573254670599803, "grad_norm": 1.176936149597168, "learning_rate": 2.7461751626811916e-08, "loss": 0.0462, "step": 2434 }, { "epoch": 0.9577187807276303, "grad_norm": 0.5434470176696777, "learning_rate": 2.6956466744355315e-08, "loss": 0.0268, "step": 2435 }, { "epoch": 0.9581120943952802, "grad_norm": 0.6117231845855713, "learning_rate": 2.6455848446130526e-08, "loss": 0.0572, "step": 2436 }, { "epoch": 0.9585054080629302, "grad_norm": 1.2302024364471436, "learning_rate": 2.5959897676794134e-08, "loss": 0.0613, "step": 2437 }, { "epoch": 0.9588987217305801, "grad_norm": 1.686108946800232, "learning_rate": 2.546861537219586e-08, "loss": 0.0726, "step": 2438 }, { "epoch": 0.95929203539823, "grad_norm": 0.9010059833526611, "learning_rate": 2.4982002459375265e-08, "loss": 0.0356, "step": 2439 }, { "epoch": 0.95968534906588, "grad_norm": 0.7760159373283386, "learning_rate": 2.450005985656173e-08, "loss": 0.0376, "step": 2440 }, { "epoch": 0.96007866273353, "grad_norm": 0.788345456123352, "learning_rate": 2.4022788473170853e-08, "loss": 0.0657, "step": 2441 }, { "epoch": 0.96047197640118, "grad_norm": 0.8711709976196289, "learning_rate": 2.355018920980501e-08, "loss": 0.0444, "step": 2442 }, { "epoch": 0.9608652900688299, "grad_norm": 0.6124730110168457, "learning_rate": 2.308226295824917e-08, "loss": 0.0542, "step": 2443 }, { "epoch": 0.9612586037364799, "grad_norm": 1.0837171077728271, "learning_rate": 2.2619010601470925e-08, "loss": 0.0577, "step": 2444 }, { "epoch": 0.9616519174041298, "grad_norm": 1.9453260898590088, "learning_rate": 2.2160433013618533e-08, "loss": 0.058, "step": 2445 }, { "epoch": 0.9620452310717797, "grad_norm": 0.8556208610534668, "learning_rate": 2.170653106001841e-08, "loss": 0.0281, "step": 2446 }, { "epoch": 0.9624385447394297, "grad_norm": 0.9196289777755737, "learning_rate": 2.1257305597175428e-08, "loss": 0.0414, "step": 2447 }, { "epoch": 0.9628318584070796, "grad_norm": 1.5880217552185059, "learning_rate": 2.0812757472768175e-08, "loss": 0.0496, "step": 2448 }, { "epoch": 0.9632251720747296, "grad_norm": 1.4076353311538696, "learning_rate": 2.037288752565064e-08, "loss": 0.049, "step": 2449 }, { "epoch": 0.9636184857423795, "grad_norm": 0.8668321967124939, "learning_rate": 1.99376965858486e-08, "loss": 0.0606, "step": 2450 }, { "epoch": 0.9640117994100295, "grad_norm": 0.7461321353912354, "learning_rate": 1.9507185474558765e-08, "loss": 0.0343, "step": 2451 }, { "epoch": 0.9644051130776794, "grad_norm": 0.6470179557800293, "learning_rate": 1.908135500414743e-08, "loss": 0.0334, "step": 2452 }, { "epoch": 0.9647984267453295, "grad_norm": 1.0918750762939453, "learning_rate": 1.866020597814766e-08, "loss": 0.0451, "step": 2453 }, { "epoch": 0.9651917404129794, "grad_norm": 0.6877756118774414, "learning_rate": 1.8243739191259603e-08, "loss": 0.0397, "step": 2454 }, { "epoch": 0.9655850540806293, "grad_norm": 0.9845160245895386, "learning_rate": 1.7831955429348235e-08, "loss": 0.0227, "step": 2455 }, { "epoch": 0.9659783677482793, "grad_norm": 1.178027629852295, "learning_rate": 1.7424855469440617e-08, "loss": 0.0941, "step": 2456 }, { "epoch": 0.9663716814159292, "grad_norm": 1.0678149461746216, "learning_rate": 1.7022440079726976e-08, "loss": 0.0519, "step": 2457 }, { "epoch": 0.9667649950835792, "grad_norm": 0.7598469257354736, "learning_rate": 1.6624710019556844e-08, "loss": 0.0303, "step": 2458 }, { "epoch": 0.9671583087512291, "grad_norm": 1.8913023471832275, "learning_rate": 1.623166603943932e-08, "loss": 0.0573, "step": 2459 }, { "epoch": 0.967551622418879, "grad_norm": 0.8094140887260437, "learning_rate": 1.584330888104002e-08, "loss": 0.0454, "step": 2460 }, { "epoch": 0.967944936086529, "grad_norm": 1.0645431280136108, "learning_rate": 1.5459639277181637e-08, "loss": 0.0482, "step": 2461 }, { "epoch": 0.9683382497541789, "grad_norm": 1.1675747632980347, "learning_rate": 1.508065795184116e-08, "loss": 0.0587, "step": 2462 }, { "epoch": 0.9687315634218289, "grad_norm": 1.6579506397247314, "learning_rate": 1.4706365620149043e-08, "loss": 0.0389, "step": 2463 }, { "epoch": 0.9691248770894788, "grad_norm": 1.4258586168289185, "learning_rate": 1.433676298838671e-08, "loss": 0.0571, "step": 2464 }, { "epoch": 0.9695181907571289, "grad_norm": 1.555445671081543, "learning_rate": 1.3971850753987936e-08, "loss": 0.0561, "step": 2465 }, { "epoch": 0.9699115044247788, "grad_norm": 1.851238489151001, "learning_rate": 1.3611629605534139e-08, "loss": 0.0614, "step": 2466 }, { "epoch": 0.9703048180924287, "grad_norm": 1.4167311191558838, "learning_rate": 1.325610022275603e-08, "loss": 0.0541, "step": 2467 }, { "epoch": 0.9706981317600787, "grad_norm": 1.103963017463684, "learning_rate": 1.29052632765303e-08, "loss": 0.0515, "step": 2468 }, { "epoch": 0.9710914454277286, "grad_norm": 0.8383644819259644, "learning_rate": 1.2559119428879607e-08, "loss": 0.0439, "step": 2469 }, { "epoch": 0.9714847590953786, "grad_norm": 1.5626074075698853, "learning_rate": 1.2217669332970084e-08, "loss": 0.0358, "step": 2470 }, { "epoch": 0.9718780727630285, "grad_norm": 0.965404748916626, "learning_rate": 1.1880913633111335e-08, "loss": 0.0588, "step": 2471 }, { "epoch": 0.9722713864306785, "grad_norm": 1.2146902084350586, "learning_rate": 1.1548852964755053e-08, "loss": 0.0473, "step": 2472 }, { "epoch": 0.9726647000983284, "grad_norm": 1.4855893850326538, "learning_rate": 1.122148795449307e-08, "loss": 0.0543, "step": 2473 }, { "epoch": 0.9730580137659783, "grad_norm": 1.1908034086227417, "learning_rate": 1.0898819220056811e-08, "loss": 0.0486, "step": 2474 }, { "epoch": 0.9734513274336283, "grad_norm": 1.0501704216003418, "learning_rate": 1.058084737031534e-08, "loss": 0.0475, "step": 2475 }, { "epoch": 0.9738446411012782, "grad_norm": 0.6650611162185669, "learning_rate": 1.0267573005275645e-08, "loss": 0.0297, "step": 2476 }, { "epoch": 0.9742379547689283, "grad_norm": 0.6201514601707458, "learning_rate": 9.95899671607986e-09, "loss": 0.047, "step": 2477 }, { "epoch": 0.9746312684365782, "grad_norm": 1.1360257863998413, "learning_rate": 9.655119085005827e-09, "loss": 0.0363, "step": 2478 }, { "epoch": 0.9750245821042282, "grad_norm": 0.8666075468063354, "learning_rate": 9.355940685464305e-09, "loss": 0.0458, "step": 2479 }, { "epoch": 0.9754178957718781, "grad_norm": 1.1366305351257324, "learning_rate": 9.061462081999262e-09, "loss": 0.0471, "step": 2480 }, { "epoch": 0.975811209439528, "grad_norm": 0.6694433689117432, "learning_rate": 8.771683830285649e-09, "loss": 0.0387, "step": 2481 }, { "epoch": 0.976204523107178, "grad_norm": 2.0710513591766357, "learning_rate": 8.486606477129677e-09, "loss": 0.075, "step": 2482 }, { "epoch": 0.9765978367748279, "grad_norm": 0.9630718231201172, "learning_rate": 8.206230560466322e-09, "loss": 0.0431, "step": 2483 }, { "epoch": 0.9769911504424779, "grad_norm": 0.9957706332206726, "learning_rate": 7.930556609359596e-09, "loss": 0.0398, "step": 2484 }, { "epoch": 0.9773844641101278, "grad_norm": 0.8392490148544312, "learning_rate": 7.659585144000892e-09, "loss": 0.1203, "step": 2485 }, { "epoch": 0.9777777777777777, "grad_norm": 0.763048529624939, "learning_rate": 7.393316675707584e-09, "loss": 0.048, "step": 2486 }, { "epoch": 0.9781710914454277, "grad_norm": 0.591249942779541, "learning_rate": 7.131751706923595e-09, "loss": 0.0276, "step": 2487 }, { "epoch": 0.9785644051130776, "grad_norm": 0.7118191719055176, "learning_rate": 6.8748907312163325e-09, "loss": 0.0459, "step": 2488 }, { "epoch": 0.9789577187807277, "grad_norm": 1.2333048582077026, "learning_rate": 6.622734233277528e-09, "loss": 0.0547, "step": 2489 }, { "epoch": 0.9793510324483776, "grad_norm": 1.8401693105697632, "learning_rate": 6.375282688921569e-09, "loss": 0.0499, "step": 2490 }, { "epoch": 0.9797443461160276, "grad_norm": 0.8339464068412781, "learning_rate": 6.132536565084945e-09, "loss": 0.0343, "step": 2491 }, { "epoch": 0.9801376597836775, "grad_norm": 0.7225338220596313, "learning_rate": 5.894496319824306e-09, "loss": 0.0373, "step": 2492 }, { "epoch": 0.9805309734513274, "grad_norm": 0.7467345595359802, "learning_rate": 5.661162402316733e-09, "loss": 0.0294, "step": 2493 }, { "epoch": 0.9809242871189774, "grad_norm": 0.7157261967658997, "learning_rate": 5.432535252859472e-09, "loss": 0.0388, "step": 2494 }, { "epoch": 0.9813176007866273, "grad_norm": 1.0490740537643433, "learning_rate": 5.208615302866593e-09, "loss": 0.0552, "step": 2495 }, { "epoch": 0.9817109144542773, "grad_norm": 0.9684942364692688, "learning_rate": 4.989402974871216e-09, "loss": 0.0482, "step": 2496 }, { "epoch": 0.9821042281219272, "grad_norm": 0.7083243727684021, "learning_rate": 4.774898682522455e-09, "loss": 0.0354, "step": 2497 }, { "epoch": 0.9824975417895772, "grad_norm": 0.6887216567993164, "learning_rate": 4.565102830585699e-09, "loss": 0.0555, "step": 2498 }, { "epoch": 0.9828908554572271, "grad_norm": 0.9905696511268616, "learning_rate": 4.360015814941498e-09, "loss": 0.044, "step": 2499 }, { "epoch": 0.983284169124877, "grad_norm": 1.4582995176315308, "learning_rate": 4.159638022585011e-09, "loss": 0.0555, "step": 2500 }, { "epoch": 0.9836774827925271, "grad_norm": 0.8839958906173706, "learning_rate": 3.96396983162517e-09, "loss": 0.0322, "step": 2501 }, { "epoch": 0.984070796460177, "grad_norm": 0.9634173512458801, "learning_rate": 3.773011611284128e-09, "loss": 0.0305, "step": 2502 }, { "epoch": 0.984464110127827, "grad_norm": 0.9942337870597839, "learning_rate": 3.586763721896147e-09, "loss": 0.0725, "step": 2503 }, { "epoch": 0.9848574237954769, "grad_norm": 0.8074241876602173, "learning_rate": 3.4052265149070453e-09, "loss": 0.048, "step": 2504 }, { "epoch": 0.9852507374631269, "grad_norm": 1.1746639013290405, "learning_rate": 3.2284003328744706e-09, "loss": 0.0565, "step": 2505 }, { "epoch": 0.9856440511307768, "grad_norm": 1.454350233078003, "learning_rate": 3.056285509465684e-09, "loss": 0.0462, "step": 2506 }, { "epoch": 0.9860373647984267, "grad_norm": 1.0500266551971436, "learning_rate": 2.888882369457835e-09, "loss": 0.0229, "step": 2507 }, { "epoch": 0.9864306784660767, "grad_norm": 0.5939337611198425, "learning_rate": 2.726191228737407e-09, "loss": 0.0441, "step": 2508 }, { "epoch": 0.9868239921337266, "grad_norm": 0.7773805856704712, "learning_rate": 2.5682123942993852e-09, "loss": 0.0388, "step": 2509 }, { "epoch": 0.9872173058013766, "grad_norm": 0.9417904019355774, "learning_rate": 2.414946164246701e-09, "loss": 0.0448, "step": 2510 }, { "epoch": 0.9876106194690265, "grad_norm": 0.8849769830703735, "learning_rate": 2.2663928277896763e-09, "loss": 0.0482, "step": 2511 }, { "epoch": 0.9880039331366764, "grad_norm": 1.0469379425048828, "learning_rate": 2.122552665245747e-09, "loss": 0.0479, "step": 2512 }, { "epoch": 0.9883972468043265, "grad_norm": 0.4294953942298889, "learning_rate": 1.9834259480380756e-09, "loss": 0.017, "step": 2513 }, { "epoch": 0.9887905604719764, "grad_norm": 1.0931810140609741, "learning_rate": 1.8490129386963818e-09, "loss": 0.0376, "step": 2514 }, { "epoch": 0.9891838741396264, "grad_norm": 0.5045303702354431, "learning_rate": 1.719313890855001e-09, "loss": 0.0203, "step": 2515 }, { "epoch": 0.9895771878072763, "grad_norm": 1.2506543397903442, "learning_rate": 1.5943290492539953e-09, "loss": 0.0415, "step": 2516 }, { "epoch": 0.9899705014749263, "grad_norm": 0.6282764673233032, "learning_rate": 1.4740586497366538e-09, "loss": 0.043, "step": 2517 }, { "epoch": 0.9903638151425762, "grad_norm": 1.0732625722885132, "learning_rate": 1.358502919251159e-09, "loss": 0.049, "step": 2518 }, { "epoch": 0.9907571288102262, "grad_norm": 0.8076870441436768, "learning_rate": 1.247662075848921e-09, "loss": 0.0367, "step": 2519 }, { "epoch": 0.9911504424778761, "grad_norm": 1.1323729753494263, "learning_rate": 1.1415363286843007e-09, "loss": 0.0549, "step": 2520 }, { "epoch": 0.991543756145526, "grad_norm": 1.2635443210601807, "learning_rate": 1.0401258780146084e-09, "loss": 0.0375, "step": 2521 }, { "epoch": 0.991937069813176, "grad_norm": 1.430897831916809, "learning_rate": 9.434309151992727e-10, "loss": 0.075, "step": 2522 }, { "epoch": 0.9923303834808259, "grad_norm": 1.1660479307174683, "learning_rate": 8.514516226998393e-10, "loss": 0.0562, "step": 2523 }, { "epoch": 0.9927236971484759, "grad_norm": 2.029007911682129, "learning_rate": 7.641881740794166e-10, "loss": 0.0481, "step": 2524 }, { "epoch": 0.9931170108161259, "grad_norm": 0.7072765827178955, "learning_rate": 6.816407340023978e-10, "loss": 0.0188, "step": 2525 }, { "epoch": 0.9935103244837759, "grad_norm": 0.8789957165718079, "learning_rate": 6.03809458233906e-10, "loss": 0.0573, "step": 2526 }, { "epoch": 0.9939036381514258, "grad_norm": 0.7415314316749573, "learning_rate": 5.306944936406266e-10, "loss": 0.0458, "step": 2527 }, { "epoch": 0.9942969518190757, "grad_norm": 0.6154326796531677, "learning_rate": 4.622959781883096e-10, "loss": 0.0236, "step": 2528 }, { "epoch": 0.9946902654867257, "grad_norm": 0.810153067111969, "learning_rate": 3.9861404094426734e-10, "loss": 0.0443, "step": 2529 }, { "epoch": 0.9950835791543756, "grad_norm": 0.743605375289917, "learning_rate": 3.3964880207459916e-10, "loss": 0.052, "step": 2530 }, { "epoch": 0.9954768928220256, "grad_norm": 1.1516720056533813, "learning_rate": 2.8540037284557897e-10, "loss": 0.0729, "step": 2531 }, { "epoch": 0.9958702064896755, "grad_norm": 1.1776301860809326, "learning_rate": 2.358688556233779e-10, "loss": 0.0401, "step": 2532 }, { "epoch": 0.9962635201573254, "grad_norm": 1.0834025144577026, "learning_rate": 1.9105434387239886e-10, "loss": 0.0593, "step": 2533 }, { "epoch": 0.9966568338249754, "grad_norm": 1.4529463052749634, "learning_rate": 1.509569221569418e-10, "loss": 0.0423, "step": 2534 }, { "epoch": 0.9970501474926253, "grad_norm": 1.1381511688232422, "learning_rate": 1.1557666614037122e-10, "loss": 0.0411, "step": 2535 }, { "epoch": 0.9974434611602753, "grad_norm": 1.113553762435913, "learning_rate": 8.49136425840058e-11, "loss": 0.0611, "step": 2536 }, { "epoch": 0.9978367748279253, "grad_norm": 1.071913719177246, "learning_rate": 5.896790934878383e-11, "loss": 0.0609, "step": 2537 }, { "epoch": 0.9982300884955753, "grad_norm": 1.7356159687042236, "learning_rate": 3.7739515393320215e-11, "loss": 0.0524, "step": 2538 }, { "epoch": 0.9986234021632252, "grad_norm": 1.0763658285140991, "learning_rate": 2.122850077584948e-11, "loss": 0.0527, "step": 2539 }, { "epoch": 0.9990167158308751, "grad_norm": 0.6793241500854492, "learning_rate": 9.434896651727699e-12, "loss": 0.0462, "step": 2540 }, { "epoch": 0.9994100294985251, "grad_norm": 0.9101441502571106, "learning_rate": 2.358725275652951e-12, "loss": 0.0453, "step": 2541 }, { "epoch": 0.999803343166175, "grad_norm": 1.0394845008850098, "learning_rate": 0.0, "loss": 0.0578, "step": 2542 }, { "epoch": 0.999803343166175, "step": 2542, "total_flos": 5.5848341785175654e+17, "train_loss": 0.05740805761998535, "train_runtime": 78224.1342, "train_samples_per_second": 1.04, "train_steps_per_second": 0.032 } ], "logging_steps": 1.0, "max_steps": 2542, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.5848341785175654e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }