diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17836 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999803343166175, + "eval_steps": 500, + "global_step": 2542, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00039331366764995085, + "grad_norm": 7.214968204498291, + "learning_rate": 1.9607843137254902e-08, + "loss": 0.1717, + "step": 1 + }, + { + "epoch": 0.0007866273352999017, + "grad_norm": 8.45617961883545, + "learning_rate": 3.9215686274509804e-08, + "loss": 0.1394, + "step": 2 + }, + { + "epoch": 0.0011799410029498525, + "grad_norm": 9.644225120544434, + "learning_rate": 5.882352941176471e-08, + "loss": 0.1416, + "step": 3 + }, + { + "epoch": 0.0015732546705998034, + "grad_norm": 6.772904872894287, + "learning_rate": 7.843137254901961e-08, + "loss": 0.1696, + "step": 4 + }, + { + "epoch": 0.0019665683382497543, + "grad_norm": 11.89709758758545, + "learning_rate": 9.803921568627452e-08, + "loss": 0.2043, + "step": 5 + }, + { + "epoch": 0.002359882005899705, + "grad_norm": 30.768009185791016, + "learning_rate": 1.1764705882352942e-07, + "loss": 0.1557, + "step": 6 + }, + { + "epoch": 0.0027531956735496557, + "grad_norm": 7.8864569664001465, + "learning_rate": 1.3725490196078432e-07, + "loss": 0.1478, + "step": 7 + }, + { + "epoch": 0.003146509341199607, + "grad_norm": 10.4628267288208, + "learning_rate": 1.5686274509803921e-07, + "loss": 0.162, + "step": 8 + }, + { + "epoch": 0.0035398230088495575, + "grad_norm": 8.983762741088867, + "learning_rate": 1.7647058823529414e-07, + "loss": 0.1482, + "step": 9 + }, + { + "epoch": 0.003933136676499509, + "grad_norm": 9.961833953857422, + "learning_rate": 1.9607843137254904e-07, + "loss": 0.1851, + "step": 10 + }, + { + "epoch": 0.004326450344149459, + "grad_norm": 7.383552074432373, + "learning_rate": 2.1568627450980394e-07, + "loss": 0.1483, + "step": 11 + }, + { + "epoch": 0.00471976401179941, + "grad_norm": 10.243701934814453, + "learning_rate": 2.3529411764705883e-07, + "loss": 0.1457, + "step": 12 + }, + { + "epoch": 0.005113077679449361, + "grad_norm": 9.73193645477295, + "learning_rate": 2.5490196078431376e-07, + "loss": 0.1623, + "step": 13 + }, + { + "epoch": 0.005506391347099311, + "grad_norm": 6.044100284576416, + "learning_rate": 2.7450980392156863e-07, + "loss": 0.1346, + "step": 14 + }, + { + "epoch": 0.0058997050147492625, + "grad_norm": 28.241085052490234, + "learning_rate": 2.9411764705882356e-07, + "loss": 0.1583, + "step": 15 + }, + { + "epoch": 0.006293018682399214, + "grad_norm": 11.225924491882324, + "learning_rate": 3.1372549019607843e-07, + "loss": 0.1781, + "step": 16 + }, + { + "epoch": 0.006686332350049164, + "grad_norm": 9.774815559387207, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.1567, + "step": 17 + }, + { + "epoch": 0.007079646017699115, + "grad_norm": 10.569445610046387, + "learning_rate": 3.529411764705883e-07, + "loss": 0.1362, + "step": 18 + }, + { + "epoch": 0.007472959685349066, + "grad_norm": 6.202274322509766, + "learning_rate": 3.7254901960784315e-07, + "loss": 0.1493, + "step": 19 + }, + { + "epoch": 0.007866273352999017, + "grad_norm": 9.480630874633789, + "learning_rate": 3.921568627450981e-07, + "loss": 0.1528, + "step": 20 + }, + { + "epoch": 0.008259587020648967, + "grad_norm": 13.586874008178711, + "learning_rate": 4.1176470588235295e-07, + "loss": 0.1266, + "step": 21 + }, + { + "epoch": 0.008652900688298918, + "grad_norm": 11.455598831176758, + "learning_rate": 4.3137254901960787e-07, + "loss": 0.1225, + "step": 22 + }, + { + "epoch": 0.00904621435594887, + "grad_norm": 12.348589897155762, + "learning_rate": 4.509803921568628e-07, + "loss": 0.1863, + "step": 23 + }, + { + "epoch": 0.00943952802359882, + "grad_norm": 7.493137836456299, + "learning_rate": 4.7058823529411767e-07, + "loss": 0.1214, + "step": 24 + }, + { + "epoch": 0.00983284169124877, + "grad_norm": 11.203600883483887, + "learning_rate": 4.901960784313725e-07, + "loss": 0.1511, + "step": 25 + }, + { + "epoch": 0.010226155358898722, + "grad_norm": 10.017373085021973, + "learning_rate": 5.098039215686275e-07, + "loss": 0.1464, + "step": 26 + }, + { + "epoch": 0.010619469026548672, + "grad_norm": 7.930361270904541, + "learning_rate": 5.294117647058824e-07, + "loss": 0.1716, + "step": 27 + }, + { + "epoch": 0.011012782694198623, + "grad_norm": 6.609414577484131, + "learning_rate": 5.490196078431373e-07, + "loss": 0.1499, + "step": 28 + }, + { + "epoch": 0.011406096361848575, + "grad_norm": 9.198175430297852, + "learning_rate": 5.686274509803922e-07, + "loss": 0.1513, + "step": 29 + }, + { + "epoch": 0.011799410029498525, + "grad_norm": 7.527069091796875, + "learning_rate": 5.882352941176471e-07, + "loss": 0.1344, + "step": 30 + }, + { + "epoch": 0.012192723697148475, + "grad_norm": 25.97745704650879, + "learning_rate": 6.07843137254902e-07, + "loss": 0.1025, + "step": 31 + }, + { + "epoch": 0.012586037364798427, + "grad_norm": 6.214263916015625, + "learning_rate": 6.274509803921569e-07, + "loss": 0.1387, + "step": 32 + }, + { + "epoch": 0.012979351032448377, + "grad_norm": 7.101906776428223, + "learning_rate": 6.470588235294118e-07, + "loss": 0.1335, + "step": 33 + }, + { + "epoch": 0.013372664700098328, + "grad_norm": 7.696187496185303, + "learning_rate": 6.666666666666667e-07, + "loss": 0.1277, + "step": 34 + }, + { + "epoch": 0.01376597836774828, + "grad_norm": 9.324244499206543, + "learning_rate": 6.862745098039217e-07, + "loss": 0.1512, + "step": 35 + }, + { + "epoch": 0.01415929203539823, + "grad_norm": 3.9664223194122314, + "learning_rate": 7.058823529411766e-07, + "loss": 0.0816, + "step": 36 + }, + { + "epoch": 0.01455260570304818, + "grad_norm": 4.77344274520874, + "learning_rate": 7.254901960784315e-07, + "loss": 0.1036, + "step": 37 + }, + { + "epoch": 0.014945919370698132, + "grad_norm": 5.8425612449646, + "learning_rate": 7.450980392156863e-07, + "loss": 0.0857, + "step": 38 + }, + { + "epoch": 0.015339233038348082, + "grad_norm": 4.707705020904541, + "learning_rate": 7.647058823529413e-07, + "loss": 0.0905, + "step": 39 + }, + { + "epoch": 0.015732546705998034, + "grad_norm": 8.28884220123291, + "learning_rate": 7.843137254901962e-07, + "loss": 0.1273, + "step": 40 + }, + { + "epoch": 0.016125860373647983, + "grad_norm": 5.381669998168945, + "learning_rate": 8.039215686274511e-07, + "loss": 0.0938, + "step": 41 + }, + { + "epoch": 0.016519174041297935, + "grad_norm": 4.281416893005371, + "learning_rate": 8.235294117647059e-07, + "loss": 0.0935, + "step": 42 + }, + { + "epoch": 0.016912487708947887, + "grad_norm": 6.621143817901611, + "learning_rate": 8.431372549019609e-07, + "loss": 0.1002, + "step": 43 + }, + { + "epoch": 0.017305801376597835, + "grad_norm": 4.4914350509643555, + "learning_rate": 8.627450980392157e-07, + "loss": 0.097, + "step": 44 + }, + { + "epoch": 0.017699115044247787, + "grad_norm": 3.7035109996795654, + "learning_rate": 8.823529411764707e-07, + "loss": 0.0887, + "step": 45 + }, + { + "epoch": 0.01809242871189774, + "grad_norm": 4.306455612182617, + "learning_rate": 9.019607843137256e-07, + "loss": 0.1027, + "step": 46 + }, + { + "epoch": 0.018485742379547688, + "grad_norm": 5.768416881561279, + "learning_rate": 9.215686274509806e-07, + "loss": 0.1006, + "step": 47 + }, + { + "epoch": 0.01887905604719764, + "grad_norm": 19.471040725708008, + "learning_rate": 9.411764705882353e-07, + "loss": 0.1178, + "step": 48 + }, + { + "epoch": 0.019272369714847592, + "grad_norm": 6.249476432800293, + "learning_rate": 9.607843137254904e-07, + "loss": 0.1, + "step": 49 + }, + { + "epoch": 0.01966568338249754, + "grad_norm": 5.785927772521973, + "learning_rate": 9.80392156862745e-07, + "loss": 0.078, + "step": 50 + }, + { + "epoch": 0.020058997050147492, + "grad_norm": 6.312557220458984, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.1117, + "step": 51 + }, + { + "epoch": 0.020452310717797444, + "grad_norm": 3.5102477073669434, + "learning_rate": 1.019607843137255e-06, + "loss": 0.0913, + "step": 52 + }, + { + "epoch": 0.020845624385447393, + "grad_norm": 6.845943450927734, + "learning_rate": 1.03921568627451e-06, + "loss": 0.1353, + "step": 53 + }, + { + "epoch": 0.021238938053097345, + "grad_norm": 5.505466461181641, + "learning_rate": 1.0588235294117648e-06, + "loss": 0.0965, + "step": 54 + }, + { + "epoch": 0.021632251720747297, + "grad_norm": 4.362204551696777, + "learning_rate": 1.0784313725490197e-06, + "loss": 0.0844, + "step": 55 + }, + { + "epoch": 0.022025565388397245, + "grad_norm": 4.358127117156982, + "learning_rate": 1.0980392156862745e-06, + "loss": 0.1155, + "step": 56 + }, + { + "epoch": 0.022418879056047197, + "grad_norm": 7.55561637878418, + "learning_rate": 1.1176470588235296e-06, + "loss": 0.0742, + "step": 57 + }, + { + "epoch": 0.02281219272369715, + "grad_norm": 5.882073879241943, + "learning_rate": 1.1372549019607845e-06, + "loss": 0.1112, + "step": 58 + }, + { + "epoch": 0.023205506391347098, + "grad_norm": 2.456120491027832, + "learning_rate": 1.1568627450980394e-06, + "loss": 0.0605, + "step": 59 + }, + { + "epoch": 0.02359882005899705, + "grad_norm": 19.60419273376465, + "learning_rate": 1.1764705882352942e-06, + "loss": 0.1267, + "step": 60 + }, + { + "epoch": 0.023992133726647002, + "grad_norm": 3.074788808822632, + "learning_rate": 1.196078431372549e-06, + "loss": 0.0821, + "step": 61 + }, + { + "epoch": 0.02438544739429695, + "grad_norm": 3.561314344406128, + "learning_rate": 1.215686274509804e-06, + "loss": 0.0572, + "step": 62 + }, + { + "epoch": 0.024778761061946902, + "grad_norm": 13.668036460876465, + "learning_rate": 1.235294117647059e-06, + "loss": 0.1268, + "step": 63 + }, + { + "epoch": 0.025172074729596854, + "grad_norm": 3.8883397579193115, + "learning_rate": 1.2549019607843137e-06, + "loss": 0.0849, + "step": 64 + }, + { + "epoch": 0.025565388397246803, + "grad_norm": 4.154886245727539, + "learning_rate": 1.2745098039215686e-06, + "loss": 0.1071, + "step": 65 + }, + { + "epoch": 0.025958702064896755, + "grad_norm": 5.3974127769470215, + "learning_rate": 1.2941176470588237e-06, + "loss": 0.0749, + "step": 66 + }, + { + "epoch": 0.026352015732546707, + "grad_norm": 3.088780164718628, + "learning_rate": 1.3137254901960785e-06, + "loss": 0.0768, + "step": 67 + }, + { + "epoch": 0.026745329400196655, + "grad_norm": 3.2044262886047363, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.0641, + "step": 68 + }, + { + "epoch": 0.027138643067846607, + "grad_norm": 5.424925327301025, + "learning_rate": 1.3529411764705883e-06, + "loss": 0.061, + "step": 69 + }, + { + "epoch": 0.02753195673549656, + "grad_norm": 4.061574935913086, + "learning_rate": 1.3725490196078434e-06, + "loss": 0.0851, + "step": 70 + }, + { + "epoch": 0.027925270403146508, + "grad_norm": 5.696750164031982, + "learning_rate": 1.3921568627450982e-06, + "loss": 0.1107, + "step": 71 + }, + { + "epoch": 0.02831858407079646, + "grad_norm": 4.410640716552734, + "learning_rate": 1.4117647058823531e-06, + "loss": 0.0714, + "step": 72 + }, + { + "epoch": 0.028711897738446412, + "grad_norm": 6.307974815368652, + "learning_rate": 1.4313725490196078e-06, + "loss": 0.0866, + "step": 73 + }, + { + "epoch": 0.02910521140609636, + "grad_norm": 2.53486967086792, + "learning_rate": 1.450980392156863e-06, + "loss": 0.0613, + "step": 74 + }, + { + "epoch": 0.029498525073746312, + "grad_norm": 6.9410881996154785, + "learning_rate": 1.4705882352941177e-06, + "loss": 0.086, + "step": 75 + }, + { + "epoch": 0.029891838741396264, + "grad_norm": 2.5871775150299072, + "learning_rate": 1.4901960784313726e-06, + "loss": 0.0507, + "step": 76 + }, + { + "epoch": 0.030285152409046213, + "grad_norm": 2.2673654556274414, + "learning_rate": 1.5098039215686275e-06, + "loss": 0.0676, + "step": 77 + }, + { + "epoch": 0.030678466076696165, + "grad_norm": 2.789076805114746, + "learning_rate": 1.5294117647058826e-06, + "loss": 0.0632, + "step": 78 + }, + { + "epoch": 0.031071779744346117, + "grad_norm": 6.127337455749512, + "learning_rate": 1.5490196078431374e-06, + "loss": 0.0498, + "step": 79 + }, + { + "epoch": 0.03146509341199607, + "grad_norm": 2.758253574371338, + "learning_rate": 1.5686274509803923e-06, + "loss": 0.0706, + "step": 80 + }, + { + "epoch": 0.03185840707964602, + "grad_norm": 6.687328815460205, + "learning_rate": 1.5882352941176472e-06, + "loss": 0.0961, + "step": 81 + }, + { + "epoch": 0.032251720747295966, + "grad_norm": 7.499604225158691, + "learning_rate": 1.6078431372549023e-06, + "loss": 0.0715, + "step": 82 + }, + { + "epoch": 0.03264503441494592, + "grad_norm": 6.008899211883545, + "learning_rate": 1.6274509803921571e-06, + "loss": 0.123, + "step": 83 + }, + { + "epoch": 0.03303834808259587, + "grad_norm": 4.841026306152344, + "learning_rate": 1.6470588235294118e-06, + "loss": 0.0647, + "step": 84 + }, + { + "epoch": 0.03343166175024582, + "grad_norm": 3.0710766315460205, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.0372, + "step": 85 + }, + { + "epoch": 0.033824975417895774, + "grad_norm": 3.3783321380615234, + "learning_rate": 1.6862745098039217e-06, + "loss": 0.0843, + "step": 86 + }, + { + "epoch": 0.03421828908554572, + "grad_norm": 2.6547350883483887, + "learning_rate": 1.7058823529411766e-06, + "loss": 0.0589, + "step": 87 + }, + { + "epoch": 0.03461160275319567, + "grad_norm": 3.6741859912872314, + "learning_rate": 1.7254901960784315e-06, + "loss": 0.0308, + "step": 88 + }, + { + "epoch": 0.035004916420845626, + "grad_norm": 3.555490493774414, + "learning_rate": 1.7450980392156864e-06, + "loss": 0.0497, + "step": 89 + }, + { + "epoch": 0.035398230088495575, + "grad_norm": 3.1174697875976562, + "learning_rate": 1.7647058823529414e-06, + "loss": 0.063, + "step": 90 + }, + { + "epoch": 0.03579154375614552, + "grad_norm": 4.790848255157471, + "learning_rate": 1.7843137254901963e-06, + "loss": 0.0834, + "step": 91 + }, + { + "epoch": 0.03618485742379548, + "grad_norm": 3.2931265830993652, + "learning_rate": 1.8039215686274512e-06, + "loss": 0.0531, + "step": 92 + }, + { + "epoch": 0.03657817109144543, + "grad_norm": 13.777477264404297, + "learning_rate": 1.8235294117647058e-06, + "loss": 0.0786, + "step": 93 + }, + { + "epoch": 0.036971484759095376, + "grad_norm": 4.943524360656738, + "learning_rate": 1.8431372549019611e-06, + "loss": 0.0602, + "step": 94 + }, + { + "epoch": 0.03736479842674533, + "grad_norm": 6.189723014831543, + "learning_rate": 1.8627450980392158e-06, + "loss": 0.0697, + "step": 95 + }, + { + "epoch": 0.03775811209439528, + "grad_norm": 3.5542352199554443, + "learning_rate": 1.8823529411764707e-06, + "loss": 0.0863, + "step": 96 + }, + { + "epoch": 0.03815142576204523, + "grad_norm": 5.407109260559082, + "learning_rate": 1.9019607843137255e-06, + "loss": 0.088, + "step": 97 + }, + { + "epoch": 0.038544739429695184, + "grad_norm": 3.3334732055664062, + "learning_rate": 1.921568627450981e-06, + "loss": 0.0889, + "step": 98 + }, + { + "epoch": 0.03893805309734513, + "grad_norm": 2.48398756980896, + "learning_rate": 1.9411764705882353e-06, + "loss": 0.0483, + "step": 99 + }, + { + "epoch": 0.03933136676499508, + "grad_norm": 2.3380913734436035, + "learning_rate": 1.96078431372549e-06, + "loss": 0.0707, + "step": 100 + }, + { + "epoch": 0.039724680432645036, + "grad_norm": 4.355076789855957, + "learning_rate": 1.980392156862745e-06, + "loss": 0.0639, + "step": 101 + }, + { + "epoch": 0.040117994100294985, + "grad_norm": 4.081620693206787, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.06, + "step": 102 + }, + { + "epoch": 0.04051130776794493, + "grad_norm": 4.437114715576172, + "learning_rate": 2.019607843137255e-06, + "loss": 0.1017, + "step": 103 + }, + { + "epoch": 0.04090462143559489, + "grad_norm": 4.925793647766113, + "learning_rate": 2.03921568627451e-06, + "loss": 0.0934, + "step": 104 + }, + { + "epoch": 0.04129793510324484, + "grad_norm": 2.085400104522705, + "learning_rate": 2.058823529411765e-06, + "loss": 0.058, + "step": 105 + }, + { + "epoch": 0.041691248770894786, + "grad_norm": 2.8664395809173584, + "learning_rate": 2.07843137254902e-06, + "loss": 0.0709, + "step": 106 + }, + { + "epoch": 0.04208456243854474, + "grad_norm": 1.7521601915359497, + "learning_rate": 2.0980392156862747e-06, + "loss": 0.031, + "step": 107 + }, + { + "epoch": 0.04247787610619469, + "grad_norm": 3.7575159072875977, + "learning_rate": 2.1176470588235296e-06, + "loss": 0.0777, + "step": 108 + }, + { + "epoch": 0.04287118977384464, + "grad_norm": 4.240278720855713, + "learning_rate": 2.1372549019607844e-06, + "loss": 0.0965, + "step": 109 + }, + { + "epoch": 0.043264503441494594, + "grad_norm": 3.841932773590088, + "learning_rate": 2.1568627450980393e-06, + "loss": 0.0844, + "step": 110 + }, + { + "epoch": 0.04365781710914454, + "grad_norm": 4.4334397315979, + "learning_rate": 2.176470588235294e-06, + "loss": 0.0956, + "step": 111 + }, + { + "epoch": 0.04405113077679449, + "grad_norm": 4.255678653717041, + "learning_rate": 2.196078431372549e-06, + "loss": 0.0855, + "step": 112 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 2.3486170768737793, + "learning_rate": 2.215686274509804e-06, + "loss": 0.0417, + "step": 113 + }, + { + "epoch": 0.044837758112094395, + "grad_norm": 2.222768783569336, + "learning_rate": 2.2352941176470592e-06, + "loss": 0.0556, + "step": 114 + }, + { + "epoch": 0.04523107177974434, + "grad_norm": 2.750119686126709, + "learning_rate": 2.254901960784314e-06, + "loss": 0.0481, + "step": 115 + }, + { + "epoch": 0.0456243854473943, + "grad_norm": 4.375302314758301, + "learning_rate": 2.274509803921569e-06, + "loss": 0.098, + "step": 116 + }, + { + "epoch": 0.04601769911504425, + "grad_norm": 3.7654221057891846, + "learning_rate": 2.2941176470588234e-06, + "loss": 0.1025, + "step": 117 + }, + { + "epoch": 0.046411012782694196, + "grad_norm": 2.422442674636841, + "learning_rate": 2.3137254901960787e-06, + "loss": 0.0675, + "step": 118 + }, + { + "epoch": 0.04680432645034415, + "grad_norm": 3.3458054065704346, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.067, + "step": 119 + }, + { + "epoch": 0.0471976401179941, + "grad_norm": 2.7424211502075195, + "learning_rate": 2.3529411764705885e-06, + "loss": 0.0774, + "step": 120 + }, + { + "epoch": 0.04759095378564405, + "grad_norm": 3.4825127124786377, + "learning_rate": 2.3725490196078433e-06, + "loss": 0.086, + "step": 121 + }, + { + "epoch": 0.047984267453294004, + "grad_norm": 55.36836242675781, + "learning_rate": 2.392156862745098e-06, + "loss": 0.0938, + "step": 122 + }, + { + "epoch": 0.04837758112094395, + "grad_norm": 2.256223201751709, + "learning_rate": 2.411764705882353e-06, + "loss": 0.0673, + "step": 123 + }, + { + "epoch": 0.0487708947885939, + "grad_norm": 3.8095710277557373, + "learning_rate": 2.431372549019608e-06, + "loss": 0.0728, + "step": 124 + }, + { + "epoch": 0.049164208456243856, + "grad_norm": 1.8562949895858765, + "learning_rate": 2.450980392156863e-06, + "loss": 0.0629, + "step": 125 + }, + { + "epoch": 0.049557522123893805, + "grad_norm": 4.999472618103027, + "learning_rate": 2.470588235294118e-06, + "loss": 0.059, + "step": 126 + }, + { + "epoch": 0.04995083579154375, + "grad_norm": 3.9088096618652344, + "learning_rate": 2.490196078431373e-06, + "loss": 0.0662, + "step": 127 + }, + { + "epoch": 0.05034414945919371, + "grad_norm": 4.975748062133789, + "learning_rate": 2.5098039215686274e-06, + "loss": 0.0688, + "step": 128 + }, + { + "epoch": 0.05073746312684366, + "grad_norm": 2.183948516845703, + "learning_rate": 2.5294117647058823e-06, + "loss": 0.0477, + "step": 129 + }, + { + "epoch": 0.051130776794493606, + "grad_norm": 4.890422821044922, + "learning_rate": 2.549019607843137e-06, + "loss": 0.0793, + "step": 130 + }, + { + "epoch": 0.05152409046214356, + "grad_norm": 4.04612398147583, + "learning_rate": 2.568627450980392e-06, + "loss": 0.0705, + "step": 131 + }, + { + "epoch": 0.05191740412979351, + "grad_norm": 2.8650074005126953, + "learning_rate": 2.5882352941176473e-06, + "loss": 0.0777, + "step": 132 + }, + { + "epoch": 0.05231071779744346, + "grad_norm": 3.9029088020324707, + "learning_rate": 2.6078431372549022e-06, + "loss": 0.0766, + "step": 133 + }, + { + "epoch": 0.052704031465093414, + "grad_norm": 2.4210422039031982, + "learning_rate": 2.627450980392157e-06, + "loss": 0.0663, + "step": 134 + }, + { + "epoch": 0.05309734513274336, + "grad_norm": 3.0176892280578613, + "learning_rate": 2.647058823529412e-06, + "loss": 0.0703, + "step": 135 + }, + { + "epoch": 0.05349065880039331, + "grad_norm": 13.886055946350098, + "learning_rate": 2.666666666666667e-06, + "loss": 0.064, + "step": 136 + }, + { + "epoch": 0.053883972468043266, + "grad_norm": 2.40460205078125, + "learning_rate": 2.6862745098039217e-06, + "loss": 0.0492, + "step": 137 + }, + { + "epoch": 0.054277286135693215, + "grad_norm": 3.829288959503174, + "learning_rate": 2.7058823529411766e-06, + "loss": 0.0564, + "step": 138 + }, + { + "epoch": 0.05467059980334316, + "grad_norm": 2.2005629539489746, + "learning_rate": 2.7254901960784314e-06, + "loss": 0.0483, + "step": 139 + }, + { + "epoch": 0.05506391347099312, + "grad_norm": 14.79651927947998, + "learning_rate": 2.7450980392156867e-06, + "loss": 0.0937, + "step": 140 + }, + { + "epoch": 0.05545722713864307, + "grad_norm": 1.6898876428604126, + "learning_rate": 2.7647058823529416e-06, + "loss": 0.0693, + "step": 141 + }, + { + "epoch": 0.055850540806293016, + "grad_norm": 3.5447332859039307, + "learning_rate": 2.7843137254901965e-06, + "loss": 0.1311, + "step": 142 + }, + { + "epoch": 0.05624385447394297, + "grad_norm": 2.291607618331909, + "learning_rate": 2.8039215686274514e-06, + "loss": 0.061, + "step": 143 + }, + { + "epoch": 0.05663716814159292, + "grad_norm": 4.079521656036377, + "learning_rate": 2.8235294117647062e-06, + "loss": 0.1169, + "step": 144 + }, + { + "epoch": 0.05703048180924287, + "grad_norm": 5.1168012619018555, + "learning_rate": 2.843137254901961e-06, + "loss": 0.0436, + "step": 145 + }, + { + "epoch": 0.057423795476892824, + "grad_norm": 4.056823253631592, + "learning_rate": 2.8627450980392155e-06, + "loss": 0.09, + "step": 146 + }, + { + "epoch": 0.05781710914454277, + "grad_norm": 2.1756484508514404, + "learning_rate": 2.8823529411764704e-06, + "loss": 0.0747, + "step": 147 + }, + { + "epoch": 0.05821042281219272, + "grad_norm": 2.8064467906951904, + "learning_rate": 2.901960784313726e-06, + "loss": 0.0261, + "step": 148 + }, + { + "epoch": 0.058603736479842676, + "grad_norm": 2.9834907054901123, + "learning_rate": 2.9215686274509806e-06, + "loss": 0.0735, + "step": 149 + }, + { + "epoch": 0.058997050147492625, + "grad_norm": 15.821993827819824, + "learning_rate": 2.9411764705882355e-06, + "loss": 0.0835, + "step": 150 + }, + { + "epoch": 0.05939036381514257, + "grad_norm": 6.1172709465026855, + "learning_rate": 2.9607843137254903e-06, + "loss": 0.0621, + "step": 151 + }, + { + "epoch": 0.05978367748279253, + "grad_norm": 3.961477041244507, + "learning_rate": 2.980392156862745e-06, + "loss": 0.0777, + "step": 152 + }, + { + "epoch": 0.06017699115044248, + "grad_norm": 3.682879686355591, + "learning_rate": 3e-06, + "loss": 0.0836, + "step": 153 + }, + { + "epoch": 0.060570304818092426, + "grad_norm": 1.2253718376159668, + "learning_rate": 3.019607843137255e-06, + "loss": 0.0255, + "step": 154 + }, + { + "epoch": 0.06096361848574238, + "grad_norm": 2.107466220855713, + "learning_rate": 3.03921568627451e-06, + "loss": 0.0698, + "step": 155 + }, + { + "epoch": 0.06135693215339233, + "grad_norm": 2.720797061920166, + "learning_rate": 3.058823529411765e-06, + "loss": 0.0683, + "step": 156 + }, + { + "epoch": 0.06175024582104228, + "grad_norm": 2.0135252475738525, + "learning_rate": 3.07843137254902e-06, + "loss": 0.0594, + "step": 157 + }, + { + "epoch": 0.062143559488692234, + "grad_norm": 2.011382579803467, + "learning_rate": 3.098039215686275e-06, + "loss": 0.0643, + "step": 158 + }, + { + "epoch": 0.06253687315634218, + "grad_norm": 3.047201156616211, + "learning_rate": 3.1176470588235297e-06, + "loss": 0.0564, + "step": 159 + }, + { + "epoch": 0.06293018682399214, + "grad_norm": 2.3302555084228516, + "learning_rate": 3.1372549019607846e-06, + "loss": 0.0404, + "step": 160 + }, + { + "epoch": 0.06332350049164208, + "grad_norm": 2.7288010120391846, + "learning_rate": 3.1568627450980395e-06, + "loss": 0.1009, + "step": 161 + }, + { + "epoch": 0.06371681415929203, + "grad_norm": 2.852647304534912, + "learning_rate": 3.1764705882352943e-06, + "loss": 0.0508, + "step": 162 + }, + { + "epoch": 0.06411012782694199, + "grad_norm": 2.101698637008667, + "learning_rate": 3.1960784313725492e-06, + "loss": 0.0814, + "step": 163 + }, + { + "epoch": 0.06450344149459193, + "grad_norm": 2.864086151123047, + "learning_rate": 3.2156862745098045e-06, + "loss": 0.0543, + "step": 164 + }, + { + "epoch": 0.06489675516224189, + "grad_norm": 2.587751865386963, + "learning_rate": 3.2352941176470594e-06, + "loss": 0.0753, + "step": 165 + }, + { + "epoch": 0.06529006882989184, + "grad_norm": 1.5767340660095215, + "learning_rate": 3.2549019607843143e-06, + "loss": 0.0399, + "step": 166 + }, + { + "epoch": 0.06568338249754178, + "grad_norm": 3.7279415130615234, + "learning_rate": 3.2745098039215687e-06, + "loss": 0.0804, + "step": 167 + }, + { + "epoch": 0.06607669616519174, + "grad_norm": 2.9727795124053955, + "learning_rate": 3.2941176470588236e-06, + "loss": 0.0548, + "step": 168 + }, + { + "epoch": 0.0664700098328417, + "grad_norm": 2.0582468509674072, + "learning_rate": 3.3137254901960785e-06, + "loss": 0.0656, + "step": 169 + }, + { + "epoch": 0.06686332350049164, + "grad_norm": 7.246119499206543, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0499, + "step": 170 + }, + { + "epoch": 0.06725663716814159, + "grad_norm": 70.4866714477539, + "learning_rate": 3.352941176470588e-06, + "loss": 0.0764, + "step": 171 + }, + { + "epoch": 0.06764995083579155, + "grad_norm": 1.8262776136398315, + "learning_rate": 3.3725490196078435e-06, + "loss": 0.0497, + "step": 172 + }, + { + "epoch": 0.06804326450344149, + "grad_norm": 2.6392412185668945, + "learning_rate": 3.3921568627450984e-06, + "loss": 0.072, + "step": 173 + }, + { + "epoch": 0.06843657817109144, + "grad_norm": 1.2957279682159424, + "learning_rate": 3.4117647058823532e-06, + "loss": 0.0749, + "step": 174 + }, + { + "epoch": 0.0688298918387414, + "grad_norm": 1.5801424980163574, + "learning_rate": 3.431372549019608e-06, + "loss": 0.0504, + "step": 175 + }, + { + "epoch": 0.06922320550639134, + "grad_norm": 1.6194735765457153, + "learning_rate": 3.450980392156863e-06, + "loss": 0.0396, + "step": 176 + }, + { + "epoch": 0.0696165191740413, + "grad_norm": 3.31343674659729, + "learning_rate": 3.470588235294118e-06, + "loss": 0.0624, + "step": 177 + }, + { + "epoch": 0.07000983284169125, + "grad_norm": 2.1785762310028076, + "learning_rate": 3.4901960784313727e-06, + "loss": 0.0548, + "step": 178 + }, + { + "epoch": 0.0704031465093412, + "grad_norm": 1.3683737516403198, + "learning_rate": 3.5098039215686276e-06, + "loss": 0.0274, + "step": 179 + }, + { + "epoch": 0.07079646017699115, + "grad_norm": 3.2981035709381104, + "learning_rate": 3.529411764705883e-06, + "loss": 0.0816, + "step": 180 + }, + { + "epoch": 0.0711897738446411, + "grad_norm": 2.3660190105438232, + "learning_rate": 3.5490196078431378e-06, + "loss": 0.0445, + "step": 181 + }, + { + "epoch": 0.07158308751229105, + "grad_norm": 3.4103376865386963, + "learning_rate": 3.5686274509803926e-06, + "loss": 0.0959, + "step": 182 + }, + { + "epoch": 0.071976401179941, + "grad_norm": 2.7939486503601074, + "learning_rate": 3.5882352941176475e-06, + "loss": 0.096, + "step": 183 + }, + { + "epoch": 0.07236971484759096, + "grad_norm": 2.009209632873535, + "learning_rate": 3.6078431372549024e-06, + "loss": 0.0548, + "step": 184 + }, + { + "epoch": 0.0727630285152409, + "grad_norm": 1.9003010988235474, + "learning_rate": 3.6274509803921573e-06, + "loss": 0.058, + "step": 185 + }, + { + "epoch": 0.07315634218289085, + "grad_norm": 2.788331985473633, + "learning_rate": 3.6470588235294117e-06, + "loss": 0.0828, + "step": 186 + }, + { + "epoch": 0.07354965585054081, + "grad_norm": 2.2508130073547363, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.089, + "step": 187 + }, + { + "epoch": 0.07394296951819075, + "grad_norm": 14.532478332519531, + "learning_rate": 3.6862745098039223e-06, + "loss": 0.0878, + "step": 188 + }, + { + "epoch": 0.0743362831858407, + "grad_norm": 1.3768811225891113, + "learning_rate": 3.7058823529411767e-06, + "loss": 0.0534, + "step": 189 + }, + { + "epoch": 0.07472959685349066, + "grad_norm": 2.9948389530181885, + "learning_rate": 3.7254901960784316e-06, + "loss": 0.0704, + "step": 190 + }, + { + "epoch": 0.0751229105211406, + "grad_norm": 1.4626399278640747, + "learning_rate": 3.7450980392156865e-06, + "loss": 0.0306, + "step": 191 + }, + { + "epoch": 0.07551622418879056, + "grad_norm": 3.062840700149536, + "learning_rate": 3.7647058823529414e-06, + "loss": 0.0802, + "step": 192 + }, + { + "epoch": 0.07590953785644051, + "grad_norm": 5.729097843170166, + "learning_rate": 3.7843137254901962e-06, + "loss": 0.1013, + "step": 193 + }, + { + "epoch": 0.07630285152409046, + "grad_norm": 1.8716782331466675, + "learning_rate": 3.803921568627451e-06, + "loss": 0.0664, + "step": 194 + }, + { + "epoch": 0.07669616519174041, + "grad_norm": 2.058469533920288, + "learning_rate": 3.8235294117647055e-06, + "loss": 0.0683, + "step": 195 + }, + { + "epoch": 0.07708947885939037, + "grad_norm": 12.551715850830078, + "learning_rate": 3.843137254901962e-06, + "loss": 0.09, + "step": 196 + }, + { + "epoch": 0.07748279252704031, + "grad_norm": 2.2984426021575928, + "learning_rate": 3.862745098039216e-06, + "loss": 0.0672, + "step": 197 + }, + { + "epoch": 0.07787610619469026, + "grad_norm": 4.480764865875244, + "learning_rate": 3.882352941176471e-06, + "loss": 0.051, + "step": 198 + }, + { + "epoch": 0.07826941986234022, + "grad_norm": 1.4032012224197388, + "learning_rate": 3.901960784313726e-06, + "loss": 0.0289, + "step": 199 + }, + { + "epoch": 0.07866273352999016, + "grad_norm": 3.133589029312134, + "learning_rate": 3.92156862745098e-06, + "loss": 0.0807, + "step": 200 + }, + { + "epoch": 0.07905604719764012, + "grad_norm": 4.1782307624816895, + "learning_rate": 3.941176470588236e-06, + "loss": 0.0683, + "step": 201 + }, + { + "epoch": 0.07944936086529007, + "grad_norm": 11.163358688354492, + "learning_rate": 3.96078431372549e-06, + "loss": 0.0421, + "step": 202 + }, + { + "epoch": 0.07984267453294001, + "grad_norm": 1.3736735582351685, + "learning_rate": 3.980392156862745e-06, + "loss": 0.0339, + "step": 203 + }, + { + "epoch": 0.08023598820058997, + "grad_norm": 6.474332332611084, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0606, + "step": 204 + }, + { + "epoch": 0.08062930186823992, + "grad_norm": 2.8827829360961914, + "learning_rate": 4.019607843137255e-06, + "loss": 0.1104, + "step": 205 + }, + { + "epoch": 0.08102261553588987, + "grad_norm": 1.8476606607437134, + "learning_rate": 4.03921568627451e-06, + "loss": 0.0479, + "step": 206 + }, + { + "epoch": 0.08141592920353982, + "grad_norm": 3.2202746868133545, + "learning_rate": 4.058823529411765e-06, + "loss": 0.088, + "step": 207 + }, + { + "epoch": 0.08180924287118978, + "grad_norm": 3.4121432304382324, + "learning_rate": 4.07843137254902e-06, + "loss": 0.1051, + "step": 208 + }, + { + "epoch": 0.08220255653883972, + "grad_norm": 2.4771883487701416, + "learning_rate": 4.098039215686275e-06, + "loss": 0.0477, + "step": 209 + }, + { + "epoch": 0.08259587020648967, + "grad_norm": 2.9881558418273926, + "learning_rate": 4.11764705882353e-06, + "loss": 0.0472, + "step": 210 + }, + { + "epoch": 0.08298918387413963, + "grad_norm": 2.8722712993621826, + "learning_rate": 4.137254901960784e-06, + "loss": 0.0856, + "step": 211 + }, + { + "epoch": 0.08338249754178957, + "grad_norm": 1.9073129892349243, + "learning_rate": 4.15686274509804e-06, + "loss": 0.0542, + "step": 212 + }, + { + "epoch": 0.08377581120943953, + "grad_norm": 3.5067648887634277, + "learning_rate": 4.176470588235295e-06, + "loss": 0.0567, + "step": 213 + }, + { + "epoch": 0.08416912487708948, + "grad_norm": 2.5827410221099854, + "learning_rate": 4.196078431372549e-06, + "loss": 0.1062, + "step": 214 + }, + { + "epoch": 0.08456243854473942, + "grad_norm": 1.8257296085357666, + "learning_rate": 4.215686274509805e-06, + "loss": 0.0821, + "step": 215 + }, + { + "epoch": 0.08495575221238938, + "grad_norm": 3.9571404457092285, + "learning_rate": 4.235294117647059e-06, + "loss": 0.1143, + "step": 216 + }, + { + "epoch": 0.08534906588003933, + "grad_norm": 2.6589484214782715, + "learning_rate": 4.254901960784314e-06, + "loss": 0.0814, + "step": 217 + }, + { + "epoch": 0.08574237954768928, + "grad_norm": 0.915239155292511, + "learning_rate": 4.274509803921569e-06, + "loss": 0.0355, + "step": 218 + }, + { + "epoch": 0.08613569321533923, + "grad_norm": 2.9066381454467773, + "learning_rate": 4.294117647058823e-06, + "loss": 0.0783, + "step": 219 + }, + { + "epoch": 0.08652900688298919, + "grad_norm": 1.581722378730774, + "learning_rate": 4.313725490196079e-06, + "loss": 0.0589, + "step": 220 + }, + { + "epoch": 0.08692232055063913, + "grad_norm": 2.2173354625701904, + "learning_rate": 4.333333333333334e-06, + "loss": 0.0791, + "step": 221 + }, + { + "epoch": 0.08731563421828908, + "grad_norm": 1.784740686416626, + "learning_rate": 4.352941176470588e-06, + "loss": 0.0616, + "step": 222 + }, + { + "epoch": 0.08770894788593904, + "grad_norm": 1.9993363618850708, + "learning_rate": 4.372549019607844e-06, + "loss": 0.0864, + "step": 223 + }, + { + "epoch": 0.08810226155358898, + "grad_norm": 4.089532375335693, + "learning_rate": 4.392156862745098e-06, + "loss": 0.0982, + "step": 224 + }, + { + "epoch": 0.08849557522123894, + "grad_norm": 2.5914440155029297, + "learning_rate": 4.411764705882353e-06, + "loss": 0.0702, + "step": 225 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 2.555253028869629, + "learning_rate": 4.431372549019608e-06, + "loss": 0.0831, + "step": 226 + }, + { + "epoch": 0.08928220255653883, + "grad_norm": 2.2960548400878906, + "learning_rate": 4.450980392156863e-06, + "loss": 0.0641, + "step": 227 + }, + { + "epoch": 0.08967551622418879, + "grad_norm": 1.402106761932373, + "learning_rate": 4.4705882352941184e-06, + "loss": 0.0594, + "step": 228 + }, + { + "epoch": 0.09006882989183874, + "grad_norm": 3.1225955486297607, + "learning_rate": 4.490196078431373e-06, + "loss": 0.1042, + "step": 229 + }, + { + "epoch": 0.09046214355948869, + "grad_norm": 1.7568937540054321, + "learning_rate": 4.509803921568628e-06, + "loss": 0.0689, + "step": 230 + }, + { + "epoch": 0.09085545722713864, + "grad_norm": 2.8846213817596436, + "learning_rate": 4.529411764705883e-06, + "loss": 0.0955, + "step": 231 + }, + { + "epoch": 0.0912487708947886, + "grad_norm": 4.436802387237549, + "learning_rate": 4.549019607843138e-06, + "loss": 0.1668, + "step": 232 + }, + { + "epoch": 0.09164208456243854, + "grad_norm": 2.784074068069458, + "learning_rate": 4.568627450980392e-06, + "loss": 0.083, + "step": 233 + }, + { + "epoch": 0.0920353982300885, + "grad_norm": 2.276759147644043, + "learning_rate": 4.588235294117647e-06, + "loss": 0.0725, + "step": 234 + }, + { + "epoch": 0.09242871189773845, + "grad_norm": 2.5278875827789307, + "learning_rate": 4.607843137254902e-06, + "loss": 0.0744, + "step": 235 + }, + { + "epoch": 0.09282202556538839, + "grad_norm": 1.711602807044983, + "learning_rate": 4.627450980392157e-06, + "loss": 0.0749, + "step": 236 + }, + { + "epoch": 0.09321533923303835, + "grad_norm": 1.4517807960510254, + "learning_rate": 4.647058823529412e-06, + "loss": 0.0587, + "step": 237 + }, + { + "epoch": 0.0936086529006883, + "grad_norm": 1.090840220451355, + "learning_rate": 4.666666666666667e-06, + "loss": 0.0719, + "step": 238 + }, + { + "epoch": 0.09400196656833824, + "grad_norm": 1.8589414358139038, + "learning_rate": 4.686274509803922e-06, + "loss": 0.0563, + "step": 239 + }, + { + "epoch": 0.0943952802359882, + "grad_norm": 2.264702081680298, + "learning_rate": 4.705882352941177e-06, + "loss": 0.0648, + "step": 240 + }, + { + "epoch": 0.09478859390363815, + "grad_norm": 1.4464210271835327, + "learning_rate": 4.725490196078431e-06, + "loss": 0.0238, + "step": 241 + }, + { + "epoch": 0.0951819075712881, + "grad_norm": 1.9937217235565186, + "learning_rate": 4.745098039215687e-06, + "loss": 0.0493, + "step": 242 + }, + { + "epoch": 0.09557522123893805, + "grad_norm": 2.2047340869903564, + "learning_rate": 4.764705882352941e-06, + "loss": 0.091, + "step": 243 + }, + { + "epoch": 0.09596853490658801, + "grad_norm": 4.057810306549072, + "learning_rate": 4.784313725490196e-06, + "loss": 0.0938, + "step": 244 + }, + { + "epoch": 0.09636184857423795, + "grad_norm": 1.6187644004821777, + "learning_rate": 4.803921568627452e-06, + "loss": 0.0673, + "step": 245 + }, + { + "epoch": 0.0967551622418879, + "grad_norm": 2.7249605655670166, + "learning_rate": 4.823529411764706e-06, + "loss": 0.0848, + "step": 246 + }, + { + "epoch": 0.09714847590953786, + "grad_norm": 1.7594577074050903, + "learning_rate": 4.8431372549019614e-06, + "loss": 0.0594, + "step": 247 + }, + { + "epoch": 0.0975417895771878, + "grad_norm": 2.6266980171203613, + "learning_rate": 4.862745098039216e-06, + "loss": 0.0866, + "step": 248 + }, + { + "epoch": 0.09793510324483776, + "grad_norm": 3.3526737689971924, + "learning_rate": 4.882352941176471e-06, + "loss": 0.1115, + "step": 249 + }, + { + "epoch": 0.09832841691248771, + "grad_norm": 2.7514872550964355, + "learning_rate": 4.901960784313726e-06, + "loss": 0.0694, + "step": 250 + }, + { + "epoch": 0.09872173058013765, + "grad_norm": 2.44143009185791, + "learning_rate": 4.921568627450981e-06, + "loss": 0.0715, + "step": 251 + }, + { + "epoch": 0.09911504424778761, + "grad_norm": 2.214268207550049, + "learning_rate": 4.941176470588236e-06, + "loss": 0.0576, + "step": 252 + }, + { + "epoch": 0.09950835791543756, + "grad_norm": 1.7012481689453125, + "learning_rate": 4.960784313725491e-06, + "loss": 0.0754, + "step": 253 + }, + { + "epoch": 0.0999016715830875, + "grad_norm": 1.8335487842559814, + "learning_rate": 4.980392156862746e-06, + "loss": 0.0617, + "step": 254 + }, + { + "epoch": 0.10029498525073746, + "grad_norm": 2.3848774433135986, + "learning_rate": 5e-06, + "loss": 0.1011, + "step": 255 + }, + { + "epoch": 0.10068829891838742, + "grad_norm": 2.1847634315490723, + "learning_rate": 4.999997641274725e-06, + "loss": 0.0793, + "step": 256 + }, + { + "epoch": 0.10108161258603736, + "grad_norm": 1.5467146635055542, + "learning_rate": 4.999990565103349e-06, + "loss": 0.0685, + "step": 257 + }, + { + "epoch": 0.10147492625368731, + "grad_norm": 1.5211800336837769, + "learning_rate": 4.999978771499224e-06, + "loss": 0.0453, + "step": 258 + }, + { + "epoch": 0.10186823992133727, + "grad_norm": 1.944356918334961, + "learning_rate": 4.999962260484607e-06, + "loss": 0.0726, + "step": 259 + }, + { + "epoch": 0.10226155358898721, + "grad_norm": 2.206536054611206, + "learning_rate": 4.999941032090652e-06, + "loss": 0.0963, + "step": 260 + }, + { + "epoch": 0.10265486725663717, + "grad_norm": 0.9998722076416016, + "learning_rate": 4.999915086357417e-06, + "loss": 0.0425, + "step": 261 + }, + { + "epoch": 0.10304818092428712, + "grad_norm": 2.102257013320923, + "learning_rate": 4.99988442333386e-06, + "loss": 0.0857, + "step": 262 + }, + { + "epoch": 0.10344149459193706, + "grad_norm": 2.055304765701294, + "learning_rate": 4.999849043077843e-06, + "loss": 0.058, + "step": 263 + }, + { + "epoch": 0.10383480825958702, + "grad_norm": 2.11883544921875, + "learning_rate": 4.999808945656128e-06, + "loss": 0.1135, + "step": 264 + }, + { + "epoch": 0.10422812192723697, + "grad_norm": 1.4651076793670654, + "learning_rate": 4.999764131144377e-06, + "loss": 0.0609, + "step": 265 + }, + { + "epoch": 0.10462143559488692, + "grad_norm": 1.3278563022613525, + "learning_rate": 4.999714599627155e-06, + "loss": 0.0506, + "step": 266 + }, + { + "epoch": 0.10501474926253687, + "grad_norm": 3.376959800720215, + "learning_rate": 4.999660351197926e-06, + "loss": 0.0505, + "step": 267 + }, + { + "epoch": 0.10540806293018683, + "grad_norm": 14.901459693908691, + "learning_rate": 4.999601385959056e-06, + "loss": 0.0717, + "step": 268 + }, + { + "epoch": 0.10580137659783677, + "grad_norm": 1.7644176483154297, + "learning_rate": 4.999537704021812e-06, + "loss": 0.1109, + "step": 269 + }, + { + "epoch": 0.10619469026548672, + "grad_norm": 1.3101154565811157, + "learning_rate": 4.99946930550636e-06, + "loss": 0.0433, + "step": 270 + }, + { + "epoch": 0.10658800393313668, + "grad_norm": 3.403160572052002, + "learning_rate": 4.999396190541766e-06, + "loss": 0.1082, + "step": 271 + }, + { + "epoch": 0.10698131760078662, + "grad_norm": 2.1354033946990967, + "learning_rate": 4.999318359265998e-06, + "loss": 0.0698, + "step": 272 + }, + { + "epoch": 0.10737463126843658, + "grad_norm": 1.1540406942367554, + "learning_rate": 4.999235811825921e-06, + "loss": 0.0857, + "step": 273 + }, + { + "epoch": 0.10776794493608653, + "grad_norm": 1.4908989667892456, + "learning_rate": 4.9991485483773e-06, + "loss": 0.0627, + "step": 274 + }, + { + "epoch": 0.10816125860373647, + "grad_norm": 1.5307058095932007, + "learning_rate": 4.999056569084801e-06, + "loss": 0.0555, + "step": 275 + }, + { + "epoch": 0.10855457227138643, + "grad_norm": 2.4000704288482666, + "learning_rate": 4.998959874121986e-06, + "loss": 0.068, + "step": 276 + }, + { + "epoch": 0.10894788593903638, + "grad_norm": 1.2169445753097534, + "learning_rate": 4.998858463671316e-06, + "loss": 0.0716, + "step": 277 + }, + { + "epoch": 0.10934119960668633, + "grad_norm": 1.496738076210022, + "learning_rate": 4.998752337924152e-06, + "loss": 0.063, + "step": 278 + }, + { + "epoch": 0.10973451327433628, + "grad_norm": 1.3070656061172485, + "learning_rate": 4.998641497080749e-06, + "loss": 0.0444, + "step": 279 + }, + { + "epoch": 0.11012782694198624, + "grad_norm": 3.1283788681030273, + "learning_rate": 4.998525941350264e-06, + "loss": 0.1097, + "step": 280 + }, + { + "epoch": 0.11052114060963618, + "grad_norm": 2.3517940044403076, + "learning_rate": 4.998405670950747e-06, + "loss": 0.0778, + "step": 281 + }, + { + "epoch": 0.11091445427728613, + "grad_norm": 1.4366756677627563, + "learning_rate": 4.998280686109146e-06, + "loss": 0.0645, + "step": 282 + }, + { + "epoch": 0.11130776794493609, + "grad_norm": 1.5536798238754272, + "learning_rate": 4.998150987061304e-06, + "loss": 0.0483, + "step": 283 + }, + { + "epoch": 0.11170108161258603, + "grad_norm": 2.191906690597534, + "learning_rate": 4.9980165740519625e-06, + "loss": 0.061, + "step": 284 + }, + { + "epoch": 0.11209439528023599, + "grad_norm": 2.2331135272979736, + "learning_rate": 4.997877447334754e-06, + "loss": 0.073, + "step": 285 + }, + { + "epoch": 0.11248770894788594, + "grad_norm": 2.7030222415924072, + "learning_rate": 4.99773360717221e-06, + "loss": 0.0924, + "step": 286 + }, + { + "epoch": 0.11288102261553588, + "grad_norm": 1.2399053573608398, + "learning_rate": 4.997585053835754e-06, + "loss": 0.0603, + "step": 287 + }, + { + "epoch": 0.11327433628318584, + "grad_norm": 1.5186935663223267, + "learning_rate": 4.997431787605701e-06, + "loss": 0.0733, + "step": 288 + }, + { + "epoch": 0.1136676499508358, + "grad_norm": 5.53955078125, + "learning_rate": 4.997273808771263e-06, + "loss": 0.0735, + "step": 289 + }, + { + "epoch": 0.11406096361848574, + "grad_norm": 1.861646294593811, + "learning_rate": 4.997111117630543e-06, + "loss": 0.0365, + "step": 290 + }, + { + "epoch": 0.11445427728613569, + "grad_norm": 1.5158923864364624, + "learning_rate": 4.996943714490535e-06, + "loss": 0.0598, + "step": 291 + }, + { + "epoch": 0.11484759095378565, + "grad_norm": 3.7808361053466797, + "learning_rate": 4.996771599667126e-06, + "loss": 0.09, + "step": 292 + }, + { + "epoch": 0.11524090462143559, + "grad_norm": 1.3470269441604614, + "learning_rate": 4.996594773485093e-06, + "loss": 0.0304, + "step": 293 + }, + { + "epoch": 0.11563421828908554, + "grad_norm": 2.0843825340270996, + "learning_rate": 4.996413236278104e-06, + "loss": 0.0556, + "step": 294 + }, + { + "epoch": 0.1160275319567355, + "grad_norm": 1.6657154560089111, + "learning_rate": 4.996226988388716e-06, + "loss": 0.0628, + "step": 295 + }, + { + "epoch": 0.11642084562438544, + "grad_norm": 1.9300707578659058, + "learning_rate": 4.9960360301683755e-06, + "loss": 0.0701, + "step": 296 + }, + { + "epoch": 0.1168141592920354, + "grad_norm": 1.6507627964019775, + "learning_rate": 4.995840361977416e-06, + "loss": 0.0783, + "step": 297 + }, + { + "epoch": 0.11720747295968535, + "grad_norm": 1.9679419994354248, + "learning_rate": 4.995639984185059e-06, + "loss": 0.0714, + "step": 298 + }, + { + "epoch": 0.1176007866273353, + "grad_norm": 1.7199714183807373, + "learning_rate": 4.9954348971694146e-06, + "loss": 0.046, + "step": 299 + }, + { + "epoch": 0.11799410029498525, + "grad_norm": 1.3099826574325562, + "learning_rate": 4.995225101317478e-06, + "loss": 0.0542, + "step": 300 + }, + { + "epoch": 0.1183874139626352, + "grad_norm": 1.4102526903152466, + "learning_rate": 4.99501059702513e-06, + "loss": 0.07, + "step": 301 + }, + { + "epoch": 0.11878072763028515, + "grad_norm": 2.6054928302764893, + "learning_rate": 4.9947913846971345e-06, + "loss": 0.0753, + "step": 302 + }, + { + "epoch": 0.1191740412979351, + "grad_norm": 2.4399526119232178, + "learning_rate": 4.994567464747141e-06, + "loss": 0.1051, + "step": 303 + }, + { + "epoch": 0.11956735496558506, + "grad_norm": 3.065548896789551, + "learning_rate": 4.994338837597683e-06, + "loss": 0.0955, + "step": 304 + }, + { + "epoch": 0.119960668633235, + "grad_norm": 1.3317792415618896, + "learning_rate": 4.994105503680176e-06, + "loss": 0.0595, + "step": 305 + }, + { + "epoch": 0.12035398230088495, + "grad_norm": 1.5237491130828857, + "learning_rate": 4.993867463434916e-06, + "loss": 0.0909, + "step": 306 + }, + { + "epoch": 0.12074729596853491, + "grad_norm": 0.8940740823745728, + "learning_rate": 4.9936247173110785e-06, + "loss": 0.0628, + "step": 307 + }, + { + "epoch": 0.12114060963618485, + "grad_norm": 2.6642251014709473, + "learning_rate": 4.993377265766723e-06, + "loss": 0.0679, + "step": 308 + }, + { + "epoch": 0.1215339233038348, + "grad_norm": 2.868943452835083, + "learning_rate": 4.993125109268784e-06, + "loss": 0.047, + "step": 309 + }, + { + "epoch": 0.12192723697148476, + "grad_norm": 1.1550475358963013, + "learning_rate": 4.992868248293077e-06, + "loss": 0.0771, + "step": 310 + }, + { + "epoch": 0.1223205506391347, + "grad_norm": 1.7380859851837158, + "learning_rate": 4.9926066833242926e-06, + "loss": 0.0573, + "step": 311 + }, + { + "epoch": 0.12271386430678466, + "grad_norm": 1.8886913061141968, + "learning_rate": 4.9923404148559995e-06, + "loss": 0.1034, + "step": 312 + }, + { + "epoch": 0.12310717797443461, + "grad_norm": 1.5682885646820068, + "learning_rate": 4.992069443390641e-06, + "loss": 0.0595, + "step": 313 + }, + { + "epoch": 0.12350049164208456, + "grad_norm": 2.2674522399902344, + "learning_rate": 4.991793769439534e-06, + "loss": 0.0855, + "step": 314 + }, + { + "epoch": 0.12389380530973451, + "grad_norm": 1.3800448179244995, + "learning_rate": 4.991513393522871e-06, + "loss": 0.0537, + "step": 315 + }, + { + "epoch": 0.12428711897738447, + "grad_norm": 1.9727108478546143, + "learning_rate": 4.991228316169715e-06, + "loss": 0.0698, + "step": 316 + }, + { + "epoch": 0.12468043264503441, + "grad_norm": 1.1997886896133423, + "learning_rate": 4.990938537918001e-06, + "loss": 0.0513, + "step": 317 + }, + { + "epoch": 0.12507374631268436, + "grad_norm": 1.0357115268707275, + "learning_rate": 4.990644059314536e-06, + "loss": 0.0537, + "step": 318 + }, + { + "epoch": 0.1254670599803343, + "grad_norm": 2.9861936569213867, + "learning_rate": 4.990344880914994e-06, + "loss": 0.0836, + "step": 319 + }, + { + "epoch": 0.12586037364798427, + "grad_norm": 1.0183316469192505, + "learning_rate": 4.990041003283921e-06, + "loss": 0.0595, + "step": 320 + }, + { + "epoch": 0.12625368731563422, + "grad_norm": 3.085170269012451, + "learning_rate": 4.989732426994725e-06, + "loss": 0.1097, + "step": 321 + }, + { + "epoch": 0.12664700098328416, + "grad_norm": 1.6864210367202759, + "learning_rate": 4.989419152629685e-06, + "loss": 0.0546, + "step": 322 + }, + { + "epoch": 0.12704031465093413, + "grad_norm": 1.678736686706543, + "learning_rate": 4.9891011807799435e-06, + "loss": 0.0436, + "step": 323 + }, + { + "epoch": 0.12743362831858407, + "grad_norm": 1.6153947114944458, + "learning_rate": 4.988778512045507e-06, + "loss": 0.0885, + "step": 324 + }, + { + "epoch": 0.127826941986234, + "grad_norm": 2.239644765853882, + "learning_rate": 4.9884511470352456e-06, + "loss": 0.0841, + "step": 325 + }, + { + "epoch": 0.12822025565388398, + "grad_norm": 2.258629560470581, + "learning_rate": 4.9881190863668895e-06, + "loss": 0.0547, + "step": 326 + }, + { + "epoch": 0.12861356932153392, + "grad_norm": 1.519643783569336, + "learning_rate": 4.98778233066703e-06, + "loss": 0.076, + "step": 327 + }, + { + "epoch": 0.12900688298918386, + "grad_norm": 2.382768154144287, + "learning_rate": 4.987440880571121e-06, + "loss": 0.0754, + "step": 328 + }, + { + "epoch": 0.12940019665683383, + "grad_norm": 1.1717922687530518, + "learning_rate": 4.98709473672347e-06, + "loss": 0.0431, + "step": 329 + }, + { + "epoch": 0.12979351032448377, + "grad_norm": 2.597674608230591, + "learning_rate": 4.986743899777244e-06, + "loss": 0.0831, + "step": 330 + }, + { + "epoch": 0.13018682399213372, + "grad_norm": 2.2018444538116455, + "learning_rate": 4.986388370394466e-06, + "loss": 0.0967, + "step": 331 + }, + { + "epoch": 0.13058013765978368, + "grad_norm": 2.4188756942749023, + "learning_rate": 4.986028149246013e-06, + "loss": 0.0706, + "step": 332 + }, + { + "epoch": 0.13097345132743363, + "grad_norm": 1.3178000450134277, + "learning_rate": 4.985663237011614e-06, + "loss": 0.0814, + "step": 333 + }, + { + "epoch": 0.13136676499508357, + "grad_norm": 1.007521390914917, + "learning_rate": 4.985293634379852e-06, + "loss": 0.0518, + "step": 334 + }, + { + "epoch": 0.13176007866273354, + "grad_norm": 2.3999087810516357, + "learning_rate": 4.984919342048159e-06, + "loss": 0.0526, + "step": 335 + }, + { + "epoch": 0.13215339233038348, + "grad_norm": 2.07135272026062, + "learning_rate": 4.984540360722819e-06, + "loss": 0.0493, + "step": 336 + }, + { + "epoch": 0.13254670599803342, + "grad_norm": 1.2785420417785645, + "learning_rate": 4.98415669111896e-06, + "loss": 0.0671, + "step": 337 + }, + { + "epoch": 0.1329400196656834, + "grad_norm": 1.264936089515686, + "learning_rate": 4.9837683339605615e-06, + "loss": 0.0619, + "step": 338 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 2.3385870456695557, + "learning_rate": 4.983375289980443e-06, + "loss": 0.1164, + "step": 339 + }, + { + "epoch": 0.13372664700098327, + "grad_norm": 2.5312047004699707, + "learning_rate": 4.982977559920273e-06, + "loss": 0.1017, + "step": 340 + }, + { + "epoch": 0.13411996066863324, + "grad_norm": 1.6104050874710083, + "learning_rate": 4.982575144530559e-06, + "loss": 0.0647, + "step": 341 + }, + { + "epoch": 0.13451327433628318, + "grad_norm": 1.557822346687317, + "learning_rate": 4.982168044570652e-06, + "loss": 0.0546, + "step": 342 + }, + { + "epoch": 0.13490658800393313, + "grad_norm": 1.430794596672058, + "learning_rate": 4.981756260808741e-06, + "loss": 0.0553, + "step": 343 + }, + { + "epoch": 0.1352999016715831, + "grad_norm": 1.718525767326355, + "learning_rate": 4.981339794021853e-06, + "loss": 0.0633, + "step": 344 + }, + { + "epoch": 0.13569321533923304, + "grad_norm": 0.9465076327323914, + "learning_rate": 4.9809186449958536e-06, + "loss": 0.0468, + "step": 345 + }, + { + "epoch": 0.13608652900688298, + "grad_norm": 1.7588387727737427, + "learning_rate": 4.980492814525442e-06, + "loss": 0.0687, + "step": 346 + }, + { + "epoch": 0.13647984267453295, + "grad_norm": 1.392269492149353, + "learning_rate": 4.980062303414152e-06, + "loss": 0.0363, + "step": 347 + }, + { + "epoch": 0.1368731563421829, + "grad_norm": 2.146742582321167, + "learning_rate": 4.97962711247435e-06, + "loss": 0.0604, + "step": 348 + }, + { + "epoch": 0.13726647000983283, + "grad_norm": 2.926267385482788, + "learning_rate": 4.979187242527233e-06, + "loss": 0.086, + "step": 349 + }, + { + "epoch": 0.1376597836774828, + "grad_norm": 1.9409819841384888, + "learning_rate": 4.978742694402825e-06, + "loss": 0.0588, + "step": 350 + }, + { + "epoch": 0.13805309734513274, + "grad_norm": 1.8433561325073242, + "learning_rate": 4.978293468939982e-06, + "loss": 0.0676, + "step": 351 + }, + { + "epoch": 0.13844641101278268, + "grad_norm": 2.0934383869171143, + "learning_rate": 4.977839566986382e-06, + "loss": 0.0713, + "step": 352 + }, + { + "epoch": 0.13883972468043265, + "grad_norm": 1.8030976057052612, + "learning_rate": 4.977380989398529e-06, + "loss": 0.1169, + "step": 353 + }, + { + "epoch": 0.1392330383480826, + "grad_norm": 2.014277935028076, + "learning_rate": 4.976917737041751e-06, + "loss": 0.0376, + "step": 354 + }, + { + "epoch": 0.13962635201573254, + "grad_norm": 1.3366997241973877, + "learning_rate": 4.976449810790196e-06, + "loss": 0.0644, + "step": 355 + }, + { + "epoch": 0.1400196656833825, + "grad_norm": 1.63720703125, + "learning_rate": 4.97597721152683e-06, + "loss": 0.067, + "step": 356 + }, + { + "epoch": 0.14041297935103245, + "grad_norm": 2.317793846130371, + "learning_rate": 4.975499940143439e-06, + "loss": 0.0732, + "step": 357 + }, + { + "epoch": 0.1408062930186824, + "grad_norm": 1.352824330329895, + "learning_rate": 4.975017997540625e-06, + "loss": 0.0721, + "step": 358 + }, + { + "epoch": 0.14119960668633236, + "grad_norm": 1.2860400676727295, + "learning_rate": 4.974531384627805e-06, + "loss": 0.0604, + "step": 359 + }, + { + "epoch": 0.1415929203539823, + "grad_norm": 2.315216064453125, + "learning_rate": 4.974040102323207e-06, + "loss": 0.0492, + "step": 360 + }, + { + "epoch": 0.14198623402163224, + "grad_norm": 1.771453857421875, + "learning_rate": 4.973544151553869e-06, + "loss": 0.0554, + "step": 361 + }, + { + "epoch": 0.1423795476892822, + "grad_norm": 0.9052230715751648, + "learning_rate": 4.973043533255645e-06, + "loss": 0.0524, + "step": 362 + }, + { + "epoch": 0.14277286135693215, + "grad_norm": 2.327606439590454, + "learning_rate": 4.972538248373188e-06, + "loss": 0.0583, + "step": 363 + }, + { + "epoch": 0.1431661750245821, + "grad_norm": 2.986643075942993, + "learning_rate": 4.9720282978599625e-06, + "loss": 0.0726, + "step": 364 + }, + { + "epoch": 0.14355948869223206, + "grad_norm": 1.1824491024017334, + "learning_rate": 4.971513682678234e-06, + "loss": 0.0749, + "step": 365 + }, + { + "epoch": 0.143952802359882, + "grad_norm": 3.0968868732452393, + "learning_rate": 4.970994403799072e-06, + "loss": 0.0547, + "step": 366 + }, + { + "epoch": 0.14434611602753195, + "grad_norm": 1.2194032669067383, + "learning_rate": 4.970470462202343e-06, + "loss": 0.0651, + "step": 367 + }, + { + "epoch": 0.14473942969518191, + "grad_norm": 1.3438714742660522, + "learning_rate": 4.969941858876719e-06, + "loss": 0.0416, + "step": 368 + }, + { + "epoch": 0.14513274336283186, + "grad_norm": 1.4193546772003174, + "learning_rate": 4.96940859481966e-06, + "loss": 0.06, + "step": 369 + }, + { + "epoch": 0.1455260570304818, + "grad_norm": 1.2842000722885132, + "learning_rate": 4.968870671037427e-06, + "loss": 0.0598, + "step": 370 + }, + { + "epoch": 0.14591937069813177, + "grad_norm": 2.3905892372131348, + "learning_rate": 4.96832808854507e-06, + "loss": 0.0652, + "step": 371 + }, + { + "epoch": 0.1463126843657817, + "grad_norm": 1.5380994081497192, + "learning_rate": 4.967780848366432e-06, + "loss": 0.1034, + "step": 372 + }, + { + "epoch": 0.14670599803343165, + "grad_norm": 1.3698018789291382, + "learning_rate": 4.967228951534144e-06, + "loss": 0.0695, + "step": 373 + }, + { + "epoch": 0.14709931170108162, + "grad_norm": 1.6553199291229248, + "learning_rate": 4.966672399089626e-06, + "loss": 0.0358, + "step": 374 + }, + { + "epoch": 0.14749262536873156, + "grad_norm": 1.966484546661377, + "learning_rate": 4.966111192083081e-06, + "loss": 0.0396, + "step": 375 + }, + { + "epoch": 0.1478859390363815, + "grad_norm": 1.1057041883468628, + "learning_rate": 4.965545331573493e-06, + "loss": 0.0294, + "step": 376 + }, + { + "epoch": 0.14827925270403147, + "grad_norm": 1.3603320121765137, + "learning_rate": 4.964974818628633e-06, + "loss": 0.0431, + "step": 377 + }, + { + "epoch": 0.1486725663716814, + "grad_norm": 3.8050637245178223, + "learning_rate": 4.964399654325045e-06, + "loss": 0.063, + "step": 378 + }, + { + "epoch": 0.14906588003933136, + "grad_norm": 1.361873984336853, + "learning_rate": 4.963819839748055e-06, + "loss": 0.0258, + "step": 379 + }, + { + "epoch": 0.14945919370698132, + "grad_norm": 1.0739333629608154, + "learning_rate": 4.96323537599176e-06, + "loss": 0.0553, + "step": 380 + }, + { + "epoch": 0.14985250737463127, + "grad_norm": 1.5606439113616943, + "learning_rate": 4.962646264159031e-06, + "loss": 0.0341, + "step": 381 + }, + { + "epoch": 0.1502458210422812, + "grad_norm": 1.526953101158142, + "learning_rate": 4.962052505361512e-06, + "loss": 0.0693, + "step": 382 + }, + { + "epoch": 0.15063913470993118, + "grad_norm": 3.761380195617676, + "learning_rate": 4.9614541007196136e-06, + "loss": 0.0685, + "step": 383 + }, + { + "epoch": 0.15103244837758112, + "grad_norm": 2.7432498931884766, + "learning_rate": 4.960851051362514e-06, + "loss": 0.0501, + "step": 384 + }, + { + "epoch": 0.15142576204523106, + "grad_norm": 2.669240951538086, + "learning_rate": 4.960243358428154e-06, + "loss": 0.1198, + "step": 385 + }, + { + "epoch": 0.15181907571288103, + "grad_norm": 1.5905970335006714, + "learning_rate": 4.959631023063238e-06, + "loss": 0.0803, + "step": 386 + }, + { + "epoch": 0.15221238938053097, + "grad_norm": 1.1858878135681152, + "learning_rate": 4.959014046423233e-06, + "loss": 0.0654, + "step": 387 + }, + { + "epoch": 0.1526057030481809, + "grad_norm": 1.7795485258102417, + "learning_rate": 4.9583924296723606e-06, + "loss": 0.0598, + "step": 388 + }, + { + "epoch": 0.15299901671583088, + "grad_norm": 1.2830811738967896, + "learning_rate": 4.957766173983598e-06, + "loss": 0.0437, + "step": 389 + }, + { + "epoch": 0.15339233038348082, + "grad_norm": 0.8960599303245544, + "learning_rate": 4.9571352805386795e-06, + "loss": 0.0455, + "step": 390 + }, + { + "epoch": 0.15378564405113077, + "grad_norm": 2.005126714706421, + "learning_rate": 4.956499750528086e-06, + "loss": 0.0755, + "step": 391 + }, + { + "epoch": 0.15417895771878073, + "grad_norm": 1.5545151233673096, + "learning_rate": 4.955859585151054e-06, + "loss": 0.0449, + "step": 392 + }, + { + "epoch": 0.15457227138643068, + "grad_norm": 1.0876412391662598, + "learning_rate": 4.955214785615558e-06, + "loss": 0.0718, + "step": 393 + }, + { + "epoch": 0.15496558505408062, + "grad_norm": 1.9705466032028198, + "learning_rate": 4.9545653531383255e-06, + "loss": 0.0612, + "step": 394 + }, + { + "epoch": 0.1553588987217306, + "grad_norm": 1.3790346384048462, + "learning_rate": 4.953911288944821e-06, + "loss": 0.0371, + "step": 395 + }, + { + "epoch": 0.15575221238938053, + "grad_norm": 1.0736052989959717, + "learning_rate": 4.953252594269252e-06, + "loss": 0.056, + "step": 396 + }, + { + "epoch": 0.15614552605703047, + "grad_norm": 1.919756531715393, + "learning_rate": 4.9525892703545604e-06, + "loss": 0.0737, + "step": 397 + }, + { + "epoch": 0.15653883972468044, + "grad_norm": 1.333601713180542, + "learning_rate": 4.951921318452428e-06, + "loss": 0.0628, + "step": 398 + }, + { + "epoch": 0.15693215339233038, + "grad_norm": 1.5093313455581665, + "learning_rate": 4.951248739823264e-06, + "loss": 0.0677, + "step": 399 + }, + { + "epoch": 0.15732546705998032, + "grad_norm": 1.5697554349899292, + "learning_rate": 4.950571535736214e-06, + "loss": 0.0672, + "step": 400 + }, + { + "epoch": 0.1577187807276303, + "grad_norm": 1.4692028760910034, + "learning_rate": 4.949889707469145e-06, + "loss": 0.0472, + "step": 401 + }, + { + "epoch": 0.15811209439528023, + "grad_norm": 0.9199762940406799, + "learning_rate": 4.949203256308658e-06, + "loss": 0.0661, + "step": 402 + }, + { + "epoch": 0.15850540806293018, + "grad_norm": 1.4585742950439453, + "learning_rate": 4.948512183550068e-06, + "loss": 0.0776, + "step": 403 + }, + { + "epoch": 0.15889872173058014, + "grad_norm": 1.2560405731201172, + "learning_rate": 4.947816490497419e-06, + "loss": 0.0932, + "step": 404 + }, + { + "epoch": 0.1592920353982301, + "grad_norm": 1.6395833492279053, + "learning_rate": 4.947116178463469e-06, + "loss": 0.0399, + "step": 405 + }, + { + "epoch": 0.15968534906588003, + "grad_norm": 0.8655360341072083, + "learning_rate": 4.946411248769693e-06, + "loss": 0.0421, + "step": 406 + }, + { + "epoch": 0.16007866273353, + "grad_norm": 0.9741353392601013, + "learning_rate": 4.945701702746279e-06, + "loss": 0.0469, + "step": 407 + }, + { + "epoch": 0.16047197640117994, + "grad_norm": 0.9401141405105591, + "learning_rate": 4.944987541732126e-06, + "loss": 0.0668, + "step": 408 + }, + { + "epoch": 0.16086529006882988, + "grad_norm": 0.8718335032463074, + "learning_rate": 4.944268767074842e-06, + "loss": 0.0597, + "step": 409 + }, + { + "epoch": 0.16125860373647985, + "grad_norm": 1.3456203937530518, + "learning_rate": 4.943545380130742e-06, + "loss": 0.0755, + "step": 410 + }, + { + "epoch": 0.1616519174041298, + "grad_norm": 1.1579302549362183, + "learning_rate": 4.942817382264842e-06, + "loss": 0.0583, + "step": 411 + }, + { + "epoch": 0.16204523107177973, + "grad_norm": 1.664872169494629, + "learning_rate": 4.942084774850858e-06, + "loss": 0.0777, + "step": 412 + }, + { + "epoch": 0.1624385447394297, + "grad_norm": 2.256772518157959, + "learning_rate": 4.941347559271208e-06, + "loss": 0.0734, + "step": 413 + }, + { + "epoch": 0.16283185840707964, + "grad_norm": 1.235349416732788, + "learning_rate": 4.9406057369170015e-06, + "loss": 0.051, + "step": 414 + }, + { + "epoch": 0.16322517207472959, + "grad_norm": 1.6716983318328857, + "learning_rate": 4.939859309188044e-06, + "loss": 0.0728, + "step": 415 + }, + { + "epoch": 0.16361848574237955, + "grad_norm": 1.3591656684875488, + "learning_rate": 4.939108277492829e-06, + "loss": 0.0725, + "step": 416 + }, + { + "epoch": 0.1640117994100295, + "grad_norm": 0.6709238886833191, + "learning_rate": 4.9383526432485375e-06, + "loss": 0.0452, + "step": 417 + }, + { + "epoch": 0.16440511307767944, + "grad_norm": 1.2356040477752686, + "learning_rate": 4.937592407881039e-06, + "loss": 0.0682, + "step": 418 + }, + { + "epoch": 0.1647984267453294, + "grad_norm": 1.0750470161437988, + "learning_rate": 4.93682757282488e-06, + "loss": 0.0383, + "step": 419 + }, + { + "epoch": 0.16519174041297935, + "grad_norm": 1.5483283996582031, + "learning_rate": 4.936058139523291e-06, + "loss": 0.0645, + "step": 420 + }, + { + "epoch": 0.1655850540806293, + "grad_norm": 2.0328383445739746, + "learning_rate": 4.935284109428177e-06, + "loss": 0.0623, + "step": 421 + }, + { + "epoch": 0.16597836774827926, + "grad_norm": 1.5979444980621338, + "learning_rate": 4.934505484000116e-06, + "loss": 0.0751, + "step": 422 + }, + { + "epoch": 0.1663716814159292, + "grad_norm": 1.1430745124816895, + "learning_rate": 4.93372226470836e-06, + "loss": 0.0542, + "step": 423 + }, + { + "epoch": 0.16676499508357914, + "grad_norm": 2.062899112701416, + "learning_rate": 4.932934453030829e-06, + "loss": 0.0873, + "step": 424 + }, + { + "epoch": 0.1671583087512291, + "grad_norm": 3.2697086334228516, + "learning_rate": 4.932142050454107e-06, + "loss": 0.0733, + "step": 425 + }, + { + "epoch": 0.16755162241887905, + "grad_norm": 1.2826026678085327, + "learning_rate": 4.931345058473443e-06, + "loss": 0.0497, + "step": 426 + }, + { + "epoch": 0.167944936086529, + "grad_norm": 2.3819937705993652, + "learning_rate": 4.930543478592743e-06, + "loss": 0.0789, + "step": 427 + }, + { + "epoch": 0.16833824975417896, + "grad_norm": 2.840121030807495, + "learning_rate": 4.929737312324574e-06, + "loss": 0.054, + "step": 428 + }, + { + "epoch": 0.1687315634218289, + "grad_norm": 0.6918103098869324, + "learning_rate": 4.928926561190155e-06, + "loss": 0.0448, + "step": 429 + }, + { + "epoch": 0.16912487708947885, + "grad_norm": 0.8336203694343567, + "learning_rate": 4.928111226719359e-06, + "loss": 0.0629, + "step": 430 + }, + { + "epoch": 0.16951819075712882, + "grad_norm": 1.9415661096572876, + "learning_rate": 4.927291310450705e-06, + "loss": 0.0731, + "step": 431 + }, + { + "epoch": 0.16991150442477876, + "grad_norm": 1.3499138355255127, + "learning_rate": 4.926466813931358e-06, + "loss": 0.0562, + "step": 432 + }, + { + "epoch": 0.1703048180924287, + "grad_norm": 1.0689488649368286, + "learning_rate": 4.925637738717127e-06, + "loss": 0.0706, + "step": 433 + }, + { + "epoch": 0.17069813176007867, + "grad_norm": 2.7924535274505615, + "learning_rate": 4.924804086372462e-06, + "loss": 0.0671, + "step": 434 + }, + { + "epoch": 0.1710914454277286, + "grad_norm": 0.8586186170578003, + "learning_rate": 4.9239658584704466e-06, + "loss": 0.049, + "step": 435 + }, + { + "epoch": 0.17148475909537855, + "grad_norm": 1.8235011100769043, + "learning_rate": 4.923123056592801e-06, + "loss": 0.0715, + "step": 436 + }, + { + "epoch": 0.17187807276302852, + "grad_norm": 1.1591852903366089, + "learning_rate": 4.922275682329876e-06, + "loss": 0.0799, + "step": 437 + }, + { + "epoch": 0.17227138643067846, + "grad_norm": 1.2786961793899536, + "learning_rate": 4.921423737280649e-06, + "loss": 0.0561, + "step": 438 + }, + { + "epoch": 0.1726647000983284, + "grad_norm": 1.602005958557129, + "learning_rate": 4.9205672230527254e-06, + "loss": 0.0517, + "step": 439 + }, + { + "epoch": 0.17305801376597837, + "grad_norm": 1.3069565296173096, + "learning_rate": 4.919706141262329e-06, + "loss": 0.063, + "step": 440 + }, + { + "epoch": 0.17345132743362832, + "grad_norm": 1.4721592664718628, + "learning_rate": 4.918840493534305e-06, + "loss": 0.0789, + "step": 441 + }, + { + "epoch": 0.17384464110127826, + "grad_norm": 2.0551934242248535, + "learning_rate": 4.917970281502112e-06, + "loss": 0.0711, + "step": 442 + }, + { + "epoch": 0.17423795476892823, + "grad_norm": 1.175560474395752, + "learning_rate": 4.917095506807824e-06, + "loss": 0.0646, + "step": 443 + }, + { + "epoch": 0.17463126843657817, + "grad_norm": 1.3429381847381592, + "learning_rate": 4.916216171102124e-06, + "loss": 0.0609, + "step": 444 + }, + { + "epoch": 0.1750245821042281, + "grad_norm": 1.306825041770935, + "learning_rate": 4.9153322760443015e-06, + "loss": 0.0529, + "step": 445 + }, + { + "epoch": 0.17541789577187808, + "grad_norm": 1.4618321657180786, + "learning_rate": 4.914443823302246e-06, + "loss": 0.0509, + "step": 446 + }, + { + "epoch": 0.17581120943952802, + "grad_norm": 1.054541826248169, + "learning_rate": 4.913550814552454e-06, + "loss": 0.0613, + "step": 447 + }, + { + "epoch": 0.17620452310717796, + "grad_norm": 0.9349273443222046, + "learning_rate": 4.912653251480013e-06, + "loss": 0.0531, + "step": 448 + }, + { + "epoch": 0.17659783677482793, + "grad_norm": 1.302675724029541, + "learning_rate": 4.9117511357786075e-06, + "loss": 0.0661, + "step": 449 + }, + { + "epoch": 0.17699115044247787, + "grad_norm": 2.327521562576294, + "learning_rate": 4.910844469150512e-06, + "loss": 0.08, + "step": 450 + }, + { + "epoch": 0.17738446411012782, + "grad_norm": 1.7499988079071045, + "learning_rate": 4.909933253306588e-06, + "loss": 0.0368, + "step": 451 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 1.1263257265090942, + "learning_rate": 4.909017489966283e-06, + "loss": 0.0322, + "step": 452 + }, + { + "epoch": 0.17817109144542773, + "grad_norm": 2.8002772331237793, + "learning_rate": 4.9080971808576226e-06, + "loss": 0.0597, + "step": 453 + }, + { + "epoch": 0.17856440511307767, + "grad_norm": 2.0555684566497803, + "learning_rate": 4.907172327717214e-06, + "loss": 0.0754, + "step": 454 + }, + { + "epoch": 0.17895771878072764, + "grad_norm": 2.3041601181030273, + "learning_rate": 4.906242932290234e-06, + "loss": 0.0838, + "step": 455 + }, + { + "epoch": 0.17935103244837758, + "grad_norm": 2.3882484436035156, + "learning_rate": 4.905308996330437e-06, + "loss": 0.063, + "step": 456 + }, + { + "epoch": 0.17974434611602752, + "grad_norm": 1.4339286088943481, + "learning_rate": 4.904370521600138e-06, + "loss": 0.0723, + "step": 457 + }, + { + "epoch": 0.1801376597836775, + "grad_norm": 1.387052059173584, + "learning_rate": 4.903427509870222e-06, + "loss": 0.0708, + "step": 458 + }, + { + "epoch": 0.18053097345132743, + "grad_norm": 0.8694115877151489, + "learning_rate": 4.902479962920134e-06, + "loss": 0.0519, + "step": 459 + }, + { + "epoch": 0.18092428711897737, + "grad_norm": 1.0308964252471924, + "learning_rate": 4.901527882537876e-06, + "loss": 0.054, + "step": 460 + }, + { + "epoch": 0.18131760078662734, + "grad_norm": 2.4914846420288086, + "learning_rate": 4.900571270520004e-06, + "loss": 0.115, + "step": 461 + }, + { + "epoch": 0.18171091445427728, + "grad_norm": 2.637059450149536, + "learning_rate": 4.899610128671626e-06, + "loss": 0.0851, + "step": 462 + }, + { + "epoch": 0.18210422812192723, + "grad_norm": 1.9722718000411987, + "learning_rate": 4.898644458806398e-06, + "loss": 0.0637, + "step": 463 + }, + { + "epoch": 0.1824975417895772, + "grad_norm": 0.9795344471931458, + "learning_rate": 4.897674262746522e-06, + "loss": 0.0622, + "step": 464 + }, + { + "epoch": 0.18289085545722714, + "grad_norm": 1.2904670238494873, + "learning_rate": 4.896699542322736e-06, + "loss": 0.0384, + "step": 465 + }, + { + "epoch": 0.18328416912487708, + "grad_norm": 1.4417036771774292, + "learning_rate": 4.895720299374319e-06, + "loss": 0.1118, + "step": 466 + }, + { + "epoch": 0.18367748279252705, + "grad_norm": 1.6243058443069458, + "learning_rate": 4.894736535749083e-06, + "loss": 0.0756, + "step": 467 + }, + { + "epoch": 0.184070796460177, + "grad_norm": 1.0999799966812134, + "learning_rate": 4.89374825330337e-06, + "loss": 0.0525, + "step": 468 + }, + { + "epoch": 0.18446411012782693, + "grad_norm": 1.9067320823669434, + "learning_rate": 4.892755453902051e-06, + "loss": 0.066, + "step": 469 + }, + { + "epoch": 0.1848574237954769, + "grad_norm": 1.1623554229736328, + "learning_rate": 4.8917581394185175e-06, + "loss": 0.0547, + "step": 470 + }, + { + "epoch": 0.18525073746312684, + "grad_norm": 1.2230125665664673, + "learning_rate": 4.890756311734683e-06, + "loss": 0.0753, + "step": 471 + }, + { + "epoch": 0.18564405113077678, + "grad_norm": 1.376905083656311, + "learning_rate": 4.8897499727409755e-06, + "loss": 0.0637, + "step": 472 + }, + { + "epoch": 0.18603736479842675, + "grad_norm": 2.381087064743042, + "learning_rate": 4.888739124336338e-06, + "loss": 0.0818, + "step": 473 + }, + { + "epoch": 0.1864306784660767, + "grad_norm": 1.5327961444854736, + "learning_rate": 4.8877237684282205e-06, + "loss": 0.0689, + "step": 474 + }, + { + "epoch": 0.18682399213372664, + "grad_norm": 1.7480573654174805, + "learning_rate": 4.8867039069325804e-06, + "loss": 0.0713, + "step": 475 + }, + { + "epoch": 0.1872173058013766, + "grad_norm": 1.2657626867294312, + "learning_rate": 4.8856795417738754e-06, + "loss": 0.0742, + "step": 476 + }, + { + "epoch": 0.18761061946902655, + "grad_norm": 1.0295419692993164, + "learning_rate": 4.884650674885062e-06, + "loss": 0.0448, + "step": 477 + }, + { + "epoch": 0.1880039331366765, + "grad_norm": 1.9904601573944092, + "learning_rate": 4.883617308207592e-06, + "loss": 0.0801, + "step": 478 + }, + { + "epoch": 0.18839724680432646, + "grad_norm": 1.4027286767959595, + "learning_rate": 4.88257944369141e-06, + "loss": 0.0502, + "step": 479 + }, + { + "epoch": 0.1887905604719764, + "grad_norm": 2.087235689163208, + "learning_rate": 4.8815370832949425e-06, + "loss": 0.1021, + "step": 480 + }, + { + "epoch": 0.18918387413962634, + "grad_norm": 0.8643338680267334, + "learning_rate": 4.880490228985104e-06, + "loss": 0.0732, + "step": 481 + }, + { + "epoch": 0.1895771878072763, + "grad_norm": 1.4668515920639038, + "learning_rate": 4.8794388827372884e-06, + "loss": 0.0548, + "step": 482 + }, + { + "epoch": 0.18997050147492625, + "grad_norm": 1.8225198984146118, + "learning_rate": 4.878383046535366e-06, + "loss": 0.0882, + "step": 483 + }, + { + "epoch": 0.1903638151425762, + "grad_norm": 1.6394109725952148, + "learning_rate": 4.877322722371677e-06, + "loss": 0.1029, + "step": 484 + }, + { + "epoch": 0.19075712881022616, + "grad_norm": 0.9612401723861694, + "learning_rate": 4.876257912247033e-06, + "loss": 0.0442, + "step": 485 + }, + { + "epoch": 0.1911504424778761, + "grad_norm": 2.0715410709381104, + "learning_rate": 4.8751886181707105e-06, + "loss": 0.0793, + "step": 486 + }, + { + "epoch": 0.19154375614552605, + "grad_norm": 1.14213228225708, + "learning_rate": 4.874114842160445e-06, + "loss": 0.0782, + "step": 487 + }, + { + "epoch": 0.19193706981317601, + "grad_norm": 1.7314140796661377, + "learning_rate": 4.873036586242431e-06, + "loss": 0.0478, + "step": 488 + }, + { + "epoch": 0.19233038348082596, + "grad_norm": 0.6948450803756714, + "learning_rate": 4.871953852451316e-06, + "loss": 0.0546, + "step": 489 + }, + { + "epoch": 0.1927236971484759, + "grad_norm": 1.9421541690826416, + "learning_rate": 4.8708666428301975e-06, + "loss": 0.0793, + "step": 490 + }, + { + "epoch": 0.19311701081612587, + "grad_norm": 0.5670569539070129, + "learning_rate": 4.869774959430619e-06, + "loss": 0.0506, + "step": 491 + }, + { + "epoch": 0.1935103244837758, + "grad_norm": 1.437902808189392, + "learning_rate": 4.868678804312565e-06, + "loss": 0.0545, + "step": 492 + }, + { + "epoch": 0.19390363815142575, + "grad_norm": 1.8984867334365845, + "learning_rate": 4.867578179544457e-06, + "loss": 0.0658, + "step": 493 + }, + { + "epoch": 0.19429695181907572, + "grad_norm": 2.0684666633605957, + "learning_rate": 4.866473087203154e-06, + "loss": 0.0565, + "step": 494 + }, + { + "epoch": 0.19469026548672566, + "grad_norm": 1.5473408699035645, + "learning_rate": 4.865363529373944e-06, + "loss": 0.0481, + "step": 495 + }, + { + "epoch": 0.1950835791543756, + "grad_norm": 1.678281545639038, + "learning_rate": 4.864249508150539e-06, + "loss": 0.056, + "step": 496 + }, + { + "epoch": 0.19547689282202557, + "grad_norm": 1.3713724613189697, + "learning_rate": 4.863131025635076e-06, + "loss": 0.0474, + "step": 497 + }, + { + "epoch": 0.1958702064896755, + "grad_norm": 2.0483641624450684, + "learning_rate": 4.862008083938109e-06, + "loss": 0.0712, + "step": 498 + }, + { + "epoch": 0.19626352015732546, + "grad_norm": 1.701915979385376, + "learning_rate": 4.8608806851786075e-06, + "loss": 0.0642, + "step": 499 + }, + { + "epoch": 0.19665683382497542, + "grad_norm": 1.4159979820251465, + "learning_rate": 4.859748831483949e-06, + "loss": 0.0706, + "step": 500 + }, + { + "epoch": 0.19705014749262537, + "grad_norm": 0.9921556711196899, + "learning_rate": 4.858612524989921e-06, + "loss": 0.0311, + "step": 501 + }, + { + "epoch": 0.1974434611602753, + "grad_norm": 0.6453993320465088, + "learning_rate": 4.857471767840709e-06, + "loss": 0.0304, + "step": 502 + }, + { + "epoch": 0.19783677482792528, + "grad_norm": 2.1691184043884277, + "learning_rate": 4.856326562188902e-06, + "loss": 0.0573, + "step": 503 + }, + { + "epoch": 0.19823008849557522, + "grad_norm": 1.424170732498169, + "learning_rate": 4.855176910195479e-06, + "loss": 0.0371, + "step": 504 + }, + { + "epoch": 0.19862340216322516, + "grad_norm": 2.0996835231781006, + "learning_rate": 4.854022814029809e-06, + "loss": 0.06, + "step": 505 + }, + { + "epoch": 0.19901671583087513, + "grad_norm": 2.2325479984283447, + "learning_rate": 4.852864275869652e-06, + "loss": 0.0686, + "step": 506 + }, + { + "epoch": 0.19941002949852507, + "grad_norm": 1.8133199214935303, + "learning_rate": 4.851701297901144e-06, + "loss": 0.0811, + "step": 507 + }, + { + "epoch": 0.199803343166175, + "grad_norm": 1.4886740446090698, + "learning_rate": 4.850533882318803e-06, + "loss": 0.0516, + "step": 508 + }, + { + "epoch": 0.20019665683382498, + "grad_norm": 1.685327172279358, + "learning_rate": 4.849362031325518e-06, + "loss": 0.0427, + "step": 509 + }, + { + "epoch": 0.20058997050147492, + "grad_norm": 2.726207733154297, + "learning_rate": 4.8481857471325485e-06, + "loss": 0.0686, + "step": 510 + }, + { + "epoch": 0.20098328416912487, + "grad_norm": 1.1494991779327393, + "learning_rate": 4.847005031959521e-06, + "loss": 0.0642, + "step": 511 + }, + { + "epoch": 0.20137659783677483, + "grad_norm": 2.118980884552002, + "learning_rate": 4.84581988803442e-06, + "loss": 0.0504, + "step": 512 + }, + { + "epoch": 0.20176991150442478, + "grad_norm": 1.4535127878189087, + "learning_rate": 4.84463031759359e-06, + "loss": 0.0482, + "step": 513 + }, + { + "epoch": 0.20216322517207472, + "grad_norm": 0.8411951065063477, + "learning_rate": 4.843436322881725e-06, + "loss": 0.0491, + "step": 514 + }, + { + "epoch": 0.2025565388397247, + "grad_norm": 0.9351110458374023, + "learning_rate": 4.8422379061518705e-06, + "loss": 0.0278, + "step": 515 + }, + { + "epoch": 0.20294985250737463, + "grad_norm": 1.2653199434280396, + "learning_rate": 4.841035069665416e-06, + "loss": 0.0494, + "step": 516 + }, + { + "epoch": 0.20334316617502457, + "grad_norm": 2.1194064617156982, + "learning_rate": 4.83982781569209e-06, + "loss": 0.0985, + "step": 517 + }, + { + "epoch": 0.20373647984267454, + "grad_norm": 0.9621169567108154, + "learning_rate": 4.838616146509956e-06, + "loss": 0.0681, + "step": 518 + }, + { + "epoch": 0.20412979351032448, + "grad_norm": 2.935671091079712, + "learning_rate": 4.83740006440541e-06, + "loss": 0.1056, + "step": 519 + }, + { + "epoch": 0.20452310717797442, + "grad_norm": 1.5503019094467163, + "learning_rate": 4.8361795716731744e-06, + "loss": 0.0736, + "step": 520 + }, + { + "epoch": 0.2049164208456244, + "grad_norm": 1.5426656007766724, + "learning_rate": 4.8349546706162965e-06, + "loss": 0.0768, + "step": 521 + }, + { + "epoch": 0.20530973451327433, + "grad_norm": 1.788036823272705, + "learning_rate": 4.833725363546139e-06, + "loss": 0.0785, + "step": 522 + }, + { + "epoch": 0.20570304818092428, + "grad_norm": 1.3642781972885132, + "learning_rate": 4.8324916527823795e-06, + "loss": 0.0582, + "step": 523 + }, + { + "epoch": 0.20609636184857424, + "grad_norm": 2.6498544216156006, + "learning_rate": 4.831253540653007e-06, + "loss": 0.068, + "step": 524 + }, + { + "epoch": 0.20648967551622419, + "grad_norm": 1.3358078002929688, + "learning_rate": 4.8300110294943145e-06, + "loss": 0.0689, + "step": 525 + }, + { + "epoch": 0.20688298918387413, + "grad_norm": 2.4475595951080322, + "learning_rate": 4.828764121650896e-06, + "loss": 0.0685, + "step": 526 + }, + { + "epoch": 0.2072763028515241, + "grad_norm": 1.8231087923049927, + "learning_rate": 4.827512819475641e-06, + "loss": 0.061, + "step": 527 + }, + { + "epoch": 0.20766961651917404, + "grad_norm": 1.6098417043685913, + "learning_rate": 4.826257125329733e-06, + "loss": 0.0775, + "step": 528 + }, + { + "epoch": 0.20806293018682398, + "grad_norm": 1.2955044507980347, + "learning_rate": 4.824997041582641e-06, + "loss": 0.0828, + "step": 529 + }, + { + "epoch": 0.20845624385447395, + "grad_norm": 1.600419282913208, + "learning_rate": 4.82373257061212e-06, + "loss": 0.0868, + "step": 530 + }, + { + "epoch": 0.2088495575221239, + "grad_norm": 1.2169928550720215, + "learning_rate": 4.8224637148042e-06, + "loss": 0.0543, + "step": 531 + }, + { + "epoch": 0.20924287118977383, + "grad_norm": 1.6863512992858887, + "learning_rate": 4.821190476553186e-06, + "loss": 0.0703, + "step": 532 + }, + { + "epoch": 0.2096361848574238, + "grad_norm": 1.9771099090576172, + "learning_rate": 4.819912858261656e-06, + "loss": 0.0799, + "step": 533 + }, + { + "epoch": 0.21002949852507374, + "grad_norm": 1.276354432106018, + "learning_rate": 4.818630862340449e-06, + "loss": 0.0661, + "step": 534 + }, + { + "epoch": 0.21042281219272368, + "grad_norm": 1.1068519353866577, + "learning_rate": 4.817344491208665e-06, + "loss": 0.0496, + "step": 535 + }, + { + "epoch": 0.21081612586037365, + "grad_norm": 1.1699997186660767, + "learning_rate": 4.816053747293663e-06, + "loss": 0.0395, + "step": 536 + }, + { + "epoch": 0.2112094395280236, + "grad_norm": 1.290640115737915, + "learning_rate": 4.814758633031049e-06, + "loss": 0.0526, + "step": 537 + }, + { + "epoch": 0.21160275319567354, + "grad_norm": 1.8085367679595947, + "learning_rate": 4.813459150864681e-06, + "loss": 0.0593, + "step": 538 + }, + { + "epoch": 0.2119960668633235, + "grad_norm": 1.6277810335159302, + "learning_rate": 4.812155303246653e-06, + "loss": 0.0645, + "step": 539 + }, + { + "epoch": 0.21238938053097345, + "grad_norm": 0.9544056057929993, + "learning_rate": 4.810847092637301e-06, + "loss": 0.063, + "step": 540 + }, + { + "epoch": 0.2127826941986234, + "grad_norm": 1.349601149559021, + "learning_rate": 4.809534521505192e-06, + "loss": 0.0877, + "step": 541 + }, + { + "epoch": 0.21317600786627336, + "grad_norm": 1.6013360023498535, + "learning_rate": 4.8082175923271235e-06, + "loss": 0.0637, + "step": 542 + }, + { + "epoch": 0.2135693215339233, + "grad_norm": 1.130764365196228, + "learning_rate": 4.806896307588113e-06, + "loss": 0.086, + "step": 543 + }, + { + "epoch": 0.21396263520157324, + "grad_norm": 1.40028715133667, + "learning_rate": 4.805570669781399e-06, + "loss": 0.0876, + "step": 544 + }, + { + "epoch": 0.2143559488692232, + "grad_norm": 1.7551463842391968, + "learning_rate": 4.804240681408434e-06, + "loss": 0.0593, + "step": 545 + }, + { + "epoch": 0.21474926253687315, + "grad_norm": 1.648735523223877, + "learning_rate": 4.802906344978881e-06, + "loss": 0.0772, + "step": 546 + }, + { + "epoch": 0.2151425762045231, + "grad_norm": 0.8385063409805298, + "learning_rate": 4.801567663010605e-06, + "loss": 0.0706, + "step": 547 + }, + { + "epoch": 0.21553588987217306, + "grad_norm": 1.8120150566101074, + "learning_rate": 4.800224638029672e-06, + "loss": 0.0696, + "step": 548 + }, + { + "epoch": 0.215929203539823, + "grad_norm": 0.5346795916557312, + "learning_rate": 4.798877272570343e-06, + "loss": 0.0494, + "step": 549 + }, + { + "epoch": 0.21632251720747295, + "grad_norm": 1.4182865619659424, + "learning_rate": 4.797525569175073e-06, + "loss": 0.0711, + "step": 550 + }, + { + "epoch": 0.21671583087512292, + "grad_norm": 0.9838932752609253, + "learning_rate": 4.796169530394498e-06, + "loss": 0.0843, + "step": 551 + }, + { + "epoch": 0.21710914454277286, + "grad_norm": 1.5188270807266235, + "learning_rate": 4.7948091587874355e-06, + "loss": 0.0663, + "step": 552 + }, + { + "epoch": 0.2175024582104228, + "grad_norm": 1.796202540397644, + "learning_rate": 4.793444456920881e-06, + "loss": 0.0655, + "step": 553 + }, + { + "epoch": 0.21789577187807277, + "grad_norm": 1.4925826787948608, + "learning_rate": 4.7920754273699985e-06, + "loss": 0.0607, + "step": 554 + }, + { + "epoch": 0.2182890855457227, + "grad_norm": 1.2840732336044312, + "learning_rate": 4.790702072718121e-06, + "loss": 0.0634, + "step": 555 + }, + { + "epoch": 0.21868239921337265, + "grad_norm": 1.0566197633743286, + "learning_rate": 4.789324395556741e-06, + "loss": 0.0475, + "step": 556 + }, + { + "epoch": 0.21907571288102262, + "grad_norm": 1.2299338579177856, + "learning_rate": 4.7879423984855085e-06, + "loss": 0.054, + "step": 557 + }, + { + "epoch": 0.21946902654867256, + "grad_norm": 1.7808493375778198, + "learning_rate": 4.786556084112224e-06, + "loss": 0.0905, + "step": 558 + }, + { + "epoch": 0.2198623402163225, + "grad_norm": 1.054694652557373, + "learning_rate": 4.785165455052836e-06, + "loss": 0.0561, + "step": 559 + }, + { + "epoch": 0.22025565388397247, + "grad_norm": 2.180976629257202, + "learning_rate": 4.783770513931433e-06, + "loss": 0.0705, + "step": 560 + }, + { + "epoch": 0.22064896755162242, + "grad_norm": 0.9467242956161499, + "learning_rate": 4.782371263380242e-06, + "loss": 0.0471, + "step": 561 + }, + { + "epoch": 0.22104228121927236, + "grad_norm": 1.0072274208068848, + "learning_rate": 4.780967706039622e-06, + "loss": 0.0642, + "step": 562 + }, + { + "epoch": 0.22143559488692233, + "grad_norm": 0.9987531304359436, + "learning_rate": 4.779559844558056e-06, + "loss": 0.0556, + "step": 563 + }, + { + "epoch": 0.22182890855457227, + "grad_norm": 1.5135668516159058, + "learning_rate": 4.778147681592152e-06, + "loss": 0.051, + "step": 564 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 1.6369942426681519, + "learning_rate": 4.776731219806634e-06, + "loss": 0.1089, + "step": 565 + }, + { + "epoch": 0.22261553588987218, + "grad_norm": 1.8307068347930908, + "learning_rate": 4.775310461874337e-06, + "loss": 0.0555, + "step": 566 + }, + { + "epoch": 0.22300884955752212, + "grad_norm": 1.2417643070220947, + "learning_rate": 4.773885410476202e-06, + "loss": 0.0356, + "step": 567 + }, + { + "epoch": 0.22340216322517206, + "grad_norm": 0.8904944658279419, + "learning_rate": 4.7724560683012735e-06, + "loss": 0.0649, + "step": 568 + }, + { + "epoch": 0.22379547689282203, + "grad_norm": 1.3853691816329956, + "learning_rate": 4.771022438046693e-06, + "loss": 0.0429, + "step": 569 + }, + { + "epoch": 0.22418879056047197, + "grad_norm": 1.6937843561172485, + "learning_rate": 4.769584522417691e-06, + "loss": 0.0831, + "step": 570 + }, + { + "epoch": 0.22458210422812191, + "grad_norm": 1.6160171031951904, + "learning_rate": 4.768142324127586e-06, + "loss": 0.0754, + "step": 571 + }, + { + "epoch": 0.22497541789577188, + "grad_norm": 1.2548290491104126, + "learning_rate": 4.766695845897778e-06, + "loss": 0.073, + "step": 572 + }, + { + "epoch": 0.22536873156342183, + "grad_norm": 2.645967483520508, + "learning_rate": 4.765245090457744e-06, + "loss": 0.1022, + "step": 573 + }, + { + "epoch": 0.22576204523107177, + "grad_norm": 1.2090085744857788, + "learning_rate": 4.763790060545028e-06, + "loss": 0.0449, + "step": 574 + }, + { + "epoch": 0.22615535889872174, + "grad_norm": 1.5384302139282227, + "learning_rate": 4.762330758905246e-06, + "loss": 0.0523, + "step": 575 + }, + { + "epoch": 0.22654867256637168, + "grad_norm": 1.3840306997299194, + "learning_rate": 4.760867188292068e-06, + "loss": 0.0409, + "step": 576 + }, + { + "epoch": 0.22694198623402162, + "grad_norm": 0.8169382214546204, + "learning_rate": 4.7593993514672255e-06, + "loss": 0.0526, + "step": 577 + }, + { + "epoch": 0.2273352999016716, + "grad_norm": 0.6939831972122192, + "learning_rate": 4.757927251200497e-06, + "loss": 0.0497, + "step": 578 + }, + { + "epoch": 0.22772861356932153, + "grad_norm": 2.4073455333709717, + "learning_rate": 4.756450890269705e-06, + "loss": 0.0703, + "step": 579 + }, + { + "epoch": 0.22812192723697147, + "grad_norm": 1.4490169286727905, + "learning_rate": 4.754970271460714e-06, + "loss": 0.0429, + "step": 580 + }, + { + "epoch": 0.22851524090462144, + "grad_norm": 0.8039276599884033, + "learning_rate": 4.753485397567424e-06, + "loss": 0.0525, + "step": 581 + }, + { + "epoch": 0.22890855457227138, + "grad_norm": 0.9220805764198303, + "learning_rate": 4.751996271391761e-06, + "loss": 0.056, + "step": 582 + }, + { + "epoch": 0.22930186823992132, + "grad_norm": 2.1960690021514893, + "learning_rate": 4.750502895743677e-06, + "loss": 0.0636, + "step": 583 + }, + { + "epoch": 0.2296951819075713, + "grad_norm": 1.5164406299591064, + "learning_rate": 4.749005273441143e-06, + "loss": 0.0557, + "step": 584 + }, + { + "epoch": 0.23008849557522124, + "grad_norm": 1.8541299104690552, + "learning_rate": 4.747503407310142e-06, + "loss": 0.0679, + "step": 585 + }, + { + "epoch": 0.23048180924287118, + "grad_norm": 5.52957010269165, + "learning_rate": 4.745997300184666e-06, + "loss": 0.0805, + "step": 586 + }, + { + "epoch": 0.23087512291052115, + "grad_norm": 1.318687915802002, + "learning_rate": 4.744486954906709e-06, + "loss": 0.0499, + "step": 587 + }, + { + "epoch": 0.2312684365781711, + "grad_norm": 1.1736847162246704, + "learning_rate": 4.742972374326262e-06, + "loss": 0.0371, + "step": 588 + }, + { + "epoch": 0.23166175024582103, + "grad_norm": 1.7209968566894531, + "learning_rate": 4.74145356130131e-06, + "loss": 0.0553, + "step": 589 + }, + { + "epoch": 0.232055063913471, + "grad_norm": 1.392303228378296, + "learning_rate": 4.739930518697823e-06, + "loss": 0.0468, + "step": 590 + }, + { + "epoch": 0.23244837758112094, + "grad_norm": 1.6198259592056274, + "learning_rate": 4.738403249389752e-06, + "loss": 0.0671, + "step": 591 + }, + { + "epoch": 0.23284169124877088, + "grad_norm": 1.394888997077942, + "learning_rate": 4.736871756259023e-06, + "loss": 0.0851, + "step": 592 + }, + { + "epoch": 0.23323500491642085, + "grad_norm": 1.2976491451263428, + "learning_rate": 4.7353360421955345e-06, + "loss": 0.0614, + "step": 593 + }, + { + "epoch": 0.2336283185840708, + "grad_norm": 1.2485517263412476, + "learning_rate": 4.733796110097148e-06, + "loss": 0.0429, + "step": 594 + }, + { + "epoch": 0.23402163225172073, + "grad_norm": 2.0384671688079834, + "learning_rate": 4.732251962869685e-06, + "loss": 0.0549, + "step": 595 + }, + { + "epoch": 0.2344149459193707, + "grad_norm": 2.514827251434326, + "learning_rate": 4.730703603426921e-06, + "loss": 0.0934, + "step": 596 + }, + { + "epoch": 0.23480825958702065, + "grad_norm": 1.5746873617172241, + "learning_rate": 4.729151034690579e-06, + "loss": 0.0797, + "step": 597 + }, + { + "epoch": 0.2352015732546706, + "grad_norm": 1.458757996559143, + "learning_rate": 4.727594259590326e-06, + "loss": 0.07, + "step": 598 + }, + { + "epoch": 0.23559488692232056, + "grad_norm": 1.9289155006408691, + "learning_rate": 4.726033281063766e-06, + "loss": 0.0447, + "step": 599 + }, + { + "epoch": 0.2359882005899705, + "grad_norm": 2.641873359680176, + "learning_rate": 4.724468102056434e-06, + "loss": 0.1165, + "step": 600 + }, + { + "epoch": 0.23638151425762044, + "grad_norm": 0.6296206116676331, + "learning_rate": 4.722898725521793e-06, + "loss": 0.0597, + "step": 601 + }, + { + "epoch": 0.2367748279252704, + "grad_norm": 1.7393361330032349, + "learning_rate": 4.721325154421224e-06, + "loss": 0.0508, + "step": 602 + }, + { + "epoch": 0.23716814159292035, + "grad_norm": 1.639045000076294, + "learning_rate": 4.7197473917240255e-06, + "loss": 0.0433, + "step": 603 + }, + { + "epoch": 0.2375614552605703, + "grad_norm": 1.4411070346832275, + "learning_rate": 4.718165440407404e-06, + "loss": 0.0626, + "step": 604 + }, + { + "epoch": 0.23795476892822026, + "grad_norm": 1.7141265869140625, + "learning_rate": 4.716579303456471e-06, + "loss": 0.0641, + "step": 605 + }, + { + "epoch": 0.2383480825958702, + "grad_norm": 1.1153072118759155, + "learning_rate": 4.714988983864235e-06, + "loss": 0.0524, + "step": 606 + }, + { + "epoch": 0.23874139626352014, + "grad_norm": 0.6169893741607666, + "learning_rate": 4.713394484631598e-06, + "loss": 0.0485, + "step": 607 + }, + { + "epoch": 0.23913470993117011, + "grad_norm": 2.24593186378479, + "learning_rate": 4.711795808767348e-06, + "loss": 0.0767, + "step": 608 + }, + { + "epoch": 0.23952802359882006, + "grad_norm": 0.8726077675819397, + "learning_rate": 4.7101929592881545e-06, + "loss": 0.0506, + "step": 609 + }, + { + "epoch": 0.23992133726647, + "grad_norm": 1.0482176542282104, + "learning_rate": 4.708585939218564e-06, + "loss": 0.0374, + "step": 610 + }, + { + "epoch": 0.24031465093411997, + "grad_norm": 1.031867265701294, + "learning_rate": 4.7069747515909905e-06, + "loss": 0.0513, + "step": 611 + }, + { + "epoch": 0.2407079646017699, + "grad_norm": 1.548361897468567, + "learning_rate": 4.7053593994457135e-06, + "loss": 0.0524, + "step": 612 + }, + { + "epoch": 0.24110127826941985, + "grad_norm": 2.367420196533203, + "learning_rate": 4.70373988583087e-06, + "loss": 0.0915, + "step": 613 + }, + { + "epoch": 0.24149459193706982, + "grad_norm": 1.440256953239441, + "learning_rate": 4.7021162138024524e-06, + "loss": 0.0829, + "step": 614 + }, + { + "epoch": 0.24188790560471976, + "grad_norm": 1.6830074787139893, + "learning_rate": 4.700488386424294e-06, + "loss": 0.0706, + "step": 615 + }, + { + "epoch": 0.2422812192723697, + "grad_norm": 2.811821699142456, + "learning_rate": 4.698856406768076e-06, + "loss": 0.0531, + "step": 616 + }, + { + "epoch": 0.24267453294001967, + "grad_norm": 2.031094551086426, + "learning_rate": 4.697220277913311e-06, + "loss": 0.0751, + "step": 617 + }, + { + "epoch": 0.2430678466076696, + "grad_norm": 1.9269078969955444, + "learning_rate": 4.695580002947341e-06, + "loss": 0.0624, + "step": 618 + }, + { + "epoch": 0.24346116027531955, + "grad_norm": 1.3828526735305786, + "learning_rate": 4.6939355849653325e-06, + "loss": 0.0776, + "step": 619 + }, + { + "epoch": 0.24385447394296952, + "grad_norm": 1.0781844854354858, + "learning_rate": 4.69228702707027e-06, + "loss": 0.0477, + "step": 620 + }, + { + "epoch": 0.24424778761061947, + "grad_norm": 1.0195046663284302, + "learning_rate": 4.69063433237295e-06, + "loss": 0.06, + "step": 621 + }, + { + "epoch": 0.2446411012782694, + "grad_norm": 0.6686704158782959, + "learning_rate": 4.688977503991975e-06, + "loss": 0.0713, + "step": 622 + }, + { + "epoch": 0.24503441494591938, + "grad_norm": 1.7740367650985718, + "learning_rate": 4.687316545053746e-06, + "loss": 0.092, + "step": 623 + }, + { + "epoch": 0.24542772861356932, + "grad_norm": 1.1935254335403442, + "learning_rate": 4.68565145869246e-06, + "loss": 0.0697, + "step": 624 + }, + { + "epoch": 0.24582104228121926, + "grad_norm": 0.7092412710189819, + "learning_rate": 4.683982248050103e-06, + "loss": 0.0647, + "step": 625 + }, + { + "epoch": 0.24621435594886923, + "grad_norm": 2.2962708473205566, + "learning_rate": 4.6823089162764425e-06, + "loss": 0.07, + "step": 626 + }, + { + "epoch": 0.24660766961651917, + "grad_norm": 1.1462363004684448, + "learning_rate": 4.6806314665290205e-06, + "loss": 0.0519, + "step": 627 + }, + { + "epoch": 0.2470009832841691, + "grad_norm": 2.2198500633239746, + "learning_rate": 4.678949901973154e-06, + "loss": 0.0411, + "step": 628 + }, + { + "epoch": 0.24739429695181908, + "grad_norm": 0.703561007976532, + "learning_rate": 4.677264225781921e-06, + "loss": 0.0505, + "step": 629 + }, + { + "epoch": 0.24778761061946902, + "grad_norm": 1.4070128202438354, + "learning_rate": 4.6755744411361585e-06, + "loss": 0.0659, + "step": 630 + }, + { + "epoch": 0.24818092428711896, + "grad_norm": 0.9832798838615417, + "learning_rate": 4.6738805512244575e-06, + "loss": 0.0917, + "step": 631 + }, + { + "epoch": 0.24857423795476893, + "grad_norm": 0.9056950807571411, + "learning_rate": 4.672182559243155e-06, + "loss": 0.0484, + "step": 632 + }, + { + "epoch": 0.24896755162241888, + "grad_norm": 2.0713984966278076, + "learning_rate": 4.670480468396327e-06, + "loss": 0.0729, + "step": 633 + }, + { + "epoch": 0.24936086529006882, + "grad_norm": 0.9963469505310059, + "learning_rate": 4.668774281895786e-06, + "loss": 0.0507, + "step": 634 + }, + { + "epoch": 0.2497541789577188, + "grad_norm": 0.9695498943328857, + "learning_rate": 4.667064002961073e-06, + "loss": 0.0538, + "step": 635 + }, + { + "epoch": 0.25014749262536873, + "grad_norm": 1.3090274333953857, + "learning_rate": 4.66534963481945e-06, + "loss": 0.0931, + "step": 636 + }, + { + "epoch": 0.25054080629301867, + "grad_norm": 1.2280491590499878, + "learning_rate": 4.663631180705894e-06, + "loss": 0.0488, + "step": 637 + }, + { + "epoch": 0.2509341199606686, + "grad_norm": 1.050603985786438, + "learning_rate": 4.661908643863096e-06, + "loss": 0.0723, + "step": 638 + }, + { + "epoch": 0.2513274336283186, + "grad_norm": 1.2820688486099243, + "learning_rate": 4.66018202754145e-06, + "loss": 0.0854, + "step": 639 + }, + { + "epoch": 0.25172074729596855, + "grad_norm": 0.9909592866897583, + "learning_rate": 4.658451334999043e-06, + "loss": 0.0613, + "step": 640 + }, + { + "epoch": 0.2521140609636185, + "grad_norm": 0.7117825746536255, + "learning_rate": 4.656716569501661e-06, + "loss": 0.0249, + "step": 641 + }, + { + "epoch": 0.25250737463126843, + "grad_norm": 1.803819179534912, + "learning_rate": 4.654977734322772e-06, + "loss": 0.0744, + "step": 642 + }, + { + "epoch": 0.2529006882989184, + "grad_norm": 1.2123903036117554, + "learning_rate": 4.653234832743521e-06, + "loss": 0.0893, + "step": 643 + }, + { + "epoch": 0.2532940019665683, + "grad_norm": 1.3053680658340454, + "learning_rate": 4.651487868052731e-06, + "loss": 0.0794, + "step": 644 + }, + { + "epoch": 0.2536873156342183, + "grad_norm": 1.5112253427505493, + "learning_rate": 4.64973684354689e-06, + "loss": 0.1139, + "step": 645 + }, + { + "epoch": 0.25408062930186825, + "grad_norm": 0.4444582164287567, + "learning_rate": 4.647981762530145e-06, + "loss": 0.031, + "step": 646 + }, + { + "epoch": 0.2544739429695182, + "grad_norm": 0.863317608833313, + "learning_rate": 4.6462226283143e-06, + "loss": 0.0336, + "step": 647 + }, + { + "epoch": 0.25486725663716814, + "grad_norm": 2.007761001586914, + "learning_rate": 4.644459444218807e-06, + "loss": 0.0531, + "step": 648 + }, + { + "epoch": 0.2552605703048181, + "grad_norm": 2.1189866065979004, + "learning_rate": 4.642692213570759e-06, + "loss": 0.0906, + "step": 649 + }, + { + "epoch": 0.255653883972468, + "grad_norm": 0.7463569045066833, + "learning_rate": 4.640920939704885e-06, + "loss": 0.0449, + "step": 650 + }, + { + "epoch": 0.256047197640118, + "grad_norm": 2.031602144241333, + "learning_rate": 4.639145625963544e-06, + "loss": 0.0673, + "step": 651 + }, + { + "epoch": 0.25644051130776796, + "grad_norm": 2.0455472469329834, + "learning_rate": 4.637366275696718e-06, + "loss": 0.0495, + "step": 652 + }, + { + "epoch": 0.2568338249754179, + "grad_norm": 1.2602909803390503, + "learning_rate": 4.635582892262006e-06, + "loss": 0.0442, + "step": 653 + }, + { + "epoch": 0.25722713864306784, + "grad_norm": 1.3121466636657715, + "learning_rate": 4.633795479024616e-06, + "loss": 0.0404, + "step": 654 + }, + { + "epoch": 0.2576204523107178, + "grad_norm": 1.028448224067688, + "learning_rate": 4.632004039357364e-06, + "loss": 0.0497, + "step": 655 + }, + { + "epoch": 0.2580137659783677, + "grad_norm": 0.9586936235427856, + "learning_rate": 4.630208576640659e-06, + "loss": 0.0499, + "step": 656 + }, + { + "epoch": 0.2584070796460177, + "grad_norm": 1.3646454811096191, + "learning_rate": 4.628409094262504e-06, + "loss": 0.0383, + "step": 657 + }, + { + "epoch": 0.25880039331366766, + "grad_norm": 1.6489843130111694, + "learning_rate": 4.6266055956184865e-06, + "loss": 0.0458, + "step": 658 + }, + { + "epoch": 0.2591937069813176, + "grad_norm": 1.8696314096450806, + "learning_rate": 4.624798084111773e-06, + "loss": 0.0783, + "step": 659 + }, + { + "epoch": 0.25958702064896755, + "grad_norm": 1.5261452198028564, + "learning_rate": 4.622986563153104e-06, + "loss": 0.0465, + "step": 660 + }, + { + "epoch": 0.2599803343166175, + "grad_norm": 1.8203606605529785, + "learning_rate": 4.621171036160781e-06, + "loss": 0.0767, + "step": 661 + }, + { + "epoch": 0.26037364798426743, + "grad_norm": 1.3250322341918945, + "learning_rate": 4.6193515065606675e-06, + "loss": 0.0607, + "step": 662 + }, + { + "epoch": 0.26076696165191743, + "grad_norm": 1.298017978668213, + "learning_rate": 4.617527977786182e-06, + "loss": 0.0619, + "step": 663 + }, + { + "epoch": 0.26116027531956737, + "grad_norm": 1.0446304082870483, + "learning_rate": 4.615700453278285e-06, + "loss": 0.0268, + "step": 664 + }, + { + "epoch": 0.2615535889872173, + "grad_norm": 1.0812922716140747, + "learning_rate": 4.61386893648548e-06, + "loss": 0.0519, + "step": 665 + }, + { + "epoch": 0.26194690265486725, + "grad_norm": 1.8242236375808716, + "learning_rate": 4.612033430863804e-06, + "loss": 0.0565, + "step": 666 + }, + { + "epoch": 0.2623402163225172, + "grad_norm": 1.567988634109497, + "learning_rate": 4.610193939876818e-06, + "loss": 0.0476, + "step": 667 + }, + { + "epoch": 0.26273352999016714, + "grad_norm": 3.7344436645507812, + "learning_rate": 4.608350466995606e-06, + "loss": 0.0519, + "step": 668 + }, + { + "epoch": 0.26312684365781713, + "grad_norm": 3.131584882736206, + "learning_rate": 4.606503015698765e-06, + "loss": 0.0696, + "step": 669 + }, + { + "epoch": 0.2635201573254671, + "grad_norm": 1.2186100482940674, + "learning_rate": 4.6046515894723985e-06, + "loss": 0.0596, + "step": 670 + }, + { + "epoch": 0.263913470993117, + "grad_norm": 0.8804354667663574, + "learning_rate": 4.602796191810113e-06, + "loss": 0.0465, + "step": 671 + }, + { + "epoch": 0.26430678466076696, + "grad_norm": 1.961540937423706, + "learning_rate": 4.600936826213004e-06, + "loss": 0.0756, + "step": 672 + }, + { + "epoch": 0.2647000983284169, + "grad_norm": 0.739213764667511, + "learning_rate": 4.59907349618966e-06, + "loss": 0.0475, + "step": 673 + }, + { + "epoch": 0.26509341199606684, + "grad_norm": 0.8394540548324585, + "learning_rate": 4.597206205256147e-06, + "loss": 0.0538, + "step": 674 + }, + { + "epoch": 0.26548672566371684, + "grad_norm": 1.5452135801315308, + "learning_rate": 4.595334956936007e-06, + "loss": 0.0664, + "step": 675 + }, + { + "epoch": 0.2658800393313668, + "grad_norm": 1.613324522972107, + "learning_rate": 4.593459754760248e-06, + "loss": 0.0673, + "step": 676 + }, + { + "epoch": 0.2662733529990167, + "grad_norm": 1.4427350759506226, + "learning_rate": 4.591580602267338e-06, + "loss": 0.0509, + "step": 677 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.7156988382339478, + "learning_rate": 4.589697503003203e-06, + "loss": 0.0601, + "step": 678 + }, + { + "epoch": 0.2670599803343166, + "grad_norm": 1.4072953462600708, + "learning_rate": 4.587810460521213e-06, + "loss": 0.0678, + "step": 679 + }, + { + "epoch": 0.26745329400196655, + "grad_norm": 0.7101967930793762, + "learning_rate": 4.585919478382178e-06, + "loss": 0.0522, + "step": 680 + }, + { + "epoch": 0.26784660766961654, + "grad_norm": 0.5038359761238098, + "learning_rate": 4.584024560154348e-06, + "loss": 0.0408, + "step": 681 + }, + { + "epoch": 0.2682399213372665, + "grad_norm": 1.1651291847229004, + "learning_rate": 4.582125709413392e-06, + "loss": 0.0719, + "step": 682 + }, + { + "epoch": 0.2686332350049164, + "grad_norm": 1.0390863418579102, + "learning_rate": 4.580222929742407e-06, + "loss": 0.0402, + "step": 683 + }, + { + "epoch": 0.26902654867256637, + "grad_norm": 1.8808722496032715, + "learning_rate": 4.5783162247318986e-06, + "loss": 0.0612, + "step": 684 + }, + { + "epoch": 0.2694198623402163, + "grad_norm": 1.4362890720367432, + "learning_rate": 4.576405597979782e-06, + "loss": 0.0367, + "step": 685 + }, + { + "epoch": 0.26981317600786625, + "grad_norm": 0.9547756910324097, + "learning_rate": 4.5744910530913725e-06, + "loss": 0.0799, + "step": 686 + }, + { + "epoch": 0.27020648967551625, + "grad_norm": 1.8914170265197754, + "learning_rate": 4.572572593679379e-06, + "loss": 0.048, + "step": 687 + }, + { + "epoch": 0.2705998033431662, + "grad_norm": 1.460436224937439, + "learning_rate": 4.5706502233638935e-06, + "loss": 0.0633, + "step": 688 + }, + { + "epoch": 0.27099311701081613, + "grad_norm": 1.7330501079559326, + "learning_rate": 4.568723945772394e-06, + "loss": 0.0332, + "step": 689 + }, + { + "epoch": 0.2713864306784661, + "grad_norm": 1.1326316595077515, + "learning_rate": 4.5667937645397276e-06, + "loss": 0.0555, + "step": 690 + }, + { + "epoch": 0.271779744346116, + "grad_norm": 0.8753216862678528, + "learning_rate": 4.564859683308107e-06, + "loss": 0.0416, + "step": 691 + }, + { + "epoch": 0.27217305801376596, + "grad_norm": 0.8659785389900208, + "learning_rate": 4.562921705727106e-06, + "loss": 0.0551, + "step": 692 + }, + { + "epoch": 0.27256637168141595, + "grad_norm": 0.502169668674469, + "learning_rate": 4.5609798354536495e-06, + "loss": 0.0284, + "step": 693 + }, + { + "epoch": 0.2729596853490659, + "grad_norm": 2.1083321571350098, + "learning_rate": 4.559034076152009e-06, + "loss": 0.0779, + "step": 694 + }, + { + "epoch": 0.27335299901671584, + "grad_norm": 1.5410869121551514, + "learning_rate": 4.557084431493793e-06, + "loss": 0.0788, + "step": 695 + }, + { + "epoch": 0.2737463126843658, + "grad_norm": 1.707189679145813, + "learning_rate": 4.555130905157943e-06, + "loss": 0.0921, + "step": 696 + }, + { + "epoch": 0.2741396263520157, + "grad_norm": 1.2371059656143188, + "learning_rate": 4.553173500830724e-06, + "loss": 0.0562, + "step": 697 + }, + { + "epoch": 0.27453294001966566, + "grad_norm": 1.6234147548675537, + "learning_rate": 4.55121222220572e-06, + "loss": 0.0471, + "step": 698 + }, + { + "epoch": 0.27492625368731566, + "grad_norm": 1.2629426717758179, + "learning_rate": 4.549247072983825e-06, + "loss": 0.0795, + "step": 699 + }, + { + "epoch": 0.2753195673549656, + "grad_norm": 1.7955608367919922, + "learning_rate": 4.5472780568732356e-06, + "loss": 0.0468, + "step": 700 + }, + { + "epoch": 0.27571288102261554, + "grad_norm": 7.252640724182129, + "learning_rate": 4.545305177589448e-06, + "loss": 0.0699, + "step": 701 + }, + { + "epoch": 0.2761061946902655, + "grad_norm": 1.8121711015701294, + "learning_rate": 4.5433284388552435e-06, + "loss": 0.0718, + "step": 702 + }, + { + "epoch": 0.2764995083579154, + "grad_norm": 0.901907742023468, + "learning_rate": 4.541347844400692e-06, + "loss": 0.0255, + "step": 703 + }, + { + "epoch": 0.27689282202556537, + "grad_norm": 0.7126281261444092, + "learning_rate": 4.539363397963134e-06, + "loss": 0.0509, + "step": 704 + }, + { + "epoch": 0.27728613569321536, + "grad_norm": 2.012707233428955, + "learning_rate": 4.537375103287183e-06, + "loss": 0.0904, + "step": 705 + }, + { + "epoch": 0.2776794493608653, + "grad_norm": 1.7197178602218628, + "learning_rate": 4.53538296412471e-06, + "loss": 0.0617, + "step": 706 + }, + { + "epoch": 0.27807276302851525, + "grad_norm": 2.5714545249938965, + "learning_rate": 4.533386984234841e-06, + "loss": 0.0825, + "step": 707 + }, + { + "epoch": 0.2784660766961652, + "grad_norm": 1.3491824865341187, + "learning_rate": 4.5313871673839525e-06, + "loss": 0.0545, + "step": 708 + }, + { + "epoch": 0.27885939036381513, + "grad_norm": 1.0081161260604858, + "learning_rate": 4.52938351734566e-06, + "loss": 0.046, + "step": 709 + }, + { + "epoch": 0.27925270403146507, + "grad_norm": 1.3097039461135864, + "learning_rate": 4.52737603790081e-06, + "loss": 0.0678, + "step": 710 + }, + { + "epoch": 0.27964601769911507, + "grad_norm": 1.264832615852356, + "learning_rate": 4.525364732837476e-06, + "loss": 0.0408, + "step": 711 + }, + { + "epoch": 0.280039331366765, + "grad_norm": 1.6724627017974854, + "learning_rate": 4.523349605950953e-06, + "loss": 0.0583, + "step": 712 + }, + { + "epoch": 0.28043264503441495, + "grad_norm": 1.2600414752960205, + "learning_rate": 4.521330661043744e-06, + "loss": 0.0762, + "step": 713 + }, + { + "epoch": 0.2808259587020649, + "grad_norm": 0.8454362750053406, + "learning_rate": 4.519307901925558e-06, + "loss": 0.0433, + "step": 714 + }, + { + "epoch": 0.28121927236971483, + "grad_norm": 2.131969451904297, + "learning_rate": 4.517281332413302e-06, + "loss": 0.0738, + "step": 715 + }, + { + "epoch": 0.2816125860373648, + "grad_norm": 2.226288080215454, + "learning_rate": 4.515250956331072e-06, + "loss": 0.0892, + "step": 716 + }, + { + "epoch": 0.2820058997050148, + "grad_norm": 1.6737391948699951, + "learning_rate": 4.513216777510149e-06, + "loss": 0.0556, + "step": 717 + }, + { + "epoch": 0.2823992133726647, + "grad_norm": 1.5575467348098755, + "learning_rate": 4.511178799788987e-06, + "loss": 0.0561, + "step": 718 + }, + { + "epoch": 0.28279252704031466, + "grad_norm": 1.7405011653900146, + "learning_rate": 4.50913702701321e-06, + "loss": 0.0653, + "step": 719 + }, + { + "epoch": 0.2831858407079646, + "grad_norm": 1.097738265991211, + "learning_rate": 4.507091463035601e-06, + "loss": 0.0772, + "step": 720 + }, + { + "epoch": 0.28357915437561454, + "grad_norm": 0.8409376740455627, + "learning_rate": 4.505042111716103e-06, + "loss": 0.0645, + "step": 721 + }, + { + "epoch": 0.2839724680432645, + "grad_norm": 1.1851140260696411, + "learning_rate": 4.502988976921797e-06, + "loss": 0.0462, + "step": 722 + }, + { + "epoch": 0.2843657817109145, + "grad_norm": 1.7740516662597656, + "learning_rate": 4.50093206252691e-06, + "loss": 0.0717, + "step": 723 + }, + { + "epoch": 0.2847590953785644, + "grad_norm": 2.491065263748169, + "learning_rate": 4.498871372412798e-06, + "loss": 0.0575, + "step": 724 + }, + { + "epoch": 0.28515240904621436, + "grad_norm": 1.446291446685791, + "learning_rate": 4.496806910467944e-06, + "loss": 0.0566, + "step": 725 + }, + { + "epoch": 0.2855457227138643, + "grad_norm": 1.2584576606750488, + "learning_rate": 4.494738680587946e-06, + "loss": 0.053, + "step": 726 + }, + { + "epoch": 0.28593903638151424, + "grad_norm": 1.188159704208374, + "learning_rate": 4.492666686675511e-06, + "loss": 0.0627, + "step": 727 + }, + { + "epoch": 0.2863323500491642, + "grad_norm": 1.2687791585922241, + "learning_rate": 4.490590932640453e-06, + "loss": 0.0676, + "step": 728 + }, + { + "epoch": 0.2867256637168142, + "grad_norm": 1.7722615003585815, + "learning_rate": 4.488511422399677e-06, + "loss": 0.0548, + "step": 729 + }, + { + "epoch": 0.2871189773844641, + "grad_norm": 3.2244741916656494, + "learning_rate": 4.48642815987718e-06, + "loss": 0.0763, + "step": 730 + }, + { + "epoch": 0.28751229105211407, + "grad_norm": 1.1106655597686768, + "learning_rate": 4.484341149004035e-06, + "loss": 0.0862, + "step": 731 + }, + { + "epoch": 0.287905604719764, + "grad_norm": 0.6258023381233215, + "learning_rate": 4.482250393718392e-06, + "loss": 0.0526, + "step": 732 + }, + { + "epoch": 0.28829891838741395, + "grad_norm": 0.7904531955718994, + "learning_rate": 4.480155897965463e-06, + "loss": 0.0367, + "step": 733 + }, + { + "epoch": 0.2886922320550639, + "grad_norm": 1.5454163551330566, + "learning_rate": 4.47805766569752e-06, + "loss": 0.0747, + "step": 734 + }, + { + "epoch": 0.2890855457227139, + "grad_norm": 2.1076667308807373, + "learning_rate": 4.475955700873888e-06, + "loss": 0.0939, + "step": 735 + }, + { + "epoch": 0.28947885939036383, + "grad_norm": 1.407893419265747, + "learning_rate": 4.473850007460932e-06, + "loss": 0.0524, + "step": 736 + }, + { + "epoch": 0.28987217305801377, + "grad_norm": 1.957629680633545, + "learning_rate": 4.471740589432053e-06, + "loss": 0.0541, + "step": 737 + }, + { + "epoch": 0.2902654867256637, + "grad_norm": 1.0253725051879883, + "learning_rate": 4.469627450767682e-06, + "loss": 0.0478, + "step": 738 + }, + { + "epoch": 0.29065880039331365, + "grad_norm": 1.5762360095977783, + "learning_rate": 4.46751059545527e-06, + "loss": 0.0936, + "step": 739 + }, + { + "epoch": 0.2910521140609636, + "grad_norm": 1.2460707426071167, + "learning_rate": 4.465390027489279e-06, + "loss": 0.0596, + "step": 740 + }, + { + "epoch": 0.2914454277286136, + "grad_norm": 1.042962670326233, + "learning_rate": 4.463265750871182e-06, + "loss": 0.0615, + "step": 741 + }, + { + "epoch": 0.29183874139626353, + "grad_norm": 1.554513692855835, + "learning_rate": 4.461137769609445e-06, + "loss": 0.0562, + "step": 742 + }, + { + "epoch": 0.2922320550639135, + "grad_norm": 1.5099841356277466, + "learning_rate": 4.459006087719527e-06, + "loss": 0.0462, + "step": 743 + }, + { + "epoch": 0.2926253687315634, + "grad_norm": 0.8272073864936829, + "learning_rate": 4.45687070922387e-06, + "loss": 0.0311, + "step": 744 + }, + { + "epoch": 0.29301868239921336, + "grad_norm": 1.1962639093399048, + "learning_rate": 4.4547316381518905e-06, + "loss": 0.054, + "step": 745 + }, + { + "epoch": 0.2934119960668633, + "grad_norm": 0.7265387773513794, + "learning_rate": 4.4525888785399725e-06, + "loss": 0.0322, + "step": 746 + }, + { + "epoch": 0.2938053097345133, + "grad_norm": 2.045783042907715, + "learning_rate": 4.450442434431463e-06, + "loss": 0.0668, + "step": 747 + }, + { + "epoch": 0.29419862340216324, + "grad_norm": 1.417593240737915, + "learning_rate": 4.448292309876657e-06, + "loss": 0.0499, + "step": 748 + }, + { + "epoch": 0.2945919370698132, + "grad_norm": 1.4235261678695679, + "learning_rate": 4.4461385089328e-06, + "loss": 0.0904, + "step": 749 + }, + { + "epoch": 0.2949852507374631, + "grad_norm": 1.050933837890625, + "learning_rate": 4.44398103566407e-06, + "loss": 0.05, + "step": 750 + }, + { + "epoch": 0.29537856440511306, + "grad_norm": 1.3113094568252563, + "learning_rate": 4.4418198941415756e-06, + "loss": 0.0717, + "step": 751 + }, + { + "epoch": 0.295771878072763, + "grad_norm": 1.1153532266616821, + "learning_rate": 4.4396550884433495e-06, + "loss": 0.0613, + "step": 752 + }, + { + "epoch": 0.296165191740413, + "grad_norm": 1.6574000120162964, + "learning_rate": 4.437486622654337e-06, + "loss": 0.08, + "step": 753 + }, + { + "epoch": 0.29655850540806294, + "grad_norm": 1.037023901939392, + "learning_rate": 4.43531450086639e-06, + "loss": 0.059, + "step": 754 + }, + { + "epoch": 0.2969518190757129, + "grad_norm": 1.3382397890090942, + "learning_rate": 4.433138727178259e-06, + "loss": 0.0504, + "step": 755 + }, + { + "epoch": 0.2973451327433628, + "grad_norm": 2.023531198501587, + "learning_rate": 4.4309593056955865e-06, + "loss": 0.0682, + "step": 756 + }, + { + "epoch": 0.29773844641101277, + "grad_norm": 1.3962974548339844, + "learning_rate": 4.4287762405308974e-06, + "loss": 0.0678, + "step": 757 + }, + { + "epoch": 0.2981317600786627, + "grad_norm": 0.6099796295166016, + "learning_rate": 4.426589535803593e-06, + "loss": 0.0496, + "step": 758 + }, + { + "epoch": 0.2985250737463127, + "grad_norm": 1.6071325540542603, + "learning_rate": 4.424399195639941e-06, + "loss": 0.0519, + "step": 759 + }, + { + "epoch": 0.29891838741396265, + "grad_norm": 1.116490125656128, + "learning_rate": 4.422205224173071e-06, + "loss": 0.0651, + "step": 760 + }, + { + "epoch": 0.2993117010816126, + "grad_norm": 1.163526177406311, + "learning_rate": 4.420007625542963e-06, + "loss": 0.042, + "step": 761 + }, + { + "epoch": 0.29970501474926253, + "grad_norm": 0.6789044737815857, + "learning_rate": 4.417806403896442e-06, + "loss": 0.0652, + "step": 762 + }, + { + "epoch": 0.3000983284169125, + "grad_norm": 1.6137206554412842, + "learning_rate": 4.41560156338717e-06, + "loss": 0.073, + "step": 763 + }, + { + "epoch": 0.3004916420845624, + "grad_norm": 1.9308634996414185, + "learning_rate": 4.413393108175637e-06, + "loss": 0.0805, + "step": 764 + }, + { + "epoch": 0.3008849557522124, + "grad_norm": 1.6792504787445068, + "learning_rate": 4.411181042429156e-06, + "loss": 0.0471, + "step": 765 + }, + { + "epoch": 0.30127826941986235, + "grad_norm": 1.1271363496780396, + "learning_rate": 4.40896537032185e-06, + "loss": 0.0378, + "step": 766 + }, + { + "epoch": 0.3016715830875123, + "grad_norm": 1.0671911239624023, + "learning_rate": 4.406746096034647e-06, + "loss": 0.0548, + "step": 767 + }, + { + "epoch": 0.30206489675516224, + "grad_norm": 1.2227768898010254, + "learning_rate": 4.4045232237552756e-06, + "loss": 0.0701, + "step": 768 + }, + { + "epoch": 0.3024582104228122, + "grad_norm": 1.471924901008606, + "learning_rate": 4.4022967576782525e-06, + "loss": 0.0568, + "step": 769 + }, + { + "epoch": 0.3028515240904621, + "grad_norm": 1.6219385862350464, + "learning_rate": 4.400066702004874e-06, + "loss": 0.05, + "step": 770 + }, + { + "epoch": 0.3032448377581121, + "grad_norm": 1.4471542835235596, + "learning_rate": 4.39783306094321e-06, + "loss": 0.0685, + "step": 771 + }, + { + "epoch": 0.30363815142576206, + "grad_norm": 1.525600552558899, + "learning_rate": 4.395595838708099e-06, + "loss": 0.0513, + "step": 772 + }, + { + "epoch": 0.304031465093412, + "grad_norm": 1.3881157636642456, + "learning_rate": 4.393355039521134e-06, + "loss": 0.0812, + "step": 773 + }, + { + "epoch": 0.30442477876106194, + "grad_norm": 1.1738461256027222, + "learning_rate": 4.391110667610658e-06, + "loss": 0.0595, + "step": 774 + }, + { + "epoch": 0.3048180924287119, + "grad_norm": 1.1576417684555054, + "learning_rate": 4.388862727211759e-06, + "loss": 0.0541, + "step": 775 + }, + { + "epoch": 0.3052114060963618, + "grad_norm": 1.283400058746338, + "learning_rate": 4.386611222566254e-06, + "loss": 0.0505, + "step": 776 + }, + { + "epoch": 0.3056047197640118, + "grad_norm": 1.4386646747589111, + "learning_rate": 4.384356157922688e-06, + "loss": 0.0706, + "step": 777 + }, + { + "epoch": 0.30599803343166176, + "grad_norm": 2.0160024166107178, + "learning_rate": 4.382097537536322e-06, + "loss": 0.0596, + "step": 778 + }, + { + "epoch": 0.3063913470993117, + "grad_norm": 1.3747514486312866, + "learning_rate": 4.379835365669132e-06, + "loss": 0.0561, + "step": 779 + }, + { + "epoch": 0.30678466076696165, + "grad_norm": 1.5668084621429443, + "learning_rate": 4.377569646589789e-06, + "loss": 0.0522, + "step": 780 + }, + { + "epoch": 0.3071779744346116, + "grad_norm": 1.6369160413742065, + "learning_rate": 4.375300384573659e-06, + "loss": 0.05, + "step": 781 + }, + { + "epoch": 0.30757128810226153, + "grad_norm": 1.2633172273635864, + "learning_rate": 4.373027583902796e-06, + "loss": 0.0447, + "step": 782 + }, + { + "epoch": 0.30796460176991153, + "grad_norm": 1.3119875192642212, + "learning_rate": 4.370751248865929e-06, + "loss": 0.062, + "step": 783 + }, + { + "epoch": 0.30835791543756147, + "grad_norm": 2.1404073238372803, + "learning_rate": 4.368471383758459e-06, + "loss": 0.0446, + "step": 784 + }, + { + "epoch": 0.3087512291052114, + "grad_norm": 0.7563901543617249, + "learning_rate": 4.366187992882444e-06, + "loss": 0.0429, + "step": 785 + }, + { + "epoch": 0.30914454277286135, + "grad_norm": 0.7048685550689697, + "learning_rate": 4.3639010805466e-06, + "loss": 0.0299, + "step": 786 + }, + { + "epoch": 0.3095378564405113, + "grad_norm": 0.7395270466804504, + "learning_rate": 4.361610651066283e-06, + "loss": 0.0334, + "step": 787 + }, + { + "epoch": 0.30993117010816124, + "grad_norm": 1.2910830974578857, + "learning_rate": 4.35931670876349e-06, + "loss": 0.0666, + "step": 788 + }, + { + "epoch": 0.31032448377581123, + "grad_norm": 3.32393217086792, + "learning_rate": 4.357019257966844e-06, + "loss": 0.0773, + "step": 789 + }, + { + "epoch": 0.3107177974434612, + "grad_norm": 1.2098692655563354, + "learning_rate": 4.354718303011588e-06, + "loss": 0.0524, + "step": 790 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 1.650527834892273, + "learning_rate": 4.352413848239579e-06, + "loss": 0.0518, + "step": 791 + }, + { + "epoch": 0.31150442477876106, + "grad_norm": 0.8377374410629272, + "learning_rate": 4.35010589799928e-06, + "loss": 0.0482, + "step": 792 + }, + { + "epoch": 0.311897738446411, + "grad_norm": 1.225882649421692, + "learning_rate": 4.347794456645744e-06, + "loss": 0.0405, + "step": 793 + }, + { + "epoch": 0.31229105211406094, + "grad_norm": 2.0014147758483887, + "learning_rate": 4.345479528540618e-06, + "loss": 0.053, + "step": 794 + }, + { + "epoch": 0.31268436578171094, + "grad_norm": 1.2061558961868286, + "learning_rate": 4.343161118052123e-06, + "loss": 0.045, + "step": 795 + }, + { + "epoch": 0.3130776794493609, + "grad_norm": 0.8555061221122742, + "learning_rate": 4.340839229555056e-06, + "loss": 0.0673, + "step": 796 + }, + { + "epoch": 0.3134709931170108, + "grad_norm": 1.4630858898162842, + "learning_rate": 4.338513867430773e-06, + "loss": 0.0414, + "step": 797 + }, + { + "epoch": 0.31386430678466076, + "grad_norm": 1.101480484008789, + "learning_rate": 4.336185036067187e-06, + "loss": 0.0383, + "step": 798 + }, + { + "epoch": 0.3142576204523107, + "grad_norm": 0.6861633658409119, + "learning_rate": 4.3338527398587575e-06, + "loss": 0.0393, + "step": 799 + }, + { + "epoch": 0.31465093411996065, + "grad_norm": 1.0716795921325684, + "learning_rate": 4.33151698320648e-06, + "loss": 0.0407, + "step": 800 + }, + { + "epoch": 0.31504424778761064, + "grad_norm": 1.0103176832199097, + "learning_rate": 4.329177770517881e-06, + "loss": 0.0467, + "step": 801 + }, + { + "epoch": 0.3154375614552606, + "grad_norm": 1.1415047645568848, + "learning_rate": 4.32683510620701e-06, + "loss": 0.0518, + "step": 802 + }, + { + "epoch": 0.3158308751229105, + "grad_norm": 1.0959949493408203, + "learning_rate": 4.324488994694427e-06, + "loss": 0.0447, + "step": 803 + }, + { + "epoch": 0.31622418879056047, + "grad_norm": 3.7971184253692627, + "learning_rate": 4.322139440407198e-06, + "loss": 0.1218, + "step": 804 + }, + { + "epoch": 0.3166175024582104, + "grad_norm": 1.0682744979858398, + "learning_rate": 4.319786447778887e-06, + "loss": 0.0271, + "step": 805 + }, + { + "epoch": 0.31701081612586035, + "grad_norm": 0.7397903800010681, + "learning_rate": 4.317430021249543e-06, + "loss": 0.0313, + "step": 806 + }, + { + "epoch": 0.31740412979351035, + "grad_norm": 1.9803013801574707, + "learning_rate": 4.315070165265695e-06, + "loss": 0.0832, + "step": 807 + }, + { + "epoch": 0.3177974434611603, + "grad_norm": 0.9591525793075562, + "learning_rate": 4.312706884280349e-06, + "loss": 0.0611, + "step": 808 + }, + { + "epoch": 0.31819075712881023, + "grad_norm": 0.7980911731719971, + "learning_rate": 4.310340182752965e-06, + "loss": 0.0163, + "step": 809 + }, + { + "epoch": 0.3185840707964602, + "grad_norm": 0.8986029028892517, + "learning_rate": 4.307970065149464e-06, + "loss": 0.0382, + "step": 810 + }, + { + "epoch": 0.3189773844641101, + "grad_norm": 0.9218258857727051, + "learning_rate": 4.305596535942211e-06, + "loss": 0.0362, + "step": 811 + }, + { + "epoch": 0.31937069813176006, + "grad_norm": 1.9387575387954712, + "learning_rate": 4.303219599610009e-06, + "loss": 0.045, + "step": 812 + }, + { + "epoch": 0.31976401179941005, + "grad_norm": 2.1032979488372803, + "learning_rate": 4.300839260638089e-06, + "loss": 0.0583, + "step": 813 + }, + { + "epoch": 0.32015732546706, + "grad_norm": 0.8777870535850525, + "learning_rate": 4.298455523518102e-06, + "loss": 0.0611, + "step": 814 + }, + { + "epoch": 0.32055063913470994, + "grad_norm": 1.7572643756866455, + "learning_rate": 4.296068392748116e-06, + "loss": 0.053, + "step": 815 + }, + { + "epoch": 0.3209439528023599, + "grad_norm": 1.3729215860366821, + "learning_rate": 4.293677872832599e-06, + "loss": 0.1014, + "step": 816 + }, + { + "epoch": 0.3213372664700098, + "grad_norm": 2.968247175216675, + "learning_rate": 4.291283968282413e-06, + "loss": 0.0422, + "step": 817 + }, + { + "epoch": 0.32173058013765976, + "grad_norm": 1.2367733716964722, + "learning_rate": 4.288886683614809e-06, + "loss": 0.0598, + "step": 818 + }, + { + "epoch": 0.32212389380530976, + "grad_norm": 2.149622678756714, + "learning_rate": 4.286486023353417e-06, + "loss": 0.0834, + "step": 819 + }, + { + "epoch": 0.3225172074729597, + "grad_norm": 2.1104652881622314, + "learning_rate": 4.284081992028235e-06, + "loss": 0.0764, + "step": 820 + }, + { + "epoch": 0.32291052114060964, + "grad_norm": 1.5311528444290161, + "learning_rate": 4.281674594175621e-06, + "loss": 0.0586, + "step": 821 + }, + { + "epoch": 0.3233038348082596, + "grad_norm": 1.432000756263733, + "learning_rate": 4.2792638343382894e-06, + "loss": 0.0787, + "step": 822 + }, + { + "epoch": 0.3236971484759095, + "grad_norm": 1.2007765769958496, + "learning_rate": 4.276849717065295e-06, + "loss": 0.0462, + "step": 823 + }, + { + "epoch": 0.32409046214355947, + "grad_norm": 1.0811890363693237, + "learning_rate": 4.2744322469120296e-06, + "loss": 0.0624, + "step": 824 + }, + { + "epoch": 0.32448377581120946, + "grad_norm": 1.440487265586853, + "learning_rate": 4.272011428440212e-06, + "loss": 0.0557, + "step": 825 + }, + { + "epoch": 0.3248770894788594, + "grad_norm": 2.677267551422119, + "learning_rate": 4.269587266217878e-06, + "loss": 0.0804, + "step": 826 + }, + { + "epoch": 0.32527040314650935, + "grad_norm": 1.07245671749115, + "learning_rate": 4.2671597648193745e-06, + "loss": 0.0542, + "step": 827 + }, + { + "epoch": 0.3256637168141593, + "grad_norm": 1.0649880170822144, + "learning_rate": 4.264728928825347e-06, + "loss": 0.0573, + "step": 828 + }, + { + "epoch": 0.32605703048180923, + "grad_norm": 1.880872130393982, + "learning_rate": 4.262294762822738e-06, + "loss": 0.0892, + "step": 829 + }, + { + "epoch": 0.32645034414945917, + "grad_norm": 1.7007864713668823, + "learning_rate": 4.259857271404767e-06, + "loss": 0.097, + "step": 830 + }, + { + "epoch": 0.32684365781710917, + "grad_norm": 0.9796857237815857, + "learning_rate": 4.257416459170935e-06, + "loss": 0.0372, + "step": 831 + }, + { + "epoch": 0.3272369714847591, + "grad_norm": 1.3802924156188965, + "learning_rate": 4.254972330727004e-06, + "loss": 0.0388, + "step": 832 + }, + { + "epoch": 0.32763028515240905, + "grad_norm": 1.8189585208892822, + "learning_rate": 4.252524890685e-06, + "loss": 0.0504, + "step": 833 + }, + { + "epoch": 0.328023598820059, + "grad_norm": 1.2440087795257568, + "learning_rate": 4.250074143663189e-06, + "loss": 0.055, + "step": 834 + }, + { + "epoch": 0.32841691248770893, + "grad_norm": 1.26856529712677, + "learning_rate": 4.247620094286085e-06, + "loss": 0.0528, + "step": 835 + }, + { + "epoch": 0.3288102261553589, + "grad_norm": 1.8983615636825562, + "learning_rate": 4.2451627471844305e-06, + "loss": 0.0527, + "step": 836 + }, + { + "epoch": 0.3292035398230089, + "grad_norm": 0.9810947179794312, + "learning_rate": 4.24270210699519e-06, + "loss": 0.04, + "step": 837 + }, + { + "epoch": 0.3295968534906588, + "grad_norm": 1.2199605703353882, + "learning_rate": 4.240238178361543e-06, + "loss": 0.0443, + "step": 838 + }, + { + "epoch": 0.32999016715830876, + "grad_norm": 0.5256842374801636, + "learning_rate": 4.237770965932875e-06, + "loss": 0.0267, + "step": 839 + }, + { + "epoch": 0.3303834808259587, + "grad_norm": 1.456432819366455, + "learning_rate": 4.235300474364766e-06, + "loss": 0.0623, + "step": 840 + }, + { + "epoch": 0.33077679449360864, + "grad_norm": 1.4406569004058838, + "learning_rate": 4.232826708318985e-06, + "loss": 0.0453, + "step": 841 + }, + { + "epoch": 0.3311701081612586, + "grad_norm": 1.9302328824996948, + "learning_rate": 4.230349672463481e-06, + "loss": 0.0655, + "step": 842 + }, + { + "epoch": 0.3315634218289086, + "grad_norm": 0.7055051922798157, + "learning_rate": 4.22786937147237e-06, + "loss": 0.0405, + "step": 843 + }, + { + "epoch": 0.3319567354965585, + "grad_norm": 2.823591947555542, + "learning_rate": 4.2253858100259304e-06, + "loss": 0.1111, + "step": 844 + }, + { + "epoch": 0.33235004916420846, + "grad_norm": 1.458694577217102, + "learning_rate": 4.222898992810596e-06, + "loss": 0.0688, + "step": 845 + }, + { + "epoch": 0.3327433628318584, + "grad_norm": 1.3440479040145874, + "learning_rate": 4.220408924518939e-06, + "loss": 0.0654, + "step": 846 + }, + { + "epoch": 0.33313667649950834, + "grad_norm": 1.2197304964065552, + "learning_rate": 4.217915609849671e-06, + "loss": 0.0269, + "step": 847 + }, + { + "epoch": 0.3335299901671583, + "grad_norm": 1.0218877792358398, + "learning_rate": 4.215419053507626e-06, + "loss": 0.0525, + "step": 848 + }, + { + "epoch": 0.3339233038348083, + "grad_norm": 1.4025174379348755, + "learning_rate": 4.212919260203757e-06, + "loss": 0.0947, + "step": 849 + }, + { + "epoch": 0.3343166175024582, + "grad_norm": 0.7898326516151428, + "learning_rate": 4.210416234655125e-06, + "loss": 0.0337, + "step": 850 + }, + { + "epoch": 0.33470993117010817, + "grad_norm": 1.196540355682373, + "learning_rate": 4.207909981584889e-06, + "loss": 0.0578, + "step": 851 + }, + { + "epoch": 0.3351032448377581, + "grad_norm": 0.926796555519104, + "learning_rate": 4.2054005057223e-06, + "loss": 0.0672, + "step": 852 + }, + { + "epoch": 0.33549655850540805, + "grad_norm": 1.2736568450927734, + "learning_rate": 4.202887811802687e-06, + "loss": 0.0484, + "step": 853 + }, + { + "epoch": 0.335889872173058, + "grad_norm": 1.2440752983093262, + "learning_rate": 4.200371904567457e-06, + "loss": 0.0478, + "step": 854 + }, + { + "epoch": 0.336283185840708, + "grad_norm": 1.4759784936904907, + "learning_rate": 4.197852788764075e-06, + "loss": 0.0458, + "step": 855 + }, + { + "epoch": 0.33667649950835793, + "grad_norm": 0.7424830794334412, + "learning_rate": 4.195330469146063e-06, + "loss": 0.0327, + "step": 856 + }, + { + "epoch": 0.33706981317600787, + "grad_norm": 1.2250968217849731, + "learning_rate": 4.1928049504729886e-06, + "loss": 0.0637, + "step": 857 + }, + { + "epoch": 0.3374631268436578, + "grad_norm": 1.2263579368591309, + "learning_rate": 4.1902762375104555e-06, + "loss": 0.0733, + "step": 858 + }, + { + "epoch": 0.33785644051130775, + "grad_norm": 0.5867930054664612, + "learning_rate": 4.187744335030095e-06, + "loss": 0.055, + "step": 859 + }, + { + "epoch": 0.3382497541789577, + "grad_norm": 2.040759563446045, + "learning_rate": 4.185209247809557e-06, + "loss": 0.0664, + "step": 860 + }, + { + "epoch": 0.3386430678466077, + "grad_norm": 2.09037709236145, + "learning_rate": 4.182670980632501e-06, + "loss": 0.0728, + "step": 861 + }, + { + "epoch": 0.33903638151425763, + "grad_norm": 3.822634220123291, + "learning_rate": 4.180129538288587e-06, + "loss": 0.0912, + "step": 862 + }, + { + "epoch": 0.3394296951819076, + "grad_norm": 1.7590773105621338, + "learning_rate": 4.177584925573466e-06, + "loss": 0.0623, + "step": 863 + }, + { + "epoch": 0.3398230088495575, + "grad_norm": 1.2151440382003784, + "learning_rate": 4.175037147288772e-06, + "loss": 0.044, + "step": 864 + }, + { + "epoch": 0.34021632251720746, + "grad_norm": 0.765602171421051, + "learning_rate": 4.172486208242113e-06, + "loss": 0.0811, + "step": 865 + }, + { + "epoch": 0.3406096361848574, + "grad_norm": 0.9690750241279602, + "learning_rate": 4.169932113247059e-06, + "loss": 0.0587, + "step": 866 + }, + { + "epoch": 0.3410029498525074, + "grad_norm": 0.6641612648963928, + "learning_rate": 4.167374867123138e-06, + "loss": 0.0336, + "step": 867 + }, + { + "epoch": 0.34139626352015734, + "grad_norm": 0.9194386601448059, + "learning_rate": 4.164814474695823e-06, + "loss": 0.0566, + "step": 868 + }, + { + "epoch": 0.3417895771878073, + "grad_norm": 2.2128334045410156, + "learning_rate": 4.162250940796523e-06, + "loss": 0.074, + "step": 869 + }, + { + "epoch": 0.3421828908554572, + "grad_norm": 1.8464068174362183, + "learning_rate": 4.159684270262576e-06, + "loss": 0.0736, + "step": 870 + }, + { + "epoch": 0.34257620452310716, + "grad_norm": 0.9694234728813171, + "learning_rate": 4.157114467937239e-06, + "loss": 0.0413, + "step": 871 + }, + { + "epoch": 0.3429695181907571, + "grad_norm": 1.4554444551467896, + "learning_rate": 4.154541538669677e-06, + "loss": 0.0468, + "step": 872 + }, + { + "epoch": 0.3433628318584071, + "grad_norm": 1.3524583578109741, + "learning_rate": 4.151965487314959e-06, + "loss": 0.049, + "step": 873 + }, + { + "epoch": 0.34375614552605704, + "grad_norm": 1.6620694398880005, + "learning_rate": 4.1493863187340415e-06, + "loss": 0.0686, + "step": 874 + }, + { + "epoch": 0.344149459193707, + "grad_norm": 0.8126603364944458, + "learning_rate": 4.146804037793763e-06, + "loss": 0.0335, + "step": 875 + }, + { + "epoch": 0.3445427728613569, + "grad_norm": 1.852401852607727, + "learning_rate": 4.144218649366839e-06, + "loss": 0.0488, + "step": 876 + }, + { + "epoch": 0.34493608652900687, + "grad_norm": 1.165703296661377, + "learning_rate": 4.141630158331845e-06, + "loss": 0.0464, + "step": 877 + }, + { + "epoch": 0.3453294001966568, + "grad_norm": 2.391685962677002, + "learning_rate": 4.139038569573213e-06, + "loss": 0.0829, + "step": 878 + }, + { + "epoch": 0.3457227138643068, + "grad_norm": 1.832273006439209, + "learning_rate": 4.1364438879812194e-06, + "loss": 0.0406, + "step": 879 + }, + { + "epoch": 0.34611602753195675, + "grad_norm": 1.1527806520462036, + "learning_rate": 4.1338461184519776e-06, + "loss": 0.0682, + "step": 880 + }, + { + "epoch": 0.3465093411996067, + "grad_norm": 1.8680974245071411, + "learning_rate": 4.131245265887426e-06, + "loss": 0.0847, + "step": 881 + }, + { + "epoch": 0.34690265486725663, + "grad_norm": 1.7685651779174805, + "learning_rate": 4.1286413351953235e-06, + "loss": 0.0461, + "step": 882 + }, + { + "epoch": 0.3472959685349066, + "grad_norm": 2.0602667331695557, + "learning_rate": 4.126034331289235e-06, + "loss": 0.0992, + "step": 883 + }, + { + "epoch": 0.3476892822025565, + "grad_norm": 1.4323168992996216, + "learning_rate": 4.123424259088525e-06, + "loss": 0.0992, + "step": 884 + }, + { + "epoch": 0.3480825958702065, + "grad_norm": 0.9091783165931702, + "learning_rate": 4.120811123518349e-06, + "loss": 0.0519, + "step": 885 + }, + { + "epoch": 0.34847590953785645, + "grad_norm": 1.3111385107040405, + "learning_rate": 4.1181949295096415e-06, + "loss": 0.0811, + "step": 886 + }, + { + "epoch": 0.3488692232055064, + "grad_norm": 2.218848705291748, + "learning_rate": 4.11557568199911e-06, + "loss": 0.0743, + "step": 887 + }, + { + "epoch": 0.34926253687315634, + "grad_norm": 0.9991410970687866, + "learning_rate": 4.112953385929221e-06, + "loss": 0.0488, + "step": 888 + }, + { + "epoch": 0.3496558505408063, + "grad_norm": 1.4411261081695557, + "learning_rate": 4.110328046248196e-06, + "loss": 0.0704, + "step": 889 + }, + { + "epoch": 0.3500491642084562, + "grad_norm": 1.3707761764526367, + "learning_rate": 4.107699667909999e-06, + "loss": 0.0514, + "step": 890 + }, + { + "epoch": 0.3504424778761062, + "grad_norm": 1.438081979751587, + "learning_rate": 4.105068255874328e-06, + "loss": 0.0622, + "step": 891 + }, + { + "epoch": 0.35083579154375616, + "grad_norm": 1.0999984741210938, + "learning_rate": 4.102433815106606e-06, + "loss": 0.0423, + "step": 892 + }, + { + "epoch": 0.3512291052114061, + "grad_norm": 1.6553218364715576, + "learning_rate": 4.09979635057797e-06, + "loss": 0.0621, + "step": 893 + }, + { + "epoch": 0.35162241887905604, + "grad_norm": 2.6534736156463623, + "learning_rate": 4.097155867265264e-06, + "loss": 0.0956, + "step": 894 + }, + { + "epoch": 0.352015732546706, + "grad_norm": 1.2164000272750854, + "learning_rate": 4.094512370151027e-06, + "loss": 0.064, + "step": 895 + }, + { + "epoch": 0.3524090462143559, + "grad_norm": 1.4759900569915771, + "learning_rate": 4.091865864223487e-06, + "loss": 0.0496, + "step": 896 + }, + { + "epoch": 0.3528023598820059, + "grad_norm": 1.3511669635772705, + "learning_rate": 4.089216354476545e-06, + "loss": 0.0662, + "step": 897 + }, + { + "epoch": 0.35319567354965586, + "grad_norm": 1.4343103170394897, + "learning_rate": 4.086563845909779e-06, + "loss": 0.0543, + "step": 898 + }, + { + "epoch": 0.3535889872173058, + "grad_norm": 0.5085878968238831, + "learning_rate": 4.083908343528415e-06, + "loss": 0.0457, + "step": 899 + }, + { + "epoch": 0.35398230088495575, + "grad_norm": 0.9629530906677246, + "learning_rate": 4.081249852343336e-06, + "loss": 0.0422, + "step": 900 + }, + { + "epoch": 0.3543756145526057, + "grad_norm": 1.697277307510376, + "learning_rate": 4.078588377371062e-06, + "loss": 0.0583, + "step": 901 + }, + { + "epoch": 0.35476892822025563, + "grad_norm": 1.2820713520050049, + "learning_rate": 4.075923923633745e-06, + "loss": 0.0621, + "step": 902 + }, + { + "epoch": 0.3551622418879056, + "grad_norm": 0.9127804636955261, + "learning_rate": 4.073256496159153e-06, + "loss": 0.0616, + "step": 903 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 1.4303189516067505, + "learning_rate": 4.070586099980672e-06, + "loss": 0.0556, + "step": 904 + }, + { + "epoch": 0.3559488692232055, + "grad_norm": 0.8110685348510742, + "learning_rate": 4.067912740137285e-06, + "loss": 0.0665, + "step": 905 + }, + { + "epoch": 0.35634218289085545, + "grad_norm": 1.490004062652588, + "learning_rate": 4.06523642167357e-06, + "loss": 0.0771, + "step": 906 + }, + { + "epoch": 0.3567354965585054, + "grad_norm": 1.763295292854309, + "learning_rate": 4.062557149639688e-06, + "loss": 0.0824, + "step": 907 + }, + { + "epoch": 0.35712881022615534, + "grad_norm": 2.5675792694091797, + "learning_rate": 4.059874929091369e-06, + "loss": 0.0886, + "step": 908 + }, + { + "epoch": 0.35752212389380533, + "grad_norm": 1.442456841468811, + "learning_rate": 4.057189765089914e-06, + "loss": 0.0507, + "step": 909 + }, + { + "epoch": 0.3579154375614553, + "grad_norm": 1.2593395709991455, + "learning_rate": 4.054501662702172e-06, + "loss": 0.0555, + "step": 910 + }, + { + "epoch": 0.3583087512291052, + "grad_norm": 1.1391284465789795, + "learning_rate": 4.05181062700054e-06, + "loss": 0.058, + "step": 911 + }, + { + "epoch": 0.35870206489675516, + "grad_norm": 0.7833881378173828, + "learning_rate": 4.049116663062949e-06, + "loss": 0.0588, + "step": 912 + }, + { + "epoch": 0.3590953785644051, + "grad_norm": 1.7920033931732178, + "learning_rate": 4.046419775972855e-06, + "loss": 0.1015, + "step": 913 + }, + { + "epoch": 0.35948869223205504, + "grad_norm": 1.4693628549575806, + "learning_rate": 4.043719970819231e-06, + "loss": 0.0734, + "step": 914 + }, + { + "epoch": 0.35988200589970504, + "grad_norm": 0.9692854285240173, + "learning_rate": 4.041017252696556e-06, + "loss": 0.0537, + "step": 915 + }, + { + "epoch": 0.360275319567355, + "grad_norm": 0.9593791961669922, + "learning_rate": 4.038311626704806e-06, + "loss": 0.0599, + "step": 916 + }, + { + "epoch": 0.3606686332350049, + "grad_norm": 1.1619371175765991, + "learning_rate": 4.035603097949444e-06, + "loss": 0.0597, + "step": 917 + }, + { + "epoch": 0.36106194690265486, + "grad_norm": 1.3384184837341309, + "learning_rate": 4.032891671541409e-06, + "loss": 0.0513, + "step": 918 + }, + { + "epoch": 0.3614552605703048, + "grad_norm": 0.7744063138961792, + "learning_rate": 4.030177352597109e-06, + "loss": 0.0428, + "step": 919 + }, + { + "epoch": 0.36184857423795475, + "grad_norm": 1.1778054237365723, + "learning_rate": 4.027460146238411e-06, + "loss": 0.0733, + "step": 920 + }, + { + "epoch": 0.36224188790560474, + "grad_norm": 1.161788821220398, + "learning_rate": 4.02474005759263e-06, + "loss": 0.0735, + "step": 921 + }, + { + "epoch": 0.3626352015732547, + "grad_norm": 2.0623209476470947, + "learning_rate": 4.022017091792518e-06, + "loss": 0.065, + "step": 922 + }, + { + "epoch": 0.3630285152409046, + "grad_norm": 1.3139375448226929, + "learning_rate": 4.01929125397626e-06, + "loss": 0.0582, + "step": 923 + }, + { + "epoch": 0.36342182890855457, + "grad_norm": 2.0761849880218506, + "learning_rate": 4.016562549287455e-06, + "loss": 0.0557, + "step": 924 + }, + { + "epoch": 0.3638151425762045, + "grad_norm": 1.474522352218628, + "learning_rate": 4.013830982875117e-06, + "loss": 0.0665, + "step": 925 + }, + { + "epoch": 0.36420845624385445, + "grad_norm": 1.7274634838104248, + "learning_rate": 4.0110965598936565e-06, + "loss": 0.0735, + "step": 926 + }, + { + "epoch": 0.36460176991150445, + "grad_norm": 0.7064616084098816, + "learning_rate": 4.008359285502877e-06, + "loss": 0.0449, + "step": 927 + }, + { + "epoch": 0.3649950835791544, + "grad_norm": 0.8762916922569275, + "learning_rate": 4.005619164867959e-06, + "loss": 0.0582, + "step": 928 + }, + { + "epoch": 0.36538839724680433, + "grad_norm": 1.2766094207763672, + "learning_rate": 4.002876203159458e-06, + "loss": 0.0467, + "step": 929 + }, + { + "epoch": 0.36578171091445427, + "grad_norm": 1.4357662200927734, + "learning_rate": 4.000130405553287e-06, + "loss": 0.0676, + "step": 930 + }, + { + "epoch": 0.3661750245821042, + "grad_norm": 1.755672574043274, + "learning_rate": 3.997381777230714e-06, + "loss": 0.0647, + "step": 931 + }, + { + "epoch": 0.36656833824975416, + "grad_norm": 0.9483436942100525, + "learning_rate": 3.994630323378344e-06, + "loss": 0.0601, + "step": 932 + }, + { + "epoch": 0.36696165191740415, + "grad_norm": 1.6659551858901978, + "learning_rate": 3.991876049188116e-06, + "loss": 0.0738, + "step": 933 + }, + { + "epoch": 0.3673549655850541, + "grad_norm": 1.5737981796264648, + "learning_rate": 3.989118959857293e-06, + "loss": 0.0483, + "step": 934 + }, + { + "epoch": 0.36774827925270404, + "grad_norm": 1.5014865398406982, + "learning_rate": 3.986359060588446e-06, + "loss": 0.0458, + "step": 935 + }, + { + "epoch": 0.368141592920354, + "grad_norm": 1.5164520740509033, + "learning_rate": 3.983596356589452e-06, + "loss": 0.0617, + "step": 936 + }, + { + "epoch": 0.3685349065880039, + "grad_norm": 2.2842421531677246, + "learning_rate": 3.980830853073476e-06, + "loss": 0.0816, + "step": 937 + }, + { + "epoch": 0.36892822025565386, + "grad_norm": 1.5114701986312866, + "learning_rate": 3.978062555258972e-06, + "loss": 0.0355, + "step": 938 + }, + { + "epoch": 0.36932153392330386, + "grad_norm": 1.2816709280014038, + "learning_rate": 3.975291468369661e-06, + "loss": 0.0556, + "step": 939 + }, + { + "epoch": 0.3697148475909538, + "grad_norm": 2.0237350463867188, + "learning_rate": 3.97251759763453e-06, + "loss": 0.0622, + "step": 940 + }, + { + "epoch": 0.37010816125860374, + "grad_norm": 1.3120791912078857, + "learning_rate": 3.969740948287817e-06, + "loss": 0.0414, + "step": 941 + }, + { + "epoch": 0.3705014749262537, + "grad_norm": 1.3838061094284058, + "learning_rate": 3.966961525569005e-06, + "loss": 0.0653, + "step": 942 + }, + { + "epoch": 0.3708947885939036, + "grad_norm": 0.6813984513282776, + "learning_rate": 3.964179334722811e-06, + "loss": 0.0345, + "step": 943 + }, + { + "epoch": 0.37128810226155357, + "grad_norm": 0.8976694345474243, + "learning_rate": 3.961394380999173e-06, + "loss": 0.0314, + "step": 944 + }, + { + "epoch": 0.37168141592920356, + "grad_norm": 0.9033572673797607, + "learning_rate": 3.958606669653243e-06, + "loss": 0.0542, + "step": 945 + }, + { + "epoch": 0.3720747295968535, + "grad_norm": 0.901779055595398, + "learning_rate": 3.955816205945378e-06, + "loss": 0.0359, + "step": 946 + }, + { + "epoch": 0.37246804326450345, + "grad_norm": 2.198181390762329, + "learning_rate": 3.953022995141128e-06, + "loss": 0.0473, + "step": 947 + }, + { + "epoch": 0.3728613569321534, + "grad_norm": 1.4871481657028198, + "learning_rate": 3.950227042511226e-06, + "loss": 0.0888, + "step": 948 + }, + { + "epoch": 0.37325467059980333, + "grad_norm": 1.3157522678375244, + "learning_rate": 3.947428353331579e-06, + "loss": 0.041, + "step": 949 + }, + { + "epoch": 0.37364798426745327, + "grad_norm": 1.431186318397522, + "learning_rate": 3.94462693288326e-06, + "loss": 0.0799, + "step": 950 + }, + { + "epoch": 0.37404129793510327, + "grad_norm": 1.389054775238037, + "learning_rate": 3.941822786452491e-06, + "loss": 0.0457, + "step": 951 + }, + { + "epoch": 0.3744346116027532, + "grad_norm": 1.6102625131607056, + "learning_rate": 3.939015919330643e-06, + "loss": 0.0926, + "step": 952 + }, + { + "epoch": 0.37482792527040315, + "grad_norm": 0.8472495675086975, + "learning_rate": 3.936206336814219e-06, + "loss": 0.0408, + "step": 953 + }, + { + "epoch": 0.3752212389380531, + "grad_norm": 0.8631911873817444, + "learning_rate": 3.933394044204843e-06, + "loss": 0.0405, + "step": 954 + }, + { + "epoch": 0.37561455260570303, + "grad_norm": 5.559257507324219, + "learning_rate": 3.930579046809259e-06, + "loss": 0.048, + "step": 955 + }, + { + "epoch": 0.376007866273353, + "grad_norm": 1.6139276027679443, + "learning_rate": 3.92776134993931e-06, + "loss": 0.0596, + "step": 956 + }, + { + "epoch": 0.376401179941003, + "grad_norm": 1.7035290002822876, + "learning_rate": 3.924940958911933e-06, + "loss": 0.061, + "step": 957 + }, + { + "epoch": 0.3767944936086529, + "grad_norm": 0.8409842848777771, + "learning_rate": 3.922117879049152e-06, + "loss": 0.0416, + "step": 958 + }, + { + "epoch": 0.37718780727630286, + "grad_norm": 1.9367414712905884, + "learning_rate": 3.91929211567806e-06, + "loss": 0.0617, + "step": 959 + }, + { + "epoch": 0.3775811209439528, + "grad_norm": 1.0128939151763916, + "learning_rate": 3.916463674130821e-06, + "loss": 0.0477, + "step": 960 + }, + { + "epoch": 0.37797443461160274, + "grad_norm": 1.9125791788101196, + "learning_rate": 3.913632559744645e-06, + "loss": 0.0571, + "step": 961 + }, + { + "epoch": 0.3783677482792527, + "grad_norm": 1.4633182287216187, + "learning_rate": 3.910798777861788e-06, + "loss": 0.0511, + "step": 962 + }, + { + "epoch": 0.3787610619469027, + "grad_norm": 0.9891822934150696, + "learning_rate": 3.9079623338295436e-06, + "loss": 0.0485, + "step": 963 + }, + { + "epoch": 0.3791543756145526, + "grad_norm": 1.2277315855026245, + "learning_rate": 3.9051232330002245e-06, + "loss": 0.0449, + "step": 964 + }, + { + "epoch": 0.37954768928220256, + "grad_norm": 0.49736377596855164, + "learning_rate": 3.902281480731156e-06, + "loss": 0.0213, + "step": 965 + }, + { + "epoch": 0.3799410029498525, + "grad_norm": 0.982218861579895, + "learning_rate": 3.899437082384671e-06, + "loss": 0.0581, + "step": 966 + }, + { + "epoch": 0.38033431661750244, + "grad_norm": 0.8971213102340698, + "learning_rate": 3.89659004332809e-06, + "loss": 0.0458, + "step": 967 + }, + { + "epoch": 0.3807276302851524, + "grad_norm": 0.4127979874610901, + "learning_rate": 3.893740368933722e-06, + "loss": 0.0313, + "step": 968 + }, + { + "epoch": 0.3811209439528024, + "grad_norm": 2.5857155323028564, + "learning_rate": 3.8908880645788464e-06, + "loss": 0.0711, + "step": 969 + }, + { + "epoch": 0.3815142576204523, + "grad_norm": 1.2110406160354614, + "learning_rate": 3.888033135645702e-06, + "loss": 0.0508, + "step": 970 + }, + { + "epoch": 0.38190757128810227, + "grad_norm": 1.58492112159729, + "learning_rate": 3.885175587521486e-06, + "loss": 0.0662, + "step": 971 + }, + { + "epoch": 0.3823008849557522, + "grad_norm": 0.8792701363563538, + "learning_rate": 3.882315425598334e-06, + "loss": 0.0767, + "step": 972 + }, + { + "epoch": 0.38269419862340215, + "grad_norm": 1.797515869140625, + "learning_rate": 3.879452655273316e-06, + "loss": 0.0585, + "step": 973 + }, + { + "epoch": 0.3830875122910521, + "grad_norm": 1.6386829614639282, + "learning_rate": 3.876587281948422e-06, + "loss": 0.08, + "step": 974 + }, + { + "epoch": 0.3834808259587021, + "grad_norm": 1.1229251623153687, + "learning_rate": 3.873719311030556e-06, + "loss": 0.0585, + "step": 975 + }, + { + "epoch": 0.38387413962635203, + "grad_norm": 1.2260591983795166, + "learning_rate": 3.8708487479315204e-06, + "loss": 0.0647, + "step": 976 + }, + { + "epoch": 0.38426745329400197, + "grad_norm": 1.565321683883667, + "learning_rate": 3.867975598068012e-06, + "loss": 0.067, + "step": 977 + }, + { + "epoch": 0.3846607669616519, + "grad_norm": 1.4004123210906982, + "learning_rate": 3.8650998668616085e-06, + "loss": 0.0765, + "step": 978 + }, + { + "epoch": 0.38505408062930185, + "grad_norm": 1.5652803182601929, + "learning_rate": 3.862221559738757e-06, + "loss": 0.0672, + "step": 979 + }, + { + "epoch": 0.3854473942969518, + "grad_norm": 4.284322738647461, + "learning_rate": 3.859340682130766e-06, + "loss": 0.0692, + "step": 980 + }, + { + "epoch": 0.3858407079646018, + "grad_norm": 1.21330988407135, + "learning_rate": 3.856457239473795e-06, + "loss": 0.0828, + "step": 981 + }, + { + "epoch": 0.38623402163225173, + "grad_norm": 2.4526336193084717, + "learning_rate": 3.853571237208843e-06, + "loss": 0.0694, + "step": 982 + }, + { + "epoch": 0.3866273352999017, + "grad_norm": 1.0117402076721191, + "learning_rate": 3.8506826807817395e-06, + "loss": 0.0362, + "step": 983 + }, + { + "epoch": 0.3870206489675516, + "grad_norm": 1.1363615989685059, + "learning_rate": 3.847791575643134e-06, + "loss": 0.0543, + "step": 984 + }, + { + "epoch": 0.38741396263520156, + "grad_norm": 1.1766973733901978, + "learning_rate": 3.844897927248483e-06, + "loss": 0.0488, + "step": 985 + }, + { + "epoch": 0.3878072763028515, + "grad_norm": 0.8534460067749023, + "learning_rate": 3.842001741058045e-06, + "loss": 0.0603, + "step": 986 + }, + { + "epoch": 0.3882005899705015, + "grad_norm": 1.5655368566513062, + "learning_rate": 3.839103022536865e-06, + "loss": 0.0713, + "step": 987 + }, + { + "epoch": 0.38859390363815144, + "grad_norm": 0.6574957966804504, + "learning_rate": 3.836201777154769e-06, + "loss": 0.0583, + "step": 988 + }, + { + "epoch": 0.3889872173058014, + "grad_norm": 0.8077657222747803, + "learning_rate": 3.833298010386347e-06, + "loss": 0.05, + "step": 989 + }, + { + "epoch": 0.3893805309734513, + "grad_norm": 1.513853669166565, + "learning_rate": 3.830391727710954e-06, + "loss": 0.0502, + "step": 990 + }, + { + "epoch": 0.38977384464110126, + "grad_norm": 2.019428253173828, + "learning_rate": 3.827482934612684e-06, + "loss": 0.0557, + "step": 991 + }, + { + "epoch": 0.3901671583087512, + "grad_norm": 1.0257922410964966, + "learning_rate": 3.824571636580372e-06, + "loss": 0.0625, + "step": 992 + }, + { + "epoch": 0.3905604719764012, + "grad_norm": 0.5803849697113037, + "learning_rate": 3.821657839107583e-06, + "loss": 0.0442, + "step": 993 + }, + { + "epoch": 0.39095378564405114, + "grad_norm": 0.8499471545219421, + "learning_rate": 3.818741547692593e-06, + "loss": 0.0342, + "step": 994 + }, + { + "epoch": 0.3913470993117011, + "grad_norm": 0.4951908588409424, + "learning_rate": 3.815822767838386e-06, + "loss": 0.0343, + "step": 995 + }, + { + "epoch": 0.391740412979351, + "grad_norm": 1.5221655368804932, + "learning_rate": 3.812901505052642e-06, + "loss": 0.0465, + "step": 996 + }, + { + "epoch": 0.39213372664700097, + "grad_norm": 1.7891956567764282, + "learning_rate": 3.8099777648477264e-06, + "loss": 0.0821, + "step": 997 + }, + { + "epoch": 0.3925270403146509, + "grad_norm": 0.8419029116630554, + "learning_rate": 3.8070515527406803e-06, + "loss": 0.0546, + "step": 998 + }, + { + "epoch": 0.3929203539823009, + "grad_norm": 0.9236086010932922, + "learning_rate": 3.8041228742532064e-06, + "loss": 0.0423, + "step": 999 + }, + { + "epoch": 0.39331366764995085, + "grad_norm": 1.0892646312713623, + "learning_rate": 3.8011917349116633e-06, + "loss": 0.0531, + "step": 1000 + }, + { + "epoch": 0.3937069813176008, + "grad_norm": 1.6544411182403564, + "learning_rate": 3.7982581402470536e-06, + "loss": 0.0404, + "step": 1001 + }, + { + "epoch": 0.39410029498525073, + "grad_norm": 1.8338655233383179, + "learning_rate": 3.795322095795012e-06, + "loss": 0.0535, + "step": 1002 + }, + { + "epoch": 0.3944936086529007, + "grad_norm": 1.4561970233917236, + "learning_rate": 3.7923836070957963e-06, + "loss": 0.0506, + "step": 1003 + }, + { + "epoch": 0.3948869223205506, + "grad_norm": 1.1206718683242798, + "learning_rate": 3.7894426796942773e-06, + "loss": 0.07, + "step": 1004 + }, + { + "epoch": 0.3952802359882006, + "grad_norm": 1.5864077806472778, + "learning_rate": 3.786499319139926e-06, + "loss": 0.0511, + "step": 1005 + }, + { + "epoch": 0.39567354965585055, + "grad_norm": 1.6479477882385254, + "learning_rate": 3.7835535309868055e-06, + "loss": 0.1065, + "step": 1006 + }, + { + "epoch": 0.3960668633235005, + "grad_norm": 1.173240303993225, + "learning_rate": 3.78060532079356e-06, + "loss": 0.0366, + "step": 1007 + }, + { + "epoch": 0.39646017699115044, + "grad_norm": 1.512009859085083, + "learning_rate": 3.777654694123404e-06, + "loss": 0.0333, + "step": 1008 + }, + { + "epoch": 0.3968534906588004, + "grad_norm": 0.7629926800727844, + "learning_rate": 3.7747016565441112e-06, + "loss": 0.0293, + "step": 1009 + }, + { + "epoch": 0.3972468043264503, + "grad_norm": 1.325535774230957, + "learning_rate": 3.771746213628006e-06, + "loss": 0.0494, + "step": 1010 + }, + { + "epoch": 0.3976401179941003, + "grad_norm": 0.9456796050071716, + "learning_rate": 3.7687883709519496e-06, + "loss": 0.0347, + "step": 1011 + }, + { + "epoch": 0.39803343166175026, + "grad_norm": 1.6305729150772095, + "learning_rate": 3.7658281340973336e-06, + "loss": 0.0782, + "step": 1012 + }, + { + "epoch": 0.3984267453294002, + "grad_norm": 2.3638815879821777, + "learning_rate": 3.7628655086500654e-06, + "loss": 0.0746, + "step": 1013 + }, + { + "epoch": 0.39882005899705014, + "grad_norm": 1.1770771741867065, + "learning_rate": 3.7599005002005616e-06, + "loss": 0.0436, + "step": 1014 + }, + { + "epoch": 0.3992133726647001, + "grad_norm": 1.2992199659347534, + "learning_rate": 3.7569331143437336e-06, + "loss": 0.0565, + "step": 1015 + }, + { + "epoch": 0.39960668633235, + "grad_norm": 1.2094827890396118, + "learning_rate": 3.7539633566789812e-06, + "loss": 0.0536, + "step": 1016 + }, + { + "epoch": 0.4, + "grad_norm": 1.641381859779358, + "learning_rate": 3.750991232810177e-06, + "loss": 0.0373, + "step": 1017 + }, + { + "epoch": 0.40039331366764996, + "grad_norm": 0.7891103029251099, + "learning_rate": 3.7480167483456603e-06, + "loss": 0.0632, + "step": 1018 + }, + { + "epoch": 0.4007866273352999, + "grad_norm": 0.7216825485229492, + "learning_rate": 3.7450399088982247e-06, + "loss": 0.0513, + "step": 1019 + }, + { + "epoch": 0.40117994100294985, + "grad_norm": 0.7158090472221375, + "learning_rate": 3.742060720085107e-06, + "loss": 0.0456, + "step": 1020 + }, + { + "epoch": 0.4015732546705998, + "grad_norm": 0.58232182264328, + "learning_rate": 3.739079187527978e-06, + "loss": 0.027, + "step": 1021 + }, + { + "epoch": 0.40196656833824973, + "grad_norm": 1.546899437904358, + "learning_rate": 3.73609531685293e-06, + "loss": 0.1034, + "step": 1022 + }, + { + "epoch": 0.4023598820058997, + "grad_norm": 1.1753488779067993, + "learning_rate": 3.733109113690469e-06, + "loss": 0.0609, + "step": 1023 + }, + { + "epoch": 0.40275319567354967, + "grad_norm": 1.5217546224594116, + "learning_rate": 3.7301205836755006e-06, + "loss": 0.0853, + "step": 1024 + }, + { + "epoch": 0.4031465093411996, + "grad_norm": 0.9366397857666016, + "learning_rate": 3.727129732447322e-06, + "loss": 0.0511, + "step": 1025 + }, + { + "epoch": 0.40353982300884955, + "grad_norm": 0.8296689391136169, + "learning_rate": 3.7241365656496103e-06, + "loss": 0.0336, + "step": 1026 + }, + { + "epoch": 0.4039331366764995, + "grad_norm": 0.8638429641723633, + "learning_rate": 3.7211410889304117e-06, + "loss": 0.0675, + "step": 1027 + }, + { + "epoch": 0.40432645034414944, + "grad_norm": 0.6674923896789551, + "learning_rate": 3.7181433079421316e-06, + "loss": 0.0299, + "step": 1028 + }, + { + "epoch": 0.40471976401179943, + "grad_norm": 1.5683988332748413, + "learning_rate": 3.7151432283415244e-06, + "loss": 0.0814, + "step": 1029 + }, + { + "epoch": 0.4051130776794494, + "grad_norm": 0.6941884756088257, + "learning_rate": 3.712140855789679e-06, + "loss": 0.0428, + "step": 1030 + }, + { + "epoch": 0.4055063913470993, + "grad_norm": 0.8299364447593689, + "learning_rate": 3.709136195952015e-06, + "loss": 0.0534, + "step": 1031 + }, + { + "epoch": 0.40589970501474926, + "grad_norm": 1.065128207206726, + "learning_rate": 3.706129254498266e-06, + "loss": 0.0527, + "step": 1032 + }, + { + "epoch": 0.4062930186823992, + "grad_norm": 1.3388938903808594, + "learning_rate": 3.703120037102469e-06, + "loss": 0.0619, + "step": 1033 + }, + { + "epoch": 0.40668633235004914, + "grad_norm": 1.6854989528656006, + "learning_rate": 3.7001085494429596e-06, + "loss": 0.0605, + "step": 1034 + }, + { + "epoch": 0.40707964601769914, + "grad_norm": 1.7878034114837646, + "learning_rate": 3.697094797202355e-06, + "loss": 0.0644, + "step": 1035 + }, + { + "epoch": 0.4074729596853491, + "grad_norm": 0.7512350082397461, + "learning_rate": 3.694078786067546e-06, + "loss": 0.0561, + "step": 1036 + }, + { + "epoch": 0.407866273352999, + "grad_norm": 0.5946680307388306, + "learning_rate": 3.691060521729686e-06, + "loss": 0.032, + "step": 1037 + }, + { + "epoch": 0.40825958702064896, + "grad_norm": 0.7464413642883301, + "learning_rate": 3.6880400098841794e-06, + "loss": 0.0581, + "step": 1038 + }, + { + "epoch": 0.4086529006882989, + "grad_norm": 1.3339935541152954, + "learning_rate": 3.6850172562306735e-06, + "loss": 0.065, + "step": 1039 + }, + { + "epoch": 0.40904621435594885, + "grad_norm": 1.2734817266464233, + "learning_rate": 3.681992266473044e-06, + "loss": 0.0302, + "step": 1040 + }, + { + "epoch": 0.40943952802359884, + "grad_norm": 1.6477503776550293, + "learning_rate": 3.6789650463193864e-06, + "loss": 0.0454, + "step": 1041 + }, + { + "epoch": 0.4098328416912488, + "grad_norm": 1.9478659629821777, + "learning_rate": 3.675935601482006e-06, + "loss": 0.0906, + "step": 1042 + }, + { + "epoch": 0.4102261553588987, + "grad_norm": 1.2177263498306274, + "learning_rate": 3.6729039376774055e-06, + "loss": 0.0708, + "step": 1043 + }, + { + "epoch": 0.41061946902654867, + "grad_norm": 1.3361903429031372, + "learning_rate": 3.6698700606262733e-06, + "loss": 0.0542, + "step": 1044 + }, + { + "epoch": 0.4110127826941986, + "grad_norm": 0.7786129117012024, + "learning_rate": 3.6668339760534768e-06, + "loss": 0.0666, + "step": 1045 + }, + { + "epoch": 0.41140609636184855, + "grad_norm": 0.4651035964488983, + "learning_rate": 3.6637956896880465e-06, + "loss": 0.0442, + "step": 1046 + }, + { + "epoch": 0.41179941002949855, + "grad_norm": 0.28553763031959534, + "learning_rate": 3.6607552072631685e-06, + "loss": 0.0266, + "step": 1047 + }, + { + "epoch": 0.4121927236971485, + "grad_norm": 1.054947018623352, + "learning_rate": 3.6577125345161748e-06, + "loss": 0.0533, + "step": 1048 + }, + { + "epoch": 0.41258603736479843, + "grad_norm": 0.6713748574256897, + "learning_rate": 3.6546676771885257e-06, + "loss": 0.0347, + "step": 1049 + }, + { + "epoch": 0.41297935103244837, + "grad_norm": 1.4435083866119385, + "learning_rate": 3.6516206410258092e-06, + "loss": 0.0384, + "step": 1050 + }, + { + "epoch": 0.4133726647000983, + "grad_norm": 1.4494538307189941, + "learning_rate": 3.6485714317777223e-06, + "loss": 0.068, + "step": 1051 + }, + { + "epoch": 0.41376597836774826, + "grad_norm": 1.666913390159607, + "learning_rate": 3.6455200551980605e-06, + "loss": 0.0685, + "step": 1052 + }, + { + "epoch": 0.41415929203539825, + "grad_norm": 2.99609375, + "learning_rate": 3.642466517044713e-06, + "loss": 0.1213, + "step": 1053 + }, + { + "epoch": 0.4145526057030482, + "grad_norm": 1.6199326515197754, + "learning_rate": 3.6394108230796455e-06, + "loss": 0.0557, + "step": 1054 + }, + { + "epoch": 0.41494591937069814, + "grad_norm": 0.6611631512641907, + "learning_rate": 3.636352979068891e-06, + "loss": 0.0333, + "step": 1055 + }, + { + "epoch": 0.4153392330383481, + "grad_norm": 0.8349502086639404, + "learning_rate": 3.6332929907825426e-06, + "loss": 0.0285, + "step": 1056 + }, + { + "epoch": 0.415732546705998, + "grad_norm": 1.6354492902755737, + "learning_rate": 3.630230863994736e-06, + "loss": 0.0808, + "step": 1057 + }, + { + "epoch": 0.41612586037364796, + "grad_norm": 0.8214701414108276, + "learning_rate": 3.6271666044836433e-06, + "loss": 0.0355, + "step": 1058 + }, + { + "epoch": 0.41651917404129796, + "grad_norm": 1.321581244468689, + "learning_rate": 3.624100218031464e-06, + "loss": 0.0444, + "step": 1059 + }, + { + "epoch": 0.4169124877089479, + "grad_norm": 0.7428562641143799, + "learning_rate": 3.621031710424407e-06, + "loss": 0.0259, + "step": 1060 + }, + { + "epoch": 0.41730580137659784, + "grad_norm": 0.7929845452308655, + "learning_rate": 3.6179610874526856e-06, + "loss": 0.0345, + "step": 1061 + }, + { + "epoch": 0.4176991150442478, + "grad_norm": 0.6758319139480591, + "learning_rate": 3.614888354910505e-06, + "loss": 0.037, + "step": 1062 + }, + { + "epoch": 0.4180924287118977, + "grad_norm": 1.5147916078567505, + "learning_rate": 3.6118135185960507e-06, + "loss": 0.0855, + "step": 1063 + }, + { + "epoch": 0.41848574237954766, + "grad_norm": 1.0528610944747925, + "learning_rate": 3.6087365843114773e-06, + "loss": 0.0324, + "step": 1064 + }, + { + "epoch": 0.41887905604719766, + "grad_norm": 1.3274002075195312, + "learning_rate": 3.6056575578629006e-06, + "loss": 0.0475, + "step": 1065 + }, + { + "epoch": 0.4192723697148476, + "grad_norm": 0.5520153641700745, + "learning_rate": 3.6025764450603808e-06, + "loss": 0.022, + "step": 1066 + }, + { + "epoch": 0.41966568338249755, + "grad_norm": 1.81023371219635, + "learning_rate": 3.5994932517179182e-06, + "loss": 0.043, + "step": 1067 + }, + { + "epoch": 0.4200589970501475, + "grad_norm": 1.3602193593978882, + "learning_rate": 3.596407983653436e-06, + "loss": 0.073, + "step": 1068 + }, + { + "epoch": 0.42045231071779743, + "grad_norm": 1.921582579612732, + "learning_rate": 3.5933206466887755e-06, + "loss": 0.0759, + "step": 1069 + }, + { + "epoch": 0.42084562438544737, + "grad_norm": 0.8578033447265625, + "learning_rate": 3.59023124664968e-06, + "loss": 0.0249, + "step": 1070 + }, + { + "epoch": 0.42123893805309737, + "grad_norm": 1.7219325304031372, + "learning_rate": 3.5871397893657867e-06, + "loss": 0.0596, + "step": 1071 + }, + { + "epoch": 0.4216322517207473, + "grad_norm": 0.9463638663291931, + "learning_rate": 3.5840462806706126e-06, + "loss": 0.0454, + "step": 1072 + }, + { + "epoch": 0.42202556538839725, + "grad_norm": 1.9718307256698608, + "learning_rate": 3.5809507264015502e-06, + "loss": 0.0623, + "step": 1073 + }, + { + "epoch": 0.4224188790560472, + "grad_norm": 2.0382165908813477, + "learning_rate": 3.5778531323998465e-06, + "loss": 0.0497, + "step": 1074 + }, + { + "epoch": 0.42281219272369713, + "grad_norm": 1.496324062347412, + "learning_rate": 3.574753504510602e-06, + "loss": 0.0826, + "step": 1075 + }, + { + "epoch": 0.4232055063913471, + "grad_norm": 0.49463126063346863, + "learning_rate": 3.571651848582753e-06, + "loss": 0.0415, + "step": 1076 + }, + { + "epoch": 0.42359882005899707, + "grad_norm": 1.1558905839920044, + "learning_rate": 3.5685481704690617e-06, + "loss": 0.0473, + "step": 1077 + }, + { + "epoch": 0.423992133726647, + "grad_norm": 3.914982795715332, + "learning_rate": 3.5654424760261082e-06, + "loss": 0.0853, + "step": 1078 + }, + { + "epoch": 0.42438544739429696, + "grad_norm": 1.7288295030593872, + "learning_rate": 3.5623347711142764e-06, + "loss": 0.0817, + "step": 1079 + }, + { + "epoch": 0.4247787610619469, + "grad_norm": 1.0033987760543823, + "learning_rate": 3.5592250615977434e-06, + "loss": 0.0552, + "step": 1080 + }, + { + "epoch": 0.42517207472959684, + "grad_norm": 1.461305856704712, + "learning_rate": 3.5561133533444703e-06, + "loss": 0.0659, + "step": 1081 + }, + { + "epoch": 0.4255653883972468, + "grad_norm": 0.7007796168327332, + "learning_rate": 3.552999652226189e-06, + "loss": 0.0332, + "step": 1082 + }, + { + "epoch": 0.4259587020648968, + "grad_norm": 0.7041943073272705, + "learning_rate": 3.549883964118392e-06, + "loss": 0.0205, + "step": 1083 + }, + { + "epoch": 0.4263520157325467, + "grad_norm": 1.5797779560089111, + "learning_rate": 3.54676629490032e-06, + "loss": 0.0564, + "step": 1084 + }, + { + "epoch": 0.42674532940019666, + "grad_norm": 1.4408408403396606, + "learning_rate": 3.543646650454955e-06, + "loss": 0.0347, + "step": 1085 + }, + { + "epoch": 0.4271386430678466, + "grad_norm": 0.709080159664154, + "learning_rate": 3.5405250366690023e-06, + "loss": 0.0259, + "step": 1086 + }, + { + "epoch": 0.42753195673549654, + "grad_norm": 1.4579590559005737, + "learning_rate": 3.5374014594328877e-06, + "loss": 0.0712, + "step": 1087 + }, + { + "epoch": 0.4279252704031465, + "grad_norm": 0.9378184676170349, + "learning_rate": 3.5342759246407378e-06, + "loss": 0.0583, + "step": 1088 + }, + { + "epoch": 0.4283185840707965, + "grad_norm": 0.9149574041366577, + "learning_rate": 3.5311484381903754e-06, + "loss": 0.0594, + "step": 1089 + }, + { + "epoch": 0.4287118977384464, + "grad_norm": 1.2301528453826904, + "learning_rate": 3.528019005983306e-06, + "loss": 0.0603, + "step": 1090 + }, + { + "epoch": 0.42910521140609637, + "grad_norm": 1.222373127937317, + "learning_rate": 3.5248876339247053e-06, + "loss": 0.0331, + "step": 1091 + }, + { + "epoch": 0.4294985250737463, + "grad_norm": 1.5141066312789917, + "learning_rate": 3.521754327923412e-06, + "loss": 0.0662, + "step": 1092 + }, + { + "epoch": 0.42989183874139625, + "grad_norm": 1.581040620803833, + "learning_rate": 3.5186190938919106e-06, + "loss": 0.0634, + "step": 1093 + }, + { + "epoch": 0.4302851524090462, + "grad_norm": 1.1250847578048706, + "learning_rate": 3.515481937746327e-06, + "loss": 0.0428, + "step": 1094 + }, + { + "epoch": 0.4306784660766962, + "grad_norm": 1.6886603832244873, + "learning_rate": 3.5123428654064134e-06, + "loss": 0.043, + "step": 1095 + }, + { + "epoch": 0.43107177974434613, + "grad_norm": 2.050182819366455, + "learning_rate": 3.509201882795536e-06, + "loss": 0.1201, + "step": 1096 + }, + { + "epoch": 0.43146509341199607, + "grad_norm": 1.2001996040344238, + "learning_rate": 3.5060589958406677e-06, + "loss": 0.0453, + "step": 1097 + }, + { + "epoch": 0.431858407079646, + "grad_norm": 1.0683172941207886, + "learning_rate": 3.5029142104723725e-06, + "loss": 0.0331, + "step": 1098 + }, + { + "epoch": 0.43225172074729595, + "grad_norm": 2.0737650394439697, + "learning_rate": 3.4997675326247993e-06, + "loss": 0.0526, + "step": 1099 + }, + { + "epoch": 0.4326450344149459, + "grad_norm": 0.8983532190322876, + "learning_rate": 3.4966189682356677e-06, + "loss": 0.0532, + "step": 1100 + }, + { + "epoch": 0.4330383480825959, + "grad_norm": 1.8358802795410156, + "learning_rate": 3.493468523246255e-06, + "loss": 0.0598, + "step": 1101 + }, + { + "epoch": 0.43343166175024583, + "grad_norm": 2.076266050338745, + "learning_rate": 3.4903162036013894e-06, + "loss": 0.0836, + "step": 1102 + }, + { + "epoch": 0.4338249754178958, + "grad_norm": 2.4419870376586914, + "learning_rate": 3.487162015249436e-06, + "loss": 0.0758, + "step": 1103 + }, + { + "epoch": 0.4342182890855457, + "grad_norm": 1.3942052125930786, + "learning_rate": 3.484005964142285e-06, + "loss": 0.0803, + "step": 1104 + }, + { + "epoch": 0.43461160275319566, + "grad_norm": 1.3950960636138916, + "learning_rate": 3.4808480562353426e-06, + "loss": 0.0675, + "step": 1105 + }, + { + "epoch": 0.4350049164208456, + "grad_norm": 1.5000733137130737, + "learning_rate": 3.477688297487519e-06, + "loss": 0.0448, + "step": 1106 + }, + { + "epoch": 0.4353982300884956, + "grad_norm": 1.5005849599838257, + "learning_rate": 3.474526693861216e-06, + "loss": 0.0729, + "step": 1107 + }, + { + "epoch": 0.43579154375614554, + "grad_norm": 0.6299577951431274, + "learning_rate": 3.4713632513223178e-06, + "loss": 0.039, + "step": 1108 + }, + { + "epoch": 0.4361848574237955, + "grad_norm": 0.8964212536811829, + "learning_rate": 3.4681979758401767e-06, + "loss": 0.0521, + "step": 1109 + }, + { + "epoch": 0.4365781710914454, + "grad_norm": 1.3757152557373047, + "learning_rate": 3.465030873387606e-06, + "loss": 0.0598, + "step": 1110 + }, + { + "epoch": 0.43697148475909536, + "grad_norm": 0.48663070797920227, + "learning_rate": 3.461861949940865e-06, + "loss": 0.0442, + "step": 1111 + }, + { + "epoch": 0.4373647984267453, + "grad_norm": 0.8878856897354126, + "learning_rate": 3.458691211479649e-06, + "loss": 0.023, + "step": 1112 + }, + { + "epoch": 0.4377581120943953, + "grad_norm": 1.1162179708480835, + "learning_rate": 3.4555186639870795e-06, + "loss": 0.0493, + "step": 1113 + }, + { + "epoch": 0.43815142576204524, + "grad_norm": 1.1180258989334106, + "learning_rate": 3.4523443134496916e-06, + "loss": 0.0577, + "step": 1114 + }, + { + "epoch": 0.4385447394296952, + "grad_norm": 0.6240465641021729, + "learning_rate": 3.4491681658574205e-06, + "loss": 0.0295, + "step": 1115 + }, + { + "epoch": 0.4389380530973451, + "grad_norm": 2.439685106277466, + "learning_rate": 3.445990227203594e-06, + "loss": 0.0676, + "step": 1116 + }, + { + "epoch": 0.43933136676499507, + "grad_norm": 1.1544771194458008, + "learning_rate": 3.442810503484921e-06, + "loss": 0.0487, + "step": 1117 + }, + { + "epoch": 0.439724680432645, + "grad_norm": 1.794083833694458, + "learning_rate": 3.4396290007014752e-06, + "loss": 0.043, + "step": 1118 + }, + { + "epoch": 0.440117994100295, + "grad_norm": 0.8073402643203735, + "learning_rate": 3.4364457248566913e-06, + "loss": 0.0404, + "step": 1119 + }, + { + "epoch": 0.44051130776794495, + "grad_norm": 0.4391036331653595, + "learning_rate": 3.433260681957346e-06, + "loss": 0.0394, + "step": 1120 + }, + { + "epoch": 0.4409046214355949, + "grad_norm": 1.0611299276351929, + "learning_rate": 3.430073878013554e-06, + "loss": 0.0263, + "step": 1121 + }, + { + "epoch": 0.44129793510324483, + "grad_norm": 0.48767581582069397, + "learning_rate": 3.4268853190387496e-06, + "loss": 0.0341, + "step": 1122 + }, + { + "epoch": 0.4416912487708948, + "grad_norm": 0.6423639059066772, + "learning_rate": 3.423695011049683e-06, + "loss": 0.0234, + "step": 1123 + }, + { + "epoch": 0.4420845624385447, + "grad_norm": 1.0390664339065552, + "learning_rate": 3.4205029600663996e-06, + "loss": 0.0593, + "step": 1124 + }, + { + "epoch": 0.4424778761061947, + "grad_norm": 1.2516858577728271, + "learning_rate": 3.4173091721122375e-06, + "loss": 0.0375, + "step": 1125 + }, + { + "epoch": 0.44287118977384465, + "grad_norm": 1.670310139656067, + "learning_rate": 3.414113653213812e-06, + "loss": 0.0504, + "step": 1126 + }, + { + "epoch": 0.4432645034414946, + "grad_norm": 2.317314624786377, + "learning_rate": 3.410916409401004e-06, + "loss": 0.0911, + "step": 1127 + }, + { + "epoch": 0.44365781710914454, + "grad_norm": 1.418398141860962, + "learning_rate": 3.407717446706948e-06, + "loss": 0.0439, + "step": 1128 + }, + { + "epoch": 0.4440511307767945, + "grad_norm": 1.1104565858840942, + "learning_rate": 3.4045167711680244e-06, + "loss": 0.0485, + "step": 1129 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.8792333602905273, + "learning_rate": 3.4013143888238455e-06, + "loss": 0.064, + "step": 1130 + }, + { + "epoch": 0.4448377581120944, + "grad_norm": 1.7921650409698486, + "learning_rate": 3.398110305717241e-06, + "loss": 0.0495, + "step": 1131 + }, + { + "epoch": 0.44523107177974436, + "grad_norm": 1.4747095108032227, + "learning_rate": 3.3949045278942545e-06, + "loss": 0.0743, + "step": 1132 + }, + { + "epoch": 0.4456243854473943, + "grad_norm": 0.6847875118255615, + "learning_rate": 3.3916970614041244e-06, + "loss": 0.0224, + "step": 1133 + }, + { + "epoch": 0.44601769911504424, + "grad_norm": 0.7522935271263123, + "learning_rate": 3.3884879122992762e-06, + "loss": 0.0334, + "step": 1134 + }, + { + "epoch": 0.4464110127826942, + "grad_norm": 1.5176104307174683, + "learning_rate": 3.3852770866353125e-06, + "loss": 0.0729, + "step": 1135 + }, + { + "epoch": 0.4468043264503441, + "grad_norm": 1.188468337059021, + "learning_rate": 3.382064590470996e-06, + "loss": 0.0315, + "step": 1136 + }, + { + "epoch": 0.4471976401179941, + "grad_norm": 0.5583229660987854, + "learning_rate": 3.378850429868244e-06, + "loss": 0.0292, + "step": 1137 + }, + { + "epoch": 0.44759095378564406, + "grad_norm": 0.7804880738258362, + "learning_rate": 3.3756346108921145e-06, + "loss": 0.0378, + "step": 1138 + }, + { + "epoch": 0.447984267453294, + "grad_norm": 1.090079426765442, + "learning_rate": 3.372417139610793e-06, + "loss": 0.0549, + "step": 1139 + }, + { + "epoch": 0.44837758112094395, + "grad_norm": 1.363856554031372, + "learning_rate": 3.369198022095585e-06, + "loss": 0.0859, + "step": 1140 + }, + { + "epoch": 0.4487708947885939, + "grad_norm": 1.162818431854248, + "learning_rate": 3.3659772644209023e-06, + "loss": 0.0292, + "step": 1141 + }, + { + "epoch": 0.44916420845624383, + "grad_norm": 0.8213643431663513, + "learning_rate": 3.36275487266425e-06, + "loss": 0.0435, + "step": 1142 + }, + { + "epoch": 0.4495575221238938, + "grad_norm": 0.8050291538238525, + "learning_rate": 3.3595308529062176e-06, + "loss": 0.0279, + "step": 1143 + }, + { + "epoch": 0.44995083579154377, + "grad_norm": 1.1065354347229004, + "learning_rate": 3.3563052112304674e-06, + "loss": 0.0425, + "step": 1144 + }, + { + "epoch": 0.4503441494591937, + "grad_norm": 0.9072518348693848, + "learning_rate": 3.3530779537237194e-06, + "loss": 0.0315, + "step": 1145 + }, + { + "epoch": 0.45073746312684365, + "grad_norm": 0.8572150468826294, + "learning_rate": 3.349849086475747e-06, + "loss": 0.0306, + "step": 1146 + }, + { + "epoch": 0.4511307767944936, + "grad_norm": 1.552173137664795, + "learning_rate": 3.346618615579359e-06, + "loss": 0.0671, + "step": 1147 + }, + { + "epoch": 0.45152409046214353, + "grad_norm": 0.9978398084640503, + "learning_rate": 3.3433865471303876e-06, + "loss": 0.0667, + "step": 1148 + }, + { + "epoch": 0.45191740412979353, + "grad_norm": 2.7961080074310303, + "learning_rate": 3.3401528872276847e-06, + "loss": 0.0696, + "step": 1149 + }, + { + "epoch": 0.4523107177974435, + "grad_norm": 1.520912528038025, + "learning_rate": 3.3369176419731004e-06, + "loss": 0.0722, + "step": 1150 + }, + { + "epoch": 0.4527040314650934, + "grad_norm": 0.8389769196510315, + "learning_rate": 3.33368081747148e-06, + "loss": 0.0444, + "step": 1151 + }, + { + "epoch": 0.45309734513274336, + "grad_norm": 2.075424909591675, + "learning_rate": 3.3304424198306464e-06, + "loss": 0.0826, + "step": 1152 + }, + { + "epoch": 0.4534906588003933, + "grad_norm": 0.7416201829910278, + "learning_rate": 3.3272024551613926e-06, + "loss": 0.0283, + "step": 1153 + }, + { + "epoch": 0.45388397246804324, + "grad_norm": 1.0457786321640015, + "learning_rate": 3.3239609295774667e-06, + "loss": 0.0418, + "step": 1154 + }, + { + "epoch": 0.45427728613569324, + "grad_norm": 0.9312077760696411, + "learning_rate": 3.3207178491955656e-06, + "loss": 0.0341, + "step": 1155 + }, + { + "epoch": 0.4546705998033432, + "grad_norm": 0.9886119365692139, + "learning_rate": 3.3174732201353155e-06, + "loss": 0.0623, + "step": 1156 + }, + { + "epoch": 0.4550639134709931, + "grad_norm": 1.2970693111419678, + "learning_rate": 3.3142270485192683e-06, + "loss": 0.087, + "step": 1157 + }, + { + "epoch": 0.45545722713864306, + "grad_norm": 1.273305892944336, + "learning_rate": 3.3109793404728855e-06, + "loss": 0.0654, + "step": 1158 + }, + { + "epoch": 0.455850540806293, + "grad_norm": 0.8121715188026428, + "learning_rate": 3.3077301021245285e-06, + "loss": 0.0257, + "step": 1159 + }, + { + "epoch": 0.45624385447394294, + "grad_norm": 1.6593793630599976, + "learning_rate": 3.3044793396054447e-06, + "loss": 0.0679, + "step": 1160 + }, + { + "epoch": 0.45663716814159294, + "grad_norm": 1.2623846530914307, + "learning_rate": 3.3012270590497596e-06, + "loss": 0.071, + "step": 1161 + }, + { + "epoch": 0.4570304818092429, + "grad_norm": 0.9096400737762451, + "learning_rate": 3.2979732665944615e-06, + "loss": 0.067, + "step": 1162 + }, + { + "epoch": 0.4574237954768928, + "grad_norm": 0.9472593069076538, + "learning_rate": 3.2947179683793928e-06, + "loss": 0.0395, + "step": 1163 + }, + { + "epoch": 0.45781710914454277, + "grad_norm": 0.9576103091239929, + "learning_rate": 3.291461170547237e-06, + "loss": 0.049, + "step": 1164 + }, + { + "epoch": 0.4582104228121927, + "grad_norm": 0.9918181300163269, + "learning_rate": 3.2882028792435072e-06, + "loss": 0.0318, + "step": 1165 + }, + { + "epoch": 0.45860373647984265, + "grad_norm": 1.843493938446045, + "learning_rate": 3.2849431006165343e-06, + "loss": 0.0634, + "step": 1166 + }, + { + "epoch": 0.45899705014749265, + "grad_norm": 0.8672575950622559, + "learning_rate": 3.2816818408174567e-06, + "loss": 0.0826, + "step": 1167 + }, + { + "epoch": 0.4593903638151426, + "grad_norm": 1.5660734176635742, + "learning_rate": 3.278419106000206e-06, + "loss": 0.0695, + "step": 1168 + }, + { + "epoch": 0.45978367748279253, + "grad_norm": 1.3234399557113647, + "learning_rate": 3.2751549023214995e-06, + "loss": 0.0381, + "step": 1169 + }, + { + "epoch": 0.46017699115044247, + "grad_norm": 1.7596269845962524, + "learning_rate": 3.2718892359408245e-06, + "loss": 0.0438, + "step": 1170 + }, + { + "epoch": 0.4605703048180924, + "grad_norm": 0.6878931522369385, + "learning_rate": 3.2686221130204287e-06, + "loss": 0.0347, + "step": 1171 + }, + { + "epoch": 0.46096361848574235, + "grad_norm": 1.0857138633728027, + "learning_rate": 3.265353539725309e-06, + "loss": 0.0609, + "step": 1172 + }, + { + "epoch": 0.46135693215339235, + "grad_norm": 0.777098536491394, + "learning_rate": 3.2620835222231972e-06, + "loss": 0.0597, + "step": 1173 + }, + { + "epoch": 0.4617502458210423, + "grad_norm": 4.028940677642822, + "learning_rate": 3.2588120666845534e-06, + "loss": 0.0702, + "step": 1174 + }, + { + "epoch": 0.46214355948869223, + "grad_norm": 1.3609766960144043, + "learning_rate": 3.255539179282548e-06, + "loss": 0.0478, + "step": 1175 + }, + { + "epoch": 0.4625368731563422, + "grad_norm": 1.3808916807174683, + "learning_rate": 3.2522648661930558e-06, + "loss": 0.0787, + "step": 1176 + }, + { + "epoch": 0.4629301868239921, + "grad_norm": 1.464201807975769, + "learning_rate": 3.2489891335946413e-06, + "loss": 0.0565, + "step": 1177 + }, + { + "epoch": 0.46332350049164206, + "grad_norm": 1.4196548461914062, + "learning_rate": 3.245711987668545e-06, + "loss": 0.0747, + "step": 1178 + }, + { + "epoch": 0.46371681415929206, + "grad_norm": 1.5526188611984253, + "learning_rate": 3.2424334345986787e-06, + "loss": 0.0384, + "step": 1179 + }, + { + "epoch": 0.464110127826942, + "grad_norm": 1.4707880020141602, + "learning_rate": 3.239153480571605e-06, + "loss": 0.0669, + "step": 1180 + }, + { + "epoch": 0.46450344149459194, + "grad_norm": 1.5997252464294434, + "learning_rate": 3.2358721317765344e-06, + "loss": 0.063, + "step": 1181 + }, + { + "epoch": 0.4648967551622419, + "grad_norm": 0.7773184180259705, + "learning_rate": 3.2325893944053066e-06, + "loss": 0.0515, + "step": 1182 + }, + { + "epoch": 0.4652900688298918, + "grad_norm": 1.1635929346084595, + "learning_rate": 3.2293052746523814e-06, + "loss": 0.0494, + "step": 1183 + }, + { + "epoch": 0.46568338249754176, + "grad_norm": 0.9854192137718201, + "learning_rate": 3.2260197787148277e-06, + "loss": 0.0559, + "step": 1184 + }, + { + "epoch": 0.46607669616519176, + "grad_norm": 1.9313583374023438, + "learning_rate": 3.222732912792313e-06, + "loss": 0.0447, + "step": 1185 + }, + { + "epoch": 0.4664700098328417, + "grad_norm": 2.149656295776367, + "learning_rate": 3.2194446830870865e-06, + "loss": 0.0772, + "step": 1186 + }, + { + "epoch": 0.46686332350049164, + "grad_norm": 1.784822940826416, + "learning_rate": 3.2161550958039732e-06, + "loss": 0.0746, + "step": 1187 + }, + { + "epoch": 0.4672566371681416, + "grad_norm": 1.5821526050567627, + "learning_rate": 3.2128641571503594e-06, + "loss": 0.0613, + "step": 1188 + }, + { + "epoch": 0.46764995083579153, + "grad_norm": 1.6123450994491577, + "learning_rate": 3.2095718733361803e-06, + "loss": 0.0419, + "step": 1189 + }, + { + "epoch": 0.46804326450344147, + "grad_norm": 1.5458816289901733, + "learning_rate": 3.2062782505739125e-06, + "loss": 0.0854, + "step": 1190 + }, + { + "epoch": 0.46843657817109147, + "grad_norm": 1.5308221578598022, + "learning_rate": 3.202983295078555e-06, + "loss": 0.063, + "step": 1191 + }, + { + "epoch": 0.4688298918387414, + "grad_norm": 1.166703224182129, + "learning_rate": 3.199687013067624e-06, + "loss": 0.0759, + "step": 1192 + }, + { + "epoch": 0.46922320550639135, + "grad_norm": 1.2040659189224243, + "learning_rate": 3.1963894107611395e-06, + "loss": 0.0648, + "step": 1193 + }, + { + "epoch": 0.4696165191740413, + "grad_norm": 0.8159343004226685, + "learning_rate": 3.1930904943816104e-06, + "loss": 0.0252, + "step": 1194 + }, + { + "epoch": 0.47000983284169123, + "grad_norm": 0.5714221596717834, + "learning_rate": 3.189790270154028e-06, + "loss": 0.0402, + "step": 1195 + }, + { + "epoch": 0.4704031465093412, + "grad_norm": 1.1028029918670654, + "learning_rate": 3.186488744305849e-06, + "loss": 0.0358, + "step": 1196 + }, + { + "epoch": 0.47079646017699117, + "grad_norm": 1.1706167459487915, + "learning_rate": 3.183185923066988e-06, + "loss": 0.0405, + "step": 1197 + }, + { + "epoch": 0.4711897738446411, + "grad_norm": 2.2323551177978516, + "learning_rate": 3.179881812669804e-06, + "loss": 0.0626, + "step": 1198 + }, + { + "epoch": 0.47158308751229105, + "grad_norm": 1.4933780431747437, + "learning_rate": 3.1765764193490863e-06, + "loss": 0.0421, + "step": 1199 + }, + { + "epoch": 0.471976401179941, + "grad_norm": 1.759582281112671, + "learning_rate": 3.173269749342047e-06, + "loss": 0.0386, + "step": 1200 + }, + { + "epoch": 0.47236971484759094, + "grad_norm": 0.9716536998748779, + "learning_rate": 3.1699618088883094e-06, + "loss": 0.0469, + "step": 1201 + }, + { + "epoch": 0.4727630285152409, + "grad_norm": 1.4588727951049805, + "learning_rate": 3.1666526042298883e-06, + "loss": 0.062, + "step": 1202 + }, + { + "epoch": 0.4731563421828909, + "grad_norm": 0.7807295918464661, + "learning_rate": 3.16334214161119e-06, + "loss": 0.0516, + "step": 1203 + }, + { + "epoch": 0.4735496558505408, + "grad_norm": 0.9360034465789795, + "learning_rate": 3.1600304272789904e-06, + "loss": 0.0413, + "step": 1204 + }, + { + "epoch": 0.47394296951819076, + "grad_norm": 3.0252861976623535, + "learning_rate": 3.1567174674824303e-06, + "loss": 0.0517, + "step": 1205 + }, + { + "epoch": 0.4743362831858407, + "grad_norm": 1.2127926349639893, + "learning_rate": 3.1534032684729978e-06, + "loss": 0.0634, + "step": 1206 + }, + { + "epoch": 0.47472959685349064, + "grad_norm": 1.008239984512329, + "learning_rate": 3.1500878365045217e-06, + "loss": 0.035, + "step": 1207 + }, + { + "epoch": 0.4751229105211406, + "grad_norm": 0.8630732893943787, + "learning_rate": 3.1467711778331573e-06, + "loss": 0.0432, + "step": 1208 + }, + { + "epoch": 0.4755162241887906, + "grad_norm": 0.5713632702827454, + "learning_rate": 3.143453298717373e-06, + "loss": 0.0293, + "step": 1209 + }, + { + "epoch": 0.4759095378564405, + "grad_norm": 1.3503292798995972, + "learning_rate": 3.14013420541794e-06, + "loss": 0.0488, + "step": 1210 + }, + { + "epoch": 0.47630285152409046, + "grad_norm": 0.6340729594230652, + "learning_rate": 3.1368139041979235e-06, + "loss": 0.0352, + "step": 1211 + }, + { + "epoch": 0.4766961651917404, + "grad_norm": 2.0643789768218994, + "learning_rate": 3.133492401322666e-06, + "loss": 0.0602, + "step": 1212 + }, + { + "epoch": 0.47708947885939035, + "grad_norm": 1.456824779510498, + "learning_rate": 3.1301697030597772e-06, + "loss": 0.0576, + "step": 1213 + }, + { + "epoch": 0.4774827925270403, + "grad_norm": 1.6788169145584106, + "learning_rate": 3.126845815679123e-06, + "loss": 0.0473, + "step": 1214 + }, + { + "epoch": 0.4778761061946903, + "grad_norm": 0.9894094467163086, + "learning_rate": 3.1235207454528137e-06, + "loss": 0.0486, + "step": 1215 + }, + { + "epoch": 0.47826941986234023, + "grad_norm": 0.6644244194030762, + "learning_rate": 3.12019449865519e-06, + "loss": 0.0348, + "step": 1216 + }, + { + "epoch": 0.47866273352999017, + "grad_norm": 1.8796205520629883, + "learning_rate": 3.116867081562815e-06, + "loss": 0.0711, + "step": 1217 + }, + { + "epoch": 0.4790560471976401, + "grad_norm": 0.71921706199646, + "learning_rate": 3.1135385004544584e-06, + "loss": 0.0439, + "step": 1218 + }, + { + "epoch": 0.47944936086529005, + "grad_norm": 1.4723786115646362, + "learning_rate": 3.1102087616110866e-06, + "loss": 0.0948, + "step": 1219 + }, + { + "epoch": 0.47984267453294, + "grad_norm": 1.0385109186172485, + "learning_rate": 3.1068778713158515e-06, + "loss": 0.0481, + "step": 1220 + }, + { + "epoch": 0.48023598820059, + "grad_norm": 1.8688119649887085, + "learning_rate": 3.1035458358540764e-06, + "loss": 0.0962, + "step": 1221 + }, + { + "epoch": 0.48062930186823993, + "grad_norm": 0.988058865070343, + "learning_rate": 3.100212661513247e-06, + "loss": 0.0862, + "step": 1222 + }, + { + "epoch": 0.4810226155358899, + "grad_norm": 0.7118948698043823, + "learning_rate": 3.096878354582998e-06, + "loss": 0.0492, + "step": 1223 + }, + { + "epoch": 0.4814159292035398, + "grad_norm": 1.1759183406829834, + "learning_rate": 3.093542921355099e-06, + "loss": 0.0278, + "step": 1224 + }, + { + "epoch": 0.48180924287118976, + "grad_norm": 0.8185058832168579, + "learning_rate": 3.0902063681234473e-06, + "loss": 0.0618, + "step": 1225 + }, + { + "epoch": 0.4822025565388397, + "grad_norm": 1.0773781538009644, + "learning_rate": 3.086868701184054e-06, + "loss": 0.0393, + "step": 1226 + }, + { + "epoch": 0.4825958702064897, + "grad_norm": 1.4859130382537842, + "learning_rate": 3.083529926835028e-06, + "loss": 0.0425, + "step": 1227 + }, + { + "epoch": 0.48298918387413964, + "grad_norm": 0.8524113297462463, + "learning_rate": 3.0801900513765732e-06, + "loss": 0.0667, + "step": 1228 + }, + { + "epoch": 0.4833824975417896, + "grad_norm": 1.2344658374786377, + "learning_rate": 3.076849081110967e-06, + "loss": 0.0469, + "step": 1229 + }, + { + "epoch": 0.4837758112094395, + "grad_norm": 1.4112597703933716, + "learning_rate": 3.073507022342554e-06, + "loss": 0.0439, + "step": 1230 + }, + { + "epoch": 0.48416912487708946, + "grad_norm": 1.0202746391296387, + "learning_rate": 3.070163881377734e-06, + "loss": 0.0953, + "step": 1231 + }, + { + "epoch": 0.4845624385447394, + "grad_norm": 1.2902711629867554, + "learning_rate": 3.066819664524947e-06, + "loss": 0.0378, + "step": 1232 + }, + { + "epoch": 0.4849557522123894, + "grad_norm": 0.8746582269668579, + "learning_rate": 3.063474378094665e-06, + "loss": 0.0404, + "step": 1233 + }, + { + "epoch": 0.48534906588003934, + "grad_norm": 1.8847814798355103, + "learning_rate": 3.060128028399376e-06, + "loss": 0.0779, + "step": 1234 + }, + { + "epoch": 0.4857423795476893, + "grad_norm": 1.2793282270431519, + "learning_rate": 3.056780621753577e-06, + "loss": 0.0433, + "step": 1235 + }, + { + "epoch": 0.4861356932153392, + "grad_norm": 1.4302126169204712, + "learning_rate": 3.0534321644737574e-06, + "loss": 0.0565, + "step": 1236 + }, + { + "epoch": 0.48652900688298917, + "grad_norm": 0.8506616353988647, + "learning_rate": 3.0500826628783903e-06, + "loss": 0.0448, + "step": 1237 + }, + { + "epoch": 0.4869223205506391, + "grad_norm": 1.7796978950500488, + "learning_rate": 3.046732123287918e-06, + "loss": 0.0449, + "step": 1238 + }, + { + "epoch": 0.4873156342182891, + "grad_norm": 1.4967756271362305, + "learning_rate": 3.043380552024744e-06, + "loss": 0.0409, + "step": 1239 + }, + { + "epoch": 0.48770894788593905, + "grad_norm": 1.2920217514038086, + "learning_rate": 3.0400279554132157e-06, + "loss": 0.0465, + "step": 1240 + }, + { + "epoch": 0.488102261553589, + "grad_norm": 1.9115070104599, + "learning_rate": 3.0366743397796166e-06, + "loss": 0.0591, + "step": 1241 + }, + { + "epoch": 0.48849557522123893, + "grad_norm": 0.988409161567688, + "learning_rate": 3.033319711452154e-06, + "loss": 0.042, + "step": 1242 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 2.1158268451690674, + "learning_rate": 3.0299640767609447e-06, + "loss": 0.0792, + "step": 1243 + }, + { + "epoch": 0.4892822025565388, + "grad_norm": 1.1518357992172241, + "learning_rate": 3.0266074420380043e-06, + "loss": 0.0554, + "step": 1244 + }, + { + "epoch": 0.4896755162241888, + "grad_norm": 1.3400568962097168, + "learning_rate": 3.023249813617238e-06, + "loss": 0.0545, + "step": 1245 + }, + { + "epoch": 0.49006882989183875, + "grad_norm": 0.8380603790283203, + "learning_rate": 3.0198911978344213e-06, + "loss": 0.0377, + "step": 1246 + }, + { + "epoch": 0.4904621435594887, + "grad_norm": 1.3251253366470337, + "learning_rate": 3.0165316010271982e-06, + "loss": 0.0419, + "step": 1247 + }, + { + "epoch": 0.49085545722713864, + "grad_norm": 0.7429760098457336, + "learning_rate": 3.0131710295350615e-06, + "loss": 0.0487, + "step": 1248 + }, + { + "epoch": 0.4912487708947886, + "grad_norm": 1.619492530822754, + "learning_rate": 3.0098094896993413e-06, + "loss": 0.0364, + "step": 1249 + }, + { + "epoch": 0.4916420845624385, + "grad_norm": 1.8555465936660767, + "learning_rate": 3.0064469878631986e-06, + "loss": 0.0327, + "step": 1250 + }, + { + "epoch": 0.4920353982300885, + "grad_norm": 2.1514008045196533, + "learning_rate": 3.003083530371606e-06, + "loss": 0.0961, + "step": 1251 + }, + { + "epoch": 0.49242871189773846, + "grad_norm": 1.1894843578338623, + "learning_rate": 2.9997191235713435e-06, + "loss": 0.0773, + "step": 1252 + }, + { + "epoch": 0.4928220255653884, + "grad_norm": 1.375878095626831, + "learning_rate": 2.9963537738109783e-06, + "loss": 0.0635, + "step": 1253 + }, + { + "epoch": 0.49321533923303834, + "grad_norm": 0.9740056395530701, + "learning_rate": 2.9929874874408595e-06, + "loss": 0.0581, + "step": 1254 + }, + { + "epoch": 0.4936086529006883, + "grad_norm": 1.21156907081604, + "learning_rate": 2.9896202708131027e-06, + "loss": 0.0524, + "step": 1255 + }, + { + "epoch": 0.4940019665683382, + "grad_norm": 4.271803855895996, + "learning_rate": 2.98625213028158e-06, + "loss": 0.0437, + "step": 1256 + }, + { + "epoch": 0.4943952802359882, + "grad_norm": 1.0697994232177734, + "learning_rate": 2.9828830722019046e-06, + "loss": 0.0693, + "step": 1257 + }, + { + "epoch": 0.49478859390363816, + "grad_norm": 1.0657457113265991, + "learning_rate": 2.979513102931424e-06, + "loss": 0.0788, + "step": 1258 + }, + { + "epoch": 0.4951819075712881, + "grad_norm": 1.6833268404006958, + "learning_rate": 2.9761422288292017e-06, + "loss": 0.0755, + "step": 1259 + }, + { + "epoch": 0.49557522123893805, + "grad_norm": 0.7139087915420532, + "learning_rate": 2.9727704562560124e-06, + "loss": 0.0416, + "step": 1260 + }, + { + "epoch": 0.495968534906588, + "grad_norm": 1.025672435760498, + "learning_rate": 2.9693977915743227e-06, + "loss": 0.057, + "step": 1261 + }, + { + "epoch": 0.49636184857423793, + "grad_norm": 1.6005637645721436, + "learning_rate": 2.9660242411482848e-06, + "loss": 0.0694, + "step": 1262 + }, + { + "epoch": 0.4967551622418879, + "grad_norm": 1.2426131963729858, + "learning_rate": 2.9626498113437215e-06, + "loss": 0.0443, + "step": 1263 + }, + { + "epoch": 0.49714847590953787, + "grad_norm": 1.0461783409118652, + "learning_rate": 2.9592745085281154e-06, + "loss": 0.0449, + "step": 1264 + }, + { + "epoch": 0.4975417895771878, + "grad_norm": 1.1440929174423218, + "learning_rate": 2.955898339070596e-06, + "loss": 0.0429, + "step": 1265 + }, + { + "epoch": 0.49793510324483775, + "grad_norm": 1.5936861038208008, + "learning_rate": 2.9525213093419275e-06, + "loss": 0.0517, + "step": 1266 + }, + { + "epoch": 0.4983284169124877, + "grad_norm": 0.9140682220458984, + "learning_rate": 2.9491434257144995e-06, + "loss": 0.0699, + "step": 1267 + }, + { + "epoch": 0.49872173058013763, + "grad_norm": 0.6656792759895325, + "learning_rate": 2.9457646945623107e-06, + "loss": 0.023, + "step": 1268 + }, + { + "epoch": 0.49911504424778763, + "grad_norm": 1.1062997579574585, + "learning_rate": 2.9423851222609607e-06, + "loss": 0.0801, + "step": 1269 + }, + { + "epoch": 0.4995083579154376, + "grad_norm": 0.9155628085136414, + "learning_rate": 2.939004715187635e-06, + "loss": 0.0704, + "step": 1270 + }, + { + "epoch": 0.4999016715830875, + "grad_norm": 0.8905113339424133, + "learning_rate": 2.935623479721095e-06, + "loss": 0.0442, + "step": 1271 + }, + { + "epoch": 0.5002949852507375, + "grad_norm": 0.8276392817497253, + "learning_rate": 2.932241422241665e-06, + "loss": 0.0535, + "step": 1272 + }, + { + "epoch": 0.5006882989183874, + "grad_norm": 0.5640360713005066, + "learning_rate": 2.9288585491312206e-06, + "loss": 0.0411, + "step": 1273 + }, + { + "epoch": 0.5010816125860373, + "grad_norm": 1.5979022979736328, + "learning_rate": 2.925474866773176e-06, + "loss": 0.0703, + "step": 1274 + }, + { + "epoch": 0.5014749262536873, + "grad_norm": 1.1477428674697876, + "learning_rate": 2.922090381552475e-06, + "loss": 0.0488, + "step": 1275 + }, + { + "epoch": 0.5018682399213372, + "grad_norm": 1.544410228729248, + "learning_rate": 2.9187050998555715e-06, + "loss": 0.0689, + "step": 1276 + }, + { + "epoch": 0.5022615535889872, + "grad_norm": 1.16623055934906, + "learning_rate": 2.915319028070427e-06, + "loss": 0.0681, + "step": 1277 + }, + { + "epoch": 0.5026548672566372, + "grad_norm": 0.2639702558517456, + "learning_rate": 2.9119321725864914e-06, + "loss": 0.0321, + "step": 1278 + }, + { + "epoch": 0.5030481809242872, + "grad_norm": 0.9400918483734131, + "learning_rate": 2.908544539794693e-06, + "loss": 0.0726, + "step": 1279 + }, + { + "epoch": 0.5034414945919371, + "grad_norm": 2.083108425140381, + "learning_rate": 2.9051561360874297e-06, + "loss": 0.0567, + "step": 1280 + }, + { + "epoch": 0.503834808259587, + "grad_norm": 0.9149637818336487, + "learning_rate": 2.901766967858551e-06, + "loss": 0.0626, + "step": 1281 + }, + { + "epoch": 0.504228121927237, + "grad_norm": 0.6115841269493103, + "learning_rate": 2.8983770415033507e-06, + "loss": 0.0386, + "step": 1282 + }, + { + "epoch": 0.5046214355948869, + "grad_norm": 1.530674695968628, + "learning_rate": 2.8949863634185533e-06, + "loss": 0.0743, + "step": 1283 + }, + { + "epoch": 0.5050147492625369, + "grad_norm": 0.9860877990722656, + "learning_rate": 2.8915949400022995e-06, + "loss": 0.0397, + "step": 1284 + }, + { + "epoch": 0.5054080629301868, + "grad_norm": 1.6740636825561523, + "learning_rate": 2.8882027776541406e-06, + "loss": 0.0997, + "step": 1285 + }, + { + "epoch": 0.5058013765978367, + "grad_norm": 1.1494807004928589, + "learning_rate": 2.8848098827750186e-06, + "loss": 0.0639, + "step": 1286 + }, + { + "epoch": 0.5061946902654867, + "grad_norm": 1.5039880275726318, + "learning_rate": 2.8814162617672586e-06, + "loss": 0.0615, + "step": 1287 + }, + { + "epoch": 0.5065880039331366, + "grad_norm": 1.2192140817642212, + "learning_rate": 2.8780219210345573e-06, + "loss": 0.0543, + "step": 1288 + }, + { + "epoch": 0.5069813176007866, + "grad_norm": 1.1865425109863281, + "learning_rate": 2.8746268669819676e-06, + "loss": 0.069, + "step": 1289 + }, + { + "epoch": 0.5073746312684366, + "grad_norm": 1.6422653198242188, + "learning_rate": 2.8712311060158904e-06, + "loss": 0.0407, + "step": 1290 + }, + { + "epoch": 0.5077679449360866, + "grad_norm": 1.0872414112091064, + "learning_rate": 2.8678346445440588e-06, + "loss": 0.0485, + "step": 1291 + }, + { + "epoch": 0.5081612586037365, + "grad_norm": 1.3887152671813965, + "learning_rate": 2.8644374889755284e-06, + "loss": 0.0594, + "step": 1292 + }, + { + "epoch": 0.5085545722713865, + "grad_norm": 0.9311152100563049, + "learning_rate": 2.861039645720664e-06, + "loss": 0.0558, + "step": 1293 + }, + { + "epoch": 0.5089478859390364, + "grad_norm": 0.5611655116081238, + "learning_rate": 2.85764112119113e-06, + "loss": 0.0326, + "step": 1294 + }, + { + "epoch": 0.5093411996066863, + "grad_norm": 0.6655589938163757, + "learning_rate": 2.854241921799874e-06, + "loss": 0.0608, + "step": 1295 + }, + { + "epoch": 0.5097345132743363, + "grad_norm": 0.9743668437004089, + "learning_rate": 2.850842053961119e-06, + "loss": 0.0674, + "step": 1296 + }, + { + "epoch": 0.5101278269419862, + "grad_norm": 0.3803253471851349, + "learning_rate": 2.847441524090347e-06, + "loss": 0.0318, + "step": 1297 + }, + { + "epoch": 0.5105211406096362, + "grad_norm": 0.9651347398757935, + "learning_rate": 2.844040338604291e-06, + "loss": 0.0467, + "step": 1298 + }, + { + "epoch": 0.5109144542772861, + "grad_norm": 1.3503124713897705, + "learning_rate": 2.8406385039209217e-06, + "loss": 0.0353, + "step": 1299 + }, + { + "epoch": 0.511307767944936, + "grad_norm": 1.3085218667984009, + "learning_rate": 2.837236026459432e-06, + "loss": 0.0677, + "step": 1300 + }, + { + "epoch": 0.511701081612586, + "grad_norm": 0.759332537651062, + "learning_rate": 2.833832912640232e-06, + "loss": 0.0399, + "step": 1301 + }, + { + "epoch": 0.512094395280236, + "grad_norm": 1.254012107849121, + "learning_rate": 2.8304291688849283e-06, + "loss": 0.0469, + "step": 1302 + }, + { + "epoch": 0.512487708947886, + "grad_norm": 1.6213202476501465, + "learning_rate": 2.827024801616319e-06, + "loss": 0.077, + "step": 1303 + }, + { + "epoch": 0.5128810226155359, + "grad_norm": 0.751507580280304, + "learning_rate": 2.8236198172583765e-06, + "loss": 0.0499, + "step": 1304 + }, + { + "epoch": 0.5132743362831859, + "grad_norm": 0.6438438296318054, + "learning_rate": 2.820214222236241e-06, + "loss": 0.0638, + "step": 1305 + }, + { + "epoch": 0.5136676499508358, + "grad_norm": 0.8826209902763367, + "learning_rate": 2.816808022976201e-06, + "loss": 0.0422, + "step": 1306 + }, + { + "epoch": 0.5140609636184857, + "grad_norm": 0.4389915466308594, + "learning_rate": 2.813401225905688e-06, + "loss": 0.0192, + "step": 1307 + }, + { + "epoch": 0.5144542772861357, + "grad_norm": 0.7698509693145752, + "learning_rate": 2.8099938374532615e-06, + "loss": 0.043, + "step": 1308 + }, + { + "epoch": 0.5148475909537856, + "grad_norm": 1.0304797887802124, + "learning_rate": 2.806585864048594e-06, + "loss": 0.0648, + "step": 1309 + }, + { + "epoch": 0.5152409046214356, + "grad_norm": 0.9679722189903259, + "learning_rate": 2.8031773121224665e-06, + "loss": 0.0528, + "step": 1310 + }, + { + "epoch": 0.5156342182890855, + "grad_norm": 0.8979973793029785, + "learning_rate": 2.799768188106747e-06, + "loss": 0.0493, + "step": 1311 + }, + { + "epoch": 0.5160275319567355, + "grad_norm": 1.266461730003357, + "learning_rate": 2.7963584984343856e-06, + "loss": 0.0489, + "step": 1312 + }, + { + "epoch": 0.5164208456243854, + "grad_norm": 1.1776021718978882, + "learning_rate": 2.7929482495393995e-06, + "loss": 0.0453, + "step": 1313 + }, + { + "epoch": 0.5168141592920354, + "grad_norm": 0.89280104637146, + "learning_rate": 2.7895374478568608e-06, + "loss": 0.0506, + "step": 1314 + }, + { + "epoch": 0.5172074729596854, + "grad_norm": 1.046673059463501, + "learning_rate": 2.786126099822885e-06, + "loss": 0.0812, + "step": 1315 + }, + { + "epoch": 0.5176007866273353, + "grad_norm": 1.451196312904358, + "learning_rate": 2.7827142118746187e-06, + "loss": 0.0388, + "step": 1316 + }, + { + "epoch": 0.5179941002949853, + "grad_norm": 0.9998504519462585, + "learning_rate": 2.779301790450226e-06, + "loss": 0.0505, + "step": 1317 + }, + { + "epoch": 0.5183874139626352, + "grad_norm": 1.0535742044448853, + "learning_rate": 2.7758888419888797e-06, + "loss": 0.0377, + "step": 1318 + }, + { + "epoch": 0.5187807276302852, + "grad_norm": 0.9973492622375488, + "learning_rate": 2.7724753729307454e-06, + "loss": 0.0512, + "step": 1319 + }, + { + "epoch": 0.5191740412979351, + "grad_norm": 1.3732929229736328, + "learning_rate": 2.769061389716971e-06, + "loss": 0.0992, + "step": 1320 + }, + { + "epoch": 0.519567354965585, + "grad_norm": 1.1079411506652832, + "learning_rate": 2.765646898789677e-06, + "loss": 0.0438, + "step": 1321 + }, + { + "epoch": 0.519960668633235, + "grad_norm": 1.0692771673202515, + "learning_rate": 2.762231906591939e-06, + "loss": 0.0482, + "step": 1322 + }, + { + "epoch": 0.5203539823008849, + "grad_norm": 0.773914098739624, + "learning_rate": 2.75881641956778e-06, + "loss": 0.0307, + "step": 1323 + }, + { + "epoch": 0.5207472959685349, + "grad_norm": 0.8193982243537903, + "learning_rate": 2.7554004441621562e-06, + "loss": 0.0357, + "step": 1324 + }, + { + "epoch": 0.5211406096361848, + "grad_norm": 1.0655934810638428, + "learning_rate": 2.7519839868209462e-06, + "loss": 0.0564, + "step": 1325 + }, + { + "epoch": 0.5215339233038349, + "grad_norm": 0.668292760848999, + "learning_rate": 2.748567053990937e-06, + "loss": 0.0394, + "step": 1326 + }, + { + "epoch": 0.5219272369714848, + "grad_norm": 1.5048760175704956, + "learning_rate": 2.7451496521198144e-06, + "loss": 0.0756, + "step": 1327 + }, + { + "epoch": 0.5223205506391347, + "grad_norm": 1.869588017463684, + "learning_rate": 2.741731787656146e-06, + "loss": 0.08, + "step": 1328 + }, + { + "epoch": 0.5227138643067847, + "grad_norm": 1.6091140508651733, + "learning_rate": 2.7383134670493765e-06, + "loss": 0.0618, + "step": 1329 + }, + { + "epoch": 0.5231071779744346, + "grad_norm": 0.5614988207817078, + "learning_rate": 2.734894696749808e-06, + "loss": 0.022, + "step": 1330 + }, + { + "epoch": 0.5235004916420846, + "grad_norm": 1.5846737623214722, + "learning_rate": 2.7314754832085926e-06, + "loss": 0.0617, + "step": 1331 + }, + { + "epoch": 0.5238938053097345, + "grad_norm": 1.0142868757247925, + "learning_rate": 2.728055832877719e-06, + "loss": 0.1201, + "step": 1332 + }, + { + "epoch": 0.5242871189773844, + "grad_norm": 0.9764862060546875, + "learning_rate": 2.7246357522099996e-06, + "loss": 0.0576, + "step": 1333 + }, + { + "epoch": 0.5246804326450344, + "grad_norm": 0.7208642363548279, + "learning_rate": 2.721215247659059e-06, + "loss": 0.0165, + "step": 1334 + }, + { + "epoch": 0.5250737463126843, + "grad_norm": 1.2766616344451904, + "learning_rate": 2.7177943256793214e-06, + "loss": 0.0589, + "step": 1335 + }, + { + "epoch": 0.5254670599803343, + "grad_norm": 1.7238527536392212, + "learning_rate": 2.7143729927259992e-06, + "loss": 0.0415, + "step": 1336 + }, + { + "epoch": 0.5258603736479842, + "grad_norm": 0.9424237608909607, + "learning_rate": 2.7109512552550804e-06, + "loss": 0.088, + "step": 1337 + }, + { + "epoch": 0.5262536873156343, + "grad_norm": 0.8586751818656921, + "learning_rate": 2.707529119723315e-06, + "loss": 0.0621, + "step": 1338 + }, + { + "epoch": 0.5266470009832842, + "grad_norm": 0.6910445690155029, + "learning_rate": 2.7041065925882054e-06, + "loss": 0.0473, + "step": 1339 + }, + { + "epoch": 0.5270403146509341, + "grad_norm": 0.6774911880493164, + "learning_rate": 2.7006836803079934e-06, + "loss": 0.0401, + "step": 1340 + }, + { + "epoch": 0.5274336283185841, + "grad_norm": 1.1810059547424316, + "learning_rate": 2.697260389341645e-06, + "loss": 0.0464, + "step": 1341 + }, + { + "epoch": 0.527826941986234, + "grad_norm": 0.6813443303108215, + "learning_rate": 2.693836726148844e-06, + "loss": 0.0502, + "step": 1342 + }, + { + "epoch": 0.528220255653884, + "grad_norm": 1.6458402872085571, + "learning_rate": 2.6904126971899754e-06, + "loss": 0.0644, + "step": 1343 + }, + { + "epoch": 0.5286135693215339, + "grad_norm": 1.4540367126464844, + "learning_rate": 2.686988308926112e-06, + "loss": 0.0564, + "step": 1344 + }, + { + "epoch": 0.5290068829891839, + "grad_norm": 0.6865090131759644, + "learning_rate": 2.68356356781901e-06, + "loss": 0.0448, + "step": 1345 + }, + { + "epoch": 0.5294001966568338, + "grad_norm": 1.91966712474823, + "learning_rate": 2.6801384803310855e-06, + "loss": 0.0431, + "step": 1346 + }, + { + "epoch": 0.5297935103244837, + "grad_norm": 0.6628435254096985, + "learning_rate": 2.676713052925411e-06, + "loss": 0.0513, + "step": 1347 + }, + { + "epoch": 0.5301868239921337, + "grad_norm": 1.0600309371948242, + "learning_rate": 2.6732872920657018e-06, + "loss": 0.0321, + "step": 1348 + }, + { + "epoch": 0.5305801376597836, + "grad_norm": 0.5295042991638184, + "learning_rate": 2.6698612042162995e-06, + "loss": 0.0299, + "step": 1349 + }, + { + "epoch": 0.5309734513274337, + "grad_norm": 1.229316234588623, + "learning_rate": 2.6664347958421647e-06, + "loss": 0.0475, + "step": 1350 + }, + { + "epoch": 0.5313667649950836, + "grad_norm": 0.8785441517829895, + "learning_rate": 2.6630080734088625e-06, + "loss": 0.0424, + "step": 1351 + }, + { + "epoch": 0.5317600786627336, + "grad_norm": 1.3285952806472778, + "learning_rate": 2.6595810433825496e-06, + "loss": 0.0359, + "step": 1352 + }, + { + "epoch": 0.5321533923303835, + "grad_norm": 0.8368435502052307, + "learning_rate": 2.6561537122299647e-06, + "loss": 0.0503, + "step": 1353 + }, + { + "epoch": 0.5325467059980334, + "grad_norm": 0.790544331073761, + "learning_rate": 2.6527260864184135e-06, + "loss": 0.0321, + "step": 1354 + }, + { + "epoch": 0.5329400196656834, + "grad_norm": 1.5722286701202393, + "learning_rate": 2.6492981724157576e-06, + "loss": 0.0765, + "step": 1355 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.0913268327713013, + "learning_rate": 2.6458699766904033e-06, + "loss": 0.0526, + "step": 1356 + }, + { + "epoch": 0.5337266470009833, + "grad_norm": 1.2754257917404175, + "learning_rate": 2.6424415057112883e-06, + "loss": 0.0585, + "step": 1357 + }, + { + "epoch": 0.5341199606686332, + "grad_norm": 2.0785610675811768, + "learning_rate": 2.6390127659478698e-06, + "loss": 0.0995, + "step": 1358 + }, + { + "epoch": 0.5345132743362832, + "grad_norm": 1.3484556674957275, + "learning_rate": 2.6355837638701115e-06, + "loss": 0.0462, + "step": 1359 + }, + { + "epoch": 0.5349065880039331, + "grad_norm": 0.7563539147377014, + "learning_rate": 2.632154505948472e-06, + "loss": 0.0614, + "step": 1360 + }, + { + "epoch": 0.535299901671583, + "grad_norm": 0.7201266288757324, + "learning_rate": 2.6287249986538944e-06, + "loss": 0.0449, + "step": 1361 + }, + { + "epoch": 0.5356932153392331, + "grad_norm": 1.439516544342041, + "learning_rate": 2.62529524845779e-06, + "loss": 0.0694, + "step": 1362 + }, + { + "epoch": 0.536086529006883, + "grad_norm": 0.6716679334640503, + "learning_rate": 2.6218652618320306e-06, + "loss": 0.0302, + "step": 1363 + }, + { + "epoch": 0.536479842674533, + "grad_norm": 1.9574276208877563, + "learning_rate": 2.6184350452489317e-06, + "loss": 0.0708, + "step": 1364 + }, + { + "epoch": 0.5368731563421829, + "grad_norm": 1.3900701999664307, + "learning_rate": 2.615004605181246e-06, + "loss": 0.0833, + "step": 1365 + }, + { + "epoch": 0.5372664700098329, + "grad_norm": 0.9019057154655457, + "learning_rate": 2.611573948102144e-06, + "loss": 0.0625, + "step": 1366 + }, + { + "epoch": 0.5376597836774828, + "grad_norm": 2.0217947959899902, + "learning_rate": 2.6081430804852093e-06, + "loss": 0.0837, + "step": 1367 + }, + { + "epoch": 0.5380530973451327, + "grad_norm": 1.5341334342956543, + "learning_rate": 2.604712008804421e-06, + "loss": 0.0734, + "step": 1368 + }, + { + "epoch": 0.5384464110127827, + "grad_norm": 1.3491941690444946, + "learning_rate": 2.601280739534143e-06, + "loss": 0.0631, + "step": 1369 + }, + { + "epoch": 0.5388397246804326, + "grad_norm": 1.264906406402588, + "learning_rate": 2.5978492791491126e-06, + "loss": 0.0361, + "step": 1370 + }, + { + "epoch": 0.5392330383480826, + "grad_norm": 1.567254900932312, + "learning_rate": 2.594417634124428e-06, + "loss": 0.0802, + "step": 1371 + }, + { + "epoch": 0.5396263520157325, + "grad_norm": 0.912144124507904, + "learning_rate": 2.590985810935535e-06, + "loss": 0.0321, + "step": 1372 + }, + { + "epoch": 0.5400196656833824, + "grad_norm": 0.7098456025123596, + "learning_rate": 2.5875538160582176e-06, + "loss": 0.0625, + "step": 1373 + }, + { + "epoch": 0.5404129793510325, + "grad_norm": 1.4193458557128906, + "learning_rate": 2.58412165596858e-06, + "loss": 0.0518, + "step": 1374 + }, + { + "epoch": 0.5408062930186824, + "grad_norm": 1.3003660440444946, + "learning_rate": 2.5806893371430413e-06, + "loss": 0.0625, + "step": 1375 + }, + { + "epoch": 0.5411996066863324, + "grad_norm": 1.4275062084197998, + "learning_rate": 2.57725686605832e-06, + "loss": 0.0628, + "step": 1376 + }, + { + "epoch": 0.5415929203539823, + "grad_norm": 1.3604398965835571, + "learning_rate": 2.5738242491914206e-06, + "loss": 0.0733, + "step": 1377 + }, + { + "epoch": 0.5419862340216323, + "grad_norm": 2.859689235687256, + "learning_rate": 2.5703914930196227e-06, + "loss": 0.0547, + "step": 1378 + }, + { + "epoch": 0.5423795476892822, + "grad_norm": 0.770262598991394, + "learning_rate": 2.5669586040204697e-06, + "loss": 0.0644, + "step": 1379 + }, + { + "epoch": 0.5427728613569321, + "grad_norm": 0.7974931001663208, + "learning_rate": 2.5635255886717553e-06, + "loss": 0.0687, + "step": 1380 + }, + { + "epoch": 0.5431661750245821, + "grad_norm": 0.9779230356216431, + "learning_rate": 2.560092453451512e-06, + "loss": 0.0586, + "step": 1381 + }, + { + "epoch": 0.543559488692232, + "grad_norm": 2.3653101921081543, + "learning_rate": 2.5566592048379975e-06, + "loss": 0.0697, + "step": 1382 + }, + { + "epoch": 0.543952802359882, + "grad_norm": 1.6566016674041748, + "learning_rate": 2.553225849309684e-06, + "loss": 0.104, + "step": 1383 + }, + { + "epoch": 0.5443461160275319, + "grad_norm": 1.516684889793396, + "learning_rate": 2.5497923933452464e-06, + "loss": 0.0423, + "step": 1384 + }, + { + "epoch": 0.5447394296951819, + "grad_norm": 1.3681788444519043, + "learning_rate": 2.5463588434235463e-06, + "loss": 0.052, + "step": 1385 + }, + { + "epoch": 0.5451327433628319, + "grad_norm": 0.49628522992134094, + "learning_rate": 2.542925206023626e-06, + "loss": 0.0255, + "step": 1386 + }, + { + "epoch": 0.5455260570304818, + "grad_norm": 0.9334824681282043, + "learning_rate": 2.5394914876246916e-06, + "loss": 0.0517, + "step": 1387 + }, + { + "epoch": 0.5459193706981318, + "grad_norm": 1.3869428634643555, + "learning_rate": 2.5360576947061004e-06, + "loss": 0.051, + "step": 1388 + }, + { + "epoch": 0.5463126843657817, + "grad_norm": 0.7261596918106079, + "learning_rate": 2.5326238337473537e-06, + "loss": 0.0349, + "step": 1389 + }, + { + "epoch": 0.5467059980334317, + "grad_norm": 1.0270626544952393, + "learning_rate": 2.5291899112280765e-06, + "loss": 0.0574, + "step": 1390 + }, + { + "epoch": 0.5470993117010816, + "grad_norm": 0.9097653031349182, + "learning_rate": 2.5257559336280145e-06, + "loss": 0.0434, + "step": 1391 + }, + { + "epoch": 0.5474926253687316, + "grad_norm": 1.5684995651245117, + "learning_rate": 2.522321907427016e-06, + "loss": 0.0394, + "step": 1392 + }, + { + "epoch": 0.5478859390363815, + "grad_norm": 0.5134732723236084, + "learning_rate": 2.5188878391050187e-06, + "loss": 0.0642, + "step": 1393 + }, + { + "epoch": 0.5482792527040314, + "grad_norm": 1.6495331525802612, + "learning_rate": 2.515453735142043e-06, + "loss": 0.0335, + "step": 1394 + }, + { + "epoch": 0.5486725663716814, + "grad_norm": 0.949030876159668, + "learning_rate": 2.5120196020181752e-06, + "loss": 0.069, + "step": 1395 + }, + { + "epoch": 0.5490658800393313, + "grad_norm": 0.5853769183158875, + "learning_rate": 2.5085854462135556e-06, + "loss": 0.035, + "step": 1396 + }, + { + "epoch": 0.5494591937069813, + "grad_norm": 1.0677484273910522, + "learning_rate": 2.505151274208369e-06, + "loss": 0.0511, + "step": 1397 + }, + { + "epoch": 0.5498525073746313, + "grad_norm": 1.5644643306732178, + "learning_rate": 2.50171709248283e-06, + "loss": 0.0814, + "step": 1398 + }, + { + "epoch": 0.5502458210422813, + "grad_norm": 0.736179769039154, + "learning_rate": 2.4982829075171714e-06, + "loss": 0.0452, + "step": 1399 + }, + { + "epoch": 0.5506391347099312, + "grad_norm": 0.8911694288253784, + "learning_rate": 2.494848725791632e-06, + "loss": 0.0564, + "step": 1400 + }, + { + "epoch": 0.5510324483775811, + "grad_norm": 1.9409581422805786, + "learning_rate": 2.4914145537864453e-06, + "loss": 0.0724, + "step": 1401 + }, + { + "epoch": 0.5514257620452311, + "grad_norm": 1.1989744901657104, + "learning_rate": 2.4879803979818256e-06, + "loss": 0.0496, + "step": 1402 + }, + { + "epoch": 0.551819075712881, + "grad_norm": 1.8545705080032349, + "learning_rate": 2.4845462648579573e-06, + "loss": 0.0527, + "step": 1403 + }, + { + "epoch": 0.552212389380531, + "grad_norm": 1.8136131763458252, + "learning_rate": 2.481112160894982e-06, + "loss": 0.0601, + "step": 1404 + }, + { + "epoch": 0.5526057030481809, + "grad_norm": 1.070971131324768, + "learning_rate": 2.4776780925729853e-06, + "loss": 0.0612, + "step": 1405 + }, + { + "epoch": 0.5529990167158308, + "grad_norm": 1.127616047859192, + "learning_rate": 2.474244066371986e-06, + "loss": 0.0503, + "step": 1406 + }, + { + "epoch": 0.5533923303834808, + "grad_norm": 1.5506644248962402, + "learning_rate": 2.4708100887719243e-06, + "loss": 0.0638, + "step": 1407 + }, + { + "epoch": 0.5537856440511307, + "grad_norm": 1.5224863290786743, + "learning_rate": 2.4673761662526475e-06, + "loss": 0.0521, + "step": 1408 + }, + { + "epoch": 0.5541789577187807, + "grad_norm": 1.2066714763641357, + "learning_rate": 2.4639423052938995e-06, + "loss": 0.0533, + "step": 1409 + }, + { + "epoch": 0.5545722713864307, + "grad_norm": 1.389074683189392, + "learning_rate": 2.4605085123753097e-06, + "loss": 0.0809, + "step": 1410 + }, + { + "epoch": 0.5549655850540807, + "grad_norm": 0.6731852293014526, + "learning_rate": 2.4570747939763745e-06, + "loss": 0.0249, + "step": 1411 + }, + { + "epoch": 0.5553588987217306, + "grad_norm": 1.2953534126281738, + "learning_rate": 2.453641156576454e-06, + "loss": 0.0473, + "step": 1412 + }, + { + "epoch": 0.5557522123893806, + "grad_norm": 0.9251944422721863, + "learning_rate": 2.4502076066547545e-06, + "loss": 0.0765, + "step": 1413 + }, + { + "epoch": 0.5561455260570305, + "grad_norm": 1.831679344177246, + "learning_rate": 2.4467741506903162e-06, + "loss": 0.0798, + "step": 1414 + }, + { + "epoch": 0.5565388397246804, + "grad_norm": 1.2218101024627686, + "learning_rate": 2.443340795162003e-06, + "loss": 0.0393, + "step": 1415 + }, + { + "epoch": 0.5569321533923304, + "grad_norm": 1.164400577545166, + "learning_rate": 2.4399075465484883e-06, + "loss": 0.0681, + "step": 1416 + }, + { + "epoch": 0.5573254670599803, + "grad_norm": 1.0514402389526367, + "learning_rate": 2.4364744113282455e-06, + "loss": 0.0593, + "step": 1417 + }, + { + "epoch": 0.5577187807276303, + "grad_norm": 1.9647271633148193, + "learning_rate": 2.433041395979531e-06, + "loss": 0.0785, + "step": 1418 + }, + { + "epoch": 0.5581120943952802, + "grad_norm": 0.7550022006034851, + "learning_rate": 2.429608506980378e-06, + "loss": 0.0443, + "step": 1419 + }, + { + "epoch": 0.5585054080629301, + "grad_norm": 1.2886439561843872, + "learning_rate": 2.4261757508085803e-06, + "loss": 0.0625, + "step": 1420 + }, + { + "epoch": 0.5588987217305801, + "grad_norm": 0.6531363129615784, + "learning_rate": 2.422743133941681e-06, + "loss": 0.0437, + "step": 1421 + }, + { + "epoch": 0.5592920353982301, + "grad_norm": 1.3166404962539673, + "learning_rate": 2.419310662856959e-06, + "loss": 0.0363, + "step": 1422 + }, + { + "epoch": 0.5596853490658801, + "grad_norm": 0.9738766551017761, + "learning_rate": 2.415878344031421e-06, + "loss": 0.0499, + "step": 1423 + }, + { + "epoch": 0.56007866273353, + "grad_norm": 1.1199309825897217, + "learning_rate": 2.4124461839417832e-06, + "loss": 0.0638, + "step": 1424 + }, + { + "epoch": 0.56047197640118, + "grad_norm": 0.7884669303894043, + "learning_rate": 2.4090141890644654e-06, + "loss": 0.0219, + "step": 1425 + }, + { + "epoch": 0.5608652900688299, + "grad_norm": 1.508720874786377, + "learning_rate": 2.405582365875573e-06, + "loss": 0.0722, + "step": 1426 + }, + { + "epoch": 0.5612586037364798, + "grad_norm": 0.9353559017181396, + "learning_rate": 2.4021507208508882e-06, + "loss": 0.0654, + "step": 1427 + }, + { + "epoch": 0.5616519174041298, + "grad_norm": 1.9918673038482666, + "learning_rate": 2.398719260465858e-06, + "loss": 0.0741, + "step": 1428 + }, + { + "epoch": 0.5620452310717797, + "grad_norm": 0.9243260622024536, + "learning_rate": 2.3952879911955794e-06, + "loss": 0.0369, + "step": 1429 + }, + { + "epoch": 0.5624385447394297, + "grad_norm": 1.3456679582595825, + "learning_rate": 2.391856919514791e-06, + "loss": 0.0811, + "step": 1430 + }, + { + "epoch": 0.5628318584070796, + "grad_norm": 1.5919969081878662, + "learning_rate": 2.3884260518978562e-06, + "loss": 0.0402, + "step": 1431 + }, + { + "epoch": 0.5632251720747296, + "grad_norm": 0.5894349813461304, + "learning_rate": 2.3849953948187552e-06, + "loss": 0.0396, + "step": 1432 + }, + { + "epoch": 0.5636184857423795, + "grad_norm": 1.708106517791748, + "learning_rate": 2.3815649547510687e-06, + "loss": 0.0575, + "step": 1433 + }, + { + "epoch": 0.5640117994100295, + "grad_norm": 1.6241428852081299, + "learning_rate": 2.37813473816797e-06, + "loss": 0.047, + "step": 1434 + }, + { + "epoch": 0.5644051130776795, + "grad_norm": 1.1760050058364868, + "learning_rate": 2.3747047515422102e-06, + "loss": 0.049, + "step": 1435 + }, + { + "epoch": 0.5647984267453294, + "grad_norm": 0.6579201221466064, + "learning_rate": 2.371275001346106e-06, + "loss": 0.0569, + "step": 1436 + }, + { + "epoch": 0.5651917404129794, + "grad_norm": 0.5577812194824219, + "learning_rate": 2.367845494051529e-06, + "loss": 0.0338, + "step": 1437 + }, + { + "epoch": 0.5655850540806293, + "grad_norm": 0.9575706124305725, + "learning_rate": 2.3644162361298897e-06, + "loss": 0.0622, + "step": 1438 + }, + { + "epoch": 0.5659783677482793, + "grad_norm": 0.6951814889907837, + "learning_rate": 2.360987234052131e-06, + "loss": 0.0329, + "step": 1439 + }, + { + "epoch": 0.5663716814159292, + "grad_norm": 1.079609990119934, + "learning_rate": 2.357558494288712e-06, + "loss": 0.0672, + "step": 1440 + }, + { + "epoch": 0.5667649950835791, + "grad_norm": 1.0509586334228516, + "learning_rate": 2.354130023309597e-06, + "loss": 0.0755, + "step": 1441 + }, + { + "epoch": 0.5671583087512291, + "grad_norm": 0.9782833456993103, + "learning_rate": 2.350701827584243e-06, + "loss": 0.0319, + "step": 1442 + }, + { + "epoch": 0.567551622418879, + "grad_norm": 1.019370675086975, + "learning_rate": 2.3472739135815877e-06, + "loss": 0.0696, + "step": 1443 + }, + { + "epoch": 0.567944936086529, + "grad_norm": 1.419137716293335, + "learning_rate": 2.343846287770036e-06, + "loss": 0.0797, + "step": 1444 + }, + { + "epoch": 0.5683382497541789, + "grad_norm": 1.8223907947540283, + "learning_rate": 2.340418956617451e-06, + "loss": 0.0462, + "step": 1445 + }, + { + "epoch": 0.568731563421829, + "grad_norm": 1.1286693811416626, + "learning_rate": 2.336991926591138e-06, + "loss": 0.0735, + "step": 1446 + }, + { + "epoch": 0.5691248770894789, + "grad_norm": 1.7998546361923218, + "learning_rate": 2.3335652041578352e-06, + "loss": 0.0964, + "step": 1447 + }, + { + "epoch": 0.5695181907571288, + "grad_norm": 1.0016109943389893, + "learning_rate": 2.3301387957837017e-06, + "loss": 0.0631, + "step": 1448 + }, + { + "epoch": 0.5699115044247788, + "grad_norm": 1.876328706741333, + "learning_rate": 2.326712707934299e-06, + "loss": 0.0683, + "step": 1449 + }, + { + "epoch": 0.5703048180924287, + "grad_norm": 1.8099371194839478, + "learning_rate": 2.3232869470745893e-06, + "loss": 0.058, + "step": 1450 + }, + { + "epoch": 0.5706981317600787, + "grad_norm": 0.8637019395828247, + "learning_rate": 2.3198615196689153e-06, + "loss": 0.0655, + "step": 1451 + }, + { + "epoch": 0.5710914454277286, + "grad_norm": 2.1426312923431396, + "learning_rate": 2.3164364321809906e-06, + "loss": 0.0572, + "step": 1452 + }, + { + "epoch": 0.5714847590953785, + "grad_norm": 1.6157870292663574, + "learning_rate": 2.3130116910738874e-06, + "loss": 0.0321, + "step": 1453 + }, + { + "epoch": 0.5718780727630285, + "grad_norm": 0.8953425288200378, + "learning_rate": 2.309587302810026e-06, + "loss": 0.0292, + "step": 1454 + }, + { + "epoch": 0.5722713864306784, + "grad_norm": 0.8132373094558716, + "learning_rate": 2.306163273851157e-06, + "loss": 0.0517, + "step": 1455 + }, + { + "epoch": 0.5726647000983284, + "grad_norm": 0.8843181729316711, + "learning_rate": 2.302739610658356e-06, + "loss": 0.0389, + "step": 1456 + }, + { + "epoch": 0.5730580137659783, + "grad_norm": 1.1060006618499756, + "learning_rate": 2.2993163196920075e-06, + "loss": 0.08, + "step": 1457 + }, + { + "epoch": 0.5734513274336284, + "grad_norm": 1.1257623434066772, + "learning_rate": 2.295893407411795e-06, + "loss": 0.053, + "step": 1458 + }, + { + "epoch": 0.5738446411012783, + "grad_norm": 1.0160799026489258, + "learning_rate": 2.2924708802766857e-06, + "loss": 0.0439, + "step": 1459 + }, + { + "epoch": 0.5742379547689282, + "grad_norm": 1.231930136680603, + "learning_rate": 2.2890487447449204e-06, + "loss": 0.0569, + "step": 1460 + }, + { + "epoch": 0.5746312684365782, + "grad_norm": 0.8130099177360535, + "learning_rate": 2.285627007274001e-06, + "loss": 0.0361, + "step": 1461 + }, + { + "epoch": 0.5750245821042281, + "grad_norm": 0.6949229836463928, + "learning_rate": 2.282205674320679e-06, + "loss": 0.0598, + "step": 1462 + }, + { + "epoch": 0.5754178957718781, + "grad_norm": 1.0386853218078613, + "learning_rate": 2.2787847523409416e-06, + "loss": 0.0601, + "step": 1463 + }, + { + "epoch": 0.575811209439528, + "grad_norm": 0.48775455355644226, + "learning_rate": 2.2753642477900012e-06, + "loss": 0.0483, + "step": 1464 + }, + { + "epoch": 0.576204523107178, + "grad_norm": 1.220493197441101, + "learning_rate": 2.2719441671222815e-06, + "loss": 0.0398, + "step": 1465 + }, + { + "epoch": 0.5765978367748279, + "grad_norm": 0.747078537940979, + "learning_rate": 2.268524516791408e-06, + "loss": 0.0313, + "step": 1466 + }, + { + "epoch": 0.5769911504424778, + "grad_norm": 0.7773571014404297, + "learning_rate": 2.2651053032501928e-06, + "loss": 0.0395, + "step": 1467 + }, + { + "epoch": 0.5773844641101278, + "grad_norm": 0.4083022177219391, + "learning_rate": 2.261686532950624e-06, + "loss": 0.0255, + "step": 1468 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 1.0136034488677979, + "learning_rate": 2.2582682123438547e-06, + "loss": 0.0499, + "step": 1469 + }, + { + "epoch": 0.5781710914454278, + "grad_norm": 1.2290290594100952, + "learning_rate": 2.254850347880187e-06, + "loss": 0.0649, + "step": 1470 + }, + { + "epoch": 0.5785644051130777, + "grad_norm": 1.4913883209228516, + "learning_rate": 2.2514329460090633e-06, + "loss": 0.0595, + "step": 1471 + }, + { + "epoch": 0.5789577187807277, + "grad_norm": 1.210160732269287, + "learning_rate": 2.248016013179054e-06, + "loss": 0.0433, + "step": 1472 + }, + { + "epoch": 0.5793510324483776, + "grad_norm": 0.757161557674408, + "learning_rate": 2.244599555837844e-06, + "loss": 0.035, + "step": 1473 + }, + { + "epoch": 0.5797443461160275, + "grad_norm": 1.0250403881072998, + "learning_rate": 2.2411835804322206e-06, + "loss": 0.0375, + "step": 1474 + }, + { + "epoch": 0.5801376597836775, + "grad_norm": 1.1955897808074951, + "learning_rate": 2.2377680934080625e-06, + "loss": 0.0449, + "step": 1475 + }, + { + "epoch": 0.5805309734513274, + "grad_norm": 1.7066453695297241, + "learning_rate": 2.2343531012103244e-06, + "loss": 0.0722, + "step": 1476 + }, + { + "epoch": 0.5809242871189774, + "grad_norm": 0.6709203720092773, + "learning_rate": 2.2309386102830295e-06, + "loss": 0.0354, + "step": 1477 + }, + { + "epoch": 0.5813176007866273, + "grad_norm": 0.9403322339057922, + "learning_rate": 2.227524627069256e-06, + "loss": 0.039, + "step": 1478 + }, + { + "epoch": 0.5817109144542773, + "grad_norm": 1.1907342672348022, + "learning_rate": 2.2241111580111207e-06, + "loss": 0.0894, + "step": 1479 + }, + { + "epoch": 0.5821042281219272, + "grad_norm": 0.9678034782409668, + "learning_rate": 2.220698209549774e-06, + "loss": 0.0492, + "step": 1480 + }, + { + "epoch": 0.5824975417895771, + "grad_norm": 0.5867919325828552, + "learning_rate": 2.2172857881253825e-06, + "loss": 0.0329, + "step": 1481 + }, + { + "epoch": 0.5828908554572272, + "grad_norm": 0.9085230827331543, + "learning_rate": 2.2138739001771157e-06, + "loss": 0.0501, + "step": 1482 + }, + { + "epoch": 0.5832841691248771, + "grad_norm": 1.015177845954895, + "learning_rate": 2.2104625521431396e-06, + "loss": 0.0297, + "step": 1483 + }, + { + "epoch": 0.5836774827925271, + "grad_norm": 0.48682698607444763, + "learning_rate": 2.207051750460601e-06, + "loss": 0.0329, + "step": 1484 + }, + { + "epoch": 0.584070796460177, + "grad_norm": 1.861662745475769, + "learning_rate": 2.2036415015656148e-06, + "loss": 0.0619, + "step": 1485 + }, + { + "epoch": 0.584464110127827, + "grad_norm": 0.9373002648353577, + "learning_rate": 2.2002318118932543e-06, + "loss": 0.0563, + "step": 1486 + }, + { + "epoch": 0.5848574237954769, + "grad_norm": 0.4820902943611145, + "learning_rate": 2.1968226878775347e-06, + "loss": 0.0206, + "step": 1487 + }, + { + "epoch": 0.5852507374631268, + "grad_norm": 0.6255022287368774, + "learning_rate": 2.1934141359514062e-06, + "loss": 0.0319, + "step": 1488 + }, + { + "epoch": 0.5856440511307768, + "grad_norm": 0.8468760848045349, + "learning_rate": 2.1900061625467393e-06, + "loss": 0.0574, + "step": 1489 + }, + { + "epoch": 0.5860373647984267, + "grad_norm": 0.519826352596283, + "learning_rate": 2.1865987740943116e-06, + "loss": 0.0595, + "step": 1490 + }, + { + "epoch": 0.5864306784660767, + "grad_norm": 1.6838140487670898, + "learning_rate": 2.183191977023799e-06, + "loss": 0.0549, + "step": 1491 + }, + { + "epoch": 0.5868239921337266, + "grad_norm": 1.3588017225265503, + "learning_rate": 2.17978577776376e-06, + "loss": 0.058, + "step": 1492 + }, + { + "epoch": 0.5872173058013765, + "grad_norm": 0.9913402199745178, + "learning_rate": 2.176380182741624e-06, + "loss": 0.021, + "step": 1493 + }, + { + "epoch": 0.5876106194690266, + "grad_norm": 1.7032448053359985, + "learning_rate": 2.172975198383682e-06, + "loss": 0.0565, + "step": 1494 + }, + { + "epoch": 0.5880039331366765, + "grad_norm": 0.9853689670562744, + "learning_rate": 2.169570831115072e-06, + "loss": 0.0532, + "step": 1495 + }, + { + "epoch": 0.5883972468043265, + "grad_norm": 1.061571717262268, + "learning_rate": 2.1661670873597686e-06, + "loss": 0.042, + "step": 1496 + }, + { + "epoch": 0.5887905604719764, + "grad_norm": 1.0780665874481201, + "learning_rate": 2.1627639735405683e-06, + "loss": 0.0412, + "step": 1497 + }, + { + "epoch": 0.5891838741396264, + "grad_norm": 1.1072509288787842, + "learning_rate": 2.1593614960790795e-06, + "loss": 0.0369, + "step": 1498 + }, + { + "epoch": 0.5895771878072763, + "grad_norm": 0.9231078028678894, + "learning_rate": 2.15595966139571e-06, + "loss": 0.0388, + "step": 1499 + }, + { + "epoch": 0.5899705014749262, + "grad_norm": 0.8702555894851685, + "learning_rate": 2.152558475909654e-06, + "loss": 0.0719, + "step": 1500 + }, + { + "epoch": 0.5903638151425762, + "grad_norm": 0.910358726978302, + "learning_rate": 2.149157946038882e-06, + "loss": 0.0468, + "step": 1501 + }, + { + "epoch": 0.5907571288102261, + "grad_norm": 1.3807059526443481, + "learning_rate": 2.145758078200126e-06, + "loss": 0.0729, + "step": 1502 + }, + { + "epoch": 0.5911504424778761, + "grad_norm": 0.9765854477882385, + "learning_rate": 2.1423588788088704e-06, + "loss": 0.0407, + "step": 1503 + }, + { + "epoch": 0.591543756145526, + "grad_norm": 1.021924376487732, + "learning_rate": 2.1389603542793364e-06, + "loss": 0.0342, + "step": 1504 + }, + { + "epoch": 0.591937069813176, + "grad_norm": 1.098352313041687, + "learning_rate": 2.1355625110244725e-06, + "loss": 0.0668, + "step": 1505 + }, + { + "epoch": 0.592330383480826, + "grad_norm": 1.5986775159835815, + "learning_rate": 2.1321653554559425e-06, + "loss": 0.0673, + "step": 1506 + }, + { + "epoch": 0.592723697148476, + "grad_norm": 1.2270184755325317, + "learning_rate": 2.1287688939841104e-06, + "loss": 0.0405, + "step": 1507 + }, + { + "epoch": 0.5931170108161259, + "grad_norm": 0.6227984428405762, + "learning_rate": 2.125373133018033e-06, + "loss": 0.0362, + "step": 1508 + }, + { + "epoch": 0.5935103244837758, + "grad_norm": 1.1838734149932861, + "learning_rate": 2.1219780789654436e-06, + "loss": 0.0705, + "step": 1509 + }, + { + "epoch": 0.5939036381514258, + "grad_norm": 1.5811330080032349, + "learning_rate": 2.1185837382327422e-06, + "loss": 0.0811, + "step": 1510 + }, + { + "epoch": 0.5942969518190757, + "grad_norm": 1.6723252534866333, + "learning_rate": 2.1151901172249823e-06, + "loss": 0.0711, + "step": 1511 + }, + { + "epoch": 0.5946902654867257, + "grad_norm": 1.1075739860534668, + "learning_rate": 2.1117972223458598e-06, + "loss": 0.0365, + "step": 1512 + }, + { + "epoch": 0.5950835791543756, + "grad_norm": 1.0250906944274902, + "learning_rate": 2.108405059997701e-06, + "loss": 0.0534, + "step": 1513 + }, + { + "epoch": 0.5954768928220255, + "grad_norm": 1.4097585678100586, + "learning_rate": 2.1050136365814484e-06, + "loss": 0.0633, + "step": 1514 + }, + { + "epoch": 0.5958702064896755, + "grad_norm": 1.0003234148025513, + "learning_rate": 2.10162295849665e-06, + "loss": 0.0331, + "step": 1515 + }, + { + "epoch": 0.5962635201573254, + "grad_norm": 1.203927755355835, + "learning_rate": 2.0982330321414495e-06, + "loss": 0.0397, + "step": 1516 + }, + { + "epoch": 0.5966568338249754, + "grad_norm": 1.1078671216964722, + "learning_rate": 2.094843863912571e-06, + "loss": 0.061, + "step": 1517 + }, + { + "epoch": 0.5970501474926254, + "grad_norm": 0.9437456130981445, + "learning_rate": 2.0914554602053072e-06, + "loss": 0.0549, + "step": 1518 + }, + { + "epoch": 0.5974434611602754, + "grad_norm": 0.34665971994400024, + "learning_rate": 2.0880678274135103e-06, + "loss": 0.0374, + "step": 1519 + }, + { + "epoch": 0.5978367748279253, + "grad_norm": 1.6303670406341553, + "learning_rate": 2.084680971929574e-06, + "loss": 0.0729, + "step": 1520 + }, + { + "epoch": 0.5982300884955752, + "grad_norm": 1.1011961698532104, + "learning_rate": 2.0812949001444293e-06, + "loss": 0.0399, + "step": 1521 + }, + { + "epoch": 0.5986234021632252, + "grad_norm": 0.8066303730010986, + "learning_rate": 2.077909618447526e-06, + "loss": 0.05, + "step": 1522 + }, + { + "epoch": 0.5990167158308751, + "grad_norm": 1.4448401927947998, + "learning_rate": 2.0745251332268238e-06, + "loss": 0.0616, + "step": 1523 + }, + { + "epoch": 0.5994100294985251, + "grad_norm": 0.49370574951171875, + "learning_rate": 2.07114145086878e-06, + "loss": 0.0496, + "step": 1524 + }, + { + "epoch": 0.599803343166175, + "grad_norm": 1.0275585651397705, + "learning_rate": 2.0677585777583366e-06, + "loss": 0.038, + "step": 1525 + }, + { + "epoch": 0.600196656833825, + "grad_norm": 1.1347780227661133, + "learning_rate": 2.0643765202789064e-06, + "loss": 0.0324, + "step": 1526 + }, + { + "epoch": 0.6005899705014749, + "grad_norm": 1.2602198123931885, + "learning_rate": 2.060995284812366e-06, + "loss": 0.0699, + "step": 1527 + }, + { + "epoch": 0.6009832841691248, + "grad_norm": 1.4369268417358398, + "learning_rate": 2.0576148777390397e-06, + "loss": 0.0664, + "step": 1528 + }, + { + "epoch": 0.6013765978367748, + "grad_norm": 1.8620692491531372, + "learning_rate": 2.0542353054376893e-06, + "loss": 0.0566, + "step": 1529 + }, + { + "epoch": 0.6017699115044248, + "grad_norm": 1.026005506515503, + "learning_rate": 2.0508565742855017e-06, + "loss": 0.023, + "step": 1530 + }, + { + "epoch": 0.6021632251720748, + "grad_norm": 0.8947687149047852, + "learning_rate": 2.0474786906580733e-06, + "loss": 0.0573, + "step": 1531 + }, + { + "epoch": 0.6025565388397247, + "grad_norm": 1.1179437637329102, + "learning_rate": 2.044101660929405e-06, + "loss": 0.0551, + "step": 1532 + }, + { + "epoch": 0.6029498525073747, + "grad_norm": 0.6822925806045532, + "learning_rate": 2.040725491471885e-06, + "loss": 0.0393, + "step": 1533 + }, + { + "epoch": 0.6033431661750246, + "grad_norm": 1.8381119966506958, + "learning_rate": 2.037350188656279e-06, + "loss": 0.0502, + "step": 1534 + }, + { + "epoch": 0.6037364798426745, + "grad_norm": 1.5118048191070557, + "learning_rate": 2.0339757588517165e-06, + "loss": 0.0403, + "step": 1535 + }, + { + "epoch": 0.6041297935103245, + "grad_norm": 1.0197237730026245, + "learning_rate": 2.0306022084256786e-06, + "loss": 0.0651, + "step": 1536 + }, + { + "epoch": 0.6045231071779744, + "grad_norm": 2.17777943611145, + "learning_rate": 2.027229543743989e-06, + "loss": 0.069, + "step": 1537 + }, + { + "epoch": 0.6049164208456244, + "grad_norm": 1.1577013731002808, + "learning_rate": 2.0238577711707987e-06, + "loss": 0.0615, + "step": 1538 + }, + { + "epoch": 0.6053097345132743, + "grad_norm": 1.1709601879119873, + "learning_rate": 2.0204868970685764e-06, + "loss": 0.0548, + "step": 1539 + }, + { + "epoch": 0.6057030481809242, + "grad_norm": 0.8054937124252319, + "learning_rate": 2.0171169277980954e-06, + "loss": 0.0479, + "step": 1540 + }, + { + "epoch": 0.6060963618485742, + "grad_norm": 0.9096735715866089, + "learning_rate": 2.0137478697184205e-06, + "loss": 0.0655, + "step": 1541 + }, + { + "epoch": 0.6064896755162242, + "grad_norm": 0.9453304409980774, + "learning_rate": 2.0103797291868977e-06, + "loss": 0.0812, + "step": 1542 + }, + { + "epoch": 0.6068829891838742, + "grad_norm": 0.8558923602104187, + "learning_rate": 2.0070125125591414e-06, + "loss": 0.0468, + "step": 1543 + }, + { + "epoch": 0.6072763028515241, + "grad_norm": 1.2030149698257446, + "learning_rate": 2.0036462261890225e-06, + "loss": 0.0542, + "step": 1544 + }, + { + "epoch": 0.6076696165191741, + "grad_norm": 0.9261341691017151, + "learning_rate": 2.0002808764286573e-06, + "loss": 0.0706, + "step": 1545 + }, + { + "epoch": 0.608062930186824, + "grad_norm": 0.7496268153190613, + "learning_rate": 1.9969164696283945e-06, + "loss": 0.0298, + "step": 1546 + }, + { + "epoch": 0.6084562438544739, + "grad_norm": 1.2815377712249756, + "learning_rate": 1.9935530121368023e-06, + "loss": 0.0555, + "step": 1547 + }, + { + "epoch": 0.6088495575221239, + "grad_norm": 0.964885413646698, + "learning_rate": 1.990190510300659e-06, + "loss": 0.0211, + "step": 1548 + }, + { + "epoch": 0.6092428711897738, + "grad_norm": 0.8117434978485107, + "learning_rate": 1.986828970464939e-06, + "loss": 0.0417, + "step": 1549 + }, + { + "epoch": 0.6096361848574238, + "grad_norm": 0.4136671721935272, + "learning_rate": 1.983468398972802e-06, + "loss": 0.0177, + "step": 1550 + }, + { + "epoch": 0.6100294985250737, + "grad_norm": 0.8469100594520569, + "learning_rate": 1.980108802165579e-06, + "loss": 0.0375, + "step": 1551 + }, + { + "epoch": 0.6104228121927237, + "grad_norm": 0.8030047416687012, + "learning_rate": 1.976750186382764e-06, + "loss": 0.0237, + "step": 1552 + }, + { + "epoch": 0.6108161258603736, + "grad_norm": 1.6747819185256958, + "learning_rate": 1.9733925579619965e-06, + "loss": 0.072, + "step": 1553 + }, + { + "epoch": 0.6112094395280236, + "grad_norm": 0.8288264870643616, + "learning_rate": 1.970035923239056e-06, + "loss": 0.0347, + "step": 1554 + }, + { + "epoch": 0.6116027531956736, + "grad_norm": 0.8544471859931946, + "learning_rate": 1.9666802885478463e-06, + "loss": 0.0445, + "step": 1555 + }, + { + "epoch": 0.6119960668633235, + "grad_norm": 0.8386610150337219, + "learning_rate": 1.963325660220384e-06, + "loss": 0.0609, + "step": 1556 + }, + { + "epoch": 0.6123893805309735, + "grad_norm": 1.3670865297317505, + "learning_rate": 1.9599720445867856e-06, + "loss": 0.0601, + "step": 1557 + }, + { + "epoch": 0.6127826941986234, + "grad_norm": 1.0806509256362915, + "learning_rate": 1.956619447975257e-06, + "loss": 0.058, + "step": 1558 + }, + { + "epoch": 0.6131760078662734, + "grad_norm": 0.9588520526885986, + "learning_rate": 1.9532678767120827e-06, + "loss": 0.0422, + "step": 1559 + }, + { + "epoch": 0.6135693215339233, + "grad_norm": 1.370969295501709, + "learning_rate": 1.9499173371216105e-06, + "loss": 0.0646, + "step": 1560 + }, + { + "epoch": 0.6139626352015732, + "grad_norm": 1.074244499206543, + "learning_rate": 1.946567835526243e-06, + "loss": 0.0613, + "step": 1561 + }, + { + "epoch": 0.6143559488692232, + "grad_norm": 0.8812416195869446, + "learning_rate": 1.943219378246423e-06, + "loss": 0.0626, + "step": 1562 + }, + { + "epoch": 0.6147492625368731, + "grad_norm": 1.3703498840332031, + "learning_rate": 1.9398719716006246e-06, + "loss": 0.0673, + "step": 1563 + }, + { + "epoch": 0.6151425762045231, + "grad_norm": 1.3188180923461914, + "learning_rate": 1.936525621905336e-06, + "loss": 0.0711, + "step": 1564 + }, + { + "epoch": 0.615535889872173, + "grad_norm": 0.5656819939613342, + "learning_rate": 1.9331803354750537e-06, + "loss": 0.0496, + "step": 1565 + }, + { + "epoch": 0.6159292035398231, + "grad_norm": 1.2018178701400757, + "learning_rate": 1.9298361186222665e-06, + "loss": 0.052, + "step": 1566 + }, + { + "epoch": 0.616322517207473, + "grad_norm": 1.197943091392517, + "learning_rate": 1.926492977657446e-06, + "loss": 0.0667, + "step": 1567 + }, + { + "epoch": 0.6167158308751229, + "grad_norm": 0.6885368227958679, + "learning_rate": 1.9231509188890345e-06, + "loss": 0.0374, + "step": 1568 + }, + { + "epoch": 0.6171091445427729, + "grad_norm": 0.8017690181732178, + "learning_rate": 1.919809948623428e-06, + "loss": 0.053, + "step": 1569 + }, + { + "epoch": 0.6175024582104228, + "grad_norm": 1.5223562717437744, + "learning_rate": 1.9164700731649723e-06, + "loss": 0.0605, + "step": 1570 + }, + { + "epoch": 0.6178957718780728, + "grad_norm": 1.8122631311416626, + "learning_rate": 1.913131298815947e-06, + "loss": 0.0719, + "step": 1571 + }, + { + "epoch": 0.6182890855457227, + "grad_norm": 1.5113699436187744, + "learning_rate": 1.9097936318765527e-06, + "loss": 0.0547, + "step": 1572 + }, + { + "epoch": 0.6186823992133726, + "grad_norm": 0.7732280492782593, + "learning_rate": 1.906457078644901e-06, + "loss": 0.0456, + "step": 1573 + }, + { + "epoch": 0.6190757128810226, + "grad_norm": 1.347740650177002, + "learning_rate": 1.903121645417003e-06, + "loss": 0.0469, + "step": 1574 + }, + { + "epoch": 0.6194690265486725, + "grad_norm": 0.6614682674407959, + "learning_rate": 1.8997873384867534e-06, + "loss": 0.0266, + "step": 1575 + }, + { + "epoch": 0.6198623402163225, + "grad_norm": 1.1419849395751953, + "learning_rate": 1.8964541641459242e-06, + "loss": 0.0465, + "step": 1576 + }, + { + "epoch": 0.6202556538839724, + "grad_norm": 0.9635249972343445, + "learning_rate": 1.893122128684149e-06, + "loss": 0.0482, + "step": 1577 + }, + { + "epoch": 0.6206489675516225, + "grad_norm": 0.9544531106948853, + "learning_rate": 1.8897912383889138e-06, + "loss": 0.0689, + "step": 1578 + }, + { + "epoch": 0.6210422812192724, + "grad_norm": 0.7220961451530457, + "learning_rate": 1.886461499545543e-06, + "loss": 0.0521, + "step": 1579 + }, + { + "epoch": 0.6214355948869223, + "grad_norm": 2.5634989738464355, + "learning_rate": 1.883132918437186e-06, + "loss": 0.0702, + "step": 1580 + }, + { + "epoch": 0.6218289085545723, + "grad_norm": 1.1183925867080688, + "learning_rate": 1.8798055013448105e-06, + "loss": 0.0623, + "step": 1581 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.7888696193695068, + "learning_rate": 1.8764792545471872e-06, + "loss": 0.0452, + "step": 1582 + }, + { + "epoch": 0.6226155358898722, + "grad_norm": 0.4925548732280731, + "learning_rate": 1.8731541843208772e-06, + "loss": 0.0481, + "step": 1583 + }, + { + "epoch": 0.6230088495575221, + "grad_norm": 1.184525489807129, + "learning_rate": 1.869830296940223e-06, + "loss": 0.0947, + "step": 1584 + }, + { + "epoch": 0.6234021632251721, + "grad_norm": 1.0969839096069336, + "learning_rate": 1.8665075986773346e-06, + "loss": 0.0786, + "step": 1585 + }, + { + "epoch": 0.623795476892822, + "grad_norm": 1.2557084560394287, + "learning_rate": 1.863186095802077e-06, + "loss": 0.048, + "step": 1586 + }, + { + "epoch": 0.6241887905604719, + "grad_norm": 0.9532119631767273, + "learning_rate": 1.8598657945820605e-06, + "loss": 0.0356, + "step": 1587 + }, + { + "epoch": 0.6245821042281219, + "grad_norm": 0.6121819019317627, + "learning_rate": 1.8565467012826282e-06, + "loss": 0.0395, + "step": 1588 + }, + { + "epoch": 0.6249754178957718, + "grad_norm": 0.9521839022636414, + "learning_rate": 1.853228822166843e-06, + "loss": 0.0417, + "step": 1589 + }, + { + "epoch": 0.6253687315634219, + "grad_norm": 1.3007653951644897, + "learning_rate": 1.849912163495479e-06, + "loss": 0.0376, + "step": 1590 + }, + { + "epoch": 0.6257620452310718, + "grad_norm": 1.0467530488967896, + "learning_rate": 1.8465967315270029e-06, + "loss": 0.0531, + "step": 1591 + }, + { + "epoch": 0.6261553588987218, + "grad_norm": 0.8435487747192383, + "learning_rate": 1.8432825325175707e-06, + "loss": 0.0333, + "step": 1592 + }, + { + "epoch": 0.6265486725663717, + "grad_norm": 1.2616933584213257, + "learning_rate": 1.8399695727210098e-06, + "loss": 0.0556, + "step": 1593 + }, + { + "epoch": 0.6269419862340216, + "grad_norm": 1.1721434593200684, + "learning_rate": 1.836657858388811e-06, + "loss": 0.0658, + "step": 1594 + }, + { + "epoch": 0.6273352999016716, + "grad_norm": 0.6084288954734802, + "learning_rate": 1.8333473957701126e-06, + "loss": 0.0385, + "step": 1595 + }, + { + "epoch": 0.6277286135693215, + "grad_norm": 1.4398316144943237, + "learning_rate": 1.830038191111692e-06, + "loss": 0.0606, + "step": 1596 + }, + { + "epoch": 0.6281219272369715, + "grad_norm": 1.9486684799194336, + "learning_rate": 1.8267302506579532e-06, + "loss": 0.0853, + "step": 1597 + }, + { + "epoch": 0.6285152409046214, + "grad_norm": 0.7250006794929504, + "learning_rate": 1.8234235806509145e-06, + "loss": 0.0295, + "step": 1598 + }, + { + "epoch": 0.6289085545722713, + "grad_norm": 1.2927533388137817, + "learning_rate": 1.8201181873301967e-06, + "loss": 0.046, + "step": 1599 + }, + { + "epoch": 0.6293018682399213, + "grad_norm": 1.2859911918640137, + "learning_rate": 1.816814076933012e-06, + "loss": 0.0579, + "step": 1600 + }, + { + "epoch": 0.6296951819075712, + "grad_norm": 1.900543451309204, + "learning_rate": 1.813511255694152e-06, + "loss": 0.0567, + "step": 1601 + }, + { + "epoch": 0.6300884955752213, + "grad_norm": 2.090280532836914, + "learning_rate": 1.8102097298459732e-06, + "loss": 0.0865, + "step": 1602 + }, + { + "epoch": 0.6304818092428712, + "grad_norm": 1.3595722913742065, + "learning_rate": 1.80690950561839e-06, + "loss": 0.0561, + "step": 1603 + }, + { + "epoch": 0.6308751229105212, + "grad_norm": 1.022291660308838, + "learning_rate": 1.8036105892388611e-06, + "loss": 0.0382, + "step": 1604 + }, + { + "epoch": 0.6312684365781711, + "grad_norm": 0.8052154779434204, + "learning_rate": 1.800312986932376e-06, + "loss": 0.0529, + "step": 1605 + }, + { + "epoch": 0.631661750245821, + "grad_norm": 4.667014122009277, + "learning_rate": 1.7970167049214466e-06, + "loss": 0.0492, + "step": 1606 + }, + { + "epoch": 0.632055063913471, + "grad_norm": 1.5009123086929321, + "learning_rate": 1.7937217494260888e-06, + "loss": 0.0779, + "step": 1607 + }, + { + "epoch": 0.6324483775811209, + "grad_norm": 1.570505976676941, + "learning_rate": 1.7904281266638201e-06, + "loss": 0.0577, + "step": 1608 + }, + { + "epoch": 0.6328416912487709, + "grad_norm": 1.3305639028549194, + "learning_rate": 1.7871358428496416e-06, + "loss": 0.0979, + "step": 1609 + }, + { + "epoch": 0.6332350049164208, + "grad_norm": 0.6136133074760437, + "learning_rate": 1.7838449041960276e-06, + "loss": 0.0424, + "step": 1610 + }, + { + "epoch": 0.6336283185840708, + "grad_norm": 0.7882452607154846, + "learning_rate": 1.7805553169129142e-06, + "loss": 0.0656, + "step": 1611 + }, + { + "epoch": 0.6340216322517207, + "grad_norm": 2.1648337841033936, + "learning_rate": 1.7772670872076883e-06, + "loss": 0.0622, + "step": 1612 + }, + { + "epoch": 0.6344149459193706, + "grad_norm": 0.5130072832107544, + "learning_rate": 1.773980221285173e-06, + "loss": 0.0394, + "step": 1613 + }, + { + "epoch": 0.6348082595870207, + "grad_norm": 1.0151782035827637, + "learning_rate": 1.7706947253476194e-06, + "loss": 0.0424, + "step": 1614 + }, + { + "epoch": 0.6352015732546706, + "grad_norm": 0.8527183532714844, + "learning_rate": 1.767410605594694e-06, + "loss": 0.0394, + "step": 1615 + }, + { + "epoch": 0.6355948869223206, + "grad_norm": 1.3671120405197144, + "learning_rate": 1.7641278682234658e-06, + "loss": 0.0625, + "step": 1616 + }, + { + "epoch": 0.6359882005899705, + "grad_norm": 0.8969728350639343, + "learning_rate": 1.7608465194283958e-06, + "loss": 0.0295, + "step": 1617 + }, + { + "epoch": 0.6363815142576205, + "grad_norm": 0.7407302260398865, + "learning_rate": 1.757566565401323e-06, + "loss": 0.055, + "step": 1618 + }, + { + "epoch": 0.6367748279252704, + "grad_norm": 1.153152346611023, + "learning_rate": 1.7542880123314559e-06, + "loss": 0.0945, + "step": 1619 + }, + { + "epoch": 0.6371681415929203, + "grad_norm": 1.259879231452942, + "learning_rate": 1.75101086640536e-06, + "loss": 0.0537, + "step": 1620 + }, + { + "epoch": 0.6375614552605703, + "grad_norm": 0.6502655744552612, + "learning_rate": 1.7477351338069442e-06, + "loss": 0.0443, + "step": 1621 + }, + { + "epoch": 0.6379547689282202, + "grad_norm": 0.9160225987434387, + "learning_rate": 1.7444608207174519e-06, + "loss": 0.0494, + "step": 1622 + }, + { + "epoch": 0.6383480825958702, + "grad_norm": 1.6503887176513672, + "learning_rate": 1.741187933315448e-06, + "loss": 0.0415, + "step": 1623 + }, + { + "epoch": 0.6387413962635201, + "grad_norm": 1.2449769973754883, + "learning_rate": 1.7379164777768038e-06, + "loss": 0.0607, + "step": 1624 + }, + { + "epoch": 0.63913470993117, + "grad_norm": 0.799196720123291, + "learning_rate": 1.734646460274692e-06, + "loss": 0.0404, + "step": 1625 + }, + { + "epoch": 0.6395280235988201, + "grad_norm": 1.6735135316848755, + "learning_rate": 1.7313778869795717e-06, + "loss": 0.0626, + "step": 1626 + }, + { + "epoch": 0.63992133726647, + "grad_norm": 1.090598702430725, + "learning_rate": 1.728110764059176e-06, + "loss": 0.0649, + "step": 1627 + }, + { + "epoch": 0.64031465093412, + "grad_norm": 0.6586104035377502, + "learning_rate": 1.7248450976785011e-06, + "loss": 0.0501, + "step": 1628 + }, + { + "epoch": 0.6407079646017699, + "grad_norm": 1.8684154748916626, + "learning_rate": 1.7215808939997945e-06, + "loss": 0.0653, + "step": 1629 + }, + { + "epoch": 0.6411012782694199, + "grad_norm": 1.1549500226974487, + "learning_rate": 1.7183181591825437e-06, + "loss": 0.0332, + "step": 1630 + }, + { + "epoch": 0.6414945919370698, + "grad_norm": 1.295351505279541, + "learning_rate": 1.7150568993834666e-06, + "loss": 0.0535, + "step": 1631 + }, + { + "epoch": 0.6418879056047198, + "grad_norm": 0.8795567750930786, + "learning_rate": 1.7117971207564934e-06, + "loss": 0.0866, + "step": 1632 + }, + { + "epoch": 0.6422812192723697, + "grad_norm": 0.6757074594497681, + "learning_rate": 1.7085388294527632e-06, + "loss": 0.0385, + "step": 1633 + }, + { + "epoch": 0.6426745329400196, + "grad_norm": 0.9733456373214722, + "learning_rate": 1.705282031620608e-06, + "loss": 0.0923, + "step": 1634 + }, + { + "epoch": 0.6430678466076696, + "grad_norm": 1.0591400861740112, + "learning_rate": 1.7020267334055393e-06, + "loss": 0.0492, + "step": 1635 + }, + { + "epoch": 0.6434611602753195, + "grad_norm": 0.8595137596130371, + "learning_rate": 1.6987729409502412e-06, + "loss": 0.0411, + "step": 1636 + }, + { + "epoch": 0.6438544739429695, + "grad_norm": 1.831631064414978, + "learning_rate": 1.6955206603945557e-06, + "loss": 0.0733, + "step": 1637 + }, + { + "epoch": 0.6442477876106195, + "grad_norm": 0.5861109495162964, + "learning_rate": 1.6922698978754726e-06, + "loss": 0.045, + "step": 1638 + }, + { + "epoch": 0.6446411012782695, + "grad_norm": 1.3072712421417236, + "learning_rate": 1.6890206595271153e-06, + "loss": 0.0713, + "step": 1639 + }, + { + "epoch": 0.6450344149459194, + "grad_norm": 0.8035500049591064, + "learning_rate": 1.6857729514807325e-06, + "loss": 0.0379, + "step": 1640 + }, + { + "epoch": 0.6454277286135693, + "grad_norm": 0.7814714312553406, + "learning_rate": 1.6825267798646851e-06, + "loss": 0.041, + "step": 1641 + }, + { + "epoch": 0.6458210422812193, + "grad_norm": 1.3243709802627563, + "learning_rate": 1.6792821508044352e-06, + "loss": 0.0633, + "step": 1642 + }, + { + "epoch": 0.6462143559488692, + "grad_norm": 0.8479057550430298, + "learning_rate": 1.6760390704225333e-06, + "loss": 0.0561, + "step": 1643 + }, + { + "epoch": 0.6466076696165192, + "grad_norm": 1.0051478147506714, + "learning_rate": 1.672797544838608e-06, + "loss": 0.0372, + "step": 1644 + }, + { + "epoch": 0.6470009832841691, + "grad_norm": 0.962547779083252, + "learning_rate": 1.6695575801693549e-06, + "loss": 0.0398, + "step": 1645 + }, + { + "epoch": 0.647394296951819, + "grad_norm": 1.314014196395874, + "learning_rate": 1.6663191825285214e-06, + "loss": 0.0492, + "step": 1646 + }, + { + "epoch": 0.647787610619469, + "grad_norm": 0.6934694647789001, + "learning_rate": 1.6630823580269005e-06, + "loss": 0.0367, + "step": 1647 + }, + { + "epoch": 0.6481809242871189, + "grad_norm": 1.1256476640701294, + "learning_rate": 1.6598471127723162e-06, + "loss": 0.0476, + "step": 1648 + }, + { + "epoch": 0.6485742379547689, + "grad_norm": 1.5946294069290161, + "learning_rate": 1.6566134528696126e-06, + "loss": 0.0484, + "step": 1649 + }, + { + "epoch": 0.6489675516224189, + "grad_norm": 1.1677006483078003, + "learning_rate": 1.6533813844206426e-06, + "loss": 0.0443, + "step": 1650 + }, + { + "epoch": 0.6493608652900689, + "grad_norm": 0.9727287292480469, + "learning_rate": 1.6501509135242533e-06, + "loss": 0.036, + "step": 1651 + }, + { + "epoch": 0.6497541789577188, + "grad_norm": 1.6365562677383423, + "learning_rate": 1.6469220462762807e-06, + "loss": 0.0794, + "step": 1652 + }, + { + "epoch": 0.6501474926253688, + "grad_norm": 0.9197725057601929, + "learning_rate": 1.6436947887695336e-06, + "loss": 0.0314, + "step": 1653 + }, + { + "epoch": 0.6505408062930187, + "grad_norm": 0.9444229006767273, + "learning_rate": 1.6404691470937829e-06, + "loss": 0.017, + "step": 1654 + }, + { + "epoch": 0.6509341199606686, + "grad_norm": 1.0287470817565918, + "learning_rate": 1.6372451273357504e-06, + "loss": 0.0674, + "step": 1655 + }, + { + "epoch": 0.6513274336283186, + "grad_norm": 0.9683353900909424, + "learning_rate": 1.6340227355790988e-06, + "loss": 0.0727, + "step": 1656 + }, + { + "epoch": 0.6517207472959685, + "grad_norm": 0.9869152903556824, + "learning_rate": 1.6308019779044154e-06, + "loss": 0.0526, + "step": 1657 + }, + { + "epoch": 0.6521140609636185, + "grad_norm": 2.224297046661377, + "learning_rate": 1.6275828603892078e-06, + "loss": 0.0635, + "step": 1658 + }, + { + "epoch": 0.6525073746312684, + "grad_norm": 0.8496151566505432, + "learning_rate": 1.6243653891078864e-06, + "loss": 0.0581, + "step": 1659 + }, + { + "epoch": 0.6529006882989183, + "grad_norm": 1.2158007621765137, + "learning_rate": 1.6211495701317565e-06, + "loss": 0.0728, + "step": 1660 + }, + { + "epoch": 0.6532940019665683, + "grad_norm": 0.48335015773773193, + "learning_rate": 1.6179354095290051e-06, + "loss": 0.0405, + "step": 1661 + }, + { + "epoch": 0.6536873156342183, + "grad_norm": 0.679865300655365, + "learning_rate": 1.6147229133646885e-06, + "loss": 0.0497, + "step": 1662 + }, + { + "epoch": 0.6540806293018683, + "grad_norm": 2.487617254257202, + "learning_rate": 1.611512087700724e-06, + "loss": 0.1029, + "step": 1663 + }, + { + "epoch": 0.6544739429695182, + "grad_norm": 1.0901083946228027, + "learning_rate": 1.6083029385958762e-06, + "loss": 0.0706, + "step": 1664 + }, + { + "epoch": 0.6548672566371682, + "grad_norm": 1.4582974910736084, + "learning_rate": 1.6050954721057461e-06, + "loss": 0.0651, + "step": 1665 + }, + { + "epoch": 0.6552605703048181, + "grad_norm": 1.1469032764434814, + "learning_rate": 1.6018896942827595e-06, + "loss": 0.0533, + "step": 1666 + }, + { + "epoch": 0.655653883972468, + "grad_norm": 1.5001522302627563, + "learning_rate": 1.5986856111761562e-06, + "loss": 0.0688, + "step": 1667 + }, + { + "epoch": 0.656047197640118, + "grad_norm": 0.7778475880622864, + "learning_rate": 1.595483228831976e-06, + "loss": 0.0457, + "step": 1668 + }, + { + "epoch": 0.6564405113077679, + "grad_norm": 0.910394549369812, + "learning_rate": 1.5922825532930526e-06, + "loss": 0.0295, + "step": 1669 + }, + { + "epoch": 0.6568338249754179, + "grad_norm": 1.1938371658325195, + "learning_rate": 1.5890835905989969e-06, + "loss": 0.0533, + "step": 1670 + }, + { + "epoch": 0.6572271386430678, + "grad_norm": 0.9362410306930542, + "learning_rate": 1.5858863467861882e-06, + "loss": 0.054, + "step": 1671 + }, + { + "epoch": 0.6576204523107178, + "grad_norm": 0.5481738448143005, + "learning_rate": 1.582690827887763e-06, + "loss": 0.037, + "step": 1672 + }, + { + "epoch": 0.6580137659783677, + "grad_norm": 0.8186729550361633, + "learning_rate": 1.5794970399336012e-06, + "loss": 0.0355, + "step": 1673 + }, + { + "epoch": 0.6584070796460177, + "grad_norm": 0.885360598564148, + "learning_rate": 1.576304988950318e-06, + "loss": 0.0478, + "step": 1674 + }, + { + "epoch": 0.6588003933136677, + "grad_norm": 1.0103771686553955, + "learning_rate": 1.5731146809612508e-06, + "loss": 0.0562, + "step": 1675 + }, + { + "epoch": 0.6591937069813176, + "grad_norm": 0.9461012482643127, + "learning_rate": 1.569926121986447e-06, + "loss": 0.0301, + "step": 1676 + }, + { + "epoch": 0.6595870206489676, + "grad_norm": 1.5684260129928589, + "learning_rate": 1.566739318042655e-06, + "loss": 0.0339, + "step": 1677 + }, + { + "epoch": 0.6599803343166175, + "grad_norm": 0.7456137537956238, + "learning_rate": 1.56355427514331e-06, + "loss": 0.0592, + "step": 1678 + }, + { + "epoch": 0.6603736479842675, + "grad_norm": 1.6279810667037964, + "learning_rate": 1.5603709992985256e-06, + "loss": 0.0452, + "step": 1679 + }, + { + "epoch": 0.6607669616519174, + "grad_norm": 1.3496975898742676, + "learning_rate": 1.5571894965150796e-06, + "loss": 0.058, + "step": 1680 + }, + { + "epoch": 0.6611602753195673, + "grad_norm": 1.0409663915634155, + "learning_rate": 1.554009772796406e-06, + "loss": 0.0635, + "step": 1681 + }, + { + "epoch": 0.6615535889872173, + "grad_norm": 0.6893079876899719, + "learning_rate": 1.55083183414258e-06, + "loss": 0.042, + "step": 1682 + }, + { + "epoch": 0.6619469026548672, + "grad_norm": 1.3735069036483765, + "learning_rate": 1.5476556865503095e-06, + "loss": 0.0418, + "step": 1683 + }, + { + "epoch": 0.6623402163225172, + "grad_norm": 0.9965916275978088, + "learning_rate": 1.5444813360129207e-06, + "loss": 0.0436, + "step": 1684 + }, + { + "epoch": 0.6627335299901671, + "grad_norm": 0.41811513900756836, + "learning_rate": 1.5413087885203515e-06, + "loss": 0.032, + "step": 1685 + }, + { + "epoch": 0.6631268436578172, + "grad_norm": 1.2320137023925781, + "learning_rate": 1.538138050059136e-06, + "loss": 0.0588, + "step": 1686 + }, + { + "epoch": 0.6635201573254671, + "grad_norm": 1.2540123462677002, + "learning_rate": 1.5349691266123946e-06, + "loss": 0.0527, + "step": 1687 + }, + { + "epoch": 0.663913470993117, + "grad_norm": 0.8406708240509033, + "learning_rate": 1.5318020241598248e-06, + "loss": 0.0479, + "step": 1688 + }, + { + "epoch": 0.664306784660767, + "grad_norm": 1.1033174991607666, + "learning_rate": 1.5286367486776835e-06, + "loss": 0.0566, + "step": 1689 + }, + { + "epoch": 0.6647000983284169, + "grad_norm": 1.4875179529190063, + "learning_rate": 1.5254733061387846e-06, + "loss": 0.0566, + "step": 1690 + }, + { + "epoch": 0.6650934119960669, + "grad_norm": 1.0827391147613525, + "learning_rate": 1.5223117025124817e-06, + "loss": 0.0333, + "step": 1691 + }, + { + "epoch": 0.6654867256637168, + "grad_norm": 1.2373061180114746, + "learning_rate": 1.5191519437646576e-06, + "loss": 0.048, + "step": 1692 + }, + { + "epoch": 0.6658800393313667, + "grad_norm": 0.9508680701255798, + "learning_rate": 1.5159940358577151e-06, + "loss": 0.0499, + "step": 1693 + }, + { + "epoch": 0.6662733529990167, + "grad_norm": 0.4500909447669983, + "learning_rate": 1.512837984750565e-06, + "loss": 0.0207, + "step": 1694 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.83719003200531, + "learning_rate": 1.5096837963986112e-06, + "loss": 0.0541, + "step": 1695 + }, + { + "epoch": 0.6670599803343166, + "grad_norm": 1.0231764316558838, + "learning_rate": 1.5065314767537453e-06, + "loss": 0.0255, + "step": 1696 + }, + { + "epoch": 0.6674532940019666, + "grad_norm": 0.8618975877761841, + "learning_rate": 1.5033810317643327e-06, + "loss": 0.0398, + "step": 1697 + }, + { + "epoch": 0.6678466076696166, + "grad_norm": 0.40866029262542725, + "learning_rate": 1.5002324673752006e-06, + "loss": 0.031, + "step": 1698 + }, + { + "epoch": 0.6682399213372665, + "grad_norm": 0.7475729584693909, + "learning_rate": 1.4970857895276285e-06, + "loss": 0.0534, + "step": 1699 + }, + { + "epoch": 0.6686332350049164, + "grad_norm": 1.0545064210891724, + "learning_rate": 1.4939410041593338e-06, + "loss": 0.0451, + "step": 1700 + }, + { + "epoch": 0.6690265486725664, + "grad_norm": 1.023006796836853, + "learning_rate": 1.4907981172044647e-06, + "loss": 0.0594, + "step": 1701 + }, + { + "epoch": 0.6694198623402163, + "grad_norm": 0.9975923299789429, + "learning_rate": 1.487657134593587e-06, + "loss": 0.0634, + "step": 1702 + }, + { + "epoch": 0.6698131760078663, + "grad_norm": 1.2105883359909058, + "learning_rate": 1.4845180622536728e-06, + "loss": 0.0482, + "step": 1703 + }, + { + "epoch": 0.6702064896755162, + "grad_norm": 1.007332682609558, + "learning_rate": 1.4813809061080893e-06, + "loss": 0.0706, + "step": 1704 + }, + { + "epoch": 0.6705998033431662, + "grad_norm": 0.7119497060775757, + "learning_rate": 1.4782456720765895e-06, + "loss": 0.0409, + "step": 1705 + }, + { + "epoch": 0.6709931170108161, + "grad_norm": 1.0542527437210083, + "learning_rate": 1.4751123660752955e-06, + "loss": 0.0388, + "step": 1706 + }, + { + "epoch": 0.671386430678466, + "grad_norm": 2.3204405307769775, + "learning_rate": 1.4719809940166952e-06, + "loss": 0.0724, + "step": 1707 + }, + { + "epoch": 0.671779744346116, + "grad_norm": 0.5740649700164795, + "learning_rate": 1.4688515618096252e-06, + "loss": 0.0319, + "step": 1708 + }, + { + "epoch": 0.672173058013766, + "grad_norm": 0.9803503155708313, + "learning_rate": 1.4657240753592627e-06, + "loss": 0.0504, + "step": 1709 + }, + { + "epoch": 0.672566371681416, + "grad_norm": 0.8115725517272949, + "learning_rate": 1.462598540567113e-06, + "loss": 0.0605, + "step": 1710 + }, + { + "epoch": 0.6729596853490659, + "grad_norm": 1.3304479122161865, + "learning_rate": 1.4594749633309981e-06, + "loss": 0.0758, + "step": 1711 + }, + { + "epoch": 0.6733529990167159, + "grad_norm": 1.208067774772644, + "learning_rate": 1.456353349545046e-06, + "loss": 0.0706, + "step": 1712 + }, + { + "epoch": 0.6737463126843658, + "grad_norm": 1.1107121706008911, + "learning_rate": 1.4532337050996804e-06, + "loss": 0.0468, + "step": 1713 + }, + { + "epoch": 0.6741396263520157, + "grad_norm": 1.192116618156433, + "learning_rate": 1.4501160358816085e-06, + "loss": 0.0657, + "step": 1714 + }, + { + "epoch": 0.6745329400196657, + "grad_norm": 1.0967481136322021, + "learning_rate": 1.4470003477738111e-06, + "loss": 0.0499, + "step": 1715 + }, + { + "epoch": 0.6749262536873156, + "grad_norm": 1.3263583183288574, + "learning_rate": 1.4438866466555308e-06, + "loss": 0.0449, + "step": 1716 + }, + { + "epoch": 0.6753195673549656, + "grad_norm": 1.5055456161499023, + "learning_rate": 1.4407749384022576e-06, + "loss": 0.0489, + "step": 1717 + }, + { + "epoch": 0.6757128810226155, + "grad_norm": 1.5726017951965332, + "learning_rate": 1.4376652288857249e-06, + "loss": 0.0626, + "step": 1718 + }, + { + "epoch": 0.6761061946902654, + "grad_norm": 1.6234389543533325, + "learning_rate": 1.4345575239738928e-06, + "loss": 0.0606, + "step": 1719 + }, + { + "epoch": 0.6764995083579154, + "grad_norm": 1.7149680852890015, + "learning_rate": 1.431451829530939e-06, + "loss": 0.0527, + "step": 1720 + }, + { + "epoch": 0.6768928220255654, + "grad_norm": 0.8043215870857239, + "learning_rate": 1.4283481514172487e-06, + "loss": 0.0454, + "step": 1721 + }, + { + "epoch": 0.6772861356932154, + "grad_norm": 1.3794721364974976, + "learning_rate": 1.425246495489399e-06, + "loss": 0.0522, + "step": 1722 + }, + { + "epoch": 0.6776794493608653, + "grad_norm": 0.7596322298049927, + "learning_rate": 1.4221468676001544e-06, + "loss": 0.0507, + "step": 1723 + }, + { + "epoch": 0.6780727630285153, + "grad_norm": 0.9277907013893127, + "learning_rate": 1.419049273598451e-06, + "loss": 0.0406, + "step": 1724 + }, + { + "epoch": 0.6784660766961652, + "grad_norm": 1.7175707817077637, + "learning_rate": 1.4159537193293876e-06, + "loss": 0.0477, + "step": 1725 + }, + { + "epoch": 0.6788593903638152, + "grad_norm": 0.5326056480407715, + "learning_rate": 1.4128602106342154e-06, + "loss": 0.0248, + "step": 1726 + }, + { + "epoch": 0.6792527040314651, + "grad_norm": 1.259993314743042, + "learning_rate": 1.4097687533503213e-06, + "loss": 0.05, + "step": 1727 + }, + { + "epoch": 0.679646017699115, + "grad_norm": 0.9844882488250732, + "learning_rate": 1.4066793533112255e-06, + "loss": 0.0407, + "step": 1728 + }, + { + "epoch": 0.680039331366765, + "grad_norm": 1.6221920251846313, + "learning_rate": 1.4035920163465648e-06, + "loss": 0.0589, + "step": 1729 + }, + { + "epoch": 0.6804326450344149, + "grad_norm": 2.0537407398223877, + "learning_rate": 1.400506748282083e-06, + "loss": 0.0622, + "step": 1730 + }, + { + "epoch": 0.6808259587020649, + "grad_norm": 1.1460561752319336, + "learning_rate": 1.3974235549396198e-06, + "loss": 0.0448, + "step": 1731 + }, + { + "epoch": 0.6812192723697148, + "grad_norm": 1.2280306816101074, + "learning_rate": 1.3943424421370998e-06, + "loss": 0.0621, + "step": 1732 + }, + { + "epoch": 0.6816125860373649, + "grad_norm": 1.9272797107696533, + "learning_rate": 1.3912634156885235e-06, + "loss": 0.0559, + "step": 1733 + }, + { + "epoch": 0.6820058997050148, + "grad_norm": 0.8985779285430908, + "learning_rate": 1.3881864814039503e-06, + "loss": 0.0568, + "step": 1734 + }, + { + "epoch": 0.6823992133726647, + "grad_norm": 0.5459672808647156, + "learning_rate": 1.3851116450894959e-06, + "loss": 0.03, + "step": 1735 + }, + { + "epoch": 0.6827925270403147, + "grad_norm": 0.8683139085769653, + "learning_rate": 1.382038912547315e-06, + "loss": 0.0513, + "step": 1736 + }, + { + "epoch": 0.6831858407079646, + "grad_norm": 0.7696962952613831, + "learning_rate": 1.3789682895755935e-06, + "loss": 0.0448, + "step": 1737 + }, + { + "epoch": 0.6835791543756146, + "grad_norm": 1.2431952953338623, + "learning_rate": 1.3758997819685366e-06, + "loss": 0.0493, + "step": 1738 + }, + { + "epoch": 0.6839724680432645, + "grad_norm": 0.9553192853927612, + "learning_rate": 1.3728333955163565e-06, + "loss": 0.0321, + "step": 1739 + }, + { + "epoch": 0.6843657817109144, + "grad_norm": 1.2432819604873657, + "learning_rate": 1.3697691360052646e-06, + "loss": 0.0744, + "step": 1740 + }, + { + "epoch": 0.6847590953785644, + "grad_norm": 0.6021830439567566, + "learning_rate": 1.3667070092174587e-06, + "loss": 0.0471, + "step": 1741 + }, + { + "epoch": 0.6851524090462143, + "grad_norm": 1.0340098142623901, + "learning_rate": 1.3636470209311093e-06, + "loss": 0.0645, + "step": 1742 + }, + { + "epoch": 0.6855457227138643, + "grad_norm": 1.2661107778549194, + "learning_rate": 1.360589176920355e-06, + "loss": 0.0314, + "step": 1743 + }, + { + "epoch": 0.6859390363815142, + "grad_norm": 1.7685880661010742, + "learning_rate": 1.357533482955287e-06, + "loss": 0.0635, + "step": 1744 + }, + { + "epoch": 0.6863323500491643, + "grad_norm": 1.249866008758545, + "learning_rate": 1.354479944801939e-06, + "loss": 0.0257, + "step": 1745 + }, + { + "epoch": 0.6867256637168142, + "grad_norm": 0.8888324499130249, + "learning_rate": 1.3514285682222777e-06, + "loss": 0.0501, + "step": 1746 + }, + { + "epoch": 0.6871189773844641, + "grad_norm": 0.9306212067604065, + "learning_rate": 1.3483793589741901e-06, + "loss": 0.0535, + "step": 1747 + }, + { + "epoch": 0.6875122910521141, + "grad_norm": 1.239108920097351, + "learning_rate": 1.3453323228114745e-06, + "loss": 0.0645, + "step": 1748 + }, + { + "epoch": 0.687905604719764, + "grad_norm": 1.971179723739624, + "learning_rate": 1.3422874654838263e-06, + "loss": 0.0617, + "step": 1749 + }, + { + "epoch": 0.688298918387414, + "grad_norm": 0.8780958652496338, + "learning_rate": 1.3392447927368315e-06, + "loss": 0.0303, + "step": 1750 + }, + { + "epoch": 0.6886922320550639, + "grad_norm": 0.5229460000991821, + "learning_rate": 1.3362043103119537e-06, + "loss": 0.0408, + "step": 1751 + }, + { + "epoch": 0.6890855457227139, + "grad_norm": 1.0178303718566895, + "learning_rate": 1.3331660239465232e-06, + "loss": 0.0692, + "step": 1752 + }, + { + "epoch": 0.6894788593903638, + "grad_norm": 1.1098684072494507, + "learning_rate": 1.3301299393737262e-06, + "loss": 0.0553, + "step": 1753 + }, + { + "epoch": 0.6898721730580137, + "grad_norm": 0.9905382990837097, + "learning_rate": 1.3270960623225953e-06, + "loss": 0.0551, + "step": 1754 + }, + { + "epoch": 0.6902654867256637, + "grad_norm": 1.15705406665802, + "learning_rate": 1.324064398517994e-06, + "loss": 0.0606, + "step": 1755 + }, + { + "epoch": 0.6906588003933136, + "grad_norm": 0.7547001838684082, + "learning_rate": 1.3210349536806138e-06, + "loss": 0.0375, + "step": 1756 + }, + { + "epoch": 0.6910521140609637, + "grad_norm": 0.9143390655517578, + "learning_rate": 1.3180077335269565e-06, + "loss": 0.0557, + "step": 1757 + }, + { + "epoch": 0.6914454277286136, + "grad_norm": 1.5813028812408447, + "learning_rate": 1.3149827437693267e-06, + "loss": 0.0734, + "step": 1758 + }, + { + "epoch": 0.6918387413962636, + "grad_norm": 1.3135156631469727, + "learning_rate": 1.3119599901158214e-06, + "loss": 0.0454, + "step": 1759 + }, + { + "epoch": 0.6922320550639135, + "grad_norm": 1.3713979721069336, + "learning_rate": 1.3089394782703152e-06, + "loss": 0.0459, + "step": 1760 + }, + { + "epoch": 0.6926253687315634, + "grad_norm": 1.0648804903030396, + "learning_rate": 1.3059212139324548e-06, + "loss": 0.0562, + "step": 1761 + }, + { + "epoch": 0.6930186823992134, + "grad_norm": 0.8367137312889099, + "learning_rate": 1.3029052027976457e-06, + "loss": 0.0269, + "step": 1762 + }, + { + "epoch": 0.6934119960668633, + "grad_norm": 1.1222723722457886, + "learning_rate": 1.299891450557041e-06, + "loss": 0.0458, + "step": 1763 + }, + { + "epoch": 0.6938053097345133, + "grad_norm": 1.087550163269043, + "learning_rate": 1.2968799628975311e-06, + "loss": 0.0357, + "step": 1764 + }, + { + "epoch": 0.6941986234021632, + "grad_norm": 0.8797011375427246, + "learning_rate": 1.2938707455017358e-06, + "loss": 0.0459, + "step": 1765 + }, + { + "epoch": 0.6945919370698131, + "grad_norm": 1.4389101266860962, + "learning_rate": 1.2908638040479855e-06, + "loss": 0.0715, + "step": 1766 + }, + { + "epoch": 0.6949852507374631, + "grad_norm": 0.826977014541626, + "learning_rate": 1.2878591442103215e-06, + "loss": 0.0498, + "step": 1767 + }, + { + "epoch": 0.695378564405113, + "grad_norm": 1.2073124647140503, + "learning_rate": 1.2848567716584764e-06, + "loss": 0.0401, + "step": 1768 + }, + { + "epoch": 0.6957718780727631, + "grad_norm": 1.2512377500534058, + "learning_rate": 1.2818566920578684e-06, + "loss": 0.0545, + "step": 1769 + }, + { + "epoch": 0.696165191740413, + "grad_norm": 1.003304123878479, + "learning_rate": 1.2788589110695896e-06, + "loss": 0.0657, + "step": 1770 + }, + { + "epoch": 0.696558505408063, + "grad_norm": 1.6829479932785034, + "learning_rate": 1.275863434350391e-06, + "loss": 0.0488, + "step": 1771 + }, + { + "epoch": 0.6969518190757129, + "grad_norm": 1.0957913398742676, + "learning_rate": 1.2728702675526788e-06, + "loss": 0.0695, + "step": 1772 + }, + { + "epoch": 0.6973451327433628, + "grad_norm": 1.2029186487197876, + "learning_rate": 1.2698794163244998e-06, + "loss": 0.0574, + "step": 1773 + }, + { + "epoch": 0.6977384464110128, + "grad_norm": 0.8925944566726685, + "learning_rate": 1.2668908863095311e-06, + "loss": 0.0424, + "step": 1774 + }, + { + "epoch": 0.6981317600786627, + "grad_norm": 0.8353788256645203, + "learning_rate": 1.2639046831470697e-06, + "loss": 0.038, + "step": 1775 + }, + { + "epoch": 0.6985250737463127, + "grad_norm": 2.284682273864746, + "learning_rate": 1.2609208124720228e-06, + "loss": 0.0687, + "step": 1776 + }, + { + "epoch": 0.6989183874139626, + "grad_norm": 0.9992805123329163, + "learning_rate": 1.2579392799148938e-06, + "loss": 0.0401, + "step": 1777 + }, + { + "epoch": 0.6993117010816126, + "grad_norm": 1.329393744468689, + "learning_rate": 1.2549600911017761e-06, + "loss": 0.0768, + "step": 1778 + }, + { + "epoch": 0.6997050147492625, + "grad_norm": 1.184579849243164, + "learning_rate": 1.25198325165434e-06, + "loss": 0.0467, + "step": 1779 + }, + { + "epoch": 0.7000983284169124, + "grad_norm": 0.6934780478477478, + "learning_rate": 1.2490087671898234e-06, + "loss": 0.0454, + "step": 1780 + }, + { + "epoch": 0.7004916420845625, + "grad_norm": 0.5612182021141052, + "learning_rate": 1.24603664332102e-06, + "loss": 0.0397, + "step": 1781 + }, + { + "epoch": 0.7008849557522124, + "grad_norm": 1.493826985359192, + "learning_rate": 1.243066885656267e-06, + "loss": 0.0815, + "step": 1782 + }, + { + "epoch": 0.7012782694198624, + "grad_norm": 0.7363511323928833, + "learning_rate": 1.240099499799439e-06, + "loss": 0.0496, + "step": 1783 + }, + { + "epoch": 0.7016715830875123, + "grad_norm": 1.6472634077072144, + "learning_rate": 1.237134491349935e-06, + "loss": 0.0741, + "step": 1784 + }, + { + "epoch": 0.7020648967551623, + "grad_norm": 1.3183567523956299, + "learning_rate": 1.234171865902667e-06, + "loss": 0.043, + "step": 1785 + }, + { + "epoch": 0.7024582104228122, + "grad_norm": 1.0543493032455444, + "learning_rate": 1.2312116290480506e-06, + "loss": 0.0401, + "step": 1786 + }, + { + "epoch": 0.7028515240904621, + "grad_norm": 0.8686029314994812, + "learning_rate": 1.228253786371995e-06, + "loss": 0.0335, + "step": 1787 + }, + { + "epoch": 0.7032448377581121, + "grad_norm": 1.9254342317581177, + "learning_rate": 1.2252983434558894e-06, + "loss": 0.0361, + "step": 1788 + }, + { + "epoch": 0.703638151425762, + "grad_norm": 0.8810344338417053, + "learning_rate": 1.2223453058765966e-06, + "loss": 0.0442, + "step": 1789 + }, + { + "epoch": 0.704031465093412, + "grad_norm": 1.138178825378418, + "learning_rate": 1.2193946792064403e-06, + "loss": 0.0768, + "step": 1790 + }, + { + "epoch": 0.7044247787610619, + "grad_norm": 0.7755922675132751, + "learning_rate": 1.2164464690131947e-06, + "loss": 0.0303, + "step": 1791 + }, + { + "epoch": 0.7048180924287119, + "grad_norm": 1.5868074893951416, + "learning_rate": 1.2135006808600752e-06, + "loss": 0.052, + "step": 1792 + }, + { + "epoch": 0.7052114060963619, + "grad_norm": 0.9672881364822388, + "learning_rate": 1.2105573203057233e-06, + "loss": 0.0432, + "step": 1793 + }, + { + "epoch": 0.7056047197640118, + "grad_norm": 0.9986976981163025, + "learning_rate": 1.207616392904204e-06, + "loss": 0.0464, + "step": 1794 + }, + { + "epoch": 0.7059980334316618, + "grad_norm": 0.646554708480835, + "learning_rate": 1.2046779042049883e-06, + "loss": 0.0268, + "step": 1795 + }, + { + "epoch": 0.7063913470993117, + "grad_norm": 0.6818554997444153, + "learning_rate": 1.2017418597529464e-06, + "loss": 0.0521, + "step": 1796 + }, + { + "epoch": 0.7067846607669617, + "grad_norm": 0.5991765260696411, + "learning_rate": 1.1988082650883376e-06, + "loss": 0.0538, + "step": 1797 + }, + { + "epoch": 0.7071779744346116, + "grad_norm": 1.1525814533233643, + "learning_rate": 1.1958771257467946e-06, + "loss": 0.0451, + "step": 1798 + }, + { + "epoch": 0.7075712881022616, + "grad_norm": 0.8486371040344238, + "learning_rate": 1.1929484472593205e-06, + "loss": 0.0514, + "step": 1799 + }, + { + "epoch": 0.7079646017699115, + "grad_norm": 1.393419623374939, + "learning_rate": 1.190022235152274e-06, + "loss": 0.0609, + "step": 1800 + }, + { + "epoch": 0.7083579154375614, + "grad_norm": 0.7574542760848999, + "learning_rate": 1.1870984949473586e-06, + "loss": 0.0604, + "step": 1801 + }, + { + "epoch": 0.7087512291052114, + "grad_norm": 1.0601574182510376, + "learning_rate": 1.184177232161615e-06, + "loss": 0.0459, + "step": 1802 + }, + { + "epoch": 0.7091445427728613, + "grad_norm": 0.7535306811332703, + "learning_rate": 1.1812584523074089e-06, + "loss": 0.0351, + "step": 1803 + }, + { + "epoch": 0.7095378564405113, + "grad_norm": 1.3023512363433838, + "learning_rate": 1.1783421608924183e-06, + "loss": 0.0598, + "step": 1804 + }, + { + "epoch": 0.7099311701081613, + "grad_norm": 1.1070560216903687, + "learning_rate": 1.1754283634196285e-06, + "loss": 0.0471, + "step": 1805 + }, + { + "epoch": 0.7103244837758113, + "grad_norm": 0.9613627791404724, + "learning_rate": 1.1725170653873174e-06, + "loss": 0.0486, + "step": 1806 + }, + { + "epoch": 0.7107177974434612, + "grad_norm": 0.7932494282722473, + "learning_rate": 1.1696082722890474e-06, + "loss": 0.0774, + "step": 1807 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.684893786907196, + "learning_rate": 1.1667019896136539e-06, + "loss": 0.0454, + "step": 1808 + }, + { + "epoch": 0.7115044247787611, + "grad_norm": 1.3207006454467773, + "learning_rate": 1.1637982228452329e-06, + "loss": 0.0473, + "step": 1809 + }, + { + "epoch": 0.711897738446411, + "grad_norm": 1.3429388999938965, + "learning_rate": 1.1608969774631366e-06, + "loss": 0.0412, + "step": 1810 + }, + { + "epoch": 0.712291052114061, + "grad_norm": 1.4132349491119385, + "learning_rate": 1.1579982589419568e-06, + "loss": 0.0549, + "step": 1811 + }, + { + "epoch": 0.7126843657817109, + "grad_norm": 0.7561691999435425, + "learning_rate": 1.155102072751518e-06, + "loss": 0.0337, + "step": 1812 + }, + { + "epoch": 0.7130776794493608, + "grad_norm": 0.7749929428100586, + "learning_rate": 1.152208424356867e-06, + "loss": 0.034, + "step": 1813 + }, + { + "epoch": 0.7134709931170108, + "grad_norm": 1.1324396133422852, + "learning_rate": 1.1493173192182613e-06, + "loss": 0.032, + "step": 1814 + }, + { + "epoch": 0.7138643067846607, + "grad_norm": 0.7702449560165405, + "learning_rate": 1.1464287627911577e-06, + "loss": 0.0451, + "step": 1815 + }, + { + "epoch": 0.7142576204523107, + "grad_norm": 0.7402438521385193, + "learning_rate": 1.1435427605262057e-06, + "loss": 0.0489, + "step": 1816 + }, + { + "epoch": 0.7146509341199607, + "grad_norm": 1.3986225128173828, + "learning_rate": 1.1406593178692346e-06, + "loss": 0.0463, + "step": 1817 + }, + { + "epoch": 0.7150442477876107, + "grad_norm": 0.7235271334648132, + "learning_rate": 1.1377784402612439e-06, + "loss": 0.0519, + "step": 1818 + }, + { + "epoch": 0.7154375614552606, + "grad_norm": 0.8625795841217041, + "learning_rate": 1.1349001331383921e-06, + "loss": 0.0375, + "step": 1819 + }, + { + "epoch": 0.7158308751229105, + "grad_norm": 1.5163322687149048, + "learning_rate": 1.132024401931988e-06, + "loss": 0.0557, + "step": 1820 + }, + { + "epoch": 0.7162241887905605, + "grad_norm": 0.6675801277160645, + "learning_rate": 1.12915125206848e-06, + "loss": 0.0261, + "step": 1821 + }, + { + "epoch": 0.7166175024582104, + "grad_norm": 0.9029967188835144, + "learning_rate": 1.1262806889694455e-06, + "loss": 0.037, + "step": 1822 + }, + { + "epoch": 0.7170108161258604, + "grad_norm": 0.716080367565155, + "learning_rate": 1.1234127180515787e-06, + "loss": 0.0559, + "step": 1823 + }, + { + "epoch": 0.7174041297935103, + "grad_norm": 0.9414195418357849, + "learning_rate": 1.1205473447266843e-06, + "loss": 0.0466, + "step": 1824 + }, + { + "epoch": 0.7177974434611603, + "grad_norm": 0.9414455890655518, + "learning_rate": 1.117684574401666e-06, + "loss": 0.0408, + "step": 1825 + }, + { + "epoch": 0.7181907571288102, + "grad_norm": 0.6914128065109253, + "learning_rate": 1.1148244124785143e-06, + "loss": 0.0286, + "step": 1826 + }, + { + "epoch": 0.7185840707964601, + "grad_norm": 1.238477349281311, + "learning_rate": 1.111966864354298e-06, + "loss": 0.0606, + "step": 1827 + }, + { + "epoch": 0.7189773844641101, + "grad_norm": 1.5670506954193115, + "learning_rate": 1.1091119354211544e-06, + "loss": 0.045, + "step": 1828 + }, + { + "epoch": 0.7193706981317601, + "grad_norm": 1.5129029750823975, + "learning_rate": 1.1062596310662775e-06, + "loss": 0.0352, + "step": 1829 + }, + { + "epoch": 0.7197640117994101, + "grad_norm": 1.0257515907287598, + "learning_rate": 1.1034099566719104e-06, + "loss": 0.0267, + "step": 1830 + }, + { + "epoch": 0.72015732546706, + "grad_norm": 0.8426341414451599, + "learning_rate": 1.1005629176153302e-06, + "loss": 0.0331, + "step": 1831 + }, + { + "epoch": 0.72055063913471, + "grad_norm": 1.1478296518325806, + "learning_rate": 1.097718519268844e-06, + "loss": 0.0601, + "step": 1832 + }, + { + "epoch": 0.7209439528023599, + "grad_norm": 1.6983435153961182, + "learning_rate": 1.0948767669997762e-06, + "loss": 0.0671, + "step": 1833 + }, + { + "epoch": 0.7213372664700098, + "grad_norm": 0.992310643196106, + "learning_rate": 1.092037666170456e-06, + "loss": 0.0554, + "step": 1834 + }, + { + "epoch": 0.7217305801376598, + "grad_norm": 1.258967399597168, + "learning_rate": 1.0892012221382115e-06, + "loss": 0.0423, + "step": 1835 + }, + { + "epoch": 0.7221238938053097, + "grad_norm": 0.8152772188186646, + "learning_rate": 1.0863674402553564e-06, + "loss": 0.0638, + "step": 1836 + }, + { + "epoch": 0.7225172074729597, + "grad_norm": 0.8680564165115356, + "learning_rate": 1.08353632586918e-06, + "loss": 0.0322, + "step": 1837 + }, + { + "epoch": 0.7229105211406096, + "grad_norm": 0.4944194257259369, + "learning_rate": 1.0807078843219395e-06, + "loss": 0.0684, + "step": 1838 + }, + { + "epoch": 0.7233038348082595, + "grad_norm": 1.0787291526794434, + "learning_rate": 1.077882120950849e-06, + "loss": 0.0355, + "step": 1839 + }, + { + "epoch": 0.7236971484759095, + "grad_norm": 0.4451111853122711, + "learning_rate": 1.0750590410880671e-06, + "loss": 0.0291, + "step": 1840 + }, + { + "epoch": 0.7240904621435595, + "grad_norm": 0.48384201526641846, + "learning_rate": 1.072238650060691e-06, + "loss": 0.0344, + "step": 1841 + }, + { + "epoch": 0.7244837758112095, + "grad_norm": 1.1826977729797363, + "learning_rate": 1.0694209531907412e-06, + "loss": 0.0302, + "step": 1842 + }, + { + "epoch": 0.7248770894788594, + "grad_norm": 0.5904631614685059, + "learning_rate": 1.0666059557951566e-06, + "loss": 0.0268, + "step": 1843 + }, + { + "epoch": 0.7252704031465094, + "grad_norm": 0.7693639993667603, + "learning_rate": 1.0637936631857815e-06, + "loss": 0.0329, + "step": 1844 + }, + { + "epoch": 0.7256637168141593, + "grad_norm": 1.1267420053482056, + "learning_rate": 1.0609840806693567e-06, + "loss": 0.0584, + "step": 1845 + }, + { + "epoch": 0.7260570304818093, + "grad_norm": 0.8826761841773987, + "learning_rate": 1.0581772135475089e-06, + "loss": 0.0371, + "step": 1846 + }, + { + "epoch": 0.7264503441494592, + "grad_norm": 0.9510964751243591, + "learning_rate": 1.0553730671167412e-06, + "loss": 0.0366, + "step": 1847 + }, + { + "epoch": 0.7268436578171091, + "grad_norm": 1.4061312675476074, + "learning_rate": 1.052571646668421e-06, + "loss": 0.0548, + "step": 1848 + }, + { + "epoch": 0.7272369714847591, + "grad_norm": 1.7235345840454102, + "learning_rate": 1.0497729574887744e-06, + "loss": 0.0729, + "step": 1849 + }, + { + "epoch": 0.727630285152409, + "grad_norm": 1.10977041721344, + "learning_rate": 1.0469770048588723e-06, + "loss": 0.042, + "step": 1850 + }, + { + "epoch": 0.728023598820059, + "grad_norm": 1.054607629776001, + "learning_rate": 1.0441837940546217e-06, + "loss": 0.0286, + "step": 1851 + }, + { + "epoch": 0.7284169124877089, + "grad_norm": 1.315953016281128, + "learning_rate": 1.0413933303467578e-06, + "loss": 0.0415, + "step": 1852 + }, + { + "epoch": 0.728810226155359, + "grad_norm": 1.4497429132461548, + "learning_rate": 1.038605619000828e-06, + "loss": 0.0566, + "step": 1853 + }, + { + "epoch": 0.7292035398230089, + "grad_norm": 1.1214773654937744, + "learning_rate": 1.0358206652771896e-06, + "loss": 0.0388, + "step": 1854 + }, + { + "epoch": 0.7295968534906588, + "grad_norm": 0.8499764204025269, + "learning_rate": 1.033038474430995e-06, + "loss": 0.022, + "step": 1855 + }, + { + "epoch": 0.7299901671583088, + "grad_norm": 0.993175745010376, + "learning_rate": 1.0302590517121835e-06, + "loss": 0.0351, + "step": 1856 + }, + { + "epoch": 0.7303834808259587, + "grad_norm": 1.3063788414001465, + "learning_rate": 1.0274824023654717e-06, + "loss": 0.049, + "step": 1857 + }, + { + "epoch": 0.7307767944936087, + "grad_norm": 0.6438285112380981, + "learning_rate": 1.0247085316303401e-06, + "loss": 0.0322, + "step": 1858 + }, + { + "epoch": 0.7311701081612586, + "grad_norm": 1.801291823387146, + "learning_rate": 1.0219374447410289e-06, + "loss": 0.0724, + "step": 1859 + }, + { + "epoch": 0.7315634218289085, + "grad_norm": 1.5461159944534302, + "learning_rate": 1.019169146926524e-06, + "loss": 0.0466, + "step": 1860 + }, + { + "epoch": 0.7319567354965585, + "grad_norm": 1.0814778804779053, + "learning_rate": 1.016403643410549e-06, + "loss": 0.0532, + "step": 1861 + }, + { + "epoch": 0.7323500491642084, + "grad_norm": 1.1939774751663208, + "learning_rate": 1.013640939411554e-06, + "loss": 0.0349, + "step": 1862 + }, + { + "epoch": 0.7327433628318584, + "grad_norm": 2.0183346271514893, + "learning_rate": 1.010881040142708e-06, + "loss": 0.0802, + "step": 1863 + }, + { + "epoch": 0.7331366764995083, + "grad_norm": 1.4486076831817627, + "learning_rate": 1.0081239508118842e-06, + "loss": 0.0381, + "step": 1864 + }, + { + "epoch": 0.7335299901671584, + "grad_norm": 0.7198472023010254, + "learning_rate": 1.0053696766216566e-06, + "loss": 0.0332, + "step": 1865 + }, + { + "epoch": 0.7339233038348083, + "grad_norm": 1.0703610181808472, + "learning_rate": 1.0026182227692865e-06, + "loss": 0.0321, + "step": 1866 + }, + { + "epoch": 0.7343166175024582, + "grad_norm": 0.9748527407646179, + "learning_rate": 9.998695944467127e-07, + "loss": 0.0312, + "step": 1867 + }, + { + "epoch": 0.7347099311701082, + "grad_norm": 0.6599907279014587, + "learning_rate": 9.97123796840543e-07, + "loss": 0.05, + "step": 1868 + }, + { + "epoch": 0.7351032448377581, + "grad_norm": 1.033435583114624, + "learning_rate": 9.943808351320418e-07, + "loss": 0.0482, + "step": 1869 + }, + { + "epoch": 0.7354965585054081, + "grad_norm": 1.139096975326538, + "learning_rate": 9.916407144971245e-07, + "loss": 0.046, + "step": 1870 + }, + { + "epoch": 0.735889872173058, + "grad_norm": 1.5064547061920166, + "learning_rate": 9.889034401063443e-07, + "loss": 0.0629, + "step": 1871 + }, + { + "epoch": 0.736283185840708, + "grad_norm": 0.7273301482200623, + "learning_rate": 9.861690171248841e-07, + "loss": 0.0314, + "step": 1872 + }, + { + "epoch": 0.7366764995083579, + "grad_norm": 0.579467236995697, + "learning_rate": 9.834374507125458e-07, + "loss": 0.0527, + "step": 1873 + }, + { + "epoch": 0.7370698131760078, + "grad_norm": 0.8448885679244995, + "learning_rate": 9.807087460237419e-07, + "loss": 0.0326, + "step": 1874 + }, + { + "epoch": 0.7374631268436578, + "grad_norm": 1.0001413822174072, + "learning_rate": 9.779829082074827e-07, + "loss": 0.0657, + "step": 1875 + }, + { + "epoch": 0.7378564405113077, + "grad_norm": 1.2145143747329712, + "learning_rate": 9.752599424073707e-07, + "loss": 0.0339, + "step": 1876 + }, + { + "epoch": 0.7382497541789578, + "grad_norm": 1.0525156259536743, + "learning_rate": 9.725398537615894e-07, + "loss": 0.0459, + "step": 1877 + }, + { + "epoch": 0.7386430678466077, + "grad_norm": 1.2982537746429443, + "learning_rate": 9.698226474028913e-07, + "loss": 0.0744, + "step": 1878 + }, + { + "epoch": 0.7390363815142577, + "grad_norm": 0.8789856433868408, + "learning_rate": 9.671083284585925e-07, + "loss": 0.0442, + "step": 1879 + }, + { + "epoch": 0.7394296951819076, + "grad_norm": 2.672044515609741, + "learning_rate": 9.643969020505573e-07, + "loss": 0.0769, + "step": 1880 + }, + { + "epoch": 0.7398230088495575, + "grad_norm": 1.0391490459442139, + "learning_rate": 9.616883732951945e-07, + "loss": 0.0721, + "step": 1881 + }, + { + "epoch": 0.7402163225172075, + "grad_norm": 1.1753817796707153, + "learning_rate": 9.589827473034443e-07, + "loss": 0.0463, + "step": 1882 + }, + { + "epoch": 0.7406096361848574, + "grad_norm": 1.260125994682312, + "learning_rate": 9.562800291807695e-07, + "loss": 0.0637, + "step": 1883 + }, + { + "epoch": 0.7410029498525074, + "grad_norm": 0.9175117015838623, + "learning_rate": 9.535802240271455e-07, + "loss": 0.037, + "step": 1884 + }, + { + "epoch": 0.7413962635201573, + "grad_norm": 0.9132412075996399, + "learning_rate": 9.508833369370524e-07, + "loss": 0.056, + "step": 1885 + }, + { + "epoch": 0.7417895771878072, + "grad_norm": 1.965725302696228, + "learning_rate": 9.481893729994609e-07, + "loss": 0.0545, + "step": 1886 + }, + { + "epoch": 0.7421828908554572, + "grad_norm": 2.073374032974243, + "learning_rate": 9.454983372978288e-07, + "loss": 0.0754, + "step": 1887 + }, + { + "epoch": 0.7425762045231071, + "grad_norm": 1.0531790256500244, + "learning_rate": 9.428102349100868e-07, + "loss": 0.0459, + "step": 1888 + }, + { + "epoch": 0.7429695181907572, + "grad_norm": 1.7750204801559448, + "learning_rate": 9.40125070908631e-07, + "loss": 0.061, + "step": 1889 + }, + { + "epoch": 0.7433628318584071, + "grad_norm": 0.6801098585128784, + "learning_rate": 9.374428503603139e-07, + "loss": 0.0597, + "step": 1890 + }, + { + "epoch": 0.7437561455260571, + "grad_norm": 0.6724294424057007, + "learning_rate": 9.347635783264309e-07, + "loss": 0.0302, + "step": 1891 + }, + { + "epoch": 0.744149459193707, + "grad_norm": 0.7799742817878723, + "learning_rate": 9.32087259862716e-07, + "loss": 0.0679, + "step": 1892 + }, + { + "epoch": 0.744542772861357, + "grad_norm": 1.623399257659912, + "learning_rate": 9.294139000193292e-07, + "loss": 0.0553, + "step": 1893 + }, + { + "epoch": 0.7449360865290069, + "grad_norm": 0.8977343440055847, + "learning_rate": 9.267435038408479e-07, + "loss": 0.0284, + "step": 1894 + }, + { + "epoch": 0.7453294001966568, + "grad_norm": 0.7733441591262817, + "learning_rate": 9.240760763662562e-07, + "loss": 0.0339, + "step": 1895 + }, + { + "epoch": 0.7457227138643068, + "grad_norm": 1.5382790565490723, + "learning_rate": 9.214116226289388e-07, + "loss": 0.0746, + "step": 1896 + }, + { + "epoch": 0.7461160275319567, + "grad_norm": 1.144547700881958, + "learning_rate": 9.187501476566648e-07, + "loss": 0.0351, + "step": 1897 + }, + { + "epoch": 0.7465093411996067, + "grad_norm": 0.7251105904579163, + "learning_rate": 9.16091656471586e-07, + "loss": 0.0634, + "step": 1898 + }, + { + "epoch": 0.7469026548672566, + "grad_norm": 0.999096155166626, + "learning_rate": 9.134361540902225e-07, + "loss": 0.0421, + "step": 1899 + }, + { + "epoch": 0.7472959685349065, + "grad_norm": 0.830605685710907, + "learning_rate": 9.10783645523455e-07, + "loss": 0.0426, + "step": 1900 + }, + { + "epoch": 0.7476892822025566, + "grad_norm": 1.5645976066589355, + "learning_rate": 9.081341357765145e-07, + "loss": 0.0416, + "step": 1901 + }, + { + "epoch": 0.7480825958702065, + "grad_norm": 0.8770972490310669, + "learning_rate": 9.054876298489742e-07, + "loss": 0.0561, + "step": 1902 + }, + { + "epoch": 0.7484759095378565, + "grad_norm": 1.5209007263183594, + "learning_rate": 9.02844132734737e-07, + "loss": 0.0419, + "step": 1903 + }, + { + "epoch": 0.7488692232055064, + "grad_norm": 3.409085512161255, + "learning_rate": 9.002036494220306e-07, + "loss": 0.0752, + "step": 1904 + }, + { + "epoch": 0.7492625368731564, + "grad_norm": 1.448819875717163, + "learning_rate": 8.975661848933945e-07, + "loss": 0.0523, + "step": 1905 + }, + { + "epoch": 0.7496558505408063, + "grad_norm": 0.998282790184021, + "learning_rate": 8.949317441256724e-07, + "loss": 0.0733, + "step": 1906 + }, + { + "epoch": 0.7500491642084562, + "grad_norm": 1.4408761262893677, + "learning_rate": 8.923003320900014e-07, + "loss": 0.0577, + "step": 1907 + }, + { + "epoch": 0.7504424778761062, + "grad_norm": 0.9130271077156067, + "learning_rate": 8.896719537518048e-07, + "loss": 0.0317, + "step": 1908 + }, + { + "epoch": 0.7508357915437561, + "grad_norm": 1.9195144176483154, + "learning_rate": 8.870466140707795e-07, + "loss": 0.0666, + "step": 1909 + }, + { + "epoch": 0.7512291052114061, + "grad_norm": 1.457318902015686, + "learning_rate": 8.844243180008913e-07, + "loss": 0.0762, + "step": 1910 + }, + { + "epoch": 0.751622418879056, + "grad_norm": 1.4528069496154785, + "learning_rate": 8.818050704903589e-07, + "loss": 0.0423, + "step": 1911 + }, + { + "epoch": 0.752015732546706, + "grad_norm": 0.849536120891571, + "learning_rate": 8.791888764816514e-07, + "loss": 0.0289, + "step": 1912 + }, + { + "epoch": 0.752409046214356, + "grad_norm": 1.4856075048446655, + "learning_rate": 8.765757409114753e-07, + "loss": 0.0665, + "step": 1913 + }, + { + "epoch": 0.752802359882006, + "grad_norm": 0.8997237086296082, + "learning_rate": 8.739656687107656e-07, + "loss": 0.0619, + "step": 1914 + }, + { + "epoch": 0.7531956735496559, + "grad_norm": 0.8566966652870178, + "learning_rate": 8.713586648046768e-07, + "loss": 0.0476, + "step": 1915 + }, + { + "epoch": 0.7535889872173058, + "grad_norm": 0.9483917355537415, + "learning_rate": 8.68754734112574e-07, + "loss": 0.0486, + "step": 1916 + }, + { + "epoch": 0.7539823008849558, + "grad_norm": 1.0472768545150757, + "learning_rate": 8.661538815480228e-07, + "loss": 0.0422, + "step": 1917 + }, + { + "epoch": 0.7543756145526057, + "grad_norm": 1.4821901321411133, + "learning_rate": 8.635561120187813e-07, + "loss": 0.0408, + "step": 1918 + }, + { + "epoch": 0.7547689282202557, + "grad_norm": 0.7954731583595276, + "learning_rate": 8.609614304267877e-07, + "loss": 0.059, + "step": 1919 + }, + { + "epoch": 0.7551622418879056, + "grad_norm": 0.9966669082641602, + "learning_rate": 8.583698416681555e-07, + "loss": 0.0303, + "step": 1920 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 0.39692261815071106, + "learning_rate": 8.557813506331616e-07, + "loss": 0.0324, + "step": 1921 + }, + { + "epoch": 0.7559488692232055, + "grad_norm": 1.7129300832748413, + "learning_rate": 8.531959622062372e-07, + "loss": 0.0397, + "step": 1922 + }, + { + "epoch": 0.7563421828908554, + "grad_norm": 1.0999704599380493, + "learning_rate": 8.506136812659601e-07, + "loss": 0.0455, + "step": 1923 + }, + { + "epoch": 0.7567354965585054, + "grad_norm": 1.2547434568405151, + "learning_rate": 8.480345126850414e-07, + "loss": 0.0658, + "step": 1924 + }, + { + "epoch": 0.7571288102261554, + "grad_norm": 1.1041603088378906, + "learning_rate": 8.454584613303227e-07, + "loss": 0.0339, + "step": 1925 + }, + { + "epoch": 0.7575221238938054, + "grad_norm": 0.8621834516525269, + "learning_rate": 8.428855320627613e-07, + "loss": 0.0294, + "step": 1926 + }, + { + "epoch": 0.7579154375614553, + "grad_norm": 0.7350767254829407, + "learning_rate": 8.403157297374239e-07, + "loss": 0.023, + "step": 1927 + }, + { + "epoch": 0.7583087512291052, + "grad_norm": 0.9072149991989136, + "learning_rate": 8.377490592034779e-07, + "loss": 0.0704, + "step": 1928 + }, + { + "epoch": 0.7587020648967552, + "grad_norm": 0.715020477771759, + "learning_rate": 8.35185525304178e-07, + "loss": 0.0321, + "step": 1929 + }, + { + "epoch": 0.7590953785644051, + "grad_norm": 0.7303974032402039, + "learning_rate": 8.326251328768626e-07, + "loss": 0.0207, + "step": 1930 + }, + { + "epoch": 0.7594886922320551, + "grad_norm": 1.534783124923706, + "learning_rate": 8.300678867529415e-07, + "loss": 0.0715, + "step": 1931 + }, + { + "epoch": 0.759882005899705, + "grad_norm": 0.6678977012634277, + "learning_rate": 8.275137917578879e-07, + "loss": 0.0454, + "step": 1932 + }, + { + "epoch": 0.760275319567355, + "grad_norm": 0.7839411497116089, + "learning_rate": 8.249628527112282e-07, + "loss": 0.053, + "step": 1933 + }, + { + "epoch": 0.7606686332350049, + "grad_norm": 0.6599370241165161, + "learning_rate": 8.224150744265352e-07, + "loss": 0.0312, + "step": 1934 + }, + { + "epoch": 0.7610619469026548, + "grad_norm": 0.8593689799308777, + "learning_rate": 8.198704617114143e-07, + "loss": 0.0219, + "step": 1935 + }, + { + "epoch": 0.7614552605703048, + "grad_norm": 1.0792686939239502, + "learning_rate": 8.173290193674996e-07, + "loss": 0.0688, + "step": 1936 + }, + { + "epoch": 0.7618485742379548, + "grad_norm": 1.1030522584915161, + "learning_rate": 8.147907521904433e-07, + "loss": 0.0598, + "step": 1937 + }, + { + "epoch": 0.7622418879056048, + "grad_norm": 1.4342604875564575, + "learning_rate": 8.122556649699051e-07, + "loss": 0.072, + "step": 1938 + }, + { + "epoch": 0.7626352015732547, + "grad_norm": 1.555779218673706, + "learning_rate": 8.097237624895452e-07, + "loss": 0.0875, + "step": 1939 + }, + { + "epoch": 0.7630285152409046, + "grad_norm": 1.7069602012634277, + "learning_rate": 8.07195049527012e-07, + "loss": 0.0625, + "step": 1940 + }, + { + "epoch": 0.7634218289085546, + "grad_norm": 1.4105464220046997, + "learning_rate": 8.046695308539376e-07, + "loss": 0.0302, + "step": 1941 + }, + { + "epoch": 0.7638151425762045, + "grad_norm": 0.9220629930496216, + "learning_rate": 8.021472112359255e-07, + "loss": 0.0788, + "step": 1942 + }, + { + "epoch": 0.7642084562438545, + "grad_norm": 1.7221704721450806, + "learning_rate": 7.996280954325433e-07, + "loss": 0.0701, + "step": 1943 + }, + { + "epoch": 0.7646017699115044, + "grad_norm": 1.240715503692627, + "learning_rate": 7.971121881973126e-07, + "loss": 0.0605, + "step": 1944 + }, + { + "epoch": 0.7649950835791544, + "grad_norm": 1.054165005683899, + "learning_rate": 7.945994942777016e-07, + "loss": 0.0278, + "step": 1945 + }, + { + "epoch": 0.7653883972468043, + "grad_norm": 0.3918832242488861, + "learning_rate": 7.92090018415112e-07, + "loss": 0.0433, + "step": 1946 + }, + { + "epoch": 0.7657817109144542, + "grad_norm": 1.2010436058044434, + "learning_rate": 7.895837653448759e-07, + "loss": 0.0645, + "step": 1947 + }, + { + "epoch": 0.7661750245821042, + "grad_norm": 0.6880310773849487, + "learning_rate": 7.870807397962438e-07, + "loss": 0.0466, + "step": 1948 + }, + { + "epoch": 0.7665683382497542, + "grad_norm": 0.8154659867286682, + "learning_rate": 7.845809464923748e-07, + "loss": 0.0478, + "step": 1949 + }, + { + "epoch": 0.7669616519174042, + "grad_norm": 0.7172273397445679, + "learning_rate": 7.820843901503308e-07, + "loss": 0.0352, + "step": 1950 + }, + { + "epoch": 0.7673549655850541, + "grad_norm": 1.7781319618225098, + "learning_rate": 7.79591075481062e-07, + "loss": 0.0732, + "step": 1951 + }, + { + "epoch": 0.7677482792527041, + "grad_norm": 0.6639533638954163, + "learning_rate": 7.771010071894052e-07, + "loss": 0.0179, + "step": 1952 + }, + { + "epoch": 0.768141592920354, + "grad_norm": 0.8761031627655029, + "learning_rate": 7.7461418997407e-07, + "loss": 0.0281, + "step": 1953 + }, + { + "epoch": 0.7685349065880039, + "grad_norm": 0.7496312856674194, + "learning_rate": 7.721306285276309e-07, + "loss": 0.053, + "step": 1954 + }, + { + "epoch": 0.7689282202556539, + "grad_norm": 0.46650174260139465, + "learning_rate": 7.696503275365194e-07, + "loss": 0.0513, + "step": 1955 + }, + { + "epoch": 0.7693215339233038, + "grad_norm": 1.1080721616744995, + "learning_rate": 7.671732916810154e-07, + "loss": 0.0507, + "step": 1956 + }, + { + "epoch": 0.7697148475909538, + "grad_norm": 0.6540339589118958, + "learning_rate": 7.646995256352346e-07, + "loss": 0.028, + "step": 1957 + }, + { + "epoch": 0.7701081612586037, + "grad_norm": 1.099401593208313, + "learning_rate": 7.622290340671256e-07, + "loss": 0.0623, + "step": 1958 + }, + { + "epoch": 0.7705014749262536, + "grad_norm": 0.9163020253181458, + "learning_rate": 7.597618216384576e-07, + "loss": 0.0251, + "step": 1959 + }, + { + "epoch": 0.7708947885939036, + "grad_norm": 1.32003915309906, + "learning_rate": 7.572978930048108e-07, + "loss": 0.0467, + "step": 1960 + }, + { + "epoch": 0.7712881022615536, + "grad_norm": 1.0354825258255005, + "learning_rate": 7.54837252815571e-07, + "loss": 0.0491, + "step": 1961 + }, + { + "epoch": 0.7716814159292036, + "grad_norm": 1.0285413265228271, + "learning_rate": 7.523799057139158e-07, + "loss": 0.0598, + "step": 1962 + }, + { + "epoch": 0.7720747295968535, + "grad_norm": 1.7109252214431763, + "learning_rate": 7.49925856336812e-07, + "loss": 0.058, + "step": 1963 + }, + { + "epoch": 0.7724680432645035, + "grad_norm": 1.3561407327651978, + "learning_rate": 7.474751093150015e-07, + "loss": 0.0351, + "step": 1964 + }, + { + "epoch": 0.7728613569321534, + "grad_norm": 0.4150741696357727, + "learning_rate": 7.450276692729957e-07, + "loss": 0.0181, + "step": 1965 + }, + { + "epoch": 0.7732546705998034, + "grad_norm": 1.0091959238052368, + "learning_rate": 7.425835408290655e-07, + "loss": 0.0403, + "step": 1966 + }, + { + "epoch": 0.7736479842674533, + "grad_norm": 2.851815938949585, + "learning_rate": 7.40142728595234e-07, + "loss": 0.0491, + "step": 1967 + }, + { + "epoch": 0.7740412979351032, + "grad_norm": 1.306333303451538, + "learning_rate": 7.377052371772637e-07, + "loss": 0.058, + "step": 1968 + }, + { + "epoch": 0.7744346116027532, + "grad_norm": 0.8560998439788818, + "learning_rate": 7.352710711746536e-07, + "loss": 0.0284, + "step": 1969 + }, + { + "epoch": 0.7748279252704031, + "grad_norm": 1.8746119737625122, + "learning_rate": 7.328402351806269e-07, + "loss": 0.0654, + "step": 1970 + }, + { + "epoch": 0.7752212389380531, + "grad_norm": 1.0875734090805054, + "learning_rate": 7.304127337821229e-07, + "loss": 0.0402, + "step": 1971 + }, + { + "epoch": 0.775614552605703, + "grad_norm": 0.8440957069396973, + "learning_rate": 7.279885715597896e-07, + "loss": 0.0367, + "step": 1972 + }, + { + "epoch": 0.776007866273353, + "grad_norm": 1.528245210647583, + "learning_rate": 7.255677530879713e-07, + "loss": 0.0336, + "step": 1973 + }, + { + "epoch": 0.776401179941003, + "grad_norm": 1.6772621870040894, + "learning_rate": 7.231502829347056e-07, + "loss": 0.0388, + "step": 1974 + }, + { + "epoch": 0.7767944936086529, + "grad_norm": 0.85129314661026, + "learning_rate": 7.207361656617112e-07, + "loss": 0.0521, + "step": 1975 + }, + { + "epoch": 0.7771878072763029, + "grad_norm": 1.1908273696899414, + "learning_rate": 7.183254058243791e-07, + "loss": 0.0419, + "step": 1976 + }, + { + "epoch": 0.7775811209439528, + "grad_norm": 1.2314374446868896, + "learning_rate": 7.159180079717656e-07, + "loss": 0.044, + "step": 1977 + }, + { + "epoch": 0.7779744346116028, + "grad_norm": 1.7192610502243042, + "learning_rate": 7.135139766465838e-07, + "loss": 0.0663, + "step": 1978 + }, + { + "epoch": 0.7783677482792527, + "grad_norm": 1.5432205200195312, + "learning_rate": 7.111133163851916e-07, + "loss": 0.0267, + "step": 1979 + }, + { + "epoch": 0.7787610619469026, + "grad_norm": 0.759152352809906, + "learning_rate": 7.087160317175881e-07, + "loss": 0.0299, + "step": 1980 + }, + { + "epoch": 0.7791543756145526, + "grad_norm": 0.9122269749641418, + "learning_rate": 7.06322127167402e-07, + "loss": 0.0301, + "step": 1981 + }, + { + "epoch": 0.7795476892822025, + "grad_norm": 0.7516564130783081, + "learning_rate": 7.03931607251884e-07, + "loss": 0.0627, + "step": 1982 + }, + { + "epoch": 0.7799410029498525, + "grad_norm": 1.2953605651855469, + "learning_rate": 7.015444764818988e-07, + "loss": 0.0571, + "step": 1983 + }, + { + "epoch": 0.7803343166175024, + "grad_norm": 0.8770161271095276, + "learning_rate": 6.991607393619129e-07, + "loss": 0.0322, + "step": 1984 + }, + { + "epoch": 0.7807276302851525, + "grad_norm": 0.8347287774085999, + "learning_rate": 6.967804003899925e-07, + "loss": 0.0497, + "step": 1985 + }, + { + "epoch": 0.7811209439528024, + "grad_norm": 0.5185628533363342, + "learning_rate": 6.944034640577896e-07, + "loss": 0.0292, + "step": 1986 + }, + { + "epoch": 0.7815142576204523, + "grad_norm": 0.9084299802780151, + "learning_rate": 6.920299348505365e-07, + "loss": 0.0343, + "step": 1987 + }, + { + "epoch": 0.7819075712881023, + "grad_norm": 1.2148305177688599, + "learning_rate": 6.896598172470356e-07, + "loss": 0.07, + "step": 1988 + }, + { + "epoch": 0.7823008849557522, + "grad_norm": 1.0693104267120361, + "learning_rate": 6.872931157196519e-07, + "loss": 0.0509, + "step": 1989 + }, + { + "epoch": 0.7826941986234022, + "grad_norm": 0.5483916997909546, + "learning_rate": 6.849298347343044e-07, + "loss": 0.04, + "step": 1990 + }, + { + "epoch": 0.7830875122910521, + "grad_norm": 0.9246038794517517, + "learning_rate": 6.825699787504586e-07, + "loss": 0.0602, + "step": 1991 + }, + { + "epoch": 0.783480825958702, + "grad_norm": 0.7501392960548401, + "learning_rate": 6.802135522211142e-07, + "loss": 0.0331, + "step": 1992 + }, + { + "epoch": 0.783874139626352, + "grad_norm": 0.8467764854431152, + "learning_rate": 6.778605595928025e-07, + "loss": 0.0325, + "step": 1993 + }, + { + "epoch": 0.7842674532940019, + "grad_norm": 0.5727487206459045, + "learning_rate": 6.755110053055738e-07, + "loss": 0.0264, + "step": 1994 + }, + { + "epoch": 0.7846607669616519, + "grad_norm": 1.1488757133483887, + "learning_rate": 6.731648937929911e-07, + "loss": 0.0548, + "step": 1995 + }, + { + "epoch": 0.7850540806293018, + "grad_norm": 0.7147387862205505, + "learning_rate": 6.708222294821196e-07, + "loss": 0.0548, + "step": 1996 + }, + { + "epoch": 0.7854473942969519, + "grad_norm": 1.0995930433273315, + "learning_rate": 6.684830167935207e-07, + "loss": 0.0476, + "step": 1997 + }, + { + "epoch": 0.7858407079646018, + "grad_norm": 1.1355059146881104, + "learning_rate": 6.66147260141243e-07, + "loss": 0.0501, + "step": 1998 + }, + { + "epoch": 0.7862340216322518, + "grad_norm": 0.7553796768188477, + "learning_rate": 6.638149639328134e-07, + "loss": 0.0686, + "step": 1999 + }, + { + "epoch": 0.7866273352999017, + "grad_norm": 0.8902336359024048, + "learning_rate": 6.614861325692277e-07, + "loss": 0.0349, + "step": 2000 + }, + { + "epoch": 0.7870206489675516, + "grad_norm": 1.090766429901123, + "learning_rate": 6.591607704449446e-07, + "loss": 0.0527, + "step": 2001 + }, + { + "epoch": 0.7874139626352016, + "grad_norm": 1.142582654953003, + "learning_rate": 6.568388819478769e-07, + "loss": 0.0537, + "step": 2002 + }, + { + "epoch": 0.7878072763028515, + "grad_norm": 1.449288010597229, + "learning_rate": 6.545204714593825e-07, + "loss": 0.0587, + "step": 2003 + }, + { + "epoch": 0.7882005899705015, + "grad_norm": 1.7187999486923218, + "learning_rate": 6.522055433542557e-07, + "loss": 0.0624, + "step": 2004 + }, + { + "epoch": 0.7885939036381514, + "grad_norm": 1.5539288520812988, + "learning_rate": 6.49894102000721e-07, + "loss": 0.0553, + "step": 2005 + }, + { + "epoch": 0.7889872173058013, + "grad_norm": 1.4520833492279053, + "learning_rate": 6.47586151760421e-07, + "loss": 0.0297, + "step": 2006 + }, + { + "epoch": 0.7893805309734513, + "grad_norm": 1.2936962842941284, + "learning_rate": 6.452816969884127e-07, + "loss": 0.0335, + "step": 2007 + }, + { + "epoch": 0.7897738446411012, + "grad_norm": 1.2932931184768677, + "learning_rate": 6.429807420331568e-07, + "loss": 0.0622, + "step": 2008 + }, + { + "epoch": 0.7901671583087513, + "grad_norm": 0.9521369934082031, + "learning_rate": 6.406832912365101e-07, + "loss": 0.0669, + "step": 2009 + }, + { + "epoch": 0.7905604719764012, + "grad_norm": 0.9570633172988892, + "learning_rate": 6.383893489337172e-07, + "loss": 0.054, + "step": 2010 + }, + { + "epoch": 0.7909537856440512, + "grad_norm": 0.7929260730743408, + "learning_rate": 6.360989194534004e-07, + "loss": 0.028, + "step": 2011 + }, + { + "epoch": 0.7913470993117011, + "grad_norm": 1.2527369260787964, + "learning_rate": 6.338120071175558e-07, + "loss": 0.0631, + "step": 2012 + }, + { + "epoch": 0.791740412979351, + "grad_norm": 0.9790352582931519, + "learning_rate": 6.315286162415412e-07, + "loss": 0.0485, + "step": 2013 + }, + { + "epoch": 0.792133726647001, + "grad_norm": 1.417540431022644, + "learning_rate": 6.292487511340709e-07, + "loss": 0.0575, + "step": 2014 + }, + { + "epoch": 0.7925270403146509, + "grad_norm": 1.3456201553344727, + "learning_rate": 6.269724160972043e-07, + "loss": 0.0709, + "step": 2015 + }, + { + "epoch": 0.7929203539823009, + "grad_norm": 1.3013477325439453, + "learning_rate": 6.246996154263421e-07, + "loss": 0.0571, + "step": 2016 + }, + { + "epoch": 0.7933136676499508, + "grad_norm": 1.0679081678390503, + "learning_rate": 6.224303534102125e-07, + "loss": 0.0395, + "step": 2017 + }, + { + "epoch": 0.7937069813176008, + "grad_norm": 1.3359334468841553, + "learning_rate": 6.201646343308685e-07, + "loss": 0.0439, + "step": 2018 + }, + { + "epoch": 0.7941002949852507, + "grad_norm": 1.4549192190170288, + "learning_rate": 6.179024624636772e-07, + "loss": 0.057, + "step": 2019 + }, + { + "epoch": 0.7944936086529006, + "grad_norm": 0.8267070055007935, + "learning_rate": 6.156438420773125e-07, + "loss": 0.0207, + "step": 2020 + }, + { + "epoch": 0.7948869223205507, + "grad_norm": 1.1873496770858765, + "learning_rate": 6.133887774337471e-07, + "loss": 0.0449, + "step": 2021 + }, + { + "epoch": 0.7952802359882006, + "grad_norm": 1.971118450164795, + "learning_rate": 6.111372727882417e-07, + "loss": 0.0444, + "step": 2022 + }, + { + "epoch": 0.7956735496558506, + "grad_norm": 0.5039023160934448, + "learning_rate": 6.088893323893419e-07, + "loss": 0.0165, + "step": 2023 + }, + { + "epoch": 0.7960668633235005, + "grad_norm": 1.2124491930007935, + "learning_rate": 6.066449604788666e-07, + "loss": 0.0384, + "step": 2024 + }, + { + "epoch": 0.7964601769911505, + "grad_norm": 1.4836233854293823, + "learning_rate": 6.044041612919016e-07, + "loss": 0.0711, + "step": 2025 + }, + { + "epoch": 0.7968534906588004, + "grad_norm": 1.4890559911727905, + "learning_rate": 6.021669390567902e-07, + "loss": 0.048, + "step": 2026 + }, + { + "epoch": 0.7972468043264503, + "grad_norm": 0.5430221557617188, + "learning_rate": 5.999332979951272e-07, + "loss": 0.049, + "step": 2027 + }, + { + "epoch": 0.7976401179941003, + "grad_norm": 0.9645549654960632, + "learning_rate": 5.977032423217482e-07, + "loss": 0.0201, + "step": 2028 + }, + { + "epoch": 0.7980334316617502, + "grad_norm": 1.7599254846572876, + "learning_rate": 5.954767762447244e-07, + "loss": 0.0524, + "step": 2029 + }, + { + "epoch": 0.7984267453294002, + "grad_norm": 0.6832358241081238, + "learning_rate": 5.932539039653535e-07, + "loss": 0.0451, + "step": 2030 + }, + { + "epoch": 0.7988200589970501, + "grad_norm": 0.5469837188720703, + "learning_rate": 5.910346296781511e-07, + "loss": 0.0342, + "step": 2031 + }, + { + "epoch": 0.7992133726647, + "grad_norm": 1.466138482093811, + "learning_rate": 5.888189575708453e-07, + "loss": 0.0619, + "step": 2032 + }, + { + "epoch": 0.7996066863323501, + "grad_norm": 1.1846930980682373, + "learning_rate": 5.866068918243634e-07, + "loss": 0.0527, + "step": 2033 + }, + { + "epoch": 0.8, + "grad_norm": 0.8236525058746338, + "learning_rate": 5.843984366128308e-07, + "loss": 0.0427, + "step": 2034 + }, + { + "epoch": 0.80039331366765, + "grad_norm": 0.8086917996406555, + "learning_rate": 5.821935961035589e-07, + "loss": 0.0743, + "step": 2035 + }, + { + "epoch": 0.8007866273352999, + "grad_norm": 1.3642960786819458, + "learning_rate": 5.799923744570376e-07, + "loss": 0.0609, + "step": 2036 + }, + { + "epoch": 0.8011799410029499, + "grad_norm": 1.4578794240951538, + "learning_rate": 5.777947758269295e-07, + "loss": 0.0828, + "step": 2037 + }, + { + "epoch": 0.8015732546705998, + "grad_norm": 0.5745184421539307, + "learning_rate": 5.756008043600594e-07, + "loss": 0.0444, + "step": 2038 + }, + { + "epoch": 0.8019665683382498, + "grad_norm": 2.3881709575653076, + "learning_rate": 5.734104641964075e-07, + "loss": 0.074, + "step": 2039 + }, + { + "epoch": 0.8023598820058997, + "grad_norm": 1.0504474639892578, + "learning_rate": 5.712237594691028e-07, + "loss": 0.0573, + "step": 2040 + }, + { + "epoch": 0.8027531956735496, + "grad_norm": 1.7040578126907349, + "learning_rate": 5.690406943044138e-07, + "loss": 0.0472, + "step": 2041 + }, + { + "epoch": 0.8031465093411996, + "grad_norm": 0.9709568619728088, + "learning_rate": 5.668612728217412e-07, + "loss": 0.0305, + "step": 2042 + }, + { + "epoch": 0.8035398230088495, + "grad_norm": 2.0475189685821533, + "learning_rate": 5.646854991336112e-07, + "loss": 0.0661, + "step": 2043 + }, + { + "epoch": 0.8039331366764995, + "grad_norm": 1.4109443426132202, + "learning_rate": 5.625133773456639e-07, + "loss": 0.0698, + "step": 2044 + }, + { + "epoch": 0.8043264503441495, + "grad_norm": 0.8161342740058899, + "learning_rate": 5.603449115566511e-07, + "loss": 0.0417, + "step": 2045 + }, + { + "epoch": 0.8047197640117995, + "grad_norm": 1.1740028858184814, + "learning_rate": 5.581801058584252e-07, + "loss": 0.0444, + "step": 2046 + }, + { + "epoch": 0.8051130776794494, + "grad_norm": 2.580334424972534, + "learning_rate": 5.560189643359312e-07, + "loss": 0.0988, + "step": 2047 + }, + { + "epoch": 0.8055063913470993, + "grad_norm": 0.8429194092750549, + "learning_rate": 5.538614910672005e-07, + "loss": 0.0312, + "step": 2048 + }, + { + "epoch": 0.8058997050147493, + "grad_norm": 0.8115060925483704, + "learning_rate": 5.517076901233434e-07, + "loss": 0.0561, + "step": 2049 + }, + { + "epoch": 0.8062930186823992, + "grad_norm": 0.5982792377471924, + "learning_rate": 5.495575655685382e-07, + "loss": 0.0369, + "step": 2050 + }, + { + "epoch": 0.8066863323500492, + "grad_norm": 1.5597193241119385, + "learning_rate": 5.474111214600278e-07, + "loss": 0.0701, + "step": 2051 + }, + { + "epoch": 0.8070796460176991, + "grad_norm": 1.3873978853225708, + "learning_rate": 5.452683618481103e-07, + "loss": 0.0372, + "step": 2052 + }, + { + "epoch": 0.807472959685349, + "grad_norm": 0.9317770004272461, + "learning_rate": 5.431292907761305e-07, + "loss": 0.0433, + "step": 2053 + }, + { + "epoch": 0.807866273352999, + "grad_norm": 1.736678957939148, + "learning_rate": 5.409939122804736e-07, + "loss": 0.0562, + "step": 2054 + }, + { + "epoch": 0.8082595870206489, + "grad_norm": 1.1516214609146118, + "learning_rate": 5.388622303905558e-07, + "loss": 0.0438, + "step": 2055 + }, + { + "epoch": 0.8086529006882989, + "grad_norm": 0.855049192905426, + "learning_rate": 5.367342491288186e-07, + "loss": 0.0389, + "step": 2056 + }, + { + "epoch": 0.8090462143559489, + "grad_norm": 0.8584917187690735, + "learning_rate": 5.346099725107213e-07, + "loss": 0.0686, + "step": 2057 + }, + { + "epoch": 0.8094395280235989, + "grad_norm": 1.1630586385726929, + "learning_rate": 5.324894045447312e-07, + "loss": 0.0361, + "step": 2058 + }, + { + "epoch": 0.8098328416912488, + "grad_norm": 1.2655314207077026, + "learning_rate": 5.303725492323194e-07, + "loss": 0.0284, + "step": 2059 + }, + { + "epoch": 0.8102261553588987, + "grad_norm": 1.1947369575500488, + "learning_rate": 5.282594105679481e-07, + "loss": 0.0562, + "step": 2060 + }, + { + "epoch": 0.8106194690265487, + "grad_norm": 0.7869384288787842, + "learning_rate": 5.261499925390692e-07, + "loss": 0.0407, + "step": 2061 + }, + { + "epoch": 0.8110127826941986, + "grad_norm": 1.6076072454452515, + "learning_rate": 5.240442991261127e-07, + "loss": 0.0384, + "step": 2062 + }, + { + "epoch": 0.8114060963618486, + "grad_norm": 2.237993001937866, + "learning_rate": 5.219423343024804e-07, + "loss": 0.0539, + "step": 2063 + }, + { + "epoch": 0.8117994100294985, + "grad_norm": 0.8259546756744385, + "learning_rate": 5.198441020345382e-07, + "loss": 0.0436, + "step": 2064 + }, + { + "epoch": 0.8121927236971485, + "grad_norm": 1.2509441375732422, + "learning_rate": 5.177496062816101e-07, + "loss": 0.0462, + "step": 2065 + }, + { + "epoch": 0.8125860373647984, + "grad_norm": 1.06137216091156, + "learning_rate": 5.156588509959659e-07, + "loss": 0.0339, + "step": 2066 + }, + { + "epoch": 0.8129793510324483, + "grad_norm": 0.7373847365379333, + "learning_rate": 5.13571840122821e-07, + "loss": 0.0301, + "step": 2067 + }, + { + "epoch": 0.8133726647000983, + "grad_norm": 1.1653954982757568, + "learning_rate": 5.114885776003234e-07, + "loss": 0.0427, + "step": 2068 + }, + { + "epoch": 0.8137659783677483, + "grad_norm": 1.518700122833252, + "learning_rate": 5.094090673595478e-07, + "loss": 0.0568, + "step": 2069 + }, + { + "epoch": 0.8141592920353983, + "grad_norm": 0.9491556286811829, + "learning_rate": 5.073333133244896e-07, + "loss": 0.0296, + "step": 2070 + }, + { + "epoch": 0.8145526057030482, + "grad_norm": 1.12187922000885, + "learning_rate": 5.052613194120554e-07, + "loss": 0.0625, + "step": 2071 + }, + { + "epoch": 0.8149459193706982, + "grad_norm": 0.9381184577941895, + "learning_rate": 5.031930895320569e-07, + "loss": 0.0318, + "step": 2072 + }, + { + "epoch": 0.8153392330383481, + "grad_norm": 0.8680362701416016, + "learning_rate": 5.011286275872021e-07, + "loss": 0.0631, + "step": 2073 + }, + { + "epoch": 0.815732546705998, + "grad_norm": 1.5543493032455444, + "learning_rate": 4.990679374730905e-07, + "loss": 0.0754, + "step": 2074 + }, + { + "epoch": 0.816125860373648, + "grad_norm": 1.3975200653076172, + "learning_rate": 4.970110230782035e-07, + "loss": 0.072, + "step": 2075 + }, + { + "epoch": 0.8165191740412979, + "grad_norm": 0.8037746548652649, + "learning_rate": 4.949578882838982e-07, + "loss": 0.0385, + "step": 2076 + }, + { + "epoch": 0.8169124877089479, + "grad_norm": 0.7833993434906006, + "learning_rate": 4.929085369643988e-07, + "loss": 0.0418, + "step": 2077 + }, + { + "epoch": 0.8173058013765978, + "grad_norm": 0.8177001476287842, + "learning_rate": 4.908629729867908e-07, + "loss": 0.0485, + "step": 2078 + }, + { + "epoch": 0.8176991150442477, + "grad_norm": 0.7933450937271118, + "learning_rate": 4.88821200211014e-07, + "loss": 0.0466, + "step": 2079 + }, + { + "epoch": 0.8180924287118977, + "grad_norm": 0.5968790054321289, + "learning_rate": 4.867832224898517e-07, + "loss": 0.0253, + "step": 2080 + }, + { + "epoch": 0.8184857423795477, + "grad_norm": 1.4022417068481445, + "learning_rate": 4.847490436689281e-07, + "loss": 0.0431, + "step": 2081 + }, + { + "epoch": 0.8188790560471977, + "grad_norm": 2.319401264190674, + "learning_rate": 4.827186675866985e-07, + "loss": 0.0493, + "step": 2082 + }, + { + "epoch": 0.8192723697148476, + "grad_norm": 1.0119627714157104, + "learning_rate": 4.806920980744426e-07, + "loss": 0.0606, + "step": 2083 + }, + { + "epoch": 0.8196656833824976, + "grad_norm": 1.2110787630081177, + "learning_rate": 4.786693389562566e-07, + "loss": 0.0582, + "step": 2084 + }, + { + "epoch": 0.8200589970501475, + "grad_norm": 0.7724167704582214, + "learning_rate": 4.7665039404904747e-07, + "loss": 0.0457, + "step": 2085 + }, + { + "epoch": 0.8204523107177975, + "grad_norm": 1.5843499898910522, + "learning_rate": 4.746352671625237e-07, + "loss": 0.0482, + "step": 2086 + }, + { + "epoch": 0.8208456243854474, + "grad_norm": 1.3220843076705933, + "learning_rate": 4.72623962099191e-07, + "loss": 0.0505, + "step": 2087 + }, + { + "epoch": 0.8212389380530973, + "grad_norm": 1.6696242094039917, + "learning_rate": 4.7061648265434053e-07, + "loss": 0.0587, + "step": 2088 + }, + { + "epoch": 0.8216322517207473, + "grad_norm": 1.341960072517395, + "learning_rate": 4.6861283261604745e-07, + "loss": 0.0781, + "step": 2089 + }, + { + "epoch": 0.8220255653883972, + "grad_norm": 1.6525554656982422, + "learning_rate": 4.666130157651594e-07, + "loss": 0.052, + "step": 2090 + }, + { + "epoch": 0.8224188790560472, + "grad_norm": 1.0084091424942017, + "learning_rate": 4.6461703587529106e-07, + "loss": 0.0354, + "step": 2091 + }, + { + "epoch": 0.8228121927236971, + "grad_norm": 0.8987352848052979, + "learning_rate": 4.62624896712818e-07, + "loss": 0.0351, + "step": 2092 + }, + { + "epoch": 0.8232055063913472, + "grad_norm": 1.0085314512252808, + "learning_rate": 4.6063660203686635e-07, + "loss": 0.0459, + "step": 2093 + }, + { + "epoch": 0.8235988200589971, + "grad_norm": 1.4987783432006836, + "learning_rate": 4.586521555993087e-07, + "loss": 0.0771, + "step": 2094 + }, + { + "epoch": 0.823992133726647, + "grad_norm": 1.5976486206054688, + "learning_rate": 4.5667156114475695e-07, + "loss": 0.0766, + "step": 2095 + }, + { + "epoch": 0.824385447394297, + "grad_norm": 0.9721060395240784, + "learning_rate": 4.5469482241055324e-07, + "loss": 0.0514, + "step": 2096 + }, + { + "epoch": 0.8247787610619469, + "grad_norm": 0.835397481918335, + "learning_rate": 4.527219431267646e-07, + "loss": 0.0352, + "step": 2097 + }, + { + "epoch": 0.8251720747295969, + "grad_norm": 1.1280697584152222, + "learning_rate": 4.507529270161759e-07, + "loss": 0.0712, + "step": 2098 + }, + { + "epoch": 0.8255653883972468, + "grad_norm": 1.8154939413070679, + "learning_rate": 4.4878777779428034e-07, + "loss": 0.0918, + "step": 2099 + }, + { + "epoch": 0.8259587020648967, + "grad_norm": 1.067765474319458, + "learning_rate": 4.4682649916927614e-07, + "loss": 0.0357, + "step": 2100 + }, + { + "epoch": 0.8263520157325467, + "grad_norm": 1.0095484256744385, + "learning_rate": 4.4486909484205725e-07, + "loss": 0.0315, + "step": 2101 + }, + { + "epoch": 0.8267453294001966, + "grad_norm": 1.7903807163238525, + "learning_rate": 4.429155685062073e-07, + "loss": 0.0598, + "step": 2102 + }, + { + "epoch": 0.8271386430678466, + "grad_norm": 1.5948070287704468, + "learning_rate": 4.409659238479919e-07, + "loss": 0.0408, + "step": 2103 + }, + { + "epoch": 0.8275319567354965, + "grad_norm": 0.805156946182251, + "learning_rate": 4.39020164546351e-07, + "loss": 0.0448, + "step": 2104 + }, + { + "epoch": 0.8279252704031466, + "grad_norm": 0.4440039098262787, + "learning_rate": 4.370782942728946e-07, + "loss": 0.0279, + "step": 2105 + }, + { + "epoch": 0.8283185840707965, + "grad_norm": 0.9887676239013672, + "learning_rate": 4.3514031669189325e-07, + "loss": 0.0706, + "step": 2106 + }, + { + "epoch": 0.8287118977384464, + "grad_norm": 1.1825933456420898, + "learning_rate": 4.3320623546027283e-07, + "loss": 0.0608, + "step": 2107 + }, + { + "epoch": 0.8291052114060964, + "grad_norm": 1.8713337182998657, + "learning_rate": 4.312760542276059e-07, + "loss": 0.049, + "step": 2108 + }, + { + "epoch": 0.8294985250737463, + "grad_norm": 0.9182631969451904, + "learning_rate": 4.293497766361068e-07, + "loss": 0.0436, + "step": 2109 + }, + { + "epoch": 0.8298918387413963, + "grad_norm": 1.1083096265792847, + "learning_rate": 4.2742740632062243e-07, + "loss": 0.0483, + "step": 2110 + }, + { + "epoch": 0.8302851524090462, + "grad_norm": 2.0837628841400146, + "learning_rate": 4.255089469086279e-07, + "loss": 0.0663, + "step": 2111 + }, + { + "epoch": 0.8306784660766962, + "grad_norm": 1.2065215110778809, + "learning_rate": 4.235944020202182e-07, + "loss": 0.0673, + "step": 2112 + }, + { + "epoch": 0.8310717797443461, + "grad_norm": 1.3495663404464722, + "learning_rate": 4.216837752681019e-07, + "loss": 0.0589, + "step": 2113 + }, + { + "epoch": 0.831465093411996, + "grad_norm": 0.8407555818557739, + "learning_rate": 4.19777070257594e-07, + "loss": 0.0309, + "step": 2114 + }, + { + "epoch": 0.831858407079646, + "grad_norm": 0.9763451814651489, + "learning_rate": 4.1787429058660845e-07, + "loss": 0.0231, + "step": 2115 + }, + { + "epoch": 0.8322517207472959, + "grad_norm": 1.1487807035446167, + "learning_rate": 4.159754398456531e-07, + "loss": 0.0582, + "step": 2116 + }, + { + "epoch": 0.832645034414946, + "grad_norm": 0.9778567552566528, + "learning_rate": 4.14080521617822e-07, + "loss": 0.0349, + "step": 2117 + }, + { + "epoch": 0.8330383480825959, + "grad_norm": 1.1251294612884521, + "learning_rate": 4.121895394787881e-07, + "loss": 0.0608, + "step": 2118 + }, + { + "epoch": 0.8334316617502459, + "grad_norm": 0.8375036716461182, + "learning_rate": 4.103024969967981e-07, + "loss": 0.0406, + "step": 2119 + }, + { + "epoch": 0.8338249754178958, + "grad_norm": 1.1409391164779663, + "learning_rate": 4.084193977326625e-07, + "loss": 0.0545, + "step": 2120 + }, + { + "epoch": 0.8342182890855457, + "grad_norm": 1.0144537687301636, + "learning_rate": 4.0654024523975323e-07, + "loss": 0.076, + "step": 2121 + }, + { + "epoch": 0.8346116027531957, + "grad_norm": 1.7752301692962646, + "learning_rate": 4.0466504306399366e-07, + "loss": 0.0647, + "step": 2122 + }, + { + "epoch": 0.8350049164208456, + "grad_norm": 1.1848422288894653, + "learning_rate": 4.027937947438532e-07, + "loss": 0.0642, + "step": 2123 + }, + { + "epoch": 0.8353982300884956, + "grad_norm": 0.8530738353729248, + "learning_rate": 4.009265038103402e-07, + "loss": 0.0407, + "step": 2124 + }, + { + "epoch": 0.8357915437561455, + "grad_norm": 0.9213998317718506, + "learning_rate": 3.9906317378699684e-07, + "loss": 0.0306, + "step": 2125 + }, + { + "epoch": 0.8361848574237954, + "grad_norm": 0.8134070038795471, + "learning_rate": 3.972038081898885e-07, + "loss": 0.0378, + "step": 2126 + }, + { + "epoch": 0.8365781710914454, + "grad_norm": 1.0904289484024048, + "learning_rate": 3.9534841052760174e-07, + "loss": 0.032, + "step": 2127 + }, + { + "epoch": 0.8369714847590953, + "grad_norm": 2.0691423416137695, + "learning_rate": 3.9349698430123566e-07, + "loss": 0.0737, + "step": 2128 + }, + { + "epoch": 0.8373647984267454, + "grad_norm": 1.1641324758529663, + "learning_rate": 3.9164953300439456e-07, + "loss": 0.0546, + "step": 2129 + }, + { + "epoch": 0.8377581120943953, + "grad_norm": 0.9116164445877075, + "learning_rate": 3.898060601231832e-07, + "loss": 0.0533, + "step": 2130 + }, + { + "epoch": 0.8381514257620453, + "grad_norm": 1.0761325359344482, + "learning_rate": 3.879665691361975e-07, + "loss": 0.0465, + "step": 2131 + }, + { + "epoch": 0.8385447394296952, + "grad_norm": 1.2517597675323486, + "learning_rate": 3.861310635145207e-07, + "loss": 0.0509, + "step": 2132 + }, + { + "epoch": 0.8389380530973451, + "grad_norm": 0.7470773458480835, + "learning_rate": 3.8429954672171613e-07, + "loss": 0.0452, + "step": 2133 + }, + { + "epoch": 0.8393313667649951, + "grad_norm": 1.572190284729004, + "learning_rate": 3.824720222138192e-07, + "loss": 0.0388, + "step": 2134 + }, + { + "epoch": 0.839724680432645, + "grad_norm": 1.1324615478515625, + "learning_rate": 3.806484934393331e-07, + "loss": 0.0696, + "step": 2135 + }, + { + "epoch": 0.840117994100295, + "grad_norm": 1.03518807888031, + "learning_rate": 3.788289638392206e-07, + "loss": 0.0333, + "step": 2136 + }, + { + "epoch": 0.8405113077679449, + "grad_norm": 1.2855054140090942, + "learning_rate": 3.7701343684689725e-07, + "loss": 0.0573, + "step": 2137 + }, + { + "epoch": 0.8409046214355949, + "grad_norm": 1.5672320127487183, + "learning_rate": 3.7520191588822695e-07, + "loss": 0.0618, + "step": 2138 + }, + { + "epoch": 0.8412979351032448, + "grad_norm": 1.3046908378601074, + "learning_rate": 3.7339440438151383e-07, + "loss": 0.0633, + "step": 2139 + }, + { + "epoch": 0.8416912487708947, + "grad_norm": 0.9728895425796509, + "learning_rate": 3.7159090573749693e-07, + "loss": 0.0287, + "step": 2140 + }, + { + "epoch": 0.8420845624385448, + "grad_norm": 1.4470866918563843, + "learning_rate": 3.6979142335934246e-07, + "loss": 0.0439, + "step": 2141 + }, + { + "epoch": 0.8424778761061947, + "grad_norm": 0.802937924861908, + "learning_rate": 3.67995960642637e-07, + "loss": 0.0316, + "step": 2142 + }, + { + "epoch": 0.8428711897738447, + "grad_norm": 0.8089593052864075, + "learning_rate": 3.6620452097538424e-07, + "loss": 0.0506, + "step": 2143 + }, + { + "epoch": 0.8432645034414946, + "grad_norm": 0.9571702480316162, + "learning_rate": 3.644171077379949e-07, + "loss": 0.0273, + "step": 2144 + }, + { + "epoch": 0.8436578171091446, + "grad_norm": 1.022767186164856, + "learning_rate": 3.6263372430328266e-07, + "loss": 0.0497, + "step": 2145 + }, + { + "epoch": 0.8440511307767945, + "grad_norm": 1.133183479309082, + "learning_rate": 3.6085437403645645e-07, + "loss": 0.0375, + "step": 2146 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 1.603365421295166, + "learning_rate": 3.5907906029511606e-07, + "loss": 0.0535, + "step": 2147 + }, + { + "epoch": 0.8448377581120944, + "grad_norm": 1.052833080291748, + "learning_rate": 3.573077864292421e-07, + "loss": 0.0419, + "step": 2148 + }, + { + "epoch": 0.8452310717797443, + "grad_norm": 0.8957949280738831, + "learning_rate": 3.555405557811936e-07, + "loss": 0.054, + "step": 2149 + }, + { + "epoch": 0.8456243854473943, + "grad_norm": 1.3401049375534058, + "learning_rate": 3.537773716857004e-07, + "loss": 0.0558, + "step": 2150 + }, + { + "epoch": 0.8460176991150442, + "grad_norm": 1.3811299800872803, + "learning_rate": 3.5201823746985554e-07, + "loss": 0.0436, + "step": 2151 + }, + { + "epoch": 0.8464110127826941, + "grad_norm": 1.3221920728683472, + "learning_rate": 3.5026315645311114e-07, + "loss": 0.0679, + "step": 2152 + }, + { + "epoch": 0.8468043264503442, + "grad_norm": 0.608182966709137, + "learning_rate": 3.485121319472695e-07, + "loss": 0.0624, + "step": 2153 + }, + { + "epoch": 0.8471976401179941, + "grad_norm": 0.8964172601699829, + "learning_rate": 3.4676516725647953e-07, + "loss": 0.0394, + "step": 2154 + }, + { + "epoch": 0.8475909537856441, + "grad_norm": 0.7584964632987976, + "learning_rate": 3.450222656772292e-07, + "loss": 0.0484, + "step": 2155 + }, + { + "epoch": 0.847984267453294, + "grad_norm": 0.3789440095424652, + "learning_rate": 3.43283430498339e-07, + "loss": 0.0277, + "step": 2156 + }, + { + "epoch": 0.848377581120944, + "grad_norm": 0.7871941924095154, + "learning_rate": 3.4154866500095695e-07, + "loss": 0.0493, + "step": 2157 + }, + { + "epoch": 0.8487708947885939, + "grad_norm": 1.302708625793457, + "learning_rate": 3.3981797245855096e-07, + "loss": 0.0799, + "step": 2158 + }, + { + "epoch": 0.8491642084562439, + "grad_norm": 0.7635212540626526, + "learning_rate": 3.380913561369037e-07, + "loss": 0.0427, + "step": 2159 + }, + { + "epoch": 0.8495575221238938, + "grad_norm": 0.8605564832687378, + "learning_rate": 3.363688192941067e-07, + "loss": 0.0462, + "step": 2160 + }, + { + "epoch": 0.8499508357915437, + "grad_norm": 0.9630613923072815, + "learning_rate": 3.346503651805513e-07, + "loss": 0.0637, + "step": 2161 + }, + { + "epoch": 0.8503441494591937, + "grad_norm": 1.0170080661773682, + "learning_rate": 3.329359970389279e-07, + "loss": 0.061, + "step": 2162 + }, + { + "epoch": 0.8507374631268436, + "grad_norm": 0.8377442359924316, + "learning_rate": 3.312257181042142e-07, + "loss": 0.0449, + "step": 2163 + }, + { + "epoch": 0.8511307767944936, + "grad_norm": 0.9564546346664429, + "learning_rate": 3.2951953160367365e-07, + "loss": 0.0496, + "step": 2164 + }, + { + "epoch": 0.8515240904621436, + "grad_norm": 0.5969823002815247, + "learning_rate": 3.2781744075684576e-07, + "loss": 0.0404, + "step": 2165 + }, + { + "epoch": 0.8519174041297936, + "grad_norm": 1.0183027982711792, + "learning_rate": 3.261194487755426e-07, + "loss": 0.0563, + "step": 2166 + }, + { + "epoch": 0.8523107177974435, + "grad_norm": 1.3610613346099854, + "learning_rate": 3.2442555886384145e-07, + "loss": 0.0791, + "step": 2167 + }, + { + "epoch": 0.8527040314650934, + "grad_norm": 0.7566685080528259, + "learning_rate": 3.2273577421807976e-07, + "loss": 0.0415, + "step": 2168 + }, + { + "epoch": 0.8530973451327434, + "grad_norm": 1.1211597919464111, + "learning_rate": 3.2105009802684636e-07, + "loss": 0.0874, + "step": 2169 + }, + { + "epoch": 0.8534906588003933, + "grad_norm": 1.6669408082962036, + "learning_rate": 3.1936853347097923e-07, + "loss": 0.0521, + "step": 2170 + }, + { + "epoch": 0.8538839724680433, + "grad_norm": 0.9726613163948059, + "learning_rate": 3.1769108372355804e-07, + "loss": 0.0457, + "step": 2171 + }, + { + "epoch": 0.8542772861356932, + "grad_norm": 1.5157469511032104, + "learning_rate": 3.1601775194989693e-07, + "loss": 0.0574, + "step": 2172 + }, + { + "epoch": 0.8546705998033431, + "grad_norm": 2.319978713989258, + "learning_rate": 3.143485413075398e-07, + "loss": 0.0604, + "step": 2173 + }, + { + "epoch": 0.8550639134709931, + "grad_norm": 1.160510778427124, + "learning_rate": 3.1268345494625486e-07, + "loss": 0.0454, + "step": 2174 + }, + { + "epoch": 0.855457227138643, + "grad_norm": 1.0284311771392822, + "learning_rate": 3.1102249600802573e-07, + "loss": 0.0375, + "step": 2175 + }, + { + "epoch": 0.855850540806293, + "grad_norm": 0.7068095207214355, + "learning_rate": 3.093656676270501e-07, + "loss": 0.0409, + "step": 2176 + }, + { + "epoch": 0.856243854473943, + "grad_norm": 0.8698954582214355, + "learning_rate": 3.0771297292972986e-07, + "loss": 0.0547, + "step": 2177 + }, + { + "epoch": 0.856637168141593, + "grad_norm": 0.7371048331260681, + "learning_rate": 3.0606441503466753e-07, + "loss": 0.0661, + "step": 2178 + }, + { + "epoch": 0.8570304818092429, + "grad_norm": 0.6116827726364136, + "learning_rate": 3.044199970526593e-07, + "loss": 0.0199, + "step": 2179 + }, + { + "epoch": 0.8574237954768928, + "grad_norm": 0.9910300374031067, + "learning_rate": 3.027797220866896e-07, + "loss": 0.0454, + "step": 2180 + }, + { + "epoch": 0.8578171091445428, + "grad_norm": 0.9253597855567932, + "learning_rate": 3.01143593231924e-07, + "loss": 0.0465, + "step": 2181 + }, + { + "epoch": 0.8582104228121927, + "grad_norm": 0.6476548314094543, + "learning_rate": 2.995116135757059e-07, + "loss": 0.0385, + "step": 2182 + }, + { + "epoch": 0.8586037364798427, + "grad_norm": 0.8749169707298279, + "learning_rate": 2.978837861975484e-07, + "loss": 0.0474, + "step": 2183 + }, + { + "epoch": 0.8589970501474926, + "grad_norm": 1.4006898403167725, + "learning_rate": 2.962601141691296e-07, + "loss": 0.0511, + "step": 2184 + }, + { + "epoch": 0.8593903638151426, + "grad_norm": 0.8508985638618469, + "learning_rate": 2.9464060055428703e-07, + "loss": 0.0549, + "step": 2185 + }, + { + "epoch": 0.8597836774827925, + "grad_norm": 1.1002285480499268, + "learning_rate": 2.930252484090101e-07, + "loss": 0.0283, + "step": 2186 + }, + { + "epoch": 0.8601769911504424, + "grad_norm": 0.8702027201652527, + "learning_rate": 2.9141406078143644e-07, + "loss": 0.0605, + "step": 2187 + }, + { + "epoch": 0.8605703048180924, + "grad_norm": 0.79606693983078, + "learning_rate": 2.8980704071184557e-07, + "loss": 0.0598, + "step": 2188 + }, + { + "epoch": 0.8609636184857424, + "grad_norm": 1.1964335441589355, + "learning_rate": 2.882041912326525e-07, + "loss": 0.046, + "step": 2189 + }, + { + "epoch": 0.8613569321533924, + "grad_norm": 1.1686105728149414, + "learning_rate": 2.8660551536840277e-07, + "loss": 0.0329, + "step": 2190 + }, + { + "epoch": 0.8617502458210423, + "grad_norm": 0.858632504940033, + "learning_rate": 2.8501101613576526e-07, + "loss": 0.0661, + "step": 2191 + }, + { + "epoch": 0.8621435594886923, + "grad_norm": 0.984893262386322, + "learning_rate": 2.834206965435293e-07, + "loss": 0.0351, + "step": 2192 + }, + { + "epoch": 0.8625368731563422, + "grad_norm": 1.3127596378326416, + "learning_rate": 2.818345595925959e-07, + "loss": 0.0387, + "step": 2193 + }, + { + "epoch": 0.8629301868239921, + "grad_norm": 1.4564718008041382, + "learning_rate": 2.8025260827597463e-07, + "loss": 0.0424, + "step": 2194 + }, + { + "epoch": 0.8633235004916421, + "grad_norm": 0.5872806310653687, + "learning_rate": 2.7867484557877607e-07, + "loss": 0.0414, + "step": 2195 + }, + { + "epoch": 0.863716814159292, + "grad_norm": 1.0555849075317383, + "learning_rate": 2.7710127447820783e-07, + "loss": 0.0519, + "step": 2196 + }, + { + "epoch": 0.864110127826942, + "grad_norm": 1.0422883033752441, + "learning_rate": 2.7553189794356615e-07, + "loss": 0.0562, + "step": 2197 + }, + { + "epoch": 0.8645034414945919, + "grad_norm": 1.2551977634429932, + "learning_rate": 2.739667189362347e-07, + "loss": 0.0344, + "step": 2198 + }, + { + "epoch": 0.8648967551622418, + "grad_norm": 1.0713584423065186, + "learning_rate": 2.724057404096744e-07, + "loss": 0.0385, + "step": 2199 + }, + { + "epoch": 0.8652900688298918, + "grad_norm": 0.6667132377624512, + "learning_rate": 2.708489653094218e-07, + "loss": 0.0525, + "step": 2200 + }, + { + "epoch": 0.8656833824975418, + "grad_norm": 0.9178755283355713, + "learning_rate": 2.692963965730805e-07, + "loss": 0.0722, + "step": 2201 + }, + { + "epoch": 0.8660766961651918, + "grad_norm": 1.2695622444152832, + "learning_rate": 2.677480371303162e-07, + "loss": 0.0759, + "step": 2202 + }, + { + "epoch": 0.8664700098328417, + "grad_norm": 1.1370331048965454, + "learning_rate": 2.662038899028532e-07, + "loss": 0.0396, + "step": 2203 + }, + { + "epoch": 0.8668633235004917, + "grad_norm": 0.6956948041915894, + "learning_rate": 2.6466395780446657e-07, + "loss": 0.062, + "step": 2204 + }, + { + "epoch": 0.8672566371681416, + "grad_norm": 0.5956060886383057, + "learning_rate": 2.6312824374097794e-07, + "loss": 0.049, + "step": 2205 + }, + { + "epoch": 0.8676499508357916, + "grad_norm": 3.8347904682159424, + "learning_rate": 2.6159675061024905e-07, + "loss": 0.0654, + "step": 2206 + }, + { + "epoch": 0.8680432645034415, + "grad_norm": 1.0327752828598022, + "learning_rate": 2.6006948130217815e-07, + "loss": 0.024, + "step": 2207 + }, + { + "epoch": 0.8684365781710914, + "grad_norm": 1.1763917207717896, + "learning_rate": 2.585464386986908e-07, + "loss": 0.0487, + "step": 2208 + }, + { + "epoch": 0.8688298918387414, + "grad_norm": 1.6335638761520386, + "learning_rate": 2.570276256737386e-07, + "loss": 0.0451, + "step": 2209 + }, + { + "epoch": 0.8692232055063913, + "grad_norm": 1.1163750886917114, + "learning_rate": 2.555130450932922e-07, + "loss": 0.072, + "step": 2210 + }, + { + "epoch": 0.8696165191740413, + "grad_norm": 1.2412861585617065, + "learning_rate": 2.54002699815335e-07, + "loss": 0.0541, + "step": 2211 + }, + { + "epoch": 0.8700098328416912, + "grad_norm": 0.9547197222709656, + "learning_rate": 2.52496592689859e-07, + "loss": 0.04, + "step": 2212 + }, + { + "epoch": 0.8704031465093413, + "grad_norm": 1.4851540327072144, + "learning_rate": 2.5099472655885777e-07, + "loss": 0.0602, + "step": 2213 + }, + { + "epoch": 0.8707964601769912, + "grad_norm": 0.9040324687957764, + "learning_rate": 2.4949710425632353e-07, + "loss": 0.0395, + "step": 2214 + }, + { + "epoch": 0.8711897738446411, + "grad_norm": 1.1058231592178345, + "learning_rate": 2.4800372860823956e-07, + "loss": 0.0472, + "step": 2215 + }, + { + "epoch": 0.8715830875122911, + "grad_norm": 0.814282238483429, + "learning_rate": 2.465146024325765e-07, + "loss": 0.0541, + "step": 2216 + }, + { + "epoch": 0.871976401179941, + "grad_norm": 0.9722008109092712, + "learning_rate": 2.4502972853928606e-07, + "loss": 0.0581, + "step": 2217 + }, + { + "epoch": 0.872369714847591, + "grad_norm": 0.9943141341209412, + "learning_rate": 2.435491097302961e-07, + "loss": 0.0435, + "step": 2218 + }, + { + "epoch": 0.8727630285152409, + "grad_norm": 1.2543455362319946, + "learning_rate": 2.420727487995045e-07, + "loss": 0.0613, + "step": 2219 + }, + { + "epoch": 0.8731563421828908, + "grad_norm": 0.8473043441772461, + "learning_rate": 2.40600648532775e-07, + "loss": 0.0391, + "step": 2220 + }, + { + "epoch": 0.8735496558505408, + "grad_norm": 1.0976766347885132, + "learning_rate": 2.3913281170793196e-07, + "loss": 0.0341, + "step": 2221 + }, + { + "epoch": 0.8739429695181907, + "grad_norm": 0.765153169631958, + "learning_rate": 2.376692410947548e-07, + "loss": 0.0335, + "step": 2222 + }, + { + "epoch": 0.8743362831858407, + "grad_norm": 1.2966009378433228, + "learning_rate": 2.3620993945497217e-07, + "loss": 0.0571, + "step": 2223 + }, + { + "epoch": 0.8747295968534906, + "grad_norm": 1.0903987884521484, + "learning_rate": 2.347549095422569e-07, + "loss": 0.0602, + "step": 2224 + }, + { + "epoch": 0.8751229105211407, + "grad_norm": 0.9129044413566589, + "learning_rate": 2.3330415410222212e-07, + "loss": 0.0508, + "step": 2225 + }, + { + "epoch": 0.8755162241887906, + "grad_norm": 1.3771973848342896, + "learning_rate": 2.3185767587241447e-07, + "loss": 0.0282, + "step": 2226 + }, + { + "epoch": 0.8759095378564405, + "grad_norm": 1.1595170497894287, + "learning_rate": 2.3041547758230977e-07, + "loss": 0.0768, + "step": 2227 + }, + { + "epoch": 0.8763028515240905, + "grad_norm": 0.7576168775558472, + "learning_rate": 2.2897756195330773e-07, + "loss": 0.0296, + "step": 2228 + }, + { + "epoch": 0.8766961651917404, + "grad_norm": 1.2020797729492188, + "learning_rate": 2.2754393169872685e-07, + "loss": 0.0392, + "step": 2229 + }, + { + "epoch": 0.8770894788593904, + "grad_norm": 1.2221319675445557, + "learning_rate": 2.2611458952379872e-07, + "loss": 0.0319, + "step": 2230 + }, + { + "epoch": 0.8774827925270403, + "grad_norm": 1.1023682355880737, + "learning_rate": 2.246895381256639e-07, + "loss": 0.0523, + "step": 2231 + }, + { + "epoch": 0.8778761061946903, + "grad_norm": 1.0071845054626465, + "learning_rate": 2.232687801933664e-07, + "loss": 0.034, + "step": 2232 + }, + { + "epoch": 0.8782694198623402, + "grad_norm": 0.8645428419113159, + "learning_rate": 2.2185231840784778e-07, + "loss": 0.0628, + "step": 2233 + }, + { + "epoch": 0.8786627335299901, + "grad_norm": 0.6460661292076111, + "learning_rate": 2.204401554419444e-07, + "loss": 0.045, + "step": 2234 + }, + { + "epoch": 0.8790560471976401, + "grad_norm": 1.7761812210083008, + "learning_rate": 2.1903229396037896e-07, + "loss": 0.0739, + "step": 2235 + }, + { + "epoch": 0.87944936086529, + "grad_norm": 1.3595634698867798, + "learning_rate": 2.1762873661975825e-07, + "loss": 0.041, + "step": 2236 + }, + { + "epoch": 0.8798426745329401, + "grad_norm": 0.8807711601257324, + "learning_rate": 2.1622948606856765e-07, + "loss": 0.0623, + "step": 2237 + }, + { + "epoch": 0.88023598820059, + "grad_norm": 1.0638388395309448, + "learning_rate": 2.1483454494716504e-07, + "loss": 0.0337, + "step": 2238 + }, + { + "epoch": 0.88062930186824, + "grad_norm": 0.9859362244606018, + "learning_rate": 2.1344391588777658e-07, + "loss": 0.0389, + "step": 2239 + }, + { + "epoch": 0.8810226155358899, + "grad_norm": 1.0022567510604858, + "learning_rate": 2.1205760151449206e-07, + "loss": 0.0358, + "step": 2240 + }, + { + "epoch": 0.8814159292035398, + "grad_norm": 0.8748469948768616, + "learning_rate": 2.106756044432598e-07, + "loss": 0.0367, + "step": 2241 + }, + { + "epoch": 0.8818092428711898, + "grad_norm": 1.0613561868667603, + "learning_rate": 2.0929792728187986e-07, + "loss": 0.0608, + "step": 2242 + }, + { + "epoch": 0.8822025565388397, + "grad_norm": 1.8184490203857422, + "learning_rate": 2.079245726300022e-07, + "loss": 0.0586, + "step": 2243 + }, + { + "epoch": 0.8825958702064897, + "grad_norm": 1.0881813764572144, + "learning_rate": 2.0655554307911997e-07, + "loss": 0.0603, + "step": 2244 + }, + { + "epoch": 0.8829891838741396, + "grad_norm": 1.0074139833450317, + "learning_rate": 2.05190841212565e-07, + "loss": 0.0666, + "step": 2245 + }, + { + "epoch": 0.8833824975417895, + "grad_norm": 1.1435564756393433, + "learning_rate": 2.038304696055024e-07, + "loss": 0.0312, + "step": 2246 + }, + { + "epoch": 0.8837758112094395, + "grad_norm": 0.6284701228141785, + "learning_rate": 2.0247443082492686e-07, + "loss": 0.0235, + "step": 2247 + }, + { + "epoch": 0.8841691248770894, + "grad_norm": 1.6139885187149048, + "learning_rate": 2.0112272742965678e-07, + "loss": 0.0262, + "step": 2248 + }, + { + "epoch": 0.8845624385447395, + "grad_norm": 0.8762457966804504, + "learning_rate": 1.997753619703291e-07, + "loss": 0.0431, + "step": 2249 + }, + { + "epoch": 0.8849557522123894, + "grad_norm": 1.287406086921692, + "learning_rate": 1.9843233698939617e-07, + "loss": 0.0457, + "step": 2250 + }, + { + "epoch": 0.8853490658800394, + "grad_norm": 1.3118491172790527, + "learning_rate": 1.9709365502111944e-07, + "loss": 0.0487, + "step": 2251 + }, + { + "epoch": 0.8857423795476893, + "grad_norm": 0.8101546764373779, + "learning_rate": 1.957593185915657e-07, + "loss": 0.0458, + "step": 2252 + }, + { + "epoch": 0.8861356932153392, + "grad_norm": 1.5364015102386475, + "learning_rate": 1.9442933021860095e-07, + "loss": 0.0407, + "step": 2253 + }, + { + "epoch": 0.8865290068829892, + "grad_norm": 0.9168291091918945, + "learning_rate": 1.9310369241188732e-07, + "loss": 0.0474, + "step": 2254 + }, + { + "epoch": 0.8869223205506391, + "grad_norm": 1.0423481464385986, + "learning_rate": 1.9178240767287666e-07, + "loss": 0.035, + "step": 2255 + }, + { + "epoch": 0.8873156342182891, + "grad_norm": 0.995087742805481, + "learning_rate": 1.904654784948079e-07, + "loss": 0.0596, + "step": 2256 + }, + { + "epoch": 0.887708947885939, + "grad_norm": 1.1472982168197632, + "learning_rate": 1.8915290736269965e-07, + "loss": 0.069, + "step": 2257 + }, + { + "epoch": 0.888102261553589, + "grad_norm": 0.7572572231292725, + "learning_rate": 1.878446967533476e-07, + "loss": 0.061, + "step": 2258 + }, + { + "epoch": 0.8884955752212389, + "grad_norm": 0.5118011832237244, + "learning_rate": 1.865408491353199e-07, + "loss": 0.0313, + "step": 2259 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.8399426937103271, + "learning_rate": 1.8524136696895068e-07, + "loss": 0.0444, + "step": 2260 + }, + { + "epoch": 0.8892822025565389, + "grad_norm": 0.8290569186210632, + "learning_rate": 1.8394625270633793e-07, + "loss": 0.0384, + "step": 2261 + }, + { + "epoch": 0.8896755162241888, + "grad_norm": 1.0309621095657349, + "learning_rate": 1.8265550879133538e-07, + "loss": 0.0522, + "step": 2262 + }, + { + "epoch": 0.8900688298918388, + "grad_norm": 2.102466583251953, + "learning_rate": 1.8136913765955195e-07, + "loss": 0.0684, + "step": 2263 + }, + { + "epoch": 0.8904621435594887, + "grad_norm": 0.9560519456863403, + "learning_rate": 1.8008714173834456e-07, + "loss": 0.0411, + "step": 2264 + }, + { + "epoch": 0.8908554572271387, + "grad_norm": 0.7714261412620544, + "learning_rate": 1.7880952344681402e-07, + "loss": 0.0393, + "step": 2265 + }, + { + "epoch": 0.8912487708947886, + "grad_norm": 2.210777521133423, + "learning_rate": 1.7753628519580097e-07, + "loss": 0.0531, + "step": 2266 + }, + { + "epoch": 0.8916420845624385, + "grad_norm": 1.3124444484710693, + "learning_rate": 1.7626742938788105e-07, + "loss": 0.0808, + "step": 2267 + }, + { + "epoch": 0.8920353982300885, + "grad_norm": 0.8876106142997742, + "learning_rate": 1.7500295841735905e-07, + "loss": 0.0299, + "step": 2268 + }, + { + "epoch": 0.8924287118977384, + "grad_norm": 0.9470813870429993, + "learning_rate": 1.7374287467026767e-07, + "loss": 0.0289, + "step": 2269 + }, + { + "epoch": 0.8928220255653884, + "grad_norm": 1.1278401613235474, + "learning_rate": 1.7248718052435942e-07, + "loss": 0.0557, + "step": 2270 + }, + { + "epoch": 0.8932153392330383, + "grad_norm": 1.0883233547210693, + "learning_rate": 1.712358783491047e-07, + "loss": 0.0493, + "step": 2271 + }, + { + "epoch": 0.8936086529006882, + "grad_norm": 1.8595354557037354, + "learning_rate": 1.6998897050568618e-07, + "loss": 0.0583, + "step": 2272 + }, + { + "epoch": 0.8940019665683383, + "grad_norm": 1.1858155727386475, + "learning_rate": 1.6874645934699342e-07, + "loss": 0.0406, + "step": 2273 + }, + { + "epoch": 0.8943952802359882, + "grad_norm": 0.8429166674613953, + "learning_rate": 1.6750834721762117e-07, + "loss": 0.0575, + "step": 2274 + }, + { + "epoch": 0.8947885939036382, + "grad_norm": 1.4577648639678955, + "learning_rate": 1.6627463645386199e-07, + "loss": 0.0412, + "step": 2275 + }, + { + "epoch": 0.8951819075712881, + "grad_norm": 0.6947933435440063, + "learning_rate": 1.6504532938370427e-07, + "loss": 0.0465, + "step": 2276 + }, + { + "epoch": 0.8955752212389381, + "grad_norm": 0.8350834846496582, + "learning_rate": 1.6382042832682577e-07, + "loss": 0.0438, + "step": 2277 + }, + { + "epoch": 0.895968534906588, + "grad_norm": 1.2530003786087036, + "learning_rate": 1.6259993559459091e-07, + "loss": 0.0415, + "step": 2278 + }, + { + "epoch": 0.896361848574238, + "grad_norm": 1.0597574710845947, + "learning_rate": 1.613838534900447e-07, + "loss": 0.0399, + "step": 2279 + }, + { + "epoch": 0.8967551622418879, + "grad_norm": 0.8264654278755188, + "learning_rate": 1.601721843079107e-07, + "loss": 0.0348, + "step": 2280 + }, + { + "epoch": 0.8971484759095378, + "grad_norm": 0.8567057251930237, + "learning_rate": 1.5896493033458416e-07, + "loss": 0.029, + "step": 2281 + }, + { + "epoch": 0.8975417895771878, + "grad_norm": 1.390363335609436, + "learning_rate": 1.5776209384812946e-07, + "loss": 0.0815, + "step": 2282 + }, + { + "epoch": 0.8979351032448377, + "grad_norm": 0.9575844407081604, + "learning_rate": 1.5656367711827602e-07, + "loss": 0.0526, + "step": 2283 + }, + { + "epoch": 0.8983284169124877, + "grad_norm": 0.7833372950553894, + "learning_rate": 1.553696824064116e-07, + "loss": 0.0329, + "step": 2284 + }, + { + "epoch": 0.8987217305801377, + "grad_norm": 0.8829760551452637, + "learning_rate": 1.5418011196558085e-07, + "loss": 0.0395, + "step": 2285 + }, + { + "epoch": 0.8991150442477877, + "grad_norm": 1.0580815076828003, + "learning_rate": 1.529949680404799e-07, + "loss": 0.0648, + "step": 2286 + }, + { + "epoch": 0.8995083579154376, + "grad_norm": 1.051527738571167, + "learning_rate": 1.5181425286745155e-07, + "loss": 0.0618, + "step": 2287 + }, + { + "epoch": 0.8999016715830875, + "grad_norm": 1.5211282968521118, + "learning_rate": 1.5063796867448243e-07, + "loss": 0.047, + "step": 2288 + }, + { + "epoch": 0.9002949852507375, + "grad_norm": 0.3931565582752228, + "learning_rate": 1.4946611768119763e-07, + "loss": 0.0371, + "step": 2289 + }, + { + "epoch": 0.9006882989183874, + "grad_norm": 0.40819835662841797, + "learning_rate": 1.4829870209885605e-07, + "loss": 0.0399, + "step": 2290 + }, + { + "epoch": 0.9010816125860374, + "grad_norm": 1.5606259107589722, + "learning_rate": 1.471357241303481e-07, + "loss": 0.0537, + "step": 2291 + }, + { + "epoch": 0.9014749262536873, + "grad_norm": 0.4650862514972687, + "learning_rate": 1.4597718597019055e-07, + "loss": 0.0169, + "step": 2292 + }, + { + "epoch": 0.9018682399213372, + "grad_norm": 0.8470922112464905, + "learning_rate": 1.4482308980452164e-07, + "loss": 0.0308, + "step": 2293 + }, + { + "epoch": 0.9022615535889872, + "grad_norm": 1.1515922546386719, + "learning_rate": 1.436734378110985e-07, + "loss": 0.0459, + "step": 2294 + }, + { + "epoch": 0.9026548672566371, + "grad_norm": 1.0158207416534424, + "learning_rate": 1.425282321592908e-07, + "loss": 0.0667, + "step": 2295 + }, + { + "epoch": 0.9030481809242871, + "grad_norm": 0.6387980580329895, + "learning_rate": 1.4138747501007966e-07, + "loss": 0.0419, + "step": 2296 + }, + { + "epoch": 0.9034414945919371, + "grad_norm": 1.8949992656707764, + "learning_rate": 1.4025116851605125e-07, + "loss": 0.0556, + "step": 2297 + }, + { + "epoch": 0.9038348082595871, + "grad_norm": 0.8390710949897766, + "learning_rate": 1.3911931482139317e-07, + "loss": 0.0322, + "step": 2298 + }, + { + "epoch": 0.904228121927237, + "grad_norm": 0.6234549880027771, + "learning_rate": 1.379919160618909e-07, + "loss": 0.0334, + "step": 2299 + }, + { + "epoch": 0.904621435594887, + "grad_norm": 1.1114718914031982, + "learning_rate": 1.368689743649243e-07, + "loss": 0.0536, + "step": 2300 + }, + { + "epoch": 0.9050147492625369, + "grad_norm": 0.7461351752281189, + "learning_rate": 1.3575049184946122e-07, + "loss": 0.0371, + "step": 2301 + }, + { + "epoch": 0.9054080629301868, + "grad_norm": 0.9355785250663757, + "learning_rate": 1.346364706260564e-07, + "loss": 0.0296, + "step": 2302 + }, + { + "epoch": 0.9058013765978368, + "grad_norm": 0.5872256755828857, + "learning_rate": 1.3352691279684582e-07, + "loss": 0.0281, + "step": 2303 + }, + { + "epoch": 0.9061946902654867, + "grad_norm": 1.7544050216674805, + "learning_rate": 1.324218204555433e-07, + "loss": 0.056, + "step": 2304 + }, + { + "epoch": 0.9065880039331367, + "grad_norm": 0.6219866871833801, + "learning_rate": 1.3132119568743662e-07, + "loss": 0.0288, + "step": 2305 + }, + { + "epoch": 0.9069813176007866, + "grad_norm": 1.4340651035308838, + "learning_rate": 1.3022504056938196e-07, + "loss": 0.0504, + "step": 2306 + }, + { + "epoch": 0.9073746312684365, + "grad_norm": 0.5100427269935608, + "learning_rate": 1.2913335716980307e-07, + "loss": 0.0473, + "step": 2307 + }, + { + "epoch": 0.9077679449360865, + "grad_norm": 0.650513768196106, + "learning_rate": 1.2804614754868466e-07, + "loss": 0.0537, + "step": 2308 + }, + { + "epoch": 0.9081612586037365, + "grad_norm": 1.4720587730407715, + "learning_rate": 1.2696341375756982e-07, + "loss": 0.043, + "step": 2309 + }, + { + "epoch": 0.9085545722713865, + "grad_norm": 1.7473880052566528, + "learning_rate": 1.2588515783955564e-07, + "loss": 0.0551, + "step": 2310 + }, + { + "epoch": 0.9089478859390364, + "grad_norm": 0.7824367880821228, + "learning_rate": 1.2481138182929065e-07, + "loss": 0.0299, + "step": 2311 + }, + { + "epoch": 0.9093411996066864, + "grad_norm": 1.2818101644515991, + "learning_rate": 1.2374208775296742e-07, + "loss": 0.0664, + "step": 2312 + }, + { + "epoch": 0.9097345132743363, + "grad_norm": 1.6559642553329468, + "learning_rate": 1.2267727762832388e-07, + "loss": 0.0667, + "step": 2313 + }, + { + "epoch": 0.9101278269419862, + "grad_norm": 0.8255678415298462, + "learning_rate": 1.2161695346463498e-07, + "loss": 0.042, + "step": 2314 + }, + { + "epoch": 0.9105211406096362, + "grad_norm": 0.7617945075035095, + "learning_rate": 1.2056111726271192e-07, + "loss": 0.0464, + "step": 2315 + }, + { + "epoch": 0.9109144542772861, + "grad_norm": 1.3965145349502563, + "learning_rate": 1.195097710148968e-07, + "loss": 0.039, + "step": 2316 + }, + { + "epoch": 0.9113077679449361, + "grad_norm": 1.3296297788619995, + "learning_rate": 1.1846291670505855e-07, + "loss": 0.0552, + "step": 2317 + }, + { + "epoch": 0.911701081612586, + "grad_norm": 0.7849988341331482, + "learning_rate": 1.1742055630859117e-07, + "loss": 0.0338, + "step": 2318 + }, + { + "epoch": 0.912094395280236, + "grad_norm": 2.0398993492126465, + "learning_rate": 1.1638269179240796e-07, + "loss": 0.0542, + "step": 2319 + }, + { + "epoch": 0.9124877089478859, + "grad_norm": 0.7769688367843628, + "learning_rate": 1.1534932511493846e-07, + "loss": 0.0343, + "step": 2320 + }, + { + "epoch": 0.9128810226155359, + "grad_norm": 0.6311588287353516, + "learning_rate": 1.1432045822612564e-07, + "loss": 0.0483, + "step": 2321 + }, + { + "epoch": 0.9132743362831859, + "grad_norm": 0.9618848562240601, + "learning_rate": 1.132960930674204e-07, + "loss": 0.0498, + "step": 2322 + }, + { + "epoch": 0.9136676499508358, + "grad_norm": 0.8956164121627808, + "learning_rate": 1.1227623157177986e-07, + "loss": 0.0316, + "step": 2323 + }, + { + "epoch": 0.9140609636184858, + "grad_norm": 1.1387652158737183, + "learning_rate": 1.1126087566366266e-07, + "loss": 0.0669, + "step": 2324 + }, + { + "epoch": 0.9144542772861357, + "grad_norm": 0.7763038277626038, + "learning_rate": 1.1025002725902484e-07, + "loss": 0.0512, + "step": 2325 + }, + { + "epoch": 0.9148475909537856, + "grad_norm": 1.52693510055542, + "learning_rate": 1.0924368826531751e-07, + "loss": 0.0745, + "step": 2326 + }, + { + "epoch": 0.9152409046214356, + "grad_norm": 1.1928157806396484, + "learning_rate": 1.0824186058148278e-07, + "loss": 0.047, + "step": 2327 + }, + { + "epoch": 0.9156342182890855, + "grad_norm": 0.6993405818939209, + "learning_rate": 1.0724454609794931e-07, + "loss": 0.0258, + "step": 2328 + }, + { + "epoch": 0.9160275319567355, + "grad_norm": 0.8654144406318665, + "learning_rate": 1.0625174669663036e-07, + "loss": 0.0493, + "step": 2329 + }, + { + "epoch": 0.9164208456243854, + "grad_norm": 1.6443697214126587, + "learning_rate": 1.0526346425091815e-07, + "loss": 0.0641, + "step": 2330 + }, + { + "epoch": 0.9168141592920354, + "grad_norm": 2.2090344429016113, + "learning_rate": 1.042797006256821e-07, + "loss": 0.0916, + "step": 2331 + }, + { + "epoch": 0.9172074729596853, + "grad_norm": 1.2032400369644165, + "learning_rate": 1.0330045767726504e-07, + "loss": 0.043, + "step": 2332 + }, + { + "epoch": 0.9176007866273354, + "grad_norm": 1.0382981300354004, + "learning_rate": 1.023257372534786e-07, + "loss": 0.0478, + "step": 2333 + }, + { + "epoch": 0.9179941002949853, + "grad_norm": 1.3554562330245972, + "learning_rate": 1.0135554119360153e-07, + "loss": 0.076, + "step": 2334 + }, + { + "epoch": 0.9183874139626352, + "grad_norm": 0.7670255899429321, + "learning_rate": 1.0038987132837435e-07, + "loss": 0.0666, + "step": 2335 + }, + { + "epoch": 0.9187807276302852, + "grad_norm": 1.3432739973068237, + "learning_rate": 9.942872947999672e-08, + "loss": 0.0472, + "step": 2336 + }, + { + "epoch": 0.9191740412979351, + "grad_norm": 0.7896971702575684, + "learning_rate": 9.847211746212504e-08, + "loss": 0.0636, + "step": 2337 + }, + { + "epoch": 0.9195673549655851, + "grad_norm": 0.7464331388473511, + "learning_rate": 9.752003707986652e-08, + "loss": 0.036, + "step": 2338 + }, + { + "epoch": 0.919960668633235, + "grad_norm": 1.4482289552688599, + "learning_rate": 9.657249012977821e-08, + "loss": 0.047, + "step": 2339 + }, + { + "epoch": 0.9203539823008849, + "grad_norm": 0.7451487183570862, + "learning_rate": 9.562947839986264e-08, + "loss": 0.0516, + "step": 2340 + }, + { + "epoch": 0.9207472959685349, + "grad_norm": 1.0219905376434326, + "learning_rate": 9.469100366956391e-08, + "loss": 0.0515, + "step": 2341 + }, + { + "epoch": 0.9211406096361848, + "grad_norm": 0.776695966720581, + "learning_rate": 9.375706770976573e-08, + "loss": 0.0289, + "step": 2342 + }, + { + "epoch": 0.9215339233038348, + "grad_norm": 0.9781972169876099, + "learning_rate": 9.282767228278672e-08, + "loss": 0.0767, + "step": 2343 + }, + { + "epoch": 0.9219272369714847, + "grad_norm": 1.0278164148330688, + "learning_rate": 9.190281914237736e-08, + "loss": 0.0333, + "step": 2344 + }, + { + "epoch": 0.9223205506391348, + "grad_norm": 1.5040227174758911, + "learning_rate": 9.09825100337175e-08, + "loss": 0.0788, + "step": 2345 + }, + { + "epoch": 0.9227138643067847, + "grad_norm": 1.5312731266021729, + "learning_rate": 9.006674669341214e-08, + "loss": 0.0744, + "step": 2346 + }, + { + "epoch": 0.9231071779744346, + "grad_norm": 1.6249146461486816, + "learning_rate": 8.915553084948847e-08, + "loss": 0.0442, + "step": 2347 + }, + { + "epoch": 0.9235004916420846, + "grad_norm": 1.0247668027877808, + "learning_rate": 8.824886422139273e-08, + "loss": 0.0621, + "step": 2348 + }, + { + "epoch": 0.9238938053097345, + "grad_norm": 1.506390929222107, + "learning_rate": 8.734674851998748e-08, + "loss": 0.0755, + "step": 2349 + }, + { + "epoch": 0.9242871189773845, + "grad_norm": 0.8823897838592529, + "learning_rate": 8.64491854475466e-08, + "loss": 0.0637, + "step": 2350 + }, + { + "epoch": 0.9246804326450344, + "grad_norm": 0.7110940217971802, + "learning_rate": 8.55561766977539e-08, + "loss": 0.0326, + "step": 2351 + }, + { + "epoch": 0.9250737463126844, + "grad_norm": 0.5734057426452637, + "learning_rate": 8.46677239556995e-08, + "loss": 0.0305, + "step": 2352 + }, + { + "epoch": 0.9254670599803343, + "grad_norm": 0.8686132431030273, + "learning_rate": 8.378382889787596e-08, + "loss": 0.0405, + "step": 2353 + }, + { + "epoch": 0.9258603736479842, + "grad_norm": 1.6284774541854858, + "learning_rate": 8.290449319217603e-08, + "loss": 0.0583, + "step": 2354 + }, + { + "epoch": 0.9262536873156342, + "grad_norm": 1.2678624391555786, + "learning_rate": 8.202971849788854e-08, + "loss": 0.0474, + "step": 2355 + }, + { + "epoch": 0.9266470009832841, + "grad_norm": 1.2101284265518188, + "learning_rate": 8.115950646569587e-08, + "loss": 0.0391, + "step": 2356 + }, + { + "epoch": 0.9270403146509342, + "grad_norm": 0.6382131576538086, + "learning_rate": 8.029385873767115e-08, + "loss": 0.0512, + "step": 2357 + }, + { + "epoch": 0.9274336283185841, + "grad_norm": 1.0339092016220093, + "learning_rate": 7.943277694727469e-08, + "loss": 0.0528, + "step": 2358 + }, + { + "epoch": 0.927826941986234, + "grad_norm": 0.7545960545539856, + "learning_rate": 7.857626271935037e-08, + "loss": 0.0418, + "step": 2359 + }, + { + "epoch": 0.928220255653884, + "grad_norm": 0.9588167071342468, + "learning_rate": 7.772431767012423e-08, + "loss": 0.0552, + "step": 2360 + }, + { + "epoch": 0.9286135693215339, + "grad_norm": 0.7952490448951721, + "learning_rate": 7.68769434071992e-08, + "loss": 0.0431, + "step": 2361 + }, + { + "epoch": 0.9290068829891839, + "grad_norm": 1.0601327419281006, + "learning_rate": 7.603414152955374e-08, + "loss": 0.0262, + "step": 2362 + }, + { + "epoch": 0.9294001966568338, + "grad_norm": 0.8356077075004578, + "learning_rate": 7.519591362753848e-08, + "loss": 0.0309, + "step": 2363 + }, + { + "epoch": 0.9297935103244838, + "grad_norm": 1.068089246749878, + "learning_rate": 7.436226128287288e-08, + "loss": 0.0374, + "step": 2364 + }, + { + "epoch": 0.9301868239921337, + "grad_norm": 1.1383631229400635, + "learning_rate": 7.35331860686428e-08, + "loss": 0.0515, + "step": 2365 + }, + { + "epoch": 0.9305801376597836, + "grad_norm": 0.9927535653114319, + "learning_rate": 7.270868954929595e-08, + "loss": 0.056, + "step": 2366 + }, + { + "epoch": 0.9309734513274336, + "grad_norm": 0.6153873801231384, + "learning_rate": 7.188877328064142e-08, + "loss": 0.0437, + "step": 2367 + }, + { + "epoch": 0.9313667649950835, + "grad_norm": 0.8163816928863525, + "learning_rate": 7.107343880984496e-08, + "loss": 0.0541, + "step": 2368 + }, + { + "epoch": 0.9317600786627336, + "grad_norm": 1.144721269607544, + "learning_rate": 7.026268767542671e-08, + "loss": 0.055, + "step": 2369 + }, + { + "epoch": 0.9321533923303835, + "grad_norm": 0.9538362622261047, + "learning_rate": 6.94565214072579e-08, + "loss": 0.0845, + "step": 2370 + }, + { + "epoch": 0.9325467059980335, + "grad_norm": 1.0417604446411133, + "learning_rate": 6.86549415265586e-08, + "loss": 0.054, + "step": 2371 + }, + { + "epoch": 0.9329400196656834, + "grad_norm": 0.8085368275642395, + "learning_rate": 6.785794954589365e-08, + "loss": 0.0338, + "step": 2372 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.6007797718048096, + "learning_rate": 6.706554696917139e-08, + "loss": 0.0314, + "step": 2373 + }, + { + "epoch": 0.9337266470009833, + "grad_norm": 0.8648099303245544, + "learning_rate": 6.627773529163994e-08, + "loss": 0.0302, + "step": 2374 + }, + { + "epoch": 0.9341199606686332, + "grad_norm": 0.5465229749679565, + "learning_rate": 6.549451599988432e-08, + "loss": 0.0359, + "step": 2375 + }, + { + "epoch": 0.9345132743362832, + "grad_norm": 0.6655777096748352, + "learning_rate": 6.471589057182398e-08, + "loss": 0.0435, + "step": 2376 + }, + { + "epoch": 0.9349065880039331, + "grad_norm": 1.1010547876358032, + "learning_rate": 6.394186047670947e-08, + "loss": 0.0377, + "step": 2377 + }, + { + "epoch": 0.9352999016715831, + "grad_norm": 0.7519053816795349, + "learning_rate": 6.317242717511995e-08, + "loss": 0.033, + "step": 2378 + }, + { + "epoch": 0.935693215339233, + "grad_norm": 0.8617828488349915, + "learning_rate": 6.240759211896153e-08, + "loss": 0.0434, + "step": 2379 + }, + { + "epoch": 0.9360865290068829, + "grad_norm": 1.5556560754776, + "learning_rate": 6.16473567514625e-08, + "loss": 0.0893, + "step": 2380 + }, + { + "epoch": 0.936479842674533, + "grad_norm": 1.6594090461730957, + "learning_rate": 6.089172250717201e-08, + "loss": 0.0667, + "step": 2381 + }, + { + "epoch": 0.9368731563421829, + "grad_norm": 0.7117483019828796, + "learning_rate": 6.014069081195673e-08, + "loss": 0.0256, + "step": 2382 + }, + { + "epoch": 0.9372664700098329, + "grad_norm": 0.8783112168312073, + "learning_rate": 5.9394263082998836e-08, + "loss": 0.0439, + "step": 2383 + }, + { + "epoch": 0.9376597836774828, + "grad_norm": 0.73135906457901, + "learning_rate": 5.8652440728792504e-08, + "loss": 0.0514, + "step": 2384 + }, + { + "epoch": 0.9380530973451328, + "grad_norm": 0.5708735585212708, + "learning_rate": 5.791522514914216e-08, + "loss": 0.0332, + "step": 2385 + }, + { + "epoch": 0.9384464110127827, + "grad_norm": 1.1698683500289917, + "learning_rate": 5.718261773515865e-08, + "loss": 0.026, + "step": 2386 + }, + { + "epoch": 0.9388397246804326, + "grad_norm": 0.8288942575454712, + "learning_rate": 5.64546198692581e-08, + "loss": 0.0401, + "step": 2387 + }, + { + "epoch": 0.9392330383480826, + "grad_norm": 1.1005017757415771, + "learning_rate": 5.573123292515775e-08, + "loss": 0.0625, + "step": 2388 + }, + { + "epoch": 0.9396263520157325, + "grad_norm": 1.4169667959213257, + "learning_rate": 5.50124582678746e-08, + "loss": 0.0561, + "step": 2389 + }, + { + "epoch": 0.9400196656833825, + "grad_norm": 1.8534727096557617, + "learning_rate": 5.429829725372204e-08, + "loss": 0.0563, + "step": 2390 + }, + { + "epoch": 0.9404129793510324, + "grad_norm": 0.49012327194213867, + "learning_rate": 5.3588751230307935e-08, + "loss": 0.0371, + "step": 2391 + }, + { + "epoch": 0.9408062930186823, + "grad_norm": 1.5290131568908691, + "learning_rate": 5.2883821536531545e-08, + "loss": 0.0471, + "step": 2392 + }, + { + "epoch": 0.9411996066863324, + "grad_norm": 0.37540706992149353, + "learning_rate": 5.218350950258133e-08, + "loss": 0.0224, + "step": 2393 + }, + { + "epoch": 0.9415929203539823, + "grad_norm": 1.6441450119018555, + "learning_rate": 5.1487816449932174e-08, + "loss": 0.0545, + "step": 2394 + }, + { + "epoch": 0.9419862340216323, + "grad_norm": 0.8181889057159424, + "learning_rate": 5.079674369134313e-08, + "loss": 0.0528, + "step": 2395 + }, + { + "epoch": 0.9423795476892822, + "grad_norm": 1.6283776760101318, + "learning_rate": 5.0110292530854696e-08, + "loss": 0.0528, + "step": 2396 + }, + { + "epoch": 0.9427728613569322, + "grad_norm": 4.418090343475342, + "learning_rate": 4.942846426378683e-08, + "loss": 0.052, + "step": 2397 + }, + { + "epoch": 0.9431661750245821, + "grad_norm": 0.9668748378753662, + "learning_rate": 4.875126017673593e-08, + "loss": 0.0441, + "step": 2398 + }, + { + "epoch": 0.943559488692232, + "grad_norm": 1.2723820209503174, + "learning_rate": 4.807868154757284e-08, + "loss": 0.0504, + "step": 2399 + }, + { + "epoch": 0.943952802359882, + "grad_norm": 1.2000619173049927, + "learning_rate": 4.741072964543958e-08, + "loss": 0.0669, + "step": 2400 + }, + { + "epoch": 0.9443461160275319, + "grad_norm": 1.4198737144470215, + "learning_rate": 4.6747405730748765e-08, + "loss": 0.0768, + "step": 2401 + }, + { + "epoch": 0.9447394296951819, + "grad_norm": 0.5707858800888062, + "learning_rate": 4.6088711055179426e-08, + "loss": 0.0363, + "step": 2402 + }, + { + "epoch": 0.9451327433628318, + "grad_norm": 0.9884591698646545, + "learning_rate": 4.543464686167537e-08, + "loss": 0.0617, + "step": 2403 + }, + { + "epoch": 0.9455260570304818, + "grad_norm": 1.1140447854995728, + "learning_rate": 4.478521438444267e-08, + "loss": 0.0307, + "step": 2404 + }, + { + "epoch": 0.9459193706981318, + "grad_norm": 1.7241696119308472, + "learning_rate": 4.414041484894743e-08, + "loss": 0.0468, + "step": 2405 + }, + { + "epoch": 0.9463126843657818, + "grad_norm": 1.4963939189910889, + "learning_rate": 4.3500249471913616e-08, + "loss": 0.0424, + "step": 2406 + }, + { + "epoch": 0.9467059980334317, + "grad_norm": 1.4940134286880493, + "learning_rate": 4.2864719461321036e-08, + "loss": 0.062, + "step": 2407 + }, + { + "epoch": 0.9470993117010816, + "grad_norm": 1.2279117107391357, + "learning_rate": 4.223382601640208e-08, + "loss": 0.0557, + "step": 2408 + }, + { + "epoch": 0.9474926253687316, + "grad_norm": 0.5514369606971741, + "learning_rate": 4.160757032764001e-08, + "loss": 0.0211, + "step": 2409 + }, + { + "epoch": 0.9478859390363815, + "grad_norm": 1.1696200370788574, + "learning_rate": 4.098595357676732e-08, + "loss": 0.0525, + "step": 2410 + }, + { + "epoch": 0.9482792527040315, + "grad_norm": 1.4047200679779053, + "learning_rate": 4.036897693676184e-08, + "loss": 0.0582, + "step": 2411 + }, + { + "epoch": 0.9486725663716814, + "grad_norm": 0.9069812893867493, + "learning_rate": 3.9756641571847e-08, + "loss": 0.0451, + "step": 2412 + }, + { + "epoch": 0.9490658800393313, + "grad_norm": 0.7696250677108765, + "learning_rate": 3.914894863748714e-08, + "loss": 0.0596, + "step": 2413 + }, + { + "epoch": 0.9494591937069813, + "grad_norm": 1.0009849071502686, + "learning_rate": 3.854589928038666e-08, + "loss": 0.0531, + "step": 2414 + }, + { + "epoch": 0.9498525073746312, + "grad_norm": 0.6316270232200623, + "learning_rate": 3.794749463848835e-08, + "loss": 0.0261, + "step": 2415 + }, + { + "epoch": 0.9502458210422812, + "grad_norm": 1.1284974813461304, + "learning_rate": 3.735373584096924e-08, + "loss": 0.0485, + "step": 2416 + }, + { + "epoch": 0.9506391347099312, + "grad_norm": 0.744842529296875, + "learning_rate": 3.676462400824088e-08, + "loss": 0.0437, + "step": 2417 + }, + { + "epoch": 0.9510324483775812, + "grad_norm": 1.1578047275543213, + "learning_rate": 3.618016025194598e-08, + "loss": 0.0458, + "step": 2418 + }, + { + "epoch": 0.9514257620452311, + "grad_norm": 1.029968023300171, + "learning_rate": 3.560034567495513e-08, + "loss": 0.063, + "step": 2419 + }, + { + "epoch": 0.951819075712881, + "grad_norm": 0.8940306305885315, + "learning_rate": 3.5025181371367844e-08, + "loss": 0.0583, + "step": 2420 + }, + { + "epoch": 0.952212389380531, + "grad_norm": 1.1246992349624634, + "learning_rate": 3.4454668426507076e-08, + "loss": 0.0446, + "step": 2421 + }, + { + "epoch": 0.9526057030481809, + "grad_norm": 1.069629192352295, + "learning_rate": 3.388880791692001e-08, + "loss": 0.0422, + "step": 2422 + }, + { + "epoch": 0.9529990167158309, + "grad_norm": 1.080478549003601, + "learning_rate": 3.33276009103739e-08, + "loss": 0.0547, + "step": 2423 + }, + { + "epoch": 0.9533923303834808, + "grad_norm": 1.105726718902588, + "learning_rate": 3.2771048465855546e-08, + "loss": 0.0478, + "step": 2424 + }, + { + "epoch": 0.9537856440511308, + "grad_norm": 0.9557194709777832, + "learning_rate": 3.221915163356848e-08, + "loss": 0.0454, + "step": 2425 + }, + { + "epoch": 0.9541789577187807, + "grad_norm": 0.7306869626045227, + "learning_rate": 3.167191145493076e-08, + "loss": 0.0306, + "step": 2426 + }, + { + "epoch": 0.9545722713864306, + "grad_norm": 0.9311756491661072, + "learning_rate": 3.1129328962573865e-08, + "loss": 0.0378, + "step": 2427 + }, + { + "epoch": 0.9549655850540806, + "grad_norm": 1.6339657306671143, + "learning_rate": 3.05914051803402e-08, + "loss": 0.053, + "step": 2428 + }, + { + "epoch": 0.9553588987217306, + "grad_norm": 1.5211260318756104, + "learning_rate": 3.005814112328143e-08, + "loss": 0.0408, + "step": 2429 + }, + { + "epoch": 0.9557522123893806, + "grad_norm": 1.1606007814407349, + "learning_rate": 2.9529537797656215e-08, + "loss": 0.0531, + "step": 2430 + }, + { + "epoch": 0.9561455260570305, + "grad_norm": 0.5916828513145447, + "learning_rate": 2.900559620092891e-08, + "loss": 0.0625, + "step": 2431 + }, + { + "epoch": 0.9565388397246805, + "grad_norm": 0.49938130378723145, + "learning_rate": 2.8486317321766432e-08, + "loss": 0.0395, + "step": 2432 + }, + { + "epoch": 0.9569321533923304, + "grad_norm": 1.587057113647461, + "learning_rate": 2.797170214003775e-08, + "loss": 0.1053, + "step": 2433 + }, + { + "epoch": 0.9573254670599803, + "grad_norm": 1.176936149597168, + "learning_rate": 2.7461751626811916e-08, + "loss": 0.0462, + "step": 2434 + }, + { + "epoch": 0.9577187807276303, + "grad_norm": 0.5434470176696777, + "learning_rate": 2.6956466744355315e-08, + "loss": 0.0268, + "step": 2435 + }, + { + "epoch": 0.9581120943952802, + "grad_norm": 0.6117231845855713, + "learning_rate": 2.6455848446130526e-08, + "loss": 0.0572, + "step": 2436 + }, + { + "epoch": 0.9585054080629302, + "grad_norm": 1.2302024364471436, + "learning_rate": 2.5959897676794134e-08, + "loss": 0.0613, + "step": 2437 + }, + { + "epoch": 0.9588987217305801, + "grad_norm": 1.686108946800232, + "learning_rate": 2.546861537219586e-08, + "loss": 0.0726, + "step": 2438 + }, + { + "epoch": 0.95929203539823, + "grad_norm": 0.9010059833526611, + "learning_rate": 2.4982002459375265e-08, + "loss": 0.0356, + "step": 2439 + }, + { + "epoch": 0.95968534906588, + "grad_norm": 0.7760159373283386, + "learning_rate": 2.450005985656173e-08, + "loss": 0.0376, + "step": 2440 + }, + { + "epoch": 0.96007866273353, + "grad_norm": 0.788345456123352, + "learning_rate": 2.4022788473170853e-08, + "loss": 0.0657, + "step": 2441 + }, + { + "epoch": 0.96047197640118, + "grad_norm": 0.8711709976196289, + "learning_rate": 2.355018920980501e-08, + "loss": 0.0444, + "step": 2442 + }, + { + "epoch": 0.9608652900688299, + "grad_norm": 0.6124730110168457, + "learning_rate": 2.308226295824917e-08, + "loss": 0.0542, + "step": 2443 + }, + { + "epoch": 0.9612586037364799, + "grad_norm": 1.0837171077728271, + "learning_rate": 2.2619010601470925e-08, + "loss": 0.0577, + "step": 2444 + }, + { + "epoch": 0.9616519174041298, + "grad_norm": 1.9453260898590088, + "learning_rate": 2.2160433013618533e-08, + "loss": 0.058, + "step": 2445 + }, + { + "epoch": 0.9620452310717797, + "grad_norm": 0.8556208610534668, + "learning_rate": 2.170653106001841e-08, + "loss": 0.0281, + "step": 2446 + }, + { + "epoch": 0.9624385447394297, + "grad_norm": 0.9196289777755737, + "learning_rate": 2.1257305597175428e-08, + "loss": 0.0414, + "step": 2447 + }, + { + "epoch": 0.9628318584070796, + "grad_norm": 1.5880217552185059, + "learning_rate": 2.0812757472768175e-08, + "loss": 0.0496, + "step": 2448 + }, + { + "epoch": 0.9632251720747296, + "grad_norm": 1.4076353311538696, + "learning_rate": 2.037288752565064e-08, + "loss": 0.049, + "step": 2449 + }, + { + "epoch": 0.9636184857423795, + "grad_norm": 0.8668321967124939, + "learning_rate": 1.99376965858486e-08, + "loss": 0.0606, + "step": 2450 + }, + { + "epoch": 0.9640117994100295, + "grad_norm": 0.7461321353912354, + "learning_rate": 1.9507185474558765e-08, + "loss": 0.0343, + "step": 2451 + }, + { + "epoch": 0.9644051130776794, + "grad_norm": 0.6470179557800293, + "learning_rate": 1.908135500414743e-08, + "loss": 0.0334, + "step": 2452 + }, + { + "epoch": 0.9647984267453295, + "grad_norm": 1.0918750762939453, + "learning_rate": 1.866020597814766e-08, + "loss": 0.0451, + "step": 2453 + }, + { + "epoch": 0.9651917404129794, + "grad_norm": 0.6877756118774414, + "learning_rate": 1.8243739191259603e-08, + "loss": 0.0397, + "step": 2454 + }, + { + "epoch": 0.9655850540806293, + "grad_norm": 0.9845160245895386, + "learning_rate": 1.7831955429348235e-08, + "loss": 0.0227, + "step": 2455 + }, + { + "epoch": 0.9659783677482793, + "grad_norm": 1.178027629852295, + "learning_rate": 1.7424855469440617e-08, + "loss": 0.0941, + "step": 2456 + }, + { + "epoch": 0.9663716814159292, + "grad_norm": 1.0678149461746216, + "learning_rate": 1.7022440079726976e-08, + "loss": 0.0519, + "step": 2457 + }, + { + "epoch": 0.9667649950835792, + "grad_norm": 0.7598469257354736, + "learning_rate": 1.6624710019556844e-08, + "loss": 0.0303, + "step": 2458 + }, + { + "epoch": 0.9671583087512291, + "grad_norm": 1.8913023471832275, + "learning_rate": 1.623166603943932e-08, + "loss": 0.0573, + "step": 2459 + }, + { + "epoch": 0.967551622418879, + "grad_norm": 0.8094140887260437, + "learning_rate": 1.584330888104002e-08, + "loss": 0.0454, + "step": 2460 + }, + { + "epoch": 0.967944936086529, + "grad_norm": 1.0645431280136108, + "learning_rate": 1.5459639277181637e-08, + "loss": 0.0482, + "step": 2461 + }, + { + "epoch": 0.9683382497541789, + "grad_norm": 1.1675747632980347, + "learning_rate": 1.508065795184116e-08, + "loss": 0.0587, + "step": 2462 + }, + { + "epoch": 0.9687315634218289, + "grad_norm": 1.6579506397247314, + "learning_rate": 1.4706365620149043e-08, + "loss": 0.0389, + "step": 2463 + }, + { + "epoch": 0.9691248770894788, + "grad_norm": 1.4258586168289185, + "learning_rate": 1.433676298838671e-08, + "loss": 0.0571, + "step": 2464 + }, + { + "epoch": 0.9695181907571289, + "grad_norm": 1.555445671081543, + "learning_rate": 1.3971850753987936e-08, + "loss": 0.0561, + "step": 2465 + }, + { + "epoch": 0.9699115044247788, + "grad_norm": 1.851238489151001, + "learning_rate": 1.3611629605534139e-08, + "loss": 0.0614, + "step": 2466 + }, + { + "epoch": 0.9703048180924287, + "grad_norm": 1.4167311191558838, + "learning_rate": 1.325610022275603e-08, + "loss": 0.0541, + "step": 2467 + }, + { + "epoch": 0.9706981317600787, + "grad_norm": 1.103963017463684, + "learning_rate": 1.29052632765303e-08, + "loss": 0.0515, + "step": 2468 + }, + { + "epoch": 0.9710914454277286, + "grad_norm": 0.8383644819259644, + "learning_rate": 1.2559119428879607e-08, + "loss": 0.0439, + "step": 2469 + }, + { + "epoch": 0.9714847590953786, + "grad_norm": 1.5626074075698853, + "learning_rate": 1.2217669332970084e-08, + "loss": 0.0358, + "step": 2470 + }, + { + "epoch": 0.9718780727630285, + "grad_norm": 0.965404748916626, + "learning_rate": 1.1880913633111335e-08, + "loss": 0.0588, + "step": 2471 + }, + { + "epoch": 0.9722713864306785, + "grad_norm": 1.2146902084350586, + "learning_rate": 1.1548852964755053e-08, + "loss": 0.0473, + "step": 2472 + }, + { + "epoch": 0.9726647000983284, + "grad_norm": 1.4855893850326538, + "learning_rate": 1.122148795449307e-08, + "loss": 0.0543, + "step": 2473 + }, + { + "epoch": 0.9730580137659783, + "grad_norm": 1.1908034086227417, + "learning_rate": 1.0898819220056811e-08, + "loss": 0.0486, + "step": 2474 + }, + { + "epoch": 0.9734513274336283, + "grad_norm": 1.0501704216003418, + "learning_rate": 1.058084737031534e-08, + "loss": 0.0475, + "step": 2475 + }, + { + "epoch": 0.9738446411012782, + "grad_norm": 0.6650611162185669, + "learning_rate": 1.0267573005275645e-08, + "loss": 0.0297, + "step": 2476 + }, + { + "epoch": 0.9742379547689283, + "grad_norm": 0.6201514601707458, + "learning_rate": 9.95899671607986e-09, + "loss": 0.047, + "step": 2477 + }, + { + "epoch": 0.9746312684365782, + "grad_norm": 1.1360257863998413, + "learning_rate": 9.655119085005827e-09, + "loss": 0.0363, + "step": 2478 + }, + { + "epoch": 0.9750245821042282, + "grad_norm": 0.8666075468063354, + "learning_rate": 9.355940685464305e-09, + "loss": 0.0458, + "step": 2479 + }, + { + "epoch": 0.9754178957718781, + "grad_norm": 1.1366305351257324, + "learning_rate": 9.061462081999262e-09, + "loss": 0.0471, + "step": 2480 + }, + { + "epoch": 0.975811209439528, + "grad_norm": 0.6694433689117432, + "learning_rate": 8.771683830285649e-09, + "loss": 0.0387, + "step": 2481 + }, + { + "epoch": 0.976204523107178, + "grad_norm": 2.0710513591766357, + "learning_rate": 8.486606477129677e-09, + "loss": 0.075, + "step": 2482 + }, + { + "epoch": 0.9765978367748279, + "grad_norm": 0.9630718231201172, + "learning_rate": 8.206230560466322e-09, + "loss": 0.0431, + "step": 2483 + }, + { + "epoch": 0.9769911504424779, + "grad_norm": 0.9957706332206726, + "learning_rate": 7.930556609359596e-09, + "loss": 0.0398, + "step": 2484 + }, + { + "epoch": 0.9773844641101278, + "grad_norm": 0.8392490148544312, + "learning_rate": 7.659585144000892e-09, + "loss": 0.1203, + "step": 2485 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.763048529624939, + "learning_rate": 7.393316675707584e-09, + "loss": 0.048, + "step": 2486 + }, + { + "epoch": 0.9781710914454277, + "grad_norm": 0.591249942779541, + "learning_rate": 7.131751706923595e-09, + "loss": 0.0276, + "step": 2487 + }, + { + "epoch": 0.9785644051130776, + "grad_norm": 0.7118191719055176, + "learning_rate": 6.8748907312163325e-09, + "loss": 0.0459, + "step": 2488 + }, + { + "epoch": 0.9789577187807277, + "grad_norm": 1.2333048582077026, + "learning_rate": 6.622734233277528e-09, + "loss": 0.0547, + "step": 2489 + }, + { + "epoch": 0.9793510324483776, + "grad_norm": 1.8401693105697632, + "learning_rate": 6.375282688921569e-09, + "loss": 0.0499, + "step": 2490 + }, + { + "epoch": 0.9797443461160276, + "grad_norm": 0.8339464068412781, + "learning_rate": 6.132536565084945e-09, + "loss": 0.0343, + "step": 2491 + }, + { + "epoch": 0.9801376597836775, + "grad_norm": 0.7225338220596313, + "learning_rate": 5.894496319824306e-09, + "loss": 0.0373, + "step": 2492 + }, + { + "epoch": 0.9805309734513274, + "grad_norm": 0.7467345595359802, + "learning_rate": 5.661162402316733e-09, + "loss": 0.0294, + "step": 2493 + }, + { + "epoch": 0.9809242871189774, + "grad_norm": 0.7157261967658997, + "learning_rate": 5.432535252859472e-09, + "loss": 0.0388, + "step": 2494 + }, + { + "epoch": 0.9813176007866273, + "grad_norm": 1.0490740537643433, + "learning_rate": 5.208615302866593e-09, + "loss": 0.0552, + "step": 2495 + }, + { + "epoch": 0.9817109144542773, + "grad_norm": 0.9684942364692688, + "learning_rate": 4.989402974871216e-09, + "loss": 0.0482, + "step": 2496 + }, + { + "epoch": 0.9821042281219272, + "grad_norm": 0.7083243727684021, + "learning_rate": 4.774898682522455e-09, + "loss": 0.0354, + "step": 2497 + }, + { + "epoch": 0.9824975417895772, + "grad_norm": 0.6887216567993164, + "learning_rate": 4.565102830585699e-09, + "loss": 0.0555, + "step": 2498 + }, + { + "epoch": 0.9828908554572271, + "grad_norm": 0.9905696511268616, + "learning_rate": 4.360015814941498e-09, + "loss": 0.044, + "step": 2499 + }, + { + "epoch": 0.983284169124877, + "grad_norm": 1.4582995176315308, + "learning_rate": 4.159638022585011e-09, + "loss": 0.0555, + "step": 2500 + }, + { + "epoch": 0.9836774827925271, + "grad_norm": 0.8839958906173706, + "learning_rate": 3.96396983162517e-09, + "loss": 0.0322, + "step": 2501 + }, + { + "epoch": 0.984070796460177, + "grad_norm": 0.9634173512458801, + "learning_rate": 3.773011611284128e-09, + "loss": 0.0305, + "step": 2502 + }, + { + "epoch": 0.984464110127827, + "grad_norm": 0.9942337870597839, + "learning_rate": 3.586763721896147e-09, + "loss": 0.0725, + "step": 2503 + }, + { + "epoch": 0.9848574237954769, + "grad_norm": 0.8074241876602173, + "learning_rate": 3.4052265149070453e-09, + "loss": 0.048, + "step": 2504 + }, + { + "epoch": 0.9852507374631269, + "grad_norm": 1.1746639013290405, + "learning_rate": 3.2284003328744706e-09, + "loss": 0.0565, + "step": 2505 + }, + { + "epoch": 0.9856440511307768, + "grad_norm": 1.454350233078003, + "learning_rate": 3.056285509465684e-09, + "loss": 0.0462, + "step": 2506 + }, + { + "epoch": 0.9860373647984267, + "grad_norm": 1.0500266551971436, + "learning_rate": 2.888882369457835e-09, + "loss": 0.0229, + "step": 2507 + }, + { + "epoch": 0.9864306784660767, + "grad_norm": 0.5939337611198425, + "learning_rate": 2.726191228737407e-09, + "loss": 0.0441, + "step": 2508 + }, + { + "epoch": 0.9868239921337266, + "grad_norm": 0.7773805856704712, + "learning_rate": 2.5682123942993852e-09, + "loss": 0.0388, + "step": 2509 + }, + { + "epoch": 0.9872173058013766, + "grad_norm": 0.9417904019355774, + "learning_rate": 2.414946164246701e-09, + "loss": 0.0448, + "step": 2510 + }, + { + "epoch": 0.9876106194690265, + "grad_norm": 0.8849769830703735, + "learning_rate": 2.2663928277896763e-09, + "loss": 0.0482, + "step": 2511 + }, + { + "epoch": 0.9880039331366764, + "grad_norm": 1.0469379425048828, + "learning_rate": 2.122552665245747e-09, + "loss": 0.0479, + "step": 2512 + }, + { + "epoch": 0.9883972468043265, + "grad_norm": 0.4294953942298889, + "learning_rate": 1.9834259480380756e-09, + "loss": 0.017, + "step": 2513 + }, + { + "epoch": 0.9887905604719764, + "grad_norm": 1.0931810140609741, + "learning_rate": 1.8490129386963818e-09, + "loss": 0.0376, + "step": 2514 + }, + { + "epoch": 0.9891838741396264, + "grad_norm": 0.5045303702354431, + "learning_rate": 1.719313890855001e-09, + "loss": 0.0203, + "step": 2515 + }, + { + "epoch": 0.9895771878072763, + "grad_norm": 1.2506543397903442, + "learning_rate": 1.5943290492539953e-09, + "loss": 0.0415, + "step": 2516 + }, + { + "epoch": 0.9899705014749263, + "grad_norm": 0.6282764673233032, + "learning_rate": 1.4740586497366538e-09, + "loss": 0.043, + "step": 2517 + }, + { + "epoch": 0.9903638151425762, + "grad_norm": 1.0732625722885132, + "learning_rate": 1.358502919251159e-09, + "loss": 0.049, + "step": 2518 + }, + { + "epoch": 0.9907571288102262, + "grad_norm": 0.8076870441436768, + "learning_rate": 1.247662075848921e-09, + "loss": 0.0367, + "step": 2519 + }, + { + "epoch": 0.9911504424778761, + "grad_norm": 1.1323729753494263, + "learning_rate": 1.1415363286843007e-09, + "loss": 0.0549, + "step": 2520 + }, + { + "epoch": 0.991543756145526, + "grad_norm": 1.2635443210601807, + "learning_rate": 1.0401258780146084e-09, + "loss": 0.0375, + "step": 2521 + }, + { + "epoch": 0.991937069813176, + "grad_norm": 1.430897831916809, + "learning_rate": 9.434309151992727e-10, + "loss": 0.075, + "step": 2522 + }, + { + "epoch": 0.9923303834808259, + "grad_norm": 1.1660479307174683, + "learning_rate": 8.514516226998393e-10, + "loss": 0.0562, + "step": 2523 + }, + { + "epoch": 0.9927236971484759, + "grad_norm": 2.029007911682129, + "learning_rate": 7.641881740794166e-10, + "loss": 0.0481, + "step": 2524 + }, + { + "epoch": 0.9931170108161259, + "grad_norm": 0.7072765827178955, + "learning_rate": 6.816407340023978e-10, + "loss": 0.0188, + "step": 2525 + }, + { + "epoch": 0.9935103244837759, + "grad_norm": 0.8789957165718079, + "learning_rate": 6.03809458233906e-10, + "loss": 0.0573, + "step": 2526 + }, + { + "epoch": 0.9939036381514258, + "grad_norm": 0.7415314316749573, + "learning_rate": 5.306944936406266e-10, + "loss": 0.0458, + "step": 2527 + }, + { + "epoch": 0.9942969518190757, + "grad_norm": 0.6154326796531677, + "learning_rate": 4.622959781883096e-10, + "loss": 0.0236, + "step": 2528 + }, + { + "epoch": 0.9946902654867257, + "grad_norm": 0.810153067111969, + "learning_rate": 3.9861404094426734e-10, + "loss": 0.0443, + "step": 2529 + }, + { + "epoch": 0.9950835791543756, + "grad_norm": 0.743605375289917, + "learning_rate": 3.3964880207459916e-10, + "loss": 0.052, + "step": 2530 + }, + { + "epoch": 0.9954768928220256, + "grad_norm": 1.1516720056533813, + "learning_rate": 2.8540037284557897e-10, + "loss": 0.0729, + "step": 2531 + }, + { + "epoch": 0.9958702064896755, + "grad_norm": 1.1776301860809326, + "learning_rate": 2.358688556233779e-10, + "loss": 0.0401, + "step": 2532 + }, + { + "epoch": 0.9962635201573254, + "grad_norm": 1.0834025144577026, + "learning_rate": 1.9105434387239886e-10, + "loss": 0.0593, + "step": 2533 + }, + { + "epoch": 0.9966568338249754, + "grad_norm": 1.4529463052749634, + "learning_rate": 1.509569221569418e-10, + "loss": 0.0423, + "step": 2534 + }, + { + "epoch": 0.9970501474926253, + "grad_norm": 1.1381511688232422, + "learning_rate": 1.1557666614037122e-10, + "loss": 0.0411, + "step": 2535 + }, + { + "epoch": 0.9974434611602753, + "grad_norm": 1.113553762435913, + "learning_rate": 8.49136425840058e-11, + "loss": 0.0611, + "step": 2536 + }, + { + "epoch": 0.9978367748279253, + "grad_norm": 1.071913719177246, + "learning_rate": 5.896790934878383e-11, + "loss": 0.0609, + "step": 2537 + }, + { + "epoch": 0.9982300884955753, + "grad_norm": 1.7356159687042236, + "learning_rate": 3.7739515393320215e-11, + "loss": 0.0524, + "step": 2538 + }, + { + "epoch": 0.9986234021632252, + "grad_norm": 1.0763658285140991, + "learning_rate": 2.122850077584948e-11, + "loss": 0.0527, + "step": 2539 + }, + { + "epoch": 0.9990167158308751, + "grad_norm": 0.6793241500854492, + "learning_rate": 9.434896651727699e-12, + "loss": 0.0462, + "step": 2540 + }, + { + "epoch": 0.9994100294985251, + "grad_norm": 0.9101441502571106, + "learning_rate": 2.358725275652951e-12, + "loss": 0.0453, + "step": 2541 + }, + { + "epoch": 0.999803343166175, + "grad_norm": 1.0394845008850098, + "learning_rate": 0.0, + "loss": 0.0578, + "step": 2542 + }, + { + "epoch": 0.999803343166175, + "step": 2542, + "total_flos": 5.5848341785175654e+17, + "train_loss": 0.05740805761998535, + "train_runtime": 78224.1342, + "train_samples_per_second": 1.04, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1.0, + "max_steps": 2542, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.5848341785175654e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}