{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 10990, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00045495905368516835, "grad_norm": 9.424069126182447, "learning_rate": 5e-06, "loss": 0.1263, "step": 1 }, { "epoch": 0.0009099181073703367, "grad_norm": 10.713711803681479, "learning_rate": 4.999999897855645e-06, "loss": 0.1917, "step": 2 }, { "epoch": 0.001364877161055505, "grad_norm": 14.140338542227335, "learning_rate": 4.9999995914225884e-06, "loss": 0.1578, "step": 3 }, { "epoch": 0.0018198362147406734, "grad_norm": 3.597475372738082, "learning_rate": 4.999999080700855e-06, "loss": 0.1266, "step": 4 }, { "epoch": 0.0022747952684258415, "grad_norm": 6.105724745538744, "learning_rate": 4.999998365690486e-06, "loss": 0.1182, "step": 5 }, { "epoch": 0.00272975432211101, "grad_norm": 2.3169263707348047, "learning_rate": 4.999997446391542e-06, "loss": 0.0837, "step": 6 }, { "epoch": 0.0031847133757961785, "grad_norm": 1.5580577162131912, "learning_rate": 4.999996322804095e-06, "loss": 0.0761, "step": 7 }, { "epoch": 0.003639672429481347, "grad_norm": 7.8184917268265455, "learning_rate": 4.999994994928239e-06, "loss": 0.0922, "step": 8 }, { "epoch": 0.004094631483166515, "grad_norm": 1.0452365500769838, "learning_rate": 4.999993462764082e-06, "loss": 0.0478, "step": 9 }, { "epoch": 0.004549590536851683, "grad_norm": 13.965028712537013, "learning_rate": 4.999991726311749e-06, "loss": 0.0846, "step": 10 }, { "epoch": 0.005004549590536852, "grad_norm": 5.125925143296543, "learning_rate": 4.999989785571382e-06, "loss": 0.0881, "step": 11 }, { "epoch": 0.00545950864422202, "grad_norm": 2.2007100936893242, "learning_rate": 4.999987640543139e-06, "loss": 0.0896, "step": 12 }, { "epoch": 0.005914467697907188, "grad_norm": 1.4259973806728683, "learning_rate": 4.999985291227196e-06, "loss": 0.0707, "step": 13 }, { "epoch": 0.006369426751592357, "grad_norm": 2.5296942505090376, "learning_rate": 4.999982737623746e-06, "loss": 0.1089, "step": 14 }, { "epoch": 0.006824385805277525, "grad_norm": 1.9950751818037182, "learning_rate": 4.999979979732995e-06, "loss": 0.0868, "step": 15 }, { "epoch": 0.007279344858962694, "grad_norm": 1.3920340257758652, "learning_rate": 4.999977017555171e-06, "loss": 0.0667, "step": 16 }, { "epoch": 0.0077343039126478615, "grad_norm": 1.6901228042476675, "learning_rate": 4.999973851090514e-06, "loss": 0.1032, "step": 17 }, { "epoch": 0.00818926296633303, "grad_norm": 1.8139241575982044, "learning_rate": 4.999970480339284e-06, "loss": 0.0848, "step": 18 }, { "epoch": 0.008644222020018199, "grad_norm": 2.792209216647474, "learning_rate": 4.9999669053017564e-06, "loss": 0.0804, "step": 19 }, { "epoch": 0.009099181073703366, "grad_norm": 1.9016199513748882, "learning_rate": 4.9999631259782235e-06, "loss": 0.0612, "step": 20 }, { "epoch": 0.009554140127388535, "grad_norm": 1.9965871271660314, "learning_rate": 4.999959142368993e-06, "loss": 0.0969, "step": 21 }, { "epoch": 0.010009099181073703, "grad_norm": 2.0914009303085033, "learning_rate": 4.999954954474391e-06, "loss": 0.0697, "step": 22 }, { "epoch": 0.010464058234758872, "grad_norm": 1.4245797905814712, "learning_rate": 4.9999505622947594e-06, "loss": 0.0832, "step": 23 }, { "epoch": 0.01091901728844404, "grad_norm": 1.5918336957933508, "learning_rate": 4.999945965830458e-06, "loss": 0.0995, "step": 24 }, { "epoch": 0.011373976342129208, "grad_norm": 1.5479918567604505, "learning_rate": 4.999941165081863e-06, "loss": 0.0807, "step": 25 }, { "epoch": 0.011828935395814377, "grad_norm": 1.0230515440884096, "learning_rate": 4.999936160049364e-06, "loss": 0.0643, "step": 26 }, { "epoch": 0.012283894449499545, "grad_norm": 1.5686069800283207, "learning_rate": 4.999930950733373e-06, "loss": 0.0931, "step": 27 }, { "epoch": 0.012738853503184714, "grad_norm": 1.2554970571666952, "learning_rate": 4.999925537134312e-06, "loss": 0.0815, "step": 28 }, { "epoch": 0.013193812556869881, "grad_norm": 2.006239028459661, "learning_rate": 4.9999199192526286e-06, "loss": 0.1058, "step": 29 }, { "epoch": 0.01364877161055505, "grad_norm": 1.4436359414979703, "learning_rate": 4.9999140970887775e-06, "loss": 0.0869, "step": 30 }, { "epoch": 0.014103730664240218, "grad_norm": 1.9267705188401287, "learning_rate": 4.999908070643236e-06, "loss": 0.0781, "step": 31 }, { "epoch": 0.014558689717925387, "grad_norm": 1.4021843278575745, "learning_rate": 4.999901839916495e-06, "loss": 0.0623, "step": 32 }, { "epoch": 0.015013648771610554, "grad_norm": 1.208153452070421, "learning_rate": 4.999895404909067e-06, "loss": 0.063, "step": 33 }, { "epoch": 0.015468607825295723, "grad_norm": 2.273185304548797, "learning_rate": 4.999888765621476e-06, "loss": 0.0901, "step": 34 }, { "epoch": 0.01592356687898089, "grad_norm": 1.0383667898934177, "learning_rate": 4.999881922054264e-06, "loss": 0.0529, "step": 35 }, { "epoch": 0.01637852593266606, "grad_norm": 1.1537070720156926, "learning_rate": 4.999874874207991e-06, "loss": 0.0539, "step": 36 }, { "epoch": 0.01683348498635123, "grad_norm": 7.004645996036244, "learning_rate": 4.999867622083232e-06, "loss": 0.1028, "step": 37 }, { "epoch": 0.017288444040036398, "grad_norm": 2.6515111867419217, "learning_rate": 4.99986016568058e-06, "loss": 0.0958, "step": 38 }, { "epoch": 0.017743403093721567, "grad_norm": 1.5437471575403858, "learning_rate": 4.999852505000646e-06, "loss": 0.0738, "step": 39 }, { "epoch": 0.018198362147406732, "grad_norm": 1.4798019454902687, "learning_rate": 4.999844640044053e-06, "loss": 0.0695, "step": 40 }, { "epoch": 0.0186533212010919, "grad_norm": 1.3064785518172293, "learning_rate": 4.999836570811445e-06, "loss": 0.0738, "step": 41 }, { "epoch": 0.01910828025477707, "grad_norm": 2.6092308850144086, "learning_rate": 4.999828297303483e-06, "loss": 0.0854, "step": 42 }, { "epoch": 0.019563239308462238, "grad_norm": 1.1588535962376392, "learning_rate": 4.9998198195208405e-06, "loss": 0.0783, "step": 43 }, { "epoch": 0.020018198362147407, "grad_norm": 1.441993661454023, "learning_rate": 4.999811137464212e-06, "loss": 0.0826, "step": 44 }, { "epoch": 0.020473157415832575, "grad_norm": 1.6833012903770388, "learning_rate": 4.999802251134307e-06, "loss": 0.0932, "step": 45 }, { "epoch": 0.020928116469517744, "grad_norm": 1.061841465675538, "learning_rate": 4.99979316053185e-06, "loss": 0.0602, "step": 46 }, { "epoch": 0.021383075523202913, "grad_norm": 6.235552737213317, "learning_rate": 4.999783865657585e-06, "loss": 0.1756, "step": 47 }, { "epoch": 0.02183803457688808, "grad_norm": 4.150615789136632, "learning_rate": 4.999774366512272e-06, "loss": 0.1765, "step": 48 }, { "epoch": 0.022292993630573247, "grad_norm": 1.6544370579186418, "learning_rate": 4.9997646630966865e-06, "loss": 0.0841, "step": 49 }, { "epoch": 0.022747952684258416, "grad_norm": 1.3759378168890601, "learning_rate": 4.999754755411621e-06, "loss": 0.0669, "step": 50 }, { "epoch": 0.023202911737943584, "grad_norm": 1.182095773050476, "learning_rate": 4.9997446434578865e-06, "loss": 0.0653, "step": 51 }, { "epoch": 0.023657870791628753, "grad_norm": 1.035739970953985, "learning_rate": 4.999734327236307e-06, "loss": 0.0678, "step": 52 }, { "epoch": 0.024112829845313922, "grad_norm": 0.7085636728418604, "learning_rate": 4.999723806747728e-06, "loss": 0.0498, "step": 53 }, { "epoch": 0.02456778889899909, "grad_norm": 2.2722150874810185, "learning_rate": 4.99971308199301e-06, "loss": 0.0666, "step": 54 }, { "epoch": 0.02502274795268426, "grad_norm": 0.9420150282219443, "learning_rate": 4.999702152973025e-06, "loss": 0.0516, "step": 55 }, { "epoch": 0.025477707006369428, "grad_norm": 1.0929779986912587, "learning_rate": 4.9996910196886694e-06, "loss": 0.0593, "step": 56 }, { "epoch": 0.025932666060054597, "grad_norm": 0.783956655534044, "learning_rate": 4.999679682140852e-06, "loss": 0.0377, "step": 57 }, { "epoch": 0.026387625113739762, "grad_norm": 1.5218504285246661, "learning_rate": 4.999668140330499e-06, "loss": 0.1052, "step": 58 }, { "epoch": 0.02684258416742493, "grad_norm": 1.0791722855226673, "learning_rate": 4.999656394258555e-06, "loss": 0.0632, "step": 59 }, { "epoch": 0.0272975432211101, "grad_norm": 0.9557512868551324, "learning_rate": 4.999644443925978e-06, "loss": 0.0634, "step": 60 }, { "epoch": 0.027752502274795268, "grad_norm": 1.0667565930302423, "learning_rate": 4.999632289333746e-06, "loss": 0.0518, "step": 61 }, { "epoch": 0.028207461328480437, "grad_norm": 1.646318745184601, "learning_rate": 4.999619930482852e-06, "loss": 0.0766, "step": 62 }, { "epoch": 0.028662420382165606, "grad_norm": 1.2186400155674944, "learning_rate": 4.999607367374304e-06, "loss": 0.0741, "step": 63 }, { "epoch": 0.029117379435850774, "grad_norm": 1.0807362476000584, "learning_rate": 4.999594600009131e-06, "loss": 0.0553, "step": 64 }, { "epoch": 0.029572338489535943, "grad_norm": 1.3403222529377026, "learning_rate": 4.999581628388375e-06, "loss": 0.0886, "step": 65 }, { "epoch": 0.03002729754322111, "grad_norm": 1.5384085589580356, "learning_rate": 4.999568452513097e-06, "loss": 0.1371, "step": 66 }, { "epoch": 0.030482256596906277, "grad_norm": 1.3705617237121213, "learning_rate": 4.9995550723843726e-06, "loss": 0.0766, "step": 67 }, { "epoch": 0.030937215650591446, "grad_norm": 1.0692361538736996, "learning_rate": 4.999541488003295e-06, "loss": 0.0607, "step": 68 }, { "epoch": 0.03139217470427662, "grad_norm": 0.9190382606343962, "learning_rate": 4.999527699370975e-06, "loss": 0.0598, "step": 69 }, { "epoch": 0.03184713375796178, "grad_norm": 1.182484013540807, "learning_rate": 4.99951370648854e-06, "loss": 0.0583, "step": 70 }, { "epoch": 0.03230209281164695, "grad_norm": 1.0728084003134533, "learning_rate": 4.999499509357132e-06, "loss": 0.0595, "step": 71 }, { "epoch": 0.03275705186533212, "grad_norm": 4.354112851430141, "learning_rate": 4.999485107977912e-06, "loss": 0.063, "step": 72 }, { "epoch": 0.033212010919017286, "grad_norm": 1.5709570309947747, "learning_rate": 4.999470502352057e-06, "loss": 0.0511, "step": 73 }, { "epoch": 0.03366696997270246, "grad_norm": 5.936498302941106, "learning_rate": 4.999455692480759e-06, "loss": 0.0733, "step": 74 }, { "epoch": 0.034121929026387623, "grad_norm": 2.227018438923651, "learning_rate": 4.999440678365229e-06, "loss": 0.0504, "step": 75 }, { "epoch": 0.034576888080072796, "grad_norm": 1.7106603940792875, "learning_rate": 4.999425460006695e-06, "loss": 0.0672, "step": 76 }, { "epoch": 0.03503184713375796, "grad_norm": 3.603233120133456, "learning_rate": 4.9994100374063995e-06, "loss": 0.0605, "step": 77 }, { "epoch": 0.03548680618744313, "grad_norm": 1.3736083353388526, "learning_rate": 4.9993944105656035e-06, "loss": 0.0892, "step": 78 }, { "epoch": 0.0359417652411283, "grad_norm": 1.0823220835311542, "learning_rate": 4.999378579485582e-06, "loss": 0.0657, "step": 79 }, { "epoch": 0.036396724294813464, "grad_norm": 2.150924215229101, "learning_rate": 4.999362544167632e-06, "loss": 0.0787, "step": 80 }, { "epoch": 0.036851683348498636, "grad_norm": 1.7178625535843866, "learning_rate": 4.99934630461306e-06, "loss": 0.0428, "step": 81 }, { "epoch": 0.0373066424021838, "grad_norm": 1.3444344286672891, "learning_rate": 4.999329860823197e-06, "loss": 0.0683, "step": 82 }, { "epoch": 0.03776160145586897, "grad_norm": 0.9890758669005086, "learning_rate": 4.999313212799383e-06, "loss": 0.0684, "step": 83 }, { "epoch": 0.03821656050955414, "grad_norm": 1.341850788541947, "learning_rate": 4.99929636054298e-06, "loss": 0.0683, "step": 84 }, { "epoch": 0.03867151956323931, "grad_norm": 1.1425756088631416, "learning_rate": 4.999279304055366e-06, "loss": 0.0781, "step": 85 }, { "epoch": 0.039126478616924476, "grad_norm": 1.1872176417370066, "learning_rate": 4.999262043337933e-06, "loss": 0.0652, "step": 86 }, { "epoch": 0.03958143767060965, "grad_norm": 1.1143093977494338, "learning_rate": 4.999244578392094e-06, "loss": 0.0752, "step": 87 }, { "epoch": 0.040036396724294813, "grad_norm": 1.2369204202074342, "learning_rate": 4.9992269092192736e-06, "loss": 0.0822, "step": 88 }, { "epoch": 0.04049135577797998, "grad_norm": 1.1130108244588752, "learning_rate": 4.9992090358209166e-06, "loss": 0.0548, "step": 89 }, { "epoch": 0.04094631483166515, "grad_norm": 1.0691923631453497, "learning_rate": 4.9991909581984835e-06, "loss": 0.058, "step": 90 }, { "epoch": 0.041401273885350316, "grad_norm": 0.8020461125153492, "learning_rate": 4.999172676353451e-06, "loss": 0.0341, "step": 91 }, { "epoch": 0.04185623293903549, "grad_norm": 0.8729429986066347, "learning_rate": 4.999154190287314e-06, "loss": 0.0524, "step": 92 }, { "epoch": 0.042311191992720654, "grad_norm": 1.3186052508212676, "learning_rate": 4.999135500001583e-06, "loss": 0.1067, "step": 93 }, { "epoch": 0.042766151046405826, "grad_norm": 0.9402363215265714, "learning_rate": 4.9991166054977844e-06, "loss": 0.0631, "step": 94 }, { "epoch": 0.04322111010009099, "grad_norm": 1.8336904222617239, "learning_rate": 4.999097506777463e-06, "loss": 0.0897, "step": 95 }, { "epoch": 0.04367606915377616, "grad_norm": 1.0700343361679827, "learning_rate": 4.999078203842179e-06, "loss": 0.084, "step": 96 }, { "epoch": 0.04413102820746133, "grad_norm": 0.8783050881223096, "learning_rate": 4.999058696693511e-06, "loss": 0.0421, "step": 97 }, { "epoch": 0.044585987261146494, "grad_norm": 0.9801149440827129, "learning_rate": 4.99903898533305e-06, "loss": 0.0616, "step": 98 }, { "epoch": 0.045040946314831666, "grad_norm": 0.9471216563783236, "learning_rate": 4.99901906976241e-06, "loss": 0.0614, "step": 99 }, { "epoch": 0.04549590536851683, "grad_norm": 1.1616379193988644, "learning_rate": 4.998998949983217e-06, "loss": 0.0604, "step": 100 }, { "epoch": 0.045950864422202004, "grad_norm": 1.123688602696856, "learning_rate": 4.998978625997115e-06, "loss": 0.0831, "step": 101 }, { "epoch": 0.04640582347588717, "grad_norm": 1.1154387442545128, "learning_rate": 4.998958097805765e-06, "loss": 0.0686, "step": 102 }, { "epoch": 0.04686078252957234, "grad_norm": 0.9538196832365717, "learning_rate": 4.9989373654108445e-06, "loss": 0.0586, "step": 103 }, { "epoch": 0.047315741583257506, "grad_norm": 14.714854180857428, "learning_rate": 4.9989164288140465e-06, "loss": 0.2765, "step": 104 }, { "epoch": 0.04777070063694268, "grad_norm": 1.5310230722630254, "learning_rate": 4.998895288017085e-06, "loss": 0.1114, "step": 105 }, { "epoch": 0.048225659690627844, "grad_norm": 0.8704851988514942, "learning_rate": 4.998873943021684e-06, "loss": 0.0481, "step": 106 }, { "epoch": 0.04868061874431301, "grad_norm": 0.9229853294124807, "learning_rate": 4.998852393829589e-06, "loss": 0.0559, "step": 107 }, { "epoch": 0.04913557779799818, "grad_norm": 1.6890853415724327, "learning_rate": 4.9988306404425625e-06, "loss": 0.1104, "step": 108 }, { "epoch": 0.049590536851683346, "grad_norm": 0.9281407020626959, "learning_rate": 4.99880868286238e-06, "loss": 0.0636, "step": 109 }, { "epoch": 0.05004549590536852, "grad_norm": 1.2440415104108336, "learning_rate": 4.998786521090836e-06, "loss": 0.0522, "step": 110 }, { "epoch": 0.050500454959053684, "grad_norm": 1.06604652034606, "learning_rate": 4.9987641551297426e-06, "loss": 0.0916, "step": 111 }, { "epoch": 0.050955414012738856, "grad_norm": 0.9619782747004665, "learning_rate": 4.998741584980926e-06, "loss": 0.0822, "step": 112 }, { "epoch": 0.05141037306642402, "grad_norm": 1.0679619427370142, "learning_rate": 4.9987188106462314e-06, "loss": 0.0644, "step": 113 }, { "epoch": 0.051865332120109194, "grad_norm": 0.8424012677371406, "learning_rate": 4.99869583212752e-06, "loss": 0.0536, "step": 114 }, { "epoch": 0.05232029117379436, "grad_norm": 1.660603270099433, "learning_rate": 4.9986726494266694e-06, "loss": 0.1336, "step": 115 }, { "epoch": 0.052775250227479524, "grad_norm": 1.0314643506984187, "learning_rate": 4.998649262545574e-06, "loss": 0.0606, "step": 116 }, { "epoch": 0.053230209281164696, "grad_norm": 0.9468486095046134, "learning_rate": 4.998625671486144e-06, "loss": 0.0598, "step": 117 }, { "epoch": 0.05368516833484986, "grad_norm": 0.8800045267913842, "learning_rate": 4.998601876250308e-06, "loss": 0.06, "step": 118 }, { "epoch": 0.054140127388535034, "grad_norm": 1.0192910760666323, "learning_rate": 4.998577876840011e-06, "loss": 0.0601, "step": 119 }, { "epoch": 0.0545950864422202, "grad_norm": 0.9462635574827357, "learning_rate": 4.9985536732572124e-06, "loss": 0.06, "step": 120 }, { "epoch": 0.05505004549590537, "grad_norm": 0.7487116084320051, "learning_rate": 4.998529265503891e-06, "loss": 0.0458, "step": 121 }, { "epoch": 0.055505004549590536, "grad_norm": 1.0663282141956507, "learning_rate": 4.9985046535820416e-06, "loss": 0.0758, "step": 122 }, { "epoch": 0.05595996360327571, "grad_norm": 1.7476635252011261, "learning_rate": 4.998479837493675e-06, "loss": 0.0876, "step": 123 }, { "epoch": 0.056414922656960874, "grad_norm": 1.1513932571098853, "learning_rate": 4.9984548172408195e-06, "loss": 0.0475, "step": 124 }, { "epoch": 0.05686988171064604, "grad_norm": 97.54679492281674, "learning_rate": 4.998429592825519e-06, "loss": 0.2117, "step": 125 }, { "epoch": 0.05732484076433121, "grad_norm": 1.2146893500357796, "learning_rate": 4.998404164249835e-06, "loss": 0.0887, "step": 126 }, { "epoch": 0.05777979981801638, "grad_norm": 0.8319799978985719, "learning_rate": 4.998378531515845e-06, "loss": 0.0411, "step": 127 }, { "epoch": 0.05823475887170155, "grad_norm": 1.5818516008522756, "learning_rate": 4.998352694625645e-06, "loss": 0.068, "step": 128 }, { "epoch": 0.058689717925386714, "grad_norm": 0.883733186490376, "learning_rate": 4.998326653581343e-06, "loss": 0.0595, "step": 129 }, { "epoch": 0.059144676979071886, "grad_norm": 0.9357726879327158, "learning_rate": 4.998300408385072e-06, "loss": 0.0686, "step": 130 }, { "epoch": 0.05959963603275705, "grad_norm": 1.3606472296483436, "learning_rate": 4.998273959038972e-06, "loss": 0.0837, "step": 131 }, { "epoch": 0.06005459508644222, "grad_norm": 0.9597337111291308, "learning_rate": 4.998247305545207e-06, "loss": 0.0733, "step": 132 }, { "epoch": 0.06050955414012739, "grad_norm": 0.7271469650592398, "learning_rate": 4.998220447905953e-06, "loss": 0.0454, "step": 133 }, { "epoch": 0.060964513193812554, "grad_norm": 0.9630498239095886, "learning_rate": 4.998193386123408e-06, "loss": 0.074, "step": 134 }, { "epoch": 0.061419472247497726, "grad_norm": 1.133544314724227, "learning_rate": 4.99816612019978e-06, "loss": 0.077, "step": 135 }, { "epoch": 0.06187443130118289, "grad_norm": 4.162875613842665, "learning_rate": 4.998138650137298e-06, "loss": 0.1461, "step": 136 }, { "epoch": 0.062329390354868064, "grad_norm": 1.02851611153301, "learning_rate": 4.998110975938208e-06, "loss": 0.0883, "step": 137 }, { "epoch": 0.06278434940855324, "grad_norm": 1.4803017864082986, "learning_rate": 4.998083097604769e-06, "loss": 0.093, "step": 138 }, { "epoch": 0.0632393084622384, "grad_norm": 0.775173461523887, "learning_rate": 4.998055015139261e-06, "loss": 0.0446, "step": 139 }, { "epoch": 0.06369426751592357, "grad_norm": 0.9314427643573137, "learning_rate": 4.998026728543979e-06, "loss": 0.0627, "step": 140 }, { "epoch": 0.06414922656960874, "grad_norm": 0.8532842969957802, "learning_rate": 4.997998237821233e-06, "loss": 0.07, "step": 141 }, { "epoch": 0.0646041856232939, "grad_norm": 0.8003964270143441, "learning_rate": 4.997969542973352e-06, "loss": 0.0563, "step": 142 }, { "epoch": 0.06505914467697907, "grad_norm": 1.0449654693074535, "learning_rate": 4.997940644002681e-06, "loss": 0.0705, "step": 143 }, { "epoch": 0.06551410373066424, "grad_norm": 1.2317539206735935, "learning_rate": 4.997911540911581e-06, "loss": 0.0552, "step": 144 }, { "epoch": 0.06596906278434941, "grad_norm": 1.0170288864286834, "learning_rate": 4.99788223370243e-06, "loss": 0.075, "step": 145 }, { "epoch": 0.06642402183803457, "grad_norm": 2.1516221031707796, "learning_rate": 4.9978527223776245e-06, "loss": 0.1294, "step": 146 }, { "epoch": 0.06687898089171974, "grad_norm": 0.8159636795919125, "learning_rate": 4.9978230069395735e-06, "loss": 0.0512, "step": 147 }, { "epoch": 0.06733393994540492, "grad_norm": 1.0575473809333984, "learning_rate": 4.9977930873907065e-06, "loss": 0.0598, "step": 148 }, { "epoch": 0.06778889899909009, "grad_norm": 1.0958109016760909, "learning_rate": 4.997762963733468e-06, "loss": 0.074, "step": 149 }, { "epoch": 0.06824385805277525, "grad_norm": 1.047477211054204, "learning_rate": 4.997732635970321e-06, "loss": 0.0539, "step": 150 }, { "epoch": 0.06869881710646042, "grad_norm": 1.0301191819422422, "learning_rate": 4.9977021041037425e-06, "loss": 0.0686, "step": 151 }, { "epoch": 0.06915377616014559, "grad_norm": 1.225998339573777, "learning_rate": 4.9976713681362265e-06, "loss": 0.0859, "step": 152 }, { "epoch": 0.06960873521383075, "grad_norm": 1.416119617095304, "learning_rate": 4.997640428070286e-06, "loss": 0.1051, "step": 153 }, { "epoch": 0.07006369426751592, "grad_norm": 0.9227148160074169, "learning_rate": 4.99760928390845e-06, "loss": 0.0476, "step": 154 }, { "epoch": 0.0705186533212011, "grad_norm": 0.9417296172183, "learning_rate": 4.997577935653262e-06, "loss": 0.0546, "step": 155 }, { "epoch": 0.07097361237488627, "grad_norm": 0.7429922167271485, "learning_rate": 4.9975463833072835e-06, "loss": 0.0438, "step": 156 }, { "epoch": 0.07142857142857142, "grad_norm": 1.1317968054046752, "learning_rate": 4.997514626873093e-06, "loss": 0.0723, "step": 157 }, { "epoch": 0.0718835304822566, "grad_norm": 0.894309839547546, "learning_rate": 4.997482666353287e-06, "loss": 0.0484, "step": 158 }, { "epoch": 0.07233848953594177, "grad_norm": 1.2064896460901124, "learning_rate": 4.997450501750476e-06, "loss": 0.0686, "step": 159 }, { "epoch": 0.07279344858962693, "grad_norm": 1.0338993985106997, "learning_rate": 4.997418133067288e-06, "loss": 0.066, "step": 160 }, { "epoch": 0.0732484076433121, "grad_norm": 1.0854078217047458, "learning_rate": 4.997385560306368e-06, "loss": 0.075, "step": 161 }, { "epoch": 0.07370336669699727, "grad_norm": 0.9955556312708298, "learning_rate": 4.997352783470379e-06, "loss": 0.0693, "step": 162 }, { "epoch": 0.07415832575068244, "grad_norm": 1.1119344699280262, "learning_rate": 4.997319802561997e-06, "loss": 0.0687, "step": 163 }, { "epoch": 0.0746132848043676, "grad_norm": 0.8118992710097626, "learning_rate": 4.9972866175839196e-06, "loss": 0.061, "step": 164 }, { "epoch": 0.07506824385805277, "grad_norm": 1.0509201052861925, "learning_rate": 4.9972532285388575e-06, "loss": 0.0738, "step": 165 }, { "epoch": 0.07552320291173795, "grad_norm": 1.1660685920656126, "learning_rate": 4.997219635429538e-06, "loss": 0.1018, "step": 166 }, { "epoch": 0.07597816196542312, "grad_norm": 0.9894981496034668, "learning_rate": 4.997185838258709e-06, "loss": 0.0534, "step": 167 }, { "epoch": 0.07643312101910828, "grad_norm": 0.9397553289113793, "learning_rate": 4.997151837029129e-06, "loss": 0.0527, "step": 168 }, { "epoch": 0.07688808007279345, "grad_norm": 0.9368221512292729, "learning_rate": 4.997117631743579e-06, "loss": 0.0648, "step": 169 }, { "epoch": 0.07734303912647862, "grad_norm": 0.898690664067523, "learning_rate": 4.997083222404852e-06, "loss": 0.0479, "step": 170 }, { "epoch": 0.07779799818016378, "grad_norm": 0.835250569463016, "learning_rate": 4.997048609015762e-06, "loss": 0.0528, "step": 171 }, { "epoch": 0.07825295723384895, "grad_norm": 0.9098471940978452, "learning_rate": 4.997013791579136e-06, "loss": 0.0641, "step": 172 }, { "epoch": 0.07870791628753412, "grad_norm": 0.9538942863622895, "learning_rate": 4.996978770097819e-06, "loss": 0.0648, "step": 173 }, { "epoch": 0.0791628753412193, "grad_norm": 0.9163372515795332, "learning_rate": 4.996943544574673e-06, "loss": 0.0682, "step": 174 }, { "epoch": 0.07961783439490445, "grad_norm": 0.8193165634479148, "learning_rate": 4.996908115012576e-06, "loss": 0.0485, "step": 175 }, { "epoch": 0.08007279344858963, "grad_norm": 1.0278777387100766, "learning_rate": 4.996872481414425e-06, "loss": 0.0741, "step": 176 }, { "epoch": 0.0805277525022748, "grad_norm": 1.8323226365700802, "learning_rate": 4.9968366437831305e-06, "loss": 0.1107, "step": 177 }, { "epoch": 0.08098271155595996, "grad_norm": 0.5562843681536768, "learning_rate": 4.99680060212162e-06, "loss": 0.0379, "step": 178 }, { "epoch": 0.08143767060964513, "grad_norm": 0.6982410679992989, "learning_rate": 4.996764356432841e-06, "loss": 0.0576, "step": 179 }, { "epoch": 0.0818926296633303, "grad_norm": 0.9996693552976796, "learning_rate": 4.996727906719754e-06, "loss": 0.056, "step": 180 }, { "epoch": 0.08234758871701547, "grad_norm": 0.8092230365331524, "learning_rate": 4.9966912529853365e-06, "loss": 0.036, "step": 181 }, { "epoch": 0.08280254777070063, "grad_norm": 0.8856317784665715, "learning_rate": 4.996654395232585e-06, "loss": 0.0546, "step": 182 }, { "epoch": 0.0832575068243858, "grad_norm": 0.7648943084887926, "learning_rate": 4.996617333464512e-06, "loss": 0.0456, "step": 183 }, { "epoch": 0.08371246587807098, "grad_norm": 0.8896960831413809, "learning_rate": 4.996580067684145e-06, "loss": 0.0505, "step": 184 }, { "epoch": 0.08416742493175614, "grad_norm": 0.8819070603063018, "learning_rate": 4.996542597894528e-06, "loss": 0.0833, "step": 185 }, { "epoch": 0.08462238398544131, "grad_norm": 0.8756856388162975, "learning_rate": 4.996504924098726e-06, "loss": 0.0641, "step": 186 }, { "epoch": 0.08507734303912648, "grad_norm": 0.8527408544485862, "learning_rate": 4.9964670462998145e-06, "loss": 0.0553, "step": 187 }, { "epoch": 0.08553230209281165, "grad_norm": 0.9875356023767464, "learning_rate": 4.99642896450089e-06, "loss": 0.0874, "step": 188 }, { "epoch": 0.08598726114649681, "grad_norm": 2.0664437318649003, "learning_rate": 4.9963906787050656e-06, "loss": 0.0901, "step": 189 }, { "epoch": 0.08644222020018198, "grad_norm": 0.772276028123917, "learning_rate": 4.996352188915467e-06, "loss": 0.0457, "step": 190 }, { "epoch": 0.08689717925386715, "grad_norm": 1.5995533229184502, "learning_rate": 4.996313495135242e-06, "loss": 0.0902, "step": 191 }, { "epoch": 0.08735213830755233, "grad_norm": 1.14262643514501, "learning_rate": 4.9962745973675505e-06, "loss": 0.0887, "step": 192 }, { "epoch": 0.08780709736123748, "grad_norm": 0.653471766542576, "learning_rate": 4.996235495615572e-06, "loss": 0.0381, "step": 193 }, { "epoch": 0.08826205641492266, "grad_norm": 1.21800497391657, "learning_rate": 4.996196189882503e-06, "loss": 0.0859, "step": 194 }, { "epoch": 0.08871701546860783, "grad_norm": 1.2184077345088562, "learning_rate": 4.996156680171552e-06, "loss": 0.0858, "step": 195 }, { "epoch": 0.08917197452229299, "grad_norm": 0.8525171751383268, "learning_rate": 4.996116966485951e-06, "loss": 0.0542, "step": 196 }, { "epoch": 0.08962693357597816, "grad_norm": 1.0438941172842933, "learning_rate": 4.996077048828944e-06, "loss": 0.0735, "step": 197 }, { "epoch": 0.09008189262966333, "grad_norm": 0.9982779135093925, "learning_rate": 4.996036927203793e-06, "loss": 0.0773, "step": 198 }, { "epoch": 0.0905368516833485, "grad_norm": 1.5215875068980074, "learning_rate": 4.995996601613775e-06, "loss": 0.0814, "step": 199 }, { "epoch": 0.09099181073703366, "grad_norm": 0.9525593904667519, "learning_rate": 4.9959560720621875e-06, "loss": 0.0631, "step": 200 }, { "epoch": 0.09144676979071883, "grad_norm": 1.6658936796296464, "learning_rate": 4.995915338552341e-06, "loss": 0.0892, "step": 201 }, { "epoch": 0.09190172884440401, "grad_norm": 1.0100426736293826, "learning_rate": 4.995874401087565e-06, "loss": 0.0618, "step": 202 }, { "epoch": 0.09235668789808917, "grad_norm": 1.2729210933806279, "learning_rate": 4.9958332596712035e-06, "loss": 0.0808, "step": 203 }, { "epoch": 0.09281164695177434, "grad_norm": 1.0142800844722413, "learning_rate": 4.99579191430662e-06, "loss": 0.0715, "step": 204 }, { "epoch": 0.09326660600545951, "grad_norm": 4.237455676216414, "learning_rate": 4.995750364997192e-06, "loss": 0.062, "step": 205 }, { "epoch": 0.09372156505914468, "grad_norm": 7.872559330750363, "learning_rate": 4.995708611746314e-06, "loss": 0.0548, "step": 206 }, { "epoch": 0.09417652411282984, "grad_norm": 1.2028032815765721, "learning_rate": 4.995666654557399e-06, "loss": 0.0678, "step": 207 }, { "epoch": 0.09463148316651501, "grad_norm": 0.9911372243080299, "learning_rate": 4.995624493433876e-06, "loss": 0.0728, "step": 208 }, { "epoch": 0.09508644222020018, "grad_norm": 2.5900155398471942, "learning_rate": 4.995582128379189e-06, "loss": 0.0822, "step": 209 }, { "epoch": 0.09554140127388536, "grad_norm": 1.4214627215980935, "learning_rate": 4.9955395593968e-06, "loss": 0.1096, "step": 210 }, { "epoch": 0.09599636032757052, "grad_norm": 11.75678149199321, "learning_rate": 4.99549678649019e-06, "loss": 0.0579, "step": 211 }, { "epoch": 0.09645131938125569, "grad_norm": 3.8898709501740747, "learning_rate": 4.99545380966285e-06, "loss": 0.0695, "step": 212 }, { "epoch": 0.09690627843494086, "grad_norm": 4.099783756040842, "learning_rate": 4.995410628918294e-06, "loss": 0.0711, "step": 213 }, { "epoch": 0.09736123748862602, "grad_norm": 3.9495811570453445, "learning_rate": 4.995367244260052e-06, "loss": 0.0871, "step": 214 }, { "epoch": 0.09781619654231119, "grad_norm": 0.7508672950199423, "learning_rate": 4.995323655691667e-06, "loss": 0.0369, "step": 215 }, { "epoch": 0.09827115559599636, "grad_norm": 1.3368080010868653, "learning_rate": 4.995279863216702e-06, "loss": 0.0752, "step": 216 }, { "epoch": 0.09872611464968153, "grad_norm": 0.8823975012529762, "learning_rate": 4.995235866838735e-06, "loss": 0.0695, "step": 217 }, { "epoch": 0.09918107370336669, "grad_norm": 0.8099194866460178, "learning_rate": 4.995191666561361e-06, "loss": 0.0561, "step": 218 }, { "epoch": 0.09963603275705187, "grad_norm": 0.6772333028080019, "learning_rate": 4.995147262388192e-06, "loss": 0.0441, "step": 219 }, { "epoch": 0.10009099181073704, "grad_norm": 0.9342067677666205, "learning_rate": 4.995102654322858e-06, "loss": 0.0613, "step": 220 }, { "epoch": 0.1005459508644222, "grad_norm": 0.7594825525973931, "learning_rate": 4.995057842369002e-06, "loss": 0.0349, "step": 221 }, { "epoch": 0.10100090991810737, "grad_norm": 0.8418616902443392, "learning_rate": 4.995012826530287e-06, "loss": 0.0693, "step": 222 }, { "epoch": 0.10145586897179254, "grad_norm": 1.4826966236644097, "learning_rate": 4.99496760681039e-06, "loss": 0.0971, "step": 223 }, { "epoch": 0.10191082802547771, "grad_norm": 1.3244278108579797, "learning_rate": 4.994922183213009e-06, "loss": 0.0963, "step": 224 }, { "epoch": 0.10236578707916287, "grad_norm": 0.5464933779715734, "learning_rate": 4.9948765557418535e-06, "loss": 0.0357, "step": 225 }, { "epoch": 0.10282074613284804, "grad_norm": 1.1325271027713097, "learning_rate": 4.994830724400653e-06, "loss": 0.0756, "step": 226 }, { "epoch": 0.10327570518653321, "grad_norm": 0.7823528354045581, "learning_rate": 4.994784689193151e-06, "loss": 0.0609, "step": 227 }, { "epoch": 0.10373066424021839, "grad_norm": 0.6599438687201707, "learning_rate": 4.994738450123111e-06, "loss": 0.046, "step": 228 }, { "epoch": 0.10418562329390355, "grad_norm": 0.9666854434475629, "learning_rate": 4.994692007194312e-06, "loss": 0.0743, "step": 229 }, { "epoch": 0.10464058234758872, "grad_norm": 0.7151615241659314, "learning_rate": 4.994645360410547e-06, "loss": 0.0583, "step": 230 }, { "epoch": 0.10509554140127389, "grad_norm": 0.7773674174360427, "learning_rate": 4.99459850977563e-06, "loss": 0.0618, "step": 231 }, { "epoch": 0.10555050045495905, "grad_norm": 0.8418236580272198, "learning_rate": 4.994551455293388e-06, "loss": 0.046, "step": 232 }, { "epoch": 0.10600545950864422, "grad_norm": 0.9714541810445473, "learning_rate": 4.9945041969676654e-06, "loss": 0.0634, "step": 233 }, { "epoch": 0.10646041856232939, "grad_norm": 1.0109494927023708, "learning_rate": 4.994456734802325e-06, "loss": 0.0551, "step": 234 }, { "epoch": 0.10691537761601456, "grad_norm": 0.714933750259254, "learning_rate": 4.994409068801247e-06, "loss": 0.0593, "step": 235 }, { "epoch": 0.10737033666969972, "grad_norm": 1.998280137227604, "learning_rate": 4.994361198968323e-06, "loss": 0.0632, "step": 236 }, { "epoch": 0.1078252957233849, "grad_norm": 1.2708633718893245, "learning_rate": 4.994313125307466e-06, "loss": 0.0909, "step": 237 }, { "epoch": 0.10828025477707007, "grad_norm": 0.7903038049799667, "learning_rate": 4.994264847822605e-06, "loss": 0.0579, "step": 238 }, { "epoch": 0.10873521383075523, "grad_norm": 0.7076795429019287, "learning_rate": 4.994216366517684e-06, "loss": 0.0419, "step": 239 }, { "epoch": 0.1091901728844404, "grad_norm": 0.9078047157633448, "learning_rate": 4.994167681396667e-06, "loss": 0.0631, "step": 240 }, { "epoch": 0.10964513193812557, "grad_norm": 1.122407784822992, "learning_rate": 4.994118792463529e-06, "loss": 0.0771, "step": 241 }, { "epoch": 0.11010009099181074, "grad_norm": 1.3544092697698327, "learning_rate": 4.994069699722267e-06, "loss": 0.1034, "step": 242 }, { "epoch": 0.1105550500454959, "grad_norm": 1.0823051140179736, "learning_rate": 4.994020403176893e-06, "loss": 0.0737, "step": 243 }, { "epoch": 0.11101000909918107, "grad_norm": 1.4097890081473512, "learning_rate": 4.9939709028314345e-06, "loss": 0.0882, "step": 244 }, { "epoch": 0.11146496815286625, "grad_norm": 3.3536954759034883, "learning_rate": 4.993921198689935e-06, "loss": 0.0448, "step": 245 }, { "epoch": 0.11191992720655142, "grad_norm": 0.8141751078988797, "learning_rate": 4.993871290756459e-06, "loss": 0.053, "step": 246 }, { "epoch": 0.11237488626023658, "grad_norm": 0.7556149519633891, "learning_rate": 4.9938211790350835e-06, "loss": 0.053, "step": 247 }, { "epoch": 0.11282984531392175, "grad_norm": 1.028865867099704, "learning_rate": 4.993770863529902e-06, "loss": 0.068, "step": 248 }, { "epoch": 0.11328480436760692, "grad_norm": 0.8709129466992336, "learning_rate": 4.993720344245029e-06, "loss": 0.0533, "step": 249 }, { "epoch": 0.11373976342129208, "grad_norm": 0.8992015471183187, "learning_rate": 4.99366962118459e-06, "loss": 0.0589, "step": 250 }, { "epoch": 0.11419472247497725, "grad_norm": 0.8276539094244998, "learning_rate": 4.99361869435273e-06, "loss": 0.0537, "step": 251 }, { "epoch": 0.11464968152866242, "grad_norm": 1.1164497627583263, "learning_rate": 4.993567563753613e-06, "loss": 0.0627, "step": 252 }, { "epoch": 0.1151046405823476, "grad_norm": 1.049662917063972, "learning_rate": 4.993516229391414e-06, "loss": 0.0708, "step": 253 }, { "epoch": 0.11555959963603275, "grad_norm": 0.8007012653446455, "learning_rate": 4.993464691270331e-06, "loss": 0.036, "step": 254 }, { "epoch": 0.11601455868971793, "grad_norm": 1.0491396902879628, "learning_rate": 4.993412949394572e-06, "loss": 0.0564, "step": 255 }, { "epoch": 0.1164695177434031, "grad_norm": 1.0461746265014504, "learning_rate": 4.993361003768369e-06, "loss": 0.0547, "step": 256 }, { "epoch": 0.11692447679708826, "grad_norm": 0.7167785855145479, "learning_rate": 4.993308854395963e-06, "loss": 0.0543, "step": 257 }, { "epoch": 0.11737943585077343, "grad_norm": 1.7377303264454829, "learning_rate": 4.993256501281618e-06, "loss": 0.0385, "step": 258 }, { "epoch": 0.1178343949044586, "grad_norm": 0.9843734560261626, "learning_rate": 4.993203944429611e-06, "loss": 0.0793, "step": 259 }, { "epoch": 0.11828935395814377, "grad_norm": 0.7687699158665893, "learning_rate": 4.993151183844236e-06, "loss": 0.0554, "step": 260 }, { "epoch": 0.11874431301182893, "grad_norm": 0.8273770606193852, "learning_rate": 4.9930982195298065e-06, "loss": 0.0485, "step": 261 }, { "epoch": 0.1191992720655141, "grad_norm": 0.8576587444889947, "learning_rate": 4.9930450514906484e-06, "loss": 0.0668, "step": 262 }, { "epoch": 0.11965423111919928, "grad_norm": 0.8611188803026584, "learning_rate": 4.9929916797311075e-06, "loss": 0.0511, "step": 263 }, { "epoch": 0.12010919017288443, "grad_norm": 1.5171570240611278, "learning_rate": 4.992938104255545e-06, "loss": 0.105, "step": 264 }, { "epoch": 0.1205641492265696, "grad_norm": 0.9293458324727663, "learning_rate": 4.992884325068339e-06, "loss": 0.0519, "step": 265 }, { "epoch": 0.12101910828025478, "grad_norm": 0.854174247687424, "learning_rate": 4.992830342173882e-06, "loss": 0.0739, "step": 266 }, { "epoch": 0.12147406733393995, "grad_norm": 1.185366954699452, "learning_rate": 4.992776155576589e-06, "loss": 0.088, "step": 267 }, { "epoch": 0.12192902638762511, "grad_norm": 0.8266584460330494, "learning_rate": 4.992721765280884e-06, "loss": 0.0766, "step": 268 }, { "epoch": 0.12238398544131028, "grad_norm": 1.4759867391060453, "learning_rate": 4.992667171291215e-06, "loss": 0.0935, "step": 269 }, { "epoch": 0.12283894449499545, "grad_norm": 0.7694789949011869, "learning_rate": 4.992612373612042e-06, "loss": 0.0444, "step": 270 }, { "epoch": 0.12329390354868063, "grad_norm": 1.3788521475642475, "learning_rate": 4.99255737224784e-06, "loss": 0.0686, "step": 271 }, { "epoch": 0.12374886260236578, "grad_norm": 0.6815172893718315, "learning_rate": 4.9925021672031075e-06, "loss": 0.0597, "step": 272 }, { "epoch": 0.12420382165605096, "grad_norm": 1.0361010622054052, "learning_rate": 4.992446758482353e-06, "loss": 0.0577, "step": 273 }, { "epoch": 0.12465878070973613, "grad_norm": 1.3644605952379871, "learning_rate": 4.992391146090106e-06, "loss": 0.1058, "step": 274 }, { "epoch": 0.1251137397634213, "grad_norm": 0.6194563221594529, "learning_rate": 4.99233533003091e-06, "loss": 0.0481, "step": 275 }, { "epoch": 0.12556869881710647, "grad_norm": 0.8571229878526591, "learning_rate": 4.992279310309326e-06, "loss": 0.0811, "step": 276 }, { "epoch": 0.12602365787079162, "grad_norm": 1.543151015857885, "learning_rate": 4.9922230869299316e-06, "loss": 0.1184, "step": 277 }, { "epoch": 0.1264786169244768, "grad_norm": 0.8807612446920655, "learning_rate": 4.992166659897321e-06, "loss": 0.0629, "step": 278 }, { "epoch": 0.12693357597816196, "grad_norm": 0.754844132552438, "learning_rate": 4.992110029216106e-06, "loss": 0.0488, "step": 279 }, { "epoch": 0.12738853503184713, "grad_norm": 0.8268029175550342, "learning_rate": 4.992053194890914e-06, "loss": 0.0463, "step": 280 }, { "epoch": 0.1278434940855323, "grad_norm": 3.022212928870756, "learning_rate": 4.991996156926388e-06, "loss": 0.0622, "step": 281 }, { "epoch": 0.12829845313921748, "grad_norm": 11.09472751599815, "learning_rate": 4.9919389153271904e-06, "loss": 0.0631, "step": 282 }, { "epoch": 0.12875341219290265, "grad_norm": 5.099079327460766, "learning_rate": 4.991881470097998e-06, "loss": 0.0666, "step": 283 }, { "epoch": 0.1292083712465878, "grad_norm": 27.36348462792037, "learning_rate": 4.991823821243505e-06, "loss": 0.0601, "step": 284 }, { "epoch": 0.12966333030027297, "grad_norm": 4.3922238380370375, "learning_rate": 4.991765968768422e-06, "loss": 0.0801, "step": 285 }, { "epoch": 0.13011828935395814, "grad_norm": 2.2745288954264855, "learning_rate": 4.991707912677477e-06, "loss": 0.0461, "step": 286 }, { "epoch": 0.1305732484076433, "grad_norm": 4.103515429733392, "learning_rate": 4.991649652975414e-06, "loss": 0.0464, "step": 287 }, { "epoch": 0.13102820746132848, "grad_norm": 8.218943562506432, "learning_rate": 4.991591189666994e-06, "loss": 0.048, "step": 288 }, { "epoch": 0.13148316651501366, "grad_norm": 2.4966842175341917, "learning_rate": 4.991532522756993e-06, "loss": 0.0635, "step": 289 }, { "epoch": 0.13193812556869883, "grad_norm": 2.714522360598833, "learning_rate": 4.991473652250207e-06, "loss": 0.0416, "step": 290 }, { "epoch": 0.13239308462238397, "grad_norm": 0.7496525159208725, "learning_rate": 4.991414578151445e-06, "loss": 0.0558, "step": 291 }, { "epoch": 0.13284804367606914, "grad_norm": 1.9010591397052237, "learning_rate": 4.991355300465535e-06, "loss": 0.1319, "step": 292 }, { "epoch": 0.13330300272975432, "grad_norm": 1.0962153747662344, "learning_rate": 4.99129581919732e-06, "loss": 0.0597, "step": 293 }, { "epoch": 0.1337579617834395, "grad_norm": 0.682643165278525, "learning_rate": 4.9912361343516616e-06, "loss": 0.0392, "step": 294 }, { "epoch": 0.13421292083712466, "grad_norm": 0.8036831706718465, "learning_rate": 4.991176245933437e-06, "loss": 0.0572, "step": 295 }, { "epoch": 0.13466787989080983, "grad_norm": 0.799489770852785, "learning_rate": 4.9911161539475385e-06, "loss": 0.0533, "step": 296 }, { "epoch": 0.135122838944495, "grad_norm": 1.5836938564798495, "learning_rate": 4.991055858398879e-06, "loss": 0.0875, "step": 297 }, { "epoch": 0.13557779799818018, "grad_norm": 1.1468269055025655, "learning_rate": 4.990995359292384e-06, "loss": 0.0843, "step": 298 }, { "epoch": 0.13603275705186532, "grad_norm": 1.025610687153708, "learning_rate": 4.990934656632997e-06, "loss": 0.0767, "step": 299 }, { "epoch": 0.1364877161055505, "grad_norm": 1.135419351562697, "learning_rate": 4.990873750425679e-06, "loss": 0.0521, "step": 300 }, { "epoch": 0.13694267515923567, "grad_norm": 0.7857019349684609, "learning_rate": 4.990812640675406e-06, "loss": 0.0577, "step": 301 }, { "epoch": 0.13739763421292084, "grad_norm": 0.6543121694694685, "learning_rate": 4.990751327387174e-06, "loss": 0.0408, "step": 302 }, { "epoch": 0.137852593266606, "grad_norm": 0.9867579368206506, "learning_rate": 4.99068981056599e-06, "loss": 0.0644, "step": 303 }, { "epoch": 0.13830755232029118, "grad_norm": 0.9387206564680207, "learning_rate": 4.990628090216885e-06, "loss": 0.0725, "step": 304 }, { "epoch": 0.13876251137397635, "grad_norm": 0.6895906486970027, "learning_rate": 4.990566166344898e-06, "loss": 0.0444, "step": 305 }, { "epoch": 0.1392174704276615, "grad_norm": 0.8627162803317235, "learning_rate": 4.990504038955092e-06, "loss": 0.0639, "step": 306 }, { "epoch": 0.13967242948134667, "grad_norm": 0.9832854011829437, "learning_rate": 4.990441708052542e-06, "loss": 0.067, "step": 307 }, { "epoch": 0.14012738853503184, "grad_norm": 0.6828895359949346, "learning_rate": 4.9903791736423435e-06, "loss": 0.0511, "step": 308 }, { "epoch": 0.14058234758871702, "grad_norm": 0.949508820368659, "learning_rate": 4.9903164357296044e-06, "loss": 0.0586, "step": 309 }, { "epoch": 0.1410373066424022, "grad_norm": 0.8262401805570133, "learning_rate": 4.990253494319453e-06, "loss": 0.072, "step": 310 }, { "epoch": 0.14149226569608736, "grad_norm": 0.7329455864605506, "learning_rate": 4.990190349417032e-06, "loss": 0.0659, "step": 311 }, { "epoch": 0.14194722474977253, "grad_norm": 1.008005243411958, "learning_rate": 4.990127001027501e-06, "loss": 0.0682, "step": 312 }, { "epoch": 0.14240218380345768, "grad_norm": 1.3159038760119786, "learning_rate": 4.990063449156037e-06, "loss": 0.0485, "step": 313 }, { "epoch": 0.14285714285714285, "grad_norm": 1.053132819530921, "learning_rate": 4.989999693807832e-06, "loss": 0.0736, "step": 314 }, { "epoch": 0.14331210191082802, "grad_norm": 1.2097384821970267, "learning_rate": 4.989935734988098e-06, "loss": 0.0752, "step": 315 }, { "epoch": 0.1437670609645132, "grad_norm": 0.8883071525106219, "learning_rate": 4.98987157270206e-06, "loss": 0.0614, "step": 316 }, { "epoch": 0.14422202001819837, "grad_norm": 1.3457063090752772, "learning_rate": 4.989807206954961e-06, "loss": 0.0896, "step": 317 }, { "epoch": 0.14467697907188354, "grad_norm": 0.6077627024555071, "learning_rate": 4.9897426377520605e-06, "loss": 0.0426, "step": 318 }, { "epoch": 0.1451319381255687, "grad_norm": 1.4177858466419022, "learning_rate": 4.989677865098636e-06, "loss": 0.0922, "step": 319 }, { "epoch": 0.14558689717925385, "grad_norm": 0.5838535924114719, "learning_rate": 4.989612888999978e-06, "loss": 0.04, "step": 320 }, { "epoch": 0.14604185623293903, "grad_norm": 0.7991836602542821, "learning_rate": 4.9895477094614e-06, "loss": 0.0644, "step": 321 }, { "epoch": 0.1464968152866242, "grad_norm": 0.6309987592236359, "learning_rate": 4.989482326488225e-06, "loss": 0.0457, "step": 322 }, { "epoch": 0.14695177434030937, "grad_norm": 0.850157804894001, "learning_rate": 4.989416740085796e-06, "loss": 0.0706, "step": 323 }, { "epoch": 0.14740673339399454, "grad_norm": 0.8703332109039406, "learning_rate": 4.9893509502594735e-06, "loss": 0.0503, "step": 324 }, { "epoch": 0.14786169244767972, "grad_norm": 0.9357603198363387, "learning_rate": 4.9892849570146335e-06, "loss": 0.0799, "step": 325 }, { "epoch": 0.1483166515013649, "grad_norm": 0.9508555727006773, "learning_rate": 4.989218760356668e-06, "loss": 0.0703, "step": 326 }, { "epoch": 0.14877161055505003, "grad_norm": 0.8548982254979315, "learning_rate": 4.989152360290987e-06, "loss": 0.0706, "step": 327 }, { "epoch": 0.1492265696087352, "grad_norm": 1.1548758627037845, "learning_rate": 4.989085756823015e-06, "loss": 0.0868, "step": 328 }, { "epoch": 0.14968152866242038, "grad_norm": 0.872011841531817, "learning_rate": 4.989018949958197e-06, "loss": 0.0642, "step": 329 }, { "epoch": 0.15013648771610555, "grad_norm": 0.7767447334589991, "learning_rate": 4.98895193970199e-06, "loss": 0.0428, "step": 330 }, { "epoch": 0.15059144676979072, "grad_norm": 0.9215786343037755, "learning_rate": 4.9888847260598705e-06, "loss": 0.0652, "step": 331 }, { "epoch": 0.1510464058234759, "grad_norm": 1.0293746869379716, "learning_rate": 4.98881730903733e-06, "loss": 0.0768, "step": 332 }, { "epoch": 0.15150136487716107, "grad_norm": 1.2190824076232663, "learning_rate": 4.98874968863988e-06, "loss": 0.0746, "step": 333 }, { "epoch": 0.15195632393084624, "grad_norm": 0.8899729802614444, "learning_rate": 4.988681864873044e-06, "loss": 0.0638, "step": 334 }, { "epoch": 0.15241128298453138, "grad_norm": 0.8009499929718743, "learning_rate": 4.988613837742364e-06, "loss": 0.0556, "step": 335 }, { "epoch": 0.15286624203821655, "grad_norm": 1.0942561304100769, "learning_rate": 4.9885456072534015e-06, "loss": 0.0685, "step": 336 }, { "epoch": 0.15332120109190173, "grad_norm": 1.1210686024600067, "learning_rate": 4.988477173411728e-06, "loss": 0.0649, "step": 337 }, { "epoch": 0.1537761601455869, "grad_norm": 0.713128381997935, "learning_rate": 4.988408536222939e-06, "loss": 0.043, "step": 338 }, { "epoch": 0.15423111919927207, "grad_norm": 0.8820810335281195, "learning_rate": 4.9883396956926416e-06, "loss": 0.0545, "step": 339 }, { "epoch": 0.15468607825295724, "grad_norm": 0.7198251112806523, "learning_rate": 4.988270651826462e-06, "loss": 0.0419, "step": 340 }, { "epoch": 0.15514103730664242, "grad_norm": 0.9319745452557298, "learning_rate": 4.988201404630041e-06, "loss": 0.0556, "step": 341 }, { "epoch": 0.15559599636032756, "grad_norm": 0.7744733545189804, "learning_rate": 4.988131954109038e-06, "loss": 0.0566, "step": 342 }, { "epoch": 0.15605095541401273, "grad_norm": 1.2609547822192495, "learning_rate": 4.988062300269128e-06, "loss": 0.0931, "step": 343 }, { "epoch": 0.1565059144676979, "grad_norm": 1.0356035457639365, "learning_rate": 4.987992443116003e-06, "loss": 0.0592, "step": 344 }, { "epoch": 0.15696087352138308, "grad_norm": 0.8613107982126194, "learning_rate": 4.987922382655372e-06, "loss": 0.0599, "step": 345 }, { "epoch": 0.15741583257506825, "grad_norm": 1.2274429381178749, "learning_rate": 4.987852118892958e-06, "loss": 0.104, "step": 346 }, { "epoch": 0.15787079162875342, "grad_norm": 0.8982827327342306, "learning_rate": 4.987781651834503e-06, "loss": 0.0777, "step": 347 }, { "epoch": 0.1583257506824386, "grad_norm": 1.124267218302162, "learning_rate": 4.987710981485768e-06, "loss": 0.0815, "step": 348 }, { "epoch": 0.15878070973612374, "grad_norm": 1.1417083606361687, "learning_rate": 4.987640107852525e-06, "loss": 0.0968, "step": 349 }, { "epoch": 0.1592356687898089, "grad_norm": 0.7137928465125194, "learning_rate": 4.987569030940567e-06, "loss": 0.0525, "step": 350 }, { "epoch": 0.15969062784349408, "grad_norm": 0.8074447940975472, "learning_rate": 4.987497750755702e-06, "loss": 0.0478, "step": 351 }, { "epoch": 0.16014558689717925, "grad_norm": 1.320795972993318, "learning_rate": 4.987426267303753e-06, "loss": 0.0814, "step": 352 }, { "epoch": 0.16060054595086443, "grad_norm": 0.956458465296858, "learning_rate": 4.987354580590563e-06, "loss": 0.0728, "step": 353 }, { "epoch": 0.1610555050045496, "grad_norm": 0.9487388071568301, "learning_rate": 4.987282690621991e-06, "loss": 0.0778, "step": 354 }, { "epoch": 0.16151046405823477, "grad_norm": 0.8111568286998416, "learning_rate": 4.987210597403907e-06, "loss": 0.0634, "step": 355 }, { "epoch": 0.16196542311191992, "grad_norm": 0.9291291865293426, "learning_rate": 4.987138300942208e-06, "loss": 0.057, "step": 356 }, { "epoch": 0.1624203821656051, "grad_norm": 0.7796831533037398, "learning_rate": 4.987065801242798e-06, "loss": 0.0591, "step": 357 }, { "epoch": 0.16287534121929026, "grad_norm": 1.0091637666603208, "learning_rate": 4.986993098311601e-06, "loss": 0.0712, "step": 358 }, { "epoch": 0.16333030027297543, "grad_norm": 0.9599752405823201, "learning_rate": 4.986920192154561e-06, "loss": 0.0712, "step": 359 }, { "epoch": 0.1637852593266606, "grad_norm": 0.6975593533750986, "learning_rate": 4.986847082777632e-06, "loss": 0.0489, "step": 360 }, { "epoch": 0.16424021838034578, "grad_norm": 0.8407792898194115, "learning_rate": 4.986773770186791e-06, "loss": 0.0687, "step": 361 }, { "epoch": 0.16469517743403095, "grad_norm": 1.16032280422667, "learning_rate": 4.986700254388027e-06, "loss": 0.0814, "step": 362 }, { "epoch": 0.1651501364877161, "grad_norm": 0.6789989236352713, "learning_rate": 4.986626535387349e-06, "loss": 0.0502, "step": 363 }, { "epoch": 0.16560509554140126, "grad_norm": 0.8858819178004838, "learning_rate": 4.9865526131907795e-06, "loss": 0.0584, "step": 364 }, { "epoch": 0.16606005459508644, "grad_norm": 1.0159257224317502, "learning_rate": 4.9864784878043595e-06, "loss": 0.0828, "step": 365 }, { "epoch": 0.1665150136487716, "grad_norm": 1.1632391007958518, "learning_rate": 4.986404159234146e-06, "loss": 0.0693, "step": 366 }, { "epoch": 0.16696997270245678, "grad_norm": 0.7286212082146628, "learning_rate": 4.986329627486213e-06, "loss": 0.048, "step": 367 }, { "epoch": 0.16742493175614195, "grad_norm": 1.1675091585135315, "learning_rate": 4.986254892566652e-06, "loss": 0.0831, "step": 368 }, { "epoch": 0.16787989080982713, "grad_norm": 0.7791126867293955, "learning_rate": 4.9861799544815684e-06, "loss": 0.0511, "step": 369 }, { "epoch": 0.16833484986351227, "grad_norm": 0.8594476885535768, "learning_rate": 4.986104813237086e-06, "loss": 0.0605, "step": 370 }, { "epoch": 0.16878980891719744, "grad_norm": 0.8510456749795352, "learning_rate": 4.986029468839346e-06, "loss": 0.0568, "step": 371 }, { "epoch": 0.16924476797088261, "grad_norm": 1.1617139473891909, "learning_rate": 4.985953921294505e-06, "loss": 0.09, "step": 372 }, { "epoch": 0.1696997270245678, "grad_norm": 0.6957569576277562, "learning_rate": 4.985878170608736e-06, "loss": 0.038, "step": 373 }, { "epoch": 0.17015468607825296, "grad_norm": 0.8584263131532073, "learning_rate": 4.985802216788228e-06, "loss": 0.0517, "step": 374 }, { "epoch": 0.17060964513193813, "grad_norm": 0.9366771679720911, "learning_rate": 4.98572605983919e-06, "loss": 0.063, "step": 375 }, { "epoch": 0.1710646041856233, "grad_norm": 0.5935251092125957, "learning_rate": 4.985649699767842e-06, "loss": 0.0399, "step": 376 }, { "epoch": 0.17151956323930848, "grad_norm": 0.7556873935071919, "learning_rate": 4.985573136580427e-06, "loss": 0.0606, "step": 377 }, { "epoch": 0.17197452229299362, "grad_norm": 0.723085424895094, "learning_rate": 4.9854963702832e-06, "loss": 0.0498, "step": 378 }, { "epoch": 0.1724294813466788, "grad_norm": 0.9057911616547558, "learning_rate": 4.985419400882433e-06, "loss": 0.0733, "step": 379 }, { "epoch": 0.17288444040036396, "grad_norm": 1.0911724774245748, "learning_rate": 4.985342228384418e-06, "loss": 0.0974, "step": 380 }, { "epoch": 0.17333939945404914, "grad_norm": 15.867955316807802, "learning_rate": 4.985264852795459e-06, "loss": 0.4597, "step": 381 }, { "epoch": 0.1737943585077343, "grad_norm": 0.8242169703714594, "learning_rate": 4.98518727412188e-06, "loss": 0.0592, "step": 382 }, { "epoch": 0.17424931756141948, "grad_norm": 1.189476180626615, "learning_rate": 4.98510949237002e-06, "loss": 0.0871, "step": 383 }, { "epoch": 0.17470427661510465, "grad_norm": 0.9035387689173863, "learning_rate": 4.985031507546234e-06, "loss": 0.0659, "step": 384 }, { "epoch": 0.1751592356687898, "grad_norm": 1.5548450607275692, "learning_rate": 4.984953319656896e-06, "loss": 0.102, "step": 385 }, { "epoch": 0.17561419472247497, "grad_norm": 0.9148861743530409, "learning_rate": 4.984874928708395e-06, "loss": 0.0621, "step": 386 }, { "epoch": 0.17606915377616014, "grad_norm": 1.0088623446062757, "learning_rate": 4.984796334707136e-06, "loss": 0.0801, "step": 387 }, { "epoch": 0.17652411282984531, "grad_norm": 7.099087459170151, "learning_rate": 4.984717537659542e-06, "loss": 0.1139, "step": 388 }, { "epoch": 0.1769790718835305, "grad_norm": 0.6271204554143699, "learning_rate": 4.984638537572052e-06, "loss": 0.0362, "step": 389 }, { "epoch": 0.17743403093721566, "grad_norm": 0.9099126199173307, "learning_rate": 4.984559334451121e-06, "loss": 0.0589, "step": 390 }, { "epoch": 0.17788898999090083, "grad_norm": 0.9635928903258919, "learning_rate": 4.984479928303221e-06, "loss": 0.0485, "step": 391 }, { "epoch": 0.17834394904458598, "grad_norm": 0.8684293064054923, "learning_rate": 4.984400319134841e-06, "loss": 0.0488, "step": 392 }, { "epoch": 0.17879890809827115, "grad_norm": 1.490825595774446, "learning_rate": 4.984320506952487e-06, "loss": 0.1164, "step": 393 }, { "epoch": 0.17925386715195632, "grad_norm": 1.0210666975638372, "learning_rate": 4.9842404917626796e-06, "loss": 0.0765, "step": 394 }, { "epoch": 0.1797088262056415, "grad_norm": 0.7827897024774737, "learning_rate": 4.984160273571959e-06, "loss": 0.0627, "step": 395 }, { "epoch": 0.18016378525932666, "grad_norm": 0.9460976796008799, "learning_rate": 4.9840798523868785e-06, "loss": 0.0802, "step": 396 }, { "epoch": 0.18061874431301184, "grad_norm": 0.6974747481172566, "learning_rate": 4.983999228214011e-06, "loss": 0.0483, "step": 397 }, { "epoch": 0.181073703366697, "grad_norm": 0.7442577439773002, "learning_rate": 4.983918401059943e-06, "loss": 0.0501, "step": 398 }, { "epoch": 0.18152866242038215, "grad_norm": 1.001863981150214, "learning_rate": 4.983837370931282e-06, "loss": 0.0866, "step": 399 }, { "epoch": 0.18198362147406733, "grad_norm": 1.258993794296855, "learning_rate": 4.983756137834647e-06, "loss": 0.1164, "step": 400 }, { "epoch": 0.1824385805277525, "grad_norm": 1.1296307149258726, "learning_rate": 4.9836747017766765e-06, "loss": 0.0698, "step": 401 }, { "epoch": 0.18289353958143767, "grad_norm": 0.9299919208142283, "learning_rate": 4.983593062764027e-06, "loss": 0.0767, "step": 402 }, { "epoch": 0.18334849863512284, "grad_norm": 1.6483380962062835, "learning_rate": 4.983511220803367e-06, "loss": 0.0982, "step": 403 }, { "epoch": 0.18380345768880801, "grad_norm": 0.7951232146562915, "learning_rate": 4.983429175901386e-06, "loss": 0.0621, "step": 404 }, { "epoch": 0.1842584167424932, "grad_norm": 0.7346583458526271, "learning_rate": 4.983346928064788e-06, "loss": 0.0485, "step": 405 }, { "epoch": 0.18471337579617833, "grad_norm": 0.8488964995265393, "learning_rate": 4.9832644773002935e-06, "loss": 0.0697, "step": 406 }, { "epoch": 0.1851683348498635, "grad_norm": 0.637978257841365, "learning_rate": 4.98318182361464e-06, "loss": 0.0578, "step": 407 }, { "epoch": 0.18562329390354868, "grad_norm": 1.006460769017827, "learning_rate": 4.9830989670145825e-06, "loss": 0.0741, "step": 408 }, { "epoch": 0.18607825295723385, "grad_norm": 1.0063850758607982, "learning_rate": 4.9830159075068905e-06, "loss": 0.0698, "step": 409 }, { "epoch": 0.18653321201091902, "grad_norm": 0.9365632618002147, "learning_rate": 4.9829326450983514e-06, "loss": 0.0779, "step": 410 }, { "epoch": 0.1869881710646042, "grad_norm": 0.8773564274313461, "learning_rate": 4.98284917979577e-06, "loss": 0.0608, "step": 411 }, { "epoch": 0.18744313011828936, "grad_norm": 0.9057984183185465, "learning_rate": 4.9827655116059656e-06, "loss": 0.0639, "step": 412 }, { "epoch": 0.18789808917197454, "grad_norm": 0.8657789325497686, "learning_rate": 4.9826816405357755e-06, "loss": 0.0749, "step": 413 }, { "epoch": 0.18835304822565968, "grad_norm": 0.5817294435867961, "learning_rate": 4.982597566592054e-06, "loss": 0.0353, "step": 414 }, { "epoch": 0.18880800727934485, "grad_norm": 1.2277963790590036, "learning_rate": 4.982513289781671e-06, "loss": 0.091, "step": 415 }, { "epoch": 0.18926296633303002, "grad_norm": 0.7616764372047586, "learning_rate": 4.982428810111512e-06, "loss": 0.0597, "step": 416 }, { "epoch": 0.1897179253867152, "grad_norm": 0.710019161677026, "learning_rate": 4.9823441275884814e-06, "loss": 0.0535, "step": 417 }, { "epoch": 0.19017288444040037, "grad_norm": 1.1202371935797844, "learning_rate": 4.982259242219499e-06, "loss": 0.0643, "step": 418 }, { "epoch": 0.19062784349408554, "grad_norm": 0.6803190221634923, "learning_rate": 4.9821741540115006e-06, "loss": 0.0483, "step": 419 }, { "epoch": 0.1910828025477707, "grad_norm": 0.8014131027464055, "learning_rate": 4.982088862971441e-06, "loss": 0.0703, "step": 420 }, { "epoch": 0.19153776160145586, "grad_norm": 0.960552266983122, "learning_rate": 4.982003369106287e-06, "loss": 0.0709, "step": 421 }, { "epoch": 0.19199272065514103, "grad_norm": 0.6179685927519944, "learning_rate": 4.981917672423028e-06, "loss": 0.0407, "step": 422 }, { "epoch": 0.1924476797088262, "grad_norm": 0.9538296833436659, "learning_rate": 4.981831772928664e-06, "loss": 0.0681, "step": 423 }, { "epoch": 0.19290263876251137, "grad_norm": 1.076872796407403, "learning_rate": 4.981745670630216e-06, "loss": 0.0918, "step": 424 }, { "epoch": 0.19335759781619655, "grad_norm": 0.8486267027177018, "learning_rate": 4.981659365534718e-06, "loss": 0.081, "step": 425 }, { "epoch": 0.19381255686988172, "grad_norm": 1.2668354345440433, "learning_rate": 4.981572857649225e-06, "loss": 0.0855, "step": 426 }, { "epoch": 0.1942675159235669, "grad_norm": 0.785685618330662, "learning_rate": 4.981486146980804e-06, "loss": 0.0525, "step": 427 }, { "epoch": 0.19472247497725204, "grad_norm": 0.9012661112199176, "learning_rate": 4.9813992335365415e-06, "loss": 0.0616, "step": 428 }, { "epoch": 0.1951774340309372, "grad_norm": 0.9140326707870835, "learning_rate": 4.98131211732354e-06, "loss": 0.0742, "step": 429 }, { "epoch": 0.19563239308462238, "grad_norm": 0.8802115121731895, "learning_rate": 4.981224798348917e-06, "loss": 0.0543, "step": 430 }, { "epoch": 0.19608735213830755, "grad_norm": 1.2263655680320666, "learning_rate": 4.981137276619809e-06, "loss": 0.1, "step": 431 }, { "epoch": 0.19654231119199272, "grad_norm": 0.7179258520773776, "learning_rate": 4.9810495521433675e-06, "loss": 0.0563, "step": 432 }, { "epoch": 0.1969972702456779, "grad_norm": 1.2006165727982114, "learning_rate": 4.9809616249267616e-06, "loss": 0.0919, "step": 433 }, { "epoch": 0.19745222929936307, "grad_norm": 1.0426641838922892, "learning_rate": 4.980873494977174e-06, "loss": 0.0845, "step": 434 }, { "epoch": 0.1979071883530482, "grad_norm": 0.8009974020959663, "learning_rate": 4.98078516230181e-06, "loss": 0.0495, "step": 435 }, { "epoch": 0.19836214740673339, "grad_norm": 0.8146116166212912, "learning_rate": 4.980696626907884e-06, "loss": 0.0656, "step": 436 }, { "epoch": 0.19881710646041856, "grad_norm": 0.8146964454257942, "learning_rate": 4.980607888802633e-06, "loss": 0.0717, "step": 437 }, { "epoch": 0.19927206551410373, "grad_norm": 0.8473418815819729, "learning_rate": 4.980518947993307e-06, "loss": 0.0701, "step": 438 }, { "epoch": 0.1997270245677889, "grad_norm": 0.8132123262524923, "learning_rate": 4.980429804487176e-06, "loss": 0.0657, "step": 439 }, { "epoch": 0.20018198362147407, "grad_norm": 0.7631308196097977, "learning_rate": 4.980340458291521e-06, "loss": 0.0519, "step": 440 }, { "epoch": 0.20063694267515925, "grad_norm": 0.7710009886187632, "learning_rate": 4.980250909413646e-06, "loss": 0.0668, "step": 441 }, { "epoch": 0.2010919017288444, "grad_norm": 0.8960590823111618, "learning_rate": 4.980161157860867e-06, "loss": 0.066, "step": 442 }, { "epoch": 0.20154686078252956, "grad_norm": 1.0148659081855533, "learning_rate": 4.980071203640519e-06, "loss": 0.0666, "step": 443 }, { "epoch": 0.20200181983621474, "grad_norm": 0.6157365971883945, "learning_rate": 4.979981046759952e-06, "loss": 0.0441, "step": 444 }, { "epoch": 0.2024567788898999, "grad_norm": 0.8862364575439057, "learning_rate": 4.979890687226533e-06, "loss": 0.0638, "step": 445 }, { "epoch": 0.20291173794358508, "grad_norm": 1.051789940808801, "learning_rate": 4.979800125047647e-06, "loss": 0.0571, "step": 446 }, { "epoch": 0.20336669699727025, "grad_norm": 0.8963335794848035, "learning_rate": 4.979709360230692e-06, "loss": 0.0706, "step": 447 }, { "epoch": 0.20382165605095542, "grad_norm": 0.8639092050645877, "learning_rate": 4.979618392783087e-06, "loss": 0.0535, "step": 448 }, { "epoch": 0.20427661510464057, "grad_norm": 0.630704913013139, "learning_rate": 4.979527222712266e-06, "loss": 0.0553, "step": 449 }, { "epoch": 0.20473157415832574, "grad_norm": 0.6653631844503811, "learning_rate": 4.9794358500256765e-06, "loss": 0.0438, "step": 450 }, { "epoch": 0.2051865332120109, "grad_norm": 0.8074584078493093, "learning_rate": 4.979344274730786e-06, "loss": 0.0607, "step": 451 }, { "epoch": 0.20564149226569609, "grad_norm": 1.1020725070982913, "learning_rate": 4.979252496835079e-06, "loss": 0.0812, "step": 452 }, { "epoch": 0.20609645131938126, "grad_norm": 1.2231182771798559, "learning_rate": 4.979160516346054e-06, "loss": 0.1074, "step": 453 }, { "epoch": 0.20655141037306643, "grad_norm": 26.716723850026153, "learning_rate": 4.979068333271227e-06, "loss": 0.8002, "step": 454 }, { "epoch": 0.2070063694267516, "grad_norm": 1.2123236026672213, "learning_rate": 4.978975947618131e-06, "loss": 0.0788, "step": 455 }, { "epoch": 0.20746132848043677, "grad_norm": 0.8671125203100531, "learning_rate": 4.978883359394316e-06, "loss": 0.0902, "step": 456 }, { "epoch": 0.20791628753412192, "grad_norm": 0.9848601155594614, "learning_rate": 4.978790568607347e-06, "loss": 0.0606, "step": 457 }, { "epoch": 0.2083712465878071, "grad_norm": 1.013839640652733, "learning_rate": 4.9786975752648076e-06, "loss": 0.0873, "step": 458 }, { "epoch": 0.20882620564149226, "grad_norm": 0.7483252407807567, "learning_rate": 4.978604379374295e-06, "loss": 0.0592, "step": 459 }, { "epoch": 0.20928116469517744, "grad_norm": 0.7178737508101655, "learning_rate": 4.978510980943427e-06, "loss": 0.0506, "step": 460 }, { "epoch": 0.2097361237488626, "grad_norm": 1.0919247632044238, "learning_rate": 4.978417379979834e-06, "loss": 0.0778, "step": 461 }, { "epoch": 0.21019108280254778, "grad_norm": 0.8331653357443332, "learning_rate": 4.978323576491165e-06, "loss": 0.0577, "step": 462 }, { "epoch": 0.21064604185623295, "grad_norm": 0.8152928496306786, "learning_rate": 4.978229570485085e-06, "loss": 0.072, "step": 463 }, { "epoch": 0.2111010009099181, "grad_norm": 0.751813285906743, "learning_rate": 4.978135361969276e-06, "loss": 0.0649, "step": 464 }, { "epoch": 0.21155595996360327, "grad_norm": 0.8232278152234197, "learning_rate": 4.9780409509514375e-06, "loss": 0.0642, "step": 465 }, { "epoch": 0.21201091901728844, "grad_norm": 1.5303665195432214, "learning_rate": 4.977946337439282e-06, "loss": 0.1217, "step": 466 }, { "epoch": 0.2124658780709736, "grad_norm": 0.9269370490140525, "learning_rate": 4.9778515214405436e-06, "loss": 0.081, "step": 467 }, { "epoch": 0.21292083712465878, "grad_norm": 0.8830556120481512, "learning_rate": 4.977756502962967e-06, "loss": 0.0684, "step": 468 }, { "epoch": 0.21337579617834396, "grad_norm": 0.6113061227600053, "learning_rate": 4.97766128201432e-06, "loss": 0.0446, "step": 469 }, { "epoch": 0.21383075523202913, "grad_norm": 0.6077789311617329, "learning_rate": 4.977565858602381e-06, "loss": 0.0554, "step": 470 }, { "epoch": 0.21428571428571427, "grad_norm": 0.8598515142264441, "learning_rate": 4.977470232734949e-06, "loss": 0.0727, "step": 471 }, { "epoch": 0.21474067333939945, "grad_norm": 0.8043286169945988, "learning_rate": 4.977374404419838e-06, "loss": 0.0592, "step": 472 }, { "epoch": 0.21519563239308462, "grad_norm": 0.7551382062036437, "learning_rate": 4.977278373664877e-06, "loss": 0.0571, "step": 473 }, { "epoch": 0.2156505914467698, "grad_norm": 1.8211283606473743, "learning_rate": 4.977182140477916e-06, "loss": 0.1033, "step": 474 }, { "epoch": 0.21610555050045496, "grad_norm": 0.7146087276289771, "learning_rate": 4.977085704866817e-06, "loss": 0.0462, "step": 475 }, { "epoch": 0.21656050955414013, "grad_norm": 0.6542895317184714, "learning_rate": 4.97698906683946e-06, "loss": 0.061, "step": 476 }, { "epoch": 0.2170154686078253, "grad_norm": 1.0732518420250663, "learning_rate": 4.9768922264037435e-06, "loss": 0.0845, "step": 477 }, { "epoch": 0.21747042766151045, "grad_norm": 0.6769767303273837, "learning_rate": 4.976795183567579e-06, "loss": 0.0484, "step": 478 }, { "epoch": 0.21792538671519562, "grad_norm": 0.6792925907901064, "learning_rate": 4.976697938338898e-06, "loss": 0.0479, "step": 479 }, { "epoch": 0.2183803457688808, "grad_norm": 0.550587338837319, "learning_rate": 4.976600490725645e-06, "loss": 0.0402, "step": 480 }, { "epoch": 0.21883530482256597, "grad_norm": 0.9934557115485821, "learning_rate": 4.976502840735785e-06, "loss": 0.1096, "step": 481 }, { "epoch": 0.21929026387625114, "grad_norm": 0.7026152824587227, "learning_rate": 4.976404988377297e-06, "loss": 0.0442, "step": 482 }, { "epoch": 0.2197452229299363, "grad_norm": 1.1796498075270252, "learning_rate": 4.976306933658176e-06, "loss": 0.0896, "step": 483 }, { "epoch": 0.22020018198362148, "grad_norm": 0.9196991108702705, "learning_rate": 4.976208676586435e-06, "loss": 0.0903, "step": 484 }, { "epoch": 0.22065514103730663, "grad_norm": 0.9221909008992407, "learning_rate": 4.976110217170104e-06, "loss": 0.061, "step": 485 }, { "epoch": 0.2211101000909918, "grad_norm": 0.8446946807888076, "learning_rate": 4.976011555417228e-06, "loss": 0.06, "step": 486 }, { "epoch": 0.22156505914467697, "grad_norm": 0.8008200895651435, "learning_rate": 4.975912691335869e-06, "loss": 0.0552, "step": 487 }, { "epoch": 0.22202001819836215, "grad_norm": 0.7897004108366357, "learning_rate": 4.975813624934106e-06, "loss": 0.0524, "step": 488 }, { "epoch": 0.22247497725204732, "grad_norm": 0.7656059256782066, "learning_rate": 4.975714356220035e-06, "loss": 0.0532, "step": 489 }, { "epoch": 0.2229299363057325, "grad_norm": 0.49990009073007735, "learning_rate": 4.975614885201766e-06, "loss": 0.0335, "step": 490 }, { "epoch": 0.22338489535941766, "grad_norm": 0.7764965839211172, "learning_rate": 4.975515211887429e-06, "loss": 0.0663, "step": 491 }, { "epoch": 0.22383985441310283, "grad_norm": 0.8335023150964008, "learning_rate": 4.9754153362851684e-06, "loss": 0.0635, "step": 492 }, { "epoch": 0.22429481346678798, "grad_norm": 1.362631121260362, "learning_rate": 4.975315258403145e-06, "loss": 0.1184, "step": 493 }, { "epoch": 0.22474977252047315, "grad_norm": 0.8072718888075444, "learning_rate": 4.975214978249537e-06, "loss": 0.0575, "step": 494 }, { "epoch": 0.22520473157415832, "grad_norm": 0.7237599062848806, "learning_rate": 4.975114495832539e-06, "loss": 0.0629, "step": 495 }, { "epoch": 0.2256596906278435, "grad_norm": 0.9013757169049615, "learning_rate": 4.975013811160362e-06, "loss": 0.0641, "step": 496 }, { "epoch": 0.22611464968152867, "grad_norm": 1.046688141426079, "learning_rate": 4.974912924241233e-06, "loss": 0.0679, "step": 497 }, { "epoch": 0.22656960873521384, "grad_norm": 0.7549334371309422, "learning_rate": 4.974811835083397e-06, "loss": 0.0619, "step": 498 }, { "epoch": 0.227024567788899, "grad_norm": 1.4092663615099252, "learning_rate": 4.974710543695114e-06, "loss": 0.0907, "step": 499 }, { "epoch": 0.22747952684258416, "grad_norm": 1.2767203765961839, "learning_rate": 4.974609050084661e-06, "loss": 0.1037, "step": 500 }, { "epoch": 0.22793448589626933, "grad_norm": 0.957265553607594, "learning_rate": 4.974507354260332e-06, "loss": 0.0841, "step": 501 }, { "epoch": 0.2283894449499545, "grad_norm": 1.0285318937850472, "learning_rate": 4.974405456230436e-06, "loss": 0.0876, "step": 502 }, { "epoch": 0.22884440400363967, "grad_norm": 0.9438000836090487, "learning_rate": 4.974303356003301e-06, "loss": 0.0618, "step": 503 }, { "epoch": 0.22929936305732485, "grad_norm": 0.7641433481492992, "learning_rate": 4.974201053587268e-06, "loss": 0.0623, "step": 504 }, { "epoch": 0.22975432211101002, "grad_norm": 0.7211862506979909, "learning_rate": 4.9740985489907005e-06, "loss": 0.0458, "step": 505 }, { "epoch": 0.2302092811646952, "grad_norm": 1.3113691041435898, "learning_rate": 4.973995842221971e-06, "loss": 0.0865, "step": 506 }, { "epoch": 0.23066424021838033, "grad_norm": 1.1027187330835053, "learning_rate": 4.973892933289476e-06, "loss": 0.0817, "step": 507 }, { "epoch": 0.2311191992720655, "grad_norm": 0.8000847819873458, "learning_rate": 4.97378982220162e-06, "loss": 0.0639, "step": 508 }, { "epoch": 0.23157415832575068, "grad_norm": 0.5709614643890362, "learning_rate": 4.973686508966832e-06, "loss": 0.0427, "step": 509 }, { "epoch": 0.23202911737943585, "grad_norm": 0.6348346044427912, "learning_rate": 4.973582993593554e-06, "loss": 0.0453, "step": 510 }, { "epoch": 0.23248407643312102, "grad_norm": 0.7080077445614887, "learning_rate": 4.973479276090244e-06, "loss": 0.0567, "step": 511 }, { "epoch": 0.2329390354868062, "grad_norm": 0.586722983901754, "learning_rate": 4.973375356465378e-06, "loss": 0.0398, "step": 512 }, { "epoch": 0.23339399454049137, "grad_norm": 0.9373759345632122, "learning_rate": 4.973271234727447e-06, "loss": 0.083, "step": 513 }, { "epoch": 0.2338489535941765, "grad_norm": 0.7290102387520916, "learning_rate": 4.97316691088496e-06, "loss": 0.0573, "step": 514 }, { "epoch": 0.23430391264786168, "grad_norm": 1.2047650698868653, "learning_rate": 4.973062384946442e-06, "loss": 0.0979, "step": 515 }, { "epoch": 0.23475887170154686, "grad_norm": 0.5553854533375087, "learning_rate": 4.9729576569204345e-06, "loss": 0.0493, "step": 516 }, { "epoch": 0.23521383075523203, "grad_norm": 0.966683679171784, "learning_rate": 4.972852726815495e-06, "loss": 0.0744, "step": 517 }, { "epoch": 0.2356687898089172, "grad_norm": 0.8972567842292303, "learning_rate": 4.972747594640197e-06, "loss": 0.0822, "step": 518 }, { "epoch": 0.23612374886260237, "grad_norm": 0.9532248896529997, "learning_rate": 4.9726422604031335e-06, "loss": 0.0628, "step": 519 }, { "epoch": 0.23657870791628755, "grad_norm": 0.5831731409388041, "learning_rate": 4.97253672411291e-06, "loss": 0.0499, "step": 520 }, { "epoch": 0.2370336669699727, "grad_norm": 0.7629148584956371, "learning_rate": 4.972430985778152e-06, "loss": 0.0502, "step": 521 }, { "epoch": 0.23748862602365786, "grad_norm": 0.8867114815888714, "learning_rate": 4.972325045407499e-06, "loss": 0.0551, "step": 522 }, { "epoch": 0.23794358507734303, "grad_norm": 1.2463480840549028, "learning_rate": 4.972218903009608e-06, "loss": 0.0715, "step": 523 }, { "epoch": 0.2383985441310282, "grad_norm": 0.782156462915191, "learning_rate": 4.972112558593153e-06, "loss": 0.0658, "step": 524 }, { "epoch": 0.23885350318471338, "grad_norm": 0.5674610459457798, "learning_rate": 4.972006012166823e-06, "loss": 0.0443, "step": 525 }, { "epoch": 0.23930846223839855, "grad_norm": 0.6676557313621811, "learning_rate": 4.971899263739326e-06, "loss": 0.052, "step": 526 }, { "epoch": 0.23976342129208372, "grad_norm": 0.8996461781463584, "learning_rate": 4.971792313319384e-06, "loss": 0.0761, "step": 527 }, { "epoch": 0.24021838034576887, "grad_norm": 0.7869388715576839, "learning_rate": 4.971685160915737e-06, "loss": 0.059, "step": 528 }, { "epoch": 0.24067333939945404, "grad_norm": 0.8601250360554993, "learning_rate": 4.971577806537139e-06, "loss": 0.058, "step": 529 }, { "epoch": 0.2411282984531392, "grad_norm": 0.860384363291072, "learning_rate": 4.971470250192366e-06, "loss": 0.0746, "step": 530 }, { "epoch": 0.24158325750682438, "grad_norm": 17.481585256275345, "learning_rate": 4.9713624918902045e-06, "loss": 0.3357, "step": 531 }, { "epoch": 0.24203821656050956, "grad_norm": 1.3228769141545746, "learning_rate": 4.971254531639461e-06, "loss": 0.0978, "step": 532 }, { "epoch": 0.24249317561419473, "grad_norm": 0.9022991420443233, "learning_rate": 4.971146369448957e-06, "loss": 0.073, "step": 533 }, { "epoch": 0.2429481346678799, "grad_norm": 0.8487996347147105, "learning_rate": 4.971038005327532e-06, "loss": 0.0772, "step": 534 }, { "epoch": 0.24340309372156507, "grad_norm": 1.0939700661439853, "learning_rate": 4.970929439284039e-06, "loss": 0.1052, "step": 535 }, { "epoch": 0.24385805277525022, "grad_norm": 1.1117793169544092, "learning_rate": 4.970820671327351e-06, "loss": 0.0838, "step": 536 }, { "epoch": 0.2443130118289354, "grad_norm": 0.5711568883528185, "learning_rate": 4.9707117014663565e-06, "loss": 0.0477, "step": 537 }, { "epoch": 0.24476797088262056, "grad_norm": 0.9911963887990124, "learning_rate": 4.97060252970996e-06, "loss": 0.0859, "step": 538 }, { "epoch": 0.24522292993630573, "grad_norm": 0.8786877928757788, "learning_rate": 4.970493156067081e-06, "loss": 0.0672, "step": 539 }, { "epoch": 0.2456778889899909, "grad_norm": 0.6358718673962386, "learning_rate": 4.970383580546658e-06, "loss": 0.049, "step": 540 }, { "epoch": 0.24613284804367608, "grad_norm": 0.9673038276315246, "learning_rate": 4.970273803157645e-06, "loss": 0.0789, "step": 541 }, { "epoch": 0.24658780709736125, "grad_norm": 0.7896663626576268, "learning_rate": 4.970163823909013e-06, "loss": 0.0636, "step": 542 }, { "epoch": 0.2470427661510464, "grad_norm": 0.7725841407720596, "learning_rate": 4.970053642809748e-06, "loss": 0.0591, "step": 543 }, { "epoch": 0.24749772520473157, "grad_norm": 0.8834486709832678, "learning_rate": 4.969943259868853e-06, "loss": 0.0741, "step": 544 }, { "epoch": 0.24795268425841674, "grad_norm": 0.9862513700188255, "learning_rate": 4.969832675095351e-06, "loss": 0.0733, "step": 545 }, { "epoch": 0.2484076433121019, "grad_norm": 0.9230048911450578, "learning_rate": 4.969721888498275e-06, "loss": 0.0784, "step": 546 }, { "epoch": 0.24886260236578708, "grad_norm": 0.678321429576158, "learning_rate": 4.96961090008668e-06, "loss": 0.0548, "step": 547 }, { "epoch": 0.24931756141947226, "grad_norm": 1.0377618196684284, "learning_rate": 4.969499709869635e-06, "loss": 0.0972, "step": 548 }, { "epoch": 0.24977252047315743, "grad_norm": 1.0401408232919482, "learning_rate": 4.969388317856225e-06, "loss": 0.0803, "step": 549 }, { "epoch": 0.2502274795268426, "grad_norm": 1.1187089275098543, "learning_rate": 4.969276724055554e-06, "loss": 0.0959, "step": 550 }, { "epoch": 0.25068243858052774, "grad_norm": 0.955462869329459, "learning_rate": 4.969164928476741e-06, "loss": 0.0676, "step": 551 }, { "epoch": 0.25113739763421294, "grad_norm": 0.8046461909524141, "learning_rate": 4.969052931128919e-06, "loss": 0.0648, "step": 552 }, { "epoch": 0.2515923566878981, "grad_norm": 0.7081920862352523, "learning_rate": 4.968940732021243e-06, "loss": 0.0603, "step": 553 }, { "epoch": 0.25204731574158323, "grad_norm": 0.9857688144173427, "learning_rate": 4.9688283311628795e-06, "loss": 0.0918, "step": 554 }, { "epoch": 0.25250227479526843, "grad_norm": 0.8534813080817202, "learning_rate": 4.968715728563014e-06, "loss": 0.0679, "step": 555 }, { "epoch": 0.2529572338489536, "grad_norm": 0.5525293734820541, "learning_rate": 4.968602924230847e-06, "loss": 0.0413, "step": 556 }, { "epoch": 0.2534121929026388, "grad_norm": 1.2973130655518506, "learning_rate": 4.968489918175598e-06, "loss": 0.085, "step": 557 }, { "epoch": 0.2538671519563239, "grad_norm": 1.1050509785585005, "learning_rate": 4.9683767104065014e-06, "loss": 0.0758, "step": 558 }, { "epoch": 0.2543221110100091, "grad_norm": 0.7040398410425142, "learning_rate": 4.968263300932806e-06, "loss": 0.0484, "step": 559 }, { "epoch": 0.25477707006369427, "grad_norm": 0.5860142844568907, "learning_rate": 4.968149689763781e-06, "loss": 0.0477, "step": 560 }, { "epoch": 0.2552320291173794, "grad_norm": 0.7720444359609591, "learning_rate": 4.968035876908708e-06, "loss": 0.0716, "step": 561 }, { "epoch": 0.2556869881710646, "grad_norm": 0.9073150271174998, "learning_rate": 4.967921862376889e-06, "loss": 0.0775, "step": 562 }, { "epoch": 0.25614194722474976, "grad_norm": 1.0634349883750702, "learning_rate": 4.9678076461776415e-06, "loss": 0.0843, "step": 563 }, { "epoch": 0.25659690627843496, "grad_norm": 1.0056095668838196, "learning_rate": 4.9676932283202965e-06, "loss": 0.0845, "step": 564 }, { "epoch": 0.2570518653321201, "grad_norm": 1.0935445939907518, "learning_rate": 4.967578608814205e-06, "loss": 0.0844, "step": 565 }, { "epoch": 0.2575068243858053, "grad_norm": 0.736642742743355, "learning_rate": 4.9674637876687345e-06, "loss": 0.0683, "step": 566 }, { "epoch": 0.25796178343949044, "grad_norm": 0.5647018247667355, "learning_rate": 4.967348764893265e-06, "loss": 0.0453, "step": 567 }, { "epoch": 0.2584167424931756, "grad_norm": 0.882122047098411, "learning_rate": 4.967233540497197e-06, "loss": 0.0575, "step": 568 }, { "epoch": 0.2588717015468608, "grad_norm": 0.8934709872440615, "learning_rate": 4.967118114489946e-06, "loss": 0.0562, "step": 569 }, { "epoch": 0.25932666060054593, "grad_norm": 0.7282868782108531, "learning_rate": 4.967002486880944e-06, "loss": 0.0486, "step": 570 }, { "epoch": 0.25978161965423113, "grad_norm": 1.0082182066998666, "learning_rate": 4.966886657679641e-06, "loss": 0.0766, "step": 571 }, { "epoch": 0.2602365787079163, "grad_norm": 1.348224587830696, "learning_rate": 4.966770626895499e-06, "loss": 0.0845, "step": 572 }, { "epoch": 0.2606915377616015, "grad_norm": 0.9025549046907797, "learning_rate": 4.966654394538002e-06, "loss": 0.0738, "step": 573 }, { "epoch": 0.2611464968152866, "grad_norm": 0.639234711788688, "learning_rate": 4.966537960616646e-06, "loss": 0.0495, "step": 574 }, { "epoch": 0.26160145586897177, "grad_norm": 1.0233503597101892, "learning_rate": 4.9664213251409486e-06, "loss": 0.0637, "step": 575 }, { "epoch": 0.26205641492265697, "grad_norm": 1.1124626991947715, "learning_rate": 4.9663044881204375e-06, "loss": 0.1045, "step": 576 }, { "epoch": 0.2625113739763421, "grad_norm": 0.837344371555646, "learning_rate": 4.9661874495646615e-06, "loss": 0.0646, "step": 577 }, { "epoch": 0.2629663330300273, "grad_norm": 0.7068866993603214, "learning_rate": 4.9660702094831845e-06, "loss": 0.0619, "step": 578 }, { "epoch": 0.26342129208371245, "grad_norm": 0.9495992534959607, "learning_rate": 4.965952767885587e-06, "loss": 0.0635, "step": 579 }, { "epoch": 0.26387625113739765, "grad_norm": 1.1302962930077667, "learning_rate": 4.965835124781465e-06, "loss": 0.0852, "step": 580 }, { "epoch": 0.2643312101910828, "grad_norm": 0.6086948754935466, "learning_rate": 4.965717280180432e-06, "loss": 0.0551, "step": 581 }, { "epoch": 0.26478616924476794, "grad_norm": 0.7060150486109749, "learning_rate": 4.965599234092118e-06, "loss": 0.0546, "step": 582 }, { "epoch": 0.26524112829845314, "grad_norm": 0.8543624689211352, "learning_rate": 4.96548098652617e-06, "loss": 0.0754, "step": 583 }, { "epoch": 0.2656960873521383, "grad_norm": 1.0333432760993717, "learning_rate": 4.965362537492249e-06, "loss": 0.0864, "step": 584 }, { "epoch": 0.2661510464058235, "grad_norm": 0.7262201210260119, "learning_rate": 4.9652438870000356e-06, "loss": 0.0555, "step": 585 }, { "epoch": 0.26660600545950863, "grad_norm": 0.5794267973617044, "learning_rate": 4.965125035059224e-06, "loss": 0.0553, "step": 586 }, { "epoch": 0.26706096451319383, "grad_norm": 0.7938840996429771, "learning_rate": 4.965005981679527e-06, "loss": 0.0624, "step": 587 }, { "epoch": 0.267515923566879, "grad_norm": 1.1569186716649804, "learning_rate": 4.964886726870673e-06, "loss": 0.0905, "step": 588 }, { "epoch": 0.2679708826205642, "grad_norm": 0.8131461154142043, "learning_rate": 4.964767270642407e-06, "loss": 0.0542, "step": 589 }, { "epoch": 0.2684258416742493, "grad_norm": 0.9434951271078357, "learning_rate": 4.964647613004491e-06, "loss": 0.0747, "step": 590 }, { "epoch": 0.26888080072793447, "grad_norm": 0.7006034344602099, "learning_rate": 4.964527753966702e-06, "loss": 0.0512, "step": 591 }, { "epoch": 0.26933575978161967, "grad_norm": 0.684347542401468, "learning_rate": 4.964407693538834e-06, "loss": 0.0573, "step": 592 }, { "epoch": 0.2697907188353048, "grad_norm": 1.0140148730488754, "learning_rate": 4.9642874317307e-06, "loss": 0.0843, "step": 593 }, { "epoch": 0.27024567788899, "grad_norm": 0.8814518099865631, "learning_rate": 4.964166968552124e-06, "loss": 0.0874, "step": 594 }, { "epoch": 0.27070063694267515, "grad_norm": 0.7465912736193613, "learning_rate": 4.9640463040129525e-06, "loss": 0.0516, "step": 595 }, { "epoch": 0.27115559599636035, "grad_norm": 0.6714302701088581, "learning_rate": 4.963925438123044e-06, "loss": 0.0454, "step": 596 }, { "epoch": 0.2716105550500455, "grad_norm": 0.8301899302632495, "learning_rate": 4.963804370892276e-06, "loss": 0.0647, "step": 597 }, { "epoch": 0.27206551410373064, "grad_norm": 1.0257561895944438, "learning_rate": 4.9636831023305405e-06, "loss": 0.087, "step": 598 }, { "epoch": 0.27252047315741584, "grad_norm": 0.6785463668955102, "learning_rate": 4.963561632447748e-06, "loss": 0.0478, "step": 599 }, { "epoch": 0.272975432211101, "grad_norm": 0.6400387189893691, "learning_rate": 4.9634399612538255e-06, "loss": 0.0461, "step": 600 }, { "epoch": 0.2734303912647862, "grad_norm": 12.91966872067954, "learning_rate": 4.963318088758714e-06, "loss": 0.2613, "step": 601 }, { "epoch": 0.27388535031847133, "grad_norm": 0.797345248624046, "learning_rate": 4.963196014972371e-06, "loss": 0.0525, "step": 602 }, { "epoch": 0.27434030937215653, "grad_norm": 0.7395484231820286, "learning_rate": 4.963073739904775e-06, "loss": 0.0555, "step": 603 }, { "epoch": 0.2747952684258417, "grad_norm": 0.7278963125143824, "learning_rate": 4.962951263565915e-06, "loss": 0.0516, "step": 604 }, { "epoch": 0.2752502274795268, "grad_norm": 1.1277517505176968, "learning_rate": 4.962828585965801e-06, "loss": 0.0682, "step": 605 }, { "epoch": 0.275705186533212, "grad_norm": 0.8383324875513333, "learning_rate": 4.962705707114457e-06, "loss": 0.0653, "step": 606 }, { "epoch": 0.27616014558689717, "grad_norm": 0.8259808296763246, "learning_rate": 4.962582627021923e-06, "loss": 0.067, "step": 607 }, { "epoch": 0.27661510464058237, "grad_norm": 1.0342239300300777, "learning_rate": 4.962459345698258e-06, "loss": 0.0818, "step": 608 }, { "epoch": 0.2770700636942675, "grad_norm": 0.9575696366695832, "learning_rate": 4.962335863153537e-06, "loss": 0.0774, "step": 609 }, { "epoch": 0.2775250227479527, "grad_norm": 0.6545425176058414, "learning_rate": 4.962212179397847e-06, "loss": 0.0559, "step": 610 }, { "epoch": 0.27797998180163785, "grad_norm": 0.762093168945694, "learning_rate": 4.962088294441299e-06, "loss": 0.0486, "step": 611 }, { "epoch": 0.278434940855323, "grad_norm": 0.7471732165438408, "learning_rate": 4.9619642082940135e-06, "loss": 0.0653, "step": 612 }, { "epoch": 0.2788898999090082, "grad_norm": 0.9260147407911207, "learning_rate": 4.9618399209661305e-06, "loss": 0.0793, "step": 613 }, { "epoch": 0.27934485896269334, "grad_norm": 0.7947481326693611, "learning_rate": 4.961715432467807e-06, "loss": 0.0494, "step": 614 }, { "epoch": 0.27979981801637854, "grad_norm": 0.676048071191175, "learning_rate": 4.961590742809216e-06, "loss": 0.0499, "step": 615 }, { "epoch": 0.2802547770700637, "grad_norm": 0.776818740518446, "learning_rate": 4.961465852000545e-06, "loss": 0.0622, "step": 616 }, { "epoch": 0.2807097361237489, "grad_norm": 0.9447441776692052, "learning_rate": 4.961340760052001e-06, "loss": 0.0605, "step": 617 }, { "epoch": 0.28116469517743403, "grad_norm": 0.7631458630471165, "learning_rate": 4.961215466973806e-06, "loss": 0.0517, "step": 618 }, { "epoch": 0.2816196542311192, "grad_norm": 1.4586634447892357, "learning_rate": 4.961089972776197e-06, "loss": 0.1213, "step": 619 }, { "epoch": 0.2820746132848044, "grad_norm": 0.7305250205616287, "learning_rate": 4.9609642774694285e-06, "loss": 0.0491, "step": 620 }, { "epoch": 0.2825295723384895, "grad_norm": 1.0406398369078378, "learning_rate": 4.960838381063774e-06, "loss": 0.0663, "step": 621 }, { "epoch": 0.2829845313921747, "grad_norm": 1.2055388650663057, "learning_rate": 4.960712283569521e-06, "loss": 0.0954, "step": 622 }, { "epoch": 0.28343949044585987, "grad_norm": 1.0747699071981542, "learning_rate": 4.960585984996971e-06, "loss": 0.0708, "step": 623 }, { "epoch": 0.28389444949954507, "grad_norm": 0.8880949030661993, "learning_rate": 4.960459485356447e-06, "loss": 0.0863, "step": 624 }, { "epoch": 0.2843494085532302, "grad_norm": 0.7242068952577403, "learning_rate": 4.960332784658285e-06, "loss": 0.0626, "step": 625 }, { "epoch": 0.28480436760691535, "grad_norm": 1.1695122446283712, "learning_rate": 4.960205882912839e-06, "loss": 0.0891, "step": 626 }, { "epoch": 0.28525932666060055, "grad_norm": 1.3443620909335443, "learning_rate": 4.9600787801304785e-06, "loss": 0.125, "step": 627 }, { "epoch": 0.2857142857142857, "grad_norm": 1.155929017304206, "learning_rate": 4.959951476321589e-06, "loss": 0.0871, "step": 628 }, { "epoch": 0.2861692447679709, "grad_norm": 0.5929895200550217, "learning_rate": 4.959823971496575e-06, "loss": 0.0552, "step": 629 }, { "epoch": 0.28662420382165604, "grad_norm": 2.1679171173353, "learning_rate": 4.959696265665853e-06, "loss": 0.1544, "step": 630 }, { "epoch": 0.28707916287534124, "grad_norm": 0.5970059996178373, "learning_rate": 4.959568358839862e-06, "loss": 0.0341, "step": 631 }, { "epoch": 0.2875341219290264, "grad_norm": 0.7134802025387853, "learning_rate": 4.95944025102905e-06, "loss": 0.0561, "step": 632 }, { "epoch": 0.28798908098271153, "grad_norm": 1.0949999376535642, "learning_rate": 4.959311942243888e-06, "loss": 0.0847, "step": 633 }, { "epoch": 0.28844404003639673, "grad_norm": 0.6935143421827713, "learning_rate": 4.95918343249486e-06, "loss": 0.0527, "step": 634 }, { "epoch": 0.2888989990900819, "grad_norm": 0.9012545498209729, "learning_rate": 4.959054721792469e-06, "loss": 0.0765, "step": 635 }, { "epoch": 0.2893539581437671, "grad_norm": 0.718784967956377, "learning_rate": 4.958925810147231e-06, "loss": 0.0521, "step": 636 }, { "epoch": 0.2898089171974522, "grad_norm": 0.8036871187269504, "learning_rate": 4.958796697569679e-06, "loss": 0.0575, "step": 637 }, { "epoch": 0.2902638762511374, "grad_norm": 0.7083108170821438, "learning_rate": 4.958667384070365e-06, "loss": 0.0474, "step": 638 }, { "epoch": 0.29071883530482256, "grad_norm": 0.8852706168812431, "learning_rate": 4.958537869659855e-06, "loss": 0.078, "step": 639 }, { "epoch": 0.2911737943585077, "grad_norm": 0.7427839621337862, "learning_rate": 4.958408154348734e-06, "loss": 0.0481, "step": 640 }, { "epoch": 0.2916287534121929, "grad_norm": 0.8545728670719058, "learning_rate": 4.9582782381476e-06, "loss": 0.0775, "step": 641 }, { "epoch": 0.29208371246587805, "grad_norm": 1.112593592536602, "learning_rate": 4.958148121067071e-06, "loss": 0.1085, "step": 642 }, { "epoch": 0.29253867151956325, "grad_norm": 0.8382409456326524, "learning_rate": 4.9580178031177775e-06, "loss": 0.0834, "step": 643 }, { "epoch": 0.2929936305732484, "grad_norm": 0.7181010571245596, "learning_rate": 4.9578872843103694e-06, "loss": 0.0706, "step": 644 }, { "epoch": 0.2934485896269336, "grad_norm": 0.838533684989998, "learning_rate": 4.957756564655513e-06, "loss": 0.0699, "step": 645 }, { "epoch": 0.29390354868061874, "grad_norm": 0.8489478508314325, "learning_rate": 4.957625644163888e-06, "loss": 0.0786, "step": 646 }, { "epoch": 0.2943585077343039, "grad_norm": 0.880567536766304, "learning_rate": 4.957494522846194e-06, "loss": 0.0634, "step": 647 }, { "epoch": 0.2948134667879891, "grad_norm": 0.9985911504789262, "learning_rate": 4.957363200713146e-06, "loss": 0.0971, "step": 648 }, { "epoch": 0.29526842584167423, "grad_norm": 0.7465438435071795, "learning_rate": 4.957231677775475e-06, "loss": 0.0543, "step": 649 }, { "epoch": 0.29572338489535943, "grad_norm": 1.0873754674099565, "learning_rate": 4.957099954043928e-06, "loss": 0.0975, "step": 650 }, { "epoch": 0.2961783439490446, "grad_norm": 0.7743109510832679, "learning_rate": 4.956968029529269e-06, "loss": 0.0782, "step": 651 }, { "epoch": 0.2966333030027298, "grad_norm": 0.8294361612806938, "learning_rate": 4.956835904242277e-06, "loss": 0.0741, "step": 652 }, { "epoch": 0.2970882620564149, "grad_norm": 0.5971734320200014, "learning_rate": 4.9567035781937516e-06, "loss": 0.0382, "step": 653 }, { "epoch": 0.29754322111010006, "grad_norm": 0.9121379516261049, "learning_rate": 4.9565710513945024e-06, "loss": 0.0639, "step": 654 }, { "epoch": 0.29799818016378526, "grad_norm": 0.9983314125142588, "learning_rate": 4.956438323855362e-06, "loss": 0.0745, "step": 655 }, { "epoch": 0.2984531392174704, "grad_norm": 1.057472958552687, "learning_rate": 4.956305395587174e-06, "loss": 0.091, "step": 656 }, { "epoch": 0.2989080982711556, "grad_norm": 0.7245370640267725, "learning_rate": 4.956172266600802e-06, "loss": 0.0566, "step": 657 }, { "epoch": 0.29936305732484075, "grad_norm": 0.7068763180795751, "learning_rate": 4.956038936907125e-06, "loss": 0.0523, "step": 658 }, { "epoch": 0.29981801637852595, "grad_norm": 0.7580044270083526, "learning_rate": 4.955905406517036e-06, "loss": 0.0515, "step": 659 }, { "epoch": 0.3002729754322111, "grad_norm": 0.9984766712753593, "learning_rate": 4.95577167544145e-06, "loss": 0.0793, "step": 660 }, { "epoch": 0.30072793448589624, "grad_norm": 0.9743764266009726, "learning_rate": 4.955637743691291e-06, "loss": 0.0726, "step": 661 }, { "epoch": 0.30118289353958144, "grad_norm": 0.9004872852534804, "learning_rate": 4.955503611277506e-06, "loss": 0.0652, "step": 662 }, { "epoch": 0.3016378525932666, "grad_norm": 0.79764221013725, "learning_rate": 4.955369278211055e-06, "loss": 0.0536, "step": 663 }, { "epoch": 0.3020928116469518, "grad_norm": 0.9464070117627001, "learning_rate": 4.955234744502914e-06, "loss": 0.0662, "step": 664 }, { "epoch": 0.30254777070063693, "grad_norm": 1.0516204782864038, "learning_rate": 4.955100010164079e-06, "loss": 0.081, "step": 665 }, { "epoch": 0.30300272975432213, "grad_norm": 0.7302891890803844, "learning_rate": 4.954965075205557e-06, "loss": 0.0513, "step": 666 }, { "epoch": 0.3034576888080073, "grad_norm": 2.2645221771727537, "learning_rate": 4.9548299396383755e-06, "loss": 0.1286, "step": 667 }, { "epoch": 0.3039126478616925, "grad_norm": 0.8238624251883593, "learning_rate": 4.954694603473578e-06, "loss": 0.0514, "step": 668 }, { "epoch": 0.3043676069153776, "grad_norm": 1.1280026909584604, "learning_rate": 4.954559066722222e-06, "loss": 0.0872, "step": 669 }, { "epoch": 0.30482256596906276, "grad_norm": 0.9142124471934124, "learning_rate": 4.954423329395385e-06, "loss": 0.0795, "step": 670 }, { "epoch": 0.30527752502274796, "grad_norm": 1.0203604655093028, "learning_rate": 4.954287391504156e-06, "loss": 0.0887, "step": 671 }, { "epoch": 0.3057324840764331, "grad_norm": 1.137669290207854, "learning_rate": 4.9541512530596455e-06, "loss": 0.0946, "step": 672 }, { "epoch": 0.3061874431301183, "grad_norm": 0.8645784943070317, "learning_rate": 4.954014914072978e-06, "loss": 0.069, "step": 673 }, { "epoch": 0.30664240218380345, "grad_norm": 0.7612707675408733, "learning_rate": 4.9538783745552934e-06, "loss": 0.0655, "step": 674 }, { "epoch": 0.30709736123748865, "grad_norm": 1.266400392804822, "learning_rate": 4.95374163451775e-06, "loss": 0.0993, "step": 675 }, { "epoch": 0.3075523202911738, "grad_norm": 0.9601214063413259, "learning_rate": 4.953604693971521e-06, "loss": 0.066, "step": 676 }, { "epoch": 0.30800727934485894, "grad_norm": 0.7836256655266565, "learning_rate": 4.953467552927798e-06, "loss": 0.042, "step": 677 }, { "epoch": 0.30846223839854414, "grad_norm": 1.300589530536382, "learning_rate": 4.9533302113977845e-06, "loss": 0.0899, "step": 678 }, { "epoch": 0.3089171974522293, "grad_norm": 1.1474483826754185, "learning_rate": 4.9531926693927055e-06, "loss": 0.0808, "step": 679 }, { "epoch": 0.3093721565059145, "grad_norm": 0.9596950226202976, "learning_rate": 4.953054926923801e-06, "loss": 0.0795, "step": 680 }, { "epoch": 0.30982711555959963, "grad_norm": 1.0372662479293318, "learning_rate": 4.952916984002325e-06, "loss": 0.0726, "step": 681 }, { "epoch": 0.31028207461328483, "grad_norm": 0.9537762571435, "learning_rate": 4.95277884063955e-06, "loss": 0.0784, "step": 682 }, { "epoch": 0.31073703366697, "grad_norm": 0.7652431915975989, "learning_rate": 4.952640496846766e-06, "loss": 0.0736, "step": 683 }, { "epoch": 0.3111919927206551, "grad_norm": 0.6958333798668543, "learning_rate": 4.952501952635276e-06, "loss": 0.0563, "step": 684 }, { "epoch": 0.3116469517743403, "grad_norm": 1.1475385694550302, "learning_rate": 4.952363208016402e-06, "loss": 0.0969, "step": 685 }, { "epoch": 0.31210191082802546, "grad_norm": 0.8003142285493542, "learning_rate": 4.952224263001482e-06, "loss": 0.0499, "step": 686 }, { "epoch": 0.31255686988171066, "grad_norm": 0.5867807916718473, "learning_rate": 4.952085117601868e-06, "loss": 0.0477, "step": 687 }, { "epoch": 0.3130118289353958, "grad_norm": 1.0024535273535888, "learning_rate": 4.951945771828933e-06, "loss": 0.0999, "step": 688 }, { "epoch": 0.313466787989081, "grad_norm": 0.6050221715881425, "learning_rate": 4.951806225694061e-06, "loss": 0.059, "step": 689 }, { "epoch": 0.31392174704276615, "grad_norm": 0.9849063687092052, "learning_rate": 4.951666479208658e-06, "loss": 0.072, "step": 690 }, { "epoch": 0.3143767060964513, "grad_norm": 0.6941254969326264, "learning_rate": 4.951526532384141e-06, "loss": 0.0561, "step": 691 }, { "epoch": 0.3148316651501365, "grad_norm": 0.9726304738330778, "learning_rate": 4.951386385231946e-06, "loss": 0.0717, "step": 692 }, { "epoch": 0.31528662420382164, "grad_norm": 0.6220356445541609, "learning_rate": 4.951246037763528e-06, "loss": 0.0468, "step": 693 }, { "epoch": 0.31574158325750684, "grad_norm": 0.8409955006728991, "learning_rate": 4.9511054899903524e-06, "loss": 0.0547, "step": 694 }, { "epoch": 0.316196542311192, "grad_norm": 0.9382334215030735, "learning_rate": 4.950964741923905e-06, "loss": 0.0741, "step": 695 }, { "epoch": 0.3166515013648772, "grad_norm": 0.782635605280389, "learning_rate": 4.950823793575688e-06, "loss": 0.0581, "step": 696 }, { "epoch": 0.31710646041856233, "grad_norm": 1.2001834327909027, "learning_rate": 4.950682644957218e-06, "loss": 0.0963, "step": 697 }, { "epoch": 0.3175614194722475, "grad_norm": 1.1857201792014638, "learning_rate": 4.9505412960800295e-06, "loss": 0.0883, "step": 698 }, { "epoch": 0.3180163785259327, "grad_norm": 0.8662471412292134, "learning_rate": 4.950399746955673e-06, "loss": 0.0707, "step": 699 }, { "epoch": 0.3184713375796178, "grad_norm": 0.886786111665171, "learning_rate": 4.950257997595716e-06, "loss": 0.0647, "step": 700 }, { "epoch": 0.318926296633303, "grad_norm": 0.945494756521827, "learning_rate": 4.950116048011739e-06, "loss": 0.0682, "step": 701 }, { "epoch": 0.31938125568698816, "grad_norm": 0.5880480229225298, "learning_rate": 4.949973898215344e-06, "loss": 0.0371, "step": 702 }, { "epoch": 0.31983621474067336, "grad_norm": 0.8807599912178138, "learning_rate": 4.949831548218146e-06, "loss": 0.0685, "step": 703 }, { "epoch": 0.3202911737943585, "grad_norm": 1.0002963777426177, "learning_rate": 4.949688998031777e-06, "loss": 0.0727, "step": 704 }, { "epoch": 0.32074613284804365, "grad_norm": 0.7255329110919103, "learning_rate": 4.949546247667886e-06, "loss": 0.05, "step": 705 }, { "epoch": 0.32120109190172885, "grad_norm": 0.9082181101140028, "learning_rate": 4.949403297138137e-06, "loss": 0.0649, "step": 706 }, { "epoch": 0.321656050955414, "grad_norm": 0.805531599078662, "learning_rate": 4.949260146454212e-06, "loss": 0.0729, "step": 707 }, { "epoch": 0.3221110100090992, "grad_norm": 0.854802575466473, "learning_rate": 4.94911679562781e-06, "loss": 0.0562, "step": 708 }, { "epoch": 0.32256596906278434, "grad_norm": 1.0984796609960896, "learning_rate": 4.948973244670643e-06, "loss": 0.0725, "step": 709 }, { "epoch": 0.32302092811646954, "grad_norm": 0.7556324629138267, "learning_rate": 4.948829493594441e-06, "loss": 0.0544, "step": 710 }, { "epoch": 0.3234758871701547, "grad_norm": 0.5920439294431348, "learning_rate": 4.9486855424109524e-06, "loss": 0.0411, "step": 711 }, { "epoch": 0.32393084622383983, "grad_norm": 0.6808571640088359, "learning_rate": 4.948541391131939e-06, "loss": 0.0593, "step": 712 }, { "epoch": 0.32438580527752503, "grad_norm": 0.6475999202690299, "learning_rate": 4.948397039769181e-06, "loss": 0.0368, "step": 713 }, { "epoch": 0.3248407643312102, "grad_norm": 0.5475655472838014, "learning_rate": 4.948252488334474e-06, "loss": 0.034, "step": 714 }, { "epoch": 0.3252957233848954, "grad_norm": 0.5762149944161961, "learning_rate": 4.948107736839629e-06, "loss": 0.0499, "step": 715 }, { "epoch": 0.3257506824385805, "grad_norm": 0.7900459721209473, "learning_rate": 4.947962785296476e-06, "loss": 0.0774, "step": 716 }, { "epoch": 0.3262056414922657, "grad_norm": 1.0482560180703868, "learning_rate": 4.9478176337168594e-06, "loss": 0.0836, "step": 717 }, { "epoch": 0.32666060054595086, "grad_norm": 1.401558028095644, "learning_rate": 4.9476722821126386e-06, "loss": 0.1193, "step": 718 }, { "epoch": 0.327115559599636, "grad_norm": 0.771148987620668, "learning_rate": 4.9475267304956945e-06, "loss": 0.0689, "step": 719 }, { "epoch": 0.3275705186533212, "grad_norm": 1.0131729662598652, "learning_rate": 4.947380978877917e-06, "loss": 0.0755, "step": 720 }, { "epoch": 0.32802547770700635, "grad_norm": 0.9498239138116331, "learning_rate": 4.947235027271219e-06, "loss": 0.098, "step": 721 }, { "epoch": 0.32848043676069155, "grad_norm": 0.6332324914490356, "learning_rate": 4.9470888756875265e-06, "loss": 0.0447, "step": 722 }, { "epoch": 0.3289353958143767, "grad_norm": 0.7271597537238114, "learning_rate": 4.946942524138782e-06, "loss": 0.0483, "step": 723 }, { "epoch": 0.3293903548680619, "grad_norm": 0.8308219382266808, "learning_rate": 4.946795972636944e-06, "loss": 0.0631, "step": 724 }, { "epoch": 0.32984531392174704, "grad_norm": 1.0116359294484205, "learning_rate": 4.94664922119399e-06, "loss": 0.0896, "step": 725 }, { "epoch": 0.3303002729754322, "grad_norm": 0.7583834250675704, "learning_rate": 4.94650226982191e-06, "loss": 0.0598, "step": 726 }, { "epoch": 0.3307552320291174, "grad_norm": 1.3027980178767569, "learning_rate": 4.9463551185327115e-06, "loss": 0.1405, "step": 727 }, { "epoch": 0.33121019108280253, "grad_norm": 0.7112357392323109, "learning_rate": 4.946207767338422e-06, "loss": 0.0536, "step": 728 }, { "epoch": 0.33166515013648773, "grad_norm": 0.6618610768771548, "learning_rate": 4.9460602162510805e-06, "loss": 0.0516, "step": 729 }, { "epoch": 0.3321201091901729, "grad_norm": 0.7624917659824647, "learning_rate": 4.945912465282744e-06, "loss": 0.0586, "step": 730 }, { "epoch": 0.3325750682438581, "grad_norm": 0.9623565075253229, "learning_rate": 4.945764514445487e-06, "loss": 0.0966, "step": 731 }, { "epoch": 0.3330300272975432, "grad_norm": 1.158591935392886, "learning_rate": 4.9456163637513986e-06, "loss": 0.0762, "step": 732 }, { "epoch": 0.33348498635122836, "grad_norm": 1.0561029184500623, "learning_rate": 4.945468013212585e-06, "loss": 0.0736, "step": 733 }, { "epoch": 0.33393994540491356, "grad_norm": 0.8218447404278697, "learning_rate": 4.945319462841169e-06, "loss": 0.0716, "step": 734 }, { "epoch": 0.3343949044585987, "grad_norm": 0.9168665120865833, "learning_rate": 4.94517071264929e-06, "loss": 0.0726, "step": 735 }, { "epoch": 0.3348498635122839, "grad_norm": 0.7363681967868748, "learning_rate": 4.945021762649102e-06, "loss": 0.043, "step": 736 }, { "epoch": 0.33530482256596905, "grad_norm": 0.725798212067647, "learning_rate": 4.9448726128527776e-06, "loss": 0.0636, "step": 737 }, { "epoch": 0.33575978161965425, "grad_norm": 0.7941303692620212, "learning_rate": 4.944723263272504e-06, "loss": 0.0695, "step": 738 }, { "epoch": 0.3362147406733394, "grad_norm": 0.834891383255751, "learning_rate": 4.944573713920485e-06, "loss": 0.0712, "step": 739 }, { "epoch": 0.33666969972702454, "grad_norm": 2.4254280630054783, "learning_rate": 4.944423964808943e-06, "loss": 0.151, "step": 740 }, { "epoch": 0.33712465878070974, "grad_norm": 0.8195094139530902, "learning_rate": 4.944274015950113e-06, "loss": 0.0631, "step": 741 }, { "epoch": 0.3375796178343949, "grad_norm": 0.886058873471566, "learning_rate": 4.944123867356249e-06, "loss": 0.0535, "step": 742 }, { "epoch": 0.3380345768880801, "grad_norm": 1.069728524255416, "learning_rate": 4.943973519039619e-06, "loss": 0.0931, "step": 743 }, { "epoch": 0.33848953594176523, "grad_norm": 0.6746904422385723, "learning_rate": 4.943822971012511e-06, "loss": 0.0473, "step": 744 }, { "epoch": 0.33894449499545043, "grad_norm": 0.6169402633729492, "learning_rate": 4.943672223287226e-06, "loss": 0.0409, "step": 745 }, { "epoch": 0.3393994540491356, "grad_norm": 1.1379727803205435, "learning_rate": 4.9435212758760815e-06, "loss": 0.0974, "step": 746 }, { "epoch": 0.3398544131028208, "grad_norm": 0.6974504786844343, "learning_rate": 4.943370128791413e-06, "loss": 0.0484, "step": 747 }, { "epoch": 0.3403093721565059, "grad_norm": 0.7031181253608232, "learning_rate": 4.943218782045574e-06, "loss": 0.063, "step": 748 }, { "epoch": 0.34076433121019106, "grad_norm": 0.9627083360594578, "learning_rate": 4.943067235650927e-06, "loss": 0.08, "step": 749 }, { "epoch": 0.34121929026387626, "grad_norm": 0.9077046325521676, "learning_rate": 4.942915489619859e-06, "loss": 0.0789, "step": 750 }, { "epoch": 0.3416742493175614, "grad_norm": 0.8348448351284486, "learning_rate": 4.9427635439647704e-06, "loss": 0.0729, "step": 751 }, { "epoch": 0.3421292083712466, "grad_norm": 0.8858012000453745, "learning_rate": 4.942611398698075e-06, "loss": 0.0664, "step": 752 }, { "epoch": 0.34258416742493175, "grad_norm": 0.8901557022008841, "learning_rate": 4.942459053832208e-06, "loss": 0.0693, "step": 753 }, { "epoch": 0.34303912647861695, "grad_norm": 0.8234502982899836, "learning_rate": 4.942306509379617e-06, "loss": 0.0597, "step": 754 }, { "epoch": 0.3434940855323021, "grad_norm": 0.6946642261242176, "learning_rate": 4.942153765352767e-06, "loss": 0.0655, "step": 755 }, { "epoch": 0.34394904458598724, "grad_norm": 0.7281851362306453, "learning_rate": 4.94200082176414e-06, "loss": 0.0477, "step": 756 }, { "epoch": 0.34440400363967244, "grad_norm": 0.702216811046451, "learning_rate": 4.941847678626234e-06, "loss": 0.051, "step": 757 }, { "epoch": 0.3448589626933576, "grad_norm": 0.7406781926848387, "learning_rate": 4.941694335951563e-06, "loss": 0.0684, "step": 758 }, { "epoch": 0.3453139217470428, "grad_norm": 0.963572123895698, "learning_rate": 4.9415407937526575e-06, "loss": 0.071, "step": 759 }, { "epoch": 0.34576888080072793, "grad_norm": 1.0435397926975072, "learning_rate": 4.9413870520420635e-06, "loss": 0.0872, "step": 760 }, { "epoch": 0.34622383985441313, "grad_norm": 0.681188662305879, "learning_rate": 4.941233110832346e-06, "loss": 0.0408, "step": 761 }, { "epoch": 0.3466787989080983, "grad_norm": 1.0327113714700533, "learning_rate": 4.941078970136082e-06, "loss": 0.0773, "step": 762 }, { "epoch": 0.3471337579617834, "grad_norm": 0.9228809359894766, "learning_rate": 4.940924629965869e-06, "loss": 0.086, "step": 763 }, { "epoch": 0.3475887170154686, "grad_norm": 0.7250520063147825, "learning_rate": 4.940770090334319e-06, "loss": 0.0463, "step": 764 }, { "epoch": 0.34804367606915376, "grad_norm": 0.7506801006626348, "learning_rate": 4.940615351254059e-06, "loss": 0.0544, "step": 765 }, { "epoch": 0.34849863512283896, "grad_norm": 0.5540561760336307, "learning_rate": 4.940460412737734e-06, "loss": 0.0526, "step": 766 }, { "epoch": 0.3489535941765241, "grad_norm": 0.695612040454574, "learning_rate": 4.940305274798005e-06, "loss": 0.0524, "step": 767 }, { "epoch": 0.3494085532302093, "grad_norm": 0.6935188585127503, "learning_rate": 4.940149937447549e-06, "loss": 0.067, "step": 768 }, { "epoch": 0.34986351228389445, "grad_norm": 0.5312867727529762, "learning_rate": 4.939994400699061e-06, "loss": 0.041, "step": 769 }, { "epoch": 0.3503184713375796, "grad_norm": 0.7505879638280587, "learning_rate": 4.939838664565248e-06, "loss": 0.0701, "step": 770 }, { "epoch": 0.3507734303912648, "grad_norm": 0.6454845261456662, "learning_rate": 4.939682729058839e-06, "loss": 0.0529, "step": 771 }, { "epoch": 0.35122838944494994, "grad_norm": 0.7301095931814326, "learning_rate": 4.939526594192574e-06, "loss": 0.0542, "step": 772 }, { "epoch": 0.35168334849863514, "grad_norm": 0.652333772985979, "learning_rate": 4.939370259979213e-06, "loss": 0.048, "step": 773 }, { "epoch": 0.3521383075523203, "grad_norm": 0.9727171709896085, "learning_rate": 4.9392137264315295e-06, "loss": 0.0829, "step": 774 }, { "epoch": 0.3525932666060055, "grad_norm": 1.002727349516092, "learning_rate": 4.939056993562316e-06, "loss": 0.1006, "step": 775 }, { "epoch": 0.35304822565969063, "grad_norm": 0.8290934071283628, "learning_rate": 4.9389000613843805e-06, "loss": 0.0604, "step": 776 }, { "epoch": 0.3535031847133758, "grad_norm": 0.7094397892635911, "learning_rate": 4.938742929910546e-06, "loss": 0.0626, "step": 777 }, { "epoch": 0.353958143767061, "grad_norm": 0.463742445825036, "learning_rate": 4.938585599153652e-06, "loss": 0.0401, "step": 778 }, { "epoch": 0.3544131028207461, "grad_norm": 0.8795124878712618, "learning_rate": 4.938428069126555e-06, "loss": 0.0751, "step": 779 }, { "epoch": 0.3548680618744313, "grad_norm": 0.9849362153346484, "learning_rate": 4.9382703398421285e-06, "loss": 0.0685, "step": 780 }, { "epoch": 0.35532302092811646, "grad_norm": 0.9119479350188407, "learning_rate": 4.938112411313261e-06, "loss": 0.0717, "step": 781 }, { "epoch": 0.35577797998180166, "grad_norm": 0.7192019183778731, "learning_rate": 4.937954283552858e-06, "loss": 0.0522, "step": 782 }, { "epoch": 0.3562329390354868, "grad_norm": 1.1237812582622189, "learning_rate": 4.93779595657384e-06, "loss": 0.0876, "step": 783 }, { "epoch": 0.35668789808917195, "grad_norm": 1.0406475235702408, "learning_rate": 4.937637430389145e-06, "loss": 0.0893, "step": 784 }, { "epoch": 0.35714285714285715, "grad_norm": 0.7721864924395869, "learning_rate": 4.937478705011729e-06, "loss": 0.0446, "step": 785 }, { "epoch": 0.3575978161965423, "grad_norm": 0.8008956251240891, "learning_rate": 4.937319780454559e-06, "loss": 0.0705, "step": 786 }, { "epoch": 0.3580527752502275, "grad_norm": 0.691329548634175, "learning_rate": 4.937160656730625e-06, "loss": 0.0682, "step": 787 }, { "epoch": 0.35850773430391264, "grad_norm": 0.767648045445437, "learning_rate": 4.9370013338529274e-06, "loss": 0.0588, "step": 788 }, { "epoch": 0.35896269335759784, "grad_norm": 0.6437326183327811, "learning_rate": 4.936841811834486e-06, "loss": 0.0585, "step": 789 }, { "epoch": 0.359417652411283, "grad_norm": 1.0756251482963843, "learning_rate": 4.936682090688337e-06, "loss": 0.1152, "step": 790 }, { "epoch": 0.35987261146496813, "grad_norm": 0.7633629281483238, "learning_rate": 4.936522170427531e-06, "loss": 0.0519, "step": 791 }, { "epoch": 0.36032757051865333, "grad_norm": 0.9657586384218257, "learning_rate": 4.936362051065136e-06, "loss": 0.0622, "step": 792 }, { "epoch": 0.3607825295723385, "grad_norm": 0.7614427682900681, "learning_rate": 4.936201732614238e-06, "loss": 0.0578, "step": 793 }, { "epoch": 0.3612374886260237, "grad_norm": 0.8385664953370223, "learning_rate": 4.9360412150879355e-06, "loss": 0.0631, "step": 794 }, { "epoch": 0.3616924476797088, "grad_norm": 0.8652339075360451, "learning_rate": 4.935880498499346e-06, "loss": 0.0561, "step": 795 }, { "epoch": 0.362147406733394, "grad_norm": 1.1585494708567998, "learning_rate": 4.935719582861604e-06, "loss": 0.0798, "step": 796 }, { "epoch": 0.36260236578707916, "grad_norm": 0.8782653098452117, "learning_rate": 4.935558468187855e-06, "loss": 0.0785, "step": 797 }, { "epoch": 0.3630573248407643, "grad_norm": 1.0471698671960776, "learning_rate": 4.935397154491268e-06, "loss": 0.0843, "step": 798 }, { "epoch": 0.3635122838944495, "grad_norm": 1.072226910369364, "learning_rate": 4.935235641785023e-06, "loss": 0.1002, "step": 799 }, { "epoch": 0.36396724294813465, "grad_norm": 0.8003883087872027, "learning_rate": 4.935073930082319e-06, "loss": 0.077, "step": 800 }, { "epoch": 0.36442220200181985, "grad_norm": 0.666158838387195, "learning_rate": 4.93491201939637e-06, "loss": 0.05, "step": 801 }, { "epoch": 0.364877161055505, "grad_norm": 0.8051537513165499, "learning_rate": 4.934749909740408e-06, "loss": 0.0754, "step": 802 }, { "epoch": 0.3653321201091902, "grad_norm": 0.6922057989945819, "learning_rate": 4.934587601127677e-06, "loss": 0.059, "step": 803 }, { "epoch": 0.36578707916287534, "grad_norm": 0.7128402432948275, "learning_rate": 4.934425093571442e-06, "loss": 0.0619, "step": 804 }, { "epoch": 0.3662420382165605, "grad_norm": 0.7563859967433937, "learning_rate": 4.934262387084984e-06, "loss": 0.0627, "step": 805 }, { "epoch": 0.3666969972702457, "grad_norm": 0.7832340212976855, "learning_rate": 4.934099481681595e-06, "loss": 0.0526, "step": 806 }, { "epoch": 0.36715195632393083, "grad_norm": 0.86739587683623, "learning_rate": 4.933936377374589e-06, "loss": 0.0723, "step": 807 }, { "epoch": 0.36760691537761603, "grad_norm": 0.5600488971824944, "learning_rate": 4.933773074177293e-06, "loss": 0.053, "step": 808 }, { "epoch": 0.3680618744313012, "grad_norm": 0.7459592894739776, "learning_rate": 4.933609572103053e-06, "loss": 0.0575, "step": 809 }, { "epoch": 0.3685168334849864, "grad_norm": 1.0970116117153337, "learning_rate": 4.933445871165229e-06, "loss": 0.0956, "step": 810 }, { "epoch": 0.3689717925386715, "grad_norm": 0.7191805409301932, "learning_rate": 4.933281971377197e-06, "loss": 0.0519, "step": 811 }, { "epoch": 0.36942675159235666, "grad_norm": 0.8243120557909177, "learning_rate": 4.933117872752352e-06, "loss": 0.071, "step": 812 }, { "epoch": 0.36988171064604186, "grad_norm": 1.1020763342548079, "learning_rate": 4.932953575304102e-06, "loss": 0.0782, "step": 813 }, { "epoch": 0.370336669699727, "grad_norm": 0.9022725332415404, "learning_rate": 4.932789079045873e-06, "loss": 0.0833, "step": 814 }, { "epoch": 0.3707916287534122, "grad_norm": 0.9496599803899396, "learning_rate": 4.932624383991106e-06, "loss": 0.0847, "step": 815 }, { "epoch": 0.37124658780709735, "grad_norm": 1.0562705583617722, "learning_rate": 4.9324594901532605e-06, "loss": 0.0867, "step": 816 }, { "epoch": 0.37170154686078255, "grad_norm": 0.7181054591410602, "learning_rate": 4.93229439754581e-06, "loss": 0.0607, "step": 817 }, { "epoch": 0.3721565059144677, "grad_norm": 1.249078514543796, "learning_rate": 4.932129106182246e-06, "loss": 0.0695, "step": 818 }, { "epoch": 0.37261146496815284, "grad_norm": 0.9464577385866231, "learning_rate": 4.931963616076075e-06, "loss": 0.0555, "step": 819 }, { "epoch": 0.37306642402183804, "grad_norm": 0.6354068817614167, "learning_rate": 4.93179792724082e-06, "loss": 0.0506, "step": 820 }, { "epoch": 0.3735213830755232, "grad_norm": 0.733808597213929, "learning_rate": 4.9316320396900195e-06, "loss": 0.0624, "step": 821 }, { "epoch": 0.3739763421292084, "grad_norm": 1.0993304075084718, "learning_rate": 4.9314659534372305e-06, "loss": 0.0963, "step": 822 }, { "epoch": 0.37443130118289353, "grad_norm": 0.686462250780803, "learning_rate": 4.931299668496024e-06, "loss": 0.0439, "step": 823 }, { "epoch": 0.37488626023657873, "grad_norm": 0.7707830593490947, "learning_rate": 4.931133184879988e-06, "loss": 0.0602, "step": 824 }, { "epoch": 0.37534121929026387, "grad_norm": 0.577810862774901, "learning_rate": 4.930966502602727e-06, "loss": 0.046, "step": 825 }, { "epoch": 0.37579617834394907, "grad_norm": 1.227234424763045, "learning_rate": 4.930799621677862e-06, "loss": 0.0984, "step": 826 }, { "epoch": 0.3762511373976342, "grad_norm": 0.9596192413203867, "learning_rate": 4.93063254211903e-06, "loss": 0.0733, "step": 827 }, { "epoch": 0.37670609645131936, "grad_norm": 0.6852793283145953, "learning_rate": 4.930465263939882e-06, "loss": 0.046, "step": 828 }, { "epoch": 0.37716105550500456, "grad_norm": 1.0111400448234127, "learning_rate": 4.9302977871540894e-06, "loss": 0.0808, "step": 829 }, { "epoch": 0.3776160145586897, "grad_norm": 0.7993690990324225, "learning_rate": 4.930130111775336e-06, "loss": 0.0635, "step": 830 }, { "epoch": 0.3780709736123749, "grad_norm": 0.5709164804262241, "learning_rate": 4.9299622378173245e-06, "loss": 0.0403, "step": 831 }, { "epoch": 0.37852593266606005, "grad_norm": 1.104047361341013, "learning_rate": 4.929794165293773e-06, "loss": 0.0864, "step": 832 }, { "epoch": 0.37898089171974525, "grad_norm": 0.6855131484796984, "learning_rate": 4.9296258942184145e-06, "loss": 0.0617, "step": 833 }, { "epoch": 0.3794358507734304, "grad_norm": 1.0311774748471771, "learning_rate": 4.929457424605e-06, "loss": 0.0788, "step": 834 }, { "epoch": 0.37989080982711554, "grad_norm": 0.9165897835058952, "learning_rate": 4.929288756467296e-06, "loss": 0.0893, "step": 835 }, { "epoch": 0.38034576888080074, "grad_norm": 0.7941921577921506, "learning_rate": 4.929119889819086e-06, "loss": 0.0534, "step": 836 }, { "epoch": 0.3808007279344859, "grad_norm": 1.557335360800504, "learning_rate": 4.928950824674169e-06, "loss": 0.112, "step": 837 }, { "epoch": 0.3812556869881711, "grad_norm": 0.7901013784423294, "learning_rate": 4.928781561046359e-06, "loss": 0.0644, "step": 838 }, { "epoch": 0.3817106460418562, "grad_norm": 0.8005670034055866, "learning_rate": 4.928612098949488e-06, "loss": 0.0651, "step": 839 }, { "epoch": 0.3821656050955414, "grad_norm": 0.7907149517921656, "learning_rate": 4.9284424383974026e-06, "loss": 0.0666, "step": 840 }, { "epoch": 0.38262056414922657, "grad_norm": 0.5599277146162008, "learning_rate": 4.928272579403969e-06, "loss": 0.0415, "step": 841 }, { "epoch": 0.3830755232029117, "grad_norm": 0.8167324319310735, "learning_rate": 4.928102521983067e-06, "loss": 0.0832, "step": 842 }, { "epoch": 0.3835304822565969, "grad_norm": 1.110106061772308, "learning_rate": 4.9279322661485906e-06, "loss": 0.1075, "step": 843 }, { "epoch": 0.38398544131028206, "grad_norm": 0.9108736659112359, "learning_rate": 4.927761811914455e-06, "loss": 0.0782, "step": 844 }, { "epoch": 0.38444040036396726, "grad_norm": 0.7133113314845626, "learning_rate": 4.927591159294587e-06, "loss": 0.0597, "step": 845 }, { "epoch": 0.3848953594176524, "grad_norm": 1.2379543972496645, "learning_rate": 4.927420308302933e-06, "loss": 0.0739, "step": 846 }, { "epoch": 0.3853503184713376, "grad_norm": 0.7205090516697029, "learning_rate": 4.927249258953454e-06, "loss": 0.0637, "step": 847 }, { "epoch": 0.38580527752502275, "grad_norm": 0.9577940179044298, "learning_rate": 4.927078011260126e-06, "loss": 0.0647, "step": 848 }, { "epoch": 0.3862602365787079, "grad_norm": 1.063680913893135, "learning_rate": 4.926906565236943e-06, "loss": 0.0884, "step": 849 }, { "epoch": 0.3867151956323931, "grad_norm": 0.8411451706944509, "learning_rate": 4.926734920897916e-06, "loss": 0.0641, "step": 850 }, { "epoch": 0.38717015468607824, "grad_norm": 0.6435257771689179, "learning_rate": 4.926563078257071e-06, "loss": 0.0645, "step": 851 }, { "epoch": 0.38762511373976344, "grad_norm": 0.5478103039564508, "learning_rate": 4.926391037328448e-06, "loss": 0.0562, "step": 852 }, { "epoch": 0.3880800727934486, "grad_norm": 0.7813544786492084, "learning_rate": 4.926218798126108e-06, "loss": 0.0644, "step": 853 }, { "epoch": 0.3885350318471338, "grad_norm": 0.8655211183499932, "learning_rate": 4.926046360664124e-06, "loss": 0.059, "step": 854 }, { "epoch": 0.3889899909008189, "grad_norm": 0.9101899928988302, "learning_rate": 4.925873724956588e-06, "loss": 0.0737, "step": 855 }, { "epoch": 0.38944494995450407, "grad_norm": 1.0168400071509458, "learning_rate": 4.9257008910176065e-06, "loss": 0.1121, "step": 856 }, { "epoch": 0.38989990900818927, "grad_norm": 0.8167976616887521, "learning_rate": 4.925527858861302e-06, "loss": 0.0564, "step": 857 }, { "epoch": 0.3903548680618744, "grad_norm": 0.8798735310808856, "learning_rate": 4.925354628501814e-06, "loss": 0.0658, "step": 858 }, { "epoch": 0.3908098271155596, "grad_norm": 1.072539154167554, "learning_rate": 4.925181199953299e-06, "loss": 0.073, "step": 859 }, { "epoch": 0.39126478616924476, "grad_norm": 0.6908230723682215, "learning_rate": 4.9250075732299285e-06, "loss": 0.0623, "step": 860 }, { "epoch": 0.39171974522292996, "grad_norm": 0.9571638979072821, "learning_rate": 4.92483374834589e-06, "loss": 0.0773, "step": 861 }, { "epoch": 0.3921747042766151, "grad_norm": 0.756709209444031, "learning_rate": 4.9246597253153884e-06, "loss": 0.0579, "step": 862 }, { "epoch": 0.39262966333030025, "grad_norm": 0.5927412643446517, "learning_rate": 4.924485504152644e-06, "loss": 0.0534, "step": 863 }, { "epoch": 0.39308462238398545, "grad_norm": 0.9103877688416242, "learning_rate": 4.924311084871892e-06, "loss": 0.0706, "step": 864 }, { "epoch": 0.3935395814376706, "grad_norm": 1.0326915390707718, "learning_rate": 4.924136467487387e-06, "loss": 0.0598, "step": 865 }, { "epoch": 0.3939945404913558, "grad_norm": 0.6750287379400403, "learning_rate": 4.923961652013397e-06, "loss": 0.0544, "step": 866 }, { "epoch": 0.39444949954504094, "grad_norm": 0.6971234476522602, "learning_rate": 4.923786638464207e-06, "loss": 0.068, "step": 867 }, { "epoch": 0.39490445859872614, "grad_norm": 0.6838356971258669, "learning_rate": 4.9236114268541196e-06, "loss": 0.0547, "step": 868 }, { "epoch": 0.3953594176524113, "grad_norm": 0.7448093953926782, "learning_rate": 4.923436017197451e-06, "loss": 0.052, "step": 869 }, { "epoch": 0.3958143767060964, "grad_norm": 0.6641780681039909, "learning_rate": 4.923260409508535e-06, "loss": 0.0537, "step": 870 }, { "epoch": 0.3962693357597816, "grad_norm": 1.0599598667933217, "learning_rate": 4.9230846038017214e-06, "loss": 0.1064, "step": 871 }, { "epoch": 0.39672429481346677, "grad_norm": 0.7170909347633128, "learning_rate": 4.922908600091378e-06, "loss": 0.052, "step": 872 }, { "epoch": 0.39717925386715197, "grad_norm": 0.9331039569297795, "learning_rate": 4.9227323983918835e-06, "loss": 0.1059, "step": 873 }, { "epoch": 0.3976342129208371, "grad_norm": 0.6631065170731679, "learning_rate": 4.922555998717639e-06, "loss": 0.0617, "step": 874 }, { "epoch": 0.3980891719745223, "grad_norm": 0.6608685360122281, "learning_rate": 4.922379401083058e-06, "loss": 0.0499, "step": 875 }, { "epoch": 0.39854413102820746, "grad_norm": 0.7327299678205453, "learning_rate": 4.922202605502573e-06, "loss": 0.0566, "step": 876 }, { "epoch": 0.3989990900818926, "grad_norm": 0.6975891149084547, "learning_rate": 4.922025611990629e-06, "loss": 0.0516, "step": 877 }, { "epoch": 0.3994540491355778, "grad_norm": 0.7261723405077012, "learning_rate": 4.92184842056169e-06, "loss": 0.0564, "step": 878 }, { "epoch": 0.39990900818926295, "grad_norm": 0.7685758032234701, "learning_rate": 4.921671031230235e-06, "loss": 0.0607, "step": 879 }, { "epoch": 0.40036396724294815, "grad_norm": 0.8663271064629626, "learning_rate": 4.921493444010759e-06, "loss": 0.0772, "step": 880 }, { "epoch": 0.4008189262966333, "grad_norm": 0.6323885494682957, "learning_rate": 4.921315658917774e-06, "loss": 0.0542, "step": 881 }, { "epoch": 0.4012738853503185, "grad_norm": 0.7490017305697232, "learning_rate": 4.921137675965809e-06, "loss": 0.0561, "step": 882 }, { "epoch": 0.40172884440400364, "grad_norm": 0.5661173516415018, "learning_rate": 4.920959495169406e-06, "loss": 0.0514, "step": 883 }, { "epoch": 0.4021838034576888, "grad_norm": 0.90985620289341, "learning_rate": 4.920781116543126e-06, "loss": 0.0793, "step": 884 }, { "epoch": 0.402638762511374, "grad_norm": 0.737559568798236, "learning_rate": 4.920602540101546e-06, "loss": 0.0532, "step": 885 }, { "epoch": 0.4030937215650591, "grad_norm": 0.9457532899317224, "learning_rate": 4.920423765859257e-06, "loss": 0.0736, "step": 886 }, { "epoch": 0.4035486806187443, "grad_norm": 0.8223411810090336, "learning_rate": 4.920244793830869e-06, "loss": 0.0617, "step": 887 }, { "epoch": 0.40400363967242947, "grad_norm": 0.841036201739517, "learning_rate": 4.920065624031006e-06, "loss": 0.0663, "step": 888 }, { "epoch": 0.40445859872611467, "grad_norm": 0.6414512707848916, "learning_rate": 4.919886256474309e-06, "loss": 0.0577, "step": 889 }, { "epoch": 0.4049135577797998, "grad_norm": 0.9454993871214441, "learning_rate": 4.919706691175435e-06, "loss": 0.0691, "step": 890 }, { "epoch": 0.40536851683348496, "grad_norm": 1.139839821047098, "learning_rate": 4.919526928149058e-06, "loss": 0.0981, "step": 891 }, { "epoch": 0.40582347588717016, "grad_norm": 0.7527352667811262, "learning_rate": 4.919346967409867e-06, "loss": 0.0705, "step": 892 }, { "epoch": 0.4062784349408553, "grad_norm": 0.8215025181864493, "learning_rate": 4.919166808972567e-06, "loss": 0.0822, "step": 893 }, { "epoch": 0.4067333939945405, "grad_norm": 1.0573127490280785, "learning_rate": 4.918986452851881e-06, "loss": 0.0811, "step": 894 }, { "epoch": 0.40718835304822565, "grad_norm": 0.6965463925423991, "learning_rate": 4.918805899062545e-06, "loss": 0.0503, "step": 895 }, { "epoch": 0.40764331210191085, "grad_norm": 0.4193896755189461, "learning_rate": 4.9186251476193146e-06, "loss": 0.0341, "step": 896 }, { "epoch": 0.408098271155596, "grad_norm": 0.9727030498845781, "learning_rate": 4.918444198536959e-06, "loss": 0.0918, "step": 897 }, { "epoch": 0.40855323020928114, "grad_norm": 0.848379430601135, "learning_rate": 4.918263051830267e-06, "loss": 0.0846, "step": 898 }, { "epoch": 0.40900818926296634, "grad_norm": 0.8940054586896501, "learning_rate": 4.918081707514037e-06, "loss": 0.0561, "step": 899 }, { "epoch": 0.4094631483166515, "grad_norm": 1.0448933980565918, "learning_rate": 4.917900165603091e-06, "loss": 0.0881, "step": 900 }, { "epoch": 0.4099181073703367, "grad_norm": 0.8513360075907803, "learning_rate": 4.9177184261122624e-06, "loss": 0.0774, "step": 901 }, { "epoch": 0.4103730664240218, "grad_norm": 0.9926871621583441, "learning_rate": 4.917536489056402e-06, "loss": 0.0676, "step": 902 }, { "epoch": 0.410828025477707, "grad_norm": 0.7421973042221751, "learning_rate": 4.9173543544503775e-06, "loss": 0.0561, "step": 903 }, { "epoch": 0.41128298453139217, "grad_norm": 0.5540464230672232, "learning_rate": 4.917172022309072e-06, "loss": 0.0445, "step": 904 }, { "epoch": 0.41173794358507737, "grad_norm": 0.5720091238043334, "learning_rate": 4.916989492647385e-06, "loss": 0.0433, "step": 905 }, { "epoch": 0.4121929026387625, "grad_norm": 0.6162762711529532, "learning_rate": 4.916806765480231e-06, "loss": 0.0475, "step": 906 }, { "epoch": 0.41264786169244766, "grad_norm": 0.974787490907103, "learning_rate": 4.9166238408225416e-06, "loss": 0.1111, "step": 907 }, { "epoch": 0.41310282074613286, "grad_norm": 0.8865154928732101, "learning_rate": 4.916440718689267e-06, "loss": 0.0749, "step": 908 }, { "epoch": 0.413557779799818, "grad_norm": 0.5226001788391972, "learning_rate": 4.916257399095369e-06, "loss": 0.0395, "step": 909 }, { "epoch": 0.4140127388535032, "grad_norm": 0.5318996108265455, "learning_rate": 4.916073882055827e-06, "loss": 0.0433, "step": 910 }, { "epoch": 0.41446769790718835, "grad_norm": 1.0710581316835899, "learning_rate": 4.91589016758564e-06, "loss": 0.0763, "step": 911 }, { "epoch": 0.41492265696087355, "grad_norm": 0.795399749143522, "learning_rate": 4.915706255699817e-06, "loss": 0.0764, "step": 912 }, { "epoch": 0.4153776160145587, "grad_norm": 1.0890078502818572, "learning_rate": 4.915522146413389e-06, "loss": 0.1131, "step": 913 }, { "epoch": 0.41583257506824384, "grad_norm": 0.7960045886425829, "learning_rate": 4.9153378397413985e-06, "loss": 0.0683, "step": 914 }, { "epoch": 0.41628753412192904, "grad_norm": 1.1128797991041262, "learning_rate": 4.915153335698908e-06, "loss": 0.0913, "step": 915 }, { "epoch": 0.4167424931756142, "grad_norm": 1.0003336530508022, "learning_rate": 4.914968634300994e-06, "loss": 0.0908, "step": 916 }, { "epoch": 0.4171974522292994, "grad_norm": 0.6465244244795542, "learning_rate": 4.914783735562748e-06, "loss": 0.0567, "step": 917 }, { "epoch": 0.4176524112829845, "grad_norm": 0.7181629552621807, "learning_rate": 4.914598639499281e-06, "loss": 0.0601, "step": 918 }, { "epoch": 0.4181073703366697, "grad_norm": 0.6532643628064463, "learning_rate": 4.914413346125717e-06, "loss": 0.0601, "step": 919 }, { "epoch": 0.41856232939035487, "grad_norm": 0.6191538196132823, "learning_rate": 4.914227855457199e-06, "loss": 0.0499, "step": 920 }, { "epoch": 0.41901728844404, "grad_norm": 0.8550108331341532, "learning_rate": 4.914042167508881e-06, "loss": 0.0593, "step": 921 }, { "epoch": 0.4194722474977252, "grad_norm": 0.7149006472238378, "learning_rate": 4.9138562822959416e-06, "loss": 0.0445, "step": 922 }, { "epoch": 0.41992720655141036, "grad_norm": 0.8618890926980373, "learning_rate": 4.913670199833566e-06, "loss": 0.0623, "step": 923 }, { "epoch": 0.42038216560509556, "grad_norm": 0.7722281622664096, "learning_rate": 4.913483920136961e-06, "loss": 0.0599, "step": 924 }, { "epoch": 0.4208371246587807, "grad_norm": 1.0525677617663958, "learning_rate": 4.91329744322135e-06, "loss": 0.072, "step": 925 }, { "epoch": 0.4212920837124659, "grad_norm": 0.67535122521756, "learning_rate": 4.913110769101971e-06, "loss": 0.0591, "step": 926 }, { "epoch": 0.42174704276615105, "grad_norm": 0.822068707444294, "learning_rate": 4.912923897794077e-06, "loss": 0.0614, "step": 927 }, { "epoch": 0.4222020018198362, "grad_norm": 0.7885176236662199, "learning_rate": 4.912736829312938e-06, "loss": 0.0704, "step": 928 }, { "epoch": 0.4226569608735214, "grad_norm": 0.8766248992772606, "learning_rate": 4.912549563673842e-06, "loss": 0.0745, "step": 929 }, { "epoch": 0.42311191992720654, "grad_norm": 0.8283902735051627, "learning_rate": 4.912362100892091e-06, "loss": 0.092, "step": 930 }, { "epoch": 0.42356687898089174, "grad_norm": 0.6254134609819529, "learning_rate": 4.912174440983002e-06, "loss": 0.0537, "step": 931 }, { "epoch": 0.4240218380345769, "grad_norm": 0.882666141582528, "learning_rate": 4.911986583961912e-06, "loss": 0.0786, "step": 932 }, { "epoch": 0.4244767970882621, "grad_norm": 0.9420756223720808, "learning_rate": 4.91179852984417e-06, "loss": 0.0712, "step": 933 }, { "epoch": 0.4249317561419472, "grad_norm": 0.7094879380363412, "learning_rate": 4.911610278645144e-06, "loss": 0.0584, "step": 934 }, { "epoch": 0.42538671519563237, "grad_norm": 0.5926543191052056, "learning_rate": 4.911421830380217e-06, "loss": 0.0405, "step": 935 }, { "epoch": 0.42584167424931757, "grad_norm": 1.0230610119902441, "learning_rate": 4.911233185064788e-06, "loss": 0.0862, "step": 936 }, { "epoch": 0.4262966333030027, "grad_norm": 0.7907623777513295, "learning_rate": 4.911044342714272e-06, "loss": 0.0613, "step": 937 }, { "epoch": 0.4267515923566879, "grad_norm": 0.6568153392494973, "learning_rate": 4.9108553033440995e-06, "loss": 0.0476, "step": 938 }, { "epoch": 0.42720655141037306, "grad_norm": 0.6694889451748832, "learning_rate": 4.91066606696972e-06, "loss": 0.0524, "step": 939 }, { "epoch": 0.42766151046405826, "grad_norm": 0.8635963157899842, "learning_rate": 4.910476633606597e-06, "loss": 0.0614, "step": 940 }, { "epoch": 0.4281164695177434, "grad_norm": 0.7292867271167381, "learning_rate": 4.9102870032702075e-06, "loss": 0.0414, "step": 941 }, { "epoch": 0.42857142857142855, "grad_norm": 0.7616247338566872, "learning_rate": 4.910097175976049e-06, "loss": 0.0549, "step": 942 }, { "epoch": 0.42902638762511375, "grad_norm": 0.7194658405319286, "learning_rate": 4.909907151739634e-06, "loss": 0.0499, "step": 943 }, { "epoch": 0.4294813466787989, "grad_norm": 1.2091203072284862, "learning_rate": 4.909716930576489e-06, "loss": 0.0963, "step": 944 }, { "epoch": 0.4299363057324841, "grad_norm": 0.7905516413222626, "learning_rate": 4.909526512502158e-06, "loss": 0.0783, "step": 945 }, { "epoch": 0.43039126478616924, "grad_norm": 0.8761882129732462, "learning_rate": 4.9093358975322025e-06, "loss": 0.0703, "step": 946 }, { "epoch": 0.43084622383985444, "grad_norm": 0.8329694474419541, "learning_rate": 4.909145085682198e-06, "loss": 0.0747, "step": 947 }, { "epoch": 0.4313011828935396, "grad_norm": 0.7018276200636866, "learning_rate": 4.908954076967737e-06, "loss": 0.05, "step": 948 }, { "epoch": 0.4317561419472247, "grad_norm": 0.8623770812142951, "learning_rate": 4.908762871404427e-06, "loss": 0.1035, "step": 949 }, { "epoch": 0.4322111010009099, "grad_norm": 0.7203196297821325, "learning_rate": 4.908571469007893e-06, "loss": 0.0597, "step": 950 }, { "epoch": 0.43266606005459507, "grad_norm": 0.891364295683239, "learning_rate": 4.908379869793776e-06, "loss": 0.0656, "step": 951 }, { "epoch": 0.43312101910828027, "grad_norm": 0.7267875932577169, "learning_rate": 4.908188073777732e-06, "loss": 0.0537, "step": 952 }, { "epoch": 0.4335759781619654, "grad_norm": 1.136525948908607, "learning_rate": 4.9079960809754334e-06, "loss": 0.1066, "step": 953 }, { "epoch": 0.4340309372156506, "grad_norm": 0.6404455978391249, "learning_rate": 4.90780389140257e-06, "loss": 0.0506, "step": 954 }, { "epoch": 0.43448589626933576, "grad_norm": 1.1106980810150886, "learning_rate": 4.907611505074846e-06, "loss": 0.0756, "step": 955 }, { "epoch": 0.4349408553230209, "grad_norm": 0.8892492946222721, "learning_rate": 4.907418922007983e-06, "loss": 0.0755, "step": 956 }, { "epoch": 0.4353958143767061, "grad_norm": 0.7842836260244193, "learning_rate": 4.907226142217717e-06, "loss": 0.0584, "step": 957 }, { "epoch": 0.43585077343039125, "grad_norm": 0.7902286572984042, "learning_rate": 4.9070331657198015e-06, "loss": 0.0607, "step": 958 }, { "epoch": 0.43630573248407645, "grad_norm": 0.8874306420389249, "learning_rate": 4.906839992530006e-06, "loss": 0.0785, "step": 959 }, { "epoch": 0.4367606915377616, "grad_norm": 0.8366306399984552, "learning_rate": 4.906646622664115e-06, "loss": 0.0713, "step": 960 }, { "epoch": 0.4372156505914468, "grad_norm": 0.7074038721272251, "learning_rate": 4.906453056137931e-06, "loss": 0.041, "step": 961 }, { "epoch": 0.43767060964513194, "grad_norm": 1.1462267850180623, "learning_rate": 4.90625929296727e-06, "loss": 0.1047, "step": 962 }, { "epoch": 0.4381255686988171, "grad_norm": 0.6641712180680458, "learning_rate": 4.9060653331679665e-06, "loss": 0.0685, "step": 963 }, { "epoch": 0.4385805277525023, "grad_norm": 0.8700860351399569, "learning_rate": 4.90587117675587e-06, "loss": 0.0821, "step": 964 }, { "epoch": 0.4390354868061874, "grad_norm": 0.9136082234067431, "learning_rate": 4.905676823746846e-06, "loss": 0.0645, "step": 965 }, { "epoch": 0.4394904458598726, "grad_norm": 0.5926511030671447, "learning_rate": 4.9054822741567745e-06, "loss": 0.0487, "step": 966 }, { "epoch": 0.43994540491355777, "grad_norm": 0.8424384986445863, "learning_rate": 4.905287528001555e-06, "loss": 0.0621, "step": 967 }, { "epoch": 0.44040036396724297, "grad_norm": 0.6973422193542876, "learning_rate": 4.905092585297102e-06, "loss": 0.0583, "step": 968 }, { "epoch": 0.4408553230209281, "grad_norm": 0.9155989139411984, "learning_rate": 4.904897446059344e-06, "loss": 0.0699, "step": 969 }, { "epoch": 0.44131028207461326, "grad_norm": 0.7443964393050531, "learning_rate": 4.9047021103042255e-06, "loss": 0.051, "step": 970 }, { "epoch": 0.44176524112829846, "grad_norm": 0.7677456881758877, "learning_rate": 4.904506578047712e-06, "loss": 0.0559, "step": 971 }, { "epoch": 0.4422202001819836, "grad_norm": 0.9425858695883391, "learning_rate": 4.9043108493057785e-06, "loss": 0.0633, "step": 972 }, { "epoch": 0.4426751592356688, "grad_norm": 0.6366934546327063, "learning_rate": 4.904114924094421e-06, "loss": 0.0464, "step": 973 }, { "epoch": 0.44313011828935395, "grad_norm": 0.802460890266695, "learning_rate": 4.903918802429648e-06, "loss": 0.0727, "step": 974 }, { "epoch": 0.44358507734303915, "grad_norm": 0.44074125028934635, "learning_rate": 4.9037224843274875e-06, "loss": 0.0375, "step": 975 }, { "epoch": 0.4440400363967243, "grad_norm": 0.9236685790892595, "learning_rate": 4.903525969803979e-06, "loss": 0.0914, "step": 976 }, { "epoch": 0.44449499545040944, "grad_norm": 0.8186044519325196, "learning_rate": 4.903329258875184e-06, "loss": 0.0582, "step": 977 }, { "epoch": 0.44494995450409464, "grad_norm": 0.760419987901125, "learning_rate": 4.903132351557175e-06, "loss": 0.0662, "step": 978 }, { "epoch": 0.4454049135577798, "grad_norm": 0.8487192410638724, "learning_rate": 4.902935247866043e-06, "loss": 0.0622, "step": 979 }, { "epoch": 0.445859872611465, "grad_norm": 0.8969279017038029, "learning_rate": 4.9027379478178935e-06, "loss": 0.0696, "step": 980 }, { "epoch": 0.4463148316651501, "grad_norm": 0.8275157306730986, "learning_rate": 4.90254045142885e-06, "loss": 0.0617, "step": 981 }, { "epoch": 0.4467697907188353, "grad_norm": 0.8042954485928273, "learning_rate": 4.90234275871505e-06, "loss": 0.053, "step": 982 }, { "epoch": 0.44722474977252047, "grad_norm": 1.1786231664461284, "learning_rate": 4.9021448696926486e-06, "loss": 0.0986, "step": 983 }, { "epoch": 0.44767970882620567, "grad_norm": 0.6298200533016487, "learning_rate": 4.901946784377816e-06, "loss": 0.065, "step": 984 }, { "epoch": 0.4481346678798908, "grad_norm": 0.594574358873745, "learning_rate": 4.90174850278674e-06, "loss": 0.0539, "step": 985 }, { "epoch": 0.44858962693357596, "grad_norm": 0.679879881000302, "learning_rate": 4.901550024935623e-06, "loss": 0.0654, "step": 986 }, { "epoch": 0.44904458598726116, "grad_norm": 0.5886266734655748, "learning_rate": 4.901351350840683e-06, "loss": 0.0532, "step": 987 }, { "epoch": 0.4494995450409463, "grad_norm": 0.7808229432327206, "learning_rate": 4.901152480518155e-06, "loss": 0.048, "step": 988 }, { "epoch": 0.4499545040946315, "grad_norm": 0.6018998346440647, "learning_rate": 4.900953413984289e-06, "loss": 0.0494, "step": 989 }, { "epoch": 0.45040946314831665, "grad_norm": 1.136855215162297, "learning_rate": 4.900754151255353e-06, "loss": 0.1101, "step": 990 }, { "epoch": 0.45086442220200185, "grad_norm": 0.7654221991027399, "learning_rate": 4.9005546923476305e-06, "loss": 0.0514, "step": 991 }, { "epoch": 0.451319381255687, "grad_norm": 0.8646543296697372, "learning_rate": 4.9003550372774185e-06, "loss": 0.0773, "step": 992 }, { "epoch": 0.45177434030937214, "grad_norm": 0.7391042261561228, "learning_rate": 4.900155186061033e-06, "loss": 0.0593, "step": 993 }, { "epoch": 0.45222929936305734, "grad_norm": 0.9599073893248152, "learning_rate": 4.8999551387148045e-06, "loss": 0.0609, "step": 994 }, { "epoch": 0.4526842584167425, "grad_norm": 0.6526165463731411, "learning_rate": 4.89975489525508e-06, "loss": 0.0561, "step": 995 }, { "epoch": 0.4531392174704277, "grad_norm": 0.8465255037900871, "learning_rate": 4.899554455698223e-06, "loss": 0.0671, "step": 996 }, { "epoch": 0.4535941765241128, "grad_norm": 0.7201042670777416, "learning_rate": 4.899353820060612e-06, "loss": 0.0528, "step": 997 }, { "epoch": 0.454049135577798, "grad_norm": 1.0593597416511176, "learning_rate": 4.899152988358643e-06, "loss": 0.0911, "step": 998 }, { "epoch": 0.45450409463148317, "grad_norm": 0.6740510359790731, "learning_rate": 4.898951960608725e-06, "loss": 0.0516, "step": 999 }, { "epoch": 0.4549590536851683, "grad_norm": 0.7844312545261172, "learning_rate": 4.8987507368272865e-06, "loss": 0.0669, "step": 1000 }, { "epoch": 0.4554140127388535, "grad_norm": 0.8616523144825781, "learning_rate": 4.898549317030772e-06, "loss": 0.0793, "step": 1001 }, { "epoch": 0.45586897179253866, "grad_norm": 0.9076102187991024, "learning_rate": 4.898347701235637e-06, "loss": 0.0774, "step": 1002 }, { "epoch": 0.45632393084622386, "grad_norm": 0.9763695817464088, "learning_rate": 4.89814588945836e-06, "loss": 0.0893, "step": 1003 }, { "epoch": 0.456778889899909, "grad_norm": 0.83319057600543, "learning_rate": 4.89794388171543e-06, "loss": 0.0707, "step": 1004 }, { "epoch": 0.4572338489535942, "grad_norm": 1.8792537681412733, "learning_rate": 4.897741678023356e-06, "loss": 0.0764, "step": 1005 }, { "epoch": 0.45768880800727935, "grad_norm": 0.7734685973931732, "learning_rate": 4.897539278398659e-06, "loss": 0.0627, "step": 1006 }, { "epoch": 0.4581437670609645, "grad_norm": 0.9415629435575145, "learning_rate": 4.8973366828578804e-06, "loss": 0.0739, "step": 1007 }, { "epoch": 0.4585987261146497, "grad_norm": 0.7425043558467179, "learning_rate": 4.897133891417574e-06, "loss": 0.0654, "step": 1008 }, { "epoch": 0.45905368516833484, "grad_norm": 0.8911942098198534, "learning_rate": 4.896930904094311e-06, "loss": 0.0561, "step": 1009 }, { "epoch": 0.45950864422202004, "grad_norm": 1.2263772300651212, "learning_rate": 4.896727720904679e-06, "loss": 0.0864, "step": 1010 }, { "epoch": 0.4599636032757052, "grad_norm": 0.5601400077515136, "learning_rate": 4.896524341865282e-06, "loss": 0.0438, "step": 1011 }, { "epoch": 0.4604185623293904, "grad_norm": 0.877896381579309, "learning_rate": 4.896320766992737e-06, "loss": 0.0942, "step": 1012 }, { "epoch": 0.4608735213830755, "grad_norm": 0.5305552323500966, "learning_rate": 4.896116996303682e-06, "loss": 0.0529, "step": 1013 }, { "epoch": 0.46132848043676067, "grad_norm": 1.0131750705641154, "learning_rate": 4.895913029814766e-06, "loss": 0.0615, "step": 1014 }, { "epoch": 0.46178343949044587, "grad_norm": 1.000424807381978, "learning_rate": 4.895708867542658e-06, "loss": 0.0715, "step": 1015 }, { "epoch": 0.462238398544131, "grad_norm": 0.8460760286247231, "learning_rate": 4.895504509504039e-06, "loss": 0.0668, "step": 1016 }, { "epoch": 0.4626933575978162, "grad_norm": 0.7307313549457798, "learning_rate": 4.89529995571561e-06, "loss": 0.0703, "step": 1017 }, { "epoch": 0.46314831665150136, "grad_norm": 0.9064676368721324, "learning_rate": 4.895095206194086e-06, "loss": 0.0741, "step": 1018 }, { "epoch": 0.46360327570518656, "grad_norm": 0.7836127913087492, "learning_rate": 4.894890260956198e-06, "loss": 0.0609, "step": 1019 }, { "epoch": 0.4640582347588717, "grad_norm": 0.8243374701106395, "learning_rate": 4.8946851200186925e-06, "loss": 0.0714, "step": 1020 }, { "epoch": 0.46451319381255685, "grad_norm": 0.7466209538262989, "learning_rate": 4.894479783398334e-06, "loss": 0.0645, "step": 1021 }, { "epoch": 0.46496815286624205, "grad_norm": 0.8777577697024573, "learning_rate": 4.8942742511119004e-06, "loss": 0.0702, "step": 1022 }, { "epoch": 0.4654231119199272, "grad_norm": 2.534404305990435, "learning_rate": 4.894068523176187e-06, "loss": 0.1764, "step": 1023 }, { "epoch": 0.4658780709736124, "grad_norm": 0.6909522226512711, "learning_rate": 4.8938625996080056e-06, "loss": 0.0609, "step": 1024 }, { "epoch": 0.46633303002729753, "grad_norm": 0.49151101507719136, "learning_rate": 4.893656480424184e-06, "loss": 0.038, "step": 1025 }, { "epoch": 0.46678798908098273, "grad_norm": 0.6934743220996047, "learning_rate": 4.893450165641564e-06, "loss": 0.0639, "step": 1026 }, { "epoch": 0.4672429481346679, "grad_norm": 0.6130829752556336, "learning_rate": 4.893243655277005e-06, "loss": 0.0555, "step": 1027 }, { "epoch": 0.467697907188353, "grad_norm": 0.7605330677818611, "learning_rate": 4.893036949347383e-06, "loss": 0.0617, "step": 1028 }, { "epoch": 0.4681528662420382, "grad_norm": 0.6903185226412064, "learning_rate": 4.892830047869588e-06, "loss": 0.0568, "step": 1029 }, { "epoch": 0.46860782529572337, "grad_norm": 0.6912281627309118, "learning_rate": 4.892622950860527e-06, "loss": 0.0395, "step": 1030 }, { "epoch": 0.46906278434940857, "grad_norm": 0.6581879258835811, "learning_rate": 4.892415658337123e-06, "loss": 0.0634, "step": 1031 }, { "epoch": 0.4695177434030937, "grad_norm": 0.6826891181604423, "learning_rate": 4.892208170316317e-06, "loss": 0.054, "step": 1032 }, { "epoch": 0.4699727024567789, "grad_norm": 0.5663228273948423, "learning_rate": 4.892000486815062e-06, "loss": 0.0429, "step": 1033 }, { "epoch": 0.47042766151046406, "grad_norm": 0.6736706134956636, "learning_rate": 4.891792607850328e-06, "loss": 0.0576, "step": 1034 }, { "epoch": 0.4708826205641492, "grad_norm": 1.3024590269353873, "learning_rate": 4.891584533439104e-06, "loss": 0.0942, "step": 1035 }, { "epoch": 0.4713375796178344, "grad_norm": 0.7591015395797897, "learning_rate": 4.891376263598393e-06, "loss": 0.0686, "step": 1036 }, { "epoch": 0.47179253867151955, "grad_norm": 0.6830269838411529, "learning_rate": 4.891167798345213e-06, "loss": 0.0546, "step": 1037 }, { "epoch": 0.47224749772520475, "grad_norm": 1.08323398335216, "learning_rate": 4.890959137696598e-06, "loss": 0.0891, "step": 1038 }, { "epoch": 0.4727024567788899, "grad_norm": 0.8474479862966637, "learning_rate": 4.890750281669601e-06, "loss": 0.0647, "step": 1039 }, { "epoch": 0.4731574158325751, "grad_norm": 0.5830298328980045, "learning_rate": 4.890541230281287e-06, "loss": 0.0434, "step": 1040 }, { "epoch": 0.47361237488626023, "grad_norm": 0.8284120242669826, "learning_rate": 4.8903319835487385e-06, "loss": 0.0658, "step": 1041 }, { "epoch": 0.4740673339399454, "grad_norm": 1.0749546085564385, "learning_rate": 4.890122541489056e-06, "loss": 0.0781, "step": 1042 }, { "epoch": 0.4745222929936306, "grad_norm": 0.8028746869091197, "learning_rate": 4.889912904119353e-06, "loss": 0.0745, "step": 1043 }, { "epoch": 0.4749772520473157, "grad_norm": 0.9995418792465922, "learning_rate": 4.88970307145676e-06, "loss": 0.0676, "step": 1044 }, { "epoch": 0.4754322111010009, "grad_norm": 0.6522882405079128, "learning_rate": 4.889493043518423e-06, "loss": 0.0562, "step": 1045 }, { "epoch": 0.47588717015468607, "grad_norm": 0.5881171761200348, "learning_rate": 4.889282820321506e-06, "loss": 0.0346, "step": 1046 }, { "epoch": 0.47634212920837127, "grad_norm": 0.9483028123171037, "learning_rate": 4.889072401883187e-06, "loss": 0.0667, "step": 1047 }, { "epoch": 0.4767970882620564, "grad_norm": 0.6670957840215991, "learning_rate": 4.88886178822066e-06, "loss": 0.058, "step": 1048 }, { "epoch": 0.47725204731574156, "grad_norm": 0.8148080209615183, "learning_rate": 4.888650979351136e-06, "loss": 0.0719, "step": 1049 }, { "epoch": 0.47770700636942676, "grad_norm": 0.9532630639821967, "learning_rate": 4.888439975291841e-06, "loss": 0.0953, "step": 1050 }, { "epoch": 0.4781619654231119, "grad_norm": 0.6515552152853241, "learning_rate": 4.888228776060017e-06, "loss": 0.0599, "step": 1051 }, { "epoch": 0.4786169244767971, "grad_norm": 0.8306189556551553, "learning_rate": 4.888017381672923e-06, "loss": 0.0616, "step": 1052 }, { "epoch": 0.47907188353048225, "grad_norm": 0.7827901374918418, "learning_rate": 4.887805792147832e-06, "loss": 0.0609, "step": 1053 }, { "epoch": 0.47952684258416745, "grad_norm": 1.0177718214883258, "learning_rate": 4.887594007502036e-06, "loss": 0.0655, "step": 1054 }, { "epoch": 0.4799818016378526, "grad_norm": 1.0068335369987877, "learning_rate": 4.887382027752838e-06, "loss": 0.0723, "step": 1055 }, { "epoch": 0.48043676069153773, "grad_norm": 0.7718561478385435, "learning_rate": 4.8871698529175636e-06, "loss": 0.0706, "step": 1056 }, { "epoch": 0.48089171974522293, "grad_norm": 0.8506407410185749, "learning_rate": 4.886957483013549e-06, "loss": 0.0794, "step": 1057 }, { "epoch": 0.4813466787989081, "grad_norm": 1.0436903958800676, "learning_rate": 4.886744918058149e-06, "loss": 0.0863, "step": 1058 }, { "epoch": 0.4818016378525933, "grad_norm": 0.8684291097009643, "learning_rate": 4.886532158068732e-06, "loss": 0.0639, "step": 1059 }, { "epoch": 0.4822565969062784, "grad_norm": 0.8196534236848143, "learning_rate": 4.886319203062683e-06, "loss": 0.0575, "step": 1060 }, { "epoch": 0.4827115559599636, "grad_norm": 0.9689674310762992, "learning_rate": 4.886106053057408e-06, "loss": 0.0676, "step": 1061 }, { "epoch": 0.48316651501364877, "grad_norm": 0.9818608722703667, "learning_rate": 4.88589270807032e-06, "loss": 0.0799, "step": 1062 }, { "epoch": 0.48362147406733397, "grad_norm": 0.7489753545607997, "learning_rate": 4.885679168118855e-06, "loss": 0.071, "step": 1063 }, { "epoch": 0.4840764331210191, "grad_norm": 0.9547184698601481, "learning_rate": 4.8854654332204635e-06, "loss": 0.0898, "step": 1064 }, { "epoch": 0.48453139217470426, "grad_norm": 0.7541528249104351, "learning_rate": 4.885251503392607e-06, "loss": 0.0543, "step": 1065 }, { "epoch": 0.48498635122838946, "grad_norm": 1.067810398256581, "learning_rate": 4.885037378652771e-06, "loss": 0.092, "step": 1066 }, { "epoch": 0.4854413102820746, "grad_norm": 0.8614801184366359, "learning_rate": 4.884823059018451e-06, "loss": 0.0523, "step": 1067 }, { "epoch": 0.4858962693357598, "grad_norm": 0.8148290305727078, "learning_rate": 4.88460854450716e-06, "loss": 0.0584, "step": 1068 }, { "epoch": 0.48635122838944495, "grad_norm": 0.5855038739875491, "learning_rate": 4.884393835136427e-06, "loss": 0.0518, "step": 1069 }, { "epoch": 0.48680618744313015, "grad_norm": 0.8161089915400275, "learning_rate": 4.884178930923799e-06, "loss": 0.0574, "step": 1070 }, { "epoch": 0.4872611464968153, "grad_norm": 0.9133819494317933, "learning_rate": 4.883963831886834e-06, "loss": 0.0646, "step": 1071 }, { "epoch": 0.48771610555050043, "grad_norm": 0.8617450933904238, "learning_rate": 4.8837485380431115e-06, "loss": 0.0681, "step": 1072 }, { "epoch": 0.48817106460418563, "grad_norm": 0.6850383590418775, "learning_rate": 4.883533049410223e-06, "loss": 0.0547, "step": 1073 }, { "epoch": 0.4886260236578708, "grad_norm": 0.9045667246036927, "learning_rate": 4.8833173660057785e-06, "loss": 0.0759, "step": 1074 }, { "epoch": 0.489080982711556, "grad_norm": 0.8809498978267978, "learning_rate": 4.8831014878474004e-06, "loss": 0.0695, "step": 1075 }, { "epoch": 0.4895359417652411, "grad_norm": 0.7567941392113556, "learning_rate": 4.882885414952732e-06, "loss": 0.0626, "step": 1076 }, { "epoch": 0.4899909008189263, "grad_norm": 0.5783840394829795, "learning_rate": 4.882669147339428e-06, "loss": 0.0398, "step": 1077 }, { "epoch": 0.49044585987261147, "grad_norm": 0.6472410247637905, "learning_rate": 4.882452685025161e-06, "loss": 0.0433, "step": 1078 }, { "epoch": 0.4909008189262966, "grad_norm": 1.0450390786251647, "learning_rate": 4.88223602802762e-06, "loss": 0.0777, "step": 1079 }, { "epoch": 0.4913557779799818, "grad_norm": 0.7447912796292419, "learning_rate": 4.882019176364509e-06, "loss": 0.0717, "step": 1080 }, { "epoch": 0.49181073703366696, "grad_norm": 1.0045515435128731, "learning_rate": 4.881802130053548e-06, "loss": 0.0846, "step": 1081 }, { "epoch": 0.49226569608735216, "grad_norm": 0.7406749978383526, "learning_rate": 4.881584889112473e-06, "loss": 0.0576, "step": 1082 }, { "epoch": 0.4927206551410373, "grad_norm": 0.8307193744296768, "learning_rate": 4.881367453559036e-06, "loss": 0.0666, "step": 1083 }, { "epoch": 0.4931756141947225, "grad_norm": 0.6076387216705473, "learning_rate": 4.881149823411005e-06, "loss": 0.039, "step": 1084 }, { "epoch": 0.49363057324840764, "grad_norm": 0.6332467754054336, "learning_rate": 4.880931998686162e-06, "loss": 0.0494, "step": 1085 }, { "epoch": 0.4940855323020928, "grad_norm": 1.1521530496126213, "learning_rate": 4.880713979402311e-06, "loss": 0.1118, "step": 1086 }, { "epoch": 0.494540491355778, "grad_norm": 1.3887385722770256, "learning_rate": 4.880495765577263e-06, "loss": 0.0973, "step": 1087 }, { "epoch": 0.49499545040946313, "grad_norm": 0.8060673459124312, "learning_rate": 4.880277357228852e-06, "loss": 0.057, "step": 1088 }, { "epoch": 0.49545040946314833, "grad_norm": 0.5602480552863556, "learning_rate": 4.880058754374923e-06, "loss": 0.0521, "step": 1089 }, { "epoch": 0.4959053685168335, "grad_norm": 0.823959291020419, "learning_rate": 4.879839957033343e-06, "loss": 0.0732, "step": 1090 }, { "epoch": 0.4963603275705187, "grad_norm": 0.7660167202497931, "learning_rate": 4.879620965221987e-06, "loss": 0.0607, "step": 1091 }, { "epoch": 0.4968152866242038, "grad_norm": 0.6055131455183578, "learning_rate": 4.879401778958755e-06, "loss": 0.0478, "step": 1092 }, { "epoch": 0.49727024567788897, "grad_norm": 0.6291031678530673, "learning_rate": 4.8791823982615525e-06, "loss": 0.041, "step": 1093 }, { "epoch": 0.49772520473157417, "grad_norm": 0.5530508650871759, "learning_rate": 4.878962823148308e-06, "loss": 0.0424, "step": 1094 }, { "epoch": 0.4981801637852593, "grad_norm": 0.8531604298870573, "learning_rate": 4.878743053636968e-06, "loss": 0.0701, "step": 1095 }, { "epoch": 0.4986351228389445, "grad_norm": 0.7748184089023449, "learning_rate": 4.878523089745485e-06, "loss": 0.0748, "step": 1096 }, { "epoch": 0.49909008189262966, "grad_norm": 0.572162405974625, "learning_rate": 4.878302931491837e-06, "loss": 0.0531, "step": 1097 }, { "epoch": 0.49954504094631486, "grad_norm": 0.5587152183134356, "learning_rate": 4.8780825788940145e-06, "loss": 0.0377, "step": 1098 }, { "epoch": 0.5, "grad_norm": 0.7739049242000703, "learning_rate": 4.877862031970023e-06, "loss": 0.0653, "step": 1099 }, { "epoch": 0.5004549590536852, "grad_norm": 0.9657161074292855, "learning_rate": 4.8776412907378845e-06, "loss": 0.0693, "step": 1100 }, { "epoch": 0.5009099181073703, "grad_norm": 0.7975995176144653, "learning_rate": 4.877420355215637e-06, "loss": 0.0647, "step": 1101 }, { "epoch": 0.5013648771610555, "grad_norm": 1.0916440196980206, "learning_rate": 4.877199225421334e-06, "loss": 0.0904, "step": 1102 }, { "epoch": 0.5018198362147407, "grad_norm": 1.031914303869678, "learning_rate": 4.8769779013730454e-06, "loss": 0.1104, "step": 1103 }, { "epoch": 0.5022747952684259, "grad_norm": 0.9179664512897192, "learning_rate": 4.876756383088858e-06, "loss": 0.0731, "step": 1104 }, { "epoch": 0.502729754322111, "grad_norm": 0.5895244169254785, "learning_rate": 4.876534670586872e-06, "loss": 0.0513, "step": 1105 }, { "epoch": 0.5031847133757962, "grad_norm": 0.654710185038575, "learning_rate": 4.8763127638852045e-06, "loss": 0.0605, "step": 1106 }, { "epoch": 0.5036396724294814, "grad_norm": 0.7685792189309535, "learning_rate": 4.87609066300199e-06, "loss": 0.068, "step": 1107 }, { "epoch": 0.5040946314831665, "grad_norm": 0.8932660080856344, "learning_rate": 4.875868367955376e-06, "loss": 0.0789, "step": 1108 }, { "epoch": 0.5045495905368517, "grad_norm": 1.0120677400517832, "learning_rate": 4.87564587876353e-06, "loss": 0.0864, "step": 1109 }, { "epoch": 0.5050045495905369, "grad_norm": 0.835851016625387, "learning_rate": 4.87542319544463e-06, "loss": 0.0582, "step": 1110 }, { "epoch": 0.5054595086442221, "grad_norm": 0.7695840357212476, "learning_rate": 4.875200318016873e-06, "loss": 0.0675, "step": 1111 }, { "epoch": 0.5059144676979072, "grad_norm": 0.7971275413132646, "learning_rate": 4.8749772464984736e-06, "loss": 0.0743, "step": 1112 }, { "epoch": 0.5063694267515924, "grad_norm": 0.8814806686041933, "learning_rate": 4.874753980907658e-06, "loss": 0.0856, "step": 1113 }, { "epoch": 0.5068243858052776, "grad_norm": 0.6757729077082226, "learning_rate": 4.8745305212626714e-06, "loss": 0.0512, "step": 1114 }, { "epoch": 0.5072793448589626, "grad_norm": 0.7352914895461456, "learning_rate": 4.874306867581775e-06, "loss": 0.0618, "step": 1115 }, { "epoch": 0.5077343039126478, "grad_norm": 0.5868042194217611, "learning_rate": 4.874083019883242e-06, "loss": 0.0366, "step": 1116 }, { "epoch": 0.508189262966333, "grad_norm": 0.9033247458477103, "learning_rate": 4.873858978185367e-06, "loss": 0.0806, "step": 1117 }, { "epoch": 0.5086442220200182, "grad_norm": 1.2038578681531908, "learning_rate": 4.8736347425064565e-06, "loss": 0.1055, "step": 1118 }, { "epoch": 0.5090991810737033, "grad_norm": 0.8178175242835675, "learning_rate": 4.873410312864833e-06, "loss": 0.0609, "step": 1119 }, { "epoch": 0.5095541401273885, "grad_norm": 0.6653546708039177, "learning_rate": 4.8731856892788384e-06, "loss": 0.0495, "step": 1120 }, { "epoch": 0.5100090991810737, "grad_norm": 1.0479881259244002, "learning_rate": 4.872960871766826e-06, "loss": 0.0943, "step": 1121 }, { "epoch": 0.5104640582347588, "grad_norm": 0.6898301418972904, "learning_rate": 4.8727358603471675e-06, "loss": 0.072, "step": 1122 }, { "epoch": 0.510919017288444, "grad_norm": 0.9228312888198933, "learning_rate": 4.872510655038249e-06, "loss": 0.0594, "step": 1123 }, { "epoch": 0.5113739763421292, "grad_norm": 0.6811960575278385, "learning_rate": 4.872285255858476e-06, "loss": 0.0675, "step": 1124 }, { "epoch": 0.5118289353958144, "grad_norm": 0.9624744009083318, "learning_rate": 4.872059662826263e-06, "loss": 0.0766, "step": 1125 }, { "epoch": 0.5122838944494995, "grad_norm": 0.7538277543537744, "learning_rate": 4.8718338759600465e-06, "loss": 0.0592, "step": 1126 }, { "epoch": 0.5127388535031847, "grad_norm": 0.7210100766327706, "learning_rate": 4.871607895278278e-06, "loss": 0.0723, "step": 1127 }, { "epoch": 0.5131938125568699, "grad_norm": 0.6525437186084021, "learning_rate": 4.871381720799421e-06, "loss": 0.0474, "step": 1128 }, { "epoch": 0.513648771610555, "grad_norm": 0.8115901002403193, "learning_rate": 4.8711553525419595e-06, "loss": 0.066, "step": 1129 }, { "epoch": 0.5141037306642402, "grad_norm": 0.7148215823332176, "learning_rate": 4.87092879052439e-06, "loss": 0.0627, "step": 1130 }, { "epoch": 0.5145586897179254, "grad_norm": 0.48371660949391987, "learning_rate": 4.8707020347652275e-06, "loss": 0.0392, "step": 1131 }, { "epoch": 0.5150136487716106, "grad_norm": 0.7655779285836447, "learning_rate": 4.870475085283001e-06, "loss": 0.0659, "step": 1132 }, { "epoch": 0.5154686078252957, "grad_norm": 0.8307291704590695, "learning_rate": 4.870247942096254e-06, "loss": 0.0675, "step": 1133 }, { "epoch": 0.5159235668789809, "grad_norm": 0.6005245010930204, "learning_rate": 4.870020605223551e-06, "loss": 0.0435, "step": 1134 }, { "epoch": 0.5163785259326661, "grad_norm": 0.8060750325493741, "learning_rate": 4.869793074683466e-06, "loss": 0.06, "step": 1135 }, { "epoch": 0.5168334849863512, "grad_norm": 1.1918274978409322, "learning_rate": 4.8695653504945925e-06, "loss": 0.082, "step": 1136 }, { "epoch": 0.5172884440400364, "grad_norm": 1.307377261046503, "learning_rate": 4.8693374326755405e-06, "loss": 0.1036, "step": 1137 }, { "epoch": 0.5177434030937216, "grad_norm": 0.6775134916120404, "learning_rate": 4.869109321244932e-06, "loss": 0.0626, "step": 1138 }, { "epoch": 0.5181983621474068, "grad_norm": 0.7826832431767746, "learning_rate": 4.86888101622141e-06, "loss": 0.0678, "step": 1139 }, { "epoch": 0.5186533212010919, "grad_norm": 0.7645020910331457, "learning_rate": 4.868652517623629e-06, "loss": 0.0489, "step": 1140 }, { "epoch": 0.5191082802547771, "grad_norm": 0.632952475643817, "learning_rate": 4.86842382547026e-06, "loss": 0.0494, "step": 1141 }, { "epoch": 0.5195632393084623, "grad_norm": 0.6876406168526772, "learning_rate": 4.868194939779992e-06, "loss": 0.0396, "step": 1142 }, { "epoch": 0.5200181983621474, "grad_norm": 0.6327165174782583, "learning_rate": 4.867965860571529e-06, "loss": 0.054, "step": 1143 }, { "epoch": 0.5204731574158326, "grad_norm": 1.0011588527834634, "learning_rate": 4.867736587863589e-06, "loss": 0.0877, "step": 1144 }, { "epoch": 0.5209281164695178, "grad_norm": 1.1185639295345813, "learning_rate": 4.867507121674907e-06, "loss": 0.0861, "step": 1145 }, { "epoch": 0.521383075523203, "grad_norm": 0.8624559113619508, "learning_rate": 4.867277462024235e-06, "loss": 0.0629, "step": 1146 }, { "epoch": 0.521838034576888, "grad_norm": 0.9348410440606036, "learning_rate": 4.8670476089303395e-06, "loss": 0.0933, "step": 1147 }, { "epoch": 0.5222929936305732, "grad_norm": 0.9320714625912361, "learning_rate": 4.866817562412003e-06, "loss": 0.1038, "step": 1148 }, { "epoch": 0.5227479526842584, "grad_norm": 0.8075624594132853, "learning_rate": 4.866587322488024e-06, "loss": 0.0809, "step": 1149 }, { "epoch": 0.5232029117379435, "grad_norm": 1.3413067521083781, "learning_rate": 4.866356889177216e-06, "loss": 0.108, "step": 1150 }, { "epoch": 0.5236578707916287, "grad_norm": 0.9232308217568203, "learning_rate": 4.866126262498409e-06, "loss": 0.083, "step": 1151 }, { "epoch": 0.5241128298453139, "grad_norm": 1.305379354125092, "learning_rate": 4.865895442470449e-06, "loss": 0.0958, "step": 1152 }, { "epoch": 0.5245677888989991, "grad_norm": 0.5576012713848366, "learning_rate": 4.865664429112199e-06, "loss": 0.0452, "step": 1153 }, { "epoch": 0.5250227479526842, "grad_norm": 0.921237979155653, "learning_rate": 4.8654332224425345e-06, "loss": 0.0711, "step": 1154 }, { "epoch": 0.5254777070063694, "grad_norm": 0.8929194799091247, "learning_rate": 4.865201822480349e-06, "loss": 0.0764, "step": 1155 }, { "epoch": 0.5259326660600546, "grad_norm": 0.581346673528037, "learning_rate": 4.864970229244552e-06, "loss": 0.0424, "step": 1156 }, { "epoch": 0.5263876251137397, "grad_norm": 0.5623832436496817, "learning_rate": 4.864738442754068e-06, "loss": 0.0434, "step": 1157 }, { "epoch": 0.5268425841674249, "grad_norm": 0.613920526367082, "learning_rate": 4.864506463027837e-06, "loss": 0.0506, "step": 1158 }, { "epoch": 0.5272975432211101, "grad_norm": 1.0387758175670767, "learning_rate": 4.864274290084816e-06, "loss": 0.0875, "step": 1159 }, { "epoch": 0.5277525022747953, "grad_norm": 0.8664987550519835, "learning_rate": 4.864041923943978e-06, "loss": 0.0633, "step": 1160 }, { "epoch": 0.5282074613284804, "grad_norm": 0.8313294484835317, "learning_rate": 4.863809364624309e-06, "loss": 0.069, "step": 1161 }, { "epoch": 0.5286624203821656, "grad_norm": 0.744844430385872, "learning_rate": 4.863576612144814e-06, "loss": 0.0669, "step": 1162 }, { "epoch": 0.5291173794358508, "grad_norm": 0.9237519112875051, "learning_rate": 4.863343666524512e-06, "loss": 0.0735, "step": 1163 }, { "epoch": 0.5295723384895359, "grad_norm": 1.0354177918634275, "learning_rate": 4.863110527782437e-06, "loss": 0.0663, "step": 1164 }, { "epoch": 0.5300272975432211, "grad_norm": 0.8421924118581489, "learning_rate": 4.8628771959376435e-06, "loss": 0.0611, "step": 1165 }, { "epoch": 0.5304822565969063, "grad_norm": 0.9052022304190199, "learning_rate": 4.862643671009195e-06, "loss": 0.0678, "step": 1166 }, { "epoch": 0.5309372156505915, "grad_norm": 0.5590184662551977, "learning_rate": 4.862409953016175e-06, "loss": 0.0643, "step": 1167 }, { "epoch": 0.5313921747042766, "grad_norm": 1.017387677331305, "learning_rate": 4.862176041977683e-06, "loss": 0.0893, "step": 1168 }, { "epoch": 0.5318471337579618, "grad_norm": 0.7533348435394249, "learning_rate": 4.861941937912832e-06, "loss": 0.0674, "step": 1169 }, { "epoch": 0.532302092811647, "grad_norm": 0.6445462791157965, "learning_rate": 4.861707640840752e-06, "loss": 0.0493, "step": 1170 }, { "epoch": 0.5327570518653321, "grad_norm": 0.7797944668273296, "learning_rate": 4.861473150780589e-06, "loss": 0.0676, "step": 1171 }, { "epoch": 0.5332120109190173, "grad_norm": 0.9208202792762923, "learning_rate": 4.8612384677515054e-06, "loss": 0.0823, "step": 1172 }, { "epoch": 0.5336669699727025, "grad_norm": 0.597595752698138, "learning_rate": 4.861003591772677e-06, "loss": 0.0494, "step": 1173 }, { "epoch": 0.5341219290263877, "grad_norm": 0.8195100456101612, "learning_rate": 4.860768522863297e-06, "loss": 0.0538, "step": 1174 }, { "epoch": 0.5345768880800728, "grad_norm": 0.7767115772582979, "learning_rate": 4.860533261042574e-06, "loss": 0.0623, "step": 1175 }, { "epoch": 0.535031847133758, "grad_norm": 0.7588370967921573, "learning_rate": 4.8602978063297336e-06, "loss": 0.0825, "step": 1176 }, { "epoch": 0.5354868061874432, "grad_norm": 0.8816447645987197, "learning_rate": 4.8600621587440155e-06, "loss": 0.0608, "step": 1177 }, { "epoch": 0.5359417652411284, "grad_norm": 0.8625275373939374, "learning_rate": 4.859826318304676e-06, "loss": 0.0778, "step": 1178 }, { "epoch": 0.5363967242948134, "grad_norm": 0.5461509620774402, "learning_rate": 4.859590285030986e-06, "loss": 0.0555, "step": 1179 }, { "epoch": 0.5368516833484986, "grad_norm": 0.8435272435369155, "learning_rate": 4.859354058942234e-06, "loss": 0.0748, "step": 1180 }, { "epoch": 0.5373066424021838, "grad_norm": 0.7720087336226316, "learning_rate": 4.859117640057723e-06, "loss": 0.0671, "step": 1181 }, { "epoch": 0.5377616014558689, "grad_norm": 0.9891841529967211, "learning_rate": 4.858881028396773e-06, "loss": 0.0912, "step": 1182 }, { "epoch": 0.5382165605095541, "grad_norm": 0.8570437893831205, "learning_rate": 4.8586442239787165e-06, "loss": 0.065, "step": 1183 }, { "epoch": 0.5386715195632393, "grad_norm": 0.8305844073661711, "learning_rate": 4.858407226822906e-06, "loss": 0.0762, "step": 1184 }, { "epoch": 0.5391264786169245, "grad_norm": 0.640701641053205, "learning_rate": 4.858170036948707e-06, "loss": 0.0581, "step": 1185 }, { "epoch": 0.5395814376706096, "grad_norm": 0.6468235214549333, "learning_rate": 4.857932654375503e-06, "loss": 0.0482, "step": 1186 }, { "epoch": 0.5400363967242948, "grad_norm": 1.1837084026628135, "learning_rate": 4.857695079122691e-06, "loss": 0.1159, "step": 1187 }, { "epoch": 0.54049135577798, "grad_norm": 0.6715223091608853, "learning_rate": 4.857457311209683e-06, "loss": 0.0601, "step": 1188 }, { "epoch": 0.5409463148316651, "grad_norm": 0.6371483119799936, "learning_rate": 4.857219350655911e-06, "loss": 0.0528, "step": 1189 }, { "epoch": 0.5414012738853503, "grad_norm": 0.6786845490419491, "learning_rate": 4.856981197480818e-06, "loss": 0.0567, "step": 1190 }, { "epoch": 0.5418562329390355, "grad_norm": 0.5942209619771379, "learning_rate": 4.856742851703866e-06, "loss": 0.0489, "step": 1191 }, { "epoch": 0.5423111919927207, "grad_norm": 0.859323950389801, "learning_rate": 4.856504313344531e-06, "loss": 0.0904, "step": 1192 }, { "epoch": 0.5427661510464058, "grad_norm": 0.8018437461164658, "learning_rate": 4.8562655824223055e-06, "loss": 0.0597, "step": 1193 }, { "epoch": 0.543221110100091, "grad_norm": 0.588833511104325, "learning_rate": 4.856026658956697e-06, "loss": 0.0423, "step": 1194 }, { "epoch": 0.5436760691537762, "grad_norm": 0.573460707090398, "learning_rate": 4.8557875429672295e-06, "loss": 0.0633, "step": 1195 }, { "epoch": 0.5441310282074613, "grad_norm": 0.7742873470777307, "learning_rate": 4.855548234473444e-06, "loss": 0.0854, "step": 1196 }, { "epoch": 0.5445859872611465, "grad_norm": 0.9198876967543222, "learning_rate": 4.8553087334948935e-06, "loss": 0.0838, "step": 1197 }, { "epoch": 0.5450409463148317, "grad_norm": 0.6622698873314925, "learning_rate": 4.855069040051149e-06, "loss": 0.0557, "step": 1198 }, { "epoch": 0.5454959053685169, "grad_norm": 0.9436539139154301, "learning_rate": 4.854829154161799e-06, "loss": 0.0816, "step": 1199 }, { "epoch": 0.545950864422202, "grad_norm": 0.738597876294885, "learning_rate": 4.854589075846445e-06, "loss": 0.0706, "step": 1200 }, { "epoch": 0.5464058234758872, "grad_norm": 0.6650944152999292, "learning_rate": 4.854348805124704e-06, "loss": 0.0615, "step": 1201 }, { "epoch": 0.5468607825295724, "grad_norm": 0.7740616702375358, "learning_rate": 4.85410834201621e-06, "loss": 0.0622, "step": 1202 }, { "epoch": 0.5473157415832575, "grad_norm": 1.0655587786032952, "learning_rate": 4.8538676865406155e-06, "loss": 0.1008, "step": 1203 }, { "epoch": 0.5477707006369427, "grad_norm": 0.562596201287674, "learning_rate": 4.853626838717582e-06, "loss": 0.0446, "step": 1204 }, { "epoch": 0.5482256596906279, "grad_norm": 0.7450161661122177, "learning_rate": 4.853385798566793e-06, "loss": 0.0505, "step": 1205 }, { "epoch": 0.5486806187443131, "grad_norm": 0.7252053220128412, "learning_rate": 4.8531445661079444e-06, "loss": 0.0556, "step": 1206 }, { "epoch": 0.5491355777979982, "grad_norm": 0.8995425654201067, "learning_rate": 4.852903141360749e-06, "loss": 0.0759, "step": 1207 }, { "epoch": 0.5495905368516834, "grad_norm": 0.5908915627465986, "learning_rate": 4.852661524344933e-06, "loss": 0.0383, "step": 1208 }, { "epoch": 0.5500454959053686, "grad_norm": 0.8182852324660039, "learning_rate": 4.852419715080244e-06, "loss": 0.0764, "step": 1209 }, { "epoch": 0.5505004549590536, "grad_norm": 0.7043639311746182, "learning_rate": 4.852177713586437e-06, "loss": 0.0573, "step": 1210 }, { "epoch": 0.5509554140127388, "grad_norm": 0.6570600921060993, "learning_rate": 4.85193551988329e-06, "loss": 0.054, "step": 1211 }, { "epoch": 0.551410373066424, "grad_norm": 0.6708751527421707, "learning_rate": 4.851693133990594e-06, "loss": 0.0506, "step": 1212 }, { "epoch": 0.5518653321201092, "grad_norm": 0.5972152927280668, "learning_rate": 4.851450555928155e-06, "loss": 0.0427, "step": 1213 }, { "epoch": 0.5523202911737943, "grad_norm": 1.0934539341786074, "learning_rate": 4.851207785715797e-06, "loss": 0.1214, "step": 1214 }, { "epoch": 0.5527752502274795, "grad_norm": 0.7434164602830275, "learning_rate": 4.850964823373355e-06, "loss": 0.0836, "step": 1215 }, { "epoch": 0.5532302092811647, "grad_norm": 0.6176814717659638, "learning_rate": 4.850721668920685e-06, "loss": 0.0518, "step": 1216 }, { "epoch": 0.5536851683348498, "grad_norm": 0.848848092705782, "learning_rate": 4.850478322377657e-06, "loss": 0.0768, "step": 1217 }, { "epoch": 0.554140127388535, "grad_norm": 0.9187012656708098, "learning_rate": 4.8502347837641536e-06, "loss": 0.0936, "step": 1218 }, { "epoch": 0.5545950864422202, "grad_norm": 0.8042146566684509, "learning_rate": 4.8499910531000776e-06, "loss": 0.0672, "step": 1219 }, { "epoch": 0.5550500454959054, "grad_norm": 0.9323679662294462, "learning_rate": 4.849747130405346e-06, "loss": 0.0685, "step": 1220 }, { "epoch": 0.5555050045495905, "grad_norm": 0.8486293499713085, "learning_rate": 4.849503015699889e-06, "loss": 0.0637, "step": 1221 }, { "epoch": 0.5559599636032757, "grad_norm": 0.8023393545361088, "learning_rate": 4.849258709003657e-06, "loss": 0.064, "step": 1222 }, { "epoch": 0.5564149226569609, "grad_norm": 1.043454299347868, "learning_rate": 4.849014210336612e-06, "loss": 0.0837, "step": 1223 }, { "epoch": 0.556869881710646, "grad_norm": 0.880437517370894, "learning_rate": 4.848769519718734e-06, "loss": 0.0886, "step": 1224 }, { "epoch": 0.5573248407643312, "grad_norm": 0.8100133023892003, "learning_rate": 4.848524637170018e-06, "loss": 0.063, "step": 1225 }, { "epoch": 0.5577797998180164, "grad_norm": 1.2239449746298685, "learning_rate": 4.848279562710474e-06, "loss": 0.1003, "step": 1226 }, { "epoch": 0.5582347588717016, "grad_norm": 0.5784631570123066, "learning_rate": 4.848034296360129e-06, "loss": 0.0461, "step": 1227 }, { "epoch": 0.5586897179253867, "grad_norm": 0.5785185693838462, "learning_rate": 4.847788838139025e-06, "loss": 0.0584, "step": 1228 }, { "epoch": 0.5591446769790719, "grad_norm": 0.7383840653779203, "learning_rate": 4.847543188067219e-06, "loss": 0.0556, "step": 1229 }, { "epoch": 0.5595996360327571, "grad_norm": 0.8449984615944548, "learning_rate": 4.847297346164786e-06, "loss": 0.0656, "step": 1230 }, { "epoch": 0.5600545950864422, "grad_norm": 0.7138008169538578, "learning_rate": 4.8470513124518134e-06, "loss": 0.0627, "step": 1231 }, { "epoch": 0.5605095541401274, "grad_norm": 1.2286881556843667, "learning_rate": 4.8468050869484075e-06, "loss": 0.0863, "step": 1232 }, { "epoch": 0.5609645131938126, "grad_norm": 0.7827144293564832, "learning_rate": 4.846558669674688e-06, "loss": 0.0535, "step": 1233 }, { "epoch": 0.5614194722474978, "grad_norm": 0.8311970732348628, "learning_rate": 4.8463120606507904e-06, "loss": 0.0577, "step": 1234 }, { "epoch": 0.5618744313011829, "grad_norm": 1.0881117043806725, "learning_rate": 4.846065259896867e-06, "loss": 0.0825, "step": 1235 }, { "epoch": 0.5623293903548681, "grad_norm": 0.9620936539691768, "learning_rate": 4.845818267433086e-06, "loss": 0.089, "step": 1236 }, { "epoch": 0.5627843494085533, "grad_norm": 0.7942713424944172, "learning_rate": 4.845571083279629e-06, "loss": 0.0654, "step": 1237 }, { "epoch": 0.5632393084622384, "grad_norm": 0.5998278656859003, "learning_rate": 4.845323707456696e-06, "loss": 0.0649, "step": 1238 }, { "epoch": 0.5636942675159236, "grad_norm": 0.741973804021484, "learning_rate": 4.845076139984502e-06, "loss": 0.06, "step": 1239 }, { "epoch": 0.5641492265696088, "grad_norm": 0.9737908411420552, "learning_rate": 4.844828380883274e-06, "loss": 0.0788, "step": 1240 }, { "epoch": 0.564604185623294, "grad_norm": 0.6456783705803008, "learning_rate": 4.844580430173261e-06, "loss": 0.062, "step": 1241 }, { "epoch": 0.565059144676979, "grad_norm": 0.4512477885968687, "learning_rate": 4.8443322878747236e-06, "loss": 0.0338, "step": 1242 }, { "epoch": 0.5655141037306642, "grad_norm": 0.6789010313138605, "learning_rate": 4.844083954007938e-06, "loss": 0.0553, "step": 1243 }, { "epoch": 0.5659690627843494, "grad_norm": 1.0592017111841259, "learning_rate": 4.843835428593198e-06, "loss": 0.0964, "step": 1244 }, { "epoch": 0.5664240218380345, "grad_norm": 0.8615337952745731, "learning_rate": 4.84358671165081e-06, "loss": 0.0803, "step": 1245 }, { "epoch": 0.5668789808917197, "grad_norm": 0.9068648816735045, "learning_rate": 4.843337803201102e-06, "loss": 0.0957, "step": 1246 }, { "epoch": 0.5673339399454049, "grad_norm": 0.7801553916627879, "learning_rate": 4.8430887032644094e-06, "loss": 0.0707, "step": 1247 }, { "epoch": 0.5677888989990901, "grad_norm": 0.888048216687448, "learning_rate": 4.842839411861089e-06, "loss": 0.0713, "step": 1248 }, { "epoch": 0.5682438580527752, "grad_norm": 0.7108300238400989, "learning_rate": 4.842589929011513e-06, "loss": 0.0609, "step": 1249 }, { "epoch": 0.5686988171064604, "grad_norm": 0.8602271760854026, "learning_rate": 4.8423402547360665e-06, "loss": 0.071, "step": 1250 }, { "epoch": 0.5691537761601456, "grad_norm": 0.7368968504486557, "learning_rate": 4.842090389055153e-06, "loss": 0.0549, "step": 1251 }, { "epoch": 0.5696087352138307, "grad_norm": 0.6376797315072175, "learning_rate": 4.841840331989189e-06, "loss": 0.0536, "step": 1252 }, { "epoch": 0.5700636942675159, "grad_norm": 0.7105471536122931, "learning_rate": 4.841590083558608e-06, "loss": 0.0589, "step": 1253 }, { "epoch": 0.5705186533212011, "grad_norm": 0.7851951409187395, "learning_rate": 4.841339643783861e-06, "loss": 0.0667, "step": 1254 }, { "epoch": 0.5709736123748863, "grad_norm": 0.9143449154341239, "learning_rate": 4.841089012685412e-06, "loss": 0.094, "step": 1255 }, { "epoch": 0.5714285714285714, "grad_norm": 0.7279105322404732, "learning_rate": 4.840838190283741e-06, "loss": 0.0665, "step": 1256 }, { "epoch": 0.5718835304822566, "grad_norm": 0.5565255651391556, "learning_rate": 4.8405871765993435e-06, "loss": 0.0374, "step": 1257 }, { "epoch": 0.5723384895359418, "grad_norm": 0.7414165634317871, "learning_rate": 4.840335971652732e-06, "loss": 0.055, "step": 1258 }, { "epoch": 0.5727934485896269, "grad_norm": 0.7491117483341468, "learning_rate": 4.840084575464434e-06, "loss": 0.0663, "step": 1259 }, { "epoch": 0.5732484076433121, "grad_norm": 0.7016390473579003, "learning_rate": 4.839832988054992e-06, "loss": 0.0585, "step": 1260 }, { "epoch": 0.5737033666969973, "grad_norm": 0.6978517385357002, "learning_rate": 4.839581209444966e-06, "loss": 0.0515, "step": 1261 }, { "epoch": 0.5741583257506825, "grad_norm": 0.8617787500306493, "learning_rate": 4.839329239654927e-06, "loss": 0.0695, "step": 1262 }, { "epoch": 0.5746132848043676, "grad_norm": 0.8166990461173421, "learning_rate": 4.839077078705468e-06, "loss": 0.055, "step": 1263 }, { "epoch": 0.5750682438580528, "grad_norm": 0.9058417331374469, "learning_rate": 4.838824726617194e-06, "loss": 0.0821, "step": 1264 }, { "epoch": 0.575523202911738, "grad_norm": 0.8868783686078405, "learning_rate": 4.838572183410725e-06, "loss": 0.0708, "step": 1265 }, { "epoch": 0.5759781619654231, "grad_norm": 0.8499550028220518, "learning_rate": 4.838319449106697e-06, "loss": 0.071, "step": 1266 }, { "epoch": 0.5764331210191083, "grad_norm": 0.6310677239795418, "learning_rate": 4.838066523725764e-06, "loss": 0.0466, "step": 1267 }, { "epoch": 0.5768880800727935, "grad_norm": 0.7334881695978646, "learning_rate": 4.837813407288594e-06, "loss": 0.0672, "step": 1268 }, { "epoch": 0.5773430391264787, "grad_norm": 0.6599472606872935, "learning_rate": 4.837560099815869e-06, "loss": 0.0514, "step": 1269 }, { "epoch": 0.5777979981801638, "grad_norm": 0.7760258987880212, "learning_rate": 4.837306601328289e-06, "loss": 0.0684, "step": 1270 }, { "epoch": 0.578252957233849, "grad_norm": 0.6309403280935036, "learning_rate": 4.837052911846569e-06, "loss": 0.0626, "step": 1271 }, { "epoch": 0.5787079162875342, "grad_norm": 0.8629672961688011, "learning_rate": 4.836799031391439e-06, "loss": 0.0784, "step": 1272 }, { "epoch": 0.5791628753412192, "grad_norm": 0.3854359252943462, "learning_rate": 4.836544959983645e-06, "loss": 0.033, "step": 1273 }, { "epoch": 0.5796178343949044, "grad_norm": 1.0318224319043552, "learning_rate": 4.8362906976439485e-06, "loss": 0.0849, "step": 1274 }, { "epoch": 0.5800727934485896, "grad_norm": 0.9665553458146198, "learning_rate": 4.836036244393127e-06, "loss": 0.0958, "step": 1275 }, { "epoch": 0.5805277525022748, "grad_norm": 0.9363624449700683, "learning_rate": 4.835781600251973e-06, "loss": 0.0765, "step": 1276 }, { "epoch": 0.5809827115559599, "grad_norm": 0.6147554145907947, "learning_rate": 4.835526765241295e-06, "loss": 0.0488, "step": 1277 }, { "epoch": 0.5814376706096451, "grad_norm": 0.8334344326658653, "learning_rate": 4.835271739381917e-06, "loss": 0.0721, "step": 1278 }, { "epoch": 0.5818926296633303, "grad_norm": 0.49964314159638445, "learning_rate": 4.835016522694678e-06, "loss": 0.0493, "step": 1279 }, { "epoch": 0.5823475887170154, "grad_norm": 0.9514130488464217, "learning_rate": 4.834761115200434e-06, "loss": 0.1112, "step": 1280 }, { "epoch": 0.5828025477707006, "grad_norm": 0.7622176607459498, "learning_rate": 4.834505516920055e-06, "loss": 0.0773, "step": 1281 }, { "epoch": 0.5832575068243858, "grad_norm": 0.8319534302623502, "learning_rate": 4.834249727874428e-06, "loss": 0.0734, "step": 1282 }, { "epoch": 0.583712465878071, "grad_norm": 0.6580298023552714, "learning_rate": 4.833993748084455e-06, "loss": 0.0487, "step": 1283 }, { "epoch": 0.5841674249317561, "grad_norm": 0.5709038914352429, "learning_rate": 4.833737577571052e-06, "loss": 0.0437, "step": 1284 }, { "epoch": 0.5846223839854413, "grad_norm": 0.7738487313994183, "learning_rate": 4.833481216355153e-06, "loss": 0.0593, "step": 1285 }, { "epoch": 0.5850773430391265, "grad_norm": 1.3097720038286855, "learning_rate": 4.833224664457709e-06, "loss": 0.1053, "step": 1286 }, { "epoch": 0.5855323020928116, "grad_norm": 1.191215763782131, "learning_rate": 4.83296792189968e-06, "loss": 0.0791, "step": 1287 }, { "epoch": 0.5859872611464968, "grad_norm": 0.7550564620604591, "learning_rate": 4.83271098870205e-06, "loss": 0.0614, "step": 1288 }, { "epoch": 0.586442220200182, "grad_norm": 0.7855417872089538, "learning_rate": 4.832453864885811e-06, "loss": 0.0765, "step": 1289 }, { "epoch": 0.5868971792538672, "grad_norm": 0.7082768853538572, "learning_rate": 4.832196550471976e-06, "loss": 0.0584, "step": 1290 }, { "epoch": 0.5873521383075523, "grad_norm": 0.7586707116910946, "learning_rate": 4.831939045481571e-06, "loss": 0.0693, "step": 1291 }, { "epoch": 0.5878070973612375, "grad_norm": 0.6804347439528804, "learning_rate": 4.8316813499356375e-06, "loss": 0.0579, "step": 1292 }, { "epoch": 0.5882620564149227, "grad_norm": 0.6650179741094593, "learning_rate": 4.831423463855235e-06, "loss": 0.0473, "step": 1293 }, { "epoch": 0.5887170154686078, "grad_norm": 0.7381087388697778, "learning_rate": 4.8311653872614345e-06, "loss": 0.061, "step": 1294 }, { "epoch": 0.589171974522293, "grad_norm": 0.6163282527593773, "learning_rate": 4.830907120175327e-06, "loss": 0.0458, "step": 1295 }, { "epoch": 0.5896269335759782, "grad_norm": 1.355736098526988, "learning_rate": 4.830648662618015e-06, "loss": 0.1213, "step": 1296 }, { "epoch": 0.5900818926296634, "grad_norm": 0.49193334954985163, "learning_rate": 4.83039001461062e-06, "loss": 0.0379, "step": 1297 }, { "epoch": 0.5905368516833485, "grad_norm": 0.8251065772364982, "learning_rate": 4.830131176174276e-06, "loss": 0.0614, "step": 1298 }, { "epoch": 0.5909918107370337, "grad_norm": 0.9196499228007727, "learning_rate": 4.829872147330136e-06, "loss": 0.0747, "step": 1299 }, { "epoch": 0.5914467697907189, "grad_norm": 0.6635211914340154, "learning_rate": 4.829612928099366e-06, "loss": 0.0599, "step": 1300 }, { "epoch": 0.591901728844404, "grad_norm": 0.5214758111450317, "learning_rate": 4.829353518503147e-06, "loss": 0.0466, "step": 1301 }, { "epoch": 0.5923566878980892, "grad_norm": 0.610000532317552, "learning_rate": 4.829093918562678e-06, "loss": 0.048, "step": 1302 }, { "epoch": 0.5928116469517744, "grad_norm": 1.0296471042370532, "learning_rate": 4.828834128299173e-06, "loss": 0.0849, "step": 1303 }, { "epoch": 0.5932666060054596, "grad_norm": 1.0181697893257282, "learning_rate": 4.828574147733859e-06, "loss": 0.0917, "step": 1304 }, { "epoch": 0.5937215650591446, "grad_norm": 0.7456729558547099, "learning_rate": 4.828313976887982e-06, "loss": 0.0566, "step": 1305 }, { "epoch": 0.5941765241128298, "grad_norm": 1.2895345953766368, "learning_rate": 4.8280536157828e-06, "loss": 0.0768, "step": 1306 }, { "epoch": 0.594631483166515, "grad_norm": 0.7193657333914658, "learning_rate": 4.827793064439592e-06, "loss": 0.0649, "step": 1307 }, { "epoch": 0.5950864422202001, "grad_norm": 0.6369033412844897, "learning_rate": 4.8275323228796455e-06, "loss": 0.0485, "step": 1308 }, { "epoch": 0.5955414012738853, "grad_norm": 0.4260555886565184, "learning_rate": 4.8272713911242695e-06, "loss": 0.0233, "step": 1309 }, { "epoch": 0.5959963603275705, "grad_norm": 0.5487802818772052, "learning_rate": 4.827010269194785e-06, "loss": 0.0429, "step": 1310 }, { "epoch": 0.5964513193812557, "grad_norm": 0.9163026616839156, "learning_rate": 4.8267489571125295e-06, "loss": 0.0723, "step": 1311 }, { "epoch": 0.5969062784349408, "grad_norm": 1.2697687714050636, "learning_rate": 4.826487454898857e-06, "loss": 0.1022, "step": 1312 }, { "epoch": 0.597361237488626, "grad_norm": 0.6502381859861477, "learning_rate": 4.826225762575136e-06, "loss": 0.0566, "step": 1313 }, { "epoch": 0.5978161965423112, "grad_norm": 0.6784651371796548, "learning_rate": 4.825963880162752e-06, "loss": 0.0569, "step": 1314 }, { "epoch": 0.5982711555959963, "grad_norm": 0.9827857531768842, "learning_rate": 4.825701807683102e-06, "loss": 0.0709, "step": 1315 }, { "epoch": 0.5987261146496815, "grad_norm": 0.8148040815518555, "learning_rate": 4.825439545157603e-06, "loss": 0.0661, "step": 1316 }, { "epoch": 0.5991810737033667, "grad_norm": 0.7818624869971815, "learning_rate": 4.825177092607687e-06, "loss": 0.0756, "step": 1317 }, { "epoch": 0.5996360327570519, "grad_norm": 0.6526378977883536, "learning_rate": 4.8249144500547995e-06, "loss": 0.0549, "step": 1318 }, { "epoch": 0.600090991810737, "grad_norm": 0.5697295150295824, "learning_rate": 4.824651617520402e-06, "loss": 0.0393, "step": 1319 }, { "epoch": 0.6005459508644222, "grad_norm": 0.7421021671142831, "learning_rate": 4.824388595025972e-06, "loss": 0.0789, "step": 1320 }, { "epoch": 0.6010009099181074, "grad_norm": 0.7191904617460073, "learning_rate": 4.824125382593003e-06, "loss": 0.0532, "step": 1321 }, { "epoch": 0.6014558689717925, "grad_norm": 0.7309054499990442, "learning_rate": 4.823861980243003e-06, "loss": 0.0748, "step": 1322 }, { "epoch": 0.6019108280254777, "grad_norm": 0.8448893024828844, "learning_rate": 4.823598387997497e-06, "loss": 0.0667, "step": 1323 }, { "epoch": 0.6023657870791629, "grad_norm": 0.7601489588572167, "learning_rate": 4.823334605878024e-06, "loss": 0.0523, "step": 1324 }, { "epoch": 0.6028207461328481, "grad_norm": 0.7433654268959281, "learning_rate": 4.82307063390614e-06, "loss": 0.0553, "step": 1325 }, { "epoch": 0.6032757051865332, "grad_norm": 0.8187296751030086, "learning_rate": 4.822806472103413e-06, "loss": 0.0676, "step": 1326 }, { "epoch": 0.6037306642402184, "grad_norm": 0.6394929202903299, "learning_rate": 4.822542120491431e-06, "loss": 0.0577, "step": 1327 }, { "epoch": 0.6041856232939036, "grad_norm": 0.4810746844764873, "learning_rate": 4.822277579091796e-06, "loss": 0.0548, "step": 1328 }, { "epoch": 0.6046405823475887, "grad_norm": 0.6400955174315186, "learning_rate": 4.822012847926125e-06, "loss": 0.0527, "step": 1329 }, { "epoch": 0.6050955414012739, "grad_norm": 0.6867529527775732, "learning_rate": 4.821747927016049e-06, "loss": 0.0434, "step": 1330 }, { "epoch": 0.6055505004549591, "grad_norm": 0.8517417056997812, "learning_rate": 4.821482816383219e-06, "loss": 0.0785, "step": 1331 }, { "epoch": 0.6060054595086443, "grad_norm": 0.5351287181203948, "learning_rate": 4.821217516049296e-06, "loss": 0.0451, "step": 1332 }, { "epoch": 0.6064604185623294, "grad_norm": 0.7138436850600612, "learning_rate": 4.82095202603596e-06, "loss": 0.0636, "step": 1333 }, { "epoch": 0.6069153776160146, "grad_norm": 0.7109233850284291, "learning_rate": 4.820686346364906e-06, "loss": 0.0563, "step": 1334 }, { "epoch": 0.6073703366696998, "grad_norm": 0.9928633837693652, "learning_rate": 4.820420477057843e-06, "loss": 0.073, "step": 1335 }, { "epoch": 0.607825295723385, "grad_norm": 0.8108842754609783, "learning_rate": 4.820154418136498e-06, "loss": 0.0732, "step": 1336 }, { "epoch": 0.60828025477707, "grad_norm": 0.9409136888664106, "learning_rate": 4.819888169622612e-06, "loss": 0.0746, "step": 1337 }, { "epoch": 0.6087352138307552, "grad_norm": 0.8704561721777555, "learning_rate": 4.819621731537942e-06, "loss": 0.0863, "step": 1338 }, { "epoch": 0.6091901728844404, "grad_norm": 0.679765432028962, "learning_rate": 4.819355103904259e-06, "loss": 0.0522, "step": 1339 }, { "epoch": 0.6096451319381255, "grad_norm": 0.7506288496766044, "learning_rate": 4.81908828674335e-06, "loss": 0.0581, "step": 1340 }, { "epoch": 0.6101000909918107, "grad_norm": 0.7533146973996597, "learning_rate": 4.81882128007702e-06, "loss": 0.0508, "step": 1341 }, { "epoch": 0.6105550500454959, "grad_norm": 0.7623699590970283, "learning_rate": 4.818554083927086e-06, "loss": 0.0602, "step": 1342 }, { "epoch": 0.6110100090991811, "grad_norm": 0.8511179695780368, "learning_rate": 4.818286698315383e-06, "loss": 0.0577, "step": 1343 }, { "epoch": 0.6114649681528662, "grad_norm": 0.7862894129476269, "learning_rate": 4.818019123263761e-06, "loss": 0.0817, "step": 1344 }, { "epoch": 0.6119199272065514, "grad_norm": 0.6793771839239909, "learning_rate": 4.817751358794084e-06, "loss": 0.0516, "step": 1345 }, { "epoch": 0.6123748862602366, "grad_norm": 0.6744967187177401, "learning_rate": 4.8174834049282325e-06, "loss": 0.06, "step": 1346 }, { "epoch": 0.6128298453139217, "grad_norm": 1.0595508979402892, "learning_rate": 4.817215261688104e-06, "loss": 0.0928, "step": 1347 }, { "epoch": 0.6132848043676069, "grad_norm": 0.7276826984658654, "learning_rate": 4.816946929095607e-06, "loss": 0.0502, "step": 1348 }, { "epoch": 0.6137397634212921, "grad_norm": 0.741447673760934, "learning_rate": 4.816678407172671e-06, "loss": 0.0741, "step": 1349 }, { "epoch": 0.6141947224749773, "grad_norm": 0.7665525289277765, "learning_rate": 4.816409695941238e-06, "loss": 0.0586, "step": 1350 }, { "epoch": 0.6146496815286624, "grad_norm": 0.7571976993587441, "learning_rate": 4.816140795423265e-06, "loss": 0.0646, "step": 1351 }, { "epoch": 0.6151046405823476, "grad_norm": 1.0671078250910566, "learning_rate": 4.8158717056407255e-06, "loss": 0.0906, "step": 1352 }, { "epoch": 0.6155595996360328, "grad_norm": 1.068257180900936, "learning_rate": 4.815602426615609e-06, "loss": 0.0814, "step": 1353 }, { "epoch": 0.6160145586897179, "grad_norm": 0.7704299563830304, "learning_rate": 4.815332958369919e-06, "loss": 0.0628, "step": 1354 }, { "epoch": 0.6164695177434031, "grad_norm": 0.7309675809198951, "learning_rate": 4.815063300925677e-06, "loss": 0.0534, "step": 1355 }, { "epoch": 0.6169244767970883, "grad_norm": 0.6905459067357435, "learning_rate": 4.814793454304915e-06, "loss": 0.0664, "step": 1356 }, { "epoch": 0.6173794358507735, "grad_norm": 0.7612784977520042, "learning_rate": 4.814523418529686e-06, "loss": 0.071, "step": 1357 }, { "epoch": 0.6178343949044586, "grad_norm": 0.7397558513678282, "learning_rate": 4.814253193622056e-06, "loss": 0.0658, "step": 1358 }, { "epoch": 0.6182893539581438, "grad_norm": 0.8273217031416162, "learning_rate": 4.813982779604106e-06, "loss": 0.0542, "step": 1359 }, { "epoch": 0.618744313011829, "grad_norm": 0.7097594863375644, "learning_rate": 4.813712176497933e-06, "loss": 0.0695, "step": 1360 }, { "epoch": 0.6191992720655141, "grad_norm": 0.9081905345796648, "learning_rate": 4.813441384325649e-06, "loss": 0.0742, "step": 1361 }, { "epoch": 0.6196542311191993, "grad_norm": 0.6161179936509155, "learning_rate": 4.813170403109383e-06, "loss": 0.0435, "step": 1362 }, { "epoch": 0.6201091901728845, "grad_norm": 0.6587599265658766, "learning_rate": 4.8128992328712774e-06, "loss": 0.0511, "step": 1363 }, { "epoch": 0.6205641492265697, "grad_norm": 0.6246519005543884, "learning_rate": 4.812627873633492e-06, "loss": 0.0547, "step": 1364 }, { "epoch": 0.6210191082802548, "grad_norm": 0.9162916767800175, "learning_rate": 4.8123563254182e-06, "loss": 0.0909, "step": 1365 }, { "epoch": 0.62147406733394, "grad_norm": 0.9475342021978096, "learning_rate": 4.8120845882475924e-06, "loss": 0.0834, "step": 1366 }, { "epoch": 0.6219290263876252, "grad_norm": 0.9962945342835489, "learning_rate": 4.8118126621438734e-06, "loss": 0.082, "step": 1367 }, { "epoch": 0.6223839854413102, "grad_norm": 0.8129731557585484, "learning_rate": 4.811540547129263e-06, "loss": 0.102, "step": 1368 }, { "epoch": 0.6228389444949954, "grad_norm": 1.0476685985771956, "learning_rate": 4.811268243225999e-06, "loss": 0.0863, "step": 1369 }, { "epoch": 0.6232939035486806, "grad_norm": 0.6364270484543224, "learning_rate": 4.810995750456331e-06, "loss": 0.049, "step": 1370 }, { "epoch": 0.6237488626023658, "grad_norm": 0.9605165651320201, "learning_rate": 4.810723068842526e-06, "loss": 0.0907, "step": 1371 }, { "epoch": 0.6242038216560509, "grad_norm": 0.907972626235469, "learning_rate": 4.810450198406867e-06, "loss": 0.089, "step": 1372 }, { "epoch": 0.6246587807097361, "grad_norm": 1.2105959950909937, "learning_rate": 4.810177139171653e-06, "loss": 0.0997, "step": 1373 }, { "epoch": 0.6251137397634213, "grad_norm": 0.5261266936372415, "learning_rate": 4.809903891159195e-06, "loss": 0.0369, "step": 1374 }, { "epoch": 0.6255686988171064, "grad_norm": 1.0914928147545504, "learning_rate": 4.809630454391822e-06, "loss": 0.0763, "step": 1375 }, { "epoch": 0.6260236578707916, "grad_norm": 0.9315193606392632, "learning_rate": 4.80935682889188e-06, "loss": 0.0994, "step": 1376 }, { "epoch": 0.6264786169244768, "grad_norm": 0.8071975479211501, "learning_rate": 4.809083014681726e-06, "loss": 0.0754, "step": 1377 }, { "epoch": 0.626933575978162, "grad_norm": 0.8407873246258533, "learning_rate": 4.808809011783735e-06, "loss": 0.0862, "step": 1378 }, { "epoch": 0.6273885350318471, "grad_norm": 0.7028834708201565, "learning_rate": 4.808534820220299e-06, "loss": 0.0557, "step": 1379 }, { "epoch": 0.6278434940855323, "grad_norm": 0.9130142462887187, "learning_rate": 4.8082604400138226e-06, "loss": 0.0907, "step": 1380 }, { "epoch": 0.6282984531392175, "grad_norm": 0.5572902974057224, "learning_rate": 4.807985871186726e-06, "loss": 0.0538, "step": 1381 }, { "epoch": 0.6287534121929026, "grad_norm": 0.9359956622314829, "learning_rate": 4.8077111137614484e-06, "loss": 0.0761, "step": 1382 }, { "epoch": 0.6292083712465878, "grad_norm": 0.9259969123573535, "learning_rate": 4.8074361677604394e-06, "loss": 0.08, "step": 1383 }, { "epoch": 0.629663330300273, "grad_norm": 0.7515102950917599, "learning_rate": 4.807161033206168e-06, "loss": 0.068, "step": 1384 }, { "epoch": 0.6301182893539582, "grad_norm": 0.8350119106641899, "learning_rate": 4.806885710121114e-06, "loss": 0.0717, "step": 1385 }, { "epoch": 0.6305732484076433, "grad_norm": 0.7425131820751144, "learning_rate": 4.806610198527779e-06, "loss": 0.059, "step": 1386 }, { "epoch": 0.6310282074613285, "grad_norm": 0.8471900633166635, "learning_rate": 4.8063344984486755e-06, "loss": 0.0624, "step": 1387 }, { "epoch": 0.6314831665150137, "grad_norm": 0.8231045305624575, "learning_rate": 4.806058609906331e-06, "loss": 0.0708, "step": 1388 }, { "epoch": 0.6319381255686988, "grad_norm": 1.0923257733711043, "learning_rate": 4.805782532923292e-06, "loss": 0.088, "step": 1389 }, { "epoch": 0.632393084622384, "grad_norm": 0.8065424294249984, "learning_rate": 4.805506267522116e-06, "loss": 0.0817, "step": 1390 }, { "epoch": 0.6328480436760692, "grad_norm": 1.1107266551952906, "learning_rate": 4.80522981372538e-06, "loss": 0.0917, "step": 1391 }, { "epoch": 0.6333030027297544, "grad_norm": 0.5047747692042878, "learning_rate": 4.804953171555674e-06, "loss": 0.046, "step": 1392 }, { "epoch": 0.6337579617834395, "grad_norm": 1.1009908125405006, "learning_rate": 4.8046763410356046e-06, "loss": 0.0721, "step": 1393 }, { "epoch": 0.6342129208371247, "grad_norm": 1.1234718918773754, "learning_rate": 4.804399322187791e-06, "loss": 0.1011, "step": 1394 }, { "epoch": 0.6346678798908099, "grad_norm": 1.083495863144811, "learning_rate": 4.8041221150348725e-06, "loss": 0.0993, "step": 1395 }, { "epoch": 0.635122838944495, "grad_norm": 0.824505933705283, "learning_rate": 4.8038447195995e-06, "loss": 0.0714, "step": 1396 }, { "epoch": 0.6355777979981801, "grad_norm": 0.8879521653149162, "learning_rate": 4.80356713590434e-06, "loss": 0.0709, "step": 1397 }, { "epoch": 0.6360327570518653, "grad_norm": 1.0230090809736052, "learning_rate": 4.803289363972078e-06, "loss": 0.0902, "step": 1398 }, { "epoch": 0.6364877161055505, "grad_norm": 0.6519234189862375, "learning_rate": 4.8030114038254094e-06, "loss": 0.0522, "step": 1399 }, { "epoch": 0.6369426751592356, "grad_norm": 0.741318365992446, "learning_rate": 4.80273325548705e-06, "loss": 0.0611, "step": 1400 }, { "epoch": 0.6373976342129208, "grad_norm": 0.7527321897876023, "learning_rate": 4.802454918979728e-06, "loss": 0.0606, "step": 1401 }, { "epoch": 0.637852593266606, "grad_norm": 1.1417182907652552, "learning_rate": 4.802176394326187e-06, "loss": 0.1069, "step": 1402 }, { "epoch": 0.6383075523202911, "grad_norm": 0.8131687655992657, "learning_rate": 4.801897681549188e-06, "loss": 0.0464, "step": 1403 }, { "epoch": 0.6387625113739763, "grad_norm": 0.8327902884834529, "learning_rate": 4.801618780671506e-06, "loss": 0.0747, "step": 1404 }, { "epoch": 0.6392174704276615, "grad_norm": 0.692422710517289, "learning_rate": 4.801339691715932e-06, "loss": 0.0699, "step": 1405 }, { "epoch": 0.6396724294813467, "grad_norm": 0.6800001240174697, "learning_rate": 4.8010604147052695e-06, "loss": 0.0503, "step": 1406 }, { "epoch": 0.6401273885350318, "grad_norm": 0.8019856852977274, "learning_rate": 4.800780949662343e-06, "loss": 0.0698, "step": 1407 }, { "epoch": 0.640582347588717, "grad_norm": 0.6564335023756012, "learning_rate": 4.800501296609986e-06, "loss": 0.0501, "step": 1408 }, { "epoch": 0.6410373066424022, "grad_norm": 0.855136459668507, "learning_rate": 4.800221455571053e-06, "loss": 0.0777, "step": 1409 }, { "epoch": 0.6414922656960873, "grad_norm": 0.6154657454528484, "learning_rate": 4.7999414265684105e-06, "loss": 0.0527, "step": 1410 }, { "epoch": 0.6419472247497725, "grad_norm": 0.7989007320367253, "learning_rate": 4.79966120962494e-06, "loss": 0.0734, "step": 1411 }, { "epoch": 0.6424021838034577, "grad_norm": 0.7788506951611326, "learning_rate": 4.799380804763542e-06, "loss": 0.0634, "step": 1412 }, { "epoch": 0.6428571428571429, "grad_norm": 0.8023595788447846, "learning_rate": 4.799100212007128e-06, "loss": 0.0635, "step": 1413 }, { "epoch": 0.643312101910828, "grad_norm": 0.6671118062408689, "learning_rate": 4.7988194313786275e-06, "loss": 0.0502, "step": 1414 }, { "epoch": 0.6437670609645132, "grad_norm": 0.4852500706051457, "learning_rate": 4.798538462900984e-06, "loss": 0.0439, "step": 1415 }, { "epoch": 0.6442220200181984, "grad_norm": 0.6172531415308445, "learning_rate": 4.798257306597157e-06, "loss": 0.0512, "step": 1416 }, { "epoch": 0.6446769790718835, "grad_norm": 0.7721959959019802, "learning_rate": 4.797975962490122e-06, "loss": 0.071, "step": 1417 }, { "epoch": 0.6451319381255687, "grad_norm": 0.5314044706595764, "learning_rate": 4.797694430602869e-06, "loss": 0.0348, "step": 1418 }, { "epoch": 0.6455868971792539, "grad_norm": 0.9359827868935178, "learning_rate": 4.797412710958405e-06, "loss": 0.0813, "step": 1419 }, { "epoch": 0.6460418562329391, "grad_norm": 0.895180226763773, "learning_rate": 4.797130803579747e-06, "loss": 0.0725, "step": 1420 }, { "epoch": 0.6464968152866242, "grad_norm": 0.9382178957271444, "learning_rate": 4.796848708489935e-06, "loss": 0.0876, "step": 1421 }, { "epoch": 0.6469517743403094, "grad_norm": 0.8047561179790783, "learning_rate": 4.796566425712018e-06, "loss": 0.0791, "step": 1422 }, { "epoch": 0.6474067333939946, "grad_norm": 0.7813970242588332, "learning_rate": 4.796283955269065e-06, "loss": 0.0868, "step": 1423 }, { "epoch": 0.6478616924476797, "grad_norm": 0.9241460431035805, "learning_rate": 4.796001297184156e-06, "loss": 0.0905, "step": 1424 }, { "epoch": 0.6483166515013649, "grad_norm": 0.9826565480521312, "learning_rate": 4.79571845148039e-06, "loss": 0.0941, "step": 1425 }, { "epoch": 0.6487716105550501, "grad_norm": 0.5534927969311005, "learning_rate": 4.795435418180879e-06, "loss": 0.0579, "step": 1426 }, { "epoch": 0.6492265696087353, "grad_norm": 0.8672553303494054, "learning_rate": 4.795152197308753e-06, "loss": 0.0712, "step": 1427 }, { "epoch": 0.6496815286624203, "grad_norm": 0.49524738598652407, "learning_rate": 4.794868788887154e-06, "loss": 0.0379, "step": 1428 }, { "epoch": 0.6501364877161055, "grad_norm": 0.827670164958526, "learning_rate": 4.79458519293924e-06, "loss": 0.0882, "step": 1429 }, { "epoch": 0.6505914467697907, "grad_norm": 0.5992195124496454, "learning_rate": 4.794301409488187e-06, "loss": 0.0483, "step": 1430 }, { "epoch": 0.6510464058234758, "grad_norm": 0.7192899332508552, "learning_rate": 4.7940174385571835e-06, "loss": 0.0595, "step": 1431 }, { "epoch": 0.651501364877161, "grad_norm": 0.6956421052259842, "learning_rate": 4.793733280169435e-06, "loss": 0.0706, "step": 1432 }, { "epoch": 0.6519563239308462, "grad_norm": 1.165394616398272, "learning_rate": 4.7934489343481614e-06, "loss": 0.0993, "step": 1433 }, { "epoch": 0.6524112829845314, "grad_norm": 0.9487576511550925, "learning_rate": 4.7931644011165975e-06, "loss": 0.0668, "step": 1434 }, { "epoch": 0.6528662420382165, "grad_norm": 0.6703994528089227, "learning_rate": 4.792879680497995e-06, "loss": 0.0579, "step": 1435 }, { "epoch": 0.6533212010919017, "grad_norm": 0.8121521545363791, "learning_rate": 4.79259477251562e-06, "loss": 0.071, "step": 1436 }, { "epoch": 0.6537761601455869, "grad_norm": 1.0536792067264262, "learning_rate": 4.792309677192753e-06, "loss": 0.0987, "step": 1437 }, { "epoch": 0.654231119199272, "grad_norm": 0.7922763227676367, "learning_rate": 4.79202439455269e-06, "loss": 0.0608, "step": 1438 }, { "epoch": 0.6546860782529572, "grad_norm": 0.6328855607330163, "learning_rate": 4.791738924618745e-06, "loss": 0.0576, "step": 1439 }, { "epoch": 0.6551410373066424, "grad_norm": 0.7130766129638374, "learning_rate": 4.791453267414245e-06, "loss": 0.0474, "step": 1440 }, { "epoch": 0.6555959963603276, "grad_norm": 0.7668931671367808, "learning_rate": 4.7911674229625316e-06, "loss": 0.0608, "step": 1441 }, { "epoch": 0.6560509554140127, "grad_norm": 1.026785066290622, "learning_rate": 4.790881391286963e-06, "loss": 0.0784, "step": 1442 }, { "epoch": 0.6565059144676979, "grad_norm": 0.8437938287309505, "learning_rate": 4.790595172410914e-06, "loss": 0.0688, "step": 1443 }, { "epoch": 0.6569608735213831, "grad_norm": 0.95620015056413, "learning_rate": 4.79030876635777e-06, "loss": 0.0866, "step": 1444 }, { "epoch": 0.6574158325750682, "grad_norm": 0.7126304636743447, "learning_rate": 4.790022173150938e-06, "loss": 0.0633, "step": 1445 }, { "epoch": 0.6578707916287534, "grad_norm": 0.5125812984853052, "learning_rate": 4.789735392813835e-06, "loss": 0.0423, "step": 1446 }, { "epoch": 0.6583257506824386, "grad_norm": 0.7255097967015932, "learning_rate": 4.789448425369896e-06, "loss": 0.0591, "step": 1447 }, { "epoch": 0.6587807097361238, "grad_norm": 0.7245916935945349, "learning_rate": 4.789161270842571e-06, "loss": 0.0617, "step": 1448 }, { "epoch": 0.6592356687898089, "grad_norm": 0.8534895421081942, "learning_rate": 4.7888739292553235e-06, "loss": 0.0824, "step": 1449 }, { "epoch": 0.6596906278434941, "grad_norm": 0.9129297819396048, "learning_rate": 4.788586400631636e-06, "loss": 0.0864, "step": 1450 }, { "epoch": 0.6601455868971793, "grad_norm": 0.6321747250216057, "learning_rate": 4.788298684995003e-06, "loss": 0.0606, "step": 1451 }, { "epoch": 0.6606005459508644, "grad_norm": 0.6569500610517134, "learning_rate": 4.7880107823689355e-06, "loss": 0.0471, "step": 1452 }, { "epoch": 0.6610555050045496, "grad_norm": 0.836923575196295, "learning_rate": 4.787722692776958e-06, "loss": 0.0806, "step": 1453 }, { "epoch": 0.6615104640582348, "grad_norm": 0.8618715247200026, "learning_rate": 4.787434416242615e-06, "loss": 0.0796, "step": 1454 }, { "epoch": 0.66196542311192, "grad_norm": 1.2913844032281525, "learning_rate": 4.787145952789461e-06, "loss": 0.1144, "step": 1455 }, { "epoch": 0.6624203821656051, "grad_norm": 0.7740924771134702, "learning_rate": 4.786857302441069e-06, "loss": 0.0501, "step": 1456 }, { "epoch": 0.6628753412192903, "grad_norm": 0.8850884382043015, "learning_rate": 4.786568465221025e-06, "loss": 0.0776, "step": 1457 }, { "epoch": 0.6633303002729755, "grad_norm": 0.8065764669411247, "learning_rate": 4.7862794411529315e-06, "loss": 0.0714, "step": 1458 }, { "epoch": 0.6637852593266605, "grad_norm": 1.0967948038936701, "learning_rate": 4.7859902302604075e-06, "loss": 0.0996, "step": 1459 }, { "epoch": 0.6642402183803457, "grad_norm": 0.9974285175173262, "learning_rate": 4.785700832567085e-06, "loss": 0.0776, "step": 1460 }, { "epoch": 0.664695177434031, "grad_norm": 0.6236115703442758, "learning_rate": 4.785411248096613e-06, "loss": 0.0476, "step": 1461 }, { "epoch": 0.6651501364877161, "grad_norm": 0.5778143724368887, "learning_rate": 4.785121476872654e-06, "loss": 0.0623, "step": 1462 }, { "epoch": 0.6656050955414012, "grad_norm": 0.8331029721226916, "learning_rate": 4.784831518918888e-06, "loss": 0.086, "step": 1463 }, { "epoch": 0.6660600545950864, "grad_norm": 0.6555727601304058, "learning_rate": 4.784541374259008e-06, "loss": 0.0604, "step": 1464 }, { "epoch": 0.6665150136487716, "grad_norm": 0.840422831333416, "learning_rate": 4.7842510429167244e-06, "loss": 0.0705, "step": 1465 }, { "epoch": 0.6669699727024567, "grad_norm": 0.8495238637146105, "learning_rate": 4.783960524915761e-06, "loss": 0.0795, "step": 1466 }, { "epoch": 0.6674249317561419, "grad_norm": 0.7305356040505075, "learning_rate": 4.783669820279858e-06, "loss": 0.0632, "step": 1467 }, { "epoch": 0.6678798908098271, "grad_norm": 0.6350015765617174, "learning_rate": 4.783378929032769e-06, "loss": 0.0537, "step": 1468 }, { "epoch": 0.6683348498635123, "grad_norm": 0.7555553607544558, "learning_rate": 4.783087851198267e-06, "loss": 0.0664, "step": 1469 }, { "epoch": 0.6687898089171974, "grad_norm": 0.9513776608510918, "learning_rate": 4.7827965868001356e-06, "loss": 0.0797, "step": 1470 }, { "epoch": 0.6692447679708826, "grad_norm": 0.7540264419548088, "learning_rate": 4.782505135862176e-06, "loss": 0.0645, "step": 1471 }, { "epoch": 0.6696997270245678, "grad_norm": 0.8806981016853459, "learning_rate": 4.782213498408205e-06, "loss": 0.0792, "step": 1472 }, { "epoch": 0.6701546860782529, "grad_norm": 0.816738183429135, "learning_rate": 4.781921674462053e-06, "loss": 0.0567, "step": 1473 }, { "epoch": 0.6706096451319381, "grad_norm": 0.9563479172789231, "learning_rate": 4.781629664047566e-06, "loss": 0.0726, "step": 1474 }, { "epoch": 0.6710646041856233, "grad_norm": 0.6676816189321566, "learning_rate": 4.781337467188607e-06, "loss": 0.0674, "step": 1475 }, { "epoch": 0.6715195632393085, "grad_norm": 0.8153655901587608, "learning_rate": 4.781045083909053e-06, "loss": 0.0708, "step": 1476 }, { "epoch": 0.6719745222929936, "grad_norm": 0.9702425819685979, "learning_rate": 4.780752514232796e-06, "loss": 0.066, "step": 1477 }, { "epoch": 0.6724294813466788, "grad_norm": 0.6736063833667711, "learning_rate": 4.780459758183743e-06, "loss": 0.0594, "step": 1478 }, { "epoch": 0.672884440400364, "grad_norm": 0.6356551069651334, "learning_rate": 4.780166815785817e-06, "loss": 0.0572, "step": 1479 }, { "epoch": 0.6733393994540491, "grad_norm": 0.7785893078479745, "learning_rate": 4.7798736870629554e-06, "loss": 0.0826, "step": 1480 }, { "epoch": 0.6737943585077343, "grad_norm": 0.7319247864552544, "learning_rate": 4.779580372039113e-06, "loss": 0.0536, "step": 1481 }, { "epoch": 0.6742493175614195, "grad_norm": 0.9576832944027126, "learning_rate": 4.779286870738256e-06, "loss": 0.0689, "step": 1482 }, { "epoch": 0.6747042766151047, "grad_norm": 0.8166538023451795, "learning_rate": 4.778993183184371e-06, "loss": 0.0556, "step": 1483 }, { "epoch": 0.6751592356687898, "grad_norm": 1.1745759297821086, "learning_rate": 4.778699309401453e-06, "loss": 0.0897, "step": 1484 }, { "epoch": 0.675614194722475, "grad_norm": 0.7555435279337044, "learning_rate": 4.7784052494135195e-06, "loss": 0.0653, "step": 1485 }, { "epoch": 0.6760691537761602, "grad_norm": 0.664770902902504, "learning_rate": 4.778111003244596e-06, "loss": 0.0683, "step": 1486 }, { "epoch": 0.6765241128298453, "grad_norm": 0.5809114716709589, "learning_rate": 4.777816570918731e-06, "loss": 0.05, "step": 1487 }, { "epoch": 0.6769790718835305, "grad_norm": 0.6928304723812573, "learning_rate": 4.777521952459982e-06, "loss": 0.064, "step": 1488 }, { "epoch": 0.6774340309372157, "grad_norm": 0.8604944031525139, "learning_rate": 4.777227147892424e-06, "loss": 0.067, "step": 1489 }, { "epoch": 0.6778889899909009, "grad_norm": 0.752451889272893, "learning_rate": 4.776932157240147e-06, "loss": 0.065, "step": 1490 }, { "epoch": 0.678343949044586, "grad_norm": 0.8473681581598411, "learning_rate": 4.776636980527257e-06, "loss": 0.0623, "step": 1491 }, { "epoch": 0.6787989080982711, "grad_norm": 0.9703686393076305, "learning_rate": 4.776341617777874e-06, "loss": 0.0686, "step": 1492 }, { "epoch": 0.6792538671519563, "grad_norm": 0.811693083005839, "learning_rate": 4.776046069016135e-06, "loss": 0.0672, "step": 1493 }, { "epoch": 0.6797088262056415, "grad_norm": 1.0089795417423277, "learning_rate": 4.775750334266188e-06, "loss": 0.0867, "step": 1494 }, { "epoch": 0.6801637852593266, "grad_norm": 0.6348129639773868, "learning_rate": 4.775454413552202e-06, "loss": 0.0478, "step": 1495 }, { "epoch": 0.6806187443130118, "grad_norm": 1.8046001058548395, "learning_rate": 4.775158306898358e-06, "loss": 0.0856, "step": 1496 }, { "epoch": 0.681073703366697, "grad_norm": 0.804030519135084, "learning_rate": 4.774862014328849e-06, "loss": 0.0682, "step": 1497 }, { "epoch": 0.6815286624203821, "grad_norm": 0.7475852207063984, "learning_rate": 4.774565535867892e-06, "loss": 0.0621, "step": 1498 }, { "epoch": 0.6819836214740673, "grad_norm": 1.036971872978779, "learning_rate": 4.77426887153971e-06, "loss": 0.0772, "step": 1499 }, { "epoch": 0.6824385805277525, "grad_norm": 0.8877829921629609, "learning_rate": 4.773972021368546e-06, "loss": 0.0792, "step": 1500 }, { "epoch": 0.6828935395814377, "grad_norm": 0.9837757799688718, "learning_rate": 4.773674985378658e-06, "loss": 0.1229, "step": 1501 }, { "epoch": 0.6833484986351228, "grad_norm": 0.7772757895891362, "learning_rate": 4.773377763594319e-06, "loss": 0.0472, "step": 1502 }, { "epoch": 0.683803457688808, "grad_norm": 0.8631194323034224, "learning_rate": 4.773080356039814e-06, "loss": 0.0645, "step": 1503 }, { "epoch": 0.6842584167424932, "grad_norm": 0.6366545290180244, "learning_rate": 4.772782762739448e-06, "loss": 0.0595, "step": 1504 }, { "epoch": 0.6847133757961783, "grad_norm": 0.7166260637548661, "learning_rate": 4.772484983717539e-06, "loss": 0.0623, "step": 1505 }, { "epoch": 0.6851683348498635, "grad_norm": 0.8757638645378785, "learning_rate": 4.77218701899842e-06, "loss": 0.0655, "step": 1506 }, { "epoch": 0.6856232939035487, "grad_norm": 1.0305728337056401, "learning_rate": 4.771888868606438e-06, "loss": 0.098, "step": 1507 }, { "epoch": 0.6860782529572339, "grad_norm": 0.5264608517167783, "learning_rate": 4.771590532565957e-06, "loss": 0.0384, "step": 1508 }, { "epoch": 0.686533212010919, "grad_norm": 0.7225210635255812, "learning_rate": 4.771292010901357e-06, "loss": 0.0529, "step": 1509 }, { "epoch": 0.6869881710646042, "grad_norm": 0.9811455221159325, "learning_rate": 4.77099330363703e-06, "loss": 0.0734, "step": 1510 }, { "epoch": 0.6874431301182894, "grad_norm": 0.590972914047016, "learning_rate": 4.770694410797387e-06, "loss": 0.0552, "step": 1511 }, { "epoch": 0.6878980891719745, "grad_norm": 0.8133978032493828, "learning_rate": 4.770395332406851e-06, "loss": 0.061, "step": 1512 }, { "epoch": 0.6883530482256597, "grad_norm": 0.8194249817407185, "learning_rate": 4.770096068489861e-06, "loss": 0.0741, "step": 1513 }, { "epoch": 0.6888080072793449, "grad_norm": 0.6084096486485657, "learning_rate": 4.769796619070872e-06, "loss": 0.0602, "step": 1514 }, { "epoch": 0.6892629663330301, "grad_norm": 0.5651191066612926, "learning_rate": 4.769496984174353e-06, "loss": 0.0486, "step": 1515 }, { "epoch": 0.6897179253867152, "grad_norm": 0.9194603499902049, "learning_rate": 4.769197163824791e-06, "loss": 0.0656, "step": 1516 }, { "epoch": 0.6901728844404004, "grad_norm": 0.8500604771260194, "learning_rate": 4.768897158046683e-06, "loss": 0.0653, "step": 1517 }, { "epoch": 0.6906278434940856, "grad_norm": 0.6862315615479446, "learning_rate": 4.768596966864546e-06, "loss": 0.0524, "step": 1518 }, { "epoch": 0.6910828025477707, "grad_norm": 1.2304461047991757, "learning_rate": 4.76829659030291e-06, "loss": 0.1137, "step": 1519 }, { "epoch": 0.6915377616014559, "grad_norm": 0.876128817036191, "learning_rate": 4.767996028386319e-06, "loss": 0.0757, "step": 1520 }, { "epoch": 0.6919927206551411, "grad_norm": 1.07669637523419, "learning_rate": 4.767695281139336e-06, "loss": 0.0679, "step": 1521 }, { "epoch": 0.6924476797088263, "grad_norm": 0.8211862034870426, "learning_rate": 4.767394348586535e-06, "loss": 0.0619, "step": 1522 }, { "epoch": 0.6929026387625113, "grad_norm": 0.9099029471086892, "learning_rate": 4.767093230752507e-06, "loss": 0.0903, "step": 1523 }, { "epoch": 0.6933575978161965, "grad_norm": 0.7799051318100109, "learning_rate": 4.766791927661859e-06, "loss": 0.0766, "step": 1524 }, { "epoch": 0.6938125568698817, "grad_norm": 1.0883988105102491, "learning_rate": 4.766490439339211e-06, "loss": 0.0661, "step": 1525 }, { "epoch": 0.6942675159235668, "grad_norm": 0.8400663441468114, "learning_rate": 4.7661887658092e-06, "loss": 0.0653, "step": 1526 }, { "epoch": 0.694722474977252, "grad_norm": 0.8744033325354778, "learning_rate": 4.765886907096477e-06, "loss": 0.0712, "step": 1527 }, { "epoch": 0.6951774340309372, "grad_norm": 1.0117866298788205, "learning_rate": 4.7655848632257084e-06, "loss": 0.0961, "step": 1528 }, { "epoch": 0.6956323930846224, "grad_norm": 0.8451483006341224, "learning_rate": 4.7652826342215764e-06, "loss": 0.0805, "step": 1529 }, { "epoch": 0.6960873521383075, "grad_norm": 0.9721200931533607, "learning_rate": 4.764980220108777e-06, "loss": 0.0868, "step": 1530 }, { "epoch": 0.6965423111919927, "grad_norm": 1.139955334493506, "learning_rate": 4.764677620912022e-06, "loss": 0.0922, "step": 1531 }, { "epoch": 0.6969972702456779, "grad_norm": 0.6557667551942458, "learning_rate": 4.764374836656041e-06, "loss": 0.061, "step": 1532 }, { "epoch": 0.697452229299363, "grad_norm": 0.6235355321610729, "learning_rate": 4.764071867365571e-06, "loss": 0.0717, "step": 1533 }, { "epoch": 0.6979071883530482, "grad_norm": 0.8241396053903132, "learning_rate": 4.763768713065375e-06, "loss": 0.0635, "step": 1534 }, { "epoch": 0.6983621474067334, "grad_norm": 1.1215482357989177, "learning_rate": 4.763465373780223e-06, "loss": 0.0854, "step": 1535 }, { "epoch": 0.6988171064604186, "grad_norm": 0.9398991503235029, "learning_rate": 4.763161849534902e-06, "loss": 0.0708, "step": 1536 }, { "epoch": 0.6992720655141037, "grad_norm": 0.5429847226903595, "learning_rate": 4.762858140354214e-06, "loss": 0.0563, "step": 1537 }, { "epoch": 0.6997270245677889, "grad_norm": 0.7113795106394718, "learning_rate": 4.7625542462629785e-06, "loss": 0.0639, "step": 1538 }, { "epoch": 0.7001819836214741, "grad_norm": 0.4626733648173771, "learning_rate": 4.762250167286027e-06, "loss": 0.0323, "step": 1539 }, { "epoch": 0.7006369426751592, "grad_norm": 0.9326531891804614, "learning_rate": 4.761945903448209e-06, "loss": 0.0901, "step": 1540 }, { "epoch": 0.7010919017288444, "grad_norm": 0.5398486547930679, "learning_rate": 4.761641454774386e-06, "loss": 0.053, "step": 1541 }, { "epoch": 0.7015468607825296, "grad_norm": 0.9874559613432076, "learning_rate": 4.761336821289436e-06, "loss": 0.0966, "step": 1542 }, { "epoch": 0.7020018198362148, "grad_norm": 0.5432433449970621, "learning_rate": 4.761032003018254e-06, "loss": 0.0513, "step": 1543 }, { "epoch": 0.7024567788898999, "grad_norm": 0.622579728480354, "learning_rate": 4.760726999985748e-06, "loss": 0.0441, "step": 1544 }, { "epoch": 0.7029117379435851, "grad_norm": 0.6451217496312431, "learning_rate": 4.7604218122168406e-06, "loss": 0.0552, "step": 1545 }, { "epoch": 0.7033666969972703, "grad_norm": 1.1118626340333584, "learning_rate": 4.760116439736471e-06, "loss": 0.1001, "step": 1546 }, { "epoch": 0.7038216560509554, "grad_norm": 1.1718299505240957, "learning_rate": 4.759810882569591e-06, "loss": 0.1093, "step": 1547 }, { "epoch": 0.7042766151046406, "grad_norm": 0.4549553550039402, "learning_rate": 4.759505140741172e-06, "loss": 0.037, "step": 1548 }, { "epoch": 0.7047315741583258, "grad_norm": 1.2374534576601486, "learning_rate": 4.759199214276196e-06, "loss": 0.1075, "step": 1549 }, { "epoch": 0.705186533212011, "grad_norm": 0.6890766528984787, "learning_rate": 4.758893103199665e-06, "loss": 0.0704, "step": 1550 }, { "epoch": 0.7056414922656961, "grad_norm": 0.809006377964544, "learning_rate": 4.758586807536588e-06, "loss": 0.0635, "step": 1551 }, { "epoch": 0.7060964513193813, "grad_norm": 0.46816790314940004, "learning_rate": 4.758280327311998e-06, "loss": 0.0396, "step": 1552 }, { "epoch": 0.7065514103730665, "grad_norm": 0.956864928582162, "learning_rate": 4.757973662550938e-06, "loss": 0.0715, "step": 1553 }, { "epoch": 0.7070063694267515, "grad_norm": 0.6528599047999262, "learning_rate": 4.757666813278466e-06, "loss": 0.0525, "step": 1554 }, { "epoch": 0.7074613284804367, "grad_norm": 0.8181545206568527, "learning_rate": 4.757359779519659e-06, "loss": 0.0727, "step": 1555 }, { "epoch": 0.707916287534122, "grad_norm": 0.9844274916847088, "learning_rate": 4.757052561299604e-06, "loss": 0.0991, "step": 1556 }, { "epoch": 0.7083712465878071, "grad_norm": 1.0768378272925192, "learning_rate": 4.756745158643407e-06, "loss": 0.0976, "step": 1557 }, { "epoch": 0.7088262056414922, "grad_norm": 0.8642164719519719, "learning_rate": 4.7564375715761865e-06, "loss": 0.0791, "step": 1558 }, { "epoch": 0.7092811646951774, "grad_norm": 0.433851399002758, "learning_rate": 4.756129800123078e-06, "loss": 0.0264, "step": 1559 }, { "epoch": 0.7097361237488626, "grad_norm": 0.701794046170542, "learning_rate": 4.755821844309232e-06, "loss": 0.078, "step": 1560 }, { "epoch": 0.7101910828025477, "grad_norm": 0.7934468928491815, "learning_rate": 4.75551370415981e-06, "loss": 0.0705, "step": 1561 }, { "epoch": 0.7106460418562329, "grad_norm": 0.8567300928308393, "learning_rate": 4.755205379699996e-06, "loss": 0.0708, "step": 1562 }, { "epoch": 0.7111010009099181, "grad_norm": 0.7631031658693246, "learning_rate": 4.75489687095498e-06, "loss": 0.0696, "step": 1563 }, { "epoch": 0.7115559599636033, "grad_norm": 0.920620931877378, "learning_rate": 4.754588177949977e-06, "loss": 0.0721, "step": 1564 }, { "epoch": 0.7120109190172884, "grad_norm": 0.763031156086878, "learning_rate": 4.7542793007102086e-06, "loss": 0.0519, "step": 1565 }, { "epoch": 0.7124658780709736, "grad_norm": 0.8338043690194923, "learning_rate": 4.7539702392609165e-06, "loss": 0.091, "step": 1566 }, { "epoch": 0.7129208371246588, "grad_norm": 0.5783419362259836, "learning_rate": 4.753660993627356e-06, "loss": 0.0383, "step": 1567 }, { "epoch": 0.7133757961783439, "grad_norm": 1.091175103721431, "learning_rate": 4.753351563834795e-06, "loss": 0.0874, "step": 1568 }, { "epoch": 0.7138307552320291, "grad_norm": 0.9259203812623706, "learning_rate": 4.753041949908521e-06, "loss": 0.0658, "step": 1569 }, { "epoch": 0.7142857142857143, "grad_norm": 0.9365180546757296, "learning_rate": 4.752732151873834e-06, "loss": 0.0692, "step": 1570 }, { "epoch": 0.7147406733393995, "grad_norm": 0.7100718342966104, "learning_rate": 4.752422169756048e-06, "loss": 0.0711, "step": 1571 }, { "epoch": 0.7151956323930846, "grad_norm": 0.5978251540753616, "learning_rate": 4.752112003580495e-06, "loss": 0.0486, "step": 1572 }, { "epoch": 0.7156505914467698, "grad_norm": 0.7341732428573583, "learning_rate": 4.751801653372518e-06, "loss": 0.0462, "step": 1573 }, { "epoch": 0.716105550500455, "grad_norm": 0.7943019606068298, "learning_rate": 4.751491119157481e-06, "loss": 0.0679, "step": 1574 }, { "epoch": 0.7165605095541401, "grad_norm": 0.5499580471761844, "learning_rate": 4.751180400960756e-06, "loss": 0.0469, "step": 1575 }, { "epoch": 0.7170154686078253, "grad_norm": 0.8873520635980867, "learning_rate": 4.7508694988077355e-06, "loss": 0.0804, "step": 1576 }, { "epoch": 0.7174704276615105, "grad_norm": 0.7826027240405181, "learning_rate": 4.750558412723824e-06, "loss": 0.0534, "step": 1577 }, { "epoch": 0.7179253867151957, "grad_norm": 0.5436399916764901, "learning_rate": 4.750247142734442e-06, "loss": 0.0422, "step": 1578 }, { "epoch": 0.7183803457688808, "grad_norm": 0.8976952415351162, "learning_rate": 4.749935688865026e-06, "loss": 0.0946, "step": 1579 }, { "epoch": 0.718835304822566, "grad_norm": 0.5897366750354841, "learning_rate": 4.749624051141026e-06, "loss": 0.0448, "step": 1580 }, { "epoch": 0.7192902638762512, "grad_norm": 0.522008805738841, "learning_rate": 4.7493122295879076e-06, "loss": 0.0479, "step": 1581 }, { "epoch": 0.7197452229299363, "grad_norm": 0.8905616220483812, "learning_rate": 4.7490002242311525e-06, "loss": 0.0769, "step": 1582 }, { "epoch": 0.7202001819836215, "grad_norm": 0.7188643596274509, "learning_rate": 4.748688035096255e-06, "loss": 0.059, "step": 1583 }, { "epoch": 0.7206551410373067, "grad_norm": 0.6538082906296614, "learning_rate": 4.748375662208726e-06, "loss": 0.0421, "step": 1584 }, { "epoch": 0.7211101000909919, "grad_norm": 0.9507382067003013, "learning_rate": 4.748063105594092e-06, "loss": 0.0885, "step": 1585 }, { "epoch": 0.721565059144677, "grad_norm": 0.8751185263070331, "learning_rate": 4.747750365277892e-06, "loss": 0.082, "step": 1586 }, { "epoch": 0.7220200181983621, "grad_norm": 0.7815947443354453, "learning_rate": 4.747437441285684e-06, "loss": 0.0496, "step": 1587 }, { "epoch": 0.7224749772520473, "grad_norm": 0.8056389354729365, "learning_rate": 4.747124333643038e-06, "loss": 0.0758, "step": 1588 }, { "epoch": 0.7229299363057324, "grad_norm": 0.8424702785094896, "learning_rate": 4.746811042375538e-06, "loss": 0.0685, "step": 1589 }, { "epoch": 0.7233848953594176, "grad_norm": 0.7103882690080215, "learning_rate": 4.746497567508787e-06, "loss": 0.0611, "step": 1590 }, { "epoch": 0.7238398544131028, "grad_norm": 0.7217352053119206, "learning_rate": 4.7461839090684e-06, "loss": 0.0749, "step": 1591 }, { "epoch": 0.724294813466788, "grad_norm": 0.820137321844301, "learning_rate": 4.745870067080007e-06, "loss": 0.0624, "step": 1592 }, { "epoch": 0.7247497725204731, "grad_norm": 1.3704268529969041, "learning_rate": 4.7455560415692545e-06, "loss": 0.1186, "step": 1593 }, { "epoch": 0.7252047315741583, "grad_norm": 0.8173046496173154, "learning_rate": 4.745241832561803e-06, "loss": 0.0518, "step": 1594 }, { "epoch": 0.7256596906278435, "grad_norm": 0.6436436457004329, "learning_rate": 4.744927440083329e-06, "loss": 0.0601, "step": 1595 }, { "epoch": 0.7261146496815286, "grad_norm": 0.7006481377697166, "learning_rate": 4.744612864159522e-06, "loss": 0.0564, "step": 1596 }, { "epoch": 0.7265696087352138, "grad_norm": 0.9334346972782172, "learning_rate": 4.7442981048160895e-06, "loss": 0.0923, "step": 1597 }, { "epoch": 0.727024567788899, "grad_norm": 0.7193743397132841, "learning_rate": 4.74398316207875e-06, "loss": 0.0513, "step": 1598 }, { "epoch": 0.7274795268425842, "grad_norm": 1.3623828249001875, "learning_rate": 4.74366803597324e-06, "loss": 0.1304, "step": 1599 }, { "epoch": 0.7279344858962693, "grad_norm": 0.7986369654273386, "learning_rate": 4.743352726525311e-06, "loss": 0.0657, "step": 1600 }, { "epoch": 0.7283894449499545, "grad_norm": 0.6297195371205284, "learning_rate": 4.743037233760728e-06, "loss": 0.0518, "step": 1601 }, { "epoch": 0.7288444040036397, "grad_norm": 0.7797911267746347, "learning_rate": 4.742721557705271e-06, "loss": 0.0576, "step": 1602 }, { "epoch": 0.7292993630573248, "grad_norm": 0.6771322942350662, "learning_rate": 4.7424056983847374e-06, "loss": 0.0721, "step": 1603 }, { "epoch": 0.72975432211101, "grad_norm": 0.9370484628146732, "learning_rate": 4.7420896558249366e-06, "loss": 0.0818, "step": 1604 }, { "epoch": 0.7302092811646952, "grad_norm": 0.49858668464501216, "learning_rate": 4.741773430051694e-06, "loss": 0.0396, "step": 1605 }, { "epoch": 0.7306642402183804, "grad_norm": 0.7992861361600685, "learning_rate": 4.74145702109085e-06, "loss": 0.0681, "step": 1606 }, { "epoch": 0.7311191992720655, "grad_norm": 1.1366113719813769, "learning_rate": 4.741140428968261e-06, "loss": 0.0899, "step": 1607 }, { "epoch": 0.7315741583257507, "grad_norm": 0.4255609619960085, "learning_rate": 4.740823653709797e-06, "loss": 0.0374, "step": 1608 }, { "epoch": 0.7320291173794359, "grad_norm": 0.515074709614876, "learning_rate": 4.740506695341343e-06, "loss": 0.05, "step": 1609 }, { "epoch": 0.732484076433121, "grad_norm": 0.9479583163425525, "learning_rate": 4.740189553888801e-06, "loss": 0.0951, "step": 1610 }, { "epoch": 0.7329390354868062, "grad_norm": 0.6968388029166215, "learning_rate": 4.739872229378085e-06, "loss": 0.0585, "step": 1611 }, { "epoch": 0.7333939945404914, "grad_norm": 0.6907841663601652, "learning_rate": 4.739554721835125e-06, "loss": 0.0516, "step": 1612 }, { "epoch": 0.7338489535941766, "grad_norm": 0.931301306576401, "learning_rate": 4.739237031285867e-06, "loss": 0.0853, "step": 1613 }, { "epoch": 0.7343039126478617, "grad_norm": 0.8746710093849102, "learning_rate": 4.738919157756272e-06, "loss": 0.0726, "step": 1614 }, { "epoch": 0.7347588717015469, "grad_norm": 0.7323772065729443, "learning_rate": 4.738601101272313e-06, "loss": 0.0728, "step": 1615 }, { "epoch": 0.7352138307552321, "grad_norm": 0.6527662262438081, "learning_rate": 4.738282861859983e-06, "loss": 0.0566, "step": 1616 }, { "epoch": 0.7356687898089171, "grad_norm": 0.7588905056806852, "learning_rate": 4.737964439545284e-06, "loss": 0.0654, "step": 1617 }, { "epoch": 0.7361237488626023, "grad_norm": 0.7391620420308275, "learning_rate": 4.737645834354238e-06, "loss": 0.0577, "step": 1618 }, { "epoch": 0.7365787079162875, "grad_norm": 0.6241259905981298, "learning_rate": 4.737327046312879e-06, "loss": 0.0463, "step": 1619 }, { "epoch": 0.7370336669699727, "grad_norm": 0.7979480345626465, "learning_rate": 4.737008075447259e-06, "loss": 0.0594, "step": 1620 }, { "epoch": 0.7374886260236578, "grad_norm": 0.8204459689677482, "learning_rate": 4.73668892178344e-06, "loss": 0.0709, "step": 1621 }, { "epoch": 0.737943585077343, "grad_norm": 0.7262256791849234, "learning_rate": 4.736369585347503e-06, "loss": 0.0684, "step": 1622 }, { "epoch": 0.7383985441310282, "grad_norm": 1.0855588116680628, "learning_rate": 4.736050066165544e-06, "loss": 0.0733, "step": 1623 }, { "epoch": 0.7388535031847133, "grad_norm": 0.9128385411196811, "learning_rate": 4.735730364263671e-06, "loss": 0.0805, "step": 1624 }, { "epoch": 0.7393084622383985, "grad_norm": 0.6804670184644261, "learning_rate": 4.735410479668009e-06, "loss": 0.0625, "step": 1625 }, { "epoch": 0.7397634212920837, "grad_norm": 0.8471626944635055, "learning_rate": 4.735090412404697e-06, "loss": 0.0792, "step": 1626 }, { "epoch": 0.7402183803457689, "grad_norm": 0.740952084115674, "learning_rate": 4.734770162499891e-06, "loss": 0.054, "step": 1627 }, { "epoch": 0.740673339399454, "grad_norm": 0.6979194464789156, "learning_rate": 4.734449729979759e-06, "loss": 0.0516, "step": 1628 }, { "epoch": 0.7411282984531392, "grad_norm": 0.934620178703864, "learning_rate": 4.734129114870486e-06, "loss": 0.0728, "step": 1629 }, { "epoch": 0.7415832575068244, "grad_norm": 0.6776690850388026, "learning_rate": 4.733808317198271e-06, "loss": 0.0396, "step": 1630 }, { "epoch": 0.7420382165605095, "grad_norm": 0.6772926595393153, "learning_rate": 4.733487336989327e-06, "loss": 0.0582, "step": 1631 }, { "epoch": 0.7424931756141947, "grad_norm": 0.8592043857490114, "learning_rate": 4.733166174269886e-06, "loss": 0.073, "step": 1632 }, { "epoch": 0.7429481346678799, "grad_norm": 0.8641626074857739, "learning_rate": 4.732844829066189e-06, "loss": 0.0731, "step": 1633 }, { "epoch": 0.7434030937215651, "grad_norm": 0.9593751010905753, "learning_rate": 4.732523301404497e-06, "loss": 0.0753, "step": 1634 }, { "epoch": 0.7438580527752502, "grad_norm": 1.0669159630512404, "learning_rate": 4.732201591311082e-06, "loss": 0.0941, "step": 1635 }, { "epoch": 0.7443130118289354, "grad_norm": 0.7929283291326471, "learning_rate": 4.731879698812233e-06, "loss": 0.0858, "step": 1636 }, { "epoch": 0.7447679708826206, "grad_norm": 0.673427501132972, "learning_rate": 4.731557623934255e-06, "loss": 0.0513, "step": 1637 }, { "epoch": 0.7452229299363057, "grad_norm": 1.3418525148659195, "learning_rate": 4.7312353667034645e-06, "loss": 0.113, "step": 1638 }, { "epoch": 0.7456778889899909, "grad_norm": 1.2120619666958259, "learning_rate": 4.730912927146197e-06, "loss": 0.0919, "step": 1639 }, { "epoch": 0.7461328480436761, "grad_norm": 0.7616942301743401, "learning_rate": 4.7305903052888e-06, "loss": 0.0623, "step": 1640 }, { "epoch": 0.7465878070973613, "grad_norm": 0.958768278975319, "learning_rate": 4.730267501157636e-06, "loss": 0.071, "step": 1641 }, { "epoch": 0.7470427661510464, "grad_norm": 0.6568421919172921, "learning_rate": 4.729944514779084e-06, "loss": 0.0576, "step": 1642 }, { "epoch": 0.7474977252047316, "grad_norm": 0.6153711066410817, "learning_rate": 4.729621346179536e-06, "loss": 0.0605, "step": 1643 }, { "epoch": 0.7479526842584168, "grad_norm": 0.6856160527068095, "learning_rate": 4.7292979953854e-06, "loss": 0.0577, "step": 1644 }, { "epoch": 0.7484076433121019, "grad_norm": 0.5864170000219955, "learning_rate": 4.7289744624231004e-06, "loss": 0.0429, "step": 1645 }, { "epoch": 0.7488626023657871, "grad_norm": 1.1083470301221403, "learning_rate": 4.728650747319073e-06, "loss": 0.0895, "step": 1646 }, { "epoch": 0.7493175614194723, "grad_norm": 0.9211984823549421, "learning_rate": 4.728326850099771e-06, "loss": 0.0834, "step": 1647 }, { "epoch": 0.7497725204731575, "grad_norm": 0.6665680334269098, "learning_rate": 4.728002770791663e-06, "loss": 0.0641, "step": 1648 }, { "epoch": 0.7502274795268425, "grad_norm": 0.6938216889022656, "learning_rate": 4.727678509421229e-06, "loss": 0.0626, "step": 1649 }, { "epoch": 0.7506824385805277, "grad_norm": 0.7915210763484374, "learning_rate": 4.727354066014968e-06, "loss": 0.0449, "step": 1650 }, { "epoch": 0.7511373976342129, "grad_norm": 0.8713821596875527, "learning_rate": 4.727029440599391e-06, "loss": 0.0664, "step": 1651 }, { "epoch": 0.7515923566878981, "grad_norm": 0.633081271382763, "learning_rate": 4.726704633201025e-06, "loss": 0.0539, "step": 1652 }, { "epoch": 0.7520473157415832, "grad_norm": 0.926880670672549, "learning_rate": 4.726379643846412e-06, "loss": 0.0759, "step": 1653 }, { "epoch": 0.7525022747952684, "grad_norm": 1.1416005489607706, "learning_rate": 4.726054472562109e-06, "loss": 0.0837, "step": 1654 }, { "epoch": 0.7529572338489536, "grad_norm": 0.5402662393046999, "learning_rate": 4.725729119374687e-06, "loss": 0.0453, "step": 1655 }, { "epoch": 0.7534121929026387, "grad_norm": 0.6335686685514863, "learning_rate": 4.725403584310734e-06, "loss": 0.0461, "step": 1656 }, { "epoch": 0.7538671519563239, "grad_norm": 0.8262266594165262, "learning_rate": 4.725077867396849e-06, "loss": 0.0571, "step": 1657 }, { "epoch": 0.7543221110100091, "grad_norm": 0.7284181310556234, "learning_rate": 4.724751968659648e-06, "loss": 0.0776, "step": 1658 }, { "epoch": 0.7547770700636943, "grad_norm": 0.9346158065906657, "learning_rate": 4.724425888125764e-06, "loss": 0.0768, "step": 1659 }, { "epoch": 0.7552320291173794, "grad_norm": 0.7882371644054315, "learning_rate": 4.724099625821842e-06, "loss": 0.0662, "step": 1660 }, { "epoch": 0.7556869881710646, "grad_norm": 0.8622330946401275, "learning_rate": 4.723773181774543e-06, "loss": 0.0739, "step": 1661 }, { "epoch": 0.7561419472247498, "grad_norm": 0.882607769279876, "learning_rate": 4.723446556010542e-06, "loss": 0.0652, "step": 1662 }, { "epoch": 0.7565969062784349, "grad_norm": 0.6272241973275734, "learning_rate": 4.7231197485565275e-06, "loss": 0.0671, "step": 1663 }, { "epoch": 0.7570518653321201, "grad_norm": 0.9143319315222466, "learning_rate": 4.722792759439209e-06, "loss": 0.0836, "step": 1664 }, { "epoch": 0.7575068243858053, "grad_norm": 1.0179914574460616, "learning_rate": 4.722465588685302e-06, "loss": 0.1076, "step": 1665 }, { "epoch": 0.7579617834394905, "grad_norm": 0.5583219749541256, "learning_rate": 4.722138236321545e-06, "loss": 0.0402, "step": 1666 }, { "epoch": 0.7584167424931756, "grad_norm": 0.8435692964339, "learning_rate": 4.721810702374687e-06, "loss": 0.0557, "step": 1667 }, { "epoch": 0.7588717015468608, "grad_norm": 0.6927360095243408, "learning_rate": 4.721482986871491e-06, "loss": 0.0523, "step": 1668 }, { "epoch": 0.759326660600546, "grad_norm": 1.1648579407503177, "learning_rate": 4.721155089838738e-06, "loss": 0.0758, "step": 1669 }, { "epoch": 0.7597816196542311, "grad_norm": 0.7760606996897229, "learning_rate": 4.720827011303222e-06, "loss": 0.059, "step": 1670 }, { "epoch": 0.7602365787079163, "grad_norm": 0.866591093188149, "learning_rate": 4.720498751291751e-06, "loss": 0.0761, "step": 1671 }, { "epoch": 0.7606915377616015, "grad_norm": 1.0996180971293896, "learning_rate": 4.72017030983115e-06, "loss": 0.1103, "step": 1672 }, { "epoch": 0.7611464968152867, "grad_norm": 0.8231436249177936, "learning_rate": 4.7198416869482575e-06, "loss": 0.0713, "step": 1673 }, { "epoch": 0.7616014558689718, "grad_norm": 1.0653488989628077, "learning_rate": 4.719512882669926e-06, "loss": 0.0965, "step": 1674 }, { "epoch": 0.762056414922657, "grad_norm": 0.6519840385405045, "learning_rate": 4.719183897023027e-06, "loss": 0.0478, "step": 1675 }, { "epoch": 0.7625113739763422, "grad_norm": 0.9166137300574493, "learning_rate": 4.718854730034441e-06, "loss": 0.0616, "step": 1676 }, { "epoch": 0.7629663330300273, "grad_norm": 0.678521481275382, "learning_rate": 4.718525381731066e-06, "loss": 0.0562, "step": 1677 }, { "epoch": 0.7634212920837125, "grad_norm": 1.074542168121289, "learning_rate": 4.718195852139816e-06, "loss": 0.0955, "step": 1678 }, { "epoch": 0.7638762511373977, "grad_norm": 1.2860232860764726, "learning_rate": 4.717866141287618e-06, "loss": 0.1276, "step": 1679 }, { "epoch": 0.7643312101910829, "grad_norm": 0.8631797724656796, "learning_rate": 4.717536249201416e-06, "loss": 0.0698, "step": 1680 }, { "epoch": 0.7647861692447679, "grad_norm": 0.5780781227727216, "learning_rate": 4.7172061759081646e-06, "loss": 0.0516, "step": 1681 }, { "epoch": 0.7652411282984531, "grad_norm": 0.8601028523567151, "learning_rate": 4.716875921434838e-06, "loss": 0.0804, "step": 1682 }, { "epoch": 0.7656960873521383, "grad_norm": 0.8231714424296133, "learning_rate": 4.716545485808421e-06, "loss": 0.0673, "step": 1683 }, { "epoch": 0.7661510464058234, "grad_norm": 0.8877019503026795, "learning_rate": 4.716214869055918e-06, "loss": 0.0754, "step": 1684 }, { "epoch": 0.7666060054595086, "grad_norm": 0.8595543978861313, "learning_rate": 4.715884071204344e-06, "loss": 0.0758, "step": 1685 }, { "epoch": 0.7670609645131938, "grad_norm": 0.760119775875199, "learning_rate": 4.715553092280731e-06, "loss": 0.0717, "step": 1686 }, { "epoch": 0.767515923566879, "grad_norm": 0.9302850076057104, "learning_rate": 4.7152219323121246e-06, "loss": 0.0776, "step": 1687 }, { "epoch": 0.7679708826205641, "grad_norm": 1.202654506835177, "learning_rate": 4.714890591325586e-06, "loss": 0.0968, "step": 1688 }, { "epoch": 0.7684258416742493, "grad_norm": 0.6400839574993287, "learning_rate": 4.714559069348189e-06, "loss": 0.053, "step": 1689 }, { "epoch": 0.7688808007279345, "grad_norm": 0.9345879461383537, "learning_rate": 4.714227366407027e-06, "loss": 0.0754, "step": 1690 }, { "epoch": 0.7693357597816196, "grad_norm": 0.699599210661908, "learning_rate": 4.7138954825292035e-06, "loss": 0.0562, "step": 1691 }, { "epoch": 0.7697907188353048, "grad_norm": 0.7256617889152714, "learning_rate": 4.71356341774184e-06, "loss": 0.0647, "step": 1692 }, { "epoch": 0.77024567788899, "grad_norm": 0.8163148806797087, "learning_rate": 4.713231172072069e-06, "loss": 0.0647, "step": 1693 }, { "epoch": 0.7707006369426752, "grad_norm": 0.8921055220209645, "learning_rate": 4.712898745547043e-06, "loss": 0.0688, "step": 1694 }, { "epoch": 0.7711555959963603, "grad_norm": 0.8759443756147646, "learning_rate": 4.712566138193923e-06, "loss": 0.0861, "step": 1695 }, { "epoch": 0.7716105550500455, "grad_norm": 0.6748844572059718, "learning_rate": 4.712233350039892e-06, "loss": 0.0557, "step": 1696 }, { "epoch": 0.7720655141037307, "grad_norm": 0.6739690976228645, "learning_rate": 4.711900381112141e-06, "loss": 0.0467, "step": 1697 }, { "epoch": 0.7725204731574158, "grad_norm": 0.5565716241809456, "learning_rate": 4.71156723143788e-06, "loss": 0.0494, "step": 1698 }, { "epoch": 0.772975432211101, "grad_norm": 0.6665864775582577, "learning_rate": 4.711233901044332e-06, "loss": 0.0665, "step": 1699 }, { "epoch": 0.7734303912647862, "grad_norm": 0.7450992205724415, "learning_rate": 4.710900389958735e-06, "loss": 0.0718, "step": 1700 }, { "epoch": 0.7738853503184714, "grad_norm": 1.1792700578565205, "learning_rate": 4.710566698208343e-06, "loss": 0.1197, "step": 1701 }, { "epoch": 0.7743403093721565, "grad_norm": 0.7320056890324683, "learning_rate": 4.710232825820424e-06, "loss": 0.0651, "step": 1702 }, { "epoch": 0.7747952684258417, "grad_norm": 0.4782904500778329, "learning_rate": 4.709898772822258e-06, "loss": 0.0363, "step": 1703 }, { "epoch": 0.7752502274795269, "grad_norm": 0.589565072900403, "learning_rate": 4.709564539241145e-06, "loss": 0.052, "step": 1704 }, { "epoch": 0.775705186533212, "grad_norm": 0.6162848956452569, "learning_rate": 4.709230125104396e-06, "loss": 0.0581, "step": 1705 }, { "epoch": 0.7761601455868972, "grad_norm": 0.5419452584536989, "learning_rate": 4.708895530439339e-06, "loss": 0.0426, "step": 1706 }, { "epoch": 0.7766151046405824, "grad_norm": 0.64262799617097, "learning_rate": 4.708560755273313e-06, "loss": 0.0377, "step": 1707 }, { "epoch": 0.7770700636942676, "grad_norm": 1.2766398855768186, "learning_rate": 4.7082257996336765e-06, "loss": 0.1176, "step": 1708 }, { "epoch": 0.7775250227479527, "grad_norm": 0.6290733792474179, "learning_rate": 4.707890663547801e-06, "loss": 0.0621, "step": 1709 }, { "epoch": 0.7779799818016379, "grad_norm": 0.7132245474865738, "learning_rate": 4.7075553470430695e-06, "loss": 0.0729, "step": 1710 }, { "epoch": 0.778434940855323, "grad_norm": 1.2871463763795532, "learning_rate": 4.707219850146885e-06, "loss": 0.0809, "step": 1711 }, { "epoch": 0.7788898999090081, "grad_norm": 0.7326832656978515, "learning_rate": 4.706884172886662e-06, "loss": 0.0778, "step": 1712 }, { "epoch": 0.7793448589626933, "grad_norm": 0.584850834159967, "learning_rate": 4.706548315289831e-06, "loss": 0.0561, "step": 1713 }, { "epoch": 0.7797998180163785, "grad_norm": 0.6500063491474557, "learning_rate": 4.706212277383836e-06, "loss": 0.0546, "step": 1714 }, { "epoch": 0.7802547770700637, "grad_norm": 0.8671552717303382, "learning_rate": 4.705876059196136e-06, "loss": 0.0805, "step": 1715 }, { "epoch": 0.7807097361237488, "grad_norm": 0.836331279519993, "learning_rate": 4.705539660754208e-06, "loss": 0.0794, "step": 1716 }, { "epoch": 0.781164695177434, "grad_norm": 0.7331262117419055, "learning_rate": 4.705203082085538e-06, "loss": 0.0589, "step": 1717 }, { "epoch": 0.7816196542311192, "grad_norm": 1.0967129244905651, "learning_rate": 4.70486632321763e-06, "loss": 0.0875, "step": 1718 }, { "epoch": 0.7820746132848043, "grad_norm": 0.8063548612962124, "learning_rate": 4.7045293841780034e-06, "loss": 0.0638, "step": 1719 }, { "epoch": 0.7825295723384895, "grad_norm": 0.5877288681753885, "learning_rate": 4.704192264994193e-06, "loss": 0.0489, "step": 1720 }, { "epoch": 0.7829845313921747, "grad_norm": 0.4849602579402119, "learning_rate": 4.703854965693743e-06, "loss": 0.036, "step": 1721 }, { "epoch": 0.7834394904458599, "grad_norm": 0.9265838770512554, "learning_rate": 4.703517486304218e-06, "loss": 0.0864, "step": 1722 }, { "epoch": 0.783894449499545, "grad_norm": 0.6601652550077106, "learning_rate": 4.703179826853195e-06, "loss": 0.0628, "step": 1723 }, { "epoch": 0.7843494085532302, "grad_norm": 0.9233111628201732, "learning_rate": 4.702841987368265e-06, "loss": 0.0623, "step": 1724 }, { "epoch": 0.7848043676069154, "grad_norm": 0.6187150000991709, "learning_rate": 4.702503967877038e-06, "loss": 0.0411, "step": 1725 }, { "epoch": 0.7852593266606005, "grad_norm": 1.229045396910063, "learning_rate": 4.702165768407132e-06, "loss": 0.1123, "step": 1726 }, { "epoch": 0.7857142857142857, "grad_norm": 0.7445601607520801, "learning_rate": 4.701827388986185e-06, "loss": 0.0691, "step": 1727 }, { "epoch": 0.7861692447679709, "grad_norm": 0.7316496259855203, "learning_rate": 4.701488829641845e-06, "loss": 0.0561, "step": 1728 }, { "epoch": 0.7866242038216561, "grad_norm": 0.6991204172633203, "learning_rate": 4.701150090401782e-06, "loss": 0.063, "step": 1729 }, { "epoch": 0.7870791628753412, "grad_norm": 0.6297857414561489, "learning_rate": 4.700811171293673e-06, "loss": 0.0555, "step": 1730 }, { "epoch": 0.7875341219290264, "grad_norm": 0.6996558901945711, "learning_rate": 4.700472072345214e-06, "loss": 0.0746, "step": 1731 }, { "epoch": 0.7879890809827116, "grad_norm": 0.7853149482544831, "learning_rate": 4.700132793584113e-06, "loss": 0.0651, "step": 1732 }, { "epoch": 0.7884440400363967, "grad_norm": 0.7740092326049495, "learning_rate": 4.699793335038098e-06, "loss": 0.0616, "step": 1733 }, { "epoch": 0.7888989990900819, "grad_norm": 0.6626889036581106, "learning_rate": 4.699453696734905e-06, "loss": 0.059, "step": 1734 }, { "epoch": 0.7893539581437671, "grad_norm": 0.9561954479294612, "learning_rate": 4.699113878702288e-06, "loss": 0.0938, "step": 1735 }, { "epoch": 0.7898089171974523, "grad_norm": 0.5317461912915777, "learning_rate": 4.698773880968017e-06, "loss": 0.0359, "step": 1736 }, { "epoch": 0.7902638762511374, "grad_norm": 0.5011006710196552, "learning_rate": 4.698433703559874e-06, "loss": 0.0326, "step": 1737 }, { "epoch": 0.7907188353048226, "grad_norm": 0.5030053584799479, "learning_rate": 4.698093346505656e-06, "loss": 0.0409, "step": 1738 }, { "epoch": 0.7911737943585078, "grad_norm": 0.8510969897923517, "learning_rate": 4.697752809833177e-06, "loss": 0.0724, "step": 1739 }, { "epoch": 0.7916287534121929, "grad_norm": 1.0390434682676513, "learning_rate": 4.697412093570263e-06, "loss": 0.0797, "step": 1740 }, { "epoch": 0.792083712465878, "grad_norm": 0.6499989190029223, "learning_rate": 4.697071197744756e-06, "loss": 0.0494, "step": 1741 }, { "epoch": 0.7925386715195633, "grad_norm": 0.7483950848546489, "learning_rate": 4.6967301223845115e-06, "loss": 0.0507, "step": 1742 }, { "epoch": 0.7929936305732485, "grad_norm": 0.5874179561041022, "learning_rate": 4.696388867517403e-06, "loss": 0.0555, "step": 1743 }, { "epoch": 0.7934485896269335, "grad_norm": 0.6928591342268897, "learning_rate": 4.696047433171316e-06, "loss": 0.0484, "step": 1744 }, { "epoch": 0.7939035486806187, "grad_norm": 0.784441765858547, "learning_rate": 4.695705819374149e-06, "loss": 0.0611, "step": 1745 }, { "epoch": 0.7943585077343039, "grad_norm": 0.5652038657500907, "learning_rate": 4.695364026153818e-06, "loss": 0.0535, "step": 1746 }, { "epoch": 0.794813466787989, "grad_norm": 0.7298412289038372, "learning_rate": 4.695022053538253e-06, "loss": 0.0595, "step": 1747 }, { "epoch": 0.7952684258416742, "grad_norm": 1.0490206911158746, "learning_rate": 4.694679901555398e-06, "loss": 0.0861, "step": 1748 }, { "epoch": 0.7957233848953594, "grad_norm": 3.990810702240321, "learning_rate": 4.694337570233213e-06, "loss": 0.1767, "step": 1749 }, { "epoch": 0.7961783439490446, "grad_norm": 0.6802938770066911, "learning_rate": 4.693995059599672e-06, "loss": 0.0573, "step": 1750 }, { "epoch": 0.7966333030027297, "grad_norm": 0.8799308705617829, "learning_rate": 4.693652369682762e-06, "loss": 0.0811, "step": 1751 }, { "epoch": 0.7970882620564149, "grad_norm": 0.694712285767521, "learning_rate": 4.693309500510487e-06, "loss": 0.0452, "step": 1752 }, { "epoch": 0.7975432211101001, "grad_norm": 0.7794041106607383, "learning_rate": 4.692966452110864e-06, "loss": 0.0461, "step": 1753 }, { "epoch": 0.7979981801637852, "grad_norm": 0.7973687362919706, "learning_rate": 4.6926232245119265e-06, "loss": 0.0974, "step": 1754 }, { "epoch": 0.7984531392174704, "grad_norm": 0.9760391285618086, "learning_rate": 4.69227981774172e-06, "loss": 0.07, "step": 1755 }, { "epoch": 0.7989080982711556, "grad_norm": 0.9051191420196392, "learning_rate": 4.691936231828308e-06, "loss": 0.0701, "step": 1756 }, { "epoch": 0.7993630573248408, "grad_norm": 0.8399040364365982, "learning_rate": 4.691592466799766e-06, "loss": 0.08, "step": 1757 }, { "epoch": 0.7998180163785259, "grad_norm": 0.6489937656240298, "learning_rate": 4.691248522684184e-06, "loss": 0.0557, "step": 1758 }, { "epoch": 0.8002729754322111, "grad_norm": 0.5634573167334715, "learning_rate": 4.690904399509668e-06, "loss": 0.0424, "step": 1759 }, { "epoch": 0.8007279344858963, "grad_norm": 1.0271098563007677, "learning_rate": 4.69056009730434e-06, "loss": 0.0803, "step": 1760 }, { "epoch": 0.8011828935395814, "grad_norm": 1.0217177612047041, "learning_rate": 4.690215616096332e-06, "loss": 0.0883, "step": 1761 }, { "epoch": 0.8016378525932666, "grad_norm": 0.8407253882349629, "learning_rate": 4.689870955913796e-06, "loss": 0.0793, "step": 1762 }, { "epoch": 0.8020928116469518, "grad_norm": 0.5666180272561038, "learning_rate": 4.689526116784894e-06, "loss": 0.0459, "step": 1763 }, { "epoch": 0.802547770700637, "grad_norm": 0.8415069685290992, "learning_rate": 4.689181098737805e-06, "loss": 0.0649, "step": 1764 }, { "epoch": 0.8030027297543221, "grad_norm": 0.520623178173706, "learning_rate": 4.6888359018007235e-06, "loss": 0.0416, "step": 1765 }, { "epoch": 0.8034576888080073, "grad_norm": 0.6187159463210112, "learning_rate": 4.6884905260018565e-06, "loss": 0.0456, "step": 1766 }, { "epoch": 0.8039126478616925, "grad_norm": 0.7676728568516994, "learning_rate": 4.688144971369427e-06, "loss": 0.0604, "step": 1767 }, { "epoch": 0.8043676069153776, "grad_norm": 0.7921258117073752, "learning_rate": 4.687799237931673e-06, "loss": 0.0668, "step": 1768 }, { "epoch": 0.8048225659690628, "grad_norm": 0.9320479241709307, "learning_rate": 4.687453325716844e-06, "loss": 0.1011, "step": 1769 }, { "epoch": 0.805277525022748, "grad_norm": 1.0320110116628263, "learning_rate": 4.687107234753208e-06, "loss": 0.0777, "step": 1770 }, { "epoch": 0.8057324840764332, "grad_norm": 0.9182666614681877, "learning_rate": 4.686760965069046e-06, "loss": 0.0679, "step": 1771 }, { "epoch": 0.8061874431301183, "grad_norm": 0.6859628403586197, "learning_rate": 4.686414516692653e-06, "loss": 0.0735, "step": 1772 }, { "epoch": 0.8066424021838035, "grad_norm": 0.858787608672165, "learning_rate": 4.68606788965234e-06, "loss": 0.0739, "step": 1773 }, { "epoch": 0.8070973612374887, "grad_norm": 1.0829087136143425, "learning_rate": 4.68572108397643e-06, "loss": 0.1121, "step": 1774 }, { "epoch": 0.8075523202911737, "grad_norm": 0.8511945315116681, "learning_rate": 4.6853740996932645e-06, "loss": 0.0692, "step": 1775 }, { "epoch": 0.8080072793448589, "grad_norm": 0.6850807246243011, "learning_rate": 4.685026936831196e-06, "loss": 0.0572, "step": 1776 }, { "epoch": 0.8084622383985441, "grad_norm": 0.7373142885959381, "learning_rate": 4.684679595418595e-06, "loss": 0.0543, "step": 1777 }, { "epoch": 0.8089171974522293, "grad_norm": 0.9582099582222912, "learning_rate": 4.684332075483843e-06, "loss": 0.0575, "step": 1778 }, { "epoch": 0.8093721565059144, "grad_norm": 0.8529177501583067, "learning_rate": 4.6839843770553374e-06, "loss": 0.0829, "step": 1779 }, { "epoch": 0.8098271155595996, "grad_norm": 0.5785153858733987, "learning_rate": 4.683636500161491e-06, "loss": 0.0548, "step": 1780 }, { "epoch": 0.8102820746132848, "grad_norm": 1.1231799792720614, "learning_rate": 4.683288444830732e-06, "loss": 0.1008, "step": 1781 }, { "epoch": 0.8107370336669699, "grad_norm": 0.7388433195732499, "learning_rate": 4.6829402110915015e-06, "loss": 0.0554, "step": 1782 }, { "epoch": 0.8111919927206551, "grad_norm": 0.7176040956098546, "learning_rate": 4.682591798972253e-06, "loss": 0.0592, "step": 1783 }, { "epoch": 0.8116469517743403, "grad_norm": 0.6760595625823852, "learning_rate": 4.682243208501461e-06, "loss": 0.0621, "step": 1784 }, { "epoch": 0.8121019108280255, "grad_norm": 0.6601646947125127, "learning_rate": 4.681894439707609e-06, "loss": 0.0468, "step": 1785 }, { "epoch": 0.8125568698817106, "grad_norm": 0.6367038337766298, "learning_rate": 4.681545492619195e-06, "loss": 0.0523, "step": 1786 }, { "epoch": 0.8130118289353958, "grad_norm": 0.8235921980464636, "learning_rate": 4.681196367264736e-06, "loss": 0.064, "step": 1787 }, { "epoch": 0.813466787989081, "grad_norm": 0.7238890575594984, "learning_rate": 4.680847063672761e-06, "loss": 0.0664, "step": 1788 }, { "epoch": 0.8139217470427661, "grad_norm": 0.6470968736210913, "learning_rate": 4.680497581871811e-06, "loss": 0.0621, "step": 1789 }, { "epoch": 0.8143767060964513, "grad_norm": 0.9587164618230581, "learning_rate": 4.680147921890447e-06, "loss": 0.0699, "step": 1790 }, { "epoch": 0.8148316651501365, "grad_norm": 0.7070187493876317, "learning_rate": 4.67979808375724e-06, "loss": 0.0556, "step": 1791 }, { "epoch": 0.8152866242038217, "grad_norm": 0.7878117503048105, "learning_rate": 4.679448067500777e-06, "loss": 0.06, "step": 1792 }, { "epoch": 0.8157415832575068, "grad_norm": 7.5481195301184245, "learning_rate": 4.67909787314966e-06, "loss": 0.2081, "step": 1793 }, { "epoch": 0.816196542311192, "grad_norm": 1.0374324577053136, "learning_rate": 4.678747500732505e-06, "loss": 0.0815, "step": 1794 }, { "epoch": 0.8166515013648772, "grad_norm": 0.7986120350256016, "learning_rate": 4.6783969502779455e-06, "loss": 0.0624, "step": 1795 }, { "epoch": 0.8171064604185623, "grad_norm": 0.856218662448613, "learning_rate": 4.6780462218146236e-06, "loss": 0.0755, "step": 1796 }, { "epoch": 0.8175614194722475, "grad_norm": 0.8379197196440461, "learning_rate": 4.6776953153712005e-06, "loss": 0.0892, "step": 1797 }, { "epoch": 0.8180163785259327, "grad_norm": 1.2262653431681223, "learning_rate": 4.67734423097635e-06, "loss": 0.0677, "step": 1798 }, { "epoch": 0.8184713375796179, "grad_norm": 0.6080921164509873, "learning_rate": 4.676992968658762e-06, "loss": 0.0645, "step": 1799 }, { "epoch": 0.818926296633303, "grad_norm": 0.5894555534409597, "learning_rate": 4.67664152844714e-06, "loss": 0.0557, "step": 1800 }, { "epoch": 0.8193812556869882, "grad_norm": 0.7869279254500811, "learning_rate": 4.676289910370202e-06, "loss": 0.0507, "step": 1801 }, { "epoch": 0.8198362147406734, "grad_norm": 1.200875413797979, "learning_rate": 4.675938114456682e-06, "loss": 0.0878, "step": 1802 }, { "epoch": 0.8202911737943585, "grad_norm": 0.6675759838670742, "learning_rate": 4.675586140735323e-06, "loss": 0.0639, "step": 1803 }, { "epoch": 0.8207461328480437, "grad_norm": 0.7910152821490807, "learning_rate": 4.675233989234891e-06, "loss": 0.07, "step": 1804 }, { "epoch": 0.8212010919017289, "grad_norm": 0.6182132167368, "learning_rate": 4.67488165998416e-06, "loss": 0.0497, "step": 1805 }, { "epoch": 0.821656050955414, "grad_norm": 0.952276552917119, "learning_rate": 4.674529153011922e-06, "loss": 0.0898, "step": 1806 }, { "epoch": 0.8221110100090991, "grad_norm": 0.9577124870492246, "learning_rate": 4.674176468346982e-06, "loss": 0.0859, "step": 1807 }, { "epoch": 0.8225659690627843, "grad_norm": 0.6282575796746988, "learning_rate": 4.673823606018158e-06, "loss": 0.05, "step": 1808 }, { "epoch": 0.8230209281164695, "grad_norm": 0.7575136331083856, "learning_rate": 4.673470566054288e-06, "loss": 0.0668, "step": 1809 }, { "epoch": 0.8234758871701547, "grad_norm": 0.8264565607164219, "learning_rate": 4.673117348484217e-06, "loss": 0.0651, "step": 1810 }, { "epoch": 0.8239308462238398, "grad_norm": 0.5831702959060338, "learning_rate": 4.672763953336811e-06, "loss": 0.0552, "step": 1811 }, { "epoch": 0.824385805277525, "grad_norm": 0.9120237530681591, "learning_rate": 4.672410380640946e-06, "loss": 0.068, "step": 1812 }, { "epoch": 0.8248407643312102, "grad_norm": 0.7346308742565613, "learning_rate": 4.672056630425516e-06, "loss": 0.0649, "step": 1813 }, { "epoch": 0.8252957233848953, "grad_norm": 0.6814325566875316, "learning_rate": 4.671702702719426e-06, "loss": 0.059, "step": 1814 }, { "epoch": 0.8257506824385805, "grad_norm": 0.8584736371666158, "learning_rate": 4.671348597551599e-06, "loss": 0.0712, "step": 1815 }, { "epoch": 0.8262056414922657, "grad_norm": 0.7235192665941425, "learning_rate": 4.670994314950971e-06, "loss": 0.0626, "step": 1816 }, { "epoch": 0.8266606005459509, "grad_norm": 0.5383143835892068, "learning_rate": 4.6706398549464905e-06, "loss": 0.0398, "step": 1817 }, { "epoch": 0.827115559599636, "grad_norm": 0.5271328375766583, "learning_rate": 4.670285217567124e-06, "loss": 0.034, "step": 1818 }, { "epoch": 0.8275705186533212, "grad_norm": 0.8154779822293389, "learning_rate": 4.6699304028418516e-06, "loss": 0.0717, "step": 1819 }, { "epoch": 0.8280254777070064, "grad_norm": 0.9360607447963808, "learning_rate": 4.669575410799665e-06, "loss": 0.0619, "step": 1820 }, { "epoch": 0.8284804367606915, "grad_norm": 0.6087960564203705, "learning_rate": 4.669220241469573e-06, "loss": 0.0635, "step": 1821 }, { "epoch": 0.8289353958143767, "grad_norm": 0.7167883057342898, "learning_rate": 4.668864894880599e-06, "loss": 0.0693, "step": 1822 }, { "epoch": 0.8293903548680619, "grad_norm": 3.321238319754088, "learning_rate": 4.668509371061781e-06, "loss": 0.1734, "step": 1823 }, { "epoch": 0.8298453139217471, "grad_norm": 1.0425676852340926, "learning_rate": 4.668153670042171e-06, "loss": 0.0757, "step": 1824 }, { "epoch": 0.8303002729754322, "grad_norm": 0.662221744529951, "learning_rate": 4.667797791850833e-06, "loss": 0.0514, "step": 1825 }, { "epoch": 0.8307552320291174, "grad_norm": 0.7120101760933736, "learning_rate": 4.6674417365168495e-06, "loss": 0.0584, "step": 1826 }, { "epoch": 0.8312101910828026, "grad_norm": 0.8118980885746032, "learning_rate": 4.667085504069315e-06, "loss": 0.0698, "step": 1827 }, { "epoch": 0.8316651501364877, "grad_norm": 1.0677947820913898, "learning_rate": 4.66672909453734e-06, "loss": 0.1084, "step": 1828 }, { "epoch": 0.8321201091901729, "grad_norm": 7.344384584742864, "learning_rate": 4.6663725079500485e-06, "loss": 0.1199, "step": 1829 }, { "epoch": 0.8325750682438581, "grad_norm": 0.7616332667262099, "learning_rate": 4.666015744336578e-06, "loss": 0.0532, "step": 1830 }, { "epoch": 0.8330300272975433, "grad_norm": 0.598579113637809, "learning_rate": 4.665658803726083e-06, "loss": 0.0584, "step": 1831 }, { "epoch": 0.8334849863512284, "grad_norm": 0.5463327138280551, "learning_rate": 4.6653016861477315e-06, "loss": 0.0448, "step": 1832 }, { "epoch": 0.8339399454049136, "grad_norm": 0.6038418058635472, "learning_rate": 4.664944391630704e-06, "loss": 0.0491, "step": 1833 }, { "epoch": 0.8343949044585988, "grad_norm": 0.7544848763457503, "learning_rate": 4.664586920204197e-06, "loss": 0.0645, "step": 1834 }, { "epoch": 0.8348498635122839, "grad_norm": 0.807248967975218, "learning_rate": 4.664229271897422e-06, "loss": 0.0564, "step": 1835 }, { "epoch": 0.835304822565969, "grad_norm": 0.6512214018608161, "learning_rate": 4.663871446739606e-06, "loss": 0.0678, "step": 1836 }, { "epoch": 0.8357597816196543, "grad_norm": 0.8703248212743384, "learning_rate": 4.663513444759986e-06, "loss": 0.0613, "step": 1837 }, { "epoch": 0.8362147406733395, "grad_norm": 1.0090345588843233, "learning_rate": 4.663155265987818e-06, "loss": 0.0836, "step": 1838 }, { "epoch": 0.8366696997270245, "grad_norm": 0.6694681967933211, "learning_rate": 4.66279691045237e-06, "loss": 0.05, "step": 1839 }, { "epoch": 0.8371246587807097, "grad_norm": 1.1031071759011255, "learning_rate": 4.662438378182927e-06, "loss": 0.0957, "step": 1840 }, { "epoch": 0.8375796178343949, "grad_norm": 0.728630248897681, "learning_rate": 4.662079669208783e-06, "loss": 0.0605, "step": 1841 }, { "epoch": 0.83803457688808, "grad_norm": 1.001887022400648, "learning_rate": 4.661720783559254e-06, "loss": 0.0877, "step": 1842 }, { "epoch": 0.8384895359417652, "grad_norm": 0.6786956307872102, "learning_rate": 4.661361721263664e-06, "loss": 0.0559, "step": 1843 }, { "epoch": 0.8389444949954504, "grad_norm": 0.7188093304776232, "learning_rate": 4.661002482351355e-06, "loss": 0.0614, "step": 1844 }, { "epoch": 0.8393994540491356, "grad_norm": 0.7081647144591069, "learning_rate": 4.660643066851682e-06, "loss": 0.0496, "step": 1845 }, { "epoch": 0.8398544131028207, "grad_norm": 0.6295835306947963, "learning_rate": 4.6602834747940155e-06, "loss": 0.0585, "step": 1846 }, { "epoch": 0.8403093721565059, "grad_norm": 0.6877911302656206, "learning_rate": 4.6599237062077385e-06, "loss": 0.0537, "step": 1847 }, { "epoch": 0.8407643312101911, "grad_norm": 0.7291106168232226, "learning_rate": 4.65956376112225e-06, "loss": 0.0582, "step": 1848 }, { "epoch": 0.8412192902638762, "grad_norm": 0.45011772458421007, "learning_rate": 4.659203639566965e-06, "loss": 0.0324, "step": 1849 }, { "epoch": 0.8416742493175614, "grad_norm": 0.7993204530683249, "learning_rate": 4.658843341571308e-06, "loss": 0.065, "step": 1850 }, { "epoch": 0.8421292083712466, "grad_norm": 0.6243583315447274, "learning_rate": 4.6584828671647235e-06, "loss": 0.0476, "step": 1851 }, { "epoch": 0.8425841674249318, "grad_norm": 0.7933509538566996, "learning_rate": 4.658122216376666e-06, "loss": 0.0816, "step": 1852 }, { "epoch": 0.8430391264786169, "grad_norm": 1.1708916451450775, "learning_rate": 4.657761389236607e-06, "loss": 0.1023, "step": 1853 }, { "epoch": 0.8434940855323021, "grad_norm": 1.2414495723920722, "learning_rate": 4.657400385774032e-06, "loss": 0.0961, "step": 1854 }, { "epoch": 0.8439490445859873, "grad_norm": 0.8143335380819541, "learning_rate": 4.65703920601844e-06, "loss": 0.0868, "step": 1855 }, { "epoch": 0.8444040036396724, "grad_norm": 0.6991877465392216, "learning_rate": 4.656677849999345e-06, "loss": 0.05, "step": 1856 }, { "epoch": 0.8448589626933576, "grad_norm": 0.578084819635394, "learning_rate": 4.656316317746275e-06, "loss": 0.0351, "step": 1857 }, { "epoch": 0.8453139217470428, "grad_norm": 0.7330779326514238, "learning_rate": 4.655954609288775e-06, "loss": 0.0611, "step": 1858 }, { "epoch": 0.845768880800728, "grad_norm": 0.8310851616975516, "learning_rate": 4.655592724656399e-06, "loss": 0.0707, "step": 1859 }, { "epoch": 0.8462238398544131, "grad_norm": 0.548925696011472, "learning_rate": 4.655230663878721e-06, "loss": 0.0465, "step": 1860 }, { "epoch": 0.8466787989080983, "grad_norm": 0.6143015545428137, "learning_rate": 4.654868426985326e-06, "loss": 0.0485, "step": 1861 }, { "epoch": 0.8471337579617835, "grad_norm": 1.110252508265771, "learning_rate": 4.654506014005814e-06, "loss": 0.107, "step": 1862 }, { "epoch": 0.8475887170154686, "grad_norm": 0.9959562888894351, "learning_rate": 4.6541434249698e-06, "loss": 0.0833, "step": 1863 }, { "epoch": 0.8480436760691538, "grad_norm": 0.5468147069872943, "learning_rate": 4.6537806599069144e-06, "loss": 0.0515, "step": 1864 }, { "epoch": 0.848498635122839, "grad_norm": 0.8717783156119658, "learning_rate": 4.653417718846799e-06, "loss": 0.0708, "step": 1865 }, { "epoch": 0.8489535941765242, "grad_norm": 0.9951106402676078, "learning_rate": 4.6530546018191126e-06, "loss": 0.0676, "step": 1866 }, { "epoch": 0.8494085532302093, "grad_norm": 1.7646743889926437, "learning_rate": 4.652691308853526e-06, "loss": 0.0941, "step": 1867 }, { "epoch": 0.8498635122838945, "grad_norm": 0.7838033849127587, "learning_rate": 4.652327839979729e-06, "loss": 0.0658, "step": 1868 }, { "epoch": 0.8503184713375797, "grad_norm": 0.8803955131716555, "learning_rate": 4.651964195227419e-06, "loss": 0.0512, "step": 1869 }, { "epoch": 0.8507734303912647, "grad_norm": 0.6367380704516323, "learning_rate": 4.651600374626315e-06, "loss": 0.0627, "step": 1870 }, { "epoch": 0.8512283894449499, "grad_norm": 0.7460847826389658, "learning_rate": 4.651236378206144e-06, "loss": 0.0631, "step": 1871 }, { "epoch": 0.8516833484986351, "grad_norm": 1.0421113051413289, "learning_rate": 4.650872205996651e-06, "loss": 0.0895, "step": 1872 }, { "epoch": 0.8521383075523203, "grad_norm": 0.9088772885373132, "learning_rate": 4.650507858027595e-06, "loss": 0.0634, "step": 1873 }, { "epoch": 0.8525932666060054, "grad_norm": 0.6693618140614889, "learning_rate": 4.6501433343287475e-06, "loss": 0.0573, "step": 1874 }, { "epoch": 0.8530482256596906, "grad_norm": 1.0233365193295665, "learning_rate": 4.6497786349298975e-06, "loss": 0.0617, "step": 1875 }, { "epoch": 0.8535031847133758, "grad_norm": 0.703907501372265, "learning_rate": 4.649413759860846e-06, "loss": 0.0478, "step": 1876 }, { "epoch": 0.8539581437670609, "grad_norm": 0.8059119032929001, "learning_rate": 4.649048709151408e-06, "loss": 0.0801, "step": 1877 }, { "epoch": 0.8544131028207461, "grad_norm": 0.6078716632060385, "learning_rate": 4.648683482831415e-06, "loss": 0.0547, "step": 1878 }, { "epoch": 0.8548680618744313, "grad_norm": 0.9990647874074321, "learning_rate": 4.648318080930711e-06, "loss": 0.0924, "step": 1879 }, { "epoch": 0.8553230209281165, "grad_norm": 2.2575417512955327, "learning_rate": 4.647952503479154e-06, "loss": 0.119, "step": 1880 }, { "epoch": 0.8557779799818016, "grad_norm": 0.9125736125319561, "learning_rate": 4.6475867505066195e-06, "loss": 0.0842, "step": 1881 }, { "epoch": 0.8562329390354868, "grad_norm": 0.8327393689357763, "learning_rate": 4.647220822042995e-06, "loss": 0.0786, "step": 1882 }, { "epoch": 0.856687898089172, "grad_norm": 0.7174000884605474, "learning_rate": 4.64685471811818e-06, "loss": 0.0543, "step": 1883 }, { "epoch": 0.8571428571428571, "grad_norm": 0.7398748518067477, "learning_rate": 4.646488438762094e-06, "loss": 0.073, "step": 1884 }, { "epoch": 0.8575978161965423, "grad_norm": 0.9193811162801766, "learning_rate": 4.646121984004666e-06, "loss": 0.0811, "step": 1885 }, { "epoch": 0.8580527752502275, "grad_norm": 1.5266758295579101, "learning_rate": 4.64575535387584e-06, "loss": 0.1173, "step": 1886 }, { "epoch": 0.8585077343039127, "grad_norm": 1.0390577419470794, "learning_rate": 4.645388548405578e-06, "loss": 0.0844, "step": 1887 }, { "epoch": 0.8589626933575978, "grad_norm": 0.9981604633117009, "learning_rate": 4.645021567623852e-06, "loss": 0.076, "step": 1888 }, { "epoch": 0.859417652411283, "grad_norm": 0.8090672789437001, "learning_rate": 4.644654411560651e-06, "loss": 0.0668, "step": 1889 }, { "epoch": 0.8598726114649682, "grad_norm": 1.0022610897873472, "learning_rate": 4.644287080245975e-06, "loss": 0.0647, "step": 1890 }, { "epoch": 0.8603275705186533, "grad_norm": 0.8678922953452304, "learning_rate": 4.643919573709843e-06, "loss": 0.0779, "step": 1891 }, { "epoch": 0.8607825295723385, "grad_norm": 0.828489427676361, "learning_rate": 4.6435518919822854e-06, "loss": 0.0883, "step": 1892 }, { "epoch": 0.8612374886260237, "grad_norm": 0.6365179726435326, "learning_rate": 4.643184035093348e-06, "loss": 0.0485, "step": 1893 }, { "epoch": 0.8616924476797089, "grad_norm": 0.9364968630062864, "learning_rate": 4.642816003073089e-06, "loss": 0.0653, "step": 1894 }, { "epoch": 0.862147406733394, "grad_norm": 0.664790354826276, "learning_rate": 4.6424477959515836e-06, "loss": 0.0651, "step": 1895 }, { "epoch": 0.8626023657870792, "grad_norm": 0.6472325137853298, "learning_rate": 4.642079413758919e-06, "loss": 0.0563, "step": 1896 }, { "epoch": 0.8630573248407644, "grad_norm": 0.6990154291209034, "learning_rate": 4.641710856525199e-06, "loss": 0.0569, "step": 1897 }, { "epoch": 0.8635122838944495, "grad_norm": 0.8678945062000727, "learning_rate": 4.641342124280539e-06, "loss": 0.0901, "step": 1898 }, { "epoch": 0.8639672429481347, "grad_norm": 0.5665084013617818, "learning_rate": 4.6409732170550705e-06, "loss": 0.0487, "step": 1899 }, { "epoch": 0.8644222020018199, "grad_norm": 0.7935558097630077, "learning_rate": 4.64060413487894e-06, "loss": 0.0812, "step": 1900 }, { "epoch": 0.864877161055505, "grad_norm": 0.5864687466325638, "learning_rate": 4.640234877782306e-06, "loss": 0.0458, "step": 1901 }, { "epoch": 0.8653321201091901, "grad_norm": 0.813443015691526, "learning_rate": 4.639865445795344e-06, "loss": 0.0501, "step": 1902 }, { "epoch": 0.8657870791628753, "grad_norm": 0.5920765909800347, "learning_rate": 4.63949583894824e-06, "loss": 0.0547, "step": 1903 }, { "epoch": 0.8662420382165605, "grad_norm": 0.8617399384530425, "learning_rate": 4.639126057271199e-06, "loss": 0.0826, "step": 1904 }, { "epoch": 0.8666969972702456, "grad_norm": 1.1341813288905305, "learning_rate": 4.6387561007944355e-06, "loss": 0.1245, "step": 1905 }, { "epoch": 0.8671519563239308, "grad_norm": 0.7626148045229316, "learning_rate": 4.638385969548183e-06, "loss": 0.086, "step": 1906 }, { "epoch": 0.867606915377616, "grad_norm": 0.7074898750617904, "learning_rate": 4.638015663562686e-06, "loss": 0.0648, "step": 1907 }, { "epoch": 0.8680618744313012, "grad_norm": 0.7616835782134675, "learning_rate": 4.637645182868204e-06, "loss": 0.0662, "step": 1908 }, { "epoch": 0.8685168334849863, "grad_norm": 0.6447306948142749, "learning_rate": 4.637274527495011e-06, "loss": 0.0466, "step": 1909 }, { "epoch": 0.8689717925386715, "grad_norm": 0.6794305905203397, "learning_rate": 4.6369036974733955e-06, "loss": 0.0608, "step": 1910 }, { "epoch": 0.8694267515923567, "grad_norm": 0.6697315489178187, "learning_rate": 4.63653269283366e-06, "loss": 0.0638, "step": 1911 }, { "epoch": 0.8698817106460418, "grad_norm": 0.6256598442886095, "learning_rate": 4.636161513606122e-06, "loss": 0.0673, "step": 1912 }, { "epoch": 0.870336669699727, "grad_norm": 0.7994234222622871, "learning_rate": 4.6357901598211105e-06, "loss": 0.0821, "step": 1913 }, { "epoch": 0.8707916287534122, "grad_norm": 0.7041048918645969, "learning_rate": 4.635418631508974e-06, "loss": 0.0589, "step": 1914 }, { "epoch": 0.8712465878070974, "grad_norm": 0.5910855820794297, "learning_rate": 4.635046928700069e-06, "loss": 0.0618, "step": 1915 }, { "epoch": 0.8717015468607825, "grad_norm": 0.6953978547081013, "learning_rate": 4.634675051424771e-06, "loss": 0.0609, "step": 1916 }, { "epoch": 0.8721565059144677, "grad_norm": 0.6402822857806215, "learning_rate": 4.634302999713468e-06, "loss": 0.05, "step": 1917 }, { "epoch": 0.8726114649681529, "grad_norm": 1.4100178497357636, "learning_rate": 4.633930773596563e-06, "loss": 0.1251, "step": 1918 }, { "epoch": 0.873066424021838, "grad_norm": 1.0067064388849685, "learning_rate": 4.633558373104472e-06, "loss": 0.0863, "step": 1919 }, { "epoch": 0.8735213830755232, "grad_norm": 0.7720432867298371, "learning_rate": 4.633185798267625e-06, "loss": 0.0812, "step": 1920 }, { "epoch": 0.8739763421292084, "grad_norm": 0.9130001191840268, "learning_rate": 4.632813049116467e-06, "loss": 0.0762, "step": 1921 }, { "epoch": 0.8744313011828936, "grad_norm": 0.7297896124591896, "learning_rate": 4.63244012568146e-06, "loss": 0.0623, "step": 1922 }, { "epoch": 0.8748862602365787, "grad_norm": 1.1183042674093928, "learning_rate": 4.632067027993076e-06, "loss": 0.073, "step": 1923 }, { "epoch": 0.8753412192902639, "grad_norm": 0.8542658526151589, "learning_rate": 4.631693756081802e-06, "loss": 0.0719, "step": 1924 }, { "epoch": 0.8757961783439491, "grad_norm": 0.6727521948478059, "learning_rate": 4.631320309978141e-06, "loss": 0.072, "step": 1925 }, { "epoch": 0.8762511373976342, "grad_norm": 0.8947150872279354, "learning_rate": 4.630946689712609e-06, "loss": 0.0775, "step": 1926 }, { "epoch": 0.8767060964513194, "grad_norm": 0.7373481218781285, "learning_rate": 4.630572895315737e-06, "loss": 0.058, "step": 1927 }, { "epoch": 0.8771610555050046, "grad_norm": 0.4756516758736572, "learning_rate": 4.63019892681807e-06, "loss": 0.0445, "step": 1928 }, { "epoch": 0.8776160145586898, "grad_norm": 0.5208149626418009, "learning_rate": 4.629824784250166e-06, "loss": 0.0487, "step": 1929 }, { "epoch": 0.8780709736123748, "grad_norm": 0.5811343609358607, "learning_rate": 4.629450467642599e-06, "loss": 0.0473, "step": 1930 }, { "epoch": 0.87852593266606, "grad_norm": 0.6428228760282421, "learning_rate": 4.629075977025957e-06, "loss": 0.0691, "step": 1931 }, { "epoch": 0.8789808917197452, "grad_norm": 0.7532116780570327, "learning_rate": 4.62870131243084e-06, "loss": 0.076, "step": 1932 }, { "epoch": 0.8794358507734303, "grad_norm": 0.6362438045015979, "learning_rate": 4.628326473887865e-06, "loss": 0.0437, "step": 1933 }, { "epoch": 0.8798908098271155, "grad_norm": 0.6092139546854358, "learning_rate": 4.627951461427663e-06, "loss": 0.0401, "step": 1934 }, { "epoch": 0.8803457688808007, "grad_norm": 0.6583098597040281, "learning_rate": 4.627576275080876e-06, "loss": 0.0549, "step": 1935 }, { "epoch": 0.8808007279344859, "grad_norm": 0.6448996031197749, "learning_rate": 4.627200914878165e-06, "loss": 0.0566, "step": 1936 }, { "epoch": 0.881255686988171, "grad_norm": 0.9917471720064225, "learning_rate": 4.6268253808502005e-06, "loss": 0.0949, "step": 1937 }, { "epoch": 0.8817106460418562, "grad_norm": 0.4519754904291037, "learning_rate": 4.626449673027671e-06, "loss": 0.0369, "step": 1938 }, { "epoch": 0.8821656050955414, "grad_norm": 0.7122032429922148, "learning_rate": 4.626073791441278e-06, "loss": 0.0639, "step": 1939 }, { "epoch": 0.8826205641492265, "grad_norm": 0.7957247576823104, "learning_rate": 4.625697736121735e-06, "loss": 0.076, "step": 1940 }, { "epoch": 0.8830755232029117, "grad_norm": 1.0448936625007237, "learning_rate": 4.6253215070997735e-06, "loss": 0.0947, "step": 1941 }, { "epoch": 0.8835304822565969, "grad_norm": 0.7138340745094094, "learning_rate": 4.624945104406135e-06, "loss": 0.0603, "step": 1942 }, { "epoch": 0.8839854413102821, "grad_norm": 0.7835217856531912, "learning_rate": 4.624568528071579e-06, "loss": 0.0568, "step": 1943 }, { "epoch": 0.8844404003639672, "grad_norm": 0.6440096764240494, "learning_rate": 4.624191778126879e-06, "loss": 0.0643, "step": 1944 }, { "epoch": 0.8848953594176524, "grad_norm": 0.9196121184501301, "learning_rate": 4.623814854602818e-06, "loss": 0.0861, "step": 1945 }, { "epoch": 0.8853503184713376, "grad_norm": 0.7529334500938356, "learning_rate": 4.623437757530198e-06, "loss": 0.0621, "step": 1946 }, { "epoch": 0.8858052775250227, "grad_norm": 0.6444747494120212, "learning_rate": 4.623060486939835e-06, "loss": 0.0651, "step": 1947 }, { "epoch": 0.8862602365787079, "grad_norm": 0.9312918449614406, "learning_rate": 4.622683042862556e-06, "loss": 0.0774, "step": 1948 }, { "epoch": 0.8867151956323931, "grad_norm": 0.571023976593836, "learning_rate": 4.622305425329205e-06, "loss": 0.0488, "step": 1949 }, { "epoch": 0.8871701546860783, "grad_norm": 0.4821842011661118, "learning_rate": 4.621927634370638e-06, "loss": 0.0413, "step": 1950 }, { "epoch": 0.8876251137397634, "grad_norm": 0.6368005077549093, "learning_rate": 4.621549670017727e-06, "loss": 0.0542, "step": 1951 }, { "epoch": 0.8880800727934486, "grad_norm": 0.7246243226204113, "learning_rate": 4.6211715323013595e-06, "loss": 0.0623, "step": 1952 }, { "epoch": 0.8885350318471338, "grad_norm": 0.7688120695285035, "learning_rate": 4.6207932212524325e-06, "loss": 0.0753, "step": 1953 }, { "epoch": 0.8889899909008189, "grad_norm": 0.7357035989399864, "learning_rate": 4.620414736901861e-06, "loss": 0.0645, "step": 1954 }, { "epoch": 0.8894449499545041, "grad_norm": 0.6921873537829016, "learning_rate": 4.620036079280573e-06, "loss": 0.0674, "step": 1955 }, { "epoch": 0.8898999090081893, "grad_norm": 0.9110581432700333, "learning_rate": 4.619657248419511e-06, "loss": 0.086, "step": 1956 }, { "epoch": 0.8903548680618745, "grad_norm": 0.698295996358244, "learning_rate": 4.61927824434963e-06, "loss": 0.0575, "step": 1957 }, { "epoch": 0.8908098271155596, "grad_norm": 0.9051372778412762, "learning_rate": 4.6188990671019015e-06, "loss": 0.0889, "step": 1958 }, { "epoch": 0.8912647861692448, "grad_norm": 0.7017231398160091, "learning_rate": 4.618519716707311e-06, "loss": 0.0693, "step": 1959 }, { "epoch": 0.89171974522293, "grad_norm": 0.8361247694534356, "learning_rate": 4.618140193196856e-06, "loss": 0.0678, "step": 1960 }, { "epoch": 0.892174704276615, "grad_norm": 0.682212468445449, "learning_rate": 4.61776049660155e-06, "loss": 0.0637, "step": 1961 }, { "epoch": 0.8926296633303002, "grad_norm": 0.529244582188164, "learning_rate": 4.61738062695242e-06, "loss": 0.0447, "step": 1962 }, { "epoch": 0.8930846223839854, "grad_norm": 0.8253799792777393, "learning_rate": 4.617000584280506e-06, "loss": 0.0539, "step": 1963 }, { "epoch": 0.8935395814376706, "grad_norm": 0.8363462937787527, "learning_rate": 4.616620368616866e-06, "loss": 0.0808, "step": 1964 }, { "epoch": 0.8939945404913557, "grad_norm": 1.208594162076693, "learning_rate": 4.616239979992568e-06, "loss": 0.1071, "step": 1965 }, { "epoch": 0.8944494995450409, "grad_norm": 0.7323210379619807, "learning_rate": 4.615859418438695e-06, "loss": 0.0617, "step": 1966 }, { "epoch": 0.8949044585987261, "grad_norm": 0.5029303539944047, "learning_rate": 4.615478683986345e-06, "loss": 0.0447, "step": 1967 }, { "epoch": 0.8953594176524113, "grad_norm": 1.1709329391548968, "learning_rate": 4.6150977766666315e-06, "loss": 0.1162, "step": 1968 }, { "epoch": 0.8958143767060964, "grad_norm": 2.1123193474897746, "learning_rate": 4.614716696510679e-06, "loss": 0.1355, "step": 1969 }, { "epoch": 0.8962693357597816, "grad_norm": 0.7444729756367264, "learning_rate": 4.614335443549628e-06, "loss": 0.059, "step": 1970 }, { "epoch": 0.8967242948134668, "grad_norm": 0.5785396613883771, "learning_rate": 4.613954017814633e-06, "loss": 0.0419, "step": 1971 }, { "epoch": 0.8971792538671519, "grad_norm": 0.6399971925788602, "learning_rate": 4.613572419336862e-06, "loss": 0.052, "step": 1972 }, { "epoch": 0.8976342129208371, "grad_norm": 1.0179431232847986, "learning_rate": 4.613190648147497e-06, "loss": 0.0795, "step": 1973 }, { "epoch": 0.8980891719745223, "grad_norm": 0.8018358499725766, "learning_rate": 4.612808704277736e-06, "loss": 0.0726, "step": 1974 }, { "epoch": 0.8985441310282075, "grad_norm": 1.057940475258921, "learning_rate": 4.612426587758789e-06, "loss": 0.1035, "step": 1975 }, { "epoch": 0.8989990900818926, "grad_norm": 0.6814493223121904, "learning_rate": 4.612044298621881e-06, "loss": 0.0617, "step": 1976 }, { "epoch": 0.8994540491355778, "grad_norm": 0.8657309258123689, "learning_rate": 4.611661836898252e-06, "loss": 0.0692, "step": 1977 }, { "epoch": 0.899909008189263, "grad_norm": 0.8580923454556945, "learning_rate": 4.611279202619151e-06, "loss": 0.0745, "step": 1978 }, { "epoch": 0.9003639672429481, "grad_norm": 0.8215028957062736, "learning_rate": 4.61089639581585e-06, "loss": 0.0788, "step": 1979 }, { "epoch": 0.9008189262966333, "grad_norm": 0.8812044153052796, "learning_rate": 4.610513416519628e-06, "loss": 0.0761, "step": 1980 }, { "epoch": 0.9012738853503185, "grad_norm": 0.7857406697564773, "learning_rate": 4.6101302647617806e-06, "loss": 0.0688, "step": 1981 }, { "epoch": 0.9017288444040037, "grad_norm": 0.8417981177673383, "learning_rate": 4.609746940573617e-06, "loss": 0.0689, "step": 1982 }, { "epoch": 0.9021838034576888, "grad_norm": 0.8169079358621493, "learning_rate": 4.609363443986461e-06, "loss": 0.0648, "step": 1983 }, { "epoch": 0.902638762511374, "grad_norm": 0.7566807475286295, "learning_rate": 4.60897977503165e-06, "loss": 0.0616, "step": 1984 }, { "epoch": 0.9030937215650592, "grad_norm": 1.0714038601336504, "learning_rate": 4.608595933740536e-06, "loss": 0.1018, "step": 1985 }, { "epoch": 0.9035486806187443, "grad_norm": 0.9493251787399868, "learning_rate": 4.608211920144485e-06, "loss": 0.1073, "step": 1986 }, { "epoch": 0.9040036396724295, "grad_norm": 0.9973544276569233, "learning_rate": 4.607827734274876e-06, "loss": 0.0864, "step": 1987 }, { "epoch": 0.9044585987261147, "grad_norm": 0.6218560970311163, "learning_rate": 4.607443376163104e-06, "loss": 0.0421, "step": 1988 }, { "epoch": 0.9049135577797999, "grad_norm": 0.7664118230444326, "learning_rate": 4.607058845840576e-06, "loss": 0.0663, "step": 1989 }, { "epoch": 0.905368516833485, "grad_norm": 0.5074521856900044, "learning_rate": 4.606674143338714e-06, "loss": 0.0417, "step": 1990 }, { "epoch": 0.9058234758871702, "grad_norm": 0.662797208968593, "learning_rate": 4.606289268688955e-06, "loss": 0.049, "step": 1991 }, { "epoch": 0.9062784349408554, "grad_norm": 0.8307833634275037, "learning_rate": 4.605904221922749e-06, "loss": 0.0688, "step": 1992 }, { "epoch": 0.9067333939945404, "grad_norm": 0.6991781224532665, "learning_rate": 4.6055190030715605e-06, "loss": 0.0548, "step": 1993 }, { "epoch": 0.9071883530482256, "grad_norm": 0.6371941048527485, "learning_rate": 4.605133612166868e-06, "loss": 0.0565, "step": 1994 }, { "epoch": 0.9076433121019108, "grad_norm": 0.7661847561683841, "learning_rate": 4.604748049240162e-06, "loss": 0.0751, "step": 1995 }, { "epoch": 0.908098271155596, "grad_norm": 0.698983684338044, "learning_rate": 4.604362314322951e-06, "loss": 0.0618, "step": 1996 }, { "epoch": 0.9085532302092811, "grad_norm": 0.7223001112189644, "learning_rate": 4.603976407446756e-06, "loss": 0.0604, "step": 1997 }, { "epoch": 0.9090081892629663, "grad_norm": 0.6724854071313681, "learning_rate": 4.603590328643108e-06, "loss": 0.047, "step": 1998 }, { "epoch": 0.9094631483166515, "grad_norm": 0.7241602536272622, "learning_rate": 4.60320407794356e-06, "loss": 0.0616, "step": 1999 }, { "epoch": 0.9099181073703366, "grad_norm": 0.661995220560214, "learning_rate": 4.602817655379672e-06, "loss": 0.0706, "step": 2000 }, { "epoch": 0.9103730664240218, "grad_norm": 0.6993774781403542, "learning_rate": 4.602431060983022e-06, "loss": 0.0667, "step": 2001 }, { "epoch": 0.910828025477707, "grad_norm": 0.7589317965474969, "learning_rate": 4.6020442947852e-06, "loss": 0.0781, "step": 2002 }, { "epoch": 0.9112829845313922, "grad_norm": 0.5530988126559163, "learning_rate": 4.6016573568178105e-06, "loss": 0.0417, "step": 2003 }, { "epoch": 0.9117379435850773, "grad_norm": 0.7697388126057101, "learning_rate": 4.601270247112473e-06, "loss": 0.0931, "step": 2004 }, { "epoch": 0.9121929026387625, "grad_norm": 0.7976083703107357, "learning_rate": 4.60088296570082e-06, "loss": 0.0536, "step": 2005 }, { "epoch": 0.9126478616924477, "grad_norm": 0.5035240024386465, "learning_rate": 4.600495512614499e-06, "loss": 0.0482, "step": 2006 }, { "epoch": 0.9131028207461328, "grad_norm": 0.9991410007845525, "learning_rate": 4.60010788788517e-06, "loss": 0.0982, "step": 2007 }, { "epoch": 0.913557779799818, "grad_norm": 0.741220096048184, "learning_rate": 4.5997200915445095e-06, "loss": 0.067, "step": 2008 }, { "epoch": 0.9140127388535032, "grad_norm": 0.7469087070674232, "learning_rate": 4.599332123624204e-06, "loss": 0.0762, "step": 2009 }, { "epoch": 0.9144676979071884, "grad_norm": 0.7289519001463325, "learning_rate": 4.598943984155959e-06, "loss": 0.0688, "step": 2010 }, { "epoch": 0.9149226569608735, "grad_norm": 0.6249519650896803, "learning_rate": 4.598555673171489e-06, "loss": 0.0455, "step": 2011 }, { "epoch": 0.9153776160145587, "grad_norm": 0.5475588599861211, "learning_rate": 4.5981671907025275e-06, "loss": 0.0453, "step": 2012 }, { "epoch": 0.9158325750682439, "grad_norm": 0.8942412439441615, "learning_rate": 4.597778536780818e-06, "loss": 0.0626, "step": 2013 }, { "epoch": 0.916287534121929, "grad_norm": 0.7776225162515735, "learning_rate": 4.597389711438121e-06, "loss": 0.0598, "step": 2014 }, { "epoch": 0.9167424931756142, "grad_norm": 0.5972302865002332, "learning_rate": 4.597000714706207e-06, "loss": 0.043, "step": 2015 }, { "epoch": 0.9171974522292994, "grad_norm": 1.0530137942692235, "learning_rate": 4.596611546616865e-06, "loss": 0.099, "step": 2016 }, { "epoch": 0.9176524112829846, "grad_norm": 0.8214731002522497, "learning_rate": 4.596222207201896e-06, "loss": 0.0651, "step": 2017 }, { "epoch": 0.9181073703366697, "grad_norm": 1.1339945212794975, "learning_rate": 4.595832696493115e-06, "loss": 0.1155, "step": 2018 }, { "epoch": 0.9185623293903549, "grad_norm": 0.7172896510951188, "learning_rate": 4.59544301452235e-06, "loss": 0.0591, "step": 2019 }, { "epoch": 0.9190172884440401, "grad_norm": 1.0007257724401275, "learning_rate": 4.595053161321444e-06, "loss": 0.0726, "step": 2020 }, { "epoch": 0.9194722474977252, "grad_norm": 0.6723310112271977, "learning_rate": 4.594663136922256e-06, "loss": 0.0585, "step": 2021 }, { "epoch": 0.9199272065514104, "grad_norm": 0.720246302857727, "learning_rate": 4.594272941356655e-06, "loss": 0.0563, "step": 2022 }, { "epoch": 0.9203821656050956, "grad_norm": 0.9382088606607213, "learning_rate": 4.593882574656528e-06, "loss": 0.081, "step": 2023 }, { "epoch": 0.9208371246587808, "grad_norm": 0.8121354764600041, "learning_rate": 4.5934920368537724e-06, "loss": 0.0595, "step": 2024 }, { "epoch": 0.9212920837124658, "grad_norm": 0.6886828076012578, "learning_rate": 4.593101327980301e-06, "loss": 0.0641, "step": 2025 }, { "epoch": 0.921747042766151, "grad_norm": 0.7242613606214666, "learning_rate": 4.592710448068043e-06, "loss": 0.0656, "step": 2026 }, { "epoch": 0.9222020018198362, "grad_norm": 0.7089132611487177, "learning_rate": 4.592319397148936e-06, "loss": 0.0554, "step": 2027 }, { "epoch": 0.9226569608735213, "grad_norm": 0.720645938837672, "learning_rate": 4.5919281752549386e-06, "loss": 0.0663, "step": 2028 }, { "epoch": 0.9231119199272065, "grad_norm": 0.9380123278339468, "learning_rate": 4.5915367824180165e-06, "loss": 0.0853, "step": 2029 }, { "epoch": 0.9235668789808917, "grad_norm": 0.533238980696019, "learning_rate": 4.591145218670154e-06, "loss": 0.0412, "step": 2030 }, { "epoch": 0.9240218380345769, "grad_norm": 0.8424113030761979, "learning_rate": 4.590753484043348e-06, "loss": 0.0737, "step": 2031 }, { "epoch": 0.924476797088262, "grad_norm": 1.0925264051066994, "learning_rate": 4.590361578569609e-06, "loss": 0.1022, "step": 2032 }, { "epoch": 0.9249317561419472, "grad_norm": 0.6770217428708925, "learning_rate": 4.589969502280962e-06, "loss": 0.0541, "step": 2033 }, { "epoch": 0.9253867151956324, "grad_norm": 0.57050505469179, "learning_rate": 4.589577255209445e-06, "loss": 0.0562, "step": 2034 }, { "epoch": 0.9258416742493175, "grad_norm": 0.7685735318891608, "learning_rate": 4.589184837387112e-06, "loss": 0.0633, "step": 2035 }, { "epoch": 0.9262966333030027, "grad_norm": 0.618583197540069, "learning_rate": 4.588792248846028e-06, "loss": 0.054, "step": 2036 }, { "epoch": 0.9267515923566879, "grad_norm": 1.3624730369026448, "learning_rate": 4.588399489618274e-06, "loss": 0.0878, "step": 2037 }, { "epoch": 0.9272065514103731, "grad_norm": 0.565791252388298, "learning_rate": 4.588006559735945e-06, "loss": 0.0572, "step": 2038 }, { "epoch": 0.9276615104640582, "grad_norm": 0.693373847149278, "learning_rate": 4.587613459231149e-06, "loss": 0.0607, "step": 2039 }, { "epoch": 0.9281164695177434, "grad_norm": 0.8060935136542962, "learning_rate": 4.5872201881360105e-06, "loss": 0.0739, "step": 2040 }, { "epoch": 0.9285714285714286, "grad_norm": 0.7780880150619168, "learning_rate": 4.586826746482662e-06, "loss": 0.081, "step": 2041 }, { "epoch": 0.9290263876251137, "grad_norm": 0.6712346760538427, "learning_rate": 4.586433134303257e-06, "loss": 0.06, "step": 2042 }, { "epoch": 0.9294813466787989, "grad_norm": 0.7786901525036622, "learning_rate": 4.586039351629959e-06, "loss": 0.0655, "step": 2043 }, { "epoch": 0.9299363057324841, "grad_norm": 0.47897943126694803, "learning_rate": 4.585645398494944e-06, "loss": 0.0376, "step": 2044 }, { "epoch": 0.9303912647861693, "grad_norm": 0.6962412449304831, "learning_rate": 4.585251274930406e-06, "loss": 0.0606, "step": 2045 }, { "epoch": 0.9308462238398544, "grad_norm": 1.115044271168298, "learning_rate": 4.584856980968552e-06, "loss": 0.0868, "step": 2046 }, { "epoch": 0.9313011828935396, "grad_norm": 0.7659294032948517, "learning_rate": 4.584462516641599e-06, "loss": 0.0775, "step": 2047 }, { "epoch": 0.9317561419472248, "grad_norm": 0.6771011845137347, "learning_rate": 4.584067881981784e-06, "loss": 0.059, "step": 2048 }, { "epoch": 0.9322111010009099, "grad_norm": 0.5697595033358009, "learning_rate": 4.583673077021352e-06, "loss": 0.0415, "step": 2049 }, { "epoch": 0.9326660600545951, "grad_norm": 0.7835573744548883, "learning_rate": 4.583278101792567e-06, "loss": 0.0708, "step": 2050 }, { "epoch": 0.9331210191082803, "grad_norm": 0.7984504611053544, "learning_rate": 4.582882956327704e-06, "loss": 0.0645, "step": 2051 }, { "epoch": 0.9335759781619655, "grad_norm": 0.8236288743661234, "learning_rate": 4.58248764065905e-06, "loss": 0.0631, "step": 2052 }, { "epoch": 0.9340309372156506, "grad_norm": 0.6397509690764358, "learning_rate": 4.582092154818912e-06, "loss": 0.0611, "step": 2053 }, { "epoch": 0.9344858962693358, "grad_norm": 0.7767543633533711, "learning_rate": 4.581696498839605e-06, "loss": 0.0744, "step": 2054 }, { "epoch": 0.934940855323021, "grad_norm": 0.6701944816432484, "learning_rate": 4.581300672753462e-06, "loss": 0.0675, "step": 2055 }, { "epoch": 0.935395814376706, "grad_norm": 0.5629715439636069, "learning_rate": 4.580904676592826e-06, "loss": 0.0446, "step": 2056 }, { "epoch": 0.9358507734303912, "grad_norm": 0.6828299934760278, "learning_rate": 4.580508510390057e-06, "loss": 0.0461, "step": 2057 }, { "epoch": 0.9363057324840764, "grad_norm": 0.8984515044644055, "learning_rate": 4.580112174177529e-06, "loss": 0.0915, "step": 2058 }, { "epoch": 0.9367606915377616, "grad_norm": 0.6175632910426854, "learning_rate": 4.5797156679876274e-06, "loss": 0.0507, "step": 2059 }, { "epoch": 0.9372156505914467, "grad_norm": 0.6720822499361706, "learning_rate": 4.5793189918527524e-06, "loss": 0.0749, "step": 2060 }, { "epoch": 0.9376706096451319, "grad_norm": 0.6589572570555299, "learning_rate": 4.5789221458053205e-06, "loss": 0.0583, "step": 2061 }, { "epoch": 0.9381255686988171, "grad_norm": 1.1346300175180764, "learning_rate": 4.578525129877759e-06, "loss": 0.0779, "step": 2062 }, { "epoch": 0.9385805277525022, "grad_norm": 0.6833534458218872, "learning_rate": 4.5781279441025105e-06, "loss": 0.0719, "step": 2063 }, { "epoch": 0.9390354868061874, "grad_norm": 0.7906899586059671, "learning_rate": 4.577730588512031e-06, "loss": 0.0717, "step": 2064 }, { "epoch": 0.9394904458598726, "grad_norm": 0.9641443471266038, "learning_rate": 4.577333063138791e-06, "loss": 0.0751, "step": 2065 }, { "epoch": 0.9399454049135578, "grad_norm": 0.6129982759361653, "learning_rate": 4.576935368015274e-06, "loss": 0.0535, "step": 2066 }, { "epoch": 0.9404003639672429, "grad_norm": 0.9260468404104475, "learning_rate": 4.576537503173978e-06, "loss": 0.1152, "step": 2067 }, { "epoch": 0.9408553230209281, "grad_norm": 0.8261242184140701, "learning_rate": 4.576139468647415e-06, "loss": 0.0671, "step": 2068 }, { "epoch": 0.9413102820746133, "grad_norm": 0.6924666122798075, "learning_rate": 4.575741264468111e-06, "loss": 0.0466, "step": 2069 }, { "epoch": 0.9417652411282984, "grad_norm": 0.874077324118521, "learning_rate": 4.575342890668603e-06, "loss": 0.0706, "step": 2070 }, { "epoch": 0.9422202001819836, "grad_norm": 1.1818230514353092, "learning_rate": 4.574944347281448e-06, "loss": 0.1147, "step": 2071 }, { "epoch": 0.9426751592356688, "grad_norm": 0.7829196495955753, "learning_rate": 4.5745456343392114e-06, "loss": 0.0549, "step": 2072 }, { "epoch": 0.943130118289354, "grad_norm": 0.7015791020130244, "learning_rate": 4.574146751874473e-06, "loss": 0.0719, "step": 2073 }, { "epoch": 0.9435850773430391, "grad_norm": 0.7725458409426254, "learning_rate": 4.57374769991983e-06, "loss": 0.0672, "step": 2074 }, { "epoch": 0.9440400363967243, "grad_norm": 0.5494001673465373, "learning_rate": 4.573348478507888e-06, "loss": 0.0492, "step": 2075 }, { "epoch": 0.9444949954504095, "grad_norm": 1.2424508447918836, "learning_rate": 4.5729490876712725e-06, "loss": 0.1248, "step": 2076 }, { "epoch": 0.9449499545040946, "grad_norm": 0.7654468382081444, "learning_rate": 4.572549527442619e-06, "loss": 0.066, "step": 2077 }, { "epoch": 0.9454049135577798, "grad_norm": 0.7504667900780868, "learning_rate": 4.572149797854578e-06, "loss": 0.0715, "step": 2078 }, { "epoch": 0.945859872611465, "grad_norm": 0.5821750562736777, "learning_rate": 4.571749898939813e-06, "loss": 0.0488, "step": 2079 }, { "epoch": 0.9463148316651502, "grad_norm": 0.5995576604839662, "learning_rate": 4.5713498307310024e-06, "loss": 0.0467, "step": 2080 }, { "epoch": 0.9467697907188353, "grad_norm": 0.5193399380100792, "learning_rate": 4.570949593260837e-06, "loss": 0.0418, "step": 2081 }, { "epoch": 0.9472247497725205, "grad_norm": 0.7726672189615522, "learning_rate": 4.570549186562024e-06, "loss": 0.068, "step": 2082 }, { "epoch": 0.9476797088262057, "grad_norm": 0.9870162190322213, "learning_rate": 4.570148610667281e-06, "loss": 0.0785, "step": 2083 }, { "epoch": 0.9481346678798908, "grad_norm": 1.1745928555777398, "learning_rate": 4.569747865609343e-06, "loss": 0.0952, "step": 2084 }, { "epoch": 0.948589626933576, "grad_norm": 0.985185956121142, "learning_rate": 4.569346951420957e-06, "loss": 0.0928, "step": 2085 }, { "epoch": 0.9490445859872612, "grad_norm": 0.842113304644101, "learning_rate": 4.568945868134882e-06, "loss": 0.0758, "step": 2086 }, { "epoch": 0.9494995450409464, "grad_norm": 0.9148030370424597, "learning_rate": 4.568544615783894e-06, "loss": 0.0757, "step": 2087 }, { "epoch": 0.9499545040946314, "grad_norm": 0.6563897177314274, "learning_rate": 4.568143194400782e-06, "loss": 0.054, "step": 2088 }, { "epoch": 0.9504094631483166, "grad_norm": 0.7611486885212506, "learning_rate": 4.567741604018348e-06, "loss": 0.0636, "step": 2089 }, { "epoch": 0.9508644222020018, "grad_norm": 1.0072775936266698, "learning_rate": 4.567339844669407e-06, "loss": 0.0832, "step": 2090 }, { "epoch": 0.9513193812556869, "grad_norm": 0.574494067350516, "learning_rate": 4.566937916386791e-06, "loss": 0.0586, "step": 2091 }, { "epoch": 0.9517743403093721, "grad_norm": 0.6300268259772549, "learning_rate": 4.566535819203342e-06, "loss": 0.0632, "step": 2092 }, { "epoch": 0.9522292993630573, "grad_norm": 0.7929310570764979, "learning_rate": 4.566133553151918e-06, "loss": 0.0818, "step": 2093 }, { "epoch": 0.9526842584167425, "grad_norm": 0.807980183016958, "learning_rate": 4.565731118265392e-06, "loss": 0.0699, "step": 2094 }, { "epoch": 0.9531392174704276, "grad_norm": 1.0129943914072512, "learning_rate": 4.5653285145766465e-06, "loss": 0.0819, "step": 2095 }, { "epoch": 0.9535941765241128, "grad_norm": 0.41876997993238735, "learning_rate": 4.564925742118583e-06, "loss": 0.0357, "step": 2096 }, { "epoch": 0.954049135577798, "grad_norm": 0.9399575388698519, "learning_rate": 4.564522800924111e-06, "loss": 0.0924, "step": 2097 }, { "epoch": 0.9545040946314831, "grad_norm": 0.5866883608136003, "learning_rate": 4.56411969102616e-06, "loss": 0.0542, "step": 2098 }, { "epoch": 0.9549590536851683, "grad_norm": 0.7058269296544326, "learning_rate": 4.5637164124576695e-06, "loss": 0.0692, "step": 2099 }, { "epoch": 0.9554140127388535, "grad_norm": 0.589695685362554, "learning_rate": 4.563312965251594e-06, "loss": 0.0477, "step": 2100 }, { "epoch": 0.9558689717925387, "grad_norm": 0.767068461359846, "learning_rate": 4.562909349440899e-06, "loss": 0.069, "step": 2101 }, { "epoch": 0.9563239308462238, "grad_norm": 0.9615092558046602, "learning_rate": 4.5625055650585695e-06, "loss": 0.0587, "step": 2102 }, { "epoch": 0.956778889899909, "grad_norm": 0.7392367463114865, "learning_rate": 4.562101612137599e-06, "loss": 0.0637, "step": 2103 }, { "epoch": 0.9572338489535942, "grad_norm": 0.6001888352855437, "learning_rate": 4.561697490710998e-06, "loss": 0.0549, "step": 2104 }, { "epoch": 0.9576888080072793, "grad_norm": 0.7701090418633465, "learning_rate": 4.561293200811787e-06, "loss": 0.0652, "step": 2105 }, { "epoch": 0.9581437670609645, "grad_norm": 0.9986731635993634, "learning_rate": 4.560888742473005e-06, "loss": 0.0904, "step": 2106 }, { "epoch": 0.9585987261146497, "grad_norm": 0.9421619956607125, "learning_rate": 4.560484115727703e-06, "loss": 0.0822, "step": 2107 }, { "epoch": 0.9590536851683349, "grad_norm": 0.6209809874242187, "learning_rate": 4.560079320608942e-06, "loss": 0.0553, "step": 2108 }, { "epoch": 0.95950864422202, "grad_norm": 0.6196885855952039, "learning_rate": 4.5596743571498035e-06, "loss": 0.0592, "step": 2109 }, { "epoch": 0.9599636032757052, "grad_norm": 0.8124501053975468, "learning_rate": 4.5592692253833775e-06, "loss": 0.0583, "step": 2110 }, { "epoch": 0.9604185623293904, "grad_norm": 0.7003649102713996, "learning_rate": 4.5588639253427705e-06, "loss": 0.0523, "step": 2111 }, { "epoch": 0.9608735213830755, "grad_norm": 0.6112101133803953, "learning_rate": 4.558458457061101e-06, "loss": 0.0513, "step": 2112 }, { "epoch": 0.9613284804367607, "grad_norm": 0.7914680473124714, "learning_rate": 4.5580528205715024e-06, "loss": 0.0618, "step": 2113 }, { "epoch": 0.9617834394904459, "grad_norm": 0.9477327883059017, "learning_rate": 4.557647015907121e-06, "loss": 0.0786, "step": 2114 }, { "epoch": 0.9622383985441311, "grad_norm": 0.6623493696862014, "learning_rate": 4.557241043101118e-06, "loss": 0.058, "step": 2115 }, { "epoch": 0.9626933575978162, "grad_norm": 0.8423723830051886, "learning_rate": 4.556834902186667e-06, "loss": 0.0872, "step": 2116 }, { "epoch": 0.9631483166515014, "grad_norm": 0.6740938824585692, "learning_rate": 4.556428593196956e-06, "loss": 0.0546, "step": 2117 }, { "epoch": 0.9636032757051866, "grad_norm": 1.1955334494093306, "learning_rate": 4.556022116165189e-06, "loss": 0.1227, "step": 2118 }, { "epoch": 0.9640582347588716, "grad_norm": 0.9895083968984689, "learning_rate": 4.555615471124578e-06, "loss": 0.0802, "step": 2119 }, { "epoch": 0.9645131938125568, "grad_norm": 0.7647543998706896, "learning_rate": 4.555208658108354e-06, "loss": 0.0514, "step": 2120 }, { "epoch": 0.964968152866242, "grad_norm": 0.7129470981994964, "learning_rate": 4.55480167714976e-06, "loss": 0.0548, "step": 2121 }, { "epoch": 0.9654231119199272, "grad_norm": 0.7053376734462564, "learning_rate": 4.554394528282052e-06, "loss": 0.0761, "step": 2122 }, { "epoch": 0.9658780709736123, "grad_norm": 0.6283973436454654, "learning_rate": 4.553987211538501e-06, "loss": 0.0502, "step": 2123 }, { "epoch": 0.9663330300272975, "grad_norm": 0.9170409664655178, "learning_rate": 4.5535797269523906e-06, "loss": 0.0784, "step": 2124 }, { "epoch": 0.9667879890809827, "grad_norm": 0.8287926240908017, "learning_rate": 4.55317207455702e-06, "loss": 0.061, "step": 2125 }, { "epoch": 0.9672429481346679, "grad_norm": 0.6405681723490552, "learning_rate": 4.552764254385697e-06, "loss": 0.0662, "step": 2126 }, { "epoch": 0.967697907188353, "grad_norm": 0.8088482444470904, "learning_rate": 4.552356266471751e-06, "loss": 0.0582, "step": 2127 }, { "epoch": 0.9681528662420382, "grad_norm": 0.7353313299484825, "learning_rate": 4.55194811084852e-06, "loss": 0.0654, "step": 2128 }, { "epoch": 0.9686078252957234, "grad_norm": 1.1516254024678405, "learning_rate": 4.551539787549354e-06, "loss": 0.0936, "step": 2129 }, { "epoch": 0.9690627843494085, "grad_norm": 0.6501936911095981, "learning_rate": 4.551131296607623e-06, "loss": 0.0417, "step": 2130 }, { "epoch": 0.9695177434030937, "grad_norm": 0.7334920432846229, "learning_rate": 4.550722638056703e-06, "loss": 0.0526, "step": 2131 }, { "epoch": 0.9699727024567789, "grad_norm": 0.7198129404059547, "learning_rate": 4.550313811929993e-06, "loss": 0.0545, "step": 2132 }, { "epoch": 0.9704276615104641, "grad_norm": 0.7616235023926703, "learning_rate": 4.549904818260895e-06, "loss": 0.0639, "step": 2133 }, { "epoch": 0.9708826205641492, "grad_norm": 0.9116016503086201, "learning_rate": 4.549495657082834e-06, "loss": 0.0889, "step": 2134 }, { "epoch": 0.9713375796178344, "grad_norm": 0.6685343538666026, "learning_rate": 4.549086328429242e-06, "loss": 0.0483, "step": 2135 }, { "epoch": 0.9717925386715196, "grad_norm": 0.9095371650600752, "learning_rate": 4.548676832333569e-06, "loss": 0.0706, "step": 2136 }, { "epoch": 0.9722474977252047, "grad_norm": 0.7169076791465431, "learning_rate": 4.548267168829279e-06, "loss": 0.0525, "step": 2137 }, { "epoch": 0.9727024567788899, "grad_norm": 0.7564722267203925, "learning_rate": 4.547857337949844e-06, "loss": 0.0598, "step": 2138 }, { "epoch": 0.9731574158325751, "grad_norm": 0.6757823684126535, "learning_rate": 4.5474473397287556e-06, "loss": 0.0498, "step": 2139 }, { "epoch": 0.9736123748862603, "grad_norm": 0.6156762948567482, "learning_rate": 4.547037174199517e-06, "loss": 0.0534, "step": 2140 }, { "epoch": 0.9740673339399454, "grad_norm": 1.039052452467343, "learning_rate": 4.546626841395645e-06, "loss": 0.084, "step": 2141 }, { "epoch": 0.9745222929936306, "grad_norm": 0.9845877678571373, "learning_rate": 4.54621634135067e-06, "loss": 0.0628, "step": 2142 }, { "epoch": 0.9749772520473158, "grad_norm": 0.9305858125408707, "learning_rate": 4.545805674098136e-06, "loss": 0.0759, "step": 2143 }, { "epoch": 0.9754322111010009, "grad_norm": 0.8876941693028888, "learning_rate": 4.545394839671601e-06, "loss": 0.0741, "step": 2144 }, { "epoch": 0.9758871701546861, "grad_norm": 0.8796654962506077, "learning_rate": 4.544983838104637e-06, "loss": 0.0776, "step": 2145 }, { "epoch": 0.9763421292083713, "grad_norm": 0.7920165529963009, "learning_rate": 4.544572669430828e-06, "loss": 0.0786, "step": 2146 }, { "epoch": 0.9767970882620565, "grad_norm": 1.1006660615484487, "learning_rate": 4.544161333683775e-06, "loss": 0.1067, "step": 2147 }, { "epoch": 0.9772520473157416, "grad_norm": 0.7917410075321241, "learning_rate": 4.543749830897088e-06, "loss": 0.08, "step": 2148 }, { "epoch": 0.9777070063694268, "grad_norm": 0.9376537537983477, "learning_rate": 4.543338161104395e-06, "loss": 0.0809, "step": 2149 }, { "epoch": 0.978161965423112, "grad_norm": 0.6909374452010633, "learning_rate": 4.542926324339335e-06, "loss": 0.0599, "step": 2150 }, { "epoch": 0.978616924476797, "grad_norm": 0.9041638003887569, "learning_rate": 4.542514320635561e-06, "loss": 0.0897, "step": 2151 }, { "epoch": 0.9790718835304822, "grad_norm": 1.0016444468044507, "learning_rate": 4.542102150026741e-06, "loss": 0.0899, "step": 2152 }, { "epoch": 0.9795268425841674, "grad_norm": 0.8098413909616191, "learning_rate": 4.541689812546556e-06, "loss": 0.0824, "step": 2153 }, { "epoch": 0.9799818016378526, "grad_norm": 0.8781062759872817, "learning_rate": 4.541277308228698e-06, "loss": 0.0835, "step": 2154 }, { "epoch": 0.9804367606915377, "grad_norm": 0.6567263955436328, "learning_rate": 4.540864637106879e-06, "loss": 0.0561, "step": 2155 }, { "epoch": 0.9808917197452229, "grad_norm": 0.8959082251283595, "learning_rate": 4.540451799214817e-06, "loss": 0.0526, "step": 2156 }, { "epoch": 0.9813466787989081, "grad_norm": 0.6968673717470518, "learning_rate": 4.540038794586248e-06, "loss": 0.0559, "step": 2157 }, { "epoch": 0.9818016378525932, "grad_norm": 0.6576045950224041, "learning_rate": 4.539625623254923e-06, "loss": 0.048, "step": 2158 }, { "epoch": 0.9822565969062784, "grad_norm": 1.1320950822163716, "learning_rate": 4.539212285254601e-06, "loss": 0.1112, "step": 2159 }, { "epoch": 0.9827115559599636, "grad_norm": 0.7758556663067462, "learning_rate": 4.5387987806190615e-06, "loss": 0.0565, "step": 2160 }, { "epoch": 0.9831665150136488, "grad_norm": 0.8190828028341962, "learning_rate": 4.538385109382093e-06, "loss": 0.0757, "step": 2161 }, { "epoch": 0.9836214740673339, "grad_norm": 0.7293574919811723, "learning_rate": 4.537971271577498e-06, "loss": 0.0739, "step": 2162 }, { "epoch": 0.9840764331210191, "grad_norm": 0.5802134505622355, "learning_rate": 4.537557267239093e-06, "loss": 0.0544, "step": 2163 }, { "epoch": 0.9845313921747043, "grad_norm": 0.5765626125784856, "learning_rate": 4.537143096400712e-06, "loss": 0.0465, "step": 2164 }, { "epoch": 0.9849863512283894, "grad_norm": 0.97068469033027, "learning_rate": 4.536728759096195e-06, "loss": 0.075, "step": 2165 }, { "epoch": 0.9854413102820746, "grad_norm": 0.7198973054526564, "learning_rate": 4.536314255359402e-06, "loss": 0.0574, "step": 2166 }, { "epoch": 0.9858962693357598, "grad_norm": 0.8011541580376177, "learning_rate": 4.535899585224204e-06, "loss": 0.0652, "step": 2167 }, { "epoch": 0.986351228389445, "grad_norm": 0.5636355433670379, "learning_rate": 4.535484748724486e-06, "loss": 0.0462, "step": 2168 }, { "epoch": 0.9868061874431301, "grad_norm": 0.5710469301327791, "learning_rate": 4.535069745894147e-06, "loss": 0.057, "step": 2169 }, { "epoch": 0.9872611464968153, "grad_norm": 0.8053320529662169, "learning_rate": 4.534654576767098e-06, "loss": 0.0729, "step": 2170 }, { "epoch": 0.9877161055505005, "grad_norm": 0.9884868888578544, "learning_rate": 4.534239241377266e-06, "loss": 0.1066, "step": 2171 }, { "epoch": 0.9881710646041856, "grad_norm": 0.8252157658838575, "learning_rate": 4.5338237397585895e-06, "loss": 0.0975, "step": 2172 }, { "epoch": 0.9886260236578708, "grad_norm": 0.929818227033473, "learning_rate": 4.533408071945021e-06, "loss": 0.0481, "step": 2173 }, { "epoch": 0.989080982711556, "grad_norm": 0.7508255872462847, "learning_rate": 4.532992237970528e-06, "loss": 0.0626, "step": 2174 }, { "epoch": 0.9895359417652412, "grad_norm": 0.5839159131330265, "learning_rate": 4.532576237869091e-06, "loss": 0.0394, "step": 2175 }, { "epoch": 0.9899909008189263, "grad_norm": 0.5644496220261198, "learning_rate": 4.5321600716747025e-06, "loss": 0.0435, "step": 2176 }, { "epoch": 0.9904458598726115, "grad_norm": 0.7793906682399298, "learning_rate": 4.531743739421369e-06, "loss": 0.0609, "step": 2177 }, { "epoch": 0.9909008189262967, "grad_norm": 0.5892429799760452, "learning_rate": 4.531327241143114e-06, "loss": 0.0419, "step": 2178 }, { "epoch": 0.9913557779799818, "grad_norm": 0.8986290228451513, "learning_rate": 4.530910576873969e-06, "loss": 0.0614, "step": 2179 }, { "epoch": 0.991810737033667, "grad_norm": 0.6316271594357544, "learning_rate": 4.530493746647984e-06, "loss": 0.049, "step": 2180 }, { "epoch": 0.9922656960873522, "grad_norm": 0.862209961975702, "learning_rate": 4.530076750499219e-06, "loss": 0.0797, "step": 2181 }, { "epoch": 0.9927206551410374, "grad_norm": 0.8913393772459259, "learning_rate": 4.52965958846175e-06, "loss": 0.0754, "step": 2182 }, { "epoch": 0.9931756141947224, "grad_norm": 0.6021201567582666, "learning_rate": 4.529242260569665e-06, "loss": 0.0433, "step": 2183 }, { "epoch": 0.9936305732484076, "grad_norm": 1.1143487968824526, "learning_rate": 4.528824766857067e-06, "loss": 0.0933, "step": 2184 }, { "epoch": 0.9940855323020928, "grad_norm": 0.8397193906783932, "learning_rate": 4.5284071073580715e-06, "loss": 0.071, "step": 2185 }, { "epoch": 0.9945404913557779, "grad_norm": 0.7046300180509759, "learning_rate": 4.527989282106807e-06, "loss": 0.0553, "step": 2186 }, { "epoch": 0.9949954504094631, "grad_norm": 1.2273654625571622, "learning_rate": 4.527571291137416e-06, "loss": 0.0826, "step": 2187 }, { "epoch": 0.9954504094631483, "grad_norm": 0.8674968100145852, "learning_rate": 4.527153134484056e-06, "loss": 0.0793, "step": 2188 }, { "epoch": 0.9959053685168335, "grad_norm": 0.7590194710414102, "learning_rate": 4.5267348121808965e-06, "loss": 0.0627, "step": 2189 }, { "epoch": 0.9963603275705186, "grad_norm": 0.6472950730058415, "learning_rate": 4.526316324262121e-06, "loss": 0.072, "step": 2190 }, { "epoch": 0.9968152866242038, "grad_norm": 0.7337103448736059, "learning_rate": 4.525897670761926e-06, "loss": 0.0535, "step": 2191 }, { "epoch": 0.997270245677889, "grad_norm": 0.8681401412263181, "learning_rate": 4.525478851714522e-06, "loss": 0.0715, "step": 2192 }, { "epoch": 0.9977252047315741, "grad_norm": 0.8136877497937504, "learning_rate": 4.525059867154133e-06, "loss": 0.069, "step": 2193 }, { "epoch": 0.9981801637852593, "grad_norm": 0.7165865552910584, "learning_rate": 4.5246407171149975e-06, "loss": 0.0701, "step": 2194 }, { "epoch": 0.9986351228389445, "grad_norm": 0.7119563605507941, "learning_rate": 4.5242214016313655e-06, "loss": 0.0605, "step": 2195 }, { "epoch": 0.9990900818926297, "grad_norm": 0.5906420983866724, "learning_rate": 4.523801920737501e-06, "loss": 0.0537, "step": 2196 }, { "epoch": 0.9995450409463148, "grad_norm": 0.6793422286647208, "learning_rate": 4.523382274467684e-06, "loss": 0.0601, "step": 2197 }, { "epoch": 1.0, "grad_norm": 0.3861471518492922, "learning_rate": 4.522962462856206e-06, "loss": 0.0229, "step": 2198 }, { "epoch": 1.000454959053685, "grad_norm": 0.4134000717198071, "learning_rate": 4.522542485937369e-06, "loss": 0.0162, "step": 2199 }, { "epoch": 1.0009099181073704, "grad_norm": 0.47305029810650895, "learning_rate": 4.522122343745495e-06, "loss": 0.0251, "step": 2200 }, { "epoch": 1.0013648771610555, "grad_norm": 0.5407710422461701, "learning_rate": 4.521702036314915e-06, "loss": 0.0305, "step": 2201 }, { "epoch": 1.0018198362147406, "grad_norm": 0.3163516556985795, "learning_rate": 4.521281563679973e-06, "loss": 0.0169, "step": 2202 }, { "epoch": 1.0022747952684259, "grad_norm": 0.46270617626451155, "learning_rate": 4.5208609258750314e-06, "loss": 0.0208, "step": 2203 }, { "epoch": 1.002729754322111, "grad_norm": 0.37292336110835755, "learning_rate": 4.52044012293446e-06, "loss": 0.0261, "step": 2204 }, { "epoch": 1.0031847133757963, "grad_norm": 0.4740242480862373, "learning_rate": 4.520019154892646e-06, "loss": 0.0309, "step": 2205 }, { "epoch": 1.0036396724294814, "grad_norm": 0.9587535207840793, "learning_rate": 4.519598021783989e-06, "loss": 0.0473, "step": 2206 }, { "epoch": 1.0040946314831665, "grad_norm": 0.5020844798119457, "learning_rate": 4.519176723642903e-06, "loss": 0.0379, "step": 2207 }, { "epoch": 1.0045495905368518, "grad_norm": 0.5844057078703908, "learning_rate": 4.518755260503813e-06, "loss": 0.0264, "step": 2208 }, { "epoch": 1.0050045495905369, "grad_norm": 0.3907648371732826, "learning_rate": 4.51833363240116e-06, "loss": 0.0246, "step": 2209 }, { "epoch": 1.005459508644222, "grad_norm": 0.3437099134045075, "learning_rate": 4.517911839369398e-06, "loss": 0.0215, "step": 2210 }, { "epoch": 1.0059144676979073, "grad_norm": 0.48723727398361594, "learning_rate": 4.517489881442993e-06, "loss": 0.0221, "step": 2211 }, { "epoch": 1.0063694267515924, "grad_norm": 0.36188847497976756, "learning_rate": 4.517067758656424e-06, "loss": 0.0152, "step": 2212 }, { "epoch": 1.0068243858052774, "grad_norm": 0.32768362464406736, "learning_rate": 4.516645471044188e-06, "loss": 0.0137, "step": 2213 }, { "epoch": 1.0072793448589628, "grad_norm": 0.35318195696428845, "learning_rate": 4.516223018640791e-06, "loss": 0.0175, "step": 2214 }, { "epoch": 1.0077343039126478, "grad_norm": 0.36650556359181136, "learning_rate": 4.515800401480754e-06, "loss": 0.0225, "step": 2215 }, { "epoch": 1.008189262966333, "grad_norm": 0.42431668473537093, "learning_rate": 4.515377619598612e-06, "loss": 0.0251, "step": 2216 }, { "epoch": 1.0086442220200182, "grad_norm": 0.42493861735993127, "learning_rate": 4.514954673028913e-06, "loss": 0.022, "step": 2217 }, { "epoch": 1.0090991810737033, "grad_norm": 0.31532925739877793, "learning_rate": 4.5145315618062155e-06, "loss": 0.0167, "step": 2218 }, { "epoch": 1.0095541401273886, "grad_norm": 0.6455510153155769, "learning_rate": 4.514108285965098e-06, "loss": 0.0279, "step": 2219 }, { "epoch": 1.0100090991810737, "grad_norm": 0.4942618390077279, "learning_rate": 4.513684845540146e-06, "loss": 0.0281, "step": 2220 }, { "epoch": 1.0104640582347588, "grad_norm": 0.5202505979006149, "learning_rate": 4.5132612405659625e-06, "loss": 0.0352, "step": 2221 }, { "epoch": 1.0109190172884441, "grad_norm": 0.29852324614455794, "learning_rate": 4.5128374710771625e-06, "loss": 0.0125, "step": 2222 }, { "epoch": 1.0113739763421292, "grad_norm": 0.6497393685341776, "learning_rate": 4.512413537108374e-06, "loss": 0.0418, "step": 2223 }, { "epoch": 1.0118289353958143, "grad_norm": 0.5879513139032745, "learning_rate": 4.511989438694239e-06, "loss": 0.024, "step": 2224 }, { "epoch": 1.0122838944494996, "grad_norm": 0.4735839819820227, "learning_rate": 4.511565175869415e-06, "loss": 0.0136, "step": 2225 }, { "epoch": 1.0127388535031847, "grad_norm": 0.4033061216793247, "learning_rate": 4.511140748668566e-06, "loss": 0.0205, "step": 2226 }, { "epoch": 1.0131938125568698, "grad_norm": 0.4374666494282639, "learning_rate": 4.510716157126379e-06, "loss": 0.0219, "step": 2227 }, { "epoch": 1.013648771610555, "grad_norm": 0.46869316743808964, "learning_rate": 4.510291401277548e-06, "loss": 0.0255, "step": 2228 }, { "epoch": 1.0141037306642402, "grad_norm": 0.5854010070199603, "learning_rate": 4.509866481156781e-06, "loss": 0.0241, "step": 2229 }, { "epoch": 1.0145586897179253, "grad_norm": 0.39945011653857926, "learning_rate": 4.509441396798802e-06, "loss": 0.0193, "step": 2230 }, { "epoch": 1.0150136487716106, "grad_norm": 0.49690934996195507, "learning_rate": 4.5090161482383475e-06, "loss": 0.0175, "step": 2231 }, { "epoch": 1.0154686078252957, "grad_norm": 0.4748457609386695, "learning_rate": 4.508590735510166e-06, "loss": 0.0227, "step": 2232 }, { "epoch": 1.015923566878981, "grad_norm": 0.49987588415101697, "learning_rate": 4.508165158649019e-06, "loss": 0.0245, "step": 2233 }, { "epoch": 1.016378525932666, "grad_norm": 0.3789941766757115, "learning_rate": 4.507739417689685e-06, "loss": 0.0139, "step": 2234 }, { "epoch": 1.0168334849863512, "grad_norm": 0.5234968222321746, "learning_rate": 4.507313512666953e-06, "loss": 0.0177, "step": 2235 }, { "epoch": 1.0172884440400365, "grad_norm": 0.39843093182948436, "learning_rate": 4.506887443615625e-06, "loss": 0.0153, "step": 2236 }, { "epoch": 1.0177434030937216, "grad_norm": 0.5143152978261103, "learning_rate": 4.506461210570518e-06, "loss": 0.0194, "step": 2237 }, { "epoch": 1.0181983621474067, "grad_norm": 0.6529575725615211, "learning_rate": 4.506034813566462e-06, "loss": 0.0266, "step": 2238 }, { "epoch": 1.018653321201092, "grad_norm": 0.48382413646434125, "learning_rate": 4.505608252638301e-06, "loss": 0.0236, "step": 2239 }, { "epoch": 1.019108280254777, "grad_norm": 0.6312423204307113, "learning_rate": 4.50518152782089e-06, "loss": 0.0249, "step": 2240 }, { "epoch": 1.0195632393084622, "grad_norm": 0.47154152319198106, "learning_rate": 4.504754639149101e-06, "loss": 0.0176, "step": 2241 }, { "epoch": 1.0200181983621475, "grad_norm": 0.3828630177013417, "learning_rate": 4.504327586657814e-06, "loss": 0.0147, "step": 2242 }, { "epoch": 1.0204731574158326, "grad_norm": 0.6397319142718383, "learning_rate": 4.50390037038193e-06, "loss": 0.0251, "step": 2243 }, { "epoch": 1.0209281164695176, "grad_norm": 0.5582288758170052, "learning_rate": 4.503472990356357e-06, "loss": 0.0291, "step": 2244 }, { "epoch": 1.021383075523203, "grad_norm": 0.3375136771592828, "learning_rate": 4.503045446616018e-06, "loss": 0.0129, "step": 2245 }, { "epoch": 1.021838034576888, "grad_norm": 0.4662408356158423, "learning_rate": 4.502617739195852e-06, "loss": 0.0224, "step": 2246 }, { "epoch": 1.0222929936305734, "grad_norm": 0.5410561625011929, "learning_rate": 4.502189868130807e-06, "loss": 0.0172, "step": 2247 }, { "epoch": 1.0227479526842584, "grad_norm": 0.654839238459795, "learning_rate": 4.501761833455849e-06, "loss": 0.0281, "step": 2248 }, { "epoch": 1.0232029117379435, "grad_norm": 0.6119106378791578, "learning_rate": 4.501333635205952e-06, "loss": 0.0172, "step": 2249 }, { "epoch": 1.0236578707916288, "grad_norm": 0.6002704645250265, "learning_rate": 4.5009052734161095e-06, "loss": 0.0278, "step": 2250 }, { "epoch": 1.024112829845314, "grad_norm": 0.562189478960442, "learning_rate": 4.500476748121324e-06, "loss": 0.0289, "step": 2251 }, { "epoch": 1.024567788898999, "grad_norm": 0.5573521615480775, "learning_rate": 4.500048059356613e-06, "loss": 0.0264, "step": 2252 }, { "epoch": 1.0250227479526843, "grad_norm": 0.35812836056261116, "learning_rate": 4.499619207157007e-06, "loss": 0.0142, "step": 2253 }, { "epoch": 1.0254777070063694, "grad_norm": 0.4466546832771037, "learning_rate": 4.499190191557549e-06, "loss": 0.0199, "step": 2254 }, { "epoch": 1.0259326660600545, "grad_norm": 0.7194400511344139, "learning_rate": 4.498761012593296e-06, "loss": 0.0329, "step": 2255 }, { "epoch": 1.0263876251137398, "grad_norm": 0.5968641194765588, "learning_rate": 4.498331670299321e-06, "loss": 0.0259, "step": 2256 }, { "epoch": 1.026842584167425, "grad_norm": 0.6396159557727437, "learning_rate": 4.497902164710704e-06, "loss": 0.0165, "step": 2257 }, { "epoch": 1.02729754322111, "grad_norm": 0.9501081798643956, "learning_rate": 4.497472495862547e-06, "loss": 0.0432, "step": 2258 }, { "epoch": 1.0277525022747953, "grad_norm": 0.40770955227461003, "learning_rate": 4.497042663789957e-06, "loss": 0.0153, "step": 2259 }, { "epoch": 1.0282074613284804, "grad_norm": 0.6626552748025197, "learning_rate": 4.496612668528059e-06, "loss": 0.0271, "step": 2260 }, { "epoch": 1.0286624203821657, "grad_norm": 0.5720200272418298, "learning_rate": 4.496182510111991e-06, "loss": 0.0331, "step": 2261 }, { "epoch": 1.0291173794358508, "grad_norm": 0.6122100316161389, "learning_rate": 4.495752188576902e-06, "loss": 0.0279, "step": 2262 }, { "epoch": 1.0295723384895359, "grad_norm": 0.5909094519020904, "learning_rate": 4.4953217039579574e-06, "loss": 0.0214, "step": 2263 }, { "epoch": 1.0300272975432212, "grad_norm": 0.8262480646318613, "learning_rate": 4.494891056290335e-06, "loss": 0.0359, "step": 2264 }, { "epoch": 1.0304822565969063, "grad_norm": 0.4058443771592801, "learning_rate": 4.494460245609223e-06, "loss": 0.0151, "step": 2265 }, { "epoch": 1.0309372156505914, "grad_norm": 0.8211955800040838, "learning_rate": 4.494029271949827e-06, "loss": 0.0286, "step": 2266 }, { "epoch": 1.0313921747042767, "grad_norm": 0.4785178376545247, "learning_rate": 4.493598135347363e-06, "loss": 0.0201, "step": 2267 }, { "epoch": 1.0318471337579618, "grad_norm": 1.0647622975253437, "learning_rate": 4.493166835837064e-06, "loss": 0.0296, "step": 2268 }, { "epoch": 1.0323020928116469, "grad_norm": 0.7293233621494204, "learning_rate": 4.492735373454171e-06, "loss": 0.0301, "step": 2269 }, { "epoch": 1.0327570518653322, "grad_norm": 0.4229410037536498, "learning_rate": 4.492303748233943e-06, "loss": 0.0153, "step": 2270 }, { "epoch": 1.0332120109190173, "grad_norm": 0.6975035519080148, "learning_rate": 4.49187196021165e-06, "loss": 0.0448, "step": 2271 }, { "epoch": 1.0336669699727024, "grad_norm": 0.6125495922729677, "learning_rate": 4.491440009422575e-06, "loss": 0.0243, "step": 2272 }, { "epoch": 1.0341219290263877, "grad_norm": 0.408105520826875, "learning_rate": 4.491007895902016e-06, "loss": 0.0132, "step": 2273 }, { "epoch": 1.0345768880800728, "grad_norm": 0.48877548043609653, "learning_rate": 4.490575619685283e-06, "loss": 0.0205, "step": 2274 }, { "epoch": 1.035031847133758, "grad_norm": 0.812892516681499, "learning_rate": 4.4901431808077e-06, "loss": 0.0343, "step": 2275 }, { "epoch": 1.0354868061874432, "grad_norm": 0.8938319297474577, "learning_rate": 4.489710579304603e-06, "loss": 0.0227, "step": 2276 }, { "epoch": 1.0359417652411282, "grad_norm": 0.6676116587013313, "learning_rate": 4.489277815211343e-06, "loss": 0.0223, "step": 2277 }, { "epoch": 1.0363967242948136, "grad_norm": 0.6495562102011189, "learning_rate": 4.488844888563284e-06, "loss": 0.0282, "step": 2278 }, { "epoch": 1.0368516833484986, "grad_norm": 0.7504240515946974, "learning_rate": 4.488411799395802e-06, "loss": 0.0192, "step": 2279 }, { "epoch": 1.0373066424021837, "grad_norm": 0.505052238930926, "learning_rate": 4.487978547744287e-06, "loss": 0.024, "step": 2280 }, { "epoch": 1.037761601455869, "grad_norm": 0.5936270855305887, "learning_rate": 4.487545133644143e-06, "loss": 0.024, "step": 2281 }, { "epoch": 1.0382165605095541, "grad_norm": 0.5321574319749766, "learning_rate": 4.487111557130787e-06, "loss": 0.026, "step": 2282 }, { "epoch": 1.0386715195632392, "grad_norm": 0.7478753443981572, "learning_rate": 4.486677818239647e-06, "loss": 0.0374, "step": 2283 }, { "epoch": 1.0391264786169245, "grad_norm": 0.6243001056584193, "learning_rate": 4.486243917006169e-06, "loss": 0.0229, "step": 2284 }, { "epoch": 1.0395814376706096, "grad_norm": 0.4929168743628238, "learning_rate": 4.485809853465807e-06, "loss": 0.018, "step": 2285 }, { "epoch": 1.0400363967242947, "grad_norm": 0.7513792925376249, "learning_rate": 4.4853756276540315e-06, "loss": 0.0243, "step": 2286 }, { "epoch": 1.04049135577798, "grad_norm": 0.39797569187526005, "learning_rate": 4.484941239606326e-06, "loss": 0.01, "step": 2287 }, { "epoch": 1.040946314831665, "grad_norm": 0.39764537771237807, "learning_rate": 4.484506689358186e-06, "loss": 0.0113, "step": 2288 }, { "epoch": 1.0414012738853504, "grad_norm": 0.4552203666266973, "learning_rate": 4.484071976945121e-06, "loss": 0.0129, "step": 2289 }, { "epoch": 1.0418562329390355, "grad_norm": 0.6376672211925216, "learning_rate": 4.483637102402655e-06, "loss": 0.0295, "step": 2290 }, { "epoch": 1.0423111919927206, "grad_norm": 0.6877079936099527, "learning_rate": 4.4832020657663224e-06, "loss": 0.0231, "step": 2291 }, { "epoch": 1.042766151046406, "grad_norm": 0.4953931162830707, "learning_rate": 4.482766867071673e-06, "loss": 0.0159, "step": 2292 }, { "epoch": 1.043221110100091, "grad_norm": 0.697817840638706, "learning_rate": 4.482331506354269e-06, "loss": 0.046, "step": 2293 }, { "epoch": 1.043676069153776, "grad_norm": 0.6432246627987923, "learning_rate": 4.4818959836496876e-06, "loss": 0.0132, "step": 2294 }, { "epoch": 1.0441310282074614, "grad_norm": 0.5013738883280774, "learning_rate": 4.481460298993515e-06, "loss": 0.0195, "step": 2295 }, { "epoch": 1.0445859872611465, "grad_norm": 0.6732371138033433, "learning_rate": 4.481024452421357e-06, "loss": 0.0257, "step": 2296 }, { "epoch": 1.0450409463148316, "grad_norm": 0.5971847786556302, "learning_rate": 4.480588443968825e-06, "loss": 0.0164, "step": 2297 }, { "epoch": 1.0454959053685169, "grad_norm": 0.5912246018330237, "learning_rate": 4.4801522736715505e-06, "loss": 0.0198, "step": 2298 }, { "epoch": 1.045950864422202, "grad_norm": 0.48571742963419795, "learning_rate": 4.479715941565174e-06, "loss": 0.0185, "step": 2299 }, { "epoch": 1.046405823475887, "grad_norm": 0.41924338496925545, "learning_rate": 4.4792794476853514e-06, "loss": 0.0147, "step": 2300 }, { "epoch": 1.0468607825295724, "grad_norm": 0.5297657413035841, "learning_rate": 4.47884279206775e-06, "loss": 0.0215, "step": 2301 }, { "epoch": 1.0473157415832575, "grad_norm": 0.4231923684228783, "learning_rate": 4.478405974748054e-06, "loss": 0.0167, "step": 2302 }, { "epoch": 1.0477707006369428, "grad_norm": 0.5639392018822912, "learning_rate": 4.477968995761954e-06, "loss": 0.0301, "step": 2303 }, { "epoch": 1.0482256596906279, "grad_norm": 0.6361502705633699, "learning_rate": 4.477531855145161e-06, "loss": 0.0235, "step": 2304 }, { "epoch": 1.048680618744313, "grad_norm": 0.5802868456297169, "learning_rate": 4.477094552933395e-06, "loss": 0.0153, "step": 2305 }, { "epoch": 1.0491355777979983, "grad_norm": 0.412107801815597, "learning_rate": 4.476657089162391e-06, "loss": 0.0217, "step": 2306 }, { "epoch": 1.0495905368516834, "grad_norm": 0.7669706620629979, "learning_rate": 4.476219463867897e-06, "loss": 0.0309, "step": 2307 }, { "epoch": 1.0500454959053684, "grad_norm": 0.6816466836381806, "learning_rate": 4.475781677085671e-06, "loss": 0.017, "step": 2308 }, { "epoch": 1.0505004549590538, "grad_norm": 0.4841095295414997, "learning_rate": 4.4753437288514904e-06, "loss": 0.0167, "step": 2309 }, { "epoch": 1.0509554140127388, "grad_norm": 0.5612261779618657, "learning_rate": 4.47490561920114e-06, "loss": 0.0153, "step": 2310 }, { "epoch": 1.051410373066424, "grad_norm": 0.638519143758431, "learning_rate": 4.474467348170421e-06, "loss": 0.0298, "step": 2311 }, { "epoch": 1.0518653321201092, "grad_norm": 0.5669351202592476, "learning_rate": 4.474028915795148e-06, "loss": 0.0282, "step": 2312 }, { "epoch": 1.0523202911737943, "grad_norm": 0.4347136897443792, "learning_rate": 4.473590322111145e-06, "loss": 0.0157, "step": 2313 }, { "epoch": 1.0527752502274794, "grad_norm": 0.4833057498174656, "learning_rate": 4.473151567154255e-06, "loss": 0.02, "step": 2314 }, { "epoch": 1.0532302092811647, "grad_norm": 0.4914062822782954, "learning_rate": 4.472712650960328e-06, "loss": 0.0181, "step": 2315 }, { "epoch": 1.0536851683348498, "grad_norm": 0.615271249953854, "learning_rate": 4.472273573565234e-06, "loss": 0.0327, "step": 2316 }, { "epoch": 1.0541401273885351, "grad_norm": 0.692992244874008, "learning_rate": 4.471834335004849e-06, "loss": 0.0235, "step": 2317 }, { "epoch": 1.0545950864422202, "grad_norm": 0.5490826845180317, "learning_rate": 4.471394935315067e-06, "loss": 0.0208, "step": 2318 }, { "epoch": 1.0550500454959053, "grad_norm": 0.5091849562822288, "learning_rate": 4.470955374531794e-06, "loss": 0.0164, "step": 2319 }, { "epoch": 1.0555050045495906, "grad_norm": 0.6151353273515647, "learning_rate": 4.470515652690947e-06, "loss": 0.0265, "step": 2320 }, { "epoch": 1.0559599636032757, "grad_norm": 0.4729616741514311, "learning_rate": 4.470075769828461e-06, "loss": 0.0188, "step": 2321 }, { "epoch": 1.0564149226569608, "grad_norm": 0.6881944943779561, "learning_rate": 4.46963572598028e-06, "loss": 0.0198, "step": 2322 }, { "epoch": 1.056869881710646, "grad_norm": 0.49575367084132177, "learning_rate": 4.469195521182362e-06, "loss": 0.0217, "step": 2323 }, { "epoch": 1.0573248407643312, "grad_norm": 0.5505070213462345, "learning_rate": 4.468755155470679e-06, "loss": 0.0226, "step": 2324 }, { "epoch": 1.0577797998180163, "grad_norm": 0.5688205202999257, "learning_rate": 4.468314628881214e-06, "loss": 0.0208, "step": 2325 }, { "epoch": 1.0582347588717016, "grad_norm": 0.4538856296370273, "learning_rate": 4.467873941449969e-06, "loss": 0.0201, "step": 2326 }, { "epoch": 1.0586897179253867, "grad_norm": 0.4951949600945696, "learning_rate": 4.46743309321295e-06, "loss": 0.0192, "step": 2327 }, { "epoch": 1.0591446769790718, "grad_norm": 0.8177918826431166, "learning_rate": 4.466992084206185e-06, "loss": 0.0465, "step": 2328 }, { "epoch": 1.059599636032757, "grad_norm": 0.41902386946901277, "learning_rate": 4.466550914465709e-06, "loss": 0.0153, "step": 2329 }, { "epoch": 1.0600545950864422, "grad_norm": 0.6619680469471608, "learning_rate": 4.466109584027573e-06, "loss": 0.0269, "step": 2330 }, { "epoch": 1.0605095541401275, "grad_norm": 0.573290046990492, "learning_rate": 4.465668092927841e-06, "loss": 0.0226, "step": 2331 }, { "epoch": 1.0609645131938126, "grad_norm": 0.8738515426263221, "learning_rate": 4.465226441202589e-06, "loss": 0.0407, "step": 2332 }, { "epoch": 1.0614194722474977, "grad_norm": 0.5242511765844076, "learning_rate": 4.464784628887908e-06, "loss": 0.0202, "step": 2333 }, { "epoch": 1.061874431301183, "grad_norm": 0.4588386857289231, "learning_rate": 4.4643426560199e-06, "loss": 0.0104, "step": 2334 }, { "epoch": 1.062329390354868, "grad_norm": 0.43849362113269413, "learning_rate": 4.46390052263468e-06, "loss": 0.0152, "step": 2335 }, { "epoch": 1.0627843494085532, "grad_norm": 0.6871715121390033, "learning_rate": 4.463458228768378e-06, "loss": 0.0222, "step": 2336 }, { "epoch": 1.0632393084622385, "grad_norm": 0.6312910573017381, "learning_rate": 4.463015774457137e-06, "loss": 0.0292, "step": 2337 }, { "epoch": 1.0636942675159236, "grad_norm": 0.7620092964886106, "learning_rate": 4.462573159737113e-06, "loss": 0.0391, "step": 2338 }, { "epoch": 1.0641492265696086, "grad_norm": 0.585490205624937, "learning_rate": 4.462130384644472e-06, "loss": 0.0236, "step": 2339 }, { "epoch": 1.064604185623294, "grad_norm": 0.7034769748830236, "learning_rate": 4.461687449215397e-06, "loss": 0.0252, "step": 2340 }, { "epoch": 1.065059144676979, "grad_norm": 0.6110153772749273, "learning_rate": 4.4612443534860826e-06, "loss": 0.0248, "step": 2341 }, { "epoch": 1.0655141037306644, "grad_norm": 0.6858335786071746, "learning_rate": 4.460801097492737e-06, "loss": 0.0214, "step": 2342 }, { "epoch": 1.0659690627843494, "grad_norm": 0.5030440084983254, "learning_rate": 4.460357681271579e-06, "loss": 0.0179, "step": 2343 }, { "epoch": 1.0664240218380345, "grad_norm": 0.5301952004822383, "learning_rate": 4.4599141048588454e-06, "loss": 0.0273, "step": 2344 }, { "epoch": 1.0668789808917198, "grad_norm": 0.8918886613053432, "learning_rate": 4.4594703682907825e-06, "loss": 0.0348, "step": 2345 }, { "epoch": 1.067333939945405, "grad_norm": 0.5550020043836809, "learning_rate": 4.459026471603649e-06, "loss": 0.0264, "step": 2346 }, { "epoch": 1.06778889899909, "grad_norm": 0.5942449646440254, "learning_rate": 4.45858241483372e-06, "loss": 0.0311, "step": 2347 }, { "epoch": 1.0682438580527753, "grad_norm": 0.6882668929300695, "learning_rate": 4.458138198017281e-06, "loss": 0.0266, "step": 2348 }, { "epoch": 1.0686988171064604, "grad_norm": 0.6521955388813901, "learning_rate": 4.457693821190631e-06, "loss": 0.0324, "step": 2349 }, { "epoch": 1.0691537761601455, "grad_norm": 0.6489523342195044, "learning_rate": 4.4572492843900815e-06, "loss": 0.0253, "step": 2350 }, { "epoch": 1.0696087352138308, "grad_norm": 0.48279418830121457, "learning_rate": 4.456804587651961e-06, "loss": 0.0201, "step": 2351 }, { "epoch": 1.070063694267516, "grad_norm": 0.4410106378931689, "learning_rate": 4.456359731012606e-06, "loss": 0.0141, "step": 2352 }, { "epoch": 1.070518653321201, "grad_norm": 0.3752479866878442, "learning_rate": 4.455914714508369e-06, "loss": 0.0096, "step": 2353 }, { "epoch": 1.0709736123748863, "grad_norm": 0.35564777078621335, "learning_rate": 4.455469538175614e-06, "loss": 0.014, "step": 2354 }, { "epoch": 1.0714285714285714, "grad_norm": 0.5792769224363791, "learning_rate": 4.455024202050719e-06, "loss": 0.0245, "step": 2355 }, { "epoch": 1.0718835304822565, "grad_norm": 0.4418912340426733, "learning_rate": 4.454578706170075e-06, "loss": 0.0137, "step": 2356 }, { "epoch": 1.0723384895359418, "grad_norm": 0.6613336575639737, "learning_rate": 4.454133050570087e-06, "loss": 0.0234, "step": 2357 }, { "epoch": 1.0727934485896269, "grad_norm": 0.501346950607051, "learning_rate": 4.453687235287169e-06, "loss": 0.0226, "step": 2358 }, { "epoch": 1.0732484076433122, "grad_norm": 0.3786010644362032, "learning_rate": 4.453241260357754e-06, "loss": 0.0134, "step": 2359 }, { "epoch": 1.0737033666969973, "grad_norm": 0.4833259530881372, "learning_rate": 4.452795125818283e-06, "loss": 0.0188, "step": 2360 }, { "epoch": 1.0741583257506824, "grad_norm": 0.7741750323072273, "learning_rate": 4.4523488317052146e-06, "loss": 0.0355, "step": 2361 }, { "epoch": 1.0746132848043677, "grad_norm": 0.44639871630368005, "learning_rate": 4.451902378055015e-06, "loss": 0.0155, "step": 2362 }, { "epoch": 1.0750682438580528, "grad_norm": 0.38023273288220855, "learning_rate": 4.451455764904169e-06, "loss": 0.0195, "step": 2363 }, { "epoch": 1.0755232029117379, "grad_norm": 0.7834571656848334, "learning_rate": 4.45100899228917e-06, "loss": 0.0361, "step": 2364 }, { "epoch": 1.0759781619654232, "grad_norm": 0.44860302565592436, "learning_rate": 4.4505620602465275e-06, "loss": 0.0169, "step": 2365 }, { "epoch": 1.0764331210191083, "grad_norm": 0.5963152489034919, "learning_rate": 4.450114968812761e-06, "loss": 0.0191, "step": 2366 }, { "epoch": 1.0768880800727934, "grad_norm": 0.6470284182313165, "learning_rate": 4.449667718024406e-06, "loss": 0.0309, "step": 2367 }, { "epoch": 1.0773430391264787, "grad_norm": 0.5740322881813956, "learning_rate": 4.449220307918011e-06, "loss": 0.0211, "step": 2368 }, { "epoch": 1.0777979981801638, "grad_norm": 0.38610056650249264, "learning_rate": 4.448772738530134e-06, "loss": 0.0124, "step": 2369 }, { "epoch": 1.078252957233849, "grad_norm": 0.5957149086382844, "learning_rate": 4.44832500989735e-06, "loss": 0.0246, "step": 2370 }, { "epoch": 1.0787079162875342, "grad_norm": 0.4562941100201254, "learning_rate": 4.447877122056243e-06, "loss": 0.0191, "step": 2371 }, { "epoch": 1.0791628753412192, "grad_norm": 0.6775847445675632, "learning_rate": 4.447429075043416e-06, "loss": 0.0156, "step": 2372 }, { "epoch": 1.0796178343949046, "grad_norm": 0.4767997886979347, "learning_rate": 4.4469808688954786e-06, "loss": 0.0142, "step": 2373 }, { "epoch": 1.0800727934485896, "grad_norm": 0.6417561969278244, "learning_rate": 4.446532503649058e-06, "loss": 0.0268, "step": 2374 }, { "epoch": 1.0805277525022747, "grad_norm": 0.5723910654907396, "learning_rate": 4.44608397934079e-06, "loss": 0.0243, "step": 2375 }, { "epoch": 1.08098271155596, "grad_norm": 0.7726426359472051, "learning_rate": 4.445635296007329e-06, "loss": 0.0305, "step": 2376 }, { "epoch": 1.0814376706096451, "grad_norm": 0.5972224710436718, "learning_rate": 4.445186453685339e-06, "loss": 0.0246, "step": 2377 }, { "epoch": 1.0818926296633302, "grad_norm": 0.5542672270842244, "learning_rate": 4.444737452411494e-06, "loss": 0.0169, "step": 2378 }, { "epoch": 1.0823475887170155, "grad_norm": 0.669917314313808, "learning_rate": 4.444288292222488e-06, "loss": 0.0231, "step": 2379 }, { "epoch": 1.0828025477707006, "grad_norm": 0.46636787428212545, "learning_rate": 4.443838973155023e-06, "loss": 0.0198, "step": 2380 }, { "epoch": 1.0832575068243857, "grad_norm": 0.7488407504054011, "learning_rate": 4.443389495245816e-06, "loss": 0.0391, "step": 2381 }, { "epoch": 1.083712465878071, "grad_norm": 0.426306751011867, "learning_rate": 4.442939858531594e-06, "loss": 0.0186, "step": 2382 }, { "epoch": 1.084167424931756, "grad_norm": 0.5269009234872088, "learning_rate": 4.442490063049103e-06, "loss": 0.0181, "step": 2383 }, { "epoch": 1.0846223839854412, "grad_norm": 0.5045472519746933, "learning_rate": 4.442040108835095e-06, "loss": 0.0145, "step": 2384 }, { "epoch": 1.0850773430391265, "grad_norm": 0.5276413295553061, "learning_rate": 4.44158999592634e-06, "loss": 0.0233, "step": 2385 }, { "epoch": 1.0855323020928116, "grad_norm": 0.7029231940364357, "learning_rate": 4.441139724359617e-06, "loss": 0.0143, "step": 2386 }, { "epoch": 1.085987261146497, "grad_norm": 0.4846625365807441, "learning_rate": 4.440689294171724e-06, "loss": 0.016, "step": 2387 }, { "epoch": 1.086442220200182, "grad_norm": 0.7320057104472577, "learning_rate": 4.440238705399465e-06, "loss": 0.0216, "step": 2388 }, { "epoch": 1.086897179253867, "grad_norm": 0.45375539724546904, "learning_rate": 4.439787958079662e-06, "loss": 0.0166, "step": 2389 }, { "epoch": 1.0873521383075524, "grad_norm": 0.549722137867742, "learning_rate": 4.439337052249146e-06, "loss": 0.0167, "step": 2390 }, { "epoch": 1.0878070973612375, "grad_norm": 0.593866408149248, "learning_rate": 4.4388859879447645e-06, "loss": 0.025, "step": 2391 }, { "epoch": 1.0882620564149226, "grad_norm": 1.560944402013678, "learning_rate": 4.438434765203376e-06, "loss": 0.0742, "step": 2392 }, { "epoch": 1.0887170154686079, "grad_norm": 0.5799492458949477, "learning_rate": 4.4379833840618524e-06, "loss": 0.0282, "step": 2393 }, { "epoch": 1.089171974522293, "grad_norm": 0.8256447524416959, "learning_rate": 4.4375318445570785e-06, "loss": 0.0256, "step": 2394 }, { "epoch": 1.089626933575978, "grad_norm": 0.6537098687733487, "learning_rate": 4.437080146725951e-06, "loss": 0.0225, "step": 2395 }, { "epoch": 1.0900818926296634, "grad_norm": 0.5301949864589981, "learning_rate": 4.436628290605384e-06, "loss": 0.0236, "step": 2396 }, { "epoch": 1.0905368516833485, "grad_norm": 0.5077507855342336, "learning_rate": 4.436176276232297e-06, "loss": 0.0198, "step": 2397 }, { "epoch": 1.0909918107370338, "grad_norm": 0.6697585395312962, "learning_rate": 4.4357241036436294e-06, "loss": 0.0218, "step": 2398 }, { "epoch": 1.0914467697907189, "grad_norm": 0.37753594175599786, "learning_rate": 4.435271772876329e-06, "loss": 0.0198, "step": 2399 }, { "epoch": 1.091901728844404, "grad_norm": 0.7251989967515775, "learning_rate": 4.434819283967359e-06, "loss": 0.0337, "step": 2400 }, { "epoch": 1.0923566878980893, "grad_norm": 0.5685580227633195, "learning_rate": 4.434366636953695e-06, "loss": 0.0134, "step": 2401 }, { "epoch": 1.0928116469517744, "grad_norm": 0.3783023948073095, "learning_rate": 4.433913831872324e-06, "loss": 0.0092, "step": 2402 }, { "epoch": 1.0932666060054594, "grad_norm": 0.6281936324347903, "learning_rate": 4.43346086876025e-06, "loss": 0.0328, "step": 2403 }, { "epoch": 1.0937215650591448, "grad_norm": 0.6536473305158147, "learning_rate": 4.433007747654484e-06, "loss": 0.0188, "step": 2404 }, { "epoch": 1.0941765241128298, "grad_norm": 0.526513701587517, "learning_rate": 4.432554468592054e-06, "loss": 0.0226, "step": 2405 }, { "epoch": 1.094631483166515, "grad_norm": 0.5881238864879137, "learning_rate": 4.432101031610001e-06, "loss": 0.0223, "step": 2406 }, { "epoch": 1.0950864422202002, "grad_norm": 0.4422986207763811, "learning_rate": 4.431647436745376e-06, "loss": 0.0109, "step": 2407 }, { "epoch": 1.0955414012738853, "grad_norm": 0.7893654050648916, "learning_rate": 4.431193684035246e-06, "loss": 0.0332, "step": 2408 }, { "epoch": 1.0959963603275704, "grad_norm": 0.5247569845377812, "learning_rate": 4.43073977351669e-06, "loss": 0.024, "step": 2409 }, { "epoch": 1.0964513193812557, "grad_norm": 0.3758246466492886, "learning_rate": 4.430285705226799e-06, "loss": 0.0083, "step": 2410 }, { "epoch": 1.0969062784349408, "grad_norm": 0.6145335090304439, "learning_rate": 4.429831479202676e-06, "loss": 0.0228, "step": 2411 }, { "epoch": 1.097361237488626, "grad_norm": 0.4286683636340505, "learning_rate": 4.429377095481441e-06, "loss": 0.0165, "step": 2412 }, { "epoch": 1.0978161965423112, "grad_norm": 0.5952460412943158, "learning_rate": 4.428922554100221e-06, "loss": 0.0309, "step": 2413 }, { "epoch": 1.0982711555959963, "grad_norm": 0.44582992706971714, "learning_rate": 4.428467855096163e-06, "loss": 0.0177, "step": 2414 }, { "epoch": 1.0987261146496816, "grad_norm": 0.7300092932288427, "learning_rate": 4.428012998506419e-06, "loss": 0.0237, "step": 2415 }, { "epoch": 1.0991810737033667, "grad_norm": 0.6969394499120272, "learning_rate": 4.42755798436816e-06, "loss": 0.0271, "step": 2416 }, { "epoch": 1.0996360327570518, "grad_norm": 0.4680236162394029, "learning_rate": 4.427102812718568e-06, "loss": 0.0151, "step": 2417 }, { "epoch": 1.100090991810737, "grad_norm": 0.46662414655854256, "learning_rate": 4.426647483594836e-06, "loss": 0.0171, "step": 2418 }, { "epoch": 1.1005459508644222, "grad_norm": 0.549235332005641, "learning_rate": 4.4261919970341724e-06, "loss": 0.0312, "step": 2419 }, { "epoch": 1.1010009099181073, "grad_norm": 0.40850248404362477, "learning_rate": 4.425736353073798e-06, "loss": 0.0169, "step": 2420 }, { "epoch": 1.1014558689717926, "grad_norm": 0.7833330930453737, "learning_rate": 4.425280551750945e-06, "loss": 0.0358, "step": 2421 }, { "epoch": 1.1019108280254777, "grad_norm": 0.3603121621215666, "learning_rate": 4.42482459310286e-06, "loss": 0.0156, "step": 2422 }, { "epoch": 1.1023657870791628, "grad_norm": 0.8023639858765762, "learning_rate": 4.424368477166801e-06, "loss": 0.0256, "step": 2423 }, { "epoch": 1.102820746132848, "grad_norm": 0.6123340064787052, "learning_rate": 4.423912203980041e-06, "loss": 0.0239, "step": 2424 }, { "epoch": 1.1032757051865332, "grad_norm": 0.7883676564832361, "learning_rate": 4.423455773579865e-06, "loss": 0.0243, "step": 2425 }, { "epoch": 1.1037306642402185, "grad_norm": 0.5606580783089008, "learning_rate": 4.422999186003568e-06, "loss": 0.0206, "step": 2426 }, { "epoch": 1.1041856232939036, "grad_norm": 0.3791744966325019, "learning_rate": 4.422542441288462e-06, "loss": 0.0141, "step": 2427 }, { "epoch": 1.1046405823475887, "grad_norm": 0.6286323469492989, "learning_rate": 4.42208553947187e-06, "loss": 0.0279, "step": 2428 }, { "epoch": 1.105095541401274, "grad_norm": 0.45776690578796103, "learning_rate": 4.4216284805911275e-06, "loss": 0.0123, "step": 2429 }, { "epoch": 1.105550500454959, "grad_norm": 0.6993584918399423, "learning_rate": 4.421171264683584e-06, "loss": 0.031, "step": 2430 }, { "epoch": 1.1060054595086442, "grad_norm": 0.8904266677973449, "learning_rate": 4.4207138917866e-06, "loss": 0.0434, "step": 2431 }, { "epoch": 1.1064604185623295, "grad_norm": 0.6348166810636052, "learning_rate": 4.420256361937551e-06, "loss": 0.0309, "step": 2432 }, { "epoch": 1.1069153776160146, "grad_norm": 0.3981663200647504, "learning_rate": 4.419798675173824e-06, "loss": 0.0148, "step": 2433 }, { "epoch": 1.1073703366696996, "grad_norm": 0.48832091405954775, "learning_rate": 4.419340831532819e-06, "loss": 0.0194, "step": 2434 }, { "epoch": 1.107825295723385, "grad_norm": 0.5699123528938048, "learning_rate": 4.418882831051949e-06, "loss": 0.0219, "step": 2435 }, { "epoch": 1.10828025477707, "grad_norm": 0.5206899000767948, "learning_rate": 4.418424673768639e-06, "loss": 0.018, "step": 2436 }, { "epoch": 1.1087352138307551, "grad_norm": 0.45230109480362385, "learning_rate": 4.417966359720329e-06, "loss": 0.0165, "step": 2437 }, { "epoch": 1.1091901728844404, "grad_norm": 0.5703863477498058, "learning_rate": 4.417507888944469e-06, "loss": 0.029, "step": 2438 }, { "epoch": 1.1096451319381255, "grad_norm": 0.682258223758108, "learning_rate": 4.417049261478525e-06, "loss": 0.0334, "step": 2439 }, { "epoch": 1.1101000909918108, "grad_norm": 0.47676985671226807, "learning_rate": 4.416590477359971e-06, "loss": 0.0181, "step": 2440 }, { "epoch": 1.110555050045496, "grad_norm": 0.38788442111589033, "learning_rate": 4.416131536626299e-06, "loss": 0.0204, "step": 2441 }, { "epoch": 1.111010009099181, "grad_norm": 0.6170801313853547, "learning_rate": 4.415672439315011e-06, "loss": 0.0241, "step": 2442 }, { "epoch": 1.1114649681528663, "grad_norm": 0.3816037691138792, "learning_rate": 4.415213185463623e-06, "loss": 0.0148, "step": 2443 }, { "epoch": 1.1119199272065514, "grad_norm": 0.482252021647845, "learning_rate": 4.414753775109661e-06, "loss": 0.0115, "step": 2444 }, { "epoch": 1.1123748862602365, "grad_norm": 0.6286536712577396, "learning_rate": 4.414294208290669e-06, "loss": 0.0245, "step": 2445 }, { "epoch": 1.1128298453139218, "grad_norm": 0.5902607419466066, "learning_rate": 4.413834485044199e-06, "loss": 0.0232, "step": 2446 }, { "epoch": 1.113284804367607, "grad_norm": 0.5444384492120593, "learning_rate": 4.413374605407817e-06, "loss": 0.0186, "step": 2447 }, { "epoch": 1.113739763421292, "grad_norm": 0.6430829258294208, "learning_rate": 4.412914569419103e-06, "loss": 0.0187, "step": 2448 }, { "epoch": 1.1141947224749773, "grad_norm": 0.4980474374671166, "learning_rate": 4.412454377115649e-06, "loss": 0.0185, "step": 2449 }, { "epoch": 1.1146496815286624, "grad_norm": 0.4773216197958395, "learning_rate": 4.411994028535061e-06, "loss": 0.018, "step": 2450 }, { "epoch": 1.1151046405823477, "grad_norm": 0.6035811414068329, "learning_rate": 4.411533523714954e-06, "loss": 0.0206, "step": 2451 }, { "epoch": 1.1155595996360328, "grad_norm": 0.7275388720106067, "learning_rate": 4.41107286269296e-06, "loss": 0.0292, "step": 2452 }, { "epoch": 1.1160145586897179, "grad_norm": 0.5634610686928793, "learning_rate": 4.410612045506722e-06, "loss": 0.0198, "step": 2453 }, { "epoch": 1.1164695177434032, "grad_norm": 0.5718741959689277, "learning_rate": 4.410151072193897e-06, "loss": 0.027, "step": 2454 }, { "epoch": 1.1169244767970883, "grad_norm": 0.5551564270266938, "learning_rate": 4.409689942792152e-06, "loss": 0.0199, "step": 2455 }, { "epoch": 1.1173794358507734, "grad_norm": 0.8888251322235802, "learning_rate": 4.409228657339168e-06, "loss": 0.0333, "step": 2456 }, { "epoch": 1.1178343949044587, "grad_norm": 0.5773710624441224, "learning_rate": 4.4087672158726415e-06, "loss": 0.0183, "step": 2457 }, { "epoch": 1.1182893539581438, "grad_norm": 0.5015201013813986, "learning_rate": 4.408305618430277e-06, "loss": 0.0183, "step": 2458 }, { "epoch": 1.1187443130118289, "grad_norm": 0.5607273909280127, "learning_rate": 4.407843865049797e-06, "loss": 0.0179, "step": 2459 }, { "epoch": 1.1191992720655142, "grad_norm": 0.47810546016317323, "learning_rate": 4.40738195576893e-06, "loss": 0.0177, "step": 2460 }, { "epoch": 1.1196542311191993, "grad_norm": 0.3206530963273737, "learning_rate": 4.406919890625424e-06, "loss": 0.012, "step": 2461 }, { "epoch": 1.1201091901728844, "grad_norm": 0.5269729696359773, "learning_rate": 4.406457669657036e-06, "loss": 0.0252, "step": 2462 }, { "epoch": 1.1205641492265697, "grad_norm": 0.8110781283531187, "learning_rate": 4.405995292901537e-06, "loss": 0.0394, "step": 2463 }, { "epoch": 1.1210191082802548, "grad_norm": 0.685631955664527, "learning_rate": 4.40553276039671e-06, "loss": 0.0235, "step": 2464 }, { "epoch": 1.1214740673339398, "grad_norm": 0.5305736419722851, "learning_rate": 4.4050700721803505e-06, "loss": 0.0246, "step": 2465 }, { "epoch": 1.1219290263876252, "grad_norm": 0.8233289195972211, "learning_rate": 4.404607228290269e-06, "loss": 0.0329, "step": 2466 }, { "epoch": 1.1223839854413102, "grad_norm": 0.4494525107707997, "learning_rate": 4.404144228764285e-06, "loss": 0.0172, "step": 2467 }, { "epoch": 1.1228389444949956, "grad_norm": 0.6898284017237695, "learning_rate": 4.403681073640235e-06, "loss": 0.0309, "step": 2468 }, { "epoch": 1.1232939035486806, "grad_norm": 0.44022897506767517, "learning_rate": 4.403217762955963e-06, "loss": 0.014, "step": 2469 }, { "epoch": 1.1237488626023657, "grad_norm": 0.5105224449314946, "learning_rate": 4.402754296749331e-06, "loss": 0.0277, "step": 2470 }, { "epoch": 1.124203821656051, "grad_norm": 0.4431776829162167, "learning_rate": 4.402290675058211e-06, "loss": 0.0165, "step": 2471 }, { "epoch": 1.1246587807097361, "grad_norm": 0.7311094672586586, "learning_rate": 4.401826897920487e-06, "loss": 0.0394, "step": 2472 }, { "epoch": 1.1251137397634212, "grad_norm": 0.5038279648045123, "learning_rate": 4.4013629653740575e-06, "loss": 0.0193, "step": 2473 }, { "epoch": 1.1255686988171065, "grad_norm": 0.6090867075516695, "learning_rate": 4.400898877456833e-06, "loss": 0.0202, "step": 2474 }, { "epoch": 1.1260236578707916, "grad_norm": 0.4042652844289294, "learning_rate": 4.400434634206737e-06, "loss": 0.0202, "step": 2475 }, { "epoch": 1.1264786169244767, "grad_norm": 0.6930219774407071, "learning_rate": 4.399970235661705e-06, "loss": 0.0201, "step": 2476 }, { "epoch": 1.126933575978162, "grad_norm": 0.7770132345875099, "learning_rate": 4.399505681859685e-06, "loss": 0.0258, "step": 2477 }, { "epoch": 1.127388535031847, "grad_norm": 0.4536720849488241, "learning_rate": 4.399040972838639e-06, "loss": 0.013, "step": 2478 }, { "epoch": 1.1278434940855324, "grad_norm": 0.4629044906931116, "learning_rate": 4.398576108636541e-06, "loss": 0.0142, "step": 2479 }, { "epoch": 1.1282984531392175, "grad_norm": 0.5029551266143759, "learning_rate": 4.398111089291378e-06, "loss": 0.0223, "step": 2480 }, { "epoch": 1.1287534121929026, "grad_norm": 0.5267250567692062, "learning_rate": 4.3976459148411464e-06, "loss": 0.0184, "step": 2481 }, { "epoch": 1.129208371246588, "grad_norm": 0.5866609989932728, "learning_rate": 4.3971805853238616e-06, "loss": 0.0225, "step": 2482 }, { "epoch": 1.129663330300273, "grad_norm": 0.7290964432708312, "learning_rate": 4.396715100777547e-06, "loss": 0.0215, "step": 2483 }, { "epoch": 1.130118289353958, "grad_norm": 0.5687962106735492, "learning_rate": 4.39624946124024e-06, "loss": 0.014, "step": 2484 }, { "epoch": 1.1305732484076434, "grad_norm": 0.5222224290731767, "learning_rate": 4.39578366674999e-06, "loss": 0.0219, "step": 2485 }, { "epoch": 1.1310282074613285, "grad_norm": 0.6292767966184953, "learning_rate": 4.395317717344861e-06, "loss": 0.023, "step": 2486 }, { "epoch": 1.1314831665150136, "grad_norm": 0.6230840960536086, "learning_rate": 4.394851613062927e-06, "loss": 0.0255, "step": 2487 }, { "epoch": 1.1319381255686989, "grad_norm": 0.6868861554575693, "learning_rate": 4.394385353942275e-06, "loss": 0.0228, "step": 2488 }, { "epoch": 1.132393084622384, "grad_norm": 0.4274785686639345, "learning_rate": 4.393918940021008e-06, "loss": 0.0172, "step": 2489 }, { "epoch": 1.132848043676069, "grad_norm": 0.521848847942063, "learning_rate": 4.393452371337238e-06, "loss": 0.0213, "step": 2490 }, { "epoch": 1.1333030027297544, "grad_norm": 0.40932265141368795, "learning_rate": 4.39298564792909e-06, "loss": 0.018, "step": 2491 }, { "epoch": 1.1337579617834395, "grad_norm": 0.39066049998357827, "learning_rate": 4.392518769834705e-06, "loss": 0.0133, "step": 2492 }, { "epoch": 1.1342129208371245, "grad_norm": 0.7649639309094274, "learning_rate": 4.392051737092231e-06, "loss": 0.0242, "step": 2493 }, { "epoch": 1.1346678798908099, "grad_norm": 0.465262288674482, "learning_rate": 4.391584549739834e-06, "loss": 0.015, "step": 2494 }, { "epoch": 1.135122838944495, "grad_norm": 0.6825064297343973, "learning_rate": 4.391117207815691e-06, "loss": 0.0224, "step": 2495 }, { "epoch": 1.1355777979981803, "grad_norm": 0.39540271333569704, "learning_rate": 4.3906497113579895e-06, "loss": 0.013, "step": 2496 }, { "epoch": 1.1360327570518653, "grad_norm": 0.7534708050321565, "learning_rate": 4.390182060404931e-06, "loss": 0.0243, "step": 2497 }, { "epoch": 1.1364877161055504, "grad_norm": 0.4898364302791372, "learning_rate": 4.389714254994732e-06, "loss": 0.017, "step": 2498 }, { "epoch": 1.1369426751592357, "grad_norm": 0.6693549197179394, "learning_rate": 4.389246295165617e-06, "loss": 0.0157, "step": 2499 }, { "epoch": 1.1373976342129208, "grad_norm": 0.4291317100603944, "learning_rate": 4.388778180955826e-06, "loss": 0.0174, "step": 2500 }, { "epoch": 1.137852593266606, "grad_norm": 0.7020692499532706, "learning_rate": 4.388309912403612e-06, "loss": 0.0306, "step": 2501 }, { "epoch": 1.1383075523202912, "grad_norm": 0.6301261849620517, "learning_rate": 4.38784148954724e-06, "loss": 0.0319, "step": 2502 }, { "epoch": 1.1387625113739763, "grad_norm": 0.4361572339623868, "learning_rate": 4.387372912424987e-06, "loss": 0.0127, "step": 2503 }, { "epoch": 1.1392174704276614, "grad_norm": 0.7026556330785464, "learning_rate": 4.386904181075142e-06, "loss": 0.0298, "step": 2504 }, { "epoch": 1.1396724294813467, "grad_norm": 0.6098253171064514, "learning_rate": 4.386435295536008e-06, "loss": 0.0172, "step": 2505 }, { "epoch": 1.1401273885350318, "grad_norm": 0.4467914447435911, "learning_rate": 4.385966255845902e-06, "loss": 0.0133, "step": 2506 }, { "epoch": 1.1405823475887171, "grad_norm": 0.5646208066401632, "learning_rate": 4.38549706204315e-06, "loss": 0.0218, "step": 2507 }, { "epoch": 1.1410373066424022, "grad_norm": 0.5092623099739902, "learning_rate": 4.385027714166094e-06, "loss": 0.0228, "step": 2508 }, { "epoch": 1.1414922656960873, "grad_norm": 0.6483618609538168, "learning_rate": 4.384558212253084e-06, "loss": 0.0224, "step": 2509 }, { "epoch": 1.1419472247497726, "grad_norm": 0.5116010138281942, "learning_rate": 4.384088556342488e-06, "loss": 0.0157, "step": 2510 }, { "epoch": 1.1424021838034577, "grad_norm": 0.5953620176339847, "learning_rate": 4.383618746472686e-06, "loss": 0.02, "step": 2511 }, { "epoch": 1.1428571428571428, "grad_norm": 0.547071261477399, "learning_rate": 4.383148782682064e-06, "loss": 0.0213, "step": 2512 }, { "epoch": 1.143312101910828, "grad_norm": 0.6862235251333989, "learning_rate": 4.382678665009028e-06, "loss": 0.0326, "step": 2513 }, { "epoch": 1.1437670609645132, "grad_norm": 0.6144410092488735, "learning_rate": 4.382208393491994e-06, "loss": 0.021, "step": 2514 }, { "epoch": 1.1442220200181983, "grad_norm": 1.0612612279447866, "learning_rate": 4.381737968169389e-06, "loss": 0.0266, "step": 2515 }, { "epoch": 1.1446769790718836, "grad_norm": 0.5795112434597738, "learning_rate": 4.381267389079657e-06, "loss": 0.0204, "step": 2516 }, { "epoch": 1.1451319381255687, "grad_norm": 0.5281408551585983, "learning_rate": 4.380796656261248e-06, "loss": 0.0242, "step": 2517 }, { "epoch": 1.1455868971792538, "grad_norm": 0.5440850665021298, "learning_rate": 4.38032576975263e-06, "loss": 0.0222, "step": 2518 }, { "epoch": 1.146041856232939, "grad_norm": 0.547779027945082, "learning_rate": 4.3798547295922825e-06, "loss": 0.0252, "step": 2519 }, { "epoch": 1.1464968152866242, "grad_norm": 0.7316358167572443, "learning_rate": 4.3793835358186955e-06, "loss": 0.0245, "step": 2520 }, { "epoch": 1.1469517743403093, "grad_norm": 0.7748548730476206, "learning_rate": 4.378912188470374e-06, "loss": 0.0396, "step": 2521 }, { "epoch": 1.1474067333939946, "grad_norm": 0.6582577150455249, "learning_rate": 4.378440687585832e-06, "loss": 0.0324, "step": 2522 }, { "epoch": 1.1478616924476797, "grad_norm": 0.643886854209991, "learning_rate": 4.3779690332036005e-06, "loss": 0.0274, "step": 2523 }, { "epoch": 1.148316651501365, "grad_norm": 0.6168422110373858, "learning_rate": 4.3774972253622205e-06, "loss": 0.0212, "step": 2524 }, { "epoch": 1.14877161055505, "grad_norm": 0.5831017159186793, "learning_rate": 4.377025264100246e-06, "loss": 0.0207, "step": 2525 }, { "epoch": 1.1492265696087351, "grad_norm": 0.655374012181892, "learning_rate": 4.376553149456244e-06, "loss": 0.0186, "step": 2526 }, { "epoch": 1.1496815286624205, "grad_norm": 0.5616942135395393, "learning_rate": 4.376080881468793e-06, "loss": 0.0132, "step": 2527 }, { "epoch": 1.1501364877161055, "grad_norm": 0.45821214516392705, "learning_rate": 4.375608460176483e-06, "loss": 0.0167, "step": 2528 }, { "epoch": 1.1505914467697906, "grad_norm": 0.4482772285785562, "learning_rate": 4.375135885617922e-06, "loss": 0.0147, "step": 2529 }, { "epoch": 1.151046405823476, "grad_norm": 0.46276152379004304, "learning_rate": 4.3746631578317236e-06, "loss": 0.0162, "step": 2530 }, { "epoch": 1.151501364877161, "grad_norm": 0.472367156834232, "learning_rate": 4.374190276856517e-06, "loss": 0.0161, "step": 2531 }, { "epoch": 1.1519563239308463, "grad_norm": 0.7210180575796964, "learning_rate": 4.373717242730946e-06, "loss": 0.0164, "step": 2532 }, { "epoch": 1.1524112829845314, "grad_norm": 0.34265721640778585, "learning_rate": 4.373244055493663e-06, "loss": 0.009, "step": 2533 }, { "epoch": 1.1528662420382165, "grad_norm": 0.9246661541629277, "learning_rate": 4.372770715183336e-06, "loss": 0.0394, "step": 2534 }, { "epoch": 1.1533212010919018, "grad_norm": 1.0155207038438967, "learning_rate": 4.372297221838642e-06, "loss": 0.0433, "step": 2535 }, { "epoch": 1.153776160145587, "grad_norm": 0.45755930254991145, "learning_rate": 4.3718235754982755e-06, "loss": 0.0178, "step": 2536 }, { "epoch": 1.154231119199272, "grad_norm": 0.8712775724045019, "learning_rate": 4.371349776200939e-06, "loss": 0.0332, "step": 2537 }, { "epoch": 1.1546860782529573, "grad_norm": 0.4981626750356432, "learning_rate": 4.37087582398535e-06, "loss": 0.0146, "step": 2538 }, { "epoch": 1.1551410373066424, "grad_norm": 0.4312192452228287, "learning_rate": 4.370401718890237e-06, "loss": 0.0152, "step": 2539 }, { "epoch": 1.1555959963603275, "grad_norm": 0.6358957648601828, "learning_rate": 4.369927460954342e-06, "loss": 0.0215, "step": 2540 }, { "epoch": 1.1560509554140128, "grad_norm": 0.5795766897237975, "learning_rate": 4.36945305021642e-06, "loss": 0.0283, "step": 2541 }, { "epoch": 1.156505914467698, "grad_norm": 0.5890586761906142, "learning_rate": 4.368978486715237e-06, "loss": 0.0283, "step": 2542 }, { "epoch": 1.156960873521383, "grad_norm": 0.49231424613966623, "learning_rate": 4.368503770489573e-06, "loss": 0.017, "step": 2543 }, { "epoch": 1.1574158325750683, "grad_norm": 0.5284668616614862, "learning_rate": 4.368028901578218e-06, "loss": 0.0192, "step": 2544 }, { "epoch": 1.1578707916287534, "grad_norm": 0.4629161095951479, "learning_rate": 4.367553880019977e-06, "loss": 0.0174, "step": 2545 }, { "epoch": 1.1583257506824385, "grad_norm": 0.5295482770008699, "learning_rate": 4.367078705853667e-06, "loss": 0.0175, "step": 2546 }, { "epoch": 1.1587807097361238, "grad_norm": 0.4492266435415687, "learning_rate": 4.366603379118117e-06, "loss": 0.0174, "step": 2547 }, { "epoch": 1.1592356687898089, "grad_norm": 0.6861326206450016, "learning_rate": 4.366127899852169e-06, "loss": 0.0202, "step": 2548 }, { "epoch": 1.159690627843494, "grad_norm": 0.6458656612345586, "learning_rate": 4.365652268094675e-06, "loss": 0.0247, "step": 2549 }, { "epoch": 1.1601455868971793, "grad_norm": 0.5934112274239063, "learning_rate": 4.365176483884504e-06, "loss": 0.018, "step": 2550 }, { "epoch": 1.1606005459508644, "grad_norm": 0.5168135672415382, "learning_rate": 4.364700547260533e-06, "loss": 0.0164, "step": 2551 }, { "epoch": 1.1610555050045497, "grad_norm": 0.6179287757702098, "learning_rate": 4.3642244582616545e-06, "loss": 0.0224, "step": 2552 }, { "epoch": 1.1615104640582348, "grad_norm": 0.6789584396331461, "learning_rate": 4.363748216926772e-06, "loss": 0.0246, "step": 2553 }, { "epoch": 1.1619654231119199, "grad_norm": 0.5988318131014969, "learning_rate": 4.363271823294802e-06, "loss": 0.0249, "step": 2554 }, { "epoch": 1.1624203821656052, "grad_norm": 0.7304853558121732, "learning_rate": 4.362795277404673e-06, "loss": 0.0303, "step": 2555 }, { "epoch": 1.1628753412192903, "grad_norm": 0.4069363718224522, "learning_rate": 4.362318579295326e-06, "loss": 0.0142, "step": 2556 }, { "epoch": 1.1633303002729753, "grad_norm": 0.6305977347827836, "learning_rate": 4.361841729005715e-06, "loss": 0.0243, "step": 2557 }, { "epoch": 1.1637852593266607, "grad_norm": 1.9165750531232424, "learning_rate": 4.361364726574806e-06, "loss": 0.0472, "step": 2558 }, { "epoch": 1.1642402183803457, "grad_norm": 0.48386946622280164, "learning_rate": 4.360887572041578e-06, "loss": 0.0239, "step": 2559 }, { "epoch": 1.164695177434031, "grad_norm": 0.6741840320275676, "learning_rate": 4.36041026544502e-06, "loss": 0.0246, "step": 2560 }, { "epoch": 1.1651501364877161, "grad_norm": 0.618838331497097, "learning_rate": 4.359932806824138e-06, "loss": 0.027, "step": 2561 }, { "epoch": 1.1656050955414012, "grad_norm": 0.3417342823461349, "learning_rate": 4.359455196217946e-06, "loss": 0.0104, "step": 2562 }, { "epoch": 1.1660600545950865, "grad_norm": 0.31164520237783483, "learning_rate": 4.358977433665471e-06, "loss": 0.0088, "step": 2563 }, { "epoch": 1.1665150136487716, "grad_norm": 0.6554668583356018, "learning_rate": 4.3584995192057565e-06, "loss": 0.0335, "step": 2564 }, { "epoch": 1.1669699727024567, "grad_norm": 0.44408981017519084, "learning_rate": 4.358021452877854e-06, "loss": 0.0183, "step": 2565 }, { "epoch": 1.167424931756142, "grad_norm": 0.482451643460215, "learning_rate": 4.357543234720829e-06, "loss": 0.017, "step": 2566 }, { "epoch": 1.1678798908098271, "grad_norm": 0.8550493713474525, "learning_rate": 4.357064864773761e-06, "loss": 0.0363, "step": 2567 }, { "epoch": 1.1683348498635122, "grad_norm": 0.5050690519756426, "learning_rate": 4.3565863430757375e-06, "loss": 0.0176, "step": 2568 }, { "epoch": 1.1687898089171975, "grad_norm": 0.5475370608720886, "learning_rate": 4.356107669665862e-06, "loss": 0.0232, "step": 2569 }, { "epoch": 1.1692447679708826, "grad_norm": 0.6277444667505776, "learning_rate": 4.355628844583249e-06, "loss": 0.0298, "step": 2570 }, { "epoch": 1.1696997270245677, "grad_norm": 0.5366910312126616, "learning_rate": 4.355149867867029e-06, "loss": 0.0186, "step": 2571 }, { "epoch": 1.170154686078253, "grad_norm": 0.45310857499795165, "learning_rate": 4.354670739556338e-06, "loss": 0.0119, "step": 2572 }, { "epoch": 1.170609645131938, "grad_norm": 0.6356440967795077, "learning_rate": 4.35419145969033e-06, "loss": 0.0333, "step": 2573 }, { "epoch": 1.1710646041856232, "grad_norm": 0.5464909143012531, "learning_rate": 4.35371202830817e-06, "loss": 0.0189, "step": 2574 }, { "epoch": 1.1715195632393085, "grad_norm": 0.6383681077345984, "learning_rate": 4.353232445449034e-06, "loss": 0.0265, "step": 2575 }, { "epoch": 1.1719745222929936, "grad_norm": 0.47662442741731753, "learning_rate": 4.352752711152112e-06, "loss": 0.0148, "step": 2576 }, { "epoch": 1.1724294813466787, "grad_norm": 0.5868089466835913, "learning_rate": 4.352272825456605e-06, "loss": 0.0216, "step": 2577 }, { "epoch": 1.172884440400364, "grad_norm": 0.5253949671686583, "learning_rate": 4.3517927884017275e-06, "loss": 0.0256, "step": 2578 }, { "epoch": 1.173339399454049, "grad_norm": 0.6592308425561475, "learning_rate": 4.351312600026706e-06, "loss": 0.0285, "step": 2579 }, { "epoch": 1.1737943585077344, "grad_norm": 0.7649368180575881, "learning_rate": 4.350832260370779e-06, "loss": 0.0311, "step": 2580 }, { "epoch": 1.1742493175614195, "grad_norm": 0.5514861272003339, "learning_rate": 4.350351769473198e-06, "loss": 0.0231, "step": 2581 }, { "epoch": 1.1747042766151046, "grad_norm": 0.5775672174979725, "learning_rate": 4.349871127373226e-06, "loss": 0.0255, "step": 2582 }, { "epoch": 1.1751592356687899, "grad_norm": 0.9134141208612268, "learning_rate": 4.349390334110141e-06, "loss": 0.0439, "step": 2583 }, { "epoch": 1.175614194722475, "grad_norm": 0.4544618512994982, "learning_rate": 4.348909389723228e-06, "loss": 0.0178, "step": 2584 }, { "epoch": 1.17606915377616, "grad_norm": 0.5791839745574971, "learning_rate": 4.348428294251791e-06, "loss": 0.0304, "step": 2585 }, { "epoch": 1.1765241128298454, "grad_norm": 0.4843168191804037, "learning_rate": 4.34794704773514e-06, "loss": 0.0204, "step": 2586 }, { "epoch": 1.1769790718835305, "grad_norm": 0.47321640576793395, "learning_rate": 4.347465650212602e-06, "loss": 0.0161, "step": 2587 }, { "epoch": 1.1774340309372158, "grad_norm": 0.6270272685829019, "learning_rate": 4.346984101723513e-06, "loss": 0.0257, "step": 2588 }, { "epoch": 1.1778889899909009, "grad_norm": 0.4453595878333599, "learning_rate": 4.3465024023072255e-06, "loss": 0.0207, "step": 2589 }, { "epoch": 1.178343949044586, "grad_norm": 0.5620264526303997, "learning_rate": 4.3460205520031006e-06, "loss": 0.0162, "step": 2590 }, { "epoch": 1.1787989080982713, "grad_norm": 0.79050022325226, "learning_rate": 4.345538550850512e-06, "loss": 0.0275, "step": 2591 }, { "epoch": 1.1792538671519563, "grad_norm": 0.5242514508819358, "learning_rate": 4.345056398888847e-06, "loss": 0.0194, "step": 2592 }, { "epoch": 1.1797088262056414, "grad_norm": 0.45519280840041193, "learning_rate": 4.3445740961575066e-06, "loss": 0.018, "step": 2593 }, { "epoch": 1.1801637852593267, "grad_norm": 0.6747230064920378, "learning_rate": 4.3440916426959e-06, "loss": 0.0289, "step": 2594 }, { "epoch": 1.1806187443130118, "grad_norm": 0.4544809079544836, "learning_rate": 4.343609038543452e-06, "loss": 0.0249, "step": 2595 }, { "epoch": 1.181073703366697, "grad_norm": 0.35452190136365136, "learning_rate": 4.3431262837396e-06, "loss": 0.0141, "step": 2596 }, { "epoch": 1.1815286624203822, "grad_norm": 0.6536923673730825, "learning_rate": 4.342643378323791e-06, "loss": 0.0223, "step": 2597 }, { "epoch": 1.1819836214740673, "grad_norm": 0.5887741451878095, "learning_rate": 4.342160322335487e-06, "loss": 0.0353, "step": 2598 }, { "epoch": 1.1824385805277524, "grad_norm": 0.7048921014576134, "learning_rate": 4.34167711581416e-06, "loss": 0.0223, "step": 2599 }, { "epoch": 1.1828935395814377, "grad_norm": 0.7646870960953991, "learning_rate": 4.3411937587992955e-06, "loss": 0.0391, "step": 2600 }, { "epoch": 1.1833484986351228, "grad_norm": 0.5007217995372573, "learning_rate": 4.340710251330393e-06, "loss": 0.0219, "step": 2601 }, { "epoch": 1.183803457688808, "grad_norm": 0.5078521534312103, "learning_rate": 4.34022659344696e-06, "loss": 0.0225, "step": 2602 }, { "epoch": 1.1842584167424932, "grad_norm": 0.8160012841542015, "learning_rate": 4.339742785188521e-06, "loss": 0.0301, "step": 2603 }, { "epoch": 1.1847133757961783, "grad_norm": 0.523985625873912, "learning_rate": 4.339258826594611e-06, "loss": 0.0178, "step": 2604 }, { "epoch": 1.1851683348498634, "grad_norm": 0.5308806701133933, "learning_rate": 4.338774717704774e-06, "loss": 0.0175, "step": 2605 }, { "epoch": 1.1856232939035487, "grad_norm": 0.6656523129373758, "learning_rate": 4.338290458558572e-06, "loss": 0.0273, "step": 2606 }, { "epoch": 1.1860782529572338, "grad_norm": 0.39255221273464475, "learning_rate": 4.3378060491955744e-06, "loss": 0.0148, "step": 2607 }, { "epoch": 1.186533212010919, "grad_norm": 0.4950633863450793, "learning_rate": 4.337321489655366e-06, "loss": 0.0224, "step": 2608 }, { "epoch": 1.1869881710646042, "grad_norm": 0.6260958258831224, "learning_rate": 4.336836779977543e-06, "loss": 0.0257, "step": 2609 }, { "epoch": 1.1874431301182893, "grad_norm": 0.4751863637448733, "learning_rate": 4.336351920201714e-06, "loss": 0.0207, "step": 2610 }, { "epoch": 1.1878980891719746, "grad_norm": 0.43930578090368066, "learning_rate": 4.335866910367498e-06, "loss": 0.0114, "step": 2611 }, { "epoch": 1.1883530482256597, "grad_norm": 0.6314909631136255, "learning_rate": 4.3353817505145294e-06, "loss": 0.0261, "step": 2612 }, { "epoch": 1.1888080072793448, "grad_norm": 0.5008847623030172, "learning_rate": 4.334896440682452e-06, "loss": 0.0267, "step": 2613 }, { "epoch": 1.18926296633303, "grad_norm": 0.5600055290379815, "learning_rate": 4.334410980910924e-06, "loss": 0.0286, "step": 2614 }, { "epoch": 1.1897179253867152, "grad_norm": 0.5728580933788284, "learning_rate": 4.333925371239615e-06, "loss": 0.022, "step": 2615 }, { "epoch": 1.1901728844404005, "grad_norm": 0.5724099445776433, "learning_rate": 4.3334396117082065e-06, "loss": 0.0221, "step": 2616 }, { "epoch": 1.1906278434940856, "grad_norm": 0.6320973313826952, "learning_rate": 4.332953702356393e-06, "loss": 0.0288, "step": 2617 }, { "epoch": 1.1910828025477707, "grad_norm": 0.5090911320181027, "learning_rate": 4.33246764322388e-06, "loss": 0.0181, "step": 2618 }, { "epoch": 1.191537761601456, "grad_norm": 0.4752697085073445, "learning_rate": 4.331981434350387e-06, "loss": 0.0145, "step": 2619 }, { "epoch": 1.191992720655141, "grad_norm": 0.7500039731207865, "learning_rate": 4.331495075775644e-06, "loss": 0.0372, "step": 2620 }, { "epoch": 1.1924476797088261, "grad_norm": 0.46553750325278676, "learning_rate": 4.331008567539395e-06, "loss": 0.0216, "step": 2621 }, { "epoch": 1.1929026387625115, "grad_norm": 0.7523375315000249, "learning_rate": 4.330521909681394e-06, "loss": 0.0221, "step": 2622 }, { "epoch": 1.1933575978161965, "grad_norm": 0.5528858812158782, "learning_rate": 4.330035102241409e-06, "loss": 0.0285, "step": 2623 }, { "epoch": 1.1938125568698816, "grad_norm": 0.5368571535414223, "learning_rate": 4.32954814525922e-06, "loss": 0.0194, "step": 2624 }, { "epoch": 1.194267515923567, "grad_norm": 0.6644805545801543, "learning_rate": 4.329061038774619e-06, "loss": 0.0293, "step": 2625 }, { "epoch": 1.194722474977252, "grad_norm": 0.5920169242009168, "learning_rate": 4.32857378282741e-06, "loss": 0.027, "step": 2626 }, { "epoch": 1.1951774340309371, "grad_norm": 0.5765181192250086, "learning_rate": 4.328086377457409e-06, "loss": 0.0149, "step": 2627 }, { "epoch": 1.1956323930846224, "grad_norm": 2.59038115553395, "learning_rate": 4.327598822704444e-06, "loss": 0.0238, "step": 2628 }, { "epoch": 1.1960873521383075, "grad_norm": 0.6749015393615012, "learning_rate": 4.327111118608357e-06, "loss": 0.0241, "step": 2629 }, { "epoch": 1.1965423111919926, "grad_norm": 0.5555096309142309, "learning_rate": 4.326623265209001e-06, "loss": 0.0241, "step": 2630 }, { "epoch": 1.196997270245678, "grad_norm": 0.4422718467721418, "learning_rate": 4.326135262546241e-06, "loss": 0.009, "step": 2631 }, { "epoch": 1.197452229299363, "grad_norm": 1.011510904258874, "learning_rate": 4.325647110659954e-06, "loss": 0.04, "step": 2632 }, { "epoch": 1.197907188353048, "grad_norm": 0.871122479276943, "learning_rate": 4.325158809590028e-06, "loss": 0.0299, "step": 2633 }, { "epoch": 1.1983621474067334, "grad_norm": 0.45898463677814605, "learning_rate": 4.324670359376368e-06, "loss": 0.0182, "step": 2634 }, { "epoch": 1.1988171064604185, "grad_norm": 0.6617734015997742, "learning_rate": 4.3241817600588865e-06, "loss": 0.0207, "step": 2635 }, { "epoch": 1.1992720655141038, "grad_norm": 0.3928382449321343, "learning_rate": 4.3236930116775086e-06, "loss": 0.0139, "step": 2636 }, { "epoch": 1.199727024567789, "grad_norm": 0.9460647127731692, "learning_rate": 4.323204114272174e-06, "loss": 0.0396, "step": 2637 }, { "epoch": 1.200181983621474, "grad_norm": 0.8397877040107056, "learning_rate": 4.3227150678828335e-06, "loss": 0.0273, "step": 2638 }, { "epoch": 1.2006369426751593, "grad_norm": 0.5340130401222157, "learning_rate": 4.322225872549448e-06, "loss": 0.022, "step": 2639 }, { "epoch": 1.2010919017288444, "grad_norm": 0.9277766410326496, "learning_rate": 4.321736528311994e-06, "loss": 0.0439, "step": 2640 }, { "epoch": 1.2015468607825295, "grad_norm": 0.7592854867727282, "learning_rate": 4.321247035210456e-06, "loss": 0.0289, "step": 2641 }, { "epoch": 1.2020018198362148, "grad_norm": 0.5341297610203549, "learning_rate": 4.320757393284837e-06, "loss": 0.0225, "step": 2642 }, { "epoch": 1.2024567788898999, "grad_norm": 0.6237470883113181, "learning_rate": 4.3202676025751455e-06, "loss": 0.0336, "step": 2643 }, { "epoch": 1.2029117379435852, "grad_norm": 0.5870246556587211, "learning_rate": 4.319777663121406e-06, "loss": 0.0276, "step": 2644 }, { "epoch": 1.2033666969972703, "grad_norm": 0.7609302108890849, "learning_rate": 4.319287574963653e-06, "loss": 0.0297, "step": 2645 }, { "epoch": 1.2038216560509554, "grad_norm": 0.6569180371738808, "learning_rate": 4.318797338141936e-06, "loss": 0.0317, "step": 2646 }, { "epoch": 1.2042766151046407, "grad_norm": 0.5944609730434067, "learning_rate": 4.318306952696314e-06, "loss": 0.027, "step": 2647 }, { "epoch": 1.2047315741583258, "grad_norm": 0.34688904992460695, "learning_rate": 4.317816418666859e-06, "loss": 0.0125, "step": 2648 }, { "epoch": 1.2051865332120109, "grad_norm": 0.6679718066461144, "learning_rate": 4.317325736093656e-06, "loss": 0.0293, "step": 2649 }, { "epoch": 1.2056414922656962, "grad_norm": 0.5669693680069866, "learning_rate": 4.316834905016801e-06, "loss": 0.0241, "step": 2650 }, { "epoch": 1.2060964513193813, "grad_norm": 0.46022640722499586, "learning_rate": 4.3163439254764015e-06, "loss": 0.0176, "step": 2651 }, { "epoch": 1.2065514103730663, "grad_norm": 0.5009167482323504, "learning_rate": 4.31585279751258e-06, "loss": 0.0164, "step": 2652 }, { "epoch": 1.2070063694267517, "grad_norm": 0.4911116285507776, "learning_rate": 4.315361521165467e-06, "loss": 0.023, "step": 2653 }, { "epoch": 1.2074613284804367, "grad_norm": 0.4139079723299726, "learning_rate": 4.314870096475209e-06, "loss": 0.0191, "step": 2654 }, { "epoch": 1.2079162875341218, "grad_norm": 0.5147916096736812, "learning_rate": 4.3143785234819624e-06, "loss": 0.0213, "step": 2655 }, { "epoch": 1.2083712465878071, "grad_norm": 0.4934413168807628, "learning_rate": 4.3138868022258974e-06, "loss": 0.0194, "step": 2656 }, { "epoch": 1.2088262056414922, "grad_norm": 0.6093916204573823, "learning_rate": 4.313394932747194e-06, "loss": 0.0256, "step": 2657 }, { "epoch": 1.2092811646951773, "grad_norm": 0.4328897173436113, "learning_rate": 4.312902915086045e-06, "loss": 0.0179, "step": 2658 }, { "epoch": 1.2097361237488626, "grad_norm": 0.6051187480913044, "learning_rate": 4.312410749282658e-06, "loss": 0.0264, "step": 2659 }, { "epoch": 1.2101910828025477, "grad_norm": 0.6402153561332778, "learning_rate": 4.311918435377248e-06, "loss": 0.0208, "step": 2660 }, { "epoch": 1.210646041856233, "grad_norm": 0.5588924434287335, "learning_rate": 4.311425973410047e-06, "loss": 0.0303, "step": 2661 }, { "epoch": 1.2111010009099181, "grad_norm": 0.45347333487831026, "learning_rate": 4.310933363421296e-06, "loss": 0.0155, "step": 2662 }, { "epoch": 1.2115559599636032, "grad_norm": 0.6992225938503356, "learning_rate": 4.310440605451248e-06, "loss": 0.0349, "step": 2663 }, { "epoch": 1.2120109190172885, "grad_norm": 0.9380643327786069, "learning_rate": 4.30994769954017e-06, "loss": 0.0338, "step": 2664 }, { "epoch": 1.2124658780709736, "grad_norm": 0.8253191054656854, "learning_rate": 4.30945464572834e-06, "loss": 0.0404, "step": 2665 }, { "epoch": 1.2129208371246587, "grad_norm": 0.6476800258789567, "learning_rate": 4.3089614440560465e-06, "loss": 0.0201, "step": 2666 }, { "epoch": 1.213375796178344, "grad_norm": 0.5709513616813471, "learning_rate": 4.3084680945635946e-06, "loss": 0.0223, "step": 2667 }, { "epoch": 1.213830755232029, "grad_norm": 0.7272313964988455, "learning_rate": 4.307974597291296e-06, "loss": 0.0305, "step": 2668 }, { "epoch": 1.2142857142857142, "grad_norm": 0.4125464957023897, "learning_rate": 4.307480952279478e-06, "loss": 0.0106, "step": 2669 }, { "epoch": 1.2147406733393995, "grad_norm": 0.4210089698998306, "learning_rate": 4.3069871595684795e-06, "loss": 0.0132, "step": 2670 }, { "epoch": 1.2151956323930846, "grad_norm": 0.5567918051217664, "learning_rate": 4.30649321919865e-06, "loss": 0.0232, "step": 2671 }, { "epoch": 1.21565059144677, "grad_norm": 0.7165679260009614, "learning_rate": 4.305999131210353e-06, "loss": 0.0229, "step": 2672 }, { "epoch": 1.216105550500455, "grad_norm": 0.6510555589132238, "learning_rate": 4.305504895643963e-06, "loss": 0.0201, "step": 2673 }, { "epoch": 1.21656050955414, "grad_norm": 0.496585641543594, "learning_rate": 4.305010512539867e-06, "loss": 0.0174, "step": 2674 }, { "epoch": 1.2170154686078254, "grad_norm": 0.4598917500270742, "learning_rate": 4.304515981938462e-06, "loss": 0.0146, "step": 2675 }, { "epoch": 1.2174704276615105, "grad_norm": 0.5463739101416092, "learning_rate": 4.304021303880161e-06, "loss": 0.0252, "step": 2676 }, { "epoch": 1.2179253867151956, "grad_norm": 0.5957670917341833, "learning_rate": 4.303526478405386e-06, "loss": 0.0218, "step": 2677 }, { "epoch": 1.2183803457688809, "grad_norm": 0.614447395579151, "learning_rate": 4.3030315055545715e-06, "loss": 0.0324, "step": 2678 }, { "epoch": 1.218835304822566, "grad_norm": 0.5616831395891632, "learning_rate": 4.302536385368165e-06, "loss": 0.0215, "step": 2679 }, { "epoch": 1.219290263876251, "grad_norm": 0.3790511549714058, "learning_rate": 4.3020411178866246e-06, "loss": 0.0103, "step": 2680 }, { "epoch": 1.2197452229299364, "grad_norm": 0.524178162065685, "learning_rate": 4.3015457031504226e-06, "loss": 0.0216, "step": 2681 }, { "epoch": 1.2202001819836215, "grad_norm": 0.6312387863751975, "learning_rate": 4.301050141200041e-06, "loss": 0.0187, "step": 2682 }, { "epoch": 1.2206551410373065, "grad_norm": 0.48104404582769067, "learning_rate": 4.300554432075975e-06, "loss": 0.0137, "step": 2683 }, { "epoch": 1.2211101000909919, "grad_norm": 0.8650014734492228, "learning_rate": 4.300058575818733e-06, "loss": 0.0356, "step": 2684 }, { "epoch": 1.221565059144677, "grad_norm": 0.6861230937699648, "learning_rate": 4.299562572468833e-06, "loss": 0.0269, "step": 2685 }, { "epoch": 1.222020018198362, "grad_norm": 0.5634591121479472, "learning_rate": 4.299066422066807e-06, "loss": 0.0214, "step": 2686 }, { "epoch": 1.2224749772520473, "grad_norm": 1.199183215901639, "learning_rate": 4.2985701246531965e-06, "loss": 0.0602, "step": 2687 }, { "epoch": 1.2229299363057324, "grad_norm": 0.42406900713579304, "learning_rate": 4.2980736802685575e-06, "loss": 0.0148, "step": 2688 }, { "epoch": 1.2233848953594177, "grad_norm": 0.5957456918417167, "learning_rate": 4.297577088953458e-06, "loss": 0.0148, "step": 2689 }, { "epoch": 1.2238398544131028, "grad_norm": 0.60599231302819, "learning_rate": 4.2970803507484756e-06, "loss": 0.0237, "step": 2690 }, { "epoch": 1.224294813466788, "grad_norm": 0.48118211083729384, "learning_rate": 4.296583465694204e-06, "loss": 0.013, "step": 2691 }, { "epoch": 1.2247497725204732, "grad_norm": 0.605388969739492, "learning_rate": 4.296086433831244e-06, "loss": 0.0315, "step": 2692 }, { "epoch": 1.2252047315741583, "grad_norm": 0.6619863417539111, "learning_rate": 4.295589255200212e-06, "loss": 0.0227, "step": 2693 }, { "epoch": 1.2256596906278434, "grad_norm": 0.8014939338874068, "learning_rate": 4.295091929841734e-06, "loss": 0.0265, "step": 2694 }, { "epoch": 1.2261146496815287, "grad_norm": 0.7269612365736505, "learning_rate": 4.2945944577964516e-06, "loss": 0.0357, "step": 2695 }, { "epoch": 1.2265696087352138, "grad_norm": 0.4610790536448068, "learning_rate": 4.294096839105013e-06, "loss": 0.0153, "step": 2696 }, { "epoch": 1.2270245677888991, "grad_norm": 0.4654611530391689, "learning_rate": 4.293599073808083e-06, "loss": 0.0182, "step": 2697 }, { "epoch": 1.2274795268425842, "grad_norm": 0.5999217153178389, "learning_rate": 4.293101161946337e-06, "loss": 0.0229, "step": 2698 }, { "epoch": 1.2279344858962693, "grad_norm": 0.49917644989179766, "learning_rate": 4.292603103560462e-06, "loss": 0.0124, "step": 2699 }, { "epoch": 1.2283894449499546, "grad_norm": 0.5365210835623073, "learning_rate": 4.292104898691157e-06, "loss": 0.0196, "step": 2700 }, { "epoch": 1.2288444040036397, "grad_norm": 0.5558494608681172, "learning_rate": 4.291606547379131e-06, "loss": 0.0186, "step": 2701 }, { "epoch": 1.2292993630573248, "grad_norm": 0.4612006424923298, "learning_rate": 4.291108049665109e-06, "loss": 0.0192, "step": 2702 }, { "epoch": 1.22975432211101, "grad_norm": 0.5593361998504748, "learning_rate": 4.290609405589827e-06, "loss": 0.0151, "step": 2703 }, { "epoch": 1.2302092811646952, "grad_norm": 0.6007531603783597, "learning_rate": 4.29011061519403e-06, "loss": 0.0301, "step": 2704 }, { "epoch": 1.2306642402183803, "grad_norm": 0.636096108360712, "learning_rate": 4.289611678518478e-06, "loss": 0.0299, "step": 2705 }, { "epoch": 1.2311191992720656, "grad_norm": 0.7430118225044907, "learning_rate": 4.289112595603941e-06, "loss": 0.0226, "step": 2706 }, { "epoch": 1.2315741583257507, "grad_norm": 0.6604918263471005, "learning_rate": 4.288613366491202e-06, "loss": 0.0306, "step": 2707 }, { "epoch": 1.2320291173794358, "grad_norm": 0.7349725252139763, "learning_rate": 4.288113991221057e-06, "loss": 0.0302, "step": 2708 }, { "epoch": 1.232484076433121, "grad_norm": 0.5380600500720201, "learning_rate": 4.2876144698343115e-06, "loss": 0.0237, "step": 2709 }, { "epoch": 1.2329390354868062, "grad_norm": 0.5754211885761235, "learning_rate": 4.287114802371783e-06, "loss": 0.0171, "step": 2710 }, { "epoch": 1.2333939945404913, "grad_norm": 0.5551048882570763, "learning_rate": 4.286614988874304e-06, "loss": 0.0247, "step": 2711 }, { "epoch": 1.2338489535941766, "grad_norm": 0.6348105766760367, "learning_rate": 4.286115029382717e-06, "loss": 0.0254, "step": 2712 }, { "epoch": 1.2343039126478617, "grad_norm": 0.6849101702514654, "learning_rate": 4.285614923937876e-06, "loss": 0.0302, "step": 2713 }, { "epoch": 1.2347588717015467, "grad_norm": 0.46035309891602166, "learning_rate": 4.285114672580647e-06, "loss": 0.0159, "step": 2714 }, { "epoch": 1.235213830755232, "grad_norm": 0.5132490189425896, "learning_rate": 4.284614275351907e-06, "loss": 0.0222, "step": 2715 }, { "epoch": 1.2356687898089171, "grad_norm": 0.48061757961981605, "learning_rate": 4.2841137322925495e-06, "loss": 0.0183, "step": 2716 }, { "epoch": 1.2361237488626025, "grad_norm": 0.6115977831280694, "learning_rate": 4.283613043443474e-06, "loss": 0.0252, "step": 2717 }, { "epoch": 1.2365787079162875, "grad_norm": 0.43493566800443395, "learning_rate": 4.2831122088455955e-06, "loss": 0.0145, "step": 2718 }, { "epoch": 1.2370336669699726, "grad_norm": 0.978674343771398, "learning_rate": 4.2826112285398395e-06, "loss": 0.0507, "step": 2719 }, { "epoch": 1.237488626023658, "grad_norm": 0.5339344105322079, "learning_rate": 4.282110102567145e-06, "loss": 0.0129, "step": 2720 }, { "epoch": 1.237943585077343, "grad_norm": 0.5942512671949535, "learning_rate": 4.28160883096846e-06, "loss": 0.0202, "step": 2721 }, { "epoch": 1.2383985441310281, "grad_norm": 0.564039136663134, "learning_rate": 4.281107413784747e-06, "loss": 0.0141, "step": 2722 }, { "epoch": 1.2388535031847134, "grad_norm": 0.42797274135047625, "learning_rate": 4.28060585105698e-06, "loss": 0.0163, "step": 2723 }, { "epoch": 1.2393084622383985, "grad_norm": 0.5526854916080111, "learning_rate": 4.280104142826143e-06, "loss": 0.0266, "step": 2724 }, { "epoch": 1.2397634212920838, "grad_norm": 1.1014939973273092, "learning_rate": 4.2796022891332355e-06, "loss": 0.0457, "step": 2725 }, { "epoch": 1.240218380345769, "grad_norm": 0.597089268959557, "learning_rate": 4.279100290019265e-06, "loss": 0.0229, "step": 2726 }, { "epoch": 1.240673339399454, "grad_norm": 0.6166593528194175, "learning_rate": 4.278598145525253e-06, "loss": 0.0314, "step": 2727 }, { "epoch": 1.2411282984531393, "grad_norm": 0.5739818877716178, "learning_rate": 4.278095855692233e-06, "loss": 0.028, "step": 2728 }, { "epoch": 1.2415832575068244, "grad_norm": 0.3861409302981483, "learning_rate": 4.277593420561249e-06, "loss": 0.0137, "step": 2729 }, { "epoch": 1.2420382165605095, "grad_norm": 0.8505238122375511, "learning_rate": 4.277090840173359e-06, "loss": 0.0369, "step": 2730 }, { "epoch": 1.2424931756141948, "grad_norm": 0.6937534569187743, "learning_rate": 4.276588114569631e-06, "loss": 0.0346, "step": 2731 }, { "epoch": 1.24294813466788, "grad_norm": 0.4879993817467368, "learning_rate": 4.2760852437911436e-06, "loss": 0.0226, "step": 2732 }, { "epoch": 1.243403093721565, "grad_norm": 0.4704653935774365, "learning_rate": 4.2755822278789926e-06, "loss": 0.0185, "step": 2733 }, { "epoch": 1.2438580527752503, "grad_norm": 0.34901060315473925, "learning_rate": 4.2750790668742795e-06, "loss": 0.0154, "step": 2734 }, { "epoch": 1.2443130118289354, "grad_norm": 0.9393836760411997, "learning_rate": 4.274575760818122e-06, "loss": 0.0258, "step": 2735 }, { "epoch": 1.2447679708826205, "grad_norm": 0.63450452027728, "learning_rate": 4.274072309751646e-06, "loss": 0.026, "step": 2736 }, { "epoch": 1.2452229299363058, "grad_norm": 0.8168459205839178, "learning_rate": 4.273568713715993e-06, "loss": 0.0305, "step": 2737 }, { "epoch": 1.2456778889899909, "grad_norm": 0.6809047059242845, "learning_rate": 4.2730649727523145e-06, "loss": 0.0341, "step": 2738 }, { "epoch": 1.246132848043676, "grad_norm": 0.6936049408714152, "learning_rate": 4.272561086901773e-06, "loss": 0.0234, "step": 2739 }, { "epoch": 1.2465878070973613, "grad_norm": 0.5711940438181513, "learning_rate": 4.272057056205544e-06, "loss": 0.0232, "step": 2740 }, { "epoch": 1.2470427661510464, "grad_norm": 0.5131420714637235, "learning_rate": 4.271552880704815e-06, "loss": 0.0235, "step": 2741 }, { "epoch": 1.2474977252047315, "grad_norm": 0.5152210269306549, "learning_rate": 4.271048560440786e-06, "loss": 0.0261, "step": 2742 }, { "epoch": 1.2479526842584168, "grad_norm": 0.5629080978039249, "learning_rate": 4.2705440954546665e-06, "loss": 0.0322, "step": 2743 }, { "epoch": 1.2484076433121019, "grad_norm": 0.6865649256488602, "learning_rate": 4.270039485787678e-06, "loss": 0.0302, "step": 2744 }, { "epoch": 1.2488626023657872, "grad_norm": 0.6124115080840975, "learning_rate": 4.269534731481057e-06, "loss": 0.0227, "step": 2745 }, { "epoch": 1.2493175614194723, "grad_norm": 0.4441783467736665, "learning_rate": 4.269029832576048e-06, "loss": 0.014, "step": 2746 }, { "epoch": 1.2497725204731573, "grad_norm": 0.6342777023583722, "learning_rate": 4.2685247891139114e-06, "loss": 0.021, "step": 2747 }, { "epoch": 1.2502274795268427, "grad_norm": 0.4671045785013319, "learning_rate": 4.268019601135914e-06, "loss": 0.0272, "step": 2748 }, { "epoch": 1.2506824385805277, "grad_norm": 0.668394552858572, "learning_rate": 4.26751426868334e-06, "loss": 0.0171, "step": 2749 }, { "epoch": 1.251137397634213, "grad_norm": 0.6900498602539912, "learning_rate": 4.2670087917974826e-06, "loss": 0.0304, "step": 2750 }, { "epoch": 1.2515923566878981, "grad_norm": 0.5403025651863708, "learning_rate": 4.266503170519645e-06, "loss": 0.0192, "step": 2751 }, { "epoch": 1.2520473157415832, "grad_norm": 0.980772795905362, "learning_rate": 4.265997404891147e-06, "loss": 0.0507, "step": 2752 }, { "epoch": 1.2525022747952685, "grad_norm": 0.4711111793060614, "learning_rate": 4.265491494953316e-06, "loss": 0.0181, "step": 2753 }, { "epoch": 1.2529572338489536, "grad_norm": 0.6991553417108572, "learning_rate": 4.2649854407474925e-06, "loss": 0.0326, "step": 2754 }, { "epoch": 1.2534121929026387, "grad_norm": 0.4624764110725661, "learning_rate": 4.26447924231503e-06, "loss": 0.0193, "step": 2755 }, { "epoch": 1.253867151956324, "grad_norm": 0.5205590680500956, "learning_rate": 4.263972899697292e-06, "loss": 0.0252, "step": 2756 }, { "epoch": 1.2543221110100091, "grad_norm": 0.4239402915389399, "learning_rate": 4.263466412935654e-06, "loss": 0.0198, "step": 2757 }, { "epoch": 1.2547770700636942, "grad_norm": 0.30440994761396317, "learning_rate": 4.262959782071505e-06, "loss": 0.0101, "step": 2758 }, { "epoch": 1.2552320291173795, "grad_norm": 0.661608423060866, "learning_rate": 4.262453007146244e-06, "loss": 0.0302, "step": 2759 }, { "epoch": 1.2556869881710646, "grad_norm": 0.4269537794878538, "learning_rate": 4.261946088201282e-06, "loss": 0.0155, "step": 2760 }, { "epoch": 1.2561419472247497, "grad_norm": 0.5124937588999617, "learning_rate": 4.261439025278044e-06, "loss": 0.0235, "step": 2761 }, { "epoch": 1.256596906278435, "grad_norm": 0.5891484571522302, "learning_rate": 4.260931818417962e-06, "loss": 0.022, "step": 2762 }, { "epoch": 1.25705186533212, "grad_norm": 0.43196676673808215, "learning_rate": 4.260424467662484e-06, "loss": 0.0173, "step": 2763 }, { "epoch": 1.2575068243858052, "grad_norm": 0.48458914621643406, "learning_rate": 4.259916973053069e-06, "loss": 0.0246, "step": 2764 }, { "epoch": 1.2579617834394905, "grad_norm": 0.6072652816815827, "learning_rate": 4.2594093346311865e-06, "loss": 0.03, "step": 2765 }, { "epoch": 1.2584167424931756, "grad_norm": 0.4633461488964158, "learning_rate": 4.258901552438319e-06, "loss": 0.0193, "step": 2766 }, { "epoch": 1.2588717015468607, "grad_norm": 0.5390509256812834, "learning_rate": 4.25839362651596e-06, "loss": 0.0178, "step": 2767 }, { "epoch": 1.259326660600546, "grad_norm": 0.6135561122538858, "learning_rate": 4.257885556905613e-06, "loss": 0.0265, "step": 2768 }, { "epoch": 1.259781619654231, "grad_norm": 9.77529026151472, "learning_rate": 4.257377343648799e-06, "loss": 0.164, "step": 2769 }, { "epoch": 1.2602365787079162, "grad_norm": 0.8549509883223334, "learning_rate": 4.256868986787044e-06, "loss": 0.0386, "step": 2770 }, { "epoch": 1.2606915377616015, "grad_norm": 0.5619658661669584, "learning_rate": 4.256360486361889e-06, "loss": 0.0172, "step": 2771 }, { "epoch": 1.2611464968152866, "grad_norm": 0.3706477106784628, "learning_rate": 4.255851842414887e-06, "loss": 0.0121, "step": 2772 }, { "epoch": 1.2616014558689717, "grad_norm": 1.0861459812552854, "learning_rate": 4.255343054987601e-06, "loss": 0.048, "step": 2773 }, { "epoch": 1.262056414922657, "grad_norm": 0.4626982196257253, "learning_rate": 4.2548341241216085e-06, "loss": 0.0123, "step": 2774 }, { "epoch": 1.262511373976342, "grad_norm": 0.5255340575042864, "learning_rate": 4.254325049858496e-06, "loss": 0.0225, "step": 2775 }, { "epoch": 1.2629663330300274, "grad_norm": 0.4881043727343809, "learning_rate": 4.2538158322398625e-06, "loss": 0.0189, "step": 2776 }, { "epoch": 1.2634212920837125, "grad_norm": 0.43273322558327976, "learning_rate": 4.2533064713073195e-06, "loss": 0.0158, "step": 2777 }, { "epoch": 1.2638762511373978, "grad_norm": 0.4857284264595445, "learning_rate": 4.252796967102489e-06, "loss": 0.0193, "step": 2778 }, { "epoch": 1.2643312101910829, "grad_norm": 0.6384142420021423, "learning_rate": 4.2522873196670065e-06, "loss": 0.0277, "step": 2779 }, { "epoch": 1.264786169244768, "grad_norm": 0.589124877695731, "learning_rate": 4.2517775290425175e-06, "loss": 0.015, "step": 2780 }, { "epoch": 1.2652411282984533, "grad_norm": 0.6763414782155324, "learning_rate": 4.251267595270681e-06, "loss": 0.0361, "step": 2781 }, { "epoch": 1.2656960873521383, "grad_norm": 0.8967701103335196, "learning_rate": 4.250757518393163e-06, "loss": 0.0345, "step": 2782 }, { "epoch": 1.2661510464058234, "grad_norm": 0.7419598990931897, "learning_rate": 4.250247298451649e-06, "loss": 0.0304, "step": 2783 }, { "epoch": 1.2666060054595087, "grad_norm": 0.5353914883857681, "learning_rate": 4.249736935487828e-06, "loss": 0.0207, "step": 2784 }, { "epoch": 1.2670609645131938, "grad_norm": 0.5717840719643298, "learning_rate": 4.249226429543408e-06, "loss": 0.0241, "step": 2785 }, { "epoch": 1.267515923566879, "grad_norm": 0.7539610092518788, "learning_rate": 4.248715780660102e-06, "loss": 0.0308, "step": 2786 }, { "epoch": 1.2679708826205642, "grad_norm": 0.7217352721088893, "learning_rate": 4.2482049888796405e-06, "loss": 0.0253, "step": 2787 }, { "epoch": 1.2684258416742493, "grad_norm": 0.4520410375502876, "learning_rate": 4.247694054243762e-06, "loss": 0.0166, "step": 2788 }, { "epoch": 1.2688808007279344, "grad_norm": 0.4556016017243014, "learning_rate": 4.247182976794218e-06, "loss": 0.018, "step": 2789 }, { "epoch": 1.2693357597816197, "grad_norm": 0.6363419978670682, "learning_rate": 4.246671756572771e-06, "loss": 0.029, "step": 2790 }, { "epoch": 1.2697907188353048, "grad_norm": 0.6386809769247306, "learning_rate": 4.246160393621197e-06, "loss": 0.0237, "step": 2791 }, { "epoch": 1.27024567788899, "grad_norm": 0.4990900576925178, "learning_rate": 4.2456488879812805e-06, "loss": 0.0156, "step": 2792 }, { "epoch": 1.2707006369426752, "grad_norm": 0.6029372232546744, "learning_rate": 4.24513723969482e-06, "loss": 0.0242, "step": 2793 }, { "epoch": 1.2711555959963603, "grad_norm": 0.8051049668072983, "learning_rate": 4.244625448803625e-06, "loss": 0.0349, "step": 2794 }, { "epoch": 1.2716105550500454, "grad_norm": 0.5721627664779094, "learning_rate": 4.244113515349517e-06, "loss": 0.0317, "step": 2795 }, { "epoch": 1.2720655141037307, "grad_norm": 0.38177683285724745, "learning_rate": 4.243601439374329e-06, "loss": 0.0109, "step": 2796 }, { "epoch": 1.2725204731574158, "grad_norm": 0.3860734101533411, "learning_rate": 4.243089220919906e-06, "loss": 0.0177, "step": 2797 }, { "epoch": 1.2729754322111009, "grad_norm": 0.49565665719286456, "learning_rate": 4.242576860028103e-06, "loss": 0.0152, "step": 2798 }, { "epoch": 1.2734303912647862, "grad_norm": 0.6787639424386418, "learning_rate": 4.242064356740789e-06, "loss": 0.0222, "step": 2799 }, { "epoch": 1.2738853503184713, "grad_norm": 0.42802741447037806, "learning_rate": 4.2415517110998415e-06, "loss": 0.0136, "step": 2800 }, { "epoch": 1.2743403093721566, "grad_norm": 2.411461244356516, "learning_rate": 4.241038923147155e-06, "loss": 0.0668, "step": 2801 }, { "epoch": 1.2747952684258417, "grad_norm": 0.5980800042116179, "learning_rate": 4.240525992924629e-06, "loss": 0.0251, "step": 2802 }, { "epoch": 1.2752502274795268, "grad_norm": 0.5919404298777446, "learning_rate": 4.240012920474179e-06, "loss": 0.0265, "step": 2803 }, { "epoch": 1.275705186533212, "grad_norm": 0.6697149704004297, "learning_rate": 4.239499705837731e-06, "loss": 0.0221, "step": 2804 }, { "epoch": 1.2761601455868972, "grad_norm": 0.5180009025750582, "learning_rate": 4.238986349057223e-06, "loss": 0.0176, "step": 2805 }, { "epoch": 1.2766151046405825, "grad_norm": 0.7241572161326929, "learning_rate": 4.238472850174603e-06, "loss": 0.0307, "step": 2806 }, { "epoch": 1.2770700636942676, "grad_norm": 0.7167298252754748, "learning_rate": 4.2379592092318326e-06, "loss": 0.0311, "step": 2807 }, { "epoch": 1.2775250227479527, "grad_norm": 0.9125971645241219, "learning_rate": 4.237445426270884e-06, "loss": 0.0368, "step": 2808 }, { "epoch": 1.277979981801638, "grad_norm": 0.6955622824211942, "learning_rate": 4.236931501333742e-06, "loss": 0.0401, "step": 2809 }, { "epoch": 1.278434940855323, "grad_norm": 0.6822684286332794, "learning_rate": 4.236417434462401e-06, "loss": 0.0408, "step": 2810 }, { "epoch": 1.2788898999090081, "grad_norm": 0.5716282558922546, "learning_rate": 4.23590322569887e-06, "loss": 0.0276, "step": 2811 }, { "epoch": 1.2793448589626935, "grad_norm": 0.8045248725656529, "learning_rate": 4.2353888750851655e-06, "loss": 0.0342, "step": 2812 }, { "epoch": 1.2797998180163785, "grad_norm": 0.5829303425508636, "learning_rate": 4.2348743826633195e-06, "loss": 0.0228, "step": 2813 }, { "epoch": 1.2802547770700636, "grad_norm": 0.3606406351198297, "learning_rate": 4.234359748475374e-06, "loss": 0.0095, "step": 2814 }, { "epoch": 1.280709736123749, "grad_norm": 0.5750903549771339, "learning_rate": 4.233844972563382e-06, "loss": 0.0211, "step": 2815 }, { "epoch": 1.281164695177434, "grad_norm": 0.5079770252768003, "learning_rate": 4.233330054969409e-06, "loss": 0.0187, "step": 2816 }, { "epoch": 1.2816196542311191, "grad_norm": 0.5957559449093126, "learning_rate": 4.23281499573553e-06, "loss": 0.0221, "step": 2817 }, { "epoch": 1.2820746132848044, "grad_norm": 0.4741096882523198, "learning_rate": 4.232299794903837e-06, "loss": 0.0216, "step": 2818 }, { "epoch": 1.2825295723384895, "grad_norm": 0.5565236749852849, "learning_rate": 4.2317844525164265e-06, "loss": 0.0199, "step": 2819 }, { "epoch": 1.2829845313921746, "grad_norm": 0.6913898026559677, "learning_rate": 4.2312689686154115e-06, "loss": 0.0275, "step": 2820 }, { "epoch": 1.28343949044586, "grad_norm": 0.5448158752536557, "learning_rate": 4.230753343242915e-06, "loss": 0.0198, "step": 2821 }, { "epoch": 1.283894449499545, "grad_norm": 0.5785731155407168, "learning_rate": 4.230237576441071e-06, "loss": 0.0223, "step": 2822 }, { "epoch": 1.28434940855323, "grad_norm": 0.5022779878541892, "learning_rate": 4.229721668252026e-06, "loss": 0.0214, "step": 2823 }, { "epoch": 1.2848043676069154, "grad_norm": 0.654337012356161, "learning_rate": 4.2292056187179374e-06, "loss": 0.0309, "step": 2824 }, { "epoch": 1.2852593266606005, "grad_norm": 0.6919914477024262, "learning_rate": 4.228689427880975e-06, "loss": 0.028, "step": 2825 }, { "epoch": 1.2857142857142856, "grad_norm": 0.4782749797029657, "learning_rate": 4.228173095783319e-06, "loss": 0.0158, "step": 2826 }, { "epoch": 1.286169244767971, "grad_norm": 0.6918834440829104, "learning_rate": 4.227656622467162e-06, "loss": 0.0294, "step": 2827 }, { "epoch": 1.286624203821656, "grad_norm": 0.6095753344564729, "learning_rate": 4.2271400079747085e-06, "loss": 0.0189, "step": 2828 }, { "epoch": 1.2870791628753413, "grad_norm": 0.6000891224876131, "learning_rate": 4.2266232523481724e-06, "loss": 0.0183, "step": 2829 }, { "epoch": 1.2875341219290264, "grad_norm": 0.5782268417165628, "learning_rate": 4.226106355629781e-06, "loss": 0.0217, "step": 2830 }, { "epoch": 1.2879890809827115, "grad_norm": 0.672780854857564, "learning_rate": 4.225589317861775e-06, "loss": 0.0237, "step": 2831 }, { "epoch": 1.2884440400363968, "grad_norm": 0.421513804128696, "learning_rate": 4.225072139086401e-06, "loss": 0.016, "step": 2832 }, { "epoch": 1.2888989990900819, "grad_norm": 0.6961950913783308, "learning_rate": 4.224554819345923e-06, "loss": 0.0322, "step": 2833 }, { "epoch": 1.2893539581437672, "grad_norm": 0.6627847441657934, "learning_rate": 4.224037358682614e-06, "loss": 0.033, "step": 2834 }, { "epoch": 1.2898089171974523, "grad_norm": 0.4109901835435224, "learning_rate": 4.223519757138756e-06, "loss": 0.0159, "step": 2835 }, { "epoch": 1.2902638762511374, "grad_norm": 0.48996513048334367, "learning_rate": 4.223002014756647e-06, "loss": 0.0118, "step": 2836 }, { "epoch": 1.2907188353048227, "grad_norm": 0.5197825144686319, "learning_rate": 4.222484131578595e-06, "loss": 0.018, "step": 2837 }, { "epoch": 1.2911737943585078, "grad_norm": 0.6196176741621322, "learning_rate": 4.221966107646918e-06, "loss": 0.0215, "step": 2838 }, { "epoch": 1.2916287534121929, "grad_norm": 0.4809197909639679, "learning_rate": 4.221447943003947e-06, "loss": 0.0182, "step": 2839 }, { "epoch": 1.2920837124658782, "grad_norm": 0.5637491655107516, "learning_rate": 4.2209296376920254e-06, "loss": 0.0241, "step": 2840 }, { "epoch": 1.2925386715195633, "grad_norm": 0.5091598497989657, "learning_rate": 4.220411191753504e-06, "loss": 0.0206, "step": 2841 }, { "epoch": 1.2929936305732483, "grad_norm": 0.5260279573652854, "learning_rate": 4.21989260523075e-06, "loss": 0.0227, "step": 2842 }, { "epoch": 1.2934485896269337, "grad_norm": 0.533114008746521, "learning_rate": 4.219373878166139e-06, "loss": 0.0241, "step": 2843 }, { "epoch": 1.2939035486806187, "grad_norm": 0.715496752958288, "learning_rate": 4.21885501060206e-06, "loss": 0.0332, "step": 2844 }, { "epoch": 1.2943585077343038, "grad_norm": 0.7190547999653241, "learning_rate": 4.21833600258091e-06, "loss": 0.0218, "step": 2845 }, { "epoch": 1.2948134667879891, "grad_norm": 0.8834194627984504, "learning_rate": 4.217816854145103e-06, "loss": 0.065, "step": 2846 }, { "epoch": 1.2952684258416742, "grad_norm": 0.559679452911507, "learning_rate": 4.2172975653370605e-06, "loss": 0.0192, "step": 2847 }, { "epoch": 1.2957233848953593, "grad_norm": 0.6757787650161827, "learning_rate": 4.216778136199216e-06, "loss": 0.0324, "step": 2848 }, { "epoch": 1.2961783439490446, "grad_norm": 0.6089214602754296, "learning_rate": 4.216258566774015e-06, "loss": 0.0236, "step": 2849 }, { "epoch": 1.2966333030027297, "grad_norm": 0.7300661825007071, "learning_rate": 4.215738857103915e-06, "loss": 0.0348, "step": 2850 }, { "epoch": 1.2970882620564148, "grad_norm": 0.5680795748113346, "learning_rate": 4.215219007231382e-06, "loss": 0.0237, "step": 2851 }, { "epoch": 1.2975432211101001, "grad_norm": 0.7173691689634865, "learning_rate": 4.214699017198899e-06, "loss": 0.0239, "step": 2852 }, { "epoch": 1.2979981801637852, "grad_norm": 0.5947468711182669, "learning_rate": 4.214178887048956e-06, "loss": 0.0223, "step": 2853 }, { "epoch": 1.2984531392174703, "grad_norm": 0.6420155276039037, "learning_rate": 4.213658616824055e-06, "loss": 0.0326, "step": 2854 }, { "epoch": 1.2989080982711556, "grad_norm": 0.6390694990330569, "learning_rate": 4.213138206566711e-06, "loss": 0.0273, "step": 2855 }, { "epoch": 1.2993630573248407, "grad_norm": 0.44274048534178945, "learning_rate": 4.21261765631945e-06, "loss": 0.0197, "step": 2856 }, { "epoch": 1.299818016378526, "grad_norm": 0.661916330405343, "learning_rate": 4.212096966124807e-06, "loss": 0.0311, "step": 2857 }, { "epoch": 1.300272975432211, "grad_norm": 0.6435231707439829, "learning_rate": 4.2115761360253325e-06, "loss": 0.0263, "step": 2858 }, { "epoch": 1.3007279344858962, "grad_norm": 0.5981565976648671, "learning_rate": 4.211055166063585e-06, "loss": 0.0198, "step": 2859 }, { "epoch": 1.3011828935395815, "grad_norm": 0.561549400448317, "learning_rate": 4.210534056282136e-06, "loss": 0.0145, "step": 2860 }, { "epoch": 1.3016378525932666, "grad_norm": 0.518443153784379, "learning_rate": 4.21001280672357e-06, "loss": 0.0203, "step": 2861 }, { "epoch": 1.302092811646952, "grad_norm": 0.5417281859185272, "learning_rate": 4.209491417430479e-06, "loss": 0.0254, "step": 2862 }, { "epoch": 1.302547770700637, "grad_norm": 0.6510257580750398, "learning_rate": 4.208969888445469e-06, "loss": 0.0258, "step": 2863 }, { "epoch": 1.303002729754322, "grad_norm": 1.2628676554442981, "learning_rate": 4.208448219811158e-06, "loss": 0.03, "step": 2864 }, { "epoch": 1.3034576888080074, "grad_norm": 0.5017599571522583, "learning_rate": 4.207926411570172e-06, "loss": 0.0188, "step": 2865 }, { "epoch": 1.3039126478616925, "grad_norm": 0.46506741747107516, "learning_rate": 4.207404463765155e-06, "loss": 0.0247, "step": 2866 }, { "epoch": 1.3043676069153776, "grad_norm": 0.6077204885385615, "learning_rate": 4.2068823764387545e-06, "loss": 0.0329, "step": 2867 }, { "epoch": 1.3048225659690629, "grad_norm": 0.5556402652600926, "learning_rate": 4.206360149633635e-06, "loss": 0.0274, "step": 2868 }, { "epoch": 1.305277525022748, "grad_norm": 0.5553997725592712, "learning_rate": 4.205837783392469e-06, "loss": 0.0228, "step": 2869 }, { "epoch": 1.305732484076433, "grad_norm": 0.5961926358326571, "learning_rate": 4.205315277757943e-06, "loss": 0.0288, "step": 2870 }, { "epoch": 1.3061874431301184, "grad_norm": 0.7022966753338193, "learning_rate": 4.204792632772754e-06, "loss": 0.0368, "step": 2871 }, { "epoch": 1.3066424021838035, "grad_norm": 0.553176305683213, "learning_rate": 4.204269848479611e-06, "loss": 0.0166, "step": 2872 }, { "epoch": 1.3070973612374885, "grad_norm": 0.6673883943643991, "learning_rate": 4.203746924921231e-06, "loss": 0.0182, "step": 2873 }, { "epoch": 1.3075523202911739, "grad_norm": 0.4250980507545852, "learning_rate": 4.203223862140347e-06, "loss": 0.0146, "step": 2874 }, { "epoch": 1.308007279344859, "grad_norm": 0.584977310068724, "learning_rate": 4.2027006601797e-06, "loss": 0.0255, "step": 2875 }, { "epoch": 1.308462238398544, "grad_norm": 0.6022594998733989, "learning_rate": 4.202177319082045e-06, "loss": 0.0242, "step": 2876 }, { "epoch": 1.3089171974522293, "grad_norm": 0.7502986494718045, "learning_rate": 4.201653838890146e-06, "loss": 0.0347, "step": 2877 }, { "epoch": 1.3093721565059144, "grad_norm": 0.5980053178715274, "learning_rate": 4.20113021964678e-06, "loss": 0.0196, "step": 2878 }, { "epoch": 1.3098271155595995, "grad_norm": 0.4406338596368606, "learning_rate": 4.200606461394735e-06, "loss": 0.0133, "step": 2879 }, { "epoch": 1.3102820746132848, "grad_norm": 1.149413207067457, "learning_rate": 4.200082564176809e-06, "loss": 0.0543, "step": 2880 }, { "epoch": 1.31073703366697, "grad_norm": 0.5947980427387766, "learning_rate": 4.199558528035814e-06, "loss": 0.021, "step": 2881 }, { "epoch": 1.311191992720655, "grad_norm": 0.8151453929654713, "learning_rate": 4.199034353014572e-06, "loss": 0.0377, "step": 2882 }, { "epoch": 1.3116469517743403, "grad_norm": 1.068155005333274, "learning_rate": 4.198510039155914e-06, "loss": 0.068, "step": 2883 }, { "epoch": 1.3121019108280254, "grad_norm": 0.5572646955872677, "learning_rate": 4.197985586502686e-06, "loss": 0.019, "step": 2884 }, { "epoch": 1.3125568698817107, "grad_norm": 0.7818019627481866, "learning_rate": 4.197460995097745e-06, "loss": 0.0321, "step": 2885 }, { "epoch": 1.3130118289353958, "grad_norm": 0.7089080261247177, "learning_rate": 4.1969362649839565e-06, "loss": 0.0289, "step": 2886 }, { "epoch": 1.3134667879890811, "grad_norm": 0.3919993320458887, "learning_rate": 4.1964113962042e-06, "loss": 0.0154, "step": 2887 }, { "epoch": 1.3139217470427662, "grad_norm": 0.6936353932870721, "learning_rate": 4.195886388801364e-06, "loss": 0.0288, "step": 2888 }, { "epoch": 1.3143767060964513, "grad_norm": 0.604647177949053, "learning_rate": 4.195361242818354e-06, "loss": 0.0293, "step": 2889 }, { "epoch": 1.3148316651501366, "grad_norm": 0.7531452935186627, "learning_rate": 4.194835958298076e-06, "loss": 0.043, "step": 2890 }, { "epoch": 1.3152866242038217, "grad_norm": 0.4066027317342002, "learning_rate": 4.194310535283459e-06, "loss": 0.013, "step": 2891 }, { "epoch": 1.3157415832575068, "grad_norm": 0.6284459518815321, "learning_rate": 4.193784973817436e-06, "loss": 0.0176, "step": 2892 }, { "epoch": 1.316196542311192, "grad_norm": 0.6866940604048498, "learning_rate": 4.193259273942954e-06, "loss": 0.0344, "step": 2893 }, { "epoch": 1.3166515013648772, "grad_norm": 0.4840883086023792, "learning_rate": 4.192733435702971e-06, "loss": 0.022, "step": 2894 }, { "epoch": 1.3171064604185623, "grad_norm": 0.5536571277591115, "learning_rate": 4.192207459140456e-06, "loss": 0.0206, "step": 2895 }, { "epoch": 1.3175614194722476, "grad_norm": 0.7165221835225527, "learning_rate": 4.1916813442983895e-06, "loss": 0.0298, "step": 2896 }, { "epoch": 1.3180163785259327, "grad_norm": 0.6305232437397477, "learning_rate": 4.191155091219763e-06, "loss": 0.0267, "step": 2897 }, { "epoch": 1.3184713375796178, "grad_norm": 0.5579202333359629, "learning_rate": 4.1906286999475785e-06, "loss": 0.0291, "step": 2898 }, { "epoch": 1.318926296633303, "grad_norm": 0.5392021658327838, "learning_rate": 4.190102170524853e-06, "loss": 0.0242, "step": 2899 }, { "epoch": 1.3193812556869882, "grad_norm": 0.5800078554925919, "learning_rate": 4.18957550299461e-06, "loss": 0.0242, "step": 2900 }, { "epoch": 1.3198362147406733, "grad_norm": 0.5862214666422206, "learning_rate": 4.189048697399887e-06, "loss": 0.0215, "step": 2901 }, { "epoch": 1.3202911737943586, "grad_norm": 0.5620076241035448, "learning_rate": 4.188521753783732e-06, "loss": 0.0171, "step": 2902 }, { "epoch": 1.3207461328480437, "grad_norm": 0.6812119340304829, "learning_rate": 4.187994672189205e-06, "loss": 0.0233, "step": 2903 }, { "epoch": 1.3212010919017287, "grad_norm": 0.5812406311069044, "learning_rate": 4.187467452659376e-06, "loss": 0.0344, "step": 2904 }, { "epoch": 1.321656050955414, "grad_norm": 0.7247748495422082, "learning_rate": 4.186940095237327e-06, "loss": 0.0342, "step": 2905 }, { "epoch": 1.3221110100090991, "grad_norm": 0.4582523888047295, "learning_rate": 4.186412599966152e-06, "loss": 0.025, "step": 2906 }, { "epoch": 1.3225659690627842, "grad_norm": 0.4205471072755918, "learning_rate": 4.185884966888954e-06, "loss": 0.0139, "step": 2907 }, { "epoch": 1.3230209281164695, "grad_norm": 0.45492352353869137, "learning_rate": 4.185357196048852e-06, "loss": 0.0163, "step": 2908 }, { "epoch": 1.3234758871701546, "grad_norm": 0.5532805935798129, "learning_rate": 4.1848292874889694e-06, "loss": 0.0244, "step": 2909 }, { "epoch": 1.3239308462238397, "grad_norm": 0.5324603819929723, "learning_rate": 4.184301241252447e-06, "loss": 0.0208, "step": 2910 }, { "epoch": 1.324385805277525, "grad_norm": 0.34423546386086223, "learning_rate": 4.183773057382432e-06, "loss": 0.0147, "step": 2911 }, { "epoch": 1.3248407643312101, "grad_norm": 0.615818224239299, "learning_rate": 4.183244735922087e-06, "loss": 0.0218, "step": 2912 }, { "epoch": 1.3252957233848954, "grad_norm": 0.6549136207399878, "learning_rate": 4.182716276914585e-06, "loss": 0.0217, "step": 2913 }, { "epoch": 1.3257506824385805, "grad_norm": 0.5738916314899165, "learning_rate": 4.182187680403107e-06, "loss": 0.0162, "step": 2914 }, { "epoch": 1.3262056414922658, "grad_norm": 0.5594899471027078, "learning_rate": 4.181658946430848e-06, "loss": 0.0245, "step": 2915 }, { "epoch": 1.326660600545951, "grad_norm": 0.4109916930431615, "learning_rate": 4.181130075041015e-06, "loss": 0.0137, "step": 2916 }, { "epoch": 1.327115559599636, "grad_norm": 0.53422383536034, "learning_rate": 4.180601066276824e-06, "loss": 0.0216, "step": 2917 }, { "epoch": 1.3275705186533213, "grad_norm": 0.5383134563032649, "learning_rate": 4.180071920181503e-06, "loss": 0.0177, "step": 2918 }, { "epoch": 1.3280254777070064, "grad_norm": 0.5715076816892543, "learning_rate": 4.179542636798292e-06, "loss": 0.0179, "step": 2919 }, { "epoch": 1.3284804367606915, "grad_norm": 0.5437336115867218, "learning_rate": 4.1790132161704415e-06, "loss": 0.0211, "step": 2920 }, { "epoch": 1.3289353958143768, "grad_norm": 0.5527858593958116, "learning_rate": 4.178483658341213e-06, "loss": 0.0186, "step": 2921 }, { "epoch": 1.329390354868062, "grad_norm": 0.9925849620548352, "learning_rate": 4.17795396335388e-06, "loss": 0.047, "step": 2922 }, { "epoch": 1.329845313921747, "grad_norm": 0.7673987404091294, "learning_rate": 4.177424131251728e-06, "loss": 0.0361, "step": 2923 }, { "epoch": 1.3303002729754323, "grad_norm": 0.54250801803978, "learning_rate": 4.17689416207805e-06, "loss": 0.0199, "step": 2924 }, { "epoch": 1.3307552320291174, "grad_norm": 0.5272091207239011, "learning_rate": 4.176364055876154e-06, "loss": 0.0151, "step": 2925 }, { "epoch": 1.3312101910828025, "grad_norm": 0.5716765753458435, "learning_rate": 4.175833812689357e-06, "loss": 0.0194, "step": 2926 }, { "epoch": 1.3316651501364878, "grad_norm": 0.5032428892240143, "learning_rate": 4.17530343256099e-06, "loss": 0.0183, "step": 2927 }, { "epoch": 1.3321201091901729, "grad_norm": 0.913143929461543, "learning_rate": 4.174772915534392e-06, "loss": 0.0339, "step": 2928 }, { "epoch": 1.332575068243858, "grad_norm": 0.7146053626191031, "learning_rate": 4.174242261652914e-06, "loss": 0.034, "step": 2929 }, { "epoch": 1.3330300272975433, "grad_norm": 0.44685473112764684, "learning_rate": 4.173711470959919e-06, "loss": 0.0167, "step": 2930 }, { "epoch": 1.3334849863512284, "grad_norm": 0.5980701079569753, "learning_rate": 4.173180543498782e-06, "loss": 0.0276, "step": 2931 }, { "epoch": 1.3339399454049135, "grad_norm": 0.5027377779697596, "learning_rate": 4.1726494793128864e-06, "loss": 0.014, "step": 2932 }, { "epoch": 1.3343949044585988, "grad_norm": 0.5163509304756547, "learning_rate": 4.172118278445629e-06, "loss": 0.0201, "step": 2933 }, { "epoch": 1.3348498635122839, "grad_norm": 0.7718540702464267, "learning_rate": 4.171586940940417e-06, "loss": 0.0439, "step": 2934 }, { "epoch": 1.335304822565969, "grad_norm": 0.6284032907303854, "learning_rate": 4.171055466840669e-06, "loss": 0.0232, "step": 2935 }, { "epoch": 1.3357597816196543, "grad_norm": 1.0175122945856574, "learning_rate": 4.1705238561898144e-06, "loss": 0.0351, "step": 2936 }, { "epoch": 1.3362147406733393, "grad_norm": 0.46586902818145487, "learning_rate": 4.169992109031295e-06, "loss": 0.0155, "step": 2937 }, { "epoch": 1.3366696997270244, "grad_norm": 0.44893772743350846, "learning_rate": 4.169460225408562e-06, "loss": 0.0199, "step": 2938 }, { "epoch": 1.3371246587807097, "grad_norm": 0.8418229866849047, "learning_rate": 4.1689282053650786e-06, "loss": 0.0363, "step": 2939 }, { "epoch": 1.3375796178343948, "grad_norm": 0.4579693163374815, "learning_rate": 4.168396048944318e-06, "loss": 0.0224, "step": 2940 }, { "epoch": 1.3380345768880801, "grad_norm": 0.4215116697549558, "learning_rate": 4.167863756189767e-06, "loss": 0.0189, "step": 2941 }, { "epoch": 1.3384895359417652, "grad_norm": 0.4819574773962544, "learning_rate": 4.167331327144924e-06, "loss": 0.0192, "step": 2942 }, { "epoch": 1.3389444949954505, "grad_norm": 0.6590169559925403, "learning_rate": 4.166798761853291e-06, "loss": 0.0302, "step": 2943 }, { "epoch": 1.3393994540491356, "grad_norm": 0.5837457001514973, "learning_rate": 4.1662660603583936e-06, "loss": 0.0244, "step": 2944 }, { "epoch": 1.3398544131028207, "grad_norm": 0.6015904457920979, "learning_rate": 4.165733222703757e-06, "loss": 0.0311, "step": 2945 }, { "epoch": 1.340309372156506, "grad_norm": 0.685253409310759, "learning_rate": 4.165200248932923e-06, "loss": 0.02, "step": 2946 }, { "epoch": 1.3407643312101911, "grad_norm": 0.7220938837370316, "learning_rate": 4.164667139089446e-06, "loss": 0.0345, "step": 2947 }, { "epoch": 1.3412192902638762, "grad_norm": 0.4504885628402017, "learning_rate": 4.164133893216888e-06, "loss": 0.0161, "step": 2948 }, { "epoch": 1.3416742493175615, "grad_norm": 0.654778082115673, "learning_rate": 4.163600511358823e-06, "loss": 0.033, "step": 2949 }, { "epoch": 1.3421292083712466, "grad_norm": 0.5883225971322004, "learning_rate": 4.163066993558837e-06, "loss": 0.0314, "step": 2950 }, { "epoch": 1.3425841674249317, "grad_norm": 0.5896302652046609, "learning_rate": 4.1625333398605265e-06, "loss": 0.0257, "step": 2951 }, { "epoch": 1.343039126478617, "grad_norm": 0.6116556629810576, "learning_rate": 4.1619995503075e-06, "loss": 0.0232, "step": 2952 }, { "epoch": 1.343494085532302, "grad_norm": 0.7747409038134005, "learning_rate": 4.161465624943375e-06, "loss": 0.0283, "step": 2953 }, { "epoch": 1.3439490445859872, "grad_norm": 0.3783262451434524, "learning_rate": 4.1609315638117825e-06, "loss": 0.0133, "step": 2954 }, { "epoch": 1.3444040036396725, "grad_norm": 0.6903780649459388, "learning_rate": 4.160397366956364e-06, "loss": 0.039, "step": 2955 }, { "epoch": 1.3448589626933576, "grad_norm": 0.468325448617072, "learning_rate": 4.1598630344207705e-06, "loss": 0.0175, "step": 2956 }, { "epoch": 1.3453139217470427, "grad_norm": 0.60765527474527, "learning_rate": 4.159328566248665e-06, "loss": 0.0213, "step": 2957 }, { "epoch": 1.345768880800728, "grad_norm": 0.7428248623031719, "learning_rate": 4.1587939624837225e-06, "loss": 0.0241, "step": 2958 }, { "epoch": 1.346223839854413, "grad_norm": 0.4888241354818987, "learning_rate": 4.15825922316963e-06, "loss": 0.0149, "step": 2959 }, { "epoch": 1.3466787989080982, "grad_norm": 0.5484484705392908, "learning_rate": 4.15772434835008e-06, "loss": 0.0196, "step": 2960 }, { "epoch": 1.3471337579617835, "grad_norm": 1.0565925914634293, "learning_rate": 4.157189338068785e-06, "loss": 0.0413, "step": 2961 }, { "epoch": 1.3475887170154686, "grad_norm": 0.7341973618688173, "learning_rate": 4.156654192369459e-06, "loss": 0.0312, "step": 2962 }, { "epoch": 1.3480436760691537, "grad_norm": 0.4332535942804454, "learning_rate": 4.156118911295835e-06, "loss": 0.0103, "step": 2963 }, { "epoch": 1.348498635122839, "grad_norm": 0.516453626596964, "learning_rate": 4.155583494891651e-06, "loss": 0.0244, "step": 2964 }, { "epoch": 1.348953594176524, "grad_norm": 0.46094446618817314, "learning_rate": 4.155047943200663e-06, "loss": 0.0177, "step": 2965 }, { "epoch": 1.3494085532302094, "grad_norm": 0.5415622725687742, "learning_rate": 4.154512256266629e-06, "loss": 0.0182, "step": 2966 }, { "epoch": 1.3498635122838945, "grad_norm": 0.39310563760094946, "learning_rate": 4.153976434133327e-06, "loss": 0.0118, "step": 2967 }, { "epoch": 1.3503184713375795, "grad_norm": 0.41188508813446434, "learning_rate": 4.153440476844539e-06, "loss": 0.0128, "step": 2968 }, { "epoch": 1.3507734303912649, "grad_norm": 0.7427890993395497, "learning_rate": 4.1529043844440616e-06, "loss": 0.038, "step": 2969 }, { "epoch": 1.35122838944495, "grad_norm": 0.7612641381400927, "learning_rate": 4.1523681569757035e-06, "loss": 0.0392, "step": 2970 }, { "epoch": 1.3516833484986353, "grad_norm": 0.5867693257944069, "learning_rate": 4.151831794483281e-06, "loss": 0.0277, "step": 2971 }, { "epoch": 1.3521383075523203, "grad_norm": 0.44393405116337514, "learning_rate": 4.151295297010623e-06, "loss": 0.0168, "step": 2972 }, { "epoch": 1.3525932666060054, "grad_norm": 0.8173172206406042, "learning_rate": 4.150758664601572e-06, "loss": 0.0417, "step": 2973 }, { "epoch": 1.3530482256596907, "grad_norm": 0.5438123846748695, "learning_rate": 4.1502218972999765e-06, "loss": 0.0285, "step": 2974 }, { "epoch": 1.3535031847133758, "grad_norm": 0.5512371839504099, "learning_rate": 4.1496849951497005e-06, "loss": 0.0211, "step": 2975 }, { "epoch": 1.353958143767061, "grad_norm": 0.6338857840768191, "learning_rate": 4.149147958194617e-06, "loss": 0.0191, "step": 2976 }, { "epoch": 1.3544131028207462, "grad_norm": 0.5430184438020733, "learning_rate": 4.1486107864786095e-06, "loss": 0.0225, "step": 2977 }, { "epoch": 1.3548680618744313, "grad_norm": 0.5515766025691744, "learning_rate": 4.148073480045573e-06, "loss": 0.0178, "step": 2978 }, { "epoch": 1.3553230209281164, "grad_norm": 0.6143856897658074, "learning_rate": 4.147536038939416e-06, "loss": 0.0245, "step": 2979 }, { "epoch": 1.3557779799818017, "grad_norm": 0.52410413088212, "learning_rate": 4.146998463204053e-06, "loss": 0.014, "step": 2980 }, { "epoch": 1.3562329390354868, "grad_norm": 0.8689970476022745, "learning_rate": 4.146460752883413e-06, "loss": 0.0378, "step": 2981 }, { "epoch": 1.356687898089172, "grad_norm": 0.6157945347901961, "learning_rate": 4.145922908021436e-06, "loss": 0.0265, "step": 2982 }, { "epoch": 1.3571428571428572, "grad_norm": 0.593547987823192, "learning_rate": 4.145384928662072e-06, "loss": 0.0211, "step": 2983 }, { "epoch": 1.3575978161965423, "grad_norm": 0.624031958139668, "learning_rate": 4.144846814849282e-06, "loss": 0.0241, "step": 2984 }, { "epoch": 1.3580527752502274, "grad_norm": 0.7734232659670662, "learning_rate": 4.1443085666270375e-06, "loss": 0.0289, "step": 2985 }, { "epoch": 1.3585077343039127, "grad_norm": 0.9699839605495941, "learning_rate": 4.143770184039324e-06, "loss": 0.0493, "step": 2986 }, { "epoch": 1.3589626933575978, "grad_norm": 0.5096255057892579, "learning_rate": 4.143231667130134e-06, "loss": 0.0251, "step": 2987 }, { "epoch": 1.3594176524112829, "grad_norm": 0.5610719525460525, "learning_rate": 4.142693015943472e-06, "loss": 0.0161, "step": 2988 }, { "epoch": 1.3598726114649682, "grad_norm": 0.6484528506962219, "learning_rate": 4.142154230523356e-06, "loss": 0.0222, "step": 2989 }, { "epoch": 1.3603275705186533, "grad_norm": 0.5691271094961611, "learning_rate": 4.141615310913812e-06, "loss": 0.0201, "step": 2990 }, { "epoch": 1.3607825295723384, "grad_norm": 0.585470679430853, "learning_rate": 4.141076257158878e-06, "loss": 0.0206, "step": 2991 }, { "epoch": 1.3612374886260237, "grad_norm": 0.5886159920006029, "learning_rate": 4.1405370693026035e-06, "loss": 0.0273, "step": 2992 }, { "epoch": 1.3616924476797088, "grad_norm": 0.49346300298845935, "learning_rate": 4.139997747389049e-06, "loss": 0.0142, "step": 2993 }, { "epoch": 1.362147406733394, "grad_norm": 0.5623656737186806, "learning_rate": 4.139458291462283e-06, "loss": 0.0269, "step": 2994 }, { "epoch": 1.3626023657870792, "grad_norm": 0.6627238631761317, "learning_rate": 4.13891870156639e-06, "loss": 0.0437, "step": 2995 }, { "epoch": 1.3630573248407643, "grad_norm": 0.37964020676685056, "learning_rate": 4.138378977745462e-06, "loss": 0.0167, "step": 2996 }, { "epoch": 1.3635122838944496, "grad_norm": 0.5631408737685011, "learning_rate": 4.137839120043603e-06, "loss": 0.0218, "step": 2997 }, { "epoch": 1.3639672429481347, "grad_norm": 0.5831152022878606, "learning_rate": 4.137299128504928e-06, "loss": 0.0317, "step": 2998 }, { "epoch": 1.36442220200182, "grad_norm": 0.4861214216364132, "learning_rate": 4.136759003173561e-06, "loss": 0.0161, "step": 2999 }, { "epoch": 1.364877161055505, "grad_norm": 0.5358723278350062, "learning_rate": 4.136218744093641e-06, "loss": 0.0226, "step": 3000 }, { "epoch": 1.3653321201091901, "grad_norm": 0.669520131857986, "learning_rate": 4.1356783513093135e-06, "loss": 0.0358, "step": 3001 }, { "epoch": 1.3657870791628755, "grad_norm": 0.6459513757186216, "learning_rate": 4.135137824864738e-06, "loss": 0.025, "step": 3002 }, { "epoch": 1.3662420382165605, "grad_norm": 0.6336951581043813, "learning_rate": 4.134597164804084e-06, "loss": 0.0191, "step": 3003 }, { "epoch": 1.3666969972702456, "grad_norm": 0.5180512978385537, "learning_rate": 4.134056371171531e-06, "loss": 0.0186, "step": 3004 }, { "epoch": 1.367151956323931, "grad_norm": 0.3722453745078167, "learning_rate": 4.1335154440112715e-06, "loss": 0.0101, "step": 3005 }, { "epoch": 1.367606915377616, "grad_norm": 0.3713778689222229, "learning_rate": 4.132974383367505e-06, "loss": 0.0127, "step": 3006 }, { "epoch": 1.3680618744313011, "grad_norm": 0.4990475777860669, "learning_rate": 4.1324331892844485e-06, "loss": 0.0184, "step": 3007 }, { "epoch": 1.3685168334849864, "grad_norm": 0.8304052852022975, "learning_rate": 4.131891861806322e-06, "loss": 0.0329, "step": 3008 }, { "epoch": 1.3689717925386715, "grad_norm": 0.4389683470350522, "learning_rate": 4.131350400977363e-06, "loss": 0.0166, "step": 3009 }, { "epoch": 1.3694267515923566, "grad_norm": 0.698803640296317, "learning_rate": 4.130808806841816e-06, "loss": 0.0252, "step": 3010 }, { "epoch": 1.369881710646042, "grad_norm": 1.250044162677878, "learning_rate": 4.130267079443939e-06, "loss": 0.0617, "step": 3011 }, { "epoch": 1.370336669699727, "grad_norm": 0.5158514736015647, "learning_rate": 4.129725218827997e-06, "loss": 0.0268, "step": 3012 }, { "epoch": 1.370791628753412, "grad_norm": 0.5535080428066416, "learning_rate": 4.1291832250382705e-06, "loss": 0.0347, "step": 3013 }, { "epoch": 1.3712465878070974, "grad_norm": 0.610394373477909, "learning_rate": 4.128641098119048e-06, "loss": 0.0247, "step": 3014 }, { "epoch": 1.3717015468607825, "grad_norm": 0.5633256737475365, "learning_rate": 4.128098838114631e-06, "loss": 0.0239, "step": 3015 }, { "epoch": 1.3721565059144676, "grad_norm": 0.5949892339415896, "learning_rate": 4.127556445069328e-06, "loss": 0.0196, "step": 3016 }, { "epoch": 1.372611464968153, "grad_norm": 0.4362212537455964, "learning_rate": 4.127013919027462e-06, "loss": 0.0186, "step": 3017 }, { "epoch": 1.373066424021838, "grad_norm": 0.4608698551673798, "learning_rate": 4.126471260033368e-06, "loss": 0.0168, "step": 3018 }, { "epoch": 1.373521383075523, "grad_norm": 0.522646641348721, "learning_rate": 4.125928468131387e-06, "loss": 0.0234, "step": 3019 }, { "epoch": 1.3739763421292084, "grad_norm": 0.6666047647535678, "learning_rate": 4.125385543365873e-06, "loss": 0.0341, "step": 3020 }, { "epoch": 1.3744313011828935, "grad_norm": 0.45568077571323495, "learning_rate": 4.124842485781194e-06, "loss": 0.0115, "step": 3021 }, { "epoch": 1.3748862602365788, "grad_norm": 0.5297520552882724, "learning_rate": 4.1242992954217234e-06, "loss": 0.0201, "step": 3022 }, { "epoch": 1.3753412192902639, "grad_norm": 0.4259156472572623, "learning_rate": 4.123755972331851e-06, "loss": 0.0169, "step": 3023 }, { "epoch": 1.3757961783439492, "grad_norm": 0.7451700916534765, "learning_rate": 4.123212516555972e-06, "loss": 0.0412, "step": 3024 }, { "epoch": 1.3762511373976343, "grad_norm": 0.7232624853864648, "learning_rate": 4.122668928138498e-06, "loss": 0.0305, "step": 3025 }, { "epoch": 1.3767060964513194, "grad_norm": 0.6950540512190239, "learning_rate": 4.122125207123846e-06, "loss": 0.0314, "step": 3026 }, { "epoch": 1.3771610555050047, "grad_norm": 0.3593225407829798, "learning_rate": 4.121581353556447e-06, "loss": 0.0123, "step": 3027 }, { "epoch": 1.3776160145586898, "grad_norm": 0.3906828489333325, "learning_rate": 4.121037367480744e-06, "loss": 0.0202, "step": 3028 }, { "epoch": 1.3780709736123748, "grad_norm": 0.48824144662287894, "learning_rate": 4.120493248941188e-06, "loss": 0.0159, "step": 3029 }, { "epoch": 1.3785259326660602, "grad_norm": 0.5305250470220376, "learning_rate": 4.119948997982241e-06, "loss": 0.0178, "step": 3030 }, { "epoch": 1.3789808917197452, "grad_norm": 0.6167248818967296, "learning_rate": 4.119404614648378e-06, "loss": 0.0225, "step": 3031 }, { "epoch": 1.3794358507734303, "grad_norm": 0.8878066170438527, "learning_rate": 4.118860098984083e-06, "loss": 0.0535, "step": 3032 }, { "epoch": 1.3798908098271156, "grad_norm": 0.5195200604789711, "learning_rate": 4.118315451033851e-06, "loss": 0.0178, "step": 3033 }, { "epoch": 1.3803457688808007, "grad_norm": 0.582760689437918, "learning_rate": 4.117770670842189e-06, "loss": 0.0267, "step": 3034 }, { "epoch": 1.3808007279344858, "grad_norm": 0.48644656132929254, "learning_rate": 4.117225758453614e-06, "loss": 0.0142, "step": 3035 }, { "epoch": 1.3812556869881711, "grad_norm": 0.48520241105257794, "learning_rate": 4.116680713912652e-06, "loss": 0.0187, "step": 3036 }, { "epoch": 1.3817106460418562, "grad_norm": 0.6569899082539229, "learning_rate": 4.116135537263844e-06, "loss": 0.0299, "step": 3037 }, { "epoch": 1.3821656050955413, "grad_norm": 0.4241692854361547, "learning_rate": 4.115590228551738e-06, "loss": 0.0149, "step": 3038 }, { "epoch": 1.3826205641492266, "grad_norm": 0.48907194958347444, "learning_rate": 4.115044787820895e-06, "loss": 0.0177, "step": 3039 }, { "epoch": 1.3830755232029117, "grad_norm": 0.6439400567288324, "learning_rate": 4.114499215115885e-06, "loss": 0.0374, "step": 3040 }, { "epoch": 1.3835304822565968, "grad_norm": 0.5824862809523336, "learning_rate": 4.113953510481289e-06, "loss": 0.0228, "step": 3041 }, { "epoch": 1.3839854413102821, "grad_norm": 0.5755206805973067, "learning_rate": 4.113407673961702e-06, "loss": 0.0191, "step": 3042 }, { "epoch": 1.3844404003639672, "grad_norm": 0.5035093816887116, "learning_rate": 4.112861705601726e-06, "loss": 0.0207, "step": 3043 }, { "epoch": 1.3848953594176523, "grad_norm": 0.6764554424164136, "learning_rate": 4.112315605445975e-06, "loss": 0.0255, "step": 3044 }, { "epoch": 1.3853503184713376, "grad_norm": 0.6149217347847765, "learning_rate": 4.111769373539073e-06, "loss": 0.0287, "step": 3045 }, { "epoch": 1.3858052775250227, "grad_norm": 0.6663180017828199, "learning_rate": 4.1112230099256576e-06, "loss": 0.0212, "step": 3046 }, { "epoch": 1.3862602365787078, "grad_norm": 1.0598559044398572, "learning_rate": 4.1106765146503735e-06, "loss": 0.0272, "step": 3047 }, { "epoch": 1.386715195632393, "grad_norm": 0.4765126938927154, "learning_rate": 4.110129887757878e-06, "loss": 0.0168, "step": 3048 }, { "epoch": 1.3871701546860782, "grad_norm": 0.5485252582544047, "learning_rate": 4.10958312929284e-06, "loss": 0.0215, "step": 3049 }, { "epoch": 1.3876251137397635, "grad_norm": 0.6033054625797364, "learning_rate": 4.1090362392999376e-06, "loss": 0.0204, "step": 3050 }, { "epoch": 1.3880800727934486, "grad_norm": 0.7645773724183272, "learning_rate": 4.108489217823859e-06, "loss": 0.0463, "step": 3051 }, { "epoch": 1.388535031847134, "grad_norm": 0.5126384888627012, "learning_rate": 4.107942064909306e-06, "loss": 0.017, "step": 3052 }, { "epoch": 1.388989990900819, "grad_norm": 0.5517435339972416, "learning_rate": 4.107394780600989e-06, "loss": 0.0164, "step": 3053 }, { "epoch": 1.389444949954504, "grad_norm": 0.6894233207451937, "learning_rate": 4.10684736494363e-06, "loss": 0.0309, "step": 3054 }, { "epoch": 1.3898999090081894, "grad_norm": 0.6995203769861, "learning_rate": 4.10629981798196e-06, "loss": 0.0255, "step": 3055 }, { "epoch": 1.3903548680618745, "grad_norm": 0.6389147217213587, "learning_rate": 4.105752139760723e-06, "loss": 0.0289, "step": 3056 }, { "epoch": 1.3908098271155596, "grad_norm": 0.5052389232101024, "learning_rate": 4.105204330324673e-06, "loss": 0.0208, "step": 3057 }, { "epoch": 1.3912647861692449, "grad_norm": 0.49769601997533147, "learning_rate": 4.1046563897185736e-06, "loss": 0.0141, "step": 3058 }, { "epoch": 1.39171974522293, "grad_norm": 0.7519559246267947, "learning_rate": 4.104108317987201e-06, "loss": 0.0319, "step": 3059 }, { "epoch": 1.392174704276615, "grad_norm": 0.722697194866074, "learning_rate": 4.103560115175341e-06, "loss": 0.0389, "step": 3060 }, { "epoch": 1.3926296633303004, "grad_norm": 0.3738002744999205, "learning_rate": 4.103011781327789e-06, "loss": 0.0193, "step": 3061 }, { "epoch": 1.3930846223839854, "grad_norm": 0.5043059770286826, "learning_rate": 4.102463316489354e-06, "loss": 0.0201, "step": 3062 }, { "epoch": 1.3935395814376705, "grad_norm": 0.6670664153936168, "learning_rate": 4.101914720704854e-06, "loss": 0.0327, "step": 3063 }, { "epoch": 1.3939945404913558, "grad_norm": 0.6019451653378002, "learning_rate": 4.101365994019116e-06, "loss": 0.0243, "step": 3064 }, { "epoch": 1.394449499545041, "grad_norm": 0.9907449373657036, "learning_rate": 4.100817136476981e-06, "loss": 0.0383, "step": 3065 }, { "epoch": 1.394904458598726, "grad_norm": 0.49723412616757334, "learning_rate": 4.1002681481233e-06, "loss": 0.0165, "step": 3066 }, { "epoch": 1.3953594176524113, "grad_norm": 0.5920652529146905, "learning_rate": 4.099719029002932e-06, "loss": 0.0277, "step": 3067 }, { "epoch": 1.3958143767060964, "grad_norm": 0.8560125748124937, "learning_rate": 4.0991697791607485e-06, "loss": 0.0426, "step": 3068 }, { "epoch": 1.3962693357597815, "grad_norm": 0.6203648804145926, "learning_rate": 4.098620398641633e-06, "loss": 0.0241, "step": 3069 }, { "epoch": 1.3967242948134668, "grad_norm": 0.5521046470456151, "learning_rate": 4.098070887490478e-06, "loss": 0.0256, "step": 3070 }, { "epoch": 1.397179253867152, "grad_norm": 0.549780639587674, "learning_rate": 4.0975212457521865e-06, "loss": 0.0246, "step": 3071 }, { "epoch": 1.397634212920837, "grad_norm": 0.6502581386149143, "learning_rate": 4.096971473471674e-06, "loss": 0.0234, "step": 3072 }, { "epoch": 1.3980891719745223, "grad_norm": 0.5166228834612007, "learning_rate": 4.0964215706938635e-06, "loss": 0.0212, "step": 3073 }, { "epoch": 1.3985441310282074, "grad_norm": 0.6849292217423967, "learning_rate": 4.0958715374636925e-06, "loss": 0.0227, "step": 3074 }, { "epoch": 1.3989990900818925, "grad_norm": 0.525937882839348, "learning_rate": 4.095321373826105e-06, "loss": 0.0248, "step": 3075 }, { "epoch": 1.3994540491355778, "grad_norm": 0.5036452541982582, "learning_rate": 4.094771079826061e-06, "loss": 0.0175, "step": 3076 }, { "epoch": 1.399909008189263, "grad_norm": 0.6435654578869566, "learning_rate": 4.094220655508525e-06, "loss": 0.0299, "step": 3077 }, { "epoch": 1.4003639672429482, "grad_norm": 0.4960419896796071, "learning_rate": 4.0936701009184775e-06, "loss": 0.0244, "step": 3078 }, { "epoch": 1.4008189262966333, "grad_norm": 0.6004528177397015, "learning_rate": 4.0931194161009044e-06, "loss": 0.0357, "step": 3079 }, { "epoch": 1.4012738853503186, "grad_norm": 0.5619700414349914, "learning_rate": 4.092568601100809e-06, "loss": 0.0229, "step": 3080 }, { "epoch": 1.4017288444040037, "grad_norm": 0.4518813738572762, "learning_rate": 4.092017655963199e-06, "loss": 0.0117, "step": 3081 }, { "epoch": 1.4021838034576888, "grad_norm": 0.6514671911570182, "learning_rate": 4.091466580733095e-06, "loss": 0.0374, "step": 3082 }, { "epoch": 1.402638762511374, "grad_norm": 0.5408580921293082, "learning_rate": 4.09091537545553e-06, "loss": 0.0251, "step": 3083 }, { "epoch": 1.4030937215650592, "grad_norm": 0.49428453735831535, "learning_rate": 4.090364040175545e-06, "loss": 0.0172, "step": 3084 }, { "epoch": 1.4035486806187443, "grad_norm": 0.666899608158994, "learning_rate": 4.089812574938192e-06, "loss": 0.0324, "step": 3085 }, { "epoch": 1.4040036396724296, "grad_norm": 0.7622239351954695, "learning_rate": 4.089260979788534e-06, "loss": 0.0338, "step": 3086 }, { "epoch": 1.4044585987261147, "grad_norm": 0.5953257723172974, "learning_rate": 4.088709254771648e-06, "loss": 0.033, "step": 3087 }, { "epoch": 1.4049135577797998, "grad_norm": 0.6420360684443719, "learning_rate": 4.088157399932615e-06, "loss": 0.0229, "step": 3088 }, { "epoch": 1.405368516833485, "grad_norm": 0.5362082733292385, "learning_rate": 4.0876054153165314e-06, "loss": 0.0313, "step": 3089 }, { "epoch": 1.4058234758871702, "grad_norm": 0.43998962686932297, "learning_rate": 4.087053300968502e-06, "loss": 0.0158, "step": 3090 }, { "epoch": 1.4062784349408552, "grad_norm": 0.5249842338579929, "learning_rate": 4.086501056933646e-06, "loss": 0.0217, "step": 3091 }, { "epoch": 1.4067333939945406, "grad_norm": 0.7147219754503916, "learning_rate": 4.085948683257087e-06, "loss": 0.0345, "step": 3092 }, { "epoch": 1.4071883530482256, "grad_norm": 0.5648596460288634, "learning_rate": 4.085396179983963e-06, "loss": 0.0249, "step": 3093 }, { "epoch": 1.4076433121019107, "grad_norm": 0.7084596679025572, "learning_rate": 4.084843547159424e-06, "loss": 0.0324, "step": 3094 }, { "epoch": 1.408098271155596, "grad_norm": 0.5054480404602903, "learning_rate": 4.0842907848286265e-06, "loss": 0.02, "step": 3095 }, { "epoch": 1.4085532302092811, "grad_norm": 0.4088045729759403, "learning_rate": 4.083737893036741e-06, "loss": 0.0121, "step": 3096 }, { "epoch": 1.4090081892629662, "grad_norm": 0.4445880679833502, "learning_rate": 4.083184871828947e-06, "loss": 0.0163, "step": 3097 }, { "epoch": 1.4094631483166515, "grad_norm": 0.5583085908646305, "learning_rate": 4.0826317212504345e-06, "loss": 0.0212, "step": 3098 }, { "epoch": 1.4099181073703366, "grad_norm": 0.4817308983982684, "learning_rate": 4.0820784413464054e-06, "loss": 0.0154, "step": 3099 }, { "epoch": 1.4103730664240217, "grad_norm": 0.5448848620297295, "learning_rate": 4.08152503216207e-06, "loss": 0.0194, "step": 3100 }, { "epoch": 1.410828025477707, "grad_norm": 0.5964456320339733, "learning_rate": 4.080971493742652e-06, "loss": 0.019, "step": 3101 }, { "epoch": 1.4112829845313921, "grad_norm": 1.0500035124551044, "learning_rate": 4.080417826133382e-06, "loss": 0.0346, "step": 3102 }, { "epoch": 1.4117379435850774, "grad_norm": 0.6289836820106267, "learning_rate": 4.079864029379506e-06, "loss": 0.0432, "step": 3103 }, { "epoch": 1.4121929026387625, "grad_norm": 0.5116882510897833, "learning_rate": 4.079310103526275e-06, "loss": 0.0256, "step": 3104 }, { "epoch": 1.4126478616924476, "grad_norm": 0.6310635082352677, "learning_rate": 4.0787560486189545e-06, "loss": 0.0217, "step": 3105 }, { "epoch": 1.413102820746133, "grad_norm": 0.4671010346903545, "learning_rate": 4.07820186470282e-06, "loss": 0.0177, "step": 3106 }, { "epoch": 1.413557779799818, "grad_norm": 0.47143714345799553, "learning_rate": 4.077647551823155e-06, "loss": 0.0188, "step": 3107 }, { "epoch": 1.4140127388535033, "grad_norm": 0.4652926046789629, "learning_rate": 4.077093110025258e-06, "loss": 0.0312, "step": 3108 }, { "epoch": 1.4144676979071884, "grad_norm": 0.7370899657072758, "learning_rate": 4.076538539354433e-06, "loss": 0.0249, "step": 3109 }, { "epoch": 1.4149226569608735, "grad_norm": 0.6368992481166442, "learning_rate": 4.075983839855999e-06, "loss": 0.0327, "step": 3110 }, { "epoch": 1.4153776160145588, "grad_norm": 0.567399314370992, "learning_rate": 4.075429011575281e-06, "loss": 0.0313, "step": 3111 }, { "epoch": 1.415832575068244, "grad_norm": 0.3892637737460413, "learning_rate": 4.07487405455762e-06, "loss": 0.0123, "step": 3112 }, { "epoch": 1.416287534121929, "grad_norm": 0.613106414543713, "learning_rate": 4.074318968848364e-06, "loss": 0.028, "step": 3113 }, { "epoch": 1.4167424931756143, "grad_norm": 0.7160610594651534, "learning_rate": 4.073763754492871e-06, "loss": 0.0439, "step": 3114 }, { "epoch": 1.4171974522292994, "grad_norm": 0.6159887912900818, "learning_rate": 4.07320841153651e-06, "loss": 0.0223, "step": 3115 }, { "epoch": 1.4176524112829845, "grad_norm": 0.5767075501287264, "learning_rate": 4.072652940024664e-06, "loss": 0.021, "step": 3116 }, { "epoch": 1.4181073703366698, "grad_norm": 0.47742063463475076, "learning_rate": 4.07209734000272e-06, "loss": 0.0213, "step": 3117 }, { "epoch": 1.4185623293903549, "grad_norm": 1.1004949616896063, "learning_rate": 4.071541611516082e-06, "loss": 0.0311, "step": 3118 }, { "epoch": 1.41901728844404, "grad_norm": 0.7512803475968975, "learning_rate": 4.0709857546101605e-06, "loss": 0.0317, "step": 3119 }, { "epoch": 1.4194722474977253, "grad_norm": 0.6141920243976169, "learning_rate": 4.0704297693303775e-06, "loss": 0.034, "step": 3120 }, { "epoch": 1.4199272065514104, "grad_norm": 0.6658167646440603, "learning_rate": 4.0698736557221655e-06, "loss": 0.0246, "step": 3121 }, { "epoch": 1.4203821656050954, "grad_norm": 0.5034335001854008, "learning_rate": 4.069317413830968e-06, "loss": 0.0187, "step": 3122 }, { "epoch": 1.4208371246587808, "grad_norm": 0.4550548362840694, "learning_rate": 4.068761043702237e-06, "loss": 0.0172, "step": 3123 }, { "epoch": 1.4212920837124658, "grad_norm": 0.5022383747528916, "learning_rate": 4.06820454538144e-06, "loss": 0.0161, "step": 3124 }, { "epoch": 1.421747042766151, "grad_norm": 0.5321472674787463, "learning_rate": 4.067647918914049e-06, "loss": 0.0164, "step": 3125 }, { "epoch": 1.4222020018198362, "grad_norm": 0.5184576534954308, "learning_rate": 4.067091164345549e-06, "loss": 0.0205, "step": 3126 }, { "epoch": 1.4226569608735213, "grad_norm": 0.5611322594124284, "learning_rate": 4.066534281721437e-06, "loss": 0.0218, "step": 3127 }, { "epoch": 1.4231119199272064, "grad_norm": 0.6345079828140591, "learning_rate": 4.065977271087216e-06, "loss": 0.0307, "step": 3128 }, { "epoch": 1.4235668789808917, "grad_norm": 0.45570970357451407, "learning_rate": 4.065420132488406e-06, "loss": 0.0171, "step": 3129 }, { "epoch": 1.4240218380345768, "grad_norm": 0.5340857150202568, "learning_rate": 4.064862865970531e-06, "loss": 0.0191, "step": 3130 }, { "epoch": 1.4244767970882621, "grad_norm": 0.4899881794546963, "learning_rate": 4.064305471579131e-06, "loss": 0.0209, "step": 3131 }, { "epoch": 1.4249317561419472, "grad_norm": 0.7270878934451148, "learning_rate": 4.063747949359751e-06, "loss": 0.0339, "step": 3132 }, { "epoch": 1.4253867151956323, "grad_norm": 0.587673912483386, "learning_rate": 4.063190299357951e-06, "loss": 0.0226, "step": 3133 }, { "epoch": 1.4258416742493176, "grad_norm": 0.8856393386631087, "learning_rate": 4.062632521619298e-06, "loss": 0.0348, "step": 3134 }, { "epoch": 1.4262966333030027, "grad_norm": 0.9531592586535969, "learning_rate": 4.0620746161893736e-06, "loss": 0.0342, "step": 3135 }, { "epoch": 1.426751592356688, "grad_norm": 0.6282966394466677, "learning_rate": 4.061516583113765e-06, "loss": 0.0284, "step": 3136 }, { "epoch": 1.4272065514103731, "grad_norm": 0.4750464430925278, "learning_rate": 4.060958422438073e-06, "loss": 0.0228, "step": 3137 }, { "epoch": 1.4276615104640582, "grad_norm": 0.6847093600934828, "learning_rate": 4.060400134207908e-06, "loss": 0.0273, "step": 3138 }, { "epoch": 1.4281164695177435, "grad_norm": 0.48758933458619164, "learning_rate": 4.05984171846889e-06, "loss": 0.0183, "step": 3139 }, { "epoch": 1.4285714285714286, "grad_norm": 0.6816015219475341, "learning_rate": 4.059283175266652e-06, "loss": 0.0373, "step": 3140 }, { "epoch": 1.4290263876251137, "grad_norm": 0.5622493429118623, "learning_rate": 4.058724504646834e-06, "loss": 0.0208, "step": 3141 }, { "epoch": 1.429481346678799, "grad_norm": 0.6272048522856389, "learning_rate": 4.058165706655089e-06, "loss": 0.0236, "step": 3142 }, { "epoch": 1.429936305732484, "grad_norm": 0.7322971571480782, "learning_rate": 4.057606781337079e-06, "loss": 0.029, "step": 3143 }, { "epoch": 1.4303912647861692, "grad_norm": 0.6016774365121075, "learning_rate": 4.057047728738477e-06, "loss": 0.0259, "step": 3144 }, { "epoch": 1.4308462238398545, "grad_norm": 1.2230808497645234, "learning_rate": 4.056488548904966e-06, "loss": 0.0642, "step": 3145 }, { "epoch": 1.4313011828935396, "grad_norm": 0.7604574246701213, "learning_rate": 4.055929241882239e-06, "loss": 0.0359, "step": 3146 }, { "epoch": 1.4317561419472247, "grad_norm": 0.679498620667103, "learning_rate": 4.0553698077160025e-06, "loss": 0.023, "step": 3147 }, { "epoch": 1.43221110100091, "grad_norm": 0.5870150703843239, "learning_rate": 4.054810246451969e-06, "loss": 0.0328, "step": 3148 }, { "epoch": 1.432666060054595, "grad_norm": 0.6481099201299542, "learning_rate": 4.054250558135862e-06, "loss": 0.032, "step": 3149 }, { "epoch": 1.4331210191082802, "grad_norm": 0.504870297656496, "learning_rate": 4.05369074281342e-06, "loss": 0.0259, "step": 3150 }, { "epoch": 1.4335759781619655, "grad_norm": 0.42044102667812067, "learning_rate": 4.053130800530387e-06, "loss": 0.0156, "step": 3151 }, { "epoch": 1.4340309372156506, "grad_norm": 0.5204802834399147, "learning_rate": 4.052570731332518e-06, "loss": 0.026, "step": 3152 }, { "epoch": 1.4344858962693356, "grad_norm": 0.41353391151905655, "learning_rate": 4.0520105352655805e-06, "loss": 0.017, "step": 3153 }, { "epoch": 1.434940855323021, "grad_norm": 0.7582144112871689, "learning_rate": 4.051450212375351e-06, "loss": 0.0351, "step": 3154 }, { "epoch": 1.435395814376706, "grad_norm": 0.5600600152721896, "learning_rate": 4.050889762707616e-06, "loss": 0.0297, "step": 3155 }, { "epoch": 1.4358507734303911, "grad_norm": 0.6570950043422336, "learning_rate": 4.050329186308173e-06, "loss": 0.032, "step": 3156 }, { "epoch": 1.4363057324840764, "grad_norm": 0.6757814720202312, "learning_rate": 4.0497684832228305e-06, "loss": 0.0235, "step": 3157 }, { "epoch": 1.4367606915377615, "grad_norm": 0.6431879965703954, "learning_rate": 4.049207653497406e-06, "loss": 0.0301, "step": 3158 }, { "epoch": 1.4372156505914468, "grad_norm": 0.722091996750155, "learning_rate": 4.0486466971777295e-06, "loss": 0.0311, "step": 3159 }, { "epoch": 1.437670609645132, "grad_norm": 0.5159615478447471, "learning_rate": 4.048085614309638e-06, "loss": 0.0209, "step": 3160 }, { "epoch": 1.438125568698817, "grad_norm": 0.694743512846256, "learning_rate": 4.047524404938981e-06, "loss": 0.0265, "step": 3161 }, { "epoch": 1.4385805277525023, "grad_norm": 0.71210008238664, "learning_rate": 4.046963069111617e-06, "loss": 0.0434, "step": 3162 }, { "epoch": 1.4390354868061874, "grad_norm": 0.626366567745311, "learning_rate": 4.046401606873419e-06, "loss": 0.0254, "step": 3163 }, { "epoch": 1.4394904458598727, "grad_norm": 0.501604026702404, "learning_rate": 4.045840018270264e-06, "loss": 0.0238, "step": 3164 }, { "epoch": 1.4399454049135578, "grad_norm": 0.4397581383178915, "learning_rate": 4.045278303348044e-06, "loss": 0.0161, "step": 3165 }, { "epoch": 1.440400363967243, "grad_norm": 0.6360646084965851, "learning_rate": 4.044716462152659e-06, "loss": 0.0238, "step": 3166 }, { "epoch": 1.4408553230209282, "grad_norm": 0.38658450024092633, "learning_rate": 4.04415449473002e-06, "loss": 0.0193, "step": 3167 }, { "epoch": 1.4413102820746133, "grad_norm": 0.5581740216149683, "learning_rate": 4.043592401126051e-06, "loss": 0.021, "step": 3168 }, { "epoch": 1.4417652411282984, "grad_norm": 0.5845374823097698, "learning_rate": 4.043030181386681e-06, "loss": 0.0256, "step": 3169 }, { "epoch": 1.4422202001819837, "grad_norm": 0.35484275672194915, "learning_rate": 4.042467835557853e-06, "loss": 0.0101, "step": 3170 }, { "epoch": 1.4426751592356688, "grad_norm": 0.6225428160256282, "learning_rate": 4.0419053636855185e-06, "loss": 0.029, "step": 3171 }, { "epoch": 1.443130118289354, "grad_norm": 0.5597849056713826, "learning_rate": 4.041342765815641e-06, "loss": 0.0268, "step": 3172 }, { "epoch": 1.4435850773430392, "grad_norm": 0.6257534793117826, "learning_rate": 4.040780041994193e-06, "loss": 0.0214, "step": 3173 }, { "epoch": 1.4440400363967243, "grad_norm": 0.6053906166055238, "learning_rate": 4.040217192267159e-06, "loss": 0.0316, "step": 3174 }, { "epoch": 1.4444949954504094, "grad_norm": 0.4918130393096578, "learning_rate": 4.03965421668053e-06, "loss": 0.0241, "step": 3175 }, { "epoch": 1.4449499545040947, "grad_norm": 0.5702821016502554, "learning_rate": 4.039091115280314e-06, "loss": 0.0206, "step": 3176 }, { "epoch": 1.4454049135577798, "grad_norm": 0.6916587079039479, "learning_rate": 4.038527888112521e-06, "loss": 0.0313, "step": 3177 }, { "epoch": 1.4458598726114649, "grad_norm": 0.5788509299527757, "learning_rate": 4.037964535223177e-06, "loss": 0.0187, "step": 3178 }, { "epoch": 1.4463148316651502, "grad_norm": 0.5220453246358187, "learning_rate": 4.037401056658317e-06, "loss": 0.0242, "step": 3179 }, { "epoch": 1.4467697907188353, "grad_norm": 0.5240476001763903, "learning_rate": 4.036837452463985e-06, "loss": 0.0148, "step": 3180 }, { "epoch": 1.4472247497725204, "grad_norm": 0.6074840718812475, "learning_rate": 4.0362737226862356e-06, "loss": 0.0289, "step": 3181 }, { "epoch": 1.4476797088262057, "grad_norm": 0.656878621424084, "learning_rate": 4.035709867371137e-06, "loss": 0.0256, "step": 3182 }, { "epoch": 1.4481346678798908, "grad_norm": 0.7142547713654863, "learning_rate": 4.035145886564763e-06, "loss": 0.0345, "step": 3183 }, { "epoch": 1.4485896269335758, "grad_norm": 0.5825936793650826, "learning_rate": 4.0345817803132e-06, "loss": 0.0185, "step": 3184 }, { "epoch": 1.4490445859872612, "grad_norm": 0.4333270723886052, "learning_rate": 4.034017548662544e-06, "loss": 0.0166, "step": 3185 }, { "epoch": 1.4494995450409462, "grad_norm": 0.646212871411711, "learning_rate": 4.033453191658901e-06, "loss": 0.0383, "step": 3186 }, { "epoch": 1.4499545040946316, "grad_norm": 0.8367732250496335, "learning_rate": 4.032888709348388e-06, "loss": 0.0339, "step": 3187 }, { "epoch": 1.4504094631483166, "grad_norm": 0.7244976664758369, "learning_rate": 4.032324101777132e-06, "loss": 0.0191, "step": 3188 }, { "epoch": 1.450864422202002, "grad_norm": 0.6717322566236624, "learning_rate": 4.03175936899127e-06, "loss": 0.0368, "step": 3189 }, { "epoch": 1.451319381255687, "grad_norm": 0.4208158609209425, "learning_rate": 4.031194511036951e-06, "loss": 0.0158, "step": 3190 }, { "epoch": 1.4517743403093721, "grad_norm": 0.4638906634895803, "learning_rate": 4.0306295279603304e-06, "loss": 0.0137, "step": 3191 }, { "epoch": 1.4522292993630574, "grad_norm": 0.6347806832854297, "learning_rate": 4.030064419807578e-06, "loss": 0.0304, "step": 3192 }, { "epoch": 1.4526842584167425, "grad_norm": 0.5757174544801394, "learning_rate": 4.02949918662487e-06, "loss": 0.018, "step": 3193 }, { "epoch": 1.4531392174704276, "grad_norm": 0.6295428757868852, "learning_rate": 4.028933828458396e-06, "loss": 0.0264, "step": 3194 }, { "epoch": 1.453594176524113, "grad_norm": 0.43758794804800105, "learning_rate": 4.028368345354355e-06, "loss": 0.0253, "step": 3195 }, { "epoch": 1.454049135577798, "grad_norm": 0.5805048377093173, "learning_rate": 4.027802737358954e-06, "loss": 0.0296, "step": 3196 }, { "epoch": 1.4545040946314831, "grad_norm": 0.4264103702768461, "learning_rate": 4.027237004518413e-06, "loss": 0.0119, "step": 3197 }, { "epoch": 1.4549590536851684, "grad_norm": 0.41389260389989946, "learning_rate": 4.02667114687896e-06, "loss": 0.016, "step": 3198 }, { "epoch": 1.4554140127388535, "grad_norm": 0.574238682508948, "learning_rate": 4.026105164486836e-06, "loss": 0.0257, "step": 3199 }, { "epoch": 1.4558689717925386, "grad_norm": 0.4662483333465922, "learning_rate": 4.0255390573882904e-06, "loss": 0.0138, "step": 3200 }, { "epoch": 1.456323930846224, "grad_norm": 0.5385442309695424, "learning_rate": 4.024972825629581e-06, "loss": 0.0135, "step": 3201 }, { "epoch": 1.456778889899909, "grad_norm": 0.6347955000788789, "learning_rate": 4.024406469256979e-06, "loss": 0.0213, "step": 3202 }, { "epoch": 1.457233848953594, "grad_norm": 0.4546118103535644, "learning_rate": 4.023839988316766e-06, "loss": 0.0148, "step": 3203 }, { "epoch": 1.4576888080072794, "grad_norm": 0.7098005879398113, "learning_rate": 4.02327338285523e-06, "loss": 0.0356, "step": 3204 }, { "epoch": 1.4581437670609645, "grad_norm": 0.7622305783040608, "learning_rate": 4.022706652918672e-06, "loss": 0.0322, "step": 3205 }, { "epoch": 1.4585987261146496, "grad_norm": 0.5083688752589572, "learning_rate": 4.022139798553404e-06, "loss": 0.0291, "step": 3206 }, { "epoch": 1.459053685168335, "grad_norm": 0.5310444613766537, "learning_rate": 4.021572819805744e-06, "loss": 0.022, "step": 3207 }, { "epoch": 1.45950864422202, "grad_norm": 0.5549383758015742, "learning_rate": 4.021005716722025e-06, "loss": 0.0186, "step": 3208 }, { "epoch": 1.459963603275705, "grad_norm": 0.3870771580694624, "learning_rate": 4.020438489348587e-06, "loss": 0.0166, "step": 3209 }, { "epoch": 1.4604185623293904, "grad_norm": 0.443463357543213, "learning_rate": 4.019871137731783e-06, "loss": 0.0199, "step": 3210 }, { "epoch": 1.4608735213830755, "grad_norm": 0.6111312773271786, "learning_rate": 4.019303661917973e-06, "loss": 0.0183, "step": 3211 }, { "epoch": 1.4613284804367606, "grad_norm": 0.47602225232623346, "learning_rate": 4.018736061953529e-06, "loss": 0.0154, "step": 3212 }, { "epoch": 1.4617834394904459, "grad_norm": 0.6708391794944873, "learning_rate": 4.018168337884832e-06, "loss": 0.0236, "step": 3213 }, { "epoch": 1.462238398544131, "grad_norm": 0.6968852951901131, "learning_rate": 4.017600489758275e-06, "loss": 0.0335, "step": 3214 }, { "epoch": 1.4626933575978163, "grad_norm": 0.5811945946607384, "learning_rate": 4.017032517620259e-06, "loss": 0.0255, "step": 3215 }, { "epoch": 1.4631483166515014, "grad_norm": 0.49821764006461977, "learning_rate": 4.016464421517197e-06, "loss": 0.0152, "step": 3216 }, { "epoch": 1.4636032757051867, "grad_norm": 0.7149483577148827, "learning_rate": 4.015896201495511e-06, "loss": 0.036, "step": 3217 }, { "epoch": 1.4640582347588718, "grad_norm": 0.6403076806861139, "learning_rate": 4.015327857601632e-06, "loss": 0.025, "step": 3218 }, { "epoch": 1.4645131938125568, "grad_norm": 0.6206009458630282, "learning_rate": 4.014759389882004e-06, "loss": 0.0274, "step": 3219 }, { "epoch": 1.4649681528662422, "grad_norm": 0.819456018070291, "learning_rate": 4.0141907983830794e-06, "loss": 0.0309, "step": 3220 }, { "epoch": 1.4654231119199272, "grad_norm": 0.45406669744308337, "learning_rate": 4.0136220831513205e-06, "loss": 0.0164, "step": 3221 }, { "epoch": 1.4658780709736123, "grad_norm": 0.4892778094068776, "learning_rate": 4.013053244233202e-06, "loss": 0.0243, "step": 3222 }, { "epoch": 1.4663330300272976, "grad_norm": 0.4163752869408427, "learning_rate": 4.012484281675203e-06, "loss": 0.0137, "step": 3223 }, { "epoch": 1.4667879890809827, "grad_norm": 0.6166328684835028, "learning_rate": 4.01191519552382e-06, "loss": 0.0242, "step": 3224 }, { "epoch": 1.4672429481346678, "grad_norm": 0.8950156886314216, "learning_rate": 4.011345985825555e-06, "loss": 0.0377, "step": 3225 }, { "epoch": 1.4676979071883531, "grad_norm": 0.5451391822566414, "learning_rate": 4.010776652626921e-06, "loss": 0.0228, "step": 3226 }, { "epoch": 1.4681528662420382, "grad_norm": 0.43220994211012703, "learning_rate": 4.010207195974441e-06, "loss": 0.0149, "step": 3227 }, { "epoch": 1.4686078252957233, "grad_norm": 0.6421311871461202, "learning_rate": 4.00963761591465e-06, "loss": 0.0248, "step": 3228 }, { "epoch": 1.4690627843494086, "grad_norm": 0.5176247695837788, "learning_rate": 4.00906791249409e-06, "loss": 0.022, "step": 3229 }, { "epoch": 1.4695177434030937, "grad_norm": 0.5533545377326602, "learning_rate": 4.008498085759315e-06, "loss": 0.0317, "step": 3230 }, { "epoch": 1.4699727024567788, "grad_norm": 0.6655829269146801, "learning_rate": 4.007928135756889e-06, "loss": 0.0237, "step": 3231 }, { "epoch": 1.4704276615104641, "grad_norm": 0.4579299246542189, "learning_rate": 4.007358062533386e-06, "loss": 0.0169, "step": 3232 }, { "epoch": 1.4708826205641492, "grad_norm": 0.8007698321103973, "learning_rate": 4.006787866135387e-06, "loss": 0.0378, "step": 3233 }, { "epoch": 1.4713375796178343, "grad_norm": 0.46407860101829335, "learning_rate": 4.006217546609491e-06, "loss": 0.0186, "step": 3234 }, { "epoch": 1.4717925386715196, "grad_norm": 0.6325621692821115, "learning_rate": 4.005647104002298e-06, "loss": 0.0331, "step": 3235 }, { "epoch": 1.4722474977252047, "grad_norm": 0.4729130943143962, "learning_rate": 4.005076538360424e-06, "loss": 0.022, "step": 3236 }, { "epoch": 1.4727024567788898, "grad_norm": 0.5325635276758262, "learning_rate": 4.00450584973049e-06, "loss": 0.0254, "step": 3237 }, { "epoch": 1.473157415832575, "grad_norm": 0.5951663628498189, "learning_rate": 4.003935038159134e-06, "loss": 0.0258, "step": 3238 }, { "epoch": 1.4736123748862602, "grad_norm": 0.8046736104783195, "learning_rate": 4.003364103692998e-06, "loss": 0.0376, "step": 3239 }, { "epoch": 1.4740673339399453, "grad_norm": 0.3611545320846498, "learning_rate": 4.002793046378736e-06, "loss": 0.0129, "step": 3240 }, { "epoch": 1.4745222929936306, "grad_norm": 0.6746020737552472, "learning_rate": 4.002221866263013e-06, "loss": 0.026, "step": 3241 }, { "epoch": 1.4749772520473157, "grad_norm": 0.716374094519489, "learning_rate": 4.001650563392504e-06, "loss": 0.0365, "step": 3242 }, { "epoch": 1.475432211101001, "grad_norm": 0.5761801662679816, "learning_rate": 4.001079137813892e-06, "loss": 0.0251, "step": 3243 }, { "epoch": 1.475887170154686, "grad_norm": 0.5169030321042024, "learning_rate": 4.00050758957387e-06, "loss": 0.0214, "step": 3244 }, { "epoch": 1.4763421292083714, "grad_norm": 0.8699364781361667, "learning_rate": 3.999935918719146e-06, "loss": 0.0426, "step": 3245 }, { "epoch": 1.4767970882620565, "grad_norm": 0.5630042536436409, "learning_rate": 3.999364125296432e-06, "loss": 0.0216, "step": 3246 }, { "epoch": 1.4772520473157416, "grad_norm": 0.48241646420427814, "learning_rate": 3.998792209352453e-06, "loss": 0.0213, "step": 3247 }, { "epoch": 1.4777070063694269, "grad_norm": 0.6568434530404283, "learning_rate": 3.998220170933942e-06, "loss": 0.0313, "step": 3248 }, { "epoch": 1.478161965423112, "grad_norm": 0.4145592487022671, "learning_rate": 3.997648010087645e-06, "loss": 0.014, "step": 3249 }, { "epoch": 1.478616924476797, "grad_norm": 0.5557880093978479, "learning_rate": 3.997075726860316e-06, "loss": 0.0132, "step": 3250 }, { "epoch": 1.4790718835304824, "grad_norm": 0.6386918374347622, "learning_rate": 3.996503321298719e-06, "loss": 0.0293, "step": 3251 }, { "epoch": 1.4795268425841674, "grad_norm": 0.8108389566308398, "learning_rate": 3.995930793449629e-06, "loss": 0.055, "step": 3252 }, { "epoch": 1.4799818016378525, "grad_norm": 0.5994893257212202, "learning_rate": 3.995358143359831e-06, "loss": 0.0212, "step": 3253 }, { "epoch": 1.4804367606915378, "grad_norm": 0.36011088041771677, "learning_rate": 3.994785371076118e-06, "loss": 0.0132, "step": 3254 }, { "epoch": 1.480891719745223, "grad_norm": 0.47107785017529297, "learning_rate": 3.994212476645294e-06, "loss": 0.0211, "step": 3255 }, { "epoch": 1.481346678798908, "grad_norm": 0.6948312534480735, "learning_rate": 3.993639460114175e-06, "loss": 0.0314, "step": 3256 }, { "epoch": 1.4818016378525933, "grad_norm": 0.534307758470587, "learning_rate": 3.9930663215295845e-06, "loss": 0.0224, "step": 3257 }, { "epoch": 1.4822565969062784, "grad_norm": 0.621059549070313, "learning_rate": 3.992493060938357e-06, "loss": 0.0265, "step": 3258 }, { "epoch": 1.4827115559599635, "grad_norm": 0.59194056246558, "learning_rate": 3.991919678387336e-06, "loss": 0.0278, "step": 3259 }, { "epoch": 1.4831665150136488, "grad_norm": 0.6769673790378502, "learning_rate": 3.991346173923378e-06, "loss": 0.0223, "step": 3260 }, { "epoch": 1.483621474067334, "grad_norm": 0.5048248018891563, "learning_rate": 3.990772547593342e-06, "loss": 0.0177, "step": 3261 }, { "epoch": 1.484076433121019, "grad_norm": 1.0508584289590246, "learning_rate": 3.990198799444109e-06, "loss": 0.0563, "step": 3262 }, { "epoch": 1.4845313921747043, "grad_norm": 0.5906268368359552, "learning_rate": 3.989624929522558e-06, "loss": 0.0215, "step": 3263 }, { "epoch": 1.4849863512283894, "grad_norm": 0.4307882601993587, "learning_rate": 3.989050937875586e-06, "loss": 0.017, "step": 3264 }, { "epoch": 1.4854413102820745, "grad_norm": 0.49695404258064146, "learning_rate": 3.988476824550095e-06, "loss": 0.024, "step": 3265 }, { "epoch": 1.4858962693357598, "grad_norm": 0.827358190513197, "learning_rate": 3.9879025895930005e-06, "loss": 0.0294, "step": 3266 }, { "epoch": 1.486351228389445, "grad_norm": 0.6751148447719345, "learning_rate": 3.987328233051225e-06, "loss": 0.0314, "step": 3267 }, { "epoch": 1.4868061874431302, "grad_norm": 0.9522102460202996, "learning_rate": 3.986753754971703e-06, "loss": 0.0298, "step": 3268 }, { "epoch": 1.4872611464968153, "grad_norm": 0.6420301483996141, "learning_rate": 3.986179155401379e-06, "loss": 0.0279, "step": 3269 }, { "epoch": 1.4877161055505004, "grad_norm": 0.719242311377963, "learning_rate": 3.985604434387206e-06, "loss": 0.0433, "step": 3270 }, { "epoch": 1.4881710646041857, "grad_norm": 1.6278477178749045, "learning_rate": 3.985029591976147e-06, "loss": 0.0485, "step": 3271 }, { "epoch": 1.4886260236578708, "grad_norm": 5.758947431240914, "learning_rate": 3.984454628215176e-06, "loss": 0.0677, "step": 3272 }, { "epoch": 1.489080982711556, "grad_norm": 0.7413160561856129, "learning_rate": 3.983879543151277e-06, "loss": 0.0361, "step": 3273 }, { "epoch": 1.4895359417652412, "grad_norm": 0.36384913079858605, "learning_rate": 3.9833043368314426e-06, "loss": 0.0119, "step": 3274 }, { "epoch": 1.4899909008189263, "grad_norm": 0.6361813980344438, "learning_rate": 3.982729009302676e-06, "loss": 0.0317, "step": 3275 }, { "epoch": 1.4904458598726116, "grad_norm": 0.7257578291982972, "learning_rate": 3.982153560611991e-06, "loss": 0.0358, "step": 3276 }, { "epoch": 1.4909008189262967, "grad_norm": 0.5165112950123053, "learning_rate": 3.98157799080641e-06, "loss": 0.0217, "step": 3277 }, { "epoch": 1.4913557779799818, "grad_norm": 0.9281543214333533, "learning_rate": 3.9810022999329675e-06, "loss": 0.0389, "step": 3278 }, { "epoch": 1.491810737033667, "grad_norm": 0.514017469225204, "learning_rate": 3.980426488038703e-06, "loss": 0.0223, "step": 3279 }, { "epoch": 1.4922656960873522, "grad_norm": 0.7512761092778488, "learning_rate": 3.979850555170673e-06, "loss": 0.0357, "step": 3280 }, { "epoch": 1.4927206551410372, "grad_norm": 0.6169552418993354, "learning_rate": 3.979274501375939e-06, "loss": 0.03, "step": 3281 }, { "epoch": 1.4931756141947226, "grad_norm": 0.8280448418011287, "learning_rate": 3.978698326701573e-06, "loss": 0.0421, "step": 3282 }, { "epoch": 1.4936305732484076, "grad_norm": 0.650445967840913, "learning_rate": 3.978122031194657e-06, "loss": 0.0145, "step": 3283 }, { "epoch": 1.4940855323020927, "grad_norm": 0.6583510369558364, "learning_rate": 3.977545614902284e-06, "loss": 0.0274, "step": 3284 }, { "epoch": 1.494540491355778, "grad_norm": 0.4472056988014067, "learning_rate": 3.976969077871555e-06, "loss": 0.0187, "step": 3285 }, { "epoch": 1.4949954504094631, "grad_norm": 0.7735245686273473, "learning_rate": 3.976392420149583e-06, "loss": 0.0273, "step": 3286 }, { "epoch": 1.4954504094631482, "grad_norm": 0.41532105399889546, "learning_rate": 3.975815641783491e-06, "loss": 0.0119, "step": 3287 }, { "epoch": 1.4959053685168335, "grad_norm": 0.6987463484258479, "learning_rate": 3.975238742820409e-06, "loss": 0.0377, "step": 3288 }, { "epoch": 1.4963603275705186, "grad_norm": 0.6173848427744979, "learning_rate": 3.9746617233074785e-06, "loss": 0.0239, "step": 3289 }, { "epoch": 1.4968152866242037, "grad_norm": 0.8727742573581233, "learning_rate": 3.974084583291851e-06, "loss": 0.0341, "step": 3290 }, { "epoch": 1.497270245677889, "grad_norm": 0.6117086620866596, "learning_rate": 3.97350732282069e-06, "loss": 0.0207, "step": 3291 }, { "epoch": 1.4977252047315741, "grad_norm": 0.4816207312722353, "learning_rate": 3.9729299419411635e-06, "loss": 0.0194, "step": 3292 }, { "epoch": 1.4981801637852592, "grad_norm": 0.7728518816048001, "learning_rate": 3.972352440700455e-06, "loss": 0.0303, "step": 3293 }, { "epoch": 1.4986351228389445, "grad_norm": 0.624081771304932, "learning_rate": 3.971774819145753e-06, "loss": 0.0203, "step": 3294 }, { "epoch": 1.4990900818926296, "grad_norm": 0.5277977109774716, "learning_rate": 3.97119707732426e-06, "loss": 0.0222, "step": 3295 }, { "epoch": 1.499545040946315, "grad_norm": 0.506346258870687, "learning_rate": 3.970619215283185e-06, "loss": 0.0263, "step": 3296 }, { "epoch": 1.5, "grad_norm": 0.5996351512465383, "learning_rate": 3.97004123306975e-06, "loss": 0.0316, "step": 3297 }, { "epoch": 1.5004549590536853, "grad_norm": 0.5841670508981737, "learning_rate": 3.969463130731183e-06, "loss": 0.0226, "step": 3298 }, { "epoch": 1.5009099181073702, "grad_norm": 0.9717872460539779, "learning_rate": 3.968884908314725e-06, "loss": 0.0314, "step": 3299 }, { "epoch": 1.5013648771610555, "grad_norm": 0.49964688444508165, "learning_rate": 3.968306565867627e-06, "loss": 0.019, "step": 3300 }, { "epoch": 1.5018198362147408, "grad_norm": 0.4378823570391564, "learning_rate": 3.967728103437146e-06, "loss": 0.0156, "step": 3301 }, { "epoch": 1.5022747952684259, "grad_norm": 0.5762765989780131, "learning_rate": 3.967149521070554e-06, "loss": 0.0278, "step": 3302 }, { "epoch": 1.502729754322111, "grad_norm": 0.4935515854160086, "learning_rate": 3.966570818815126e-06, "loss": 0.0201, "step": 3303 }, { "epoch": 1.5031847133757963, "grad_norm": 0.6809847660339569, "learning_rate": 3.965991996718156e-06, "loss": 0.0317, "step": 3304 }, { "epoch": 1.5036396724294814, "grad_norm": 0.6967324134263413, "learning_rate": 3.965413054826941e-06, "loss": 0.0234, "step": 3305 }, { "epoch": 1.5040946314831665, "grad_norm": 0.4217207881637992, "learning_rate": 3.964833993188787e-06, "loss": 0.0157, "step": 3306 }, { "epoch": 1.5045495905368518, "grad_norm": 1.0187002398410523, "learning_rate": 3.964254811851015e-06, "loss": 0.0516, "step": 3307 }, { "epoch": 1.5050045495905369, "grad_norm": 0.7101382532002475, "learning_rate": 3.963675510860952e-06, "loss": 0.032, "step": 3308 }, { "epoch": 1.505459508644222, "grad_norm": 0.5885558558311415, "learning_rate": 3.963096090265936e-06, "loss": 0.0275, "step": 3309 }, { "epoch": 1.5059144676979073, "grad_norm": 0.43615010061018616, "learning_rate": 3.962516550113316e-06, "loss": 0.017, "step": 3310 }, { "epoch": 1.5063694267515924, "grad_norm": 0.8215219704782009, "learning_rate": 3.961936890450447e-06, "loss": 0.0372, "step": 3311 }, { "epoch": 1.5068243858052774, "grad_norm": 0.7536358876609534, "learning_rate": 3.961357111324697e-06, "loss": 0.0401, "step": 3312 }, { "epoch": 1.5072793448589628, "grad_norm": 0.42864889313643445, "learning_rate": 3.960777212783445e-06, "loss": 0.0128, "step": 3313 }, { "epoch": 1.5077343039126478, "grad_norm": 0.42513746003457825, "learning_rate": 3.960197194874075e-06, "loss": 0.0171, "step": 3314 }, { "epoch": 1.508189262966333, "grad_norm": 0.6286780358231147, "learning_rate": 3.9596170576439844e-06, "loss": 0.0244, "step": 3315 }, { "epoch": 1.5086442220200182, "grad_norm": 0.7201172248235939, "learning_rate": 3.959036801140579e-06, "loss": 0.0314, "step": 3316 }, { "epoch": 1.5090991810737033, "grad_norm": 0.46561129210504143, "learning_rate": 3.958456425411275e-06, "loss": 0.0166, "step": 3317 }, { "epoch": 1.5095541401273884, "grad_norm": 0.4767372867859632, "learning_rate": 3.9578759305035e-06, "loss": 0.0233, "step": 3318 }, { "epoch": 1.5100090991810737, "grad_norm": 0.6987570495118559, "learning_rate": 3.957295316464686e-06, "loss": 0.032, "step": 3319 }, { "epoch": 1.5104640582347588, "grad_norm": 0.5353818748279202, "learning_rate": 3.956714583342281e-06, "loss": 0.0269, "step": 3320 }, { "epoch": 1.510919017288444, "grad_norm": 0.5675379911569748, "learning_rate": 3.9561337311837365e-06, "loss": 0.02, "step": 3321 }, { "epoch": 1.5113739763421292, "grad_norm": 0.4771557187323555, "learning_rate": 3.955552760036522e-06, "loss": 0.0239, "step": 3322 }, { "epoch": 1.5118289353958145, "grad_norm": 0.9322358569084765, "learning_rate": 3.9549716699481076e-06, "loss": 0.0357, "step": 3323 }, { "epoch": 1.5122838944494994, "grad_norm": 0.7153380105420392, "learning_rate": 3.954390460965979e-06, "loss": 0.0247, "step": 3324 }, { "epoch": 1.5127388535031847, "grad_norm": 0.8634115848570844, "learning_rate": 3.95380913313763e-06, "loss": 0.0375, "step": 3325 }, { "epoch": 1.51319381255687, "grad_norm": 0.5167563789245198, "learning_rate": 3.953227686510565e-06, "loss": 0.0253, "step": 3326 }, { "epoch": 1.5136487716105549, "grad_norm": 0.47663986929500723, "learning_rate": 3.9526461211322955e-06, "loss": 0.0193, "step": 3327 }, { "epoch": 1.5141037306642402, "grad_norm": 0.6010637486374424, "learning_rate": 3.9520644370503446e-06, "loss": 0.0305, "step": 3328 }, { "epoch": 1.5145586897179255, "grad_norm": 0.5165423349279934, "learning_rate": 3.951482634312246e-06, "loss": 0.0191, "step": 3329 }, { "epoch": 1.5150136487716106, "grad_norm": 0.6953133284712563, "learning_rate": 3.950900712965541e-06, "loss": 0.0348, "step": 3330 }, { "epoch": 1.5154686078252957, "grad_norm": 0.7069599237507933, "learning_rate": 3.950318673057782e-06, "loss": 0.0223, "step": 3331 }, { "epoch": 1.515923566878981, "grad_norm": 0.5858982166505268, "learning_rate": 3.949736514636531e-06, "loss": 0.0267, "step": 3332 }, { "epoch": 1.516378525932666, "grad_norm": 0.6239175973904172, "learning_rate": 3.949154237749358e-06, "loss": 0.0302, "step": 3333 }, { "epoch": 1.5168334849863512, "grad_norm": 0.48259660987352654, "learning_rate": 3.948571842443846e-06, "loss": 0.0172, "step": 3334 }, { "epoch": 1.5172884440400365, "grad_norm": 0.6191379855463478, "learning_rate": 3.947989328767585e-06, "loss": 0.0233, "step": 3335 }, { "epoch": 1.5177434030937216, "grad_norm": 0.7300237333629697, "learning_rate": 3.9474066967681744e-06, "loss": 0.0333, "step": 3336 }, { "epoch": 1.5181983621474067, "grad_norm": 0.9568900461126489, "learning_rate": 3.946823946493224e-06, "loss": 0.0454, "step": 3337 }, { "epoch": 1.518653321201092, "grad_norm": 0.5091287677987434, "learning_rate": 3.946241077990356e-06, "loss": 0.0237, "step": 3338 }, { "epoch": 1.519108280254777, "grad_norm": 0.6058068938185345, "learning_rate": 3.945658091307198e-06, "loss": 0.0251, "step": 3339 }, { "epoch": 1.5195632393084622, "grad_norm": 0.6200256282289905, "learning_rate": 3.9450749864913895e-06, "loss": 0.0281, "step": 3340 }, { "epoch": 1.5200181983621475, "grad_norm": 0.5747069734404173, "learning_rate": 3.9444917635905784e-06, "loss": 0.0171, "step": 3341 }, { "epoch": 1.5204731574158326, "grad_norm": 0.4749954035386937, "learning_rate": 3.943908422652424e-06, "loss": 0.0179, "step": 3342 }, { "epoch": 1.5209281164695176, "grad_norm": 0.5376186870756726, "learning_rate": 3.943324963724594e-06, "loss": 0.0211, "step": 3343 }, { "epoch": 1.521383075523203, "grad_norm": 0.6533422193724441, "learning_rate": 3.942741386854766e-06, "loss": 0.0254, "step": 3344 }, { "epoch": 1.521838034576888, "grad_norm": 0.6125010168356277, "learning_rate": 3.942157692090627e-06, "loss": 0.0199, "step": 3345 }, { "epoch": 1.5222929936305731, "grad_norm": 0.5244298027629346, "learning_rate": 3.941573879479874e-06, "loss": 0.0186, "step": 3346 }, { "epoch": 1.5227479526842584, "grad_norm": 0.6221503010776018, "learning_rate": 3.940989949070214e-06, "loss": 0.0318, "step": 3347 }, { "epoch": 1.5232029117379435, "grad_norm": 0.5592270472897608, "learning_rate": 3.940405900909362e-06, "loss": 0.0179, "step": 3348 }, { "epoch": 1.5236578707916286, "grad_norm": 0.5643171405314981, "learning_rate": 3.939821735045046e-06, "loss": 0.0208, "step": 3349 }, { "epoch": 1.524112829845314, "grad_norm": 0.8069246114105432, "learning_rate": 3.9392374515249986e-06, "loss": 0.0267, "step": 3350 }, { "epoch": 1.5245677888989992, "grad_norm": 1.1290905498149801, "learning_rate": 3.938653050396967e-06, "loss": 0.0608, "step": 3351 }, { "epoch": 1.525022747952684, "grad_norm": 0.6018387449063141, "learning_rate": 3.938068531708706e-06, "loss": 0.0254, "step": 3352 }, { "epoch": 1.5254777070063694, "grad_norm": 0.4828243886278892, "learning_rate": 3.937483895507977e-06, "loss": 0.016, "step": 3353 }, { "epoch": 1.5259326660600547, "grad_norm": 0.6616900595814049, "learning_rate": 3.936899141842556e-06, "loss": 0.0253, "step": 3354 }, { "epoch": 1.5263876251137396, "grad_norm": 0.5923645573102901, "learning_rate": 3.936314270760227e-06, "loss": 0.0196, "step": 3355 }, { "epoch": 1.526842584167425, "grad_norm": 0.5163730502393629, "learning_rate": 3.935729282308781e-06, "loss": 0.0223, "step": 3356 }, { "epoch": 1.5272975432211102, "grad_norm": 0.6832393099061513, "learning_rate": 3.935144176536023e-06, "loss": 0.031, "step": 3357 }, { "epoch": 1.5277525022747953, "grad_norm": 0.5007005767499049, "learning_rate": 3.934558953489763e-06, "loss": 0.0229, "step": 3358 }, { "epoch": 1.5282074613284804, "grad_norm": 0.853003111032188, "learning_rate": 3.9339736132178245e-06, "loss": 0.0232, "step": 3359 }, { "epoch": 1.5286624203821657, "grad_norm": 0.5521298424691822, "learning_rate": 3.933388155768038e-06, "loss": 0.0195, "step": 3360 }, { "epoch": 1.5291173794358508, "grad_norm": 0.6457589733077482, "learning_rate": 3.932802581188243e-06, "loss": 0.0321, "step": 3361 }, { "epoch": 1.5295723384895359, "grad_norm": 0.608336431809314, "learning_rate": 3.932216889526293e-06, "loss": 0.0307, "step": 3362 }, { "epoch": 1.5300272975432212, "grad_norm": 0.6136520142795995, "learning_rate": 3.931631080830046e-06, "loss": 0.0231, "step": 3363 }, { "epoch": 1.5304822565969063, "grad_norm": 0.6946767843291841, "learning_rate": 3.931045155147373e-06, "loss": 0.0299, "step": 3364 }, { "epoch": 1.5309372156505914, "grad_norm": 0.4896726609119758, "learning_rate": 3.930459112526153e-06, "loss": 0.0163, "step": 3365 }, { "epoch": 1.5313921747042767, "grad_norm": 0.5149603793357758, "learning_rate": 3.929872953014272e-06, "loss": 0.0224, "step": 3366 }, { "epoch": 1.5318471337579618, "grad_norm": 0.4807476106098001, "learning_rate": 3.929286676659632e-06, "loss": 0.0225, "step": 3367 }, { "epoch": 1.5323020928116469, "grad_norm": 0.45708905725245436, "learning_rate": 3.92870028351014e-06, "loss": 0.0171, "step": 3368 }, { "epoch": 1.5327570518653322, "grad_norm": 0.6665872106085607, "learning_rate": 3.9281137736137105e-06, "loss": 0.033, "step": 3369 }, { "epoch": 1.5332120109190173, "grad_norm": 0.6182322774057446, "learning_rate": 3.927527147018275e-06, "loss": 0.0277, "step": 3370 }, { "epoch": 1.5336669699727024, "grad_norm": 0.5531434189010356, "learning_rate": 3.926940403771767e-06, "loss": 0.0336, "step": 3371 }, { "epoch": 1.5341219290263877, "grad_norm": 0.828274110828607, "learning_rate": 3.926353543922133e-06, "loss": 0.0337, "step": 3372 }, { "epoch": 1.5345768880800728, "grad_norm": 0.5161411919297396, "learning_rate": 3.925766567517329e-06, "loss": 0.019, "step": 3373 }, { "epoch": 1.5350318471337578, "grad_norm": 0.6230029027238686, "learning_rate": 3.925179474605319e-06, "loss": 0.0265, "step": 3374 }, { "epoch": 1.5354868061874432, "grad_norm": 0.5787436091028683, "learning_rate": 3.92459226523408e-06, "loss": 0.018, "step": 3375 }, { "epoch": 1.5359417652411285, "grad_norm": 0.45169309927288415, "learning_rate": 3.924004939451593e-06, "loss": 0.015, "step": 3376 }, { "epoch": 1.5363967242948133, "grad_norm": 0.920838062645902, "learning_rate": 3.923417497305853e-06, "loss": 0.0438, "step": 3377 }, { "epoch": 1.5368516833484986, "grad_norm": 0.5102151208358107, "learning_rate": 3.9228299388448645e-06, "loss": 0.0175, "step": 3378 }, { "epoch": 1.537306642402184, "grad_norm": 0.5052430153694474, "learning_rate": 3.922242264116639e-06, "loss": 0.0219, "step": 3379 }, { "epoch": 1.5377616014558688, "grad_norm": 0.6374313082835955, "learning_rate": 3.921654473169198e-06, "loss": 0.0333, "step": 3380 }, { "epoch": 1.5382165605095541, "grad_norm": 0.5476952530160154, "learning_rate": 3.921066566050573e-06, "loss": 0.0212, "step": 3381 }, { "epoch": 1.5386715195632394, "grad_norm": 0.9473433558396809, "learning_rate": 3.920478542808806e-06, "loss": 0.0223, "step": 3382 }, { "epoch": 1.5391264786169245, "grad_norm": 0.7335517144921838, "learning_rate": 3.919890403491947e-06, "loss": 0.0279, "step": 3383 }, { "epoch": 1.5395814376706096, "grad_norm": 0.6418967633228138, "learning_rate": 3.919302148148057e-06, "loss": 0.0202, "step": 3384 }, { "epoch": 1.540036396724295, "grad_norm": 0.6065834876804072, "learning_rate": 3.918713776825204e-06, "loss": 0.0226, "step": 3385 }, { "epoch": 1.54049135577798, "grad_norm": 0.5525509383436024, "learning_rate": 3.918125289571469e-06, "loss": 0.0165, "step": 3386 }, { "epoch": 1.540946314831665, "grad_norm": 0.5704841120317284, "learning_rate": 3.917536686434939e-06, "loss": 0.0251, "step": 3387 }, { "epoch": 1.5414012738853504, "grad_norm": 0.6284290011537956, "learning_rate": 3.916947967463713e-06, "loss": 0.0273, "step": 3388 }, { "epoch": 1.5418562329390355, "grad_norm": 0.45132218970827287, "learning_rate": 3.916359132705898e-06, "loss": 0.0159, "step": 3389 }, { "epoch": 1.5423111919927206, "grad_norm": 0.6245494356008068, "learning_rate": 3.91577018220961e-06, "loss": 0.0297, "step": 3390 }, { "epoch": 1.542766151046406, "grad_norm": 0.6639491637722991, "learning_rate": 3.9151811160229765e-06, "loss": 0.0312, "step": 3391 }, { "epoch": 1.543221110100091, "grad_norm": 0.4639934293921549, "learning_rate": 3.914591934194134e-06, "loss": 0.0203, "step": 3392 }, { "epoch": 1.543676069153776, "grad_norm": 0.7771392348973274, "learning_rate": 3.914002636771226e-06, "loss": 0.0387, "step": 3393 }, { "epoch": 1.5441310282074614, "grad_norm": 0.47187669001516336, "learning_rate": 3.913413223802408e-06, "loss": 0.0147, "step": 3394 }, { "epoch": 1.5445859872611465, "grad_norm": 0.6210756572770973, "learning_rate": 3.912823695335845e-06, "loss": 0.0269, "step": 3395 }, { "epoch": 1.5450409463148316, "grad_norm": 0.749126270362982, "learning_rate": 3.91223405141971e-06, "loss": 0.035, "step": 3396 }, { "epoch": 1.5454959053685169, "grad_norm": 0.5844102366135081, "learning_rate": 3.911644292102185e-06, "loss": 0.0244, "step": 3397 }, { "epoch": 1.545950864422202, "grad_norm": 0.4504846823952382, "learning_rate": 3.911054417431465e-06, "loss": 0.0158, "step": 3398 }, { "epoch": 1.546405823475887, "grad_norm": 0.6063139796018792, "learning_rate": 3.9104644274557485e-06, "loss": 0.035, "step": 3399 }, { "epoch": 1.5468607825295724, "grad_norm": 0.7624720000367489, "learning_rate": 3.909874322223249e-06, "loss": 0.0301, "step": 3400 }, { "epoch": 1.5473157415832575, "grad_norm": 0.4352980068651625, "learning_rate": 3.909284101782187e-06, "loss": 0.0158, "step": 3401 }, { "epoch": 1.5477707006369426, "grad_norm": 0.47543411380220885, "learning_rate": 3.908693766180792e-06, "loss": 0.019, "step": 3402 }, { "epoch": 1.5482256596906279, "grad_norm": 0.44347229009044975, "learning_rate": 3.908103315467306e-06, "loss": 0.0161, "step": 3403 }, { "epoch": 1.5486806187443132, "grad_norm": 0.5439384407995033, "learning_rate": 3.907512749689973e-06, "loss": 0.0227, "step": 3404 }, { "epoch": 1.549135577797998, "grad_norm": 0.598140622569138, "learning_rate": 3.906922068897057e-06, "loss": 0.0341, "step": 3405 }, { "epoch": 1.5495905368516834, "grad_norm": 0.397233945431831, "learning_rate": 3.906331273136822e-06, "loss": 0.0122, "step": 3406 }, { "epoch": 1.5500454959053687, "grad_norm": 0.5632352571342928, "learning_rate": 3.905740362457546e-06, "loss": 0.0266, "step": 3407 }, { "epoch": 1.5505004549590535, "grad_norm": 0.45563561711988787, "learning_rate": 3.905149336907516e-06, "loss": 0.016, "step": 3408 }, { "epoch": 1.5509554140127388, "grad_norm": 0.6596648155175628, "learning_rate": 3.904558196535029e-06, "loss": 0.0305, "step": 3409 }, { "epoch": 1.5514103730664242, "grad_norm": 0.7079970990005604, "learning_rate": 3.903966941388387e-06, "loss": 0.0263, "step": 3410 }, { "epoch": 1.5518653321201092, "grad_norm": 0.8057885831541769, "learning_rate": 3.9033755715159085e-06, "loss": 0.0297, "step": 3411 }, { "epoch": 1.5523202911737943, "grad_norm": 0.5776215693579038, "learning_rate": 3.902784086965915e-06, "loss": 0.0187, "step": 3412 }, { "epoch": 1.5527752502274796, "grad_norm": 0.5917536430690068, "learning_rate": 3.902192487786741e-06, "loss": 0.0272, "step": 3413 }, { "epoch": 1.5532302092811647, "grad_norm": 0.6721030679732494, "learning_rate": 3.9016007740267295e-06, "loss": 0.0249, "step": 3414 }, { "epoch": 1.5536851683348498, "grad_norm": 0.4963390776924177, "learning_rate": 3.901008945734232e-06, "loss": 0.0167, "step": 3415 }, { "epoch": 1.5541401273885351, "grad_norm": 0.7044535041689735, "learning_rate": 3.90041700295761e-06, "loss": 0.0337, "step": 3416 }, { "epoch": 1.5545950864422202, "grad_norm": 0.8038485567195957, "learning_rate": 3.899824945745236e-06, "loss": 0.0366, "step": 3417 }, { "epoch": 1.5550500454959053, "grad_norm": 0.6306007993135992, "learning_rate": 3.899232774145488e-06, "loss": 0.0215, "step": 3418 }, { "epoch": 1.5555050045495906, "grad_norm": 0.5663443449128879, "learning_rate": 3.898640488206756e-06, "loss": 0.0311, "step": 3419 }, { "epoch": 1.5559599636032757, "grad_norm": 0.6617378094913873, "learning_rate": 3.898048087977441e-06, "loss": 0.0296, "step": 3420 }, { "epoch": 1.5564149226569608, "grad_norm": 0.5247200025249587, "learning_rate": 3.89745557350595e-06, "loss": 0.0194, "step": 3421 }, { "epoch": 1.556869881710646, "grad_norm": 0.581451724478826, "learning_rate": 3.896862944840698e-06, "loss": 0.0177, "step": 3422 }, { "epoch": 1.5573248407643312, "grad_norm": 0.8154202104136817, "learning_rate": 3.896270202030116e-06, "loss": 0.0511, "step": 3423 }, { "epoch": 1.5577797998180163, "grad_norm": 0.4576805292939243, "learning_rate": 3.895677345122638e-06, "loss": 0.0152, "step": 3424 }, { "epoch": 1.5582347588717016, "grad_norm": 0.6462285553170106, "learning_rate": 3.895084374166711e-06, "loss": 0.0303, "step": 3425 }, { "epoch": 1.5586897179253867, "grad_norm": 0.4571016486302641, "learning_rate": 3.894491289210788e-06, "loss": 0.017, "step": 3426 }, { "epoch": 1.5591446769790718, "grad_norm": 0.7613273750746716, "learning_rate": 3.893898090303335e-06, "loss": 0.0345, "step": 3427 }, { "epoch": 1.559599636032757, "grad_norm": 0.6456841813220812, "learning_rate": 3.893304777492825e-06, "loss": 0.0307, "step": 3428 }, { "epoch": 1.5600545950864422, "grad_norm": 0.7063060028945898, "learning_rate": 3.89271135082774e-06, "loss": 0.0252, "step": 3429 }, { "epoch": 1.5605095541401273, "grad_norm": 0.49183537271033767, "learning_rate": 3.892117810356574e-06, "loss": 0.0197, "step": 3430 }, { "epoch": 1.5609645131938126, "grad_norm": 1.1855054320374372, "learning_rate": 3.8915241561278265e-06, "loss": 0.0276, "step": 3431 }, { "epoch": 1.5614194722474979, "grad_norm": 0.4694422800890511, "learning_rate": 3.890930388190009e-06, "loss": 0.0162, "step": 3432 }, { "epoch": 1.5618744313011828, "grad_norm": 0.854552183687447, "learning_rate": 3.890336506591642e-06, "loss": 0.0348, "step": 3433 }, { "epoch": 1.562329390354868, "grad_norm": 0.6692617939837532, "learning_rate": 3.889742511381254e-06, "loss": 0.0269, "step": 3434 }, { "epoch": 1.5627843494085534, "grad_norm": 1.0752556777655282, "learning_rate": 3.889148402607384e-06, "loss": 0.0303, "step": 3435 }, { "epoch": 1.5632393084622382, "grad_norm": 0.6604461169117343, "learning_rate": 3.88855418031858e-06, "loss": 0.0301, "step": 3436 }, { "epoch": 1.5636942675159236, "grad_norm": 0.6305615429026079, "learning_rate": 3.887959844563399e-06, "loss": 0.0209, "step": 3437 }, { "epoch": 1.5641492265696089, "grad_norm": 0.7058425188549334, "learning_rate": 3.887365395390407e-06, "loss": 0.0223, "step": 3438 }, { "epoch": 1.564604185623294, "grad_norm": 0.8386437287625623, "learning_rate": 3.886770832848181e-06, "loss": 0.0323, "step": 3439 }, { "epoch": 1.565059144676979, "grad_norm": 0.6598116278749199, "learning_rate": 3.886176156985305e-06, "loss": 0.0243, "step": 3440 }, { "epoch": 1.5655141037306644, "grad_norm": 0.6145240126950321, "learning_rate": 3.885581367850373e-06, "loss": 0.0258, "step": 3441 }, { "epoch": 1.5659690627843494, "grad_norm": 0.6848848503580919, "learning_rate": 3.8849864654919885e-06, "loss": 0.0248, "step": 3442 }, { "epoch": 1.5664240218380345, "grad_norm": 0.7462959103495879, "learning_rate": 3.884391449958765e-06, "loss": 0.0307, "step": 3443 }, { "epoch": 1.5668789808917198, "grad_norm": 0.9227690503269461, "learning_rate": 3.883796321299325e-06, "loss": 0.0307, "step": 3444 }, { "epoch": 1.567333939945405, "grad_norm": 0.6700677482330477, "learning_rate": 3.8832010795622975e-06, "loss": 0.0363, "step": 3445 }, { "epoch": 1.56778889899909, "grad_norm": 0.6822138930890699, "learning_rate": 3.882605724796324e-06, "loss": 0.0316, "step": 3446 }, { "epoch": 1.5682438580527753, "grad_norm": 0.6017869767431822, "learning_rate": 3.882010257050056e-06, "loss": 0.0266, "step": 3447 }, { "epoch": 1.5686988171064604, "grad_norm": 0.5232712126783707, "learning_rate": 3.88141467637215e-06, "loss": 0.0197, "step": 3448 }, { "epoch": 1.5691537761601455, "grad_norm": 0.5530318145705541, "learning_rate": 3.880818982811275e-06, "loss": 0.0279, "step": 3449 }, { "epoch": 1.5696087352138308, "grad_norm": 1.007799449110445, "learning_rate": 3.880223176416108e-06, "loss": 0.035, "step": 3450 }, { "epoch": 1.570063694267516, "grad_norm": 0.5334141245111912, "learning_rate": 3.879627257235337e-06, "loss": 0.0223, "step": 3451 }, { "epoch": 1.570518653321201, "grad_norm": 0.854819849016019, "learning_rate": 3.8790312253176565e-06, "loss": 0.0321, "step": 3452 }, { "epoch": 1.5709736123748863, "grad_norm": 0.6388465604043535, "learning_rate": 3.878435080711772e-06, "loss": 0.0207, "step": 3453 }, { "epoch": 1.5714285714285714, "grad_norm": 0.5635363641527492, "learning_rate": 3.877838823466398e-06, "loss": 0.0223, "step": 3454 }, { "epoch": 1.5718835304822565, "grad_norm": 0.6475301926880436, "learning_rate": 3.8772424536302565e-06, "loss": 0.0226, "step": 3455 }, { "epoch": 1.5723384895359418, "grad_norm": 0.6117567579130653, "learning_rate": 3.876645971252082e-06, "loss": 0.0255, "step": 3456 }, { "epoch": 1.5727934485896269, "grad_norm": 0.5792594395173578, "learning_rate": 3.876049376380615e-06, "loss": 0.0303, "step": 3457 }, { "epoch": 1.573248407643312, "grad_norm": 0.8432212860921924, "learning_rate": 3.875452669064609e-06, "loss": 0.0419, "step": 3458 }, { "epoch": 1.5737033666969973, "grad_norm": 0.42058349084627317, "learning_rate": 3.874855849352821e-06, "loss": 0.0191, "step": 3459 }, { "epoch": 1.5741583257506826, "grad_norm": 0.7235018800789317, "learning_rate": 3.874258917294021e-06, "loss": 0.0406, "step": 3460 }, { "epoch": 1.5746132848043675, "grad_norm": 0.8311342432465947, "learning_rate": 3.873661872936989e-06, "loss": 0.0478, "step": 3461 }, { "epoch": 1.5750682438580528, "grad_norm": 0.6547797917717316, "learning_rate": 3.873064716330513e-06, "loss": 0.0298, "step": 3462 }, { "epoch": 1.575523202911738, "grad_norm": 0.6755671622269617, "learning_rate": 3.872467447523388e-06, "loss": 0.0307, "step": 3463 }, { "epoch": 1.575978161965423, "grad_norm": 0.7583762247476595, "learning_rate": 3.871870066564422e-06, "loss": 0.0283, "step": 3464 }, { "epoch": 1.5764331210191083, "grad_norm": 0.6366872413298845, "learning_rate": 3.8712725735024295e-06, "loss": 0.0263, "step": 3465 }, { "epoch": 1.5768880800727936, "grad_norm": 0.5257311180353379, "learning_rate": 3.870674968386234e-06, "loss": 0.0226, "step": 3466 }, { "epoch": 1.5773430391264787, "grad_norm": 0.4253083736064576, "learning_rate": 3.87007725126467e-06, "loss": 0.0164, "step": 3467 }, { "epoch": 1.5777979981801638, "grad_norm": 0.5386960324866366, "learning_rate": 3.869479422186582e-06, "loss": 0.0173, "step": 3468 }, { "epoch": 1.578252957233849, "grad_norm": 0.48215988619225064, "learning_rate": 3.868881481200818e-06, "loss": 0.0184, "step": 3469 }, { "epoch": 1.5787079162875342, "grad_norm": 0.609767144737994, "learning_rate": 3.868283428356243e-06, "loss": 0.0409, "step": 3470 }, { "epoch": 1.5791628753412192, "grad_norm": 0.5291617114915205, "learning_rate": 3.8676852637017234e-06, "loss": 0.0231, "step": 3471 }, { "epoch": 1.5796178343949046, "grad_norm": 0.48866771038754586, "learning_rate": 3.867086987286141e-06, "loss": 0.0247, "step": 3472 }, { "epoch": 1.5800727934485896, "grad_norm": 0.4998279314275286, "learning_rate": 3.866488599158386e-06, "loss": 0.0243, "step": 3473 }, { "epoch": 1.5805277525022747, "grad_norm": 0.5281491792419408, "learning_rate": 3.865890099367351e-06, "loss": 0.0162, "step": 3474 }, { "epoch": 1.58098271155596, "grad_norm": 0.7800264919684391, "learning_rate": 3.865291487961946e-06, "loss": 0.0358, "step": 3475 }, { "epoch": 1.5814376706096451, "grad_norm": 0.668212797447539, "learning_rate": 3.864692764991087e-06, "loss": 0.0358, "step": 3476 }, { "epoch": 1.5818926296633302, "grad_norm": 0.5890921063282307, "learning_rate": 3.864093930503697e-06, "loss": 0.0248, "step": 3477 }, { "epoch": 1.5823475887170155, "grad_norm": 0.5909722423431498, "learning_rate": 3.863494984548712e-06, "loss": 0.0232, "step": 3478 }, { "epoch": 1.5828025477707006, "grad_norm": 0.6172756228743427, "learning_rate": 3.862895927175074e-06, "loss": 0.0344, "step": 3479 }, { "epoch": 1.5832575068243857, "grad_norm": 0.547485277081697, "learning_rate": 3.862296758431736e-06, "loss": 0.0279, "step": 3480 }, { "epoch": 1.583712465878071, "grad_norm": 0.5162157512249061, "learning_rate": 3.861697478367658e-06, "loss": 0.0164, "step": 3481 }, { "epoch": 1.584167424931756, "grad_norm": 0.6420660037228898, "learning_rate": 3.8610980870318126e-06, "loss": 0.0198, "step": 3482 }, { "epoch": 1.5846223839854412, "grad_norm": 0.7440250847976592, "learning_rate": 3.860498584473178e-06, "loss": 0.0479, "step": 3483 }, { "epoch": 1.5850773430391265, "grad_norm": 0.4443405489217349, "learning_rate": 3.859898970740743e-06, "loss": 0.0159, "step": 3484 }, { "epoch": 1.5855323020928116, "grad_norm": 0.6979930315119395, "learning_rate": 3.859299245883505e-06, "loss": 0.0324, "step": 3485 }, { "epoch": 1.5859872611464967, "grad_norm": 0.7134876833857815, "learning_rate": 3.858699409950472e-06, "loss": 0.0398, "step": 3486 }, { "epoch": 1.586442220200182, "grad_norm": 0.4825353059791467, "learning_rate": 3.858099462990658e-06, "loss": 0.016, "step": 3487 }, { "epoch": 1.5868971792538673, "grad_norm": 0.6150538702034337, "learning_rate": 3.857499405053089e-06, "loss": 0.0301, "step": 3488 }, { "epoch": 1.5873521383075522, "grad_norm": 0.625710397147892, "learning_rate": 3.856899236186799e-06, "loss": 0.0306, "step": 3489 }, { "epoch": 1.5878070973612375, "grad_norm": 0.6522778309170958, "learning_rate": 3.856298956440832e-06, "loss": 0.0292, "step": 3490 }, { "epoch": 1.5882620564149228, "grad_norm": 0.5784507541788387, "learning_rate": 3.8556985658642395e-06, "loss": 0.0288, "step": 3491 }, { "epoch": 1.5887170154686077, "grad_norm": 0.6076283369120725, "learning_rate": 3.855098064506081e-06, "loss": 0.0284, "step": 3492 }, { "epoch": 1.589171974522293, "grad_norm": 0.5347645776215969, "learning_rate": 3.85449745241543e-06, "loss": 0.0197, "step": 3493 }, { "epoch": 1.5896269335759783, "grad_norm": 0.5608060117520718, "learning_rate": 3.853896729641363e-06, "loss": 0.0249, "step": 3494 }, { "epoch": 1.5900818926296634, "grad_norm": 0.6468434298369498, "learning_rate": 3.853295896232969e-06, "loss": 0.0364, "step": 3495 }, { "epoch": 1.5905368516833485, "grad_norm": 0.6854777995675021, "learning_rate": 3.852694952239347e-06, "loss": 0.0339, "step": 3496 }, { "epoch": 1.5909918107370338, "grad_norm": 0.6643748488103951, "learning_rate": 3.852093897709601e-06, "loss": 0.0215, "step": 3497 }, { "epoch": 1.5914467697907189, "grad_norm": 0.5125764451782172, "learning_rate": 3.851492732692849e-06, "loss": 0.0221, "step": 3498 }, { "epoch": 1.591901728844404, "grad_norm": 0.9740815731787803, "learning_rate": 3.8508914572382124e-06, "loss": 0.0304, "step": 3499 }, { "epoch": 1.5923566878980893, "grad_norm": 0.47429273180622816, "learning_rate": 3.850290071394828e-06, "loss": 0.0211, "step": 3500 }, { "epoch": 1.5928116469517744, "grad_norm": 0.5856345203484362, "learning_rate": 3.8496885752118365e-06, "loss": 0.0275, "step": 3501 }, { "epoch": 1.5932666060054594, "grad_norm": 0.5701881313175124, "learning_rate": 3.849086968738389e-06, "loss": 0.0248, "step": 3502 }, { "epoch": 1.5937215650591448, "grad_norm": 0.7010777024298871, "learning_rate": 3.848485252023647e-06, "loss": 0.0181, "step": 3503 }, { "epoch": 1.5941765241128298, "grad_norm": 0.7893900975337746, "learning_rate": 3.847883425116781e-06, "loss": 0.0395, "step": 3504 }, { "epoch": 1.594631483166515, "grad_norm": 0.5894497701644011, "learning_rate": 3.8472814880669675e-06, "loss": 0.0368, "step": 3505 }, { "epoch": 1.5950864422202002, "grad_norm": 0.7531589111356518, "learning_rate": 3.8466794409233946e-06, "loss": 0.0258, "step": 3506 }, { "epoch": 1.5955414012738853, "grad_norm": 0.9230617438117314, "learning_rate": 3.846077283735261e-06, "loss": 0.0325, "step": 3507 }, { "epoch": 1.5959963603275704, "grad_norm": 0.370638936750391, "learning_rate": 3.84547501655177e-06, "loss": 0.0162, "step": 3508 }, { "epoch": 1.5964513193812557, "grad_norm": 0.6926234974622569, "learning_rate": 3.844872639422136e-06, "loss": 0.0362, "step": 3509 }, { "epoch": 1.5969062784349408, "grad_norm": 0.47160413647170063, "learning_rate": 3.844270152395583e-06, "loss": 0.0214, "step": 3510 }, { "epoch": 1.597361237488626, "grad_norm": 0.7083297825352631, "learning_rate": 3.843667555521346e-06, "loss": 0.0438, "step": 3511 }, { "epoch": 1.5978161965423112, "grad_norm": 0.8441503938502771, "learning_rate": 3.843064848848662e-06, "loss": 0.0276, "step": 3512 }, { "epoch": 1.5982711555959963, "grad_norm": 0.6561878128626468, "learning_rate": 3.842462032426784e-06, "loss": 0.0257, "step": 3513 }, { "epoch": 1.5987261146496814, "grad_norm": 0.4142556056021124, "learning_rate": 3.841859106304973e-06, "loss": 0.0153, "step": 3514 }, { "epoch": 1.5991810737033667, "grad_norm": 0.7732835680071638, "learning_rate": 3.841256070532494e-06, "loss": 0.0481, "step": 3515 }, { "epoch": 1.599636032757052, "grad_norm": 0.9016444602806903, "learning_rate": 3.840652925158626e-06, "loss": 0.0402, "step": 3516 }, { "epoch": 1.6000909918107369, "grad_norm": 0.7200030788002735, "learning_rate": 3.840049670232656e-06, "loss": 0.0321, "step": 3517 }, { "epoch": 1.6005459508644222, "grad_norm": 0.5119605070096909, "learning_rate": 3.839446305803878e-06, "loss": 0.025, "step": 3518 }, { "epoch": 1.6010009099181075, "grad_norm": 0.5867897527768563, "learning_rate": 3.838842831921598e-06, "loss": 0.018, "step": 3519 }, { "epoch": 1.6014558689717924, "grad_norm": 0.5674939657400796, "learning_rate": 3.8382392486351265e-06, "loss": 0.0243, "step": 3520 }, { "epoch": 1.6019108280254777, "grad_norm": 0.4585806965385925, "learning_rate": 3.837635555993787e-06, "loss": 0.0215, "step": 3521 }, { "epoch": 1.602365787079163, "grad_norm": 0.5861447224605015, "learning_rate": 3.837031754046912e-06, "loss": 0.0261, "step": 3522 }, { "epoch": 1.602820746132848, "grad_norm": 0.7789923182655337, "learning_rate": 3.836427842843838e-06, "loss": 0.0318, "step": 3523 }, { "epoch": 1.6032757051865332, "grad_norm": 0.6200237101545006, "learning_rate": 3.835823822433918e-06, "loss": 0.0282, "step": 3524 }, { "epoch": 1.6037306642402185, "grad_norm": 0.4823998206699662, "learning_rate": 3.835219692866506e-06, "loss": 0.0198, "step": 3525 }, { "epoch": 1.6041856232939036, "grad_norm": 0.5398187465426305, "learning_rate": 3.834615454190972e-06, "loss": 0.0297, "step": 3526 }, { "epoch": 1.6046405823475887, "grad_norm": 0.6202173676714453, "learning_rate": 3.834011106456689e-06, "loss": 0.04, "step": 3527 }, { "epoch": 1.605095541401274, "grad_norm": 0.6723747745682109, "learning_rate": 3.833406649713044e-06, "loss": 0.0373, "step": 3528 }, { "epoch": 1.605550500454959, "grad_norm": 0.5542430465028839, "learning_rate": 3.832802084009428e-06, "loss": 0.0282, "step": 3529 }, { "epoch": 1.6060054595086442, "grad_norm": 0.565624305303762, "learning_rate": 3.832197409395245e-06, "loss": 0.0215, "step": 3530 }, { "epoch": 1.6064604185623295, "grad_norm": 0.4861010317792031, "learning_rate": 3.831592625919906e-06, "loss": 0.0225, "step": 3531 }, { "epoch": 1.6069153776160146, "grad_norm": 0.4373745791753047, "learning_rate": 3.830987733632831e-06, "loss": 0.0176, "step": 3532 }, { "epoch": 1.6073703366696996, "grad_norm": 0.5422984557811947, "learning_rate": 3.830382732583449e-06, "loss": 0.0209, "step": 3533 }, { "epoch": 1.607825295723385, "grad_norm": 0.5685852646509083, "learning_rate": 3.829777622821198e-06, "loss": 0.0297, "step": 3534 }, { "epoch": 1.60828025477707, "grad_norm": 0.543627577172519, "learning_rate": 3.8291724043955245e-06, "loss": 0.0166, "step": 3535 }, { "epoch": 1.6087352138307551, "grad_norm": 0.488533286475008, "learning_rate": 3.828567077355885e-06, "loss": 0.017, "step": 3536 }, { "epoch": 1.6091901728844404, "grad_norm": 0.4320474763865453, "learning_rate": 3.827961641751744e-06, "loss": 0.016, "step": 3537 }, { "epoch": 1.6096451319381255, "grad_norm": 0.5050357798930966, "learning_rate": 3.827356097632574e-06, "loss": 0.0298, "step": 3538 }, { "epoch": 1.6101000909918106, "grad_norm": 0.43740647703076296, "learning_rate": 3.826750445047859e-06, "loss": 0.016, "step": 3539 }, { "epoch": 1.610555050045496, "grad_norm": 0.8100698120528552, "learning_rate": 3.826144684047089e-06, "loss": 0.0347, "step": 3540 }, { "epoch": 1.6110100090991812, "grad_norm": 0.6197479181778295, "learning_rate": 3.825538814679763e-06, "loss": 0.0168, "step": 3541 }, { "epoch": 1.611464968152866, "grad_norm": 0.5803411146457176, "learning_rate": 3.824932836995392e-06, "loss": 0.0185, "step": 3542 }, { "epoch": 1.6119199272065514, "grad_norm": 0.6921154745229382, "learning_rate": 3.8243267510434936e-06, "loss": 0.0443, "step": 3543 }, { "epoch": 1.6123748862602367, "grad_norm": 0.539892414542492, "learning_rate": 3.823720556873592e-06, "loss": 0.016, "step": 3544 }, { "epoch": 1.6128298453139216, "grad_norm": 0.60868298291056, "learning_rate": 3.823114254535226e-06, "loss": 0.0237, "step": 3545 }, { "epoch": 1.613284804367607, "grad_norm": 0.4889853258398554, "learning_rate": 3.8225078440779375e-06, "loss": 0.0209, "step": 3546 }, { "epoch": 1.6137397634212922, "grad_norm": 0.7040369108834665, "learning_rate": 3.821901325551281e-06, "loss": 0.0237, "step": 3547 }, { "epoch": 1.6141947224749773, "grad_norm": 0.6923999096362768, "learning_rate": 3.821294699004816e-06, "loss": 0.0255, "step": 3548 }, { "epoch": 1.6146496815286624, "grad_norm": 0.4021417615609393, "learning_rate": 3.820687964488117e-06, "loss": 0.0172, "step": 3549 }, { "epoch": 1.6151046405823477, "grad_norm": 0.5145338706518726, "learning_rate": 3.82008112205076e-06, "loss": 0.0222, "step": 3550 }, { "epoch": 1.6155595996360328, "grad_norm": 0.6171169806105846, "learning_rate": 3.819474171742336e-06, "loss": 0.0224, "step": 3551 }, { "epoch": 1.6160145586897179, "grad_norm": 0.5389292833044773, "learning_rate": 3.8188671136124425e-06, "loss": 0.0181, "step": 3552 }, { "epoch": 1.6164695177434032, "grad_norm": 0.4297302177233826, "learning_rate": 3.818259947710683e-06, "loss": 0.0181, "step": 3553 }, { "epoch": 1.6169244767970883, "grad_norm": 0.6660195157126831, "learning_rate": 3.817652674086675e-06, "loss": 0.0259, "step": 3554 }, { "epoch": 1.6173794358507734, "grad_norm": 0.758925079775858, "learning_rate": 3.81704529279004e-06, "loss": 0.0513, "step": 3555 }, { "epoch": 1.6178343949044587, "grad_norm": 0.4334208779986942, "learning_rate": 3.816437803870412e-06, "loss": 0.0196, "step": 3556 }, { "epoch": 1.6182893539581438, "grad_norm": 0.7882302405649769, "learning_rate": 3.815830207377431e-06, "loss": 0.0342, "step": 3557 }, { "epoch": 1.6187443130118289, "grad_norm": 0.6638178492343451, "learning_rate": 3.815222503360748e-06, "loss": 0.0297, "step": 3558 }, { "epoch": 1.6191992720655142, "grad_norm": 0.49748672673993416, "learning_rate": 3.814614691870021e-06, "loss": 0.0127, "step": 3559 }, { "epoch": 1.6196542311191993, "grad_norm": 0.49860874776103087, "learning_rate": 3.814006772954919e-06, "loss": 0.0187, "step": 3560 }, { "epoch": 1.6201091901728844, "grad_norm": 0.45358670629439046, "learning_rate": 3.8133987466651175e-06, "loss": 0.022, "step": 3561 }, { "epoch": 1.6205641492265697, "grad_norm": 0.7110343231389603, "learning_rate": 3.8127906130503014e-06, "loss": 0.0325, "step": 3562 }, { "epoch": 1.6210191082802548, "grad_norm": 0.46120308542915633, "learning_rate": 3.8121823721601647e-06, "loss": 0.0117, "step": 3563 }, { "epoch": 1.6214740673339398, "grad_norm": 0.7232903654349334, "learning_rate": 3.8115740240444106e-06, "loss": 0.0288, "step": 3564 }, { "epoch": 1.6219290263876252, "grad_norm": 0.6618321138024421, "learning_rate": 3.81096556875275e-06, "loss": 0.0239, "step": 3565 }, { "epoch": 1.6223839854413102, "grad_norm": 0.6552432695181556, "learning_rate": 3.8103570063349034e-06, "loss": 0.0378, "step": 3566 }, { "epoch": 1.6228389444949953, "grad_norm": 0.8200462149386227, "learning_rate": 3.8097483368406003e-06, "loss": 0.0463, "step": 3567 }, { "epoch": 1.6232939035486806, "grad_norm": 0.578990989494396, "learning_rate": 3.809139560319577e-06, "loss": 0.0222, "step": 3568 }, { "epoch": 1.623748862602366, "grad_norm": 0.4202465716313672, "learning_rate": 3.8085306768215812e-06, "loss": 0.0107, "step": 3569 }, { "epoch": 1.6242038216560508, "grad_norm": 0.5061392067706059, "learning_rate": 3.8079216863963675e-06, "loss": 0.0165, "step": 3570 }, { "epoch": 1.6246587807097361, "grad_norm": 0.43536820092251655, "learning_rate": 3.807312589093701e-06, "loss": 0.0188, "step": 3571 }, { "epoch": 1.6251137397634214, "grad_norm": 0.6050677446333904, "learning_rate": 3.806703384963353e-06, "loss": 0.0271, "step": 3572 }, { "epoch": 1.6255686988171063, "grad_norm": 0.8225447317035454, "learning_rate": 3.8060940740551056e-06, "loss": 0.0333, "step": 3573 }, { "epoch": 1.6260236578707916, "grad_norm": 0.3731276144702114, "learning_rate": 3.8054846564187486e-06, "loss": 0.0104, "step": 3574 }, { "epoch": 1.626478616924477, "grad_norm": 0.6006174110645207, "learning_rate": 3.8048751321040806e-06, "loss": 0.0278, "step": 3575 }, { "epoch": 1.626933575978162, "grad_norm": 0.6442558908935601, "learning_rate": 3.80426550116091e-06, "loss": 0.0238, "step": 3576 }, { "epoch": 1.627388535031847, "grad_norm": 0.6207227243383738, "learning_rate": 3.8036557636390527e-06, "loss": 0.0299, "step": 3577 }, { "epoch": 1.6278434940855324, "grad_norm": 0.5585727627560396, "learning_rate": 3.803045919588333e-06, "loss": 0.0262, "step": 3578 }, { "epoch": 1.6282984531392175, "grad_norm": 0.44114683790491727, "learning_rate": 3.8024359690585856e-06, "loss": 0.021, "step": 3579 }, { "epoch": 1.6287534121929026, "grad_norm": 0.3296094863201106, "learning_rate": 3.8018259120996527e-06, "loss": 0.0174, "step": 3580 }, { "epoch": 1.629208371246588, "grad_norm": 0.5397044586335464, "learning_rate": 3.8012157487613853e-06, "loss": 0.019, "step": 3581 }, { "epoch": 1.629663330300273, "grad_norm": 1.076109393415077, "learning_rate": 3.800605479093643e-06, "loss": 0.0429, "step": 3582 }, { "epoch": 1.630118289353958, "grad_norm": 0.6959026110586248, "learning_rate": 3.7999951031462946e-06, "loss": 0.03, "step": 3583 }, { "epoch": 1.6305732484076434, "grad_norm": 0.5139348407505921, "learning_rate": 3.7993846209692176e-06, "loss": 0.017, "step": 3584 }, { "epoch": 1.6310282074613285, "grad_norm": 0.9198756880788689, "learning_rate": 3.798774032612297e-06, "loss": 0.0399, "step": 3585 }, { "epoch": 1.6314831665150136, "grad_norm": 0.36677379892540746, "learning_rate": 3.7981633381254266e-06, "loss": 0.0131, "step": 3586 }, { "epoch": 1.6319381255686989, "grad_norm": 0.5820327888677712, "learning_rate": 3.7975525375585115e-06, "loss": 0.0184, "step": 3587 }, { "epoch": 1.632393084622384, "grad_norm": 0.5161381978272331, "learning_rate": 3.7969416309614633e-06, "loss": 0.0216, "step": 3588 }, { "epoch": 1.632848043676069, "grad_norm": 0.5463019720540039, "learning_rate": 3.796330618384201e-06, "loss": 0.0225, "step": 3589 }, { "epoch": 1.6333030027297544, "grad_norm": 0.4568655142259495, "learning_rate": 3.795719499876655e-06, "loss": 0.0182, "step": 3590 }, { "epoch": 1.6337579617834395, "grad_norm": 0.6196564337957245, "learning_rate": 3.7951082754887638e-06, "loss": 0.0182, "step": 3591 }, { "epoch": 1.6342129208371245, "grad_norm": 0.506298546219541, "learning_rate": 3.7944969452704717e-06, "loss": 0.0234, "step": 3592 }, { "epoch": 1.6346678798908099, "grad_norm": 0.7295581071981191, "learning_rate": 3.7938855092717354e-06, "loss": 0.0358, "step": 3593 }, { "epoch": 1.635122838944495, "grad_norm": 0.4035231793496555, "learning_rate": 3.793273967542519e-06, "loss": 0.0137, "step": 3594 }, { "epoch": 1.63557779799818, "grad_norm": 0.5837188097448504, "learning_rate": 3.792662320132794e-06, "loss": 0.0254, "step": 3595 }, { "epoch": 1.6360327570518653, "grad_norm": 0.42331112642730656, "learning_rate": 3.792050567092542e-06, "loss": 0.0226, "step": 3596 }, { "epoch": 1.6364877161055507, "grad_norm": 0.6719821406101639, "learning_rate": 3.791438708471752e-06, "loss": 0.0326, "step": 3597 }, { "epoch": 1.6369426751592355, "grad_norm": 0.6622216896059354, "learning_rate": 3.7908267443204226e-06, "loss": 0.023, "step": 3598 }, { "epoch": 1.6373976342129208, "grad_norm": 0.5047368379702989, "learning_rate": 3.7902146746885614e-06, "loss": 0.0244, "step": 3599 }, { "epoch": 1.6378525932666061, "grad_norm": 0.6965120716900632, "learning_rate": 3.789602499626184e-06, "loss": 0.0308, "step": 3600 }, { "epoch": 1.638307552320291, "grad_norm": 0.6025551885499085, "learning_rate": 3.788990219183314e-06, "loss": 0.0216, "step": 3601 }, { "epoch": 1.6387625113739763, "grad_norm": 0.6125717157099263, "learning_rate": 3.7883778334099842e-06, "loss": 0.0231, "step": 3602 }, { "epoch": 1.6392174704276616, "grad_norm": 0.5473132899665126, "learning_rate": 3.7877653423562365e-06, "loss": 0.0257, "step": 3603 }, { "epoch": 1.6396724294813467, "grad_norm": 0.6537475089518195, "learning_rate": 3.787152746072119e-06, "loss": 0.0274, "step": 3604 }, { "epoch": 1.6401273885350318, "grad_norm": 0.6480805629958326, "learning_rate": 3.7865400446076933e-06, "loss": 0.0279, "step": 3605 }, { "epoch": 1.6405823475887171, "grad_norm": 0.630271607808444, "learning_rate": 3.7859272380130248e-06, "loss": 0.0222, "step": 3606 }, { "epoch": 1.6410373066424022, "grad_norm": 0.48911537645003134, "learning_rate": 3.785314326338189e-06, "loss": 0.022, "step": 3607 }, { "epoch": 1.6414922656960873, "grad_norm": 0.49625230753870486, "learning_rate": 3.784701309633272e-06, "loss": 0.0154, "step": 3608 }, { "epoch": 1.6419472247497726, "grad_norm": 0.6207889729485055, "learning_rate": 3.7840881879483647e-06, "loss": 0.0222, "step": 3609 }, { "epoch": 1.6424021838034577, "grad_norm": 0.7328816141105232, "learning_rate": 3.7834749613335704e-06, "loss": 0.0209, "step": 3610 }, { "epoch": 1.6428571428571428, "grad_norm": 0.5530871566437657, "learning_rate": 3.782861629838997e-06, "loss": 0.0239, "step": 3611 }, { "epoch": 1.643312101910828, "grad_norm": 0.9052471747323153, "learning_rate": 3.782248193514766e-06, "loss": 0.0313, "step": 3612 }, { "epoch": 1.6437670609645132, "grad_norm": 0.4611313175757836, "learning_rate": 3.7816346524110027e-06, "loss": 0.0164, "step": 3613 }, { "epoch": 1.6442220200181983, "grad_norm": 0.5318534391416457, "learning_rate": 3.781021006577843e-06, "loss": 0.0187, "step": 3614 }, { "epoch": 1.6446769790718836, "grad_norm": 0.8999001858437418, "learning_rate": 3.780407256065432e-06, "loss": 0.0448, "step": 3615 }, { "epoch": 1.6451319381255687, "grad_norm": 0.6841986770877886, "learning_rate": 3.7797934009239224e-06, "loss": 0.0257, "step": 3616 }, { "epoch": 1.6455868971792538, "grad_norm": 0.5953377845238845, "learning_rate": 3.7791794412034756e-06, "loss": 0.033, "step": 3617 }, { "epoch": 1.646041856232939, "grad_norm": 0.6104178527749724, "learning_rate": 3.7785653769542613e-06, "loss": 0.0219, "step": 3618 }, { "epoch": 1.6464968152866242, "grad_norm": 0.6615786474747989, "learning_rate": 3.7779512082264586e-06, "loss": 0.0351, "step": 3619 }, { "epoch": 1.6469517743403093, "grad_norm": 0.45031972888153543, "learning_rate": 3.777336935070255e-06, "loss": 0.0191, "step": 3620 }, { "epoch": 1.6474067333939946, "grad_norm": 0.5290797918701384, "learning_rate": 3.7767225575358434e-06, "loss": 0.0264, "step": 3621 }, { "epoch": 1.6478616924476797, "grad_norm": 0.6155643496140948, "learning_rate": 3.7761080756734318e-06, "loss": 0.0293, "step": 3622 }, { "epoch": 1.6483166515013647, "grad_norm": 0.7688383564375362, "learning_rate": 3.7754934895332306e-06, "loss": 0.0418, "step": 3623 }, { "epoch": 1.64877161055505, "grad_norm": 0.5286169912201062, "learning_rate": 3.7748787991654623e-06, "loss": 0.0231, "step": 3624 }, { "epoch": 1.6492265696087354, "grad_norm": 0.6616058729544022, "learning_rate": 3.774264004620355e-06, "loss": 0.037, "step": 3625 }, { "epoch": 1.6496815286624202, "grad_norm": 0.4034447323556056, "learning_rate": 3.7736491059481474e-06, "loss": 0.0201, "step": 3626 }, { "epoch": 1.6501364877161055, "grad_norm": 0.4827707916446577, "learning_rate": 3.7730341031990873e-06, "loss": 0.0239, "step": 3627 }, { "epoch": 1.6505914467697909, "grad_norm": 0.622385511217906, "learning_rate": 3.772418996423428e-06, "loss": 0.0304, "step": 3628 }, { "epoch": 1.6510464058234757, "grad_norm": 0.4456421103444245, "learning_rate": 3.7718037856714364e-06, "loss": 0.0168, "step": 3629 }, { "epoch": 1.651501364877161, "grad_norm": 0.5066325224743325, "learning_rate": 3.7711884709933823e-06, "loss": 0.0174, "step": 3630 }, { "epoch": 1.6519563239308463, "grad_norm": 0.5641394565791685, "learning_rate": 3.7705730524395466e-06, "loss": 0.0313, "step": 3631 }, { "epoch": 1.6524112829845314, "grad_norm": 0.620291914257022, "learning_rate": 3.7699575300602188e-06, "loss": 0.0188, "step": 3632 }, { "epoch": 1.6528662420382165, "grad_norm": 0.5219762390228715, "learning_rate": 3.7693419039056965e-06, "loss": 0.0231, "step": 3633 }, { "epoch": 1.6533212010919018, "grad_norm": 0.4441682306331103, "learning_rate": 3.768726174026287e-06, "loss": 0.0192, "step": 3634 }, { "epoch": 1.653776160145587, "grad_norm": 0.44964605531542085, "learning_rate": 3.768110340472304e-06, "loss": 0.0208, "step": 3635 }, { "epoch": 1.654231119199272, "grad_norm": 0.6485140503083492, "learning_rate": 3.7674944032940696e-06, "loss": 0.0213, "step": 3636 }, { "epoch": 1.6546860782529573, "grad_norm": 0.6021902161090137, "learning_rate": 3.766878362541918e-06, "loss": 0.022, "step": 3637 }, { "epoch": 1.6551410373066424, "grad_norm": 0.5202194021470081, "learning_rate": 3.7662622182661867e-06, "loss": 0.0191, "step": 3638 }, { "epoch": 1.6555959963603275, "grad_norm": 1.009854523450612, "learning_rate": 3.7656459705172255e-06, "loss": 0.0403, "step": 3639 }, { "epoch": 1.6560509554140128, "grad_norm": 0.6839263706918253, "learning_rate": 3.7650296193453916e-06, "loss": 0.0269, "step": 3640 }, { "epoch": 1.656505914467698, "grad_norm": 0.6993714428573524, "learning_rate": 3.7644131648010494e-06, "loss": 0.0246, "step": 3641 }, { "epoch": 1.656960873521383, "grad_norm": 0.6815473973133779, "learning_rate": 3.7637966069345743e-06, "loss": 0.0203, "step": 3642 }, { "epoch": 1.6574158325750683, "grad_norm": 0.8439769351263455, "learning_rate": 3.7631799457963467e-06, "loss": 0.0419, "step": 3643 }, { "epoch": 1.6578707916287534, "grad_norm": 0.7033184573271442, "learning_rate": 3.7625631814367593e-06, "loss": 0.0331, "step": 3644 }, { "epoch": 1.6583257506824385, "grad_norm": 0.6122877596789658, "learning_rate": 3.7619463139062097e-06, "loss": 0.0242, "step": 3645 }, { "epoch": 1.6587807097361238, "grad_norm": 0.8027844629713855, "learning_rate": 3.761329343255107e-06, "loss": 0.0285, "step": 3646 }, { "epoch": 1.6592356687898089, "grad_norm": 0.5437531086613037, "learning_rate": 3.760712269533866e-06, "loss": 0.0193, "step": 3647 }, { "epoch": 1.659690627843494, "grad_norm": 0.8454673738376148, "learning_rate": 3.7600950927929116e-06, "loss": 0.0289, "step": 3648 }, { "epoch": 1.6601455868971793, "grad_norm": 0.5874359688131018, "learning_rate": 3.759477813082677e-06, "loss": 0.0368, "step": 3649 }, { "epoch": 1.6606005459508644, "grad_norm": 0.561547399632124, "learning_rate": 3.7588604304536026e-06, "loss": 0.0257, "step": 3650 }, { "epoch": 1.6610555050045495, "grad_norm": 0.6516254171674322, "learning_rate": 3.75824294495614e-06, "loss": 0.0264, "step": 3651 }, { "epoch": 1.6615104640582348, "grad_norm": 0.586676169852581, "learning_rate": 3.757625356640745e-06, "loss": 0.026, "step": 3652 }, { "epoch": 1.66196542311192, "grad_norm": 0.4734126319148065, "learning_rate": 3.757007665557886e-06, "loss": 0.0242, "step": 3653 }, { "epoch": 1.662420382165605, "grad_norm": 0.5634091884397163, "learning_rate": 3.7563898717580364e-06, "loss": 0.0319, "step": 3654 }, { "epoch": 1.6628753412192903, "grad_norm": 0.4879942321956978, "learning_rate": 3.755771975291681e-06, "loss": 0.0168, "step": 3655 }, { "epoch": 1.6633303002729756, "grad_norm": 0.5665912556605981, "learning_rate": 3.7551539762093103e-06, "loss": 0.0217, "step": 3656 }, { "epoch": 1.6637852593266604, "grad_norm": 0.5511340592863238, "learning_rate": 3.7545358745614246e-06, "loss": 0.0236, "step": 3657 }, { "epoch": 1.6642402183803457, "grad_norm": 0.7980773287068369, "learning_rate": 3.7539176703985338e-06, "loss": 0.0395, "step": 3658 }, { "epoch": 1.664695177434031, "grad_norm": 0.5272569448570994, "learning_rate": 3.7532993637711524e-06, "loss": 0.017, "step": 3659 }, { "epoch": 1.6651501364877161, "grad_norm": 1.973300676488328, "learning_rate": 3.7526809547298072e-06, "loss": 0.0867, "step": 3660 }, { "epoch": 1.6656050955414012, "grad_norm": 0.4603199822151636, "learning_rate": 3.752062443325032e-06, "loss": 0.0235, "step": 3661 }, { "epoch": 1.6660600545950865, "grad_norm": 0.4984041496096718, "learning_rate": 3.7514438296073678e-06, "loss": 0.0159, "step": 3662 }, { "epoch": 1.6665150136487716, "grad_norm": 0.6486830695220506, "learning_rate": 3.7508251136273656e-06, "loss": 0.0202, "step": 3663 }, { "epoch": 1.6669699727024567, "grad_norm": 0.8468363633924074, "learning_rate": 3.7502062954355835e-06, "loss": 0.0526, "step": 3664 }, { "epoch": 1.667424931756142, "grad_norm": 0.3919623481399935, "learning_rate": 3.749587375082589e-06, "loss": 0.0112, "step": 3665 }, { "epoch": 1.6678798908098271, "grad_norm": 0.5459884336885586, "learning_rate": 3.7489683526189575e-06, "loss": 0.0283, "step": 3666 }, { "epoch": 1.6683348498635122, "grad_norm": 0.5733639666031194, "learning_rate": 3.7483492280952718e-06, "loss": 0.027, "step": 3667 }, { "epoch": 1.6687898089171975, "grad_norm": 0.551070019879109, "learning_rate": 3.747730001562125e-06, "loss": 0.0223, "step": 3668 }, { "epoch": 1.6692447679708826, "grad_norm": 0.8818886132528604, "learning_rate": 3.747110673070117e-06, "loss": 0.0439, "step": 3669 }, { "epoch": 1.6696997270245677, "grad_norm": 0.449458660767284, "learning_rate": 3.7464912426698568e-06, "loss": 0.0129, "step": 3670 }, { "epoch": 1.670154686078253, "grad_norm": 0.5652944170753819, "learning_rate": 3.7458717104119618e-06, "loss": 0.0203, "step": 3671 }, { "epoch": 1.670609645131938, "grad_norm": 0.5678881883605275, "learning_rate": 3.7452520763470567e-06, "loss": 0.0225, "step": 3672 }, { "epoch": 1.6710646041856232, "grad_norm": 0.749443357576462, "learning_rate": 3.7446323405257755e-06, "loss": 0.0366, "step": 3673 }, { "epoch": 1.6715195632393085, "grad_norm": 0.48011063349192723, "learning_rate": 3.7440125029987593e-06, "loss": 0.025, "step": 3674 }, { "epoch": 1.6719745222929936, "grad_norm": 0.6671257763330405, "learning_rate": 3.7433925638166603e-06, "loss": 0.0238, "step": 3675 }, { "epoch": 1.6724294813466787, "grad_norm": 0.4852610433290402, "learning_rate": 3.742772523030136e-06, "loss": 0.0207, "step": 3676 }, { "epoch": 1.672884440400364, "grad_norm": 0.6115036341454524, "learning_rate": 3.742152380689853e-06, "loss": 0.0235, "step": 3677 }, { "epoch": 1.673339399454049, "grad_norm": 0.5855479287907706, "learning_rate": 3.7415321368464872e-06, "loss": 0.0313, "step": 3678 }, { "epoch": 1.6737943585077342, "grad_norm": 0.7448031677845011, "learning_rate": 3.740911791550722e-06, "loss": 0.0178, "step": 3679 }, { "epoch": 1.6742493175614195, "grad_norm": 0.7332173099068802, "learning_rate": 3.7402913448532493e-06, "loss": 0.0288, "step": 3680 }, { "epoch": 1.6747042766151048, "grad_norm": 0.6418505071933155, "learning_rate": 3.7396707968047676e-06, "loss": 0.0327, "step": 3681 }, { "epoch": 1.6751592356687897, "grad_norm": 0.7202448098932808, "learning_rate": 3.7390501474559883e-06, "loss": 0.0347, "step": 3682 }, { "epoch": 1.675614194722475, "grad_norm": 0.823163701581977, "learning_rate": 3.738429396857626e-06, "loss": 0.0326, "step": 3683 }, { "epoch": 1.6760691537761603, "grad_norm": 0.8604154683999686, "learning_rate": 3.7378085450604053e-06, "loss": 0.0277, "step": 3684 }, { "epoch": 1.6765241128298451, "grad_norm": 0.5425924320853744, "learning_rate": 3.7371875921150612e-06, "loss": 0.0206, "step": 3685 }, { "epoch": 1.6769790718835305, "grad_norm": 0.537530491367301, "learning_rate": 3.7365665380723335e-06, "loss": 0.023, "step": 3686 }, { "epoch": 1.6774340309372158, "grad_norm": 0.6072594994123963, "learning_rate": 3.7359453829829734e-06, "loss": 0.0249, "step": 3687 }, { "epoch": 1.6778889899909009, "grad_norm": 0.7277626181433888, "learning_rate": 3.7353241268977373e-06, "loss": 0.0289, "step": 3688 }, { "epoch": 1.678343949044586, "grad_norm": 0.37079405045520875, "learning_rate": 3.734702769867393e-06, "loss": 0.0085, "step": 3689 }, { "epoch": 1.6787989080982713, "grad_norm": 0.6369715456570345, "learning_rate": 3.734081311942714e-06, "loss": 0.0253, "step": 3690 }, { "epoch": 1.6792538671519563, "grad_norm": 0.5823431934795118, "learning_rate": 3.733459753174482e-06, "loss": 0.0166, "step": 3691 }, { "epoch": 1.6797088262056414, "grad_norm": 0.5224766975497316, "learning_rate": 3.7328380936134904e-06, "loss": 0.0239, "step": 3692 }, { "epoch": 1.6801637852593267, "grad_norm": 0.6392489030563797, "learning_rate": 3.732216333310537e-06, "loss": 0.0299, "step": 3693 }, { "epoch": 1.6806187443130118, "grad_norm": 0.6247103573315796, "learning_rate": 3.7315944723164297e-06, "loss": 0.0248, "step": 3694 }, { "epoch": 1.681073703366697, "grad_norm": 0.7473537288598567, "learning_rate": 3.730972510681984e-06, "loss": 0.0345, "step": 3695 }, { "epoch": 1.6815286624203822, "grad_norm": 0.5024685900367807, "learning_rate": 3.7303504484580235e-06, "loss": 0.0191, "step": 3696 }, { "epoch": 1.6819836214740673, "grad_norm": 0.6298449368900484, "learning_rate": 3.729728285695381e-06, "loss": 0.0214, "step": 3697 }, { "epoch": 1.6824385805277524, "grad_norm": 0.4152971336432089, "learning_rate": 3.7291060224448948e-06, "loss": 0.0171, "step": 3698 }, { "epoch": 1.6828935395814377, "grad_norm": 0.6815773728664686, "learning_rate": 3.728483658757417e-06, "loss": 0.0188, "step": 3699 }, { "epoch": 1.6833484986351228, "grad_norm": 0.618924972853139, "learning_rate": 3.7278611946838016e-06, "loss": 0.0185, "step": 3700 }, { "epoch": 1.683803457688808, "grad_norm": 0.5167418108822708, "learning_rate": 3.727238630274914e-06, "loss": 0.0206, "step": 3701 }, { "epoch": 1.6842584167424932, "grad_norm": 0.796862561203587, "learning_rate": 3.726615965581628e-06, "loss": 0.0393, "step": 3702 }, { "epoch": 1.6847133757961783, "grad_norm": 16.410696857396648, "learning_rate": 3.725993200654825e-06, "loss": 0.4378, "step": 3703 }, { "epoch": 1.6851683348498634, "grad_norm": 0.6039077909169345, "learning_rate": 3.725370335545394e-06, "loss": 0.025, "step": 3704 }, { "epoch": 1.6856232939035487, "grad_norm": 0.6758703407711169, "learning_rate": 3.7247473703042324e-06, "loss": 0.0298, "step": 3705 }, { "epoch": 1.686078252957234, "grad_norm": 0.40894796945610395, "learning_rate": 3.7241243049822475e-06, "loss": 0.0167, "step": 3706 }, { "epoch": 1.6865332120109189, "grad_norm": 0.6085395068507089, "learning_rate": 3.723501139630352e-06, "loss": 0.0253, "step": 3707 }, { "epoch": 1.6869881710646042, "grad_norm": 0.5264192185021481, "learning_rate": 3.722877874299469e-06, "loss": 0.0261, "step": 3708 }, { "epoch": 1.6874431301182895, "grad_norm": 0.41356331417687003, "learning_rate": 3.722254509040527e-06, "loss": 0.0152, "step": 3709 }, { "epoch": 1.6878980891719744, "grad_norm": 0.5322391946249139, "learning_rate": 3.721631043904468e-06, "loss": 0.0242, "step": 3710 }, { "epoch": 1.6883530482256597, "grad_norm": 0.5168838283494139, "learning_rate": 3.7210074789422363e-06, "loss": 0.021, "step": 3711 }, { "epoch": 1.688808007279345, "grad_norm": 0.6955554477419243, "learning_rate": 3.7203838142047875e-06, "loss": 0.039, "step": 3712 }, { "epoch": 1.68926296633303, "grad_norm": 0.5179778216573575, "learning_rate": 3.719760049743084e-06, "loss": 0.0265, "step": 3713 }, { "epoch": 1.6897179253867152, "grad_norm": 0.9036619682190697, "learning_rate": 3.719136185608099e-06, "loss": 0.0387, "step": 3714 }, { "epoch": 1.6901728844404005, "grad_norm": 0.4710707649389809, "learning_rate": 3.7185122218508097e-06, "loss": 0.0239, "step": 3715 }, { "epoch": 1.6906278434940856, "grad_norm": 0.47157157168562497, "learning_rate": 3.717888158522204e-06, "loss": 0.0222, "step": 3716 }, { "epoch": 1.6910828025477707, "grad_norm": 0.550350895261478, "learning_rate": 3.717263995673278e-06, "loss": 0.0241, "step": 3717 }, { "epoch": 1.691537761601456, "grad_norm": 0.6148748686415011, "learning_rate": 3.7166397333550357e-06, "loss": 0.0232, "step": 3718 }, { "epoch": 1.691992720655141, "grad_norm": 0.6910976774877058, "learning_rate": 3.7160153716184887e-06, "loss": 0.0344, "step": 3719 }, { "epoch": 1.6924476797088261, "grad_norm": 0.5659383368243334, "learning_rate": 3.7153909105146567e-06, "loss": 0.0203, "step": 3720 }, { "epoch": 1.6929026387625115, "grad_norm": 0.47971439657054477, "learning_rate": 3.7147663500945692e-06, "loss": 0.0179, "step": 3721 }, { "epoch": 1.6933575978161965, "grad_norm": 0.6295216941365326, "learning_rate": 3.7141416904092605e-06, "loss": 0.0333, "step": 3722 }, { "epoch": 1.6938125568698816, "grad_norm": 0.5946963393375687, "learning_rate": 3.713516931509775e-06, "loss": 0.0278, "step": 3723 }, { "epoch": 1.694267515923567, "grad_norm": 0.9716720750571886, "learning_rate": 3.7128920734471677e-06, "loss": 0.044, "step": 3724 }, { "epoch": 1.694722474977252, "grad_norm": 0.6509186594911334, "learning_rate": 3.7122671162724966e-06, "loss": 0.021, "step": 3725 }, { "epoch": 1.6951774340309371, "grad_norm": 0.6504872332839807, "learning_rate": 3.711642060036832e-06, "loss": 0.0304, "step": 3726 }, { "epoch": 1.6956323930846224, "grad_norm": 0.4843073380130816, "learning_rate": 3.711016904791249e-06, "loss": 0.0222, "step": 3727 }, { "epoch": 1.6960873521383075, "grad_norm": 0.5142043375988194, "learning_rate": 3.7103916505868342e-06, "loss": 0.0199, "step": 3728 }, { "epoch": 1.6965423111919926, "grad_norm": 0.5807813360855487, "learning_rate": 3.7097662974746795e-06, "loss": 0.0314, "step": 3729 }, { "epoch": 1.696997270245678, "grad_norm": 0.7719714775202473, "learning_rate": 3.7091408455058862e-06, "loss": 0.0337, "step": 3730 }, { "epoch": 1.697452229299363, "grad_norm": 0.4962539736030745, "learning_rate": 3.708515294731564e-06, "loss": 0.0187, "step": 3731 }, { "epoch": 1.697907188353048, "grad_norm": 0.5702466445302747, "learning_rate": 3.707889645202829e-06, "loss": 0.0224, "step": 3732 }, { "epoch": 1.6983621474067334, "grad_norm": 0.505469387514989, "learning_rate": 3.707263896970807e-06, "loss": 0.0236, "step": 3733 }, { "epoch": 1.6988171064604187, "grad_norm": 0.5040928264719208, "learning_rate": 3.706638050086631e-06, "loss": 0.0229, "step": 3734 }, { "epoch": 1.6992720655141036, "grad_norm": 0.5081632032813646, "learning_rate": 3.7060121046014434e-06, "loss": 0.02, "step": 3735 }, { "epoch": 1.699727024567789, "grad_norm": 0.41395460392786865, "learning_rate": 3.7053860605663927e-06, "loss": 0.0177, "step": 3736 }, { "epoch": 1.7001819836214742, "grad_norm": 0.8570781325955044, "learning_rate": 3.704759918032636e-06, "loss": 0.0309, "step": 3737 }, { "epoch": 1.700636942675159, "grad_norm": 0.5721539588385257, "learning_rate": 3.7041336770513403e-06, "loss": 0.0334, "step": 3738 }, { "epoch": 1.7010919017288444, "grad_norm": 0.6526103729827067, "learning_rate": 3.703507337673678e-06, "loss": 0.024, "step": 3739 }, { "epoch": 1.7015468607825297, "grad_norm": 0.7963773879297332, "learning_rate": 3.702880899950831e-06, "loss": 0.0407, "step": 3740 }, { "epoch": 1.7020018198362148, "grad_norm": 0.574933934375484, "learning_rate": 3.702254363933989e-06, "loss": 0.0187, "step": 3741 }, { "epoch": 1.7024567788898999, "grad_norm": 2.139762665256868, "learning_rate": 3.7016277296743496e-06, "loss": 0.0599, "step": 3742 }, { "epoch": 1.7029117379435852, "grad_norm": 0.5153834647656831, "learning_rate": 3.7010009972231186e-06, "loss": 0.0182, "step": 3743 }, { "epoch": 1.7033666969972703, "grad_norm": 0.6295570251944255, "learning_rate": 3.7003741666315095e-06, "loss": 0.0258, "step": 3744 }, { "epoch": 1.7038216560509554, "grad_norm": 0.8031954347623854, "learning_rate": 3.6997472379507454e-06, "loss": 0.0312, "step": 3745 }, { "epoch": 1.7042766151046407, "grad_norm": 0.8843858410369263, "learning_rate": 3.6991202112320544e-06, "loss": 0.0256, "step": 3746 }, { "epoch": 1.7047315741583258, "grad_norm": 0.561371813852862, "learning_rate": 3.6984930865266744e-06, "loss": 0.0254, "step": 3747 }, { "epoch": 1.7051865332120109, "grad_norm": 0.6772781140572789, "learning_rate": 3.6978658638858526e-06, "loss": 0.0249, "step": 3748 }, { "epoch": 1.7056414922656962, "grad_norm": 0.7681255950690041, "learning_rate": 3.6972385433608416e-06, "loss": 0.0313, "step": 3749 }, { "epoch": 1.7060964513193813, "grad_norm": 0.8281465322568329, "learning_rate": 3.6966111250029035e-06, "loss": 0.0362, "step": 3750 }, { "epoch": 1.7065514103730663, "grad_norm": 0.6778073444941, "learning_rate": 3.695983608863308e-06, "loss": 0.016, "step": 3751 }, { "epoch": 1.7070063694267517, "grad_norm": 0.3181064433588746, "learning_rate": 3.6953559949933334e-06, "loss": 0.0081, "step": 3752 }, { "epoch": 1.7074613284804367, "grad_norm": 0.5307372594094432, "learning_rate": 3.6947282834442643e-06, "loss": 0.0201, "step": 3753 }, { "epoch": 1.7079162875341218, "grad_norm": 0.972407393019446, "learning_rate": 3.6941004742673958e-06, "loss": 0.0273, "step": 3754 }, { "epoch": 1.7083712465878071, "grad_norm": 0.9214372631782088, "learning_rate": 3.693472567514029e-06, "loss": 0.0394, "step": 3755 }, { "epoch": 1.7088262056414922, "grad_norm": 0.48743904966054435, "learning_rate": 3.692844563235474e-06, "loss": 0.0199, "step": 3756 }, { "epoch": 1.7092811646951773, "grad_norm": 0.6817160494596262, "learning_rate": 3.692216461483047e-06, "loss": 0.0298, "step": 3757 }, { "epoch": 1.7097361237488626, "grad_norm": 0.3947750396419106, "learning_rate": 3.6915882623080756e-06, "loss": 0.0151, "step": 3758 }, { "epoch": 1.7101910828025477, "grad_norm": 1.1295486681126539, "learning_rate": 3.690959965761893e-06, "loss": 0.0547, "step": 3759 }, { "epoch": 1.7106460418562328, "grad_norm": 0.7037460217315583, "learning_rate": 3.6903315718958397e-06, "loss": 0.0311, "step": 3760 }, { "epoch": 1.7111010009099181, "grad_norm": 0.413788308194387, "learning_rate": 3.6897030807612655e-06, "loss": 0.0152, "step": 3761 }, { "epoch": 1.7115559599636034, "grad_norm": 0.6745789601697255, "learning_rate": 3.689074492409529e-06, "loss": 0.0327, "step": 3762 }, { "epoch": 1.7120109190172883, "grad_norm": 0.7540995398146106, "learning_rate": 3.6884458068919935e-06, "loss": 0.0372, "step": 3763 }, { "epoch": 1.7124658780709736, "grad_norm": 0.5881773051093517, "learning_rate": 3.687817024260035e-06, "loss": 0.0196, "step": 3764 }, { "epoch": 1.712920837124659, "grad_norm": 0.5571006232789326, "learning_rate": 3.687188144565033e-06, "loss": 0.0224, "step": 3765 }, { "epoch": 1.7133757961783438, "grad_norm": 0.4494519698168543, "learning_rate": 3.6865591678583775e-06, "loss": 0.0225, "step": 3766 }, { "epoch": 1.713830755232029, "grad_norm": 0.7711446370438744, "learning_rate": 3.685930094191465e-06, "loss": 0.0234, "step": 3767 }, { "epoch": 1.7142857142857144, "grad_norm": 0.8397312671641753, "learning_rate": 3.6853009236157e-06, "loss": 0.0423, "step": 3768 }, { "epoch": 1.7147406733393995, "grad_norm": 0.683211885753347, "learning_rate": 3.684671656182497e-06, "loss": 0.0216, "step": 3769 }, { "epoch": 1.7151956323930846, "grad_norm": 0.4348183694352422, "learning_rate": 3.6840422919432762e-06, "loss": 0.0165, "step": 3770 }, { "epoch": 1.71565059144677, "grad_norm": 0.8763212022348565, "learning_rate": 3.683412830949466e-06, "loss": 0.0378, "step": 3771 }, { "epoch": 1.716105550500455, "grad_norm": 0.4619274536838717, "learning_rate": 3.6827832732525042e-06, "loss": 0.0175, "step": 3772 }, { "epoch": 1.71656050955414, "grad_norm": 0.41456934939422674, "learning_rate": 3.6821536189038343e-06, "loss": 0.0192, "step": 3773 }, { "epoch": 1.7170154686078254, "grad_norm": 0.46901142214125285, "learning_rate": 3.681523867954909e-06, "loss": 0.015, "step": 3774 }, { "epoch": 1.7174704276615105, "grad_norm": 0.327082992709135, "learning_rate": 3.6808940204571895e-06, "loss": 0.0128, "step": 3775 }, { "epoch": 1.7179253867151956, "grad_norm": 0.6033494642390252, "learning_rate": 3.6802640764621427e-06, "loss": 0.0393, "step": 3776 }, { "epoch": 1.7183803457688809, "grad_norm": 0.5929060959086757, "learning_rate": 3.6796340360212467e-06, "loss": 0.0302, "step": 3777 }, { "epoch": 1.718835304822566, "grad_norm": 0.5969951393161446, "learning_rate": 3.679003899185983e-06, "loss": 0.0193, "step": 3778 }, { "epoch": 1.719290263876251, "grad_norm": 0.4888552177499924, "learning_rate": 3.6783736660078463e-06, "loss": 0.0179, "step": 3779 }, { "epoch": 1.7197452229299364, "grad_norm": 0.4857334931955398, "learning_rate": 3.6777433365383348e-06, "loss": 0.0176, "step": 3780 }, { "epoch": 1.7202001819836215, "grad_norm": 0.500193230861817, "learning_rate": 3.6771129108289568e-06, "loss": 0.0247, "step": 3781 }, { "epoch": 1.7206551410373065, "grad_norm": 0.5069256645883959, "learning_rate": 3.6764823889312263e-06, "loss": 0.0177, "step": 3782 }, { "epoch": 1.7211101000909919, "grad_norm": 0.42689246859965685, "learning_rate": 3.675851770896669e-06, "loss": 0.0198, "step": 3783 }, { "epoch": 1.721565059144677, "grad_norm": 0.6307321078470965, "learning_rate": 3.675221056776815e-06, "loss": 0.0282, "step": 3784 }, { "epoch": 1.722020018198362, "grad_norm": 0.4160803144159328, "learning_rate": 3.6745902466232027e-06, "loss": 0.0168, "step": 3785 }, { "epoch": 1.7224749772520473, "grad_norm": 0.42583746871846967, "learning_rate": 3.6739593404873804e-06, "loss": 0.0169, "step": 3786 }, { "epoch": 1.7229299363057324, "grad_norm": 0.5970593946647448, "learning_rate": 3.6733283384209022e-06, "loss": 0.0303, "step": 3787 }, { "epoch": 1.7233848953594175, "grad_norm": 0.324960274386966, "learning_rate": 3.6726972404753313e-06, "loss": 0.0124, "step": 3788 }, { "epoch": 1.7238398544131028, "grad_norm": 0.5534609578181952, "learning_rate": 3.672066046702237e-06, "loss": 0.0217, "step": 3789 }, { "epoch": 1.7242948134667881, "grad_norm": 0.5587241395138457, "learning_rate": 3.6714347571531993e-06, "loss": 0.0225, "step": 3790 }, { "epoch": 1.724749772520473, "grad_norm": 0.4138423827236354, "learning_rate": 3.670803371879803e-06, "loss": 0.0201, "step": 3791 }, { "epoch": 1.7252047315741583, "grad_norm": 0.415855766212165, "learning_rate": 3.6701718909336424e-06, "loss": 0.0145, "step": 3792 }, { "epoch": 1.7256596906278436, "grad_norm": 0.6435316880247659, "learning_rate": 3.669540314366319e-06, "loss": 0.0316, "step": 3793 }, { "epoch": 1.7261146496815285, "grad_norm": 0.506440622429494, "learning_rate": 3.6689086422294434e-06, "loss": 0.013, "step": 3794 }, { "epoch": 1.7265696087352138, "grad_norm": 0.4093327188417235, "learning_rate": 3.6682768745746317e-06, "loss": 0.0127, "step": 3795 }, { "epoch": 1.7270245677888991, "grad_norm": 0.5734344443277041, "learning_rate": 3.66764501145351e-06, "loss": 0.0205, "step": 3796 }, { "epoch": 1.7274795268425842, "grad_norm": 0.3817754141025844, "learning_rate": 3.6670130529177108e-06, "loss": 0.0186, "step": 3797 }, { "epoch": 1.7279344858962693, "grad_norm": 0.537107369260746, "learning_rate": 3.6663809990188752e-06, "loss": 0.0146, "step": 3798 }, { "epoch": 1.7283894449499546, "grad_norm": 0.49735286104024884, "learning_rate": 3.6657488498086517e-06, "loss": 0.0133, "step": 3799 }, { "epoch": 1.7288444040036397, "grad_norm": 0.47221804561763336, "learning_rate": 3.6651166053386966e-06, "loss": 0.0187, "step": 3800 }, { "epoch": 1.7292993630573248, "grad_norm": 0.6388205492573055, "learning_rate": 3.664484265660675e-06, "loss": 0.0257, "step": 3801 }, { "epoch": 1.72975432211101, "grad_norm": 0.6199481077062792, "learning_rate": 3.6638518308262567e-06, "loss": 0.0317, "step": 3802 }, { "epoch": 1.7302092811646952, "grad_norm": 0.49900585821193677, "learning_rate": 3.663219300887123e-06, "loss": 0.0213, "step": 3803 }, { "epoch": 1.7306642402183803, "grad_norm": 0.5335091474935367, "learning_rate": 3.6625866758949614e-06, "loss": 0.024, "step": 3804 }, { "epoch": 1.7311191992720656, "grad_norm": 0.5670937501409793, "learning_rate": 3.6619539559014673e-06, "loss": 0.0187, "step": 3805 }, { "epoch": 1.7315741583257507, "grad_norm": 0.8326133683447557, "learning_rate": 3.661321140958342e-06, "loss": 0.0357, "step": 3806 }, { "epoch": 1.7320291173794358, "grad_norm": 0.5421609083192159, "learning_rate": 3.660688231117298e-06, "loss": 0.0231, "step": 3807 }, { "epoch": 1.732484076433121, "grad_norm": 0.7365065719826951, "learning_rate": 3.660055226430054e-06, "loss": 0.0295, "step": 3808 }, { "epoch": 1.7329390354868062, "grad_norm": 1.0183754445566922, "learning_rate": 3.6594221269483356e-06, "loss": 0.039, "step": 3809 }, { "epoch": 1.7333939945404913, "grad_norm": 0.7024744503951253, "learning_rate": 3.658788932723876e-06, "loss": 0.0313, "step": 3810 }, { "epoch": 1.7338489535941766, "grad_norm": 0.5041725108988522, "learning_rate": 3.6581556438084185e-06, "loss": 0.01, "step": 3811 }, { "epoch": 1.7343039126478617, "grad_norm": 0.5707798595479919, "learning_rate": 3.6575222602537118e-06, "loss": 0.0214, "step": 3812 }, { "epoch": 1.7347588717015467, "grad_norm": 0.5529421515058182, "learning_rate": 3.6568887821115134e-06, "loss": 0.0191, "step": 3813 }, { "epoch": 1.735213830755232, "grad_norm": 0.7144596376367993, "learning_rate": 3.6562552094335878e-06, "loss": 0.0334, "step": 3814 }, { "epoch": 1.7356687898089171, "grad_norm": 0.6908144788678268, "learning_rate": 3.655621542271709e-06, "loss": 0.0249, "step": 3815 }, { "epoch": 1.7361237488626022, "grad_norm": 0.6371182292758583, "learning_rate": 3.654987780677656e-06, "loss": 0.031, "step": 3816 }, { "epoch": 1.7365787079162875, "grad_norm": 0.6184433778112067, "learning_rate": 3.654353924703217e-06, "loss": 0.0225, "step": 3817 }, { "epoch": 1.7370336669699729, "grad_norm": 0.40702156642919257, "learning_rate": 3.6537199744001893e-06, "loss": 0.0129, "step": 3818 }, { "epoch": 1.7374886260236577, "grad_norm": 0.5413199568074462, "learning_rate": 3.6530859298203746e-06, "loss": 0.0254, "step": 3819 }, { "epoch": 1.737943585077343, "grad_norm": 0.4914812587364899, "learning_rate": 3.6524517910155853e-06, "loss": 0.0213, "step": 3820 }, { "epoch": 1.7383985441310283, "grad_norm": 6.570695852121233, "learning_rate": 3.65181755803764e-06, "loss": 0.0807, "step": 3821 }, { "epoch": 1.7388535031847132, "grad_norm": 0.6210010475391491, "learning_rate": 3.6511832309383654e-06, "loss": 0.0252, "step": 3822 }, { "epoch": 1.7393084622383985, "grad_norm": 0.5307900469885153, "learning_rate": 3.6505488097695963e-06, "loss": 0.0156, "step": 3823 }, { "epoch": 1.7397634212920838, "grad_norm": 1.0474186407382713, "learning_rate": 3.6499142945831732e-06, "loss": 0.0449, "step": 3824 }, { "epoch": 1.740218380345769, "grad_norm": 0.5856174057365933, "learning_rate": 3.649279685430948e-06, "loss": 0.0212, "step": 3825 }, { "epoch": 1.740673339399454, "grad_norm": 0.7487230793612085, "learning_rate": 3.648644982364777e-06, "loss": 0.0327, "step": 3826 }, { "epoch": 1.7411282984531393, "grad_norm": 0.8370183653893686, "learning_rate": 3.648010185436525e-06, "loss": 0.0349, "step": 3827 }, { "epoch": 1.7415832575068244, "grad_norm": 0.573210475770346, "learning_rate": 3.6473752946980644e-06, "loss": 0.0279, "step": 3828 }, { "epoch": 1.7420382165605095, "grad_norm": 0.6791246869658677, "learning_rate": 3.6467403102012767e-06, "loss": 0.0249, "step": 3829 }, { "epoch": 1.7424931756141948, "grad_norm": 0.7888223019204079, "learning_rate": 3.64610523199805e-06, "loss": 0.0382, "step": 3830 }, { "epoch": 1.74294813466788, "grad_norm": 0.48924629959412974, "learning_rate": 3.6454700601402783e-06, "loss": 0.0172, "step": 3831 }, { "epoch": 1.743403093721565, "grad_norm": 0.8852683743955648, "learning_rate": 3.6448347946798672e-06, "loss": 0.0418, "step": 3832 }, { "epoch": 1.7438580527752503, "grad_norm": 0.7758243777941729, "learning_rate": 3.6441994356687265e-06, "loss": 0.0312, "step": 3833 }, { "epoch": 1.7443130118289354, "grad_norm": 0.7032750998686196, "learning_rate": 3.643563983158775e-06, "loss": 0.0329, "step": 3834 }, { "epoch": 1.7447679708826205, "grad_norm": 0.5317983715720668, "learning_rate": 3.642928437201939e-06, "loss": 0.0238, "step": 3835 }, { "epoch": 1.7452229299363058, "grad_norm": 0.6749497364473966, "learning_rate": 3.642292797850153e-06, "loss": 0.0338, "step": 3836 }, { "epoch": 1.7456778889899909, "grad_norm": 0.5413385003454867, "learning_rate": 3.641657065155358e-06, "loss": 0.0222, "step": 3837 }, { "epoch": 1.746132848043676, "grad_norm": 0.7915659846437547, "learning_rate": 3.6410212391695023e-06, "loss": 0.0253, "step": 3838 }, { "epoch": 1.7465878070973613, "grad_norm": 0.4970696189128546, "learning_rate": 3.6403853199445448e-06, "loss": 0.0141, "step": 3839 }, { "epoch": 1.7470427661510464, "grad_norm": 0.44885237124059263, "learning_rate": 3.6397493075324486e-06, "loss": 0.018, "step": 3840 }, { "epoch": 1.7474977252047315, "grad_norm": 0.6822120402090854, "learning_rate": 3.6391132019851857e-06, "loss": 0.0289, "step": 3841 }, { "epoch": 1.7479526842584168, "grad_norm": 0.3885326548095813, "learning_rate": 3.6384770033547366e-06, "loss": 0.0123, "step": 3842 }, { "epoch": 1.7484076433121019, "grad_norm": 0.5503894038476335, "learning_rate": 3.637840711693088e-06, "loss": 0.0199, "step": 3843 }, { "epoch": 1.748862602365787, "grad_norm": 0.5708299264033291, "learning_rate": 3.637204327052235e-06, "loss": 0.0298, "step": 3844 }, { "epoch": 1.7493175614194723, "grad_norm": 0.6508256310761806, "learning_rate": 3.6365678494841795e-06, "loss": 0.0384, "step": 3845 }, { "epoch": 1.7497725204731576, "grad_norm": 0.5643977346261464, "learning_rate": 3.6359312790409323e-06, "loss": 0.028, "step": 3846 }, { "epoch": 1.7502274795268424, "grad_norm": 0.5623851041936763, "learning_rate": 3.635294615774511e-06, "loss": 0.0316, "step": 3847 }, { "epoch": 1.7506824385805277, "grad_norm": 0.6335326929124626, "learning_rate": 3.6346578597369397e-06, "loss": 0.0276, "step": 3848 }, { "epoch": 1.751137397634213, "grad_norm": 0.7013109441398028, "learning_rate": 3.634021010980254e-06, "loss": 0.0315, "step": 3849 }, { "epoch": 1.7515923566878981, "grad_norm": 0.9579212494639825, "learning_rate": 3.633384069556491e-06, "loss": 0.0628, "step": 3850 }, { "epoch": 1.7520473157415832, "grad_norm": 0.5048993866431397, "learning_rate": 3.6327470355177006e-06, "loss": 0.0225, "step": 3851 }, { "epoch": 1.7525022747952685, "grad_norm": 0.5883308745672281, "learning_rate": 3.6321099089159377e-06, "loss": 0.0306, "step": 3852 }, { "epoch": 1.7529572338489536, "grad_norm": 0.5431914009800679, "learning_rate": 3.631472689803266e-06, "loss": 0.0219, "step": 3853 }, { "epoch": 1.7534121929026387, "grad_norm": 0.6136466481448818, "learning_rate": 3.6308353782317557e-06, "loss": 0.0345, "step": 3854 }, { "epoch": 1.753867151956324, "grad_norm": 0.6477013738851053, "learning_rate": 3.6301979742534844e-06, "loss": 0.0347, "step": 3855 }, { "epoch": 1.7543221110100091, "grad_norm": 0.649955937465349, "learning_rate": 3.6295604779205394e-06, "loss": 0.0269, "step": 3856 }, { "epoch": 1.7547770700636942, "grad_norm": 0.5451073684407265, "learning_rate": 3.6289228892850126e-06, "loss": 0.0232, "step": 3857 }, { "epoch": 1.7552320291173795, "grad_norm": 0.6501454323654668, "learning_rate": 3.628285208399006e-06, "loss": 0.0383, "step": 3858 }, { "epoch": 1.7556869881710646, "grad_norm": 0.7215446452564725, "learning_rate": 3.6276474353146274e-06, "loss": 0.0314, "step": 3859 }, { "epoch": 1.7561419472247497, "grad_norm": 0.6315136771122604, "learning_rate": 3.6270095700839926e-06, "loss": 0.036, "step": 3860 }, { "epoch": 1.756596906278435, "grad_norm": 0.6253760413777038, "learning_rate": 3.6263716127592253e-06, "loss": 0.0248, "step": 3861 }, { "epoch": 1.75705186533212, "grad_norm": 0.32504064719636533, "learning_rate": 3.6257335633924564e-06, "loss": 0.01, "step": 3862 }, { "epoch": 1.7575068243858052, "grad_norm": 0.7224805704522139, "learning_rate": 3.6250954220358248e-06, "loss": 0.0337, "step": 3863 }, { "epoch": 1.7579617834394905, "grad_norm": 0.7791448578606137, "learning_rate": 3.624457188741476e-06, "loss": 0.0345, "step": 3864 }, { "epoch": 1.7584167424931756, "grad_norm": 0.5675607416057377, "learning_rate": 3.6238188635615636e-06, "loss": 0.0272, "step": 3865 }, { "epoch": 1.7588717015468607, "grad_norm": 0.5160544818888646, "learning_rate": 3.6231804465482483e-06, "loss": 0.0159, "step": 3866 }, { "epoch": 1.759326660600546, "grad_norm": 0.6248463178241036, "learning_rate": 3.6225419377536997e-06, "loss": 0.0272, "step": 3867 }, { "epoch": 1.759781619654231, "grad_norm": 0.49893607907020765, "learning_rate": 3.6219033372300937e-06, "loss": 0.0215, "step": 3868 }, { "epoch": 1.7602365787079162, "grad_norm": 0.6287338412781016, "learning_rate": 3.621264645029613e-06, "loss": 0.0254, "step": 3869 }, { "epoch": 1.7606915377616015, "grad_norm": 0.576700858995691, "learning_rate": 3.6206258612044486e-06, "loss": 0.0215, "step": 3870 }, { "epoch": 1.7611464968152868, "grad_norm": 0.5569509873792899, "learning_rate": 3.6199869858068003e-06, "loss": 0.0175, "step": 3871 }, { "epoch": 1.7616014558689717, "grad_norm": 0.525407504415116, "learning_rate": 3.619348018888873e-06, "loss": 0.0141, "step": 3872 }, { "epoch": 1.762056414922657, "grad_norm": 0.4867512209136127, "learning_rate": 3.618708960502881e-06, "loss": 0.0273, "step": 3873 }, { "epoch": 1.7625113739763423, "grad_norm": 0.6767457078737245, "learning_rate": 3.6180698107010435e-06, "loss": 0.0385, "step": 3874 }, { "epoch": 1.7629663330300271, "grad_norm": 0.8052724611893239, "learning_rate": 3.617430569535592e-06, "loss": 0.0335, "step": 3875 }, { "epoch": 1.7634212920837125, "grad_norm": 0.6405382179049982, "learning_rate": 3.61679123705876e-06, "loss": 0.0315, "step": 3876 }, { "epoch": 1.7638762511373978, "grad_norm": 0.6433604509135347, "learning_rate": 3.616151813322791e-06, "loss": 0.0261, "step": 3877 }, { "epoch": 1.7643312101910829, "grad_norm": 0.6625365479641434, "learning_rate": 3.615512298379937e-06, "loss": 0.0285, "step": 3878 }, { "epoch": 1.764786169244768, "grad_norm": 0.5427435010837753, "learning_rate": 3.6148726922824545e-06, "loss": 0.0225, "step": 3879 }, { "epoch": 1.7652411282984533, "grad_norm": 0.660414462404647, "learning_rate": 3.614232995082611e-06, "loss": 0.0292, "step": 3880 }, { "epoch": 1.7656960873521383, "grad_norm": 0.9199890587743693, "learning_rate": 3.6135932068326797e-06, "loss": 0.0322, "step": 3881 }, { "epoch": 1.7661510464058234, "grad_norm": 0.5148296035434483, "learning_rate": 3.6129533275849395e-06, "loss": 0.0203, "step": 3882 }, { "epoch": 1.7666060054595087, "grad_norm": 0.48186441552250603, "learning_rate": 3.6123133573916792e-06, "loss": 0.0126, "step": 3883 }, { "epoch": 1.7670609645131938, "grad_norm": 0.6283861081525925, "learning_rate": 3.6116732963051946e-06, "loss": 0.0243, "step": 3884 }, { "epoch": 1.767515923566879, "grad_norm": 0.567215610102529, "learning_rate": 3.611033144377789e-06, "loss": 0.0281, "step": 3885 }, { "epoch": 1.7679708826205642, "grad_norm": 0.6059665148519728, "learning_rate": 3.610392901661772e-06, "loss": 0.0249, "step": 3886 }, { "epoch": 1.7684258416742493, "grad_norm": 1.3043639457630865, "learning_rate": 3.609752568209462e-06, "loss": 0.0589, "step": 3887 }, { "epoch": 1.7688808007279344, "grad_norm": 0.5083041603935354, "learning_rate": 3.6091121440731835e-06, "loss": 0.021, "step": 3888 }, { "epoch": 1.7693357597816197, "grad_norm": 0.519634807147173, "learning_rate": 3.608471629305269e-06, "loss": 0.0239, "step": 3889 }, { "epoch": 1.7697907188353048, "grad_norm": 0.5448468779032511, "learning_rate": 3.607831023958059e-06, "loss": 0.0238, "step": 3890 }, { "epoch": 1.77024567788899, "grad_norm": 0.6590487752654589, "learning_rate": 3.6071903280839003e-06, "loss": 0.0186, "step": 3891 }, { "epoch": 1.7707006369426752, "grad_norm": 0.46815162289678897, "learning_rate": 3.606549541735148e-06, "loss": 0.0204, "step": 3892 }, { "epoch": 1.7711555959963603, "grad_norm": 0.46916741395974876, "learning_rate": 3.605908664964165e-06, "loss": 0.022, "step": 3893 }, { "epoch": 1.7716105550500454, "grad_norm": 0.8207078001944466, "learning_rate": 3.605267697823319e-06, "loss": 0.0401, "step": 3894 }, { "epoch": 1.7720655141037307, "grad_norm": 0.5583817342162499, "learning_rate": 3.6046266403649897e-06, "loss": 0.0281, "step": 3895 }, { "epoch": 1.7725204731574158, "grad_norm": 0.6331389837231556, "learning_rate": 3.6039854926415585e-06, "loss": 0.0196, "step": 3896 }, { "epoch": 1.7729754322111009, "grad_norm": 0.6179106706338694, "learning_rate": 3.603344254705419e-06, "loss": 0.026, "step": 3897 }, { "epoch": 1.7734303912647862, "grad_norm": 0.676451965366572, "learning_rate": 3.6027029266089693e-06, "loss": 0.0202, "step": 3898 }, { "epoch": 1.7738853503184715, "grad_norm": 0.6154295651793058, "learning_rate": 3.602061508404616e-06, "loss": 0.0215, "step": 3899 }, { "epoch": 1.7743403093721564, "grad_norm": 0.6434847973405531, "learning_rate": 3.601420000144774e-06, "loss": 0.0278, "step": 3900 }, { "epoch": 1.7747952684258417, "grad_norm": 0.6656349437868243, "learning_rate": 3.6007784018818627e-06, "loss": 0.0344, "step": 3901 }, { "epoch": 1.775250227479527, "grad_norm": 0.8418131651302995, "learning_rate": 3.6001367136683117e-06, "loss": 0.0348, "step": 3902 }, { "epoch": 1.7757051865332119, "grad_norm": 0.7374845008025159, "learning_rate": 3.5994949355565565e-06, "loss": 0.0405, "step": 3903 }, { "epoch": 1.7761601455868972, "grad_norm": 0.6637955425767137, "learning_rate": 3.598853067599041e-06, "loss": 0.0208, "step": 3904 }, { "epoch": 1.7766151046405825, "grad_norm": 0.600504738278384, "learning_rate": 3.5982111098482146e-06, "loss": 0.0166, "step": 3905 }, { "epoch": 1.7770700636942676, "grad_norm": 0.6375017465476851, "learning_rate": 3.5975690623565364e-06, "loss": 0.0251, "step": 3906 }, { "epoch": 1.7775250227479527, "grad_norm": 0.5951924512508926, "learning_rate": 3.5969269251764704e-06, "loss": 0.0245, "step": 3907 }, { "epoch": 1.777979981801638, "grad_norm": 0.4978700271833971, "learning_rate": 3.596284698360489e-06, "loss": 0.0162, "step": 3908 }, { "epoch": 1.778434940855323, "grad_norm": 0.5111900130707472, "learning_rate": 3.5956423819610747e-06, "loss": 0.0142, "step": 3909 }, { "epoch": 1.7788898999090081, "grad_norm": 0.758827836834956, "learning_rate": 3.594999976030712e-06, "loss": 0.0247, "step": 3910 }, { "epoch": 1.7793448589626935, "grad_norm": 0.5006811977878517, "learning_rate": 3.594357480621896e-06, "loss": 0.0175, "step": 3911 }, { "epoch": 1.7797998180163785, "grad_norm": 0.6445417743650146, "learning_rate": 3.5937148957871294e-06, "loss": 0.0234, "step": 3912 }, { "epoch": 1.7802547770700636, "grad_norm": 0.6228359576787655, "learning_rate": 3.59307222157892e-06, "loss": 0.0303, "step": 3913 }, { "epoch": 1.780709736123749, "grad_norm": 0.649823513259258, "learning_rate": 3.5924294580497852e-06, "loss": 0.0265, "step": 3914 }, { "epoch": 1.781164695177434, "grad_norm": 0.7275866468886616, "learning_rate": 3.5917866052522478e-06, "loss": 0.0234, "step": 3915 }, { "epoch": 1.7816196542311191, "grad_norm": 0.5797476251052617, "learning_rate": 3.5911436632388403e-06, "loss": 0.0257, "step": 3916 }, { "epoch": 1.7820746132848044, "grad_norm": 0.4303797263387782, "learning_rate": 3.5905006320621006e-06, "loss": 0.0133, "step": 3917 }, { "epoch": 1.7825295723384895, "grad_norm": 0.6066737951252139, "learning_rate": 3.5898575117745725e-06, "loss": 0.0245, "step": 3918 }, { "epoch": 1.7829845313921746, "grad_norm": 0.9780049376198774, "learning_rate": 3.589214302428811e-06, "loss": 0.0175, "step": 3919 }, { "epoch": 1.78343949044586, "grad_norm": 0.666366761370394, "learning_rate": 3.5885710040773757e-06, "loss": 0.0272, "step": 3920 }, { "epoch": 1.783894449499545, "grad_norm": 0.7167233674970389, "learning_rate": 3.5879276167728343e-06, "loss": 0.0429, "step": 3921 }, { "epoch": 1.78434940855323, "grad_norm": 0.6437730569286951, "learning_rate": 3.5872841405677607e-06, "loss": 0.0258, "step": 3922 }, { "epoch": 1.7848043676069154, "grad_norm": 0.5235245718593277, "learning_rate": 3.5866405755147364e-06, "loss": 0.0221, "step": 3923 }, { "epoch": 1.7852593266606005, "grad_norm": 0.934032485425318, "learning_rate": 3.5859969216663526e-06, "loss": 0.0343, "step": 3924 }, { "epoch": 1.7857142857142856, "grad_norm": 0.7363067490735575, "learning_rate": 3.585353179075204e-06, "loss": 0.0312, "step": 3925 }, { "epoch": 1.786169244767971, "grad_norm": 0.6214405232934467, "learning_rate": 3.5847093477938955e-06, "loss": 0.0335, "step": 3926 }, { "epoch": 1.7866242038216562, "grad_norm": 0.36816042426325046, "learning_rate": 3.5840654278750377e-06, "loss": 0.0148, "step": 3927 }, { "epoch": 1.787079162875341, "grad_norm": 0.6496931922851692, "learning_rate": 3.5834214193712483e-06, "loss": 0.0233, "step": 3928 }, { "epoch": 1.7875341219290264, "grad_norm": 0.7372245796528532, "learning_rate": 3.5827773223351535e-06, "loss": 0.0351, "step": 3929 }, { "epoch": 1.7879890809827117, "grad_norm": 0.7583458422464306, "learning_rate": 3.5821331368193857e-06, "loss": 0.0365, "step": 3930 }, { "epoch": 1.7884440400363966, "grad_norm": 0.8054663645139133, "learning_rate": 3.5814888628765846e-06, "loss": 0.0347, "step": 3931 }, { "epoch": 1.7888989990900819, "grad_norm": 0.6015080502616464, "learning_rate": 3.5808445005593972e-06, "loss": 0.0283, "step": 3932 }, { "epoch": 1.7893539581437672, "grad_norm": 0.7874441891591232, "learning_rate": 3.5802000499204793e-06, "loss": 0.0379, "step": 3933 }, { "epoch": 1.7898089171974523, "grad_norm": 0.7295648051831871, "learning_rate": 3.5795555110124913e-06, "loss": 0.0304, "step": 3934 }, { "epoch": 1.7902638762511374, "grad_norm": 0.5555915158489714, "learning_rate": 3.5789108838881017e-06, "loss": 0.0202, "step": 3935 }, { "epoch": 1.7907188353048227, "grad_norm": 0.48155818395120503, "learning_rate": 3.5782661685999863e-06, "loss": 0.0162, "step": 3936 }, { "epoch": 1.7911737943585078, "grad_norm": 0.6189721388710233, "learning_rate": 3.57762136520083e-06, "loss": 0.0218, "step": 3937 }, { "epoch": 1.7916287534121929, "grad_norm": 0.7472704454550904, "learning_rate": 3.5769764737433226e-06, "loss": 0.0418, "step": 3938 }, { "epoch": 1.7920837124658782, "grad_norm": 0.5909099871804728, "learning_rate": 3.576331494280161e-06, "loss": 0.0256, "step": 3939 }, { "epoch": 1.7925386715195633, "grad_norm": 0.626063213330244, "learning_rate": 3.5756864268640494e-06, "loss": 0.031, "step": 3940 }, { "epoch": 1.7929936305732483, "grad_norm": 1.725587829454964, "learning_rate": 3.5750412715477016e-06, "loss": 0.0259, "step": 3941 }, { "epoch": 1.7934485896269337, "grad_norm": 0.6177307993630271, "learning_rate": 3.574396028383836e-06, "loss": 0.0289, "step": 3942 }, { "epoch": 1.7939035486806187, "grad_norm": 0.5157420727607485, "learning_rate": 3.5737506974251785e-06, "loss": 0.0243, "step": 3943 }, { "epoch": 1.7943585077343038, "grad_norm": 0.7234183098345265, "learning_rate": 3.573105278724463e-06, "loss": 0.0278, "step": 3944 }, { "epoch": 1.7948134667879891, "grad_norm": 0.31159761691155574, "learning_rate": 3.5724597723344313e-06, "loss": 0.0096, "step": 3945 }, { "epoch": 1.7952684258416742, "grad_norm": 0.6287427543006122, "learning_rate": 3.5718141783078285e-06, "loss": 0.024, "step": 3946 }, { "epoch": 1.7957233848953593, "grad_norm": 0.6506426377677337, "learning_rate": 3.5711684966974125e-06, "loss": 0.0262, "step": 3947 }, { "epoch": 1.7961783439490446, "grad_norm": 0.750892926554825, "learning_rate": 3.570522727555944e-06, "loss": 0.0354, "step": 3948 }, { "epoch": 1.7966333030027297, "grad_norm": 0.8008460262124096, "learning_rate": 3.5698768709361926e-06, "loss": 0.0297, "step": 3949 }, { "epoch": 1.7970882620564148, "grad_norm": 0.5443181890476545, "learning_rate": 3.569230926890935e-06, "loss": 0.0243, "step": 3950 }, { "epoch": 1.7975432211101001, "grad_norm": 0.5349816912665613, "learning_rate": 3.568584895472954e-06, "loss": 0.0203, "step": 3951 }, { "epoch": 1.7979981801637852, "grad_norm": 0.6181861080655738, "learning_rate": 3.5679387767350414e-06, "loss": 0.0223, "step": 3952 }, { "epoch": 1.7984531392174703, "grad_norm": 0.6176235881907776, "learning_rate": 3.5672925707299955e-06, "loss": 0.0314, "step": 3953 }, { "epoch": 1.7989080982711556, "grad_norm": 0.35662249360331727, "learning_rate": 3.5666462775106193e-06, "loss": 0.0144, "step": 3954 }, { "epoch": 1.799363057324841, "grad_norm": 0.6030616966248222, "learning_rate": 3.565999897129727e-06, "loss": 0.0219, "step": 3955 }, { "epoch": 1.7998180163785258, "grad_norm": 0.5617560766269626, "learning_rate": 3.5653534296401372e-06, "loss": 0.0186, "step": 3956 }, { "epoch": 1.800272975432211, "grad_norm": 0.5893075865344094, "learning_rate": 3.5647068750946754e-06, "loss": 0.0322, "step": 3957 }, { "epoch": 1.8007279344858964, "grad_norm": 1.0485891114944244, "learning_rate": 3.564060233546177e-06, "loss": 0.0291, "step": 3958 }, { "epoch": 1.8011828935395813, "grad_norm": 0.678699484176965, "learning_rate": 3.563413505047481e-06, "loss": 0.0219, "step": 3959 }, { "epoch": 1.8016378525932666, "grad_norm": 0.5897774698390604, "learning_rate": 3.562766689651436e-06, "loss": 0.0276, "step": 3960 }, { "epoch": 1.802092811646952, "grad_norm": 0.6153748474558995, "learning_rate": 3.5621197874108957e-06, "loss": 0.0285, "step": 3961 }, { "epoch": 1.802547770700637, "grad_norm": 0.7525661362658659, "learning_rate": 3.5614727983787244e-06, "loss": 0.0381, "step": 3962 }, { "epoch": 1.803002729754322, "grad_norm": 0.8375127499129398, "learning_rate": 3.5608257226077887e-06, "loss": 0.0438, "step": 3963 }, { "epoch": 1.8034576888080074, "grad_norm": 0.7324515866447845, "learning_rate": 3.5601785601509654e-06, "loss": 0.0299, "step": 3964 }, { "epoch": 1.8039126478616925, "grad_norm": 0.626212880057884, "learning_rate": 3.5595313110611386e-06, "loss": 0.0299, "step": 3965 }, { "epoch": 1.8043676069153776, "grad_norm": 0.6377522727760246, "learning_rate": 3.558883975391197e-06, "loss": 0.0291, "step": 3966 }, { "epoch": 1.8048225659690629, "grad_norm": 0.565336133719167, "learning_rate": 3.5582365531940387e-06, "loss": 0.0194, "step": 3967 }, { "epoch": 1.805277525022748, "grad_norm": 0.5777546052776361, "learning_rate": 3.5575890445225686e-06, "loss": 0.0239, "step": 3968 }, { "epoch": 1.805732484076433, "grad_norm": 0.5964416253475773, "learning_rate": 3.5569414494296982e-06, "loss": 0.0278, "step": 3969 }, { "epoch": 1.8061874431301184, "grad_norm": 0.6472019135476743, "learning_rate": 3.5562937679683455e-06, "loss": 0.0274, "step": 3970 }, { "epoch": 1.8066424021838035, "grad_norm": 0.5392367514210847, "learning_rate": 3.5556460001914357e-06, "loss": 0.0282, "step": 3971 }, { "epoch": 1.8070973612374885, "grad_norm": 0.4034773412537766, "learning_rate": 3.5549981461519028e-06, "loss": 0.0097, "step": 3972 }, { "epoch": 1.8075523202911739, "grad_norm": 0.6621325655516644, "learning_rate": 3.554350205902685e-06, "loss": 0.0328, "step": 3973 }, { "epoch": 1.808007279344859, "grad_norm": 0.5577167680167383, "learning_rate": 3.55370217949673e-06, "loss": 0.0253, "step": 3974 }, { "epoch": 1.808462238398544, "grad_norm": 0.7630664920699377, "learning_rate": 3.5530540669869915e-06, "loss": 0.0396, "step": 3975 }, { "epoch": 1.8089171974522293, "grad_norm": 0.5696580009227497, "learning_rate": 3.5524058684264304e-06, "loss": 0.0246, "step": 3976 }, { "epoch": 1.8093721565059144, "grad_norm": 0.5108750390920631, "learning_rate": 3.551757583868015e-06, "loss": 0.0127, "step": 3977 }, { "epoch": 1.8098271155595995, "grad_norm": 0.6308713566048816, "learning_rate": 3.551109213364717e-06, "loss": 0.0238, "step": 3978 }, { "epoch": 1.8102820746132848, "grad_norm": 0.6318831614695566, "learning_rate": 3.5504607569695237e-06, "loss": 0.0264, "step": 3979 }, { "epoch": 1.81073703366697, "grad_norm": 0.4408743836772877, "learning_rate": 3.5498122147354198e-06, "loss": 0.0151, "step": 3980 }, { "epoch": 1.811191992720655, "grad_norm": 0.46295335602810883, "learning_rate": 3.549163586715403e-06, "loss": 0.0234, "step": 3981 }, { "epoch": 1.8116469517743403, "grad_norm": 0.9393187059454814, "learning_rate": 3.5485148729624756e-06, "loss": 0.0288, "step": 3982 }, { "epoch": 1.8121019108280256, "grad_norm": 0.36590617604906744, "learning_rate": 3.5478660735296476e-06, "loss": 0.0115, "step": 3983 }, { "epoch": 1.8125568698817105, "grad_norm": 0.48888813010577203, "learning_rate": 3.547217188469937e-06, "loss": 0.0214, "step": 3984 }, { "epoch": 1.8130118289353958, "grad_norm": 0.5619021177212314, "learning_rate": 3.5465682178363657e-06, "loss": 0.0242, "step": 3985 }, { "epoch": 1.8134667879890811, "grad_norm": 0.663724732525883, "learning_rate": 3.5459191616819676e-06, "loss": 0.0341, "step": 3986 }, { "epoch": 1.813921747042766, "grad_norm": 0.8746818177533593, "learning_rate": 3.545270020059778e-06, "loss": 0.0386, "step": 3987 }, { "epoch": 1.8143767060964513, "grad_norm": 0.6107295363923879, "learning_rate": 3.544620793022842e-06, "loss": 0.0219, "step": 3988 }, { "epoch": 1.8148316651501366, "grad_norm": 0.6676118495207064, "learning_rate": 3.543971480624214e-06, "loss": 0.0227, "step": 3989 }, { "epoch": 1.8152866242038217, "grad_norm": 0.707667366118332, "learning_rate": 3.5433220829169495e-06, "loss": 0.0301, "step": 3990 }, { "epoch": 1.8157415832575068, "grad_norm": 0.6359747410412094, "learning_rate": 3.542672599954117e-06, "loss": 0.0147, "step": 3991 }, { "epoch": 1.816196542311192, "grad_norm": 0.48205299954100556, "learning_rate": 3.5420230317887884e-06, "loss": 0.019, "step": 3992 }, { "epoch": 1.8166515013648772, "grad_norm": 0.5832709597005538, "learning_rate": 3.5413733784740423e-06, "loss": 0.0217, "step": 3993 }, { "epoch": 1.8171064604185623, "grad_norm": 0.7912708862749763, "learning_rate": 3.5407236400629676e-06, "loss": 0.0342, "step": 3994 }, { "epoch": 1.8175614194722476, "grad_norm": 0.7468407956182314, "learning_rate": 3.5400738166086555e-06, "loss": 0.0159, "step": 3995 }, { "epoch": 1.8180163785259327, "grad_norm": 0.7689464353594428, "learning_rate": 3.5394239081642095e-06, "loss": 0.0173, "step": 3996 }, { "epoch": 1.8184713375796178, "grad_norm": 0.6623555020814018, "learning_rate": 3.5387739147827355e-06, "loss": 0.0352, "step": 3997 }, { "epoch": 1.818926296633303, "grad_norm": 0.6454186342811251, "learning_rate": 3.538123836517348e-06, "loss": 0.0322, "step": 3998 }, { "epoch": 1.8193812556869882, "grad_norm": 0.6138555588869485, "learning_rate": 3.5374736734211686e-06, "loss": 0.0275, "step": 3999 }, { "epoch": 1.8198362147406733, "grad_norm": 0.3639057356725028, "learning_rate": 3.5368234255473255e-06, "loss": 0.0119, "step": 4000 }, { "epoch": 1.8202911737943586, "grad_norm": 0.5597933598550137, "learning_rate": 3.536173092948955e-06, "loss": 0.0281, "step": 4001 }, { "epoch": 1.8207461328480437, "grad_norm": 0.6420504528345693, "learning_rate": 3.535522675679198e-06, "loss": 0.0299, "step": 4002 }, { "epoch": 1.8212010919017287, "grad_norm": 0.6559042372961266, "learning_rate": 3.534872173791205e-06, "loss": 0.0206, "step": 4003 }, { "epoch": 1.821656050955414, "grad_norm": 0.7011306590734704, "learning_rate": 3.534221587338131e-06, "loss": 0.0343, "step": 4004 }, { "epoch": 1.8221110100090991, "grad_norm": 0.5609435153804432, "learning_rate": 3.5335709163731395e-06, "loss": 0.0219, "step": 4005 }, { "epoch": 1.8225659690627842, "grad_norm": 0.6995344337679733, "learning_rate": 3.5329201609494002e-06, "loss": 0.0308, "step": 4006 }, { "epoch": 1.8230209281164695, "grad_norm": 0.5062579058664941, "learning_rate": 3.53226932112009e-06, "loss": 0.0223, "step": 4007 }, { "epoch": 1.8234758871701549, "grad_norm": 0.535475016515065, "learning_rate": 3.5316183969383927e-06, "loss": 0.0243, "step": 4008 }, { "epoch": 1.8239308462238397, "grad_norm": 0.49852208635235656, "learning_rate": 3.5309673884574986e-06, "loss": 0.0267, "step": 4009 }, { "epoch": 1.824385805277525, "grad_norm": 0.4575123075194255, "learning_rate": 3.530316295730605e-06, "loss": 0.0181, "step": 4010 }, { "epoch": 1.8248407643312103, "grad_norm": 0.654568819597339, "learning_rate": 3.5296651188109177e-06, "loss": 0.0308, "step": 4011 }, { "epoch": 1.8252957233848952, "grad_norm": 0.7008482823249849, "learning_rate": 3.529013857751646e-06, "loss": 0.0267, "step": 4012 }, { "epoch": 1.8257506824385805, "grad_norm": 0.619237197966384, "learning_rate": 3.5283625126060084e-06, "loss": 0.0261, "step": 4013 }, { "epoch": 1.8262056414922658, "grad_norm": 0.5431311052199314, "learning_rate": 3.5277110834272305e-06, "loss": 0.018, "step": 4014 }, { "epoch": 1.826660600545951, "grad_norm": 0.5058084145841144, "learning_rate": 3.527059570268545e-06, "loss": 0.0254, "step": 4015 }, { "epoch": 1.827115559599636, "grad_norm": 0.7501468653881717, "learning_rate": 3.5264079731831885e-06, "loss": 0.0372, "step": 4016 }, { "epoch": 1.8275705186533213, "grad_norm": 0.6697170559337703, "learning_rate": 3.5257562922244074e-06, "loss": 0.0321, "step": 4017 }, { "epoch": 1.8280254777070064, "grad_norm": 0.5431197605862713, "learning_rate": 3.5251045274454554e-06, "loss": 0.022, "step": 4018 }, { "epoch": 1.8284804367606915, "grad_norm": 0.5702555911626316, "learning_rate": 3.5244526788995905e-06, "loss": 0.0299, "step": 4019 }, { "epoch": 1.8289353958143768, "grad_norm": 0.7412336252532792, "learning_rate": 3.5238007466400787e-06, "loss": 0.0229, "step": 4020 }, { "epoch": 1.829390354868062, "grad_norm": 0.49646076515900334, "learning_rate": 3.5231487307201933e-06, "loss": 0.0238, "step": 4021 }, { "epoch": 1.829845313921747, "grad_norm": 0.6711020454374648, "learning_rate": 3.5224966311932145e-06, "loss": 0.0345, "step": 4022 }, { "epoch": 1.8303002729754323, "grad_norm": 0.622335187656626, "learning_rate": 3.521844448112428e-06, "loss": 0.0209, "step": 4023 }, { "epoch": 1.8307552320291174, "grad_norm": 0.6177719560348343, "learning_rate": 3.5211921815311284e-06, "loss": 0.0353, "step": 4024 }, { "epoch": 1.8312101910828025, "grad_norm": 0.5447190507063302, "learning_rate": 3.5205398315026156e-06, "loss": 0.0205, "step": 4025 }, { "epoch": 1.8316651501364878, "grad_norm": 0.4803194761696715, "learning_rate": 3.5198873980801956e-06, "loss": 0.0211, "step": 4026 }, { "epoch": 1.8321201091901729, "grad_norm": 0.39561031653300816, "learning_rate": 3.519234881317184e-06, "loss": 0.013, "step": 4027 }, { "epoch": 1.832575068243858, "grad_norm": 1.0812013544527848, "learning_rate": 3.5185822812669e-06, "loss": 0.0299, "step": 4028 }, { "epoch": 1.8330300272975433, "grad_norm": 0.5052824583359373, "learning_rate": 3.517929597982672e-06, "loss": 0.0262, "step": 4029 }, { "epoch": 1.8334849863512284, "grad_norm": 0.5581492140910633, "learning_rate": 3.5172768315178345e-06, "loss": 0.0247, "step": 4030 }, { "epoch": 1.8339399454049135, "grad_norm": 0.47193848945879807, "learning_rate": 3.5166239819257286e-06, "loss": 0.0157, "step": 4031 }, { "epoch": 1.8343949044585988, "grad_norm": 0.4935662522770819, "learning_rate": 3.5159710492597014e-06, "loss": 0.0208, "step": 4032 }, { "epoch": 1.8348498635122839, "grad_norm": 0.5117452200018818, "learning_rate": 3.5153180335731085e-06, "loss": 0.0238, "step": 4033 }, { "epoch": 1.835304822565969, "grad_norm": 0.7543006703122391, "learning_rate": 3.5146649349193108e-06, "loss": 0.0297, "step": 4034 }, { "epoch": 1.8357597816196543, "grad_norm": 0.6774993129537485, "learning_rate": 3.514011753351677e-06, "loss": 0.0316, "step": 4035 }, { "epoch": 1.8362147406733396, "grad_norm": 0.3845290001458167, "learning_rate": 3.5133584889235817e-06, "loss": 0.0122, "step": 4036 }, { "epoch": 1.8366696997270244, "grad_norm": 0.635937446710609, "learning_rate": 3.5127051416884067e-06, "loss": 0.0352, "step": 4037 }, { "epoch": 1.8371246587807097, "grad_norm": 0.6653192307681326, "learning_rate": 3.5120517116995413e-06, "loss": 0.03, "step": 4038 }, { "epoch": 1.837579617834395, "grad_norm": 0.5961246694882495, "learning_rate": 3.5113981990103807e-06, "loss": 0.0297, "step": 4039 }, { "epoch": 1.83803457688808, "grad_norm": 0.7144300843628418, "learning_rate": 3.5107446036743257e-06, "loss": 0.026, "step": 4040 }, { "epoch": 1.8384895359417652, "grad_norm": 0.3867494928962891, "learning_rate": 3.510090925744787e-06, "loss": 0.0253, "step": 4041 }, { "epoch": 1.8389444949954505, "grad_norm": 0.5989351669064817, "learning_rate": 3.5094371652751797e-06, "loss": 0.0288, "step": 4042 }, { "epoch": 1.8393994540491356, "grad_norm": 0.5961761652168938, "learning_rate": 3.508783322318925e-06, "loss": 0.0257, "step": 4043 }, { "epoch": 1.8398544131028207, "grad_norm": 0.4725898462098795, "learning_rate": 3.508129396929453e-06, "loss": 0.0194, "step": 4044 }, { "epoch": 1.840309372156506, "grad_norm": 0.5858154515029264, "learning_rate": 3.5074753891601996e-06, "loss": 0.0198, "step": 4045 }, { "epoch": 1.8407643312101911, "grad_norm": 0.5996895405853112, "learning_rate": 3.5068212990646065e-06, "loss": 0.0277, "step": 4046 }, { "epoch": 1.8412192902638762, "grad_norm": 0.5959211636447459, "learning_rate": 3.506167126696125e-06, "loss": 0.0271, "step": 4047 }, { "epoch": 1.8416742493175615, "grad_norm": 0.5232705229308232, "learning_rate": 3.5055128721082083e-06, "loss": 0.0221, "step": 4048 }, { "epoch": 1.8421292083712466, "grad_norm": 0.6822744533195637, "learning_rate": 3.5048585353543212e-06, "loss": 0.0301, "step": 4049 }, { "epoch": 1.8425841674249317, "grad_norm": 0.5455110382137249, "learning_rate": 3.5042041164879324e-06, "loss": 0.0253, "step": 4050 }, { "epoch": 1.843039126478617, "grad_norm": 7.258958353361619, "learning_rate": 3.503549615562518e-06, "loss": 0.0643, "step": 4051 }, { "epoch": 1.843494085532302, "grad_norm": 0.7990959843471525, "learning_rate": 3.5028950326315615e-06, "loss": 0.0317, "step": 4052 }, { "epoch": 1.8439490445859872, "grad_norm": 1.3086270545338479, "learning_rate": 3.502240367748551e-06, "loss": 0.0325, "step": 4053 }, { "epoch": 1.8444040036396725, "grad_norm": 0.5027413361342123, "learning_rate": 3.501585620966985e-06, "loss": 0.0237, "step": 4054 }, { "epoch": 1.8448589626933576, "grad_norm": 0.6472714468679889, "learning_rate": 3.5009307923403634e-06, "loss": 0.0318, "step": 4055 }, { "epoch": 1.8453139217470427, "grad_norm": 0.605065717239189, "learning_rate": 3.5002758819221993e-06, "loss": 0.0306, "step": 4056 }, { "epoch": 1.845768880800728, "grad_norm": 0.6199856958680251, "learning_rate": 3.499620889766007e-06, "loss": 0.0265, "step": 4057 }, { "epoch": 1.846223839854413, "grad_norm": 0.5120392464678569, "learning_rate": 3.4989658159253094e-06, "loss": 0.0197, "step": 4058 }, { "epoch": 1.8466787989080982, "grad_norm": 0.4687404162265479, "learning_rate": 3.4983106604536367e-06, "loss": 0.0185, "step": 4059 }, { "epoch": 1.8471337579617835, "grad_norm": 0.5805973356268656, "learning_rate": 3.4976554234045253e-06, "loss": 0.0303, "step": 4060 }, { "epoch": 1.8475887170154686, "grad_norm": 0.4302366131533026, "learning_rate": 3.4970001048315184e-06, "loss": 0.0156, "step": 4061 }, { "epoch": 1.8480436760691537, "grad_norm": 0.46184427619175644, "learning_rate": 3.496344704788165e-06, "loss": 0.0175, "step": 4062 }, { "epoch": 1.848498635122839, "grad_norm": 0.7186547724679246, "learning_rate": 3.4956892233280214e-06, "loss": 0.0264, "step": 4063 }, { "epoch": 1.8489535941765243, "grad_norm": 0.5444957276565535, "learning_rate": 3.4950336605046513e-06, "loss": 0.0192, "step": 4064 }, { "epoch": 1.8494085532302091, "grad_norm": 0.8250887590345779, "learning_rate": 3.4943780163716232e-06, "loss": 0.0346, "step": 4065 }, { "epoch": 1.8498635122838945, "grad_norm": 0.2799274145074072, "learning_rate": 3.4937222909825157e-06, "loss": 0.0088, "step": 4066 }, { "epoch": 1.8503184713375798, "grad_norm": 0.677323307811774, "learning_rate": 3.493066484390909e-06, "loss": 0.0277, "step": 4067 }, { "epoch": 1.8507734303912646, "grad_norm": 0.5194534881038342, "learning_rate": 3.4924105966503952e-06, "loss": 0.0222, "step": 4068 }, { "epoch": 1.85122838944495, "grad_norm": 0.5496648617283905, "learning_rate": 3.491754627814568e-06, "loss": 0.022, "step": 4069 }, { "epoch": 1.8516833484986353, "grad_norm": 0.7407519425454147, "learning_rate": 3.491098577937031e-06, "loss": 0.0249, "step": 4070 }, { "epoch": 1.8521383075523203, "grad_norm": 0.9158756446548485, "learning_rate": 3.4904424470713947e-06, "loss": 0.0301, "step": 4071 }, { "epoch": 1.8525932666060054, "grad_norm": 0.5413501841399587, "learning_rate": 3.4897862352712743e-06, "loss": 0.0229, "step": 4072 }, { "epoch": 1.8530482256596907, "grad_norm": 0.43338773222409777, "learning_rate": 3.4891299425902923e-06, "loss": 0.0223, "step": 4073 }, { "epoch": 1.8535031847133758, "grad_norm": 0.6570549149317368, "learning_rate": 3.4884735690820786e-06, "loss": 0.0262, "step": 4074 }, { "epoch": 1.853958143767061, "grad_norm": 0.5314666943552728, "learning_rate": 3.4878171148002694e-06, "loss": 0.0309, "step": 4075 }, { "epoch": 1.8544131028207462, "grad_norm": 0.5727180397610115, "learning_rate": 3.4871605797985052e-06, "loss": 0.0222, "step": 4076 }, { "epoch": 1.8548680618744313, "grad_norm": 0.6161967643537954, "learning_rate": 3.486503964130437e-06, "loss": 0.0194, "step": 4077 }, { "epoch": 1.8553230209281164, "grad_norm": 0.4706605286549067, "learning_rate": 3.4858472678497204e-06, "loss": 0.0147, "step": 4078 }, { "epoch": 1.8557779799818017, "grad_norm": 0.5115881021770834, "learning_rate": 3.4851904910100166e-06, "loss": 0.0252, "step": 4079 }, { "epoch": 1.8562329390354868, "grad_norm": 0.609519661045525, "learning_rate": 3.4845336336649943e-06, "loss": 0.0241, "step": 4080 }, { "epoch": 1.856687898089172, "grad_norm": 1.0261180461187072, "learning_rate": 3.483876695868331e-06, "loss": 0.0495, "step": 4081 }, { "epoch": 1.8571428571428572, "grad_norm": 0.7173150041656413, "learning_rate": 3.483219677673706e-06, "loss": 0.0233, "step": 4082 }, { "epoch": 1.8575978161965423, "grad_norm": 0.80501614780236, "learning_rate": 3.4825625791348093e-06, "loss": 0.0253, "step": 4083 }, { "epoch": 1.8580527752502274, "grad_norm": 0.5729737030518369, "learning_rate": 3.481905400305336e-06, "loss": 0.0183, "step": 4084 }, { "epoch": 1.8585077343039127, "grad_norm": 0.4912515747387265, "learning_rate": 3.481248141238988e-06, "loss": 0.0219, "step": 4085 }, { "epoch": 1.8589626933575978, "grad_norm": 0.5708473685074191, "learning_rate": 3.480590801989473e-06, "loss": 0.0226, "step": 4086 }, { "epoch": 1.8594176524112829, "grad_norm": 0.6212403183165005, "learning_rate": 3.479933382610506e-06, "loss": 0.018, "step": 4087 }, { "epoch": 1.8598726114649682, "grad_norm": 0.7378714652750996, "learning_rate": 3.479275883155808e-06, "loss": 0.0205, "step": 4088 }, { "epoch": 1.8603275705186533, "grad_norm": 0.7007909915589021, "learning_rate": 3.478618303679108e-06, "loss": 0.0265, "step": 4089 }, { "epoch": 1.8607825295723384, "grad_norm": 0.6772633104268468, "learning_rate": 3.4779606442341385e-06, "loss": 0.0304, "step": 4090 }, { "epoch": 1.8612374886260237, "grad_norm": 0.5741879903610126, "learning_rate": 3.477302904874642e-06, "loss": 0.0284, "step": 4091 }, { "epoch": 1.861692447679709, "grad_norm": 0.6953940223088877, "learning_rate": 3.476645085654366e-06, "loss": 0.0222, "step": 4092 }, { "epoch": 1.8621474067333939, "grad_norm": 0.6419073123672607, "learning_rate": 3.4759871866270633e-06, "loss": 0.0193, "step": 4093 }, { "epoch": 1.8626023657870792, "grad_norm": 0.4615499972469333, "learning_rate": 3.475329207846496e-06, "loss": 0.0221, "step": 4094 }, { "epoch": 1.8630573248407645, "grad_norm": 0.5886497963762636, "learning_rate": 3.4746711493664305e-06, "loss": 0.0229, "step": 4095 }, { "epoch": 1.8635122838944493, "grad_norm": 0.5615878201988692, "learning_rate": 3.4740130112406395e-06, "loss": 0.0192, "step": 4096 }, { "epoch": 1.8639672429481347, "grad_norm": 0.47702292606332697, "learning_rate": 3.473354793522904e-06, "loss": 0.0183, "step": 4097 }, { "epoch": 1.86442220200182, "grad_norm": 0.8871040808289109, "learning_rate": 3.472696496267011e-06, "loss": 0.0392, "step": 4098 }, { "epoch": 1.864877161055505, "grad_norm": 0.3908130222003508, "learning_rate": 3.4720381195267523e-06, "loss": 0.0179, "step": 4099 }, { "epoch": 1.8653321201091901, "grad_norm": 0.5662829255482532, "learning_rate": 3.4713796633559283e-06, "loss": 0.0196, "step": 4100 }, { "epoch": 1.8657870791628755, "grad_norm": 0.737562327709937, "learning_rate": 3.4707211278083453e-06, "loss": 0.0377, "step": 4101 }, { "epoch": 1.8662420382165605, "grad_norm": 0.631259881141492, "learning_rate": 3.470062512937815e-06, "loss": 0.0301, "step": 4102 }, { "epoch": 1.8666969972702456, "grad_norm": 0.5696674843363798, "learning_rate": 3.4694038187981573e-06, "loss": 0.0227, "step": 4103 }, { "epoch": 1.867151956323931, "grad_norm": 0.7185173322973434, "learning_rate": 3.468745045443197e-06, "loss": 0.0294, "step": 4104 }, { "epoch": 1.867606915377616, "grad_norm": 0.4947379541602466, "learning_rate": 3.468086192926767e-06, "loss": 0.0217, "step": 4105 }, { "epoch": 1.8680618744313011, "grad_norm": 0.4149424427669872, "learning_rate": 3.4674272613027043e-06, "loss": 0.0108, "step": 4106 }, { "epoch": 1.8685168334849864, "grad_norm": 0.6307821995155474, "learning_rate": 3.4667682506248547e-06, "loss": 0.0296, "step": 4107 }, { "epoch": 1.8689717925386715, "grad_norm": 0.4474845214084304, "learning_rate": 3.46610916094707e-06, "loss": 0.0203, "step": 4108 }, { "epoch": 1.8694267515923566, "grad_norm": 0.564919365150898, "learning_rate": 3.465449992323208e-06, "loss": 0.0231, "step": 4109 }, { "epoch": 1.869881710646042, "grad_norm": 0.7680842559322145, "learning_rate": 3.4647907448071315e-06, "loss": 0.0422, "step": 4110 }, { "epoch": 1.870336669699727, "grad_norm": 0.46086130249621043, "learning_rate": 3.464131418452713e-06, "loss": 0.0178, "step": 4111 }, { "epoch": 1.870791628753412, "grad_norm": 0.5725516629551197, "learning_rate": 3.463472013313829e-06, "loss": 0.018, "step": 4112 }, { "epoch": 1.8712465878070974, "grad_norm": 0.6990244627900922, "learning_rate": 3.4628125294443625e-06, "loss": 0.0277, "step": 4113 }, { "epoch": 1.8717015468607825, "grad_norm": 0.610243315310828, "learning_rate": 3.4621529668982047e-06, "loss": 0.027, "step": 4114 }, { "epoch": 1.8721565059144676, "grad_norm": 0.465210373345747, "learning_rate": 3.4614933257292514e-06, "loss": 0.0217, "step": 4115 }, { "epoch": 1.872611464968153, "grad_norm": 0.6601700350193268, "learning_rate": 3.4608336059914057e-06, "loss": 0.0169, "step": 4116 }, { "epoch": 1.873066424021838, "grad_norm": 0.42810973757761955, "learning_rate": 3.4601738077385766e-06, "loss": 0.0166, "step": 4117 }, { "epoch": 1.873521383075523, "grad_norm": 0.8908289199884776, "learning_rate": 3.4595139310246795e-06, "loss": 0.0427, "step": 4118 }, { "epoch": 1.8739763421292084, "grad_norm": 0.732326884916733, "learning_rate": 3.4588539759036377e-06, "loss": 0.0275, "step": 4119 }, { "epoch": 1.8744313011828937, "grad_norm": 0.6741926881885002, "learning_rate": 3.4581939424293792e-06, "loss": 0.0258, "step": 4120 }, { "epoch": 1.8748862602365786, "grad_norm": 0.7545921285061042, "learning_rate": 3.457533830655838e-06, "loss": 0.0228, "step": 4121 }, { "epoch": 1.8753412192902639, "grad_norm": 0.8435199876862758, "learning_rate": 3.456873640636958e-06, "loss": 0.0231, "step": 4122 }, { "epoch": 1.8757961783439492, "grad_norm": 0.4265752727991466, "learning_rate": 3.456213372426684e-06, "loss": 0.0097, "step": 4123 }, { "epoch": 1.876251137397634, "grad_norm": 0.5380126304302891, "learning_rate": 3.4555530260789715e-06, "loss": 0.0253, "step": 4124 }, { "epoch": 1.8767060964513194, "grad_norm": 0.734337864059189, "learning_rate": 3.4548926016477815e-06, "loss": 0.0301, "step": 4125 }, { "epoch": 1.8771610555050047, "grad_norm": 0.7381447086591394, "learning_rate": 3.4542320991870803e-06, "loss": 0.0272, "step": 4126 }, { "epoch": 1.8776160145586898, "grad_norm": 0.6502773644146786, "learning_rate": 3.4535715187508406e-06, "loss": 0.0206, "step": 4127 }, { "epoch": 1.8780709736123748, "grad_norm": 0.5876271299574465, "learning_rate": 3.4529108603930428e-06, "loss": 0.0195, "step": 4128 }, { "epoch": 1.8785259326660602, "grad_norm": 0.7381623404288203, "learning_rate": 3.452250124167674e-06, "loss": 0.0292, "step": 4129 }, { "epoch": 1.8789808917197452, "grad_norm": 0.6195594310864881, "learning_rate": 3.451589310128724e-06, "loss": 0.0253, "step": 4130 }, { "epoch": 1.8794358507734303, "grad_norm": 0.42037125483851656, "learning_rate": 3.450928418330193e-06, "loss": 0.0147, "step": 4131 }, { "epoch": 1.8798908098271156, "grad_norm": 0.9171367382983645, "learning_rate": 3.450267448826087e-06, "loss": 0.0339, "step": 4132 }, { "epoch": 1.8803457688808007, "grad_norm": 0.7164310366466077, "learning_rate": 3.4496064016704158e-06, "loss": 0.026, "step": 4133 }, { "epoch": 1.8808007279344858, "grad_norm": 0.6373806395860385, "learning_rate": 3.4489452769171982e-06, "loss": 0.0267, "step": 4134 }, { "epoch": 1.8812556869881711, "grad_norm": 0.7023389365921144, "learning_rate": 3.4482840746204573e-06, "loss": 0.0253, "step": 4135 }, { "epoch": 1.8817106460418562, "grad_norm": 0.5837647641296511, "learning_rate": 3.4476227948342247e-06, "loss": 0.0267, "step": 4136 }, { "epoch": 1.8821656050955413, "grad_norm": 0.7176431935593713, "learning_rate": 3.446961437612536e-06, "loss": 0.0279, "step": 4137 }, { "epoch": 1.8826205641492266, "grad_norm": 0.6987687781193473, "learning_rate": 3.4463000030094356e-06, "loss": 0.0322, "step": 4138 }, { "epoch": 1.8830755232029117, "grad_norm": 0.5301867923938187, "learning_rate": 3.445638491078973e-06, "loss": 0.0211, "step": 4139 }, { "epoch": 1.8835304822565968, "grad_norm": 0.6650944681919888, "learning_rate": 3.4449769018752027e-06, "loss": 0.0299, "step": 4140 }, { "epoch": 1.8839854413102821, "grad_norm": 0.7634384647834561, "learning_rate": 3.4443152354521882e-06, "loss": 0.0308, "step": 4141 }, { "epoch": 1.8844404003639672, "grad_norm": 0.6887678086511066, "learning_rate": 3.4436534918639957e-06, "loss": 0.0403, "step": 4142 }, { "epoch": 1.8848953594176523, "grad_norm": 0.6353392991911169, "learning_rate": 3.442991671164703e-06, "loss": 0.0252, "step": 4143 }, { "epoch": 1.8853503184713376, "grad_norm": 0.6053624978035878, "learning_rate": 3.4423297734083884e-06, "loss": 0.0249, "step": 4144 }, { "epoch": 1.8858052775250227, "grad_norm": 0.5445503993049013, "learning_rate": 3.4416677986491397e-06, "loss": 0.0196, "step": 4145 }, { "epoch": 1.8862602365787078, "grad_norm": 0.5508086570636826, "learning_rate": 3.4410057469410524e-06, "loss": 0.0159, "step": 4146 }, { "epoch": 1.886715195632393, "grad_norm": 0.6711713607070434, "learning_rate": 3.4403436183382244e-06, "loss": 0.0346, "step": 4147 }, { "epoch": 1.8871701546860784, "grad_norm": 0.5822389851345435, "learning_rate": 3.4396814128947626e-06, "loss": 0.0189, "step": 4148 }, { "epoch": 1.8876251137397633, "grad_norm": 0.47747445766686536, "learning_rate": 3.4390191306647787e-06, "loss": 0.0188, "step": 4149 }, { "epoch": 1.8880800727934486, "grad_norm": 0.5937013080982918, "learning_rate": 3.4383567717023923e-06, "loss": 0.0254, "step": 4150 }, { "epoch": 1.888535031847134, "grad_norm": 0.4623615492631241, "learning_rate": 3.437694336061729e-06, "loss": 0.0166, "step": 4151 }, { "epoch": 1.8889899909008188, "grad_norm": 0.5363330302234371, "learning_rate": 3.437031823796918e-06, "loss": 0.0312, "step": 4152 }, { "epoch": 1.889444949954504, "grad_norm": 0.7446609127283842, "learning_rate": 3.436369234962099e-06, "loss": 0.0345, "step": 4153 }, { "epoch": 1.8898999090081894, "grad_norm": 0.44932471079634706, "learning_rate": 3.4357065696114134e-06, "loss": 0.0169, "step": 4154 }, { "epoch": 1.8903548680618745, "grad_norm": 1.0159048182080996, "learning_rate": 3.435043827799014e-06, "loss": 0.0543, "step": 4155 }, { "epoch": 1.8908098271155596, "grad_norm": 0.7244525855805061, "learning_rate": 3.4343810095790547e-06, "loss": 0.0367, "step": 4156 }, { "epoch": 1.8912647861692449, "grad_norm": 0.5535646313035905, "learning_rate": 3.4337181150056984e-06, "loss": 0.0208, "step": 4157 }, { "epoch": 1.89171974522293, "grad_norm": 0.5546255967675834, "learning_rate": 3.433055144133116e-06, "loss": 0.0173, "step": 4158 }, { "epoch": 1.892174704276615, "grad_norm": 0.4782134308378916, "learning_rate": 3.432392097015479e-06, "loss": 0.0198, "step": 4159 }, { "epoch": 1.8926296633303004, "grad_norm": 0.6382607307170164, "learning_rate": 3.431728973706972e-06, "loss": 0.0309, "step": 4160 }, { "epoch": 1.8930846223839854, "grad_norm": 0.8475761200303933, "learning_rate": 3.4310657742617804e-06, "loss": 0.0361, "step": 4161 }, { "epoch": 1.8935395814376705, "grad_norm": 0.5131741209692655, "learning_rate": 3.4304024987340982e-06, "loss": 0.0273, "step": 4162 }, { "epoch": 1.8939945404913558, "grad_norm": 0.485346284414405, "learning_rate": 3.429739147178126e-06, "loss": 0.0147, "step": 4163 }, { "epoch": 1.894449499545041, "grad_norm": 0.5690778987303913, "learning_rate": 3.4290757196480683e-06, "loss": 0.0217, "step": 4164 }, { "epoch": 1.894904458598726, "grad_norm": 0.6884615505252311, "learning_rate": 3.42841221619814e-06, "loss": 0.0303, "step": 4165 }, { "epoch": 1.8953594176524113, "grad_norm": 0.7378294236885259, "learning_rate": 3.4277486368825563e-06, "loss": 0.0272, "step": 4166 }, { "epoch": 1.8958143767060964, "grad_norm": 0.5801359227478589, "learning_rate": 3.427084981755545e-06, "loss": 0.0233, "step": 4167 }, { "epoch": 1.8962693357597815, "grad_norm": 0.652635134686771, "learning_rate": 3.4264212508713357e-06, "loss": 0.0307, "step": 4168 }, { "epoch": 1.8967242948134668, "grad_norm": 0.4994589269899435, "learning_rate": 3.4257574442841644e-06, "loss": 0.0178, "step": 4169 }, { "epoch": 1.897179253867152, "grad_norm": 0.6766778640338458, "learning_rate": 3.425093562048276e-06, "loss": 0.0295, "step": 4170 }, { "epoch": 1.897634212920837, "grad_norm": 5.2663205074351245, "learning_rate": 3.424429604217919e-06, "loss": 0.052, "step": 4171 }, { "epoch": 1.8980891719745223, "grad_norm": 0.6708187266402499, "learning_rate": 3.4237655708473506e-06, "loss": 0.0317, "step": 4172 }, { "epoch": 1.8985441310282076, "grad_norm": 0.8460880256084787, "learning_rate": 3.4231014619908303e-06, "loss": 0.0319, "step": 4173 }, { "epoch": 1.8989990900818925, "grad_norm": 0.665888271390732, "learning_rate": 3.422437277702628e-06, "loss": 0.0254, "step": 4174 }, { "epoch": 1.8994540491355778, "grad_norm": 0.4823846028707616, "learning_rate": 3.4217730180370168e-06, "loss": 0.0202, "step": 4175 }, { "epoch": 1.8999090081892631, "grad_norm": 0.4287185783070002, "learning_rate": 3.4211086830482766e-06, "loss": 0.015, "step": 4176 }, { "epoch": 1.900363967242948, "grad_norm": 0.6936565498678845, "learning_rate": 3.420444272790695e-06, "loss": 0.0265, "step": 4177 }, { "epoch": 1.9008189262966333, "grad_norm": 0.5758321905431244, "learning_rate": 3.419779787318564e-06, "loss": 0.0195, "step": 4178 }, { "epoch": 1.9012738853503186, "grad_norm": 0.4329281864765619, "learning_rate": 3.4191152266861826e-06, "loss": 0.0191, "step": 4179 }, { "epoch": 1.9017288444040037, "grad_norm": 0.7440356541476081, "learning_rate": 3.4184505909478554e-06, "loss": 0.0333, "step": 4180 }, { "epoch": 1.9021838034576888, "grad_norm": 0.6565549249546254, "learning_rate": 3.417785880157894e-06, "loss": 0.0296, "step": 4181 }, { "epoch": 1.902638762511374, "grad_norm": 0.6540198355899479, "learning_rate": 3.417121094370615e-06, "loss": 0.0308, "step": 4182 }, { "epoch": 1.9030937215650592, "grad_norm": 0.6776201788773397, "learning_rate": 3.416456233640342e-06, "loss": 0.0295, "step": 4183 }, { "epoch": 1.9035486806187443, "grad_norm": 0.4963019048293424, "learning_rate": 3.4157912980214036e-06, "loss": 0.026, "step": 4184 }, { "epoch": 1.9040036396724296, "grad_norm": 0.7530984269401886, "learning_rate": 3.4151262875681362e-06, "loss": 0.0302, "step": 4185 }, { "epoch": 1.9044585987261147, "grad_norm": 0.48921722659817435, "learning_rate": 3.4144612023348823e-06, "loss": 0.0204, "step": 4186 }, { "epoch": 1.9049135577797998, "grad_norm": 0.5070336934731073, "learning_rate": 3.4137960423759874e-06, "loss": 0.0285, "step": 4187 }, { "epoch": 1.905368516833485, "grad_norm": 0.5721335887842549, "learning_rate": 3.413130807745807e-06, "loss": 0.0201, "step": 4188 }, { "epoch": 1.9058234758871702, "grad_norm": 0.5816604680915403, "learning_rate": 3.4124654984987003e-06, "loss": 0.0265, "step": 4189 }, { "epoch": 1.9062784349408552, "grad_norm": 0.5052944963535851, "learning_rate": 3.4118001146890345e-06, "loss": 0.0207, "step": 4190 }, { "epoch": 1.9067333939945406, "grad_norm": 0.4227736292224336, "learning_rate": 3.41113465637118e-06, "loss": 0.0128, "step": 4191 }, { "epoch": 1.9071883530482256, "grad_norm": 0.6038825988712329, "learning_rate": 3.4104691235995173e-06, "loss": 0.03, "step": 4192 }, { "epoch": 1.9076433121019107, "grad_norm": 0.6929729978992308, "learning_rate": 3.4098035164284284e-06, "loss": 0.0315, "step": 4193 }, { "epoch": 1.908098271155596, "grad_norm": 0.4228543731802424, "learning_rate": 3.409137834912305e-06, "loss": 0.0155, "step": 4194 }, { "epoch": 1.9085532302092811, "grad_norm": 0.8112317382400335, "learning_rate": 3.408472079105544e-06, "loss": 0.0386, "step": 4195 }, { "epoch": 1.9090081892629662, "grad_norm": 0.5241864221057698, "learning_rate": 3.4078062490625465e-06, "loss": 0.0217, "step": 4196 }, { "epoch": 1.9094631483166515, "grad_norm": 0.7701752072237201, "learning_rate": 3.407140344837722e-06, "loss": 0.0433, "step": 4197 }, { "epoch": 1.9099181073703366, "grad_norm": 0.4825984087303342, "learning_rate": 3.4064743664854853e-06, "loss": 0.0175, "step": 4198 }, { "epoch": 1.9103730664240217, "grad_norm": 0.6354849147728596, "learning_rate": 3.405808314060257e-06, "loss": 0.0345, "step": 4199 }, { "epoch": 1.910828025477707, "grad_norm": 0.4078479759744009, "learning_rate": 3.4051421876164643e-06, "loss": 0.0144, "step": 4200 }, { "epoch": 1.9112829845313923, "grad_norm": 0.44989143236776896, "learning_rate": 3.4044759872085387e-06, "loss": 0.0209, "step": 4201 }, { "epoch": 1.9117379435850772, "grad_norm": 0.6062240583673126, "learning_rate": 3.4038097128909207e-06, "loss": 0.0333, "step": 4202 }, { "epoch": 1.9121929026387625, "grad_norm": 0.960681212375926, "learning_rate": 3.4031433647180547e-06, "loss": 0.0553, "step": 4203 }, { "epoch": 1.9126478616924478, "grad_norm": 0.41136699096874724, "learning_rate": 3.4024769427443916e-06, "loss": 0.0165, "step": 4204 }, { "epoch": 1.9131028207461327, "grad_norm": 0.8050787794722686, "learning_rate": 3.4018104470243866e-06, "loss": 0.045, "step": 4205 }, { "epoch": 1.913557779799818, "grad_norm": 0.6334803312391295, "learning_rate": 3.401143877612506e-06, "loss": 0.0333, "step": 4206 }, { "epoch": 1.9140127388535033, "grad_norm": 0.7974970889991329, "learning_rate": 3.400477234563217e-06, "loss": 0.0279, "step": 4207 }, { "epoch": 1.9144676979071884, "grad_norm": 0.3793253935855811, "learning_rate": 3.3998105179309946e-06, "loss": 0.0119, "step": 4208 }, { "epoch": 1.9149226569608735, "grad_norm": 0.3861316901385662, "learning_rate": 3.399143727770321e-06, "loss": 0.0152, "step": 4209 }, { "epoch": 1.9153776160145588, "grad_norm": 0.5876048857171886, "learning_rate": 3.3984768641356812e-06, "loss": 0.0178, "step": 4210 }, { "epoch": 1.915832575068244, "grad_norm": 0.6400291536141484, "learning_rate": 3.3978099270815714e-06, "loss": 0.0267, "step": 4211 }, { "epoch": 1.916287534121929, "grad_norm": 0.5922713365094958, "learning_rate": 3.3971429166624864e-06, "loss": 0.0183, "step": 4212 }, { "epoch": 1.9167424931756143, "grad_norm": 0.4711307832200224, "learning_rate": 3.3964758329329356e-06, "loss": 0.0232, "step": 4213 }, { "epoch": 1.9171974522292994, "grad_norm": 0.9940377110927839, "learning_rate": 3.3958086759474275e-06, "loss": 0.0431, "step": 4214 }, { "epoch": 1.9176524112829845, "grad_norm": 0.7728067079231176, "learning_rate": 3.395141445760479e-06, "loss": 0.0303, "step": 4215 }, { "epoch": 1.9181073703366698, "grad_norm": 0.5819009065741437, "learning_rate": 3.394474142426616e-06, "loss": 0.0208, "step": 4216 }, { "epoch": 1.9185623293903549, "grad_norm": 0.4852048884638546, "learning_rate": 3.3938067660003636e-06, "loss": 0.0144, "step": 4217 }, { "epoch": 1.91901728844404, "grad_norm": 0.6551728377805718, "learning_rate": 3.3931393165362604e-06, "loss": 0.0238, "step": 4218 }, { "epoch": 1.9194722474977253, "grad_norm": 0.43000794844011364, "learning_rate": 3.3924717940888437e-06, "loss": 0.0168, "step": 4219 }, { "epoch": 1.9199272065514104, "grad_norm": 0.8062989482050673, "learning_rate": 3.391804198712664e-06, "loss": 0.029, "step": 4220 }, { "epoch": 1.9203821656050954, "grad_norm": 0.3825032344853464, "learning_rate": 3.391136530462272e-06, "loss": 0.0152, "step": 4221 }, { "epoch": 1.9208371246587808, "grad_norm": 0.8703479839095586, "learning_rate": 3.390468789392226e-06, "loss": 0.0414, "step": 4222 }, { "epoch": 1.9212920837124658, "grad_norm": 0.30167241449895205, "learning_rate": 3.389800975557093e-06, "loss": 0.0111, "step": 4223 }, { "epoch": 1.921747042766151, "grad_norm": 0.6984593172867893, "learning_rate": 3.389133089011442e-06, "loss": 0.0371, "step": 4224 }, { "epoch": 1.9222020018198362, "grad_norm": 0.4967016910617634, "learning_rate": 3.3884651298098514e-06, "loss": 0.025, "step": 4225 }, { "epoch": 1.9226569608735213, "grad_norm": 0.6260515897726002, "learning_rate": 3.3877970980069015e-06, "loss": 0.0187, "step": 4226 }, { "epoch": 1.9231119199272064, "grad_norm": 1.2059768455462165, "learning_rate": 3.387128993657182e-06, "loss": 0.034, "step": 4227 }, { "epoch": 1.9235668789808917, "grad_norm": 0.5187727129525913, "learning_rate": 3.3864608168152885e-06, "loss": 0.0198, "step": 4228 }, { "epoch": 1.924021838034577, "grad_norm": 0.5425011430746322, "learning_rate": 3.3857925675358187e-06, "loss": 0.0213, "step": 4229 }, { "epoch": 1.924476797088262, "grad_norm": 0.6735009276265433, "learning_rate": 3.3851242458733818e-06, "loss": 0.0291, "step": 4230 }, { "epoch": 1.9249317561419472, "grad_norm": 0.6576273731099586, "learning_rate": 3.3844558518825876e-06, "loss": 0.0322, "step": 4231 }, { "epoch": 1.9253867151956325, "grad_norm": 0.6144427659853936, "learning_rate": 3.383787385618057e-06, "loss": 0.0284, "step": 4232 }, { "epoch": 1.9258416742493174, "grad_norm": 0.6747398920521402, "learning_rate": 3.383118847134411e-06, "loss": 0.0291, "step": 4233 }, { "epoch": 1.9262966333030027, "grad_norm": 0.5844287310540148, "learning_rate": 3.382450236486281e-06, "loss": 0.015, "step": 4234 }, { "epoch": 1.926751592356688, "grad_norm": 0.5479244247667049, "learning_rate": 3.3817815537283033e-06, "loss": 0.023, "step": 4235 }, { "epoch": 1.9272065514103731, "grad_norm": 0.6881320416211005, "learning_rate": 3.381112798915118e-06, "loss": 0.041, "step": 4236 }, { "epoch": 1.9276615104640582, "grad_norm": 0.561062670879708, "learning_rate": 3.3804439721013756e-06, "loss": 0.0125, "step": 4237 }, { "epoch": 1.9281164695177435, "grad_norm": 0.5709066174631013, "learning_rate": 3.379775073341727e-06, "loss": 0.0232, "step": 4238 }, { "epoch": 1.9285714285714286, "grad_norm": 0.6953179620000729, "learning_rate": 3.3791061026908323e-06, "loss": 0.0261, "step": 4239 }, { "epoch": 1.9290263876251137, "grad_norm": 0.6202431322973055, "learning_rate": 3.3784370602033572e-06, "loss": 0.0243, "step": 4240 }, { "epoch": 1.929481346678799, "grad_norm": 0.6796583345406787, "learning_rate": 3.3777679459339717e-06, "loss": 0.0215, "step": 4241 }, { "epoch": 1.929936305732484, "grad_norm": 0.715765508686804, "learning_rate": 3.377098759937355e-06, "loss": 0.0192, "step": 4242 }, { "epoch": 1.9303912647861692, "grad_norm": 0.5302288369823409, "learning_rate": 3.376429502268188e-06, "loss": 0.0195, "step": 4243 }, { "epoch": 1.9308462238398545, "grad_norm": 0.7782561269046792, "learning_rate": 3.3757601729811596e-06, "loss": 0.0304, "step": 4244 }, { "epoch": 1.9313011828935396, "grad_norm": 0.49993090504217613, "learning_rate": 3.3750907721309658e-06, "loss": 0.0193, "step": 4245 }, { "epoch": 1.9317561419472247, "grad_norm": 0.573452913271923, "learning_rate": 3.374421299772305e-06, "loss": 0.0211, "step": 4246 }, { "epoch": 1.93221110100091, "grad_norm": 0.5351357141762232, "learning_rate": 3.373751755959884e-06, "loss": 0.0281, "step": 4247 }, { "epoch": 1.932666060054595, "grad_norm": 0.6073872140510244, "learning_rate": 3.373082140748416e-06, "loss": 0.0326, "step": 4248 }, { "epoch": 1.9331210191082802, "grad_norm": 0.6995372390956109, "learning_rate": 3.3724124541926184e-06, "loss": 0.0342, "step": 4249 }, { "epoch": 1.9335759781619655, "grad_norm": 0.5116810550134904, "learning_rate": 3.3717426963472146e-06, "loss": 0.0085, "step": 4250 }, { "epoch": 1.9340309372156506, "grad_norm": 0.6278414183115113, "learning_rate": 3.371072867266934e-06, "loss": 0.0228, "step": 4251 }, { "epoch": 1.9344858962693356, "grad_norm": 0.5738855728232217, "learning_rate": 3.3704029670065135e-06, "loss": 0.0268, "step": 4252 }, { "epoch": 1.934940855323021, "grad_norm": 0.5834210114389528, "learning_rate": 3.3697329956206927e-06, "loss": 0.0284, "step": 4253 }, { "epoch": 1.935395814376706, "grad_norm": 0.5176209892431944, "learning_rate": 3.3690629531642188e-06, "loss": 0.0162, "step": 4254 }, { "epoch": 1.9358507734303911, "grad_norm": 0.6177709448254487, "learning_rate": 3.3683928396918453e-06, "loss": 0.022, "step": 4255 }, { "epoch": 1.9363057324840764, "grad_norm": 0.7851992845984936, "learning_rate": 3.3677226552583307e-06, "loss": 0.031, "step": 4256 }, { "epoch": 1.9367606915377618, "grad_norm": 0.4834880101354562, "learning_rate": 3.367052399918439e-06, "loss": 0.0129, "step": 4257 }, { "epoch": 1.9372156505914466, "grad_norm": 0.6512788311296245, "learning_rate": 3.3663820737269408e-06, "loss": 0.0165, "step": 4258 }, { "epoch": 1.937670609645132, "grad_norm": 0.6052492982557375, "learning_rate": 3.365711676738612e-06, "loss": 0.0281, "step": 4259 }, { "epoch": 1.9381255686988172, "grad_norm": 0.6382609622642428, "learning_rate": 3.365041209008235e-06, "loss": 0.0367, "step": 4260 }, { "epoch": 1.9385805277525021, "grad_norm": 0.5805969770453554, "learning_rate": 3.3643706705905967e-06, "loss": 0.0169, "step": 4261 }, { "epoch": 1.9390354868061874, "grad_norm": 0.5401694286369224, "learning_rate": 3.3637000615404907e-06, "loss": 0.0228, "step": 4262 }, { "epoch": 1.9394904458598727, "grad_norm": 0.6647888488203181, "learning_rate": 3.3630293819127157e-06, "loss": 0.0172, "step": 4263 }, { "epoch": 1.9399454049135578, "grad_norm": 0.6829584993932576, "learning_rate": 3.362358631762077e-06, "loss": 0.0246, "step": 4264 }, { "epoch": 1.940400363967243, "grad_norm": 0.6249835340811345, "learning_rate": 3.361687811143386e-06, "loss": 0.0295, "step": 4265 }, { "epoch": 1.9408553230209282, "grad_norm": 0.6366544680783275, "learning_rate": 3.3610169201114586e-06, "loss": 0.0296, "step": 4266 }, { "epoch": 1.9413102820746133, "grad_norm": 0.4947392897634404, "learning_rate": 3.360345958721116e-06, "loss": 0.0149, "step": 4267 }, { "epoch": 1.9417652411282984, "grad_norm": 0.44981862364112446, "learning_rate": 3.3596749270271868e-06, "loss": 0.0209, "step": 4268 }, { "epoch": 1.9422202001819837, "grad_norm": 0.9848702428830935, "learning_rate": 3.3590038250845052e-06, "loss": 0.0414, "step": 4269 }, { "epoch": 1.9426751592356688, "grad_norm": 0.7660503122629815, "learning_rate": 3.3583326529479103e-06, "loss": 0.0334, "step": 4270 }, { "epoch": 1.943130118289354, "grad_norm": 0.5021528866709839, "learning_rate": 3.3576614106722473e-06, "loss": 0.0245, "step": 4271 }, { "epoch": 1.9435850773430392, "grad_norm": 0.47748204166296565, "learning_rate": 3.356990098312366e-06, "loss": 0.0169, "step": 4272 }, { "epoch": 1.9440400363967243, "grad_norm": 0.8232070551896792, "learning_rate": 3.3563187159231255e-06, "loss": 0.0319, "step": 4273 }, { "epoch": 1.9444949954504094, "grad_norm": 0.958283463376367, "learning_rate": 3.355647263559386e-06, "loss": 0.0621, "step": 4274 }, { "epoch": 1.9449499545040947, "grad_norm": 0.37429757161301375, "learning_rate": 3.354975741276016e-06, "loss": 0.0147, "step": 4275 }, { "epoch": 1.9454049135577798, "grad_norm": 1.2533464987225345, "learning_rate": 3.354304149127889e-06, "loss": 0.0142, "step": 4276 }, { "epoch": 1.9458598726114649, "grad_norm": 0.6645953851128645, "learning_rate": 3.353632487169886e-06, "loss": 0.0283, "step": 4277 }, { "epoch": 1.9463148316651502, "grad_norm": 0.5433622092921354, "learning_rate": 3.3529607554568904e-06, "loss": 0.0305, "step": 4278 }, { "epoch": 1.9467697907188353, "grad_norm": 0.6429933767991537, "learning_rate": 3.3522889540437946e-06, "loss": 0.0339, "step": 4279 }, { "epoch": 1.9472247497725204, "grad_norm": 0.6745814916059572, "learning_rate": 3.3516170829854938e-06, "loss": 0.0213, "step": 4280 }, { "epoch": 1.9476797088262057, "grad_norm": 0.5513278763274229, "learning_rate": 3.350945142336891e-06, "loss": 0.0237, "step": 4281 }, { "epoch": 1.9481346678798908, "grad_norm": 0.5096711537908712, "learning_rate": 3.3502731321528936e-06, "loss": 0.0217, "step": 4282 }, { "epoch": 1.9485896269335758, "grad_norm": 0.7349564964616183, "learning_rate": 3.349601052488416e-06, "loss": 0.0228, "step": 4283 }, { "epoch": 1.9490445859872612, "grad_norm": 0.6166644471186713, "learning_rate": 3.3489289033983767e-06, "loss": 0.0239, "step": 4284 }, { "epoch": 1.9494995450409465, "grad_norm": 0.6598939511976858, "learning_rate": 3.3482566849377017e-06, "loss": 0.0342, "step": 4285 }, { "epoch": 1.9499545040946313, "grad_norm": 0.5769006183244888, "learning_rate": 3.347584397161321e-06, "loss": 0.0185, "step": 4286 }, { "epoch": 1.9504094631483166, "grad_norm": 0.4647907906699717, "learning_rate": 3.3469120401241705e-06, "loss": 0.0155, "step": 4287 }, { "epoch": 1.950864422202002, "grad_norm": 0.5111914647749481, "learning_rate": 3.3462396138811936e-06, "loss": 0.0216, "step": 4288 }, { "epoch": 1.9513193812556868, "grad_norm": 0.5485598484984867, "learning_rate": 3.3455671184873363e-06, "loss": 0.022, "step": 4289 }, { "epoch": 1.9517743403093721, "grad_norm": 1.0454297591292132, "learning_rate": 3.3448945539975532e-06, "loss": 0.0499, "step": 4290 }, { "epoch": 1.9522292993630574, "grad_norm": 0.6411814196521031, "learning_rate": 3.3442219204668024e-06, "loss": 0.0257, "step": 4291 }, { "epoch": 1.9526842584167425, "grad_norm": 0.6196772124906079, "learning_rate": 3.3435492179500482e-06, "loss": 0.0333, "step": 4292 }, { "epoch": 1.9531392174704276, "grad_norm": 0.5046167759230185, "learning_rate": 3.3428764465022623e-06, "loss": 0.0215, "step": 4293 }, { "epoch": 1.953594176524113, "grad_norm": 0.8543909352677298, "learning_rate": 3.342203606178419e-06, "loss": 0.0331, "step": 4294 }, { "epoch": 1.954049135577798, "grad_norm": 1.0431436718052118, "learning_rate": 3.341530697033501e-06, "loss": 0.0371, "step": 4295 }, { "epoch": 1.9545040946314831, "grad_norm": 0.8412627755889172, "learning_rate": 3.3408577191224938e-06, "loss": 0.0346, "step": 4296 }, { "epoch": 1.9549590536851684, "grad_norm": 0.5010992262253349, "learning_rate": 3.3401846725003916e-06, "loss": 0.0261, "step": 4297 }, { "epoch": 1.9554140127388535, "grad_norm": 0.9398211069222777, "learning_rate": 3.3395115572221927e-06, "loss": 0.0464, "step": 4298 }, { "epoch": 1.9558689717925386, "grad_norm": 0.5887913235316075, "learning_rate": 3.3388383733428987e-06, "loss": 0.0293, "step": 4299 }, { "epoch": 1.956323930846224, "grad_norm": 0.41389463818430977, "learning_rate": 3.3381651209175224e-06, "loss": 0.0122, "step": 4300 }, { "epoch": 1.956778889899909, "grad_norm": 0.557371532680147, "learning_rate": 3.3374918000010776e-06, "loss": 0.0179, "step": 4301 }, { "epoch": 1.957233848953594, "grad_norm": 0.6513620994945504, "learning_rate": 3.336818410648585e-06, "loss": 0.0308, "step": 4302 }, { "epoch": 1.9576888080072794, "grad_norm": 0.4039404050164459, "learning_rate": 3.3361449529150706e-06, "loss": 0.0136, "step": 4303 }, { "epoch": 1.9581437670609645, "grad_norm": 0.588337522384382, "learning_rate": 3.3354714268555668e-06, "loss": 0.022, "step": 4304 }, { "epoch": 1.9585987261146496, "grad_norm": 0.6678853246614891, "learning_rate": 3.3347978325251113e-06, "loss": 0.0251, "step": 4305 }, { "epoch": 1.959053685168335, "grad_norm": 0.5017559834277505, "learning_rate": 3.3341241699787456e-06, "loss": 0.0228, "step": 4306 }, { "epoch": 1.95950864422202, "grad_norm": 0.5575804878967321, "learning_rate": 3.3334504392715205e-06, "loss": 0.0263, "step": 4307 }, { "epoch": 1.959963603275705, "grad_norm": 0.6372454111997727, "learning_rate": 3.3327766404584892e-06, "loss": 0.0297, "step": 4308 }, { "epoch": 1.9604185623293904, "grad_norm": 0.7857180260485226, "learning_rate": 3.332102773594712e-06, "loss": 0.0394, "step": 4309 }, { "epoch": 1.9608735213830755, "grad_norm": 0.32858056176760797, "learning_rate": 3.331428838735254e-06, "loss": 0.0148, "step": 4310 }, { "epoch": 1.9613284804367606, "grad_norm": 0.5601571730665277, "learning_rate": 3.330754835935185e-06, "loss": 0.0259, "step": 4311 }, { "epoch": 1.9617834394904459, "grad_norm": 0.5661031503565735, "learning_rate": 3.330080765249584e-06, "loss": 0.0246, "step": 4312 }, { "epoch": 1.9622383985441312, "grad_norm": 0.717795957824697, "learning_rate": 3.32940662673353e-06, "loss": 0.0359, "step": 4313 }, { "epoch": 1.962693357597816, "grad_norm": 0.6814739733378367, "learning_rate": 3.3287324204421125e-06, "loss": 0.0358, "step": 4314 }, { "epoch": 1.9631483166515014, "grad_norm": 0.5987102461548224, "learning_rate": 3.3280581464304244e-06, "loss": 0.0325, "step": 4315 }, { "epoch": 1.9636032757051867, "grad_norm": 0.7588223317310043, "learning_rate": 3.3273838047535635e-06, "loss": 0.0417, "step": 4316 }, { "epoch": 1.9640582347588715, "grad_norm": 0.5594775907579351, "learning_rate": 3.326709395466635e-06, "loss": 0.0192, "step": 4317 }, { "epoch": 1.9645131938125568, "grad_norm": 0.4500026640500846, "learning_rate": 3.3260349186247476e-06, "loss": 0.0147, "step": 4318 }, { "epoch": 1.9649681528662422, "grad_norm": 0.6758378067095314, "learning_rate": 3.3253603742830174e-06, "loss": 0.0354, "step": 4319 }, { "epoch": 1.9654231119199272, "grad_norm": 0.6931100015001628, "learning_rate": 3.3246857624965645e-06, "loss": 0.0218, "step": 4320 }, { "epoch": 1.9658780709736123, "grad_norm": 7.764783371657491, "learning_rate": 3.324011083320515e-06, "loss": 0.1095, "step": 4321 }, { "epoch": 1.9663330300272976, "grad_norm": 0.6309964744535191, "learning_rate": 3.3233363368100025e-06, "loss": 0.0328, "step": 4322 }, { "epoch": 1.9667879890809827, "grad_norm": 0.7601811858062737, "learning_rate": 3.3226615230201613e-06, "loss": 0.0257, "step": 4323 }, { "epoch": 1.9672429481346678, "grad_norm": 0.6102379044944203, "learning_rate": 3.3219866420061356e-06, "loss": 0.0221, "step": 4324 }, { "epoch": 1.9676979071883531, "grad_norm": 0.49936480479648493, "learning_rate": 3.321311693823074e-06, "loss": 0.0211, "step": 4325 }, { "epoch": 1.9681528662420382, "grad_norm": 0.6131134771246745, "learning_rate": 3.32063667852613e-06, "loss": 0.0273, "step": 4326 }, { "epoch": 1.9686078252957233, "grad_norm": 0.5413823317993313, "learning_rate": 3.3199615961704616e-06, "loss": 0.0203, "step": 4327 }, { "epoch": 1.9690627843494086, "grad_norm": 0.5496315500040164, "learning_rate": 3.319286446811235e-06, "loss": 0.0229, "step": 4328 }, { "epoch": 1.9695177434030937, "grad_norm": 0.5156918674674653, "learning_rate": 3.3186112305036205e-06, "loss": 0.0237, "step": 4329 }, { "epoch": 1.9699727024567788, "grad_norm": 0.4870175434472783, "learning_rate": 3.3179359473027923e-06, "loss": 0.0239, "step": 4330 }, { "epoch": 1.9704276615104641, "grad_norm": 0.468021500823563, "learning_rate": 3.3172605972639326e-06, "loss": 0.0172, "step": 4331 }, { "epoch": 1.9708826205641492, "grad_norm": 0.43102805170585023, "learning_rate": 3.3165851804422276e-06, "loss": 0.0175, "step": 4332 }, { "epoch": 1.9713375796178343, "grad_norm": 0.5145279437337631, "learning_rate": 3.3159096968928688e-06, "loss": 0.0234, "step": 4333 }, { "epoch": 1.9717925386715196, "grad_norm": 0.6462766025914175, "learning_rate": 3.3152341466710547e-06, "loss": 0.0322, "step": 4334 }, { "epoch": 1.9722474977252047, "grad_norm": 0.8854114243754025, "learning_rate": 3.3145585298319873e-06, "loss": 0.0401, "step": 4335 }, { "epoch": 1.9727024567788898, "grad_norm": 0.882681690932464, "learning_rate": 3.313882846430876e-06, "loss": 0.0428, "step": 4336 }, { "epoch": 1.973157415832575, "grad_norm": 0.626261632483024, "learning_rate": 3.3132070965229334e-06, "loss": 0.0265, "step": 4337 }, { "epoch": 1.9736123748862604, "grad_norm": 0.781042591192755, "learning_rate": 3.312531280163379e-06, "loss": 0.0343, "step": 4338 }, { "epoch": 1.9740673339399453, "grad_norm": 0.5909207942974861, "learning_rate": 3.3118553974074392e-06, "loss": 0.0264, "step": 4339 }, { "epoch": 1.9745222929936306, "grad_norm": 0.4846287779691759, "learning_rate": 3.311179448310341e-06, "loss": 0.0174, "step": 4340 }, { "epoch": 1.974977252047316, "grad_norm": 0.3752301355907516, "learning_rate": 3.3105034329273224e-06, "loss": 0.0144, "step": 4341 }, { "epoch": 1.9754322111010008, "grad_norm": 0.6221312045676485, "learning_rate": 3.309827351313623e-06, "loss": 0.0266, "step": 4342 }, { "epoch": 1.975887170154686, "grad_norm": 0.6314072118752829, "learning_rate": 3.30915120352449e-06, "loss": 0.0253, "step": 4343 }, { "epoch": 1.9763421292083714, "grad_norm": 0.7433456203747467, "learning_rate": 3.3084749896151746e-06, "loss": 0.0424, "step": 4344 }, { "epoch": 1.9767970882620565, "grad_norm": 0.47141143713058037, "learning_rate": 3.3077987096409335e-06, "loss": 0.0162, "step": 4345 }, { "epoch": 1.9772520473157416, "grad_norm": 0.7428783590530981, "learning_rate": 3.3071223636570316e-06, "loss": 0.0315, "step": 4346 }, { "epoch": 1.9777070063694269, "grad_norm": 0.7100086277527677, "learning_rate": 3.306445951718733e-06, "loss": 0.034, "step": 4347 }, { "epoch": 1.978161965423112, "grad_norm": 0.6375852991687883, "learning_rate": 3.305769473881314e-06, "loss": 0.0261, "step": 4348 }, { "epoch": 1.978616924476797, "grad_norm": 0.5515261500214916, "learning_rate": 3.305092930200053e-06, "loss": 0.0203, "step": 4349 }, { "epoch": 1.9790718835304824, "grad_norm": 0.6423390254314216, "learning_rate": 3.3044163207302326e-06, "loss": 0.0325, "step": 4350 }, { "epoch": 1.9795268425841674, "grad_norm": 0.5000491058312723, "learning_rate": 3.303739645527144e-06, "loss": 0.0218, "step": 4351 }, { "epoch": 1.9799818016378525, "grad_norm": 0.47539358935771, "learning_rate": 3.3030629046460798e-06, "loss": 0.0176, "step": 4352 }, { "epoch": 1.9804367606915378, "grad_norm": 0.5700082287000012, "learning_rate": 3.3023860981423427e-06, "loss": 0.0198, "step": 4353 }, { "epoch": 1.980891719745223, "grad_norm": 0.9118136709481944, "learning_rate": 3.3017092260712375e-06, "loss": 0.0311, "step": 4354 }, { "epoch": 1.981346678798908, "grad_norm": 0.5790852400808595, "learning_rate": 3.301032288488074e-06, "loss": 0.0215, "step": 4355 }, { "epoch": 1.9818016378525933, "grad_norm": 0.7431348463550917, "learning_rate": 3.3003552854481703e-06, "loss": 0.0318, "step": 4356 }, { "epoch": 1.9822565969062784, "grad_norm": 0.948461270917527, "learning_rate": 3.2996782170068457e-06, "loss": 0.0367, "step": 4357 }, { "epoch": 1.9827115559599635, "grad_norm": 0.3708330888393237, "learning_rate": 3.2990010832194297e-06, "loss": 0.0153, "step": 4358 }, { "epoch": 1.9831665150136488, "grad_norm": 0.7194534284448381, "learning_rate": 3.2983238841412526e-06, "loss": 0.0324, "step": 4359 }, { "epoch": 1.983621474067334, "grad_norm": 0.7404170755190087, "learning_rate": 3.2976466198276535e-06, "loss": 0.0323, "step": 4360 }, { "epoch": 1.984076433121019, "grad_norm": 0.7256576598351984, "learning_rate": 3.2969692903339746e-06, "loss": 0.0292, "step": 4361 }, { "epoch": 1.9845313921747043, "grad_norm": 0.8029213769261921, "learning_rate": 3.296291895715564e-06, "loss": 0.0355, "step": 4362 }, { "epoch": 1.9849863512283894, "grad_norm": 0.6832073246232914, "learning_rate": 3.2956144360277764e-06, "loss": 0.0207, "step": 4363 }, { "epoch": 1.9854413102820745, "grad_norm": 0.7675457792367372, "learning_rate": 3.2949369113259698e-06, "loss": 0.0338, "step": 4364 }, { "epoch": 1.9858962693357598, "grad_norm": 0.5539928879546419, "learning_rate": 3.294259321665509e-06, "loss": 0.0199, "step": 4365 }, { "epoch": 1.9863512283894451, "grad_norm": 0.571666548475762, "learning_rate": 3.2935816671017627e-06, "loss": 0.025, "step": 4366 }, { "epoch": 1.98680618744313, "grad_norm": 0.5267462607036049, "learning_rate": 3.292903947690106e-06, "loss": 0.0205, "step": 4367 }, { "epoch": 1.9872611464968153, "grad_norm": 0.7640953508829946, "learning_rate": 3.2922261634859205e-06, "loss": 0.0402, "step": 4368 }, { "epoch": 1.9877161055505006, "grad_norm": 0.5895671819156529, "learning_rate": 3.291548314544589e-06, "loss": 0.0254, "step": 4369 }, { "epoch": 1.9881710646041855, "grad_norm": 0.5456299437545727, "learning_rate": 3.2908704009215053e-06, "loss": 0.019, "step": 4370 }, { "epoch": 1.9886260236578708, "grad_norm": 0.6847805281816779, "learning_rate": 3.290192422672064e-06, "loss": 0.0253, "step": 4371 }, { "epoch": 1.989080982711556, "grad_norm": 0.6304491758136077, "learning_rate": 3.289514379851666e-06, "loss": 0.0191, "step": 4372 }, { "epoch": 1.9895359417652412, "grad_norm": 0.5309288385146096, "learning_rate": 3.288836272515718e-06, "loss": 0.023, "step": 4373 }, { "epoch": 1.9899909008189263, "grad_norm": 0.4584138427830615, "learning_rate": 3.288158100719632e-06, "loss": 0.0196, "step": 4374 }, { "epoch": 1.9904458598726116, "grad_norm": 0.749801243596037, "learning_rate": 3.2874798645188264e-06, "loss": 0.0252, "step": 4375 }, { "epoch": 1.9909008189262967, "grad_norm": 0.623764963498254, "learning_rate": 3.2868015639687214e-06, "loss": 0.0262, "step": 4376 }, { "epoch": 1.9913557779799818, "grad_norm": 0.5504411510263656, "learning_rate": 3.286123199124746e-06, "loss": 0.0216, "step": 4377 }, { "epoch": 1.991810737033667, "grad_norm": 0.7372513035218227, "learning_rate": 3.285444770042333e-06, "loss": 0.0275, "step": 4378 }, { "epoch": 1.9922656960873522, "grad_norm": 0.6494284454283802, "learning_rate": 3.2847662767769206e-06, "loss": 0.028, "step": 4379 }, { "epoch": 1.9927206551410372, "grad_norm": 0.6872930443038594, "learning_rate": 3.284087719383952e-06, "loss": 0.0258, "step": 4380 }, { "epoch": 1.9931756141947226, "grad_norm": 0.5902006860326527, "learning_rate": 3.2834090979188754e-06, "loss": 0.0246, "step": 4381 }, { "epoch": 1.9936305732484076, "grad_norm": 0.5884187417000288, "learning_rate": 3.2827304124371462e-06, "loss": 0.0289, "step": 4382 }, { "epoch": 1.9940855323020927, "grad_norm": 0.6432639388617688, "learning_rate": 3.282051662994221e-06, "loss": 0.0317, "step": 4383 }, { "epoch": 1.994540491355778, "grad_norm": 0.9126044235499592, "learning_rate": 3.2813728496455667e-06, "loss": 0.0243, "step": 4384 }, { "epoch": 1.9949954504094631, "grad_norm": 0.5621830996553333, "learning_rate": 3.280693972446652e-06, "loss": 0.0224, "step": 4385 }, { "epoch": 1.9954504094631482, "grad_norm": 0.47277487956010905, "learning_rate": 3.2800150314529504e-06, "loss": 0.0162, "step": 4386 }, { "epoch": 1.9959053685168335, "grad_norm": 0.6297455179929636, "learning_rate": 3.279336026719944e-06, "loss": 0.0297, "step": 4387 }, { "epoch": 1.9963603275705186, "grad_norm": 0.6726406410189831, "learning_rate": 3.278656958303116e-06, "loss": 0.0277, "step": 4388 }, { "epoch": 1.9968152866242037, "grad_norm": 0.7361812163803645, "learning_rate": 3.277977826257959e-06, "loss": 0.0156, "step": 4389 }, { "epoch": 1.997270245677889, "grad_norm": 0.6642440742683662, "learning_rate": 3.277298630639966e-06, "loss": 0.0326, "step": 4390 }, { "epoch": 1.9977252047315741, "grad_norm": 0.5592126803491151, "learning_rate": 3.276619371504639e-06, "loss": 0.0211, "step": 4391 }, { "epoch": 1.9981801637852592, "grad_norm": 0.5616946078430344, "learning_rate": 3.2759400489074855e-06, "loss": 0.0232, "step": 4392 }, { "epoch": 1.9986351228389445, "grad_norm": 0.6703444610758778, "learning_rate": 3.2752606629040146e-06, "loss": 0.0357, "step": 4393 }, { "epoch": 1.9990900818926298, "grad_norm": 0.5486620110873328, "learning_rate": 3.2745812135497433e-06, "loss": 0.0245, "step": 4394 }, { "epoch": 1.9995450409463147, "grad_norm": 0.6090582209593255, "learning_rate": 3.2739017009001927e-06, "loss": 0.0267, "step": 4395 }, { "epoch": 2.0, "grad_norm": 0.6855263907774384, "learning_rate": 3.2732221250108915e-06, "loss": 0.026, "step": 4396 }, { "epoch": 2.0004549590536853, "grad_norm": 0.4357924685708123, "learning_rate": 3.272542485937369e-06, "loss": 0.0103, "step": 4397 }, { "epoch": 2.00090991810737, "grad_norm": 0.3591731045264976, "learning_rate": 3.2718627837351634e-06, "loss": 0.0105, "step": 4398 }, { "epoch": 2.0013648771610555, "grad_norm": 0.2919384663417581, "learning_rate": 3.271183018459817e-06, "loss": 0.0071, "step": 4399 }, { "epoch": 2.001819836214741, "grad_norm": 0.33675279153952553, "learning_rate": 3.270503190166877e-06, "loss": 0.0078, "step": 4400 }, { "epoch": 2.0022747952684257, "grad_norm": 0.4672273073958976, "learning_rate": 3.2698232989118954e-06, "loss": 0.0166, "step": 4401 }, { "epoch": 2.002729754322111, "grad_norm": 0.3388373129922536, "learning_rate": 3.2691433447504312e-06, "loss": 0.011, "step": 4402 }, { "epoch": 2.0031847133757963, "grad_norm": 0.394634745607115, "learning_rate": 3.268463327738046e-06, "loss": 0.014, "step": 4403 }, { "epoch": 2.003639672429481, "grad_norm": 0.36620347323207775, "learning_rate": 3.2677832479303075e-06, "loss": 0.0099, "step": 4404 }, { "epoch": 2.0040946314831665, "grad_norm": 0.27710540453522575, "learning_rate": 3.2671031053827896e-06, "loss": 0.0099, "step": 4405 }, { "epoch": 2.0045495905368518, "grad_norm": 0.41791249802243097, "learning_rate": 3.26642290015107e-06, "loss": 0.0159, "step": 4406 }, { "epoch": 2.0050045495905366, "grad_norm": 0.3703648514392052, "learning_rate": 3.265742632290732e-06, "loss": 0.0102, "step": 4407 }, { "epoch": 2.005459508644222, "grad_norm": 0.39406043876781427, "learning_rate": 3.2650623018573644e-06, "loss": 0.0128, "step": 4408 }, { "epoch": 2.0059144676979073, "grad_norm": 0.25846455198471907, "learning_rate": 3.2643819089065608e-06, "loss": 0.0078, "step": 4409 }, { "epoch": 2.0063694267515926, "grad_norm": 0.3743134486214678, "learning_rate": 3.263701453493919e-06, "loss": 0.0096, "step": 4410 }, { "epoch": 2.0068243858052774, "grad_norm": 0.7398271581185867, "learning_rate": 3.263020935675043e-06, "loss": 0.0221, "step": 4411 }, { "epoch": 2.0072793448589628, "grad_norm": 0.48918584492047773, "learning_rate": 3.2623403555055423e-06, "loss": 0.0168, "step": 4412 }, { "epoch": 2.007734303912648, "grad_norm": 0.4639083725586363, "learning_rate": 3.261659713041031e-06, "loss": 0.0094, "step": 4413 }, { "epoch": 2.008189262966333, "grad_norm": 0.45358164949007385, "learning_rate": 3.2609790083371266e-06, "loss": 0.0141, "step": 4414 }, { "epoch": 2.0086442220200182, "grad_norm": 0.5072099930308094, "learning_rate": 3.260298241449455e-06, "loss": 0.0105, "step": 4415 }, { "epoch": 2.0090991810737036, "grad_norm": 0.2969350983390757, "learning_rate": 3.259617412433644e-06, "loss": 0.0087, "step": 4416 }, { "epoch": 2.0095541401273884, "grad_norm": 0.22801632240731717, "learning_rate": 3.258936521345329e-06, "loss": 0.0054, "step": 4417 }, { "epoch": 2.0100090991810737, "grad_norm": 0.41462607147262476, "learning_rate": 3.2582555682401486e-06, "loss": 0.0088, "step": 4418 }, { "epoch": 2.010464058234759, "grad_norm": 0.3534390172869506, "learning_rate": 3.2575745531737475e-06, "loss": 0.0059, "step": 4419 }, { "epoch": 2.010919017288444, "grad_norm": 0.4506441363483451, "learning_rate": 3.2568934762017744e-06, "loss": 0.009, "step": 4420 }, { "epoch": 2.011373976342129, "grad_norm": 0.45761211551784764, "learning_rate": 3.256212337379886e-06, "loss": 0.0114, "step": 4421 }, { "epoch": 2.0118289353958145, "grad_norm": 0.41139946975458375, "learning_rate": 3.255531136763739e-06, "loss": 0.0088, "step": 4422 }, { "epoch": 2.0122838944494994, "grad_norm": 0.3552583631723109, "learning_rate": 3.2548498744089996e-06, "loss": 0.0059, "step": 4423 }, { "epoch": 2.0127388535031847, "grad_norm": 0.6497249916795375, "learning_rate": 3.2541685503713377e-06, "loss": 0.0138, "step": 4424 }, { "epoch": 2.01319381255687, "grad_norm": 0.2596359231263455, "learning_rate": 3.2534871647064275e-06, "loss": 0.0053, "step": 4425 }, { "epoch": 2.013648771610555, "grad_norm": 0.28950115969119605, "learning_rate": 3.252805717469949e-06, "loss": 0.0093, "step": 4426 }, { "epoch": 2.01410373066424, "grad_norm": 0.40072172954375107, "learning_rate": 3.252124208717587e-06, "loss": 0.0108, "step": 4427 }, { "epoch": 2.0145586897179255, "grad_norm": 0.40288074377564126, "learning_rate": 3.2514426385050313e-06, "loss": 0.0108, "step": 4428 }, { "epoch": 2.0150136487716104, "grad_norm": 0.3862828957940289, "learning_rate": 3.2507610068879756e-06, "loss": 0.0073, "step": 4429 }, { "epoch": 2.0154686078252957, "grad_norm": 0.3136238636916741, "learning_rate": 3.2500793139221227e-06, "loss": 0.0073, "step": 4430 }, { "epoch": 2.015923566878981, "grad_norm": 0.2308996134097455, "learning_rate": 3.249397559663174e-06, "loss": 0.0052, "step": 4431 }, { "epoch": 2.016378525932666, "grad_norm": 0.4850385395159559, "learning_rate": 3.2487157441668416e-06, "loss": 0.0145, "step": 4432 }, { "epoch": 2.016833484986351, "grad_norm": 0.8169310702720827, "learning_rate": 3.2480338674888403e-06, "loss": 0.024, "step": 4433 }, { "epoch": 2.0172884440400365, "grad_norm": 0.2678749141302413, "learning_rate": 3.247351929684889e-06, "loss": 0.0038, "step": 4434 }, { "epoch": 2.0177434030937214, "grad_norm": 0.6337198819343365, "learning_rate": 3.246669930810713e-06, "loss": 0.0122, "step": 4435 }, { "epoch": 2.0181983621474067, "grad_norm": 0.45910198460892115, "learning_rate": 3.2459878709220417e-06, "loss": 0.0078, "step": 4436 }, { "epoch": 2.018653321201092, "grad_norm": 0.5199366715260265, "learning_rate": 3.245305750074611e-06, "loss": 0.0133, "step": 4437 }, { "epoch": 2.0191082802547773, "grad_norm": 0.3613175950637311, "learning_rate": 3.2446235683241605e-06, "loss": 0.0071, "step": 4438 }, { "epoch": 2.019563239308462, "grad_norm": 0.3298535064307309, "learning_rate": 3.2439413257264335e-06, "loss": 0.0111, "step": 4439 }, { "epoch": 2.0200181983621475, "grad_norm": 0.5896910664332702, "learning_rate": 3.243259022337182e-06, "loss": 0.0172, "step": 4440 }, { "epoch": 2.0204731574158328, "grad_norm": 0.5096225118347937, "learning_rate": 3.2425766582121592e-06, "loss": 0.0133, "step": 4441 }, { "epoch": 2.0209281164695176, "grad_norm": 0.38609263061806165, "learning_rate": 3.2418942334071255e-06, "loss": 0.0068, "step": 4442 }, { "epoch": 2.021383075523203, "grad_norm": 0.4625117159206764, "learning_rate": 3.241211747977846e-06, "loss": 0.0111, "step": 4443 }, { "epoch": 2.0218380345768883, "grad_norm": 0.5209786996603972, "learning_rate": 3.2405292019800888e-06, "loss": 0.0162, "step": 4444 }, { "epoch": 2.022292993630573, "grad_norm": 0.9463247870156444, "learning_rate": 3.2398465954696302e-06, "loss": 0.0313, "step": 4445 }, { "epoch": 2.0227479526842584, "grad_norm": 0.8296941679133147, "learning_rate": 3.239163928502248e-06, "loss": 0.0271, "step": 4446 }, { "epoch": 2.0232029117379438, "grad_norm": 0.44098130530549506, "learning_rate": 3.2384812011337286e-06, "loss": 0.0118, "step": 4447 }, { "epoch": 2.0236578707916286, "grad_norm": 0.30401377200356544, "learning_rate": 3.237798413419859e-06, "loss": 0.0059, "step": 4448 }, { "epoch": 2.024112829845314, "grad_norm": 0.47106659413419355, "learning_rate": 3.2371155654164365e-06, "loss": 0.0154, "step": 4449 }, { "epoch": 2.0245677888989992, "grad_norm": 0.5050955241114093, "learning_rate": 3.2364326571792583e-06, "loss": 0.012, "step": 4450 }, { "epoch": 2.025022747952684, "grad_norm": 0.3833619239751391, "learning_rate": 3.2357496887641288e-06, "loss": 0.0055, "step": 4451 }, { "epoch": 2.0254777070063694, "grad_norm": 0.534398471315064, "learning_rate": 3.235066660226858e-06, "loss": 0.0164, "step": 4452 }, { "epoch": 2.0259326660600547, "grad_norm": 0.48288708797424224, "learning_rate": 3.2343835716232576e-06, "loss": 0.0147, "step": 4453 }, { "epoch": 2.0263876251137396, "grad_norm": 0.4065014355548966, "learning_rate": 3.2337004230091497e-06, "loss": 0.0114, "step": 4454 }, { "epoch": 2.026842584167425, "grad_norm": 0.5273761753674635, "learning_rate": 3.2330172144403565e-06, "loss": 0.0179, "step": 4455 }, { "epoch": 2.02729754322111, "grad_norm": 0.5316131918540286, "learning_rate": 3.2323339459727065e-06, "loss": 0.0146, "step": 4456 }, { "epoch": 2.027752502274795, "grad_norm": 0.2762388739425338, "learning_rate": 3.231650617662033e-06, "loss": 0.0066, "step": 4457 }, { "epoch": 2.0282074613284804, "grad_norm": 0.3541393957079231, "learning_rate": 3.2309672295641757e-06, "loss": 0.0096, "step": 4458 }, { "epoch": 2.0286624203821657, "grad_norm": 0.4562329675555478, "learning_rate": 3.230283781734978e-06, "loss": 0.0094, "step": 4459 }, { "epoch": 2.0291173794358506, "grad_norm": 0.5238042228445258, "learning_rate": 3.229600274230287e-06, "loss": 0.0169, "step": 4460 }, { "epoch": 2.029572338489536, "grad_norm": 0.41182159943946656, "learning_rate": 3.2289167071059565e-06, "loss": 0.012, "step": 4461 }, { "epoch": 2.030027297543221, "grad_norm": 0.366931462792821, "learning_rate": 3.2282330804178447e-06, "loss": 0.0079, "step": 4462 }, { "epoch": 2.030482256596906, "grad_norm": 0.6158917895037452, "learning_rate": 3.227549394221814e-06, "loss": 0.0128, "step": 4463 }, { "epoch": 2.0309372156505914, "grad_norm": 0.19698256537064404, "learning_rate": 3.226865648573732e-06, "loss": 0.004, "step": 4464 }, { "epoch": 2.0313921747042767, "grad_norm": 0.3179271585858685, "learning_rate": 3.226181843529472e-06, "loss": 0.0067, "step": 4465 }, { "epoch": 2.031847133757962, "grad_norm": 0.6300086096412626, "learning_rate": 3.2254979791449115e-06, "loss": 0.0244, "step": 4466 }, { "epoch": 2.032302092811647, "grad_norm": 0.595460238353192, "learning_rate": 3.224814055475932e-06, "loss": 0.0177, "step": 4467 }, { "epoch": 2.032757051865332, "grad_norm": 0.532750798920976, "learning_rate": 3.224130072578421e-06, "loss": 0.0105, "step": 4468 }, { "epoch": 2.0332120109190175, "grad_norm": 0.3131508255091208, "learning_rate": 3.2234460305082717e-06, "loss": 0.0064, "step": 4469 }, { "epoch": 2.0336669699727024, "grad_norm": 0.2783195812532853, "learning_rate": 3.2227619293213784e-06, "loss": 0.0064, "step": 4470 }, { "epoch": 2.0341219290263877, "grad_norm": 0.2845262265014905, "learning_rate": 3.222077769073645e-06, "loss": 0.0103, "step": 4471 }, { "epoch": 2.034576888080073, "grad_norm": 0.4190674634402976, "learning_rate": 3.221393549820977e-06, "loss": 0.0095, "step": 4472 }, { "epoch": 2.035031847133758, "grad_norm": 0.7291613434039592, "learning_rate": 3.2207092716192863e-06, "loss": 0.0215, "step": 4473 }, { "epoch": 2.035486806187443, "grad_norm": 0.2694523574447536, "learning_rate": 3.2200249345244876e-06, "loss": 0.0051, "step": 4474 }, { "epoch": 2.0359417652411285, "grad_norm": 0.7612977549254607, "learning_rate": 3.2193405385925035e-06, "loss": 0.0245, "step": 4475 }, { "epoch": 2.0363967242948133, "grad_norm": 0.3545469486085853, "learning_rate": 3.21865608387926e-06, "loss": 0.0079, "step": 4476 }, { "epoch": 2.0368516833484986, "grad_norm": 0.40692573692285866, "learning_rate": 3.2179715704406853e-06, "loss": 0.0086, "step": 4477 }, { "epoch": 2.037306642402184, "grad_norm": 0.3653118014079669, "learning_rate": 3.2172869983327164e-06, "loss": 0.0055, "step": 4478 }, { "epoch": 2.037761601455869, "grad_norm": 0.38178242607499613, "learning_rate": 3.216602367611294e-06, "loss": 0.01, "step": 4479 }, { "epoch": 2.038216560509554, "grad_norm": 0.42314939016602504, "learning_rate": 3.215917678332362e-06, "loss": 0.0118, "step": 4480 }, { "epoch": 2.0386715195632394, "grad_norm": 0.19849322707772785, "learning_rate": 3.21523293055187e-06, "loss": 0.0023, "step": 4481 }, { "epoch": 2.0391264786169243, "grad_norm": 0.3185951962495949, "learning_rate": 3.2145481243257726e-06, "loss": 0.0037, "step": 4482 }, { "epoch": 2.0395814376706096, "grad_norm": 0.19816114462462073, "learning_rate": 3.2138632597100305e-06, "loss": 0.0032, "step": 4483 }, { "epoch": 2.040036396724295, "grad_norm": 0.3784732579896911, "learning_rate": 3.2131783367606057e-06, "loss": 0.0131, "step": 4484 }, { "epoch": 2.04049135577798, "grad_norm": 0.32688371426994955, "learning_rate": 3.212493355533468e-06, "loss": 0.0086, "step": 4485 }, { "epoch": 2.040946314831665, "grad_norm": 0.4904699927992855, "learning_rate": 3.2118083160845915e-06, "loss": 0.0099, "step": 4486 }, { "epoch": 2.0414012738853504, "grad_norm": 0.471626382549602, "learning_rate": 3.211123218469953e-06, "loss": 0.0113, "step": 4487 }, { "epoch": 2.0418562329390353, "grad_norm": 0.441867007094047, "learning_rate": 3.210438062745537e-06, "loss": 0.012, "step": 4488 }, { "epoch": 2.0423111919927206, "grad_norm": 0.3244614276599254, "learning_rate": 3.20975284896733e-06, "loss": 0.0093, "step": 4489 }, { "epoch": 2.042766151046406, "grad_norm": 0.413598677679127, "learning_rate": 3.2090675771913273e-06, "loss": 0.0097, "step": 4490 }, { "epoch": 2.0432211101000908, "grad_norm": 0.6613118941054286, "learning_rate": 3.2083822474735233e-06, "loss": 0.0155, "step": 4491 }, { "epoch": 2.043676069153776, "grad_norm": 0.30901409345739755, "learning_rate": 3.2076968598699197e-06, "loss": 0.005, "step": 4492 }, { "epoch": 2.0441310282074614, "grad_norm": 0.47636341461545284, "learning_rate": 3.2070114144365265e-06, "loss": 0.0086, "step": 4493 }, { "epoch": 2.0445859872611467, "grad_norm": 0.3396050521432364, "learning_rate": 3.2063259112293526e-06, "loss": 0.0061, "step": 4494 }, { "epoch": 2.0450409463148316, "grad_norm": 0.49655202910528656, "learning_rate": 3.2056403503044155e-06, "loss": 0.0131, "step": 4495 }, { "epoch": 2.045495905368517, "grad_norm": 0.15189145162764364, "learning_rate": 3.2049547317177355e-06, "loss": 0.0033, "step": 4496 }, { "epoch": 2.045950864422202, "grad_norm": 0.6991689968704818, "learning_rate": 3.2042690555253375e-06, "loss": 0.0236, "step": 4497 }, { "epoch": 2.046405823475887, "grad_norm": 0.3345437677804238, "learning_rate": 3.2035833217832536e-06, "loss": 0.0061, "step": 4498 }, { "epoch": 2.0468607825295724, "grad_norm": 0.5194445463535743, "learning_rate": 3.2028975305475174e-06, "loss": 0.0065, "step": 4499 }, { "epoch": 2.0473157415832577, "grad_norm": 0.22368033834177453, "learning_rate": 3.20221168187417e-06, "loss": 0.0026, "step": 4500 }, { "epoch": 2.0477707006369426, "grad_norm": 0.5604581249222471, "learning_rate": 3.2015257758192543e-06, "loss": 0.0147, "step": 4501 }, { "epoch": 2.048225659690628, "grad_norm": 0.6167615267915109, "learning_rate": 3.2008398124388206e-06, "loss": 0.0163, "step": 4502 }, { "epoch": 2.048680618744313, "grad_norm": 0.2638676697920625, "learning_rate": 3.2001537917889223e-06, "loss": 0.0057, "step": 4503 }, { "epoch": 2.049135577797998, "grad_norm": 0.5530718340972208, "learning_rate": 3.1994677139256182e-06, "loss": 0.0113, "step": 4504 }, { "epoch": 2.0495905368516834, "grad_norm": 0.5362576032932329, "learning_rate": 3.198781578904972e-06, "loss": 0.0093, "step": 4505 }, { "epoch": 2.0500454959053687, "grad_norm": 0.35066696455012725, "learning_rate": 3.198095386783049e-06, "loss": 0.0048, "step": 4506 }, { "epoch": 2.0505004549590535, "grad_norm": 0.519482776063547, "learning_rate": 3.197409137615925e-06, "loss": 0.019, "step": 4507 }, { "epoch": 2.050955414012739, "grad_norm": 1.7041144375021928, "learning_rate": 3.196722831459676e-06, "loss": 0.0133, "step": 4508 }, { "epoch": 2.051410373066424, "grad_norm": 0.5783307186973152, "learning_rate": 3.1960364683703822e-06, "loss": 0.0175, "step": 4509 }, { "epoch": 2.051865332120109, "grad_norm": 0.2993019441509297, "learning_rate": 3.195350048404133e-06, "loss": 0.0039, "step": 4510 }, { "epoch": 2.0523202911737943, "grad_norm": 0.313355214774591, "learning_rate": 3.1946635716170167e-06, "loss": 0.0065, "step": 4511 }, { "epoch": 2.0527752502274796, "grad_norm": 0.6347510056289802, "learning_rate": 3.1939770380651315e-06, "loss": 0.0209, "step": 4512 }, { "epoch": 2.0532302092811645, "grad_norm": 0.33814038666077434, "learning_rate": 3.1932904478045756e-06, "loss": 0.0076, "step": 4513 }, { "epoch": 2.05368516833485, "grad_norm": 0.753673630808132, "learning_rate": 3.192603800891456e-06, "loss": 0.0117, "step": 4514 }, { "epoch": 2.054140127388535, "grad_norm": 0.4190280723269517, "learning_rate": 3.1919170973818814e-06, "loss": 0.0073, "step": 4515 }, { "epoch": 2.05459508644222, "grad_norm": 0.5828828052524603, "learning_rate": 3.191230337331966e-06, "loss": 0.0154, "step": 4516 }, { "epoch": 2.0550500454959053, "grad_norm": 0.6532305468027004, "learning_rate": 3.1905435207978293e-06, "loss": 0.0193, "step": 4517 }, { "epoch": 2.0555050045495906, "grad_norm": 0.6549136633955596, "learning_rate": 3.1898566478355943e-06, "loss": 0.0087, "step": 4518 }, { "epoch": 2.055959963603276, "grad_norm": 0.6219930913298968, "learning_rate": 3.1891697185013892e-06, "loss": 0.0197, "step": 4519 }, { "epoch": 2.056414922656961, "grad_norm": 0.39270762640074147, "learning_rate": 3.188482732851348e-06, "loss": 0.0105, "step": 4520 }, { "epoch": 2.056869881710646, "grad_norm": 0.43196144752180177, "learning_rate": 3.1877956909416063e-06, "loss": 0.0097, "step": 4521 }, { "epoch": 2.0573248407643314, "grad_norm": 0.3109850935715319, "learning_rate": 3.187108592828307e-06, "loss": 0.0072, "step": 4522 }, { "epoch": 2.0577797998180163, "grad_norm": 1.0322569770398764, "learning_rate": 3.1864214385675957e-06, "loss": 0.0302, "step": 4523 }, { "epoch": 2.0582347588717016, "grad_norm": 0.4801276544138867, "learning_rate": 3.185734228215625e-06, "loss": 0.0092, "step": 4524 }, { "epoch": 2.058689717925387, "grad_norm": 0.37947953131710443, "learning_rate": 3.1850469618285494e-06, "loss": 0.0071, "step": 4525 }, { "epoch": 2.0591446769790718, "grad_norm": 0.33006122380739733, "learning_rate": 3.18435963946253e-06, "loss": 0.008, "step": 4526 }, { "epoch": 2.059599636032757, "grad_norm": 0.21524047730141965, "learning_rate": 3.1836722611737326e-06, "loss": 0.0033, "step": 4527 }, { "epoch": 2.0600545950864424, "grad_norm": 0.28564355478520576, "learning_rate": 3.182984827018324e-06, "loss": 0.0048, "step": 4528 }, { "epoch": 2.0605095541401273, "grad_norm": 0.3642505488180155, "learning_rate": 3.18229733705248e-06, "loss": 0.0099, "step": 4529 }, { "epoch": 2.0609645131938126, "grad_norm": 0.5157960048650225, "learning_rate": 3.181609791332379e-06, "loss": 0.0083, "step": 4530 }, { "epoch": 2.061419472247498, "grad_norm": 0.33892084296043734, "learning_rate": 3.180922189914204e-06, "loss": 0.0085, "step": 4531 }, { "epoch": 2.0618744313011828, "grad_norm": 0.229555140550174, "learning_rate": 3.180234532854143e-06, "loss": 0.0032, "step": 4532 }, { "epoch": 2.062329390354868, "grad_norm": 0.5618115860344798, "learning_rate": 3.1795468202083864e-06, "loss": 0.0117, "step": 4533 }, { "epoch": 2.0627843494085534, "grad_norm": 0.3740393986896978, "learning_rate": 3.1788590520331337e-06, "loss": 0.0074, "step": 4534 }, { "epoch": 2.0632393084622382, "grad_norm": 0.3854480971594549, "learning_rate": 3.1781712283845844e-06, "loss": 0.0105, "step": 4535 }, { "epoch": 2.0636942675159236, "grad_norm": 0.231719547888273, "learning_rate": 3.177483349318946e-06, "loss": 0.0043, "step": 4536 }, { "epoch": 2.064149226569609, "grad_norm": 0.7064709042538649, "learning_rate": 3.1767954148924266e-06, "loss": 0.0131, "step": 4537 }, { "epoch": 2.0646041856232937, "grad_norm": 0.20890267285335237, "learning_rate": 3.176107425161243e-06, "loss": 0.0029, "step": 4538 }, { "epoch": 2.065059144676979, "grad_norm": 0.39991281606334667, "learning_rate": 3.1754193801816137e-06, "loss": 0.0112, "step": 4539 }, { "epoch": 2.0655141037306644, "grad_norm": 0.40786621246629207, "learning_rate": 3.174731280009762e-06, "loss": 0.0074, "step": 4540 }, { "epoch": 2.065969062784349, "grad_norm": 0.6022183880304784, "learning_rate": 3.174043124701918e-06, "loss": 0.0307, "step": 4541 }, { "epoch": 2.0664240218380345, "grad_norm": 0.37285052953707915, "learning_rate": 3.1733549143143137e-06, "loss": 0.0064, "step": 4542 }, { "epoch": 2.06687898089172, "grad_norm": 0.12249002089549997, "learning_rate": 3.1726666489031873e-06, "loss": 0.0019, "step": 4543 }, { "epoch": 2.0673339399454047, "grad_norm": 0.3655661078361262, "learning_rate": 3.171978328524779e-06, "loss": 0.0097, "step": 4544 }, { "epoch": 2.06778889899909, "grad_norm": 0.5268738528974319, "learning_rate": 3.1712899532353366e-06, "loss": 0.0107, "step": 4545 }, { "epoch": 2.0682438580527753, "grad_norm": 0.233860442306843, "learning_rate": 3.1706015230911114e-06, "loss": 0.004, "step": 4546 }, { "epoch": 2.06869881710646, "grad_norm": 0.657882881147645, "learning_rate": 3.1699130381483574e-06, "loss": 0.0183, "step": 4547 }, { "epoch": 2.0691537761601455, "grad_norm": 0.4210791643052899, "learning_rate": 3.1692244984633353e-06, "loss": 0.0125, "step": 4548 }, { "epoch": 2.069608735213831, "grad_norm": 0.3993520050303323, "learning_rate": 3.1685359040923097e-06, "loss": 0.0103, "step": 4549 }, { "epoch": 2.070063694267516, "grad_norm": 0.3622633555109233, "learning_rate": 3.167847255091549e-06, "loss": 0.0068, "step": 4550 }, { "epoch": 2.070518653321201, "grad_norm": 0.4968954329420076, "learning_rate": 3.1671585515173262e-06, "loss": 0.0131, "step": 4551 }, { "epoch": 2.0709736123748863, "grad_norm": 0.4891651046904591, "learning_rate": 3.166469793425919e-06, "loss": 0.0163, "step": 4552 }, { "epoch": 2.0714285714285716, "grad_norm": 0.2553642188842752, "learning_rate": 3.165780980873612e-06, "loss": 0.0066, "step": 4553 }, { "epoch": 2.0718835304822565, "grad_norm": 0.6935636079151343, "learning_rate": 3.165092113916688e-06, "loss": 0.0209, "step": 4554 }, { "epoch": 2.072338489535942, "grad_norm": 0.468516247683012, "learning_rate": 3.1644031926114403e-06, "loss": 0.0108, "step": 4555 }, { "epoch": 2.072793448589627, "grad_norm": 0.39939030722652935, "learning_rate": 3.1637142170141655e-06, "loss": 0.0067, "step": 4556 }, { "epoch": 2.073248407643312, "grad_norm": 0.6009168910348357, "learning_rate": 3.163025187181161e-06, "loss": 0.0156, "step": 4557 }, { "epoch": 2.0737033666969973, "grad_norm": 0.3056368840462739, "learning_rate": 3.1623361031687323e-06, "loss": 0.0065, "step": 4558 }, { "epoch": 2.0741583257506826, "grad_norm": 0.536213842824378, "learning_rate": 3.1616469650331884e-06, "loss": 0.0149, "step": 4559 }, { "epoch": 2.0746132848043675, "grad_norm": 0.43470527406529913, "learning_rate": 3.1609577728308428e-06, "loss": 0.0105, "step": 4560 }, { "epoch": 2.0750682438580528, "grad_norm": 0.26861609279506216, "learning_rate": 3.160268526618012e-06, "loss": 0.0058, "step": 4561 }, { "epoch": 2.075523202911738, "grad_norm": 0.27041038098397774, "learning_rate": 3.15957922645102e-06, "loss": 0.0038, "step": 4562 }, { "epoch": 2.075978161965423, "grad_norm": 0.46439568093080513, "learning_rate": 3.158889872386192e-06, "loss": 0.0116, "step": 4563 }, { "epoch": 2.0764331210191083, "grad_norm": 0.6777693684741656, "learning_rate": 3.158200464479859e-06, "loss": 0.0193, "step": 4564 }, { "epoch": 2.0768880800727936, "grad_norm": 0.37202422659118556, "learning_rate": 3.1575110027883566e-06, "loss": 0.0041, "step": 4565 }, { "epoch": 2.0773430391264784, "grad_norm": 0.2603196345813098, "learning_rate": 3.156821487368025e-06, "loss": 0.005, "step": 4566 }, { "epoch": 2.0777979981801638, "grad_norm": 0.4540810449504058, "learning_rate": 3.1561319182752066e-06, "loss": 0.0089, "step": 4567 }, { "epoch": 2.078252957233849, "grad_norm": 0.6132454894064556, "learning_rate": 3.1554422955662505e-06, "loss": 0.0118, "step": 4568 }, { "epoch": 2.078707916287534, "grad_norm": 0.49637696606866727, "learning_rate": 3.154752619297511e-06, "loss": 0.0081, "step": 4569 }, { "epoch": 2.0791628753412192, "grad_norm": 0.40600126782555146, "learning_rate": 3.1540628895253438e-06, "loss": 0.0063, "step": 4570 }, { "epoch": 2.0796178343949046, "grad_norm": 0.7374903469024907, "learning_rate": 3.153373106306111e-06, "loss": 0.0143, "step": 4571 }, { "epoch": 2.0800727934485894, "grad_norm": 0.46884199781119285, "learning_rate": 3.152683269696179e-06, "loss": 0.0078, "step": 4572 }, { "epoch": 2.0805277525022747, "grad_norm": 0.5697865023878845, "learning_rate": 3.1519933797519174e-06, "loss": 0.0103, "step": 4573 }, { "epoch": 2.08098271155596, "grad_norm": 0.3896080942425689, "learning_rate": 3.1513034365297013e-06, "loss": 0.0051, "step": 4574 }, { "epoch": 2.0814376706096454, "grad_norm": 0.49627484312446607, "learning_rate": 3.150613440085909e-06, "loss": 0.0156, "step": 4575 }, { "epoch": 2.08189262966333, "grad_norm": 0.39593305922594946, "learning_rate": 3.149923390476925e-06, "loss": 0.0062, "step": 4576 }, { "epoch": 2.0823475887170155, "grad_norm": 0.8269876262977107, "learning_rate": 3.1492332877591368e-06, "loss": 0.017, "step": 4577 }, { "epoch": 2.082802547770701, "grad_norm": 0.7775885227724254, "learning_rate": 3.148543131988936e-06, "loss": 0.0194, "step": 4578 }, { "epoch": 2.0832575068243857, "grad_norm": 0.6470294738834439, "learning_rate": 3.1478529232227197e-06, "loss": 0.0293, "step": 4579 }, { "epoch": 2.083712465878071, "grad_norm": 0.522606943832705, "learning_rate": 3.1471626615168876e-06, "loss": 0.0083, "step": 4580 }, { "epoch": 2.0841674249317563, "grad_norm": 0.9165122856029624, "learning_rate": 3.146472346927845e-06, "loss": 0.0199, "step": 4581 }, { "epoch": 2.084622383985441, "grad_norm": 0.44884826436190695, "learning_rate": 3.1457819795120026e-06, "loss": 0.0104, "step": 4582 }, { "epoch": 2.0850773430391265, "grad_norm": 0.34956488981456013, "learning_rate": 3.145091559325773e-06, "loss": 0.0079, "step": 4583 }, { "epoch": 2.085532302092812, "grad_norm": 0.20658077317834608, "learning_rate": 3.1444010864255737e-06, "loss": 0.0029, "step": 4584 }, { "epoch": 2.0859872611464967, "grad_norm": 0.3391845761773436, "learning_rate": 3.1437105608678287e-06, "loss": 0.0081, "step": 4585 }, { "epoch": 2.086442220200182, "grad_norm": 0.2699178221615947, "learning_rate": 3.1430199827089624e-06, "loss": 0.0061, "step": 4586 }, { "epoch": 2.0868971792538673, "grad_norm": 0.3246303439677648, "learning_rate": 3.1423293520054076e-06, "loss": 0.0044, "step": 4587 }, { "epoch": 2.087352138307552, "grad_norm": 0.45334651129350845, "learning_rate": 3.141638668813599e-06, "loss": 0.009, "step": 4588 }, { "epoch": 2.0878070973612375, "grad_norm": 0.49456222758809987, "learning_rate": 3.1409479331899755e-06, "loss": 0.0091, "step": 4589 }, { "epoch": 2.088262056414923, "grad_norm": 0.29804771307839106, "learning_rate": 3.1402571451909823e-06, "loss": 0.0032, "step": 4590 }, { "epoch": 2.0887170154686077, "grad_norm": 0.4938893983297924, "learning_rate": 3.1395663048730662e-06, "loss": 0.0097, "step": 4591 }, { "epoch": 2.089171974522293, "grad_norm": 0.4011705512686831, "learning_rate": 3.1388754122926803e-06, "loss": 0.0117, "step": 4592 }, { "epoch": 2.0896269335759783, "grad_norm": 0.30403373074136997, "learning_rate": 3.1381844675062796e-06, "loss": 0.0083, "step": 4593 }, { "epoch": 2.090081892629663, "grad_norm": 0.6669182307158755, "learning_rate": 3.137493470570327e-06, "loss": 0.0153, "step": 4594 }, { "epoch": 2.0905368516833485, "grad_norm": 0.5364348037993327, "learning_rate": 3.1368024215412866e-06, "loss": 0.0092, "step": 4595 }, { "epoch": 2.0909918107370338, "grad_norm": 0.5961953652266239, "learning_rate": 3.1361113204756284e-06, "loss": 0.0073, "step": 4596 }, { "epoch": 2.0914467697907186, "grad_norm": 0.471251924525215, "learning_rate": 3.1354201674298257e-06, "loss": 0.0136, "step": 4597 }, { "epoch": 2.091901728844404, "grad_norm": 0.6577043244747472, "learning_rate": 3.1347289624603565e-06, "loss": 0.0165, "step": 4598 }, { "epoch": 2.0923566878980893, "grad_norm": 0.4807004080737277, "learning_rate": 3.1340377056237032e-06, "loss": 0.0105, "step": 4599 }, { "epoch": 2.092811646951774, "grad_norm": 0.480884511135834, "learning_rate": 3.133346396976351e-06, "loss": 0.0064, "step": 4600 }, { "epoch": 2.0932666060054594, "grad_norm": 0.8875327341919028, "learning_rate": 3.132655036574792e-06, "loss": 0.0218, "step": 4601 }, { "epoch": 2.0937215650591448, "grad_norm": 0.5215435774230481, "learning_rate": 3.131963624475521e-06, "loss": 0.0075, "step": 4602 }, { "epoch": 2.0941765241128296, "grad_norm": 0.42519114309966266, "learning_rate": 3.131272160735035e-06, "loss": 0.0133, "step": 4603 }, { "epoch": 2.094631483166515, "grad_norm": 0.4447899750047396, "learning_rate": 3.1305806454098404e-06, "loss": 0.0075, "step": 4604 }, { "epoch": 2.0950864422202002, "grad_norm": 0.46713037220578024, "learning_rate": 3.1298890785564425e-06, "loss": 0.008, "step": 4605 }, { "epoch": 2.0955414012738856, "grad_norm": 0.5464948592235895, "learning_rate": 3.1291974602313536e-06, "loss": 0.0165, "step": 4606 }, { "epoch": 2.0959963603275704, "grad_norm": 0.5459737202798755, "learning_rate": 3.1285057904910896e-06, "loss": 0.0078, "step": 4607 }, { "epoch": 2.0964513193812557, "grad_norm": 0.4215618708720424, "learning_rate": 3.1278140693921704e-06, "loss": 0.0111, "step": 4608 }, { "epoch": 2.096906278434941, "grad_norm": 0.37900840994326, "learning_rate": 3.127122296991122e-06, "loss": 0.0088, "step": 4609 }, { "epoch": 2.097361237488626, "grad_norm": 0.3749866694108707, "learning_rate": 3.1264304733444694e-06, "loss": 0.0033, "step": 4610 }, { "epoch": 2.097816196542311, "grad_norm": 0.6196942321182589, "learning_rate": 3.125738598508749e-06, "loss": 0.0136, "step": 4611 }, { "epoch": 2.0982711555959965, "grad_norm": 0.7514909398754005, "learning_rate": 3.125046672540496e-06, "loss": 0.0215, "step": 4612 }, { "epoch": 2.0987261146496814, "grad_norm": 0.31212416725706804, "learning_rate": 3.124354695496252e-06, "loss": 0.0036, "step": 4613 }, { "epoch": 2.0991810737033667, "grad_norm": 0.33996334454134824, "learning_rate": 3.1236626674325603e-06, "loss": 0.0079, "step": 4614 }, { "epoch": 2.099636032757052, "grad_norm": 0.31989656030943836, "learning_rate": 3.122970588405973e-06, "loss": 0.0063, "step": 4615 }, { "epoch": 2.100090991810737, "grad_norm": 0.5282396377733483, "learning_rate": 3.1222784584730426e-06, "loss": 0.0105, "step": 4616 }, { "epoch": 2.100545950864422, "grad_norm": 0.6047279199496063, "learning_rate": 3.1215862776903255e-06, "loss": 0.0137, "step": 4617 }, { "epoch": 2.1010009099181075, "grad_norm": 0.33430750196527975, "learning_rate": 3.1208940461143866e-06, "loss": 0.0118, "step": 4618 }, { "epoch": 2.1014558689717924, "grad_norm": 0.14923195356099492, "learning_rate": 3.1202017638017895e-06, "loss": 0.0021, "step": 4619 }, { "epoch": 2.1019108280254777, "grad_norm": 0.5236093528735157, "learning_rate": 3.119509430809105e-06, "loss": 0.0058, "step": 4620 }, { "epoch": 2.102365787079163, "grad_norm": 0.6175873258753357, "learning_rate": 3.118817047192907e-06, "loss": 0.0095, "step": 4621 }, { "epoch": 2.102820746132848, "grad_norm": 0.3762533513613006, "learning_rate": 3.118124613009775e-06, "loss": 0.009, "step": 4622 }, { "epoch": 2.103275705186533, "grad_norm": 0.5889621844864588, "learning_rate": 3.117432128316291e-06, "loss": 0.0161, "step": 4623 }, { "epoch": 2.1037306642402185, "grad_norm": 0.7989060791629005, "learning_rate": 3.116739593169042e-06, "loss": 0.0185, "step": 4624 }, { "epoch": 2.1041856232939034, "grad_norm": 0.40464277810126115, "learning_rate": 3.116047007624618e-06, "loss": 0.004, "step": 4625 }, { "epoch": 2.1046405823475887, "grad_norm": 0.5110156875225697, "learning_rate": 3.1153543717396157e-06, "loss": 0.0104, "step": 4626 }, { "epoch": 2.105095541401274, "grad_norm": 0.9129471106257, "learning_rate": 3.114661685570632e-06, "loss": 0.0137, "step": 4627 }, { "epoch": 2.105550500454959, "grad_norm": 0.4034851024542674, "learning_rate": 3.1139689491742708e-06, "loss": 0.0207, "step": 4628 }, { "epoch": 2.106005459508644, "grad_norm": 0.409481530423294, "learning_rate": 3.1132761626071406e-06, "loss": 0.0072, "step": 4629 }, { "epoch": 2.1064604185623295, "grad_norm": 0.401332995360447, "learning_rate": 3.112583325925852e-06, "loss": 0.0075, "step": 4630 }, { "epoch": 2.1069153776160148, "grad_norm": 0.35190056467204006, "learning_rate": 3.1118904391870197e-06, "loss": 0.0069, "step": 4631 }, { "epoch": 2.1073703366696996, "grad_norm": 0.6395411516554018, "learning_rate": 3.1111975024472647e-06, "loss": 0.0173, "step": 4632 }, { "epoch": 2.107825295723385, "grad_norm": 0.391030470790143, "learning_rate": 3.11050451576321e-06, "loss": 0.0098, "step": 4633 }, { "epoch": 2.1082802547770703, "grad_norm": 0.3389726925948434, "learning_rate": 3.1098114791914825e-06, "loss": 0.0068, "step": 4634 }, { "epoch": 2.108735213830755, "grad_norm": 0.2114744559784845, "learning_rate": 3.1091183927887154e-06, "loss": 0.003, "step": 4635 }, { "epoch": 2.1091901728844404, "grad_norm": 0.6359916897382926, "learning_rate": 3.1084252566115437e-06, "loss": 0.0156, "step": 4636 }, { "epoch": 2.1096451319381258, "grad_norm": 0.42380313714771295, "learning_rate": 3.107732070716608e-06, "loss": 0.0099, "step": 4637 }, { "epoch": 2.1101000909918106, "grad_norm": 0.44409539139794857, "learning_rate": 3.1070388351605513e-06, "loss": 0.0051, "step": 4638 }, { "epoch": 2.110555050045496, "grad_norm": 0.3416246218659999, "learning_rate": 3.106345550000023e-06, "loss": 0.0067, "step": 4639 }, { "epoch": 2.1110100090991812, "grad_norm": 0.5790384301986117, "learning_rate": 3.1056522152916747e-06, "loss": 0.0157, "step": 4640 }, { "epoch": 2.111464968152866, "grad_norm": 0.398177779344772, "learning_rate": 3.104958831092162e-06, "loss": 0.0069, "step": 4641 }, { "epoch": 2.1119199272065514, "grad_norm": 0.3283351087649671, "learning_rate": 3.1042653974581455e-06, "loss": 0.0055, "step": 4642 }, { "epoch": 2.1123748862602367, "grad_norm": 0.42736687309005617, "learning_rate": 3.10357191444629e-06, "loss": 0.0059, "step": 4643 }, { "epoch": 2.1128298453139216, "grad_norm": 0.4701385519724747, "learning_rate": 3.102878382113263e-06, "loss": 0.0103, "step": 4644 }, { "epoch": 2.113284804367607, "grad_norm": 0.5193304224026638, "learning_rate": 3.1021848005157372e-06, "loss": 0.0186, "step": 4645 }, { "epoch": 2.113739763421292, "grad_norm": 0.37371466987902885, "learning_rate": 3.101491169710389e-06, "loss": 0.0107, "step": 4646 }, { "epoch": 2.114194722474977, "grad_norm": 0.35507409452315697, "learning_rate": 3.100797489753899e-06, "loss": 0.0091, "step": 4647 }, { "epoch": 2.1146496815286624, "grad_norm": 0.5271013946639229, "learning_rate": 3.1001037607029512e-06, "loss": 0.0128, "step": 4648 }, { "epoch": 2.1151046405823477, "grad_norm": 0.5883083523924235, "learning_rate": 3.099409982614234e-06, "loss": 0.0145, "step": 4649 }, { "epoch": 2.1155595996360326, "grad_norm": 0.22453746397773822, "learning_rate": 3.09871615554444e-06, "loss": 0.002, "step": 4650 }, { "epoch": 2.116014558689718, "grad_norm": 0.5029329846572255, "learning_rate": 3.0980222795502655e-06, "loss": 0.0091, "step": 4651 }, { "epoch": 2.116469517743403, "grad_norm": 0.4264626127408675, "learning_rate": 3.097328354688411e-06, "loss": 0.0078, "step": 4652 }, { "epoch": 2.116924476797088, "grad_norm": 0.3784220680939018, "learning_rate": 3.096634381015581e-06, "loss": 0.0138, "step": 4653 }, { "epoch": 2.1173794358507734, "grad_norm": 0.6495312239937969, "learning_rate": 3.0959403585884833e-06, "loss": 0.0183, "step": 4654 }, { "epoch": 2.1178343949044587, "grad_norm": 0.3703994227885963, "learning_rate": 3.0952462874638318e-06, "loss": 0.0084, "step": 4655 }, { "epoch": 2.1182893539581436, "grad_norm": 0.5840168211225543, "learning_rate": 3.0945521676983403e-06, "loss": 0.0117, "step": 4656 }, { "epoch": 2.118744313011829, "grad_norm": 0.4093549396410657, "learning_rate": 3.0938579993487314e-06, "loss": 0.0073, "step": 4657 }, { "epoch": 2.119199272065514, "grad_norm": 0.46306691893574864, "learning_rate": 3.0931637824717287e-06, "loss": 0.009, "step": 4658 }, { "epoch": 2.1196542311191995, "grad_norm": 0.5339236528409919, "learning_rate": 3.0924695171240606e-06, "loss": 0.0124, "step": 4659 }, { "epoch": 2.1201091901728844, "grad_norm": 0.27179196291102914, "learning_rate": 3.0917752033624587e-06, "loss": 0.0046, "step": 4660 }, { "epoch": 2.1205641492265697, "grad_norm": 0.40521379084473785, "learning_rate": 3.09108084124366e-06, "loss": 0.0094, "step": 4661 }, { "epoch": 2.121019108280255, "grad_norm": 0.3568522174390502, "learning_rate": 3.0903864308244042e-06, "loss": 0.0091, "step": 4662 }, { "epoch": 2.12147406733394, "grad_norm": 0.20584063995064003, "learning_rate": 3.0896919721614342e-06, "loss": 0.0041, "step": 4663 }, { "epoch": 2.121929026387625, "grad_norm": 0.5327316374722975, "learning_rate": 3.0889974653115006e-06, "loss": 0.0113, "step": 4664 }, { "epoch": 2.1223839854413105, "grad_norm": 0.35005396336426753, "learning_rate": 3.0883029103313537e-06, "loss": 0.011, "step": 4665 }, { "epoch": 2.1228389444949953, "grad_norm": 0.2960871076453943, "learning_rate": 3.0876083072777498e-06, "loss": 0.0034, "step": 4666 }, { "epoch": 2.1232939035486806, "grad_norm": 0.33965124358125465, "learning_rate": 3.0869136562074486e-06, "loss": 0.0085, "step": 4667 }, { "epoch": 2.123748862602366, "grad_norm": 0.38469550942459374, "learning_rate": 3.086218957177214e-06, "loss": 0.0074, "step": 4668 }, { "epoch": 2.124203821656051, "grad_norm": 0.26408637879968977, "learning_rate": 3.0855242102438137e-06, "loss": 0.0052, "step": 4669 }, { "epoch": 2.124658780709736, "grad_norm": 0.2776611055835266, "learning_rate": 3.0848294154640184e-06, "loss": 0.0045, "step": 4670 }, { "epoch": 2.1251137397634214, "grad_norm": 0.6028649736730624, "learning_rate": 3.0841345728946056e-06, "loss": 0.0123, "step": 4671 }, { "epoch": 2.1255686988171063, "grad_norm": 0.37666878496415385, "learning_rate": 3.0834396825923523e-06, "loss": 0.0058, "step": 4672 }, { "epoch": 2.1260236578707916, "grad_norm": 0.42744972334526005, "learning_rate": 3.082744744614043e-06, "loss": 0.0087, "step": 4673 }, { "epoch": 2.126478616924477, "grad_norm": 0.4924880485664466, "learning_rate": 3.0820497590164655e-06, "loss": 0.0088, "step": 4674 }, { "epoch": 2.126933575978162, "grad_norm": 0.6710063635700912, "learning_rate": 3.08135472585641e-06, "loss": 0.0252, "step": 4675 }, { "epoch": 2.127388535031847, "grad_norm": 0.42382304691433526, "learning_rate": 3.080659645190671e-06, "loss": 0.0087, "step": 4676 }, { "epoch": 2.1278434940855324, "grad_norm": 0.3556241052218594, "learning_rate": 3.079964517076049e-06, "loss": 0.0049, "step": 4677 }, { "epoch": 2.1282984531392173, "grad_norm": 0.25640268252683734, "learning_rate": 3.0792693415693446e-06, "loss": 0.0056, "step": 4678 }, { "epoch": 2.1287534121929026, "grad_norm": 0.7706401840170887, "learning_rate": 3.078574118727367e-06, "loss": 0.0188, "step": 4679 }, { "epoch": 2.129208371246588, "grad_norm": 0.6375217188673925, "learning_rate": 3.077878848606924e-06, "loss": 0.0326, "step": 4680 }, { "epoch": 2.1296633303002728, "grad_norm": 0.893227977038612, "learning_rate": 3.0771835312648317e-06, "loss": 0.013, "step": 4681 }, { "epoch": 2.130118289353958, "grad_norm": 0.29656538184933345, "learning_rate": 3.076488166757908e-06, "loss": 0.006, "step": 4682 }, { "epoch": 2.1305732484076434, "grad_norm": 0.3442568651723262, "learning_rate": 3.0757927551429744e-06, "loss": 0.0047, "step": 4683 }, { "epoch": 2.1310282074613287, "grad_norm": 0.2888005636724783, "learning_rate": 3.075097296476857e-06, "loss": 0.0072, "step": 4684 }, { "epoch": 2.1314831665150136, "grad_norm": 0.2674025568782202, "learning_rate": 3.0744017908163853e-06, "loss": 0.0052, "step": 4685 }, { "epoch": 2.131938125568699, "grad_norm": 0.5009718892120344, "learning_rate": 3.0737062382183946e-06, "loss": 0.0149, "step": 4686 }, { "epoch": 2.132393084622384, "grad_norm": 0.5647430594663713, "learning_rate": 3.073010638739719e-06, "loss": 0.0192, "step": 4687 }, { "epoch": 2.132848043676069, "grad_norm": 0.4463795424522635, "learning_rate": 3.0723149924372032e-06, "loss": 0.0086, "step": 4688 }, { "epoch": 2.1333030027297544, "grad_norm": 0.4536321883643288, "learning_rate": 3.071619299367691e-06, "loss": 0.0102, "step": 4689 }, { "epoch": 2.1337579617834397, "grad_norm": 0.4870224552753642, "learning_rate": 3.07092355958803e-06, "loss": 0.009, "step": 4690 }, { "epoch": 2.1342129208371245, "grad_norm": 0.5753264284355549, "learning_rate": 3.0702277731550745e-06, "loss": 0.0161, "step": 4691 }, { "epoch": 2.13466787989081, "grad_norm": 0.6314345096233259, "learning_rate": 3.06953194012568e-06, "loss": 0.0131, "step": 4692 }, { "epoch": 2.135122838944495, "grad_norm": 0.49494177139838835, "learning_rate": 3.068836060556708e-06, "loss": 0.0135, "step": 4693 }, { "epoch": 2.13557779799818, "grad_norm": 0.5025210580321176, "learning_rate": 3.0681401345050214e-06, "loss": 0.0109, "step": 4694 }, { "epoch": 2.1360327570518653, "grad_norm": 0.47742173399106386, "learning_rate": 3.067444162027489e-06, "loss": 0.0105, "step": 4695 }, { "epoch": 2.1364877161055507, "grad_norm": 0.38133551695997997, "learning_rate": 3.0667481431809826e-06, "loss": 0.0091, "step": 4696 }, { "epoch": 2.1369426751592355, "grad_norm": 0.2668904594726249, "learning_rate": 3.0660520780223767e-06, "loss": 0.0054, "step": 4697 }, { "epoch": 2.137397634212921, "grad_norm": 0.7415834800812123, "learning_rate": 3.0653559666085513e-06, "loss": 0.0232, "step": 4698 }, { "epoch": 2.137852593266606, "grad_norm": 0.3758885346412651, "learning_rate": 3.06465980899639e-06, "loss": 0.0089, "step": 4699 }, { "epoch": 2.138307552320291, "grad_norm": 0.33336953540376907, "learning_rate": 3.063963605242779e-06, "loss": 0.0062, "step": 4700 }, { "epoch": 2.1387625113739763, "grad_norm": 0.5510009081858924, "learning_rate": 3.0632673554046084e-06, "loss": 0.0127, "step": 4701 }, { "epoch": 2.1392174704276616, "grad_norm": 0.3653710434840782, "learning_rate": 3.062571059538774e-06, "loss": 0.006, "step": 4702 }, { "epoch": 2.1396724294813465, "grad_norm": 0.30616250902922804, "learning_rate": 3.0618747177021725e-06, "loss": 0.0052, "step": 4703 }, { "epoch": 2.140127388535032, "grad_norm": 0.3371751582386914, "learning_rate": 3.061178329951707e-06, "loss": 0.0036, "step": 4704 }, { "epoch": 2.140582347588717, "grad_norm": 0.25680835128538615, "learning_rate": 3.0604818963442818e-06, "loss": 0.0057, "step": 4705 }, { "epoch": 2.141037306642402, "grad_norm": 0.1495642941768919, "learning_rate": 3.059785416936808e-06, "loss": 0.002, "step": 4706 }, { "epoch": 2.1414922656960873, "grad_norm": 0.9405538278775661, "learning_rate": 3.059088891786197e-06, "loss": 0.0336, "step": 4707 }, { "epoch": 2.1419472247497726, "grad_norm": 0.26693836949960975, "learning_rate": 3.058392320949367e-06, "loss": 0.0063, "step": 4708 }, { "epoch": 2.1424021838034575, "grad_norm": 0.4379512570942622, "learning_rate": 3.057695704483239e-06, "loss": 0.009, "step": 4709 }, { "epoch": 2.142857142857143, "grad_norm": 0.24836104572270803, "learning_rate": 3.056999042444736e-06, "loss": 0.0034, "step": 4710 }, { "epoch": 2.143312101910828, "grad_norm": 0.5238138163905602, "learning_rate": 3.056302334890786e-06, "loss": 0.0112, "step": 4711 }, { "epoch": 2.143767060964513, "grad_norm": 0.5389873613733847, "learning_rate": 3.055605581878322e-06, "loss": 0.0088, "step": 4712 }, { "epoch": 2.1442220200181983, "grad_norm": 0.33780511374121885, "learning_rate": 3.05490878346428e-06, "loss": 0.0047, "step": 4713 }, { "epoch": 2.1446769790718836, "grad_norm": 0.3088167323001631, "learning_rate": 3.0542119397055964e-06, "loss": 0.0061, "step": 4714 }, { "epoch": 2.145131938125569, "grad_norm": 0.3123086074328526, "learning_rate": 3.0535150506592163e-06, "loss": 0.0043, "step": 4715 }, { "epoch": 2.1455868971792538, "grad_norm": 0.2535492205725634, "learning_rate": 3.0528181163820863e-06, "loss": 0.0057, "step": 4716 }, { "epoch": 2.146041856232939, "grad_norm": 0.3523891629816241, "learning_rate": 3.0521211369311564e-06, "loss": 0.0093, "step": 4717 }, { "epoch": 2.1464968152866244, "grad_norm": 0.4794806971408245, "learning_rate": 3.0514241123633804e-06, "loss": 0.0115, "step": 4718 }, { "epoch": 2.1469517743403093, "grad_norm": 0.22828013006795858, "learning_rate": 3.0507270427357162e-06, "loss": 0.003, "step": 4719 }, { "epoch": 2.1474067333939946, "grad_norm": 0.4275364308846148, "learning_rate": 3.0500299281051254e-06, "loss": 0.0088, "step": 4720 }, { "epoch": 2.14786169244768, "grad_norm": 0.8460587740103219, "learning_rate": 3.0493327685285723e-06, "loss": 0.0114, "step": 4721 }, { "epoch": 2.1483166515013647, "grad_norm": 0.5076538593230759, "learning_rate": 3.048635564063026e-06, "loss": 0.011, "step": 4722 }, { "epoch": 2.14877161055505, "grad_norm": 0.3931368671311738, "learning_rate": 3.047938314765459e-06, "loss": 0.0117, "step": 4723 }, { "epoch": 2.1492265696087354, "grad_norm": 0.3183307638178997, "learning_rate": 3.0472410206928483e-06, "loss": 0.0034, "step": 4724 }, { "epoch": 2.1496815286624202, "grad_norm": 0.782122029421301, "learning_rate": 3.0465436819021726e-06, "loss": 0.0115, "step": 4725 }, { "epoch": 2.1501364877161055, "grad_norm": 0.5840710377403205, "learning_rate": 3.0458462984504137e-06, "loss": 0.0128, "step": 4726 }, { "epoch": 2.150591446769791, "grad_norm": 0.4498724297970662, "learning_rate": 3.045148870394562e-06, "loss": 0.0106, "step": 4727 }, { "epoch": 2.1510464058234757, "grad_norm": 0.39257193974403, "learning_rate": 3.0444513977916057e-06, "loss": 0.0068, "step": 4728 }, { "epoch": 2.151501364877161, "grad_norm": 0.3742624041220985, "learning_rate": 3.04375388069854e-06, "loss": 0.0061, "step": 4729 }, { "epoch": 2.1519563239308463, "grad_norm": 0.26774115781162755, "learning_rate": 3.0430563191723633e-06, "loss": 0.0038, "step": 4730 }, { "epoch": 2.152411282984531, "grad_norm": 0.38158010680297555, "learning_rate": 3.042358713270076e-06, "loss": 0.0061, "step": 4731 }, { "epoch": 2.1528662420382165, "grad_norm": 0.41472143270003475, "learning_rate": 3.041661063048685e-06, "loss": 0.0092, "step": 4732 }, { "epoch": 2.153321201091902, "grad_norm": 0.4351877464266028, "learning_rate": 3.040963368565196e-06, "loss": 0.0077, "step": 4733 }, { "epoch": 2.1537761601455867, "grad_norm": 0.5560216743727133, "learning_rate": 3.0402656298766254e-06, "loss": 0.0111, "step": 4734 }, { "epoch": 2.154231119199272, "grad_norm": 0.77005509132677, "learning_rate": 3.0395678470399863e-06, "loss": 0.0164, "step": 4735 }, { "epoch": 2.1546860782529573, "grad_norm": 0.48277315343867166, "learning_rate": 3.0388700201123e-06, "loss": 0.0147, "step": 4736 }, { "epoch": 2.1551410373066426, "grad_norm": 0.36160641647997416, "learning_rate": 3.038172149150589e-06, "loss": 0.0066, "step": 4737 }, { "epoch": 2.1555959963603275, "grad_norm": 0.4043487172262798, "learning_rate": 3.0374742342118803e-06, "loss": 0.0098, "step": 4738 }, { "epoch": 2.156050955414013, "grad_norm": 0.3445286857951706, "learning_rate": 3.036776275353204e-06, "loss": 0.0068, "step": 4739 }, { "epoch": 2.156505914467698, "grad_norm": 0.5710097814008477, "learning_rate": 3.0360782726315948e-06, "loss": 0.0091, "step": 4740 }, { "epoch": 2.156960873521383, "grad_norm": 0.4870406363888131, "learning_rate": 3.0353802261040904e-06, "loss": 0.0109, "step": 4741 }, { "epoch": 2.1574158325750683, "grad_norm": 0.3536218581372151, "learning_rate": 3.0346821358277316e-06, "loss": 0.0067, "step": 4742 }, { "epoch": 2.1578707916287536, "grad_norm": 0.3487830591506708, "learning_rate": 3.0339840018595622e-06, "loss": 0.008, "step": 4743 }, { "epoch": 2.1583257506824385, "grad_norm": 0.6417536178089811, "learning_rate": 3.0332858242566333e-06, "loss": 0.0259, "step": 4744 }, { "epoch": 2.158780709736124, "grad_norm": 0.24415659096707662, "learning_rate": 3.032587603075994e-06, "loss": 0.0028, "step": 4745 }, { "epoch": 2.159235668789809, "grad_norm": 0.930009051972887, "learning_rate": 3.0318893383747018e-06, "loss": 0.0188, "step": 4746 }, { "epoch": 2.159690627843494, "grad_norm": 0.4802933832576269, "learning_rate": 3.031191030209814e-06, "loss": 0.0105, "step": 4747 }, { "epoch": 2.1601455868971793, "grad_norm": 0.34023543822420677, "learning_rate": 3.0304926786383943e-06, "loss": 0.007, "step": 4748 }, { "epoch": 2.1606005459508646, "grad_norm": 0.41967920768644235, "learning_rate": 3.0297942837175092e-06, "loss": 0.0108, "step": 4749 }, { "epoch": 2.1610555050045495, "grad_norm": 0.5123088479761165, "learning_rate": 3.0290958455042264e-06, "loss": 0.0144, "step": 4750 }, { "epoch": 2.1615104640582348, "grad_norm": 0.9073745232373296, "learning_rate": 3.028397364055622e-06, "loss": 0.0342, "step": 4751 }, { "epoch": 2.16196542311192, "grad_norm": 0.1322197255416583, "learning_rate": 3.0276988394287697e-06, "loss": 0.0015, "step": 4752 }, { "epoch": 2.162420382165605, "grad_norm": 0.2645247016657681, "learning_rate": 3.027000271680753e-06, "loss": 0.0046, "step": 4753 }, { "epoch": 2.1628753412192903, "grad_norm": 0.513719800174683, "learning_rate": 3.026301660868653e-06, "loss": 0.0181, "step": 4754 }, { "epoch": 2.1633303002729756, "grad_norm": 0.39894384212322737, "learning_rate": 3.025603007049558e-06, "loss": 0.006, "step": 4755 }, { "epoch": 2.1637852593266604, "grad_norm": 0.46335375503173326, "learning_rate": 3.024904310280559e-06, "loss": 0.0089, "step": 4756 }, { "epoch": 2.1642402183803457, "grad_norm": 0.4434242014570459, "learning_rate": 3.0242055706187502e-06, "loss": 0.0089, "step": 4757 }, { "epoch": 2.164695177434031, "grad_norm": 0.39131299276128545, "learning_rate": 3.0235067881212295e-06, "loss": 0.0048, "step": 4758 }, { "epoch": 2.165150136487716, "grad_norm": 0.3775875466701756, "learning_rate": 3.022807962845098e-06, "loss": 0.0135, "step": 4759 }, { "epoch": 2.1656050955414012, "grad_norm": 0.28397474193939537, "learning_rate": 3.022109094847461e-06, "loss": 0.0068, "step": 4760 }, { "epoch": 2.1660600545950865, "grad_norm": 0.5358252152810421, "learning_rate": 3.021410184185427e-06, "loss": 0.0137, "step": 4761 }, { "epoch": 2.1665150136487714, "grad_norm": 0.43491643720963374, "learning_rate": 3.020711230916107e-06, "loss": 0.008, "step": 4762 }, { "epoch": 2.1669699727024567, "grad_norm": 0.7271280461336811, "learning_rate": 3.0200122350966167e-06, "loss": 0.0267, "step": 4763 }, { "epoch": 2.167424931756142, "grad_norm": 0.6839510363658479, "learning_rate": 3.019313196784075e-06, "loss": 0.0146, "step": 4764 }, { "epoch": 2.167879890809827, "grad_norm": 0.428805548947076, "learning_rate": 3.0186141160356035e-06, "loss": 0.0131, "step": 4765 }, { "epoch": 2.168334849863512, "grad_norm": 0.36724826281801987, "learning_rate": 3.0179149929083294e-06, "loss": 0.0076, "step": 4766 }, { "epoch": 2.1687898089171975, "grad_norm": 0.32539982110957744, "learning_rate": 3.0172158274593803e-06, "loss": 0.0039, "step": 4767 }, { "epoch": 2.1692447679708824, "grad_norm": 0.23099289860559238, "learning_rate": 3.0165166197458897e-06, "loss": 0.0041, "step": 4768 }, { "epoch": 2.1696997270245677, "grad_norm": 0.5515177070855042, "learning_rate": 3.0158173698249934e-06, "loss": 0.019, "step": 4769 }, { "epoch": 2.170154686078253, "grad_norm": 0.33180043761855865, "learning_rate": 3.0151180777538313e-06, "loss": 0.0118, "step": 4770 }, { "epoch": 2.1706096451319383, "grad_norm": 0.3732945564025352, "learning_rate": 3.014418743589546e-06, "loss": 0.0081, "step": 4771 }, { "epoch": 2.171064604185623, "grad_norm": 0.4992882442120034, "learning_rate": 3.0137193673892837e-06, "loss": 0.0092, "step": 4772 }, { "epoch": 2.1715195632393085, "grad_norm": 3.8449782591600243, "learning_rate": 3.013019949210196e-06, "loss": 0.059, "step": 4773 }, { "epoch": 2.171974522292994, "grad_norm": 0.3263215277974382, "learning_rate": 3.0123204891094333e-06, "loss": 0.0064, "step": 4774 }, { "epoch": 2.1724294813466787, "grad_norm": 0.5441587527396841, "learning_rate": 3.011620987144154e-06, "loss": 0.0171, "step": 4775 }, { "epoch": 2.172884440400364, "grad_norm": 0.4288342527301395, "learning_rate": 3.010921443371518e-06, "loss": 0.0091, "step": 4776 }, { "epoch": 2.1733393994540493, "grad_norm": 0.6810232780081685, "learning_rate": 3.01022185784869e-06, "loss": 0.0209, "step": 4777 }, { "epoch": 2.173794358507734, "grad_norm": 0.3687545098776424, "learning_rate": 3.009522230632836e-06, "loss": 0.0074, "step": 4778 }, { "epoch": 2.1742493175614195, "grad_norm": 0.38923383443821935, "learning_rate": 3.008822561781125e-06, "loss": 0.0077, "step": 4779 }, { "epoch": 2.174704276615105, "grad_norm": 0.3293052128620047, "learning_rate": 3.008122851350733e-06, "loss": 0.0056, "step": 4780 }, { "epoch": 2.1751592356687897, "grad_norm": 0.4825138212619072, "learning_rate": 3.0074230993988363e-06, "loss": 0.0087, "step": 4781 }, { "epoch": 2.175614194722475, "grad_norm": 0.4275390413826818, "learning_rate": 3.0067233059826146e-06, "loss": 0.0108, "step": 4782 }, { "epoch": 2.1760691537761603, "grad_norm": 0.2609125113090567, "learning_rate": 3.0060234711592534e-06, "loss": 0.0036, "step": 4783 }, { "epoch": 2.176524112829845, "grad_norm": 0.5601863494911928, "learning_rate": 3.0053235949859392e-06, "loss": 0.0109, "step": 4784 }, { "epoch": 2.1769790718835305, "grad_norm": 0.6567446352780604, "learning_rate": 3.0046236775198625e-06, "loss": 0.0118, "step": 4785 }, { "epoch": 2.1774340309372158, "grad_norm": 0.30472793793794595, "learning_rate": 3.003923718818218e-06, "loss": 0.0055, "step": 4786 }, { "epoch": 2.1778889899909006, "grad_norm": 0.3258713511719229, "learning_rate": 3.003223718938203e-06, "loss": 0.006, "step": 4787 }, { "epoch": 2.178343949044586, "grad_norm": 0.37093274617661853, "learning_rate": 3.002523677937018e-06, "loss": 0.0065, "step": 4788 }, { "epoch": 2.1787989080982713, "grad_norm": 0.5025463591912394, "learning_rate": 3.001823595871867e-06, "loss": 0.0111, "step": 4789 }, { "epoch": 2.179253867151956, "grad_norm": 0.35233357129119264, "learning_rate": 3.001123472799959e-06, "loss": 0.0074, "step": 4790 }, { "epoch": 2.1797088262056414, "grad_norm": 0.277217298871161, "learning_rate": 3.0004233087785033e-06, "loss": 0.0047, "step": 4791 }, { "epoch": 2.1801637852593267, "grad_norm": 0.45146257449191485, "learning_rate": 2.9997231038647147e-06, "loss": 0.0127, "step": 4792 }, { "epoch": 2.180618744313012, "grad_norm": 0.5887311593507997, "learning_rate": 2.9990228581158103e-06, "loss": 0.0177, "step": 4793 }, { "epoch": 2.181073703366697, "grad_norm": 0.256265040821521, "learning_rate": 2.9983225715890123e-06, "loss": 0.0038, "step": 4794 }, { "epoch": 2.1815286624203822, "grad_norm": 0.42770039025824447, "learning_rate": 2.997622244341544e-06, "loss": 0.0085, "step": 4795 }, { "epoch": 2.1819836214740675, "grad_norm": 0.6539521709926065, "learning_rate": 2.996921876430633e-06, "loss": 0.0156, "step": 4796 }, { "epoch": 2.1824385805277524, "grad_norm": 0.4286114778355643, "learning_rate": 2.9962214679135105e-06, "loss": 0.0091, "step": 4797 }, { "epoch": 2.1828935395814377, "grad_norm": 0.563832303686073, "learning_rate": 2.99552101884741e-06, "loss": 0.0064, "step": 4798 }, { "epoch": 2.183348498635123, "grad_norm": 0.46336330152853805, "learning_rate": 2.99482052928957e-06, "loss": 0.013, "step": 4799 }, { "epoch": 2.183803457688808, "grad_norm": 0.5331549642538213, "learning_rate": 2.9941199992972316e-06, "loss": 0.0129, "step": 4800 }, { "epoch": 2.184258416742493, "grad_norm": 0.4994944361658107, "learning_rate": 2.9934194289276368e-06, "loss": 0.0098, "step": 4801 }, { "epoch": 2.1847133757961785, "grad_norm": 0.4411608391953045, "learning_rate": 2.992718818238036e-06, "loss": 0.0098, "step": 4802 }, { "epoch": 2.1851683348498634, "grad_norm": 0.790717244204323, "learning_rate": 2.992018167285677e-06, "loss": 0.0232, "step": 4803 }, { "epoch": 2.1856232939035487, "grad_norm": 0.5627779146828896, "learning_rate": 2.9913174761278163e-06, "loss": 0.011, "step": 4804 }, { "epoch": 2.186078252957234, "grad_norm": 0.37350565393995755, "learning_rate": 2.99061674482171e-06, "loss": 0.0106, "step": 4805 }, { "epoch": 2.186533212010919, "grad_norm": 0.4858465797648907, "learning_rate": 2.9899159734246187e-06, "loss": 0.0188, "step": 4806 }, { "epoch": 2.186988171064604, "grad_norm": 0.29195680362538146, "learning_rate": 2.989215161993807e-06, "loss": 0.0051, "step": 4807 }, { "epoch": 2.1874431301182895, "grad_norm": 0.34338344442759855, "learning_rate": 2.988514310586541e-06, "loss": 0.005, "step": 4808 }, { "epoch": 2.1878980891719744, "grad_norm": 0.3301608536111477, "learning_rate": 2.9878134192600926e-06, "loss": 0.0053, "step": 4809 }, { "epoch": 2.1883530482256597, "grad_norm": 0.47387907595581313, "learning_rate": 2.9871124880717333e-06, "loss": 0.0092, "step": 4810 }, { "epoch": 2.188808007279345, "grad_norm": 0.5153714922993946, "learning_rate": 2.9864115170787424e-06, "loss": 0.012, "step": 4811 }, { "epoch": 2.18926296633303, "grad_norm": 0.2237686721075401, "learning_rate": 2.985710506338398e-06, "loss": 0.0052, "step": 4812 }, { "epoch": 2.189717925386715, "grad_norm": 0.3945070100844117, "learning_rate": 2.9850094559079845e-06, "loss": 0.0088, "step": 4813 }, { "epoch": 2.1901728844404005, "grad_norm": 0.763627122337376, "learning_rate": 2.9843083658447893e-06, "loss": 0.0117, "step": 4814 }, { "epoch": 2.1906278434940853, "grad_norm": 0.7459982593680611, "learning_rate": 2.983607236206101e-06, "loss": 0.0138, "step": 4815 }, { "epoch": 2.1910828025477707, "grad_norm": 0.5426860100029581, "learning_rate": 2.982906067049214e-06, "loss": 0.0148, "step": 4816 }, { "epoch": 2.191537761601456, "grad_norm": 0.4754718611683302, "learning_rate": 2.9822048584314227e-06, "loss": 0.014, "step": 4817 }, { "epoch": 2.191992720655141, "grad_norm": 0.47524169526862253, "learning_rate": 2.9815036104100287e-06, "loss": 0.0069, "step": 4818 }, { "epoch": 2.192447679708826, "grad_norm": 0.59016996786916, "learning_rate": 2.9808023230423342e-06, "loss": 0.0088, "step": 4819 }, { "epoch": 2.1929026387625115, "grad_norm": 0.420284668091405, "learning_rate": 2.9801009963856446e-06, "loss": 0.0102, "step": 4820 }, { "epoch": 2.1933575978161963, "grad_norm": 0.21265332555657832, "learning_rate": 2.9793996304972705e-06, "loss": 0.0028, "step": 4821 }, { "epoch": 2.1938125568698816, "grad_norm": 0.49079840602250613, "learning_rate": 2.978698225434523e-06, "loss": 0.0089, "step": 4822 }, { "epoch": 2.194267515923567, "grad_norm": 0.19990703737488408, "learning_rate": 2.977996781254719e-06, "loss": 0.0029, "step": 4823 }, { "epoch": 2.194722474977252, "grad_norm": 0.4216157643889506, "learning_rate": 2.977295298015176e-06, "loss": 0.0106, "step": 4824 }, { "epoch": 2.195177434030937, "grad_norm": 0.4942721085728846, "learning_rate": 2.9765937757732166e-06, "loss": 0.009, "step": 4825 }, { "epoch": 2.1956323930846224, "grad_norm": 0.33626753097015716, "learning_rate": 2.975892214586167e-06, "loss": 0.0077, "step": 4826 }, { "epoch": 2.1960873521383077, "grad_norm": 0.3017915508218457, "learning_rate": 2.975190614511353e-06, "loss": 0.0055, "step": 4827 }, { "epoch": 2.1965423111919926, "grad_norm": 1.6184980172410766, "learning_rate": 2.9744889756061097e-06, "loss": 0.0115, "step": 4828 }, { "epoch": 2.196997270245678, "grad_norm": 0.6234062261122608, "learning_rate": 2.9737872979277694e-06, "loss": 0.0124, "step": 4829 }, { "epoch": 2.1974522292993632, "grad_norm": 0.39948585366752987, "learning_rate": 2.9730855815336706e-06, "loss": 0.0091, "step": 4830 }, { "epoch": 2.197907188353048, "grad_norm": 0.48543572415113817, "learning_rate": 2.9723838264811546e-06, "loss": 0.0103, "step": 4831 }, { "epoch": 2.1983621474067334, "grad_norm": 0.4750913301511524, "learning_rate": 2.9716820328275655e-06, "loss": 0.0129, "step": 4832 }, { "epoch": 2.1988171064604187, "grad_norm": 0.7831042170107656, "learning_rate": 2.970980200630251e-06, "loss": 0.0184, "step": 4833 }, { "epoch": 2.1992720655141036, "grad_norm": 0.546213491928378, "learning_rate": 2.9702783299465617e-06, "loss": 0.0074, "step": 4834 }, { "epoch": 2.199727024567789, "grad_norm": 0.7177442005512296, "learning_rate": 2.96957642083385e-06, "loss": 0.0162, "step": 4835 }, { "epoch": 2.200181983621474, "grad_norm": 0.9366006502695664, "learning_rate": 2.968874473349475e-06, "loss": 0.0199, "step": 4836 }, { "epoch": 2.200636942675159, "grad_norm": 0.4997424482784416, "learning_rate": 2.9681724875507947e-06, "loss": 0.0153, "step": 4837 }, { "epoch": 2.2010919017288444, "grad_norm": 0.26482569876941237, "learning_rate": 2.967470463495173e-06, "loss": 0.0062, "step": 4838 }, { "epoch": 2.2015468607825297, "grad_norm": 0.7268256054258515, "learning_rate": 2.966768401239976e-06, "loss": 0.0188, "step": 4839 }, { "epoch": 2.2020018198362146, "grad_norm": 0.3003784177060861, "learning_rate": 2.9660663008425738e-06, "loss": 0.0076, "step": 4840 }, { "epoch": 2.2024567788899, "grad_norm": 0.33756143767554136, "learning_rate": 2.965364162360338e-06, "loss": 0.0052, "step": 4841 }, { "epoch": 2.202911737943585, "grad_norm": 0.3532066519771224, "learning_rate": 2.9646619858506437e-06, "loss": 0.0066, "step": 4842 }, { "epoch": 2.20336669699727, "grad_norm": 0.6753292852829262, "learning_rate": 2.963959771370871e-06, "loss": 0.0184, "step": 4843 }, { "epoch": 2.2038216560509554, "grad_norm": 0.1594160317667174, "learning_rate": 2.963257518978401e-06, "loss": 0.0018, "step": 4844 }, { "epoch": 2.2042766151046407, "grad_norm": 0.48228661198039796, "learning_rate": 2.962555228730618e-06, "loss": 0.006, "step": 4845 }, { "epoch": 2.2047315741583255, "grad_norm": 0.4458597176079458, "learning_rate": 2.961852900684911e-06, "loss": 0.0123, "step": 4846 }, { "epoch": 2.205186533212011, "grad_norm": 0.39591928817408845, "learning_rate": 2.9611505348986715e-06, "loss": 0.0059, "step": 4847 }, { "epoch": 2.205641492265696, "grad_norm": 0.4315659000855837, "learning_rate": 2.9604481314292914e-06, "loss": 0.0098, "step": 4848 }, { "epoch": 2.2060964513193815, "grad_norm": 0.3754357543481822, "learning_rate": 2.9597456903341703e-06, "loss": 0.0045, "step": 4849 }, { "epoch": 2.2065514103730663, "grad_norm": 0.42687087930542456, "learning_rate": 2.9590432116707075e-06, "loss": 0.0076, "step": 4850 }, { "epoch": 2.2070063694267517, "grad_norm": 0.33867072508866636, "learning_rate": 2.9583406954963063e-06, "loss": 0.0071, "step": 4851 }, { "epoch": 2.207461328480437, "grad_norm": 0.4204250475664571, "learning_rate": 2.957638141868373e-06, "loss": 0.0065, "step": 4852 }, { "epoch": 2.207916287534122, "grad_norm": 0.4140248974920594, "learning_rate": 2.9569355508443182e-06, "loss": 0.0091, "step": 4853 }, { "epoch": 2.208371246587807, "grad_norm": 0.47320798369038974, "learning_rate": 2.956232922481553e-06, "loss": 0.0064, "step": 4854 }, { "epoch": 2.2088262056414925, "grad_norm": 0.5881190115851597, "learning_rate": 2.955530256837493e-06, "loss": 0.0125, "step": 4855 }, { "epoch": 2.2092811646951773, "grad_norm": 0.3310236176623882, "learning_rate": 2.9548275539695588e-06, "loss": 0.0046, "step": 4856 }, { "epoch": 2.2097361237488626, "grad_norm": 0.5147892892297835, "learning_rate": 2.954124813935171e-06, "loss": 0.0123, "step": 4857 }, { "epoch": 2.210191082802548, "grad_norm": 0.3969697495574982, "learning_rate": 2.9534220367917533e-06, "loss": 0.0068, "step": 4858 }, { "epoch": 2.210646041856233, "grad_norm": 0.6243066818418457, "learning_rate": 2.952719222596735e-06, "loss": 0.0187, "step": 4859 }, { "epoch": 2.211101000909918, "grad_norm": 1.033421627494162, "learning_rate": 2.952016371407546e-06, "loss": 0.0298, "step": 4860 }, { "epoch": 2.2115559599636034, "grad_norm": 0.3875674533098041, "learning_rate": 2.9513134832816206e-06, "loss": 0.0054, "step": 4861 }, { "epoch": 2.2120109190172883, "grad_norm": 0.48709654910877814, "learning_rate": 2.9506105582763955e-06, "loss": 0.0085, "step": 4862 }, { "epoch": 2.2124658780709736, "grad_norm": 0.5355792783841338, "learning_rate": 2.9499075964493103e-06, "loss": 0.0156, "step": 4863 }, { "epoch": 2.212920837124659, "grad_norm": 0.6569352717994389, "learning_rate": 2.949204597857808e-06, "loss": 0.0242, "step": 4864 }, { "epoch": 2.213375796178344, "grad_norm": 0.5659025019114028, "learning_rate": 2.948501562559335e-06, "loss": 0.0105, "step": 4865 }, { "epoch": 2.213830755232029, "grad_norm": 0.45178928684332614, "learning_rate": 2.9477984906113395e-06, "loss": 0.007, "step": 4866 }, { "epoch": 2.2142857142857144, "grad_norm": 0.4766274695311824, "learning_rate": 2.9470953820712737e-06, "loss": 0.008, "step": 4867 }, { "epoch": 2.2147406733393993, "grad_norm": 0.41393668881377427, "learning_rate": 2.946392236996592e-06, "loss": 0.0131, "step": 4868 }, { "epoch": 2.2151956323930846, "grad_norm": 0.5464236995968396, "learning_rate": 2.9456890554447527e-06, "loss": 0.0142, "step": 4869 }, { "epoch": 2.21565059144677, "grad_norm": 0.46232407457885, "learning_rate": 2.944985837473217e-06, "loss": 0.0111, "step": 4870 }, { "epoch": 2.2161055505004548, "grad_norm": 0.5212313620386814, "learning_rate": 2.9442825831394474e-06, "loss": 0.0106, "step": 4871 }, { "epoch": 2.21656050955414, "grad_norm": 0.2845581104269175, "learning_rate": 2.9435792925009123e-06, "loss": 0.005, "step": 4872 }, { "epoch": 2.2170154686078254, "grad_norm": 0.5587771327403143, "learning_rate": 2.9428759656150795e-06, "loss": 0.009, "step": 4873 }, { "epoch": 2.2174704276615103, "grad_norm": 0.4156904775721916, "learning_rate": 2.9421726025394235e-06, "loss": 0.0096, "step": 4874 }, { "epoch": 2.2179253867151956, "grad_norm": 0.3711831609126654, "learning_rate": 2.9414692033314198e-06, "loss": 0.0106, "step": 4875 }, { "epoch": 2.218380345768881, "grad_norm": 0.37893260980857235, "learning_rate": 2.9407657680485454e-06, "loss": 0.0073, "step": 4876 }, { "epoch": 2.2188353048225657, "grad_norm": 0.32691871622349006, "learning_rate": 2.9400622967482838e-06, "loss": 0.0098, "step": 4877 }, { "epoch": 2.219290263876251, "grad_norm": 0.36738377087710516, "learning_rate": 2.939358789488118e-06, "loss": 0.011, "step": 4878 }, { "epoch": 2.2197452229299364, "grad_norm": 0.41320409791884355, "learning_rate": 2.9386552463255364e-06, "loss": 0.0077, "step": 4879 }, { "epoch": 2.2202001819836217, "grad_norm": 0.29505898444999806, "learning_rate": 2.937951667318028e-06, "loss": 0.0061, "step": 4880 }, { "epoch": 2.2206551410373065, "grad_norm": 0.487551607937164, "learning_rate": 2.9372480525230884e-06, "loss": 0.0086, "step": 4881 }, { "epoch": 2.221110100090992, "grad_norm": 0.5226135552961, "learning_rate": 2.936544401998212e-06, "loss": 0.0062, "step": 4882 }, { "epoch": 2.221565059144677, "grad_norm": 0.34267760187101054, "learning_rate": 2.9358407158008984e-06, "loss": 0.0072, "step": 4883 }, { "epoch": 2.222020018198362, "grad_norm": 0.3148411326843605, "learning_rate": 2.9351369939886504e-06, "loss": 0.006, "step": 4884 }, { "epoch": 2.2224749772520473, "grad_norm": 0.3079230663941869, "learning_rate": 2.934433236618972e-06, "loss": 0.006, "step": 4885 }, { "epoch": 2.2229299363057327, "grad_norm": 0.6428539491759201, "learning_rate": 2.9337294437493715e-06, "loss": 0.0145, "step": 4886 }, { "epoch": 2.2233848953594175, "grad_norm": 0.4507859140168978, "learning_rate": 2.9330256154373595e-06, "loss": 0.0072, "step": 4887 }, { "epoch": 2.223839854413103, "grad_norm": 0.69601935124361, "learning_rate": 2.9323217517404488e-06, "loss": 0.0089, "step": 4888 }, { "epoch": 2.224294813466788, "grad_norm": 0.5172347103409983, "learning_rate": 2.9316178527161583e-06, "loss": 0.0116, "step": 4889 }, { "epoch": 2.224749772520473, "grad_norm": 0.5310365702883805, "learning_rate": 2.930913918422005e-06, "loss": 0.0139, "step": 4890 }, { "epoch": 2.2252047315741583, "grad_norm": 0.40184853768823264, "learning_rate": 2.9302099489155126e-06, "loss": 0.009, "step": 4891 }, { "epoch": 2.2256596906278436, "grad_norm": 0.582127272015886, "learning_rate": 2.929505944254206e-06, "loss": 0.0189, "step": 4892 }, { "epoch": 2.2261146496815285, "grad_norm": 0.6409914999826816, "learning_rate": 2.928801904495614e-06, "loss": 0.0117, "step": 4893 }, { "epoch": 2.226569608735214, "grad_norm": 0.2641892942728218, "learning_rate": 2.9280978296972657e-06, "loss": 0.0053, "step": 4894 }, { "epoch": 2.227024567788899, "grad_norm": 0.5712389022512713, "learning_rate": 2.9273937199166962e-06, "loss": 0.0179, "step": 4895 }, { "epoch": 2.227479526842584, "grad_norm": 0.5239119068072173, "learning_rate": 2.9266895752114426e-06, "loss": 0.0117, "step": 4896 }, { "epoch": 2.2279344858962693, "grad_norm": 0.9128594196380702, "learning_rate": 2.925985395639043e-06, "loss": 0.0115, "step": 4897 }, { "epoch": 2.2283894449499546, "grad_norm": 0.23478433766818435, "learning_rate": 2.9252811812570415e-06, "loss": 0.0042, "step": 4898 }, { "epoch": 2.2288444040036395, "grad_norm": 0.436891729937903, "learning_rate": 2.9245769321229817e-06, "loss": 0.0067, "step": 4899 }, { "epoch": 2.229299363057325, "grad_norm": 0.5199638856267613, "learning_rate": 2.9238726482944134e-06, "loss": 0.0099, "step": 4900 }, { "epoch": 2.22975432211101, "grad_norm": 0.5747114934202581, "learning_rate": 2.9231683298288853e-06, "loss": 0.0171, "step": 4901 }, { "epoch": 2.2302092811646954, "grad_norm": 0.3596775839178632, "learning_rate": 2.922463976783953e-06, "loss": 0.0118, "step": 4902 }, { "epoch": 2.2306642402183803, "grad_norm": 0.8939798921759476, "learning_rate": 2.9217595892171724e-06, "loss": 0.0187, "step": 4903 }, { "epoch": 2.2311191992720656, "grad_norm": 0.5805711828852206, "learning_rate": 2.9210551671861016e-06, "loss": 0.0127, "step": 4904 }, { "epoch": 2.231574158325751, "grad_norm": 0.5822841335240057, "learning_rate": 2.9203507107483055e-06, "loss": 0.01, "step": 4905 }, { "epoch": 2.2320291173794358, "grad_norm": 0.32600437138534893, "learning_rate": 2.9196462199613473e-06, "loss": 0.0041, "step": 4906 }, { "epoch": 2.232484076433121, "grad_norm": 0.45762737886251953, "learning_rate": 2.9189416948827946e-06, "loss": 0.0081, "step": 4907 }, { "epoch": 2.2329390354868064, "grad_norm": 0.6624614988672972, "learning_rate": 2.9182371355702188e-06, "loss": 0.0134, "step": 4908 }, { "epoch": 2.2333939945404913, "grad_norm": 0.5775183169187995, "learning_rate": 2.917532542081193e-06, "loss": 0.0107, "step": 4909 }, { "epoch": 2.2338489535941766, "grad_norm": 0.5158090296026644, "learning_rate": 2.9168279144732936e-06, "loss": 0.0125, "step": 4910 }, { "epoch": 2.234303912647862, "grad_norm": 0.6474439945960965, "learning_rate": 2.916123252804099e-06, "loss": 0.0181, "step": 4911 }, { "epoch": 2.2347588717015467, "grad_norm": 0.5209679353911566, "learning_rate": 2.915418557131192e-06, "loss": 0.0151, "step": 4912 }, { "epoch": 2.235213830755232, "grad_norm": 0.5809706171715019, "learning_rate": 2.914713827512156e-06, "loss": 0.0129, "step": 4913 }, { "epoch": 2.2356687898089174, "grad_norm": 0.32035015477503426, "learning_rate": 2.914009064004578e-06, "loss": 0.0079, "step": 4914 }, { "epoch": 2.2361237488626022, "grad_norm": 0.6099868853119673, "learning_rate": 2.9133042666660505e-06, "loss": 0.0199, "step": 4915 }, { "epoch": 2.2365787079162875, "grad_norm": 0.31128710562512973, "learning_rate": 2.912599435554164e-06, "loss": 0.0066, "step": 4916 }, { "epoch": 2.237033666969973, "grad_norm": 0.35245851936428363, "learning_rate": 2.9118945707265154e-06, "loss": 0.0059, "step": 4917 }, { "epoch": 2.2374886260236577, "grad_norm": 0.6349414556237367, "learning_rate": 2.911189672240702e-06, "loss": 0.0173, "step": 4918 }, { "epoch": 2.237943585077343, "grad_norm": 0.3331397233283785, "learning_rate": 2.910484740154326e-06, "loss": 0.0034, "step": 4919 }, { "epoch": 2.2383985441310283, "grad_norm": 0.40324552187708196, "learning_rate": 2.909779774524991e-06, "loss": 0.0055, "step": 4920 }, { "epoch": 2.238853503184713, "grad_norm": 0.5067572867113999, "learning_rate": 2.9090747754103022e-06, "loss": 0.0094, "step": 4921 }, { "epoch": 2.2393084622383985, "grad_norm": 0.4532994042025974, "learning_rate": 2.9083697428678713e-06, "loss": 0.0119, "step": 4922 }, { "epoch": 2.239763421292084, "grad_norm": 0.5327769607592849, "learning_rate": 2.907664676955309e-06, "loss": 0.0123, "step": 4923 }, { "epoch": 2.2402183803457687, "grad_norm": 0.7666163169537017, "learning_rate": 2.90695957773023e-06, "loss": 0.015, "step": 4924 }, { "epoch": 2.240673339399454, "grad_norm": 0.4061581543065866, "learning_rate": 2.9062544452502515e-06, "loss": 0.0113, "step": 4925 }, { "epoch": 2.2411282984531393, "grad_norm": 0.24800405139437132, "learning_rate": 2.9055492795729954e-06, "loss": 0.0064, "step": 4926 }, { "epoch": 2.241583257506824, "grad_norm": 0.45516901182521324, "learning_rate": 2.9048440807560836e-06, "loss": 0.0121, "step": 4927 }, { "epoch": 2.2420382165605095, "grad_norm": 0.49776553841820587, "learning_rate": 2.9041388488571416e-06, "loss": 0.0146, "step": 4928 }, { "epoch": 2.242493175614195, "grad_norm": 0.4507905281551957, "learning_rate": 2.9034335839337975e-06, "loss": 0.0117, "step": 4929 }, { "epoch": 2.2429481346678797, "grad_norm": 0.515008514671491, "learning_rate": 2.9027282860436833e-06, "loss": 0.0163, "step": 4930 }, { "epoch": 2.243403093721565, "grad_norm": 0.3999604722172576, "learning_rate": 2.902022955244432e-06, "loss": 0.011, "step": 4931 }, { "epoch": 2.2438580527752503, "grad_norm": 0.29122864798080406, "learning_rate": 2.90131759159368e-06, "loss": 0.0056, "step": 4932 }, { "epoch": 2.244313011828935, "grad_norm": 0.4920852194792672, "learning_rate": 2.9006121951490673e-06, "loss": 0.0148, "step": 4933 }, { "epoch": 2.2447679708826205, "grad_norm": 0.31226609969019886, "learning_rate": 2.899906765968235e-06, "loss": 0.0057, "step": 4934 }, { "epoch": 2.245222929936306, "grad_norm": 0.4108632439376612, "learning_rate": 2.8992013041088274e-06, "loss": 0.0063, "step": 4935 }, { "epoch": 2.245677888989991, "grad_norm": 0.3577502093786973, "learning_rate": 2.8984958096284927e-06, "loss": 0.0099, "step": 4936 }, { "epoch": 2.246132848043676, "grad_norm": 0.3289806489958052, "learning_rate": 2.8977902825848798e-06, "loss": 0.01, "step": 4937 }, { "epoch": 2.2465878070973613, "grad_norm": 0.5139194182545219, "learning_rate": 2.8970847230356414e-06, "loss": 0.0115, "step": 4938 }, { "epoch": 2.2470427661510466, "grad_norm": 0.7910430037025714, "learning_rate": 2.896379131038432e-06, "loss": 0.0244, "step": 4939 }, { "epoch": 2.2474977252047315, "grad_norm": 0.6900822662910876, "learning_rate": 2.8956735066509113e-06, "loss": 0.0187, "step": 4940 }, { "epoch": 2.2479526842584168, "grad_norm": 0.2767631676700871, "learning_rate": 2.8949678499307376e-06, "loss": 0.0046, "step": 4941 }, { "epoch": 2.248407643312102, "grad_norm": 0.17363538238512619, "learning_rate": 2.894262160935575e-06, "loss": 0.0036, "step": 4942 }, { "epoch": 2.248862602365787, "grad_norm": 0.20369875496428924, "learning_rate": 2.8935564397230885e-06, "loss": 0.0035, "step": 4943 }, { "epoch": 2.2493175614194723, "grad_norm": 0.500865300373409, "learning_rate": 2.892850686350948e-06, "loss": 0.0131, "step": 4944 }, { "epoch": 2.2497725204731576, "grad_norm": 0.6509447350720985, "learning_rate": 2.892144900876823e-06, "loss": 0.0201, "step": 4945 }, { "epoch": 2.2502274795268424, "grad_norm": 0.5794282869411242, "learning_rate": 2.8914390833583877e-06, "loss": 0.0207, "step": 4946 }, { "epoch": 2.2506824385805277, "grad_norm": 0.4665450242859773, "learning_rate": 2.8907332338533182e-06, "loss": 0.0118, "step": 4947 }, { "epoch": 2.251137397634213, "grad_norm": 0.40805612665811286, "learning_rate": 2.8900273524192936e-06, "loss": 0.0056, "step": 4948 }, { "epoch": 2.251592356687898, "grad_norm": 0.47599031565792477, "learning_rate": 2.889321439113995e-06, "loss": 0.0108, "step": 4949 }, { "epoch": 2.2520473157415832, "grad_norm": 0.37848120832825255, "learning_rate": 2.888615493995106e-06, "loss": 0.0081, "step": 4950 }, { "epoch": 2.2525022747952685, "grad_norm": 0.546307560214703, "learning_rate": 2.8879095171203147e-06, "loss": 0.0155, "step": 4951 }, { "epoch": 2.2529572338489534, "grad_norm": 0.29216017053127363, "learning_rate": 2.887203508547309e-06, "loss": 0.0079, "step": 4952 }, { "epoch": 2.2534121929026387, "grad_norm": 0.6419134984295131, "learning_rate": 2.886497468333781e-06, "loss": 0.0198, "step": 4953 }, { "epoch": 2.253867151956324, "grad_norm": 0.5444254682754072, "learning_rate": 2.8857913965374264e-06, "loss": 0.0141, "step": 4954 }, { "epoch": 2.2543221110100093, "grad_norm": 0.299726570432879, "learning_rate": 2.88508529321594e-06, "loss": 0.0036, "step": 4955 }, { "epoch": 2.254777070063694, "grad_norm": 0.5141261985319838, "learning_rate": 2.8843791584270226e-06, "loss": 0.0088, "step": 4956 }, { "epoch": 2.2552320291173795, "grad_norm": 0.413695564402205, "learning_rate": 2.8836729922283756e-06, "loss": 0.0067, "step": 4957 }, { "epoch": 2.255686988171065, "grad_norm": 0.43576047089646097, "learning_rate": 2.8829667946777058e-06, "loss": 0.0102, "step": 4958 }, { "epoch": 2.2561419472247497, "grad_norm": 0.38535964572773457, "learning_rate": 2.8822605658327184e-06, "loss": 0.0097, "step": 4959 }, { "epoch": 2.256596906278435, "grad_norm": 0.357577035254831, "learning_rate": 2.8815543057511232e-06, "loss": 0.0073, "step": 4960 }, { "epoch": 2.2570518653321203, "grad_norm": 0.6896481073292591, "learning_rate": 2.8808480144906344e-06, "loss": 0.022, "step": 4961 }, { "epoch": 2.257506824385805, "grad_norm": 0.35001418688309893, "learning_rate": 2.8801416921089642e-06, "loss": 0.0068, "step": 4962 }, { "epoch": 2.2579617834394905, "grad_norm": 0.5168702499866729, "learning_rate": 2.8794353386638324e-06, "loss": 0.0078, "step": 4963 }, { "epoch": 2.258416742493176, "grad_norm": 0.36846893536930403, "learning_rate": 2.8787289542129588e-06, "loss": 0.004, "step": 4964 }, { "epoch": 2.2588717015468607, "grad_norm": 0.24103859979095263, "learning_rate": 2.8780225388140648e-06, "loss": 0.0029, "step": 4965 }, { "epoch": 2.259326660600546, "grad_norm": 0.32191055522055706, "learning_rate": 2.8773160925248766e-06, "loss": 0.0122, "step": 4966 }, { "epoch": 2.2597816196542313, "grad_norm": 0.3725372269303876, "learning_rate": 2.87660961540312e-06, "loss": 0.009, "step": 4967 }, { "epoch": 2.260236578707916, "grad_norm": 0.4176707851237545, "learning_rate": 2.8759031075065276e-06, "loss": 0.0075, "step": 4968 }, { "epoch": 2.2606915377616015, "grad_norm": 0.19046117514753938, "learning_rate": 2.87519656889283e-06, "loss": 0.0022, "step": 4969 }, { "epoch": 2.261146496815287, "grad_norm": 0.4282576531889189, "learning_rate": 2.874489999619764e-06, "loss": 0.0091, "step": 4970 }, { "epoch": 2.2616014558689717, "grad_norm": 0.7946435018232343, "learning_rate": 2.8737833997450658e-06, "loss": 0.0204, "step": 4971 }, { "epoch": 2.262056414922657, "grad_norm": 0.35111004032221094, "learning_rate": 2.8730767693264765e-06, "loss": 0.0049, "step": 4972 }, { "epoch": 2.2625113739763423, "grad_norm": 0.4087701600562731, "learning_rate": 2.8723701084217388e-06, "loss": 0.009, "step": 4973 }, { "epoch": 2.262966333030027, "grad_norm": 0.42578423688027506, "learning_rate": 2.871663417088596e-06, "loss": 0.0084, "step": 4974 }, { "epoch": 2.2634212920837125, "grad_norm": 0.5763911889719867, "learning_rate": 2.8709566953847984e-06, "loss": 0.0202, "step": 4975 }, { "epoch": 2.2638762511373978, "grad_norm": 0.4857987004360374, "learning_rate": 2.870249943368095e-06, "loss": 0.0108, "step": 4976 }, { "epoch": 2.2643312101910826, "grad_norm": 0.7886178398324192, "learning_rate": 2.869543161096237e-06, "loss": 0.0129, "step": 4977 }, { "epoch": 2.264786169244768, "grad_norm": 0.4161963530696184, "learning_rate": 2.868836348626982e-06, "loss": 0.0082, "step": 4978 }, { "epoch": 2.2652411282984533, "grad_norm": 0.3994258271578249, "learning_rate": 2.8681295060180856e-06, "loss": 0.013, "step": 4979 }, { "epoch": 2.265696087352138, "grad_norm": 0.4904976926689533, "learning_rate": 2.8674226333273093e-06, "loss": 0.0076, "step": 4980 }, { "epoch": 2.2661510464058234, "grad_norm": 0.3993371390321355, "learning_rate": 2.866715730612414e-06, "loss": 0.0052, "step": 4981 }, { "epoch": 2.2666060054595087, "grad_norm": 0.42996163862953024, "learning_rate": 2.8660087979311647e-06, "loss": 0.0064, "step": 4982 }, { "epoch": 2.2670609645131936, "grad_norm": 0.34328052332045905, "learning_rate": 2.8653018353413305e-06, "loss": 0.0079, "step": 4983 }, { "epoch": 2.267515923566879, "grad_norm": 0.49395225812932875, "learning_rate": 2.8645948429006787e-06, "loss": 0.0109, "step": 4984 }, { "epoch": 2.2679708826205642, "grad_norm": 0.4976234021834686, "learning_rate": 2.863887820666984e-06, "loss": 0.0144, "step": 4985 }, { "epoch": 2.268425841674249, "grad_norm": 0.4622661393719153, "learning_rate": 2.863180768698019e-06, "loss": 0.0159, "step": 4986 }, { "epoch": 2.2688808007279344, "grad_norm": 0.36481139770383525, "learning_rate": 2.8624736870515624e-06, "loss": 0.007, "step": 4987 }, { "epoch": 2.2693357597816197, "grad_norm": 0.2421938903968581, "learning_rate": 2.8617665757853925e-06, "loss": 0.0046, "step": 4988 }, { "epoch": 2.2697907188353046, "grad_norm": 0.4417002215440212, "learning_rate": 2.8610594349572917e-06, "loss": 0.0064, "step": 4989 }, { "epoch": 2.27024567788899, "grad_norm": 0.6798297800859594, "learning_rate": 2.8603522646250453e-06, "loss": 0.0123, "step": 4990 }, { "epoch": 2.270700636942675, "grad_norm": 0.6797909424066655, "learning_rate": 2.859645064846438e-06, "loss": 0.0173, "step": 4991 }, { "epoch": 2.2711555959963605, "grad_norm": 0.4663157041204241, "learning_rate": 2.8589378356792607e-06, "loss": 0.0089, "step": 4992 }, { "epoch": 2.2716105550500454, "grad_norm": 0.42179576788293766, "learning_rate": 2.8582305771813047e-06, "loss": 0.009, "step": 4993 }, { "epoch": 2.2720655141037307, "grad_norm": 0.3434988233688158, "learning_rate": 2.857523289410363e-06, "loss": 0.0048, "step": 4994 }, { "epoch": 2.272520473157416, "grad_norm": 0.5456480845412741, "learning_rate": 2.8568159724242333e-06, "loss": 0.0103, "step": 4995 }, { "epoch": 2.272975432211101, "grad_norm": 0.9721381877376272, "learning_rate": 2.856108626280713e-06, "loss": 0.0313, "step": 4996 }, { "epoch": 2.273430391264786, "grad_norm": 0.3866410592454436, "learning_rate": 2.855401251037605e-06, "loss": 0.007, "step": 4997 }, { "epoch": 2.2738853503184715, "grad_norm": 0.6014892522679584, "learning_rate": 2.8546938467527106e-06, "loss": 0.0146, "step": 4998 }, { "epoch": 2.2743403093721564, "grad_norm": 0.39514634769634216, "learning_rate": 2.8539864134838374e-06, "loss": 0.0115, "step": 4999 }, { "epoch": 2.2747952684258417, "grad_norm": 0.3470368290285338, "learning_rate": 2.8532789512887936e-06, "loss": 0.0091, "step": 5000 }, { "epoch": 2.275250227479527, "grad_norm": 0.2841511663543976, "learning_rate": 2.8525714602253885e-06, "loss": 0.006, "step": 5001 }, { "epoch": 2.275705186533212, "grad_norm": 0.44825964738701857, "learning_rate": 2.851863940351436e-06, "loss": 0.0081, "step": 5002 }, { "epoch": 2.276160145586897, "grad_norm": 0.3881439159737794, "learning_rate": 2.851156391724751e-06, "loss": 0.0045, "step": 5003 }, { "epoch": 2.2766151046405825, "grad_norm": 0.3868599445015578, "learning_rate": 2.850448814403152e-06, "loss": 0.0078, "step": 5004 }, { "epoch": 2.2770700636942673, "grad_norm": 0.6225012116911042, "learning_rate": 2.8497412084444585e-06, "loss": 0.0209, "step": 5005 }, { "epoch": 2.2775250227479527, "grad_norm": 0.34886069995973906, "learning_rate": 2.849033573906493e-06, "loss": 0.0047, "step": 5006 }, { "epoch": 2.277979981801638, "grad_norm": 0.37202851144240745, "learning_rate": 2.8483259108470796e-06, "loss": 0.01, "step": 5007 }, { "epoch": 2.278434940855323, "grad_norm": 0.5303494906746413, "learning_rate": 2.8476182193240458e-06, "loss": 0.0142, "step": 5008 }, { "epoch": 2.278889899909008, "grad_norm": 0.5085050242790066, "learning_rate": 2.846910499395221e-06, "loss": 0.0106, "step": 5009 }, { "epoch": 2.2793448589626935, "grad_norm": 0.5163369362011253, "learning_rate": 2.846202751118437e-06, "loss": 0.0137, "step": 5010 }, { "epoch": 2.2797998180163788, "grad_norm": 0.5598962303420268, "learning_rate": 2.845494974551528e-06, "loss": 0.0125, "step": 5011 }, { "epoch": 2.2802547770700636, "grad_norm": 1.080069638214284, "learning_rate": 2.8447871697523294e-06, "loss": 0.0265, "step": 5012 }, { "epoch": 2.280709736123749, "grad_norm": 0.5169044931983727, "learning_rate": 2.84407933677868e-06, "loss": 0.0132, "step": 5013 }, { "epoch": 2.2811646951774343, "grad_norm": 0.24522072022088567, "learning_rate": 2.843371475688422e-06, "loss": 0.0054, "step": 5014 }, { "epoch": 2.281619654231119, "grad_norm": 0.3388246762073615, "learning_rate": 2.842663586539397e-06, "loss": 0.0066, "step": 5015 }, { "epoch": 2.2820746132848044, "grad_norm": 0.28126694745916075, "learning_rate": 2.841955669389451e-06, "loss": 0.007, "step": 5016 }, { "epoch": 2.2825295723384897, "grad_norm": 0.4049495488946689, "learning_rate": 2.8412477242964326e-06, "loss": 0.0092, "step": 5017 }, { "epoch": 2.2829845313921746, "grad_norm": 0.5582857560847186, "learning_rate": 2.840539751318191e-06, "loss": 0.0135, "step": 5018 }, { "epoch": 2.28343949044586, "grad_norm": 0.4863493620765645, "learning_rate": 2.8398317505125783e-06, "loss": 0.0133, "step": 5019 }, { "epoch": 2.2838944494995452, "grad_norm": 0.6543338965256119, "learning_rate": 2.8391237219374495e-06, "loss": 0.0103, "step": 5020 }, { "epoch": 2.28434940855323, "grad_norm": 0.3270448217318662, "learning_rate": 2.838415665650663e-06, "loss": 0.0047, "step": 5021 }, { "epoch": 2.2848043676069154, "grad_norm": 0.1779963277987861, "learning_rate": 2.837707581710075e-06, "loss": 0.0024, "step": 5022 }, { "epoch": 2.2852593266606007, "grad_norm": 0.4627362902613846, "learning_rate": 2.836999470173549e-06, "loss": 0.0097, "step": 5023 }, { "epoch": 2.2857142857142856, "grad_norm": 0.5792364196381978, "learning_rate": 2.8362913310989485e-06, "loss": 0.0134, "step": 5024 }, { "epoch": 2.286169244767971, "grad_norm": 0.5317089228591818, "learning_rate": 2.835583164544139e-06, "loss": 0.0168, "step": 5025 }, { "epoch": 2.286624203821656, "grad_norm": 0.4289605734161935, "learning_rate": 2.834874970566989e-06, "loss": 0.012, "step": 5026 }, { "epoch": 2.287079162875341, "grad_norm": 0.30693516940638793, "learning_rate": 2.834166749225368e-06, "loss": 0.0047, "step": 5027 }, { "epoch": 2.2875341219290264, "grad_norm": 0.708951656217826, "learning_rate": 2.83345850057715e-06, "loss": 0.0199, "step": 5028 }, { "epoch": 2.2879890809827117, "grad_norm": 0.4327713301861736, "learning_rate": 2.832750224680209e-06, "loss": 0.0078, "step": 5029 }, { "epoch": 2.2884440400363966, "grad_norm": 0.4178731031156556, "learning_rate": 2.8320419215924217e-06, "loss": 0.0087, "step": 5030 }, { "epoch": 2.288898999090082, "grad_norm": 0.5109568323829935, "learning_rate": 2.831333591371669e-06, "loss": 0.0122, "step": 5031 }, { "epoch": 2.289353958143767, "grad_norm": 0.41280530472259774, "learning_rate": 2.830625234075831e-06, "loss": 0.0097, "step": 5032 }, { "epoch": 2.289808917197452, "grad_norm": 0.35223256674505665, "learning_rate": 2.829916849762792e-06, "loss": 0.0065, "step": 5033 }, { "epoch": 2.2902638762511374, "grad_norm": 0.6146291816716855, "learning_rate": 2.8292084384904383e-06, "loss": 0.0282, "step": 5034 }, { "epoch": 2.2907188353048227, "grad_norm": 0.49685847330771127, "learning_rate": 2.8285000003166574e-06, "loss": 0.0105, "step": 5035 }, { "epoch": 2.2911737943585075, "grad_norm": 0.49255452402718264, "learning_rate": 2.8277915352993403e-06, "loss": 0.008, "step": 5036 }, { "epoch": 2.291628753412193, "grad_norm": 0.39898694723211847, "learning_rate": 2.8270830434963783e-06, "loss": 0.0111, "step": 5037 }, { "epoch": 2.292083712465878, "grad_norm": 0.4453703932554615, "learning_rate": 2.826374524965668e-06, "loss": 0.0115, "step": 5038 }, { "epoch": 2.292538671519563, "grad_norm": 0.8396386686419544, "learning_rate": 2.825665979765105e-06, "loss": 0.0225, "step": 5039 }, { "epoch": 2.2929936305732483, "grad_norm": 0.5616211737614395, "learning_rate": 2.8249574079525887e-06, "loss": 0.0131, "step": 5040 }, { "epoch": 2.2934485896269337, "grad_norm": 0.5223158347200144, "learning_rate": 2.824248809586021e-06, "loss": 0.0135, "step": 5041 }, { "epoch": 2.2939035486806185, "grad_norm": 0.4903733816864113, "learning_rate": 2.8235401847233045e-06, "loss": 0.0128, "step": 5042 }, { "epoch": 2.294358507734304, "grad_norm": 0.4171020883563259, "learning_rate": 2.822831533422346e-06, "loss": 0.0071, "step": 5043 }, { "epoch": 2.294813466787989, "grad_norm": 0.8445522438021933, "learning_rate": 2.8221228557410505e-06, "loss": 0.0294, "step": 5044 }, { "epoch": 2.295268425841674, "grad_norm": 0.5645184322626259, "learning_rate": 2.8214141517373324e-06, "loss": 0.0132, "step": 5045 }, { "epoch": 2.2957233848953593, "grad_norm": 0.48386402677777823, "learning_rate": 2.8207054214691e-06, "loss": 0.008, "step": 5046 }, { "epoch": 2.2961783439490446, "grad_norm": 0.3826716392057223, "learning_rate": 2.8199966649942683e-06, "loss": 0.0103, "step": 5047 }, { "epoch": 2.29663330300273, "grad_norm": 0.5659602397863409, "learning_rate": 2.8192878823707554e-06, "loss": 0.0145, "step": 5048 }, { "epoch": 2.297088262056415, "grad_norm": 0.2594271447040721, "learning_rate": 2.818579073656478e-06, "loss": 0.0052, "step": 5049 }, { "epoch": 2.2975432211101, "grad_norm": 0.3103335870306608, "learning_rate": 2.817870238909358e-06, "loss": 0.0048, "step": 5050 }, { "epoch": 2.2979981801637854, "grad_norm": 0.26035801660072555, "learning_rate": 2.817161378187317e-06, "loss": 0.0056, "step": 5051 }, { "epoch": 2.2984531392174703, "grad_norm": 0.2941831078173984, "learning_rate": 2.8164524915482804e-06, "loss": 0.0047, "step": 5052 }, { "epoch": 2.2989080982711556, "grad_norm": 0.5467414996743842, "learning_rate": 2.815743579050176e-06, "loss": 0.0151, "step": 5053 }, { "epoch": 2.299363057324841, "grad_norm": 0.4310245586465084, "learning_rate": 2.815034640750931e-06, "loss": 0.0066, "step": 5054 }, { "epoch": 2.299818016378526, "grad_norm": 0.5826319412086409, "learning_rate": 2.8143256767084785e-06, "loss": 0.0088, "step": 5055 }, { "epoch": 2.300272975432211, "grad_norm": 0.7216941706194081, "learning_rate": 2.8136166869807513e-06, "loss": 0.0194, "step": 5056 }, { "epoch": 2.3007279344858964, "grad_norm": 0.44541415922316624, "learning_rate": 2.812907671625685e-06, "loss": 0.0075, "step": 5057 }, { "epoch": 2.3011828935395813, "grad_norm": 0.6787367867724394, "learning_rate": 2.812198630701216e-06, "loss": 0.0101, "step": 5058 }, { "epoch": 2.3016378525932666, "grad_norm": 0.40157201505826196, "learning_rate": 2.811489564265285e-06, "loss": 0.0089, "step": 5059 }, { "epoch": 2.302092811646952, "grad_norm": 0.4130583839082059, "learning_rate": 2.810780472375834e-06, "loss": 0.0074, "step": 5060 }, { "epoch": 2.3025477707006368, "grad_norm": 0.4302035451394966, "learning_rate": 2.8100713550908053e-06, "loss": 0.0077, "step": 5061 }, { "epoch": 2.303002729754322, "grad_norm": 0.37085302529818753, "learning_rate": 2.8093622124681474e-06, "loss": 0.0079, "step": 5062 }, { "epoch": 2.3034576888080074, "grad_norm": 0.3888281844928918, "learning_rate": 2.808653044565805e-06, "loss": 0.0078, "step": 5063 }, { "epoch": 2.3039126478616927, "grad_norm": 0.40120043776649167, "learning_rate": 2.807943851441731e-06, "loss": 0.0075, "step": 5064 }, { "epoch": 2.3043676069153776, "grad_norm": 0.37158873238140877, "learning_rate": 2.807234633153875e-06, "loss": 0.0062, "step": 5065 }, { "epoch": 2.304822565969063, "grad_norm": 0.9184176426232945, "learning_rate": 2.8065253897601924e-06, "loss": 0.019, "step": 5066 }, { "epoch": 2.305277525022748, "grad_norm": 0.6363809512338725, "learning_rate": 2.8058161213186396e-06, "loss": 0.0123, "step": 5067 }, { "epoch": 2.305732484076433, "grad_norm": 0.36586206455042775, "learning_rate": 2.8051068278871746e-06, "loss": 0.007, "step": 5068 }, { "epoch": 2.3061874431301184, "grad_norm": 0.4892338165917108, "learning_rate": 2.804397509523757e-06, "loss": 0.0153, "step": 5069 }, { "epoch": 2.3066424021838037, "grad_norm": 0.41830371232395297, "learning_rate": 2.8036881662863497e-06, "loss": 0.0085, "step": 5070 }, { "epoch": 2.3070973612374885, "grad_norm": 0.4286213549094428, "learning_rate": 2.802978798232917e-06, "loss": 0.0157, "step": 5071 }, { "epoch": 2.307552320291174, "grad_norm": 0.4379946506258405, "learning_rate": 2.802269405421425e-06, "loss": 0.009, "step": 5072 }, { "epoch": 2.308007279344859, "grad_norm": 0.5902414194468019, "learning_rate": 2.801559987909842e-06, "loss": 0.014, "step": 5073 }, { "epoch": 2.308462238398544, "grad_norm": 0.3967229198192856, "learning_rate": 2.8008505457561393e-06, "loss": 0.0089, "step": 5074 }, { "epoch": 2.3089171974522293, "grad_norm": 0.5120330011270533, "learning_rate": 2.8001410790182876e-06, "loss": 0.0063, "step": 5075 }, { "epoch": 2.3093721565059147, "grad_norm": 0.4630626953611785, "learning_rate": 2.799431587754263e-06, "loss": 0.0092, "step": 5076 }, { "epoch": 2.3098271155595995, "grad_norm": 0.38431503998691946, "learning_rate": 2.7987220720220415e-06, "loss": 0.0068, "step": 5077 }, { "epoch": 2.310282074613285, "grad_norm": 0.39490910020207387, "learning_rate": 2.798012531879601e-06, "loss": 0.0129, "step": 5078 }, { "epoch": 2.31073703366697, "grad_norm": 0.28776050466886344, "learning_rate": 2.7973029673849224e-06, "loss": 0.0082, "step": 5079 }, { "epoch": 2.311191992720655, "grad_norm": 0.4426686795650058, "learning_rate": 2.796593378595987e-06, "loss": 0.0071, "step": 5080 }, { "epoch": 2.3116469517743403, "grad_norm": 0.5777403032518409, "learning_rate": 2.7958837655707817e-06, "loss": 0.0096, "step": 5081 }, { "epoch": 2.3121019108280256, "grad_norm": 0.4290725307729291, "learning_rate": 2.79517412836729e-06, "loss": 0.0072, "step": 5082 }, { "epoch": 2.3125568698817105, "grad_norm": 0.34836307427298663, "learning_rate": 2.7944644670435015e-06, "loss": 0.0066, "step": 5083 }, { "epoch": 2.313011828935396, "grad_norm": 0.7859551118711383, "learning_rate": 2.7937547816574073e-06, "loss": 0.0126, "step": 5084 }, { "epoch": 2.313466787989081, "grad_norm": 0.7444453393348858, "learning_rate": 2.793045072266999e-06, "loss": 0.0256, "step": 5085 }, { "epoch": 2.313921747042766, "grad_norm": 0.6188142158105041, "learning_rate": 2.79233533893027e-06, "loss": 0.0115, "step": 5086 }, { "epoch": 2.3143767060964513, "grad_norm": 0.3310571445826078, "learning_rate": 2.791625581705218e-06, "loss": 0.0048, "step": 5087 }, { "epoch": 2.3148316651501366, "grad_norm": 0.5270626605315613, "learning_rate": 2.79091580064984e-06, "loss": 0.0124, "step": 5088 }, { "epoch": 2.3152866242038215, "grad_norm": 0.5819220248505045, "learning_rate": 2.7902059958221363e-06, "loss": 0.0149, "step": 5089 }, { "epoch": 2.315741583257507, "grad_norm": 0.5029319006176026, "learning_rate": 2.7894961672801095e-06, "loss": 0.01, "step": 5090 }, { "epoch": 2.316196542311192, "grad_norm": 0.4062522108619744, "learning_rate": 2.7887863150817636e-06, "loss": 0.0082, "step": 5091 }, { "epoch": 2.316651501364877, "grad_norm": 0.29588811413931976, "learning_rate": 2.788076439285104e-06, "loss": 0.0049, "step": 5092 }, { "epoch": 2.3171064604185623, "grad_norm": 0.6361646912756277, "learning_rate": 2.7873665399481382e-06, "loss": 0.0091, "step": 5093 }, { "epoch": 2.3175614194722476, "grad_norm": 0.4934540652022836, "learning_rate": 2.7866566171288773e-06, "loss": 0.0072, "step": 5094 }, { "epoch": 2.3180163785259325, "grad_norm": 0.5148642615398799, "learning_rate": 2.7859466708853315e-06, "loss": 0.0098, "step": 5095 }, { "epoch": 2.3184713375796178, "grad_norm": 0.48166862295715934, "learning_rate": 2.785236701275515e-06, "loss": 0.0083, "step": 5096 }, { "epoch": 2.318926296633303, "grad_norm": 0.6650392661562431, "learning_rate": 2.7845267083574433e-06, "loss": 0.0204, "step": 5097 }, { "epoch": 2.319381255686988, "grad_norm": 0.5292250743723036, "learning_rate": 2.783816692189135e-06, "loss": 0.0124, "step": 5098 }, { "epoch": 2.3198362147406733, "grad_norm": 0.516822884495865, "learning_rate": 2.7831066528286075e-06, "loss": 0.0142, "step": 5099 }, { "epoch": 2.3202911737943586, "grad_norm": 0.598103392532007, "learning_rate": 2.782396590333883e-06, "loss": 0.0122, "step": 5100 }, { "epoch": 2.3207461328480434, "grad_norm": 9.314088470661096, "learning_rate": 2.781686504762985e-06, "loss": 0.1975, "step": 5101 }, { "epoch": 2.3212010919017287, "grad_norm": 0.27887220000373664, "learning_rate": 2.7809763961739366e-06, "loss": 0.0048, "step": 5102 }, { "epoch": 2.321656050955414, "grad_norm": 0.4641872074627549, "learning_rate": 2.7802662646247667e-06, "loss": 0.012, "step": 5103 }, { "epoch": 2.3221110100090994, "grad_norm": 0.7567341987956407, "learning_rate": 2.7795561101735035e-06, "loss": 0.0231, "step": 5104 }, { "epoch": 2.3225659690627842, "grad_norm": 0.3523017854180219, "learning_rate": 2.7788459328781777e-06, "loss": 0.0054, "step": 5105 }, { "epoch": 2.3230209281164695, "grad_norm": 0.5098098897391048, "learning_rate": 2.778135732796821e-06, "loss": 0.0163, "step": 5106 }, { "epoch": 2.323475887170155, "grad_norm": 0.3547297461994853, "learning_rate": 2.7774255099874676e-06, "loss": 0.005, "step": 5107 }, { "epoch": 2.3239308462238397, "grad_norm": 0.2811446138158197, "learning_rate": 2.7767152645081557e-06, "loss": 0.0039, "step": 5108 }, { "epoch": 2.324385805277525, "grad_norm": 0.260637705589396, "learning_rate": 2.776004996416921e-06, "loss": 0.0048, "step": 5109 }, { "epoch": 2.3248407643312103, "grad_norm": 0.4457222997775194, "learning_rate": 2.775294705771805e-06, "loss": 0.0116, "step": 5110 }, { "epoch": 2.325295723384895, "grad_norm": 0.4278032944080649, "learning_rate": 2.774584392630849e-06, "loss": 0.0085, "step": 5111 }, { "epoch": 2.3257506824385805, "grad_norm": 0.43844031416094936, "learning_rate": 2.773874057052096e-06, "loss": 0.0088, "step": 5112 }, { "epoch": 2.326205641492266, "grad_norm": 0.4380326602332772, "learning_rate": 2.773163699093592e-06, "loss": 0.0053, "step": 5113 }, { "epoch": 2.3266606005459507, "grad_norm": 0.5008042920864526, "learning_rate": 2.772453318813384e-06, "loss": 0.0144, "step": 5114 }, { "epoch": 2.327115559599636, "grad_norm": 0.38856370347777197, "learning_rate": 2.7717429162695215e-06, "loss": 0.0063, "step": 5115 }, { "epoch": 2.3275705186533213, "grad_norm": 0.3697743637196574, "learning_rate": 2.771032491520055e-06, "loss": 0.0059, "step": 5116 }, { "epoch": 2.328025477707006, "grad_norm": 0.35371526238996265, "learning_rate": 2.7703220446230367e-06, "loss": 0.0108, "step": 5117 }, { "epoch": 2.3284804367606915, "grad_norm": 0.5986686798498362, "learning_rate": 2.7696115756365227e-06, "loss": 0.0061, "step": 5118 }, { "epoch": 2.328935395814377, "grad_norm": 0.43528023707172014, "learning_rate": 2.768901084618567e-06, "loss": 0.0065, "step": 5119 }, { "epoch": 2.329390354868062, "grad_norm": 0.28989376230150143, "learning_rate": 2.7681905716272307e-06, "loss": 0.0066, "step": 5120 }, { "epoch": 2.329845313921747, "grad_norm": 0.5822196632747476, "learning_rate": 2.7674800367205707e-06, "loss": 0.0114, "step": 5121 }, { "epoch": 2.3303002729754323, "grad_norm": 0.4399736059646962, "learning_rate": 2.7667694799566503e-06, "loss": 0.0055, "step": 5122 }, { "epoch": 2.3307552320291176, "grad_norm": 0.402823182334619, "learning_rate": 2.7660589013935327e-06, "loss": 0.0086, "step": 5123 }, { "epoch": 2.3312101910828025, "grad_norm": 0.37424606425758444, "learning_rate": 2.765348301089283e-06, "loss": 0.007, "step": 5124 }, { "epoch": 2.331665150136488, "grad_norm": 0.4324889658193771, "learning_rate": 2.764637679101969e-06, "loss": 0.0096, "step": 5125 }, { "epoch": 2.332120109190173, "grad_norm": 0.4298264924987423, "learning_rate": 2.7639270354896586e-06, "loss": 0.0059, "step": 5126 }, { "epoch": 2.332575068243858, "grad_norm": 0.4760094311938289, "learning_rate": 2.763216370310423e-06, "loss": 0.0127, "step": 5127 }, { "epoch": 2.3330300272975433, "grad_norm": 0.47765942596345407, "learning_rate": 2.762505683622334e-06, "loss": 0.0105, "step": 5128 }, { "epoch": 2.3334849863512286, "grad_norm": 0.2863577440078461, "learning_rate": 2.761794975483466e-06, "loss": 0.0062, "step": 5129 }, { "epoch": 2.3339399454049135, "grad_norm": 0.5695824945577267, "learning_rate": 2.7610842459518957e-06, "loss": 0.0092, "step": 5130 }, { "epoch": 2.3343949044585988, "grad_norm": 0.3436164809763099, "learning_rate": 2.760373495085698e-06, "loss": 0.005, "step": 5131 }, { "epoch": 2.334849863512284, "grad_norm": 0.5354573399259746, "learning_rate": 2.7596627229429557e-06, "loss": 0.0102, "step": 5132 }, { "epoch": 2.335304822565969, "grad_norm": 0.48306575398438, "learning_rate": 2.758951929581748e-06, "loss": 0.0126, "step": 5133 }, { "epoch": 2.3357597816196543, "grad_norm": 0.4445771980141461, "learning_rate": 2.758241115060158e-06, "loss": 0.0082, "step": 5134 }, { "epoch": 2.3362147406733396, "grad_norm": 0.5586676033398107, "learning_rate": 2.7575302794362704e-06, "loss": 0.0082, "step": 5135 }, { "epoch": 2.3366696997270244, "grad_norm": 0.566837515943476, "learning_rate": 2.7568194227681703e-06, "loss": 0.0065, "step": 5136 }, { "epoch": 2.3371246587807097, "grad_norm": 0.3869677492981024, "learning_rate": 2.756108545113948e-06, "loss": 0.0034, "step": 5137 }, { "epoch": 2.337579617834395, "grad_norm": 0.46123101492162843, "learning_rate": 2.7553976465316915e-06, "loss": 0.008, "step": 5138 }, { "epoch": 2.33803457688808, "grad_norm": 0.7544701454984287, "learning_rate": 2.754686727079493e-06, "loss": 0.0072, "step": 5139 }, { "epoch": 2.3384895359417652, "grad_norm": 0.6428787360636244, "learning_rate": 2.7539757868154452e-06, "loss": 0.0121, "step": 5140 }, { "epoch": 2.3389444949954505, "grad_norm": 0.3089488279841168, "learning_rate": 2.753264825797643e-06, "loss": 0.0055, "step": 5141 }, { "epoch": 2.3393994540491354, "grad_norm": 0.6935754883861309, "learning_rate": 2.7525538440841828e-06, "loss": 0.0144, "step": 5142 }, { "epoch": 2.3398544131028207, "grad_norm": 0.688738027980075, "learning_rate": 2.751842841733163e-06, "loss": 0.0174, "step": 5143 }, { "epoch": 2.340309372156506, "grad_norm": 0.6316808843566837, "learning_rate": 2.751131818802684e-06, "loss": 0.0198, "step": 5144 }, { "epoch": 2.340764331210191, "grad_norm": 0.7729484813236736, "learning_rate": 2.750420775350846e-06, "loss": 0.021, "step": 5145 }, { "epoch": 2.341219290263876, "grad_norm": 0.5065002803779283, "learning_rate": 2.749709711435753e-06, "loss": 0.0094, "step": 5146 }, { "epoch": 2.3416742493175615, "grad_norm": 0.6043705999852397, "learning_rate": 2.7489986271155112e-06, "loss": 0.0221, "step": 5147 }, { "epoch": 2.3421292083712464, "grad_norm": 0.7988864911066894, "learning_rate": 2.748287522448225e-06, "loss": 0.0139, "step": 5148 }, { "epoch": 2.3425841674249317, "grad_norm": 0.260309475815787, "learning_rate": 2.7475763974920046e-06, "loss": 0.0049, "step": 5149 }, { "epoch": 2.343039126478617, "grad_norm": 0.3023993403697899, "learning_rate": 2.746865252304958e-06, "loss": 0.0074, "step": 5150 }, { "epoch": 2.343494085532302, "grad_norm": 0.5675631901512953, "learning_rate": 2.746154086945199e-06, "loss": 0.0143, "step": 5151 }, { "epoch": 2.343949044585987, "grad_norm": 0.4667915956702053, "learning_rate": 2.745442901470839e-06, "loss": 0.0086, "step": 5152 }, { "epoch": 2.3444040036396725, "grad_norm": 0.5858577676671044, "learning_rate": 2.744731695939993e-06, "loss": 0.0124, "step": 5153 }, { "epoch": 2.3448589626933574, "grad_norm": 0.6160340168044767, "learning_rate": 2.744020470410779e-06, "loss": 0.0154, "step": 5154 }, { "epoch": 2.3453139217470427, "grad_norm": 0.5690402362792831, "learning_rate": 2.743309224941314e-06, "loss": 0.015, "step": 5155 }, { "epoch": 2.345768880800728, "grad_norm": 0.2951210180379065, "learning_rate": 2.742597959589717e-06, "loss": 0.0039, "step": 5156 }, { "epoch": 2.3462238398544133, "grad_norm": 0.31590775159033335, "learning_rate": 2.741886674414112e-06, "loss": 0.0075, "step": 5157 }, { "epoch": 2.346678798908098, "grad_norm": 0.41424468377805007, "learning_rate": 2.741175369472619e-06, "loss": 0.015, "step": 5158 }, { "epoch": 2.3471337579617835, "grad_norm": 0.6072192018019458, "learning_rate": 2.7404640448233637e-06, "loss": 0.0177, "step": 5159 }, { "epoch": 2.347588717015469, "grad_norm": 0.3872114041749146, "learning_rate": 2.7397527005244734e-06, "loss": 0.0076, "step": 5160 }, { "epoch": 2.3480436760691537, "grad_norm": 0.4256522019168811, "learning_rate": 2.7390413366340753e-06, "loss": 0.0087, "step": 5161 }, { "epoch": 2.348498635122839, "grad_norm": 0.48964461397747205, "learning_rate": 2.738329953210298e-06, "loss": 0.0089, "step": 5162 }, { "epoch": 2.3489535941765243, "grad_norm": 0.2122859294631207, "learning_rate": 2.7376185503112728e-06, "loss": 0.0033, "step": 5163 }, { "epoch": 2.349408553230209, "grad_norm": 0.6138154023276224, "learning_rate": 2.7369071279951342e-06, "loss": 0.0151, "step": 5164 }, { "epoch": 2.3498635122838945, "grad_norm": 0.3533121238883642, "learning_rate": 2.736195686320014e-06, "loss": 0.0087, "step": 5165 }, { "epoch": 2.3503184713375798, "grad_norm": 0.4844223917455224, "learning_rate": 2.735484225344049e-06, "loss": 0.0116, "step": 5166 }, { "epoch": 2.3507734303912646, "grad_norm": 1.0487782302267374, "learning_rate": 2.7347727451253763e-06, "loss": 0.018, "step": 5167 }, { "epoch": 2.35122838944495, "grad_norm": 0.5098256851057201, "learning_rate": 2.7340612457221355e-06, "loss": 0.0114, "step": 5168 }, { "epoch": 2.3516833484986353, "grad_norm": 0.36079840111210715, "learning_rate": 2.733349727192467e-06, "loss": 0.0113, "step": 5169 }, { "epoch": 2.35213830755232, "grad_norm": 0.8024765569161492, "learning_rate": 2.732638189594512e-06, "loss": 0.0241, "step": 5170 }, { "epoch": 2.3525932666060054, "grad_norm": 0.45238366640797906, "learning_rate": 2.7319266329864153e-06, "loss": 0.008, "step": 5171 }, { "epoch": 2.3530482256596907, "grad_norm": 0.4188426434904233, "learning_rate": 2.7312150574263207e-06, "loss": 0.0103, "step": 5172 }, { "epoch": 2.3535031847133756, "grad_norm": 0.19965077478017915, "learning_rate": 2.7305034629723765e-06, "loss": 0.0051, "step": 5173 }, { "epoch": 2.353958143767061, "grad_norm": 0.5849676392347655, "learning_rate": 2.7297918496827302e-06, "loss": 0.021, "step": 5174 }, { "epoch": 2.3544131028207462, "grad_norm": 0.3567783786340201, "learning_rate": 2.729080217615531e-06, "loss": 0.0078, "step": 5175 }, { "epoch": 2.3548680618744315, "grad_norm": 0.2346811304690993, "learning_rate": 2.7283685668289324e-06, "loss": 0.0041, "step": 5176 }, { "epoch": 2.3553230209281164, "grad_norm": 0.5627452994635612, "learning_rate": 2.7276568973810835e-06, "loss": 0.012, "step": 5177 }, { "epoch": 2.3557779799818017, "grad_norm": 0.3792351489954144, "learning_rate": 2.726945209330143e-06, "loss": 0.0077, "step": 5178 }, { "epoch": 2.356232939035487, "grad_norm": 0.4914377674402613, "learning_rate": 2.726233502734264e-06, "loss": 0.0156, "step": 5179 }, { "epoch": 2.356687898089172, "grad_norm": 0.47723249723853356, "learning_rate": 2.725521777651605e-06, "loss": 0.0178, "step": 5180 }, { "epoch": 2.357142857142857, "grad_norm": 0.4010905166704799, "learning_rate": 2.724810034140325e-06, "loss": 0.0149, "step": 5181 }, { "epoch": 2.3575978161965425, "grad_norm": 0.5081915101489938, "learning_rate": 2.724098272258584e-06, "loss": 0.0114, "step": 5182 }, { "epoch": 2.3580527752502274, "grad_norm": 0.475385014087324, "learning_rate": 2.723386492064545e-06, "loss": 0.0148, "step": 5183 }, { "epoch": 2.3585077343039127, "grad_norm": 0.5835563863719301, "learning_rate": 2.722674693616369e-06, "loss": 0.0124, "step": 5184 }, { "epoch": 2.358962693357598, "grad_norm": 0.5615664828704214, "learning_rate": 2.721962876972224e-06, "loss": 0.0151, "step": 5185 }, { "epoch": 2.359417652411283, "grad_norm": 0.2981928288585561, "learning_rate": 2.7212510421902743e-06, "loss": 0.006, "step": 5186 }, { "epoch": 2.359872611464968, "grad_norm": 0.596855340133716, "learning_rate": 2.7205391893286892e-06, "loss": 0.0116, "step": 5187 }, { "epoch": 2.3603275705186535, "grad_norm": 0.3571801463387938, "learning_rate": 2.7198273184456376e-06, "loss": 0.0104, "step": 5188 }, { "epoch": 2.3607825295723384, "grad_norm": 0.4877286043034149, "learning_rate": 2.7191154295992893e-06, "loss": 0.013, "step": 5189 }, { "epoch": 2.3612374886260237, "grad_norm": 0.24156948108163614, "learning_rate": 2.718403522847819e-06, "loss": 0.0043, "step": 5190 }, { "epoch": 2.361692447679709, "grad_norm": 0.40187998602960306, "learning_rate": 2.7176915982493975e-06, "loss": 0.0086, "step": 5191 }, { "epoch": 2.362147406733394, "grad_norm": 0.37736189370658485, "learning_rate": 2.716979655862203e-06, "loss": 0.0077, "step": 5192 }, { "epoch": 2.362602365787079, "grad_norm": 0.43404029024470286, "learning_rate": 2.7162676957444106e-06, "loss": 0.01, "step": 5193 }, { "epoch": 2.3630573248407645, "grad_norm": 0.3661631553607755, "learning_rate": 2.715555717954198e-06, "loss": 0.0096, "step": 5194 }, { "epoch": 2.3635122838944493, "grad_norm": 0.2524483199822068, "learning_rate": 2.7148437225497466e-06, "loss": 0.007, "step": 5195 }, { "epoch": 2.3639672429481347, "grad_norm": 0.39837570439498277, "learning_rate": 2.7141317095892356e-06, "loss": 0.0087, "step": 5196 }, { "epoch": 2.36442220200182, "grad_norm": 0.33302905588093007, "learning_rate": 2.7134196791308493e-06, "loss": 0.0099, "step": 5197 }, { "epoch": 2.364877161055505, "grad_norm": 0.24660713853698385, "learning_rate": 2.7127076312327695e-06, "loss": 0.0059, "step": 5198 }, { "epoch": 2.36533212010919, "grad_norm": 0.2898827470494871, "learning_rate": 2.711995565953183e-06, "loss": 0.0082, "step": 5199 }, { "epoch": 2.3657870791628755, "grad_norm": 0.5895624957111918, "learning_rate": 2.7112834833502766e-06, "loss": 0.0142, "step": 5200 }, { "epoch": 2.3662420382165603, "grad_norm": 0.4761000326916397, "learning_rate": 2.7105713834822374e-06, "loss": 0.0095, "step": 5201 }, { "epoch": 2.3666969972702456, "grad_norm": 0.4150882241675566, "learning_rate": 2.7098592664072563e-06, "loss": 0.0101, "step": 5202 }, { "epoch": 2.367151956323931, "grad_norm": 0.4388486887563971, "learning_rate": 2.709147132183523e-06, "loss": 0.0089, "step": 5203 }, { "epoch": 2.367606915377616, "grad_norm": 0.3201779460673347, "learning_rate": 2.7084349808692316e-06, "loss": 0.0078, "step": 5204 }, { "epoch": 2.368061874431301, "grad_norm": 0.6573248405322761, "learning_rate": 2.707722812522574e-06, "loss": 0.0139, "step": 5205 }, { "epoch": 2.3685168334849864, "grad_norm": 0.3450059227051716, "learning_rate": 2.7070106272017465e-06, "loss": 0.0084, "step": 5206 }, { "epoch": 2.3689717925386713, "grad_norm": 0.3067533544162796, "learning_rate": 2.706298424964946e-06, "loss": 0.0065, "step": 5207 }, { "epoch": 2.3694267515923566, "grad_norm": 0.5548141187778437, "learning_rate": 2.7055862058703685e-06, "loss": 0.0107, "step": 5208 }, { "epoch": 2.369881710646042, "grad_norm": 0.5612587117789055, "learning_rate": 2.704873969976216e-06, "loss": 0.0176, "step": 5209 }, { "epoch": 2.370336669699727, "grad_norm": 0.6628202251021884, "learning_rate": 2.7041617173406875e-06, "loss": 0.0177, "step": 5210 }, { "epoch": 2.370791628753412, "grad_norm": 0.40315497365980635, "learning_rate": 2.703449448021985e-06, "loss": 0.0087, "step": 5211 }, { "epoch": 2.3712465878070974, "grad_norm": 0.36051830048698236, "learning_rate": 2.7027371620783127e-06, "loss": 0.0051, "step": 5212 }, { "epoch": 2.3717015468607827, "grad_norm": 0.4193680160013738, "learning_rate": 2.7020248595678744e-06, "loss": 0.0072, "step": 5213 }, { "epoch": 2.3721565059144676, "grad_norm": 0.3235213418299606, "learning_rate": 2.7013125405488782e-06, "loss": 0.0066, "step": 5214 }, { "epoch": 2.372611464968153, "grad_norm": 0.4725933822080339, "learning_rate": 2.7006002050795294e-06, "loss": 0.0081, "step": 5215 }, { "epoch": 2.373066424021838, "grad_norm": 0.45518658545183927, "learning_rate": 2.6998878532180378e-06, "loss": 0.0086, "step": 5216 }, { "epoch": 2.373521383075523, "grad_norm": 0.2548752297852108, "learning_rate": 2.6991754850226143e-06, "loss": 0.0051, "step": 5217 }, { "epoch": 2.3739763421292084, "grad_norm": 0.30285490363996487, "learning_rate": 2.6984631005514685e-06, "loss": 0.0037, "step": 5218 }, { "epoch": 2.3744313011828937, "grad_norm": 0.30229545731242075, "learning_rate": 2.697750699862815e-06, "loss": 0.0071, "step": 5219 }, { "epoch": 2.3748862602365786, "grad_norm": 0.4895056553995496, "learning_rate": 2.6970382830148665e-06, "loss": 0.0133, "step": 5220 }, { "epoch": 2.375341219290264, "grad_norm": 0.48936299523109655, "learning_rate": 2.6963258500658406e-06, "loss": 0.0126, "step": 5221 }, { "epoch": 2.375796178343949, "grad_norm": 0.6193397123916056, "learning_rate": 2.695613401073952e-06, "loss": 0.0163, "step": 5222 }, { "epoch": 2.376251137397634, "grad_norm": 0.5244624545775332, "learning_rate": 2.69490093609742e-06, "loss": 0.0087, "step": 5223 }, { "epoch": 2.3767060964513194, "grad_norm": 0.3317536957111131, "learning_rate": 2.694188455194464e-06, "loss": 0.0097, "step": 5224 }, { "epoch": 2.3771610555050047, "grad_norm": 0.4539573574290135, "learning_rate": 2.693475958423304e-06, "loss": 0.0116, "step": 5225 }, { "epoch": 2.3776160145586895, "grad_norm": 0.5831988123848356, "learning_rate": 2.692763445842162e-06, "loss": 0.0142, "step": 5226 }, { "epoch": 2.378070973612375, "grad_norm": 0.49470580849079343, "learning_rate": 2.6920509175092622e-06, "loss": 0.0078, "step": 5227 }, { "epoch": 2.37852593266606, "grad_norm": 0.7059656684742778, "learning_rate": 2.6913383734828293e-06, "loss": 0.0191, "step": 5228 }, { "epoch": 2.3789808917197455, "grad_norm": 0.3633096326357653, "learning_rate": 2.690625813821087e-06, "loss": 0.0076, "step": 5229 }, { "epoch": 2.3794358507734303, "grad_norm": 0.3463790440117242, "learning_rate": 2.689913238582265e-06, "loss": 0.006, "step": 5230 }, { "epoch": 2.3798908098271156, "grad_norm": 0.6752740671545027, "learning_rate": 2.689200647824591e-06, "loss": 0.0204, "step": 5231 }, { "epoch": 2.380345768880801, "grad_norm": 0.43573383156412393, "learning_rate": 2.6884880416062942e-06, "loss": 0.0067, "step": 5232 }, { "epoch": 2.380800727934486, "grad_norm": 0.3642299175381693, "learning_rate": 2.687775419985606e-06, "loss": 0.0061, "step": 5233 }, { "epoch": 2.381255686988171, "grad_norm": 0.3240169060849022, "learning_rate": 2.6870627830207585e-06, "loss": 0.0078, "step": 5234 }, { "epoch": 2.3817106460418564, "grad_norm": 0.7412416564886055, "learning_rate": 2.686350130769985e-06, "loss": 0.0163, "step": 5235 }, { "epoch": 2.3821656050955413, "grad_norm": 0.24238455971366532, "learning_rate": 2.68563746329152e-06, "loss": 0.0029, "step": 5236 }, { "epoch": 2.3826205641492266, "grad_norm": 0.3952084208852954, "learning_rate": 2.6849247806436e-06, "loss": 0.007, "step": 5237 }, { "epoch": 2.383075523202912, "grad_norm": 0.5482699637315167, "learning_rate": 2.6842120828844625e-06, "loss": 0.0148, "step": 5238 }, { "epoch": 2.383530482256597, "grad_norm": 0.6253891984731148, "learning_rate": 2.6834993700723454e-06, "loss": 0.0209, "step": 5239 }, { "epoch": 2.383985441310282, "grad_norm": 0.6500433404262409, "learning_rate": 2.682786642265488e-06, "loss": 0.0235, "step": 5240 }, { "epoch": 2.3844404003639674, "grad_norm": 0.6776593718569566, "learning_rate": 2.6820738995221323e-06, "loss": 0.0235, "step": 5241 }, { "epoch": 2.3848953594176523, "grad_norm": 0.31508128523394174, "learning_rate": 2.681361141900519e-06, "loss": 0.0048, "step": 5242 }, { "epoch": 2.3853503184713376, "grad_norm": 0.2929976938739907, "learning_rate": 2.6806483694588926e-06, "loss": 0.0075, "step": 5243 }, { "epoch": 2.385805277525023, "grad_norm": 0.852277417305211, "learning_rate": 2.6799355822554974e-06, "loss": 0.0238, "step": 5244 }, { "epoch": 2.386260236578708, "grad_norm": 0.5595580299153871, "learning_rate": 2.6792227803485788e-06, "loss": 0.0127, "step": 5245 }, { "epoch": 2.386715195632393, "grad_norm": 0.43894445273312555, "learning_rate": 2.6785099637963847e-06, "loss": 0.0079, "step": 5246 }, { "epoch": 2.3871701546860784, "grad_norm": 0.36007438431040656, "learning_rate": 2.6777971326571605e-06, "loss": 0.0087, "step": 5247 }, { "epoch": 2.3876251137397633, "grad_norm": 0.28023898643097817, "learning_rate": 2.6770842869891593e-06, "loss": 0.0049, "step": 5248 }, { "epoch": 2.3880800727934486, "grad_norm": 0.3921539513262492, "learning_rate": 2.6763714268506297e-06, "loss": 0.0062, "step": 5249 }, { "epoch": 2.388535031847134, "grad_norm": 0.3937830852741565, "learning_rate": 2.675658552299823e-06, "loss": 0.012, "step": 5250 }, { "epoch": 2.3889899909008188, "grad_norm": 0.6367934674814292, "learning_rate": 2.6749456633949932e-06, "loss": 0.0121, "step": 5251 }, { "epoch": 2.389444949954504, "grad_norm": 0.3747580878047549, "learning_rate": 2.6742327601943936e-06, "loss": 0.0088, "step": 5252 }, { "epoch": 2.3898999090081894, "grad_norm": 0.5577456307739629, "learning_rate": 2.67351984275628e-06, "loss": 0.0156, "step": 5253 }, { "epoch": 2.3903548680618742, "grad_norm": 0.5315542301898049, "learning_rate": 2.6728069111389073e-06, "loss": 0.0157, "step": 5254 }, { "epoch": 2.3908098271155596, "grad_norm": 0.44568830399854736, "learning_rate": 2.672093965400536e-06, "loss": 0.0136, "step": 5255 }, { "epoch": 2.391264786169245, "grad_norm": 0.3753442962357679, "learning_rate": 2.6713810055994215e-06, "loss": 0.0069, "step": 5256 }, { "epoch": 2.3917197452229297, "grad_norm": 0.46781841650338424, "learning_rate": 2.6706680317938256e-06, "loss": 0.0111, "step": 5257 }, { "epoch": 2.392174704276615, "grad_norm": 0.3323765437558547, "learning_rate": 2.6699550440420093e-06, "loss": 0.0063, "step": 5258 }, { "epoch": 2.3926296633303004, "grad_norm": 0.4318778966975109, "learning_rate": 2.6692420424022335e-06, "loss": 0.0079, "step": 5259 }, { "epoch": 2.3930846223839852, "grad_norm": 0.7128880562086624, "learning_rate": 2.6685290269327637e-06, "loss": 0.0202, "step": 5260 }, { "epoch": 2.3935395814376705, "grad_norm": 0.7202044198070968, "learning_rate": 2.667815997691861e-06, "loss": 0.0177, "step": 5261 }, { "epoch": 2.393994540491356, "grad_norm": 0.8278941975649131, "learning_rate": 2.6671029547377943e-06, "loss": 0.0139, "step": 5262 }, { "epoch": 2.3944494995450407, "grad_norm": 0.46867689479305447, "learning_rate": 2.666389898128828e-06, "loss": 0.0078, "step": 5263 }, { "epoch": 2.394904458598726, "grad_norm": 0.6582145906170176, "learning_rate": 2.665676827923231e-06, "loss": 0.0186, "step": 5264 }, { "epoch": 2.3953594176524113, "grad_norm": 0.5100800599920248, "learning_rate": 2.664963744179272e-06, "loss": 0.0106, "step": 5265 }, { "epoch": 2.395814376706096, "grad_norm": 0.3453109239426172, "learning_rate": 2.6642506469552198e-06, "loss": 0.0074, "step": 5266 }, { "epoch": 2.3962693357597815, "grad_norm": 0.38141894873827825, "learning_rate": 2.663537536309348e-06, "loss": 0.0068, "step": 5267 }, { "epoch": 2.396724294813467, "grad_norm": 0.6944690395459148, "learning_rate": 2.6628244122999265e-06, "loss": 0.0209, "step": 5268 }, { "epoch": 2.397179253867152, "grad_norm": 0.38133371002374117, "learning_rate": 2.662111274985229e-06, "loss": 0.0159, "step": 5269 }, { "epoch": 2.397634212920837, "grad_norm": 0.40926907930635564, "learning_rate": 2.661398124423531e-06, "loss": 0.0082, "step": 5270 }, { "epoch": 2.3980891719745223, "grad_norm": 0.23841437241460556, "learning_rate": 2.6606849606731056e-06, "loss": 0.0043, "step": 5271 }, { "epoch": 2.3985441310282076, "grad_norm": 0.5571965423401104, "learning_rate": 2.6599717837922322e-06, "loss": 0.0167, "step": 5272 }, { "epoch": 2.3989990900818925, "grad_norm": 0.3409360650151248, "learning_rate": 2.659258593839187e-06, "loss": 0.0084, "step": 5273 }, { "epoch": 2.399454049135578, "grad_norm": 0.46935402545788363, "learning_rate": 2.6585453908722484e-06, "loss": 0.0077, "step": 5274 }, { "epoch": 2.399909008189263, "grad_norm": 0.4293516975520902, "learning_rate": 2.6578321749496965e-06, "loss": 0.0079, "step": 5275 }, { "epoch": 2.400363967242948, "grad_norm": 0.45308738380494207, "learning_rate": 2.657118946129812e-06, "loss": 0.0116, "step": 5276 }, { "epoch": 2.4008189262966333, "grad_norm": 0.41914655946752455, "learning_rate": 2.6564057044708767e-06, "loss": 0.0094, "step": 5277 }, { "epoch": 2.4012738853503186, "grad_norm": 0.6397913929949481, "learning_rate": 2.655692450031173e-06, "loss": 0.0123, "step": 5278 }, { "epoch": 2.4017288444040035, "grad_norm": 0.7021904278894873, "learning_rate": 2.6549791828689864e-06, "loss": 0.0155, "step": 5279 }, { "epoch": 2.402183803457689, "grad_norm": 0.5397248972906326, "learning_rate": 2.654265903042601e-06, "loss": 0.0175, "step": 5280 }, { "epoch": 2.402638762511374, "grad_norm": 0.3371560593044651, "learning_rate": 2.653552610610302e-06, "loss": 0.0057, "step": 5281 }, { "epoch": 2.403093721565059, "grad_norm": 0.5214898493810641, "learning_rate": 2.6528393056303767e-06, "loss": 0.0078, "step": 5282 }, { "epoch": 2.4035486806187443, "grad_norm": 0.49605866647148616, "learning_rate": 2.6521259881611144e-06, "loss": 0.0088, "step": 5283 }, { "epoch": 2.4040036396724296, "grad_norm": 0.46212348464482766, "learning_rate": 2.6514126582608037e-06, "loss": 0.011, "step": 5284 }, { "epoch": 2.404458598726115, "grad_norm": 0.5505433454351151, "learning_rate": 2.650699315987733e-06, "loss": 0.0107, "step": 5285 }, { "epoch": 2.4049135577797998, "grad_norm": 0.5903698125751682, "learning_rate": 2.6499859614001954e-06, "loss": 0.0216, "step": 5286 }, { "epoch": 2.405368516833485, "grad_norm": 0.44507294299281536, "learning_rate": 2.649272594556483e-06, "loss": 0.0094, "step": 5287 }, { "epoch": 2.4058234758871704, "grad_norm": 0.46590987768919534, "learning_rate": 2.6485592155148875e-06, "loss": 0.0171, "step": 5288 }, { "epoch": 2.4062784349408552, "grad_norm": 0.3731585225046938, "learning_rate": 2.6478458243337035e-06, "loss": 0.0073, "step": 5289 }, { "epoch": 2.4067333939945406, "grad_norm": 0.5027183528874986, "learning_rate": 2.647132421071227e-06, "loss": 0.0177, "step": 5290 }, { "epoch": 2.407188353048226, "grad_norm": 0.6441369163350117, "learning_rate": 2.6464190057857535e-06, "loss": 0.0119, "step": 5291 }, { "epoch": 2.4076433121019107, "grad_norm": 0.5363747072196974, "learning_rate": 2.6457055785355802e-06, "loss": 0.0069, "step": 5292 }, { "epoch": 2.408098271155596, "grad_norm": 0.5378451878025351, "learning_rate": 2.6449921393790045e-06, "loss": 0.0165, "step": 5293 }, { "epoch": 2.4085532302092814, "grad_norm": 0.2972711718509844, "learning_rate": 2.6442786883743267e-06, "loss": 0.0111, "step": 5294 }, { "epoch": 2.4090081892629662, "grad_norm": 0.5548571246600784, "learning_rate": 2.643565225579845e-06, "loss": 0.0119, "step": 5295 }, { "epoch": 2.4094631483166515, "grad_norm": 0.38088006109269384, "learning_rate": 2.642851751053862e-06, "loss": 0.0073, "step": 5296 }, { "epoch": 2.409918107370337, "grad_norm": 0.4609868949516718, "learning_rate": 2.642138264854679e-06, "loss": 0.0106, "step": 5297 }, { "epoch": 2.4103730664240217, "grad_norm": 0.512167439437391, "learning_rate": 2.641424767040599e-06, "loss": 0.0207, "step": 5298 }, { "epoch": 2.410828025477707, "grad_norm": 0.42476432212872417, "learning_rate": 2.640711257669925e-06, "loss": 0.0132, "step": 5299 }, { "epoch": 2.4112829845313923, "grad_norm": 0.5290531243510677, "learning_rate": 2.6399977368009632e-06, "loss": 0.0131, "step": 5300 }, { "epoch": 2.411737943585077, "grad_norm": 0.3528644786305546, "learning_rate": 2.6392842044920187e-06, "loss": 0.0099, "step": 5301 }, { "epoch": 2.4121929026387625, "grad_norm": 0.49443824072038073, "learning_rate": 2.6385706608013977e-06, "loss": 0.0108, "step": 5302 }, { "epoch": 2.412647861692448, "grad_norm": 0.3409880430265356, "learning_rate": 2.637857105787408e-06, "loss": 0.0056, "step": 5303 }, { "epoch": 2.4131028207461327, "grad_norm": 0.3937234168416613, "learning_rate": 2.6371435395083585e-06, "loss": 0.0112, "step": 5304 }, { "epoch": 2.413557779799818, "grad_norm": 0.47888676165629507, "learning_rate": 2.636429962022558e-06, "loss": 0.0121, "step": 5305 }, { "epoch": 2.4140127388535033, "grad_norm": 0.27106266919897065, "learning_rate": 2.6357163733883168e-06, "loss": 0.0041, "step": 5306 }, { "epoch": 2.414467697907188, "grad_norm": 0.8819545225308099, "learning_rate": 2.6350027736639467e-06, "loss": 0.0255, "step": 5307 }, { "epoch": 2.4149226569608735, "grad_norm": 0.37844341009797916, "learning_rate": 2.6342891629077603e-06, "loss": 0.0103, "step": 5308 }, { "epoch": 2.415377616014559, "grad_norm": 0.4564175387658164, "learning_rate": 2.633575541178069e-06, "loss": 0.0107, "step": 5309 }, { "epoch": 2.4158325750682437, "grad_norm": 0.36581330227252784, "learning_rate": 2.632861908533188e-06, "loss": 0.0107, "step": 5310 }, { "epoch": 2.416287534121929, "grad_norm": 0.2629530523538824, "learning_rate": 2.6321482650314324e-06, "loss": 0.0036, "step": 5311 }, { "epoch": 2.4167424931756143, "grad_norm": 0.1369297256044413, "learning_rate": 2.631434610731117e-06, "loss": 0.0021, "step": 5312 }, { "epoch": 2.417197452229299, "grad_norm": 0.3193006589680077, "learning_rate": 2.630720945690558e-06, "loss": 0.0069, "step": 5313 }, { "epoch": 2.4176524112829845, "grad_norm": 0.5601211135451076, "learning_rate": 2.630007269968074e-06, "loss": 0.0142, "step": 5314 }, { "epoch": 2.41810737033667, "grad_norm": 1.1069141185795472, "learning_rate": 2.629293583621984e-06, "loss": 0.0462, "step": 5315 }, { "epoch": 2.4185623293903546, "grad_norm": 0.49868068796957743, "learning_rate": 2.6285798867106054e-06, "loss": 0.0091, "step": 5316 }, { "epoch": 2.41901728844404, "grad_norm": 0.5358783660877817, "learning_rate": 2.6278661792922587e-06, "loss": 0.0163, "step": 5317 }, { "epoch": 2.4194722474977253, "grad_norm": 0.503397462647226, "learning_rate": 2.6271524614252663e-06, "loss": 0.0092, "step": 5318 }, { "epoch": 2.41992720655141, "grad_norm": 0.38301509161187197, "learning_rate": 2.6264387331679486e-06, "loss": 0.011, "step": 5319 }, { "epoch": 2.4203821656050954, "grad_norm": 0.6090990558000087, "learning_rate": 2.6257249945786285e-06, "loss": 0.0183, "step": 5320 }, { "epoch": 2.4208371246587808, "grad_norm": 0.41568059684098374, "learning_rate": 2.6250112457156296e-06, "loss": 0.0099, "step": 5321 }, { "epoch": 2.421292083712466, "grad_norm": 0.3188627229030001, "learning_rate": 2.6242974866372762e-06, "loss": 0.0049, "step": 5322 }, { "epoch": 2.421747042766151, "grad_norm": 0.4329228668630277, "learning_rate": 2.6235837174018937e-06, "loss": 0.0111, "step": 5323 }, { "epoch": 2.4222020018198362, "grad_norm": 0.5795443613140662, "learning_rate": 2.6228699380678074e-06, "loss": 0.0162, "step": 5324 }, { "epoch": 2.4226569608735216, "grad_norm": 0.5121591712386485, "learning_rate": 2.6221561486933454e-06, "loss": 0.0134, "step": 5325 }, { "epoch": 2.4231119199272064, "grad_norm": 0.41208437181859886, "learning_rate": 2.621442349336834e-06, "loss": 0.0067, "step": 5326 }, { "epoch": 2.4235668789808917, "grad_norm": 0.31955215018149946, "learning_rate": 2.6207285400566025e-06, "loss": 0.0056, "step": 5327 }, { "epoch": 2.424021838034577, "grad_norm": 0.5746019766530774, "learning_rate": 2.62001472091098e-06, "loss": 0.0161, "step": 5328 }, { "epoch": 2.424476797088262, "grad_norm": 0.42112445986735975, "learning_rate": 2.6193008919582962e-06, "loss": 0.0099, "step": 5329 }, { "epoch": 2.4249317561419472, "grad_norm": 0.25457275141838775, "learning_rate": 2.618587053256882e-06, "loss": 0.0052, "step": 5330 }, { "epoch": 2.4253867151956325, "grad_norm": 0.6682676041238147, "learning_rate": 2.6178732048650694e-06, "loss": 0.016, "step": 5331 }, { "epoch": 2.4258416742493174, "grad_norm": 0.5536230052285402, "learning_rate": 2.617159346841192e-06, "loss": 0.0148, "step": 5332 }, { "epoch": 2.4262966333030027, "grad_norm": 0.4120876647939283, "learning_rate": 2.616445479243581e-06, "loss": 0.005, "step": 5333 }, { "epoch": 2.426751592356688, "grad_norm": 0.5005724293752647, "learning_rate": 2.615731602130571e-06, "loss": 0.0175, "step": 5334 }, { "epoch": 2.427206551410373, "grad_norm": 0.6839007620325683, "learning_rate": 2.6150177155604977e-06, "loss": 0.0127, "step": 5335 }, { "epoch": 2.427661510464058, "grad_norm": 0.38712896033595323, "learning_rate": 2.614303819591696e-06, "loss": 0.0134, "step": 5336 }, { "epoch": 2.4281164695177435, "grad_norm": 0.3348280046257371, "learning_rate": 2.6135899142825015e-06, "loss": 0.0072, "step": 5337 }, { "epoch": 2.4285714285714284, "grad_norm": 0.3426349451759085, "learning_rate": 2.6128759996912533e-06, "loss": 0.0075, "step": 5338 }, { "epoch": 2.4290263876251137, "grad_norm": 0.20884175738383293, "learning_rate": 2.6121620758762877e-06, "loss": 0.0047, "step": 5339 }, { "epoch": 2.429481346678799, "grad_norm": 0.5320394052684966, "learning_rate": 2.6114481428959445e-06, "loss": 0.0114, "step": 5340 }, { "epoch": 2.4299363057324843, "grad_norm": 0.2508729923410518, "learning_rate": 2.6107342008085605e-06, "loss": 0.0048, "step": 5341 }, { "epoch": 2.430391264786169, "grad_norm": 0.382945584795804, "learning_rate": 2.610020249672479e-06, "loss": 0.0079, "step": 5342 }, { "epoch": 2.4308462238398545, "grad_norm": 0.5366898727493329, "learning_rate": 2.6093062895460398e-06, "loss": 0.0097, "step": 5343 }, { "epoch": 2.43130118289354, "grad_norm": 0.3808870988969623, "learning_rate": 2.6085923204875835e-06, "loss": 0.0077, "step": 5344 }, { "epoch": 2.4317561419472247, "grad_norm": 0.44475519961700954, "learning_rate": 2.6078783425554538e-06, "loss": 0.0154, "step": 5345 }, { "epoch": 2.43221110100091, "grad_norm": 0.6121706410498261, "learning_rate": 2.607164355807992e-06, "loss": 0.0109, "step": 5346 }, { "epoch": 2.4326660600545953, "grad_norm": 0.743717495510323, "learning_rate": 2.6064503603035447e-06, "loss": 0.0147, "step": 5347 }, { "epoch": 2.43312101910828, "grad_norm": 0.3989646541605613, "learning_rate": 2.6057363561004527e-06, "loss": 0.0094, "step": 5348 }, { "epoch": 2.4335759781619655, "grad_norm": 0.5403121541379345, "learning_rate": 2.6050223432570646e-06, "loss": 0.0119, "step": 5349 }, { "epoch": 2.434030937215651, "grad_norm": 0.20255481466998454, "learning_rate": 2.604308321831725e-06, "loss": 0.0029, "step": 5350 }, { "epoch": 2.4344858962693356, "grad_norm": 0.49895855657120847, "learning_rate": 2.6035942918827795e-06, "loss": 0.0141, "step": 5351 }, { "epoch": 2.434940855323021, "grad_norm": 0.5114113488432483, "learning_rate": 2.6028802534685773e-06, "loss": 0.0085, "step": 5352 }, { "epoch": 2.4353958143767063, "grad_norm": 0.4669218214101655, "learning_rate": 2.6021662066474646e-06, "loss": 0.0117, "step": 5353 }, { "epoch": 2.435850773430391, "grad_norm": 0.22957003509572685, "learning_rate": 2.601452151477791e-06, "loss": 0.0028, "step": 5354 }, { "epoch": 2.4363057324840764, "grad_norm": 0.36946797300234097, "learning_rate": 2.6007380880179063e-06, "loss": 0.0086, "step": 5355 }, { "epoch": 2.4367606915377618, "grad_norm": 0.6197851730889462, "learning_rate": 2.6000240163261593e-06, "loss": 0.0122, "step": 5356 }, { "epoch": 2.4372156505914466, "grad_norm": 0.6363626554683984, "learning_rate": 2.599309936460902e-06, "loss": 0.0117, "step": 5357 }, { "epoch": 2.437670609645132, "grad_norm": 0.6023373880189127, "learning_rate": 2.5985958484804843e-06, "loss": 0.0124, "step": 5358 }, { "epoch": 2.4381255686988172, "grad_norm": 0.5188365013222024, "learning_rate": 2.597881752443259e-06, "loss": 0.0181, "step": 5359 }, { "epoch": 2.438580527752502, "grad_norm": 0.27257651584057696, "learning_rate": 2.59716764840758e-06, "loss": 0.003, "step": 5360 }, { "epoch": 2.4390354868061874, "grad_norm": 0.5151512388356434, "learning_rate": 2.5964535364317992e-06, "loss": 0.0151, "step": 5361 }, { "epoch": 2.4394904458598727, "grad_norm": 0.341959153024696, "learning_rate": 2.5957394165742712e-06, "loss": 0.0053, "step": 5362 }, { "epoch": 2.4399454049135576, "grad_norm": 0.4177469403814429, "learning_rate": 2.5950252888933495e-06, "loss": 0.0074, "step": 5363 }, { "epoch": 2.440400363967243, "grad_norm": 0.4791415532277033, "learning_rate": 2.5943111534473914e-06, "loss": 0.0076, "step": 5364 }, { "epoch": 2.4408553230209282, "grad_norm": 0.319690214421995, "learning_rate": 2.5935970102947505e-06, "loss": 0.0043, "step": 5365 }, { "epoch": 2.441310282074613, "grad_norm": 0.2875843969334049, "learning_rate": 2.5928828594937854e-06, "loss": 0.0023, "step": 5366 }, { "epoch": 2.4417652411282984, "grad_norm": 0.611820457265046, "learning_rate": 2.5921687011028525e-06, "loss": 0.0174, "step": 5367 }, { "epoch": 2.4422202001819837, "grad_norm": 0.2482467641002363, "learning_rate": 2.59145453518031e-06, "loss": 0.0044, "step": 5368 }, { "epoch": 2.4426751592356686, "grad_norm": 0.24015669354702626, "learning_rate": 2.590740361784515e-06, "loss": 0.0025, "step": 5369 }, { "epoch": 2.443130118289354, "grad_norm": 0.31065582610491205, "learning_rate": 2.590026180973828e-06, "loss": 0.0061, "step": 5370 }, { "epoch": 2.443585077343039, "grad_norm": 0.45652777094429053, "learning_rate": 2.589311992806608e-06, "loss": 0.0117, "step": 5371 }, { "epoch": 2.444040036396724, "grad_norm": 0.28798471578885854, "learning_rate": 2.5885977973412154e-06, "loss": 0.007, "step": 5372 }, { "epoch": 2.4444949954504094, "grad_norm": 0.35340474935942545, "learning_rate": 2.58788359463601e-06, "loss": 0.0042, "step": 5373 }, { "epoch": 2.4449499545040947, "grad_norm": 0.4585428240517673, "learning_rate": 2.5871693847493555e-06, "loss": 0.0072, "step": 5374 }, { "epoch": 2.4454049135577796, "grad_norm": 0.6058646376824721, "learning_rate": 2.5864551677396116e-06, "loss": 0.016, "step": 5375 }, { "epoch": 2.445859872611465, "grad_norm": 0.6400764868584173, "learning_rate": 2.5857409436651416e-06, "loss": 0.0251, "step": 5376 }, { "epoch": 2.44631483166515, "grad_norm": 0.49339509668574455, "learning_rate": 2.5850267125843093e-06, "loss": 0.0066, "step": 5377 }, { "epoch": 2.4467697907188355, "grad_norm": 0.4614233963526694, "learning_rate": 2.584312474555478e-06, "loss": 0.0058, "step": 5378 }, { "epoch": 2.4472247497725204, "grad_norm": 0.5865606440635677, "learning_rate": 2.583598229637012e-06, "loss": 0.0158, "step": 5379 }, { "epoch": 2.4476797088262057, "grad_norm": 0.5178785942661344, "learning_rate": 2.582883977887277e-06, "loss": 0.0096, "step": 5380 }, { "epoch": 2.448134667879891, "grad_norm": 0.537247743689501, "learning_rate": 2.5821697193646367e-06, "loss": 0.0108, "step": 5381 }, { "epoch": 2.448589626933576, "grad_norm": 0.32958133595936345, "learning_rate": 2.5814554541274583e-06, "loss": 0.0084, "step": 5382 }, { "epoch": 2.449044585987261, "grad_norm": 0.6944837558859854, "learning_rate": 2.580741182234108e-06, "loss": 0.0283, "step": 5383 }, { "epoch": 2.4494995450409465, "grad_norm": 0.49009220542663784, "learning_rate": 2.5800269037429522e-06, "loss": 0.0115, "step": 5384 }, { "epoch": 2.4499545040946313, "grad_norm": 0.4552504982742526, "learning_rate": 2.57931261871236e-06, "loss": 0.0094, "step": 5385 }, { "epoch": 2.4504094631483166, "grad_norm": 0.6603902209521543, "learning_rate": 2.5785983272006987e-06, "loss": 0.0173, "step": 5386 }, { "epoch": 2.450864422202002, "grad_norm": 0.29748893663835313, "learning_rate": 2.577884029266337e-06, "loss": 0.0047, "step": 5387 }, { "epoch": 2.451319381255687, "grad_norm": 0.40018184560299075, "learning_rate": 2.577169724967645e-06, "loss": 0.0092, "step": 5388 }, { "epoch": 2.451774340309372, "grad_norm": 0.2503169931721118, "learning_rate": 2.57645541436299e-06, "loss": 0.0027, "step": 5389 }, { "epoch": 2.4522292993630574, "grad_norm": 0.6049606373720219, "learning_rate": 2.5757410975107444e-06, "loss": 0.022, "step": 5390 }, { "epoch": 2.4526842584167423, "grad_norm": 0.6076854433574773, "learning_rate": 2.5750267744692785e-06, "loss": 0.0165, "step": 5391 }, { "epoch": 2.4531392174704276, "grad_norm": 0.3010267166678339, "learning_rate": 2.5743124452969636e-06, "loss": 0.0049, "step": 5392 }, { "epoch": 2.453594176524113, "grad_norm": 0.44734346755281496, "learning_rate": 2.573598110052171e-06, "loss": 0.0077, "step": 5393 }, { "epoch": 2.4540491355777982, "grad_norm": 0.636677128365085, "learning_rate": 2.572883768793273e-06, "loss": 0.0111, "step": 5394 }, { "epoch": 2.454504094631483, "grad_norm": 0.3921686944607055, "learning_rate": 2.572169421578643e-06, "loss": 0.0101, "step": 5395 }, { "epoch": 2.4549590536851684, "grad_norm": 0.485847416727712, "learning_rate": 2.5714550684666532e-06, "loss": 0.0099, "step": 5396 }, { "epoch": 2.4554140127388537, "grad_norm": 0.4225877956074144, "learning_rate": 2.5707407095156783e-06, "loss": 0.0077, "step": 5397 }, { "epoch": 2.4558689717925386, "grad_norm": 0.43272070384079586, "learning_rate": 2.5700263447840927e-06, "loss": 0.0104, "step": 5398 }, { "epoch": 2.456323930846224, "grad_norm": 0.19962311515195533, "learning_rate": 2.5693119743302697e-06, "loss": 0.0026, "step": 5399 }, { "epoch": 2.4567788898999092, "grad_norm": 0.3251630993734691, "learning_rate": 2.5685975982125848e-06, "loss": 0.0087, "step": 5400 }, { "epoch": 2.457233848953594, "grad_norm": 0.3062269061005219, "learning_rate": 2.5678832164894145e-06, "loss": 0.0056, "step": 5401 }, { "epoch": 2.4576888080072794, "grad_norm": 0.36320007601960613, "learning_rate": 2.5671688292191347e-06, "loss": 0.0069, "step": 5402 }, { "epoch": 2.4581437670609647, "grad_norm": 0.5544922531229793, "learning_rate": 2.566454436460121e-06, "loss": 0.0111, "step": 5403 }, { "epoch": 2.4585987261146496, "grad_norm": 0.4380030858402807, "learning_rate": 2.5657400382707507e-06, "loss": 0.0062, "step": 5404 }, { "epoch": 2.459053685168335, "grad_norm": 0.4787088063009698, "learning_rate": 2.565025634709402e-06, "loss": 0.0079, "step": 5405 }, { "epoch": 2.45950864422202, "grad_norm": 0.9630337967884988, "learning_rate": 2.5643112258344517e-06, "loss": 0.0321, "step": 5406 }, { "epoch": 2.459963603275705, "grad_norm": 0.28553697113804627, "learning_rate": 2.563596811704278e-06, "loss": 0.0042, "step": 5407 }, { "epoch": 2.4604185623293904, "grad_norm": 0.5125313940369218, "learning_rate": 2.5628823923772606e-06, "loss": 0.0136, "step": 5408 }, { "epoch": 2.4608735213830757, "grad_norm": 0.5851195856911073, "learning_rate": 2.5621679679117778e-06, "loss": 0.0143, "step": 5409 }, { "epoch": 2.4613284804367606, "grad_norm": 0.4438428047723892, "learning_rate": 2.56145353836621e-06, "loss": 0.0109, "step": 5410 }, { "epoch": 2.461783439490446, "grad_norm": 0.721290921842615, "learning_rate": 2.5607391037989354e-06, "loss": 0.016, "step": 5411 }, { "epoch": 2.462238398544131, "grad_norm": 0.3472015423263504, "learning_rate": 2.560024664268337e-06, "loss": 0.008, "step": 5412 }, { "epoch": 2.462693357597816, "grad_norm": 0.7151505678630602, "learning_rate": 2.5593102198327927e-06, "loss": 0.0217, "step": 5413 }, { "epoch": 2.4631483166515014, "grad_norm": 0.4161316127587869, "learning_rate": 2.558595770550686e-06, "loss": 0.0102, "step": 5414 }, { "epoch": 2.4636032757051867, "grad_norm": 0.7136343834576812, "learning_rate": 2.5578813164803974e-06, "loss": 0.024, "step": 5415 }, { "epoch": 2.4640582347588715, "grad_norm": 0.5005741524868396, "learning_rate": 2.5571668576803087e-06, "loss": 0.0096, "step": 5416 }, { "epoch": 2.464513193812557, "grad_norm": 0.3802997918258905, "learning_rate": 2.5564523942088033e-06, "loss": 0.0088, "step": 5417 }, { "epoch": 2.464968152866242, "grad_norm": 0.43329116538982837, "learning_rate": 2.5557379261242615e-06, "loss": 0.0107, "step": 5418 }, { "epoch": 2.465423111919927, "grad_norm": 0.4746387569657841, "learning_rate": 2.55502345348507e-06, "loss": 0.0105, "step": 5419 }, { "epoch": 2.4658780709736123, "grad_norm": 0.46375879077136184, "learning_rate": 2.5543089763496092e-06, "loss": 0.0063, "step": 5420 }, { "epoch": 2.4663330300272976, "grad_norm": 0.43530040184728036, "learning_rate": 2.5535944947762643e-06, "loss": 0.009, "step": 5421 }, { "epoch": 2.4667879890809825, "grad_norm": 0.47960380221084453, "learning_rate": 2.5528800088234194e-06, "loss": 0.0109, "step": 5422 }, { "epoch": 2.467242948134668, "grad_norm": 0.5851990939402898, "learning_rate": 2.5521655185494592e-06, "loss": 0.0175, "step": 5423 }, { "epoch": 2.467697907188353, "grad_norm": 0.2970416670059983, "learning_rate": 2.551451024012769e-06, "loss": 0.0101, "step": 5424 }, { "epoch": 2.468152866242038, "grad_norm": 0.6289100119810958, "learning_rate": 2.550736525271732e-06, "loss": 0.0178, "step": 5425 }, { "epoch": 2.4686078252957233, "grad_norm": 0.42509577011633337, "learning_rate": 2.550022022384736e-06, "loss": 0.0092, "step": 5426 }, { "epoch": 2.4690627843494086, "grad_norm": 0.2907034826354086, "learning_rate": 2.5493075154101665e-06, "loss": 0.005, "step": 5427 }, { "epoch": 2.4695177434030935, "grad_norm": 0.701397671490685, "learning_rate": 2.548593004406409e-06, "loss": 0.0106, "step": 5428 }, { "epoch": 2.469972702456779, "grad_norm": 0.2226107052325088, "learning_rate": 2.547878489431851e-06, "loss": 0.0034, "step": 5429 }, { "epoch": 2.470427661510464, "grad_norm": 0.3639210674633844, "learning_rate": 2.547163970544879e-06, "loss": 0.0056, "step": 5430 }, { "epoch": 2.470882620564149, "grad_norm": 0.28350796151472246, "learning_rate": 2.5464494478038802e-06, "loss": 0.0084, "step": 5431 }, { "epoch": 2.4713375796178343, "grad_norm": 0.2776557844724811, "learning_rate": 2.5457349212672423e-06, "loss": 0.0035, "step": 5432 }, { "epoch": 2.4717925386715196, "grad_norm": 0.5836468204641926, "learning_rate": 2.545020390993353e-06, "loss": 0.0098, "step": 5433 }, { "epoch": 2.472247497725205, "grad_norm": 0.4203078989900982, "learning_rate": 2.5443058570406016e-06, "loss": 0.0052, "step": 5434 }, { "epoch": 2.47270245677889, "grad_norm": 0.4934302996298237, "learning_rate": 2.5435913194673738e-06, "loss": 0.0136, "step": 5435 }, { "epoch": 2.473157415832575, "grad_norm": 0.4848673192697751, "learning_rate": 2.542876778332062e-06, "loss": 0.0078, "step": 5436 }, { "epoch": 2.4736123748862604, "grad_norm": 0.6026489161153639, "learning_rate": 2.542162233693053e-06, "loss": 0.0101, "step": 5437 }, { "epoch": 2.4740673339399453, "grad_norm": 0.38742337378139774, "learning_rate": 2.5414476856087367e-06, "loss": 0.0096, "step": 5438 }, { "epoch": 2.4745222929936306, "grad_norm": 0.4549271345626841, "learning_rate": 2.5407331341375025e-06, "loss": 0.008, "step": 5439 }, { "epoch": 2.474977252047316, "grad_norm": 0.5468718499056573, "learning_rate": 2.5400185793377404e-06, "loss": 0.0202, "step": 5440 }, { "epoch": 2.4754322111010008, "grad_norm": 0.44378602013457463, "learning_rate": 2.539304021267841e-06, "loss": 0.0114, "step": 5441 }, { "epoch": 2.475887170154686, "grad_norm": 0.4891373854053144, "learning_rate": 2.538589459986194e-06, "loss": 0.0129, "step": 5442 }, { "epoch": 2.4763421292083714, "grad_norm": 0.9728103336446813, "learning_rate": 2.537874895551191e-06, "loss": 0.015, "step": 5443 }, { "epoch": 2.4767970882620562, "grad_norm": 0.35586265106338044, "learning_rate": 2.537160328021223e-06, "loss": 0.0065, "step": 5444 }, { "epoch": 2.4772520473157416, "grad_norm": 0.5158417166611209, "learning_rate": 2.5364457574546803e-06, "loss": 0.0112, "step": 5445 }, { "epoch": 2.477707006369427, "grad_norm": 0.5125682559667823, "learning_rate": 2.5357311839099546e-06, "loss": 0.0157, "step": 5446 }, { "epoch": 2.4781619654231117, "grad_norm": 0.4736641805570601, "learning_rate": 2.535016607445438e-06, "loss": 0.0145, "step": 5447 }, { "epoch": 2.478616924476797, "grad_norm": 0.4074053686607783, "learning_rate": 2.534302028119523e-06, "loss": 0.0064, "step": 5448 }, { "epoch": 2.4790718835304824, "grad_norm": 0.31588187144661106, "learning_rate": 2.5335874459906007e-06, "loss": 0.0061, "step": 5449 }, { "epoch": 2.4795268425841677, "grad_norm": 0.9232383744888657, "learning_rate": 2.532872861117064e-06, "loss": 0.0137, "step": 5450 }, { "epoch": 2.4799818016378525, "grad_norm": 0.592253699501661, "learning_rate": 2.532158273557306e-06, "loss": 0.0123, "step": 5451 }, { "epoch": 2.480436760691538, "grad_norm": 0.42196994462527365, "learning_rate": 2.5314436833697182e-06, "loss": 0.007, "step": 5452 }, { "epoch": 2.480891719745223, "grad_norm": 0.3726705520543948, "learning_rate": 2.5307290906126954e-06, "loss": 0.0045, "step": 5453 }, { "epoch": 2.481346678798908, "grad_norm": 0.49158769976344086, "learning_rate": 2.5300144953446294e-06, "loss": 0.0098, "step": 5454 }, { "epoch": 2.4818016378525933, "grad_norm": 0.5746247964124699, "learning_rate": 2.529299897623915e-06, "loss": 0.0154, "step": 5455 }, { "epoch": 2.4822565969062786, "grad_norm": 0.172430414711175, "learning_rate": 2.5285852975089454e-06, "loss": 0.0027, "step": 5456 }, { "epoch": 2.4827115559599635, "grad_norm": 0.28392260845561157, "learning_rate": 2.5278706950581133e-06, "loss": 0.0046, "step": 5457 }, { "epoch": 2.483166515013649, "grad_norm": 0.6296680717526039, "learning_rate": 2.5271560903298154e-06, "loss": 0.0141, "step": 5458 }, { "epoch": 2.483621474067334, "grad_norm": 0.5212541912650659, "learning_rate": 2.5264414833824437e-06, "loss": 0.0081, "step": 5459 }, { "epoch": 2.484076433121019, "grad_norm": 0.5124786563639232, "learning_rate": 2.525726874274393e-06, "loss": 0.0119, "step": 5460 }, { "epoch": 2.4845313921747043, "grad_norm": 0.3298281723112912, "learning_rate": 2.525012263064059e-06, "loss": 0.0063, "step": 5461 }, { "epoch": 2.4849863512283896, "grad_norm": 0.516512989204557, "learning_rate": 2.5242976498098355e-06, "loss": 0.0145, "step": 5462 }, { "epoch": 2.4854413102820745, "grad_norm": 0.7091881782810922, "learning_rate": 2.5235830345701175e-06, "loss": 0.0183, "step": 5463 }, { "epoch": 2.48589626933576, "grad_norm": 0.24301169262888558, "learning_rate": 2.5228684174033e-06, "loss": 0.0047, "step": 5464 }, { "epoch": 2.486351228389445, "grad_norm": 0.5077887532961906, "learning_rate": 2.52215379836778e-06, "loss": 0.009, "step": 5465 }, { "epoch": 2.48680618744313, "grad_norm": 0.5798870372388089, "learning_rate": 2.521439177521951e-06, "loss": 0.0127, "step": 5466 }, { "epoch": 2.4872611464968153, "grad_norm": 0.3265193206677385, "learning_rate": 2.520724554924209e-06, "loss": 0.0049, "step": 5467 }, { "epoch": 2.4877161055505006, "grad_norm": 0.2824965032528354, "learning_rate": 2.5200099306329507e-06, "loss": 0.0058, "step": 5468 }, { "epoch": 2.4881710646041855, "grad_norm": 0.3263687919725118, "learning_rate": 2.5192953047065704e-06, "loss": 0.0047, "step": 5469 }, { "epoch": 2.488626023657871, "grad_norm": 0.3357165824243905, "learning_rate": 2.518580677203465e-06, "loss": 0.0067, "step": 5470 }, { "epoch": 2.489080982711556, "grad_norm": 0.5905380932368327, "learning_rate": 2.5178660481820305e-06, "loss": 0.0232, "step": 5471 }, { "epoch": 2.489535941765241, "grad_norm": 0.6279283581510979, "learning_rate": 2.517151417700664e-06, "loss": 0.0139, "step": 5472 }, { "epoch": 2.4899909008189263, "grad_norm": 0.5653420945723893, "learning_rate": 2.516436785817761e-06, "loss": 0.0158, "step": 5473 }, { "epoch": 2.4904458598726116, "grad_norm": 0.3695374235550188, "learning_rate": 2.5157221525917175e-06, "loss": 0.0083, "step": 5474 }, { "epoch": 2.4909008189262964, "grad_norm": 0.7017307004496299, "learning_rate": 2.5150075180809315e-06, "loss": 0.0144, "step": 5475 }, { "epoch": 2.4913557779799818, "grad_norm": 0.2469777039341069, "learning_rate": 2.514292882343798e-06, "loss": 0.0044, "step": 5476 }, { "epoch": 2.491810737033667, "grad_norm": 0.307897891379637, "learning_rate": 2.513578245438715e-06, "loss": 0.0039, "step": 5477 }, { "epoch": 2.492265696087352, "grad_norm": 0.3721310994299067, "learning_rate": 2.512863607424079e-06, "loss": 0.0043, "step": 5478 }, { "epoch": 2.4927206551410372, "grad_norm": 0.7051449312159859, "learning_rate": 2.512148968358287e-06, "loss": 0.0111, "step": 5479 }, { "epoch": 2.4931756141947226, "grad_norm": 0.31561664811740997, "learning_rate": 2.5114343282997372e-06, "loss": 0.0053, "step": 5480 }, { "epoch": 2.4936305732484074, "grad_norm": 0.5814365490253641, "learning_rate": 2.510719687306824e-06, "loss": 0.0141, "step": 5481 }, { "epoch": 2.4940855323020927, "grad_norm": 0.4196571533397841, "learning_rate": 2.5100050454379475e-06, "loss": 0.0128, "step": 5482 }, { "epoch": 2.494540491355778, "grad_norm": 0.41012798015544827, "learning_rate": 2.5092904027515037e-06, "loss": 0.0077, "step": 5483 }, { "epoch": 2.494995450409463, "grad_norm": 0.5665139878407998, "learning_rate": 2.50857575930589e-06, "loss": 0.0146, "step": 5484 }, { "epoch": 2.4954504094631482, "grad_norm": 0.44465719509500035, "learning_rate": 2.5078611151595046e-06, "loss": 0.008, "step": 5485 }, { "epoch": 2.4959053685168335, "grad_norm": 0.5695618628788756, "learning_rate": 2.5071464703707437e-06, "loss": 0.0138, "step": 5486 }, { "epoch": 2.496360327570519, "grad_norm": 0.42943989060382365, "learning_rate": 2.5064318249980065e-06, "loss": 0.0098, "step": 5487 }, { "epoch": 2.4968152866242037, "grad_norm": 0.40124108475549075, "learning_rate": 2.5057171790996875e-06, "loss": 0.0093, "step": 5488 }, { "epoch": 2.497270245677889, "grad_norm": 0.6683830580150574, "learning_rate": 2.5050025327341883e-06, "loss": 0.0143, "step": 5489 }, { "epoch": 2.4977252047315743, "grad_norm": 0.6252139618008695, "learning_rate": 2.504287885959904e-06, "loss": 0.01, "step": 5490 }, { "epoch": 2.498180163785259, "grad_norm": 0.5666133231525157, "learning_rate": 2.503573238835233e-06, "loss": 0.0085, "step": 5491 }, { "epoch": 2.4986351228389445, "grad_norm": 0.8064110229335534, "learning_rate": 2.5028585914185736e-06, "loss": 0.0169, "step": 5492 }, { "epoch": 2.49909008189263, "grad_norm": 0.5110638207735546, "learning_rate": 2.5021439437683224e-06, "loss": 0.0134, "step": 5493 }, { "epoch": 2.4995450409463147, "grad_norm": 0.564504136108738, "learning_rate": 2.501429295942878e-06, "loss": 0.0102, "step": 5494 }, { "epoch": 2.5, "grad_norm": 0.4591660143032277, "learning_rate": 2.5007146480006376e-06, "loss": 0.0083, "step": 5495 }, { "epoch": 2.5004549590536853, "grad_norm": 0.49782420126352317, "learning_rate": 2.5e-06, "loss": 0.0095, "step": 5496 }, { "epoch": 2.50090991810737, "grad_norm": 0.2396376363781991, "learning_rate": 2.4992853519993628e-06, "loss": 0.0045, "step": 5497 }, { "epoch": 2.5013648771610555, "grad_norm": 0.436734000677321, "learning_rate": 2.4985707040571228e-06, "loss": 0.0097, "step": 5498 }, { "epoch": 2.501819836214741, "grad_norm": 0.3124102137408693, "learning_rate": 2.497856056231679e-06, "loss": 0.0067, "step": 5499 }, { "epoch": 2.502274795268426, "grad_norm": 0.2682876052851543, "learning_rate": 2.497141408581427e-06, "loss": 0.0043, "step": 5500 }, { "epoch": 2.502729754322111, "grad_norm": 0.477987272333973, "learning_rate": 2.4964267611647673e-06, "loss": 0.0072, "step": 5501 }, { "epoch": 2.5031847133757963, "grad_norm": 0.2881867749605936, "learning_rate": 2.4957121140400966e-06, "loss": 0.0064, "step": 5502 }, { "epoch": 2.5036396724294816, "grad_norm": 0.3502710835214348, "learning_rate": 2.4949974672658126e-06, "loss": 0.005, "step": 5503 }, { "epoch": 2.5040946314831665, "grad_norm": 0.2911983780720174, "learning_rate": 2.494282820900313e-06, "loss": 0.0065, "step": 5504 }, { "epoch": 2.5045495905368518, "grad_norm": 0.5782699713352444, "learning_rate": 2.493568175001995e-06, "loss": 0.0113, "step": 5505 }, { "epoch": 2.505004549590537, "grad_norm": 0.4128151546695382, "learning_rate": 2.4928535296292576e-06, "loss": 0.0074, "step": 5506 }, { "epoch": 2.505459508644222, "grad_norm": 0.3959604420650131, "learning_rate": 2.4921388848404962e-06, "loss": 0.0069, "step": 5507 }, { "epoch": 2.5059144676979073, "grad_norm": 0.5114695091198721, "learning_rate": 2.49142424069411e-06, "loss": 0.0091, "step": 5508 }, { "epoch": 2.5063694267515926, "grad_norm": 0.33037024183401975, "learning_rate": 2.4907095972484967e-06, "loss": 0.0053, "step": 5509 }, { "epoch": 2.5068243858052774, "grad_norm": 0.48739631631683733, "learning_rate": 2.489994954562053e-06, "loss": 0.0095, "step": 5510 }, { "epoch": 2.5072793448589628, "grad_norm": 0.5665522564788043, "learning_rate": 2.489280312693177e-06, "loss": 0.011, "step": 5511 }, { "epoch": 2.507734303912648, "grad_norm": 0.4367455832000735, "learning_rate": 2.488565671700264e-06, "loss": 0.0094, "step": 5512 }, { "epoch": 2.508189262966333, "grad_norm": 0.23157479846676995, "learning_rate": 2.487851031641714e-06, "loss": 0.0024, "step": 5513 }, { "epoch": 2.5086442220200182, "grad_norm": 0.594561252740091, "learning_rate": 2.4871363925759216e-06, "loss": 0.0137, "step": 5514 }, { "epoch": 2.5090991810737036, "grad_norm": 0.4138381557259493, "learning_rate": 2.4864217545612855e-06, "loss": 0.011, "step": 5515 }, { "epoch": 2.5095541401273884, "grad_norm": 0.26654650212848785, "learning_rate": 2.485707117656203e-06, "loss": 0.0045, "step": 5516 }, { "epoch": 2.5100090991810737, "grad_norm": 0.47557016140005137, "learning_rate": 2.4849924819190698e-06, "loss": 0.0089, "step": 5517 }, { "epoch": 2.510464058234759, "grad_norm": 0.4745416392461397, "learning_rate": 2.4842778474082833e-06, "loss": 0.0154, "step": 5518 }, { "epoch": 2.510919017288444, "grad_norm": 0.36350344434452825, "learning_rate": 2.48356321418224e-06, "loss": 0.0059, "step": 5519 }, { "epoch": 2.511373976342129, "grad_norm": 0.3843747990985066, "learning_rate": 2.482848582299337e-06, "loss": 0.0073, "step": 5520 }, { "epoch": 2.5118289353958145, "grad_norm": 0.6908132359075928, "learning_rate": 2.4821339518179695e-06, "loss": 0.0131, "step": 5521 }, { "epoch": 2.5122838944494994, "grad_norm": 0.6840176420672875, "learning_rate": 2.481419322796535e-06, "loss": 0.0127, "step": 5522 }, { "epoch": 2.5127388535031847, "grad_norm": 0.5140568183619559, "learning_rate": 2.48070469529343e-06, "loss": 0.0105, "step": 5523 }, { "epoch": 2.51319381255687, "grad_norm": 0.3031118732041178, "learning_rate": 2.47999006936705e-06, "loss": 0.0041, "step": 5524 }, { "epoch": 2.513648771610555, "grad_norm": 0.3851761706532333, "learning_rate": 2.479275445075792e-06, "loss": 0.0088, "step": 5525 }, { "epoch": 2.51410373066424, "grad_norm": 0.4290050616618402, "learning_rate": 2.47856082247805e-06, "loss": 0.0071, "step": 5526 }, { "epoch": 2.5145586897179255, "grad_norm": 0.8212908529435878, "learning_rate": 2.477846201632221e-06, "loss": 0.0341, "step": 5527 }, { "epoch": 2.5150136487716104, "grad_norm": 0.3764924865447634, "learning_rate": 2.4771315825967e-06, "loss": 0.0064, "step": 5528 }, { "epoch": 2.5154686078252957, "grad_norm": 1.1436667500413558, "learning_rate": 2.476416965429883e-06, "loss": 0.0531, "step": 5529 }, { "epoch": 2.515923566878981, "grad_norm": 0.7958621590886449, "learning_rate": 2.4757023501901654e-06, "loss": 0.0304, "step": 5530 }, { "epoch": 2.516378525932666, "grad_norm": 0.45946657324604456, "learning_rate": 2.4749877369359418e-06, "loss": 0.0102, "step": 5531 }, { "epoch": 2.516833484986351, "grad_norm": 0.6092242137397196, "learning_rate": 2.474273125725608e-06, "loss": 0.0115, "step": 5532 }, { "epoch": 2.5172884440400365, "grad_norm": 0.5135652683383144, "learning_rate": 2.473558516617558e-06, "loss": 0.0155, "step": 5533 }, { "epoch": 2.5177434030937214, "grad_norm": 0.3017663143092432, "learning_rate": 2.472843909670186e-06, "loss": 0.0039, "step": 5534 }, { "epoch": 2.5181983621474067, "grad_norm": 0.5653059163705902, "learning_rate": 2.4721293049418867e-06, "loss": 0.0114, "step": 5535 }, { "epoch": 2.518653321201092, "grad_norm": 0.4001544401929987, "learning_rate": 2.471414702491056e-06, "loss": 0.0087, "step": 5536 }, { "epoch": 2.519108280254777, "grad_norm": 0.508421217231658, "learning_rate": 2.4707001023760852e-06, "loss": 0.0122, "step": 5537 }, { "epoch": 2.519563239308462, "grad_norm": 0.28814595730303416, "learning_rate": 2.4699855046553714e-06, "loss": 0.0079, "step": 5538 }, { "epoch": 2.5200181983621475, "grad_norm": 0.5750934960086628, "learning_rate": 2.4692709093873054e-06, "loss": 0.0141, "step": 5539 }, { "epoch": 2.5204731574158323, "grad_norm": 0.36790525134801666, "learning_rate": 2.468556316630283e-06, "loss": 0.0101, "step": 5540 }, { "epoch": 2.5209281164695176, "grad_norm": 0.3167171075592501, "learning_rate": 2.4678417264426953e-06, "loss": 0.0066, "step": 5541 }, { "epoch": 2.521383075523203, "grad_norm": 0.4002123116299016, "learning_rate": 2.467127138882936e-06, "loss": 0.0102, "step": 5542 }, { "epoch": 2.521838034576888, "grad_norm": 0.35888605407158247, "learning_rate": 2.4664125540094e-06, "loss": 0.0076, "step": 5543 }, { "epoch": 2.522292993630573, "grad_norm": 0.329527942201093, "learning_rate": 2.4656979718804775e-06, "loss": 0.0074, "step": 5544 }, { "epoch": 2.5227479526842584, "grad_norm": 7.99120946554204, "learning_rate": 2.4649833925545626e-06, "loss": 0.1387, "step": 5545 }, { "epoch": 2.5232029117379433, "grad_norm": 0.8416508006626507, "learning_rate": 2.464268816090046e-06, "loss": 0.0188, "step": 5546 }, { "epoch": 2.5236578707916286, "grad_norm": 0.5471345368102515, "learning_rate": 2.4635542425453213e-06, "loss": 0.0142, "step": 5547 }, { "epoch": 2.524112829845314, "grad_norm": 0.4690144650129016, "learning_rate": 2.4628396719787783e-06, "loss": 0.0088, "step": 5548 }, { "epoch": 2.5245677888989992, "grad_norm": 0.2974621518301039, "learning_rate": 2.4621251044488094e-06, "loss": 0.008, "step": 5549 }, { "epoch": 2.525022747952684, "grad_norm": 0.38198257121760354, "learning_rate": 2.4614105400138066e-06, "loss": 0.0079, "step": 5550 }, { "epoch": 2.5254777070063694, "grad_norm": 0.33791284152285656, "learning_rate": 2.4606959787321596e-06, "loss": 0.0079, "step": 5551 }, { "epoch": 2.5259326660600547, "grad_norm": 0.48352603268055666, "learning_rate": 2.4599814206622604e-06, "loss": 0.0126, "step": 5552 }, { "epoch": 2.5263876251137396, "grad_norm": 0.5058459622712447, "learning_rate": 2.4592668658624984e-06, "loss": 0.0086, "step": 5553 }, { "epoch": 2.526842584167425, "grad_norm": 0.30631250868584836, "learning_rate": 2.4585523143912645e-06, "loss": 0.0063, "step": 5554 }, { "epoch": 2.52729754322111, "grad_norm": 0.4899158325274973, "learning_rate": 2.457837766306948e-06, "loss": 0.0088, "step": 5555 }, { "epoch": 2.5277525022747955, "grad_norm": 0.4279222070165329, "learning_rate": 2.457123221667938e-06, "loss": 0.009, "step": 5556 }, { "epoch": 2.5282074613284804, "grad_norm": 0.5377301723186966, "learning_rate": 2.4564086805326262e-06, "loss": 0.0105, "step": 5557 }, { "epoch": 2.5286624203821657, "grad_norm": 0.5209659357037371, "learning_rate": 2.4556941429593993e-06, "loss": 0.01, "step": 5558 }, { "epoch": 2.529117379435851, "grad_norm": 0.4938961123161512, "learning_rate": 2.4549796090066473e-06, "loss": 0.0152, "step": 5559 }, { "epoch": 2.529572338489536, "grad_norm": 0.6089195215861466, "learning_rate": 2.454265078732758e-06, "loss": 0.0122, "step": 5560 }, { "epoch": 2.530027297543221, "grad_norm": 0.2696612877108101, "learning_rate": 2.453550552196121e-06, "loss": 0.0041, "step": 5561 }, { "epoch": 2.5304822565969065, "grad_norm": 1.086684583135892, "learning_rate": 2.4528360294551216e-06, "loss": 0.0231, "step": 5562 }, { "epoch": 2.5309372156505914, "grad_norm": 0.7153919633206023, "learning_rate": 2.452121510568149e-06, "loss": 0.0135, "step": 5563 }, { "epoch": 2.5313921747042767, "grad_norm": 0.37068091672303366, "learning_rate": 2.4514069955935914e-06, "loss": 0.0091, "step": 5564 }, { "epoch": 2.531847133757962, "grad_norm": 0.29911106615689403, "learning_rate": 2.450692484589834e-06, "loss": 0.0037, "step": 5565 }, { "epoch": 2.532302092811647, "grad_norm": 0.4825413714395155, "learning_rate": 2.4499779776152647e-06, "loss": 0.0153, "step": 5566 }, { "epoch": 2.532757051865332, "grad_norm": 0.5148570203738869, "learning_rate": 2.4492634747282686e-06, "loss": 0.0103, "step": 5567 }, { "epoch": 2.5332120109190175, "grad_norm": 0.44969517106501866, "learning_rate": 2.4485489759872324e-06, "loss": 0.0121, "step": 5568 }, { "epoch": 2.5336669699727024, "grad_norm": 0.44212736318002716, "learning_rate": 2.447834481450542e-06, "loss": 0.0068, "step": 5569 }, { "epoch": 2.5341219290263877, "grad_norm": 0.51898428011272, "learning_rate": 2.447119991176581e-06, "loss": 0.0112, "step": 5570 }, { "epoch": 2.534576888080073, "grad_norm": 0.314766327221405, "learning_rate": 2.446405505223736e-06, "loss": 0.004, "step": 5571 }, { "epoch": 2.535031847133758, "grad_norm": 0.36604072080382394, "learning_rate": 2.4456910236503916e-06, "loss": 0.0072, "step": 5572 }, { "epoch": 2.535486806187443, "grad_norm": 0.5103943752844275, "learning_rate": 2.444976546514931e-06, "loss": 0.0129, "step": 5573 }, { "epoch": 2.5359417652411285, "grad_norm": 0.49731546366060353, "learning_rate": 2.4442620738757393e-06, "loss": 0.0116, "step": 5574 }, { "epoch": 2.5363967242948133, "grad_norm": 0.6243342616113668, "learning_rate": 2.4435476057911984e-06, "loss": 0.0168, "step": 5575 }, { "epoch": 2.5368516833484986, "grad_norm": 0.44324177629856326, "learning_rate": 2.4428331423196926e-06, "loss": 0.0147, "step": 5576 }, { "epoch": 2.537306642402184, "grad_norm": 0.7410744101340777, "learning_rate": 2.4421186835196035e-06, "loss": 0.0256, "step": 5577 }, { "epoch": 2.537761601455869, "grad_norm": 0.3018747555189916, "learning_rate": 2.4414042294493146e-06, "loss": 0.0066, "step": 5578 }, { "epoch": 2.538216560509554, "grad_norm": 0.39303925640427134, "learning_rate": 2.440689780167208e-06, "loss": 0.0067, "step": 5579 }, { "epoch": 2.5386715195632394, "grad_norm": 0.5783205892961655, "learning_rate": 2.439975335731664e-06, "loss": 0.0108, "step": 5580 }, { "epoch": 2.5391264786169243, "grad_norm": 0.6876470232719026, "learning_rate": 2.4392608962010654e-06, "loss": 0.0186, "step": 5581 }, { "epoch": 2.5395814376706096, "grad_norm": 0.2219762159102626, "learning_rate": 2.438546461633791e-06, "loss": 0.0045, "step": 5582 }, { "epoch": 2.540036396724295, "grad_norm": 0.4239280155967586, "learning_rate": 2.4378320320882235e-06, "loss": 0.0092, "step": 5583 }, { "epoch": 2.54049135577798, "grad_norm": 0.6772353181012416, "learning_rate": 2.43711760762274e-06, "loss": 0.0174, "step": 5584 }, { "epoch": 2.540946314831665, "grad_norm": 0.6421430612863832, "learning_rate": 2.4364031882957223e-06, "loss": 0.0122, "step": 5585 }, { "epoch": 2.5414012738853504, "grad_norm": 0.3048140231909923, "learning_rate": 2.4356887741655496e-06, "loss": 0.004, "step": 5586 }, { "epoch": 2.5418562329390353, "grad_norm": 0.3414113339904576, "learning_rate": 2.434974365290599e-06, "loss": 0.0044, "step": 5587 }, { "epoch": 2.5423111919927206, "grad_norm": 0.38778035154064144, "learning_rate": 2.43425996172925e-06, "loss": 0.0089, "step": 5588 }, { "epoch": 2.542766151046406, "grad_norm": 0.36897335125535013, "learning_rate": 2.4335455635398796e-06, "loss": 0.0073, "step": 5589 }, { "epoch": 2.5432211101000908, "grad_norm": 0.36387515863295994, "learning_rate": 2.4328311707808666e-06, "loss": 0.0075, "step": 5590 }, { "epoch": 2.543676069153776, "grad_norm": 0.5416096645323859, "learning_rate": 2.4321167835105855e-06, "loss": 0.0113, "step": 5591 }, { "epoch": 2.5441310282074614, "grad_norm": 0.2807093690829728, "learning_rate": 2.4314024017874152e-06, "loss": 0.0048, "step": 5592 }, { "epoch": 2.5445859872611463, "grad_norm": 0.5548724584511634, "learning_rate": 2.430688025669731e-06, "loss": 0.0174, "step": 5593 }, { "epoch": 2.5450409463148316, "grad_norm": 0.32903744691310366, "learning_rate": 2.429973655215908e-06, "loss": 0.0032, "step": 5594 }, { "epoch": 2.545495905368517, "grad_norm": 0.4658530255584992, "learning_rate": 2.429259290484322e-06, "loss": 0.0096, "step": 5595 }, { "epoch": 2.5459508644222018, "grad_norm": 0.4316421425429825, "learning_rate": 2.428544931533347e-06, "loss": 0.0116, "step": 5596 }, { "epoch": 2.546405823475887, "grad_norm": 0.4411827909527133, "learning_rate": 2.4278305784213583e-06, "loss": 0.0105, "step": 5597 }, { "epoch": 2.5468607825295724, "grad_norm": 0.6155398182146482, "learning_rate": 2.4271162312067274e-06, "loss": 0.0155, "step": 5598 }, { "epoch": 2.5473157415832572, "grad_norm": 0.46211744714844544, "learning_rate": 2.426401889947829e-06, "loss": 0.0067, "step": 5599 }, { "epoch": 2.5477707006369426, "grad_norm": 0.3480131827527548, "learning_rate": 2.4256875547030372e-06, "loss": 0.0074, "step": 5600 }, { "epoch": 2.548225659690628, "grad_norm": 0.22848980595948523, "learning_rate": 2.424973225530722e-06, "loss": 0.0035, "step": 5601 }, { "epoch": 2.548680618744313, "grad_norm": 0.6587765607523904, "learning_rate": 2.4242589024892564e-06, "loss": 0.0222, "step": 5602 }, { "epoch": 2.549135577797998, "grad_norm": 0.43982250056728694, "learning_rate": 2.423544585637011e-06, "loss": 0.0093, "step": 5603 }, { "epoch": 2.5495905368516834, "grad_norm": 0.37728491446151696, "learning_rate": 2.422830275032357e-06, "loss": 0.0051, "step": 5604 }, { "epoch": 2.5500454959053687, "grad_norm": 0.45562380226715576, "learning_rate": 2.4221159707336633e-06, "loss": 0.0066, "step": 5605 }, { "epoch": 2.5505004549590535, "grad_norm": 0.5764092848769716, "learning_rate": 2.421401672799302e-06, "loss": 0.0128, "step": 5606 }, { "epoch": 2.550955414012739, "grad_norm": 0.4877502577576033, "learning_rate": 2.4206873812876404e-06, "loss": 0.0109, "step": 5607 }, { "epoch": 2.551410373066424, "grad_norm": 0.5229544700121783, "learning_rate": 2.419973096257048e-06, "loss": 0.0108, "step": 5608 }, { "epoch": 2.5518653321201095, "grad_norm": 0.7467851275740915, "learning_rate": 2.4192588177658934e-06, "loss": 0.0269, "step": 5609 }, { "epoch": 2.5523202911737943, "grad_norm": 0.691732065919508, "learning_rate": 2.418544545872543e-06, "loss": 0.0106, "step": 5610 }, { "epoch": 2.5527752502274796, "grad_norm": 0.42075458282568834, "learning_rate": 2.4178302806353646e-06, "loss": 0.0066, "step": 5611 }, { "epoch": 2.553230209281165, "grad_norm": 0.5953029305155602, "learning_rate": 2.4171160221127236e-06, "loss": 0.0184, "step": 5612 }, { "epoch": 2.55368516833485, "grad_norm": 0.2651010200327099, "learning_rate": 2.4164017703629885e-06, "loss": 0.0078, "step": 5613 }, { "epoch": 2.554140127388535, "grad_norm": 0.729832093840458, "learning_rate": 2.4156875254445224e-06, "loss": 0.0182, "step": 5614 }, { "epoch": 2.5545950864422204, "grad_norm": 0.5818088192234572, "learning_rate": 2.4149732874156915e-06, "loss": 0.0121, "step": 5615 }, { "epoch": 2.5550500454959053, "grad_norm": 0.3732831454022132, "learning_rate": 2.414259056334859e-06, "loss": 0.0052, "step": 5616 }, { "epoch": 2.5555050045495906, "grad_norm": 0.6072137223905758, "learning_rate": 2.4135448322603896e-06, "loss": 0.0138, "step": 5617 }, { "epoch": 2.555959963603276, "grad_norm": 0.775749361481836, "learning_rate": 2.4128306152506457e-06, "loss": 0.0244, "step": 5618 }, { "epoch": 2.556414922656961, "grad_norm": 0.4320923311924329, "learning_rate": 2.4121164053639902e-06, "loss": 0.0076, "step": 5619 }, { "epoch": 2.556869881710646, "grad_norm": 0.3514580841507012, "learning_rate": 2.4114022026587854e-06, "loss": 0.0051, "step": 5620 }, { "epoch": 2.5573248407643314, "grad_norm": 0.43393698400890457, "learning_rate": 2.4106880071933923e-06, "loss": 0.0113, "step": 5621 }, { "epoch": 2.5577797998180163, "grad_norm": 0.35698607853647657, "learning_rate": 2.4099738190261727e-06, "loss": 0.0071, "step": 5622 }, { "epoch": 2.5582347588717016, "grad_norm": 0.6058558062920634, "learning_rate": 2.4092596382154855e-06, "loss": 0.0087, "step": 5623 }, { "epoch": 2.558689717925387, "grad_norm": 0.17878139434947476, "learning_rate": 2.4085454648196912e-06, "loss": 0.0031, "step": 5624 }, { "epoch": 2.5591446769790718, "grad_norm": 0.4496575803814198, "learning_rate": 2.407831298897148e-06, "loss": 0.0073, "step": 5625 }, { "epoch": 2.559599636032757, "grad_norm": 0.41119159526785964, "learning_rate": 2.4071171405062145e-06, "loss": 0.0083, "step": 5626 }, { "epoch": 2.5600545950864424, "grad_norm": 0.4026372776322949, "learning_rate": 2.4064029897052495e-06, "loss": 0.0072, "step": 5627 }, { "epoch": 2.5605095541401273, "grad_norm": 0.4149357361144556, "learning_rate": 2.4056888465526095e-06, "loss": 0.0077, "step": 5628 }, { "epoch": 2.5609645131938126, "grad_norm": 0.731842604977587, "learning_rate": 2.4049747111066513e-06, "loss": 0.0137, "step": 5629 }, { "epoch": 2.561419472247498, "grad_norm": 0.5305945518855866, "learning_rate": 2.40426058342573e-06, "loss": 0.0151, "step": 5630 }, { "epoch": 2.5618744313011828, "grad_norm": 0.6382509194341004, "learning_rate": 2.403546463568202e-06, "loss": 0.0131, "step": 5631 }, { "epoch": 2.562329390354868, "grad_norm": 0.34831784204532784, "learning_rate": 2.402832351592421e-06, "loss": 0.0103, "step": 5632 }, { "epoch": 2.5627843494085534, "grad_norm": 0.36911635901135154, "learning_rate": 2.4021182475567404e-06, "loss": 0.0095, "step": 5633 }, { "epoch": 2.5632393084622382, "grad_norm": 0.483679178404308, "learning_rate": 2.401404151519516e-06, "loss": 0.014, "step": 5634 }, { "epoch": 2.5636942675159236, "grad_norm": 0.2547338058589591, "learning_rate": 2.400690063539099e-06, "loss": 0.0034, "step": 5635 }, { "epoch": 2.564149226569609, "grad_norm": 0.32025674895639444, "learning_rate": 2.3999759836738415e-06, "loss": 0.0065, "step": 5636 }, { "epoch": 2.5646041856232937, "grad_norm": 0.41415137299486804, "learning_rate": 2.3992619119820945e-06, "loss": 0.008, "step": 5637 }, { "epoch": 2.565059144676979, "grad_norm": 0.4210202194195129, "learning_rate": 2.39854784852221e-06, "loss": 0.0084, "step": 5638 }, { "epoch": 2.5655141037306644, "grad_norm": 0.2885393086734407, "learning_rate": 2.3978337933525366e-06, "loss": 0.0051, "step": 5639 }, { "epoch": 2.565969062784349, "grad_norm": 0.3837436813045055, "learning_rate": 2.397119746531423e-06, "loss": 0.0054, "step": 5640 }, { "epoch": 2.5664240218380345, "grad_norm": 0.5142709419898498, "learning_rate": 2.3964057081172205e-06, "loss": 0.0122, "step": 5641 }, { "epoch": 2.56687898089172, "grad_norm": 0.4187125406997952, "learning_rate": 2.395691678168276e-06, "loss": 0.013, "step": 5642 }, { "epoch": 2.5673339399454047, "grad_norm": 0.38444134275762615, "learning_rate": 2.3949776567429358e-06, "loss": 0.0088, "step": 5643 }, { "epoch": 2.56778889899909, "grad_norm": 0.26194690679246624, "learning_rate": 2.3942636438995478e-06, "loss": 0.0048, "step": 5644 }, { "epoch": 2.5682438580527753, "grad_norm": 0.6443815478766546, "learning_rate": 2.3935496396964565e-06, "loss": 0.0198, "step": 5645 }, { "epoch": 2.56869881710646, "grad_norm": 0.3016096252919494, "learning_rate": 2.3928356441920087e-06, "loss": 0.0037, "step": 5646 }, { "epoch": 2.5691537761601455, "grad_norm": 0.5098929818906243, "learning_rate": 2.392121657444547e-06, "loss": 0.0203, "step": 5647 }, { "epoch": 2.569608735213831, "grad_norm": 0.21264700117257276, "learning_rate": 2.391407679512417e-06, "loss": 0.0041, "step": 5648 }, { "epoch": 2.5700636942675157, "grad_norm": 0.4483062396087633, "learning_rate": 2.390693710453961e-06, "loss": 0.0174, "step": 5649 }, { "epoch": 2.570518653321201, "grad_norm": 0.5320502676588692, "learning_rate": 2.3899797503275214e-06, "loss": 0.0146, "step": 5650 }, { "epoch": 2.5709736123748863, "grad_norm": 0.5376738081823274, "learning_rate": 2.38926579919144e-06, "loss": 0.0092, "step": 5651 }, { "epoch": 2.571428571428571, "grad_norm": 0.2638248778401654, "learning_rate": 2.388551857104057e-06, "loss": 0.0047, "step": 5652 }, { "epoch": 2.5718835304822565, "grad_norm": 0.7062612959329393, "learning_rate": 2.3878379241237136e-06, "loss": 0.019, "step": 5653 }, { "epoch": 2.572338489535942, "grad_norm": 0.40879324790864174, "learning_rate": 2.387124000308747e-06, "loss": 0.0091, "step": 5654 }, { "epoch": 2.5727934485896267, "grad_norm": 0.44399723829091525, "learning_rate": 2.3864100857174985e-06, "loss": 0.0101, "step": 5655 }, { "epoch": 2.573248407643312, "grad_norm": 0.4551258392620842, "learning_rate": 2.385696180408305e-06, "loss": 0.0108, "step": 5656 }, { "epoch": 2.5737033666969973, "grad_norm": 0.532529777119908, "learning_rate": 2.384982284439503e-06, "loss": 0.014, "step": 5657 }, { "epoch": 2.5741583257506826, "grad_norm": 0.5888859599845012, "learning_rate": 2.3842683978694296e-06, "loss": 0.0083, "step": 5658 }, { "epoch": 2.5746132848043675, "grad_norm": 0.4021940696474077, "learning_rate": 2.38355452075642e-06, "loss": 0.0162, "step": 5659 }, { "epoch": 2.5750682438580528, "grad_norm": 0.3977743597585202, "learning_rate": 2.382840653158809e-06, "loss": 0.0091, "step": 5660 }, { "epoch": 2.575523202911738, "grad_norm": 0.30236892132885623, "learning_rate": 2.3821267951349306e-06, "loss": 0.0049, "step": 5661 }, { "epoch": 2.575978161965423, "grad_norm": 0.7112311553318099, "learning_rate": 2.381412946743118e-06, "loss": 0.0211, "step": 5662 }, { "epoch": 2.5764331210191083, "grad_norm": 0.7053747544472199, "learning_rate": 2.3806991080417046e-06, "loss": 0.0222, "step": 5663 }, { "epoch": 2.5768880800727936, "grad_norm": 0.5442321571499983, "learning_rate": 2.3799852790890208e-06, "loss": 0.0159, "step": 5664 }, { "epoch": 2.577343039126479, "grad_norm": 0.2798882002354777, "learning_rate": 2.3792714599433988e-06, "loss": 0.0047, "step": 5665 }, { "epoch": 2.5777979981801638, "grad_norm": 0.48516201088362504, "learning_rate": 2.378557650663167e-06, "loss": 0.0102, "step": 5666 }, { "epoch": 2.578252957233849, "grad_norm": 0.36609990845184304, "learning_rate": 2.377843851306656e-06, "loss": 0.0057, "step": 5667 }, { "epoch": 2.5787079162875344, "grad_norm": 0.32401919344398983, "learning_rate": 2.377130061932193e-06, "loss": 0.0063, "step": 5668 }, { "epoch": 2.5791628753412192, "grad_norm": 0.40642380033944014, "learning_rate": 2.3764162825981067e-06, "loss": 0.0102, "step": 5669 }, { "epoch": 2.5796178343949046, "grad_norm": 0.7160575359131892, "learning_rate": 2.3757025133627246e-06, "loss": 0.0211, "step": 5670 }, { "epoch": 2.58007279344859, "grad_norm": 0.5205859842037662, "learning_rate": 2.374988754284371e-06, "loss": 0.0087, "step": 5671 }, { "epoch": 2.5805277525022747, "grad_norm": 0.403748488703227, "learning_rate": 2.3742750054213728e-06, "loss": 0.0097, "step": 5672 }, { "epoch": 2.58098271155596, "grad_norm": 0.8124764474001668, "learning_rate": 2.3735612668320522e-06, "loss": 0.0203, "step": 5673 }, { "epoch": 2.5814376706096454, "grad_norm": 0.34809523844268464, "learning_rate": 2.372847538574735e-06, "loss": 0.0062, "step": 5674 }, { "epoch": 2.58189262966333, "grad_norm": 0.3060117151274961, "learning_rate": 2.3721338207077413e-06, "loss": 0.01, "step": 5675 }, { "epoch": 2.5823475887170155, "grad_norm": 0.31354992600059406, "learning_rate": 2.371420113289395e-06, "loss": 0.0078, "step": 5676 }, { "epoch": 2.582802547770701, "grad_norm": 0.4991258400556015, "learning_rate": 2.370706416378017e-06, "loss": 0.0118, "step": 5677 }, { "epoch": 2.5832575068243857, "grad_norm": 0.7389784303400729, "learning_rate": 2.3699927300319262e-06, "loss": 0.0183, "step": 5678 }, { "epoch": 2.583712465878071, "grad_norm": 0.44790574686702084, "learning_rate": 2.3692790543094427e-06, "loss": 0.0129, "step": 5679 }, { "epoch": 2.5841674249317563, "grad_norm": 0.5055543555806595, "learning_rate": 2.3685653892688845e-06, "loss": 0.0146, "step": 5680 }, { "epoch": 2.584622383985441, "grad_norm": 0.3946273003804004, "learning_rate": 2.367851734968569e-06, "loss": 0.0095, "step": 5681 }, { "epoch": 2.5850773430391265, "grad_norm": 0.6641980767923964, "learning_rate": 2.367138091466812e-06, "loss": 0.0234, "step": 5682 }, { "epoch": 2.585532302092812, "grad_norm": 0.38279221345996006, "learning_rate": 2.3664244588219315e-06, "loss": 0.0049, "step": 5683 }, { "epoch": 2.5859872611464967, "grad_norm": 1.3907638299735696, "learning_rate": 2.3657108370922405e-06, "loss": 0.0232, "step": 5684 }, { "epoch": 2.586442220200182, "grad_norm": 0.37003216278428336, "learning_rate": 2.364997226336054e-06, "loss": 0.0082, "step": 5685 }, { "epoch": 2.5868971792538673, "grad_norm": 0.4391207635311121, "learning_rate": 2.3642836266116836e-06, "loss": 0.013, "step": 5686 }, { "epoch": 2.587352138307552, "grad_norm": 0.7538234695136433, "learning_rate": 2.3635700379774436e-06, "loss": 0.0319, "step": 5687 }, { "epoch": 2.5878070973612375, "grad_norm": 0.7281584428686606, "learning_rate": 2.362856460491643e-06, "loss": 0.0173, "step": 5688 }, { "epoch": 2.588262056414923, "grad_norm": 0.6348673673434349, "learning_rate": 2.3621428942125923e-06, "loss": 0.0074, "step": 5689 }, { "epoch": 2.5887170154686077, "grad_norm": 0.5039070596268133, "learning_rate": 2.361429339198603e-06, "loss": 0.0103, "step": 5690 }, { "epoch": 2.589171974522293, "grad_norm": 0.19772150262953322, "learning_rate": 2.3607157955079817e-06, "loss": 0.0031, "step": 5691 }, { "epoch": 2.5896269335759783, "grad_norm": 0.45035368951064875, "learning_rate": 2.3600022631990376e-06, "loss": 0.0131, "step": 5692 }, { "epoch": 2.590081892629663, "grad_norm": 1.0306605086593303, "learning_rate": 2.3592887423300752e-06, "loss": 0.0172, "step": 5693 }, { "epoch": 2.5905368516833485, "grad_norm": 0.5837314023232325, "learning_rate": 2.3585752329594026e-06, "loss": 0.0141, "step": 5694 }, { "epoch": 2.5909918107370338, "grad_norm": 0.5435645918850026, "learning_rate": 2.357861735145322e-06, "loss": 0.0142, "step": 5695 }, { "epoch": 2.5914467697907186, "grad_norm": 0.28152403119671165, "learning_rate": 2.3571482489461383e-06, "loss": 0.007, "step": 5696 }, { "epoch": 2.591901728844404, "grad_norm": 0.23053463955527914, "learning_rate": 2.3564347744201556e-06, "loss": 0.0052, "step": 5697 }, { "epoch": 2.5923566878980893, "grad_norm": 0.48813843673110385, "learning_rate": 2.3557213116256745e-06, "loss": 0.0158, "step": 5698 }, { "epoch": 2.592811646951774, "grad_norm": 0.5567192349182676, "learning_rate": 2.3550078606209963e-06, "loss": 0.0089, "step": 5699 }, { "epoch": 2.5932666060054594, "grad_norm": 0.3763919895852756, "learning_rate": 2.354294421464421e-06, "loss": 0.0104, "step": 5700 }, { "epoch": 2.5937215650591448, "grad_norm": 0.5733030868738037, "learning_rate": 2.3535809942142478e-06, "loss": 0.0105, "step": 5701 }, { "epoch": 2.5941765241128296, "grad_norm": 0.368499280341102, "learning_rate": 2.352867578928774e-06, "loss": 0.0085, "step": 5702 }, { "epoch": 2.594631483166515, "grad_norm": 0.5557193150967772, "learning_rate": 2.3521541756662965e-06, "loss": 0.0134, "step": 5703 }, { "epoch": 2.5950864422202002, "grad_norm": 0.816530205222988, "learning_rate": 2.3514407844851133e-06, "loss": 0.0162, "step": 5704 }, { "epoch": 2.595541401273885, "grad_norm": 0.48831511856175436, "learning_rate": 2.350727405443518e-06, "loss": 0.0123, "step": 5705 }, { "epoch": 2.5959963603275704, "grad_norm": 0.5951139942471467, "learning_rate": 2.350014038599805e-06, "loss": 0.0118, "step": 5706 }, { "epoch": 2.5964513193812557, "grad_norm": 0.4353148650075021, "learning_rate": 2.3493006840122676e-06, "loss": 0.0147, "step": 5707 }, { "epoch": 2.5969062784349406, "grad_norm": 0.5795591785260142, "learning_rate": 2.348587341739198e-06, "loss": 0.0155, "step": 5708 }, { "epoch": 2.597361237488626, "grad_norm": 0.6586796030292281, "learning_rate": 2.3478740118388865e-06, "loss": 0.0196, "step": 5709 }, { "epoch": 2.597816196542311, "grad_norm": 0.468284417335068, "learning_rate": 2.3471606943696232e-06, "loss": 0.0141, "step": 5710 }, { "epoch": 2.598271155595996, "grad_norm": 0.31654213599162, "learning_rate": 2.346447389389699e-06, "loss": 0.0057, "step": 5711 }, { "epoch": 2.5987261146496814, "grad_norm": 0.5403550983575814, "learning_rate": 2.3457340969573995e-06, "loss": 0.015, "step": 5712 }, { "epoch": 2.5991810737033667, "grad_norm": 0.5115146645272663, "learning_rate": 2.345020817131014e-06, "loss": 0.0117, "step": 5713 }, { "epoch": 2.599636032757052, "grad_norm": 0.4957333074030325, "learning_rate": 2.3443075499688277e-06, "loss": 0.0107, "step": 5714 }, { "epoch": 2.600090991810737, "grad_norm": 0.37098767411245703, "learning_rate": 2.343594295529124e-06, "loss": 0.0061, "step": 5715 }, { "epoch": 2.600545950864422, "grad_norm": 0.42799020688688366, "learning_rate": 2.34288105387019e-06, "loss": 0.013, "step": 5716 }, { "epoch": 2.6010009099181075, "grad_norm": 0.4603927582209522, "learning_rate": 2.3421678250503043e-06, "loss": 0.011, "step": 5717 }, { "epoch": 2.6014558689717924, "grad_norm": 0.5345337460953594, "learning_rate": 2.3414546091277524e-06, "loss": 0.0125, "step": 5718 }, { "epoch": 2.6019108280254777, "grad_norm": 0.34425149607994426, "learning_rate": 2.3407414061608142e-06, "loss": 0.0063, "step": 5719 }, { "epoch": 2.602365787079163, "grad_norm": 0.38615807633787624, "learning_rate": 2.340028216207768e-06, "loss": 0.007, "step": 5720 }, { "epoch": 2.6028207461328483, "grad_norm": 0.36298696630975846, "learning_rate": 2.3393150393268952e-06, "loss": 0.0096, "step": 5721 }, { "epoch": 2.603275705186533, "grad_norm": 0.47055654062849667, "learning_rate": 2.3386018755764704e-06, "loss": 0.0152, "step": 5722 }, { "epoch": 2.6037306642402185, "grad_norm": 0.4311554667310965, "learning_rate": 2.3378887250147724e-06, "loss": 0.0128, "step": 5723 }, { "epoch": 2.604185623293904, "grad_norm": 0.29229107507182667, "learning_rate": 2.3371755877000747e-06, "loss": 0.0061, "step": 5724 }, { "epoch": 2.6046405823475887, "grad_norm": 0.26077912690958677, "learning_rate": 2.3364624636906528e-06, "loss": 0.0039, "step": 5725 }, { "epoch": 2.605095541401274, "grad_norm": 0.4164966399534443, "learning_rate": 2.3357493530447806e-06, "loss": 0.0112, "step": 5726 }, { "epoch": 2.6055505004549593, "grad_norm": 0.4008709538230516, "learning_rate": 2.335036255820729e-06, "loss": 0.0093, "step": 5727 }, { "epoch": 2.606005459508644, "grad_norm": 0.5689662474315526, "learning_rate": 2.33432317207677e-06, "loss": 0.0222, "step": 5728 }, { "epoch": 2.6064604185623295, "grad_norm": 0.3777543943580994, "learning_rate": 2.3336101018711726e-06, "loss": 0.0053, "step": 5729 }, { "epoch": 2.6069153776160148, "grad_norm": 0.43366530743851056, "learning_rate": 2.332897045262207e-06, "loss": 0.0088, "step": 5730 }, { "epoch": 2.6073703366696996, "grad_norm": 0.5699695312272781, "learning_rate": 2.3321840023081392e-06, "loss": 0.0184, "step": 5731 }, { "epoch": 2.607825295723385, "grad_norm": 0.44931311073348107, "learning_rate": 2.331470973067237e-06, "loss": 0.0123, "step": 5732 }, { "epoch": 2.6082802547770703, "grad_norm": 0.3870522784694783, "learning_rate": 2.330757957597767e-06, "loss": 0.0098, "step": 5733 }, { "epoch": 2.608735213830755, "grad_norm": 0.5539317209613257, "learning_rate": 2.3300449559579916e-06, "loss": 0.0115, "step": 5734 }, { "epoch": 2.6091901728844404, "grad_norm": 0.4282496247501506, "learning_rate": 2.3293319682061752e-06, "loss": 0.0114, "step": 5735 }, { "epoch": 2.6096451319381258, "grad_norm": 0.2639361728950889, "learning_rate": 2.3286189944005794e-06, "loss": 0.0055, "step": 5736 }, { "epoch": 2.6101000909918106, "grad_norm": 0.37061781939671656, "learning_rate": 2.327906034599466e-06, "loss": 0.0077, "step": 5737 }, { "epoch": 2.610555050045496, "grad_norm": 0.3558110116262037, "learning_rate": 2.3271930888610927e-06, "loss": 0.0085, "step": 5738 }, { "epoch": 2.6110100090991812, "grad_norm": 0.2260231104551758, "learning_rate": 2.3264801572437206e-06, "loss": 0.0048, "step": 5739 }, { "epoch": 2.611464968152866, "grad_norm": 0.5405387177941061, "learning_rate": 2.325767239805607e-06, "loss": 0.0104, "step": 5740 }, { "epoch": 2.6119199272065514, "grad_norm": 0.37714454256854674, "learning_rate": 2.325054336605007e-06, "loss": 0.0083, "step": 5741 }, { "epoch": 2.6123748862602367, "grad_norm": 0.24488591989675917, "learning_rate": 2.324341447700178e-06, "loss": 0.0048, "step": 5742 }, { "epoch": 2.6128298453139216, "grad_norm": 0.6442454814573733, "learning_rate": 2.323628573149371e-06, "loss": 0.0083, "step": 5743 }, { "epoch": 2.613284804367607, "grad_norm": 1.5180638276894596, "learning_rate": 2.322915713010842e-06, "loss": 0.0288, "step": 5744 }, { "epoch": 2.613739763421292, "grad_norm": 0.5781935638333355, "learning_rate": 2.3222028673428394e-06, "loss": 0.0171, "step": 5745 }, { "epoch": 2.614194722474977, "grad_norm": 0.5472293934159035, "learning_rate": 2.3214900362036165e-06, "loss": 0.01, "step": 5746 }, { "epoch": 2.6146496815286624, "grad_norm": 0.7294800420886615, "learning_rate": 2.3207772196514216e-06, "loss": 0.0135, "step": 5747 }, { "epoch": 2.6151046405823477, "grad_norm": 0.551145690129928, "learning_rate": 2.3200644177445034e-06, "loss": 0.009, "step": 5748 }, { "epoch": 2.6155595996360326, "grad_norm": 0.4035416305091776, "learning_rate": 2.3193516305411082e-06, "loss": 0.0083, "step": 5749 }, { "epoch": 2.616014558689718, "grad_norm": 0.3025351010497842, "learning_rate": 2.318638858099482e-06, "loss": 0.0078, "step": 5750 }, { "epoch": 2.616469517743403, "grad_norm": 0.5578307481713445, "learning_rate": 2.317926100477869e-06, "loss": 0.0132, "step": 5751 }, { "epoch": 2.616924476797088, "grad_norm": 1.0193839362426635, "learning_rate": 2.317213357734512e-06, "loss": 0.0297, "step": 5752 }, { "epoch": 2.6173794358507734, "grad_norm": 0.36038469581229865, "learning_rate": 2.3165006299276555e-06, "loss": 0.0056, "step": 5753 }, { "epoch": 2.6178343949044587, "grad_norm": 0.341527805661225, "learning_rate": 2.315787917115538e-06, "loss": 0.0072, "step": 5754 }, { "epoch": 2.6182893539581436, "grad_norm": 0.521668123983328, "learning_rate": 2.3150752193564006e-06, "loss": 0.0131, "step": 5755 }, { "epoch": 2.618744313011829, "grad_norm": 0.30677290767508775, "learning_rate": 2.3143625367084802e-06, "loss": 0.0051, "step": 5756 }, { "epoch": 2.619199272065514, "grad_norm": 0.3434592405136861, "learning_rate": 2.313649869230016e-06, "loss": 0.0085, "step": 5757 }, { "epoch": 2.619654231119199, "grad_norm": 0.39207237568283626, "learning_rate": 2.3129372169792427e-06, "loss": 0.0099, "step": 5758 }, { "epoch": 2.6201091901728844, "grad_norm": 0.47608996337529913, "learning_rate": 2.3122245800143944e-06, "loss": 0.0101, "step": 5759 }, { "epoch": 2.6205641492265697, "grad_norm": 0.36556539158893026, "learning_rate": 2.311511958393706e-06, "loss": 0.0037, "step": 5760 }, { "epoch": 2.6210191082802545, "grad_norm": 0.8514435451785441, "learning_rate": 2.3107993521754092e-06, "loss": 0.0214, "step": 5761 }, { "epoch": 2.62147406733394, "grad_norm": 0.3269813945183917, "learning_rate": 2.3100867614177353e-06, "loss": 0.0064, "step": 5762 }, { "epoch": 2.621929026387625, "grad_norm": 0.31550076697471874, "learning_rate": 2.3093741861789133e-06, "loss": 0.0052, "step": 5763 }, { "epoch": 2.62238398544131, "grad_norm": 0.5951810614545807, "learning_rate": 2.3086616265171724e-06, "loss": 0.0092, "step": 5764 }, { "epoch": 2.6228389444949953, "grad_norm": 0.24810673720986626, "learning_rate": 2.3079490824907386e-06, "loss": 0.0046, "step": 5765 }, { "epoch": 2.6232939035486806, "grad_norm": 0.363096326388244, "learning_rate": 2.307236554157838e-06, "loss": 0.01, "step": 5766 }, { "epoch": 2.623748862602366, "grad_norm": 0.5808772163237255, "learning_rate": 2.3065240415766966e-06, "loss": 0.0152, "step": 5767 }, { "epoch": 2.624203821656051, "grad_norm": 0.6857191964735694, "learning_rate": 2.3058115448055363e-06, "loss": 0.0151, "step": 5768 }, { "epoch": 2.624658780709736, "grad_norm": 0.24246347874497162, "learning_rate": 2.3050990639025804e-06, "loss": 0.0042, "step": 5769 }, { "epoch": 2.6251137397634214, "grad_norm": 0.4554349275524944, "learning_rate": 2.304386598926048e-06, "loss": 0.0077, "step": 5770 }, { "epoch": 2.6255686988171063, "grad_norm": 0.6047291742998906, "learning_rate": 2.3036741499341607e-06, "loss": 0.0165, "step": 5771 }, { "epoch": 2.6260236578707916, "grad_norm": 0.5941676566800239, "learning_rate": 2.302961716985134e-06, "loss": 0.0149, "step": 5772 }, { "epoch": 2.626478616924477, "grad_norm": 0.4910333995321091, "learning_rate": 2.3022493001371853e-06, "loss": 0.0122, "step": 5773 }, { "epoch": 2.6269335759781622, "grad_norm": 0.3324775851990431, "learning_rate": 2.301536899448532e-06, "loss": 0.0062, "step": 5774 }, { "epoch": 2.627388535031847, "grad_norm": 0.2674833959805248, "learning_rate": 2.3008245149773865e-06, "loss": 0.0067, "step": 5775 }, { "epoch": 2.6278434940855324, "grad_norm": 0.3119579197369337, "learning_rate": 2.300112146781963e-06, "loss": 0.0065, "step": 5776 }, { "epoch": 2.6282984531392177, "grad_norm": 0.33436318394314696, "learning_rate": 2.2993997949204714e-06, "loss": 0.0046, "step": 5777 }, { "epoch": 2.6287534121929026, "grad_norm": 0.44149091731544277, "learning_rate": 2.2986874594511234e-06, "loss": 0.0087, "step": 5778 }, { "epoch": 2.629208371246588, "grad_norm": 0.3145502853713885, "learning_rate": 2.297975140432126e-06, "loss": 0.0061, "step": 5779 }, { "epoch": 2.629663330300273, "grad_norm": 0.5929665771378311, "learning_rate": 2.2972628379216877e-06, "loss": 0.0093, "step": 5780 }, { "epoch": 2.630118289353958, "grad_norm": 0.597611324618426, "learning_rate": 2.2965505519780156e-06, "loss": 0.0092, "step": 5781 }, { "epoch": 2.6305732484076434, "grad_norm": 0.3820027462404495, "learning_rate": 2.295838282659313e-06, "loss": 0.0072, "step": 5782 }, { "epoch": 2.6310282074613287, "grad_norm": 0.6492190239644151, "learning_rate": 2.2951260300237847e-06, "loss": 0.0093, "step": 5783 }, { "epoch": 2.6314831665150136, "grad_norm": 0.7043177297324652, "learning_rate": 2.2944137941296323e-06, "loss": 0.016, "step": 5784 }, { "epoch": 2.631938125568699, "grad_norm": 0.5803969818727341, "learning_rate": 2.2937015750350554e-06, "loss": 0.0131, "step": 5785 }, { "epoch": 2.632393084622384, "grad_norm": 0.48823864207286677, "learning_rate": 2.2929893727982547e-06, "loss": 0.0094, "step": 5786 }, { "epoch": 2.632848043676069, "grad_norm": 0.35486945890316113, "learning_rate": 2.2922771874774263e-06, "loss": 0.0051, "step": 5787 }, { "epoch": 2.6333030027297544, "grad_norm": 0.3096847837826242, "learning_rate": 2.291565019130769e-06, "loss": 0.0068, "step": 5788 }, { "epoch": 2.6337579617834397, "grad_norm": 0.49304503693779095, "learning_rate": 2.2908528678164773e-06, "loss": 0.0134, "step": 5789 }, { "epoch": 2.6342129208371245, "grad_norm": 0.40438256298131037, "learning_rate": 2.290140733592744e-06, "loss": 0.0065, "step": 5790 }, { "epoch": 2.63466787989081, "grad_norm": 0.4073215603185982, "learning_rate": 2.2894286165177634e-06, "loss": 0.0051, "step": 5791 }, { "epoch": 2.635122838944495, "grad_norm": 0.46983003000024337, "learning_rate": 2.2887165166497242e-06, "loss": 0.0057, "step": 5792 }, { "epoch": 2.63557779799818, "grad_norm": 0.37447891178021175, "learning_rate": 2.288004434046818e-06, "loss": 0.0067, "step": 5793 }, { "epoch": 2.6360327570518653, "grad_norm": 0.38063093070715653, "learning_rate": 2.287292368767231e-06, "loss": 0.0114, "step": 5794 }, { "epoch": 2.6364877161055507, "grad_norm": 0.38122474659569233, "learning_rate": 2.2865803208691515e-06, "loss": 0.0074, "step": 5795 }, { "epoch": 2.6369426751592355, "grad_norm": 0.6356364363180046, "learning_rate": 2.285868290410765e-06, "loss": 0.0101, "step": 5796 }, { "epoch": 2.637397634212921, "grad_norm": 0.5048297712647506, "learning_rate": 2.285156277450254e-06, "loss": 0.014, "step": 5797 }, { "epoch": 2.637852593266606, "grad_norm": 0.4314337281611259, "learning_rate": 2.284444282045803e-06, "loss": 0.0085, "step": 5798 }, { "epoch": 2.638307552320291, "grad_norm": 0.6505160698802979, "learning_rate": 2.2837323042555906e-06, "loss": 0.0249, "step": 5799 }, { "epoch": 2.6387625113739763, "grad_norm": 0.6604680821852107, "learning_rate": 2.2830203441377984e-06, "loss": 0.0372, "step": 5800 }, { "epoch": 2.6392174704276616, "grad_norm": 0.4291109886075772, "learning_rate": 2.2823084017506025e-06, "loss": 0.0128, "step": 5801 }, { "epoch": 2.6396724294813465, "grad_norm": 0.40359188796115564, "learning_rate": 2.281596477152182e-06, "loss": 0.0088, "step": 5802 }, { "epoch": 2.640127388535032, "grad_norm": 0.5386540692988163, "learning_rate": 2.280884570400711e-06, "loss": 0.0155, "step": 5803 }, { "epoch": 2.640582347588717, "grad_norm": 0.562428428013761, "learning_rate": 2.2801726815543633e-06, "loss": 0.0121, "step": 5804 }, { "epoch": 2.641037306642402, "grad_norm": 0.41888963111075744, "learning_rate": 2.2794608106713116e-06, "loss": 0.0102, "step": 5805 }, { "epoch": 2.6414922656960873, "grad_norm": 0.3262803704867561, "learning_rate": 2.278748957809726e-06, "loss": 0.0028, "step": 5806 }, { "epoch": 2.6419472247497726, "grad_norm": 0.3528142882781218, "learning_rate": 2.2780371230277773e-06, "loss": 0.0034, "step": 5807 }, { "epoch": 2.6424021838034575, "grad_norm": 0.3941370665427246, "learning_rate": 2.2773253063836313e-06, "loss": 0.007, "step": 5808 }, { "epoch": 2.642857142857143, "grad_norm": 0.3373759754826735, "learning_rate": 2.276613507935456e-06, "loss": 0.0037, "step": 5809 }, { "epoch": 2.643312101910828, "grad_norm": 0.39413462179909164, "learning_rate": 2.2759017277414165e-06, "loss": 0.0081, "step": 5810 }, { "epoch": 2.643767060964513, "grad_norm": 0.28118969111617287, "learning_rate": 2.2751899658596755e-06, "loss": 0.0066, "step": 5811 }, { "epoch": 2.6442220200181983, "grad_norm": 0.4788079166280209, "learning_rate": 2.2744782223483956e-06, "loss": 0.0104, "step": 5812 }, { "epoch": 2.6446769790718836, "grad_norm": 0.4901097521421739, "learning_rate": 2.2737664972657367e-06, "loss": 0.0105, "step": 5813 }, { "epoch": 2.6451319381255685, "grad_norm": 0.6170712159184213, "learning_rate": 2.2730547906698582e-06, "loss": 0.0137, "step": 5814 }, { "epoch": 2.6455868971792538, "grad_norm": 0.6780796969944884, "learning_rate": 2.2723431026189165e-06, "loss": 0.0171, "step": 5815 }, { "epoch": 2.646041856232939, "grad_norm": 0.5361389083225993, "learning_rate": 2.271631433171069e-06, "loss": 0.0137, "step": 5816 }, { "epoch": 2.646496815286624, "grad_norm": 0.49895972610399636, "learning_rate": 2.2709197823844696e-06, "loss": 0.009, "step": 5817 }, { "epoch": 2.6469517743403093, "grad_norm": 0.5505810708995255, "learning_rate": 2.2702081503172706e-06, "loss": 0.0177, "step": 5818 }, { "epoch": 2.6474067333939946, "grad_norm": 0.4569427305185593, "learning_rate": 2.2694965370276244e-06, "loss": 0.0116, "step": 5819 }, { "epoch": 2.6478616924476794, "grad_norm": 0.14917726472237658, "learning_rate": 2.2687849425736806e-06, "loss": 0.0023, "step": 5820 }, { "epoch": 2.6483166515013647, "grad_norm": 0.7887147894373706, "learning_rate": 2.2680733670135864e-06, "loss": 0.0171, "step": 5821 }, { "epoch": 2.64877161055505, "grad_norm": 0.42323518351328293, "learning_rate": 2.2673618104054885e-06, "loss": 0.0144, "step": 5822 }, { "epoch": 2.6492265696087354, "grad_norm": 0.41535728645884357, "learning_rate": 2.266650272807534e-06, "loss": 0.009, "step": 5823 }, { "epoch": 2.6496815286624202, "grad_norm": 0.2429238805842699, "learning_rate": 2.265938754277865e-06, "loss": 0.0048, "step": 5824 }, { "epoch": 2.6501364877161055, "grad_norm": 0.3595784192233995, "learning_rate": 2.2652272548746245e-06, "loss": 0.0077, "step": 5825 }, { "epoch": 2.650591446769791, "grad_norm": 0.5033121678312261, "learning_rate": 2.264515774655952e-06, "loss": 0.0127, "step": 5826 }, { "epoch": 2.6510464058234757, "grad_norm": 0.5535564747067478, "learning_rate": 2.2638043136799876e-06, "loss": 0.0186, "step": 5827 }, { "epoch": 2.651501364877161, "grad_norm": 0.5598510236976997, "learning_rate": 2.2630928720048674e-06, "loss": 0.0206, "step": 5828 }, { "epoch": 2.6519563239308463, "grad_norm": 0.4944102705641547, "learning_rate": 2.262381449688727e-06, "loss": 0.0125, "step": 5829 }, { "epoch": 2.6524112829845317, "grad_norm": 0.4335493037779709, "learning_rate": 2.261670046789703e-06, "loss": 0.0093, "step": 5830 }, { "epoch": 2.6528662420382165, "grad_norm": 0.26795518728169215, "learning_rate": 2.2609586633659256e-06, "loss": 0.0051, "step": 5831 }, { "epoch": 2.653321201091902, "grad_norm": 0.40693613382849975, "learning_rate": 2.2602472994755274e-06, "loss": 0.0065, "step": 5832 }, { "epoch": 2.653776160145587, "grad_norm": 0.4191461442563829, "learning_rate": 2.2595359551766367e-06, "loss": 0.0103, "step": 5833 }, { "epoch": 2.654231119199272, "grad_norm": 0.5494217713082916, "learning_rate": 2.2588246305273823e-06, "loss": 0.0139, "step": 5834 }, { "epoch": 2.6546860782529573, "grad_norm": 0.44294717517788096, "learning_rate": 2.2581133255858893e-06, "loss": 0.0099, "step": 5835 }, { "epoch": 2.6551410373066426, "grad_norm": 0.44277995653201174, "learning_rate": 2.2574020404102824e-06, "loss": 0.0125, "step": 5836 }, { "epoch": 2.6555959963603275, "grad_norm": 0.5391314558344218, "learning_rate": 2.2566907750586867e-06, "loss": 0.014, "step": 5837 }, { "epoch": 2.656050955414013, "grad_norm": 0.3226180445140648, "learning_rate": 2.2559795295892214e-06, "loss": 0.005, "step": 5838 }, { "epoch": 2.656505914467698, "grad_norm": 0.5632986907845435, "learning_rate": 2.2552683040600072e-06, "loss": 0.0201, "step": 5839 }, { "epoch": 2.656960873521383, "grad_norm": 0.3650985984299971, "learning_rate": 2.254557098529162e-06, "loss": 0.0073, "step": 5840 }, { "epoch": 2.6574158325750683, "grad_norm": 0.34996962081811145, "learning_rate": 2.253845913054802e-06, "loss": 0.01, "step": 5841 }, { "epoch": 2.6578707916287536, "grad_norm": 0.4379522652319161, "learning_rate": 2.2531347476950422e-06, "loss": 0.0072, "step": 5842 }, { "epoch": 2.6583257506824385, "grad_norm": 0.34802118294821843, "learning_rate": 2.252423602507996e-06, "loss": 0.0083, "step": 5843 }, { "epoch": 2.658780709736124, "grad_norm": 0.4737733017637036, "learning_rate": 2.2517124775517753e-06, "loss": 0.0131, "step": 5844 }, { "epoch": 2.659235668789809, "grad_norm": 0.8015495000208132, "learning_rate": 2.2510013728844896e-06, "loss": 0.0193, "step": 5845 }, { "epoch": 2.659690627843494, "grad_norm": 0.25476189896938023, "learning_rate": 2.2502902885642474e-06, "loss": 0.0039, "step": 5846 }, { "epoch": 2.6601455868971793, "grad_norm": 0.4147759042784719, "learning_rate": 2.249579224649155e-06, "loss": 0.0082, "step": 5847 }, { "epoch": 2.6606005459508646, "grad_norm": 0.22084769125472384, "learning_rate": 2.248868181197318e-06, "loss": 0.0027, "step": 5848 }, { "epoch": 2.6610555050045495, "grad_norm": 0.2510079998580514, "learning_rate": 2.2481571582668384e-06, "loss": 0.0046, "step": 5849 }, { "epoch": 2.6615104640582348, "grad_norm": 0.3146022722110543, "learning_rate": 2.2474461559158176e-06, "loss": 0.005, "step": 5850 }, { "epoch": 2.66196542311192, "grad_norm": 0.5065101016451348, "learning_rate": 2.246735174202358e-06, "loss": 0.0096, "step": 5851 }, { "epoch": 2.662420382165605, "grad_norm": 0.38070888887424015, "learning_rate": 2.2460242131845556e-06, "loss": 0.0077, "step": 5852 }, { "epoch": 2.6628753412192903, "grad_norm": 0.44218606088840146, "learning_rate": 2.2453132729205078e-06, "loss": 0.0125, "step": 5853 }, { "epoch": 2.6633303002729756, "grad_norm": 0.4280021860775236, "learning_rate": 2.244602353468309e-06, "loss": 0.0081, "step": 5854 }, { "epoch": 2.6637852593266604, "grad_norm": 0.6075442629923459, "learning_rate": 2.243891454886053e-06, "loss": 0.014, "step": 5855 }, { "epoch": 2.6642402183803457, "grad_norm": 0.41669976767179123, "learning_rate": 2.243180577231831e-06, "loss": 0.008, "step": 5856 }, { "epoch": 2.664695177434031, "grad_norm": 0.3132204422624939, "learning_rate": 2.242469720563731e-06, "loss": 0.0078, "step": 5857 }, { "epoch": 2.665150136487716, "grad_norm": 0.3446803320449168, "learning_rate": 2.241758884939843e-06, "loss": 0.0058, "step": 5858 }, { "epoch": 2.6656050955414012, "grad_norm": 0.4631758506670344, "learning_rate": 2.241048070418253e-06, "loss": 0.0149, "step": 5859 }, { "epoch": 2.6660600545950865, "grad_norm": 0.5334690291630625, "learning_rate": 2.240337277057045e-06, "loss": 0.015, "step": 5860 }, { "epoch": 2.6665150136487714, "grad_norm": 0.5786091912621981, "learning_rate": 2.2396265049143027e-06, "loss": 0.0128, "step": 5861 }, { "epoch": 2.6669699727024567, "grad_norm": 1.5742129882508953, "learning_rate": 2.238915754048106e-06, "loss": 0.0424, "step": 5862 }, { "epoch": 2.667424931756142, "grad_norm": 0.47801059189597905, "learning_rate": 2.2382050245165355e-06, "loss": 0.0168, "step": 5863 }, { "epoch": 2.667879890809827, "grad_norm": 0.570296529321114, "learning_rate": 2.2374943163776665e-06, "loss": 0.0103, "step": 5864 }, { "epoch": 2.668334849863512, "grad_norm": 0.45800906247663964, "learning_rate": 2.2367836296895777e-06, "loss": 0.0127, "step": 5865 }, { "epoch": 2.6687898089171975, "grad_norm": 0.39336927432773766, "learning_rate": 2.2360729645103423e-06, "loss": 0.0062, "step": 5866 }, { "epoch": 2.6692447679708824, "grad_norm": 0.5697945408841323, "learning_rate": 2.235362320898032e-06, "loss": 0.0135, "step": 5867 }, { "epoch": 2.6696997270245677, "grad_norm": 0.4783062350946733, "learning_rate": 2.234651698910718e-06, "loss": 0.0126, "step": 5868 }, { "epoch": 2.670154686078253, "grad_norm": 0.591905480365963, "learning_rate": 2.233941098606468e-06, "loss": 0.0162, "step": 5869 }, { "epoch": 2.670609645131938, "grad_norm": 0.5587744027569703, "learning_rate": 2.2332305200433514e-06, "loss": 0.0137, "step": 5870 }, { "epoch": 2.671064604185623, "grad_norm": 0.5758192796751398, "learning_rate": 2.23251996327943e-06, "loss": 0.018, "step": 5871 }, { "epoch": 2.6715195632393085, "grad_norm": 0.6551678755857482, "learning_rate": 2.23180942837277e-06, "loss": 0.0109, "step": 5872 }, { "epoch": 2.6719745222929934, "grad_norm": 0.2984658000677207, "learning_rate": 2.2310989153814334e-06, "loss": 0.0044, "step": 5873 }, { "epoch": 2.6724294813466787, "grad_norm": 0.3362333357477884, "learning_rate": 2.230388424363478e-06, "loss": 0.0072, "step": 5874 }, { "epoch": 2.672884440400364, "grad_norm": 0.39796689589439427, "learning_rate": 2.229677955376964e-06, "loss": 0.0123, "step": 5875 }, { "epoch": 2.673339399454049, "grad_norm": 0.556185874762118, "learning_rate": 2.2289675084799463e-06, "loss": 0.0151, "step": 5876 }, { "epoch": 2.673794358507734, "grad_norm": 0.5862955332478849, "learning_rate": 2.2282570837304797e-06, "loss": 0.0076, "step": 5877 }, { "epoch": 2.6742493175614195, "grad_norm": 0.38340244986476407, "learning_rate": 2.2275466811866163e-06, "loss": 0.0099, "step": 5878 }, { "epoch": 2.674704276615105, "grad_norm": 0.28446390833493995, "learning_rate": 2.2268363009064082e-06, "loss": 0.0074, "step": 5879 }, { "epoch": 2.6751592356687897, "grad_norm": 0.30197667900051023, "learning_rate": 2.226125942947905e-06, "loss": 0.0067, "step": 5880 }, { "epoch": 2.675614194722475, "grad_norm": 0.6151304355560283, "learning_rate": 2.2254156073691517e-06, "loss": 0.0108, "step": 5881 }, { "epoch": 2.6760691537761603, "grad_norm": 0.39423703832494905, "learning_rate": 2.2247052942281958e-06, "loss": 0.0071, "step": 5882 }, { "epoch": 2.676524112829845, "grad_norm": 0.41103837932815135, "learning_rate": 2.2239950035830797e-06, "loss": 0.0093, "step": 5883 }, { "epoch": 2.6769790718835305, "grad_norm": 0.5941345725339126, "learning_rate": 2.223284735491846e-06, "loss": 0.0134, "step": 5884 }, { "epoch": 2.6774340309372158, "grad_norm": 0.3254952901022357, "learning_rate": 2.2225744900125324e-06, "loss": 0.0102, "step": 5885 }, { "epoch": 2.677888989990901, "grad_norm": 0.17885406556344347, "learning_rate": 2.2218642672031794e-06, "loss": 0.0034, "step": 5886 }, { "epoch": 2.678343949044586, "grad_norm": 0.4485044243526959, "learning_rate": 2.2211540671218236e-06, "loss": 0.0145, "step": 5887 }, { "epoch": 2.6787989080982713, "grad_norm": 0.38159501907734683, "learning_rate": 2.2204438898264973e-06, "loss": 0.0057, "step": 5888 }, { "epoch": 2.6792538671519566, "grad_norm": 0.3807990344349225, "learning_rate": 2.219733735375234e-06, "loss": 0.0062, "step": 5889 }, { "epoch": 2.6797088262056414, "grad_norm": 0.5973074275566393, "learning_rate": 2.2190236038260647e-06, "loss": 0.0114, "step": 5890 }, { "epoch": 2.6801637852593267, "grad_norm": 0.39812037366902153, "learning_rate": 2.2183134952370154e-06, "loss": 0.0076, "step": 5891 }, { "epoch": 2.680618744313012, "grad_norm": 0.6624247155514199, "learning_rate": 2.2176034096661174e-06, "loss": 0.0151, "step": 5892 }, { "epoch": 2.681073703366697, "grad_norm": 0.3447920854013603, "learning_rate": 2.2168933471713933e-06, "loss": 0.0068, "step": 5893 }, { "epoch": 2.6815286624203822, "grad_norm": 0.727176204537926, "learning_rate": 2.2161833078108657e-06, "loss": 0.0236, "step": 5894 }, { "epoch": 2.6819836214740675, "grad_norm": 0.4803975790568459, "learning_rate": 2.215473291642557e-06, "loss": 0.0081, "step": 5895 }, { "epoch": 2.6824385805277524, "grad_norm": 0.3869688400878832, "learning_rate": 2.2147632987244855e-06, "loss": 0.0074, "step": 5896 }, { "epoch": 2.6828935395814377, "grad_norm": 0.5008549177721616, "learning_rate": 2.2140533291146697e-06, "loss": 0.0049, "step": 5897 }, { "epoch": 2.683348498635123, "grad_norm": 0.6608226776008678, "learning_rate": 2.2133433828711235e-06, "loss": 0.0152, "step": 5898 }, { "epoch": 2.683803457688808, "grad_norm": 0.6436409948177629, "learning_rate": 2.212633460051862e-06, "loss": 0.0143, "step": 5899 }, { "epoch": 2.684258416742493, "grad_norm": 0.5221364073745415, "learning_rate": 2.211923560714897e-06, "loss": 0.0106, "step": 5900 }, { "epoch": 2.6847133757961785, "grad_norm": 0.5483765794411765, "learning_rate": 2.211213684918237e-06, "loss": 0.0152, "step": 5901 }, { "epoch": 2.6851683348498634, "grad_norm": 0.3826020307871731, "learning_rate": 2.2105038327198914e-06, "loss": 0.0083, "step": 5902 }, { "epoch": 2.6856232939035487, "grad_norm": 0.3986345955767916, "learning_rate": 2.209794004177864e-06, "loss": 0.0046, "step": 5903 }, { "epoch": 2.686078252957234, "grad_norm": 0.39359500154344695, "learning_rate": 2.2090841993501614e-06, "loss": 0.0078, "step": 5904 }, { "epoch": 2.686533212010919, "grad_norm": 0.25950352317748276, "learning_rate": 2.2083744182947828e-06, "loss": 0.0044, "step": 5905 }, { "epoch": 2.686988171064604, "grad_norm": 0.5481308695529996, "learning_rate": 2.2076646610697304e-06, "loss": 0.011, "step": 5906 }, { "epoch": 2.6874431301182895, "grad_norm": 0.32080629750563733, "learning_rate": 2.206954927733002e-06, "loss": 0.0056, "step": 5907 }, { "epoch": 2.6878980891719744, "grad_norm": 0.5136584506415268, "learning_rate": 2.206245218342593e-06, "loss": 0.0168, "step": 5908 }, { "epoch": 2.6883530482256597, "grad_norm": 0.42599090150764224, "learning_rate": 2.205535532956499e-06, "loss": 0.008, "step": 5909 }, { "epoch": 2.688808007279345, "grad_norm": 0.22158374298781153, "learning_rate": 2.2048258716327107e-06, "loss": 0.0066, "step": 5910 }, { "epoch": 2.68926296633303, "grad_norm": 0.48437769451797014, "learning_rate": 2.20411623442922e-06, "loss": 0.0147, "step": 5911 }, { "epoch": 2.689717925386715, "grad_norm": 0.2850208231838959, "learning_rate": 2.203406621404013e-06, "loss": 0.0044, "step": 5912 }, { "epoch": 2.6901728844404005, "grad_norm": 0.6319894244913241, "learning_rate": 2.202697032615078e-06, "loss": 0.0189, "step": 5913 }, { "epoch": 2.6906278434940853, "grad_norm": 0.4550755727426168, "learning_rate": 2.2019874681204e-06, "loss": 0.0092, "step": 5914 }, { "epoch": 2.6910828025477707, "grad_norm": 0.4556357306168323, "learning_rate": 2.2012779279779593e-06, "loss": 0.0079, "step": 5915 }, { "epoch": 2.691537761601456, "grad_norm": 0.4854483843257913, "learning_rate": 2.2005684122457377e-06, "loss": 0.0143, "step": 5916 }, { "epoch": 2.691992720655141, "grad_norm": 0.4206137071252036, "learning_rate": 2.199858920981713e-06, "loss": 0.0094, "step": 5917 }, { "epoch": 2.692447679708826, "grad_norm": 0.47055028345577454, "learning_rate": 2.199149454243862e-06, "loss": 0.0156, "step": 5918 }, { "epoch": 2.6929026387625115, "grad_norm": 0.3558991339706368, "learning_rate": 2.1984400120901585e-06, "loss": 0.0046, "step": 5919 }, { "epoch": 2.6933575978161963, "grad_norm": 0.46871098323660204, "learning_rate": 2.1977305945785756e-06, "loss": 0.0152, "step": 5920 }, { "epoch": 2.6938125568698816, "grad_norm": 0.6358749944520551, "learning_rate": 2.197021201767084e-06, "loss": 0.0166, "step": 5921 }, { "epoch": 2.694267515923567, "grad_norm": 0.5205513136691301, "learning_rate": 2.1963118337136508e-06, "loss": 0.0188, "step": 5922 }, { "epoch": 2.694722474977252, "grad_norm": 0.3301453199294807, "learning_rate": 2.195602490476244e-06, "loss": 0.0041, "step": 5923 }, { "epoch": 2.695177434030937, "grad_norm": 0.38206999813879355, "learning_rate": 2.1948931721128262e-06, "loss": 0.0062, "step": 5924 }, { "epoch": 2.6956323930846224, "grad_norm": 0.4518503109815764, "learning_rate": 2.194183878681361e-06, "loss": 0.0132, "step": 5925 }, { "epoch": 2.6960873521383073, "grad_norm": 0.5482130678977459, "learning_rate": 2.1934746102398076e-06, "loss": 0.0137, "step": 5926 }, { "epoch": 2.6965423111919926, "grad_norm": 0.4677884182060836, "learning_rate": 2.1927653668461253e-06, "loss": 0.0072, "step": 5927 }, { "epoch": 2.696997270245678, "grad_norm": 0.267093179050148, "learning_rate": 2.1920561485582696e-06, "loss": 0.0047, "step": 5928 }, { "epoch": 2.697452229299363, "grad_norm": 0.3892006786583256, "learning_rate": 2.1913469554341953e-06, "loss": 0.0097, "step": 5929 }, { "epoch": 2.697907188353048, "grad_norm": 0.2923737204033531, "learning_rate": 2.1906377875318535e-06, "loss": 0.0051, "step": 5930 }, { "epoch": 2.6983621474067334, "grad_norm": 0.4470528446494097, "learning_rate": 2.189928644909195e-06, "loss": 0.0131, "step": 5931 }, { "epoch": 2.6988171064604187, "grad_norm": 0.4875216508579341, "learning_rate": 2.1892195276241667e-06, "loss": 0.0199, "step": 5932 }, { "epoch": 2.6992720655141036, "grad_norm": 0.5132817602773665, "learning_rate": 2.188510435734715e-06, "loss": 0.01, "step": 5933 }, { "epoch": 2.699727024567789, "grad_norm": 0.6759579862117174, "learning_rate": 2.1878013692987848e-06, "loss": 0.0148, "step": 5934 }, { "epoch": 2.700181983621474, "grad_norm": 0.40214167143753987, "learning_rate": 2.1870923283743156e-06, "loss": 0.0072, "step": 5935 }, { "epoch": 2.700636942675159, "grad_norm": 0.22896154553030396, "learning_rate": 2.1863833130192495e-06, "loss": 0.0052, "step": 5936 }, { "epoch": 2.7010919017288444, "grad_norm": 0.5744291179371018, "learning_rate": 2.185674323291522e-06, "loss": 0.0113, "step": 5937 }, { "epoch": 2.7015468607825297, "grad_norm": 0.7702154031486574, "learning_rate": 2.18496535924907e-06, "loss": 0.0105, "step": 5938 }, { "epoch": 2.702001819836215, "grad_norm": 0.6654746986529269, "learning_rate": 2.1842564209498254e-06, "loss": 0.0182, "step": 5939 }, { "epoch": 2.7024567788899, "grad_norm": 0.5258633999194502, "learning_rate": 2.18354750845172e-06, "loss": 0.0126, "step": 5940 }, { "epoch": 2.702911737943585, "grad_norm": 0.5347226137594734, "learning_rate": 2.1828386218126835e-06, "loss": 0.0061, "step": 5941 }, { "epoch": 2.7033666969972705, "grad_norm": 0.37080156234091477, "learning_rate": 2.182129761090643e-06, "loss": 0.0071, "step": 5942 }, { "epoch": 2.7038216560509554, "grad_norm": 0.3841392137328798, "learning_rate": 2.1814209263435226e-06, "loss": 0.0086, "step": 5943 }, { "epoch": 2.7042766151046407, "grad_norm": 0.564985024356286, "learning_rate": 2.1807121176292455e-06, "loss": 0.0143, "step": 5944 }, { "epoch": 2.704731574158326, "grad_norm": 0.23332213068705357, "learning_rate": 2.180003335005732e-06, "loss": 0.0036, "step": 5945 }, { "epoch": 2.705186533212011, "grad_norm": 0.4047043686511817, "learning_rate": 2.1792945785309013e-06, "loss": 0.0082, "step": 5946 }, { "epoch": 2.705641492265696, "grad_norm": 0.4592888266632891, "learning_rate": 2.178585848262668e-06, "loss": 0.0102, "step": 5947 }, { "epoch": 2.7060964513193815, "grad_norm": 0.4852882789625215, "learning_rate": 2.177877144258949e-06, "loss": 0.0138, "step": 5948 }, { "epoch": 2.7065514103730663, "grad_norm": 0.45745486989508766, "learning_rate": 2.1771684665776547e-06, "loss": 0.0083, "step": 5949 }, { "epoch": 2.7070063694267517, "grad_norm": 0.412252346408398, "learning_rate": 2.1764598152766963e-06, "loss": 0.0105, "step": 5950 }, { "epoch": 2.707461328480437, "grad_norm": 0.49891379059314916, "learning_rate": 2.1757511904139795e-06, "loss": 0.0081, "step": 5951 }, { "epoch": 2.707916287534122, "grad_norm": 0.34368322248273436, "learning_rate": 2.175042592047412e-06, "loss": 0.0066, "step": 5952 }, { "epoch": 2.708371246587807, "grad_norm": 0.3651893293954274, "learning_rate": 2.1743340202348956e-06, "loss": 0.006, "step": 5953 }, { "epoch": 2.7088262056414925, "grad_norm": 0.2887854599883066, "learning_rate": 2.1736254750343324e-06, "loss": 0.005, "step": 5954 }, { "epoch": 2.7092811646951773, "grad_norm": 0.44187752084355664, "learning_rate": 2.1729169565036217e-06, "loss": 0.0085, "step": 5955 }, { "epoch": 2.7097361237488626, "grad_norm": 0.31005177650080606, "learning_rate": 2.17220846470066e-06, "loss": 0.0083, "step": 5956 }, { "epoch": 2.710191082802548, "grad_norm": 0.83229760349143, "learning_rate": 2.1714999996833434e-06, "loss": 0.0079, "step": 5957 }, { "epoch": 2.710646041856233, "grad_norm": 0.3352700306696052, "learning_rate": 2.170791561509562e-06, "loss": 0.0049, "step": 5958 }, { "epoch": 2.711101000909918, "grad_norm": 0.5598999156795395, "learning_rate": 2.170083150237209e-06, "loss": 0.0171, "step": 5959 }, { "epoch": 2.7115559599636034, "grad_norm": 0.22857137261039173, "learning_rate": 2.1693747659241695e-06, "loss": 0.0048, "step": 5960 }, { "epoch": 2.7120109190172883, "grad_norm": 0.7329087867348903, "learning_rate": 2.168666408628331e-06, "loss": 0.015, "step": 5961 }, { "epoch": 2.7124658780709736, "grad_norm": 0.43604731820512416, "learning_rate": 2.1679580784075783e-06, "loss": 0.0135, "step": 5962 }, { "epoch": 2.712920837124659, "grad_norm": 0.9320157966413328, "learning_rate": 2.1672497753197914e-06, "loss": 0.0072, "step": 5963 }, { "epoch": 2.713375796178344, "grad_norm": 1.4751531477239885, "learning_rate": 2.1665414994228505e-06, "loss": 0.0115, "step": 5964 }, { "epoch": 2.713830755232029, "grad_norm": 0.45739677151289326, "learning_rate": 2.165833250774633e-06, "loss": 0.0118, "step": 5965 }, { "epoch": 2.7142857142857144, "grad_norm": 0.5720227186158555, "learning_rate": 2.1651250294330124e-06, "loss": 0.0181, "step": 5966 }, { "epoch": 2.7147406733393993, "grad_norm": 0.810936157853959, "learning_rate": 2.1644168354558623e-06, "loss": 0.0148, "step": 5967 }, { "epoch": 2.7151956323930846, "grad_norm": 0.6926987482752969, "learning_rate": 2.163708668901052e-06, "loss": 0.0157, "step": 5968 }, { "epoch": 2.71565059144677, "grad_norm": 0.43754939321258235, "learning_rate": 2.1630005298264513e-06, "loss": 0.0109, "step": 5969 }, { "epoch": 2.7161055505004548, "grad_norm": 0.5445338712759844, "learning_rate": 2.1622924182899257e-06, "loss": 0.0093, "step": 5970 }, { "epoch": 2.71656050955414, "grad_norm": 0.4773173141523642, "learning_rate": 2.1615843343493383e-06, "loss": 0.0152, "step": 5971 }, { "epoch": 2.7170154686078254, "grad_norm": 0.3653708978457601, "learning_rate": 2.1608762780625513e-06, "loss": 0.0072, "step": 5972 }, { "epoch": 2.7174704276615103, "grad_norm": 0.5933674270342885, "learning_rate": 2.1601682494874226e-06, "loss": 0.0192, "step": 5973 }, { "epoch": 2.7179253867151956, "grad_norm": 0.7051287361702634, "learning_rate": 2.1594602486818107e-06, "loss": 0.0271, "step": 5974 }, { "epoch": 2.718380345768881, "grad_norm": 0.2742089241069931, "learning_rate": 2.158752275703568e-06, "loss": 0.0041, "step": 5975 }, { "epoch": 2.7188353048225657, "grad_norm": 0.4938650999416178, "learning_rate": 2.1580443306105494e-06, "loss": 0.0125, "step": 5976 }, { "epoch": 2.719290263876251, "grad_norm": 0.8951679417075508, "learning_rate": 2.157336413460604e-06, "loss": 0.0428, "step": 5977 }, { "epoch": 2.7197452229299364, "grad_norm": 0.5417502430699431, "learning_rate": 2.156628524311579e-06, "loss": 0.0158, "step": 5978 }, { "epoch": 2.7202001819836212, "grad_norm": 0.18250610147845495, "learning_rate": 2.155920663221321e-06, "loss": 0.0037, "step": 5979 }, { "epoch": 2.7206551410373065, "grad_norm": 0.49003778295463496, "learning_rate": 2.1552128302476715e-06, "loss": 0.0114, "step": 5980 }, { "epoch": 2.721110100090992, "grad_norm": 0.5452130002419402, "learning_rate": 2.1545050254484732e-06, "loss": 0.0161, "step": 5981 }, { "epoch": 2.7215650591446767, "grad_norm": 0.43755966108111377, "learning_rate": 2.1537972488815633e-06, "loss": 0.0119, "step": 5982 }, { "epoch": 2.722020018198362, "grad_norm": 0.35015953562243834, "learning_rate": 2.153089500604779e-06, "loss": 0.011, "step": 5983 }, { "epoch": 2.7224749772520473, "grad_norm": 0.29310157733919323, "learning_rate": 2.1523817806759546e-06, "loss": 0.0069, "step": 5984 }, { "epoch": 2.722929936305732, "grad_norm": 0.4876160580266456, "learning_rate": 2.151674089152921e-06, "loss": 0.0122, "step": 5985 }, { "epoch": 2.7233848953594175, "grad_norm": 0.39945317663592417, "learning_rate": 2.150966426093508e-06, "loss": 0.0106, "step": 5986 }, { "epoch": 2.723839854413103, "grad_norm": 0.42866042461683757, "learning_rate": 2.1502587915555423e-06, "loss": 0.0111, "step": 5987 }, { "epoch": 2.724294813466788, "grad_norm": 0.31850654167975445, "learning_rate": 2.149551185596849e-06, "loss": 0.007, "step": 5988 }, { "epoch": 2.724749772520473, "grad_norm": 0.37218076442707193, "learning_rate": 2.148843608275249e-06, "loss": 0.0078, "step": 5989 }, { "epoch": 2.7252047315741583, "grad_norm": 0.321326011764435, "learning_rate": 2.148136059648564e-06, "loss": 0.0083, "step": 5990 }, { "epoch": 2.7256596906278436, "grad_norm": 0.6865320972680453, "learning_rate": 2.1474285397746123e-06, "loss": 0.0178, "step": 5991 }, { "epoch": 2.7261146496815285, "grad_norm": 0.42258442706746246, "learning_rate": 2.1467210487112072e-06, "loss": 0.0059, "step": 5992 }, { "epoch": 2.726569608735214, "grad_norm": 0.6128201663181315, "learning_rate": 2.146013586516163e-06, "loss": 0.0141, "step": 5993 }, { "epoch": 2.727024567788899, "grad_norm": 0.6075240347692777, "learning_rate": 2.14530615324729e-06, "loss": 0.0207, "step": 5994 }, { "epoch": 2.7274795268425844, "grad_norm": 0.6474864579472712, "learning_rate": 2.144598748962396e-06, "loss": 0.0141, "step": 5995 }, { "epoch": 2.7279344858962693, "grad_norm": 0.6120841698172097, "learning_rate": 2.1438913737192867e-06, "loss": 0.0247, "step": 5996 }, { "epoch": 2.7283894449499546, "grad_norm": 0.4404374048993341, "learning_rate": 2.143184027575767e-06, "loss": 0.0134, "step": 5997 }, { "epoch": 2.72884440400364, "grad_norm": 0.38471776617325726, "learning_rate": 2.1424767105896372e-06, "loss": 0.006, "step": 5998 }, { "epoch": 2.729299363057325, "grad_norm": 0.6237110089299068, "learning_rate": 2.1417694228186957e-06, "loss": 0.0119, "step": 5999 }, { "epoch": 2.72975432211101, "grad_norm": 0.39528100673652383, "learning_rate": 2.14106216432074e-06, "loss": 0.0086, "step": 6000 }, { "epoch": 2.7302092811646954, "grad_norm": 0.6213890341501973, "learning_rate": 2.140354935153563e-06, "loss": 0.017, "step": 6001 }, { "epoch": 2.7306642402183803, "grad_norm": 0.5279980240419103, "learning_rate": 2.1396477353749564e-06, "loss": 0.0133, "step": 6002 }, { "epoch": 2.7311191992720656, "grad_norm": 0.24306990141182938, "learning_rate": 2.1389405650427083e-06, "loss": 0.0032, "step": 6003 }, { "epoch": 2.731574158325751, "grad_norm": 0.34958735729643187, "learning_rate": 2.138233424214608e-06, "loss": 0.0085, "step": 6004 }, { "epoch": 2.7320291173794358, "grad_norm": 0.34975240230524696, "learning_rate": 2.1375263129484385e-06, "loss": 0.0056, "step": 6005 }, { "epoch": 2.732484076433121, "grad_norm": 0.4132828491241603, "learning_rate": 2.1368192313019817e-06, "loss": 0.0069, "step": 6006 }, { "epoch": 2.7329390354868064, "grad_norm": 0.4917410106004111, "learning_rate": 2.136112179333017e-06, "loss": 0.0084, "step": 6007 }, { "epoch": 2.7333939945404913, "grad_norm": 0.4352452461332469, "learning_rate": 2.135405157099322e-06, "loss": 0.0091, "step": 6008 }, { "epoch": 2.7338489535941766, "grad_norm": 0.6199256366429143, "learning_rate": 2.134698164658671e-06, "loss": 0.014, "step": 6009 }, { "epoch": 2.734303912647862, "grad_norm": 0.4634095618969099, "learning_rate": 2.1339912020688353e-06, "loss": 0.0138, "step": 6010 }, { "epoch": 2.7347588717015467, "grad_norm": 0.4757908021338067, "learning_rate": 2.133284269387587e-06, "loss": 0.0131, "step": 6011 }, { "epoch": 2.735213830755232, "grad_norm": 0.4962265890821412, "learning_rate": 2.1325773666726915e-06, "loss": 0.0085, "step": 6012 }, { "epoch": 2.7356687898089174, "grad_norm": 0.5587688280208368, "learning_rate": 2.1318704939819148e-06, "loss": 0.0157, "step": 6013 }, { "epoch": 2.7361237488626022, "grad_norm": 0.5681633039685227, "learning_rate": 2.1311636513730185e-06, "loss": 0.0052, "step": 6014 }, { "epoch": 2.7365787079162875, "grad_norm": 0.28273469335337353, "learning_rate": 2.1304568389037635e-06, "loss": 0.0056, "step": 6015 }, { "epoch": 2.737033666969973, "grad_norm": 0.5380465784667147, "learning_rate": 2.129750056631906e-06, "loss": 0.0102, "step": 6016 }, { "epoch": 2.7374886260236577, "grad_norm": 0.5376999984395594, "learning_rate": 2.1290433046152015e-06, "loss": 0.0119, "step": 6017 }, { "epoch": 2.737943585077343, "grad_norm": 0.3282457083799179, "learning_rate": 2.128336582911404e-06, "loss": 0.0053, "step": 6018 }, { "epoch": 2.7383985441310283, "grad_norm": 0.6238474552662001, "learning_rate": 2.127629891578262e-06, "loss": 0.0162, "step": 6019 }, { "epoch": 2.738853503184713, "grad_norm": 0.6406669292989805, "learning_rate": 2.1269232306735243e-06, "loss": 0.0155, "step": 6020 }, { "epoch": 2.7393084622383985, "grad_norm": 0.3427879260061764, "learning_rate": 2.1262166002549346e-06, "loss": 0.0075, "step": 6021 }, { "epoch": 2.739763421292084, "grad_norm": 0.41802942510707364, "learning_rate": 2.125510000380237e-06, "loss": 0.0105, "step": 6022 }, { "epoch": 2.7402183803457687, "grad_norm": 0.5120547611156145, "learning_rate": 2.1248034311071702e-06, "loss": 0.0191, "step": 6023 }, { "epoch": 2.740673339399454, "grad_norm": 0.5999029712811278, "learning_rate": 2.1240968924934724e-06, "loss": 0.0147, "step": 6024 }, { "epoch": 2.7411282984531393, "grad_norm": 0.41032632412834363, "learning_rate": 2.12339038459688e-06, "loss": 0.0054, "step": 6025 }, { "epoch": 2.741583257506824, "grad_norm": 1.2658256352135888, "learning_rate": 2.1226839074751243e-06, "loss": 0.0544, "step": 6026 }, { "epoch": 2.7420382165605095, "grad_norm": 0.41242207292265454, "learning_rate": 2.121977461185936e-06, "loss": 0.011, "step": 6027 }, { "epoch": 2.742493175614195, "grad_norm": 0.6461508034792343, "learning_rate": 2.1212710457870416e-06, "loss": 0.0154, "step": 6028 }, { "epoch": 2.7429481346678797, "grad_norm": 0.5098808407321771, "learning_rate": 2.120564661336168e-06, "loss": 0.0107, "step": 6029 }, { "epoch": 2.743403093721565, "grad_norm": 0.9745913768331923, "learning_rate": 2.119858307891036e-06, "loss": 0.0313, "step": 6030 }, { "epoch": 2.7438580527752503, "grad_norm": 0.4687672266694487, "learning_rate": 2.119151985509366e-06, "loss": 0.0089, "step": 6031 }, { "epoch": 2.744313011828935, "grad_norm": 0.46911104745739385, "learning_rate": 2.118445694248877e-06, "loss": 0.0132, "step": 6032 }, { "epoch": 2.7447679708826205, "grad_norm": 0.6635312939387069, "learning_rate": 2.117739434167282e-06, "loss": 0.0196, "step": 6033 }, { "epoch": 2.745222929936306, "grad_norm": 0.38392962399059644, "learning_rate": 2.117033205322295e-06, "loss": 0.0057, "step": 6034 }, { "epoch": 2.7456778889899907, "grad_norm": 0.33919800871845696, "learning_rate": 2.1163270077716248e-06, "loss": 0.0046, "step": 6035 }, { "epoch": 2.746132848043676, "grad_norm": 0.5030968380788403, "learning_rate": 2.1156208415729786e-06, "loss": 0.02, "step": 6036 }, { "epoch": 2.7465878070973613, "grad_norm": 0.7371703130674221, "learning_rate": 2.1149147067840616e-06, "loss": 0.0128, "step": 6037 }, { "epoch": 2.747042766151046, "grad_norm": 0.46657595579604527, "learning_rate": 2.1142086034625744e-06, "loss": 0.015, "step": 6038 }, { "epoch": 2.7474977252047315, "grad_norm": 0.5167973672868387, "learning_rate": 2.113502531666219e-06, "loss": 0.0148, "step": 6039 }, { "epoch": 2.7479526842584168, "grad_norm": 0.4073254085120502, "learning_rate": 2.1127964914526917e-06, "loss": 0.0072, "step": 6040 }, { "epoch": 2.7484076433121016, "grad_norm": 0.49134289782264085, "learning_rate": 2.1120904828796857e-06, "loss": 0.009, "step": 6041 }, { "epoch": 2.748862602365787, "grad_norm": 0.2358330735466824, "learning_rate": 2.1113845060048943e-06, "loss": 0.0039, "step": 6042 }, { "epoch": 2.7493175614194723, "grad_norm": 0.30389748922602866, "learning_rate": 2.110678560886006e-06, "loss": 0.0088, "step": 6043 }, { "epoch": 2.7497725204731576, "grad_norm": 0.43815164190575295, "learning_rate": 2.1099726475807077e-06, "loss": 0.0133, "step": 6044 }, { "epoch": 2.7502274795268424, "grad_norm": 0.48619097769062264, "learning_rate": 2.109266766146682e-06, "loss": 0.0169, "step": 6045 }, { "epoch": 2.7506824385805277, "grad_norm": 0.6055768172080458, "learning_rate": 2.1085609166416128e-06, "loss": 0.0134, "step": 6046 }, { "epoch": 2.751137397634213, "grad_norm": 0.42245866185406833, "learning_rate": 2.1078550991231777e-06, "loss": 0.01, "step": 6047 }, { "epoch": 2.7515923566878984, "grad_norm": 0.46699012982875104, "learning_rate": 2.1071493136490527e-06, "loss": 0.0107, "step": 6048 }, { "epoch": 2.7520473157415832, "grad_norm": 0.5591235679250541, "learning_rate": 2.106443560276912e-06, "loss": 0.0138, "step": 6049 }, { "epoch": 2.7525022747952685, "grad_norm": 0.4135254057894369, "learning_rate": 2.1057378390644263e-06, "loss": 0.0077, "step": 6050 }, { "epoch": 2.752957233848954, "grad_norm": 0.5529694326350861, "learning_rate": 2.1050321500692637e-06, "loss": 0.0135, "step": 6051 }, { "epoch": 2.7534121929026387, "grad_norm": 0.38732301408075503, "learning_rate": 2.1043264933490896e-06, "loss": 0.0056, "step": 6052 }, { "epoch": 2.753867151956324, "grad_norm": 0.369307269104194, "learning_rate": 2.1036208689615683e-06, "loss": 0.008, "step": 6053 }, { "epoch": 2.7543221110100093, "grad_norm": 0.2503803887333824, "learning_rate": 2.1029152769643595e-06, "loss": 0.0045, "step": 6054 }, { "epoch": 2.754777070063694, "grad_norm": 0.3098479303306389, "learning_rate": 2.102209717415121e-06, "loss": 0.0043, "step": 6055 }, { "epoch": 2.7552320291173795, "grad_norm": 0.4869238511987804, "learning_rate": 2.1015041903715085e-06, "loss": 0.0258, "step": 6056 }, { "epoch": 2.755686988171065, "grad_norm": 0.3465924072632591, "learning_rate": 2.100798695891173e-06, "loss": 0.0069, "step": 6057 }, { "epoch": 2.7561419472247497, "grad_norm": 0.3514611307916903, "learning_rate": 2.100093234031766e-06, "loss": 0.0059, "step": 6058 }, { "epoch": 2.756596906278435, "grad_norm": 0.5308275032446219, "learning_rate": 2.099387804850933e-06, "loss": 0.0117, "step": 6059 }, { "epoch": 2.7570518653321203, "grad_norm": 0.2699828416020186, "learning_rate": 2.09868240840632e-06, "loss": 0.0044, "step": 6060 }, { "epoch": 2.757506824385805, "grad_norm": 0.5182039241604249, "learning_rate": 2.097977044755569e-06, "loss": 0.0121, "step": 6061 }, { "epoch": 2.7579617834394905, "grad_norm": 0.46937368368853577, "learning_rate": 2.0972717139563176e-06, "loss": 0.0113, "step": 6062 }, { "epoch": 2.758416742493176, "grad_norm": 0.7196407334028793, "learning_rate": 2.0965664160662038e-06, "loss": 0.014, "step": 6063 }, { "epoch": 2.7588717015468607, "grad_norm": 0.4577115687652308, "learning_rate": 2.0958611511428596e-06, "loss": 0.0077, "step": 6064 }, { "epoch": 2.759326660600546, "grad_norm": 0.32096089545879763, "learning_rate": 2.0951559192439177e-06, "loss": 0.0072, "step": 6065 }, { "epoch": 2.7597816196542313, "grad_norm": 0.35551300593990476, "learning_rate": 2.094450720427005e-06, "loss": 0.0093, "step": 6066 }, { "epoch": 2.760236578707916, "grad_norm": 0.4546341604202345, "learning_rate": 2.093745554749748e-06, "loss": 0.0116, "step": 6067 }, { "epoch": 2.7606915377616015, "grad_norm": 0.6597314435511948, "learning_rate": 2.0930404222697707e-06, "loss": 0.0172, "step": 6068 }, { "epoch": 2.761146496815287, "grad_norm": 0.6667008303409409, "learning_rate": 2.0923353230446918e-06, "loss": 0.0177, "step": 6069 }, { "epoch": 2.7616014558689717, "grad_norm": 0.27833631770115896, "learning_rate": 2.0916302571321295e-06, "loss": 0.0051, "step": 6070 }, { "epoch": 2.762056414922657, "grad_norm": 0.5096377796094507, "learning_rate": 2.0909252245896986e-06, "loss": 0.0106, "step": 6071 }, { "epoch": 2.7625113739763423, "grad_norm": 0.3292914957047552, "learning_rate": 2.0902202254750105e-06, "loss": 0.0067, "step": 6072 }, { "epoch": 2.762966333030027, "grad_norm": 0.37033498748963206, "learning_rate": 2.0895152598456744e-06, "loss": 0.0083, "step": 6073 }, { "epoch": 2.7634212920837125, "grad_norm": 0.32828860053812897, "learning_rate": 2.0888103277592982e-06, "loss": 0.0075, "step": 6074 }, { "epoch": 2.7638762511373978, "grad_norm": 0.4470814200087827, "learning_rate": 2.088105429273485e-06, "loss": 0.0078, "step": 6075 }, { "epoch": 2.7643312101910826, "grad_norm": 0.43388364155673637, "learning_rate": 2.0874005644458367e-06, "loss": 0.0106, "step": 6076 }, { "epoch": 2.764786169244768, "grad_norm": 0.4351256754930606, "learning_rate": 2.0866957333339503e-06, "loss": 0.0086, "step": 6077 }, { "epoch": 2.7652411282984533, "grad_norm": 0.42362496383243103, "learning_rate": 2.0859909359954224e-06, "loss": 0.0091, "step": 6078 }, { "epoch": 2.765696087352138, "grad_norm": 0.4949511440897384, "learning_rate": 2.0852861724878452e-06, "loss": 0.0077, "step": 6079 }, { "epoch": 2.7661510464058234, "grad_norm": 0.5551085377041635, "learning_rate": 2.0845814428688086e-06, "loss": 0.0092, "step": 6080 }, { "epoch": 2.7666060054595087, "grad_norm": 0.43193367744263694, "learning_rate": 2.0838767471959015e-06, "loss": 0.0077, "step": 6081 }, { "epoch": 2.7670609645131936, "grad_norm": 0.5567365001930993, "learning_rate": 2.083172085526707e-06, "loss": 0.0175, "step": 6082 }, { "epoch": 2.767515923566879, "grad_norm": 0.46699494675694325, "learning_rate": 2.082467457918808e-06, "loss": 0.0099, "step": 6083 }, { "epoch": 2.7679708826205642, "grad_norm": 0.3674991660914695, "learning_rate": 2.081762864429782e-06, "loss": 0.0066, "step": 6084 }, { "epoch": 2.768425841674249, "grad_norm": 0.6644779960172993, "learning_rate": 2.0810583051172066e-06, "loss": 0.0169, "step": 6085 }, { "epoch": 2.7688808007279344, "grad_norm": 0.4482064293541443, "learning_rate": 2.080353780038654e-06, "loss": 0.009, "step": 6086 }, { "epoch": 2.7693357597816197, "grad_norm": 0.43297251378557744, "learning_rate": 2.079649289251695e-06, "loss": 0.0088, "step": 6087 }, { "epoch": 2.7697907188353046, "grad_norm": 0.42503860830259144, "learning_rate": 2.0789448328138984e-06, "loss": 0.0134, "step": 6088 }, { "epoch": 2.77024567788899, "grad_norm": 0.382561287231165, "learning_rate": 2.0782404107828284e-06, "loss": 0.0089, "step": 6089 }, { "epoch": 2.770700636942675, "grad_norm": 0.5623909640863534, "learning_rate": 2.077536023216048e-06, "loss": 0.0129, "step": 6090 }, { "epoch": 2.77115559599636, "grad_norm": 0.7057481009556903, "learning_rate": 2.076831670171115e-06, "loss": 0.0061, "step": 6091 }, { "epoch": 2.7716105550500454, "grad_norm": 0.6852534211994831, "learning_rate": 2.076127351705588e-06, "loss": 0.0171, "step": 6092 }, { "epoch": 2.7720655141037307, "grad_norm": 0.5013106472294399, "learning_rate": 2.0754230678770188e-06, "loss": 0.0085, "step": 6093 }, { "epoch": 2.7725204731574156, "grad_norm": 0.326627246055182, "learning_rate": 2.0747188187429585e-06, "loss": 0.0044, "step": 6094 }, { "epoch": 2.772975432211101, "grad_norm": 0.3761167771854859, "learning_rate": 2.074014604360957e-06, "loss": 0.0102, "step": 6095 }, { "epoch": 2.773430391264786, "grad_norm": 0.4288129655968693, "learning_rate": 2.073310424788558e-06, "loss": 0.0094, "step": 6096 }, { "epoch": 2.7738853503184715, "grad_norm": 0.48193493668371185, "learning_rate": 2.072606280083304e-06, "loss": 0.0125, "step": 6097 }, { "epoch": 2.7743403093721564, "grad_norm": 0.5526169989341705, "learning_rate": 2.071902170302735e-06, "loss": 0.0078, "step": 6098 }, { "epoch": 2.7747952684258417, "grad_norm": 0.526425769388352, "learning_rate": 2.0711980955043875e-06, "loss": 0.0132, "step": 6099 }, { "epoch": 2.775250227479527, "grad_norm": 0.613401929462698, "learning_rate": 2.0704940557457948e-06, "loss": 0.0153, "step": 6100 }, { "epoch": 2.775705186533212, "grad_norm": 0.505904974450474, "learning_rate": 2.0697900510844874e-06, "loss": 0.0111, "step": 6101 }, { "epoch": 2.776160145586897, "grad_norm": 0.5938986468479408, "learning_rate": 2.0690860815779954e-06, "loss": 0.015, "step": 6102 }, { "epoch": 2.7766151046405825, "grad_norm": 0.3948963481598596, "learning_rate": 2.068382147283842e-06, "loss": 0.0076, "step": 6103 }, { "epoch": 2.777070063694268, "grad_norm": 0.5082427702563836, "learning_rate": 2.0676782482595516e-06, "loss": 0.013, "step": 6104 }, { "epoch": 2.7775250227479527, "grad_norm": 0.5558015738276584, "learning_rate": 2.0669743845626417e-06, "loss": 0.0133, "step": 6105 }, { "epoch": 2.777979981801638, "grad_norm": 0.34695898351933174, "learning_rate": 2.0662705562506298e-06, "loss": 0.0077, "step": 6106 }, { "epoch": 2.7784349408553233, "grad_norm": 0.6876566651712398, "learning_rate": 2.0655667633810293e-06, "loss": 0.0216, "step": 6107 }, { "epoch": 2.778889899909008, "grad_norm": 0.6235359445212062, "learning_rate": 2.0648630060113496e-06, "loss": 0.0099, "step": 6108 }, { "epoch": 2.7793448589626935, "grad_norm": 0.32270879401206465, "learning_rate": 2.0641592841991016e-06, "loss": 0.0047, "step": 6109 }, { "epoch": 2.7797998180163788, "grad_norm": 0.4270213179242976, "learning_rate": 2.0634555980017884e-06, "loss": 0.0042, "step": 6110 }, { "epoch": 2.7802547770700636, "grad_norm": 0.5606661678310367, "learning_rate": 2.062751947476912e-06, "loss": 0.0126, "step": 6111 }, { "epoch": 2.780709736123749, "grad_norm": 0.5158701372047533, "learning_rate": 2.0620483326819724e-06, "loss": 0.0098, "step": 6112 }, { "epoch": 2.7811646951774343, "grad_norm": 0.45199848375370405, "learning_rate": 2.0613447536744645e-06, "loss": 0.0098, "step": 6113 }, { "epoch": 2.781619654231119, "grad_norm": 0.4905650301333676, "learning_rate": 2.060641210511883e-06, "loss": 0.0096, "step": 6114 }, { "epoch": 2.7820746132848044, "grad_norm": 0.370371441947716, "learning_rate": 2.059937703251717e-06, "loss": 0.0066, "step": 6115 }, { "epoch": 2.7825295723384897, "grad_norm": 0.5004523693460862, "learning_rate": 2.059234231951455e-06, "loss": 0.0116, "step": 6116 }, { "epoch": 2.7829845313921746, "grad_norm": 0.38912340069455315, "learning_rate": 2.0585307966685815e-06, "loss": 0.0119, "step": 6117 }, { "epoch": 2.78343949044586, "grad_norm": 0.3901464279570206, "learning_rate": 2.057827397460577e-06, "loss": 0.0117, "step": 6118 }, { "epoch": 2.7838944494995452, "grad_norm": 0.37246796594743414, "learning_rate": 2.0571240343849213e-06, "loss": 0.0074, "step": 6119 }, { "epoch": 2.78434940855323, "grad_norm": 0.29331026879333977, "learning_rate": 2.056420707499089e-06, "loss": 0.0048, "step": 6120 }, { "epoch": 2.7848043676069154, "grad_norm": 0.6306282420418781, "learning_rate": 2.055717416860554e-06, "loss": 0.0066, "step": 6121 }, { "epoch": 2.7852593266606007, "grad_norm": 0.5865205647455822, "learning_rate": 2.055014162526784e-06, "loss": 0.0128, "step": 6122 }, { "epoch": 2.7857142857142856, "grad_norm": 0.6514069599710616, "learning_rate": 2.0543109445552477e-06, "loss": 0.0175, "step": 6123 }, { "epoch": 2.786169244767971, "grad_norm": 0.42537739671344804, "learning_rate": 2.053607763003409e-06, "loss": 0.007, "step": 6124 }, { "epoch": 2.786624203821656, "grad_norm": 0.4869658792978083, "learning_rate": 2.052904617928727e-06, "loss": 0.01, "step": 6125 }, { "epoch": 2.787079162875341, "grad_norm": 0.6640673078827362, "learning_rate": 2.0522015093886614e-06, "loss": 0.0165, "step": 6126 }, { "epoch": 2.7875341219290264, "grad_norm": 0.5683132261595307, "learning_rate": 2.051498437440666e-06, "loss": 0.0132, "step": 6127 }, { "epoch": 2.7879890809827117, "grad_norm": 0.4383029696665301, "learning_rate": 2.050795402142193e-06, "loss": 0.0062, "step": 6128 }, { "epoch": 2.7884440400363966, "grad_norm": 0.5694437306996063, "learning_rate": 2.05009240355069e-06, "loss": 0.0095, "step": 6129 }, { "epoch": 2.788898999090082, "grad_norm": 0.42598461828370054, "learning_rate": 2.049389441723605e-06, "loss": 0.0083, "step": 6130 }, { "epoch": 2.789353958143767, "grad_norm": 0.39202420649789055, "learning_rate": 2.04868651671838e-06, "loss": 0.0058, "step": 6131 }, { "epoch": 2.789808917197452, "grad_norm": 0.5890132382792869, "learning_rate": 2.0479836285924543e-06, "loss": 0.0116, "step": 6132 }, { "epoch": 2.7902638762511374, "grad_norm": 0.6284864530063939, "learning_rate": 2.047280777403266e-06, "loss": 0.0183, "step": 6133 }, { "epoch": 2.7907188353048227, "grad_norm": 0.5134600574957057, "learning_rate": 2.046577963208247e-06, "loss": 0.0097, "step": 6134 }, { "epoch": 2.7911737943585075, "grad_norm": 0.5100895784210197, "learning_rate": 2.0458751860648304e-06, "loss": 0.0165, "step": 6135 }, { "epoch": 2.791628753412193, "grad_norm": 0.593987582813978, "learning_rate": 2.0451724460304416e-06, "loss": 0.0155, "step": 6136 }, { "epoch": 2.792083712465878, "grad_norm": 0.3811713700754966, "learning_rate": 2.0444697431625068e-06, "loss": 0.0097, "step": 6137 }, { "epoch": 2.792538671519563, "grad_norm": 0.27884416388414135, "learning_rate": 2.043767077518448e-06, "loss": 0.0051, "step": 6138 }, { "epoch": 2.7929936305732483, "grad_norm": 0.5520298806722808, "learning_rate": 2.0430644491556826e-06, "loss": 0.0217, "step": 6139 }, { "epoch": 2.7934485896269337, "grad_norm": 0.3519288184906698, "learning_rate": 2.0423618581316277e-06, "loss": 0.0064, "step": 6140 }, { "epoch": 2.7939035486806185, "grad_norm": 0.44245710522998716, "learning_rate": 2.041659304503695e-06, "loss": 0.0078, "step": 6141 }, { "epoch": 2.794358507734304, "grad_norm": 0.41653851254012625, "learning_rate": 2.0409567883292938e-06, "loss": 0.0067, "step": 6142 }, { "epoch": 2.794813466787989, "grad_norm": 0.36352820625469356, "learning_rate": 2.04025430966583e-06, "loss": 0.0116, "step": 6143 }, { "epoch": 2.795268425841674, "grad_norm": 0.4969985133264131, "learning_rate": 2.0395518685707086e-06, "loss": 0.0108, "step": 6144 }, { "epoch": 2.7957233848953593, "grad_norm": 0.42712827665182185, "learning_rate": 2.0388494651013293e-06, "loss": 0.0095, "step": 6145 }, { "epoch": 2.7961783439490446, "grad_norm": 0.4466664687919501, "learning_rate": 2.0381470993150894e-06, "loss": 0.0132, "step": 6146 }, { "epoch": 2.7966333030027295, "grad_norm": 0.396258373820242, "learning_rate": 2.0374447712693824e-06, "loss": 0.0088, "step": 6147 }, { "epoch": 2.797088262056415, "grad_norm": 0.5139175816265751, "learning_rate": 2.0367424810216004e-06, "loss": 0.0126, "step": 6148 }, { "epoch": 2.7975432211101, "grad_norm": 0.550915259955454, "learning_rate": 2.0360402286291302e-06, "loss": 0.0111, "step": 6149 }, { "epoch": 2.797998180163785, "grad_norm": 0.4006291874280076, "learning_rate": 2.0353380141493563e-06, "loss": 0.0135, "step": 6150 }, { "epoch": 2.7984531392174703, "grad_norm": 0.34337609332105967, "learning_rate": 2.034635837639663e-06, "loss": 0.0042, "step": 6151 }, { "epoch": 2.7989080982711556, "grad_norm": 0.7468791342053808, "learning_rate": 2.0339336991574267e-06, "loss": 0.0195, "step": 6152 }, { "epoch": 2.799363057324841, "grad_norm": 0.34533241355917377, "learning_rate": 2.033231598760025e-06, "loss": 0.0065, "step": 6153 }, { "epoch": 2.799818016378526, "grad_norm": 0.26262276824893843, "learning_rate": 2.032529536504828e-06, "loss": 0.0026, "step": 6154 }, { "epoch": 2.800272975432211, "grad_norm": 0.47201923576867155, "learning_rate": 2.0318275124492066e-06, "loss": 0.0092, "step": 6155 }, { "epoch": 2.8007279344858964, "grad_norm": 0.39452584930656537, "learning_rate": 2.0311255266505264e-06, "loss": 0.0072, "step": 6156 }, { "epoch": 2.8011828935395813, "grad_norm": 0.41437303381203167, "learning_rate": 2.03042357916615e-06, "loss": 0.0061, "step": 6157 }, { "epoch": 2.8016378525932666, "grad_norm": 0.5926029083494392, "learning_rate": 2.0297216700534396e-06, "loss": 0.0158, "step": 6158 }, { "epoch": 2.802092811646952, "grad_norm": 0.5207124770943852, "learning_rate": 2.0290197993697493e-06, "loss": 0.0118, "step": 6159 }, { "epoch": 2.802547770700637, "grad_norm": 0.8628976107676769, "learning_rate": 2.028317967172435e-06, "loss": 0.01, "step": 6160 }, { "epoch": 2.803002729754322, "grad_norm": 0.39653076978169216, "learning_rate": 2.0276161735188458e-06, "loss": 0.0094, "step": 6161 }, { "epoch": 2.8034576888080074, "grad_norm": 0.6282014214353135, "learning_rate": 2.02691441846633e-06, "loss": 0.0149, "step": 6162 }, { "epoch": 2.8039126478616927, "grad_norm": 0.3721762689956131, "learning_rate": 2.0262127020722315e-06, "loss": 0.0062, "step": 6163 }, { "epoch": 2.8043676069153776, "grad_norm": 0.3668757236165309, "learning_rate": 2.0255110243938903e-06, "loss": 0.007, "step": 6164 }, { "epoch": 2.804822565969063, "grad_norm": 1.0063811414235166, "learning_rate": 2.024809385488647e-06, "loss": 0.0237, "step": 6165 }, { "epoch": 2.805277525022748, "grad_norm": 0.22973763058555613, "learning_rate": 2.024107785413834e-06, "loss": 0.0026, "step": 6166 }, { "epoch": 2.805732484076433, "grad_norm": 0.389672615074809, "learning_rate": 2.023406224226784e-06, "loss": 0.0102, "step": 6167 }, { "epoch": 2.8061874431301184, "grad_norm": 0.42151250108279864, "learning_rate": 2.0227047019848246e-06, "loss": 0.0059, "step": 6168 }, { "epoch": 2.8066424021838037, "grad_norm": 0.3258181391553404, "learning_rate": 2.022003218745282e-06, "loss": 0.0047, "step": 6169 }, { "epoch": 2.8070973612374885, "grad_norm": 0.6042383841328871, "learning_rate": 2.0213017745654774e-06, "loss": 0.0132, "step": 6170 }, { "epoch": 2.807552320291174, "grad_norm": 0.3797795872263063, "learning_rate": 2.0206003695027294e-06, "loss": 0.0037, "step": 6171 }, { "epoch": 2.808007279344859, "grad_norm": 0.43286956760089235, "learning_rate": 2.0198990036143553e-06, "loss": 0.0128, "step": 6172 }, { "epoch": 2.808462238398544, "grad_norm": 0.3571222142947154, "learning_rate": 2.019197676957666e-06, "loss": 0.0098, "step": 6173 }, { "epoch": 2.8089171974522293, "grad_norm": 0.5191875341365563, "learning_rate": 2.018496389589972e-06, "loss": 0.0117, "step": 6174 }, { "epoch": 2.8093721565059147, "grad_norm": 0.731053092037357, "learning_rate": 2.0177951415685777e-06, "loss": 0.0083, "step": 6175 }, { "epoch": 2.8098271155595995, "grad_norm": 0.2775606818289063, "learning_rate": 2.017093932950788e-06, "loss": 0.0053, "step": 6176 }, { "epoch": 2.810282074613285, "grad_norm": 0.5999542891615807, "learning_rate": 2.0163927637939002e-06, "loss": 0.0183, "step": 6177 }, { "epoch": 2.81073703366697, "grad_norm": 0.43947780729758384, "learning_rate": 2.015691634155211e-06, "loss": 0.006, "step": 6178 }, { "epoch": 2.811191992720655, "grad_norm": 0.2652687141629533, "learning_rate": 2.0149905440920155e-06, "loss": 0.0054, "step": 6179 }, { "epoch": 2.8116469517743403, "grad_norm": 0.5180120441808519, "learning_rate": 2.014289493661603e-06, "loss": 0.0145, "step": 6180 }, { "epoch": 2.8121019108280256, "grad_norm": 0.39912691414518003, "learning_rate": 2.013588482921259e-06, "loss": 0.0085, "step": 6181 }, { "epoch": 2.8125568698817105, "grad_norm": 0.32605296410592566, "learning_rate": 2.0128875119282676e-06, "loss": 0.0071, "step": 6182 }, { "epoch": 2.813011828935396, "grad_norm": 0.39527765769248147, "learning_rate": 2.0121865807399087e-06, "loss": 0.0066, "step": 6183 }, { "epoch": 2.813466787989081, "grad_norm": 0.50431767360339, "learning_rate": 2.01148568941346e-06, "loss": 0.0112, "step": 6184 }, { "epoch": 2.813921747042766, "grad_norm": 0.38382163520790785, "learning_rate": 2.0107848380061932e-06, "loss": 0.006, "step": 6185 }, { "epoch": 2.8143767060964513, "grad_norm": 1.3542495933140037, "learning_rate": 2.0100840265753813e-06, "loss": 0.0153, "step": 6186 }, { "epoch": 2.8148316651501366, "grad_norm": 1.011463688713977, "learning_rate": 2.009383255178291e-06, "loss": 0.029, "step": 6187 }, { "epoch": 2.8152866242038215, "grad_norm": 0.57237356794186, "learning_rate": 2.008682523872184e-06, "loss": 0.014, "step": 6188 }, { "epoch": 2.815741583257507, "grad_norm": 0.38628089949629674, "learning_rate": 2.0079818327143235e-06, "loss": 0.0085, "step": 6189 }, { "epoch": 2.816196542311192, "grad_norm": 0.3125261446776096, "learning_rate": 2.0072811817619655e-06, "loss": 0.0045, "step": 6190 }, { "epoch": 2.816651501364877, "grad_norm": 0.38545742902002644, "learning_rate": 2.0065805710723645e-06, "loss": 0.0088, "step": 6191 }, { "epoch": 2.8171064604185623, "grad_norm": 0.7527036775578471, "learning_rate": 2.0058800007027697e-06, "loss": 0.0196, "step": 6192 }, { "epoch": 2.8175614194722476, "grad_norm": 0.48721981786831675, "learning_rate": 2.0051794707104304e-06, "loss": 0.011, "step": 6193 }, { "epoch": 2.8180163785259325, "grad_norm": 0.2908136187781927, "learning_rate": 2.0044789811525904e-06, "loss": 0.004, "step": 6194 }, { "epoch": 2.8184713375796178, "grad_norm": 0.6833928921392757, "learning_rate": 2.0037785320864904e-06, "loss": 0.0184, "step": 6195 }, { "epoch": 2.818926296633303, "grad_norm": 0.6608014136228015, "learning_rate": 2.0030781235693682e-06, "loss": 0.0158, "step": 6196 }, { "epoch": 2.819381255686988, "grad_norm": 0.6237911493257874, "learning_rate": 2.0023777556584567e-06, "loss": 0.0121, "step": 6197 }, { "epoch": 2.8198362147406733, "grad_norm": 0.3774044692147736, "learning_rate": 2.001677428410989e-06, "loss": 0.0069, "step": 6198 }, { "epoch": 2.8202911737943586, "grad_norm": 0.3615968015532065, "learning_rate": 2.0009771418841897e-06, "loss": 0.008, "step": 6199 }, { "epoch": 2.8207461328480434, "grad_norm": 0.5712158977270859, "learning_rate": 2.0002768961352858e-06, "loss": 0.0186, "step": 6200 }, { "epoch": 2.8212010919017287, "grad_norm": 0.31413334273667126, "learning_rate": 1.9995766912214976e-06, "loss": 0.0083, "step": 6201 }, { "epoch": 2.821656050955414, "grad_norm": 0.5769957303684906, "learning_rate": 1.9988765272000414e-06, "loss": 0.0201, "step": 6202 }, { "epoch": 2.822111010009099, "grad_norm": 0.4043815971994987, "learning_rate": 1.9981764041281334e-06, "loss": 0.0069, "step": 6203 }, { "epoch": 2.8225659690627842, "grad_norm": 0.8871803178715486, "learning_rate": 1.9974763220629826e-06, "loss": 0.0282, "step": 6204 }, { "epoch": 2.8230209281164695, "grad_norm": 0.4072104939861542, "learning_rate": 1.996776281061798e-06, "loss": 0.0056, "step": 6205 }, { "epoch": 2.823475887170155, "grad_norm": 0.5392102347322841, "learning_rate": 1.9960762811817822e-06, "loss": 0.0195, "step": 6206 }, { "epoch": 2.8239308462238397, "grad_norm": 0.6598783209389738, "learning_rate": 1.9953763224801375e-06, "loss": 0.0089, "step": 6207 }, { "epoch": 2.824385805277525, "grad_norm": 0.38749901714451296, "learning_rate": 1.9946764050140616e-06, "loss": 0.0086, "step": 6208 }, { "epoch": 2.8248407643312103, "grad_norm": 0.4744959750222511, "learning_rate": 1.993976528840747e-06, "loss": 0.0113, "step": 6209 }, { "epoch": 2.825295723384895, "grad_norm": 0.4502427360020071, "learning_rate": 1.993276694017386e-06, "loss": 0.0107, "step": 6210 }, { "epoch": 2.8257506824385805, "grad_norm": 0.5995705951095607, "learning_rate": 1.9925769006011645e-06, "loss": 0.0095, "step": 6211 }, { "epoch": 2.826205641492266, "grad_norm": 0.5084763281276701, "learning_rate": 1.991877148649268e-06, "loss": 0.0121, "step": 6212 }, { "epoch": 2.826660600545951, "grad_norm": 0.44367051169974114, "learning_rate": 1.991177438218875e-06, "loss": 0.0062, "step": 6213 }, { "epoch": 2.827115559599636, "grad_norm": 0.40238108933877587, "learning_rate": 1.9904777693671646e-06, "loss": 0.0071, "step": 6214 }, { "epoch": 2.8275705186533213, "grad_norm": 0.4547748210764773, "learning_rate": 1.9897781421513103e-06, "loss": 0.009, "step": 6215 }, { "epoch": 2.8280254777070066, "grad_norm": 0.4870860621058494, "learning_rate": 1.9890785566284822e-06, "loss": 0.01, "step": 6216 }, { "epoch": 2.8284804367606915, "grad_norm": 0.41363484566926595, "learning_rate": 1.9883790128558463e-06, "loss": 0.0116, "step": 6217 }, { "epoch": 2.828935395814377, "grad_norm": 0.29501044400992693, "learning_rate": 1.987679510890568e-06, "loss": 0.0078, "step": 6218 }, { "epoch": 2.829390354868062, "grad_norm": 0.9425392037606719, "learning_rate": 1.9869800507898053e-06, "loss": 0.0334, "step": 6219 }, { "epoch": 2.829845313921747, "grad_norm": 0.5713950153064172, "learning_rate": 1.9862806326107162e-06, "loss": 0.011, "step": 6220 }, { "epoch": 2.8303002729754323, "grad_norm": 0.6533103069937816, "learning_rate": 1.9855812564104547e-06, "loss": 0.0257, "step": 6221 }, { "epoch": 2.8307552320291176, "grad_norm": 0.26699617827482797, "learning_rate": 1.984881922246169e-06, "loss": 0.0035, "step": 6222 }, { "epoch": 2.8312101910828025, "grad_norm": 0.4077655889203086, "learning_rate": 1.984182630175007e-06, "loss": 0.009, "step": 6223 }, { "epoch": 2.831665150136488, "grad_norm": 0.434774134719716, "learning_rate": 1.9834833802541107e-06, "loss": 0.0107, "step": 6224 }, { "epoch": 2.832120109190173, "grad_norm": 0.3916293666967668, "learning_rate": 1.982784172540621e-06, "loss": 0.0094, "step": 6225 }, { "epoch": 2.832575068243858, "grad_norm": 0.3553583761060399, "learning_rate": 1.982085007091672e-06, "loss": 0.0087, "step": 6226 }, { "epoch": 2.8330300272975433, "grad_norm": 0.4907426856870724, "learning_rate": 1.9813858839643965e-06, "loss": 0.0107, "step": 6227 }, { "epoch": 2.8334849863512286, "grad_norm": 0.33426602853926185, "learning_rate": 1.980686803215926e-06, "loss": 0.0059, "step": 6228 }, { "epoch": 2.8339399454049135, "grad_norm": 0.3363514828785949, "learning_rate": 1.9799877649033837e-06, "loss": 0.0062, "step": 6229 }, { "epoch": 2.8343949044585988, "grad_norm": 0.4115771081104543, "learning_rate": 1.979288769083894e-06, "loss": 0.0072, "step": 6230 }, { "epoch": 2.834849863512284, "grad_norm": 0.2626000930522339, "learning_rate": 1.978589815814574e-06, "loss": 0.0066, "step": 6231 }, { "epoch": 2.835304822565969, "grad_norm": 0.700684096516197, "learning_rate": 1.9778909051525397e-06, "loss": 0.0099, "step": 6232 }, { "epoch": 2.8357597816196543, "grad_norm": 0.4235747930441633, "learning_rate": 1.9771920371549025e-06, "loss": 0.0083, "step": 6233 }, { "epoch": 2.8362147406733396, "grad_norm": 0.5752691376460682, "learning_rate": 1.9764932118787705e-06, "loss": 0.0097, "step": 6234 }, { "epoch": 2.8366696997270244, "grad_norm": 0.4769092723651919, "learning_rate": 1.97579442938125e-06, "loss": 0.0071, "step": 6235 }, { "epoch": 2.8371246587807097, "grad_norm": 0.4503175840653732, "learning_rate": 1.9750956897194413e-06, "loss": 0.0174, "step": 6236 }, { "epoch": 2.837579617834395, "grad_norm": 0.26394173884211025, "learning_rate": 1.9743969929504427e-06, "loss": 0.0036, "step": 6237 }, { "epoch": 2.83803457688808, "grad_norm": 0.4232802269739976, "learning_rate": 1.973698339131348e-06, "loss": 0.0131, "step": 6238 }, { "epoch": 2.8384895359417652, "grad_norm": 0.3480626882184171, "learning_rate": 1.9729997283192483e-06, "loss": 0.0048, "step": 6239 }, { "epoch": 2.8389444949954505, "grad_norm": 0.38004875592293275, "learning_rate": 1.9723011605712307e-06, "loss": 0.0064, "step": 6240 }, { "epoch": 2.8393994540491354, "grad_norm": 0.5384600923743769, "learning_rate": 1.9716026359443784e-06, "loss": 0.0176, "step": 6241 }, { "epoch": 2.8398544131028207, "grad_norm": 0.4401102761332914, "learning_rate": 1.970904154495774e-06, "loss": 0.0082, "step": 6242 }, { "epoch": 2.840309372156506, "grad_norm": 0.6266178789863744, "learning_rate": 1.9702057162824916e-06, "loss": 0.02, "step": 6243 }, { "epoch": 2.840764331210191, "grad_norm": 0.4473982397473027, "learning_rate": 1.9695073213616066e-06, "loss": 0.0098, "step": 6244 }, { "epoch": 2.841219290263876, "grad_norm": 0.38630630956817413, "learning_rate": 1.968808969790187e-06, "loss": 0.0119, "step": 6245 }, { "epoch": 2.8416742493175615, "grad_norm": 0.6706445058860871, "learning_rate": 1.9681106616252995e-06, "loss": 0.0202, "step": 6246 }, { "epoch": 2.8421292083712464, "grad_norm": 0.5711397991095105, "learning_rate": 1.9674123969240067e-06, "loss": 0.0181, "step": 6247 }, { "epoch": 2.8425841674249317, "grad_norm": 0.4158674804600919, "learning_rate": 1.9667141757433667e-06, "loss": 0.0078, "step": 6248 }, { "epoch": 2.843039126478617, "grad_norm": 0.5947589660845274, "learning_rate": 1.9660159981404373e-06, "loss": 0.0136, "step": 6249 }, { "epoch": 2.843494085532302, "grad_norm": 0.4351110531244067, "learning_rate": 1.9653178641722688e-06, "loss": 0.0091, "step": 6250 }, { "epoch": 2.843949044585987, "grad_norm": 0.33019008844976155, "learning_rate": 1.9646197738959104e-06, "loss": 0.0077, "step": 6251 }, { "epoch": 2.8444040036396725, "grad_norm": 0.34668549640231794, "learning_rate": 1.963921727368406e-06, "loss": 0.0072, "step": 6252 }, { "epoch": 2.8448589626933574, "grad_norm": 0.37812987680383103, "learning_rate": 1.9632237246467967e-06, "loss": 0.0098, "step": 6253 }, { "epoch": 2.8453139217470427, "grad_norm": 0.37801171621969004, "learning_rate": 1.962525765788121e-06, "loss": 0.0074, "step": 6254 }, { "epoch": 2.845768880800728, "grad_norm": 0.46080816020608806, "learning_rate": 1.9618278508494114e-06, "loss": 0.0093, "step": 6255 }, { "epoch": 2.846223839854413, "grad_norm": 0.32532399295983233, "learning_rate": 1.9611299798877004e-06, "loss": 0.0058, "step": 6256 }, { "epoch": 2.846678798908098, "grad_norm": 0.4207514132512303, "learning_rate": 1.960432152960014e-06, "loss": 0.0061, "step": 6257 }, { "epoch": 2.8471337579617835, "grad_norm": 0.38357425530480327, "learning_rate": 1.9597343701233754e-06, "loss": 0.0118, "step": 6258 }, { "epoch": 2.8475887170154683, "grad_norm": 0.34909149335857903, "learning_rate": 1.9590366314348043e-06, "loss": 0.0069, "step": 6259 }, { "epoch": 2.8480436760691537, "grad_norm": 0.4547785436232463, "learning_rate": 1.9583389369513164e-06, "loss": 0.0076, "step": 6260 }, { "epoch": 2.848498635122839, "grad_norm": 0.5593886039728666, "learning_rate": 1.957641286729925e-06, "loss": 0.0143, "step": 6261 }, { "epoch": 2.8489535941765243, "grad_norm": 0.42660735423359597, "learning_rate": 1.956943680827637e-06, "loss": 0.0074, "step": 6262 }, { "epoch": 2.849408553230209, "grad_norm": 0.35536064031705633, "learning_rate": 1.95624611930146e-06, "loss": 0.0063, "step": 6263 }, { "epoch": 2.8498635122838945, "grad_norm": 0.7938815559561992, "learning_rate": 1.9555486022083947e-06, "loss": 0.0206, "step": 6264 }, { "epoch": 2.8503184713375798, "grad_norm": 0.47152550322163383, "learning_rate": 1.9548511296054386e-06, "loss": 0.0154, "step": 6265 }, { "epoch": 2.8507734303912646, "grad_norm": 0.5998751610256071, "learning_rate": 1.9541537015495867e-06, "loss": 0.0122, "step": 6266 }, { "epoch": 2.85122838944495, "grad_norm": 0.45987387691121356, "learning_rate": 1.953456318097829e-06, "loss": 0.009, "step": 6267 }, { "epoch": 2.8516833484986353, "grad_norm": 0.4319641334056434, "learning_rate": 1.952758979307153e-06, "loss": 0.0074, "step": 6268 }, { "epoch": 2.8521383075523206, "grad_norm": 0.6840640566192517, "learning_rate": 1.952061685234541e-06, "loss": 0.0142, "step": 6269 }, { "epoch": 2.8525932666060054, "grad_norm": 0.5192131121417413, "learning_rate": 1.951364435936974e-06, "loss": 0.0132, "step": 6270 }, { "epoch": 2.8530482256596907, "grad_norm": 0.3898327705425861, "learning_rate": 1.9506672314714285e-06, "loss": 0.0091, "step": 6271 }, { "epoch": 2.853503184713376, "grad_norm": 0.9589571482184926, "learning_rate": 1.9499700718948754e-06, "loss": 0.0355, "step": 6272 }, { "epoch": 2.853958143767061, "grad_norm": 0.5235307062694303, "learning_rate": 1.9492729572642846e-06, "loss": 0.0071, "step": 6273 }, { "epoch": 2.8544131028207462, "grad_norm": 0.5207323059741937, "learning_rate": 1.94857588763662e-06, "loss": 0.0101, "step": 6274 }, { "epoch": 2.8548680618744315, "grad_norm": 0.4987617413309965, "learning_rate": 1.9478788630688444e-06, "loss": 0.0088, "step": 6275 }, { "epoch": 2.8553230209281164, "grad_norm": 0.42929807950297416, "learning_rate": 1.9471818836179137e-06, "loss": 0.0056, "step": 6276 }, { "epoch": 2.8557779799818017, "grad_norm": 0.4324780634846511, "learning_rate": 1.9464849493407836e-06, "loss": 0.0089, "step": 6277 }, { "epoch": 2.856232939035487, "grad_norm": 0.6214471473443827, "learning_rate": 1.945788060294404e-06, "loss": 0.0066, "step": 6278 }, { "epoch": 2.856687898089172, "grad_norm": 0.3090681173691223, "learning_rate": 1.945091216535721e-06, "loss": 0.0048, "step": 6279 }, { "epoch": 2.857142857142857, "grad_norm": 0.47500259305150244, "learning_rate": 1.9443944181216782e-06, "loss": 0.0111, "step": 6280 }, { "epoch": 2.8575978161965425, "grad_norm": 0.40782653437819444, "learning_rate": 1.9436976651092143e-06, "loss": 0.0101, "step": 6281 }, { "epoch": 2.8580527752502274, "grad_norm": 0.6916136658955463, "learning_rate": 1.943000957555265e-06, "loss": 0.0245, "step": 6282 }, { "epoch": 2.8585077343039127, "grad_norm": 0.563083870436295, "learning_rate": 1.9423042955167613e-06, "loss": 0.0104, "step": 6283 }, { "epoch": 2.858962693357598, "grad_norm": 0.4152057954391422, "learning_rate": 1.941607679050633e-06, "loss": 0.007, "step": 6284 }, { "epoch": 2.859417652411283, "grad_norm": 0.24463049261035472, "learning_rate": 1.9409111082138034e-06, "loss": 0.0035, "step": 6285 }, { "epoch": 2.859872611464968, "grad_norm": 0.4807592301460576, "learning_rate": 1.9402145830631926e-06, "loss": 0.0092, "step": 6286 }, { "epoch": 2.8603275705186535, "grad_norm": 0.7010071018794405, "learning_rate": 1.939518103655719e-06, "loss": 0.012, "step": 6287 }, { "epoch": 2.8607825295723384, "grad_norm": 0.45307361123879814, "learning_rate": 1.938821670048295e-06, "loss": 0.0175, "step": 6288 }, { "epoch": 2.8612374886260237, "grad_norm": 0.4496635949838475, "learning_rate": 1.938125282297829e-06, "loss": 0.006, "step": 6289 }, { "epoch": 2.861692447679709, "grad_norm": 0.3302751441949611, "learning_rate": 1.9374289404612266e-06, "loss": 0.0052, "step": 6290 }, { "epoch": 2.862147406733394, "grad_norm": 0.4985253168419252, "learning_rate": 1.9367326445953924e-06, "loss": 0.0123, "step": 6291 }, { "epoch": 2.862602365787079, "grad_norm": 0.2958668192408477, "learning_rate": 1.936036394757222e-06, "loss": 0.004, "step": 6292 }, { "epoch": 2.8630573248407645, "grad_norm": 0.32512733996569726, "learning_rate": 1.9353401910036115e-06, "loss": 0.0046, "step": 6293 }, { "epoch": 2.8635122838944493, "grad_norm": 0.31161915076648455, "learning_rate": 1.934644033391449e-06, "loss": 0.005, "step": 6294 }, { "epoch": 2.8639672429481347, "grad_norm": 0.358444743819044, "learning_rate": 1.9339479219776246e-06, "loss": 0.0064, "step": 6295 }, { "epoch": 2.86442220200182, "grad_norm": 0.2902082677243572, "learning_rate": 1.9332518568190186e-06, "loss": 0.0039, "step": 6296 }, { "epoch": 2.864877161055505, "grad_norm": 0.5523524322005023, "learning_rate": 1.9325558379725113e-06, "loss": 0.0162, "step": 6297 }, { "epoch": 2.86533212010919, "grad_norm": 0.8228557745385348, "learning_rate": 1.931859865494979e-06, "loss": 0.0193, "step": 6298 }, { "epoch": 2.8657870791628755, "grad_norm": 0.4616668660586284, "learning_rate": 1.9311639394432926e-06, "loss": 0.0085, "step": 6299 }, { "epoch": 2.8662420382165603, "grad_norm": 0.5780211780874812, "learning_rate": 1.930468059874321e-06, "loss": 0.0162, "step": 6300 }, { "epoch": 2.8666969972702456, "grad_norm": 0.5966771142770857, "learning_rate": 1.9297722268449264e-06, "loss": 0.0134, "step": 6301 }, { "epoch": 2.867151956323931, "grad_norm": 0.32000623260067396, "learning_rate": 1.9290764404119714e-06, "loss": 0.0064, "step": 6302 }, { "epoch": 2.867606915377616, "grad_norm": 0.5418482020418041, "learning_rate": 1.9283807006323104e-06, "loss": 0.0216, "step": 6303 }, { "epoch": 2.868061874431301, "grad_norm": 0.42891753019929685, "learning_rate": 1.9276850075627968e-06, "loss": 0.0068, "step": 6304 }, { "epoch": 2.8685168334849864, "grad_norm": 0.5024815417881062, "learning_rate": 1.926989361260281e-06, "loss": 0.0113, "step": 6305 }, { "epoch": 2.8689717925386713, "grad_norm": 0.40772108961950876, "learning_rate": 1.9262937617816062e-06, "loss": 0.0068, "step": 6306 }, { "epoch": 2.8694267515923566, "grad_norm": 0.46339288357377506, "learning_rate": 1.925598209183615e-06, "loss": 0.0154, "step": 6307 }, { "epoch": 2.869881710646042, "grad_norm": 2.2731455516800056, "learning_rate": 1.924902703523144e-06, "loss": 0.0054, "step": 6308 }, { "epoch": 2.870336669699727, "grad_norm": 0.34744711474521905, "learning_rate": 1.924207244857027e-06, "loss": 0.007, "step": 6309 }, { "epoch": 2.870791628753412, "grad_norm": 0.5987396148123377, "learning_rate": 1.9235118332420934e-06, "loss": 0.0205, "step": 6310 }, { "epoch": 2.8712465878070974, "grad_norm": 0.5679529759087092, "learning_rate": 1.9228164687351688e-06, "loss": 0.0149, "step": 6311 }, { "epoch": 2.8717015468607823, "grad_norm": 0.516573303690946, "learning_rate": 1.9221211513930766e-06, "loss": 0.0109, "step": 6312 }, { "epoch": 2.8721565059144676, "grad_norm": 0.47181784637952195, "learning_rate": 1.9214258812726338e-06, "loss": 0.0062, "step": 6313 }, { "epoch": 2.872611464968153, "grad_norm": 0.6121676505757293, "learning_rate": 1.920730658430656e-06, "loss": 0.0161, "step": 6314 }, { "epoch": 2.8730664240218378, "grad_norm": 0.7059549506900292, "learning_rate": 1.920035482923952e-06, "loss": 0.0059, "step": 6315 }, { "epoch": 2.873521383075523, "grad_norm": 0.5594674143813881, "learning_rate": 1.91934035480933e-06, "loss": 0.0116, "step": 6316 }, { "epoch": 2.8739763421292084, "grad_norm": 0.46141250650935606, "learning_rate": 1.9186452741435914e-06, "loss": 0.0115, "step": 6317 }, { "epoch": 2.8744313011828937, "grad_norm": 0.7104742678094401, "learning_rate": 1.917950240983535e-06, "loss": 0.0198, "step": 6318 }, { "epoch": 2.8748862602365786, "grad_norm": 0.329921321191989, "learning_rate": 1.917255255385957e-06, "loss": 0.0069, "step": 6319 }, { "epoch": 2.875341219290264, "grad_norm": 0.6469750225117502, "learning_rate": 1.916560317407648e-06, "loss": 0.0137, "step": 6320 }, { "epoch": 2.875796178343949, "grad_norm": 0.2344443115882633, "learning_rate": 1.9158654271053957e-06, "loss": 0.0022, "step": 6321 }, { "epoch": 2.876251137397634, "grad_norm": 0.36680583922902305, "learning_rate": 1.9151705845359825e-06, "loss": 0.0068, "step": 6322 }, { "epoch": 2.8767060964513194, "grad_norm": 0.45238457733720355, "learning_rate": 1.914475789756187e-06, "loss": 0.0082, "step": 6323 }, { "epoch": 2.8771610555050047, "grad_norm": 0.4503793897456432, "learning_rate": 1.913781042822787e-06, "loss": 0.0156, "step": 6324 }, { "epoch": 2.87761601455869, "grad_norm": 0.4377442151049546, "learning_rate": 1.913086343792552e-06, "loss": 0.0094, "step": 6325 }, { "epoch": 2.878070973612375, "grad_norm": 0.5751764279674006, "learning_rate": 1.9123916927222506e-06, "loss": 0.0137, "step": 6326 }, { "epoch": 2.87852593266606, "grad_norm": 0.3261270116072777, "learning_rate": 1.9116970896686467e-06, "loss": 0.005, "step": 6327 }, { "epoch": 2.8789808917197455, "grad_norm": 0.5374141171455415, "learning_rate": 1.9110025346885e-06, "loss": 0.0109, "step": 6328 }, { "epoch": 2.8794358507734303, "grad_norm": 0.4462045469525876, "learning_rate": 1.910308027838566e-06, "loss": 0.0087, "step": 6329 }, { "epoch": 2.8798908098271156, "grad_norm": 0.5930719753041798, "learning_rate": 1.909613569175597e-06, "loss": 0.0215, "step": 6330 }, { "epoch": 2.880345768880801, "grad_norm": 0.5738404901670497, "learning_rate": 1.9089191587563414e-06, "loss": 0.0082, "step": 6331 }, { "epoch": 2.880800727934486, "grad_norm": 0.5781485808208172, "learning_rate": 1.9082247966375417e-06, "loss": 0.015, "step": 6332 }, { "epoch": 2.881255686988171, "grad_norm": 0.5450804918201388, "learning_rate": 1.90753048287594e-06, "loss": 0.0121, "step": 6333 }, { "epoch": 2.8817106460418564, "grad_norm": 0.3128216287243589, "learning_rate": 1.906836217528272e-06, "loss": 0.0053, "step": 6334 }, { "epoch": 2.8821656050955413, "grad_norm": 0.46956133438248066, "learning_rate": 1.906142000651269e-06, "loss": 0.0106, "step": 6335 }, { "epoch": 2.8826205641492266, "grad_norm": 0.561493275110418, "learning_rate": 1.9054478323016607e-06, "loss": 0.0096, "step": 6336 }, { "epoch": 2.883075523202912, "grad_norm": 0.5551396424278942, "learning_rate": 1.9047537125361695e-06, "loss": 0.0127, "step": 6337 }, { "epoch": 2.883530482256597, "grad_norm": 0.42519224239030057, "learning_rate": 1.9040596414115175e-06, "loss": 0.0069, "step": 6338 }, { "epoch": 2.883985441310282, "grad_norm": 0.4865093556687679, "learning_rate": 1.9033656189844196e-06, "loss": 0.015, "step": 6339 }, { "epoch": 2.8844404003639674, "grad_norm": 0.23485570373575176, "learning_rate": 1.9026716453115893e-06, "loss": 0.0049, "step": 6340 }, { "epoch": 2.8848953594176523, "grad_norm": 0.8707213797617852, "learning_rate": 1.9019777204497353e-06, "loss": 0.0081, "step": 6341 }, { "epoch": 2.8853503184713376, "grad_norm": 0.45492842975157827, "learning_rate": 1.9012838444555605e-06, "loss": 0.0093, "step": 6342 }, { "epoch": 2.885805277525023, "grad_norm": 0.316303560768036, "learning_rate": 1.900590017385767e-06, "loss": 0.0073, "step": 6343 }, { "epoch": 2.886260236578708, "grad_norm": 0.6067844471976763, "learning_rate": 1.8998962392970496e-06, "loss": 0.0124, "step": 6344 }, { "epoch": 2.886715195632393, "grad_norm": 0.3882501954829309, "learning_rate": 1.899202510246102e-06, "loss": 0.0081, "step": 6345 }, { "epoch": 2.8871701546860784, "grad_norm": 0.5944302830092801, "learning_rate": 1.8985088302896113e-06, "loss": 0.0192, "step": 6346 }, { "epoch": 2.8876251137397633, "grad_norm": 0.4280944557285773, "learning_rate": 1.8978151994842632e-06, "loss": 0.0093, "step": 6347 }, { "epoch": 2.8880800727934486, "grad_norm": 0.493555706587382, "learning_rate": 1.8971216178867378e-06, "loss": 0.0166, "step": 6348 }, { "epoch": 2.888535031847134, "grad_norm": 0.25698501539986707, "learning_rate": 1.8964280855537106e-06, "loss": 0.0027, "step": 6349 }, { "epoch": 2.8889899909008188, "grad_norm": 0.326336129736613, "learning_rate": 1.8957346025418555e-06, "loss": 0.0107, "step": 6350 }, { "epoch": 2.889444949954504, "grad_norm": 0.5079349337973318, "learning_rate": 1.895041168907839e-06, "loss": 0.0154, "step": 6351 }, { "epoch": 2.8898999090081894, "grad_norm": 0.3705408689462415, "learning_rate": 1.894347784708327e-06, "loss": 0.0061, "step": 6352 }, { "epoch": 2.8903548680618742, "grad_norm": 0.6732427584375591, "learning_rate": 1.8936544499999777e-06, "loss": 0.0192, "step": 6353 }, { "epoch": 2.8908098271155596, "grad_norm": 0.4419319619865497, "learning_rate": 1.892961164839449e-06, "loss": 0.0063, "step": 6354 }, { "epoch": 2.891264786169245, "grad_norm": 0.5267787822450777, "learning_rate": 1.892267929283393e-06, "loss": 0.0173, "step": 6355 }, { "epoch": 2.8917197452229297, "grad_norm": 0.7514604931096212, "learning_rate": 1.8915747433884567e-06, "loss": 0.0139, "step": 6356 }, { "epoch": 2.892174704276615, "grad_norm": 0.5379623455382149, "learning_rate": 1.8908816072112857e-06, "loss": 0.0113, "step": 6357 }, { "epoch": 2.8926296633303004, "grad_norm": 0.34454108570752684, "learning_rate": 1.8901885208085186e-06, "loss": 0.0041, "step": 6358 }, { "epoch": 2.8930846223839852, "grad_norm": 0.37602734546503286, "learning_rate": 1.8894954842367912e-06, "loss": 0.0083, "step": 6359 }, { "epoch": 2.8935395814376705, "grad_norm": 0.44011709739534216, "learning_rate": 1.8888024975527359e-06, "loss": 0.0073, "step": 6360 }, { "epoch": 2.893994540491356, "grad_norm": 0.3961460791636242, "learning_rate": 1.8881095608129807e-06, "loss": 0.0086, "step": 6361 }, { "epoch": 2.8944494995450407, "grad_norm": 0.5813977886713859, "learning_rate": 1.8874166740741487e-06, "loss": 0.0158, "step": 6362 }, { "epoch": 2.894904458598726, "grad_norm": 0.5462166653123527, "learning_rate": 1.88672383739286e-06, "loss": 0.012, "step": 6363 }, { "epoch": 2.8953594176524113, "grad_norm": 0.3369726576268672, "learning_rate": 1.8860310508257297e-06, "loss": 0.0033, "step": 6364 }, { "epoch": 2.895814376706096, "grad_norm": 0.5948181591770133, "learning_rate": 1.8853383144293693e-06, "loss": 0.0122, "step": 6365 }, { "epoch": 2.8962693357597815, "grad_norm": 0.23504052882619378, "learning_rate": 1.8846456282603858e-06, "loss": 0.0049, "step": 6366 }, { "epoch": 2.896724294813467, "grad_norm": 0.6087863926892029, "learning_rate": 1.8839529923753822e-06, "loss": 0.0178, "step": 6367 }, { "epoch": 2.8971792538671517, "grad_norm": 0.46281530629800294, "learning_rate": 1.8832604068309588e-06, "loss": 0.0078, "step": 6368 }, { "epoch": 2.897634212920837, "grad_norm": 0.9383218388219906, "learning_rate": 1.8825678716837093e-06, "loss": 0.015, "step": 6369 }, { "epoch": 2.8980891719745223, "grad_norm": 0.2793640818797532, "learning_rate": 1.8818753869902256e-06, "loss": 0.0039, "step": 6370 }, { "epoch": 2.8985441310282076, "grad_norm": 0.5025841865375712, "learning_rate": 1.8811829528070935e-06, "loss": 0.0184, "step": 6371 }, { "epoch": 2.8989990900818925, "grad_norm": 0.5019437971389756, "learning_rate": 1.8804905691908965e-06, "loss": 0.0119, "step": 6372 }, { "epoch": 2.899454049135578, "grad_norm": 0.4527728684297999, "learning_rate": 1.8797982361982118e-06, "loss": 0.0085, "step": 6373 }, { "epoch": 2.899909008189263, "grad_norm": 0.32884107197950274, "learning_rate": 1.8791059538856138e-06, "loss": 0.0054, "step": 6374 }, { "epoch": 2.900363967242948, "grad_norm": 1.0449356330819919, "learning_rate": 1.8784137223096743e-06, "loss": 0.0307, "step": 6375 }, { "epoch": 2.9008189262966333, "grad_norm": 0.5676215298332934, "learning_rate": 1.8777215415269582e-06, "loss": 0.0081, "step": 6376 }, { "epoch": 2.9012738853503186, "grad_norm": 0.4236764083203293, "learning_rate": 1.8770294115940279e-06, "loss": 0.0044, "step": 6377 }, { "epoch": 2.901728844404004, "grad_norm": 0.6709722845762708, "learning_rate": 1.87633733256744e-06, "loss": 0.0139, "step": 6378 }, { "epoch": 2.902183803457689, "grad_norm": 0.7038719825378248, "learning_rate": 1.8756453045037499e-06, "loss": 0.0152, "step": 6379 }, { "epoch": 2.902638762511374, "grad_norm": 0.4183626409325102, "learning_rate": 1.874953327459505e-06, "loss": 0.0066, "step": 6380 }, { "epoch": 2.9030937215650594, "grad_norm": 0.30965112652408006, "learning_rate": 1.874261401491251e-06, "loss": 0.0059, "step": 6381 }, { "epoch": 2.9035486806187443, "grad_norm": 0.3689290502815818, "learning_rate": 1.8735695266555306e-06, "loss": 0.0051, "step": 6382 }, { "epoch": 2.9040036396724296, "grad_norm": 0.4033926358623149, "learning_rate": 1.872877703008879e-06, "loss": 0.0052, "step": 6383 }, { "epoch": 2.904458598726115, "grad_norm": 0.34374790548175294, "learning_rate": 1.87218593060783e-06, "loss": 0.0065, "step": 6384 }, { "epoch": 2.9049135577797998, "grad_norm": 0.38277073104478926, "learning_rate": 1.8714942095089112e-06, "loss": 0.0079, "step": 6385 }, { "epoch": 2.905368516833485, "grad_norm": 0.4780736351047789, "learning_rate": 1.8708025397686474e-06, "loss": 0.0103, "step": 6386 }, { "epoch": 2.9058234758871704, "grad_norm": 0.432567388315983, "learning_rate": 1.8701109214435586e-06, "loss": 0.0079, "step": 6387 }, { "epoch": 2.9062784349408552, "grad_norm": 0.6444904071863528, "learning_rate": 1.8694193545901602e-06, "loss": 0.0175, "step": 6388 }, { "epoch": 2.9067333939945406, "grad_norm": 0.3857423309294066, "learning_rate": 1.868727839264965e-06, "loss": 0.0114, "step": 6389 }, { "epoch": 2.907188353048226, "grad_norm": 0.518856575396282, "learning_rate": 1.86803637552448e-06, "loss": 0.0099, "step": 6390 }, { "epoch": 2.9076433121019107, "grad_norm": 0.3843008194005518, "learning_rate": 1.8673449634252087e-06, "loss": 0.0121, "step": 6391 }, { "epoch": 2.908098271155596, "grad_norm": 0.40741584052257745, "learning_rate": 1.8666536030236494e-06, "loss": 0.0078, "step": 6392 }, { "epoch": 2.9085532302092814, "grad_norm": 0.46834842388971437, "learning_rate": 1.8659622943762978e-06, "loss": 0.0093, "step": 6393 }, { "epoch": 2.9090081892629662, "grad_norm": 0.4160020930644956, "learning_rate": 1.865271037539645e-06, "loss": 0.0084, "step": 6394 }, { "epoch": 2.9094631483166515, "grad_norm": 0.5312773039685089, "learning_rate": 1.864579832570174e-06, "loss": 0.0119, "step": 6395 }, { "epoch": 2.909918107370337, "grad_norm": 0.43296286240328025, "learning_rate": 1.8638886795243718e-06, "loss": 0.0101, "step": 6396 }, { "epoch": 2.9103730664240217, "grad_norm": 0.3668170186524061, "learning_rate": 1.863197578458714e-06, "loss": 0.0102, "step": 6397 }, { "epoch": 2.910828025477707, "grad_norm": 0.38526106542625177, "learning_rate": 1.8625065294296734e-06, "loss": 0.0055, "step": 6398 }, { "epoch": 2.9112829845313923, "grad_norm": 0.4617120657970562, "learning_rate": 1.8618155324937214e-06, "loss": 0.0115, "step": 6399 }, { "epoch": 2.911737943585077, "grad_norm": 0.44545904212264464, "learning_rate": 1.8611245877073214e-06, "loss": 0.0107, "step": 6400 }, { "epoch": 2.9121929026387625, "grad_norm": 0.5236277286142653, "learning_rate": 1.8604336951269352e-06, "loss": 0.0099, "step": 6401 }, { "epoch": 2.912647861692448, "grad_norm": 0.6820746868368712, "learning_rate": 1.8597428548090183e-06, "loss": 0.0152, "step": 6402 }, { "epoch": 2.9131028207461327, "grad_norm": 0.3137214021307924, "learning_rate": 1.8590520668100243e-06, "loss": 0.005, "step": 6403 }, { "epoch": 2.913557779799818, "grad_norm": 0.49143600874156884, "learning_rate": 1.8583613311864018e-06, "loss": 0.0165, "step": 6404 }, { "epoch": 2.9140127388535033, "grad_norm": 0.5659317821589372, "learning_rate": 1.8576706479945928e-06, "loss": 0.0151, "step": 6405 }, { "epoch": 2.914467697907188, "grad_norm": 0.48741741843936603, "learning_rate": 1.8569800172910384e-06, "loss": 0.014, "step": 6406 }, { "epoch": 2.9149226569608735, "grad_norm": 0.2507186358385393, "learning_rate": 1.8562894391321725e-06, "loss": 0.0045, "step": 6407 }, { "epoch": 2.915377616014559, "grad_norm": 0.26894372669167044, "learning_rate": 1.8555989135744273e-06, "loss": 0.0049, "step": 6408 }, { "epoch": 2.9158325750682437, "grad_norm": 0.31572492520992174, "learning_rate": 1.854908440674228e-06, "loss": 0.0059, "step": 6409 }, { "epoch": 2.916287534121929, "grad_norm": 0.5498396781406056, "learning_rate": 1.8542180204879978e-06, "loss": 0.0087, "step": 6410 }, { "epoch": 2.9167424931756143, "grad_norm": 0.4761242161486268, "learning_rate": 1.8535276530721553e-06, "loss": 0.018, "step": 6411 }, { "epoch": 2.917197452229299, "grad_norm": 0.266045015854311, "learning_rate": 1.852837338483113e-06, "loss": 0.0041, "step": 6412 }, { "epoch": 2.9176524112829845, "grad_norm": 0.42840394864807213, "learning_rate": 1.8521470767772814e-06, "loss": 0.0092, "step": 6413 }, { "epoch": 2.91810737033667, "grad_norm": 0.44372913093304206, "learning_rate": 1.8514568680110646e-06, "loss": 0.0081, "step": 6414 }, { "epoch": 2.9185623293903546, "grad_norm": 0.20668370235612793, "learning_rate": 1.850766712240864e-06, "loss": 0.0031, "step": 6415 }, { "epoch": 2.91901728844404, "grad_norm": 0.1745010733242083, "learning_rate": 1.8500766095230749e-06, "loss": 0.0019, "step": 6416 }, { "epoch": 2.9194722474977253, "grad_norm": 0.4073347600053403, "learning_rate": 1.849386559914091e-06, "loss": 0.0113, "step": 6417 }, { "epoch": 2.91992720655141, "grad_norm": 0.47852970563674596, "learning_rate": 1.8486965634702997e-06, "loss": 0.0077, "step": 6418 }, { "epoch": 2.9203821656050954, "grad_norm": 0.3595123983847056, "learning_rate": 1.848006620248083e-06, "loss": 0.007, "step": 6419 }, { "epoch": 2.9208371246587808, "grad_norm": 0.4459578812273464, "learning_rate": 1.847316730303822e-06, "loss": 0.0082, "step": 6420 }, { "epoch": 2.9212920837124656, "grad_norm": 0.5046510726905001, "learning_rate": 1.8466268936938895e-06, "loss": 0.0112, "step": 6421 }, { "epoch": 2.921747042766151, "grad_norm": 0.5024955254349528, "learning_rate": 1.845937110474657e-06, "loss": 0.0122, "step": 6422 }, { "epoch": 2.9222020018198362, "grad_norm": 0.36277392310437745, "learning_rate": 1.8452473807024896e-06, "loss": 0.0041, "step": 6423 }, { "epoch": 2.922656960873521, "grad_norm": 0.35937636661401157, "learning_rate": 1.8445577044337492e-06, "loss": 0.0078, "step": 6424 }, { "epoch": 2.9231119199272064, "grad_norm": 0.6520551722615875, "learning_rate": 1.8438680817247944e-06, "loss": 0.0139, "step": 6425 }, { "epoch": 2.9235668789808917, "grad_norm": 0.3696170927813188, "learning_rate": 1.8431785126319761e-06, "loss": 0.0067, "step": 6426 }, { "epoch": 2.924021838034577, "grad_norm": 0.3476970031636846, "learning_rate": 1.8424889972116442e-06, "loss": 0.0059, "step": 6427 }, { "epoch": 2.924476797088262, "grad_norm": 0.6109020571919648, "learning_rate": 1.8417995355201415e-06, "loss": 0.0164, "step": 6428 }, { "epoch": 2.9249317561419472, "grad_norm": 0.6092016370781483, "learning_rate": 1.8411101276138088e-06, "loss": 0.0106, "step": 6429 }, { "epoch": 2.9253867151956325, "grad_norm": 0.5600942016340329, "learning_rate": 1.8404207735489801e-06, "loss": 0.0075, "step": 6430 }, { "epoch": 2.9258416742493174, "grad_norm": 0.2723993311962136, "learning_rate": 1.8397314733819876e-06, "loss": 0.0032, "step": 6431 }, { "epoch": 2.9262966333030027, "grad_norm": 0.36909658911940957, "learning_rate": 1.839042227169158e-06, "loss": 0.0036, "step": 6432 }, { "epoch": 2.926751592356688, "grad_norm": 0.4249281271690692, "learning_rate": 1.8383530349668127e-06, "loss": 0.0102, "step": 6433 }, { "epoch": 2.9272065514103733, "grad_norm": 0.2948118923510841, "learning_rate": 1.8376638968312687e-06, "loss": 0.0025, "step": 6434 }, { "epoch": 2.927661510464058, "grad_norm": 0.5306451276706241, "learning_rate": 1.8369748128188408e-06, "loss": 0.009, "step": 6435 }, { "epoch": 2.9281164695177435, "grad_norm": 0.5530315668444106, "learning_rate": 1.836285782985836e-06, "loss": 0.0155, "step": 6436 }, { "epoch": 2.928571428571429, "grad_norm": 0.4574750439563228, "learning_rate": 1.8355968073885594e-06, "loss": 0.0114, "step": 6437 }, { "epoch": 2.9290263876251137, "grad_norm": 0.9018591836584824, "learning_rate": 1.8349078860833125e-06, "loss": 0.0236, "step": 6438 }, { "epoch": 2.929481346678799, "grad_norm": 0.7930095568124655, "learning_rate": 1.8342190191263892e-06, "loss": 0.0205, "step": 6439 }, { "epoch": 2.9299363057324843, "grad_norm": 0.5644236267528726, "learning_rate": 1.8335302065740812e-06, "loss": 0.0145, "step": 6440 }, { "epoch": 2.930391264786169, "grad_norm": 0.6180401782316703, "learning_rate": 1.8328414484826746e-06, "loss": 0.0229, "step": 6441 }, { "epoch": 2.9308462238398545, "grad_norm": 0.39377676689399665, "learning_rate": 1.8321527449084525e-06, "loss": 0.0099, "step": 6442 }, { "epoch": 2.93130118289354, "grad_norm": 0.6775582593133752, "learning_rate": 1.8314640959076916e-06, "loss": 0.0192, "step": 6443 }, { "epoch": 2.9317561419472247, "grad_norm": 0.2577622152948973, "learning_rate": 1.8307755015366651e-06, "loss": 0.0041, "step": 6444 }, { "epoch": 2.93221110100091, "grad_norm": 0.32165832965389396, "learning_rate": 1.8300869618516434e-06, "loss": 0.0102, "step": 6445 }, { "epoch": 2.9326660600545953, "grad_norm": 0.6601745965341878, "learning_rate": 1.8293984769088896e-06, "loss": 0.0158, "step": 6446 }, { "epoch": 2.93312101910828, "grad_norm": 0.6475571191588946, "learning_rate": 1.828710046764664e-06, "loss": 0.016, "step": 6447 }, { "epoch": 2.9335759781619655, "grad_norm": 0.4515673961015021, "learning_rate": 1.8280216714752215e-06, "loss": 0.0068, "step": 6448 }, { "epoch": 2.934030937215651, "grad_norm": 0.5761556880715865, "learning_rate": 1.8273333510968142e-06, "loss": 0.0113, "step": 6449 }, { "epoch": 2.9344858962693356, "grad_norm": 1.005099708147736, "learning_rate": 1.8266450856856871e-06, "loss": 0.0266, "step": 6450 }, { "epoch": 2.934940855323021, "grad_norm": 0.39488202338327016, "learning_rate": 1.8259568752980818e-06, "loss": 0.0077, "step": 6451 }, { "epoch": 2.9353958143767063, "grad_norm": 0.5333613061744029, "learning_rate": 1.825268719990238e-06, "loss": 0.0139, "step": 6452 }, { "epoch": 2.935850773430391, "grad_norm": 0.653273696203658, "learning_rate": 1.824580619818387e-06, "loss": 0.0158, "step": 6453 }, { "epoch": 2.9363057324840764, "grad_norm": 0.403495166044093, "learning_rate": 1.823892574838758e-06, "loss": 0.0103, "step": 6454 }, { "epoch": 2.9367606915377618, "grad_norm": 0.6369355764625173, "learning_rate": 1.8232045851075742e-06, "loss": 0.0193, "step": 6455 }, { "epoch": 2.9372156505914466, "grad_norm": 0.46067034036527366, "learning_rate": 1.8225166506810555e-06, "loss": 0.0094, "step": 6456 }, { "epoch": 2.937670609645132, "grad_norm": 0.3515514201930771, "learning_rate": 1.821828771615416e-06, "loss": 0.0072, "step": 6457 }, { "epoch": 2.9381255686988172, "grad_norm": 0.6402659710614452, "learning_rate": 1.8211409479668663e-06, "loss": 0.0238, "step": 6458 }, { "epoch": 2.938580527752502, "grad_norm": 0.36409703311691327, "learning_rate": 1.820453179791614e-06, "loss": 0.0078, "step": 6459 }, { "epoch": 2.9390354868061874, "grad_norm": 0.4488943171963992, "learning_rate": 1.8197654671458581e-06, "loss": 0.0084, "step": 6460 }, { "epoch": 2.9394904458598727, "grad_norm": 0.3777354838588735, "learning_rate": 1.819077810085797e-06, "loss": 0.0091, "step": 6461 }, { "epoch": 2.9399454049135576, "grad_norm": 0.7357951826955476, "learning_rate": 1.8183902086676217e-06, "loss": 0.0215, "step": 6462 }, { "epoch": 2.940400363967243, "grad_norm": 0.34841386900016563, "learning_rate": 1.8177026629475208e-06, "loss": 0.0102, "step": 6463 }, { "epoch": 2.9408553230209282, "grad_norm": 0.43167342392108293, "learning_rate": 1.8170151729816776e-06, "loss": 0.0094, "step": 6464 }, { "epoch": 2.941310282074613, "grad_norm": 0.33852518441745155, "learning_rate": 1.8163277388262678e-06, "loss": 0.0062, "step": 6465 }, { "epoch": 2.9417652411282984, "grad_norm": 0.4515776917292284, "learning_rate": 1.81564036053747e-06, "loss": 0.0105, "step": 6466 }, { "epoch": 2.9422202001819837, "grad_norm": 0.42626406852346516, "learning_rate": 1.8149530381714508e-06, "loss": 0.0102, "step": 6467 }, { "epoch": 2.9426751592356686, "grad_norm": 0.4383032197408218, "learning_rate": 1.8142657717843756e-06, "loss": 0.0163, "step": 6468 }, { "epoch": 2.943130118289354, "grad_norm": 0.23321073492087824, "learning_rate": 1.8135785614324054e-06, "loss": 0.0037, "step": 6469 }, { "epoch": 2.943585077343039, "grad_norm": 0.3714794071754207, "learning_rate": 1.8128914071716943e-06, "loss": 0.0121, "step": 6470 }, { "epoch": 2.944040036396724, "grad_norm": 0.5999713067272705, "learning_rate": 1.8122043090583951e-06, "loss": 0.0152, "step": 6471 }, { "epoch": 2.9444949954504094, "grad_norm": 0.32238975875842635, "learning_rate": 1.811517267148653e-06, "loss": 0.0071, "step": 6472 }, { "epoch": 2.9449499545040947, "grad_norm": 0.3440542932383928, "learning_rate": 1.810830281498611e-06, "loss": 0.0104, "step": 6473 }, { "epoch": 2.9454049135577796, "grad_norm": 0.3146865078726124, "learning_rate": 1.8101433521644063e-06, "loss": 0.0052, "step": 6474 }, { "epoch": 2.945859872611465, "grad_norm": 0.5160794896014017, "learning_rate": 1.8094564792021713e-06, "loss": 0.0118, "step": 6475 }, { "epoch": 2.94631483166515, "grad_norm": 0.35555643334894965, "learning_rate": 1.8087696626680352e-06, "loss": 0.0072, "step": 6476 }, { "epoch": 2.946769790718835, "grad_norm": 0.7359475641771708, "learning_rate": 1.8080829026181197e-06, "loss": 0.0188, "step": 6477 }, { "epoch": 2.9472247497725204, "grad_norm": 0.30428797605388314, "learning_rate": 1.8073961991085453e-06, "loss": 0.0063, "step": 6478 }, { "epoch": 2.9476797088262057, "grad_norm": 0.39537523661612617, "learning_rate": 1.8067095521954248e-06, "loss": 0.0118, "step": 6479 }, { "epoch": 2.9481346678798905, "grad_norm": 0.6207933652498647, "learning_rate": 1.8060229619348693e-06, "loss": 0.0174, "step": 6480 }, { "epoch": 2.948589626933576, "grad_norm": 0.7268385338596233, "learning_rate": 1.805336428382984e-06, "loss": 0.0154, "step": 6481 }, { "epoch": 2.949044585987261, "grad_norm": 0.6474126138632875, "learning_rate": 1.8046499515958683e-06, "loss": 0.0085, "step": 6482 }, { "epoch": 2.9494995450409465, "grad_norm": 0.8984696465588514, "learning_rate": 1.8039635316296184e-06, "loss": 0.029, "step": 6483 }, { "epoch": 2.9499545040946313, "grad_norm": 0.5468490436237049, "learning_rate": 1.8032771685403252e-06, "loss": 0.0112, "step": 6484 }, { "epoch": 2.9504094631483166, "grad_norm": 0.3203701991492718, "learning_rate": 1.802590862384076e-06, "loss": 0.0049, "step": 6485 }, { "epoch": 2.950864422202002, "grad_norm": 0.46112772619588155, "learning_rate": 1.801904613216951e-06, "loss": 0.0064, "step": 6486 }, { "epoch": 2.951319381255687, "grad_norm": 0.24129927789663605, "learning_rate": 1.801218421095029e-06, "loss": 0.0038, "step": 6487 }, { "epoch": 2.951774340309372, "grad_norm": 0.45372961997280936, "learning_rate": 1.8005322860743824e-06, "loss": 0.008, "step": 6488 }, { "epoch": 2.9522292993630574, "grad_norm": 0.22367104273977265, "learning_rate": 1.7998462082110779e-06, "loss": 0.0027, "step": 6489 }, { "epoch": 2.9526842584167428, "grad_norm": 0.39778507721119705, "learning_rate": 1.7991601875611803e-06, "loss": 0.0099, "step": 6490 }, { "epoch": 2.9531392174704276, "grad_norm": 0.6072703477135746, "learning_rate": 1.7984742241807461e-06, "loss": 0.0152, "step": 6491 }, { "epoch": 2.953594176524113, "grad_norm": 0.40395737813160953, "learning_rate": 1.7977883181258316e-06, "loss": 0.0101, "step": 6492 }, { "epoch": 2.9540491355777982, "grad_norm": 0.3855783479896977, "learning_rate": 1.797102469452483e-06, "loss": 0.0057, "step": 6493 }, { "epoch": 2.954504094631483, "grad_norm": 0.4099485179417033, "learning_rate": 1.7964166782167468e-06, "loss": 0.0084, "step": 6494 }, { "epoch": 2.9549590536851684, "grad_norm": 0.42962895107479565, "learning_rate": 1.795730944474663e-06, "loss": 0.0115, "step": 6495 }, { "epoch": 2.9554140127388537, "grad_norm": 0.43800643453939603, "learning_rate": 1.7950452682822655e-06, "loss": 0.0077, "step": 6496 }, { "epoch": 2.9558689717925386, "grad_norm": 0.3394956327308562, "learning_rate": 1.7943596496955856e-06, "loss": 0.0086, "step": 6497 }, { "epoch": 2.956323930846224, "grad_norm": 0.5854904088907839, "learning_rate": 1.7936740887706478e-06, "loss": 0.0103, "step": 6498 }, { "epoch": 2.9567788898999092, "grad_norm": 0.4701087342750594, "learning_rate": 1.7929885855634743e-06, "loss": 0.0114, "step": 6499 }, { "epoch": 2.957233848953594, "grad_norm": 0.39685749457530467, "learning_rate": 1.79230314013008e-06, "loss": 0.0082, "step": 6500 }, { "epoch": 2.9576888080072794, "grad_norm": 0.933548390796242, "learning_rate": 1.7916177525264775e-06, "loss": 0.0331, "step": 6501 }, { "epoch": 2.9581437670609647, "grad_norm": 0.28295422686901384, "learning_rate": 1.790932422808674e-06, "loss": 0.0047, "step": 6502 }, { "epoch": 2.9585987261146496, "grad_norm": 0.36631873682068067, "learning_rate": 1.7902471510326701e-06, "loss": 0.0098, "step": 6503 }, { "epoch": 2.959053685168335, "grad_norm": 0.3501041172918534, "learning_rate": 1.7895619372544636e-06, "loss": 0.0083, "step": 6504 }, { "epoch": 2.95950864422202, "grad_norm": 0.4172549764486443, "learning_rate": 1.7888767815300481e-06, "loss": 0.0087, "step": 6505 }, { "epoch": 2.959963603275705, "grad_norm": 0.31604495534704813, "learning_rate": 1.78819168391541e-06, "loss": 0.0063, "step": 6506 }, { "epoch": 2.9604185623293904, "grad_norm": 0.5059004572834342, "learning_rate": 1.7875066444665324e-06, "loss": 0.0146, "step": 6507 }, { "epoch": 2.9608735213830757, "grad_norm": 0.5292114176722631, "learning_rate": 1.7868216632393951e-06, "loss": 0.0077, "step": 6508 }, { "epoch": 2.9613284804367606, "grad_norm": 0.48610899944905905, "learning_rate": 1.7861367402899705e-06, "loss": 0.0113, "step": 6509 }, { "epoch": 2.961783439490446, "grad_norm": 0.34814596648267365, "learning_rate": 1.7854518756742278e-06, "loss": 0.0071, "step": 6510 }, { "epoch": 2.962238398544131, "grad_norm": 0.658745348286229, "learning_rate": 1.784767069448131e-06, "loss": 0.0246, "step": 6511 }, { "epoch": 2.962693357597816, "grad_norm": 0.7209051241458937, "learning_rate": 1.7840823216676395e-06, "loss": 0.0213, "step": 6512 }, { "epoch": 2.9631483166515014, "grad_norm": 0.43923961590559735, "learning_rate": 1.783397632388707e-06, "loss": 0.0079, "step": 6513 }, { "epoch": 2.9636032757051867, "grad_norm": 0.6357652747098063, "learning_rate": 1.7827130016672836e-06, "loss": 0.0124, "step": 6514 }, { "epoch": 2.9640582347588715, "grad_norm": 0.428132164194889, "learning_rate": 1.7820284295593155e-06, "loss": 0.0114, "step": 6515 }, { "epoch": 2.964513193812557, "grad_norm": 0.47402864238582615, "learning_rate": 1.7813439161207412e-06, "loss": 0.0109, "step": 6516 }, { "epoch": 2.964968152866242, "grad_norm": 0.4395677778427893, "learning_rate": 1.7806594614074973e-06, "loss": 0.0154, "step": 6517 }, { "epoch": 2.965423111919927, "grad_norm": 0.626763452888447, "learning_rate": 1.7799750654755126e-06, "loss": 0.0131, "step": 6518 }, { "epoch": 2.9658780709736123, "grad_norm": 0.47306763094693005, "learning_rate": 1.7792907283807154e-06, "loss": 0.01, "step": 6519 }, { "epoch": 2.9663330300272976, "grad_norm": 0.611323093186345, "learning_rate": 1.778606450179024e-06, "loss": 0.0206, "step": 6520 }, { "epoch": 2.9667879890809825, "grad_norm": 0.4203159332824292, "learning_rate": 1.7779222309263556e-06, "loss": 0.0077, "step": 6521 }, { "epoch": 2.967242948134668, "grad_norm": 0.3028983417612062, "learning_rate": 1.7772380706786222e-06, "loss": 0.0059, "step": 6522 }, { "epoch": 2.967697907188353, "grad_norm": 0.325115054483697, "learning_rate": 1.7765539694917294e-06, "loss": 0.0056, "step": 6523 }, { "epoch": 2.968152866242038, "grad_norm": 0.3939406099756208, "learning_rate": 1.7758699274215796e-06, "loss": 0.009, "step": 6524 }, { "epoch": 2.9686078252957233, "grad_norm": 0.22372003195941495, "learning_rate": 1.7751859445240688e-06, "loss": 0.0034, "step": 6525 }, { "epoch": 2.9690627843494086, "grad_norm": 0.4203235459257464, "learning_rate": 1.7745020208550897e-06, "loss": 0.0086, "step": 6526 }, { "epoch": 2.9695177434030935, "grad_norm": 0.34092719985874076, "learning_rate": 1.7738181564705288e-06, "loss": 0.0078, "step": 6527 }, { "epoch": 2.969972702456779, "grad_norm": 0.403078969308054, "learning_rate": 1.7731343514262683e-06, "loss": 0.0094, "step": 6528 }, { "epoch": 2.970427661510464, "grad_norm": 0.40113620646447995, "learning_rate": 1.772450605778187e-06, "loss": 0.008, "step": 6529 }, { "epoch": 2.970882620564149, "grad_norm": 0.48068965948028713, "learning_rate": 1.771766919582156e-06, "loss": 0.0117, "step": 6530 }, { "epoch": 2.9713375796178343, "grad_norm": 0.3534840358314616, "learning_rate": 1.7710832928940444e-06, "loss": 0.0041, "step": 6531 }, { "epoch": 2.9717925386715196, "grad_norm": 0.5097586204703118, "learning_rate": 1.7703997257697136e-06, "loss": 0.0238, "step": 6532 }, { "epoch": 2.9722474977252045, "grad_norm": 0.48760170770812344, "learning_rate": 1.769716218265023e-06, "loss": 0.009, "step": 6533 }, { "epoch": 2.97270245677889, "grad_norm": 0.47140065180636703, "learning_rate": 1.7690327704358245e-06, "loss": 0.0103, "step": 6534 }, { "epoch": 2.973157415832575, "grad_norm": 0.7137132299900583, "learning_rate": 1.7683493823379666e-06, "loss": 0.0139, "step": 6535 }, { "epoch": 2.9736123748862604, "grad_norm": 0.6179240273787846, "learning_rate": 1.7676660540272945e-06, "loss": 0.019, "step": 6536 }, { "epoch": 2.9740673339399453, "grad_norm": 0.47500124131121974, "learning_rate": 1.7669827855596439e-06, "loss": 0.0159, "step": 6537 }, { "epoch": 2.9745222929936306, "grad_norm": 0.38782195605197267, "learning_rate": 1.766299576990851e-06, "loss": 0.0077, "step": 6538 }, { "epoch": 2.974977252047316, "grad_norm": 0.4919393991057052, "learning_rate": 1.765616428376743e-06, "loss": 0.0132, "step": 6539 }, { "epoch": 2.9754322111010008, "grad_norm": 0.617190915993564, "learning_rate": 1.7649333397731433e-06, "loss": 0.0118, "step": 6540 }, { "epoch": 2.975887170154686, "grad_norm": 0.41222511552503044, "learning_rate": 1.7642503112358725e-06, "loss": 0.0059, "step": 6541 }, { "epoch": 2.9763421292083714, "grad_norm": 0.657610895034959, "learning_rate": 1.7635673428207424e-06, "loss": 0.0181, "step": 6542 }, { "epoch": 2.9767970882620567, "grad_norm": 0.38408605942954716, "learning_rate": 1.762884434583564e-06, "loss": 0.0086, "step": 6543 }, { "epoch": 2.9772520473157416, "grad_norm": 0.5029180867704294, "learning_rate": 1.7622015865801412e-06, "loss": 0.0172, "step": 6544 }, { "epoch": 2.977707006369427, "grad_norm": 0.3854815743496932, "learning_rate": 1.7615187988662724e-06, "loss": 0.0084, "step": 6545 }, { "epoch": 2.978161965423112, "grad_norm": 0.6020588620168125, "learning_rate": 1.760836071497753e-06, "loss": 0.02, "step": 6546 }, { "epoch": 2.978616924476797, "grad_norm": 0.35246371348247674, "learning_rate": 1.7601534045303708e-06, "loss": 0.0049, "step": 6547 }, { "epoch": 2.9790718835304824, "grad_norm": 0.42647811886215525, "learning_rate": 1.7594707980199125e-06, "loss": 0.0097, "step": 6548 }, { "epoch": 2.9795268425841677, "grad_norm": 0.579497185057743, "learning_rate": 1.758788252022155e-06, "loss": 0.0254, "step": 6549 }, { "epoch": 2.9799818016378525, "grad_norm": 0.47425124200445395, "learning_rate": 1.7581057665928747e-06, "loss": 0.0171, "step": 6550 }, { "epoch": 2.980436760691538, "grad_norm": 0.38849335960358855, "learning_rate": 1.7574233417878414e-06, "loss": 0.0106, "step": 6551 }, { "epoch": 2.980891719745223, "grad_norm": 0.7696300520936127, "learning_rate": 1.7567409776628187e-06, "loss": 0.0134, "step": 6552 }, { "epoch": 2.981346678798908, "grad_norm": 0.5508428737746935, "learning_rate": 1.756058674273567e-06, "loss": 0.0101, "step": 6553 }, { "epoch": 2.9818016378525933, "grad_norm": 0.4248365708115526, "learning_rate": 1.755376431675841e-06, "loss": 0.0115, "step": 6554 }, { "epoch": 2.9822565969062786, "grad_norm": 0.3672790656121171, "learning_rate": 1.75469424992539e-06, "loss": 0.0067, "step": 6555 }, { "epoch": 2.9827115559599635, "grad_norm": 0.4470599843528532, "learning_rate": 1.754012129077959e-06, "loss": 0.0062, "step": 6556 }, { "epoch": 2.983166515013649, "grad_norm": 0.4569385424378455, "learning_rate": 1.7533300691892874e-06, "loss": 0.0101, "step": 6557 }, { "epoch": 2.983621474067334, "grad_norm": 0.603432133579873, "learning_rate": 1.752648070315112e-06, "loss": 0.0142, "step": 6558 }, { "epoch": 2.984076433121019, "grad_norm": 0.36759200971382694, "learning_rate": 1.7519661325111603e-06, "loss": 0.0063, "step": 6559 }, { "epoch": 2.9845313921747043, "grad_norm": 0.405435398738984, "learning_rate": 1.7512842558331588e-06, "loss": 0.0103, "step": 6560 }, { "epoch": 2.9849863512283896, "grad_norm": 0.5286251656618303, "learning_rate": 1.7506024403368262e-06, "loss": 0.0132, "step": 6561 }, { "epoch": 2.9854413102820745, "grad_norm": 0.246520891441022, "learning_rate": 1.7499206860778786e-06, "loss": 0.0034, "step": 6562 }, { "epoch": 2.98589626933576, "grad_norm": 0.3886982321278798, "learning_rate": 1.7492389931120241e-06, "loss": 0.0085, "step": 6563 }, { "epoch": 2.986351228389445, "grad_norm": 0.514081959212086, "learning_rate": 1.748557361494969e-06, "loss": 0.0094, "step": 6564 }, { "epoch": 2.98680618744313, "grad_norm": 0.36534858670760645, "learning_rate": 1.7478757912824135e-06, "loss": 0.0087, "step": 6565 }, { "epoch": 2.9872611464968153, "grad_norm": 0.49025689570968406, "learning_rate": 1.7471942825300514e-06, "loss": 0.0093, "step": 6566 }, { "epoch": 2.9877161055505006, "grad_norm": 0.4853337967522962, "learning_rate": 1.7465128352935734e-06, "loss": 0.0172, "step": 6567 }, { "epoch": 2.9881710646041855, "grad_norm": 0.7956688935756319, "learning_rate": 1.7458314496286633e-06, "loss": 0.0146, "step": 6568 }, { "epoch": 2.988626023657871, "grad_norm": 0.5540020259349786, "learning_rate": 1.7451501255910014e-06, "loss": 0.0114, "step": 6569 }, { "epoch": 2.989080982711556, "grad_norm": 0.543256996036825, "learning_rate": 1.7444688632362616e-06, "loss": 0.0137, "step": 6570 }, { "epoch": 2.989535941765241, "grad_norm": 0.40818360117801816, "learning_rate": 1.743787662620115e-06, "loss": 0.0082, "step": 6571 }, { "epoch": 2.9899909008189263, "grad_norm": 0.5692978498422049, "learning_rate": 1.7431065237982258e-06, "loss": 0.0132, "step": 6572 }, { "epoch": 2.9904458598726116, "grad_norm": 0.38616793421792933, "learning_rate": 1.7424254468262531e-06, "loss": 0.0106, "step": 6573 }, { "epoch": 2.9909008189262964, "grad_norm": 0.409580050791778, "learning_rate": 1.7417444317598522e-06, "loss": 0.0063, "step": 6574 }, { "epoch": 2.9913557779799818, "grad_norm": 0.4008548389760227, "learning_rate": 1.741063478654672e-06, "loss": 0.0114, "step": 6575 }, { "epoch": 2.991810737033667, "grad_norm": 0.5421544656358204, "learning_rate": 1.7403825875663567e-06, "loss": 0.0116, "step": 6576 }, { "epoch": 2.992265696087352, "grad_norm": 0.40849898251228, "learning_rate": 1.7397017585505454e-06, "loss": 0.0067, "step": 6577 }, { "epoch": 2.9927206551410372, "grad_norm": 0.5591929143093217, "learning_rate": 1.7390209916628736e-06, "loss": 0.011, "step": 6578 }, { "epoch": 2.9931756141947226, "grad_norm": 0.3384260725999993, "learning_rate": 1.7383402869589696e-06, "loss": 0.0056, "step": 6579 }, { "epoch": 2.9936305732484074, "grad_norm": 0.6398159013027072, "learning_rate": 1.7376596444944583e-06, "loss": 0.0141, "step": 6580 }, { "epoch": 2.9940855323020927, "grad_norm": 0.23516131861322706, "learning_rate": 1.7369790643249573e-06, "loss": 0.0037, "step": 6581 }, { "epoch": 2.994540491355778, "grad_norm": 0.41334611010840666, "learning_rate": 1.7362985465060823e-06, "loss": 0.0082, "step": 6582 }, { "epoch": 2.994995450409463, "grad_norm": 2.484885582570131, "learning_rate": 1.7356180910934407e-06, "loss": 0.0188, "step": 6583 }, { "epoch": 2.9954504094631482, "grad_norm": 0.30678994942273635, "learning_rate": 1.7349376981426358e-06, "loss": 0.0057, "step": 6584 }, { "epoch": 2.9959053685168335, "grad_norm": 0.31466473832849196, "learning_rate": 1.7342573677092684e-06, "loss": 0.0053, "step": 6585 }, { "epoch": 2.9963603275705184, "grad_norm": 0.38298145467538364, "learning_rate": 1.7335770998489304e-06, "loss": 0.0097, "step": 6586 }, { "epoch": 2.9968152866242037, "grad_norm": 0.3445837824619045, "learning_rate": 1.7328968946172114e-06, "loss": 0.009, "step": 6587 }, { "epoch": 2.997270245677889, "grad_norm": 0.4486458356380175, "learning_rate": 1.7322167520696933e-06, "loss": 0.0088, "step": 6588 }, { "epoch": 2.997725204731574, "grad_norm": 0.42461915789370436, "learning_rate": 1.7315366722619554e-06, "loss": 0.0082, "step": 6589 }, { "epoch": 2.998180163785259, "grad_norm": 0.5073872815900401, "learning_rate": 1.7308566552495698e-06, "loss": 0.0149, "step": 6590 }, { "epoch": 2.9986351228389445, "grad_norm": 0.3967272609610882, "learning_rate": 1.7301767010881044e-06, "loss": 0.0039, "step": 6591 }, { "epoch": 2.99909008189263, "grad_norm": 0.4210034535986422, "learning_rate": 1.729496809833124e-06, "loss": 0.0068, "step": 6592 }, { "epoch": 2.9995450409463147, "grad_norm": 0.52724973876027, "learning_rate": 1.7288169815401833e-06, "loss": 0.0091, "step": 6593 }, { "epoch": 3.0, "grad_norm": 0.39238584971615514, "learning_rate": 1.7281372162648375e-06, "loss": 0.0098, "step": 6594 }, { "epoch": 3.0004549590536853, "grad_norm": 0.18871704906893602, "learning_rate": 1.7274575140626318e-06, "loss": 0.0019, "step": 6595 }, { "epoch": 3.00090991810737, "grad_norm": 0.24626676344577259, "learning_rate": 1.7267778749891097e-06, "loss": 0.004, "step": 6596 }, { "epoch": 3.0013648771610555, "grad_norm": 0.16101752656097085, "learning_rate": 1.7260982990998075e-06, "loss": 0.002, "step": 6597 }, { "epoch": 3.001819836214741, "grad_norm": 0.1435785228062512, "learning_rate": 1.7254187864502569e-06, "loss": 0.0023, "step": 6598 }, { "epoch": 3.0022747952684257, "grad_norm": 0.31944921129275755, "learning_rate": 1.724739337095986e-06, "loss": 0.0053, "step": 6599 }, { "epoch": 3.002729754322111, "grad_norm": 0.13428935231996958, "learning_rate": 1.724059951092515e-06, "loss": 0.0016, "step": 6600 }, { "epoch": 3.0031847133757963, "grad_norm": 0.2480856301914793, "learning_rate": 1.7233806284953613e-06, "loss": 0.0035, "step": 6601 }, { "epoch": 3.003639672429481, "grad_norm": 0.4339430173386617, "learning_rate": 1.7227013693600348e-06, "loss": 0.0068, "step": 6602 }, { "epoch": 3.0040946314831665, "grad_norm": 0.19176098382149298, "learning_rate": 1.7220221737420428e-06, "loss": 0.0019, "step": 6603 }, { "epoch": 3.0045495905368518, "grad_norm": 0.35241881190671526, "learning_rate": 1.7213430416968848e-06, "loss": 0.0073, "step": 6604 }, { "epoch": 3.0050045495905366, "grad_norm": 0.34558718305719144, "learning_rate": 1.7206639732800568e-06, "loss": 0.0042, "step": 6605 }, { "epoch": 3.005459508644222, "grad_norm": 0.18884737709124044, "learning_rate": 1.7199849685470498e-06, "loss": 0.0047, "step": 6606 }, { "epoch": 3.0059144676979073, "grad_norm": 0.30123493101264553, "learning_rate": 1.7193060275533488e-06, "loss": 0.0028, "step": 6607 }, { "epoch": 3.0063694267515926, "grad_norm": 0.2307463484696562, "learning_rate": 1.718627150354434e-06, "loss": 0.0054, "step": 6608 }, { "epoch": 3.0068243858052774, "grad_norm": 0.26088568443769705, "learning_rate": 1.7179483370057797e-06, "loss": 0.0037, "step": 6609 }, { "epoch": 3.0072793448589628, "grad_norm": 0.1798259979550412, "learning_rate": 1.7172695875628553e-06, "loss": 0.0014, "step": 6610 }, { "epoch": 3.007734303912648, "grad_norm": 0.49710248833906284, "learning_rate": 1.7165909020811255e-06, "loss": 0.0079, "step": 6611 }, { "epoch": 3.008189262966333, "grad_norm": 0.20715902468220426, "learning_rate": 1.7159122806160488e-06, "loss": 0.0018, "step": 6612 }, { "epoch": 3.0086442220200182, "grad_norm": 0.37296249656432, "learning_rate": 1.7152337232230798e-06, "loss": 0.0079, "step": 6613 }, { "epoch": 3.0090991810737036, "grad_norm": 0.26288990057494804, "learning_rate": 1.714555229957668e-06, "loss": 0.0034, "step": 6614 }, { "epoch": 3.0095541401273884, "grad_norm": 0.34076600506508065, "learning_rate": 1.7138768008752545e-06, "loss": 0.003, "step": 6615 }, { "epoch": 3.0100090991810737, "grad_norm": 0.06987010251435957, "learning_rate": 1.7131984360312799e-06, "loss": 0.0008, "step": 6616 }, { "epoch": 3.010464058234759, "grad_norm": 0.39596485124547365, "learning_rate": 1.7125201354811749e-06, "loss": 0.0023, "step": 6617 }, { "epoch": 3.010919017288444, "grad_norm": 0.37717723006285186, "learning_rate": 1.711841899280369e-06, "loss": 0.0043, "step": 6618 }, { "epoch": 3.011373976342129, "grad_norm": 0.22374084605218367, "learning_rate": 1.7111637274842827e-06, "loss": 0.0026, "step": 6619 }, { "epoch": 3.0118289353958145, "grad_norm": 0.2041809533763727, "learning_rate": 1.7104856201483346e-06, "loss": 0.0025, "step": 6620 }, { "epoch": 3.0122838944494994, "grad_norm": 0.1451787686573356, "learning_rate": 1.709807577327937e-06, "loss": 0.0019, "step": 6621 }, { "epoch": 3.0127388535031847, "grad_norm": 0.2047717472579055, "learning_rate": 1.7091295990784952e-06, "loss": 0.0034, "step": 6622 }, { "epoch": 3.01319381255687, "grad_norm": 0.27135890707749444, "learning_rate": 1.708451685455411e-06, "loss": 0.0025, "step": 6623 }, { "epoch": 3.013648771610555, "grad_norm": 0.26928218479147914, "learning_rate": 1.7077738365140805e-06, "loss": 0.0036, "step": 6624 }, { "epoch": 3.01410373066424, "grad_norm": 0.12843391677147106, "learning_rate": 1.707096052309895e-06, "loss": 0.0017, "step": 6625 }, { "epoch": 3.0145586897179255, "grad_norm": 0.4257716770258478, "learning_rate": 1.706418332898238e-06, "loss": 0.0056, "step": 6626 }, { "epoch": 3.0150136487716104, "grad_norm": 0.09818215473289016, "learning_rate": 1.7057406783344918e-06, "loss": 0.001, "step": 6627 }, { "epoch": 3.0154686078252957, "grad_norm": 0.18295468383765298, "learning_rate": 1.705063088674031e-06, "loss": 0.0017, "step": 6628 }, { "epoch": 3.015923566878981, "grad_norm": 0.2737419797682681, "learning_rate": 1.704385563972224e-06, "loss": 0.0023, "step": 6629 }, { "epoch": 3.016378525932666, "grad_norm": 0.17372004665297497, "learning_rate": 1.7037081042844367e-06, "loss": 0.0032, "step": 6630 }, { "epoch": 3.016833484986351, "grad_norm": 0.1473502490965731, "learning_rate": 1.7030307096660262e-06, "loss": 0.0012, "step": 6631 }, { "epoch": 3.0172884440400365, "grad_norm": 0.4459812955231915, "learning_rate": 1.7023533801723474e-06, "loss": 0.0044, "step": 6632 }, { "epoch": 3.0177434030937214, "grad_norm": 0.49124786887065763, "learning_rate": 1.7016761158587474e-06, "loss": 0.007, "step": 6633 }, { "epoch": 3.0181983621474067, "grad_norm": 0.11955168079322342, "learning_rate": 1.7009989167805707e-06, "loss": 0.001, "step": 6634 }, { "epoch": 3.018653321201092, "grad_norm": 0.19953195925833736, "learning_rate": 1.7003217829931545e-06, "loss": 0.0031, "step": 6635 }, { "epoch": 3.0191082802547773, "grad_norm": 0.15278083420320596, "learning_rate": 1.6996447145518307e-06, "loss": 0.0014, "step": 6636 }, { "epoch": 3.019563239308462, "grad_norm": 0.22016839956690523, "learning_rate": 1.6989677115119268e-06, "loss": 0.0045, "step": 6637 }, { "epoch": 3.0200181983621475, "grad_norm": 0.14575654517139758, "learning_rate": 1.6982907739287636e-06, "loss": 0.0009, "step": 6638 }, { "epoch": 3.0204731574158328, "grad_norm": 0.31764612494012034, "learning_rate": 1.6976139018576581e-06, "loss": 0.0035, "step": 6639 }, { "epoch": 3.0209281164695176, "grad_norm": 0.12877311116772863, "learning_rate": 1.6969370953539202e-06, "loss": 0.0017, "step": 6640 }, { "epoch": 3.021383075523203, "grad_norm": 0.2395449296665478, "learning_rate": 1.6962603544728567e-06, "loss": 0.0028, "step": 6641 }, { "epoch": 3.0218380345768883, "grad_norm": 0.27987337085713127, "learning_rate": 1.695583679269768e-06, "loss": 0.0027, "step": 6642 }, { "epoch": 3.022292993630573, "grad_norm": 0.05115519276170048, "learning_rate": 1.6949070697999479e-06, "loss": 0.0006, "step": 6643 }, { "epoch": 3.0227479526842584, "grad_norm": 0.28461503189842247, "learning_rate": 1.6942305261186865e-06, "loss": 0.0082, "step": 6644 }, { "epoch": 3.0232029117379438, "grad_norm": 0.15304472569096794, "learning_rate": 1.6935540482812678e-06, "loss": 0.0014, "step": 6645 }, { "epoch": 3.0236578707916286, "grad_norm": 0.32260355499460414, "learning_rate": 1.6928776363429699e-06, "loss": 0.0048, "step": 6646 }, { "epoch": 3.024112829845314, "grad_norm": 0.1673230466238605, "learning_rate": 1.6922012903590663e-06, "loss": 0.0018, "step": 6647 }, { "epoch": 3.0245677888989992, "grad_norm": 0.2331725445060424, "learning_rate": 1.691525010384826e-06, "loss": 0.0026, "step": 6648 }, { "epoch": 3.025022747952684, "grad_norm": 0.27550847718476296, "learning_rate": 1.6908487964755105e-06, "loss": 0.0038, "step": 6649 }, { "epoch": 3.0254777070063694, "grad_norm": 0.2301407222573616, "learning_rate": 1.690172648686378e-06, "loss": 0.0015, "step": 6650 }, { "epoch": 3.0259326660600547, "grad_norm": 0.466025989867284, "learning_rate": 1.6894965670726782e-06, "loss": 0.0102, "step": 6651 }, { "epoch": 3.0263876251137396, "grad_norm": 0.36603331686003626, "learning_rate": 1.6888205516896599e-06, "loss": 0.0035, "step": 6652 }, { "epoch": 3.026842584167425, "grad_norm": 0.2681604647259943, "learning_rate": 1.6881446025925624e-06, "loss": 0.0033, "step": 6653 }, { "epoch": 3.02729754322111, "grad_norm": 0.11960486472210904, "learning_rate": 1.6874687198366207e-06, "loss": 0.0005, "step": 6654 }, { "epoch": 3.027752502274795, "grad_norm": 0.19072321494077665, "learning_rate": 1.6867929034770672e-06, "loss": 0.0027, "step": 6655 }, { "epoch": 3.0282074613284804, "grad_norm": 0.22741988440364228, "learning_rate": 1.6861171535691245e-06, "loss": 0.0021, "step": 6656 }, { "epoch": 3.0286624203821657, "grad_norm": 0.13977000225651853, "learning_rate": 1.6854414701680133e-06, "loss": 0.0018, "step": 6657 }, { "epoch": 3.0291173794358506, "grad_norm": 0.34980857430872997, "learning_rate": 1.684765853328946e-06, "loss": 0.0065, "step": 6658 }, { "epoch": 3.029572338489536, "grad_norm": 0.4399685652311045, "learning_rate": 1.684090303107132e-06, "loss": 0.0044, "step": 6659 }, { "epoch": 3.030027297543221, "grad_norm": 0.12689920515289774, "learning_rate": 1.6834148195577737e-06, "loss": 0.0019, "step": 6660 }, { "epoch": 3.030482256596906, "grad_norm": 0.31444214043225627, "learning_rate": 1.6827394027360678e-06, "loss": 0.0035, "step": 6661 }, { "epoch": 3.0309372156505914, "grad_norm": 0.48809559593459295, "learning_rate": 1.6820640526972083e-06, "loss": 0.0078, "step": 6662 }, { "epoch": 3.0313921747042767, "grad_norm": 0.3021197269489071, "learning_rate": 1.68138876949638e-06, "loss": 0.0047, "step": 6663 }, { "epoch": 3.031847133757962, "grad_norm": 0.21771453532885757, "learning_rate": 1.6807135531887653e-06, "loss": 0.0024, "step": 6664 }, { "epoch": 3.032302092811647, "grad_norm": 0.17872156844526293, "learning_rate": 1.6800384038295386e-06, "loss": 0.0025, "step": 6665 }, { "epoch": 3.032757051865332, "grad_norm": 0.09990132969909139, "learning_rate": 1.6793633214738713e-06, "loss": 0.001, "step": 6666 }, { "epoch": 3.0332120109190175, "grad_norm": 0.3836758564324542, "learning_rate": 1.6786883061769268e-06, "loss": 0.0058, "step": 6667 }, { "epoch": 3.0336669699727024, "grad_norm": 0.16816931777183775, "learning_rate": 1.6780133579938646e-06, "loss": 0.0016, "step": 6668 }, { "epoch": 3.0341219290263877, "grad_norm": 0.14849217848556467, "learning_rate": 1.6773384769798395e-06, "loss": 0.0019, "step": 6669 }, { "epoch": 3.034576888080073, "grad_norm": 0.1116821440410711, "learning_rate": 1.6766636631899986e-06, "loss": 0.0007, "step": 6670 }, { "epoch": 3.035031847133758, "grad_norm": 0.14090737780178714, "learning_rate": 1.6759889166794851e-06, "loss": 0.0013, "step": 6671 }, { "epoch": 3.035486806187443, "grad_norm": 0.2579835897029483, "learning_rate": 1.6753142375034359e-06, "loss": 0.0025, "step": 6672 }, { "epoch": 3.0359417652411285, "grad_norm": 0.22046142184596768, "learning_rate": 1.6746396257169836e-06, "loss": 0.0009, "step": 6673 }, { "epoch": 3.0363967242948133, "grad_norm": 0.11875685640341987, "learning_rate": 1.6739650813752526e-06, "loss": 0.0012, "step": 6674 }, { "epoch": 3.0368516833484986, "grad_norm": 0.4865399653210732, "learning_rate": 1.6732906045333651e-06, "loss": 0.0061, "step": 6675 }, { "epoch": 3.037306642402184, "grad_norm": 0.2760478900463036, "learning_rate": 1.6726161952464371e-06, "loss": 0.003, "step": 6676 }, { "epoch": 3.037761601455869, "grad_norm": 0.2362274420497099, "learning_rate": 1.6719418535695764e-06, "loss": 0.0031, "step": 6677 }, { "epoch": 3.038216560509554, "grad_norm": 0.4800710479063232, "learning_rate": 1.6712675795578883e-06, "loss": 0.0111, "step": 6678 }, { "epoch": 3.0386715195632394, "grad_norm": 0.21076332926086974, "learning_rate": 1.6705933732664708e-06, "loss": 0.0028, "step": 6679 }, { "epoch": 3.0391264786169243, "grad_norm": 0.10764345122384954, "learning_rate": 1.6699192347504178e-06, "loss": 0.0014, "step": 6680 }, { "epoch": 3.0395814376706096, "grad_norm": 0.10409272157296467, "learning_rate": 1.669245164064815e-06, "loss": 0.0014, "step": 6681 }, { "epoch": 3.040036396724295, "grad_norm": 0.377819005522705, "learning_rate": 1.6685711612647466e-06, "loss": 0.0038, "step": 6682 }, { "epoch": 3.04049135577798, "grad_norm": 0.467002137888794, "learning_rate": 1.6678972264052884e-06, "loss": 0.0055, "step": 6683 }, { "epoch": 3.040946314831665, "grad_norm": 0.22551250911788168, "learning_rate": 1.667223359541511e-06, "loss": 0.0026, "step": 6684 }, { "epoch": 3.0414012738853504, "grad_norm": 0.21439661663412216, "learning_rate": 1.66654956072848e-06, "loss": 0.0015, "step": 6685 }, { "epoch": 3.0418562329390353, "grad_norm": 0.2996576517655944, "learning_rate": 1.6658758300212552e-06, "loss": 0.0032, "step": 6686 }, { "epoch": 3.0423111919927206, "grad_norm": 0.18542905023007442, "learning_rate": 1.66520216747489e-06, "loss": 0.0021, "step": 6687 }, { "epoch": 3.042766151046406, "grad_norm": 0.18248382112113587, "learning_rate": 1.6645285731444332e-06, "loss": 0.0018, "step": 6688 }, { "epoch": 3.0432211101000908, "grad_norm": 0.25260982299994866, "learning_rate": 1.6638550470849298e-06, "loss": 0.0026, "step": 6689 }, { "epoch": 3.043676069153776, "grad_norm": 0.12429105006578883, "learning_rate": 1.6631815893514154e-06, "loss": 0.0011, "step": 6690 }, { "epoch": 3.0441310282074614, "grad_norm": 0.08751300389267144, "learning_rate": 1.6625081999989228e-06, "loss": 0.0007, "step": 6691 }, { "epoch": 3.0445859872611467, "grad_norm": 0.3677695891351524, "learning_rate": 1.6618348790824778e-06, "loss": 0.0051, "step": 6692 }, { "epoch": 3.0450409463148316, "grad_norm": 0.13107819774245225, "learning_rate": 1.6611616266571017e-06, "loss": 0.001, "step": 6693 }, { "epoch": 3.045495905368517, "grad_norm": 0.14253091833442869, "learning_rate": 1.660488442777809e-06, "loss": 0.0007, "step": 6694 }, { "epoch": 3.045950864422202, "grad_norm": 0.21822726043902058, "learning_rate": 1.6598153274996088e-06, "loss": 0.0036, "step": 6695 }, { "epoch": 3.046405823475887, "grad_norm": 0.5150427413164033, "learning_rate": 1.6591422808775068e-06, "loss": 0.0046, "step": 6696 }, { "epoch": 3.0468607825295724, "grad_norm": 0.12559361156588128, "learning_rate": 1.6584693029665e-06, "loss": 0.0013, "step": 6697 }, { "epoch": 3.0473157415832577, "grad_norm": 0.1841335145415453, "learning_rate": 1.657796393821582e-06, "loss": 0.0018, "step": 6698 }, { "epoch": 3.0477707006369426, "grad_norm": 0.15014382945933052, "learning_rate": 1.6571235534977383e-06, "loss": 0.0014, "step": 6699 }, { "epoch": 3.048225659690628, "grad_norm": 0.19281106167263032, "learning_rate": 1.6564507820499526e-06, "loss": 0.0012, "step": 6700 }, { "epoch": 3.048680618744313, "grad_norm": 0.23650310822285409, "learning_rate": 1.6557780795331984e-06, "loss": 0.0041, "step": 6701 }, { "epoch": 3.049135577797998, "grad_norm": 0.169039207652322, "learning_rate": 1.6551054460024468e-06, "loss": 0.0018, "step": 6702 }, { "epoch": 3.0495905368516834, "grad_norm": 0.3620620214570947, "learning_rate": 1.6544328815126639e-06, "loss": 0.0037, "step": 6703 }, { "epoch": 3.0500454959053687, "grad_norm": 0.352706664264297, "learning_rate": 1.6537603861188068e-06, "loss": 0.0056, "step": 6704 }, { "epoch": 3.0505004549590535, "grad_norm": 0.3001774386691079, "learning_rate": 1.6530879598758299e-06, "loss": 0.0037, "step": 6705 }, { "epoch": 3.050955414012739, "grad_norm": 0.16694795901537413, "learning_rate": 1.6524156028386796e-06, "loss": 0.0027, "step": 6706 }, { "epoch": 3.051410373066424, "grad_norm": 0.40977222754149756, "learning_rate": 1.6517433150622992e-06, "loss": 0.0101, "step": 6707 }, { "epoch": 3.051865332120109, "grad_norm": 0.11383319888304519, "learning_rate": 1.651071096601624e-06, "loss": 0.0009, "step": 6708 }, { "epoch": 3.0523202911737943, "grad_norm": 0.13663648144710513, "learning_rate": 1.6503989475115842e-06, "loss": 0.0011, "step": 6709 }, { "epoch": 3.0527752502274796, "grad_norm": 0.11756786808067193, "learning_rate": 1.6497268678471069e-06, "loss": 0.0009, "step": 6710 }, { "epoch": 3.0532302092811645, "grad_norm": 0.24803729467914853, "learning_rate": 1.6490548576631095e-06, "loss": 0.0032, "step": 6711 }, { "epoch": 3.05368516833485, "grad_norm": 0.6060041069670978, "learning_rate": 1.648382917014507e-06, "loss": 0.0064, "step": 6712 }, { "epoch": 3.054140127388535, "grad_norm": 0.3560487163396387, "learning_rate": 1.6477110459562062e-06, "loss": 0.0069, "step": 6713 }, { "epoch": 3.05459508644222, "grad_norm": 0.09582488628088716, "learning_rate": 1.64703924454311e-06, "loss": 0.0007, "step": 6714 }, { "epoch": 3.0550500454959053, "grad_norm": 0.14481264264128357, "learning_rate": 1.6463675128301146e-06, "loss": 0.0017, "step": 6715 }, { "epoch": 3.0555050045495906, "grad_norm": 0.2860290595410573, "learning_rate": 1.6456958508721106e-06, "loss": 0.0021, "step": 6716 }, { "epoch": 3.055959963603276, "grad_norm": 0.31737281015893043, "learning_rate": 1.6450242587239845e-06, "loss": 0.0057, "step": 6717 }, { "epoch": 3.056414922656961, "grad_norm": 0.16390519681831933, "learning_rate": 1.6443527364406142e-06, "loss": 0.0029, "step": 6718 }, { "epoch": 3.056869881710646, "grad_norm": 0.05957250479681491, "learning_rate": 1.6436812840768751e-06, "loss": 0.0003, "step": 6719 }, { "epoch": 3.0573248407643314, "grad_norm": 0.364678748707198, "learning_rate": 1.6430099016876345e-06, "loss": 0.0028, "step": 6720 }, { "epoch": 3.0577797998180163, "grad_norm": 0.2183607332439741, "learning_rate": 1.6423385893277537e-06, "loss": 0.0022, "step": 6721 }, { "epoch": 3.0582347588717016, "grad_norm": 0.6878484085194408, "learning_rate": 1.6416673470520912e-06, "loss": 0.007, "step": 6722 }, { "epoch": 3.058689717925387, "grad_norm": 0.1571580755015069, "learning_rate": 1.6409961749154952e-06, "loss": 0.0014, "step": 6723 }, { "epoch": 3.0591446769790718, "grad_norm": 0.29118857370686246, "learning_rate": 1.6403250729728134e-06, "loss": 0.0012, "step": 6724 }, { "epoch": 3.059599636032757, "grad_norm": 0.13536657455984966, "learning_rate": 1.639654041278885e-06, "loss": 0.0008, "step": 6725 }, { "epoch": 3.0600545950864424, "grad_norm": 0.5078979818189695, "learning_rate": 1.6389830798885425e-06, "loss": 0.0061, "step": 6726 }, { "epoch": 3.0605095541401273, "grad_norm": 0.33941568764404717, "learning_rate": 1.638312188856615e-06, "loss": 0.0018, "step": 6727 }, { "epoch": 3.0609645131938126, "grad_norm": 0.5139330580031816, "learning_rate": 1.6376413682379232e-06, "loss": 0.0059, "step": 6728 }, { "epoch": 3.061419472247498, "grad_norm": 0.22238503418145897, "learning_rate": 1.6369706180872851e-06, "loss": 0.0032, "step": 6729 }, { "epoch": 3.0618744313011828, "grad_norm": 0.30489326116501364, "learning_rate": 1.63629993845951e-06, "loss": 0.0014, "step": 6730 }, { "epoch": 3.062329390354868, "grad_norm": 0.5442630492686906, "learning_rate": 1.6356293294094037e-06, "loss": 0.0127, "step": 6731 }, { "epoch": 3.0627843494085534, "grad_norm": 0.09055319578965666, "learning_rate": 1.6349587909917655e-06, "loss": 0.0006, "step": 6732 }, { "epoch": 3.0632393084622382, "grad_norm": 0.11308457610713135, "learning_rate": 1.6342883232613883e-06, "loss": 0.0012, "step": 6733 }, { "epoch": 3.0636942675159236, "grad_norm": 0.31912278688113216, "learning_rate": 1.63361792627306e-06, "loss": 0.0061, "step": 6734 }, { "epoch": 3.064149226569609, "grad_norm": 0.33968447676185043, "learning_rate": 1.6329476000815616e-06, "loss": 0.0075, "step": 6735 }, { "epoch": 3.0646041856232937, "grad_norm": 0.534906385349234, "learning_rate": 1.6322773447416707e-06, "loss": 0.0078, "step": 6736 }, { "epoch": 3.065059144676979, "grad_norm": 0.2852533801593067, "learning_rate": 1.6316071603081551e-06, "loss": 0.0036, "step": 6737 }, { "epoch": 3.0655141037306644, "grad_norm": 0.16077628131100627, "learning_rate": 1.6309370468357816e-06, "loss": 0.0011, "step": 6738 }, { "epoch": 3.065969062784349, "grad_norm": 0.14126856949795177, "learning_rate": 1.6302670043793084e-06, "loss": 0.0016, "step": 6739 }, { "epoch": 3.0664240218380345, "grad_norm": 0.18170065616644981, "learning_rate": 1.6295970329934873e-06, "loss": 0.0023, "step": 6740 }, { "epoch": 3.06687898089172, "grad_norm": 0.3041652455814574, "learning_rate": 1.6289271327330663e-06, "loss": 0.0033, "step": 6741 }, { "epoch": 3.0673339399454047, "grad_norm": 0.5249288752, "learning_rate": 1.628257303652786e-06, "loss": 0.016, "step": 6742 }, { "epoch": 3.06778889899909, "grad_norm": 0.627171852610698, "learning_rate": 1.6275875458073828e-06, "loss": 0.0082, "step": 6743 }, { "epoch": 3.0682438580527753, "grad_norm": 0.2112946244848515, "learning_rate": 1.6269178592515844e-06, "loss": 0.0018, "step": 6744 }, { "epoch": 3.06869881710646, "grad_norm": 0.43541541988296456, "learning_rate": 1.6262482440401162e-06, "loss": 0.0054, "step": 6745 }, { "epoch": 3.0691537761601455, "grad_norm": 0.21124975573728358, "learning_rate": 1.6255787002276962e-06, "loss": 0.0013, "step": 6746 }, { "epoch": 3.069608735213831, "grad_norm": 0.14033184695249074, "learning_rate": 1.6249092278690353e-06, "loss": 0.0019, "step": 6747 }, { "epoch": 3.070063694267516, "grad_norm": 0.19264550903956684, "learning_rate": 1.6242398270188412e-06, "loss": 0.0023, "step": 6748 }, { "epoch": 3.070518653321201, "grad_norm": 0.2100841670165044, "learning_rate": 1.6235704977318128e-06, "loss": 0.0038, "step": 6749 }, { "epoch": 3.0709736123748863, "grad_norm": 0.13008763137976845, "learning_rate": 1.622901240062646e-06, "loss": 0.0017, "step": 6750 }, { "epoch": 3.0714285714285716, "grad_norm": 0.5801982614494698, "learning_rate": 1.622232054066028e-06, "loss": 0.0124, "step": 6751 }, { "epoch": 3.0718835304822565, "grad_norm": 0.22505278586144375, "learning_rate": 1.6215629397966432e-06, "loss": 0.0024, "step": 6752 }, { "epoch": 3.072338489535942, "grad_norm": 0.15819513393418455, "learning_rate": 1.620893897309168e-06, "loss": 0.0015, "step": 6753 }, { "epoch": 3.072793448589627, "grad_norm": 0.14952535622036248, "learning_rate": 1.620224926658274e-06, "loss": 0.002, "step": 6754 }, { "epoch": 3.073248407643312, "grad_norm": 0.15713279599494986, "learning_rate": 1.619556027898625e-06, "loss": 0.0011, "step": 6755 }, { "epoch": 3.0737033666969973, "grad_norm": 0.18434451929717538, "learning_rate": 1.6188872010848821e-06, "loss": 0.0011, "step": 6756 }, { "epoch": 3.0741583257506826, "grad_norm": 0.2912312874193071, "learning_rate": 1.6182184462716977e-06, "loss": 0.0055, "step": 6757 }, { "epoch": 3.0746132848043675, "grad_norm": 0.35532925198836596, "learning_rate": 1.617549763513719e-06, "loss": 0.0072, "step": 6758 }, { "epoch": 3.0750682438580528, "grad_norm": 0.2811421542224645, "learning_rate": 1.6168811528655897e-06, "loss": 0.0063, "step": 6759 }, { "epoch": 3.075523202911738, "grad_norm": 0.1781791409545224, "learning_rate": 1.616212614381944e-06, "loss": 0.0011, "step": 6760 }, { "epoch": 3.075978161965423, "grad_norm": 0.15608219514217067, "learning_rate": 1.6155441481174128e-06, "loss": 0.0023, "step": 6761 }, { "epoch": 3.0764331210191083, "grad_norm": 0.3080645294530636, "learning_rate": 1.614875754126619e-06, "loss": 0.0025, "step": 6762 }, { "epoch": 3.0768880800727936, "grad_norm": 0.5069802422043405, "learning_rate": 1.614207432464182e-06, "loss": 0.0095, "step": 6763 }, { "epoch": 3.0773430391264784, "grad_norm": 0.14001087385754882, "learning_rate": 1.6135391831847127e-06, "loss": 0.0014, "step": 6764 }, { "epoch": 3.0777979981801638, "grad_norm": 0.20389951870370754, "learning_rate": 1.6128710063428179e-06, "loss": 0.0022, "step": 6765 }, { "epoch": 3.078252957233849, "grad_norm": 0.24375417581446612, "learning_rate": 1.612202901993099e-06, "loss": 0.0023, "step": 6766 }, { "epoch": 3.078707916287534, "grad_norm": 0.18952136591708305, "learning_rate": 1.6115348701901496e-06, "loss": 0.0019, "step": 6767 }, { "epoch": 3.0791628753412192, "grad_norm": 0.3267912552572459, "learning_rate": 1.6108669109885583e-06, "loss": 0.0032, "step": 6768 }, { "epoch": 3.0796178343949046, "grad_norm": 0.2671212320833274, "learning_rate": 1.6101990244429077e-06, "loss": 0.0034, "step": 6769 }, { "epoch": 3.0800727934485894, "grad_norm": 0.08158482083321156, "learning_rate": 1.6095312106077749e-06, "loss": 0.0008, "step": 6770 }, { "epoch": 3.0805277525022747, "grad_norm": 0.3074069247416491, "learning_rate": 1.6088634695377294e-06, "loss": 0.0043, "step": 6771 }, { "epoch": 3.08098271155596, "grad_norm": 0.06972593722683812, "learning_rate": 1.6081958012873367e-06, "loss": 0.0007, "step": 6772 }, { "epoch": 3.0814376706096454, "grad_norm": 0.325099103545562, "learning_rate": 1.6075282059111565e-06, "loss": 0.0046, "step": 6773 }, { "epoch": 3.08189262966333, "grad_norm": 0.4065414430482593, "learning_rate": 1.6068606834637406e-06, "loss": 0.0092, "step": 6774 }, { "epoch": 3.0823475887170155, "grad_norm": 0.3353853316542865, "learning_rate": 1.6061932339996366e-06, "loss": 0.0019, "step": 6775 }, { "epoch": 3.082802547770701, "grad_norm": 0.34474066263234737, "learning_rate": 1.605525857573385e-06, "loss": 0.0043, "step": 6776 }, { "epoch": 3.0832575068243857, "grad_norm": 0.14935549586130645, "learning_rate": 1.604858554239521e-06, "loss": 0.0012, "step": 6777 }, { "epoch": 3.083712465878071, "grad_norm": 0.35074923995823626, "learning_rate": 1.6041913240525735e-06, "loss": 0.0033, "step": 6778 }, { "epoch": 3.0841674249317563, "grad_norm": 0.22260920406255463, "learning_rate": 1.6035241670670648e-06, "loss": 0.0028, "step": 6779 }, { "epoch": 3.084622383985441, "grad_norm": 0.30685730004609924, "learning_rate": 1.6028570833375134e-06, "loss": 0.0043, "step": 6780 }, { "epoch": 3.0850773430391265, "grad_norm": 0.23795585970496885, "learning_rate": 1.6021900729184299e-06, "loss": 0.0042, "step": 6781 }, { "epoch": 3.085532302092812, "grad_norm": 0.18248247140502616, "learning_rate": 1.601523135864319e-06, "loss": 0.0011, "step": 6782 }, { "epoch": 3.0859872611464967, "grad_norm": 0.4006993806084541, "learning_rate": 1.6008562722296797e-06, "loss": 0.0047, "step": 6783 }, { "epoch": 3.086442220200182, "grad_norm": 0.35518753368416306, "learning_rate": 1.6001894820690058e-06, "loss": 0.0091, "step": 6784 }, { "epoch": 3.0868971792538673, "grad_norm": 0.13838630534872037, "learning_rate": 1.5995227654367833e-06, "loss": 0.0011, "step": 6785 }, { "epoch": 3.087352138307552, "grad_norm": 0.2847726122056724, "learning_rate": 1.5988561223874938e-06, "loss": 0.0027, "step": 6786 }, { "epoch": 3.0878070973612375, "grad_norm": 0.3102729618041833, "learning_rate": 1.598189552975613e-06, "loss": 0.0031, "step": 6787 }, { "epoch": 3.088262056414923, "grad_norm": 0.43863162309011966, "learning_rate": 1.5975230572556094e-06, "loss": 0.015, "step": 6788 }, { "epoch": 3.0887170154686077, "grad_norm": 0.5240383150294361, "learning_rate": 1.596856635281946e-06, "loss": 0.0087, "step": 6789 }, { "epoch": 3.089171974522293, "grad_norm": 0.16122824749611941, "learning_rate": 1.5961902871090801e-06, "loss": 0.0016, "step": 6790 }, { "epoch": 3.0896269335759783, "grad_norm": 0.2429654572948973, "learning_rate": 1.5955240127914617e-06, "loss": 0.0033, "step": 6791 }, { "epoch": 3.090081892629663, "grad_norm": 0.28362834823235056, "learning_rate": 1.594857812383537e-06, "loss": 0.0036, "step": 6792 }, { "epoch": 3.0905368516833485, "grad_norm": 0.31485279389118553, "learning_rate": 1.5941916859397432e-06, "loss": 0.0037, "step": 6793 }, { "epoch": 3.0909918107370338, "grad_norm": 0.4052275364090227, "learning_rate": 1.593525633514515e-06, "loss": 0.005, "step": 6794 }, { "epoch": 3.0914467697907186, "grad_norm": 0.14949130760288265, "learning_rate": 1.5928596551622785e-06, "loss": 0.002, "step": 6795 }, { "epoch": 3.091901728844404, "grad_norm": 0.16819636254418016, "learning_rate": 1.592193750937454e-06, "loss": 0.0019, "step": 6796 }, { "epoch": 3.0923566878980893, "grad_norm": 0.15489505488303718, "learning_rate": 1.5915279208944572e-06, "loss": 0.0008, "step": 6797 }, { "epoch": 3.092811646951774, "grad_norm": 0.1955330550523499, "learning_rate": 1.5908621650876956e-06, "loss": 0.001, "step": 6798 }, { "epoch": 3.0932666060054594, "grad_norm": 0.28054520079281875, "learning_rate": 1.5901964835715728e-06, "loss": 0.005, "step": 6799 }, { "epoch": 3.0937215650591448, "grad_norm": 0.2089907138380868, "learning_rate": 1.5895308764004835e-06, "loss": 0.0017, "step": 6800 }, { "epoch": 3.0941765241128296, "grad_norm": 0.3940880184007369, "learning_rate": 1.5888653436288198e-06, "loss": 0.0065, "step": 6801 }, { "epoch": 3.094631483166515, "grad_norm": 0.272644618356537, "learning_rate": 1.5881998853109665e-06, "loss": 0.0028, "step": 6802 }, { "epoch": 3.0950864422202002, "grad_norm": 0.4554433488910837, "learning_rate": 1.5875345015012999e-06, "loss": 0.0052, "step": 6803 }, { "epoch": 3.0955414012738856, "grad_norm": 0.33903576189329926, "learning_rate": 1.586869192254194e-06, "loss": 0.0036, "step": 6804 }, { "epoch": 3.0959963603275704, "grad_norm": 0.3992404791446398, "learning_rate": 1.5862039576240134e-06, "loss": 0.0058, "step": 6805 }, { "epoch": 3.0964513193812557, "grad_norm": 0.20859438376364414, "learning_rate": 1.5855387976651194e-06, "loss": 0.0011, "step": 6806 }, { "epoch": 3.096906278434941, "grad_norm": 0.17623621355196992, "learning_rate": 1.584873712431864e-06, "loss": 0.0019, "step": 6807 }, { "epoch": 3.097361237488626, "grad_norm": 0.6408286348322164, "learning_rate": 1.5842087019785966e-06, "loss": 0.0066, "step": 6808 }, { "epoch": 3.097816196542311, "grad_norm": 0.28048661045698187, "learning_rate": 1.583543766359659e-06, "loss": 0.0047, "step": 6809 }, { "epoch": 3.0982711555959965, "grad_norm": 0.43258014204740824, "learning_rate": 1.5828789056293857e-06, "loss": 0.0034, "step": 6810 }, { "epoch": 3.0987261146496814, "grad_norm": 0.2111080352403279, "learning_rate": 1.5822141198421068e-06, "loss": 0.0022, "step": 6811 }, { "epoch": 3.0991810737033667, "grad_norm": 0.2587966120385266, "learning_rate": 1.581549409052145e-06, "loss": 0.0025, "step": 6812 }, { "epoch": 3.099636032757052, "grad_norm": 0.17362431026158973, "learning_rate": 1.5808847733138182e-06, "loss": 0.0011, "step": 6813 }, { "epoch": 3.100090991810737, "grad_norm": 0.29310366122609377, "learning_rate": 1.5802202126814365e-06, "loss": 0.0026, "step": 6814 }, { "epoch": 3.100545950864422, "grad_norm": 0.7836695134009406, "learning_rate": 1.5795557272093053e-06, "loss": 0.02, "step": 6815 }, { "epoch": 3.1010009099181075, "grad_norm": 0.3597955204296136, "learning_rate": 1.578891316951724e-06, "loss": 0.0028, "step": 6816 }, { "epoch": 3.1014558689717924, "grad_norm": 0.21381152300183692, "learning_rate": 1.5782269819629843e-06, "loss": 0.0021, "step": 6817 }, { "epoch": 3.1019108280254777, "grad_norm": 0.2069619435131697, "learning_rate": 1.5775627222973734e-06, "loss": 0.0021, "step": 6818 }, { "epoch": 3.102365787079163, "grad_norm": 0.2916736626539095, "learning_rate": 1.5768985380091703e-06, "loss": 0.0019, "step": 6819 }, { "epoch": 3.102820746132848, "grad_norm": 0.16601026668729094, "learning_rate": 1.5762344291526507e-06, "loss": 0.0018, "step": 6820 }, { "epoch": 3.103275705186533, "grad_norm": 0.38606848282046524, "learning_rate": 1.575570395782081e-06, "loss": 0.0059, "step": 6821 }, { "epoch": 3.1037306642402185, "grad_norm": 0.5918005198268076, "learning_rate": 1.5749064379517242e-06, "loss": 0.014, "step": 6822 }, { "epoch": 3.1041856232939034, "grad_norm": 0.2972877724656884, "learning_rate": 1.5742425557158362e-06, "loss": 0.0026, "step": 6823 }, { "epoch": 3.1046405823475887, "grad_norm": 0.5461888560298997, "learning_rate": 1.5735787491286653e-06, "loss": 0.011, "step": 6824 }, { "epoch": 3.105095541401274, "grad_norm": 0.22509069844372367, "learning_rate": 1.5729150182444559e-06, "loss": 0.0032, "step": 6825 }, { "epoch": 3.105550500454959, "grad_norm": 0.38160578051526695, "learning_rate": 1.5722513631174445e-06, "loss": 0.0047, "step": 6826 }, { "epoch": 3.106005459508644, "grad_norm": 0.3708209533647161, "learning_rate": 1.5715877838018615e-06, "loss": 0.0037, "step": 6827 }, { "epoch": 3.1064604185623295, "grad_norm": 0.1491305755989454, "learning_rate": 1.5709242803519314e-06, "loss": 0.0015, "step": 6828 }, { "epoch": 3.1069153776160148, "grad_norm": 0.18495191719097803, "learning_rate": 1.570260852821875e-06, "loss": 0.0034, "step": 6829 }, { "epoch": 3.1073703366696996, "grad_norm": 0.47798385691331263, "learning_rate": 1.569597501265902e-06, "loss": 0.0105, "step": 6830 }, { "epoch": 3.107825295723385, "grad_norm": 0.30747590722838086, "learning_rate": 1.5689342257382206e-06, "loss": 0.0024, "step": 6831 }, { "epoch": 3.1082802547770703, "grad_norm": 0.17122975542842983, "learning_rate": 1.5682710262930287e-06, "loss": 0.0008, "step": 6832 }, { "epoch": 3.108735213830755, "grad_norm": 0.22943052273811226, "learning_rate": 1.5676079029845215e-06, "loss": 0.002, "step": 6833 }, { "epoch": 3.1091901728844404, "grad_norm": 0.15160314047695253, "learning_rate": 1.5669448558668855e-06, "loss": 0.0015, "step": 6834 }, { "epoch": 3.1096451319381258, "grad_norm": 0.20590083130130152, "learning_rate": 1.5662818849943011e-06, "loss": 0.0017, "step": 6835 }, { "epoch": 3.1101000909918106, "grad_norm": 0.15803136492355277, "learning_rate": 1.5656189904209463e-06, "loss": 0.0014, "step": 6836 }, { "epoch": 3.110555050045496, "grad_norm": 0.11527142230570622, "learning_rate": 1.5649561722009868e-06, "loss": 0.0014, "step": 6837 }, { "epoch": 3.1110100090991812, "grad_norm": 0.497841809750796, "learning_rate": 1.564293430388587e-06, "loss": 0.0045, "step": 6838 }, { "epoch": 3.111464968152866, "grad_norm": 0.1654786922484033, "learning_rate": 1.563630765037902e-06, "loss": 0.0023, "step": 6839 }, { "epoch": 3.1119199272065514, "grad_norm": 0.1979042880676893, "learning_rate": 1.562968176203083e-06, "loss": 0.0018, "step": 6840 }, { "epoch": 3.1123748862602367, "grad_norm": 0.3431382402611356, "learning_rate": 1.5623056639382721e-06, "loss": 0.0057, "step": 6841 }, { "epoch": 3.1128298453139216, "grad_norm": 0.22044317321968546, "learning_rate": 1.5616432282976075e-06, "loss": 0.003, "step": 6842 }, { "epoch": 3.113284804367607, "grad_norm": 0.516870300570046, "learning_rate": 1.5609808693352217e-06, "loss": 0.0043, "step": 6843 }, { "epoch": 3.113739763421292, "grad_norm": 0.7708565641551699, "learning_rate": 1.5603185871052378e-06, "loss": 0.0155, "step": 6844 }, { "epoch": 3.114194722474977, "grad_norm": 0.323751674972638, "learning_rate": 1.5596563816617766e-06, "loss": 0.0036, "step": 6845 }, { "epoch": 3.1146496815286624, "grad_norm": 0.10438897596693349, "learning_rate": 1.5589942530589482e-06, "loss": 0.001, "step": 6846 }, { "epoch": 3.1151046405823477, "grad_norm": 0.11932165518973556, "learning_rate": 1.5583322013508605e-06, "loss": 0.001, "step": 6847 }, { "epoch": 3.1155595996360326, "grad_norm": 0.1798021497365959, "learning_rate": 1.5576702265916126e-06, "loss": 0.0023, "step": 6848 }, { "epoch": 3.116014558689718, "grad_norm": 0.22387248412863475, "learning_rate": 1.5570083288352977e-06, "loss": 0.0023, "step": 6849 }, { "epoch": 3.116469517743403, "grad_norm": 0.45127644954315904, "learning_rate": 1.5563465081360047e-06, "loss": 0.0113, "step": 6850 }, { "epoch": 3.116924476797088, "grad_norm": 0.3563716363829064, "learning_rate": 1.5556847645478128e-06, "loss": 0.0043, "step": 6851 }, { "epoch": 3.1173794358507734, "grad_norm": 0.4203113183495934, "learning_rate": 1.5550230981247983e-06, "loss": 0.0018, "step": 6852 }, { "epoch": 3.1178343949044587, "grad_norm": 0.29311964515632577, "learning_rate": 1.5543615089210279e-06, "loss": 0.0037, "step": 6853 }, { "epoch": 3.1182893539581436, "grad_norm": 0.4623200433175975, "learning_rate": 1.553699996990565e-06, "loss": 0.0076, "step": 6854 }, { "epoch": 3.118744313011829, "grad_norm": 0.3266598858151655, "learning_rate": 1.5530385623874643e-06, "loss": 0.0039, "step": 6855 }, { "epoch": 3.119199272065514, "grad_norm": 0.1972946577028999, "learning_rate": 1.5523772051657757e-06, "loss": 0.0026, "step": 6856 }, { "epoch": 3.1196542311191995, "grad_norm": 0.22838197744221508, "learning_rate": 1.5517159253795434e-06, "loss": 0.0031, "step": 6857 }, { "epoch": 3.1201091901728844, "grad_norm": 0.7695259901115906, "learning_rate": 1.5510547230828026e-06, "loss": 0.0091, "step": 6858 }, { "epoch": 3.1205641492265697, "grad_norm": 0.05763660091051396, "learning_rate": 1.550393598329585e-06, "loss": 0.0003, "step": 6859 }, { "epoch": 3.121019108280255, "grad_norm": 0.2837498411760376, "learning_rate": 1.5497325511739136e-06, "loss": 0.0015, "step": 6860 }, { "epoch": 3.12147406733394, "grad_norm": 0.3232169706008415, "learning_rate": 1.5490715816698077e-06, "loss": 0.0055, "step": 6861 }, { "epoch": 3.121929026387625, "grad_norm": 0.25269373361657055, "learning_rate": 1.5484106898712771e-06, "loss": 0.0032, "step": 6862 }, { "epoch": 3.1223839854413105, "grad_norm": 0.10182852769780844, "learning_rate": 1.5477498758323268e-06, "loss": 0.0007, "step": 6863 }, { "epoch": 3.1228389444949953, "grad_norm": 0.3033373661272923, "learning_rate": 1.547089139606957e-06, "loss": 0.0034, "step": 6864 }, { "epoch": 3.1232939035486806, "grad_norm": 0.029614465632993525, "learning_rate": 1.54642848124916e-06, "loss": 0.0002, "step": 6865 }, { "epoch": 3.123748862602366, "grad_norm": 0.1967758734297654, "learning_rate": 1.5457679008129205e-06, "loss": 0.0013, "step": 6866 }, { "epoch": 3.124203821656051, "grad_norm": 0.14372613608715176, "learning_rate": 1.5451073983522196e-06, "loss": 0.0012, "step": 6867 }, { "epoch": 3.124658780709736, "grad_norm": 0.11943845706345083, "learning_rate": 1.5444469739210291e-06, "loss": 0.0009, "step": 6868 }, { "epoch": 3.1251137397634214, "grad_norm": 0.2315089402252111, "learning_rate": 1.543786627573317e-06, "loss": 0.0017, "step": 6869 }, { "epoch": 3.1255686988171063, "grad_norm": 0.17839813843838048, "learning_rate": 1.543126359363043e-06, "loss": 0.0022, "step": 6870 }, { "epoch": 3.1260236578707916, "grad_norm": 0.23330906346847702, "learning_rate": 1.5424661693441618e-06, "loss": 0.0021, "step": 6871 }, { "epoch": 3.126478616924477, "grad_norm": 0.3010669664982761, "learning_rate": 1.5418060575706218e-06, "loss": 0.0028, "step": 6872 }, { "epoch": 3.126933575978162, "grad_norm": 0.2669948638747188, "learning_rate": 1.5411460240963627e-06, "loss": 0.0033, "step": 6873 }, { "epoch": 3.127388535031847, "grad_norm": 0.32174644907188693, "learning_rate": 1.5404860689753216e-06, "loss": 0.0023, "step": 6874 }, { "epoch": 3.1278434940855324, "grad_norm": 0.1426818315644164, "learning_rate": 1.5398261922614244e-06, "loss": 0.0016, "step": 6875 }, { "epoch": 3.1282984531392173, "grad_norm": 0.23729385866930025, "learning_rate": 1.5391663940085958e-06, "loss": 0.004, "step": 6876 }, { "epoch": 3.1287534121929026, "grad_norm": 0.41322091666820443, "learning_rate": 1.538506674270749e-06, "loss": 0.013, "step": 6877 }, { "epoch": 3.129208371246588, "grad_norm": 0.168280157256299, "learning_rate": 1.5378470331017955e-06, "loss": 0.0012, "step": 6878 }, { "epoch": 3.1296633303002728, "grad_norm": 0.1349233851728633, "learning_rate": 1.5371874705556377e-06, "loss": 0.0014, "step": 6879 }, { "epoch": 3.130118289353958, "grad_norm": 0.32627486086772356, "learning_rate": 1.5365279866861716e-06, "loss": 0.0034, "step": 6880 }, { "epoch": 3.1305732484076434, "grad_norm": 0.11823554206722059, "learning_rate": 1.535868581547288e-06, "loss": 0.0007, "step": 6881 }, { "epoch": 3.1310282074613287, "grad_norm": 0.8532523859168206, "learning_rate": 1.5352092551928691e-06, "loss": 0.0104, "step": 6882 }, { "epoch": 3.1314831665150136, "grad_norm": 0.12683245855594846, "learning_rate": 1.5345500076767932e-06, "loss": 0.001, "step": 6883 }, { "epoch": 3.131938125568699, "grad_norm": 0.24201292883441958, "learning_rate": 1.5338908390529302e-06, "loss": 0.0036, "step": 6884 }, { "epoch": 3.132393084622384, "grad_norm": 0.49533792455523207, "learning_rate": 1.5332317493751452e-06, "loss": 0.0034, "step": 6885 }, { "epoch": 3.132848043676069, "grad_norm": 0.21115882355673174, "learning_rate": 1.5325727386972963e-06, "loss": 0.002, "step": 6886 }, { "epoch": 3.1333030027297544, "grad_norm": 0.24609637490534747, "learning_rate": 1.531913807073234e-06, "loss": 0.0023, "step": 6887 }, { "epoch": 3.1337579617834397, "grad_norm": 0.11529526276720735, "learning_rate": 1.531254954556804e-06, "loss": 0.0009, "step": 6888 }, { "epoch": 3.1342129208371245, "grad_norm": 0.42260324860541737, "learning_rate": 1.5305961812018435e-06, "loss": 0.0048, "step": 6889 }, { "epoch": 3.13466787989081, "grad_norm": 0.40094264055457685, "learning_rate": 1.5299374870621859e-06, "loss": 0.0039, "step": 6890 }, { "epoch": 3.135122838944495, "grad_norm": 0.5013347276001865, "learning_rate": 1.529278872191655e-06, "loss": 0.0055, "step": 6891 }, { "epoch": 3.13557779799818, "grad_norm": 0.2815309983028692, "learning_rate": 1.528620336644072e-06, "loss": 0.0032, "step": 6892 }, { "epoch": 3.1360327570518653, "grad_norm": 0.12338885923251057, "learning_rate": 1.5279618804732481e-06, "loss": 0.001, "step": 6893 }, { "epoch": 3.1364877161055507, "grad_norm": 0.5441654360127969, "learning_rate": 1.5273035037329898e-06, "loss": 0.0065, "step": 6894 }, { "epoch": 3.1369426751592355, "grad_norm": 0.18922120174320853, "learning_rate": 1.5266452064770964e-06, "loss": 0.0017, "step": 6895 }, { "epoch": 3.137397634212921, "grad_norm": 0.33950477229752973, "learning_rate": 1.5259869887593618e-06, "loss": 0.0066, "step": 6896 }, { "epoch": 3.137852593266606, "grad_norm": 0.08532470031393762, "learning_rate": 1.525328850633571e-06, "loss": 0.0006, "step": 6897 }, { "epoch": 3.138307552320291, "grad_norm": 0.08731456690939723, "learning_rate": 1.5246707921535043e-06, "loss": 0.0004, "step": 6898 }, { "epoch": 3.1387625113739763, "grad_norm": 0.6425814048628203, "learning_rate": 1.524012813372937e-06, "loss": 0.0078, "step": 6899 }, { "epoch": 3.1392174704276616, "grad_norm": 0.10117548666109298, "learning_rate": 1.5233549143456348e-06, "loss": 0.0013, "step": 6900 }, { "epoch": 3.1396724294813465, "grad_norm": 0.4424523099120269, "learning_rate": 1.522697095125359e-06, "loss": 0.0098, "step": 6901 }, { "epoch": 3.140127388535032, "grad_norm": 0.19628136593511084, "learning_rate": 1.5220393557658621e-06, "loss": 0.0024, "step": 6902 }, { "epoch": 3.140582347588717, "grad_norm": 0.110847791260711, "learning_rate": 1.5213816963208938e-06, "loss": 0.0008, "step": 6903 }, { "epoch": 3.141037306642402, "grad_norm": 0.1541345930856694, "learning_rate": 1.5207241168441928e-06, "loss": 0.0014, "step": 6904 }, { "epoch": 3.1414922656960873, "grad_norm": 0.6605587673562094, "learning_rate": 1.5200666173894945e-06, "loss": 0.002, "step": 6905 }, { "epoch": 3.1419472247497726, "grad_norm": 0.048613579479441865, "learning_rate": 1.5194091980105277e-06, "loss": 0.0004, "step": 6906 }, { "epoch": 3.1424021838034575, "grad_norm": 0.33016091036780665, "learning_rate": 1.5187518587610123e-06, "loss": 0.0062, "step": 6907 }, { "epoch": 3.142857142857143, "grad_norm": 0.44291291499500157, "learning_rate": 1.5180945996946643e-06, "loss": 0.0089, "step": 6908 }, { "epoch": 3.143312101910828, "grad_norm": 0.09606582746878313, "learning_rate": 1.5174374208651913e-06, "loss": 0.0024, "step": 6909 }, { "epoch": 3.143767060964513, "grad_norm": 0.4233610310474437, "learning_rate": 1.516780322326295e-06, "loss": 0.0022, "step": 6910 }, { "epoch": 3.1442220200181983, "grad_norm": 0.23114517562728976, "learning_rate": 1.5161233041316702e-06, "loss": 0.0017, "step": 6911 }, { "epoch": 3.1446769790718836, "grad_norm": 0.6322175152470388, "learning_rate": 1.5154663663350055e-06, "loss": 0.0094, "step": 6912 }, { "epoch": 3.145131938125569, "grad_norm": 0.4852004166976127, "learning_rate": 1.5148095089899844e-06, "loss": 0.0111, "step": 6913 }, { "epoch": 3.1455868971792538, "grad_norm": 0.2672718601426118, "learning_rate": 1.5141527321502803e-06, "loss": 0.0022, "step": 6914 }, { "epoch": 3.146041856232939, "grad_norm": 0.19346135934174716, "learning_rate": 1.5134960358695635e-06, "loss": 0.0021, "step": 6915 }, { "epoch": 3.1464968152866244, "grad_norm": 0.17347944074347296, "learning_rate": 1.5128394202014952e-06, "loss": 0.0015, "step": 6916 }, { "epoch": 3.1469517743403093, "grad_norm": 0.2637249817786577, "learning_rate": 1.512182885199732e-06, "loss": 0.0033, "step": 6917 }, { "epoch": 3.1474067333939946, "grad_norm": 0.1233679985469648, "learning_rate": 1.5115264309179218e-06, "loss": 0.0009, "step": 6918 }, { "epoch": 3.14786169244768, "grad_norm": 0.24239284320990384, "learning_rate": 1.5108700574097074e-06, "loss": 0.0018, "step": 6919 }, { "epoch": 3.1483166515013647, "grad_norm": 0.10117046990584161, "learning_rate": 1.5102137647287263e-06, "loss": 0.0006, "step": 6920 }, { "epoch": 3.14877161055505, "grad_norm": 0.47799317878909064, "learning_rate": 1.5095575529286055e-06, "loss": 0.0041, "step": 6921 }, { "epoch": 3.1492265696087354, "grad_norm": 0.23820864327586672, "learning_rate": 1.5089014220629694e-06, "loss": 0.002, "step": 6922 }, { "epoch": 3.1496815286624202, "grad_norm": 0.22428896606767842, "learning_rate": 1.508245372185433e-06, "loss": 0.0026, "step": 6923 }, { "epoch": 3.1501364877161055, "grad_norm": 0.2142975366309967, "learning_rate": 1.5075894033496063e-06, "loss": 0.0069, "step": 6924 }, { "epoch": 3.150591446769791, "grad_norm": 0.2710840552971803, "learning_rate": 1.5069335156090915e-06, "loss": 0.0027, "step": 6925 }, { "epoch": 3.1510464058234757, "grad_norm": 0.23726779609071552, "learning_rate": 1.5062777090174847e-06, "loss": 0.0019, "step": 6926 }, { "epoch": 3.151501364877161, "grad_norm": 0.44461576639641776, "learning_rate": 1.5056219836283763e-06, "loss": 0.0058, "step": 6927 }, { "epoch": 3.1519563239308463, "grad_norm": 0.411155360787182, "learning_rate": 1.504966339495349e-06, "loss": 0.0073, "step": 6928 }, { "epoch": 3.152411282984531, "grad_norm": 0.3126100029270906, "learning_rate": 1.5043107766719795e-06, "loss": 0.0028, "step": 6929 }, { "epoch": 3.1528662420382165, "grad_norm": 0.08816363372983642, "learning_rate": 1.503655295211836e-06, "loss": 0.0008, "step": 6930 }, { "epoch": 3.153321201091902, "grad_norm": 0.26329820846430424, "learning_rate": 1.5029998951684829e-06, "loss": 0.004, "step": 6931 }, { "epoch": 3.1537761601455867, "grad_norm": 0.37949124591662786, "learning_rate": 1.502344576595476e-06, "loss": 0.0046, "step": 6932 }, { "epoch": 3.154231119199272, "grad_norm": 0.38088947365969045, "learning_rate": 1.5016893395463633e-06, "loss": 0.0026, "step": 6933 }, { "epoch": 3.1546860782529573, "grad_norm": 0.15150400098070624, "learning_rate": 1.5010341840746912e-06, "loss": 0.0014, "step": 6934 }, { "epoch": 3.1551410373066426, "grad_norm": 0.26877478925476916, "learning_rate": 1.500379110233994e-06, "loss": 0.0014, "step": 6935 }, { "epoch": 3.1555959963603275, "grad_norm": 0.056593181407895946, "learning_rate": 1.4997241180778013e-06, "loss": 0.0004, "step": 6936 }, { "epoch": 3.156050955414013, "grad_norm": 0.23475656848069756, "learning_rate": 1.4990692076596368e-06, "loss": 0.0041, "step": 6937 }, { "epoch": 3.156505914467698, "grad_norm": 0.11878540783206613, "learning_rate": 1.4984143790330164e-06, "loss": 0.0007, "step": 6938 }, { "epoch": 3.156960873521383, "grad_norm": 0.5665791141181236, "learning_rate": 1.4977596322514498e-06, "loss": 0.0058, "step": 6939 }, { "epoch": 3.1574158325750683, "grad_norm": 0.25265405211121783, "learning_rate": 1.4971049673684396e-06, "loss": 0.0033, "step": 6940 }, { "epoch": 3.1578707916287536, "grad_norm": 0.28698549126311895, "learning_rate": 1.4964503844374824e-06, "loss": 0.0049, "step": 6941 }, { "epoch": 3.1583257506824385, "grad_norm": 0.07203852221889555, "learning_rate": 1.4957958835120684e-06, "loss": 0.0005, "step": 6942 }, { "epoch": 3.158780709736124, "grad_norm": 0.9418265328067228, "learning_rate": 1.4951414646456794e-06, "loss": 0.0058, "step": 6943 }, { "epoch": 3.159235668789809, "grad_norm": 0.41737225175846193, "learning_rate": 1.4944871278917928e-06, "loss": 0.0085, "step": 6944 }, { "epoch": 3.159690627843494, "grad_norm": 0.5789792646681481, "learning_rate": 1.4938328733038762e-06, "loss": 0.0067, "step": 6945 }, { "epoch": 3.1601455868971793, "grad_norm": 0.5842886100386547, "learning_rate": 1.4931787009353943e-06, "loss": 0.0099, "step": 6946 }, { "epoch": 3.1606005459508646, "grad_norm": 0.22712449114098612, "learning_rate": 1.4925246108398008e-06, "loss": 0.0029, "step": 6947 }, { "epoch": 3.1610555050045495, "grad_norm": 0.5843906005830881, "learning_rate": 1.491870603070547e-06, "loss": 0.005, "step": 6948 }, { "epoch": 3.1615104640582348, "grad_norm": 0.13863781774140338, "learning_rate": 1.4912166776810757e-06, "loss": 0.0019, "step": 6949 }, { "epoch": 3.16196542311192, "grad_norm": 0.20550090497114704, "learning_rate": 1.4905628347248214e-06, "loss": 0.0013, "step": 6950 }, { "epoch": 3.162420382165605, "grad_norm": 0.13786176485536444, "learning_rate": 1.4899090742552136e-06, "loss": 0.0017, "step": 6951 }, { "epoch": 3.1628753412192903, "grad_norm": 0.6500245896778648, "learning_rate": 1.4892553963256745e-06, "loss": 0.0125, "step": 6952 }, { "epoch": 3.1633303002729756, "grad_norm": 0.2891464617245846, "learning_rate": 1.4886018009896208e-06, "loss": 0.0014, "step": 6953 }, { "epoch": 3.1637852593266604, "grad_norm": 0.44351997899158213, "learning_rate": 1.4879482883004593e-06, "loss": 0.0036, "step": 6954 }, { "epoch": 3.1642402183803457, "grad_norm": 0.2658160161829256, "learning_rate": 1.4872948583115935e-06, "loss": 0.0039, "step": 6955 }, { "epoch": 3.164695177434031, "grad_norm": 0.4766651570375473, "learning_rate": 1.4866415110764193e-06, "loss": 0.0105, "step": 6956 }, { "epoch": 3.165150136487716, "grad_norm": 0.1196671078312413, "learning_rate": 1.4859882466483239e-06, "loss": 0.0005, "step": 6957 }, { "epoch": 3.1656050955414012, "grad_norm": 0.2983031784643379, "learning_rate": 1.4853350650806903e-06, "loss": 0.0076, "step": 6958 }, { "epoch": 3.1660600545950865, "grad_norm": 0.4525417858094299, "learning_rate": 1.4846819664268925e-06, "loss": 0.0026, "step": 6959 }, { "epoch": 3.1665150136487714, "grad_norm": 0.2943677478855234, "learning_rate": 1.4840289507402995e-06, "loss": 0.0062, "step": 6960 }, { "epoch": 3.1669699727024567, "grad_norm": 0.051901415440677785, "learning_rate": 1.4833760180742718e-06, "loss": 0.0003, "step": 6961 }, { "epoch": 3.167424931756142, "grad_norm": 0.39556639128862736, "learning_rate": 1.4827231684821652e-06, "loss": 0.007, "step": 6962 }, { "epoch": 3.167879890809827, "grad_norm": 0.24520366352242695, "learning_rate": 1.4820704020173281e-06, "loss": 0.0018, "step": 6963 }, { "epoch": 3.168334849863512, "grad_norm": 0.11429154082346786, "learning_rate": 1.4814177187331003e-06, "loss": 0.0011, "step": 6964 }, { "epoch": 3.1687898089171975, "grad_norm": 0.2805603536478743, "learning_rate": 1.480765118682817e-06, "loss": 0.0042, "step": 6965 }, { "epoch": 3.1692447679708824, "grad_norm": 0.13963996823940888, "learning_rate": 1.4801126019198048e-06, "loss": 0.0015, "step": 6966 }, { "epoch": 3.1696997270245677, "grad_norm": 0.12177785885817916, "learning_rate": 1.479460168497386e-06, "loss": 0.0007, "step": 6967 }, { "epoch": 3.170154686078253, "grad_norm": 0.27363310048004086, "learning_rate": 1.478807818468872e-06, "loss": 0.0048, "step": 6968 }, { "epoch": 3.1706096451319383, "grad_norm": 0.34117928799899755, "learning_rate": 1.4781555518875718e-06, "loss": 0.0075, "step": 6969 }, { "epoch": 3.171064604185623, "grad_norm": 0.3938271016612936, "learning_rate": 1.4775033688067862e-06, "loss": 0.0029, "step": 6970 }, { "epoch": 3.1715195632393085, "grad_norm": 0.2861886041770689, "learning_rate": 1.4768512692798075e-06, "loss": 0.0021, "step": 6971 }, { "epoch": 3.171974522292994, "grad_norm": 0.29275033629582703, "learning_rate": 1.476199253359922e-06, "loss": 0.0014, "step": 6972 }, { "epoch": 3.1724294813466787, "grad_norm": 0.2034175594803535, "learning_rate": 1.4755473211004106e-06, "loss": 0.0034, "step": 6973 }, { "epoch": 3.172884440400364, "grad_norm": 0.3395910900439031, "learning_rate": 1.4748954725545456e-06, "loss": 0.0058, "step": 6974 }, { "epoch": 3.1733393994540493, "grad_norm": 0.3397068544475348, "learning_rate": 1.4742437077755925e-06, "loss": 0.0031, "step": 6975 }, { "epoch": 3.173794358507734, "grad_norm": 0.17911702319171802, "learning_rate": 1.4735920268168126e-06, "loss": 0.0017, "step": 6976 }, { "epoch": 3.1742493175614195, "grad_norm": 0.28645317385423313, "learning_rate": 1.4729404297314559e-06, "loss": 0.0042, "step": 6977 }, { "epoch": 3.174704276615105, "grad_norm": 0.3129438642053258, "learning_rate": 1.47228891657277e-06, "loss": 0.0052, "step": 6978 }, { "epoch": 3.1751592356687897, "grad_norm": 0.34753541986898323, "learning_rate": 1.4716374873939922e-06, "loss": 0.0045, "step": 6979 }, { "epoch": 3.175614194722475, "grad_norm": 0.8176329542773638, "learning_rate": 1.4709861422483557e-06, "loss": 0.0175, "step": 6980 }, { "epoch": 3.1760691537761603, "grad_norm": 0.1789521086581688, "learning_rate": 1.470334881189084e-06, "loss": 0.0016, "step": 6981 }, { "epoch": 3.176524112829845, "grad_norm": 0.3080263233145323, "learning_rate": 1.469683704269395e-06, "loss": 0.0042, "step": 6982 }, { "epoch": 3.1769790718835305, "grad_norm": 0.4265768647037812, "learning_rate": 1.4690326115425018e-06, "loss": 0.0054, "step": 6983 }, { "epoch": 3.1774340309372158, "grad_norm": 0.2578822196270599, "learning_rate": 1.4683816030616077e-06, "loss": 0.0026, "step": 6984 }, { "epoch": 3.1778889899909006, "grad_norm": 0.18821597573003374, "learning_rate": 1.4677306788799106e-06, "loss": 0.0013, "step": 6985 }, { "epoch": 3.178343949044586, "grad_norm": 0.2108330877486421, "learning_rate": 1.4670798390506002e-06, "loss": 0.002, "step": 6986 }, { "epoch": 3.1787989080982713, "grad_norm": 0.09240784055782118, "learning_rate": 1.4664290836268613e-06, "loss": 0.0008, "step": 6987 }, { "epoch": 3.179253867151956, "grad_norm": 0.1214391262464226, "learning_rate": 1.4657784126618697e-06, "loss": 0.0015, "step": 6988 }, { "epoch": 3.1797088262056414, "grad_norm": 0.19008684784857224, "learning_rate": 1.4651278262087954e-06, "loss": 0.0021, "step": 6989 }, { "epoch": 3.1801637852593267, "grad_norm": 0.27227910709379893, "learning_rate": 1.4644773243208021e-06, "loss": 0.0037, "step": 6990 }, { "epoch": 3.180618744313012, "grad_norm": 0.23925212127657935, "learning_rate": 1.4638269070510453e-06, "loss": 0.0012, "step": 6991 }, { "epoch": 3.181073703366697, "grad_norm": 0.3698377449866042, "learning_rate": 1.463176574452675e-06, "loss": 0.0089, "step": 6992 }, { "epoch": 3.1815286624203822, "grad_norm": 0.2511329543724763, "learning_rate": 1.462526326578832e-06, "loss": 0.0059, "step": 6993 }, { "epoch": 3.1819836214740675, "grad_norm": 0.24747642972490677, "learning_rate": 1.461876163482653e-06, "loss": 0.0024, "step": 6994 }, { "epoch": 3.1824385805277524, "grad_norm": 0.13829366401300883, "learning_rate": 1.4612260852172656e-06, "loss": 0.0015, "step": 6995 }, { "epoch": 3.1828935395814377, "grad_norm": 0.20984673655477235, "learning_rate": 1.4605760918357903e-06, "loss": 0.0022, "step": 6996 }, { "epoch": 3.183348498635123, "grad_norm": 0.10802846044875243, "learning_rate": 1.4599261833913443e-06, "loss": 0.0009, "step": 6997 }, { "epoch": 3.183803457688808, "grad_norm": 0.14145092855091615, "learning_rate": 1.4592763599370336e-06, "loss": 0.0014, "step": 6998 }, { "epoch": 3.184258416742493, "grad_norm": 0.3135551455751205, "learning_rate": 1.4586266215259575e-06, "loss": 0.0043, "step": 6999 }, { "epoch": 3.1847133757961785, "grad_norm": 0.424598530578906, "learning_rate": 1.4579769682112127e-06, "loss": 0.0072, "step": 7000 }, { "epoch": 3.1851683348498634, "grad_norm": 0.1525553037796216, "learning_rate": 1.457327400045884e-06, "loss": 0.0008, "step": 7001 }, { "epoch": 3.1856232939035487, "grad_norm": 0.553476222387892, "learning_rate": 1.4566779170830514e-06, "loss": 0.0156, "step": 7002 }, { "epoch": 3.186078252957234, "grad_norm": 0.3641217637962176, "learning_rate": 1.456028519375787e-06, "loss": 0.0024, "step": 7003 }, { "epoch": 3.186533212010919, "grad_norm": 0.35093848836969543, "learning_rate": 1.4553792069771574e-06, "loss": 0.0071, "step": 7004 }, { "epoch": 3.186988171064604, "grad_norm": 0.23483560193905978, "learning_rate": 1.4547299799402225e-06, "loss": 0.0039, "step": 7005 }, { "epoch": 3.1874431301182895, "grad_norm": 0.25846512692102547, "learning_rate": 1.4540808383180333e-06, "loss": 0.0019, "step": 7006 }, { "epoch": 3.1878980891719744, "grad_norm": 0.2302071833603282, "learning_rate": 1.4534317821636345e-06, "loss": 0.0018, "step": 7007 }, { "epoch": 3.1883530482256597, "grad_norm": 0.31596849236129537, "learning_rate": 1.4527828115300646e-06, "loss": 0.0035, "step": 7008 }, { "epoch": 3.188808007279345, "grad_norm": 0.4332499068891683, "learning_rate": 1.4521339264703526e-06, "loss": 0.0047, "step": 7009 }, { "epoch": 3.18926296633303, "grad_norm": 0.272528204724086, "learning_rate": 1.4514851270375246e-06, "loss": 0.0043, "step": 7010 }, { "epoch": 3.189717925386715, "grad_norm": 0.26715798169116417, "learning_rate": 1.4508364132845976e-06, "loss": 0.0044, "step": 7011 }, { "epoch": 3.1901728844404005, "grad_norm": 0.31783237341819176, "learning_rate": 1.450187785264581e-06, "loss": 0.003, "step": 7012 }, { "epoch": 3.1906278434940853, "grad_norm": 0.46739770874067466, "learning_rate": 1.4495392430304777e-06, "loss": 0.0124, "step": 7013 }, { "epoch": 3.1910828025477707, "grad_norm": 0.3085643073266237, "learning_rate": 1.4488907866352826e-06, "loss": 0.006, "step": 7014 }, { "epoch": 3.191537761601456, "grad_norm": 0.026240250450532535, "learning_rate": 1.4482424161319865e-06, "loss": 0.0002, "step": 7015 }, { "epoch": 3.191992720655141, "grad_norm": 0.1234082050621467, "learning_rate": 1.4475941315735706e-06, "loss": 0.0011, "step": 7016 }, { "epoch": 3.192447679708826, "grad_norm": 0.35121676375173017, "learning_rate": 1.4469459330130087e-06, "loss": 0.0037, "step": 7017 }, { "epoch": 3.1929026387625115, "grad_norm": 0.26743215688961997, "learning_rate": 1.4462978205032707e-06, "loss": 0.003, "step": 7018 }, { "epoch": 3.1933575978161963, "grad_norm": 0.2048788521877353, "learning_rate": 1.4456497940973152e-06, "loss": 0.0024, "step": 7019 }, { "epoch": 3.1938125568698816, "grad_norm": 0.6143127609472216, "learning_rate": 1.445001853848098e-06, "loss": 0.0047, "step": 7020 }, { "epoch": 3.194267515923567, "grad_norm": 0.28813420407047863, "learning_rate": 1.444353999808565e-06, "loss": 0.0033, "step": 7021 }, { "epoch": 3.194722474977252, "grad_norm": 0.1280484982815896, "learning_rate": 1.4437062320316557e-06, "loss": 0.0027, "step": 7022 }, { "epoch": 3.195177434030937, "grad_norm": 0.37369217786332276, "learning_rate": 1.4430585505703026e-06, "loss": 0.0054, "step": 7023 }, { "epoch": 3.1956323930846224, "grad_norm": 0.3166717571352752, "learning_rate": 1.4424109554774312e-06, "loss": 0.0043, "step": 7024 }, { "epoch": 3.1960873521383077, "grad_norm": 0.19985986230466862, "learning_rate": 1.4417634468059617e-06, "loss": 0.0016, "step": 7025 }, { "epoch": 3.1965423111919926, "grad_norm": 0.4743543805898379, "learning_rate": 1.441116024608804e-06, "loss": 0.0064, "step": 7026 }, { "epoch": 3.196997270245678, "grad_norm": 0.269701752249463, "learning_rate": 1.4404686889388631e-06, "loss": 0.006, "step": 7027 }, { "epoch": 3.1974522292993632, "grad_norm": 0.11094736315264271, "learning_rate": 1.439821439849035e-06, "loss": 0.0008, "step": 7028 }, { "epoch": 3.197907188353048, "grad_norm": 0.23474279946263965, "learning_rate": 1.4391742773922124e-06, "loss": 0.0029, "step": 7029 }, { "epoch": 3.1983621474067334, "grad_norm": 0.5430806120592543, "learning_rate": 1.438527201621277e-06, "loss": 0.0031, "step": 7030 }, { "epoch": 3.1988171064604187, "grad_norm": 0.10520295736204245, "learning_rate": 1.4378802125891038e-06, "loss": 0.0008, "step": 7031 }, { "epoch": 3.1992720655141036, "grad_norm": 0.22874613284967432, "learning_rate": 1.4372333103485648e-06, "loss": 0.0015, "step": 7032 }, { "epoch": 3.199727024567789, "grad_norm": 0.4617406267205746, "learning_rate": 1.4365864949525187e-06, "loss": 0.0067, "step": 7033 }, { "epoch": 3.200181983621474, "grad_norm": 0.23244193359455767, "learning_rate": 1.4359397664538232e-06, "loss": 0.0018, "step": 7034 }, { "epoch": 3.200636942675159, "grad_norm": 0.3841200818766199, "learning_rate": 1.4352931249053248e-06, "loss": 0.005, "step": 7035 }, { "epoch": 3.2010919017288444, "grad_norm": 0.22105128915445948, "learning_rate": 1.4346465703598638e-06, "loss": 0.0019, "step": 7036 }, { "epoch": 3.2015468607825297, "grad_norm": 0.1345016989883617, "learning_rate": 1.4340001028702733e-06, "loss": 0.0007, "step": 7037 }, { "epoch": 3.2020018198362146, "grad_norm": 0.4668713937767562, "learning_rate": 1.43335372248938e-06, "loss": 0.0035, "step": 7038 }, { "epoch": 3.2024567788899, "grad_norm": 0.12669710219118432, "learning_rate": 1.432707429270005e-06, "loss": 0.001, "step": 7039 }, { "epoch": 3.202911737943585, "grad_norm": 0.20287282493978034, "learning_rate": 1.432061223264959e-06, "loss": 0.0018, "step": 7040 }, { "epoch": 3.20336669699727, "grad_norm": 0.2891615812377252, "learning_rate": 1.4314151045270469e-06, "loss": 0.0037, "step": 7041 }, { "epoch": 3.2038216560509554, "grad_norm": 0.5753184290470229, "learning_rate": 1.4307690731090666e-06, "loss": 0.0043, "step": 7042 }, { "epoch": 3.2042766151046407, "grad_norm": 0.017946648257106325, "learning_rate": 1.4301231290638083e-06, "loss": 0.0001, "step": 7043 }, { "epoch": 3.2047315741583255, "grad_norm": 0.3959447599446077, "learning_rate": 1.429477272444057e-06, "loss": 0.0063, "step": 7044 }, { "epoch": 3.205186533212011, "grad_norm": 0.44950016142041826, "learning_rate": 1.428831503302588e-06, "loss": 0.0084, "step": 7045 }, { "epoch": 3.205641492265696, "grad_norm": 0.43228735415788405, "learning_rate": 1.4281858216921719e-06, "loss": 0.008, "step": 7046 }, { "epoch": 3.2060964513193815, "grad_norm": 0.2589888496045893, "learning_rate": 1.4275402276655703e-06, "loss": 0.0044, "step": 7047 }, { "epoch": 3.2065514103730663, "grad_norm": 0.3499440564858497, "learning_rate": 1.4268947212755371e-06, "loss": 0.0045, "step": 7048 }, { "epoch": 3.2070063694267517, "grad_norm": 0.2831657218059198, "learning_rate": 1.4262493025748219e-06, "loss": 0.0042, "step": 7049 }, { "epoch": 3.207461328480437, "grad_norm": 0.38664267252478424, "learning_rate": 1.425603971616165e-06, "loss": 0.0061, "step": 7050 }, { "epoch": 3.207916287534122, "grad_norm": 0.32896328983772155, "learning_rate": 1.4249587284522998e-06, "loss": 0.0054, "step": 7051 }, { "epoch": 3.208371246587807, "grad_norm": 0.2794212952539015, "learning_rate": 1.4243135731359512e-06, "loss": 0.0041, "step": 7052 }, { "epoch": 3.2088262056414925, "grad_norm": 0.16287489316451256, "learning_rate": 1.4236685057198395e-06, "loss": 0.0014, "step": 7053 }, { "epoch": 3.2092811646951773, "grad_norm": 0.15159584255230021, "learning_rate": 1.4230235262566783e-06, "loss": 0.0016, "step": 7054 }, { "epoch": 3.2097361237488626, "grad_norm": 0.22136942358800574, "learning_rate": 1.4223786347991706e-06, "loss": 0.0019, "step": 7055 }, { "epoch": 3.210191082802548, "grad_norm": 0.3450304103599148, "learning_rate": 1.4217338314000146e-06, "loss": 0.0054, "step": 7056 }, { "epoch": 3.210646041856233, "grad_norm": 0.3258613162945162, "learning_rate": 1.4210891161118991e-06, "loss": 0.0041, "step": 7057 }, { "epoch": 3.211101000909918, "grad_norm": 0.2256956504072615, "learning_rate": 1.4204444889875102e-06, "loss": 0.0022, "step": 7058 }, { "epoch": 3.2115559599636034, "grad_norm": 0.28420815850890974, "learning_rate": 1.419799950079521e-06, "loss": 0.0053, "step": 7059 }, { "epoch": 3.2120109190172883, "grad_norm": 0.3295635534632754, "learning_rate": 1.419155499440603e-06, "loss": 0.0029, "step": 7060 }, { "epoch": 3.2124658780709736, "grad_norm": 0.20169589184861295, "learning_rate": 1.4185111371234162e-06, "loss": 0.0015, "step": 7061 }, { "epoch": 3.212920837124659, "grad_norm": 0.2968303519059903, "learning_rate": 1.4178668631806147e-06, "loss": 0.0073, "step": 7062 }, { "epoch": 3.213375796178344, "grad_norm": 0.20647972081940374, "learning_rate": 1.4172226776648471e-06, "loss": 0.002, "step": 7063 }, { "epoch": 3.213830755232029, "grad_norm": 0.28668033147353805, "learning_rate": 1.4165785806287525e-06, "loss": 0.004, "step": 7064 }, { "epoch": 3.2142857142857144, "grad_norm": 0.23621000544562654, "learning_rate": 1.4159345721249637e-06, "loss": 0.0015, "step": 7065 }, { "epoch": 3.2147406733393993, "grad_norm": 0.03687707897749845, "learning_rate": 1.415290652206105e-06, "loss": 0.0002, "step": 7066 }, { "epoch": 3.2151956323930846, "grad_norm": 0.08545768996192986, "learning_rate": 1.4146468209247956e-06, "loss": 0.0011, "step": 7067 }, { "epoch": 3.21565059144677, "grad_norm": 0.4178357870683667, "learning_rate": 1.4140030783336478e-06, "loss": 0.0033, "step": 7068 }, { "epoch": 3.2161055505004548, "grad_norm": 0.36184373493642336, "learning_rate": 1.4133594244852638e-06, "loss": 0.002, "step": 7069 }, { "epoch": 3.21656050955414, "grad_norm": 0.1331219666473779, "learning_rate": 1.412715859432241e-06, "loss": 0.0007, "step": 7070 }, { "epoch": 3.2170154686078254, "grad_norm": 0.2720729652649089, "learning_rate": 1.4120723832271665e-06, "loss": 0.004, "step": 7071 }, { "epoch": 3.2174704276615103, "grad_norm": 0.16204769937784766, "learning_rate": 1.411428995922625e-06, "loss": 0.0014, "step": 7072 }, { "epoch": 3.2179253867151956, "grad_norm": 0.4192660115305755, "learning_rate": 1.4107856975711886e-06, "loss": 0.004, "step": 7073 }, { "epoch": 3.218380345768881, "grad_norm": 0.19073428752828564, "learning_rate": 1.4101424882254277e-06, "loss": 0.0031, "step": 7074 }, { "epoch": 3.2188353048225657, "grad_norm": 0.39109956217102065, "learning_rate": 1.4094993679379009e-06, "loss": 0.008, "step": 7075 }, { "epoch": 3.219290263876251, "grad_norm": 0.13473384115360404, "learning_rate": 1.4088563367611597e-06, "loss": 0.0013, "step": 7076 }, { "epoch": 3.2197452229299364, "grad_norm": 0.26958913273979274, "learning_rate": 1.4082133947477522e-06, "loss": 0.0041, "step": 7077 }, { "epoch": 3.2202001819836217, "grad_norm": 0.19406175866056705, "learning_rate": 1.4075705419502162e-06, "loss": 0.0032, "step": 7078 }, { "epoch": 3.2206551410373065, "grad_norm": 0.3554101373283718, "learning_rate": 1.4069277784210813e-06, "loss": 0.0038, "step": 7079 }, { "epoch": 3.221110100090992, "grad_norm": 0.24649868255296858, "learning_rate": 1.4062851042128716e-06, "loss": 0.0015, "step": 7080 }, { "epoch": 3.221565059144677, "grad_norm": 0.08696198881055343, "learning_rate": 1.4056425193781048e-06, "loss": 0.0007, "step": 7081 }, { "epoch": 3.222020018198362, "grad_norm": 0.19608447053039674, "learning_rate": 1.4050000239692885e-06, "loss": 0.0014, "step": 7082 }, { "epoch": 3.2224749772520473, "grad_norm": 0.2389225320120725, "learning_rate": 1.4043576180389257e-06, "loss": 0.0019, "step": 7083 }, { "epoch": 3.2229299363057327, "grad_norm": 0.20674145182060352, "learning_rate": 1.403715301639511e-06, "loss": 0.0012, "step": 7084 }, { "epoch": 3.2233848953594175, "grad_norm": 0.6291963326235939, "learning_rate": 1.403073074823531e-06, "loss": 0.0041, "step": 7085 }, { "epoch": 3.223839854413103, "grad_norm": 1.036777657967117, "learning_rate": 1.4024309376434645e-06, "loss": 0.0128, "step": 7086 }, { "epoch": 3.224294813466788, "grad_norm": 0.27589943951583573, "learning_rate": 1.4017888901517851e-06, "loss": 0.0036, "step": 7087 }, { "epoch": 3.224749772520473, "grad_norm": 0.11355779248340858, "learning_rate": 1.4011469324009594e-06, "loss": 0.0007, "step": 7088 }, { "epoch": 3.2252047315741583, "grad_norm": 0.209648661574973, "learning_rate": 1.400505064443444e-06, "loss": 0.0036, "step": 7089 }, { "epoch": 3.2256596906278436, "grad_norm": 0.19272867052086629, "learning_rate": 1.3998632863316892e-06, "loss": 0.0023, "step": 7090 }, { "epoch": 3.2261146496815285, "grad_norm": 0.035935341687968476, "learning_rate": 1.3992215981181379e-06, "loss": 0.0002, "step": 7091 }, { "epoch": 3.226569608735214, "grad_norm": 0.07537280672241484, "learning_rate": 1.398579999855227e-06, "loss": 0.0006, "step": 7092 }, { "epoch": 3.227024567788899, "grad_norm": 0.29327632997260994, "learning_rate": 1.3979384915953847e-06, "loss": 0.0023, "step": 7093 }, { "epoch": 3.227479526842584, "grad_norm": 0.11845844570816738, "learning_rate": 1.3972970733910313e-06, "loss": 0.0008, "step": 7094 }, { "epoch": 3.2279344858962693, "grad_norm": 0.14814570764139773, "learning_rate": 1.396655745294582e-06, "loss": 0.0013, "step": 7095 }, { "epoch": 3.2283894449499546, "grad_norm": 0.1771854947538731, "learning_rate": 1.3960145073584415e-06, "loss": 0.001, "step": 7096 }, { "epoch": 3.2288444040036395, "grad_norm": 0.7258379948608844, "learning_rate": 1.3953733596350111e-06, "loss": 0.0054, "step": 7097 }, { "epoch": 3.229299363057325, "grad_norm": 0.7057574893924956, "learning_rate": 1.3947323021766812e-06, "loss": 0.011, "step": 7098 }, { "epoch": 3.22975432211101, "grad_norm": 0.051855133329932045, "learning_rate": 1.3940913350358362e-06, "loss": 0.0004, "step": 7099 }, { "epoch": 3.2302092811646954, "grad_norm": 0.17485703104960276, "learning_rate": 1.3934504582648523e-06, "loss": 0.0011, "step": 7100 }, { "epoch": 3.2306642402183803, "grad_norm": 0.23260787844861122, "learning_rate": 1.3928096719160994e-06, "loss": 0.0018, "step": 7101 }, { "epoch": 3.2311191992720656, "grad_norm": 0.18222675634572522, "learning_rate": 1.3921689760419416e-06, "loss": 0.0009, "step": 7102 }, { "epoch": 3.231574158325751, "grad_norm": 0.17740367490519135, "learning_rate": 1.391528370694732e-06, "loss": 0.0022, "step": 7103 }, { "epoch": 3.2320291173794358, "grad_norm": 0.32518266117207695, "learning_rate": 1.3908878559268177e-06, "loss": 0.0031, "step": 7104 }, { "epoch": 3.232484076433121, "grad_norm": 0.1496407108924249, "learning_rate": 1.3902474317905384e-06, "loss": 0.0009, "step": 7105 }, { "epoch": 3.2329390354868064, "grad_norm": 0.2036043464290669, "learning_rate": 1.3896070983382284e-06, "loss": 0.0013, "step": 7106 }, { "epoch": 3.2333939945404913, "grad_norm": 0.3188499709779792, "learning_rate": 1.3889668556222119e-06, "loss": 0.0053, "step": 7107 }, { "epoch": 3.2338489535941766, "grad_norm": 0.3158857259096732, "learning_rate": 1.3883267036948056e-06, "loss": 0.0024, "step": 7108 }, { "epoch": 3.234303912647862, "grad_norm": 0.4159926349734479, "learning_rate": 1.3876866426083214e-06, "loss": 0.0069, "step": 7109 }, { "epoch": 3.2347588717015467, "grad_norm": 0.12325919198540973, "learning_rate": 1.387046672415061e-06, "loss": 0.0008, "step": 7110 }, { "epoch": 3.235213830755232, "grad_norm": 0.2889532708324326, "learning_rate": 1.3864067931673214e-06, "loss": 0.0047, "step": 7111 }, { "epoch": 3.2356687898089174, "grad_norm": 0.36669586642938995, "learning_rate": 1.3857670049173897e-06, "loss": 0.0022, "step": 7112 }, { "epoch": 3.2361237488626022, "grad_norm": 0.3503050181682975, "learning_rate": 1.3851273077175465e-06, "loss": 0.0091, "step": 7113 }, { "epoch": 3.2365787079162875, "grad_norm": 0.17357619769222885, "learning_rate": 1.384487701620065e-06, "loss": 0.002, "step": 7114 }, { "epoch": 3.237033666969973, "grad_norm": 0.2956585839005435, "learning_rate": 1.38384818667721e-06, "loss": 0.0024, "step": 7115 }, { "epoch": 3.2374886260236577, "grad_norm": 0.15642716569884477, "learning_rate": 1.3832087629412406e-06, "loss": 0.0019, "step": 7116 }, { "epoch": 3.237943585077343, "grad_norm": 0.66165783612274, "learning_rate": 1.3825694304644089e-06, "loss": 0.0107, "step": 7117 }, { "epoch": 3.2383985441310283, "grad_norm": 0.27304648536896886, "learning_rate": 1.3819301892989567e-06, "loss": 0.0048, "step": 7118 }, { "epoch": 3.238853503184713, "grad_norm": 0.4515093663826703, "learning_rate": 1.3812910394971205e-06, "loss": 0.0039, "step": 7119 }, { "epoch": 3.2393084622383985, "grad_norm": 0.09308015520910075, "learning_rate": 1.3806519811111275e-06, "loss": 0.0005, "step": 7120 }, { "epoch": 3.239763421292084, "grad_norm": 0.4526957648902682, "learning_rate": 1.3800130141932005e-06, "loss": 0.0024, "step": 7121 }, { "epoch": 3.2402183803457687, "grad_norm": 0.1522582487904685, "learning_rate": 1.3793741387955512e-06, "loss": 0.0014, "step": 7122 }, { "epoch": 3.240673339399454, "grad_norm": 0.1645685834997879, "learning_rate": 1.378735354970388e-06, "loss": 0.0047, "step": 7123 }, { "epoch": 3.2411282984531393, "grad_norm": 0.21129812054138702, "learning_rate": 1.3780966627699078e-06, "loss": 0.0022, "step": 7124 }, { "epoch": 3.241583257506824, "grad_norm": 0.13207707098569574, "learning_rate": 1.3774580622463005e-06, "loss": 0.0006, "step": 7125 }, { "epoch": 3.2420382165605095, "grad_norm": 0.15031150228763338, "learning_rate": 1.3768195534517523e-06, "loss": 0.0012, "step": 7126 }, { "epoch": 3.242493175614195, "grad_norm": 0.07450162157423998, "learning_rate": 1.3761811364384378e-06, "loss": 0.0004, "step": 7127 }, { "epoch": 3.2429481346678797, "grad_norm": 0.058581165561678715, "learning_rate": 1.3755428112585257e-06, "loss": 0.0007, "step": 7128 }, { "epoch": 3.243403093721565, "grad_norm": 0.11622244970738194, "learning_rate": 1.3749045779641763e-06, "loss": 0.0008, "step": 7129 }, { "epoch": 3.2438580527752503, "grad_norm": 0.27283067972129116, "learning_rate": 1.3742664366075436e-06, "loss": 0.006, "step": 7130 }, { "epoch": 3.244313011828935, "grad_norm": 0.16994286320906815, "learning_rate": 1.3736283872407753e-06, "loss": 0.001, "step": 7131 }, { "epoch": 3.2447679708826205, "grad_norm": 0.49538595959333104, "learning_rate": 1.3729904299160083e-06, "loss": 0.0052, "step": 7132 }, { "epoch": 3.245222929936306, "grad_norm": 0.2391098596015785, "learning_rate": 1.3723525646853738e-06, "loss": 0.0011, "step": 7133 }, { "epoch": 3.245677888989991, "grad_norm": 0.212529369531736, "learning_rate": 1.3717147916009943e-06, "loss": 0.0033, "step": 7134 }, { "epoch": 3.246132848043676, "grad_norm": 0.41436621176206273, "learning_rate": 1.3710771107149878e-06, "loss": 0.0071, "step": 7135 }, { "epoch": 3.2465878070973613, "grad_norm": 0.3559344955954863, "learning_rate": 1.3704395220794608e-06, "loss": 0.0071, "step": 7136 }, { "epoch": 3.2470427661510466, "grad_norm": 0.23139735965678487, "learning_rate": 1.3698020257465158e-06, "loss": 0.0037, "step": 7137 }, { "epoch": 3.2474977252047315, "grad_norm": 0.12478757928629627, "learning_rate": 1.3691646217682454e-06, "loss": 0.0014, "step": 7138 }, { "epoch": 3.2479526842584168, "grad_norm": 0.09892930943393129, "learning_rate": 1.3685273101967345e-06, "loss": 0.0006, "step": 7139 }, { "epoch": 3.248407643312102, "grad_norm": 0.1542786528341422, "learning_rate": 1.3678900910840627e-06, "loss": 0.0012, "step": 7140 }, { "epoch": 3.248862602365787, "grad_norm": 0.2416356501126442, "learning_rate": 1.3672529644823004e-06, "loss": 0.0018, "step": 7141 }, { "epoch": 3.2493175614194723, "grad_norm": 0.4638259510943004, "learning_rate": 1.3666159304435104e-06, "loss": 0.0043, "step": 7142 }, { "epoch": 3.2497725204731576, "grad_norm": 0.049350064793745374, "learning_rate": 1.3659789890197471e-06, "loss": 0.0003, "step": 7143 }, { "epoch": 3.2502274795268424, "grad_norm": 0.48393899447086586, "learning_rate": 1.3653421402630595e-06, "loss": 0.0039, "step": 7144 }, { "epoch": 3.2506824385805277, "grad_norm": 0.37960529367396256, "learning_rate": 1.3647053842254896e-06, "loss": 0.0042, "step": 7145 }, { "epoch": 3.251137397634213, "grad_norm": 0.06813626770993864, "learning_rate": 1.3640687209590683e-06, "loss": 0.0005, "step": 7146 }, { "epoch": 3.251592356687898, "grad_norm": 0.2077677607598635, "learning_rate": 1.3634321505158216e-06, "loss": 0.0012, "step": 7147 }, { "epoch": 3.2520473157415832, "grad_norm": 0.06204028867892192, "learning_rate": 1.3627956729477664e-06, "loss": 0.0004, "step": 7148 }, { "epoch": 3.2525022747952685, "grad_norm": 0.32227229509361016, "learning_rate": 1.3621592883069128e-06, "loss": 0.0056, "step": 7149 }, { "epoch": 3.2529572338489534, "grad_norm": 0.03626322084976804, "learning_rate": 1.3615229966452638e-06, "loss": 0.0003, "step": 7150 }, { "epoch": 3.2534121929026387, "grad_norm": 0.29121684972025913, "learning_rate": 1.3608867980148147e-06, "loss": 0.0032, "step": 7151 }, { "epoch": 3.253867151956324, "grad_norm": 0.09193379413557247, "learning_rate": 1.3602506924675524e-06, "loss": 0.0008, "step": 7152 }, { "epoch": 3.2543221110100093, "grad_norm": 0.2793211838936564, "learning_rate": 1.3596146800554567e-06, "loss": 0.0024, "step": 7153 }, { "epoch": 3.254777070063694, "grad_norm": 0.35279191073219135, "learning_rate": 1.358978760830498e-06, "loss": 0.0029, "step": 7154 }, { "epoch": 3.2552320291173795, "grad_norm": 0.282777219060684, "learning_rate": 1.3583429348446433e-06, "loss": 0.0033, "step": 7155 }, { "epoch": 3.255686988171065, "grad_norm": 0.383128155877309, "learning_rate": 1.3577072021498484e-06, "loss": 0.0047, "step": 7156 }, { "epoch": 3.2561419472247497, "grad_norm": 0.5296431077100328, "learning_rate": 1.3570715627980614e-06, "loss": 0.0046, "step": 7157 }, { "epoch": 3.256596906278435, "grad_norm": 0.15181040963159056, "learning_rate": 1.3564360168412262e-06, "loss": 0.0014, "step": 7158 }, { "epoch": 3.2570518653321203, "grad_norm": 0.15846234722702632, "learning_rate": 1.3558005643312739e-06, "loss": 0.0011, "step": 7159 }, { "epoch": 3.257506824385805, "grad_norm": 0.3146036797448421, "learning_rate": 1.3551652053201334e-06, "loss": 0.0017, "step": 7160 }, { "epoch": 3.2579617834394905, "grad_norm": 0.21857230338436553, "learning_rate": 1.3545299398597223e-06, "loss": 0.0029, "step": 7161 }, { "epoch": 3.258416742493176, "grad_norm": 0.21174564880255647, "learning_rate": 1.3538947680019515e-06, "loss": 0.0014, "step": 7162 }, { "epoch": 3.2588717015468607, "grad_norm": 0.4255996821827538, "learning_rate": 1.3532596897987237e-06, "loss": 0.0072, "step": 7163 }, { "epoch": 3.259326660600546, "grad_norm": 0.4960454126536695, "learning_rate": 1.3526247053019354e-06, "loss": 0.006, "step": 7164 }, { "epoch": 3.2597816196542313, "grad_norm": 0.1327424671216842, "learning_rate": 1.3519898145634758e-06, "loss": 0.0008, "step": 7165 }, { "epoch": 3.260236578707916, "grad_norm": 0.06004515167556991, "learning_rate": 1.3513550176352242e-06, "loss": 0.0003, "step": 7166 }, { "epoch": 3.2606915377616015, "grad_norm": 0.05914482870875182, "learning_rate": 1.3507203145690529e-06, "loss": 0.0004, "step": 7167 }, { "epoch": 3.261146496815287, "grad_norm": 0.3737950077834004, "learning_rate": 1.3500857054168267e-06, "loss": 0.0056, "step": 7168 }, { "epoch": 3.2616014558689717, "grad_norm": 0.23626350505118374, "learning_rate": 1.3494511902304047e-06, "loss": 0.0015, "step": 7169 }, { "epoch": 3.262056414922657, "grad_norm": 0.25921821597340367, "learning_rate": 1.3488167690616355e-06, "loss": 0.0012, "step": 7170 }, { "epoch": 3.2625113739763423, "grad_norm": 0.3583945957855792, "learning_rate": 1.3481824419623605e-06, "loss": 0.0038, "step": 7171 }, { "epoch": 3.262966333030027, "grad_norm": 0.19691721270349244, "learning_rate": 1.3475482089844155e-06, "loss": 0.001, "step": 7172 }, { "epoch": 3.2634212920837125, "grad_norm": 0.0757165363739483, "learning_rate": 1.3469140701796254e-06, "loss": 0.0004, "step": 7173 }, { "epoch": 3.2638762511373978, "grad_norm": 0.20636277578345355, "learning_rate": 1.3462800255998116e-06, "loss": 0.0018, "step": 7174 }, { "epoch": 3.2643312101910826, "grad_norm": 0.5264399654157959, "learning_rate": 1.3456460752967834e-06, "loss": 0.0037, "step": 7175 }, { "epoch": 3.264786169244768, "grad_norm": 0.2032976363343439, "learning_rate": 1.3450122193223452e-06, "loss": 0.0022, "step": 7176 }, { "epoch": 3.2652411282984533, "grad_norm": 0.6034036208572773, "learning_rate": 1.3443784577282915e-06, "loss": 0.0073, "step": 7177 }, { "epoch": 3.265696087352138, "grad_norm": 0.21990515313871045, "learning_rate": 1.3437447905664114e-06, "loss": 0.0023, "step": 7178 }, { "epoch": 3.2661510464058234, "grad_norm": 0.21721597527423148, "learning_rate": 1.3431112178884868e-06, "loss": 0.002, "step": 7179 }, { "epoch": 3.2666060054595087, "grad_norm": 0.4481886904354099, "learning_rate": 1.3424777397462884e-06, "loss": 0.0029, "step": 7180 }, { "epoch": 3.2670609645131936, "grad_norm": 0.2923173051729874, "learning_rate": 1.3418443561915823e-06, "loss": 0.0032, "step": 7181 }, { "epoch": 3.267515923566879, "grad_norm": 0.09000030145618662, "learning_rate": 1.3412110672761243e-06, "loss": 0.0004, "step": 7182 }, { "epoch": 3.2679708826205642, "grad_norm": 0.19187641376908587, "learning_rate": 1.3405778730516656e-06, "loss": 0.0016, "step": 7183 }, { "epoch": 3.268425841674249, "grad_norm": 0.3506950998961093, "learning_rate": 1.3399447735699473e-06, "loss": 0.0052, "step": 7184 }, { "epoch": 3.2688808007279344, "grad_norm": 0.313097783598212, "learning_rate": 1.339311768882702e-06, "loss": 0.0068, "step": 7185 }, { "epoch": 3.2693357597816197, "grad_norm": 0.3174219546280472, "learning_rate": 1.3386788590416586e-06, "loss": 0.0051, "step": 7186 }, { "epoch": 3.2697907188353046, "grad_norm": 0.24708165835172202, "learning_rate": 1.3380460440985344e-06, "loss": 0.001, "step": 7187 }, { "epoch": 3.27024567788899, "grad_norm": 0.3523559546404936, "learning_rate": 1.337413324105039e-06, "loss": 0.002, "step": 7188 }, { "epoch": 3.270700636942675, "grad_norm": 0.25752319948918695, "learning_rate": 1.3367806991128775e-06, "loss": 0.0013, "step": 7189 }, { "epoch": 3.2711555959963605, "grad_norm": 0.09559296821022233, "learning_rate": 1.3361481691737444e-06, "loss": 0.0006, "step": 7190 }, { "epoch": 3.2716105550500454, "grad_norm": 0.31698443467324966, "learning_rate": 1.3355157343393272e-06, "loss": 0.0058, "step": 7191 }, { "epoch": 3.2720655141037307, "grad_norm": 0.37190176760397087, "learning_rate": 1.3348833946613039e-06, "loss": 0.0032, "step": 7192 }, { "epoch": 3.272520473157416, "grad_norm": 0.3454187511383016, "learning_rate": 1.3342511501913483e-06, "loss": 0.0019, "step": 7193 }, { "epoch": 3.272975432211101, "grad_norm": 0.27776979313952926, "learning_rate": 1.3336190009811252e-06, "loss": 0.002, "step": 7194 }, { "epoch": 3.273430391264786, "grad_norm": 0.22430846170730814, "learning_rate": 1.3329869470822898e-06, "loss": 0.0018, "step": 7195 }, { "epoch": 3.2738853503184715, "grad_norm": 0.3138976597523853, "learning_rate": 1.3323549885464912e-06, "loss": 0.0076, "step": 7196 }, { "epoch": 3.2743403093721564, "grad_norm": 0.19998137222512824, "learning_rate": 1.3317231254253687e-06, "loss": 0.0015, "step": 7197 }, { "epoch": 3.2747952684258417, "grad_norm": 0.27975373093726463, "learning_rate": 1.3310913577705575e-06, "loss": 0.0017, "step": 7198 }, { "epoch": 3.275250227479527, "grad_norm": 0.243683501731399, "learning_rate": 1.330459685633681e-06, "loss": 0.003, "step": 7199 }, { "epoch": 3.275705186533212, "grad_norm": 0.25827211534475286, "learning_rate": 1.3298281090663584e-06, "loss": 0.0042, "step": 7200 }, { "epoch": 3.276160145586897, "grad_norm": 0.642623948020697, "learning_rate": 1.329196628120198e-06, "loss": 0.0061, "step": 7201 }, { "epoch": 3.2766151046405825, "grad_norm": 0.21108480867795318, "learning_rate": 1.328565242846801e-06, "loss": 0.0014, "step": 7202 }, { "epoch": 3.2770700636942673, "grad_norm": 0.16825297881605564, "learning_rate": 1.327933953297763e-06, "loss": 0.0025, "step": 7203 }, { "epoch": 3.2775250227479527, "grad_norm": 0.2962423591566813, "learning_rate": 1.32730275952467e-06, "loss": 0.0016, "step": 7204 }, { "epoch": 3.277979981801638, "grad_norm": 0.38072089320122837, "learning_rate": 1.326671661579099e-06, "loss": 0.0035, "step": 7205 }, { "epoch": 3.278434940855323, "grad_norm": 0.3025215710254773, "learning_rate": 1.3260406595126202e-06, "loss": 0.0029, "step": 7206 }, { "epoch": 3.278889899909008, "grad_norm": 0.34262737067369886, "learning_rate": 1.3254097533767973e-06, "loss": 0.002, "step": 7207 }, { "epoch": 3.2793448589626935, "grad_norm": 0.5349112687572084, "learning_rate": 1.324778943223186e-06, "loss": 0.006, "step": 7208 }, { "epoch": 3.2797998180163788, "grad_norm": 0.17250554271649438, "learning_rate": 1.324148229103332e-06, "loss": 0.0013, "step": 7209 }, { "epoch": 3.2802547770700636, "grad_norm": 0.2010470597295365, "learning_rate": 1.3235176110687748e-06, "loss": 0.0014, "step": 7210 }, { "epoch": 3.280709736123749, "grad_norm": 0.1733636616922158, "learning_rate": 1.3228870891710443e-06, "loss": 0.0015, "step": 7211 }, { "epoch": 3.2811646951774343, "grad_norm": 0.3558568973037123, "learning_rate": 1.3222566634616663e-06, "loss": 0.0074, "step": 7212 }, { "epoch": 3.281619654231119, "grad_norm": 0.34910181923925754, "learning_rate": 1.3216263339921537e-06, "loss": 0.0012, "step": 7213 }, { "epoch": 3.2820746132848044, "grad_norm": 0.42095150724997704, "learning_rate": 1.320996100814017e-06, "loss": 0.0025, "step": 7214 }, { "epoch": 3.2825295723384897, "grad_norm": 0.23069823688159793, "learning_rate": 1.3203659639787544e-06, "loss": 0.0021, "step": 7215 }, { "epoch": 3.2829845313921746, "grad_norm": 0.4642362852178418, "learning_rate": 1.319735923537857e-06, "loss": 0.0088, "step": 7216 }, { "epoch": 3.28343949044586, "grad_norm": 0.1728121213091432, "learning_rate": 1.3191059795428113e-06, "loss": 0.0012, "step": 7217 }, { "epoch": 3.2838944494995452, "grad_norm": 0.14598632473400106, "learning_rate": 1.3184761320450918e-06, "loss": 0.0014, "step": 7218 }, { "epoch": 3.28434940855323, "grad_norm": 0.5023140107575009, "learning_rate": 1.3178463810961672e-06, "loss": 0.0076, "step": 7219 }, { "epoch": 3.2848043676069154, "grad_norm": 0.30359879129886297, "learning_rate": 1.3172167267474966e-06, "loss": 0.0053, "step": 7220 }, { "epoch": 3.2852593266606007, "grad_norm": 0.16999804796463738, "learning_rate": 1.316587169050534e-06, "loss": 0.0013, "step": 7221 }, { "epoch": 3.2857142857142856, "grad_norm": 0.14448190496014007, "learning_rate": 1.3159577080567242e-06, "loss": 0.0009, "step": 7222 }, { "epoch": 3.286169244767971, "grad_norm": 0.1661091670428928, "learning_rate": 1.3153283438175036e-06, "loss": 0.0017, "step": 7223 }, { "epoch": 3.286624203821656, "grad_norm": 0.13192395755071792, "learning_rate": 1.3146990763843009e-06, "loss": 0.0007, "step": 7224 }, { "epoch": 3.287079162875341, "grad_norm": 0.19230216657427573, "learning_rate": 1.3140699058085368e-06, "loss": 0.0009, "step": 7225 }, { "epoch": 3.2875341219290264, "grad_norm": 0.15157242284666508, "learning_rate": 1.3134408321416236e-06, "loss": 0.0017, "step": 7226 }, { "epoch": 3.2879890809827117, "grad_norm": 0.06883299629558551, "learning_rate": 1.312811855434967e-06, "loss": 0.0004, "step": 7227 }, { "epoch": 3.2884440400363966, "grad_norm": 0.17553335559121158, "learning_rate": 1.312182975739965e-06, "loss": 0.0015, "step": 7228 }, { "epoch": 3.288898999090082, "grad_norm": 0.3973521340174868, "learning_rate": 1.3115541931080067e-06, "loss": 0.0036, "step": 7229 }, { "epoch": 3.289353958143767, "grad_norm": 0.3214072029306551, "learning_rate": 1.3109255075904725e-06, "loss": 0.0021, "step": 7230 }, { "epoch": 3.289808917197452, "grad_norm": 0.7056538487686829, "learning_rate": 1.3102969192387349e-06, "loss": 0.0174, "step": 7231 }, { "epoch": 3.2902638762511374, "grad_norm": 0.23739518644196378, "learning_rate": 1.3096684281041613e-06, "loss": 0.0019, "step": 7232 }, { "epoch": 3.2907188353048227, "grad_norm": 0.43067236290941113, "learning_rate": 1.3090400342381084e-06, "loss": 0.0058, "step": 7233 }, { "epoch": 3.2911737943585075, "grad_norm": 0.23533603537174455, "learning_rate": 1.3084117376919249e-06, "loss": 0.0015, "step": 7234 }, { "epoch": 3.291628753412193, "grad_norm": 0.10196556380165127, "learning_rate": 1.3077835385169535e-06, "loss": 0.0005, "step": 7235 }, { "epoch": 3.292083712465878, "grad_norm": 0.22930409582696565, "learning_rate": 1.3071554367645267e-06, "loss": 0.0008, "step": 7236 }, { "epoch": 3.292538671519563, "grad_norm": 0.5050257339836222, "learning_rate": 1.3065274324859717e-06, "loss": 0.0062, "step": 7237 }, { "epoch": 3.2929936305732483, "grad_norm": 0.2536488446643466, "learning_rate": 1.305899525732605e-06, "loss": 0.0023, "step": 7238 }, { "epoch": 3.2934485896269337, "grad_norm": 0.3509267041916796, "learning_rate": 1.3052717165557365e-06, "loss": 0.0042, "step": 7239 }, { "epoch": 3.2939035486806185, "grad_norm": 0.1997222200745626, "learning_rate": 1.3046440050066675e-06, "loss": 0.0025, "step": 7240 }, { "epoch": 3.294358507734304, "grad_norm": 0.07930446202456388, "learning_rate": 1.3040163911366918e-06, "loss": 0.0002, "step": 7241 }, { "epoch": 3.294813466787989, "grad_norm": 0.3377470474103759, "learning_rate": 1.3033888749970969e-06, "loss": 0.0021, "step": 7242 }, { "epoch": 3.295268425841674, "grad_norm": 0.21230535436024733, "learning_rate": 1.3027614566391588e-06, "loss": 0.0021, "step": 7243 }, { "epoch": 3.2957233848953593, "grad_norm": 0.34302899132398307, "learning_rate": 1.3021341361141482e-06, "loss": 0.0055, "step": 7244 }, { "epoch": 3.2961783439490446, "grad_norm": 0.26630714773446756, "learning_rate": 1.3015069134733255e-06, "loss": 0.002, "step": 7245 }, { "epoch": 3.29663330300273, "grad_norm": 0.4532736531379349, "learning_rate": 1.3008797887679464e-06, "loss": 0.0045, "step": 7246 }, { "epoch": 3.297088262056415, "grad_norm": 0.3169566574835879, "learning_rate": 1.3002527620492556e-06, "loss": 0.0068, "step": 7247 }, { "epoch": 3.2975432211101, "grad_norm": 0.18881983084082665, "learning_rate": 1.2996258333684903e-06, "loss": 0.0017, "step": 7248 }, { "epoch": 3.2979981801637854, "grad_norm": 0.3944257872508318, "learning_rate": 1.298999002776882e-06, "loss": 0.014, "step": 7249 }, { "epoch": 3.2984531392174703, "grad_norm": 0.1605160412225027, "learning_rate": 1.2983722703256506e-06, "loss": 0.0005, "step": 7250 }, { "epoch": 3.2989080982711556, "grad_norm": 0.32514578556397195, "learning_rate": 1.2977456360660119e-06, "loss": 0.002, "step": 7251 }, { "epoch": 3.299363057324841, "grad_norm": 0.2161429169904007, "learning_rate": 1.2971191000491701e-06, "loss": 0.002, "step": 7252 }, { "epoch": 3.299818016378526, "grad_norm": 0.3755123009286595, "learning_rate": 1.2964926623263233e-06, "loss": 0.0043, "step": 7253 }, { "epoch": 3.300272975432211, "grad_norm": 0.15431391529473956, "learning_rate": 1.2958663229486612e-06, "loss": 0.0017, "step": 7254 }, { "epoch": 3.3007279344858964, "grad_norm": 0.21551694683295183, "learning_rate": 1.2952400819673636e-06, "loss": 0.0033, "step": 7255 }, { "epoch": 3.3011828935395813, "grad_norm": 0.3128032733539429, "learning_rate": 1.2946139394336077e-06, "loss": 0.0023, "step": 7256 }, { "epoch": 3.3016378525932666, "grad_norm": 0.21286274596091942, "learning_rate": 1.2939878953985572e-06, "loss": 0.0021, "step": 7257 }, { "epoch": 3.302092811646952, "grad_norm": 0.37432196217716324, "learning_rate": 1.2933619499133693e-06, "loss": 0.0032, "step": 7258 }, { "epoch": 3.3025477707006368, "grad_norm": 0.3188672401338531, "learning_rate": 1.292736103029194e-06, "loss": 0.0011, "step": 7259 }, { "epoch": 3.303002729754322, "grad_norm": 0.1650270184665457, "learning_rate": 1.2921103547971715e-06, "loss": 0.0014, "step": 7260 }, { "epoch": 3.3034576888080074, "grad_norm": 0.31838197408016516, "learning_rate": 1.291484705268437e-06, "loss": 0.0037, "step": 7261 }, { "epoch": 3.3039126478616927, "grad_norm": 0.2871705912230669, "learning_rate": 1.2908591544941138e-06, "loss": 0.004, "step": 7262 }, { "epoch": 3.3043676069153776, "grad_norm": 0.2830582401749529, "learning_rate": 1.290233702525321e-06, "loss": 0.0029, "step": 7263 }, { "epoch": 3.304822565969063, "grad_norm": 0.06566549965938384, "learning_rate": 1.2896083494131668e-06, "loss": 0.0006, "step": 7264 }, { "epoch": 3.305277525022748, "grad_norm": 0.33280269310394983, "learning_rate": 1.2889830952087511e-06, "loss": 0.0039, "step": 7265 }, { "epoch": 3.305732484076433, "grad_norm": 0.2337608553701677, "learning_rate": 1.288357939963169e-06, "loss": 0.0029, "step": 7266 }, { "epoch": 3.3061874431301184, "grad_norm": 0.057198346964143366, "learning_rate": 1.2877328837275045e-06, "loss": 0.0003, "step": 7267 }, { "epoch": 3.3066424021838037, "grad_norm": 0.10951916199665072, "learning_rate": 1.2871079265528335e-06, "loss": 0.0006, "step": 7268 }, { "epoch": 3.3070973612374885, "grad_norm": 0.23201975680090992, "learning_rate": 1.2864830684902253e-06, "loss": 0.0027, "step": 7269 }, { "epoch": 3.307552320291174, "grad_norm": 0.1379034957771703, "learning_rate": 1.2858583095907402e-06, "loss": 0.0005, "step": 7270 }, { "epoch": 3.308007279344859, "grad_norm": 0.2855092042202947, "learning_rate": 1.2852336499054318e-06, "loss": 0.0024, "step": 7271 }, { "epoch": 3.308462238398544, "grad_norm": 0.23764043036952778, "learning_rate": 1.284609089485344e-06, "loss": 0.0013, "step": 7272 }, { "epoch": 3.3089171974522293, "grad_norm": 0.27973894249952524, "learning_rate": 1.2839846283815124e-06, "loss": 0.0027, "step": 7273 }, { "epoch": 3.3093721565059147, "grad_norm": 0.3862701594808769, "learning_rate": 1.2833602666449647e-06, "loss": 0.0045, "step": 7274 }, { "epoch": 3.3098271155595995, "grad_norm": 0.06051879812517434, "learning_rate": 1.2827360043267228e-06, "loss": 0.0006, "step": 7275 }, { "epoch": 3.310282074613285, "grad_norm": 0.3206572404534766, "learning_rate": 1.2821118414777963e-06, "loss": 0.0023, "step": 7276 }, { "epoch": 3.31073703366697, "grad_norm": 0.32321555527034, "learning_rate": 1.2814877781491914e-06, "loss": 0.0066, "step": 7277 }, { "epoch": 3.311191992720655, "grad_norm": 0.15462268346592511, "learning_rate": 1.2808638143919021e-06, "loss": 0.0014, "step": 7278 }, { "epoch": 3.3116469517743403, "grad_norm": 0.256471974618369, "learning_rate": 1.280239950256916e-06, "loss": 0.0028, "step": 7279 }, { "epoch": 3.3121019108280256, "grad_norm": 0.260155116665867, "learning_rate": 1.2796161857952133e-06, "loss": 0.004, "step": 7280 }, { "epoch": 3.3125568698817105, "grad_norm": 0.2712311931114619, "learning_rate": 1.2789925210577647e-06, "loss": 0.0038, "step": 7281 }, { "epoch": 3.313011828935396, "grad_norm": 0.19507575321383233, "learning_rate": 1.2783689560955336e-06, "loss": 0.0015, "step": 7282 }, { "epoch": 3.313466787989081, "grad_norm": 0.3497812388816731, "learning_rate": 1.2777454909594733e-06, "loss": 0.0059, "step": 7283 }, { "epoch": 3.313921747042766, "grad_norm": 0.3833523582699927, "learning_rate": 1.2771221257005317e-06, "loss": 0.0067, "step": 7284 }, { "epoch": 3.3143767060964513, "grad_norm": 0.17601927165100392, "learning_rate": 1.2764988603696489e-06, "loss": 0.0017, "step": 7285 }, { "epoch": 3.3148316651501366, "grad_norm": 0.39458600042852277, "learning_rate": 1.2758756950177536e-06, "loss": 0.01, "step": 7286 }, { "epoch": 3.3152866242038215, "grad_norm": 0.3490901372153946, "learning_rate": 1.2752526296957684e-06, "loss": 0.0039, "step": 7287 }, { "epoch": 3.315741583257507, "grad_norm": 0.15479582358004945, "learning_rate": 1.274629664454607e-06, "loss": 0.0013, "step": 7288 }, { "epoch": 3.316196542311192, "grad_norm": 0.28995087572055445, "learning_rate": 1.274006799345176e-06, "loss": 0.0024, "step": 7289 }, { "epoch": 3.316651501364877, "grad_norm": 0.3512799615671993, "learning_rate": 1.2733840344183719e-06, "loss": 0.0027, "step": 7290 }, { "epoch": 3.3171064604185623, "grad_norm": 0.45556642214014814, "learning_rate": 1.2727613697250863e-06, "loss": 0.0086, "step": 7291 }, { "epoch": 3.3175614194722476, "grad_norm": 0.35120630120316504, "learning_rate": 1.2721388053161992e-06, "loss": 0.0016, "step": 7292 }, { "epoch": 3.3180163785259325, "grad_norm": 0.07609767074503052, "learning_rate": 1.2715163412425846e-06, "loss": 0.0005, "step": 7293 }, { "epoch": 3.3184713375796178, "grad_norm": 0.14739131425784768, "learning_rate": 1.2708939775551052e-06, "loss": 0.0009, "step": 7294 }, { "epoch": 3.318926296633303, "grad_norm": 0.2422424716889751, "learning_rate": 1.2702717143046206e-06, "loss": 0.0013, "step": 7295 }, { "epoch": 3.319381255686988, "grad_norm": 0.1234518456730502, "learning_rate": 1.269649551541978e-06, "loss": 0.0013, "step": 7296 }, { "epoch": 3.3198362147406733, "grad_norm": 0.19794292107084804, "learning_rate": 1.2690274893180167e-06, "loss": 0.0012, "step": 7297 }, { "epoch": 3.3202911737943586, "grad_norm": 0.4140311582692203, "learning_rate": 1.2684055276835713e-06, "loss": 0.0104, "step": 7298 }, { "epoch": 3.3207461328480434, "grad_norm": 0.4009744310593412, "learning_rate": 1.2677836666894632e-06, "loss": 0.0023, "step": 7299 }, { "epoch": 3.3212010919017287, "grad_norm": 0.31069764933903976, "learning_rate": 1.26716190638651e-06, "loss": 0.0042, "step": 7300 }, { "epoch": 3.321656050955414, "grad_norm": 0.1684190564681023, "learning_rate": 1.2665402468255187e-06, "loss": 0.001, "step": 7301 }, { "epoch": 3.3221110100090994, "grad_norm": 0.26498483524516864, "learning_rate": 1.2659186880572879e-06, "loss": 0.0038, "step": 7302 }, { "epoch": 3.3225659690627842, "grad_norm": 0.1471315007088414, "learning_rate": 1.2652972301326084e-06, "loss": 0.0007, "step": 7303 }, { "epoch": 3.3230209281164695, "grad_norm": 0.30570202012187114, "learning_rate": 1.2646758731022627e-06, "loss": 0.0021, "step": 7304 }, { "epoch": 3.323475887170155, "grad_norm": 0.08490062568759389, "learning_rate": 1.264054617017027e-06, "loss": 0.0005, "step": 7305 }, { "epoch": 3.3239308462238397, "grad_norm": 0.5501695221925939, "learning_rate": 1.2634334619276669e-06, "loss": 0.0074, "step": 7306 }, { "epoch": 3.324385805277525, "grad_norm": 0.2946429714219301, "learning_rate": 1.26281240788494e-06, "loss": 0.003, "step": 7307 }, { "epoch": 3.3248407643312103, "grad_norm": 0.12221531347427644, "learning_rate": 1.2621914549395947e-06, "loss": 0.0011, "step": 7308 }, { "epoch": 3.325295723384895, "grad_norm": 0.32257044657733985, "learning_rate": 1.2615706031423751e-06, "loss": 0.004, "step": 7309 }, { "epoch": 3.3257506824385805, "grad_norm": 0.25656619143199694, "learning_rate": 1.2609498525440131e-06, "loss": 0.0013, "step": 7310 }, { "epoch": 3.326205641492266, "grad_norm": 0.1720719788345652, "learning_rate": 1.2603292031952324e-06, "loss": 0.0018, "step": 7311 }, { "epoch": 3.3266606005459507, "grad_norm": 0.20677104403303653, "learning_rate": 1.2597086551467522e-06, "loss": 0.0018, "step": 7312 }, { "epoch": 3.327115559599636, "grad_norm": 0.23000153311733718, "learning_rate": 1.2590882084492783e-06, "loss": 0.0028, "step": 7313 }, { "epoch": 3.3275705186533213, "grad_norm": 0.3526562950699865, "learning_rate": 1.2584678631535136e-06, "loss": 0.0031, "step": 7314 }, { "epoch": 3.328025477707006, "grad_norm": 0.4761314698958258, "learning_rate": 1.257847619310148e-06, "loss": 0.0039, "step": 7315 }, { "epoch": 3.3284804367606915, "grad_norm": 0.311658575350045, "learning_rate": 1.2572274769698656e-06, "loss": 0.0044, "step": 7316 }, { "epoch": 3.328935395814377, "grad_norm": 0.3995598797094526, "learning_rate": 1.2566074361833403e-06, "loss": 0.0036, "step": 7317 }, { "epoch": 3.329390354868062, "grad_norm": 1.7406130940379339, "learning_rate": 1.2559874970012403e-06, "loss": 0.014, "step": 7318 }, { "epoch": 3.329845313921747, "grad_norm": 0.2839809004535095, "learning_rate": 1.2553676594742251e-06, "loss": 0.0027, "step": 7319 }, { "epoch": 3.3303002729754323, "grad_norm": 0.3225956965360814, "learning_rate": 1.2547479236529442e-06, "loss": 0.002, "step": 7320 }, { "epoch": 3.3307552320291176, "grad_norm": 0.5571872770585166, "learning_rate": 1.254128289588039e-06, "loss": 0.012, "step": 7321 }, { "epoch": 3.3312101910828025, "grad_norm": 0.3516030317719282, "learning_rate": 1.2535087573301432e-06, "loss": 0.0075, "step": 7322 }, { "epoch": 3.331665150136488, "grad_norm": 0.7278198783708629, "learning_rate": 1.2528893269298837e-06, "loss": 0.0066, "step": 7323 }, { "epoch": 3.332120109190173, "grad_norm": 0.13383332899955608, "learning_rate": 1.252269998437876e-06, "loss": 0.001, "step": 7324 }, { "epoch": 3.332575068243858, "grad_norm": 0.2935412076526087, "learning_rate": 1.2516507719047289e-06, "loss": 0.002, "step": 7325 }, { "epoch": 3.3330300272975433, "grad_norm": 0.28043879836909524, "learning_rate": 1.2510316473810436e-06, "loss": 0.0025, "step": 7326 }, { "epoch": 3.3334849863512286, "grad_norm": 0.20929719642288913, "learning_rate": 1.2504126249174114e-06, "loss": 0.001, "step": 7327 }, { "epoch": 3.3339399454049135, "grad_norm": 0.1575046403044914, "learning_rate": 1.2497937045644171e-06, "loss": 0.001, "step": 7328 }, { "epoch": 3.3343949044585988, "grad_norm": 0.12387175555459039, "learning_rate": 1.2491748863726352e-06, "loss": 0.0006, "step": 7329 }, { "epoch": 3.334849863512284, "grad_norm": 0.37197763750549395, "learning_rate": 1.2485561703926333e-06, "loss": 0.0071, "step": 7330 }, { "epoch": 3.335304822565969, "grad_norm": 0.5691557593150757, "learning_rate": 1.2479375566749694e-06, "loss": 0.0038, "step": 7331 }, { "epoch": 3.3357597816196543, "grad_norm": 0.4634920071815555, "learning_rate": 1.2473190452701934e-06, "loss": 0.007, "step": 7332 }, { "epoch": 3.3362147406733396, "grad_norm": 0.08442416944743886, "learning_rate": 1.2467006362288476e-06, "loss": 0.0006, "step": 7333 }, { "epoch": 3.3366696997270244, "grad_norm": 0.49288722299560467, "learning_rate": 1.246082329601467e-06, "loss": 0.012, "step": 7334 }, { "epoch": 3.3371246587807097, "grad_norm": 0.49007497123855154, "learning_rate": 1.245464125438576e-06, "loss": 0.0112, "step": 7335 }, { "epoch": 3.337579617834395, "grad_norm": 0.20385692154388807, "learning_rate": 1.2448460237906912e-06, "loss": 0.0028, "step": 7336 }, { "epoch": 3.33803457688808, "grad_norm": 0.4261754440507751, "learning_rate": 1.24422802470832e-06, "loss": 0.0048, "step": 7337 }, { "epoch": 3.3384895359417652, "grad_norm": 0.35045966911916504, "learning_rate": 1.2436101282419646e-06, "loss": 0.0037, "step": 7338 }, { "epoch": 3.3389444949954505, "grad_norm": 0.3212212188737211, "learning_rate": 1.242992334442115e-06, "loss": 0.004, "step": 7339 }, { "epoch": 3.3393994540491354, "grad_norm": 0.34721485934691465, "learning_rate": 1.2423746433592557e-06, "loss": 0.0028, "step": 7340 }, { "epoch": 3.3398544131028207, "grad_norm": 0.313888740733667, "learning_rate": 1.2417570550438616e-06, "loss": 0.0055, "step": 7341 }, { "epoch": 3.340309372156506, "grad_norm": 0.3915170651494066, "learning_rate": 1.2411395695463976e-06, "loss": 0.0011, "step": 7342 }, { "epoch": 3.340764331210191, "grad_norm": 0.2672247610287948, "learning_rate": 1.240522186917324e-06, "loss": 0.0024, "step": 7343 }, { "epoch": 3.341219290263876, "grad_norm": 0.12298107154976468, "learning_rate": 1.2399049072070895e-06, "loss": 0.0008, "step": 7344 }, { "epoch": 3.3416742493175615, "grad_norm": 0.27232216201596, "learning_rate": 1.2392877304661357e-06, "loss": 0.0035, "step": 7345 }, { "epoch": 3.3421292083712464, "grad_norm": 0.3513439323984028, "learning_rate": 1.238670656744894e-06, "loss": 0.0035, "step": 7346 }, { "epoch": 3.3425841674249317, "grad_norm": 0.07615743024303782, "learning_rate": 1.2380536860937902e-06, "loss": 0.0008, "step": 7347 }, { "epoch": 3.343039126478617, "grad_norm": 0.04474842336577777, "learning_rate": 1.2374368185632413e-06, "loss": 0.0003, "step": 7348 }, { "epoch": 3.343494085532302, "grad_norm": 0.34130288067717546, "learning_rate": 1.2368200542036537e-06, "loss": 0.0056, "step": 7349 }, { "epoch": 3.343949044585987, "grad_norm": 0.45549481257445995, "learning_rate": 1.2362033930654272e-06, "loss": 0.0098, "step": 7350 }, { "epoch": 3.3444040036396725, "grad_norm": 0.4483098120352691, "learning_rate": 1.2355868351989507e-06, "loss": 0.006, "step": 7351 }, { "epoch": 3.3448589626933574, "grad_norm": 0.26398618630493537, "learning_rate": 1.2349703806546092e-06, "loss": 0.0016, "step": 7352 }, { "epoch": 3.3453139217470427, "grad_norm": 0.30110419200406374, "learning_rate": 1.2343540294827747e-06, "loss": 0.0032, "step": 7353 }, { "epoch": 3.345768880800728, "grad_norm": 0.5903704763373673, "learning_rate": 1.233737781733814e-06, "loss": 0.0092, "step": 7354 }, { "epoch": 3.3462238398544133, "grad_norm": 0.3107485058422592, "learning_rate": 1.2331216374580832e-06, "loss": 0.0045, "step": 7355 }, { "epoch": 3.346678798908098, "grad_norm": 0.4027455274550882, "learning_rate": 1.2325055967059302e-06, "loss": 0.0053, "step": 7356 }, { "epoch": 3.3471337579617835, "grad_norm": 0.14940377249693212, "learning_rate": 1.231889659527697e-06, "loss": 0.0012, "step": 7357 }, { "epoch": 3.347588717015469, "grad_norm": 0.4044309092851836, "learning_rate": 1.231273825973714e-06, "loss": 0.0036, "step": 7358 }, { "epoch": 3.3480436760691537, "grad_norm": 0.08288388400216994, "learning_rate": 1.2306580960943044e-06, "loss": 0.0005, "step": 7359 }, { "epoch": 3.348498635122839, "grad_norm": 0.09645876019490099, "learning_rate": 1.2300424699397817e-06, "loss": 0.0021, "step": 7360 }, { "epoch": 3.3489535941765243, "grad_norm": 0.1900221514142373, "learning_rate": 1.2294269475604536e-06, "loss": 0.0026, "step": 7361 }, { "epoch": 3.349408553230209, "grad_norm": 0.4436978624226301, "learning_rate": 1.2288115290066183e-06, "loss": 0.0028, "step": 7362 }, { "epoch": 3.3498635122838945, "grad_norm": 0.31308118384498557, "learning_rate": 1.2281962143285643e-06, "loss": 0.0036, "step": 7363 }, { "epoch": 3.3503184713375798, "grad_norm": 0.3513163292066812, "learning_rate": 1.227581003576572e-06, "loss": 0.0035, "step": 7364 }, { "epoch": 3.3507734303912646, "grad_norm": 0.3806735243887795, "learning_rate": 1.2269658968009144e-06, "loss": 0.0047, "step": 7365 }, { "epoch": 3.35122838944495, "grad_norm": 0.2624361788203404, "learning_rate": 1.2263508940518534e-06, "loss": 0.002, "step": 7366 }, { "epoch": 3.3516833484986353, "grad_norm": 0.08493992201370042, "learning_rate": 1.2257359953796455e-06, "loss": 0.0008, "step": 7367 }, { "epoch": 3.35213830755232, "grad_norm": 0.3522852982717652, "learning_rate": 1.2251212008345387e-06, "loss": 0.0044, "step": 7368 }, { "epoch": 3.3525932666060054, "grad_norm": 0.326689607037269, "learning_rate": 1.22450651046677e-06, "loss": 0.0039, "step": 7369 }, { "epoch": 3.3530482256596907, "grad_norm": 0.2147721330446467, "learning_rate": 1.2238919243265693e-06, "loss": 0.0028, "step": 7370 }, { "epoch": 3.3535031847133756, "grad_norm": 0.21019365585294764, "learning_rate": 1.2232774424641566e-06, "loss": 0.0012, "step": 7371 }, { "epoch": 3.353958143767061, "grad_norm": 0.1660011241987587, "learning_rate": 1.2226630649297466e-06, "loss": 0.0015, "step": 7372 }, { "epoch": 3.3544131028207462, "grad_norm": 0.4294012296004794, "learning_rate": 1.2220487917735426e-06, "loss": 0.003, "step": 7373 }, { "epoch": 3.3548680618744315, "grad_norm": 0.45341153512902377, "learning_rate": 1.2214346230457391e-06, "loss": 0.0053, "step": 7374 }, { "epoch": 3.3553230209281164, "grad_norm": 0.29488271871853294, "learning_rate": 1.2208205587965255e-06, "loss": 0.0035, "step": 7375 }, { "epoch": 3.3557779799818017, "grad_norm": 0.1370991987564596, "learning_rate": 1.220206599076078e-06, "loss": 0.0011, "step": 7376 }, { "epoch": 3.356232939035487, "grad_norm": 0.19441000430892869, "learning_rate": 1.2195927439345687e-06, "loss": 0.0015, "step": 7377 }, { "epoch": 3.356687898089172, "grad_norm": 0.13926343107399758, "learning_rate": 1.218978993422158e-06, "loss": 0.0007, "step": 7378 }, { "epoch": 3.357142857142857, "grad_norm": 0.5360848588433617, "learning_rate": 1.218365347588999e-06, "loss": 0.0031, "step": 7379 }, { "epoch": 3.3575978161965425, "grad_norm": 0.3560330010982586, "learning_rate": 1.217751806485235e-06, "loss": 0.0055, "step": 7380 }, { "epoch": 3.3580527752502274, "grad_norm": 0.26547105665932785, "learning_rate": 1.2171383701610026e-06, "loss": 0.0068, "step": 7381 }, { "epoch": 3.3585077343039127, "grad_norm": 0.2333535354963841, "learning_rate": 1.2165250386664304e-06, "loss": 0.0035, "step": 7382 }, { "epoch": 3.358962693357598, "grad_norm": 0.16746706571556716, "learning_rate": 1.2159118120516361e-06, "loss": 0.0019, "step": 7383 }, { "epoch": 3.359417652411283, "grad_norm": 0.30702631888517573, "learning_rate": 1.2152986903667294e-06, "loss": 0.0056, "step": 7384 }, { "epoch": 3.359872611464968, "grad_norm": 0.9003730423336203, "learning_rate": 1.214685673661811e-06, "loss": 0.0045, "step": 7385 }, { "epoch": 3.3603275705186535, "grad_norm": 0.2965388609123682, "learning_rate": 1.214072761986976e-06, "loss": 0.0027, "step": 7386 }, { "epoch": 3.3607825295723384, "grad_norm": 0.14030622295779063, "learning_rate": 1.2134599553923076e-06, "loss": 0.001, "step": 7387 }, { "epoch": 3.3612374886260237, "grad_norm": 0.35689398453377513, "learning_rate": 1.212847253927881e-06, "loss": 0.0075, "step": 7388 }, { "epoch": 3.361692447679709, "grad_norm": 0.09047535864409256, "learning_rate": 1.212234657643765e-06, "loss": 0.0006, "step": 7389 }, { "epoch": 3.362147406733394, "grad_norm": 0.21436975819757187, "learning_rate": 1.211622166590016e-06, "loss": 0.0012, "step": 7390 }, { "epoch": 3.362602365787079, "grad_norm": 0.07306876313217826, "learning_rate": 1.2110097808166865e-06, "loss": 0.0006, "step": 7391 }, { "epoch": 3.3630573248407645, "grad_norm": 0.14870386093806404, "learning_rate": 1.2103975003738167e-06, "loss": 0.0017, "step": 7392 }, { "epoch": 3.3635122838944493, "grad_norm": 0.2721519887375329, "learning_rate": 1.2097853253114393e-06, "loss": 0.0028, "step": 7393 }, { "epoch": 3.3639672429481347, "grad_norm": 0.15908017915441366, "learning_rate": 1.2091732556795774e-06, "loss": 0.0014, "step": 7394 }, { "epoch": 3.36442220200182, "grad_norm": 0.43651116836448406, "learning_rate": 1.208561291528248e-06, "loss": 0.0024, "step": 7395 }, { "epoch": 3.364877161055505, "grad_norm": 0.4223944827315913, "learning_rate": 1.2079494329074587e-06, "loss": 0.007, "step": 7396 }, { "epoch": 3.36533212010919, "grad_norm": 0.38553734880414914, "learning_rate": 1.2073376798672068e-06, "loss": 0.0032, "step": 7397 }, { "epoch": 3.3657870791628755, "grad_norm": 0.07987768276319719, "learning_rate": 1.2067260324574823e-06, "loss": 0.0011, "step": 7398 }, { "epoch": 3.3662420382165603, "grad_norm": 0.328196498230122, "learning_rate": 1.2061144907282656e-06, "loss": 0.0028, "step": 7399 }, { "epoch": 3.3666969972702456, "grad_norm": 0.28956987906905135, "learning_rate": 1.2055030547295292e-06, "loss": 0.0036, "step": 7400 }, { "epoch": 3.367151956323931, "grad_norm": 0.30335447425455875, "learning_rate": 1.2048917245112377e-06, "loss": 0.004, "step": 7401 }, { "epoch": 3.367606915377616, "grad_norm": 0.4272477859475346, "learning_rate": 1.2042805001233452e-06, "loss": 0.0086, "step": 7402 }, { "epoch": 3.368061874431301, "grad_norm": 0.2773860927140121, "learning_rate": 1.2036693816157995e-06, "loss": 0.0029, "step": 7403 }, { "epoch": 3.3685168334849864, "grad_norm": 0.13064732784899907, "learning_rate": 1.2030583690385381e-06, "loss": 0.0009, "step": 7404 }, { "epoch": 3.3689717925386713, "grad_norm": 0.15534947442763708, "learning_rate": 1.2024474624414885e-06, "loss": 0.0012, "step": 7405 }, { "epoch": 3.3694267515923566, "grad_norm": 0.5385865872359713, "learning_rate": 1.2018366618745738e-06, "loss": 0.0067, "step": 7406 }, { "epoch": 3.369881710646042, "grad_norm": 0.3841352656749689, "learning_rate": 1.2012259673877047e-06, "loss": 0.0051, "step": 7407 }, { "epoch": 3.370336669699727, "grad_norm": 0.17139828239505936, "learning_rate": 1.2006153790307843e-06, "loss": 0.0031, "step": 7408 }, { "epoch": 3.370791628753412, "grad_norm": 0.12578998593662244, "learning_rate": 1.200004896853706e-06, "loss": 0.0011, "step": 7409 }, { "epoch": 3.3712465878070974, "grad_norm": 0.4548432197364068, "learning_rate": 1.1993945209063567e-06, "loss": 0.0028, "step": 7410 }, { "epoch": 3.3717015468607827, "grad_norm": 0.13597611616803823, "learning_rate": 1.198784251238615e-06, "loss": 0.0009, "step": 7411 }, { "epoch": 3.3721565059144676, "grad_norm": 0.3473348927826866, "learning_rate": 1.1981740879003479e-06, "loss": 0.0047, "step": 7412 }, { "epoch": 3.372611464968153, "grad_norm": 0.08390675814912749, "learning_rate": 1.1975640309414152e-06, "loss": 0.0006, "step": 7413 }, { "epoch": 3.373066424021838, "grad_norm": 0.1959842167938354, "learning_rate": 1.1969540804116676e-06, "loss": 0.0014, "step": 7414 }, { "epoch": 3.373521383075523, "grad_norm": 0.3176977801408133, "learning_rate": 1.1963442363609487e-06, "loss": 0.0032, "step": 7415 }, { "epoch": 3.3739763421292084, "grad_norm": 0.5294639558978848, "learning_rate": 1.1957344988390904e-06, "loss": 0.0057, "step": 7416 }, { "epoch": 3.3744313011828937, "grad_norm": 0.14885580740788334, "learning_rate": 1.1951248678959202e-06, "loss": 0.0009, "step": 7417 }, { "epoch": 3.3748862602365786, "grad_norm": 0.12877268026488273, "learning_rate": 1.1945153435812529e-06, "loss": 0.0017, "step": 7418 }, { "epoch": 3.375341219290264, "grad_norm": 0.38002972046773853, "learning_rate": 1.1939059259448952e-06, "loss": 0.006, "step": 7419 }, { "epoch": 3.375796178343949, "grad_norm": 0.1766059540068926, "learning_rate": 1.1932966150366477e-06, "loss": 0.002, "step": 7420 }, { "epoch": 3.376251137397634, "grad_norm": 0.2129865006291291, "learning_rate": 1.1926874109063e-06, "loss": 0.0013, "step": 7421 }, { "epoch": 3.3767060964513194, "grad_norm": 0.04078151216918046, "learning_rate": 1.1920783136036336e-06, "loss": 0.0002, "step": 7422 }, { "epoch": 3.3771610555050047, "grad_norm": 0.062209220477069296, "learning_rate": 1.1914693231784194e-06, "loss": 0.0005, "step": 7423 }, { "epoch": 3.3776160145586895, "grad_norm": 0.23354292839423585, "learning_rate": 1.1908604396804233e-06, "loss": 0.0013, "step": 7424 }, { "epoch": 3.378070973612375, "grad_norm": 0.19284729668235384, "learning_rate": 1.1902516631594005e-06, "loss": 0.0028, "step": 7425 }, { "epoch": 3.37852593266606, "grad_norm": 0.24331899695866913, "learning_rate": 1.1896429936650975e-06, "loss": 0.001, "step": 7426 }, { "epoch": 3.3789808917197455, "grad_norm": 0.13364620640708288, "learning_rate": 1.1890344312472513e-06, "loss": 0.0012, "step": 7427 }, { "epoch": 3.3794358507734303, "grad_norm": 0.43127577111837734, "learning_rate": 1.1884259759555902e-06, "loss": 0.0056, "step": 7428 }, { "epoch": 3.3798908098271156, "grad_norm": 0.512697029774403, "learning_rate": 1.1878176278398363e-06, "loss": 0.004, "step": 7429 }, { "epoch": 3.380345768880801, "grad_norm": 0.676342156849057, "learning_rate": 1.187209386949699e-06, "loss": 0.0093, "step": 7430 }, { "epoch": 3.380800727934486, "grad_norm": 0.025901954019439618, "learning_rate": 1.1866012533348834e-06, "loss": 0.0002, "step": 7431 }, { "epoch": 3.381255686988171, "grad_norm": 0.30631346679675137, "learning_rate": 1.1859932270450817e-06, "loss": 0.0039, "step": 7432 }, { "epoch": 3.3817106460418564, "grad_norm": 0.10629446529280041, "learning_rate": 1.1853853081299787e-06, "loss": 0.0008, "step": 7433 }, { "epoch": 3.3821656050955413, "grad_norm": 0.1469918862498041, "learning_rate": 1.1847774966392526e-06, "loss": 0.0012, "step": 7434 }, { "epoch": 3.3826205641492266, "grad_norm": 0.08869810048625816, "learning_rate": 1.1841697926225698e-06, "loss": 0.0003, "step": 7435 }, { "epoch": 3.383075523202912, "grad_norm": 0.1876114917999823, "learning_rate": 1.1835621961295895e-06, "loss": 0.0007, "step": 7436 }, { "epoch": 3.383530482256597, "grad_norm": 0.21606760327083138, "learning_rate": 1.1829547072099607e-06, "loss": 0.0021, "step": 7437 }, { "epoch": 3.383985441310282, "grad_norm": 0.30562848274242344, "learning_rate": 1.1823473259133261e-06, "loss": 0.0026, "step": 7438 }, { "epoch": 3.3844404003639674, "grad_norm": 0.3072700918465498, "learning_rate": 1.1817400522893169e-06, "loss": 0.0027, "step": 7439 }, { "epoch": 3.3848953594176523, "grad_norm": 0.3535257206403179, "learning_rate": 1.181132886387558e-06, "loss": 0.0027, "step": 7440 }, { "epoch": 3.3853503184713376, "grad_norm": 0.23400429514603782, "learning_rate": 1.180525828257664e-06, "loss": 0.0019, "step": 7441 }, { "epoch": 3.385805277525023, "grad_norm": 0.22384007412659115, "learning_rate": 1.1799188779492407e-06, "loss": 0.0016, "step": 7442 }, { "epoch": 3.386260236578708, "grad_norm": 0.5522869850151557, "learning_rate": 1.1793120355118843e-06, "loss": 0.0089, "step": 7443 }, { "epoch": 3.386715195632393, "grad_norm": 0.13097033369134709, "learning_rate": 1.1787053009951837e-06, "loss": 0.0012, "step": 7444 }, { "epoch": 3.3871701546860784, "grad_norm": 0.28376212192573635, "learning_rate": 1.1780986744487204e-06, "loss": 0.0037, "step": 7445 }, { "epoch": 3.3876251137397633, "grad_norm": 0.22912019224823102, "learning_rate": 1.1774921559220637e-06, "loss": 0.0036, "step": 7446 }, { "epoch": 3.3880800727934486, "grad_norm": 0.2633806222987266, "learning_rate": 1.1768857454647756e-06, "loss": 0.0029, "step": 7447 }, { "epoch": 3.388535031847134, "grad_norm": 0.5238360053390515, "learning_rate": 1.1762794431264082e-06, "loss": 0.0046, "step": 7448 }, { "epoch": 3.3889899909008188, "grad_norm": 0.3558086037533303, "learning_rate": 1.175673248956508e-06, "loss": 0.0042, "step": 7449 }, { "epoch": 3.389444949954504, "grad_norm": 0.20033672000667274, "learning_rate": 1.175067163004609e-06, "loss": 0.0025, "step": 7450 }, { "epoch": 3.3898999090081894, "grad_norm": 0.5109999293276974, "learning_rate": 1.1744611853202376e-06, "loss": 0.0118, "step": 7451 }, { "epoch": 3.3903548680618742, "grad_norm": 0.4226361888983525, "learning_rate": 1.1738553159529126e-06, "loss": 0.0028, "step": 7452 }, { "epoch": 3.3908098271155596, "grad_norm": 0.22888222869570046, "learning_rate": 1.1732495549521413e-06, "loss": 0.0017, "step": 7453 }, { "epoch": 3.391264786169245, "grad_norm": 0.283258932636813, "learning_rate": 1.172643902367426e-06, "loss": 0.0032, "step": 7454 }, { "epoch": 3.3917197452229297, "grad_norm": 0.15155287692257746, "learning_rate": 1.1720383582482569e-06, "loss": 0.002, "step": 7455 }, { "epoch": 3.392174704276615, "grad_norm": 0.3481403607154899, "learning_rate": 1.171432922644116e-06, "loss": 0.0032, "step": 7456 }, { "epoch": 3.3926296633303004, "grad_norm": 0.26983452166736194, "learning_rate": 1.1708275956044757e-06, "loss": 0.0022, "step": 7457 }, { "epoch": 3.3930846223839852, "grad_norm": 0.37906501425183486, "learning_rate": 1.170222377178802e-06, "loss": 0.0023, "step": 7458 }, { "epoch": 3.3935395814376705, "grad_norm": 0.38692221829890017, "learning_rate": 1.1696172674165516e-06, "loss": 0.0021, "step": 7459 }, { "epoch": 3.393994540491356, "grad_norm": 0.16573215790159168, "learning_rate": 1.16901226636717e-06, "loss": 0.0022, "step": 7460 }, { "epoch": 3.3944494995450407, "grad_norm": 0.17820294984919155, "learning_rate": 1.168407374080095e-06, "loss": 0.0022, "step": 7461 }, { "epoch": 3.394904458598726, "grad_norm": 0.27960505395045315, "learning_rate": 1.1678025906047552e-06, "loss": 0.0079, "step": 7462 }, { "epoch": 3.3953594176524113, "grad_norm": 0.1082318970258481, "learning_rate": 1.1671979159905724e-06, "loss": 0.0015, "step": 7463 }, { "epoch": 3.395814376706096, "grad_norm": 0.1630492289107023, "learning_rate": 1.1665933502869563e-06, "loss": 0.0009, "step": 7464 }, { "epoch": 3.3962693357597815, "grad_norm": 0.1723375132995701, "learning_rate": 1.1659888935433108e-06, "loss": 0.0014, "step": 7465 }, { "epoch": 3.396724294813467, "grad_norm": 0.14789678345709673, "learning_rate": 1.1653845458090287e-06, "loss": 0.001, "step": 7466 }, { "epoch": 3.397179253867152, "grad_norm": 0.7022074820547839, "learning_rate": 1.1647803071334935e-06, "loss": 0.0088, "step": 7467 }, { "epoch": 3.397634212920837, "grad_norm": 0.39654072307132926, "learning_rate": 1.1641761775660826e-06, "loss": 0.0066, "step": 7468 }, { "epoch": 3.3980891719745223, "grad_norm": 0.16128198138735056, "learning_rate": 1.163572157156162e-06, "loss": 0.0018, "step": 7469 }, { "epoch": 3.3985441310282076, "grad_norm": 0.24189719433157336, "learning_rate": 1.1629682459530898e-06, "loss": 0.0032, "step": 7470 }, { "epoch": 3.3989990900818925, "grad_norm": 0.2332738916905824, "learning_rate": 1.1623644440062133e-06, "loss": 0.0012, "step": 7471 }, { "epoch": 3.399454049135578, "grad_norm": 0.1353241174734508, "learning_rate": 1.1617607513648735e-06, "loss": 0.0017, "step": 7472 }, { "epoch": 3.399909008189263, "grad_norm": 0.3189946353787522, "learning_rate": 1.161157168078403e-06, "loss": 0.0024, "step": 7473 }, { "epoch": 3.400363967242948, "grad_norm": 0.25960918479927786, "learning_rate": 1.1605536941961223e-06, "loss": 0.0025, "step": 7474 }, { "epoch": 3.4008189262966333, "grad_norm": 0.28417459372644216, "learning_rate": 1.159950329767345e-06, "loss": 0.0029, "step": 7475 }, { "epoch": 3.4012738853503186, "grad_norm": 0.4834319251673917, "learning_rate": 1.159347074841375e-06, "loss": 0.0036, "step": 7476 }, { "epoch": 3.4017288444040035, "grad_norm": 0.35302674902617864, "learning_rate": 1.1587439294675067e-06, "loss": 0.003, "step": 7477 }, { "epoch": 3.402183803457689, "grad_norm": 0.5773151818653711, "learning_rate": 1.1581408936950278e-06, "loss": 0.008, "step": 7478 }, { "epoch": 3.402638762511374, "grad_norm": 0.31689349700471015, "learning_rate": 1.157537967573216e-06, "loss": 0.0031, "step": 7479 }, { "epoch": 3.403093721565059, "grad_norm": 0.2575580591248953, "learning_rate": 1.1569351511513388e-06, "loss": 0.0032, "step": 7480 }, { "epoch": 3.4035486806187443, "grad_norm": 0.11924354710102746, "learning_rate": 1.1563324444786562e-06, "loss": 0.0006, "step": 7481 }, { "epoch": 3.4040036396724296, "grad_norm": 0.20371901066000078, "learning_rate": 1.155729847604417e-06, "loss": 0.0035, "step": 7482 }, { "epoch": 3.404458598726115, "grad_norm": 0.2851603088370641, "learning_rate": 1.155127360577865e-06, "loss": 0.0029, "step": 7483 }, { "epoch": 3.4049135577797998, "grad_norm": 0.30561774428063354, "learning_rate": 1.1545249834482319e-06, "loss": 0.0045, "step": 7484 }, { "epoch": 3.405368516833485, "grad_norm": 0.2983224465534028, "learning_rate": 1.1539227162647398e-06, "loss": 0.0021, "step": 7485 }, { "epoch": 3.4058234758871704, "grad_norm": 0.527103890511572, "learning_rate": 1.1533205590766056e-06, "loss": 0.0059, "step": 7486 }, { "epoch": 3.4062784349408552, "grad_norm": 0.34718585532395035, "learning_rate": 1.1527185119330327e-06, "loss": 0.0041, "step": 7487 }, { "epoch": 3.4067333939945406, "grad_norm": 0.07204645744160114, "learning_rate": 1.15211657488322e-06, "loss": 0.0006, "step": 7488 }, { "epoch": 3.407188353048226, "grad_norm": 0.41377410892897254, "learning_rate": 1.1515147479763536e-06, "loss": 0.0039, "step": 7489 }, { "epoch": 3.4076433121019107, "grad_norm": 0.20891795178131728, "learning_rate": 1.1509130312616123e-06, "loss": 0.0036, "step": 7490 }, { "epoch": 3.408098271155596, "grad_norm": 0.4071401483618306, "learning_rate": 1.1503114247881648e-06, "loss": 0.0041, "step": 7491 }, { "epoch": 3.4085532302092814, "grad_norm": 0.19261506078464238, "learning_rate": 1.1497099286051724e-06, "loss": 0.0021, "step": 7492 }, { "epoch": 3.4090081892629662, "grad_norm": 0.29440314443585974, "learning_rate": 1.149108542761788e-06, "loss": 0.0065, "step": 7493 }, { "epoch": 3.4094631483166515, "grad_norm": 0.25028133825100873, "learning_rate": 1.1485072673071522e-06, "loss": 0.0026, "step": 7494 }, { "epoch": 3.409918107370337, "grad_norm": 0.4500438722314447, "learning_rate": 1.1479061022904001e-06, "loss": 0.0054, "step": 7495 }, { "epoch": 3.4103730664240217, "grad_norm": 0.05049270567957581, "learning_rate": 1.147305047760654e-06, "loss": 0.0004, "step": 7496 }, { "epoch": 3.410828025477707, "grad_norm": 0.30873185970010614, "learning_rate": 1.1467041037670315e-06, "loss": 0.0029, "step": 7497 }, { "epoch": 3.4112829845313923, "grad_norm": 0.039256058901190986, "learning_rate": 1.1461032703586383e-06, "loss": 0.0002, "step": 7498 }, { "epoch": 3.411737943585077, "grad_norm": 0.034591769143582876, "learning_rate": 1.1455025475845708e-06, "loss": 0.0002, "step": 7499 }, { "epoch": 3.4121929026387625, "grad_norm": 0.18405734841976704, "learning_rate": 1.1449019354939193e-06, "loss": 0.0008, "step": 7500 }, { "epoch": 3.412647861692448, "grad_norm": 0.12101424285738203, "learning_rate": 1.1443014341357609e-06, "loss": 0.0006, "step": 7501 }, { "epoch": 3.4131028207461327, "grad_norm": 0.26639534434208245, "learning_rate": 1.143701043559168e-06, "loss": 0.0027, "step": 7502 }, { "epoch": 3.413557779799818, "grad_norm": 0.31436943580910903, "learning_rate": 1.1431007638132008e-06, "loss": 0.0055, "step": 7503 }, { "epoch": 3.4140127388535033, "grad_norm": 0.48841614944357314, "learning_rate": 1.1425005949469118e-06, "loss": 0.0045, "step": 7504 }, { "epoch": 3.414467697907188, "grad_norm": 0.4300722618181997, "learning_rate": 1.1419005370093425e-06, "loss": 0.0061, "step": 7505 }, { "epoch": 3.4149226569608735, "grad_norm": 0.20217454320377376, "learning_rate": 1.1413005900495284e-06, "loss": 0.0026, "step": 7506 }, { "epoch": 3.415377616014559, "grad_norm": 0.15660791150463593, "learning_rate": 1.140700754116495e-06, "loss": 0.0029, "step": 7507 }, { "epoch": 3.4158325750682437, "grad_norm": 0.2240380370049241, "learning_rate": 1.1401010292592574e-06, "loss": 0.0015, "step": 7508 }, { "epoch": 3.416287534121929, "grad_norm": 0.35377125036249923, "learning_rate": 1.1395014155268225e-06, "loss": 0.0069, "step": 7509 }, { "epoch": 3.4167424931756143, "grad_norm": 0.029785640860703823, "learning_rate": 1.138901912968188e-06, "loss": 0.0002, "step": 7510 }, { "epoch": 3.417197452229299, "grad_norm": 0.12750585152665958, "learning_rate": 1.1383025216323418e-06, "loss": 0.0009, "step": 7511 }, { "epoch": 3.4176524112829845, "grad_norm": 0.22226874261274449, "learning_rate": 1.1377032415682648e-06, "loss": 0.0017, "step": 7512 }, { "epoch": 3.41810737033667, "grad_norm": 0.12202424675378375, "learning_rate": 1.1371040728249258e-06, "loss": 0.0006, "step": 7513 }, { "epoch": 3.4185623293903546, "grad_norm": 0.3764303232367491, "learning_rate": 1.1365050154512883e-06, "loss": 0.0032, "step": 7514 }, { "epoch": 3.41901728844404, "grad_norm": 0.15649335636934697, "learning_rate": 1.1359060694963036e-06, "loss": 0.0007, "step": 7515 }, { "epoch": 3.4194722474977253, "grad_norm": 0.3801829028136898, "learning_rate": 1.1353072350089136e-06, "loss": 0.0034, "step": 7516 }, { "epoch": 3.41992720655141, "grad_norm": 0.18079642996971482, "learning_rate": 1.1347085120380543e-06, "loss": 0.0011, "step": 7517 }, { "epoch": 3.4203821656050954, "grad_norm": 0.31535186764915074, "learning_rate": 1.13410990063265e-06, "loss": 0.0028, "step": 7518 }, { "epoch": 3.4208371246587808, "grad_norm": 0.5135722538486109, "learning_rate": 1.1335114008416163e-06, "loss": 0.0042, "step": 7519 }, { "epoch": 3.421292083712466, "grad_norm": 0.2302363756775075, "learning_rate": 1.1329130127138588e-06, "loss": 0.0028, "step": 7520 }, { "epoch": 3.421747042766151, "grad_norm": 0.21521816454541087, "learning_rate": 1.1323147362982761e-06, "loss": 0.0018, "step": 7521 }, { "epoch": 3.4222020018198362, "grad_norm": 0.16489149362819527, "learning_rate": 1.1317165716437581e-06, "loss": 0.0009, "step": 7522 }, { "epoch": 3.4226569608735216, "grad_norm": 0.17221057439018994, "learning_rate": 1.1311185187991825e-06, "loss": 0.0011, "step": 7523 }, { "epoch": 3.4231119199272064, "grad_norm": 0.30040132349771637, "learning_rate": 1.1305205778134195e-06, "loss": 0.0048, "step": 7524 }, { "epoch": 3.4235668789808917, "grad_norm": 0.2755625462113418, "learning_rate": 1.1299227487353297e-06, "loss": 0.0021, "step": 7525 }, { "epoch": 3.424021838034577, "grad_norm": 0.06981260623075582, "learning_rate": 1.1293250316137666e-06, "loss": 0.0005, "step": 7526 }, { "epoch": 3.424476797088262, "grad_norm": 0.2781768359104749, "learning_rate": 1.1287274264975711e-06, "loss": 0.0026, "step": 7527 }, { "epoch": 3.4249317561419472, "grad_norm": 0.36757513718364654, "learning_rate": 1.1281299334355785e-06, "loss": 0.0053, "step": 7528 }, { "epoch": 3.4253867151956325, "grad_norm": 0.3618925328397377, "learning_rate": 1.1275325524766127e-06, "loss": 0.0029, "step": 7529 }, { "epoch": 3.4258416742493174, "grad_norm": 0.37064123557956447, "learning_rate": 1.1269352836694874e-06, "loss": 0.0017, "step": 7530 }, { "epoch": 3.4262966333030027, "grad_norm": 0.527657828435296, "learning_rate": 1.126338127063011e-06, "loss": 0.0012, "step": 7531 }, { "epoch": 3.426751592356688, "grad_norm": 0.17524878833250662, "learning_rate": 1.1257410827059795e-06, "loss": 0.0006, "step": 7532 }, { "epoch": 3.427206551410373, "grad_norm": 0.45464631762752045, "learning_rate": 1.1251441506471807e-06, "loss": 0.003, "step": 7533 }, { "epoch": 3.427661510464058, "grad_norm": 0.24662093183054024, "learning_rate": 1.1245473309353922e-06, "loss": 0.0022, "step": 7534 }, { "epoch": 3.4281164695177435, "grad_norm": 0.06749908415638106, "learning_rate": 1.1239506236193843e-06, "loss": 0.0003, "step": 7535 }, { "epoch": 3.4285714285714284, "grad_norm": 0.18265281507761805, "learning_rate": 1.1233540287479182e-06, "loss": 0.0009, "step": 7536 }, { "epoch": 3.4290263876251137, "grad_norm": 0.4156306928332177, "learning_rate": 1.122757546369744e-06, "loss": 0.01, "step": 7537 }, { "epoch": 3.429481346678799, "grad_norm": 0.4515886517335468, "learning_rate": 1.1221611765336035e-06, "loss": 0.0048, "step": 7538 }, { "epoch": 3.4299363057324843, "grad_norm": 0.30186362566078495, "learning_rate": 1.1215649192882283e-06, "loss": 0.0044, "step": 7539 }, { "epoch": 3.430391264786169, "grad_norm": 0.0697800921392459, "learning_rate": 1.120968774682344e-06, "loss": 0.0005, "step": 7540 }, { "epoch": 3.4308462238398545, "grad_norm": 0.1238360149544028, "learning_rate": 1.120372742764663e-06, "loss": 0.0015, "step": 7541 }, { "epoch": 3.43130118289354, "grad_norm": 0.05082722165465316, "learning_rate": 1.1197768235838917e-06, "loss": 0.0004, "step": 7542 }, { "epoch": 3.4317561419472247, "grad_norm": 0.0638792863234942, "learning_rate": 1.1191810171887258e-06, "loss": 0.0007, "step": 7543 }, { "epoch": 3.43221110100091, "grad_norm": 0.2219390049223841, "learning_rate": 1.1185853236278513e-06, "loss": 0.0011, "step": 7544 }, { "epoch": 3.4326660600545953, "grad_norm": 0.20035487890269807, "learning_rate": 1.1179897429499447e-06, "loss": 0.0015, "step": 7545 }, { "epoch": 3.43312101910828, "grad_norm": 0.2762693668699195, "learning_rate": 1.1173942752036762e-06, "loss": 0.0041, "step": 7546 }, { "epoch": 3.4335759781619655, "grad_norm": 0.34281246886847544, "learning_rate": 1.1167989204377036e-06, "loss": 0.0057, "step": 7547 }, { "epoch": 3.434030937215651, "grad_norm": 0.2698554614152204, "learning_rate": 1.116203678700676e-06, "loss": 0.0062, "step": 7548 }, { "epoch": 3.4344858962693356, "grad_norm": 0.19258844093183028, "learning_rate": 1.1156085500412355e-06, "loss": 0.0026, "step": 7549 }, { "epoch": 3.434940855323021, "grad_norm": 0.2253166282033809, "learning_rate": 1.1150135345080115e-06, "loss": 0.002, "step": 7550 }, { "epoch": 3.4353958143767063, "grad_norm": 0.3903266506542269, "learning_rate": 1.1144186321496279e-06, "loss": 0.0033, "step": 7551 }, { "epoch": 3.435850773430391, "grad_norm": 0.4873995223754006, "learning_rate": 1.113823843014696e-06, "loss": 0.0054, "step": 7552 }, { "epoch": 3.4363057324840764, "grad_norm": 0.49511273603583417, "learning_rate": 1.1132291671518203e-06, "loss": 0.0064, "step": 7553 }, { "epoch": 3.4367606915377618, "grad_norm": 0.22525515129857812, "learning_rate": 1.1126346046095932e-06, "loss": 0.0023, "step": 7554 }, { "epoch": 3.4372156505914466, "grad_norm": 0.22662528509742613, "learning_rate": 1.1120401554366012e-06, "loss": 0.0011, "step": 7555 }, { "epoch": 3.437670609645132, "grad_norm": 0.07737456506402281, "learning_rate": 1.1114458196814204e-06, "loss": 0.0004, "step": 7556 }, { "epoch": 3.4381255686988172, "grad_norm": 0.29384777082197355, "learning_rate": 1.1108515973926168e-06, "loss": 0.002, "step": 7557 }, { "epoch": 3.438580527752502, "grad_norm": 0.16614751253212814, "learning_rate": 1.110257488618747e-06, "loss": 0.0012, "step": 7558 }, { "epoch": 3.4390354868061874, "grad_norm": 0.5885316123099474, "learning_rate": 1.1096634934083586e-06, "loss": 0.0089, "step": 7559 }, { "epoch": 3.4394904458598727, "grad_norm": 0.2679052753387537, "learning_rate": 1.1090696118099914e-06, "loss": 0.0019, "step": 7560 }, { "epoch": 3.4399454049135576, "grad_norm": 0.261897017669284, "learning_rate": 1.1084758438721744e-06, "loss": 0.0021, "step": 7561 }, { "epoch": 3.440400363967243, "grad_norm": 0.20800993797725092, "learning_rate": 1.1078821896434264e-06, "loss": 0.0022, "step": 7562 }, { "epoch": 3.4408553230209282, "grad_norm": 0.1393938050674506, "learning_rate": 1.10728864917226e-06, "loss": 0.0007, "step": 7563 }, { "epoch": 3.441310282074613, "grad_norm": 0.40258058431919674, "learning_rate": 1.1066952225071751e-06, "loss": 0.0041, "step": 7564 }, { "epoch": 3.4417652411282984, "grad_norm": 0.20934668360011538, "learning_rate": 1.1061019096966648e-06, "loss": 0.0026, "step": 7565 }, { "epoch": 3.4422202001819837, "grad_norm": 0.16280155131079638, "learning_rate": 1.1055087107892124e-06, "loss": 0.0016, "step": 7566 }, { "epoch": 3.4426751592356686, "grad_norm": 0.3780022201224599, "learning_rate": 1.1049156258332903e-06, "loss": 0.0077, "step": 7567 }, { "epoch": 3.443130118289354, "grad_norm": 0.34213201939895876, "learning_rate": 1.1043226548773622e-06, "loss": 0.0033, "step": 7568 }, { "epoch": 3.443585077343039, "grad_norm": 0.14949591786913358, "learning_rate": 1.1037297979698837e-06, "loss": 0.0018, "step": 7569 }, { "epoch": 3.444040036396724, "grad_norm": 0.44923117957898956, "learning_rate": 1.1031370551593018e-06, "loss": 0.0055, "step": 7570 }, { "epoch": 3.4444949954504094, "grad_norm": 0.23573671942125854, "learning_rate": 1.1025444264940515e-06, "loss": 0.0032, "step": 7571 }, { "epoch": 3.4449499545040947, "grad_norm": 0.452379190711946, "learning_rate": 1.10195191202256e-06, "loss": 0.0051, "step": 7572 }, { "epoch": 3.4454049135577796, "grad_norm": 0.31390643545159996, "learning_rate": 1.1013595117932437e-06, "loss": 0.0019, "step": 7573 }, { "epoch": 3.445859872611465, "grad_norm": 0.15340152234159737, "learning_rate": 1.1007672258545126e-06, "loss": 0.0011, "step": 7574 }, { "epoch": 3.44631483166515, "grad_norm": 0.2395094646581573, "learning_rate": 1.100175054254765e-06, "loss": 0.0017, "step": 7575 }, { "epoch": 3.4467697907188355, "grad_norm": 0.2934290129058164, "learning_rate": 1.0995829970423898e-06, "loss": 0.003, "step": 7576 }, { "epoch": 3.4472247497725204, "grad_norm": 0.27031821593485833, "learning_rate": 1.0989910542657686e-06, "loss": 0.0008, "step": 7577 }, { "epoch": 3.4476797088262057, "grad_norm": 0.12810394770673902, "learning_rate": 1.0983992259732707e-06, "loss": 0.0006, "step": 7578 }, { "epoch": 3.448134667879891, "grad_norm": 0.40369912582337697, "learning_rate": 1.0978075122132592e-06, "loss": 0.0057, "step": 7579 }, { "epoch": 3.448589626933576, "grad_norm": 0.01798149742416836, "learning_rate": 1.0972159130340857e-06, "loss": 0.0001, "step": 7580 }, { "epoch": 3.449044585987261, "grad_norm": 0.10591441371310088, "learning_rate": 1.0966244284840926e-06, "loss": 0.0005, "step": 7581 }, { "epoch": 3.4494995450409465, "grad_norm": 0.1173205441043991, "learning_rate": 1.096033058611614e-06, "loss": 0.0005, "step": 7582 }, { "epoch": 3.4499545040946313, "grad_norm": 0.3585892286205243, "learning_rate": 1.0954418034649724e-06, "loss": 0.0022, "step": 7583 }, { "epoch": 3.4504094631483166, "grad_norm": 0.3926795681389428, "learning_rate": 1.0948506630924839e-06, "loss": 0.0059, "step": 7584 }, { "epoch": 3.450864422202002, "grad_norm": 0.11339160884314081, "learning_rate": 1.0942596375424544e-06, "loss": 0.0013, "step": 7585 }, { "epoch": 3.451319381255687, "grad_norm": 0.1136915964928837, "learning_rate": 1.093668726863179e-06, "loss": 0.0004, "step": 7586 }, { "epoch": 3.451774340309372, "grad_norm": 0.13573637410986836, "learning_rate": 1.0930779311029444e-06, "loss": 0.0013, "step": 7587 }, { "epoch": 3.4522292993630574, "grad_norm": 0.22529662036180098, "learning_rate": 1.0924872503100268e-06, "loss": 0.0011, "step": 7588 }, { "epoch": 3.4526842584167423, "grad_norm": 0.2621684187239149, "learning_rate": 1.0918966845326955e-06, "loss": 0.004, "step": 7589 }, { "epoch": 3.4531392174704276, "grad_norm": 0.2605987337393373, "learning_rate": 1.0913062338192076e-06, "loss": 0.0025, "step": 7590 }, { "epoch": 3.453594176524113, "grad_norm": 0.16794660953797946, "learning_rate": 1.0907158982178135e-06, "loss": 0.0015, "step": 7591 }, { "epoch": 3.4540491355777982, "grad_norm": 0.7094376951149104, "learning_rate": 1.0901256777767519e-06, "loss": 0.0109, "step": 7592 }, { "epoch": 3.454504094631483, "grad_norm": 0.11600773913377355, "learning_rate": 1.0895355725442519e-06, "loss": 0.001, "step": 7593 }, { "epoch": 3.4549590536851684, "grad_norm": 0.3132160848207166, "learning_rate": 1.0889455825685364e-06, "loss": 0.003, "step": 7594 }, { "epoch": 3.4554140127388537, "grad_norm": 0.460982241016577, "learning_rate": 1.0883557078978155e-06, "loss": 0.0073, "step": 7595 }, { "epoch": 3.4558689717925386, "grad_norm": 0.43520199516267966, "learning_rate": 1.0877659485802914e-06, "loss": 0.0088, "step": 7596 }, { "epoch": 3.456323930846224, "grad_norm": 0.11481880083510043, "learning_rate": 1.0871763046641553e-06, "loss": 0.0009, "step": 7597 }, { "epoch": 3.4567788898999092, "grad_norm": 0.6027251844206142, "learning_rate": 1.0865867761975916e-06, "loss": 0.0092, "step": 7598 }, { "epoch": 3.457233848953594, "grad_norm": 0.3339593314837236, "learning_rate": 1.0859973632287742e-06, "loss": 0.0033, "step": 7599 }, { "epoch": 3.4576888080072794, "grad_norm": 0.2879660025229417, "learning_rate": 1.0854080658058669e-06, "loss": 0.0016, "step": 7600 }, { "epoch": 3.4581437670609647, "grad_norm": 0.20891754156331113, "learning_rate": 1.084818883977024e-06, "loss": 0.0013, "step": 7601 }, { "epoch": 3.4585987261146496, "grad_norm": 0.04685111750052381, "learning_rate": 1.0842298177903904e-06, "loss": 0.0003, "step": 7602 }, { "epoch": 3.459053685168335, "grad_norm": 0.21487212200559497, "learning_rate": 1.0836408672941034e-06, "loss": 0.0028, "step": 7603 }, { "epoch": 3.45950864422202, "grad_norm": 0.22905916462749198, "learning_rate": 1.0830520325362876e-06, "loss": 0.0027, "step": 7604 }, { "epoch": 3.459963603275705, "grad_norm": 0.18844535056205866, "learning_rate": 1.0824633135650614e-06, "loss": 0.0017, "step": 7605 }, { "epoch": 3.4604185623293904, "grad_norm": 0.1030651449705298, "learning_rate": 1.081874710428532e-06, "loss": 0.0005, "step": 7606 }, { "epoch": 3.4608735213830757, "grad_norm": 0.2719978354732743, "learning_rate": 1.081286223174796e-06, "loss": 0.0018, "step": 7607 }, { "epoch": 3.4613284804367606, "grad_norm": 0.22529995059356012, "learning_rate": 1.080697851851944e-06, "loss": 0.0025, "step": 7608 }, { "epoch": 3.461783439490446, "grad_norm": 0.4426482010761282, "learning_rate": 1.080109596508054e-06, "loss": 0.0051, "step": 7609 }, { "epoch": 3.462238398544131, "grad_norm": 0.4529426726624976, "learning_rate": 1.0795214571911955e-06, "loss": 0.0094, "step": 7610 }, { "epoch": 3.462693357597816, "grad_norm": 0.24776144954047133, "learning_rate": 1.0789334339494278e-06, "loss": 0.003, "step": 7611 }, { "epoch": 3.4631483166515014, "grad_norm": 0.5947551066501249, "learning_rate": 1.0783455268308026e-06, "loss": 0.0067, "step": 7612 }, { "epoch": 3.4636032757051867, "grad_norm": 0.42214823391777234, "learning_rate": 1.0777577358833615e-06, "loss": 0.0039, "step": 7613 }, { "epoch": 3.4640582347588715, "grad_norm": 0.17602123703913636, "learning_rate": 1.0771700611551355e-06, "loss": 0.0016, "step": 7614 }, { "epoch": 3.464513193812557, "grad_norm": 0.12086827715672131, "learning_rate": 1.0765825026941467e-06, "loss": 0.0013, "step": 7615 }, { "epoch": 3.464968152866242, "grad_norm": 0.2452253508820482, "learning_rate": 1.075995060548408e-06, "loss": 0.0013, "step": 7616 }, { "epoch": 3.465423111919927, "grad_norm": 0.1273078312071546, "learning_rate": 1.0754077347659209e-06, "loss": 0.0008, "step": 7617 }, { "epoch": 3.4658780709736123, "grad_norm": 0.22234403432070154, "learning_rate": 1.0748205253946804e-06, "loss": 0.0029, "step": 7618 }, { "epoch": 3.4663330300272976, "grad_norm": 0.05937342631081049, "learning_rate": 1.0742334324826715e-06, "loss": 0.0003, "step": 7619 }, { "epoch": 3.4667879890809825, "grad_norm": 0.7965257634547764, "learning_rate": 1.0736464560778675e-06, "loss": 0.0127, "step": 7620 }, { "epoch": 3.467242948134668, "grad_norm": 0.2858245177491467, "learning_rate": 1.073059596228234e-06, "loss": 0.0042, "step": 7621 }, { "epoch": 3.467697907188353, "grad_norm": 0.2319207233659183, "learning_rate": 1.0724728529817253e-06, "loss": 0.0015, "step": 7622 }, { "epoch": 3.468152866242038, "grad_norm": 0.33670372239227275, "learning_rate": 1.0718862263862892e-06, "loss": 0.0027, "step": 7623 }, { "epoch": 3.4686078252957233, "grad_norm": 0.1632472657725517, "learning_rate": 1.0712997164898616e-06, "loss": 0.0016, "step": 7624 }, { "epoch": 3.4690627843494086, "grad_norm": 0.7490081671500421, "learning_rate": 1.0707133233403682e-06, "loss": 0.0132, "step": 7625 }, { "epoch": 3.4695177434030935, "grad_norm": 0.029752586101486657, "learning_rate": 1.0701270469857282e-06, "loss": 0.0002, "step": 7626 }, { "epoch": 3.469972702456779, "grad_norm": 0.14814992177146827, "learning_rate": 1.069540887473848e-06, "loss": 0.0012, "step": 7627 }, { "epoch": 3.470427661510464, "grad_norm": 0.2598430464839612, "learning_rate": 1.0689548448526273e-06, "loss": 0.0022, "step": 7628 }, { "epoch": 3.470882620564149, "grad_norm": 0.027553672271503838, "learning_rate": 1.0683689191699544e-06, "loss": 0.0002, "step": 7629 }, { "epoch": 3.4713375796178343, "grad_norm": 0.5947324986769041, "learning_rate": 1.067783110473708e-06, "loss": 0.0119, "step": 7630 }, { "epoch": 3.4717925386715196, "grad_norm": 0.27577615659701565, "learning_rate": 1.0671974188117573e-06, "loss": 0.0042, "step": 7631 }, { "epoch": 3.472247497725205, "grad_norm": 0.07150316761124101, "learning_rate": 1.0666118442319628e-06, "loss": 0.0004, "step": 7632 }, { "epoch": 3.47270245677889, "grad_norm": 0.3330107110252563, "learning_rate": 1.0660263867821763e-06, "loss": 0.0072, "step": 7633 }, { "epoch": 3.473157415832575, "grad_norm": 0.23203391006874863, "learning_rate": 1.0654410465102376e-06, "loss": 0.0018, "step": 7634 }, { "epoch": 3.4736123748862604, "grad_norm": 0.12877393880443339, "learning_rate": 1.0648558234639783e-06, "loss": 0.0008, "step": 7635 }, { "epoch": 3.4740673339399453, "grad_norm": 0.13418138342482436, "learning_rate": 1.064270717691219e-06, "loss": 0.0013, "step": 7636 }, { "epoch": 3.4745222929936306, "grad_norm": 0.3783967052767402, "learning_rate": 1.063685729239774e-06, "loss": 0.0025, "step": 7637 }, { "epoch": 3.474977252047316, "grad_norm": 0.19858418031122035, "learning_rate": 1.0631008581574448e-06, "loss": 0.0034, "step": 7638 }, { "epoch": 3.4754322111010008, "grad_norm": 0.2156779614404021, "learning_rate": 1.0625161044920238e-06, "loss": 0.0008, "step": 7639 }, { "epoch": 3.475887170154686, "grad_norm": 0.6613144420612114, "learning_rate": 1.0619314682912956e-06, "loss": 0.0066, "step": 7640 }, { "epoch": 3.4763421292083714, "grad_norm": 0.1942106748789676, "learning_rate": 1.0613469496030329e-06, "loss": 0.002, "step": 7641 }, { "epoch": 3.4767970882620562, "grad_norm": 0.22577554682960108, "learning_rate": 1.0607625484750014e-06, "loss": 0.0026, "step": 7642 }, { "epoch": 3.4772520473157416, "grad_norm": 0.3594566860080508, "learning_rate": 1.060178264954955e-06, "loss": 0.0029, "step": 7643 }, { "epoch": 3.477707006369427, "grad_norm": 0.3039765347607937, "learning_rate": 1.0595940990906387e-06, "loss": 0.0022, "step": 7644 }, { "epoch": 3.4781619654231117, "grad_norm": 0.3705727201023583, "learning_rate": 1.0590100509297866e-06, "loss": 0.0049, "step": 7645 }, { "epoch": 3.478616924476797, "grad_norm": 0.12351869528431314, "learning_rate": 1.058426120520126e-06, "loss": 0.0007, "step": 7646 }, { "epoch": 3.4790718835304824, "grad_norm": 0.35907740610148187, "learning_rate": 1.0578423079093734e-06, "loss": 0.0067, "step": 7647 }, { "epoch": 3.4795268425841677, "grad_norm": 1.0272405630030712, "learning_rate": 1.0572586131452347e-06, "loss": 0.0051, "step": 7648 }, { "epoch": 3.4799818016378525, "grad_norm": 0.16065157603325442, "learning_rate": 1.0566750362754069e-06, "loss": 0.0013, "step": 7649 }, { "epoch": 3.480436760691538, "grad_norm": 0.38428103378453093, "learning_rate": 1.0560915773475761e-06, "loss": 0.0038, "step": 7650 }, { "epoch": 3.480891719745223, "grad_norm": 0.13836645684792634, "learning_rate": 1.0555082364094222e-06, "loss": 0.0014, "step": 7651 }, { "epoch": 3.481346678798908, "grad_norm": 0.28229460397472456, "learning_rate": 1.0549250135086114e-06, "loss": 0.0053, "step": 7652 }, { "epoch": 3.4818016378525933, "grad_norm": 0.29665222694380444, "learning_rate": 1.054341908692802e-06, "loss": 0.0021, "step": 7653 }, { "epoch": 3.4822565969062786, "grad_norm": 0.18599930115063532, "learning_rate": 1.0537589220096441e-06, "loss": 0.0023, "step": 7654 }, { "epoch": 3.4827115559599635, "grad_norm": 0.3782434479743471, "learning_rate": 1.0531760535067762e-06, "loss": 0.0054, "step": 7655 }, { "epoch": 3.483166515013649, "grad_norm": 0.2700216688524689, "learning_rate": 1.0525933032318264e-06, "loss": 0.0026, "step": 7656 }, { "epoch": 3.483621474067334, "grad_norm": 0.4599613219547209, "learning_rate": 1.052010671232416e-06, "loss": 0.0022, "step": 7657 }, { "epoch": 3.484076433121019, "grad_norm": 0.18939470262761046, "learning_rate": 1.051428157556155e-06, "loss": 0.0026, "step": 7658 }, { "epoch": 3.4845313921747043, "grad_norm": 0.1229914420533985, "learning_rate": 1.050845762250643e-06, "loss": 0.0009, "step": 7659 }, { "epoch": 3.4849863512283896, "grad_norm": 0.1145652328015566, "learning_rate": 1.05026348536347e-06, "loss": 0.0005, "step": 7660 }, { "epoch": 3.4854413102820745, "grad_norm": 0.3258790888451917, "learning_rate": 1.049681326942218e-06, "loss": 0.0041, "step": 7661 }, { "epoch": 3.48589626933576, "grad_norm": 0.347590897028885, "learning_rate": 1.0490992870344593e-06, "loss": 0.0055, "step": 7662 }, { "epoch": 3.486351228389445, "grad_norm": 0.042801537347389784, "learning_rate": 1.0485173656877547e-06, "loss": 0.0002, "step": 7663 }, { "epoch": 3.48680618744313, "grad_norm": 0.1380741382637674, "learning_rate": 1.0479355629496563e-06, "loss": 0.0009, "step": 7664 }, { "epoch": 3.4872611464968153, "grad_norm": 0.22140529829629996, "learning_rate": 1.0473538788677051e-06, "loss": 0.0016, "step": 7665 }, { "epoch": 3.4877161055505006, "grad_norm": 0.3354377595588078, "learning_rate": 1.0467723134894359e-06, "loss": 0.001, "step": 7666 }, { "epoch": 3.4881710646041855, "grad_norm": 0.09605374822088676, "learning_rate": 1.0461908668623697e-06, "loss": 0.0006, "step": 7667 }, { "epoch": 3.488626023657871, "grad_norm": 0.22026844298734613, "learning_rate": 1.0456095390340213e-06, "loss": 0.0014, "step": 7668 }, { "epoch": 3.489080982711556, "grad_norm": 0.1532201873515956, "learning_rate": 1.0450283300518933e-06, "loss": 0.0012, "step": 7669 }, { "epoch": 3.489535941765241, "grad_norm": 0.23313941513772174, "learning_rate": 1.0444472399634786e-06, "loss": 0.0015, "step": 7670 }, { "epoch": 3.4899909008189263, "grad_norm": 0.36697013186413435, "learning_rate": 1.0438662688162635e-06, "loss": 0.0019, "step": 7671 }, { "epoch": 3.4904458598726116, "grad_norm": 0.032458973505422734, "learning_rate": 1.0432854166577207e-06, "loss": 0.0003, "step": 7672 }, { "epoch": 3.4909008189262964, "grad_norm": 0.24775523563367993, "learning_rate": 1.0427046835353154e-06, "loss": 0.0023, "step": 7673 }, { "epoch": 3.4913557779799818, "grad_norm": 0.2222201479788081, "learning_rate": 1.0421240694965012e-06, "loss": 0.0021, "step": 7674 }, { "epoch": 3.491810737033667, "grad_norm": 0.22735083415415752, "learning_rate": 1.0415435745887245e-06, "loss": 0.0031, "step": 7675 }, { "epoch": 3.492265696087352, "grad_norm": 0.18459799111238054, "learning_rate": 1.0409631988594216e-06, "loss": 0.0031, "step": 7676 }, { "epoch": 3.4927206551410372, "grad_norm": 0.12072921104140445, "learning_rate": 1.0403829423560168e-06, "loss": 0.0013, "step": 7677 }, { "epoch": 3.4931756141947226, "grad_norm": 0.3523924703402345, "learning_rate": 1.0398028051259266e-06, "loss": 0.0032, "step": 7678 }, { "epoch": 3.4936305732484074, "grad_norm": 0.19022918045449339, "learning_rate": 1.0392227872165557e-06, "loss": 0.0013, "step": 7679 }, { "epoch": 3.4940855323020927, "grad_norm": 0.430409696607159, "learning_rate": 1.038642888675303e-06, "loss": 0.0067, "step": 7680 }, { "epoch": 3.494540491355778, "grad_norm": 0.32745140632415454, "learning_rate": 1.0380631095495532e-06, "loss": 0.0026, "step": 7681 }, { "epoch": 3.494995450409463, "grad_norm": 0.31591976728223636, "learning_rate": 1.037483449886685e-06, "loss": 0.003, "step": 7682 }, { "epoch": 3.4954504094631482, "grad_norm": 0.17502361169218275, "learning_rate": 1.0369039097340644e-06, "loss": 0.001, "step": 7683 }, { "epoch": 3.4959053685168335, "grad_norm": 0.14621527436336784, "learning_rate": 1.036324489139048e-06, "loss": 0.0022, "step": 7684 }, { "epoch": 3.496360327570519, "grad_norm": 0.28543905866330915, "learning_rate": 1.0357451881489858e-06, "loss": 0.0042, "step": 7685 }, { "epoch": 3.4968152866242037, "grad_norm": 0.5125381305586044, "learning_rate": 1.0351660068112138e-06, "loss": 0.0047, "step": 7686 }, { "epoch": 3.497270245677889, "grad_norm": 0.31253665332230807, "learning_rate": 1.0345869451730609e-06, "loss": 0.0013, "step": 7687 }, { "epoch": 3.4977252047315743, "grad_norm": 0.10879589397886483, "learning_rate": 1.0340080032818442e-06, "loss": 0.0006, "step": 7688 }, { "epoch": 3.498180163785259, "grad_norm": 0.4416000230069858, "learning_rate": 1.0334291811848736e-06, "loss": 0.0078, "step": 7689 }, { "epoch": 3.4986351228389445, "grad_norm": 0.4119637717662131, "learning_rate": 1.032850478929447e-06, "loss": 0.0049, "step": 7690 }, { "epoch": 3.49909008189263, "grad_norm": 0.2012240555282583, "learning_rate": 1.0322718965628542e-06, "loss": 0.0022, "step": 7691 }, { "epoch": 3.4995450409463147, "grad_norm": 0.3950095927726189, "learning_rate": 1.031693434132374e-06, "loss": 0.0058, "step": 7692 }, { "epoch": 3.5, "grad_norm": 0.4799132514313561, "learning_rate": 1.0311150916852755e-06, "loss": 0.0117, "step": 7693 }, { "epoch": 3.5004549590536853, "grad_norm": 0.4200973356156511, "learning_rate": 1.0305368692688175e-06, "loss": 0.0052, "step": 7694 }, { "epoch": 3.50090991810737, "grad_norm": 0.25387487352062027, "learning_rate": 1.0299587669302501e-06, "loss": 0.0018, "step": 7695 }, { "epoch": 3.5013648771610555, "grad_norm": 0.18153786006811062, "learning_rate": 1.029380784716815e-06, "loss": 0.0024, "step": 7696 }, { "epoch": 3.501819836214741, "grad_norm": 0.15545920378635616, "learning_rate": 1.0288029226757407e-06, "loss": 0.0015, "step": 7697 }, { "epoch": 3.502274795268426, "grad_norm": 0.11893453347764651, "learning_rate": 1.0282251808542476e-06, "loss": 0.0006, "step": 7698 }, { "epoch": 3.502729754322111, "grad_norm": 0.31656334783886014, "learning_rate": 1.0276475592995455e-06, "loss": 0.0012, "step": 7699 }, { "epoch": 3.5031847133757963, "grad_norm": 0.35782043587709783, "learning_rate": 1.0270700580588367e-06, "loss": 0.0041, "step": 7700 }, { "epoch": 3.5036396724294816, "grad_norm": 0.48417090829700743, "learning_rate": 1.026492677179311e-06, "loss": 0.0051, "step": 7701 }, { "epoch": 3.5040946314831665, "grad_norm": 0.23617935133737974, "learning_rate": 1.0259154167081484e-06, "loss": 0.0034, "step": 7702 }, { "epoch": 3.5045495905368518, "grad_norm": 0.16743589314778942, "learning_rate": 1.0253382766925222e-06, "loss": 0.0013, "step": 7703 }, { "epoch": 3.505004549590537, "grad_norm": 0.4538252643201241, "learning_rate": 1.0247612571795914e-06, "loss": 0.0058, "step": 7704 }, { "epoch": 3.505459508644222, "grad_norm": 0.2028275224910937, "learning_rate": 1.0241843582165095e-06, "loss": 0.0014, "step": 7705 }, { "epoch": 3.5059144676979073, "grad_norm": 0.39978545631827184, "learning_rate": 1.0236075798504172e-06, "loss": 0.0024, "step": 7706 }, { "epoch": 3.5063694267515926, "grad_norm": 0.335510896442666, "learning_rate": 1.023030922128446e-06, "loss": 0.0021, "step": 7707 }, { "epoch": 3.5068243858052774, "grad_norm": 0.2715866146447563, "learning_rate": 1.022454385097717e-06, "loss": 0.0039, "step": 7708 }, { "epoch": 3.5072793448589628, "grad_norm": 0.20823687173743463, "learning_rate": 1.021877968805343e-06, "loss": 0.0013, "step": 7709 }, { "epoch": 3.507734303912648, "grad_norm": 0.23111098530309537, "learning_rate": 1.0213016732984276e-06, "loss": 0.0011, "step": 7710 }, { "epoch": 3.508189262966333, "grad_norm": 0.22656692501222195, "learning_rate": 1.0207254986240615e-06, "loss": 0.0031, "step": 7711 }, { "epoch": 3.5086442220200182, "grad_norm": 0.15570006269198586, "learning_rate": 1.0201494448293272e-06, "loss": 0.0018, "step": 7712 }, { "epoch": 3.5090991810737036, "grad_norm": 0.29643229023996104, "learning_rate": 1.0195735119612965e-06, "loss": 0.0017, "step": 7713 }, { "epoch": 3.5095541401273884, "grad_norm": 0.10151267911699977, "learning_rate": 1.0189977000670338e-06, "loss": 0.0011, "step": 7714 }, { "epoch": 3.5100090991810737, "grad_norm": 0.17235180532613997, "learning_rate": 1.0184220091935906e-06, "loss": 0.0023, "step": 7715 }, { "epoch": 3.510464058234759, "grad_norm": 0.18307991182798428, "learning_rate": 1.0178464393880095e-06, "loss": 0.0014, "step": 7716 }, { "epoch": 3.510919017288444, "grad_norm": 0.24528290947278478, "learning_rate": 1.017270990697325e-06, "loss": 0.0028, "step": 7717 }, { "epoch": 3.511373976342129, "grad_norm": 0.28198525864935803, "learning_rate": 1.0166956631685578e-06, "loss": 0.0022, "step": 7718 }, { "epoch": 3.5118289353958145, "grad_norm": 0.1802045594365712, "learning_rate": 1.016120456848724e-06, "loss": 0.0015, "step": 7719 }, { "epoch": 3.5122838944494994, "grad_norm": 0.035516954463711954, "learning_rate": 1.015545371784825e-06, "loss": 0.0002, "step": 7720 }, { "epoch": 3.5127388535031847, "grad_norm": 0.18292378275536114, "learning_rate": 1.0149704080238542e-06, "loss": 0.0015, "step": 7721 }, { "epoch": 3.51319381255687, "grad_norm": 0.13401248919585507, "learning_rate": 1.0143955656127958e-06, "loss": 0.0011, "step": 7722 }, { "epoch": 3.513648771610555, "grad_norm": 0.45857821787009717, "learning_rate": 1.0138208445986208e-06, "loss": 0.0046, "step": 7723 }, { "epoch": 3.51410373066424, "grad_norm": 0.12897298253731818, "learning_rate": 1.0132462450282969e-06, "loss": 0.0013, "step": 7724 }, { "epoch": 3.5145586897179255, "grad_norm": 0.2954284451777006, "learning_rate": 1.0126717669487753e-06, "loss": 0.0024, "step": 7725 }, { "epoch": 3.5150136487716104, "grad_norm": 0.09239249962477761, "learning_rate": 1.0120974104070005e-06, "loss": 0.0007, "step": 7726 }, { "epoch": 3.5154686078252957, "grad_norm": 0.23881489996894453, "learning_rate": 1.011523175449906e-06, "loss": 0.0028, "step": 7727 }, { "epoch": 3.515923566878981, "grad_norm": 0.3786391906942184, "learning_rate": 1.0109490621244148e-06, "loss": 0.0022, "step": 7728 }, { "epoch": 3.516378525932666, "grad_norm": 0.27787211552914914, "learning_rate": 1.0103750704774427e-06, "loss": 0.0032, "step": 7729 }, { "epoch": 3.516833484986351, "grad_norm": 0.5341401050887009, "learning_rate": 1.0098012005558916e-06, "loss": 0.004, "step": 7730 }, { "epoch": 3.5172884440400365, "grad_norm": 0.31175654487567134, "learning_rate": 1.0092274524066578e-06, "loss": 0.0049, "step": 7731 }, { "epoch": 3.5177434030937214, "grad_norm": 0.17906841818310687, "learning_rate": 1.0086538260766243e-06, "loss": 0.0015, "step": 7732 }, { "epoch": 3.5181983621474067, "grad_norm": 0.23061499769787855, "learning_rate": 1.0080803216126644e-06, "loss": 0.0035, "step": 7733 }, { "epoch": 3.518653321201092, "grad_norm": 0.2083354993802602, "learning_rate": 1.007506939061644e-06, "loss": 0.0036, "step": 7734 }, { "epoch": 3.519108280254777, "grad_norm": 0.8630241269228571, "learning_rate": 1.0069336784704165e-06, "loss": 0.0155, "step": 7735 }, { "epoch": 3.519563239308462, "grad_norm": 0.11296978277454736, "learning_rate": 1.0063605398858261e-06, "loss": 0.0005, "step": 7736 }, { "epoch": 3.5200181983621475, "grad_norm": 0.38281269549688884, "learning_rate": 1.0057875233547066e-06, "loss": 0.0065, "step": 7737 }, { "epoch": 3.5204731574158323, "grad_norm": 0.3041612323836244, "learning_rate": 1.0052146289238826e-06, "loss": 0.0032, "step": 7738 }, { "epoch": 3.5209281164695176, "grad_norm": 0.3095756919194255, "learning_rate": 1.0046418566401698e-06, "loss": 0.0036, "step": 7739 }, { "epoch": 3.521383075523203, "grad_norm": 0.22004648122401518, "learning_rate": 1.0040692065503712e-06, "loss": 0.0011, "step": 7740 }, { "epoch": 3.521838034576888, "grad_norm": 0.3131651615154017, "learning_rate": 1.0034966787012817e-06, "loss": 0.0017, "step": 7741 }, { "epoch": 3.522292993630573, "grad_norm": 0.3891635784707649, "learning_rate": 1.0029242731396847e-06, "loss": 0.0025, "step": 7742 }, { "epoch": 3.5227479526842584, "grad_norm": 0.38077821819459007, "learning_rate": 1.002351989912356e-06, "loss": 0.0031, "step": 7743 }, { "epoch": 3.5232029117379433, "grad_norm": 0.40947355476305697, "learning_rate": 1.0017798290660585e-06, "loss": 0.0092, "step": 7744 }, { "epoch": 3.5236578707916286, "grad_norm": 0.2370981916252491, "learning_rate": 1.0012077906475484e-06, "loss": 0.0022, "step": 7745 }, { "epoch": 3.524112829845314, "grad_norm": 0.2846921966849832, "learning_rate": 1.0006358747035692e-06, "loss": 0.0064, "step": 7746 }, { "epoch": 3.5245677888989992, "grad_norm": 0.3615407467178303, "learning_rate": 1.0000640812808543e-06, "loss": 0.0058, "step": 7747 }, { "epoch": 3.525022747952684, "grad_norm": 0.2200460712362465, "learning_rate": 9.9949241042613e-07, "loss": 0.0033, "step": 7748 }, { "epoch": 3.5254777070063694, "grad_norm": 0.08602971366695904, "learning_rate": 9.989208621861096e-07, "loss": 0.0007, "step": 7749 }, { "epoch": 3.5259326660600547, "grad_norm": 0.32453936764604535, "learning_rate": 9.983494366074975e-07, "loss": 0.0034, "step": 7750 }, { "epoch": 3.5263876251137396, "grad_norm": 0.3725260666417278, "learning_rate": 9.977781337369875e-07, "loss": 0.006, "step": 7751 }, { "epoch": 3.526842584167425, "grad_norm": 0.4059905210228861, "learning_rate": 9.972069536212638e-07, "loss": 0.0039, "step": 7752 }, { "epoch": 3.52729754322111, "grad_norm": 0.23930785197249896, "learning_rate": 9.966358963070027e-07, "loss": 0.002, "step": 7753 }, { "epoch": 3.5277525022747955, "grad_norm": 0.3903637728252443, "learning_rate": 9.96064961840867e-07, "loss": 0.0075, "step": 7754 }, { "epoch": 3.5282074613284804, "grad_norm": 0.25018979225310717, "learning_rate": 9.954941502695106e-07, "loss": 0.0034, "step": 7755 }, { "epoch": 3.5286624203821657, "grad_norm": 0.266420853599793, "learning_rate": 9.949234616395773e-07, "loss": 0.0026, "step": 7756 }, { "epoch": 3.529117379435851, "grad_norm": 0.5228129103353878, "learning_rate": 9.943528959977028e-07, "loss": 0.0062, "step": 7757 }, { "epoch": 3.529572338489536, "grad_norm": 0.1940766836341541, "learning_rate": 9.937824533905092e-07, "loss": 0.0014, "step": 7758 }, { "epoch": 3.530027297543221, "grad_norm": 0.2619051259716473, "learning_rate": 9.932121338646122e-07, "loss": 0.0038, "step": 7759 }, { "epoch": 3.5304822565969065, "grad_norm": 0.21641169951819617, "learning_rate": 9.926419374666152e-07, "loss": 0.0021, "step": 7760 }, { "epoch": 3.5309372156505914, "grad_norm": 0.40098814576873393, "learning_rate": 9.92071864243112e-07, "loss": 0.0066, "step": 7761 }, { "epoch": 3.5313921747042767, "grad_norm": 0.20987558109507845, "learning_rate": 9.915019142406854e-07, "loss": 0.0028, "step": 7762 }, { "epoch": 3.531847133757962, "grad_norm": 0.3214255086744716, "learning_rate": 9.90932087505911e-07, "loss": 0.0028, "step": 7763 }, { "epoch": 3.532302092811647, "grad_norm": 0.11873590631293578, "learning_rate": 9.90362384085351e-07, "loss": 0.0012, "step": 7764 }, { "epoch": 3.532757051865332, "grad_norm": 0.16944516954965785, "learning_rate": 9.897928040255592e-07, "loss": 0.0021, "step": 7765 }, { "epoch": 3.5332120109190175, "grad_norm": 0.2528623497294166, "learning_rate": 9.8922334737308e-07, "loss": 0.0014, "step": 7766 }, { "epoch": 3.5336669699727024, "grad_norm": 0.09330050569745525, "learning_rate": 9.886540141744456e-07, "loss": 0.0009, "step": 7767 }, { "epoch": 3.5341219290263877, "grad_norm": 0.360369931331059, "learning_rate": 9.880848044761806e-07, "loss": 0.0024, "step": 7768 }, { "epoch": 3.534576888080073, "grad_norm": 0.45517946293270595, "learning_rate": 9.875157183247977e-07, "loss": 0.0047, "step": 7769 }, { "epoch": 3.535031847133758, "grad_norm": 0.2587094623505037, "learning_rate": 9.869467557668002e-07, "loss": 0.0062, "step": 7770 }, { "epoch": 3.535486806187443, "grad_norm": 0.1714380277319724, "learning_rate": 9.863779168486797e-07, "loss": 0.0014, "step": 7771 }, { "epoch": 3.5359417652411285, "grad_norm": 0.5052117192936547, "learning_rate": 9.858092016169207e-07, "loss": 0.009, "step": 7772 }, { "epoch": 3.5363967242948133, "grad_norm": 0.2618867695985881, "learning_rate": 9.852406101179964e-07, "loss": 0.0021, "step": 7773 }, { "epoch": 3.5368516833484986, "grad_norm": 0.18127415522779802, "learning_rate": 9.846721423983692e-07, "loss": 0.0009, "step": 7774 }, { "epoch": 3.537306642402184, "grad_norm": 0.26377805661931647, "learning_rate": 9.841037985044907e-07, "loss": 0.0034, "step": 7775 }, { "epoch": 3.537761601455869, "grad_norm": 0.22532512200429133, "learning_rate": 9.835355784828038e-07, "loss": 0.0019, "step": 7776 }, { "epoch": 3.538216560509554, "grad_norm": 0.21964602810203054, "learning_rate": 9.829674823797417e-07, "loss": 0.0028, "step": 7777 }, { "epoch": 3.5386715195632394, "grad_norm": 0.1464860374986121, "learning_rate": 9.82399510241726e-07, "loss": 0.0017, "step": 7778 }, { "epoch": 3.5391264786169243, "grad_norm": 0.196314729409406, "learning_rate": 9.818316621151683e-07, "loss": 0.0019, "step": 7779 }, { "epoch": 3.5395814376706096, "grad_norm": 0.30238964132578905, "learning_rate": 9.81263938046472e-07, "loss": 0.0051, "step": 7780 }, { "epoch": 3.540036396724295, "grad_norm": 0.1830228709013517, "learning_rate": 9.806963380820271e-07, "loss": 0.0013, "step": 7781 }, { "epoch": 3.54049135577798, "grad_norm": 0.5169785423095711, "learning_rate": 9.801288622682172e-07, "loss": 0.0054, "step": 7782 }, { "epoch": 3.540946314831665, "grad_norm": 0.33419816897562654, "learning_rate": 9.795615106514133e-07, "loss": 0.0018, "step": 7783 }, { "epoch": 3.5414012738853504, "grad_norm": 0.32814249955228897, "learning_rate": 9.789942832779765e-07, "loss": 0.0063, "step": 7784 }, { "epoch": 3.5418562329390353, "grad_norm": 0.2977065766469356, "learning_rate": 9.784271801942568e-07, "loss": 0.0014, "step": 7785 }, { "epoch": 3.5423111919927206, "grad_norm": 0.3813108326943421, "learning_rate": 9.778602014465968e-07, "loss": 0.0036, "step": 7786 }, { "epoch": 3.542766151046406, "grad_norm": 0.2725893747028684, "learning_rate": 9.772933470813281e-07, "loss": 0.002, "step": 7787 }, { "epoch": 3.5432211101000908, "grad_norm": 0.28186801911222525, "learning_rate": 9.767266171447706e-07, "loss": 0.0026, "step": 7788 }, { "epoch": 3.543676069153776, "grad_norm": 0.1339099826531519, "learning_rate": 9.761600116832347e-07, "loss": 0.0023, "step": 7789 }, { "epoch": 3.5441310282074614, "grad_norm": 0.19229933088078544, "learning_rate": 9.755935307430203e-07, "loss": 0.0025, "step": 7790 }, { "epoch": 3.5445859872611463, "grad_norm": 0.24441277652624888, "learning_rate": 9.750271743704195e-07, "loss": 0.0026, "step": 7791 }, { "epoch": 3.5450409463148316, "grad_norm": 0.10865757533306881, "learning_rate": 9.74460942611711e-07, "loss": 0.0009, "step": 7792 }, { "epoch": 3.545495905368517, "grad_norm": 0.2504211981325846, "learning_rate": 9.738948355131642e-07, "loss": 0.0045, "step": 7793 }, { "epoch": 3.5459508644222018, "grad_norm": 0.7387823911648469, "learning_rate": 9.733288531210406e-07, "loss": 0.0108, "step": 7794 }, { "epoch": 3.546405823475887, "grad_norm": 0.2713616903749758, "learning_rate": 9.72762995481588e-07, "loss": 0.0036, "step": 7795 }, { "epoch": 3.5468607825295724, "grad_norm": 0.2582817438952903, "learning_rate": 9.72197262641047e-07, "loss": 0.0017, "step": 7796 }, { "epoch": 3.5473157415832572, "grad_norm": 0.17939560065350516, "learning_rate": 9.716316546456462e-07, "loss": 0.0009, "step": 7797 }, { "epoch": 3.5477707006369426, "grad_norm": 0.27015969751095786, "learning_rate": 9.710661715416048e-07, "loss": 0.0039, "step": 7798 }, { "epoch": 3.548225659690628, "grad_norm": 0.27688632768573707, "learning_rate": 9.70500813375131e-07, "loss": 0.0037, "step": 7799 }, { "epoch": 3.548680618744313, "grad_norm": 0.04515860006320888, "learning_rate": 9.69935580192423e-07, "loss": 0.0004, "step": 7800 }, { "epoch": 3.549135577797998, "grad_norm": 0.3218898903800299, "learning_rate": 9.693704720396693e-07, "loss": 0.0059, "step": 7801 }, { "epoch": 3.5495905368516834, "grad_norm": 0.24088745492965744, "learning_rate": 9.688054889630493e-07, "loss": 0.0006, "step": 7802 }, { "epoch": 3.5500454959053687, "grad_norm": 0.14373212524478762, "learning_rate": 9.682406310087304e-07, "loss": 0.0017, "step": 7803 }, { "epoch": 3.5505004549590535, "grad_norm": 0.43766680045830103, "learning_rate": 9.676758982228693e-07, "loss": 0.0046, "step": 7804 }, { "epoch": 3.550955414012739, "grad_norm": 0.15157674946590954, "learning_rate": 9.67111290651613e-07, "loss": 0.0013, "step": 7805 }, { "epoch": 3.551410373066424, "grad_norm": 0.27746415693251547, "learning_rate": 9.665468083411005e-07, "loss": 0.0012, "step": 7806 }, { "epoch": 3.5518653321201095, "grad_norm": 0.27294429145806565, "learning_rate": 9.659824513374572e-07, "loss": 0.0017, "step": 7807 }, { "epoch": 3.5523202911737943, "grad_norm": 0.13088510424653504, "learning_rate": 9.654182196868012e-07, "loss": 0.0009, "step": 7808 }, { "epoch": 3.5527752502274796, "grad_norm": 0.0839543934776781, "learning_rate": 9.648541134352379e-07, "loss": 0.0004, "step": 7809 }, { "epoch": 3.553230209281165, "grad_norm": 0.17498468641997475, "learning_rate": 9.642901326288631e-07, "loss": 0.0013, "step": 7810 }, { "epoch": 3.55368516833485, "grad_norm": 0.6227934562872792, "learning_rate": 9.637262773137642e-07, "loss": 0.0013, "step": 7811 }, { "epoch": 3.554140127388535, "grad_norm": 0.16581376737276773, "learning_rate": 9.631625475360166e-07, "loss": 0.0015, "step": 7812 }, { "epoch": 3.5545950864422204, "grad_norm": 0.18279268915356706, "learning_rate": 9.625989433416848e-07, "loss": 0.0008, "step": 7813 }, { "epoch": 3.5550500454959053, "grad_norm": 0.23532050480512695, "learning_rate": 9.62035464776824e-07, "loss": 0.0024, "step": 7814 }, { "epoch": 3.5555050045495906, "grad_norm": 0.28718599134114914, "learning_rate": 9.614721118874796e-07, "loss": 0.0026, "step": 7815 }, { "epoch": 3.555959963603276, "grad_norm": 0.18218241351890585, "learning_rate": 9.609088847196869e-07, "loss": 0.0016, "step": 7816 }, { "epoch": 3.556414922656961, "grad_norm": 0.2175935370127773, "learning_rate": 9.603457833194698e-07, "loss": 0.002, "step": 7817 }, { "epoch": 3.556869881710646, "grad_norm": 0.291180521165066, "learning_rate": 9.597828077328422e-07, "loss": 0.0065, "step": 7818 }, { "epoch": 3.5573248407643314, "grad_norm": 0.5211062539459631, "learning_rate": 9.592199580058073e-07, "loss": 0.0098, "step": 7819 }, { "epoch": 3.5577797998180163, "grad_norm": 0.14534594346969887, "learning_rate": 9.5865723418436e-07, "loss": 0.0008, "step": 7820 }, { "epoch": 3.5582347588717016, "grad_norm": 0.2915472892986374, "learning_rate": 9.580946363144822e-07, "loss": 0.0028, "step": 7821 }, { "epoch": 3.558689717925387, "grad_norm": 0.24184095277789042, "learning_rate": 9.575321644421482e-07, "loss": 0.0024, "step": 7822 }, { "epoch": 3.5591446769790718, "grad_norm": 0.2283277441075974, "learning_rate": 9.569698186133204e-07, "loss": 0.0018, "step": 7823 }, { "epoch": 3.559599636032757, "grad_norm": 1.2099798235065022, "learning_rate": 9.564075988739494e-07, "loss": 0.0187, "step": 7824 }, { "epoch": 3.5600545950864424, "grad_norm": 0.464804233833945, "learning_rate": 9.558455052699797e-07, "loss": 0.0049, "step": 7825 }, { "epoch": 3.5605095541401273, "grad_norm": 0.31690255363350756, "learning_rate": 9.552835378473418e-07, "loss": 0.005, "step": 7826 }, { "epoch": 3.5609645131938126, "grad_norm": 0.37797152517326643, "learning_rate": 9.547216966519575e-07, "loss": 0.0045, "step": 7827 }, { "epoch": 3.561419472247498, "grad_norm": 0.19432920271264825, "learning_rate": 9.54159981729737e-07, "loss": 0.0023, "step": 7828 }, { "epoch": 3.5618744313011828, "grad_norm": 0.1920604032269418, "learning_rate": 9.535983931265816e-07, "loss": 0.0015, "step": 7829 }, { "epoch": 3.562329390354868, "grad_norm": 0.3235701564937646, "learning_rate": 9.53036930888383e-07, "loss": 0.0054, "step": 7830 }, { "epoch": 3.5627843494085534, "grad_norm": 0.21770278156986933, "learning_rate": 9.524755950610204e-07, "loss": 0.0013, "step": 7831 }, { "epoch": 3.5632393084622382, "grad_norm": 0.19589818363846295, "learning_rate": 9.519143856903634e-07, "loss": 0.0032, "step": 7832 }, { "epoch": 3.5636942675159236, "grad_norm": 0.23252268853365782, "learning_rate": 9.513533028222719e-07, "loss": 0.0008, "step": 7833 }, { "epoch": 3.564149226569609, "grad_norm": 0.43461812048821846, "learning_rate": 9.507923465025939e-07, "loss": 0.0044, "step": 7834 }, { "epoch": 3.5646041856232937, "grad_norm": 0.32727633150383917, "learning_rate": 9.502315167771695e-07, "loss": 0.0036, "step": 7835 }, { "epoch": 3.565059144676979, "grad_norm": 0.1734465414634706, "learning_rate": 9.496708136918273e-07, "loss": 0.0009, "step": 7836 }, { "epoch": 3.5655141037306644, "grad_norm": 0.40301541676452146, "learning_rate": 9.491102372923852e-07, "loss": 0.004, "step": 7837 }, { "epoch": 3.565969062784349, "grad_norm": 0.30606227232306543, "learning_rate": 9.485497876246508e-07, "loss": 0.0033, "step": 7838 }, { "epoch": 3.5664240218380345, "grad_norm": 0.17615749971457573, "learning_rate": 9.479894647344204e-07, "loss": 0.0021, "step": 7839 }, { "epoch": 3.56687898089172, "grad_norm": 0.04418011660451149, "learning_rate": 9.474292686674832e-07, "loss": 0.0002, "step": 7840 }, { "epoch": 3.5673339399454047, "grad_norm": 0.06735726539723287, "learning_rate": 9.468691994696147e-07, "loss": 0.0005, "step": 7841 }, { "epoch": 3.56778889899909, "grad_norm": 0.1663874247027705, "learning_rate": 9.463092571865804e-07, "loss": 0.0008, "step": 7842 }, { "epoch": 3.5682438580527753, "grad_norm": 0.3995750541683232, "learning_rate": 9.457494418641383e-07, "loss": 0.0057, "step": 7843 }, { "epoch": 3.56869881710646, "grad_norm": 0.31760054582203, "learning_rate": 9.451897535480318e-07, "loss": 0.0056, "step": 7844 }, { "epoch": 3.5691537761601455, "grad_norm": 0.19074077497887162, "learning_rate": 9.446301922839981e-07, "loss": 0.0024, "step": 7845 }, { "epoch": 3.569608735213831, "grad_norm": 0.13167942520238435, "learning_rate": 9.440707581177611e-07, "loss": 0.0006, "step": 7846 }, { "epoch": 3.5700636942675157, "grad_norm": 0.29877292636570585, "learning_rate": 9.435114510950353e-07, "loss": 0.0045, "step": 7847 }, { "epoch": 3.570518653321201, "grad_norm": 0.5048410430344413, "learning_rate": 9.429522712615238e-07, "loss": 0.006, "step": 7848 }, { "epoch": 3.5709736123748863, "grad_norm": 0.2520740990956489, "learning_rate": 9.423932186629208e-07, "loss": 0.003, "step": 7849 }, { "epoch": 3.571428571428571, "grad_norm": 0.4187835567076045, "learning_rate": 9.418342933449112e-07, "loss": 0.0038, "step": 7850 }, { "epoch": 3.5718835304822565, "grad_norm": 0.188531517336511, "learning_rate": 9.412754953531664e-07, "loss": 0.0017, "step": 7851 }, { "epoch": 3.572338489535942, "grad_norm": 0.24912298952406117, "learning_rate": 9.407168247333489e-07, "loss": 0.0013, "step": 7852 }, { "epoch": 3.5727934485896267, "grad_norm": 0.21364822404759704, "learning_rate": 9.4015828153111e-07, "loss": 0.0026, "step": 7853 }, { "epoch": 3.573248407643312, "grad_norm": 0.3919719341394568, "learning_rate": 9.395998657920932e-07, "loss": 0.0023, "step": 7854 }, { "epoch": 3.5737033666969973, "grad_norm": 0.12771926940206707, "learning_rate": 9.390415775619283e-07, "loss": 0.0017, "step": 7855 }, { "epoch": 3.5741583257506826, "grad_norm": 0.17213722483981175, "learning_rate": 9.384834168862358e-07, "loss": 0.0016, "step": 7856 }, { "epoch": 3.5746132848043675, "grad_norm": 0.4555965275660695, "learning_rate": 9.379253838106275e-07, "loss": 0.0092, "step": 7857 }, { "epoch": 3.5750682438580528, "grad_norm": 0.3058765994402796, "learning_rate": 9.373674783807018e-07, "loss": 0.0027, "step": 7858 }, { "epoch": 3.575523202911738, "grad_norm": 0.13703810049490772, "learning_rate": 9.368097006420498e-07, "loss": 0.0018, "step": 7859 }, { "epoch": 3.575978161965423, "grad_norm": 0.28336857047505526, "learning_rate": 9.362520506402497e-07, "loss": 0.0026, "step": 7860 }, { "epoch": 3.5764331210191083, "grad_norm": 0.26403554997922574, "learning_rate": 9.356945284208704e-07, "loss": 0.0038, "step": 7861 }, { "epoch": 3.5768880800727936, "grad_norm": 0.3197443708555269, "learning_rate": 9.35137134029469e-07, "loss": 0.0034, "step": 7862 }, { "epoch": 3.577343039126479, "grad_norm": 0.24883548007320955, "learning_rate": 9.345798675115939e-07, "loss": 0.0031, "step": 7863 }, { "epoch": 3.5777979981801638, "grad_norm": 0.17867612314630654, "learning_rate": 9.340227289127837e-07, "loss": 0.0021, "step": 7864 }, { "epoch": 3.578252957233849, "grad_norm": 0.1840635105921823, "learning_rate": 9.334657182785642e-07, "loss": 0.001, "step": 7865 }, { "epoch": 3.5787079162875344, "grad_norm": 0.22466435281354866, "learning_rate": 9.329088356544519e-07, "loss": 0.004, "step": 7866 }, { "epoch": 3.5791628753412192, "grad_norm": 0.31891522275380135, "learning_rate": 9.323520810859523e-07, "loss": 0.006, "step": 7867 }, { "epoch": 3.5796178343949046, "grad_norm": 0.36910133510388105, "learning_rate": 9.317954546185607e-07, "loss": 0.002, "step": 7868 }, { "epoch": 3.58007279344859, "grad_norm": 0.12904785472648417, "learning_rate": 9.31238956297763e-07, "loss": 0.0012, "step": 7869 }, { "epoch": 3.5805277525022747, "grad_norm": 0.4421479944343875, "learning_rate": 9.30682586169033e-07, "loss": 0.0054, "step": 7870 }, { "epoch": 3.58098271155596, "grad_norm": 0.25504024960491645, "learning_rate": 9.301263442778358e-07, "loss": 0.0025, "step": 7871 }, { "epoch": 3.5814376706096454, "grad_norm": 0.2189309053992828, "learning_rate": 9.295702306696239e-07, "loss": 0.0041, "step": 7872 }, { "epoch": 3.58189262966333, "grad_norm": 0.22538793538325633, "learning_rate": 9.290142453898402e-07, "loss": 0.0019, "step": 7873 }, { "epoch": 3.5823475887170155, "grad_norm": 0.2070962336882421, "learning_rate": 9.284583884839187e-07, "loss": 0.0026, "step": 7874 }, { "epoch": 3.582802547770701, "grad_norm": 0.5228152185153323, "learning_rate": 9.279026599972807e-07, "loss": 0.0097, "step": 7875 }, { "epoch": 3.5832575068243857, "grad_norm": 0.4827429972361362, "learning_rate": 9.273470599753376e-07, "loss": 0.0046, "step": 7876 }, { "epoch": 3.583712465878071, "grad_norm": 0.49102012293559627, "learning_rate": 9.267915884634901e-07, "loss": 0.0087, "step": 7877 }, { "epoch": 3.5841674249317563, "grad_norm": 0.11426252690774491, "learning_rate": 9.262362455071294e-07, "loss": 0.0009, "step": 7878 }, { "epoch": 3.584622383985441, "grad_norm": 0.3690933936451745, "learning_rate": 9.256810311516365e-07, "loss": 0.0035, "step": 7879 }, { "epoch": 3.5850773430391265, "grad_norm": 0.08707486821805445, "learning_rate": 9.2512594544238e-07, "loss": 0.0006, "step": 7880 }, { "epoch": 3.585532302092812, "grad_norm": 0.3444613278744854, "learning_rate": 9.245709884247195e-07, "loss": 0.004, "step": 7881 }, { "epoch": 3.5859872611464967, "grad_norm": 0.2689963347299615, "learning_rate": 9.24016160144002e-07, "loss": 0.0034, "step": 7882 }, { "epoch": 3.586442220200182, "grad_norm": 0.0738102342829177, "learning_rate": 9.234614606455681e-07, "loss": 0.0006, "step": 7883 }, { "epoch": 3.5868971792538673, "grad_norm": 0.07710005841773723, "learning_rate": 9.229068899747428e-07, "loss": 0.0005, "step": 7884 }, { "epoch": 3.587352138307552, "grad_norm": 0.13411495852243516, "learning_rate": 9.223524481768454e-07, "loss": 0.0009, "step": 7885 }, { "epoch": 3.5878070973612375, "grad_norm": 0.2667279095362077, "learning_rate": 9.217981352971814e-07, "loss": 0.0021, "step": 7886 }, { "epoch": 3.588262056414923, "grad_norm": 0.35447492297631217, "learning_rate": 9.212439513810457e-07, "loss": 0.003, "step": 7887 }, { "epoch": 3.5887170154686077, "grad_norm": 0.33355981042091754, "learning_rate": 9.206898964737257e-07, "loss": 0.0044, "step": 7888 }, { "epoch": 3.589171974522293, "grad_norm": 0.28953335938539476, "learning_rate": 9.201359706204952e-07, "loss": 0.0028, "step": 7889 }, { "epoch": 3.5896269335759783, "grad_norm": 0.2673249713555099, "learning_rate": 9.195821738666183e-07, "loss": 0.0037, "step": 7890 }, { "epoch": 3.590081892629663, "grad_norm": 0.14708805595903082, "learning_rate": 9.190285062573484e-07, "loss": 0.0011, "step": 7891 }, { "epoch": 3.5905368516833485, "grad_norm": 0.2561908933741653, "learning_rate": 9.184749678379296e-07, "loss": 0.0015, "step": 7892 }, { "epoch": 3.5909918107370338, "grad_norm": 0.11439901099762029, "learning_rate": 9.17921558653595e-07, "loss": 0.0015, "step": 7893 }, { "epoch": 3.5914467697907186, "grad_norm": 0.755674354615607, "learning_rate": 9.173682787495658e-07, "loss": 0.0177, "step": 7894 }, { "epoch": 3.591901728844404, "grad_norm": 0.4425298544442894, "learning_rate": 9.168151281710542e-07, "loss": 0.0036, "step": 7895 }, { "epoch": 3.5923566878980893, "grad_norm": 0.322488583160564, "learning_rate": 9.162621069632596e-07, "loss": 0.0015, "step": 7896 }, { "epoch": 3.592811646951774, "grad_norm": 0.09384026883662254, "learning_rate": 9.157092151713742e-07, "loss": 0.0007, "step": 7897 }, { "epoch": 3.5932666060054594, "grad_norm": 0.0722118227147387, "learning_rate": 9.151564528405765e-07, "loss": 0.0006, "step": 7898 }, { "epoch": 3.5937215650591448, "grad_norm": 0.256681716753302, "learning_rate": 9.146038200160373e-07, "loss": 0.0015, "step": 7899 }, { "epoch": 3.5941765241128296, "grad_norm": 0.2538167248107771, "learning_rate": 9.140513167429144e-07, "loss": 0.005, "step": 7900 }, { "epoch": 3.594631483166515, "grad_norm": 0.2798742438094209, "learning_rate": 9.134989430663549e-07, "loss": 0.0015, "step": 7901 }, { "epoch": 3.5950864422202002, "grad_norm": 0.6373325441588238, "learning_rate": 9.129466990314978e-07, "loss": 0.0056, "step": 7902 }, { "epoch": 3.595541401273885, "grad_norm": 0.509606297884189, "learning_rate": 9.123945846834697e-07, "loss": 0.0052, "step": 7903 }, { "epoch": 3.5959963603275704, "grad_norm": 0.6640741779846633, "learning_rate": 9.118426000673864e-07, "loss": 0.0095, "step": 7904 }, { "epoch": 3.5964513193812557, "grad_norm": 0.18902161467620454, "learning_rate": 9.112907452283528e-07, "loss": 0.0025, "step": 7905 }, { "epoch": 3.5969062784349406, "grad_norm": 0.1248706289492942, "learning_rate": 9.10739020211466e-07, "loss": 0.0016, "step": 7906 }, { "epoch": 3.597361237488626, "grad_norm": 0.2833860270692957, "learning_rate": 9.101874250618086e-07, "loss": 0.0027, "step": 7907 }, { "epoch": 3.597816196542311, "grad_norm": 0.2535346264446849, "learning_rate": 9.096359598244562e-07, "loss": 0.0018, "step": 7908 }, { "epoch": 3.598271155595996, "grad_norm": 0.15460394509963396, "learning_rate": 9.090846245444709e-07, "loss": 0.0012, "step": 7909 }, { "epoch": 3.5987261146496814, "grad_norm": 0.5310861093595967, "learning_rate": 9.085334192669057e-07, "loss": 0.0043, "step": 7910 }, { "epoch": 3.5991810737033667, "grad_norm": 0.3804530227794708, "learning_rate": 9.079823440368018e-07, "loss": 0.0024, "step": 7911 }, { "epoch": 3.599636032757052, "grad_norm": 0.17354499705900464, "learning_rate": 9.074313988991909e-07, "loss": 0.001, "step": 7912 }, { "epoch": 3.600090991810737, "grad_norm": 0.10947203719170784, "learning_rate": 9.068805838990952e-07, "loss": 0.0012, "step": 7913 }, { "epoch": 3.600545950864422, "grad_norm": 0.318446872755484, "learning_rate": 9.063298990815237e-07, "loss": 0.0015, "step": 7914 }, { "epoch": 3.6010009099181075, "grad_norm": 0.19707853003434123, "learning_rate": 9.057793444914758e-07, "loss": 0.0013, "step": 7915 }, { "epoch": 3.6014558689717924, "grad_norm": 0.1759905214335846, "learning_rate": 9.052289201739397e-07, "loss": 0.0026, "step": 7916 }, { "epoch": 3.6019108280254777, "grad_norm": 0.3438175907337963, "learning_rate": 9.046786261738952e-07, "loss": 0.0041, "step": 7917 }, { "epoch": 3.602365787079163, "grad_norm": 0.38477565640342254, "learning_rate": 9.041284625363089e-07, "loss": 0.0041, "step": 7918 }, { "epoch": 3.6028207461328483, "grad_norm": 0.48317792608754084, "learning_rate": 9.035784293061367e-07, "loss": 0.0065, "step": 7919 }, { "epoch": 3.603275705186533, "grad_norm": 0.2961261574420471, "learning_rate": 9.03028526528327e-07, "loss": 0.0019, "step": 7920 }, { "epoch": 3.6037306642402185, "grad_norm": 0.35181139208143486, "learning_rate": 9.024787542478133e-07, "loss": 0.0054, "step": 7921 }, { "epoch": 3.604185623293904, "grad_norm": 0.4085636786766917, "learning_rate": 9.019291125095222e-07, "loss": 0.0055, "step": 7922 }, { "epoch": 3.6046405823475887, "grad_norm": 0.1309055598395693, "learning_rate": 9.013796013583675e-07, "loss": 0.001, "step": 7923 }, { "epoch": 3.605095541401274, "grad_norm": 0.2662796569972246, "learning_rate": 9.008302208392522e-07, "loss": 0.0044, "step": 7924 }, { "epoch": 3.6055505004549593, "grad_norm": 0.36245689875775916, "learning_rate": 9.002809709970686e-07, "loss": 0.0048, "step": 7925 }, { "epoch": 3.606005459508644, "grad_norm": 0.5612965022838162, "learning_rate": 8.997318518767001e-07, "loss": 0.0069, "step": 7926 }, { "epoch": 3.6064604185623295, "grad_norm": 0.07559106535852315, "learning_rate": 8.991828635230185e-07, "loss": 0.0012, "step": 7927 }, { "epoch": 3.6069153776160148, "grad_norm": 0.3473753683170802, "learning_rate": 8.98634005980884e-07, "loss": 0.004, "step": 7928 }, { "epoch": 3.6073703366696996, "grad_norm": 0.2718569470708077, "learning_rate": 8.980852792951472e-07, "loss": 0.0025, "step": 7929 }, { "epoch": 3.607825295723385, "grad_norm": 0.31540326044681766, "learning_rate": 8.975366835106461e-07, "loss": 0.0045, "step": 7930 }, { "epoch": 3.6082802547770703, "grad_norm": 0.2446283458251342, "learning_rate": 8.969882186722112e-07, "loss": 0.0022, "step": 7931 }, { "epoch": 3.608735213830755, "grad_norm": 0.5627946936077606, "learning_rate": 8.964398848246602e-07, "loss": 0.0078, "step": 7932 }, { "epoch": 3.6091901728844404, "grad_norm": 0.45949892695491296, "learning_rate": 8.958916820127994e-07, "loss": 0.0084, "step": 7933 }, { "epoch": 3.6096451319381258, "grad_norm": 0.36328256863539327, "learning_rate": 8.95343610281427e-07, "loss": 0.0046, "step": 7934 }, { "epoch": 3.6101000909918106, "grad_norm": 0.2155988375625289, "learning_rate": 8.947956696753274e-07, "loss": 0.0025, "step": 7935 }, { "epoch": 3.610555050045496, "grad_norm": 0.8165932585591997, "learning_rate": 8.942478602392773e-07, "loss": 0.0211, "step": 7936 }, { "epoch": 3.6110100090991812, "grad_norm": 0.10821871845468159, "learning_rate": 8.937001820180408e-07, "loss": 0.0012, "step": 7937 }, { "epoch": 3.611464968152866, "grad_norm": 0.239118001987956, "learning_rate": 8.931526350563713e-07, "loss": 0.0023, "step": 7938 }, { "epoch": 3.6119199272065514, "grad_norm": 0.1788377516261645, "learning_rate": 8.92605219399012e-07, "loss": 0.0025, "step": 7939 }, { "epoch": 3.6123748862602367, "grad_norm": 0.21696901042729988, "learning_rate": 8.920579350906936e-07, "loss": 0.0027, "step": 7940 }, { "epoch": 3.6128298453139216, "grad_norm": 0.39596947914279135, "learning_rate": 8.915107821761409e-07, "loss": 0.006, "step": 7941 }, { "epoch": 3.613284804367607, "grad_norm": 0.09897350279301749, "learning_rate": 8.909637607000632e-07, "loss": 0.0007, "step": 7942 }, { "epoch": 3.613739763421292, "grad_norm": 0.21321219315556247, "learning_rate": 8.904168707071609e-07, "loss": 0.0028, "step": 7943 }, { "epoch": 3.614194722474977, "grad_norm": 0.19279496691356407, "learning_rate": 8.89870112242123e-07, "loss": 0.0022, "step": 7944 }, { "epoch": 3.6146496815286624, "grad_norm": 0.10834454747894189, "learning_rate": 8.893234853496271e-07, "loss": 0.0006, "step": 7945 }, { "epoch": 3.6151046405823477, "grad_norm": 0.31715383855588214, "learning_rate": 8.887769900743434e-07, "loss": 0.0028, "step": 7946 }, { "epoch": 3.6155595996360326, "grad_norm": 0.23524379722526684, "learning_rate": 8.882306264609269e-07, "loss": 0.0024, "step": 7947 }, { "epoch": 3.616014558689718, "grad_norm": 0.6319259163255877, "learning_rate": 8.876843945540259e-07, "loss": 0.0048, "step": 7948 }, { "epoch": 3.616469517743403, "grad_norm": 0.3121406165423019, "learning_rate": 8.87138294398275e-07, "loss": 0.002, "step": 7949 }, { "epoch": 3.616924476797088, "grad_norm": 0.2604095395059417, "learning_rate": 8.865923260382981e-07, "loss": 0.0039, "step": 7950 }, { "epoch": 3.6173794358507734, "grad_norm": 0.24329228141641532, "learning_rate": 8.860464895187113e-07, "loss": 0.0027, "step": 7951 }, { "epoch": 3.6178343949044587, "grad_norm": 0.14814676313499775, "learning_rate": 8.855007848841166e-07, "loss": 0.0009, "step": 7952 }, { "epoch": 3.6182893539581436, "grad_norm": 0.35785566208773073, "learning_rate": 8.849552121791067e-07, "loss": 0.0042, "step": 7953 }, { "epoch": 3.618744313011829, "grad_norm": 0.24105171544036533, "learning_rate": 8.844097714482625e-07, "loss": 0.0011, "step": 7954 }, { "epoch": 3.619199272065514, "grad_norm": 0.36092845843004945, "learning_rate": 8.838644627361562e-07, "loss": 0.0082, "step": 7955 }, { "epoch": 3.619654231119199, "grad_norm": 0.27529316237455637, "learning_rate": 8.83319286087348e-07, "loss": 0.0034, "step": 7956 }, { "epoch": 3.6201091901728844, "grad_norm": 0.08774736106900133, "learning_rate": 8.827742415463872e-07, "loss": 0.0009, "step": 7957 }, { "epoch": 3.6205641492265697, "grad_norm": 0.15628814047547626, "learning_rate": 8.822293291578119e-07, "loss": 0.0009, "step": 7958 }, { "epoch": 3.6210191082802545, "grad_norm": 0.6027383386031004, "learning_rate": 8.816845489661493e-07, "loss": 0.0042, "step": 7959 }, { "epoch": 3.62147406733394, "grad_norm": 0.29686048427229084, "learning_rate": 8.811399010159177e-07, "loss": 0.003, "step": 7960 }, { "epoch": 3.621929026387625, "grad_norm": 0.11999066053222542, "learning_rate": 8.805953853516222e-07, "loss": 0.0006, "step": 7961 }, { "epoch": 3.62238398544131, "grad_norm": 0.14399689741564203, "learning_rate": 8.800510020177591e-07, "loss": 0.0018, "step": 7962 }, { "epoch": 3.6228389444949953, "grad_norm": 0.12604956390555208, "learning_rate": 8.795067510588129e-07, "loss": 0.0009, "step": 7963 }, { "epoch": 3.6232939035486806, "grad_norm": 0.34134377429943596, "learning_rate": 8.789626325192557e-07, "loss": 0.0019, "step": 7964 }, { "epoch": 3.623748862602366, "grad_norm": 0.3143934883239511, "learning_rate": 8.784186464435526e-07, "loss": 0.0023, "step": 7965 }, { "epoch": 3.624203821656051, "grad_norm": 0.19974057747881363, "learning_rate": 8.778747928761549e-07, "loss": 0.0023, "step": 7966 }, { "epoch": 3.624658780709736, "grad_norm": 0.05599039448759722, "learning_rate": 8.773310718615036e-07, "loss": 0.0004, "step": 7967 }, { "epoch": 3.6251137397634214, "grad_norm": 0.10228105868530508, "learning_rate": 8.767874834440282e-07, "loss": 0.0007, "step": 7968 }, { "epoch": 3.6255686988171063, "grad_norm": 0.21490258799950399, "learning_rate": 8.762440276681494e-07, "loss": 0.0019, "step": 7969 }, { "epoch": 3.6260236578707916, "grad_norm": 0.32206659227814527, "learning_rate": 8.757007045782768e-07, "loss": 0.0018, "step": 7970 }, { "epoch": 3.626478616924477, "grad_norm": 0.2535551157598862, "learning_rate": 8.751575142188071e-07, "loss": 0.0039, "step": 7971 }, { "epoch": 3.6269335759781622, "grad_norm": 0.12734921109640904, "learning_rate": 8.746144566341277e-07, "loss": 0.0011, "step": 7972 }, { "epoch": 3.627388535031847, "grad_norm": 0.35014843325445694, "learning_rate": 8.740715318686149e-07, "loss": 0.0025, "step": 7973 }, { "epoch": 3.6278434940855324, "grad_norm": 0.17554235487338546, "learning_rate": 8.735287399666329e-07, "loss": 0.001, "step": 7974 }, { "epoch": 3.6282984531392177, "grad_norm": 0.45728520832361597, "learning_rate": 8.729860809725371e-07, "loss": 0.0078, "step": 7975 }, { "epoch": 3.6287534121929026, "grad_norm": 0.07139248281483317, "learning_rate": 8.724435549306723e-07, "loss": 0.0006, "step": 7976 }, { "epoch": 3.629208371246588, "grad_norm": 0.10500598330562706, "learning_rate": 8.719011618853701e-07, "loss": 0.0009, "step": 7977 }, { "epoch": 3.629663330300273, "grad_norm": 0.45714259613928193, "learning_rate": 8.713589018809523e-07, "loss": 0.0058, "step": 7978 }, { "epoch": 3.630118289353958, "grad_norm": 0.36927087381050894, "learning_rate": 8.708167749617296e-07, "loss": 0.0024, "step": 7979 }, { "epoch": 3.6305732484076434, "grad_norm": 0.27234689693265013, "learning_rate": 8.702747811720035e-07, "loss": 0.0032, "step": 7980 }, { "epoch": 3.6310282074613287, "grad_norm": 0.410659924057722, "learning_rate": 8.697329205560625e-07, "loss": 0.0018, "step": 7981 }, { "epoch": 3.6314831665150136, "grad_norm": 0.2185542436011605, "learning_rate": 8.691911931581843e-07, "loss": 0.0015, "step": 7982 }, { "epoch": 3.631938125568699, "grad_norm": 0.26990198725652687, "learning_rate": 8.686495990226377e-07, "loss": 0.002, "step": 7983 }, { "epoch": 3.632393084622384, "grad_norm": 0.37666284522058185, "learning_rate": 8.681081381936779e-07, "loss": 0.0071, "step": 7984 }, { "epoch": 3.632848043676069, "grad_norm": 0.32693950055916715, "learning_rate": 8.675668107155527e-07, "loss": 0.0022, "step": 7985 }, { "epoch": 3.6333030027297544, "grad_norm": 0.205909138226967, "learning_rate": 8.670256166324953e-07, "loss": 0.0021, "step": 7986 }, { "epoch": 3.6337579617834397, "grad_norm": 0.3928367705956396, "learning_rate": 8.664845559887303e-07, "loss": 0.0043, "step": 7987 }, { "epoch": 3.6342129208371245, "grad_norm": 0.403882076893524, "learning_rate": 8.659436288284698e-07, "loss": 0.0028, "step": 7988 }, { "epoch": 3.63466787989081, "grad_norm": 0.15426013175477526, "learning_rate": 8.654028351959162e-07, "loss": 0.0008, "step": 7989 }, { "epoch": 3.635122838944495, "grad_norm": 0.3268450231737731, "learning_rate": 8.648621751352624e-07, "loss": 0.0025, "step": 7990 }, { "epoch": 3.63557779799818, "grad_norm": 0.32204887946779953, "learning_rate": 8.643216486906872e-07, "loss": 0.0021, "step": 7991 }, { "epoch": 3.6360327570518653, "grad_norm": 0.312807329688827, "learning_rate": 8.637812559063602e-07, "loss": 0.004, "step": 7992 }, { "epoch": 3.6364877161055507, "grad_norm": 0.3836154034514503, "learning_rate": 8.63240996826439e-07, "loss": 0.0036, "step": 7993 }, { "epoch": 3.6369426751592355, "grad_norm": 0.23124721956430397, "learning_rate": 8.62700871495073e-07, "loss": 0.0035, "step": 7994 }, { "epoch": 3.637397634212921, "grad_norm": 0.24924631741496642, "learning_rate": 8.621608799563977e-07, "loss": 0.0022, "step": 7995 }, { "epoch": 3.637852593266606, "grad_norm": 0.43654705117613907, "learning_rate": 8.616210222545382e-07, "loss": 0.0113, "step": 7996 }, { "epoch": 3.638307552320291, "grad_norm": 0.10190776210359477, "learning_rate": 8.610812984336106e-07, "loss": 0.0009, "step": 7997 }, { "epoch": 3.6387625113739763, "grad_norm": 0.06486666966427374, "learning_rate": 8.605417085377171e-07, "loss": 0.0005, "step": 7998 }, { "epoch": 3.6392174704276616, "grad_norm": 0.5029556465945595, "learning_rate": 8.600022526109522e-07, "loss": 0.0064, "step": 7999 }, { "epoch": 3.6396724294813465, "grad_norm": 0.17722165911433457, "learning_rate": 8.594629306973973e-07, "loss": 0.0016, "step": 8000 }, { "epoch": 3.640127388535032, "grad_norm": 0.26556136106097367, "learning_rate": 8.589237428411229e-07, "loss": 0.0037, "step": 8001 }, { "epoch": 3.640582347588717, "grad_norm": 0.154075030340778, "learning_rate": 8.583846890861885e-07, "loss": 0.0036, "step": 8002 }, { "epoch": 3.641037306642402, "grad_norm": 0.26293000284471885, "learning_rate": 8.57845769476644e-07, "loss": 0.0068, "step": 8003 }, { "epoch": 3.6414922656960873, "grad_norm": 0.596996684609262, "learning_rate": 8.573069840565279e-07, "loss": 0.0036, "step": 8004 }, { "epoch": 3.6419472247497726, "grad_norm": 0.34510503434781936, "learning_rate": 8.567683328698667e-07, "loss": 0.0024, "step": 8005 }, { "epoch": 3.6424021838034575, "grad_norm": 0.2812689993545925, "learning_rate": 8.562298159606766e-07, "loss": 0.0047, "step": 8006 }, { "epoch": 3.642857142857143, "grad_norm": 0.255329550672796, "learning_rate": 8.556914333729621e-07, "loss": 0.0043, "step": 8007 }, { "epoch": 3.643312101910828, "grad_norm": 0.19031624446465784, "learning_rate": 8.551531851507186e-07, "loss": 0.0019, "step": 8008 }, { "epoch": 3.643767060964513, "grad_norm": 0.27452669399414675, "learning_rate": 8.54615071337929e-07, "loss": 0.004, "step": 8009 }, { "epoch": 3.6442220200181983, "grad_norm": 0.21672892691728954, "learning_rate": 8.540770919785643e-07, "loss": 0.0016, "step": 8010 }, { "epoch": 3.6446769790718836, "grad_norm": 0.2994475106125561, "learning_rate": 8.535392471165877e-07, "loss": 0.0051, "step": 8011 }, { "epoch": 3.6451319381255685, "grad_norm": 0.22441101956940215, "learning_rate": 8.530015367959482e-07, "loss": 0.0038, "step": 8012 }, { "epoch": 3.6455868971792538, "grad_norm": 0.29457719023106677, "learning_rate": 8.524639610605848e-07, "loss": 0.0017, "step": 8013 }, { "epoch": 3.646041856232939, "grad_norm": 0.2406575048935029, "learning_rate": 8.519265199544269e-07, "loss": 0.0041, "step": 8014 }, { "epoch": 3.646496815286624, "grad_norm": 0.28782346729074626, "learning_rate": 8.513892135213911e-07, "loss": 0.0023, "step": 8015 }, { "epoch": 3.6469517743403093, "grad_norm": 0.5622923963522478, "learning_rate": 8.50852041805384e-07, "loss": 0.0048, "step": 8016 }, { "epoch": 3.6474067333939946, "grad_norm": 0.26092105243295644, "learning_rate": 8.503150048502995e-07, "loss": 0.0027, "step": 8017 }, { "epoch": 3.6478616924476794, "grad_norm": 0.2705901312604328, "learning_rate": 8.497781027000229e-07, "loss": 0.003, "step": 8018 }, { "epoch": 3.6483166515013647, "grad_norm": 0.2918159137580213, "learning_rate": 8.492413353984283e-07, "loss": 0.0024, "step": 8019 }, { "epoch": 3.64877161055505, "grad_norm": 0.3634009744741447, "learning_rate": 8.487047029893772e-07, "loss": 0.0011, "step": 8020 }, { "epoch": 3.6492265696087354, "grad_norm": 0.1879432917453688, "learning_rate": 8.481682055167203e-07, "loss": 0.0019, "step": 8021 }, { "epoch": 3.6496815286624202, "grad_norm": 0.1369578541657344, "learning_rate": 8.476318430242972e-07, "loss": 0.002, "step": 8022 }, { "epoch": 3.6501364877161055, "grad_norm": 0.38392247694193676, "learning_rate": 8.47095615555939e-07, "loss": 0.0016, "step": 8023 }, { "epoch": 3.650591446769791, "grad_norm": 0.4928681861769968, "learning_rate": 8.465595231554616e-07, "loss": 0.0081, "step": 8024 }, { "epoch": 3.6510464058234757, "grad_norm": 0.2824588533045412, "learning_rate": 8.460235658666738e-07, "loss": 0.0022, "step": 8025 }, { "epoch": 3.651501364877161, "grad_norm": 0.45809945466131713, "learning_rate": 8.454877437333711e-07, "loss": 0.0071, "step": 8026 }, { "epoch": 3.6519563239308463, "grad_norm": 0.12721001120506503, "learning_rate": 8.449520567993375e-07, "loss": 0.0005, "step": 8027 }, { "epoch": 3.6524112829845317, "grad_norm": 0.5528964238799462, "learning_rate": 8.444165051083483e-07, "loss": 0.0086, "step": 8028 }, { "epoch": 3.6528662420382165, "grad_norm": 0.4839860460864589, "learning_rate": 8.43881088704166e-07, "loss": 0.0086, "step": 8029 }, { "epoch": 3.653321201091902, "grad_norm": 0.23989107686508496, "learning_rate": 8.433458076305418e-07, "loss": 0.0017, "step": 8030 }, { "epoch": 3.653776160145587, "grad_norm": 0.3340664997999354, "learning_rate": 8.428106619312162e-07, "loss": 0.0005, "step": 8031 }, { "epoch": 3.654231119199272, "grad_norm": 0.6965774475326914, "learning_rate": 8.422756516499194e-07, "loss": 0.0125, "step": 8032 }, { "epoch": 3.6546860782529573, "grad_norm": 0.07900832488465712, "learning_rate": 8.417407768303712e-07, "loss": 0.0009, "step": 8033 }, { "epoch": 3.6551410373066426, "grad_norm": 0.1437832323548404, "learning_rate": 8.412060375162781e-07, "loss": 0.0008, "step": 8034 }, { "epoch": 3.6555959963603275, "grad_norm": 0.6748952371569488, "learning_rate": 8.406714337513364e-07, "loss": 0.0062, "step": 8035 }, { "epoch": 3.656050955414013, "grad_norm": 0.3377543750036824, "learning_rate": 8.401369655792307e-07, "loss": 0.0052, "step": 8036 }, { "epoch": 3.656505914467698, "grad_norm": 0.2102750866100855, "learning_rate": 8.396026330436374e-07, "loss": 0.0019, "step": 8037 }, { "epoch": 3.656960873521383, "grad_norm": 0.11721197102873249, "learning_rate": 8.390684361882176e-07, "loss": 0.0011, "step": 8038 }, { "epoch": 3.6574158325750683, "grad_norm": 0.2668707734858569, "learning_rate": 8.385343750566255e-07, "loss": 0.0037, "step": 8039 }, { "epoch": 3.6578707916287536, "grad_norm": 0.14847595997051638, "learning_rate": 8.380004496925012e-07, "loss": 0.0011, "step": 8040 }, { "epoch": 3.6583257506824385, "grad_norm": 0.28821757939346915, "learning_rate": 8.374666601394737e-07, "loss": 0.0039, "step": 8041 }, { "epoch": 3.658780709736124, "grad_norm": 0.65867180406551, "learning_rate": 8.369330064411635e-07, "loss": 0.0057, "step": 8042 }, { "epoch": 3.659235668789809, "grad_norm": 0.3597860450068528, "learning_rate": 8.363994886411778e-07, "loss": 0.0035, "step": 8043 }, { "epoch": 3.659690627843494, "grad_norm": 0.4451722575272541, "learning_rate": 8.358661067831131e-07, "loss": 0.0061, "step": 8044 }, { "epoch": 3.6601455868971793, "grad_norm": 0.3563616741339903, "learning_rate": 8.353328609105543e-07, "loss": 0.0036, "step": 8045 }, { "epoch": 3.6606005459508646, "grad_norm": 0.40551342110093463, "learning_rate": 8.347997510670763e-07, "loss": 0.006, "step": 8046 }, { "epoch": 3.6610555050045495, "grad_norm": 0.2577541826951957, "learning_rate": 8.342667772962437e-07, "loss": 0.0012, "step": 8047 }, { "epoch": 3.6615104640582348, "grad_norm": 0.2869924117714372, "learning_rate": 8.337339396416075e-07, "loss": 0.0021, "step": 8048 }, { "epoch": 3.66196542311192, "grad_norm": 0.3380566278694775, "learning_rate": 8.332012381467091e-07, "loss": 0.0032, "step": 8049 }, { "epoch": 3.662420382165605, "grad_norm": 0.16460466942189575, "learning_rate": 8.326686728550781e-07, "loss": 0.0012, "step": 8050 }, { "epoch": 3.6628753412192903, "grad_norm": 0.06036021093701949, "learning_rate": 8.321362438102329e-07, "loss": 0.0004, "step": 8051 }, { "epoch": 3.6633303002729756, "grad_norm": 0.2640295800986041, "learning_rate": 8.31603951055682e-07, "loss": 0.0014, "step": 8052 }, { "epoch": 3.6637852593266604, "grad_norm": 0.1235443056595422, "learning_rate": 8.310717946349226e-07, "loss": 0.0005, "step": 8053 }, { "epoch": 3.6642402183803457, "grad_norm": 0.227626390986983, "learning_rate": 8.30539774591439e-07, "loss": 0.0021, "step": 8054 }, { "epoch": 3.664695177434031, "grad_norm": 0.15494783872135204, "learning_rate": 8.30007890968706e-07, "loss": 0.0009, "step": 8055 }, { "epoch": 3.665150136487716, "grad_norm": 0.05535746515753282, "learning_rate": 8.294761438101859e-07, "loss": 0.0005, "step": 8056 }, { "epoch": 3.6656050955414012, "grad_norm": 0.2046188573882816, "learning_rate": 8.289445331593319e-07, "loss": 0.0039, "step": 8057 }, { "epoch": 3.6660600545950865, "grad_norm": 0.3208061421859173, "learning_rate": 8.284130590595843e-07, "loss": 0.005, "step": 8058 }, { "epoch": 3.6665150136487714, "grad_norm": 0.29377111778610043, "learning_rate": 8.278817215543717e-07, "loss": 0.0041, "step": 8059 }, { "epoch": 3.6669699727024567, "grad_norm": 0.34331349129616084, "learning_rate": 8.273505206871146e-07, "loss": 0.0028, "step": 8060 }, { "epoch": 3.667424931756142, "grad_norm": 0.0631722518556724, "learning_rate": 8.268194565012185e-07, "loss": 0.0003, "step": 8061 }, { "epoch": 3.667879890809827, "grad_norm": 0.4102122977200362, "learning_rate": 8.262885290400813e-07, "loss": 0.0024, "step": 8062 }, { "epoch": 3.668334849863512, "grad_norm": 0.29395418072670915, "learning_rate": 8.257577383470869e-07, "loss": 0.0065, "step": 8063 }, { "epoch": 3.6687898089171975, "grad_norm": 0.11440475720500419, "learning_rate": 8.252270844656093e-07, "loss": 0.0006, "step": 8064 }, { "epoch": 3.6692447679708824, "grad_norm": 0.4334030551354938, "learning_rate": 8.246965674390106e-07, "loss": 0.0068, "step": 8065 }, { "epoch": 3.6696997270245677, "grad_norm": 0.1491094955884599, "learning_rate": 8.241661873106427e-07, "loss": 0.0018, "step": 8066 }, { "epoch": 3.670154686078253, "grad_norm": 0.12148173230190468, "learning_rate": 8.236359441238467e-07, "loss": 0.0007, "step": 8067 }, { "epoch": 3.670609645131938, "grad_norm": 0.23986320353176852, "learning_rate": 8.231058379219509e-07, "loss": 0.0029, "step": 8068 }, { "epoch": 3.671064604185623, "grad_norm": 0.05036407417408627, "learning_rate": 8.225758687482732e-07, "loss": 0.0004, "step": 8069 }, { "epoch": 3.6715195632393085, "grad_norm": 0.25411078799482345, "learning_rate": 8.220460366461197e-07, "loss": 0.0026, "step": 8070 }, { "epoch": 3.6719745222929934, "grad_norm": 0.06006179064054081, "learning_rate": 8.215163416587874e-07, "loss": 0.0003, "step": 8071 }, { "epoch": 3.6724294813466787, "grad_norm": 0.12255151140964175, "learning_rate": 8.209867838295596e-07, "loss": 0.0008, "step": 8072 }, { "epoch": 3.672884440400364, "grad_norm": 0.3894694831365537, "learning_rate": 8.204573632017084e-07, "loss": 0.0055, "step": 8073 }, { "epoch": 3.673339399454049, "grad_norm": 0.08296297848393278, "learning_rate": 8.199280798184978e-07, "loss": 0.0006, "step": 8074 }, { "epoch": 3.673794358507734, "grad_norm": 0.37408302662428533, "learning_rate": 8.193989337231764e-07, "loss": 0.0043, "step": 8075 }, { "epoch": 3.6742493175614195, "grad_norm": 0.2776519127347404, "learning_rate": 8.188699249589857e-07, "loss": 0.0047, "step": 8076 }, { "epoch": 3.674704276615105, "grad_norm": 0.24197607190148096, "learning_rate": 8.183410535691527e-07, "loss": 0.0032, "step": 8077 }, { "epoch": 3.6751592356687897, "grad_norm": 0.2916883226706972, "learning_rate": 8.178123195968943e-07, "loss": 0.0034, "step": 8078 }, { "epoch": 3.675614194722475, "grad_norm": 0.2645110699114246, "learning_rate": 8.172837230854158e-07, "loss": 0.0028, "step": 8079 }, { "epoch": 3.6760691537761603, "grad_norm": 0.46008467367663464, "learning_rate": 8.167552640779125e-07, "loss": 0.0032, "step": 8080 }, { "epoch": 3.676524112829845, "grad_norm": 0.16088843227625607, "learning_rate": 8.162269426175681e-07, "loss": 0.0012, "step": 8081 }, { "epoch": 3.6769790718835305, "grad_norm": 0.39372496706032933, "learning_rate": 8.156987587475542e-07, "loss": 0.0092, "step": 8082 }, { "epoch": 3.6774340309372158, "grad_norm": 0.07929651409569254, "learning_rate": 8.151707125110317e-07, "loss": 0.0005, "step": 8083 }, { "epoch": 3.677888989990901, "grad_norm": 0.2152447839049605, "learning_rate": 8.146428039511498e-07, "loss": 0.0029, "step": 8084 }, { "epoch": 3.678343949044586, "grad_norm": 0.0812849720580395, "learning_rate": 8.141150331110459e-07, "loss": 0.0006, "step": 8085 }, { "epoch": 3.6787989080982713, "grad_norm": 0.2375430644570572, "learning_rate": 8.135874000338492e-07, "loss": 0.0024, "step": 8086 }, { "epoch": 3.6792538671519566, "grad_norm": 0.06970239883673401, "learning_rate": 8.130599047626736e-07, "loss": 0.0004, "step": 8087 }, { "epoch": 3.6797088262056414, "grad_norm": 0.6127656271820735, "learning_rate": 8.12532547340625e-07, "loss": 0.0056, "step": 8088 }, { "epoch": 3.6801637852593267, "grad_norm": 0.10225920356708554, "learning_rate": 8.120053278107964e-07, "loss": 0.0009, "step": 8089 }, { "epoch": 3.680618744313012, "grad_norm": 0.13177071130925513, "learning_rate": 8.114782462162684e-07, "loss": 0.0007, "step": 8090 }, { "epoch": 3.681073703366697, "grad_norm": 0.2549229473842294, "learning_rate": 8.10951302600114e-07, "loss": 0.0019, "step": 8091 }, { "epoch": 3.6815286624203822, "grad_norm": 0.4619544744142435, "learning_rate": 8.104244970053912e-07, "loss": 0.0121, "step": 8092 }, { "epoch": 3.6819836214740675, "grad_norm": 0.11872335400759348, "learning_rate": 8.098978294751484e-07, "loss": 0.0007, "step": 8093 }, { "epoch": 3.6824385805277524, "grad_norm": 0.3640275456081911, "learning_rate": 8.093713000524217e-07, "loss": 0.0045, "step": 8094 }, { "epoch": 3.6828935395814377, "grad_norm": 0.2519723502050132, "learning_rate": 8.088449087802378e-07, "loss": 0.0012, "step": 8095 }, { "epoch": 3.683348498635123, "grad_norm": 0.42989006807630836, "learning_rate": 8.083186557016115e-07, "loss": 0.0047, "step": 8096 }, { "epoch": 3.683803457688808, "grad_norm": 0.45296438764612607, "learning_rate": 8.07792540859545e-07, "loss": 0.0015, "step": 8097 }, { "epoch": 3.684258416742493, "grad_norm": 0.4264731790624713, "learning_rate": 8.072665642970301e-07, "loss": 0.0047, "step": 8098 }, { "epoch": 3.6847133757961785, "grad_norm": 0.3320693630369025, "learning_rate": 8.067407260570465e-07, "loss": 0.0052, "step": 8099 }, { "epoch": 3.6851683348498634, "grad_norm": 0.4321167944974133, "learning_rate": 8.062150261825649e-07, "loss": 0.0041, "step": 8100 }, { "epoch": 3.6856232939035487, "grad_norm": 0.30741308721987043, "learning_rate": 8.056894647165415e-07, "loss": 0.0047, "step": 8101 }, { "epoch": 3.686078252957234, "grad_norm": 0.5913469566852878, "learning_rate": 8.051640417019244e-07, "loss": 0.0097, "step": 8102 }, { "epoch": 3.686533212010919, "grad_norm": 0.11446778438560479, "learning_rate": 8.04638757181648e-07, "loss": 0.0008, "step": 8103 }, { "epoch": 3.686988171064604, "grad_norm": 0.28778735054457705, "learning_rate": 8.041136111986352e-07, "loss": 0.0063, "step": 8104 }, { "epoch": 3.6874431301182895, "grad_norm": 0.1507375444827684, "learning_rate": 8.035886037958008e-07, "loss": 0.0008, "step": 8105 }, { "epoch": 3.6878980891719744, "grad_norm": 0.08519730882583526, "learning_rate": 8.030637350160442e-07, "loss": 0.001, "step": 8106 }, { "epoch": 3.6883530482256597, "grad_norm": 0.19503317552945468, "learning_rate": 8.025390049022563e-07, "loss": 0.0018, "step": 8107 }, { "epoch": 3.688808007279345, "grad_norm": 0.506277041096427, "learning_rate": 8.020144134973143e-07, "loss": 0.0115, "step": 8108 }, { "epoch": 3.68926296633303, "grad_norm": 0.15737361151870374, "learning_rate": 8.014899608440863e-07, "loss": 0.001, "step": 8109 }, { "epoch": 3.689717925386715, "grad_norm": 0.27440932420366454, "learning_rate": 8.009656469854294e-07, "loss": 0.0033, "step": 8110 }, { "epoch": 3.6901728844404005, "grad_norm": 0.10943691653509242, "learning_rate": 8.004414719641868e-07, "loss": 0.0008, "step": 8111 }, { "epoch": 3.6906278434940853, "grad_norm": 0.08950412811393915, "learning_rate": 7.999174358231917e-07, "loss": 0.0006, "step": 8112 }, { "epoch": 3.6910828025477707, "grad_norm": 0.11821289166902811, "learning_rate": 7.993935386052659e-07, "loss": 0.001, "step": 8113 }, { "epoch": 3.691537761601456, "grad_norm": 0.38738198102921145, "learning_rate": 7.988697803532208e-07, "loss": 0.0048, "step": 8114 }, { "epoch": 3.691992720655141, "grad_norm": 0.1941319511621482, "learning_rate": 7.983461611098545e-07, "loss": 0.0018, "step": 8115 }, { "epoch": 3.692447679708826, "grad_norm": 0.27285579101718993, "learning_rate": 7.97822680917956e-07, "loss": 0.0025, "step": 8116 }, { "epoch": 3.6929026387625115, "grad_norm": 0.29763816487597994, "learning_rate": 7.972993398203008e-07, "loss": 0.0017, "step": 8117 }, { "epoch": 3.6933575978161963, "grad_norm": 0.251104481426122, "learning_rate": 7.967761378596545e-07, "loss": 0.0022, "step": 8118 }, { "epoch": 3.6938125568698816, "grad_norm": 0.30873286507771774, "learning_rate": 7.962530750787698e-07, "loss": 0.0024, "step": 8119 }, { "epoch": 3.694267515923567, "grad_norm": 0.20620069645305403, "learning_rate": 7.957301515203902e-07, "loss": 0.0023, "step": 8120 }, { "epoch": 3.694722474977252, "grad_norm": 0.3916600825791082, "learning_rate": 7.952073672272464e-07, "loss": 0.0034, "step": 8121 }, { "epoch": 3.695177434030937, "grad_norm": 0.33627142139979255, "learning_rate": 7.94684722242057e-07, "loss": 0.0032, "step": 8122 }, { "epoch": 3.6956323930846224, "grad_norm": 0.503770456398298, "learning_rate": 7.941622166075316e-07, "loss": 0.0041, "step": 8123 }, { "epoch": 3.6960873521383073, "grad_norm": 0.04851143862197025, "learning_rate": 7.936398503663658e-07, "loss": 0.0002, "step": 8124 }, { "epoch": 3.6965423111919926, "grad_norm": 0.213159656431364, "learning_rate": 7.931176235612462e-07, "loss": 0.0019, "step": 8125 }, { "epoch": 3.696997270245678, "grad_norm": 0.33325197851171057, "learning_rate": 7.925955362348464e-07, "loss": 0.0045, "step": 8126 }, { "epoch": 3.697452229299363, "grad_norm": 0.1382793388970199, "learning_rate": 7.920735884298286e-07, "loss": 0.0018, "step": 8127 }, { "epoch": 3.697907188353048, "grad_norm": 0.2785533917902245, "learning_rate": 7.915517801888434e-07, "loss": 0.0028, "step": 8128 }, { "epoch": 3.6983621474067334, "grad_norm": 0.6073515661101736, "learning_rate": 7.910301115545316e-07, "loss": 0.0046, "step": 8129 }, { "epoch": 3.6988171064604187, "grad_norm": 0.15183393158317623, "learning_rate": 7.905085825695222e-07, "loss": 0.0018, "step": 8130 }, { "epoch": 3.6992720655141036, "grad_norm": 0.09538701517347387, "learning_rate": 7.899871932764314e-07, "loss": 0.0004, "step": 8131 }, { "epoch": 3.699727024567789, "grad_norm": 0.16434240381685544, "learning_rate": 7.894659437178648e-07, "loss": 0.0016, "step": 8132 }, { "epoch": 3.700181983621474, "grad_norm": 0.16085231854452695, "learning_rate": 7.889448339364159e-07, "loss": 0.0023, "step": 8133 }, { "epoch": 3.700636942675159, "grad_norm": 0.44967698983103255, "learning_rate": 7.884238639746685e-07, "loss": 0.0018, "step": 8134 }, { "epoch": 3.7010919017288444, "grad_norm": 0.3919738084429878, "learning_rate": 7.879030338751939e-07, "loss": 0.0032, "step": 8135 }, { "epoch": 3.7015468607825297, "grad_norm": 0.1589753590433917, "learning_rate": 7.873823436805508e-07, "loss": 0.0015, "step": 8136 }, { "epoch": 3.702001819836215, "grad_norm": 0.2713870028011516, "learning_rate": 7.868617934332893e-07, "loss": 0.0046, "step": 8137 }, { "epoch": 3.7024567788899, "grad_norm": 0.10239479137072435, "learning_rate": 7.863413831759448e-07, "loss": 0.0005, "step": 8138 }, { "epoch": 3.702911737943585, "grad_norm": 0.5002574260317213, "learning_rate": 7.858211129510443e-07, "loss": 0.0019, "step": 8139 }, { "epoch": 3.7033666969972705, "grad_norm": 0.16642025283559436, "learning_rate": 7.853009828011013e-07, "loss": 0.0016, "step": 8140 }, { "epoch": 3.7038216560509554, "grad_norm": 0.23066600618018723, "learning_rate": 7.847809927686184e-07, "loss": 0.0015, "step": 8141 }, { "epoch": 3.7042766151046407, "grad_norm": 0.31099723889854547, "learning_rate": 7.842611428960861e-07, "loss": 0.0064, "step": 8142 }, { "epoch": 3.704731574158326, "grad_norm": 0.22067109094590015, "learning_rate": 7.837414332259852e-07, "loss": 0.0013, "step": 8143 }, { "epoch": 3.705186533212011, "grad_norm": 0.0652114631509419, "learning_rate": 7.832218638007846e-07, "loss": 0.0002, "step": 8144 }, { "epoch": 3.705641492265696, "grad_norm": 0.3509954042847797, "learning_rate": 7.827024346629403e-07, "loss": 0.0014, "step": 8145 }, { "epoch": 3.7060964513193815, "grad_norm": 0.7363231243589287, "learning_rate": 7.821831458548978e-07, "loss": 0.0168, "step": 8146 }, { "epoch": 3.7065514103730663, "grad_norm": 0.7291364102098046, "learning_rate": 7.816639974190901e-07, "loss": 0.007, "step": 8147 }, { "epoch": 3.7070063694267517, "grad_norm": 0.46146388940216165, "learning_rate": 7.811449893979416e-07, "loss": 0.0064, "step": 8148 }, { "epoch": 3.707461328480437, "grad_norm": 0.13319079146412263, "learning_rate": 7.806261218338623e-07, "loss": 0.0014, "step": 8149 }, { "epoch": 3.707916287534122, "grad_norm": 0.12849056875436657, "learning_rate": 7.801073947692508e-07, "loss": 0.0008, "step": 8150 }, { "epoch": 3.708371246587807, "grad_norm": 0.413056662354034, "learning_rate": 7.795888082464967e-07, "loss": 0.0034, "step": 8151 }, { "epoch": 3.7088262056414925, "grad_norm": 0.08862364682222229, "learning_rate": 7.790703623079754e-07, "loss": 0.0006, "step": 8152 }, { "epoch": 3.7092811646951773, "grad_norm": 0.07373041829391995, "learning_rate": 7.78552056996053e-07, "loss": 0.0007, "step": 8153 }, { "epoch": 3.7097361237488626, "grad_norm": 0.4403424533689199, "learning_rate": 7.780338923530825e-07, "loss": 0.0029, "step": 8154 }, { "epoch": 3.710191082802548, "grad_norm": 0.27205573889610835, "learning_rate": 7.775158684214062e-07, "loss": 0.0021, "step": 8155 }, { "epoch": 3.710646041856233, "grad_norm": 0.1944339778704307, "learning_rate": 7.769979852433543e-07, "loss": 0.001, "step": 8156 }, { "epoch": 3.711101000909918, "grad_norm": 0.3255689909557629, "learning_rate": 7.764802428612453e-07, "loss": 0.0038, "step": 8157 }, { "epoch": 3.7115559599636034, "grad_norm": 0.35592029484861687, "learning_rate": 7.759626413173873e-07, "loss": 0.0048, "step": 8158 }, { "epoch": 3.7120109190172883, "grad_norm": 0.19809637907594527, "learning_rate": 7.754451806540778e-07, "loss": 0.0026, "step": 8159 }, { "epoch": 3.7124658780709736, "grad_norm": 0.10075475390322686, "learning_rate": 7.749278609135996e-07, "loss": 0.0006, "step": 8160 }, { "epoch": 3.712920837124659, "grad_norm": 0.518372983618844, "learning_rate": 7.744106821382266e-07, "loss": 0.0041, "step": 8161 }, { "epoch": 3.713375796178344, "grad_norm": 0.1890728580536244, "learning_rate": 7.738936443702191e-07, "loss": 0.0016, "step": 8162 }, { "epoch": 3.713830755232029, "grad_norm": 0.17037503689860614, "learning_rate": 7.733767476518286e-07, "loss": 0.0012, "step": 8163 }, { "epoch": 3.7142857142857144, "grad_norm": 0.23920299743489762, "learning_rate": 7.728599920252925e-07, "loss": 0.0038, "step": 8164 }, { "epoch": 3.7147406733393993, "grad_norm": 0.4201561161883284, "learning_rate": 7.723433775328385e-07, "loss": 0.0034, "step": 8165 }, { "epoch": 3.7151956323930846, "grad_norm": 0.40677338558129655, "learning_rate": 7.718269042166818e-07, "loss": 0.004, "step": 8166 }, { "epoch": 3.71565059144677, "grad_norm": 0.315845650005589, "learning_rate": 7.713105721190257e-07, "loss": 0.0029, "step": 8167 }, { "epoch": 3.7161055505004548, "grad_norm": 0.3864346651622594, "learning_rate": 7.707943812820632e-07, "loss": 0.0026, "step": 8168 }, { "epoch": 3.71656050955414, "grad_norm": 0.43846700692234125, "learning_rate": 7.702783317479751e-07, "loss": 0.0083, "step": 8169 }, { "epoch": 3.7170154686078254, "grad_norm": 0.6596827524426608, "learning_rate": 7.697624235589304e-07, "loss": 0.0035, "step": 8170 }, { "epoch": 3.7174704276615103, "grad_norm": 0.20329488513299987, "learning_rate": 7.692466567570858e-07, "loss": 0.0015, "step": 8171 }, { "epoch": 3.7179253867151956, "grad_norm": 0.6130471796768915, "learning_rate": 7.687310313845886e-07, "loss": 0.0046, "step": 8172 }, { "epoch": 3.718380345768881, "grad_norm": 0.26857182281038283, "learning_rate": 7.682155474835739e-07, "loss": 0.0041, "step": 8173 }, { "epoch": 3.7188353048225657, "grad_norm": 0.388953202681179, "learning_rate": 7.67700205096164e-07, "loss": 0.0046, "step": 8174 }, { "epoch": 3.719290263876251, "grad_norm": 0.3775323828604808, "learning_rate": 7.671850042644702e-07, "loss": 0.0083, "step": 8175 }, { "epoch": 3.7197452229299364, "grad_norm": 0.11074208847761251, "learning_rate": 7.66669945030592e-07, "loss": 0.0006, "step": 8176 }, { "epoch": 3.7202001819836212, "grad_norm": 0.37180728764479565, "learning_rate": 7.661550274366189e-07, "loss": 0.0018, "step": 8177 }, { "epoch": 3.7206551410373065, "grad_norm": 0.5179804770913431, "learning_rate": 7.656402515246261e-07, "loss": 0.0087, "step": 8178 }, { "epoch": 3.721110100090992, "grad_norm": 0.19368376537111406, "learning_rate": 7.651256173366805e-07, "loss": 0.0025, "step": 8179 }, { "epoch": 3.7215650591446767, "grad_norm": 0.31533247505130596, "learning_rate": 7.646111249148349e-07, "loss": 0.0051, "step": 8180 }, { "epoch": 3.722020018198362, "grad_norm": 0.9007123750618157, "learning_rate": 7.640967743011304e-07, "loss": 0.0093, "step": 8181 }, { "epoch": 3.7224749772520473, "grad_norm": 0.14610163564679332, "learning_rate": 7.635825655375989e-07, "loss": 0.0012, "step": 8182 }, { "epoch": 3.722929936305732, "grad_norm": 0.3242970933895198, "learning_rate": 7.630684986662587e-07, "loss": 0.007, "step": 8183 }, { "epoch": 3.7233848953594175, "grad_norm": 0.3119549540206527, "learning_rate": 7.625545737291168e-07, "loss": 0.0042, "step": 8184 }, { "epoch": 3.723839854413103, "grad_norm": 0.17848753355593133, "learning_rate": 7.620407907681682e-07, "loss": 0.0008, "step": 8185 }, { "epoch": 3.724294813466788, "grad_norm": 0.4307261288094721, "learning_rate": 7.615271498253976e-07, "loss": 0.0111, "step": 8186 }, { "epoch": 3.724749772520473, "grad_norm": 0.25640319857586247, "learning_rate": 7.610136509427782e-07, "loss": 0.0073, "step": 8187 }, { "epoch": 3.7252047315741583, "grad_norm": 0.22727558910922852, "learning_rate": 7.6050029416227e-07, "loss": 0.0017, "step": 8188 }, { "epoch": 3.7256596906278436, "grad_norm": 0.4533982889809816, "learning_rate": 7.599870795258224e-07, "loss": 0.0068, "step": 8189 }, { "epoch": 3.7261146496815285, "grad_norm": 0.4705980769432011, "learning_rate": 7.594740070753725e-07, "loss": 0.0094, "step": 8190 }, { "epoch": 3.726569608735214, "grad_norm": 0.27652155574202675, "learning_rate": 7.58961076852846e-07, "loss": 0.0052, "step": 8191 }, { "epoch": 3.727024567788899, "grad_norm": 0.22664042288954894, "learning_rate": 7.58448288900158e-07, "loss": 0.0026, "step": 8192 }, { "epoch": 3.7274795268425844, "grad_norm": 0.3984447922483053, "learning_rate": 7.579356432592117e-07, "loss": 0.0022, "step": 8193 }, { "epoch": 3.7279344858962693, "grad_norm": 0.2959967856163849, "learning_rate": 7.574231399718976e-07, "loss": 0.0011, "step": 8194 }, { "epoch": 3.7283894449499546, "grad_norm": 0.10518163265101346, "learning_rate": 7.56910779080095e-07, "loss": 0.0008, "step": 8195 }, { "epoch": 3.72884440400364, "grad_norm": 0.3409618452099632, "learning_rate": 7.56398560625671e-07, "loss": 0.0029, "step": 8196 }, { "epoch": 3.729299363057325, "grad_norm": 0.12711780416264884, "learning_rate": 7.558864846504834e-07, "loss": 0.0009, "step": 8197 }, { "epoch": 3.72975432211101, "grad_norm": 0.23342799315569043, "learning_rate": 7.553745511963761e-07, "loss": 0.0043, "step": 8198 }, { "epoch": 3.7302092811646954, "grad_norm": 0.12791471537439206, "learning_rate": 7.548627603051809e-07, "loss": 0.0007, "step": 8199 }, { "epoch": 3.7306642402183803, "grad_norm": 0.42801229414939806, "learning_rate": 7.543511120187208e-07, "loss": 0.0049, "step": 8200 }, { "epoch": 3.7311191992720656, "grad_norm": 0.07688453329892708, "learning_rate": 7.538396063788037e-07, "loss": 0.0006, "step": 8201 }, { "epoch": 3.731574158325751, "grad_norm": 0.34024962837295064, "learning_rate": 7.533282434272294e-07, "loss": 0.0049, "step": 8202 }, { "epoch": 3.7320291173794358, "grad_norm": 0.6323524139426128, "learning_rate": 7.528170232057827e-07, "loss": 0.0048, "step": 8203 }, { "epoch": 3.732484076433121, "grad_norm": 0.15983845860851792, "learning_rate": 7.52305945756239e-07, "loss": 0.0017, "step": 8204 }, { "epoch": 3.7329390354868064, "grad_norm": 0.33411380966486875, "learning_rate": 7.517950111203598e-07, "loss": 0.0082, "step": 8205 }, { "epoch": 3.7333939945404913, "grad_norm": 0.2485061243423592, "learning_rate": 7.512842193398979e-07, "loss": 0.0012, "step": 8206 }, { "epoch": 3.7338489535941766, "grad_norm": 0.4528243507512641, "learning_rate": 7.50773570456593e-07, "loss": 0.0087, "step": 8207 }, { "epoch": 3.734303912647862, "grad_norm": 0.21779598749285517, "learning_rate": 7.502630645121722e-07, "loss": 0.0017, "step": 8208 }, { "epoch": 3.7347588717015467, "grad_norm": 0.29778477365716144, "learning_rate": 7.497527015483525e-07, "loss": 0.0056, "step": 8209 }, { "epoch": 3.735213830755232, "grad_norm": 0.2745875418615652, "learning_rate": 7.49242481606837e-07, "loss": 0.0047, "step": 8210 }, { "epoch": 3.7356687898089174, "grad_norm": 0.2267001622714543, "learning_rate": 7.487324047293204e-07, "loss": 0.0018, "step": 8211 }, { "epoch": 3.7361237488626022, "grad_norm": 0.10426146086219185, "learning_rate": 7.482224709574828e-07, "loss": 0.0012, "step": 8212 }, { "epoch": 3.7365787079162875, "grad_norm": 0.20591808082631624, "learning_rate": 7.477126803329934e-07, "loss": 0.0016, "step": 8213 }, { "epoch": 3.737033666969973, "grad_norm": 0.17327201090185007, "learning_rate": 7.472030328975114e-07, "loss": 0.0021, "step": 8214 }, { "epoch": 3.7374886260236577, "grad_norm": 0.16907263131350095, "learning_rate": 7.466935286926808e-07, "loss": 0.0024, "step": 8215 }, { "epoch": 3.737943585077343, "grad_norm": 0.1251327908167899, "learning_rate": 7.461841677601381e-07, "loss": 0.0009, "step": 8216 }, { "epoch": 3.7383985441310283, "grad_norm": 0.11964812370654432, "learning_rate": 7.456749501415053e-07, "loss": 0.0012, "step": 8217 }, { "epoch": 3.738853503184713, "grad_norm": 0.2960574233964127, "learning_rate": 7.451658758783928e-07, "loss": 0.0028, "step": 8218 }, { "epoch": 3.7393084622383985, "grad_norm": 0.2873219528318767, "learning_rate": 7.446569450123994e-07, "loss": 0.0042, "step": 8219 }, { "epoch": 3.739763421292084, "grad_norm": 0.40471980123541407, "learning_rate": 7.441481575851136e-07, "loss": 0.0037, "step": 8220 }, { "epoch": 3.7402183803457687, "grad_norm": 0.25939234779342124, "learning_rate": 7.436395136381117e-07, "loss": 0.002, "step": 8221 }, { "epoch": 3.740673339399454, "grad_norm": 0.04888183114444719, "learning_rate": 7.431310132129571e-07, "loss": 0.0001, "step": 8222 }, { "epoch": 3.7411282984531393, "grad_norm": 0.2979199354922992, "learning_rate": 7.426226563512021e-07, "loss": 0.0032, "step": 8223 }, { "epoch": 3.741583257506824, "grad_norm": 0.2907612418344866, "learning_rate": 7.421144430943866e-07, "loss": 0.0027, "step": 8224 }, { "epoch": 3.7420382165605095, "grad_norm": 0.47128573077900754, "learning_rate": 7.416063734840412e-07, "loss": 0.004, "step": 8225 }, { "epoch": 3.742493175614195, "grad_norm": 0.08561672904121022, "learning_rate": 7.41098447561682e-07, "loss": 0.0004, "step": 8226 }, { "epoch": 3.7429481346678797, "grad_norm": 0.3473313038839146, "learning_rate": 7.405906653688136e-07, "loss": 0.0038, "step": 8227 }, { "epoch": 3.743403093721565, "grad_norm": 0.38905498607233346, "learning_rate": 7.400830269469317e-07, "loss": 0.0021, "step": 8228 }, { "epoch": 3.7438580527752503, "grad_norm": 0.25662573588991217, "learning_rate": 7.39575532337517e-07, "loss": 0.0021, "step": 8229 }, { "epoch": 3.744313011828935, "grad_norm": 0.12591175657087667, "learning_rate": 7.390681815820388e-07, "loss": 0.0012, "step": 8230 }, { "epoch": 3.7447679708826205, "grad_norm": 0.07624201965711405, "learning_rate": 7.385609747219574e-07, "loss": 0.0006, "step": 8231 }, { "epoch": 3.745222929936306, "grad_norm": 0.35210042593585444, "learning_rate": 7.380539117987187e-07, "loss": 0.0039, "step": 8232 }, { "epoch": 3.7456778889899907, "grad_norm": 0.5634950418481592, "learning_rate": 7.375469928537574e-07, "loss": 0.0067, "step": 8233 }, { "epoch": 3.746132848043676, "grad_norm": 0.22667840053687124, "learning_rate": 7.370402179284958e-07, "loss": 0.003, "step": 8234 }, { "epoch": 3.7465878070973613, "grad_norm": 0.11512536408122304, "learning_rate": 7.365335870643462e-07, "loss": 0.0012, "step": 8235 }, { "epoch": 3.747042766151046, "grad_norm": 0.2765999812707346, "learning_rate": 7.360271003027089e-07, "loss": 0.0036, "step": 8236 }, { "epoch": 3.7474977252047315, "grad_norm": 0.3042817162800078, "learning_rate": 7.35520757684971e-07, "loss": 0.0032, "step": 8237 }, { "epoch": 3.7479526842584168, "grad_norm": 0.18663765548348532, "learning_rate": 7.350145592525082e-07, "loss": 0.0019, "step": 8238 }, { "epoch": 3.7484076433121016, "grad_norm": 0.40266759836340993, "learning_rate": 7.345085050466846e-07, "loss": 0.0062, "step": 8239 }, { "epoch": 3.748862602365787, "grad_norm": 0.2855504661647979, "learning_rate": 7.340025951088537e-07, "loss": 0.0018, "step": 8240 }, { "epoch": 3.7493175614194723, "grad_norm": 0.29716761357279337, "learning_rate": 7.334968294803546e-07, "loss": 0.003, "step": 8241 }, { "epoch": 3.7497725204731576, "grad_norm": 0.3467943977316671, "learning_rate": 7.329912082025182e-07, "loss": 0.0028, "step": 8242 }, { "epoch": 3.7502274795268424, "grad_norm": 0.14267073038787426, "learning_rate": 7.324857313166603e-07, "loss": 0.0013, "step": 8243 }, { "epoch": 3.7506824385805277, "grad_norm": 0.2798913737845762, "learning_rate": 7.319803988640858e-07, "loss": 0.005, "step": 8244 }, { "epoch": 3.751137397634213, "grad_norm": 0.07226439142224642, "learning_rate": 7.314752108860895e-07, "loss": 0.0005, "step": 8245 }, { "epoch": 3.7515923566878984, "grad_norm": 0.26556503741078213, "learning_rate": 7.309701674239522e-07, "loss": 0.0021, "step": 8246 }, { "epoch": 3.7520473157415832, "grad_norm": 0.19774466255566625, "learning_rate": 7.304652685189434e-07, "loss": 0.0017, "step": 8247 }, { "epoch": 3.7525022747952685, "grad_norm": 0.13189436609329588, "learning_rate": 7.299605142123226e-07, "loss": 0.0014, "step": 8248 }, { "epoch": 3.752957233848954, "grad_norm": 0.12156346543760096, "learning_rate": 7.294559045453342e-07, "loss": 0.0004, "step": 8249 }, { "epoch": 3.7534121929026387, "grad_norm": 0.14370923097911825, "learning_rate": 7.289514395592143e-07, "loss": 0.0012, "step": 8250 }, { "epoch": 3.753867151956324, "grad_norm": 0.21951459543922677, "learning_rate": 7.284471192951848e-07, "loss": 0.0028, "step": 8251 }, { "epoch": 3.7543221110100093, "grad_norm": 0.6188427038503349, "learning_rate": 7.279429437944565e-07, "loss": 0.007, "step": 8252 }, { "epoch": 3.754777070063694, "grad_norm": 0.06292076865509574, "learning_rate": 7.274389130982276e-07, "loss": 0.0006, "step": 8253 }, { "epoch": 3.7552320291173795, "grad_norm": 0.3930567819716887, "learning_rate": 7.269350272476858e-07, "loss": 0.0026, "step": 8254 }, { "epoch": 3.755686988171065, "grad_norm": 0.19480360997117047, "learning_rate": 7.264312862840073e-07, "loss": 0.0022, "step": 8255 }, { "epoch": 3.7561419472247497, "grad_norm": 0.29293519580041244, "learning_rate": 7.259276902483547e-07, "loss": 0.0044, "step": 8256 }, { "epoch": 3.756596906278435, "grad_norm": 0.2429832396908006, "learning_rate": 7.254242391818794e-07, "loss": 0.0016, "step": 8257 }, { "epoch": 3.7570518653321203, "grad_norm": 0.44696152223775737, "learning_rate": 7.249209331257209e-07, "loss": 0.0078, "step": 8258 }, { "epoch": 3.757506824385805, "grad_norm": 0.5831250784455954, "learning_rate": 7.244177721210083e-07, "loss": 0.0107, "step": 8259 }, { "epoch": 3.7579617834394905, "grad_norm": 0.09351674189524026, "learning_rate": 7.239147562088566e-07, "loss": 0.0012, "step": 8260 }, { "epoch": 3.758416742493176, "grad_norm": 0.5857165201073105, "learning_rate": 7.234118854303699e-07, "loss": 0.0021, "step": 8261 }, { "epoch": 3.7588717015468607, "grad_norm": 0.2653850531365393, "learning_rate": 7.229091598266417e-07, "loss": 0.0017, "step": 8262 }, { "epoch": 3.759326660600546, "grad_norm": 0.30992081285483974, "learning_rate": 7.224065794387513e-07, "loss": 0.0023, "step": 8263 }, { "epoch": 3.7597816196542313, "grad_norm": 0.08785484413765432, "learning_rate": 7.219041443077673e-07, "loss": 0.0005, "step": 8264 }, { "epoch": 3.760236578707916, "grad_norm": 0.40909327539300827, "learning_rate": 7.214018544747473e-07, "loss": 0.0065, "step": 8265 }, { "epoch": 3.7606915377616015, "grad_norm": 0.40960634069688345, "learning_rate": 7.208997099807358e-07, "loss": 0.0033, "step": 8266 }, { "epoch": 3.761146496815287, "grad_norm": 0.15267657166339035, "learning_rate": 7.203977108667656e-07, "loss": 0.0022, "step": 8267 }, { "epoch": 3.7616014558689717, "grad_norm": 0.36431174732046057, "learning_rate": 7.198958571738573e-07, "loss": 0.0086, "step": 8268 }, { "epoch": 3.762056414922657, "grad_norm": 0.31503227463274663, "learning_rate": 7.193941489430206e-07, "loss": 0.0031, "step": 8269 }, { "epoch": 3.7625113739763423, "grad_norm": 0.588822667076796, "learning_rate": 7.188925862152535e-07, "loss": 0.0203, "step": 8270 }, { "epoch": 3.762966333030027, "grad_norm": 0.4381148324485534, "learning_rate": 7.18391169031541e-07, "loss": 0.0059, "step": 8271 }, { "epoch": 3.7634212920837125, "grad_norm": 0.5508706898652731, "learning_rate": 7.178898974328563e-07, "loss": 0.0073, "step": 8272 }, { "epoch": 3.7638762511373978, "grad_norm": 0.21647702446732414, "learning_rate": 7.173887714601607e-07, "loss": 0.0023, "step": 8273 }, { "epoch": 3.7643312101910826, "grad_norm": 0.38901647588741056, "learning_rate": 7.16887791154405e-07, "loss": 0.0036, "step": 8274 }, { "epoch": 3.764786169244768, "grad_norm": 0.2874485831042166, "learning_rate": 7.16386956556526e-07, "loss": 0.0047, "step": 8275 }, { "epoch": 3.7652411282984533, "grad_norm": 0.20159580212455872, "learning_rate": 7.15886267707451e-07, "loss": 0.0018, "step": 8276 }, { "epoch": 3.765696087352138, "grad_norm": 0.40099770533218393, "learning_rate": 7.15385724648093e-07, "loss": 0.0053, "step": 8277 }, { "epoch": 3.7661510464058234, "grad_norm": 0.40581988433908606, "learning_rate": 7.148853274193537e-07, "loss": 0.0037, "step": 8278 }, { "epoch": 3.7666060054595087, "grad_norm": 0.3685420758572588, "learning_rate": 7.143850760621246e-07, "loss": 0.0029, "step": 8279 }, { "epoch": 3.7670609645131936, "grad_norm": 0.25695219104630496, "learning_rate": 7.138849706172835e-07, "loss": 0.0042, "step": 8280 }, { "epoch": 3.767515923566879, "grad_norm": 0.1874221926629101, "learning_rate": 7.133850111256965e-07, "loss": 0.0008, "step": 8281 }, { "epoch": 3.7679708826205642, "grad_norm": 0.19126640829668862, "learning_rate": 7.128851976282172e-07, "loss": 0.0023, "step": 8282 }, { "epoch": 3.768425841674249, "grad_norm": 0.3782253485138753, "learning_rate": 7.123855301656893e-07, "loss": 0.0047, "step": 8283 }, { "epoch": 3.7688808007279344, "grad_norm": 0.2514049853368398, "learning_rate": 7.118860087789436e-07, "loss": 0.0059, "step": 8284 }, { "epoch": 3.7693357597816197, "grad_norm": 0.2107461064581342, "learning_rate": 7.113866335087982e-07, "loss": 0.0035, "step": 8285 }, { "epoch": 3.7697907188353046, "grad_norm": 0.33719009543180534, "learning_rate": 7.108874043960601e-07, "loss": 0.0051, "step": 8286 }, { "epoch": 3.77024567788899, "grad_norm": 0.023665376442819942, "learning_rate": 7.103883214815227e-07, "loss": 0.0001, "step": 8287 }, { "epoch": 3.770700636942675, "grad_norm": 0.18196148356762307, "learning_rate": 7.098893848059707e-07, "loss": 0.0012, "step": 8288 }, { "epoch": 3.77115559599636, "grad_norm": 0.2989063097172773, "learning_rate": 7.093905944101734e-07, "loss": 0.0031, "step": 8289 }, { "epoch": 3.7716105550500454, "grad_norm": 0.30175697557157943, "learning_rate": 7.088919503348909e-07, "loss": 0.0045, "step": 8290 }, { "epoch": 3.7720655141037307, "grad_norm": 0.15981486690931948, "learning_rate": 7.0839345262087e-07, "loss": 0.0015, "step": 8291 }, { "epoch": 3.7725204731574156, "grad_norm": 0.32817574491484447, "learning_rate": 7.078951013088445e-07, "loss": 0.0035, "step": 8292 }, { "epoch": 3.772975432211101, "grad_norm": 0.4492861890364008, "learning_rate": 7.073968964395389e-07, "loss": 0.0117, "step": 8293 }, { "epoch": 3.773430391264786, "grad_norm": 0.05472441295210932, "learning_rate": 7.068988380536634e-07, "loss": 0.0003, "step": 8294 }, { "epoch": 3.7738853503184715, "grad_norm": 0.08668447011725966, "learning_rate": 7.064009261919178e-07, "loss": 0.0007, "step": 8295 }, { "epoch": 3.7743403093721564, "grad_norm": 0.6395446217533196, "learning_rate": 7.059031608949873e-07, "loss": 0.0027, "step": 8296 }, { "epoch": 3.7747952684258417, "grad_norm": 0.2522958326832031, "learning_rate": 7.054055422035488e-07, "loss": 0.0041, "step": 8297 }, { "epoch": 3.775250227479527, "grad_norm": 0.07104321526385536, "learning_rate": 7.049080701582658e-07, "loss": 0.0002, "step": 8298 }, { "epoch": 3.775705186533212, "grad_norm": 0.18454078192483736, "learning_rate": 7.044107447997888e-07, "loss": 0.0028, "step": 8299 }, { "epoch": 3.776160145586897, "grad_norm": 0.20099548472896966, "learning_rate": 7.039135661687568e-07, "loss": 0.0021, "step": 8300 }, { "epoch": 3.7766151046405825, "grad_norm": 0.3724147487331092, "learning_rate": 7.034165343057972e-07, "loss": 0.0094, "step": 8301 }, { "epoch": 3.777070063694268, "grad_norm": 0.38272478359023593, "learning_rate": 7.029196492515244e-07, "loss": 0.0091, "step": 8302 }, { "epoch": 3.7775250227479527, "grad_norm": 0.40729008750663953, "learning_rate": 7.024229110465422e-07, "loss": 0.0034, "step": 8303 }, { "epoch": 3.777979981801638, "grad_norm": 0.10817349195124203, "learning_rate": 7.019263197314427e-07, "loss": 0.0007, "step": 8304 }, { "epoch": 3.7784349408553233, "grad_norm": 0.44457206344878036, "learning_rate": 7.014298753468043e-07, "loss": 0.004, "step": 8305 }, { "epoch": 3.778889899909008, "grad_norm": 0.3787159610463049, "learning_rate": 7.009335779331944e-07, "loss": 0.0049, "step": 8306 }, { "epoch": 3.7793448589626935, "grad_norm": 0.2614848984393463, "learning_rate": 7.004374275311671e-07, "loss": 0.0039, "step": 8307 }, { "epoch": 3.7797998180163788, "grad_norm": 0.48323402418059896, "learning_rate": 6.999414241812672e-07, "loss": 0.0047, "step": 8308 }, { "epoch": 3.7802547770700636, "grad_norm": 0.2149286601671691, "learning_rate": 6.994455679240253e-07, "loss": 0.0031, "step": 8309 }, { "epoch": 3.780709736123749, "grad_norm": 0.11551862570116635, "learning_rate": 6.989498587999593e-07, "loss": 0.0016, "step": 8310 }, { "epoch": 3.7811646951774343, "grad_norm": 0.12761532614742685, "learning_rate": 6.984542968495784e-07, "loss": 0.0013, "step": 8311 }, { "epoch": 3.781619654231119, "grad_norm": 0.13217457207812205, "learning_rate": 6.979588821133756e-07, "loss": 0.0012, "step": 8312 }, { "epoch": 3.7820746132848044, "grad_norm": 0.21568281435016604, "learning_rate": 6.974636146318361e-07, "loss": 0.0018, "step": 8313 }, { "epoch": 3.7825295723384897, "grad_norm": 0.1507139407806551, "learning_rate": 6.969684944454297e-07, "loss": 0.0013, "step": 8314 }, { "epoch": 3.7829845313921746, "grad_norm": 0.07237210817070637, "learning_rate": 6.964735215946155e-07, "loss": 0.0006, "step": 8315 }, { "epoch": 3.78343949044586, "grad_norm": 0.24748848720755223, "learning_rate": 6.959786961198398e-07, "loss": 0.0012, "step": 8316 }, { "epoch": 3.7838944494995452, "grad_norm": 0.2514955967107369, "learning_rate": 6.95484018061538e-07, "loss": 0.002, "step": 8317 }, { "epoch": 3.78434940855323, "grad_norm": 0.03883373792801177, "learning_rate": 6.949894874601337e-07, "loss": 0.0002, "step": 8318 }, { "epoch": 3.7848043676069154, "grad_norm": 0.3893202890027429, "learning_rate": 6.944951043560375e-07, "loss": 0.0053, "step": 8319 }, { "epoch": 3.7852593266606007, "grad_norm": 0.2538358764960477, "learning_rate": 6.940008687896476e-07, "loss": 0.0017, "step": 8320 }, { "epoch": 3.7857142857142856, "grad_norm": 0.11824118459649498, "learning_rate": 6.935067808013502e-07, "loss": 0.0012, "step": 8321 }, { "epoch": 3.786169244767971, "grad_norm": 0.5006705936213275, "learning_rate": 6.930128404315214e-07, "loss": 0.0058, "step": 8322 }, { "epoch": 3.786624203821656, "grad_norm": 0.23247464412163427, "learning_rate": 6.92519047720523e-07, "loss": 0.001, "step": 8323 }, { "epoch": 3.787079162875341, "grad_norm": 0.2199544834554578, "learning_rate": 6.920254027087048e-07, "loss": 0.0013, "step": 8324 }, { "epoch": 3.7875341219290264, "grad_norm": 0.3520704520555474, "learning_rate": 6.915319054364064e-07, "loss": 0.0021, "step": 8325 }, { "epoch": 3.7879890809827117, "grad_norm": 0.18175915807242732, "learning_rate": 6.910385559439533e-07, "loss": 0.0005, "step": 8326 }, { "epoch": 3.7884440400363966, "grad_norm": 0.14905618709443022, "learning_rate": 6.905453542716608e-07, "loss": 0.0014, "step": 8327 }, { "epoch": 3.788898999090082, "grad_norm": 0.6179741137119675, "learning_rate": 6.900523004598306e-07, "loss": 0.0088, "step": 8328 }, { "epoch": 3.789353958143767, "grad_norm": 0.12150029193192755, "learning_rate": 6.895593945487527e-07, "loss": 0.0012, "step": 8329 }, { "epoch": 3.789808917197452, "grad_norm": 0.40646928077678746, "learning_rate": 6.890666365787043e-07, "loss": 0.0048, "step": 8330 }, { "epoch": 3.7902638762511374, "grad_norm": 0.48673732786317897, "learning_rate": 6.885740265899527e-07, "loss": 0.0085, "step": 8331 }, { "epoch": 3.7907188353048227, "grad_norm": 0.25442712910910287, "learning_rate": 6.880815646227518e-07, "loss": 0.0024, "step": 8332 }, { "epoch": 3.7911737943585075, "grad_norm": 0.3294889805308051, "learning_rate": 6.875892507173426e-07, "loss": 0.004, "step": 8333 }, { "epoch": 3.791628753412193, "grad_norm": 0.399785747926621, "learning_rate": 6.870970849139555e-07, "loss": 0.0053, "step": 8334 }, { "epoch": 3.792083712465878, "grad_norm": 0.11836401195302569, "learning_rate": 6.866050672528074e-07, "loss": 0.001, "step": 8335 }, { "epoch": 3.792538671519563, "grad_norm": 0.1665967013558716, "learning_rate": 6.861131977741034e-07, "loss": 0.0019, "step": 8336 }, { "epoch": 3.7929936305732483, "grad_norm": 0.27082108283806633, "learning_rate": 6.85621476518038e-07, "loss": 0.0034, "step": 8337 }, { "epoch": 3.7934485896269337, "grad_norm": 0.2896952919289291, "learning_rate": 6.851299035247913e-07, "loss": 0.0067, "step": 8338 }, { "epoch": 3.7939035486806185, "grad_norm": 0.11154206883308647, "learning_rate": 6.846384788345337e-07, "loss": 0.0011, "step": 8339 }, { "epoch": 3.794358507734304, "grad_norm": 0.28962257785881407, "learning_rate": 6.841472024874213e-07, "loss": 0.0053, "step": 8340 }, { "epoch": 3.794813466787989, "grad_norm": 0.2673556541560698, "learning_rate": 6.836560745235987e-07, "loss": 0.0028, "step": 8341 }, { "epoch": 3.795268425841674, "grad_norm": 0.4378826406735603, "learning_rate": 6.831650949831997e-07, "loss": 0.0069, "step": 8342 }, { "epoch": 3.7957233848953593, "grad_norm": 0.09774385453574179, "learning_rate": 6.826742639063447e-07, "loss": 0.0004, "step": 8343 }, { "epoch": 3.7961783439490446, "grad_norm": 0.43918865313949107, "learning_rate": 6.821835813331415e-07, "loss": 0.0041, "step": 8344 }, { "epoch": 3.7966333030027295, "grad_norm": 0.39060339022642154, "learning_rate": 6.816930473036865e-07, "loss": 0.0046, "step": 8345 }, { "epoch": 3.797088262056415, "grad_norm": 0.09723621173342846, "learning_rate": 6.812026618580639e-07, "loss": 0.0009, "step": 8346 }, { "epoch": 3.7975432211101, "grad_norm": 0.24011865306233024, "learning_rate": 6.80712425036347e-07, "loss": 0.0029, "step": 8347 }, { "epoch": 3.797998180163785, "grad_norm": 0.3085811776543174, "learning_rate": 6.802223368785951e-07, "loss": 0.0047, "step": 8348 }, { "epoch": 3.7984531392174703, "grad_norm": 0.2529600706468571, "learning_rate": 6.797323974248557e-07, "loss": 0.0024, "step": 8349 }, { "epoch": 3.7989080982711556, "grad_norm": 0.3505290255366888, "learning_rate": 6.792426067151636e-07, "loss": 0.006, "step": 8350 }, { "epoch": 3.799363057324841, "grad_norm": 0.4025116261383598, "learning_rate": 6.787529647895441e-07, "loss": 0.0111, "step": 8351 }, { "epoch": 3.799818016378526, "grad_norm": 0.3624413499325361, "learning_rate": 6.782634716880068e-07, "loss": 0.007, "step": 8352 }, { "epoch": 3.800272975432211, "grad_norm": 0.16736769712222052, "learning_rate": 6.777741274505525e-07, "loss": 0.0034, "step": 8353 }, { "epoch": 3.8007279344858964, "grad_norm": 0.5770243383723307, "learning_rate": 6.772849321171676e-07, "loss": 0.015, "step": 8354 }, { "epoch": 3.8011828935395813, "grad_norm": 0.21304008917849643, "learning_rate": 6.767958857278256e-07, "loss": 0.0015, "step": 8355 }, { "epoch": 3.8016378525932666, "grad_norm": 0.4043419226911027, "learning_rate": 6.763069883224915e-07, "loss": 0.0065, "step": 8356 }, { "epoch": 3.802092811646952, "grad_norm": 0.11063344862288715, "learning_rate": 6.758182399411142e-07, "loss": 0.0008, "step": 8357 }, { "epoch": 3.802547770700637, "grad_norm": 1.4664936270639495, "learning_rate": 6.753296406236326e-07, "loss": 0.0023, "step": 8358 }, { "epoch": 3.803002729754322, "grad_norm": 0.15306465745932182, "learning_rate": 6.748411904099719e-07, "loss": 0.001, "step": 8359 }, { "epoch": 3.8034576888080074, "grad_norm": 0.16432427980936212, "learning_rate": 6.743528893400466e-07, "loss": 0.0023, "step": 8360 }, { "epoch": 3.8039126478616927, "grad_norm": 0.4081354441627867, "learning_rate": 6.738647374537597e-07, "loss": 0.0073, "step": 8361 }, { "epoch": 3.8043676069153776, "grad_norm": 0.25809791462037607, "learning_rate": 6.733767347909995e-07, "loss": 0.0037, "step": 8362 }, { "epoch": 3.804822565969063, "grad_norm": 0.2427486280524624, "learning_rate": 6.728888813916434e-07, "loss": 0.0033, "step": 8363 }, { "epoch": 3.805277525022748, "grad_norm": 0.23019847003747654, "learning_rate": 6.724011772955563e-07, "loss": 0.0012, "step": 8364 }, { "epoch": 3.805732484076433, "grad_norm": 0.09430099084228707, "learning_rate": 6.719136225425923e-07, "loss": 0.0006, "step": 8365 }, { "epoch": 3.8061874431301184, "grad_norm": 0.2180400459621583, "learning_rate": 6.714262171725904e-07, "loss": 0.001, "step": 8366 }, { "epoch": 3.8066424021838037, "grad_norm": 0.2833555756336157, "learning_rate": 6.709389612253817e-07, "loss": 0.0022, "step": 8367 }, { "epoch": 3.8070973612374885, "grad_norm": 0.12516782526140358, "learning_rate": 6.704518547407806e-07, "loss": 0.0011, "step": 8368 }, { "epoch": 3.807552320291174, "grad_norm": 0.2104106936146035, "learning_rate": 6.699648977585912e-07, "loss": 0.002, "step": 8369 }, { "epoch": 3.808007279344859, "grad_norm": 0.2392001440943848, "learning_rate": 6.694780903186065e-07, "loss": 0.0029, "step": 8370 }, { "epoch": 3.808462238398544, "grad_norm": 0.3496969510540672, "learning_rate": 6.689914324606062e-07, "loss": 0.0091, "step": 8371 }, { "epoch": 3.8089171974522293, "grad_norm": 0.35516677561714594, "learning_rate": 6.685049242243569e-07, "loss": 0.0048, "step": 8372 }, { "epoch": 3.8093721565059147, "grad_norm": 0.2466772023719969, "learning_rate": 6.680185656496135e-07, "loss": 0.0039, "step": 8373 }, { "epoch": 3.8098271155595995, "grad_norm": 0.34756312584326676, "learning_rate": 6.675323567761205e-07, "loss": 0.0092, "step": 8374 }, { "epoch": 3.810282074613285, "grad_norm": 0.21160912829981016, "learning_rate": 6.670462976436073e-07, "loss": 0.001, "step": 8375 }, { "epoch": 3.81073703366697, "grad_norm": 0.216673563922175, "learning_rate": 6.665603882917937e-07, "loss": 0.0018, "step": 8376 }, { "epoch": 3.811191992720655, "grad_norm": 0.044645082644005976, "learning_rate": 6.660746287603855e-07, "loss": 0.0004, "step": 8377 }, { "epoch": 3.8116469517743403, "grad_norm": 0.20299835250321785, "learning_rate": 6.655890190890769e-07, "loss": 0.0023, "step": 8378 }, { "epoch": 3.8121019108280256, "grad_norm": 0.08914116642280294, "learning_rate": 6.651035593175486e-07, "loss": 0.0008, "step": 8379 }, { "epoch": 3.8125568698817105, "grad_norm": 0.174275336414384, "learning_rate": 6.646182494854711e-07, "loss": 0.0027, "step": 8380 }, { "epoch": 3.813011828935396, "grad_norm": 0.3595596790787332, "learning_rate": 6.641330896325027e-07, "loss": 0.0033, "step": 8381 }, { "epoch": 3.813466787989081, "grad_norm": 0.4681061571099994, "learning_rate": 6.636480797982872e-07, "loss": 0.0043, "step": 8382 }, { "epoch": 3.813921747042766, "grad_norm": 0.11012329310609566, "learning_rate": 6.631632200224581e-07, "loss": 0.0008, "step": 8383 }, { "epoch": 3.8143767060964513, "grad_norm": 0.14520944705466374, "learning_rate": 6.626785103446345e-07, "loss": 0.0018, "step": 8384 }, { "epoch": 3.8148316651501366, "grad_norm": 0.15081511257879762, "learning_rate": 6.621939508044267e-07, "loss": 0.0009, "step": 8385 }, { "epoch": 3.8152866242038215, "grad_norm": 0.33323972454216827, "learning_rate": 6.617095414414296e-07, "loss": 0.0022, "step": 8386 }, { "epoch": 3.815741583257507, "grad_norm": 0.2084881753296051, "learning_rate": 6.612252822952267e-07, "loss": 0.0023, "step": 8387 }, { "epoch": 3.816196542311192, "grad_norm": 0.623432809534493, "learning_rate": 6.607411734053903e-07, "loss": 0.0151, "step": 8388 }, { "epoch": 3.816651501364877, "grad_norm": 0.48656552078455345, "learning_rate": 6.602572148114786e-07, "loss": 0.009, "step": 8389 }, { "epoch": 3.8171064604185623, "grad_norm": 0.2206568193052791, "learning_rate": 6.597734065530398e-07, "loss": 0.0013, "step": 8390 }, { "epoch": 3.8175614194722476, "grad_norm": 0.6534565820129091, "learning_rate": 6.592897486696079e-07, "loss": 0.0157, "step": 8391 }, { "epoch": 3.8180163785259325, "grad_norm": 0.13959568336231695, "learning_rate": 6.588062412007051e-07, "loss": 0.0012, "step": 8392 }, { "epoch": 3.8184713375796178, "grad_norm": 0.26546231007816423, "learning_rate": 6.583228841858407e-07, "loss": 0.0022, "step": 8393 }, { "epoch": 3.818926296633303, "grad_norm": 0.2155625733946592, "learning_rate": 6.578396776645136e-07, "loss": 0.0022, "step": 8394 }, { "epoch": 3.819381255686988, "grad_norm": 0.18278885518744006, "learning_rate": 6.573566216762092e-07, "loss": 0.0023, "step": 8395 }, { "epoch": 3.8198362147406733, "grad_norm": 0.04647536074981886, "learning_rate": 6.568737162604005e-07, "loss": 0.0004, "step": 8396 }, { "epoch": 3.8202911737943586, "grad_norm": 0.49703306285520327, "learning_rate": 6.563909614565483e-07, "loss": 0.0124, "step": 8397 }, { "epoch": 3.8207461328480434, "grad_norm": 0.3844032183032846, "learning_rate": 6.559083573041003e-07, "loss": 0.0072, "step": 8398 }, { "epoch": 3.8212010919017287, "grad_norm": 0.2399849742467303, "learning_rate": 6.554259038424943e-07, "loss": 0.0019, "step": 8399 }, { "epoch": 3.821656050955414, "grad_norm": 0.6034595084836918, "learning_rate": 6.549436011111534e-07, "loss": 0.0048, "step": 8400 }, { "epoch": 3.822111010009099, "grad_norm": 0.21227339065474524, "learning_rate": 6.544614491494886e-07, "loss": 0.0014, "step": 8401 }, { "epoch": 3.8225659690627842, "grad_norm": 0.18559297950523887, "learning_rate": 6.539794479969003e-07, "loss": 0.0017, "step": 8402 }, { "epoch": 3.8230209281164695, "grad_norm": 0.17892006360799428, "learning_rate": 6.534975976927743e-07, "loss": 0.001, "step": 8403 }, { "epoch": 3.823475887170155, "grad_norm": 0.16298125496362026, "learning_rate": 6.530158982764867e-07, "loss": 0.0028, "step": 8404 }, { "epoch": 3.8239308462238397, "grad_norm": 0.27591416598098467, "learning_rate": 6.52534349787399e-07, "loss": 0.0026, "step": 8405 }, { "epoch": 3.824385805277525, "grad_norm": 0.10805742313490803, "learning_rate": 6.520529522648608e-07, "loss": 0.0007, "step": 8406 }, { "epoch": 3.8248407643312103, "grad_norm": 0.4268606177840665, "learning_rate": 6.515717057482105e-07, "loss": 0.007, "step": 8407 }, { "epoch": 3.825295723384895, "grad_norm": 0.2781287763427202, "learning_rate": 6.510906102767722e-07, "loss": 0.0022, "step": 8408 }, { "epoch": 3.8257506824385805, "grad_norm": 0.14398098106239182, "learning_rate": 6.506096658898594e-07, "loss": 0.0014, "step": 8409 }, { "epoch": 3.826205641492266, "grad_norm": 0.1480588946368712, "learning_rate": 6.501288726267737e-07, "loss": 0.0012, "step": 8410 }, { "epoch": 3.826660600545951, "grad_norm": 0.2891790845584774, "learning_rate": 6.496482305268029e-07, "loss": 0.0032, "step": 8411 }, { "epoch": 3.827115559599636, "grad_norm": 0.1899670360653416, "learning_rate": 6.491677396292223e-07, "loss": 0.0031, "step": 8412 }, { "epoch": 3.8275705186533213, "grad_norm": 0.3226951888067086, "learning_rate": 6.486873999732951e-07, "loss": 0.0029, "step": 8413 }, { "epoch": 3.8280254777070066, "grad_norm": 0.20591100227916992, "learning_rate": 6.482072115982738e-07, "loss": 0.0018, "step": 8414 }, { "epoch": 3.8284804367606915, "grad_norm": 0.2598941733669782, "learning_rate": 6.477271745433958e-07, "loss": 0.0024, "step": 8415 }, { "epoch": 3.828935395814377, "grad_norm": 0.22470524943252157, "learning_rate": 6.472472888478889e-07, "loss": 0.0027, "step": 8416 }, { "epoch": 3.829390354868062, "grad_norm": 0.36339630825015534, "learning_rate": 6.467675545509669e-07, "loss": 0.0028, "step": 8417 }, { "epoch": 3.829845313921747, "grad_norm": 0.10312673821274686, "learning_rate": 6.462879716918302e-07, "loss": 0.0007, "step": 8418 }, { "epoch": 3.8303002729754323, "grad_norm": 0.3774549011014724, "learning_rate": 6.4580854030967e-07, "loss": 0.0072, "step": 8419 }, { "epoch": 3.8307552320291176, "grad_norm": 0.4182536277153322, "learning_rate": 6.453292604436626e-07, "loss": 0.006, "step": 8420 }, { "epoch": 3.8312101910828025, "grad_norm": 0.25788697919351544, "learning_rate": 6.448501321329722e-07, "loss": 0.0013, "step": 8421 }, { "epoch": 3.831665150136488, "grad_norm": 0.29607981546475287, "learning_rate": 6.443711554167506e-07, "loss": 0.0043, "step": 8422 }, { "epoch": 3.832120109190173, "grad_norm": 0.15669285182281004, "learning_rate": 6.438923303341382e-07, "loss": 0.0008, "step": 8423 }, { "epoch": 3.832575068243858, "grad_norm": 0.4175951348372579, "learning_rate": 6.434136569242632e-07, "loss": 0.0069, "step": 8424 }, { "epoch": 3.8330300272975433, "grad_norm": 0.4283840132891572, "learning_rate": 6.429351352262401e-07, "loss": 0.0037, "step": 8425 }, { "epoch": 3.8334849863512286, "grad_norm": 0.12658619573424254, "learning_rate": 6.42456765279171e-07, "loss": 0.002, "step": 8426 }, { "epoch": 3.8339399454049135, "grad_norm": 0.04057168097905442, "learning_rate": 6.419785471221459e-07, "loss": 0.0003, "step": 8427 }, { "epoch": 3.8343949044585988, "grad_norm": 0.16827557948066557, "learning_rate": 6.415004807942438e-07, "loss": 0.0018, "step": 8428 }, { "epoch": 3.834849863512284, "grad_norm": 0.1605396894270811, "learning_rate": 6.410225663345288e-07, "loss": 0.0027, "step": 8429 }, { "epoch": 3.835304822565969, "grad_norm": 0.20368359631952662, "learning_rate": 6.405448037820553e-07, "loss": 0.002, "step": 8430 }, { "epoch": 3.8357597816196543, "grad_norm": 0.26520816051884294, "learning_rate": 6.400671931758634e-07, "loss": 0.0039, "step": 8431 }, { "epoch": 3.8362147406733396, "grad_norm": 0.2722361359047947, "learning_rate": 6.395897345549801e-07, "loss": 0.0034, "step": 8432 }, { "epoch": 3.8366696997270244, "grad_norm": 0.33348432315834814, "learning_rate": 6.391124279584229e-07, "loss": 0.0051, "step": 8433 }, { "epoch": 3.8371246587807097, "grad_norm": 0.2957843563372208, "learning_rate": 6.386352734251946e-07, "loss": 0.0025, "step": 8434 }, { "epoch": 3.837579617834395, "grad_norm": 0.2641795397628571, "learning_rate": 6.381582709942857e-07, "loss": 0.0021, "step": 8435 }, { "epoch": 3.83803457688808, "grad_norm": 0.12881881719664476, "learning_rate": 6.376814207046744e-07, "loss": 0.0007, "step": 8436 }, { "epoch": 3.8384895359417652, "grad_norm": 0.3062627222451424, "learning_rate": 6.37204722595327e-07, "loss": 0.0056, "step": 8437 }, { "epoch": 3.8389444949954505, "grad_norm": 0.2926441295115, "learning_rate": 6.367281767051984e-07, "loss": 0.0024, "step": 8438 }, { "epoch": 3.8393994540491354, "grad_norm": 0.19841101526247915, "learning_rate": 6.362517830732284e-07, "loss": 0.0026, "step": 8439 }, { "epoch": 3.8398544131028207, "grad_norm": 0.26731648959736287, "learning_rate": 6.357755417383462e-07, "loss": 0.0031, "step": 8440 }, { "epoch": 3.840309372156506, "grad_norm": 0.29839729390176956, "learning_rate": 6.352994527394679e-07, "loss": 0.002, "step": 8441 }, { "epoch": 3.840764331210191, "grad_norm": 0.07002261877885346, "learning_rate": 6.34823516115497e-07, "loss": 0.0006, "step": 8442 }, { "epoch": 3.841219290263876, "grad_norm": 0.14159625471538886, "learning_rate": 6.343477319053248e-07, "loss": 0.0024, "step": 8443 }, { "epoch": 3.8416742493175615, "grad_norm": 0.06313336451074122, "learning_rate": 6.338721001478318e-07, "loss": 0.0006, "step": 8444 }, { "epoch": 3.8421292083712464, "grad_norm": 0.24967781133186934, "learning_rate": 6.333966208818834e-07, "loss": 0.0037, "step": 8445 }, { "epoch": 3.8425841674249317, "grad_norm": 0.08421060127245589, "learning_rate": 6.329212941463336e-07, "loss": 0.0005, "step": 8446 }, { "epoch": 3.843039126478617, "grad_norm": 0.33051708222579484, "learning_rate": 6.324461199800233e-07, "loss": 0.0022, "step": 8447 }, { "epoch": 3.843494085532302, "grad_norm": 0.3785847611556764, "learning_rate": 6.319710984217827e-07, "loss": 0.0063, "step": 8448 }, { "epoch": 3.843949044585987, "grad_norm": 0.2351384335917368, "learning_rate": 6.314962295104285e-07, "loss": 0.0014, "step": 8449 }, { "epoch": 3.8444040036396725, "grad_norm": 0.445249185587086, "learning_rate": 6.310215132847633e-07, "loss": 0.0034, "step": 8450 }, { "epoch": 3.8448589626933574, "grad_norm": 0.432287462119368, "learning_rate": 6.305469497835803e-07, "loss": 0.0061, "step": 8451 }, { "epoch": 3.8453139217470427, "grad_norm": 0.14433864512287245, "learning_rate": 6.300725390456581e-07, "loss": 0.0016, "step": 8452 }, { "epoch": 3.845768880800728, "grad_norm": 0.23065305834223315, "learning_rate": 6.295982811097637e-07, "loss": 0.0032, "step": 8453 }, { "epoch": 3.846223839854413, "grad_norm": 0.48392703242068097, "learning_rate": 6.291241760146513e-07, "loss": 0.0034, "step": 8454 }, { "epoch": 3.846678798908098, "grad_norm": 0.21356704786254932, "learning_rate": 6.286502237990622e-07, "loss": 0.003, "step": 8455 }, { "epoch": 3.8471337579617835, "grad_norm": 0.12716857677882112, "learning_rate": 6.281764245017255e-07, "loss": 0.0007, "step": 8456 }, { "epoch": 3.8475887170154683, "grad_norm": 0.28888504362525824, "learning_rate": 6.277027781613581e-07, "loss": 0.0019, "step": 8457 }, { "epoch": 3.8480436760691537, "grad_norm": 0.23258725274612263, "learning_rate": 6.272292848166653e-07, "loss": 0.0035, "step": 8458 }, { "epoch": 3.848498635122839, "grad_norm": 0.3932834236525741, "learning_rate": 6.267559445063379e-07, "loss": 0.0049, "step": 8459 }, { "epoch": 3.8489535941765243, "grad_norm": 0.28253663002346896, "learning_rate": 6.262827572690552e-07, "loss": 0.003, "step": 8460 }, { "epoch": 3.849408553230209, "grad_norm": 0.3842106670079731, "learning_rate": 6.258097231434832e-07, "loss": 0.0034, "step": 8461 }, { "epoch": 3.8498635122838945, "grad_norm": 0.5262953426224602, "learning_rate": 6.253368421682776e-07, "loss": 0.0107, "step": 8462 }, { "epoch": 3.8503184713375798, "grad_norm": 0.35269048690026344, "learning_rate": 6.248641143820794e-07, "loss": 0.0032, "step": 8463 }, { "epoch": 3.8507734303912646, "grad_norm": 0.19296289482420284, "learning_rate": 6.24391539823517e-07, "loss": 0.0023, "step": 8464 }, { "epoch": 3.85122838944495, "grad_norm": 0.29439466037030326, "learning_rate": 6.239191185312085e-07, "loss": 0.0036, "step": 8465 }, { "epoch": 3.8516833484986353, "grad_norm": 0.19265429886087684, "learning_rate": 6.234468505437566e-07, "loss": 0.0025, "step": 8466 }, { "epoch": 3.8521383075523206, "grad_norm": 0.22130448667301614, "learning_rate": 6.229747358997542e-07, "loss": 0.0021, "step": 8467 }, { "epoch": 3.8525932666060054, "grad_norm": 0.22560426141781018, "learning_rate": 6.225027746377801e-07, "loss": 0.0026, "step": 8468 }, { "epoch": 3.8530482256596907, "grad_norm": 0.30537130741935303, "learning_rate": 6.220309667964005e-07, "loss": 0.003, "step": 8469 }, { "epoch": 3.853503184713376, "grad_norm": 0.16315736120603466, "learning_rate": 6.215593124141686e-07, "loss": 0.0009, "step": 8470 }, { "epoch": 3.853958143767061, "grad_norm": 0.2512329161515085, "learning_rate": 6.210878115296267e-07, "loss": 0.0014, "step": 8471 }, { "epoch": 3.8544131028207462, "grad_norm": 0.17798211989934404, "learning_rate": 6.206164641813048e-07, "loss": 0.0018, "step": 8472 }, { "epoch": 3.8548680618744315, "grad_norm": 0.252991751496126, "learning_rate": 6.201452704077179e-07, "loss": 0.0026, "step": 8473 }, { "epoch": 3.8553230209281164, "grad_norm": 0.22820800657416876, "learning_rate": 6.196742302473701e-07, "loss": 0.0024, "step": 8474 }, { "epoch": 3.8557779799818017, "grad_norm": 0.13233164790725335, "learning_rate": 6.192033437387524e-07, "loss": 0.0013, "step": 8475 }, { "epoch": 3.856232939035487, "grad_norm": 0.11830351179015551, "learning_rate": 6.187326109203442e-07, "loss": 0.0006, "step": 8476 }, { "epoch": 3.856687898089172, "grad_norm": 0.15424250316700613, "learning_rate": 6.182620318306115e-07, "loss": 0.001, "step": 8477 }, { "epoch": 3.857142857142857, "grad_norm": 0.1292071481988895, "learning_rate": 6.177916065080067e-07, "loss": 0.0011, "step": 8478 }, { "epoch": 3.8575978161965425, "grad_norm": 0.17644799243390777, "learning_rate": 6.17321334990973e-07, "loss": 0.0013, "step": 8479 }, { "epoch": 3.8580527752502274, "grad_norm": 0.11848766826538773, "learning_rate": 6.168512173179372e-07, "loss": 0.0013, "step": 8480 }, { "epoch": 3.8585077343039127, "grad_norm": 0.3001859848598346, "learning_rate": 6.163812535273153e-07, "loss": 0.0009, "step": 8481 }, { "epoch": 3.858962693357598, "grad_norm": 0.09541046130298848, "learning_rate": 6.159114436575117e-07, "loss": 0.0006, "step": 8482 }, { "epoch": 3.859417652411283, "grad_norm": 0.11066952385882513, "learning_rate": 6.154417877469165e-07, "loss": 0.0016, "step": 8483 }, { "epoch": 3.859872611464968, "grad_norm": 0.3946867997296686, "learning_rate": 6.149722858339077e-07, "loss": 0.0067, "step": 8484 }, { "epoch": 3.8603275705186535, "grad_norm": 0.03375509512374549, "learning_rate": 6.145029379568504e-07, "loss": 0.0002, "step": 8485 }, { "epoch": 3.8607825295723384, "grad_norm": 0.3390897378239802, "learning_rate": 6.14033744154098e-07, "loss": 0.0033, "step": 8486 }, { "epoch": 3.8612374886260237, "grad_norm": 0.44668224406561613, "learning_rate": 6.13564704463992e-07, "loss": 0.0028, "step": 8487 }, { "epoch": 3.861692447679709, "grad_norm": 0.5586917798729273, "learning_rate": 6.130958189248593e-07, "loss": 0.0061, "step": 8488 }, { "epoch": 3.862147406733394, "grad_norm": 0.3506398338492692, "learning_rate": 6.126270875750148e-07, "loss": 0.0026, "step": 8489 }, { "epoch": 3.862602365787079, "grad_norm": 0.2511742633309603, "learning_rate": 6.121585104527608e-07, "loss": 0.0027, "step": 8490 }, { "epoch": 3.8630573248407645, "grad_norm": 0.32781358253363296, "learning_rate": 6.116900875963888e-07, "loss": 0.0065, "step": 8491 }, { "epoch": 3.8635122838944493, "grad_norm": 0.375527226538828, "learning_rate": 6.112218190441746e-07, "loss": 0.007, "step": 8492 }, { "epoch": 3.8639672429481347, "grad_norm": 0.4795229016877918, "learning_rate": 6.107537048343842e-07, "loss": 0.0151, "step": 8493 }, { "epoch": 3.86442220200182, "grad_norm": 0.5150995548827975, "learning_rate": 6.102857450052694e-07, "loss": 0.0051, "step": 8494 }, { "epoch": 3.864877161055505, "grad_norm": 0.13678838347165292, "learning_rate": 6.09817939595069e-07, "loss": 0.0017, "step": 8495 }, { "epoch": 3.86533212010919, "grad_norm": 0.1584510217536118, "learning_rate": 6.093502886420111e-07, "loss": 0.0021, "step": 8496 }, { "epoch": 3.8657870791628755, "grad_norm": 0.40241341039461087, "learning_rate": 6.088827921843097e-07, "loss": 0.0062, "step": 8497 }, { "epoch": 3.8662420382165603, "grad_norm": 0.08980726792482173, "learning_rate": 6.084154502601661e-07, "loss": 0.0005, "step": 8498 }, { "epoch": 3.8666969972702456, "grad_norm": 0.24668278569455826, "learning_rate": 6.07948262907769e-07, "loss": 0.0031, "step": 8499 }, { "epoch": 3.867151956323931, "grad_norm": 0.4749143698291347, "learning_rate": 6.074812301652955e-07, "loss": 0.0072, "step": 8500 }, { "epoch": 3.867606915377616, "grad_norm": 0.3962010589409133, "learning_rate": 6.070143520709101e-07, "loss": 0.0056, "step": 8501 }, { "epoch": 3.868061874431301, "grad_norm": 0.24458271916370247, "learning_rate": 6.065476286627631e-07, "loss": 0.0035, "step": 8502 }, { "epoch": 3.8685168334849864, "grad_norm": 0.3313522654612488, "learning_rate": 6.06081059978993e-07, "loss": 0.006, "step": 8503 }, { "epoch": 3.8689717925386713, "grad_norm": 0.32094184060973263, "learning_rate": 6.056146460577253e-07, "loss": 0.0013, "step": 8504 }, { "epoch": 3.8694267515923566, "grad_norm": 0.27283611293839494, "learning_rate": 6.051483869370745e-07, "loss": 0.0025, "step": 8505 }, { "epoch": 3.869881710646042, "grad_norm": 0.20953151636237557, "learning_rate": 6.046822826551393e-07, "loss": 0.0016, "step": 8506 }, { "epoch": 3.870336669699727, "grad_norm": 0.19965724320776243, "learning_rate": 6.042163332500101e-07, "loss": 0.0017, "step": 8507 }, { "epoch": 3.870791628753412, "grad_norm": 0.13630356272925598, "learning_rate": 6.037505387597603e-07, "loss": 0.0014, "step": 8508 }, { "epoch": 3.8712465878070974, "grad_norm": 0.2967254174608875, "learning_rate": 6.032848992224527e-07, "loss": 0.0049, "step": 8509 }, { "epoch": 3.8717015468607823, "grad_norm": 0.23474103488664289, "learning_rate": 6.028194146761384e-07, "loss": 0.0035, "step": 8510 }, { "epoch": 3.8721565059144676, "grad_norm": 0.4351240629101886, "learning_rate": 6.023540851588539e-07, "loss": 0.0042, "step": 8511 }, { "epoch": 3.872611464968153, "grad_norm": 0.15587383726652537, "learning_rate": 6.018889107086238e-07, "loss": 0.0011, "step": 8512 }, { "epoch": 3.8730664240218378, "grad_norm": 0.10892134659584173, "learning_rate": 6.014238913634593e-07, "loss": 0.001, "step": 8513 }, { "epoch": 3.873521383075523, "grad_norm": 0.343382342746955, "learning_rate": 6.009590271613608e-07, "loss": 0.0043, "step": 8514 }, { "epoch": 3.8739763421292084, "grad_norm": 0.3769981177259924, "learning_rate": 6.00494318140315e-07, "loss": 0.0062, "step": 8515 }, { "epoch": 3.8744313011828937, "grad_norm": 0.31213557664917285, "learning_rate": 6.000297643382957e-07, "loss": 0.003, "step": 8516 }, { "epoch": 3.8748862602365786, "grad_norm": 0.20147824004807033, "learning_rate": 5.995653657932637e-07, "loss": 0.0024, "step": 8517 }, { "epoch": 3.875341219290264, "grad_norm": 0.19452937100274, "learning_rate": 5.991011225431679e-07, "loss": 0.0015, "step": 8518 }, { "epoch": 3.875796178343949, "grad_norm": 0.29477496599443465, "learning_rate": 5.986370346259429e-07, "loss": 0.0067, "step": 8519 }, { "epoch": 3.876251137397634, "grad_norm": 0.1348393471247145, "learning_rate": 5.981731020795131e-07, "loss": 0.0015, "step": 8520 }, { "epoch": 3.8767060964513194, "grad_norm": 0.2674875892068463, "learning_rate": 5.977093249417898e-07, "loss": 0.0035, "step": 8521 }, { "epoch": 3.8771610555050047, "grad_norm": 0.2810722624524911, "learning_rate": 5.972457032506695e-07, "loss": 0.0017, "step": 8522 }, { "epoch": 3.87761601455869, "grad_norm": 0.06135196596114453, "learning_rate": 5.96782237044038e-07, "loss": 0.0004, "step": 8523 }, { "epoch": 3.878070973612375, "grad_norm": 0.2306018272892176, "learning_rate": 5.96318926359766e-07, "loss": 0.0022, "step": 8524 }, { "epoch": 3.87852593266606, "grad_norm": 0.413901486109169, "learning_rate": 5.958557712357152e-07, "loss": 0.0029, "step": 8525 }, { "epoch": 3.8789808917197455, "grad_norm": 0.36453755940039323, "learning_rate": 5.953927717097319e-07, "loss": 0.0043, "step": 8526 }, { "epoch": 3.8794358507734303, "grad_norm": 0.5566084744732186, "learning_rate": 5.949299278196494e-07, "loss": 0.0067, "step": 8527 }, { "epoch": 3.8798908098271156, "grad_norm": 0.5811639535803785, "learning_rate": 5.944672396032908e-07, "loss": 0.0076, "step": 8528 }, { "epoch": 3.880345768880801, "grad_norm": 0.3344560460008367, "learning_rate": 5.940047070984631e-07, "loss": 0.005, "step": 8529 }, { "epoch": 3.880800727934486, "grad_norm": 0.18341477570279402, "learning_rate": 5.935423303429644e-07, "loss": 0.0008, "step": 8530 }, { "epoch": 3.881255686988171, "grad_norm": 0.538224527727948, "learning_rate": 5.930801093745766e-07, "loss": 0.0074, "step": 8531 }, { "epoch": 3.8817106460418564, "grad_norm": 0.24212492163348445, "learning_rate": 5.926180442310709e-07, "loss": 0.0042, "step": 8532 }, { "epoch": 3.8821656050955413, "grad_norm": 0.274044040704202, "learning_rate": 5.921561349502041e-07, "loss": 0.0016, "step": 8533 }, { "epoch": 3.8826205641492266, "grad_norm": 0.09074700134617639, "learning_rate": 5.916943815697223e-07, "loss": 0.0015, "step": 8534 }, { "epoch": 3.883075523202912, "grad_norm": 0.1410749508870772, "learning_rate": 5.912327841273588e-07, "loss": 0.0007, "step": 8535 }, { "epoch": 3.883530482256597, "grad_norm": 0.14510730044950712, "learning_rate": 5.90771342660832e-07, "loss": 0.0019, "step": 8536 }, { "epoch": 3.883985441310282, "grad_norm": 0.3190049469445135, "learning_rate": 5.90310057207849e-07, "loss": 0.0056, "step": 8537 }, { "epoch": 3.8844404003639674, "grad_norm": 0.3581839087665544, "learning_rate": 5.898489278061034e-07, "loss": 0.0062, "step": 8538 }, { "epoch": 3.8848953594176523, "grad_norm": 0.36077666646880613, "learning_rate": 5.89387954493278e-07, "loss": 0.0036, "step": 8539 }, { "epoch": 3.8853503184713376, "grad_norm": 0.400267805288127, "learning_rate": 5.889271373070407e-07, "loss": 0.0061, "step": 8540 }, { "epoch": 3.885805277525023, "grad_norm": 0.09838434170365867, "learning_rate": 5.884664762850467e-07, "loss": 0.0012, "step": 8541 }, { "epoch": 3.886260236578708, "grad_norm": 0.28737445641947046, "learning_rate": 5.880059714649405e-07, "loss": 0.0042, "step": 8542 }, { "epoch": 3.886715195632393, "grad_norm": 0.32983377394027313, "learning_rate": 5.875456228843512e-07, "loss": 0.0033, "step": 8543 }, { "epoch": 3.8871701546860784, "grad_norm": 0.14291617819496452, "learning_rate": 5.870854305808976e-07, "loss": 0.0005, "step": 8544 }, { "epoch": 3.8876251137397633, "grad_norm": 0.11722405781062492, "learning_rate": 5.866253945921841e-07, "loss": 0.0012, "step": 8545 }, { "epoch": 3.8880800727934486, "grad_norm": 0.13552940448142914, "learning_rate": 5.861655149558026e-07, "loss": 0.001, "step": 8546 }, { "epoch": 3.888535031847134, "grad_norm": 0.18909934634903144, "learning_rate": 5.857057917093323e-07, "loss": 0.0017, "step": 8547 }, { "epoch": 3.8889899909008188, "grad_norm": 0.17320478282847027, "learning_rate": 5.852462248903388e-07, "loss": 0.0016, "step": 8548 }, { "epoch": 3.889444949954504, "grad_norm": 0.18973129507862158, "learning_rate": 5.847868145363777e-07, "loss": 0.001, "step": 8549 }, { "epoch": 3.8898999090081894, "grad_norm": 0.03480811825286289, "learning_rate": 5.843275606849894e-07, "loss": 0.0003, "step": 8550 }, { "epoch": 3.8903548680618742, "grad_norm": 0.1850119950579185, "learning_rate": 5.838684633737018e-07, "loss": 0.0021, "step": 8551 }, { "epoch": 3.8908098271155596, "grad_norm": 0.4673524321711766, "learning_rate": 5.834095226400302e-07, "loss": 0.0037, "step": 8552 }, { "epoch": 3.891264786169245, "grad_norm": 0.2324458255894374, "learning_rate": 5.829507385214764e-07, "loss": 0.0023, "step": 8553 }, { "epoch": 3.8917197452229297, "grad_norm": 0.1817477505263173, "learning_rate": 5.824921110555315e-07, "loss": 0.0018, "step": 8554 }, { "epoch": 3.892174704276615, "grad_norm": 0.2435475539782927, "learning_rate": 5.820336402796712e-07, "loss": 0.0036, "step": 8555 }, { "epoch": 3.8926296633303004, "grad_norm": 0.3096596430418821, "learning_rate": 5.815753262313612e-07, "loss": 0.0032, "step": 8556 }, { "epoch": 3.8930846223839852, "grad_norm": 0.21002292938143421, "learning_rate": 5.811171689480518e-07, "loss": 0.0012, "step": 8557 }, { "epoch": 3.8935395814376705, "grad_norm": 0.20382475619322804, "learning_rate": 5.806591684671814e-07, "loss": 0.002, "step": 8558 }, { "epoch": 3.893994540491356, "grad_norm": 0.3071086656701487, "learning_rate": 5.802013248261768e-07, "loss": 0.0028, "step": 8559 }, { "epoch": 3.8944494995450407, "grad_norm": 0.19100415476252547, "learning_rate": 5.7974363806245e-07, "loss": 0.0011, "step": 8560 }, { "epoch": 3.894904458598726, "grad_norm": 0.4706987343718397, "learning_rate": 5.792861082134011e-07, "loss": 0.0053, "step": 8561 }, { "epoch": 3.8953594176524113, "grad_norm": 0.2813753104630414, "learning_rate": 5.788287353164171e-07, "loss": 0.0021, "step": 8562 }, { "epoch": 3.895814376706096, "grad_norm": 0.16753182758067225, "learning_rate": 5.783715194088729e-07, "loss": 0.0021, "step": 8563 }, { "epoch": 3.8962693357597815, "grad_norm": 0.17630771897047565, "learning_rate": 5.779144605281309e-07, "loss": 0.0016, "step": 8564 }, { "epoch": 3.896724294813467, "grad_norm": 0.2222228536168804, "learning_rate": 5.774575587115389e-07, "loss": 0.0031, "step": 8565 }, { "epoch": 3.8971792538671517, "grad_norm": 0.962212150592141, "learning_rate": 5.770008139964334e-07, "loss": 0.009, "step": 8566 }, { "epoch": 3.897634212920837, "grad_norm": 0.36655456653614543, "learning_rate": 5.765442264201362e-07, "loss": 0.0081, "step": 8567 }, { "epoch": 3.8980891719745223, "grad_norm": 0.6000640240334885, "learning_rate": 5.760877960199596e-07, "loss": 0.0125, "step": 8568 }, { "epoch": 3.8985441310282076, "grad_norm": 0.3648644164881282, "learning_rate": 5.756315228331988e-07, "loss": 0.0015, "step": 8569 }, { "epoch": 3.8989990900818925, "grad_norm": 0.0706823651705751, "learning_rate": 5.751754068971407e-07, "loss": 0.0006, "step": 8570 }, { "epoch": 3.899454049135578, "grad_norm": 0.4511802430159985, "learning_rate": 5.747194482490559e-07, "loss": 0.0044, "step": 8571 }, { "epoch": 3.899909008189263, "grad_norm": 0.21580189886425719, "learning_rate": 5.742636469262023e-07, "loss": 0.0018, "step": 8572 }, { "epoch": 3.900363967242948, "grad_norm": 0.2428856761311333, "learning_rate": 5.73808002965828e-07, "loss": 0.0026, "step": 8573 }, { "epoch": 3.9008189262966333, "grad_norm": 0.5098159921693439, "learning_rate": 5.733525164051648e-07, "loss": 0.0081, "step": 8574 }, { "epoch": 3.9012738853503186, "grad_norm": 0.3436166220529654, "learning_rate": 5.728971872814335e-07, "loss": 0.0064, "step": 8575 }, { "epoch": 3.901728844404004, "grad_norm": 0.1471815193919314, "learning_rate": 5.724420156318406e-07, "loss": 0.0014, "step": 8576 }, { "epoch": 3.902183803457689, "grad_norm": 0.1965155949278025, "learning_rate": 5.719870014935811e-07, "loss": 0.0022, "step": 8577 }, { "epoch": 3.902638762511374, "grad_norm": 0.4378059953518193, "learning_rate": 5.71532144903838e-07, "loss": 0.0064, "step": 8578 }, { "epoch": 3.9030937215650594, "grad_norm": 0.4412739491440795, "learning_rate": 5.710774458997792e-07, "loss": 0.0013, "step": 8579 }, { "epoch": 3.9035486806187443, "grad_norm": 0.1470657829171883, "learning_rate": 5.706229045185604e-07, "loss": 0.0017, "step": 8580 }, { "epoch": 3.9040036396724296, "grad_norm": 0.09891974157955645, "learning_rate": 5.701685207973243e-07, "loss": 0.0009, "step": 8581 }, { "epoch": 3.904458598726115, "grad_norm": 0.3238770105516786, "learning_rate": 5.697142947732021e-07, "loss": 0.0034, "step": 8582 }, { "epoch": 3.9049135577797998, "grad_norm": 0.6259171118622531, "learning_rate": 5.692602264833103e-07, "loss": 0.0055, "step": 8583 }, { "epoch": 3.905368516833485, "grad_norm": 0.884778127814277, "learning_rate": 5.688063159647539e-07, "loss": 0.0056, "step": 8584 }, { "epoch": 3.9058234758871704, "grad_norm": 0.4188212734483671, "learning_rate": 5.683525632546244e-07, "loss": 0.0053, "step": 8585 }, { "epoch": 3.9062784349408552, "grad_norm": 0.20098396245881908, "learning_rate": 5.678989683900002e-07, "loss": 0.0025, "step": 8586 }, { "epoch": 3.9067333939945406, "grad_norm": 0.4356403343620121, "learning_rate": 5.674455314079464e-07, "loss": 0.0053, "step": 8587 }, { "epoch": 3.907188353048226, "grad_norm": 0.41761793697747407, "learning_rate": 5.669922523455171e-07, "loss": 0.0045, "step": 8588 }, { "epoch": 3.9076433121019107, "grad_norm": 0.12747591164291688, "learning_rate": 5.665391312397514e-07, "loss": 0.0024, "step": 8589 }, { "epoch": 3.908098271155596, "grad_norm": 0.2676345227811032, "learning_rate": 5.660861681276758e-07, "loss": 0.0048, "step": 8590 }, { "epoch": 3.9085532302092814, "grad_norm": 0.49402795578937564, "learning_rate": 5.65633363046306e-07, "loss": 0.0049, "step": 8591 }, { "epoch": 3.9090081892629662, "grad_norm": 0.34744138275141634, "learning_rate": 5.651807160326414e-07, "loss": 0.0058, "step": 8592 }, { "epoch": 3.9094631483166515, "grad_norm": 0.1699928039853782, "learning_rate": 5.647282271236718e-07, "loss": 0.001, "step": 8593 }, { "epoch": 3.909918107370337, "grad_norm": 0.15114477040674484, "learning_rate": 5.642758963563719e-07, "loss": 0.0015, "step": 8594 }, { "epoch": 3.9103730664240217, "grad_norm": 0.2458305939778036, "learning_rate": 5.638237237677038e-07, "loss": 0.0037, "step": 8595 }, { "epoch": 3.910828025477707, "grad_norm": 0.32854354424440485, "learning_rate": 5.63371709394617e-07, "loss": 0.0033, "step": 8596 }, { "epoch": 3.9112829845313923, "grad_norm": 0.117828883393373, "learning_rate": 5.629198532740481e-07, "loss": 0.001, "step": 8597 }, { "epoch": 3.911737943585077, "grad_norm": 0.2151387659071673, "learning_rate": 5.62468155442922e-07, "loss": 0.0034, "step": 8598 }, { "epoch": 3.9121929026387625, "grad_norm": 0.3711287975389299, "learning_rate": 5.620166159381482e-07, "loss": 0.0096, "step": 8599 }, { "epoch": 3.912647861692448, "grad_norm": 0.19333852922544248, "learning_rate": 5.615652347966247e-07, "loss": 0.0024, "step": 8600 }, { "epoch": 3.9131028207461327, "grad_norm": 0.14716182037158773, "learning_rate": 5.611140120552358e-07, "loss": 0.0015, "step": 8601 }, { "epoch": 3.913557779799818, "grad_norm": 0.1458412917141075, "learning_rate": 5.606629477508543e-07, "loss": 0.0005, "step": 8602 }, { "epoch": 3.9140127388535033, "grad_norm": 0.2851669280987992, "learning_rate": 5.602120419203391e-07, "loss": 0.007, "step": 8603 }, { "epoch": 3.914467697907188, "grad_norm": 0.5084754635093742, "learning_rate": 5.597612946005348e-07, "loss": 0.01, "step": 8604 }, { "epoch": 3.9149226569608735, "grad_norm": 0.6936884261399083, "learning_rate": 5.593107058282765e-07, "loss": 0.0135, "step": 8605 }, { "epoch": 3.915377616014559, "grad_norm": 0.26156098551581614, "learning_rate": 5.588602756403822e-07, "loss": 0.0016, "step": 8606 }, { "epoch": 3.9158325750682437, "grad_norm": 0.3054043084868096, "learning_rate": 5.584100040736609e-07, "loss": 0.0022, "step": 8607 }, { "epoch": 3.916287534121929, "grad_norm": 0.19732360971961907, "learning_rate": 5.579598911649059e-07, "loss": 0.0024, "step": 8608 }, { "epoch": 3.9167424931756143, "grad_norm": 0.4452127570854979, "learning_rate": 5.575099369508985e-07, "loss": 0.0088, "step": 8609 }, { "epoch": 3.917197452229299, "grad_norm": 0.24963393209379087, "learning_rate": 5.570601414684062e-07, "loss": 0.0022, "step": 8610 }, { "epoch": 3.9176524112829845, "grad_norm": 0.10074091640236907, "learning_rate": 5.566105047541848e-07, "loss": 0.0012, "step": 8611 }, { "epoch": 3.91810737033667, "grad_norm": 0.3152325320338209, "learning_rate": 5.561610268449775e-07, "loss": 0.005, "step": 8612 }, { "epoch": 3.9185623293903546, "grad_norm": 0.09775413520224109, "learning_rate": 5.557117077775125e-07, "loss": 0.0009, "step": 8613 }, { "epoch": 3.91901728844404, "grad_norm": 0.142517785072768, "learning_rate": 5.552625475885065e-07, "loss": 0.0013, "step": 8614 }, { "epoch": 3.9194722474977253, "grad_norm": 0.2616129208642158, "learning_rate": 5.548135463146622e-07, "loss": 0.0024, "step": 8615 }, { "epoch": 3.91992720655141, "grad_norm": 0.2467449065386136, "learning_rate": 5.543647039926712e-07, "loss": 0.0056, "step": 8616 }, { "epoch": 3.9203821656050954, "grad_norm": 0.26719919466095854, "learning_rate": 5.5391602065921e-07, "loss": 0.0033, "step": 8617 }, { "epoch": 3.9208371246587808, "grad_norm": 0.16186441803544135, "learning_rate": 5.534674963509429e-07, "loss": 0.0013, "step": 8618 }, { "epoch": 3.9212920837124656, "grad_norm": 0.321306624702867, "learning_rate": 5.530191311045218e-07, "loss": 0.003, "step": 8619 }, { "epoch": 3.921747042766151, "grad_norm": 0.15562654089564626, "learning_rate": 5.525709249565842e-07, "loss": 0.0007, "step": 8620 }, { "epoch": 3.9222020018198362, "grad_norm": 0.2730389349987887, "learning_rate": 5.521228779437568e-07, "loss": 0.0028, "step": 8621 }, { "epoch": 3.922656960873521, "grad_norm": 0.24204620429210122, "learning_rate": 5.516749901026514e-07, "loss": 0.0023, "step": 8622 }, { "epoch": 3.9231119199272064, "grad_norm": 0.03999906336866665, "learning_rate": 5.512272614698672e-07, "loss": 0.0003, "step": 8623 }, { "epoch": 3.9235668789808917, "grad_norm": 0.2849779180394626, "learning_rate": 5.507796920819905e-07, "loss": 0.0041, "step": 8624 }, { "epoch": 3.924021838034577, "grad_norm": 0.4844591297322112, "learning_rate": 5.503322819755941e-07, "loss": 0.0161, "step": 8625 }, { "epoch": 3.924476797088262, "grad_norm": 0.17522496858420272, "learning_rate": 5.498850311872392e-07, "loss": 0.0012, "step": 8626 }, { "epoch": 3.9249317561419472, "grad_norm": 0.5713956669043477, "learning_rate": 5.494379397534733e-07, "loss": 0.0023, "step": 8627 }, { "epoch": 3.9253867151956325, "grad_norm": 0.25318268739588723, "learning_rate": 5.489910077108304e-07, "loss": 0.0021, "step": 8628 }, { "epoch": 3.9258416742493174, "grad_norm": 0.278432357078927, "learning_rate": 5.485442350958317e-07, "loss": 0.001, "step": 8629 }, { "epoch": 3.9262966333030027, "grad_norm": 0.08159773597045042, "learning_rate": 5.480976219449849e-07, "loss": 0.0006, "step": 8630 }, { "epoch": 3.926751592356688, "grad_norm": 0.37922198784089656, "learning_rate": 5.476511682947861e-07, "loss": 0.004, "step": 8631 }, { "epoch": 3.9272065514103733, "grad_norm": 0.24448285166787107, "learning_rate": 5.472048741817165e-07, "loss": 0.0027, "step": 8632 }, { "epoch": 3.927661510464058, "grad_norm": 0.41874051486992814, "learning_rate": 5.467587396422467e-07, "loss": 0.0116, "step": 8633 }, { "epoch": 3.9281164695177435, "grad_norm": 0.2706661731281602, "learning_rate": 5.463127647128319e-07, "loss": 0.0031, "step": 8634 }, { "epoch": 3.928571428571429, "grad_norm": 0.12978920508470537, "learning_rate": 5.458669494299143e-07, "loss": 0.0006, "step": 8635 }, { "epoch": 3.9290263876251137, "grad_norm": 0.05470699158535635, "learning_rate": 5.454212938299256e-07, "loss": 0.0003, "step": 8636 }, { "epoch": 3.929481346678799, "grad_norm": 0.10875040005812772, "learning_rate": 5.449757979492821e-07, "loss": 0.0008, "step": 8637 }, { "epoch": 3.9299363057324843, "grad_norm": 0.2443462486120143, "learning_rate": 5.445304618243874e-07, "loss": 0.0022, "step": 8638 }, { "epoch": 3.930391264786169, "grad_norm": 0.14547148994368106, "learning_rate": 5.44085285491632e-07, "loss": 0.0008, "step": 8639 }, { "epoch": 3.9308462238398545, "grad_norm": 0.10393817261114935, "learning_rate": 5.436402689873941e-07, "loss": 0.0023, "step": 8640 }, { "epoch": 3.93130118289354, "grad_norm": 0.43307280410193266, "learning_rate": 5.431954123480393e-07, "loss": 0.0019, "step": 8641 }, { "epoch": 3.9317561419472247, "grad_norm": 0.6998007931231693, "learning_rate": 5.427507156099185e-07, "loss": 0.0108, "step": 8642 }, { "epoch": 3.93221110100091, "grad_norm": 0.13027952882112628, "learning_rate": 5.423061788093706e-07, "loss": 0.0017, "step": 8643 }, { "epoch": 3.9326660600545953, "grad_norm": 0.11322512239887926, "learning_rate": 5.418618019827199e-07, "loss": 0.0013, "step": 8644 }, { "epoch": 3.93312101910828, "grad_norm": 0.09291245529239041, "learning_rate": 5.414175851662806e-07, "loss": 0.0005, "step": 8645 }, { "epoch": 3.9335759781619655, "grad_norm": 0.14593875365235223, "learning_rate": 5.409735283963511e-07, "loss": 0.0009, "step": 8646 }, { "epoch": 3.934030937215651, "grad_norm": 0.19479960852816386, "learning_rate": 5.405296317092182e-07, "loss": 0.0007, "step": 8647 }, { "epoch": 3.9344858962693356, "grad_norm": 0.29778832100920855, "learning_rate": 5.40085895141155e-07, "loss": 0.0015, "step": 8648 }, { "epoch": 3.934940855323021, "grad_norm": 0.5401276381216138, "learning_rate": 5.396423187284208e-07, "loss": 0.011, "step": 8649 }, { "epoch": 3.9353958143767063, "grad_norm": 0.2806489097507003, "learning_rate": 5.391989025072644e-07, "loss": 0.0013, "step": 8650 }, { "epoch": 3.935850773430391, "grad_norm": 0.40382604093349217, "learning_rate": 5.387556465139185e-07, "loss": 0.0034, "step": 8651 }, { "epoch": 3.9363057324840764, "grad_norm": 0.5082685767359625, "learning_rate": 5.383125507846043e-07, "loss": 0.0092, "step": 8652 }, { "epoch": 3.9367606915377618, "grad_norm": 0.48109655950929586, "learning_rate": 5.37869615355529e-07, "loss": 0.012, "step": 8653 }, { "epoch": 3.9372156505914466, "grad_norm": 0.23131793664472078, "learning_rate": 5.374268402628877e-07, "loss": 0.0019, "step": 8654 }, { "epoch": 3.937670609645132, "grad_norm": 0.27090872308774605, "learning_rate": 5.369842255428628e-07, "loss": 0.0014, "step": 8655 }, { "epoch": 3.9381255686988172, "grad_norm": 0.20443469340706544, "learning_rate": 5.365417712316223e-07, "loss": 0.002, "step": 8656 }, { "epoch": 3.938580527752502, "grad_norm": 0.36311600218556694, "learning_rate": 5.360994773653211e-07, "loss": 0.0063, "step": 8657 }, { "epoch": 3.9390354868061874, "grad_norm": 0.14528061881480858, "learning_rate": 5.356573439801019e-07, "loss": 0.0009, "step": 8658 }, { "epoch": 3.9394904458598727, "grad_norm": 0.46861129971744114, "learning_rate": 5.352153711120928e-07, "loss": 0.0093, "step": 8659 }, { "epoch": 3.9399454049135576, "grad_norm": 1.003464307327973, "learning_rate": 5.347735587974106e-07, "loss": 0.0058, "step": 8660 }, { "epoch": 3.940400363967243, "grad_norm": 0.1634402893600164, "learning_rate": 5.343319070721592e-07, "loss": 0.0012, "step": 8661 }, { "epoch": 3.9408553230209282, "grad_norm": 0.3454989399570101, "learning_rate": 5.338904159724275e-07, "loss": 0.0045, "step": 8662 }, { "epoch": 3.941310282074613, "grad_norm": 0.3040043122047599, "learning_rate": 5.334490855342922e-07, "loss": 0.0027, "step": 8663 }, { "epoch": 3.9417652411282984, "grad_norm": 0.0928489707659415, "learning_rate": 5.330079157938159e-07, "loss": 0.0018, "step": 8664 }, { "epoch": 3.9422202001819837, "grad_norm": 0.16862715585646001, "learning_rate": 5.325669067870503e-07, "loss": 0.0012, "step": 8665 }, { "epoch": 3.9426751592356686, "grad_norm": 0.3924201711468143, "learning_rate": 5.321260585500326e-07, "loss": 0.0041, "step": 8666 }, { "epoch": 3.943130118289354, "grad_norm": 0.1469415710380911, "learning_rate": 5.316853711187858e-07, "loss": 0.0018, "step": 8667 }, { "epoch": 3.943585077343039, "grad_norm": 0.1036978456869552, "learning_rate": 5.312448445293225e-07, "loss": 0.0009, "step": 8668 }, { "epoch": 3.944040036396724, "grad_norm": 0.2825882172559417, "learning_rate": 5.308044788176387e-07, "loss": 0.0077, "step": 8669 }, { "epoch": 3.9444949954504094, "grad_norm": 0.26595747748213727, "learning_rate": 5.30364274019721e-07, "loss": 0.0011, "step": 8670 }, { "epoch": 3.9449499545040947, "grad_norm": 0.28871007231708234, "learning_rate": 5.299242301715399e-07, "loss": 0.0028, "step": 8671 }, { "epoch": 3.9454049135577796, "grad_norm": 0.3577641435154776, "learning_rate": 5.294843473090539e-07, "loss": 0.0018, "step": 8672 }, { "epoch": 3.945859872611465, "grad_norm": 0.27039011202482927, "learning_rate": 5.290446254682074e-07, "loss": 0.0021, "step": 8673 }, { "epoch": 3.94631483166515, "grad_norm": 0.3504512919385584, "learning_rate": 5.286050646849336e-07, "loss": 0.0032, "step": 8674 }, { "epoch": 3.946769790718835, "grad_norm": 0.24904292784668605, "learning_rate": 5.28165664995152e-07, "loss": 0.0027, "step": 8675 }, { "epoch": 3.9472247497725204, "grad_norm": 0.16656710918964382, "learning_rate": 5.277264264347673e-07, "loss": 0.0014, "step": 8676 }, { "epoch": 3.9476797088262057, "grad_norm": 0.43624351163522035, "learning_rate": 5.272873490396723e-07, "loss": 0.0038, "step": 8677 }, { "epoch": 3.9481346678798905, "grad_norm": 0.30101905429246895, "learning_rate": 5.268484328457457e-07, "loss": 0.0026, "step": 8678 }, { "epoch": 3.948589626933576, "grad_norm": 0.28493761467481393, "learning_rate": 5.264096778888555e-07, "loss": 0.0041, "step": 8679 }, { "epoch": 3.949044585987261, "grad_norm": 0.3296194761111857, "learning_rate": 5.259710842048535e-07, "loss": 0.0034, "step": 8680 }, { "epoch": 3.9494995450409465, "grad_norm": 0.33742598089464265, "learning_rate": 5.255326518295791e-07, "loss": 0.0025, "step": 8681 }, { "epoch": 3.9499545040946313, "grad_norm": 0.12782172858178234, "learning_rate": 5.250943807988607e-07, "loss": 0.001, "step": 8682 }, { "epoch": 3.9504094631483166, "grad_norm": 0.38895068470727484, "learning_rate": 5.246562711485101e-07, "loss": 0.0076, "step": 8683 }, { "epoch": 3.950864422202002, "grad_norm": 0.4565945574005869, "learning_rate": 5.242183229143294e-07, "loss": 0.0034, "step": 8684 }, { "epoch": 3.951319381255687, "grad_norm": 0.27936561084111594, "learning_rate": 5.237805361321044e-07, "loss": 0.0048, "step": 8685 }, { "epoch": 3.951774340309372, "grad_norm": 0.44249409538144563, "learning_rate": 5.233429108376098e-07, "loss": 0.0071, "step": 8686 }, { "epoch": 3.9522292993630574, "grad_norm": 0.17047777717475263, "learning_rate": 5.22905447066605e-07, "loss": 0.0007, "step": 8687 }, { "epoch": 3.9526842584167428, "grad_norm": 0.264411565664632, "learning_rate": 5.224681448548388e-07, "loss": 0.0049, "step": 8688 }, { "epoch": 3.9531392174704276, "grad_norm": 0.32112739623435416, "learning_rate": 5.220310042380461e-07, "loss": 0.0015, "step": 8689 }, { "epoch": 3.953594176524113, "grad_norm": 0.49159892335269423, "learning_rate": 5.215940252519472e-07, "loss": 0.0026, "step": 8690 }, { "epoch": 3.9540491355777982, "grad_norm": 0.6832635479469529, "learning_rate": 5.211572079322499e-07, "loss": 0.0115, "step": 8691 }, { "epoch": 3.954504094631483, "grad_norm": 0.09582324783537992, "learning_rate": 5.207205523146497e-07, "loss": 0.0017, "step": 8692 }, { "epoch": 3.9549590536851684, "grad_norm": 0.4301857150348251, "learning_rate": 5.202840584348265e-07, "loss": 0.0043, "step": 8693 }, { "epoch": 3.9554140127388537, "grad_norm": 0.3222506623219817, "learning_rate": 5.198477263284507e-07, "loss": 0.0025, "step": 8694 }, { "epoch": 3.9558689717925386, "grad_norm": 0.26223888606472145, "learning_rate": 5.194115560311755e-07, "loss": 0.0027, "step": 8695 }, { "epoch": 3.956323930846224, "grad_norm": 0.3145557350240582, "learning_rate": 5.189755475786446e-07, "loss": 0.0037, "step": 8696 }, { "epoch": 3.9567788898999092, "grad_norm": 0.07119307406659955, "learning_rate": 5.185397010064855e-07, "loss": 0.0007, "step": 8697 }, { "epoch": 3.957233848953594, "grad_norm": 0.22957249130053567, "learning_rate": 5.181040163503132e-07, "loss": 0.0017, "step": 8698 }, { "epoch": 3.9576888080072794, "grad_norm": 0.2794993006319359, "learning_rate": 5.176684936457313e-07, "loss": 0.0034, "step": 8699 }, { "epoch": 3.9581437670609647, "grad_norm": 0.18965436222465623, "learning_rate": 5.172331329283281e-07, "loss": 0.0027, "step": 8700 }, { "epoch": 3.9585987261146496, "grad_norm": 0.232876676330153, "learning_rate": 5.167979342336787e-07, "loss": 0.0016, "step": 8701 }, { "epoch": 3.959053685168335, "grad_norm": 0.08708074193448935, "learning_rate": 5.163628975973459e-07, "loss": 0.0008, "step": 8702 }, { "epoch": 3.95950864422202, "grad_norm": 0.16924611997209335, "learning_rate": 5.159280230548789e-07, "loss": 0.001, "step": 8703 }, { "epoch": 3.959963603275705, "grad_norm": 0.1885885487893289, "learning_rate": 5.154933106418145e-07, "loss": 0.0012, "step": 8704 }, { "epoch": 3.9604185623293904, "grad_norm": 0.5767977169561751, "learning_rate": 5.150587603936746e-07, "loss": 0.0033, "step": 8705 }, { "epoch": 3.9608735213830757, "grad_norm": 0.27937634427240926, "learning_rate": 5.146243723459692e-07, "loss": 0.0062, "step": 8706 }, { "epoch": 3.9613284804367606, "grad_norm": 0.22515446126828542, "learning_rate": 5.141901465341933e-07, "loss": 0.0024, "step": 8707 }, { "epoch": 3.961783439490446, "grad_norm": 0.2834216532119542, "learning_rate": 5.137560829938318e-07, "loss": 0.0063, "step": 8708 }, { "epoch": 3.962238398544131, "grad_norm": 0.21286606916201087, "learning_rate": 5.133221817603526e-07, "loss": 0.001, "step": 8709 }, { "epoch": 3.962693357597816, "grad_norm": 0.27531781999712046, "learning_rate": 5.128884428692136e-07, "loss": 0.0028, "step": 8710 }, { "epoch": 3.9631483166515014, "grad_norm": 0.2922386741963923, "learning_rate": 5.124548663558571e-07, "loss": 0.0049, "step": 8711 }, { "epoch": 3.9636032757051867, "grad_norm": 0.2104574476617327, "learning_rate": 5.120214522557129e-07, "loss": 0.0019, "step": 8712 }, { "epoch": 3.9640582347588715, "grad_norm": 0.14203075238604376, "learning_rate": 5.115882006041983e-07, "loss": 0.0014, "step": 8713 }, { "epoch": 3.964513193812557, "grad_norm": 0.25723449847583385, "learning_rate": 5.111551114367166e-07, "loss": 0.0028, "step": 8714 }, { "epoch": 3.964968152866242, "grad_norm": 0.24826574190841608, "learning_rate": 5.107221847886576e-07, "loss": 0.0032, "step": 8715 }, { "epoch": 3.965423111919927, "grad_norm": 0.04998862921347075, "learning_rate": 5.102894206953976e-07, "loss": 0.0003, "step": 8716 }, { "epoch": 3.9658780709736123, "grad_norm": 0.10100960666111113, "learning_rate": 5.098568191923007e-07, "loss": 0.0011, "step": 8717 }, { "epoch": 3.9663330300272976, "grad_norm": 0.22171213968277023, "learning_rate": 5.094243803147175e-07, "loss": 0.0021, "step": 8718 }, { "epoch": 3.9667879890809825, "grad_norm": 0.2717126631672115, "learning_rate": 5.089921040979847e-07, "loss": 0.0031, "step": 8719 }, { "epoch": 3.967242948134668, "grad_norm": 0.3510953696942952, "learning_rate": 5.085599905774261e-07, "loss": 0.0053, "step": 8720 }, { "epoch": 3.967697907188353, "grad_norm": 0.35148671707068463, "learning_rate": 5.081280397883509e-07, "loss": 0.0045, "step": 8721 }, { "epoch": 3.968152866242038, "grad_norm": 0.5299034270009165, "learning_rate": 5.076962517660577e-07, "loss": 0.012, "step": 8722 }, { "epoch": 3.9686078252957233, "grad_norm": 0.2580455659812629, "learning_rate": 5.072646265458292e-07, "loss": 0.0023, "step": 8723 }, { "epoch": 3.9690627843494086, "grad_norm": 0.11721055050928136, "learning_rate": 5.068331641629367e-07, "loss": 0.0007, "step": 8724 }, { "epoch": 3.9695177434030935, "grad_norm": 0.15105051125816252, "learning_rate": 5.064018646526372e-07, "loss": 0.0011, "step": 8725 }, { "epoch": 3.969972702456779, "grad_norm": 0.5116694758236188, "learning_rate": 5.059707280501736e-07, "loss": 0.0067, "step": 8726 }, { "epoch": 3.970427661510464, "grad_norm": 0.1685267514317677, "learning_rate": 5.055397543907778e-07, "loss": 0.0014, "step": 8727 }, { "epoch": 3.970882620564149, "grad_norm": 0.3032089103661776, "learning_rate": 5.051089437096662e-07, "loss": 0.0052, "step": 8728 }, { "epoch": 3.9713375796178343, "grad_norm": 0.22962336845747364, "learning_rate": 5.046782960420432e-07, "loss": 0.0015, "step": 8729 }, { "epoch": 3.9717925386715196, "grad_norm": 0.47885126400231215, "learning_rate": 5.042478114230981e-07, "loss": 0.0064, "step": 8730 }, { "epoch": 3.9722474977252045, "grad_norm": 0.17360013771285462, "learning_rate": 5.038174898880099e-07, "loss": 0.0013, "step": 8731 }, { "epoch": 3.97270245677889, "grad_norm": 0.19343525410816256, "learning_rate": 5.033873314719409e-07, "loss": 0.0016, "step": 8732 }, { "epoch": 3.973157415832575, "grad_norm": 0.42607125867829887, "learning_rate": 5.029573362100434e-07, "loss": 0.0066, "step": 8733 }, { "epoch": 3.9736123748862604, "grad_norm": 0.30724043373236476, "learning_rate": 5.025275041374538e-07, "loss": 0.004, "step": 8734 }, { "epoch": 3.9740673339399453, "grad_norm": 0.1949025454091503, "learning_rate": 5.020978352892961e-07, "loss": 0.0024, "step": 8735 }, { "epoch": 3.9745222929936306, "grad_norm": 0.5264148784211109, "learning_rate": 5.016683297006803e-07, "loss": 0.0063, "step": 8736 }, { "epoch": 3.974977252047316, "grad_norm": 0.365932203578585, "learning_rate": 5.012389874067039e-07, "loss": 0.003, "step": 8737 }, { "epoch": 3.9754322111010008, "grad_norm": 0.27809689694776496, "learning_rate": 5.00809808442452e-07, "loss": 0.0021, "step": 8738 }, { "epoch": 3.975887170154686, "grad_norm": 0.19325908789674157, "learning_rate": 5.003807928429941e-07, "loss": 0.0017, "step": 8739 }, { "epoch": 3.9763421292083714, "grad_norm": 0.5300591113857821, "learning_rate": 4.999519406433878e-07, "loss": 0.0071, "step": 8740 }, { "epoch": 3.9767970882620567, "grad_norm": 0.38844103707287847, "learning_rate": 4.995232518786761e-07, "loss": 0.0043, "step": 8741 }, { "epoch": 3.9772520473157416, "grad_norm": 0.15524520515671852, "learning_rate": 4.990947265838906e-07, "loss": 0.0005, "step": 8742 }, { "epoch": 3.977707006369427, "grad_norm": 0.16903176459749475, "learning_rate": 4.986663647940481e-07, "loss": 0.002, "step": 8743 }, { "epoch": 3.978161965423112, "grad_norm": 0.20927038372210977, "learning_rate": 4.982381665441519e-07, "loss": 0.0029, "step": 8744 }, { "epoch": 3.978616924476797, "grad_norm": 0.33096175602903033, "learning_rate": 4.978101318691936e-07, "loss": 0.0019, "step": 8745 }, { "epoch": 3.9790718835304824, "grad_norm": 0.3068954788448669, "learning_rate": 4.973822608041484e-07, "loss": 0.0046, "step": 8746 }, { "epoch": 3.9795268425841677, "grad_norm": 0.4426303691387905, "learning_rate": 4.96954553383982e-07, "loss": 0.0035, "step": 8747 }, { "epoch": 3.9799818016378525, "grad_norm": 0.5535744212619474, "learning_rate": 4.965270096436439e-07, "loss": 0.006, "step": 8748 }, { "epoch": 3.980436760691538, "grad_norm": 0.18758203772260443, "learning_rate": 4.960996296180709e-07, "loss": 0.0033, "step": 8749 }, { "epoch": 3.980891719745223, "grad_norm": 0.18406192336115482, "learning_rate": 4.956724133421861e-07, "loss": 0.0025, "step": 8750 }, { "epoch": 3.981346678798908, "grad_norm": 0.3128214339250219, "learning_rate": 4.952453608509e-07, "loss": 0.0025, "step": 8751 }, { "epoch": 3.9818016378525933, "grad_norm": 0.16320176789740487, "learning_rate": 4.948184721791105e-07, "loss": 0.0025, "step": 8752 }, { "epoch": 3.9822565969062786, "grad_norm": 0.8717535417450323, "learning_rate": 4.943917473616999e-07, "loss": 0.0055, "step": 8753 }, { "epoch": 3.9827115559599635, "grad_norm": 0.07044212476211494, "learning_rate": 4.939651864335384e-07, "loss": 0.0004, "step": 8754 }, { "epoch": 3.983166515013649, "grad_norm": 0.2294691087391196, "learning_rate": 4.935387894294825e-07, "loss": 0.0016, "step": 8755 }, { "epoch": 3.983621474067334, "grad_norm": 0.33757349505015277, "learning_rate": 4.931125563843758e-07, "loss": 0.0034, "step": 8756 }, { "epoch": 3.984076433121019, "grad_norm": 0.2536413159633585, "learning_rate": 4.926864873330483e-07, "loss": 0.0026, "step": 8757 }, { "epoch": 3.9845313921747043, "grad_norm": 0.269026697949884, "learning_rate": 4.922605823103152e-07, "loss": 0.0009, "step": 8758 }, { "epoch": 3.9849863512283896, "grad_norm": 0.33093134107904676, "learning_rate": 4.918348413509813e-07, "loss": 0.0077, "step": 8759 }, { "epoch": 3.9854413102820745, "grad_norm": 0.1941797012891581, "learning_rate": 4.914092644898347e-07, "loss": 0.0013, "step": 8760 }, { "epoch": 3.98589626933576, "grad_norm": 0.4483014881599835, "learning_rate": 4.909838517616528e-07, "loss": 0.0037, "step": 8761 }, { "epoch": 3.986351228389445, "grad_norm": 0.43284882435213307, "learning_rate": 4.90558603201198e-07, "loss": 0.0044, "step": 8762 }, { "epoch": 3.98680618744313, "grad_norm": 0.06336388840806861, "learning_rate": 4.901335188432194e-07, "loss": 0.0006, "step": 8763 }, { "epoch": 3.9872611464968153, "grad_norm": 0.21146392757209365, "learning_rate": 4.897085987224534e-07, "loss": 0.0007, "step": 8764 }, { "epoch": 3.9877161055505006, "grad_norm": 0.06801174337021484, "learning_rate": 4.892838428736211e-07, "loss": 0.0005, "step": 8765 }, { "epoch": 3.9881710646041855, "grad_norm": 0.12082558811809449, "learning_rate": 4.888592513314338e-07, "loss": 0.0011, "step": 8766 }, { "epoch": 3.988626023657871, "grad_norm": 0.07640993256572369, "learning_rate": 4.884348241305864e-07, "loss": 0.0004, "step": 8767 }, { "epoch": 3.989080982711556, "grad_norm": 0.2254313086367822, "learning_rate": 4.880105613057612e-07, "loss": 0.0033, "step": 8768 }, { "epoch": 3.989535941765241, "grad_norm": 0.23342070547334637, "learning_rate": 4.875864628916266e-07, "loss": 0.0024, "step": 8769 }, { "epoch": 3.9899909008189263, "grad_norm": 0.2712052997804552, "learning_rate": 4.871625289228376e-07, "loss": 0.0009, "step": 8770 }, { "epoch": 3.9904458598726116, "grad_norm": 0.14306744091040455, "learning_rate": 4.867387594340378e-07, "loss": 0.0019, "step": 8771 }, { "epoch": 3.9909008189262964, "grad_norm": 0.23838914417350088, "learning_rate": 4.863151544598541e-07, "loss": 0.0016, "step": 8772 }, { "epoch": 3.9913557779799818, "grad_norm": 0.08926850599460358, "learning_rate": 4.858917140349026e-07, "loss": 0.0007, "step": 8773 }, { "epoch": 3.991810737033667, "grad_norm": 0.21393043357108166, "learning_rate": 4.854684381937846e-07, "loss": 0.0021, "step": 8774 }, { "epoch": 3.992265696087352, "grad_norm": 0.09349728911937602, "learning_rate": 4.850453269710878e-07, "loss": 0.001, "step": 8775 }, { "epoch": 3.9927206551410372, "grad_norm": 0.4207490277455391, "learning_rate": 4.846223804013883e-07, "loss": 0.0051, "step": 8776 }, { "epoch": 3.9931756141947226, "grad_norm": 0.4786706266313705, "learning_rate": 4.841995985192463e-07, "loss": 0.0052, "step": 8777 }, { "epoch": 3.9936305732484074, "grad_norm": 0.2160521125609943, "learning_rate": 4.837769813592097e-07, "loss": 0.0008, "step": 8778 }, { "epoch": 3.9940855323020927, "grad_norm": 0.162425405727277, "learning_rate": 4.833545289558125e-07, "loss": 0.001, "step": 8779 }, { "epoch": 3.994540491355778, "grad_norm": 0.476988833159541, "learning_rate": 4.829322413435761e-07, "loss": 0.0072, "step": 8780 }, { "epoch": 3.994995450409463, "grad_norm": 0.10289216472943763, "learning_rate": 4.825101185570086e-07, "loss": 0.0006, "step": 8781 }, { "epoch": 3.9954504094631482, "grad_norm": 0.13254229665242281, "learning_rate": 4.820881606306033e-07, "loss": 0.0022, "step": 8782 }, { "epoch": 3.9959053685168335, "grad_norm": 0.16621465009918138, "learning_rate": 4.816663675988406e-07, "loss": 0.0009, "step": 8783 }, { "epoch": 3.9963603275705184, "grad_norm": 0.2168595179274398, "learning_rate": 4.81244739496187e-07, "loss": 0.0011, "step": 8784 }, { "epoch": 3.9968152866242037, "grad_norm": 0.2955061497037257, "learning_rate": 4.808232763570972e-07, "loss": 0.0028, "step": 8785 }, { "epoch": 3.997270245677889, "grad_norm": 0.28837068088501117, "learning_rate": 4.804019782160105e-07, "loss": 0.0019, "step": 8786 }, { "epoch": 3.997725204731574, "grad_norm": 0.09348915730252162, "learning_rate": 4.799808451073539e-07, "loss": 0.001, "step": 8787 }, { "epoch": 3.998180163785259, "grad_norm": 0.3410547338464853, "learning_rate": 4.795598770655407e-07, "loss": 0.0077, "step": 8788 }, { "epoch": 3.9986351228389445, "grad_norm": 0.2297687628968546, "learning_rate": 4.791390741249691e-07, "loss": 0.002, "step": 8789 }, { "epoch": 3.99909008189263, "grad_norm": 0.1666822755837071, "learning_rate": 4.787184363200273e-07, "loss": 0.002, "step": 8790 }, { "epoch": 3.9995450409463147, "grad_norm": 0.1983329197523591, "learning_rate": 4.782979636850866e-07, "loss": 0.0028, "step": 8791 }, { "epoch": 4.0, "grad_norm": 0.36513865771001214, "learning_rate": 4.778776562545063e-07, "loss": 0.0039, "step": 8792 }, { "epoch": 4.000454959053685, "grad_norm": 0.040024620843632015, "learning_rate": 4.774575140626317e-07, "loss": 0.0002, "step": 8793 }, { "epoch": 4.000909918107371, "grad_norm": 0.06526790543202939, "learning_rate": 4.770375371437952e-07, "loss": 0.0003, "step": 8794 }, { "epoch": 4.0013648771610555, "grad_norm": 0.036962513443885105, "learning_rate": 4.766177255323162e-07, "loss": 0.0002, "step": 8795 }, { "epoch": 4.00181983621474, "grad_norm": 0.11465419831442426, "learning_rate": 4.76198079262499e-07, "loss": 0.0017, "step": 8796 }, { "epoch": 4.002274795268426, "grad_norm": 0.08833549459331642, "learning_rate": 4.7577859836863555e-07, "loss": 0.0015, "step": 8797 }, { "epoch": 4.002729754322111, "grad_norm": 0.218393692672721, "learning_rate": 4.7535928288500314e-07, "loss": 0.0011, "step": 8798 }, { "epoch": 4.003184713375796, "grad_norm": 0.10487619955345344, "learning_rate": 4.749401328458675e-07, "loss": 0.0003, "step": 8799 }, { "epoch": 4.003639672429482, "grad_norm": 0.18611326857816946, "learning_rate": 4.7452114828547835e-07, "loss": 0.0014, "step": 8800 }, { "epoch": 4.0040946314831665, "grad_norm": 0.0838376058512492, "learning_rate": 4.741023292380748e-07, "loss": 0.0005, "step": 8801 }, { "epoch": 4.004549590536851, "grad_norm": 0.08192016843518765, "learning_rate": 4.7368367573787987e-07, "loss": 0.0009, "step": 8802 }, { "epoch": 4.005004549590537, "grad_norm": 0.03214900623414378, "learning_rate": 4.7326518781910443e-07, "loss": 0.0003, "step": 8803 }, { "epoch": 4.005459508644222, "grad_norm": 0.07348787191026138, "learning_rate": 4.7284686551594435e-07, "loss": 0.0007, "step": 8804 }, { "epoch": 4.005914467697907, "grad_norm": 0.0717695047878521, "learning_rate": 4.724287088625845e-07, "loss": 0.0006, "step": 8805 }, { "epoch": 4.006369426751593, "grad_norm": 0.06474269148504012, "learning_rate": 4.720107178931943e-07, "loss": 0.0004, "step": 8806 }, { "epoch": 4.0068243858052774, "grad_norm": 0.21048699110488903, "learning_rate": 4.7159289264192917e-07, "loss": 0.0016, "step": 8807 }, { "epoch": 4.007279344858962, "grad_norm": 0.10880712237163202, "learning_rate": 4.711752331429334e-07, "loss": 0.001, "step": 8808 }, { "epoch": 4.007734303912648, "grad_norm": 0.05961491732063202, "learning_rate": 4.7075773943033474e-07, "loss": 0.0005, "step": 8809 }, { "epoch": 4.008189262966333, "grad_norm": 0.27654292411552106, "learning_rate": 4.7034041153825013e-07, "loss": 0.0018, "step": 8810 }, { "epoch": 4.008644222020018, "grad_norm": 0.032063156142564435, "learning_rate": 4.699232495007816e-07, "loss": 0.0002, "step": 8811 }, { "epoch": 4.0090991810737036, "grad_norm": 0.05329750172586599, "learning_rate": 4.69506253352017e-07, "loss": 0.0004, "step": 8812 }, { "epoch": 4.009554140127388, "grad_norm": 0.0548091223873112, "learning_rate": 4.6908942312603125e-07, "loss": 0.0004, "step": 8813 }, { "epoch": 4.010009099181073, "grad_norm": 0.04956702094631797, "learning_rate": 4.686727588568865e-07, "loss": 0.0005, "step": 8814 }, { "epoch": 4.010464058234759, "grad_norm": 0.04481708808418109, "learning_rate": 4.682562605786309e-07, "loss": 0.0003, "step": 8815 }, { "epoch": 4.010919017288444, "grad_norm": 0.15149044577633144, "learning_rate": 4.678399283252985e-07, "loss": 0.0011, "step": 8816 }, { "epoch": 4.011373976342129, "grad_norm": 0.03921290355935795, "learning_rate": 4.6742376213091e-07, "loss": 0.0003, "step": 8817 }, { "epoch": 4.0118289353958145, "grad_norm": 0.06737412468071217, "learning_rate": 4.670077620294719e-07, "loss": 0.0008, "step": 8818 }, { "epoch": 4.012283894449499, "grad_norm": 0.23919346582140114, "learning_rate": 4.665919280549794e-07, "loss": 0.003, "step": 8819 }, { "epoch": 4.012738853503185, "grad_norm": 0.04144640646785379, "learning_rate": 4.661762602414116e-07, "loss": 0.0003, "step": 8820 }, { "epoch": 4.01319381255687, "grad_norm": 0.048872139435531564, "learning_rate": 4.6576075862273445e-07, "loss": 0.0005, "step": 8821 }, { "epoch": 4.013648771610555, "grad_norm": 0.17014871126779232, "learning_rate": 4.6534542323290244e-07, "loss": 0.0023, "step": 8822 }, { "epoch": 4.014103730664241, "grad_norm": 0.29657182378345626, "learning_rate": 4.649302541058531e-07, "loss": 0.0041, "step": 8823 }, { "epoch": 4.0145586897179255, "grad_norm": 0.2655994543867957, "learning_rate": 4.645152512755141e-07, "loss": 0.0014, "step": 8824 }, { "epoch": 4.01501364877161, "grad_norm": 0.15131787491325718, "learning_rate": 4.641004147757963e-07, "loss": 0.0011, "step": 8825 }, { "epoch": 4.015468607825296, "grad_norm": 0.055298882669540896, "learning_rate": 4.6368574464059875e-07, "loss": 0.0004, "step": 8826 }, { "epoch": 4.015923566878981, "grad_norm": 0.3790482766432709, "learning_rate": 4.632712409038054e-07, "loss": 0.002, "step": 8827 }, { "epoch": 4.016378525932666, "grad_norm": 0.031105563785477717, "learning_rate": 4.6285690359928856e-07, "loss": 0.0003, "step": 8828 }, { "epoch": 4.016833484986352, "grad_norm": 0.016804000989045217, "learning_rate": 4.6244273276090655e-07, "loss": 0.0001, "step": 8829 }, { "epoch": 4.0172884440400365, "grad_norm": 0.01573243740488673, "learning_rate": 4.620287284225028e-07, "loss": 0.0001, "step": 8830 }, { "epoch": 4.017743403093721, "grad_norm": 0.06981557841033016, "learning_rate": 4.616148906179083e-07, "loss": 0.0005, "step": 8831 }, { "epoch": 4.018198362147407, "grad_norm": 0.042478010980220235, "learning_rate": 4.612012193809387e-07, "loss": 0.0003, "step": 8832 }, { "epoch": 4.018653321201092, "grad_norm": 0.018797158214429746, "learning_rate": 4.60787714745399e-07, "loss": 0.0001, "step": 8833 }, { "epoch": 4.019108280254777, "grad_norm": 0.14259138822207248, "learning_rate": 4.6037437674507827e-07, "loss": 0.0011, "step": 8834 }, { "epoch": 4.019563239308463, "grad_norm": 0.05550299353675155, "learning_rate": 4.59961205413752e-07, "loss": 0.0003, "step": 8835 }, { "epoch": 4.0200181983621475, "grad_norm": 0.14887499829866468, "learning_rate": 4.5954820078518397e-07, "loss": 0.0012, "step": 8836 }, { "epoch": 4.020473157415832, "grad_norm": 0.04884650401306347, "learning_rate": 4.591353628931222e-07, "loss": 0.0005, "step": 8837 }, { "epoch": 4.020928116469518, "grad_norm": 0.25048384879018143, "learning_rate": 4.587226917713017e-07, "loss": 0.0027, "step": 8838 }, { "epoch": 4.021383075523203, "grad_norm": 0.039212334613082246, "learning_rate": 4.5831018745344487e-07, "loss": 0.0003, "step": 8839 }, { "epoch": 4.021838034576888, "grad_norm": 0.04061267147946752, "learning_rate": 4.5789784997325946e-07, "loss": 0.0003, "step": 8840 }, { "epoch": 4.022292993630574, "grad_norm": 0.01709059628216983, "learning_rate": 4.5748567936443974e-07, "loss": 0.0001, "step": 8841 }, { "epoch": 4.022747952684258, "grad_norm": 0.08118899739292197, "learning_rate": 4.5707367566066584e-07, "loss": 0.0008, "step": 8842 }, { "epoch": 4.023202911737943, "grad_norm": 0.0743544864321873, "learning_rate": 4.566618388956054e-07, "loss": 0.0005, "step": 8843 }, { "epoch": 4.023657870791629, "grad_norm": 0.036398850400968116, "learning_rate": 4.5625016910291223e-07, "loss": 0.0003, "step": 8844 }, { "epoch": 4.024112829845314, "grad_norm": 0.08397148608667557, "learning_rate": 4.558386663162259e-07, "loss": 0.0006, "step": 8845 }, { "epoch": 4.024567788898999, "grad_norm": 0.06780493365428313, "learning_rate": 4.554273305691725e-07, "loss": 0.0006, "step": 8846 }, { "epoch": 4.0250227479526846, "grad_norm": 0.26470368176656217, "learning_rate": 4.550161618953636e-07, "loss": 0.0026, "step": 8847 }, { "epoch": 4.025477707006369, "grad_norm": 0.020980244258109404, "learning_rate": 4.5460516032839963e-07, "loss": 0.0002, "step": 8848 }, { "epoch": 4.025932666060054, "grad_norm": 0.29539143068006124, "learning_rate": 4.541943259018644e-07, "loss": 0.0019, "step": 8849 }, { "epoch": 4.02638762511374, "grad_norm": 0.07508928232780238, "learning_rate": 4.5378365864933076e-07, "loss": 0.0007, "step": 8850 }, { "epoch": 4.026842584167425, "grad_norm": 0.2476533807911324, "learning_rate": 4.5337315860435574e-07, "loss": 0.0026, "step": 8851 }, { "epoch": 4.02729754322111, "grad_norm": 0.0638381505471031, "learning_rate": 4.529628258004831e-07, "loss": 0.0009, "step": 8852 }, { "epoch": 4.0277525022747955, "grad_norm": 0.079336097080382, "learning_rate": 4.525526602712449e-07, "loss": 0.0007, "step": 8853 }, { "epoch": 4.02820746132848, "grad_norm": 0.03562445415236661, "learning_rate": 4.521426620501568e-07, "loss": 0.0004, "step": 8854 }, { "epoch": 4.028662420382165, "grad_norm": 0.17954392777611333, "learning_rate": 4.5173283117072254e-07, "loss": 0.001, "step": 8855 }, { "epoch": 4.029117379435851, "grad_norm": 0.15316014015544177, "learning_rate": 4.5132316766643064e-07, "loss": 0.0011, "step": 8856 }, { "epoch": 4.029572338489536, "grad_norm": 0.05852256928156285, "learning_rate": 4.5091367157075794e-07, "loss": 0.0003, "step": 8857 }, { "epoch": 4.030027297543221, "grad_norm": 0.03977896696225211, "learning_rate": 4.5050434291716684e-07, "loss": 0.0003, "step": 8858 }, { "epoch": 4.0304822565969065, "grad_norm": 0.020120630730188594, "learning_rate": 4.500951817391055e-07, "loss": 0.0001, "step": 8859 }, { "epoch": 4.030937215650591, "grad_norm": 0.0444802940688181, "learning_rate": 4.496861880700085e-07, "loss": 0.0004, "step": 8860 }, { "epoch": 4.031392174704276, "grad_norm": 0.05548860354782671, "learning_rate": 4.492773619432966e-07, "loss": 0.0001, "step": 8861 }, { "epoch": 4.031847133757962, "grad_norm": 0.03252949988324864, "learning_rate": 4.488687033923783e-07, "loss": 0.0002, "step": 8862 }, { "epoch": 4.032302092811647, "grad_norm": 0.07639466084698715, "learning_rate": 4.48460212450646e-07, "loss": 0.0004, "step": 8863 }, { "epoch": 4.032757051865332, "grad_norm": 0.29051709907604745, "learning_rate": 4.4805188915148095e-07, "loss": 0.0031, "step": 8864 }, { "epoch": 4.0332120109190175, "grad_norm": 0.04625509932775978, "learning_rate": 4.4764373352824934e-07, "loss": 0.0005, "step": 8865 }, { "epoch": 4.033666969972702, "grad_norm": 0.02125138123684615, "learning_rate": 4.4723574561430254e-07, "loss": 0.0002, "step": 8866 }, { "epoch": 4.034121929026387, "grad_norm": 0.2151614609268284, "learning_rate": 4.4682792544298137e-07, "loss": 0.001, "step": 8867 }, { "epoch": 4.034576888080073, "grad_norm": 0.11865602077180509, "learning_rate": 4.4642027304761e-07, "loss": 0.0004, "step": 8868 }, { "epoch": 4.035031847133758, "grad_norm": 0.0999603006711627, "learning_rate": 4.460127884614998e-07, "loss": 0.0006, "step": 8869 }, { "epoch": 4.035486806187443, "grad_norm": 0.026765780436039336, "learning_rate": 4.4560547171794837e-07, "loss": 0.0001, "step": 8870 }, { "epoch": 4.0359417652411285, "grad_norm": 0.020141659322894034, "learning_rate": 4.451983228502402e-07, "loss": 0.0002, "step": 8871 }, { "epoch": 4.036396724294813, "grad_norm": 0.045150664246789485, "learning_rate": 4.447913418916464e-07, "loss": 0.0003, "step": 8872 }, { "epoch": 4.036851683348498, "grad_norm": 0.07179440035400242, "learning_rate": 4.4438452887542255e-07, "loss": 0.0004, "step": 8873 }, { "epoch": 4.037306642402184, "grad_norm": 0.034919282923244685, "learning_rate": 4.4397788383481215e-07, "loss": 0.0002, "step": 8874 }, { "epoch": 4.037761601455869, "grad_norm": 0.016908506254233336, "learning_rate": 4.4357140680304416e-07, "loss": 0.0001, "step": 8875 }, { "epoch": 4.038216560509555, "grad_norm": 0.15985435969364517, "learning_rate": 4.4316509781333365e-07, "loss": 0.0011, "step": 8876 }, { "epoch": 4.038671519563239, "grad_norm": 0.04543023952476501, "learning_rate": 4.4275895689888243e-07, "loss": 0.0004, "step": 8877 }, { "epoch": 4.039126478616924, "grad_norm": 0.037462170910305514, "learning_rate": 4.4235298409287977e-07, "loss": 0.0003, "step": 8878 }, { "epoch": 4.03958143767061, "grad_norm": 0.04996800227890348, "learning_rate": 4.419471794284988e-07, "loss": 0.0005, "step": 8879 }, { "epoch": 4.040036396724295, "grad_norm": 0.0898803813289144, "learning_rate": 4.4154154293890003e-07, "loss": 0.0004, "step": 8880 }, { "epoch": 4.04049135577798, "grad_norm": 0.02036113435400236, "learning_rate": 4.4113607465723017e-07, "loss": 0.0001, "step": 8881 }, { "epoch": 4.0409463148316656, "grad_norm": 0.03644859325700703, "learning_rate": 4.407307746166231e-07, "loss": 0.0003, "step": 8882 }, { "epoch": 4.04140127388535, "grad_norm": 0.045184611320950085, "learning_rate": 4.4032564285019756e-07, "loss": 0.0005, "step": 8883 }, { "epoch": 4.041856232939035, "grad_norm": 0.07965823153482929, "learning_rate": 4.399206793910582e-07, "loss": 0.0005, "step": 8884 }, { "epoch": 4.042311191992721, "grad_norm": 0.13424331155534713, "learning_rate": 4.3951588427229855e-07, "loss": 0.0017, "step": 8885 }, { "epoch": 4.042766151046406, "grad_norm": 0.34583412383295853, "learning_rate": 4.3911125752699513e-07, "loss": 0.0063, "step": 8886 }, { "epoch": 4.043221110100091, "grad_norm": 0.10393870535620883, "learning_rate": 4.387067991882135e-07, "loss": 0.0006, "step": 8887 }, { "epoch": 4.0436760691537765, "grad_norm": 0.02371874045014525, "learning_rate": 4.3830250928900335e-07, "loss": 0.0001, "step": 8888 }, { "epoch": 4.044131028207461, "grad_norm": 0.07354094731243258, "learning_rate": 4.378983878624018e-07, "loss": 0.0004, "step": 8889 }, { "epoch": 4.044585987261146, "grad_norm": 0.3558037430930783, "learning_rate": 4.374944349414309e-07, "loss": 0.0021, "step": 8890 }, { "epoch": 4.045040946314832, "grad_norm": 0.0941867284613963, "learning_rate": 4.3709065055910075e-07, "loss": 0.0007, "step": 8891 }, { "epoch": 4.045495905368517, "grad_norm": 0.0855513473161346, "learning_rate": 4.36687034748407e-07, "loss": 0.0006, "step": 8892 }, { "epoch": 4.045950864422202, "grad_norm": 0.03518914132231523, "learning_rate": 4.3628358754233103e-07, "loss": 0.0003, "step": 8893 }, { "epoch": 4.0464058234758875, "grad_norm": 0.1699256956488362, "learning_rate": 4.3588030897384057e-07, "loss": 0.001, "step": 8894 }, { "epoch": 4.046860782529572, "grad_norm": 0.026256560206128866, "learning_rate": 4.3547719907588937e-07, "loss": 0.0002, "step": 8895 }, { "epoch": 4.047315741583257, "grad_norm": 0.08873912742515633, "learning_rate": 4.350742578814185e-07, "loss": 0.0008, "step": 8896 }, { "epoch": 4.047770700636943, "grad_norm": 0.13779096974363766, "learning_rate": 4.346714854233544e-07, "loss": 0.0024, "step": 8897 }, { "epoch": 4.048225659690628, "grad_norm": 0.09735693025735022, "learning_rate": 4.3426888173460886e-07, "loss": 0.0007, "step": 8898 }, { "epoch": 4.048680618744313, "grad_norm": 0.052153667104861766, "learning_rate": 4.3386644684808214e-07, "loss": 0.0003, "step": 8899 }, { "epoch": 4.0491355777979985, "grad_norm": 0.06581328604418529, "learning_rate": 4.3346418079665803e-07, "loss": 0.0003, "step": 8900 }, { "epoch": 4.049590536851683, "grad_norm": 0.03402697490121898, "learning_rate": 4.3306208361320963e-07, "loss": 0.0001, "step": 8901 }, { "epoch": 4.050045495905368, "grad_norm": 0.1337850560010932, "learning_rate": 4.326601553305934e-07, "loss": 0.0006, "step": 8902 }, { "epoch": 4.050500454959054, "grad_norm": 0.10683039362289583, "learning_rate": 4.3225839598165315e-07, "loss": 0.0011, "step": 8903 }, { "epoch": 4.050955414012739, "grad_norm": 0.31648003866504537, "learning_rate": 4.318568055992184e-07, "loss": 0.0046, "step": 8904 }, { "epoch": 4.051410373066424, "grad_norm": 0.0868167363096441, "learning_rate": 4.3145538421610564e-07, "loss": 0.0007, "step": 8905 }, { "epoch": 4.0518653321201095, "grad_norm": 0.07065431559614077, "learning_rate": 4.3105413186511847e-07, "loss": 0.0005, "step": 8906 }, { "epoch": 4.052320291173794, "grad_norm": 0.02716105503090328, "learning_rate": 4.306530485790439e-07, "loss": 0.0002, "step": 8907 }, { "epoch": 4.052775250227479, "grad_norm": 0.08185751743055214, "learning_rate": 4.302521343906574e-07, "loss": 0.0004, "step": 8908 }, { "epoch": 4.053230209281165, "grad_norm": 0.06183282622887986, "learning_rate": 4.298513893327194e-07, "loss": 0.0005, "step": 8909 }, { "epoch": 4.05368516833485, "grad_norm": 0.07334293101354054, "learning_rate": 4.2945081343797687e-07, "loss": 0.0005, "step": 8910 }, { "epoch": 4.054140127388535, "grad_norm": 0.026060614315279087, "learning_rate": 4.2905040673916376e-07, "loss": 0.0002, "step": 8911 }, { "epoch": 4.05459508644222, "grad_norm": 0.06725555879860226, "learning_rate": 4.2865016926899844e-07, "loss": 0.0002, "step": 8912 }, { "epoch": 4.055050045495905, "grad_norm": 0.31186162999323624, "learning_rate": 4.2825010106018776e-07, "loss": 0.005, "step": 8913 }, { "epoch": 4.05550500454959, "grad_norm": 0.36226860499898594, "learning_rate": 4.2785020214542285e-07, "loss": 0.0014, "step": 8914 }, { "epoch": 4.055959963603276, "grad_norm": 0.032708802775106426, "learning_rate": 4.274504725573811e-07, "loss": 0.0003, "step": 8915 }, { "epoch": 4.056414922656961, "grad_norm": 0.05639675946355039, "learning_rate": 4.270509123287278e-07, "loss": 0.0004, "step": 8916 }, { "epoch": 4.056869881710646, "grad_norm": 0.1878554323081628, "learning_rate": 4.266515214921127e-07, "loss": 0.001, "step": 8917 }, { "epoch": 4.057324840764331, "grad_norm": 0.48269205020218736, "learning_rate": 4.2625230008017185e-07, "loss": 0.0079, "step": 8918 }, { "epoch": 4.057779799818016, "grad_norm": 0.06865777398637297, "learning_rate": 4.258532481255276e-07, "loss": 0.0006, "step": 8919 }, { "epoch": 4.058234758871701, "grad_norm": 0.12471492906313303, "learning_rate": 4.254543656607893e-07, "loss": 0.0005, "step": 8920 }, { "epoch": 4.058689717925387, "grad_norm": 0.022496148724148846, "learning_rate": 4.2505565271855205e-07, "loss": 0.0002, "step": 8921 }, { "epoch": 4.059144676979072, "grad_norm": 0.06571193337150652, "learning_rate": 4.246571093313967e-07, "loss": 0.0005, "step": 8922 }, { "epoch": 4.059599636032757, "grad_norm": 0.08418412916523271, "learning_rate": 4.2425873553189003e-07, "loss": 0.0007, "step": 8923 }, { "epoch": 4.060054595086442, "grad_norm": 0.08475471830882202, "learning_rate": 4.238605313525851e-07, "loss": 0.0008, "step": 8924 }, { "epoch": 4.060509554140127, "grad_norm": 0.17949252961377035, "learning_rate": 4.234624968260223e-07, "loss": 0.0024, "step": 8925 }, { "epoch": 4.060964513193812, "grad_norm": 0.05365940286049136, "learning_rate": 4.2306463198472593e-07, "loss": 0.0003, "step": 8926 }, { "epoch": 4.061419472247498, "grad_norm": 0.13250982396776273, "learning_rate": 4.2266693686120933e-07, "loss": 0.0013, "step": 8927 }, { "epoch": 4.061874431301183, "grad_norm": 0.15468607479774765, "learning_rate": 4.222694114879694e-07, "loss": 0.0022, "step": 8928 }, { "epoch": 4.0623293903548685, "grad_norm": 0.09781095480350604, "learning_rate": 4.2187205589748953e-07, "loss": 0.0009, "step": 8929 }, { "epoch": 4.062784349408553, "grad_norm": 0.17892398765064033, "learning_rate": 4.2147487012224127e-07, "loss": 0.0012, "step": 8930 }, { "epoch": 4.063239308462238, "grad_norm": 0.14389889283426635, "learning_rate": 4.2107785419468e-07, "loss": 0.0008, "step": 8931 }, { "epoch": 4.063694267515924, "grad_norm": 0.11841112574703916, "learning_rate": 4.2068100814724814e-07, "loss": 0.0008, "step": 8932 }, { "epoch": 4.064149226569609, "grad_norm": 0.08851412807796308, "learning_rate": 4.2028433201237366e-07, "loss": 0.0003, "step": 8933 }, { "epoch": 4.064604185623294, "grad_norm": 0.10398232931916596, "learning_rate": 4.1988782582247146e-07, "loss": 0.0011, "step": 8934 }, { "epoch": 4.0650591446769795, "grad_norm": 0.07613790540884702, "learning_rate": 4.1949148960994335e-07, "loss": 0.0005, "step": 8935 }, { "epoch": 4.065514103730664, "grad_norm": 0.12459792657173298, "learning_rate": 4.1909532340717484e-07, "loss": 0.0012, "step": 8936 }, { "epoch": 4.065969062784349, "grad_norm": 0.1275234381376378, "learning_rate": 4.186993272465395e-07, "loss": 0.0017, "step": 8937 }, { "epoch": 4.066424021838035, "grad_norm": 0.028578791073652767, "learning_rate": 4.183035011603953e-07, "loss": 0.0002, "step": 8938 }, { "epoch": 4.06687898089172, "grad_norm": 0.024051987843396343, "learning_rate": 4.179078451810889e-07, "loss": 0.0001, "step": 8939 }, { "epoch": 4.067333939945405, "grad_norm": 0.1771318745268004, "learning_rate": 4.1751235934094994e-07, "loss": 0.0018, "step": 8940 }, { "epoch": 4.0677888989990905, "grad_norm": 0.09602009402475516, "learning_rate": 4.171170436722974e-07, "loss": 0.0013, "step": 8941 }, { "epoch": 4.068243858052775, "grad_norm": 0.1631755044871343, "learning_rate": 4.1672189820743365e-07, "loss": 0.001, "step": 8942 }, { "epoch": 4.06869881710646, "grad_norm": 0.04875861722462633, "learning_rate": 4.1632692297864765e-07, "loss": 0.0003, "step": 8943 }, { "epoch": 4.069153776160146, "grad_norm": 0.16831481056065561, "learning_rate": 4.159321180182166e-07, "loss": 0.0009, "step": 8944 }, { "epoch": 4.069608735213831, "grad_norm": 0.2039503646449555, "learning_rate": 4.155374833584011e-07, "loss": 0.0016, "step": 8945 }, { "epoch": 4.070063694267516, "grad_norm": 0.0833515814361538, "learning_rate": 4.1514301903144926e-07, "loss": 0.0009, "step": 8946 }, { "epoch": 4.070518653321201, "grad_norm": 0.04340237242044923, "learning_rate": 4.147487250695942e-07, "loss": 0.0004, "step": 8947 }, { "epoch": 4.070973612374886, "grad_norm": 0.013523594526505799, "learning_rate": 4.1435460150505675e-07, "loss": 0.0001, "step": 8948 }, { "epoch": 4.071428571428571, "grad_norm": 0.06956648701302995, "learning_rate": 4.139606483700423e-07, "loss": 0.0003, "step": 8949 }, { "epoch": 4.071883530482257, "grad_norm": 0.036469634747657236, "learning_rate": 4.1356686569674344e-07, "loss": 0.0002, "step": 8950 }, { "epoch": 4.072338489535942, "grad_norm": 0.1760636179358739, "learning_rate": 4.1317325351733827e-07, "loss": 0.0016, "step": 8951 }, { "epoch": 4.072793448589627, "grad_norm": 0.02473647519630741, "learning_rate": 4.1277981186399084e-07, "loss": 0.0002, "step": 8952 }, { "epoch": 4.073248407643312, "grad_norm": 0.015154699049856616, "learning_rate": 4.123865407688507e-07, "loss": 0.0001, "step": 8953 }, { "epoch": 4.073703366696997, "grad_norm": 0.03236877948845849, "learning_rate": 4.119934402640549e-07, "loss": 0.0002, "step": 8954 }, { "epoch": 4.074158325750682, "grad_norm": 0.32589199997522483, "learning_rate": 4.1160051038172636e-07, "loss": 0.0039, "step": 8955 }, { "epoch": 4.074613284804368, "grad_norm": 0.053027087224116656, "learning_rate": 4.1120775115397305e-07, "loss": 0.0006, "step": 8956 }, { "epoch": 4.075068243858053, "grad_norm": 0.2226578794508631, "learning_rate": 4.1081516261288953e-07, "loss": 0.0024, "step": 8957 }, { "epoch": 4.075523202911738, "grad_norm": 0.017802269588859197, "learning_rate": 4.104227447905554e-07, "loss": 0.0001, "step": 8958 }, { "epoch": 4.075978161965423, "grad_norm": 0.070404087528106, "learning_rate": 4.100304977190389e-07, "loss": 0.0006, "step": 8959 }, { "epoch": 4.076433121019108, "grad_norm": 0.15290078290318126, "learning_rate": 4.0963842143039194e-07, "loss": 0.0013, "step": 8960 }, { "epoch": 4.076888080072793, "grad_norm": 0.035565515690187524, "learning_rate": 4.092465159566525e-07, "loss": 0.0003, "step": 8961 }, { "epoch": 4.077343039126479, "grad_norm": 0.017133943016673082, "learning_rate": 4.088547813298466e-07, "loss": 0.0001, "step": 8962 }, { "epoch": 4.077797998180164, "grad_norm": 0.047009287521158155, "learning_rate": 4.084632175819836e-07, "loss": 0.0005, "step": 8963 }, { "epoch": 4.078252957233849, "grad_norm": 0.02523465296767164, "learning_rate": 4.080718247450621e-07, "loss": 0.0002, "step": 8964 }, { "epoch": 4.078707916287534, "grad_norm": 0.13462590400532753, "learning_rate": 4.076806028510638e-07, "loss": 0.0007, "step": 8965 }, { "epoch": 4.079162875341219, "grad_norm": 0.03524768490356308, "learning_rate": 4.0728955193195806e-07, "loss": 0.0003, "step": 8966 }, { "epoch": 4.079617834394904, "grad_norm": 0.024865085209006883, "learning_rate": 4.068986720196988e-07, "loss": 0.0002, "step": 8967 }, { "epoch": 4.08007279344859, "grad_norm": 0.1787034331451476, "learning_rate": 4.0650796314622767e-07, "loss": 0.0031, "step": 8968 }, { "epoch": 4.080527752502275, "grad_norm": 0.15321892961518296, "learning_rate": 4.061174253434724e-07, "loss": 0.0012, "step": 8969 }, { "epoch": 4.08098271155596, "grad_norm": 0.07212886393643139, "learning_rate": 4.057270586433451e-07, "loss": 0.0005, "step": 8970 }, { "epoch": 4.081437670609645, "grad_norm": 0.31257608096628187, "learning_rate": 4.0533686307774487e-07, "loss": 0.0026, "step": 8971 }, { "epoch": 4.08189262966333, "grad_norm": 0.019316007121914818, "learning_rate": 4.049468386785563e-07, "loss": 0.0001, "step": 8972 }, { "epoch": 4.082347588717015, "grad_norm": 0.04971425403352878, "learning_rate": 4.0455698547765155e-07, "loss": 0.0004, "step": 8973 }, { "epoch": 4.082802547770701, "grad_norm": 0.06157855241489604, "learning_rate": 4.0416730350688687e-07, "loss": 0.0004, "step": 8974 }, { "epoch": 4.083257506824386, "grad_norm": 0.22400799872050597, "learning_rate": 4.0377779279810485e-07, "loss": 0.0049, "step": 8975 }, { "epoch": 4.083712465878071, "grad_norm": 0.019722236016135063, "learning_rate": 4.033884533831359e-07, "loss": 0.0002, "step": 8976 }, { "epoch": 4.084167424931756, "grad_norm": 0.024323156564684867, "learning_rate": 4.0299928529379364e-07, "loss": 0.0001, "step": 8977 }, { "epoch": 4.084622383985441, "grad_norm": 0.07579298293751197, "learning_rate": 4.0261028856188017e-07, "loss": 0.0005, "step": 8978 }, { "epoch": 4.085077343039126, "grad_norm": 0.035182237351900215, "learning_rate": 4.022214632191826e-07, "loss": 0.0002, "step": 8979 }, { "epoch": 4.085532302092812, "grad_norm": 0.06484450573552261, "learning_rate": 4.018328092974733e-07, "loss": 0.0006, "step": 8980 }, { "epoch": 4.085987261146497, "grad_norm": 0.11956326818317344, "learning_rate": 4.014443268285118e-07, "loss": 0.0013, "step": 8981 }, { "epoch": 4.0864422202001816, "grad_norm": 0.02365170551268841, "learning_rate": 4.0105601584404214e-07, "loss": 0.0001, "step": 8982 }, { "epoch": 4.086897179253867, "grad_norm": 0.0769213661922686, "learning_rate": 4.00667876375796e-07, "loss": 0.0003, "step": 8983 }, { "epoch": 4.087352138307552, "grad_norm": 0.04086595529851256, "learning_rate": 4.0027990845549146e-07, "loss": 0.0002, "step": 8984 }, { "epoch": 4.087807097361237, "grad_norm": 0.044272577997537456, "learning_rate": 3.9989211211483025e-07, "loss": 0.0003, "step": 8985 }, { "epoch": 4.088262056414923, "grad_norm": 0.09563975222641727, "learning_rate": 3.9950448738550166e-07, "loss": 0.0008, "step": 8986 }, { "epoch": 4.088717015468608, "grad_norm": 0.15650550705974892, "learning_rate": 3.991170342991801e-07, "loss": 0.0017, "step": 8987 }, { "epoch": 4.089171974522293, "grad_norm": 0.028594097612181665, "learning_rate": 3.987297528875275e-07, "loss": 0.0002, "step": 8988 }, { "epoch": 4.089626933575978, "grad_norm": 0.14257020771177564, "learning_rate": 3.983426431821899e-07, "loss": 0.0027, "step": 8989 }, { "epoch": 4.090081892629663, "grad_norm": 0.08518786691554084, "learning_rate": 3.9795570521480087e-07, "loss": 0.0005, "step": 8990 }, { "epoch": 4.090536851683349, "grad_norm": 0.023473180487681263, "learning_rate": 3.9756893901697904e-07, "loss": 0.0002, "step": 8991 }, { "epoch": 4.090991810737034, "grad_norm": 0.017153213177267927, "learning_rate": 3.971823446203282e-07, "loss": 0.0001, "step": 8992 }, { "epoch": 4.091446769790719, "grad_norm": 0.11607033969024799, "learning_rate": 3.967959220564405e-07, "loss": 0.0004, "step": 8993 }, { "epoch": 4.091901728844404, "grad_norm": 0.13005382225539688, "learning_rate": 3.964096713568924e-07, "loss": 0.0009, "step": 8994 }, { "epoch": 4.092356687898089, "grad_norm": 0.11371559816419165, "learning_rate": 3.9602359255324574e-07, "loss": 0.001, "step": 8995 }, { "epoch": 4.092811646951774, "grad_norm": 0.031446826455104486, "learning_rate": 3.956376856770494e-07, "loss": 0.0001, "step": 8996 }, { "epoch": 4.09326660600546, "grad_norm": 0.1179360337073261, "learning_rate": 3.952519507598382e-07, "loss": 0.0011, "step": 8997 }, { "epoch": 4.093721565059145, "grad_norm": 0.06078718514274718, "learning_rate": 3.94866387833133e-07, "loss": 0.0002, "step": 8998 }, { "epoch": 4.09417652411283, "grad_norm": 0.04838422974721381, "learning_rate": 3.9448099692843994e-07, "loss": 0.0004, "step": 8999 }, { "epoch": 4.094631483166515, "grad_norm": 0.07744550894509146, "learning_rate": 3.940957780772514e-07, "loss": 0.0004, "step": 9000 }, { "epoch": 4.0950864422202, "grad_norm": 0.05903020337825681, "learning_rate": 3.93710731311045e-07, "loss": 0.0003, "step": 9001 }, { "epoch": 4.095541401273885, "grad_norm": 0.061192420940519854, "learning_rate": 3.933258566612863e-07, "loss": 0.0003, "step": 9002 }, { "epoch": 4.095996360327571, "grad_norm": 0.128103637034612, "learning_rate": 3.929411541594247e-07, "loss": 0.0009, "step": 9003 }, { "epoch": 4.096451319381256, "grad_norm": 0.08173666016097253, "learning_rate": 3.925566238368969e-07, "loss": 0.0004, "step": 9004 }, { "epoch": 4.096906278434941, "grad_norm": 0.08455005415241242, "learning_rate": 3.9217226572512453e-07, "loss": 0.0004, "step": 9005 }, { "epoch": 4.097361237488626, "grad_norm": 0.06196078021991468, "learning_rate": 3.917880798555154e-07, "loss": 0.0003, "step": 9006 }, { "epoch": 4.097816196542311, "grad_norm": 0.19312142642865252, "learning_rate": 3.9140406625946425e-07, "loss": 0.001, "step": 9007 }, { "epoch": 4.098271155595996, "grad_norm": 0.12342674820953474, "learning_rate": 3.910202249683506e-07, "loss": 0.0013, "step": 9008 }, { "epoch": 4.098726114649682, "grad_norm": 0.014929904106779699, "learning_rate": 3.9063655601354e-07, "loss": 0.0001, "step": 9009 }, { "epoch": 4.099181073703367, "grad_norm": 0.13442064431565656, "learning_rate": 3.902530594263837e-07, "loss": 0.0005, "step": 9010 }, { "epoch": 4.099636032757052, "grad_norm": 0.12214056897016685, "learning_rate": 3.898697352382197e-07, "loss": 0.001, "step": 9011 }, { "epoch": 4.100090991810737, "grad_norm": 0.20171543959270663, "learning_rate": 3.8948658348037236e-07, "loss": 0.003, "step": 9012 }, { "epoch": 4.100545950864422, "grad_norm": 0.1107694512372097, "learning_rate": 3.891036041841506e-07, "loss": 0.0004, "step": 9013 }, { "epoch": 4.101000909918107, "grad_norm": 0.09016204821908695, "learning_rate": 3.8872079738084934e-07, "loss": 0.0006, "step": 9014 }, { "epoch": 4.101455868971793, "grad_norm": 0.05225051473477989, "learning_rate": 3.883381631017502e-07, "loss": 0.0003, "step": 9015 }, { "epoch": 4.101910828025478, "grad_norm": 0.011355559561124873, "learning_rate": 3.8795570137811933e-07, "loss": 0.0001, "step": 9016 }, { "epoch": 4.1023657870791626, "grad_norm": 0.17054757578143456, "learning_rate": 3.8757341224121085e-07, "loss": 0.001, "step": 9017 }, { "epoch": 4.102820746132848, "grad_norm": 0.028939896536184978, "learning_rate": 3.8719129572226425e-07, "loss": 0.0002, "step": 9018 }, { "epoch": 4.103275705186533, "grad_norm": 0.02059007194905081, "learning_rate": 3.8680935185250344e-07, "loss": 0.0001, "step": 9019 }, { "epoch": 4.103730664240218, "grad_norm": 0.4135042036229917, "learning_rate": 3.864275806631393e-07, "loss": 0.0045, "step": 9020 }, { "epoch": 4.104185623293904, "grad_norm": 0.11730499101102758, "learning_rate": 3.8604598218536795e-07, "loss": 0.0011, "step": 9021 }, { "epoch": 4.104640582347589, "grad_norm": 0.22637765405295762, "learning_rate": 3.8566455645037275e-07, "loss": 0.001, "step": 9022 }, { "epoch": 4.1050955414012735, "grad_norm": 0.0559151242970765, "learning_rate": 3.852833034893219e-07, "loss": 0.0004, "step": 9023 }, { "epoch": 4.105550500454959, "grad_norm": 0.07905261774964673, "learning_rate": 3.8490222333336906e-07, "loss": 0.0007, "step": 9024 }, { "epoch": 4.106005459508644, "grad_norm": 0.06232094623974693, "learning_rate": 3.845213160136552e-07, "loss": 0.0005, "step": 9025 }, { "epoch": 4.106460418562329, "grad_norm": 0.05595419032540556, "learning_rate": 3.841405815613056e-07, "loss": 0.0003, "step": 9026 }, { "epoch": 4.106915377616015, "grad_norm": 0.05538268110720587, "learning_rate": 3.837600200074329e-07, "loss": 0.0003, "step": 9027 }, { "epoch": 4.1073703366697, "grad_norm": 0.18224407739377968, "learning_rate": 3.833796313831345e-07, "loss": 0.0019, "step": 9028 }, { "epoch": 4.1078252957233845, "grad_norm": 0.22245191667313283, "learning_rate": 3.8299941571949437e-07, "loss": 0.0012, "step": 9029 }, { "epoch": 4.10828025477707, "grad_norm": 0.09678574436259321, "learning_rate": 3.826193730475808e-07, "loss": 0.0009, "step": 9030 }, { "epoch": 4.108735213830755, "grad_norm": 0.05508149295783648, "learning_rate": 3.8223950339845024e-07, "loss": 0.0003, "step": 9031 }, { "epoch": 4.10919017288444, "grad_norm": 0.042459696711720035, "learning_rate": 3.818598068031443e-07, "loss": 0.0003, "step": 9032 }, { "epoch": 4.109645131938126, "grad_norm": 0.015432352456535781, "learning_rate": 3.814802832926895e-07, "loss": 0.0001, "step": 9033 }, { "epoch": 4.110100090991811, "grad_norm": 0.08181849591253876, "learning_rate": 3.811009328980986e-07, "loss": 0.0006, "step": 9034 }, { "epoch": 4.1105550500454955, "grad_norm": 0.07091766149859034, "learning_rate": 3.807217556503703e-07, "loss": 0.0007, "step": 9035 }, { "epoch": 4.111010009099181, "grad_norm": 0.10913642179110923, "learning_rate": 3.8034275158049e-07, "loss": 0.0013, "step": 9036 }, { "epoch": 4.111464968152866, "grad_norm": 0.06824435127034925, "learning_rate": 3.799639207194272e-07, "loss": 0.0002, "step": 9037 }, { "epoch": 4.111919927206552, "grad_norm": 0.12866686490878615, "learning_rate": 3.795852630981392e-07, "loss": 0.0012, "step": 9038 }, { "epoch": 4.112374886260237, "grad_norm": 0.05934049459920543, "learning_rate": 3.792067787475681e-07, "loss": 0.0005, "step": 9039 }, { "epoch": 4.112829845313922, "grad_norm": 0.06613021358648892, "learning_rate": 3.788284676986409e-07, "loss": 0.0005, "step": 9040 }, { "epoch": 4.113284804367607, "grad_norm": 0.054442552544728705, "learning_rate": 3.784503299822728e-07, "loss": 0.0003, "step": 9041 }, { "epoch": 4.113739763421292, "grad_norm": 0.06394303174661847, "learning_rate": 3.780723656293628e-07, "loss": 0.0005, "step": 9042 }, { "epoch": 4.114194722474977, "grad_norm": 0.19755634215134277, "learning_rate": 3.7769457467079664e-07, "loss": 0.0016, "step": 9043 }, { "epoch": 4.114649681528663, "grad_norm": 0.2848574648354931, "learning_rate": 3.7731695713744493e-07, "loss": 0.0079, "step": 9044 }, { "epoch": 4.115104640582348, "grad_norm": 0.012350905192398334, "learning_rate": 3.769395130601655e-07, "loss": 0.0001, "step": 9045 }, { "epoch": 4.115559599636033, "grad_norm": 0.03452230836346412, "learning_rate": 3.7656224246980207e-07, "loss": 0.0002, "step": 9046 }, { "epoch": 4.116014558689718, "grad_norm": 0.017016942182355485, "learning_rate": 3.761851453971829e-07, "loss": 0.0001, "step": 9047 }, { "epoch": 4.116469517743403, "grad_norm": 0.056574931495940485, "learning_rate": 3.7580822187312265e-07, "loss": 0.0003, "step": 9048 }, { "epoch": 4.116924476797088, "grad_norm": 0.04520327657993547, "learning_rate": 3.7543147192842075e-07, "loss": 0.0002, "step": 9049 }, { "epoch": 4.117379435850774, "grad_norm": 0.10587999363095865, "learning_rate": 3.750548955938654e-07, "loss": 0.0008, "step": 9050 }, { "epoch": 4.117834394904459, "grad_norm": 0.10479865918182107, "learning_rate": 3.746784929002273e-07, "loss": 0.0011, "step": 9051 }, { "epoch": 4.1182893539581436, "grad_norm": 0.10996666463641004, "learning_rate": 3.7430226387826534e-07, "loss": 0.0007, "step": 9052 }, { "epoch": 4.118744313011829, "grad_norm": 0.10594766673162277, "learning_rate": 3.739262085587228e-07, "loss": 0.0008, "step": 9053 }, { "epoch": 4.119199272065514, "grad_norm": 0.06942510708692096, "learning_rate": 3.7355032697232926e-07, "loss": 0.0007, "step": 9054 }, { "epoch": 4.119654231119199, "grad_norm": 0.15181528603978042, "learning_rate": 3.731746191497995e-07, "loss": 0.0018, "step": 9055 }, { "epoch": 4.120109190172885, "grad_norm": 0.07371383953514467, "learning_rate": 3.7279908512183576e-07, "loss": 0.0003, "step": 9056 }, { "epoch": 4.12056414922657, "grad_norm": 0.09863613279570581, "learning_rate": 3.7242372491912456e-07, "loss": 0.0006, "step": 9057 }, { "epoch": 4.1210191082802545, "grad_norm": 0.07828118394291272, "learning_rate": 3.720485385723377e-07, "loss": 0.0006, "step": 9058 }, { "epoch": 4.12147406733394, "grad_norm": 0.09819357644145124, "learning_rate": 3.716735261121351e-07, "loss": 0.0012, "step": 9059 }, { "epoch": 4.121929026387625, "grad_norm": 0.07466722586117931, "learning_rate": 3.7129868756916013e-07, "loss": 0.0006, "step": 9060 }, { "epoch": 4.12238398544131, "grad_norm": 0.28513863957973323, "learning_rate": 3.709240229740435e-07, "loss": 0.0025, "step": 9061 }, { "epoch": 4.122838944494996, "grad_norm": 0.13254757211331555, "learning_rate": 3.7054953235740125e-07, "loss": 0.0014, "step": 9062 }, { "epoch": 4.123293903548681, "grad_norm": 0.05648105239087329, "learning_rate": 3.701752157498345e-07, "loss": 0.0004, "step": 9063 }, { "epoch": 4.1237488626023655, "grad_norm": 0.016078607738892668, "learning_rate": 3.698010731819304e-07, "loss": 0.0001, "step": 9064 }, { "epoch": 4.124203821656051, "grad_norm": 0.0797824576826728, "learning_rate": 3.694271046842629e-07, "loss": 0.0005, "step": 9065 }, { "epoch": 4.124658780709736, "grad_norm": 0.14552948430355442, "learning_rate": 3.690533102873911e-07, "loss": 0.0014, "step": 9066 }, { "epoch": 4.125113739763421, "grad_norm": 0.03593296469014492, "learning_rate": 3.686796900218598e-07, "loss": 0.0002, "step": 9067 }, { "epoch": 4.125568698817107, "grad_norm": 0.1775120916958144, "learning_rate": 3.683062439181992e-07, "loss": 0.002, "step": 9068 }, { "epoch": 4.126023657870792, "grad_norm": 0.043957274139579394, "learning_rate": 3.6793297200692494e-07, "loss": 0.0003, "step": 9069 }, { "epoch": 4.1264786169244765, "grad_norm": 0.3739907697926269, "learning_rate": 3.6755987431854046e-07, "loss": 0.0021, "step": 9070 }, { "epoch": 4.126933575978162, "grad_norm": 0.03871682762589365, "learning_rate": 3.6718695088353323e-07, "loss": 0.0001, "step": 9071 }, { "epoch": 4.127388535031847, "grad_norm": 0.20786013035796008, "learning_rate": 3.6681420173237585e-07, "loss": 0.0036, "step": 9072 }, { "epoch": 4.127843494085532, "grad_norm": 0.109511192880789, "learning_rate": 3.6644162689552925e-07, "loss": 0.0008, "step": 9073 }, { "epoch": 4.128298453139218, "grad_norm": 0.133195707977949, "learning_rate": 3.660692264034374e-07, "loss": 0.0008, "step": 9074 }, { "epoch": 4.128753412192903, "grad_norm": 0.17943358908173426, "learning_rate": 3.6569700028653205e-07, "loss": 0.0022, "step": 9075 }, { "epoch": 4.1292083712465875, "grad_norm": 0.028336953877317182, "learning_rate": 3.6532494857522944e-07, "loss": 0.0001, "step": 9076 }, { "epoch": 4.129663330300273, "grad_norm": 0.051917062546592306, "learning_rate": 3.649530712999319e-07, "loss": 0.0004, "step": 9077 }, { "epoch": 4.130118289353958, "grad_norm": 0.023403191931804, "learning_rate": 3.645813684910271e-07, "loss": 0.0002, "step": 9078 }, { "epoch": 4.130573248407643, "grad_norm": 0.08297213864841577, "learning_rate": 3.6420984017888934e-07, "loss": 0.0003, "step": 9079 }, { "epoch": 4.131028207461329, "grad_norm": 0.022123727535895112, "learning_rate": 3.6383848639387876e-07, "loss": 0.0001, "step": 9080 }, { "epoch": 4.131483166515014, "grad_norm": 0.12305459892085657, "learning_rate": 3.6346730716634026e-07, "loss": 0.0006, "step": 9081 }, { "epoch": 4.131938125568698, "grad_norm": 0.03595185619897904, "learning_rate": 3.6309630252660514e-07, "loss": 0.0002, "step": 9082 }, { "epoch": 4.132393084622384, "grad_norm": 0.08834255793275665, "learning_rate": 3.627254725049892e-07, "loss": 0.0008, "step": 9083 }, { "epoch": 4.132848043676069, "grad_norm": 0.2978716999389763, "learning_rate": 3.6235481713179644e-07, "loss": 0.0013, "step": 9084 }, { "epoch": 4.133303002729754, "grad_norm": 0.1612770858997304, "learning_rate": 3.619843364373146e-07, "loss": 0.0011, "step": 9085 }, { "epoch": 4.13375796178344, "grad_norm": 0.0547167431600839, "learning_rate": 3.6161403045181704e-07, "loss": 0.0003, "step": 9086 }, { "epoch": 4.1342129208371245, "grad_norm": 0.06174196439793071, "learning_rate": 3.6124389920556445e-07, "loss": 0.0006, "step": 9087 }, { "epoch": 4.134667879890809, "grad_norm": 0.08518057447707345, "learning_rate": 3.608739427288013e-07, "loss": 0.001, "step": 9088 }, { "epoch": 4.135122838944495, "grad_norm": 0.18230518058250472, "learning_rate": 3.605041610517601e-07, "loss": 0.0019, "step": 9089 }, { "epoch": 4.13557779799818, "grad_norm": 0.03310649714798653, "learning_rate": 3.601345542046569e-07, "loss": 0.0002, "step": 9090 }, { "epoch": 4.136032757051865, "grad_norm": 0.15141611730483012, "learning_rate": 3.597651222176943e-07, "loss": 0.0009, "step": 9091 }, { "epoch": 4.136487716105551, "grad_norm": 0.26431138125160486, "learning_rate": 3.593958651210608e-07, "loss": 0.0016, "step": 9092 }, { "epoch": 4.1369426751592355, "grad_norm": 0.1259636501307728, "learning_rate": 3.590267829449298e-07, "loss": 0.0005, "step": 9093 }, { "epoch": 4.13739763421292, "grad_norm": 0.004783951991210903, "learning_rate": 3.586578757194614e-07, "loss": 0.0, "step": 9094 }, { "epoch": 4.137852593266606, "grad_norm": 0.11282749755227, "learning_rate": 3.5828914347480175e-07, "loss": 0.0012, "step": 9095 }, { "epoch": 4.138307552320291, "grad_norm": 0.04563324340128282, "learning_rate": 3.5792058624108143e-07, "loss": 0.0004, "step": 9096 }, { "epoch": 4.138762511373977, "grad_norm": 0.10856000969935799, "learning_rate": 3.575522040484172e-07, "loss": 0.0015, "step": 9097 }, { "epoch": 4.139217470427662, "grad_norm": 0.033390309222070806, "learning_rate": 3.571839969269114e-07, "loss": 0.0002, "step": 9098 }, { "epoch": 4.1396724294813465, "grad_norm": 0.037315934243376876, "learning_rate": 3.568159649066527e-07, "loss": 0.0001, "step": 9099 }, { "epoch": 4.140127388535032, "grad_norm": 0.051312427266226736, "learning_rate": 3.5644810801771454e-07, "loss": 0.0004, "step": 9100 }, { "epoch": 4.140582347588717, "grad_norm": 0.05938401691027865, "learning_rate": 3.560804262901571e-07, "loss": 0.0005, "step": 9101 }, { "epoch": 4.141037306642402, "grad_norm": 0.03925876859421038, "learning_rate": 3.5571291975402545e-07, "loss": 0.0003, "step": 9102 }, { "epoch": 4.141492265696088, "grad_norm": 0.3035938655328603, "learning_rate": 3.5534558843935e-07, "loss": 0.0023, "step": 9103 }, { "epoch": 4.141947224749773, "grad_norm": 0.04038203366005038, "learning_rate": 3.549784323761485e-07, "loss": 0.0002, "step": 9104 }, { "epoch": 4.1424021838034575, "grad_norm": 0.16591949222146338, "learning_rate": 3.546114515944224e-07, "loss": 0.0012, "step": 9105 }, { "epoch": 4.142857142857143, "grad_norm": 0.11617128484904168, "learning_rate": 3.5424464612416025e-07, "loss": 0.0015, "step": 9106 }, { "epoch": 4.143312101910828, "grad_norm": 0.09132211630031471, "learning_rate": 3.538780159953348e-07, "loss": 0.0003, "step": 9107 }, { "epoch": 4.143767060964513, "grad_norm": 0.12062183349426718, "learning_rate": 3.5351156123790614e-07, "loss": 0.0011, "step": 9108 }, { "epoch": 4.144222020018199, "grad_norm": 0.12355657801793715, "learning_rate": 3.5314528188181984e-07, "loss": 0.0015, "step": 9109 }, { "epoch": 4.144676979071884, "grad_norm": 0.017137431291686482, "learning_rate": 3.527791779570058e-07, "loss": 0.0001, "step": 9110 }, { "epoch": 4.1451319381255685, "grad_norm": 0.0713761099834918, "learning_rate": 3.5241324949338075e-07, "loss": 0.0004, "step": 9111 }, { "epoch": 4.145586897179254, "grad_norm": 0.1710048981717885, "learning_rate": 3.520474965208459e-07, "loss": 0.0021, "step": 9112 }, { "epoch": 4.146041856232939, "grad_norm": 0.09593339308340193, "learning_rate": 3.516819190692902e-07, "loss": 0.0009, "step": 9113 }, { "epoch": 4.146496815286624, "grad_norm": 0.06813030911968398, "learning_rate": 3.513165171685856e-07, "loss": 0.0004, "step": 9114 }, { "epoch": 4.14695177434031, "grad_norm": 0.052148698133053076, "learning_rate": 3.509512908485926e-07, "loss": 0.0002, "step": 9115 }, { "epoch": 4.147406733393995, "grad_norm": 0.028635177352923695, "learning_rate": 3.505862401391552e-07, "loss": 0.0002, "step": 9116 }, { "epoch": 4.147861692447679, "grad_norm": 0.06115969609789502, "learning_rate": 3.5022136507010277e-07, "loss": 0.0005, "step": 9117 }, { "epoch": 4.148316651501365, "grad_norm": 0.01508876898888281, "learning_rate": 3.498566656712529e-07, "loss": 0.0001, "step": 9118 }, { "epoch": 4.14877161055505, "grad_norm": 0.056077719804849636, "learning_rate": 3.4949214197240624e-07, "loss": 0.0004, "step": 9119 }, { "epoch": 4.149226569608735, "grad_norm": 0.09936156883390605, "learning_rate": 3.4912779400334996e-07, "loss": 0.0009, "step": 9120 }, { "epoch": 4.149681528662421, "grad_norm": 0.10511477859508148, "learning_rate": 3.487636217938567e-07, "loss": 0.0011, "step": 9121 }, { "epoch": 4.1501364877161055, "grad_norm": 0.02334420026084266, "learning_rate": 3.4839962537368514e-07, "loss": 0.0001, "step": 9122 }, { "epoch": 4.15059144676979, "grad_norm": 0.028238029902941626, "learning_rate": 3.480358047725804e-07, "loss": 0.0003, "step": 9123 }, { "epoch": 4.151046405823476, "grad_norm": 0.3867407632683006, "learning_rate": 3.476721600202715e-07, "loss": 0.0037, "step": 9124 }, { "epoch": 4.151501364877161, "grad_norm": 0.05328361517416891, "learning_rate": 3.4730869114647404e-07, "loss": 0.0003, "step": 9125 }, { "epoch": 4.151956323930846, "grad_norm": 0.12008439415548303, "learning_rate": 3.4694539818088876e-07, "loss": 0.0008, "step": 9126 }, { "epoch": 4.152411282984532, "grad_norm": 0.024768333699099594, "learning_rate": 3.4658228115320157e-07, "loss": 0.0001, "step": 9127 }, { "epoch": 4.1528662420382165, "grad_norm": 0.10133329619941663, "learning_rate": 3.4621934009308604e-07, "loss": 0.001, "step": 9128 }, { "epoch": 4.153321201091901, "grad_norm": 0.28158245677199967, "learning_rate": 3.458565750301998e-07, "loss": 0.002, "step": 9129 }, { "epoch": 4.153776160145587, "grad_norm": 0.037538057040845556, "learning_rate": 3.4549398599418667e-07, "loss": 0.0002, "step": 9130 }, { "epoch": 4.154231119199272, "grad_norm": 0.1359729532785647, "learning_rate": 3.4513157301467507e-07, "loss": 0.0006, "step": 9131 }, { "epoch": 4.154686078252957, "grad_norm": 0.08444210001408281, "learning_rate": 3.447693361212795e-07, "loss": 0.0006, "step": 9132 }, { "epoch": 4.155141037306643, "grad_norm": 0.5054388553230363, "learning_rate": 3.4440727534360147e-07, "loss": 0.0056, "step": 9133 }, { "epoch": 4.1555959963603275, "grad_norm": 0.36045387053564065, "learning_rate": 3.440453907112262e-07, "loss": 0.0048, "step": 9134 }, { "epoch": 4.156050955414012, "grad_norm": 0.099987397099871, "learning_rate": 3.4368368225372484e-07, "loss": 0.0007, "step": 9135 }, { "epoch": 4.156505914467698, "grad_norm": 0.042749617186405034, "learning_rate": 3.4332215000065587e-07, "loss": 0.0008, "step": 9136 }, { "epoch": 4.156960873521383, "grad_norm": 0.08949831370929621, "learning_rate": 3.4296079398156074e-07, "loss": 0.0006, "step": 9137 }, { "epoch": 4.157415832575068, "grad_norm": 0.46807526625236556, "learning_rate": 3.4259961422596884e-07, "loss": 0.0119, "step": 9138 }, { "epoch": 4.157870791628754, "grad_norm": 0.011366555467120175, "learning_rate": 3.4223861076339375e-07, "loss": 0.0001, "step": 9139 }, { "epoch": 4.1583257506824385, "grad_norm": 0.313273726986827, "learning_rate": 3.4187778362333503e-07, "loss": 0.0022, "step": 9140 }, { "epoch": 4.158780709736123, "grad_norm": 0.11610516403181656, "learning_rate": 3.415171328352773e-07, "loss": 0.0011, "step": 9141 }, { "epoch": 4.159235668789809, "grad_norm": 0.07128810561199539, "learning_rate": 3.411566584286918e-07, "loss": 0.0006, "step": 9142 }, { "epoch": 4.159690627843494, "grad_norm": 0.20208188865223078, "learning_rate": 3.4079636043303555e-07, "loss": 0.0011, "step": 9143 }, { "epoch": 4.160145586897179, "grad_norm": 0.6428640172838184, "learning_rate": 3.404362388777499e-07, "loss": 0.0044, "step": 9144 }, { "epoch": 4.160600545950865, "grad_norm": 0.06538088061022244, "learning_rate": 3.400762937922622e-07, "loss": 0.0004, "step": 9145 }, { "epoch": 4.1610555050045495, "grad_norm": 0.24720144341435019, "learning_rate": 3.397165252059853e-07, "loss": 0.0034, "step": 9146 }, { "epoch": 4.161510464058235, "grad_norm": 0.04916370344218548, "learning_rate": 3.3935693314831847e-07, "loss": 0.0002, "step": 9147 }, { "epoch": 4.16196542311192, "grad_norm": 0.32346336714865437, "learning_rate": 3.3899751764864597e-07, "loss": 0.0049, "step": 9148 }, { "epoch": 4.162420382165605, "grad_norm": 0.09764294935124478, "learning_rate": 3.386382787363365e-07, "loss": 0.0005, "step": 9149 }, { "epoch": 4.162875341219291, "grad_norm": 0.06905133437228733, "learning_rate": 3.38279216440747e-07, "loss": 0.0005, "step": 9150 }, { "epoch": 4.163330300272976, "grad_norm": 0.022181520818230445, "learning_rate": 3.379203307912171e-07, "loss": 0.0001, "step": 9151 }, { "epoch": 4.16378525932666, "grad_norm": 0.06556106697288804, "learning_rate": 3.3756162181707436e-07, "loss": 0.0005, "step": 9152 }, { "epoch": 4.164240218380346, "grad_norm": 0.049180099639596, "learning_rate": 3.3720308954763053e-07, "loss": 0.0002, "step": 9153 }, { "epoch": 4.164695177434031, "grad_norm": 0.036789575035575804, "learning_rate": 3.3684473401218304e-07, "loss": 0.0002, "step": 9154 }, { "epoch": 4.165150136487716, "grad_norm": 0.19681383672086783, "learning_rate": 3.364865552400146e-07, "loss": 0.0013, "step": 9155 }, { "epoch": 4.165605095541402, "grad_norm": 0.04414011294655894, "learning_rate": 3.3612855326039447e-07, "loss": 0.0002, "step": 9156 }, { "epoch": 4.1660600545950865, "grad_norm": 0.060126203043494746, "learning_rate": 3.3577072810257764e-07, "loss": 0.0003, "step": 9157 }, { "epoch": 4.166515013648771, "grad_norm": 0.14144947941203329, "learning_rate": 3.3541307979580356e-07, "loss": 0.0015, "step": 9158 }, { "epoch": 4.166969972702457, "grad_norm": 0.024813387599670157, "learning_rate": 3.3505560836929714e-07, "loss": 0.0001, "step": 9159 }, { "epoch": 4.167424931756142, "grad_norm": 0.01273848934616414, "learning_rate": 3.346983138522697e-07, "loss": 0.0001, "step": 9160 }, { "epoch": 4.167879890809827, "grad_norm": 0.06972640930155019, "learning_rate": 3.343411962739168e-07, "loss": 0.0003, "step": 9161 }, { "epoch": 4.168334849863513, "grad_norm": 0.0876088704565879, "learning_rate": 3.3398425566342236e-07, "loss": 0.0005, "step": 9162 }, { "epoch": 4.1687898089171975, "grad_norm": 0.06178731514224045, "learning_rate": 3.336274920499519e-07, "loss": 0.0003, "step": 9163 }, { "epoch": 4.169244767970882, "grad_norm": 0.2789407700061523, "learning_rate": 3.332709054626604e-07, "loss": 0.0044, "step": 9164 }, { "epoch": 4.169699727024568, "grad_norm": 0.06312858269186289, "learning_rate": 3.329144959306854e-07, "loss": 0.0002, "step": 9165 }, { "epoch": 4.170154686078253, "grad_norm": 0.19957430040727517, "learning_rate": 3.325582634831509e-07, "loss": 0.0037, "step": 9166 }, { "epoch": 4.170609645131938, "grad_norm": 0.06034824986750136, "learning_rate": 3.3220220814916773e-07, "loss": 0.0002, "step": 9167 }, { "epoch": 4.171064604185624, "grad_norm": 0.08347590571685813, "learning_rate": 3.3184632995783007e-07, "loss": 0.0004, "step": 9168 }, { "epoch": 4.1715195632393085, "grad_norm": 0.09925169292326279, "learning_rate": 3.3149062893821945e-07, "loss": 0.0032, "step": 9169 }, { "epoch": 4.171974522292993, "grad_norm": 0.026952863936589362, "learning_rate": 3.311351051194009e-07, "loss": 0.0002, "step": 9170 }, { "epoch": 4.172429481346679, "grad_norm": 0.10978395804899545, "learning_rate": 3.3077975853042704e-07, "loss": 0.0008, "step": 9171 }, { "epoch": 4.172884440400364, "grad_norm": 0.0210559725867683, "learning_rate": 3.3042458920033577e-07, "loss": 0.0001, "step": 9172 }, { "epoch": 4.173339399454049, "grad_norm": 0.08265237783843711, "learning_rate": 3.300695971581494e-07, "loss": 0.0005, "step": 9173 }, { "epoch": 4.173794358507735, "grad_norm": 0.09783442417148934, "learning_rate": 3.297147824328764e-07, "loss": 0.0006, "step": 9174 }, { "epoch": 4.1742493175614195, "grad_norm": 0.04974803208778292, "learning_rate": 3.293601450535097e-07, "loss": 0.0005, "step": 9175 }, { "epoch": 4.174704276615104, "grad_norm": 0.07861802844492698, "learning_rate": 3.2900568504903e-07, "loss": 0.0002, "step": 9176 }, { "epoch": 4.17515923566879, "grad_norm": 0.0717590549393412, "learning_rate": 3.286514024484011e-07, "loss": 0.0003, "step": 9177 }, { "epoch": 4.175614194722475, "grad_norm": 0.3550213892570246, "learning_rate": 3.2829729728057425e-07, "loss": 0.0035, "step": 9178 }, { "epoch": 4.17606915377616, "grad_norm": 0.5642025335349085, "learning_rate": 3.2794336957448517e-07, "loss": 0.0032, "step": 9179 }, { "epoch": 4.176524112829846, "grad_norm": 0.04329518868880122, "learning_rate": 3.2758961935905444e-07, "loss": 0.0003, "step": 9180 }, { "epoch": 4.1769790718835305, "grad_norm": 0.03612915779365306, "learning_rate": 3.272360466631899e-07, "loss": 0.0003, "step": 9181 }, { "epoch": 4.177434030937215, "grad_norm": 0.0658991811405624, "learning_rate": 3.2688265151578357e-07, "loss": 0.0004, "step": 9182 }, { "epoch": 4.177888989990901, "grad_norm": 0.01205597755531618, "learning_rate": 3.2652943394571314e-07, "loss": 0.0001, "step": 9183 }, { "epoch": 4.178343949044586, "grad_norm": 0.032132509979408086, "learning_rate": 3.2617639398184186e-07, "loss": 0.0002, "step": 9184 }, { "epoch": 4.178798908098271, "grad_norm": 0.04162663827670198, "learning_rate": 3.258235316530184e-07, "loss": 0.0003, "step": 9185 }, { "epoch": 4.179253867151957, "grad_norm": 0.1413303793236271, "learning_rate": 3.2547084698807824e-07, "loss": 0.0017, "step": 9186 }, { "epoch": 4.179708826205641, "grad_norm": 0.08931116117243798, "learning_rate": 3.2511834001584005e-07, "loss": 0.0016, "step": 9187 }, { "epoch": 4.180163785259326, "grad_norm": 0.025314031023039744, "learning_rate": 3.247660107651096e-07, "loss": 0.0002, "step": 9188 }, { "epoch": 4.180618744313012, "grad_norm": 0.013270951339039338, "learning_rate": 3.24413859264677e-07, "loss": 0.0001, "step": 9189 }, { "epoch": 4.181073703366697, "grad_norm": 0.3250200799254345, "learning_rate": 3.2406188554331945e-07, "loss": 0.0029, "step": 9190 }, { "epoch": 4.181528662420382, "grad_norm": 0.16607556062589846, "learning_rate": 3.237100896297979e-07, "loss": 0.0015, "step": 9191 }, { "epoch": 4.1819836214740675, "grad_norm": 0.15928801932773565, "learning_rate": 3.233584715528601e-07, "loss": 0.0019, "step": 9192 }, { "epoch": 4.182438580527752, "grad_norm": 0.017585810652606594, "learning_rate": 3.2300703134123814e-07, "loss": 0.0001, "step": 9193 }, { "epoch": 4.182893539581437, "grad_norm": 0.03160480543120011, "learning_rate": 3.2265576902365007e-07, "loss": 0.0002, "step": 9194 }, { "epoch": 4.183348498635123, "grad_norm": 0.11987694166799279, "learning_rate": 3.223046846288003e-07, "loss": 0.001, "step": 9195 }, { "epoch": 4.183803457688808, "grad_norm": 0.026336989535255356, "learning_rate": 3.219537781853774e-07, "loss": 0.0002, "step": 9196 }, { "epoch": 4.184258416742493, "grad_norm": 0.12380896785251697, "learning_rate": 3.216030497220557e-07, "loss": 0.0009, "step": 9197 }, { "epoch": 4.1847133757961785, "grad_norm": 0.08674091238532392, "learning_rate": 3.2125249926749455e-07, "loss": 0.0008, "step": 9198 }, { "epoch": 4.185168334849863, "grad_norm": 0.18096906823771972, "learning_rate": 3.2090212685034067e-07, "loss": 0.0012, "step": 9199 }, { "epoch": 4.185623293903548, "grad_norm": 0.08262409516676053, "learning_rate": 3.205519324992237e-07, "loss": 0.0005, "step": 9200 }, { "epoch": 4.186078252957234, "grad_norm": 0.07606078534499895, "learning_rate": 3.202019162427611e-07, "loss": 0.0004, "step": 9201 }, { "epoch": 4.186533212010919, "grad_norm": 0.011912339737237572, "learning_rate": 3.1985207810955404e-07, "loss": 0.0001, "step": 9202 }, { "epoch": 4.186988171064604, "grad_norm": 0.2205367417428787, "learning_rate": 3.1950241812818944e-07, "loss": 0.0028, "step": 9203 }, { "epoch": 4.1874431301182895, "grad_norm": 0.016757977209050784, "learning_rate": 3.1915293632723996e-07, "loss": 0.0001, "step": 9204 }, { "epoch": 4.187898089171974, "grad_norm": 0.2716495769507683, "learning_rate": 3.188036327352637e-07, "loss": 0.002, "step": 9205 }, { "epoch": 4.188353048225659, "grad_norm": 0.2866966432000893, "learning_rate": 3.1845450738080514e-07, "loss": 0.0038, "step": 9206 }, { "epoch": 4.188808007279345, "grad_norm": 0.04710002235277799, "learning_rate": 3.1810556029239214e-07, "loss": 0.0003, "step": 9207 }, { "epoch": 4.18926296633303, "grad_norm": 0.20799977025694474, "learning_rate": 3.177567914985397e-07, "loss": 0.0009, "step": 9208 }, { "epoch": 4.189717925386716, "grad_norm": 0.22937507997875292, "learning_rate": 3.174082010277468e-07, "loss": 0.0048, "step": 9209 }, { "epoch": 4.1901728844404005, "grad_norm": 0.2143960667331577, "learning_rate": 3.1705978890849946e-07, "loss": 0.002, "step": 9210 }, { "epoch": 4.190627843494085, "grad_norm": 0.08431440054879007, "learning_rate": 3.1671155516926843e-07, "loss": 0.0007, "step": 9211 }, { "epoch": 4.191082802547771, "grad_norm": 0.061529243927600126, "learning_rate": 3.163634998385087e-07, "loss": 0.0004, "step": 9212 }, { "epoch": 4.191537761601456, "grad_norm": 0.14380119295224839, "learning_rate": 3.160156229446631e-07, "loss": 0.0013, "step": 9213 }, { "epoch": 4.191992720655141, "grad_norm": 0.1722546490568956, "learning_rate": 3.156679245161576e-07, "loss": 0.0007, "step": 9214 }, { "epoch": 4.192447679708827, "grad_norm": 0.004186954681447171, "learning_rate": 3.153204045814054e-07, "loss": 0.0, "step": 9215 }, { "epoch": 4.1929026387625115, "grad_norm": 0.11776660086449574, "learning_rate": 3.149730631688039e-07, "loss": 0.0006, "step": 9216 }, { "epoch": 4.193357597816196, "grad_norm": 0.12306067212918406, "learning_rate": 3.1462590030673616e-07, "loss": 0.0004, "step": 9217 }, { "epoch": 4.193812556869882, "grad_norm": 0.01562820226242056, "learning_rate": 3.1427891602357014e-07, "loss": 0.0001, "step": 9218 }, { "epoch": 4.194267515923567, "grad_norm": 0.36358236166017716, "learning_rate": 3.139321103476606e-07, "loss": 0.0056, "step": 9219 }, { "epoch": 4.194722474977252, "grad_norm": 0.02247330129973813, "learning_rate": 3.1358548330734735e-07, "loss": 0.0001, "step": 9220 }, { "epoch": 4.195177434030938, "grad_norm": 0.022073283742156224, "learning_rate": 3.132390349309547e-07, "loss": 0.0001, "step": 9221 }, { "epoch": 4.195632393084622, "grad_norm": 0.035773204734251635, "learning_rate": 3.1289276524679254e-07, "loss": 0.0001, "step": 9222 }, { "epoch": 4.196087352138307, "grad_norm": 0.08004612207998735, "learning_rate": 3.125466742831562e-07, "loss": 0.0005, "step": 9223 }, { "epoch": 4.196542311191993, "grad_norm": 0.1237236258123073, "learning_rate": 3.122007620683279e-07, "loss": 0.0003, "step": 9224 }, { "epoch": 4.196997270245678, "grad_norm": 0.11600345429575866, "learning_rate": 3.1185502863057326e-07, "loss": 0.0011, "step": 9225 }, { "epoch": 4.197452229299363, "grad_norm": 0.16722052341548527, "learning_rate": 3.1150947399814363e-07, "loss": 0.0014, "step": 9226 }, { "epoch": 4.1979071883530485, "grad_norm": 0.014003304999429748, "learning_rate": 3.1116409819927697e-07, "loss": 0.0001, "step": 9227 }, { "epoch": 4.198362147406733, "grad_norm": 0.03410722284209634, "learning_rate": 3.108189012621951e-07, "loss": 0.0002, "step": 9228 }, { "epoch": 4.198817106460418, "grad_norm": 0.11335519157931308, "learning_rate": 3.1047388321510693e-07, "loss": 0.0003, "step": 9229 }, { "epoch": 4.199272065514104, "grad_norm": 0.04341729202826633, "learning_rate": 3.1012904408620536e-07, "loss": 0.0003, "step": 9230 }, { "epoch": 4.199727024567789, "grad_norm": 0.27925907026849245, "learning_rate": 3.097843839036688e-07, "loss": 0.0016, "step": 9231 }, { "epoch": 4.200181983621474, "grad_norm": 0.13379975297118227, "learning_rate": 3.094399026956613e-07, "loss": 0.0016, "step": 9232 }, { "epoch": 4.2006369426751595, "grad_norm": 0.1874245469646866, "learning_rate": 3.0909560049033145e-07, "loss": 0.002, "step": 9233 }, { "epoch": 4.201091901728844, "grad_norm": 0.03276155026760257, "learning_rate": 3.0875147731581625e-07, "loss": 0.0002, "step": 9234 }, { "epoch": 4.201546860782529, "grad_norm": 0.0228627388637466, "learning_rate": 3.084075332002348e-07, "loss": 0.0002, "step": 9235 }, { "epoch": 4.202001819836215, "grad_norm": 0.04946557117158177, "learning_rate": 3.080637681716925e-07, "loss": 0.0002, "step": 9236 }, { "epoch": 4.2024567788899, "grad_norm": 0.10467833115767271, "learning_rate": 3.077201822582804e-07, "loss": 0.0006, "step": 9237 }, { "epoch": 4.202911737943585, "grad_norm": 0.017785172702272856, "learning_rate": 3.0737677548807435e-07, "loss": 0.0001, "step": 9238 }, { "epoch": 4.2033666969972705, "grad_norm": 0.1845421842832455, "learning_rate": 3.0703354788913675e-07, "loss": 0.001, "step": 9239 }, { "epoch": 4.203821656050955, "grad_norm": 0.016432543299652935, "learning_rate": 3.066904994895137e-07, "loss": 0.0001, "step": 9240 }, { "epoch": 4.20427661510464, "grad_norm": 0.18368095725252256, "learning_rate": 3.0634763031723885e-07, "loss": 0.0006, "step": 9241 }, { "epoch": 4.204731574158326, "grad_norm": 0.024333324954560497, "learning_rate": 3.0600494040032904e-07, "loss": 0.0002, "step": 9242 }, { "epoch": 4.205186533212011, "grad_norm": 0.27948737809868623, "learning_rate": 3.056624297667871e-07, "loss": 0.0017, "step": 9243 }, { "epoch": 4.205641492265696, "grad_norm": 0.6954505499083713, "learning_rate": 3.0532009844460227e-07, "loss": 0.0032, "step": 9244 }, { "epoch": 4.2060964513193815, "grad_norm": 0.1633928693555531, "learning_rate": 3.0497794646174803e-07, "loss": 0.0012, "step": 9245 }, { "epoch": 4.206551410373066, "grad_norm": 0.02661587092291997, "learning_rate": 3.046359738461832e-07, "loss": 0.0001, "step": 9246 }, { "epoch": 4.207006369426751, "grad_norm": 0.211358176461674, "learning_rate": 3.0429418062585205e-07, "loss": 0.0011, "step": 9247 }, { "epoch": 4.207461328480437, "grad_norm": 0.09247431909290553, "learning_rate": 3.039525668286847e-07, "loss": 0.0003, "step": 9248 }, { "epoch": 4.207916287534122, "grad_norm": 0.18184787554163565, "learning_rate": 3.036111324825969e-07, "loss": 0.0014, "step": 9249 }, { "epoch": 4.208371246587807, "grad_norm": 0.07369544988888521, "learning_rate": 3.0326987761548825e-07, "loss": 0.0006, "step": 9250 }, { "epoch": 4.2088262056414925, "grad_norm": 0.052102402378107364, "learning_rate": 3.0292880225524514e-07, "loss": 0.0003, "step": 9251 }, { "epoch": 4.209281164695177, "grad_norm": 0.12042139163570018, "learning_rate": 3.0258790642973797e-07, "loss": 0.0007, "step": 9252 }, { "epoch": 4.209736123748862, "grad_norm": 0.7922189938171947, "learning_rate": 3.022471901668239e-07, "loss": 0.0062, "step": 9253 }, { "epoch": 4.210191082802548, "grad_norm": 0.022626568136306997, "learning_rate": 3.019066534943443e-07, "loss": 0.0001, "step": 9254 }, { "epoch": 4.210646041856233, "grad_norm": 0.013809128675604783, "learning_rate": 3.015662964401267e-07, "loss": 0.0001, "step": 9255 }, { "epoch": 4.211101000909918, "grad_norm": 0.11529474364017284, "learning_rate": 3.0122611903198344e-07, "loss": 0.0008, "step": 9256 }, { "epoch": 4.211555959963603, "grad_norm": 0.215145367201575, "learning_rate": 3.0088612129771154e-07, "loss": 0.0013, "step": 9257 }, { "epoch": 4.212010919017288, "grad_norm": 0.07888310555797243, "learning_rate": 3.0054630326509544e-07, "loss": 0.0007, "step": 9258 }, { "epoch": 4.212465878070974, "grad_norm": 0.03403442829869032, "learning_rate": 3.002066649619026e-07, "loss": 0.0001, "step": 9259 }, { "epoch": 4.212920837124659, "grad_norm": 0.044386645413949974, "learning_rate": 2.9986720641588696e-07, "loss": 0.0003, "step": 9260 }, { "epoch": 4.213375796178344, "grad_norm": 0.031655257580426865, "learning_rate": 2.9952792765478715e-07, "loss": 0.0003, "step": 9261 }, { "epoch": 4.2138307552320295, "grad_norm": 0.044086038790614575, "learning_rate": 2.991888287063277e-07, "loss": 0.0003, "step": 9262 }, { "epoch": 4.214285714285714, "grad_norm": 0.15459390490688696, "learning_rate": 2.988499095982189e-07, "loss": 0.0008, "step": 9263 }, { "epoch": 4.214740673339399, "grad_norm": 0.03282124017904193, "learning_rate": 2.98511170358155e-07, "loss": 0.0001, "step": 9264 }, { "epoch": 4.215195632393085, "grad_norm": 0.10703904727675269, "learning_rate": 2.9817261101381667e-07, "loss": 0.0004, "step": 9265 }, { "epoch": 4.21565059144677, "grad_norm": 0.21424640462059696, "learning_rate": 2.9783423159286923e-07, "loss": 0.0007, "step": 9266 }, { "epoch": 4.216105550500455, "grad_norm": 0.026658155487895473, "learning_rate": 2.974960321229628e-07, "loss": 0.0001, "step": 9267 }, { "epoch": 4.2165605095541405, "grad_norm": 0.17197779128879664, "learning_rate": 2.971580126317344e-07, "loss": 0.0023, "step": 9268 }, { "epoch": 4.217015468607825, "grad_norm": 0.028060910933664484, "learning_rate": 2.9682017314680566e-07, "loss": 0.0002, "step": 9269 }, { "epoch": 4.21747042766151, "grad_norm": 0.12874260273497468, "learning_rate": 2.96482513695783e-07, "loss": 0.001, "step": 9270 }, { "epoch": 4.217925386715196, "grad_norm": 0.3558520562450385, "learning_rate": 2.961450343062583e-07, "loss": 0.0019, "step": 9271 }, { "epoch": 4.218380345768881, "grad_norm": 0.18801440307510053, "learning_rate": 2.9580773500580804e-07, "loss": 0.0013, "step": 9272 }, { "epoch": 4.218835304822566, "grad_norm": 0.10826878992423056, "learning_rate": 2.9547061582199666e-07, "loss": 0.0009, "step": 9273 }, { "epoch": 4.2192902638762515, "grad_norm": 0.022635660328102918, "learning_rate": 2.9513367678237063e-07, "loss": 0.0001, "step": 9274 }, { "epoch": 4.219745222929936, "grad_norm": 0.12481825536518215, "learning_rate": 2.94796917914463e-07, "loss": 0.0012, "step": 9275 }, { "epoch": 4.220200181983621, "grad_norm": 0.05233295988947224, "learning_rate": 2.9446033924579315e-07, "loss": 0.0002, "step": 9276 }, { "epoch": 4.220655141037307, "grad_norm": 0.1481103880454747, "learning_rate": 2.9412394080386374e-07, "loss": 0.0015, "step": 9277 }, { "epoch": 4.221110100090992, "grad_norm": 0.07222116429879095, "learning_rate": 2.937877226161648e-07, "loss": 0.0005, "step": 9278 }, { "epoch": 4.221565059144677, "grad_norm": 0.07170444249940258, "learning_rate": 2.934516847101701e-07, "loss": 0.0005, "step": 9279 }, { "epoch": 4.2220200181983625, "grad_norm": 0.08147042990668123, "learning_rate": 2.9311582711333885e-07, "loss": 0.0006, "step": 9280 }, { "epoch": 4.222474977252047, "grad_norm": 0.116132594304015, "learning_rate": 2.927801498531155e-07, "loss": 0.0008, "step": 9281 }, { "epoch": 4.222929936305732, "grad_norm": 0.0841070819726987, "learning_rate": 2.924446529569308e-07, "loss": 0.0005, "step": 9282 }, { "epoch": 4.223384895359418, "grad_norm": 0.38680640108524017, "learning_rate": 2.9210933645220015e-07, "loss": 0.0053, "step": 9283 }, { "epoch": 4.223839854413103, "grad_norm": 0.040296511385621646, "learning_rate": 2.9177420036632376e-07, "loss": 0.0002, "step": 9284 }, { "epoch": 4.224294813466788, "grad_norm": 0.013301901139583591, "learning_rate": 2.9143924472668754e-07, "loss": 0.0001, "step": 9285 }, { "epoch": 4.2247497725204735, "grad_norm": 0.2464679445172045, "learning_rate": 2.9110446956066187e-07, "loss": 0.0028, "step": 9286 }, { "epoch": 4.225204731574158, "grad_norm": 0.17686723627287287, "learning_rate": 2.907698748956042e-07, "loss": 0.0016, "step": 9287 }, { "epoch": 4.225659690627843, "grad_norm": 0.03846137666647302, "learning_rate": 2.9043546075885554e-07, "loss": 0.0002, "step": 9288 }, { "epoch": 4.226114649681529, "grad_norm": 0.11529891164543937, "learning_rate": 2.901012271777423e-07, "loss": 0.0019, "step": 9289 }, { "epoch": 4.226569608735214, "grad_norm": 0.03178336394570907, "learning_rate": 2.897671741795774e-07, "loss": 0.0001, "step": 9290 }, { "epoch": 4.227024567788899, "grad_norm": 0.019824762297945078, "learning_rate": 2.894333017916573e-07, "loss": 0.0001, "step": 9291 }, { "epoch": 4.227479526842584, "grad_norm": 0.11616682672826968, "learning_rate": 2.8909961004126546e-07, "loss": 0.0007, "step": 9292 }, { "epoch": 4.227934485896269, "grad_norm": 0.05055422673118664, "learning_rate": 2.88766098955669e-07, "loss": 0.0003, "step": 9293 }, { "epoch": 4.228389444949954, "grad_norm": 0.06290395482985882, "learning_rate": 2.8843276856212106e-07, "loss": 0.0004, "step": 9294 }, { "epoch": 4.22884440400364, "grad_norm": 0.1937977660910569, "learning_rate": 2.880996188878596e-07, "loss": 0.0008, "step": 9295 }, { "epoch": 4.229299363057325, "grad_norm": 0.15606112375079403, "learning_rate": 2.877666499601084e-07, "loss": 0.0017, "step": 9296 }, { "epoch": 4.22975432211101, "grad_norm": 0.05255369510012786, "learning_rate": 2.874338618060765e-07, "loss": 0.0002, "step": 9297 }, { "epoch": 4.230209281164695, "grad_norm": 0.014082758820452863, "learning_rate": 2.8710125445295777e-07, "loss": 0.0001, "step": 9298 }, { "epoch": 4.23066424021838, "grad_norm": 0.15957973502861053, "learning_rate": 2.8676882792793126e-07, "loss": 0.002, "step": 9299 }, { "epoch": 4.231119199272065, "grad_norm": 0.135569501766141, "learning_rate": 2.864365822581605e-07, "loss": 0.0017, "step": 9300 }, { "epoch": 4.231574158325751, "grad_norm": 0.12922398193205, "learning_rate": 2.861045174707966e-07, "loss": 0.0011, "step": 9301 }, { "epoch": 4.232029117379436, "grad_norm": 0.23582723341256098, "learning_rate": 2.857726335929734e-07, "loss": 0.0033, "step": 9302 }, { "epoch": 4.232484076433121, "grad_norm": 0.033075484372448664, "learning_rate": 2.8544093065181105e-07, "loss": 0.0003, "step": 9303 }, { "epoch": 4.232939035486806, "grad_norm": 0.09847307509877014, "learning_rate": 2.851094086744152e-07, "loss": 0.0008, "step": 9304 }, { "epoch": 4.233393994540491, "grad_norm": 0.020452521474842354, "learning_rate": 2.8477806768787616e-07, "loss": 0.0001, "step": 9305 }, { "epoch": 4.233848953594176, "grad_norm": 0.0929647043788804, "learning_rate": 2.844469077192691e-07, "loss": 0.0009, "step": 9306 }, { "epoch": 4.234303912647862, "grad_norm": 0.005452271597518426, "learning_rate": 2.8411592879565604e-07, "loss": 0.0, "step": 9307 }, { "epoch": 4.234758871701547, "grad_norm": 0.19971737359400776, "learning_rate": 2.8378513094408227e-07, "loss": 0.0014, "step": 9308 }, { "epoch": 4.235213830755232, "grad_norm": 0.08094162748521051, "learning_rate": 2.8345451419157924e-07, "loss": 0.0004, "step": 9309 }, { "epoch": 4.235668789808917, "grad_norm": 0.05479800311295047, "learning_rate": 2.831240785651632e-07, "loss": 0.0002, "step": 9310 }, { "epoch": 4.236123748862602, "grad_norm": 0.1506925790265158, "learning_rate": 2.8279382409183596e-07, "loss": 0.0009, "step": 9311 }, { "epoch": 4.236578707916287, "grad_norm": 0.15302974529617772, "learning_rate": 2.824637507985853e-07, "loss": 0.0014, "step": 9312 }, { "epoch": 4.237033666969973, "grad_norm": 0.3526674472717498, "learning_rate": 2.821338587123823e-07, "loss": 0.0037, "step": 9313 }, { "epoch": 4.237488626023658, "grad_norm": 0.11840937005180575, "learning_rate": 2.818041478601849e-07, "loss": 0.0007, "step": 9314 }, { "epoch": 4.237943585077343, "grad_norm": 0.27137147190401895, "learning_rate": 2.8147461826893456e-07, "loss": 0.0011, "step": 9315 }, { "epoch": 4.238398544131028, "grad_norm": 0.02267441783639729, "learning_rate": 2.8114526996556006e-07, "loss": 0.0001, "step": 9316 }, { "epoch": 4.238853503184713, "grad_norm": 0.2235777400549848, "learning_rate": 2.8081610297697346e-07, "loss": 0.0013, "step": 9317 }, { "epoch": 4.239308462238399, "grad_norm": 0.04258437616519355, "learning_rate": 2.8048711733007356e-07, "loss": 0.0003, "step": 9318 }, { "epoch": 4.239763421292084, "grad_norm": 0.14898500257261904, "learning_rate": 2.8015831305174324e-07, "loss": 0.0015, "step": 9319 }, { "epoch": 4.240218380345769, "grad_norm": 0.05279248722953454, "learning_rate": 2.798296901688505e-07, "loss": 0.0002, "step": 9320 }, { "epoch": 4.2406733393994545, "grad_norm": 0.1011728708448812, "learning_rate": 2.795012487082496e-07, "loss": 0.0005, "step": 9321 }, { "epoch": 4.241128298453139, "grad_norm": 0.11731138509715626, "learning_rate": 2.791729886967792e-07, "loss": 0.0007, "step": 9322 }, { "epoch": 4.241583257506824, "grad_norm": 0.1495255622153591, "learning_rate": 2.788449101612628e-07, "loss": 0.0007, "step": 9323 }, { "epoch": 4.24203821656051, "grad_norm": 0.29486779517043193, "learning_rate": 2.785170131285092e-07, "loss": 0.0033, "step": 9324 }, { "epoch": 4.242493175614195, "grad_norm": 0.23510541728484557, "learning_rate": 2.7818929762531336e-07, "loss": 0.0021, "step": 9325 }, { "epoch": 4.24294813466788, "grad_norm": 0.22147173095525877, "learning_rate": 2.778617636784547e-07, "loss": 0.0029, "step": 9326 }, { "epoch": 4.243403093721565, "grad_norm": 0.038926175800498834, "learning_rate": 2.77534411314698e-07, "loss": 0.0003, "step": 9327 }, { "epoch": 4.24385805277525, "grad_norm": 0.17091823983776422, "learning_rate": 2.7720724056079227e-07, "loss": 0.001, "step": 9328 }, { "epoch": 4.244313011828935, "grad_norm": 0.08871001719827865, "learning_rate": 2.7688025144347266e-07, "loss": 0.0008, "step": 9329 }, { "epoch": 4.244767970882621, "grad_norm": 0.07964393963143425, "learning_rate": 2.765534439894596e-07, "loss": 0.0004, "step": 9330 }, { "epoch": 4.245222929936306, "grad_norm": 0.03195763616678364, "learning_rate": 2.7622681822545765e-07, "loss": 0.0001, "step": 9331 }, { "epoch": 4.245677888989991, "grad_norm": 0.10553879530674058, "learning_rate": 2.7590037417815825e-07, "loss": 0.0009, "step": 9332 }, { "epoch": 4.246132848043676, "grad_norm": 0.1378713837593223, "learning_rate": 2.755741118742361e-07, "loss": 0.0006, "step": 9333 }, { "epoch": 4.246587807097361, "grad_norm": 0.07272634835633027, "learning_rate": 2.752480313403519e-07, "loss": 0.0006, "step": 9334 }, { "epoch": 4.247042766151046, "grad_norm": 0.04309412326418501, "learning_rate": 2.74922132603152e-07, "loss": 0.0002, "step": 9335 }, { "epoch": 4.247497725204732, "grad_norm": 0.01443157385371839, "learning_rate": 2.745964156892672e-07, "loss": 0.0001, "step": 9336 }, { "epoch": 4.247952684258417, "grad_norm": 0.03539673513168654, "learning_rate": 2.7427088062531333e-07, "loss": 0.0001, "step": 9337 }, { "epoch": 4.248407643312102, "grad_norm": 0.06534090269818175, "learning_rate": 2.739455274378913e-07, "loss": 0.0004, "step": 9338 }, { "epoch": 4.248862602365787, "grad_norm": 0.14191446533679508, "learning_rate": 2.73620356153588e-07, "loss": 0.0017, "step": 9339 }, { "epoch": 4.249317561419472, "grad_norm": 0.08319579657127182, "learning_rate": 2.732953667989757e-07, "loss": 0.0006, "step": 9340 }, { "epoch": 4.249772520473157, "grad_norm": 0.10146862712820534, "learning_rate": 2.729705594006099e-07, "loss": 0.0006, "step": 9341 }, { "epoch": 4.250227479526843, "grad_norm": 0.03935637105984595, "learning_rate": 2.726459339850332e-07, "loss": 0.0002, "step": 9342 }, { "epoch": 4.250682438580528, "grad_norm": 0.0884392355906921, "learning_rate": 2.723214905787719e-07, "loss": 0.0008, "step": 9343 }, { "epoch": 4.251137397634213, "grad_norm": 0.012222496536784248, "learning_rate": 2.719972292083378e-07, "loss": 0.0001, "step": 9344 }, { "epoch": 4.251592356687898, "grad_norm": 0.13311118966952826, "learning_rate": 2.716731499002287e-07, "loss": 0.0017, "step": 9345 }, { "epoch": 4.252047315741583, "grad_norm": 0.007229069844665076, "learning_rate": 2.7134925268092724e-07, "loss": 0.0, "step": 9346 }, { "epoch": 4.252502274795268, "grad_norm": 0.02202007584470904, "learning_rate": 2.7102553757690024e-07, "loss": 0.0002, "step": 9347 }, { "epoch": 4.252957233848954, "grad_norm": 0.5369443585031551, "learning_rate": 2.707020046146003e-07, "loss": 0.006, "step": 9348 }, { "epoch": 4.253412192902639, "grad_norm": 0.03449530981895102, "learning_rate": 2.7037865382046475e-07, "loss": 0.0001, "step": 9349 }, { "epoch": 4.253867151956324, "grad_norm": 0.23752835182594068, "learning_rate": 2.7005548522091694e-07, "loss": 0.0013, "step": 9350 }, { "epoch": 4.254322111010009, "grad_norm": 0.17851347434781198, "learning_rate": 2.69732498842365e-07, "loss": 0.0023, "step": 9351 }, { "epoch": 4.254777070063694, "grad_norm": 0.1660478334130149, "learning_rate": 2.694096947112007e-07, "loss": 0.0009, "step": 9352 }, { "epoch": 4.255232029117379, "grad_norm": 0.0594617109558561, "learning_rate": 2.690870728538034e-07, "loss": 0.0003, "step": 9353 }, { "epoch": 4.255686988171065, "grad_norm": 0.12469733318750177, "learning_rate": 2.687646332965352e-07, "loss": 0.0014, "step": 9354 }, { "epoch": 4.25614194722475, "grad_norm": 0.05231495263495677, "learning_rate": 2.684423760657456e-07, "loss": 0.0002, "step": 9355 }, { "epoch": 4.256596906278435, "grad_norm": 0.10984422806722767, "learning_rate": 2.6812030118776754e-07, "loss": 0.0007, "step": 9356 }, { "epoch": 4.25705186533212, "grad_norm": 0.2085559189454362, "learning_rate": 2.677984086889193e-07, "loss": 0.001, "step": 9357 }, { "epoch": 4.257506824385805, "grad_norm": 0.07147216780040695, "learning_rate": 2.674766985955041e-07, "loss": 0.0005, "step": 9358 }, { "epoch": 4.25796178343949, "grad_norm": 0.12686155478128894, "learning_rate": 2.6715517093381075e-07, "loss": 0.0009, "step": 9359 }, { "epoch": 4.258416742493176, "grad_norm": 0.017291194606601903, "learning_rate": 2.6683382573011426e-07, "loss": 0.0001, "step": 9360 }, { "epoch": 4.258871701546861, "grad_norm": 0.12228566220874025, "learning_rate": 2.665126630106726e-07, "loss": 0.0011, "step": 9361 }, { "epoch": 4.2593266606005455, "grad_norm": 0.07939791853710303, "learning_rate": 2.661916828017297e-07, "loss": 0.0004, "step": 9362 }, { "epoch": 4.259781619654231, "grad_norm": 0.10250436245952146, "learning_rate": 2.6587088512951416e-07, "loss": 0.0006, "step": 9363 }, { "epoch": 4.260236578707916, "grad_norm": 0.08875464711024901, "learning_rate": 2.655502700202414e-07, "loss": 0.0007, "step": 9364 }, { "epoch": 4.260691537761602, "grad_norm": 0.062260303922073566, "learning_rate": 2.6522983750010974e-07, "loss": 0.0004, "step": 9365 }, { "epoch": 4.261146496815287, "grad_norm": 0.011247948682215915, "learning_rate": 2.6490958759530285e-07, "loss": 0.0001, "step": 9366 }, { "epoch": 4.261601455868972, "grad_norm": 0.11224606188109205, "learning_rate": 2.645895203319918e-07, "loss": 0.0006, "step": 9367 }, { "epoch": 4.262056414922657, "grad_norm": 0.020697486777823404, "learning_rate": 2.6426963573632947e-07, "loss": 0.0001, "step": 9368 }, { "epoch": 4.262511373976342, "grad_norm": 0.06894523470742818, "learning_rate": 2.6394993383445647e-07, "loss": 0.0005, "step": 9369 }, { "epoch": 4.262966333030027, "grad_norm": 0.03540185395643933, "learning_rate": 2.6363041465249707e-07, "loss": 0.0002, "step": 9370 }, { "epoch": 4.263421292083713, "grad_norm": 0.23872712243918323, "learning_rate": 2.633110782165607e-07, "loss": 0.0018, "step": 9371 }, { "epoch": 4.263876251137398, "grad_norm": 0.1257927141483009, "learning_rate": 2.629919245527418e-07, "loss": 0.0007, "step": 9372 }, { "epoch": 4.264331210191083, "grad_norm": 0.04183447487145962, "learning_rate": 2.6267295368712057e-07, "loss": 0.0003, "step": 9373 }, { "epoch": 4.264786169244768, "grad_norm": 0.03377399288717895, "learning_rate": 2.6235416564576234e-07, "loss": 0.0002, "step": 9374 }, { "epoch": 4.265241128298453, "grad_norm": 0.0425272583411163, "learning_rate": 2.6203556045471674e-07, "loss": 0.0002, "step": 9375 }, { "epoch": 4.265696087352138, "grad_norm": 0.11875493091887589, "learning_rate": 2.6171713814001824e-07, "loss": 0.0011, "step": 9376 }, { "epoch": 4.266151046405824, "grad_norm": 0.09311970079063656, "learning_rate": 2.6139889872768746e-07, "loss": 0.0005, "step": 9377 }, { "epoch": 4.266606005459509, "grad_norm": 0.118487793508368, "learning_rate": 2.6108084224372885e-07, "loss": 0.001, "step": 9378 }, { "epoch": 4.267060964513194, "grad_norm": 0.24692093583794514, "learning_rate": 2.607629687141333e-07, "loss": 0.003, "step": 9379 }, { "epoch": 4.267515923566879, "grad_norm": 0.0764346523732957, "learning_rate": 2.60445278164875e-07, "loss": 0.0002, "step": 9380 }, { "epoch": 4.267970882620564, "grad_norm": 0.19331063485442435, "learning_rate": 2.6012777062191546e-07, "loss": 0.001, "step": 9381 }, { "epoch": 4.268425841674249, "grad_norm": 0.12325223585349397, "learning_rate": 2.598104461111994e-07, "loss": 0.0011, "step": 9382 }, { "epoch": 4.268880800727935, "grad_norm": 0.10602964327099709, "learning_rate": 2.5949330465865676e-07, "loss": 0.0009, "step": 9383 }, { "epoch": 4.26933575978162, "grad_norm": 0.03613389778485683, "learning_rate": 2.5917634629020334e-07, "loss": 0.0003, "step": 9384 }, { "epoch": 4.269790718835305, "grad_norm": 0.5442146937623218, "learning_rate": 2.588595710317396e-07, "loss": 0.0031, "step": 9385 }, { "epoch": 4.27024567788899, "grad_norm": 0.07709390076013042, "learning_rate": 2.5854297890915094e-07, "loss": 0.0005, "step": 9386 }, { "epoch": 4.270700636942675, "grad_norm": 0.11625453899003799, "learning_rate": 2.582265699483069e-07, "loss": 0.0007, "step": 9387 }, { "epoch": 4.27115559599636, "grad_norm": 0.07989567683694666, "learning_rate": 2.579103441750641e-07, "loss": 0.0005, "step": 9388 }, { "epoch": 4.271610555050046, "grad_norm": 0.019284050686561197, "learning_rate": 2.5759430161526324e-07, "loss": 0.0001, "step": 9389 }, { "epoch": 4.272065514103731, "grad_norm": 0.04507095718466332, "learning_rate": 2.5727844229472914e-07, "loss": 0.0002, "step": 9390 }, { "epoch": 4.272520473157416, "grad_norm": 0.09453655874963024, "learning_rate": 2.569627662392729e-07, "loss": 0.0006, "step": 9391 }, { "epoch": 4.272975432211101, "grad_norm": 0.03612784443452458, "learning_rate": 2.566472734746894e-07, "loss": 0.0002, "step": 9392 }, { "epoch": 4.273430391264786, "grad_norm": 0.12729352646550016, "learning_rate": 2.563319640267606e-07, "loss": 0.0013, "step": 9393 }, { "epoch": 4.273885350318471, "grad_norm": 0.006688152173599204, "learning_rate": 2.560168379212505e-07, "loss": 0.0, "step": 9394 }, { "epoch": 4.274340309372157, "grad_norm": 0.1301225859431563, "learning_rate": 2.557018951839113e-07, "loss": 0.0009, "step": 9395 }, { "epoch": 4.274795268425842, "grad_norm": 0.053115407704252326, "learning_rate": 2.553871358404783e-07, "loss": 0.0003, "step": 9396 }, { "epoch": 4.2752502274795265, "grad_norm": 0.06056070731050059, "learning_rate": 2.5507255991667116e-07, "loss": 0.0004, "step": 9397 }, { "epoch": 4.275705186533212, "grad_norm": 0.567353191136651, "learning_rate": 2.5475816743819715e-07, "loss": 0.0024, "step": 9398 }, { "epoch": 4.276160145586897, "grad_norm": 0.14820591778138978, "learning_rate": 2.544439584307459e-07, "loss": 0.0018, "step": 9399 }, { "epoch": 4.276615104640582, "grad_norm": 0.12592498952554254, "learning_rate": 2.5412993291999396e-07, "loss": 0.0005, "step": 9400 }, { "epoch": 4.277070063694268, "grad_norm": 0.30282426855224903, "learning_rate": 2.538160909316009e-07, "loss": 0.0045, "step": 9401 }, { "epoch": 4.277525022747953, "grad_norm": 0.4243794888900528, "learning_rate": 2.5350243249121333e-07, "loss": 0.0011, "step": 9402 }, { "epoch": 4.2779799818016375, "grad_norm": 0.04591370227031355, "learning_rate": 2.5318895762446226e-07, "loss": 0.0003, "step": 9403 }, { "epoch": 4.278434940855323, "grad_norm": 0.03940098864966601, "learning_rate": 2.5287566635696316e-07, "loss": 0.0001, "step": 9404 }, { "epoch": 4.278889899909008, "grad_norm": 0.011391660376101833, "learning_rate": 2.5256255871431654e-07, "loss": 0.0001, "step": 9405 }, { "epoch": 4.279344858962693, "grad_norm": 0.06691150231615632, "learning_rate": 2.522496347221079e-07, "loss": 0.0005, "step": 9406 }, { "epoch": 4.279799818016379, "grad_norm": 0.0281475181483605, "learning_rate": 2.519368944059089e-07, "loss": 0.0001, "step": 9407 }, { "epoch": 4.280254777070064, "grad_norm": 0.026733400648313047, "learning_rate": 2.516243377912742e-07, "loss": 0.0002, "step": 9408 }, { "epoch": 4.2807097361237485, "grad_norm": 0.17880368978245664, "learning_rate": 2.513119649037454e-07, "loss": 0.0006, "step": 9409 }, { "epoch": 4.281164695177434, "grad_norm": 0.062356679966920424, "learning_rate": 2.5099977576884814e-07, "loss": 0.0003, "step": 9410 }, { "epoch": 4.281619654231119, "grad_norm": 0.04415758520466074, "learning_rate": 2.506877704120925e-07, "loss": 0.0004, "step": 9411 }, { "epoch": 4.282074613284804, "grad_norm": 0.0420328830164785, "learning_rate": 2.503759488589741e-07, "loss": 0.0001, "step": 9412 }, { "epoch": 4.28252957233849, "grad_norm": 0.22523743777093166, "learning_rate": 2.5006431113497457e-07, "loss": 0.0011, "step": 9413 }, { "epoch": 4.282984531392175, "grad_norm": 0.15801657845643066, "learning_rate": 2.497528572655586e-07, "loss": 0.0009, "step": 9414 }, { "epoch": 4.2834394904458595, "grad_norm": 0.08230770854636087, "learning_rate": 2.4944158727617687e-07, "loss": 0.0005, "step": 9415 }, { "epoch": 4.283894449499545, "grad_norm": 0.049863448942917644, "learning_rate": 2.4913050119226565e-07, "loss": 0.0003, "step": 9416 }, { "epoch": 4.28434940855323, "grad_norm": 0.0843963793092997, "learning_rate": 2.4881959903924447e-07, "loss": 0.0005, "step": 9417 }, { "epoch": 4.284804367606915, "grad_norm": 0.06244469635415512, "learning_rate": 2.4850888084251986e-07, "loss": 0.0005, "step": 9418 }, { "epoch": 4.285259326660601, "grad_norm": 0.129612455136567, "learning_rate": 2.4819834662748205e-07, "loss": 0.0003, "step": 9419 }, { "epoch": 4.285714285714286, "grad_norm": 0.026361759759546733, "learning_rate": 2.4788799641950605e-07, "loss": 0.0001, "step": 9420 }, { "epoch": 4.2861692447679705, "grad_norm": 0.03082335349168195, "learning_rate": 2.4757783024395244e-07, "loss": 0.0001, "step": 9421 }, { "epoch": 4.286624203821656, "grad_norm": 0.14548209894393918, "learning_rate": 2.4726784812616645e-07, "loss": 0.0009, "step": 9422 }, { "epoch": 4.287079162875341, "grad_norm": 0.14470166180041127, "learning_rate": 2.469580500914789e-07, "loss": 0.0005, "step": 9423 }, { "epoch": 4.287534121929026, "grad_norm": 0.07381829601548508, "learning_rate": 2.4664843616520525e-07, "loss": 0.0004, "step": 9424 }, { "epoch": 4.287989080982712, "grad_norm": 0.18301342078380922, "learning_rate": 2.4633900637264507e-07, "loss": 0.0019, "step": 9425 }, { "epoch": 4.288444040036397, "grad_norm": 0.14523745891451326, "learning_rate": 2.4602976073908354e-07, "loss": 0.0006, "step": 9426 }, { "epoch": 4.288898999090081, "grad_norm": 0.19642698509045584, "learning_rate": 2.4572069928979147e-07, "loss": 0.0024, "step": 9427 }, { "epoch": 4.289353958143767, "grad_norm": 0.16805137632154923, "learning_rate": 2.454118220500237e-07, "loss": 0.0014, "step": 9428 }, { "epoch": 4.289808917197452, "grad_norm": 0.03827208045162855, "learning_rate": 2.451031290450198e-07, "loss": 0.0003, "step": 9429 }, { "epoch": 4.290263876251138, "grad_norm": 0.33441306680605204, "learning_rate": 2.4479462030000566e-07, "loss": 0.0026, "step": 9430 }, { "epoch": 4.290718835304823, "grad_norm": 0.01815658480278786, "learning_rate": 2.4448629584019003e-07, "loss": 0.0001, "step": 9431 }, { "epoch": 4.2911737943585075, "grad_norm": 0.15088583230736027, "learning_rate": 2.441781556907694e-07, "loss": 0.001, "step": 9432 }, { "epoch": 4.291628753412193, "grad_norm": 0.03520651769863461, "learning_rate": 2.4387019987692215e-07, "loss": 0.0002, "step": 9433 }, { "epoch": 4.292083712465878, "grad_norm": 0.01065018869201681, "learning_rate": 2.435624284238139e-07, "loss": 0.0001, "step": 9434 }, { "epoch": 4.292538671519563, "grad_norm": 0.050544592123336646, "learning_rate": 2.4325484135659356e-07, "loss": 0.0004, "step": 9435 }, { "epoch": 4.292993630573249, "grad_norm": 0.20006987456375822, "learning_rate": 2.4294743870039604e-07, "loss": 0.0008, "step": 9436 }, { "epoch": 4.293448589626934, "grad_norm": 0.38981254547696365, "learning_rate": 2.426402204803416e-07, "loss": 0.0023, "step": 9437 }, { "epoch": 4.2939035486806185, "grad_norm": 0.13769200047071847, "learning_rate": 2.423331867215342e-07, "loss": 0.0009, "step": 9438 }, { "epoch": 4.294358507734304, "grad_norm": 0.11125310843875308, "learning_rate": 2.4202633744906336e-07, "loss": 0.0009, "step": 9439 }, { "epoch": 4.294813466787989, "grad_norm": 0.05861335761051751, "learning_rate": 2.417196726880025e-07, "loss": 0.0003, "step": 9440 }, { "epoch": 4.295268425841674, "grad_norm": 0.03421453083984292, "learning_rate": 2.414131924634125e-07, "loss": 0.0002, "step": 9441 }, { "epoch": 4.29572338489536, "grad_norm": 0.0440635716279685, "learning_rate": 2.411068968003366e-07, "loss": 0.0002, "step": 9442 }, { "epoch": 4.296178343949045, "grad_norm": 0.018846633477222076, "learning_rate": 2.408007857238037e-07, "loss": 0.0001, "step": 9443 }, { "epoch": 4.2966333030027295, "grad_norm": 0.1122153938557645, "learning_rate": 2.404948592588283e-07, "loss": 0.0005, "step": 9444 }, { "epoch": 4.297088262056415, "grad_norm": 0.09367133173347858, "learning_rate": 2.4018911743040884e-07, "loss": 0.0007, "step": 9445 }, { "epoch": 4.2975432211101, "grad_norm": 0.18698057033612367, "learning_rate": 2.398835602635302e-07, "loss": 0.0007, "step": 9446 }, { "epoch": 4.297998180163785, "grad_norm": 0.3229632670411761, "learning_rate": 2.3957818778316017e-07, "loss": 0.0071, "step": 9447 }, { "epoch": 4.298453139217471, "grad_norm": 0.024813007730280116, "learning_rate": 2.3927300001425263e-07, "loss": 0.0002, "step": 9448 }, { "epoch": 4.298908098271156, "grad_norm": 0.059576423531889536, "learning_rate": 2.389679969817463e-07, "loss": 0.0002, "step": 9449 }, { "epoch": 4.2993630573248405, "grad_norm": 0.15254592921812385, "learning_rate": 2.3866317871056396e-07, "loss": 0.0011, "step": 9450 }, { "epoch": 4.299818016378526, "grad_norm": 0.07415994829781644, "learning_rate": 2.3835854522561457e-07, "loss": 0.0016, "step": 9451 }, { "epoch": 4.300272975432211, "grad_norm": 0.9051397888658493, "learning_rate": 2.380540965517919e-07, "loss": 0.0143, "step": 9452 }, { "epoch": 4.300727934485896, "grad_norm": 0.08087502812332731, "learning_rate": 2.3774983271397352e-07, "loss": 0.0007, "step": 9453 }, { "epoch": 4.301182893539582, "grad_norm": 0.03845911759475082, "learning_rate": 2.3744575373702256e-07, "loss": 0.0002, "step": 9454 }, { "epoch": 4.301637852593267, "grad_norm": 0.126746651232611, "learning_rate": 2.3714185964578667e-07, "loss": 0.0013, "step": 9455 }, { "epoch": 4.3020928116469515, "grad_norm": 0.06042335415070713, "learning_rate": 2.3683815046509934e-07, "loss": 0.0003, "step": 9456 }, { "epoch": 4.302547770700637, "grad_norm": 0.11694072837314315, "learning_rate": 2.3653462621977764e-07, "loss": 0.0009, "step": 9457 }, { "epoch": 4.303002729754322, "grad_norm": 0.07792457241284886, "learning_rate": 2.3623128693462505e-07, "loss": 0.0003, "step": 9458 }, { "epoch": 4.303457688808007, "grad_norm": 0.18076075194623067, "learning_rate": 2.359281326344287e-07, "loss": 0.0011, "step": 9459 }, { "epoch": 4.303912647861693, "grad_norm": 0.12828068874460818, "learning_rate": 2.3562516334396018e-07, "loss": 0.0006, "step": 9460 }, { "epoch": 4.304367606915378, "grad_norm": 0.15013339176773796, "learning_rate": 2.35322379087978e-07, "loss": 0.0012, "step": 9461 }, { "epoch": 4.304822565969062, "grad_norm": 0.19247727144735044, "learning_rate": 2.3501977989122405e-07, "loss": 0.0009, "step": 9462 }, { "epoch": 4.305277525022748, "grad_norm": 0.3679959835165242, "learning_rate": 2.3471736577842492e-07, "loss": 0.0026, "step": 9463 }, { "epoch": 4.305732484076433, "grad_norm": 0.04376748967968921, "learning_rate": 2.3441513677429223e-07, "loss": 0.0004, "step": 9464 }, { "epoch": 4.306187443130118, "grad_norm": 0.1645934053934895, "learning_rate": 2.3411309290352347e-07, "loss": 0.0022, "step": 9465 }, { "epoch": 4.306642402183804, "grad_norm": 0.07061251101586037, "learning_rate": 2.3381123419080026e-07, "loss": 0.0008, "step": 9466 }, { "epoch": 4.3070973612374885, "grad_norm": 0.022538665223409717, "learning_rate": 2.3350956066078927e-07, "loss": 0.0001, "step": 9467 }, { "epoch": 4.307552320291173, "grad_norm": 0.06914271396504086, "learning_rate": 2.3320807233814135e-07, "loss": 0.0005, "step": 9468 }, { "epoch": 4.308007279344859, "grad_norm": 0.01140986906227095, "learning_rate": 2.3290676924749288e-07, "loss": 0.0001, "step": 9469 }, { "epoch": 4.308462238398544, "grad_norm": 0.07103942992693144, "learning_rate": 2.326056514134653e-07, "loss": 0.0006, "step": 9470 }, { "epoch": 4.308917197452229, "grad_norm": 0.01274661162605015, "learning_rate": 2.323047188606642e-07, "loss": 0.0001, "step": 9471 }, { "epoch": 4.309372156505915, "grad_norm": 0.09782082051773759, "learning_rate": 2.320039716136807e-07, "loss": 0.0007, "step": 9472 }, { "epoch": 4.3098271155595995, "grad_norm": 0.031445298727316544, "learning_rate": 2.3170340969709077e-07, "loss": 0.0003, "step": 9473 }, { "epoch": 4.310282074613285, "grad_norm": 0.08273736213569922, "learning_rate": 2.3140303313545416e-07, "loss": 0.0005, "step": 9474 }, { "epoch": 4.31073703366697, "grad_norm": 0.05167135447756471, "learning_rate": 2.3110284195331733e-07, "loss": 0.0004, "step": 9475 }, { "epoch": 4.311191992720655, "grad_norm": 0.06392119960535814, "learning_rate": 2.3080283617520987e-07, "loss": 0.0002, "step": 9476 }, { "epoch": 4.311646951774341, "grad_norm": 0.15203635035896754, "learning_rate": 2.3050301582564715e-07, "loss": 0.0012, "step": 9477 }, { "epoch": 4.312101910828026, "grad_norm": 0.10447354676038959, "learning_rate": 2.3020338092912848e-07, "loss": 0.0005, "step": 9478 }, { "epoch": 4.3125568698817105, "grad_norm": 0.036108897205970235, "learning_rate": 2.2990393151013923e-07, "loss": 0.0001, "step": 9479 }, { "epoch": 4.313011828935396, "grad_norm": 0.17138562331406, "learning_rate": 2.296046675931496e-07, "loss": 0.0007, "step": 9480 }, { "epoch": 4.313466787989081, "grad_norm": 0.1629762223405468, "learning_rate": 2.2930558920261332e-07, "loss": 0.001, "step": 9481 }, { "epoch": 4.313921747042766, "grad_norm": 0.006622158237539033, "learning_rate": 2.2900669636297002e-07, "loss": 0.0, "step": 9482 }, { "epoch": 4.314376706096452, "grad_norm": 0.01132928566563128, "learning_rate": 2.287079890986438e-07, "loss": 0.0001, "step": 9483 }, { "epoch": 4.314831665150137, "grad_norm": 0.09264308103350315, "learning_rate": 2.2840946743404313e-07, "loss": 0.0011, "step": 9484 }, { "epoch": 4.3152866242038215, "grad_norm": 0.14089205889743972, "learning_rate": 2.2811113139356245e-07, "loss": 0.0011, "step": 9485 }, { "epoch": 4.315741583257507, "grad_norm": 0.11568341492128045, "learning_rate": 2.2781298100158084e-07, "loss": 0.0004, "step": 9486 }, { "epoch": 4.316196542311192, "grad_norm": 0.09685647785178557, "learning_rate": 2.275150162824613e-07, "loss": 0.0008, "step": 9487 }, { "epoch": 4.316651501364877, "grad_norm": 0.1420903861678081, "learning_rate": 2.272172372605519e-07, "loss": 0.0017, "step": 9488 }, { "epoch": 4.317106460418563, "grad_norm": 0.06607882271447882, "learning_rate": 2.269196439601859e-07, "loss": 0.0003, "step": 9489 }, { "epoch": 4.317561419472248, "grad_norm": 0.03677028152782284, "learning_rate": 2.2662223640568192e-07, "loss": 0.0003, "step": 9490 }, { "epoch": 4.3180163785259325, "grad_norm": 0.15328973093240128, "learning_rate": 2.2632501462134222e-07, "loss": 0.0023, "step": 9491 }, { "epoch": 4.318471337579618, "grad_norm": 0.029585624649039712, "learning_rate": 2.2602797863145397e-07, "loss": 0.0001, "step": 9492 }, { "epoch": 4.318926296633303, "grad_norm": 0.05486520047462793, "learning_rate": 2.2573112846029087e-07, "loss": 0.0003, "step": 9493 }, { "epoch": 4.319381255686988, "grad_norm": 0.13015294245910106, "learning_rate": 2.2543446413210879e-07, "loss": 0.0007, "step": 9494 }, { "epoch": 4.319836214740674, "grad_norm": 0.05955708182955708, "learning_rate": 2.251379856711508e-07, "loss": 0.0003, "step": 9495 }, { "epoch": 4.320291173794359, "grad_norm": 0.09486808283604108, "learning_rate": 2.2484169310164366e-07, "loss": 0.0007, "step": 9496 }, { "epoch": 4.320746132848043, "grad_norm": 0.5005252441996373, "learning_rate": 2.2454558644779856e-07, "loss": 0.004, "step": 9497 }, { "epoch": 4.321201091901729, "grad_norm": 0.09995829595935833, "learning_rate": 2.2424966573381195e-07, "loss": 0.0007, "step": 9498 }, { "epoch": 4.321656050955414, "grad_norm": 0.19245368900234966, "learning_rate": 2.2395393098386565e-07, "loss": 0.0005, "step": 9499 }, { "epoch": 4.322111010009099, "grad_norm": 0.04195467654273141, "learning_rate": 2.2365838222212583e-07, "loss": 0.0003, "step": 9500 }, { "epoch": 4.322565969062785, "grad_norm": 0.013025986245795634, "learning_rate": 2.233630194727432e-07, "loss": 0.0001, "step": 9501 }, { "epoch": 4.3230209281164695, "grad_norm": 0.11884652770952413, "learning_rate": 2.2306784275985344e-07, "loss": 0.001, "step": 9502 }, { "epoch": 4.323475887170154, "grad_norm": 0.32516705509200095, "learning_rate": 2.2277285210757644e-07, "loss": 0.0041, "step": 9503 }, { "epoch": 4.32393084622384, "grad_norm": 0.09944679573507235, "learning_rate": 2.2247804754001874e-07, "loss": 0.0006, "step": 9504 }, { "epoch": 4.324385805277525, "grad_norm": 0.19704207627420747, "learning_rate": 2.2218342908126965e-07, "loss": 0.0016, "step": 9505 }, { "epoch": 4.32484076433121, "grad_norm": 0.10211042042992106, "learning_rate": 2.2188899675540388e-07, "loss": 0.0006, "step": 9506 }, { "epoch": 4.325295723384896, "grad_norm": 0.06511366359369153, "learning_rate": 2.2159475058648184e-07, "loss": 0.0008, "step": 9507 }, { "epoch": 4.3257506824385805, "grad_norm": 0.05098946651045803, "learning_rate": 2.213006905985471e-07, "loss": 0.0002, "step": 9508 }, { "epoch": 4.326205641492265, "grad_norm": 0.024831304867606985, "learning_rate": 2.2100681681562985e-07, "loss": 0.0002, "step": 9509 }, { "epoch": 4.326660600545951, "grad_norm": 0.2753907837439578, "learning_rate": 2.207131292617437e-07, "loss": 0.0038, "step": 9510 }, { "epoch": 4.327115559599636, "grad_norm": 0.045599236260259954, "learning_rate": 2.204196279608875e-07, "loss": 0.0002, "step": 9511 }, { "epoch": 4.327570518653321, "grad_norm": 0.177418579491672, "learning_rate": 2.201263129370443e-07, "loss": 0.0015, "step": 9512 }, { "epoch": 4.328025477707007, "grad_norm": 0.0441459047079445, "learning_rate": 2.198331842141832e-07, "loss": 0.0003, "step": 9513 }, { "epoch": 4.3284804367606915, "grad_norm": 0.04219780717511612, "learning_rate": 2.1954024181625732e-07, "loss": 0.0002, "step": 9514 }, { "epoch": 4.328935395814376, "grad_norm": 0.10473295560982808, "learning_rate": 2.1924748576720445e-07, "loss": 0.0002, "step": 9515 }, { "epoch": 4.329390354868062, "grad_norm": 0.039686216530744486, "learning_rate": 2.189549160909474e-07, "loss": 0.0002, "step": 9516 }, { "epoch": 4.329845313921747, "grad_norm": 0.10189849948868127, "learning_rate": 2.186625328113931e-07, "loss": 0.0009, "step": 9517 }, { "epoch": 4.330300272975432, "grad_norm": 0.07418027079674964, "learning_rate": 2.1837033595243446e-07, "loss": 0.0007, "step": 9518 }, { "epoch": 4.330755232029118, "grad_norm": 0.1438971311913872, "learning_rate": 2.1807832553794815e-07, "loss": 0.0013, "step": 9519 }, { "epoch": 4.3312101910828025, "grad_norm": 0.11556336490058125, "learning_rate": 2.177865015917957e-07, "loss": 0.0005, "step": 9520 }, { "epoch": 4.331665150136487, "grad_norm": 0.028230458310154607, "learning_rate": 2.174948641378244e-07, "loss": 0.0001, "step": 9521 }, { "epoch": 4.332120109190173, "grad_norm": 0.1514162484421748, "learning_rate": 2.1720341319986516e-07, "loss": 0.0019, "step": 9522 }, { "epoch": 4.332575068243858, "grad_norm": 0.18553732827770547, "learning_rate": 2.169121488017334e-07, "loss": 0.0017, "step": 9523 }, { "epoch": 4.333030027297543, "grad_norm": 0.25677219516161426, "learning_rate": 2.1662107096723116e-07, "loss": 0.0009, "step": 9524 }, { "epoch": 4.333484986351229, "grad_norm": 0.12042002774573858, "learning_rate": 2.163301797201431e-07, "loss": 0.0012, "step": 9525 }, { "epoch": 4.3339399454049135, "grad_norm": 0.1480183096746346, "learning_rate": 2.1603947508423983e-07, "loss": 0.0012, "step": 9526 }, { "epoch": 4.334394904458598, "grad_norm": 0.3268943694510917, "learning_rate": 2.1574895708327603e-07, "loss": 0.003, "step": 9527 }, { "epoch": 4.334849863512284, "grad_norm": 0.04877869570078236, "learning_rate": 2.1545862574099185e-07, "loss": 0.0002, "step": 9528 }, { "epoch": 4.335304822565969, "grad_norm": 0.08408294047755058, "learning_rate": 2.151684810811122e-07, "loss": 0.0006, "step": 9529 }, { "epoch": 4.335759781619654, "grad_norm": 0.0071065626636185805, "learning_rate": 2.148785231273462e-07, "loss": 0.0, "step": 9530 }, { "epoch": 4.33621474067334, "grad_norm": 0.10921965216004935, "learning_rate": 2.1458875190338792e-07, "loss": 0.0004, "step": 9531 }, { "epoch": 4.336669699727024, "grad_norm": 0.06915411947244138, "learning_rate": 2.1429916743291534e-07, "loss": 0.0004, "step": 9532 }, { "epoch": 4.337124658780709, "grad_norm": 0.07616223786985522, "learning_rate": 2.140097697395932e-07, "loss": 0.0005, "step": 9533 }, { "epoch": 4.337579617834395, "grad_norm": 0.061025317422114755, "learning_rate": 2.1372055884706865e-07, "loss": 0.0003, "step": 9534 }, { "epoch": 4.33803457688808, "grad_norm": 0.2868254707332175, "learning_rate": 2.1343153477897587e-07, "loss": 0.0012, "step": 9535 }, { "epoch": 4.338489535941765, "grad_norm": 0.3425878434646185, "learning_rate": 2.1314269755893209e-07, "loss": 0.0036, "step": 9536 }, { "epoch": 4.3389444949954505, "grad_norm": 0.03830721712629944, "learning_rate": 2.1285404721053893e-07, "loss": 0.0003, "step": 9537 }, { "epoch": 4.339399454049135, "grad_norm": 0.08622961608740579, "learning_rate": 2.1256558375738507e-07, "loss": 0.0005, "step": 9538 }, { "epoch": 4.339854413102821, "grad_norm": 0.5341661676728572, "learning_rate": 2.122773072230419e-07, "loss": 0.0035, "step": 9539 }, { "epoch": 4.340309372156506, "grad_norm": 0.02941645879061211, "learning_rate": 2.1198921763106562e-07, "loss": 0.0002, "step": 9540 }, { "epoch": 4.340764331210191, "grad_norm": 0.06177681283456973, "learning_rate": 2.1170131500499763e-07, "loss": 0.0005, "step": 9541 }, { "epoch": 4.341219290263877, "grad_norm": 0.16725735758112656, "learning_rate": 2.1141359936836414e-07, "loss": 0.0012, "step": 9542 }, { "epoch": 4.3416742493175615, "grad_norm": 0.014587340849318235, "learning_rate": 2.111260707446769e-07, "loss": 0.0001, "step": 9543 }, { "epoch": 4.342129208371246, "grad_norm": 0.2252943257961865, "learning_rate": 2.108387291574304e-07, "loss": 0.0032, "step": 9544 }, { "epoch": 4.342584167424932, "grad_norm": 0.08976979533208138, "learning_rate": 2.1055157463010512e-07, "loss": 0.0005, "step": 9545 }, { "epoch": 4.343039126478617, "grad_norm": 0.008361784512614174, "learning_rate": 2.102646071861661e-07, "loss": 0.0, "step": 9546 }, { "epoch": 4.343494085532302, "grad_norm": 0.06077803680086355, "learning_rate": 2.099778268490632e-07, "loss": 0.0004, "step": 9547 }, { "epoch": 4.343949044585988, "grad_norm": 0.022338582197259288, "learning_rate": 2.0969123364222993e-07, "loss": 0.0001, "step": 9548 }, { "epoch": 4.3444040036396725, "grad_norm": 0.05001286789211653, "learning_rate": 2.0940482758908698e-07, "loss": 0.0003, "step": 9549 }, { "epoch": 4.344858962693357, "grad_norm": 0.5131444003773306, "learning_rate": 2.09118608713037e-07, "loss": 0.002, "step": 9550 }, { "epoch": 4.345313921747043, "grad_norm": 0.1892078724731982, "learning_rate": 2.0883257703746856e-07, "loss": 0.0024, "step": 9551 }, { "epoch": 4.345768880800728, "grad_norm": 0.03147501621577283, "learning_rate": 2.0854673258575542e-07, "loss": 0.0002, "step": 9552 }, { "epoch": 4.346223839854413, "grad_norm": 0.05039301606110866, "learning_rate": 2.082610753812553e-07, "loss": 0.0004, "step": 9553 }, { "epoch": 4.346678798908099, "grad_norm": 0.24842230836914453, "learning_rate": 2.0797560544731061e-07, "loss": 0.0041, "step": 9554 }, { "epoch": 4.3471337579617835, "grad_norm": 0.08406834537351511, "learning_rate": 2.0769032280724827e-07, "loss": 0.0003, "step": 9555 }, { "epoch": 4.347588717015468, "grad_norm": 0.0993052353068596, "learning_rate": 2.074052274843813e-07, "loss": 0.0005, "step": 9556 }, { "epoch": 4.348043676069154, "grad_norm": 0.1164037827827723, "learning_rate": 2.0712031950200523e-07, "loss": 0.0015, "step": 9557 }, { "epoch": 4.348498635122839, "grad_norm": 0.31056024250558956, "learning_rate": 2.0683559888340283e-07, "loss": 0.0033, "step": 9558 }, { "epoch": 4.348953594176524, "grad_norm": 0.0967089675284538, "learning_rate": 2.0655106565183934e-07, "loss": 0.001, "step": 9559 }, { "epoch": 4.34940855323021, "grad_norm": 0.08500826343153452, "learning_rate": 2.0626671983056566e-07, "loss": 0.0006, "step": 9560 }, { "epoch": 4.3498635122838945, "grad_norm": 0.06025744600542242, "learning_rate": 2.0598256144281654e-07, "loss": 0.0004, "step": 9561 }, { "epoch": 4.350318471337579, "grad_norm": 0.05426865839799504, "learning_rate": 2.0569859051181306e-07, "loss": 0.0003, "step": 9562 }, { "epoch": 4.350773430391265, "grad_norm": 0.013782129401784245, "learning_rate": 2.0541480706076033e-07, "loss": 0.0001, "step": 9563 }, { "epoch": 4.35122838944495, "grad_norm": 0.0576129354060292, "learning_rate": 2.05131211112847e-07, "loss": 0.0003, "step": 9564 }, { "epoch": 4.351683348498635, "grad_norm": 0.10663829972158326, "learning_rate": 2.0484780269124787e-07, "loss": 0.0007, "step": 9565 }, { "epoch": 4.352138307552321, "grad_norm": 0.22648914406430004, "learning_rate": 2.0456458181912082e-07, "loss": 0.0013, "step": 9566 }, { "epoch": 4.352593266606005, "grad_norm": 0.12513764468447172, "learning_rate": 2.0428154851961062e-07, "loss": 0.001, "step": 9567 }, { "epoch": 4.35304822565969, "grad_norm": 0.0933400652436273, "learning_rate": 2.0399870281584467e-07, "loss": 0.0003, "step": 9568 }, { "epoch": 4.353503184713376, "grad_norm": 0.08306923408616707, "learning_rate": 2.037160447309358e-07, "loss": 0.0004, "step": 9569 }, { "epoch": 4.353958143767061, "grad_norm": 0.07105014037865892, "learning_rate": 2.0343357428798256e-07, "loss": 0.0004, "step": 9570 }, { "epoch": 4.354413102820746, "grad_norm": 0.25329249372699103, "learning_rate": 2.0315129151006557e-07, "loss": 0.0026, "step": 9571 }, { "epoch": 4.3548680618744315, "grad_norm": 0.07457281465560574, "learning_rate": 2.028691964202531e-07, "loss": 0.0005, "step": 9572 }, { "epoch": 4.355323020928116, "grad_norm": 0.09632454096008193, "learning_rate": 2.0258728904159612e-07, "loss": 0.0003, "step": 9573 }, { "epoch": 4.355777979981801, "grad_norm": 0.03065683879898118, "learning_rate": 2.0230556939713098e-07, "loss": 0.0002, "step": 9574 }, { "epoch": 4.356232939035487, "grad_norm": 0.059689771440563553, "learning_rate": 2.0202403750987809e-07, "loss": 0.0004, "step": 9575 }, { "epoch": 4.356687898089172, "grad_norm": 0.07873397178329462, "learning_rate": 2.0174269340284297e-07, "loss": 0.0004, "step": 9576 }, { "epoch": 4.357142857142857, "grad_norm": 0.015919855995803516, "learning_rate": 2.0146153709901666e-07, "loss": 0.0001, "step": 9577 }, { "epoch": 4.3575978161965425, "grad_norm": 0.31704379657282283, "learning_rate": 2.0118056862137358e-07, "loss": 0.0038, "step": 9578 }, { "epoch": 4.358052775250227, "grad_norm": 0.09654731823616933, "learning_rate": 2.0089978799287286e-07, "loss": 0.0004, "step": 9579 }, { "epoch": 4.358507734303912, "grad_norm": 0.04075762274118051, "learning_rate": 2.0061919523645835e-07, "loss": 0.0002, "step": 9580 }, { "epoch": 4.358962693357598, "grad_norm": 0.18803963305961446, "learning_rate": 2.0033879037506003e-07, "loss": 0.0009, "step": 9581 }, { "epoch": 4.359417652411283, "grad_norm": 0.09427340456548257, "learning_rate": 2.0005857343159046e-07, "loss": 0.0003, "step": 9582 }, { "epoch": 4.359872611464968, "grad_norm": 0.025592053408814232, "learning_rate": 1.997785444289471e-07, "loss": 0.0001, "step": 9583 }, { "epoch": 4.3603275705186535, "grad_norm": 0.12552229500423706, "learning_rate": 1.9949870339001422e-07, "loss": 0.0011, "step": 9584 }, { "epoch": 4.360782529572338, "grad_norm": 0.034152329399888375, "learning_rate": 1.992190503376576e-07, "loss": 0.0002, "step": 9585 }, { "epoch": 4.361237488626024, "grad_norm": 0.06950271470817332, "learning_rate": 1.989395852947304e-07, "loss": 0.0003, "step": 9586 }, { "epoch": 4.361692447679709, "grad_norm": 0.2645217037008491, "learning_rate": 1.9866030828406908e-07, "loss": 0.0009, "step": 9587 }, { "epoch": 4.362147406733394, "grad_norm": 0.06086184850993891, "learning_rate": 1.9838121932849423e-07, "loss": 0.0003, "step": 9588 }, { "epoch": 4.36260236578708, "grad_norm": 0.1403741466113561, "learning_rate": 1.981023184508124e-07, "loss": 0.0012, "step": 9589 }, { "epoch": 4.3630573248407645, "grad_norm": 0.1293572422738656, "learning_rate": 1.978236056738128e-07, "loss": 0.0006, "step": 9590 }, { "epoch": 4.363512283894449, "grad_norm": 0.026380440502853337, "learning_rate": 1.9754508102027253e-07, "loss": 0.0001, "step": 9591 }, { "epoch": 4.363967242948135, "grad_norm": 0.11356858834207062, "learning_rate": 1.972667445129506e-07, "loss": 0.0009, "step": 9592 }, { "epoch": 4.36442220200182, "grad_norm": 0.22923133086118935, "learning_rate": 1.96988596174591e-07, "loss": 0.0019, "step": 9593 }, { "epoch": 4.364877161055505, "grad_norm": 0.02398719613711277, "learning_rate": 1.9671063602792307e-07, "loss": 0.0002, "step": 9594 }, { "epoch": 4.365332120109191, "grad_norm": 0.08575923847309344, "learning_rate": 1.9643286409566004e-07, "loss": 0.0008, "step": 9595 }, { "epoch": 4.3657870791628755, "grad_norm": 0.2950939875389433, "learning_rate": 1.9615528040050098e-07, "loss": 0.0018, "step": 9596 }, { "epoch": 4.36624203821656, "grad_norm": 0.04154731773624997, "learning_rate": 1.95877884965128e-07, "loss": 0.0002, "step": 9597 }, { "epoch": 4.366696997270246, "grad_norm": 0.035176312826577065, "learning_rate": 1.956006778122091e-07, "loss": 0.0001, "step": 9598 }, { "epoch": 4.367151956323931, "grad_norm": 0.21012685857917038, "learning_rate": 1.9532365896439642e-07, "loss": 0.0021, "step": 9599 }, { "epoch": 4.367606915377616, "grad_norm": 0.045987571385079876, "learning_rate": 1.9504682844432603e-07, "loss": 0.0003, "step": 9600 }, { "epoch": 4.368061874431302, "grad_norm": 0.017881353681301157, "learning_rate": 1.9477018627462013e-07, "loss": 0.0001, "step": 9601 }, { "epoch": 4.368516833484986, "grad_norm": 0.17257162325279232, "learning_rate": 1.944937324778845e-07, "loss": 0.0018, "step": 9602 }, { "epoch": 4.368971792538671, "grad_norm": 0.06094567220154974, "learning_rate": 1.9421746707670913e-07, "loss": 0.0002, "step": 9603 }, { "epoch": 4.369426751592357, "grad_norm": 0.1842722193782953, "learning_rate": 1.939413900936693e-07, "loss": 0.0009, "step": 9604 }, { "epoch": 4.369881710646042, "grad_norm": 0.044514486933472146, "learning_rate": 1.93665501551325e-07, "loss": 0.0002, "step": 9605 }, { "epoch": 4.370336669699727, "grad_norm": 0.0880506084781617, "learning_rate": 1.9338980147222103e-07, "loss": 0.0008, "step": 9606 }, { "epoch": 4.3707916287534125, "grad_norm": 0.07080834155538668, "learning_rate": 1.9311428987888597e-07, "loss": 0.0004, "step": 9607 }, { "epoch": 4.371246587807097, "grad_norm": 0.01851558118500817, "learning_rate": 1.9283896679383325e-07, "loss": 0.0001, "step": 9608 }, { "epoch": 4.371701546860782, "grad_norm": 0.07943358206218608, "learning_rate": 1.9256383223956067e-07, "loss": 0.0004, "step": 9609 }, { "epoch": 4.372156505914468, "grad_norm": 0.09552820973317891, "learning_rate": 1.9228888623855192e-07, "loss": 0.0006, "step": 9610 }, { "epoch": 4.372611464968153, "grad_norm": 0.03854015214029533, "learning_rate": 1.9201412881327318e-07, "loss": 0.0002, "step": 9611 }, { "epoch": 4.373066424021838, "grad_norm": 0.09849701927692507, "learning_rate": 1.9173955998617794e-07, "loss": 0.0006, "step": 9612 }, { "epoch": 4.3735213830755235, "grad_norm": 0.05370785140047805, "learning_rate": 1.9146517977970152e-07, "loss": 0.0004, "step": 9613 }, { "epoch": 4.373976342129208, "grad_norm": 0.06317293695029726, "learning_rate": 1.9119098821626492e-07, "loss": 0.0003, "step": 9614 }, { "epoch": 4.374431301182893, "grad_norm": 0.30136122255102027, "learning_rate": 1.909169853182749e-07, "loss": 0.0029, "step": 9615 }, { "epoch": 4.374886260236579, "grad_norm": 0.3369213170037791, "learning_rate": 1.906431711081211e-07, "loss": 0.002, "step": 9616 }, { "epoch": 4.375341219290264, "grad_norm": 0.11141871164372193, "learning_rate": 1.9036954560817804e-07, "loss": 0.0012, "step": 9617 }, { "epoch": 4.375796178343949, "grad_norm": 0.011924702391935412, "learning_rate": 1.9009610884080543e-07, "loss": 0.0001, "step": 9618 }, { "epoch": 4.3762511373976345, "grad_norm": 0.04261046153919239, "learning_rate": 1.8982286082834728e-07, "loss": 0.0003, "step": 9619 }, { "epoch": 4.376706096451319, "grad_norm": 0.21473372713179306, "learning_rate": 1.895498015931327e-07, "loss": 0.002, "step": 9620 }, { "epoch": 4.377161055505004, "grad_norm": 0.06023729363556138, "learning_rate": 1.8927693115747464e-07, "loss": 0.0003, "step": 9621 }, { "epoch": 4.37761601455869, "grad_norm": 0.21808437468682726, "learning_rate": 1.8900424954367031e-07, "loss": 0.0017, "step": 9622 }, { "epoch": 4.378070973612375, "grad_norm": 0.07383249389450061, "learning_rate": 1.8873175677400212e-07, "loss": 0.0006, "step": 9623 }, { "epoch": 4.37852593266606, "grad_norm": 0.17063976838141512, "learning_rate": 1.884594528707376e-07, "loss": 0.0014, "step": 9624 }, { "epoch": 4.3789808917197455, "grad_norm": 0.024206747361200265, "learning_rate": 1.881873378561272e-07, "loss": 0.0001, "step": 9625 }, { "epoch": 4.37943585077343, "grad_norm": 0.25282079077318487, "learning_rate": 1.879154117524079e-07, "loss": 0.0022, "step": 9626 }, { "epoch": 4.379890809827115, "grad_norm": 0.01997023961653687, "learning_rate": 1.876436745818e-07, "loss": 0.0001, "step": 9627 }, { "epoch": 4.380345768880801, "grad_norm": 0.16746542679029403, "learning_rate": 1.8737212636650848e-07, "loss": 0.0014, "step": 9628 }, { "epoch": 4.380800727934486, "grad_norm": 0.42659473388205066, "learning_rate": 1.8710076712872254e-07, "loss": 0.0019, "step": 9629 }, { "epoch": 4.381255686988171, "grad_norm": 0.024725701847353256, "learning_rate": 1.8682959689061753e-07, "loss": 0.0002, "step": 9630 }, { "epoch": 4.3817106460418564, "grad_norm": 0.01820015798119904, "learning_rate": 1.8655861567435152e-07, "loss": 0.0001, "step": 9631 }, { "epoch": 4.382165605095541, "grad_norm": 0.02672888725840468, "learning_rate": 1.8628782350206764e-07, "loss": 0.0001, "step": 9632 }, { "epoch": 4.382620564149226, "grad_norm": 0.2159617987270714, "learning_rate": 1.8601722039589488e-07, "loss": 0.0014, "step": 9633 }, { "epoch": 4.383075523202912, "grad_norm": 0.041479694987933226, "learning_rate": 1.8574680637794413e-07, "loss": 0.0003, "step": 9634 }, { "epoch": 4.383530482256597, "grad_norm": 0.11870984518804817, "learning_rate": 1.8547658147031412e-07, "loss": 0.0007, "step": 9635 }, { "epoch": 4.383985441310282, "grad_norm": 0.08644231972782111, "learning_rate": 1.8520654569508523e-07, "loss": 0.0008, "step": 9636 }, { "epoch": 4.384440400363967, "grad_norm": 0.019906304493907217, "learning_rate": 1.8493669907432426e-07, "loss": 0.0001, "step": 9637 }, { "epoch": 4.384895359417652, "grad_norm": 0.18960745076498864, "learning_rate": 1.8466704163008082e-07, "loss": 0.0015, "step": 9638 }, { "epoch": 4.385350318471337, "grad_norm": 0.04160322000185879, "learning_rate": 1.8439757338439085e-07, "loss": 0.0003, "step": 9639 }, { "epoch": 4.385805277525023, "grad_norm": 0.17078456081801416, "learning_rate": 1.8412829435927453e-07, "loss": 0.0003, "step": 9640 }, { "epoch": 4.386260236578708, "grad_norm": 0.1463543722639989, "learning_rate": 1.838592045767354e-07, "loss": 0.0009, "step": 9641 }, { "epoch": 4.386715195632393, "grad_norm": 0.03586707417977993, "learning_rate": 1.8359030405876276e-07, "loss": 0.0001, "step": 9642 }, { "epoch": 4.387170154686078, "grad_norm": 0.19635609664884993, "learning_rate": 1.833215928273291e-07, "loss": 0.0015, "step": 9643 }, { "epoch": 4.387625113739763, "grad_norm": 0.01621338155425951, "learning_rate": 1.8305307090439318e-07, "loss": 0.0001, "step": 9644 }, { "epoch": 4.388080072793448, "grad_norm": 0.029713020827053783, "learning_rate": 1.8278473831189718e-07, "loss": 0.0002, "step": 9645 }, { "epoch": 4.388535031847134, "grad_norm": 0.015624876916318607, "learning_rate": 1.825165950717675e-07, "loss": 0.0001, "step": 9646 }, { "epoch": 4.388989990900819, "grad_norm": 0.36494507743111965, "learning_rate": 1.8224864120591628e-07, "loss": 0.0044, "step": 9647 }, { "epoch": 4.389444949954504, "grad_norm": 0.07797288523726933, "learning_rate": 1.819808767362391e-07, "loss": 0.0008, "step": 9648 }, { "epoch": 4.389899909008189, "grad_norm": 0.14232589649792296, "learning_rate": 1.8171330168461675e-07, "loss": 0.001, "step": 9649 }, { "epoch": 4.390354868061874, "grad_norm": 0.0395527774013836, "learning_rate": 1.8144591607291427e-07, "loss": 0.0002, "step": 9650 }, { "epoch": 4.39080982711556, "grad_norm": 0.20435162462555628, "learning_rate": 1.8117871992298086e-07, "loss": 0.0013, "step": 9651 }, { "epoch": 4.391264786169245, "grad_norm": 0.0631946753500401, "learning_rate": 1.8091171325665042e-07, "loss": 0.0002, "step": 9652 }, { "epoch": 4.39171974522293, "grad_norm": 0.20445838734975072, "learning_rate": 1.8064489609574186e-07, "loss": 0.0013, "step": 9653 }, { "epoch": 4.3921747042766155, "grad_norm": 0.13003264420066732, "learning_rate": 1.8037826846205864e-07, "loss": 0.001, "step": 9654 }, { "epoch": 4.3926296633303, "grad_norm": 0.20908400848116399, "learning_rate": 1.80111830377388e-07, "loss": 0.0003, "step": 9655 }, { "epoch": 4.393084622383985, "grad_norm": 0.6020050806296195, "learning_rate": 1.7984558186350226e-07, "loss": 0.0103, "step": 9656 }, { "epoch": 4.393539581437671, "grad_norm": 0.026186062031180856, "learning_rate": 1.795795229421571e-07, "loss": 0.0001, "step": 9657 }, { "epoch": 4.393994540491356, "grad_norm": 0.18017502987155215, "learning_rate": 1.7931365363509506e-07, "loss": 0.0018, "step": 9658 }, { "epoch": 4.394449499545041, "grad_norm": 0.18894368680438084, "learning_rate": 1.7904797396404106e-07, "loss": 0.0016, "step": 9659 }, { "epoch": 4.3949044585987265, "grad_norm": 0.057392406096937264, "learning_rate": 1.787824839507049e-07, "loss": 0.0002, "step": 9660 }, { "epoch": 4.395359417652411, "grad_norm": 0.03649282994483993, "learning_rate": 1.7851718361678206e-07, "loss": 0.0002, "step": 9661 }, { "epoch": 4.395814376706096, "grad_norm": 0.0398500274352411, "learning_rate": 1.7825207298395068e-07, "loss": 0.0002, "step": 9662 }, { "epoch": 4.396269335759782, "grad_norm": 0.016049605408660194, "learning_rate": 1.7798715207387545e-07, "loss": 0.0001, "step": 9663 }, { "epoch": 4.396724294813467, "grad_norm": 0.47277367996047936, "learning_rate": 1.7772242090820402e-07, "loss": 0.0052, "step": 9664 }, { "epoch": 4.397179253867152, "grad_norm": 0.18664740463447965, "learning_rate": 1.7745787950856907e-07, "loss": 0.0014, "step": 9665 }, { "epoch": 4.3976342129208374, "grad_norm": 0.06804284430574847, "learning_rate": 1.7719352789658779e-07, "loss": 0.0003, "step": 9666 }, { "epoch": 4.398089171974522, "grad_norm": 0.19692915029984495, "learning_rate": 1.769293660938612e-07, "loss": 0.0017, "step": 9667 }, { "epoch": 4.398544131028207, "grad_norm": 0.12432937576042775, "learning_rate": 1.7666539412197619e-07, "loss": 0.0012, "step": 9668 }, { "epoch": 4.398999090081893, "grad_norm": 0.2140073959245242, "learning_rate": 1.7640161200250305e-07, "loss": 0.0016, "step": 9669 }, { "epoch": 4.399454049135578, "grad_norm": 0.331883979001922, "learning_rate": 1.761380197569973e-07, "loss": 0.001, "step": 9670 }, { "epoch": 4.399909008189263, "grad_norm": 0.01800437240908675, "learning_rate": 1.758746174069978e-07, "loss": 0.0001, "step": 9671 }, { "epoch": 4.400363967242948, "grad_norm": 0.3500459745824907, "learning_rate": 1.7561140497402874e-07, "loss": 0.0027, "step": 9672 }, { "epoch": 4.400818926296633, "grad_norm": 0.016041000468013003, "learning_rate": 1.75348382479599e-07, "loss": 0.0001, "step": 9673 }, { "epoch": 4.401273885350318, "grad_norm": 0.08800223934976996, "learning_rate": 1.7508554994520117e-07, "loss": 0.0007, "step": 9674 }, { "epoch": 4.401728844404004, "grad_norm": 0.10156086301589007, "learning_rate": 1.7482290739231327e-07, "loss": 0.0006, "step": 9675 }, { "epoch": 4.402183803457689, "grad_norm": 0.15862402028471217, "learning_rate": 1.7456045484239708e-07, "loss": 0.0025, "step": 9676 }, { "epoch": 4.402638762511374, "grad_norm": 0.278767221258969, "learning_rate": 1.7429819231689848e-07, "loss": 0.0016, "step": 9677 }, { "epoch": 4.403093721565059, "grad_norm": 0.10248428349177828, "learning_rate": 1.740361198372492e-07, "loss": 0.0005, "step": 9678 }, { "epoch": 4.403548680618744, "grad_norm": 0.08079871660261254, "learning_rate": 1.7377423742486439e-07, "loss": 0.0003, "step": 9679 }, { "epoch": 4.404003639672429, "grad_norm": 0.15191369402548835, "learning_rate": 1.735125451011435e-07, "loss": 0.0006, "step": 9680 }, { "epoch": 4.404458598726115, "grad_norm": 0.03429312517347091, "learning_rate": 1.7325104288747092e-07, "loss": 0.0002, "step": 9681 }, { "epoch": 4.4049135577798, "grad_norm": 0.08829076426526689, "learning_rate": 1.7298973080521532e-07, "loss": 0.0006, "step": 9682 }, { "epoch": 4.405368516833485, "grad_norm": 0.08070351472481445, "learning_rate": 1.72728608875731e-07, "loss": 0.0005, "step": 9683 }, { "epoch": 4.40582347588717, "grad_norm": 0.1648732616127785, "learning_rate": 1.7246767712035457e-07, "loss": 0.0004, "step": 9684 }, { "epoch": 4.406278434940855, "grad_norm": 0.02244542569634849, "learning_rate": 1.7220693556040862e-07, "loss": 0.0001, "step": 9685 }, { "epoch": 4.40673339399454, "grad_norm": 0.05032909019998121, "learning_rate": 1.719463842171995e-07, "loss": 0.0004, "step": 9686 }, { "epoch": 4.407188353048226, "grad_norm": 0.019734517571463515, "learning_rate": 1.7168602311201875e-07, "loss": 0.0001, "step": 9687 }, { "epoch": 4.407643312101911, "grad_norm": 0.12569510021267113, "learning_rate": 1.7142585226614107e-07, "loss": 0.0006, "step": 9688 }, { "epoch": 4.408098271155596, "grad_norm": 0.022815780064671564, "learning_rate": 1.7116587170082743e-07, "loss": 0.0001, "step": 9689 }, { "epoch": 4.408553230209281, "grad_norm": 0.14668602650022639, "learning_rate": 1.70906081437322e-07, "loss": 0.0011, "step": 9690 }, { "epoch": 4.409008189262966, "grad_norm": 0.03401279784075482, "learning_rate": 1.706464814968531e-07, "loss": 0.0002, "step": 9691 }, { "epoch": 4.409463148316651, "grad_norm": 0.14860536331614985, "learning_rate": 1.7038707190063454e-07, "loss": 0.0015, "step": 9692 }, { "epoch": 4.409918107370337, "grad_norm": 0.08136402362013646, "learning_rate": 1.7012785266986438e-07, "loss": 0.0007, "step": 9693 }, { "epoch": 4.410373066424022, "grad_norm": 0.05710102652224282, "learning_rate": 1.698688238257243e-07, "loss": 0.0002, "step": 9694 }, { "epoch": 4.4108280254777075, "grad_norm": 0.11826300227649116, "learning_rate": 1.6960998538938096e-07, "loss": 0.0012, "step": 9695 }, { "epoch": 4.411282984531392, "grad_norm": 0.033745853427528796, "learning_rate": 1.693513373819855e-07, "loss": 0.0002, "step": 9696 }, { "epoch": 4.411737943585077, "grad_norm": 0.02861492461229329, "learning_rate": 1.6909287982467382e-07, "loss": 0.0002, "step": 9697 }, { "epoch": 4.412192902638763, "grad_norm": 0.09739591879671404, "learning_rate": 1.688346127385662e-07, "loss": 0.0007, "step": 9698 }, { "epoch": 4.412647861692448, "grad_norm": 0.283024664411039, "learning_rate": 1.685765361447661e-07, "loss": 0.0034, "step": 9699 }, { "epoch": 4.413102820746133, "grad_norm": 0.190205115766678, "learning_rate": 1.6831865006436327e-07, "loss": 0.0026, "step": 9700 }, { "epoch": 4.4135577797998184, "grad_norm": 0.02465989045772989, "learning_rate": 1.6806095451843e-07, "loss": 0.0001, "step": 9701 }, { "epoch": 4.414012738853503, "grad_norm": 0.04946457314774131, "learning_rate": 1.678034495280245e-07, "loss": 0.0004, "step": 9702 }, { "epoch": 4.414467697907188, "grad_norm": 0.36298608861895276, "learning_rate": 1.6754613511418938e-07, "loss": 0.0034, "step": 9703 }, { "epoch": 4.414922656960874, "grad_norm": 0.09838188537568604, "learning_rate": 1.6728901129795082e-07, "loss": 0.0007, "step": 9704 }, { "epoch": 4.415377616014559, "grad_norm": 0.21454049989910481, "learning_rate": 1.6703207810032012e-07, "loss": 0.0032, "step": 9705 }, { "epoch": 4.415832575068244, "grad_norm": 0.037895518056401095, "learning_rate": 1.6677533554229186e-07, "loss": 0.0003, "step": 9706 }, { "epoch": 4.416287534121929, "grad_norm": 0.07433387158371843, "learning_rate": 1.6651878364484676e-07, "loss": 0.0005, "step": 9707 }, { "epoch": 4.416742493175614, "grad_norm": 0.24138984600485042, "learning_rate": 1.6626242242894858e-07, "loss": 0.0014, "step": 9708 }, { "epoch": 4.417197452229299, "grad_norm": 0.15174790688259354, "learning_rate": 1.6600625191554616e-07, "loss": 0.0009, "step": 9709 }, { "epoch": 4.417652411282985, "grad_norm": 0.0311784917903873, "learning_rate": 1.657502721255727e-07, "loss": 0.0002, "step": 9710 }, { "epoch": 4.41810737033667, "grad_norm": 0.10921905500830395, "learning_rate": 1.6549448307994542e-07, "loss": 0.0007, "step": 9711 }, { "epoch": 4.418562329390355, "grad_norm": 0.038968355547673694, "learning_rate": 1.6523888479956675e-07, "loss": 0.0002, "step": 9712 }, { "epoch": 4.41901728844404, "grad_norm": 0.04923515167755432, "learning_rate": 1.649834773053227e-07, "loss": 0.0004, "step": 9713 }, { "epoch": 4.419472247497725, "grad_norm": 0.06036851091948985, "learning_rate": 1.6472826061808416e-07, "loss": 0.0002, "step": 9714 }, { "epoch": 4.41992720655141, "grad_norm": 0.023488254737782162, "learning_rate": 1.6447323475870554e-07, "loss": 0.0002, "step": 9715 }, { "epoch": 4.420382165605096, "grad_norm": 0.035583537369037674, "learning_rate": 1.6421839974802733e-07, "loss": 0.0001, "step": 9716 }, { "epoch": 4.420837124658781, "grad_norm": 0.02033917026946548, "learning_rate": 1.639637556068735e-07, "loss": 0.0001, "step": 9717 }, { "epoch": 4.421292083712466, "grad_norm": 0.0921549640036292, "learning_rate": 1.6370930235605182e-07, "loss": 0.0004, "step": 9718 }, { "epoch": 4.421747042766151, "grad_norm": 0.04785531859956693, "learning_rate": 1.6345504001635564e-07, "loss": 0.0003, "step": 9719 }, { "epoch": 4.422202001819836, "grad_norm": 0.17629068883551346, "learning_rate": 1.6320096860856143e-07, "loss": 0.0012, "step": 9720 }, { "epoch": 4.422656960873521, "grad_norm": 0.28905602270731684, "learning_rate": 1.6294708815343174e-07, "loss": 0.0027, "step": 9721 }, { "epoch": 4.423111919927207, "grad_norm": 0.11640746926306872, "learning_rate": 1.6269339867171163e-07, "loss": 0.0006, "step": 9722 }, { "epoch": 4.423566878980892, "grad_norm": 0.15542529019270582, "learning_rate": 1.6243990018413146e-07, "loss": 0.0012, "step": 9723 }, { "epoch": 4.424021838034577, "grad_norm": 0.12478904805197706, "learning_rate": 1.621865927114069e-07, "loss": 0.0012, "step": 9724 }, { "epoch": 4.424476797088262, "grad_norm": 0.23241436443019792, "learning_rate": 1.619334762742361e-07, "loss": 0.0005, "step": 9725 }, { "epoch": 4.424931756141947, "grad_norm": 0.2934008020256392, "learning_rate": 1.6168055089330338e-07, "loss": 0.0022, "step": 9726 }, { "epoch": 4.425386715195632, "grad_norm": 0.2427576883455203, "learning_rate": 1.6142781658927603e-07, "loss": 0.0007, "step": 9727 }, { "epoch": 4.425841674249318, "grad_norm": 0.107361203633043, "learning_rate": 1.6117527338280674e-07, "loss": 0.001, "step": 9728 }, { "epoch": 4.426296633303003, "grad_norm": 0.016605757465985905, "learning_rate": 1.609229212945318e-07, "loss": 0.0001, "step": 9729 }, { "epoch": 4.426751592356688, "grad_norm": 0.06980097900909764, "learning_rate": 1.6067076034507246e-07, "loss": 0.0005, "step": 9730 }, { "epoch": 4.427206551410373, "grad_norm": 0.037641766299985044, "learning_rate": 1.6041879055503473e-07, "loss": 0.0001, "step": 9731 }, { "epoch": 4.427661510464058, "grad_norm": 0.06813798792501911, "learning_rate": 1.60167011945008e-07, "loss": 0.0005, "step": 9732 }, { "epoch": 4.428116469517743, "grad_norm": 0.10000367989588477, "learning_rate": 1.5991542453556635e-07, "loss": 0.0008, "step": 9733 }, { "epoch": 4.428571428571429, "grad_norm": 0.015907193344298083, "learning_rate": 1.5966402834726862e-07, "loss": 0.0001, "step": 9734 }, { "epoch": 4.429026387625114, "grad_norm": 0.18857846040394016, "learning_rate": 1.59412823400657e-07, "loss": 0.0008, "step": 9735 }, { "epoch": 4.429481346678799, "grad_norm": 0.21827594253890314, "learning_rate": 1.5916180971626006e-07, "loss": 0.0014, "step": 9736 }, { "epoch": 4.429936305732484, "grad_norm": 0.017135352738874954, "learning_rate": 1.5891098731458832e-07, "loss": 0.0001, "step": 9737 }, { "epoch": 4.430391264786169, "grad_norm": 0.042476318254124054, "learning_rate": 1.58660356216139e-07, "loss": 0.0003, "step": 9738 }, { "epoch": 4.430846223839854, "grad_norm": 0.011638084117480739, "learning_rate": 1.5840991644139187e-07, "loss": 0.0, "step": 9739 }, { "epoch": 4.43130118289354, "grad_norm": 0.3768107217937325, "learning_rate": 1.5815966801081163e-07, "loss": 0.0019, "step": 9740 }, { "epoch": 4.431756141947225, "grad_norm": 0.04653429972004015, "learning_rate": 1.5790961094484802e-07, "loss": 0.0002, "step": 9741 }, { "epoch": 4.4322111010009095, "grad_norm": 0.024165571459754707, "learning_rate": 1.576597452639339e-07, "loss": 0.0002, "step": 9742 }, { "epoch": 4.432666060054595, "grad_norm": 0.016159532988322664, "learning_rate": 1.5741007098848792e-07, "loss": 0.0001, "step": 9743 }, { "epoch": 4.43312101910828, "grad_norm": 0.05428522390139831, "learning_rate": 1.571605881389113e-07, "loss": 0.0002, "step": 9744 }, { "epoch": 4.433575978161965, "grad_norm": 0.32921540696284124, "learning_rate": 1.5691129673559098e-07, "loss": 0.0028, "step": 9745 }, { "epoch": 4.434030937215651, "grad_norm": 0.06324975079046777, "learning_rate": 1.5666219679889906e-07, "loss": 0.0003, "step": 9746 }, { "epoch": 4.434485896269336, "grad_norm": 0.42909741142040775, "learning_rate": 1.5641328834918978e-07, "loss": 0.0026, "step": 9747 }, { "epoch": 4.4349408553230205, "grad_norm": 0.25586368703955814, "learning_rate": 1.5616457140680303e-07, "loss": 0.0042, "step": 9748 }, { "epoch": 4.435395814376706, "grad_norm": 0.013539318852820365, "learning_rate": 1.5591604599206223e-07, "loss": 0.0001, "step": 9749 }, { "epoch": 4.435850773430391, "grad_norm": 0.028000821177842885, "learning_rate": 1.5566771212527697e-07, "loss": 0.0002, "step": 9750 }, { "epoch": 4.436305732484076, "grad_norm": 0.3445395409620574, "learning_rate": 1.5541956982673912e-07, "loss": 0.0047, "step": 9751 }, { "epoch": 4.436760691537762, "grad_norm": 0.00872813544397269, "learning_rate": 1.5517161911672628e-07, "loss": 0.0, "step": 9752 }, { "epoch": 4.437215650591447, "grad_norm": 0.013832357723935177, "learning_rate": 1.5492386001549952e-07, "loss": 0.0001, "step": 9753 }, { "epoch": 4.4376706096451315, "grad_norm": 0.110675042142801, "learning_rate": 1.54676292543304e-07, "loss": 0.0008, "step": 9754 }, { "epoch": 4.438125568698817, "grad_norm": 0.08652302639156345, "learning_rate": 1.5442891672037135e-07, "loss": 0.0004, "step": 9755 }, { "epoch": 4.438580527752502, "grad_norm": 0.17331408607582421, "learning_rate": 1.5418173256691481e-07, "loss": 0.0003, "step": 9756 }, { "epoch": 4.439035486806187, "grad_norm": 0.11544848388010218, "learning_rate": 1.5393474010313353e-07, "loss": 0.0008, "step": 9757 }, { "epoch": 4.439490445859873, "grad_norm": 0.38150366552694065, "learning_rate": 1.5368793934921023e-07, "loss": 0.0015, "step": 9758 }, { "epoch": 4.439945404913558, "grad_norm": 0.8899955026351714, "learning_rate": 1.5344133032531267e-07, "loss": 0.0036, "step": 9759 }, { "epoch": 4.440400363967243, "grad_norm": 0.17943571486382587, "learning_rate": 1.531949130515928e-07, "loss": 0.0016, "step": 9760 }, { "epoch": 4.440855323020928, "grad_norm": 0.02639175788321511, "learning_rate": 1.529486875481867e-07, "loss": 0.0001, "step": 9761 }, { "epoch": 4.441310282074613, "grad_norm": 0.18697162011698137, "learning_rate": 1.5270265383521472e-07, "loss": 0.001, "step": 9762 }, { "epoch": 4.441765241128299, "grad_norm": 0.103508542533136, "learning_rate": 1.5245681193278127e-07, "loss": 0.0005, "step": 9763 }, { "epoch": 4.442220200181984, "grad_norm": 0.05239485746962667, "learning_rate": 1.522111618609759e-07, "loss": 0.0002, "step": 9764 }, { "epoch": 4.442675159235669, "grad_norm": 0.025385809991848883, "learning_rate": 1.5196570363987167e-07, "loss": 0.0001, "step": 9765 }, { "epoch": 4.443130118289354, "grad_norm": 0.046865214848126543, "learning_rate": 1.5172043728952672e-07, "loss": 0.0003, "step": 9766 }, { "epoch": 4.443585077343039, "grad_norm": 0.018438272284355555, "learning_rate": 1.514753628299831e-07, "loss": 0.0001, "step": 9767 }, { "epoch": 4.444040036396724, "grad_norm": 0.06966367922487204, "learning_rate": 1.512304802812664e-07, "loss": 0.0006, "step": 9768 }, { "epoch": 4.44449499545041, "grad_norm": 0.189022247791765, "learning_rate": 1.5098578966338845e-07, "loss": 0.0013, "step": 9769 }, { "epoch": 4.444949954504095, "grad_norm": 0.010608136320793715, "learning_rate": 1.507412909963435e-07, "loss": 0.0001, "step": 9770 }, { "epoch": 4.44540491355778, "grad_norm": 0.15251577537745972, "learning_rate": 1.504969843001114e-07, "loss": 0.0026, "step": 9771 }, { "epoch": 4.445859872611465, "grad_norm": 0.06851879679346339, "learning_rate": 1.5025286959465479e-07, "loss": 0.0004, "step": 9772 }, { "epoch": 4.44631483166515, "grad_norm": 0.01640258731095988, "learning_rate": 1.5000894689992274e-07, "loss": 0.0001, "step": 9773 }, { "epoch": 4.446769790718835, "grad_norm": 0.18229178897816128, "learning_rate": 1.4976521623584678e-07, "loss": 0.0012, "step": 9774 }, { "epoch": 4.447224749772521, "grad_norm": 0.24914796323574995, "learning_rate": 1.4952167762234433e-07, "loss": 0.003, "step": 9775 }, { "epoch": 4.447679708826206, "grad_norm": 0.1423168072971897, "learning_rate": 1.4927833107931556e-07, "loss": 0.0016, "step": 9776 }, { "epoch": 4.4481346678798905, "grad_norm": 0.13554189238700745, "learning_rate": 1.4903517662664568e-07, "loss": 0.0006, "step": 9777 }, { "epoch": 4.448589626933576, "grad_norm": 0.009783471594227535, "learning_rate": 1.487922142842041e-07, "loss": 0.0, "step": 9778 }, { "epoch": 4.449044585987261, "grad_norm": 0.3215174233706055, "learning_rate": 1.4854944407184463e-07, "loss": 0.0055, "step": 9779 }, { "epoch": 4.449499545040946, "grad_norm": 0.03776897119229784, "learning_rate": 1.4830686600940614e-07, "loss": 0.0001, "step": 9780 }, { "epoch": 4.449954504094632, "grad_norm": 0.06547529199802136, "learning_rate": 1.4806448011671025e-07, "loss": 0.0003, "step": 9781 }, { "epoch": 4.450409463148317, "grad_norm": 0.2707600335105275, "learning_rate": 1.4782228641356393e-07, "loss": 0.0031, "step": 9782 }, { "epoch": 4.4508644222020015, "grad_norm": 0.04034992954868125, "learning_rate": 1.4758028491975745e-07, "loss": 0.0001, "step": 9783 }, { "epoch": 4.451319381255687, "grad_norm": 0.0289764424297335, "learning_rate": 1.473384756550672e-07, "loss": 0.0001, "step": 9784 }, { "epoch": 4.451774340309372, "grad_norm": 0.04167238535320433, "learning_rate": 1.470968586392521e-07, "loss": 0.0003, "step": 9785 }, { "epoch": 4.452229299363057, "grad_norm": 0.10241121997961793, "learning_rate": 1.468554338920558e-07, "loss": 0.0005, "step": 9786 }, { "epoch": 4.452684258416743, "grad_norm": 0.1430581816449272, "learning_rate": 1.4661420143320725e-07, "loss": 0.0022, "step": 9787 }, { "epoch": 4.453139217470428, "grad_norm": 0.010047878733633235, "learning_rate": 1.4637316128241763e-07, "loss": 0.0001, "step": 9788 }, { "epoch": 4.4535941765241125, "grad_norm": 0.06256864173139524, "learning_rate": 1.4613231345938506e-07, "loss": 0.0004, "step": 9789 }, { "epoch": 4.454049135577798, "grad_norm": 0.04936333159644459, "learning_rate": 1.458916579837896e-07, "loss": 0.0002, "step": 9790 }, { "epoch": 4.454504094631483, "grad_norm": 0.07872484069271597, "learning_rate": 1.45651194875297e-07, "loss": 0.0005, "step": 9791 }, { "epoch": 4.454959053685168, "grad_norm": 0.009494159340903143, "learning_rate": 1.454109241535562e-07, "loss": 0.0001, "step": 9792 }, { "epoch": 4.455414012738854, "grad_norm": 0.06259063816792981, "learning_rate": 1.4517084583820145e-07, "loss": 0.0004, "step": 9793 }, { "epoch": 4.455868971792539, "grad_norm": 0.01554969721637093, "learning_rate": 1.449309599488513e-07, "loss": 0.0001, "step": 9794 }, { "epoch": 4.4563239308462235, "grad_norm": 0.015064527801470457, "learning_rate": 1.4469126650510755e-07, "loss": 0.0001, "step": 9795 }, { "epoch": 4.456778889899909, "grad_norm": 0.12039851929772516, "learning_rate": 1.4445176552655705e-07, "loss": 0.0006, "step": 9796 }, { "epoch": 4.457233848953594, "grad_norm": 0.018358401553766165, "learning_rate": 1.4421245703277047e-07, "loss": 0.0001, "step": 9797 }, { "epoch": 4.457688808007279, "grad_norm": 0.09008163037652966, "learning_rate": 1.4397334104330335e-07, "loss": 0.0005, "step": 9798 }, { "epoch": 4.458143767060965, "grad_norm": 0.015226249451931451, "learning_rate": 1.437344175776953e-07, "loss": 0.0001, "step": 9799 }, { "epoch": 4.45859872611465, "grad_norm": 0.09949665538997839, "learning_rate": 1.434956866554693e-07, "loss": 0.0008, "step": 9800 }, { "epoch": 4.4590536851683344, "grad_norm": 0.07460430031308773, "learning_rate": 1.4325714829613453e-07, "loss": 0.0003, "step": 9801 }, { "epoch": 4.45950864422202, "grad_norm": 0.12451570985471515, "learning_rate": 1.4301880251918227e-07, "loss": 0.0003, "step": 9802 }, { "epoch": 4.459963603275705, "grad_norm": 0.27402041316289744, "learning_rate": 1.4278064934408946e-07, "loss": 0.0009, "step": 9803 }, { "epoch": 4.460418562329391, "grad_norm": 0.20292367688986576, "learning_rate": 1.4254268879031725e-07, "loss": 0.0017, "step": 9804 }, { "epoch": 4.460873521383076, "grad_norm": 0.11750635867897744, "learning_rate": 1.423049208773103e-07, "loss": 0.0006, "step": 9805 }, { "epoch": 4.461328480436761, "grad_norm": 0.02880854054574754, "learning_rate": 1.420673456244978e-07, "loss": 0.0002, "step": 9806 }, { "epoch": 4.461783439490446, "grad_norm": 0.04855645824532577, "learning_rate": 1.418299630512926e-07, "loss": 0.0002, "step": 9807 }, { "epoch": 4.462238398544131, "grad_norm": 0.021651125088753727, "learning_rate": 1.415927731770944e-07, "loss": 0.0001, "step": 9808 }, { "epoch": 4.462693357597816, "grad_norm": 0.020115055658371562, "learning_rate": 1.4135577602128413e-07, "loss": 0.0001, "step": 9809 }, { "epoch": 4.463148316651502, "grad_norm": 0.06919348129461488, "learning_rate": 1.4111897160322852e-07, "loss": 0.0003, "step": 9810 }, { "epoch": 4.463603275705187, "grad_norm": 0.0900556660969972, "learning_rate": 1.408823599422779e-07, "loss": 0.0004, "step": 9811 }, { "epoch": 4.4640582347588715, "grad_norm": 0.031626611898193595, "learning_rate": 1.406459410577668e-07, "loss": 0.0001, "step": 9812 }, { "epoch": 4.464513193812557, "grad_norm": 0.06452543273808381, "learning_rate": 1.404097149690148e-07, "loss": 0.0004, "step": 9813 }, { "epoch": 4.464968152866242, "grad_norm": 0.1178573874502257, "learning_rate": 1.4017368169532474e-07, "loss": 0.001, "step": 9814 }, { "epoch": 4.465423111919927, "grad_norm": 0.12482690746224949, "learning_rate": 1.3993784125598513e-07, "loss": 0.0002, "step": 9815 }, { "epoch": 4.465878070973613, "grad_norm": 0.023521683738879546, "learning_rate": 1.3970219367026694e-07, "loss": 0.0002, "step": 9816 }, { "epoch": 4.466333030027298, "grad_norm": 0.24374796181088948, "learning_rate": 1.394667389574264e-07, "loss": 0.0034, "step": 9817 }, { "epoch": 4.4667879890809825, "grad_norm": 0.02250914804239694, "learning_rate": 1.39231477136704e-07, "loss": 0.0001, "step": 9818 }, { "epoch": 4.467242948134668, "grad_norm": 0.1618598074364517, "learning_rate": 1.38996408227324e-07, "loss": 0.0017, "step": 9819 }, { "epoch": 4.467697907188353, "grad_norm": 0.0225074803279589, "learning_rate": 1.387615322484953e-07, "loss": 0.0001, "step": 9820 }, { "epoch": 4.468152866242038, "grad_norm": 0.25041965893053963, "learning_rate": 1.3852684921941112e-07, "loss": 0.0014, "step": 9821 }, { "epoch": 4.468607825295724, "grad_norm": 0.13188134696819498, "learning_rate": 1.3829235915924832e-07, "loss": 0.0008, "step": 9822 }, { "epoch": 4.469062784349409, "grad_norm": 0.056824720627197865, "learning_rate": 1.3805806208716855e-07, "loss": 0.0003, "step": 9823 }, { "epoch": 4.4695177434030935, "grad_norm": 0.07492378727663916, "learning_rate": 1.3782395802231785e-07, "loss": 0.0006, "step": 9824 }, { "epoch": 4.469972702456779, "grad_norm": 0.0574127463031934, "learning_rate": 1.3759004698382566e-07, "loss": 0.0003, "step": 9825 }, { "epoch": 4.470427661510464, "grad_norm": 0.1671477342218146, "learning_rate": 1.3735632899080586e-07, "loss": 0.0025, "step": 9826 }, { "epoch": 4.470882620564149, "grad_norm": 0.01712506613046077, "learning_rate": 1.3712280406235733e-07, "loss": 0.0001, "step": 9827 }, { "epoch": 4.471337579617835, "grad_norm": 0.26472039926961577, "learning_rate": 1.3688947221756316e-07, "loss": 0.0018, "step": 9828 }, { "epoch": 4.47179253867152, "grad_norm": 0.2578845524897672, "learning_rate": 1.3665633347548946e-07, "loss": 0.0016, "step": 9829 }, { "epoch": 4.4722474977252045, "grad_norm": 0.06548301836828592, "learning_rate": 1.364233878551874e-07, "loss": 0.0004, "step": 9830 }, { "epoch": 4.47270245677889, "grad_norm": 0.2891181651337792, "learning_rate": 1.3619063537569173e-07, "loss": 0.0027, "step": 9831 }, { "epoch": 4.473157415832575, "grad_norm": 0.03882917559205413, "learning_rate": 1.3595807605602307e-07, "loss": 0.0001, "step": 9832 }, { "epoch": 4.47361237488626, "grad_norm": 0.03778369575400486, "learning_rate": 1.357257099151843e-07, "loss": 0.0002, "step": 9833 }, { "epoch": 4.474067333939946, "grad_norm": 0.06951162854600568, "learning_rate": 1.3549353697216326e-07, "loss": 0.0005, "step": 9834 }, { "epoch": 4.474522292993631, "grad_norm": 0.0885945184226179, "learning_rate": 1.3526155724593288e-07, "loss": 0.0004, "step": 9835 }, { "epoch": 4.4749772520473154, "grad_norm": 0.04513068030934251, "learning_rate": 1.350297707554485e-07, "loss": 0.0002, "step": 9836 }, { "epoch": 4.475432211101001, "grad_norm": 0.007379013038512034, "learning_rate": 1.3479817751965164e-07, "loss": 0.0, "step": 9837 }, { "epoch": 4.475887170154686, "grad_norm": 0.14555736842602635, "learning_rate": 1.3456677755746634e-07, "loss": 0.0006, "step": 9838 }, { "epoch": 4.476342129208371, "grad_norm": 0.01386680825260572, "learning_rate": 1.343355708878019e-07, "loss": 0.0001, "step": 9839 }, { "epoch": 4.476797088262057, "grad_norm": 0.036177410412325965, "learning_rate": 1.3410455752955132e-07, "loss": 0.0002, "step": 9840 }, { "epoch": 4.477252047315742, "grad_norm": 0.2725048821137754, "learning_rate": 1.338737375015911e-07, "loss": 0.0032, "step": 9841 }, { "epoch": 4.477707006369426, "grad_norm": 0.01608396682822352, "learning_rate": 1.336431108227848e-07, "loss": 0.0001, "step": 9842 }, { "epoch": 4.478161965423112, "grad_norm": 0.033096545523248776, "learning_rate": 1.3341267751197678e-07, "loss": 0.0003, "step": 9843 }, { "epoch": 4.478616924476797, "grad_norm": 0.12968653175697406, "learning_rate": 1.3318243758799754e-07, "loss": 0.001, "step": 9844 }, { "epoch": 4.479071883530482, "grad_norm": 0.15887965258518644, "learning_rate": 1.3295239106966118e-07, "loss": 0.0017, "step": 9845 }, { "epoch": 4.479526842584168, "grad_norm": 0.060152790128120574, "learning_rate": 1.3272253797576518e-07, "loss": 0.0003, "step": 9846 }, { "epoch": 4.4799818016378525, "grad_norm": 0.08877183512619093, "learning_rate": 1.3249287832509366e-07, "loss": 0.0005, "step": 9847 }, { "epoch": 4.480436760691537, "grad_norm": 0.07763314986868028, "learning_rate": 1.3226341213641191e-07, "loss": 0.0005, "step": 9848 }, { "epoch": 4.480891719745223, "grad_norm": 0.01773938803203697, "learning_rate": 1.3203413942847189e-07, "loss": 0.0001, "step": 9849 }, { "epoch": 4.481346678798908, "grad_norm": 0.021338453304576444, "learning_rate": 1.3180506022000827e-07, "loss": 0.0001, "step": 9850 }, { "epoch": 4.481801637852593, "grad_norm": 0.08213800754633385, "learning_rate": 1.3157617452974032e-07, "loss": 0.0005, "step": 9851 }, { "epoch": 4.482256596906279, "grad_norm": 0.08680595030569485, "learning_rate": 1.313474823763719e-07, "loss": 0.0005, "step": 9852 }, { "epoch": 4.4827115559599635, "grad_norm": 0.07849861939484405, "learning_rate": 1.311189837785906e-07, "loss": 0.0003, "step": 9853 }, { "epoch": 4.483166515013648, "grad_norm": 0.09971029713574944, "learning_rate": 1.3089067875506788e-07, "loss": 0.0009, "step": 9854 }, { "epoch": 4.483621474067334, "grad_norm": 0.156596118541529, "learning_rate": 1.306625673244602e-07, "loss": 0.0005, "step": 9855 }, { "epoch": 4.484076433121019, "grad_norm": 0.08651085933916612, "learning_rate": 1.3043464950540734e-07, "loss": 0.0005, "step": 9856 }, { "epoch": 4.484531392174704, "grad_norm": 0.030747435212328783, "learning_rate": 1.3020692531653445e-07, "loss": 0.0002, "step": 9857 }, { "epoch": 4.48498635122839, "grad_norm": 0.028983371974892043, "learning_rate": 1.2997939477644967e-07, "loss": 0.0002, "step": 9858 }, { "epoch": 4.4854413102820745, "grad_norm": 0.016313190747693194, "learning_rate": 1.2975205790374617e-07, "loss": 0.0001, "step": 9859 }, { "epoch": 4.485896269335759, "grad_norm": 0.04736812482630083, "learning_rate": 1.2952491471699997e-07, "loss": 0.0002, "step": 9860 }, { "epoch": 4.486351228389445, "grad_norm": 0.026114604404492876, "learning_rate": 1.292979652347731e-07, "loss": 0.0001, "step": 9861 }, { "epoch": 4.48680618744313, "grad_norm": 0.13350441358344078, "learning_rate": 1.2907120947561024e-07, "loss": 0.0007, "step": 9862 }, { "epoch": 4.487261146496815, "grad_norm": 0.10592432448712824, "learning_rate": 1.2884464745804125e-07, "loss": 0.0007, "step": 9863 }, { "epoch": 4.487716105550501, "grad_norm": 0.08952577294844928, "learning_rate": 1.2861827920057994e-07, "loss": 0.0007, "step": 9864 }, { "epoch": 4.4881710646041855, "grad_norm": 0.06946003510551277, "learning_rate": 1.283921047217232e-07, "loss": 0.0004, "step": 9865 }, { "epoch": 4.48862602365787, "grad_norm": 0.3589019329293952, "learning_rate": 1.28166124039954e-07, "loss": 0.0004, "step": 9866 }, { "epoch": 4.489080982711556, "grad_norm": 0.01684448478470936, "learning_rate": 1.279403371737381e-07, "loss": 0.0001, "step": 9867 }, { "epoch": 4.489535941765241, "grad_norm": 0.08672713964286013, "learning_rate": 1.2771474414152552e-07, "loss": 0.0004, "step": 9868 }, { "epoch": 4.489990900818927, "grad_norm": 0.006264175821780466, "learning_rate": 1.2748934496175092e-07, "loss": 0.0, "step": 9869 }, { "epoch": 4.490445859872612, "grad_norm": 0.0500824208796292, "learning_rate": 1.2726413965283264e-07, "loss": 0.0004, "step": 9870 }, { "epoch": 4.4909008189262964, "grad_norm": 0.15294267637722, "learning_rate": 1.27039128233174e-07, "loss": 0.002, "step": 9871 }, { "epoch": 4.491355777979982, "grad_norm": 0.10430638136032389, "learning_rate": 1.2681431072116168e-07, "loss": 0.0005, "step": 9872 }, { "epoch": 4.491810737033667, "grad_norm": 0.03664817440941109, "learning_rate": 1.2658968713516655e-07, "loss": 0.0001, "step": 9873 }, { "epoch": 4.492265696087352, "grad_norm": 0.04715457861939422, "learning_rate": 1.2636525749354396e-07, "loss": 0.0002, "step": 9874 }, { "epoch": 4.492720655141038, "grad_norm": 0.275107408033057, "learning_rate": 1.2614102181463334e-07, "loss": 0.0025, "step": 9875 }, { "epoch": 4.4931756141947226, "grad_norm": 0.20544564934997203, "learning_rate": 1.2591698011675784e-07, "loss": 0.001, "step": 9876 }, { "epoch": 4.493630573248407, "grad_norm": 0.2764291669967296, "learning_rate": 1.256931324182259e-07, "loss": 0.0025, "step": 9877 }, { "epoch": 4.494085532302093, "grad_norm": 0.03180623302840872, "learning_rate": 1.2546947873732896e-07, "loss": 0.0002, "step": 9878 }, { "epoch": 4.494540491355778, "grad_norm": 0.032801725390374334, "learning_rate": 1.2524601909234268e-07, "loss": 0.0002, "step": 9879 }, { "epoch": 4.494995450409463, "grad_norm": 0.07361620589079929, "learning_rate": 1.250227535015272e-07, "loss": 0.0002, "step": 9880 }, { "epoch": 4.495450409463149, "grad_norm": 0.04513060150998427, "learning_rate": 1.2479968198312736e-07, "loss": 0.0004, "step": 9881 }, { "epoch": 4.4959053685168335, "grad_norm": 0.07164892313637282, "learning_rate": 1.2457680455537136e-07, "loss": 0.0007, "step": 9882 }, { "epoch": 4.496360327570518, "grad_norm": 0.02729678099436633, "learning_rate": 1.24354121236471e-07, "loss": 0.0002, "step": 9883 }, { "epoch": 4.496815286624204, "grad_norm": 0.11250988671138751, "learning_rate": 1.2413163204462398e-07, "loss": 0.0007, "step": 9884 }, { "epoch": 4.497270245677889, "grad_norm": 0.04209106969706862, "learning_rate": 1.2390933699801017e-07, "loss": 0.0002, "step": 9885 }, { "epoch": 4.497725204731574, "grad_norm": 0.30150849699760474, "learning_rate": 1.236872361147956e-07, "loss": 0.0021, "step": 9886 }, { "epoch": 4.49818016378526, "grad_norm": 0.016534068719063245, "learning_rate": 1.2346532941312854e-07, "loss": 0.0001, "step": 9887 }, { "epoch": 4.4986351228389445, "grad_norm": 0.014203606944846629, "learning_rate": 1.2324361691114257e-07, "loss": 0.0001, "step": 9888 }, { "epoch": 4.499090081892629, "grad_norm": 0.018822143588966875, "learning_rate": 1.2302209862695453e-07, "loss": 0.0001, "step": 9889 }, { "epoch": 4.499545040946315, "grad_norm": 0.05623861933181946, "learning_rate": 1.2280077457866635e-07, "loss": 0.0002, "step": 9890 }, { "epoch": 4.5, "grad_norm": 0.031999500054796855, "learning_rate": 1.2257964478436358e-07, "loss": 0.0002, "step": 9891 }, { "epoch": 4.500454959053685, "grad_norm": 0.4951088613409253, "learning_rate": 1.223587092621162e-07, "loss": 0.003, "step": 9892 }, { "epoch": 4.500909918107371, "grad_norm": 0.03750030314261723, "learning_rate": 1.2213796802997752e-07, "loss": 0.0002, "step": 9893 }, { "epoch": 4.5013648771610555, "grad_norm": 0.14929035105510524, "learning_rate": 1.2191742110598564e-07, "loss": 0.0007, "step": 9894 }, { "epoch": 4.50181983621474, "grad_norm": 0.009480581469846415, "learning_rate": 1.2169706850816309e-07, "loss": 0.0001, "step": 9895 }, { "epoch": 4.502274795268426, "grad_norm": 0.05261831207390834, "learning_rate": 1.2147691025451573e-07, "loss": 0.0002, "step": 9896 }, { "epoch": 4.502729754322111, "grad_norm": 0.08350726804750176, "learning_rate": 1.2125694636303337e-07, "loss": 0.0006, "step": 9897 }, { "epoch": 4.503184713375796, "grad_norm": 0.2664924970650393, "learning_rate": 1.2103717685169187e-07, "loss": 0.0012, "step": 9898 }, { "epoch": 4.503639672429482, "grad_norm": 0.2684838580852157, "learning_rate": 1.2081760173844825e-07, "loss": 0.0041, "step": 9899 }, { "epoch": 4.5040946314831665, "grad_norm": 0.12253519434065228, "learning_rate": 1.2059822104124626e-07, "loss": 0.0005, "step": 9900 }, { "epoch": 4.504549590536851, "grad_norm": 0.04008661265487665, "learning_rate": 1.203790347780126e-07, "loss": 0.0003, "step": 9901 }, { "epoch": 4.505004549590537, "grad_norm": 0.14347104124378046, "learning_rate": 1.2016004296665772e-07, "loss": 0.0014, "step": 9902 }, { "epoch": 4.505459508644222, "grad_norm": 0.31074534838910534, "learning_rate": 1.1994124562507674e-07, "loss": 0.0029, "step": 9903 }, { "epoch": 4.505914467697907, "grad_norm": 0.06873302549530562, "learning_rate": 1.1972264277114898e-07, "loss": 0.0002, "step": 9904 }, { "epoch": 4.506369426751593, "grad_norm": 0.033298816857412435, "learning_rate": 1.195042344227376e-07, "loss": 0.0001, "step": 9905 }, { "epoch": 4.5068243858052774, "grad_norm": 0.02305502214517376, "learning_rate": 1.1928602059769008e-07, "loss": 0.0002, "step": 9906 }, { "epoch": 4.507279344858962, "grad_norm": 0.13768107270664226, "learning_rate": 1.1906800131383789e-07, "loss": 0.001, "step": 9907 }, { "epoch": 4.507734303912648, "grad_norm": 0.051213833934108596, "learning_rate": 1.18850176588996e-07, "loss": 0.0003, "step": 9908 }, { "epoch": 4.508189262966333, "grad_norm": 0.09263918925591912, "learning_rate": 1.1863254644096488e-07, "loss": 0.0005, "step": 9909 }, { "epoch": 4.508644222020019, "grad_norm": 0.009993676228488818, "learning_rate": 1.1841511088752783e-07, "loss": 0.0, "step": 9910 }, { "epoch": 4.5090991810737036, "grad_norm": 0.020254873487665047, "learning_rate": 1.1819786994645255e-07, "loss": 0.0001, "step": 9911 }, { "epoch": 4.509554140127388, "grad_norm": 0.05611369825631901, "learning_rate": 1.1798082363549152e-07, "loss": 0.0004, "step": 9912 }, { "epoch": 4.510009099181074, "grad_norm": 0.034880426821010425, "learning_rate": 1.1776397197238027e-07, "loss": 0.0001, "step": 9913 }, { "epoch": 4.510464058234759, "grad_norm": 0.1297816151382459, "learning_rate": 1.1754731497483934e-07, "loss": 0.0012, "step": 9914 }, { "epoch": 4.510919017288444, "grad_norm": 0.42707832619687747, "learning_rate": 1.1733085266057265e-07, "loss": 0.0044, "step": 9915 }, { "epoch": 4.51137397634213, "grad_norm": 0.11252217567783618, "learning_rate": 1.171145850472688e-07, "loss": 0.0008, "step": 9916 }, { "epoch": 4.5118289353958145, "grad_norm": 0.0724485830951297, "learning_rate": 1.1689851215260006e-07, "loss": 0.0005, "step": 9917 }, { "epoch": 4.512283894449499, "grad_norm": 0.1085986849099037, "learning_rate": 1.1668263399422258e-07, "loss": 0.0005, "step": 9918 }, { "epoch": 4.512738853503185, "grad_norm": 0.09429009181336304, "learning_rate": 1.1646695058977697e-07, "loss": 0.0005, "step": 9919 }, { "epoch": 4.51319381255687, "grad_norm": 0.20634913337020144, "learning_rate": 1.1625146195688885e-07, "loss": 0.0012, "step": 9920 }, { "epoch": 4.513648771610555, "grad_norm": 0.02237446130285486, "learning_rate": 1.1603616811316638e-07, "loss": 0.0001, "step": 9921 }, { "epoch": 4.514103730664241, "grad_norm": 0.10595956034762585, "learning_rate": 1.1582106907620238e-07, "loss": 0.0012, "step": 9922 }, { "epoch": 4.5145586897179255, "grad_norm": 0.046514872399429426, "learning_rate": 1.156061648635734e-07, "loss": 0.0001, "step": 9923 }, { "epoch": 4.51501364877161, "grad_norm": 0.08807501088977962, "learning_rate": 1.1539145549284092e-07, "loss": 0.0007, "step": 9924 }, { "epoch": 4.515468607825296, "grad_norm": 0.3178454619491908, "learning_rate": 1.151769409815498e-07, "loss": 0.0037, "step": 9925 }, { "epoch": 4.515923566878981, "grad_norm": 0.018352842687959864, "learning_rate": 1.1496262134722935e-07, "loss": 0.0001, "step": 9926 }, { "epoch": 4.516378525932666, "grad_norm": 0.08550811743589072, "learning_rate": 1.1474849660739306e-07, "loss": 0.0007, "step": 9927 }, { "epoch": 4.516833484986352, "grad_norm": 0.0727864055894757, "learning_rate": 1.145345667795375e-07, "loss": 0.0005, "step": 9928 }, { "epoch": 4.5172884440400365, "grad_norm": 0.12896652572834252, "learning_rate": 1.143208318811448e-07, "loss": 0.0005, "step": 9929 }, { "epoch": 4.517743403093721, "grad_norm": 0.3096034255597379, "learning_rate": 1.1410729192968012e-07, "loss": 0.0058, "step": 9930 }, { "epoch": 4.518198362147407, "grad_norm": 0.03629372131019963, "learning_rate": 1.1389394694259287e-07, "loss": 0.0001, "step": 9931 }, { "epoch": 4.518653321201092, "grad_norm": 0.027979547545536013, "learning_rate": 1.1368079693731632e-07, "loss": 0.0001, "step": 9932 }, { "epoch": 4.519108280254777, "grad_norm": 0.12933405995974326, "learning_rate": 1.1346784193126875e-07, "loss": 0.0008, "step": 9933 }, { "epoch": 4.519563239308463, "grad_norm": 0.027401963993423834, "learning_rate": 1.1325508194185181e-07, "loss": 0.0002, "step": 9934 }, { "epoch": 4.5200181983621475, "grad_norm": 0.12855302694107332, "learning_rate": 1.1304251698645102e-07, "loss": 0.0008, "step": 9935 }, { "epoch": 4.520473157415832, "grad_norm": 0.039752678577219897, "learning_rate": 1.1283014708243666e-07, "loss": 0.0001, "step": 9936 }, { "epoch": 4.520928116469518, "grad_norm": 0.27331130021123706, "learning_rate": 1.126179722471618e-07, "loss": 0.0021, "step": 9937 }, { "epoch": 4.521383075523203, "grad_norm": 0.08725844196903082, "learning_rate": 1.1240599249796535e-07, "loss": 0.0006, "step": 9938 }, { "epoch": 4.521838034576888, "grad_norm": 0.15463593914994578, "learning_rate": 1.1219420785216844e-07, "loss": 0.0011, "step": 9939 }, { "epoch": 4.522292993630574, "grad_norm": 0.05639438849384978, "learning_rate": 1.1198261832707808e-07, "loss": 0.0004, "step": 9940 }, { "epoch": 4.522747952684258, "grad_norm": 0.1157380026475925, "learning_rate": 1.1177122393998374e-07, "loss": 0.0007, "step": 9941 }, { "epoch": 4.523202911737943, "grad_norm": 0.05145772848673531, "learning_rate": 1.1156002470815968e-07, "loss": 0.0004, "step": 9942 }, { "epoch": 4.523657870791629, "grad_norm": 0.11127407099978395, "learning_rate": 1.113490206488646e-07, "loss": 0.0006, "step": 9943 }, { "epoch": 4.524112829845314, "grad_norm": 0.0931790793648685, "learning_rate": 1.1113821177934053e-07, "loss": 0.0021, "step": 9944 }, { "epoch": 4.524567788898999, "grad_norm": 0.2598539678565902, "learning_rate": 1.109275981168137e-07, "loss": 0.0018, "step": 9945 }, { "epoch": 4.5250227479526846, "grad_norm": 0.0631057892437016, "learning_rate": 1.1071717967849449e-07, "loss": 0.0002, "step": 9946 }, { "epoch": 4.525477707006369, "grad_norm": 0.22714020424482154, "learning_rate": 1.1050695648157722e-07, "loss": 0.0016, "step": 9947 }, { "epoch": 4.525932666060054, "grad_norm": 0.019483696760845947, "learning_rate": 1.1029692854324092e-07, "loss": 0.0001, "step": 9948 }, { "epoch": 4.52638762511374, "grad_norm": 0.021683825115466137, "learning_rate": 1.1008709588064798e-07, "loss": 0.0001, "step": 9949 }, { "epoch": 4.526842584167425, "grad_norm": 0.014842962708042522, "learning_rate": 1.0987745851094494e-07, "loss": 0.0001, "step": 9950 }, { "epoch": 4.52729754322111, "grad_norm": 0.06939720263520012, "learning_rate": 1.0966801645126202e-07, "loss": 0.0005, "step": 9951 }, { "epoch": 4.5277525022747955, "grad_norm": 0.09763165873699789, "learning_rate": 1.0945876971871383e-07, "loss": 0.0004, "step": 9952 }, { "epoch": 4.52820746132848, "grad_norm": 0.014103198707841564, "learning_rate": 1.0924971833039949e-07, "loss": 0.0001, "step": 9953 }, { "epoch": 4.528662420382165, "grad_norm": 0.22294299516612404, "learning_rate": 1.0904086230340199e-07, "loss": 0.001, "step": 9954 }, { "epoch": 4.529117379435851, "grad_norm": 0.01971923275256939, "learning_rate": 1.088322016547877e-07, "loss": 0.0001, "step": 9955 }, { "epoch": 4.529572338489536, "grad_norm": 0.12949130661944278, "learning_rate": 1.086237364016074e-07, "loss": 0.0007, "step": 9956 }, { "epoch": 4.530027297543221, "grad_norm": 0.3180944761287024, "learning_rate": 1.084154665608958e-07, "loss": 0.002, "step": 9957 }, { "epoch": 4.5304822565969065, "grad_norm": 0.3200562091552956, "learning_rate": 1.0820739214967208e-07, "loss": 0.0033, "step": 9958 }, { "epoch": 4.530937215650591, "grad_norm": 0.17328098149688567, "learning_rate": 1.0799951318493929e-07, "loss": 0.0015, "step": 9959 }, { "epoch": 4.531392174704276, "grad_norm": 0.01603172242902723, "learning_rate": 1.0779182968368357e-07, "loss": 0.0, "step": 9960 }, { "epoch": 4.531847133757962, "grad_norm": 0.22724266020945527, "learning_rate": 1.075843416628769e-07, "loss": 0.0022, "step": 9961 }, { "epoch": 4.532302092811647, "grad_norm": 0.04225244230183861, "learning_rate": 1.073770491394735e-07, "loss": 0.0003, "step": 9962 }, { "epoch": 4.532757051865332, "grad_norm": 0.130872294361375, "learning_rate": 1.0716995213041287e-07, "loss": 0.001, "step": 9963 }, { "epoch": 4.5332120109190175, "grad_norm": 0.01038590451316024, "learning_rate": 1.0696305065261787e-07, "loss": 0.0001, "step": 9964 }, { "epoch": 4.533666969972702, "grad_norm": 0.10615242530050997, "learning_rate": 1.067563447229955e-07, "loss": 0.0005, "step": 9965 }, { "epoch": 4.534121929026387, "grad_norm": 0.02714021453225405, "learning_rate": 1.0654983435843646e-07, "loss": 0.0001, "step": 9966 }, { "epoch": 4.534576888080073, "grad_norm": 0.13156047670212864, "learning_rate": 1.0634351957581613e-07, "loss": 0.0009, "step": 9967 }, { "epoch": 4.535031847133758, "grad_norm": 0.1903328963895742, "learning_rate": 1.0613740039199433e-07, "loss": 0.005, "step": 9968 }, { "epoch": 4.535486806187443, "grad_norm": 0.024289685396523483, "learning_rate": 1.0593147682381344e-07, "loss": 0.0001, "step": 9969 }, { "epoch": 4.5359417652411285, "grad_norm": 0.028941943888687295, "learning_rate": 1.0572574888810055e-07, "loss": 0.0002, "step": 9970 }, { "epoch": 4.536396724294813, "grad_norm": 0.11740130233289105, "learning_rate": 1.0552021660166694e-07, "loss": 0.0006, "step": 9971 }, { "epoch": 4.536851683348498, "grad_norm": 0.034808277511871225, "learning_rate": 1.0531487998130808e-07, "loss": 0.0002, "step": 9972 }, { "epoch": 4.537306642402184, "grad_norm": 0.1087258542128671, "learning_rate": 1.0510973904380301e-07, "loss": 0.0006, "step": 9973 }, { "epoch": 4.537761601455869, "grad_norm": 0.23925255085914765, "learning_rate": 1.0490479380591445e-07, "loss": 0.0012, "step": 9974 }, { "epoch": 4.538216560509554, "grad_norm": 0.032805879579947926, "learning_rate": 1.047000442843904e-07, "loss": 0.0002, "step": 9975 }, { "epoch": 4.538671519563239, "grad_norm": 0.17038017982715783, "learning_rate": 1.0449549049596136e-07, "loss": 0.0018, "step": 9976 }, { "epoch": 4.539126478616924, "grad_norm": 0.04877222551422188, "learning_rate": 1.0429113245734285e-07, "loss": 0.0001, "step": 9977 }, { "epoch": 4.539581437670609, "grad_norm": 0.3499735182951545, "learning_rate": 1.0408697018523428e-07, "loss": 0.0008, "step": 9978 }, { "epoch": 4.540036396724295, "grad_norm": 0.13203657630257137, "learning_rate": 1.0388300369631871e-07, "loss": 0.0002, "step": 9979 }, { "epoch": 4.54049135577798, "grad_norm": 0.10323295232826599, "learning_rate": 1.0367923300726307e-07, "loss": 0.0007, "step": 9980 }, { "epoch": 4.540946314831665, "grad_norm": 0.12219329564563036, "learning_rate": 1.034756581347185e-07, "loss": 0.0009, "step": 9981 }, { "epoch": 4.54140127388535, "grad_norm": 0.014377048390494055, "learning_rate": 1.0327227909532111e-07, "loss": 0.0, "step": 9982 }, { "epoch": 4.541856232939035, "grad_norm": 0.09870783897389586, "learning_rate": 1.0306909590568959e-07, "loss": 0.0006, "step": 9983 }, { "epoch": 4.542311191992721, "grad_norm": 0.20279443883561352, "learning_rate": 1.0286610858242701e-07, "loss": 0.0018, "step": 9984 }, { "epoch": 4.542766151046406, "grad_norm": 0.2141265401001463, "learning_rate": 1.0266331714212069e-07, "loss": 0.0026, "step": 9985 }, { "epoch": 4.543221110100091, "grad_norm": 0.0991599424016002, "learning_rate": 1.0246072160134123e-07, "loss": 0.0006, "step": 9986 }, { "epoch": 4.5436760691537765, "grad_norm": 0.020240779417226216, "learning_rate": 1.0225832197664515e-07, "loss": 0.0001, "step": 9987 }, { "epoch": 4.544131028207461, "grad_norm": 0.1307773891026049, "learning_rate": 1.0205611828457029e-07, "loss": 0.0007, "step": 9988 }, { "epoch": 4.544585987261146, "grad_norm": 0.17739988424691583, "learning_rate": 1.0185411054164096e-07, "loss": 0.0004, "step": 9989 }, { "epoch": 4.545040946314832, "grad_norm": 0.04309251603818407, "learning_rate": 1.0165229876436367e-07, "loss": 0.0002, "step": 9990 }, { "epoch": 4.545495905368517, "grad_norm": 0.07858865490781645, "learning_rate": 1.0145068296922911e-07, "loss": 0.0003, "step": 9991 }, { "epoch": 4.545950864422202, "grad_norm": 0.04133492673692023, "learning_rate": 1.0124926317271355e-07, "loss": 0.0001, "step": 9992 }, { "epoch": 4.5464058234758875, "grad_norm": 0.029706448929442, "learning_rate": 1.0104803939127578e-07, "loss": 0.0002, "step": 9993 }, { "epoch": 4.546860782529572, "grad_norm": 0.09403270997908098, "learning_rate": 1.0084701164135818e-07, "loss": 0.0005, "step": 9994 }, { "epoch": 4.547315741583257, "grad_norm": 0.015592542496362702, "learning_rate": 1.0064617993938847e-07, "loss": 0.0001, "step": 9995 }, { "epoch": 4.547770700636943, "grad_norm": 0.04790211348805635, "learning_rate": 1.0044554430177711e-07, "loss": 0.0002, "step": 9996 }, { "epoch": 4.548225659690628, "grad_norm": 0.15300722155404706, "learning_rate": 1.0024510474492016e-07, "loss": 0.0007, "step": 9997 }, { "epoch": 4.548680618744313, "grad_norm": 0.48027682376153663, "learning_rate": 1.0004486128519592e-07, "loss": 0.0033, "step": 9998 }, { "epoch": 4.5491355777979985, "grad_norm": 0.056429954049064625, "learning_rate": 9.984481393896767e-08, "loss": 0.0002, "step": 9999 }, { "epoch": 4.549590536851683, "grad_norm": 0.03266273917834157, "learning_rate": 9.964496272258178e-08, "loss": 0.0002, "step": 10000 }, { "epoch": 4.550045495905368, "grad_norm": 0.06427715138490808, "learning_rate": 9.944530765236993e-08, "loss": 0.0003, "step": 10001 }, { "epoch": 4.550500454959054, "grad_norm": 0.04813750976994104, "learning_rate": 9.924584874464655e-08, "loss": 0.0004, "step": 10002 }, { "epoch": 4.550955414012739, "grad_norm": 0.15923183672200286, "learning_rate": 9.904658601571083e-08, "loss": 0.0029, "step": 10003 }, { "epoch": 4.551410373066424, "grad_norm": 0.10806746937967927, "learning_rate": 9.884751948184584e-08, "loss": 0.0008, "step": 10004 }, { "epoch": 4.5518653321201095, "grad_norm": 0.15163459680697067, "learning_rate": 9.864864915931748e-08, "loss": 0.0011, "step": 10005 }, { "epoch": 4.552320291173794, "grad_norm": 0.03929315675433409, "learning_rate": 9.84499750643772e-08, "loss": 0.0001, "step": 10006 }, { "epoch": 4.552775250227479, "grad_norm": 0.02884948262422638, "learning_rate": 9.825149721326005e-08, "loss": 0.0001, "step": 10007 }, { "epoch": 4.553230209281165, "grad_norm": 0.04784724320088037, "learning_rate": 9.805321562218417e-08, "loss": 0.0002, "step": 10008 }, { "epoch": 4.55368516833485, "grad_norm": 0.025488045943143092, "learning_rate": 9.785513030735216e-08, "loss": 0.0001, "step": 10009 }, { "epoch": 4.554140127388535, "grad_norm": 0.18161381255172598, "learning_rate": 9.76572412849508e-08, "loss": 0.0018, "step": 10010 }, { "epoch": 4.55459508644222, "grad_norm": 0.09477985333708229, "learning_rate": 9.745954857115104e-08, "loss": 0.0005, "step": 10011 }, { "epoch": 4.555050045495905, "grad_norm": 0.03429172112165389, "learning_rate": 9.726205218210743e-08, "loss": 0.0002, "step": 10012 }, { "epoch": 4.55550500454959, "grad_norm": 0.14304134814484218, "learning_rate": 9.706475213395822e-08, "loss": 0.0006, "step": 10013 }, { "epoch": 4.555959963603276, "grad_norm": 0.020446161954489114, "learning_rate": 9.686764844282547e-08, "loss": 0.0001, "step": 10014 }, { "epoch": 4.556414922656961, "grad_norm": 0.15794775442454095, "learning_rate": 9.667074112481633e-08, "loss": 0.0011, "step": 10015 }, { "epoch": 4.556869881710646, "grad_norm": 0.1162002601220693, "learning_rate": 9.647403019602069e-08, "loss": 0.0013, "step": 10016 }, { "epoch": 4.557324840764331, "grad_norm": 0.200139279002302, "learning_rate": 9.62775156725132e-08, "loss": 0.0008, "step": 10017 }, { "epoch": 4.557779799818016, "grad_norm": 0.035758209315529435, "learning_rate": 9.608119757035211e-08, "loss": 0.0002, "step": 10018 }, { "epoch": 4.558234758871702, "grad_norm": 0.04323321612302484, "learning_rate": 9.588507590557933e-08, "loss": 0.0002, "step": 10019 }, { "epoch": 4.558689717925387, "grad_norm": 0.07094909288934594, "learning_rate": 9.568915069422147e-08, "loss": 0.0004, "step": 10020 }, { "epoch": 4.559144676979072, "grad_norm": 0.05660648501509175, "learning_rate": 9.54934219522885e-08, "loss": 0.0004, "step": 10021 }, { "epoch": 4.5595996360327575, "grad_norm": 0.015394494380752934, "learning_rate": 9.529788969577459e-08, "loss": 0.0001, "step": 10022 }, { "epoch": 4.560054595086442, "grad_norm": 0.14095711114717385, "learning_rate": 9.510255394065692e-08, "loss": 0.0005, "step": 10023 }, { "epoch": 4.560509554140127, "grad_norm": 0.030201157170683834, "learning_rate": 9.49074147028986e-08, "loss": 0.0002, "step": 10024 }, { "epoch": 4.560964513193813, "grad_norm": 0.053869634186569156, "learning_rate": 9.471247199844491e-08, "loss": 0.0003, "step": 10025 }, { "epoch": 4.561419472247498, "grad_norm": 0.058106739635148744, "learning_rate": 9.451772584322589e-08, "loss": 0.0003, "step": 10026 }, { "epoch": 4.561874431301183, "grad_norm": 0.009630564647456118, "learning_rate": 9.432317625315546e-08, "loss": 0.0, "step": 10027 }, { "epoch": 4.5623293903548685, "grad_norm": 0.059035940681520044, "learning_rate": 9.412882324413091e-08, "loss": 0.0002, "step": 10028 }, { "epoch": 4.562784349408553, "grad_norm": 0.02629881037468143, "learning_rate": 9.3934666832034e-08, "loss": 0.0002, "step": 10029 }, { "epoch": 4.563239308462238, "grad_norm": 0.09411042732681882, "learning_rate": 9.374070703273036e-08, "loss": 0.0003, "step": 10030 }, { "epoch": 4.563694267515924, "grad_norm": 0.04075401139648845, "learning_rate": 9.354694386206981e-08, "loss": 0.0001, "step": 10031 }, { "epoch": 4.564149226569609, "grad_norm": 0.18615049351506005, "learning_rate": 9.335337733588551e-08, "loss": 0.0015, "step": 10032 }, { "epoch": 4.564604185623294, "grad_norm": 0.059398907780857, "learning_rate": 9.31600074699951e-08, "loss": 0.0004, "step": 10033 }, { "epoch": 4.5650591446769795, "grad_norm": 0.09486553522256475, "learning_rate": 9.296683428019926e-08, "loss": 0.0005, "step": 10034 }, { "epoch": 4.565514103730664, "grad_norm": 0.641021285774377, "learning_rate": 9.277385778228398e-08, "loss": 0.0021, "step": 10035 }, { "epoch": 4.565969062784349, "grad_norm": 0.11996652800640809, "learning_rate": 9.258107799201804e-08, "loss": 0.0011, "step": 10036 }, { "epoch": 4.566424021838035, "grad_norm": 0.06188849470601045, "learning_rate": 9.23884949251544e-08, "loss": 0.0002, "step": 10037 }, { "epoch": 4.56687898089172, "grad_norm": 0.06770113514528986, "learning_rate": 9.219610859743044e-08, "loss": 0.0009, "step": 10038 }, { "epoch": 4.567333939945405, "grad_norm": 0.026097123708160135, "learning_rate": 9.200391902456667e-08, "loss": 0.0001, "step": 10039 }, { "epoch": 4.5677888989990905, "grad_norm": 0.16087835641379028, "learning_rate": 9.181192622226859e-08, "loss": 0.0012, "step": 10040 }, { "epoch": 4.568243858052775, "grad_norm": 0.12233153435582421, "learning_rate": 9.162013020622473e-08, "loss": 0.001, "step": 10041 }, { "epoch": 4.56869881710646, "grad_norm": 0.031455148924605325, "learning_rate": 9.142853099210758e-08, "loss": 0.0002, "step": 10042 }, { "epoch": 4.569153776160146, "grad_norm": 0.21062337258878586, "learning_rate": 9.123712859557348e-08, "loss": 0.0011, "step": 10043 }, { "epoch": 4.569608735213831, "grad_norm": 0.050051409797823174, "learning_rate": 9.104592303226356e-08, "loss": 0.0003, "step": 10044 }, { "epoch": 4.570063694267516, "grad_norm": 0.027106654670302297, "learning_rate": 9.085491431780224e-08, "loss": 0.0001, "step": 10045 }, { "epoch": 4.570518653321201, "grad_norm": 0.08165453987137891, "learning_rate": 9.06641024677976e-08, "loss": 0.0003, "step": 10046 }, { "epoch": 4.570973612374886, "grad_norm": 0.059635025059753065, "learning_rate": 9.047348749784218e-08, "loss": 0.0003, "step": 10047 }, { "epoch": 4.571428571428571, "grad_norm": 0.020526599960575032, "learning_rate": 9.028306942351156e-08, "loss": 0.0001, "step": 10048 }, { "epoch": 4.571883530482257, "grad_norm": 0.01318452885897771, "learning_rate": 9.00928482603669e-08, "loss": 0.0001, "step": 10049 }, { "epoch": 4.572338489535942, "grad_norm": 0.12282215189681875, "learning_rate": 8.990282402395134e-08, "loss": 0.0018, "step": 10050 }, { "epoch": 4.572793448589627, "grad_norm": 0.033655895136906325, "learning_rate": 8.9712996729793e-08, "loss": 0.0002, "step": 10051 }, { "epoch": 4.573248407643312, "grad_norm": 0.014002269620597122, "learning_rate": 8.952336639340419e-08, "loss": 0.0001, "step": 10052 }, { "epoch": 4.573703366696997, "grad_norm": 0.15501304378754774, "learning_rate": 8.933393303027977e-08, "loss": 0.0012, "step": 10053 }, { "epoch": 4.574158325750682, "grad_norm": 0.13175865197473743, "learning_rate": 8.914469665590036e-08, "loss": 0.0005, "step": 10054 }, { "epoch": 4.574613284804368, "grad_norm": 0.02630506635913927, "learning_rate": 8.895565728572864e-08, "loss": 0.0002, "step": 10055 }, { "epoch": 4.575068243858053, "grad_norm": 0.060013816151834695, "learning_rate": 8.876681493521277e-08, "loss": 0.0003, "step": 10056 }, { "epoch": 4.575523202911738, "grad_norm": 0.27290785836795023, "learning_rate": 8.857816961978377e-08, "loss": 0.0017, "step": 10057 }, { "epoch": 4.575978161965423, "grad_norm": 0.23959164717221618, "learning_rate": 8.838972135485596e-08, "loss": 0.0012, "step": 10058 }, { "epoch": 4.576433121019108, "grad_norm": 0.1183925391228008, "learning_rate": 8.820147015583037e-08, "loss": 0.0003, "step": 10059 }, { "epoch": 4.576888080072793, "grad_norm": 0.2204389822912125, "learning_rate": 8.801341603808883e-08, "loss": 0.0013, "step": 10060 }, { "epoch": 4.577343039126479, "grad_norm": 0.014018888943624623, "learning_rate": 8.782555901699852e-08, "loss": 0.0001, "step": 10061 }, { "epoch": 4.577797998180164, "grad_norm": 0.04926607183624747, "learning_rate": 8.763789910791021e-08, "loss": 0.0007, "step": 10062 }, { "epoch": 4.578252957233849, "grad_norm": 0.009710392008494275, "learning_rate": 8.74504363261583e-08, "loss": 0.0, "step": 10063 }, { "epoch": 4.578707916287534, "grad_norm": 0.06933762677051071, "learning_rate": 8.72631706870622e-08, "loss": 0.0003, "step": 10064 }, { "epoch": 4.579162875341219, "grad_norm": 0.1360717906407911, "learning_rate": 8.707610220592355e-08, "loss": 0.0008, "step": 10065 }, { "epoch": 4.579617834394904, "grad_norm": 0.009165730533887929, "learning_rate": 8.688923089802959e-08, "loss": 0.0001, "step": 10066 }, { "epoch": 4.58007279344859, "grad_norm": 0.021345241000805915, "learning_rate": 8.670255677865003e-08, "loss": 0.0001, "step": 10067 }, { "epoch": 4.580527752502275, "grad_norm": 0.04400760876220649, "learning_rate": 8.651607986303906e-08, "loss": 0.0003, "step": 10068 }, { "epoch": 4.58098271155596, "grad_norm": 0.015356274139048989, "learning_rate": 8.632980016643505e-08, "loss": 0.0001, "step": 10069 }, { "epoch": 4.581437670609645, "grad_norm": 0.03129954037250118, "learning_rate": 8.614371770405971e-08, "loss": 0.0001, "step": 10070 }, { "epoch": 4.58189262966333, "grad_norm": 0.059033519182187806, "learning_rate": 8.595783249111895e-08, "loss": 0.0004, "step": 10071 }, { "epoch": 4.582347588717015, "grad_norm": 0.30258226792782056, "learning_rate": 8.577214454280197e-08, "loss": 0.0019, "step": 10072 }, { "epoch": 4.582802547770701, "grad_norm": 0.1464490066674453, "learning_rate": 8.558665387428277e-08, "loss": 0.0009, "step": 10073 }, { "epoch": 4.583257506824386, "grad_norm": 0.16097996726427075, "learning_rate": 8.540136050071923e-08, "loss": 0.0011, "step": 10074 }, { "epoch": 4.583712465878071, "grad_norm": 0.10272121320871702, "learning_rate": 8.521626443725228e-08, "loss": 0.0004, "step": 10075 }, { "epoch": 4.584167424931756, "grad_norm": 0.028136286194293647, "learning_rate": 8.503136569900705e-08, "loss": 0.0002, "step": 10076 }, { "epoch": 4.584622383985441, "grad_norm": 0.10077765051654013, "learning_rate": 8.484666430109257e-08, "loss": 0.0002, "step": 10077 }, { "epoch": 4.585077343039126, "grad_norm": 0.31635174043096853, "learning_rate": 8.466216025860202e-08, "loss": 0.004, "step": 10078 }, { "epoch": 4.585532302092812, "grad_norm": 0.11236669066008148, "learning_rate": 8.447785358661199e-08, "loss": 0.0018, "step": 10079 }, { "epoch": 4.585987261146497, "grad_norm": 0.03902175017098642, "learning_rate": 8.429374430018372e-08, "loss": 0.0001, "step": 10080 }, { "epoch": 4.5864422202001816, "grad_norm": 0.0899879921599099, "learning_rate": 8.410983241436132e-08, "loss": 0.0006, "step": 10081 }, { "epoch": 4.586897179253867, "grad_norm": 0.262361588336662, "learning_rate": 8.392611794417305e-08, "loss": 0.0038, "step": 10082 }, { "epoch": 4.587352138307552, "grad_norm": 0.29217553851789746, "learning_rate": 8.374260090463188e-08, "loss": 0.001, "step": 10083 }, { "epoch": 4.587807097361237, "grad_norm": 0.02326432192667401, "learning_rate": 8.35592813107336e-08, "loss": 0.0001, "step": 10084 }, { "epoch": 4.588262056414923, "grad_norm": 0.21117252007615442, "learning_rate": 8.337615917745844e-08, "loss": 0.0016, "step": 10085 }, { "epoch": 4.588717015468608, "grad_norm": 0.19943317222974735, "learning_rate": 8.319323451976974e-08, "loss": 0.002, "step": 10086 }, { "epoch": 4.5891719745222925, "grad_norm": 0.1395049542307563, "learning_rate": 8.301050735261579e-08, "loss": 0.0014, "step": 10087 }, { "epoch": 4.589626933575978, "grad_norm": 0.018770667490364715, "learning_rate": 8.282797769092854e-08, "loss": 0.0001, "step": 10088 }, { "epoch": 4.590081892629663, "grad_norm": 0.004168554842953931, "learning_rate": 8.264564554962273e-08, "loss": 0.0, "step": 10089 }, { "epoch": 4.590536851683348, "grad_norm": 0.015876582316303567, "learning_rate": 8.24635109435984e-08, "loss": 0.0001, "step": 10090 }, { "epoch": 4.590991810737034, "grad_norm": 0.053903122407048684, "learning_rate": 8.228157388773805e-08, "loss": 0.0001, "step": 10091 }, { "epoch": 4.591446769790719, "grad_norm": 0.0233172713241367, "learning_rate": 8.209983439690955e-08, "loss": 0.0001, "step": 10092 }, { "epoch": 4.5919017288444035, "grad_norm": 0.013088894405956755, "learning_rate": 8.191829248596323e-08, "loss": 0.0001, "step": 10093 }, { "epoch": 4.592356687898089, "grad_norm": 0.03439473323284593, "learning_rate": 8.173694816973415e-08, "loss": 0.0002, "step": 10094 }, { "epoch": 4.592811646951774, "grad_norm": 0.22672710244918473, "learning_rate": 8.155580146304104e-08, "loss": 0.001, "step": 10095 }, { "epoch": 4.59326660600546, "grad_norm": 0.02203925802102184, "learning_rate": 8.13748523806862e-08, "loss": 0.0002, "step": 10096 }, { "epoch": 4.593721565059145, "grad_norm": 0.16648570517560243, "learning_rate": 8.11941009374556e-08, "loss": 0.0014, "step": 10097 }, { "epoch": 4.59417652411283, "grad_norm": 0.08089825644060317, "learning_rate": 8.101354714812021e-08, "loss": 0.0007, "step": 10098 }, { "epoch": 4.594631483166515, "grad_norm": 0.25755386390661184, "learning_rate": 8.083319102743375e-08, "loss": 0.0028, "step": 10099 }, { "epoch": 4.5950864422202, "grad_norm": 0.23757095961637417, "learning_rate": 8.065303259013362e-08, "loss": 0.004, "step": 10100 }, { "epoch": 4.595541401273885, "grad_norm": 0.01523174360691789, "learning_rate": 8.04730718509425e-08, "loss": 0.0001, "step": 10101 }, { "epoch": 4.595996360327571, "grad_norm": 0.09740573789396874, "learning_rate": 8.029330882456498e-08, "loss": 0.0006, "step": 10102 }, { "epoch": 4.596451319381256, "grad_norm": 0.029094668339720763, "learning_rate": 8.011374352569156e-08, "loss": 0.0002, "step": 10103 }, { "epoch": 4.596906278434941, "grad_norm": 0.1935397703815055, "learning_rate": 7.993437596899467e-08, "loss": 0.0021, "step": 10104 }, { "epoch": 4.597361237488626, "grad_norm": 0.15076378113981886, "learning_rate": 7.975520616913174e-08, "loss": 0.0018, "step": 10105 }, { "epoch": 4.597816196542311, "grad_norm": 0.0813561367172491, "learning_rate": 7.957623414074328e-08, "loss": 0.0008, "step": 10106 }, { "epoch": 4.598271155595996, "grad_norm": 0.03920494958501387, "learning_rate": 7.939745989845427e-08, "loss": 0.0001, "step": 10107 }, { "epoch": 4.598726114649682, "grad_norm": 0.24314608563268847, "learning_rate": 7.921888345687412e-08, "loss": 0.0004, "step": 10108 }, { "epoch": 4.599181073703367, "grad_norm": 0.027186096587570117, "learning_rate": 7.904050483059422e-08, "loss": 0.0001, "step": 10109 }, { "epoch": 4.599636032757052, "grad_norm": 0.058885345991462126, "learning_rate": 7.88623240341918e-08, "loss": 0.0002, "step": 10110 }, { "epoch": 4.600090991810737, "grad_norm": 0.18407281419222243, "learning_rate": 7.868434108222577e-08, "loss": 0.0019, "step": 10111 }, { "epoch": 4.600545950864422, "grad_norm": 0.06768940479058949, "learning_rate": 7.850655598924144e-08, "loss": 0.0003, "step": 10112 }, { "epoch": 4.601000909918107, "grad_norm": 0.020489712748750593, "learning_rate": 7.832896876976581e-08, "loss": 0.0001, "step": 10113 }, { "epoch": 4.601455868971793, "grad_norm": 0.023491601687323418, "learning_rate": 7.815157943831058e-08, "loss": 0.0001, "step": 10114 }, { "epoch": 4.601910828025478, "grad_norm": 0.058784043337870136, "learning_rate": 7.79743880093714e-08, "loss": 0.0003, "step": 10115 }, { "epoch": 4.6023657870791626, "grad_norm": 0.18016827741009192, "learning_rate": 7.779739449742724e-08, "loss": 0.0018, "step": 10116 }, { "epoch": 4.602820746132848, "grad_norm": 0.010440909778507089, "learning_rate": 7.76205989169418e-08, "loss": 0.0001, "step": 10117 }, { "epoch": 4.603275705186533, "grad_norm": 0.017945542654535962, "learning_rate": 7.744400128236158e-08, "loss": 0.0001, "step": 10118 }, { "epoch": 4.603730664240218, "grad_norm": 0.21577270584351588, "learning_rate": 7.726760160811726e-08, "loss": 0.002, "step": 10119 }, { "epoch": 4.604185623293904, "grad_norm": 0.1044598228018884, "learning_rate": 7.709139990862342e-08, "loss": 0.0004, "step": 10120 }, { "epoch": 4.604640582347589, "grad_norm": 0.06963060370740325, "learning_rate": 7.691539619827881e-08, "loss": 0.0003, "step": 10121 }, { "epoch": 4.6050955414012735, "grad_norm": 0.3678281184278254, "learning_rate": 7.673959049146557e-08, "loss": 0.0017, "step": 10122 }, { "epoch": 4.605550500454959, "grad_norm": 0.193155388615865, "learning_rate": 7.656398280254967e-08, "loss": 0.004, "step": 10123 }, { "epoch": 4.606005459508644, "grad_norm": 0.06322097220866274, "learning_rate": 7.638857314588077e-08, "loss": 0.0003, "step": 10124 }, { "epoch": 4.606460418562329, "grad_norm": 0.01726918962162614, "learning_rate": 7.621336153579267e-08, "loss": 0.0001, "step": 10125 }, { "epoch": 4.606915377616015, "grad_norm": 0.10317898627393955, "learning_rate": 7.603834798660309e-08, "loss": 0.0004, "step": 10126 }, { "epoch": 4.6073703366697, "grad_norm": 0.023596023914083886, "learning_rate": 7.586353251261336e-08, "loss": 0.0001, "step": 10127 }, { "epoch": 4.607825295723385, "grad_norm": 0.022926999383829402, "learning_rate": 7.568891512810817e-08, "loss": 0.0001, "step": 10128 }, { "epoch": 4.60828025477707, "grad_norm": 0.07195790384249622, "learning_rate": 7.551449584735693e-08, "loss": 0.0003, "step": 10129 }, { "epoch": 4.608735213830755, "grad_norm": 0.0113507518956911, "learning_rate": 7.534027468461213e-08, "loss": 0.0, "step": 10130 }, { "epoch": 4.609190172884441, "grad_norm": 0.1965740169298503, "learning_rate": 7.516625165411018e-08, "loss": 0.0043, "step": 10131 }, { "epoch": 4.609645131938126, "grad_norm": 0.04119745647136657, "learning_rate": 7.499242677007218e-08, "loss": 0.0002, "step": 10132 }, { "epoch": 4.610100090991811, "grad_norm": 0.03724320310443518, "learning_rate": 7.48188000467015e-08, "loss": 0.0002, "step": 10133 }, { "epoch": 4.610555050045496, "grad_norm": 0.2763527456452041, "learning_rate": 7.46453714981868e-08, "loss": 0.0008, "step": 10134 }, { "epoch": 4.611010009099181, "grad_norm": 0.1397980880641494, "learning_rate": 7.447214113869893e-08, "loss": 0.0008, "step": 10135 }, { "epoch": 4.611464968152866, "grad_norm": 0.07362450305914955, "learning_rate": 7.42991089823944e-08, "loss": 0.0004, "step": 10136 }, { "epoch": 4.611919927206552, "grad_norm": 0.013709287237232776, "learning_rate": 7.412627504341241e-08, "loss": 0.0001, "step": 10137 }, { "epoch": 4.612374886260237, "grad_norm": 0.06206533773425323, "learning_rate": 7.395363933587612e-08, "loss": 0.0002, "step": 10138 }, { "epoch": 4.612829845313922, "grad_norm": 0.06711882746304616, "learning_rate": 7.378120187389231e-08, "loss": 0.0002, "step": 10139 }, { "epoch": 4.613284804367607, "grad_norm": 0.03890864457091438, "learning_rate": 7.360896267155193e-08, "loss": 0.0001, "step": 10140 }, { "epoch": 4.613739763421292, "grad_norm": 0.10252023970595969, "learning_rate": 7.343692174292982e-08, "loss": 0.0006, "step": 10141 }, { "epoch": 4.614194722474977, "grad_norm": 0.009594031515413923, "learning_rate": 7.32650791020842e-08, "loss": 0.0001, "step": 10142 }, { "epoch": 4.614649681528663, "grad_norm": 0.10147122184346334, "learning_rate": 7.309343476305714e-08, "loss": 0.0006, "step": 10143 }, { "epoch": 4.615104640582348, "grad_norm": 0.17855380123113496, "learning_rate": 7.292198873987493e-08, "loss": 0.0009, "step": 10144 }, { "epoch": 4.615559599636033, "grad_norm": 0.0728030219741731, "learning_rate": 7.275074104654695e-08, "loss": 0.0003, "step": 10145 }, { "epoch": 4.616014558689718, "grad_norm": 0.1513923859390768, "learning_rate": 7.257969169706752e-08, "loss": 0.0011, "step": 10146 }, { "epoch": 4.616469517743403, "grad_norm": 0.2557620844197133, "learning_rate": 7.240884070541326e-08, "loss": 0.0008, "step": 10147 }, { "epoch": 4.616924476797088, "grad_norm": 0.38969949644253526, "learning_rate": 7.223818808554578e-08, "loss": 0.0031, "step": 10148 }, { "epoch": 4.617379435850774, "grad_norm": 0.01447037561802309, "learning_rate": 7.206773385140947e-08, "loss": 0.0001, "step": 10149 }, { "epoch": 4.617834394904459, "grad_norm": 0.07670991707430445, "learning_rate": 7.189747801693375e-08, "loss": 0.0005, "step": 10150 }, { "epoch": 4.6182893539581436, "grad_norm": 0.07926057087089236, "learning_rate": 7.172742059603111e-08, "loss": 0.0003, "step": 10151 }, { "epoch": 4.618744313011829, "grad_norm": 0.058005534293962574, "learning_rate": 7.155756160259764e-08, "loss": 0.0003, "step": 10152 }, { "epoch": 4.619199272065514, "grad_norm": 0.029049564989030375, "learning_rate": 7.138790105051335e-08, "loss": 0.0001, "step": 10153 }, { "epoch": 4.619654231119199, "grad_norm": 0.09113707938782738, "learning_rate": 7.121843895364217e-08, "loss": 0.0006, "step": 10154 }, { "epoch": 4.620109190172885, "grad_norm": 0.07047692916534815, "learning_rate": 7.104917532583216e-08, "loss": 0.0004, "step": 10155 }, { "epoch": 4.62056414922657, "grad_norm": 0.21648215555061576, "learning_rate": 7.088011018091395e-08, "loss": 0.0022, "step": 10156 }, { "epoch": 4.6210191082802545, "grad_norm": 0.0977618159397034, "learning_rate": 7.071124353270398e-08, "loss": 0.0004, "step": 10157 }, { "epoch": 4.62147406733394, "grad_norm": 0.21030800887983161, "learning_rate": 7.054257539500037e-08, "loss": 0.0017, "step": 10158 }, { "epoch": 4.621929026387625, "grad_norm": 0.04545628026032578, "learning_rate": 7.037410578158598e-08, "loss": 0.0003, "step": 10159 }, { "epoch": 4.62238398544131, "grad_norm": 0.08741941456285389, "learning_rate": 7.020583470622789e-08, "loss": 0.0006, "step": 10160 }, { "epoch": 4.622838944494996, "grad_norm": 0.15850099991012898, "learning_rate": 7.003776218267588e-08, "loss": 0.0004, "step": 10161 }, { "epoch": 4.623293903548681, "grad_norm": 0.01284741339698623, "learning_rate": 6.986988822466456e-08, "loss": 0.0, "step": 10162 }, { "epoch": 4.6237488626023655, "grad_norm": 0.4020150943061432, "learning_rate": 6.970221284591128e-08, "loss": 0.0041, "step": 10163 }, { "epoch": 4.624203821656051, "grad_norm": 0.02434693996108961, "learning_rate": 6.953473606011813e-08, "loss": 0.0001, "step": 10164 }, { "epoch": 4.624658780709736, "grad_norm": 0.3411933431249659, "learning_rate": 6.936745788097082e-08, "loss": 0.0066, "step": 10165 }, { "epoch": 4.625113739763421, "grad_norm": 0.03797664055411921, "learning_rate": 6.920037832213789e-08, "loss": 0.0003, "step": 10166 }, { "epoch": 4.625568698817107, "grad_norm": 0.08943655009839606, "learning_rate": 6.903349739727284e-08, "loss": 0.001, "step": 10167 }, { "epoch": 4.626023657870792, "grad_norm": 0.03523167559361827, "learning_rate": 6.886681512001225e-08, "loss": 0.0001, "step": 10168 }, { "epoch": 4.6264786169244765, "grad_norm": 0.3143466290884141, "learning_rate": 6.870033150397637e-08, "loss": 0.0017, "step": 10169 }, { "epoch": 4.626933575978162, "grad_norm": 0.19286338022548594, "learning_rate": 6.853404656276957e-08, "loss": 0.0006, "step": 10170 }, { "epoch": 4.627388535031847, "grad_norm": 0.0139010803773336, "learning_rate": 6.836796030998044e-08, "loss": 0.0001, "step": 10171 }, { "epoch": 4.627843494085532, "grad_norm": 0.3621310935742297, "learning_rate": 6.820207275918061e-08, "loss": 0.0028, "step": 10172 }, { "epoch": 4.628298453139218, "grad_norm": 0.009780065206764436, "learning_rate": 6.803638392392537e-08, "loss": 0.0001, "step": 10173 }, { "epoch": 4.628753412192903, "grad_norm": 0.1147165082584177, "learning_rate": 6.787089381775386e-08, "loss": 0.0008, "step": 10174 }, { "epoch": 4.6292083712465875, "grad_norm": 0.0540385164042009, "learning_rate": 6.770560245418972e-08, "loss": 0.0003, "step": 10175 }, { "epoch": 4.629663330300273, "grad_norm": 0.4100760917981542, "learning_rate": 6.754050984673993e-08, "loss": 0.0058, "step": 10176 }, { "epoch": 4.630118289353958, "grad_norm": 0.1795254682992498, "learning_rate": 6.737561600889425e-08, "loss": 0.0013, "step": 10177 }, { "epoch": 4.630573248407643, "grad_norm": 0.031924262904793416, "learning_rate": 6.721092095412774e-08, "loss": 0.0003, "step": 10178 }, { "epoch": 4.631028207461329, "grad_norm": 0.01203129918280449, "learning_rate": 6.70464246958985e-08, "loss": 0.0001, "step": 10179 }, { "epoch": 4.631483166515014, "grad_norm": 0.011623229429308345, "learning_rate": 6.688212724764831e-08, "loss": 0.0001, "step": 10180 }, { "epoch": 4.631938125568698, "grad_norm": 0.15964191191473606, "learning_rate": 6.67180286228028e-08, "loss": 0.0007, "step": 10181 }, { "epoch": 4.632393084622384, "grad_norm": 0.050023760293454435, "learning_rate": 6.655412883477153e-08, "loss": 0.0012, "step": 10182 }, { "epoch": 4.632848043676069, "grad_norm": 0.33686034507814494, "learning_rate": 6.63904278969471e-08, "loss": 0.001, "step": 10183 }, { "epoch": 4.633303002729754, "grad_norm": 0.03592512991061425, "learning_rate": 6.62269258227069e-08, "loss": 0.0002, "step": 10184 }, { "epoch": 4.63375796178344, "grad_norm": 0.32082424607806054, "learning_rate": 6.606362262541188e-08, "loss": 0.0027, "step": 10185 }, { "epoch": 4.6342129208371245, "grad_norm": 0.016928510459188098, "learning_rate": 6.590051831840583e-08, "loss": 0.0001, "step": 10186 }, { "epoch": 4.634667879890809, "grad_norm": 0.0074896180380910945, "learning_rate": 6.573761291501724e-08, "loss": 0.0, "step": 10187 }, { "epoch": 4.635122838944495, "grad_norm": 0.17574955599445377, "learning_rate": 6.557490642855769e-08, "loss": 0.0017, "step": 10188 }, { "epoch": 4.63557779799818, "grad_norm": 0.16562762629714245, "learning_rate": 6.54123988723232e-08, "loss": 0.0005, "step": 10189 }, { "epoch": 4.636032757051865, "grad_norm": 0.0730875523274649, "learning_rate": 6.525009025959289e-08, "loss": 0.0006, "step": 10190 }, { "epoch": 4.636487716105551, "grad_norm": 0.018362908306043632, "learning_rate": 6.508798060362976e-08, "loss": 0.0001, "step": 10191 }, { "epoch": 4.6369426751592355, "grad_norm": 0.28006227272415896, "learning_rate": 6.492606991768125e-08, "loss": 0.0015, "step": 10192 }, { "epoch": 4.63739763421292, "grad_norm": 0.0922460010043603, "learning_rate": 6.476435821497734e-08, "loss": 0.0005, "step": 10193 }, { "epoch": 4.637852593266606, "grad_norm": 0.120812798804586, "learning_rate": 6.460284550873275e-08, "loss": 0.0008, "step": 10194 }, { "epoch": 4.638307552320291, "grad_norm": 0.04322703405566636, "learning_rate": 6.44415318121458e-08, "loss": 0.0002, "step": 10195 }, { "epoch": 4.638762511373976, "grad_norm": 0.259028830748917, "learning_rate": 6.428041713839761e-08, "loss": 0.0012, "step": 10196 }, { "epoch": 4.639217470427662, "grad_norm": 0.09356599950851181, "learning_rate": 6.411950150065404e-08, "loss": 0.0004, "step": 10197 }, { "epoch": 4.6396724294813465, "grad_norm": 0.5187766008816018, "learning_rate": 6.395878491206458e-08, "loss": 0.0035, "step": 10198 }, { "epoch": 4.640127388535031, "grad_norm": 0.06935982068539899, "learning_rate": 6.379826738576206e-08, "loss": 0.0004, "step": 10199 }, { "epoch": 4.640582347588717, "grad_norm": 0.028159247273848255, "learning_rate": 6.363794893486375e-08, "loss": 0.0002, "step": 10200 }, { "epoch": 4.641037306642402, "grad_norm": 0.01604088197577797, "learning_rate": 6.347782957246945e-08, "loss": 0.0001, "step": 10201 }, { "epoch": 4.641492265696087, "grad_norm": 0.1423557006340073, "learning_rate": 6.331790931166371e-08, "loss": 0.0014, "step": 10202 }, { "epoch": 4.641947224749773, "grad_norm": 0.03193833695403542, "learning_rate": 6.315818816551439e-08, "loss": 0.0001, "step": 10203 }, { "epoch": 4.6424021838034575, "grad_norm": 0.023829891469034286, "learning_rate": 6.299866614707328e-08, "loss": 0.0001, "step": 10204 }, { "epoch": 4.642857142857143, "grad_norm": 0.01719304820642158, "learning_rate": 6.28393432693758e-08, "loss": 0.0001, "step": 10205 }, { "epoch": 4.643312101910828, "grad_norm": 0.06066263586466858, "learning_rate": 6.268021954544095e-08, "loss": 0.001, "step": 10206 }, { "epoch": 4.643767060964513, "grad_norm": 0.13217367532246074, "learning_rate": 6.252129498827197e-08, "loss": 0.0005, "step": 10207 }, { "epoch": 4.644222020018199, "grad_norm": 0.1436999637854848, "learning_rate": 6.236256961085486e-08, "loss": 0.0004, "step": 10208 }, { "epoch": 4.644676979071884, "grad_norm": 0.09319612195955942, "learning_rate": 6.22040434261606e-08, "loss": 0.0006, "step": 10209 }, { "epoch": 4.6451319381255685, "grad_norm": 0.15500488972028412, "learning_rate": 6.204571644714303e-08, "loss": 0.0013, "step": 10210 }, { "epoch": 4.645586897179254, "grad_norm": 0.10711900561504388, "learning_rate": 6.188758868673955e-08, "loss": 0.0006, "step": 10211 }, { "epoch": 4.646041856232939, "grad_norm": 0.004558516690659649, "learning_rate": 6.17296601578718e-08, "loss": 0.0, "step": 10212 }, { "epoch": 4.646496815286624, "grad_norm": 0.16410333193779508, "learning_rate": 6.157193087344526e-08, "loss": 0.0008, "step": 10213 }, { "epoch": 4.64695177434031, "grad_norm": 0.07313574809224159, "learning_rate": 6.141440084634854e-08, "loss": 0.0005, "step": 10214 }, { "epoch": 4.647406733393995, "grad_norm": 0.07150389149499847, "learning_rate": 6.125707008945464e-08, "loss": 0.0003, "step": 10215 }, { "epoch": 4.647861692447679, "grad_norm": 0.05618320627562014, "learning_rate": 6.109993861561969e-08, "loss": 0.0002, "step": 10216 }, { "epoch": 4.648316651501365, "grad_norm": 0.08438351758538795, "learning_rate": 6.09430064376837e-08, "loss": 0.0004, "step": 10217 }, { "epoch": 4.64877161055505, "grad_norm": 0.3214667267669352, "learning_rate": 6.078627356847056e-08, "loss": 0.0023, "step": 10218 }, { "epoch": 4.649226569608735, "grad_norm": 0.06411353658851413, "learning_rate": 6.062974002078753e-08, "loss": 0.0004, "step": 10219 }, { "epoch": 4.649681528662421, "grad_norm": 0.014197441555230065, "learning_rate": 6.047340580742634e-08, "loss": 0.0001, "step": 10220 }, { "epoch": 4.6501364877161055, "grad_norm": 0.11829189888049516, "learning_rate": 6.031727094116174e-08, "loss": 0.0003, "step": 10221 }, { "epoch": 4.65059144676979, "grad_norm": 0.28758433705649006, "learning_rate": 6.016133543475189e-08, "loss": 0.0006, "step": 10222 }, { "epoch": 4.651046405823476, "grad_norm": 0.2602554013082255, "learning_rate": 6.000559930093964e-08, "loss": 0.0016, "step": 10223 }, { "epoch": 4.651501364877161, "grad_norm": 0.03900275130099383, "learning_rate": 5.98500625524509e-08, "loss": 0.0001, "step": 10224 }, { "epoch": 4.651956323930846, "grad_norm": 0.020403896629928874, "learning_rate": 5.969472520199553e-08, "loss": 0.0002, "step": 10225 }, { "epoch": 4.652411282984532, "grad_norm": 0.20770032398033844, "learning_rate": 5.953958726226672e-08, "loss": 0.0015, "step": 10226 }, { "epoch": 4.6528662420382165, "grad_norm": 0.08608648255378754, "learning_rate": 5.93846487459418e-08, "loss": 0.0006, "step": 10227 }, { "epoch": 4.653321201091901, "grad_norm": 0.20467625050330115, "learning_rate": 5.922990966568176e-08, "loss": 0.0007, "step": 10228 }, { "epoch": 4.653776160145587, "grad_norm": 0.042685616900311245, "learning_rate": 5.9075370034131216e-08, "loss": 0.0002, "step": 10229 }, { "epoch": 4.654231119199272, "grad_norm": 0.14702939257855888, "learning_rate": 5.89210298639184e-08, "loss": 0.0008, "step": 10230 }, { "epoch": 4.654686078252957, "grad_norm": 0.13734549151745523, "learning_rate": 5.876688916765461e-08, "loss": 0.0008, "step": 10231 }, { "epoch": 4.655141037306643, "grad_norm": 0.03970681861341631, "learning_rate": 5.861294795793671e-08, "loss": 0.0001, "step": 10232 }, { "epoch": 4.6555959963603275, "grad_norm": 0.11635165069270528, "learning_rate": 5.845920624734325e-08, "loss": 0.001, "step": 10233 }, { "epoch": 4.656050955414012, "grad_norm": 0.09358978408722358, "learning_rate": 5.830566404843752e-08, "loss": 0.0005, "step": 10234 }, { "epoch": 4.656505914467698, "grad_norm": 0.07243080922254701, "learning_rate": 5.815232137376642e-08, "loss": 0.0003, "step": 10235 }, { "epoch": 4.656960873521383, "grad_norm": 0.003987867834913185, "learning_rate": 5.799917823586021e-08, "loss": 0.0, "step": 10236 }, { "epoch": 4.657415832575068, "grad_norm": 0.12296568206091014, "learning_rate": 5.784623464723332e-08, "loss": 0.0009, "step": 10237 }, { "epoch": 4.657870791628754, "grad_norm": 0.16414835934574742, "learning_rate": 5.7693490620383544e-08, "loss": 0.0007, "step": 10238 }, { "epoch": 4.6583257506824385, "grad_norm": 0.10509689127779376, "learning_rate": 5.7540946167792265e-08, "loss": 0.0009, "step": 10239 }, { "epoch": 4.658780709736124, "grad_norm": 0.15628226693968364, "learning_rate": 5.738860130192481e-08, "loss": 0.0011, "step": 10240 }, { "epoch": 4.659235668789809, "grad_norm": 0.40332781383361005, "learning_rate": 5.7236456035230096e-08, "loss": 0.003, "step": 10241 }, { "epoch": 4.659690627843494, "grad_norm": 0.06361723399303891, "learning_rate": 5.708451038014068e-08, "loss": 0.0003, "step": 10242 }, { "epoch": 4.66014558689718, "grad_norm": 0.23954099532429604, "learning_rate": 5.693276434907302e-08, "loss": 0.005, "step": 10243 }, { "epoch": 4.660600545950865, "grad_norm": 0.29413666971599267, "learning_rate": 5.6781217954427206e-08, "loss": 0.0015, "step": 10244 }, { "epoch": 4.6610555050045495, "grad_norm": 0.07348738177435787, "learning_rate": 5.6629871208586926e-08, "loss": 0.0002, "step": 10245 }, { "epoch": 4.661510464058235, "grad_norm": 0.28948341379539105, "learning_rate": 5.647872412391897e-08, "loss": 0.004, "step": 10246 }, { "epoch": 4.66196542311192, "grad_norm": 0.03164087454066804, "learning_rate": 5.632777671277484e-08, "loss": 0.0002, "step": 10247 }, { "epoch": 4.662420382165605, "grad_norm": 0.0748117489973812, "learning_rate": 5.617702898748967e-08, "loss": 0.0005, "step": 10248 }, { "epoch": 4.662875341219291, "grad_norm": 0.03791036392669134, "learning_rate": 5.6026480960381377e-08, "loss": 0.0002, "step": 10249 }, { "epoch": 4.663330300272976, "grad_norm": 0.054901969513198985, "learning_rate": 5.587613264375208e-08, "loss": 0.0003, "step": 10250 }, { "epoch": 4.66378525932666, "grad_norm": 0.06770926041246943, "learning_rate": 5.5725984049887495e-08, "loss": 0.0002, "step": 10251 }, { "epoch": 4.664240218380346, "grad_norm": 0.20301169404920738, "learning_rate": 5.5576035191057534e-08, "loss": 0.0009, "step": 10252 }, { "epoch": 4.664695177434031, "grad_norm": 0.14016504999641893, "learning_rate": 5.542628607951489e-08, "loss": 0.0016, "step": 10253 }, { "epoch": 4.665150136487716, "grad_norm": 0.023810863748633518, "learning_rate": 5.527673672749645e-08, "loss": 0.0001, "step": 10254 }, { "epoch": 4.665605095541402, "grad_norm": 0.10131112150954785, "learning_rate": 5.512738714722299e-08, "loss": 0.0006, "step": 10255 }, { "epoch": 4.6660600545950865, "grad_norm": 0.03631232373041352, "learning_rate": 5.497823735089836e-08, "loss": 0.0002, "step": 10256 }, { "epoch": 4.666515013648771, "grad_norm": 0.05086553025115126, "learning_rate": 5.482928735071086e-08, "loss": 0.0002, "step": 10257 }, { "epoch": 4.666969972702457, "grad_norm": 0.005618288765502647, "learning_rate": 5.4680537158831595e-08, "loss": 0.0, "step": 10258 }, { "epoch": 4.667424931756142, "grad_norm": 0.09854152456703526, "learning_rate": 5.4531986787415834e-08, "loss": 0.0008, "step": 10259 }, { "epoch": 4.667879890809827, "grad_norm": 0.06330422132618009, "learning_rate": 5.4383636248602213e-08, "loss": 0.0002, "step": 10260 }, { "epoch": 4.668334849863513, "grad_norm": 0.016668532658139584, "learning_rate": 5.423548555451352e-08, "loss": 0.0001, "step": 10261 }, { "epoch": 4.6687898089171975, "grad_norm": 0.018465042489223895, "learning_rate": 5.4087534717256195e-08, "loss": 0.0001, "step": 10262 }, { "epoch": 4.669244767970882, "grad_norm": 0.07086227040849898, "learning_rate": 5.393978374892001e-08, "loss": 0.0003, "step": 10263 }, { "epoch": 4.669699727024568, "grad_norm": 0.0971844313584898, "learning_rate": 5.379223266157835e-08, "loss": 0.0005, "step": 10264 }, { "epoch": 4.670154686078253, "grad_norm": 0.048592919959855096, "learning_rate": 5.3644881467288245e-08, "loss": 0.0002, "step": 10265 }, { "epoch": 4.670609645131938, "grad_norm": 0.10344850755353506, "learning_rate": 5.34977301780909e-08, "loss": 0.0007, "step": 10266 }, { "epoch": 4.671064604185624, "grad_norm": 0.07673130745494527, "learning_rate": 5.335077880601086e-08, "loss": 0.0004, "step": 10267 }, { "epoch": 4.6715195632393085, "grad_norm": 0.023719928470766354, "learning_rate": 5.320402736305602e-08, "loss": 0.0001, "step": 10268 }, { "epoch": 4.671974522292993, "grad_norm": 0.44698862092955777, "learning_rate": 5.305747586121845e-08, "loss": 0.0028, "step": 10269 }, { "epoch": 4.672429481346679, "grad_norm": 0.09805678856044371, "learning_rate": 5.291112431247358e-08, "loss": 0.0005, "step": 10270 }, { "epoch": 4.672884440400364, "grad_norm": 0.052973764335690635, "learning_rate": 5.276497272878101e-08, "loss": 0.0004, "step": 10271 }, { "epoch": 4.673339399454049, "grad_norm": 0.040758117884753274, "learning_rate": 5.2619021122083116e-08, "loss": 0.0001, "step": 10272 }, { "epoch": 4.673794358507735, "grad_norm": 0.17966389635721527, "learning_rate": 5.247326950430648e-08, "loss": 0.0012, "step": 10273 }, { "epoch": 4.6742493175614195, "grad_norm": 0.08640989072708358, "learning_rate": 5.232771788736157e-08, "loss": 0.0004, "step": 10274 }, { "epoch": 4.674704276615104, "grad_norm": 0.19289999215746176, "learning_rate": 5.2182366283141384e-08, "loss": 0.0008, "step": 10275 }, { "epoch": 4.67515923566879, "grad_norm": 0.13671702753421505, "learning_rate": 5.2037214703524185e-08, "loss": 0.0009, "step": 10276 }, { "epoch": 4.675614194722475, "grad_norm": 0.0982757822824404, "learning_rate": 5.189226316037105e-08, "loss": 0.0008, "step": 10277 }, { "epoch": 4.67606915377616, "grad_norm": 0.04033127906106569, "learning_rate": 5.1747511665526665e-08, "loss": 0.0002, "step": 10278 }, { "epoch": 4.676524112829846, "grad_norm": 0.15043688437277, "learning_rate": 5.1602960230819624e-08, "loss": 0.0003, "step": 10279 }, { "epoch": 4.6769790718835305, "grad_norm": 0.06667540106175948, "learning_rate": 5.145860886806131e-08, "loss": 0.0003, "step": 10280 }, { "epoch": 4.677434030937215, "grad_norm": 0.19860990823112568, "learning_rate": 5.131445758904813e-08, "loss": 0.0022, "step": 10281 }, { "epoch": 4.677888989990901, "grad_norm": 0.13732012957742257, "learning_rate": 5.117050640555926e-08, "loss": 0.0008, "step": 10282 }, { "epoch": 4.678343949044586, "grad_norm": 0.22430423728439122, "learning_rate": 5.1026755329358077e-08, "loss": 0.001, "step": 10283 }, { "epoch": 4.678798908098271, "grad_norm": 0.25519868913110505, "learning_rate": 5.088320437219074e-08, "loss": 0.0037, "step": 10284 }, { "epoch": 4.679253867151957, "grad_norm": 0.07235702799939445, "learning_rate": 5.073985354578786e-08, "loss": 0.0003, "step": 10285 }, { "epoch": 4.679708826205641, "grad_norm": 0.0879576902235708, "learning_rate": 5.059670286186341e-08, "loss": 0.0007, "step": 10286 }, { "epoch": 4.680163785259326, "grad_norm": 0.02654256353194826, "learning_rate": 5.045375233211497e-08, "loss": 0.0001, "step": 10287 }, { "epoch": 4.680618744313012, "grad_norm": 0.20255964400457638, "learning_rate": 5.031100196822403e-08, "loss": 0.0032, "step": 10288 }, { "epoch": 4.681073703366697, "grad_norm": 0.11059376402717858, "learning_rate": 5.0168451781854865e-08, "loss": 0.0005, "step": 10289 }, { "epoch": 4.681528662420382, "grad_norm": 0.05034314678133176, "learning_rate": 5.0026101784656776e-08, "loss": 0.0001, "step": 10290 }, { "epoch": 4.6819836214740675, "grad_norm": 0.04615330624608353, "learning_rate": 4.988395198826157e-08, "loss": 0.0003, "step": 10291 }, { "epoch": 4.682438580527752, "grad_norm": 0.016107987092591463, "learning_rate": 4.974200240428551e-08, "loss": 0.0001, "step": 10292 }, { "epoch": 4.682893539581437, "grad_norm": 0.04984409469160768, "learning_rate": 4.9600253044327364e-08, "loss": 0.0001, "step": 10293 }, { "epoch": 4.683348498635123, "grad_norm": 0.012456866789819557, "learning_rate": 4.945870391997065e-08, "loss": 0.0001, "step": 10294 }, { "epoch": 4.683803457688808, "grad_norm": 0.029165842482358453, "learning_rate": 4.931735504278223e-08, "loss": 0.0001, "step": 10295 }, { "epoch": 4.684258416742493, "grad_norm": 0.012628934104741972, "learning_rate": 4.917620642431231e-08, "loss": 0.0001, "step": 10296 }, { "epoch": 4.6847133757961785, "grad_norm": 0.08176285583276816, "learning_rate": 4.9035258076094996e-08, "loss": 0.0004, "step": 10297 }, { "epoch": 4.685168334849863, "grad_norm": 0.22627515769644557, "learning_rate": 4.889451000964801e-08, "loss": 0.0022, "step": 10298 }, { "epoch": 4.685623293903548, "grad_norm": 0.0782874996029841, "learning_rate": 4.8753962236472443e-08, "loss": 0.0004, "step": 10299 }, { "epoch": 4.686078252957234, "grad_norm": 0.08762871279195823, "learning_rate": 4.861361476805354e-08, "loss": 0.0004, "step": 10300 }, { "epoch": 4.686533212010919, "grad_norm": 0.03396071516665417, "learning_rate": 4.8473467615859637e-08, "loss": 0.0001, "step": 10301 }, { "epoch": 4.686988171064604, "grad_norm": 0.18947224809584584, "learning_rate": 4.833352079134296e-08, "loss": 0.0008, "step": 10302 }, { "epoch": 4.6874431301182895, "grad_norm": 0.06627823451865716, "learning_rate": 4.819377430593908e-08, "loss": 0.0002, "step": 10303 }, { "epoch": 4.687898089171974, "grad_norm": 0.05657065134585036, "learning_rate": 4.8054228171067755e-08, "loss": 0.0003, "step": 10304 }, { "epoch": 4.688353048225659, "grad_norm": 0.1886038039864433, "learning_rate": 4.7914882398132357e-08, "loss": 0.0016, "step": 10305 }, { "epoch": 4.688808007279345, "grad_norm": 0.026059265627915534, "learning_rate": 4.7775736998519065e-08, "loss": 0.0002, "step": 10306 }, { "epoch": 4.68926296633303, "grad_norm": 0.016156376598190872, "learning_rate": 4.7636791983598496e-08, "loss": 0.0001, "step": 10307 }, { "epoch": 4.689717925386715, "grad_norm": 0.17465367146344488, "learning_rate": 4.749804736472435e-08, "loss": 0.0011, "step": 10308 }, { "epoch": 4.6901728844404005, "grad_norm": 0.05385830185235328, "learning_rate": 4.7359503153234235e-08, "loss": 0.0002, "step": 10309 }, { "epoch": 4.690627843494085, "grad_norm": 0.08658182829788887, "learning_rate": 4.7221159360449634e-08, "loss": 0.0004, "step": 10310 }, { "epoch": 4.69108280254777, "grad_norm": 0.10098621271106202, "learning_rate": 4.7083015997675395e-08, "loss": 0.0003, "step": 10311 }, { "epoch": 4.691537761601456, "grad_norm": 0.10285293579223997, "learning_rate": 4.694507307619972e-08, "loss": 0.0004, "step": 10312 }, { "epoch": 4.691992720655141, "grad_norm": 0.18141378465930041, "learning_rate": 4.68073306072947e-08, "loss": 0.0016, "step": 10313 }, { "epoch": 4.692447679708827, "grad_norm": 0.13676688576067322, "learning_rate": 4.6669788602216046e-08, "loss": 0.0016, "step": 10314 }, { "epoch": 4.6929026387625115, "grad_norm": 0.15736324774966992, "learning_rate": 4.653244707220339e-08, "loss": 0.0027, "step": 10315 }, { "epoch": 4.693357597816196, "grad_norm": 0.175050557157584, "learning_rate": 4.639530602847914e-08, "loss": 0.0006, "step": 10316 }, { "epoch": 4.693812556869882, "grad_norm": 0.03904213433104053, "learning_rate": 4.625836548225016e-08, "loss": 0.0003, "step": 10317 }, { "epoch": 4.694267515923567, "grad_norm": 0.5182177598584586, "learning_rate": 4.612162544470666e-08, "loss": 0.0014, "step": 10318 }, { "epoch": 4.694722474977252, "grad_norm": 0.2760196578878952, "learning_rate": 4.598508592702222e-08, "loss": 0.0018, "step": 10319 }, { "epoch": 4.695177434030938, "grad_norm": 0.1017044693976114, "learning_rate": 4.5848746940354294e-08, "loss": 0.0004, "step": 10320 }, { "epoch": 4.695632393084622, "grad_norm": 0.4919901209435283, "learning_rate": 4.571260849584397e-08, "loss": 0.0042, "step": 10321 }, { "epoch": 4.696087352138307, "grad_norm": 0.1316115900836712, "learning_rate": 4.5576670604615955e-08, "loss": 0.0003, "step": 10322 }, { "epoch": 4.696542311191993, "grad_norm": 0.3925470227701807, "learning_rate": 4.544093327777804e-08, "loss": 0.0015, "step": 10323 }, { "epoch": 4.696997270245678, "grad_norm": 0.31940275683055264, "learning_rate": 4.530539652642246e-08, "loss": 0.0029, "step": 10324 }, { "epoch": 4.697452229299363, "grad_norm": 0.1862615721094317, "learning_rate": 4.51700603616248e-08, "loss": 0.0003, "step": 10325 }, { "epoch": 4.6979071883530485, "grad_norm": 0.576067915833267, "learning_rate": 4.503492479444371e-08, "loss": 0.0054, "step": 10326 }, { "epoch": 4.698362147406733, "grad_norm": 0.3872519377979727, "learning_rate": 4.489998983592231e-08, "loss": 0.005, "step": 10327 }, { "epoch": 4.698817106460418, "grad_norm": 0.05153847160472315, "learning_rate": 4.476525549708621e-08, "loss": 0.0004, "step": 10328 }, { "epoch": 4.699272065514104, "grad_norm": 0.04587703170845374, "learning_rate": 4.463072178894579e-08, "loss": 0.0003, "step": 10329 }, { "epoch": 4.699727024567789, "grad_norm": 0.014631810117323434, "learning_rate": 4.4496388722494455e-08, "loss": 0.0001, "step": 10330 }, { "epoch": 4.700181983621474, "grad_norm": 0.23348071221122926, "learning_rate": 4.436225630870927e-08, "loss": 0.0021, "step": 10331 }, { "epoch": 4.7006369426751595, "grad_norm": 0.14539507960560072, "learning_rate": 4.4228324558551195e-08, "loss": 0.0007, "step": 10332 }, { "epoch": 4.701091901728844, "grad_norm": 0.1040036167415139, "learning_rate": 4.4094593482963686e-08, "loss": 0.0004, "step": 10333 }, { "epoch": 4.701546860782529, "grad_norm": 0.14995788288141615, "learning_rate": 4.396106309287579e-08, "loss": 0.0012, "step": 10334 }, { "epoch": 4.702001819836215, "grad_norm": 0.04407824651998482, "learning_rate": 4.3827733399198215e-08, "loss": 0.0003, "step": 10335 }, { "epoch": 4.7024567788899, "grad_norm": 0.08204335417885975, "learning_rate": 4.3694604412826416e-08, "loss": 0.0002, "step": 10336 }, { "epoch": 4.702911737943585, "grad_norm": 0.03600972332427139, "learning_rate": 4.356167614463891e-08, "loss": 0.0001, "step": 10337 }, { "epoch": 4.7033666969972705, "grad_norm": 0.08016837124144119, "learning_rate": 4.3428948605497844e-08, "loss": 0.0008, "step": 10338 }, { "epoch": 4.703821656050955, "grad_norm": 0.19799015321887037, "learning_rate": 4.3296421806249546e-08, "loss": 0.0013, "step": 10339 }, { "epoch": 4.70427661510464, "grad_norm": 0.13597017535016817, "learning_rate": 4.3164095757723404e-08, "loss": 0.0008, "step": 10340 }, { "epoch": 4.704731574158326, "grad_norm": 0.07270118989941027, "learning_rate": 4.3031970470732156e-08, "loss": 0.0004, "step": 10341 }, { "epoch": 4.705186533212011, "grad_norm": 0.1890411880067807, "learning_rate": 4.290004595607272e-08, "loss": 0.0006, "step": 10342 }, { "epoch": 4.705641492265696, "grad_norm": 0.08419305228463635, "learning_rate": 4.276832222452537e-08, "loss": 0.0004, "step": 10343 }, { "epoch": 4.7060964513193815, "grad_norm": 0.08827029031620051, "learning_rate": 4.263679928685399e-08, "loss": 0.0004, "step": 10344 }, { "epoch": 4.706551410373066, "grad_norm": 0.22827354326122168, "learning_rate": 4.2505477153806094e-08, "loss": 0.0022, "step": 10345 }, { "epoch": 4.707006369426751, "grad_norm": 0.08420492680222043, "learning_rate": 4.2374355836112545e-08, "loss": 0.0004, "step": 10346 }, { "epoch": 4.707461328480437, "grad_norm": 0.03482876120470504, "learning_rate": 4.224343534448838e-08, "loss": 0.0001, "step": 10347 }, { "epoch": 4.707916287534122, "grad_norm": 0.0055695753724643955, "learning_rate": 4.211271568963116e-08, "loss": 0.0, "step": 10348 }, { "epoch": 4.708371246587808, "grad_norm": 0.23485740787267376, "learning_rate": 4.198219688222316e-08, "loss": 0.0006, "step": 10349 }, { "epoch": 4.7088262056414925, "grad_norm": 0.03427733611325282, "learning_rate": 4.1851878932930026e-08, "loss": 0.0001, "step": 10350 }, { "epoch": 4.709281164695177, "grad_norm": 0.1094013227453638, "learning_rate": 4.172176185240018e-08, "loss": 0.0007, "step": 10351 }, { "epoch": 4.709736123748863, "grad_norm": 0.11160385441219851, "learning_rate": 4.159184565126651e-08, "loss": 0.0008, "step": 10352 }, { "epoch": 4.710191082802548, "grad_norm": 0.08983999460758507, "learning_rate": 4.146213034014496e-08, "loss": 0.0003, "step": 10353 }, { "epoch": 4.710646041856233, "grad_norm": 0.36690252926583744, "learning_rate": 4.133261592963567e-08, "loss": 0.005, "step": 10354 }, { "epoch": 4.711101000909919, "grad_norm": 0.14847981181146966, "learning_rate": 4.120330243032183e-08, "loss": 0.0006, "step": 10355 }, { "epoch": 4.711555959963603, "grad_norm": 0.024017522546963776, "learning_rate": 4.1074189852770284e-08, "loss": 0.0001, "step": 10356 }, { "epoch": 4.712010919017288, "grad_norm": 0.015727491989390902, "learning_rate": 4.0945278207531466e-08, "loss": 0.0, "step": 10357 }, { "epoch": 4.712465878070974, "grad_norm": 0.07554634654864549, "learning_rate": 4.081656750513946e-08, "loss": 0.0005, "step": 10358 }, { "epoch": 4.712920837124659, "grad_norm": 0.1176502221492188, "learning_rate": 4.0688057756111956e-08, "loss": 0.0004, "step": 10359 }, { "epoch": 4.713375796178344, "grad_norm": 0.020822402259181708, "learning_rate": 4.0559748970950274e-08, "loss": 0.0, "step": 10360 }, { "epoch": 4.7138307552320295, "grad_norm": 0.16275457614449124, "learning_rate": 4.043164116013937e-08, "loss": 0.0013, "step": 10361 }, { "epoch": 4.714285714285714, "grad_norm": 0.06595895582496616, "learning_rate": 4.030373433414697e-08, "loss": 0.0005, "step": 10362 }, { "epoch": 4.714740673339399, "grad_norm": 0.05482359859823791, "learning_rate": 4.017602850342584e-08, "loss": 0.0003, "step": 10363 }, { "epoch": 4.715195632393085, "grad_norm": 0.022000393616267377, "learning_rate": 4.004852367841122e-08, "loss": 0.0001, "step": 10364 }, { "epoch": 4.71565059144677, "grad_norm": 0.23902780203875784, "learning_rate": 3.99212198695223e-08, "loss": 0.0036, "step": 10365 }, { "epoch": 4.716105550500455, "grad_norm": 0.02632372826813501, "learning_rate": 3.97941170871613e-08, "loss": 0.0002, "step": 10366 }, { "epoch": 4.7165605095541405, "grad_norm": 0.18706296933058006, "learning_rate": 3.9667215341714915e-08, "loss": 0.0011, "step": 10367 }, { "epoch": 4.717015468607825, "grad_norm": 0.18394125000459344, "learning_rate": 3.954051464355319e-08, "loss": 0.0018, "step": 10368 }, { "epoch": 4.71747042766151, "grad_norm": 0.09849985224894059, "learning_rate": 3.9414015003029214e-08, "loss": 0.0007, "step": 10369 }, { "epoch": 4.717925386715196, "grad_norm": 0.02328065408864155, "learning_rate": 3.9287716430480014e-08, "loss": 0.0001, "step": 10370 }, { "epoch": 4.718380345768881, "grad_norm": 0.036827049145836184, "learning_rate": 3.916161893622594e-08, "loss": 0.0003, "step": 10371 }, { "epoch": 4.718835304822566, "grad_norm": 0.07531315145445713, "learning_rate": 3.903572253057153e-08, "loss": 0.0003, "step": 10372 }, { "epoch": 4.7192902638762515, "grad_norm": 0.035316259140103015, "learning_rate": 3.8910027223804105e-08, "loss": 0.0001, "step": 10373 }, { "epoch": 4.719745222929936, "grad_norm": 0.04345558190739239, "learning_rate": 3.8784533026195446e-08, "loss": 0.0002, "step": 10374 }, { "epoch": 4.720200181983621, "grad_norm": 0.05778257609680395, "learning_rate": 3.865923994799958e-08, "loss": 0.0003, "step": 10375 }, { "epoch": 4.720655141037307, "grad_norm": 0.016224394198482258, "learning_rate": 3.853414799945554e-08, "loss": 0.0001, "step": 10376 }, { "epoch": 4.721110100090992, "grad_norm": 0.10145586994444004, "learning_rate": 3.840925719078486e-08, "loss": 0.0011, "step": 10377 }, { "epoch": 4.721565059144677, "grad_norm": 0.06249185308314817, "learning_rate": 3.828456753219356e-08, "loss": 0.0003, "step": 10378 }, { "epoch": 4.7220200181983625, "grad_norm": 0.058396138900749364, "learning_rate": 3.816007903387015e-08, "loss": 0.0003, "step": 10379 }, { "epoch": 4.722474977252047, "grad_norm": 0.05239979074329144, "learning_rate": 3.803579170598731e-08, "loss": 0.0002, "step": 10380 }, { "epoch": 4.722929936305732, "grad_norm": 0.06271330227004633, "learning_rate": 3.791170555870166e-08, "loss": 0.0003, "step": 10381 }, { "epoch": 4.723384895359418, "grad_norm": 0.1467472442926911, "learning_rate": 3.778782060215286e-08, "loss": 0.0011, "step": 10382 }, { "epoch": 4.723839854413103, "grad_norm": 0.07427085224630832, "learning_rate": 3.7664136846463916e-08, "loss": 0.0003, "step": 10383 }, { "epoch": 4.724294813466788, "grad_norm": 0.2932844640516416, "learning_rate": 3.754065430174203e-08, "loss": 0.0007, "step": 10384 }, { "epoch": 4.7247497725204735, "grad_norm": 0.026369805586496674, "learning_rate": 3.741737297807746e-08, "loss": 0.0001, "step": 10385 }, { "epoch": 4.725204731574158, "grad_norm": 0.020511213186914835, "learning_rate": 3.729429288554409e-08, "loss": 0.0001, "step": 10386 }, { "epoch": 4.725659690627843, "grad_norm": 0.21494548147259968, "learning_rate": 3.717141403419972e-08, "loss": 0.0007, "step": 10387 }, { "epoch": 4.726114649681529, "grad_norm": 0.04059240097159508, "learning_rate": 3.7048736434085465e-08, "loss": 0.0003, "step": 10388 }, { "epoch": 4.726569608735214, "grad_norm": 0.33729008266721355, "learning_rate": 3.692626009522582e-08, "loss": 0.002, "step": 10389 }, { "epoch": 4.727024567788899, "grad_norm": 0.002848594905693266, "learning_rate": 3.6803985027629164e-08, "loss": 0.0, "step": 10390 }, { "epoch": 4.727479526842584, "grad_norm": 0.03613301605356997, "learning_rate": 3.668191124128695e-08, "loss": 0.0002, "step": 10391 }, { "epoch": 4.727934485896269, "grad_norm": 0.13246534036348193, "learning_rate": 3.6560038746174805e-08, "loss": 0.0007, "step": 10392 }, { "epoch": 4.728389444949954, "grad_norm": 0.06813495066851347, "learning_rate": 3.643836755225172e-08, "loss": 0.0002, "step": 10393 }, { "epoch": 4.72884440400364, "grad_norm": 0.0315438444004096, "learning_rate": 3.631689766945945e-08, "loss": 0.0001, "step": 10394 }, { "epoch": 4.729299363057325, "grad_norm": 0.028013509459089092, "learning_rate": 3.619562910772478e-08, "loss": 0.0002, "step": 10395 }, { "epoch": 4.72975432211101, "grad_norm": 0.00989703585410808, "learning_rate": 3.607456187695646e-08, "loss": 0.0, "step": 10396 }, { "epoch": 4.730209281164695, "grad_norm": 0.11727482512812072, "learning_rate": 3.595369598704823e-08, "loss": 0.0006, "step": 10397 }, { "epoch": 4.73066424021838, "grad_norm": 0.24564087396834097, "learning_rate": 3.583303144787637e-08, "loss": 0.0015, "step": 10398 }, { "epoch": 4.731119199272065, "grad_norm": 0.022682202023293414, "learning_rate": 3.5712568269301306e-08, "loss": 0.0002, "step": 10399 }, { "epoch": 4.731574158325751, "grad_norm": 0.18195618411384526, "learning_rate": 3.559230646116629e-08, "loss": 0.0013, "step": 10400 }, { "epoch": 4.732029117379436, "grad_norm": 0.014223138497300543, "learning_rate": 3.547224603329874e-08, "loss": 0.0001, "step": 10401 }, { "epoch": 4.732484076433121, "grad_norm": 0.01755384123977352, "learning_rate": 3.535238699550969e-08, "loss": 0.0001, "step": 10402 }, { "epoch": 4.732939035486806, "grad_norm": 0.00813534936126288, "learning_rate": 3.5232729357593254e-08, "loss": 0.0, "step": 10403 }, { "epoch": 4.733393994540491, "grad_norm": 0.04547520832131413, "learning_rate": 3.511327312932772e-08, "loss": 0.0003, "step": 10404 }, { "epoch": 4.733848953594176, "grad_norm": 0.052554932719201926, "learning_rate": 3.499401832047361e-08, "loss": 0.0003, "step": 10405 }, { "epoch": 4.734303912647862, "grad_norm": 0.06903894957755015, "learning_rate": 3.487496494077702e-08, "loss": 0.0003, "step": 10406 }, { "epoch": 4.734758871701547, "grad_norm": 0.023001774163148688, "learning_rate": 3.4756112999965454e-08, "loss": 0.0002, "step": 10407 }, { "epoch": 4.735213830755232, "grad_norm": 0.0057483774686134595, "learning_rate": 3.463746250775141e-08, "loss": 0.0, "step": 10408 }, { "epoch": 4.735668789808917, "grad_norm": 0.04988897732721029, "learning_rate": 3.451901347383074e-08, "loss": 0.0002, "step": 10409 }, { "epoch": 4.736123748862602, "grad_norm": 0.037644394144645925, "learning_rate": 3.4400765907882106e-08, "loss": 0.0002, "step": 10410 }, { "epoch": 4.736578707916287, "grad_norm": 0.03518978913810154, "learning_rate": 3.4282719819568324e-08, "loss": 0.0001, "step": 10411 }, { "epoch": 4.737033666969973, "grad_norm": 0.2642077146146915, "learning_rate": 3.416487521853584e-08, "loss": 0.0004, "step": 10412 }, { "epoch": 4.737488626023658, "grad_norm": 0.15368384522588457, "learning_rate": 3.404723211441391e-08, "loss": 0.0014, "step": 10413 }, { "epoch": 4.737943585077343, "grad_norm": 0.24516451827331778, "learning_rate": 3.392979051681622e-08, "loss": 0.0011, "step": 10414 }, { "epoch": 4.738398544131028, "grad_norm": 0.21331366829104498, "learning_rate": 3.3812550435338706e-08, "loss": 0.0013, "step": 10415 }, { "epoch": 4.738853503184713, "grad_norm": 0.19621732466141262, "learning_rate": 3.369551187956288e-08, "loss": 0.0007, "step": 10416 }, { "epoch": 4.739308462238398, "grad_norm": 0.01815850151257493, "learning_rate": 3.3578674859052194e-08, "loss": 0.0001, "step": 10417 }, { "epoch": 4.739763421292084, "grad_norm": 0.07705858459796142, "learning_rate": 3.346203938335402e-08, "loss": 0.0002, "step": 10418 }, { "epoch": 4.740218380345769, "grad_norm": 0.035662005949079416, "learning_rate": 3.3345605461999056e-08, "loss": 0.0001, "step": 10419 }, { "epoch": 4.740673339399454, "grad_norm": 0.3966176548887481, "learning_rate": 3.322937310450164e-08, "loss": 0.0057, "step": 10420 }, { "epoch": 4.741128298453139, "grad_norm": 0.04167664602824811, "learning_rate": 3.3113342320360285e-08, "loss": 0.0002, "step": 10421 }, { "epoch": 4.741583257506824, "grad_norm": 0.003969365102431651, "learning_rate": 3.299751311905602e-08, "loss": 0.0, "step": 10422 }, { "epoch": 4.742038216560509, "grad_norm": 0.10697471783192297, "learning_rate": 3.288188551005433e-08, "loss": 0.0003, "step": 10423 }, { "epoch": 4.742493175614195, "grad_norm": 0.007320555598285575, "learning_rate": 3.27664595028035e-08, "loss": 0.0, "step": 10424 }, { "epoch": 4.74294813466788, "grad_norm": 0.008515987358377056, "learning_rate": 3.26512351067354e-08, "loss": 0.0, "step": 10425 }, { "epoch": 4.743403093721565, "grad_norm": 0.0976931696611599, "learning_rate": 3.2536212331266416e-08, "loss": 0.0004, "step": 10426 }, { "epoch": 4.74385805277525, "grad_norm": 0.020228504430368287, "learning_rate": 3.242139118579485e-08, "loss": 0.0001, "step": 10427 }, { "epoch": 4.744313011828935, "grad_norm": 0.006176319915255529, "learning_rate": 3.230677167970403e-08, "loss": 0.0, "step": 10428 }, { "epoch": 4.744767970882621, "grad_norm": 0.039600379482845956, "learning_rate": 3.2192353822359246e-08, "loss": 0.0002, "step": 10429 }, { "epoch": 4.745222929936306, "grad_norm": 0.13082613245080543, "learning_rate": 3.207813762311107e-08, "loss": 0.0009, "step": 10430 }, { "epoch": 4.745677888989991, "grad_norm": 0.024163965185792384, "learning_rate": 3.19641230912926e-08, "loss": 0.0001, "step": 10431 }, { "epoch": 4.746132848043676, "grad_norm": 0.03706304404761002, "learning_rate": 3.185031023622026e-08, "loss": 0.0002, "step": 10432 }, { "epoch": 4.746587807097361, "grad_norm": 0.04485473787044679, "learning_rate": 3.1736699067194675e-08, "loss": 0.0003, "step": 10433 }, { "epoch": 4.747042766151046, "grad_norm": 0.0188294771198581, "learning_rate": 3.162328959349925e-08, "loss": 0.0001, "step": 10434 }, { "epoch": 4.747497725204732, "grad_norm": 0.030246684992905573, "learning_rate": 3.151008182440185e-08, "loss": 0.0002, "step": 10435 }, { "epoch": 4.747952684258417, "grad_norm": 0.012277355512725345, "learning_rate": 3.1397075769152576e-08, "loss": 0.0001, "step": 10436 }, { "epoch": 4.748407643312102, "grad_norm": 0.01198354053522318, "learning_rate": 3.128427143698626e-08, "loss": 0.0001, "step": 10437 }, { "epoch": 4.748862602365787, "grad_norm": 0.017025700252031577, "learning_rate": 3.1171668837120805e-08, "loss": 0.0001, "step": 10438 }, { "epoch": 4.749317561419472, "grad_norm": 0.006531364357624947, "learning_rate": 3.1059267978757466e-08, "loss": 0.0, "step": 10439 }, { "epoch": 4.749772520473157, "grad_norm": 0.3843778874565918, "learning_rate": 3.094706887108084e-08, "loss": 0.0045, "step": 10440 }, { "epoch": 4.750227479526843, "grad_norm": 0.10300962676116689, "learning_rate": 3.083507152325999e-08, "loss": 0.0005, "step": 10441 }, { "epoch": 4.750682438580528, "grad_norm": 0.006208501213794707, "learning_rate": 3.0723275944446185e-08, "loss": 0.0, "step": 10442 }, { "epoch": 4.751137397634213, "grad_norm": 0.09279675470428447, "learning_rate": 3.061168214377519e-08, "loss": 0.001, "step": 10443 }, { "epoch": 4.751592356687898, "grad_norm": 0.3457114186890978, "learning_rate": 3.050029013036554e-08, "loss": 0.0029, "step": 10444 }, { "epoch": 4.752047315741583, "grad_norm": 0.055671211908465704, "learning_rate": 3.0389099913320505e-08, "loss": 0.0002, "step": 10445 }, { "epoch": 4.752502274795268, "grad_norm": 0.18523929811318407, "learning_rate": 3.027811150172533e-08, "loss": 0.001, "step": 10446 }, { "epoch": 4.752957233848954, "grad_norm": 0.026380773362651915, "learning_rate": 3.016732490464997e-08, "loss": 0.0002, "step": 10447 }, { "epoch": 4.753412192902639, "grad_norm": 0.013716810436319902, "learning_rate": 3.005674013114662e-08, "loss": 0.0, "step": 10448 }, { "epoch": 4.753867151956324, "grad_norm": 0.14212502406524039, "learning_rate": 2.994635719025279e-08, "loss": 0.0008, "step": 10449 }, { "epoch": 4.754322111010009, "grad_norm": 0.14559356227188827, "learning_rate": 2.983617609098766e-08, "loss": 0.0006, "step": 10450 }, { "epoch": 4.754777070063694, "grad_norm": 0.15895858172500793, "learning_rate": 2.9726196842355394e-08, "loss": 0.0015, "step": 10451 }, { "epoch": 4.755232029117379, "grad_norm": 0.18961472980348393, "learning_rate": 2.9616419453342426e-08, "loss": 0.0018, "step": 10452 }, { "epoch": 4.755686988171065, "grad_norm": 0.09186026657979525, "learning_rate": 2.9506843932919637e-08, "loss": 0.0006, "step": 10453 }, { "epoch": 4.75614194722475, "grad_norm": 0.16663806016057348, "learning_rate": 2.9397470290040697e-08, "loss": 0.0011, "step": 10454 }, { "epoch": 4.756596906278435, "grad_norm": 0.09245660171978064, "learning_rate": 2.9288298533643455e-08, "loss": 0.0002, "step": 10455 }, { "epoch": 4.75705186533212, "grad_norm": 0.3162398209656495, "learning_rate": 2.917932867264911e-08, "loss": 0.0028, "step": 10456 }, { "epoch": 4.757506824385805, "grad_norm": 0.006011435433727001, "learning_rate": 2.907056071596137e-08, "loss": 0.0, "step": 10457 }, { "epoch": 4.757961783439491, "grad_norm": 0.020744310178872828, "learning_rate": 2.896199467246924e-08, "loss": 0.0001, "step": 10458 }, { "epoch": 4.758416742493176, "grad_norm": 0.047582396286669326, "learning_rate": 2.8853630551043398e-08, "loss": 0.0002, "step": 10459 }, { "epoch": 4.758871701546861, "grad_norm": 0.14743088511498106, "learning_rate": 2.874546836053954e-08, "loss": 0.0014, "step": 10460 }, { "epoch": 4.759326660600546, "grad_norm": 0.05431232270045081, "learning_rate": 2.8637508109795875e-08, "loss": 0.0003, "step": 10461 }, { "epoch": 4.759781619654231, "grad_norm": 0.6319036600828077, "learning_rate": 2.852974980763451e-08, "loss": 0.0082, "step": 10462 }, { "epoch": 4.760236578707916, "grad_norm": 0.12255677276082155, "learning_rate": 2.8422193462860903e-08, "loss": 0.0006, "step": 10463 }, { "epoch": 4.760691537761602, "grad_norm": 0.249444837338403, "learning_rate": 2.8314839084263857e-08, "loss": 0.004, "step": 10464 }, { "epoch": 4.761146496815287, "grad_norm": 0.334978553531205, "learning_rate": 2.8207686680616354e-08, "loss": 0.0019, "step": 10465 }, { "epoch": 4.761601455868972, "grad_norm": 0.029912150033570346, "learning_rate": 2.8100736260674442e-08, "loss": 0.0002, "step": 10466 }, { "epoch": 4.762056414922657, "grad_norm": 0.12938942809102794, "learning_rate": 2.7993987833176972e-08, "loss": 0.0011, "step": 10467 }, { "epoch": 4.762511373976342, "grad_norm": 0.027051524602999807, "learning_rate": 2.7887441406847516e-08, "loss": 0.0001, "step": 10468 }, { "epoch": 4.762966333030027, "grad_norm": 0.2582818103205618, "learning_rate": 2.7781096990392443e-08, "loss": 0.0005, "step": 10469 }, { "epoch": 4.763421292083713, "grad_norm": 0.019537070263644648, "learning_rate": 2.767495459250147e-08, "loss": 0.0, "step": 10470 }, { "epoch": 4.763876251137398, "grad_norm": 0.04011532929804456, "learning_rate": 2.756901422184821e-08, "loss": 0.0002, "step": 10471 }, { "epoch": 4.764331210191083, "grad_norm": 0.2003673616899048, "learning_rate": 2.746327588709019e-08, "loss": 0.0025, "step": 10472 }, { "epoch": 4.764786169244768, "grad_norm": 0.016039794972526177, "learning_rate": 2.7357739596866884e-08, "loss": 0.0001, "step": 10473 }, { "epoch": 4.765241128298453, "grad_norm": 0.05197935663076809, "learning_rate": 2.7252405359803057e-08, "loss": 0.0003, "step": 10474 }, { "epoch": 4.765696087352138, "grad_norm": 0.01929352981733926, "learning_rate": 2.714727318450572e-08, "loss": 0.0001, "step": 10475 }, { "epoch": 4.766151046405824, "grad_norm": 0.0346400125809303, "learning_rate": 2.7042343079566048e-08, "loss": 0.0002, "step": 10476 }, { "epoch": 4.766606005459509, "grad_norm": 0.27148123078556224, "learning_rate": 2.6937615053558018e-08, "loss": 0.0028, "step": 10477 }, { "epoch": 4.767060964513194, "grad_norm": 0.07520070479089347, "learning_rate": 2.683308911503979e-08, "loss": 0.0004, "step": 10478 }, { "epoch": 4.767515923566879, "grad_norm": 0.05388360363732688, "learning_rate": 2.6728765272553135e-08, "loss": 0.0003, "step": 10479 }, { "epoch": 4.767970882620564, "grad_norm": 0.02841646243244464, "learning_rate": 2.662464353462263e-08, "loss": 0.0001, "step": 10480 }, { "epoch": 4.768425841674249, "grad_norm": 0.17002608105177433, "learning_rate": 2.6520723909756462e-08, "loss": 0.0017, "step": 10481 }, { "epoch": 4.768880800727935, "grad_norm": 0.03173188080904365, "learning_rate": 2.6417006406446456e-08, "loss": 0.0002, "step": 10482 }, { "epoch": 4.76933575978162, "grad_norm": 0.027150060281439074, "learning_rate": 2.631349103316805e-08, "loss": 0.0001, "step": 10483 }, { "epoch": 4.769790718835305, "grad_norm": 0.12358254394939898, "learning_rate": 2.621017779838031e-08, "loss": 0.0007, "step": 10484 }, { "epoch": 4.77024567788899, "grad_norm": 0.027747571413801395, "learning_rate": 2.61070667105251e-08, "loss": 0.0003, "step": 10485 }, { "epoch": 4.770700636942675, "grad_norm": 0.09523904922758075, "learning_rate": 2.6004157778028726e-08, "loss": 0.0006, "step": 10486 }, { "epoch": 4.77115559599636, "grad_norm": 0.18167274569140593, "learning_rate": 2.590145100929975e-08, "loss": 0.0013, "step": 10487 }, { "epoch": 4.771610555050046, "grad_norm": 0.07520278588314347, "learning_rate": 2.5798946412731452e-08, "loss": 0.0003, "step": 10488 }, { "epoch": 4.772065514103731, "grad_norm": 0.01925559251770938, "learning_rate": 2.569664399669991e-08, "loss": 0.0001, "step": 10489 }, { "epoch": 4.772520473157416, "grad_norm": 0.04205466339353107, "learning_rate": 2.5594543769564828e-08, "loss": 0.0002, "step": 10490 }, { "epoch": 4.772975432211101, "grad_norm": 0.04994559807202119, "learning_rate": 2.5492645739669253e-08, "loss": 0.0002, "step": 10491 }, { "epoch": 4.773430391264786, "grad_norm": 0.09622385679624378, "learning_rate": 2.539094991533958e-08, "loss": 0.0004, "step": 10492 }, { "epoch": 4.773885350318471, "grad_norm": 0.055466954972914734, "learning_rate": 2.5289456304886385e-08, "loss": 0.0004, "step": 10493 }, { "epoch": 4.774340309372157, "grad_norm": 0.07530314976859616, "learning_rate": 2.518816491660331e-08, "loss": 0.0004, "step": 10494 }, { "epoch": 4.774795268425842, "grad_norm": 0.12524762658678307, "learning_rate": 2.5087075758767064e-08, "loss": 0.0012, "step": 10495 }, { "epoch": 4.7752502274795265, "grad_norm": 0.009033318716210874, "learning_rate": 2.4986188839638548e-08, "loss": 0.0001, "step": 10496 }, { "epoch": 4.775705186533212, "grad_norm": 0.03631123678990658, "learning_rate": 2.4885504167461437e-08, "loss": 0.0002, "step": 10497 }, { "epoch": 4.776160145586897, "grad_norm": 0.35732632958235466, "learning_rate": 2.47850217504636e-08, "loss": 0.0042, "step": 10498 }, { "epoch": 4.776615104640582, "grad_norm": 0.19148538789640593, "learning_rate": 2.4684741596855687e-08, "loss": 0.0013, "step": 10499 }, { "epoch": 4.777070063694268, "grad_norm": 0.10412716009185198, "learning_rate": 2.4584663714832257e-08, "loss": 0.0008, "step": 10500 }, { "epoch": 4.777525022747953, "grad_norm": 0.042531330358256986, "learning_rate": 2.448478811257149e-08, "loss": 0.0001, "step": 10501 }, { "epoch": 4.7779799818016375, "grad_norm": 0.08666274992052517, "learning_rate": 2.438511479823408e-08, "loss": 0.0012, "step": 10502 }, { "epoch": 4.778434940855323, "grad_norm": 0.19712455666068673, "learning_rate": 2.428564377996545e-08, "loss": 0.0017, "step": 10503 }, { "epoch": 4.778889899909008, "grad_norm": 0.3519637332085336, "learning_rate": 2.4186375065894107e-08, "loss": 0.0029, "step": 10504 }, { "epoch": 4.779344858962693, "grad_norm": 0.04399124736496122, "learning_rate": 2.4087308664131338e-08, "loss": 0.0002, "step": 10505 }, { "epoch": 4.779799818016379, "grad_norm": 0.03498978194165457, "learning_rate": 2.398844458277233e-08, "loss": 0.0001, "step": 10506 }, { "epoch": 4.780254777070064, "grad_norm": 0.23660132358369396, "learning_rate": 2.388978282989618e-08, "loss": 0.0012, "step": 10507 }, { "epoch": 4.7807097361237485, "grad_norm": 0.1128073922466687, "learning_rate": 2.379132341356505e-08, "loss": 0.0002, "step": 10508 }, { "epoch": 4.781164695177434, "grad_norm": 0.15872875188206675, "learning_rate": 2.3693066341824444e-08, "loss": 0.0011, "step": 10509 }, { "epoch": 4.781619654231119, "grad_norm": 0.007646686688797953, "learning_rate": 2.359501162270378e-08, "loss": 0.0, "step": 10510 }, { "epoch": 4.782074613284804, "grad_norm": 0.06261207561745985, "learning_rate": 2.3497159264214974e-08, "loss": 0.001, "step": 10511 }, { "epoch": 4.78252957233849, "grad_norm": 0.20850967235962034, "learning_rate": 2.339950927435497e-08, "loss": 0.0012, "step": 10512 }, { "epoch": 4.782984531392175, "grad_norm": 0.15922705877534576, "learning_rate": 2.330206166110238e-08, "loss": 0.0007, "step": 10513 }, { "epoch": 4.7834394904458595, "grad_norm": 0.13405192355277407, "learning_rate": 2.320481643242112e-08, "loss": 0.001, "step": 10514 }, { "epoch": 4.783894449499545, "grad_norm": 0.12421529047453904, "learning_rate": 2.3107773596257042e-08, "loss": 0.0006, "step": 10515 }, { "epoch": 4.78434940855323, "grad_norm": 0.04549000824049216, "learning_rate": 2.3010933160539927e-08, "loss": 0.0002, "step": 10516 }, { "epoch": 4.784804367606915, "grad_norm": 0.09050053092770913, "learning_rate": 2.291429513318344e-08, "loss": 0.0003, "step": 10517 }, { "epoch": 4.785259326660601, "grad_norm": 0.13005592974396554, "learning_rate": 2.2817859522084597e-08, "loss": 0.0006, "step": 10518 }, { "epoch": 4.785714285714286, "grad_norm": 0.013860900317113099, "learning_rate": 2.2721626335123202e-08, "loss": 0.0001, "step": 10519 }, { "epoch": 4.7861692447679705, "grad_norm": 0.18314331467512132, "learning_rate": 2.262559558016325e-08, "loss": 0.0016, "step": 10520 }, { "epoch": 4.786624203821656, "grad_norm": 0.04210066833752256, "learning_rate": 2.2529767265051795e-08, "loss": 0.0002, "step": 10521 }, { "epoch": 4.787079162875341, "grad_norm": 0.12716084739657255, "learning_rate": 2.2434141397619513e-08, "loss": 0.0014, "step": 10522 }, { "epoch": 4.787534121929026, "grad_norm": 0.12248574903055168, "learning_rate": 2.2338717985680993e-08, "loss": 0.0008, "step": 10523 }, { "epoch": 4.787989080982712, "grad_norm": 0.010730737149028505, "learning_rate": 2.2243497037033325e-08, "loss": 0.0001, "step": 10524 }, { "epoch": 4.788444040036397, "grad_norm": 0.12614474490787275, "learning_rate": 2.214847855945751e-08, "loss": 0.0012, "step": 10525 }, { "epoch": 4.788898999090081, "grad_norm": 0.03235190888518915, "learning_rate": 2.205366256071817e-08, "loss": 0.0002, "step": 10526 }, { "epoch": 4.789353958143767, "grad_norm": 0.01800723390956778, "learning_rate": 2.1959049048562997e-08, "loss": 0.0001, "step": 10527 }, { "epoch": 4.789808917197452, "grad_norm": 0.1284029350809757, "learning_rate": 2.186463803072386e-08, "loss": 0.0004, "step": 10528 }, { "epoch": 4.790263876251137, "grad_norm": 0.07907902713024895, "learning_rate": 2.1770429514915425e-08, "loss": 0.001, "step": 10529 }, { "epoch": 4.790718835304823, "grad_norm": 0.021605575856893926, "learning_rate": 2.1676423508835698e-08, "loss": 0.0002, "step": 10530 }, { "epoch": 4.7911737943585075, "grad_norm": 0.3659533481791585, "learning_rate": 2.158262002016659e-08, "loss": 0.0031, "step": 10531 }, { "epoch": 4.791628753412192, "grad_norm": 0.05151083146251555, "learning_rate": 2.1489019056573636e-08, "loss": 0.0003, "step": 10532 }, { "epoch": 4.792083712465878, "grad_norm": 0.010324012586427546, "learning_rate": 2.1395620625704882e-08, "loss": 0.0, "step": 10533 }, { "epoch": 4.792538671519563, "grad_norm": 0.3480977996283698, "learning_rate": 2.130242473519284e-08, "loss": 0.0023, "step": 10534 }, { "epoch": 4.792993630573249, "grad_norm": 0.2881102469264038, "learning_rate": 2.1209431392653078e-08, "loss": 0.004, "step": 10535 }, { "epoch": 4.793448589626934, "grad_norm": 0.08217828402253445, "learning_rate": 2.1116640605684247e-08, "loss": 0.0003, "step": 10536 }, { "epoch": 4.7939035486806185, "grad_norm": 0.020307040704058085, "learning_rate": 2.1024052381869164e-08, "loss": 0.0001, "step": 10537 }, { "epoch": 4.794358507734304, "grad_norm": 0.07561370664093418, "learning_rate": 2.0931666728773448e-08, "loss": 0.0003, "step": 10538 }, { "epoch": 4.794813466787989, "grad_norm": 0.016907196057796052, "learning_rate": 2.0839483653946613e-08, "loss": 0.0001, "step": 10539 }, { "epoch": 4.795268425841674, "grad_norm": 0.01572951940926912, "learning_rate": 2.0747503164921522e-08, "loss": 0.0001, "step": 10540 }, { "epoch": 4.79572338489536, "grad_norm": 0.03794174392949506, "learning_rate": 2.0655725269213833e-08, "loss": 0.0002, "step": 10541 }, { "epoch": 4.796178343949045, "grad_norm": 0.01591731825606315, "learning_rate": 2.056414997432421e-08, "loss": 0.0001, "step": 10542 }, { "epoch": 4.7966333030027295, "grad_norm": 0.3016221623593345, "learning_rate": 2.0472777287735e-08, "loss": 0.0025, "step": 10543 }, { "epoch": 4.797088262056415, "grad_norm": 0.5110643454042528, "learning_rate": 2.0381607216913012e-08, "loss": 0.0034, "step": 10544 }, { "epoch": 4.7975432211101, "grad_norm": 0.2809573590803082, "learning_rate": 2.029063976930784e-08, "loss": 0.0038, "step": 10545 }, { "epoch": 4.797998180163785, "grad_norm": 0.021317394880968343, "learning_rate": 2.0199874952353816e-08, "loss": 0.0001, "step": 10546 }, { "epoch": 4.798453139217471, "grad_norm": 0.26115425262854836, "learning_rate": 2.0109312773467228e-08, "loss": 0.0005, "step": 10547 }, { "epoch": 4.798908098271156, "grad_norm": 0.21284294878169654, "learning_rate": 2.0018953240048267e-08, "loss": 0.001, "step": 10548 }, { "epoch": 4.7993630573248405, "grad_norm": 0.1460301646803416, "learning_rate": 1.9928796359481306e-08, "loss": 0.0005, "step": 10549 }, { "epoch": 4.799818016378526, "grad_norm": 0.1408695830104508, "learning_rate": 1.9838842139132953e-08, "loss": 0.0012, "step": 10550 }, { "epoch": 4.800272975432211, "grad_norm": 0.025034360579898895, "learning_rate": 1.974909058635399e-08, "loss": 0.0001, "step": 10551 }, { "epoch": 4.800727934485896, "grad_norm": 0.028254751396348365, "learning_rate": 1.9659541708478836e-08, "loss": 0.0002, "step": 10552 }, { "epoch": 4.801182893539582, "grad_norm": 0.09544566782191001, "learning_rate": 1.9570195512824963e-08, "loss": 0.0006, "step": 10553 }, { "epoch": 4.801637852593267, "grad_norm": 0.27927805152645935, "learning_rate": 1.9481052006692924e-08, "loss": 0.0038, "step": 10554 }, { "epoch": 4.8020928116469515, "grad_norm": 0.049100949581569985, "learning_rate": 1.9392111197367446e-08, "loss": 0.0002, "step": 10555 }, { "epoch": 4.802547770700637, "grad_norm": 0.017379952143386862, "learning_rate": 1.930337309211633e-08, "loss": 0.0001, "step": 10556 }, { "epoch": 4.803002729754322, "grad_norm": 0.06863583306046496, "learning_rate": 1.9214837698190992e-08, "loss": 0.0004, "step": 10557 }, { "epoch": 4.803457688808007, "grad_norm": 0.10548998264089175, "learning_rate": 1.9126505022825924e-08, "loss": 0.0009, "step": 10558 }, { "epoch": 4.803912647861693, "grad_norm": 0.06608158230460594, "learning_rate": 1.9038375073239245e-08, "loss": 0.0011, "step": 10559 }, { "epoch": 4.804367606915378, "grad_norm": 0.17648204630600905, "learning_rate": 1.8950447856632694e-08, "loss": 0.0013, "step": 10560 }, { "epoch": 4.804822565969062, "grad_norm": 0.16146273852018087, "learning_rate": 1.8862723380191072e-08, "loss": 0.0013, "step": 10561 }, { "epoch": 4.805277525022748, "grad_norm": 0.00965697805380615, "learning_rate": 1.8775201651083097e-08, "loss": 0.0, "step": 10562 }, { "epoch": 4.805732484076433, "grad_norm": 0.022003901651771723, "learning_rate": 1.8687882676460546e-08, "loss": 0.0001, "step": 10563 }, { "epoch": 4.806187443130118, "grad_norm": 0.009635732036962216, "learning_rate": 1.860076646345882e-08, "loss": 0.0, "step": 10564 }, { "epoch": 4.806642402183804, "grad_norm": 0.04962519964696768, "learning_rate": 1.8513853019196393e-08, "loss": 0.0004, "step": 10565 }, { "epoch": 4.8070973612374885, "grad_norm": 0.20146514241082927, "learning_rate": 1.842714235077564e-08, "loss": 0.0022, "step": 10566 }, { "epoch": 4.807552320291173, "grad_norm": 0.09530366396713405, "learning_rate": 1.834063446528228e-08, "loss": 0.0006, "step": 10567 }, { "epoch": 4.808007279344859, "grad_norm": 0.04639339298464435, "learning_rate": 1.8254329369785106e-08, "loss": 0.0003, "step": 10568 }, { "epoch": 4.808462238398544, "grad_norm": 0.006471234032519072, "learning_rate": 1.816822707133653e-08, "loss": 0.0, "step": 10569 }, { "epoch": 4.80891719745223, "grad_norm": 0.08149548985531777, "learning_rate": 1.808232757697259e-08, "loss": 0.0006, "step": 10570 }, { "epoch": 4.809372156505915, "grad_norm": 0.16167127918906546, "learning_rate": 1.7996630893712675e-08, "loss": 0.0009, "step": 10571 }, { "epoch": 4.8098271155595995, "grad_norm": 0.036265679915818146, "learning_rate": 1.791113702855951e-08, "loss": 0.0001, "step": 10572 }, { "epoch": 4.810282074613285, "grad_norm": 0.05194557931874723, "learning_rate": 1.782584598849918e-08, "loss": 0.0003, "step": 10573 }, { "epoch": 4.81073703366697, "grad_norm": 0.0498892383223227, "learning_rate": 1.7740757780501383e-08, "loss": 0.0003, "step": 10574 }, { "epoch": 4.811191992720655, "grad_norm": 0.018288345988732444, "learning_rate": 1.7655872411518892e-08, "loss": 0.0001, "step": 10575 }, { "epoch": 4.811646951774341, "grad_norm": 0.09918912622855867, "learning_rate": 1.7571189888488384e-08, "loss": 0.0005, "step": 10576 }, { "epoch": 4.812101910828026, "grad_norm": 0.05523794933518149, "learning_rate": 1.7486710218329872e-08, "loss": 0.0003, "step": 10577 }, { "epoch": 4.8125568698817105, "grad_norm": 0.06054202891755409, "learning_rate": 1.740243340794645e-08, "loss": 0.0004, "step": 10578 }, { "epoch": 4.813011828935396, "grad_norm": 0.013474274604919153, "learning_rate": 1.7318359464224555e-08, "loss": 0.0001, "step": 10579 }, { "epoch": 4.813466787989081, "grad_norm": 0.2775167765619049, "learning_rate": 1.7234488394034798e-08, "loss": 0.0006, "step": 10580 }, { "epoch": 4.813921747042766, "grad_norm": 0.23579249473769315, "learning_rate": 1.7150820204230868e-08, "loss": 0.0013, "step": 10581 }, { "epoch": 4.814376706096452, "grad_norm": 0.05562171559665929, "learning_rate": 1.7067354901649235e-08, "loss": 0.0002, "step": 10582 }, { "epoch": 4.814831665150137, "grad_norm": 0.00774848642735039, "learning_rate": 1.6984092493110283e-08, "loss": 0.0, "step": 10583 }, { "epoch": 4.8152866242038215, "grad_norm": 0.24875546706438104, "learning_rate": 1.6901032985418286e-08, "loss": 0.0024, "step": 10584 }, { "epoch": 4.815741583257507, "grad_norm": 0.10061200511718711, "learning_rate": 1.6818176385360318e-08, "loss": 0.0006, "step": 10585 }, { "epoch": 4.816196542311192, "grad_norm": 0.12022424196005245, "learning_rate": 1.6735522699707076e-08, "loss": 0.0007, "step": 10586 }, { "epoch": 4.816651501364877, "grad_norm": 0.02573401535730819, "learning_rate": 1.6653071935212872e-08, "loss": 0.0001, "step": 10587 }, { "epoch": 4.817106460418563, "grad_norm": 0.13053041709095647, "learning_rate": 1.6570824098614547e-08, "loss": 0.0005, "step": 10588 }, { "epoch": 4.817561419472248, "grad_norm": 0.14333193143248227, "learning_rate": 1.6488779196633387e-08, "loss": 0.0008, "step": 10589 }, { "epoch": 4.8180163785259325, "grad_norm": 0.08913667208584285, "learning_rate": 1.6406937235973753e-08, "loss": 0.001, "step": 10590 }, { "epoch": 4.818471337579618, "grad_norm": 0.00877688040780065, "learning_rate": 1.6325298223323626e-08, "loss": 0.0001, "step": 10591 }, { "epoch": 4.818926296633303, "grad_norm": 0.4762719092246266, "learning_rate": 1.6243862165353784e-08, "loss": 0.0023, "step": 10592 }, { "epoch": 4.819381255686988, "grad_norm": 0.47283817741230144, "learning_rate": 1.6162629068718904e-08, "loss": 0.0078, "step": 10593 }, { "epoch": 4.819836214740674, "grad_norm": 0.061387809652683666, "learning_rate": 1.6081598940057287e-08, "loss": 0.0005, "step": 10594 }, { "epoch": 4.820291173794359, "grad_norm": 0.09246283420872985, "learning_rate": 1.600077178599002e-08, "loss": 0.0007, "step": 10595 }, { "epoch": 4.820746132848043, "grad_norm": 0.1675466006309071, "learning_rate": 1.5920147613122106e-08, "loss": 0.001, "step": 10596 }, { "epoch": 4.821201091901729, "grad_norm": 0.01792665527781693, "learning_rate": 1.5839726428041602e-08, "loss": 0.0001, "step": 10597 }, { "epoch": 4.821656050955414, "grad_norm": 0.14494891607616378, "learning_rate": 1.5759508237320476e-08, "loss": 0.001, "step": 10598 }, { "epoch": 4.822111010009099, "grad_norm": 0.02037471196542757, "learning_rate": 1.5679493047513482e-08, "loss": 0.0001, "step": 10599 }, { "epoch": 4.822565969062785, "grad_norm": 0.14943776356317673, "learning_rate": 1.5599680865159285e-08, "loss": 0.0011, "step": 10600 }, { "epoch": 4.8230209281164695, "grad_norm": 0.017544455927330185, "learning_rate": 1.5520071696779605e-08, "loss": 0.0001, "step": 10601 }, { "epoch": 4.823475887170154, "grad_norm": 0.14816982849252444, "learning_rate": 1.5440665548879796e-08, "loss": 0.0006, "step": 10602 }, { "epoch": 4.82393084622384, "grad_norm": 0.04997167344696649, "learning_rate": 1.5361462427948838e-08, "loss": 0.0002, "step": 10603 }, { "epoch": 4.824385805277525, "grad_norm": 0.07492551973451524, "learning_rate": 1.5282462340458493e-08, "loss": 0.0007, "step": 10604 }, { "epoch": 4.82484076433121, "grad_norm": 0.07765745851220608, "learning_rate": 1.5203665292864435e-08, "loss": 0.0008, "step": 10605 }, { "epoch": 4.825295723384896, "grad_norm": 0.1447540699691798, "learning_rate": 1.5125071291605675e-08, "loss": 0.0012, "step": 10606 }, { "epoch": 4.8257506824385805, "grad_norm": 0.016460777315993307, "learning_rate": 1.504668034310458e-08, "loss": 0.0001, "step": 10607 }, { "epoch": 4.826205641492265, "grad_norm": 0.034115163430221376, "learning_rate": 1.496849245376658e-08, "loss": 0.0001, "step": 10608 }, { "epoch": 4.826660600545951, "grad_norm": 0.031131513480408733, "learning_rate": 1.4890507629981288e-08, "loss": 0.0002, "step": 10609 }, { "epoch": 4.827115559599636, "grad_norm": 0.012895728608366066, "learning_rate": 1.4812725878120827e-08, "loss": 0.0001, "step": 10610 }, { "epoch": 4.827570518653321, "grad_norm": 0.026373580192510866, "learning_rate": 1.47351472045415e-08, "loss": 0.0001, "step": 10611 }, { "epoch": 4.828025477707007, "grad_norm": 0.11608950669880563, "learning_rate": 1.4657771615582683e-08, "loss": 0.0005, "step": 10612 }, { "epoch": 4.8284804367606915, "grad_norm": 0.04909047279830297, "learning_rate": 1.4580599117567096e-08, "loss": 0.0002, "step": 10613 }, { "epoch": 4.828935395814376, "grad_norm": 0.006689680708383037, "learning_rate": 1.4503629716800804e-08, "loss": 0.0, "step": 10614 }, { "epoch": 4.829390354868062, "grad_norm": 0.02853169402959729, "learning_rate": 1.44268634195735e-08, "loss": 0.0001, "step": 10615 }, { "epoch": 4.829845313921747, "grad_norm": 0.11865104449228502, "learning_rate": 1.435030023215822e-08, "loss": 0.0012, "step": 10616 }, { "epoch": 4.830300272975432, "grad_norm": 0.07073492041077611, "learning_rate": 1.4273940160811073e-08, "loss": 0.0003, "step": 10617 }, { "epoch": 4.830755232029118, "grad_norm": 0.06397135580617469, "learning_rate": 1.4197783211772343e-08, "loss": 0.0004, "step": 10618 }, { "epoch": 4.8312101910828025, "grad_norm": 0.2469504636287445, "learning_rate": 1.412182939126483e-08, "loss": 0.0031, "step": 10619 }, { "epoch": 4.831665150136487, "grad_norm": 0.007493113735356582, "learning_rate": 1.4046078705495514e-08, "loss": 0.0, "step": 10620 }, { "epoch": 4.832120109190173, "grad_norm": 0.007690324267631788, "learning_rate": 1.3970531160654166e-08, "loss": 0.0, "step": 10621 }, { "epoch": 4.832575068243858, "grad_norm": 0.1603784086955444, "learning_rate": 1.3895186762913903e-08, "loss": 0.0009, "step": 10622 }, { "epoch": 4.833030027297543, "grad_norm": 0.06785915050453031, "learning_rate": 1.3820045518432025e-08, "loss": 0.0003, "step": 10623 }, { "epoch": 4.833484986351229, "grad_norm": 0.03763110235250103, "learning_rate": 1.3745107433348615e-08, "loss": 0.0002, "step": 10624 }, { "epoch": 4.8339399454049135, "grad_norm": 0.10161906470149008, "learning_rate": 1.3670372513787111e-08, "loss": 0.0007, "step": 10625 }, { "epoch": 4.834394904458598, "grad_norm": 0.008214559415265697, "learning_rate": 1.3595840765854574e-08, "loss": 0.0, "step": 10626 }, { "epoch": 4.834849863512284, "grad_norm": 0.19668367502207457, "learning_rate": 1.3521512195641407e-08, "loss": 0.0011, "step": 10627 }, { "epoch": 4.835304822565969, "grad_norm": 0.042119088502758695, "learning_rate": 1.3447386809221364e-08, "loss": 0.0002, "step": 10628 }, { "epoch": 4.835759781619654, "grad_norm": 0.04950723580817976, "learning_rate": 1.3373464612651821e-08, "loss": 0.0002, "step": 10629 }, { "epoch": 4.83621474067334, "grad_norm": 0.32734600508934775, "learning_rate": 1.3299745611973224e-08, "loss": 0.0034, "step": 10630 }, { "epoch": 4.836669699727024, "grad_norm": 0.18817306550168247, "learning_rate": 1.3226229813209645e-08, "loss": 0.0007, "step": 10631 }, { "epoch": 4.837124658780709, "grad_norm": 0.23423262604884412, "learning_rate": 1.3152917222368222e-08, "loss": 0.0021, "step": 10632 }, { "epoch": 4.837579617834395, "grad_norm": 0.028218168334768475, "learning_rate": 1.3079807845439996e-08, "loss": 0.0001, "step": 10633 }, { "epoch": 4.83803457688808, "grad_norm": 0.04017725369796755, "learning_rate": 1.3006901688399077e-08, "loss": 0.0002, "step": 10634 }, { "epoch": 4.838489535941765, "grad_norm": 0.006139846084892517, "learning_rate": 1.293419875720292e-08, "loss": 0.0, "step": 10635 }, { "epoch": 4.8389444949954505, "grad_norm": 0.45280386306931064, "learning_rate": 1.2861699057792887e-08, "loss": 0.0021, "step": 10636 }, { "epoch": 4.839399454049135, "grad_norm": 0.010526886635555314, "learning_rate": 1.278940259609257e-08, "loss": 0.0001, "step": 10637 }, { "epoch": 4.83985441310282, "grad_norm": 0.027046449988127094, "learning_rate": 1.2717309378010024e-08, "loss": 0.0001, "step": 10638 }, { "epoch": 4.840309372156506, "grad_norm": 0.17300784414598788, "learning_rate": 1.2645419409436921e-08, "loss": 0.0022, "step": 10639 }, { "epoch": 4.840764331210191, "grad_norm": 0.3012144600096755, "learning_rate": 1.2573732696247176e-08, "loss": 0.0019, "step": 10640 }, { "epoch": 4.841219290263876, "grad_norm": 0.0543081160424877, "learning_rate": 1.250224924429888e-08, "loss": 0.0002, "step": 10641 }, { "epoch": 4.8416742493175615, "grad_norm": 0.10863825059490467, "learning_rate": 1.2430969059433196e-08, "loss": 0.001, "step": 10642 }, { "epoch": 4.842129208371246, "grad_norm": 0.13898791697056886, "learning_rate": 1.2359892147474906e-08, "loss": 0.001, "step": 10643 }, { "epoch": 4.842584167424932, "grad_norm": 0.09535116707582544, "learning_rate": 1.2289018514232421e-08, "loss": 0.0006, "step": 10644 }, { "epoch": 4.843039126478617, "grad_norm": 0.010418065003441293, "learning_rate": 1.2218348165496663e-08, "loss": 0.0001, "step": 10645 }, { "epoch": 4.843494085532302, "grad_norm": 0.08946142683237501, "learning_rate": 1.2147881107043014e-08, "loss": 0.0004, "step": 10646 }, { "epoch": 4.843949044585988, "grad_norm": 0.10941653996776816, "learning_rate": 1.2077617344629366e-08, "loss": 0.0006, "step": 10647 }, { "epoch": 4.8444040036396725, "grad_norm": 0.022996663401435594, "learning_rate": 1.2007556883997518e-08, "loss": 0.0001, "step": 10648 }, { "epoch": 4.844858962693357, "grad_norm": 0.0748757052180644, "learning_rate": 1.193769973087261e-08, "loss": 0.0003, "step": 10649 }, { "epoch": 4.845313921747043, "grad_norm": 0.2054872259823408, "learning_rate": 1.1868045890962576e-08, "loss": 0.001, "step": 10650 }, { "epoch": 4.845768880800728, "grad_norm": 0.03970493245828275, "learning_rate": 1.1798595369959532e-08, "loss": 0.0002, "step": 10651 }, { "epoch": 4.846223839854413, "grad_norm": 0.07328908409755681, "learning_rate": 1.1729348173538934e-08, "loss": 0.0005, "step": 10652 }, { "epoch": 4.846678798908099, "grad_norm": 0.14408502652894722, "learning_rate": 1.166030430735876e-08, "loss": 0.0004, "step": 10653 }, { "epoch": 4.8471337579617835, "grad_norm": 0.012338397670819085, "learning_rate": 1.159146377706144e-08, "loss": 0.0001, "step": 10654 }, { "epoch": 4.847588717015468, "grad_norm": 0.1550600308188033, "learning_rate": 1.1522826588272196e-08, "loss": 0.0015, "step": 10655 }, { "epoch": 4.848043676069154, "grad_norm": 0.00621626578490091, "learning_rate": 1.1454392746599596e-08, "loss": 0.0, "step": 10656 }, { "epoch": 4.848498635122839, "grad_norm": 0.1151075597505322, "learning_rate": 1.1386162257636113e-08, "loss": 0.0011, "step": 10657 }, { "epoch": 4.848953594176524, "grad_norm": 0.040830925865118606, "learning_rate": 1.131813512695673e-08, "loss": 0.0001, "step": 10658 }, { "epoch": 4.84940855323021, "grad_norm": 0.059205997662293446, "learning_rate": 1.1250311360120335e-08, "loss": 0.0003, "step": 10659 }, { "epoch": 4.8498635122838945, "grad_norm": 0.05125190458130173, "learning_rate": 1.1182690962669719e-08, "loss": 0.0002, "step": 10660 }, { "epoch": 4.850318471337579, "grad_norm": 0.0739063545280596, "learning_rate": 1.1115273940130178e-08, "loss": 0.0004, "step": 10661 }, { "epoch": 4.850773430391265, "grad_norm": 0.07897196092041278, "learning_rate": 1.1048060298010644e-08, "loss": 0.0003, "step": 10662 }, { "epoch": 4.85122838944495, "grad_norm": 0.13374194460137226, "learning_rate": 1.0981050041803665e-08, "loss": 0.0012, "step": 10663 }, { "epoch": 4.851683348498635, "grad_norm": 0.09808763047972514, "learning_rate": 1.091424317698514e-08, "loss": 0.0005, "step": 10664 }, { "epoch": 4.852138307552321, "grad_norm": 0.01136048573689284, "learning_rate": 1.0847639709013757e-08, "loss": 0.0001, "step": 10665 }, { "epoch": 4.852593266606005, "grad_norm": 0.002531265472127422, "learning_rate": 1.0781239643332387e-08, "loss": 0.0, "step": 10666 }, { "epoch": 4.85304822565969, "grad_norm": 0.24635414640695227, "learning_rate": 1.0715042985366964e-08, "loss": 0.0017, "step": 10667 }, { "epoch": 4.853503184713376, "grad_norm": 0.26579660635290736, "learning_rate": 1.0649049740526774e-08, "loss": 0.0021, "step": 10668 }, { "epoch": 4.853958143767061, "grad_norm": 0.06290528861994506, "learning_rate": 1.0583259914204446e-08, "loss": 0.0003, "step": 10669 }, { "epoch": 4.854413102820746, "grad_norm": 0.02716847252320387, "learning_rate": 1.051767351177596e-08, "loss": 0.0002, "step": 10670 }, { "epoch": 4.8548680618744315, "grad_norm": 0.08626127587962669, "learning_rate": 1.045229053860064e-08, "loss": 0.0008, "step": 10671 }, { "epoch": 4.855323020928116, "grad_norm": 0.02572521171300294, "learning_rate": 1.038711100002171e-08, "loss": 0.0001, "step": 10672 }, { "epoch": 4.855777979981801, "grad_norm": 0.06940898677825351, "learning_rate": 1.032213490136491e-08, "loss": 0.0007, "step": 10673 }, { "epoch": 4.856232939035487, "grad_norm": 0.010164278037319804, "learning_rate": 1.0257362247939884e-08, "loss": 0.0001, "step": 10674 }, { "epoch": 4.856687898089172, "grad_norm": 0.02365102625086987, "learning_rate": 1.0192793045039894e-08, "loss": 0.0001, "step": 10675 }, { "epoch": 4.857142857142857, "grad_norm": 0.0325504817867162, "learning_rate": 1.0128427297940724e-08, "loss": 0.0002, "step": 10676 }, { "epoch": 4.8575978161965425, "grad_norm": 0.04629945805211338, "learning_rate": 1.006426501190233e-08, "loss": 0.0003, "step": 10677 }, { "epoch": 4.858052775250227, "grad_norm": 0.03705452712526486, "learning_rate": 1.000030619216802e-08, "loss": 0.0003, "step": 10678 }, { "epoch": 4.858507734303913, "grad_norm": 0.02296874509417839, "learning_rate": 9.936550843963888e-09, "loss": 0.0001, "step": 10679 }, { "epoch": 4.858962693357598, "grad_norm": 0.017040635391815225, "learning_rate": 9.872998972499381e-09, "loss": 0.0001, "step": 10680 }, { "epoch": 4.859417652411283, "grad_norm": 0.023592466934741017, "learning_rate": 9.8096505829684e-09, "loss": 0.0001, "step": 10681 }, { "epoch": 4.859872611464969, "grad_norm": 0.0472281430286813, "learning_rate": 9.746505680547358e-09, "loss": 0.0001, "step": 10682 }, { "epoch": 4.8603275705186535, "grad_norm": 0.29429678853937014, "learning_rate": 9.68356427039574e-09, "loss": 0.002, "step": 10683 }, { "epoch": 4.860782529572338, "grad_norm": 0.20445702568752483, "learning_rate": 9.62082635765721e-09, "loss": 0.0016, "step": 10684 }, { "epoch": 4.861237488626024, "grad_norm": 0.2463384769886679, "learning_rate": 9.558291947457943e-09, "loss": 0.0008, "step": 10685 }, { "epoch": 4.861692447679709, "grad_norm": 0.07990271286847846, "learning_rate": 9.495961044908852e-09, "loss": 0.0003, "step": 10686 }, { "epoch": 4.862147406733394, "grad_norm": 0.47251410709698694, "learning_rate": 9.433833655102253e-09, "loss": 0.0019, "step": 10687 }, { "epoch": 4.86260236578708, "grad_norm": 0.0067911614335490695, "learning_rate": 9.371909783116028e-09, "loss": 0.0, "step": 10688 }, { "epoch": 4.8630573248407645, "grad_norm": 0.11706981827814263, "learning_rate": 9.310189434009464e-09, "loss": 0.0018, "step": 10689 }, { "epoch": 4.863512283894449, "grad_norm": 0.03865280321396915, "learning_rate": 9.248672612826304e-09, "loss": 0.0001, "step": 10690 }, { "epoch": 4.863967242948135, "grad_norm": 0.1797691455024991, "learning_rate": 9.187359324593637e-09, "loss": 0.0016, "step": 10691 }, { "epoch": 4.86442220200182, "grad_norm": 0.1952259777902601, "learning_rate": 9.126249574321344e-09, "loss": 0.0028, "step": 10692 }, { "epoch": 4.864877161055505, "grad_norm": 0.24989496012218707, "learning_rate": 9.065343367003488e-09, "loss": 0.0022, "step": 10693 }, { "epoch": 4.865332120109191, "grad_norm": 0.1263327351988775, "learning_rate": 9.004640707616641e-09, "loss": 0.0007, "step": 10694 }, { "epoch": 4.8657870791628755, "grad_norm": 0.04595631100469024, "learning_rate": 8.944141601121559e-09, "loss": 0.0002, "step": 10695 }, { "epoch": 4.86624203821656, "grad_norm": 0.024646697985719748, "learning_rate": 8.88384605246151e-09, "loss": 0.0002, "step": 10696 }, { "epoch": 4.866696997270246, "grad_norm": 0.12302846648354072, "learning_rate": 8.823754066563662e-09, "loss": 0.0002, "step": 10697 }, { "epoch": 4.867151956323931, "grad_norm": 0.1369618364455303, "learning_rate": 8.763865648338809e-09, "loss": 0.0003, "step": 10698 }, { "epoch": 4.867606915377616, "grad_norm": 0.18102337848372435, "learning_rate": 8.70418080268054e-09, "loss": 0.0007, "step": 10699 }, { "epoch": 4.868061874431302, "grad_norm": 0.25779620572309364, "learning_rate": 8.644699534466061e-09, "loss": 0.0018, "step": 10700 }, { "epoch": 4.868516833484986, "grad_norm": 0.13039307259945296, "learning_rate": 8.585421848555653e-09, "loss": 0.0006, "step": 10701 }, { "epoch": 4.868971792538671, "grad_norm": 0.051289018830270386, "learning_rate": 8.526347749793495e-09, "loss": 0.0004, "step": 10702 }, { "epoch": 4.869426751592357, "grad_norm": 0.03259991655734656, "learning_rate": 8.467477243006838e-09, "loss": 0.0001, "step": 10703 }, { "epoch": 4.869881710646042, "grad_norm": 0.043673133771388574, "learning_rate": 8.408810333006278e-09, "loss": 0.0002, "step": 10704 }, { "epoch": 4.870336669699727, "grad_norm": 0.33654645231971725, "learning_rate": 8.350347024586036e-09, "loss": 0.0004, "step": 10705 }, { "epoch": 4.8707916287534125, "grad_norm": 0.2983156956811462, "learning_rate": 8.292087322522846e-09, "loss": 0.0018, "step": 10706 }, { "epoch": 4.871246587807097, "grad_norm": 0.2309292349566365, "learning_rate": 8.234031231578177e-09, "loss": 0.0012, "step": 10707 }, { "epoch": 4.871701546860782, "grad_norm": 0.04595287515934557, "learning_rate": 8.176178756495457e-09, "loss": 0.0003, "step": 10708 }, { "epoch": 4.872156505914468, "grad_norm": 0.012013293207284928, "learning_rate": 8.11852990200257e-09, "loss": 0.0001, "step": 10709 }, { "epoch": 4.872611464968153, "grad_norm": 0.3271314326084315, "learning_rate": 8.061084672810193e-09, "loss": 0.004, "step": 10710 }, { "epoch": 4.873066424021838, "grad_norm": 0.08073297118119954, "learning_rate": 8.003843073612627e-09, "loss": 0.0003, "step": 10711 }, { "epoch": 4.8735213830755235, "grad_norm": 0.3207634829796875, "learning_rate": 7.946805109086964e-09, "loss": 0.0024, "step": 10712 }, { "epoch": 4.873976342129208, "grad_norm": 0.19537371936377057, "learning_rate": 7.889970783894751e-09, "loss": 0.0011, "step": 10713 }, { "epoch": 4.874431301182893, "grad_norm": 0.2631053190623843, "learning_rate": 7.833340102679498e-09, "loss": 0.0009, "step": 10714 }, { "epoch": 4.874886260236579, "grad_norm": 0.03610842060962394, "learning_rate": 7.77691307006917e-09, "loss": 0.0001, "step": 10715 }, { "epoch": 4.875341219290264, "grad_norm": 0.06044937537570806, "learning_rate": 7.720689690674798e-09, "loss": 0.0004, "step": 10716 }, { "epoch": 4.875796178343949, "grad_norm": 0.05224609494302104, "learning_rate": 7.664669969090765e-09, "loss": 0.0004, "step": 10717 }, { "epoch": 4.8762511373976345, "grad_norm": 0.012461791140373608, "learning_rate": 7.60885390989452e-09, "loss": 0.0001, "step": 10718 }, { "epoch": 4.876706096451319, "grad_norm": 0.015878827940102914, "learning_rate": 7.553241517647136e-09, "loss": 0.0001, "step": 10719 }, { "epoch": 4.877161055505004, "grad_norm": 0.022786261694543397, "learning_rate": 7.497832796893311e-09, "loss": 0.0001, "step": 10720 }, { "epoch": 4.87761601455869, "grad_norm": 0.16257414753105004, "learning_rate": 7.442627752160259e-09, "loss": 0.0006, "step": 10721 }, { "epoch": 4.878070973612375, "grad_norm": 0.08709140645104503, "learning_rate": 7.387626387959368e-09, "loss": 0.0004, "step": 10722 }, { "epoch": 4.87852593266606, "grad_norm": 0.17221495361329678, "learning_rate": 7.332828708785378e-09, "loss": 0.0019, "step": 10723 }, { "epoch": 4.8789808917197455, "grad_norm": 0.09106326115741999, "learning_rate": 7.27823471911554e-09, "loss": 0.0007, "step": 10724 }, { "epoch": 4.87943585077343, "grad_norm": 0.013352688010834203, "learning_rate": 7.223844423411564e-09, "loss": 0.0001, "step": 10725 }, { "epoch": 4.879890809827115, "grad_norm": 0.12471295731189794, "learning_rate": 7.169657826117671e-09, "loss": 0.0008, "step": 10726 }, { "epoch": 4.880345768880801, "grad_norm": 0.07839991832838678, "learning_rate": 7.115674931661987e-09, "loss": 0.0004, "step": 10727 }, { "epoch": 4.880800727934486, "grad_norm": 0.2169681274811643, "learning_rate": 7.061895744455149e-09, "loss": 0.0015, "step": 10728 }, { "epoch": 4.881255686988171, "grad_norm": 0.10375007200417433, "learning_rate": 7.008320268892532e-09, "loss": 0.0003, "step": 10729 }, { "epoch": 4.8817106460418564, "grad_norm": 0.26736290732151613, "learning_rate": 6.9549485093514665e-09, "loss": 0.0031, "step": 10730 }, { "epoch": 4.882165605095541, "grad_norm": 0.012610600228071363, "learning_rate": 6.901780470193742e-09, "loss": 0.0, "step": 10731 }, { "epoch": 4.882620564149226, "grad_norm": 0.043349329457980784, "learning_rate": 6.8488161557639376e-09, "loss": 0.0003, "step": 10732 }, { "epoch": 4.883075523202912, "grad_norm": 0.18269073493055646, "learning_rate": 6.796055570389426e-09, "loss": 0.0018, "step": 10733 }, { "epoch": 4.883530482256597, "grad_norm": 0.022190874256047672, "learning_rate": 6.743498718382591e-09, "loss": 0.0001, "step": 10734 }, { "epoch": 4.883985441310282, "grad_norm": 0.06271172604656462, "learning_rate": 6.691145604037219e-09, "loss": 0.0002, "step": 10735 }, { "epoch": 4.884440400363967, "grad_norm": 0.018468312582681775, "learning_rate": 6.638996231631834e-09, "loss": 0.0001, "step": 10736 }, { "epoch": 4.884895359417652, "grad_norm": 0.08521884350899812, "learning_rate": 6.5870506054277475e-09, "loss": 0.0004, "step": 10737 }, { "epoch": 4.885350318471337, "grad_norm": 0.021990416113060986, "learning_rate": 6.5353087296696205e-09, "loss": 0.0001, "step": 10738 }, { "epoch": 4.885805277525023, "grad_norm": 0.11464217143437122, "learning_rate": 6.483770608586016e-09, "loss": 0.0014, "step": 10739 }, { "epoch": 4.886260236578708, "grad_norm": 0.09681486194531767, "learning_rate": 6.43243624638773e-09, "loss": 0.0008, "step": 10740 }, { "epoch": 4.886715195632393, "grad_norm": 0.07807853268236313, "learning_rate": 6.3813056472700194e-09, "loss": 0.0003, "step": 10741 }, { "epoch": 4.887170154686078, "grad_norm": 0.09501231667309604, "learning_rate": 6.330378815410932e-09, "loss": 0.0005, "step": 10742 }, { "epoch": 4.887625113739763, "grad_norm": 0.023517815230373215, "learning_rate": 6.2796557549718585e-09, "loss": 0.0001, "step": 10743 }, { "epoch": 4.888080072793448, "grad_norm": 0.06822269878541218, "learning_rate": 6.229136470098096e-09, "loss": 0.0002, "step": 10744 }, { "epoch": 4.888535031847134, "grad_norm": 0.032309589538568106, "learning_rate": 6.178820964917176e-09, "loss": 0.0001, "step": 10745 }, { "epoch": 4.888989990900819, "grad_norm": 0.09521324379961585, "learning_rate": 6.1287092435413645e-09, "loss": 0.0005, "step": 10746 }, { "epoch": 4.889444949954504, "grad_norm": 0.020280525694349932, "learning_rate": 6.078801310064886e-09, "loss": 0.0001, "step": 10747 }, { "epoch": 4.889899909008189, "grad_norm": 0.02084349307358039, "learning_rate": 6.029097168566422e-09, "loss": 0.0001, "step": 10748 }, { "epoch": 4.890354868061874, "grad_norm": 0.21450218431073484, "learning_rate": 5.979596823107448e-09, "loss": 0.0015, "step": 10749 }, { "epoch": 4.890809827115559, "grad_norm": 0.053570930832499204, "learning_rate": 5.930300277732781e-09, "loss": 0.0002, "step": 10750 }, { "epoch": 4.891264786169245, "grad_norm": 0.22287641162416144, "learning_rate": 5.881207536471145e-09, "loss": 0.0032, "step": 10751 }, { "epoch": 4.89171974522293, "grad_norm": 0.028368062142758457, "learning_rate": 5.832318603333776e-09, "loss": 0.0002, "step": 10752 }, { "epoch": 4.892174704276615, "grad_norm": 0.03349752712986947, "learning_rate": 5.783633482315809e-09, "loss": 0.0002, "step": 10753 }, { "epoch": 4.8926296633303, "grad_norm": 0.0726414633832181, "learning_rate": 5.735152177395453e-09, "loss": 0.0005, "step": 10754 }, { "epoch": 4.893084622383985, "grad_norm": 0.22155783597213108, "learning_rate": 5.686874692534538e-09, "loss": 0.0006, "step": 10755 }, { "epoch": 4.893539581437671, "grad_norm": 0.03998132307930347, "learning_rate": 5.6388010316779655e-09, "loss": 0.0002, "step": 10756 }, { "epoch": 4.893994540491356, "grad_norm": 0.15277341735494518, "learning_rate": 5.59093119875398e-09, "loss": 0.0014, "step": 10757 }, { "epoch": 4.894449499545041, "grad_norm": 0.08819398922081484, "learning_rate": 5.54326519767473e-09, "loss": 0.0004, "step": 10758 }, { "epoch": 4.8949044585987265, "grad_norm": 0.2819910584044524, "learning_rate": 5.495803032334879e-09, "loss": 0.002, "step": 10759 }, { "epoch": 4.895359417652411, "grad_norm": 0.056191180144579286, "learning_rate": 5.448544706612713e-09, "loss": 0.0003, "step": 10760 }, { "epoch": 4.895814376706096, "grad_norm": 0.1061936600676431, "learning_rate": 5.401490224370421e-09, "loss": 0.0011, "step": 10761 }, { "epoch": 4.896269335759782, "grad_norm": 0.008467724505777575, "learning_rate": 5.3546395894527035e-09, "loss": 0.0, "step": 10762 }, { "epoch": 4.896724294813467, "grad_norm": 0.03816506896896154, "learning_rate": 5.307992805688445e-09, "loss": 0.0002, "step": 10763 }, { "epoch": 4.897179253867152, "grad_norm": 0.052276967965756684, "learning_rate": 5.2615498768887605e-09, "loss": 0.0002, "step": 10764 }, { "epoch": 4.8976342129208374, "grad_norm": 0.15369685607932526, "learning_rate": 5.21531080684895e-09, "loss": 0.0015, "step": 10765 }, { "epoch": 4.898089171974522, "grad_norm": 0.07463738148535268, "learning_rate": 5.1692755993479335e-09, "loss": 0.0004, "step": 10766 }, { "epoch": 4.898544131028207, "grad_norm": 0.10072155516125267, "learning_rate": 5.12344425814687e-09, "loss": 0.0009, "step": 10767 }, { "epoch": 4.898999090081893, "grad_norm": 0.4790635050764603, "learning_rate": 5.077816786991374e-09, "loss": 0.0013, "step": 10768 }, { "epoch": 4.899454049135578, "grad_norm": 0.06502760688344285, "learning_rate": 5.032393189609852e-09, "loss": 0.0005, "step": 10769 }, { "epoch": 4.899909008189263, "grad_norm": 0.15158128343612481, "learning_rate": 4.98717346971378e-09, "loss": 0.0011, "step": 10770 }, { "epoch": 4.900363967242948, "grad_norm": 0.07186305645956985, "learning_rate": 4.942157630998534e-09, "loss": 0.0015, "step": 10771 }, { "epoch": 4.900818926296633, "grad_norm": 0.02876685690923764, "learning_rate": 4.897345677142562e-09, "loss": 0.0002, "step": 10772 }, { "epoch": 4.901273885350318, "grad_norm": 0.05184114454668266, "learning_rate": 4.852737611807656e-09, "loss": 0.0002, "step": 10773 }, { "epoch": 4.901728844404004, "grad_norm": 0.02984262460721666, "learning_rate": 4.808333438639235e-09, "loss": 0.0001, "step": 10774 }, { "epoch": 4.902183803457689, "grad_norm": 0.24710870076898026, "learning_rate": 4.764133161265505e-09, "loss": 0.0011, "step": 10775 }, { "epoch": 4.902638762511374, "grad_norm": 0.13345270771252998, "learning_rate": 4.720136783298579e-09, "loss": 0.0009, "step": 10776 }, { "epoch": 4.903093721565059, "grad_norm": 0.08420722929156499, "learning_rate": 4.676344308333081e-09, "loss": 0.0004, "step": 10777 }, { "epoch": 4.903548680618744, "grad_norm": 0.014289134449654297, "learning_rate": 4.632755739948369e-09, "loss": 0.0001, "step": 10778 }, { "epoch": 4.904003639672429, "grad_norm": 0.11940292017473539, "learning_rate": 4.589371081705762e-09, "loss": 0.0005, "step": 10779 }, { "epoch": 4.904458598726115, "grad_norm": 0.2019328564174436, "learning_rate": 4.54619033715048e-09, "loss": 0.0018, "step": 10780 }, { "epoch": 4.9049135577798, "grad_norm": 0.10271461735839835, "learning_rate": 4.5032135098110884e-09, "loss": 0.0008, "step": 10781 }, { "epoch": 4.905368516833485, "grad_norm": 0.06527753742725934, "learning_rate": 4.460440603199778e-09, "loss": 0.0003, "step": 10782 }, { "epoch": 4.90582347588717, "grad_norm": 0.05785288412326894, "learning_rate": 4.417871620811254e-09, "loss": 0.0005, "step": 10783 }, { "epoch": 4.906278434940855, "grad_norm": 0.11773247130232112, "learning_rate": 4.375506566124676e-09, "loss": 0.0003, "step": 10784 }, { "epoch": 4.90673339399454, "grad_norm": 0.01316470677449828, "learning_rate": 4.333345442601167e-09, "loss": 0.0001, "step": 10785 }, { "epoch": 4.907188353048226, "grad_norm": 0.1264564993119623, "learning_rate": 4.291388253686579e-09, "loss": 0.0011, "step": 10786 }, { "epoch": 4.907643312101911, "grad_norm": 0.08094700200284331, "learning_rate": 4.249635002809005e-09, "loss": 0.0004, "step": 10787 }, { "epoch": 4.9080982711555965, "grad_norm": 0.06692322337082972, "learning_rate": 4.208085693380715e-09, "loss": 0.0003, "step": 10788 }, { "epoch": 4.908553230209281, "grad_norm": 0.035027538332016425, "learning_rate": 4.16674032879677e-09, "loss": 0.0001, "step": 10789 }, { "epoch": 4.909008189262966, "grad_norm": 0.039040265859717795, "learning_rate": 4.12559891243558e-09, "loss": 0.0002, "step": 10790 }, { "epoch": 4.909463148316652, "grad_norm": 0.038739144650653715, "learning_rate": 4.084661447659178e-09, "loss": 0.0001, "step": 10791 }, { "epoch": 4.909918107370337, "grad_norm": 0.4601700286107576, "learning_rate": 4.043927937812941e-09, "loss": 0.0045, "step": 10792 }, { "epoch": 4.910373066424022, "grad_norm": 0.030421974282308653, "learning_rate": 4.003398386225321e-09, "loss": 0.0002, "step": 10793 }, { "epoch": 4.9108280254777075, "grad_norm": 0.05868278877668121, "learning_rate": 3.963072796208112e-09, "loss": 0.0001, "step": 10794 }, { "epoch": 4.911282984531392, "grad_norm": 0.19695952307931142, "learning_rate": 3.922951171056455e-09, "loss": 0.0014, "step": 10795 }, { "epoch": 4.911737943585077, "grad_norm": 0.09267305051334628, "learning_rate": 3.8830335140491174e-09, "loss": 0.0003, "step": 10796 }, { "epoch": 4.912192902638763, "grad_norm": 0.11406934619726468, "learning_rate": 3.8433198284479335e-09, "loss": 0.0007, "step": 10797 }, { "epoch": 4.912647861692448, "grad_norm": 0.03296025156713705, "learning_rate": 3.8038101174980856e-09, "loss": 0.0001, "step": 10798 }, { "epoch": 4.913102820746133, "grad_norm": 0.21727232731810692, "learning_rate": 3.764504384428103e-09, "loss": 0.0011, "step": 10799 }, { "epoch": 4.9135577797998184, "grad_norm": 0.04579125370645926, "learning_rate": 3.725402632450137e-09, "loss": 0.0003, "step": 10800 }, { "epoch": 4.914012738853503, "grad_norm": 0.19212182155948104, "learning_rate": 3.6865048647588554e-09, "loss": 0.0013, "step": 10801 }, { "epoch": 4.914467697907188, "grad_norm": 0.07076208843180638, "learning_rate": 3.6478110845333814e-09, "loss": 0.0004, "step": 10802 }, { "epoch": 4.914922656960874, "grad_norm": 0.17402024858920784, "learning_rate": 3.6093212949353527e-09, "loss": 0.0015, "step": 10803 }, { "epoch": 4.915377616014559, "grad_norm": 0.03483647503626536, "learning_rate": 3.5710354991100317e-09, "loss": 0.0001, "step": 10804 }, { "epoch": 4.915832575068244, "grad_norm": 0.13725927228994173, "learning_rate": 3.532953700185748e-09, "loss": 0.001, "step": 10805 }, { "epoch": 4.916287534121929, "grad_norm": 0.05575059296457214, "learning_rate": 3.495075901274736e-09, "loss": 0.0005, "step": 10806 }, { "epoch": 4.916742493175614, "grad_norm": 0.01203843804924826, "learning_rate": 3.457402105471741e-09, "loss": 0.0, "step": 10807 }, { "epoch": 4.917197452229299, "grad_norm": 0.055064194394880195, "learning_rate": 3.4199323158556897e-09, "loss": 0.0004, "step": 10808 }, { "epoch": 4.917652411282985, "grad_norm": 0.010347218878500025, "learning_rate": 3.3826665354882994e-09, "loss": 0.0, "step": 10809 }, { "epoch": 4.91810737033667, "grad_norm": 0.08209207101291814, "learning_rate": 3.3456047674149118e-09, "loss": 0.0006, "step": 10810 }, { "epoch": 4.918562329390355, "grad_norm": 0.005659168542673707, "learning_rate": 3.308747014663938e-09, "loss": 0.0, "step": 10811 }, { "epoch": 4.91901728844404, "grad_norm": 0.03781227010264688, "learning_rate": 3.2720932802468573e-09, "loss": 0.0003, "step": 10812 }, { "epoch": 4.919472247497725, "grad_norm": 0.0930607104643984, "learning_rate": 3.2356435671596076e-09, "loss": 0.0004, "step": 10813 }, { "epoch": 4.91992720655141, "grad_norm": 0.030348217367105943, "learning_rate": 3.199397878380084e-09, "loss": 0.0001, "step": 10814 }, { "epoch": 4.920382165605096, "grad_norm": 0.05070933568341646, "learning_rate": 3.1633562168700836e-09, "loss": 0.0001, "step": 10815 }, { "epoch": 4.920837124658781, "grad_norm": 0.1609911438453276, "learning_rate": 3.1275185855753064e-09, "loss": 0.0015, "step": 10816 }, { "epoch": 4.921292083712466, "grad_norm": 0.020426626765178472, "learning_rate": 3.091884987423965e-09, "loss": 0.0001, "step": 10817 }, { "epoch": 4.921747042766151, "grad_norm": 0.07865377255384368, "learning_rate": 3.0564554253276204e-09, "loss": 0.0004, "step": 10818 }, { "epoch": 4.922202001819836, "grad_norm": 0.4456754898551054, "learning_rate": 3.0212299021817326e-09, "loss": 0.004, "step": 10819 }, { "epoch": 4.922656960873521, "grad_norm": 0.14393492276536868, "learning_rate": 2.9862084208648336e-09, "loss": 0.0016, "step": 10820 }, { "epoch": 4.923111919927207, "grad_norm": 0.03999997254202869, "learning_rate": 2.951390984238245e-09, "loss": 0.0001, "step": 10821 }, { "epoch": 4.923566878980892, "grad_norm": 0.012243810596870153, "learning_rate": 2.916777595147746e-09, "loss": 0.0001, "step": 10822 }, { "epoch": 4.924021838034577, "grad_norm": 0.014516986930822552, "learning_rate": 2.8823682564210752e-09, "loss": 0.0001, "step": 10823 }, { "epoch": 4.924476797088262, "grad_norm": 0.17510641002354563, "learning_rate": 2.848162970870705e-09, "loss": 0.0016, "step": 10824 }, { "epoch": 4.924931756141947, "grad_norm": 0.10572563824708058, "learning_rate": 2.8141617412913435e-09, "loss": 0.0006, "step": 10825 }, { "epoch": 4.925386715195632, "grad_norm": 0.2860654143912229, "learning_rate": 2.7803645704616023e-09, "loss": 0.0029, "step": 10826 }, { "epoch": 4.925841674249318, "grad_norm": 0.07959005252540882, "learning_rate": 2.746771461142883e-09, "loss": 0.0003, "step": 10827 }, { "epoch": 4.926296633303003, "grad_norm": 0.14831856625158293, "learning_rate": 2.71338241608049e-09, "loss": 0.0012, "step": 10828 }, { "epoch": 4.926751592356688, "grad_norm": 0.21399185179301536, "learning_rate": 2.6801974380030736e-09, "loss": 0.0016, "step": 10829 }, { "epoch": 4.927206551410373, "grad_norm": 0.1359178811261437, "learning_rate": 2.6472165296220764e-09, "loss": 0.0011, "step": 10830 }, { "epoch": 4.927661510464058, "grad_norm": 0.15340046970508395, "learning_rate": 2.6144396936325645e-09, "loss": 0.0008, "step": 10831 }, { "epoch": 4.928116469517743, "grad_norm": 0.03161350677974272, "learning_rate": 2.5818669327129507e-09, "loss": 0.0002, "step": 10832 }, { "epoch": 4.928571428571429, "grad_norm": 0.07974613581773542, "learning_rate": 2.5494982495249955e-09, "loss": 0.0006, "step": 10833 }, { "epoch": 4.929026387625114, "grad_norm": 0.17336254314001331, "learning_rate": 2.5173336467135266e-09, "loss": 0.0015, "step": 10834 }, { "epoch": 4.929481346678799, "grad_norm": 0.2921408057008554, "learning_rate": 2.485373126906998e-09, "loss": 0.0046, "step": 10835 }, { "epoch": 4.929936305732484, "grad_norm": 0.023715005466949466, "learning_rate": 2.453616692717209e-09, "loss": 0.0001, "step": 10836 }, { "epoch": 4.930391264786169, "grad_norm": 0.09205495564282878, "learning_rate": 2.4220643467387506e-09, "loss": 0.0003, "step": 10837 }, { "epoch": 4.930846223839854, "grad_norm": 0.020637640524944885, "learning_rate": 2.390716091550671e-09, "loss": 0.0001, "step": 10838 }, { "epoch": 4.93130118289354, "grad_norm": 0.12324005692927724, "learning_rate": 2.3595719297139776e-09, "loss": 0.0017, "step": 10839 }, { "epoch": 4.931756141947225, "grad_norm": 0.018875205465060305, "learning_rate": 2.3286318637738557e-09, "loss": 0.0001, "step": 10840 }, { "epoch": 4.9322111010009095, "grad_norm": 0.07510952410206351, "learning_rate": 2.297895896258284e-09, "loss": 0.0003, "step": 10841 }, { "epoch": 4.932666060054595, "grad_norm": 0.04143944039137485, "learning_rate": 2.26736402967942e-09, "loss": 0.0002, "step": 10842 }, { "epoch": 4.93312101910828, "grad_norm": 0.1601130335424025, "learning_rate": 2.2370362665319333e-09, "loss": 0.0017, "step": 10843 }, { "epoch": 4.933575978161965, "grad_norm": 0.07191799273384368, "learning_rate": 2.206912609293843e-09, "loss": 0.0004, "step": 10844 }, { "epoch": 4.934030937215651, "grad_norm": 0.02775583046315913, "learning_rate": 2.1769930604270683e-09, "loss": 0.0001, "step": 10845 }, { "epoch": 4.934485896269336, "grad_norm": 0.013180142393612782, "learning_rate": 2.14727762237632e-09, "loss": 0.0001, "step": 10846 }, { "epoch": 4.9349408553230205, "grad_norm": 0.07255891772865697, "learning_rate": 2.1177662975699343e-09, "loss": 0.0003, "step": 10847 }, { "epoch": 4.935395814376706, "grad_norm": 0.03307756484056483, "learning_rate": 2.0884590884193144e-09, "loss": 0.0002, "step": 10848 }, { "epoch": 4.935850773430391, "grad_norm": 0.020586324753318924, "learning_rate": 2.0593559973192125e-09, "loss": 0.0001, "step": 10849 }, { "epoch": 4.936305732484076, "grad_norm": 0.0424489464384944, "learning_rate": 2.030457026648003e-09, "loss": 0.0002, "step": 10850 }, { "epoch": 4.936760691537762, "grad_norm": 0.135387365140251, "learning_rate": 2.0017621787671304e-09, "loss": 0.0008, "step": 10851 }, { "epoch": 4.937215650591447, "grad_norm": 0.03057885414582324, "learning_rate": 1.973271456021386e-09, "loss": 0.0002, "step": 10852 }, { "epoch": 4.9376706096451315, "grad_norm": 0.06970959626856178, "learning_rate": 1.9449848607391853e-09, "loss": 0.0002, "step": 10853 }, { "epoch": 4.938125568698817, "grad_norm": 0.3029430472726988, "learning_rate": 1.9169023952311795e-09, "loss": 0.001, "step": 10854 }, { "epoch": 4.938580527752502, "grad_norm": 0.14306138987574585, "learning_rate": 1.8890240617930323e-09, "loss": 0.0015, "step": 10855 }, { "epoch": 4.939035486806187, "grad_norm": 0.07525688739612568, "learning_rate": 1.8613498627023664e-09, "loss": 0.0002, "step": 10856 }, { "epoch": 4.939490445859873, "grad_norm": 0.026767637562538076, "learning_rate": 1.8338798002207059e-09, "loss": 0.0001, "step": 10857 }, { "epoch": 4.939945404913558, "grad_norm": 0.1913971986403899, "learning_rate": 1.8066138765926445e-09, "loss": 0.0006, "step": 10858 }, { "epoch": 4.9404003639672425, "grad_norm": 0.2675112333481013, "learning_rate": 1.779552094046677e-09, "loss": 0.0019, "step": 10859 }, { "epoch": 4.940855323020928, "grad_norm": 0.06240078981032583, "learning_rate": 1.7526944547935355e-09, "loss": 0.0005, "step": 10860 }, { "epoch": 4.941310282074613, "grad_norm": 0.23937756941425617, "learning_rate": 1.726040961028408e-09, "loss": 0.0022, "step": 10861 }, { "epoch": 4.941765241128298, "grad_norm": 0.087582179815932, "learning_rate": 1.699591614928997e-09, "loss": 0.0003, "step": 10862 }, { "epoch": 4.942220200181984, "grad_norm": 0.031827879320970816, "learning_rate": 1.6733464186566295e-09, "loss": 0.0002, "step": 10863 }, { "epoch": 4.942675159235669, "grad_norm": 0.08570218854803925, "learning_rate": 1.6473053743562561e-09, "loss": 0.0003, "step": 10864 }, { "epoch": 4.943130118289354, "grad_norm": 0.2552021774150249, "learning_rate": 1.6214684841556194e-09, "loss": 0.0019, "step": 10865 }, { "epoch": 4.943585077343039, "grad_norm": 0.049785385582254336, "learning_rate": 1.5958357501658084e-09, "loss": 0.0002, "step": 10866 }, { "epoch": 4.944040036396724, "grad_norm": 0.011916742424098022, "learning_rate": 1.5704071744818138e-09, "loss": 0.0001, "step": 10867 }, { "epoch": 4.94449499545041, "grad_norm": 0.07238912748022926, "learning_rate": 1.5451827591811407e-09, "loss": 0.0003, "step": 10868 }, { "epoch": 4.944949954504095, "grad_norm": 0.055883856134003915, "learning_rate": 1.5201625063251956e-09, "loss": 0.0003, "step": 10869 }, { "epoch": 4.94540491355778, "grad_norm": 0.026710860533693148, "learning_rate": 1.4953464179587319e-09, "loss": 0.0001, "step": 10870 }, { "epoch": 4.945859872611465, "grad_norm": 0.11037501558273588, "learning_rate": 1.4707344961092939e-09, "loss": 0.001, "step": 10871 }, { "epoch": 4.94631483166515, "grad_norm": 0.14545384336078984, "learning_rate": 1.4463267427883287e-09, "loss": 0.001, "step": 10872 }, { "epoch": 4.946769790718835, "grad_norm": 0.013658141057336178, "learning_rate": 1.4221231599900743e-09, "loss": 0.0001, "step": 10873 }, { "epoch": 4.947224749772521, "grad_norm": 0.04306260033346582, "learning_rate": 1.3981237496923927e-09, "loss": 0.0002, "step": 10874 }, { "epoch": 4.947679708826206, "grad_norm": 0.061329698075003075, "learning_rate": 1.3743285138564932e-09, "loss": 0.0003, "step": 10875 }, { "epoch": 4.9481346678798905, "grad_norm": 0.22964863152660372, "learning_rate": 1.3507374544266538e-09, "loss": 0.0032, "step": 10876 }, { "epoch": 4.948589626933576, "grad_norm": 0.104660397099372, "learning_rate": 1.3273505733310543e-09, "loss": 0.0007, "step": 10877 }, { "epoch": 4.949044585987261, "grad_norm": 0.32100817777178603, "learning_rate": 1.304167872480111e-09, "loss": 0.002, "step": 10878 }, { "epoch": 4.949499545040946, "grad_norm": 0.08361419174708681, "learning_rate": 1.2811893537686971e-09, "loss": 0.0006, "step": 10879 }, { "epoch": 4.949954504094632, "grad_norm": 0.01938505025664205, "learning_rate": 1.2584150190744772e-09, "loss": 0.0001, "step": 10880 }, { "epoch": 4.950409463148317, "grad_norm": 0.043168694484481566, "learning_rate": 1.235844870258185e-09, "loss": 0.0003, "step": 10881 }, { "epoch": 4.9508644222020015, "grad_norm": 0.04867863221408645, "learning_rate": 1.2134789091644561e-09, "loss": 0.0003, "step": 10882 }, { "epoch": 4.951319381255687, "grad_norm": 0.028582815546827723, "learning_rate": 1.1913171376207178e-09, "loss": 0.0001, "step": 10883 }, { "epoch": 4.951774340309372, "grad_norm": 0.06908974378246797, "learning_rate": 1.1693595574382989e-09, "loss": 0.0008, "step": 10884 }, { "epoch": 4.952229299363057, "grad_norm": 0.010541427843068805, "learning_rate": 1.1476061704107645e-09, "loss": 0.0, "step": 10885 }, { "epoch": 4.952684258416743, "grad_norm": 0.030625064807547665, "learning_rate": 1.1260569783164144e-09, "loss": 0.0002, "step": 10886 }, { "epoch": 4.953139217470428, "grad_norm": 0.05002813040806078, "learning_rate": 1.104711982915785e-09, "loss": 0.0002, "step": 10887 }, { "epoch": 4.9535941765241125, "grad_norm": 0.04561406806056215, "learning_rate": 1.0835711859533139e-09, "loss": 0.0006, "step": 10888 }, { "epoch": 4.954049135577798, "grad_norm": 0.10366546644546376, "learning_rate": 1.0626345891562305e-09, "loss": 0.0003, "step": 10889 }, { "epoch": 4.954504094631483, "grad_norm": 0.10400673656001036, "learning_rate": 1.0419021942356666e-09, "loss": 0.0006, "step": 10890 }, { "epoch": 4.954959053685168, "grad_norm": 0.09796958462796718, "learning_rate": 1.0213740028855445e-09, "loss": 0.0016, "step": 10891 }, { "epoch": 4.955414012738854, "grad_norm": 0.08656865480161717, "learning_rate": 1.001050016783689e-09, "loss": 0.0006, "step": 10892 }, { "epoch": 4.955868971792539, "grad_norm": 0.05906251459715766, "learning_rate": 9.809302375904385e-10, "loss": 0.0002, "step": 10893 }, { "epoch": 4.9563239308462235, "grad_norm": 0.04537254372056211, "learning_rate": 9.610146669500332e-10, "loss": 0.0003, "step": 10894 }, { "epoch": 4.956778889899909, "grad_norm": 0.08259705981431882, "learning_rate": 9.4130330649006e-10, "loss": 0.0006, "step": 10895 }, { "epoch": 4.957233848953594, "grad_norm": 0.037646128633683974, "learning_rate": 9.217961578211754e-10, "loss": 0.0002, "step": 10896 }, { "epoch": 4.957688808007279, "grad_norm": 0.024859073167431175, "learning_rate": 9.024932225371041e-10, "loss": 0.0002, "step": 10897 }, { "epoch": 4.958143767060965, "grad_norm": 0.020090164673679363, "learning_rate": 8.833945022157509e-10, "loss": 0.0001, "step": 10898 }, { "epoch": 4.95859872611465, "grad_norm": 0.04377001935374799, "learning_rate": 8.64499998417534e-10, "loss": 0.0002, "step": 10899 }, { "epoch": 4.959053685168335, "grad_norm": 0.08164743862524443, "learning_rate": 8.458097126862186e-10, "loss": 0.0006, "step": 10900 }, { "epoch": 4.95950864422202, "grad_norm": 0.11977276922118893, "learning_rate": 8.273236465491941e-10, "loss": 0.0011, "step": 10901 }, { "epoch": 4.959963603275705, "grad_norm": 0.20934034343041705, "learning_rate": 8.090418015171964e-10, "loss": 0.0044, "step": 10902 }, { "epoch": 4.960418562329391, "grad_norm": 0.027982396731803433, "learning_rate": 7.909641790840306e-10, "loss": 0.0001, "step": 10903 }, { "epoch": 4.960873521383076, "grad_norm": 0.008348903935252006, "learning_rate": 7.730907807271259e-10, "loss": 0.0, "step": 10904 }, { "epoch": 4.961328480436761, "grad_norm": 0.019038708141056238, "learning_rate": 7.554216079067033e-10, "loss": 0.0001, "step": 10905 }, { "epoch": 4.961783439490446, "grad_norm": 0.14137754728814259, "learning_rate": 7.379566620666079e-10, "loss": 0.0008, "step": 10906 }, { "epoch": 4.962238398544131, "grad_norm": 0.1635064533294399, "learning_rate": 7.206959446343087e-10, "loss": 0.002, "step": 10907 }, { "epoch": 4.962693357597816, "grad_norm": 0.32461420569749505, "learning_rate": 7.036394570200667e-10, "loss": 0.0006, "step": 10908 }, { "epoch": 4.963148316651502, "grad_norm": 0.24432507869609316, "learning_rate": 6.867872006174892e-10, "loss": 0.0008, "step": 10909 }, { "epoch": 4.963603275705187, "grad_norm": 0.030627373768288363, "learning_rate": 6.701391768040854e-10, "loss": 0.0002, "step": 10910 }, { "epoch": 4.9640582347588715, "grad_norm": 0.35699961870664143, "learning_rate": 6.536953869398788e-10, "loss": 0.0028, "step": 10911 }, { "epoch": 4.964513193812557, "grad_norm": 0.14016008548971726, "learning_rate": 6.37455832368794e-10, "loss": 0.0009, "step": 10912 }, { "epoch": 4.964968152866242, "grad_norm": 0.01834258601333133, "learning_rate": 6.214205144178254e-10, "loss": 0.0001, "step": 10913 }, { "epoch": 4.965423111919927, "grad_norm": 0.04282650021372601, "learning_rate": 6.055894343973135e-10, "loss": 0.0001, "step": 10914 }, { "epoch": 4.965878070973613, "grad_norm": 0.031137340053233754, "learning_rate": 5.899625936009457e-10, "loss": 0.0002, "step": 10915 }, { "epoch": 4.966333030027298, "grad_norm": 0.10060682115353033, "learning_rate": 5.745399933054785e-10, "loss": 0.001, "step": 10916 }, { "epoch": 4.9667879890809825, "grad_norm": 0.12933781290084378, "learning_rate": 5.593216347712927e-10, "loss": 0.0009, "step": 10917 }, { "epoch": 4.967242948134668, "grad_norm": 0.1234925377298162, "learning_rate": 5.443075192418379e-10, "loss": 0.0008, "step": 10918 }, { "epoch": 4.967697907188353, "grad_norm": 0.3361129414695146, "learning_rate": 5.294976479441882e-10, "loss": 0.0022, "step": 10919 }, { "epoch": 4.968152866242038, "grad_norm": 0.08716570611623747, "learning_rate": 5.148920220887643e-10, "loss": 0.0004, "step": 10920 }, { "epoch": 4.968607825295724, "grad_norm": 0.10211993259680609, "learning_rate": 5.004906428685008e-10, "loss": 0.0004, "step": 10921 }, { "epoch": 4.969062784349409, "grad_norm": 0.29167435233550887, "learning_rate": 4.862935114605117e-10, "loss": 0.0031, "step": 10922 }, { "epoch": 4.9695177434030935, "grad_norm": 0.03397125741266478, "learning_rate": 4.723006290249799e-10, "loss": 0.0002, "step": 10923 }, { "epoch": 4.969972702456779, "grad_norm": 0.0633152258802144, "learning_rate": 4.5851199670543523e-10, "loss": 0.0004, "step": 10924 }, { "epoch": 4.970427661510464, "grad_norm": 0.04647621800029054, "learning_rate": 4.4492761562819896e-10, "loss": 0.0002, "step": 10925 }, { "epoch": 4.970882620564149, "grad_norm": 0.3387909442187069, "learning_rate": 4.315474869037717e-10, "loss": 0.0027, "step": 10926 }, { "epoch": 4.971337579617835, "grad_norm": 0.10907026629776222, "learning_rate": 4.183716116251679e-10, "loss": 0.0004, "step": 10927 }, { "epoch": 4.97179253867152, "grad_norm": 0.06799915958199952, "learning_rate": 4.0539999086930403e-10, "loss": 0.0003, "step": 10928 }, { "epoch": 4.9722474977252045, "grad_norm": 0.07993464411227007, "learning_rate": 3.9263262569616547e-10, "loss": 0.0007, "step": 10929 }, { "epoch": 4.97270245677889, "grad_norm": 0.2126204687706394, "learning_rate": 3.800695171488067e-10, "loss": 0.0034, "step": 10930 }, { "epoch": 4.973157415832575, "grad_norm": 0.016889216323227048, "learning_rate": 3.6771066625418405e-10, "loss": 0.0001, "step": 10931 }, { "epoch": 4.97361237488626, "grad_norm": 0.052046488759678315, "learning_rate": 3.5555607402176783e-10, "loss": 0.0002, "step": 10932 }, { "epoch": 4.974067333939946, "grad_norm": 0.07235155231451798, "learning_rate": 3.4360574144520764e-10, "loss": 0.0008, "step": 10933 }, { "epoch": 4.974522292993631, "grad_norm": 0.1265906599229768, "learning_rate": 3.3185966950066705e-10, "loss": 0.0009, "step": 10934 }, { "epoch": 4.9749772520473154, "grad_norm": 0.1617360585105639, "learning_rate": 3.20317859148489e-10, "loss": 0.0017, "step": 10935 }, { "epoch": 4.975432211101001, "grad_norm": 0.09979470516767376, "learning_rate": 3.0898031133125283e-10, "loss": 0.0004, "step": 10936 }, { "epoch": 4.975887170154686, "grad_norm": 0.05307219849347166, "learning_rate": 2.9784702697543964e-10, "loss": 0.0002, "step": 10937 }, { "epoch": 4.976342129208371, "grad_norm": 0.12687774189561976, "learning_rate": 2.8691800699115477e-10, "loss": 0.0006, "step": 10938 }, { "epoch": 4.976797088262057, "grad_norm": 0.2528275188922311, "learning_rate": 2.761932522715727e-10, "loss": 0.0015, "step": 10939 }, { "epoch": 4.977252047315742, "grad_norm": 0.09611386894924956, "learning_rate": 2.656727636926593e-10, "loss": 0.0002, "step": 10940 }, { "epoch": 4.977707006369426, "grad_norm": 0.044079945763154306, "learning_rate": 2.5535654211400474e-10, "loss": 0.0003, "step": 10941 }, { "epoch": 4.978161965423112, "grad_norm": 0.033995171132156124, "learning_rate": 2.45244588379101e-10, "loss": 0.0002, "step": 10942 }, { "epoch": 4.978616924476797, "grad_norm": 0.09764170558248489, "learning_rate": 2.3533690331423166e-10, "loss": 0.0008, "step": 10943 }, { "epoch": 4.979071883530482, "grad_norm": 0.017810394298004028, "learning_rate": 2.256334877284716e-10, "loss": 0.0001, "step": 10944 }, { "epoch": 4.979526842584168, "grad_norm": 0.10672462904947455, "learning_rate": 2.1613434241507524e-10, "loss": 0.0006, "step": 10945 }, { "epoch": 4.9799818016378525, "grad_norm": 0.07568501901765415, "learning_rate": 2.0683946815036604e-10, "loss": 0.0012, "step": 10946 }, { "epoch": 4.980436760691537, "grad_norm": 0.4868649571853695, "learning_rate": 1.9774886569373653e-10, "loss": 0.004, "step": 10947 }, { "epoch": 4.980891719745223, "grad_norm": 0.21805681278342157, "learning_rate": 1.8886253578820345e-10, "loss": 0.0011, "step": 10948 }, { "epoch": 4.981346678798908, "grad_norm": 0.08878665336904644, "learning_rate": 1.8018047915957515e-10, "loss": 0.0008, "step": 10949 }, { "epoch": 4.981801637852593, "grad_norm": 0.0053382762816329464, "learning_rate": 1.7170269651756165e-10, "loss": 0.0, "step": 10950 }, { "epoch": 4.982256596906279, "grad_norm": 0.04184869700928816, "learning_rate": 1.6342918855494216e-10, "loss": 0.0002, "step": 10951 }, { "epoch": 4.9827115559599635, "grad_norm": 0.09685023810538493, "learning_rate": 1.553599559475649e-10, "loss": 0.0005, "step": 10952 }, { "epoch": 4.983166515013648, "grad_norm": 0.028313031089831395, "learning_rate": 1.4749499935517998e-10, "loss": 0.0001, "step": 10953 }, { "epoch": 4.983621474067334, "grad_norm": 0.04074965477524208, "learning_rate": 1.3983431942005133e-10, "loss": 0.0003, "step": 10954 }, { "epoch": 4.984076433121019, "grad_norm": 0.03480061310492548, "learning_rate": 1.3237791676862232e-10, "loss": 0.0002, "step": 10955 }, { "epoch": 4.984531392174704, "grad_norm": 0.27656435520665956, "learning_rate": 1.251257920098503e-10, "loss": 0.003, "step": 10956 }, { "epoch": 4.98498635122839, "grad_norm": 0.05647604923594398, "learning_rate": 1.1807794573659437e-10, "loss": 0.0002, "step": 10957 }, { "epoch": 4.9854413102820745, "grad_norm": 0.045529888659966114, "learning_rate": 1.1123437852450514e-10, "loss": 0.0003, "step": 10958 }, { "epoch": 4.985896269335759, "grad_norm": 0.21991338564624213, "learning_rate": 1.0459509093285747e-10, "loss": 0.0009, "step": 10959 }, { "epoch": 4.986351228389445, "grad_norm": 0.29503442760605075, "learning_rate": 9.816008350455042e-11, "loss": 0.0018, "step": 10960 }, { "epoch": 4.98680618744313, "grad_norm": 0.0618695736173639, "learning_rate": 9.192935676499704e-11, "loss": 0.0003, "step": 10961 }, { "epoch": 4.987261146496815, "grad_norm": 0.1441929915509968, "learning_rate": 8.590291122323458e-11, "loss": 0.0005, "step": 10962 }, { "epoch": 4.987716105550501, "grad_norm": 0.14093386537401684, "learning_rate": 8.008074737220206e-11, "loss": 0.0011, "step": 10963 }, { "epoch": 4.9881710646041855, "grad_norm": 0.03817816853276704, "learning_rate": 7.446286568763006e-11, "loss": 0.0001, "step": 10964 }, { "epoch": 4.98862602365787, "grad_norm": 0.056121507742340045, "learning_rate": 6.904926662804068e-11, "loss": 0.0002, "step": 10965 }, { "epoch": 4.989080982711556, "grad_norm": 0.3473611223151501, "learning_rate": 6.38399506364129e-11, "loss": 0.004, "step": 10966 }, { "epoch": 4.989535941765241, "grad_norm": 0.638200582876739, "learning_rate": 5.883491813796216e-11, "loss": 0.0027, "step": 10967 }, { "epoch": 4.989990900818926, "grad_norm": 0.26974176381597026, "learning_rate": 5.403416954208318e-11, "loss": 0.0009, "step": 10968 }, { "epoch": 4.990445859872612, "grad_norm": 0.026482839197145166, "learning_rate": 4.94377052406847e-11, "loss": 0.0001, "step": 10969 }, { "epoch": 4.9909008189262964, "grad_norm": 0.029148141268513245, "learning_rate": 4.5045525609854756e-11, "loss": 0.0001, "step": 10970 }, { "epoch": 4.991355777979981, "grad_norm": 0.10610641074625449, "learning_rate": 4.085763100791784e-11, "loss": 0.0009, "step": 10971 }, { "epoch": 4.991810737033667, "grad_norm": 0.05508866572407584, "learning_rate": 3.6874021777377754e-11, "loss": 0.0004, "step": 10972 }, { "epoch": 4.992265696087352, "grad_norm": 0.04515611153178447, "learning_rate": 3.3094698244084956e-11, "loss": 0.0003, "step": 10973 }, { "epoch": 4.992720655141038, "grad_norm": 0.181608558844543, "learning_rate": 2.951966071612633e-11, "loss": 0.0026, "step": 10974 }, { "epoch": 4.9931756141947226, "grad_norm": 0.1409987721801335, "learning_rate": 2.6148909486323204e-11, "loss": 0.0008, "step": 10975 }, { "epoch": 4.993630573248407, "grad_norm": 0.11138234453486186, "learning_rate": 2.298244482973333e-11, "loss": 0.0006, "step": 10976 }, { "epoch": 4.994085532302093, "grad_norm": 0.045132072498906786, "learning_rate": 2.002026700531623e-11, "loss": 0.0002, "step": 10977 }, { "epoch": 4.994540491355778, "grad_norm": 0.009866319843428532, "learning_rate": 1.7262376254822964e-11, "loss": 0.0001, "step": 10978 }, { "epoch": 4.994995450409463, "grad_norm": 0.015597733958891864, "learning_rate": 1.4708772804183924e-11, "loss": 0.0001, "step": 10979 }, { "epoch": 4.995450409463149, "grad_norm": 0.33020061299738246, "learning_rate": 1.2359456861565922e-11, "loss": 0.0005, "step": 10980 }, { "epoch": 4.9959053685168335, "grad_norm": 0.12240192978633481, "learning_rate": 1.0214428618759986e-11, "loss": 0.0002, "step": 10981 }, { "epoch": 4.996360327570518, "grad_norm": 0.058881585646716354, "learning_rate": 8.273688251736468e-12, "loss": 0.0003, "step": 10982 }, { "epoch": 4.996815286624204, "grad_norm": 0.14278813236639526, "learning_rate": 6.537235918702145e-12, "loss": 0.0019, "step": 10983 }, { "epoch": 4.997270245677889, "grad_norm": 0.1619893397701592, "learning_rate": 5.005071761488012e-12, "loss": 0.0007, "step": 10984 }, { "epoch": 4.997725204731574, "grad_norm": 0.1616142891002081, "learning_rate": 3.677195905271713e-12, "loss": 0.0017, "step": 10985 }, { "epoch": 4.99818016378526, "grad_norm": 0.14392807071830144, "learning_rate": 2.5536084588551058e-12, "loss": 0.0017, "step": 10986 }, { "epoch": 4.9986351228389445, "grad_norm": 0.15122368944456857, "learning_rate": 1.634309513831589e-12, "loss": 0.0024, "step": 10987 }, { "epoch": 4.999090081892629, "grad_norm": 0.00761621137012775, "learning_rate": 9.192991454187727e-13, "loss": 0.0, "step": 10988 }, { "epoch": 4.999545040946315, "grad_norm": 0.03634290933996723, "learning_rate": 4.085774119033659e-13, "loss": 0.0003, "step": 10989 }, { "epoch": 5.0, "grad_norm": 0.011907280441843741, "learning_rate": 1.0214435491873176e-13, "loss": 0.0001, "step": 10990 }, { "epoch": 5.0, "step": 10990, "total_flos": 72288269254656.0, "train_loss": 0.02189325313627952, "train_runtime": 13763.029, "train_samples_per_second": 3.193, "train_steps_per_second": 0.799 } ], "logging_steps": 1, "max_steps": 10990, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 555, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 72288269254656.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }