{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.767515923566879, "eval_steps": 500, "global_step": 3885, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00045495905368516835, "grad_norm": 9.424069126182447, "learning_rate": 5e-06, "loss": 0.1263, "step": 1 }, { "epoch": 0.0009099181073703367, "grad_norm": 10.713711803681479, "learning_rate": 4.999999897855645e-06, "loss": 0.1917, "step": 2 }, { "epoch": 0.001364877161055505, "grad_norm": 14.140338542227335, "learning_rate": 4.9999995914225884e-06, "loss": 0.1578, "step": 3 }, { "epoch": 0.0018198362147406734, "grad_norm": 3.597475372738082, "learning_rate": 4.999999080700855e-06, "loss": 0.1266, "step": 4 }, { "epoch": 0.0022747952684258415, "grad_norm": 6.105724745538744, "learning_rate": 4.999998365690486e-06, "loss": 0.1182, "step": 5 }, { "epoch": 0.00272975432211101, "grad_norm": 2.3169263707348047, "learning_rate": 4.999997446391542e-06, "loss": 0.0837, "step": 6 }, { "epoch": 0.0031847133757961785, "grad_norm": 1.5580577162131912, "learning_rate": 4.999996322804095e-06, "loss": 0.0761, "step": 7 }, { "epoch": 0.003639672429481347, "grad_norm": 7.8184917268265455, "learning_rate": 4.999994994928239e-06, "loss": 0.0922, "step": 8 }, { "epoch": 0.004094631483166515, "grad_norm": 1.0452365500769838, "learning_rate": 4.999993462764082e-06, "loss": 0.0478, "step": 9 }, { "epoch": 0.004549590536851683, "grad_norm": 13.965028712537013, "learning_rate": 4.999991726311749e-06, "loss": 0.0846, "step": 10 }, { "epoch": 0.005004549590536852, "grad_norm": 5.125925143296543, "learning_rate": 4.999989785571382e-06, "loss": 0.0881, "step": 11 }, { "epoch": 0.00545950864422202, "grad_norm": 2.2007100936893242, "learning_rate": 4.999987640543139e-06, "loss": 0.0896, "step": 12 }, { "epoch": 0.005914467697907188, "grad_norm": 1.4259973806728683, "learning_rate": 4.999985291227196e-06, "loss": 0.0707, "step": 13 }, { "epoch": 0.006369426751592357, "grad_norm": 2.5296942505090376, "learning_rate": 4.999982737623746e-06, "loss": 0.1089, "step": 14 }, { "epoch": 0.006824385805277525, "grad_norm": 1.9950751818037182, "learning_rate": 4.999979979732995e-06, "loss": 0.0868, "step": 15 }, { "epoch": 0.007279344858962694, "grad_norm": 1.3920340257758652, "learning_rate": 4.999977017555171e-06, "loss": 0.0667, "step": 16 }, { "epoch": 0.0077343039126478615, "grad_norm": 1.6901228042476675, "learning_rate": 4.999973851090514e-06, "loss": 0.1032, "step": 17 }, { "epoch": 0.00818926296633303, "grad_norm": 1.8139241575982044, "learning_rate": 4.999970480339284e-06, "loss": 0.0848, "step": 18 }, { "epoch": 0.008644222020018199, "grad_norm": 2.792209216647474, "learning_rate": 4.9999669053017564e-06, "loss": 0.0804, "step": 19 }, { "epoch": 0.009099181073703366, "grad_norm": 1.9016199513748882, "learning_rate": 4.9999631259782235e-06, "loss": 0.0612, "step": 20 }, { "epoch": 0.009554140127388535, "grad_norm": 1.9965871271660314, "learning_rate": 4.999959142368993e-06, "loss": 0.0969, "step": 21 }, { "epoch": 0.010009099181073703, "grad_norm": 2.0914009303085033, "learning_rate": 4.999954954474391e-06, "loss": 0.0697, "step": 22 }, { "epoch": 0.010464058234758872, "grad_norm": 1.4245797905814712, "learning_rate": 4.9999505622947594e-06, "loss": 0.0832, "step": 23 }, { "epoch": 0.01091901728844404, "grad_norm": 1.5918336957933508, "learning_rate": 4.999945965830458e-06, "loss": 0.0995, "step": 24 }, { "epoch": 0.011373976342129208, "grad_norm": 1.5479918567604505, "learning_rate": 4.999941165081863e-06, "loss": 0.0807, "step": 25 }, { "epoch": 0.011828935395814377, "grad_norm": 1.0230515440884096, "learning_rate": 4.999936160049364e-06, "loss": 0.0643, "step": 26 }, { "epoch": 0.012283894449499545, "grad_norm": 1.5686069800283207, "learning_rate": 4.999930950733373e-06, "loss": 0.0931, "step": 27 }, { "epoch": 0.012738853503184714, "grad_norm": 1.2554970571666952, "learning_rate": 4.999925537134312e-06, "loss": 0.0815, "step": 28 }, { "epoch": 0.013193812556869881, "grad_norm": 2.006239028459661, "learning_rate": 4.9999199192526286e-06, "loss": 0.1058, "step": 29 }, { "epoch": 0.01364877161055505, "grad_norm": 1.4436359414979703, "learning_rate": 4.9999140970887775e-06, "loss": 0.0869, "step": 30 }, { "epoch": 0.014103730664240218, "grad_norm": 1.9267705188401287, "learning_rate": 4.999908070643236e-06, "loss": 0.0781, "step": 31 }, { "epoch": 0.014558689717925387, "grad_norm": 1.4021843278575745, "learning_rate": 4.999901839916495e-06, "loss": 0.0623, "step": 32 }, { "epoch": 0.015013648771610554, "grad_norm": 1.208153452070421, "learning_rate": 4.999895404909067e-06, "loss": 0.063, "step": 33 }, { "epoch": 0.015468607825295723, "grad_norm": 2.273185304548797, "learning_rate": 4.999888765621476e-06, "loss": 0.0901, "step": 34 }, { "epoch": 0.01592356687898089, "grad_norm": 1.0383667898934177, "learning_rate": 4.999881922054264e-06, "loss": 0.0529, "step": 35 }, { "epoch": 0.01637852593266606, "grad_norm": 1.1537070720156926, "learning_rate": 4.999874874207991e-06, "loss": 0.0539, "step": 36 }, { "epoch": 0.01683348498635123, "grad_norm": 7.004645996036244, "learning_rate": 4.999867622083232e-06, "loss": 0.1028, "step": 37 }, { "epoch": 0.017288444040036398, "grad_norm": 2.6515111867419217, "learning_rate": 4.99986016568058e-06, "loss": 0.0958, "step": 38 }, { "epoch": 0.017743403093721567, "grad_norm": 1.5437471575403858, "learning_rate": 4.999852505000646e-06, "loss": 0.0738, "step": 39 }, { "epoch": 0.018198362147406732, "grad_norm": 1.4798019454902687, "learning_rate": 4.999844640044053e-06, "loss": 0.0695, "step": 40 }, { "epoch": 0.0186533212010919, "grad_norm": 1.3064785518172293, "learning_rate": 4.999836570811445e-06, "loss": 0.0738, "step": 41 }, { "epoch": 0.01910828025477707, "grad_norm": 2.6092308850144086, "learning_rate": 4.999828297303483e-06, "loss": 0.0854, "step": 42 }, { "epoch": 0.019563239308462238, "grad_norm": 1.1588535962376392, "learning_rate": 4.9998198195208405e-06, "loss": 0.0783, "step": 43 }, { "epoch": 0.020018198362147407, "grad_norm": 1.441993661454023, "learning_rate": 4.999811137464212e-06, "loss": 0.0826, "step": 44 }, { "epoch": 0.020473157415832575, "grad_norm": 1.6833012903770388, "learning_rate": 4.999802251134307e-06, "loss": 0.0932, "step": 45 }, { "epoch": 0.020928116469517744, "grad_norm": 1.061841465675538, "learning_rate": 4.99979316053185e-06, "loss": 0.0602, "step": 46 }, { "epoch": 0.021383075523202913, "grad_norm": 6.235552737213317, "learning_rate": 4.999783865657585e-06, "loss": 0.1756, "step": 47 }, { "epoch": 0.02183803457688808, "grad_norm": 4.150615789136632, "learning_rate": 4.999774366512272e-06, "loss": 0.1765, "step": 48 }, { "epoch": 0.022292993630573247, "grad_norm": 1.6544370579186418, "learning_rate": 4.9997646630966865e-06, "loss": 0.0841, "step": 49 }, { "epoch": 0.022747952684258416, "grad_norm": 1.3759378168890601, "learning_rate": 4.999754755411621e-06, "loss": 0.0669, "step": 50 }, { "epoch": 0.023202911737943584, "grad_norm": 1.182095773050476, "learning_rate": 4.9997446434578865e-06, "loss": 0.0653, "step": 51 }, { "epoch": 0.023657870791628753, "grad_norm": 1.035739970953985, "learning_rate": 4.999734327236307e-06, "loss": 0.0678, "step": 52 }, { "epoch": 0.024112829845313922, "grad_norm": 0.7085636728418604, "learning_rate": 4.999723806747728e-06, "loss": 0.0498, "step": 53 }, { "epoch": 0.02456778889899909, "grad_norm": 2.2722150874810185, "learning_rate": 4.99971308199301e-06, "loss": 0.0666, "step": 54 }, { "epoch": 0.02502274795268426, "grad_norm": 0.9420150282219443, "learning_rate": 4.999702152973025e-06, "loss": 0.0516, "step": 55 }, { "epoch": 0.025477707006369428, "grad_norm": 1.0929779986912587, "learning_rate": 4.9996910196886694e-06, "loss": 0.0593, "step": 56 }, { "epoch": 0.025932666060054597, "grad_norm": 0.783956655534044, "learning_rate": 4.999679682140852e-06, "loss": 0.0377, "step": 57 }, { "epoch": 0.026387625113739762, "grad_norm": 1.5218504285246661, "learning_rate": 4.999668140330499e-06, "loss": 0.1052, "step": 58 }, { "epoch": 0.02684258416742493, "grad_norm": 1.0791722855226673, "learning_rate": 4.999656394258555e-06, "loss": 0.0632, "step": 59 }, { "epoch": 0.0272975432211101, "grad_norm": 0.9557512868551324, "learning_rate": 4.999644443925978e-06, "loss": 0.0634, "step": 60 }, { "epoch": 0.027752502274795268, "grad_norm": 1.0667565930302423, "learning_rate": 4.999632289333746e-06, "loss": 0.0518, "step": 61 }, { "epoch": 0.028207461328480437, "grad_norm": 1.646318745184601, "learning_rate": 4.999619930482852e-06, "loss": 0.0766, "step": 62 }, { "epoch": 0.028662420382165606, "grad_norm": 1.2186400155674944, "learning_rate": 4.999607367374304e-06, "loss": 0.0741, "step": 63 }, { "epoch": 0.029117379435850774, "grad_norm": 1.0807362476000584, "learning_rate": 4.999594600009131e-06, "loss": 0.0553, "step": 64 }, { "epoch": 0.029572338489535943, "grad_norm": 1.3403222529377026, "learning_rate": 4.999581628388375e-06, "loss": 0.0886, "step": 65 }, { "epoch": 0.03002729754322111, "grad_norm": 1.5384085589580356, "learning_rate": 4.999568452513097e-06, "loss": 0.1371, "step": 66 }, { "epoch": 0.030482256596906277, "grad_norm": 1.3705617237121213, "learning_rate": 4.9995550723843726e-06, "loss": 0.0766, "step": 67 }, { "epoch": 0.030937215650591446, "grad_norm": 1.0692361538736996, "learning_rate": 4.999541488003295e-06, "loss": 0.0607, "step": 68 }, { "epoch": 0.03139217470427662, "grad_norm": 0.9190382606343962, "learning_rate": 4.999527699370975e-06, "loss": 0.0598, "step": 69 }, { "epoch": 0.03184713375796178, "grad_norm": 1.182484013540807, "learning_rate": 4.99951370648854e-06, "loss": 0.0583, "step": 70 }, { "epoch": 0.03230209281164695, "grad_norm": 1.0728084003134533, "learning_rate": 4.999499509357132e-06, "loss": 0.0595, "step": 71 }, { "epoch": 0.03275705186533212, "grad_norm": 4.354112851430141, "learning_rate": 4.999485107977912e-06, "loss": 0.063, "step": 72 }, { "epoch": 0.033212010919017286, "grad_norm": 1.5709570309947747, "learning_rate": 4.999470502352057e-06, "loss": 0.0511, "step": 73 }, { "epoch": 0.03366696997270246, "grad_norm": 5.936498302941106, "learning_rate": 4.999455692480759e-06, "loss": 0.0733, "step": 74 }, { "epoch": 0.034121929026387623, "grad_norm": 2.227018438923651, "learning_rate": 4.999440678365229e-06, "loss": 0.0504, "step": 75 }, { "epoch": 0.034576888080072796, "grad_norm": 1.7106603940792875, "learning_rate": 4.999425460006695e-06, "loss": 0.0672, "step": 76 }, { "epoch": 0.03503184713375796, "grad_norm": 3.603233120133456, "learning_rate": 4.9994100374063995e-06, "loss": 0.0605, "step": 77 }, { "epoch": 0.03548680618744313, "grad_norm": 1.3736083353388526, "learning_rate": 4.9993944105656035e-06, "loss": 0.0892, "step": 78 }, { "epoch": 0.0359417652411283, "grad_norm": 1.0823220835311542, "learning_rate": 4.999378579485582e-06, "loss": 0.0657, "step": 79 }, { "epoch": 0.036396724294813464, "grad_norm": 2.150924215229101, "learning_rate": 4.999362544167632e-06, "loss": 0.0787, "step": 80 }, { "epoch": 0.036851683348498636, "grad_norm": 1.7178625535843866, "learning_rate": 4.99934630461306e-06, "loss": 0.0428, "step": 81 }, { "epoch": 0.0373066424021838, "grad_norm": 1.3444344286672891, "learning_rate": 4.999329860823197e-06, "loss": 0.0683, "step": 82 }, { "epoch": 0.03776160145586897, "grad_norm": 0.9890758669005086, "learning_rate": 4.999313212799383e-06, "loss": 0.0684, "step": 83 }, { "epoch": 0.03821656050955414, "grad_norm": 1.341850788541947, "learning_rate": 4.99929636054298e-06, "loss": 0.0683, "step": 84 }, { "epoch": 0.03867151956323931, "grad_norm": 1.1425756088631416, "learning_rate": 4.999279304055366e-06, "loss": 0.0781, "step": 85 }, { "epoch": 0.039126478616924476, "grad_norm": 1.1872176417370066, "learning_rate": 4.999262043337933e-06, "loss": 0.0652, "step": 86 }, { "epoch": 0.03958143767060965, "grad_norm": 1.1143093977494338, "learning_rate": 4.999244578392094e-06, "loss": 0.0752, "step": 87 }, { "epoch": 0.040036396724294813, "grad_norm": 1.2369204202074342, "learning_rate": 4.9992269092192736e-06, "loss": 0.0822, "step": 88 }, { "epoch": 0.04049135577797998, "grad_norm": 1.1130108244588752, "learning_rate": 4.9992090358209166e-06, "loss": 0.0548, "step": 89 }, { "epoch": 0.04094631483166515, "grad_norm": 1.0691923631453497, "learning_rate": 4.9991909581984835e-06, "loss": 0.058, "step": 90 }, { "epoch": 0.041401273885350316, "grad_norm": 0.8020461125153492, "learning_rate": 4.999172676353451e-06, "loss": 0.0341, "step": 91 }, { "epoch": 0.04185623293903549, "grad_norm": 0.8729429986066347, "learning_rate": 4.999154190287314e-06, "loss": 0.0524, "step": 92 }, { "epoch": 0.042311191992720654, "grad_norm": 1.3186052508212676, "learning_rate": 4.999135500001583e-06, "loss": 0.1067, "step": 93 }, { "epoch": 0.042766151046405826, "grad_norm": 0.9402363215265714, "learning_rate": 4.9991166054977844e-06, "loss": 0.0631, "step": 94 }, { "epoch": 0.04322111010009099, "grad_norm": 1.8336904222617239, "learning_rate": 4.999097506777463e-06, "loss": 0.0897, "step": 95 }, { "epoch": 0.04367606915377616, "grad_norm": 1.0700343361679827, "learning_rate": 4.999078203842179e-06, "loss": 0.084, "step": 96 }, { "epoch": 0.04413102820746133, "grad_norm": 0.8783050881223096, "learning_rate": 4.999058696693511e-06, "loss": 0.0421, "step": 97 }, { "epoch": 0.044585987261146494, "grad_norm": 0.9801149440827129, "learning_rate": 4.99903898533305e-06, "loss": 0.0616, "step": 98 }, { "epoch": 0.045040946314831666, "grad_norm": 0.9471216563783236, "learning_rate": 4.99901906976241e-06, "loss": 0.0614, "step": 99 }, { "epoch": 0.04549590536851683, "grad_norm": 1.1616379193988644, "learning_rate": 4.998998949983217e-06, "loss": 0.0604, "step": 100 }, { "epoch": 0.045950864422202004, "grad_norm": 1.123688602696856, "learning_rate": 4.998978625997115e-06, "loss": 0.0831, "step": 101 }, { "epoch": 0.04640582347588717, "grad_norm": 1.1154387442545128, "learning_rate": 4.998958097805765e-06, "loss": 0.0686, "step": 102 }, { "epoch": 0.04686078252957234, "grad_norm": 0.9538196832365717, "learning_rate": 4.9989373654108445e-06, "loss": 0.0586, "step": 103 }, { "epoch": 0.047315741583257506, "grad_norm": 14.714854180857428, "learning_rate": 4.9989164288140465e-06, "loss": 0.2765, "step": 104 }, { "epoch": 0.04777070063694268, "grad_norm": 1.5310230722630254, "learning_rate": 4.998895288017085e-06, "loss": 0.1114, "step": 105 }, { "epoch": 0.048225659690627844, "grad_norm": 0.8704851988514942, "learning_rate": 4.998873943021684e-06, "loss": 0.0481, "step": 106 }, { "epoch": 0.04868061874431301, "grad_norm": 0.9229853294124807, "learning_rate": 4.998852393829589e-06, "loss": 0.0559, "step": 107 }, { "epoch": 0.04913557779799818, "grad_norm": 1.6890853415724327, "learning_rate": 4.9988306404425625e-06, "loss": 0.1104, "step": 108 }, { "epoch": 0.049590536851683346, "grad_norm": 0.9281407020626959, "learning_rate": 4.99880868286238e-06, "loss": 0.0636, "step": 109 }, { "epoch": 0.05004549590536852, "grad_norm": 1.2440415104108336, "learning_rate": 4.998786521090836e-06, "loss": 0.0522, "step": 110 }, { "epoch": 0.050500454959053684, "grad_norm": 1.06604652034606, "learning_rate": 4.9987641551297426e-06, "loss": 0.0916, "step": 111 }, { "epoch": 0.050955414012738856, "grad_norm": 0.9619782747004665, "learning_rate": 4.998741584980926e-06, "loss": 0.0822, "step": 112 }, { "epoch": 0.05141037306642402, "grad_norm": 1.0679619427370142, "learning_rate": 4.9987188106462314e-06, "loss": 0.0644, "step": 113 }, { "epoch": 0.051865332120109194, "grad_norm": 0.8424012677371406, "learning_rate": 4.99869583212752e-06, "loss": 0.0536, "step": 114 }, { "epoch": 0.05232029117379436, "grad_norm": 1.660603270099433, "learning_rate": 4.9986726494266694e-06, "loss": 0.1336, "step": 115 }, { "epoch": 0.052775250227479524, "grad_norm": 1.0314643506984187, "learning_rate": 4.998649262545574e-06, "loss": 0.0606, "step": 116 }, { "epoch": 0.053230209281164696, "grad_norm": 0.9468486095046134, "learning_rate": 4.998625671486144e-06, "loss": 0.0598, "step": 117 }, { "epoch": 0.05368516833484986, "grad_norm": 0.8800045267913842, "learning_rate": 4.998601876250308e-06, "loss": 0.06, "step": 118 }, { "epoch": 0.054140127388535034, "grad_norm": 1.0192910760666323, "learning_rate": 4.998577876840011e-06, "loss": 0.0601, "step": 119 }, { "epoch": 0.0545950864422202, "grad_norm": 0.9462635574827357, "learning_rate": 4.9985536732572124e-06, "loss": 0.06, "step": 120 }, { "epoch": 0.05505004549590537, "grad_norm": 0.7487116084320051, "learning_rate": 4.998529265503891e-06, "loss": 0.0458, "step": 121 }, { "epoch": 0.055505004549590536, "grad_norm": 1.0663282141956507, "learning_rate": 4.9985046535820416e-06, "loss": 0.0758, "step": 122 }, { "epoch": 0.05595996360327571, "grad_norm": 1.7476635252011261, "learning_rate": 4.998479837493675e-06, "loss": 0.0876, "step": 123 }, { "epoch": 0.056414922656960874, "grad_norm": 1.1513932571098853, "learning_rate": 4.9984548172408195e-06, "loss": 0.0475, "step": 124 }, { "epoch": 0.05686988171064604, "grad_norm": 97.54679492281674, "learning_rate": 4.998429592825519e-06, "loss": 0.2117, "step": 125 }, { "epoch": 0.05732484076433121, "grad_norm": 1.2146893500357796, "learning_rate": 4.998404164249835e-06, "loss": 0.0887, "step": 126 }, { "epoch": 0.05777979981801638, "grad_norm": 0.8319799978985719, "learning_rate": 4.998378531515845e-06, "loss": 0.0411, "step": 127 }, { "epoch": 0.05823475887170155, "grad_norm": 1.5818516008522756, "learning_rate": 4.998352694625645e-06, "loss": 0.068, "step": 128 }, { "epoch": 0.058689717925386714, "grad_norm": 0.883733186490376, "learning_rate": 4.998326653581343e-06, "loss": 0.0595, "step": 129 }, { "epoch": 0.059144676979071886, "grad_norm": 0.9357726879327158, "learning_rate": 4.998300408385072e-06, "loss": 0.0686, "step": 130 }, { "epoch": 0.05959963603275705, "grad_norm": 1.3606472296483436, "learning_rate": 4.998273959038972e-06, "loss": 0.0837, "step": 131 }, { "epoch": 0.06005459508644222, "grad_norm": 0.9597337111291308, "learning_rate": 4.998247305545207e-06, "loss": 0.0733, "step": 132 }, { "epoch": 0.06050955414012739, "grad_norm": 0.7271469650592398, "learning_rate": 4.998220447905953e-06, "loss": 0.0454, "step": 133 }, { "epoch": 0.060964513193812554, "grad_norm": 0.9630498239095886, "learning_rate": 4.998193386123408e-06, "loss": 0.074, "step": 134 }, { "epoch": 0.061419472247497726, "grad_norm": 1.133544314724227, "learning_rate": 4.99816612019978e-06, "loss": 0.077, "step": 135 }, { "epoch": 0.06187443130118289, "grad_norm": 4.162875613842665, "learning_rate": 4.998138650137298e-06, "loss": 0.1461, "step": 136 }, { "epoch": 0.062329390354868064, "grad_norm": 1.02851611153301, "learning_rate": 4.998110975938208e-06, "loss": 0.0883, "step": 137 }, { "epoch": 0.06278434940855324, "grad_norm": 1.4803017864082986, "learning_rate": 4.998083097604769e-06, "loss": 0.093, "step": 138 }, { "epoch": 0.0632393084622384, "grad_norm": 0.775173461523887, "learning_rate": 4.998055015139261e-06, "loss": 0.0446, "step": 139 }, { "epoch": 0.06369426751592357, "grad_norm": 0.9314427643573137, "learning_rate": 4.998026728543979e-06, "loss": 0.0627, "step": 140 }, { "epoch": 0.06414922656960874, "grad_norm": 0.8532842969957802, "learning_rate": 4.997998237821233e-06, "loss": 0.07, "step": 141 }, { "epoch": 0.0646041856232939, "grad_norm": 0.8003964270143441, "learning_rate": 4.997969542973352e-06, "loss": 0.0563, "step": 142 }, { "epoch": 0.06505914467697907, "grad_norm": 1.0449654693074535, "learning_rate": 4.997940644002681e-06, "loss": 0.0705, "step": 143 }, { "epoch": 0.06551410373066424, "grad_norm": 1.2317539206735935, "learning_rate": 4.997911540911581e-06, "loss": 0.0552, "step": 144 }, { "epoch": 0.06596906278434941, "grad_norm": 1.0170288864286834, "learning_rate": 4.99788223370243e-06, "loss": 0.075, "step": 145 }, { "epoch": 0.06642402183803457, "grad_norm": 2.1516221031707796, "learning_rate": 4.9978527223776245e-06, "loss": 0.1294, "step": 146 }, { "epoch": 0.06687898089171974, "grad_norm": 0.8159636795919125, "learning_rate": 4.9978230069395735e-06, "loss": 0.0512, "step": 147 }, { "epoch": 0.06733393994540492, "grad_norm": 1.0575473809333984, "learning_rate": 4.9977930873907065e-06, "loss": 0.0598, "step": 148 }, { "epoch": 0.06778889899909009, "grad_norm": 1.0958109016760909, "learning_rate": 4.997762963733468e-06, "loss": 0.074, "step": 149 }, { "epoch": 0.06824385805277525, "grad_norm": 1.047477211054204, "learning_rate": 4.997732635970321e-06, "loss": 0.0539, "step": 150 }, { "epoch": 0.06869881710646042, "grad_norm": 1.0301191819422422, "learning_rate": 4.9977021041037425e-06, "loss": 0.0686, "step": 151 }, { "epoch": 0.06915377616014559, "grad_norm": 1.225998339573777, "learning_rate": 4.9976713681362265e-06, "loss": 0.0859, "step": 152 }, { "epoch": 0.06960873521383075, "grad_norm": 1.416119617095304, "learning_rate": 4.997640428070286e-06, "loss": 0.1051, "step": 153 }, { "epoch": 0.07006369426751592, "grad_norm": 0.9227148160074169, "learning_rate": 4.99760928390845e-06, "loss": 0.0476, "step": 154 }, { "epoch": 0.0705186533212011, "grad_norm": 0.9417296172183, "learning_rate": 4.997577935653262e-06, "loss": 0.0546, "step": 155 }, { "epoch": 0.07097361237488627, "grad_norm": 0.7429922167271485, "learning_rate": 4.9975463833072835e-06, "loss": 0.0438, "step": 156 }, { "epoch": 0.07142857142857142, "grad_norm": 1.1317968054046752, "learning_rate": 4.997514626873093e-06, "loss": 0.0723, "step": 157 }, { "epoch": 0.0718835304822566, "grad_norm": 0.894309839547546, "learning_rate": 4.997482666353287e-06, "loss": 0.0484, "step": 158 }, { "epoch": 0.07233848953594177, "grad_norm": 1.2064896460901124, "learning_rate": 4.997450501750476e-06, "loss": 0.0686, "step": 159 }, { "epoch": 0.07279344858962693, "grad_norm": 1.0338993985106997, "learning_rate": 4.997418133067288e-06, "loss": 0.066, "step": 160 }, { "epoch": 0.0732484076433121, "grad_norm": 1.0854078217047458, "learning_rate": 4.997385560306368e-06, "loss": 0.075, "step": 161 }, { "epoch": 0.07370336669699727, "grad_norm": 0.9955556312708298, "learning_rate": 4.997352783470379e-06, "loss": 0.0693, "step": 162 }, { "epoch": 0.07415832575068244, "grad_norm": 1.1119344699280262, "learning_rate": 4.997319802561997e-06, "loss": 0.0687, "step": 163 }, { "epoch": 0.0746132848043676, "grad_norm": 0.8118992710097626, "learning_rate": 4.9972866175839196e-06, "loss": 0.061, "step": 164 }, { "epoch": 0.07506824385805277, "grad_norm": 1.0509201052861925, "learning_rate": 4.9972532285388575e-06, "loss": 0.0738, "step": 165 }, { "epoch": 0.07552320291173795, "grad_norm": 1.1660685920656126, "learning_rate": 4.997219635429538e-06, "loss": 0.1018, "step": 166 }, { "epoch": 0.07597816196542312, "grad_norm": 0.9894981496034668, "learning_rate": 4.997185838258709e-06, "loss": 0.0534, "step": 167 }, { "epoch": 0.07643312101910828, "grad_norm": 0.9397553289113793, "learning_rate": 4.997151837029129e-06, "loss": 0.0527, "step": 168 }, { "epoch": 0.07688808007279345, "grad_norm": 0.9368221512292729, "learning_rate": 4.997117631743579e-06, "loss": 0.0648, "step": 169 }, { "epoch": 0.07734303912647862, "grad_norm": 0.898690664067523, "learning_rate": 4.997083222404852e-06, "loss": 0.0479, "step": 170 }, { "epoch": 0.07779799818016378, "grad_norm": 0.835250569463016, "learning_rate": 4.997048609015762e-06, "loss": 0.0528, "step": 171 }, { "epoch": 0.07825295723384895, "grad_norm": 0.9098471940978452, "learning_rate": 4.997013791579136e-06, "loss": 0.0641, "step": 172 }, { "epoch": 0.07870791628753412, "grad_norm": 0.9538942863622895, "learning_rate": 4.996978770097819e-06, "loss": 0.0648, "step": 173 }, { "epoch": 0.0791628753412193, "grad_norm": 0.9163372515795332, "learning_rate": 4.996943544574673e-06, "loss": 0.0682, "step": 174 }, { "epoch": 0.07961783439490445, "grad_norm": 0.8193165634479148, "learning_rate": 4.996908115012576e-06, "loss": 0.0485, "step": 175 }, { "epoch": 0.08007279344858963, "grad_norm": 1.0278777387100766, "learning_rate": 4.996872481414425e-06, "loss": 0.0741, "step": 176 }, { "epoch": 0.0805277525022748, "grad_norm": 1.8323226365700802, "learning_rate": 4.9968366437831305e-06, "loss": 0.1107, "step": 177 }, { "epoch": 0.08098271155595996, "grad_norm": 0.5562843681536768, "learning_rate": 4.99680060212162e-06, "loss": 0.0379, "step": 178 }, { "epoch": 0.08143767060964513, "grad_norm": 0.6982410679992989, "learning_rate": 4.996764356432841e-06, "loss": 0.0576, "step": 179 }, { "epoch": 0.0818926296633303, "grad_norm": 0.9996693552976796, "learning_rate": 4.996727906719754e-06, "loss": 0.056, "step": 180 }, { "epoch": 0.08234758871701547, "grad_norm": 0.8092230365331524, "learning_rate": 4.9966912529853365e-06, "loss": 0.036, "step": 181 }, { "epoch": 0.08280254777070063, "grad_norm": 0.8856317784665715, "learning_rate": 4.996654395232585e-06, "loss": 0.0546, "step": 182 }, { "epoch": 0.0832575068243858, "grad_norm": 0.7648943084887926, "learning_rate": 4.996617333464512e-06, "loss": 0.0456, "step": 183 }, { "epoch": 0.08371246587807098, "grad_norm": 0.8896960831413809, "learning_rate": 4.996580067684145e-06, "loss": 0.0505, "step": 184 }, { "epoch": 0.08416742493175614, "grad_norm": 0.8819070603063018, "learning_rate": 4.996542597894528e-06, "loss": 0.0833, "step": 185 }, { "epoch": 0.08462238398544131, "grad_norm": 0.8756856388162975, "learning_rate": 4.996504924098726e-06, "loss": 0.0641, "step": 186 }, { "epoch": 0.08507734303912648, "grad_norm": 0.8527408544485862, "learning_rate": 4.9964670462998145e-06, "loss": 0.0553, "step": 187 }, { "epoch": 0.08553230209281165, "grad_norm": 0.9875356023767464, "learning_rate": 4.99642896450089e-06, "loss": 0.0874, "step": 188 }, { "epoch": 0.08598726114649681, "grad_norm": 2.0664437318649003, "learning_rate": 4.9963906787050656e-06, "loss": 0.0901, "step": 189 }, { "epoch": 0.08644222020018198, "grad_norm": 0.772276028123917, "learning_rate": 4.996352188915467e-06, "loss": 0.0457, "step": 190 }, { "epoch": 0.08689717925386715, "grad_norm": 1.5995533229184502, "learning_rate": 4.996313495135242e-06, "loss": 0.0902, "step": 191 }, { "epoch": 0.08735213830755233, "grad_norm": 1.14262643514501, "learning_rate": 4.9962745973675505e-06, "loss": 0.0887, "step": 192 }, { "epoch": 0.08780709736123748, "grad_norm": 0.653471766542576, "learning_rate": 4.996235495615572e-06, "loss": 0.0381, "step": 193 }, { "epoch": 0.08826205641492266, "grad_norm": 1.21800497391657, "learning_rate": 4.996196189882503e-06, "loss": 0.0859, "step": 194 }, { "epoch": 0.08871701546860783, "grad_norm": 1.2184077345088562, "learning_rate": 4.996156680171552e-06, "loss": 0.0858, "step": 195 }, { "epoch": 0.08917197452229299, "grad_norm": 0.8525171751383268, "learning_rate": 4.996116966485951e-06, "loss": 0.0542, "step": 196 }, { "epoch": 0.08962693357597816, "grad_norm": 1.0438941172842933, "learning_rate": 4.996077048828944e-06, "loss": 0.0735, "step": 197 }, { "epoch": 0.09008189262966333, "grad_norm": 0.9982779135093925, "learning_rate": 4.996036927203793e-06, "loss": 0.0773, "step": 198 }, { "epoch": 0.0905368516833485, "grad_norm": 1.5215875068980074, "learning_rate": 4.995996601613775e-06, "loss": 0.0814, "step": 199 }, { "epoch": 0.09099181073703366, "grad_norm": 0.9525593904667519, "learning_rate": 4.9959560720621875e-06, "loss": 0.0631, "step": 200 }, { "epoch": 0.09144676979071883, "grad_norm": 1.6658936796296464, "learning_rate": 4.995915338552341e-06, "loss": 0.0892, "step": 201 }, { "epoch": 0.09190172884440401, "grad_norm": 1.0100426736293826, "learning_rate": 4.995874401087565e-06, "loss": 0.0618, "step": 202 }, { "epoch": 0.09235668789808917, "grad_norm": 1.2729210933806279, "learning_rate": 4.9958332596712035e-06, "loss": 0.0808, "step": 203 }, { "epoch": 0.09281164695177434, "grad_norm": 1.0142800844722413, "learning_rate": 4.99579191430662e-06, "loss": 0.0715, "step": 204 }, { "epoch": 0.09326660600545951, "grad_norm": 4.237455676216414, "learning_rate": 4.995750364997192e-06, "loss": 0.062, "step": 205 }, { "epoch": 0.09372156505914468, "grad_norm": 7.872559330750363, "learning_rate": 4.995708611746314e-06, "loss": 0.0548, "step": 206 }, { "epoch": 0.09417652411282984, "grad_norm": 1.2028032815765721, "learning_rate": 4.995666654557399e-06, "loss": 0.0678, "step": 207 }, { "epoch": 0.09463148316651501, "grad_norm": 0.9911372243080299, "learning_rate": 4.995624493433876e-06, "loss": 0.0728, "step": 208 }, { "epoch": 0.09508644222020018, "grad_norm": 2.5900155398471942, "learning_rate": 4.995582128379189e-06, "loss": 0.0822, "step": 209 }, { "epoch": 0.09554140127388536, "grad_norm": 1.4214627215980935, "learning_rate": 4.9955395593968e-06, "loss": 0.1096, "step": 210 }, { "epoch": 0.09599636032757052, "grad_norm": 11.75678149199321, "learning_rate": 4.99549678649019e-06, "loss": 0.0579, "step": 211 }, { "epoch": 0.09645131938125569, "grad_norm": 3.8898709501740747, "learning_rate": 4.99545380966285e-06, "loss": 0.0695, "step": 212 }, { "epoch": 0.09690627843494086, "grad_norm": 4.099783756040842, "learning_rate": 4.995410628918294e-06, "loss": 0.0711, "step": 213 }, { "epoch": 0.09736123748862602, "grad_norm": 3.9495811570453445, "learning_rate": 4.995367244260052e-06, "loss": 0.0871, "step": 214 }, { "epoch": 0.09781619654231119, "grad_norm": 0.7508672950199423, "learning_rate": 4.995323655691667e-06, "loss": 0.0369, "step": 215 }, { "epoch": 0.09827115559599636, "grad_norm": 1.3368080010868653, "learning_rate": 4.995279863216702e-06, "loss": 0.0752, "step": 216 }, { "epoch": 0.09872611464968153, "grad_norm": 0.8823975012529762, "learning_rate": 4.995235866838735e-06, "loss": 0.0695, "step": 217 }, { "epoch": 0.09918107370336669, "grad_norm": 0.8099194866460178, "learning_rate": 4.995191666561361e-06, "loss": 0.0561, "step": 218 }, { "epoch": 0.09963603275705187, "grad_norm": 0.6772333028080019, "learning_rate": 4.995147262388192e-06, "loss": 0.0441, "step": 219 }, { "epoch": 0.10009099181073704, "grad_norm": 0.9342067677666205, "learning_rate": 4.995102654322858e-06, "loss": 0.0613, "step": 220 }, { "epoch": 0.1005459508644222, "grad_norm": 0.7594825525973931, "learning_rate": 4.995057842369002e-06, "loss": 0.0349, "step": 221 }, { "epoch": 0.10100090991810737, "grad_norm": 0.8418616902443392, "learning_rate": 4.995012826530287e-06, "loss": 0.0693, "step": 222 }, { "epoch": 0.10145586897179254, "grad_norm": 1.4826966236644097, "learning_rate": 4.99496760681039e-06, "loss": 0.0971, "step": 223 }, { "epoch": 0.10191082802547771, "grad_norm": 1.3244278108579797, "learning_rate": 4.994922183213009e-06, "loss": 0.0963, "step": 224 }, { "epoch": 0.10236578707916287, "grad_norm": 0.5464933779715734, "learning_rate": 4.9948765557418535e-06, "loss": 0.0357, "step": 225 }, { "epoch": 0.10282074613284804, "grad_norm": 1.1325271027713097, "learning_rate": 4.994830724400653e-06, "loss": 0.0756, "step": 226 }, { "epoch": 0.10327570518653321, "grad_norm": 0.7823528354045581, "learning_rate": 4.994784689193151e-06, "loss": 0.0609, "step": 227 }, { "epoch": 0.10373066424021839, "grad_norm": 0.6599438687201707, "learning_rate": 4.994738450123111e-06, "loss": 0.046, "step": 228 }, { "epoch": 0.10418562329390355, "grad_norm": 0.9666854434475629, "learning_rate": 4.994692007194312e-06, "loss": 0.0743, "step": 229 }, { "epoch": 0.10464058234758872, "grad_norm": 0.7151615241659314, "learning_rate": 4.994645360410547e-06, "loss": 0.0583, "step": 230 }, { "epoch": 0.10509554140127389, "grad_norm": 0.7773674174360427, "learning_rate": 4.99459850977563e-06, "loss": 0.0618, "step": 231 }, { "epoch": 0.10555050045495905, "grad_norm": 0.8418236580272198, "learning_rate": 4.994551455293388e-06, "loss": 0.046, "step": 232 }, { "epoch": 0.10600545950864422, "grad_norm": 0.9714541810445473, "learning_rate": 4.9945041969676654e-06, "loss": 0.0634, "step": 233 }, { "epoch": 0.10646041856232939, "grad_norm": 1.0109494927023708, "learning_rate": 4.994456734802325e-06, "loss": 0.0551, "step": 234 }, { "epoch": 0.10691537761601456, "grad_norm": 0.714933750259254, "learning_rate": 4.994409068801247e-06, "loss": 0.0593, "step": 235 }, { "epoch": 0.10737033666969972, "grad_norm": 1.998280137227604, "learning_rate": 4.994361198968323e-06, "loss": 0.0632, "step": 236 }, { "epoch": 0.1078252957233849, "grad_norm": 1.2708633718893245, "learning_rate": 4.994313125307466e-06, "loss": 0.0909, "step": 237 }, { "epoch": 0.10828025477707007, "grad_norm": 0.7903038049799667, "learning_rate": 4.994264847822605e-06, "loss": 0.0579, "step": 238 }, { "epoch": 0.10873521383075523, "grad_norm": 0.7076795429019287, "learning_rate": 4.994216366517684e-06, "loss": 0.0419, "step": 239 }, { "epoch": 0.1091901728844404, "grad_norm": 0.9078047157633448, "learning_rate": 4.994167681396667e-06, "loss": 0.0631, "step": 240 }, { "epoch": 0.10964513193812557, "grad_norm": 1.122407784822992, "learning_rate": 4.994118792463529e-06, "loss": 0.0771, "step": 241 }, { "epoch": 0.11010009099181074, "grad_norm": 1.3544092697698327, "learning_rate": 4.994069699722267e-06, "loss": 0.1034, "step": 242 }, { "epoch": 0.1105550500454959, "grad_norm": 1.0823051140179736, "learning_rate": 4.994020403176893e-06, "loss": 0.0737, "step": 243 }, { "epoch": 0.11101000909918107, "grad_norm": 1.4097890081473512, "learning_rate": 4.9939709028314345e-06, "loss": 0.0882, "step": 244 }, { "epoch": 0.11146496815286625, "grad_norm": 3.3536954759034883, "learning_rate": 4.993921198689935e-06, "loss": 0.0448, "step": 245 }, { "epoch": 0.11191992720655142, "grad_norm": 0.8141751078988797, "learning_rate": 4.993871290756459e-06, "loss": 0.053, "step": 246 }, { "epoch": 0.11237488626023658, "grad_norm": 0.7556149519633891, "learning_rate": 4.9938211790350835e-06, "loss": 0.053, "step": 247 }, { "epoch": 0.11282984531392175, "grad_norm": 1.028865867099704, "learning_rate": 4.993770863529902e-06, "loss": 0.068, "step": 248 }, { "epoch": 0.11328480436760692, "grad_norm": 0.8709129466992336, "learning_rate": 4.993720344245029e-06, "loss": 0.0533, "step": 249 }, { "epoch": 0.11373976342129208, "grad_norm": 0.8992015471183187, "learning_rate": 4.99366962118459e-06, "loss": 0.0589, "step": 250 }, { "epoch": 0.11419472247497725, "grad_norm": 0.8276539094244998, "learning_rate": 4.99361869435273e-06, "loss": 0.0537, "step": 251 }, { "epoch": 0.11464968152866242, "grad_norm": 1.1164497627583263, "learning_rate": 4.993567563753613e-06, "loss": 0.0627, "step": 252 }, { "epoch": 0.1151046405823476, "grad_norm": 1.049662917063972, "learning_rate": 4.993516229391414e-06, "loss": 0.0708, "step": 253 }, { "epoch": 0.11555959963603275, "grad_norm": 0.8007012653446455, "learning_rate": 4.993464691270331e-06, "loss": 0.036, "step": 254 }, { "epoch": 0.11601455868971793, "grad_norm": 1.0491396902879628, "learning_rate": 4.993412949394572e-06, "loss": 0.0564, "step": 255 }, { "epoch": 0.1164695177434031, "grad_norm": 1.0461746265014504, "learning_rate": 4.993361003768369e-06, "loss": 0.0547, "step": 256 }, { "epoch": 0.11692447679708826, "grad_norm": 0.7167785855145479, "learning_rate": 4.993308854395963e-06, "loss": 0.0543, "step": 257 }, { "epoch": 0.11737943585077343, "grad_norm": 1.7377303264454829, "learning_rate": 4.993256501281618e-06, "loss": 0.0385, "step": 258 }, { "epoch": 0.1178343949044586, "grad_norm": 0.9843734560261626, "learning_rate": 4.993203944429611e-06, "loss": 0.0793, "step": 259 }, { "epoch": 0.11828935395814377, "grad_norm": 0.7687699158665893, "learning_rate": 4.993151183844236e-06, "loss": 0.0554, "step": 260 }, { "epoch": 0.11874431301182893, "grad_norm": 0.8273770606193852, "learning_rate": 4.9930982195298065e-06, "loss": 0.0485, "step": 261 }, { "epoch": 0.1191992720655141, "grad_norm": 0.8576587444889947, "learning_rate": 4.9930450514906484e-06, "loss": 0.0668, "step": 262 }, { "epoch": 0.11965423111919928, "grad_norm": 0.8611188803026584, "learning_rate": 4.9929916797311075e-06, "loss": 0.0511, "step": 263 }, { "epoch": 0.12010919017288443, "grad_norm": 1.5171570240611278, "learning_rate": 4.992938104255545e-06, "loss": 0.105, "step": 264 }, { "epoch": 0.1205641492265696, "grad_norm": 0.9293458324727663, "learning_rate": 4.992884325068339e-06, "loss": 0.0519, "step": 265 }, { "epoch": 0.12101910828025478, "grad_norm": 0.854174247687424, "learning_rate": 4.992830342173882e-06, "loss": 0.0739, "step": 266 }, { "epoch": 0.12147406733393995, "grad_norm": 1.185366954699452, "learning_rate": 4.992776155576589e-06, "loss": 0.088, "step": 267 }, { "epoch": 0.12192902638762511, "grad_norm": 0.8266584460330494, "learning_rate": 4.992721765280884e-06, "loss": 0.0766, "step": 268 }, { "epoch": 0.12238398544131028, "grad_norm": 1.4759867391060453, "learning_rate": 4.992667171291215e-06, "loss": 0.0935, "step": 269 }, { "epoch": 0.12283894449499545, "grad_norm": 0.7694789949011869, "learning_rate": 4.992612373612042e-06, "loss": 0.0444, "step": 270 }, { "epoch": 0.12329390354868063, "grad_norm": 1.3788521475642475, "learning_rate": 4.99255737224784e-06, "loss": 0.0686, "step": 271 }, { "epoch": 0.12374886260236578, "grad_norm": 0.6815172893718315, "learning_rate": 4.9925021672031075e-06, "loss": 0.0597, "step": 272 }, { "epoch": 0.12420382165605096, "grad_norm": 1.0361010622054052, "learning_rate": 4.992446758482353e-06, "loss": 0.0577, "step": 273 }, { "epoch": 0.12465878070973613, "grad_norm": 1.3644605952379871, "learning_rate": 4.992391146090106e-06, "loss": 0.1058, "step": 274 }, { "epoch": 0.1251137397634213, "grad_norm": 0.6194563221594529, "learning_rate": 4.99233533003091e-06, "loss": 0.0481, "step": 275 }, { "epoch": 0.12556869881710647, "grad_norm": 0.8571229878526591, "learning_rate": 4.992279310309326e-06, "loss": 0.0811, "step": 276 }, { "epoch": 0.12602365787079162, "grad_norm": 1.543151015857885, "learning_rate": 4.9922230869299316e-06, "loss": 0.1184, "step": 277 }, { "epoch": 0.1264786169244768, "grad_norm": 0.8807612446920655, "learning_rate": 4.992166659897321e-06, "loss": 0.0629, "step": 278 }, { "epoch": 0.12693357597816196, "grad_norm": 0.754844132552438, "learning_rate": 4.992110029216106e-06, "loss": 0.0488, "step": 279 }, { "epoch": 0.12738853503184713, "grad_norm": 0.8268029175550342, "learning_rate": 4.992053194890914e-06, "loss": 0.0463, "step": 280 }, { "epoch": 0.1278434940855323, "grad_norm": 3.022212928870756, "learning_rate": 4.991996156926388e-06, "loss": 0.0622, "step": 281 }, { "epoch": 0.12829845313921748, "grad_norm": 11.09472751599815, "learning_rate": 4.9919389153271904e-06, "loss": 0.0631, "step": 282 }, { "epoch": 0.12875341219290265, "grad_norm": 5.099079327460766, "learning_rate": 4.991881470097998e-06, "loss": 0.0666, "step": 283 }, { "epoch": 0.1292083712465878, "grad_norm": 27.36348462792037, "learning_rate": 4.991823821243505e-06, "loss": 0.0601, "step": 284 }, { "epoch": 0.12966333030027297, "grad_norm": 4.3922238380370375, "learning_rate": 4.991765968768422e-06, "loss": 0.0801, "step": 285 }, { "epoch": 0.13011828935395814, "grad_norm": 2.2745288954264855, "learning_rate": 4.991707912677477e-06, "loss": 0.0461, "step": 286 }, { "epoch": 0.1305732484076433, "grad_norm": 4.103515429733392, "learning_rate": 4.991649652975414e-06, "loss": 0.0464, "step": 287 }, { "epoch": 0.13102820746132848, "grad_norm": 8.218943562506432, "learning_rate": 4.991591189666994e-06, "loss": 0.048, "step": 288 }, { "epoch": 0.13148316651501366, "grad_norm": 2.4966842175341917, "learning_rate": 4.991532522756993e-06, "loss": 0.0635, "step": 289 }, { "epoch": 0.13193812556869883, "grad_norm": 2.714522360598833, "learning_rate": 4.991473652250207e-06, "loss": 0.0416, "step": 290 }, { "epoch": 0.13239308462238397, "grad_norm": 0.7496525159208725, "learning_rate": 4.991414578151445e-06, "loss": 0.0558, "step": 291 }, { "epoch": 0.13284804367606914, "grad_norm": 1.9010591397052237, "learning_rate": 4.991355300465535e-06, "loss": 0.1319, "step": 292 }, { "epoch": 0.13330300272975432, "grad_norm": 1.0962153747662344, "learning_rate": 4.99129581919732e-06, "loss": 0.0597, "step": 293 }, { "epoch": 0.1337579617834395, "grad_norm": 0.682643165278525, "learning_rate": 4.9912361343516616e-06, "loss": 0.0392, "step": 294 }, { "epoch": 0.13421292083712466, "grad_norm": 0.8036831706718465, "learning_rate": 4.991176245933437e-06, "loss": 0.0572, "step": 295 }, { "epoch": 0.13466787989080983, "grad_norm": 0.799489770852785, "learning_rate": 4.9911161539475385e-06, "loss": 0.0533, "step": 296 }, { "epoch": 0.135122838944495, "grad_norm": 1.5836938564798495, "learning_rate": 4.991055858398879e-06, "loss": 0.0875, "step": 297 }, { "epoch": 0.13557779799818018, "grad_norm": 1.1468269055025655, "learning_rate": 4.990995359292384e-06, "loss": 0.0843, "step": 298 }, { "epoch": 0.13603275705186532, "grad_norm": 1.025610687153708, "learning_rate": 4.990934656632997e-06, "loss": 0.0767, "step": 299 }, { "epoch": 0.1364877161055505, "grad_norm": 1.135419351562697, "learning_rate": 4.990873750425679e-06, "loss": 0.0521, "step": 300 }, { "epoch": 0.13694267515923567, "grad_norm": 0.7857019349684609, "learning_rate": 4.990812640675406e-06, "loss": 0.0577, "step": 301 }, { "epoch": 0.13739763421292084, "grad_norm": 0.6543121694694685, "learning_rate": 4.990751327387174e-06, "loss": 0.0408, "step": 302 }, { "epoch": 0.137852593266606, "grad_norm": 0.9867579368206506, "learning_rate": 4.99068981056599e-06, "loss": 0.0644, "step": 303 }, { "epoch": 0.13830755232029118, "grad_norm": 0.9387206564680207, "learning_rate": 4.990628090216885e-06, "loss": 0.0725, "step": 304 }, { "epoch": 0.13876251137397635, "grad_norm": 0.6895906486970027, "learning_rate": 4.990566166344898e-06, "loss": 0.0444, "step": 305 }, { "epoch": 0.1392174704276615, "grad_norm": 0.8627162803317235, "learning_rate": 4.990504038955092e-06, "loss": 0.0639, "step": 306 }, { "epoch": 0.13967242948134667, "grad_norm": 0.9832854011829437, "learning_rate": 4.990441708052542e-06, "loss": 0.067, "step": 307 }, { "epoch": 0.14012738853503184, "grad_norm": 0.6828895359949346, "learning_rate": 4.9903791736423435e-06, "loss": 0.0511, "step": 308 }, { "epoch": 0.14058234758871702, "grad_norm": 0.949508820368659, "learning_rate": 4.9903164357296044e-06, "loss": 0.0586, "step": 309 }, { "epoch": 0.1410373066424022, "grad_norm": 0.8262401805570133, "learning_rate": 4.990253494319453e-06, "loss": 0.072, "step": 310 }, { "epoch": 0.14149226569608736, "grad_norm": 0.7329455864605506, "learning_rate": 4.990190349417032e-06, "loss": 0.0659, "step": 311 }, { "epoch": 0.14194722474977253, "grad_norm": 1.008005243411958, "learning_rate": 4.990127001027501e-06, "loss": 0.0682, "step": 312 }, { "epoch": 0.14240218380345768, "grad_norm": 1.3159038760119786, "learning_rate": 4.990063449156037e-06, "loss": 0.0485, "step": 313 }, { "epoch": 0.14285714285714285, "grad_norm": 1.053132819530921, "learning_rate": 4.989999693807832e-06, "loss": 0.0736, "step": 314 }, { "epoch": 0.14331210191082802, "grad_norm": 1.2097384821970267, "learning_rate": 4.989935734988098e-06, "loss": 0.0752, "step": 315 }, { "epoch": 0.1437670609645132, "grad_norm": 0.8883071525106219, "learning_rate": 4.98987157270206e-06, "loss": 0.0614, "step": 316 }, { "epoch": 0.14422202001819837, "grad_norm": 1.3457063090752772, "learning_rate": 4.989807206954961e-06, "loss": 0.0896, "step": 317 }, { "epoch": 0.14467697907188354, "grad_norm": 0.6077627024555071, "learning_rate": 4.9897426377520605e-06, "loss": 0.0426, "step": 318 }, { "epoch": 0.1451319381255687, "grad_norm": 1.4177858466419022, "learning_rate": 4.989677865098636e-06, "loss": 0.0922, "step": 319 }, { "epoch": 0.14558689717925385, "grad_norm": 0.5838535924114719, "learning_rate": 4.989612888999978e-06, "loss": 0.04, "step": 320 }, { "epoch": 0.14604185623293903, "grad_norm": 0.7991836602542821, "learning_rate": 4.9895477094614e-06, "loss": 0.0644, "step": 321 }, { "epoch": 0.1464968152866242, "grad_norm": 0.6309987592236359, "learning_rate": 4.989482326488225e-06, "loss": 0.0457, "step": 322 }, { "epoch": 0.14695177434030937, "grad_norm": 0.850157804894001, "learning_rate": 4.989416740085796e-06, "loss": 0.0706, "step": 323 }, { "epoch": 0.14740673339399454, "grad_norm": 0.8703332109039406, "learning_rate": 4.9893509502594735e-06, "loss": 0.0503, "step": 324 }, { "epoch": 0.14786169244767972, "grad_norm": 0.9357603198363387, "learning_rate": 4.9892849570146335e-06, "loss": 0.0799, "step": 325 }, { "epoch": 0.1483166515013649, "grad_norm": 0.9508555727006773, "learning_rate": 4.989218760356668e-06, "loss": 0.0703, "step": 326 }, { "epoch": 0.14877161055505003, "grad_norm": 0.8548982254979315, "learning_rate": 4.989152360290987e-06, "loss": 0.0706, "step": 327 }, { "epoch": 0.1492265696087352, "grad_norm": 1.1548758627037845, "learning_rate": 4.989085756823015e-06, "loss": 0.0868, "step": 328 }, { "epoch": 0.14968152866242038, "grad_norm": 0.872011841531817, "learning_rate": 4.989018949958197e-06, "loss": 0.0642, "step": 329 }, { "epoch": 0.15013648771610555, "grad_norm": 0.7767447334589991, "learning_rate": 4.98895193970199e-06, "loss": 0.0428, "step": 330 }, { "epoch": 0.15059144676979072, "grad_norm": 0.9215786343037755, "learning_rate": 4.9888847260598705e-06, "loss": 0.0652, "step": 331 }, { "epoch": 0.1510464058234759, "grad_norm": 1.0293746869379716, "learning_rate": 4.98881730903733e-06, "loss": 0.0768, "step": 332 }, { "epoch": 0.15150136487716107, "grad_norm": 1.2190824076232663, "learning_rate": 4.98874968863988e-06, "loss": 0.0746, "step": 333 }, { "epoch": 0.15195632393084624, "grad_norm": 0.8899729802614444, "learning_rate": 4.988681864873044e-06, "loss": 0.0638, "step": 334 }, { "epoch": 0.15241128298453138, "grad_norm": 0.8009499929718743, "learning_rate": 4.988613837742364e-06, "loss": 0.0556, "step": 335 }, { "epoch": 0.15286624203821655, "grad_norm": 1.0942561304100769, "learning_rate": 4.9885456072534015e-06, "loss": 0.0685, "step": 336 }, { "epoch": 0.15332120109190173, "grad_norm": 1.1210686024600067, "learning_rate": 4.988477173411728e-06, "loss": 0.0649, "step": 337 }, { "epoch": 0.1537761601455869, "grad_norm": 0.713128381997935, "learning_rate": 4.988408536222939e-06, "loss": 0.043, "step": 338 }, { "epoch": 0.15423111919927207, "grad_norm": 0.8820810335281195, "learning_rate": 4.9883396956926416e-06, "loss": 0.0545, "step": 339 }, { "epoch": 0.15468607825295724, "grad_norm": 0.7198251112806523, "learning_rate": 4.988270651826462e-06, "loss": 0.0419, "step": 340 }, { "epoch": 0.15514103730664242, "grad_norm": 0.9319745452557298, "learning_rate": 4.988201404630041e-06, "loss": 0.0556, "step": 341 }, { "epoch": 0.15559599636032756, "grad_norm": 0.7744733545189804, "learning_rate": 4.988131954109038e-06, "loss": 0.0566, "step": 342 }, { "epoch": 0.15605095541401273, "grad_norm": 1.2609547822192495, "learning_rate": 4.988062300269128e-06, "loss": 0.0931, "step": 343 }, { "epoch": 0.1565059144676979, "grad_norm": 1.0356035457639365, "learning_rate": 4.987992443116003e-06, "loss": 0.0592, "step": 344 }, { "epoch": 0.15696087352138308, "grad_norm": 0.8613107982126194, "learning_rate": 4.987922382655372e-06, "loss": 0.0599, "step": 345 }, { "epoch": 0.15741583257506825, "grad_norm": 1.2274429381178749, "learning_rate": 4.987852118892958e-06, "loss": 0.104, "step": 346 }, { "epoch": 0.15787079162875342, "grad_norm": 0.8982827327342306, "learning_rate": 4.987781651834503e-06, "loss": 0.0777, "step": 347 }, { "epoch": 0.1583257506824386, "grad_norm": 1.124267218302162, "learning_rate": 4.987710981485768e-06, "loss": 0.0815, "step": 348 }, { "epoch": 0.15878070973612374, "grad_norm": 1.1417083606361687, "learning_rate": 4.987640107852525e-06, "loss": 0.0968, "step": 349 }, { "epoch": 0.1592356687898089, "grad_norm": 0.7137928465125194, "learning_rate": 4.987569030940567e-06, "loss": 0.0525, "step": 350 }, { "epoch": 0.15969062784349408, "grad_norm": 0.8074447940975472, "learning_rate": 4.987497750755702e-06, "loss": 0.0478, "step": 351 }, { "epoch": 0.16014558689717925, "grad_norm": 1.320795972993318, "learning_rate": 4.987426267303753e-06, "loss": 0.0814, "step": 352 }, { "epoch": 0.16060054595086443, "grad_norm": 0.956458465296858, "learning_rate": 4.987354580590563e-06, "loss": 0.0728, "step": 353 }, { "epoch": 0.1610555050045496, "grad_norm": 0.9487388071568301, "learning_rate": 4.987282690621991e-06, "loss": 0.0778, "step": 354 }, { "epoch": 0.16151046405823477, "grad_norm": 0.8111568286998416, "learning_rate": 4.987210597403907e-06, "loss": 0.0634, "step": 355 }, { "epoch": 0.16196542311191992, "grad_norm": 0.9291291865293426, "learning_rate": 4.987138300942208e-06, "loss": 0.057, "step": 356 }, { "epoch": 0.1624203821656051, "grad_norm": 0.7796831533037398, "learning_rate": 4.987065801242798e-06, "loss": 0.0591, "step": 357 }, { "epoch": 0.16287534121929026, "grad_norm": 1.0091637666603208, "learning_rate": 4.986993098311601e-06, "loss": 0.0712, "step": 358 }, { "epoch": 0.16333030027297543, "grad_norm": 0.9599752405823201, "learning_rate": 4.986920192154561e-06, "loss": 0.0712, "step": 359 }, { "epoch": 0.1637852593266606, "grad_norm": 0.6975593533750986, "learning_rate": 4.986847082777632e-06, "loss": 0.0489, "step": 360 }, { "epoch": 0.16424021838034578, "grad_norm": 0.8407792898194115, "learning_rate": 4.986773770186791e-06, "loss": 0.0687, "step": 361 }, { "epoch": 0.16469517743403095, "grad_norm": 1.16032280422667, "learning_rate": 4.986700254388027e-06, "loss": 0.0814, "step": 362 }, { "epoch": 0.1651501364877161, "grad_norm": 0.6789989236352713, "learning_rate": 4.986626535387349e-06, "loss": 0.0502, "step": 363 }, { "epoch": 0.16560509554140126, "grad_norm": 0.8858819178004838, "learning_rate": 4.9865526131907795e-06, "loss": 0.0584, "step": 364 }, { "epoch": 0.16606005459508644, "grad_norm": 1.0159257224317502, "learning_rate": 4.9864784878043595e-06, "loss": 0.0828, "step": 365 }, { "epoch": 0.1665150136487716, "grad_norm": 1.1632391007958518, "learning_rate": 4.986404159234146e-06, "loss": 0.0693, "step": 366 }, { "epoch": 0.16696997270245678, "grad_norm": 0.7286212082146628, "learning_rate": 4.986329627486213e-06, "loss": 0.048, "step": 367 }, { "epoch": 0.16742493175614195, "grad_norm": 1.1675091585135315, "learning_rate": 4.986254892566652e-06, "loss": 0.0831, "step": 368 }, { "epoch": 0.16787989080982713, "grad_norm": 0.7791126867293955, "learning_rate": 4.9861799544815684e-06, "loss": 0.0511, "step": 369 }, { "epoch": 0.16833484986351227, "grad_norm": 0.8594476885535768, "learning_rate": 4.986104813237086e-06, "loss": 0.0605, "step": 370 }, { "epoch": 0.16878980891719744, "grad_norm": 0.8510456749795352, "learning_rate": 4.986029468839346e-06, "loss": 0.0568, "step": 371 }, { "epoch": 0.16924476797088261, "grad_norm": 1.1617139473891909, "learning_rate": 4.985953921294505e-06, "loss": 0.09, "step": 372 }, { "epoch": 0.1696997270245678, "grad_norm": 0.6957569576277562, "learning_rate": 4.985878170608736e-06, "loss": 0.038, "step": 373 }, { "epoch": 0.17015468607825296, "grad_norm": 0.8584263131532073, "learning_rate": 4.985802216788228e-06, "loss": 0.0517, "step": 374 }, { "epoch": 0.17060964513193813, "grad_norm": 0.9366771679720911, "learning_rate": 4.98572605983919e-06, "loss": 0.063, "step": 375 }, { "epoch": 0.1710646041856233, "grad_norm": 0.5935251092125957, "learning_rate": 4.985649699767842e-06, "loss": 0.0399, "step": 376 }, { "epoch": 0.17151956323930848, "grad_norm": 0.7556873935071919, "learning_rate": 4.985573136580427e-06, "loss": 0.0606, "step": 377 }, { "epoch": 0.17197452229299362, "grad_norm": 0.723085424895094, "learning_rate": 4.9854963702832e-06, "loss": 0.0498, "step": 378 }, { "epoch": 0.1724294813466788, "grad_norm": 0.9057911616547558, "learning_rate": 4.985419400882433e-06, "loss": 0.0733, "step": 379 }, { "epoch": 0.17288444040036396, "grad_norm": 1.0911724774245748, "learning_rate": 4.985342228384418e-06, "loss": 0.0974, "step": 380 }, { "epoch": 0.17333939945404914, "grad_norm": 15.867955316807802, "learning_rate": 4.985264852795459e-06, "loss": 0.4597, "step": 381 }, { "epoch": 0.1737943585077343, "grad_norm": 0.8242169703714594, "learning_rate": 4.98518727412188e-06, "loss": 0.0592, "step": 382 }, { "epoch": 0.17424931756141948, "grad_norm": 1.189476180626615, "learning_rate": 4.98510949237002e-06, "loss": 0.0871, "step": 383 }, { "epoch": 0.17470427661510465, "grad_norm": 0.9035387689173863, "learning_rate": 4.985031507546234e-06, "loss": 0.0659, "step": 384 }, { "epoch": 0.1751592356687898, "grad_norm": 1.5548450607275692, "learning_rate": 4.984953319656896e-06, "loss": 0.102, "step": 385 }, { "epoch": 0.17561419472247497, "grad_norm": 0.9148861743530409, "learning_rate": 4.984874928708395e-06, "loss": 0.0621, "step": 386 }, { "epoch": 0.17606915377616014, "grad_norm": 1.0088623446062757, "learning_rate": 4.984796334707136e-06, "loss": 0.0801, "step": 387 }, { "epoch": 0.17652411282984531, "grad_norm": 7.099087459170151, "learning_rate": 4.984717537659542e-06, "loss": 0.1139, "step": 388 }, { "epoch": 0.1769790718835305, "grad_norm": 0.6271204554143699, "learning_rate": 4.984638537572052e-06, "loss": 0.0362, "step": 389 }, { "epoch": 0.17743403093721566, "grad_norm": 0.9099126199173307, "learning_rate": 4.984559334451121e-06, "loss": 0.0589, "step": 390 }, { "epoch": 0.17788898999090083, "grad_norm": 0.9635928903258919, "learning_rate": 4.984479928303221e-06, "loss": 0.0485, "step": 391 }, { "epoch": 0.17834394904458598, "grad_norm": 0.8684293064054923, "learning_rate": 4.984400319134841e-06, "loss": 0.0488, "step": 392 }, { "epoch": 0.17879890809827115, "grad_norm": 1.490825595774446, "learning_rate": 4.984320506952487e-06, "loss": 0.1164, "step": 393 }, { "epoch": 0.17925386715195632, "grad_norm": 1.0210666975638372, "learning_rate": 4.9842404917626796e-06, "loss": 0.0765, "step": 394 }, { "epoch": 0.1797088262056415, "grad_norm": 0.7827897024774737, "learning_rate": 4.984160273571959e-06, "loss": 0.0627, "step": 395 }, { "epoch": 0.18016378525932666, "grad_norm": 0.9460976796008799, "learning_rate": 4.9840798523868785e-06, "loss": 0.0802, "step": 396 }, { "epoch": 0.18061874431301184, "grad_norm": 0.6974747481172566, "learning_rate": 4.983999228214011e-06, "loss": 0.0483, "step": 397 }, { "epoch": 0.181073703366697, "grad_norm": 0.7442577439773002, "learning_rate": 4.983918401059943e-06, "loss": 0.0501, "step": 398 }, { "epoch": 0.18152866242038215, "grad_norm": 1.001863981150214, "learning_rate": 4.983837370931282e-06, "loss": 0.0866, "step": 399 }, { "epoch": 0.18198362147406733, "grad_norm": 1.258993794296855, "learning_rate": 4.983756137834647e-06, "loss": 0.1164, "step": 400 }, { "epoch": 0.1824385805277525, "grad_norm": 1.1296307149258726, "learning_rate": 4.9836747017766765e-06, "loss": 0.0698, "step": 401 }, { "epoch": 0.18289353958143767, "grad_norm": 0.9299919208142283, "learning_rate": 4.983593062764027e-06, "loss": 0.0767, "step": 402 }, { "epoch": 0.18334849863512284, "grad_norm": 1.6483380962062835, "learning_rate": 4.983511220803367e-06, "loss": 0.0982, "step": 403 }, { "epoch": 0.18380345768880801, "grad_norm": 0.7951232146562915, "learning_rate": 4.983429175901386e-06, "loss": 0.0621, "step": 404 }, { "epoch": 0.1842584167424932, "grad_norm": 0.7346583458526271, "learning_rate": 4.983346928064788e-06, "loss": 0.0485, "step": 405 }, { "epoch": 0.18471337579617833, "grad_norm": 0.8488964995265393, "learning_rate": 4.9832644773002935e-06, "loss": 0.0697, "step": 406 }, { "epoch": 0.1851683348498635, "grad_norm": 0.637978257841365, "learning_rate": 4.98318182361464e-06, "loss": 0.0578, "step": 407 }, { "epoch": 0.18562329390354868, "grad_norm": 1.006460769017827, "learning_rate": 4.9830989670145825e-06, "loss": 0.0741, "step": 408 }, { "epoch": 0.18607825295723385, "grad_norm": 1.0063850758607982, "learning_rate": 4.9830159075068905e-06, "loss": 0.0698, "step": 409 }, { "epoch": 0.18653321201091902, "grad_norm": 0.9365632618002147, "learning_rate": 4.9829326450983514e-06, "loss": 0.0779, "step": 410 }, { "epoch": 0.1869881710646042, "grad_norm": 0.8773564274313461, "learning_rate": 4.98284917979577e-06, "loss": 0.0608, "step": 411 }, { "epoch": 0.18744313011828936, "grad_norm": 0.9057984183185465, "learning_rate": 4.9827655116059656e-06, "loss": 0.0639, "step": 412 }, { "epoch": 0.18789808917197454, "grad_norm": 0.8657789325497686, "learning_rate": 4.9826816405357755e-06, "loss": 0.0749, "step": 413 }, { "epoch": 0.18835304822565968, "grad_norm": 0.5817294435867961, "learning_rate": 4.982597566592054e-06, "loss": 0.0353, "step": 414 }, { "epoch": 0.18880800727934485, "grad_norm": 1.2277963790590036, "learning_rate": 4.982513289781671e-06, "loss": 0.091, "step": 415 }, { "epoch": 0.18926296633303002, "grad_norm": 0.7616764372047586, "learning_rate": 4.982428810111512e-06, "loss": 0.0597, "step": 416 }, { "epoch": 0.1897179253867152, "grad_norm": 0.710019161677026, "learning_rate": 4.9823441275884814e-06, "loss": 0.0535, "step": 417 }, { "epoch": 0.19017288444040037, "grad_norm": 1.1202371935797844, "learning_rate": 4.982259242219499e-06, "loss": 0.0643, "step": 418 }, { "epoch": 0.19062784349408554, "grad_norm": 0.6803190221634923, "learning_rate": 4.9821741540115006e-06, "loss": 0.0483, "step": 419 }, { "epoch": 0.1910828025477707, "grad_norm": 0.8014131027464055, "learning_rate": 4.982088862971441e-06, "loss": 0.0703, "step": 420 }, { "epoch": 0.19153776160145586, "grad_norm": 0.960552266983122, "learning_rate": 4.982003369106287e-06, "loss": 0.0709, "step": 421 }, { "epoch": 0.19199272065514103, "grad_norm": 0.6179685927519944, "learning_rate": 4.981917672423028e-06, "loss": 0.0407, "step": 422 }, { "epoch": 0.1924476797088262, "grad_norm": 0.9538296833436659, "learning_rate": 4.981831772928664e-06, "loss": 0.0681, "step": 423 }, { "epoch": 0.19290263876251137, "grad_norm": 1.076872796407403, "learning_rate": 4.981745670630216e-06, "loss": 0.0918, "step": 424 }, { "epoch": 0.19335759781619655, "grad_norm": 0.8486267027177018, "learning_rate": 4.981659365534718e-06, "loss": 0.081, "step": 425 }, { "epoch": 0.19381255686988172, "grad_norm": 1.2668354345440433, "learning_rate": 4.981572857649225e-06, "loss": 0.0855, "step": 426 }, { "epoch": 0.1942675159235669, "grad_norm": 0.785685618330662, "learning_rate": 4.981486146980804e-06, "loss": 0.0525, "step": 427 }, { "epoch": 0.19472247497725204, "grad_norm": 0.9012661112199176, "learning_rate": 4.9813992335365415e-06, "loss": 0.0616, "step": 428 }, { "epoch": 0.1951774340309372, "grad_norm": 0.9140326707870835, "learning_rate": 4.98131211732354e-06, "loss": 0.0742, "step": 429 }, { "epoch": 0.19563239308462238, "grad_norm": 0.8802115121731895, "learning_rate": 4.981224798348917e-06, "loss": 0.0543, "step": 430 }, { "epoch": 0.19608735213830755, "grad_norm": 1.2263655680320666, "learning_rate": 4.981137276619809e-06, "loss": 0.1, "step": 431 }, { "epoch": 0.19654231119199272, "grad_norm": 0.7179258520773776, "learning_rate": 4.9810495521433675e-06, "loss": 0.0563, "step": 432 }, { "epoch": 0.1969972702456779, "grad_norm": 1.2006165727982114, "learning_rate": 4.9809616249267616e-06, "loss": 0.0919, "step": 433 }, { "epoch": 0.19745222929936307, "grad_norm": 1.0426641838922892, "learning_rate": 4.980873494977174e-06, "loss": 0.0845, "step": 434 }, { "epoch": 0.1979071883530482, "grad_norm": 0.8009974020959663, "learning_rate": 4.98078516230181e-06, "loss": 0.0495, "step": 435 }, { "epoch": 0.19836214740673339, "grad_norm": 0.8146116166212912, "learning_rate": 4.980696626907884e-06, "loss": 0.0656, "step": 436 }, { "epoch": 0.19881710646041856, "grad_norm": 0.8146964454257942, "learning_rate": 4.980607888802633e-06, "loss": 0.0717, "step": 437 }, { "epoch": 0.19927206551410373, "grad_norm": 0.8473418815819729, "learning_rate": 4.980518947993307e-06, "loss": 0.0701, "step": 438 }, { "epoch": 0.1997270245677889, "grad_norm": 0.8132123262524923, "learning_rate": 4.980429804487176e-06, "loss": 0.0657, "step": 439 }, { "epoch": 0.20018198362147407, "grad_norm": 0.7631308196097977, "learning_rate": 4.980340458291521e-06, "loss": 0.0519, "step": 440 }, { "epoch": 0.20063694267515925, "grad_norm": 0.7710009886187632, "learning_rate": 4.980250909413646e-06, "loss": 0.0668, "step": 441 }, { "epoch": 0.2010919017288444, "grad_norm": 0.8960590823111618, "learning_rate": 4.980161157860867e-06, "loss": 0.066, "step": 442 }, { "epoch": 0.20154686078252956, "grad_norm": 1.0148659081855533, "learning_rate": 4.980071203640519e-06, "loss": 0.0666, "step": 443 }, { "epoch": 0.20200181983621474, "grad_norm": 0.6157365971883945, "learning_rate": 4.979981046759952e-06, "loss": 0.0441, "step": 444 }, { "epoch": 0.2024567788898999, "grad_norm": 0.8862364575439057, "learning_rate": 4.979890687226533e-06, "loss": 0.0638, "step": 445 }, { "epoch": 0.20291173794358508, "grad_norm": 1.051789940808801, "learning_rate": 4.979800125047647e-06, "loss": 0.0571, "step": 446 }, { "epoch": 0.20336669699727025, "grad_norm": 0.8963335794848035, "learning_rate": 4.979709360230692e-06, "loss": 0.0706, "step": 447 }, { "epoch": 0.20382165605095542, "grad_norm": 0.8639092050645877, "learning_rate": 4.979618392783087e-06, "loss": 0.0535, "step": 448 }, { "epoch": 0.20427661510464057, "grad_norm": 0.630704913013139, "learning_rate": 4.979527222712266e-06, "loss": 0.0553, "step": 449 }, { "epoch": 0.20473157415832574, "grad_norm": 0.6653631844503811, "learning_rate": 4.9794358500256765e-06, "loss": 0.0438, "step": 450 }, { "epoch": 0.2051865332120109, "grad_norm": 0.8074584078493093, "learning_rate": 4.979344274730786e-06, "loss": 0.0607, "step": 451 }, { "epoch": 0.20564149226569609, "grad_norm": 1.1020725070982913, "learning_rate": 4.979252496835079e-06, "loss": 0.0812, "step": 452 }, { "epoch": 0.20609645131938126, "grad_norm": 1.2231182771798559, "learning_rate": 4.979160516346054e-06, "loss": 0.1074, "step": 453 }, { "epoch": 0.20655141037306643, "grad_norm": 26.716723850026153, "learning_rate": 4.979068333271227e-06, "loss": 0.8002, "step": 454 }, { "epoch": 0.2070063694267516, "grad_norm": 1.2123236026672213, "learning_rate": 4.978975947618131e-06, "loss": 0.0788, "step": 455 }, { "epoch": 0.20746132848043677, "grad_norm": 0.8671125203100531, "learning_rate": 4.978883359394316e-06, "loss": 0.0902, "step": 456 }, { "epoch": 0.20791628753412192, "grad_norm": 0.9848601155594614, "learning_rate": 4.978790568607347e-06, "loss": 0.0606, "step": 457 }, { "epoch": 0.2083712465878071, "grad_norm": 1.013839640652733, "learning_rate": 4.9786975752648076e-06, "loss": 0.0873, "step": 458 }, { "epoch": 0.20882620564149226, "grad_norm": 0.7483252407807567, "learning_rate": 4.978604379374295e-06, "loss": 0.0592, "step": 459 }, { "epoch": 0.20928116469517744, "grad_norm": 0.7178737508101655, "learning_rate": 4.978510980943427e-06, "loss": 0.0506, "step": 460 }, { "epoch": 0.2097361237488626, "grad_norm": 1.0919247632044238, "learning_rate": 4.978417379979834e-06, "loss": 0.0778, "step": 461 }, { "epoch": 0.21019108280254778, "grad_norm": 0.8331653357443332, "learning_rate": 4.978323576491165e-06, "loss": 0.0577, "step": 462 }, { "epoch": 0.21064604185623295, "grad_norm": 0.8152928496306786, "learning_rate": 4.978229570485085e-06, "loss": 0.072, "step": 463 }, { "epoch": 0.2111010009099181, "grad_norm": 0.751813285906743, "learning_rate": 4.978135361969276e-06, "loss": 0.0649, "step": 464 }, { "epoch": 0.21155595996360327, "grad_norm": 0.8232278152234197, "learning_rate": 4.9780409509514375e-06, "loss": 0.0642, "step": 465 }, { "epoch": 0.21201091901728844, "grad_norm": 1.5303665195432214, "learning_rate": 4.977946337439282e-06, "loss": 0.1217, "step": 466 }, { "epoch": 0.2124658780709736, "grad_norm": 0.9269370490140525, "learning_rate": 4.9778515214405436e-06, "loss": 0.081, "step": 467 }, { "epoch": 0.21292083712465878, "grad_norm": 0.8830556120481512, "learning_rate": 4.977756502962967e-06, "loss": 0.0684, "step": 468 }, { "epoch": 0.21337579617834396, "grad_norm": 0.6113061227600053, "learning_rate": 4.97766128201432e-06, "loss": 0.0446, "step": 469 }, { "epoch": 0.21383075523202913, "grad_norm": 0.6077789311617329, "learning_rate": 4.977565858602381e-06, "loss": 0.0554, "step": 470 }, { "epoch": 0.21428571428571427, "grad_norm": 0.8598515142264441, "learning_rate": 4.977470232734949e-06, "loss": 0.0727, "step": 471 }, { "epoch": 0.21474067333939945, "grad_norm": 0.8043286169945988, "learning_rate": 4.977374404419838e-06, "loss": 0.0592, "step": 472 }, { "epoch": 0.21519563239308462, "grad_norm": 0.7551382062036437, "learning_rate": 4.977278373664877e-06, "loss": 0.0571, "step": 473 }, { "epoch": 0.2156505914467698, "grad_norm": 1.8211283606473743, "learning_rate": 4.977182140477916e-06, "loss": 0.1033, "step": 474 }, { "epoch": 0.21610555050045496, "grad_norm": 0.7146087276289771, "learning_rate": 4.977085704866817e-06, "loss": 0.0462, "step": 475 }, { "epoch": 0.21656050955414013, "grad_norm": 0.6542895317184714, "learning_rate": 4.97698906683946e-06, "loss": 0.061, "step": 476 }, { "epoch": 0.2170154686078253, "grad_norm": 1.0732518420250663, "learning_rate": 4.9768922264037435e-06, "loss": 0.0845, "step": 477 }, { "epoch": 0.21747042766151045, "grad_norm": 0.6769767303273837, "learning_rate": 4.976795183567579e-06, "loss": 0.0484, "step": 478 }, { "epoch": 0.21792538671519562, "grad_norm": 0.6792925907901064, "learning_rate": 4.976697938338898e-06, "loss": 0.0479, "step": 479 }, { "epoch": 0.2183803457688808, "grad_norm": 0.550587338837319, "learning_rate": 4.976600490725645e-06, "loss": 0.0402, "step": 480 }, { "epoch": 0.21883530482256597, "grad_norm": 0.9934557115485821, "learning_rate": 4.976502840735785e-06, "loss": 0.1096, "step": 481 }, { "epoch": 0.21929026387625114, "grad_norm": 0.7026152824587227, "learning_rate": 4.976404988377297e-06, "loss": 0.0442, "step": 482 }, { "epoch": 0.2197452229299363, "grad_norm": 1.1796498075270252, "learning_rate": 4.976306933658176e-06, "loss": 0.0896, "step": 483 }, { "epoch": 0.22020018198362148, "grad_norm": 0.9196991108702705, "learning_rate": 4.976208676586435e-06, "loss": 0.0903, "step": 484 }, { "epoch": 0.22065514103730663, "grad_norm": 0.9221909008992407, "learning_rate": 4.976110217170104e-06, "loss": 0.061, "step": 485 }, { "epoch": 0.2211101000909918, "grad_norm": 0.8446946807888076, "learning_rate": 4.976011555417228e-06, "loss": 0.06, "step": 486 }, { "epoch": 0.22156505914467697, "grad_norm": 0.8008200895651435, "learning_rate": 4.975912691335869e-06, "loss": 0.0552, "step": 487 }, { "epoch": 0.22202001819836215, "grad_norm": 0.7897004108366357, "learning_rate": 4.975813624934106e-06, "loss": 0.0524, "step": 488 }, { "epoch": 0.22247497725204732, "grad_norm": 0.7656059256782066, "learning_rate": 4.975714356220035e-06, "loss": 0.0532, "step": 489 }, { "epoch": 0.2229299363057325, "grad_norm": 0.49990009073007735, "learning_rate": 4.975614885201766e-06, "loss": 0.0335, "step": 490 }, { "epoch": 0.22338489535941766, "grad_norm": 0.7764965839211172, "learning_rate": 4.975515211887429e-06, "loss": 0.0663, "step": 491 }, { "epoch": 0.22383985441310283, "grad_norm": 0.8335023150964008, "learning_rate": 4.9754153362851684e-06, "loss": 0.0635, "step": 492 }, { "epoch": 0.22429481346678798, "grad_norm": 1.362631121260362, "learning_rate": 4.975315258403145e-06, "loss": 0.1184, "step": 493 }, { "epoch": 0.22474977252047315, "grad_norm": 0.8072718888075444, "learning_rate": 4.975214978249537e-06, "loss": 0.0575, "step": 494 }, { "epoch": 0.22520473157415832, "grad_norm": 0.7237599062848806, "learning_rate": 4.975114495832539e-06, "loss": 0.0629, "step": 495 }, { "epoch": 0.2256596906278435, "grad_norm": 0.9013757169049615, "learning_rate": 4.975013811160362e-06, "loss": 0.0641, "step": 496 }, { "epoch": 0.22611464968152867, "grad_norm": 1.046688141426079, "learning_rate": 4.974912924241233e-06, "loss": 0.0679, "step": 497 }, { "epoch": 0.22656960873521384, "grad_norm": 0.7549334371309422, "learning_rate": 4.974811835083397e-06, "loss": 0.0619, "step": 498 }, { "epoch": 0.227024567788899, "grad_norm": 1.4092663615099252, "learning_rate": 4.974710543695114e-06, "loss": 0.0907, "step": 499 }, { "epoch": 0.22747952684258416, "grad_norm": 1.2767203765961839, "learning_rate": 4.974609050084661e-06, "loss": 0.1037, "step": 500 }, { "epoch": 0.22793448589626933, "grad_norm": 0.957265553607594, "learning_rate": 4.974507354260332e-06, "loss": 0.0841, "step": 501 }, { "epoch": 0.2283894449499545, "grad_norm": 1.0285318937850472, "learning_rate": 4.974405456230436e-06, "loss": 0.0876, "step": 502 }, { "epoch": 0.22884440400363967, "grad_norm": 0.9438000836090487, "learning_rate": 4.974303356003301e-06, "loss": 0.0618, "step": 503 }, { "epoch": 0.22929936305732485, "grad_norm": 0.7641433481492992, "learning_rate": 4.974201053587268e-06, "loss": 0.0623, "step": 504 }, { "epoch": 0.22975432211101002, "grad_norm": 0.7211862506979909, "learning_rate": 4.9740985489907005e-06, "loss": 0.0458, "step": 505 }, { "epoch": 0.2302092811646952, "grad_norm": 1.3113691041435898, "learning_rate": 4.973995842221971e-06, "loss": 0.0865, "step": 506 }, { "epoch": 0.23066424021838033, "grad_norm": 1.1027187330835053, "learning_rate": 4.973892933289476e-06, "loss": 0.0817, "step": 507 }, { "epoch": 0.2311191992720655, "grad_norm": 0.8000847819873458, "learning_rate": 4.97378982220162e-06, "loss": 0.0639, "step": 508 }, { "epoch": 0.23157415832575068, "grad_norm": 0.5709614643890362, "learning_rate": 4.973686508966832e-06, "loss": 0.0427, "step": 509 }, { "epoch": 0.23202911737943585, "grad_norm": 0.6348346044427912, "learning_rate": 4.973582993593554e-06, "loss": 0.0453, "step": 510 }, { "epoch": 0.23248407643312102, "grad_norm": 0.7080077445614887, "learning_rate": 4.973479276090244e-06, "loss": 0.0567, "step": 511 }, { "epoch": 0.2329390354868062, "grad_norm": 0.586722983901754, "learning_rate": 4.973375356465378e-06, "loss": 0.0398, "step": 512 }, { "epoch": 0.23339399454049137, "grad_norm": 0.9373759345632122, "learning_rate": 4.973271234727447e-06, "loss": 0.083, "step": 513 }, { "epoch": 0.2338489535941765, "grad_norm": 0.7290102387520916, "learning_rate": 4.97316691088496e-06, "loss": 0.0573, "step": 514 }, { "epoch": 0.23430391264786168, "grad_norm": 1.2047650698868653, "learning_rate": 4.973062384946442e-06, "loss": 0.0979, "step": 515 }, { "epoch": 0.23475887170154686, "grad_norm": 0.5553854533375087, "learning_rate": 4.9729576569204345e-06, "loss": 0.0493, "step": 516 }, { "epoch": 0.23521383075523203, "grad_norm": 0.966683679171784, "learning_rate": 4.972852726815495e-06, "loss": 0.0744, "step": 517 }, { "epoch": 0.2356687898089172, "grad_norm": 0.8972567842292303, "learning_rate": 4.972747594640197e-06, "loss": 0.0822, "step": 518 }, { "epoch": 0.23612374886260237, "grad_norm": 0.9532248896529997, "learning_rate": 4.9726422604031335e-06, "loss": 0.0628, "step": 519 }, { "epoch": 0.23657870791628755, "grad_norm": 0.5831731409388041, "learning_rate": 4.97253672411291e-06, "loss": 0.0499, "step": 520 }, { "epoch": 0.2370336669699727, "grad_norm": 0.7629148584956371, "learning_rate": 4.972430985778152e-06, "loss": 0.0502, "step": 521 }, { "epoch": 0.23748862602365786, "grad_norm": 0.8867114815888714, "learning_rate": 4.972325045407499e-06, "loss": 0.0551, "step": 522 }, { "epoch": 0.23794358507734303, "grad_norm": 1.2463480840549028, "learning_rate": 4.972218903009608e-06, "loss": 0.0715, "step": 523 }, { "epoch": 0.2383985441310282, "grad_norm": 0.782156462915191, "learning_rate": 4.972112558593153e-06, "loss": 0.0658, "step": 524 }, { "epoch": 0.23885350318471338, "grad_norm": 0.5674610459457798, "learning_rate": 4.972006012166823e-06, "loss": 0.0443, "step": 525 }, { "epoch": 0.23930846223839855, "grad_norm": 0.6676557313621811, "learning_rate": 4.971899263739326e-06, "loss": 0.052, "step": 526 }, { "epoch": 0.23976342129208372, "grad_norm": 0.8996461781463584, "learning_rate": 4.971792313319384e-06, "loss": 0.0761, "step": 527 }, { "epoch": 0.24021838034576887, "grad_norm": 0.7869388715576839, "learning_rate": 4.971685160915737e-06, "loss": 0.059, "step": 528 }, { "epoch": 0.24067333939945404, "grad_norm": 0.8601250360554993, "learning_rate": 4.971577806537139e-06, "loss": 0.058, "step": 529 }, { "epoch": 0.2411282984531392, "grad_norm": 0.860384363291072, "learning_rate": 4.971470250192366e-06, "loss": 0.0746, "step": 530 }, { "epoch": 0.24158325750682438, "grad_norm": 17.481585256275345, "learning_rate": 4.9713624918902045e-06, "loss": 0.3357, "step": 531 }, { "epoch": 0.24203821656050956, "grad_norm": 1.3228769141545746, "learning_rate": 4.971254531639461e-06, "loss": 0.0978, "step": 532 }, { "epoch": 0.24249317561419473, "grad_norm": 0.9022991420443233, "learning_rate": 4.971146369448957e-06, "loss": 0.073, "step": 533 }, { "epoch": 0.2429481346678799, "grad_norm": 0.8487996347147105, "learning_rate": 4.971038005327532e-06, "loss": 0.0772, "step": 534 }, { "epoch": 0.24340309372156507, "grad_norm": 1.0939700661439853, "learning_rate": 4.970929439284039e-06, "loss": 0.1052, "step": 535 }, { "epoch": 0.24385805277525022, "grad_norm": 1.1117793169544092, "learning_rate": 4.970820671327351e-06, "loss": 0.0838, "step": 536 }, { "epoch": 0.2443130118289354, "grad_norm": 0.5711568883528185, "learning_rate": 4.9707117014663565e-06, "loss": 0.0477, "step": 537 }, { "epoch": 0.24476797088262056, "grad_norm": 0.9911963887990124, "learning_rate": 4.97060252970996e-06, "loss": 0.0859, "step": 538 }, { "epoch": 0.24522292993630573, "grad_norm": 0.8786877928757788, "learning_rate": 4.970493156067081e-06, "loss": 0.0672, "step": 539 }, { "epoch": 0.2456778889899909, "grad_norm": 0.6358718673962386, "learning_rate": 4.970383580546658e-06, "loss": 0.049, "step": 540 }, { "epoch": 0.24613284804367608, "grad_norm": 0.9673038276315246, "learning_rate": 4.970273803157645e-06, "loss": 0.0789, "step": 541 }, { "epoch": 0.24658780709736125, "grad_norm": 0.7896663626576268, "learning_rate": 4.970163823909013e-06, "loss": 0.0636, "step": 542 }, { "epoch": 0.2470427661510464, "grad_norm": 0.7725841407720596, "learning_rate": 4.970053642809748e-06, "loss": 0.0591, "step": 543 }, { "epoch": 0.24749772520473157, "grad_norm": 0.8834486709832678, "learning_rate": 4.969943259868853e-06, "loss": 0.0741, "step": 544 }, { "epoch": 0.24795268425841674, "grad_norm": 0.9862513700188255, "learning_rate": 4.969832675095351e-06, "loss": 0.0733, "step": 545 }, { "epoch": 0.2484076433121019, "grad_norm": 0.9230048911450578, "learning_rate": 4.969721888498275e-06, "loss": 0.0784, "step": 546 }, { "epoch": 0.24886260236578708, "grad_norm": 0.678321429576158, "learning_rate": 4.96961090008668e-06, "loss": 0.0548, "step": 547 }, { "epoch": 0.24931756141947226, "grad_norm": 1.0377618196684284, "learning_rate": 4.969499709869635e-06, "loss": 0.0972, "step": 548 }, { "epoch": 0.24977252047315743, "grad_norm": 1.0401408232919482, "learning_rate": 4.969388317856225e-06, "loss": 0.0803, "step": 549 }, { "epoch": 0.2502274795268426, "grad_norm": 1.1187089275098543, "learning_rate": 4.969276724055554e-06, "loss": 0.0959, "step": 550 }, { "epoch": 0.25068243858052774, "grad_norm": 0.955462869329459, "learning_rate": 4.969164928476741e-06, "loss": 0.0676, "step": 551 }, { "epoch": 0.25113739763421294, "grad_norm": 0.8046461909524141, "learning_rate": 4.969052931128919e-06, "loss": 0.0648, "step": 552 }, { "epoch": 0.2515923566878981, "grad_norm": 0.7081920862352523, "learning_rate": 4.968940732021243e-06, "loss": 0.0603, "step": 553 }, { "epoch": 0.25204731574158323, "grad_norm": 0.9857688144173427, "learning_rate": 4.9688283311628795e-06, "loss": 0.0918, "step": 554 }, { "epoch": 0.25250227479526843, "grad_norm": 0.8534813080817202, "learning_rate": 4.968715728563014e-06, "loss": 0.0679, "step": 555 }, { "epoch": 0.2529572338489536, "grad_norm": 0.5525293734820541, "learning_rate": 4.968602924230847e-06, "loss": 0.0413, "step": 556 }, { "epoch": 0.2534121929026388, "grad_norm": 1.2973130655518506, "learning_rate": 4.968489918175598e-06, "loss": 0.085, "step": 557 }, { "epoch": 0.2538671519563239, "grad_norm": 1.1050509785585005, "learning_rate": 4.9683767104065014e-06, "loss": 0.0758, "step": 558 }, { "epoch": 0.2543221110100091, "grad_norm": 0.7040398410425142, "learning_rate": 4.968263300932806e-06, "loss": 0.0484, "step": 559 }, { "epoch": 0.25477707006369427, "grad_norm": 0.5860142844568907, "learning_rate": 4.968149689763781e-06, "loss": 0.0477, "step": 560 }, { "epoch": 0.2552320291173794, "grad_norm": 0.7720444359609591, "learning_rate": 4.968035876908708e-06, "loss": 0.0716, "step": 561 }, { "epoch": 0.2556869881710646, "grad_norm": 0.9073150271174998, "learning_rate": 4.967921862376889e-06, "loss": 0.0775, "step": 562 }, { "epoch": 0.25614194722474976, "grad_norm": 1.0634349883750702, "learning_rate": 4.9678076461776415e-06, "loss": 0.0843, "step": 563 }, { "epoch": 0.25659690627843496, "grad_norm": 1.0056095668838196, "learning_rate": 4.9676932283202965e-06, "loss": 0.0845, "step": 564 }, { "epoch": 0.2570518653321201, "grad_norm": 1.0935445939907518, "learning_rate": 4.967578608814205e-06, "loss": 0.0844, "step": 565 }, { "epoch": 0.2575068243858053, "grad_norm": 0.736642742743355, "learning_rate": 4.9674637876687345e-06, "loss": 0.0683, "step": 566 }, { "epoch": 0.25796178343949044, "grad_norm": 0.5647018247667355, "learning_rate": 4.967348764893265e-06, "loss": 0.0453, "step": 567 }, { "epoch": 0.2584167424931756, "grad_norm": 0.882122047098411, "learning_rate": 4.967233540497197e-06, "loss": 0.0575, "step": 568 }, { "epoch": 0.2588717015468608, "grad_norm": 0.8934709872440615, "learning_rate": 4.967118114489946e-06, "loss": 0.0562, "step": 569 }, { "epoch": 0.25932666060054593, "grad_norm": 0.7282868782108531, "learning_rate": 4.967002486880944e-06, "loss": 0.0486, "step": 570 }, { "epoch": 0.25978161965423113, "grad_norm": 1.0082182066998666, "learning_rate": 4.966886657679641e-06, "loss": 0.0766, "step": 571 }, { "epoch": 0.2602365787079163, "grad_norm": 1.348224587830696, "learning_rate": 4.966770626895499e-06, "loss": 0.0845, "step": 572 }, { "epoch": 0.2606915377616015, "grad_norm": 0.9025549046907797, "learning_rate": 4.966654394538002e-06, "loss": 0.0738, "step": 573 }, { "epoch": 0.2611464968152866, "grad_norm": 0.639234711788688, "learning_rate": 4.966537960616646e-06, "loss": 0.0495, "step": 574 }, { "epoch": 0.26160145586897177, "grad_norm": 1.0233503597101892, "learning_rate": 4.9664213251409486e-06, "loss": 0.0637, "step": 575 }, { "epoch": 0.26205641492265697, "grad_norm": 1.1124626991947715, "learning_rate": 4.9663044881204375e-06, "loss": 0.1045, "step": 576 }, { "epoch": 0.2625113739763421, "grad_norm": 0.837344371555646, "learning_rate": 4.9661874495646615e-06, "loss": 0.0646, "step": 577 }, { "epoch": 0.2629663330300273, "grad_norm": 0.7068866993603214, "learning_rate": 4.9660702094831845e-06, "loss": 0.0619, "step": 578 }, { "epoch": 0.26342129208371245, "grad_norm": 0.9495992534959607, "learning_rate": 4.965952767885587e-06, "loss": 0.0635, "step": 579 }, { "epoch": 0.26387625113739765, "grad_norm": 1.1302962930077667, "learning_rate": 4.965835124781465e-06, "loss": 0.0852, "step": 580 }, { "epoch": 0.2643312101910828, "grad_norm": 0.6086948754935466, "learning_rate": 4.965717280180432e-06, "loss": 0.0551, "step": 581 }, { "epoch": 0.26478616924476794, "grad_norm": 0.7060150486109749, "learning_rate": 4.965599234092118e-06, "loss": 0.0546, "step": 582 }, { "epoch": 0.26524112829845314, "grad_norm": 0.8543624689211352, "learning_rate": 4.96548098652617e-06, "loss": 0.0754, "step": 583 }, { "epoch": 0.2656960873521383, "grad_norm": 1.0333432760993717, "learning_rate": 4.965362537492249e-06, "loss": 0.0864, "step": 584 }, { "epoch": 0.2661510464058235, "grad_norm": 0.7262201210260119, "learning_rate": 4.9652438870000356e-06, "loss": 0.0555, "step": 585 }, { "epoch": 0.26660600545950863, "grad_norm": 0.5794267973617044, "learning_rate": 4.965125035059224e-06, "loss": 0.0553, "step": 586 }, { "epoch": 0.26706096451319383, "grad_norm": 0.7938840996429771, "learning_rate": 4.965005981679527e-06, "loss": 0.0624, "step": 587 }, { "epoch": 0.267515923566879, "grad_norm": 1.1569186716649804, "learning_rate": 4.964886726870673e-06, "loss": 0.0905, "step": 588 }, { "epoch": 0.2679708826205642, "grad_norm": 0.8131461154142043, "learning_rate": 4.964767270642407e-06, "loss": 0.0542, "step": 589 }, { "epoch": 0.2684258416742493, "grad_norm": 0.9434951271078357, "learning_rate": 4.964647613004491e-06, "loss": 0.0747, "step": 590 }, { "epoch": 0.26888080072793447, "grad_norm": 0.7006034344602099, "learning_rate": 4.964527753966702e-06, "loss": 0.0512, "step": 591 }, { "epoch": 0.26933575978161967, "grad_norm": 0.684347542401468, "learning_rate": 4.964407693538834e-06, "loss": 0.0573, "step": 592 }, { "epoch": 0.2697907188353048, "grad_norm": 1.0140148730488754, "learning_rate": 4.9642874317307e-06, "loss": 0.0843, "step": 593 }, { "epoch": 0.27024567788899, "grad_norm": 0.8814518099865631, "learning_rate": 4.964166968552124e-06, "loss": 0.0874, "step": 594 }, { "epoch": 0.27070063694267515, "grad_norm": 0.7465912736193613, "learning_rate": 4.9640463040129525e-06, "loss": 0.0516, "step": 595 }, { "epoch": 0.27115559599636035, "grad_norm": 0.6714302701088581, "learning_rate": 4.963925438123044e-06, "loss": 0.0454, "step": 596 }, { "epoch": 0.2716105550500455, "grad_norm": 0.8301899302632495, "learning_rate": 4.963804370892276e-06, "loss": 0.0647, "step": 597 }, { "epoch": 0.27206551410373064, "grad_norm": 1.0257561895944438, "learning_rate": 4.9636831023305405e-06, "loss": 0.087, "step": 598 }, { "epoch": 0.27252047315741584, "grad_norm": 0.6785463668955102, "learning_rate": 4.963561632447748e-06, "loss": 0.0478, "step": 599 }, { "epoch": 0.272975432211101, "grad_norm": 0.6400387189893691, "learning_rate": 4.9634399612538255e-06, "loss": 0.0461, "step": 600 }, { "epoch": 0.2734303912647862, "grad_norm": 12.91966872067954, "learning_rate": 4.963318088758714e-06, "loss": 0.2613, "step": 601 }, { "epoch": 0.27388535031847133, "grad_norm": 0.797345248624046, "learning_rate": 4.963196014972371e-06, "loss": 0.0525, "step": 602 }, { "epoch": 0.27434030937215653, "grad_norm": 0.7395484231820286, "learning_rate": 4.963073739904775e-06, "loss": 0.0555, "step": 603 }, { "epoch": 0.2747952684258417, "grad_norm": 0.7278963125143824, "learning_rate": 4.962951263565915e-06, "loss": 0.0516, "step": 604 }, { "epoch": 0.2752502274795268, "grad_norm": 1.1277517505176968, "learning_rate": 4.962828585965801e-06, "loss": 0.0682, "step": 605 }, { "epoch": 0.275705186533212, "grad_norm": 0.8383324875513333, "learning_rate": 4.962705707114457e-06, "loss": 0.0653, "step": 606 }, { "epoch": 0.27616014558689717, "grad_norm": 0.8259808296763246, "learning_rate": 4.962582627021923e-06, "loss": 0.067, "step": 607 }, { "epoch": 0.27661510464058237, "grad_norm": 1.0342239300300777, "learning_rate": 4.962459345698258e-06, "loss": 0.0818, "step": 608 }, { "epoch": 0.2770700636942675, "grad_norm": 0.9575696366695832, "learning_rate": 4.962335863153537e-06, "loss": 0.0774, "step": 609 }, { "epoch": 0.2775250227479527, "grad_norm": 0.6545425176058414, "learning_rate": 4.962212179397847e-06, "loss": 0.0559, "step": 610 }, { "epoch": 0.27797998180163785, "grad_norm": 0.762093168945694, "learning_rate": 4.962088294441299e-06, "loss": 0.0486, "step": 611 }, { "epoch": 0.278434940855323, "grad_norm": 0.7471732165438408, "learning_rate": 4.9619642082940135e-06, "loss": 0.0653, "step": 612 }, { "epoch": 0.2788898999090082, "grad_norm": 0.9260147407911207, "learning_rate": 4.9618399209661305e-06, "loss": 0.0793, "step": 613 }, { "epoch": 0.27934485896269334, "grad_norm": 0.7947481326693611, "learning_rate": 4.961715432467807e-06, "loss": 0.0494, "step": 614 }, { "epoch": 0.27979981801637854, "grad_norm": 0.676048071191175, "learning_rate": 4.961590742809216e-06, "loss": 0.0499, "step": 615 }, { "epoch": 0.2802547770700637, "grad_norm": 0.776818740518446, "learning_rate": 4.961465852000545e-06, "loss": 0.0622, "step": 616 }, { "epoch": 0.2807097361237489, "grad_norm": 0.9447441776692052, "learning_rate": 4.961340760052001e-06, "loss": 0.0605, "step": 617 }, { "epoch": 0.28116469517743403, "grad_norm": 0.7631458630471165, "learning_rate": 4.961215466973806e-06, "loss": 0.0517, "step": 618 }, { "epoch": 0.2816196542311192, "grad_norm": 1.4586634447892357, "learning_rate": 4.961089972776197e-06, "loss": 0.1213, "step": 619 }, { "epoch": 0.2820746132848044, "grad_norm": 0.7305250205616287, "learning_rate": 4.9609642774694285e-06, "loss": 0.0491, "step": 620 }, { "epoch": 0.2825295723384895, "grad_norm": 1.0406398369078378, "learning_rate": 4.960838381063774e-06, "loss": 0.0663, "step": 621 }, { "epoch": 0.2829845313921747, "grad_norm": 1.2055388650663057, "learning_rate": 4.960712283569521e-06, "loss": 0.0954, "step": 622 }, { "epoch": 0.28343949044585987, "grad_norm": 1.0747699071981542, "learning_rate": 4.960585984996971e-06, "loss": 0.0708, "step": 623 }, { "epoch": 0.28389444949954507, "grad_norm": 0.8880949030661993, "learning_rate": 4.960459485356447e-06, "loss": 0.0863, "step": 624 }, { "epoch": 0.2843494085532302, "grad_norm": 0.7242068952577403, "learning_rate": 4.960332784658285e-06, "loss": 0.0626, "step": 625 }, { "epoch": 0.28480436760691535, "grad_norm": 1.1695122446283712, "learning_rate": 4.960205882912839e-06, "loss": 0.0891, "step": 626 }, { "epoch": 0.28525932666060055, "grad_norm": 1.3443620909335443, "learning_rate": 4.9600787801304785e-06, "loss": 0.125, "step": 627 }, { "epoch": 0.2857142857142857, "grad_norm": 1.155929017304206, "learning_rate": 4.959951476321589e-06, "loss": 0.0871, "step": 628 }, { "epoch": 0.2861692447679709, "grad_norm": 0.5929895200550217, "learning_rate": 4.959823971496575e-06, "loss": 0.0552, "step": 629 }, { "epoch": 0.28662420382165604, "grad_norm": 2.1679171173353, "learning_rate": 4.959696265665853e-06, "loss": 0.1544, "step": 630 }, { "epoch": 0.28707916287534124, "grad_norm": 0.5970059996178373, "learning_rate": 4.959568358839862e-06, "loss": 0.0341, "step": 631 }, { "epoch": 0.2875341219290264, "grad_norm": 0.7134802025387853, "learning_rate": 4.95944025102905e-06, "loss": 0.0561, "step": 632 }, { "epoch": 0.28798908098271153, "grad_norm": 1.0949999376535642, "learning_rate": 4.959311942243888e-06, "loss": 0.0847, "step": 633 }, { "epoch": 0.28844404003639673, "grad_norm": 0.6935143421827713, "learning_rate": 4.95918343249486e-06, "loss": 0.0527, "step": 634 }, { "epoch": 0.2888989990900819, "grad_norm": 0.9012545498209729, "learning_rate": 4.959054721792469e-06, "loss": 0.0765, "step": 635 }, { "epoch": 0.2893539581437671, "grad_norm": 0.718784967956377, "learning_rate": 4.958925810147231e-06, "loss": 0.0521, "step": 636 }, { "epoch": 0.2898089171974522, "grad_norm": 0.8036871187269504, "learning_rate": 4.958796697569679e-06, "loss": 0.0575, "step": 637 }, { "epoch": 0.2902638762511374, "grad_norm": 0.7083108170821438, "learning_rate": 4.958667384070365e-06, "loss": 0.0474, "step": 638 }, { "epoch": 0.29071883530482256, "grad_norm": 0.8852706168812431, "learning_rate": 4.958537869659855e-06, "loss": 0.078, "step": 639 }, { "epoch": 0.2911737943585077, "grad_norm": 0.7427839621337862, "learning_rate": 4.958408154348734e-06, "loss": 0.0481, "step": 640 }, { "epoch": 0.2916287534121929, "grad_norm": 0.8545728670719058, "learning_rate": 4.9582782381476e-06, "loss": 0.0775, "step": 641 }, { "epoch": 0.29208371246587805, "grad_norm": 1.112593592536602, "learning_rate": 4.958148121067071e-06, "loss": 0.1085, "step": 642 }, { "epoch": 0.29253867151956325, "grad_norm": 0.8382409456326524, "learning_rate": 4.9580178031177775e-06, "loss": 0.0834, "step": 643 }, { "epoch": 0.2929936305732484, "grad_norm": 0.7181010571245596, "learning_rate": 4.9578872843103694e-06, "loss": 0.0706, "step": 644 }, { "epoch": 0.2934485896269336, "grad_norm": 0.838533684989998, "learning_rate": 4.957756564655513e-06, "loss": 0.0699, "step": 645 }, { "epoch": 0.29390354868061874, "grad_norm": 0.8489478508314325, "learning_rate": 4.957625644163888e-06, "loss": 0.0786, "step": 646 }, { "epoch": 0.2943585077343039, "grad_norm": 0.880567536766304, "learning_rate": 4.957494522846194e-06, "loss": 0.0634, "step": 647 }, { "epoch": 0.2948134667879891, "grad_norm": 0.9985911504789262, "learning_rate": 4.957363200713146e-06, "loss": 0.0971, "step": 648 }, { "epoch": 0.29526842584167423, "grad_norm": 0.7465438435071795, "learning_rate": 4.957231677775475e-06, "loss": 0.0543, "step": 649 }, { "epoch": 0.29572338489535943, "grad_norm": 1.0873754674099565, "learning_rate": 4.957099954043928e-06, "loss": 0.0975, "step": 650 }, { "epoch": 0.2961783439490446, "grad_norm": 0.7743109510832679, "learning_rate": 4.956968029529269e-06, "loss": 0.0782, "step": 651 }, { "epoch": 0.2966333030027298, "grad_norm": 0.8294361612806938, "learning_rate": 4.956835904242277e-06, "loss": 0.0741, "step": 652 }, { "epoch": 0.2970882620564149, "grad_norm": 0.5971734320200014, "learning_rate": 4.9567035781937516e-06, "loss": 0.0382, "step": 653 }, { "epoch": 0.29754322111010006, "grad_norm": 0.9121379516261049, "learning_rate": 4.9565710513945024e-06, "loss": 0.0639, "step": 654 }, { "epoch": 0.29799818016378526, "grad_norm": 0.9983314125142588, "learning_rate": 4.956438323855362e-06, "loss": 0.0745, "step": 655 }, { "epoch": 0.2984531392174704, "grad_norm": 1.057472958552687, "learning_rate": 4.956305395587174e-06, "loss": 0.091, "step": 656 }, { "epoch": 0.2989080982711556, "grad_norm": 0.7245370640267725, "learning_rate": 4.956172266600802e-06, "loss": 0.0566, "step": 657 }, { "epoch": 0.29936305732484075, "grad_norm": 0.7068763180795751, "learning_rate": 4.956038936907125e-06, "loss": 0.0523, "step": 658 }, { "epoch": 0.29981801637852595, "grad_norm": 0.7580044270083526, "learning_rate": 4.955905406517036e-06, "loss": 0.0515, "step": 659 }, { "epoch": 0.3002729754322111, "grad_norm": 0.9984766712753593, "learning_rate": 4.95577167544145e-06, "loss": 0.0793, "step": 660 }, { "epoch": 0.30072793448589624, "grad_norm": 0.9743764266009726, "learning_rate": 4.955637743691291e-06, "loss": 0.0726, "step": 661 }, { "epoch": 0.30118289353958144, "grad_norm": 0.9004872852534804, "learning_rate": 4.955503611277506e-06, "loss": 0.0652, "step": 662 }, { "epoch": 0.3016378525932666, "grad_norm": 0.79764221013725, "learning_rate": 4.955369278211055e-06, "loss": 0.0536, "step": 663 }, { "epoch": 0.3020928116469518, "grad_norm": 0.9464070117627001, "learning_rate": 4.955234744502914e-06, "loss": 0.0662, "step": 664 }, { "epoch": 0.30254777070063693, "grad_norm": 1.0516204782864038, "learning_rate": 4.955100010164079e-06, "loss": 0.081, "step": 665 }, { "epoch": 0.30300272975432213, "grad_norm": 0.7302891890803844, "learning_rate": 4.954965075205557e-06, "loss": 0.0513, "step": 666 }, { "epoch": 0.3034576888080073, "grad_norm": 2.2645221771727537, "learning_rate": 4.9548299396383755e-06, "loss": 0.1286, "step": 667 }, { "epoch": 0.3039126478616925, "grad_norm": 0.8238624251883593, "learning_rate": 4.954694603473578e-06, "loss": 0.0514, "step": 668 }, { "epoch": 0.3043676069153776, "grad_norm": 1.1280026909584604, "learning_rate": 4.954559066722222e-06, "loss": 0.0872, "step": 669 }, { "epoch": 0.30482256596906276, "grad_norm": 0.9142124471934124, "learning_rate": 4.954423329395385e-06, "loss": 0.0795, "step": 670 }, { "epoch": 0.30527752502274796, "grad_norm": 1.0203604655093028, "learning_rate": 4.954287391504156e-06, "loss": 0.0887, "step": 671 }, { "epoch": 0.3057324840764331, "grad_norm": 1.137669290207854, "learning_rate": 4.9541512530596455e-06, "loss": 0.0946, "step": 672 }, { "epoch": 0.3061874431301183, "grad_norm": 0.8645784943070317, "learning_rate": 4.954014914072978e-06, "loss": 0.069, "step": 673 }, { "epoch": 0.30664240218380345, "grad_norm": 0.7612707675408733, "learning_rate": 4.9538783745552934e-06, "loss": 0.0655, "step": 674 }, { "epoch": 0.30709736123748865, "grad_norm": 1.266400392804822, "learning_rate": 4.95374163451775e-06, "loss": 0.0993, "step": 675 }, { "epoch": 0.3075523202911738, "grad_norm": 0.9601214063413259, "learning_rate": 4.953604693971521e-06, "loss": 0.066, "step": 676 }, { "epoch": 0.30800727934485894, "grad_norm": 0.7836256655266565, "learning_rate": 4.953467552927798e-06, "loss": 0.042, "step": 677 }, { "epoch": 0.30846223839854414, "grad_norm": 1.300589530536382, "learning_rate": 4.9533302113977845e-06, "loss": 0.0899, "step": 678 }, { "epoch": 0.3089171974522293, "grad_norm": 1.1474483826754185, "learning_rate": 4.9531926693927055e-06, "loss": 0.0808, "step": 679 }, { "epoch": 0.3093721565059145, "grad_norm": 0.9596950226202976, "learning_rate": 4.953054926923801e-06, "loss": 0.0795, "step": 680 }, { "epoch": 0.30982711555959963, "grad_norm": 1.0372662479293318, "learning_rate": 4.952916984002325e-06, "loss": 0.0726, "step": 681 }, { "epoch": 0.31028207461328483, "grad_norm": 0.9537762571435, "learning_rate": 4.95277884063955e-06, "loss": 0.0784, "step": 682 }, { "epoch": 0.31073703366697, "grad_norm": 0.7652431915975989, "learning_rate": 4.952640496846766e-06, "loss": 0.0736, "step": 683 }, { "epoch": 0.3111919927206551, "grad_norm": 0.6958333798668543, "learning_rate": 4.952501952635276e-06, "loss": 0.0563, "step": 684 }, { "epoch": 0.3116469517743403, "grad_norm": 1.1475385694550302, "learning_rate": 4.952363208016402e-06, "loss": 0.0969, "step": 685 }, { "epoch": 0.31210191082802546, "grad_norm": 0.8003142285493542, "learning_rate": 4.952224263001482e-06, "loss": 0.0499, "step": 686 }, { "epoch": 0.31255686988171066, "grad_norm": 0.5867807916718473, "learning_rate": 4.952085117601868e-06, "loss": 0.0477, "step": 687 }, { "epoch": 0.3130118289353958, "grad_norm": 1.0024535273535888, "learning_rate": 4.951945771828933e-06, "loss": 0.0999, "step": 688 }, { "epoch": 0.313466787989081, "grad_norm": 0.6050221715881425, "learning_rate": 4.951806225694061e-06, "loss": 0.059, "step": 689 }, { "epoch": 0.31392174704276615, "grad_norm": 0.9849063687092052, "learning_rate": 4.951666479208658e-06, "loss": 0.072, "step": 690 }, { "epoch": 0.3143767060964513, "grad_norm": 0.6941254969326264, "learning_rate": 4.951526532384141e-06, "loss": 0.0561, "step": 691 }, { "epoch": 0.3148316651501365, "grad_norm": 0.9726304738330778, "learning_rate": 4.951386385231946e-06, "loss": 0.0717, "step": 692 }, { "epoch": 0.31528662420382164, "grad_norm": 0.6220356445541609, "learning_rate": 4.951246037763528e-06, "loss": 0.0468, "step": 693 }, { "epoch": 0.31574158325750684, "grad_norm": 0.8409955006728991, "learning_rate": 4.9511054899903524e-06, "loss": 0.0547, "step": 694 }, { "epoch": 0.316196542311192, "grad_norm": 0.9382334215030735, "learning_rate": 4.950964741923905e-06, "loss": 0.0741, "step": 695 }, { "epoch": 0.3166515013648772, "grad_norm": 0.782635605280389, "learning_rate": 4.950823793575688e-06, "loss": 0.0581, "step": 696 }, { "epoch": 0.31710646041856233, "grad_norm": 1.2001834327909027, "learning_rate": 4.950682644957218e-06, "loss": 0.0963, "step": 697 }, { "epoch": 0.3175614194722475, "grad_norm": 1.1857201792014638, "learning_rate": 4.9505412960800295e-06, "loss": 0.0883, "step": 698 }, { "epoch": 0.3180163785259327, "grad_norm": 0.8662471412292134, "learning_rate": 4.950399746955673e-06, "loss": 0.0707, "step": 699 }, { "epoch": 0.3184713375796178, "grad_norm": 0.886786111665171, "learning_rate": 4.950257997595716e-06, "loss": 0.0647, "step": 700 }, { "epoch": 0.318926296633303, "grad_norm": 0.945494756521827, "learning_rate": 4.950116048011739e-06, "loss": 0.0682, "step": 701 }, { "epoch": 0.31938125568698816, "grad_norm": 0.5880480229225298, "learning_rate": 4.949973898215344e-06, "loss": 0.0371, "step": 702 }, { "epoch": 0.31983621474067336, "grad_norm": 0.8807599912178138, "learning_rate": 4.949831548218146e-06, "loss": 0.0685, "step": 703 }, { "epoch": 0.3202911737943585, "grad_norm": 1.0002963777426177, "learning_rate": 4.949688998031777e-06, "loss": 0.0727, "step": 704 }, { "epoch": 0.32074613284804365, "grad_norm": 0.7255329110919103, "learning_rate": 4.949546247667886e-06, "loss": 0.05, "step": 705 }, { "epoch": 0.32120109190172885, "grad_norm": 0.9082181101140028, "learning_rate": 4.949403297138137e-06, "loss": 0.0649, "step": 706 }, { "epoch": 0.321656050955414, "grad_norm": 0.805531599078662, "learning_rate": 4.949260146454212e-06, "loss": 0.0729, "step": 707 }, { "epoch": 0.3221110100090992, "grad_norm": 0.854802575466473, "learning_rate": 4.94911679562781e-06, "loss": 0.0562, "step": 708 }, { "epoch": 0.32256596906278434, "grad_norm": 1.0984796609960896, "learning_rate": 4.948973244670643e-06, "loss": 0.0725, "step": 709 }, { "epoch": 0.32302092811646954, "grad_norm": 0.7556324629138267, "learning_rate": 4.948829493594441e-06, "loss": 0.0544, "step": 710 }, { "epoch": 0.3234758871701547, "grad_norm": 0.5920439294431348, "learning_rate": 4.9486855424109524e-06, "loss": 0.0411, "step": 711 }, { "epoch": 0.32393084622383983, "grad_norm": 0.6808571640088359, "learning_rate": 4.948541391131939e-06, "loss": 0.0593, "step": 712 }, { "epoch": 0.32438580527752503, "grad_norm": 0.6475999202690299, "learning_rate": 4.948397039769181e-06, "loss": 0.0368, "step": 713 }, { "epoch": 0.3248407643312102, "grad_norm": 0.5475655472838014, "learning_rate": 4.948252488334474e-06, "loss": 0.034, "step": 714 }, { "epoch": 0.3252957233848954, "grad_norm": 0.5762149944161961, "learning_rate": 4.948107736839629e-06, "loss": 0.0499, "step": 715 }, { "epoch": 0.3257506824385805, "grad_norm": 0.7900459721209473, "learning_rate": 4.947962785296476e-06, "loss": 0.0774, "step": 716 }, { "epoch": 0.3262056414922657, "grad_norm": 1.0482560180703868, "learning_rate": 4.9478176337168594e-06, "loss": 0.0836, "step": 717 }, { "epoch": 0.32666060054595086, "grad_norm": 1.401558028095644, "learning_rate": 4.9476722821126386e-06, "loss": 0.1193, "step": 718 }, { "epoch": 0.327115559599636, "grad_norm": 0.771148987620668, "learning_rate": 4.9475267304956945e-06, "loss": 0.0689, "step": 719 }, { "epoch": 0.3275705186533212, "grad_norm": 1.0131729662598652, "learning_rate": 4.947380978877917e-06, "loss": 0.0755, "step": 720 }, { "epoch": 0.32802547770700635, "grad_norm": 0.9498239138116331, "learning_rate": 4.947235027271219e-06, "loss": 0.098, "step": 721 }, { "epoch": 0.32848043676069155, "grad_norm": 0.6332324914490356, "learning_rate": 4.9470888756875265e-06, "loss": 0.0447, "step": 722 }, { "epoch": 0.3289353958143767, "grad_norm": 0.7271597537238114, "learning_rate": 4.946942524138782e-06, "loss": 0.0483, "step": 723 }, { "epoch": 0.3293903548680619, "grad_norm": 0.8308219382266808, "learning_rate": 4.946795972636944e-06, "loss": 0.0631, "step": 724 }, { "epoch": 0.32984531392174704, "grad_norm": 1.0116359294484205, "learning_rate": 4.94664922119399e-06, "loss": 0.0896, "step": 725 }, { "epoch": 0.3303002729754322, "grad_norm": 0.7583834250675704, "learning_rate": 4.94650226982191e-06, "loss": 0.0598, "step": 726 }, { "epoch": 0.3307552320291174, "grad_norm": 1.3027980178767569, "learning_rate": 4.9463551185327115e-06, "loss": 0.1405, "step": 727 }, { "epoch": 0.33121019108280253, "grad_norm": 0.7112357392323109, "learning_rate": 4.946207767338422e-06, "loss": 0.0536, "step": 728 }, { "epoch": 0.33166515013648773, "grad_norm": 0.6618610768771548, "learning_rate": 4.9460602162510805e-06, "loss": 0.0516, "step": 729 }, { "epoch": 0.3321201091901729, "grad_norm": 0.7624917659824647, "learning_rate": 4.945912465282744e-06, "loss": 0.0586, "step": 730 }, { "epoch": 0.3325750682438581, "grad_norm": 0.9623565075253229, "learning_rate": 4.945764514445487e-06, "loss": 0.0966, "step": 731 }, { "epoch": 0.3330300272975432, "grad_norm": 1.158591935392886, "learning_rate": 4.9456163637513986e-06, "loss": 0.0762, "step": 732 }, { "epoch": 0.33348498635122836, "grad_norm": 1.0561029184500623, "learning_rate": 4.945468013212585e-06, "loss": 0.0736, "step": 733 }, { "epoch": 0.33393994540491356, "grad_norm": 0.8218447404278697, "learning_rate": 4.945319462841169e-06, "loss": 0.0716, "step": 734 }, { "epoch": 0.3343949044585987, "grad_norm": 0.9168665120865833, "learning_rate": 4.94517071264929e-06, "loss": 0.0726, "step": 735 }, { "epoch": 0.3348498635122839, "grad_norm": 0.7363681967868748, "learning_rate": 4.945021762649102e-06, "loss": 0.043, "step": 736 }, { "epoch": 0.33530482256596905, "grad_norm": 0.725798212067647, "learning_rate": 4.9448726128527776e-06, "loss": 0.0636, "step": 737 }, { "epoch": 0.33575978161965425, "grad_norm": 0.7941303692620212, "learning_rate": 4.944723263272504e-06, "loss": 0.0695, "step": 738 }, { "epoch": 0.3362147406733394, "grad_norm": 0.834891383255751, "learning_rate": 4.944573713920485e-06, "loss": 0.0712, "step": 739 }, { "epoch": 0.33666969972702454, "grad_norm": 2.4254280630054783, "learning_rate": 4.944423964808943e-06, "loss": 0.151, "step": 740 }, { "epoch": 0.33712465878070974, "grad_norm": 0.8195094139530902, "learning_rate": 4.944274015950113e-06, "loss": 0.0631, "step": 741 }, { "epoch": 0.3375796178343949, "grad_norm": 0.886058873471566, "learning_rate": 4.944123867356249e-06, "loss": 0.0535, "step": 742 }, { "epoch": 0.3380345768880801, "grad_norm": 1.069728524255416, "learning_rate": 4.943973519039619e-06, "loss": 0.0931, "step": 743 }, { "epoch": 0.33848953594176523, "grad_norm": 0.6746904422385723, "learning_rate": 4.943822971012511e-06, "loss": 0.0473, "step": 744 }, { "epoch": 0.33894449499545043, "grad_norm": 0.6169402633729492, "learning_rate": 4.943672223287226e-06, "loss": 0.0409, "step": 745 }, { "epoch": 0.3393994540491356, "grad_norm": 1.1379727803205435, "learning_rate": 4.9435212758760815e-06, "loss": 0.0974, "step": 746 }, { "epoch": 0.3398544131028208, "grad_norm": 0.6974504786844343, "learning_rate": 4.943370128791413e-06, "loss": 0.0484, "step": 747 }, { "epoch": 0.3403093721565059, "grad_norm": 0.7031181253608232, "learning_rate": 4.943218782045574e-06, "loss": 0.063, "step": 748 }, { "epoch": 0.34076433121019106, "grad_norm": 0.9627083360594578, "learning_rate": 4.943067235650927e-06, "loss": 0.08, "step": 749 }, { "epoch": 0.34121929026387626, "grad_norm": 0.9077046325521676, "learning_rate": 4.942915489619859e-06, "loss": 0.0789, "step": 750 }, { "epoch": 0.3416742493175614, "grad_norm": 0.8348448351284486, "learning_rate": 4.9427635439647704e-06, "loss": 0.0729, "step": 751 }, { "epoch": 0.3421292083712466, "grad_norm": 0.8858012000453745, "learning_rate": 4.942611398698075e-06, "loss": 0.0664, "step": 752 }, { "epoch": 0.34258416742493175, "grad_norm": 0.8901557022008841, "learning_rate": 4.942459053832208e-06, "loss": 0.0693, "step": 753 }, { "epoch": 0.34303912647861695, "grad_norm": 0.8234502982899836, "learning_rate": 4.942306509379617e-06, "loss": 0.0597, "step": 754 }, { "epoch": 0.3434940855323021, "grad_norm": 0.6946642261242176, "learning_rate": 4.942153765352767e-06, "loss": 0.0655, "step": 755 }, { "epoch": 0.34394904458598724, "grad_norm": 0.7281851362306453, "learning_rate": 4.94200082176414e-06, "loss": 0.0477, "step": 756 }, { "epoch": 0.34440400363967244, "grad_norm": 0.702216811046451, "learning_rate": 4.941847678626234e-06, "loss": 0.051, "step": 757 }, { "epoch": 0.3448589626933576, "grad_norm": 0.7406781926848387, "learning_rate": 4.941694335951563e-06, "loss": 0.0684, "step": 758 }, { "epoch": 0.3453139217470428, "grad_norm": 0.963572123895698, "learning_rate": 4.9415407937526575e-06, "loss": 0.071, "step": 759 }, { "epoch": 0.34576888080072793, "grad_norm": 1.0435397926975072, "learning_rate": 4.9413870520420635e-06, "loss": 0.0872, "step": 760 }, { "epoch": 0.34622383985441313, "grad_norm": 0.681188662305879, "learning_rate": 4.941233110832346e-06, "loss": 0.0408, "step": 761 }, { "epoch": 0.3466787989080983, "grad_norm": 1.0327113714700533, "learning_rate": 4.941078970136082e-06, "loss": 0.0773, "step": 762 }, { "epoch": 0.3471337579617834, "grad_norm": 0.9228809359894766, "learning_rate": 4.940924629965869e-06, "loss": 0.086, "step": 763 }, { "epoch": 0.3475887170154686, "grad_norm": 0.7250520063147825, "learning_rate": 4.940770090334319e-06, "loss": 0.0463, "step": 764 }, { "epoch": 0.34804367606915376, "grad_norm": 0.7506801006626348, "learning_rate": 4.940615351254059e-06, "loss": 0.0544, "step": 765 }, { "epoch": 0.34849863512283896, "grad_norm": 0.5540561760336307, "learning_rate": 4.940460412737734e-06, "loss": 0.0526, "step": 766 }, { "epoch": 0.3489535941765241, "grad_norm": 0.695612040454574, "learning_rate": 4.940305274798005e-06, "loss": 0.0524, "step": 767 }, { "epoch": 0.3494085532302093, "grad_norm": 0.6935188585127503, "learning_rate": 4.940149937447549e-06, "loss": 0.067, "step": 768 }, { "epoch": 0.34986351228389445, "grad_norm": 0.5312867727529762, "learning_rate": 4.939994400699061e-06, "loss": 0.041, "step": 769 }, { "epoch": 0.3503184713375796, "grad_norm": 0.7505879638280587, "learning_rate": 4.939838664565248e-06, "loss": 0.0701, "step": 770 }, { "epoch": 0.3507734303912648, "grad_norm": 0.6454845261456662, "learning_rate": 4.939682729058839e-06, "loss": 0.0529, "step": 771 }, { "epoch": 0.35122838944494994, "grad_norm": 0.7301095931814326, "learning_rate": 4.939526594192574e-06, "loss": 0.0542, "step": 772 }, { "epoch": 0.35168334849863514, "grad_norm": 0.652333772985979, "learning_rate": 4.939370259979213e-06, "loss": 0.048, "step": 773 }, { "epoch": 0.3521383075523203, "grad_norm": 0.9727171709896085, "learning_rate": 4.9392137264315295e-06, "loss": 0.0829, "step": 774 }, { "epoch": 0.3525932666060055, "grad_norm": 1.002727349516092, "learning_rate": 4.939056993562316e-06, "loss": 0.1006, "step": 775 }, { "epoch": 0.35304822565969063, "grad_norm": 0.8290934071283628, "learning_rate": 4.9389000613843805e-06, "loss": 0.0604, "step": 776 }, { "epoch": 0.3535031847133758, "grad_norm": 0.7094397892635911, "learning_rate": 4.938742929910546e-06, "loss": 0.0626, "step": 777 }, { "epoch": 0.353958143767061, "grad_norm": 0.463742445825036, "learning_rate": 4.938585599153652e-06, "loss": 0.0401, "step": 778 }, { "epoch": 0.3544131028207461, "grad_norm": 0.8795124878712618, "learning_rate": 4.938428069126555e-06, "loss": 0.0751, "step": 779 }, { "epoch": 0.3548680618744313, "grad_norm": 0.9849362153346484, "learning_rate": 4.9382703398421285e-06, "loss": 0.0685, "step": 780 }, { "epoch": 0.35532302092811646, "grad_norm": 0.9119479350188407, "learning_rate": 4.938112411313261e-06, "loss": 0.0717, "step": 781 }, { "epoch": 0.35577797998180166, "grad_norm": 0.7192019183778731, "learning_rate": 4.937954283552858e-06, "loss": 0.0522, "step": 782 }, { "epoch": 0.3562329390354868, "grad_norm": 1.1237812582622189, "learning_rate": 4.93779595657384e-06, "loss": 0.0876, "step": 783 }, { "epoch": 0.35668789808917195, "grad_norm": 1.0406475235702408, "learning_rate": 4.937637430389145e-06, "loss": 0.0893, "step": 784 }, { "epoch": 0.35714285714285715, "grad_norm": 0.7721864924395869, "learning_rate": 4.937478705011729e-06, "loss": 0.0446, "step": 785 }, { "epoch": 0.3575978161965423, "grad_norm": 0.8008956251240891, "learning_rate": 4.937319780454559e-06, "loss": 0.0705, "step": 786 }, { "epoch": 0.3580527752502275, "grad_norm": 0.691329548634175, "learning_rate": 4.937160656730625e-06, "loss": 0.0682, "step": 787 }, { "epoch": 0.35850773430391264, "grad_norm": 0.767648045445437, "learning_rate": 4.9370013338529274e-06, "loss": 0.0588, "step": 788 }, { "epoch": 0.35896269335759784, "grad_norm": 0.6437326183327811, "learning_rate": 4.936841811834486e-06, "loss": 0.0585, "step": 789 }, { "epoch": 0.359417652411283, "grad_norm": 1.0756251482963843, "learning_rate": 4.936682090688337e-06, "loss": 0.1152, "step": 790 }, { "epoch": 0.35987261146496813, "grad_norm": 0.7633629281483238, "learning_rate": 4.936522170427531e-06, "loss": 0.0519, "step": 791 }, { "epoch": 0.36032757051865333, "grad_norm": 0.9657586384218257, "learning_rate": 4.936362051065136e-06, "loss": 0.0622, "step": 792 }, { "epoch": 0.3607825295723385, "grad_norm": 0.7614427682900681, "learning_rate": 4.936201732614238e-06, "loss": 0.0578, "step": 793 }, { "epoch": 0.3612374886260237, "grad_norm": 0.8385664953370223, "learning_rate": 4.9360412150879355e-06, "loss": 0.0631, "step": 794 }, { "epoch": 0.3616924476797088, "grad_norm": 0.8652339075360451, "learning_rate": 4.935880498499346e-06, "loss": 0.0561, "step": 795 }, { "epoch": 0.362147406733394, "grad_norm": 1.1585494708567998, "learning_rate": 4.935719582861604e-06, "loss": 0.0798, "step": 796 }, { "epoch": 0.36260236578707916, "grad_norm": 0.8782653098452117, "learning_rate": 4.935558468187855e-06, "loss": 0.0785, "step": 797 }, { "epoch": 0.3630573248407643, "grad_norm": 1.0471698671960776, "learning_rate": 4.935397154491268e-06, "loss": 0.0843, "step": 798 }, { "epoch": 0.3635122838944495, "grad_norm": 1.072226910369364, "learning_rate": 4.935235641785023e-06, "loss": 0.1002, "step": 799 }, { "epoch": 0.36396724294813465, "grad_norm": 0.8003883087872027, "learning_rate": 4.935073930082319e-06, "loss": 0.077, "step": 800 }, { "epoch": 0.36442220200181985, "grad_norm": 0.666158838387195, "learning_rate": 4.93491201939637e-06, "loss": 0.05, "step": 801 }, { "epoch": 0.364877161055505, "grad_norm": 0.8051537513165499, "learning_rate": 4.934749909740408e-06, "loss": 0.0754, "step": 802 }, { "epoch": 0.3653321201091902, "grad_norm": 0.6922057989945819, "learning_rate": 4.934587601127677e-06, "loss": 0.059, "step": 803 }, { "epoch": 0.36578707916287534, "grad_norm": 0.7128402432948275, "learning_rate": 4.934425093571442e-06, "loss": 0.0619, "step": 804 }, { "epoch": 0.3662420382165605, "grad_norm": 0.7563859967433937, "learning_rate": 4.934262387084984e-06, "loss": 0.0627, "step": 805 }, { "epoch": 0.3666969972702457, "grad_norm": 0.7832340212976855, "learning_rate": 4.934099481681595e-06, "loss": 0.0526, "step": 806 }, { "epoch": 0.36715195632393083, "grad_norm": 0.86739587683623, "learning_rate": 4.933936377374589e-06, "loss": 0.0723, "step": 807 }, { "epoch": 0.36760691537761603, "grad_norm": 0.5600488971824944, "learning_rate": 4.933773074177293e-06, "loss": 0.053, "step": 808 }, { "epoch": 0.3680618744313012, "grad_norm": 0.7459592894739776, "learning_rate": 4.933609572103053e-06, "loss": 0.0575, "step": 809 }, { "epoch": 0.3685168334849864, "grad_norm": 1.0970116117153337, "learning_rate": 4.933445871165229e-06, "loss": 0.0956, "step": 810 }, { "epoch": 0.3689717925386715, "grad_norm": 0.7191805409301932, "learning_rate": 4.933281971377197e-06, "loss": 0.0519, "step": 811 }, { "epoch": 0.36942675159235666, "grad_norm": 0.8243120557909177, "learning_rate": 4.933117872752352e-06, "loss": 0.071, "step": 812 }, { "epoch": 0.36988171064604186, "grad_norm": 1.1020763342548079, "learning_rate": 4.932953575304102e-06, "loss": 0.0782, "step": 813 }, { "epoch": 0.370336669699727, "grad_norm": 0.9022725332415404, "learning_rate": 4.932789079045873e-06, "loss": 0.0833, "step": 814 }, { "epoch": 0.3707916287534122, "grad_norm": 0.9496599803899396, "learning_rate": 4.932624383991106e-06, "loss": 0.0847, "step": 815 }, { "epoch": 0.37124658780709735, "grad_norm": 1.0562705583617722, "learning_rate": 4.9324594901532605e-06, "loss": 0.0867, "step": 816 }, { "epoch": 0.37170154686078255, "grad_norm": 0.7181054591410602, "learning_rate": 4.93229439754581e-06, "loss": 0.0607, "step": 817 }, { "epoch": 0.3721565059144677, "grad_norm": 1.249078514543796, "learning_rate": 4.932129106182246e-06, "loss": 0.0695, "step": 818 }, { "epoch": 0.37261146496815284, "grad_norm": 0.9464577385866231, "learning_rate": 4.931963616076075e-06, "loss": 0.0555, "step": 819 }, { "epoch": 0.37306642402183804, "grad_norm": 0.6354068817614167, "learning_rate": 4.93179792724082e-06, "loss": 0.0506, "step": 820 }, { "epoch": 0.3735213830755232, "grad_norm": 0.733808597213929, "learning_rate": 4.9316320396900195e-06, "loss": 0.0624, "step": 821 }, { "epoch": 0.3739763421292084, "grad_norm": 1.0993304075084718, "learning_rate": 4.9314659534372305e-06, "loss": 0.0963, "step": 822 }, { "epoch": 0.37443130118289353, "grad_norm": 0.686462250780803, "learning_rate": 4.931299668496024e-06, "loss": 0.0439, "step": 823 }, { "epoch": 0.37488626023657873, "grad_norm": 0.7707830593490947, "learning_rate": 4.931133184879988e-06, "loss": 0.0602, "step": 824 }, { "epoch": 0.37534121929026387, "grad_norm": 0.577810862774901, "learning_rate": 4.930966502602727e-06, "loss": 0.046, "step": 825 }, { "epoch": 0.37579617834394907, "grad_norm": 1.227234424763045, "learning_rate": 4.930799621677862e-06, "loss": 0.0984, "step": 826 }, { "epoch": 0.3762511373976342, "grad_norm": 0.9596192413203867, "learning_rate": 4.93063254211903e-06, "loss": 0.0733, "step": 827 }, { "epoch": 0.37670609645131936, "grad_norm": 0.6852793283145953, "learning_rate": 4.930465263939882e-06, "loss": 0.046, "step": 828 }, { "epoch": 0.37716105550500456, "grad_norm": 1.0111400448234127, "learning_rate": 4.9302977871540894e-06, "loss": 0.0808, "step": 829 }, { "epoch": 0.3776160145586897, "grad_norm": 0.7993690990324225, "learning_rate": 4.930130111775336e-06, "loss": 0.0635, "step": 830 }, { "epoch": 0.3780709736123749, "grad_norm": 0.5709164804262241, "learning_rate": 4.9299622378173245e-06, "loss": 0.0403, "step": 831 }, { "epoch": 0.37852593266606005, "grad_norm": 1.104047361341013, "learning_rate": 4.929794165293773e-06, "loss": 0.0864, "step": 832 }, { "epoch": 0.37898089171974525, "grad_norm": 0.6855131484796984, "learning_rate": 4.9296258942184145e-06, "loss": 0.0617, "step": 833 }, { "epoch": 0.3794358507734304, "grad_norm": 1.0311774748471771, "learning_rate": 4.929457424605e-06, "loss": 0.0788, "step": 834 }, { "epoch": 0.37989080982711554, "grad_norm": 0.9165897835058952, "learning_rate": 4.929288756467296e-06, "loss": 0.0893, "step": 835 }, { "epoch": 0.38034576888080074, "grad_norm": 0.7941921577921506, "learning_rate": 4.929119889819086e-06, "loss": 0.0534, "step": 836 }, { "epoch": 0.3808007279344859, "grad_norm": 1.557335360800504, "learning_rate": 4.928950824674169e-06, "loss": 0.112, "step": 837 }, { "epoch": 0.3812556869881711, "grad_norm": 0.7901013784423294, "learning_rate": 4.928781561046359e-06, "loss": 0.0644, "step": 838 }, { "epoch": 0.3817106460418562, "grad_norm": 0.8005670034055866, "learning_rate": 4.928612098949488e-06, "loss": 0.0651, "step": 839 }, { "epoch": 0.3821656050955414, "grad_norm": 0.7907149517921656, "learning_rate": 4.9284424383974026e-06, "loss": 0.0666, "step": 840 }, { "epoch": 0.38262056414922657, "grad_norm": 0.5599277146162008, "learning_rate": 4.928272579403969e-06, "loss": 0.0415, "step": 841 }, { "epoch": 0.3830755232029117, "grad_norm": 0.8167324319310735, "learning_rate": 4.928102521983067e-06, "loss": 0.0832, "step": 842 }, { "epoch": 0.3835304822565969, "grad_norm": 1.110106061772308, "learning_rate": 4.9279322661485906e-06, "loss": 0.1075, "step": 843 }, { "epoch": 0.38398544131028206, "grad_norm": 0.9108736659112359, "learning_rate": 4.927761811914455e-06, "loss": 0.0782, "step": 844 }, { "epoch": 0.38444040036396726, "grad_norm": 0.7133113314845626, "learning_rate": 4.927591159294587e-06, "loss": 0.0597, "step": 845 }, { "epoch": 0.3848953594176524, "grad_norm": 1.2379543972496645, "learning_rate": 4.927420308302933e-06, "loss": 0.0739, "step": 846 }, { "epoch": 0.3853503184713376, "grad_norm": 0.7205090516697029, "learning_rate": 4.927249258953454e-06, "loss": 0.0637, "step": 847 }, { "epoch": 0.38580527752502275, "grad_norm": 0.9577940179044298, "learning_rate": 4.927078011260126e-06, "loss": 0.0647, "step": 848 }, { "epoch": 0.3862602365787079, "grad_norm": 1.063680913893135, "learning_rate": 4.926906565236943e-06, "loss": 0.0884, "step": 849 }, { "epoch": 0.3867151956323931, "grad_norm": 0.8411451706944509, "learning_rate": 4.926734920897916e-06, "loss": 0.0641, "step": 850 }, { "epoch": 0.38717015468607824, "grad_norm": 0.6435257771689179, "learning_rate": 4.926563078257071e-06, "loss": 0.0645, "step": 851 }, { "epoch": 0.38762511373976344, "grad_norm": 0.5478103039564508, "learning_rate": 4.926391037328448e-06, "loss": 0.0562, "step": 852 }, { "epoch": 0.3880800727934486, "grad_norm": 0.7813544786492084, "learning_rate": 4.926218798126108e-06, "loss": 0.0644, "step": 853 }, { "epoch": 0.3885350318471338, "grad_norm": 0.8655211183499932, "learning_rate": 4.926046360664124e-06, "loss": 0.059, "step": 854 }, { "epoch": 0.3889899909008189, "grad_norm": 0.9101899928988302, "learning_rate": 4.925873724956588e-06, "loss": 0.0737, "step": 855 }, { "epoch": 0.38944494995450407, "grad_norm": 1.0168400071509458, "learning_rate": 4.9257008910176065e-06, "loss": 0.1121, "step": 856 }, { "epoch": 0.38989990900818927, "grad_norm": 0.8167976616887521, "learning_rate": 4.925527858861302e-06, "loss": 0.0564, "step": 857 }, { "epoch": 0.3903548680618744, "grad_norm": 0.8798735310808856, "learning_rate": 4.925354628501814e-06, "loss": 0.0658, "step": 858 }, { "epoch": 0.3908098271155596, "grad_norm": 1.072539154167554, "learning_rate": 4.925181199953299e-06, "loss": 0.073, "step": 859 }, { "epoch": 0.39126478616924476, "grad_norm": 0.6908230723682215, "learning_rate": 4.9250075732299285e-06, "loss": 0.0623, "step": 860 }, { "epoch": 0.39171974522292996, "grad_norm": 0.9571638979072821, "learning_rate": 4.92483374834589e-06, "loss": 0.0773, "step": 861 }, { "epoch": 0.3921747042766151, "grad_norm": 0.756709209444031, "learning_rate": 4.9246597253153884e-06, "loss": 0.0579, "step": 862 }, { "epoch": 0.39262966333030025, "grad_norm": 0.5927412643446517, "learning_rate": 4.924485504152644e-06, "loss": 0.0534, "step": 863 }, { "epoch": 0.39308462238398545, "grad_norm": 0.9103877688416242, "learning_rate": 4.924311084871892e-06, "loss": 0.0706, "step": 864 }, { "epoch": 0.3935395814376706, "grad_norm": 1.0326915390707718, "learning_rate": 4.924136467487387e-06, "loss": 0.0598, "step": 865 }, { "epoch": 0.3939945404913558, "grad_norm": 0.6750287379400403, "learning_rate": 4.923961652013397e-06, "loss": 0.0544, "step": 866 }, { "epoch": 0.39444949954504094, "grad_norm": 0.6971234476522602, "learning_rate": 4.923786638464207e-06, "loss": 0.068, "step": 867 }, { "epoch": 0.39490445859872614, "grad_norm": 0.6838356971258669, "learning_rate": 4.9236114268541196e-06, "loss": 0.0547, "step": 868 }, { "epoch": 0.3953594176524113, "grad_norm": 0.7448093953926782, "learning_rate": 4.923436017197451e-06, "loss": 0.052, "step": 869 }, { "epoch": 0.3958143767060964, "grad_norm": 0.6641780681039909, "learning_rate": 4.923260409508535e-06, "loss": 0.0537, "step": 870 }, { "epoch": 0.3962693357597816, "grad_norm": 1.0599598667933217, "learning_rate": 4.9230846038017214e-06, "loss": 0.1064, "step": 871 }, { "epoch": 0.39672429481346677, "grad_norm": 0.7170909347633128, "learning_rate": 4.922908600091378e-06, "loss": 0.052, "step": 872 }, { "epoch": 0.39717925386715197, "grad_norm": 0.9331039569297795, "learning_rate": 4.9227323983918835e-06, "loss": 0.1059, "step": 873 }, { "epoch": 0.3976342129208371, "grad_norm": 0.6631065170731679, "learning_rate": 4.922555998717639e-06, "loss": 0.0617, "step": 874 }, { "epoch": 0.3980891719745223, "grad_norm": 0.6608685360122281, "learning_rate": 4.922379401083058e-06, "loss": 0.0499, "step": 875 }, { "epoch": 0.39854413102820746, "grad_norm": 0.7327299678205453, "learning_rate": 4.922202605502573e-06, "loss": 0.0566, "step": 876 }, { "epoch": 0.3989990900818926, "grad_norm": 0.6975891149084547, "learning_rate": 4.922025611990629e-06, "loss": 0.0516, "step": 877 }, { "epoch": 0.3994540491355778, "grad_norm": 0.7261723405077012, "learning_rate": 4.92184842056169e-06, "loss": 0.0564, "step": 878 }, { "epoch": 0.39990900818926295, "grad_norm": 0.7685758032234701, "learning_rate": 4.921671031230235e-06, "loss": 0.0607, "step": 879 }, { "epoch": 0.40036396724294815, "grad_norm": 0.8663271064629626, "learning_rate": 4.921493444010759e-06, "loss": 0.0772, "step": 880 }, { "epoch": 0.4008189262966333, "grad_norm": 0.6323885494682957, "learning_rate": 4.921315658917774e-06, "loss": 0.0542, "step": 881 }, { "epoch": 0.4012738853503185, "grad_norm": 0.7490017305697232, "learning_rate": 4.921137675965809e-06, "loss": 0.0561, "step": 882 }, { "epoch": 0.40172884440400364, "grad_norm": 0.5661173516415018, "learning_rate": 4.920959495169406e-06, "loss": 0.0514, "step": 883 }, { "epoch": 0.4021838034576888, "grad_norm": 0.90985620289341, "learning_rate": 4.920781116543126e-06, "loss": 0.0793, "step": 884 }, { "epoch": 0.402638762511374, "grad_norm": 0.737559568798236, "learning_rate": 4.920602540101546e-06, "loss": 0.0532, "step": 885 }, { "epoch": 0.4030937215650591, "grad_norm": 0.9457532899317224, "learning_rate": 4.920423765859257e-06, "loss": 0.0736, "step": 886 }, { "epoch": 0.4035486806187443, "grad_norm": 0.8223411810090336, "learning_rate": 4.920244793830869e-06, "loss": 0.0617, "step": 887 }, { "epoch": 0.40400363967242947, "grad_norm": 0.841036201739517, "learning_rate": 4.920065624031006e-06, "loss": 0.0663, "step": 888 }, { "epoch": 0.40445859872611467, "grad_norm": 0.6414512707848916, "learning_rate": 4.919886256474309e-06, "loss": 0.0577, "step": 889 }, { "epoch": 0.4049135577797998, "grad_norm": 0.9454993871214441, "learning_rate": 4.919706691175435e-06, "loss": 0.0691, "step": 890 }, { "epoch": 0.40536851683348496, "grad_norm": 1.139839821047098, "learning_rate": 4.919526928149058e-06, "loss": 0.0981, "step": 891 }, { "epoch": 0.40582347588717016, "grad_norm": 0.7527352667811262, "learning_rate": 4.919346967409867e-06, "loss": 0.0705, "step": 892 }, { "epoch": 0.4062784349408553, "grad_norm": 0.8215025181864493, "learning_rate": 4.919166808972567e-06, "loss": 0.0822, "step": 893 }, { "epoch": 0.4067333939945405, "grad_norm": 1.0573127490280785, "learning_rate": 4.918986452851881e-06, "loss": 0.0811, "step": 894 }, { "epoch": 0.40718835304822565, "grad_norm": 0.6965463925423991, "learning_rate": 4.918805899062545e-06, "loss": 0.0503, "step": 895 }, { "epoch": 0.40764331210191085, "grad_norm": 0.4193896755189461, "learning_rate": 4.9186251476193146e-06, "loss": 0.0341, "step": 896 }, { "epoch": 0.408098271155596, "grad_norm": 0.9727030498845781, "learning_rate": 4.918444198536959e-06, "loss": 0.0918, "step": 897 }, { "epoch": 0.40855323020928114, "grad_norm": 0.848379430601135, "learning_rate": 4.918263051830267e-06, "loss": 0.0846, "step": 898 }, { "epoch": 0.40900818926296634, "grad_norm": 0.8940054586896501, "learning_rate": 4.918081707514037e-06, "loss": 0.0561, "step": 899 }, { "epoch": 0.4094631483166515, "grad_norm": 1.0448933980565918, "learning_rate": 4.917900165603091e-06, "loss": 0.0881, "step": 900 }, { "epoch": 0.4099181073703367, "grad_norm": 0.8513360075907803, "learning_rate": 4.9177184261122624e-06, "loss": 0.0774, "step": 901 }, { "epoch": 0.4103730664240218, "grad_norm": 0.9926871621583441, "learning_rate": 4.917536489056402e-06, "loss": 0.0676, "step": 902 }, { "epoch": 0.410828025477707, "grad_norm": 0.7421973042221751, "learning_rate": 4.9173543544503775e-06, "loss": 0.0561, "step": 903 }, { "epoch": 0.41128298453139217, "grad_norm": 0.5540464230672232, "learning_rate": 4.917172022309072e-06, "loss": 0.0445, "step": 904 }, { "epoch": 0.41173794358507737, "grad_norm": 0.5720091238043334, "learning_rate": 4.916989492647385e-06, "loss": 0.0433, "step": 905 }, { "epoch": 0.4121929026387625, "grad_norm": 0.6162762711529532, "learning_rate": 4.916806765480231e-06, "loss": 0.0475, "step": 906 }, { "epoch": 0.41264786169244766, "grad_norm": 0.974787490907103, "learning_rate": 4.9166238408225416e-06, "loss": 0.1111, "step": 907 }, { "epoch": 0.41310282074613286, "grad_norm": 0.8865154928732101, "learning_rate": 4.916440718689267e-06, "loss": 0.0749, "step": 908 }, { "epoch": 0.413557779799818, "grad_norm": 0.5226001788391972, "learning_rate": 4.916257399095369e-06, "loss": 0.0395, "step": 909 }, { "epoch": 0.4140127388535032, "grad_norm": 0.5318996108265455, "learning_rate": 4.916073882055827e-06, "loss": 0.0433, "step": 910 }, { "epoch": 0.41446769790718835, "grad_norm": 1.0710581316835899, "learning_rate": 4.91589016758564e-06, "loss": 0.0763, "step": 911 }, { "epoch": 0.41492265696087355, "grad_norm": 0.795399749143522, "learning_rate": 4.915706255699817e-06, "loss": 0.0764, "step": 912 }, { "epoch": 0.4153776160145587, "grad_norm": 1.0890078502818572, "learning_rate": 4.915522146413389e-06, "loss": 0.1131, "step": 913 }, { "epoch": 0.41583257506824384, "grad_norm": 0.7960045886425829, "learning_rate": 4.9153378397413985e-06, "loss": 0.0683, "step": 914 }, { "epoch": 0.41628753412192904, "grad_norm": 1.1128797991041262, "learning_rate": 4.915153335698908e-06, "loss": 0.0913, "step": 915 }, { "epoch": 0.4167424931756142, "grad_norm": 1.0003336530508022, "learning_rate": 4.914968634300994e-06, "loss": 0.0908, "step": 916 }, { "epoch": 0.4171974522292994, "grad_norm": 0.6465244244795542, "learning_rate": 4.914783735562748e-06, "loss": 0.0567, "step": 917 }, { "epoch": 0.4176524112829845, "grad_norm": 0.7181629552621807, "learning_rate": 4.914598639499281e-06, "loss": 0.0601, "step": 918 }, { "epoch": 0.4181073703366697, "grad_norm": 0.6532643628064463, "learning_rate": 4.914413346125717e-06, "loss": 0.0601, "step": 919 }, { "epoch": 0.41856232939035487, "grad_norm": 0.6191538196132823, "learning_rate": 4.914227855457199e-06, "loss": 0.0499, "step": 920 }, { "epoch": 0.41901728844404, "grad_norm": 0.8550108331341532, "learning_rate": 4.914042167508881e-06, "loss": 0.0593, "step": 921 }, { "epoch": 0.4194722474977252, "grad_norm": 0.7149006472238378, "learning_rate": 4.9138562822959416e-06, "loss": 0.0445, "step": 922 }, { "epoch": 0.41992720655141036, "grad_norm": 0.8618890926980373, "learning_rate": 4.913670199833566e-06, "loss": 0.0623, "step": 923 }, { "epoch": 0.42038216560509556, "grad_norm": 0.7722281622664096, "learning_rate": 4.913483920136961e-06, "loss": 0.0599, "step": 924 }, { "epoch": 0.4208371246587807, "grad_norm": 1.0525677617663958, "learning_rate": 4.91329744322135e-06, "loss": 0.072, "step": 925 }, { "epoch": 0.4212920837124659, "grad_norm": 0.67535122521756, "learning_rate": 4.913110769101971e-06, "loss": 0.0591, "step": 926 }, { "epoch": 0.42174704276615105, "grad_norm": 0.822068707444294, "learning_rate": 4.912923897794077e-06, "loss": 0.0614, "step": 927 }, { "epoch": 0.4222020018198362, "grad_norm": 0.7885176236662199, "learning_rate": 4.912736829312938e-06, "loss": 0.0704, "step": 928 }, { "epoch": 0.4226569608735214, "grad_norm": 0.8766248992772606, "learning_rate": 4.912549563673842e-06, "loss": 0.0745, "step": 929 }, { "epoch": 0.42311191992720654, "grad_norm": 0.8283902735051627, "learning_rate": 4.912362100892091e-06, "loss": 0.092, "step": 930 }, { "epoch": 0.42356687898089174, "grad_norm": 0.6254134609819529, "learning_rate": 4.912174440983002e-06, "loss": 0.0537, "step": 931 }, { "epoch": 0.4240218380345769, "grad_norm": 0.882666141582528, "learning_rate": 4.911986583961912e-06, "loss": 0.0786, "step": 932 }, { "epoch": 0.4244767970882621, "grad_norm": 0.9420756223720808, "learning_rate": 4.91179852984417e-06, "loss": 0.0712, "step": 933 }, { "epoch": 0.4249317561419472, "grad_norm": 0.7094879380363412, "learning_rate": 4.911610278645144e-06, "loss": 0.0584, "step": 934 }, { "epoch": 0.42538671519563237, "grad_norm": 0.5926543191052056, "learning_rate": 4.911421830380217e-06, "loss": 0.0405, "step": 935 }, { "epoch": 0.42584167424931757, "grad_norm": 1.0230610119902441, "learning_rate": 4.911233185064788e-06, "loss": 0.0862, "step": 936 }, { "epoch": 0.4262966333030027, "grad_norm": 0.7907623777513295, "learning_rate": 4.911044342714272e-06, "loss": 0.0613, "step": 937 }, { "epoch": 0.4267515923566879, "grad_norm": 0.6568153392494973, "learning_rate": 4.9108553033440995e-06, "loss": 0.0476, "step": 938 }, { "epoch": 0.42720655141037306, "grad_norm": 0.6694889451748832, "learning_rate": 4.91066606696972e-06, "loss": 0.0524, "step": 939 }, { "epoch": 0.42766151046405826, "grad_norm": 0.8635963157899842, "learning_rate": 4.910476633606597e-06, "loss": 0.0614, "step": 940 }, { "epoch": 0.4281164695177434, "grad_norm": 0.7292867271167381, "learning_rate": 4.9102870032702075e-06, "loss": 0.0414, "step": 941 }, { "epoch": 0.42857142857142855, "grad_norm": 0.7616247338566872, "learning_rate": 4.910097175976049e-06, "loss": 0.0549, "step": 942 }, { "epoch": 0.42902638762511375, "grad_norm": 0.7194658405319286, "learning_rate": 4.909907151739634e-06, "loss": 0.0499, "step": 943 }, { "epoch": 0.4294813466787989, "grad_norm": 1.2091203072284862, "learning_rate": 4.909716930576489e-06, "loss": 0.0963, "step": 944 }, { "epoch": 0.4299363057324841, "grad_norm": 0.7905516413222626, "learning_rate": 4.909526512502158e-06, "loss": 0.0783, "step": 945 }, { "epoch": 0.43039126478616924, "grad_norm": 0.8761882129732462, "learning_rate": 4.9093358975322025e-06, "loss": 0.0703, "step": 946 }, { "epoch": 0.43084622383985444, "grad_norm": 0.8329694474419541, "learning_rate": 4.909145085682198e-06, "loss": 0.0747, "step": 947 }, { "epoch": 0.4313011828935396, "grad_norm": 0.7018276200636866, "learning_rate": 4.908954076967737e-06, "loss": 0.05, "step": 948 }, { "epoch": 0.4317561419472247, "grad_norm": 0.8623770812142951, "learning_rate": 4.908762871404427e-06, "loss": 0.1035, "step": 949 }, { "epoch": 0.4322111010009099, "grad_norm": 0.7203196297821325, "learning_rate": 4.908571469007893e-06, "loss": 0.0597, "step": 950 }, { "epoch": 0.43266606005459507, "grad_norm": 0.891364295683239, "learning_rate": 4.908379869793776e-06, "loss": 0.0656, "step": 951 }, { "epoch": 0.43312101910828027, "grad_norm": 0.7267875932577169, "learning_rate": 4.908188073777732e-06, "loss": 0.0537, "step": 952 }, { "epoch": 0.4335759781619654, "grad_norm": 1.136525948908607, "learning_rate": 4.9079960809754334e-06, "loss": 0.1066, "step": 953 }, { "epoch": 0.4340309372156506, "grad_norm": 0.6404455978391249, "learning_rate": 4.90780389140257e-06, "loss": 0.0506, "step": 954 }, { "epoch": 0.43448589626933576, "grad_norm": 1.1106980810150886, "learning_rate": 4.907611505074846e-06, "loss": 0.0756, "step": 955 }, { "epoch": 0.4349408553230209, "grad_norm": 0.8892492946222721, "learning_rate": 4.907418922007983e-06, "loss": 0.0755, "step": 956 }, { "epoch": 0.4353958143767061, "grad_norm": 0.7842836260244193, "learning_rate": 4.907226142217717e-06, "loss": 0.0584, "step": 957 }, { "epoch": 0.43585077343039125, "grad_norm": 0.7902286572984042, "learning_rate": 4.9070331657198015e-06, "loss": 0.0607, "step": 958 }, { "epoch": 0.43630573248407645, "grad_norm": 0.8874306420389249, "learning_rate": 4.906839992530006e-06, "loss": 0.0785, "step": 959 }, { "epoch": 0.4367606915377616, "grad_norm": 0.8366306399984552, "learning_rate": 4.906646622664115e-06, "loss": 0.0713, "step": 960 }, { "epoch": 0.4372156505914468, "grad_norm": 0.7074038721272251, "learning_rate": 4.906453056137931e-06, "loss": 0.041, "step": 961 }, { "epoch": 0.43767060964513194, "grad_norm": 1.1462267850180623, "learning_rate": 4.90625929296727e-06, "loss": 0.1047, "step": 962 }, { "epoch": 0.4381255686988171, "grad_norm": 0.6641712180680458, "learning_rate": 4.9060653331679665e-06, "loss": 0.0685, "step": 963 }, { "epoch": 0.4385805277525023, "grad_norm": 0.8700860351399569, "learning_rate": 4.90587117675587e-06, "loss": 0.0821, "step": 964 }, { "epoch": 0.4390354868061874, "grad_norm": 0.9136082234067431, "learning_rate": 4.905676823746846e-06, "loss": 0.0645, "step": 965 }, { "epoch": 0.4394904458598726, "grad_norm": 0.5926511030671447, "learning_rate": 4.9054822741567745e-06, "loss": 0.0487, "step": 966 }, { "epoch": 0.43994540491355777, "grad_norm": 0.8424384986445863, "learning_rate": 4.905287528001555e-06, "loss": 0.0621, "step": 967 }, { "epoch": 0.44040036396724297, "grad_norm": 0.6973422193542876, "learning_rate": 4.905092585297102e-06, "loss": 0.0583, "step": 968 }, { "epoch": 0.4408553230209281, "grad_norm": 0.9155989139411984, "learning_rate": 4.904897446059344e-06, "loss": 0.0699, "step": 969 }, { "epoch": 0.44131028207461326, "grad_norm": 0.7443964393050531, "learning_rate": 4.9047021103042255e-06, "loss": 0.051, "step": 970 }, { "epoch": 0.44176524112829846, "grad_norm": 0.7677456881758877, "learning_rate": 4.904506578047712e-06, "loss": 0.0559, "step": 971 }, { "epoch": 0.4422202001819836, "grad_norm": 0.9425858695883391, "learning_rate": 4.9043108493057785e-06, "loss": 0.0633, "step": 972 }, { "epoch": 0.4426751592356688, "grad_norm": 0.6366934546327063, "learning_rate": 4.904114924094421e-06, "loss": 0.0464, "step": 973 }, { "epoch": 0.44313011828935395, "grad_norm": 0.802460890266695, "learning_rate": 4.903918802429648e-06, "loss": 0.0727, "step": 974 }, { "epoch": 0.44358507734303915, "grad_norm": 0.44074125028934635, "learning_rate": 4.9037224843274875e-06, "loss": 0.0375, "step": 975 }, { "epoch": 0.4440400363967243, "grad_norm": 0.9236685790892595, "learning_rate": 4.903525969803979e-06, "loss": 0.0914, "step": 976 }, { "epoch": 0.44449499545040944, "grad_norm": 0.8186044519325196, "learning_rate": 4.903329258875184e-06, "loss": 0.0582, "step": 977 }, { "epoch": 0.44494995450409464, "grad_norm": 0.760419987901125, "learning_rate": 4.903132351557175e-06, "loss": 0.0662, "step": 978 }, { "epoch": 0.4454049135577798, "grad_norm": 0.8487192410638724, "learning_rate": 4.902935247866043e-06, "loss": 0.0622, "step": 979 }, { "epoch": 0.445859872611465, "grad_norm": 0.8969279017038029, "learning_rate": 4.9027379478178935e-06, "loss": 0.0696, "step": 980 }, { "epoch": 0.4463148316651501, "grad_norm": 0.8275157306730986, "learning_rate": 4.90254045142885e-06, "loss": 0.0617, "step": 981 }, { "epoch": 0.4467697907188353, "grad_norm": 0.8042954485928273, "learning_rate": 4.90234275871505e-06, "loss": 0.053, "step": 982 }, { "epoch": 0.44722474977252047, "grad_norm": 1.1786231664461284, "learning_rate": 4.9021448696926486e-06, "loss": 0.0986, "step": 983 }, { "epoch": 0.44767970882620567, "grad_norm": 0.6298200533016487, "learning_rate": 4.901946784377816e-06, "loss": 0.065, "step": 984 }, { "epoch": 0.4481346678798908, "grad_norm": 0.594574358873745, "learning_rate": 4.90174850278674e-06, "loss": 0.0539, "step": 985 }, { "epoch": 0.44858962693357596, "grad_norm": 0.679879881000302, "learning_rate": 4.901550024935623e-06, "loss": 0.0654, "step": 986 }, { "epoch": 0.44904458598726116, "grad_norm": 0.5886266734655748, "learning_rate": 4.901351350840683e-06, "loss": 0.0532, "step": 987 }, { "epoch": 0.4494995450409463, "grad_norm": 0.7808229432327206, "learning_rate": 4.901152480518155e-06, "loss": 0.048, "step": 988 }, { "epoch": 0.4499545040946315, "grad_norm": 0.6018998346440647, "learning_rate": 4.900953413984289e-06, "loss": 0.0494, "step": 989 }, { "epoch": 0.45040946314831665, "grad_norm": 1.136855215162297, "learning_rate": 4.900754151255353e-06, "loss": 0.1101, "step": 990 }, { "epoch": 0.45086442220200185, "grad_norm": 0.7654221991027399, "learning_rate": 4.9005546923476305e-06, "loss": 0.0514, "step": 991 }, { "epoch": 0.451319381255687, "grad_norm": 0.8646543296697372, "learning_rate": 4.9003550372774185e-06, "loss": 0.0773, "step": 992 }, { "epoch": 0.45177434030937214, "grad_norm": 0.7391042261561228, "learning_rate": 4.900155186061033e-06, "loss": 0.0593, "step": 993 }, { "epoch": 0.45222929936305734, "grad_norm": 0.9599073893248152, "learning_rate": 4.8999551387148045e-06, "loss": 0.0609, "step": 994 }, { "epoch": 0.4526842584167425, "grad_norm": 0.6526165463731411, "learning_rate": 4.89975489525508e-06, "loss": 0.0561, "step": 995 }, { "epoch": 0.4531392174704277, "grad_norm": 0.8465255037900871, "learning_rate": 4.899554455698223e-06, "loss": 0.0671, "step": 996 }, { "epoch": 0.4535941765241128, "grad_norm": 0.7201042670777416, "learning_rate": 4.899353820060612e-06, "loss": 0.0528, "step": 997 }, { "epoch": 0.454049135577798, "grad_norm": 1.0593597416511176, "learning_rate": 4.899152988358643e-06, "loss": 0.0911, "step": 998 }, { "epoch": 0.45450409463148317, "grad_norm": 0.6740510359790731, "learning_rate": 4.898951960608725e-06, "loss": 0.0516, "step": 999 }, { "epoch": 0.4549590536851683, "grad_norm": 0.7844312545261172, "learning_rate": 4.8987507368272865e-06, "loss": 0.0669, "step": 1000 }, { "epoch": 0.4554140127388535, "grad_norm": 0.8616523144825781, "learning_rate": 4.898549317030772e-06, "loss": 0.0793, "step": 1001 }, { "epoch": 0.45586897179253866, "grad_norm": 0.9076102187991024, "learning_rate": 4.898347701235637e-06, "loss": 0.0774, "step": 1002 }, { "epoch": 0.45632393084622386, "grad_norm": 0.9763695817464088, "learning_rate": 4.89814588945836e-06, "loss": 0.0893, "step": 1003 }, { "epoch": 0.456778889899909, "grad_norm": 0.83319057600543, "learning_rate": 4.89794388171543e-06, "loss": 0.0707, "step": 1004 }, { "epoch": 0.4572338489535942, "grad_norm": 1.8792537681412733, "learning_rate": 4.897741678023356e-06, "loss": 0.0764, "step": 1005 }, { "epoch": 0.45768880800727935, "grad_norm": 0.7734685973931732, "learning_rate": 4.897539278398659e-06, "loss": 0.0627, "step": 1006 }, { "epoch": 0.4581437670609645, "grad_norm": 0.9415629435575145, "learning_rate": 4.8973366828578804e-06, "loss": 0.0739, "step": 1007 }, { "epoch": 0.4585987261146497, "grad_norm": 0.7425043558467179, "learning_rate": 4.897133891417574e-06, "loss": 0.0654, "step": 1008 }, { "epoch": 0.45905368516833484, "grad_norm": 0.8911942098198534, "learning_rate": 4.896930904094311e-06, "loss": 0.0561, "step": 1009 }, { "epoch": 0.45950864422202004, "grad_norm": 1.2263772300651212, "learning_rate": 4.896727720904679e-06, "loss": 0.0864, "step": 1010 }, { "epoch": 0.4599636032757052, "grad_norm": 0.5601400077515136, "learning_rate": 4.896524341865282e-06, "loss": 0.0438, "step": 1011 }, { "epoch": 0.4604185623293904, "grad_norm": 0.877896381579309, "learning_rate": 4.896320766992737e-06, "loss": 0.0942, "step": 1012 }, { "epoch": 0.4608735213830755, "grad_norm": 0.5305552323500966, "learning_rate": 4.896116996303682e-06, "loss": 0.0529, "step": 1013 }, { "epoch": 0.46132848043676067, "grad_norm": 1.0131750705641154, "learning_rate": 4.895913029814766e-06, "loss": 0.0615, "step": 1014 }, { "epoch": 0.46178343949044587, "grad_norm": 1.000424807381978, "learning_rate": 4.895708867542658e-06, "loss": 0.0715, "step": 1015 }, { "epoch": 0.462238398544131, "grad_norm": 0.8460760286247231, "learning_rate": 4.895504509504039e-06, "loss": 0.0668, "step": 1016 }, { "epoch": 0.4626933575978162, "grad_norm": 0.7307313549457798, "learning_rate": 4.89529995571561e-06, "loss": 0.0703, "step": 1017 }, { "epoch": 0.46314831665150136, "grad_norm": 0.9064676368721324, "learning_rate": 4.895095206194086e-06, "loss": 0.0741, "step": 1018 }, { "epoch": 0.46360327570518656, "grad_norm": 0.7836127913087492, "learning_rate": 4.894890260956198e-06, "loss": 0.0609, "step": 1019 }, { "epoch": 0.4640582347588717, "grad_norm": 0.8243374701106395, "learning_rate": 4.8946851200186925e-06, "loss": 0.0714, "step": 1020 }, { "epoch": 0.46451319381255685, "grad_norm": 0.7466209538262989, "learning_rate": 4.894479783398334e-06, "loss": 0.0645, "step": 1021 }, { "epoch": 0.46496815286624205, "grad_norm": 0.8777577697024573, "learning_rate": 4.8942742511119004e-06, "loss": 0.0702, "step": 1022 }, { "epoch": 0.4654231119199272, "grad_norm": 2.534404305990435, "learning_rate": 4.894068523176187e-06, "loss": 0.1764, "step": 1023 }, { "epoch": 0.4658780709736124, "grad_norm": 0.6909522226512711, "learning_rate": 4.8938625996080056e-06, "loss": 0.0609, "step": 1024 }, { "epoch": 0.46633303002729753, "grad_norm": 0.49151101507719136, "learning_rate": 4.893656480424184e-06, "loss": 0.038, "step": 1025 }, { "epoch": 0.46678798908098273, "grad_norm": 0.6934743220996047, "learning_rate": 4.893450165641564e-06, "loss": 0.0639, "step": 1026 }, { "epoch": 0.4672429481346679, "grad_norm": 0.6130829752556336, "learning_rate": 4.893243655277005e-06, "loss": 0.0555, "step": 1027 }, { "epoch": 0.467697907188353, "grad_norm": 0.7605330677818611, "learning_rate": 4.893036949347383e-06, "loss": 0.0617, "step": 1028 }, { "epoch": 0.4681528662420382, "grad_norm": 0.6903185226412064, "learning_rate": 4.892830047869588e-06, "loss": 0.0568, "step": 1029 }, { "epoch": 0.46860782529572337, "grad_norm": 0.6912281627309118, "learning_rate": 4.892622950860527e-06, "loss": 0.0395, "step": 1030 }, { "epoch": 0.46906278434940857, "grad_norm": 0.6581879258835811, "learning_rate": 4.892415658337123e-06, "loss": 0.0634, "step": 1031 }, { "epoch": 0.4695177434030937, "grad_norm": 0.6826891181604423, "learning_rate": 4.892208170316317e-06, "loss": 0.054, "step": 1032 }, { "epoch": 0.4699727024567789, "grad_norm": 0.5663228273948423, "learning_rate": 4.892000486815062e-06, "loss": 0.0429, "step": 1033 }, { "epoch": 0.47042766151046406, "grad_norm": 0.6736706134956636, "learning_rate": 4.891792607850328e-06, "loss": 0.0576, "step": 1034 }, { "epoch": 0.4708826205641492, "grad_norm": 1.3024590269353873, "learning_rate": 4.891584533439104e-06, "loss": 0.0942, "step": 1035 }, { "epoch": 0.4713375796178344, "grad_norm": 0.7591015395797897, "learning_rate": 4.891376263598393e-06, "loss": 0.0686, "step": 1036 }, { "epoch": 0.47179253867151955, "grad_norm": 0.6830269838411529, "learning_rate": 4.891167798345213e-06, "loss": 0.0546, "step": 1037 }, { "epoch": 0.47224749772520475, "grad_norm": 1.08323398335216, "learning_rate": 4.890959137696598e-06, "loss": 0.0891, "step": 1038 }, { "epoch": 0.4727024567788899, "grad_norm": 0.8474479862966637, "learning_rate": 4.890750281669601e-06, "loss": 0.0647, "step": 1039 }, { "epoch": 0.4731574158325751, "grad_norm": 0.5830298328980045, "learning_rate": 4.890541230281287e-06, "loss": 0.0434, "step": 1040 }, { "epoch": 0.47361237488626023, "grad_norm": 0.8284120242669826, "learning_rate": 4.8903319835487385e-06, "loss": 0.0658, "step": 1041 }, { "epoch": 0.4740673339399454, "grad_norm": 1.0749546085564385, "learning_rate": 4.890122541489056e-06, "loss": 0.0781, "step": 1042 }, { "epoch": 0.4745222929936306, "grad_norm": 0.8028746869091197, "learning_rate": 4.889912904119353e-06, "loss": 0.0745, "step": 1043 }, { "epoch": 0.4749772520473157, "grad_norm": 0.9995418792465922, "learning_rate": 4.88970307145676e-06, "loss": 0.0676, "step": 1044 }, { "epoch": 0.4754322111010009, "grad_norm": 0.6522882405079128, "learning_rate": 4.889493043518423e-06, "loss": 0.0562, "step": 1045 }, { "epoch": 0.47588717015468607, "grad_norm": 0.5881171761200348, "learning_rate": 4.889282820321506e-06, "loss": 0.0346, "step": 1046 }, { "epoch": 0.47634212920837127, "grad_norm": 0.9483028123171037, "learning_rate": 4.889072401883187e-06, "loss": 0.0667, "step": 1047 }, { "epoch": 0.4767970882620564, "grad_norm": 0.6670957840215991, "learning_rate": 4.88886178822066e-06, "loss": 0.058, "step": 1048 }, { "epoch": 0.47725204731574156, "grad_norm": 0.8148080209615183, "learning_rate": 4.888650979351136e-06, "loss": 0.0719, "step": 1049 }, { "epoch": 0.47770700636942676, "grad_norm": 0.9532630639821967, "learning_rate": 4.888439975291841e-06, "loss": 0.0953, "step": 1050 }, { "epoch": 0.4781619654231119, "grad_norm": 0.6515552152853241, "learning_rate": 4.888228776060017e-06, "loss": 0.0599, "step": 1051 }, { "epoch": 0.4786169244767971, "grad_norm": 0.8306189556551553, "learning_rate": 4.888017381672923e-06, "loss": 0.0616, "step": 1052 }, { "epoch": 0.47907188353048225, "grad_norm": 0.7827901374918418, "learning_rate": 4.887805792147832e-06, "loss": 0.0609, "step": 1053 }, { "epoch": 0.47952684258416745, "grad_norm": 1.0177718214883258, "learning_rate": 4.887594007502036e-06, "loss": 0.0655, "step": 1054 }, { "epoch": 0.4799818016378526, "grad_norm": 1.0068335369987877, "learning_rate": 4.887382027752838e-06, "loss": 0.0723, "step": 1055 }, { "epoch": 0.48043676069153773, "grad_norm": 0.7718561478385435, "learning_rate": 4.8871698529175636e-06, "loss": 0.0706, "step": 1056 }, { "epoch": 0.48089171974522293, "grad_norm": 0.8506407410185749, "learning_rate": 4.886957483013549e-06, "loss": 0.0794, "step": 1057 }, { "epoch": 0.4813466787989081, "grad_norm": 1.0436903958800676, "learning_rate": 4.886744918058149e-06, "loss": 0.0863, "step": 1058 }, { "epoch": 0.4818016378525933, "grad_norm": 0.8684291097009643, "learning_rate": 4.886532158068732e-06, "loss": 0.0639, "step": 1059 }, { "epoch": 0.4822565969062784, "grad_norm": 0.8196534236848143, "learning_rate": 4.886319203062683e-06, "loss": 0.0575, "step": 1060 }, { "epoch": 0.4827115559599636, "grad_norm": 0.9689674310762992, "learning_rate": 4.886106053057408e-06, "loss": 0.0676, "step": 1061 }, { "epoch": 0.48316651501364877, "grad_norm": 0.9818608722703667, "learning_rate": 4.88589270807032e-06, "loss": 0.0799, "step": 1062 }, { "epoch": 0.48362147406733397, "grad_norm": 0.7489753545607997, "learning_rate": 4.885679168118855e-06, "loss": 0.071, "step": 1063 }, { "epoch": 0.4840764331210191, "grad_norm": 0.9547184698601481, "learning_rate": 4.8854654332204635e-06, "loss": 0.0898, "step": 1064 }, { "epoch": 0.48453139217470426, "grad_norm": 0.7541528249104351, "learning_rate": 4.885251503392607e-06, "loss": 0.0543, "step": 1065 }, { "epoch": 0.48498635122838946, "grad_norm": 1.067810398256581, "learning_rate": 4.885037378652771e-06, "loss": 0.092, "step": 1066 }, { "epoch": 0.4854413102820746, "grad_norm": 0.8614801184366359, "learning_rate": 4.884823059018451e-06, "loss": 0.0523, "step": 1067 }, { "epoch": 0.4858962693357598, "grad_norm": 0.8148290305727078, "learning_rate": 4.88460854450716e-06, "loss": 0.0584, "step": 1068 }, { "epoch": 0.48635122838944495, "grad_norm": 0.5855038739875491, "learning_rate": 4.884393835136427e-06, "loss": 0.0518, "step": 1069 }, { "epoch": 0.48680618744313015, "grad_norm": 0.8161089915400275, "learning_rate": 4.884178930923799e-06, "loss": 0.0574, "step": 1070 }, { "epoch": 0.4872611464968153, "grad_norm": 0.9133819494317933, "learning_rate": 4.883963831886834e-06, "loss": 0.0646, "step": 1071 }, { "epoch": 0.48771610555050043, "grad_norm": 0.8617450933904238, "learning_rate": 4.8837485380431115e-06, "loss": 0.0681, "step": 1072 }, { "epoch": 0.48817106460418563, "grad_norm": 0.6850383590418775, "learning_rate": 4.883533049410223e-06, "loss": 0.0547, "step": 1073 }, { "epoch": 0.4886260236578708, "grad_norm": 0.9045667246036927, "learning_rate": 4.8833173660057785e-06, "loss": 0.0759, "step": 1074 }, { "epoch": 0.489080982711556, "grad_norm": 0.8809498978267978, "learning_rate": 4.8831014878474004e-06, "loss": 0.0695, "step": 1075 }, { "epoch": 0.4895359417652411, "grad_norm": 0.7567941392113556, "learning_rate": 4.882885414952732e-06, "loss": 0.0626, "step": 1076 }, { "epoch": 0.4899909008189263, "grad_norm": 0.5783840394829795, "learning_rate": 4.882669147339428e-06, "loss": 0.0398, "step": 1077 }, { "epoch": 0.49044585987261147, "grad_norm": 0.6472410247637905, "learning_rate": 4.882452685025161e-06, "loss": 0.0433, "step": 1078 }, { "epoch": 0.4909008189262966, "grad_norm": 1.0450390786251647, "learning_rate": 4.88223602802762e-06, "loss": 0.0777, "step": 1079 }, { "epoch": 0.4913557779799818, "grad_norm": 0.7447912796292419, "learning_rate": 4.882019176364509e-06, "loss": 0.0717, "step": 1080 }, { "epoch": 0.49181073703366696, "grad_norm": 1.0045515435128731, "learning_rate": 4.881802130053548e-06, "loss": 0.0846, "step": 1081 }, { "epoch": 0.49226569608735216, "grad_norm": 0.7406749978383526, "learning_rate": 4.881584889112473e-06, "loss": 0.0576, "step": 1082 }, { "epoch": 0.4927206551410373, "grad_norm": 0.8307193744296768, "learning_rate": 4.881367453559036e-06, "loss": 0.0666, "step": 1083 }, { "epoch": 0.4931756141947225, "grad_norm": 0.6076387216705473, "learning_rate": 4.881149823411005e-06, "loss": 0.039, "step": 1084 }, { "epoch": 0.49363057324840764, "grad_norm": 0.6332467754054336, "learning_rate": 4.880931998686162e-06, "loss": 0.0494, "step": 1085 }, { "epoch": 0.4940855323020928, "grad_norm": 1.1521530496126213, "learning_rate": 4.880713979402311e-06, "loss": 0.1118, "step": 1086 }, { "epoch": 0.494540491355778, "grad_norm": 1.3887385722770256, "learning_rate": 4.880495765577263e-06, "loss": 0.0973, "step": 1087 }, { "epoch": 0.49499545040946313, "grad_norm": 0.8060673459124312, "learning_rate": 4.880277357228852e-06, "loss": 0.057, "step": 1088 }, { "epoch": 0.49545040946314833, "grad_norm": 0.5602480552863556, "learning_rate": 4.880058754374923e-06, "loss": 0.0521, "step": 1089 }, { "epoch": 0.4959053685168335, "grad_norm": 0.823959291020419, "learning_rate": 4.879839957033343e-06, "loss": 0.0732, "step": 1090 }, { "epoch": 0.4963603275705187, "grad_norm": 0.7660167202497931, "learning_rate": 4.879620965221987e-06, "loss": 0.0607, "step": 1091 }, { "epoch": 0.4968152866242038, "grad_norm": 0.6055131455183578, "learning_rate": 4.879401778958755e-06, "loss": 0.0478, "step": 1092 }, { "epoch": 0.49727024567788897, "grad_norm": 0.6291031678530673, "learning_rate": 4.8791823982615525e-06, "loss": 0.041, "step": 1093 }, { "epoch": 0.49772520473157417, "grad_norm": 0.5530508650871759, "learning_rate": 4.878962823148308e-06, "loss": 0.0424, "step": 1094 }, { "epoch": 0.4981801637852593, "grad_norm": 0.8531604298870573, "learning_rate": 4.878743053636968e-06, "loss": 0.0701, "step": 1095 }, { "epoch": 0.4986351228389445, "grad_norm": 0.7748184089023449, "learning_rate": 4.878523089745485e-06, "loss": 0.0748, "step": 1096 }, { "epoch": 0.49909008189262966, "grad_norm": 0.572162405974625, "learning_rate": 4.878302931491837e-06, "loss": 0.0531, "step": 1097 }, { "epoch": 0.49954504094631486, "grad_norm": 0.5587152183134356, "learning_rate": 4.8780825788940145e-06, "loss": 0.0377, "step": 1098 }, { "epoch": 0.5, "grad_norm": 0.7739049242000703, "learning_rate": 4.877862031970023e-06, "loss": 0.0653, "step": 1099 }, { "epoch": 0.5004549590536852, "grad_norm": 0.9657161074292855, "learning_rate": 4.8776412907378845e-06, "loss": 0.0693, "step": 1100 }, { "epoch": 0.5009099181073703, "grad_norm": 0.7975995176144653, "learning_rate": 4.877420355215637e-06, "loss": 0.0647, "step": 1101 }, { "epoch": 0.5013648771610555, "grad_norm": 1.0916440196980206, "learning_rate": 4.877199225421334e-06, "loss": 0.0904, "step": 1102 }, { "epoch": 0.5018198362147407, "grad_norm": 1.031914303869678, "learning_rate": 4.8769779013730454e-06, "loss": 0.1104, "step": 1103 }, { "epoch": 0.5022747952684259, "grad_norm": 0.9179664512897192, "learning_rate": 4.876756383088858e-06, "loss": 0.0731, "step": 1104 }, { "epoch": 0.502729754322111, "grad_norm": 0.5895244169254785, "learning_rate": 4.876534670586872e-06, "loss": 0.0513, "step": 1105 }, { "epoch": 0.5031847133757962, "grad_norm": 0.654710185038575, "learning_rate": 4.8763127638852045e-06, "loss": 0.0605, "step": 1106 }, { "epoch": 0.5036396724294814, "grad_norm": 0.7685792189309535, "learning_rate": 4.87609066300199e-06, "loss": 0.068, "step": 1107 }, { "epoch": 0.5040946314831665, "grad_norm": 0.8932660080856344, "learning_rate": 4.875868367955376e-06, "loss": 0.0789, "step": 1108 }, { "epoch": 0.5045495905368517, "grad_norm": 1.0120677400517832, "learning_rate": 4.87564587876353e-06, "loss": 0.0864, "step": 1109 }, { "epoch": 0.5050045495905369, "grad_norm": 0.835851016625387, "learning_rate": 4.87542319544463e-06, "loss": 0.0582, "step": 1110 }, { "epoch": 0.5054595086442221, "grad_norm": 0.7695840357212476, "learning_rate": 4.875200318016873e-06, "loss": 0.0675, "step": 1111 }, { "epoch": 0.5059144676979072, "grad_norm": 0.7971275413132646, "learning_rate": 4.8749772464984736e-06, "loss": 0.0743, "step": 1112 }, { "epoch": 0.5063694267515924, "grad_norm": 0.8814806686041933, "learning_rate": 4.874753980907658e-06, "loss": 0.0856, "step": 1113 }, { "epoch": 0.5068243858052776, "grad_norm": 0.6757729077082226, "learning_rate": 4.8745305212626714e-06, "loss": 0.0512, "step": 1114 }, { "epoch": 0.5072793448589626, "grad_norm": 0.7352914895461456, "learning_rate": 4.874306867581775e-06, "loss": 0.0618, "step": 1115 }, { "epoch": 0.5077343039126478, "grad_norm": 0.5868042194217611, "learning_rate": 4.874083019883242e-06, "loss": 0.0366, "step": 1116 }, { "epoch": 0.508189262966333, "grad_norm": 0.9033247458477103, "learning_rate": 4.873858978185367e-06, "loss": 0.0806, "step": 1117 }, { "epoch": 0.5086442220200182, "grad_norm": 1.2038578681531908, "learning_rate": 4.8736347425064565e-06, "loss": 0.1055, "step": 1118 }, { "epoch": 0.5090991810737033, "grad_norm": 0.8178175242835675, "learning_rate": 4.873410312864833e-06, "loss": 0.0609, "step": 1119 }, { "epoch": 0.5095541401273885, "grad_norm": 0.6653546708039177, "learning_rate": 4.8731856892788384e-06, "loss": 0.0495, "step": 1120 }, { "epoch": 0.5100090991810737, "grad_norm": 1.0479881259244002, "learning_rate": 4.872960871766826e-06, "loss": 0.0943, "step": 1121 }, { "epoch": 0.5104640582347588, "grad_norm": 0.6898301418972904, "learning_rate": 4.8727358603471675e-06, "loss": 0.072, "step": 1122 }, { "epoch": 0.510919017288444, "grad_norm": 0.9228312888198933, "learning_rate": 4.872510655038249e-06, "loss": 0.0594, "step": 1123 }, { "epoch": 0.5113739763421292, "grad_norm": 0.6811960575278385, "learning_rate": 4.872285255858476e-06, "loss": 0.0675, "step": 1124 }, { "epoch": 0.5118289353958144, "grad_norm": 0.9624744009083318, "learning_rate": 4.872059662826263e-06, "loss": 0.0766, "step": 1125 }, { "epoch": 0.5122838944494995, "grad_norm": 0.7538277543537744, "learning_rate": 4.8718338759600465e-06, "loss": 0.0592, "step": 1126 }, { "epoch": 0.5127388535031847, "grad_norm": 0.7210100766327706, "learning_rate": 4.871607895278278e-06, "loss": 0.0723, "step": 1127 }, { "epoch": 0.5131938125568699, "grad_norm": 0.6525437186084021, "learning_rate": 4.871381720799421e-06, "loss": 0.0474, "step": 1128 }, { "epoch": 0.513648771610555, "grad_norm": 0.8115901002403193, "learning_rate": 4.8711553525419595e-06, "loss": 0.066, "step": 1129 }, { "epoch": 0.5141037306642402, "grad_norm": 0.7148215823332176, "learning_rate": 4.87092879052439e-06, "loss": 0.0627, "step": 1130 }, { "epoch": 0.5145586897179254, "grad_norm": 0.48371660949391987, "learning_rate": 4.8707020347652275e-06, "loss": 0.0392, "step": 1131 }, { "epoch": 0.5150136487716106, "grad_norm": 0.7655779285836447, "learning_rate": 4.870475085283001e-06, "loss": 0.0659, "step": 1132 }, { "epoch": 0.5154686078252957, "grad_norm": 0.8307291704590695, "learning_rate": 4.870247942096254e-06, "loss": 0.0675, "step": 1133 }, { "epoch": 0.5159235668789809, "grad_norm": 0.6005245010930204, "learning_rate": 4.870020605223551e-06, "loss": 0.0435, "step": 1134 }, { "epoch": 0.5163785259326661, "grad_norm": 0.8060750325493741, "learning_rate": 4.869793074683466e-06, "loss": 0.06, "step": 1135 }, { "epoch": 0.5168334849863512, "grad_norm": 1.1918274978409322, "learning_rate": 4.8695653504945925e-06, "loss": 0.082, "step": 1136 }, { "epoch": 0.5172884440400364, "grad_norm": 1.307377261046503, "learning_rate": 4.8693374326755405e-06, "loss": 0.1036, "step": 1137 }, { "epoch": 0.5177434030937216, "grad_norm": 0.6775134916120404, "learning_rate": 4.869109321244932e-06, "loss": 0.0626, "step": 1138 }, { "epoch": 0.5181983621474068, "grad_norm": 0.7826832431767746, "learning_rate": 4.86888101622141e-06, "loss": 0.0678, "step": 1139 }, { "epoch": 0.5186533212010919, "grad_norm": 0.7645020910331457, "learning_rate": 4.868652517623629e-06, "loss": 0.0489, "step": 1140 }, { "epoch": 0.5191082802547771, "grad_norm": 0.632952475643817, "learning_rate": 4.86842382547026e-06, "loss": 0.0494, "step": 1141 }, { "epoch": 0.5195632393084623, "grad_norm": 0.6876406168526772, "learning_rate": 4.868194939779992e-06, "loss": 0.0396, "step": 1142 }, { "epoch": 0.5200181983621474, "grad_norm": 0.6327165174782583, "learning_rate": 4.867965860571529e-06, "loss": 0.054, "step": 1143 }, { "epoch": 0.5204731574158326, "grad_norm": 1.0011588527834634, "learning_rate": 4.867736587863589e-06, "loss": 0.0877, "step": 1144 }, { "epoch": 0.5209281164695178, "grad_norm": 1.1185639295345813, "learning_rate": 4.867507121674907e-06, "loss": 0.0861, "step": 1145 }, { "epoch": 0.521383075523203, "grad_norm": 0.8624559113619508, "learning_rate": 4.867277462024235e-06, "loss": 0.0629, "step": 1146 }, { "epoch": 0.521838034576888, "grad_norm": 0.9348410440606036, "learning_rate": 4.8670476089303395e-06, "loss": 0.0933, "step": 1147 }, { "epoch": 0.5222929936305732, "grad_norm": 0.9320714625912361, "learning_rate": 4.866817562412003e-06, "loss": 0.1038, "step": 1148 }, { "epoch": 0.5227479526842584, "grad_norm": 0.8075624594132853, "learning_rate": 4.866587322488024e-06, "loss": 0.0809, "step": 1149 }, { "epoch": 0.5232029117379435, "grad_norm": 1.3413067521083781, "learning_rate": 4.866356889177216e-06, "loss": 0.108, "step": 1150 }, { "epoch": 0.5236578707916287, "grad_norm": 0.9232308217568203, "learning_rate": 4.866126262498409e-06, "loss": 0.083, "step": 1151 }, { "epoch": 0.5241128298453139, "grad_norm": 1.305379354125092, "learning_rate": 4.865895442470449e-06, "loss": 0.0958, "step": 1152 }, { "epoch": 0.5245677888989991, "grad_norm": 0.5576012713848366, "learning_rate": 4.865664429112199e-06, "loss": 0.0452, "step": 1153 }, { "epoch": 0.5250227479526842, "grad_norm": 0.921237979155653, "learning_rate": 4.8654332224425345e-06, "loss": 0.0711, "step": 1154 }, { "epoch": 0.5254777070063694, "grad_norm": 0.8929194799091247, "learning_rate": 4.865201822480349e-06, "loss": 0.0764, "step": 1155 }, { "epoch": 0.5259326660600546, "grad_norm": 0.581346673528037, "learning_rate": 4.864970229244552e-06, "loss": 0.0424, "step": 1156 }, { "epoch": 0.5263876251137397, "grad_norm": 0.5623832436496817, "learning_rate": 4.864738442754068e-06, "loss": 0.0434, "step": 1157 }, { "epoch": 0.5268425841674249, "grad_norm": 0.613920526367082, "learning_rate": 4.864506463027837e-06, "loss": 0.0506, "step": 1158 }, { "epoch": 0.5272975432211101, "grad_norm": 1.0387758175670767, "learning_rate": 4.864274290084816e-06, "loss": 0.0875, "step": 1159 }, { "epoch": 0.5277525022747953, "grad_norm": 0.8664987550519835, "learning_rate": 4.864041923943978e-06, "loss": 0.0633, "step": 1160 }, { "epoch": 0.5282074613284804, "grad_norm": 0.8313294484835317, "learning_rate": 4.863809364624309e-06, "loss": 0.069, "step": 1161 }, { "epoch": 0.5286624203821656, "grad_norm": 0.744844430385872, "learning_rate": 4.863576612144814e-06, "loss": 0.0669, "step": 1162 }, { "epoch": 0.5291173794358508, "grad_norm": 0.9237519112875051, "learning_rate": 4.863343666524512e-06, "loss": 0.0735, "step": 1163 }, { "epoch": 0.5295723384895359, "grad_norm": 1.0354177918634275, "learning_rate": 4.863110527782437e-06, "loss": 0.0663, "step": 1164 }, { "epoch": 0.5300272975432211, "grad_norm": 0.8421924118581489, "learning_rate": 4.8628771959376435e-06, "loss": 0.0611, "step": 1165 }, { "epoch": 0.5304822565969063, "grad_norm": 0.9052022304190199, "learning_rate": 4.862643671009195e-06, "loss": 0.0678, "step": 1166 }, { "epoch": 0.5309372156505915, "grad_norm": 0.5590184662551977, "learning_rate": 4.862409953016175e-06, "loss": 0.0643, "step": 1167 }, { "epoch": 0.5313921747042766, "grad_norm": 1.017387677331305, "learning_rate": 4.862176041977683e-06, "loss": 0.0893, "step": 1168 }, { "epoch": 0.5318471337579618, "grad_norm": 0.7533348435394249, "learning_rate": 4.861941937912832e-06, "loss": 0.0674, "step": 1169 }, { "epoch": 0.532302092811647, "grad_norm": 0.6445462791157965, "learning_rate": 4.861707640840752e-06, "loss": 0.0493, "step": 1170 }, { "epoch": 0.5327570518653321, "grad_norm": 0.7797944668273296, "learning_rate": 4.861473150780589e-06, "loss": 0.0676, "step": 1171 }, { "epoch": 0.5332120109190173, "grad_norm": 0.9208202792762923, "learning_rate": 4.8612384677515054e-06, "loss": 0.0823, "step": 1172 }, { "epoch": 0.5336669699727025, "grad_norm": 0.597595752698138, "learning_rate": 4.861003591772677e-06, "loss": 0.0494, "step": 1173 }, { "epoch": 0.5341219290263877, "grad_norm": 0.8195100456101612, "learning_rate": 4.860768522863297e-06, "loss": 0.0538, "step": 1174 }, { "epoch": 0.5345768880800728, "grad_norm": 0.7767115772582979, "learning_rate": 4.860533261042574e-06, "loss": 0.0623, "step": 1175 }, { "epoch": 0.535031847133758, "grad_norm": 0.7588370967921573, "learning_rate": 4.8602978063297336e-06, "loss": 0.0825, "step": 1176 }, { "epoch": 0.5354868061874432, "grad_norm": 0.8816447645987197, "learning_rate": 4.8600621587440155e-06, "loss": 0.0608, "step": 1177 }, { "epoch": 0.5359417652411284, "grad_norm": 0.8625275373939374, "learning_rate": 4.859826318304676e-06, "loss": 0.0778, "step": 1178 }, { "epoch": 0.5363967242948134, "grad_norm": 0.5461509620774402, "learning_rate": 4.859590285030986e-06, "loss": 0.0555, "step": 1179 }, { "epoch": 0.5368516833484986, "grad_norm": 0.8435272435369155, "learning_rate": 4.859354058942234e-06, "loss": 0.0748, "step": 1180 }, { "epoch": 0.5373066424021838, "grad_norm": 0.7720087336226316, "learning_rate": 4.859117640057723e-06, "loss": 0.0671, "step": 1181 }, { "epoch": 0.5377616014558689, "grad_norm": 0.9891841529967211, "learning_rate": 4.858881028396773e-06, "loss": 0.0912, "step": 1182 }, { "epoch": 0.5382165605095541, "grad_norm": 0.8570437893831205, "learning_rate": 4.8586442239787165e-06, "loss": 0.065, "step": 1183 }, { "epoch": 0.5386715195632393, "grad_norm": 0.8305844073661711, "learning_rate": 4.858407226822906e-06, "loss": 0.0762, "step": 1184 }, { "epoch": 0.5391264786169245, "grad_norm": 0.640701641053205, "learning_rate": 4.858170036948707e-06, "loss": 0.0581, "step": 1185 }, { "epoch": 0.5395814376706096, "grad_norm": 0.6468235214549333, "learning_rate": 4.857932654375503e-06, "loss": 0.0482, "step": 1186 }, { "epoch": 0.5400363967242948, "grad_norm": 1.1837084026628135, "learning_rate": 4.857695079122691e-06, "loss": 0.1159, "step": 1187 }, { "epoch": 0.54049135577798, "grad_norm": 0.6715223091608853, "learning_rate": 4.857457311209683e-06, "loss": 0.0601, "step": 1188 }, { "epoch": 0.5409463148316651, "grad_norm": 0.6371483119799936, "learning_rate": 4.857219350655911e-06, "loss": 0.0528, "step": 1189 }, { "epoch": 0.5414012738853503, "grad_norm": 0.6786845490419491, "learning_rate": 4.856981197480818e-06, "loss": 0.0567, "step": 1190 }, { "epoch": 0.5418562329390355, "grad_norm": 0.5942209619771379, "learning_rate": 4.856742851703866e-06, "loss": 0.0489, "step": 1191 }, { "epoch": 0.5423111919927207, "grad_norm": 0.859323950389801, "learning_rate": 4.856504313344531e-06, "loss": 0.0904, "step": 1192 }, { "epoch": 0.5427661510464058, "grad_norm": 0.8018437461164658, "learning_rate": 4.8562655824223055e-06, "loss": 0.0597, "step": 1193 }, { "epoch": 0.543221110100091, "grad_norm": 0.588833511104325, "learning_rate": 4.856026658956697e-06, "loss": 0.0423, "step": 1194 }, { "epoch": 0.5436760691537762, "grad_norm": 0.573460707090398, "learning_rate": 4.8557875429672295e-06, "loss": 0.0633, "step": 1195 }, { "epoch": 0.5441310282074613, "grad_norm": 0.7742873470777307, "learning_rate": 4.855548234473444e-06, "loss": 0.0854, "step": 1196 }, { "epoch": 0.5445859872611465, "grad_norm": 0.9198876967543222, "learning_rate": 4.8553087334948935e-06, "loss": 0.0838, "step": 1197 }, { "epoch": 0.5450409463148317, "grad_norm": 0.6622698873314925, "learning_rate": 4.855069040051149e-06, "loss": 0.0557, "step": 1198 }, { "epoch": 0.5454959053685169, "grad_norm": 0.9436539139154301, "learning_rate": 4.854829154161799e-06, "loss": 0.0816, "step": 1199 }, { "epoch": 0.545950864422202, "grad_norm": 0.738597876294885, "learning_rate": 4.854589075846445e-06, "loss": 0.0706, "step": 1200 }, { "epoch": 0.5464058234758872, "grad_norm": 0.6650944152999292, "learning_rate": 4.854348805124704e-06, "loss": 0.0615, "step": 1201 }, { "epoch": 0.5468607825295724, "grad_norm": 0.7740616702375358, "learning_rate": 4.85410834201621e-06, "loss": 0.0622, "step": 1202 }, { "epoch": 0.5473157415832575, "grad_norm": 1.0655587786032952, "learning_rate": 4.8538676865406155e-06, "loss": 0.1008, "step": 1203 }, { "epoch": 0.5477707006369427, "grad_norm": 0.562596201287674, "learning_rate": 4.853626838717582e-06, "loss": 0.0446, "step": 1204 }, { "epoch": 0.5482256596906279, "grad_norm": 0.7450161661122177, "learning_rate": 4.853385798566793e-06, "loss": 0.0505, "step": 1205 }, { "epoch": 0.5486806187443131, "grad_norm": 0.7252053220128412, "learning_rate": 4.8531445661079444e-06, "loss": 0.0556, "step": 1206 }, { "epoch": 0.5491355777979982, "grad_norm": 0.8995425654201067, "learning_rate": 4.852903141360749e-06, "loss": 0.0759, "step": 1207 }, { "epoch": 0.5495905368516834, "grad_norm": 0.5908915627465986, "learning_rate": 4.852661524344933e-06, "loss": 0.0383, "step": 1208 }, { "epoch": 0.5500454959053686, "grad_norm": 0.8182852324660039, "learning_rate": 4.852419715080244e-06, "loss": 0.0764, "step": 1209 }, { "epoch": 0.5505004549590536, "grad_norm": 0.7043639311746182, "learning_rate": 4.852177713586437e-06, "loss": 0.0573, "step": 1210 }, { "epoch": 0.5509554140127388, "grad_norm": 0.6570600921060993, "learning_rate": 4.85193551988329e-06, "loss": 0.054, "step": 1211 }, { "epoch": 0.551410373066424, "grad_norm": 0.6708751527421707, "learning_rate": 4.851693133990594e-06, "loss": 0.0506, "step": 1212 }, { "epoch": 0.5518653321201092, "grad_norm": 0.5972152927280668, "learning_rate": 4.851450555928155e-06, "loss": 0.0427, "step": 1213 }, { "epoch": 0.5523202911737943, "grad_norm": 1.0934539341786074, "learning_rate": 4.851207785715797e-06, "loss": 0.1214, "step": 1214 }, { "epoch": 0.5527752502274795, "grad_norm": 0.7434164602830275, "learning_rate": 4.850964823373355e-06, "loss": 0.0836, "step": 1215 }, { "epoch": 0.5532302092811647, "grad_norm": 0.6176814717659638, "learning_rate": 4.850721668920685e-06, "loss": 0.0518, "step": 1216 }, { "epoch": 0.5536851683348498, "grad_norm": 0.848848092705782, "learning_rate": 4.850478322377657e-06, "loss": 0.0768, "step": 1217 }, { "epoch": 0.554140127388535, "grad_norm": 0.9187012656708098, "learning_rate": 4.8502347837641536e-06, "loss": 0.0936, "step": 1218 }, { "epoch": 0.5545950864422202, "grad_norm": 0.8042146566684509, "learning_rate": 4.8499910531000776e-06, "loss": 0.0672, "step": 1219 }, { "epoch": 0.5550500454959054, "grad_norm": 0.9323679662294462, "learning_rate": 4.849747130405346e-06, "loss": 0.0685, "step": 1220 }, { "epoch": 0.5555050045495905, "grad_norm": 0.8486293499713085, "learning_rate": 4.849503015699889e-06, "loss": 0.0637, "step": 1221 }, { "epoch": 0.5559599636032757, "grad_norm": 0.8023393545361088, "learning_rate": 4.849258709003657e-06, "loss": 0.064, "step": 1222 }, { "epoch": 0.5564149226569609, "grad_norm": 1.043454299347868, "learning_rate": 4.849014210336612e-06, "loss": 0.0837, "step": 1223 }, { "epoch": 0.556869881710646, "grad_norm": 0.880437517370894, "learning_rate": 4.848769519718734e-06, "loss": 0.0886, "step": 1224 }, { "epoch": 0.5573248407643312, "grad_norm": 0.8100133023892003, "learning_rate": 4.848524637170018e-06, "loss": 0.063, "step": 1225 }, { "epoch": 0.5577797998180164, "grad_norm": 1.2239449746298685, "learning_rate": 4.848279562710474e-06, "loss": 0.1003, "step": 1226 }, { "epoch": 0.5582347588717016, "grad_norm": 0.5784631570123066, "learning_rate": 4.848034296360129e-06, "loss": 0.0461, "step": 1227 }, { "epoch": 0.5586897179253867, "grad_norm": 0.5785185693838462, "learning_rate": 4.847788838139025e-06, "loss": 0.0584, "step": 1228 }, { "epoch": 0.5591446769790719, "grad_norm": 0.7383840653779203, "learning_rate": 4.847543188067219e-06, "loss": 0.0556, "step": 1229 }, { "epoch": 0.5595996360327571, "grad_norm": 0.8449984615944548, "learning_rate": 4.847297346164786e-06, "loss": 0.0656, "step": 1230 }, { "epoch": 0.5600545950864422, "grad_norm": 0.7138008169538578, "learning_rate": 4.8470513124518134e-06, "loss": 0.0627, "step": 1231 }, { "epoch": 0.5605095541401274, "grad_norm": 1.2286881556843667, "learning_rate": 4.8468050869484075e-06, "loss": 0.0863, "step": 1232 }, { "epoch": 0.5609645131938126, "grad_norm": 0.7827144293564832, "learning_rate": 4.846558669674688e-06, "loss": 0.0535, "step": 1233 }, { "epoch": 0.5614194722474978, "grad_norm": 0.8311970732348628, "learning_rate": 4.8463120606507904e-06, "loss": 0.0577, "step": 1234 }, { "epoch": 0.5618744313011829, "grad_norm": 1.0881117043806725, "learning_rate": 4.846065259896867e-06, "loss": 0.0825, "step": 1235 }, { "epoch": 0.5623293903548681, "grad_norm": 0.9620936539691768, "learning_rate": 4.845818267433086e-06, "loss": 0.089, "step": 1236 }, { "epoch": 0.5627843494085533, "grad_norm": 0.7942713424944172, "learning_rate": 4.845571083279629e-06, "loss": 0.0654, "step": 1237 }, { "epoch": 0.5632393084622384, "grad_norm": 0.5998278656859003, "learning_rate": 4.845323707456696e-06, "loss": 0.0649, "step": 1238 }, { "epoch": 0.5636942675159236, "grad_norm": 0.741973804021484, "learning_rate": 4.845076139984502e-06, "loss": 0.06, "step": 1239 }, { "epoch": 0.5641492265696088, "grad_norm": 0.9737908411420552, "learning_rate": 4.844828380883274e-06, "loss": 0.0788, "step": 1240 }, { "epoch": 0.564604185623294, "grad_norm": 0.6456783705803008, "learning_rate": 4.844580430173261e-06, "loss": 0.062, "step": 1241 }, { "epoch": 0.565059144676979, "grad_norm": 0.4512477885968687, "learning_rate": 4.8443322878747236e-06, "loss": 0.0338, "step": 1242 }, { "epoch": 0.5655141037306642, "grad_norm": 0.6789010313138605, "learning_rate": 4.844083954007938e-06, "loss": 0.0553, "step": 1243 }, { "epoch": 0.5659690627843494, "grad_norm": 1.0592017111841259, "learning_rate": 4.843835428593198e-06, "loss": 0.0964, "step": 1244 }, { "epoch": 0.5664240218380345, "grad_norm": 0.8615337952745731, "learning_rate": 4.84358671165081e-06, "loss": 0.0803, "step": 1245 }, { "epoch": 0.5668789808917197, "grad_norm": 0.9068648816735045, "learning_rate": 4.843337803201102e-06, "loss": 0.0957, "step": 1246 }, { "epoch": 0.5673339399454049, "grad_norm": 0.7801553916627879, "learning_rate": 4.8430887032644094e-06, "loss": 0.0707, "step": 1247 }, { "epoch": 0.5677888989990901, "grad_norm": 0.888048216687448, "learning_rate": 4.842839411861089e-06, "loss": 0.0713, "step": 1248 }, { "epoch": 0.5682438580527752, "grad_norm": 0.7108300238400989, "learning_rate": 4.842589929011513e-06, "loss": 0.0609, "step": 1249 }, { "epoch": 0.5686988171064604, "grad_norm": 0.8602271760854026, "learning_rate": 4.8423402547360665e-06, "loss": 0.071, "step": 1250 }, { "epoch": 0.5691537761601456, "grad_norm": 0.7368968504486557, "learning_rate": 4.842090389055153e-06, "loss": 0.0549, "step": 1251 }, { "epoch": 0.5696087352138307, "grad_norm": 0.6376797315072175, "learning_rate": 4.841840331989189e-06, "loss": 0.0536, "step": 1252 }, { "epoch": 0.5700636942675159, "grad_norm": 0.7105471536122931, "learning_rate": 4.841590083558608e-06, "loss": 0.0589, "step": 1253 }, { "epoch": 0.5705186533212011, "grad_norm": 0.7851951409187395, "learning_rate": 4.841339643783861e-06, "loss": 0.0667, "step": 1254 }, { "epoch": 0.5709736123748863, "grad_norm": 0.9143449154341239, "learning_rate": 4.841089012685412e-06, "loss": 0.094, "step": 1255 }, { "epoch": 0.5714285714285714, "grad_norm": 0.7279105322404732, "learning_rate": 4.840838190283741e-06, "loss": 0.0665, "step": 1256 }, { "epoch": 0.5718835304822566, "grad_norm": 0.5565255651391556, "learning_rate": 4.8405871765993435e-06, "loss": 0.0374, "step": 1257 }, { "epoch": 0.5723384895359418, "grad_norm": 0.7414165634317871, "learning_rate": 4.840335971652732e-06, "loss": 0.055, "step": 1258 }, { "epoch": 0.5727934485896269, "grad_norm": 0.7491117483341468, "learning_rate": 4.840084575464434e-06, "loss": 0.0663, "step": 1259 }, { "epoch": 0.5732484076433121, "grad_norm": 0.7016390473579003, "learning_rate": 4.839832988054992e-06, "loss": 0.0585, "step": 1260 }, { "epoch": 0.5737033666969973, "grad_norm": 0.6978517385357002, "learning_rate": 4.839581209444966e-06, "loss": 0.0515, "step": 1261 }, { "epoch": 0.5741583257506825, "grad_norm": 0.8617787500306493, "learning_rate": 4.839329239654927e-06, "loss": 0.0695, "step": 1262 }, { "epoch": 0.5746132848043676, "grad_norm": 0.8166990461173421, "learning_rate": 4.839077078705468e-06, "loss": 0.055, "step": 1263 }, { "epoch": 0.5750682438580528, "grad_norm": 0.9058417331374469, "learning_rate": 4.838824726617194e-06, "loss": 0.0821, "step": 1264 }, { "epoch": 0.575523202911738, "grad_norm": 0.8868783686078405, "learning_rate": 4.838572183410725e-06, "loss": 0.0708, "step": 1265 }, { "epoch": 0.5759781619654231, "grad_norm": 0.8499550028220518, "learning_rate": 4.838319449106697e-06, "loss": 0.071, "step": 1266 }, { "epoch": 0.5764331210191083, "grad_norm": 0.6310677239795418, "learning_rate": 4.838066523725764e-06, "loss": 0.0466, "step": 1267 }, { "epoch": 0.5768880800727935, "grad_norm": 0.7334881695978646, "learning_rate": 4.837813407288594e-06, "loss": 0.0672, "step": 1268 }, { "epoch": 0.5773430391264787, "grad_norm": 0.6599472606872935, "learning_rate": 4.837560099815869e-06, "loss": 0.0514, "step": 1269 }, { "epoch": 0.5777979981801638, "grad_norm": 0.7760258987880212, "learning_rate": 4.837306601328289e-06, "loss": 0.0684, "step": 1270 }, { "epoch": 0.578252957233849, "grad_norm": 0.6309403280935036, "learning_rate": 4.837052911846569e-06, "loss": 0.0626, "step": 1271 }, { "epoch": 0.5787079162875342, "grad_norm": 0.8629672961688011, "learning_rate": 4.836799031391439e-06, "loss": 0.0784, "step": 1272 }, { "epoch": 0.5791628753412192, "grad_norm": 0.3854359252943462, "learning_rate": 4.836544959983645e-06, "loss": 0.033, "step": 1273 }, { "epoch": 0.5796178343949044, "grad_norm": 1.0318224319043552, "learning_rate": 4.8362906976439485e-06, "loss": 0.0849, "step": 1274 }, { "epoch": 0.5800727934485896, "grad_norm": 0.9665553458146198, "learning_rate": 4.836036244393127e-06, "loss": 0.0958, "step": 1275 }, { "epoch": 0.5805277525022748, "grad_norm": 0.9363624449700683, "learning_rate": 4.835781600251973e-06, "loss": 0.0765, "step": 1276 }, { "epoch": 0.5809827115559599, "grad_norm": 0.6147554145907947, "learning_rate": 4.835526765241295e-06, "loss": 0.0488, "step": 1277 }, { "epoch": 0.5814376706096451, "grad_norm": 0.8334344326658653, "learning_rate": 4.835271739381917e-06, "loss": 0.0721, "step": 1278 }, { "epoch": 0.5818926296633303, "grad_norm": 0.49964314159638445, "learning_rate": 4.835016522694678e-06, "loss": 0.0493, "step": 1279 }, { "epoch": 0.5823475887170154, "grad_norm": 0.9514130488464217, "learning_rate": 4.834761115200434e-06, "loss": 0.1112, "step": 1280 }, { "epoch": 0.5828025477707006, "grad_norm": 0.7622176607459498, "learning_rate": 4.834505516920055e-06, "loss": 0.0773, "step": 1281 }, { "epoch": 0.5832575068243858, "grad_norm": 0.8319534302623502, "learning_rate": 4.834249727874428e-06, "loss": 0.0734, "step": 1282 }, { "epoch": 0.583712465878071, "grad_norm": 0.6580298023552714, "learning_rate": 4.833993748084455e-06, "loss": 0.0487, "step": 1283 }, { "epoch": 0.5841674249317561, "grad_norm": 0.5709038914352429, "learning_rate": 4.833737577571052e-06, "loss": 0.0437, "step": 1284 }, { "epoch": 0.5846223839854413, "grad_norm": 0.7738487313994183, "learning_rate": 4.833481216355153e-06, "loss": 0.0593, "step": 1285 }, { "epoch": 0.5850773430391265, "grad_norm": 1.3097720038286855, "learning_rate": 4.833224664457709e-06, "loss": 0.1053, "step": 1286 }, { "epoch": 0.5855323020928116, "grad_norm": 1.191215763782131, "learning_rate": 4.83296792189968e-06, "loss": 0.0791, "step": 1287 }, { "epoch": 0.5859872611464968, "grad_norm": 0.7550564620604591, "learning_rate": 4.83271098870205e-06, "loss": 0.0614, "step": 1288 }, { "epoch": 0.586442220200182, "grad_norm": 0.7855417872089538, "learning_rate": 4.832453864885811e-06, "loss": 0.0765, "step": 1289 }, { "epoch": 0.5868971792538672, "grad_norm": 0.7082768853538572, "learning_rate": 4.832196550471976e-06, "loss": 0.0584, "step": 1290 }, { "epoch": 0.5873521383075523, "grad_norm": 0.7586707116910946, "learning_rate": 4.831939045481571e-06, "loss": 0.0693, "step": 1291 }, { "epoch": 0.5878070973612375, "grad_norm": 0.6804347439528804, "learning_rate": 4.8316813499356375e-06, "loss": 0.0579, "step": 1292 }, { "epoch": 0.5882620564149227, "grad_norm": 0.6650179741094593, "learning_rate": 4.831423463855235e-06, "loss": 0.0473, "step": 1293 }, { "epoch": 0.5887170154686078, "grad_norm": 0.7381087388697778, "learning_rate": 4.8311653872614345e-06, "loss": 0.061, "step": 1294 }, { "epoch": 0.589171974522293, "grad_norm": 0.6163282527593773, "learning_rate": 4.830907120175327e-06, "loss": 0.0458, "step": 1295 }, { "epoch": 0.5896269335759782, "grad_norm": 1.355736098526988, "learning_rate": 4.830648662618015e-06, "loss": 0.1213, "step": 1296 }, { "epoch": 0.5900818926296634, "grad_norm": 0.49193334954985163, "learning_rate": 4.83039001461062e-06, "loss": 0.0379, "step": 1297 }, { "epoch": 0.5905368516833485, "grad_norm": 0.8251065772364982, "learning_rate": 4.830131176174276e-06, "loss": 0.0614, "step": 1298 }, { "epoch": 0.5909918107370337, "grad_norm": 0.9196499228007727, "learning_rate": 4.829872147330136e-06, "loss": 0.0747, "step": 1299 }, { "epoch": 0.5914467697907189, "grad_norm": 0.6635211914340154, "learning_rate": 4.829612928099366e-06, "loss": 0.0599, "step": 1300 }, { "epoch": 0.591901728844404, "grad_norm": 0.5214758111450317, "learning_rate": 4.829353518503147e-06, "loss": 0.0466, "step": 1301 }, { "epoch": 0.5923566878980892, "grad_norm": 0.610000532317552, "learning_rate": 4.829093918562678e-06, "loss": 0.048, "step": 1302 }, { "epoch": 0.5928116469517744, "grad_norm": 1.0296471042370532, "learning_rate": 4.828834128299173e-06, "loss": 0.0849, "step": 1303 }, { "epoch": 0.5932666060054596, "grad_norm": 1.0181697893257282, "learning_rate": 4.828574147733859e-06, "loss": 0.0917, "step": 1304 }, { "epoch": 0.5937215650591446, "grad_norm": 0.7456729558547099, "learning_rate": 4.828313976887982e-06, "loss": 0.0566, "step": 1305 }, { "epoch": 0.5941765241128298, "grad_norm": 1.2895345953766368, "learning_rate": 4.8280536157828e-06, "loss": 0.0768, "step": 1306 }, { "epoch": 0.594631483166515, "grad_norm": 0.7193657333914658, "learning_rate": 4.827793064439592e-06, "loss": 0.0649, "step": 1307 }, { "epoch": 0.5950864422202001, "grad_norm": 0.6369033412844897, "learning_rate": 4.8275323228796455e-06, "loss": 0.0485, "step": 1308 }, { "epoch": 0.5955414012738853, "grad_norm": 0.4260555886565184, "learning_rate": 4.8272713911242695e-06, "loss": 0.0233, "step": 1309 }, { "epoch": 0.5959963603275705, "grad_norm": 0.5487802818772052, "learning_rate": 4.827010269194785e-06, "loss": 0.0429, "step": 1310 }, { "epoch": 0.5964513193812557, "grad_norm": 0.9163026616839156, "learning_rate": 4.8267489571125295e-06, "loss": 0.0723, "step": 1311 }, { "epoch": 0.5969062784349408, "grad_norm": 1.2697687714050636, "learning_rate": 4.826487454898857e-06, "loss": 0.1022, "step": 1312 }, { "epoch": 0.597361237488626, "grad_norm": 0.6502381859861477, "learning_rate": 4.826225762575136e-06, "loss": 0.0566, "step": 1313 }, { "epoch": 0.5978161965423112, "grad_norm": 0.6784651371796548, "learning_rate": 4.825963880162752e-06, "loss": 0.0569, "step": 1314 }, { "epoch": 0.5982711555959963, "grad_norm": 0.9827857531768842, "learning_rate": 4.825701807683102e-06, "loss": 0.0709, "step": 1315 }, { "epoch": 0.5987261146496815, "grad_norm": 0.8148040815518555, "learning_rate": 4.825439545157603e-06, "loss": 0.0661, "step": 1316 }, { "epoch": 0.5991810737033667, "grad_norm": 0.7818624869971815, "learning_rate": 4.825177092607687e-06, "loss": 0.0756, "step": 1317 }, { "epoch": 0.5996360327570519, "grad_norm": 0.6526378977883536, "learning_rate": 4.8249144500547995e-06, "loss": 0.0549, "step": 1318 }, { "epoch": 0.600090991810737, "grad_norm": 0.5697295150295824, "learning_rate": 4.824651617520402e-06, "loss": 0.0393, "step": 1319 }, { "epoch": 0.6005459508644222, "grad_norm": 0.7421021671142831, "learning_rate": 4.824388595025972e-06, "loss": 0.0789, "step": 1320 }, { "epoch": 0.6010009099181074, "grad_norm": 0.7191904617460073, "learning_rate": 4.824125382593003e-06, "loss": 0.0532, "step": 1321 }, { "epoch": 0.6014558689717925, "grad_norm": 0.7309054499990442, "learning_rate": 4.823861980243003e-06, "loss": 0.0748, "step": 1322 }, { "epoch": 0.6019108280254777, "grad_norm": 0.8448893024828844, "learning_rate": 4.823598387997497e-06, "loss": 0.0667, "step": 1323 }, { "epoch": 0.6023657870791629, "grad_norm": 0.7601489588572167, "learning_rate": 4.823334605878024e-06, "loss": 0.0523, "step": 1324 }, { "epoch": 0.6028207461328481, "grad_norm": 0.7433654268959281, "learning_rate": 4.82307063390614e-06, "loss": 0.0553, "step": 1325 }, { "epoch": 0.6032757051865332, "grad_norm": 0.8187296751030086, "learning_rate": 4.822806472103413e-06, "loss": 0.0676, "step": 1326 }, { "epoch": 0.6037306642402184, "grad_norm": 0.6394929202903299, "learning_rate": 4.822542120491431e-06, "loss": 0.0577, "step": 1327 }, { "epoch": 0.6041856232939036, "grad_norm": 0.4810746844764873, "learning_rate": 4.822277579091796e-06, "loss": 0.0548, "step": 1328 }, { "epoch": 0.6046405823475887, "grad_norm": 0.6400955174315186, "learning_rate": 4.822012847926125e-06, "loss": 0.0527, "step": 1329 }, { "epoch": 0.6050955414012739, "grad_norm": 0.6867529527775732, "learning_rate": 4.821747927016049e-06, "loss": 0.0434, "step": 1330 }, { "epoch": 0.6055505004549591, "grad_norm": 0.8517417056997812, "learning_rate": 4.821482816383219e-06, "loss": 0.0785, "step": 1331 }, { "epoch": 0.6060054595086443, "grad_norm": 0.5351287181203948, "learning_rate": 4.821217516049296e-06, "loss": 0.0451, "step": 1332 }, { "epoch": 0.6064604185623294, "grad_norm": 0.7138436850600612, "learning_rate": 4.82095202603596e-06, "loss": 0.0636, "step": 1333 }, { "epoch": 0.6069153776160146, "grad_norm": 0.7109233850284291, "learning_rate": 4.820686346364906e-06, "loss": 0.0563, "step": 1334 }, { "epoch": 0.6073703366696998, "grad_norm": 0.9928633837693652, "learning_rate": 4.820420477057843e-06, "loss": 0.073, "step": 1335 }, { "epoch": 0.607825295723385, "grad_norm": 0.8108842754609783, "learning_rate": 4.820154418136498e-06, "loss": 0.0732, "step": 1336 }, { "epoch": 0.60828025477707, "grad_norm": 0.9409136888664106, "learning_rate": 4.819888169622612e-06, "loss": 0.0746, "step": 1337 }, { "epoch": 0.6087352138307552, "grad_norm": 0.8704561721777555, "learning_rate": 4.819621731537942e-06, "loss": 0.0863, "step": 1338 }, { "epoch": 0.6091901728844404, "grad_norm": 0.679765432028962, "learning_rate": 4.819355103904259e-06, "loss": 0.0522, "step": 1339 }, { "epoch": 0.6096451319381255, "grad_norm": 0.7506288496766044, "learning_rate": 4.81908828674335e-06, "loss": 0.0581, "step": 1340 }, { "epoch": 0.6101000909918107, "grad_norm": 0.7533146973996597, "learning_rate": 4.81882128007702e-06, "loss": 0.0508, "step": 1341 }, { "epoch": 0.6105550500454959, "grad_norm": 0.7623699590970283, "learning_rate": 4.818554083927086e-06, "loss": 0.0602, "step": 1342 }, { "epoch": 0.6110100090991811, "grad_norm": 0.8511179695780368, "learning_rate": 4.818286698315383e-06, "loss": 0.0577, "step": 1343 }, { "epoch": 0.6114649681528662, "grad_norm": 0.7862894129476269, "learning_rate": 4.818019123263761e-06, "loss": 0.0817, "step": 1344 }, { "epoch": 0.6119199272065514, "grad_norm": 0.6793771839239909, "learning_rate": 4.817751358794084e-06, "loss": 0.0516, "step": 1345 }, { "epoch": 0.6123748862602366, "grad_norm": 0.6744967187177401, "learning_rate": 4.8174834049282325e-06, "loss": 0.06, "step": 1346 }, { "epoch": 0.6128298453139217, "grad_norm": 1.0595508979402892, "learning_rate": 4.817215261688104e-06, "loss": 0.0928, "step": 1347 }, { "epoch": 0.6132848043676069, "grad_norm": 0.7276826984658654, "learning_rate": 4.816946929095607e-06, "loss": 0.0502, "step": 1348 }, { "epoch": 0.6137397634212921, "grad_norm": 0.741447673760934, "learning_rate": 4.816678407172671e-06, "loss": 0.0741, "step": 1349 }, { "epoch": 0.6141947224749773, "grad_norm": 0.7665525289277765, "learning_rate": 4.816409695941238e-06, "loss": 0.0586, "step": 1350 }, { "epoch": 0.6146496815286624, "grad_norm": 0.7571976993587441, "learning_rate": 4.816140795423265e-06, "loss": 0.0646, "step": 1351 }, { "epoch": 0.6151046405823476, "grad_norm": 1.0671078250910566, "learning_rate": 4.8158717056407255e-06, "loss": 0.0906, "step": 1352 }, { "epoch": 0.6155595996360328, "grad_norm": 1.068257180900936, "learning_rate": 4.815602426615609e-06, "loss": 0.0814, "step": 1353 }, { "epoch": 0.6160145586897179, "grad_norm": 0.7704299563830304, "learning_rate": 4.815332958369919e-06, "loss": 0.0628, "step": 1354 }, { "epoch": 0.6164695177434031, "grad_norm": 0.7309675809198951, "learning_rate": 4.815063300925677e-06, "loss": 0.0534, "step": 1355 }, { "epoch": 0.6169244767970883, "grad_norm": 0.6905459067357435, "learning_rate": 4.814793454304915e-06, "loss": 0.0664, "step": 1356 }, { "epoch": 0.6173794358507735, "grad_norm": 0.7612784977520042, "learning_rate": 4.814523418529686e-06, "loss": 0.071, "step": 1357 }, { "epoch": 0.6178343949044586, "grad_norm": 0.7397558513678282, "learning_rate": 4.814253193622056e-06, "loss": 0.0658, "step": 1358 }, { "epoch": 0.6182893539581438, "grad_norm": 0.8273217031416162, "learning_rate": 4.813982779604106e-06, "loss": 0.0542, "step": 1359 }, { "epoch": 0.618744313011829, "grad_norm": 0.7097594863375644, "learning_rate": 4.813712176497933e-06, "loss": 0.0695, "step": 1360 }, { "epoch": 0.6191992720655141, "grad_norm": 0.9081905345796648, "learning_rate": 4.813441384325649e-06, "loss": 0.0742, "step": 1361 }, { "epoch": 0.6196542311191993, "grad_norm": 0.6161179936509155, "learning_rate": 4.813170403109383e-06, "loss": 0.0435, "step": 1362 }, { "epoch": 0.6201091901728845, "grad_norm": 0.6587599265658766, "learning_rate": 4.8128992328712774e-06, "loss": 0.0511, "step": 1363 }, { "epoch": 0.6205641492265697, "grad_norm": 0.6246519005543884, "learning_rate": 4.812627873633492e-06, "loss": 0.0547, "step": 1364 }, { "epoch": 0.6210191082802548, "grad_norm": 0.9162916767800175, "learning_rate": 4.8123563254182e-06, "loss": 0.0909, "step": 1365 }, { "epoch": 0.62147406733394, "grad_norm": 0.9475342021978096, "learning_rate": 4.8120845882475924e-06, "loss": 0.0834, "step": 1366 }, { "epoch": 0.6219290263876252, "grad_norm": 0.9962945342835489, "learning_rate": 4.8118126621438734e-06, "loss": 0.082, "step": 1367 }, { "epoch": 0.6223839854413102, "grad_norm": 0.8129731557585484, "learning_rate": 4.811540547129263e-06, "loss": 0.102, "step": 1368 }, { "epoch": 0.6228389444949954, "grad_norm": 1.0476685985771956, "learning_rate": 4.811268243225999e-06, "loss": 0.0863, "step": 1369 }, { "epoch": 0.6232939035486806, "grad_norm": 0.6364270484543224, "learning_rate": 4.810995750456331e-06, "loss": 0.049, "step": 1370 }, { "epoch": 0.6237488626023658, "grad_norm": 0.9605165651320201, "learning_rate": 4.810723068842526e-06, "loss": 0.0907, "step": 1371 }, { "epoch": 0.6242038216560509, "grad_norm": 0.907972626235469, "learning_rate": 4.810450198406867e-06, "loss": 0.089, "step": 1372 }, { "epoch": 0.6246587807097361, "grad_norm": 1.2105959950909937, "learning_rate": 4.810177139171653e-06, "loss": 0.0997, "step": 1373 }, { "epoch": 0.6251137397634213, "grad_norm": 0.5261266936372415, "learning_rate": 4.809903891159195e-06, "loss": 0.0369, "step": 1374 }, { "epoch": 0.6255686988171064, "grad_norm": 1.0914928147545504, "learning_rate": 4.809630454391822e-06, "loss": 0.0763, "step": 1375 }, { "epoch": 0.6260236578707916, "grad_norm": 0.9315193606392632, "learning_rate": 4.80935682889188e-06, "loss": 0.0994, "step": 1376 }, { "epoch": 0.6264786169244768, "grad_norm": 0.8071975479211501, "learning_rate": 4.809083014681726e-06, "loss": 0.0754, "step": 1377 }, { "epoch": 0.626933575978162, "grad_norm": 0.8407873246258533, "learning_rate": 4.808809011783735e-06, "loss": 0.0862, "step": 1378 }, { "epoch": 0.6273885350318471, "grad_norm": 0.7028834708201565, "learning_rate": 4.808534820220299e-06, "loss": 0.0557, "step": 1379 }, { "epoch": 0.6278434940855323, "grad_norm": 0.9130142462887187, "learning_rate": 4.8082604400138226e-06, "loss": 0.0907, "step": 1380 }, { "epoch": 0.6282984531392175, "grad_norm": 0.5572902974057224, "learning_rate": 4.807985871186726e-06, "loss": 0.0538, "step": 1381 }, { "epoch": 0.6287534121929026, "grad_norm": 0.9359956622314829, "learning_rate": 4.8077111137614484e-06, "loss": 0.0761, "step": 1382 }, { "epoch": 0.6292083712465878, "grad_norm": 0.9259969123573535, "learning_rate": 4.8074361677604394e-06, "loss": 0.08, "step": 1383 }, { "epoch": 0.629663330300273, "grad_norm": 0.7515102950917599, "learning_rate": 4.807161033206168e-06, "loss": 0.068, "step": 1384 }, { "epoch": 0.6301182893539582, "grad_norm": 0.8350119106641899, "learning_rate": 4.806885710121114e-06, "loss": 0.0717, "step": 1385 }, { "epoch": 0.6305732484076433, "grad_norm": 0.7425131820751144, "learning_rate": 4.806610198527779e-06, "loss": 0.059, "step": 1386 }, { "epoch": 0.6310282074613285, "grad_norm": 0.8471900633166635, "learning_rate": 4.8063344984486755e-06, "loss": 0.0624, "step": 1387 }, { "epoch": 0.6314831665150137, "grad_norm": 0.8231045305624575, "learning_rate": 4.806058609906331e-06, "loss": 0.0708, "step": 1388 }, { "epoch": 0.6319381255686988, "grad_norm": 1.0923257733711043, "learning_rate": 4.805782532923292e-06, "loss": 0.088, "step": 1389 }, { "epoch": 0.632393084622384, "grad_norm": 0.8065424294249984, "learning_rate": 4.805506267522116e-06, "loss": 0.0817, "step": 1390 }, { "epoch": 0.6328480436760692, "grad_norm": 1.1107266551952906, "learning_rate": 4.80522981372538e-06, "loss": 0.0917, "step": 1391 }, { "epoch": 0.6333030027297544, "grad_norm": 0.5047747692042878, "learning_rate": 4.804953171555674e-06, "loss": 0.046, "step": 1392 }, { "epoch": 0.6337579617834395, "grad_norm": 1.1009908125405006, "learning_rate": 4.8046763410356046e-06, "loss": 0.0721, "step": 1393 }, { "epoch": 0.6342129208371247, "grad_norm": 1.1234718918773754, "learning_rate": 4.804399322187791e-06, "loss": 0.1011, "step": 1394 }, { "epoch": 0.6346678798908099, "grad_norm": 1.083495863144811, "learning_rate": 4.8041221150348725e-06, "loss": 0.0993, "step": 1395 }, { "epoch": 0.635122838944495, "grad_norm": 0.824505933705283, "learning_rate": 4.8038447195995e-06, "loss": 0.0714, "step": 1396 }, { "epoch": 0.6355777979981801, "grad_norm": 0.8879521653149162, "learning_rate": 4.80356713590434e-06, "loss": 0.0709, "step": 1397 }, { "epoch": 0.6360327570518653, "grad_norm": 1.0230090809736052, "learning_rate": 4.803289363972078e-06, "loss": 0.0902, "step": 1398 }, { "epoch": 0.6364877161055505, "grad_norm": 0.6519234189862375, "learning_rate": 4.8030114038254094e-06, "loss": 0.0522, "step": 1399 }, { "epoch": 0.6369426751592356, "grad_norm": 0.741318365992446, "learning_rate": 4.80273325548705e-06, "loss": 0.0611, "step": 1400 }, { "epoch": 0.6373976342129208, "grad_norm": 0.7527321897876023, "learning_rate": 4.802454918979728e-06, "loss": 0.0606, "step": 1401 }, { "epoch": 0.637852593266606, "grad_norm": 1.1417182907652552, "learning_rate": 4.802176394326187e-06, "loss": 0.1069, "step": 1402 }, { "epoch": 0.6383075523202911, "grad_norm": 0.8131687655992657, "learning_rate": 4.801897681549188e-06, "loss": 0.0464, "step": 1403 }, { "epoch": 0.6387625113739763, "grad_norm": 0.8327902884834529, "learning_rate": 4.801618780671506e-06, "loss": 0.0747, "step": 1404 }, { "epoch": 0.6392174704276615, "grad_norm": 0.692422710517289, "learning_rate": 4.801339691715932e-06, "loss": 0.0699, "step": 1405 }, { "epoch": 0.6396724294813467, "grad_norm": 0.6800001240174697, "learning_rate": 4.8010604147052695e-06, "loss": 0.0503, "step": 1406 }, { "epoch": 0.6401273885350318, "grad_norm": 0.8019856852977274, "learning_rate": 4.800780949662343e-06, "loss": 0.0698, "step": 1407 }, { "epoch": 0.640582347588717, "grad_norm": 0.6564335023756012, "learning_rate": 4.800501296609986e-06, "loss": 0.0501, "step": 1408 }, { "epoch": 0.6410373066424022, "grad_norm": 0.855136459668507, "learning_rate": 4.800221455571053e-06, "loss": 0.0777, "step": 1409 }, { "epoch": 0.6414922656960873, "grad_norm": 0.6154657454528484, "learning_rate": 4.7999414265684105e-06, "loss": 0.0527, "step": 1410 }, { "epoch": 0.6419472247497725, "grad_norm": 0.7989007320367253, "learning_rate": 4.79966120962494e-06, "loss": 0.0734, "step": 1411 }, { "epoch": 0.6424021838034577, "grad_norm": 0.7788506951611326, "learning_rate": 4.799380804763542e-06, "loss": 0.0634, "step": 1412 }, { "epoch": 0.6428571428571429, "grad_norm": 0.8023595788447846, "learning_rate": 4.799100212007128e-06, "loss": 0.0635, "step": 1413 }, { "epoch": 0.643312101910828, "grad_norm": 0.6671118062408689, "learning_rate": 4.7988194313786275e-06, "loss": 0.0502, "step": 1414 }, { "epoch": 0.6437670609645132, "grad_norm": 0.4852500706051457, "learning_rate": 4.798538462900984e-06, "loss": 0.0439, "step": 1415 }, { "epoch": 0.6442220200181984, "grad_norm": 0.6172531415308445, "learning_rate": 4.798257306597157e-06, "loss": 0.0512, "step": 1416 }, { "epoch": 0.6446769790718835, "grad_norm": 0.7721959959019802, "learning_rate": 4.797975962490122e-06, "loss": 0.071, "step": 1417 }, { "epoch": 0.6451319381255687, "grad_norm": 0.5314044706595764, "learning_rate": 4.797694430602869e-06, "loss": 0.0348, "step": 1418 }, { "epoch": 0.6455868971792539, "grad_norm": 0.9359827868935178, "learning_rate": 4.797412710958405e-06, "loss": 0.0813, "step": 1419 }, { "epoch": 0.6460418562329391, "grad_norm": 0.895180226763773, "learning_rate": 4.797130803579747e-06, "loss": 0.0725, "step": 1420 }, { "epoch": 0.6464968152866242, "grad_norm": 0.9382178957271444, "learning_rate": 4.796848708489935e-06, "loss": 0.0876, "step": 1421 }, { "epoch": 0.6469517743403094, "grad_norm": 0.8047561179790783, "learning_rate": 4.796566425712018e-06, "loss": 0.0791, "step": 1422 }, { "epoch": 0.6474067333939946, "grad_norm": 0.7813970242588332, "learning_rate": 4.796283955269065e-06, "loss": 0.0868, "step": 1423 }, { "epoch": 0.6478616924476797, "grad_norm": 0.9241460431035805, "learning_rate": 4.796001297184156e-06, "loss": 0.0905, "step": 1424 }, { "epoch": 0.6483166515013649, "grad_norm": 0.9826565480521312, "learning_rate": 4.79571845148039e-06, "loss": 0.0941, "step": 1425 }, { "epoch": 0.6487716105550501, "grad_norm": 0.5534927969311005, "learning_rate": 4.795435418180879e-06, "loss": 0.0579, "step": 1426 }, { "epoch": 0.6492265696087353, "grad_norm": 0.8672553303494054, "learning_rate": 4.795152197308753e-06, "loss": 0.0712, "step": 1427 }, { "epoch": 0.6496815286624203, "grad_norm": 0.49524738598652407, "learning_rate": 4.794868788887154e-06, "loss": 0.0379, "step": 1428 }, { "epoch": 0.6501364877161055, "grad_norm": 0.827670164958526, "learning_rate": 4.79458519293924e-06, "loss": 0.0882, "step": 1429 }, { "epoch": 0.6505914467697907, "grad_norm": 0.5992195124496454, "learning_rate": 4.794301409488187e-06, "loss": 0.0483, "step": 1430 }, { "epoch": 0.6510464058234758, "grad_norm": 0.7192899332508552, "learning_rate": 4.7940174385571835e-06, "loss": 0.0595, "step": 1431 }, { "epoch": 0.651501364877161, "grad_norm": 0.6956421052259842, "learning_rate": 4.793733280169435e-06, "loss": 0.0706, "step": 1432 }, { "epoch": 0.6519563239308462, "grad_norm": 1.165394616398272, "learning_rate": 4.7934489343481614e-06, "loss": 0.0993, "step": 1433 }, { "epoch": 0.6524112829845314, "grad_norm": 0.9487576511550925, "learning_rate": 4.7931644011165975e-06, "loss": 0.0668, "step": 1434 }, { "epoch": 0.6528662420382165, "grad_norm": 0.6703994528089227, "learning_rate": 4.792879680497995e-06, "loss": 0.0579, "step": 1435 }, { "epoch": 0.6533212010919017, "grad_norm": 0.8121521545363791, "learning_rate": 4.79259477251562e-06, "loss": 0.071, "step": 1436 }, { "epoch": 0.6537761601455869, "grad_norm": 1.0536792067264262, "learning_rate": 4.792309677192753e-06, "loss": 0.0987, "step": 1437 }, { "epoch": 0.654231119199272, "grad_norm": 0.7922763227676367, "learning_rate": 4.79202439455269e-06, "loss": 0.0608, "step": 1438 }, { "epoch": 0.6546860782529572, "grad_norm": 0.6328855607330163, "learning_rate": 4.791738924618745e-06, "loss": 0.0576, "step": 1439 }, { "epoch": 0.6551410373066424, "grad_norm": 0.7130766129638374, "learning_rate": 4.791453267414245e-06, "loss": 0.0474, "step": 1440 }, { "epoch": 0.6555959963603276, "grad_norm": 0.7668931671367808, "learning_rate": 4.7911674229625316e-06, "loss": 0.0608, "step": 1441 }, { "epoch": 0.6560509554140127, "grad_norm": 1.026785066290622, "learning_rate": 4.790881391286963e-06, "loss": 0.0784, "step": 1442 }, { "epoch": 0.6565059144676979, "grad_norm": 0.8437938287309505, "learning_rate": 4.790595172410914e-06, "loss": 0.0688, "step": 1443 }, { "epoch": 0.6569608735213831, "grad_norm": 0.95620015056413, "learning_rate": 4.79030876635777e-06, "loss": 0.0866, "step": 1444 }, { "epoch": 0.6574158325750682, "grad_norm": 0.7126304636743447, "learning_rate": 4.790022173150938e-06, "loss": 0.0633, "step": 1445 }, { "epoch": 0.6578707916287534, "grad_norm": 0.5125812984853052, "learning_rate": 4.789735392813835e-06, "loss": 0.0423, "step": 1446 }, { "epoch": 0.6583257506824386, "grad_norm": 0.7255097967015932, "learning_rate": 4.789448425369896e-06, "loss": 0.0591, "step": 1447 }, { "epoch": 0.6587807097361238, "grad_norm": 0.7245916935945349, "learning_rate": 4.789161270842571e-06, "loss": 0.0617, "step": 1448 }, { "epoch": 0.6592356687898089, "grad_norm": 0.8534895421081942, "learning_rate": 4.7888739292553235e-06, "loss": 0.0824, "step": 1449 }, { "epoch": 0.6596906278434941, "grad_norm": 0.9129297819396048, "learning_rate": 4.788586400631636e-06, "loss": 0.0864, "step": 1450 }, { "epoch": 0.6601455868971793, "grad_norm": 0.6321747250216057, "learning_rate": 4.788298684995003e-06, "loss": 0.0606, "step": 1451 }, { "epoch": 0.6606005459508644, "grad_norm": 0.6569500610517134, "learning_rate": 4.7880107823689355e-06, "loss": 0.0471, "step": 1452 }, { "epoch": 0.6610555050045496, "grad_norm": 0.836923575196295, "learning_rate": 4.787722692776958e-06, "loss": 0.0806, "step": 1453 }, { "epoch": 0.6615104640582348, "grad_norm": 0.8618715247200026, "learning_rate": 4.787434416242615e-06, "loss": 0.0796, "step": 1454 }, { "epoch": 0.66196542311192, "grad_norm": 1.2913844032281525, "learning_rate": 4.787145952789461e-06, "loss": 0.1144, "step": 1455 }, { "epoch": 0.6624203821656051, "grad_norm": 0.7740924771134702, "learning_rate": 4.786857302441069e-06, "loss": 0.0501, "step": 1456 }, { "epoch": 0.6628753412192903, "grad_norm": 0.8850884382043015, "learning_rate": 4.786568465221025e-06, "loss": 0.0776, "step": 1457 }, { "epoch": 0.6633303002729755, "grad_norm": 0.8065764669411247, "learning_rate": 4.7862794411529315e-06, "loss": 0.0714, "step": 1458 }, { "epoch": 0.6637852593266605, "grad_norm": 1.0967948038936701, "learning_rate": 4.7859902302604075e-06, "loss": 0.0996, "step": 1459 }, { "epoch": 0.6642402183803457, "grad_norm": 0.9974285175173262, "learning_rate": 4.785700832567085e-06, "loss": 0.0776, "step": 1460 }, { "epoch": 0.664695177434031, "grad_norm": 0.6236115703442758, "learning_rate": 4.785411248096613e-06, "loss": 0.0476, "step": 1461 }, { "epoch": 0.6651501364877161, "grad_norm": 0.5778143724368887, "learning_rate": 4.785121476872654e-06, "loss": 0.0623, "step": 1462 }, { "epoch": 0.6656050955414012, "grad_norm": 0.8331029721226916, "learning_rate": 4.784831518918888e-06, "loss": 0.086, "step": 1463 }, { "epoch": 0.6660600545950864, "grad_norm": 0.6555727601304058, "learning_rate": 4.784541374259008e-06, "loss": 0.0604, "step": 1464 }, { "epoch": 0.6665150136487716, "grad_norm": 0.840422831333416, "learning_rate": 4.7842510429167244e-06, "loss": 0.0705, "step": 1465 }, { "epoch": 0.6669699727024567, "grad_norm": 0.8495238637146105, "learning_rate": 4.783960524915761e-06, "loss": 0.0795, "step": 1466 }, { "epoch": 0.6674249317561419, "grad_norm": 0.7305356040505075, "learning_rate": 4.783669820279858e-06, "loss": 0.0632, "step": 1467 }, { "epoch": 0.6678798908098271, "grad_norm": 0.6350015765617174, "learning_rate": 4.783378929032769e-06, "loss": 0.0537, "step": 1468 }, { "epoch": 0.6683348498635123, "grad_norm": 0.7555553607544558, "learning_rate": 4.783087851198267e-06, "loss": 0.0664, "step": 1469 }, { "epoch": 0.6687898089171974, "grad_norm": 0.9513776608510918, "learning_rate": 4.7827965868001356e-06, "loss": 0.0797, "step": 1470 }, { "epoch": 0.6692447679708826, "grad_norm": 0.7540264419548088, "learning_rate": 4.782505135862176e-06, "loss": 0.0645, "step": 1471 }, { "epoch": 0.6696997270245678, "grad_norm": 0.8806981016853459, "learning_rate": 4.782213498408205e-06, "loss": 0.0792, "step": 1472 }, { "epoch": 0.6701546860782529, "grad_norm": 0.816738183429135, "learning_rate": 4.781921674462053e-06, "loss": 0.0567, "step": 1473 }, { "epoch": 0.6706096451319381, "grad_norm": 0.9563479172789231, "learning_rate": 4.781629664047566e-06, "loss": 0.0726, "step": 1474 }, { "epoch": 0.6710646041856233, "grad_norm": 0.6676816189321566, "learning_rate": 4.781337467188607e-06, "loss": 0.0674, "step": 1475 }, { "epoch": 0.6715195632393085, "grad_norm": 0.8153655901587608, "learning_rate": 4.781045083909053e-06, "loss": 0.0708, "step": 1476 }, { "epoch": 0.6719745222929936, "grad_norm": 0.9702425819685979, "learning_rate": 4.780752514232796e-06, "loss": 0.066, "step": 1477 }, { "epoch": 0.6724294813466788, "grad_norm": 0.6736063833667711, "learning_rate": 4.780459758183743e-06, "loss": 0.0594, "step": 1478 }, { "epoch": 0.672884440400364, "grad_norm": 0.6356551069651334, "learning_rate": 4.780166815785817e-06, "loss": 0.0572, "step": 1479 }, { "epoch": 0.6733393994540491, "grad_norm": 0.7785893078479745, "learning_rate": 4.7798736870629554e-06, "loss": 0.0826, "step": 1480 }, { "epoch": 0.6737943585077343, "grad_norm": 0.7319247864552544, "learning_rate": 4.779580372039113e-06, "loss": 0.0536, "step": 1481 }, { "epoch": 0.6742493175614195, "grad_norm": 0.9576832944027126, "learning_rate": 4.779286870738256e-06, "loss": 0.0689, "step": 1482 }, { "epoch": 0.6747042766151047, "grad_norm": 0.8166538023451795, "learning_rate": 4.778993183184371e-06, "loss": 0.0556, "step": 1483 }, { "epoch": 0.6751592356687898, "grad_norm": 1.1745759297821086, "learning_rate": 4.778699309401453e-06, "loss": 0.0897, "step": 1484 }, { "epoch": 0.675614194722475, "grad_norm": 0.7555435279337044, "learning_rate": 4.7784052494135195e-06, "loss": 0.0653, "step": 1485 }, { "epoch": 0.6760691537761602, "grad_norm": 0.664770902902504, "learning_rate": 4.778111003244596e-06, "loss": 0.0683, "step": 1486 }, { "epoch": 0.6765241128298453, "grad_norm": 0.5809114716709589, "learning_rate": 4.777816570918731e-06, "loss": 0.05, "step": 1487 }, { "epoch": 0.6769790718835305, "grad_norm": 0.6928304723812573, "learning_rate": 4.777521952459982e-06, "loss": 0.064, "step": 1488 }, { "epoch": 0.6774340309372157, "grad_norm": 0.8604944031525139, "learning_rate": 4.777227147892424e-06, "loss": 0.067, "step": 1489 }, { "epoch": 0.6778889899909009, "grad_norm": 0.752451889272893, "learning_rate": 4.776932157240147e-06, "loss": 0.065, "step": 1490 }, { "epoch": 0.678343949044586, "grad_norm": 0.8473681581598411, "learning_rate": 4.776636980527257e-06, "loss": 0.0623, "step": 1491 }, { "epoch": 0.6787989080982711, "grad_norm": 0.9703686393076305, "learning_rate": 4.776341617777874e-06, "loss": 0.0686, "step": 1492 }, { "epoch": 0.6792538671519563, "grad_norm": 0.811693083005839, "learning_rate": 4.776046069016135e-06, "loss": 0.0672, "step": 1493 }, { "epoch": 0.6797088262056415, "grad_norm": 1.0089795417423277, "learning_rate": 4.775750334266188e-06, "loss": 0.0867, "step": 1494 }, { "epoch": 0.6801637852593266, "grad_norm": 0.6348129639773868, "learning_rate": 4.775454413552202e-06, "loss": 0.0478, "step": 1495 }, { "epoch": 0.6806187443130118, "grad_norm": 1.8046001058548395, "learning_rate": 4.775158306898358e-06, "loss": 0.0856, "step": 1496 }, { "epoch": 0.681073703366697, "grad_norm": 0.804030519135084, "learning_rate": 4.774862014328849e-06, "loss": 0.0682, "step": 1497 }, { "epoch": 0.6815286624203821, "grad_norm": 0.7475852207063984, "learning_rate": 4.774565535867892e-06, "loss": 0.0621, "step": 1498 }, { "epoch": 0.6819836214740673, "grad_norm": 1.036971872978779, "learning_rate": 4.77426887153971e-06, "loss": 0.0772, "step": 1499 }, { "epoch": 0.6824385805277525, "grad_norm": 0.8877829921629609, "learning_rate": 4.773972021368546e-06, "loss": 0.0792, "step": 1500 }, { "epoch": 0.6828935395814377, "grad_norm": 0.9837757799688718, "learning_rate": 4.773674985378658e-06, "loss": 0.1229, "step": 1501 }, { "epoch": 0.6833484986351228, "grad_norm": 0.7772757895891362, "learning_rate": 4.773377763594319e-06, "loss": 0.0472, "step": 1502 }, { "epoch": 0.683803457688808, "grad_norm": 0.8631194323034224, "learning_rate": 4.773080356039814e-06, "loss": 0.0645, "step": 1503 }, { "epoch": 0.6842584167424932, "grad_norm": 0.6366545290180244, "learning_rate": 4.772782762739448e-06, "loss": 0.0595, "step": 1504 }, { "epoch": 0.6847133757961783, "grad_norm": 0.7166260637548661, "learning_rate": 4.772484983717539e-06, "loss": 0.0623, "step": 1505 }, { "epoch": 0.6851683348498635, "grad_norm": 0.8757638645378785, "learning_rate": 4.77218701899842e-06, "loss": 0.0655, "step": 1506 }, { "epoch": 0.6856232939035487, "grad_norm": 1.0305728337056401, "learning_rate": 4.771888868606438e-06, "loss": 0.098, "step": 1507 }, { "epoch": 0.6860782529572339, "grad_norm": 0.5264608517167783, "learning_rate": 4.771590532565957e-06, "loss": 0.0384, "step": 1508 }, { "epoch": 0.686533212010919, "grad_norm": 0.7225210635255812, "learning_rate": 4.771292010901357e-06, "loss": 0.0529, "step": 1509 }, { "epoch": 0.6869881710646042, "grad_norm": 0.9811455221159325, "learning_rate": 4.77099330363703e-06, "loss": 0.0734, "step": 1510 }, { "epoch": 0.6874431301182894, "grad_norm": 0.590972914047016, "learning_rate": 4.770694410797387e-06, "loss": 0.0552, "step": 1511 }, { "epoch": 0.6878980891719745, "grad_norm": 0.8133978032493828, "learning_rate": 4.770395332406851e-06, "loss": 0.061, "step": 1512 }, { "epoch": 0.6883530482256597, "grad_norm": 0.8194249817407185, "learning_rate": 4.770096068489861e-06, "loss": 0.0741, "step": 1513 }, { "epoch": 0.6888080072793449, "grad_norm": 0.6084096486485657, "learning_rate": 4.769796619070872e-06, "loss": 0.0602, "step": 1514 }, { "epoch": 0.6892629663330301, "grad_norm": 0.5651191066612926, "learning_rate": 4.769496984174353e-06, "loss": 0.0486, "step": 1515 }, { "epoch": 0.6897179253867152, "grad_norm": 0.9194603499902049, "learning_rate": 4.769197163824791e-06, "loss": 0.0656, "step": 1516 }, { "epoch": 0.6901728844404004, "grad_norm": 0.8500604771260194, "learning_rate": 4.768897158046683e-06, "loss": 0.0653, "step": 1517 }, { "epoch": 0.6906278434940856, "grad_norm": 0.6862315615479446, "learning_rate": 4.768596966864546e-06, "loss": 0.0524, "step": 1518 }, { "epoch": 0.6910828025477707, "grad_norm": 1.2304461047991757, "learning_rate": 4.76829659030291e-06, "loss": 0.1137, "step": 1519 }, { "epoch": 0.6915377616014559, "grad_norm": 0.876128817036191, "learning_rate": 4.767996028386319e-06, "loss": 0.0757, "step": 1520 }, { "epoch": 0.6919927206551411, "grad_norm": 1.07669637523419, "learning_rate": 4.767695281139336e-06, "loss": 0.0679, "step": 1521 }, { "epoch": 0.6924476797088263, "grad_norm": 0.8211862034870426, "learning_rate": 4.767394348586535e-06, "loss": 0.0619, "step": 1522 }, { "epoch": 0.6929026387625113, "grad_norm": 0.9099029471086892, "learning_rate": 4.767093230752507e-06, "loss": 0.0903, "step": 1523 }, { "epoch": 0.6933575978161965, "grad_norm": 0.7799051318100109, "learning_rate": 4.766791927661859e-06, "loss": 0.0766, "step": 1524 }, { "epoch": 0.6938125568698817, "grad_norm": 1.0883988105102491, "learning_rate": 4.766490439339211e-06, "loss": 0.0661, "step": 1525 }, { "epoch": 0.6942675159235668, "grad_norm": 0.8400663441468114, "learning_rate": 4.7661887658092e-06, "loss": 0.0653, "step": 1526 }, { "epoch": 0.694722474977252, "grad_norm": 0.8744033325354778, "learning_rate": 4.765886907096477e-06, "loss": 0.0712, "step": 1527 }, { "epoch": 0.6951774340309372, "grad_norm": 1.0117866298788205, "learning_rate": 4.7655848632257084e-06, "loss": 0.0961, "step": 1528 }, { "epoch": 0.6956323930846224, "grad_norm": 0.8451483006341224, "learning_rate": 4.7652826342215764e-06, "loss": 0.0805, "step": 1529 }, { "epoch": 0.6960873521383075, "grad_norm": 0.9721200931533607, "learning_rate": 4.764980220108777e-06, "loss": 0.0868, "step": 1530 }, { "epoch": 0.6965423111919927, "grad_norm": 1.139955334493506, "learning_rate": 4.764677620912022e-06, "loss": 0.0922, "step": 1531 }, { "epoch": 0.6969972702456779, "grad_norm": 0.6557667551942458, "learning_rate": 4.764374836656041e-06, "loss": 0.061, "step": 1532 }, { "epoch": 0.697452229299363, "grad_norm": 0.6235355321610729, "learning_rate": 4.764071867365571e-06, "loss": 0.0717, "step": 1533 }, { "epoch": 0.6979071883530482, "grad_norm": 0.8241396053903132, "learning_rate": 4.763768713065375e-06, "loss": 0.0635, "step": 1534 }, { "epoch": 0.6983621474067334, "grad_norm": 1.1215482357989177, "learning_rate": 4.763465373780223e-06, "loss": 0.0854, "step": 1535 }, { "epoch": 0.6988171064604186, "grad_norm": 0.9398991503235029, "learning_rate": 4.763161849534902e-06, "loss": 0.0708, "step": 1536 }, { "epoch": 0.6992720655141037, "grad_norm": 0.5429847226903595, "learning_rate": 4.762858140354214e-06, "loss": 0.0563, "step": 1537 }, { "epoch": 0.6997270245677889, "grad_norm": 0.7113795106394718, "learning_rate": 4.7625542462629785e-06, "loss": 0.0639, "step": 1538 }, { "epoch": 0.7001819836214741, "grad_norm": 0.4626733648173771, "learning_rate": 4.762250167286027e-06, "loss": 0.0323, "step": 1539 }, { "epoch": 0.7006369426751592, "grad_norm": 0.9326531891804614, "learning_rate": 4.761945903448209e-06, "loss": 0.0901, "step": 1540 }, { "epoch": 0.7010919017288444, "grad_norm": 0.5398486547930679, "learning_rate": 4.761641454774386e-06, "loss": 0.053, "step": 1541 }, { "epoch": 0.7015468607825296, "grad_norm": 0.9874559613432076, "learning_rate": 4.761336821289436e-06, "loss": 0.0966, "step": 1542 }, { "epoch": 0.7020018198362148, "grad_norm": 0.5432433449970621, "learning_rate": 4.761032003018254e-06, "loss": 0.0513, "step": 1543 }, { "epoch": 0.7024567788898999, "grad_norm": 0.622579728480354, "learning_rate": 4.760726999985748e-06, "loss": 0.0441, "step": 1544 }, { "epoch": 0.7029117379435851, "grad_norm": 0.6451217496312431, "learning_rate": 4.7604218122168406e-06, "loss": 0.0552, "step": 1545 }, { "epoch": 0.7033666969972703, "grad_norm": 1.1118626340333584, "learning_rate": 4.760116439736471e-06, "loss": 0.1001, "step": 1546 }, { "epoch": 0.7038216560509554, "grad_norm": 1.1718299505240957, "learning_rate": 4.759810882569591e-06, "loss": 0.1093, "step": 1547 }, { "epoch": 0.7042766151046406, "grad_norm": 0.4549553550039402, "learning_rate": 4.759505140741172e-06, "loss": 0.037, "step": 1548 }, { "epoch": 0.7047315741583258, "grad_norm": 1.2374534576601486, "learning_rate": 4.759199214276196e-06, "loss": 0.1075, "step": 1549 }, { "epoch": 0.705186533212011, "grad_norm": 0.6890766528984787, "learning_rate": 4.758893103199665e-06, "loss": 0.0704, "step": 1550 }, { "epoch": 0.7056414922656961, "grad_norm": 0.809006377964544, "learning_rate": 4.758586807536588e-06, "loss": 0.0635, "step": 1551 }, { "epoch": 0.7060964513193813, "grad_norm": 0.46816790314940004, "learning_rate": 4.758280327311998e-06, "loss": 0.0396, "step": 1552 }, { "epoch": 0.7065514103730665, "grad_norm": 0.956864928582162, "learning_rate": 4.757973662550938e-06, "loss": 0.0715, "step": 1553 }, { "epoch": 0.7070063694267515, "grad_norm": 0.6528599047999262, "learning_rate": 4.757666813278466e-06, "loss": 0.0525, "step": 1554 }, { "epoch": 0.7074613284804367, "grad_norm": 0.8181545206568527, "learning_rate": 4.757359779519659e-06, "loss": 0.0727, "step": 1555 }, { "epoch": 0.707916287534122, "grad_norm": 0.9844274916847088, "learning_rate": 4.757052561299604e-06, "loss": 0.0991, "step": 1556 }, { "epoch": 0.7083712465878071, "grad_norm": 1.0768378272925192, "learning_rate": 4.756745158643407e-06, "loss": 0.0976, "step": 1557 }, { "epoch": 0.7088262056414922, "grad_norm": 0.8642164719519719, "learning_rate": 4.7564375715761865e-06, "loss": 0.0791, "step": 1558 }, { "epoch": 0.7092811646951774, "grad_norm": 0.433851399002758, "learning_rate": 4.756129800123078e-06, "loss": 0.0264, "step": 1559 }, { "epoch": 0.7097361237488626, "grad_norm": 0.701794046170542, "learning_rate": 4.755821844309232e-06, "loss": 0.078, "step": 1560 }, { "epoch": 0.7101910828025477, "grad_norm": 0.7934468928491815, "learning_rate": 4.75551370415981e-06, "loss": 0.0705, "step": 1561 }, { "epoch": 0.7106460418562329, "grad_norm": 0.8567300928308393, "learning_rate": 4.755205379699996e-06, "loss": 0.0708, "step": 1562 }, { "epoch": 0.7111010009099181, "grad_norm": 0.7631031658693246, "learning_rate": 4.75489687095498e-06, "loss": 0.0696, "step": 1563 }, { "epoch": 0.7115559599636033, "grad_norm": 0.920620931877378, "learning_rate": 4.754588177949977e-06, "loss": 0.0721, "step": 1564 }, { "epoch": 0.7120109190172884, "grad_norm": 0.763031156086878, "learning_rate": 4.7542793007102086e-06, "loss": 0.0519, "step": 1565 }, { "epoch": 0.7124658780709736, "grad_norm": 0.8338043690194923, "learning_rate": 4.7539702392609165e-06, "loss": 0.091, "step": 1566 }, { "epoch": 0.7129208371246588, "grad_norm": 0.5783419362259836, "learning_rate": 4.753660993627356e-06, "loss": 0.0383, "step": 1567 }, { "epoch": 0.7133757961783439, "grad_norm": 1.091175103721431, "learning_rate": 4.753351563834795e-06, "loss": 0.0874, "step": 1568 }, { "epoch": 0.7138307552320291, "grad_norm": 0.9259203812623706, "learning_rate": 4.753041949908521e-06, "loss": 0.0658, "step": 1569 }, { "epoch": 0.7142857142857143, "grad_norm": 0.9365180546757296, "learning_rate": 4.752732151873834e-06, "loss": 0.0692, "step": 1570 }, { "epoch": 0.7147406733393995, "grad_norm": 0.7100718342966104, "learning_rate": 4.752422169756048e-06, "loss": 0.0711, "step": 1571 }, { "epoch": 0.7151956323930846, "grad_norm": 0.5978251540753616, "learning_rate": 4.752112003580495e-06, "loss": 0.0486, "step": 1572 }, { "epoch": 0.7156505914467698, "grad_norm": 0.7341732428573583, "learning_rate": 4.751801653372518e-06, "loss": 0.0462, "step": 1573 }, { "epoch": 0.716105550500455, "grad_norm": 0.7943019606068298, "learning_rate": 4.751491119157481e-06, "loss": 0.0679, "step": 1574 }, { "epoch": 0.7165605095541401, "grad_norm": 0.5499580471761844, "learning_rate": 4.751180400960756e-06, "loss": 0.0469, "step": 1575 }, { "epoch": 0.7170154686078253, "grad_norm": 0.8873520635980867, "learning_rate": 4.7508694988077355e-06, "loss": 0.0804, "step": 1576 }, { "epoch": 0.7174704276615105, "grad_norm": 0.7826027240405181, "learning_rate": 4.750558412723824e-06, "loss": 0.0534, "step": 1577 }, { "epoch": 0.7179253867151957, "grad_norm": 0.5436399916764901, "learning_rate": 4.750247142734442e-06, "loss": 0.0422, "step": 1578 }, { "epoch": 0.7183803457688808, "grad_norm": 0.8976952415351162, "learning_rate": 4.749935688865026e-06, "loss": 0.0946, "step": 1579 }, { "epoch": 0.718835304822566, "grad_norm": 0.5897366750354841, "learning_rate": 4.749624051141026e-06, "loss": 0.0448, "step": 1580 }, { "epoch": 0.7192902638762512, "grad_norm": 0.522008805738841, "learning_rate": 4.7493122295879076e-06, "loss": 0.0479, "step": 1581 }, { "epoch": 0.7197452229299363, "grad_norm": 0.8905616220483812, "learning_rate": 4.7490002242311525e-06, "loss": 0.0769, "step": 1582 }, { "epoch": 0.7202001819836215, "grad_norm": 0.7188643596274509, "learning_rate": 4.748688035096255e-06, "loss": 0.059, "step": 1583 }, { "epoch": 0.7206551410373067, "grad_norm": 0.6538082906296614, "learning_rate": 4.748375662208726e-06, "loss": 0.0421, "step": 1584 }, { "epoch": 0.7211101000909919, "grad_norm": 0.9507382067003013, "learning_rate": 4.748063105594092e-06, "loss": 0.0885, "step": 1585 }, { "epoch": 0.721565059144677, "grad_norm": 0.8751185263070331, "learning_rate": 4.747750365277892e-06, "loss": 0.082, "step": 1586 }, { "epoch": 0.7220200181983621, "grad_norm": 0.7815947443354453, "learning_rate": 4.747437441285684e-06, "loss": 0.0496, "step": 1587 }, { "epoch": 0.7224749772520473, "grad_norm": 0.8056389354729365, "learning_rate": 4.747124333643038e-06, "loss": 0.0758, "step": 1588 }, { "epoch": 0.7229299363057324, "grad_norm": 0.8424702785094896, "learning_rate": 4.746811042375538e-06, "loss": 0.0685, "step": 1589 }, { "epoch": 0.7233848953594176, "grad_norm": 0.7103882690080215, "learning_rate": 4.746497567508787e-06, "loss": 0.0611, "step": 1590 }, { "epoch": 0.7238398544131028, "grad_norm": 0.7217352053119206, "learning_rate": 4.7461839090684e-06, "loss": 0.0749, "step": 1591 }, { "epoch": 0.724294813466788, "grad_norm": 0.820137321844301, "learning_rate": 4.745870067080007e-06, "loss": 0.0624, "step": 1592 }, { "epoch": 0.7247497725204731, "grad_norm": 1.3704268529969041, "learning_rate": 4.7455560415692545e-06, "loss": 0.1186, "step": 1593 }, { "epoch": 0.7252047315741583, "grad_norm": 0.8173046496173154, "learning_rate": 4.745241832561803e-06, "loss": 0.0518, "step": 1594 }, { "epoch": 0.7256596906278435, "grad_norm": 0.6436436457004329, "learning_rate": 4.744927440083329e-06, "loss": 0.0601, "step": 1595 }, { "epoch": 0.7261146496815286, "grad_norm": 0.7006481377697166, "learning_rate": 4.744612864159522e-06, "loss": 0.0564, "step": 1596 }, { "epoch": 0.7265696087352138, "grad_norm": 0.9334346972782172, "learning_rate": 4.7442981048160895e-06, "loss": 0.0923, "step": 1597 }, { "epoch": 0.727024567788899, "grad_norm": 0.7193743397132841, "learning_rate": 4.74398316207875e-06, "loss": 0.0513, "step": 1598 }, { "epoch": 0.7274795268425842, "grad_norm": 1.3623828249001875, "learning_rate": 4.74366803597324e-06, "loss": 0.1304, "step": 1599 }, { "epoch": 0.7279344858962693, "grad_norm": 0.7986369654273386, "learning_rate": 4.743352726525311e-06, "loss": 0.0657, "step": 1600 }, { "epoch": 0.7283894449499545, "grad_norm": 0.6297195371205284, "learning_rate": 4.743037233760728e-06, "loss": 0.0518, "step": 1601 }, { "epoch": 0.7288444040036397, "grad_norm": 0.7797911267746347, "learning_rate": 4.742721557705271e-06, "loss": 0.0576, "step": 1602 }, { "epoch": 0.7292993630573248, "grad_norm": 0.6771322942350662, "learning_rate": 4.7424056983847374e-06, "loss": 0.0721, "step": 1603 }, { "epoch": 0.72975432211101, "grad_norm": 0.9370484628146732, "learning_rate": 4.7420896558249366e-06, "loss": 0.0818, "step": 1604 }, { "epoch": 0.7302092811646952, "grad_norm": 0.49858668464501216, "learning_rate": 4.741773430051694e-06, "loss": 0.0396, "step": 1605 }, { "epoch": 0.7306642402183804, "grad_norm": 0.7992861361600685, "learning_rate": 4.74145702109085e-06, "loss": 0.0681, "step": 1606 }, { "epoch": 0.7311191992720655, "grad_norm": 1.1366113719813769, "learning_rate": 4.741140428968261e-06, "loss": 0.0899, "step": 1607 }, { "epoch": 0.7315741583257507, "grad_norm": 0.4255609619960085, "learning_rate": 4.740823653709797e-06, "loss": 0.0374, "step": 1608 }, { "epoch": 0.7320291173794359, "grad_norm": 0.515074709614876, "learning_rate": 4.740506695341343e-06, "loss": 0.05, "step": 1609 }, { "epoch": 0.732484076433121, "grad_norm": 0.9479583163425525, "learning_rate": 4.740189553888801e-06, "loss": 0.0951, "step": 1610 }, { "epoch": 0.7329390354868062, "grad_norm": 0.6968388029166215, "learning_rate": 4.739872229378085e-06, "loss": 0.0585, "step": 1611 }, { "epoch": 0.7333939945404914, "grad_norm": 0.6907841663601652, "learning_rate": 4.739554721835125e-06, "loss": 0.0516, "step": 1612 }, { "epoch": 0.7338489535941766, "grad_norm": 0.931301306576401, "learning_rate": 4.739237031285867e-06, "loss": 0.0853, "step": 1613 }, { "epoch": 0.7343039126478617, "grad_norm": 0.8746710093849102, "learning_rate": 4.738919157756272e-06, "loss": 0.0726, "step": 1614 }, { "epoch": 0.7347588717015469, "grad_norm": 0.7323772065729443, "learning_rate": 4.738601101272313e-06, "loss": 0.0728, "step": 1615 }, { "epoch": 0.7352138307552321, "grad_norm": 0.6527662262438081, "learning_rate": 4.738282861859983e-06, "loss": 0.0566, "step": 1616 }, { "epoch": 0.7356687898089171, "grad_norm": 0.7588905056806852, "learning_rate": 4.737964439545284e-06, "loss": 0.0654, "step": 1617 }, { "epoch": 0.7361237488626023, "grad_norm": 0.7391620420308275, "learning_rate": 4.737645834354238e-06, "loss": 0.0577, "step": 1618 }, { "epoch": 0.7365787079162875, "grad_norm": 0.6241259905981298, "learning_rate": 4.737327046312879e-06, "loss": 0.0463, "step": 1619 }, { "epoch": 0.7370336669699727, "grad_norm": 0.7979480345626465, "learning_rate": 4.737008075447259e-06, "loss": 0.0594, "step": 1620 }, { "epoch": 0.7374886260236578, "grad_norm": 0.8204459689677482, "learning_rate": 4.73668892178344e-06, "loss": 0.0709, "step": 1621 }, { "epoch": 0.737943585077343, "grad_norm": 0.7262256791849234, "learning_rate": 4.736369585347503e-06, "loss": 0.0684, "step": 1622 }, { "epoch": 0.7383985441310282, "grad_norm": 1.0855588116680628, "learning_rate": 4.736050066165544e-06, "loss": 0.0733, "step": 1623 }, { "epoch": 0.7388535031847133, "grad_norm": 0.9128385411196811, "learning_rate": 4.735730364263671e-06, "loss": 0.0805, "step": 1624 }, { "epoch": 0.7393084622383985, "grad_norm": 0.6804670184644261, "learning_rate": 4.735410479668009e-06, "loss": 0.0625, "step": 1625 }, { "epoch": 0.7397634212920837, "grad_norm": 0.8471626944635055, "learning_rate": 4.735090412404697e-06, "loss": 0.0792, "step": 1626 }, { "epoch": 0.7402183803457689, "grad_norm": 0.740952084115674, "learning_rate": 4.734770162499891e-06, "loss": 0.054, "step": 1627 }, { "epoch": 0.740673339399454, "grad_norm": 0.6979194464789156, "learning_rate": 4.734449729979759e-06, "loss": 0.0516, "step": 1628 }, { "epoch": 0.7411282984531392, "grad_norm": 0.934620178703864, "learning_rate": 4.734129114870486e-06, "loss": 0.0728, "step": 1629 }, { "epoch": 0.7415832575068244, "grad_norm": 0.6776690850388026, "learning_rate": 4.733808317198271e-06, "loss": 0.0396, "step": 1630 }, { "epoch": 0.7420382165605095, "grad_norm": 0.6772926595393153, "learning_rate": 4.733487336989327e-06, "loss": 0.0582, "step": 1631 }, { "epoch": 0.7424931756141947, "grad_norm": 0.8592043857490114, "learning_rate": 4.733166174269886e-06, "loss": 0.073, "step": 1632 }, { "epoch": 0.7429481346678799, "grad_norm": 0.8641626074857739, "learning_rate": 4.732844829066189e-06, "loss": 0.0731, "step": 1633 }, { "epoch": 0.7434030937215651, "grad_norm": 0.9593751010905753, "learning_rate": 4.732523301404497e-06, "loss": 0.0753, "step": 1634 }, { "epoch": 0.7438580527752502, "grad_norm": 1.0669159630512404, "learning_rate": 4.732201591311082e-06, "loss": 0.0941, "step": 1635 }, { "epoch": 0.7443130118289354, "grad_norm": 0.7929283291326471, "learning_rate": 4.731879698812233e-06, "loss": 0.0858, "step": 1636 }, { "epoch": 0.7447679708826206, "grad_norm": 0.673427501132972, "learning_rate": 4.731557623934255e-06, "loss": 0.0513, "step": 1637 }, { "epoch": 0.7452229299363057, "grad_norm": 1.3418525148659195, "learning_rate": 4.7312353667034645e-06, "loss": 0.113, "step": 1638 }, { "epoch": 0.7456778889899909, "grad_norm": 1.2120619666958259, "learning_rate": 4.730912927146197e-06, "loss": 0.0919, "step": 1639 }, { "epoch": 0.7461328480436761, "grad_norm": 0.7616942301743401, "learning_rate": 4.7305903052888e-06, "loss": 0.0623, "step": 1640 }, { "epoch": 0.7465878070973613, "grad_norm": 0.958768278975319, "learning_rate": 4.730267501157636e-06, "loss": 0.071, "step": 1641 }, { "epoch": 0.7470427661510464, "grad_norm": 0.6568421919172921, "learning_rate": 4.729944514779084e-06, "loss": 0.0576, "step": 1642 }, { "epoch": 0.7474977252047316, "grad_norm": 0.6153711066410817, "learning_rate": 4.729621346179536e-06, "loss": 0.0605, "step": 1643 }, { "epoch": 0.7479526842584168, "grad_norm": 0.6856160527068095, "learning_rate": 4.7292979953854e-06, "loss": 0.0577, "step": 1644 }, { "epoch": 0.7484076433121019, "grad_norm": 0.5864170000219955, "learning_rate": 4.7289744624231004e-06, "loss": 0.0429, "step": 1645 }, { "epoch": 0.7488626023657871, "grad_norm": 1.1083470301221403, "learning_rate": 4.728650747319073e-06, "loss": 0.0895, "step": 1646 }, { "epoch": 0.7493175614194723, "grad_norm": 0.9211984823549421, "learning_rate": 4.728326850099771e-06, "loss": 0.0834, "step": 1647 }, { "epoch": 0.7497725204731575, "grad_norm": 0.6665680334269098, "learning_rate": 4.728002770791663e-06, "loss": 0.0641, "step": 1648 }, { "epoch": 0.7502274795268425, "grad_norm": 0.6938216889022656, "learning_rate": 4.727678509421229e-06, "loss": 0.0626, "step": 1649 }, { "epoch": 0.7506824385805277, "grad_norm": 0.7915210763484374, "learning_rate": 4.727354066014968e-06, "loss": 0.0449, "step": 1650 }, { "epoch": 0.7511373976342129, "grad_norm": 0.8713821596875527, "learning_rate": 4.727029440599391e-06, "loss": 0.0664, "step": 1651 }, { "epoch": 0.7515923566878981, "grad_norm": 0.633081271382763, "learning_rate": 4.726704633201025e-06, "loss": 0.0539, "step": 1652 }, { "epoch": 0.7520473157415832, "grad_norm": 0.926880670672549, "learning_rate": 4.726379643846412e-06, "loss": 0.0759, "step": 1653 }, { "epoch": 0.7525022747952684, "grad_norm": 1.1416005489607706, "learning_rate": 4.726054472562109e-06, "loss": 0.0837, "step": 1654 }, { "epoch": 0.7529572338489536, "grad_norm": 0.5402662393046999, "learning_rate": 4.725729119374687e-06, "loss": 0.0453, "step": 1655 }, { "epoch": 0.7534121929026387, "grad_norm": 0.6335686685514863, "learning_rate": 4.725403584310734e-06, "loss": 0.0461, "step": 1656 }, { "epoch": 0.7538671519563239, "grad_norm": 0.8262266594165262, "learning_rate": 4.725077867396849e-06, "loss": 0.0571, "step": 1657 }, { "epoch": 0.7543221110100091, "grad_norm": 0.7284181310556234, "learning_rate": 4.724751968659648e-06, "loss": 0.0776, "step": 1658 }, { "epoch": 0.7547770700636943, "grad_norm": 0.9346158065906657, "learning_rate": 4.724425888125764e-06, "loss": 0.0768, "step": 1659 }, { "epoch": 0.7552320291173794, "grad_norm": 0.7882371644054315, "learning_rate": 4.724099625821842e-06, "loss": 0.0662, "step": 1660 }, { "epoch": 0.7556869881710646, "grad_norm": 0.8622330946401275, "learning_rate": 4.723773181774543e-06, "loss": 0.0739, "step": 1661 }, { "epoch": 0.7561419472247498, "grad_norm": 0.882607769279876, "learning_rate": 4.723446556010542e-06, "loss": 0.0652, "step": 1662 }, { "epoch": 0.7565969062784349, "grad_norm": 0.6272241973275734, "learning_rate": 4.7231197485565275e-06, "loss": 0.0671, "step": 1663 }, { "epoch": 0.7570518653321201, "grad_norm": 0.9143319315222466, "learning_rate": 4.722792759439209e-06, "loss": 0.0836, "step": 1664 }, { "epoch": 0.7575068243858053, "grad_norm": 1.0179914574460616, "learning_rate": 4.722465588685302e-06, "loss": 0.1076, "step": 1665 }, { "epoch": 0.7579617834394905, "grad_norm": 0.5583219749541256, "learning_rate": 4.722138236321545e-06, "loss": 0.0402, "step": 1666 }, { "epoch": 0.7584167424931756, "grad_norm": 0.8435692964339, "learning_rate": 4.721810702374687e-06, "loss": 0.0557, "step": 1667 }, { "epoch": 0.7588717015468608, "grad_norm": 0.6927360095243408, "learning_rate": 4.721482986871491e-06, "loss": 0.0523, "step": 1668 }, { "epoch": 0.759326660600546, "grad_norm": 1.1648579407503177, "learning_rate": 4.721155089838738e-06, "loss": 0.0758, "step": 1669 }, { "epoch": 0.7597816196542311, "grad_norm": 0.7760606996897229, "learning_rate": 4.720827011303222e-06, "loss": 0.059, "step": 1670 }, { "epoch": 0.7602365787079163, "grad_norm": 0.866591093188149, "learning_rate": 4.720498751291751e-06, "loss": 0.0761, "step": 1671 }, { "epoch": 0.7606915377616015, "grad_norm": 1.0996180971293896, "learning_rate": 4.72017030983115e-06, "loss": 0.1103, "step": 1672 }, { "epoch": 0.7611464968152867, "grad_norm": 0.8231436249177936, "learning_rate": 4.7198416869482575e-06, "loss": 0.0713, "step": 1673 }, { "epoch": 0.7616014558689718, "grad_norm": 1.0653488989628077, "learning_rate": 4.719512882669926e-06, "loss": 0.0965, "step": 1674 }, { "epoch": 0.762056414922657, "grad_norm": 0.6519840385405045, "learning_rate": 4.719183897023027e-06, "loss": 0.0478, "step": 1675 }, { "epoch": 0.7625113739763422, "grad_norm": 0.9166137300574493, "learning_rate": 4.718854730034441e-06, "loss": 0.0616, "step": 1676 }, { "epoch": 0.7629663330300273, "grad_norm": 0.678521481275382, "learning_rate": 4.718525381731066e-06, "loss": 0.0562, "step": 1677 }, { "epoch": 0.7634212920837125, "grad_norm": 1.074542168121289, "learning_rate": 4.718195852139816e-06, "loss": 0.0955, "step": 1678 }, { "epoch": 0.7638762511373977, "grad_norm": 1.2860232860764726, "learning_rate": 4.717866141287618e-06, "loss": 0.1276, "step": 1679 }, { "epoch": 0.7643312101910829, "grad_norm": 0.8631797724656796, "learning_rate": 4.717536249201416e-06, "loss": 0.0698, "step": 1680 }, { "epoch": 0.7647861692447679, "grad_norm": 0.5780781227727216, "learning_rate": 4.7172061759081646e-06, "loss": 0.0516, "step": 1681 }, { "epoch": 0.7652411282984531, "grad_norm": 0.8601028523567151, "learning_rate": 4.716875921434838e-06, "loss": 0.0804, "step": 1682 }, { "epoch": 0.7656960873521383, "grad_norm": 0.8231714424296133, "learning_rate": 4.716545485808421e-06, "loss": 0.0673, "step": 1683 }, { "epoch": 0.7661510464058234, "grad_norm": 0.8877019503026795, "learning_rate": 4.716214869055918e-06, "loss": 0.0754, "step": 1684 }, { "epoch": 0.7666060054595086, "grad_norm": 0.8595543978861313, "learning_rate": 4.715884071204344e-06, "loss": 0.0758, "step": 1685 }, { "epoch": 0.7670609645131938, "grad_norm": 0.760119775875199, "learning_rate": 4.715553092280731e-06, "loss": 0.0717, "step": 1686 }, { "epoch": 0.767515923566879, "grad_norm": 0.9302850076057104, "learning_rate": 4.7152219323121246e-06, "loss": 0.0776, "step": 1687 }, { "epoch": 0.7679708826205641, "grad_norm": 1.202654506835177, "learning_rate": 4.714890591325586e-06, "loss": 0.0968, "step": 1688 }, { "epoch": 0.7684258416742493, "grad_norm": 0.6400839574993287, "learning_rate": 4.714559069348189e-06, "loss": 0.053, "step": 1689 }, { "epoch": 0.7688808007279345, "grad_norm": 0.9345879461383537, "learning_rate": 4.714227366407027e-06, "loss": 0.0754, "step": 1690 }, { "epoch": 0.7693357597816196, "grad_norm": 0.699599210661908, "learning_rate": 4.7138954825292035e-06, "loss": 0.0562, "step": 1691 }, { "epoch": 0.7697907188353048, "grad_norm": 0.7256617889152714, "learning_rate": 4.71356341774184e-06, "loss": 0.0647, "step": 1692 }, { "epoch": 0.77024567788899, "grad_norm": 0.8163148806797087, "learning_rate": 4.713231172072069e-06, "loss": 0.0647, "step": 1693 }, { "epoch": 0.7707006369426752, "grad_norm": 0.8921055220209645, "learning_rate": 4.712898745547043e-06, "loss": 0.0688, "step": 1694 }, { "epoch": 0.7711555959963603, "grad_norm": 0.8759443756147646, "learning_rate": 4.712566138193923e-06, "loss": 0.0861, "step": 1695 }, { "epoch": 0.7716105550500455, "grad_norm": 0.6748844572059718, "learning_rate": 4.712233350039892e-06, "loss": 0.0557, "step": 1696 }, { "epoch": 0.7720655141037307, "grad_norm": 0.6739690976228645, "learning_rate": 4.711900381112141e-06, "loss": 0.0467, "step": 1697 }, { "epoch": 0.7725204731574158, "grad_norm": 0.5565716241809456, "learning_rate": 4.71156723143788e-06, "loss": 0.0494, "step": 1698 }, { "epoch": 0.772975432211101, "grad_norm": 0.6665864775582577, "learning_rate": 4.711233901044332e-06, "loss": 0.0665, "step": 1699 }, { "epoch": 0.7734303912647862, "grad_norm": 0.7450992205724415, "learning_rate": 4.710900389958735e-06, "loss": 0.0718, "step": 1700 }, { "epoch": 0.7738853503184714, "grad_norm": 1.1792700578565205, "learning_rate": 4.710566698208343e-06, "loss": 0.1197, "step": 1701 }, { "epoch": 0.7743403093721565, "grad_norm": 0.7320056890324683, "learning_rate": 4.710232825820424e-06, "loss": 0.0651, "step": 1702 }, { "epoch": 0.7747952684258417, "grad_norm": 0.4782904500778329, "learning_rate": 4.709898772822258e-06, "loss": 0.0363, "step": 1703 }, { "epoch": 0.7752502274795269, "grad_norm": 0.589565072900403, "learning_rate": 4.709564539241145e-06, "loss": 0.052, "step": 1704 }, { "epoch": 0.775705186533212, "grad_norm": 0.6162848956452569, "learning_rate": 4.709230125104396e-06, "loss": 0.0581, "step": 1705 }, { "epoch": 0.7761601455868972, "grad_norm": 0.5419452584536989, "learning_rate": 4.708895530439339e-06, "loss": 0.0426, "step": 1706 }, { "epoch": 0.7766151046405824, "grad_norm": 0.64262799617097, "learning_rate": 4.708560755273313e-06, "loss": 0.0377, "step": 1707 }, { "epoch": 0.7770700636942676, "grad_norm": 1.2766398855768186, "learning_rate": 4.7082257996336765e-06, "loss": 0.1176, "step": 1708 }, { "epoch": 0.7775250227479527, "grad_norm": 0.6290733792474179, "learning_rate": 4.707890663547801e-06, "loss": 0.0621, "step": 1709 }, { "epoch": 0.7779799818016379, "grad_norm": 0.7132245474865738, "learning_rate": 4.7075553470430695e-06, "loss": 0.0729, "step": 1710 }, { "epoch": 0.778434940855323, "grad_norm": 1.2871463763795532, "learning_rate": 4.707219850146885e-06, "loss": 0.0809, "step": 1711 }, { "epoch": 0.7788898999090081, "grad_norm": 0.7326832656978515, "learning_rate": 4.706884172886662e-06, "loss": 0.0778, "step": 1712 }, { "epoch": 0.7793448589626933, "grad_norm": 0.584850834159967, "learning_rate": 4.706548315289831e-06, "loss": 0.0561, "step": 1713 }, { "epoch": 0.7797998180163785, "grad_norm": 0.6500063491474557, "learning_rate": 4.706212277383836e-06, "loss": 0.0546, "step": 1714 }, { "epoch": 0.7802547770700637, "grad_norm": 0.8671552717303382, "learning_rate": 4.705876059196136e-06, "loss": 0.0805, "step": 1715 }, { "epoch": 0.7807097361237488, "grad_norm": 0.836331279519993, "learning_rate": 4.705539660754208e-06, "loss": 0.0794, "step": 1716 }, { "epoch": 0.781164695177434, "grad_norm": 0.7331262117419055, "learning_rate": 4.705203082085538e-06, "loss": 0.0589, "step": 1717 }, { "epoch": 0.7816196542311192, "grad_norm": 1.0967129244905651, "learning_rate": 4.70486632321763e-06, "loss": 0.0875, "step": 1718 }, { "epoch": 0.7820746132848043, "grad_norm": 0.8063548612962124, "learning_rate": 4.7045293841780034e-06, "loss": 0.0638, "step": 1719 }, { "epoch": 0.7825295723384895, "grad_norm": 0.5877288681753885, "learning_rate": 4.704192264994193e-06, "loss": 0.0489, "step": 1720 }, { "epoch": 0.7829845313921747, "grad_norm": 0.4849602579402119, "learning_rate": 4.703854965693743e-06, "loss": 0.036, "step": 1721 }, { "epoch": 0.7834394904458599, "grad_norm": 0.9265838770512554, "learning_rate": 4.703517486304218e-06, "loss": 0.0864, "step": 1722 }, { "epoch": 0.783894449499545, "grad_norm": 0.6601652550077106, "learning_rate": 4.703179826853195e-06, "loss": 0.0628, "step": 1723 }, { "epoch": 0.7843494085532302, "grad_norm": 0.9233111628201732, "learning_rate": 4.702841987368265e-06, "loss": 0.0623, "step": 1724 }, { "epoch": 0.7848043676069154, "grad_norm": 0.6187150000991709, "learning_rate": 4.702503967877038e-06, "loss": 0.0411, "step": 1725 }, { "epoch": 0.7852593266606005, "grad_norm": 1.229045396910063, "learning_rate": 4.702165768407132e-06, "loss": 0.1123, "step": 1726 }, { "epoch": 0.7857142857142857, "grad_norm": 0.7445601607520801, "learning_rate": 4.701827388986185e-06, "loss": 0.0691, "step": 1727 }, { "epoch": 0.7861692447679709, "grad_norm": 0.7316496259855203, "learning_rate": 4.701488829641845e-06, "loss": 0.0561, "step": 1728 }, { "epoch": 0.7866242038216561, "grad_norm": 0.6991204172633203, "learning_rate": 4.701150090401782e-06, "loss": 0.063, "step": 1729 }, { "epoch": 0.7870791628753412, "grad_norm": 0.6297857414561489, "learning_rate": 4.700811171293673e-06, "loss": 0.0555, "step": 1730 }, { "epoch": 0.7875341219290264, "grad_norm": 0.6996558901945711, "learning_rate": 4.700472072345214e-06, "loss": 0.0746, "step": 1731 }, { "epoch": 0.7879890809827116, "grad_norm": 0.7853149482544831, "learning_rate": 4.700132793584113e-06, "loss": 0.0651, "step": 1732 }, { "epoch": 0.7884440400363967, "grad_norm": 0.7740092326049495, "learning_rate": 4.699793335038098e-06, "loss": 0.0616, "step": 1733 }, { "epoch": 0.7888989990900819, "grad_norm": 0.6626889036581106, "learning_rate": 4.699453696734905e-06, "loss": 0.059, "step": 1734 }, { "epoch": 0.7893539581437671, "grad_norm": 0.9561954479294612, "learning_rate": 4.699113878702288e-06, "loss": 0.0938, "step": 1735 }, { "epoch": 0.7898089171974523, "grad_norm": 0.5317461912915777, "learning_rate": 4.698773880968017e-06, "loss": 0.0359, "step": 1736 }, { "epoch": 0.7902638762511374, "grad_norm": 0.5011006710196552, "learning_rate": 4.698433703559874e-06, "loss": 0.0326, "step": 1737 }, { "epoch": 0.7907188353048226, "grad_norm": 0.5030053584799479, "learning_rate": 4.698093346505656e-06, "loss": 0.0409, "step": 1738 }, { "epoch": 0.7911737943585078, "grad_norm": 0.8510969897923517, "learning_rate": 4.697752809833177e-06, "loss": 0.0724, "step": 1739 }, { "epoch": 0.7916287534121929, "grad_norm": 1.0390434682676513, "learning_rate": 4.697412093570263e-06, "loss": 0.0797, "step": 1740 }, { "epoch": 0.792083712465878, "grad_norm": 0.6499989190029223, "learning_rate": 4.697071197744756e-06, "loss": 0.0494, "step": 1741 }, { "epoch": 0.7925386715195633, "grad_norm": 0.7483950848546489, "learning_rate": 4.6967301223845115e-06, "loss": 0.0507, "step": 1742 }, { "epoch": 0.7929936305732485, "grad_norm": 0.5874179561041022, "learning_rate": 4.696388867517403e-06, "loss": 0.0555, "step": 1743 }, { "epoch": 0.7934485896269335, "grad_norm": 0.6928591342268897, "learning_rate": 4.696047433171316e-06, "loss": 0.0484, "step": 1744 }, { "epoch": 0.7939035486806187, "grad_norm": 0.784441765858547, "learning_rate": 4.695705819374149e-06, "loss": 0.0611, "step": 1745 }, { "epoch": 0.7943585077343039, "grad_norm": 0.5652038657500907, "learning_rate": 4.695364026153818e-06, "loss": 0.0535, "step": 1746 }, { "epoch": 0.794813466787989, "grad_norm": 0.7298412289038372, "learning_rate": 4.695022053538253e-06, "loss": 0.0595, "step": 1747 }, { "epoch": 0.7952684258416742, "grad_norm": 1.0490206911158746, "learning_rate": 4.694679901555398e-06, "loss": 0.0861, "step": 1748 }, { "epoch": 0.7957233848953594, "grad_norm": 3.990810702240321, "learning_rate": 4.694337570233213e-06, "loss": 0.1767, "step": 1749 }, { "epoch": 0.7961783439490446, "grad_norm": 0.6802938770066911, "learning_rate": 4.693995059599672e-06, "loss": 0.0573, "step": 1750 }, { "epoch": 0.7966333030027297, "grad_norm": 0.8799308705617829, "learning_rate": 4.693652369682762e-06, "loss": 0.0811, "step": 1751 }, { "epoch": 0.7970882620564149, "grad_norm": 0.694712285767521, "learning_rate": 4.693309500510487e-06, "loss": 0.0452, "step": 1752 }, { "epoch": 0.7975432211101001, "grad_norm": 0.7794041106607383, "learning_rate": 4.692966452110864e-06, "loss": 0.0461, "step": 1753 }, { "epoch": 0.7979981801637852, "grad_norm": 0.7973687362919706, "learning_rate": 4.6926232245119265e-06, "loss": 0.0974, "step": 1754 }, { "epoch": 0.7984531392174704, "grad_norm": 0.9760391285618086, "learning_rate": 4.69227981774172e-06, "loss": 0.07, "step": 1755 }, { "epoch": 0.7989080982711556, "grad_norm": 0.9051191420196392, "learning_rate": 4.691936231828308e-06, "loss": 0.0701, "step": 1756 }, { "epoch": 0.7993630573248408, "grad_norm": 0.8399040364365982, "learning_rate": 4.691592466799766e-06, "loss": 0.08, "step": 1757 }, { "epoch": 0.7998180163785259, "grad_norm": 0.6489937656240298, "learning_rate": 4.691248522684184e-06, "loss": 0.0557, "step": 1758 }, { "epoch": 0.8002729754322111, "grad_norm": 0.5634573167334715, "learning_rate": 4.690904399509668e-06, "loss": 0.0424, "step": 1759 }, { "epoch": 0.8007279344858963, "grad_norm": 1.0271098563007677, "learning_rate": 4.69056009730434e-06, "loss": 0.0803, "step": 1760 }, { "epoch": 0.8011828935395814, "grad_norm": 1.0217177612047041, "learning_rate": 4.690215616096332e-06, "loss": 0.0883, "step": 1761 }, { "epoch": 0.8016378525932666, "grad_norm": 0.8407253882349629, "learning_rate": 4.689870955913796e-06, "loss": 0.0793, "step": 1762 }, { "epoch": 0.8020928116469518, "grad_norm": 0.5666180272561038, "learning_rate": 4.689526116784894e-06, "loss": 0.0459, "step": 1763 }, { "epoch": 0.802547770700637, "grad_norm": 0.8415069685290992, "learning_rate": 4.689181098737805e-06, "loss": 0.0649, "step": 1764 }, { "epoch": 0.8030027297543221, "grad_norm": 0.520623178173706, "learning_rate": 4.6888359018007235e-06, "loss": 0.0416, "step": 1765 }, { "epoch": 0.8034576888080073, "grad_norm": 0.6187159463210112, "learning_rate": 4.6884905260018565e-06, "loss": 0.0456, "step": 1766 }, { "epoch": 0.8039126478616925, "grad_norm": 0.7676728568516994, "learning_rate": 4.688144971369427e-06, "loss": 0.0604, "step": 1767 }, { "epoch": 0.8043676069153776, "grad_norm": 0.7921258117073752, "learning_rate": 4.687799237931673e-06, "loss": 0.0668, "step": 1768 }, { "epoch": 0.8048225659690628, "grad_norm": 0.9320479241709307, "learning_rate": 4.687453325716844e-06, "loss": 0.1011, "step": 1769 }, { "epoch": 0.805277525022748, "grad_norm": 1.0320110116628263, "learning_rate": 4.687107234753208e-06, "loss": 0.0777, "step": 1770 }, { "epoch": 0.8057324840764332, "grad_norm": 0.9182666614681877, "learning_rate": 4.686760965069046e-06, "loss": 0.0679, "step": 1771 }, { "epoch": 0.8061874431301183, "grad_norm": 0.6859628403586197, "learning_rate": 4.686414516692653e-06, "loss": 0.0735, "step": 1772 }, { "epoch": 0.8066424021838035, "grad_norm": 0.858787608672165, "learning_rate": 4.68606788965234e-06, "loss": 0.0739, "step": 1773 }, { "epoch": 0.8070973612374887, "grad_norm": 1.0829087136143425, "learning_rate": 4.68572108397643e-06, "loss": 0.1121, "step": 1774 }, { "epoch": 0.8075523202911737, "grad_norm": 0.8511945315116681, "learning_rate": 4.6853740996932645e-06, "loss": 0.0692, "step": 1775 }, { "epoch": 0.8080072793448589, "grad_norm": 0.6850807246243011, "learning_rate": 4.685026936831196e-06, "loss": 0.0572, "step": 1776 }, { "epoch": 0.8084622383985441, "grad_norm": 0.7373142885959381, "learning_rate": 4.684679595418595e-06, "loss": 0.0543, "step": 1777 }, { "epoch": 0.8089171974522293, "grad_norm": 0.9582099582222912, "learning_rate": 4.684332075483843e-06, "loss": 0.0575, "step": 1778 }, { "epoch": 0.8093721565059144, "grad_norm": 0.8529177501583067, "learning_rate": 4.6839843770553374e-06, "loss": 0.0829, "step": 1779 }, { "epoch": 0.8098271155595996, "grad_norm": 0.5785153858733987, "learning_rate": 4.683636500161491e-06, "loss": 0.0548, "step": 1780 }, { "epoch": 0.8102820746132848, "grad_norm": 1.1231799792720614, "learning_rate": 4.683288444830732e-06, "loss": 0.1008, "step": 1781 }, { "epoch": 0.8107370336669699, "grad_norm": 0.7388433195732499, "learning_rate": 4.6829402110915015e-06, "loss": 0.0554, "step": 1782 }, { "epoch": 0.8111919927206551, "grad_norm": 0.7176040956098546, "learning_rate": 4.682591798972253e-06, "loss": 0.0592, "step": 1783 }, { "epoch": 0.8116469517743403, "grad_norm": 0.6760595625823852, "learning_rate": 4.682243208501461e-06, "loss": 0.0621, "step": 1784 }, { "epoch": 0.8121019108280255, "grad_norm": 0.6601646947125127, "learning_rate": 4.681894439707609e-06, "loss": 0.0468, "step": 1785 }, { "epoch": 0.8125568698817106, "grad_norm": 0.6367038337766298, "learning_rate": 4.681545492619195e-06, "loss": 0.0523, "step": 1786 }, { "epoch": 0.8130118289353958, "grad_norm": 0.8235921980464636, "learning_rate": 4.681196367264736e-06, "loss": 0.064, "step": 1787 }, { "epoch": 0.813466787989081, "grad_norm": 0.7238890575594984, "learning_rate": 4.680847063672761e-06, "loss": 0.0664, "step": 1788 }, { "epoch": 0.8139217470427661, "grad_norm": 0.6470968736210913, "learning_rate": 4.680497581871811e-06, "loss": 0.0621, "step": 1789 }, { "epoch": 0.8143767060964513, "grad_norm": 0.9587164618230581, "learning_rate": 4.680147921890447e-06, "loss": 0.0699, "step": 1790 }, { "epoch": 0.8148316651501365, "grad_norm": 0.7070187493876317, "learning_rate": 4.67979808375724e-06, "loss": 0.0556, "step": 1791 }, { "epoch": 0.8152866242038217, "grad_norm": 0.7878117503048105, "learning_rate": 4.679448067500777e-06, "loss": 0.06, "step": 1792 }, { "epoch": 0.8157415832575068, "grad_norm": 7.5481195301184245, "learning_rate": 4.67909787314966e-06, "loss": 0.2081, "step": 1793 }, { "epoch": 0.816196542311192, "grad_norm": 1.0374324577053136, "learning_rate": 4.678747500732505e-06, "loss": 0.0815, "step": 1794 }, { "epoch": 0.8166515013648772, "grad_norm": 0.7986120350256016, "learning_rate": 4.6783969502779455e-06, "loss": 0.0624, "step": 1795 }, { "epoch": 0.8171064604185623, "grad_norm": 0.856218662448613, "learning_rate": 4.6780462218146236e-06, "loss": 0.0755, "step": 1796 }, { "epoch": 0.8175614194722475, "grad_norm": 0.8379197196440461, "learning_rate": 4.6776953153712005e-06, "loss": 0.0892, "step": 1797 }, { "epoch": 0.8180163785259327, "grad_norm": 1.2262653431681223, "learning_rate": 4.67734423097635e-06, "loss": 0.0677, "step": 1798 }, { "epoch": 0.8184713375796179, "grad_norm": 0.6080921164509873, "learning_rate": 4.676992968658762e-06, "loss": 0.0645, "step": 1799 }, { "epoch": 0.818926296633303, "grad_norm": 0.5894555534409597, "learning_rate": 4.67664152844714e-06, "loss": 0.0557, "step": 1800 }, { "epoch": 0.8193812556869882, "grad_norm": 0.7869279254500811, "learning_rate": 4.676289910370202e-06, "loss": 0.0507, "step": 1801 }, { "epoch": 0.8198362147406734, "grad_norm": 1.200875413797979, "learning_rate": 4.675938114456682e-06, "loss": 0.0878, "step": 1802 }, { "epoch": 0.8202911737943585, "grad_norm": 0.6675759838670742, "learning_rate": 4.675586140735323e-06, "loss": 0.0639, "step": 1803 }, { "epoch": 0.8207461328480437, "grad_norm": 0.7910152821490807, "learning_rate": 4.675233989234891e-06, "loss": 0.07, "step": 1804 }, { "epoch": 0.8212010919017289, "grad_norm": 0.6182132167368, "learning_rate": 4.67488165998416e-06, "loss": 0.0497, "step": 1805 }, { "epoch": 0.821656050955414, "grad_norm": 0.952276552917119, "learning_rate": 4.674529153011922e-06, "loss": 0.0898, "step": 1806 }, { "epoch": 0.8221110100090991, "grad_norm": 0.9577124870492246, "learning_rate": 4.674176468346982e-06, "loss": 0.0859, "step": 1807 }, { "epoch": 0.8225659690627843, "grad_norm": 0.6282575796746988, "learning_rate": 4.673823606018158e-06, "loss": 0.05, "step": 1808 }, { "epoch": 0.8230209281164695, "grad_norm": 0.7575136331083856, "learning_rate": 4.673470566054288e-06, "loss": 0.0668, "step": 1809 }, { "epoch": 0.8234758871701547, "grad_norm": 0.8264565607164219, "learning_rate": 4.673117348484217e-06, "loss": 0.0651, "step": 1810 }, { "epoch": 0.8239308462238398, "grad_norm": 0.5831702959060338, "learning_rate": 4.672763953336811e-06, "loss": 0.0552, "step": 1811 }, { "epoch": 0.824385805277525, "grad_norm": 0.9120237530681591, "learning_rate": 4.672410380640946e-06, "loss": 0.068, "step": 1812 }, { "epoch": 0.8248407643312102, "grad_norm": 0.7346308742565613, "learning_rate": 4.672056630425516e-06, "loss": 0.0649, "step": 1813 }, { "epoch": 0.8252957233848953, "grad_norm": 0.6814325566875316, "learning_rate": 4.671702702719426e-06, "loss": 0.059, "step": 1814 }, { "epoch": 0.8257506824385805, "grad_norm": 0.8584736371666158, "learning_rate": 4.671348597551599e-06, "loss": 0.0712, "step": 1815 }, { "epoch": 0.8262056414922657, "grad_norm": 0.7235192665941425, "learning_rate": 4.670994314950971e-06, "loss": 0.0626, "step": 1816 }, { "epoch": 0.8266606005459509, "grad_norm": 0.5383143835892068, "learning_rate": 4.6706398549464905e-06, "loss": 0.0398, "step": 1817 }, { "epoch": 0.827115559599636, "grad_norm": 0.5271328375766583, "learning_rate": 4.670285217567124e-06, "loss": 0.034, "step": 1818 }, { "epoch": 0.8275705186533212, "grad_norm": 0.8154779822293389, "learning_rate": 4.6699304028418516e-06, "loss": 0.0717, "step": 1819 }, { "epoch": 0.8280254777070064, "grad_norm": 0.9360607447963808, "learning_rate": 4.669575410799665e-06, "loss": 0.0619, "step": 1820 }, { "epoch": 0.8284804367606915, "grad_norm": 0.6087960564203705, "learning_rate": 4.669220241469573e-06, "loss": 0.0635, "step": 1821 }, { "epoch": 0.8289353958143767, "grad_norm": 0.7167883057342898, "learning_rate": 4.668864894880599e-06, "loss": 0.0693, "step": 1822 }, { "epoch": 0.8293903548680619, "grad_norm": 3.321238319754088, "learning_rate": 4.668509371061781e-06, "loss": 0.1734, "step": 1823 }, { "epoch": 0.8298453139217471, "grad_norm": 1.0425676852340926, "learning_rate": 4.668153670042171e-06, "loss": 0.0757, "step": 1824 }, { "epoch": 0.8303002729754322, "grad_norm": 0.662221744529951, "learning_rate": 4.667797791850833e-06, "loss": 0.0514, "step": 1825 }, { "epoch": 0.8307552320291174, "grad_norm": 0.7120101760933736, "learning_rate": 4.6674417365168495e-06, "loss": 0.0584, "step": 1826 }, { "epoch": 0.8312101910828026, "grad_norm": 0.8118980885746032, "learning_rate": 4.667085504069315e-06, "loss": 0.0698, "step": 1827 }, { "epoch": 0.8316651501364877, "grad_norm": 1.0677947820913898, "learning_rate": 4.66672909453734e-06, "loss": 0.1084, "step": 1828 }, { "epoch": 0.8321201091901729, "grad_norm": 7.344384584742864, "learning_rate": 4.6663725079500485e-06, "loss": 0.1199, "step": 1829 }, { "epoch": 0.8325750682438581, "grad_norm": 0.7616332667262099, "learning_rate": 4.666015744336578e-06, "loss": 0.0532, "step": 1830 }, { "epoch": 0.8330300272975433, "grad_norm": 0.598579113637809, "learning_rate": 4.665658803726083e-06, "loss": 0.0584, "step": 1831 }, { "epoch": 0.8334849863512284, "grad_norm": 0.5463327138280551, "learning_rate": 4.6653016861477315e-06, "loss": 0.0448, "step": 1832 }, { "epoch": 0.8339399454049136, "grad_norm": 0.6038418058635472, "learning_rate": 4.664944391630704e-06, "loss": 0.0491, "step": 1833 }, { "epoch": 0.8343949044585988, "grad_norm": 0.7544848763457503, "learning_rate": 4.664586920204197e-06, "loss": 0.0645, "step": 1834 }, { "epoch": 0.8348498635122839, "grad_norm": 0.807248967975218, "learning_rate": 4.664229271897422e-06, "loss": 0.0564, "step": 1835 }, { "epoch": 0.835304822565969, "grad_norm": 0.6512214018608161, "learning_rate": 4.663871446739606e-06, "loss": 0.0678, "step": 1836 }, { "epoch": 0.8357597816196543, "grad_norm": 0.8703248212743384, "learning_rate": 4.663513444759986e-06, "loss": 0.0613, "step": 1837 }, { "epoch": 0.8362147406733395, "grad_norm": 1.0090345588843233, "learning_rate": 4.663155265987818e-06, "loss": 0.0836, "step": 1838 }, { "epoch": 0.8366696997270245, "grad_norm": 0.6694681967933211, "learning_rate": 4.66279691045237e-06, "loss": 0.05, "step": 1839 }, { "epoch": 0.8371246587807097, "grad_norm": 1.1031071759011255, "learning_rate": 4.662438378182927e-06, "loss": 0.0957, "step": 1840 }, { "epoch": 0.8375796178343949, "grad_norm": 0.728630248897681, "learning_rate": 4.662079669208783e-06, "loss": 0.0605, "step": 1841 }, { "epoch": 0.83803457688808, "grad_norm": 1.001887022400648, "learning_rate": 4.661720783559254e-06, "loss": 0.0877, "step": 1842 }, { "epoch": 0.8384895359417652, "grad_norm": 0.6786956307872102, "learning_rate": 4.661361721263664e-06, "loss": 0.0559, "step": 1843 }, { "epoch": 0.8389444949954504, "grad_norm": 0.7188093304776232, "learning_rate": 4.661002482351355e-06, "loss": 0.0614, "step": 1844 }, { "epoch": 0.8393994540491356, "grad_norm": 0.7081647144591069, "learning_rate": 4.660643066851682e-06, "loss": 0.0496, "step": 1845 }, { "epoch": 0.8398544131028207, "grad_norm": 0.6295835306947963, "learning_rate": 4.6602834747940155e-06, "loss": 0.0585, "step": 1846 }, { "epoch": 0.8403093721565059, "grad_norm": 0.6877911302656206, "learning_rate": 4.6599237062077385e-06, "loss": 0.0537, "step": 1847 }, { "epoch": 0.8407643312101911, "grad_norm": 0.7291106168232226, "learning_rate": 4.65956376112225e-06, "loss": 0.0582, "step": 1848 }, { "epoch": 0.8412192902638762, "grad_norm": 0.45011772458421007, "learning_rate": 4.659203639566965e-06, "loss": 0.0324, "step": 1849 }, { "epoch": 0.8416742493175614, "grad_norm": 0.7993204530683249, "learning_rate": 4.658843341571308e-06, "loss": 0.065, "step": 1850 }, { "epoch": 0.8421292083712466, "grad_norm": 0.6243583315447274, "learning_rate": 4.6584828671647235e-06, "loss": 0.0476, "step": 1851 }, { "epoch": 0.8425841674249318, "grad_norm": 0.7933509538566996, "learning_rate": 4.658122216376666e-06, "loss": 0.0816, "step": 1852 }, { "epoch": 0.8430391264786169, "grad_norm": 1.1708916451450775, "learning_rate": 4.657761389236607e-06, "loss": 0.1023, "step": 1853 }, { "epoch": 0.8434940855323021, "grad_norm": 1.2414495723920722, "learning_rate": 4.657400385774032e-06, "loss": 0.0961, "step": 1854 }, { "epoch": 0.8439490445859873, "grad_norm": 0.8143335380819541, "learning_rate": 4.65703920601844e-06, "loss": 0.0868, "step": 1855 }, { "epoch": 0.8444040036396724, "grad_norm": 0.6991877465392216, "learning_rate": 4.656677849999345e-06, "loss": 0.05, "step": 1856 }, { "epoch": 0.8448589626933576, "grad_norm": 0.578084819635394, "learning_rate": 4.656316317746275e-06, "loss": 0.0351, "step": 1857 }, { "epoch": 0.8453139217470428, "grad_norm": 0.7330779326514238, "learning_rate": 4.655954609288775e-06, "loss": 0.0611, "step": 1858 }, { "epoch": 0.845768880800728, "grad_norm": 0.8310851616975516, "learning_rate": 4.655592724656399e-06, "loss": 0.0707, "step": 1859 }, { "epoch": 0.8462238398544131, "grad_norm": 0.548925696011472, "learning_rate": 4.655230663878721e-06, "loss": 0.0465, "step": 1860 }, { "epoch": 0.8466787989080983, "grad_norm": 0.6143015545428137, "learning_rate": 4.654868426985326e-06, "loss": 0.0485, "step": 1861 }, { "epoch": 0.8471337579617835, "grad_norm": 1.110252508265771, "learning_rate": 4.654506014005814e-06, "loss": 0.107, "step": 1862 }, { "epoch": 0.8475887170154686, "grad_norm": 0.9959562888894351, "learning_rate": 4.6541434249698e-06, "loss": 0.0833, "step": 1863 }, { "epoch": 0.8480436760691538, "grad_norm": 0.5468147069872943, "learning_rate": 4.6537806599069144e-06, "loss": 0.0515, "step": 1864 }, { "epoch": 0.848498635122839, "grad_norm": 0.8717783156119658, "learning_rate": 4.653417718846799e-06, "loss": 0.0708, "step": 1865 }, { "epoch": 0.8489535941765242, "grad_norm": 0.9951106402676078, "learning_rate": 4.6530546018191126e-06, "loss": 0.0676, "step": 1866 }, { "epoch": 0.8494085532302093, "grad_norm": 1.7646743889926437, "learning_rate": 4.652691308853526e-06, "loss": 0.0941, "step": 1867 }, { "epoch": 0.8498635122838945, "grad_norm": 0.7838033849127587, "learning_rate": 4.652327839979729e-06, "loss": 0.0658, "step": 1868 }, { "epoch": 0.8503184713375797, "grad_norm": 0.8803955131716555, "learning_rate": 4.651964195227419e-06, "loss": 0.0512, "step": 1869 }, { "epoch": 0.8507734303912647, "grad_norm": 0.6367380704516323, "learning_rate": 4.651600374626315e-06, "loss": 0.0627, "step": 1870 }, { "epoch": 0.8512283894449499, "grad_norm": 0.7460847826389658, "learning_rate": 4.651236378206144e-06, "loss": 0.0631, "step": 1871 }, { "epoch": 0.8516833484986351, "grad_norm": 1.0421113051413289, "learning_rate": 4.650872205996651e-06, "loss": 0.0895, "step": 1872 }, { "epoch": 0.8521383075523203, "grad_norm": 0.9088772885373132, "learning_rate": 4.650507858027595e-06, "loss": 0.0634, "step": 1873 }, { "epoch": 0.8525932666060054, "grad_norm": 0.6693618140614889, "learning_rate": 4.6501433343287475e-06, "loss": 0.0573, "step": 1874 }, { "epoch": 0.8530482256596906, "grad_norm": 1.0233365193295665, "learning_rate": 4.6497786349298975e-06, "loss": 0.0617, "step": 1875 }, { "epoch": 0.8535031847133758, "grad_norm": 0.703907501372265, "learning_rate": 4.649413759860846e-06, "loss": 0.0478, "step": 1876 }, { "epoch": 0.8539581437670609, "grad_norm": 0.8059119032929001, "learning_rate": 4.649048709151408e-06, "loss": 0.0801, "step": 1877 }, { "epoch": 0.8544131028207461, "grad_norm": 0.6078716632060385, "learning_rate": 4.648683482831415e-06, "loss": 0.0547, "step": 1878 }, { "epoch": 0.8548680618744313, "grad_norm": 0.9990647874074321, "learning_rate": 4.648318080930711e-06, "loss": 0.0924, "step": 1879 }, { "epoch": 0.8553230209281165, "grad_norm": 2.2575417512955327, "learning_rate": 4.647952503479154e-06, "loss": 0.119, "step": 1880 }, { "epoch": 0.8557779799818016, "grad_norm": 0.9125736125319561, "learning_rate": 4.6475867505066195e-06, "loss": 0.0842, "step": 1881 }, { "epoch": 0.8562329390354868, "grad_norm": 0.8327393689357763, "learning_rate": 4.647220822042995e-06, "loss": 0.0786, "step": 1882 }, { "epoch": 0.856687898089172, "grad_norm": 0.7174000884605474, "learning_rate": 4.64685471811818e-06, "loss": 0.0543, "step": 1883 }, { "epoch": 0.8571428571428571, "grad_norm": 0.7398748518067477, "learning_rate": 4.646488438762094e-06, "loss": 0.073, "step": 1884 }, { "epoch": 0.8575978161965423, "grad_norm": 0.9193811162801766, "learning_rate": 4.646121984004666e-06, "loss": 0.0811, "step": 1885 }, { "epoch": 0.8580527752502275, "grad_norm": 1.5266758295579101, "learning_rate": 4.64575535387584e-06, "loss": 0.1173, "step": 1886 }, { "epoch": 0.8585077343039127, "grad_norm": 1.0390577419470794, "learning_rate": 4.645388548405578e-06, "loss": 0.0844, "step": 1887 }, { "epoch": 0.8589626933575978, "grad_norm": 0.9981604633117009, "learning_rate": 4.645021567623852e-06, "loss": 0.076, "step": 1888 }, { "epoch": 0.859417652411283, "grad_norm": 0.8090672789437001, "learning_rate": 4.644654411560651e-06, "loss": 0.0668, "step": 1889 }, { "epoch": 0.8598726114649682, "grad_norm": 1.0022610897873472, "learning_rate": 4.644287080245975e-06, "loss": 0.0647, "step": 1890 }, { "epoch": 0.8603275705186533, "grad_norm": 0.8678922953452304, "learning_rate": 4.643919573709843e-06, "loss": 0.0779, "step": 1891 }, { "epoch": 0.8607825295723385, "grad_norm": 0.828489427676361, "learning_rate": 4.6435518919822854e-06, "loss": 0.0883, "step": 1892 }, { "epoch": 0.8612374886260237, "grad_norm": 0.6365179726435326, "learning_rate": 4.643184035093348e-06, "loss": 0.0485, "step": 1893 }, { "epoch": 0.8616924476797089, "grad_norm": 0.9364968630062864, "learning_rate": 4.642816003073089e-06, "loss": 0.0653, "step": 1894 }, { "epoch": 0.862147406733394, "grad_norm": 0.664790354826276, "learning_rate": 4.6424477959515836e-06, "loss": 0.0651, "step": 1895 }, { "epoch": 0.8626023657870792, "grad_norm": 0.6472325137853298, "learning_rate": 4.642079413758919e-06, "loss": 0.0563, "step": 1896 }, { "epoch": 0.8630573248407644, "grad_norm": 0.6990154291209034, "learning_rate": 4.641710856525199e-06, "loss": 0.0569, "step": 1897 }, { "epoch": 0.8635122838944495, "grad_norm": 0.8678945062000727, "learning_rate": 4.641342124280539e-06, "loss": 0.0901, "step": 1898 }, { "epoch": 0.8639672429481347, "grad_norm": 0.5665084013617818, "learning_rate": 4.6409732170550705e-06, "loss": 0.0487, "step": 1899 }, { "epoch": 0.8644222020018199, "grad_norm": 0.7935558097630077, "learning_rate": 4.64060413487894e-06, "loss": 0.0812, "step": 1900 }, { "epoch": 0.864877161055505, "grad_norm": 0.5864687466325638, "learning_rate": 4.640234877782306e-06, "loss": 0.0458, "step": 1901 }, { "epoch": 0.8653321201091901, "grad_norm": 0.813443015691526, "learning_rate": 4.639865445795344e-06, "loss": 0.0501, "step": 1902 }, { "epoch": 0.8657870791628753, "grad_norm": 0.5920765909800347, "learning_rate": 4.63949583894824e-06, "loss": 0.0547, "step": 1903 }, { "epoch": 0.8662420382165605, "grad_norm": 0.8617399384530425, "learning_rate": 4.639126057271199e-06, "loss": 0.0826, "step": 1904 }, { "epoch": 0.8666969972702456, "grad_norm": 1.1341813288905305, "learning_rate": 4.6387561007944355e-06, "loss": 0.1245, "step": 1905 }, { "epoch": 0.8671519563239308, "grad_norm": 0.7626148045229316, "learning_rate": 4.638385969548183e-06, "loss": 0.086, "step": 1906 }, { "epoch": 0.867606915377616, "grad_norm": 0.7074898750617904, "learning_rate": 4.638015663562686e-06, "loss": 0.0648, "step": 1907 }, { "epoch": 0.8680618744313012, "grad_norm": 0.7616835782134675, "learning_rate": 4.637645182868204e-06, "loss": 0.0662, "step": 1908 }, { "epoch": 0.8685168334849863, "grad_norm": 0.6447306948142749, "learning_rate": 4.637274527495011e-06, "loss": 0.0466, "step": 1909 }, { "epoch": 0.8689717925386715, "grad_norm": 0.6794305905203397, "learning_rate": 4.6369036974733955e-06, "loss": 0.0608, "step": 1910 }, { "epoch": 0.8694267515923567, "grad_norm": 0.6697315489178187, "learning_rate": 4.63653269283366e-06, "loss": 0.0638, "step": 1911 }, { "epoch": 0.8698817106460418, "grad_norm": 0.6256598442886095, "learning_rate": 4.636161513606122e-06, "loss": 0.0673, "step": 1912 }, { "epoch": 0.870336669699727, "grad_norm": 0.7994234222622871, "learning_rate": 4.6357901598211105e-06, "loss": 0.0821, "step": 1913 }, { "epoch": 0.8707916287534122, "grad_norm": 0.7041048918645969, "learning_rate": 4.635418631508974e-06, "loss": 0.0589, "step": 1914 }, { "epoch": 0.8712465878070974, "grad_norm": 0.5910855820794297, "learning_rate": 4.635046928700069e-06, "loss": 0.0618, "step": 1915 }, { "epoch": 0.8717015468607825, "grad_norm": 0.6953978547081013, "learning_rate": 4.634675051424771e-06, "loss": 0.0609, "step": 1916 }, { "epoch": 0.8721565059144677, "grad_norm": 0.6402822857806215, "learning_rate": 4.634302999713468e-06, "loss": 0.05, "step": 1917 }, { "epoch": 0.8726114649681529, "grad_norm": 1.4100178497357636, "learning_rate": 4.633930773596563e-06, "loss": 0.1251, "step": 1918 }, { "epoch": 0.873066424021838, "grad_norm": 1.0067064388849685, "learning_rate": 4.633558373104472e-06, "loss": 0.0863, "step": 1919 }, { "epoch": 0.8735213830755232, "grad_norm": 0.7720432867298371, "learning_rate": 4.633185798267625e-06, "loss": 0.0812, "step": 1920 }, { "epoch": 0.8739763421292084, "grad_norm": 0.9130001191840268, "learning_rate": 4.632813049116467e-06, "loss": 0.0762, "step": 1921 }, { "epoch": 0.8744313011828936, "grad_norm": 0.7297896124591896, "learning_rate": 4.63244012568146e-06, "loss": 0.0623, "step": 1922 }, { "epoch": 0.8748862602365787, "grad_norm": 1.1183042674093928, "learning_rate": 4.632067027993076e-06, "loss": 0.073, "step": 1923 }, { "epoch": 0.8753412192902639, "grad_norm": 0.8542658526151589, "learning_rate": 4.631693756081802e-06, "loss": 0.0719, "step": 1924 }, { "epoch": 0.8757961783439491, "grad_norm": 0.6727521948478059, "learning_rate": 4.631320309978141e-06, "loss": 0.072, "step": 1925 }, { "epoch": 0.8762511373976342, "grad_norm": 0.8947150872279354, "learning_rate": 4.630946689712609e-06, "loss": 0.0775, "step": 1926 }, { "epoch": 0.8767060964513194, "grad_norm": 0.7373481218781285, "learning_rate": 4.630572895315737e-06, "loss": 0.058, "step": 1927 }, { "epoch": 0.8771610555050046, "grad_norm": 0.4756516758736572, "learning_rate": 4.63019892681807e-06, "loss": 0.0445, "step": 1928 }, { "epoch": 0.8776160145586898, "grad_norm": 0.5208149626418009, "learning_rate": 4.629824784250166e-06, "loss": 0.0487, "step": 1929 }, { "epoch": 0.8780709736123748, "grad_norm": 0.5811343609358607, "learning_rate": 4.629450467642599e-06, "loss": 0.0473, "step": 1930 }, { "epoch": 0.87852593266606, "grad_norm": 0.6428228760282421, "learning_rate": 4.629075977025957e-06, "loss": 0.0691, "step": 1931 }, { "epoch": 0.8789808917197452, "grad_norm": 0.7532116780570327, "learning_rate": 4.62870131243084e-06, "loss": 0.076, "step": 1932 }, { "epoch": 0.8794358507734303, "grad_norm": 0.6362438045015979, "learning_rate": 4.628326473887865e-06, "loss": 0.0437, "step": 1933 }, { "epoch": 0.8798908098271155, "grad_norm": 0.6092139546854358, "learning_rate": 4.627951461427663e-06, "loss": 0.0401, "step": 1934 }, { "epoch": 0.8803457688808007, "grad_norm": 0.6583098597040281, "learning_rate": 4.627576275080876e-06, "loss": 0.0549, "step": 1935 }, { "epoch": 0.8808007279344859, "grad_norm": 0.6448996031197749, "learning_rate": 4.627200914878165e-06, "loss": 0.0566, "step": 1936 }, { "epoch": 0.881255686988171, "grad_norm": 0.9917471720064225, "learning_rate": 4.6268253808502005e-06, "loss": 0.0949, "step": 1937 }, { "epoch": 0.8817106460418562, "grad_norm": 0.4519754904291037, "learning_rate": 4.626449673027671e-06, "loss": 0.0369, "step": 1938 }, { "epoch": 0.8821656050955414, "grad_norm": 0.7122032429922148, "learning_rate": 4.626073791441278e-06, "loss": 0.0639, "step": 1939 }, { "epoch": 0.8826205641492265, "grad_norm": 0.7957247576823104, "learning_rate": 4.625697736121735e-06, "loss": 0.076, "step": 1940 }, { "epoch": 0.8830755232029117, "grad_norm": 1.0448936625007237, "learning_rate": 4.6253215070997735e-06, "loss": 0.0947, "step": 1941 }, { "epoch": 0.8835304822565969, "grad_norm": 0.7138340745094094, "learning_rate": 4.624945104406135e-06, "loss": 0.0603, "step": 1942 }, { "epoch": 0.8839854413102821, "grad_norm": 0.7835217856531912, "learning_rate": 4.624568528071579e-06, "loss": 0.0568, "step": 1943 }, { "epoch": 0.8844404003639672, "grad_norm": 0.6440096764240494, "learning_rate": 4.624191778126879e-06, "loss": 0.0643, "step": 1944 }, { "epoch": 0.8848953594176524, "grad_norm": 0.9196121184501301, "learning_rate": 4.623814854602818e-06, "loss": 0.0861, "step": 1945 }, { "epoch": 0.8853503184713376, "grad_norm": 0.7529334500938356, "learning_rate": 4.623437757530198e-06, "loss": 0.0621, "step": 1946 }, { "epoch": 0.8858052775250227, "grad_norm": 0.6444747494120212, "learning_rate": 4.623060486939835e-06, "loss": 0.0651, "step": 1947 }, { "epoch": 0.8862602365787079, "grad_norm": 0.9312918449614406, "learning_rate": 4.622683042862556e-06, "loss": 0.0774, "step": 1948 }, { "epoch": 0.8867151956323931, "grad_norm": 0.571023976593836, "learning_rate": 4.622305425329205e-06, "loss": 0.0488, "step": 1949 }, { "epoch": 0.8871701546860783, "grad_norm": 0.4821842011661118, "learning_rate": 4.621927634370638e-06, "loss": 0.0413, "step": 1950 }, { "epoch": 0.8876251137397634, "grad_norm": 0.6368005077549093, "learning_rate": 4.621549670017727e-06, "loss": 0.0542, "step": 1951 }, { "epoch": 0.8880800727934486, "grad_norm": 0.7246243226204113, "learning_rate": 4.6211715323013595e-06, "loss": 0.0623, "step": 1952 }, { "epoch": 0.8885350318471338, "grad_norm": 0.7688120695285035, "learning_rate": 4.6207932212524325e-06, "loss": 0.0753, "step": 1953 }, { "epoch": 0.8889899909008189, "grad_norm": 0.7357035989399864, "learning_rate": 4.620414736901861e-06, "loss": 0.0645, "step": 1954 }, { "epoch": 0.8894449499545041, "grad_norm": 0.6921873537829016, "learning_rate": 4.620036079280573e-06, "loss": 0.0674, "step": 1955 }, { "epoch": 0.8898999090081893, "grad_norm": 0.9110581432700333, "learning_rate": 4.619657248419511e-06, "loss": 0.086, "step": 1956 }, { "epoch": 0.8903548680618745, "grad_norm": 0.698295996358244, "learning_rate": 4.61927824434963e-06, "loss": 0.0575, "step": 1957 }, { "epoch": 0.8908098271155596, "grad_norm": 0.9051372778412762, "learning_rate": 4.6188990671019015e-06, "loss": 0.0889, "step": 1958 }, { "epoch": 0.8912647861692448, "grad_norm": 0.7017231398160091, "learning_rate": 4.618519716707311e-06, "loss": 0.0693, "step": 1959 }, { "epoch": 0.89171974522293, "grad_norm": 0.8361247694534356, "learning_rate": 4.618140193196856e-06, "loss": 0.0678, "step": 1960 }, { "epoch": 0.892174704276615, "grad_norm": 0.682212468445449, "learning_rate": 4.61776049660155e-06, "loss": 0.0637, "step": 1961 }, { "epoch": 0.8926296633303002, "grad_norm": 0.529244582188164, "learning_rate": 4.61738062695242e-06, "loss": 0.0447, "step": 1962 }, { "epoch": 0.8930846223839854, "grad_norm": 0.8253799792777393, "learning_rate": 4.617000584280506e-06, "loss": 0.0539, "step": 1963 }, { "epoch": 0.8935395814376706, "grad_norm": 0.8363462937787527, "learning_rate": 4.616620368616866e-06, "loss": 0.0808, "step": 1964 }, { "epoch": 0.8939945404913557, "grad_norm": 1.208594162076693, "learning_rate": 4.616239979992568e-06, "loss": 0.1071, "step": 1965 }, { "epoch": 0.8944494995450409, "grad_norm": 0.7323210379619807, "learning_rate": 4.615859418438695e-06, "loss": 0.0617, "step": 1966 }, { "epoch": 0.8949044585987261, "grad_norm": 0.5029303539944047, "learning_rate": 4.615478683986345e-06, "loss": 0.0447, "step": 1967 }, { "epoch": 0.8953594176524113, "grad_norm": 1.1709329391548968, "learning_rate": 4.6150977766666315e-06, "loss": 0.1162, "step": 1968 }, { "epoch": 0.8958143767060964, "grad_norm": 2.1123193474897746, "learning_rate": 4.614716696510679e-06, "loss": 0.1355, "step": 1969 }, { "epoch": 0.8962693357597816, "grad_norm": 0.7444729756367264, "learning_rate": 4.614335443549628e-06, "loss": 0.059, "step": 1970 }, { "epoch": 0.8967242948134668, "grad_norm": 0.5785396613883771, "learning_rate": 4.613954017814633e-06, "loss": 0.0419, "step": 1971 }, { "epoch": 0.8971792538671519, "grad_norm": 0.6399971925788602, "learning_rate": 4.613572419336862e-06, "loss": 0.052, "step": 1972 }, { "epoch": 0.8976342129208371, "grad_norm": 1.0179431232847986, "learning_rate": 4.613190648147497e-06, "loss": 0.0795, "step": 1973 }, { "epoch": 0.8980891719745223, "grad_norm": 0.8018358499725766, "learning_rate": 4.612808704277736e-06, "loss": 0.0726, "step": 1974 }, { "epoch": 0.8985441310282075, "grad_norm": 1.057940475258921, "learning_rate": 4.612426587758789e-06, "loss": 0.1035, "step": 1975 }, { "epoch": 0.8989990900818926, "grad_norm": 0.6814493223121904, "learning_rate": 4.612044298621881e-06, "loss": 0.0617, "step": 1976 }, { "epoch": 0.8994540491355778, "grad_norm": 0.8657309258123689, "learning_rate": 4.611661836898252e-06, "loss": 0.0692, "step": 1977 }, { "epoch": 0.899909008189263, "grad_norm": 0.8580923454556945, "learning_rate": 4.611279202619151e-06, "loss": 0.0745, "step": 1978 }, { "epoch": 0.9003639672429481, "grad_norm": 0.8215028957062736, "learning_rate": 4.61089639581585e-06, "loss": 0.0788, "step": 1979 }, { "epoch": 0.9008189262966333, "grad_norm": 0.8812044153052796, "learning_rate": 4.610513416519628e-06, "loss": 0.0761, "step": 1980 }, { "epoch": 0.9012738853503185, "grad_norm": 0.7857406697564773, "learning_rate": 4.6101302647617806e-06, "loss": 0.0688, "step": 1981 }, { "epoch": 0.9017288444040037, "grad_norm": 0.8417981177673383, "learning_rate": 4.609746940573617e-06, "loss": 0.0689, "step": 1982 }, { "epoch": 0.9021838034576888, "grad_norm": 0.8169079358621493, "learning_rate": 4.609363443986461e-06, "loss": 0.0648, "step": 1983 }, { "epoch": 0.902638762511374, "grad_norm": 0.7566807475286295, "learning_rate": 4.60897977503165e-06, "loss": 0.0616, "step": 1984 }, { "epoch": 0.9030937215650592, "grad_norm": 1.0714038601336504, "learning_rate": 4.608595933740536e-06, "loss": 0.1018, "step": 1985 }, { "epoch": 0.9035486806187443, "grad_norm": 0.9493251787399868, "learning_rate": 4.608211920144485e-06, "loss": 0.1073, "step": 1986 }, { "epoch": 0.9040036396724295, "grad_norm": 0.9973544276569233, "learning_rate": 4.607827734274876e-06, "loss": 0.0864, "step": 1987 }, { "epoch": 0.9044585987261147, "grad_norm": 0.6218560970311163, "learning_rate": 4.607443376163104e-06, "loss": 0.0421, "step": 1988 }, { "epoch": 0.9049135577797999, "grad_norm": 0.7664118230444326, "learning_rate": 4.607058845840576e-06, "loss": 0.0663, "step": 1989 }, { "epoch": 0.905368516833485, "grad_norm": 0.5074521856900044, "learning_rate": 4.606674143338714e-06, "loss": 0.0417, "step": 1990 }, { "epoch": 0.9058234758871702, "grad_norm": 0.662797208968593, "learning_rate": 4.606289268688955e-06, "loss": 0.049, "step": 1991 }, { "epoch": 0.9062784349408554, "grad_norm": 0.8307833634275037, "learning_rate": 4.605904221922749e-06, "loss": 0.0688, "step": 1992 }, { "epoch": 0.9067333939945404, "grad_norm": 0.6991781224532665, "learning_rate": 4.6055190030715605e-06, "loss": 0.0548, "step": 1993 }, { "epoch": 0.9071883530482256, "grad_norm": 0.6371941048527485, "learning_rate": 4.605133612166868e-06, "loss": 0.0565, "step": 1994 }, { "epoch": 0.9076433121019108, "grad_norm": 0.7661847561683841, "learning_rate": 4.604748049240162e-06, "loss": 0.0751, "step": 1995 }, { "epoch": 0.908098271155596, "grad_norm": 0.698983684338044, "learning_rate": 4.604362314322951e-06, "loss": 0.0618, "step": 1996 }, { "epoch": 0.9085532302092811, "grad_norm": 0.7223001112189644, "learning_rate": 4.603976407446756e-06, "loss": 0.0604, "step": 1997 }, { "epoch": 0.9090081892629663, "grad_norm": 0.6724854071313681, "learning_rate": 4.603590328643108e-06, "loss": 0.047, "step": 1998 }, { "epoch": 0.9094631483166515, "grad_norm": 0.7241602536272622, "learning_rate": 4.60320407794356e-06, "loss": 0.0616, "step": 1999 }, { "epoch": 0.9099181073703366, "grad_norm": 0.661995220560214, "learning_rate": 4.602817655379672e-06, "loss": 0.0706, "step": 2000 }, { "epoch": 0.9103730664240218, "grad_norm": 0.6993774781403542, "learning_rate": 4.602431060983022e-06, "loss": 0.0667, "step": 2001 }, { "epoch": 0.910828025477707, "grad_norm": 0.7589317965474969, "learning_rate": 4.6020442947852e-06, "loss": 0.0781, "step": 2002 }, { "epoch": 0.9112829845313922, "grad_norm": 0.5530988126559163, "learning_rate": 4.6016573568178105e-06, "loss": 0.0417, "step": 2003 }, { "epoch": 0.9117379435850773, "grad_norm": 0.7697388126057101, "learning_rate": 4.601270247112473e-06, "loss": 0.0931, "step": 2004 }, { "epoch": 0.9121929026387625, "grad_norm": 0.7976083703107357, "learning_rate": 4.60088296570082e-06, "loss": 0.0536, "step": 2005 }, { "epoch": 0.9126478616924477, "grad_norm": 0.5035240024386465, "learning_rate": 4.600495512614499e-06, "loss": 0.0482, "step": 2006 }, { "epoch": 0.9131028207461328, "grad_norm": 0.9991410007845525, "learning_rate": 4.60010788788517e-06, "loss": 0.0982, "step": 2007 }, { "epoch": 0.913557779799818, "grad_norm": 0.741220096048184, "learning_rate": 4.5997200915445095e-06, "loss": 0.067, "step": 2008 }, { "epoch": 0.9140127388535032, "grad_norm": 0.7469087070674232, "learning_rate": 4.599332123624204e-06, "loss": 0.0762, "step": 2009 }, { "epoch": 0.9144676979071884, "grad_norm": 0.7289519001463325, "learning_rate": 4.598943984155959e-06, "loss": 0.0688, "step": 2010 }, { "epoch": 0.9149226569608735, "grad_norm": 0.6249519650896803, "learning_rate": 4.598555673171489e-06, "loss": 0.0455, "step": 2011 }, { "epoch": 0.9153776160145587, "grad_norm": 0.5475588599861211, "learning_rate": 4.5981671907025275e-06, "loss": 0.0453, "step": 2012 }, { "epoch": 0.9158325750682439, "grad_norm": 0.8942412439441615, "learning_rate": 4.597778536780818e-06, "loss": 0.0626, "step": 2013 }, { "epoch": 0.916287534121929, "grad_norm": 0.7776225162515735, "learning_rate": 4.597389711438121e-06, "loss": 0.0598, "step": 2014 }, { "epoch": 0.9167424931756142, "grad_norm": 0.5972302865002332, "learning_rate": 4.597000714706207e-06, "loss": 0.043, "step": 2015 }, { "epoch": 0.9171974522292994, "grad_norm": 1.0530137942692235, "learning_rate": 4.596611546616865e-06, "loss": 0.099, "step": 2016 }, { "epoch": 0.9176524112829846, "grad_norm": 0.8214731002522497, "learning_rate": 4.596222207201896e-06, "loss": 0.0651, "step": 2017 }, { "epoch": 0.9181073703366697, "grad_norm": 1.1339945212794975, "learning_rate": 4.595832696493115e-06, "loss": 0.1155, "step": 2018 }, { "epoch": 0.9185623293903549, "grad_norm": 0.7172896510951188, "learning_rate": 4.59544301452235e-06, "loss": 0.0591, "step": 2019 }, { "epoch": 0.9190172884440401, "grad_norm": 1.0007257724401275, "learning_rate": 4.595053161321444e-06, "loss": 0.0726, "step": 2020 }, { "epoch": 0.9194722474977252, "grad_norm": 0.6723310112271977, "learning_rate": 4.594663136922256e-06, "loss": 0.0585, "step": 2021 }, { "epoch": 0.9199272065514104, "grad_norm": 0.720246302857727, "learning_rate": 4.594272941356655e-06, "loss": 0.0563, "step": 2022 }, { "epoch": 0.9203821656050956, "grad_norm": 0.9382088606607213, "learning_rate": 4.593882574656528e-06, "loss": 0.081, "step": 2023 }, { "epoch": 0.9208371246587808, "grad_norm": 0.8121354764600041, "learning_rate": 4.5934920368537724e-06, "loss": 0.0595, "step": 2024 }, { "epoch": 0.9212920837124658, "grad_norm": 0.6886828076012578, "learning_rate": 4.593101327980301e-06, "loss": 0.0641, "step": 2025 }, { "epoch": 0.921747042766151, "grad_norm": 0.7242613606214666, "learning_rate": 4.592710448068043e-06, "loss": 0.0656, "step": 2026 }, { "epoch": 0.9222020018198362, "grad_norm": 0.7089132611487177, "learning_rate": 4.592319397148936e-06, "loss": 0.0554, "step": 2027 }, { "epoch": 0.9226569608735213, "grad_norm": 0.720645938837672, "learning_rate": 4.5919281752549386e-06, "loss": 0.0663, "step": 2028 }, { "epoch": 0.9231119199272065, "grad_norm": 0.9380123278339468, "learning_rate": 4.5915367824180165e-06, "loss": 0.0853, "step": 2029 }, { "epoch": 0.9235668789808917, "grad_norm": 0.533238980696019, "learning_rate": 4.591145218670154e-06, "loss": 0.0412, "step": 2030 }, { "epoch": 0.9240218380345769, "grad_norm": 0.8424113030761979, "learning_rate": 4.590753484043348e-06, "loss": 0.0737, "step": 2031 }, { "epoch": 0.924476797088262, "grad_norm": 1.0925264051066994, "learning_rate": 4.590361578569609e-06, "loss": 0.1022, "step": 2032 }, { "epoch": 0.9249317561419472, "grad_norm": 0.6770217428708925, "learning_rate": 4.589969502280962e-06, "loss": 0.0541, "step": 2033 }, { "epoch": 0.9253867151956324, "grad_norm": 0.57050505469179, "learning_rate": 4.589577255209445e-06, "loss": 0.0562, "step": 2034 }, { "epoch": 0.9258416742493175, "grad_norm": 0.7685735318891608, "learning_rate": 4.589184837387112e-06, "loss": 0.0633, "step": 2035 }, { "epoch": 0.9262966333030027, "grad_norm": 0.618583197540069, "learning_rate": 4.588792248846028e-06, "loss": 0.054, "step": 2036 }, { "epoch": 0.9267515923566879, "grad_norm": 1.3624730369026448, "learning_rate": 4.588399489618274e-06, "loss": 0.0878, "step": 2037 }, { "epoch": 0.9272065514103731, "grad_norm": 0.565791252388298, "learning_rate": 4.588006559735945e-06, "loss": 0.0572, "step": 2038 }, { "epoch": 0.9276615104640582, "grad_norm": 0.693373847149278, "learning_rate": 4.587613459231149e-06, "loss": 0.0607, "step": 2039 }, { "epoch": 0.9281164695177434, "grad_norm": 0.8060935136542962, "learning_rate": 4.5872201881360105e-06, "loss": 0.0739, "step": 2040 }, { "epoch": 0.9285714285714286, "grad_norm": 0.7780880150619168, "learning_rate": 4.586826746482662e-06, "loss": 0.081, "step": 2041 }, { "epoch": 0.9290263876251137, "grad_norm": 0.6712346760538427, "learning_rate": 4.586433134303257e-06, "loss": 0.06, "step": 2042 }, { "epoch": 0.9294813466787989, "grad_norm": 0.7786901525036622, "learning_rate": 4.586039351629959e-06, "loss": 0.0655, "step": 2043 }, { "epoch": 0.9299363057324841, "grad_norm": 0.47897943126694803, "learning_rate": 4.585645398494944e-06, "loss": 0.0376, "step": 2044 }, { "epoch": 0.9303912647861693, "grad_norm": 0.6962412449304831, "learning_rate": 4.585251274930406e-06, "loss": 0.0606, "step": 2045 }, { "epoch": 0.9308462238398544, "grad_norm": 1.115044271168298, "learning_rate": 4.584856980968552e-06, "loss": 0.0868, "step": 2046 }, { "epoch": 0.9313011828935396, "grad_norm": 0.7659294032948517, "learning_rate": 4.584462516641599e-06, "loss": 0.0775, "step": 2047 }, { "epoch": 0.9317561419472248, "grad_norm": 0.6771011845137347, "learning_rate": 4.584067881981784e-06, "loss": 0.059, "step": 2048 }, { "epoch": 0.9322111010009099, "grad_norm": 0.5697595033358009, "learning_rate": 4.583673077021352e-06, "loss": 0.0415, "step": 2049 }, { "epoch": 0.9326660600545951, "grad_norm": 0.7835573744548883, "learning_rate": 4.583278101792567e-06, "loss": 0.0708, "step": 2050 }, { "epoch": 0.9331210191082803, "grad_norm": 0.7984504611053544, "learning_rate": 4.582882956327704e-06, "loss": 0.0645, "step": 2051 }, { "epoch": 0.9335759781619655, "grad_norm": 0.8236288743661234, "learning_rate": 4.58248764065905e-06, "loss": 0.0631, "step": 2052 }, { "epoch": 0.9340309372156506, "grad_norm": 0.6397509690764358, "learning_rate": 4.582092154818912e-06, "loss": 0.0611, "step": 2053 }, { "epoch": 0.9344858962693358, "grad_norm": 0.7767543633533711, "learning_rate": 4.581696498839605e-06, "loss": 0.0744, "step": 2054 }, { "epoch": 0.934940855323021, "grad_norm": 0.6701944816432484, "learning_rate": 4.581300672753462e-06, "loss": 0.0675, "step": 2055 }, { "epoch": 0.935395814376706, "grad_norm": 0.5629715439636069, "learning_rate": 4.580904676592826e-06, "loss": 0.0446, "step": 2056 }, { "epoch": 0.9358507734303912, "grad_norm": 0.6828299934760278, "learning_rate": 4.580508510390057e-06, "loss": 0.0461, "step": 2057 }, { "epoch": 0.9363057324840764, "grad_norm": 0.8984515044644055, "learning_rate": 4.580112174177529e-06, "loss": 0.0915, "step": 2058 }, { "epoch": 0.9367606915377616, "grad_norm": 0.6175632910426854, "learning_rate": 4.5797156679876274e-06, "loss": 0.0507, "step": 2059 }, { "epoch": 0.9372156505914467, "grad_norm": 0.6720822499361706, "learning_rate": 4.5793189918527524e-06, "loss": 0.0749, "step": 2060 }, { "epoch": 0.9376706096451319, "grad_norm": 0.6589572570555299, "learning_rate": 4.5789221458053205e-06, "loss": 0.0583, "step": 2061 }, { "epoch": 0.9381255686988171, "grad_norm": 1.1346300175180764, "learning_rate": 4.578525129877759e-06, "loss": 0.0779, "step": 2062 }, { "epoch": 0.9385805277525022, "grad_norm": 0.6833534458218872, "learning_rate": 4.5781279441025105e-06, "loss": 0.0719, "step": 2063 }, { "epoch": 0.9390354868061874, "grad_norm": 0.7906899586059671, "learning_rate": 4.577730588512031e-06, "loss": 0.0717, "step": 2064 }, { "epoch": 0.9394904458598726, "grad_norm": 0.9641443471266038, "learning_rate": 4.577333063138791e-06, "loss": 0.0751, "step": 2065 }, { "epoch": 0.9399454049135578, "grad_norm": 0.6129982759361653, "learning_rate": 4.576935368015274e-06, "loss": 0.0535, "step": 2066 }, { "epoch": 0.9404003639672429, "grad_norm": 0.9260468404104475, "learning_rate": 4.576537503173978e-06, "loss": 0.1152, "step": 2067 }, { "epoch": 0.9408553230209281, "grad_norm": 0.8261242184140701, "learning_rate": 4.576139468647415e-06, "loss": 0.0671, "step": 2068 }, { "epoch": 0.9413102820746133, "grad_norm": 0.6924666122798075, "learning_rate": 4.575741264468111e-06, "loss": 0.0466, "step": 2069 }, { "epoch": 0.9417652411282984, "grad_norm": 0.874077324118521, "learning_rate": 4.575342890668603e-06, "loss": 0.0706, "step": 2070 }, { "epoch": 0.9422202001819836, "grad_norm": 1.1818230514353092, "learning_rate": 4.574944347281448e-06, "loss": 0.1147, "step": 2071 }, { "epoch": 0.9426751592356688, "grad_norm": 0.7829196495955753, "learning_rate": 4.5745456343392114e-06, "loss": 0.0549, "step": 2072 }, { "epoch": 0.943130118289354, "grad_norm": 0.7015791020130244, "learning_rate": 4.574146751874473e-06, "loss": 0.0719, "step": 2073 }, { "epoch": 0.9435850773430391, "grad_norm": 0.7725458409426254, "learning_rate": 4.57374769991983e-06, "loss": 0.0672, "step": 2074 }, { "epoch": 0.9440400363967243, "grad_norm": 0.5494001673465373, "learning_rate": 4.573348478507888e-06, "loss": 0.0492, "step": 2075 }, { "epoch": 0.9444949954504095, "grad_norm": 1.2424508447918836, "learning_rate": 4.5729490876712725e-06, "loss": 0.1248, "step": 2076 }, { "epoch": 0.9449499545040946, "grad_norm": 0.7654468382081444, "learning_rate": 4.572549527442619e-06, "loss": 0.066, "step": 2077 }, { "epoch": 0.9454049135577798, "grad_norm": 0.7504667900780868, "learning_rate": 4.572149797854578e-06, "loss": 0.0715, "step": 2078 }, { "epoch": 0.945859872611465, "grad_norm": 0.5821750562736777, "learning_rate": 4.571749898939813e-06, "loss": 0.0488, "step": 2079 }, { "epoch": 0.9463148316651502, "grad_norm": 0.5995576604839662, "learning_rate": 4.5713498307310024e-06, "loss": 0.0467, "step": 2080 }, { "epoch": 0.9467697907188353, "grad_norm": 0.5193399380100792, "learning_rate": 4.570949593260837e-06, "loss": 0.0418, "step": 2081 }, { "epoch": 0.9472247497725205, "grad_norm": 0.7726672189615522, "learning_rate": 4.570549186562024e-06, "loss": 0.068, "step": 2082 }, { "epoch": 0.9476797088262057, "grad_norm": 0.9870162190322213, "learning_rate": 4.570148610667281e-06, "loss": 0.0785, "step": 2083 }, { "epoch": 0.9481346678798908, "grad_norm": 1.1745928555777398, "learning_rate": 4.569747865609343e-06, "loss": 0.0952, "step": 2084 }, { "epoch": 0.948589626933576, "grad_norm": 0.985185956121142, "learning_rate": 4.569346951420957e-06, "loss": 0.0928, "step": 2085 }, { "epoch": 0.9490445859872612, "grad_norm": 0.842113304644101, "learning_rate": 4.568945868134882e-06, "loss": 0.0758, "step": 2086 }, { "epoch": 0.9494995450409464, "grad_norm": 0.9148030370424597, "learning_rate": 4.568544615783894e-06, "loss": 0.0757, "step": 2087 }, { "epoch": 0.9499545040946314, "grad_norm": 0.6563897177314274, "learning_rate": 4.568143194400782e-06, "loss": 0.054, "step": 2088 }, { "epoch": 0.9504094631483166, "grad_norm": 0.7611486885212506, "learning_rate": 4.567741604018348e-06, "loss": 0.0636, "step": 2089 }, { "epoch": 0.9508644222020018, "grad_norm": 1.0072775936266698, "learning_rate": 4.567339844669407e-06, "loss": 0.0832, "step": 2090 }, { "epoch": 0.9513193812556869, "grad_norm": 0.574494067350516, "learning_rate": 4.566937916386791e-06, "loss": 0.0586, "step": 2091 }, { "epoch": 0.9517743403093721, "grad_norm": 0.6300268259772549, "learning_rate": 4.566535819203342e-06, "loss": 0.0632, "step": 2092 }, { "epoch": 0.9522292993630573, "grad_norm": 0.7929310570764979, "learning_rate": 4.566133553151918e-06, "loss": 0.0818, "step": 2093 }, { "epoch": 0.9526842584167425, "grad_norm": 0.807980183016958, "learning_rate": 4.565731118265392e-06, "loss": 0.0699, "step": 2094 }, { "epoch": 0.9531392174704276, "grad_norm": 1.0129943914072512, "learning_rate": 4.5653285145766465e-06, "loss": 0.0819, "step": 2095 }, { "epoch": 0.9535941765241128, "grad_norm": 0.41876997993238735, "learning_rate": 4.564925742118583e-06, "loss": 0.0357, "step": 2096 }, { "epoch": 0.954049135577798, "grad_norm": 0.9399575388698519, "learning_rate": 4.564522800924111e-06, "loss": 0.0924, "step": 2097 }, { "epoch": 0.9545040946314831, "grad_norm": 0.5866883608136003, "learning_rate": 4.56411969102616e-06, "loss": 0.0542, "step": 2098 }, { "epoch": 0.9549590536851683, "grad_norm": 0.7058269296544326, "learning_rate": 4.5637164124576695e-06, "loss": 0.0692, "step": 2099 }, { "epoch": 0.9554140127388535, "grad_norm": 0.589695685362554, "learning_rate": 4.563312965251594e-06, "loss": 0.0477, "step": 2100 }, { "epoch": 0.9558689717925387, "grad_norm": 0.767068461359846, "learning_rate": 4.562909349440899e-06, "loss": 0.069, "step": 2101 }, { "epoch": 0.9563239308462238, "grad_norm": 0.9615092558046602, "learning_rate": 4.5625055650585695e-06, "loss": 0.0587, "step": 2102 }, { "epoch": 0.956778889899909, "grad_norm": 0.7392367463114865, "learning_rate": 4.562101612137599e-06, "loss": 0.0637, "step": 2103 }, { "epoch": 0.9572338489535942, "grad_norm": 0.6001888352855437, "learning_rate": 4.561697490710998e-06, "loss": 0.0549, "step": 2104 }, { "epoch": 0.9576888080072793, "grad_norm": 0.7701090418633465, "learning_rate": 4.561293200811787e-06, "loss": 0.0652, "step": 2105 }, { "epoch": 0.9581437670609645, "grad_norm": 0.9986731635993634, "learning_rate": 4.560888742473005e-06, "loss": 0.0904, "step": 2106 }, { "epoch": 0.9585987261146497, "grad_norm": 0.9421619956607125, "learning_rate": 4.560484115727703e-06, "loss": 0.0822, "step": 2107 }, { "epoch": 0.9590536851683349, "grad_norm": 0.6209809874242187, "learning_rate": 4.560079320608942e-06, "loss": 0.0553, "step": 2108 }, { "epoch": 0.95950864422202, "grad_norm": 0.6196885855952039, "learning_rate": 4.5596743571498035e-06, "loss": 0.0592, "step": 2109 }, { "epoch": 0.9599636032757052, "grad_norm": 0.8124501053975468, "learning_rate": 4.5592692253833775e-06, "loss": 0.0583, "step": 2110 }, { "epoch": 0.9604185623293904, "grad_norm": 0.7003649102713996, "learning_rate": 4.5588639253427705e-06, "loss": 0.0523, "step": 2111 }, { "epoch": 0.9608735213830755, "grad_norm": 0.6112101133803953, "learning_rate": 4.558458457061101e-06, "loss": 0.0513, "step": 2112 }, { "epoch": 0.9613284804367607, "grad_norm": 0.7914680473124714, "learning_rate": 4.5580528205715024e-06, "loss": 0.0618, "step": 2113 }, { "epoch": 0.9617834394904459, "grad_norm": 0.9477327883059017, "learning_rate": 4.557647015907121e-06, "loss": 0.0786, "step": 2114 }, { "epoch": 0.9622383985441311, "grad_norm": 0.6623493696862014, "learning_rate": 4.557241043101118e-06, "loss": 0.058, "step": 2115 }, { "epoch": 0.9626933575978162, "grad_norm": 0.8423723830051886, "learning_rate": 4.556834902186667e-06, "loss": 0.0872, "step": 2116 }, { "epoch": 0.9631483166515014, "grad_norm": 0.6740938824585692, "learning_rate": 4.556428593196956e-06, "loss": 0.0546, "step": 2117 }, { "epoch": 0.9636032757051866, "grad_norm": 1.1955334494093306, "learning_rate": 4.556022116165189e-06, "loss": 0.1227, "step": 2118 }, { "epoch": 0.9640582347588716, "grad_norm": 0.9895083968984689, "learning_rate": 4.555615471124578e-06, "loss": 0.0802, "step": 2119 }, { "epoch": 0.9645131938125568, "grad_norm": 0.7647543998706896, "learning_rate": 4.555208658108354e-06, "loss": 0.0514, "step": 2120 }, { "epoch": 0.964968152866242, "grad_norm": 0.7129470981994964, "learning_rate": 4.55480167714976e-06, "loss": 0.0548, "step": 2121 }, { "epoch": 0.9654231119199272, "grad_norm": 0.7053376734462564, "learning_rate": 4.554394528282052e-06, "loss": 0.0761, "step": 2122 }, { "epoch": 0.9658780709736123, "grad_norm": 0.6283973436454654, "learning_rate": 4.553987211538501e-06, "loss": 0.0502, "step": 2123 }, { "epoch": 0.9663330300272975, "grad_norm": 0.9170409664655178, "learning_rate": 4.5535797269523906e-06, "loss": 0.0784, "step": 2124 }, { "epoch": 0.9667879890809827, "grad_norm": 0.8287926240908017, "learning_rate": 4.55317207455702e-06, "loss": 0.061, "step": 2125 }, { "epoch": 0.9672429481346679, "grad_norm": 0.6405681723490552, "learning_rate": 4.552764254385697e-06, "loss": 0.0662, "step": 2126 }, { "epoch": 0.967697907188353, "grad_norm": 0.8088482444470904, "learning_rate": 4.552356266471751e-06, "loss": 0.0582, "step": 2127 }, { "epoch": 0.9681528662420382, "grad_norm": 0.7353313299484825, "learning_rate": 4.55194811084852e-06, "loss": 0.0654, "step": 2128 }, { "epoch": 0.9686078252957234, "grad_norm": 1.1516254024678405, "learning_rate": 4.551539787549354e-06, "loss": 0.0936, "step": 2129 }, { "epoch": 0.9690627843494085, "grad_norm": 0.6501936911095981, "learning_rate": 4.551131296607623e-06, "loss": 0.0417, "step": 2130 }, { "epoch": 0.9695177434030937, "grad_norm": 0.7334920432846229, "learning_rate": 4.550722638056703e-06, "loss": 0.0526, "step": 2131 }, { "epoch": 0.9699727024567789, "grad_norm": 0.7198129404059547, "learning_rate": 4.550313811929993e-06, "loss": 0.0545, "step": 2132 }, { "epoch": 0.9704276615104641, "grad_norm": 0.7616235023926703, "learning_rate": 4.549904818260895e-06, "loss": 0.0639, "step": 2133 }, { "epoch": 0.9708826205641492, "grad_norm": 0.9116016503086201, "learning_rate": 4.549495657082834e-06, "loss": 0.0889, "step": 2134 }, { "epoch": 0.9713375796178344, "grad_norm": 0.6685343538666026, "learning_rate": 4.549086328429242e-06, "loss": 0.0483, "step": 2135 }, { "epoch": 0.9717925386715196, "grad_norm": 0.9095371650600752, "learning_rate": 4.548676832333569e-06, "loss": 0.0706, "step": 2136 }, { "epoch": 0.9722474977252047, "grad_norm": 0.7169076791465431, "learning_rate": 4.548267168829279e-06, "loss": 0.0525, "step": 2137 }, { "epoch": 0.9727024567788899, "grad_norm": 0.7564722267203925, "learning_rate": 4.547857337949844e-06, "loss": 0.0598, "step": 2138 }, { "epoch": 0.9731574158325751, "grad_norm": 0.6757823684126535, "learning_rate": 4.5474473397287556e-06, "loss": 0.0498, "step": 2139 }, { "epoch": 0.9736123748862603, "grad_norm": 0.6156762948567482, "learning_rate": 4.547037174199517e-06, "loss": 0.0534, "step": 2140 }, { "epoch": 0.9740673339399454, "grad_norm": 1.039052452467343, "learning_rate": 4.546626841395645e-06, "loss": 0.084, "step": 2141 }, { "epoch": 0.9745222929936306, "grad_norm": 0.9845877678571373, "learning_rate": 4.54621634135067e-06, "loss": 0.0628, "step": 2142 }, { "epoch": 0.9749772520473158, "grad_norm": 0.9305858125408707, "learning_rate": 4.545805674098136e-06, "loss": 0.0759, "step": 2143 }, { "epoch": 0.9754322111010009, "grad_norm": 0.8876941693028888, "learning_rate": 4.545394839671601e-06, "loss": 0.0741, "step": 2144 }, { "epoch": 0.9758871701546861, "grad_norm": 0.8796654962506077, "learning_rate": 4.544983838104637e-06, "loss": 0.0776, "step": 2145 }, { "epoch": 0.9763421292083713, "grad_norm": 0.7920165529963009, "learning_rate": 4.544572669430828e-06, "loss": 0.0786, "step": 2146 }, { "epoch": 0.9767970882620565, "grad_norm": 1.1006660615484487, "learning_rate": 4.544161333683775e-06, "loss": 0.1067, "step": 2147 }, { "epoch": 0.9772520473157416, "grad_norm": 0.7917410075321241, "learning_rate": 4.543749830897088e-06, "loss": 0.08, "step": 2148 }, { "epoch": 0.9777070063694268, "grad_norm": 0.9376537537983477, "learning_rate": 4.543338161104395e-06, "loss": 0.0809, "step": 2149 }, { "epoch": 0.978161965423112, "grad_norm": 0.6909374452010633, "learning_rate": 4.542926324339335e-06, "loss": 0.0599, "step": 2150 }, { "epoch": 0.978616924476797, "grad_norm": 0.9041638003887569, "learning_rate": 4.542514320635561e-06, "loss": 0.0897, "step": 2151 }, { "epoch": 0.9790718835304822, "grad_norm": 1.0016444468044507, "learning_rate": 4.542102150026741e-06, "loss": 0.0899, "step": 2152 }, { "epoch": 0.9795268425841674, "grad_norm": 0.8098413909616191, "learning_rate": 4.541689812546556e-06, "loss": 0.0824, "step": 2153 }, { "epoch": 0.9799818016378526, "grad_norm": 0.8781062759872817, "learning_rate": 4.541277308228698e-06, "loss": 0.0835, "step": 2154 }, { "epoch": 0.9804367606915377, "grad_norm": 0.6567263955436328, "learning_rate": 4.540864637106879e-06, "loss": 0.0561, "step": 2155 }, { "epoch": 0.9808917197452229, "grad_norm": 0.8959082251283595, "learning_rate": 4.540451799214817e-06, "loss": 0.0526, "step": 2156 }, { "epoch": 0.9813466787989081, "grad_norm": 0.6968673717470518, "learning_rate": 4.540038794586248e-06, "loss": 0.0559, "step": 2157 }, { "epoch": 0.9818016378525932, "grad_norm": 0.6576045950224041, "learning_rate": 4.539625623254923e-06, "loss": 0.048, "step": 2158 }, { "epoch": 0.9822565969062784, "grad_norm": 1.1320950822163716, "learning_rate": 4.539212285254601e-06, "loss": 0.1112, "step": 2159 }, { "epoch": 0.9827115559599636, "grad_norm": 0.7758556663067462, "learning_rate": 4.5387987806190615e-06, "loss": 0.0565, "step": 2160 }, { "epoch": 0.9831665150136488, "grad_norm": 0.8190828028341962, "learning_rate": 4.538385109382093e-06, "loss": 0.0757, "step": 2161 }, { "epoch": 0.9836214740673339, "grad_norm": 0.7293574919811723, "learning_rate": 4.537971271577498e-06, "loss": 0.0739, "step": 2162 }, { "epoch": 0.9840764331210191, "grad_norm": 0.5802134505622355, "learning_rate": 4.537557267239093e-06, "loss": 0.0544, "step": 2163 }, { "epoch": 0.9845313921747043, "grad_norm": 0.5765626125784856, "learning_rate": 4.537143096400712e-06, "loss": 0.0465, "step": 2164 }, { "epoch": 0.9849863512283894, "grad_norm": 0.97068469033027, "learning_rate": 4.536728759096195e-06, "loss": 0.075, "step": 2165 }, { "epoch": 0.9854413102820746, "grad_norm": 0.7198973054526564, "learning_rate": 4.536314255359402e-06, "loss": 0.0574, "step": 2166 }, { "epoch": 0.9858962693357598, "grad_norm": 0.8011541580376177, "learning_rate": 4.535899585224204e-06, "loss": 0.0652, "step": 2167 }, { "epoch": 0.986351228389445, "grad_norm": 0.5636355433670379, "learning_rate": 4.535484748724486e-06, "loss": 0.0462, "step": 2168 }, { "epoch": 0.9868061874431301, "grad_norm": 0.5710469301327791, "learning_rate": 4.535069745894147e-06, "loss": 0.057, "step": 2169 }, { "epoch": 0.9872611464968153, "grad_norm": 0.8053320529662169, "learning_rate": 4.534654576767098e-06, "loss": 0.0729, "step": 2170 }, { "epoch": 0.9877161055505005, "grad_norm": 0.9884868888578544, "learning_rate": 4.534239241377266e-06, "loss": 0.1066, "step": 2171 }, { "epoch": 0.9881710646041856, "grad_norm": 0.8252157658838575, "learning_rate": 4.5338237397585895e-06, "loss": 0.0975, "step": 2172 }, { "epoch": 0.9886260236578708, "grad_norm": 0.929818227033473, "learning_rate": 4.533408071945021e-06, "loss": 0.0481, "step": 2173 }, { "epoch": 0.989080982711556, "grad_norm": 0.7508255872462847, "learning_rate": 4.532992237970528e-06, "loss": 0.0626, "step": 2174 }, { "epoch": 0.9895359417652412, "grad_norm": 0.5839159131330265, "learning_rate": 4.532576237869091e-06, "loss": 0.0394, "step": 2175 }, { "epoch": 0.9899909008189263, "grad_norm": 0.5644496220261198, "learning_rate": 4.5321600716747025e-06, "loss": 0.0435, "step": 2176 }, { "epoch": 0.9904458598726115, "grad_norm": 0.7793906682399298, "learning_rate": 4.531743739421369e-06, "loss": 0.0609, "step": 2177 }, { "epoch": 0.9909008189262967, "grad_norm": 0.5892429799760452, "learning_rate": 4.531327241143114e-06, "loss": 0.0419, "step": 2178 }, { "epoch": 0.9913557779799818, "grad_norm": 0.8986290228451513, "learning_rate": 4.530910576873969e-06, "loss": 0.0614, "step": 2179 }, { "epoch": 0.991810737033667, "grad_norm": 0.6316271594357544, "learning_rate": 4.530493746647984e-06, "loss": 0.049, "step": 2180 }, { "epoch": 0.9922656960873522, "grad_norm": 0.862209961975702, "learning_rate": 4.530076750499219e-06, "loss": 0.0797, "step": 2181 }, { "epoch": 0.9927206551410374, "grad_norm": 0.8913393772459259, "learning_rate": 4.52965958846175e-06, "loss": 0.0754, "step": 2182 }, { "epoch": 0.9931756141947224, "grad_norm": 0.6021201567582666, "learning_rate": 4.529242260569665e-06, "loss": 0.0433, "step": 2183 }, { "epoch": 0.9936305732484076, "grad_norm": 1.1143487968824526, "learning_rate": 4.528824766857067e-06, "loss": 0.0933, "step": 2184 }, { "epoch": 0.9940855323020928, "grad_norm": 0.8397193906783932, "learning_rate": 4.5284071073580715e-06, "loss": 0.071, "step": 2185 }, { "epoch": 0.9945404913557779, "grad_norm": 0.7046300180509759, "learning_rate": 4.527989282106807e-06, "loss": 0.0553, "step": 2186 }, { "epoch": 0.9949954504094631, "grad_norm": 1.2273654625571622, "learning_rate": 4.527571291137416e-06, "loss": 0.0826, "step": 2187 }, { "epoch": 0.9954504094631483, "grad_norm": 0.8674968100145852, "learning_rate": 4.527153134484056e-06, "loss": 0.0793, "step": 2188 }, { "epoch": 0.9959053685168335, "grad_norm": 0.7590194710414102, "learning_rate": 4.5267348121808965e-06, "loss": 0.0627, "step": 2189 }, { "epoch": 0.9963603275705186, "grad_norm": 0.6472950730058415, "learning_rate": 4.526316324262121e-06, "loss": 0.072, "step": 2190 }, { "epoch": 0.9968152866242038, "grad_norm": 0.7337103448736059, "learning_rate": 4.525897670761926e-06, "loss": 0.0535, "step": 2191 }, { "epoch": 0.997270245677889, "grad_norm": 0.8681401412263181, "learning_rate": 4.525478851714522e-06, "loss": 0.0715, "step": 2192 }, { "epoch": 0.9977252047315741, "grad_norm": 0.8136877497937504, "learning_rate": 4.525059867154133e-06, "loss": 0.069, "step": 2193 }, { "epoch": 0.9981801637852593, "grad_norm": 0.7165865552910584, "learning_rate": 4.5246407171149975e-06, "loss": 0.0701, "step": 2194 }, { "epoch": 0.9986351228389445, "grad_norm": 0.7119563605507941, "learning_rate": 4.5242214016313655e-06, "loss": 0.0605, "step": 2195 }, { "epoch": 0.9990900818926297, "grad_norm": 0.5906420983866724, "learning_rate": 4.523801920737501e-06, "loss": 0.0537, "step": 2196 }, { "epoch": 0.9995450409463148, "grad_norm": 0.6793422286647208, "learning_rate": 4.523382274467684e-06, "loss": 0.0601, "step": 2197 }, { "epoch": 1.0, "grad_norm": 0.3861471518492922, "learning_rate": 4.522962462856206e-06, "loss": 0.0229, "step": 2198 }, { "epoch": 1.000454959053685, "grad_norm": 0.4134000717198071, "learning_rate": 4.522542485937369e-06, "loss": 0.0162, "step": 2199 }, { "epoch": 1.0009099181073704, "grad_norm": 0.47305029810650895, "learning_rate": 4.522122343745495e-06, "loss": 0.0251, "step": 2200 }, { "epoch": 1.0013648771610555, "grad_norm": 0.5407710422461701, "learning_rate": 4.521702036314915e-06, "loss": 0.0305, "step": 2201 }, { "epoch": 1.0018198362147406, "grad_norm": 0.3163516556985795, "learning_rate": 4.521281563679973e-06, "loss": 0.0169, "step": 2202 }, { "epoch": 1.0022747952684259, "grad_norm": 0.46270617626451155, "learning_rate": 4.5208609258750314e-06, "loss": 0.0208, "step": 2203 }, { "epoch": 1.002729754322111, "grad_norm": 0.37292336110835755, "learning_rate": 4.52044012293446e-06, "loss": 0.0261, "step": 2204 }, { "epoch": 1.0031847133757963, "grad_norm": 0.4740242480862373, "learning_rate": 4.520019154892646e-06, "loss": 0.0309, "step": 2205 }, { "epoch": 1.0036396724294814, "grad_norm": 0.9587535207840793, "learning_rate": 4.519598021783989e-06, "loss": 0.0473, "step": 2206 }, { "epoch": 1.0040946314831665, "grad_norm": 0.5020844798119457, "learning_rate": 4.519176723642903e-06, "loss": 0.0379, "step": 2207 }, { "epoch": 1.0045495905368518, "grad_norm": 0.5844057078703908, "learning_rate": 4.518755260503813e-06, "loss": 0.0264, "step": 2208 }, { "epoch": 1.0050045495905369, "grad_norm": 0.3907648371732826, "learning_rate": 4.51833363240116e-06, "loss": 0.0246, "step": 2209 }, { "epoch": 1.005459508644222, "grad_norm": 0.3437099134045075, "learning_rate": 4.517911839369398e-06, "loss": 0.0215, "step": 2210 }, { "epoch": 1.0059144676979073, "grad_norm": 0.48723727398361594, "learning_rate": 4.517489881442993e-06, "loss": 0.0221, "step": 2211 }, { "epoch": 1.0063694267515924, "grad_norm": 0.36188847497976756, "learning_rate": 4.517067758656424e-06, "loss": 0.0152, "step": 2212 }, { "epoch": 1.0068243858052774, "grad_norm": 0.32768362464406736, "learning_rate": 4.516645471044188e-06, "loss": 0.0137, "step": 2213 }, { "epoch": 1.0072793448589628, "grad_norm": 0.35318195696428845, "learning_rate": 4.516223018640791e-06, "loss": 0.0175, "step": 2214 }, { "epoch": 1.0077343039126478, "grad_norm": 0.36650556359181136, "learning_rate": 4.515800401480754e-06, "loss": 0.0225, "step": 2215 }, { "epoch": 1.008189262966333, "grad_norm": 0.42431668473537093, "learning_rate": 4.515377619598612e-06, "loss": 0.0251, "step": 2216 }, { "epoch": 1.0086442220200182, "grad_norm": 0.42493861735993127, "learning_rate": 4.514954673028913e-06, "loss": 0.022, "step": 2217 }, { "epoch": 1.0090991810737033, "grad_norm": 0.31532925739877793, "learning_rate": 4.5145315618062155e-06, "loss": 0.0167, "step": 2218 }, { "epoch": 1.0095541401273886, "grad_norm": 0.6455510153155769, "learning_rate": 4.514108285965098e-06, "loss": 0.0279, "step": 2219 }, { "epoch": 1.0100090991810737, "grad_norm": 0.4942618390077279, "learning_rate": 4.513684845540146e-06, "loss": 0.0281, "step": 2220 }, { "epoch": 1.0104640582347588, "grad_norm": 0.5202505979006149, "learning_rate": 4.5132612405659625e-06, "loss": 0.0352, "step": 2221 }, { "epoch": 1.0109190172884441, "grad_norm": 0.29852324614455794, "learning_rate": 4.5128374710771625e-06, "loss": 0.0125, "step": 2222 }, { "epoch": 1.0113739763421292, "grad_norm": 0.6497393685341776, "learning_rate": 4.512413537108374e-06, "loss": 0.0418, "step": 2223 }, { "epoch": 1.0118289353958143, "grad_norm": 0.5879513139032745, "learning_rate": 4.511989438694239e-06, "loss": 0.024, "step": 2224 }, { "epoch": 1.0122838944494996, "grad_norm": 0.4735839819820227, "learning_rate": 4.511565175869415e-06, "loss": 0.0136, "step": 2225 }, { "epoch": 1.0127388535031847, "grad_norm": 0.4033061216793247, "learning_rate": 4.511140748668566e-06, "loss": 0.0205, "step": 2226 }, { "epoch": 1.0131938125568698, "grad_norm": 0.4374666494282639, "learning_rate": 4.510716157126379e-06, "loss": 0.0219, "step": 2227 }, { "epoch": 1.013648771610555, "grad_norm": 0.46869316743808964, "learning_rate": 4.510291401277548e-06, "loss": 0.0255, "step": 2228 }, { "epoch": 1.0141037306642402, "grad_norm": 0.5854010070199603, "learning_rate": 4.509866481156781e-06, "loss": 0.0241, "step": 2229 }, { "epoch": 1.0145586897179253, "grad_norm": 0.39945011653857926, "learning_rate": 4.509441396798802e-06, "loss": 0.0193, "step": 2230 }, { "epoch": 1.0150136487716106, "grad_norm": 0.49690934996195507, "learning_rate": 4.5090161482383475e-06, "loss": 0.0175, "step": 2231 }, { "epoch": 1.0154686078252957, "grad_norm": 0.4748457609386695, "learning_rate": 4.508590735510166e-06, "loss": 0.0227, "step": 2232 }, { "epoch": 1.015923566878981, "grad_norm": 0.49987588415101697, "learning_rate": 4.508165158649019e-06, "loss": 0.0245, "step": 2233 }, { "epoch": 1.016378525932666, "grad_norm": 0.3789941766757115, "learning_rate": 4.507739417689685e-06, "loss": 0.0139, "step": 2234 }, { "epoch": 1.0168334849863512, "grad_norm": 0.5234968222321746, "learning_rate": 4.507313512666953e-06, "loss": 0.0177, "step": 2235 }, { "epoch": 1.0172884440400365, "grad_norm": 0.39843093182948436, "learning_rate": 4.506887443615625e-06, "loss": 0.0153, "step": 2236 }, { "epoch": 1.0177434030937216, "grad_norm": 0.5143152978261103, "learning_rate": 4.506461210570518e-06, "loss": 0.0194, "step": 2237 }, { "epoch": 1.0181983621474067, "grad_norm": 0.6529575725615211, "learning_rate": 4.506034813566462e-06, "loss": 0.0266, "step": 2238 }, { "epoch": 1.018653321201092, "grad_norm": 0.48382413646434125, "learning_rate": 4.505608252638301e-06, "loss": 0.0236, "step": 2239 }, { "epoch": 1.019108280254777, "grad_norm": 0.6312423204307113, "learning_rate": 4.50518152782089e-06, "loss": 0.0249, "step": 2240 }, { "epoch": 1.0195632393084622, "grad_norm": 0.47154152319198106, "learning_rate": 4.504754639149101e-06, "loss": 0.0176, "step": 2241 }, { "epoch": 1.0200181983621475, "grad_norm": 0.3828630177013417, "learning_rate": 4.504327586657814e-06, "loss": 0.0147, "step": 2242 }, { "epoch": 1.0204731574158326, "grad_norm": 0.6397319142718383, "learning_rate": 4.50390037038193e-06, "loss": 0.0251, "step": 2243 }, { "epoch": 1.0209281164695176, "grad_norm": 0.5582288758170052, "learning_rate": 4.503472990356357e-06, "loss": 0.0291, "step": 2244 }, { "epoch": 1.021383075523203, "grad_norm": 0.3375136771592828, "learning_rate": 4.503045446616018e-06, "loss": 0.0129, "step": 2245 }, { "epoch": 1.021838034576888, "grad_norm": 0.4662408356158423, "learning_rate": 4.502617739195852e-06, "loss": 0.0224, "step": 2246 }, { "epoch": 1.0222929936305734, "grad_norm": 0.5410561625011929, "learning_rate": 4.502189868130807e-06, "loss": 0.0172, "step": 2247 }, { "epoch": 1.0227479526842584, "grad_norm": 0.654839238459795, "learning_rate": 4.501761833455849e-06, "loss": 0.0281, "step": 2248 }, { "epoch": 1.0232029117379435, "grad_norm": 0.6119106378791578, "learning_rate": 4.501333635205952e-06, "loss": 0.0172, "step": 2249 }, { "epoch": 1.0236578707916288, "grad_norm": 0.6002704645250265, "learning_rate": 4.5009052734161095e-06, "loss": 0.0278, "step": 2250 }, { "epoch": 1.024112829845314, "grad_norm": 0.562189478960442, "learning_rate": 4.500476748121324e-06, "loss": 0.0289, "step": 2251 }, { "epoch": 1.024567788898999, "grad_norm": 0.5573521615480775, "learning_rate": 4.500048059356613e-06, "loss": 0.0264, "step": 2252 }, { "epoch": 1.0250227479526843, "grad_norm": 0.35812836056261116, "learning_rate": 4.499619207157007e-06, "loss": 0.0142, "step": 2253 }, { "epoch": 1.0254777070063694, "grad_norm": 0.4466546832771037, "learning_rate": 4.499190191557549e-06, "loss": 0.0199, "step": 2254 }, { "epoch": 1.0259326660600545, "grad_norm": 0.7194400511344139, "learning_rate": 4.498761012593296e-06, "loss": 0.0329, "step": 2255 }, { "epoch": 1.0263876251137398, "grad_norm": 0.5968641194765588, "learning_rate": 4.498331670299321e-06, "loss": 0.0259, "step": 2256 }, { "epoch": 1.026842584167425, "grad_norm": 0.6396159557727437, "learning_rate": 4.497902164710704e-06, "loss": 0.0165, "step": 2257 }, { "epoch": 1.02729754322111, "grad_norm": 0.9501081798643956, "learning_rate": 4.497472495862547e-06, "loss": 0.0432, "step": 2258 }, { "epoch": 1.0277525022747953, "grad_norm": 0.40770955227461003, "learning_rate": 4.497042663789957e-06, "loss": 0.0153, "step": 2259 }, { "epoch": 1.0282074613284804, "grad_norm": 0.6626552748025197, "learning_rate": 4.496612668528059e-06, "loss": 0.0271, "step": 2260 }, { "epoch": 1.0286624203821657, "grad_norm": 0.5720200272418298, "learning_rate": 4.496182510111991e-06, "loss": 0.0331, "step": 2261 }, { "epoch": 1.0291173794358508, "grad_norm": 0.6122100316161389, "learning_rate": 4.495752188576902e-06, "loss": 0.0279, "step": 2262 }, { "epoch": 1.0295723384895359, "grad_norm": 0.5909094519020904, "learning_rate": 4.4953217039579574e-06, "loss": 0.0214, "step": 2263 }, { "epoch": 1.0300272975432212, "grad_norm": 0.8262480646318613, "learning_rate": 4.494891056290335e-06, "loss": 0.0359, "step": 2264 }, { "epoch": 1.0304822565969063, "grad_norm": 0.4058443771592801, "learning_rate": 4.494460245609223e-06, "loss": 0.0151, "step": 2265 }, { "epoch": 1.0309372156505914, "grad_norm": 0.8211955800040838, "learning_rate": 4.494029271949827e-06, "loss": 0.0286, "step": 2266 }, { "epoch": 1.0313921747042767, "grad_norm": 0.4785178376545247, "learning_rate": 4.493598135347363e-06, "loss": 0.0201, "step": 2267 }, { "epoch": 1.0318471337579618, "grad_norm": 1.0647622975253437, "learning_rate": 4.493166835837064e-06, "loss": 0.0296, "step": 2268 }, { "epoch": 1.0323020928116469, "grad_norm": 0.7293233621494204, "learning_rate": 4.492735373454171e-06, "loss": 0.0301, "step": 2269 }, { "epoch": 1.0327570518653322, "grad_norm": 0.4229410037536498, "learning_rate": 4.492303748233943e-06, "loss": 0.0153, "step": 2270 }, { "epoch": 1.0332120109190173, "grad_norm": 0.6975035519080148, "learning_rate": 4.49187196021165e-06, "loss": 0.0448, "step": 2271 }, { "epoch": 1.0336669699727024, "grad_norm": 0.6125495922729677, "learning_rate": 4.491440009422575e-06, "loss": 0.0243, "step": 2272 }, { "epoch": 1.0341219290263877, "grad_norm": 0.408105520826875, "learning_rate": 4.491007895902016e-06, "loss": 0.0132, "step": 2273 }, { "epoch": 1.0345768880800728, "grad_norm": 0.48877548043609653, "learning_rate": 4.490575619685283e-06, "loss": 0.0205, "step": 2274 }, { "epoch": 1.035031847133758, "grad_norm": 0.812892516681499, "learning_rate": 4.4901431808077e-06, "loss": 0.0343, "step": 2275 }, { "epoch": 1.0354868061874432, "grad_norm": 0.8938319297474577, "learning_rate": 4.489710579304603e-06, "loss": 0.0227, "step": 2276 }, { "epoch": 1.0359417652411282, "grad_norm": 0.6676116587013313, "learning_rate": 4.489277815211343e-06, "loss": 0.0223, "step": 2277 }, { "epoch": 1.0363967242948136, "grad_norm": 0.6495562102011189, "learning_rate": 4.488844888563284e-06, "loss": 0.0282, "step": 2278 }, { "epoch": 1.0368516833484986, "grad_norm": 0.7504240515946974, "learning_rate": 4.488411799395802e-06, "loss": 0.0192, "step": 2279 }, { "epoch": 1.0373066424021837, "grad_norm": 0.505052238930926, "learning_rate": 4.487978547744287e-06, "loss": 0.024, "step": 2280 }, { "epoch": 1.037761601455869, "grad_norm": 0.5936270855305887, "learning_rate": 4.487545133644143e-06, "loss": 0.024, "step": 2281 }, { "epoch": 1.0382165605095541, "grad_norm": 0.5321574319749766, "learning_rate": 4.487111557130787e-06, "loss": 0.026, "step": 2282 }, { "epoch": 1.0386715195632392, "grad_norm": 0.7478753443981572, "learning_rate": 4.486677818239647e-06, "loss": 0.0374, "step": 2283 }, { "epoch": 1.0391264786169245, "grad_norm": 0.6243001056584193, "learning_rate": 4.486243917006169e-06, "loss": 0.0229, "step": 2284 }, { "epoch": 1.0395814376706096, "grad_norm": 0.4929168743628238, "learning_rate": 4.485809853465807e-06, "loss": 0.018, "step": 2285 }, { "epoch": 1.0400363967242947, "grad_norm": 0.7513792925376249, "learning_rate": 4.4853756276540315e-06, "loss": 0.0243, "step": 2286 }, { "epoch": 1.04049135577798, "grad_norm": 0.39797569187526005, "learning_rate": 4.484941239606326e-06, "loss": 0.01, "step": 2287 }, { "epoch": 1.040946314831665, "grad_norm": 0.39764537771237807, "learning_rate": 4.484506689358186e-06, "loss": 0.0113, "step": 2288 }, { "epoch": 1.0414012738853504, "grad_norm": 0.4552203666266973, "learning_rate": 4.484071976945121e-06, "loss": 0.0129, "step": 2289 }, { "epoch": 1.0418562329390355, "grad_norm": 0.6376672211925216, "learning_rate": 4.483637102402655e-06, "loss": 0.0295, "step": 2290 }, { "epoch": 1.0423111919927206, "grad_norm": 0.6877079936099527, "learning_rate": 4.4832020657663224e-06, "loss": 0.0231, "step": 2291 }, { "epoch": 1.042766151046406, "grad_norm": 0.4953931162830707, "learning_rate": 4.482766867071673e-06, "loss": 0.0159, "step": 2292 }, { "epoch": 1.043221110100091, "grad_norm": 0.697817840638706, "learning_rate": 4.482331506354269e-06, "loss": 0.046, "step": 2293 }, { "epoch": 1.043676069153776, "grad_norm": 0.6432246627987923, "learning_rate": 4.4818959836496876e-06, "loss": 0.0132, "step": 2294 }, { "epoch": 1.0441310282074614, "grad_norm": 0.5013738883280774, "learning_rate": 4.481460298993515e-06, "loss": 0.0195, "step": 2295 }, { "epoch": 1.0445859872611465, "grad_norm": 0.6732371138033433, "learning_rate": 4.481024452421357e-06, "loss": 0.0257, "step": 2296 }, { "epoch": 1.0450409463148316, "grad_norm": 0.5971847786556302, "learning_rate": 4.480588443968825e-06, "loss": 0.0164, "step": 2297 }, { "epoch": 1.0454959053685169, "grad_norm": 0.5912246018330237, "learning_rate": 4.4801522736715505e-06, "loss": 0.0198, "step": 2298 }, { "epoch": 1.045950864422202, "grad_norm": 0.48571742963419795, "learning_rate": 4.479715941565174e-06, "loss": 0.0185, "step": 2299 }, { "epoch": 1.046405823475887, "grad_norm": 0.41924338496925545, "learning_rate": 4.4792794476853514e-06, "loss": 0.0147, "step": 2300 }, { "epoch": 1.0468607825295724, "grad_norm": 0.5297657413035841, "learning_rate": 4.47884279206775e-06, "loss": 0.0215, "step": 2301 }, { "epoch": 1.0473157415832575, "grad_norm": 0.4231923684228783, "learning_rate": 4.478405974748054e-06, "loss": 0.0167, "step": 2302 }, { "epoch": 1.0477707006369428, "grad_norm": 0.5639392018822912, "learning_rate": 4.477968995761954e-06, "loss": 0.0301, "step": 2303 }, { "epoch": 1.0482256596906279, "grad_norm": 0.6361502705633699, "learning_rate": 4.477531855145161e-06, "loss": 0.0235, "step": 2304 }, { "epoch": 1.048680618744313, "grad_norm": 0.5802868456297169, "learning_rate": 4.477094552933395e-06, "loss": 0.0153, "step": 2305 }, { "epoch": 1.0491355777979983, "grad_norm": 0.412107801815597, "learning_rate": 4.476657089162391e-06, "loss": 0.0217, "step": 2306 }, { "epoch": 1.0495905368516834, "grad_norm": 0.7669706620629979, "learning_rate": 4.476219463867897e-06, "loss": 0.0309, "step": 2307 }, { "epoch": 1.0500454959053684, "grad_norm": 0.6816466836381806, "learning_rate": 4.475781677085671e-06, "loss": 0.017, "step": 2308 }, { "epoch": 1.0505004549590538, "grad_norm": 0.4841095295414997, "learning_rate": 4.4753437288514904e-06, "loss": 0.0167, "step": 2309 }, { "epoch": 1.0509554140127388, "grad_norm": 0.5612261779618657, "learning_rate": 4.47490561920114e-06, "loss": 0.0153, "step": 2310 }, { "epoch": 1.051410373066424, "grad_norm": 0.638519143758431, "learning_rate": 4.474467348170421e-06, "loss": 0.0298, "step": 2311 }, { "epoch": 1.0518653321201092, "grad_norm": 0.5669351202592476, "learning_rate": 4.474028915795148e-06, "loss": 0.0282, "step": 2312 }, { "epoch": 1.0523202911737943, "grad_norm": 0.4347136897443792, "learning_rate": 4.473590322111145e-06, "loss": 0.0157, "step": 2313 }, { "epoch": 1.0527752502274794, "grad_norm": 0.4833057498174656, "learning_rate": 4.473151567154255e-06, "loss": 0.02, "step": 2314 }, { "epoch": 1.0532302092811647, "grad_norm": 0.4914062822782954, "learning_rate": 4.472712650960328e-06, "loss": 0.0181, "step": 2315 }, { "epoch": 1.0536851683348498, "grad_norm": 0.615271249953854, "learning_rate": 4.472273573565234e-06, "loss": 0.0327, "step": 2316 }, { "epoch": 1.0541401273885351, "grad_norm": 0.692992244874008, "learning_rate": 4.471834335004849e-06, "loss": 0.0235, "step": 2317 }, { "epoch": 1.0545950864422202, "grad_norm": 0.5490826845180317, "learning_rate": 4.471394935315067e-06, "loss": 0.0208, "step": 2318 }, { "epoch": 1.0550500454959053, "grad_norm": 0.5091849562822288, "learning_rate": 4.470955374531794e-06, "loss": 0.0164, "step": 2319 }, { "epoch": 1.0555050045495906, "grad_norm": 0.6151353273515647, "learning_rate": 4.470515652690947e-06, "loss": 0.0265, "step": 2320 }, { "epoch": 1.0559599636032757, "grad_norm": 0.4729616741514311, "learning_rate": 4.470075769828461e-06, "loss": 0.0188, "step": 2321 }, { "epoch": 1.0564149226569608, "grad_norm": 0.6881944943779561, "learning_rate": 4.46963572598028e-06, "loss": 0.0198, "step": 2322 }, { "epoch": 1.056869881710646, "grad_norm": 0.49575367084132177, "learning_rate": 4.469195521182362e-06, "loss": 0.0217, "step": 2323 }, { "epoch": 1.0573248407643312, "grad_norm": 0.5505070213462345, "learning_rate": 4.468755155470679e-06, "loss": 0.0226, "step": 2324 }, { "epoch": 1.0577797998180163, "grad_norm": 0.5688205202999257, "learning_rate": 4.468314628881214e-06, "loss": 0.0208, "step": 2325 }, { "epoch": 1.0582347588717016, "grad_norm": 0.4538856296370273, "learning_rate": 4.467873941449969e-06, "loss": 0.0201, "step": 2326 }, { "epoch": 1.0586897179253867, "grad_norm": 0.4951949600945696, "learning_rate": 4.46743309321295e-06, "loss": 0.0192, "step": 2327 }, { "epoch": 1.0591446769790718, "grad_norm": 0.8177918826431166, "learning_rate": 4.466992084206185e-06, "loss": 0.0465, "step": 2328 }, { "epoch": 1.059599636032757, "grad_norm": 0.41902386946901277, "learning_rate": 4.466550914465709e-06, "loss": 0.0153, "step": 2329 }, { "epoch": 1.0600545950864422, "grad_norm": 0.6619680469471608, "learning_rate": 4.466109584027573e-06, "loss": 0.0269, "step": 2330 }, { "epoch": 1.0605095541401275, "grad_norm": 0.573290046990492, "learning_rate": 4.465668092927841e-06, "loss": 0.0226, "step": 2331 }, { "epoch": 1.0609645131938126, "grad_norm": 0.8738515426263221, "learning_rate": 4.465226441202589e-06, "loss": 0.0407, "step": 2332 }, { "epoch": 1.0614194722474977, "grad_norm": 0.5242511765844076, "learning_rate": 4.464784628887908e-06, "loss": 0.0202, "step": 2333 }, { "epoch": 1.061874431301183, "grad_norm": 0.4588386857289231, "learning_rate": 4.4643426560199e-06, "loss": 0.0104, "step": 2334 }, { "epoch": 1.062329390354868, "grad_norm": 0.43849362113269413, "learning_rate": 4.46390052263468e-06, "loss": 0.0152, "step": 2335 }, { "epoch": 1.0627843494085532, "grad_norm": 0.6871715121390033, "learning_rate": 4.463458228768378e-06, "loss": 0.0222, "step": 2336 }, { "epoch": 1.0632393084622385, "grad_norm": 0.6312910573017381, "learning_rate": 4.463015774457137e-06, "loss": 0.0292, "step": 2337 }, { "epoch": 1.0636942675159236, "grad_norm": 0.7620092964886106, "learning_rate": 4.462573159737113e-06, "loss": 0.0391, "step": 2338 }, { "epoch": 1.0641492265696086, "grad_norm": 0.585490205624937, "learning_rate": 4.462130384644472e-06, "loss": 0.0236, "step": 2339 }, { "epoch": 1.064604185623294, "grad_norm": 0.7034769748830236, "learning_rate": 4.461687449215397e-06, "loss": 0.0252, "step": 2340 }, { "epoch": 1.065059144676979, "grad_norm": 0.6110153772749273, "learning_rate": 4.4612443534860826e-06, "loss": 0.0248, "step": 2341 }, { "epoch": 1.0655141037306644, "grad_norm": 0.6858335786071746, "learning_rate": 4.460801097492737e-06, "loss": 0.0214, "step": 2342 }, { "epoch": 1.0659690627843494, "grad_norm": 0.5030440084983254, "learning_rate": 4.460357681271579e-06, "loss": 0.0179, "step": 2343 }, { "epoch": 1.0664240218380345, "grad_norm": 0.5301952004822383, "learning_rate": 4.4599141048588454e-06, "loss": 0.0273, "step": 2344 }, { "epoch": 1.0668789808917198, "grad_norm": 0.8918886613053432, "learning_rate": 4.4594703682907825e-06, "loss": 0.0348, "step": 2345 }, { "epoch": 1.067333939945405, "grad_norm": 0.5550020043836809, "learning_rate": 4.459026471603649e-06, "loss": 0.0264, "step": 2346 }, { "epoch": 1.06778889899909, "grad_norm": 0.5942449646440254, "learning_rate": 4.45858241483372e-06, "loss": 0.0311, "step": 2347 }, { "epoch": 1.0682438580527753, "grad_norm": 0.6882668929300695, "learning_rate": 4.458138198017281e-06, "loss": 0.0266, "step": 2348 }, { "epoch": 1.0686988171064604, "grad_norm": 0.6521955388813901, "learning_rate": 4.457693821190631e-06, "loss": 0.0324, "step": 2349 }, { "epoch": 1.0691537761601455, "grad_norm": 0.6489523342195044, "learning_rate": 4.4572492843900815e-06, "loss": 0.0253, "step": 2350 }, { "epoch": 1.0696087352138308, "grad_norm": 0.48279418830121457, "learning_rate": 4.456804587651961e-06, "loss": 0.0201, "step": 2351 }, { "epoch": 1.070063694267516, "grad_norm": 0.4410106378931689, "learning_rate": 4.456359731012606e-06, "loss": 0.0141, "step": 2352 }, { "epoch": 1.070518653321201, "grad_norm": 0.3752479866878442, "learning_rate": 4.455914714508369e-06, "loss": 0.0096, "step": 2353 }, { "epoch": 1.0709736123748863, "grad_norm": 0.35564777078621335, "learning_rate": 4.455469538175614e-06, "loss": 0.014, "step": 2354 }, { "epoch": 1.0714285714285714, "grad_norm": 0.5792769224363791, "learning_rate": 4.455024202050719e-06, "loss": 0.0245, "step": 2355 }, { "epoch": 1.0718835304822565, "grad_norm": 0.4418912340426733, "learning_rate": 4.454578706170075e-06, "loss": 0.0137, "step": 2356 }, { "epoch": 1.0723384895359418, "grad_norm": 0.6613336575639737, "learning_rate": 4.454133050570087e-06, "loss": 0.0234, "step": 2357 }, { "epoch": 1.0727934485896269, "grad_norm": 0.501346950607051, "learning_rate": 4.453687235287169e-06, "loss": 0.0226, "step": 2358 }, { "epoch": 1.0732484076433122, "grad_norm": 0.3786010644362032, "learning_rate": 4.453241260357754e-06, "loss": 0.0134, "step": 2359 }, { "epoch": 1.0737033666969973, "grad_norm": 0.4833259530881372, "learning_rate": 4.452795125818283e-06, "loss": 0.0188, "step": 2360 }, { "epoch": 1.0741583257506824, "grad_norm": 0.7741750323072273, "learning_rate": 4.4523488317052146e-06, "loss": 0.0355, "step": 2361 }, { "epoch": 1.0746132848043677, "grad_norm": 0.44639871630368005, "learning_rate": 4.451902378055015e-06, "loss": 0.0155, "step": 2362 }, { "epoch": 1.0750682438580528, "grad_norm": 0.38023273288220855, "learning_rate": 4.451455764904169e-06, "loss": 0.0195, "step": 2363 }, { "epoch": 1.0755232029117379, "grad_norm": 0.7834571656848334, "learning_rate": 4.45100899228917e-06, "loss": 0.0361, "step": 2364 }, { "epoch": 1.0759781619654232, "grad_norm": 0.44860302565592436, "learning_rate": 4.4505620602465275e-06, "loss": 0.0169, "step": 2365 }, { "epoch": 1.0764331210191083, "grad_norm": 0.5963152489034919, "learning_rate": 4.450114968812761e-06, "loss": 0.0191, "step": 2366 }, { "epoch": 1.0768880800727934, "grad_norm": 0.6470284182313165, "learning_rate": 4.449667718024406e-06, "loss": 0.0309, "step": 2367 }, { "epoch": 1.0773430391264787, "grad_norm": 0.5740322881813956, "learning_rate": 4.449220307918011e-06, "loss": 0.0211, "step": 2368 }, { "epoch": 1.0777979981801638, "grad_norm": 0.38610056650249264, "learning_rate": 4.448772738530134e-06, "loss": 0.0124, "step": 2369 }, { "epoch": 1.078252957233849, "grad_norm": 0.5957149086382844, "learning_rate": 4.44832500989735e-06, "loss": 0.0246, "step": 2370 }, { "epoch": 1.0787079162875342, "grad_norm": 0.4562941100201254, "learning_rate": 4.447877122056243e-06, "loss": 0.0191, "step": 2371 }, { "epoch": 1.0791628753412192, "grad_norm": 0.6775847445675632, "learning_rate": 4.447429075043416e-06, "loss": 0.0156, "step": 2372 }, { "epoch": 1.0796178343949046, "grad_norm": 0.4767997886979347, "learning_rate": 4.4469808688954786e-06, "loss": 0.0142, "step": 2373 }, { "epoch": 1.0800727934485896, "grad_norm": 0.6417561969278244, "learning_rate": 4.446532503649058e-06, "loss": 0.0268, "step": 2374 }, { "epoch": 1.0805277525022747, "grad_norm": 0.5723910654907396, "learning_rate": 4.44608397934079e-06, "loss": 0.0243, "step": 2375 }, { "epoch": 1.08098271155596, "grad_norm": 0.7726426359472051, "learning_rate": 4.445635296007329e-06, "loss": 0.0305, "step": 2376 }, { "epoch": 1.0814376706096451, "grad_norm": 0.5972224710436718, "learning_rate": 4.445186453685339e-06, "loss": 0.0246, "step": 2377 }, { "epoch": 1.0818926296633302, "grad_norm": 0.5542672270842244, "learning_rate": 4.444737452411494e-06, "loss": 0.0169, "step": 2378 }, { "epoch": 1.0823475887170155, "grad_norm": 0.669917314313808, "learning_rate": 4.444288292222488e-06, "loss": 0.0231, "step": 2379 }, { "epoch": 1.0828025477707006, "grad_norm": 0.46636787428212545, "learning_rate": 4.443838973155023e-06, "loss": 0.0198, "step": 2380 }, { "epoch": 1.0832575068243857, "grad_norm": 0.7488407504054011, "learning_rate": 4.443389495245816e-06, "loss": 0.0391, "step": 2381 }, { "epoch": 1.083712465878071, "grad_norm": 0.426306751011867, "learning_rate": 4.442939858531594e-06, "loss": 0.0186, "step": 2382 }, { "epoch": 1.084167424931756, "grad_norm": 0.5269009234872088, "learning_rate": 4.442490063049103e-06, "loss": 0.0181, "step": 2383 }, { "epoch": 1.0846223839854412, "grad_norm": 0.5045472519746933, "learning_rate": 4.442040108835095e-06, "loss": 0.0145, "step": 2384 }, { "epoch": 1.0850773430391265, "grad_norm": 0.5276413295553061, "learning_rate": 4.44158999592634e-06, "loss": 0.0233, "step": 2385 }, { "epoch": 1.0855323020928116, "grad_norm": 0.7029231940364357, "learning_rate": 4.441139724359617e-06, "loss": 0.0143, "step": 2386 }, { "epoch": 1.085987261146497, "grad_norm": 0.4846625365807441, "learning_rate": 4.440689294171724e-06, "loss": 0.016, "step": 2387 }, { "epoch": 1.086442220200182, "grad_norm": 0.7320057104472577, "learning_rate": 4.440238705399465e-06, "loss": 0.0216, "step": 2388 }, { "epoch": 1.086897179253867, "grad_norm": 0.45375539724546904, "learning_rate": 4.439787958079662e-06, "loss": 0.0166, "step": 2389 }, { "epoch": 1.0873521383075524, "grad_norm": 0.549722137867742, "learning_rate": 4.439337052249146e-06, "loss": 0.0167, "step": 2390 }, { "epoch": 1.0878070973612375, "grad_norm": 0.593866408149248, "learning_rate": 4.4388859879447645e-06, "loss": 0.025, "step": 2391 }, { "epoch": 1.0882620564149226, "grad_norm": 1.560944402013678, "learning_rate": 4.438434765203376e-06, "loss": 0.0742, "step": 2392 }, { "epoch": 1.0887170154686079, "grad_norm": 0.5799492458949477, "learning_rate": 4.4379833840618524e-06, "loss": 0.0282, "step": 2393 }, { "epoch": 1.089171974522293, "grad_norm": 0.8256447524416959, "learning_rate": 4.4375318445570785e-06, "loss": 0.0256, "step": 2394 }, { "epoch": 1.089626933575978, "grad_norm": 0.6537098687733487, "learning_rate": 4.437080146725951e-06, "loss": 0.0225, "step": 2395 }, { "epoch": 1.0900818926296634, "grad_norm": 0.5301949864589981, "learning_rate": 4.436628290605384e-06, "loss": 0.0236, "step": 2396 }, { "epoch": 1.0905368516833485, "grad_norm": 0.5077507855342336, "learning_rate": 4.436176276232297e-06, "loss": 0.0198, "step": 2397 }, { "epoch": 1.0909918107370338, "grad_norm": 0.6697585395312962, "learning_rate": 4.4357241036436294e-06, "loss": 0.0218, "step": 2398 }, { "epoch": 1.0914467697907189, "grad_norm": 0.37753594175599786, "learning_rate": 4.435271772876329e-06, "loss": 0.0198, "step": 2399 }, { "epoch": 1.091901728844404, "grad_norm": 0.7251989967515775, "learning_rate": 4.434819283967359e-06, "loss": 0.0337, "step": 2400 }, { "epoch": 1.0923566878980893, "grad_norm": 0.5685580227633195, "learning_rate": 4.434366636953695e-06, "loss": 0.0134, "step": 2401 }, { "epoch": 1.0928116469517744, "grad_norm": 0.3783023948073095, "learning_rate": 4.433913831872324e-06, "loss": 0.0092, "step": 2402 }, { "epoch": 1.0932666060054594, "grad_norm": 0.6281936324347903, "learning_rate": 4.43346086876025e-06, "loss": 0.0328, "step": 2403 }, { "epoch": 1.0937215650591448, "grad_norm": 0.6536473305158147, "learning_rate": 4.433007747654484e-06, "loss": 0.0188, "step": 2404 }, { "epoch": 1.0941765241128298, "grad_norm": 0.526513701587517, "learning_rate": 4.432554468592054e-06, "loss": 0.0226, "step": 2405 }, { "epoch": 1.094631483166515, "grad_norm": 0.5881238864879137, "learning_rate": 4.432101031610001e-06, "loss": 0.0223, "step": 2406 }, { "epoch": 1.0950864422202002, "grad_norm": 0.4422986207763811, "learning_rate": 4.431647436745376e-06, "loss": 0.0109, "step": 2407 }, { "epoch": 1.0955414012738853, "grad_norm": 0.7893654050648916, "learning_rate": 4.431193684035246e-06, "loss": 0.0332, "step": 2408 }, { "epoch": 1.0959963603275704, "grad_norm": 0.5247569845377812, "learning_rate": 4.43073977351669e-06, "loss": 0.024, "step": 2409 }, { "epoch": 1.0964513193812557, "grad_norm": 0.3758246466492886, "learning_rate": 4.430285705226799e-06, "loss": 0.0083, "step": 2410 }, { "epoch": 1.0969062784349408, "grad_norm": 0.6145335090304439, "learning_rate": 4.429831479202676e-06, "loss": 0.0228, "step": 2411 }, { "epoch": 1.097361237488626, "grad_norm": 0.4286683636340505, "learning_rate": 4.429377095481441e-06, "loss": 0.0165, "step": 2412 }, { "epoch": 1.0978161965423112, "grad_norm": 0.5952460412943158, "learning_rate": 4.428922554100221e-06, "loss": 0.0309, "step": 2413 }, { "epoch": 1.0982711555959963, "grad_norm": 0.44582992706971714, "learning_rate": 4.428467855096163e-06, "loss": 0.0177, "step": 2414 }, { "epoch": 1.0987261146496816, "grad_norm": 0.7300092932288427, "learning_rate": 4.428012998506419e-06, "loss": 0.0237, "step": 2415 }, { "epoch": 1.0991810737033667, "grad_norm": 0.6969394499120272, "learning_rate": 4.42755798436816e-06, "loss": 0.0271, "step": 2416 }, { "epoch": 1.0996360327570518, "grad_norm": 0.4680236162394029, "learning_rate": 4.427102812718568e-06, "loss": 0.0151, "step": 2417 }, { "epoch": 1.100090991810737, "grad_norm": 0.46662414655854256, "learning_rate": 4.426647483594836e-06, "loss": 0.0171, "step": 2418 }, { "epoch": 1.1005459508644222, "grad_norm": 0.549235332005641, "learning_rate": 4.4261919970341724e-06, "loss": 0.0312, "step": 2419 }, { "epoch": 1.1010009099181073, "grad_norm": 0.40850248404362477, "learning_rate": 4.425736353073798e-06, "loss": 0.0169, "step": 2420 }, { "epoch": 1.1014558689717926, "grad_norm": 0.7833330930453737, "learning_rate": 4.425280551750945e-06, "loss": 0.0358, "step": 2421 }, { "epoch": 1.1019108280254777, "grad_norm": 0.3603121621215666, "learning_rate": 4.42482459310286e-06, "loss": 0.0156, "step": 2422 }, { "epoch": 1.1023657870791628, "grad_norm": 0.8023639858765762, "learning_rate": 4.424368477166801e-06, "loss": 0.0256, "step": 2423 }, { "epoch": 1.102820746132848, "grad_norm": 0.6123340064787052, "learning_rate": 4.423912203980041e-06, "loss": 0.0239, "step": 2424 }, { "epoch": 1.1032757051865332, "grad_norm": 0.7883676564832361, "learning_rate": 4.423455773579865e-06, "loss": 0.0243, "step": 2425 }, { "epoch": 1.1037306642402185, "grad_norm": 0.5606580783089008, "learning_rate": 4.422999186003568e-06, "loss": 0.0206, "step": 2426 }, { "epoch": 1.1041856232939036, "grad_norm": 0.3791744966325019, "learning_rate": 4.422542441288462e-06, "loss": 0.0141, "step": 2427 }, { "epoch": 1.1046405823475887, "grad_norm": 0.6286323469492989, "learning_rate": 4.42208553947187e-06, "loss": 0.0279, "step": 2428 }, { "epoch": 1.105095541401274, "grad_norm": 0.45776690578796103, "learning_rate": 4.4216284805911275e-06, "loss": 0.0123, "step": 2429 }, { "epoch": 1.105550500454959, "grad_norm": 0.6993584918399423, "learning_rate": 4.421171264683584e-06, "loss": 0.031, "step": 2430 }, { "epoch": 1.1060054595086442, "grad_norm": 0.8904266677973449, "learning_rate": 4.4207138917866e-06, "loss": 0.0434, "step": 2431 }, { "epoch": 1.1064604185623295, "grad_norm": 0.6348166810636052, "learning_rate": 4.420256361937551e-06, "loss": 0.0309, "step": 2432 }, { "epoch": 1.1069153776160146, "grad_norm": 0.3981663200647504, "learning_rate": 4.419798675173824e-06, "loss": 0.0148, "step": 2433 }, { "epoch": 1.1073703366696996, "grad_norm": 0.48832091405954775, "learning_rate": 4.419340831532819e-06, "loss": 0.0194, "step": 2434 }, { "epoch": 1.107825295723385, "grad_norm": 0.5699123528938048, "learning_rate": 4.418882831051949e-06, "loss": 0.0219, "step": 2435 }, { "epoch": 1.10828025477707, "grad_norm": 0.5206899000767948, "learning_rate": 4.418424673768639e-06, "loss": 0.018, "step": 2436 }, { "epoch": 1.1087352138307551, "grad_norm": 0.45230109480362385, "learning_rate": 4.417966359720329e-06, "loss": 0.0165, "step": 2437 }, { "epoch": 1.1091901728844404, "grad_norm": 0.5703863477498058, "learning_rate": 4.417507888944469e-06, "loss": 0.029, "step": 2438 }, { "epoch": 1.1096451319381255, "grad_norm": 0.682258223758108, "learning_rate": 4.417049261478525e-06, "loss": 0.0334, "step": 2439 }, { "epoch": 1.1101000909918108, "grad_norm": 0.47676985671226807, "learning_rate": 4.416590477359971e-06, "loss": 0.0181, "step": 2440 }, { "epoch": 1.110555050045496, "grad_norm": 0.38788442111589033, "learning_rate": 4.416131536626299e-06, "loss": 0.0204, "step": 2441 }, { "epoch": 1.111010009099181, "grad_norm": 0.6170801313853547, "learning_rate": 4.415672439315011e-06, "loss": 0.0241, "step": 2442 }, { "epoch": 1.1114649681528663, "grad_norm": 0.3816037691138792, "learning_rate": 4.415213185463623e-06, "loss": 0.0148, "step": 2443 }, { "epoch": 1.1119199272065514, "grad_norm": 0.482252021647845, "learning_rate": 4.414753775109661e-06, "loss": 0.0115, "step": 2444 }, { "epoch": 1.1123748862602365, "grad_norm": 0.6286536712577396, "learning_rate": 4.414294208290669e-06, "loss": 0.0245, "step": 2445 }, { "epoch": 1.1128298453139218, "grad_norm": 0.5902607419466066, "learning_rate": 4.413834485044199e-06, "loss": 0.0232, "step": 2446 }, { "epoch": 1.113284804367607, "grad_norm": 0.5444384492120593, "learning_rate": 4.413374605407817e-06, "loss": 0.0186, "step": 2447 }, { "epoch": 1.113739763421292, "grad_norm": 0.6430829258294208, "learning_rate": 4.412914569419103e-06, "loss": 0.0187, "step": 2448 }, { "epoch": 1.1141947224749773, "grad_norm": 0.4980474374671166, "learning_rate": 4.412454377115649e-06, "loss": 0.0185, "step": 2449 }, { "epoch": 1.1146496815286624, "grad_norm": 0.4773216197958395, "learning_rate": 4.411994028535061e-06, "loss": 0.018, "step": 2450 }, { "epoch": 1.1151046405823477, "grad_norm": 0.6035811414068329, "learning_rate": 4.411533523714954e-06, "loss": 0.0206, "step": 2451 }, { "epoch": 1.1155595996360328, "grad_norm": 0.7275388720106067, "learning_rate": 4.41107286269296e-06, "loss": 0.0292, "step": 2452 }, { "epoch": 1.1160145586897179, "grad_norm": 0.5634610686928793, "learning_rate": 4.410612045506722e-06, "loss": 0.0198, "step": 2453 }, { "epoch": 1.1164695177434032, "grad_norm": 0.5718741959689277, "learning_rate": 4.410151072193897e-06, "loss": 0.027, "step": 2454 }, { "epoch": 1.1169244767970883, "grad_norm": 0.5551564270266938, "learning_rate": 4.409689942792152e-06, "loss": 0.0199, "step": 2455 }, { "epoch": 1.1173794358507734, "grad_norm": 0.8888251322235802, "learning_rate": 4.409228657339168e-06, "loss": 0.0333, "step": 2456 }, { "epoch": 1.1178343949044587, "grad_norm": 0.5773710624441224, "learning_rate": 4.4087672158726415e-06, "loss": 0.0183, "step": 2457 }, { "epoch": 1.1182893539581438, "grad_norm": 0.5015201013813986, "learning_rate": 4.408305618430277e-06, "loss": 0.0183, "step": 2458 }, { "epoch": 1.1187443130118289, "grad_norm": 0.5607273909280127, "learning_rate": 4.407843865049797e-06, "loss": 0.0179, "step": 2459 }, { "epoch": 1.1191992720655142, "grad_norm": 0.47810546016317323, "learning_rate": 4.40738195576893e-06, "loss": 0.0177, "step": 2460 }, { "epoch": 1.1196542311191993, "grad_norm": 0.3206530963273737, "learning_rate": 4.406919890625424e-06, "loss": 0.012, "step": 2461 }, { "epoch": 1.1201091901728844, "grad_norm": 0.5269729696359773, "learning_rate": 4.406457669657036e-06, "loss": 0.0252, "step": 2462 }, { "epoch": 1.1205641492265697, "grad_norm": 0.8110781283531187, "learning_rate": 4.405995292901537e-06, "loss": 0.0394, "step": 2463 }, { "epoch": 1.1210191082802548, "grad_norm": 0.685631955664527, "learning_rate": 4.40553276039671e-06, "loss": 0.0235, "step": 2464 }, { "epoch": 1.1214740673339398, "grad_norm": 0.5305736419722851, "learning_rate": 4.4050700721803505e-06, "loss": 0.0246, "step": 2465 }, { "epoch": 1.1219290263876252, "grad_norm": 0.8233289195972211, "learning_rate": 4.404607228290269e-06, "loss": 0.0329, "step": 2466 }, { "epoch": 1.1223839854413102, "grad_norm": 0.4494525107707997, "learning_rate": 4.404144228764285e-06, "loss": 0.0172, "step": 2467 }, { "epoch": 1.1228389444949956, "grad_norm": 0.6898284017237695, "learning_rate": 4.403681073640235e-06, "loss": 0.0309, "step": 2468 }, { "epoch": 1.1232939035486806, "grad_norm": 0.44022897506767517, "learning_rate": 4.403217762955963e-06, "loss": 0.014, "step": 2469 }, { "epoch": 1.1237488626023657, "grad_norm": 0.5105224449314946, "learning_rate": 4.402754296749331e-06, "loss": 0.0277, "step": 2470 }, { "epoch": 1.124203821656051, "grad_norm": 0.4431776829162167, "learning_rate": 4.402290675058211e-06, "loss": 0.0165, "step": 2471 }, { "epoch": 1.1246587807097361, "grad_norm": 0.7311094672586586, "learning_rate": 4.401826897920487e-06, "loss": 0.0394, "step": 2472 }, { "epoch": 1.1251137397634212, "grad_norm": 0.5038279648045123, "learning_rate": 4.4013629653740575e-06, "loss": 0.0193, "step": 2473 }, { "epoch": 1.1255686988171065, "grad_norm": 0.6090867075516695, "learning_rate": 4.400898877456833e-06, "loss": 0.0202, "step": 2474 }, { "epoch": 1.1260236578707916, "grad_norm": 0.4042652844289294, "learning_rate": 4.400434634206737e-06, "loss": 0.0202, "step": 2475 }, { "epoch": 1.1264786169244767, "grad_norm": 0.6930219774407071, "learning_rate": 4.399970235661705e-06, "loss": 0.0201, "step": 2476 }, { "epoch": 1.126933575978162, "grad_norm": 0.7770132345875099, "learning_rate": 4.399505681859685e-06, "loss": 0.0258, "step": 2477 }, { "epoch": 1.127388535031847, "grad_norm": 0.4536720849488241, "learning_rate": 4.399040972838639e-06, "loss": 0.013, "step": 2478 }, { "epoch": 1.1278434940855324, "grad_norm": 0.4629044906931116, "learning_rate": 4.398576108636541e-06, "loss": 0.0142, "step": 2479 }, { "epoch": 1.1282984531392175, "grad_norm": 0.5029551266143759, "learning_rate": 4.398111089291378e-06, "loss": 0.0223, "step": 2480 }, { "epoch": 1.1287534121929026, "grad_norm": 0.5267250567692062, "learning_rate": 4.3976459148411464e-06, "loss": 0.0184, "step": 2481 }, { "epoch": 1.129208371246588, "grad_norm": 0.5866609989932728, "learning_rate": 4.3971805853238616e-06, "loss": 0.0225, "step": 2482 }, { "epoch": 1.129663330300273, "grad_norm": 0.7290964432708312, "learning_rate": 4.396715100777547e-06, "loss": 0.0215, "step": 2483 }, { "epoch": 1.130118289353958, "grad_norm": 0.5687962106735492, "learning_rate": 4.39624946124024e-06, "loss": 0.014, "step": 2484 }, { "epoch": 1.1305732484076434, "grad_norm": 0.5222224290731767, "learning_rate": 4.39578366674999e-06, "loss": 0.0219, "step": 2485 }, { "epoch": 1.1310282074613285, "grad_norm": 0.6292767966184953, "learning_rate": 4.395317717344861e-06, "loss": 0.023, "step": 2486 }, { "epoch": 1.1314831665150136, "grad_norm": 0.6230840960536086, "learning_rate": 4.394851613062927e-06, "loss": 0.0255, "step": 2487 }, { "epoch": 1.1319381255686989, "grad_norm": 0.6868861554575693, "learning_rate": 4.394385353942275e-06, "loss": 0.0228, "step": 2488 }, { "epoch": 1.132393084622384, "grad_norm": 0.4274785686639345, "learning_rate": 4.393918940021008e-06, "loss": 0.0172, "step": 2489 }, { "epoch": 1.132848043676069, "grad_norm": 0.521848847942063, "learning_rate": 4.393452371337238e-06, "loss": 0.0213, "step": 2490 }, { "epoch": 1.1333030027297544, "grad_norm": 0.40932265141368795, "learning_rate": 4.39298564792909e-06, "loss": 0.018, "step": 2491 }, { "epoch": 1.1337579617834395, "grad_norm": 0.39066049998357827, "learning_rate": 4.392518769834705e-06, "loss": 0.0133, "step": 2492 }, { "epoch": 1.1342129208371245, "grad_norm": 0.7649639309094274, "learning_rate": 4.392051737092231e-06, "loss": 0.0242, "step": 2493 }, { "epoch": 1.1346678798908099, "grad_norm": 0.465262288674482, "learning_rate": 4.391584549739834e-06, "loss": 0.015, "step": 2494 }, { "epoch": 1.135122838944495, "grad_norm": 0.6825064297343973, "learning_rate": 4.391117207815691e-06, "loss": 0.0224, "step": 2495 }, { "epoch": 1.1355777979981803, "grad_norm": 0.39540271333569704, "learning_rate": 4.3906497113579895e-06, "loss": 0.013, "step": 2496 }, { "epoch": 1.1360327570518653, "grad_norm": 0.7534708050321565, "learning_rate": 4.390182060404931e-06, "loss": 0.0243, "step": 2497 }, { "epoch": 1.1364877161055504, "grad_norm": 0.4898364302791372, "learning_rate": 4.389714254994732e-06, "loss": 0.017, "step": 2498 }, { "epoch": 1.1369426751592357, "grad_norm": 0.6693549197179394, "learning_rate": 4.389246295165617e-06, "loss": 0.0157, "step": 2499 }, { "epoch": 1.1373976342129208, "grad_norm": 0.4291317100603944, "learning_rate": 4.388778180955826e-06, "loss": 0.0174, "step": 2500 }, { "epoch": 1.137852593266606, "grad_norm": 0.7020692499532706, "learning_rate": 4.388309912403612e-06, "loss": 0.0306, "step": 2501 }, { "epoch": 1.1383075523202912, "grad_norm": 0.6301261849620517, "learning_rate": 4.38784148954724e-06, "loss": 0.0319, "step": 2502 }, { "epoch": 1.1387625113739763, "grad_norm": 0.4361572339623868, "learning_rate": 4.387372912424987e-06, "loss": 0.0127, "step": 2503 }, { "epoch": 1.1392174704276614, "grad_norm": 0.7026556330785464, "learning_rate": 4.386904181075142e-06, "loss": 0.0298, "step": 2504 }, { "epoch": 1.1396724294813467, "grad_norm": 0.6098253171064514, "learning_rate": 4.386435295536008e-06, "loss": 0.0172, "step": 2505 }, { "epoch": 1.1401273885350318, "grad_norm": 0.4467914447435911, "learning_rate": 4.385966255845902e-06, "loss": 0.0133, "step": 2506 }, { "epoch": 1.1405823475887171, "grad_norm": 0.5646208066401632, "learning_rate": 4.38549706204315e-06, "loss": 0.0218, "step": 2507 }, { "epoch": 1.1410373066424022, "grad_norm": 0.5092623099739902, "learning_rate": 4.385027714166094e-06, "loss": 0.0228, "step": 2508 }, { "epoch": 1.1414922656960873, "grad_norm": 0.6483618609538168, "learning_rate": 4.384558212253084e-06, "loss": 0.0224, "step": 2509 }, { "epoch": 1.1419472247497726, "grad_norm": 0.5116010138281942, "learning_rate": 4.384088556342488e-06, "loss": 0.0157, "step": 2510 }, { "epoch": 1.1424021838034577, "grad_norm": 0.5953620176339847, "learning_rate": 4.383618746472686e-06, "loss": 0.02, "step": 2511 }, { "epoch": 1.1428571428571428, "grad_norm": 0.547071261477399, "learning_rate": 4.383148782682064e-06, "loss": 0.0213, "step": 2512 }, { "epoch": 1.143312101910828, "grad_norm": 0.6862235251333989, "learning_rate": 4.382678665009028e-06, "loss": 0.0326, "step": 2513 }, { "epoch": 1.1437670609645132, "grad_norm": 0.6144410092488735, "learning_rate": 4.382208393491994e-06, "loss": 0.021, "step": 2514 }, { "epoch": 1.1442220200181983, "grad_norm": 1.0612612279447866, "learning_rate": 4.381737968169389e-06, "loss": 0.0266, "step": 2515 }, { "epoch": 1.1446769790718836, "grad_norm": 0.5795112434597738, "learning_rate": 4.381267389079657e-06, "loss": 0.0204, "step": 2516 }, { "epoch": 1.1451319381255687, "grad_norm": 0.5281408551585983, "learning_rate": 4.380796656261248e-06, "loss": 0.0242, "step": 2517 }, { "epoch": 1.1455868971792538, "grad_norm": 0.5440850665021298, "learning_rate": 4.38032576975263e-06, "loss": 0.0222, "step": 2518 }, { "epoch": 1.146041856232939, "grad_norm": 0.547779027945082, "learning_rate": 4.3798547295922825e-06, "loss": 0.0252, "step": 2519 }, { "epoch": 1.1464968152866242, "grad_norm": 0.7316358167572443, "learning_rate": 4.3793835358186955e-06, "loss": 0.0245, "step": 2520 }, { "epoch": 1.1469517743403093, "grad_norm": 0.7748548730476206, "learning_rate": 4.378912188470374e-06, "loss": 0.0396, "step": 2521 }, { "epoch": 1.1474067333939946, "grad_norm": 0.6582577150455249, "learning_rate": 4.378440687585832e-06, "loss": 0.0324, "step": 2522 }, { "epoch": 1.1478616924476797, "grad_norm": 0.643886854209991, "learning_rate": 4.3779690332036005e-06, "loss": 0.0274, "step": 2523 }, { "epoch": 1.148316651501365, "grad_norm": 0.6168422110373858, "learning_rate": 4.3774972253622205e-06, "loss": 0.0212, "step": 2524 }, { "epoch": 1.14877161055505, "grad_norm": 0.5831017159186793, "learning_rate": 4.377025264100246e-06, "loss": 0.0207, "step": 2525 }, { "epoch": 1.1492265696087351, "grad_norm": 0.655374012181892, "learning_rate": 4.376553149456244e-06, "loss": 0.0186, "step": 2526 }, { "epoch": 1.1496815286624205, "grad_norm": 0.5616942135395393, "learning_rate": 4.376080881468793e-06, "loss": 0.0132, "step": 2527 }, { "epoch": 1.1501364877161055, "grad_norm": 0.45821214516392705, "learning_rate": 4.375608460176483e-06, "loss": 0.0167, "step": 2528 }, { "epoch": 1.1505914467697906, "grad_norm": 0.4482772285785562, "learning_rate": 4.375135885617922e-06, "loss": 0.0147, "step": 2529 }, { "epoch": 1.151046405823476, "grad_norm": 0.46276152379004304, "learning_rate": 4.3746631578317236e-06, "loss": 0.0162, "step": 2530 }, { "epoch": 1.151501364877161, "grad_norm": 0.472367156834232, "learning_rate": 4.374190276856517e-06, "loss": 0.0161, "step": 2531 }, { "epoch": 1.1519563239308463, "grad_norm": 0.7210180575796964, "learning_rate": 4.373717242730946e-06, "loss": 0.0164, "step": 2532 }, { "epoch": 1.1524112829845314, "grad_norm": 0.34265721640778585, "learning_rate": 4.373244055493663e-06, "loss": 0.009, "step": 2533 }, { "epoch": 1.1528662420382165, "grad_norm": 0.9246661541629277, "learning_rate": 4.372770715183336e-06, "loss": 0.0394, "step": 2534 }, { "epoch": 1.1533212010919018, "grad_norm": 1.0155207038438967, "learning_rate": 4.372297221838642e-06, "loss": 0.0433, "step": 2535 }, { "epoch": 1.153776160145587, "grad_norm": 0.45755930254991145, "learning_rate": 4.3718235754982755e-06, "loss": 0.0178, "step": 2536 }, { "epoch": 1.154231119199272, "grad_norm": 0.8712775724045019, "learning_rate": 4.371349776200939e-06, "loss": 0.0332, "step": 2537 }, { "epoch": 1.1546860782529573, "grad_norm": 0.4981626750356432, "learning_rate": 4.37087582398535e-06, "loss": 0.0146, "step": 2538 }, { "epoch": 1.1551410373066424, "grad_norm": 0.4312192452228287, "learning_rate": 4.370401718890237e-06, "loss": 0.0152, "step": 2539 }, { "epoch": 1.1555959963603275, "grad_norm": 0.6358957648601828, "learning_rate": 4.369927460954342e-06, "loss": 0.0215, "step": 2540 }, { "epoch": 1.1560509554140128, "grad_norm": 0.5795766897237975, "learning_rate": 4.36945305021642e-06, "loss": 0.0283, "step": 2541 }, { "epoch": 1.156505914467698, "grad_norm": 0.5890586761906142, "learning_rate": 4.368978486715237e-06, "loss": 0.0283, "step": 2542 }, { "epoch": 1.156960873521383, "grad_norm": 0.49231424613966623, "learning_rate": 4.368503770489573e-06, "loss": 0.017, "step": 2543 }, { "epoch": 1.1574158325750683, "grad_norm": 0.5284668616614862, "learning_rate": 4.368028901578218e-06, "loss": 0.0192, "step": 2544 }, { "epoch": 1.1578707916287534, "grad_norm": 0.4629161095951479, "learning_rate": 4.367553880019977e-06, "loss": 0.0174, "step": 2545 }, { "epoch": 1.1583257506824385, "grad_norm": 0.5295482770008699, "learning_rate": 4.367078705853667e-06, "loss": 0.0175, "step": 2546 }, { "epoch": 1.1587807097361238, "grad_norm": 0.4492266435415687, "learning_rate": 4.366603379118117e-06, "loss": 0.0174, "step": 2547 }, { "epoch": 1.1592356687898089, "grad_norm": 0.6861326206450016, "learning_rate": 4.366127899852169e-06, "loss": 0.0202, "step": 2548 }, { "epoch": 1.159690627843494, "grad_norm": 0.6458656612345586, "learning_rate": 4.365652268094675e-06, "loss": 0.0247, "step": 2549 }, { "epoch": 1.1601455868971793, "grad_norm": 0.5934112274239063, "learning_rate": 4.365176483884504e-06, "loss": 0.018, "step": 2550 }, { "epoch": 1.1606005459508644, "grad_norm": 0.5168135672415382, "learning_rate": 4.364700547260533e-06, "loss": 0.0164, "step": 2551 }, { "epoch": 1.1610555050045497, "grad_norm": 0.6179287757702098, "learning_rate": 4.3642244582616545e-06, "loss": 0.0224, "step": 2552 }, { "epoch": 1.1615104640582348, "grad_norm": 0.6789584396331461, "learning_rate": 4.363748216926772e-06, "loss": 0.0246, "step": 2553 }, { "epoch": 1.1619654231119199, "grad_norm": 0.5988318131014969, "learning_rate": 4.363271823294802e-06, "loss": 0.0249, "step": 2554 }, { "epoch": 1.1624203821656052, "grad_norm": 0.7304853558121732, "learning_rate": 4.362795277404673e-06, "loss": 0.0303, "step": 2555 }, { "epoch": 1.1628753412192903, "grad_norm": 0.4069363718224522, "learning_rate": 4.362318579295326e-06, "loss": 0.0142, "step": 2556 }, { "epoch": 1.1633303002729753, "grad_norm": 0.6305977347827836, "learning_rate": 4.361841729005715e-06, "loss": 0.0243, "step": 2557 }, { "epoch": 1.1637852593266607, "grad_norm": 1.9165750531232424, "learning_rate": 4.361364726574806e-06, "loss": 0.0472, "step": 2558 }, { "epoch": 1.1642402183803457, "grad_norm": 0.48386946622280164, "learning_rate": 4.360887572041578e-06, "loss": 0.0239, "step": 2559 }, { "epoch": 1.164695177434031, "grad_norm": 0.6741840320275676, "learning_rate": 4.36041026544502e-06, "loss": 0.0246, "step": 2560 }, { "epoch": 1.1651501364877161, "grad_norm": 0.618838331497097, "learning_rate": 4.359932806824138e-06, "loss": 0.027, "step": 2561 }, { "epoch": 1.1656050955414012, "grad_norm": 0.3417342823461349, "learning_rate": 4.359455196217946e-06, "loss": 0.0104, "step": 2562 }, { "epoch": 1.1660600545950865, "grad_norm": 0.31164520237783483, "learning_rate": 4.358977433665471e-06, "loss": 0.0088, "step": 2563 }, { "epoch": 1.1665150136487716, "grad_norm": 0.6554668583356018, "learning_rate": 4.3584995192057565e-06, "loss": 0.0335, "step": 2564 }, { "epoch": 1.1669699727024567, "grad_norm": 0.44408981017519084, "learning_rate": 4.358021452877854e-06, "loss": 0.0183, "step": 2565 }, { "epoch": 1.167424931756142, "grad_norm": 0.482451643460215, "learning_rate": 4.357543234720829e-06, "loss": 0.017, "step": 2566 }, { "epoch": 1.1678798908098271, "grad_norm": 0.8550493713474525, "learning_rate": 4.357064864773761e-06, "loss": 0.0363, "step": 2567 }, { "epoch": 1.1683348498635122, "grad_norm": 0.5050690519756426, "learning_rate": 4.3565863430757375e-06, "loss": 0.0176, "step": 2568 }, { "epoch": 1.1687898089171975, "grad_norm": 0.5475370608720886, "learning_rate": 4.356107669665862e-06, "loss": 0.0232, "step": 2569 }, { "epoch": 1.1692447679708826, "grad_norm": 0.6277444667505776, "learning_rate": 4.355628844583249e-06, "loss": 0.0298, "step": 2570 }, { "epoch": 1.1696997270245677, "grad_norm": 0.5366910312126616, "learning_rate": 4.355149867867029e-06, "loss": 0.0186, "step": 2571 }, { "epoch": 1.170154686078253, "grad_norm": 0.45310857499795165, "learning_rate": 4.354670739556338e-06, "loss": 0.0119, "step": 2572 }, { "epoch": 1.170609645131938, "grad_norm": 0.6356440967795077, "learning_rate": 4.35419145969033e-06, "loss": 0.0333, "step": 2573 }, { "epoch": 1.1710646041856232, "grad_norm": 0.5464909143012531, "learning_rate": 4.35371202830817e-06, "loss": 0.0189, "step": 2574 }, { "epoch": 1.1715195632393085, "grad_norm": 0.6383681077345984, "learning_rate": 4.353232445449034e-06, "loss": 0.0265, "step": 2575 }, { "epoch": 1.1719745222929936, "grad_norm": 0.47662442741731753, "learning_rate": 4.352752711152112e-06, "loss": 0.0148, "step": 2576 }, { "epoch": 1.1724294813466787, "grad_norm": 0.5868089466835913, "learning_rate": 4.352272825456605e-06, "loss": 0.0216, "step": 2577 }, { "epoch": 1.172884440400364, "grad_norm": 0.5253949671686583, "learning_rate": 4.3517927884017275e-06, "loss": 0.0256, "step": 2578 }, { "epoch": 1.173339399454049, "grad_norm": 0.6592308425561475, "learning_rate": 4.351312600026706e-06, "loss": 0.0285, "step": 2579 }, { "epoch": 1.1737943585077344, "grad_norm": 0.7649368180575881, "learning_rate": 4.350832260370779e-06, "loss": 0.0311, "step": 2580 }, { "epoch": 1.1742493175614195, "grad_norm": 0.5514861272003339, "learning_rate": 4.350351769473198e-06, "loss": 0.0231, "step": 2581 }, { "epoch": 1.1747042766151046, "grad_norm": 0.5775672174979725, "learning_rate": 4.349871127373226e-06, "loss": 0.0255, "step": 2582 }, { "epoch": 1.1751592356687899, "grad_norm": 0.9134141208612268, "learning_rate": 4.349390334110141e-06, "loss": 0.0439, "step": 2583 }, { "epoch": 1.175614194722475, "grad_norm": 0.4544618512994982, "learning_rate": 4.348909389723228e-06, "loss": 0.0178, "step": 2584 }, { "epoch": 1.17606915377616, "grad_norm": 0.5791839745574971, "learning_rate": 4.348428294251791e-06, "loss": 0.0304, "step": 2585 }, { "epoch": 1.1765241128298454, "grad_norm": 0.4843168191804037, "learning_rate": 4.34794704773514e-06, "loss": 0.0204, "step": 2586 }, { "epoch": 1.1769790718835305, "grad_norm": 0.47321640576793395, "learning_rate": 4.347465650212602e-06, "loss": 0.0161, "step": 2587 }, { "epoch": 1.1774340309372158, "grad_norm": 0.6270272685829019, "learning_rate": 4.346984101723513e-06, "loss": 0.0257, "step": 2588 }, { "epoch": 1.1778889899909009, "grad_norm": 0.4453595878333599, "learning_rate": 4.3465024023072255e-06, "loss": 0.0207, "step": 2589 }, { "epoch": 1.178343949044586, "grad_norm": 0.5620264526303997, "learning_rate": 4.3460205520031006e-06, "loss": 0.0162, "step": 2590 }, { "epoch": 1.1787989080982713, "grad_norm": 0.79050022325226, "learning_rate": 4.345538550850512e-06, "loss": 0.0275, "step": 2591 }, { "epoch": 1.1792538671519563, "grad_norm": 0.5242514508819358, "learning_rate": 4.345056398888847e-06, "loss": 0.0194, "step": 2592 }, { "epoch": 1.1797088262056414, "grad_norm": 0.45519280840041193, "learning_rate": 4.3445740961575066e-06, "loss": 0.018, "step": 2593 }, { "epoch": 1.1801637852593267, "grad_norm": 0.6747230064920378, "learning_rate": 4.3440916426959e-06, "loss": 0.0289, "step": 2594 }, { "epoch": 1.1806187443130118, "grad_norm": 0.4544809079544836, "learning_rate": 4.343609038543452e-06, "loss": 0.0249, "step": 2595 }, { "epoch": 1.181073703366697, "grad_norm": 0.35452190136365136, "learning_rate": 4.3431262837396e-06, "loss": 0.0141, "step": 2596 }, { "epoch": 1.1815286624203822, "grad_norm": 0.6536923673730825, "learning_rate": 4.342643378323791e-06, "loss": 0.0223, "step": 2597 }, { "epoch": 1.1819836214740673, "grad_norm": 0.5887741451878095, "learning_rate": 4.342160322335487e-06, "loss": 0.0353, "step": 2598 }, { "epoch": 1.1824385805277524, "grad_norm": 0.7048921014576134, "learning_rate": 4.34167711581416e-06, "loss": 0.0223, "step": 2599 }, { "epoch": 1.1828935395814377, "grad_norm": 0.7646870960953991, "learning_rate": 4.3411937587992955e-06, "loss": 0.0391, "step": 2600 }, { "epoch": 1.1833484986351228, "grad_norm": 0.5007217995372573, "learning_rate": 4.340710251330393e-06, "loss": 0.0219, "step": 2601 }, { "epoch": 1.183803457688808, "grad_norm": 0.5078521534312103, "learning_rate": 4.34022659344696e-06, "loss": 0.0225, "step": 2602 }, { "epoch": 1.1842584167424932, "grad_norm": 0.8160012841542015, "learning_rate": 4.339742785188521e-06, "loss": 0.0301, "step": 2603 }, { "epoch": 1.1847133757961783, "grad_norm": 0.523985625873912, "learning_rate": 4.339258826594611e-06, "loss": 0.0178, "step": 2604 }, { "epoch": 1.1851683348498634, "grad_norm": 0.5308806701133933, "learning_rate": 4.338774717704774e-06, "loss": 0.0175, "step": 2605 }, { "epoch": 1.1856232939035487, "grad_norm": 0.6656523129373758, "learning_rate": 4.338290458558572e-06, "loss": 0.0273, "step": 2606 }, { "epoch": 1.1860782529572338, "grad_norm": 0.39255221273464475, "learning_rate": 4.3378060491955744e-06, "loss": 0.0148, "step": 2607 }, { "epoch": 1.186533212010919, "grad_norm": 0.4950633863450793, "learning_rate": 4.337321489655366e-06, "loss": 0.0224, "step": 2608 }, { "epoch": 1.1869881710646042, "grad_norm": 0.6260958258831224, "learning_rate": 4.336836779977543e-06, "loss": 0.0257, "step": 2609 }, { "epoch": 1.1874431301182893, "grad_norm": 0.4751863637448733, "learning_rate": 4.336351920201714e-06, "loss": 0.0207, "step": 2610 }, { "epoch": 1.1878980891719746, "grad_norm": 0.43930578090368066, "learning_rate": 4.335866910367498e-06, "loss": 0.0114, "step": 2611 }, { "epoch": 1.1883530482256597, "grad_norm": 0.6314909631136255, "learning_rate": 4.3353817505145294e-06, "loss": 0.0261, "step": 2612 }, { "epoch": 1.1888080072793448, "grad_norm": 0.5008847623030172, "learning_rate": 4.334896440682452e-06, "loss": 0.0267, "step": 2613 }, { "epoch": 1.18926296633303, "grad_norm": 0.5600055290379815, "learning_rate": 4.334410980910924e-06, "loss": 0.0286, "step": 2614 }, { "epoch": 1.1897179253867152, "grad_norm": 0.5728580933788284, "learning_rate": 4.333925371239615e-06, "loss": 0.022, "step": 2615 }, { "epoch": 1.1901728844404005, "grad_norm": 0.5724099445776433, "learning_rate": 4.3334396117082065e-06, "loss": 0.0221, "step": 2616 }, { "epoch": 1.1906278434940856, "grad_norm": 0.6320973313826952, "learning_rate": 4.332953702356393e-06, "loss": 0.0288, "step": 2617 }, { "epoch": 1.1910828025477707, "grad_norm": 0.5090911320181027, "learning_rate": 4.33246764322388e-06, "loss": 0.0181, "step": 2618 }, { "epoch": 1.191537761601456, "grad_norm": 0.4752697085073445, "learning_rate": 4.331981434350387e-06, "loss": 0.0145, "step": 2619 }, { "epoch": 1.191992720655141, "grad_norm": 0.7500039731207865, "learning_rate": 4.331495075775644e-06, "loss": 0.0372, "step": 2620 }, { "epoch": 1.1924476797088261, "grad_norm": 0.46553750325278676, "learning_rate": 4.331008567539395e-06, "loss": 0.0216, "step": 2621 }, { "epoch": 1.1929026387625115, "grad_norm": 0.7523375315000249, "learning_rate": 4.330521909681394e-06, "loss": 0.0221, "step": 2622 }, { "epoch": 1.1933575978161965, "grad_norm": 0.5528858812158782, "learning_rate": 4.330035102241409e-06, "loss": 0.0285, "step": 2623 }, { "epoch": 1.1938125568698816, "grad_norm": 0.5368571535414223, "learning_rate": 4.32954814525922e-06, "loss": 0.0194, "step": 2624 }, { "epoch": 1.194267515923567, "grad_norm": 0.6644805545801543, "learning_rate": 4.329061038774619e-06, "loss": 0.0293, "step": 2625 }, { "epoch": 1.194722474977252, "grad_norm": 0.5920169242009168, "learning_rate": 4.32857378282741e-06, "loss": 0.027, "step": 2626 }, { "epoch": 1.1951774340309371, "grad_norm": 0.5765181192250086, "learning_rate": 4.328086377457409e-06, "loss": 0.0149, "step": 2627 }, { "epoch": 1.1956323930846224, "grad_norm": 2.59038115553395, "learning_rate": 4.327598822704444e-06, "loss": 0.0238, "step": 2628 }, { "epoch": 1.1960873521383075, "grad_norm": 0.6749015393615012, "learning_rate": 4.327111118608357e-06, "loss": 0.0241, "step": 2629 }, { "epoch": 1.1965423111919926, "grad_norm": 0.5555096309142309, "learning_rate": 4.326623265209001e-06, "loss": 0.0241, "step": 2630 }, { "epoch": 1.196997270245678, "grad_norm": 0.4422718467721418, "learning_rate": 4.326135262546241e-06, "loss": 0.009, "step": 2631 }, { "epoch": 1.197452229299363, "grad_norm": 1.011510904258874, "learning_rate": 4.325647110659954e-06, "loss": 0.04, "step": 2632 }, { "epoch": 1.197907188353048, "grad_norm": 0.871122479276943, "learning_rate": 4.325158809590028e-06, "loss": 0.0299, "step": 2633 }, { "epoch": 1.1983621474067334, "grad_norm": 0.45898463677814605, "learning_rate": 4.324670359376368e-06, "loss": 0.0182, "step": 2634 }, { "epoch": 1.1988171064604185, "grad_norm": 0.6617734015997742, "learning_rate": 4.3241817600588865e-06, "loss": 0.0207, "step": 2635 }, { "epoch": 1.1992720655141038, "grad_norm": 0.3928382449321343, "learning_rate": 4.3236930116775086e-06, "loss": 0.0139, "step": 2636 }, { "epoch": 1.199727024567789, "grad_norm": 0.9460647127731692, "learning_rate": 4.323204114272174e-06, "loss": 0.0396, "step": 2637 }, { "epoch": 1.200181983621474, "grad_norm": 0.8397877040107056, "learning_rate": 4.3227150678828335e-06, "loss": 0.0273, "step": 2638 }, { "epoch": 1.2006369426751593, "grad_norm": 0.5340130401222157, "learning_rate": 4.322225872549448e-06, "loss": 0.022, "step": 2639 }, { "epoch": 1.2010919017288444, "grad_norm": 0.9277766410326496, "learning_rate": 4.321736528311994e-06, "loss": 0.0439, "step": 2640 }, { "epoch": 1.2015468607825295, "grad_norm": 0.7592854867727282, "learning_rate": 4.321247035210456e-06, "loss": 0.0289, "step": 2641 }, { "epoch": 1.2020018198362148, "grad_norm": 0.5341297610203549, "learning_rate": 4.320757393284837e-06, "loss": 0.0225, "step": 2642 }, { "epoch": 1.2024567788898999, "grad_norm": 0.6237470883113181, "learning_rate": 4.3202676025751455e-06, "loss": 0.0336, "step": 2643 }, { "epoch": 1.2029117379435852, "grad_norm": 0.5870246556587211, "learning_rate": 4.319777663121406e-06, "loss": 0.0276, "step": 2644 }, { "epoch": 1.2033666969972703, "grad_norm": 0.7609302108890849, "learning_rate": 4.319287574963653e-06, "loss": 0.0297, "step": 2645 }, { "epoch": 1.2038216560509554, "grad_norm": 0.6569180371738808, "learning_rate": 4.318797338141936e-06, "loss": 0.0317, "step": 2646 }, { "epoch": 1.2042766151046407, "grad_norm": 0.5944609730434067, "learning_rate": 4.318306952696314e-06, "loss": 0.027, "step": 2647 }, { "epoch": 1.2047315741583258, "grad_norm": 0.34688904992460695, "learning_rate": 4.317816418666859e-06, "loss": 0.0125, "step": 2648 }, { "epoch": 1.2051865332120109, "grad_norm": 0.6679718066461144, "learning_rate": 4.317325736093656e-06, "loss": 0.0293, "step": 2649 }, { "epoch": 1.2056414922656962, "grad_norm": 0.5669693680069866, "learning_rate": 4.316834905016801e-06, "loss": 0.0241, "step": 2650 }, { "epoch": 1.2060964513193813, "grad_norm": 0.46022640722499586, "learning_rate": 4.3163439254764015e-06, "loss": 0.0176, "step": 2651 }, { "epoch": 1.2065514103730663, "grad_norm": 0.5009167482323504, "learning_rate": 4.31585279751258e-06, "loss": 0.0164, "step": 2652 }, { "epoch": 1.2070063694267517, "grad_norm": 0.4911116285507776, "learning_rate": 4.315361521165467e-06, "loss": 0.023, "step": 2653 }, { "epoch": 1.2074613284804367, "grad_norm": 0.4139079723299726, "learning_rate": 4.314870096475209e-06, "loss": 0.0191, "step": 2654 }, { "epoch": 1.2079162875341218, "grad_norm": 0.5147916096736812, "learning_rate": 4.3143785234819624e-06, "loss": 0.0213, "step": 2655 }, { "epoch": 1.2083712465878071, "grad_norm": 0.4934413168807628, "learning_rate": 4.3138868022258974e-06, "loss": 0.0194, "step": 2656 }, { "epoch": 1.2088262056414922, "grad_norm": 0.6093916204573823, "learning_rate": 4.313394932747194e-06, "loss": 0.0256, "step": 2657 }, { "epoch": 1.2092811646951773, "grad_norm": 0.4328897173436113, "learning_rate": 4.312902915086045e-06, "loss": 0.0179, "step": 2658 }, { "epoch": 1.2097361237488626, "grad_norm": 0.6051187480913044, "learning_rate": 4.312410749282658e-06, "loss": 0.0264, "step": 2659 }, { "epoch": 1.2101910828025477, "grad_norm": 0.6402153561332778, "learning_rate": 4.311918435377248e-06, "loss": 0.0208, "step": 2660 }, { "epoch": 1.210646041856233, "grad_norm": 0.5588924434287335, "learning_rate": 4.311425973410047e-06, "loss": 0.0303, "step": 2661 }, { "epoch": 1.2111010009099181, "grad_norm": 0.45347333487831026, "learning_rate": 4.310933363421296e-06, "loss": 0.0155, "step": 2662 }, { "epoch": 1.2115559599636032, "grad_norm": 0.6992225938503356, "learning_rate": 4.310440605451248e-06, "loss": 0.0349, "step": 2663 }, { "epoch": 1.2120109190172885, "grad_norm": 0.9380643327786069, "learning_rate": 4.30994769954017e-06, "loss": 0.0338, "step": 2664 }, { "epoch": 1.2124658780709736, "grad_norm": 0.8253191054656854, "learning_rate": 4.30945464572834e-06, "loss": 0.0404, "step": 2665 }, { "epoch": 1.2129208371246587, "grad_norm": 0.6476800258789567, "learning_rate": 4.3089614440560465e-06, "loss": 0.0201, "step": 2666 }, { "epoch": 1.213375796178344, "grad_norm": 0.5709513616813471, "learning_rate": 4.3084680945635946e-06, "loss": 0.0223, "step": 2667 }, { "epoch": 1.213830755232029, "grad_norm": 0.7272313964988455, "learning_rate": 4.307974597291296e-06, "loss": 0.0305, "step": 2668 }, { "epoch": 1.2142857142857142, "grad_norm": 0.4125464957023897, "learning_rate": 4.307480952279478e-06, "loss": 0.0106, "step": 2669 }, { "epoch": 1.2147406733393995, "grad_norm": 0.4210089698998306, "learning_rate": 4.3069871595684795e-06, "loss": 0.0132, "step": 2670 }, { "epoch": 1.2151956323930846, "grad_norm": 0.5567918051217664, "learning_rate": 4.30649321919865e-06, "loss": 0.0232, "step": 2671 }, { "epoch": 1.21565059144677, "grad_norm": 0.7165679260009614, "learning_rate": 4.305999131210353e-06, "loss": 0.0229, "step": 2672 }, { "epoch": 1.216105550500455, "grad_norm": 0.6510555589132238, "learning_rate": 4.305504895643963e-06, "loss": 0.0201, "step": 2673 }, { "epoch": 1.21656050955414, "grad_norm": 0.496585641543594, "learning_rate": 4.305010512539867e-06, "loss": 0.0174, "step": 2674 }, { "epoch": 1.2170154686078254, "grad_norm": 0.4598917500270742, "learning_rate": 4.304515981938462e-06, "loss": 0.0146, "step": 2675 }, { "epoch": 1.2174704276615105, "grad_norm": 0.5463739101416092, "learning_rate": 4.304021303880161e-06, "loss": 0.0252, "step": 2676 }, { "epoch": 1.2179253867151956, "grad_norm": 0.5957670917341833, "learning_rate": 4.303526478405386e-06, "loss": 0.0218, "step": 2677 }, { "epoch": 1.2183803457688809, "grad_norm": 0.614447395579151, "learning_rate": 4.3030315055545715e-06, "loss": 0.0324, "step": 2678 }, { "epoch": 1.218835304822566, "grad_norm": 0.5616831395891632, "learning_rate": 4.302536385368165e-06, "loss": 0.0215, "step": 2679 }, { "epoch": 1.219290263876251, "grad_norm": 0.3790511549714058, "learning_rate": 4.3020411178866246e-06, "loss": 0.0103, "step": 2680 }, { "epoch": 1.2197452229299364, "grad_norm": 0.524178162065685, "learning_rate": 4.3015457031504226e-06, "loss": 0.0216, "step": 2681 }, { "epoch": 1.2202001819836215, "grad_norm": 0.6312387863751975, "learning_rate": 4.301050141200041e-06, "loss": 0.0187, "step": 2682 }, { "epoch": 1.2206551410373065, "grad_norm": 0.48104404582769067, "learning_rate": 4.300554432075975e-06, "loss": 0.0137, "step": 2683 }, { "epoch": 1.2211101000909919, "grad_norm": 0.8650014734492228, "learning_rate": 4.300058575818733e-06, "loss": 0.0356, "step": 2684 }, { "epoch": 1.221565059144677, "grad_norm": 0.6861230937699648, "learning_rate": 4.299562572468833e-06, "loss": 0.0269, "step": 2685 }, { "epoch": 1.222020018198362, "grad_norm": 0.5634591121479472, "learning_rate": 4.299066422066807e-06, "loss": 0.0214, "step": 2686 }, { "epoch": 1.2224749772520473, "grad_norm": 1.199183215901639, "learning_rate": 4.2985701246531965e-06, "loss": 0.0602, "step": 2687 }, { "epoch": 1.2229299363057324, "grad_norm": 0.42406900713579304, "learning_rate": 4.2980736802685575e-06, "loss": 0.0148, "step": 2688 }, { "epoch": 1.2233848953594177, "grad_norm": 0.5957456918417167, "learning_rate": 4.297577088953458e-06, "loss": 0.0148, "step": 2689 }, { "epoch": 1.2238398544131028, "grad_norm": 0.60599231302819, "learning_rate": 4.2970803507484756e-06, "loss": 0.0237, "step": 2690 }, { "epoch": 1.224294813466788, "grad_norm": 0.48118211083729384, "learning_rate": 4.296583465694204e-06, "loss": 0.013, "step": 2691 }, { "epoch": 1.2247497725204732, "grad_norm": 0.605388969739492, "learning_rate": 4.296086433831244e-06, "loss": 0.0315, "step": 2692 }, { "epoch": 1.2252047315741583, "grad_norm": 0.6619863417539111, "learning_rate": 4.295589255200212e-06, "loss": 0.0227, "step": 2693 }, { "epoch": 1.2256596906278434, "grad_norm": 0.8014939338874068, "learning_rate": 4.295091929841734e-06, "loss": 0.0265, "step": 2694 }, { "epoch": 1.2261146496815287, "grad_norm": 0.7269612365736505, "learning_rate": 4.2945944577964516e-06, "loss": 0.0357, "step": 2695 }, { "epoch": 1.2265696087352138, "grad_norm": 0.4610790536448068, "learning_rate": 4.294096839105013e-06, "loss": 0.0153, "step": 2696 }, { "epoch": 1.2270245677888991, "grad_norm": 0.4654611530391689, "learning_rate": 4.293599073808083e-06, "loss": 0.0182, "step": 2697 }, { "epoch": 1.2274795268425842, "grad_norm": 0.5999217153178389, "learning_rate": 4.293101161946337e-06, "loss": 0.0229, "step": 2698 }, { "epoch": 1.2279344858962693, "grad_norm": 0.49917644989179766, "learning_rate": 4.292603103560462e-06, "loss": 0.0124, "step": 2699 }, { "epoch": 1.2283894449499546, "grad_norm": 0.5365210835623073, "learning_rate": 4.292104898691157e-06, "loss": 0.0196, "step": 2700 }, { "epoch": 1.2288444040036397, "grad_norm": 0.5558494608681172, "learning_rate": 4.291606547379131e-06, "loss": 0.0186, "step": 2701 }, { "epoch": 1.2292993630573248, "grad_norm": 0.4612006424923298, "learning_rate": 4.291108049665109e-06, "loss": 0.0192, "step": 2702 }, { "epoch": 1.22975432211101, "grad_norm": 0.5593361998504748, "learning_rate": 4.290609405589827e-06, "loss": 0.0151, "step": 2703 }, { "epoch": 1.2302092811646952, "grad_norm": 0.6007531603783597, "learning_rate": 4.29011061519403e-06, "loss": 0.0301, "step": 2704 }, { "epoch": 1.2306642402183803, "grad_norm": 0.636096108360712, "learning_rate": 4.289611678518478e-06, "loss": 0.0299, "step": 2705 }, { "epoch": 1.2311191992720656, "grad_norm": 0.7430118225044907, "learning_rate": 4.289112595603941e-06, "loss": 0.0226, "step": 2706 }, { "epoch": 1.2315741583257507, "grad_norm": 0.6604918263471005, "learning_rate": 4.288613366491202e-06, "loss": 0.0306, "step": 2707 }, { "epoch": 1.2320291173794358, "grad_norm": 0.7349725252139763, "learning_rate": 4.288113991221057e-06, "loss": 0.0302, "step": 2708 }, { "epoch": 1.232484076433121, "grad_norm": 0.5380600500720201, "learning_rate": 4.2876144698343115e-06, "loss": 0.0237, "step": 2709 }, { "epoch": 1.2329390354868062, "grad_norm": 0.5754211885761235, "learning_rate": 4.287114802371783e-06, "loss": 0.0171, "step": 2710 }, { "epoch": 1.2333939945404913, "grad_norm": 0.5551048882570763, "learning_rate": 4.286614988874304e-06, "loss": 0.0247, "step": 2711 }, { "epoch": 1.2338489535941766, "grad_norm": 0.6348105766760367, "learning_rate": 4.286115029382717e-06, "loss": 0.0254, "step": 2712 }, { "epoch": 1.2343039126478617, "grad_norm": 0.6849101702514654, "learning_rate": 4.285614923937876e-06, "loss": 0.0302, "step": 2713 }, { "epoch": 1.2347588717015467, "grad_norm": 0.46035309891602166, "learning_rate": 4.285114672580647e-06, "loss": 0.0159, "step": 2714 }, { "epoch": 1.235213830755232, "grad_norm": 0.5132490189425896, "learning_rate": 4.284614275351907e-06, "loss": 0.0222, "step": 2715 }, { "epoch": 1.2356687898089171, "grad_norm": 0.48061757961981605, "learning_rate": 4.2841137322925495e-06, "loss": 0.0183, "step": 2716 }, { "epoch": 1.2361237488626025, "grad_norm": 0.6115977831280694, "learning_rate": 4.283613043443474e-06, "loss": 0.0252, "step": 2717 }, { "epoch": 1.2365787079162875, "grad_norm": 0.43493566800443395, "learning_rate": 4.2831122088455955e-06, "loss": 0.0145, "step": 2718 }, { "epoch": 1.2370336669699726, "grad_norm": 0.978674343771398, "learning_rate": 4.2826112285398395e-06, "loss": 0.0507, "step": 2719 }, { "epoch": 1.237488626023658, "grad_norm": 0.5339344105322079, "learning_rate": 4.282110102567145e-06, "loss": 0.0129, "step": 2720 }, { "epoch": 1.237943585077343, "grad_norm": 0.5942512671949535, "learning_rate": 4.28160883096846e-06, "loss": 0.0202, "step": 2721 }, { "epoch": 1.2383985441310281, "grad_norm": 0.564039136663134, "learning_rate": 4.281107413784747e-06, "loss": 0.0141, "step": 2722 }, { "epoch": 1.2388535031847134, "grad_norm": 0.42797274135047625, "learning_rate": 4.28060585105698e-06, "loss": 0.0163, "step": 2723 }, { "epoch": 1.2393084622383985, "grad_norm": 0.5526854916080111, "learning_rate": 4.280104142826143e-06, "loss": 0.0266, "step": 2724 }, { "epoch": 1.2397634212920838, "grad_norm": 1.1014939973273092, "learning_rate": 4.2796022891332355e-06, "loss": 0.0457, "step": 2725 }, { "epoch": 1.240218380345769, "grad_norm": 0.597089268959557, "learning_rate": 4.279100290019265e-06, "loss": 0.0229, "step": 2726 }, { "epoch": 1.240673339399454, "grad_norm": 0.6166593528194175, "learning_rate": 4.278598145525253e-06, "loss": 0.0314, "step": 2727 }, { "epoch": 1.2411282984531393, "grad_norm": 0.5739818877716178, "learning_rate": 4.278095855692233e-06, "loss": 0.028, "step": 2728 }, { "epoch": 1.2415832575068244, "grad_norm": 0.3861409302981483, "learning_rate": 4.277593420561249e-06, "loss": 0.0137, "step": 2729 }, { "epoch": 1.2420382165605095, "grad_norm": 0.8505238122375511, "learning_rate": 4.277090840173359e-06, "loss": 0.0369, "step": 2730 }, { "epoch": 1.2424931756141948, "grad_norm": 0.6937534569187743, "learning_rate": 4.276588114569631e-06, "loss": 0.0346, "step": 2731 }, { "epoch": 1.24294813466788, "grad_norm": 0.4879993817467368, "learning_rate": 4.2760852437911436e-06, "loss": 0.0226, "step": 2732 }, { "epoch": 1.243403093721565, "grad_norm": 0.4704653935774365, "learning_rate": 4.2755822278789926e-06, "loss": 0.0185, "step": 2733 }, { "epoch": 1.2438580527752503, "grad_norm": 0.34901060315473925, "learning_rate": 4.2750790668742795e-06, "loss": 0.0154, "step": 2734 }, { "epoch": 1.2443130118289354, "grad_norm": 0.9393836760411997, "learning_rate": 4.274575760818122e-06, "loss": 0.0258, "step": 2735 }, { "epoch": 1.2447679708826205, "grad_norm": 0.63450452027728, "learning_rate": 4.274072309751646e-06, "loss": 0.026, "step": 2736 }, { "epoch": 1.2452229299363058, "grad_norm": 0.8168459205839178, "learning_rate": 4.273568713715993e-06, "loss": 0.0305, "step": 2737 }, { "epoch": 1.2456778889899909, "grad_norm": 0.6809047059242845, "learning_rate": 4.2730649727523145e-06, "loss": 0.0341, "step": 2738 }, { "epoch": 1.246132848043676, "grad_norm": 0.6936049408714152, "learning_rate": 4.272561086901773e-06, "loss": 0.0234, "step": 2739 }, { "epoch": 1.2465878070973613, "grad_norm": 0.5711940438181513, "learning_rate": 4.272057056205544e-06, "loss": 0.0232, "step": 2740 }, { "epoch": 1.2470427661510464, "grad_norm": 0.5131420714637235, "learning_rate": 4.271552880704815e-06, "loss": 0.0235, "step": 2741 }, { "epoch": 1.2474977252047315, "grad_norm": 0.5152210269306549, "learning_rate": 4.271048560440786e-06, "loss": 0.0261, "step": 2742 }, { "epoch": 1.2479526842584168, "grad_norm": 0.5629080978039249, "learning_rate": 4.2705440954546665e-06, "loss": 0.0322, "step": 2743 }, { "epoch": 1.2484076433121019, "grad_norm": 0.6865649256488602, "learning_rate": 4.270039485787678e-06, "loss": 0.0302, "step": 2744 }, { "epoch": 1.2488626023657872, "grad_norm": 0.6124115080840975, "learning_rate": 4.269534731481057e-06, "loss": 0.0227, "step": 2745 }, { "epoch": 1.2493175614194723, "grad_norm": 0.4441783467736665, "learning_rate": 4.269029832576048e-06, "loss": 0.014, "step": 2746 }, { "epoch": 1.2497725204731573, "grad_norm": 0.6342777023583722, "learning_rate": 4.2685247891139114e-06, "loss": 0.021, "step": 2747 }, { "epoch": 1.2502274795268427, "grad_norm": 0.4671045785013319, "learning_rate": 4.268019601135914e-06, "loss": 0.0272, "step": 2748 }, { "epoch": 1.2506824385805277, "grad_norm": 0.668394552858572, "learning_rate": 4.26751426868334e-06, "loss": 0.0171, "step": 2749 }, { "epoch": 1.251137397634213, "grad_norm": 0.6900498602539912, "learning_rate": 4.2670087917974826e-06, "loss": 0.0304, "step": 2750 }, { "epoch": 1.2515923566878981, "grad_norm": 0.5403025651863708, "learning_rate": 4.266503170519645e-06, "loss": 0.0192, "step": 2751 }, { "epoch": 1.2520473157415832, "grad_norm": 0.980772795905362, "learning_rate": 4.265997404891147e-06, "loss": 0.0507, "step": 2752 }, { "epoch": 1.2525022747952685, "grad_norm": 0.4711111793060614, "learning_rate": 4.265491494953316e-06, "loss": 0.0181, "step": 2753 }, { "epoch": 1.2529572338489536, "grad_norm": 0.6991553417108572, "learning_rate": 4.2649854407474925e-06, "loss": 0.0326, "step": 2754 }, { "epoch": 1.2534121929026387, "grad_norm": 0.4624764110725661, "learning_rate": 4.26447924231503e-06, "loss": 0.0193, "step": 2755 }, { "epoch": 1.253867151956324, "grad_norm": 0.5205590680500956, "learning_rate": 4.263972899697292e-06, "loss": 0.0252, "step": 2756 }, { "epoch": 1.2543221110100091, "grad_norm": 0.4239402915389399, "learning_rate": 4.263466412935654e-06, "loss": 0.0198, "step": 2757 }, { "epoch": 1.2547770700636942, "grad_norm": 0.30440994761396317, "learning_rate": 4.262959782071505e-06, "loss": 0.0101, "step": 2758 }, { "epoch": 1.2552320291173795, "grad_norm": 0.661608423060866, "learning_rate": 4.262453007146244e-06, "loss": 0.0302, "step": 2759 }, { "epoch": 1.2556869881710646, "grad_norm": 0.4269537794878538, "learning_rate": 4.261946088201282e-06, "loss": 0.0155, "step": 2760 }, { "epoch": 1.2561419472247497, "grad_norm": 0.5124937588999617, "learning_rate": 4.261439025278044e-06, "loss": 0.0235, "step": 2761 }, { "epoch": 1.256596906278435, "grad_norm": 0.5891484571522302, "learning_rate": 4.260931818417962e-06, "loss": 0.022, "step": 2762 }, { "epoch": 1.25705186533212, "grad_norm": 0.43196676673808215, "learning_rate": 4.260424467662484e-06, "loss": 0.0173, "step": 2763 }, { "epoch": 1.2575068243858052, "grad_norm": 0.48458914621643406, "learning_rate": 4.259916973053069e-06, "loss": 0.0246, "step": 2764 }, { "epoch": 1.2579617834394905, "grad_norm": 0.6072652816815827, "learning_rate": 4.2594093346311865e-06, "loss": 0.03, "step": 2765 }, { "epoch": 1.2584167424931756, "grad_norm": 0.4633461488964158, "learning_rate": 4.258901552438319e-06, "loss": 0.0193, "step": 2766 }, { "epoch": 1.2588717015468607, "grad_norm": 0.5390509256812834, "learning_rate": 4.25839362651596e-06, "loss": 0.0178, "step": 2767 }, { "epoch": 1.259326660600546, "grad_norm": 0.6135561122538858, "learning_rate": 4.257885556905613e-06, "loss": 0.0265, "step": 2768 }, { "epoch": 1.259781619654231, "grad_norm": 9.77529026151472, "learning_rate": 4.257377343648799e-06, "loss": 0.164, "step": 2769 }, { "epoch": 1.2602365787079162, "grad_norm": 0.8549509883223334, "learning_rate": 4.256868986787044e-06, "loss": 0.0386, "step": 2770 }, { "epoch": 1.2606915377616015, "grad_norm": 0.5619658661669584, "learning_rate": 4.256360486361889e-06, "loss": 0.0172, "step": 2771 }, { "epoch": 1.2611464968152866, "grad_norm": 0.3706477106784628, "learning_rate": 4.255851842414887e-06, "loss": 0.0121, "step": 2772 }, { "epoch": 1.2616014558689717, "grad_norm": 1.0861459812552854, "learning_rate": 4.255343054987601e-06, "loss": 0.048, "step": 2773 }, { "epoch": 1.262056414922657, "grad_norm": 0.4626982196257253, "learning_rate": 4.2548341241216085e-06, "loss": 0.0123, "step": 2774 }, { "epoch": 1.262511373976342, "grad_norm": 0.5255340575042864, "learning_rate": 4.254325049858496e-06, "loss": 0.0225, "step": 2775 }, { "epoch": 1.2629663330300274, "grad_norm": 0.4881043727343809, "learning_rate": 4.2538158322398625e-06, "loss": 0.0189, "step": 2776 }, { "epoch": 1.2634212920837125, "grad_norm": 0.43273322558327976, "learning_rate": 4.2533064713073195e-06, "loss": 0.0158, "step": 2777 }, { "epoch": 1.2638762511373978, "grad_norm": 0.4857284264595445, "learning_rate": 4.252796967102489e-06, "loss": 0.0193, "step": 2778 }, { "epoch": 1.2643312101910829, "grad_norm": 0.6384142420021423, "learning_rate": 4.2522873196670065e-06, "loss": 0.0277, "step": 2779 }, { "epoch": 1.264786169244768, "grad_norm": 0.589124877695731, "learning_rate": 4.2517775290425175e-06, "loss": 0.015, "step": 2780 }, { "epoch": 1.2652411282984533, "grad_norm": 0.6763414782155324, "learning_rate": 4.251267595270681e-06, "loss": 0.0361, "step": 2781 }, { "epoch": 1.2656960873521383, "grad_norm": 0.8967701103335196, "learning_rate": 4.250757518393163e-06, "loss": 0.0345, "step": 2782 }, { "epoch": 1.2661510464058234, "grad_norm": 0.7419598990931897, "learning_rate": 4.250247298451649e-06, "loss": 0.0304, "step": 2783 }, { "epoch": 1.2666060054595087, "grad_norm": 0.5353914883857681, "learning_rate": 4.249736935487828e-06, "loss": 0.0207, "step": 2784 }, { "epoch": 1.2670609645131938, "grad_norm": 0.5717840719643298, "learning_rate": 4.249226429543408e-06, "loss": 0.0241, "step": 2785 }, { "epoch": 1.267515923566879, "grad_norm": 0.7539610092518788, "learning_rate": 4.248715780660102e-06, "loss": 0.0308, "step": 2786 }, { "epoch": 1.2679708826205642, "grad_norm": 0.7217352721088893, "learning_rate": 4.2482049888796405e-06, "loss": 0.0253, "step": 2787 }, { "epoch": 1.2684258416742493, "grad_norm": 0.4520410375502876, "learning_rate": 4.247694054243762e-06, "loss": 0.0166, "step": 2788 }, { "epoch": 1.2688808007279344, "grad_norm": 0.4556016017243014, "learning_rate": 4.247182976794218e-06, "loss": 0.018, "step": 2789 }, { "epoch": 1.2693357597816197, "grad_norm": 0.6363419978670682, "learning_rate": 4.246671756572771e-06, "loss": 0.029, "step": 2790 }, { "epoch": 1.2697907188353048, "grad_norm": 0.6386809769247306, "learning_rate": 4.246160393621197e-06, "loss": 0.0237, "step": 2791 }, { "epoch": 1.27024567788899, "grad_norm": 0.4990900576925178, "learning_rate": 4.2456488879812805e-06, "loss": 0.0156, "step": 2792 }, { "epoch": 1.2707006369426752, "grad_norm": 0.6029372232546744, "learning_rate": 4.24513723969482e-06, "loss": 0.0242, "step": 2793 }, { "epoch": 1.2711555959963603, "grad_norm": 0.8051049668072983, "learning_rate": 4.244625448803625e-06, "loss": 0.0349, "step": 2794 }, { "epoch": 1.2716105550500454, "grad_norm": 0.5721627664779094, "learning_rate": 4.244113515349517e-06, "loss": 0.0317, "step": 2795 }, { "epoch": 1.2720655141037307, "grad_norm": 0.38177683285724745, "learning_rate": 4.243601439374329e-06, "loss": 0.0109, "step": 2796 }, { "epoch": 1.2725204731574158, "grad_norm": 0.3860734101533411, "learning_rate": 4.243089220919906e-06, "loss": 0.0177, "step": 2797 }, { "epoch": 1.2729754322111009, "grad_norm": 0.49565665719286456, "learning_rate": 4.242576860028103e-06, "loss": 0.0152, "step": 2798 }, { "epoch": 1.2734303912647862, "grad_norm": 0.6787639424386418, "learning_rate": 4.242064356740789e-06, "loss": 0.0222, "step": 2799 }, { "epoch": 1.2738853503184713, "grad_norm": 0.42802741447037806, "learning_rate": 4.2415517110998415e-06, "loss": 0.0136, "step": 2800 }, { "epoch": 1.2743403093721566, "grad_norm": 2.411461244356516, "learning_rate": 4.241038923147155e-06, "loss": 0.0668, "step": 2801 }, { "epoch": 1.2747952684258417, "grad_norm": 0.5980800042116179, "learning_rate": 4.240525992924629e-06, "loss": 0.0251, "step": 2802 }, { "epoch": 1.2752502274795268, "grad_norm": 0.5919404298777446, "learning_rate": 4.240012920474179e-06, "loss": 0.0265, "step": 2803 }, { "epoch": 1.275705186533212, "grad_norm": 0.6697149704004297, "learning_rate": 4.239499705837731e-06, "loss": 0.0221, "step": 2804 }, { "epoch": 1.2761601455868972, "grad_norm": 0.5180009025750582, "learning_rate": 4.238986349057223e-06, "loss": 0.0176, "step": 2805 }, { "epoch": 1.2766151046405825, "grad_norm": 0.7241572161326929, "learning_rate": 4.238472850174603e-06, "loss": 0.0307, "step": 2806 }, { "epoch": 1.2770700636942676, "grad_norm": 0.7167298252754748, "learning_rate": 4.2379592092318326e-06, "loss": 0.0311, "step": 2807 }, { "epoch": 1.2775250227479527, "grad_norm": 0.9125971645241219, "learning_rate": 4.237445426270884e-06, "loss": 0.0368, "step": 2808 }, { "epoch": 1.277979981801638, "grad_norm": 0.6955622824211942, "learning_rate": 4.236931501333742e-06, "loss": 0.0401, "step": 2809 }, { "epoch": 1.278434940855323, "grad_norm": 0.6822684286332794, "learning_rate": 4.236417434462401e-06, "loss": 0.0408, "step": 2810 }, { "epoch": 1.2788898999090081, "grad_norm": 0.5716282558922546, "learning_rate": 4.23590322569887e-06, "loss": 0.0276, "step": 2811 }, { "epoch": 1.2793448589626935, "grad_norm": 0.8045248725656529, "learning_rate": 4.2353888750851655e-06, "loss": 0.0342, "step": 2812 }, { "epoch": 1.2797998180163785, "grad_norm": 0.5829303425508636, "learning_rate": 4.2348743826633195e-06, "loss": 0.0228, "step": 2813 }, { "epoch": 1.2802547770700636, "grad_norm": 0.3606406351198297, "learning_rate": 4.234359748475374e-06, "loss": 0.0095, "step": 2814 }, { "epoch": 1.280709736123749, "grad_norm": 0.5750903549771339, "learning_rate": 4.233844972563382e-06, "loss": 0.0211, "step": 2815 }, { "epoch": 1.281164695177434, "grad_norm": 0.5079770252768003, "learning_rate": 4.233330054969409e-06, "loss": 0.0187, "step": 2816 }, { "epoch": 1.2816196542311191, "grad_norm": 0.5957559449093126, "learning_rate": 4.23281499573553e-06, "loss": 0.0221, "step": 2817 }, { "epoch": 1.2820746132848044, "grad_norm": 0.4741096882523198, "learning_rate": 4.232299794903837e-06, "loss": 0.0216, "step": 2818 }, { "epoch": 1.2825295723384895, "grad_norm": 0.5565236749852849, "learning_rate": 4.2317844525164265e-06, "loss": 0.0199, "step": 2819 }, { "epoch": 1.2829845313921746, "grad_norm": 0.6913898026559677, "learning_rate": 4.2312689686154115e-06, "loss": 0.0275, "step": 2820 }, { "epoch": 1.28343949044586, "grad_norm": 0.5448158752536557, "learning_rate": 4.230753343242915e-06, "loss": 0.0198, "step": 2821 }, { "epoch": 1.283894449499545, "grad_norm": 0.5785731155407168, "learning_rate": 4.230237576441071e-06, "loss": 0.0223, "step": 2822 }, { "epoch": 1.28434940855323, "grad_norm": 0.5022779878541892, "learning_rate": 4.229721668252026e-06, "loss": 0.0214, "step": 2823 }, { "epoch": 1.2848043676069154, "grad_norm": 0.654337012356161, "learning_rate": 4.2292056187179374e-06, "loss": 0.0309, "step": 2824 }, { "epoch": 1.2852593266606005, "grad_norm": 0.6919914477024262, "learning_rate": 4.228689427880975e-06, "loss": 0.028, "step": 2825 }, { "epoch": 1.2857142857142856, "grad_norm": 0.4782749797029657, "learning_rate": 4.228173095783319e-06, "loss": 0.0158, "step": 2826 }, { "epoch": 1.286169244767971, "grad_norm": 0.6918834440829104, "learning_rate": 4.227656622467162e-06, "loss": 0.0294, "step": 2827 }, { "epoch": 1.286624203821656, "grad_norm": 0.6095753344564729, "learning_rate": 4.2271400079747085e-06, "loss": 0.0189, "step": 2828 }, { "epoch": 1.2870791628753413, "grad_norm": 0.6000891224876131, "learning_rate": 4.2266232523481724e-06, "loss": 0.0183, "step": 2829 }, { "epoch": 1.2875341219290264, "grad_norm": 0.5782268417165628, "learning_rate": 4.226106355629781e-06, "loss": 0.0217, "step": 2830 }, { "epoch": 1.2879890809827115, "grad_norm": 0.672780854857564, "learning_rate": 4.225589317861775e-06, "loss": 0.0237, "step": 2831 }, { "epoch": 1.2884440400363968, "grad_norm": 0.421513804128696, "learning_rate": 4.225072139086401e-06, "loss": 0.016, "step": 2832 }, { "epoch": 1.2888989990900819, "grad_norm": 0.6961950913783308, "learning_rate": 4.224554819345923e-06, "loss": 0.0322, "step": 2833 }, { "epoch": 1.2893539581437672, "grad_norm": 0.6627847441657934, "learning_rate": 4.224037358682614e-06, "loss": 0.033, "step": 2834 }, { "epoch": 1.2898089171974523, "grad_norm": 0.4109901835435224, "learning_rate": 4.223519757138756e-06, "loss": 0.0159, "step": 2835 }, { "epoch": 1.2902638762511374, "grad_norm": 0.48996513048334367, "learning_rate": 4.223002014756647e-06, "loss": 0.0118, "step": 2836 }, { "epoch": 1.2907188353048227, "grad_norm": 0.5197825144686319, "learning_rate": 4.222484131578595e-06, "loss": 0.018, "step": 2837 }, { "epoch": 1.2911737943585078, "grad_norm": 0.6196176741621322, "learning_rate": 4.221966107646918e-06, "loss": 0.0215, "step": 2838 }, { "epoch": 1.2916287534121929, "grad_norm": 0.4809197909639679, "learning_rate": 4.221447943003947e-06, "loss": 0.0182, "step": 2839 }, { "epoch": 1.2920837124658782, "grad_norm": 0.5637491655107516, "learning_rate": 4.2209296376920254e-06, "loss": 0.0241, "step": 2840 }, { "epoch": 1.2925386715195633, "grad_norm": 0.5091598497989657, "learning_rate": 4.220411191753504e-06, "loss": 0.0206, "step": 2841 }, { "epoch": 1.2929936305732483, "grad_norm": 0.5260279573652854, "learning_rate": 4.21989260523075e-06, "loss": 0.0227, "step": 2842 }, { "epoch": 1.2934485896269337, "grad_norm": 0.533114008746521, "learning_rate": 4.219373878166139e-06, "loss": 0.0241, "step": 2843 }, { "epoch": 1.2939035486806187, "grad_norm": 0.715496752958288, "learning_rate": 4.21885501060206e-06, "loss": 0.0332, "step": 2844 }, { "epoch": 1.2943585077343038, "grad_norm": 0.7190547999653241, "learning_rate": 4.21833600258091e-06, "loss": 0.0218, "step": 2845 }, { "epoch": 1.2948134667879891, "grad_norm": 0.8834194627984504, "learning_rate": 4.217816854145103e-06, "loss": 0.065, "step": 2846 }, { "epoch": 1.2952684258416742, "grad_norm": 0.559679452911507, "learning_rate": 4.2172975653370605e-06, "loss": 0.0192, "step": 2847 }, { "epoch": 1.2957233848953593, "grad_norm": 0.6757787650161827, "learning_rate": 4.216778136199216e-06, "loss": 0.0324, "step": 2848 }, { "epoch": 1.2961783439490446, "grad_norm": 0.6089214602754296, "learning_rate": 4.216258566774015e-06, "loss": 0.0236, "step": 2849 }, { "epoch": 1.2966333030027297, "grad_norm": 0.7300661825007071, "learning_rate": 4.215738857103915e-06, "loss": 0.0348, "step": 2850 }, { "epoch": 1.2970882620564148, "grad_norm": 0.5680795748113346, "learning_rate": 4.215219007231382e-06, "loss": 0.0237, "step": 2851 }, { "epoch": 1.2975432211101001, "grad_norm": 0.7173691689634865, "learning_rate": 4.214699017198899e-06, "loss": 0.0239, "step": 2852 }, { "epoch": 1.2979981801637852, "grad_norm": 0.5947468711182669, "learning_rate": 4.214178887048956e-06, "loss": 0.0223, "step": 2853 }, { "epoch": 1.2984531392174703, "grad_norm": 0.6420155276039037, "learning_rate": 4.213658616824055e-06, "loss": 0.0326, "step": 2854 }, { "epoch": 1.2989080982711556, "grad_norm": 0.6390694990330569, "learning_rate": 4.213138206566711e-06, "loss": 0.0273, "step": 2855 }, { "epoch": 1.2993630573248407, "grad_norm": 0.44274048534178945, "learning_rate": 4.21261765631945e-06, "loss": 0.0197, "step": 2856 }, { "epoch": 1.299818016378526, "grad_norm": 0.661916330405343, "learning_rate": 4.212096966124807e-06, "loss": 0.0311, "step": 2857 }, { "epoch": 1.300272975432211, "grad_norm": 0.6435231707439829, "learning_rate": 4.2115761360253325e-06, "loss": 0.0263, "step": 2858 }, { "epoch": 1.3007279344858962, "grad_norm": 0.5981565976648671, "learning_rate": 4.211055166063585e-06, "loss": 0.0198, "step": 2859 }, { "epoch": 1.3011828935395815, "grad_norm": 0.561549400448317, "learning_rate": 4.210534056282136e-06, "loss": 0.0145, "step": 2860 }, { "epoch": 1.3016378525932666, "grad_norm": 0.518443153784379, "learning_rate": 4.21001280672357e-06, "loss": 0.0203, "step": 2861 }, { "epoch": 1.302092811646952, "grad_norm": 0.5417281859185272, "learning_rate": 4.209491417430479e-06, "loss": 0.0254, "step": 2862 }, { "epoch": 1.302547770700637, "grad_norm": 0.6510257580750398, "learning_rate": 4.208969888445469e-06, "loss": 0.0258, "step": 2863 }, { "epoch": 1.303002729754322, "grad_norm": 1.2628676554442981, "learning_rate": 4.208448219811158e-06, "loss": 0.03, "step": 2864 }, { "epoch": 1.3034576888080074, "grad_norm": 0.5017599571522583, "learning_rate": 4.207926411570172e-06, "loss": 0.0188, "step": 2865 }, { "epoch": 1.3039126478616925, "grad_norm": 0.46506741747107516, "learning_rate": 4.207404463765155e-06, "loss": 0.0247, "step": 2866 }, { "epoch": 1.3043676069153776, "grad_norm": 0.6077204885385615, "learning_rate": 4.2068823764387545e-06, "loss": 0.0329, "step": 2867 }, { "epoch": 1.3048225659690629, "grad_norm": 0.5556402652600926, "learning_rate": 4.206360149633635e-06, "loss": 0.0274, "step": 2868 }, { "epoch": 1.305277525022748, "grad_norm": 0.5553997725592712, "learning_rate": 4.205837783392469e-06, "loss": 0.0228, "step": 2869 }, { "epoch": 1.305732484076433, "grad_norm": 0.5961926358326571, "learning_rate": 4.205315277757943e-06, "loss": 0.0288, "step": 2870 }, { "epoch": 1.3061874431301184, "grad_norm": 0.7022966753338193, "learning_rate": 4.204792632772754e-06, "loss": 0.0368, "step": 2871 }, { "epoch": 1.3066424021838035, "grad_norm": 0.553176305683213, "learning_rate": 4.204269848479611e-06, "loss": 0.0166, "step": 2872 }, { "epoch": 1.3070973612374885, "grad_norm": 0.6673883943643991, "learning_rate": 4.203746924921231e-06, "loss": 0.0182, "step": 2873 }, { "epoch": 1.3075523202911739, "grad_norm": 0.4250980507545852, "learning_rate": 4.203223862140347e-06, "loss": 0.0146, "step": 2874 }, { "epoch": 1.308007279344859, "grad_norm": 0.584977310068724, "learning_rate": 4.2027006601797e-06, "loss": 0.0255, "step": 2875 }, { "epoch": 1.308462238398544, "grad_norm": 0.6022594998733989, "learning_rate": 4.202177319082045e-06, "loss": 0.0242, "step": 2876 }, { "epoch": 1.3089171974522293, "grad_norm": 0.7502986494718045, "learning_rate": 4.201653838890146e-06, "loss": 0.0347, "step": 2877 }, { "epoch": 1.3093721565059144, "grad_norm": 0.5980053178715274, "learning_rate": 4.20113021964678e-06, "loss": 0.0196, "step": 2878 }, { "epoch": 1.3098271155595995, "grad_norm": 0.4406338596368606, "learning_rate": 4.200606461394735e-06, "loss": 0.0133, "step": 2879 }, { "epoch": 1.3102820746132848, "grad_norm": 1.149413207067457, "learning_rate": 4.200082564176809e-06, "loss": 0.0543, "step": 2880 }, { "epoch": 1.31073703366697, "grad_norm": 0.5947980427387766, "learning_rate": 4.199558528035814e-06, "loss": 0.021, "step": 2881 }, { "epoch": 1.311191992720655, "grad_norm": 0.8151453929654713, "learning_rate": 4.199034353014572e-06, "loss": 0.0377, "step": 2882 }, { "epoch": 1.3116469517743403, "grad_norm": 1.068155005333274, "learning_rate": 4.198510039155914e-06, "loss": 0.068, "step": 2883 }, { "epoch": 1.3121019108280254, "grad_norm": 0.5572646955872677, "learning_rate": 4.197985586502686e-06, "loss": 0.019, "step": 2884 }, { "epoch": 1.3125568698817107, "grad_norm": 0.7818019627481866, "learning_rate": 4.197460995097745e-06, "loss": 0.0321, "step": 2885 }, { "epoch": 1.3130118289353958, "grad_norm": 0.7089080261247177, "learning_rate": 4.1969362649839565e-06, "loss": 0.0289, "step": 2886 }, { "epoch": 1.3134667879890811, "grad_norm": 0.3919993320458887, "learning_rate": 4.1964113962042e-06, "loss": 0.0154, "step": 2887 }, { "epoch": 1.3139217470427662, "grad_norm": 0.6936353932870721, "learning_rate": 4.195886388801364e-06, "loss": 0.0288, "step": 2888 }, { "epoch": 1.3143767060964513, "grad_norm": 0.604647177949053, "learning_rate": 4.195361242818354e-06, "loss": 0.0293, "step": 2889 }, { "epoch": 1.3148316651501366, "grad_norm": 0.7531452935186627, "learning_rate": 4.194835958298076e-06, "loss": 0.043, "step": 2890 }, { "epoch": 1.3152866242038217, "grad_norm": 0.4066027317342002, "learning_rate": 4.194310535283459e-06, "loss": 0.013, "step": 2891 }, { "epoch": 1.3157415832575068, "grad_norm": 0.6284459518815321, "learning_rate": 4.193784973817436e-06, "loss": 0.0176, "step": 2892 }, { "epoch": 1.316196542311192, "grad_norm": 0.6866940604048498, "learning_rate": 4.193259273942954e-06, "loss": 0.0344, "step": 2893 }, { "epoch": 1.3166515013648772, "grad_norm": 0.4840883086023792, "learning_rate": 4.192733435702971e-06, "loss": 0.022, "step": 2894 }, { "epoch": 1.3171064604185623, "grad_norm": 0.5536571277591115, "learning_rate": 4.192207459140456e-06, "loss": 0.0206, "step": 2895 }, { "epoch": 1.3175614194722476, "grad_norm": 0.7165221835225527, "learning_rate": 4.1916813442983895e-06, "loss": 0.0298, "step": 2896 }, { "epoch": 1.3180163785259327, "grad_norm": 0.6305232437397477, "learning_rate": 4.191155091219763e-06, "loss": 0.0267, "step": 2897 }, { "epoch": 1.3184713375796178, "grad_norm": 0.5579202333359629, "learning_rate": 4.1906286999475785e-06, "loss": 0.0291, "step": 2898 }, { "epoch": 1.318926296633303, "grad_norm": 0.5392021658327838, "learning_rate": 4.190102170524853e-06, "loss": 0.0242, "step": 2899 }, { "epoch": 1.3193812556869882, "grad_norm": 0.5800078554925919, "learning_rate": 4.18957550299461e-06, "loss": 0.0242, "step": 2900 }, { "epoch": 1.3198362147406733, "grad_norm": 0.5862214666422206, "learning_rate": 4.189048697399887e-06, "loss": 0.0215, "step": 2901 }, { "epoch": 1.3202911737943586, "grad_norm": 0.5620076241035448, "learning_rate": 4.188521753783732e-06, "loss": 0.0171, "step": 2902 }, { "epoch": 1.3207461328480437, "grad_norm": 0.6812119340304829, "learning_rate": 4.187994672189205e-06, "loss": 0.0233, "step": 2903 }, { "epoch": 1.3212010919017287, "grad_norm": 0.5812406311069044, "learning_rate": 4.187467452659376e-06, "loss": 0.0344, "step": 2904 }, { "epoch": 1.321656050955414, "grad_norm": 0.7247748495422082, "learning_rate": 4.186940095237327e-06, "loss": 0.0342, "step": 2905 }, { "epoch": 1.3221110100090991, "grad_norm": 0.4582523888047295, "learning_rate": 4.186412599966152e-06, "loss": 0.025, "step": 2906 }, { "epoch": 1.3225659690627842, "grad_norm": 0.4205471072755918, "learning_rate": 4.185884966888954e-06, "loss": 0.0139, "step": 2907 }, { "epoch": 1.3230209281164695, "grad_norm": 0.45492352353869137, "learning_rate": 4.185357196048852e-06, "loss": 0.0163, "step": 2908 }, { "epoch": 1.3234758871701546, "grad_norm": 0.5532805935798129, "learning_rate": 4.1848292874889694e-06, "loss": 0.0244, "step": 2909 }, { "epoch": 1.3239308462238397, "grad_norm": 0.5324603819929723, "learning_rate": 4.184301241252447e-06, "loss": 0.0208, "step": 2910 }, { "epoch": 1.324385805277525, "grad_norm": 0.34423546386086223, "learning_rate": 4.183773057382432e-06, "loss": 0.0147, "step": 2911 }, { "epoch": 1.3248407643312101, "grad_norm": 0.615818224239299, "learning_rate": 4.183244735922087e-06, "loss": 0.0218, "step": 2912 }, { "epoch": 1.3252957233848954, "grad_norm": 0.6549136207399878, "learning_rate": 4.182716276914585e-06, "loss": 0.0217, "step": 2913 }, { "epoch": 1.3257506824385805, "grad_norm": 0.5738916314899165, "learning_rate": 4.182187680403107e-06, "loss": 0.0162, "step": 2914 }, { "epoch": 1.3262056414922658, "grad_norm": 0.5594899471027078, "learning_rate": 4.181658946430848e-06, "loss": 0.0245, "step": 2915 }, { "epoch": 1.326660600545951, "grad_norm": 0.4109916930431615, "learning_rate": 4.181130075041015e-06, "loss": 0.0137, "step": 2916 }, { "epoch": 1.327115559599636, "grad_norm": 0.53422383536034, "learning_rate": 4.180601066276824e-06, "loss": 0.0216, "step": 2917 }, { "epoch": 1.3275705186533213, "grad_norm": 0.5383134563032649, "learning_rate": 4.180071920181503e-06, "loss": 0.0177, "step": 2918 }, { "epoch": 1.3280254777070064, "grad_norm": 0.5715076816892543, "learning_rate": 4.179542636798292e-06, "loss": 0.0179, "step": 2919 }, { "epoch": 1.3284804367606915, "grad_norm": 0.5437336115867218, "learning_rate": 4.1790132161704415e-06, "loss": 0.0211, "step": 2920 }, { "epoch": 1.3289353958143768, "grad_norm": 0.5527858593958116, "learning_rate": 4.178483658341213e-06, "loss": 0.0186, "step": 2921 }, { "epoch": 1.329390354868062, "grad_norm": 0.9925849620548352, "learning_rate": 4.17795396335388e-06, "loss": 0.047, "step": 2922 }, { "epoch": 1.329845313921747, "grad_norm": 0.7673987404091294, "learning_rate": 4.177424131251728e-06, "loss": 0.0361, "step": 2923 }, { "epoch": 1.3303002729754323, "grad_norm": 0.54250801803978, "learning_rate": 4.17689416207805e-06, "loss": 0.0199, "step": 2924 }, { "epoch": 1.3307552320291174, "grad_norm": 0.5272091207239011, "learning_rate": 4.176364055876154e-06, "loss": 0.0151, "step": 2925 }, { "epoch": 1.3312101910828025, "grad_norm": 0.5716765753458435, "learning_rate": 4.175833812689357e-06, "loss": 0.0194, "step": 2926 }, { "epoch": 1.3316651501364878, "grad_norm": 0.5032428892240143, "learning_rate": 4.17530343256099e-06, "loss": 0.0183, "step": 2927 }, { "epoch": 1.3321201091901729, "grad_norm": 0.913143929461543, "learning_rate": 4.174772915534392e-06, "loss": 0.0339, "step": 2928 }, { "epoch": 1.332575068243858, "grad_norm": 0.7146053626191031, "learning_rate": 4.174242261652914e-06, "loss": 0.034, "step": 2929 }, { "epoch": 1.3330300272975433, "grad_norm": 0.44685473112764684, "learning_rate": 4.173711470959919e-06, "loss": 0.0167, "step": 2930 }, { "epoch": 1.3334849863512284, "grad_norm": 0.5980701079569753, "learning_rate": 4.173180543498782e-06, "loss": 0.0276, "step": 2931 }, { "epoch": 1.3339399454049135, "grad_norm": 0.5027377779697596, "learning_rate": 4.1726494793128864e-06, "loss": 0.014, "step": 2932 }, { "epoch": 1.3343949044585988, "grad_norm": 0.5163509304756547, "learning_rate": 4.172118278445629e-06, "loss": 0.0201, "step": 2933 }, { "epoch": 1.3348498635122839, "grad_norm": 0.7718540702464267, "learning_rate": 4.171586940940417e-06, "loss": 0.0439, "step": 2934 }, { "epoch": 1.335304822565969, "grad_norm": 0.6284032907303854, "learning_rate": 4.171055466840669e-06, "loss": 0.0232, "step": 2935 }, { "epoch": 1.3357597816196543, "grad_norm": 1.0175122945856574, "learning_rate": 4.1705238561898144e-06, "loss": 0.0351, "step": 2936 }, { "epoch": 1.3362147406733393, "grad_norm": 0.46586902818145487, "learning_rate": 4.169992109031295e-06, "loss": 0.0155, "step": 2937 }, { "epoch": 1.3366696997270244, "grad_norm": 0.44893772743350846, "learning_rate": 4.169460225408562e-06, "loss": 0.0199, "step": 2938 }, { "epoch": 1.3371246587807097, "grad_norm": 0.8418229866849047, "learning_rate": 4.1689282053650786e-06, "loss": 0.0363, "step": 2939 }, { "epoch": 1.3375796178343948, "grad_norm": 0.4579693163374815, "learning_rate": 4.168396048944318e-06, "loss": 0.0224, "step": 2940 }, { "epoch": 1.3380345768880801, "grad_norm": 0.4215116697549558, "learning_rate": 4.167863756189767e-06, "loss": 0.0189, "step": 2941 }, { "epoch": 1.3384895359417652, "grad_norm": 0.4819574773962544, "learning_rate": 4.167331327144924e-06, "loss": 0.0192, "step": 2942 }, { "epoch": 1.3389444949954505, "grad_norm": 0.6590169559925403, "learning_rate": 4.166798761853291e-06, "loss": 0.0302, "step": 2943 }, { "epoch": 1.3393994540491356, "grad_norm": 0.5837457001514973, "learning_rate": 4.1662660603583936e-06, "loss": 0.0244, "step": 2944 }, { "epoch": 1.3398544131028207, "grad_norm": 0.6015904457920979, "learning_rate": 4.165733222703757e-06, "loss": 0.0311, "step": 2945 }, { "epoch": 1.340309372156506, "grad_norm": 0.685253409310759, "learning_rate": 4.165200248932923e-06, "loss": 0.02, "step": 2946 }, { "epoch": 1.3407643312101911, "grad_norm": 0.7220938837370316, "learning_rate": 4.164667139089446e-06, "loss": 0.0345, "step": 2947 }, { "epoch": 1.3412192902638762, "grad_norm": 0.4504885628402017, "learning_rate": 4.164133893216888e-06, "loss": 0.0161, "step": 2948 }, { "epoch": 1.3416742493175615, "grad_norm": 0.654778082115673, "learning_rate": 4.163600511358823e-06, "loss": 0.033, "step": 2949 }, { "epoch": 1.3421292083712466, "grad_norm": 0.5883225971322004, "learning_rate": 4.163066993558837e-06, "loss": 0.0314, "step": 2950 }, { "epoch": 1.3425841674249317, "grad_norm": 0.5896302652046609, "learning_rate": 4.1625333398605265e-06, "loss": 0.0257, "step": 2951 }, { "epoch": 1.343039126478617, "grad_norm": 0.6116556629810576, "learning_rate": 4.1619995503075e-06, "loss": 0.0232, "step": 2952 }, { "epoch": 1.343494085532302, "grad_norm": 0.7747409038134005, "learning_rate": 4.161465624943375e-06, "loss": 0.0283, "step": 2953 }, { "epoch": 1.3439490445859872, "grad_norm": 0.3783262451434524, "learning_rate": 4.1609315638117825e-06, "loss": 0.0133, "step": 2954 }, { "epoch": 1.3444040036396725, "grad_norm": 0.6903780649459388, "learning_rate": 4.160397366956364e-06, "loss": 0.039, "step": 2955 }, { "epoch": 1.3448589626933576, "grad_norm": 0.468325448617072, "learning_rate": 4.1598630344207705e-06, "loss": 0.0175, "step": 2956 }, { "epoch": 1.3453139217470427, "grad_norm": 0.60765527474527, "learning_rate": 4.159328566248665e-06, "loss": 0.0213, "step": 2957 }, { "epoch": 1.345768880800728, "grad_norm": 0.7428248623031719, "learning_rate": 4.1587939624837225e-06, "loss": 0.0241, "step": 2958 }, { "epoch": 1.346223839854413, "grad_norm": 0.4888241354818987, "learning_rate": 4.15825922316963e-06, "loss": 0.0149, "step": 2959 }, { "epoch": 1.3466787989080982, "grad_norm": 0.5484484705392908, "learning_rate": 4.15772434835008e-06, "loss": 0.0196, "step": 2960 }, { "epoch": 1.3471337579617835, "grad_norm": 1.0565925914634293, "learning_rate": 4.157189338068785e-06, "loss": 0.0413, "step": 2961 }, { "epoch": 1.3475887170154686, "grad_norm": 0.7341973618688173, "learning_rate": 4.156654192369459e-06, "loss": 0.0312, "step": 2962 }, { "epoch": 1.3480436760691537, "grad_norm": 0.4332535942804454, "learning_rate": 4.156118911295835e-06, "loss": 0.0103, "step": 2963 }, { "epoch": 1.348498635122839, "grad_norm": 0.516453626596964, "learning_rate": 4.155583494891651e-06, "loss": 0.0244, "step": 2964 }, { "epoch": 1.348953594176524, "grad_norm": 0.46094446618817314, "learning_rate": 4.155047943200663e-06, "loss": 0.0177, "step": 2965 }, { "epoch": 1.3494085532302094, "grad_norm": 0.5415622725687742, "learning_rate": 4.154512256266629e-06, "loss": 0.0182, "step": 2966 }, { "epoch": 1.3498635122838945, "grad_norm": 0.39310563760094946, "learning_rate": 4.153976434133327e-06, "loss": 0.0118, "step": 2967 }, { "epoch": 1.3503184713375795, "grad_norm": 0.41188508813446434, "learning_rate": 4.153440476844539e-06, "loss": 0.0128, "step": 2968 }, { "epoch": 1.3507734303912649, "grad_norm": 0.7427890993395497, "learning_rate": 4.1529043844440616e-06, "loss": 0.038, "step": 2969 }, { "epoch": 1.35122838944495, "grad_norm": 0.7612641381400927, "learning_rate": 4.1523681569757035e-06, "loss": 0.0392, "step": 2970 }, { "epoch": 1.3516833484986353, "grad_norm": 0.5867693257944069, "learning_rate": 4.151831794483281e-06, "loss": 0.0277, "step": 2971 }, { "epoch": 1.3521383075523203, "grad_norm": 0.44393405116337514, "learning_rate": 4.151295297010623e-06, "loss": 0.0168, "step": 2972 }, { "epoch": 1.3525932666060054, "grad_norm": 0.8173172206406042, "learning_rate": 4.150758664601572e-06, "loss": 0.0417, "step": 2973 }, { "epoch": 1.3530482256596907, "grad_norm": 0.5438123846748695, "learning_rate": 4.1502218972999765e-06, "loss": 0.0285, "step": 2974 }, { "epoch": 1.3535031847133758, "grad_norm": 0.5512371839504099, "learning_rate": 4.1496849951497005e-06, "loss": 0.0211, "step": 2975 }, { "epoch": 1.353958143767061, "grad_norm": 0.6338857840768191, "learning_rate": 4.149147958194617e-06, "loss": 0.0191, "step": 2976 }, { "epoch": 1.3544131028207462, "grad_norm": 0.5430184438020733, "learning_rate": 4.1486107864786095e-06, "loss": 0.0225, "step": 2977 }, { "epoch": 1.3548680618744313, "grad_norm": 0.5515766025691744, "learning_rate": 4.148073480045573e-06, "loss": 0.0178, "step": 2978 }, { "epoch": 1.3553230209281164, "grad_norm": 0.6143856897658074, "learning_rate": 4.147536038939416e-06, "loss": 0.0245, "step": 2979 }, { "epoch": 1.3557779799818017, "grad_norm": 0.52410413088212, "learning_rate": 4.146998463204053e-06, "loss": 0.014, "step": 2980 }, { "epoch": 1.3562329390354868, "grad_norm": 0.8689970476022745, "learning_rate": 4.146460752883413e-06, "loss": 0.0378, "step": 2981 }, { "epoch": 1.356687898089172, "grad_norm": 0.6157945347901961, "learning_rate": 4.145922908021436e-06, "loss": 0.0265, "step": 2982 }, { "epoch": 1.3571428571428572, "grad_norm": 0.593547987823192, "learning_rate": 4.145384928662072e-06, "loss": 0.0211, "step": 2983 }, { "epoch": 1.3575978161965423, "grad_norm": 0.624031958139668, "learning_rate": 4.144846814849282e-06, "loss": 0.0241, "step": 2984 }, { "epoch": 1.3580527752502274, "grad_norm": 0.7734232659670662, "learning_rate": 4.1443085666270375e-06, "loss": 0.0289, "step": 2985 }, { "epoch": 1.3585077343039127, "grad_norm": 0.9699839605495941, "learning_rate": 4.143770184039324e-06, "loss": 0.0493, "step": 2986 }, { "epoch": 1.3589626933575978, "grad_norm": 0.5096255057892579, "learning_rate": 4.143231667130134e-06, "loss": 0.0251, "step": 2987 }, { "epoch": 1.3594176524112829, "grad_norm": 0.5610719525460525, "learning_rate": 4.142693015943472e-06, "loss": 0.0161, "step": 2988 }, { "epoch": 1.3598726114649682, "grad_norm": 0.6484528506962219, "learning_rate": 4.142154230523356e-06, "loss": 0.0222, "step": 2989 }, { "epoch": 1.3603275705186533, "grad_norm": 0.5691271094961611, "learning_rate": 4.141615310913812e-06, "loss": 0.0201, "step": 2990 }, { "epoch": 1.3607825295723384, "grad_norm": 0.585470679430853, "learning_rate": 4.141076257158878e-06, "loss": 0.0206, "step": 2991 }, { "epoch": 1.3612374886260237, "grad_norm": 0.5886159920006029, "learning_rate": 4.1405370693026035e-06, "loss": 0.0273, "step": 2992 }, { "epoch": 1.3616924476797088, "grad_norm": 0.49346300298845935, "learning_rate": 4.139997747389049e-06, "loss": 0.0142, "step": 2993 }, { "epoch": 1.362147406733394, "grad_norm": 0.5623656737186806, "learning_rate": 4.139458291462283e-06, "loss": 0.0269, "step": 2994 }, { "epoch": 1.3626023657870792, "grad_norm": 0.6627238631761317, "learning_rate": 4.13891870156639e-06, "loss": 0.0437, "step": 2995 }, { "epoch": 1.3630573248407643, "grad_norm": 0.37964020676685056, "learning_rate": 4.138378977745462e-06, "loss": 0.0167, "step": 2996 }, { "epoch": 1.3635122838944496, "grad_norm": 0.5631408737685011, "learning_rate": 4.137839120043603e-06, "loss": 0.0218, "step": 2997 }, { "epoch": 1.3639672429481347, "grad_norm": 0.5831152022878606, "learning_rate": 4.137299128504928e-06, "loss": 0.0317, "step": 2998 }, { "epoch": 1.36442220200182, "grad_norm": 0.4861214216364132, "learning_rate": 4.136759003173561e-06, "loss": 0.0161, "step": 2999 }, { "epoch": 1.364877161055505, "grad_norm": 0.5358723278350062, "learning_rate": 4.136218744093641e-06, "loss": 0.0226, "step": 3000 }, { "epoch": 1.3653321201091901, "grad_norm": 0.669520131857986, "learning_rate": 4.1356783513093135e-06, "loss": 0.0358, "step": 3001 }, { "epoch": 1.3657870791628755, "grad_norm": 0.6459513757186216, "learning_rate": 4.135137824864738e-06, "loss": 0.025, "step": 3002 }, { "epoch": 1.3662420382165605, "grad_norm": 0.6336951581043813, "learning_rate": 4.134597164804084e-06, "loss": 0.0191, "step": 3003 }, { "epoch": 1.3666969972702456, "grad_norm": 0.5180512978385537, "learning_rate": 4.134056371171531e-06, "loss": 0.0186, "step": 3004 }, { "epoch": 1.367151956323931, "grad_norm": 0.3722453745078167, "learning_rate": 4.1335154440112715e-06, "loss": 0.0101, "step": 3005 }, { "epoch": 1.367606915377616, "grad_norm": 0.3713778689222229, "learning_rate": 4.132974383367505e-06, "loss": 0.0127, "step": 3006 }, { "epoch": 1.3680618744313011, "grad_norm": 0.4990475777860669, "learning_rate": 4.1324331892844485e-06, "loss": 0.0184, "step": 3007 }, { "epoch": 1.3685168334849864, "grad_norm": 0.8304052852022975, "learning_rate": 4.131891861806322e-06, "loss": 0.0329, "step": 3008 }, { "epoch": 1.3689717925386715, "grad_norm": 0.4389683470350522, "learning_rate": 4.131350400977363e-06, "loss": 0.0166, "step": 3009 }, { "epoch": 1.3694267515923566, "grad_norm": 0.698803640296317, "learning_rate": 4.130808806841816e-06, "loss": 0.0252, "step": 3010 }, { "epoch": 1.369881710646042, "grad_norm": 1.250044162677878, "learning_rate": 4.130267079443939e-06, "loss": 0.0617, "step": 3011 }, { "epoch": 1.370336669699727, "grad_norm": 0.5158514736015647, "learning_rate": 4.129725218827997e-06, "loss": 0.0268, "step": 3012 }, { "epoch": 1.370791628753412, "grad_norm": 0.5535080428066416, "learning_rate": 4.1291832250382705e-06, "loss": 0.0347, "step": 3013 }, { "epoch": 1.3712465878070974, "grad_norm": 0.610394373477909, "learning_rate": 4.128641098119048e-06, "loss": 0.0247, "step": 3014 }, { "epoch": 1.3717015468607825, "grad_norm": 0.5633256737475365, "learning_rate": 4.128098838114631e-06, "loss": 0.0239, "step": 3015 }, { "epoch": 1.3721565059144676, "grad_norm": 0.5949892339415896, "learning_rate": 4.127556445069328e-06, "loss": 0.0196, "step": 3016 }, { "epoch": 1.372611464968153, "grad_norm": 0.4362212537455964, "learning_rate": 4.127013919027462e-06, "loss": 0.0186, "step": 3017 }, { "epoch": 1.373066424021838, "grad_norm": 0.4608698551673798, "learning_rate": 4.126471260033368e-06, "loss": 0.0168, "step": 3018 }, { "epoch": 1.373521383075523, "grad_norm": 0.522646641348721, "learning_rate": 4.125928468131387e-06, "loss": 0.0234, "step": 3019 }, { "epoch": 1.3739763421292084, "grad_norm": 0.6666047647535678, "learning_rate": 4.125385543365873e-06, "loss": 0.0341, "step": 3020 }, { "epoch": 1.3744313011828935, "grad_norm": 0.45568077571323495, "learning_rate": 4.124842485781194e-06, "loss": 0.0115, "step": 3021 }, { "epoch": 1.3748862602365788, "grad_norm": 0.5297520552882724, "learning_rate": 4.1242992954217234e-06, "loss": 0.0201, "step": 3022 }, { "epoch": 1.3753412192902639, "grad_norm": 0.4259156472572623, "learning_rate": 4.123755972331851e-06, "loss": 0.0169, "step": 3023 }, { "epoch": 1.3757961783439492, "grad_norm": 0.7451700916534765, "learning_rate": 4.123212516555972e-06, "loss": 0.0412, "step": 3024 }, { "epoch": 1.3762511373976343, "grad_norm": 0.7232624853864648, "learning_rate": 4.122668928138498e-06, "loss": 0.0305, "step": 3025 }, { "epoch": 1.3767060964513194, "grad_norm": 0.6950540512190239, "learning_rate": 4.122125207123846e-06, "loss": 0.0314, "step": 3026 }, { "epoch": 1.3771610555050047, "grad_norm": 0.3593225407829798, "learning_rate": 4.121581353556447e-06, "loss": 0.0123, "step": 3027 }, { "epoch": 1.3776160145586898, "grad_norm": 0.3906828489333325, "learning_rate": 4.121037367480744e-06, "loss": 0.0202, "step": 3028 }, { "epoch": 1.3780709736123748, "grad_norm": 0.48824144662287894, "learning_rate": 4.120493248941188e-06, "loss": 0.0159, "step": 3029 }, { "epoch": 1.3785259326660602, "grad_norm": 0.5305250470220376, "learning_rate": 4.119948997982241e-06, "loss": 0.0178, "step": 3030 }, { "epoch": 1.3789808917197452, "grad_norm": 0.6167248818967296, "learning_rate": 4.119404614648378e-06, "loss": 0.0225, "step": 3031 }, { "epoch": 1.3794358507734303, "grad_norm": 0.8878066170438527, "learning_rate": 4.118860098984083e-06, "loss": 0.0535, "step": 3032 }, { "epoch": 1.3798908098271156, "grad_norm": 0.5195200604789711, "learning_rate": 4.118315451033851e-06, "loss": 0.0178, "step": 3033 }, { "epoch": 1.3803457688808007, "grad_norm": 0.582760689437918, "learning_rate": 4.117770670842189e-06, "loss": 0.0267, "step": 3034 }, { "epoch": 1.3808007279344858, "grad_norm": 0.48644656132929254, "learning_rate": 4.117225758453614e-06, "loss": 0.0142, "step": 3035 }, { "epoch": 1.3812556869881711, "grad_norm": 0.48520241105257794, "learning_rate": 4.116680713912652e-06, "loss": 0.0187, "step": 3036 }, { "epoch": 1.3817106460418562, "grad_norm": 0.6569899082539229, "learning_rate": 4.116135537263844e-06, "loss": 0.0299, "step": 3037 }, { "epoch": 1.3821656050955413, "grad_norm": 0.4241692854361547, "learning_rate": 4.115590228551738e-06, "loss": 0.0149, "step": 3038 }, { "epoch": 1.3826205641492266, "grad_norm": 0.48907194958347444, "learning_rate": 4.115044787820895e-06, "loss": 0.0177, "step": 3039 }, { "epoch": 1.3830755232029117, "grad_norm": 0.6439400567288324, "learning_rate": 4.114499215115885e-06, "loss": 0.0374, "step": 3040 }, { "epoch": 1.3835304822565968, "grad_norm": 0.5824862809523336, "learning_rate": 4.113953510481289e-06, "loss": 0.0228, "step": 3041 }, { "epoch": 1.3839854413102821, "grad_norm": 0.5755206805973067, "learning_rate": 4.113407673961702e-06, "loss": 0.0191, "step": 3042 }, { "epoch": 1.3844404003639672, "grad_norm": 0.5035093816887116, "learning_rate": 4.112861705601726e-06, "loss": 0.0207, "step": 3043 }, { "epoch": 1.3848953594176523, "grad_norm": 0.6764554424164136, "learning_rate": 4.112315605445975e-06, "loss": 0.0255, "step": 3044 }, { "epoch": 1.3853503184713376, "grad_norm": 0.6149217347847765, "learning_rate": 4.111769373539073e-06, "loss": 0.0287, "step": 3045 }, { "epoch": 1.3858052775250227, "grad_norm": 0.6663180017828199, "learning_rate": 4.1112230099256576e-06, "loss": 0.0212, "step": 3046 }, { "epoch": 1.3862602365787078, "grad_norm": 1.0598559044398572, "learning_rate": 4.1106765146503735e-06, "loss": 0.0272, "step": 3047 }, { "epoch": 1.386715195632393, "grad_norm": 0.4765126938927154, "learning_rate": 4.110129887757878e-06, "loss": 0.0168, "step": 3048 }, { "epoch": 1.3871701546860782, "grad_norm": 0.5485252582544047, "learning_rate": 4.10958312929284e-06, "loss": 0.0215, "step": 3049 }, { "epoch": 1.3876251137397635, "grad_norm": 0.6033054625797364, "learning_rate": 4.1090362392999376e-06, "loss": 0.0204, "step": 3050 }, { "epoch": 1.3880800727934486, "grad_norm": 0.7645773724183272, "learning_rate": 4.108489217823859e-06, "loss": 0.0463, "step": 3051 }, { "epoch": 1.388535031847134, "grad_norm": 0.5126384888627012, "learning_rate": 4.107942064909306e-06, "loss": 0.017, "step": 3052 }, { "epoch": 1.388989990900819, "grad_norm": 0.5517435339972416, "learning_rate": 4.107394780600989e-06, "loss": 0.0164, "step": 3053 }, { "epoch": 1.389444949954504, "grad_norm": 0.6894233207451937, "learning_rate": 4.10684736494363e-06, "loss": 0.0309, "step": 3054 }, { "epoch": 1.3898999090081894, "grad_norm": 0.6995203769861, "learning_rate": 4.10629981798196e-06, "loss": 0.0255, "step": 3055 }, { "epoch": 1.3903548680618745, "grad_norm": 0.6389147217213587, "learning_rate": 4.105752139760723e-06, "loss": 0.0289, "step": 3056 }, { "epoch": 1.3908098271155596, "grad_norm": 0.5052389232101024, "learning_rate": 4.105204330324673e-06, "loss": 0.0208, "step": 3057 }, { "epoch": 1.3912647861692449, "grad_norm": 0.49769601997533147, "learning_rate": 4.1046563897185736e-06, "loss": 0.0141, "step": 3058 }, { "epoch": 1.39171974522293, "grad_norm": 0.7519559246267947, "learning_rate": 4.104108317987201e-06, "loss": 0.0319, "step": 3059 }, { "epoch": 1.392174704276615, "grad_norm": 0.722697194866074, "learning_rate": 4.103560115175341e-06, "loss": 0.0389, "step": 3060 }, { "epoch": 1.3926296633303004, "grad_norm": 0.3738002744999205, "learning_rate": 4.103011781327789e-06, "loss": 0.0193, "step": 3061 }, { "epoch": 1.3930846223839854, "grad_norm": 0.5043059770286826, "learning_rate": 4.102463316489354e-06, "loss": 0.0201, "step": 3062 }, { "epoch": 1.3935395814376705, "grad_norm": 0.6670664153936168, "learning_rate": 4.101914720704854e-06, "loss": 0.0327, "step": 3063 }, { "epoch": 1.3939945404913558, "grad_norm": 0.6019451653378002, "learning_rate": 4.101365994019116e-06, "loss": 0.0243, "step": 3064 }, { "epoch": 1.394449499545041, "grad_norm": 0.9907449373657036, "learning_rate": 4.100817136476981e-06, "loss": 0.0383, "step": 3065 }, { "epoch": 1.394904458598726, "grad_norm": 0.49723412616757334, "learning_rate": 4.1002681481233e-06, "loss": 0.0165, "step": 3066 }, { "epoch": 1.3953594176524113, "grad_norm": 0.5920652529146905, "learning_rate": 4.099719029002932e-06, "loss": 0.0277, "step": 3067 }, { "epoch": 1.3958143767060964, "grad_norm": 0.8560125748124937, "learning_rate": 4.0991697791607485e-06, "loss": 0.0426, "step": 3068 }, { "epoch": 1.3962693357597815, "grad_norm": 0.6203648804145926, "learning_rate": 4.098620398641633e-06, "loss": 0.0241, "step": 3069 }, { "epoch": 1.3967242948134668, "grad_norm": 0.5521046470456151, "learning_rate": 4.098070887490478e-06, "loss": 0.0256, "step": 3070 }, { "epoch": 1.397179253867152, "grad_norm": 0.549780639587674, "learning_rate": 4.0975212457521865e-06, "loss": 0.0246, "step": 3071 }, { "epoch": 1.397634212920837, "grad_norm": 0.6502581386149143, "learning_rate": 4.096971473471674e-06, "loss": 0.0234, "step": 3072 }, { "epoch": 1.3980891719745223, "grad_norm": 0.5166228834612007, "learning_rate": 4.0964215706938635e-06, "loss": 0.0212, "step": 3073 }, { "epoch": 1.3985441310282074, "grad_norm": 0.6849292217423967, "learning_rate": 4.0958715374636925e-06, "loss": 0.0227, "step": 3074 }, { "epoch": 1.3989990900818925, "grad_norm": 0.525937882839348, "learning_rate": 4.095321373826105e-06, "loss": 0.0248, "step": 3075 }, { "epoch": 1.3994540491355778, "grad_norm": 0.5036452541982582, "learning_rate": 4.094771079826061e-06, "loss": 0.0175, "step": 3076 }, { "epoch": 1.399909008189263, "grad_norm": 0.6435654578869566, "learning_rate": 4.094220655508525e-06, "loss": 0.0299, "step": 3077 }, { "epoch": 1.4003639672429482, "grad_norm": 0.4960419896796071, "learning_rate": 4.0936701009184775e-06, "loss": 0.0244, "step": 3078 }, { "epoch": 1.4008189262966333, "grad_norm": 0.6004528177397015, "learning_rate": 4.0931194161009044e-06, "loss": 0.0357, "step": 3079 }, { "epoch": 1.4012738853503186, "grad_norm": 0.5619700414349914, "learning_rate": 4.092568601100809e-06, "loss": 0.0229, "step": 3080 }, { "epoch": 1.4017288444040037, "grad_norm": 0.4518813738572762, "learning_rate": 4.092017655963199e-06, "loss": 0.0117, "step": 3081 }, { "epoch": 1.4021838034576888, "grad_norm": 0.6514671911570182, "learning_rate": 4.091466580733095e-06, "loss": 0.0374, "step": 3082 }, { "epoch": 1.402638762511374, "grad_norm": 0.5408580921293082, "learning_rate": 4.09091537545553e-06, "loss": 0.0251, "step": 3083 }, { "epoch": 1.4030937215650592, "grad_norm": 0.49428453735831535, "learning_rate": 4.090364040175545e-06, "loss": 0.0172, "step": 3084 }, { "epoch": 1.4035486806187443, "grad_norm": 0.666899608158994, "learning_rate": 4.089812574938192e-06, "loss": 0.0324, "step": 3085 }, { "epoch": 1.4040036396724296, "grad_norm": 0.7622239351954695, "learning_rate": 4.089260979788534e-06, "loss": 0.0338, "step": 3086 }, { "epoch": 1.4044585987261147, "grad_norm": 0.5953257723172974, "learning_rate": 4.088709254771648e-06, "loss": 0.033, "step": 3087 }, { "epoch": 1.4049135577797998, "grad_norm": 0.6420360684443719, "learning_rate": 4.088157399932615e-06, "loss": 0.0229, "step": 3088 }, { "epoch": 1.405368516833485, "grad_norm": 0.5362082733292385, "learning_rate": 4.0876054153165314e-06, "loss": 0.0313, "step": 3089 }, { "epoch": 1.4058234758871702, "grad_norm": 0.43998962686932297, "learning_rate": 4.087053300968502e-06, "loss": 0.0158, "step": 3090 }, { "epoch": 1.4062784349408552, "grad_norm": 0.5249842338579929, "learning_rate": 4.086501056933646e-06, "loss": 0.0217, "step": 3091 }, { "epoch": 1.4067333939945406, "grad_norm": 0.7147219754503916, "learning_rate": 4.085948683257087e-06, "loss": 0.0345, "step": 3092 }, { "epoch": 1.4071883530482256, "grad_norm": 0.5648596460288634, "learning_rate": 4.085396179983963e-06, "loss": 0.0249, "step": 3093 }, { "epoch": 1.4076433121019107, "grad_norm": 0.7084596679025572, "learning_rate": 4.084843547159424e-06, "loss": 0.0324, "step": 3094 }, { "epoch": 1.408098271155596, "grad_norm": 0.5054480404602903, "learning_rate": 4.0842907848286265e-06, "loss": 0.02, "step": 3095 }, { "epoch": 1.4085532302092811, "grad_norm": 0.4088045729759403, "learning_rate": 4.083737893036741e-06, "loss": 0.0121, "step": 3096 }, { "epoch": 1.4090081892629662, "grad_norm": 0.4445880679833502, "learning_rate": 4.083184871828947e-06, "loss": 0.0163, "step": 3097 }, { "epoch": 1.4094631483166515, "grad_norm": 0.5583085908646305, "learning_rate": 4.0826317212504345e-06, "loss": 0.0212, "step": 3098 }, { "epoch": 1.4099181073703366, "grad_norm": 0.4817308983982684, "learning_rate": 4.0820784413464054e-06, "loss": 0.0154, "step": 3099 }, { "epoch": 1.4103730664240217, "grad_norm": 0.5448848620297295, "learning_rate": 4.08152503216207e-06, "loss": 0.0194, "step": 3100 }, { "epoch": 1.410828025477707, "grad_norm": 0.5964456320339733, "learning_rate": 4.080971493742652e-06, "loss": 0.019, "step": 3101 }, { "epoch": 1.4112829845313921, "grad_norm": 1.0500035124551044, "learning_rate": 4.080417826133382e-06, "loss": 0.0346, "step": 3102 }, { "epoch": 1.4117379435850774, "grad_norm": 0.6289836820106267, "learning_rate": 4.079864029379506e-06, "loss": 0.0432, "step": 3103 }, { "epoch": 1.4121929026387625, "grad_norm": 0.5116882510897833, "learning_rate": 4.079310103526275e-06, "loss": 0.0256, "step": 3104 }, { "epoch": 1.4126478616924476, "grad_norm": 0.6310635082352677, "learning_rate": 4.0787560486189545e-06, "loss": 0.0217, "step": 3105 }, { "epoch": 1.413102820746133, "grad_norm": 0.4671010346903545, "learning_rate": 4.07820186470282e-06, "loss": 0.0177, "step": 3106 }, { "epoch": 1.413557779799818, "grad_norm": 0.47143714345799553, "learning_rate": 4.077647551823155e-06, "loss": 0.0188, "step": 3107 }, { "epoch": 1.4140127388535033, "grad_norm": 0.4652926046789629, "learning_rate": 4.077093110025258e-06, "loss": 0.0312, "step": 3108 }, { "epoch": 1.4144676979071884, "grad_norm": 0.7370899657072758, "learning_rate": 4.076538539354433e-06, "loss": 0.0249, "step": 3109 }, { "epoch": 1.4149226569608735, "grad_norm": 0.6368992481166442, "learning_rate": 4.075983839855999e-06, "loss": 0.0327, "step": 3110 }, { "epoch": 1.4153776160145588, "grad_norm": 0.567399314370992, "learning_rate": 4.075429011575281e-06, "loss": 0.0313, "step": 3111 }, { "epoch": 1.415832575068244, "grad_norm": 0.3892637737460413, "learning_rate": 4.07487405455762e-06, "loss": 0.0123, "step": 3112 }, { "epoch": 1.416287534121929, "grad_norm": 0.613106414543713, "learning_rate": 4.074318968848364e-06, "loss": 0.028, "step": 3113 }, { "epoch": 1.4167424931756143, "grad_norm": 0.7160610594651534, "learning_rate": 4.073763754492871e-06, "loss": 0.0439, "step": 3114 }, { "epoch": 1.4171974522292994, "grad_norm": 0.6159887912900818, "learning_rate": 4.07320841153651e-06, "loss": 0.0223, "step": 3115 }, { "epoch": 1.4176524112829845, "grad_norm": 0.5767075501287264, "learning_rate": 4.072652940024664e-06, "loss": 0.021, "step": 3116 }, { "epoch": 1.4181073703366698, "grad_norm": 0.47742063463475076, "learning_rate": 4.07209734000272e-06, "loss": 0.0213, "step": 3117 }, { "epoch": 1.4185623293903549, "grad_norm": 1.1004949616896063, "learning_rate": 4.071541611516082e-06, "loss": 0.0311, "step": 3118 }, { "epoch": 1.41901728844404, "grad_norm": 0.7512803475968975, "learning_rate": 4.0709857546101605e-06, "loss": 0.0317, "step": 3119 }, { "epoch": 1.4194722474977253, "grad_norm": 0.6141920243976169, "learning_rate": 4.0704297693303775e-06, "loss": 0.034, "step": 3120 }, { "epoch": 1.4199272065514104, "grad_norm": 0.6658167646440603, "learning_rate": 4.0698736557221655e-06, "loss": 0.0246, "step": 3121 }, { "epoch": 1.4203821656050954, "grad_norm": 0.5034335001854008, "learning_rate": 4.069317413830968e-06, "loss": 0.0187, "step": 3122 }, { "epoch": 1.4208371246587808, "grad_norm": 0.4550548362840694, "learning_rate": 4.068761043702237e-06, "loss": 0.0172, "step": 3123 }, { "epoch": 1.4212920837124658, "grad_norm": 0.5022383747528916, "learning_rate": 4.06820454538144e-06, "loss": 0.0161, "step": 3124 }, { "epoch": 1.421747042766151, "grad_norm": 0.5321472674787463, "learning_rate": 4.067647918914049e-06, "loss": 0.0164, "step": 3125 }, { "epoch": 1.4222020018198362, "grad_norm": 0.5184576534954308, "learning_rate": 4.067091164345549e-06, "loss": 0.0205, "step": 3126 }, { "epoch": 1.4226569608735213, "grad_norm": 0.5611322594124284, "learning_rate": 4.066534281721437e-06, "loss": 0.0218, "step": 3127 }, { "epoch": 1.4231119199272064, "grad_norm": 0.6345079828140591, "learning_rate": 4.065977271087216e-06, "loss": 0.0307, "step": 3128 }, { "epoch": 1.4235668789808917, "grad_norm": 0.45570970357451407, "learning_rate": 4.065420132488406e-06, "loss": 0.0171, "step": 3129 }, { "epoch": 1.4240218380345768, "grad_norm": 0.5340857150202568, "learning_rate": 4.064862865970531e-06, "loss": 0.0191, "step": 3130 }, { "epoch": 1.4244767970882621, "grad_norm": 0.4899881794546963, "learning_rate": 4.064305471579131e-06, "loss": 0.0209, "step": 3131 }, { "epoch": 1.4249317561419472, "grad_norm": 0.7270878934451148, "learning_rate": 4.063747949359751e-06, "loss": 0.0339, "step": 3132 }, { "epoch": 1.4253867151956323, "grad_norm": 0.587673912483386, "learning_rate": 4.063190299357951e-06, "loss": 0.0226, "step": 3133 }, { "epoch": 1.4258416742493176, "grad_norm": 0.8856393386631087, "learning_rate": 4.062632521619298e-06, "loss": 0.0348, "step": 3134 }, { "epoch": 1.4262966333030027, "grad_norm": 0.9531592586535969, "learning_rate": 4.0620746161893736e-06, "loss": 0.0342, "step": 3135 }, { "epoch": 1.426751592356688, "grad_norm": 0.6282966394466677, "learning_rate": 4.061516583113765e-06, "loss": 0.0284, "step": 3136 }, { "epoch": 1.4272065514103731, "grad_norm": 0.4750464430925278, "learning_rate": 4.060958422438073e-06, "loss": 0.0228, "step": 3137 }, { "epoch": 1.4276615104640582, "grad_norm": 0.6847093600934828, "learning_rate": 4.060400134207908e-06, "loss": 0.0273, "step": 3138 }, { "epoch": 1.4281164695177435, "grad_norm": 0.48758933458619164, "learning_rate": 4.05984171846889e-06, "loss": 0.0183, "step": 3139 }, { "epoch": 1.4285714285714286, "grad_norm": 0.6816015219475341, "learning_rate": 4.059283175266652e-06, "loss": 0.0373, "step": 3140 }, { "epoch": 1.4290263876251137, "grad_norm": 0.5622493429118623, "learning_rate": 4.058724504646834e-06, "loss": 0.0208, "step": 3141 }, { "epoch": 1.429481346678799, "grad_norm": 0.6272048522856389, "learning_rate": 4.058165706655089e-06, "loss": 0.0236, "step": 3142 }, { "epoch": 1.429936305732484, "grad_norm": 0.7322971571480782, "learning_rate": 4.057606781337079e-06, "loss": 0.029, "step": 3143 }, { "epoch": 1.4303912647861692, "grad_norm": 0.6016774365121075, "learning_rate": 4.057047728738477e-06, "loss": 0.0259, "step": 3144 }, { "epoch": 1.4308462238398545, "grad_norm": 1.2230808497645234, "learning_rate": 4.056488548904966e-06, "loss": 0.0642, "step": 3145 }, { "epoch": 1.4313011828935396, "grad_norm": 0.7604574246701213, "learning_rate": 4.055929241882239e-06, "loss": 0.0359, "step": 3146 }, { "epoch": 1.4317561419472247, "grad_norm": 0.679498620667103, "learning_rate": 4.0553698077160025e-06, "loss": 0.023, "step": 3147 }, { "epoch": 1.43221110100091, "grad_norm": 0.5870150703843239, "learning_rate": 4.054810246451969e-06, "loss": 0.0328, "step": 3148 }, { "epoch": 1.432666060054595, "grad_norm": 0.6481099201299542, "learning_rate": 4.054250558135862e-06, "loss": 0.032, "step": 3149 }, { "epoch": 1.4331210191082802, "grad_norm": 0.504870297656496, "learning_rate": 4.05369074281342e-06, "loss": 0.0259, "step": 3150 }, { "epoch": 1.4335759781619655, "grad_norm": 0.42044102667812067, "learning_rate": 4.053130800530387e-06, "loss": 0.0156, "step": 3151 }, { "epoch": 1.4340309372156506, "grad_norm": 0.5204802834399147, "learning_rate": 4.052570731332518e-06, "loss": 0.026, "step": 3152 }, { "epoch": 1.4344858962693356, "grad_norm": 0.41353391151905655, "learning_rate": 4.0520105352655805e-06, "loss": 0.017, "step": 3153 }, { "epoch": 1.434940855323021, "grad_norm": 0.7582144112871689, "learning_rate": 4.051450212375351e-06, "loss": 0.0351, "step": 3154 }, { "epoch": 1.435395814376706, "grad_norm": 0.5600600152721896, "learning_rate": 4.050889762707616e-06, "loss": 0.0297, "step": 3155 }, { "epoch": 1.4358507734303911, "grad_norm": 0.6570950043422336, "learning_rate": 4.050329186308173e-06, "loss": 0.032, "step": 3156 }, { "epoch": 1.4363057324840764, "grad_norm": 0.6757814720202312, "learning_rate": 4.0497684832228305e-06, "loss": 0.0235, "step": 3157 }, { "epoch": 1.4367606915377615, "grad_norm": 0.6431879965703954, "learning_rate": 4.049207653497406e-06, "loss": 0.0301, "step": 3158 }, { "epoch": 1.4372156505914468, "grad_norm": 0.722091996750155, "learning_rate": 4.0486466971777295e-06, "loss": 0.0311, "step": 3159 }, { "epoch": 1.437670609645132, "grad_norm": 0.5159615478447471, "learning_rate": 4.048085614309638e-06, "loss": 0.0209, "step": 3160 }, { "epoch": 1.438125568698817, "grad_norm": 0.694743512846256, "learning_rate": 4.047524404938981e-06, "loss": 0.0265, "step": 3161 }, { "epoch": 1.4385805277525023, "grad_norm": 0.71210008238664, "learning_rate": 4.046963069111617e-06, "loss": 0.0434, "step": 3162 }, { "epoch": 1.4390354868061874, "grad_norm": 0.626366567745311, "learning_rate": 4.046401606873419e-06, "loss": 0.0254, "step": 3163 }, { "epoch": 1.4394904458598727, "grad_norm": 0.501604026702404, "learning_rate": 4.045840018270264e-06, "loss": 0.0238, "step": 3164 }, { "epoch": 1.4399454049135578, "grad_norm": 0.4397581383178915, "learning_rate": 4.045278303348044e-06, "loss": 0.0161, "step": 3165 }, { "epoch": 1.440400363967243, "grad_norm": 0.6360646084965851, "learning_rate": 4.044716462152659e-06, "loss": 0.0238, "step": 3166 }, { "epoch": 1.4408553230209282, "grad_norm": 0.38658450024092633, "learning_rate": 4.04415449473002e-06, "loss": 0.0193, "step": 3167 }, { "epoch": 1.4413102820746133, "grad_norm": 0.5581740216149683, "learning_rate": 4.043592401126051e-06, "loss": 0.021, "step": 3168 }, { "epoch": 1.4417652411282984, "grad_norm": 0.5845374823097698, "learning_rate": 4.043030181386681e-06, "loss": 0.0256, "step": 3169 }, { "epoch": 1.4422202001819837, "grad_norm": 0.35484275672194915, "learning_rate": 4.042467835557853e-06, "loss": 0.0101, "step": 3170 }, { "epoch": 1.4426751592356688, "grad_norm": 0.6225428160256282, "learning_rate": 4.0419053636855185e-06, "loss": 0.029, "step": 3171 }, { "epoch": 1.443130118289354, "grad_norm": 0.5597849056713826, "learning_rate": 4.041342765815641e-06, "loss": 0.0268, "step": 3172 }, { "epoch": 1.4435850773430392, "grad_norm": 0.6257534793117826, "learning_rate": 4.040780041994193e-06, "loss": 0.0214, "step": 3173 }, { "epoch": 1.4440400363967243, "grad_norm": 0.6053906166055238, "learning_rate": 4.040217192267159e-06, "loss": 0.0316, "step": 3174 }, { "epoch": 1.4444949954504094, "grad_norm": 0.4918130393096578, "learning_rate": 4.03965421668053e-06, "loss": 0.0241, "step": 3175 }, { "epoch": 1.4449499545040947, "grad_norm": 0.5702821016502554, "learning_rate": 4.039091115280314e-06, "loss": 0.0206, "step": 3176 }, { "epoch": 1.4454049135577798, "grad_norm": 0.6916587079039479, "learning_rate": 4.038527888112521e-06, "loss": 0.0313, "step": 3177 }, { "epoch": 1.4458598726114649, "grad_norm": 0.5788509299527757, "learning_rate": 4.037964535223177e-06, "loss": 0.0187, "step": 3178 }, { "epoch": 1.4463148316651502, "grad_norm": 0.5220453246358187, "learning_rate": 4.037401056658317e-06, "loss": 0.0242, "step": 3179 }, { "epoch": 1.4467697907188353, "grad_norm": 0.5240476001763903, "learning_rate": 4.036837452463985e-06, "loss": 0.0148, "step": 3180 }, { "epoch": 1.4472247497725204, "grad_norm": 0.6074840718812475, "learning_rate": 4.0362737226862356e-06, "loss": 0.0289, "step": 3181 }, { "epoch": 1.4476797088262057, "grad_norm": 0.656878621424084, "learning_rate": 4.035709867371137e-06, "loss": 0.0256, "step": 3182 }, { "epoch": 1.4481346678798908, "grad_norm": 0.7142547713654863, "learning_rate": 4.035145886564763e-06, "loss": 0.0345, "step": 3183 }, { "epoch": 1.4485896269335758, "grad_norm": 0.5825936793650826, "learning_rate": 4.0345817803132e-06, "loss": 0.0185, "step": 3184 }, { "epoch": 1.4490445859872612, "grad_norm": 0.4333270723886052, "learning_rate": 4.034017548662544e-06, "loss": 0.0166, "step": 3185 }, { "epoch": 1.4494995450409462, "grad_norm": 0.646212871411711, "learning_rate": 4.033453191658901e-06, "loss": 0.0383, "step": 3186 }, { "epoch": 1.4499545040946316, "grad_norm": 0.8367732250496335, "learning_rate": 4.032888709348388e-06, "loss": 0.0339, "step": 3187 }, { "epoch": 1.4504094631483166, "grad_norm": 0.7244976664758369, "learning_rate": 4.032324101777132e-06, "loss": 0.0191, "step": 3188 }, { "epoch": 1.450864422202002, "grad_norm": 0.6717322566236624, "learning_rate": 4.03175936899127e-06, "loss": 0.0368, "step": 3189 }, { "epoch": 1.451319381255687, "grad_norm": 0.4208158609209425, "learning_rate": 4.031194511036951e-06, "loss": 0.0158, "step": 3190 }, { "epoch": 1.4517743403093721, "grad_norm": 0.4638906634895803, "learning_rate": 4.0306295279603304e-06, "loss": 0.0137, "step": 3191 }, { "epoch": 1.4522292993630574, "grad_norm": 0.6347806832854297, "learning_rate": 4.030064419807578e-06, "loss": 0.0304, "step": 3192 }, { "epoch": 1.4526842584167425, "grad_norm": 0.5757174544801394, "learning_rate": 4.02949918662487e-06, "loss": 0.018, "step": 3193 }, { "epoch": 1.4531392174704276, "grad_norm": 0.6295428757868852, "learning_rate": 4.028933828458396e-06, "loss": 0.0264, "step": 3194 }, { "epoch": 1.453594176524113, "grad_norm": 0.43758794804800105, "learning_rate": 4.028368345354355e-06, "loss": 0.0253, "step": 3195 }, { "epoch": 1.454049135577798, "grad_norm": 0.5805048377093173, "learning_rate": 4.027802737358954e-06, "loss": 0.0296, "step": 3196 }, { "epoch": 1.4545040946314831, "grad_norm": 0.4264103702768461, "learning_rate": 4.027237004518413e-06, "loss": 0.0119, "step": 3197 }, { "epoch": 1.4549590536851684, "grad_norm": 0.41389260389989946, "learning_rate": 4.02667114687896e-06, "loss": 0.016, "step": 3198 }, { "epoch": 1.4554140127388535, "grad_norm": 0.574238682508948, "learning_rate": 4.026105164486836e-06, "loss": 0.0257, "step": 3199 }, { "epoch": 1.4558689717925386, "grad_norm": 0.4662483333465922, "learning_rate": 4.0255390573882904e-06, "loss": 0.0138, "step": 3200 }, { "epoch": 1.456323930846224, "grad_norm": 0.5385442309695424, "learning_rate": 4.024972825629581e-06, "loss": 0.0135, "step": 3201 }, { "epoch": 1.456778889899909, "grad_norm": 0.6347955000788789, "learning_rate": 4.024406469256979e-06, "loss": 0.0213, "step": 3202 }, { "epoch": 1.457233848953594, "grad_norm": 0.4546118103535644, "learning_rate": 4.023839988316766e-06, "loss": 0.0148, "step": 3203 }, { "epoch": 1.4576888080072794, "grad_norm": 0.7098005879398113, "learning_rate": 4.02327338285523e-06, "loss": 0.0356, "step": 3204 }, { "epoch": 1.4581437670609645, "grad_norm": 0.7622305783040608, "learning_rate": 4.022706652918672e-06, "loss": 0.0322, "step": 3205 }, { "epoch": 1.4585987261146496, "grad_norm": 0.5083688752589572, "learning_rate": 4.022139798553404e-06, "loss": 0.0291, "step": 3206 }, { "epoch": 1.459053685168335, "grad_norm": 0.5310444613766537, "learning_rate": 4.021572819805744e-06, "loss": 0.022, "step": 3207 }, { "epoch": 1.45950864422202, "grad_norm": 0.5549383758015742, "learning_rate": 4.021005716722025e-06, "loss": 0.0186, "step": 3208 }, { "epoch": 1.459963603275705, "grad_norm": 0.3870771580694624, "learning_rate": 4.020438489348587e-06, "loss": 0.0166, "step": 3209 }, { "epoch": 1.4604185623293904, "grad_norm": 0.443463357543213, "learning_rate": 4.019871137731783e-06, "loss": 0.0199, "step": 3210 }, { "epoch": 1.4608735213830755, "grad_norm": 0.6111312773271786, "learning_rate": 4.019303661917973e-06, "loss": 0.0183, "step": 3211 }, { "epoch": 1.4613284804367606, "grad_norm": 0.47602225232623346, "learning_rate": 4.018736061953529e-06, "loss": 0.0154, "step": 3212 }, { "epoch": 1.4617834394904459, "grad_norm": 0.6708391794944873, "learning_rate": 4.018168337884832e-06, "loss": 0.0236, "step": 3213 }, { "epoch": 1.462238398544131, "grad_norm": 0.6968852951901131, "learning_rate": 4.017600489758275e-06, "loss": 0.0335, "step": 3214 }, { "epoch": 1.4626933575978163, "grad_norm": 0.5811945946607384, "learning_rate": 4.017032517620259e-06, "loss": 0.0255, "step": 3215 }, { "epoch": 1.4631483166515014, "grad_norm": 0.49821764006461977, "learning_rate": 4.016464421517197e-06, "loss": 0.0152, "step": 3216 }, { "epoch": 1.4636032757051867, "grad_norm": 0.7149483577148827, "learning_rate": 4.015896201495511e-06, "loss": 0.036, "step": 3217 }, { "epoch": 1.4640582347588718, "grad_norm": 0.6403076806861139, "learning_rate": 4.015327857601632e-06, "loss": 0.025, "step": 3218 }, { "epoch": 1.4645131938125568, "grad_norm": 0.6206009458630282, "learning_rate": 4.014759389882004e-06, "loss": 0.0274, "step": 3219 }, { "epoch": 1.4649681528662422, "grad_norm": 0.819456018070291, "learning_rate": 4.0141907983830794e-06, "loss": 0.0309, "step": 3220 }, { "epoch": 1.4654231119199272, "grad_norm": 0.45406669744308337, "learning_rate": 4.0136220831513205e-06, "loss": 0.0164, "step": 3221 }, { "epoch": 1.4658780709736123, "grad_norm": 0.4892778094068776, "learning_rate": 4.013053244233202e-06, "loss": 0.0243, "step": 3222 }, { "epoch": 1.4663330300272976, "grad_norm": 0.4163752869408427, "learning_rate": 4.012484281675203e-06, "loss": 0.0137, "step": 3223 }, { "epoch": 1.4667879890809827, "grad_norm": 0.6166328684835028, "learning_rate": 4.01191519552382e-06, "loss": 0.0242, "step": 3224 }, { "epoch": 1.4672429481346678, "grad_norm": 0.8950156886314216, "learning_rate": 4.011345985825555e-06, "loss": 0.0377, "step": 3225 }, { "epoch": 1.4676979071883531, "grad_norm": 0.5451391822566414, "learning_rate": 4.010776652626921e-06, "loss": 0.0228, "step": 3226 }, { "epoch": 1.4681528662420382, "grad_norm": 0.43220994211012703, "learning_rate": 4.010207195974441e-06, "loss": 0.0149, "step": 3227 }, { "epoch": 1.4686078252957233, "grad_norm": 0.6421311871461202, "learning_rate": 4.00963761591465e-06, "loss": 0.0248, "step": 3228 }, { "epoch": 1.4690627843494086, "grad_norm": 0.5176247695837788, "learning_rate": 4.00906791249409e-06, "loss": 0.022, "step": 3229 }, { "epoch": 1.4695177434030937, "grad_norm": 0.5533545377326602, "learning_rate": 4.008498085759315e-06, "loss": 0.0317, "step": 3230 }, { "epoch": 1.4699727024567788, "grad_norm": 0.6655829269146801, "learning_rate": 4.007928135756889e-06, "loss": 0.0237, "step": 3231 }, { "epoch": 1.4704276615104641, "grad_norm": 0.4579299246542189, "learning_rate": 4.007358062533386e-06, "loss": 0.0169, "step": 3232 }, { "epoch": 1.4708826205641492, "grad_norm": 0.8007698321103973, "learning_rate": 4.006787866135387e-06, "loss": 0.0378, "step": 3233 }, { "epoch": 1.4713375796178343, "grad_norm": 0.46407860101829335, "learning_rate": 4.006217546609491e-06, "loss": 0.0186, "step": 3234 }, { "epoch": 1.4717925386715196, "grad_norm": 0.6325621692821115, "learning_rate": 4.005647104002298e-06, "loss": 0.0331, "step": 3235 }, { "epoch": 1.4722474977252047, "grad_norm": 0.4729130943143962, "learning_rate": 4.005076538360424e-06, "loss": 0.022, "step": 3236 }, { "epoch": 1.4727024567788898, "grad_norm": 0.5325635276758262, "learning_rate": 4.00450584973049e-06, "loss": 0.0254, "step": 3237 }, { "epoch": 1.473157415832575, "grad_norm": 0.5951663628498189, "learning_rate": 4.003935038159134e-06, "loss": 0.0258, "step": 3238 }, { "epoch": 1.4736123748862602, "grad_norm": 0.8046736104783195, "learning_rate": 4.003364103692998e-06, "loss": 0.0376, "step": 3239 }, { "epoch": 1.4740673339399453, "grad_norm": 0.3611545320846498, "learning_rate": 4.002793046378736e-06, "loss": 0.0129, "step": 3240 }, { "epoch": 1.4745222929936306, "grad_norm": 0.6746020737552472, "learning_rate": 4.002221866263013e-06, "loss": 0.026, "step": 3241 }, { "epoch": 1.4749772520473157, "grad_norm": 0.716374094519489, "learning_rate": 4.001650563392504e-06, "loss": 0.0365, "step": 3242 }, { "epoch": 1.475432211101001, "grad_norm": 0.5761801662679816, "learning_rate": 4.001079137813892e-06, "loss": 0.0251, "step": 3243 }, { "epoch": 1.475887170154686, "grad_norm": 0.5169030321042024, "learning_rate": 4.00050758957387e-06, "loss": 0.0214, "step": 3244 }, { "epoch": 1.4763421292083714, "grad_norm": 0.8699364781361667, "learning_rate": 3.999935918719146e-06, "loss": 0.0426, "step": 3245 }, { "epoch": 1.4767970882620565, "grad_norm": 0.5630042536436409, "learning_rate": 3.999364125296432e-06, "loss": 0.0216, "step": 3246 }, { "epoch": 1.4772520473157416, "grad_norm": 0.48241646420427814, "learning_rate": 3.998792209352453e-06, "loss": 0.0213, "step": 3247 }, { "epoch": 1.4777070063694269, "grad_norm": 0.6568434530404283, "learning_rate": 3.998220170933942e-06, "loss": 0.0313, "step": 3248 }, { "epoch": 1.478161965423112, "grad_norm": 0.4145592487022671, "learning_rate": 3.997648010087645e-06, "loss": 0.014, "step": 3249 }, { "epoch": 1.478616924476797, "grad_norm": 0.5557880093978479, "learning_rate": 3.997075726860316e-06, "loss": 0.0132, "step": 3250 }, { "epoch": 1.4790718835304824, "grad_norm": 0.6386918374347622, "learning_rate": 3.996503321298719e-06, "loss": 0.0293, "step": 3251 }, { "epoch": 1.4795268425841674, "grad_norm": 0.8108389566308398, "learning_rate": 3.995930793449629e-06, "loss": 0.055, "step": 3252 }, { "epoch": 1.4799818016378525, "grad_norm": 0.5994893257212202, "learning_rate": 3.995358143359831e-06, "loss": 0.0212, "step": 3253 }, { "epoch": 1.4804367606915378, "grad_norm": 0.36011088041771677, "learning_rate": 3.994785371076118e-06, "loss": 0.0132, "step": 3254 }, { "epoch": 1.480891719745223, "grad_norm": 0.47107785017529297, "learning_rate": 3.994212476645294e-06, "loss": 0.0211, "step": 3255 }, { "epoch": 1.481346678798908, "grad_norm": 0.6948312534480735, "learning_rate": 3.993639460114175e-06, "loss": 0.0314, "step": 3256 }, { "epoch": 1.4818016378525933, "grad_norm": 0.534307758470587, "learning_rate": 3.9930663215295845e-06, "loss": 0.0224, "step": 3257 }, { "epoch": 1.4822565969062784, "grad_norm": 0.621059549070313, "learning_rate": 3.992493060938357e-06, "loss": 0.0265, "step": 3258 }, { "epoch": 1.4827115559599635, "grad_norm": 0.59194056246558, "learning_rate": 3.991919678387336e-06, "loss": 0.0278, "step": 3259 }, { "epoch": 1.4831665150136488, "grad_norm": 0.6769673790378502, "learning_rate": 3.991346173923378e-06, "loss": 0.0223, "step": 3260 }, { "epoch": 1.483621474067334, "grad_norm": 0.5048248018891563, "learning_rate": 3.990772547593342e-06, "loss": 0.0177, "step": 3261 }, { "epoch": 1.484076433121019, "grad_norm": 1.0508584289590246, "learning_rate": 3.990198799444109e-06, "loss": 0.0563, "step": 3262 }, { "epoch": 1.4845313921747043, "grad_norm": 0.5906268368359552, "learning_rate": 3.989624929522558e-06, "loss": 0.0215, "step": 3263 }, { "epoch": 1.4849863512283894, "grad_norm": 0.4307882601993587, "learning_rate": 3.989050937875586e-06, "loss": 0.017, "step": 3264 }, { "epoch": 1.4854413102820745, "grad_norm": 0.49695404258064146, "learning_rate": 3.988476824550095e-06, "loss": 0.024, "step": 3265 }, { "epoch": 1.4858962693357598, "grad_norm": 0.827358190513197, "learning_rate": 3.9879025895930005e-06, "loss": 0.0294, "step": 3266 }, { "epoch": 1.486351228389445, "grad_norm": 0.6751148447719345, "learning_rate": 3.987328233051225e-06, "loss": 0.0314, "step": 3267 }, { "epoch": 1.4868061874431302, "grad_norm": 0.9522102460202996, "learning_rate": 3.986753754971703e-06, "loss": 0.0298, "step": 3268 }, { "epoch": 1.4872611464968153, "grad_norm": 0.6420301483996141, "learning_rate": 3.986179155401379e-06, "loss": 0.0279, "step": 3269 }, { "epoch": 1.4877161055505004, "grad_norm": 0.719242311377963, "learning_rate": 3.985604434387206e-06, "loss": 0.0433, "step": 3270 }, { "epoch": 1.4881710646041857, "grad_norm": 1.6278477178749045, "learning_rate": 3.985029591976147e-06, "loss": 0.0485, "step": 3271 }, { "epoch": 1.4886260236578708, "grad_norm": 5.758947431240914, "learning_rate": 3.984454628215176e-06, "loss": 0.0677, "step": 3272 }, { "epoch": 1.489080982711556, "grad_norm": 0.7413160561856129, "learning_rate": 3.983879543151277e-06, "loss": 0.0361, "step": 3273 }, { "epoch": 1.4895359417652412, "grad_norm": 0.36384913079858605, "learning_rate": 3.9833043368314426e-06, "loss": 0.0119, "step": 3274 }, { "epoch": 1.4899909008189263, "grad_norm": 0.6361813980344438, "learning_rate": 3.982729009302676e-06, "loss": 0.0317, "step": 3275 }, { "epoch": 1.4904458598726116, "grad_norm": 0.7257578291982972, "learning_rate": 3.982153560611991e-06, "loss": 0.0358, "step": 3276 }, { "epoch": 1.4909008189262967, "grad_norm": 0.5165112950123053, "learning_rate": 3.98157799080641e-06, "loss": 0.0217, "step": 3277 }, { "epoch": 1.4913557779799818, "grad_norm": 0.9281543214333533, "learning_rate": 3.9810022999329675e-06, "loss": 0.0389, "step": 3278 }, { "epoch": 1.491810737033667, "grad_norm": 0.514017469225204, "learning_rate": 3.980426488038703e-06, "loss": 0.0223, "step": 3279 }, { "epoch": 1.4922656960873522, "grad_norm": 0.7512761092778488, "learning_rate": 3.979850555170673e-06, "loss": 0.0357, "step": 3280 }, { "epoch": 1.4927206551410372, "grad_norm": 0.6169552418993354, "learning_rate": 3.979274501375939e-06, "loss": 0.03, "step": 3281 }, { "epoch": 1.4931756141947226, "grad_norm": 0.8280448418011287, "learning_rate": 3.978698326701573e-06, "loss": 0.0421, "step": 3282 }, { "epoch": 1.4936305732484076, "grad_norm": 0.650445967840913, "learning_rate": 3.978122031194657e-06, "loss": 0.0145, "step": 3283 }, { "epoch": 1.4940855323020927, "grad_norm": 0.6583510369558364, "learning_rate": 3.977545614902284e-06, "loss": 0.0274, "step": 3284 }, { "epoch": 1.494540491355778, "grad_norm": 0.4472056988014067, "learning_rate": 3.976969077871555e-06, "loss": 0.0187, "step": 3285 }, { "epoch": 1.4949954504094631, "grad_norm": 0.7735245686273473, "learning_rate": 3.976392420149583e-06, "loss": 0.0273, "step": 3286 }, { "epoch": 1.4954504094631482, "grad_norm": 0.41532105399889546, "learning_rate": 3.975815641783491e-06, "loss": 0.0119, "step": 3287 }, { "epoch": 1.4959053685168335, "grad_norm": 0.6987463484258479, "learning_rate": 3.975238742820409e-06, "loss": 0.0377, "step": 3288 }, { "epoch": 1.4963603275705186, "grad_norm": 0.6173848427744979, "learning_rate": 3.9746617233074785e-06, "loss": 0.0239, "step": 3289 }, { "epoch": 1.4968152866242037, "grad_norm": 0.8727742573581233, "learning_rate": 3.974084583291851e-06, "loss": 0.0341, "step": 3290 }, { "epoch": 1.497270245677889, "grad_norm": 0.6117086620866596, "learning_rate": 3.97350732282069e-06, "loss": 0.0207, "step": 3291 }, { "epoch": 1.4977252047315741, "grad_norm": 0.4816207312722353, "learning_rate": 3.9729299419411635e-06, "loss": 0.0194, "step": 3292 }, { "epoch": 1.4981801637852592, "grad_norm": 0.7728518816048001, "learning_rate": 3.972352440700455e-06, "loss": 0.0303, "step": 3293 }, { "epoch": 1.4986351228389445, "grad_norm": 0.624081771304932, "learning_rate": 3.971774819145753e-06, "loss": 0.0203, "step": 3294 }, { "epoch": 1.4990900818926296, "grad_norm": 0.5277977109774716, "learning_rate": 3.97119707732426e-06, "loss": 0.0222, "step": 3295 }, { "epoch": 1.499545040946315, "grad_norm": 0.506346258870687, "learning_rate": 3.970619215283185e-06, "loss": 0.0263, "step": 3296 }, { "epoch": 1.5, "grad_norm": 0.5996351512465383, "learning_rate": 3.97004123306975e-06, "loss": 0.0316, "step": 3297 }, { "epoch": 1.5004549590536853, "grad_norm": 0.5841670508981737, "learning_rate": 3.969463130731183e-06, "loss": 0.0226, "step": 3298 }, { "epoch": 1.5009099181073702, "grad_norm": 0.9717872460539779, "learning_rate": 3.968884908314725e-06, "loss": 0.0314, "step": 3299 }, { "epoch": 1.5013648771610555, "grad_norm": 0.49964688444508165, "learning_rate": 3.968306565867627e-06, "loss": 0.019, "step": 3300 }, { "epoch": 1.5018198362147408, "grad_norm": 0.4378823570391564, "learning_rate": 3.967728103437146e-06, "loss": 0.0156, "step": 3301 }, { "epoch": 1.5022747952684259, "grad_norm": 0.5762765989780131, "learning_rate": 3.967149521070554e-06, "loss": 0.0278, "step": 3302 }, { "epoch": 1.502729754322111, "grad_norm": 0.4935515854160086, "learning_rate": 3.966570818815126e-06, "loss": 0.0201, "step": 3303 }, { "epoch": 1.5031847133757963, "grad_norm": 0.6809847660339569, "learning_rate": 3.965991996718156e-06, "loss": 0.0317, "step": 3304 }, { "epoch": 1.5036396724294814, "grad_norm": 0.6967324134263413, "learning_rate": 3.965413054826941e-06, "loss": 0.0234, "step": 3305 }, { "epoch": 1.5040946314831665, "grad_norm": 0.4217207881637992, "learning_rate": 3.964833993188787e-06, "loss": 0.0157, "step": 3306 }, { "epoch": 1.5045495905368518, "grad_norm": 1.0187002398410523, "learning_rate": 3.964254811851015e-06, "loss": 0.0516, "step": 3307 }, { "epoch": 1.5050045495905369, "grad_norm": 0.7101382532002475, "learning_rate": 3.963675510860952e-06, "loss": 0.032, "step": 3308 }, { "epoch": 1.505459508644222, "grad_norm": 0.5885558558311415, "learning_rate": 3.963096090265936e-06, "loss": 0.0275, "step": 3309 }, { "epoch": 1.5059144676979073, "grad_norm": 0.43615010061018616, "learning_rate": 3.962516550113316e-06, "loss": 0.017, "step": 3310 }, { "epoch": 1.5063694267515924, "grad_norm": 0.8215219704782009, "learning_rate": 3.961936890450447e-06, "loss": 0.0372, "step": 3311 }, { "epoch": 1.5068243858052774, "grad_norm": 0.7536358876609534, "learning_rate": 3.961357111324697e-06, "loss": 0.0401, "step": 3312 }, { "epoch": 1.5072793448589628, "grad_norm": 0.42864889313643445, "learning_rate": 3.960777212783445e-06, "loss": 0.0128, "step": 3313 }, { "epoch": 1.5077343039126478, "grad_norm": 0.42513746003457825, "learning_rate": 3.960197194874075e-06, "loss": 0.0171, "step": 3314 }, { "epoch": 1.508189262966333, "grad_norm": 0.6286780358231147, "learning_rate": 3.9596170576439844e-06, "loss": 0.0244, "step": 3315 }, { "epoch": 1.5086442220200182, "grad_norm": 0.7201172248235939, "learning_rate": 3.959036801140579e-06, "loss": 0.0314, "step": 3316 }, { "epoch": 1.5090991810737033, "grad_norm": 0.46561129210504143, "learning_rate": 3.958456425411275e-06, "loss": 0.0166, "step": 3317 }, { "epoch": 1.5095541401273884, "grad_norm": 0.4767372867859632, "learning_rate": 3.9578759305035e-06, "loss": 0.0233, "step": 3318 }, { "epoch": 1.5100090991810737, "grad_norm": 0.6987570495118559, "learning_rate": 3.957295316464686e-06, "loss": 0.032, "step": 3319 }, { "epoch": 1.5104640582347588, "grad_norm": 0.5353818748279202, "learning_rate": 3.956714583342281e-06, "loss": 0.0269, "step": 3320 }, { "epoch": 1.510919017288444, "grad_norm": 0.5675379911569748, "learning_rate": 3.9561337311837365e-06, "loss": 0.02, "step": 3321 }, { "epoch": 1.5113739763421292, "grad_norm": 0.4771557187323555, "learning_rate": 3.955552760036522e-06, "loss": 0.0239, "step": 3322 }, { "epoch": 1.5118289353958145, "grad_norm": 0.9322358569084765, "learning_rate": 3.9549716699481076e-06, "loss": 0.0357, "step": 3323 }, { "epoch": 1.5122838944494994, "grad_norm": 0.7153380105420392, "learning_rate": 3.954390460965979e-06, "loss": 0.0247, "step": 3324 }, { "epoch": 1.5127388535031847, "grad_norm": 0.8634115848570844, "learning_rate": 3.95380913313763e-06, "loss": 0.0375, "step": 3325 }, { "epoch": 1.51319381255687, "grad_norm": 0.5167563789245198, "learning_rate": 3.953227686510565e-06, "loss": 0.0253, "step": 3326 }, { "epoch": 1.5136487716105549, "grad_norm": 0.47663986929500723, "learning_rate": 3.9526461211322955e-06, "loss": 0.0193, "step": 3327 }, { "epoch": 1.5141037306642402, "grad_norm": 0.6010637486374424, "learning_rate": 3.9520644370503446e-06, "loss": 0.0305, "step": 3328 }, { "epoch": 1.5145586897179255, "grad_norm": 0.5165423349279934, "learning_rate": 3.951482634312246e-06, "loss": 0.0191, "step": 3329 }, { "epoch": 1.5150136487716106, "grad_norm": 0.6953133284712563, "learning_rate": 3.950900712965541e-06, "loss": 0.0348, "step": 3330 }, { "epoch": 1.5154686078252957, "grad_norm": 0.7069599237507933, "learning_rate": 3.950318673057782e-06, "loss": 0.0223, "step": 3331 }, { "epoch": 1.515923566878981, "grad_norm": 0.5858982166505268, "learning_rate": 3.949736514636531e-06, "loss": 0.0267, "step": 3332 }, { "epoch": 1.516378525932666, "grad_norm": 0.6239175973904172, "learning_rate": 3.949154237749358e-06, "loss": 0.0302, "step": 3333 }, { "epoch": 1.5168334849863512, "grad_norm": 0.48259660987352654, "learning_rate": 3.948571842443846e-06, "loss": 0.0172, "step": 3334 }, { "epoch": 1.5172884440400365, "grad_norm": 0.6191379855463478, "learning_rate": 3.947989328767585e-06, "loss": 0.0233, "step": 3335 }, { "epoch": 1.5177434030937216, "grad_norm": 0.7300237333629697, "learning_rate": 3.9474066967681744e-06, "loss": 0.0333, "step": 3336 }, { "epoch": 1.5181983621474067, "grad_norm": 0.9568900461126489, "learning_rate": 3.946823946493224e-06, "loss": 0.0454, "step": 3337 }, { "epoch": 1.518653321201092, "grad_norm": 0.5091287677987434, "learning_rate": 3.946241077990356e-06, "loss": 0.0237, "step": 3338 }, { "epoch": 1.519108280254777, "grad_norm": 0.6058068938185345, "learning_rate": 3.945658091307198e-06, "loss": 0.0251, "step": 3339 }, { "epoch": 1.5195632393084622, "grad_norm": 0.6200256282289905, "learning_rate": 3.9450749864913895e-06, "loss": 0.0281, "step": 3340 }, { "epoch": 1.5200181983621475, "grad_norm": 0.5747069734404173, "learning_rate": 3.9444917635905784e-06, "loss": 0.0171, "step": 3341 }, { "epoch": 1.5204731574158326, "grad_norm": 0.4749954035386937, "learning_rate": 3.943908422652424e-06, "loss": 0.0179, "step": 3342 }, { "epoch": 1.5209281164695176, "grad_norm": 0.5376186870756726, "learning_rate": 3.943324963724594e-06, "loss": 0.0211, "step": 3343 }, { "epoch": 1.521383075523203, "grad_norm": 0.6533422193724441, "learning_rate": 3.942741386854766e-06, "loss": 0.0254, "step": 3344 }, { "epoch": 1.521838034576888, "grad_norm": 0.6125010168356277, "learning_rate": 3.942157692090627e-06, "loss": 0.0199, "step": 3345 }, { "epoch": 1.5222929936305731, "grad_norm": 0.5244298027629346, "learning_rate": 3.941573879479874e-06, "loss": 0.0186, "step": 3346 }, { "epoch": 1.5227479526842584, "grad_norm": 0.6221503010776018, "learning_rate": 3.940989949070214e-06, "loss": 0.0318, "step": 3347 }, { "epoch": 1.5232029117379435, "grad_norm": 0.5592270472897608, "learning_rate": 3.940405900909362e-06, "loss": 0.0179, "step": 3348 }, { "epoch": 1.5236578707916286, "grad_norm": 0.5643171405314981, "learning_rate": 3.939821735045046e-06, "loss": 0.0208, "step": 3349 }, { "epoch": 1.524112829845314, "grad_norm": 0.8069246114105432, "learning_rate": 3.9392374515249986e-06, "loss": 0.0267, "step": 3350 }, { "epoch": 1.5245677888989992, "grad_norm": 1.1290905498149801, "learning_rate": 3.938653050396967e-06, "loss": 0.0608, "step": 3351 }, { "epoch": 1.525022747952684, "grad_norm": 0.6018387449063141, "learning_rate": 3.938068531708706e-06, "loss": 0.0254, "step": 3352 }, { "epoch": 1.5254777070063694, "grad_norm": 0.4828243886278892, "learning_rate": 3.937483895507977e-06, "loss": 0.016, "step": 3353 }, { "epoch": 1.5259326660600547, "grad_norm": 0.6616900595814049, "learning_rate": 3.936899141842556e-06, "loss": 0.0253, "step": 3354 }, { "epoch": 1.5263876251137396, "grad_norm": 0.5923645573102901, "learning_rate": 3.936314270760227e-06, "loss": 0.0196, "step": 3355 }, { "epoch": 1.526842584167425, "grad_norm": 0.5163730502393629, "learning_rate": 3.935729282308781e-06, "loss": 0.0223, "step": 3356 }, { "epoch": 1.5272975432211102, "grad_norm": 0.6832393099061513, "learning_rate": 3.935144176536023e-06, "loss": 0.031, "step": 3357 }, { "epoch": 1.5277525022747953, "grad_norm": 0.5007005767499049, "learning_rate": 3.934558953489763e-06, "loss": 0.0229, "step": 3358 }, { "epoch": 1.5282074613284804, "grad_norm": 0.853003111032188, "learning_rate": 3.9339736132178245e-06, "loss": 0.0232, "step": 3359 }, { "epoch": 1.5286624203821657, "grad_norm": 0.5521298424691822, "learning_rate": 3.933388155768038e-06, "loss": 0.0195, "step": 3360 }, { "epoch": 1.5291173794358508, "grad_norm": 0.6457589733077482, "learning_rate": 3.932802581188243e-06, "loss": 0.0321, "step": 3361 }, { "epoch": 1.5295723384895359, "grad_norm": 0.608336431809314, "learning_rate": 3.932216889526293e-06, "loss": 0.0307, "step": 3362 }, { "epoch": 1.5300272975432212, "grad_norm": 0.6136520142795995, "learning_rate": 3.931631080830046e-06, "loss": 0.0231, "step": 3363 }, { "epoch": 1.5304822565969063, "grad_norm": 0.6946767843291841, "learning_rate": 3.931045155147373e-06, "loss": 0.0299, "step": 3364 }, { "epoch": 1.5309372156505914, "grad_norm": 0.4896726609119758, "learning_rate": 3.930459112526153e-06, "loss": 0.0163, "step": 3365 }, { "epoch": 1.5313921747042767, "grad_norm": 0.5149603793357758, "learning_rate": 3.929872953014272e-06, "loss": 0.0224, "step": 3366 }, { "epoch": 1.5318471337579618, "grad_norm": 0.4807476106098001, "learning_rate": 3.929286676659632e-06, "loss": 0.0225, "step": 3367 }, { "epoch": 1.5323020928116469, "grad_norm": 0.45708905725245436, "learning_rate": 3.92870028351014e-06, "loss": 0.0171, "step": 3368 }, { "epoch": 1.5327570518653322, "grad_norm": 0.6665872106085607, "learning_rate": 3.9281137736137105e-06, "loss": 0.033, "step": 3369 }, { "epoch": 1.5332120109190173, "grad_norm": 0.6182322774057446, "learning_rate": 3.927527147018275e-06, "loss": 0.0277, "step": 3370 }, { "epoch": 1.5336669699727024, "grad_norm": 0.5531434189010356, "learning_rate": 3.926940403771767e-06, "loss": 0.0336, "step": 3371 }, { "epoch": 1.5341219290263877, "grad_norm": 0.828274110828607, "learning_rate": 3.926353543922133e-06, "loss": 0.0337, "step": 3372 }, { "epoch": 1.5345768880800728, "grad_norm": 0.5161411919297396, "learning_rate": 3.925766567517329e-06, "loss": 0.019, "step": 3373 }, { "epoch": 1.5350318471337578, "grad_norm": 0.6230029027238686, "learning_rate": 3.925179474605319e-06, "loss": 0.0265, "step": 3374 }, { "epoch": 1.5354868061874432, "grad_norm": 0.5787436091028683, "learning_rate": 3.92459226523408e-06, "loss": 0.018, "step": 3375 }, { "epoch": 1.5359417652411285, "grad_norm": 0.45169309927288415, "learning_rate": 3.924004939451593e-06, "loss": 0.015, "step": 3376 }, { "epoch": 1.5363967242948133, "grad_norm": 0.920838062645902, "learning_rate": 3.923417497305853e-06, "loss": 0.0438, "step": 3377 }, { "epoch": 1.5368516833484986, "grad_norm": 0.5102151208358107, "learning_rate": 3.9228299388448645e-06, "loss": 0.0175, "step": 3378 }, { "epoch": 1.537306642402184, "grad_norm": 0.5052430153694474, "learning_rate": 3.922242264116639e-06, "loss": 0.0219, "step": 3379 }, { "epoch": 1.5377616014558688, "grad_norm": 0.6374313082835955, "learning_rate": 3.921654473169198e-06, "loss": 0.0333, "step": 3380 }, { "epoch": 1.5382165605095541, "grad_norm": 0.5476952530160154, "learning_rate": 3.921066566050573e-06, "loss": 0.0212, "step": 3381 }, { "epoch": 1.5386715195632394, "grad_norm": 0.9473433558396809, "learning_rate": 3.920478542808806e-06, "loss": 0.0223, "step": 3382 }, { "epoch": 1.5391264786169245, "grad_norm": 0.7335517144921838, "learning_rate": 3.919890403491947e-06, "loss": 0.0279, "step": 3383 }, { "epoch": 1.5395814376706096, "grad_norm": 0.6418967633228138, "learning_rate": 3.919302148148057e-06, "loss": 0.0202, "step": 3384 }, { "epoch": 1.540036396724295, "grad_norm": 0.6065834876804072, "learning_rate": 3.918713776825204e-06, "loss": 0.0226, "step": 3385 }, { "epoch": 1.54049135577798, "grad_norm": 0.5525509383436024, "learning_rate": 3.918125289571469e-06, "loss": 0.0165, "step": 3386 }, { "epoch": 1.540946314831665, "grad_norm": 0.5704841120317284, "learning_rate": 3.917536686434939e-06, "loss": 0.0251, "step": 3387 }, { "epoch": 1.5414012738853504, "grad_norm": 0.6284290011537956, "learning_rate": 3.916947967463713e-06, "loss": 0.0273, "step": 3388 }, { "epoch": 1.5418562329390355, "grad_norm": 0.45132218970827287, "learning_rate": 3.916359132705898e-06, "loss": 0.0159, "step": 3389 }, { "epoch": 1.5423111919927206, "grad_norm": 0.6245494356008068, "learning_rate": 3.91577018220961e-06, "loss": 0.0297, "step": 3390 }, { "epoch": 1.542766151046406, "grad_norm": 0.6639491637722991, "learning_rate": 3.9151811160229765e-06, "loss": 0.0312, "step": 3391 }, { "epoch": 1.543221110100091, "grad_norm": 0.4639934293921549, "learning_rate": 3.914591934194134e-06, "loss": 0.0203, "step": 3392 }, { "epoch": 1.543676069153776, "grad_norm": 0.7771392348973274, "learning_rate": 3.914002636771226e-06, "loss": 0.0387, "step": 3393 }, { "epoch": 1.5441310282074614, "grad_norm": 0.47187669001516336, "learning_rate": 3.913413223802408e-06, "loss": 0.0147, "step": 3394 }, { "epoch": 1.5445859872611465, "grad_norm": 0.6210756572770973, "learning_rate": 3.912823695335845e-06, "loss": 0.0269, "step": 3395 }, { "epoch": 1.5450409463148316, "grad_norm": 0.749126270362982, "learning_rate": 3.91223405141971e-06, "loss": 0.035, "step": 3396 }, { "epoch": 1.5454959053685169, "grad_norm": 0.5844102366135081, "learning_rate": 3.911644292102185e-06, "loss": 0.0244, "step": 3397 }, { "epoch": 1.545950864422202, "grad_norm": 0.4504846823952382, "learning_rate": 3.911054417431465e-06, "loss": 0.0158, "step": 3398 }, { "epoch": 1.546405823475887, "grad_norm": 0.6063139796018792, "learning_rate": 3.9104644274557485e-06, "loss": 0.035, "step": 3399 }, { "epoch": 1.5468607825295724, "grad_norm": 0.7624720000367489, "learning_rate": 3.909874322223249e-06, "loss": 0.0301, "step": 3400 }, { "epoch": 1.5473157415832575, "grad_norm": 0.4352980068651625, "learning_rate": 3.909284101782187e-06, "loss": 0.0158, "step": 3401 }, { "epoch": 1.5477707006369426, "grad_norm": 0.47543411380220885, "learning_rate": 3.908693766180792e-06, "loss": 0.019, "step": 3402 }, { "epoch": 1.5482256596906279, "grad_norm": 0.44347229009044975, "learning_rate": 3.908103315467306e-06, "loss": 0.0161, "step": 3403 }, { "epoch": 1.5486806187443132, "grad_norm": 0.5439384407995033, "learning_rate": 3.907512749689973e-06, "loss": 0.0227, "step": 3404 }, { "epoch": 1.549135577797998, "grad_norm": 0.598140622569138, "learning_rate": 3.906922068897057e-06, "loss": 0.0341, "step": 3405 }, { "epoch": 1.5495905368516834, "grad_norm": 0.397233945431831, "learning_rate": 3.906331273136822e-06, "loss": 0.0122, "step": 3406 }, { "epoch": 1.5500454959053687, "grad_norm": 0.5632352571342928, "learning_rate": 3.905740362457546e-06, "loss": 0.0266, "step": 3407 }, { "epoch": 1.5505004549590535, "grad_norm": 0.45563561711988787, "learning_rate": 3.905149336907516e-06, "loss": 0.016, "step": 3408 }, { "epoch": 1.5509554140127388, "grad_norm": 0.6596648155175628, "learning_rate": 3.904558196535029e-06, "loss": 0.0305, "step": 3409 }, { "epoch": 1.5514103730664242, "grad_norm": 0.7079970990005604, "learning_rate": 3.903966941388387e-06, "loss": 0.0263, "step": 3410 }, { "epoch": 1.5518653321201092, "grad_norm": 0.8057885831541769, "learning_rate": 3.9033755715159085e-06, "loss": 0.0297, "step": 3411 }, { "epoch": 1.5523202911737943, "grad_norm": 0.5776215693579038, "learning_rate": 3.902784086965915e-06, "loss": 0.0187, "step": 3412 }, { "epoch": 1.5527752502274796, "grad_norm": 0.5917536430690068, "learning_rate": 3.902192487786741e-06, "loss": 0.0272, "step": 3413 }, { "epoch": 1.5532302092811647, "grad_norm": 0.6721030679732494, "learning_rate": 3.9016007740267295e-06, "loss": 0.0249, "step": 3414 }, { "epoch": 1.5536851683348498, "grad_norm": 0.4963390776924177, "learning_rate": 3.901008945734232e-06, "loss": 0.0167, "step": 3415 }, { "epoch": 1.5541401273885351, "grad_norm": 0.7044535041689735, "learning_rate": 3.90041700295761e-06, "loss": 0.0337, "step": 3416 }, { "epoch": 1.5545950864422202, "grad_norm": 0.8038485567195957, "learning_rate": 3.899824945745236e-06, "loss": 0.0366, "step": 3417 }, { "epoch": 1.5550500454959053, "grad_norm": 0.6306007993135992, "learning_rate": 3.899232774145488e-06, "loss": 0.0215, "step": 3418 }, { "epoch": 1.5555050045495906, "grad_norm": 0.5663443449128879, "learning_rate": 3.898640488206756e-06, "loss": 0.0311, "step": 3419 }, { "epoch": 1.5559599636032757, "grad_norm": 0.6617378094913873, "learning_rate": 3.898048087977441e-06, "loss": 0.0296, "step": 3420 }, { "epoch": 1.5564149226569608, "grad_norm": 0.5247200025249587, "learning_rate": 3.89745557350595e-06, "loss": 0.0194, "step": 3421 }, { "epoch": 1.556869881710646, "grad_norm": 0.581451724478826, "learning_rate": 3.896862944840698e-06, "loss": 0.0177, "step": 3422 }, { "epoch": 1.5573248407643312, "grad_norm": 0.8154202104136817, "learning_rate": 3.896270202030116e-06, "loss": 0.0511, "step": 3423 }, { "epoch": 1.5577797998180163, "grad_norm": 0.4576805292939243, "learning_rate": 3.895677345122638e-06, "loss": 0.0152, "step": 3424 }, { "epoch": 1.5582347588717016, "grad_norm": 0.6462285553170106, "learning_rate": 3.895084374166711e-06, "loss": 0.0303, "step": 3425 }, { "epoch": 1.5586897179253867, "grad_norm": 0.4571016486302641, "learning_rate": 3.894491289210788e-06, "loss": 0.017, "step": 3426 }, { "epoch": 1.5591446769790718, "grad_norm": 0.7613273750746716, "learning_rate": 3.893898090303335e-06, "loss": 0.0345, "step": 3427 }, { "epoch": 1.559599636032757, "grad_norm": 0.6456841813220812, "learning_rate": 3.893304777492825e-06, "loss": 0.0307, "step": 3428 }, { "epoch": 1.5600545950864422, "grad_norm": 0.7063060028945898, "learning_rate": 3.89271135082774e-06, "loss": 0.0252, "step": 3429 }, { "epoch": 1.5605095541401273, "grad_norm": 0.49183537271033767, "learning_rate": 3.892117810356574e-06, "loss": 0.0197, "step": 3430 }, { "epoch": 1.5609645131938126, "grad_norm": 1.1855054320374372, "learning_rate": 3.8915241561278265e-06, "loss": 0.0276, "step": 3431 }, { "epoch": 1.5614194722474979, "grad_norm": 0.4694422800890511, "learning_rate": 3.890930388190009e-06, "loss": 0.0162, "step": 3432 }, { "epoch": 1.5618744313011828, "grad_norm": 0.854552183687447, "learning_rate": 3.890336506591642e-06, "loss": 0.0348, "step": 3433 }, { "epoch": 1.562329390354868, "grad_norm": 0.6692617939837532, "learning_rate": 3.889742511381254e-06, "loss": 0.0269, "step": 3434 }, { "epoch": 1.5627843494085534, "grad_norm": 1.0752556777655282, "learning_rate": 3.889148402607384e-06, "loss": 0.0303, "step": 3435 }, { "epoch": 1.5632393084622382, "grad_norm": 0.6604461169117343, "learning_rate": 3.88855418031858e-06, "loss": 0.0301, "step": 3436 }, { "epoch": 1.5636942675159236, "grad_norm": 0.6305615429026079, "learning_rate": 3.887959844563399e-06, "loss": 0.0209, "step": 3437 }, { "epoch": 1.5641492265696089, "grad_norm": 0.7058425188549334, "learning_rate": 3.887365395390407e-06, "loss": 0.0223, "step": 3438 }, { "epoch": 1.564604185623294, "grad_norm": 0.8386437287625623, "learning_rate": 3.886770832848181e-06, "loss": 0.0323, "step": 3439 }, { "epoch": 1.565059144676979, "grad_norm": 0.6598116278749199, "learning_rate": 3.886176156985305e-06, "loss": 0.0243, "step": 3440 }, { "epoch": 1.5655141037306644, "grad_norm": 0.6145240126950321, "learning_rate": 3.885581367850373e-06, "loss": 0.0258, "step": 3441 }, { "epoch": 1.5659690627843494, "grad_norm": 0.6848848503580919, "learning_rate": 3.8849864654919885e-06, "loss": 0.0248, "step": 3442 }, { "epoch": 1.5664240218380345, "grad_norm": 0.7462959103495879, "learning_rate": 3.884391449958765e-06, "loss": 0.0307, "step": 3443 }, { "epoch": 1.5668789808917198, "grad_norm": 0.9227690503269461, "learning_rate": 3.883796321299325e-06, "loss": 0.0307, "step": 3444 }, { "epoch": 1.567333939945405, "grad_norm": 0.6700677482330477, "learning_rate": 3.8832010795622975e-06, "loss": 0.0363, "step": 3445 }, { "epoch": 1.56778889899909, "grad_norm": 0.6822138930890699, "learning_rate": 3.882605724796324e-06, "loss": 0.0316, "step": 3446 }, { "epoch": 1.5682438580527753, "grad_norm": 0.6017869767431822, "learning_rate": 3.882010257050056e-06, "loss": 0.0266, "step": 3447 }, { "epoch": 1.5686988171064604, "grad_norm": 0.5232712126783707, "learning_rate": 3.88141467637215e-06, "loss": 0.0197, "step": 3448 }, { "epoch": 1.5691537761601455, "grad_norm": 0.5530318145705541, "learning_rate": 3.880818982811275e-06, "loss": 0.0279, "step": 3449 }, { "epoch": 1.5696087352138308, "grad_norm": 1.007799449110445, "learning_rate": 3.880223176416108e-06, "loss": 0.035, "step": 3450 }, { "epoch": 1.570063694267516, "grad_norm": 0.5334141245111912, "learning_rate": 3.879627257235337e-06, "loss": 0.0223, "step": 3451 }, { "epoch": 1.570518653321201, "grad_norm": 0.854819849016019, "learning_rate": 3.8790312253176565e-06, "loss": 0.0321, "step": 3452 }, { "epoch": 1.5709736123748863, "grad_norm": 0.6388465604043535, "learning_rate": 3.878435080711772e-06, "loss": 0.0207, "step": 3453 }, { "epoch": 1.5714285714285714, "grad_norm": 0.5635363641527492, "learning_rate": 3.877838823466398e-06, "loss": 0.0223, "step": 3454 }, { "epoch": 1.5718835304822565, "grad_norm": 0.6475301926880436, "learning_rate": 3.8772424536302565e-06, "loss": 0.0226, "step": 3455 }, { "epoch": 1.5723384895359418, "grad_norm": 0.6117567579130653, "learning_rate": 3.876645971252082e-06, "loss": 0.0255, "step": 3456 }, { "epoch": 1.5727934485896269, "grad_norm": 0.5792594395173578, "learning_rate": 3.876049376380615e-06, "loss": 0.0303, "step": 3457 }, { "epoch": 1.573248407643312, "grad_norm": 0.8432212860921924, "learning_rate": 3.875452669064609e-06, "loss": 0.0419, "step": 3458 }, { "epoch": 1.5737033666969973, "grad_norm": 0.42058349084627317, "learning_rate": 3.874855849352821e-06, "loss": 0.0191, "step": 3459 }, { "epoch": 1.5741583257506826, "grad_norm": 0.7235018800789317, "learning_rate": 3.874258917294021e-06, "loss": 0.0406, "step": 3460 }, { "epoch": 1.5746132848043675, "grad_norm": 0.8311342432465947, "learning_rate": 3.873661872936989e-06, "loss": 0.0478, "step": 3461 }, { "epoch": 1.5750682438580528, "grad_norm": 0.6547797917717316, "learning_rate": 3.873064716330513e-06, "loss": 0.0298, "step": 3462 }, { "epoch": 1.575523202911738, "grad_norm": 0.6755671622269617, "learning_rate": 3.872467447523388e-06, "loss": 0.0307, "step": 3463 }, { "epoch": 1.575978161965423, "grad_norm": 0.7583762247476595, "learning_rate": 3.871870066564422e-06, "loss": 0.0283, "step": 3464 }, { "epoch": 1.5764331210191083, "grad_norm": 0.6366872413298845, "learning_rate": 3.8712725735024295e-06, "loss": 0.0263, "step": 3465 }, { "epoch": 1.5768880800727936, "grad_norm": 0.5257311180353379, "learning_rate": 3.870674968386234e-06, "loss": 0.0226, "step": 3466 }, { "epoch": 1.5773430391264787, "grad_norm": 0.4253083736064576, "learning_rate": 3.87007725126467e-06, "loss": 0.0164, "step": 3467 }, { "epoch": 1.5777979981801638, "grad_norm": 0.5386960324866366, "learning_rate": 3.869479422186582e-06, "loss": 0.0173, "step": 3468 }, { "epoch": 1.578252957233849, "grad_norm": 0.48215988619225064, "learning_rate": 3.868881481200818e-06, "loss": 0.0184, "step": 3469 }, { "epoch": 1.5787079162875342, "grad_norm": 0.609767144737994, "learning_rate": 3.868283428356243e-06, "loss": 0.0409, "step": 3470 }, { "epoch": 1.5791628753412192, "grad_norm": 0.5291617114915205, "learning_rate": 3.8676852637017234e-06, "loss": 0.0231, "step": 3471 }, { "epoch": 1.5796178343949046, "grad_norm": 0.48866771038754586, "learning_rate": 3.867086987286141e-06, "loss": 0.0247, "step": 3472 }, { "epoch": 1.5800727934485896, "grad_norm": 0.4998279314275286, "learning_rate": 3.866488599158386e-06, "loss": 0.0243, "step": 3473 }, { "epoch": 1.5805277525022747, "grad_norm": 0.5281491792419408, "learning_rate": 3.865890099367351e-06, "loss": 0.0162, "step": 3474 }, { "epoch": 1.58098271155596, "grad_norm": 0.7800264919684391, "learning_rate": 3.865291487961946e-06, "loss": 0.0358, "step": 3475 }, { "epoch": 1.5814376706096451, "grad_norm": 0.668212797447539, "learning_rate": 3.864692764991087e-06, "loss": 0.0358, "step": 3476 }, { "epoch": 1.5818926296633302, "grad_norm": 0.5890921063282307, "learning_rate": 3.864093930503697e-06, "loss": 0.0248, "step": 3477 }, { "epoch": 1.5823475887170155, "grad_norm": 0.5909722423431498, "learning_rate": 3.863494984548712e-06, "loss": 0.0232, "step": 3478 }, { "epoch": 1.5828025477707006, "grad_norm": 0.6172756228743427, "learning_rate": 3.862895927175074e-06, "loss": 0.0344, "step": 3479 }, { "epoch": 1.5832575068243857, "grad_norm": 0.547485277081697, "learning_rate": 3.862296758431736e-06, "loss": 0.0279, "step": 3480 }, { "epoch": 1.583712465878071, "grad_norm": 0.5162157512249061, "learning_rate": 3.861697478367658e-06, "loss": 0.0164, "step": 3481 }, { "epoch": 1.584167424931756, "grad_norm": 0.6420660037228898, "learning_rate": 3.8610980870318126e-06, "loss": 0.0198, "step": 3482 }, { "epoch": 1.5846223839854412, "grad_norm": 0.7440250847976592, "learning_rate": 3.860498584473178e-06, "loss": 0.0479, "step": 3483 }, { "epoch": 1.5850773430391265, "grad_norm": 0.4443405489217349, "learning_rate": 3.859898970740743e-06, "loss": 0.0159, "step": 3484 }, { "epoch": 1.5855323020928116, "grad_norm": 0.6979930315119395, "learning_rate": 3.859299245883505e-06, "loss": 0.0324, "step": 3485 }, { "epoch": 1.5859872611464967, "grad_norm": 0.7134876833857815, "learning_rate": 3.858699409950472e-06, "loss": 0.0398, "step": 3486 }, { "epoch": 1.586442220200182, "grad_norm": 0.4825353059791467, "learning_rate": 3.858099462990658e-06, "loss": 0.016, "step": 3487 }, { "epoch": 1.5868971792538673, "grad_norm": 0.6150538702034337, "learning_rate": 3.857499405053089e-06, "loss": 0.0301, "step": 3488 }, { "epoch": 1.5873521383075522, "grad_norm": 0.625710397147892, "learning_rate": 3.856899236186799e-06, "loss": 0.0306, "step": 3489 }, { "epoch": 1.5878070973612375, "grad_norm": 0.6522778309170958, "learning_rate": 3.856298956440832e-06, "loss": 0.0292, "step": 3490 }, { "epoch": 1.5882620564149228, "grad_norm": 0.5784507541788387, "learning_rate": 3.8556985658642395e-06, "loss": 0.0288, "step": 3491 }, { "epoch": 1.5887170154686077, "grad_norm": 0.6076283369120725, "learning_rate": 3.855098064506081e-06, "loss": 0.0284, "step": 3492 }, { "epoch": 1.589171974522293, "grad_norm": 0.5347645776215969, "learning_rate": 3.85449745241543e-06, "loss": 0.0197, "step": 3493 }, { "epoch": 1.5896269335759783, "grad_norm": 0.5608060117520718, "learning_rate": 3.853896729641363e-06, "loss": 0.0249, "step": 3494 }, { "epoch": 1.5900818926296634, "grad_norm": 0.6468434298369498, "learning_rate": 3.853295896232969e-06, "loss": 0.0364, "step": 3495 }, { "epoch": 1.5905368516833485, "grad_norm": 0.6854777995675021, "learning_rate": 3.852694952239347e-06, "loss": 0.0339, "step": 3496 }, { "epoch": 1.5909918107370338, "grad_norm": 0.6643748488103951, "learning_rate": 3.852093897709601e-06, "loss": 0.0215, "step": 3497 }, { "epoch": 1.5914467697907189, "grad_norm": 0.5125764451782172, "learning_rate": 3.851492732692849e-06, "loss": 0.0221, "step": 3498 }, { "epoch": 1.591901728844404, "grad_norm": 0.9740815731787803, "learning_rate": 3.8508914572382124e-06, "loss": 0.0304, "step": 3499 }, { "epoch": 1.5923566878980893, "grad_norm": 0.47429273180622816, "learning_rate": 3.850290071394828e-06, "loss": 0.0211, "step": 3500 }, { "epoch": 1.5928116469517744, "grad_norm": 0.5856345203484362, "learning_rate": 3.8496885752118365e-06, "loss": 0.0275, "step": 3501 }, { "epoch": 1.5932666060054594, "grad_norm": 0.5701881313175124, "learning_rate": 3.849086968738389e-06, "loss": 0.0248, "step": 3502 }, { "epoch": 1.5937215650591448, "grad_norm": 0.7010777024298871, "learning_rate": 3.848485252023647e-06, "loss": 0.0181, "step": 3503 }, { "epoch": 1.5941765241128298, "grad_norm": 0.7893900975337746, "learning_rate": 3.847883425116781e-06, "loss": 0.0395, "step": 3504 }, { "epoch": 1.594631483166515, "grad_norm": 0.5894497701644011, "learning_rate": 3.8472814880669675e-06, "loss": 0.0368, "step": 3505 }, { "epoch": 1.5950864422202002, "grad_norm": 0.7531589111356518, "learning_rate": 3.8466794409233946e-06, "loss": 0.0258, "step": 3506 }, { "epoch": 1.5955414012738853, "grad_norm": 0.9230617438117314, "learning_rate": 3.846077283735261e-06, "loss": 0.0325, "step": 3507 }, { "epoch": 1.5959963603275704, "grad_norm": 0.370638936750391, "learning_rate": 3.84547501655177e-06, "loss": 0.0162, "step": 3508 }, { "epoch": 1.5964513193812557, "grad_norm": 0.6926234974622569, "learning_rate": 3.844872639422136e-06, "loss": 0.0362, "step": 3509 }, { "epoch": 1.5969062784349408, "grad_norm": 0.47160413647170063, "learning_rate": 3.844270152395583e-06, "loss": 0.0214, "step": 3510 }, { "epoch": 1.597361237488626, "grad_norm": 0.7083297825352631, "learning_rate": 3.843667555521346e-06, "loss": 0.0438, "step": 3511 }, { "epoch": 1.5978161965423112, "grad_norm": 0.8441503938502771, "learning_rate": 3.843064848848662e-06, "loss": 0.0276, "step": 3512 }, { "epoch": 1.5982711555959963, "grad_norm": 0.6561878128626468, "learning_rate": 3.842462032426784e-06, "loss": 0.0257, "step": 3513 }, { "epoch": 1.5987261146496814, "grad_norm": 0.4142556056021124, "learning_rate": 3.841859106304973e-06, "loss": 0.0153, "step": 3514 }, { "epoch": 1.5991810737033667, "grad_norm": 0.7732835680071638, "learning_rate": 3.841256070532494e-06, "loss": 0.0481, "step": 3515 }, { "epoch": 1.599636032757052, "grad_norm": 0.9016444602806903, "learning_rate": 3.840652925158626e-06, "loss": 0.0402, "step": 3516 }, { "epoch": 1.6000909918107369, "grad_norm": 0.7200030788002735, "learning_rate": 3.840049670232656e-06, "loss": 0.0321, "step": 3517 }, { "epoch": 1.6005459508644222, "grad_norm": 0.5119605070096909, "learning_rate": 3.839446305803878e-06, "loss": 0.025, "step": 3518 }, { "epoch": 1.6010009099181075, "grad_norm": 0.5867897527768563, "learning_rate": 3.838842831921598e-06, "loss": 0.018, "step": 3519 }, { "epoch": 1.6014558689717924, "grad_norm": 0.5674939657400796, "learning_rate": 3.8382392486351265e-06, "loss": 0.0243, "step": 3520 }, { "epoch": 1.6019108280254777, "grad_norm": 0.4585806965385925, "learning_rate": 3.837635555993787e-06, "loss": 0.0215, "step": 3521 }, { "epoch": 1.602365787079163, "grad_norm": 0.5861447224605015, "learning_rate": 3.837031754046912e-06, "loss": 0.0261, "step": 3522 }, { "epoch": 1.602820746132848, "grad_norm": 0.7789923182655337, "learning_rate": 3.836427842843838e-06, "loss": 0.0318, "step": 3523 }, { "epoch": 1.6032757051865332, "grad_norm": 0.6200237101545006, "learning_rate": 3.835823822433918e-06, "loss": 0.0282, "step": 3524 }, { "epoch": 1.6037306642402185, "grad_norm": 0.4823998206699662, "learning_rate": 3.835219692866506e-06, "loss": 0.0198, "step": 3525 }, { "epoch": 1.6041856232939036, "grad_norm": 0.5398187465426305, "learning_rate": 3.834615454190972e-06, "loss": 0.0297, "step": 3526 }, { "epoch": 1.6046405823475887, "grad_norm": 0.6202173676714453, "learning_rate": 3.834011106456689e-06, "loss": 0.04, "step": 3527 }, { "epoch": 1.605095541401274, "grad_norm": 0.6723747745682109, "learning_rate": 3.833406649713044e-06, "loss": 0.0373, "step": 3528 }, { "epoch": 1.605550500454959, "grad_norm": 0.5542430465028839, "learning_rate": 3.832802084009428e-06, "loss": 0.0282, "step": 3529 }, { "epoch": 1.6060054595086442, "grad_norm": 0.565624305303762, "learning_rate": 3.832197409395245e-06, "loss": 0.0215, "step": 3530 }, { "epoch": 1.6064604185623295, "grad_norm": 0.4861010317792031, "learning_rate": 3.831592625919906e-06, "loss": 0.0225, "step": 3531 }, { "epoch": 1.6069153776160146, "grad_norm": 0.4373745791753047, "learning_rate": 3.830987733632831e-06, "loss": 0.0176, "step": 3532 }, { "epoch": 1.6073703366696996, "grad_norm": 0.5422984557811947, "learning_rate": 3.830382732583449e-06, "loss": 0.0209, "step": 3533 }, { "epoch": 1.607825295723385, "grad_norm": 0.5685852646509083, "learning_rate": 3.829777622821198e-06, "loss": 0.0297, "step": 3534 }, { "epoch": 1.60828025477707, "grad_norm": 0.543627577172519, "learning_rate": 3.8291724043955245e-06, "loss": 0.0166, "step": 3535 }, { "epoch": 1.6087352138307551, "grad_norm": 0.488533286475008, "learning_rate": 3.828567077355885e-06, "loss": 0.017, "step": 3536 }, { "epoch": 1.6091901728844404, "grad_norm": 0.4320474763865453, "learning_rate": 3.827961641751744e-06, "loss": 0.016, "step": 3537 }, { "epoch": 1.6096451319381255, "grad_norm": 0.5050357798930966, "learning_rate": 3.827356097632574e-06, "loss": 0.0298, "step": 3538 }, { "epoch": 1.6101000909918106, "grad_norm": 0.43740647703076296, "learning_rate": 3.826750445047859e-06, "loss": 0.016, "step": 3539 }, { "epoch": 1.610555050045496, "grad_norm": 0.8100698120528552, "learning_rate": 3.826144684047089e-06, "loss": 0.0347, "step": 3540 }, { "epoch": 1.6110100090991812, "grad_norm": 0.6197479181778295, "learning_rate": 3.825538814679763e-06, "loss": 0.0168, "step": 3541 }, { "epoch": 1.611464968152866, "grad_norm": 0.5803411146457176, "learning_rate": 3.824932836995392e-06, "loss": 0.0185, "step": 3542 }, { "epoch": 1.6119199272065514, "grad_norm": 0.6921154745229382, "learning_rate": 3.8243267510434936e-06, "loss": 0.0443, "step": 3543 }, { "epoch": 1.6123748862602367, "grad_norm": 0.539892414542492, "learning_rate": 3.823720556873592e-06, "loss": 0.016, "step": 3544 }, { "epoch": 1.6128298453139216, "grad_norm": 0.60868298291056, "learning_rate": 3.823114254535226e-06, "loss": 0.0237, "step": 3545 }, { "epoch": 1.613284804367607, "grad_norm": 0.4889853258398554, "learning_rate": 3.8225078440779375e-06, "loss": 0.0209, "step": 3546 }, { "epoch": 1.6137397634212922, "grad_norm": 0.7040369108834665, "learning_rate": 3.821901325551281e-06, "loss": 0.0237, "step": 3547 }, { "epoch": 1.6141947224749773, "grad_norm": 0.6923999096362768, "learning_rate": 3.821294699004816e-06, "loss": 0.0255, "step": 3548 }, { "epoch": 1.6146496815286624, "grad_norm": 0.4021417615609393, "learning_rate": 3.820687964488117e-06, "loss": 0.0172, "step": 3549 }, { "epoch": 1.6151046405823477, "grad_norm": 0.5145338706518726, "learning_rate": 3.82008112205076e-06, "loss": 0.0222, "step": 3550 }, { "epoch": 1.6155595996360328, "grad_norm": 0.6171169806105846, "learning_rate": 3.819474171742336e-06, "loss": 0.0224, "step": 3551 }, { "epoch": 1.6160145586897179, "grad_norm": 0.5389292833044773, "learning_rate": 3.8188671136124425e-06, "loss": 0.0181, "step": 3552 }, { "epoch": 1.6164695177434032, "grad_norm": 0.4297302177233826, "learning_rate": 3.818259947710683e-06, "loss": 0.0181, "step": 3553 }, { "epoch": 1.6169244767970883, "grad_norm": 0.6660195157126831, "learning_rate": 3.817652674086675e-06, "loss": 0.0259, "step": 3554 }, { "epoch": 1.6173794358507734, "grad_norm": 0.758925079775858, "learning_rate": 3.81704529279004e-06, "loss": 0.0513, "step": 3555 }, { "epoch": 1.6178343949044587, "grad_norm": 0.4334208779986942, "learning_rate": 3.816437803870412e-06, "loss": 0.0196, "step": 3556 }, { "epoch": 1.6182893539581438, "grad_norm": 0.7882302405649769, "learning_rate": 3.815830207377431e-06, "loss": 0.0342, "step": 3557 }, { "epoch": 1.6187443130118289, "grad_norm": 0.6638178492343451, "learning_rate": 3.815222503360748e-06, "loss": 0.0297, "step": 3558 }, { "epoch": 1.6191992720655142, "grad_norm": 0.49748672673993416, "learning_rate": 3.814614691870021e-06, "loss": 0.0127, "step": 3559 }, { "epoch": 1.6196542311191993, "grad_norm": 0.49860874776103087, "learning_rate": 3.814006772954919e-06, "loss": 0.0187, "step": 3560 }, { "epoch": 1.6201091901728844, "grad_norm": 0.45358670629439046, "learning_rate": 3.8133987466651175e-06, "loss": 0.022, "step": 3561 }, { "epoch": 1.6205641492265697, "grad_norm": 0.7110343231389603, "learning_rate": 3.8127906130503014e-06, "loss": 0.0325, "step": 3562 }, { "epoch": 1.6210191082802548, "grad_norm": 0.46120308542915633, "learning_rate": 3.8121823721601647e-06, "loss": 0.0117, "step": 3563 }, { "epoch": 1.6214740673339398, "grad_norm": 0.7232903654349334, "learning_rate": 3.8115740240444106e-06, "loss": 0.0288, "step": 3564 }, { "epoch": 1.6219290263876252, "grad_norm": 0.6618321138024421, "learning_rate": 3.81096556875275e-06, "loss": 0.0239, "step": 3565 }, { "epoch": 1.6223839854413102, "grad_norm": 0.6552432695181556, "learning_rate": 3.8103570063349034e-06, "loss": 0.0378, "step": 3566 }, { "epoch": 1.6228389444949953, "grad_norm": 0.8200462149386227, "learning_rate": 3.8097483368406003e-06, "loss": 0.0463, "step": 3567 }, { "epoch": 1.6232939035486806, "grad_norm": 0.578990989494396, "learning_rate": 3.809139560319577e-06, "loss": 0.0222, "step": 3568 }, { "epoch": 1.623748862602366, "grad_norm": 0.4202465716313672, "learning_rate": 3.8085306768215812e-06, "loss": 0.0107, "step": 3569 }, { "epoch": 1.6242038216560508, "grad_norm": 0.5061392067706059, "learning_rate": 3.8079216863963675e-06, "loss": 0.0165, "step": 3570 }, { "epoch": 1.6246587807097361, "grad_norm": 0.43536820092251655, "learning_rate": 3.807312589093701e-06, "loss": 0.0188, "step": 3571 }, { "epoch": 1.6251137397634214, "grad_norm": 0.6050677446333904, "learning_rate": 3.806703384963353e-06, "loss": 0.0271, "step": 3572 }, { "epoch": 1.6255686988171063, "grad_norm": 0.8225447317035454, "learning_rate": 3.8060940740551056e-06, "loss": 0.0333, "step": 3573 }, { "epoch": 1.6260236578707916, "grad_norm": 0.3731276144702114, "learning_rate": 3.8054846564187486e-06, "loss": 0.0104, "step": 3574 }, { "epoch": 1.626478616924477, "grad_norm": 0.6006174110645207, "learning_rate": 3.8048751321040806e-06, "loss": 0.0278, "step": 3575 }, { "epoch": 1.626933575978162, "grad_norm": 0.6442558908935601, "learning_rate": 3.80426550116091e-06, "loss": 0.0238, "step": 3576 }, { "epoch": 1.627388535031847, "grad_norm": 0.6207227243383738, "learning_rate": 3.8036557636390527e-06, "loss": 0.0299, "step": 3577 }, { "epoch": 1.6278434940855324, "grad_norm": 0.5585727627560396, "learning_rate": 3.803045919588333e-06, "loss": 0.0262, "step": 3578 }, { "epoch": 1.6282984531392175, "grad_norm": 0.44114683790491727, "learning_rate": 3.8024359690585856e-06, "loss": 0.021, "step": 3579 }, { "epoch": 1.6287534121929026, "grad_norm": 0.3296094863201106, "learning_rate": 3.8018259120996527e-06, "loss": 0.0174, "step": 3580 }, { "epoch": 1.629208371246588, "grad_norm": 0.5397044586335464, "learning_rate": 3.8012157487613853e-06, "loss": 0.019, "step": 3581 }, { "epoch": 1.629663330300273, "grad_norm": 1.076109393415077, "learning_rate": 3.800605479093643e-06, "loss": 0.0429, "step": 3582 }, { "epoch": 1.630118289353958, "grad_norm": 0.6959026110586248, "learning_rate": 3.7999951031462946e-06, "loss": 0.03, "step": 3583 }, { "epoch": 1.6305732484076434, "grad_norm": 0.5139348407505921, "learning_rate": 3.7993846209692176e-06, "loss": 0.017, "step": 3584 }, { "epoch": 1.6310282074613285, "grad_norm": 0.9198756880788689, "learning_rate": 3.798774032612297e-06, "loss": 0.0399, "step": 3585 }, { "epoch": 1.6314831665150136, "grad_norm": 0.36677379892540746, "learning_rate": 3.7981633381254266e-06, "loss": 0.0131, "step": 3586 }, { "epoch": 1.6319381255686989, "grad_norm": 0.5820327888677712, "learning_rate": 3.7975525375585115e-06, "loss": 0.0184, "step": 3587 }, { "epoch": 1.632393084622384, "grad_norm": 0.5161381978272331, "learning_rate": 3.7969416309614633e-06, "loss": 0.0216, "step": 3588 }, { "epoch": 1.632848043676069, "grad_norm": 0.5463019720540039, "learning_rate": 3.796330618384201e-06, "loss": 0.0225, "step": 3589 }, { "epoch": 1.6333030027297544, "grad_norm": 0.4568655142259495, "learning_rate": 3.795719499876655e-06, "loss": 0.0182, "step": 3590 }, { "epoch": 1.6337579617834395, "grad_norm": 0.6196564337957245, "learning_rate": 3.7951082754887638e-06, "loss": 0.0182, "step": 3591 }, { "epoch": 1.6342129208371245, "grad_norm": 0.506298546219541, "learning_rate": 3.7944969452704717e-06, "loss": 0.0234, "step": 3592 }, { "epoch": 1.6346678798908099, "grad_norm": 0.7295581071981191, "learning_rate": 3.7938855092717354e-06, "loss": 0.0358, "step": 3593 }, { "epoch": 1.635122838944495, "grad_norm": 0.4035231793496555, "learning_rate": 3.793273967542519e-06, "loss": 0.0137, "step": 3594 }, { "epoch": 1.63557779799818, "grad_norm": 0.5837188097448504, "learning_rate": 3.792662320132794e-06, "loss": 0.0254, "step": 3595 }, { "epoch": 1.6360327570518653, "grad_norm": 0.42331112642730656, "learning_rate": 3.792050567092542e-06, "loss": 0.0226, "step": 3596 }, { "epoch": 1.6364877161055507, "grad_norm": 0.6719821406101639, "learning_rate": 3.791438708471752e-06, "loss": 0.0326, "step": 3597 }, { "epoch": 1.6369426751592355, "grad_norm": 0.6622216896059354, "learning_rate": 3.7908267443204226e-06, "loss": 0.023, "step": 3598 }, { "epoch": 1.6373976342129208, "grad_norm": 0.5047368379702989, "learning_rate": 3.7902146746885614e-06, "loss": 0.0244, "step": 3599 }, { "epoch": 1.6378525932666061, "grad_norm": 0.6965120716900632, "learning_rate": 3.789602499626184e-06, "loss": 0.0308, "step": 3600 }, { "epoch": 1.638307552320291, "grad_norm": 0.6025551885499085, "learning_rate": 3.788990219183314e-06, "loss": 0.0216, "step": 3601 }, { "epoch": 1.6387625113739763, "grad_norm": 0.6125717157099263, "learning_rate": 3.7883778334099842e-06, "loss": 0.0231, "step": 3602 }, { "epoch": 1.6392174704276616, "grad_norm": 0.5473132899665126, "learning_rate": 3.7877653423562365e-06, "loss": 0.0257, "step": 3603 }, { "epoch": 1.6396724294813467, "grad_norm": 0.6537475089518195, "learning_rate": 3.787152746072119e-06, "loss": 0.0274, "step": 3604 }, { "epoch": 1.6401273885350318, "grad_norm": 0.6480805629958326, "learning_rate": 3.7865400446076933e-06, "loss": 0.0279, "step": 3605 }, { "epoch": 1.6405823475887171, "grad_norm": 0.630271607808444, "learning_rate": 3.7859272380130248e-06, "loss": 0.0222, "step": 3606 }, { "epoch": 1.6410373066424022, "grad_norm": 0.48911537645003134, "learning_rate": 3.785314326338189e-06, "loss": 0.022, "step": 3607 }, { "epoch": 1.6414922656960873, "grad_norm": 0.49625230753870486, "learning_rate": 3.784701309633272e-06, "loss": 0.0154, "step": 3608 }, { "epoch": 1.6419472247497726, "grad_norm": 0.6207889729485055, "learning_rate": 3.7840881879483647e-06, "loss": 0.0222, "step": 3609 }, { "epoch": 1.6424021838034577, "grad_norm": 0.7328816141105232, "learning_rate": 3.7834749613335704e-06, "loss": 0.0209, "step": 3610 }, { "epoch": 1.6428571428571428, "grad_norm": 0.5530871566437657, "learning_rate": 3.782861629838997e-06, "loss": 0.0239, "step": 3611 }, { "epoch": 1.643312101910828, "grad_norm": 0.9052471747323153, "learning_rate": 3.782248193514766e-06, "loss": 0.0313, "step": 3612 }, { "epoch": 1.6437670609645132, "grad_norm": 0.4611313175757836, "learning_rate": 3.7816346524110027e-06, "loss": 0.0164, "step": 3613 }, { "epoch": 1.6442220200181983, "grad_norm": 0.5318534391416457, "learning_rate": 3.781021006577843e-06, "loss": 0.0187, "step": 3614 }, { "epoch": 1.6446769790718836, "grad_norm": 0.8999001858437418, "learning_rate": 3.780407256065432e-06, "loss": 0.0448, "step": 3615 }, { "epoch": 1.6451319381255687, "grad_norm": 0.6841986770877886, "learning_rate": 3.7797934009239224e-06, "loss": 0.0257, "step": 3616 }, { "epoch": 1.6455868971792538, "grad_norm": 0.5953377845238845, "learning_rate": 3.7791794412034756e-06, "loss": 0.033, "step": 3617 }, { "epoch": 1.646041856232939, "grad_norm": 0.6104178527749724, "learning_rate": 3.7785653769542613e-06, "loss": 0.0219, "step": 3618 }, { "epoch": 1.6464968152866242, "grad_norm": 0.6615786474747989, "learning_rate": 3.7779512082264586e-06, "loss": 0.0351, "step": 3619 }, { "epoch": 1.6469517743403093, "grad_norm": 0.45031972888153543, "learning_rate": 3.777336935070255e-06, "loss": 0.0191, "step": 3620 }, { "epoch": 1.6474067333939946, "grad_norm": 0.5290797918701384, "learning_rate": 3.7767225575358434e-06, "loss": 0.0264, "step": 3621 }, { "epoch": 1.6478616924476797, "grad_norm": 0.6155643496140948, "learning_rate": 3.7761080756734318e-06, "loss": 0.0293, "step": 3622 }, { "epoch": 1.6483166515013647, "grad_norm": 0.7688383564375362, "learning_rate": 3.7754934895332306e-06, "loss": 0.0418, "step": 3623 }, { "epoch": 1.64877161055505, "grad_norm": 0.5286169912201062, "learning_rate": 3.7748787991654623e-06, "loss": 0.0231, "step": 3624 }, { "epoch": 1.6492265696087354, "grad_norm": 0.6616058729544022, "learning_rate": 3.774264004620355e-06, "loss": 0.037, "step": 3625 }, { "epoch": 1.6496815286624202, "grad_norm": 0.4034447323556056, "learning_rate": 3.7736491059481474e-06, "loss": 0.0201, "step": 3626 }, { "epoch": 1.6501364877161055, "grad_norm": 0.4827707916446577, "learning_rate": 3.7730341031990873e-06, "loss": 0.0239, "step": 3627 }, { "epoch": 1.6505914467697909, "grad_norm": 0.622385511217906, "learning_rate": 3.772418996423428e-06, "loss": 0.0304, "step": 3628 }, { "epoch": 1.6510464058234757, "grad_norm": 0.4456421103444245, "learning_rate": 3.7718037856714364e-06, "loss": 0.0168, "step": 3629 }, { "epoch": 1.651501364877161, "grad_norm": 0.5066325224743325, "learning_rate": 3.7711884709933823e-06, "loss": 0.0174, "step": 3630 }, { "epoch": 1.6519563239308463, "grad_norm": 0.5641394565791685, "learning_rate": 3.7705730524395466e-06, "loss": 0.0313, "step": 3631 }, { "epoch": 1.6524112829845314, "grad_norm": 0.620291914257022, "learning_rate": 3.7699575300602188e-06, "loss": 0.0188, "step": 3632 }, { "epoch": 1.6528662420382165, "grad_norm": 0.5219762390228715, "learning_rate": 3.7693419039056965e-06, "loss": 0.0231, "step": 3633 }, { "epoch": 1.6533212010919018, "grad_norm": 0.4441682306331103, "learning_rate": 3.768726174026287e-06, "loss": 0.0192, "step": 3634 }, { "epoch": 1.653776160145587, "grad_norm": 0.44964605531542085, "learning_rate": 3.768110340472304e-06, "loss": 0.0208, "step": 3635 }, { "epoch": 1.654231119199272, "grad_norm": 0.6485140503083492, "learning_rate": 3.7674944032940696e-06, "loss": 0.0213, "step": 3636 }, { "epoch": 1.6546860782529573, "grad_norm": 0.6021902161090137, "learning_rate": 3.766878362541918e-06, "loss": 0.022, "step": 3637 }, { "epoch": 1.6551410373066424, "grad_norm": 0.5202194021470081, "learning_rate": 3.7662622182661867e-06, "loss": 0.0191, "step": 3638 }, { "epoch": 1.6555959963603275, "grad_norm": 1.009854523450612, "learning_rate": 3.7656459705172255e-06, "loss": 0.0403, "step": 3639 }, { "epoch": 1.6560509554140128, "grad_norm": 0.6839263706918253, "learning_rate": 3.7650296193453916e-06, "loss": 0.0269, "step": 3640 }, { "epoch": 1.656505914467698, "grad_norm": 0.6993714428573524, "learning_rate": 3.7644131648010494e-06, "loss": 0.0246, "step": 3641 }, { "epoch": 1.656960873521383, "grad_norm": 0.6815473973133779, "learning_rate": 3.7637966069345743e-06, "loss": 0.0203, "step": 3642 }, { "epoch": 1.6574158325750683, "grad_norm": 0.8439769351263455, "learning_rate": 3.7631799457963467e-06, "loss": 0.0419, "step": 3643 }, { "epoch": 1.6578707916287534, "grad_norm": 0.7033184573271442, "learning_rate": 3.7625631814367593e-06, "loss": 0.0331, "step": 3644 }, { "epoch": 1.6583257506824385, "grad_norm": 0.6122877596789658, "learning_rate": 3.7619463139062097e-06, "loss": 0.0242, "step": 3645 }, { "epoch": 1.6587807097361238, "grad_norm": 0.8027844629713855, "learning_rate": 3.761329343255107e-06, "loss": 0.0285, "step": 3646 }, { "epoch": 1.6592356687898089, "grad_norm": 0.5437531086613037, "learning_rate": 3.760712269533866e-06, "loss": 0.0193, "step": 3647 }, { "epoch": 1.659690627843494, "grad_norm": 0.8454673738376148, "learning_rate": 3.7600950927929116e-06, "loss": 0.0289, "step": 3648 }, { "epoch": 1.6601455868971793, "grad_norm": 0.5874359688131018, "learning_rate": 3.759477813082677e-06, "loss": 0.0368, "step": 3649 }, { "epoch": 1.6606005459508644, "grad_norm": 0.561547399632124, "learning_rate": 3.7588604304536026e-06, "loss": 0.0257, "step": 3650 }, { "epoch": 1.6610555050045495, "grad_norm": 0.6516254171674322, "learning_rate": 3.75824294495614e-06, "loss": 0.0264, "step": 3651 }, { "epoch": 1.6615104640582348, "grad_norm": 0.586676169852581, "learning_rate": 3.757625356640745e-06, "loss": 0.026, "step": 3652 }, { "epoch": 1.66196542311192, "grad_norm": 0.4734126319148065, "learning_rate": 3.757007665557886e-06, "loss": 0.0242, "step": 3653 }, { "epoch": 1.662420382165605, "grad_norm": 0.5634091884397163, "learning_rate": 3.7563898717580364e-06, "loss": 0.0319, "step": 3654 }, { "epoch": 1.6628753412192903, "grad_norm": 0.4879942321956978, "learning_rate": 3.755771975291681e-06, "loss": 0.0168, "step": 3655 }, { "epoch": 1.6633303002729756, "grad_norm": 0.5665912556605981, "learning_rate": 3.7551539762093103e-06, "loss": 0.0217, "step": 3656 }, { "epoch": 1.6637852593266604, "grad_norm": 0.5511340592863238, "learning_rate": 3.7545358745614246e-06, "loss": 0.0236, "step": 3657 }, { "epoch": 1.6642402183803457, "grad_norm": 0.7980773287068369, "learning_rate": 3.7539176703985338e-06, "loss": 0.0395, "step": 3658 }, { "epoch": 1.664695177434031, "grad_norm": 0.5272569448570994, "learning_rate": 3.7532993637711524e-06, "loss": 0.017, "step": 3659 }, { "epoch": 1.6651501364877161, "grad_norm": 1.973300676488328, "learning_rate": 3.7526809547298072e-06, "loss": 0.0867, "step": 3660 }, { "epoch": 1.6656050955414012, "grad_norm": 0.4603199822151636, "learning_rate": 3.752062443325032e-06, "loss": 0.0235, "step": 3661 }, { "epoch": 1.6660600545950865, "grad_norm": 0.4984041496096718, "learning_rate": 3.7514438296073678e-06, "loss": 0.0159, "step": 3662 }, { "epoch": 1.6665150136487716, "grad_norm": 0.6486830695220506, "learning_rate": 3.7508251136273656e-06, "loss": 0.0202, "step": 3663 }, { "epoch": 1.6669699727024567, "grad_norm": 0.8468363633924074, "learning_rate": 3.7502062954355835e-06, "loss": 0.0526, "step": 3664 }, { "epoch": 1.667424931756142, "grad_norm": 0.3919623481399935, "learning_rate": 3.749587375082589e-06, "loss": 0.0112, "step": 3665 }, { "epoch": 1.6678798908098271, "grad_norm": 0.5459884336885586, "learning_rate": 3.7489683526189575e-06, "loss": 0.0283, "step": 3666 }, { "epoch": 1.6683348498635122, "grad_norm": 0.5733639666031194, "learning_rate": 3.7483492280952718e-06, "loss": 0.027, "step": 3667 }, { "epoch": 1.6687898089171975, "grad_norm": 0.551070019879109, "learning_rate": 3.747730001562125e-06, "loss": 0.0223, "step": 3668 }, { "epoch": 1.6692447679708826, "grad_norm": 0.8818886132528604, "learning_rate": 3.747110673070117e-06, "loss": 0.0439, "step": 3669 }, { "epoch": 1.6696997270245677, "grad_norm": 0.449458660767284, "learning_rate": 3.7464912426698568e-06, "loss": 0.0129, "step": 3670 }, { "epoch": 1.670154686078253, "grad_norm": 0.5652944170753819, "learning_rate": 3.7458717104119618e-06, "loss": 0.0203, "step": 3671 }, { "epoch": 1.670609645131938, "grad_norm": 0.5678881883605275, "learning_rate": 3.7452520763470567e-06, "loss": 0.0225, "step": 3672 }, { "epoch": 1.6710646041856232, "grad_norm": 0.749443357576462, "learning_rate": 3.7446323405257755e-06, "loss": 0.0366, "step": 3673 }, { "epoch": 1.6715195632393085, "grad_norm": 0.48011063349192723, "learning_rate": 3.7440125029987593e-06, "loss": 0.025, "step": 3674 }, { "epoch": 1.6719745222929936, "grad_norm": 0.6671257763330405, "learning_rate": 3.7433925638166603e-06, "loss": 0.0238, "step": 3675 }, { "epoch": 1.6724294813466787, "grad_norm": 0.4852610433290402, "learning_rate": 3.742772523030136e-06, "loss": 0.0207, "step": 3676 }, { "epoch": 1.672884440400364, "grad_norm": 0.6115036341454524, "learning_rate": 3.742152380689853e-06, "loss": 0.0235, "step": 3677 }, { "epoch": 1.673339399454049, "grad_norm": 0.5855479287907706, "learning_rate": 3.7415321368464872e-06, "loss": 0.0313, "step": 3678 }, { "epoch": 1.6737943585077342, "grad_norm": 0.7448031677845011, "learning_rate": 3.740911791550722e-06, "loss": 0.0178, "step": 3679 }, { "epoch": 1.6742493175614195, "grad_norm": 0.7332173099068802, "learning_rate": 3.7402913448532493e-06, "loss": 0.0288, "step": 3680 }, { "epoch": 1.6747042766151048, "grad_norm": 0.6418505071933155, "learning_rate": 3.7396707968047676e-06, "loss": 0.0327, "step": 3681 }, { "epoch": 1.6751592356687897, "grad_norm": 0.7202448098932808, "learning_rate": 3.7390501474559883e-06, "loss": 0.0347, "step": 3682 }, { "epoch": 1.675614194722475, "grad_norm": 0.823163701581977, "learning_rate": 3.738429396857626e-06, "loss": 0.0326, "step": 3683 }, { "epoch": 1.6760691537761603, "grad_norm": 0.8604154683999686, "learning_rate": 3.7378085450604053e-06, "loss": 0.0277, "step": 3684 }, { "epoch": 1.6765241128298451, "grad_norm": 0.5425924320853744, "learning_rate": 3.7371875921150612e-06, "loss": 0.0206, "step": 3685 }, { "epoch": 1.6769790718835305, "grad_norm": 0.537530491367301, "learning_rate": 3.7365665380723335e-06, "loss": 0.023, "step": 3686 }, { "epoch": 1.6774340309372158, "grad_norm": 0.6072594994123963, "learning_rate": 3.7359453829829734e-06, "loss": 0.0249, "step": 3687 }, { "epoch": 1.6778889899909009, "grad_norm": 0.7277626181433888, "learning_rate": 3.7353241268977373e-06, "loss": 0.0289, "step": 3688 }, { "epoch": 1.678343949044586, "grad_norm": 0.37079405045520875, "learning_rate": 3.734702769867393e-06, "loss": 0.0085, "step": 3689 }, { "epoch": 1.6787989080982713, "grad_norm": 0.6369715456570345, "learning_rate": 3.734081311942714e-06, "loss": 0.0253, "step": 3690 }, { "epoch": 1.6792538671519563, "grad_norm": 0.5823431934795118, "learning_rate": 3.733459753174482e-06, "loss": 0.0166, "step": 3691 }, { "epoch": 1.6797088262056414, "grad_norm": 0.5224766975497316, "learning_rate": 3.7328380936134904e-06, "loss": 0.0239, "step": 3692 }, { "epoch": 1.6801637852593267, "grad_norm": 0.6392489030563797, "learning_rate": 3.732216333310537e-06, "loss": 0.0299, "step": 3693 }, { "epoch": 1.6806187443130118, "grad_norm": 0.6247103573315796, "learning_rate": 3.7315944723164297e-06, "loss": 0.0248, "step": 3694 }, { "epoch": 1.681073703366697, "grad_norm": 0.7473537288598567, "learning_rate": 3.730972510681984e-06, "loss": 0.0345, "step": 3695 }, { "epoch": 1.6815286624203822, "grad_norm": 0.5024685900367807, "learning_rate": 3.7303504484580235e-06, "loss": 0.0191, "step": 3696 }, { "epoch": 1.6819836214740673, "grad_norm": 0.6298449368900484, "learning_rate": 3.729728285695381e-06, "loss": 0.0214, "step": 3697 }, { "epoch": 1.6824385805277524, "grad_norm": 0.4152971336432089, "learning_rate": 3.7291060224448948e-06, "loss": 0.0171, "step": 3698 }, { "epoch": 1.6828935395814377, "grad_norm": 0.6815773728664686, "learning_rate": 3.728483658757417e-06, "loss": 0.0188, "step": 3699 }, { "epoch": 1.6833484986351228, "grad_norm": 0.618924972853139, "learning_rate": 3.7278611946838016e-06, "loss": 0.0185, "step": 3700 }, { "epoch": 1.683803457688808, "grad_norm": 0.5167418108822708, "learning_rate": 3.727238630274914e-06, "loss": 0.0206, "step": 3701 }, { "epoch": 1.6842584167424932, "grad_norm": 0.796862561203587, "learning_rate": 3.726615965581628e-06, "loss": 0.0393, "step": 3702 }, { "epoch": 1.6847133757961783, "grad_norm": 16.410696857396648, "learning_rate": 3.725993200654825e-06, "loss": 0.4378, "step": 3703 }, { "epoch": 1.6851683348498634, "grad_norm": 0.6039077909169345, "learning_rate": 3.725370335545394e-06, "loss": 0.025, "step": 3704 }, { "epoch": 1.6856232939035487, "grad_norm": 0.6758703407711169, "learning_rate": 3.7247473703042324e-06, "loss": 0.0298, "step": 3705 }, { "epoch": 1.686078252957234, "grad_norm": 0.40894796945610395, "learning_rate": 3.7241243049822475e-06, "loss": 0.0167, "step": 3706 }, { "epoch": 1.6865332120109189, "grad_norm": 0.6085395068507089, "learning_rate": 3.723501139630352e-06, "loss": 0.0253, "step": 3707 }, { "epoch": 1.6869881710646042, "grad_norm": 0.5264192185021481, "learning_rate": 3.722877874299469e-06, "loss": 0.0261, "step": 3708 }, { "epoch": 1.6874431301182895, "grad_norm": 0.41356331417687003, "learning_rate": 3.722254509040527e-06, "loss": 0.0152, "step": 3709 }, { "epoch": 1.6878980891719744, "grad_norm": 0.5322391946249139, "learning_rate": 3.721631043904468e-06, "loss": 0.0242, "step": 3710 }, { "epoch": 1.6883530482256597, "grad_norm": 0.5168838283494139, "learning_rate": 3.7210074789422363e-06, "loss": 0.021, "step": 3711 }, { "epoch": 1.688808007279345, "grad_norm": 0.6955554477419243, "learning_rate": 3.7203838142047875e-06, "loss": 0.039, "step": 3712 }, { "epoch": 1.68926296633303, "grad_norm": 0.5179778216573575, "learning_rate": 3.719760049743084e-06, "loss": 0.0265, "step": 3713 }, { "epoch": 1.6897179253867152, "grad_norm": 0.9036619682190697, "learning_rate": 3.719136185608099e-06, "loss": 0.0387, "step": 3714 }, { "epoch": 1.6901728844404005, "grad_norm": 0.4710707649389809, "learning_rate": 3.7185122218508097e-06, "loss": 0.0239, "step": 3715 }, { "epoch": 1.6906278434940856, "grad_norm": 0.47157157168562497, "learning_rate": 3.717888158522204e-06, "loss": 0.0222, "step": 3716 }, { "epoch": 1.6910828025477707, "grad_norm": 0.550350895261478, "learning_rate": 3.717263995673278e-06, "loss": 0.0241, "step": 3717 }, { "epoch": 1.691537761601456, "grad_norm": 0.6148748686415011, "learning_rate": 3.7166397333550357e-06, "loss": 0.0232, "step": 3718 }, { "epoch": 1.691992720655141, "grad_norm": 0.6910976774877058, "learning_rate": 3.7160153716184887e-06, "loss": 0.0344, "step": 3719 }, { "epoch": 1.6924476797088261, "grad_norm": 0.5659383368243334, "learning_rate": 3.7153909105146567e-06, "loss": 0.0203, "step": 3720 }, { "epoch": 1.6929026387625115, "grad_norm": 0.47971439657054477, "learning_rate": 3.7147663500945692e-06, "loss": 0.0179, "step": 3721 }, { "epoch": 1.6933575978161965, "grad_norm": 0.6295216941365326, "learning_rate": 3.7141416904092605e-06, "loss": 0.0333, "step": 3722 }, { "epoch": 1.6938125568698816, "grad_norm": 0.5946963393375687, "learning_rate": 3.713516931509775e-06, "loss": 0.0278, "step": 3723 }, { "epoch": 1.694267515923567, "grad_norm": 0.9716720750571886, "learning_rate": 3.7128920734471677e-06, "loss": 0.044, "step": 3724 }, { "epoch": 1.694722474977252, "grad_norm": 0.6509186594911334, "learning_rate": 3.7122671162724966e-06, "loss": 0.021, "step": 3725 }, { "epoch": 1.6951774340309371, "grad_norm": 0.6504872332839807, "learning_rate": 3.711642060036832e-06, "loss": 0.0304, "step": 3726 }, { "epoch": 1.6956323930846224, "grad_norm": 0.4843073380130816, "learning_rate": 3.711016904791249e-06, "loss": 0.0222, "step": 3727 }, { "epoch": 1.6960873521383075, "grad_norm": 0.5142043375988194, "learning_rate": 3.7103916505868342e-06, "loss": 0.0199, "step": 3728 }, { "epoch": 1.6965423111919926, "grad_norm": 0.5807813360855487, "learning_rate": 3.7097662974746795e-06, "loss": 0.0314, "step": 3729 }, { "epoch": 1.696997270245678, "grad_norm": 0.7719714775202473, "learning_rate": 3.7091408455058862e-06, "loss": 0.0337, "step": 3730 }, { "epoch": 1.697452229299363, "grad_norm": 0.4962539736030745, "learning_rate": 3.708515294731564e-06, "loss": 0.0187, "step": 3731 }, { "epoch": 1.697907188353048, "grad_norm": 0.5702466445302747, "learning_rate": 3.707889645202829e-06, "loss": 0.0224, "step": 3732 }, { "epoch": 1.6983621474067334, "grad_norm": 0.505469387514989, "learning_rate": 3.707263896970807e-06, "loss": 0.0236, "step": 3733 }, { "epoch": 1.6988171064604187, "grad_norm": 0.5040928264719208, "learning_rate": 3.706638050086631e-06, "loss": 0.0229, "step": 3734 }, { "epoch": 1.6992720655141036, "grad_norm": 0.5081632032813646, "learning_rate": 3.7060121046014434e-06, "loss": 0.02, "step": 3735 }, { "epoch": 1.699727024567789, "grad_norm": 0.41395460392786865, "learning_rate": 3.7053860605663927e-06, "loss": 0.0177, "step": 3736 }, { "epoch": 1.7001819836214742, "grad_norm": 0.8570781325955044, "learning_rate": 3.704759918032636e-06, "loss": 0.0309, "step": 3737 }, { "epoch": 1.700636942675159, "grad_norm": 0.5721539588385257, "learning_rate": 3.7041336770513403e-06, "loss": 0.0334, "step": 3738 }, { "epoch": 1.7010919017288444, "grad_norm": 0.6526103729827067, "learning_rate": 3.703507337673678e-06, "loss": 0.024, "step": 3739 }, { "epoch": 1.7015468607825297, "grad_norm": 0.7963773879297332, "learning_rate": 3.702880899950831e-06, "loss": 0.0407, "step": 3740 }, { "epoch": 1.7020018198362148, "grad_norm": 0.574933934375484, "learning_rate": 3.702254363933989e-06, "loss": 0.0187, "step": 3741 }, { "epoch": 1.7024567788898999, "grad_norm": 2.139762665256868, "learning_rate": 3.7016277296743496e-06, "loss": 0.0599, "step": 3742 }, { "epoch": 1.7029117379435852, "grad_norm": 0.5153834647656831, "learning_rate": 3.7010009972231186e-06, "loss": 0.0182, "step": 3743 }, { "epoch": 1.7033666969972703, "grad_norm": 0.6295570251944255, "learning_rate": 3.7003741666315095e-06, "loss": 0.0258, "step": 3744 }, { "epoch": 1.7038216560509554, "grad_norm": 0.8031954347623854, "learning_rate": 3.6997472379507454e-06, "loss": 0.0312, "step": 3745 }, { "epoch": 1.7042766151046407, "grad_norm": 0.8843858410369263, "learning_rate": 3.6991202112320544e-06, "loss": 0.0256, "step": 3746 }, { "epoch": 1.7047315741583258, "grad_norm": 0.561371813852862, "learning_rate": 3.6984930865266744e-06, "loss": 0.0254, "step": 3747 }, { "epoch": 1.7051865332120109, "grad_norm": 0.6772781140572789, "learning_rate": 3.6978658638858526e-06, "loss": 0.0249, "step": 3748 }, { "epoch": 1.7056414922656962, "grad_norm": 0.7681255950690041, "learning_rate": 3.6972385433608416e-06, "loss": 0.0313, "step": 3749 }, { "epoch": 1.7060964513193813, "grad_norm": 0.8281465322568329, "learning_rate": 3.6966111250029035e-06, "loss": 0.0362, "step": 3750 }, { "epoch": 1.7065514103730663, "grad_norm": 0.6778073444941, "learning_rate": 3.695983608863308e-06, "loss": 0.016, "step": 3751 }, { "epoch": 1.7070063694267517, "grad_norm": 0.3181064433588746, "learning_rate": 3.6953559949933334e-06, "loss": 0.0081, "step": 3752 }, { "epoch": 1.7074613284804367, "grad_norm": 0.5307372594094432, "learning_rate": 3.6947282834442643e-06, "loss": 0.0201, "step": 3753 }, { "epoch": 1.7079162875341218, "grad_norm": 0.972407393019446, "learning_rate": 3.6941004742673958e-06, "loss": 0.0273, "step": 3754 }, { "epoch": 1.7083712465878071, "grad_norm": 0.9214372631782088, "learning_rate": 3.693472567514029e-06, "loss": 0.0394, "step": 3755 }, { "epoch": 1.7088262056414922, "grad_norm": 0.48743904966054435, "learning_rate": 3.692844563235474e-06, "loss": 0.0199, "step": 3756 }, { "epoch": 1.7092811646951773, "grad_norm": 0.6817160494596262, "learning_rate": 3.692216461483047e-06, "loss": 0.0298, "step": 3757 }, { "epoch": 1.7097361237488626, "grad_norm": 0.3947750396419106, "learning_rate": 3.6915882623080756e-06, "loss": 0.0151, "step": 3758 }, { "epoch": 1.7101910828025477, "grad_norm": 1.1295486681126539, "learning_rate": 3.690959965761893e-06, "loss": 0.0547, "step": 3759 }, { "epoch": 1.7106460418562328, "grad_norm": 0.7037460217315583, "learning_rate": 3.6903315718958397e-06, "loss": 0.0311, "step": 3760 }, { "epoch": 1.7111010009099181, "grad_norm": 0.413788308194387, "learning_rate": 3.6897030807612655e-06, "loss": 0.0152, "step": 3761 }, { "epoch": 1.7115559599636034, "grad_norm": 0.6745789601697255, "learning_rate": 3.689074492409529e-06, "loss": 0.0327, "step": 3762 }, { "epoch": 1.7120109190172883, "grad_norm": 0.7540995398146106, "learning_rate": 3.6884458068919935e-06, "loss": 0.0372, "step": 3763 }, { "epoch": 1.7124658780709736, "grad_norm": 0.5881773051093517, "learning_rate": 3.687817024260035e-06, "loss": 0.0196, "step": 3764 }, { "epoch": 1.712920837124659, "grad_norm": 0.5571006232789326, "learning_rate": 3.687188144565033e-06, "loss": 0.0224, "step": 3765 }, { "epoch": 1.7133757961783438, "grad_norm": 0.4494519698168543, "learning_rate": 3.6865591678583775e-06, "loss": 0.0225, "step": 3766 }, { "epoch": 1.713830755232029, "grad_norm": 0.7711446370438744, "learning_rate": 3.685930094191465e-06, "loss": 0.0234, "step": 3767 }, { "epoch": 1.7142857142857144, "grad_norm": 0.8397312671641753, "learning_rate": 3.6853009236157e-06, "loss": 0.0423, "step": 3768 }, { "epoch": 1.7147406733393995, "grad_norm": 0.683211885753347, "learning_rate": 3.684671656182497e-06, "loss": 0.0216, "step": 3769 }, { "epoch": 1.7151956323930846, "grad_norm": 0.4348183694352422, "learning_rate": 3.6840422919432762e-06, "loss": 0.0165, "step": 3770 }, { "epoch": 1.71565059144677, "grad_norm": 0.8763212022348565, "learning_rate": 3.683412830949466e-06, "loss": 0.0378, "step": 3771 }, { "epoch": 1.716105550500455, "grad_norm": 0.4619274536838717, "learning_rate": 3.6827832732525042e-06, "loss": 0.0175, "step": 3772 }, { "epoch": 1.71656050955414, "grad_norm": 0.41456934939422674, "learning_rate": 3.6821536189038343e-06, "loss": 0.0192, "step": 3773 }, { "epoch": 1.7170154686078254, "grad_norm": 0.46901142214125285, "learning_rate": 3.681523867954909e-06, "loss": 0.015, "step": 3774 }, { "epoch": 1.7174704276615105, "grad_norm": 0.327082992709135, "learning_rate": 3.6808940204571895e-06, "loss": 0.0128, "step": 3775 }, { "epoch": 1.7179253867151956, "grad_norm": 0.6033494642390252, "learning_rate": 3.6802640764621427e-06, "loss": 0.0393, "step": 3776 }, { "epoch": 1.7183803457688809, "grad_norm": 0.5929060959086757, "learning_rate": 3.6796340360212467e-06, "loss": 0.0302, "step": 3777 }, { "epoch": 1.718835304822566, "grad_norm": 0.5969951393161446, "learning_rate": 3.679003899185983e-06, "loss": 0.0193, "step": 3778 }, { "epoch": 1.719290263876251, "grad_norm": 0.4888552177499924, "learning_rate": 3.6783736660078463e-06, "loss": 0.0179, "step": 3779 }, { "epoch": 1.7197452229299364, "grad_norm": 0.4857334931955398, "learning_rate": 3.6777433365383348e-06, "loss": 0.0176, "step": 3780 }, { "epoch": 1.7202001819836215, "grad_norm": 0.500193230861817, "learning_rate": 3.6771129108289568e-06, "loss": 0.0247, "step": 3781 }, { "epoch": 1.7206551410373065, "grad_norm": 0.5069256645883959, "learning_rate": 3.6764823889312263e-06, "loss": 0.0177, "step": 3782 }, { "epoch": 1.7211101000909919, "grad_norm": 0.42689246859965685, "learning_rate": 3.675851770896669e-06, "loss": 0.0198, "step": 3783 }, { "epoch": 1.721565059144677, "grad_norm": 0.6307321078470965, "learning_rate": 3.675221056776815e-06, "loss": 0.0282, "step": 3784 }, { "epoch": 1.722020018198362, "grad_norm": 0.4160803144159328, "learning_rate": 3.6745902466232027e-06, "loss": 0.0168, "step": 3785 }, { "epoch": 1.7224749772520473, "grad_norm": 0.42583746871846967, "learning_rate": 3.6739593404873804e-06, "loss": 0.0169, "step": 3786 }, { "epoch": 1.7229299363057324, "grad_norm": 0.5970593946647448, "learning_rate": 3.6733283384209022e-06, "loss": 0.0303, "step": 3787 }, { "epoch": 1.7233848953594175, "grad_norm": 0.324960274386966, "learning_rate": 3.6726972404753313e-06, "loss": 0.0124, "step": 3788 }, { "epoch": 1.7238398544131028, "grad_norm": 0.5534609578181952, "learning_rate": 3.672066046702237e-06, "loss": 0.0217, "step": 3789 }, { "epoch": 1.7242948134667881, "grad_norm": 0.5587241395138457, "learning_rate": 3.6714347571531993e-06, "loss": 0.0225, "step": 3790 }, { "epoch": 1.724749772520473, "grad_norm": 0.4138423827236354, "learning_rate": 3.670803371879803e-06, "loss": 0.0201, "step": 3791 }, { "epoch": 1.7252047315741583, "grad_norm": 0.415855766212165, "learning_rate": 3.6701718909336424e-06, "loss": 0.0145, "step": 3792 }, { "epoch": 1.7256596906278436, "grad_norm": 0.6435316880247659, "learning_rate": 3.669540314366319e-06, "loss": 0.0316, "step": 3793 }, { "epoch": 1.7261146496815285, "grad_norm": 0.506440622429494, "learning_rate": 3.6689086422294434e-06, "loss": 0.013, "step": 3794 }, { "epoch": 1.7265696087352138, "grad_norm": 0.4093327188417235, "learning_rate": 3.6682768745746317e-06, "loss": 0.0127, "step": 3795 }, { "epoch": 1.7270245677888991, "grad_norm": 0.5734344443277041, "learning_rate": 3.66764501145351e-06, "loss": 0.0205, "step": 3796 }, { "epoch": 1.7274795268425842, "grad_norm": 0.3817754141025844, "learning_rate": 3.6670130529177108e-06, "loss": 0.0186, "step": 3797 }, { "epoch": 1.7279344858962693, "grad_norm": 0.537107369260746, "learning_rate": 3.6663809990188752e-06, "loss": 0.0146, "step": 3798 }, { "epoch": 1.7283894449499546, "grad_norm": 0.49735286104024884, "learning_rate": 3.6657488498086517e-06, "loss": 0.0133, "step": 3799 }, { "epoch": 1.7288444040036397, "grad_norm": 0.47221804561763336, "learning_rate": 3.6651166053386966e-06, "loss": 0.0187, "step": 3800 }, { "epoch": 1.7292993630573248, "grad_norm": 0.6388205492573055, "learning_rate": 3.664484265660675e-06, "loss": 0.0257, "step": 3801 }, { "epoch": 1.72975432211101, "grad_norm": 0.6199481077062792, "learning_rate": 3.6638518308262567e-06, "loss": 0.0317, "step": 3802 }, { "epoch": 1.7302092811646952, "grad_norm": 0.49900585821193677, "learning_rate": 3.663219300887123e-06, "loss": 0.0213, "step": 3803 }, { "epoch": 1.7306642402183803, "grad_norm": 0.5335091474935367, "learning_rate": 3.6625866758949614e-06, "loss": 0.024, "step": 3804 }, { "epoch": 1.7311191992720656, "grad_norm": 0.5670937501409793, "learning_rate": 3.6619539559014673e-06, "loss": 0.0187, "step": 3805 }, { "epoch": 1.7315741583257507, "grad_norm": 0.8326133683447557, "learning_rate": 3.661321140958342e-06, "loss": 0.0357, "step": 3806 }, { "epoch": 1.7320291173794358, "grad_norm": 0.5421609083192159, "learning_rate": 3.660688231117298e-06, "loss": 0.0231, "step": 3807 }, { "epoch": 1.732484076433121, "grad_norm": 0.7365065719826951, "learning_rate": 3.660055226430054e-06, "loss": 0.0295, "step": 3808 }, { "epoch": 1.7329390354868062, "grad_norm": 1.0183754445566922, "learning_rate": 3.6594221269483356e-06, "loss": 0.039, "step": 3809 }, { "epoch": 1.7333939945404913, "grad_norm": 0.7024744503951253, "learning_rate": 3.658788932723876e-06, "loss": 0.0313, "step": 3810 }, { "epoch": 1.7338489535941766, "grad_norm": 0.5041725108988522, "learning_rate": 3.6581556438084185e-06, "loss": 0.01, "step": 3811 }, { "epoch": 1.7343039126478617, "grad_norm": 0.5707798595479919, "learning_rate": 3.6575222602537118e-06, "loss": 0.0214, "step": 3812 }, { "epoch": 1.7347588717015467, "grad_norm": 0.5529421515058182, "learning_rate": 3.6568887821115134e-06, "loss": 0.0191, "step": 3813 }, { "epoch": 1.735213830755232, "grad_norm": 0.7144596376367993, "learning_rate": 3.6562552094335878e-06, "loss": 0.0334, "step": 3814 }, { "epoch": 1.7356687898089171, "grad_norm": 0.6908144788678268, "learning_rate": 3.655621542271709e-06, "loss": 0.0249, "step": 3815 }, { "epoch": 1.7361237488626022, "grad_norm": 0.6371182292758583, "learning_rate": 3.654987780677656e-06, "loss": 0.031, "step": 3816 }, { "epoch": 1.7365787079162875, "grad_norm": 0.6184433778112067, "learning_rate": 3.654353924703217e-06, "loss": 0.0225, "step": 3817 }, { "epoch": 1.7370336669699729, "grad_norm": 0.40702156642919257, "learning_rate": 3.6537199744001893e-06, "loss": 0.0129, "step": 3818 }, { "epoch": 1.7374886260236577, "grad_norm": 0.5413199568074462, "learning_rate": 3.6530859298203746e-06, "loss": 0.0254, "step": 3819 }, { "epoch": 1.737943585077343, "grad_norm": 0.4914812587364899, "learning_rate": 3.6524517910155853e-06, "loss": 0.0213, "step": 3820 }, { "epoch": 1.7383985441310283, "grad_norm": 6.570695852121233, "learning_rate": 3.65181755803764e-06, "loss": 0.0807, "step": 3821 }, { "epoch": 1.7388535031847132, "grad_norm": 0.6210010475391491, "learning_rate": 3.6511832309383654e-06, "loss": 0.0252, "step": 3822 }, { "epoch": 1.7393084622383985, "grad_norm": 0.5307900469885153, "learning_rate": 3.6505488097695963e-06, "loss": 0.0156, "step": 3823 }, { "epoch": 1.7397634212920838, "grad_norm": 1.0474186407382713, "learning_rate": 3.6499142945831732e-06, "loss": 0.0449, "step": 3824 }, { "epoch": 1.740218380345769, "grad_norm": 0.5856174057365933, "learning_rate": 3.649279685430948e-06, "loss": 0.0212, "step": 3825 }, { "epoch": 1.740673339399454, "grad_norm": 0.7487230793612085, "learning_rate": 3.648644982364777e-06, "loss": 0.0327, "step": 3826 }, { "epoch": 1.7411282984531393, "grad_norm": 0.8370183653893686, "learning_rate": 3.648010185436525e-06, "loss": 0.0349, "step": 3827 }, { "epoch": 1.7415832575068244, "grad_norm": 0.573210475770346, "learning_rate": 3.6473752946980644e-06, "loss": 0.0279, "step": 3828 }, { "epoch": 1.7420382165605095, "grad_norm": 0.6791246869658677, "learning_rate": 3.6467403102012767e-06, "loss": 0.0249, "step": 3829 }, { "epoch": 1.7424931756141948, "grad_norm": 0.7888223019204079, "learning_rate": 3.64610523199805e-06, "loss": 0.0382, "step": 3830 }, { "epoch": 1.74294813466788, "grad_norm": 0.48924629959412974, "learning_rate": 3.6454700601402783e-06, "loss": 0.0172, "step": 3831 }, { "epoch": 1.743403093721565, "grad_norm": 0.8852683743955648, "learning_rate": 3.6448347946798672e-06, "loss": 0.0418, "step": 3832 }, { "epoch": 1.7438580527752503, "grad_norm": 0.7758243777941729, "learning_rate": 3.6441994356687265e-06, "loss": 0.0312, "step": 3833 }, { "epoch": 1.7443130118289354, "grad_norm": 0.7032750998686196, "learning_rate": 3.643563983158775e-06, "loss": 0.0329, "step": 3834 }, { "epoch": 1.7447679708826205, "grad_norm": 0.5317983715720668, "learning_rate": 3.642928437201939e-06, "loss": 0.0238, "step": 3835 }, { "epoch": 1.7452229299363058, "grad_norm": 0.6749497364473966, "learning_rate": 3.642292797850153e-06, "loss": 0.0338, "step": 3836 }, { "epoch": 1.7456778889899909, "grad_norm": 0.5413385003454867, "learning_rate": 3.641657065155358e-06, "loss": 0.0222, "step": 3837 }, { "epoch": 1.746132848043676, "grad_norm": 0.7915659846437547, "learning_rate": 3.6410212391695023e-06, "loss": 0.0253, "step": 3838 }, { "epoch": 1.7465878070973613, "grad_norm": 0.4970696189128546, "learning_rate": 3.6403853199445448e-06, "loss": 0.0141, "step": 3839 }, { "epoch": 1.7470427661510464, "grad_norm": 0.44885237124059263, "learning_rate": 3.6397493075324486e-06, "loss": 0.018, "step": 3840 }, { "epoch": 1.7474977252047315, "grad_norm": 0.6822120402090854, "learning_rate": 3.6391132019851857e-06, "loss": 0.0289, "step": 3841 }, { "epoch": 1.7479526842584168, "grad_norm": 0.3885326548095813, "learning_rate": 3.6384770033547366e-06, "loss": 0.0123, "step": 3842 }, { "epoch": 1.7484076433121019, "grad_norm": 0.5503894038476335, "learning_rate": 3.637840711693088e-06, "loss": 0.0199, "step": 3843 }, { "epoch": 1.748862602365787, "grad_norm": 0.5708299264033291, "learning_rate": 3.637204327052235e-06, "loss": 0.0298, "step": 3844 }, { "epoch": 1.7493175614194723, "grad_norm": 0.6508256310761806, "learning_rate": 3.6365678494841795e-06, "loss": 0.0384, "step": 3845 }, { "epoch": 1.7497725204731576, "grad_norm": 0.5643977346261464, "learning_rate": 3.6359312790409323e-06, "loss": 0.028, "step": 3846 }, { "epoch": 1.7502274795268424, "grad_norm": 0.5623851041936763, "learning_rate": 3.635294615774511e-06, "loss": 0.0316, "step": 3847 }, { "epoch": 1.7506824385805277, "grad_norm": 0.6335326929124626, "learning_rate": 3.6346578597369397e-06, "loss": 0.0276, "step": 3848 }, { "epoch": 1.751137397634213, "grad_norm": 0.7013109441398028, "learning_rate": 3.634021010980254e-06, "loss": 0.0315, "step": 3849 }, { "epoch": 1.7515923566878981, "grad_norm": 0.9579212494639825, "learning_rate": 3.633384069556491e-06, "loss": 0.0628, "step": 3850 }, { "epoch": 1.7520473157415832, "grad_norm": 0.5048993866431397, "learning_rate": 3.6327470355177006e-06, "loss": 0.0225, "step": 3851 }, { "epoch": 1.7525022747952685, "grad_norm": 0.5883308745672281, "learning_rate": 3.6321099089159377e-06, "loss": 0.0306, "step": 3852 }, { "epoch": 1.7529572338489536, "grad_norm": 0.5431914009800679, "learning_rate": 3.631472689803266e-06, "loss": 0.0219, "step": 3853 }, { "epoch": 1.7534121929026387, "grad_norm": 0.6136466481448818, "learning_rate": 3.6308353782317557e-06, "loss": 0.0345, "step": 3854 }, { "epoch": 1.753867151956324, "grad_norm": 0.6477013738851053, "learning_rate": 3.6301979742534844e-06, "loss": 0.0347, "step": 3855 }, { "epoch": 1.7543221110100091, "grad_norm": 0.649955937465349, "learning_rate": 3.6295604779205394e-06, "loss": 0.0269, "step": 3856 }, { "epoch": 1.7547770700636942, "grad_norm": 0.5451073684407265, "learning_rate": 3.6289228892850126e-06, "loss": 0.0232, "step": 3857 }, { "epoch": 1.7552320291173795, "grad_norm": 0.6501454323654668, "learning_rate": 3.628285208399006e-06, "loss": 0.0383, "step": 3858 }, { "epoch": 1.7556869881710646, "grad_norm": 0.7215446452564725, "learning_rate": 3.6276474353146274e-06, "loss": 0.0314, "step": 3859 }, { "epoch": 1.7561419472247497, "grad_norm": 0.6315136771122604, "learning_rate": 3.6270095700839926e-06, "loss": 0.036, "step": 3860 }, { "epoch": 1.756596906278435, "grad_norm": 0.6253760413777038, "learning_rate": 3.6263716127592253e-06, "loss": 0.0248, "step": 3861 }, { "epoch": 1.75705186533212, "grad_norm": 0.32504064719636533, "learning_rate": 3.6257335633924564e-06, "loss": 0.01, "step": 3862 }, { "epoch": 1.7575068243858052, "grad_norm": 0.7224805704522139, "learning_rate": 3.6250954220358248e-06, "loss": 0.0337, "step": 3863 }, { "epoch": 1.7579617834394905, "grad_norm": 0.7791448578606137, "learning_rate": 3.624457188741476e-06, "loss": 0.0345, "step": 3864 }, { "epoch": 1.7584167424931756, "grad_norm": 0.5675607416057377, "learning_rate": 3.6238188635615636e-06, "loss": 0.0272, "step": 3865 }, { "epoch": 1.7588717015468607, "grad_norm": 0.5160544818888646, "learning_rate": 3.6231804465482483e-06, "loss": 0.0159, "step": 3866 }, { "epoch": 1.759326660600546, "grad_norm": 0.6248463178241036, "learning_rate": 3.6225419377536997e-06, "loss": 0.0272, "step": 3867 }, { "epoch": 1.759781619654231, "grad_norm": 0.49893607907020765, "learning_rate": 3.6219033372300937e-06, "loss": 0.0215, "step": 3868 }, { "epoch": 1.7602365787079162, "grad_norm": 0.6287338412781016, "learning_rate": 3.621264645029613e-06, "loss": 0.0254, "step": 3869 }, { "epoch": 1.7606915377616015, "grad_norm": 0.576700858995691, "learning_rate": 3.6206258612044486e-06, "loss": 0.0215, "step": 3870 }, { "epoch": 1.7611464968152868, "grad_norm": 0.5569509873792899, "learning_rate": 3.6199869858068003e-06, "loss": 0.0175, "step": 3871 }, { "epoch": 1.7616014558689717, "grad_norm": 0.525407504415116, "learning_rate": 3.619348018888873e-06, "loss": 0.0141, "step": 3872 }, { "epoch": 1.762056414922657, "grad_norm": 0.4867512209136127, "learning_rate": 3.618708960502881e-06, "loss": 0.0273, "step": 3873 }, { "epoch": 1.7625113739763423, "grad_norm": 0.6767457078737245, "learning_rate": 3.6180698107010435e-06, "loss": 0.0385, "step": 3874 }, { "epoch": 1.7629663330300271, "grad_norm": 0.8052724611893239, "learning_rate": 3.617430569535592e-06, "loss": 0.0335, "step": 3875 }, { "epoch": 1.7634212920837125, "grad_norm": 0.6405382179049982, "learning_rate": 3.61679123705876e-06, "loss": 0.0315, "step": 3876 }, { "epoch": 1.7638762511373978, "grad_norm": 0.6433604509135347, "learning_rate": 3.616151813322791e-06, "loss": 0.0261, "step": 3877 }, { "epoch": 1.7643312101910829, "grad_norm": 0.6625365479641434, "learning_rate": 3.615512298379937e-06, "loss": 0.0285, "step": 3878 }, { "epoch": 1.764786169244768, "grad_norm": 0.5427435010837753, "learning_rate": 3.6148726922824545e-06, "loss": 0.0225, "step": 3879 }, { "epoch": 1.7652411282984533, "grad_norm": 0.660414462404647, "learning_rate": 3.614232995082611e-06, "loss": 0.0292, "step": 3880 }, { "epoch": 1.7656960873521383, "grad_norm": 0.9199890587743693, "learning_rate": 3.6135932068326797e-06, "loss": 0.0322, "step": 3881 }, { "epoch": 1.7661510464058234, "grad_norm": 0.5148296035434483, "learning_rate": 3.6129533275849395e-06, "loss": 0.0203, "step": 3882 }, { "epoch": 1.7666060054595087, "grad_norm": 0.48186441552250603, "learning_rate": 3.6123133573916792e-06, "loss": 0.0126, "step": 3883 }, { "epoch": 1.7670609645131938, "grad_norm": 0.6283861081525925, "learning_rate": 3.6116732963051946e-06, "loss": 0.0243, "step": 3884 }, { "epoch": 1.767515923566879, "grad_norm": 0.567215610102529, "learning_rate": 3.611033144377789e-06, "loss": 0.0281, "step": 3885 } ], "logging_steps": 1, "max_steps": 10990, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 555, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 25601289412608.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }