{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 2524, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000792393026941363, "grad_norm": 8.754895095644631, "learning_rate": 0.0, "loss": 1.2351, "step": 1 }, { "epoch": 0.001584786053882726, "grad_norm": 9.421868893027085, "learning_rate": 1.9607843137254904e-07, "loss": 1.2426, "step": 2 }, { "epoch": 0.002377179080824089, "grad_norm": 8.518167653125147, "learning_rate": 3.921568627450981e-07, "loss": 1.2244, "step": 3 }, { "epoch": 0.003169572107765452, "grad_norm": 7.752066621252201, "learning_rate": 5.882352941176471e-07, "loss": 1.1732, "step": 4 }, { "epoch": 0.003961965134706815, "grad_norm": 11.19191987801612, "learning_rate": 7.843137254901962e-07, "loss": 1.1989, "step": 5 }, { "epoch": 0.004754358161648178, "grad_norm": 9.154493348014704, "learning_rate": 9.80392156862745e-07, "loss": 1.1947, "step": 6 }, { "epoch": 0.005546751188589541, "grad_norm": 8.273329846989816, "learning_rate": 1.1764705882352942e-06, "loss": 1.2542, "step": 7 }, { "epoch": 0.006339144215530904, "grad_norm": 6.440064409026694, "learning_rate": 1.3725490196078434e-06, "loss": 1.1648, "step": 8 }, { "epoch": 0.0071315372424722665, "grad_norm": 6.599183623100538, "learning_rate": 1.5686274509803923e-06, "loss": 1.1455, "step": 9 }, { "epoch": 0.00792393026941363, "grad_norm": 8.592495994783802, "learning_rate": 1.7647058823529414e-06, "loss": 1.2054, "step": 10 }, { "epoch": 0.008716323296354992, "grad_norm": 5.278616405195021, "learning_rate": 1.96078431372549e-06, "loss": 1.1779, "step": 11 }, { "epoch": 0.009508716323296355, "grad_norm": 4.4682951252861205, "learning_rate": 2.1568627450980393e-06, "loss": 1.0495, "step": 12 }, { "epoch": 0.010301109350237718, "grad_norm": 4.195387151730475, "learning_rate": 2.3529411764705885e-06, "loss": 1.0061, "step": 13 }, { "epoch": 0.011093502377179081, "grad_norm": 3.899372085899887, "learning_rate": 2.549019607843137e-06, "loss": 1.0393, "step": 14 }, { "epoch": 0.011885895404120444, "grad_norm": 2.8404492688963536, "learning_rate": 2.7450980392156867e-06, "loss": 0.9967, "step": 15 }, { "epoch": 0.012678288431061807, "grad_norm": 3.6163909308768707, "learning_rate": 2.9411764705882355e-06, "loss": 1.005, "step": 16 }, { "epoch": 0.01347068145800317, "grad_norm": 2.665336415242126, "learning_rate": 3.1372549019607846e-06, "loss": 1.027, "step": 17 }, { "epoch": 0.014263074484944533, "grad_norm": 2.8157996773251357, "learning_rate": 3.3333333333333333e-06, "loss": 1.004, "step": 18 }, { "epoch": 0.015055467511885896, "grad_norm": 2.8434224274983997, "learning_rate": 3.529411764705883e-06, "loss": 0.995, "step": 19 }, { "epoch": 0.01584786053882726, "grad_norm": 2.9373492388106883, "learning_rate": 3.7254901960784316e-06, "loss": 1.0094, "step": 20 }, { "epoch": 0.01664025356576862, "grad_norm": 2.8174365736038776, "learning_rate": 3.92156862745098e-06, "loss": 0.9328, "step": 21 }, { "epoch": 0.017432646592709985, "grad_norm": 3.1579352008601105, "learning_rate": 4.11764705882353e-06, "loss": 0.9947, "step": 22 }, { "epoch": 0.018225039619651346, "grad_norm": 3.2135790385829557, "learning_rate": 4.313725490196079e-06, "loss": 1.0012, "step": 23 }, { "epoch": 0.01901743264659271, "grad_norm": 2.614973935752988, "learning_rate": 4.509803921568628e-06, "loss": 0.9242, "step": 24 }, { "epoch": 0.019809825673534072, "grad_norm": 2.3304352863721194, "learning_rate": 4.705882352941177e-06, "loss": 0.8703, "step": 25 }, { "epoch": 0.020602218700475437, "grad_norm": 2.3246333772931367, "learning_rate": 4.901960784313726e-06, "loss": 0.9645, "step": 26 }, { "epoch": 0.021394611727416798, "grad_norm": 1.999218103894415, "learning_rate": 5.098039215686274e-06, "loss": 0.8555, "step": 27 }, { "epoch": 0.022187004754358162, "grad_norm": 2.6137326289819494, "learning_rate": 5.294117647058824e-06, "loss": 0.9312, "step": 28 }, { "epoch": 0.022979397781299524, "grad_norm": 3.1527382212087773, "learning_rate": 5.4901960784313735e-06, "loss": 0.9411, "step": 29 }, { "epoch": 0.02377179080824089, "grad_norm": 2.7105402875747573, "learning_rate": 5.686274509803922e-06, "loss": 0.8714, "step": 30 }, { "epoch": 0.02456418383518225, "grad_norm": 2.4518563815971453, "learning_rate": 5.882352941176471e-06, "loss": 0.9178, "step": 31 }, { "epoch": 0.025356576862123614, "grad_norm": 2.2410760951013122, "learning_rate": 6.07843137254902e-06, "loss": 0.8587, "step": 32 }, { "epoch": 0.026148969889064975, "grad_norm": 2.201808002288626, "learning_rate": 6.274509803921569e-06, "loss": 0.9404, "step": 33 }, { "epoch": 0.02694136291600634, "grad_norm": 2.2600923762247045, "learning_rate": 6.470588235294119e-06, "loss": 0.9238, "step": 34 }, { "epoch": 0.0277337559429477, "grad_norm": 2.20786931564006, "learning_rate": 6.666666666666667e-06, "loss": 0.8722, "step": 35 }, { "epoch": 0.028526148969889066, "grad_norm": 1.8327424368592977, "learning_rate": 6.862745098039216e-06, "loss": 0.8305, "step": 36 }, { "epoch": 0.029318541996830427, "grad_norm": 2.313696142057337, "learning_rate": 7.058823529411766e-06, "loss": 0.7854, "step": 37 }, { "epoch": 0.030110935023771792, "grad_norm": 2.1264127791045304, "learning_rate": 7.2549019607843145e-06, "loss": 0.8598, "step": 38 }, { "epoch": 0.030903328050713153, "grad_norm": 1.6677708660857238, "learning_rate": 7.450980392156863e-06, "loss": 0.8225, "step": 39 }, { "epoch": 0.03169572107765452, "grad_norm": 1.6980490360756055, "learning_rate": 7.647058823529411e-06, "loss": 0.8711, "step": 40 }, { "epoch": 0.03248811410459588, "grad_norm": 1.905254307790843, "learning_rate": 7.84313725490196e-06, "loss": 0.8562, "step": 41 }, { "epoch": 0.03328050713153724, "grad_norm": 2.106548729679535, "learning_rate": 8.03921568627451e-06, "loss": 0.81, "step": 42 }, { "epoch": 0.03407290015847861, "grad_norm": 2.408395347987529, "learning_rate": 8.23529411764706e-06, "loss": 0.7477, "step": 43 }, { "epoch": 0.03486529318541997, "grad_norm": 1.6776607205739082, "learning_rate": 8.43137254901961e-06, "loss": 0.871, "step": 44 }, { "epoch": 0.03565768621236133, "grad_norm": 1.8144901807475566, "learning_rate": 8.627450980392157e-06, "loss": 0.7766, "step": 45 }, { "epoch": 0.03645007923930269, "grad_norm": 2.5991243684201404, "learning_rate": 8.823529411764707e-06, "loss": 0.8353, "step": 46 }, { "epoch": 0.03724247226624406, "grad_norm": 2.0967228482892852, "learning_rate": 9.019607843137256e-06, "loss": 0.8088, "step": 47 }, { "epoch": 0.03803486529318542, "grad_norm": 2.0141024481663568, "learning_rate": 9.215686274509804e-06, "loss": 0.832, "step": 48 }, { "epoch": 0.03882725832012678, "grad_norm": 1.730596823637408, "learning_rate": 9.411764705882354e-06, "loss": 0.7346, "step": 49 }, { "epoch": 0.039619651347068144, "grad_norm": 1.5928766843847086, "learning_rate": 9.607843137254903e-06, "loss": 0.7816, "step": 50 }, { "epoch": 0.04041204437400951, "grad_norm": 1.872073358592137, "learning_rate": 9.803921568627451e-06, "loss": 0.7583, "step": 51 }, { "epoch": 0.04120443740095087, "grad_norm": 1.7104737419053242, "learning_rate": 1e-05, "loss": 0.753, "step": 52 }, { "epoch": 0.041996830427892234, "grad_norm": 1.8116079345049982, "learning_rate": 9.999995965483807e-06, "loss": 0.7538, "step": 53 }, { "epoch": 0.042789223454833596, "grad_norm": 1.6326869750386659, "learning_rate": 9.999983861941738e-06, "loss": 0.8366, "step": 54 }, { "epoch": 0.043581616481774964, "grad_norm": 1.7375086521399525, "learning_rate": 9.999963689393326e-06, "loss": 0.7245, "step": 55 }, { "epoch": 0.044374009508716325, "grad_norm": 1.6946751819594645, "learning_rate": 9.999935447871127e-06, "loss": 0.8714, "step": 56 }, { "epoch": 0.045166402535657686, "grad_norm": 1.9211417392352752, "learning_rate": 9.999899137420714e-06, "loss": 0.7532, "step": 57 }, { "epoch": 0.04595879556259905, "grad_norm": 1.7503517065582104, "learning_rate": 9.999854758100688e-06, "loss": 0.834, "step": 58 }, { "epoch": 0.04675118858954041, "grad_norm": 1.789940239772959, "learning_rate": 9.999802309982667e-06, "loss": 0.8116, "step": 59 }, { "epoch": 0.04754358161648178, "grad_norm": 1.5805107848670614, "learning_rate": 9.999741793151293e-06, "loss": 0.8558, "step": 60 }, { "epoch": 0.04833597464342314, "grad_norm": 1.5964359235045271, "learning_rate": 9.999673207704228e-06, "loss": 0.7808, "step": 61 }, { "epoch": 0.0491283676703645, "grad_norm": 1.7671142327543046, "learning_rate": 9.999596553752157e-06, "loss": 0.7747, "step": 62 }, { "epoch": 0.04992076069730586, "grad_norm": 2.9528181713898034, "learning_rate": 9.999511831418782e-06, "loss": 0.6434, "step": 63 }, { "epoch": 0.05071315372424723, "grad_norm": 1.7324840171024525, "learning_rate": 9.999419040840832e-06, "loss": 0.8046, "step": 64 }, { "epoch": 0.05150554675118859, "grad_norm": 1.7171263842110995, "learning_rate": 9.999318182168049e-06, "loss": 0.7662, "step": 65 }, { "epoch": 0.05229793977812995, "grad_norm": 1.8905251892031416, "learning_rate": 9.999209255563203e-06, "loss": 0.7939, "step": 66 }, { "epoch": 0.05309033280507131, "grad_norm": 1.7723872174065645, "learning_rate": 9.999092261202076e-06, "loss": 0.7556, "step": 67 }, { "epoch": 0.05388272583201268, "grad_norm": 1.5919159215539098, "learning_rate": 9.998967199273481e-06, "loss": 0.7729, "step": 68 }, { "epoch": 0.05467511885895404, "grad_norm": 3.9120177259228726, "learning_rate": 9.998834069979238e-06, "loss": 0.7887, "step": 69 }, { "epoch": 0.0554675118858954, "grad_norm": 1.812890761818037, "learning_rate": 9.998692873534194e-06, "loss": 0.7475, "step": 70 }, { "epoch": 0.056259904912836764, "grad_norm": 1.8837475377853303, "learning_rate": 9.998543610166211e-06, "loss": 0.8021, "step": 71 }, { "epoch": 0.05705229793977813, "grad_norm": 3.1505211329934757, "learning_rate": 9.998386280116175e-06, "loss": 0.7626, "step": 72 }, { "epoch": 0.05784469096671949, "grad_norm": 1.6348720616226755, "learning_rate": 9.998220883637984e-06, "loss": 0.8671, "step": 73 }, { "epoch": 0.058637083993660855, "grad_norm": 2.195466491144765, "learning_rate": 9.998047420998553e-06, "loss": 0.8052, "step": 74 }, { "epoch": 0.059429477020602216, "grad_norm": 1.6031532236533161, "learning_rate": 9.997865892477822e-06, "loss": 0.7978, "step": 75 }, { "epoch": 0.060221870047543584, "grad_norm": 1.81578925453773, "learning_rate": 9.997676298368742e-06, "loss": 0.8276, "step": 76 }, { "epoch": 0.061014263074484945, "grad_norm": 2.068503791901515, "learning_rate": 9.997478638977278e-06, "loss": 0.7359, "step": 77 }, { "epoch": 0.061806656101426306, "grad_norm": 1.5132193300341286, "learning_rate": 9.997272914622417e-06, "loss": 0.8009, "step": 78 }, { "epoch": 0.06259904912836767, "grad_norm": 1.4379767139641526, "learning_rate": 9.997059125636158e-06, "loss": 0.7927, "step": 79 }, { "epoch": 0.06339144215530904, "grad_norm": 1.8110027255877088, "learning_rate": 9.996837272363514e-06, "loss": 0.7708, "step": 80 }, { "epoch": 0.06418383518225039, "grad_norm": 2.3158031355795687, "learning_rate": 9.996607355162513e-06, "loss": 0.771, "step": 81 }, { "epoch": 0.06497622820919176, "grad_norm": 1.690619444340736, "learning_rate": 9.996369374404197e-06, "loss": 0.6664, "step": 82 }, { "epoch": 0.06576862123613313, "grad_norm": 1.6093236585122765, "learning_rate": 9.996123330472622e-06, "loss": 0.8127, "step": 83 }, { "epoch": 0.06656101426307448, "grad_norm": 1.606553258039033, "learning_rate": 9.995869223764856e-06, "loss": 0.7484, "step": 84 }, { "epoch": 0.06735340729001585, "grad_norm": 1.5634797958212319, "learning_rate": 9.995607054690976e-06, "loss": 0.7974, "step": 85 }, { "epoch": 0.06814580031695722, "grad_norm": 1.694235537560226, "learning_rate": 9.995336823674073e-06, "loss": 0.7855, "step": 86 }, { "epoch": 0.06893819334389857, "grad_norm": 30.365147581829422, "learning_rate": 9.995058531150248e-06, "loss": 0.771, "step": 87 }, { "epoch": 0.06973058637083994, "grad_norm": 1.6920126547049108, "learning_rate": 9.99477217756861e-06, "loss": 0.6998, "step": 88 }, { "epoch": 0.0705229793977813, "grad_norm": 1.7447576661784603, "learning_rate": 9.994477763391279e-06, "loss": 0.8211, "step": 89 }, { "epoch": 0.07131537242472266, "grad_norm": 1.8038550961208857, "learning_rate": 9.994175289093383e-06, "loss": 0.8015, "step": 90 }, { "epoch": 0.07210776545166403, "grad_norm": 1.5630674014142574, "learning_rate": 9.993864755163057e-06, "loss": 0.777, "step": 91 }, { "epoch": 0.07290015847860538, "grad_norm": 1.700338410860087, "learning_rate": 9.993546162101441e-06, "loss": 0.7023, "step": 92 }, { "epoch": 0.07369255150554675, "grad_norm": 1.7159884427559096, "learning_rate": 9.993219510422684e-06, "loss": 0.7959, "step": 93 }, { "epoch": 0.07448494453248812, "grad_norm": 1.96513508487593, "learning_rate": 9.99288480065394e-06, "loss": 0.7763, "step": 94 }, { "epoch": 0.07527733755942947, "grad_norm": 1.9730256238645472, "learning_rate": 9.992542033335364e-06, "loss": 0.7018, "step": 95 }, { "epoch": 0.07606973058637084, "grad_norm": 2.6813644051557657, "learning_rate": 9.992191209020114e-06, "loss": 0.7734, "step": 96 }, { "epoch": 0.0768621236133122, "grad_norm": 1.6203300035442065, "learning_rate": 9.991832328274357e-06, "loss": 0.7623, "step": 97 }, { "epoch": 0.07765451664025357, "grad_norm": 2.0101942955861336, "learning_rate": 9.991465391677252e-06, "loss": 0.7893, "step": 98 }, { "epoch": 0.07844690966719493, "grad_norm": 2.038712409854615, "learning_rate": 9.99109039982097e-06, "loss": 0.7774, "step": 99 }, { "epoch": 0.07923930269413629, "grad_norm": 5.733518294662075, "learning_rate": 9.990707353310671e-06, "loss": 0.7558, "step": 100 }, { "epoch": 0.08003169572107766, "grad_norm": 1.5910580911576566, "learning_rate": 9.990316252764518e-06, "loss": 0.8094, "step": 101 }, { "epoch": 0.08082408874801902, "grad_norm": 2.19197221251147, "learning_rate": 9.989917098813673e-06, "loss": 0.8098, "step": 102 }, { "epoch": 0.08161648177496038, "grad_norm": 1.6382066440396346, "learning_rate": 9.98950989210229e-06, "loss": 0.7513, "step": 103 }, { "epoch": 0.08240887480190175, "grad_norm": 1.7242201329954812, "learning_rate": 9.989094633287528e-06, "loss": 0.7281, "step": 104 }, { "epoch": 0.0832012678288431, "grad_norm": 2.4872568618627877, "learning_rate": 9.988671323039529e-06, "loss": 0.7493, "step": 105 }, { "epoch": 0.08399366085578447, "grad_norm": 1.7177326808649351, "learning_rate": 9.988239962041436e-06, "loss": 0.7863, "step": 106 }, { "epoch": 0.08478605388272584, "grad_norm": 1.644497819643521, "learning_rate": 9.987800550989382e-06, "loss": 0.7137, "step": 107 }, { "epoch": 0.08557844690966719, "grad_norm": 1.5056197578507864, "learning_rate": 9.987353090592491e-06, "loss": 0.7253, "step": 108 }, { "epoch": 0.08637083993660856, "grad_norm": 1.6318031236319928, "learning_rate": 9.986897581572877e-06, "loss": 0.7925, "step": 109 }, { "epoch": 0.08716323296354993, "grad_norm": 1.5245037084250284, "learning_rate": 9.986434024665646e-06, "loss": 0.7914, "step": 110 }, { "epoch": 0.08795562599049128, "grad_norm": 2.1994631809626006, "learning_rate": 9.985962420618884e-06, "loss": 0.7534, "step": 111 }, { "epoch": 0.08874801901743265, "grad_norm": 1.598858025955189, "learning_rate": 9.985482770193674e-06, "loss": 0.7999, "step": 112 }, { "epoch": 0.089540412044374, "grad_norm": 1.4100130706932528, "learning_rate": 9.984995074164077e-06, "loss": 0.7467, "step": 113 }, { "epoch": 0.09033280507131537, "grad_norm": 1.7682528537554925, "learning_rate": 9.98449933331714e-06, "loss": 0.7755, "step": 114 }, { "epoch": 0.09112519809825674, "grad_norm": 1.6751970153097095, "learning_rate": 9.983995548452892e-06, "loss": 0.7901, "step": 115 }, { "epoch": 0.0919175911251981, "grad_norm": 2.034587925215773, "learning_rate": 9.983483720384346e-06, "loss": 0.7292, "step": 116 }, { "epoch": 0.09270998415213946, "grad_norm": 3.218860338066513, "learning_rate": 9.98296384993749e-06, "loss": 0.7563, "step": 117 }, { "epoch": 0.09350237717908082, "grad_norm": 1.789776696658269, "learning_rate": 9.982435937951297e-06, "loss": 0.7203, "step": 118 }, { "epoch": 0.09429477020602219, "grad_norm": 1.5133061278440034, "learning_rate": 9.981899985277717e-06, "loss": 0.821, "step": 119 }, { "epoch": 0.09508716323296355, "grad_norm": 1.5789128620536586, "learning_rate": 9.98135599278167e-06, "loss": 0.7859, "step": 120 }, { "epoch": 0.09587955625990491, "grad_norm": 1.6665054943315607, "learning_rate": 9.980803961341057e-06, "loss": 0.7611, "step": 121 }, { "epoch": 0.09667194928684628, "grad_norm": 1.5807260484449706, "learning_rate": 9.980243891846747e-06, "loss": 0.7765, "step": 122 }, { "epoch": 0.09746434231378764, "grad_norm": 1.739861563594325, "learning_rate": 9.979675785202587e-06, "loss": 0.7746, "step": 123 }, { "epoch": 0.098256735340729, "grad_norm": 2.09278170754784, "learning_rate": 9.979099642325389e-06, "loss": 0.7846, "step": 124 }, { "epoch": 0.09904912836767037, "grad_norm": 1.6191072173095677, "learning_rate": 9.978515464144938e-06, "loss": 0.6948, "step": 125 }, { "epoch": 0.09984152139461172, "grad_norm": 1.9846221182034771, "learning_rate": 9.977923251603982e-06, "loss": 0.7755, "step": 126 }, { "epoch": 0.10063391442155309, "grad_norm": 1.8195320797379209, "learning_rate": 9.977323005658239e-06, "loss": 0.7667, "step": 127 }, { "epoch": 0.10142630744849446, "grad_norm": 2.165057425939796, "learning_rate": 9.97671472727639e-06, "loss": 0.7237, "step": 128 }, { "epoch": 0.10221870047543581, "grad_norm": 2.1078745104453716, "learning_rate": 9.97609841744008e-06, "loss": 0.7382, "step": 129 }, { "epoch": 0.10301109350237718, "grad_norm": 2.1254759534251417, "learning_rate": 9.97547407714391e-06, "loss": 0.7597, "step": 130 }, { "epoch": 0.10380348652931855, "grad_norm": 1.6111046317106772, "learning_rate": 9.974841707395448e-06, "loss": 0.7668, "step": 131 }, { "epoch": 0.1045958795562599, "grad_norm": 1.541723476670277, "learning_rate": 9.974201309215215e-06, "loss": 0.7705, "step": 132 }, { "epoch": 0.10538827258320127, "grad_norm": 1.6818297184119466, "learning_rate": 9.973552883636688e-06, "loss": 0.7292, "step": 133 }, { "epoch": 0.10618066561014262, "grad_norm": 1.5836673496389615, "learning_rate": 9.972896431706303e-06, "loss": 0.7357, "step": 134 }, { "epoch": 0.10697305863708399, "grad_norm": 1.6576319385534826, "learning_rate": 9.972231954483446e-06, "loss": 0.6796, "step": 135 }, { "epoch": 0.10776545166402536, "grad_norm": 1.643645791433275, "learning_rate": 9.971559453040453e-06, "loss": 0.7744, "step": 136 }, { "epoch": 0.10855784469096671, "grad_norm": 1.7364820896249187, "learning_rate": 9.970878928462613e-06, "loss": 0.804, "step": 137 }, { "epoch": 0.10935023771790808, "grad_norm": 2.4391602701208974, "learning_rate": 9.970190381848159e-06, "loss": 0.753, "step": 138 }, { "epoch": 0.11014263074484945, "grad_norm": 1.547337659867073, "learning_rate": 9.969493814308276e-06, "loss": 0.6613, "step": 139 }, { "epoch": 0.1109350237717908, "grad_norm": 1.5329967498034336, "learning_rate": 9.968789226967084e-06, "loss": 0.7626, "step": 140 }, { "epoch": 0.11172741679873217, "grad_norm": 1.5718510606657001, "learning_rate": 9.968076620961652e-06, "loss": 0.6753, "step": 141 }, { "epoch": 0.11251980982567353, "grad_norm": 1.7317134491697603, "learning_rate": 9.967355997441993e-06, "loss": 0.6476, "step": 142 }, { "epoch": 0.1133122028526149, "grad_norm": 1.7332744329185803, "learning_rate": 9.966627357571046e-06, "loss": 0.6971, "step": 143 }, { "epoch": 0.11410459587955626, "grad_norm": 1.5476224469656554, "learning_rate": 9.965890702524701e-06, "loss": 0.8003, "step": 144 }, { "epoch": 0.11489698890649762, "grad_norm": 1.693998701053811, "learning_rate": 9.965146033491776e-06, "loss": 0.7711, "step": 145 }, { "epoch": 0.11568938193343899, "grad_norm": 1.703814919653566, "learning_rate": 9.964393351674019e-06, "loss": 0.7574, "step": 146 }, { "epoch": 0.11648177496038035, "grad_norm": 1.7768295259127047, "learning_rate": 9.963632658286115e-06, "loss": 0.7364, "step": 147 }, { "epoch": 0.11727416798732171, "grad_norm": 1.6040448102459601, "learning_rate": 9.962863954555677e-06, "loss": 0.6092, "step": 148 }, { "epoch": 0.11806656101426308, "grad_norm": 1.7342347394067688, "learning_rate": 9.962087241723242e-06, "loss": 0.7043, "step": 149 }, { "epoch": 0.11885895404120443, "grad_norm": 1.7655211734148757, "learning_rate": 9.961302521042278e-06, "loss": 0.7929, "step": 150 }, { "epoch": 0.1196513470681458, "grad_norm": 1.5730804145704798, "learning_rate": 9.960509793779166e-06, "loss": 0.742, "step": 151 }, { "epoch": 0.12044374009508717, "grad_norm": 1.7113243479574918, "learning_rate": 9.95970906121322e-06, "loss": 0.7717, "step": 152 }, { "epoch": 0.12123613312202852, "grad_norm": 2.7904126243807954, "learning_rate": 9.958900324636665e-06, "loss": 0.7826, "step": 153 }, { "epoch": 0.12202852614896989, "grad_norm": 1.5826645792113436, "learning_rate": 9.958083585354645e-06, "loss": 0.7301, "step": 154 }, { "epoch": 0.12282091917591126, "grad_norm": 1.668867460295352, "learning_rate": 9.95725884468522e-06, "loss": 0.6901, "step": 155 }, { "epoch": 0.12361331220285261, "grad_norm": 1.7782555667278959, "learning_rate": 9.956426103959362e-06, "loss": 0.7079, "step": 156 }, { "epoch": 0.12440570522979398, "grad_norm": 1.4599343795987663, "learning_rate": 9.955585364520955e-06, "loss": 0.7199, "step": 157 }, { "epoch": 0.12519809825673534, "grad_norm": 1.5840552676052948, "learning_rate": 9.954736627726784e-06, "loss": 0.7971, "step": 158 }, { "epoch": 0.1259904912836767, "grad_norm": 1.4242602432637153, "learning_rate": 9.953879894946552e-06, "loss": 0.7303, "step": 159 }, { "epoch": 0.12678288431061807, "grad_norm": 1.779106098210814, "learning_rate": 9.953015167562857e-06, "loss": 0.7314, "step": 160 }, { "epoch": 0.12757527733755944, "grad_norm": 1.7605141358076077, "learning_rate": 9.952142446971203e-06, "loss": 0.7225, "step": 161 }, { "epoch": 0.12836767036450078, "grad_norm": 1.6587412496572795, "learning_rate": 9.95126173457999e-06, "loss": 0.7561, "step": 162 }, { "epoch": 0.12916006339144215, "grad_norm": 1.5846038516349559, "learning_rate": 9.950373031810519e-06, "loss": 0.7448, "step": 163 }, { "epoch": 0.12995245641838352, "grad_norm": 3.571023410506087, "learning_rate": 9.949476340096986e-06, "loss": 0.7188, "step": 164 }, { "epoch": 0.13074484944532488, "grad_norm": 1.779391493121236, "learning_rate": 9.948571660886475e-06, "loss": 0.7197, "step": 165 }, { "epoch": 0.13153724247226625, "grad_norm": 1.922839422057549, "learning_rate": 9.947658995638962e-06, "loss": 0.7608, "step": 166 }, { "epoch": 0.13232963549920762, "grad_norm": 2.282892813638984, "learning_rate": 9.946738345827316e-06, "loss": 0.6583, "step": 167 }, { "epoch": 0.13312202852614896, "grad_norm": 1.5524276632447336, "learning_rate": 9.945809712937286e-06, "loss": 0.6912, "step": 168 }, { "epoch": 0.13391442155309033, "grad_norm": 1.6637777563966838, "learning_rate": 9.944873098467507e-06, "loss": 0.7928, "step": 169 }, { "epoch": 0.1347068145800317, "grad_norm": 1.5260453556635223, "learning_rate": 9.94392850392949e-06, "loss": 0.7621, "step": 170 }, { "epoch": 0.13549920760697307, "grad_norm": 1.9409267078411876, "learning_rate": 9.942975930847631e-06, "loss": 0.7368, "step": 171 }, { "epoch": 0.13629160063391443, "grad_norm": 1.5708128994537087, "learning_rate": 9.942015380759197e-06, "loss": 0.7688, "step": 172 }, { "epoch": 0.13708399366085577, "grad_norm": 1.3885791290498455, "learning_rate": 9.94104685521433e-06, "loss": 0.8049, "step": 173 }, { "epoch": 0.13787638668779714, "grad_norm": 1.4506101448441873, "learning_rate": 9.940070355776043e-06, "loss": 0.6972, "step": 174 }, { "epoch": 0.1386687797147385, "grad_norm": 1.9257639149819101, "learning_rate": 9.939085884020218e-06, "loss": 0.7047, "step": 175 }, { "epoch": 0.13946117274167988, "grad_norm": 2.141786190472124, "learning_rate": 9.938093441535604e-06, "loss": 0.6963, "step": 176 }, { "epoch": 0.14025356576862125, "grad_norm": 1.667407916951165, "learning_rate": 9.937093029923805e-06, "loss": 0.8001, "step": 177 }, { "epoch": 0.1410459587955626, "grad_norm": 1.6492614221109203, "learning_rate": 9.936084650799295e-06, "loss": 0.7041, "step": 178 }, { "epoch": 0.14183835182250396, "grad_norm": 1.5633116983795448, "learning_rate": 9.935068305789406e-06, "loss": 0.6757, "step": 179 }, { "epoch": 0.14263074484944532, "grad_norm": 2.7117246460207136, "learning_rate": 9.934043996534314e-06, "loss": 0.7774, "step": 180 }, { "epoch": 0.1434231378763867, "grad_norm": 1.636428025304696, "learning_rate": 9.933011724687064e-06, "loss": 0.6771, "step": 181 }, { "epoch": 0.14421553090332806, "grad_norm": 3.2298814361846215, "learning_rate": 9.931971491913542e-06, "loss": 0.6868, "step": 182 }, { "epoch": 0.1450079239302694, "grad_norm": 1.7445942856998689, "learning_rate": 9.930923299892477e-06, "loss": 0.8253, "step": 183 }, { "epoch": 0.14580031695721077, "grad_norm": 1.4687218879262045, "learning_rate": 9.929867150315454e-06, "loss": 0.698, "step": 184 }, { "epoch": 0.14659270998415214, "grad_norm": 1.3731580378942758, "learning_rate": 9.92880304488689e-06, "loss": 0.7133, "step": 185 }, { "epoch": 0.1473851030110935, "grad_norm": 1.3291274844457392, "learning_rate": 9.927730985324046e-06, "loss": 0.7496, "step": 186 }, { "epoch": 0.14817749603803487, "grad_norm": 1.9461764046246157, "learning_rate": 9.926650973357021e-06, "loss": 0.7844, "step": 187 }, { "epoch": 0.14896988906497624, "grad_norm": 1.3833906806653091, "learning_rate": 9.925563010728742e-06, "loss": 0.7302, "step": 188 }, { "epoch": 0.14976228209191758, "grad_norm": 1.6329750464541801, "learning_rate": 9.924467099194972e-06, "loss": 0.737, "step": 189 }, { "epoch": 0.15055467511885895, "grad_norm": 1.479413590341384, "learning_rate": 9.923363240524302e-06, "loss": 0.7572, "step": 190 }, { "epoch": 0.15134706814580032, "grad_norm": 1.9621782602911537, "learning_rate": 9.922251436498142e-06, "loss": 0.6905, "step": 191 }, { "epoch": 0.15213946117274169, "grad_norm": 1.8938153404878992, "learning_rate": 9.921131688910733e-06, "loss": 0.7105, "step": 192 }, { "epoch": 0.15293185419968305, "grad_norm": 1.408397744001921, "learning_rate": 9.920003999569125e-06, "loss": 0.825, "step": 193 }, { "epoch": 0.1537242472266244, "grad_norm": 1.6415018547689062, "learning_rate": 9.918868370293197e-06, "loss": 0.6878, "step": 194 }, { "epoch": 0.15451664025356576, "grad_norm": 1.82260079977059, "learning_rate": 9.917724802915632e-06, "loss": 0.7161, "step": 195 }, { "epoch": 0.15530903328050713, "grad_norm": 1.6625830922546736, "learning_rate": 9.916573299281925e-06, "loss": 0.6996, "step": 196 }, { "epoch": 0.1561014263074485, "grad_norm": 1.5695774403870457, "learning_rate": 9.91541386125038e-06, "loss": 0.7119, "step": 197 }, { "epoch": 0.15689381933438987, "grad_norm": 1.9873920112368197, "learning_rate": 9.91424649069211e-06, "loss": 0.7467, "step": 198 }, { "epoch": 0.1576862123613312, "grad_norm": 1.812260461017493, "learning_rate": 9.91307118949102e-06, "loss": 0.8018, "step": 199 }, { "epoch": 0.15847860538827258, "grad_norm": 1.5055152044923648, "learning_rate": 9.911887959543822e-06, "loss": 0.7479, "step": 200 }, { "epoch": 0.15927099841521394, "grad_norm": 2.2091235642459357, "learning_rate": 9.910696802760018e-06, "loss": 0.7195, "step": 201 }, { "epoch": 0.1600633914421553, "grad_norm": 1.8401293539494699, "learning_rate": 9.909497721061907e-06, "loss": 0.6649, "step": 202 }, { "epoch": 0.16085578446909668, "grad_norm": 1.8340601478982748, "learning_rate": 9.908290716384572e-06, "loss": 0.7351, "step": 203 }, { "epoch": 0.16164817749603805, "grad_norm": 1.4159699879641525, "learning_rate": 9.907075790675887e-06, "loss": 0.6861, "step": 204 }, { "epoch": 0.1624405705229794, "grad_norm": 1.8500363851135844, "learning_rate": 9.905852945896507e-06, "loss": 0.6589, "step": 205 }, { "epoch": 0.16323296354992076, "grad_norm": 1.7235520563490827, "learning_rate": 9.904622184019865e-06, "loss": 0.7812, "step": 206 }, { "epoch": 0.16402535657686212, "grad_norm": 1.5825556139276449, "learning_rate": 9.903383507032173e-06, "loss": 0.6842, "step": 207 }, { "epoch": 0.1648177496038035, "grad_norm": 1.8574873076578666, "learning_rate": 9.902136916932417e-06, "loss": 0.6974, "step": 208 }, { "epoch": 0.16561014263074486, "grad_norm": 1.5233517997271206, "learning_rate": 9.900882415732352e-06, "loss": 0.7082, "step": 209 }, { "epoch": 0.1664025356576862, "grad_norm": 1.3921839490699075, "learning_rate": 9.899620005456499e-06, "loss": 0.7436, "step": 210 }, { "epoch": 0.16719492868462757, "grad_norm": 1.5222002157916836, "learning_rate": 9.898349688142145e-06, "loss": 0.7521, "step": 211 }, { "epoch": 0.16798732171156894, "grad_norm": 1.7188430552062168, "learning_rate": 9.897071465839338e-06, "loss": 0.7459, "step": 212 }, { "epoch": 0.1687797147385103, "grad_norm": 1.702979906070874, "learning_rate": 9.895785340610878e-06, "loss": 0.7675, "step": 213 }, { "epoch": 0.16957210776545167, "grad_norm": 2.881333099745679, "learning_rate": 9.894491314532324e-06, "loss": 0.7688, "step": 214 }, { "epoch": 0.17036450079239301, "grad_norm": 2.2319828292399464, "learning_rate": 9.893189389691984e-06, "loss": 0.6399, "step": 215 }, { "epoch": 0.17115689381933438, "grad_norm": 1.7111049974794177, "learning_rate": 9.891879568190909e-06, "loss": 0.661, "step": 216 }, { "epoch": 0.17194928684627575, "grad_norm": 2.262602325020454, "learning_rate": 9.890561852142904e-06, "loss": 0.7615, "step": 217 }, { "epoch": 0.17274167987321712, "grad_norm": 3.378373753473706, "learning_rate": 9.889236243674503e-06, "loss": 0.8294, "step": 218 }, { "epoch": 0.1735340729001585, "grad_norm": 1.4393200565799296, "learning_rate": 9.887902744924982e-06, "loss": 0.7405, "step": 219 }, { "epoch": 0.17432646592709986, "grad_norm": 1.8908900083985691, "learning_rate": 9.886561358046352e-06, "loss": 0.7787, "step": 220 }, { "epoch": 0.1751188589540412, "grad_norm": 1.6449045084873481, "learning_rate": 9.88521208520335e-06, "loss": 0.6982, "step": 221 }, { "epoch": 0.17591125198098256, "grad_norm": 1.6546845101720242, "learning_rate": 9.883854928573442e-06, "loss": 0.6828, "step": 222 }, { "epoch": 0.17670364500792393, "grad_norm": 1.8403562907574271, "learning_rate": 9.882489890346816e-06, "loss": 0.722, "step": 223 }, { "epoch": 0.1774960380348653, "grad_norm": 2.6148043747371856, "learning_rate": 9.881116972726378e-06, "loss": 0.6868, "step": 224 }, { "epoch": 0.17828843106180667, "grad_norm": 1.6449281913570617, "learning_rate": 9.879736177927754e-06, "loss": 0.7152, "step": 225 }, { "epoch": 0.179080824088748, "grad_norm": 1.5635924014967348, "learning_rate": 9.878347508179279e-06, "loss": 0.6534, "step": 226 }, { "epoch": 0.17987321711568938, "grad_norm": 2.551104669536138, "learning_rate": 9.876950965721996e-06, "loss": 0.7149, "step": 227 }, { "epoch": 0.18066561014263074, "grad_norm": 3.401532495985276, "learning_rate": 9.875546552809654e-06, "loss": 0.7617, "step": 228 }, { "epoch": 0.1814580031695721, "grad_norm": 3.5754002832484724, "learning_rate": 9.874134271708707e-06, "loss": 0.7476, "step": 229 }, { "epoch": 0.18225039619651348, "grad_norm": 1.4459455829647612, "learning_rate": 9.8727141246983e-06, "loss": 0.7574, "step": 230 }, { "epoch": 0.18304278922345482, "grad_norm": 1.6309496926799276, "learning_rate": 9.871286114070276e-06, "loss": 0.6728, "step": 231 }, { "epoch": 0.1838351822503962, "grad_norm": 1.499570167382794, "learning_rate": 9.869850242129166e-06, "loss": 0.6498, "step": 232 }, { "epoch": 0.18462757527733756, "grad_norm": 2.5689934498153915, "learning_rate": 9.868406511192195e-06, "loss": 0.6806, "step": 233 }, { "epoch": 0.18541996830427893, "grad_norm": 2.1713578374628946, "learning_rate": 9.86695492358926e-06, "loss": 0.6628, "step": 234 }, { "epoch": 0.1862123613312203, "grad_norm": 1.6039842971380784, "learning_rate": 9.865495481662946e-06, "loss": 0.6756, "step": 235 }, { "epoch": 0.18700475435816163, "grad_norm": 1.68187893932842, "learning_rate": 9.864028187768506e-06, "loss": 0.6987, "step": 236 }, { "epoch": 0.187797147385103, "grad_norm": 1.4666012675692808, "learning_rate": 9.862553044273873e-06, "loss": 0.681, "step": 237 }, { "epoch": 0.18858954041204437, "grad_norm": 1.622982855059298, "learning_rate": 9.86107005355964e-06, "loss": 0.7383, "step": 238 }, { "epoch": 0.18938193343898574, "grad_norm": 1.800460768993766, "learning_rate": 9.859579218019068e-06, "loss": 0.7499, "step": 239 }, { "epoch": 0.1901743264659271, "grad_norm": 1.9756656406610278, "learning_rate": 9.858080540058077e-06, "loss": 0.7399, "step": 240 }, { "epoch": 0.19096671949286848, "grad_norm": 1.4916744504065476, "learning_rate": 9.856574022095243e-06, "loss": 0.7182, "step": 241 }, { "epoch": 0.19175911251980982, "grad_norm": 2.232650783256293, "learning_rate": 9.855059666561793e-06, "loss": 0.7243, "step": 242 }, { "epoch": 0.19255150554675118, "grad_norm": 1.633175631367372, "learning_rate": 9.853537475901607e-06, "loss": 0.6986, "step": 243 }, { "epoch": 0.19334389857369255, "grad_norm": 1.520737537652007, "learning_rate": 9.852007452571204e-06, "loss": 0.7212, "step": 244 }, { "epoch": 0.19413629160063392, "grad_norm": 1.7755694022876516, "learning_rate": 9.850469599039744e-06, "loss": 0.7308, "step": 245 }, { "epoch": 0.1949286846275753, "grad_norm": 1.7756746019816807, "learning_rate": 9.848923917789029e-06, "loss": 0.6986, "step": 246 }, { "epoch": 0.19572107765451663, "grad_norm": 1.3411482492990936, "learning_rate": 9.847370411313488e-06, "loss": 0.7195, "step": 247 }, { "epoch": 0.196513470681458, "grad_norm": 1.4498693984175361, "learning_rate": 9.845809082120177e-06, "loss": 0.7417, "step": 248 }, { "epoch": 0.19730586370839936, "grad_norm": 1.732779985989493, "learning_rate": 9.844239932728785e-06, "loss": 0.7718, "step": 249 }, { "epoch": 0.19809825673534073, "grad_norm": 2.3131151394705696, "learning_rate": 9.842662965671608e-06, "loss": 0.6686, "step": 250 }, { "epoch": 0.1988906497622821, "grad_norm": 1.898306845564621, "learning_rate": 9.84107818349357e-06, "loss": 0.695, "step": 251 }, { "epoch": 0.19968304278922344, "grad_norm": 1.4018215234853757, "learning_rate": 9.839485588752205e-06, "loss": 0.7704, "step": 252 }, { "epoch": 0.2004754358161648, "grad_norm": 1.6407984986922235, "learning_rate": 9.837885184017649e-06, "loss": 0.6931, "step": 253 }, { "epoch": 0.20126782884310618, "grad_norm": 1.7624027209504227, "learning_rate": 9.836276971872644e-06, "loss": 0.7357, "step": 254 }, { "epoch": 0.20206022187004755, "grad_norm": 1.7367988391433802, "learning_rate": 9.834660954912539e-06, "loss": 0.7688, "step": 255 }, { "epoch": 0.20285261489698891, "grad_norm": 1.5068279963720197, "learning_rate": 9.833037135745266e-06, "loss": 0.7107, "step": 256 }, { "epoch": 0.20364500792393028, "grad_norm": 1.792880203216384, "learning_rate": 9.831405516991361e-06, "loss": 0.7226, "step": 257 }, { "epoch": 0.20443740095087162, "grad_norm": 2.2590267043085803, "learning_rate": 9.829766101283937e-06, "loss": 0.7675, "step": 258 }, { "epoch": 0.205229793977813, "grad_norm": 1.8804936980407707, "learning_rate": 9.828118891268695e-06, "loss": 0.6543, "step": 259 }, { "epoch": 0.20602218700475436, "grad_norm": 1.4479292057183604, "learning_rate": 9.826463889603912e-06, "loss": 0.7145, "step": 260 }, { "epoch": 0.20681458003169573, "grad_norm": 3.0634422309641764, "learning_rate": 9.824801098960442e-06, "loss": 0.7, "step": 261 }, { "epoch": 0.2076069730586371, "grad_norm": 1.6544346478043463, "learning_rate": 9.823130522021707e-06, "loss": 0.727, "step": 262 }, { "epoch": 0.20839936608557844, "grad_norm": 1.7608555014384901, "learning_rate": 9.821452161483693e-06, "loss": 0.664, "step": 263 }, { "epoch": 0.2091917591125198, "grad_norm": 2.1784011130382557, "learning_rate": 9.819766020054952e-06, "loss": 0.6926, "step": 264 }, { "epoch": 0.20998415213946117, "grad_norm": 1.812352927421531, "learning_rate": 9.818072100456588e-06, "loss": 0.648, "step": 265 }, { "epoch": 0.21077654516640254, "grad_norm": 6.243653464927145, "learning_rate": 9.816370405422262e-06, "loss": 0.7173, "step": 266 }, { "epoch": 0.2115689381933439, "grad_norm": 1.4732791343802316, "learning_rate": 9.814660937698177e-06, "loss": 0.737, "step": 267 }, { "epoch": 0.21236133122028525, "grad_norm": 2.0669236774613737, "learning_rate": 9.812943700043085e-06, "loss": 0.7661, "step": 268 }, { "epoch": 0.21315372424722662, "grad_norm": 3.4482601951252176, "learning_rate": 9.811218695228273e-06, "loss": 0.7325, "step": 269 }, { "epoch": 0.21394611727416799, "grad_norm": 1.6537440525399363, "learning_rate": 9.809485926037569e-06, "loss": 0.7087, "step": 270 }, { "epoch": 0.21473851030110935, "grad_norm": 1.6491046306458133, "learning_rate": 9.807745395267324e-06, "loss": 0.7249, "step": 271 }, { "epoch": 0.21553090332805072, "grad_norm": 1.535909183563362, "learning_rate": 9.80599710572642e-06, "loss": 0.6525, "step": 272 }, { "epoch": 0.2163232963549921, "grad_norm": 1.4626720768896548, "learning_rate": 9.804241060236254e-06, "loss": 0.7742, "step": 273 }, { "epoch": 0.21711568938193343, "grad_norm": 3.0587377279837047, "learning_rate": 9.80247726163075e-06, "loss": 0.7396, "step": 274 }, { "epoch": 0.2179080824088748, "grad_norm": 1.501966037542709, "learning_rate": 9.80070571275633e-06, "loss": 0.7315, "step": 275 }, { "epoch": 0.21870047543581617, "grad_norm": 1.5930365026228248, "learning_rate": 9.79892641647194e-06, "loss": 0.7088, "step": 276 }, { "epoch": 0.21949286846275753, "grad_norm": 1.3916292616997403, "learning_rate": 9.79713937564901e-06, "loss": 0.74, "step": 277 }, { "epoch": 0.2202852614896989, "grad_norm": 1.5598382622692035, "learning_rate": 9.795344593171486e-06, "loss": 0.7004, "step": 278 }, { "epoch": 0.22107765451664024, "grad_norm": 1.5173633026609457, "learning_rate": 9.793542071935795e-06, "loss": 0.7166, "step": 279 }, { "epoch": 0.2218700475435816, "grad_norm": 2.100801100846311, "learning_rate": 9.791731814850858e-06, "loss": 0.7183, "step": 280 }, { "epoch": 0.22266244057052298, "grad_norm": 1.4845089086381626, "learning_rate": 9.789913824838082e-06, "loss": 0.8039, "step": 281 }, { "epoch": 0.22345483359746435, "grad_norm": 1.4606715949563633, "learning_rate": 9.78808810483135e-06, "loss": 0.721, "step": 282 }, { "epoch": 0.22424722662440572, "grad_norm": 1.5330301518536946, "learning_rate": 9.78625465777702e-06, "loss": 0.7146, "step": 283 }, { "epoch": 0.22503961965134706, "grad_norm": 1.623417946912195, "learning_rate": 9.784413486633921e-06, "loss": 0.7311, "step": 284 }, { "epoch": 0.22583201267828842, "grad_norm": 1.4923345971898996, "learning_rate": 9.782564594373348e-06, "loss": 0.7128, "step": 285 }, { "epoch": 0.2266244057052298, "grad_norm": 1.6113446207634403, "learning_rate": 9.780707983979052e-06, "loss": 0.7068, "step": 286 }, { "epoch": 0.22741679873217116, "grad_norm": 1.7059720466524497, "learning_rate": 9.778843658447248e-06, "loss": 0.7206, "step": 287 }, { "epoch": 0.22820919175911253, "grad_norm": 2.225007211751292, "learning_rate": 9.776971620786593e-06, "loss": 0.6659, "step": 288 }, { "epoch": 0.22900158478605387, "grad_norm": 1.9827649239790321, "learning_rate": 9.775091874018193e-06, "loss": 0.6765, "step": 289 }, { "epoch": 0.22979397781299524, "grad_norm": 1.6080379977667516, "learning_rate": 9.773204421175597e-06, "loss": 0.646, "step": 290 }, { "epoch": 0.2305863708399366, "grad_norm": 1.7412403347857364, "learning_rate": 9.77130926530479e-06, "loss": 0.6798, "step": 291 }, { "epoch": 0.23137876386687797, "grad_norm": 1.5952790926992222, "learning_rate": 9.769406409464185e-06, "loss": 0.6867, "step": 292 }, { "epoch": 0.23217115689381934, "grad_norm": 1.7040895378218612, "learning_rate": 9.767495856724623e-06, "loss": 0.7707, "step": 293 }, { "epoch": 0.2329635499207607, "grad_norm": 2.0038436035328906, "learning_rate": 9.765577610169366e-06, "loss": 0.7445, "step": 294 }, { "epoch": 0.23375594294770205, "grad_norm": 1.6250296566105322, "learning_rate": 9.763651672894095e-06, "loss": 0.6681, "step": 295 }, { "epoch": 0.23454833597464342, "grad_norm": 1.5179205764247976, "learning_rate": 9.761718048006898e-06, "loss": 0.7185, "step": 296 }, { "epoch": 0.2353407290015848, "grad_norm": 1.519574484049463, "learning_rate": 9.759776738628272e-06, "loss": 0.798, "step": 297 }, { "epoch": 0.23613312202852615, "grad_norm": 2.5112593043512397, "learning_rate": 9.757827747891112e-06, "loss": 0.7119, "step": 298 }, { "epoch": 0.23692551505546752, "grad_norm": 1.5284812649972477, "learning_rate": 9.75587107894072e-06, "loss": 0.7708, "step": 299 }, { "epoch": 0.23771790808240886, "grad_norm": 1.5201657678415668, "learning_rate": 9.75390673493477e-06, "loss": 0.7164, "step": 300 }, { "epoch": 0.23851030110935023, "grad_norm": 1.6121693164623934, "learning_rate": 9.751934719043342e-06, "loss": 0.6569, "step": 301 }, { "epoch": 0.2393026941362916, "grad_norm": 1.4673760061784857, "learning_rate": 9.749955034448884e-06, "loss": 0.7124, "step": 302 }, { "epoch": 0.24009508716323297, "grad_norm": 2.5585910761958184, "learning_rate": 9.747967684346223e-06, "loss": 0.7018, "step": 303 }, { "epoch": 0.24088748019017434, "grad_norm": 1.5270187324269242, "learning_rate": 9.74597267194256e-06, "loss": 0.6794, "step": 304 }, { "epoch": 0.24167987321711568, "grad_norm": 1.3091230994854457, "learning_rate": 9.743970000457457e-06, "loss": 0.7583, "step": 305 }, { "epoch": 0.24247226624405704, "grad_norm": 2.790363926668686, "learning_rate": 9.74195967312284e-06, "loss": 0.7066, "step": 306 }, { "epoch": 0.2432646592709984, "grad_norm": 1.6461259190819426, "learning_rate": 9.739941693182985e-06, "loss": 0.6349, "step": 307 }, { "epoch": 0.24405705229793978, "grad_norm": 1.4916689910602257, "learning_rate": 9.737916063894525e-06, "loss": 0.6935, "step": 308 }, { "epoch": 0.24484944532488115, "grad_norm": 1.442061670976205, "learning_rate": 9.735882788526434e-06, "loss": 0.7153, "step": 309 }, { "epoch": 0.24564183835182252, "grad_norm": 1.3135416223037728, "learning_rate": 9.73384187036002e-06, "loss": 0.7462, "step": 310 }, { "epoch": 0.24643423137876386, "grad_norm": 1.3617581735474853, "learning_rate": 9.731793312688934e-06, "loss": 0.7435, "step": 311 }, { "epoch": 0.24722662440570523, "grad_norm": 1.7581303719602746, "learning_rate": 9.729737118819152e-06, "loss": 0.6603, "step": 312 }, { "epoch": 0.2480190174326466, "grad_norm": 1.6805361580256397, "learning_rate": 9.727673292068969e-06, "loss": 0.6744, "step": 313 }, { "epoch": 0.24881141045958796, "grad_norm": 2.088511290787661, "learning_rate": 9.725601835769005e-06, "loss": 0.7047, "step": 314 }, { "epoch": 0.24960380348652933, "grad_norm": 1.4729830027285586, "learning_rate": 9.723522753262191e-06, "loss": 0.7219, "step": 315 }, { "epoch": 0.25039619651347067, "grad_norm": 1.363853372199276, "learning_rate": 9.72143604790376e-06, "loss": 0.7015, "step": 316 }, { "epoch": 0.25118858954041207, "grad_norm": 1.5813832204193643, "learning_rate": 9.719341723061253e-06, "loss": 0.6909, "step": 317 }, { "epoch": 0.2519809825673534, "grad_norm": 1.4400611834417325, "learning_rate": 9.717239782114507e-06, "loss": 0.6938, "step": 318 }, { "epoch": 0.25277337559429475, "grad_norm": 1.4328042610240983, "learning_rate": 9.715130228455643e-06, "loss": 0.7081, "step": 319 }, { "epoch": 0.25356576862123614, "grad_norm": 1.1321068517560882, "learning_rate": 9.713013065489076e-06, "loss": 0.7238, "step": 320 }, { "epoch": 0.2543581616481775, "grad_norm": 1.54813519656272, "learning_rate": 9.710888296631495e-06, "loss": 0.6205, "step": 321 }, { "epoch": 0.2551505546751189, "grad_norm": 2.2459300822423254, "learning_rate": 9.708755925311868e-06, "loss": 0.698, "step": 322 }, { "epoch": 0.2559429477020602, "grad_norm": 2.1429126473675906, "learning_rate": 9.706615954971429e-06, "loss": 0.701, "step": 323 }, { "epoch": 0.25673534072900156, "grad_norm": 1.4685534424529236, "learning_rate": 9.704468389063675e-06, "loss": 0.7464, "step": 324 }, { "epoch": 0.25752773375594296, "grad_norm": 1.779272193512004, "learning_rate": 9.702313231054361e-06, "loss": 0.7473, "step": 325 }, { "epoch": 0.2583201267828843, "grad_norm": 1.5141036304314426, "learning_rate": 9.700150484421497e-06, "loss": 0.6524, "step": 326 }, { "epoch": 0.2591125198098257, "grad_norm": 1.311071201676714, "learning_rate": 9.697980152655337e-06, "loss": 0.7372, "step": 327 }, { "epoch": 0.25990491283676703, "grad_norm": 1.4901453691896527, "learning_rate": 9.695802239258378e-06, "loss": 0.7475, "step": 328 }, { "epoch": 0.2606973058637084, "grad_norm": 1.6581757815071547, "learning_rate": 9.693616747745346e-06, "loss": 0.6393, "step": 329 }, { "epoch": 0.26148969889064977, "grad_norm": 1.3867974751464447, "learning_rate": 9.691423681643208e-06, "loss": 0.7256, "step": 330 }, { "epoch": 0.2622820919175911, "grad_norm": 1.5415970768022422, "learning_rate": 9.689223044491142e-06, "loss": 0.7148, "step": 331 }, { "epoch": 0.2630744849445325, "grad_norm": 1.4349223584618807, "learning_rate": 9.687014839840554e-06, "loss": 0.7659, "step": 332 }, { "epoch": 0.26386687797147385, "grad_norm": 1.5297840138310805, "learning_rate": 9.684799071255057e-06, "loss": 0.6755, "step": 333 }, { "epoch": 0.26465927099841524, "grad_norm": 1.4260689580701411, "learning_rate": 9.682575742310475e-06, "loss": 0.7257, "step": 334 }, { "epoch": 0.2654516640253566, "grad_norm": 1.465044713237242, "learning_rate": 9.68034485659483e-06, "loss": 0.7294, "step": 335 }, { "epoch": 0.2662440570522979, "grad_norm": 1.4662488183308857, "learning_rate": 9.678106417708338e-06, "loss": 0.6777, "step": 336 }, { "epoch": 0.2670364500792393, "grad_norm": 1.902281754525193, "learning_rate": 9.675860429263408e-06, "loss": 0.6125, "step": 337 }, { "epoch": 0.26782884310618066, "grad_norm": 1.5865953810185502, "learning_rate": 9.67360689488463e-06, "loss": 0.6615, "step": 338 }, { "epoch": 0.26862123613312205, "grad_norm": 1.5319347094764475, "learning_rate": 9.671345818208773e-06, "loss": 0.636, "step": 339 }, { "epoch": 0.2694136291600634, "grad_norm": 1.8203439176935354, "learning_rate": 9.669077202884776e-06, "loss": 0.6825, "step": 340 }, { "epoch": 0.27020602218700474, "grad_norm": 1.506019062243574, "learning_rate": 9.666801052573746e-06, "loss": 0.6966, "step": 341 }, { "epoch": 0.27099841521394613, "grad_norm": 2.228873400178841, "learning_rate": 9.664517370948948e-06, "loss": 0.7202, "step": 342 }, { "epoch": 0.27179080824088747, "grad_norm": 1.5956601720026455, "learning_rate": 9.662226161695806e-06, "loss": 0.7649, "step": 343 }, { "epoch": 0.27258320126782887, "grad_norm": 1.4330199425516095, "learning_rate": 9.659927428511884e-06, "loss": 0.6576, "step": 344 }, { "epoch": 0.2733755942947702, "grad_norm": 1.4156889514152595, "learning_rate": 9.657621175106894e-06, "loss": 0.7764, "step": 345 }, { "epoch": 0.27416798732171155, "grad_norm": 1.5588933579130317, "learning_rate": 9.655307405202682e-06, "loss": 0.686, "step": 346 }, { "epoch": 0.27496038034865294, "grad_norm": 1.6266596509477005, "learning_rate": 9.652986122533225e-06, "loss": 0.6536, "step": 347 }, { "epoch": 0.2757527733755943, "grad_norm": 2.2116779559771453, "learning_rate": 9.650657330844626e-06, "loss": 0.6798, "step": 348 }, { "epoch": 0.2765451664025357, "grad_norm": 1.745750714266328, "learning_rate": 9.648321033895102e-06, "loss": 0.7236, "step": 349 }, { "epoch": 0.277337559429477, "grad_norm": 1.8813284583194643, "learning_rate": 9.645977235454986e-06, "loss": 0.7079, "step": 350 }, { "epoch": 0.27812995245641836, "grad_norm": 1.445268853660594, "learning_rate": 9.643625939306713e-06, "loss": 0.7129, "step": 351 }, { "epoch": 0.27892234548335976, "grad_norm": 1.451183884818232, "learning_rate": 9.64126714924482e-06, "loss": 0.6747, "step": 352 }, { "epoch": 0.2797147385103011, "grad_norm": 1.4037521902480943, "learning_rate": 9.63890086907594e-06, "loss": 0.706, "step": 353 }, { "epoch": 0.2805071315372425, "grad_norm": 1.3180882806966645, "learning_rate": 9.63652710261879e-06, "loss": 0.6993, "step": 354 }, { "epoch": 0.28129952456418383, "grad_norm": 1.3287884679571957, "learning_rate": 9.634145853704168e-06, "loss": 0.7659, "step": 355 }, { "epoch": 0.2820919175911252, "grad_norm": 1.3919838578046295, "learning_rate": 9.63175712617495e-06, "loss": 0.6588, "step": 356 }, { "epoch": 0.28288431061806657, "grad_norm": 1.6705250699253953, "learning_rate": 9.629360923886081e-06, "loss": 0.625, "step": 357 }, { "epoch": 0.2836767036450079, "grad_norm": 2.04700639606446, "learning_rate": 9.626957250704568e-06, "loss": 0.6893, "step": 358 }, { "epoch": 0.2844690966719493, "grad_norm": 2.2320740847339837, "learning_rate": 9.624546110509475e-06, "loss": 0.6397, "step": 359 }, { "epoch": 0.28526148969889065, "grad_norm": 1.398558475847401, "learning_rate": 9.62212750719191e-06, "loss": 0.7084, "step": 360 }, { "epoch": 0.286053882725832, "grad_norm": 1.2214994665444199, "learning_rate": 9.619701444655038e-06, "loss": 0.7025, "step": 361 }, { "epoch": 0.2868462757527734, "grad_norm": 1.3841717224409957, "learning_rate": 9.617267926814049e-06, "loss": 0.7355, "step": 362 }, { "epoch": 0.2876386687797147, "grad_norm": 1.6435432005350339, "learning_rate": 9.614826957596176e-06, "loss": 0.6237, "step": 363 }, { "epoch": 0.2884310618066561, "grad_norm": 1.3329660766782596, "learning_rate": 9.612378540940664e-06, "loss": 0.7513, "step": 364 }, { "epoch": 0.28922345483359746, "grad_norm": 1.313006175960244, "learning_rate": 9.609922680798787e-06, "loss": 0.6975, "step": 365 }, { "epoch": 0.2900158478605388, "grad_norm": 1.5236469554195855, "learning_rate": 9.607459381133827e-06, "loss": 0.6295, "step": 366 }, { "epoch": 0.2908082408874802, "grad_norm": 1.4013229652117536, "learning_rate": 9.604988645921074e-06, "loss": 0.6737, "step": 367 }, { "epoch": 0.29160063391442154, "grad_norm": 1.3795252438647503, "learning_rate": 9.602510479147818e-06, "loss": 0.7732, "step": 368 }, { "epoch": 0.29239302694136293, "grad_norm": 1.863067188558874, "learning_rate": 9.600024884813338e-06, "loss": 0.7059, "step": 369 }, { "epoch": 0.2931854199683043, "grad_norm": 1.730155660062322, "learning_rate": 9.597531866928901e-06, "loss": 0.6712, "step": 370 }, { "epoch": 0.29397781299524567, "grad_norm": 1.2426494422800478, "learning_rate": 9.59503142951776e-06, "loss": 0.6665, "step": 371 }, { "epoch": 0.294770206022187, "grad_norm": 1.3210072827081492, "learning_rate": 9.592523576615134e-06, "loss": 0.7565, "step": 372 }, { "epoch": 0.29556259904912835, "grad_norm": 1.6664957937041378, "learning_rate": 9.590008312268211e-06, "loss": 0.7893, "step": 373 }, { "epoch": 0.29635499207606975, "grad_norm": 1.375528754843239, "learning_rate": 9.587485640536145e-06, "loss": 0.7063, "step": 374 }, { "epoch": 0.2971473851030111, "grad_norm": 1.5201896658695802, "learning_rate": 9.584955565490037e-06, "loss": 0.6426, "step": 375 }, { "epoch": 0.2979397781299525, "grad_norm": 1.253553250209687, "learning_rate": 9.582418091212939e-06, "loss": 0.7052, "step": 376 }, { "epoch": 0.2987321711568938, "grad_norm": 1.3368143924609892, "learning_rate": 9.579873221799842e-06, "loss": 0.7656, "step": 377 }, { "epoch": 0.29952456418383516, "grad_norm": 1.4631734804938445, "learning_rate": 9.577320961357677e-06, "loss": 0.7866, "step": 378 }, { "epoch": 0.30031695721077656, "grad_norm": 1.5239946621178455, "learning_rate": 9.574761314005293e-06, "loss": 0.6539, "step": 379 }, { "epoch": 0.3011093502377179, "grad_norm": 1.6649374012236773, "learning_rate": 9.57219428387347e-06, "loss": 0.7744, "step": 380 }, { "epoch": 0.3019017432646593, "grad_norm": 1.5012608287209641, "learning_rate": 9.569619875104896e-06, "loss": 0.6756, "step": 381 }, { "epoch": 0.30269413629160064, "grad_norm": 1.457473346997756, "learning_rate": 9.56703809185417e-06, "loss": 0.6577, "step": 382 }, { "epoch": 0.303486529318542, "grad_norm": 1.4171198721152205, "learning_rate": 9.564448938287787e-06, "loss": 0.7058, "step": 383 }, { "epoch": 0.30427892234548337, "grad_norm": 1.660021009969608, "learning_rate": 9.561852418584143e-06, "loss": 0.7131, "step": 384 }, { "epoch": 0.3050713153724247, "grad_norm": 1.7802846841078732, "learning_rate": 9.559248536933516e-06, "loss": 0.707, "step": 385 }, { "epoch": 0.3058637083993661, "grad_norm": 1.773286814161426, "learning_rate": 9.55663729753807e-06, "loss": 0.6676, "step": 386 }, { "epoch": 0.30665610142630745, "grad_norm": 1.5536738469629374, "learning_rate": 9.554018704611838e-06, "loss": 0.7014, "step": 387 }, { "epoch": 0.3074484944532488, "grad_norm": 1.3846992429662202, "learning_rate": 9.551392762380721e-06, "loss": 0.6897, "step": 388 }, { "epoch": 0.3082408874801902, "grad_norm": 1.3199450273032427, "learning_rate": 9.548759475082485e-06, "loss": 0.6806, "step": 389 }, { "epoch": 0.3090332805071315, "grad_norm": 1.5239484509494476, "learning_rate": 9.546118846966742e-06, "loss": 0.6834, "step": 390 }, { "epoch": 0.3098256735340729, "grad_norm": 3.2275033178173556, "learning_rate": 9.54347088229496e-06, "loss": 0.6663, "step": 391 }, { "epoch": 0.31061806656101426, "grad_norm": 1.5872116862690424, "learning_rate": 9.540815585340437e-06, "loss": 0.6586, "step": 392 }, { "epoch": 0.3114104595879556, "grad_norm": 1.3989695922583294, "learning_rate": 9.53815296038831e-06, "loss": 0.7066, "step": 393 }, { "epoch": 0.312202852614897, "grad_norm": 3.8876660513131434, "learning_rate": 9.53548301173554e-06, "loss": 0.7101, "step": 394 }, { "epoch": 0.31299524564183834, "grad_norm": 1.3787114095192852, "learning_rate": 9.53280574369091e-06, "loss": 0.663, "step": 395 }, { "epoch": 0.31378763866877973, "grad_norm": 1.556154703351383, "learning_rate": 9.53012116057501e-06, "loss": 0.7211, "step": 396 }, { "epoch": 0.3145800316957211, "grad_norm": 1.4074817515677993, "learning_rate": 9.527429266720237e-06, "loss": 0.7198, "step": 397 }, { "epoch": 0.3153724247226624, "grad_norm": 1.4600337797654648, "learning_rate": 9.524730066470786e-06, "loss": 0.6377, "step": 398 }, { "epoch": 0.3161648177496038, "grad_norm": 1.4398135260545508, "learning_rate": 9.522023564182648e-06, "loss": 0.7167, "step": 399 }, { "epoch": 0.31695721077654515, "grad_norm": 1.4224976820682231, "learning_rate": 9.51930976422359e-06, "loss": 0.6937, "step": 400 }, { "epoch": 0.31774960380348655, "grad_norm": 1.4651120998318206, "learning_rate": 9.516588670973163e-06, "loss": 0.7096, "step": 401 }, { "epoch": 0.3185419968304279, "grad_norm": 1.4877328330189743, "learning_rate": 9.513860288822683e-06, "loss": 0.7204, "step": 402 }, { "epoch": 0.3193343898573692, "grad_norm": 1.329430282271346, "learning_rate": 9.511124622175229e-06, "loss": 0.6762, "step": 403 }, { "epoch": 0.3201267828843106, "grad_norm": 1.3184839396891384, "learning_rate": 9.508381675445643e-06, "loss": 0.6655, "step": 404 }, { "epoch": 0.32091917591125196, "grad_norm": 1.641645245723363, "learning_rate": 9.505631453060504e-06, "loss": 0.6966, "step": 405 }, { "epoch": 0.32171156893819336, "grad_norm": 1.4752487739044713, "learning_rate": 9.502873959458143e-06, "loss": 0.685, "step": 406 }, { "epoch": 0.3225039619651347, "grad_norm": 1.6943808347749485, "learning_rate": 9.50010919908862e-06, "loss": 0.7173, "step": 407 }, { "epoch": 0.3232963549920761, "grad_norm": 1.3904052150345305, "learning_rate": 9.497337176413723e-06, "loss": 0.6753, "step": 408 }, { "epoch": 0.32408874801901744, "grad_norm": 1.4727034846325833, "learning_rate": 9.49455789590696e-06, "loss": 0.6914, "step": 409 }, { "epoch": 0.3248811410459588, "grad_norm": 1.4752334347119678, "learning_rate": 9.49177136205355e-06, "loss": 0.6532, "step": 410 }, { "epoch": 0.3256735340729002, "grad_norm": 1.4642776933702704, "learning_rate": 9.488977579350423e-06, "loss": 0.7, "step": 411 }, { "epoch": 0.3264659270998415, "grad_norm": 1.3801124801303177, "learning_rate": 9.4861765523062e-06, "loss": 0.6144, "step": 412 }, { "epoch": 0.3272583201267829, "grad_norm": 1.4999185254492524, "learning_rate": 9.4833682854412e-06, "loss": 0.7241, "step": 413 }, { "epoch": 0.32805071315372425, "grad_norm": 1.5502226173196272, "learning_rate": 9.48055278328742e-06, "loss": 0.679, "step": 414 }, { "epoch": 0.3288431061806656, "grad_norm": 1.417166004223544, "learning_rate": 9.477730050388538e-06, "loss": 0.7227, "step": 415 }, { "epoch": 0.329635499207607, "grad_norm": 1.299241066543502, "learning_rate": 9.474900091299894e-06, "loss": 0.7238, "step": 416 }, { "epoch": 0.3304278922345483, "grad_norm": 1.500545977446442, "learning_rate": 9.4720629105885e-06, "loss": 0.6477, "step": 417 }, { "epoch": 0.3312202852614897, "grad_norm": 1.4456637711614557, "learning_rate": 9.469218512833013e-06, "loss": 0.6956, "step": 418 }, { "epoch": 0.33201267828843106, "grad_norm": 1.6758847314100742, "learning_rate": 9.46636690262374e-06, "loss": 0.7081, "step": 419 }, { "epoch": 0.3328050713153724, "grad_norm": 1.4670199884341066, "learning_rate": 9.463508084562632e-06, "loss": 0.7042, "step": 420 }, { "epoch": 0.3335974643423138, "grad_norm": 1.4877721911501867, "learning_rate": 9.460642063263263e-06, "loss": 0.7184, "step": 421 }, { "epoch": 0.33438985736925514, "grad_norm": 1.4275416301067623, "learning_rate": 9.457768843350841e-06, "loss": 0.6675, "step": 422 }, { "epoch": 0.33518225039619653, "grad_norm": 1.2692524008641817, "learning_rate": 9.454888429462185e-06, "loss": 0.7354, "step": 423 }, { "epoch": 0.3359746434231379, "grad_norm": 1.4043056978027224, "learning_rate": 9.452000826245726e-06, "loss": 0.695, "step": 424 }, { "epoch": 0.3367670364500792, "grad_norm": 1.5383147662290064, "learning_rate": 9.449106038361496e-06, "loss": 0.6856, "step": 425 }, { "epoch": 0.3375594294770206, "grad_norm": 2.9546551477728684, "learning_rate": 9.446204070481122e-06, "loss": 0.6608, "step": 426 }, { "epoch": 0.33835182250396195, "grad_norm": 1.4308566575193131, "learning_rate": 9.44329492728782e-06, "loss": 0.7273, "step": 427 }, { "epoch": 0.33914421553090335, "grad_norm": 1.531815986168058, "learning_rate": 9.440378613476384e-06, "loss": 0.6935, "step": 428 }, { "epoch": 0.3399366085578447, "grad_norm": 1.523439770199983, "learning_rate": 9.43745513375318e-06, "loss": 0.7037, "step": 429 }, { "epoch": 0.34072900158478603, "grad_norm": 1.930405775716038, "learning_rate": 9.434524492836139e-06, "loss": 0.7048, "step": 430 }, { "epoch": 0.3415213946117274, "grad_norm": 1.7293874560076852, "learning_rate": 9.431586695454746e-06, "loss": 0.6661, "step": 431 }, { "epoch": 0.34231378763866877, "grad_norm": 1.4150992499072284, "learning_rate": 9.428641746350041e-06, "loss": 0.7094, "step": 432 }, { "epoch": 0.34310618066561016, "grad_norm": 1.5340538438548992, "learning_rate": 9.425689650274597e-06, "loss": 0.6787, "step": 433 }, { "epoch": 0.3438985736925515, "grad_norm": 1.4449740291606747, "learning_rate": 9.422730411992532e-06, "loss": 0.7097, "step": 434 }, { "epoch": 0.34469096671949284, "grad_norm": 1.2405383168669448, "learning_rate": 9.419764036279477e-06, "loss": 0.7198, "step": 435 }, { "epoch": 0.34548335974643424, "grad_norm": 1.4314082498028562, "learning_rate": 9.416790527922595e-06, "loss": 0.6956, "step": 436 }, { "epoch": 0.3462757527733756, "grad_norm": 1.3614849517889276, "learning_rate": 9.41380989172055e-06, "loss": 0.7475, "step": 437 }, { "epoch": 0.347068145800317, "grad_norm": 2.8013654923087437, "learning_rate": 9.41082213248351e-06, "loss": 0.7387, "step": 438 }, { "epoch": 0.3478605388272583, "grad_norm": 1.189832859559697, "learning_rate": 9.407827255033144e-06, "loss": 0.7126, "step": 439 }, { "epoch": 0.3486529318541997, "grad_norm": 1.2496829334469934, "learning_rate": 9.404825264202602e-06, "loss": 0.6964, "step": 440 }, { "epoch": 0.34944532488114105, "grad_norm": 1.4814985669147764, "learning_rate": 9.401816164836517e-06, "loss": 0.6672, "step": 441 }, { "epoch": 0.3502377179080824, "grad_norm": 1.6062130311058185, "learning_rate": 9.398799961790995e-06, "loss": 0.7283, "step": 442 }, { "epoch": 0.3510301109350238, "grad_norm": 1.2895695126570506, "learning_rate": 9.3957766599336e-06, "loss": 0.7654, "step": 443 }, { "epoch": 0.3518225039619651, "grad_norm": 1.642984728484263, "learning_rate": 9.392746264143359e-06, "loss": 0.5894, "step": 444 }, { "epoch": 0.3526148969889065, "grad_norm": 7.457212613114814, "learning_rate": 9.389708779310743e-06, "loss": 0.707, "step": 445 }, { "epoch": 0.35340729001584786, "grad_norm": 1.5082688316645068, "learning_rate": 9.386664210337665e-06, "loss": 0.6665, "step": 446 }, { "epoch": 0.3541996830427892, "grad_norm": 1.770022331031015, "learning_rate": 9.383612562137473e-06, "loss": 0.6847, "step": 447 }, { "epoch": 0.3549920760697306, "grad_norm": 1.8173254461476576, "learning_rate": 9.380553839634932e-06, "loss": 0.6662, "step": 448 }, { "epoch": 0.35578446909667194, "grad_norm": 1.3342295516514362, "learning_rate": 9.377488047766233e-06, "loss": 0.745, "step": 449 }, { "epoch": 0.35657686212361334, "grad_norm": 1.5161449899567243, "learning_rate": 9.374415191478964e-06, "loss": 0.7117, "step": 450 }, { "epoch": 0.3573692551505547, "grad_norm": 1.6216324889863643, "learning_rate": 9.371335275732127e-06, "loss": 0.7195, "step": 451 }, { "epoch": 0.358161648177496, "grad_norm": 1.2880550162639994, "learning_rate": 9.368248305496108e-06, "loss": 0.7183, "step": 452 }, { "epoch": 0.3589540412044374, "grad_norm": 1.2753998038251322, "learning_rate": 9.365154285752678e-06, "loss": 0.7041, "step": 453 }, { "epoch": 0.35974643423137875, "grad_norm": 1.3938580786529313, "learning_rate": 9.362053221494987e-06, "loss": 0.7351, "step": 454 }, { "epoch": 0.36053882725832015, "grad_norm": 1.4260996051082016, "learning_rate": 9.358945117727553e-06, "loss": 0.7463, "step": 455 }, { "epoch": 0.3613312202852615, "grad_norm": 1.788119873429661, "learning_rate": 9.355829979466253e-06, "loss": 0.6953, "step": 456 }, { "epoch": 0.36212361331220283, "grad_norm": 1.3533769896698373, "learning_rate": 9.35270781173832e-06, "loss": 0.7253, "step": 457 }, { "epoch": 0.3629160063391442, "grad_norm": 1.3979425891749786, "learning_rate": 9.349578619582325e-06, "loss": 0.7138, "step": 458 }, { "epoch": 0.36370839936608557, "grad_norm": 1.2910154523970991, "learning_rate": 9.346442408048179e-06, "loss": 0.7419, "step": 459 }, { "epoch": 0.36450079239302696, "grad_norm": 1.4484397059054845, "learning_rate": 9.343299182197124e-06, "loss": 0.6598, "step": 460 }, { "epoch": 0.3652931854199683, "grad_norm": 1.4479493320983814, "learning_rate": 9.340148947101713e-06, "loss": 0.7245, "step": 461 }, { "epoch": 0.36608557844690964, "grad_norm": 1.3747578775755138, "learning_rate": 9.336991707845821e-06, "loss": 0.8329, "step": 462 }, { "epoch": 0.36687797147385104, "grad_norm": 1.5653928995244943, "learning_rate": 9.333827469524617e-06, "loss": 0.6304, "step": 463 }, { "epoch": 0.3676703645007924, "grad_norm": 1.2678321962623096, "learning_rate": 9.330656237244572e-06, "loss": 0.6674, "step": 464 }, { "epoch": 0.3684627575277338, "grad_norm": 1.261592894760106, "learning_rate": 9.32747801612344e-06, "loss": 0.6309, "step": 465 }, { "epoch": 0.3692551505546751, "grad_norm": 1.2086001128840462, "learning_rate": 9.324292811290255e-06, "loss": 0.6028, "step": 466 }, { "epoch": 0.37004754358161646, "grad_norm": 1.7484568454290483, "learning_rate": 9.321100627885322e-06, "loss": 0.7225, "step": 467 }, { "epoch": 0.37083993660855785, "grad_norm": 1.3618722326073736, "learning_rate": 9.317901471060206e-06, "loss": 0.7299, "step": 468 }, { "epoch": 0.3716323296354992, "grad_norm": 1.211565305220343, "learning_rate": 9.314695345977728e-06, "loss": 0.7154, "step": 469 }, { "epoch": 0.3724247226624406, "grad_norm": 1.485114083763755, "learning_rate": 9.311482257811953e-06, "loss": 0.7312, "step": 470 }, { "epoch": 0.37321711568938193, "grad_norm": 1.6926954457327175, "learning_rate": 9.308262211748182e-06, "loss": 0.748, "step": 471 }, { "epoch": 0.37400950871632327, "grad_norm": 1.4512598126745988, "learning_rate": 9.305035212982949e-06, "loss": 0.7168, "step": 472 }, { "epoch": 0.37480190174326466, "grad_norm": 1.481936734498218, "learning_rate": 9.301801266724003e-06, "loss": 0.7297, "step": 473 }, { "epoch": 0.375594294770206, "grad_norm": 1.4405458934877604, "learning_rate": 9.298560378190309e-06, "loss": 0.6827, "step": 474 }, { "epoch": 0.3763866877971474, "grad_norm": 1.544090501596855, "learning_rate": 9.295312552612035e-06, "loss": 0.7139, "step": 475 }, { "epoch": 0.37717908082408874, "grad_norm": 1.3464852382247325, "learning_rate": 9.292057795230541e-06, "loss": 0.7151, "step": 476 }, { "epoch": 0.37797147385103014, "grad_norm": 1.3795341277205588, "learning_rate": 9.288796111298375e-06, "loss": 0.7177, "step": 477 }, { "epoch": 0.3787638668779715, "grad_norm": 1.3712540643149906, "learning_rate": 9.285527506079264e-06, "loss": 0.6875, "step": 478 }, { "epoch": 0.3795562599049128, "grad_norm": 1.2028082189357379, "learning_rate": 9.282251984848105e-06, "loss": 0.7993, "step": 479 }, { "epoch": 0.3803486529318542, "grad_norm": 1.4739325475372091, "learning_rate": 9.278969552890957e-06, "loss": 0.6643, "step": 480 }, { "epoch": 0.38114104595879555, "grad_norm": 1.8066353018795047, "learning_rate": 9.275680215505028e-06, "loss": 0.7361, "step": 481 }, { "epoch": 0.38193343898573695, "grad_norm": 2.125300596652203, "learning_rate": 9.27238397799867e-06, "loss": 0.6329, "step": 482 }, { "epoch": 0.3827258320126783, "grad_norm": 1.4437914185464966, "learning_rate": 9.269080845691379e-06, "loss": 0.7356, "step": 483 }, { "epoch": 0.38351822503961963, "grad_norm": 1.2169788329570492, "learning_rate": 9.265770823913762e-06, "loss": 0.7252, "step": 484 }, { "epoch": 0.384310618066561, "grad_norm": 2.8595649297450865, "learning_rate": 9.262453918007562e-06, "loss": 0.6589, "step": 485 }, { "epoch": 0.38510301109350237, "grad_norm": 1.4377306291590413, "learning_rate": 9.259130133325618e-06, "loss": 0.6594, "step": 486 }, { "epoch": 0.38589540412044376, "grad_norm": 2.0325942194618163, "learning_rate": 9.255799475231878e-06, "loss": 0.6446, "step": 487 }, { "epoch": 0.3866877971473851, "grad_norm": 1.985620715689071, "learning_rate": 9.25246194910138e-06, "loss": 0.6982, "step": 488 }, { "epoch": 0.38748019017432644, "grad_norm": 1.7575064433575358, "learning_rate": 9.24911756032024e-06, "loss": 0.672, "step": 489 }, { "epoch": 0.38827258320126784, "grad_norm": 1.4338768373696247, "learning_rate": 9.245766314285662e-06, "loss": 0.6147, "step": 490 }, { "epoch": 0.3890649762282092, "grad_norm": 1.302592635897776, "learning_rate": 9.242408216405903e-06, "loss": 0.6393, "step": 491 }, { "epoch": 0.3898573692551506, "grad_norm": 1.3372150426737357, "learning_rate": 9.239043272100285e-06, "loss": 0.734, "step": 492 }, { "epoch": 0.3906497622820919, "grad_norm": 1.7436698382274254, "learning_rate": 9.235671486799177e-06, "loss": 0.6889, "step": 493 }, { "epoch": 0.39144215530903326, "grad_norm": 1.7270201530229021, "learning_rate": 9.232292865943989e-06, "loss": 0.6959, "step": 494 }, { "epoch": 0.39223454833597465, "grad_norm": 1.5244508821108163, "learning_rate": 9.228907414987157e-06, "loss": 0.72, "step": 495 }, { "epoch": 0.393026941362916, "grad_norm": 1.3080472143244006, "learning_rate": 9.225515139392149e-06, "loss": 0.6496, "step": 496 }, { "epoch": 0.3938193343898574, "grad_norm": 1.4352599020222172, "learning_rate": 9.222116044633439e-06, "loss": 0.6589, "step": 497 }, { "epoch": 0.39461172741679873, "grad_norm": 1.3582897220078736, "learning_rate": 9.218710136196507e-06, "loss": 0.6654, "step": 498 }, { "epoch": 0.39540412044374007, "grad_norm": 1.3399959572662876, "learning_rate": 9.215297419577831e-06, "loss": 0.6554, "step": 499 }, { "epoch": 0.39619651347068147, "grad_norm": 1.5571451122076374, "learning_rate": 9.211877900284876e-06, "loss": 0.7221, "step": 500 }, { "epoch": 0.3969889064976228, "grad_norm": 1.8426616947805006, "learning_rate": 9.208451583836084e-06, "loss": 0.7023, "step": 501 }, { "epoch": 0.3977812995245642, "grad_norm": 1.5445622271394672, "learning_rate": 9.205018475760868e-06, "loss": 0.6624, "step": 502 }, { "epoch": 0.39857369255150554, "grad_norm": 7.196252529714104, "learning_rate": 9.201578581599596e-06, "loss": 0.719, "step": 503 }, { "epoch": 0.3993660855784469, "grad_norm": 1.5093021268672246, "learning_rate": 9.198131906903597e-06, "loss": 0.7323, "step": 504 }, { "epoch": 0.4001584786053883, "grad_norm": 1.4481690771245022, "learning_rate": 9.19467845723513e-06, "loss": 0.6497, "step": 505 }, { "epoch": 0.4009508716323296, "grad_norm": 2.7906898461097103, "learning_rate": 9.191218238167401e-06, "loss": 0.7172, "step": 506 }, { "epoch": 0.401743264659271, "grad_norm": 1.5432709545562344, "learning_rate": 9.187751255284532e-06, "loss": 0.646, "step": 507 }, { "epoch": 0.40253565768621236, "grad_norm": 1.385824617343005, "learning_rate": 9.184277514181559e-06, "loss": 0.6835, "step": 508 }, { "epoch": 0.40332805071315375, "grad_norm": 1.8630939189131719, "learning_rate": 9.180797020464433e-06, "loss": 0.696, "step": 509 }, { "epoch": 0.4041204437400951, "grad_norm": 1.4444154081932532, "learning_rate": 9.177309779749994e-06, "loss": 0.6542, "step": 510 }, { "epoch": 0.40491283676703643, "grad_norm": 1.9223312995245656, "learning_rate": 9.173815797665974e-06, "loss": 0.6459, "step": 511 }, { "epoch": 0.40570522979397783, "grad_norm": 1.6841926475319628, "learning_rate": 9.170315079850987e-06, "loss": 0.6891, "step": 512 }, { "epoch": 0.40649762282091917, "grad_norm": 1.272930033594402, "learning_rate": 9.166807631954508e-06, "loss": 0.7111, "step": 513 }, { "epoch": 0.40729001584786056, "grad_norm": 1.4400576938779581, "learning_rate": 9.163293459636886e-06, "loss": 0.6923, "step": 514 }, { "epoch": 0.4080824088748019, "grad_norm": 1.2696324026950154, "learning_rate": 9.15977256856931e-06, "loss": 0.6973, "step": 515 }, { "epoch": 0.40887480190174325, "grad_norm": 1.4753347770381742, "learning_rate": 9.15624496443382e-06, "loss": 0.7117, "step": 516 }, { "epoch": 0.40966719492868464, "grad_norm": 1.2782321492320705, "learning_rate": 9.152710652923284e-06, "loss": 0.6427, "step": 517 }, { "epoch": 0.410459587955626, "grad_norm": 1.3499993582595027, "learning_rate": 9.149169639741397e-06, "loss": 0.6632, "step": 518 }, { "epoch": 0.4112519809825674, "grad_norm": 1.3600923832812266, "learning_rate": 9.145621930602671e-06, "loss": 0.6379, "step": 519 }, { "epoch": 0.4120443740095087, "grad_norm": 1.4710560722162735, "learning_rate": 9.142067531232422e-06, "loss": 0.6852, "step": 520 }, { "epoch": 0.41283676703645006, "grad_norm": 1.3674722778448989, "learning_rate": 9.13850644736676e-06, "loss": 0.6685, "step": 521 }, { "epoch": 0.41362916006339145, "grad_norm": 1.2279497140382787, "learning_rate": 9.134938684752588e-06, "loss": 0.7147, "step": 522 }, { "epoch": 0.4144215530903328, "grad_norm": 1.537597988013878, "learning_rate": 9.131364249147583e-06, "loss": 0.682, "step": 523 }, { "epoch": 0.4152139461172742, "grad_norm": 1.453563136793913, "learning_rate": 9.127783146320191e-06, "loss": 0.7018, "step": 524 }, { "epoch": 0.41600633914421553, "grad_norm": 2.6961311943540642, "learning_rate": 9.124195382049621e-06, "loss": 0.6887, "step": 525 }, { "epoch": 0.41679873217115687, "grad_norm": 1.5042160869565526, "learning_rate": 9.12060096212583e-06, "loss": 0.7198, "step": 526 }, { "epoch": 0.41759112519809827, "grad_norm": 1.4813727694453103, "learning_rate": 9.116999892349515e-06, "loss": 0.7136, "step": 527 }, { "epoch": 0.4183835182250396, "grad_norm": 1.4368986434085886, "learning_rate": 9.113392178532107e-06, "loss": 0.6622, "step": 528 }, { "epoch": 0.419175911251981, "grad_norm": 1.3330229378927907, "learning_rate": 9.109777826495758e-06, "loss": 0.6532, "step": 529 }, { "epoch": 0.41996830427892234, "grad_norm": 3.1696136522766136, "learning_rate": 9.106156842073333e-06, "loss": 0.6696, "step": 530 }, { "epoch": 0.4207606973058637, "grad_norm": 1.3673337325355996, "learning_rate": 9.102529231108399e-06, "loss": 0.7951, "step": 531 }, { "epoch": 0.4215530903328051, "grad_norm": 1.4671206791447333, "learning_rate": 9.098894999455217e-06, "loss": 0.6899, "step": 532 }, { "epoch": 0.4223454833597464, "grad_norm": 1.4818351774221974, "learning_rate": 9.095254152978736e-06, "loss": 0.7072, "step": 533 }, { "epoch": 0.4231378763866878, "grad_norm": 1.9908653642975553, "learning_rate": 9.091606697554576e-06, "loss": 0.688, "step": 534 }, { "epoch": 0.42393026941362916, "grad_norm": 1.3437528072636544, "learning_rate": 9.087952639069027e-06, "loss": 0.728, "step": 535 }, { "epoch": 0.4247226624405705, "grad_norm": 1.3463974154765472, "learning_rate": 9.084291983419027e-06, "loss": 0.716, "step": 536 }, { "epoch": 0.4255150554675119, "grad_norm": 1.574778479455885, "learning_rate": 9.080624736512171e-06, "loss": 0.7276, "step": 537 }, { "epoch": 0.42630744849445323, "grad_norm": 1.5067303756627892, "learning_rate": 9.076950904266682e-06, "loss": 0.6836, "step": 538 }, { "epoch": 0.42709984152139463, "grad_norm": 1.3624470254444665, "learning_rate": 9.073270492611417e-06, "loss": 0.6707, "step": 539 }, { "epoch": 0.42789223454833597, "grad_norm": 1.211272144523275, "learning_rate": 9.069583507485847e-06, "loss": 0.672, "step": 540 }, { "epoch": 0.4286846275752773, "grad_norm": 1.4199040685634723, "learning_rate": 9.065889954840052e-06, "loss": 0.6796, "step": 541 }, { "epoch": 0.4294770206022187, "grad_norm": 1.401092239450817, "learning_rate": 9.062189840634712e-06, "loss": 0.6417, "step": 542 }, { "epoch": 0.43026941362916005, "grad_norm": 1.4838735399907101, "learning_rate": 9.058483170841096e-06, "loss": 0.6822, "step": 543 }, { "epoch": 0.43106180665610144, "grad_norm": 1.4198397398339657, "learning_rate": 9.05476995144105e-06, "loss": 0.7084, "step": 544 }, { "epoch": 0.4318541996830428, "grad_norm": 1.780508413948861, "learning_rate": 9.051050188426992e-06, "loss": 0.7225, "step": 545 }, { "epoch": 0.4326465927099842, "grad_norm": 1.430225558969414, "learning_rate": 9.0473238878019e-06, "loss": 0.7353, "step": 546 }, { "epoch": 0.4334389857369255, "grad_norm": 3.066343674111696, "learning_rate": 9.043591055579303e-06, "loss": 0.6651, "step": 547 }, { "epoch": 0.43423137876386686, "grad_norm": 1.4943985973613299, "learning_rate": 9.039851697783268e-06, "loss": 0.7163, "step": 548 }, { "epoch": 0.43502377179080826, "grad_norm": 1.4316736116381354, "learning_rate": 9.036105820448395e-06, "loss": 0.6727, "step": 549 }, { "epoch": 0.4358161648177496, "grad_norm": 1.4232764384957224, "learning_rate": 9.032353429619807e-06, "loss": 0.663, "step": 550 }, { "epoch": 0.436608557844691, "grad_norm": 1.5310855072711673, "learning_rate": 9.028594531353135e-06, "loss": 0.7255, "step": 551 }, { "epoch": 0.43740095087163233, "grad_norm": 1.5305057400844357, "learning_rate": 9.024829131714514e-06, "loss": 0.6593, "step": 552 }, { "epoch": 0.4381933438985737, "grad_norm": 1.410131506958036, "learning_rate": 9.021057236780569e-06, "loss": 0.6601, "step": 553 }, { "epoch": 0.43898573692551507, "grad_norm": 1.4007413507626543, "learning_rate": 9.017278852638412e-06, "loss": 0.6311, "step": 554 }, { "epoch": 0.4397781299524564, "grad_norm": 1.299424614842858, "learning_rate": 9.01349398538562e-06, "loss": 0.6741, "step": 555 }, { "epoch": 0.4405705229793978, "grad_norm": 1.9128375943915301, "learning_rate": 9.009702641130236e-06, "loss": 0.7101, "step": 556 }, { "epoch": 0.44136291600633915, "grad_norm": 1.287316393586707, "learning_rate": 9.005904825990759e-06, "loss": 0.6816, "step": 557 }, { "epoch": 0.4421553090332805, "grad_norm": 1.5410927903550433, "learning_rate": 9.002100546096127e-06, "loss": 0.6401, "step": 558 }, { "epoch": 0.4429477020602219, "grad_norm": 1.9670666740695117, "learning_rate": 8.99828980758571e-06, "loss": 0.6797, "step": 559 }, { "epoch": 0.4437400950871632, "grad_norm": 1.4934976906313617, "learning_rate": 8.994472616609303e-06, "loss": 0.6991, "step": 560 }, { "epoch": 0.4445324881141046, "grad_norm": 1.2951959634480705, "learning_rate": 8.990648979327115e-06, "loss": 0.6952, "step": 561 }, { "epoch": 0.44532488114104596, "grad_norm": 1.5157970042321711, "learning_rate": 8.986818901909755e-06, "loss": 0.691, "step": 562 }, { "epoch": 0.4461172741679873, "grad_norm": 1.4657251365557096, "learning_rate": 8.982982390538226e-06, "loss": 0.7016, "step": 563 }, { "epoch": 0.4469096671949287, "grad_norm": 1.3734525299768279, "learning_rate": 8.979139451403917e-06, "loss": 0.6817, "step": 564 }, { "epoch": 0.44770206022187004, "grad_norm": 1.350398533590499, "learning_rate": 8.975290090708587e-06, "loss": 0.7211, "step": 565 }, { "epoch": 0.44849445324881143, "grad_norm": 1.3492600262463184, "learning_rate": 8.97143431466436e-06, "loss": 0.7299, "step": 566 }, { "epoch": 0.44928684627575277, "grad_norm": 1.2474555292416396, "learning_rate": 8.96757212949371e-06, "loss": 0.6961, "step": 567 }, { "epoch": 0.4500792393026941, "grad_norm": 1.2327454577238706, "learning_rate": 8.963703541429459e-06, "loss": 0.7072, "step": 568 }, { "epoch": 0.4508716323296355, "grad_norm": 1.1981857461526593, "learning_rate": 8.95982855671476e-06, "loss": 0.6673, "step": 569 }, { "epoch": 0.45166402535657685, "grad_norm": 1.568691678238264, "learning_rate": 8.955947181603083e-06, "loss": 0.6865, "step": 570 }, { "epoch": 0.45245641838351824, "grad_norm": 1.4714327813400059, "learning_rate": 8.952059422358225e-06, "loss": 0.7258, "step": 571 }, { "epoch": 0.4532488114104596, "grad_norm": 1.3177967137121092, "learning_rate": 8.94816528525427e-06, "loss": 0.5938, "step": 572 }, { "epoch": 0.4540412044374009, "grad_norm": 1.4706196791448893, "learning_rate": 8.944264776575605e-06, "loss": 0.6947, "step": 573 }, { "epoch": 0.4548335974643423, "grad_norm": 4.815861968390517, "learning_rate": 8.940357902616895e-06, "loss": 0.6134, "step": 574 }, { "epoch": 0.45562599049128366, "grad_norm": 1.4192235084191303, "learning_rate": 8.93644466968308e-06, "loss": 0.6915, "step": 575 }, { "epoch": 0.45641838351822506, "grad_norm": 3.5242885243269018, "learning_rate": 8.932525084089358e-06, "loss": 0.693, "step": 576 }, { "epoch": 0.4572107765451664, "grad_norm": 1.2775549368145716, "learning_rate": 8.928599152161182e-06, "loss": 0.6794, "step": 577 }, { "epoch": 0.45800316957210774, "grad_norm": 1.6392339654724513, "learning_rate": 8.92466688023425e-06, "loss": 0.7091, "step": 578 }, { "epoch": 0.45879556259904913, "grad_norm": 1.277263406214335, "learning_rate": 8.920728274654483e-06, "loss": 0.7186, "step": 579 }, { "epoch": 0.4595879556259905, "grad_norm": 1.2289683894073793, "learning_rate": 8.91678334177803e-06, "loss": 0.7678, "step": 580 }, { "epoch": 0.46038034865293187, "grad_norm": 3.702691499294028, "learning_rate": 8.912832087971253e-06, "loss": 0.7109, "step": 581 }, { "epoch": 0.4611727416798732, "grad_norm": 1.459981579083792, "learning_rate": 8.908874519610706e-06, "loss": 0.7123, "step": 582 }, { "epoch": 0.4619651347068146, "grad_norm": 1.2617529050038687, "learning_rate": 8.90491064308314e-06, "loss": 0.6627, "step": 583 }, { "epoch": 0.46275752773375595, "grad_norm": 1.3866591211062722, "learning_rate": 8.900940464785485e-06, "loss": 0.7067, "step": 584 }, { "epoch": 0.4635499207606973, "grad_norm": 1.3472042387182472, "learning_rate": 8.89696399112484e-06, "loss": 0.7386, "step": 585 }, { "epoch": 0.4643423137876387, "grad_norm": 1.5025124196469166, "learning_rate": 8.892981228518465e-06, "loss": 0.6477, "step": 586 }, { "epoch": 0.46513470681458, "grad_norm": 1.1840801033376975, "learning_rate": 8.888992183393765e-06, "loss": 0.7214, "step": 587 }, { "epoch": 0.4659270998415214, "grad_norm": 1.259902097190002, "learning_rate": 8.88499686218829e-06, "loss": 0.641, "step": 588 }, { "epoch": 0.46671949286846276, "grad_norm": 1.196174121275428, "learning_rate": 8.880995271349717e-06, "loss": 0.6947, "step": 589 }, { "epoch": 0.4675118858954041, "grad_norm": 1.33433112538706, "learning_rate": 8.876987417335834e-06, "loss": 0.6878, "step": 590 }, { "epoch": 0.4683042789223455, "grad_norm": 1.286500849146215, "learning_rate": 8.872973306614547e-06, "loss": 0.7295, "step": 591 }, { "epoch": 0.46909667194928684, "grad_norm": 1.5216132462489012, "learning_rate": 8.868952945663849e-06, "loss": 0.6899, "step": 592 }, { "epoch": 0.46988906497622823, "grad_norm": 1.4362888629803232, "learning_rate": 8.864926340971827e-06, "loss": 0.675, "step": 593 }, { "epoch": 0.4706814580031696, "grad_norm": 1.2042919266343723, "learning_rate": 8.860893499036643e-06, "loss": 0.6833, "step": 594 }, { "epoch": 0.4714738510301109, "grad_norm": 1.3069194288738455, "learning_rate": 8.856854426366518e-06, "loss": 0.6621, "step": 595 }, { "epoch": 0.4722662440570523, "grad_norm": 1.6956412354855446, "learning_rate": 8.852809129479742e-06, "loss": 0.6548, "step": 596 }, { "epoch": 0.47305863708399365, "grad_norm": 2.3814468130946826, "learning_rate": 8.848757614904635e-06, "loss": 0.7023, "step": 597 }, { "epoch": 0.47385103011093505, "grad_norm": 8.227412221546855, "learning_rate": 8.84469988917956e-06, "loss": 0.5557, "step": 598 }, { "epoch": 0.4746434231378764, "grad_norm": 2.021086167009846, "learning_rate": 8.840635958852901e-06, "loss": 0.6602, "step": 599 }, { "epoch": 0.4754358161648177, "grad_norm": 1.289749341135909, "learning_rate": 8.836565830483054e-06, "loss": 0.6694, "step": 600 }, { "epoch": 0.4762282091917591, "grad_norm": 1.3957672761920774, "learning_rate": 8.832489510638417e-06, "loss": 0.676, "step": 601 }, { "epoch": 0.47702060221870046, "grad_norm": 1.2529423419642476, "learning_rate": 8.828407005897386e-06, "loss": 0.6564, "step": 602 }, { "epoch": 0.47781299524564186, "grad_norm": 1.7215508604352976, "learning_rate": 8.824318322848331e-06, "loss": 0.5954, "step": 603 }, { "epoch": 0.4786053882725832, "grad_norm": 1.7061378518228891, "learning_rate": 8.820223468089596e-06, "loss": 0.6421, "step": 604 }, { "epoch": 0.47939778129952454, "grad_norm": 1.4339672134745811, "learning_rate": 8.81612244822948e-06, "loss": 0.6701, "step": 605 }, { "epoch": 0.48019017432646594, "grad_norm": 2.304454191983635, "learning_rate": 8.812015269886241e-06, "loss": 0.6825, "step": 606 }, { "epoch": 0.4809825673534073, "grad_norm": 1.1951994257752048, "learning_rate": 8.807901939688069e-06, "loss": 0.7142, "step": 607 }, { "epoch": 0.48177496038034867, "grad_norm": 1.2774494511216659, "learning_rate": 8.803782464273078e-06, "loss": 0.675, "step": 608 }, { "epoch": 0.48256735340729, "grad_norm": 1.4130486941856666, "learning_rate": 8.79965685028931e-06, "loss": 0.6698, "step": 609 }, { "epoch": 0.48335974643423135, "grad_norm": 1.442781714932764, "learning_rate": 8.795525104394706e-06, "loss": 0.6462, "step": 610 }, { "epoch": 0.48415213946117275, "grad_norm": 1.5348429976583395, "learning_rate": 8.791387233257103e-06, "loss": 0.6593, "step": 611 }, { "epoch": 0.4849445324881141, "grad_norm": 1.215549636244647, "learning_rate": 8.787243243554224e-06, "loss": 0.6803, "step": 612 }, { "epoch": 0.4857369255150555, "grad_norm": 1.4611710674071792, "learning_rate": 8.783093141973669e-06, "loss": 0.6115, "step": 613 }, { "epoch": 0.4865293185419968, "grad_norm": 1.1717156028016822, "learning_rate": 8.778936935212896e-06, "loss": 0.6989, "step": 614 }, { "epoch": 0.4873217115689382, "grad_norm": 1.2541532704302287, "learning_rate": 8.774774629979221e-06, "loss": 0.6254, "step": 615 }, { "epoch": 0.48811410459587956, "grad_norm": 1.3407070065308695, "learning_rate": 8.770606232989795e-06, "loss": 0.6776, "step": 616 }, { "epoch": 0.4889064976228209, "grad_norm": 1.3572716121515966, "learning_rate": 8.76643175097161e-06, "loss": 0.7085, "step": 617 }, { "epoch": 0.4896988906497623, "grad_norm": 1.387420703732812, "learning_rate": 8.762251190661467e-06, "loss": 0.663, "step": 618 }, { "epoch": 0.49049128367670364, "grad_norm": 1.2741511958305822, "learning_rate": 8.758064558805984e-06, "loss": 0.6985, "step": 619 }, { "epoch": 0.49128367670364503, "grad_norm": 1.3704069221266513, "learning_rate": 8.75387186216157e-06, "loss": 0.6427, "step": 620 }, { "epoch": 0.4920760697305864, "grad_norm": 1.2963461010395163, "learning_rate": 8.749673107494435e-06, "loss": 0.7067, "step": 621 }, { "epoch": 0.4928684627575277, "grad_norm": 1.6434110487769873, "learning_rate": 8.745468301580548e-06, "loss": 0.5943, "step": 622 }, { "epoch": 0.4936608557844691, "grad_norm": 1.749042832300107, "learning_rate": 8.741257451205658e-06, "loss": 0.7093, "step": 623 }, { "epoch": 0.49445324881141045, "grad_norm": 2.1514638890103805, "learning_rate": 8.737040563165258e-06, "loss": 0.6966, "step": 624 }, { "epoch": 0.49524564183835185, "grad_norm": 2.2277863755676135, "learning_rate": 8.732817644264592e-06, "loss": 0.6798, "step": 625 }, { "epoch": 0.4960380348652932, "grad_norm": 1.5383327562829752, "learning_rate": 8.728588701318632e-06, "loss": 0.6787, "step": 626 }, { "epoch": 0.4968304278922345, "grad_norm": 1.1735664179471057, "learning_rate": 8.724353741152076e-06, "loss": 0.6746, "step": 627 }, { "epoch": 0.4976228209191759, "grad_norm": 1.4246187798473395, "learning_rate": 8.720112770599328e-06, "loss": 0.6648, "step": 628 }, { "epoch": 0.49841521394611726, "grad_norm": 1.2906684090898262, "learning_rate": 8.715865796504493e-06, "loss": 0.7401, "step": 629 }, { "epoch": 0.49920760697305866, "grad_norm": 1.2884447152582807, "learning_rate": 8.711612825721367e-06, "loss": 0.6999, "step": 630 }, { "epoch": 0.5, "grad_norm": 1.1547201370740663, "learning_rate": 8.707353865113423e-06, "loss": 0.6678, "step": 631 }, { "epoch": 0.5007923930269413, "grad_norm": 1.3471031839878325, "learning_rate": 8.703088921553797e-06, "loss": 0.597, "step": 632 }, { "epoch": 0.5015847860538827, "grad_norm": 1.2869430823032788, "learning_rate": 8.698818001925284e-06, "loss": 0.5955, "step": 633 }, { "epoch": 0.5023771790808241, "grad_norm": 1.2519417959692731, "learning_rate": 8.69454111312032e-06, "loss": 0.7501, "step": 634 }, { "epoch": 0.5031695721077655, "grad_norm": 1.6288098622199063, "learning_rate": 8.690258262040978e-06, "loss": 0.6443, "step": 635 }, { "epoch": 0.5039619651347068, "grad_norm": 2.4678988148301397, "learning_rate": 8.685969455598949e-06, "loss": 0.6349, "step": 636 }, { "epoch": 0.5047543581616482, "grad_norm": 1.4160235315442788, "learning_rate": 8.681674700715537e-06, "loss": 0.672, "step": 637 }, { "epoch": 0.5055467511885895, "grad_norm": 1.3951856382080847, "learning_rate": 8.677374004321647e-06, "loss": 0.6319, "step": 638 }, { "epoch": 0.506339144215531, "grad_norm": 1.288765272400529, "learning_rate": 8.673067373357766e-06, "loss": 0.6926, "step": 639 }, { "epoch": 0.5071315372424723, "grad_norm": 2.4384266045557896, "learning_rate": 8.668754814773968e-06, "loss": 0.7067, "step": 640 }, { "epoch": 0.5079239302694136, "grad_norm": 1.41375745139352, "learning_rate": 8.664436335529886e-06, "loss": 0.7703, "step": 641 }, { "epoch": 0.508716323296355, "grad_norm": 1.2988156740744776, "learning_rate": 8.660111942594712e-06, "loss": 0.6656, "step": 642 }, { "epoch": 0.5095087163232963, "grad_norm": 1.3430742244623575, "learning_rate": 8.655781642947176e-06, "loss": 0.7162, "step": 643 }, { "epoch": 0.5103011093502378, "grad_norm": 1.262316379030142, "learning_rate": 8.651445443575545e-06, "loss": 0.7254, "step": 644 }, { "epoch": 0.5110935023771791, "grad_norm": 1.380949659590829, "learning_rate": 8.647103351477604e-06, "loss": 0.7411, "step": 645 }, { "epoch": 0.5118858954041204, "grad_norm": 1.550137284019924, "learning_rate": 8.642755373660653e-06, "loss": 0.6514, "step": 646 }, { "epoch": 0.5126782884310618, "grad_norm": 1.5757925697071793, "learning_rate": 8.638401517141483e-06, "loss": 0.6783, "step": 647 }, { "epoch": 0.5134706814580031, "grad_norm": 1.4906755417872302, "learning_rate": 8.634041788946378e-06, "loss": 0.6622, "step": 648 }, { "epoch": 0.5142630744849446, "grad_norm": 1.3321144086491399, "learning_rate": 8.629676196111096e-06, "loss": 0.7314, "step": 649 }, { "epoch": 0.5150554675118859, "grad_norm": 1.2865483961147721, "learning_rate": 8.625304745680859e-06, "loss": 0.605, "step": 650 }, { "epoch": 0.5158478605388273, "grad_norm": 1.3947317860225779, "learning_rate": 8.620927444710339e-06, "loss": 0.7017, "step": 651 }, { "epoch": 0.5166402535657686, "grad_norm": 1.3146364469777816, "learning_rate": 8.616544300263656e-06, "loss": 0.6984, "step": 652 }, { "epoch": 0.5174326465927099, "grad_norm": 1.612770223139016, "learning_rate": 8.612155319414355e-06, "loss": 0.6147, "step": 653 }, { "epoch": 0.5182250396196514, "grad_norm": 1.4965485588987877, "learning_rate": 8.607760509245401e-06, "loss": 0.6944, "step": 654 }, { "epoch": 0.5190174326465927, "grad_norm": 1.4204820555396265, "learning_rate": 8.60335987684917e-06, "loss": 0.6403, "step": 655 }, { "epoch": 0.5198098256735341, "grad_norm": 1.2450037066106867, "learning_rate": 8.59895342932743e-06, "loss": 0.6826, "step": 656 }, { "epoch": 0.5206022187004754, "grad_norm": 1.2222747300365382, "learning_rate": 8.594541173791333e-06, "loss": 0.7496, "step": 657 }, { "epoch": 0.5213946117274167, "grad_norm": 1.5168781778510856, "learning_rate": 8.590123117361408e-06, "loss": 0.6985, "step": 658 }, { "epoch": 0.5221870047543582, "grad_norm": 1.3319135561083275, "learning_rate": 8.585699267167543e-06, "loss": 0.6575, "step": 659 }, { "epoch": 0.5229793977812995, "grad_norm": 1.6441248610017072, "learning_rate": 8.581269630348972e-06, "loss": 0.5968, "step": 660 }, { "epoch": 0.5237717908082409, "grad_norm": 1.690228085273782, "learning_rate": 8.576834214054276e-06, "loss": 0.6945, "step": 661 }, { "epoch": 0.5245641838351822, "grad_norm": 2.7828728558338156, "learning_rate": 8.572393025441357e-06, "loss": 0.6931, "step": 662 }, { "epoch": 0.5253565768621236, "grad_norm": 1.1389332232551184, "learning_rate": 8.567946071677433e-06, "loss": 0.7161, "step": 663 }, { "epoch": 0.526148969889065, "grad_norm": 1.529715552316968, "learning_rate": 8.563493359939029e-06, "loss": 0.6686, "step": 664 }, { "epoch": 0.5269413629160064, "grad_norm": 3.986768219197556, "learning_rate": 8.559034897411958e-06, "loss": 0.6977, "step": 665 }, { "epoch": 0.5277337559429477, "grad_norm": 1.6642071521044752, "learning_rate": 8.554570691291317e-06, "loss": 0.6995, "step": 666 }, { "epoch": 0.528526148969889, "grad_norm": 1.3564072964239988, "learning_rate": 8.55010074878147e-06, "loss": 0.6893, "step": 667 }, { "epoch": 0.5293185419968305, "grad_norm": 1.4323101872518267, "learning_rate": 8.545625077096041e-06, "loss": 0.6468, "step": 668 }, { "epoch": 0.5301109350237718, "grad_norm": 1.223480743606037, "learning_rate": 8.541143683457893e-06, "loss": 0.6527, "step": 669 }, { "epoch": 0.5309033280507132, "grad_norm": 1.237724830722516, "learning_rate": 8.536656575099134e-06, "loss": 0.7058, "step": 670 }, { "epoch": 0.5316957210776545, "grad_norm": 1.227947483344111, "learning_rate": 8.532163759261086e-06, "loss": 0.7484, "step": 671 }, { "epoch": 0.5324881141045958, "grad_norm": 1.1683046150205916, "learning_rate": 8.527665243194283e-06, "loss": 0.6785, "step": 672 }, { "epoch": 0.5332805071315373, "grad_norm": 1.3585828192150804, "learning_rate": 8.523161034158462e-06, "loss": 0.7286, "step": 673 }, { "epoch": 0.5340729001584786, "grad_norm": 1.375286897323632, "learning_rate": 8.518651139422543e-06, "loss": 0.7248, "step": 674 }, { "epoch": 0.53486529318542, "grad_norm": 1.3108595316021145, "learning_rate": 8.514135566264623e-06, "loss": 0.6876, "step": 675 }, { "epoch": 0.5356576862123613, "grad_norm": 1.2466501732261217, "learning_rate": 8.509614321971964e-06, "loss": 0.6322, "step": 676 }, { "epoch": 0.5364500792393027, "grad_norm": 1.2808454881056355, "learning_rate": 8.50508741384098e-06, "loss": 0.6529, "step": 677 }, { "epoch": 0.5372424722662441, "grad_norm": 1.526086013065224, "learning_rate": 8.500554849177223e-06, "loss": 0.626, "step": 678 }, { "epoch": 0.5380348652931854, "grad_norm": 1.3914532794117254, "learning_rate": 8.496016635295378e-06, "loss": 0.6583, "step": 679 }, { "epoch": 0.5388272583201268, "grad_norm": 1.3642008516208026, "learning_rate": 8.491472779519241e-06, "loss": 0.6617, "step": 680 }, { "epoch": 0.5396196513470681, "grad_norm": 1.5420160585468299, "learning_rate": 8.486923289181717e-06, "loss": 0.7348, "step": 681 }, { "epoch": 0.5404120443740095, "grad_norm": 1.388424126400135, "learning_rate": 8.482368171624802e-06, "loss": 0.6734, "step": 682 }, { "epoch": 0.5412044374009509, "grad_norm": 1.2709170182432095, "learning_rate": 8.477807434199578e-06, "loss": 0.7067, "step": 683 }, { "epoch": 0.5419968304278923, "grad_norm": 1.393151052323184, "learning_rate": 8.473241084266188e-06, "loss": 0.7251, "step": 684 }, { "epoch": 0.5427892234548336, "grad_norm": 1.3393471749827985, "learning_rate": 8.468669129193838e-06, "loss": 0.648, "step": 685 }, { "epoch": 0.5435816164817749, "grad_norm": 1.413131779160652, "learning_rate": 8.46409157636078e-06, "loss": 0.6183, "step": 686 }, { "epoch": 0.5443740095087163, "grad_norm": 1.4673771256658268, "learning_rate": 8.4595084331543e-06, "loss": 0.7594, "step": 687 }, { "epoch": 0.5451664025356577, "grad_norm": 1.5849808782267403, "learning_rate": 8.4549197069707e-06, "loss": 0.6526, "step": 688 }, { "epoch": 0.5459587955625991, "grad_norm": 1.246602341834949, "learning_rate": 8.450325405215298e-06, "loss": 0.7252, "step": 689 }, { "epoch": 0.5467511885895404, "grad_norm": 1.3400565015573647, "learning_rate": 8.44572553530241e-06, "loss": 0.6547, "step": 690 }, { "epoch": 0.5475435816164818, "grad_norm": 2.3567058544749715, "learning_rate": 8.441120104655333e-06, "loss": 0.676, "step": 691 }, { "epoch": 0.5483359746434231, "grad_norm": 2.040027935320704, "learning_rate": 8.436509120706341e-06, "loss": 0.6504, "step": 692 }, { "epoch": 0.5491283676703645, "grad_norm": 1.8182379337131878, "learning_rate": 8.431892590896672e-06, "loss": 0.6318, "step": 693 }, { "epoch": 0.5499207606973059, "grad_norm": 1.4445325877494224, "learning_rate": 8.42727052267651e-06, "loss": 0.6318, "step": 694 }, { "epoch": 0.5507131537242472, "grad_norm": 1.183170091401246, "learning_rate": 8.422642923504979e-06, "loss": 0.6692, "step": 695 }, { "epoch": 0.5515055467511886, "grad_norm": 1.3246905210191053, "learning_rate": 8.418009800850129e-06, "loss": 0.7549, "step": 696 }, { "epoch": 0.5522979397781299, "grad_norm": 2.2858169017188406, "learning_rate": 8.41337116218892e-06, "loss": 0.6763, "step": 697 }, { "epoch": 0.5530903328050714, "grad_norm": 1.652823610111726, "learning_rate": 8.408727015007223e-06, "loss": 0.641, "step": 698 }, { "epoch": 0.5538827258320127, "grad_norm": 1.5278018905691637, "learning_rate": 8.404077366799788e-06, "loss": 0.6961, "step": 699 }, { "epoch": 0.554675118858954, "grad_norm": 1.3430322890788846, "learning_rate": 8.39942222507025e-06, "loss": 0.6806, "step": 700 }, { "epoch": 0.5554675118858954, "grad_norm": 1.291711205180611, "learning_rate": 8.394761597331107e-06, "loss": 0.6573, "step": 701 }, { "epoch": 0.5562599049128367, "grad_norm": 1.3862171282911002, "learning_rate": 8.390095491103707e-06, "loss": 0.7171, "step": 702 }, { "epoch": 0.5570522979397782, "grad_norm": 1.3238644344619104, "learning_rate": 8.385423913918247e-06, "loss": 0.6865, "step": 703 }, { "epoch": 0.5578446909667195, "grad_norm": 1.4962134936234208, "learning_rate": 8.380746873313745e-06, "loss": 0.6306, "step": 704 }, { "epoch": 0.5586370839936609, "grad_norm": 1.37743087906314, "learning_rate": 8.37606437683804e-06, "loss": 0.7064, "step": 705 }, { "epoch": 0.5594294770206022, "grad_norm": 1.203879776489021, "learning_rate": 8.371376432047778e-06, "loss": 0.6828, "step": 706 }, { "epoch": 0.5602218700475435, "grad_norm": 1.4645083711634344, "learning_rate": 8.36668304650839e-06, "loss": 0.6443, "step": 707 }, { "epoch": 0.561014263074485, "grad_norm": 1.4394246211298218, "learning_rate": 8.361984227794095e-06, "loss": 0.6651, "step": 708 }, { "epoch": 0.5618066561014263, "grad_norm": 1.4504290497048171, "learning_rate": 8.357279983487878e-06, "loss": 0.7236, "step": 709 }, { "epoch": 0.5625990491283677, "grad_norm": 1.3141176362840308, "learning_rate": 8.352570321181476e-06, "loss": 0.6347, "step": 710 }, { "epoch": 0.563391442155309, "grad_norm": 1.2559291709620648, "learning_rate": 8.347855248475374e-06, "loss": 0.6047, "step": 711 }, { "epoch": 0.5641838351822503, "grad_norm": 1.2050691129725004, "learning_rate": 8.343134772978787e-06, "loss": 0.6713, "step": 712 }, { "epoch": 0.5649762282091918, "grad_norm": 1.3098652641545019, "learning_rate": 8.338408902309648e-06, "loss": 0.6948, "step": 713 }, { "epoch": 0.5657686212361331, "grad_norm": 1.3228031016408528, "learning_rate": 8.3336776440946e-06, "loss": 0.6525, "step": 714 }, { "epoch": 0.5665610142630745, "grad_norm": 1.3148902718293376, "learning_rate": 8.328941005968976e-06, "loss": 0.7137, "step": 715 }, { "epoch": 0.5673534072900158, "grad_norm": 1.5053494902672881, "learning_rate": 8.324198995576794e-06, "loss": 0.6687, "step": 716 }, { "epoch": 0.5681458003169572, "grad_norm": 1.3223209408677732, "learning_rate": 8.319451620570742e-06, "loss": 0.6119, "step": 717 }, { "epoch": 0.5689381933438986, "grad_norm": 1.3786848476867655, "learning_rate": 8.314698888612161e-06, "loss": 0.6514, "step": 718 }, { "epoch": 0.56973058637084, "grad_norm": 3.51679528654123, "learning_rate": 8.309940807371045e-06, "loss": 0.7128, "step": 719 }, { "epoch": 0.5705229793977813, "grad_norm": 1.3092806338354823, "learning_rate": 8.305177384526014e-06, "loss": 0.6467, "step": 720 }, { "epoch": 0.5713153724247226, "grad_norm": 1.6158173791643704, "learning_rate": 8.300408627764311e-06, "loss": 0.6872, "step": 721 }, { "epoch": 0.572107765451664, "grad_norm": 1.4223215206262176, "learning_rate": 8.295634544781788e-06, "loss": 0.6324, "step": 722 }, { "epoch": 0.5729001584786054, "grad_norm": 1.3070364451378484, "learning_rate": 8.290855143282888e-06, "loss": 0.6392, "step": 723 }, { "epoch": 0.5736925515055468, "grad_norm": 1.2428573620906052, "learning_rate": 8.286070430980643e-06, "loss": 0.7239, "step": 724 }, { "epoch": 0.5744849445324881, "grad_norm": 1.5375805932766116, "learning_rate": 8.281280415596651e-06, "loss": 0.6501, "step": 725 }, { "epoch": 0.5752773375594294, "grad_norm": 1.3746779380036023, "learning_rate": 8.27648510486107e-06, "loss": 0.6911, "step": 726 }, { "epoch": 0.5760697305863708, "grad_norm": 1.4474518810584704, "learning_rate": 8.271684506512604e-06, "loss": 0.6486, "step": 727 }, { "epoch": 0.5768621236133122, "grad_norm": 1.4013949711943967, "learning_rate": 8.26687862829849e-06, "loss": 0.671, "step": 728 }, { "epoch": 0.5776545166402536, "grad_norm": 1.3917822975556162, "learning_rate": 8.262067477974485e-06, "loss": 0.6776, "step": 729 }, { "epoch": 0.5784469096671949, "grad_norm": 1.2019936299734435, "learning_rate": 8.257251063304855e-06, "loss": 0.6896, "step": 730 }, { "epoch": 0.5792393026941363, "grad_norm": 1.4655913948932833, "learning_rate": 8.25242939206236e-06, "loss": 0.6703, "step": 731 }, { "epoch": 0.5800316957210776, "grad_norm": 1.3091957287532456, "learning_rate": 8.247602472028245e-06, "loss": 0.6588, "step": 732 }, { "epoch": 0.580824088748019, "grad_norm": 1.2440906320063667, "learning_rate": 8.242770310992226e-06, "loss": 0.691, "step": 733 }, { "epoch": 0.5816164817749604, "grad_norm": 1.260486909821343, "learning_rate": 8.237932916752473e-06, "loss": 0.6829, "step": 734 }, { "epoch": 0.5824088748019017, "grad_norm": 1.0994796628301342, "learning_rate": 8.233090297115607e-06, "loss": 0.642, "step": 735 }, { "epoch": 0.5832012678288431, "grad_norm": 1.5982738548257653, "learning_rate": 8.228242459896678e-06, "loss": 0.63, "step": 736 }, { "epoch": 0.5839936608557845, "grad_norm": 1.4898710290078374, "learning_rate": 8.223389412919154e-06, "loss": 0.7115, "step": 737 }, { "epoch": 0.5847860538827259, "grad_norm": 1.3469895354380856, "learning_rate": 8.21853116401492e-06, "loss": 0.6669, "step": 738 }, { "epoch": 0.5855784469096672, "grad_norm": 1.2573027675943753, "learning_rate": 8.213667721024244e-06, "loss": 0.6968, "step": 739 }, { "epoch": 0.5863708399366085, "grad_norm": 1.258393845932821, "learning_rate": 8.208799091795785e-06, "loss": 0.6699, "step": 740 }, { "epoch": 0.5871632329635499, "grad_norm": 1.2539058257133828, "learning_rate": 8.203925284186567e-06, "loss": 0.7095, "step": 741 }, { "epoch": 0.5879556259904913, "grad_norm": 1.4563732692742317, "learning_rate": 8.199046306061972e-06, "loss": 0.6017, "step": 742 }, { "epoch": 0.5887480190174327, "grad_norm": 1.3398244014381386, "learning_rate": 8.194162165295726e-06, "loss": 0.7121, "step": 743 }, { "epoch": 0.589540412044374, "grad_norm": 1.288413717565178, "learning_rate": 8.189272869769891e-06, "loss": 0.6555, "step": 744 }, { "epoch": 0.5903328050713154, "grad_norm": 1.2996037894905768, "learning_rate": 8.184378427374838e-06, "loss": 0.7603, "step": 745 }, { "epoch": 0.5911251980982567, "grad_norm": 1.208235296298138, "learning_rate": 8.179478846009254e-06, "loss": 0.6993, "step": 746 }, { "epoch": 0.5919175911251982, "grad_norm": 2.7076002500119807, "learning_rate": 8.174574133580113e-06, "loss": 0.6604, "step": 747 }, { "epoch": 0.5927099841521395, "grad_norm": 2.3351213646049604, "learning_rate": 8.169664298002673e-06, "loss": 0.6375, "step": 748 }, { "epoch": 0.5935023771790808, "grad_norm": 1.3642344332146963, "learning_rate": 8.164749347200457e-06, "loss": 0.6616, "step": 749 }, { "epoch": 0.5942947702060222, "grad_norm": 1.395536550971703, "learning_rate": 8.159829289105247e-06, "loss": 0.7082, "step": 750 }, { "epoch": 0.5950871632329635, "grad_norm": 1.2916329512121079, "learning_rate": 8.154904131657062e-06, "loss": 0.7237, "step": 751 }, { "epoch": 0.595879556259905, "grad_norm": 1.1837837505414737, "learning_rate": 8.149973882804154e-06, "loss": 0.733, "step": 752 }, { "epoch": 0.5966719492868463, "grad_norm": 1.6564892210480011, "learning_rate": 8.14503855050299e-06, "loss": 0.604, "step": 753 }, { "epoch": 0.5974643423137876, "grad_norm": 1.4653885155324986, "learning_rate": 8.140098142718243e-06, "loss": 0.7329, "step": 754 }, { "epoch": 0.598256735340729, "grad_norm": 1.4750142295791326, "learning_rate": 8.135152667422774e-06, "loss": 0.653, "step": 755 }, { "epoch": 0.5990491283676703, "grad_norm": 1.2171808489370737, "learning_rate": 8.130202132597622e-06, "loss": 0.616, "step": 756 }, { "epoch": 0.5998415213946118, "grad_norm": 1.548864870721167, "learning_rate": 8.125246546231994e-06, "loss": 0.6835, "step": 757 }, { "epoch": 0.6006339144215531, "grad_norm": 1.4591829627759656, "learning_rate": 8.120285916323244e-06, "loss": 0.6786, "step": 758 }, { "epoch": 0.6014263074484945, "grad_norm": 1.3835203866429573, "learning_rate": 8.115320250876873e-06, "loss": 0.588, "step": 759 }, { "epoch": 0.6022187004754358, "grad_norm": 1.2908425423274767, "learning_rate": 8.110349557906502e-06, "loss": 0.6188, "step": 760 }, { "epoch": 0.6030110935023771, "grad_norm": 1.376757505417989, "learning_rate": 8.105373845433867e-06, "loss": 0.6644, "step": 761 }, { "epoch": 0.6038034865293186, "grad_norm": 1.5172021471455572, "learning_rate": 8.100393121488808e-06, "loss": 0.6994, "step": 762 }, { "epoch": 0.6045958795562599, "grad_norm": 1.3724378859438606, "learning_rate": 8.095407394109244e-06, "loss": 0.6117, "step": 763 }, { "epoch": 0.6053882725832013, "grad_norm": 2.2372976479813693, "learning_rate": 8.090416671341179e-06, "loss": 0.6709, "step": 764 }, { "epoch": 0.6061806656101426, "grad_norm": 1.505828487202079, "learning_rate": 8.085420961238673e-06, "loss": 0.771, "step": 765 }, { "epoch": 0.606973058637084, "grad_norm": 1.3234462515439165, "learning_rate": 8.080420271863833e-06, "loss": 0.6785, "step": 766 }, { "epoch": 0.6077654516640254, "grad_norm": 1.3992603306947766, "learning_rate": 8.075414611286807e-06, "loss": 0.5811, "step": 767 }, { "epoch": 0.6085578446909667, "grad_norm": 1.505354709191153, "learning_rate": 8.070403987585759e-06, "loss": 0.7838, "step": 768 }, { "epoch": 0.6093502377179081, "grad_norm": 1.6666590241557198, "learning_rate": 8.065388408846869e-06, "loss": 0.6665, "step": 769 }, { "epoch": 0.6101426307448494, "grad_norm": 1.4265306465639018, "learning_rate": 8.060367883164308e-06, "loss": 0.5108, "step": 770 }, { "epoch": 0.6109350237717908, "grad_norm": 1.2580532737597867, "learning_rate": 8.055342418640236e-06, "loss": 0.6933, "step": 771 }, { "epoch": 0.6117274167987322, "grad_norm": 1.4495293891763206, "learning_rate": 8.050312023384776e-06, "loss": 0.6181, "step": 772 }, { "epoch": 0.6125198098256736, "grad_norm": 2.183292434659245, "learning_rate": 8.045276705516017e-06, "loss": 0.6484, "step": 773 }, { "epoch": 0.6133122028526149, "grad_norm": 1.2650957825170306, "learning_rate": 8.040236473159985e-06, "loss": 0.6947, "step": 774 }, { "epoch": 0.6141045958795562, "grad_norm": 1.7075124856144768, "learning_rate": 8.03519133445064e-06, "loss": 0.6101, "step": 775 }, { "epoch": 0.6148969889064976, "grad_norm": 1.3138641325137077, "learning_rate": 8.030141297529859e-06, "loss": 0.7155, "step": 776 }, { "epoch": 0.615689381933439, "grad_norm": 1.3605564591276098, "learning_rate": 8.025086370547425e-06, "loss": 0.6865, "step": 777 }, { "epoch": 0.6164817749603804, "grad_norm": 1.1577175374737663, "learning_rate": 8.020026561661012e-06, "loss": 0.6726, "step": 778 }, { "epoch": 0.6172741679873217, "grad_norm": 1.4114205450224284, "learning_rate": 8.014961879036172e-06, "loss": 0.6786, "step": 779 }, { "epoch": 0.618066561014263, "grad_norm": 1.2107563931686143, "learning_rate": 8.00989233084632e-06, "loss": 0.6862, "step": 780 }, { "epoch": 0.6188589540412044, "grad_norm": 1.3948938661903778, "learning_rate": 8.004817925272732e-06, "loss": 0.6542, "step": 781 }, { "epoch": 0.6196513470681458, "grad_norm": 1.1920112467184394, "learning_rate": 7.999738670504513e-06, "loss": 0.7145, "step": 782 }, { "epoch": 0.6204437400950872, "grad_norm": 2.3349259345450193, "learning_rate": 7.994654574738596e-06, "loss": 0.6695, "step": 783 }, { "epoch": 0.6212361331220285, "grad_norm": 1.1789084073117855, "learning_rate": 7.98956564617973e-06, "loss": 0.6953, "step": 784 }, { "epoch": 0.6220285261489699, "grad_norm": 1.369284379697743, "learning_rate": 7.984471893040457e-06, "loss": 0.6979, "step": 785 }, { "epoch": 0.6228209191759112, "grad_norm": 1.3910623765012986, "learning_rate": 7.979373323541114e-06, "loss": 0.7005, "step": 786 }, { "epoch": 0.6236133122028527, "grad_norm": 1.6230070852568448, "learning_rate": 7.974269945909803e-06, "loss": 0.7176, "step": 787 }, { "epoch": 0.624405705229794, "grad_norm": 1.2007902855800783, "learning_rate": 7.969161768382387e-06, "loss": 0.7184, "step": 788 }, { "epoch": 0.6251980982567353, "grad_norm": 1.5412114340526017, "learning_rate": 7.964048799202477e-06, "loss": 0.7033, "step": 789 }, { "epoch": 0.6259904912836767, "grad_norm": 1.2959103451833187, "learning_rate": 7.958931046621418e-06, "loss": 0.6734, "step": 790 }, { "epoch": 0.626782884310618, "grad_norm": 1.544521690363767, "learning_rate": 7.953808518898267e-06, "loss": 0.7351, "step": 791 }, { "epoch": 0.6275752773375595, "grad_norm": 1.2878818481031222, "learning_rate": 7.948681224299797e-06, "loss": 0.6956, "step": 792 }, { "epoch": 0.6283676703645008, "grad_norm": 1.3537636049532986, "learning_rate": 7.943549171100466e-06, "loss": 0.7146, "step": 793 }, { "epoch": 0.6291600633914421, "grad_norm": 1.6751920682627506, "learning_rate": 7.938412367582417e-06, "loss": 0.6477, "step": 794 }, { "epoch": 0.6299524564183835, "grad_norm": 1.171017952063872, "learning_rate": 7.933270822035459e-06, "loss": 0.7144, "step": 795 }, { "epoch": 0.6307448494453248, "grad_norm": 1.5292155189828334, "learning_rate": 7.928124542757046e-06, "loss": 0.7422, "step": 796 }, { "epoch": 0.6315372424722663, "grad_norm": 1.2958203720148735, "learning_rate": 7.92297353805228e-06, "loss": 0.6841, "step": 797 }, { "epoch": 0.6323296354992076, "grad_norm": 1.4623247889152968, "learning_rate": 7.917817816233884e-06, "loss": 0.6578, "step": 798 }, { "epoch": 0.633122028526149, "grad_norm": 1.2929430200070458, "learning_rate": 7.912657385622199e-06, "loss": 0.6788, "step": 799 }, { "epoch": 0.6339144215530903, "grad_norm": 1.2810345494572182, "learning_rate": 7.907492254545157e-06, "loss": 0.6616, "step": 800 }, { "epoch": 0.6347068145800316, "grad_norm": 1.5832246025694552, "learning_rate": 7.902322431338283e-06, "loss": 0.603, "step": 801 }, { "epoch": 0.6354992076069731, "grad_norm": 1.4264074948676981, "learning_rate": 7.897147924344668e-06, "loss": 0.6738, "step": 802 }, { "epoch": 0.6362916006339144, "grad_norm": 1.7952214204088714, "learning_rate": 7.891968741914968e-06, "loss": 0.6128, "step": 803 }, { "epoch": 0.6370839936608558, "grad_norm": 1.3510227918957451, "learning_rate": 7.886784892407379e-06, "loss": 0.6245, "step": 804 }, { "epoch": 0.6378763866877971, "grad_norm": 1.331046931359784, "learning_rate": 7.881596384187636e-06, "loss": 0.6313, "step": 805 }, { "epoch": 0.6386687797147385, "grad_norm": 1.5262964926765485, "learning_rate": 7.876403225628979e-06, "loss": 0.6227, "step": 806 }, { "epoch": 0.6394611727416799, "grad_norm": 1.4066394645280642, "learning_rate": 7.871205425112167e-06, "loss": 0.6895, "step": 807 }, { "epoch": 0.6402535657686212, "grad_norm": 1.4284995459559187, "learning_rate": 7.86600299102544e-06, "loss": 0.6859, "step": 808 }, { "epoch": 0.6410459587955626, "grad_norm": 1.1962473017807782, "learning_rate": 7.860795931764525e-06, "loss": 0.6796, "step": 809 }, { "epoch": 0.6418383518225039, "grad_norm": 1.5205023555519803, "learning_rate": 7.855584255732604e-06, "loss": 0.6527, "step": 810 }, { "epoch": 0.6426307448494454, "grad_norm": 1.6043670492093012, "learning_rate": 7.850367971340314e-06, "loss": 0.674, "step": 811 }, { "epoch": 0.6434231378763867, "grad_norm": 1.362149903087982, "learning_rate": 7.84514708700573e-06, "loss": 0.7001, "step": 812 }, { "epoch": 0.6442155309033281, "grad_norm": 1.3946001600885358, "learning_rate": 7.839921611154348e-06, "loss": 0.6535, "step": 813 }, { "epoch": 0.6450079239302694, "grad_norm": 1.3135386027096716, "learning_rate": 7.834691552219073e-06, "loss": 0.7202, "step": 814 }, { "epoch": 0.6458003169572107, "grad_norm": 1.314683109843096, "learning_rate": 7.82945691864021e-06, "loss": 0.6876, "step": 815 }, { "epoch": 0.6465927099841522, "grad_norm": 2.0614991098092363, "learning_rate": 7.824217718865446e-06, "loss": 0.6404, "step": 816 }, { "epoch": 0.6473851030110935, "grad_norm": 1.4725219560795761, "learning_rate": 7.818973961349834e-06, "loss": 0.694, "step": 817 }, { "epoch": 0.6481774960380349, "grad_norm": 1.847032044677697, "learning_rate": 7.813725654555782e-06, "loss": 0.653, "step": 818 }, { "epoch": 0.6489698890649762, "grad_norm": 1.3273990990314963, "learning_rate": 7.808472806953044e-06, "loss": 0.6756, "step": 819 }, { "epoch": 0.6497622820919176, "grad_norm": 1.470338322145949, "learning_rate": 7.8032154270187e-06, "loss": 0.6679, "step": 820 }, { "epoch": 0.650554675118859, "grad_norm": 1.4458658780405718, "learning_rate": 7.797953523237141e-06, "loss": 0.6778, "step": 821 }, { "epoch": 0.6513470681458003, "grad_norm": 1.2497096315892402, "learning_rate": 7.792687104100065e-06, "loss": 0.707, "step": 822 }, { "epoch": 0.6521394611727417, "grad_norm": 1.3072340571894059, "learning_rate": 7.787416178106448e-06, "loss": 0.6663, "step": 823 }, { "epoch": 0.652931854199683, "grad_norm": 1.3508990912836227, "learning_rate": 7.78214075376255e-06, "loss": 0.6504, "step": 824 }, { "epoch": 0.6537242472266244, "grad_norm": 1.4895600737136852, "learning_rate": 7.776860839581882e-06, "loss": 0.63, "step": 825 }, { "epoch": 0.6545166402535658, "grad_norm": 1.384571977076582, "learning_rate": 7.771576444085205e-06, "loss": 0.7203, "step": 826 }, { "epoch": 0.6553090332805072, "grad_norm": 1.3896400618401445, "learning_rate": 7.766287575800509e-06, "loss": 0.6672, "step": 827 }, { "epoch": 0.6561014263074485, "grad_norm": 1.3687509595869312, "learning_rate": 7.760994243263007e-06, "loss": 0.6958, "step": 828 }, { "epoch": 0.6568938193343898, "grad_norm": 1.4585226231226325, "learning_rate": 7.75569645501511e-06, "loss": 0.712, "step": 829 }, { "epoch": 0.6576862123613312, "grad_norm": 1.533182053638041, "learning_rate": 7.750394219606424e-06, "loss": 0.6496, "step": 830 }, { "epoch": 0.6584786053882726, "grad_norm": 1.2841196388142333, "learning_rate": 7.745087545593733e-06, "loss": 0.582, "step": 831 }, { "epoch": 0.659270998415214, "grad_norm": 1.0788357181089827, "learning_rate": 7.739776441540978e-06, "loss": 0.5542, "step": 832 }, { "epoch": 0.6600633914421553, "grad_norm": 1.234529631689461, "learning_rate": 7.734460916019255e-06, "loss": 0.6785, "step": 833 }, { "epoch": 0.6608557844690967, "grad_norm": 1.2357756224440097, "learning_rate": 7.729140977606794e-06, "loss": 0.6548, "step": 834 }, { "epoch": 0.661648177496038, "grad_norm": 1.35558354866342, "learning_rate": 7.723816634888947e-06, "loss": 0.6862, "step": 835 }, { "epoch": 0.6624405705229794, "grad_norm": 1.820002030283377, "learning_rate": 7.718487896458174e-06, "loss": 0.6662, "step": 836 }, { "epoch": 0.6632329635499208, "grad_norm": 1.9752742696635979, "learning_rate": 7.71315477091402e-06, "loss": 0.565, "step": 837 }, { "epoch": 0.6640253565768621, "grad_norm": 1.3093607788624275, "learning_rate": 7.707817266863125e-06, "loss": 0.6478, "step": 838 }, { "epoch": 0.6648177496038035, "grad_norm": 1.2143605552039285, "learning_rate": 7.702475392919185e-06, "loss": 0.6998, "step": 839 }, { "epoch": 0.6656101426307448, "grad_norm": 1.4504013742424613, "learning_rate": 7.697129157702951e-06, "loss": 0.6623, "step": 840 }, { "epoch": 0.6664025356576863, "grad_norm": 5.08887927040743, "learning_rate": 7.691778569842214e-06, "loss": 0.6741, "step": 841 }, { "epoch": 0.6671949286846276, "grad_norm": 1.1053547765630272, "learning_rate": 7.686423637971783e-06, "loss": 0.6287, "step": 842 }, { "epoch": 0.6679873217115689, "grad_norm": 1.2678390064629703, "learning_rate": 7.681064370733485e-06, "loss": 0.6516, "step": 843 }, { "epoch": 0.6687797147385103, "grad_norm": 1.8107453106158091, "learning_rate": 7.67570077677614e-06, "loss": 0.6523, "step": 844 }, { "epoch": 0.6695721077654516, "grad_norm": 1.357431403860213, "learning_rate": 7.670332864755549e-06, "loss": 0.6708, "step": 845 }, { "epoch": 0.6703645007923931, "grad_norm": 1.3770179953887687, "learning_rate": 7.664960643334484e-06, "loss": 0.6538, "step": 846 }, { "epoch": 0.6711568938193344, "grad_norm": 7.196104183626982, "learning_rate": 7.659584121182673e-06, "loss": 0.6229, "step": 847 }, { "epoch": 0.6719492868462758, "grad_norm": 1.7756776504059015, "learning_rate": 7.654203306976777e-06, "loss": 0.617, "step": 848 }, { "epoch": 0.6727416798732171, "grad_norm": 1.4289862007319956, "learning_rate": 7.648818209400395e-06, "loss": 0.6455, "step": 849 }, { "epoch": 0.6735340729001584, "grad_norm": 1.4514266893848429, "learning_rate": 7.64342883714403e-06, "loss": 0.6384, "step": 850 }, { "epoch": 0.6743264659270999, "grad_norm": 1.3207832155689563, "learning_rate": 7.638035198905082e-06, "loss": 0.6931, "step": 851 }, { "epoch": 0.6751188589540412, "grad_norm": 1.6758188709883897, "learning_rate": 7.632637303387844e-06, "loss": 0.6665, "step": 852 }, { "epoch": 0.6759112519809826, "grad_norm": 1.2761533180503055, "learning_rate": 7.627235159303475e-06, "loss": 0.6445, "step": 853 }, { "epoch": 0.6767036450079239, "grad_norm": 1.2069087496452289, "learning_rate": 7.621828775369986e-06, "loss": 0.6435, "step": 854 }, { "epoch": 0.6774960380348652, "grad_norm": 1.08122558178375, "learning_rate": 7.616418160312239e-06, "loss": 0.5956, "step": 855 }, { "epoch": 0.6782884310618067, "grad_norm": 3.0243604582724126, "learning_rate": 7.611003322861917e-06, "loss": 0.6876, "step": 856 }, { "epoch": 0.679080824088748, "grad_norm": 1.2944128100075794, "learning_rate": 7.605584271757519e-06, "loss": 0.716, "step": 857 }, { "epoch": 0.6798732171156894, "grad_norm": 1.4083097925294594, "learning_rate": 7.6001610157443465e-06, "loss": 0.6391, "step": 858 }, { "epoch": 0.6806656101426307, "grad_norm": 1.493142024063185, "learning_rate": 7.594733563574484e-06, "loss": 0.6639, "step": 859 }, { "epoch": 0.6814580031695721, "grad_norm": 1.5019196743805066, "learning_rate": 7.58930192400679e-06, "loss": 0.6465, "step": 860 }, { "epoch": 0.6822503961965135, "grad_norm": 1.434141984532341, "learning_rate": 7.583866105806881e-06, "loss": 0.6135, "step": 861 }, { "epoch": 0.6830427892234548, "grad_norm": 1.62445775258557, "learning_rate": 7.578426117747112e-06, "loss": 0.6851, "step": 862 }, { "epoch": 0.6838351822503962, "grad_norm": 1.545479927451177, "learning_rate": 7.572981968606574e-06, "loss": 0.624, "step": 863 }, { "epoch": 0.6846275752773375, "grad_norm": 1.3818360210943337, "learning_rate": 7.567533667171068e-06, "loss": 0.6355, "step": 864 }, { "epoch": 0.6854199683042789, "grad_norm": 1.282549948392138, "learning_rate": 7.5620812222331e-06, "loss": 0.6981, "step": 865 }, { "epoch": 0.6862123613312203, "grad_norm": 2.0389300122366247, "learning_rate": 7.55662464259186e-06, "loss": 0.6465, "step": 866 }, { "epoch": 0.6870047543581617, "grad_norm": 2.0727580343892384, "learning_rate": 7.551163937053212e-06, "loss": 0.684, "step": 867 }, { "epoch": 0.687797147385103, "grad_norm": 1.4213467938400905, "learning_rate": 7.5456991144296765e-06, "loss": 0.6251, "step": 868 }, { "epoch": 0.6885895404120443, "grad_norm": 1.3185236238146145, "learning_rate": 7.540230183540422e-06, "loss": 0.6172, "step": 869 }, { "epoch": 0.6893819334389857, "grad_norm": 1.4180801338857494, "learning_rate": 7.5347571532112426e-06, "loss": 0.6609, "step": 870 }, { "epoch": 0.6901743264659271, "grad_norm": 1.3192529970736229, "learning_rate": 7.529280032274551e-06, "loss": 0.6855, "step": 871 }, { "epoch": 0.6909667194928685, "grad_norm": 1.2690577748865384, "learning_rate": 7.52379882956936e-06, "loss": 0.6662, "step": 872 }, { "epoch": 0.6917591125198098, "grad_norm": 1.508033814834977, "learning_rate": 7.518313553941272e-06, "loss": 0.6911, "step": 873 }, { "epoch": 0.6925515055467512, "grad_norm": 1.4296261537776842, "learning_rate": 7.5128242142424575e-06, "loss": 0.6193, "step": 874 }, { "epoch": 0.6933438985736925, "grad_norm": 1.611230701568426, "learning_rate": 7.507330819331648e-06, "loss": 0.7036, "step": 875 }, { "epoch": 0.694136291600634, "grad_norm": 2.1875613556116305, "learning_rate": 7.5018333780741235e-06, "loss": 0.7107, "step": 876 }, { "epoch": 0.6949286846275753, "grad_norm": 1.403410801889864, "learning_rate": 7.496331899341689e-06, "loss": 0.6761, "step": 877 }, { "epoch": 0.6957210776545166, "grad_norm": 1.3000814473467939, "learning_rate": 7.490826392012664e-06, "loss": 0.7192, "step": 878 }, { "epoch": 0.696513470681458, "grad_norm": 1.4166406956451743, "learning_rate": 7.485316864971874e-06, "loss": 0.6437, "step": 879 }, { "epoch": 0.6973058637083994, "grad_norm": 1.4423381709849814, "learning_rate": 7.479803327110631e-06, "loss": 0.6278, "step": 880 }, { "epoch": 0.6980982567353408, "grad_norm": 2.044977057838423, "learning_rate": 7.474285787326714e-06, "loss": 0.6591, "step": 881 }, { "epoch": 0.6988906497622821, "grad_norm": 1.5227552045250412, "learning_rate": 7.468764254524368e-06, "loss": 0.6204, "step": 882 }, { "epoch": 0.6996830427892234, "grad_norm": 1.368258419153674, "learning_rate": 7.463238737614275e-06, "loss": 0.7444, "step": 883 }, { "epoch": 0.7004754358161648, "grad_norm": 1.6796011619336166, "learning_rate": 7.457709245513554e-06, "loss": 0.6521, "step": 884 }, { "epoch": 0.7012678288431062, "grad_norm": 1.4780726780551317, "learning_rate": 7.452175787145732e-06, "loss": 0.6212, "step": 885 }, { "epoch": 0.7020602218700476, "grad_norm": 1.5097680530967952, "learning_rate": 7.446638371440743e-06, "loss": 0.6745, "step": 886 }, { "epoch": 0.7028526148969889, "grad_norm": 1.4493026871983388, "learning_rate": 7.441097007334901e-06, "loss": 0.6939, "step": 887 }, { "epoch": 0.7036450079239303, "grad_norm": 1.2429854456666065, "learning_rate": 7.435551703770898e-06, "loss": 0.6364, "step": 888 }, { "epoch": 0.7044374009508716, "grad_norm": 1.310433704637822, "learning_rate": 7.430002469697777e-06, "loss": 0.6892, "step": 889 }, { "epoch": 0.705229793977813, "grad_norm": 1.7485031604567476, "learning_rate": 7.424449314070933e-06, "loss": 0.6705, "step": 890 }, { "epoch": 0.7060221870047544, "grad_norm": 1.423036851384073, "learning_rate": 7.41889224585208e-06, "loss": 0.64, "step": 891 }, { "epoch": 0.7068145800316957, "grad_norm": 1.4223400488588753, "learning_rate": 7.413331274009254e-06, "loss": 0.6804, "step": 892 }, { "epoch": 0.7076069730586371, "grad_norm": 1.304363424482755, "learning_rate": 7.407766407516787e-06, "loss": 0.689, "step": 893 }, { "epoch": 0.7083993660855784, "grad_norm": 1.4706971317018296, "learning_rate": 7.402197655355294e-06, "loss": 0.684, "step": 894 }, { "epoch": 0.7091917591125199, "grad_norm": 3.272151191189889, "learning_rate": 7.396625026511666e-06, "loss": 0.6935, "step": 895 }, { "epoch": 0.7099841521394612, "grad_norm": 1.5107108593364131, "learning_rate": 7.391048529979046e-06, "loss": 0.6368, "step": 896 }, { "epoch": 0.7107765451664025, "grad_norm": 7.215541591854234, "learning_rate": 7.38546817475682e-06, "loss": 0.6098, "step": 897 }, { "epoch": 0.7115689381933439, "grad_norm": 1.34365315987036, "learning_rate": 7.379883969850603e-06, "loss": 0.6525, "step": 898 }, { "epoch": 0.7123613312202852, "grad_norm": 1.3984345199687163, "learning_rate": 7.37429592427222e-06, "loss": 0.7046, "step": 899 }, { "epoch": 0.7131537242472267, "grad_norm": 1.4322584077974336, "learning_rate": 7.368704047039696e-06, "loss": 0.6695, "step": 900 }, { "epoch": 0.713946117274168, "grad_norm": 1.284797293199308, "learning_rate": 7.363108347177237e-06, "loss": 0.6737, "step": 901 }, { "epoch": 0.7147385103011094, "grad_norm": 1.9092667441652318, "learning_rate": 7.3575088337152215e-06, "loss": 0.6724, "step": 902 }, { "epoch": 0.7155309033280507, "grad_norm": 2.1035728511361276, "learning_rate": 7.351905515690179e-06, "loss": 0.6285, "step": 903 }, { "epoch": 0.716323296354992, "grad_norm": 1.3930376984848858, "learning_rate": 7.34629840214478e-06, "loss": 0.65, "step": 904 }, { "epoch": 0.7171156893819335, "grad_norm": 1.6358208837802153, "learning_rate": 7.3406875021278245e-06, "loss": 0.7157, "step": 905 }, { "epoch": 0.7179080824088748, "grad_norm": 1.211547906676419, "learning_rate": 7.335072824694215e-06, "loss": 0.6558, "step": 906 }, { "epoch": 0.7187004754358162, "grad_norm": 1.3061748734367833, "learning_rate": 7.329454378904957e-06, "loss": 0.6652, "step": 907 }, { "epoch": 0.7194928684627575, "grad_norm": 1.348148016958291, "learning_rate": 7.3238321738271325e-06, "loss": 0.6333, "step": 908 }, { "epoch": 0.7202852614896988, "grad_norm": 1.6143499069488736, "learning_rate": 7.318206218533894e-06, "loss": 0.691, "step": 909 }, { "epoch": 0.7210776545166403, "grad_norm": 1.3386211968259114, "learning_rate": 7.312576522104445e-06, "loss": 0.6501, "step": 910 }, { "epoch": 0.7218700475435816, "grad_norm": 1.31497806389395, "learning_rate": 7.306943093624025e-06, "loss": 0.6829, "step": 911 }, { "epoch": 0.722662440570523, "grad_norm": 1.7542152915682834, "learning_rate": 7.301305942183898e-06, "loss": 0.5923, "step": 912 }, { "epoch": 0.7234548335974643, "grad_norm": 1.5621521001987904, "learning_rate": 7.2956650768813355e-06, "loss": 0.6771, "step": 913 }, { "epoch": 0.7242472266244057, "grad_norm": 1.1888995052093274, "learning_rate": 7.2900205068196e-06, "loss": 0.6985, "step": 914 }, { "epoch": 0.7250396196513471, "grad_norm": 1.5088988823306964, "learning_rate": 7.28437224110794e-06, "loss": 0.6204, "step": 915 }, { "epoch": 0.7258320126782885, "grad_norm": 2.456587766471318, "learning_rate": 7.27872028886156e-06, "loss": 0.5718, "step": 916 }, { "epoch": 0.7266244057052298, "grad_norm": 1.3611532290935084, "learning_rate": 7.273064659201616e-06, "loss": 0.7048, "step": 917 }, { "epoch": 0.7274167987321711, "grad_norm": 1.4318124899432505, "learning_rate": 7.267405361255203e-06, "loss": 0.6564, "step": 918 }, { "epoch": 0.7282091917591125, "grad_norm": 1.4782501103578802, "learning_rate": 7.261742404155332e-06, "loss": 0.5987, "step": 919 }, { "epoch": 0.7290015847860539, "grad_norm": 1.4500488500871194, "learning_rate": 7.256075797040918e-06, "loss": 0.6773, "step": 920 }, { "epoch": 0.7297939778129953, "grad_norm": 1.3779256209118855, "learning_rate": 7.25040554905677e-06, "loss": 0.6623, "step": 921 }, { "epoch": 0.7305863708399366, "grad_norm": 1.481650170347223, "learning_rate": 7.244731669353568e-06, "loss": 0.6096, "step": 922 }, { "epoch": 0.731378763866878, "grad_norm": 1.2350984201730721, "learning_rate": 7.239054167087861e-06, "loss": 0.7073, "step": 923 }, { "epoch": 0.7321711568938193, "grad_norm": 1.1067489273016315, "learning_rate": 7.233373051422036e-06, "loss": 0.7172, "step": 924 }, { "epoch": 0.7329635499207607, "grad_norm": 1.5527815716616118, "learning_rate": 7.227688331524313e-06, "loss": 0.6477, "step": 925 }, { "epoch": 0.7337559429477021, "grad_norm": 1.4091655531083498, "learning_rate": 7.222000016568732e-06, "loss": 0.6657, "step": 926 }, { "epoch": 0.7345483359746434, "grad_norm": 1.305699290392255, "learning_rate": 7.216308115735131e-06, "loss": 0.6471, "step": 927 }, { "epoch": 0.7353407290015848, "grad_norm": 1.4629019596956303, "learning_rate": 7.210612638209137e-06, "loss": 0.6089, "step": 928 }, { "epoch": 0.7361331220285261, "grad_norm": 2.238099608178402, "learning_rate": 7.204913593182149e-06, "loss": 0.6432, "step": 929 }, { "epoch": 0.7369255150554676, "grad_norm": 3.4355223699149118, "learning_rate": 7.199210989851322e-06, "loss": 0.6414, "step": 930 }, { "epoch": 0.7377179080824089, "grad_norm": 1.335425193080077, "learning_rate": 7.193504837419555e-06, "loss": 0.7258, "step": 931 }, { "epoch": 0.7385103011093502, "grad_norm": 1.3488970028009506, "learning_rate": 7.187795145095473e-06, "loss": 0.6272, "step": 932 }, { "epoch": 0.7393026941362916, "grad_norm": 1.3433344309177007, "learning_rate": 7.1820819220934155e-06, "loss": 0.6129, "step": 933 }, { "epoch": 0.7400950871632329, "grad_norm": 1.712421988063712, "learning_rate": 7.176365177633418e-06, "loss": 0.65, "step": 934 }, { "epoch": 0.7408874801901744, "grad_norm": 1.3806056965293367, "learning_rate": 7.170644920941199e-06, "loss": 0.6129, "step": 935 }, { "epoch": 0.7416798732171157, "grad_norm": 1.6721675287733748, "learning_rate": 7.1649211612481466e-06, "loss": 0.6885, "step": 936 }, { "epoch": 0.742472266244057, "grad_norm": 1.5425572009868256, "learning_rate": 7.159193907791302e-06, "loss": 0.6802, "step": 937 }, { "epoch": 0.7432646592709984, "grad_norm": 1.5671208222767476, "learning_rate": 7.153463169813343e-06, "loss": 0.6064, "step": 938 }, { "epoch": 0.7440570522979397, "grad_norm": 12.520264305529436, "learning_rate": 7.147728956562572e-06, "loss": 0.6256, "step": 939 }, { "epoch": 0.7448494453248812, "grad_norm": 1.7519475764918995, "learning_rate": 7.141991277292898e-06, "loss": 0.7068, "step": 940 }, { "epoch": 0.7456418383518225, "grad_norm": 1.2696934082006766, "learning_rate": 7.136250141263827e-06, "loss": 0.6918, "step": 941 }, { "epoch": 0.7464342313787639, "grad_norm": 1.7145214528743835, "learning_rate": 7.130505557740442e-06, "loss": 0.6216, "step": 942 }, { "epoch": 0.7472266244057052, "grad_norm": 1.239513462484978, "learning_rate": 7.1247575359933864e-06, "loss": 0.621, "step": 943 }, { "epoch": 0.7480190174326465, "grad_norm": 1.7590501798064992, "learning_rate": 7.119006085298858e-06, "loss": 0.5951, "step": 944 }, { "epoch": 0.748811410459588, "grad_norm": 1.6883786023520775, "learning_rate": 7.113251214938582e-06, "loss": 0.6338, "step": 945 }, { "epoch": 0.7496038034865293, "grad_norm": 1.3396003410391746, "learning_rate": 7.107492934199809e-06, "loss": 0.6412, "step": 946 }, { "epoch": 0.7503961965134707, "grad_norm": 1.5790806464221883, "learning_rate": 7.101731252375287e-06, "loss": 0.627, "step": 947 }, { "epoch": 0.751188589540412, "grad_norm": 1.4476245698159267, "learning_rate": 7.095966178763256e-06, "loss": 0.6453, "step": 948 }, { "epoch": 0.7519809825673535, "grad_norm": 1.2487891872278847, "learning_rate": 7.090197722667429e-06, "loss": 0.7124, "step": 949 }, { "epoch": 0.7527733755942948, "grad_norm": 2.081788033006603, "learning_rate": 7.084425893396978e-06, "loss": 0.6656, "step": 950 }, { "epoch": 0.7535657686212361, "grad_norm": 1.5163923525879368, "learning_rate": 7.078650700266521e-06, "loss": 0.648, "step": 951 }, { "epoch": 0.7543581616481775, "grad_norm": 1.4215587784658716, "learning_rate": 7.072872152596096e-06, "loss": 0.667, "step": 952 }, { "epoch": 0.7551505546751188, "grad_norm": 1.4164022086523016, "learning_rate": 7.067090259711168e-06, "loss": 0.6616, "step": 953 }, { "epoch": 0.7559429477020603, "grad_norm": 1.561129031780153, "learning_rate": 7.061305030942588e-06, "loss": 0.6658, "step": 954 }, { "epoch": 0.7567353407290016, "grad_norm": 1.3902313290810504, "learning_rate": 7.055516475626598e-06, "loss": 0.6325, "step": 955 }, { "epoch": 0.757527733755943, "grad_norm": 1.6757927421288437, "learning_rate": 7.049724603104806e-06, "loss": 0.6885, "step": 956 }, { "epoch": 0.7583201267828843, "grad_norm": 1.3367271208434655, "learning_rate": 7.0439294227241715e-06, "loss": 0.6586, "step": 957 }, { "epoch": 0.7591125198098256, "grad_norm": 1.4117064276428966, "learning_rate": 7.038130943836996e-06, "loss": 0.6297, "step": 958 }, { "epoch": 0.7599049128367671, "grad_norm": 1.8405235622869613, "learning_rate": 7.032329175800902e-06, "loss": 0.6455, "step": 959 }, { "epoch": 0.7606973058637084, "grad_norm": 1.4135461330824348, "learning_rate": 7.02652412797882e-06, "loss": 0.6288, "step": 960 }, { "epoch": 0.7614896988906498, "grad_norm": 1.2278905712800376, "learning_rate": 7.020715809738972e-06, "loss": 0.7148, "step": 961 }, { "epoch": 0.7622820919175911, "grad_norm": 1.4605976875300104, "learning_rate": 7.014904230454864e-06, "loss": 0.6592, "step": 962 }, { "epoch": 0.7630744849445324, "grad_norm": 1.5332119286173704, "learning_rate": 7.009089399505256e-06, "loss": 0.6154, "step": 963 }, { "epoch": 0.7638668779714739, "grad_norm": 1.255080182406179, "learning_rate": 7.003271326274162e-06, "loss": 0.665, "step": 964 }, { "epoch": 0.7646592709984152, "grad_norm": 1.3642472914692982, "learning_rate": 6.997450020150825e-06, "loss": 0.6517, "step": 965 }, { "epoch": 0.7654516640253566, "grad_norm": 1.4142247042987692, "learning_rate": 6.991625490529709e-06, "loss": 0.681, "step": 966 }, { "epoch": 0.7662440570522979, "grad_norm": 1.2356999482176252, "learning_rate": 6.985797746810474e-06, "loss": 0.6403, "step": 967 }, { "epoch": 0.7670364500792393, "grad_norm": 1.3752800246948174, "learning_rate": 6.979966798397974e-06, "loss": 0.6673, "step": 968 }, { "epoch": 0.7678288431061807, "grad_norm": 1.4000938952984219, "learning_rate": 6.97413265470223e-06, "loss": 0.6659, "step": 969 }, { "epoch": 0.768621236133122, "grad_norm": 1.3347047392335551, "learning_rate": 6.96829532513842e-06, "loss": 0.6873, "step": 970 }, { "epoch": 0.7694136291600634, "grad_norm": 1.3091834593597622, "learning_rate": 6.962454819126865e-06, "loss": 0.6439, "step": 971 }, { "epoch": 0.7702060221870047, "grad_norm": 1.1794915493686082, "learning_rate": 6.956611146093012e-06, "loss": 0.6874, "step": 972 }, { "epoch": 0.7709984152139461, "grad_norm": 1.457953779937259, "learning_rate": 6.950764315467417e-06, "loss": 0.6665, "step": 973 }, { "epoch": 0.7717908082408875, "grad_norm": 1.236938858749694, "learning_rate": 6.944914336685734e-06, "loss": 0.6514, "step": 974 }, { "epoch": 0.7725832012678289, "grad_norm": 1.2746578897299825, "learning_rate": 6.939061219188697e-06, "loss": 0.6777, "step": 975 }, { "epoch": 0.7733755942947702, "grad_norm": 1.9046995234231832, "learning_rate": 6.933204972422105e-06, "loss": 0.6632, "step": 976 }, { "epoch": 0.7741679873217115, "grad_norm": 1.3069730020907897, "learning_rate": 6.927345605836806e-06, "loss": 0.6439, "step": 977 }, { "epoch": 0.7749603803486529, "grad_norm": 1.2114484878634222, "learning_rate": 6.921483128888685e-06, "loss": 0.7367, "step": 978 }, { "epoch": 0.7757527733755943, "grad_norm": 1.374887344342218, "learning_rate": 6.915617551038644e-06, "loss": 0.658, "step": 979 }, { "epoch": 0.7765451664025357, "grad_norm": 1.1767063180836204, "learning_rate": 6.909748881752591e-06, "loss": 0.5712, "step": 980 }, { "epoch": 0.777337559429477, "grad_norm": 1.377088405540473, "learning_rate": 6.903877130501422e-06, "loss": 0.6283, "step": 981 }, { "epoch": 0.7781299524564184, "grad_norm": 1.3132299735151778, "learning_rate": 6.898002306761008e-06, "loss": 0.6681, "step": 982 }, { "epoch": 0.7789223454833597, "grad_norm": 1.6827992225618187, "learning_rate": 6.892124420012178e-06, "loss": 0.6603, "step": 983 }, { "epoch": 0.7797147385103012, "grad_norm": 1.7658289595178722, "learning_rate": 6.886243479740703e-06, "loss": 0.6832, "step": 984 }, { "epoch": 0.7805071315372425, "grad_norm": 1.3861616872170253, "learning_rate": 6.880359495437282e-06, "loss": 0.6499, "step": 985 }, { "epoch": 0.7812995245641838, "grad_norm": 1.2348216771938405, "learning_rate": 6.874472476597527e-06, "loss": 0.6641, "step": 986 }, { "epoch": 0.7820919175911252, "grad_norm": 1.4613145166499855, "learning_rate": 6.868582432721949e-06, "loss": 0.7251, "step": 987 }, { "epoch": 0.7828843106180665, "grad_norm": 1.4177841932960544, "learning_rate": 6.862689373315937e-06, "loss": 0.6817, "step": 988 }, { "epoch": 0.783676703645008, "grad_norm": 1.3496539871912536, "learning_rate": 6.856793307889749e-06, "loss": 0.6415, "step": 989 }, { "epoch": 0.7844690966719493, "grad_norm": 1.4345030665801672, "learning_rate": 6.850894245958495e-06, "loss": 0.6676, "step": 990 }, { "epoch": 0.7852614896988906, "grad_norm": 1.4058729216696517, "learning_rate": 6.844992197042117e-06, "loss": 0.7084, "step": 991 }, { "epoch": 0.786053882725832, "grad_norm": 1.3078063251501815, "learning_rate": 6.839087170665381e-06, "loss": 0.7392, "step": 992 }, { "epoch": 0.7868462757527733, "grad_norm": 1.4456940084310796, "learning_rate": 6.833179176357856e-06, "loss": 0.6519, "step": 993 }, { "epoch": 0.7876386687797148, "grad_norm": 1.435584497212473, "learning_rate": 6.827268223653902e-06, "loss": 0.6143, "step": 994 }, { "epoch": 0.7884310618066561, "grad_norm": 1.1948836545594175, "learning_rate": 6.821354322092654e-06, "loss": 0.6406, "step": 995 }, { "epoch": 0.7892234548335975, "grad_norm": 1.4063917398744499, "learning_rate": 6.815437481218002e-06, "loss": 0.6035, "step": 996 }, { "epoch": 0.7900158478605388, "grad_norm": 1.4559840901295753, "learning_rate": 6.809517710578585e-06, "loss": 0.6502, "step": 997 }, { "epoch": 0.7908082408874801, "grad_norm": 1.4408270470203033, "learning_rate": 6.803595019727764e-06, "loss": 0.6553, "step": 998 }, { "epoch": 0.7916006339144216, "grad_norm": 1.3903259512174244, "learning_rate": 6.797669418223619e-06, "loss": 0.6284, "step": 999 }, { "epoch": 0.7923930269413629, "grad_norm": 1.5369500398765217, "learning_rate": 6.791740915628923e-06, "loss": 0.6948, "step": 1000 }, { "epoch": 0.7931854199683043, "grad_norm": 2.4312161907024663, "learning_rate": 6.785809521511131e-06, "loss": 0.6096, "step": 1001 }, { "epoch": 0.7939778129952456, "grad_norm": 1.409339241016328, "learning_rate": 6.779875245442366e-06, "loss": 0.6679, "step": 1002 }, { "epoch": 0.794770206022187, "grad_norm": 1.3786500251724283, "learning_rate": 6.773938096999402e-06, "loss": 0.6011, "step": 1003 }, { "epoch": 0.7955625990491284, "grad_norm": 1.3252794931066783, "learning_rate": 6.767998085763646e-06, "loss": 0.7663, "step": 1004 }, { "epoch": 0.7963549920760697, "grad_norm": 1.9453988843013865, "learning_rate": 6.762055221321128e-06, "loss": 0.6499, "step": 1005 }, { "epoch": 0.7971473851030111, "grad_norm": 1.5112890529315564, "learning_rate": 6.75610951326248e-06, "loss": 0.6693, "step": 1006 }, { "epoch": 0.7979397781299524, "grad_norm": 1.6653045787438063, "learning_rate": 6.750160971182924e-06, "loss": 0.697, "step": 1007 }, { "epoch": 0.7987321711568938, "grad_norm": 1.244161846329012, "learning_rate": 6.744209604682256e-06, "loss": 0.5582, "step": 1008 }, { "epoch": 0.7995245641838352, "grad_norm": 1.6559463983329505, "learning_rate": 6.73825542336483e-06, "loss": 0.6614, "step": 1009 }, { "epoch": 0.8003169572107766, "grad_norm": 1.5216381023050725, "learning_rate": 6.732298436839544e-06, "loss": 0.6382, "step": 1010 }, { "epoch": 0.8011093502377179, "grad_norm": 1.1837909032508178, "learning_rate": 6.72633865471982e-06, "loss": 0.6708, "step": 1011 }, { "epoch": 0.8019017432646592, "grad_norm": 1.4691262250627966, "learning_rate": 6.720376086623592e-06, "loss": 0.7063, "step": 1012 }, { "epoch": 0.8026941362916006, "grad_norm": 1.323008389796614, "learning_rate": 6.714410742173292e-06, "loss": 0.6451, "step": 1013 }, { "epoch": 0.803486529318542, "grad_norm": 3.917806298394738, "learning_rate": 6.70844263099583e-06, "loss": 0.6704, "step": 1014 }, { "epoch": 0.8042789223454834, "grad_norm": 2.2066053597930972, "learning_rate": 6.7024717627225865e-06, "loss": 0.6385, "step": 1015 }, { "epoch": 0.8050713153724247, "grad_norm": 1.3313903988326015, "learning_rate": 6.6964981469893845e-06, "loss": 0.6897, "step": 1016 }, { "epoch": 0.805863708399366, "grad_norm": 1.3187407965845024, "learning_rate": 6.690521793436484e-06, "loss": 0.6682, "step": 1017 }, { "epoch": 0.8066561014263075, "grad_norm": 1.5089878322689672, "learning_rate": 6.684542711708562e-06, "loss": 0.6209, "step": 1018 }, { "epoch": 0.8074484944532488, "grad_norm": 1.1653302729593944, "learning_rate": 6.678560911454703e-06, "loss": 0.6528, "step": 1019 }, { "epoch": 0.8082408874801902, "grad_norm": 1.4464022064005093, "learning_rate": 6.672576402328369e-06, "loss": 0.6792, "step": 1020 }, { "epoch": 0.8090332805071315, "grad_norm": 1.3664975480305996, "learning_rate": 6.666589193987405e-06, "loss": 0.6628, "step": 1021 }, { "epoch": 0.8098256735340729, "grad_norm": 1.293070781922764, "learning_rate": 6.660599296094005e-06, "loss": 0.6154, "step": 1022 }, { "epoch": 0.8106180665610143, "grad_norm": 1.5677651779525958, "learning_rate": 6.654606718314705e-06, "loss": 0.652, "step": 1023 }, { "epoch": 0.8114104595879557, "grad_norm": 1.1571898284846456, "learning_rate": 6.6486114703203644e-06, "loss": 0.6585, "step": 1024 }, { "epoch": 0.812202852614897, "grad_norm": 3.630357081810383, "learning_rate": 6.642613561786155e-06, "loss": 0.6777, "step": 1025 }, { "epoch": 0.8129952456418383, "grad_norm": 2.233574770502024, "learning_rate": 6.63661300239154e-06, "loss": 0.6595, "step": 1026 }, { "epoch": 0.8137876386687797, "grad_norm": 1.4814917636722322, "learning_rate": 6.630609801820259e-06, "loss": 0.6681, "step": 1027 }, { "epoch": 0.8145800316957211, "grad_norm": 6.36702568101935, "learning_rate": 6.624603969760319e-06, "loss": 0.6942, "step": 1028 }, { "epoch": 0.8153724247226625, "grad_norm": 1.2546828850521095, "learning_rate": 6.618595515903972e-06, "loss": 0.6383, "step": 1029 }, { "epoch": 0.8161648177496038, "grad_norm": 1.4135055146530753, "learning_rate": 6.612584449947694e-06, "loss": 0.6338, "step": 1030 }, { "epoch": 0.8169572107765452, "grad_norm": 1.256340501296941, "learning_rate": 6.6065707815921855e-06, "loss": 0.737, "step": 1031 }, { "epoch": 0.8177496038034865, "grad_norm": 1.3300961937353457, "learning_rate": 6.600554520542345e-06, "loss": 0.6447, "step": 1032 }, { "epoch": 0.8185419968304279, "grad_norm": 1.5840109092601695, "learning_rate": 6.594535676507252e-06, "loss": 0.6201, "step": 1033 }, { "epoch": 0.8193343898573693, "grad_norm": 1.395958068322336, "learning_rate": 6.588514259200155e-06, "loss": 0.6761, "step": 1034 }, { "epoch": 0.8201267828843106, "grad_norm": 1.7492306365802617, "learning_rate": 6.582490278338458e-06, "loss": 0.5948, "step": 1035 }, { "epoch": 0.820919175911252, "grad_norm": 1.2395675211035484, "learning_rate": 6.576463743643699e-06, "loss": 0.6424, "step": 1036 }, { "epoch": 0.8217115689381933, "grad_norm": 1.1876482088565294, "learning_rate": 6.570434664841539e-06, "loss": 0.6697, "step": 1037 }, { "epoch": 0.8225039619651348, "grad_norm": 1.4650407362941849, "learning_rate": 6.564403051661745e-06, "loss": 0.6806, "step": 1038 }, { "epoch": 0.8232963549920761, "grad_norm": 1.1421349393050408, "learning_rate": 6.558368913838174e-06, "loss": 0.6266, "step": 1039 }, { "epoch": 0.8240887480190174, "grad_norm": 1.1377681650672422, "learning_rate": 6.552332261108754e-06, "loss": 0.6152, "step": 1040 }, { "epoch": 0.8248811410459588, "grad_norm": 1.6490810050600095, "learning_rate": 6.546293103215477e-06, "loss": 0.6376, "step": 1041 }, { "epoch": 0.8256735340729001, "grad_norm": 1.2399706278577647, "learning_rate": 6.540251449904373e-06, "loss": 0.6265, "step": 1042 }, { "epoch": 0.8264659270998416, "grad_norm": 3.3137101387055847, "learning_rate": 6.534207310925504e-06, "loss": 0.6137, "step": 1043 }, { "epoch": 0.8272583201267829, "grad_norm": 1.9569609173063989, "learning_rate": 6.528160696032937e-06, "loss": 0.7036, "step": 1044 }, { "epoch": 0.8280507131537242, "grad_norm": 2.401000407614886, "learning_rate": 6.522111614984743e-06, "loss": 0.592, "step": 1045 }, { "epoch": 0.8288431061806656, "grad_norm": 1.5267571099684518, "learning_rate": 6.516060077542964e-06, "loss": 0.6366, "step": 1046 }, { "epoch": 0.8296354992076069, "grad_norm": 1.2460347175380355, "learning_rate": 6.510006093473612e-06, "loss": 0.7173, "step": 1047 }, { "epoch": 0.8304278922345484, "grad_norm": 1.436704279249163, "learning_rate": 6.503949672546646e-06, "loss": 0.6327, "step": 1048 }, { "epoch": 0.8312202852614897, "grad_norm": 1.319199852196198, "learning_rate": 6.497890824535957e-06, "loss": 0.5822, "step": 1049 }, { "epoch": 0.8320126782884311, "grad_norm": 1.3011008287795724, "learning_rate": 6.491829559219352e-06, "loss": 0.595, "step": 1050 }, { "epoch": 0.8328050713153724, "grad_norm": 1.3808932281128148, "learning_rate": 6.485765886378543e-06, "loss": 0.6445, "step": 1051 }, { "epoch": 0.8335974643423137, "grad_norm": 2.0563668204271233, "learning_rate": 6.479699815799121e-06, "loss": 0.6743, "step": 1052 }, { "epoch": 0.8343898573692552, "grad_norm": 1.364082901454961, "learning_rate": 6.473631357270554e-06, "loss": 0.6748, "step": 1053 }, { "epoch": 0.8351822503961965, "grad_norm": 1.3684453372779883, "learning_rate": 6.467560520586153e-06, "loss": 0.5888, "step": 1054 }, { "epoch": 0.8359746434231379, "grad_norm": 1.3497981674071626, "learning_rate": 6.4614873155430825e-06, "loss": 0.6567, "step": 1055 }, { "epoch": 0.8367670364500792, "grad_norm": 1.4573985155015843, "learning_rate": 6.455411751942313e-06, "loss": 0.694, "step": 1056 }, { "epoch": 0.8375594294770206, "grad_norm": 1.3011020901495194, "learning_rate": 6.449333839588633e-06, "loss": 0.6619, "step": 1057 }, { "epoch": 0.838351822503962, "grad_norm": 1.4740738328802683, "learning_rate": 6.443253588290614e-06, "loss": 0.6705, "step": 1058 }, { "epoch": 0.8391442155309033, "grad_norm": 1.2006658680516764, "learning_rate": 6.437171007860605e-06, "loss": 0.6429, "step": 1059 }, { "epoch": 0.8399366085578447, "grad_norm": 1.5997742751001074, "learning_rate": 6.431086108114715e-06, "loss": 0.5988, "step": 1060 }, { "epoch": 0.840729001584786, "grad_norm": 1.2436509586111102, "learning_rate": 6.424998898872794e-06, "loss": 0.7018, "step": 1061 }, { "epoch": 0.8415213946117274, "grad_norm": 1.4272338160888125, "learning_rate": 6.418909389958421e-06, "loss": 0.6433, "step": 1062 }, { "epoch": 0.8423137876386688, "grad_norm": 1.3545123881516525, "learning_rate": 6.412817591198883e-06, "loss": 0.6085, "step": 1063 }, { "epoch": 0.8431061806656102, "grad_norm": 1.2607758581618154, "learning_rate": 6.4067235124251645e-06, "loss": 0.6271, "step": 1064 }, { "epoch": 0.8438985736925515, "grad_norm": 1.2195290287042806, "learning_rate": 6.400627163471931e-06, "loss": 0.6319, "step": 1065 }, { "epoch": 0.8446909667194928, "grad_norm": 1.3836567875217902, "learning_rate": 6.394528554177507e-06, "loss": 0.6514, "step": 1066 }, { "epoch": 0.8454833597464342, "grad_norm": 1.369633280702813, "learning_rate": 6.38842769438387e-06, "loss": 0.7013, "step": 1067 }, { "epoch": 0.8462757527733756, "grad_norm": 1.5105379861044208, "learning_rate": 6.3823245939366275e-06, "loss": 0.6023, "step": 1068 }, { "epoch": 0.847068145800317, "grad_norm": 1.5156396935995575, "learning_rate": 6.376219262685001e-06, "loss": 0.6896, "step": 1069 }, { "epoch": 0.8478605388272583, "grad_norm": 1.542692815475631, "learning_rate": 6.370111710481814e-06, "loss": 0.6086, "step": 1070 }, { "epoch": 0.8486529318541997, "grad_norm": 1.5395125994360963, "learning_rate": 6.364001947183475e-06, "loss": 0.626, "step": 1071 }, { "epoch": 0.849445324881141, "grad_norm": 1.2149235309819526, "learning_rate": 6.357889982649957e-06, "loss": 0.6517, "step": 1072 }, { "epoch": 0.8502377179080824, "grad_norm": 1.6129254006207423, "learning_rate": 6.351775826744789e-06, "loss": 0.5828, "step": 1073 }, { "epoch": 0.8510301109350238, "grad_norm": 1.2733656366980972, "learning_rate": 6.345659489335038e-06, "loss": 0.6127, "step": 1074 }, { "epoch": 0.8518225039619651, "grad_norm": 1.3450743093453748, "learning_rate": 6.339540980291285e-06, "loss": 0.5969, "step": 1075 }, { "epoch": 0.8526148969889065, "grad_norm": 1.5602612373990312, "learning_rate": 6.333420309487623e-06, "loss": 0.5891, "step": 1076 }, { "epoch": 0.8534072900158478, "grad_norm": 3.354392709353868, "learning_rate": 6.327297486801627e-06, "loss": 0.6363, "step": 1077 }, { "epoch": 0.8541996830427893, "grad_norm": 1.3999085592126932, "learning_rate": 6.321172522114351e-06, "loss": 0.5546, "step": 1078 }, { "epoch": 0.8549920760697306, "grad_norm": 1.39176270513121, "learning_rate": 6.3150454253103e-06, "loss": 0.6765, "step": 1079 }, { "epoch": 0.8557844690966719, "grad_norm": 1.1712694892838127, "learning_rate": 6.308916206277425e-06, "loss": 0.6256, "step": 1080 }, { "epoch": 0.8565768621236133, "grad_norm": 1.489353294437249, "learning_rate": 6.302784874907096e-06, "loss": 0.5929, "step": 1081 }, { "epoch": 0.8573692551505546, "grad_norm": 1.307323543925307, "learning_rate": 6.296651441094099e-06, "loss": 0.672, "step": 1082 }, { "epoch": 0.8581616481774961, "grad_norm": 1.3619139419316137, "learning_rate": 6.290515914736607e-06, "loss": 0.5749, "step": 1083 }, { "epoch": 0.8589540412044374, "grad_norm": 5.617723175991439, "learning_rate": 6.284378305736173e-06, "loss": 0.6682, "step": 1084 }, { "epoch": 0.8597464342313788, "grad_norm": 1.431407091963891, "learning_rate": 6.278238623997709e-06, "loss": 0.6215, "step": 1085 }, { "epoch": 0.8605388272583201, "grad_norm": 1.3023196721410923, "learning_rate": 6.272096879429474e-06, "loss": 0.648, "step": 1086 }, { "epoch": 0.8613312202852615, "grad_norm": 3.3164929078370418, "learning_rate": 6.265953081943057e-06, "loss": 0.5845, "step": 1087 }, { "epoch": 0.8621236133122029, "grad_norm": 1.844375338895079, "learning_rate": 6.259807241453354e-06, "loss": 0.6372, "step": 1088 }, { "epoch": 0.8629160063391442, "grad_norm": 1.674525105204964, "learning_rate": 6.253659367878566e-06, "loss": 0.6453, "step": 1089 }, { "epoch": 0.8637083993660856, "grad_norm": 1.5745450360474658, "learning_rate": 6.247509471140169e-06, "loss": 0.6644, "step": 1090 }, { "epoch": 0.8645007923930269, "grad_norm": 1.490171060259048, "learning_rate": 6.241357561162906e-06, "loss": 0.6937, "step": 1091 }, { "epoch": 0.8652931854199684, "grad_norm": 1.2593513599011426, "learning_rate": 6.235203647874773e-06, "loss": 0.6639, "step": 1092 }, { "epoch": 0.8660855784469097, "grad_norm": 1.4004082763396553, "learning_rate": 6.22904774120699e-06, "loss": 0.6582, "step": 1093 }, { "epoch": 0.866877971473851, "grad_norm": 1.6283157870683986, "learning_rate": 6.222889851094002e-06, "loss": 0.5319, "step": 1094 }, { "epoch": 0.8676703645007924, "grad_norm": 1.247392285411217, "learning_rate": 6.216729987473451e-06, "loss": 0.679, "step": 1095 }, { "epoch": 0.8684627575277337, "grad_norm": 1.3346597459860121, "learning_rate": 6.210568160286166e-06, "loss": 0.6381, "step": 1096 }, { "epoch": 0.8692551505546752, "grad_norm": 1.354532412450609, "learning_rate": 6.204404379476142e-06, "loss": 0.6526, "step": 1097 }, { "epoch": 0.8700475435816165, "grad_norm": 1.2012419379648447, "learning_rate": 6.19823865499053e-06, "loss": 0.7641, "step": 1098 }, { "epoch": 0.8708399366085579, "grad_norm": 1.3802870999443462, "learning_rate": 6.192070996779613e-06, "loss": 0.6678, "step": 1099 }, { "epoch": 0.8716323296354992, "grad_norm": 1.350040705481885, "learning_rate": 6.1859014147968e-06, "loss": 0.6866, "step": 1100 }, { "epoch": 0.8724247226624405, "grad_norm": 1.2150669885156073, "learning_rate": 6.179729918998603e-06, "loss": 0.6614, "step": 1101 }, { "epoch": 0.873217115689382, "grad_norm": 1.2384698497049274, "learning_rate": 6.173556519344624e-06, "loss": 0.6999, "step": 1102 }, { "epoch": 0.8740095087163233, "grad_norm": 1.4267634653938126, "learning_rate": 6.167381225797529e-06, "loss": 0.6993, "step": 1103 }, { "epoch": 0.8748019017432647, "grad_norm": 1.1591295874463912, "learning_rate": 6.1612040483230514e-06, "loss": 0.6258, "step": 1104 }, { "epoch": 0.875594294770206, "grad_norm": 1.199310389668633, "learning_rate": 6.155024996889959e-06, "loss": 0.6709, "step": 1105 }, { "epoch": 0.8763866877971473, "grad_norm": 1.8567715188392147, "learning_rate": 6.148844081470045e-06, "loss": 0.6622, "step": 1106 }, { "epoch": 0.8771790808240888, "grad_norm": 1.3495266008942397, "learning_rate": 6.142661312038112e-06, "loss": 0.6518, "step": 1107 }, { "epoch": 0.8779714738510301, "grad_norm": 1.3265548296249583, "learning_rate": 6.136476698571952e-06, "loss": 0.655, "step": 1108 }, { "epoch": 0.8787638668779715, "grad_norm": 1.2717221159777479, "learning_rate": 6.130290251052335e-06, "loss": 0.6891, "step": 1109 }, { "epoch": 0.8795562599049128, "grad_norm": 1.7371969470454656, "learning_rate": 6.124101979462987e-06, "loss": 0.7119, "step": 1110 }, { "epoch": 0.8803486529318542, "grad_norm": 1.9447887059794282, "learning_rate": 6.117911893790586e-06, "loss": 0.6178, "step": 1111 }, { "epoch": 0.8811410459587956, "grad_norm": 1.1284301654959312, "learning_rate": 6.111720004024728e-06, "loss": 0.6251, "step": 1112 }, { "epoch": 0.881933438985737, "grad_norm": 1.5534966937584562, "learning_rate": 6.105526320157928e-06, "loss": 0.6229, "step": 1113 }, { "epoch": 0.8827258320126783, "grad_norm": 1.6110407272694267, "learning_rate": 6.09933085218559e-06, "loss": 0.6535, "step": 1114 }, { "epoch": 0.8835182250396196, "grad_norm": 1.0859338460499888, "learning_rate": 6.0931336101060035e-06, "loss": 0.6255, "step": 1115 }, { "epoch": 0.884310618066561, "grad_norm": 1.39382253947353, "learning_rate": 6.086934603920316e-06, "loss": 0.6429, "step": 1116 }, { "epoch": 0.8851030110935024, "grad_norm": 1.4580349354372704, "learning_rate": 6.0807338436325245e-06, "loss": 0.6691, "step": 1117 }, { "epoch": 0.8858954041204438, "grad_norm": 1.590642262741131, "learning_rate": 6.074531339249456e-06, "loss": 0.6384, "step": 1118 }, { "epoch": 0.8866877971473851, "grad_norm": 1.4254031674357208, "learning_rate": 6.06832710078075e-06, "loss": 0.5646, "step": 1119 }, { "epoch": 0.8874801901743264, "grad_norm": 1.517346952492483, "learning_rate": 6.062121138238851e-06, "loss": 0.6812, "step": 1120 }, { "epoch": 0.8882725832012678, "grad_norm": 1.7014871719807654, "learning_rate": 6.055913461638979e-06, "loss": 0.6512, "step": 1121 }, { "epoch": 0.8890649762282092, "grad_norm": 1.3030946787383686, "learning_rate": 6.049704080999122e-06, "loss": 0.6212, "step": 1122 }, { "epoch": 0.8898573692551506, "grad_norm": 1.2878884746110852, "learning_rate": 6.043493006340021e-06, "loss": 0.6637, "step": 1123 }, { "epoch": 0.8906497622820919, "grad_norm": 7.232125866487523, "learning_rate": 6.037280247685147e-06, "loss": 0.6552, "step": 1124 }, { "epoch": 0.8914421553090333, "grad_norm": 1.3885624187276224, "learning_rate": 6.031065815060691e-06, "loss": 0.6486, "step": 1125 }, { "epoch": 0.8922345483359746, "grad_norm": 2.656640892584787, "learning_rate": 6.0248497184955435e-06, "loss": 0.6315, "step": 1126 }, { "epoch": 0.893026941362916, "grad_norm": 2.294654276474422, "learning_rate": 6.018631968021283e-06, "loss": 0.5903, "step": 1127 }, { "epoch": 0.8938193343898574, "grad_norm": 1.4704081217620844, "learning_rate": 6.012412573672154e-06, "loss": 0.6083, "step": 1128 }, { "epoch": 0.8946117274167987, "grad_norm": 2.0885637252143794, "learning_rate": 6.006191545485055e-06, "loss": 0.6234, "step": 1129 }, { "epoch": 0.8954041204437401, "grad_norm": 1.463153149607859, "learning_rate": 5.999968893499524e-06, "loss": 0.7468, "step": 1130 }, { "epoch": 0.8961965134706814, "grad_norm": 1.468681759509577, "learning_rate": 5.993744627757713e-06, "loss": 0.6387, "step": 1131 }, { "epoch": 0.8969889064976229, "grad_norm": 1.3195590281841478, "learning_rate": 5.987518758304388e-06, "loss": 0.6903, "step": 1132 }, { "epoch": 0.8977812995245642, "grad_norm": 1.8781154006389225, "learning_rate": 5.981291295186893e-06, "loss": 0.6337, "step": 1133 }, { "epoch": 0.8985736925515055, "grad_norm": 1.2863844154802173, "learning_rate": 5.975062248455148e-06, "loss": 0.6297, "step": 1134 }, { "epoch": 0.8993660855784469, "grad_norm": 1.2183968311597477, "learning_rate": 5.968831628161633e-06, "loss": 0.683, "step": 1135 }, { "epoch": 0.9001584786053882, "grad_norm": 1.324330018743361, "learning_rate": 5.962599444361359e-06, "loss": 0.6713, "step": 1136 }, { "epoch": 0.9009508716323297, "grad_norm": 1.4663062588054059, "learning_rate": 5.956365707111869e-06, "loss": 0.6244, "step": 1137 }, { "epoch": 0.901743264659271, "grad_norm": 1.3358440574262542, "learning_rate": 5.950130426473204e-06, "loss": 0.6654, "step": 1138 }, { "epoch": 0.9025356576862124, "grad_norm": 1.2742843904664647, "learning_rate": 5.943893612507904e-06, "loss": 0.6005, "step": 1139 }, { "epoch": 0.9033280507131537, "grad_norm": 1.5753643897780194, "learning_rate": 5.937655275280978e-06, "loss": 0.645, "step": 1140 }, { "epoch": 0.904120443740095, "grad_norm": 1.4862894811575607, "learning_rate": 5.931415424859894e-06, "loss": 0.655, "step": 1141 }, { "epoch": 0.9049128367670365, "grad_norm": 1.508758610789276, "learning_rate": 5.925174071314567e-06, "loss": 0.6332, "step": 1142 }, { "epoch": 0.9057052297939778, "grad_norm": 3.4197847429165322, "learning_rate": 5.9189312247173284e-06, "loss": 0.5603, "step": 1143 }, { "epoch": 0.9064976228209192, "grad_norm": 1.3310974237534432, "learning_rate": 5.912686895142928e-06, "loss": 0.6326, "step": 1144 }, { "epoch": 0.9072900158478605, "grad_norm": 1.4400671822079705, "learning_rate": 5.906441092668504e-06, "loss": 0.6496, "step": 1145 }, { "epoch": 0.9080824088748018, "grad_norm": 1.7897432033904777, "learning_rate": 5.900193827373573e-06, "loss": 0.6775, "step": 1146 }, { "epoch": 0.9088748019017433, "grad_norm": 1.1948844532322065, "learning_rate": 5.893945109340015e-06, "loss": 0.602, "step": 1147 }, { "epoch": 0.9096671949286846, "grad_norm": 1.1535277018936576, "learning_rate": 5.887694948652047e-06, "loss": 0.647, "step": 1148 }, { "epoch": 0.910459587955626, "grad_norm": 1.208131911427833, "learning_rate": 5.881443355396222e-06, "loss": 0.6848, "step": 1149 }, { "epoch": 0.9112519809825673, "grad_norm": 2.277170523329896, "learning_rate": 5.8751903396614e-06, "loss": 0.6275, "step": 1150 }, { "epoch": 0.9120443740095087, "grad_norm": 1.447828479924288, "learning_rate": 5.868935911538738e-06, "loss": 0.6322, "step": 1151 }, { "epoch": 0.9128367670364501, "grad_norm": 1.5884601415050204, "learning_rate": 5.8626800811216744e-06, "loss": 0.5868, "step": 1152 }, { "epoch": 0.9136291600633915, "grad_norm": 1.321039008349722, "learning_rate": 5.856422858505909e-06, "loss": 0.6593, "step": 1153 }, { "epoch": 0.9144215530903328, "grad_norm": 1.2578285454270615, "learning_rate": 5.850164253789387e-06, "loss": 0.6784, "step": 1154 }, { "epoch": 0.9152139461172741, "grad_norm": 1.4917758538872503, "learning_rate": 5.843904277072284e-06, "loss": 0.5948, "step": 1155 }, { "epoch": 0.9160063391442155, "grad_norm": 1.595240013560024, "learning_rate": 5.837642938456994e-06, "loss": 0.671, "step": 1156 }, { "epoch": 0.9167987321711569, "grad_norm": 1.6241423714010688, "learning_rate": 5.831380248048103e-06, "loss": 0.7159, "step": 1157 }, { "epoch": 0.9175911251980983, "grad_norm": 1.325344358948386, "learning_rate": 5.825116215952383e-06, "loss": 0.6519, "step": 1158 }, { "epoch": 0.9183835182250396, "grad_norm": 1.4555230663747563, "learning_rate": 5.81885085227877e-06, "loss": 0.6289, "step": 1159 }, { "epoch": 0.919175911251981, "grad_norm": 1.4161265438344923, "learning_rate": 5.812584167138347e-06, "loss": 0.6572, "step": 1160 }, { "epoch": 0.9199683042789224, "grad_norm": 2.766219075317824, "learning_rate": 5.806316170644332e-06, "loss": 0.6822, "step": 1161 }, { "epoch": 0.9207606973058637, "grad_norm": 1.4419684211532529, "learning_rate": 5.800046872912057e-06, "loss": 0.6443, "step": 1162 }, { "epoch": 0.9215530903328051, "grad_norm": 1.460329838354535, "learning_rate": 5.793776284058957e-06, "loss": 0.6146, "step": 1163 }, { "epoch": 0.9223454833597464, "grad_norm": 1.4578831384527327, "learning_rate": 5.7875044142045485e-06, "loss": 0.6419, "step": 1164 }, { "epoch": 0.9231378763866878, "grad_norm": 1.3802472402367136, "learning_rate": 5.781231273470414e-06, "loss": 0.5506, "step": 1165 }, { "epoch": 0.9239302694136292, "grad_norm": 1.4577815921206294, "learning_rate": 5.774956871980192e-06, "loss": 0.6716, "step": 1166 }, { "epoch": 0.9247226624405706, "grad_norm": 1.198820220536814, "learning_rate": 5.76868121985955e-06, "loss": 0.6633, "step": 1167 }, { "epoch": 0.9255150554675119, "grad_norm": 1.604253498367623, "learning_rate": 5.762404327236176e-06, "loss": 0.679, "step": 1168 }, { "epoch": 0.9263074484944532, "grad_norm": 2.6668449618095544, "learning_rate": 5.756126204239759e-06, "loss": 0.6319, "step": 1169 }, { "epoch": 0.9270998415213946, "grad_norm": 1.0823714599098797, "learning_rate": 5.749846861001978e-06, "loss": 0.6266, "step": 1170 }, { "epoch": 0.927892234548336, "grad_norm": 1.418868210950258, "learning_rate": 5.743566307656475e-06, "loss": 0.6324, "step": 1171 }, { "epoch": 0.9286846275752774, "grad_norm": 1.2595986195723083, "learning_rate": 5.7372845543388485e-06, "loss": 0.663, "step": 1172 }, { "epoch": 0.9294770206022187, "grad_norm": 1.4260410151935945, "learning_rate": 5.7310016111866326e-06, "loss": 0.6085, "step": 1173 }, { "epoch": 0.93026941362916, "grad_norm": 1.3559852236700198, "learning_rate": 5.724717488339282e-06, "loss": 0.6761, "step": 1174 }, { "epoch": 0.9310618066561014, "grad_norm": 1.3921120159854636, "learning_rate": 5.7184321959381535e-06, "loss": 0.547, "step": 1175 }, { "epoch": 0.9318541996830428, "grad_norm": 1.2102109758056852, "learning_rate": 5.712145744126494e-06, "loss": 0.6262, "step": 1176 }, { "epoch": 0.9326465927099842, "grad_norm": 1.373900942709432, "learning_rate": 5.705858143049421e-06, "loss": 0.6487, "step": 1177 }, { "epoch": 0.9334389857369255, "grad_norm": 1.2441420318671248, "learning_rate": 5.699569402853903e-06, "loss": 0.6235, "step": 1178 }, { "epoch": 0.9342313787638669, "grad_norm": 1.4921461637663598, "learning_rate": 5.6932795336887525e-06, "loss": 0.5812, "step": 1179 }, { "epoch": 0.9350237717908082, "grad_norm": 1.4252414545059966, "learning_rate": 5.686988545704598e-06, "loss": 0.7046, "step": 1180 }, { "epoch": 0.9358161648177497, "grad_norm": 1.499523386639773, "learning_rate": 5.68069644905388e-06, "loss": 0.6385, "step": 1181 }, { "epoch": 0.936608557844691, "grad_norm": 1.6426366705285622, "learning_rate": 5.674403253890823e-06, "loss": 0.655, "step": 1182 }, { "epoch": 0.9374009508716323, "grad_norm": 1.3279637186056368, "learning_rate": 5.668108970371427e-06, "loss": 0.6217, "step": 1183 }, { "epoch": 0.9381933438985737, "grad_norm": 1.2419301227751673, "learning_rate": 5.661813608653447e-06, "loss": 0.7011, "step": 1184 }, { "epoch": 0.938985736925515, "grad_norm": 1.7723103469385775, "learning_rate": 5.655517178896379e-06, "loss": 0.6533, "step": 1185 }, { "epoch": 0.9397781299524565, "grad_norm": 1.3299217097131284, "learning_rate": 5.649219691261441e-06, "loss": 0.6805, "step": 1186 }, { "epoch": 0.9405705229793978, "grad_norm": 1.6462452433283985, "learning_rate": 5.64292115591156e-06, "loss": 0.6258, "step": 1187 }, { "epoch": 0.9413629160063391, "grad_norm": 1.1293092594984895, "learning_rate": 5.636621583011355e-06, "loss": 0.734, "step": 1188 }, { "epoch": 0.9421553090332805, "grad_norm": 1.2528327845064124, "learning_rate": 5.630320982727114e-06, "loss": 0.5932, "step": 1189 }, { "epoch": 0.9429477020602218, "grad_norm": 1.3682281964323326, "learning_rate": 5.6240193652267885e-06, "loss": 0.7288, "step": 1190 }, { "epoch": 0.9437400950871633, "grad_norm": 1.2768801057437027, "learning_rate": 5.617716740679969e-06, "loss": 0.6264, "step": 1191 }, { "epoch": 0.9445324881141046, "grad_norm": 1.3725258054568696, "learning_rate": 5.611413119257872e-06, "loss": 0.6783, "step": 1192 }, { "epoch": 0.945324881141046, "grad_norm": 1.3473200236158693, "learning_rate": 5.605108511133324e-06, "loss": 0.6633, "step": 1193 }, { "epoch": 0.9461172741679873, "grad_norm": 1.353923590110013, "learning_rate": 5.598802926480741e-06, "loss": 0.6949, "step": 1194 }, { "epoch": 0.9469096671949286, "grad_norm": 1.3682363278845069, "learning_rate": 5.592496375476117e-06, "loss": 0.6099, "step": 1195 }, { "epoch": 0.9477020602218701, "grad_norm": 1.3165169315460383, "learning_rate": 5.5861888682970035e-06, "loss": 0.6856, "step": 1196 }, { "epoch": 0.9484944532488114, "grad_norm": 1.415915734415865, "learning_rate": 5.579880415122496e-06, "loss": 0.6585, "step": 1197 }, { "epoch": 0.9492868462757528, "grad_norm": 1.3625795714376763, "learning_rate": 5.573571026133221e-06, "loss": 0.6366, "step": 1198 }, { "epoch": 0.9500792393026941, "grad_norm": 1.5847096055392103, "learning_rate": 5.5672607115113076e-06, "loss": 0.6439, "step": 1199 }, { "epoch": 0.9508716323296355, "grad_norm": 1.2954937133424727, "learning_rate": 5.560949481440384e-06, "loss": 0.6535, "step": 1200 }, { "epoch": 0.9516640253565769, "grad_norm": 1.2356091759670722, "learning_rate": 5.554637346105555e-06, "loss": 0.6415, "step": 1201 }, { "epoch": 0.9524564183835182, "grad_norm": 1.5127566525378149, "learning_rate": 5.548324315693383e-06, "loss": 0.6007, "step": 1202 }, { "epoch": 0.9532488114104596, "grad_norm": 1.1014852121343632, "learning_rate": 5.542010400391877e-06, "loss": 0.652, "step": 1203 }, { "epoch": 0.9540412044374009, "grad_norm": 1.371899038018539, "learning_rate": 5.535695610390478e-06, "loss": 0.6133, "step": 1204 }, { "epoch": 0.9548335974643423, "grad_norm": 1.265934030120256, "learning_rate": 5.5293799558800335e-06, "loss": 0.6648, "step": 1205 }, { "epoch": 0.9556259904912837, "grad_norm": 2.098851023567172, "learning_rate": 5.523063447052787e-06, "loss": 0.5814, "step": 1206 }, { "epoch": 0.9564183835182251, "grad_norm": 1.3834754147988602, "learning_rate": 5.516746094102361e-06, "loss": 0.588, "step": 1207 }, { "epoch": 0.9572107765451664, "grad_norm": 1.5942689671321124, "learning_rate": 5.510427907223742e-06, "loss": 0.6021, "step": 1208 }, { "epoch": 0.9580031695721077, "grad_norm": 1.3169543210272066, "learning_rate": 5.504108896613259e-06, "loss": 0.6171, "step": 1209 }, { "epoch": 0.9587955625990491, "grad_norm": 1.5433849636956187, "learning_rate": 5.497789072468575e-06, "loss": 0.6493, "step": 1210 }, { "epoch": 0.9595879556259905, "grad_norm": 1.624529085085802, "learning_rate": 5.491468444988659e-06, "loss": 0.6065, "step": 1211 }, { "epoch": 0.9603803486529319, "grad_norm": 1.5320160940964906, "learning_rate": 5.485147024373785e-06, "loss": 0.5794, "step": 1212 }, { "epoch": 0.9611727416798732, "grad_norm": 1.3070751479244076, "learning_rate": 5.4788248208255e-06, "loss": 0.5916, "step": 1213 }, { "epoch": 0.9619651347068146, "grad_norm": 1.4234796897299853, "learning_rate": 5.4725018445466175e-06, "loss": 0.6215, "step": 1214 }, { "epoch": 0.9627575277337559, "grad_norm": 1.4461652975114736, "learning_rate": 5.466178105741197e-06, "loss": 0.6233, "step": 1215 }, { "epoch": 0.9635499207606973, "grad_norm": 1.5481230382055053, "learning_rate": 5.4598536146145306e-06, "loss": 0.6332, "step": 1216 }, { "epoch": 0.9643423137876387, "grad_norm": 1.5289322120283344, "learning_rate": 5.453528381373123e-06, "loss": 0.5816, "step": 1217 }, { "epoch": 0.96513470681458, "grad_norm": 1.1598304092238643, "learning_rate": 5.447202416224676e-06, "loss": 0.6641, "step": 1218 }, { "epoch": 0.9659270998415214, "grad_norm": 1.4534182583260662, "learning_rate": 5.4408757293780725e-06, "loss": 0.6457, "step": 1219 }, { "epoch": 0.9667194928684627, "grad_norm": 1.1477419943734444, "learning_rate": 5.434548331043361e-06, "loss": 0.7024, "step": 1220 }, { "epoch": 0.9675118858954042, "grad_norm": 1.7592665374639225, "learning_rate": 5.428220231431739e-06, "loss": 0.6843, "step": 1221 }, { "epoch": 0.9683042789223455, "grad_norm": 1.2934450684643886, "learning_rate": 5.421891440755533e-06, "loss": 0.6569, "step": 1222 }, { "epoch": 0.9690966719492868, "grad_norm": 1.212395821444, "learning_rate": 5.415561969228188e-06, "loss": 0.7284, "step": 1223 }, { "epoch": 0.9698890649762282, "grad_norm": 1.1493835057167745, "learning_rate": 5.409231827064244e-06, "loss": 0.6836, "step": 1224 }, { "epoch": 0.9706814580031695, "grad_norm": 1.1878248941416651, "learning_rate": 5.402901024479328e-06, "loss": 0.6031, "step": 1225 }, { "epoch": 0.971473851030111, "grad_norm": 1.4046835325324245, "learning_rate": 5.396569571690127e-06, "loss": 0.6296, "step": 1226 }, { "epoch": 0.9722662440570523, "grad_norm": 1.8352729881109353, "learning_rate": 5.390237478914384e-06, "loss": 0.6872, "step": 1227 }, { "epoch": 0.9730586370839936, "grad_norm": 1.4680182225133094, "learning_rate": 5.383904756370869e-06, "loss": 0.7202, "step": 1228 }, { "epoch": 0.973851030110935, "grad_norm": 1.4431498219716838, "learning_rate": 5.3775714142793725e-06, "loss": 0.6133, "step": 1229 }, { "epoch": 0.9746434231378764, "grad_norm": 1.142889393196966, "learning_rate": 5.371237462860681e-06, "loss": 0.6522, "step": 1230 }, { "epoch": 0.9754358161648178, "grad_norm": 1.4533650310761097, "learning_rate": 5.364902912336569e-06, "loss": 0.5963, "step": 1231 }, { "epoch": 0.9762282091917591, "grad_norm": 1.3595522841464762, "learning_rate": 5.358567772929772e-06, "loss": 0.6923, "step": 1232 }, { "epoch": 0.9770206022187005, "grad_norm": 1.39678736201654, "learning_rate": 5.352232054863982e-06, "loss": 0.6418, "step": 1233 }, { "epoch": 0.9778129952456418, "grad_norm": 1.2872921748270085, "learning_rate": 5.345895768363819e-06, "loss": 0.6984, "step": 1234 }, { "epoch": 0.9786053882725833, "grad_norm": 1.475738578962304, "learning_rate": 5.339558923654825e-06, "loss": 0.5951, "step": 1235 }, { "epoch": 0.9793977812995246, "grad_norm": 1.6055034372220949, "learning_rate": 5.333221530963441e-06, "loss": 0.6714, "step": 1236 }, { "epoch": 0.9801901743264659, "grad_norm": 1.4436481737077962, "learning_rate": 5.3268836005169895e-06, "loss": 0.6325, "step": 1237 }, { "epoch": 0.9809825673534073, "grad_norm": 1.901978709015768, "learning_rate": 5.320545142543668e-06, "loss": 0.6709, "step": 1238 }, { "epoch": 0.9817749603803486, "grad_norm": 1.440043211316502, "learning_rate": 5.31420616727252e-06, "loss": 0.6363, "step": 1239 }, { "epoch": 0.9825673534072901, "grad_norm": 1.399157762535945, "learning_rate": 5.307866684933423e-06, "loss": 0.6576, "step": 1240 }, { "epoch": 0.9833597464342314, "grad_norm": 1.1804826524765655, "learning_rate": 5.301526705757076e-06, "loss": 0.6038, "step": 1241 }, { "epoch": 0.9841521394611727, "grad_norm": 1.4521926535508876, "learning_rate": 5.2951862399749784e-06, "loss": 0.628, "step": 1242 }, { "epoch": 0.9849445324881141, "grad_norm": 1.545312312695432, "learning_rate": 5.288845297819414e-06, "loss": 0.6341, "step": 1243 }, { "epoch": 0.9857369255150554, "grad_norm": 1.2564538570962165, "learning_rate": 5.282503889523437e-06, "loss": 0.6779, "step": 1244 }, { "epoch": 0.9865293185419969, "grad_norm": 1.3523924480307794, "learning_rate": 5.276162025320854e-06, "loss": 0.7096, "step": 1245 }, { "epoch": 0.9873217115689382, "grad_norm": 1.5171122239543664, "learning_rate": 5.269819715446205e-06, "loss": 0.6141, "step": 1246 }, { "epoch": 0.9881141045958796, "grad_norm": 1.3656467261690706, "learning_rate": 5.263476970134752e-06, "loss": 0.5686, "step": 1247 }, { "epoch": 0.9889064976228209, "grad_norm": 1.1378239241504908, "learning_rate": 5.257133799622458e-06, "loss": 0.6797, "step": 1248 }, { "epoch": 0.9896988906497622, "grad_norm": 1.2093971366123775, "learning_rate": 5.250790214145972e-06, "loss": 0.697, "step": 1249 }, { "epoch": 0.9904912836767037, "grad_norm": 1.3695339040834715, "learning_rate": 5.2444462239426145e-06, "loss": 0.6053, "step": 1250 }, { "epoch": 0.991283676703645, "grad_norm": 1.429099312634711, "learning_rate": 5.238101839250357e-06, "loss": 0.6401, "step": 1251 }, { "epoch": 0.9920760697305864, "grad_norm": 1.2986225809971945, "learning_rate": 5.23175707030781e-06, "loss": 0.6451, "step": 1252 }, { "epoch": 0.9928684627575277, "grad_norm": 1.3601155401555687, "learning_rate": 5.225411927354202e-06, "loss": 0.677, "step": 1253 }, { "epoch": 0.993660855784469, "grad_norm": 1.3781291762993584, "learning_rate": 5.2190664206293636e-06, "loss": 0.7105, "step": 1254 }, { "epoch": 0.9944532488114105, "grad_norm": 1.4066040443740373, "learning_rate": 5.212720560373719e-06, "loss": 0.6507, "step": 1255 }, { "epoch": 0.9952456418383518, "grad_norm": 1.3669916980396288, "learning_rate": 5.206374356828255e-06, "loss": 0.649, "step": 1256 }, { "epoch": 0.9960380348652932, "grad_norm": 1.403123505082106, "learning_rate": 5.200027820234515e-06, "loss": 0.6497, "step": 1257 }, { "epoch": 0.9968304278922345, "grad_norm": 1.2474502108170042, "learning_rate": 5.193680960834586e-06, "loss": 0.6468, "step": 1258 }, { "epoch": 0.9976228209191759, "grad_norm": 1.5048587504887276, "learning_rate": 5.187333788871067e-06, "loss": 0.5888, "step": 1259 }, { "epoch": 0.9984152139461173, "grad_norm": 1.8752733702217923, "learning_rate": 5.180986314587065e-06, "loss": 0.6735, "step": 1260 }, { "epoch": 0.9992076069730587, "grad_norm": 1.5586539101140404, "learning_rate": 5.174638548226178e-06, "loss": 0.583, "step": 1261 }, { "epoch": 1.0, "grad_norm": 1.3099299338084616, "learning_rate": 5.168290500032471e-06, "loss": 0.632, "step": 1262 }, { "epoch": 1.0007923930269413, "grad_norm": 1.2703100371220426, "learning_rate": 5.161942180250464e-06, "loss": 0.5129, "step": 1263 }, { "epoch": 1.0015847860538827, "grad_norm": 1.298752185557254, "learning_rate": 5.155593599125118e-06, "loss": 0.5605, "step": 1264 }, { "epoch": 1.002377179080824, "grad_norm": 1.3963312890293513, "learning_rate": 5.149244766901813e-06, "loss": 0.4845, "step": 1265 }, { "epoch": 1.0031695721077654, "grad_norm": 1.3142978708970763, "learning_rate": 5.142895693826337e-06, "loss": 0.4885, "step": 1266 }, { "epoch": 1.003961965134707, "grad_norm": 1.3247075936280914, "learning_rate": 5.136546390144865e-06, "loss": 0.4768, "step": 1267 }, { "epoch": 1.0047543581616483, "grad_norm": 1.3675260254386552, "learning_rate": 5.130196866103944e-06, "loss": 0.5199, "step": 1268 }, { "epoch": 1.0055467511885896, "grad_norm": 1.1477482156278696, "learning_rate": 5.123847131950478e-06, "loss": 0.4695, "step": 1269 }, { "epoch": 1.006339144215531, "grad_norm": 1.599075502004668, "learning_rate": 5.117497197931707e-06, "loss": 0.5364, "step": 1270 }, { "epoch": 1.0071315372424723, "grad_norm": 1.3696239309611848, "learning_rate": 5.111147074295197e-06, "loss": 0.4766, "step": 1271 }, { "epoch": 1.0079239302694136, "grad_norm": 1.3330809453022419, "learning_rate": 5.104796771288818e-06, "loss": 0.5545, "step": 1272 }, { "epoch": 1.008716323296355, "grad_norm": 1.346958785551292, "learning_rate": 5.098446299160732e-06, "loss": 0.5395, "step": 1273 }, { "epoch": 1.0095087163232963, "grad_norm": 1.505909518194026, "learning_rate": 5.09209566815937e-06, "loss": 0.4522, "step": 1274 }, { "epoch": 1.0103011093502376, "grad_norm": 1.4702634466917774, "learning_rate": 5.085744888533421e-06, "loss": 0.5363, "step": 1275 }, { "epoch": 1.011093502377179, "grad_norm": 1.3670166175679277, "learning_rate": 5.079393970531817e-06, "loss": 0.5766, "step": 1276 }, { "epoch": 1.0118858954041206, "grad_norm": 1.4433092882994376, "learning_rate": 5.073042924403709e-06, "loss": 0.4864, "step": 1277 }, { "epoch": 1.012678288431062, "grad_norm": 1.3773322246632893, "learning_rate": 5.066691760398458e-06, "loss": 0.4912, "step": 1278 }, { "epoch": 1.0134706814580032, "grad_norm": 1.2822742330723154, "learning_rate": 5.06034048876561e-06, "loss": 0.4687, "step": 1279 }, { "epoch": 1.0142630744849446, "grad_norm": 1.791072888989104, "learning_rate": 5.05398911975489e-06, "loss": 0.5036, "step": 1280 }, { "epoch": 1.015055467511886, "grad_norm": 1.2863535761464595, "learning_rate": 5.0476376636161806e-06, "loss": 0.5039, "step": 1281 }, { "epoch": 1.0158478605388273, "grad_norm": 1.5673116522017287, "learning_rate": 5.041286130599501e-06, "loss": 0.5494, "step": 1282 }, { "epoch": 1.0166402535657686, "grad_norm": 1.4389016917613202, "learning_rate": 5.0349345309549945e-06, "loss": 0.4718, "step": 1283 }, { "epoch": 1.01743264659271, "grad_norm": 1.6376347020813973, "learning_rate": 5.028582874932917e-06, "loss": 0.4545, "step": 1284 }, { "epoch": 1.0182250396196513, "grad_norm": 1.3314836641336334, "learning_rate": 5.0222311727836105e-06, "loss": 0.4292, "step": 1285 }, { "epoch": 1.0190174326465926, "grad_norm": 1.3192158552475184, "learning_rate": 5.015879434757493e-06, "loss": 0.4454, "step": 1286 }, { "epoch": 1.0198098256735342, "grad_norm": 1.324106092169371, "learning_rate": 5.009527671105042e-06, "loss": 0.4455, "step": 1287 }, { "epoch": 1.0206022187004755, "grad_norm": 1.786656797353581, "learning_rate": 5.003175892076773e-06, "loss": 0.5066, "step": 1288 }, { "epoch": 1.0213946117274169, "grad_norm": 1.3910163358067746, "learning_rate": 4.996824107923228e-06, "loss": 0.4892, "step": 1289 }, { "epoch": 1.0221870047543582, "grad_norm": 1.527332704425531, "learning_rate": 4.990472328894959e-06, "loss": 0.5731, "step": 1290 }, { "epoch": 1.0229793977812995, "grad_norm": 1.4904069775228634, "learning_rate": 4.9841205652425075e-06, "loss": 0.561, "step": 1291 }, { "epoch": 1.0237717908082409, "grad_norm": 1.1441232885923092, "learning_rate": 4.977768827216391e-06, "loss": 0.521, "step": 1292 }, { "epoch": 1.0245641838351822, "grad_norm": 1.3597976199439534, "learning_rate": 4.971417125067085e-06, "loss": 0.461, "step": 1293 }, { "epoch": 1.0253565768621236, "grad_norm": 1.4436309676204553, "learning_rate": 4.965065469045006e-06, "loss": 0.4603, "step": 1294 }, { "epoch": 1.026148969889065, "grad_norm": 1.2341863607059362, "learning_rate": 4.958713869400502e-06, "loss": 0.5826, "step": 1295 }, { "epoch": 1.0269413629160062, "grad_norm": 1.6117253227224055, "learning_rate": 4.952362336383821e-06, "loss": 0.5412, "step": 1296 }, { "epoch": 1.0277337559429478, "grad_norm": 1.34920356647524, "learning_rate": 4.946010880245111e-06, "loss": 0.5161, "step": 1297 }, { "epoch": 1.0285261489698891, "grad_norm": 1.2395199501045038, "learning_rate": 4.939659511234392e-06, "loss": 0.4526, "step": 1298 }, { "epoch": 1.0293185419968305, "grad_norm": 1.3846181653142091, "learning_rate": 4.933308239601546e-06, "loss": 0.5011, "step": 1299 }, { "epoch": 1.0301109350237718, "grad_norm": 1.283575911602512, "learning_rate": 4.926957075596291e-06, "loss": 0.5417, "step": 1300 }, { "epoch": 1.0309033280507132, "grad_norm": 1.2754651674470951, "learning_rate": 4.920606029468183e-06, "loss": 0.5089, "step": 1301 }, { "epoch": 1.0316957210776545, "grad_norm": 1.4574167881507438, "learning_rate": 4.914255111466579e-06, "loss": 0.4235, "step": 1302 }, { "epoch": 1.0324881141045958, "grad_norm": 1.5272713479006659, "learning_rate": 4.907904331840631e-06, "loss": 0.5367, "step": 1303 }, { "epoch": 1.0332805071315372, "grad_norm": 1.1491129115163117, "learning_rate": 4.901553700839269e-06, "loss": 0.5085, "step": 1304 }, { "epoch": 1.0340729001584785, "grad_norm": 1.3646236678539125, "learning_rate": 4.895203228711184e-06, "loss": 0.4613, "step": 1305 }, { "epoch": 1.0348652931854199, "grad_norm": 1.8956130707710954, "learning_rate": 4.888852925704804e-06, "loss": 0.4865, "step": 1306 }, { "epoch": 1.0356576862123614, "grad_norm": 1.2711922729302925, "learning_rate": 4.882502802068294e-06, "loss": 0.5268, "step": 1307 }, { "epoch": 1.0364500792393028, "grad_norm": 1.2741089561100811, "learning_rate": 4.876152868049523e-06, "loss": 0.4684, "step": 1308 }, { "epoch": 1.037242472266244, "grad_norm": 1.5242222427103707, "learning_rate": 4.869803133896057e-06, "loss": 0.4893, "step": 1309 }, { "epoch": 1.0380348652931854, "grad_norm": 1.2715269337550343, "learning_rate": 4.863453609855136e-06, "loss": 0.5712, "step": 1310 }, { "epoch": 1.0388272583201268, "grad_norm": 1.171217881762266, "learning_rate": 4.8571043061736636e-06, "loss": 0.5267, "step": 1311 }, { "epoch": 1.0396196513470681, "grad_norm": 1.218839116291414, "learning_rate": 4.850755233098188e-06, "loss": 0.6301, "step": 1312 }, { "epoch": 1.0404120443740095, "grad_norm": 1.8864630834470577, "learning_rate": 4.8444064008748835e-06, "loss": 0.5371, "step": 1313 }, { "epoch": 1.0412044374009508, "grad_norm": 1.517857637827087, "learning_rate": 4.838057819749538e-06, "loss": 0.4156, "step": 1314 }, { "epoch": 1.0419968304278922, "grad_norm": 1.2024420747289675, "learning_rate": 4.831709499967531e-06, "loss": 0.4392, "step": 1315 }, { "epoch": 1.0427892234548335, "grad_norm": 1.3935824684394766, "learning_rate": 4.825361451773823e-06, "loss": 0.4761, "step": 1316 }, { "epoch": 1.043581616481775, "grad_norm": 1.3738594011931797, "learning_rate": 4.819013685412936e-06, "loss": 0.4485, "step": 1317 }, { "epoch": 1.0443740095087164, "grad_norm": 1.2862618519324611, "learning_rate": 4.812666211128934e-06, "loss": 0.497, "step": 1318 }, { "epoch": 1.0451664025356577, "grad_norm": 1.3868384935281295, "learning_rate": 4.8063190391654156e-06, "loss": 0.4877, "step": 1319 }, { "epoch": 1.045958795562599, "grad_norm": 1.3841133661815221, "learning_rate": 4.7999721797654855e-06, "loss": 0.4923, "step": 1320 }, { "epoch": 1.0467511885895404, "grad_norm": 1.2637714884480735, "learning_rate": 4.793625643171748e-06, "loss": 0.5565, "step": 1321 }, { "epoch": 1.0475435816164818, "grad_norm": 1.2793537339829204, "learning_rate": 4.787279439626284e-06, "loss": 0.5854, "step": 1322 }, { "epoch": 1.048335974643423, "grad_norm": 1.2500817295355164, "learning_rate": 4.780933579370636e-06, "loss": 0.5102, "step": 1323 }, { "epoch": 1.0491283676703644, "grad_norm": 1.554490820464337, "learning_rate": 4.774588072645799e-06, "loss": 0.4666, "step": 1324 }, { "epoch": 1.0499207606973058, "grad_norm": 1.6573669327123128, "learning_rate": 4.768242929692191e-06, "loss": 0.5388, "step": 1325 }, { "epoch": 1.0507131537242471, "grad_norm": 1.2655435794873853, "learning_rate": 4.761898160749643e-06, "loss": 0.5382, "step": 1326 }, { "epoch": 1.0515055467511887, "grad_norm": 1.2607073632534982, "learning_rate": 4.755553776057386e-06, "loss": 0.4654, "step": 1327 }, { "epoch": 1.05229793977813, "grad_norm": 1.135483530665085, "learning_rate": 4.749209785854029e-06, "loss": 0.5491, "step": 1328 }, { "epoch": 1.0530903328050714, "grad_norm": 2.603688789889562, "learning_rate": 4.742866200377543e-06, "loss": 0.523, "step": 1329 }, { "epoch": 1.0538827258320127, "grad_norm": 1.4291231998075629, "learning_rate": 4.736523029865249e-06, "loss": 0.5106, "step": 1330 }, { "epoch": 1.054675118858954, "grad_norm": 1.2731658628496028, "learning_rate": 4.730180284553796e-06, "loss": 0.5684, "step": 1331 }, { "epoch": 1.0554675118858954, "grad_norm": 1.4478098696102, "learning_rate": 4.723837974679148e-06, "loss": 0.4641, "step": 1332 }, { "epoch": 1.0562599049128367, "grad_norm": 1.2964480146547606, "learning_rate": 4.717496110476564e-06, "loss": 0.5372, "step": 1333 }, { "epoch": 1.057052297939778, "grad_norm": 1.422216829039925, "learning_rate": 4.711154702180588e-06, "loss": 0.5651, "step": 1334 }, { "epoch": 1.0578446909667194, "grad_norm": 1.2330429400338945, "learning_rate": 4.704813760025023e-06, "loss": 0.511, "step": 1335 }, { "epoch": 1.058637083993661, "grad_norm": 1.5029508756439425, "learning_rate": 4.698473294242925e-06, "loss": 0.5004, "step": 1336 }, { "epoch": 1.0594294770206023, "grad_norm": 1.4219758234139719, "learning_rate": 4.692133315066578e-06, "loss": 0.4789, "step": 1337 }, { "epoch": 1.0602218700475436, "grad_norm": 1.3203133815314836, "learning_rate": 4.685793832727482e-06, "loss": 0.4726, "step": 1338 }, { "epoch": 1.061014263074485, "grad_norm": 2.038255102196825, "learning_rate": 4.679454857456333e-06, "loss": 0.5765, "step": 1339 }, { "epoch": 1.0618066561014263, "grad_norm": 1.2380915596077389, "learning_rate": 4.673116399483011e-06, "loss": 0.5308, "step": 1340 }, { "epoch": 1.0625990491283677, "grad_norm": 1.7907276503253902, "learning_rate": 4.666778469036562e-06, "loss": 0.5074, "step": 1341 }, { "epoch": 1.063391442155309, "grad_norm": 1.461433439453111, "learning_rate": 4.660441076345178e-06, "loss": 0.553, "step": 1342 }, { "epoch": 1.0641838351822503, "grad_norm": 1.3675677768034649, "learning_rate": 4.654104231636183e-06, "loss": 0.4628, "step": 1343 }, { "epoch": 1.0649762282091917, "grad_norm": 1.4485807590007713, "learning_rate": 4.6477679451360215e-06, "loss": 0.4841, "step": 1344 }, { "epoch": 1.065768621236133, "grad_norm": 5.702769107067583, "learning_rate": 4.64143222707023e-06, "loss": 0.4875, "step": 1345 }, { "epoch": 1.0665610142630744, "grad_norm": 1.3693231763073075, "learning_rate": 4.635097087663432e-06, "loss": 0.5549, "step": 1346 }, { "epoch": 1.067353407290016, "grad_norm": 1.468082890850682, "learning_rate": 4.628762537139319e-06, "loss": 0.5208, "step": 1347 }, { "epoch": 1.0681458003169573, "grad_norm": 1.4282808916149765, "learning_rate": 4.622428585720628e-06, "loss": 0.5085, "step": 1348 }, { "epoch": 1.0689381933438986, "grad_norm": 1.2366017987061249, "learning_rate": 4.6160952436291315e-06, "loss": 0.4452, "step": 1349 }, { "epoch": 1.06973058637084, "grad_norm": 1.3427755343332755, "learning_rate": 4.6097625210856174e-06, "loss": 0.5208, "step": 1350 }, { "epoch": 1.0705229793977813, "grad_norm": 1.4783399509524913, "learning_rate": 4.603430428309874e-06, "loss": 0.5235, "step": 1351 }, { "epoch": 1.0713153724247226, "grad_norm": 2.2366387456013035, "learning_rate": 4.5970989755206735e-06, "loss": 0.4938, "step": 1352 }, { "epoch": 1.072107765451664, "grad_norm": 1.2281957332587703, "learning_rate": 4.5907681729357574e-06, "loss": 0.5614, "step": 1353 }, { "epoch": 1.0729001584786053, "grad_norm": 1.4862357646128963, "learning_rate": 4.584438030771814e-06, "loss": 0.5302, "step": 1354 }, { "epoch": 1.0736925515055467, "grad_norm": 1.5135345452912652, "learning_rate": 4.578108559244469e-06, "loss": 0.4827, "step": 1355 }, { "epoch": 1.0744849445324882, "grad_norm": 1.3155589224179307, "learning_rate": 4.571779768568263e-06, "loss": 0.5006, "step": 1356 }, { "epoch": 1.0752773375594296, "grad_norm": 1.2403255245957059, "learning_rate": 4.56545166895664e-06, "loss": 0.5328, "step": 1357 }, { "epoch": 1.076069730586371, "grad_norm": 1.560407417917783, "learning_rate": 4.559124270621929e-06, "loss": 0.5038, "step": 1358 }, { "epoch": 1.0768621236133122, "grad_norm": 1.96127314648548, "learning_rate": 4.552797583775326e-06, "loss": 0.4473, "step": 1359 }, { "epoch": 1.0776545166402536, "grad_norm": 1.3854242033502377, "learning_rate": 4.546471618626878e-06, "loss": 0.4677, "step": 1360 }, { "epoch": 1.078446909667195, "grad_norm": 1.3788474354802274, "learning_rate": 4.54014638538547e-06, "loss": 0.4683, "step": 1361 }, { "epoch": 1.0792393026941363, "grad_norm": 1.5670189415808413, "learning_rate": 4.533821894258804e-06, "loss": 0.4763, "step": 1362 }, { "epoch": 1.0800316957210776, "grad_norm": 1.7773139666935187, "learning_rate": 4.527498155453385e-06, "loss": 0.423, "step": 1363 }, { "epoch": 1.080824088748019, "grad_norm": 1.2051263718979661, "learning_rate": 4.521175179174501e-06, "loss": 0.5278, "step": 1364 }, { "epoch": 1.0816164817749603, "grad_norm": 1.5740723551799212, "learning_rate": 4.514852975626217e-06, "loss": 0.4994, "step": 1365 }, { "epoch": 1.0824088748019018, "grad_norm": 1.1698348900949231, "learning_rate": 4.508531555011343e-06, "loss": 0.5307, "step": 1366 }, { "epoch": 1.0832012678288432, "grad_norm": 1.4478321589323273, "learning_rate": 4.502210927531428e-06, "loss": 0.5301, "step": 1367 }, { "epoch": 1.0839936608557845, "grad_norm": 1.4998211160792796, "learning_rate": 4.495891103386743e-06, "loss": 0.4977, "step": 1368 }, { "epoch": 1.0847860538827259, "grad_norm": 1.196470924884401, "learning_rate": 4.4895720927762584e-06, "loss": 0.4928, "step": 1369 }, { "epoch": 1.0855784469096672, "grad_norm": 1.5299612186499454, "learning_rate": 4.48325390589764e-06, "loss": 0.53, "step": 1370 }, { "epoch": 1.0863708399366085, "grad_norm": 1.4949228588331709, "learning_rate": 4.476936552947214e-06, "loss": 0.4413, "step": 1371 }, { "epoch": 1.0871632329635499, "grad_norm": 1.2046583220245117, "learning_rate": 4.470620044119967e-06, "loss": 0.5823, "step": 1372 }, { "epoch": 1.0879556259904912, "grad_norm": 1.3398768268600287, "learning_rate": 4.464304389609523e-06, "loss": 0.4652, "step": 1373 }, { "epoch": 1.0887480190174326, "grad_norm": 1.3032197753141834, "learning_rate": 4.457989599608123e-06, "loss": 0.5223, "step": 1374 }, { "epoch": 1.089540412044374, "grad_norm": 1.3259112989859374, "learning_rate": 4.451675684306619e-06, "loss": 0.4896, "step": 1375 }, { "epoch": 1.0903328050713155, "grad_norm": 1.2650023739169491, "learning_rate": 4.445362653894447e-06, "loss": 0.5115, "step": 1376 }, { "epoch": 1.0911251980982568, "grad_norm": 1.4452300372376214, "learning_rate": 4.439050518559618e-06, "loss": 0.5672, "step": 1377 }, { "epoch": 1.0919175911251982, "grad_norm": 1.4609690195665035, "learning_rate": 4.432739288488694e-06, "loss": 0.4883, "step": 1378 }, { "epoch": 1.0927099841521395, "grad_norm": 1.5003362188915381, "learning_rate": 4.426428973866781e-06, "loss": 0.4827, "step": 1379 }, { "epoch": 1.0935023771790808, "grad_norm": 1.6001980075937146, "learning_rate": 4.420119584877505e-06, "loss": 0.5375, "step": 1380 }, { "epoch": 1.0942947702060222, "grad_norm": 2.1267732637230443, "learning_rate": 4.413811131702998e-06, "loss": 0.5483, "step": 1381 }, { "epoch": 1.0950871632329635, "grad_norm": 1.6753420025306547, "learning_rate": 4.407503624523886e-06, "loss": 0.5044, "step": 1382 }, { "epoch": 1.0958795562599049, "grad_norm": 2.446913211445452, "learning_rate": 4.401197073519261e-06, "loss": 0.5132, "step": 1383 }, { "epoch": 1.0966719492868462, "grad_norm": 1.2065553427118896, "learning_rate": 4.394891488866678e-06, "loss": 0.5462, "step": 1384 }, { "epoch": 1.0974643423137875, "grad_norm": 1.6015453474296661, "learning_rate": 4.388586880742129e-06, "loss": 0.4293, "step": 1385 }, { "epoch": 1.098256735340729, "grad_norm": 1.3504879577448312, "learning_rate": 4.3822832593200335e-06, "loss": 0.5606, "step": 1386 }, { "epoch": 1.0990491283676704, "grad_norm": 1.4718984018065933, "learning_rate": 4.375980634773214e-06, "loss": 0.5305, "step": 1387 }, { "epoch": 1.0998415213946118, "grad_norm": 1.4131085039706701, "learning_rate": 4.369679017272889e-06, "loss": 0.5098, "step": 1388 }, { "epoch": 1.1006339144215531, "grad_norm": 1.2311925385886642, "learning_rate": 4.3633784169886485e-06, "loss": 0.5153, "step": 1389 }, { "epoch": 1.1014263074484945, "grad_norm": 1.8327350189132439, "learning_rate": 4.357078844088441e-06, "loss": 0.5237, "step": 1390 }, { "epoch": 1.1022187004754358, "grad_norm": 1.6051986339720805, "learning_rate": 4.350780308738561e-06, "loss": 0.5576, "step": 1391 }, { "epoch": 1.1030110935023771, "grad_norm": 1.4175146186507193, "learning_rate": 4.344482821103622e-06, "loss": 0.5384, "step": 1392 }, { "epoch": 1.1038034865293185, "grad_norm": 1.445607631068459, "learning_rate": 4.338186391346553e-06, "loss": 0.5212, "step": 1393 }, { "epoch": 1.1045958795562598, "grad_norm": 1.4663425106294246, "learning_rate": 4.331891029628573e-06, "loss": 0.5453, "step": 1394 }, { "epoch": 1.1053882725832014, "grad_norm": 1.3907085404819846, "learning_rate": 4.325596746109178e-06, "loss": 0.5389, "step": 1395 }, { "epoch": 1.1061806656101427, "grad_norm": 1.5713568871399004, "learning_rate": 4.319303550946121e-06, "loss": 0.4634, "step": 1396 }, { "epoch": 1.106973058637084, "grad_norm": 1.3520808161131632, "learning_rate": 4.313011454295403e-06, "loss": 0.5336, "step": 1397 }, { "epoch": 1.1077654516640254, "grad_norm": 1.4448339174715474, "learning_rate": 4.306720466311249e-06, "loss": 0.504, "step": 1398 }, { "epoch": 1.1085578446909667, "grad_norm": 1.3016735601942235, "learning_rate": 4.300430597146098e-06, "loss": 0.4461, "step": 1399 }, { "epoch": 1.109350237717908, "grad_norm": 1.3552396403762161, "learning_rate": 4.294141856950581e-06, "loss": 0.5376, "step": 1400 }, { "epoch": 1.1101426307448494, "grad_norm": 1.3237647371974537, "learning_rate": 4.287854255873507e-06, "loss": 0.4778, "step": 1401 }, { "epoch": 1.1109350237717908, "grad_norm": 1.8282376641629428, "learning_rate": 4.281567804061848e-06, "loss": 0.5505, "step": 1402 }, { "epoch": 1.111727416798732, "grad_norm": 1.5812398363270834, "learning_rate": 4.27528251166072e-06, "loss": 0.5893, "step": 1403 }, { "epoch": 1.1125198098256734, "grad_norm": 1.5690646011010188, "learning_rate": 4.268998388813369e-06, "loss": 0.5471, "step": 1404 }, { "epoch": 1.1133122028526148, "grad_norm": 1.4959680362099468, "learning_rate": 4.262715445661153e-06, "loss": 0.4807, "step": 1405 }, { "epoch": 1.1141045958795563, "grad_norm": 1.4029862890160945, "learning_rate": 4.256433692343527e-06, "loss": 0.4873, "step": 1406 }, { "epoch": 1.1148969889064977, "grad_norm": 1.4108869911738142, "learning_rate": 4.250153138998024e-06, "loss": 0.4746, "step": 1407 }, { "epoch": 1.115689381933439, "grad_norm": 1.542526668526277, "learning_rate": 4.243873795760242e-06, "loss": 0.5307, "step": 1408 }, { "epoch": 1.1164817749603804, "grad_norm": 1.3832712416042676, "learning_rate": 4.237595672763827e-06, "loss": 0.5388, "step": 1409 }, { "epoch": 1.1172741679873217, "grad_norm": 1.4197062918648926, "learning_rate": 4.231318780140451e-06, "loss": 0.5028, "step": 1410 }, { "epoch": 1.118066561014263, "grad_norm": 1.2608601747335646, "learning_rate": 4.2250431280198095e-06, "loss": 0.5017, "step": 1411 }, { "epoch": 1.1188589540412044, "grad_norm": 1.2910541162597216, "learning_rate": 4.218768726529587e-06, "loss": 0.5497, "step": 1412 }, { "epoch": 1.1196513470681457, "grad_norm": 1.2310806423306222, "learning_rate": 4.212495585795453e-06, "loss": 0.4726, "step": 1413 }, { "epoch": 1.120443740095087, "grad_norm": 1.4577940946726329, "learning_rate": 4.206223715941045e-06, "loss": 0.4492, "step": 1414 }, { "epoch": 1.1212361331220286, "grad_norm": 1.419858253341693, "learning_rate": 4.199953127087943e-06, "loss": 0.477, "step": 1415 }, { "epoch": 1.12202852614897, "grad_norm": 1.4195443575712319, "learning_rate": 4.193683829355669e-06, "loss": 0.4905, "step": 1416 }, { "epoch": 1.1228209191759113, "grad_norm": 1.631045907971975, "learning_rate": 4.187415832861654e-06, "loss": 0.4904, "step": 1417 }, { "epoch": 1.1236133122028527, "grad_norm": 1.5150214636275487, "learning_rate": 4.181149147721231e-06, "loss": 0.4959, "step": 1418 }, { "epoch": 1.124405705229794, "grad_norm": 1.3134997655017453, "learning_rate": 4.174883784047617e-06, "loss": 0.5449, "step": 1419 }, { "epoch": 1.1251980982567353, "grad_norm": 1.169073391164122, "learning_rate": 4.168619751951897e-06, "loss": 0.5435, "step": 1420 }, { "epoch": 1.1259904912836767, "grad_norm": 1.3473462329671124, "learning_rate": 4.162357061543007e-06, "loss": 0.5247, "step": 1421 }, { "epoch": 1.126782884310618, "grad_norm": 1.3096344206988038, "learning_rate": 4.1560957229277165e-06, "loss": 0.5256, "step": 1422 }, { "epoch": 1.1275752773375594, "grad_norm": 1.3921134411774674, "learning_rate": 4.149835746210615e-06, "loss": 0.4547, "step": 1423 }, { "epoch": 1.1283676703645007, "grad_norm": 1.314694477797499, "learning_rate": 4.1435771414940925e-06, "loss": 0.4936, "step": 1424 }, { "epoch": 1.129160063391442, "grad_norm": 1.3628418314901094, "learning_rate": 4.137319918878326e-06, "loss": 0.4893, "step": 1425 }, { "epoch": 1.1299524564183836, "grad_norm": 1.3459498291027865, "learning_rate": 4.131064088461263e-06, "loss": 0.5724, "step": 1426 }, { "epoch": 1.130744849445325, "grad_norm": 1.4224451143688255, "learning_rate": 4.124809660338601e-06, "loss": 0.4673, "step": 1427 }, { "epoch": 1.1315372424722663, "grad_norm": 1.7949004243022086, "learning_rate": 4.118556644603779e-06, "loss": 0.4643, "step": 1428 }, { "epoch": 1.1323296354992076, "grad_norm": 1.3345172201273132, "learning_rate": 4.1123050513479546e-06, "loss": 0.3852, "step": 1429 }, { "epoch": 1.133122028526149, "grad_norm": 1.3493223393769933, "learning_rate": 4.106054890659987e-06, "loss": 0.5365, "step": 1430 }, { "epoch": 1.1339144215530903, "grad_norm": 1.5327779241421893, "learning_rate": 4.0998061726264275e-06, "loss": 0.4398, "step": 1431 }, { "epoch": 1.1347068145800316, "grad_norm": 1.4486264188802198, "learning_rate": 4.0935589073314975e-06, "loss": 0.4631, "step": 1432 }, { "epoch": 1.135499207606973, "grad_norm": 1.2293848624913222, "learning_rate": 4.0873131048570744e-06, "loss": 0.5119, "step": 1433 }, { "epoch": 1.1362916006339145, "grad_norm": 1.194287537430038, "learning_rate": 4.081068775282674e-06, "loss": 0.5146, "step": 1434 }, { "epoch": 1.1370839936608559, "grad_norm": 1.4590547195005594, "learning_rate": 4.074825928685436e-06, "loss": 0.5092, "step": 1435 }, { "epoch": 1.1378763866877972, "grad_norm": 1.5014330172396708, "learning_rate": 4.068584575140108e-06, "loss": 0.5404, "step": 1436 }, { "epoch": 1.1386687797147386, "grad_norm": 1.3512767289584604, "learning_rate": 4.062344724719025e-06, "loss": 0.524, "step": 1437 }, { "epoch": 1.13946117274168, "grad_norm": 1.1987719779832147, "learning_rate": 4.056106387492096e-06, "loss": 0.5701, "step": 1438 }, { "epoch": 1.1402535657686212, "grad_norm": 1.6318620025843171, "learning_rate": 4.0498695735267965e-06, "loss": 0.5034, "step": 1439 }, { "epoch": 1.1410459587955626, "grad_norm": 1.2018003761316025, "learning_rate": 4.043634292888132e-06, "loss": 0.5991, "step": 1440 }, { "epoch": 1.141838351822504, "grad_norm": 1.2888644394057351, "learning_rate": 4.037400555638641e-06, "loss": 0.5477, "step": 1441 }, { "epoch": 1.1426307448494453, "grad_norm": 1.5401685170154114, "learning_rate": 4.031168371838369e-06, "loss": 0.5441, "step": 1442 }, { "epoch": 1.1434231378763866, "grad_norm": 1.7031465946821143, "learning_rate": 4.024937751544853e-06, "loss": 0.4086, "step": 1443 }, { "epoch": 1.144215530903328, "grad_norm": 1.3284097092112253, "learning_rate": 4.018708704813109e-06, "loss": 0.5144, "step": 1444 }, { "epoch": 1.1450079239302693, "grad_norm": 1.3629123980839861, "learning_rate": 4.012481241695614e-06, "loss": 0.5, "step": 1445 }, { "epoch": 1.1458003169572109, "grad_norm": 1.4378283115029267, "learning_rate": 4.006255372242287e-06, "loss": 0.5029, "step": 1446 }, { "epoch": 1.1465927099841522, "grad_norm": 1.3988274260150173, "learning_rate": 4.0000311065004785e-06, "loss": 0.5154, "step": 1447 }, { "epoch": 1.1473851030110935, "grad_norm": 1.533614247522034, "learning_rate": 3.993808454514946e-06, "loss": 0.5575, "step": 1448 }, { "epoch": 1.1481774960380349, "grad_norm": 1.4435460433190164, "learning_rate": 3.987587426327848e-06, "loss": 0.4988, "step": 1449 }, { "epoch": 1.1489698890649762, "grad_norm": 1.7529330778968704, "learning_rate": 3.981368031978719e-06, "loss": 0.5324, "step": 1450 }, { "epoch": 1.1497622820919176, "grad_norm": 1.070232832117804, "learning_rate": 3.975150281504458e-06, "loss": 0.6225, "step": 1451 }, { "epoch": 1.150554675118859, "grad_norm": 1.4919567407874927, "learning_rate": 3.96893418493931e-06, "loss": 0.4429, "step": 1452 }, { "epoch": 1.1513470681458002, "grad_norm": 2.192665862322254, "learning_rate": 3.962719752314854e-06, "loss": 0.374, "step": 1453 }, { "epoch": 1.1521394611727418, "grad_norm": 1.2839785126016086, "learning_rate": 3.9565069936599805e-06, "loss": 0.4679, "step": 1454 }, { "epoch": 1.1529318541996831, "grad_norm": 1.1229150884657237, "learning_rate": 3.9502959190008795e-06, "loss": 0.5196, "step": 1455 }, { "epoch": 1.1537242472266245, "grad_norm": 1.5698766022942305, "learning_rate": 3.9440865383610225e-06, "loss": 0.4351, "step": 1456 }, { "epoch": 1.1545166402535658, "grad_norm": 1.8164770317114964, "learning_rate": 3.937878861761151e-06, "loss": 0.5494, "step": 1457 }, { "epoch": 1.1553090332805072, "grad_norm": 1.3962654120443203, "learning_rate": 3.931672899219252e-06, "loss": 0.5142, "step": 1458 }, { "epoch": 1.1561014263074485, "grad_norm": 1.4378191000236233, "learning_rate": 3.925468660750547e-06, "loss": 0.4958, "step": 1459 }, { "epoch": 1.1568938193343898, "grad_norm": 1.2862188264012868, "learning_rate": 3.919266156367478e-06, "loss": 0.4878, "step": 1460 }, { "epoch": 1.1576862123613312, "grad_norm": 1.2939318696001687, "learning_rate": 3.913065396079685e-06, "loss": 0.5334, "step": 1461 }, { "epoch": 1.1584786053882725, "grad_norm": 1.366501575639997, "learning_rate": 3.9068663898939964e-06, "loss": 0.5239, "step": 1462 }, { "epoch": 1.1592709984152139, "grad_norm": 1.6453366329588814, "learning_rate": 3.90066914781441e-06, "loss": 0.525, "step": 1463 }, { "epoch": 1.1600633914421552, "grad_norm": 1.4730225545632614, "learning_rate": 3.894473679842073e-06, "loss": 0.4849, "step": 1464 }, { "epoch": 1.1608557844690968, "grad_norm": 1.808821754951793, "learning_rate": 3.888279995975273e-06, "loss": 0.5428, "step": 1465 }, { "epoch": 1.161648177496038, "grad_norm": 1.5472733607512246, "learning_rate": 3.882088106209416e-06, "loss": 0.4602, "step": 1466 }, { "epoch": 1.1624405705229794, "grad_norm": 1.5555198193000372, "learning_rate": 3.8758980205370134e-06, "loss": 0.4985, "step": 1467 }, { "epoch": 1.1632329635499208, "grad_norm": 1.328961129314169, "learning_rate": 3.869709748947668e-06, "loss": 0.5176, "step": 1468 }, { "epoch": 1.1640253565768621, "grad_norm": 1.2364484824548323, "learning_rate": 3.863523301428049e-06, "loss": 0.5379, "step": 1469 }, { "epoch": 1.1648177496038035, "grad_norm": 1.2224105411382986, "learning_rate": 3.857338687961889e-06, "loss": 0.492, "step": 1470 }, { "epoch": 1.1656101426307448, "grad_norm": 1.3406021075652643, "learning_rate": 3.8511559185299555e-06, "loss": 0.5475, "step": 1471 }, { "epoch": 1.1664025356576861, "grad_norm": 1.181702178708717, "learning_rate": 3.844975003110043e-06, "loss": 0.555, "step": 1472 }, { "epoch": 1.1671949286846275, "grad_norm": 1.4471423012332876, "learning_rate": 3.838795951676949e-06, "loss": 0.5002, "step": 1473 }, { "epoch": 1.167987321711569, "grad_norm": 1.423710657081403, "learning_rate": 3.832618774202472e-06, "loss": 0.5238, "step": 1474 }, { "epoch": 1.1687797147385104, "grad_norm": 1.383407365452075, "learning_rate": 3.826443480655379e-06, "loss": 0.4784, "step": 1475 }, { "epoch": 1.1695721077654517, "grad_norm": 1.200536869514233, "learning_rate": 3.8202700810013975e-06, "loss": 0.5351, "step": 1476 }, { "epoch": 1.170364500792393, "grad_norm": 1.308211195032932, "learning_rate": 3.814098585203201e-06, "loss": 0.4943, "step": 1477 }, { "epoch": 1.1711568938193344, "grad_norm": 1.3741523292864255, "learning_rate": 3.8079290032203894e-06, "loss": 0.4729, "step": 1478 }, { "epoch": 1.1719492868462758, "grad_norm": 1.4248028665386385, "learning_rate": 3.8017613450094737e-06, "loss": 0.401, "step": 1479 }, { "epoch": 1.172741679873217, "grad_norm": 1.4178863840700335, "learning_rate": 3.795595620523861e-06, "loss": 0.4924, "step": 1480 }, { "epoch": 1.1735340729001584, "grad_norm": 1.4491406208150066, "learning_rate": 3.789431839713837e-06, "loss": 0.426, "step": 1481 }, { "epoch": 1.1743264659270998, "grad_norm": 1.3323756467015095, "learning_rate": 3.7832700125265507e-06, "loss": 0.4729, "step": 1482 }, { "epoch": 1.1751188589540411, "grad_norm": 1.2822450158861807, "learning_rate": 3.777110148906e-06, "loss": 0.5009, "step": 1483 }, { "epoch": 1.1759112519809825, "grad_norm": 1.1109327432384608, "learning_rate": 3.7709522587930103e-06, "loss": 0.5625, "step": 1484 }, { "epoch": 1.176703645007924, "grad_norm": 1.27085417537719, "learning_rate": 3.7647963521252286e-06, "loss": 0.5159, "step": 1485 }, { "epoch": 1.1774960380348654, "grad_norm": 1.472915561597296, "learning_rate": 3.758642438837094e-06, "loss": 0.4514, "step": 1486 }, { "epoch": 1.1782884310618067, "grad_norm": 1.325274930654458, "learning_rate": 3.7524905288598325e-06, "loss": 0.4784, "step": 1487 }, { "epoch": 1.179080824088748, "grad_norm": 1.3401121770456772, "learning_rate": 3.7463406321214358e-06, "loss": 0.4534, "step": 1488 }, { "epoch": 1.1798732171156894, "grad_norm": 3.208379767320555, "learning_rate": 3.7401927585466475e-06, "loss": 0.4403, "step": 1489 }, { "epoch": 1.1806656101426307, "grad_norm": 1.2478165601351654, "learning_rate": 3.734046918056945e-06, "loss": 0.5306, "step": 1490 }, { "epoch": 1.181458003169572, "grad_norm": 1.3526957142234792, "learning_rate": 3.7279031205705265e-06, "loss": 0.4707, "step": 1491 }, { "epoch": 1.1822503961965134, "grad_norm": 1.4042993056492592, "learning_rate": 3.721761376002292e-06, "loss": 0.5449, "step": 1492 }, { "epoch": 1.1830427892234547, "grad_norm": 1.510572504343144, "learning_rate": 3.715621694263829e-06, "loss": 0.5572, "step": 1493 }, { "epoch": 1.1838351822503963, "grad_norm": 1.3175098634471558, "learning_rate": 3.7094840852633946e-06, "loss": 0.4825, "step": 1494 }, { "epoch": 1.1846275752773376, "grad_norm": 1.5899868642110642, "learning_rate": 3.7033485589059025e-06, "loss": 0.4363, "step": 1495 }, { "epoch": 1.185419968304279, "grad_norm": 1.294282416626561, "learning_rate": 3.697215125092905e-06, "loss": 0.4744, "step": 1496 }, { "epoch": 1.1862123613312203, "grad_norm": 1.525551995517493, "learning_rate": 3.6910837937225772e-06, "loss": 0.4617, "step": 1497 }, { "epoch": 1.1870047543581617, "grad_norm": 1.227283940611898, "learning_rate": 3.6849545746897013e-06, "loss": 0.5243, "step": 1498 }, { "epoch": 1.187797147385103, "grad_norm": 1.1754692050253215, "learning_rate": 3.6788274778856507e-06, "loss": 0.5067, "step": 1499 }, { "epoch": 1.1885895404120443, "grad_norm": 1.3168609443604886, "learning_rate": 3.672702513198374e-06, "loss": 0.5704, "step": 1500 }, { "epoch": 1.1893819334389857, "grad_norm": 1.238000282815092, "learning_rate": 3.6665796905123795e-06, "loss": 0.5135, "step": 1501 }, { "epoch": 1.190174326465927, "grad_norm": 1.300209097167972, "learning_rate": 3.6604590197087155e-06, "loss": 0.4746, "step": 1502 }, { "epoch": 1.1909667194928684, "grad_norm": 1.333867974502678, "learning_rate": 3.6543405106649633e-06, "loss": 0.509, "step": 1503 }, { "epoch": 1.1917591125198097, "grad_norm": 1.247575797454254, "learning_rate": 3.6482241732552114e-06, "loss": 0.5306, "step": 1504 }, { "epoch": 1.1925515055467513, "grad_norm": 1.2949039594003473, "learning_rate": 3.642110017350045e-06, "loss": 0.487, "step": 1505 }, { "epoch": 1.1933438985736926, "grad_norm": 1.477231441860899, "learning_rate": 3.635998052816528e-06, "loss": 0.5265, "step": 1506 }, { "epoch": 1.194136291600634, "grad_norm": 1.3636363801473654, "learning_rate": 3.6298882895181863e-06, "loss": 0.5024, "step": 1507 }, { "epoch": 1.1949286846275753, "grad_norm": 1.4898654032835836, "learning_rate": 3.623780737314999e-06, "loss": 0.4979, "step": 1508 }, { "epoch": 1.1957210776545166, "grad_norm": 1.858746016111307, "learning_rate": 3.617675406063373e-06, "loss": 0.6048, "step": 1509 }, { "epoch": 1.196513470681458, "grad_norm": 1.5822902942480925, "learning_rate": 3.61157230561613e-06, "loss": 0.4922, "step": 1510 }, { "epoch": 1.1973058637083993, "grad_norm": 1.202375994059027, "learning_rate": 3.6054714458224937e-06, "loss": 0.5013, "step": 1511 }, { "epoch": 1.1980982567353406, "grad_norm": 1.3804220446695588, "learning_rate": 3.599372836528071e-06, "loss": 0.4845, "step": 1512 }, { "epoch": 1.1988906497622822, "grad_norm": 1.0967406566523796, "learning_rate": 3.5932764875748367e-06, "loss": 0.5754, "step": 1513 }, { "epoch": 1.1996830427892236, "grad_norm": 1.2441581479968216, "learning_rate": 3.5871824088011185e-06, "loss": 0.5276, "step": 1514 }, { "epoch": 1.200475435816165, "grad_norm": 1.402090613472661, "learning_rate": 3.5810906100415808e-06, "loss": 0.5213, "step": 1515 }, { "epoch": 1.2012678288431062, "grad_norm": 2.0083241652791473, "learning_rate": 3.5750011011272067e-06, "loss": 0.4977, "step": 1516 }, { "epoch": 1.2020602218700476, "grad_norm": 1.3129883558623092, "learning_rate": 3.568913891885286e-06, "loss": 0.4573, "step": 1517 }, { "epoch": 1.202852614896989, "grad_norm": 1.3872092468807031, "learning_rate": 3.5628289921393968e-06, "loss": 0.5252, "step": 1518 }, { "epoch": 1.2036450079239303, "grad_norm": 1.2969753506064168, "learning_rate": 3.5567464117093876e-06, "loss": 0.5437, "step": 1519 }, { "epoch": 1.2044374009508716, "grad_norm": 1.3654796273451715, "learning_rate": 3.5506661604113685e-06, "loss": 0.4531, "step": 1520 }, { "epoch": 1.205229793977813, "grad_norm": 1.3682656912144264, "learning_rate": 3.5445882480576877e-06, "loss": 0.4912, "step": 1521 }, { "epoch": 1.2060221870047543, "grad_norm": 1.3659559404741006, "learning_rate": 3.5385126844569196e-06, "loss": 0.4242, "step": 1522 }, { "epoch": 1.2068145800316956, "grad_norm": 1.6669385870547084, "learning_rate": 3.532439479413847e-06, "loss": 0.5076, "step": 1523 }, { "epoch": 1.2076069730586372, "grad_norm": 1.7362176998775696, "learning_rate": 3.5263686427294497e-06, "loss": 0.4571, "step": 1524 }, { "epoch": 1.2083993660855785, "grad_norm": 1.5991145583299835, "learning_rate": 3.520300184200881e-06, "loss": 0.5099, "step": 1525 }, { "epoch": 1.2091917591125199, "grad_norm": 1.235970570492953, "learning_rate": 3.5142341136214597e-06, "loss": 0.521, "step": 1526 }, { "epoch": 1.2099841521394612, "grad_norm": 1.6681875288708359, "learning_rate": 3.5081704407806498e-06, "loss": 0.4416, "step": 1527 }, { "epoch": 1.2107765451664025, "grad_norm": 1.6738394406284447, "learning_rate": 3.5021091754640456e-06, "loss": 0.5555, "step": 1528 }, { "epoch": 1.2115689381933439, "grad_norm": 1.7964677865944858, "learning_rate": 3.4960503274533543e-06, "loss": 0.5202, "step": 1529 }, { "epoch": 1.2123613312202852, "grad_norm": 2.3127678532333333, "learning_rate": 3.4899939065263875e-06, "loss": 0.4722, "step": 1530 }, { "epoch": 1.2131537242472266, "grad_norm": 1.2926632413015924, "learning_rate": 3.4839399224570365e-06, "loss": 0.5586, "step": 1531 }, { "epoch": 1.213946117274168, "grad_norm": 1.400993848333868, "learning_rate": 3.477888385015258e-06, "loss": 0.54, "step": 1532 }, { "epoch": 1.2147385103011095, "grad_norm": 1.7512845147288498, "learning_rate": 3.471839303967063e-06, "loss": 0.5072, "step": 1533 }, { "epoch": 1.2155309033280508, "grad_norm": 1.2426411390577603, "learning_rate": 3.4657926890744974e-06, "loss": 0.4658, "step": 1534 }, { "epoch": 1.2163232963549921, "grad_norm": 2.686784796920797, "learning_rate": 3.4597485500956286e-06, "loss": 0.4383, "step": 1535 }, { "epoch": 1.2171156893819335, "grad_norm": 1.3581690985745043, "learning_rate": 3.4537068967845246e-06, "loss": 0.5086, "step": 1536 }, { "epoch": 1.2179080824088748, "grad_norm": 1.457268032859406, "learning_rate": 3.4476677388912473e-06, "loss": 0.4891, "step": 1537 }, { "epoch": 1.2187004754358162, "grad_norm": 3.3148479408576335, "learning_rate": 3.441631086161828e-06, "loss": 0.5072, "step": 1538 }, { "epoch": 1.2194928684627575, "grad_norm": 1.6690095178290993, "learning_rate": 3.4355969483382564e-06, "loss": 0.4536, "step": 1539 }, { "epoch": 1.2202852614896988, "grad_norm": 1.8120385222896231, "learning_rate": 3.4295653351584624e-06, "loss": 0.5489, "step": 1540 }, { "epoch": 1.2210776545166402, "grad_norm": 1.3857531902902818, "learning_rate": 3.4235362563563024e-06, "loss": 0.4287, "step": 1541 }, { "epoch": 1.2218700475435815, "grad_norm": 1.3701102834089574, "learning_rate": 3.4175097216615436e-06, "loss": 0.4715, "step": 1542 }, { "epoch": 1.2226624405705229, "grad_norm": 1.540362693321671, "learning_rate": 3.4114857407998462e-06, "loss": 0.4795, "step": 1543 }, { "epoch": 1.2234548335974644, "grad_norm": 1.4194185987666583, "learning_rate": 3.4054643234927497e-06, "loss": 0.5475, "step": 1544 }, { "epoch": 1.2242472266244058, "grad_norm": 1.4323156206073957, "learning_rate": 3.3994454794576565e-06, "loss": 0.5459, "step": 1545 }, { "epoch": 1.2250396196513471, "grad_norm": 1.5218100112094435, "learning_rate": 3.3934292184078153e-06, "loss": 0.5634, "step": 1546 }, { "epoch": 1.2258320126782885, "grad_norm": 4.1555818176742285, "learning_rate": 3.3874155500523083e-06, "loss": 0.4433, "step": 1547 }, { "epoch": 1.2266244057052298, "grad_norm": 1.5434282269558766, "learning_rate": 3.3814044840960303e-06, "loss": 0.4806, "step": 1548 }, { "epoch": 1.2274167987321711, "grad_norm": 1.3670729350250543, "learning_rate": 3.3753960302396816e-06, "loss": 0.5227, "step": 1549 }, { "epoch": 1.2282091917591125, "grad_norm": 1.3811499940620702, "learning_rate": 3.3693901981797418e-06, "loss": 0.5507, "step": 1550 }, { "epoch": 1.2290015847860538, "grad_norm": 1.416991821758, "learning_rate": 3.3633869976084627e-06, "loss": 0.5488, "step": 1551 }, { "epoch": 1.2297939778129952, "grad_norm": 1.1751447936610742, "learning_rate": 3.3573864382138466e-06, "loss": 0.506, "step": 1552 }, { "epoch": 1.2305863708399367, "grad_norm": 1.8452379786880908, "learning_rate": 3.351388529679636e-06, "loss": 0.4905, "step": 1553 }, { "epoch": 1.231378763866878, "grad_norm": 1.3426247285810677, "learning_rate": 3.3453932816852965e-06, "loss": 0.5172, "step": 1554 }, { "epoch": 1.2321711568938194, "grad_norm": 2.8608698132635775, "learning_rate": 3.3394007039059956e-06, "loss": 0.5249, "step": 1555 }, { "epoch": 1.2329635499207607, "grad_norm": 1.4530352962453783, "learning_rate": 3.333410806012595e-06, "loss": 0.5247, "step": 1556 }, { "epoch": 1.233755942947702, "grad_norm": 1.352261464612783, "learning_rate": 3.327423597671631e-06, "loss": 0.6293, "step": 1557 }, { "epoch": 1.2345483359746434, "grad_norm": 1.269523162842998, "learning_rate": 3.3214390885452995e-06, "loss": 0.5316, "step": 1558 }, { "epoch": 1.2353407290015848, "grad_norm": 1.3505573065001253, "learning_rate": 3.315457288291439e-06, "loss": 0.512, "step": 1559 }, { "epoch": 1.236133122028526, "grad_norm": 1.3581133332930964, "learning_rate": 3.3094782065635176e-06, "loss": 0.5035, "step": 1560 }, { "epoch": 1.2369255150554674, "grad_norm": 1.2186012687633752, "learning_rate": 3.3035018530106168e-06, "loss": 0.6171, "step": 1561 }, { "epoch": 1.2377179080824088, "grad_norm": 1.6882785602607862, "learning_rate": 3.2975282372774143e-06, "loss": 0.5601, "step": 1562 }, { "epoch": 1.2385103011093501, "grad_norm": 3.486729423739521, "learning_rate": 3.29155736900417e-06, "loss": 0.4977, "step": 1563 }, { "epoch": 1.2393026941362917, "grad_norm": 1.5194761893544655, "learning_rate": 3.2855892578267103e-06, "loss": 0.5379, "step": 1564 }, { "epoch": 1.240095087163233, "grad_norm": 1.196863217297334, "learning_rate": 3.2796239133764097e-06, "loss": 0.5698, "step": 1565 }, { "epoch": 1.2408874801901744, "grad_norm": 1.3653079791176377, "learning_rate": 3.273661345280182e-06, "loss": 0.5045, "step": 1566 }, { "epoch": 1.2416798732171157, "grad_norm": 1.2550216082661068, "learning_rate": 3.2677015631604568e-06, "loss": 0.5716, "step": 1567 }, { "epoch": 1.242472266244057, "grad_norm": 1.3273214686500665, "learning_rate": 3.261744576635171e-06, "loss": 0.4572, "step": 1568 }, { "epoch": 1.2432646592709984, "grad_norm": 1.3704085454371775, "learning_rate": 3.2557903953177456e-06, "loss": 0.525, "step": 1569 }, { "epoch": 1.2440570522979397, "grad_norm": 1.469293084160281, "learning_rate": 3.2498390288170784e-06, "loss": 0.571, "step": 1570 }, { "epoch": 1.244849445324881, "grad_norm": 1.3486612267408666, "learning_rate": 3.243890486737523e-06, "loss": 0.542, "step": 1571 }, { "epoch": 1.2456418383518226, "grad_norm": 5.529616842486942, "learning_rate": 3.237944778678875e-06, "loss": 0.4235, "step": 1572 }, { "epoch": 1.246434231378764, "grad_norm": 1.8326917412157786, "learning_rate": 3.232001914236356e-06, "loss": 0.4711, "step": 1573 }, { "epoch": 1.2472266244057053, "grad_norm": 1.6978004422984996, "learning_rate": 3.2260619030006e-06, "loss": 0.5253, "step": 1574 }, { "epoch": 1.2480190174326466, "grad_norm": 1.425054074277725, "learning_rate": 3.2201247545576337e-06, "loss": 0.4919, "step": 1575 }, { "epoch": 1.248811410459588, "grad_norm": 1.2994918021323054, "learning_rate": 3.214190478488869e-06, "loss": 0.5176, "step": 1576 }, { "epoch": 1.2496038034865293, "grad_norm": 5.380192852135657, "learning_rate": 3.2082590843710776e-06, "loss": 0.4838, "step": 1577 }, { "epoch": 1.2503961965134707, "grad_norm": 1.4520819927143196, "learning_rate": 3.2023305817763816e-06, "loss": 0.4772, "step": 1578 }, { "epoch": 1.251188589540412, "grad_norm": 1.8320840774622311, "learning_rate": 3.1964049802722363e-06, "loss": 0.5501, "step": 1579 }, { "epoch": 1.2519809825673534, "grad_norm": 1.5972794283148803, "learning_rate": 3.1904822894214173e-06, "loss": 0.4288, "step": 1580 }, { "epoch": 1.2527733755942947, "grad_norm": 1.8539029273751202, "learning_rate": 3.1845625187819996e-06, "loss": 0.4174, "step": 1581 }, { "epoch": 1.253565768621236, "grad_norm": 1.3677331941880544, "learning_rate": 3.178645677907347e-06, "loss": 0.4419, "step": 1582 }, { "epoch": 1.2543581616481774, "grad_norm": 1.5460434203380178, "learning_rate": 3.1727317763460987e-06, "loss": 0.6013, "step": 1583 }, { "epoch": 1.255150554675119, "grad_norm": 1.3098463826929179, "learning_rate": 3.1668208236421447e-06, "loss": 0.5371, "step": 1584 }, { "epoch": 1.2559429477020603, "grad_norm": 1.2266818883514081, "learning_rate": 3.1609128293346207e-06, "loss": 0.5218, "step": 1585 }, { "epoch": 1.2567353407290016, "grad_norm": 1.3313158691766718, "learning_rate": 3.1550078029578846e-06, "loss": 0.4836, "step": 1586 }, { "epoch": 1.257527733755943, "grad_norm": 1.1175766188036647, "learning_rate": 3.149105754041507e-06, "loss": 0.5523, "step": 1587 }, { "epoch": 1.2583201267828843, "grad_norm": 1.6165111386595197, "learning_rate": 3.1432066921102523e-06, "loss": 0.5281, "step": 1588 }, { "epoch": 1.2591125198098256, "grad_norm": 1.3421663134792912, "learning_rate": 3.137310626684064e-06, "loss": 0.5423, "step": 1589 }, { "epoch": 1.259904912836767, "grad_norm": 1.6903030896344262, "learning_rate": 3.131417567278052e-06, "loss": 0.5067, "step": 1590 }, { "epoch": 1.2606973058637083, "grad_norm": 1.3895966064733292, "learning_rate": 3.125527523402474e-06, "loss": 0.5086, "step": 1591 }, { "epoch": 1.2614896988906499, "grad_norm": 1.3938079248490811, "learning_rate": 3.1196405045627197e-06, "loss": 0.4578, "step": 1592 }, { "epoch": 1.2622820919175912, "grad_norm": 1.2341948825380178, "learning_rate": 3.1137565202592996e-06, "loss": 0.5167, "step": 1593 }, { "epoch": 1.2630744849445326, "grad_norm": 1.4779381050840918, "learning_rate": 3.1078755799878233e-06, "loss": 0.4847, "step": 1594 }, { "epoch": 1.263866877971474, "grad_norm": 1.3657712075556694, "learning_rate": 3.101997693238993e-06, "loss": 0.4693, "step": 1595 }, { "epoch": 1.2646592709984152, "grad_norm": 1.365584689271388, "learning_rate": 3.0961228694985794e-06, "loss": 0.5219, "step": 1596 }, { "epoch": 1.2654516640253566, "grad_norm": 1.4452642217442322, "learning_rate": 3.090251118247411e-06, "loss": 0.4284, "step": 1597 }, { "epoch": 1.266244057052298, "grad_norm": 1.8125069061079933, "learning_rate": 3.0843824489613573e-06, "loss": 0.454, "step": 1598 }, { "epoch": 1.2670364500792393, "grad_norm": 1.9288840885391003, "learning_rate": 3.078516871111315e-06, "loss": 0.5082, "step": 1599 }, { "epoch": 1.2678288431061806, "grad_norm": 1.293960056634167, "learning_rate": 3.072654394163194e-06, "loss": 0.5834, "step": 1600 }, { "epoch": 1.268621236133122, "grad_norm": 1.2670196765275537, "learning_rate": 3.0667950275778944e-06, "loss": 0.5267, "step": 1601 }, { "epoch": 1.2694136291600633, "grad_norm": 1.458079883137446, "learning_rate": 3.0609387808113024e-06, "loss": 0.4443, "step": 1602 }, { "epoch": 1.2702060221870046, "grad_norm": 1.4220208801678154, "learning_rate": 3.0550856633142657e-06, "loss": 0.5251, "step": 1603 }, { "epoch": 1.2709984152139462, "grad_norm": 1.3214740004766594, "learning_rate": 3.049235684532583e-06, "loss": 0.4701, "step": 1604 }, { "epoch": 1.2717908082408875, "grad_norm": 1.3832997424920217, "learning_rate": 3.043388853906989e-06, "loss": 0.4851, "step": 1605 }, { "epoch": 1.2725832012678289, "grad_norm": 1.461001804033919, "learning_rate": 3.037545180873136e-06, "loss": 0.5123, "step": 1606 }, { "epoch": 1.2733755942947702, "grad_norm": 1.4379184240509657, "learning_rate": 3.0317046748615815e-06, "loss": 0.576, "step": 1607 }, { "epoch": 1.2741679873217115, "grad_norm": 1.3952508938007266, "learning_rate": 3.0258673452977716e-06, "loss": 0.4517, "step": 1608 }, { "epoch": 1.2749603803486529, "grad_norm": 1.6657672338718457, "learning_rate": 3.0200332016020273e-06, "loss": 0.481, "step": 1609 }, { "epoch": 1.2757527733755942, "grad_norm": 1.8725562313329986, "learning_rate": 3.014202253189527e-06, "loss": 0.5557, "step": 1610 }, { "epoch": 1.2765451664025358, "grad_norm": 1.282039712190683, "learning_rate": 3.008374509470292e-06, "loss": 0.429, "step": 1611 }, { "epoch": 1.2773375594294771, "grad_norm": 2.8064315707804783, "learning_rate": 3.0025499798491757e-06, "loss": 0.5546, "step": 1612 }, { "epoch": 1.2781299524564185, "grad_norm": 1.300131553994651, "learning_rate": 2.9967286737258392e-06, "loss": 0.5154, "step": 1613 }, { "epoch": 1.2789223454833598, "grad_norm": 1.3284676251213319, "learning_rate": 2.990910600494746e-06, "loss": 0.4936, "step": 1614 }, { "epoch": 1.2797147385103012, "grad_norm": 1.18511563424907, "learning_rate": 2.9850957695451388e-06, "loss": 0.6071, "step": 1615 }, { "epoch": 1.2805071315372425, "grad_norm": 1.4714196344775945, "learning_rate": 2.9792841902610293e-06, "loss": 0.4866, "step": 1616 }, { "epoch": 1.2812995245641838, "grad_norm": 1.1901516593534966, "learning_rate": 2.973475872021183e-06, "loss": 0.4687, "step": 1617 }, { "epoch": 1.2820919175911252, "grad_norm": 1.2262141639250042, "learning_rate": 2.9676708241991e-06, "loss": 0.5112, "step": 1618 }, { "epoch": 1.2828843106180665, "grad_norm": 1.342513929510576, "learning_rate": 2.9618690561630064e-06, "loss": 0.544, "step": 1619 }, { "epoch": 1.2836767036450079, "grad_norm": 1.4110723218861667, "learning_rate": 2.9560705772758306e-06, "loss": 0.4175, "step": 1620 }, { "epoch": 1.2844690966719492, "grad_norm": 1.4184075817264434, "learning_rate": 2.950275396895195e-06, "loss": 0.4572, "step": 1621 }, { "epoch": 1.2852614896988905, "grad_norm": 1.4247354251098183, "learning_rate": 2.944483524373402e-06, "loss": 0.5425, "step": 1622 }, { "epoch": 1.2860538827258319, "grad_norm": 1.4170683614266557, "learning_rate": 2.9386949690574117e-06, "loss": 0.5206, "step": 1623 }, { "epoch": 1.2868462757527734, "grad_norm": 1.4555322588084911, "learning_rate": 2.932909740288833e-06, "loss": 0.4636, "step": 1624 }, { "epoch": 1.2876386687797148, "grad_norm": 1.3695407423716917, "learning_rate": 2.927127847403904e-06, "loss": 0.5168, "step": 1625 }, { "epoch": 1.2884310618066561, "grad_norm": 1.4539503298085785, "learning_rate": 2.9213492997334815e-06, "loss": 0.4593, "step": 1626 }, { "epoch": 1.2892234548335975, "grad_norm": 1.591928436317889, "learning_rate": 2.9155741066030218e-06, "loss": 0.5262, "step": 1627 }, { "epoch": 1.2900158478605388, "grad_norm": 1.4060358576796108, "learning_rate": 2.909802277332573e-06, "loss": 0.5021, "step": 1628 }, { "epoch": 1.2908082408874801, "grad_norm": 1.756329558304434, "learning_rate": 2.904033821236746e-06, "loss": 0.5904, "step": 1629 }, { "epoch": 1.2916006339144215, "grad_norm": 1.3531170736050961, "learning_rate": 2.898268747624716e-06, "loss": 0.4958, "step": 1630 }, { "epoch": 1.292393026941363, "grad_norm": 1.254002600927353, "learning_rate": 2.892507065800193e-06, "loss": 0.4796, "step": 1631 }, { "epoch": 1.2931854199683044, "grad_norm": 1.6906602772432433, "learning_rate": 2.8867487850614197e-06, "loss": 0.527, "step": 1632 }, { "epoch": 1.2939778129952457, "grad_norm": 1.1768466149382544, "learning_rate": 2.880993914701144e-06, "loss": 0.4594, "step": 1633 }, { "epoch": 1.294770206022187, "grad_norm": 9.844727835562791, "learning_rate": 2.8752424640066135e-06, "loss": 0.484, "step": 1634 }, { "epoch": 1.2955625990491284, "grad_norm": 1.4205834925057863, "learning_rate": 2.86949444225956e-06, "loss": 0.5074, "step": 1635 }, { "epoch": 1.2963549920760697, "grad_norm": 1.3481692180824643, "learning_rate": 2.8637498587361733e-06, "loss": 0.5321, "step": 1636 }, { "epoch": 1.297147385103011, "grad_norm": 1.655269975791016, "learning_rate": 2.858008722707104e-06, "loss": 0.55, "step": 1637 }, { "epoch": 1.2979397781299524, "grad_norm": 1.3411874682954108, "learning_rate": 2.85227104343743e-06, "loss": 0.4264, "step": 1638 }, { "epoch": 1.2987321711568938, "grad_norm": 1.2916548741386622, "learning_rate": 2.8465368301866604e-06, "loss": 0.4817, "step": 1639 }, { "epoch": 1.299524564183835, "grad_norm": 1.3567792850853433, "learning_rate": 2.8408060922087e-06, "loss": 0.5009, "step": 1640 }, { "epoch": 1.3003169572107764, "grad_norm": 1.386931671793817, "learning_rate": 2.835078838751856e-06, "loss": 0.5317, "step": 1641 }, { "epoch": 1.3011093502377178, "grad_norm": 1.2766655712922932, "learning_rate": 2.8293550790588033e-06, "loss": 0.5264, "step": 1642 }, { "epoch": 1.3019017432646594, "grad_norm": 1.3090259731863048, "learning_rate": 2.8236348223665855e-06, "loss": 0.4844, "step": 1643 }, { "epoch": 1.3026941362916007, "grad_norm": 1.3908159694435456, "learning_rate": 2.8179180779065853e-06, "loss": 0.5536, "step": 1644 }, { "epoch": 1.303486529318542, "grad_norm": 1.3580749273828403, "learning_rate": 2.812204854904528e-06, "loss": 0.5201, "step": 1645 }, { "epoch": 1.3042789223454834, "grad_norm": 2.259922547145398, "learning_rate": 2.8064951625804447e-06, "loss": 0.4614, "step": 1646 }, { "epoch": 1.3050713153724247, "grad_norm": 1.5266613716516737, "learning_rate": 2.800789010148679e-06, "loss": 0.4717, "step": 1647 }, { "epoch": 1.305863708399366, "grad_norm": 1.1185537372838212, "learning_rate": 2.795086406817852e-06, "loss": 0.508, "step": 1648 }, { "epoch": 1.3066561014263074, "grad_norm": 1.5330392438923375, "learning_rate": 2.7893873617908646e-06, "loss": 0.4872, "step": 1649 }, { "epoch": 1.3074484944532487, "grad_norm": 1.4607960274144973, "learning_rate": 2.78369188426487e-06, "loss": 0.4041, "step": 1650 }, { "epoch": 1.3082408874801903, "grad_norm": 1.43914106117787, "learning_rate": 2.777999983431269e-06, "loss": 0.5182, "step": 1651 }, { "epoch": 1.3090332805071316, "grad_norm": 1.20847366613828, "learning_rate": 2.7723116684756883e-06, "loss": 0.5991, "step": 1652 }, { "epoch": 1.309825673534073, "grad_norm": 1.2955698492031906, "learning_rate": 2.766626948577965e-06, "loss": 0.4912, "step": 1653 }, { "epoch": 1.3106180665610143, "grad_norm": 1.6339408010556575, "learning_rate": 2.76094583291214e-06, "loss": 0.519, "step": 1654 }, { "epoch": 1.3114104595879557, "grad_norm": 1.3530787832592377, "learning_rate": 2.7552683306464316e-06, "loss": 0.4967, "step": 1655 }, { "epoch": 1.312202852614897, "grad_norm": 1.3773451189409387, "learning_rate": 2.7495944509432325e-06, "loss": 0.4826, "step": 1656 }, { "epoch": 1.3129952456418383, "grad_norm": 1.4700975521818012, "learning_rate": 2.7439242029590836e-06, "loss": 0.4123, "step": 1657 }, { "epoch": 1.3137876386687797, "grad_norm": 1.400697551864994, "learning_rate": 2.738257595844671e-06, "loss": 0.5965, "step": 1658 }, { "epoch": 1.314580031695721, "grad_norm": 1.2695306908265072, "learning_rate": 2.7325946387447983e-06, "loss": 0.5012, "step": 1659 }, { "epoch": 1.3153724247226624, "grad_norm": 1.3514465586965083, "learning_rate": 2.7269353407983865e-06, "loss": 0.4718, "step": 1660 }, { "epoch": 1.3161648177496037, "grad_norm": 1.1828044399789235, "learning_rate": 2.7212797111384428e-06, "loss": 0.4512, "step": 1661 }, { "epoch": 1.316957210776545, "grad_norm": 1.3187731904877256, "learning_rate": 2.7156277588920633e-06, "loss": 0.572, "step": 1662 }, { "epoch": 1.3177496038034866, "grad_norm": 1.284118558105361, "learning_rate": 2.7099794931804014e-06, "loss": 0.4922, "step": 1663 }, { "epoch": 1.318541996830428, "grad_norm": 1.2359398152100853, "learning_rate": 2.7043349231186666e-06, "loss": 0.5426, "step": 1664 }, { "epoch": 1.3193343898573693, "grad_norm": 1.444584494867962, "learning_rate": 2.698694057816105e-06, "loss": 0.4751, "step": 1665 }, { "epoch": 1.3201267828843106, "grad_norm": 1.2336578747324527, "learning_rate": 2.6930569063759757e-06, "loss": 0.5308, "step": 1666 }, { "epoch": 1.320919175911252, "grad_norm": 1.2717717675257103, "learning_rate": 2.6874234778955555e-06, "loss": 0.4356, "step": 1667 }, { "epoch": 1.3217115689381933, "grad_norm": 1.9491784728339432, "learning_rate": 2.681793781466105e-06, "loss": 0.4789, "step": 1668 }, { "epoch": 1.3225039619651346, "grad_norm": 1.6359128415529267, "learning_rate": 2.676167826172868e-06, "loss": 0.4965, "step": 1669 }, { "epoch": 1.3232963549920762, "grad_norm": 1.549557871855656, "learning_rate": 2.670545621095043e-06, "loss": 0.5335, "step": 1670 }, { "epoch": 1.3240887480190175, "grad_norm": 1.3117708269704091, "learning_rate": 2.6649271753057858e-06, "loss": 0.4512, "step": 1671 }, { "epoch": 1.3248811410459589, "grad_norm": 1.3976434261354698, "learning_rate": 2.6593124978721763e-06, "loss": 0.4921, "step": 1672 }, { "epoch": 1.3256735340729002, "grad_norm": 1.4774690486910422, "learning_rate": 2.6537015978552207e-06, "loss": 0.4487, "step": 1673 }, { "epoch": 1.3264659270998416, "grad_norm": 1.5617624152302643, "learning_rate": 2.648094484309822e-06, "loss": 0.5443, "step": 1674 }, { "epoch": 1.327258320126783, "grad_norm": 1.8907557602741834, "learning_rate": 2.6424911662847806e-06, "loss": 0.541, "step": 1675 }, { "epoch": 1.3280507131537242, "grad_norm": 1.6733117607208499, "learning_rate": 2.636891652822764e-06, "loss": 0.5345, "step": 1676 }, { "epoch": 1.3288431061806656, "grad_norm": 1.200750218950656, "learning_rate": 2.6312959529603065e-06, "loss": 0.5105, "step": 1677 }, { "epoch": 1.329635499207607, "grad_norm": 1.2629336054635405, "learning_rate": 2.6257040757277815e-06, "loss": 0.4957, "step": 1678 }, { "epoch": 1.3304278922345483, "grad_norm": 2.1850517214307623, "learning_rate": 2.620116030149399e-06, "loss": 0.4343, "step": 1679 }, { "epoch": 1.3312202852614896, "grad_norm": 1.4753540220198806, "learning_rate": 2.6145318252431813e-06, "loss": 0.5031, "step": 1680 }, { "epoch": 1.332012678288431, "grad_norm": 1.3186559503142095, "learning_rate": 2.608951470020955e-06, "loss": 0.4865, "step": 1681 }, { "epoch": 1.3328050713153723, "grad_norm": 1.470476926451876, "learning_rate": 2.603374973488336e-06, "loss": 0.4763, "step": 1682 }, { "epoch": 1.3335974643423139, "grad_norm": 1.4010496663895873, "learning_rate": 2.5978023446447064e-06, "loss": 0.4769, "step": 1683 }, { "epoch": 1.3343898573692552, "grad_norm": 1.191159533430389, "learning_rate": 2.592233592483215e-06, "loss": 0.5007, "step": 1684 }, { "epoch": 1.3351822503961965, "grad_norm": 1.2096622880953216, "learning_rate": 2.586668725990746e-06, "loss": 0.5604, "step": 1685 }, { "epoch": 1.3359746434231379, "grad_norm": 1.2877005035402007, "learning_rate": 2.5811077541479213e-06, "loss": 0.5143, "step": 1686 }, { "epoch": 1.3367670364500792, "grad_norm": 1.3409606789772346, "learning_rate": 2.575550685929069e-06, "loss": 0.4572, "step": 1687 }, { "epoch": 1.3375594294770206, "grad_norm": 1.8734134985306645, "learning_rate": 2.5699975303022253e-06, "loss": 0.4754, "step": 1688 }, { "epoch": 1.338351822503962, "grad_norm": 1.306858538464615, "learning_rate": 2.564448296229105e-06, "loss": 0.4966, "step": 1689 }, { "epoch": 1.3391442155309035, "grad_norm": 1.273622324722472, "learning_rate": 2.5589029926651e-06, "loss": 0.4678, "step": 1690 }, { "epoch": 1.3399366085578448, "grad_norm": 1.2424985511307436, "learning_rate": 2.553361628559258e-06, "loss": 0.4723, "step": 1691 }, { "epoch": 1.3407290015847861, "grad_norm": 1.30743781598471, "learning_rate": 2.5478242128542684e-06, "loss": 0.4889, "step": 1692 }, { "epoch": 1.3415213946117275, "grad_norm": 1.2665848962580755, "learning_rate": 2.5422907544864463e-06, "loss": 0.5544, "step": 1693 }, { "epoch": 1.3423137876386688, "grad_norm": 1.648135448687766, "learning_rate": 2.5367612623857256e-06, "loss": 0.5139, "step": 1694 }, { "epoch": 1.3431061806656102, "grad_norm": 2.4445322802889984, "learning_rate": 2.531235745475633e-06, "loss": 0.5191, "step": 1695 }, { "epoch": 1.3438985736925515, "grad_norm": 1.4195946993026076, "learning_rate": 2.525714212673288e-06, "loss": 0.4414, "step": 1696 }, { "epoch": 1.3446909667194928, "grad_norm": 1.4298638596961866, "learning_rate": 2.5201966728893704e-06, "loss": 0.5116, "step": 1697 }, { "epoch": 1.3454833597464342, "grad_norm": 1.4274696113149987, "learning_rate": 2.5146831350281255e-06, "loss": 0.409, "step": 1698 }, { "epoch": 1.3462757527733755, "grad_norm": 1.2921134993060541, "learning_rate": 2.509173607987337e-06, "loss": 0.4978, "step": 1699 }, { "epoch": 1.3470681458003169, "grad_norm": 2.0027943068373584, "learning_rate": 2.503668100658312e-06, "loss": 0.5514, "step": 1700 }, { "epoch": 1.3478605388272582, "grad_norm": 1.3407170928484273, "learning_rate": 2.4981666219258777e-06, "loss": 0.4268, "step": 1701 }, { "epoch": 1.3486529318541998, "grad_norm": 1.3337738652752265, "learning_rate": 2.4926691806683516e-06, "loss": 0.5244, "step": 1702 }, { "epoch": 1.349445324881141, "grad_norm": 1.560650219932208, "learning_rate": 2.487175785757545e-06, "loss": 0.4895, "step": 1703 }, { "epoch": 1.3502377179080824, "grad_norm": 1.3696473589591731, "learning_rate": 2.48168644605873e-06, "loss": 0.4916, "step": 1704 }, { "epoch": 1.3510301109350238, "grad_norm": 1.2411934226079195, "learning_rate": 2.476201170430642e-06, "loss": 0.5393, "step": 1705 }, { "epoch": 1.3518225039619651, "grad_norm": 1.4974942902394672, "learning_rate": 2.4707199677254507e-06, "loss": 0.4284, "step": 1706 }, { "epoch": 1.3526148969889065, "grad_norm": 1.2666386328783703, "learning_rate": 2.46524284678876e-06, "loss": 0.5063, "step": 1707 }, { "epoch": 1.3534072900158478, "grad_norm": 1.4685761239954844, "learning_rate": 2.45976981645958e-06, "loss": 0.4674, "step": 1708 }, { "epoch": 1.3541996830427891, "grad_norm": 1.2767311193882507, "learning_rate": 2.4543008855703243e-06, "loss": 0.4853, "step": 1709 }, { "epoch": 1.3549920760697307, "grad_norm": 1.5987754609278155, "learning_rate": 2.4488360629467905e-06, "loss": 0.5305, "step": 1710 }, { "epoch": 1.355784469096672, "grad_norm": 1.474560637747107, "learning_rate": 2.4433753574081416e-06, "loss": 0.4748, "step": 1711 }, { "epoch": 1.3565768621236134, "grad_norm": 1.0600575885496626, "learning_rate": 2.437918777766902e-06, "loss": 0.5088, "step": 1712 }, { "epoch": 1.3573692551505547, "grad_norm": 1.5937473589478193, "learning_rate": 2.4324663328289334e-06, "loss": 0.5025, "step": 1713 }, { "epoch": 1.358161648177496, "grad_norm": 1.4114283507351446, "learning_rate": 2.427018031393427e-06, "loss": 0.5182, "step": 1714 }, { "epoch": 1.3589540412044374, "grad_norm": 1.384764621571333, "learning_rate": 2.4215738822528878e-06, "loss": 0.4975, "step": 1715 }, { "epoch": 1.3597464342313788, "grad_norm": 1.530404739332957, "learning_rate": 2.4161338941931205e-06, "loss": 0.5405, "step": 1716 }, { "epoch": 1.36053882725832, "grad_norm": 1.2719391821657662, "learning_rate": 2.410698075993209e-06, "loss": 0.5164, "step": 1717 }, { "epoch": 1.3613312202852614, "grad_norm": 1.303776785959559, "learning_rate": 2.4052664364255166e-06, "loss": 0.5131, "step": 1718 }, { "epoch": 1.3621236133122028, "grad_norm": 1.7627671373551173, "learning_rate": 2.3998389842556547e-06, "loss": 0.5169, "step": 1719 }, { "epoch": 1.3629160063391441, "grad_norm": 1.4542439710235313, "learning_rate": 2.394415728242483e-06, "loss": 0.4546, "step": 1720 }, { "epoch": 1.3637083993660855, "grad_norm": 1.5369539701571897, "learning_rate": 2.388996677138085e-06, "loss": 0.496, "step": 1721 }, { "epoch": 1.364500792393027, "grad_norm": 1.5825638073077801, "learning_rate": 2.3835818396877634e-06, "loss": 0.4815, "step": 1722 }, { "epoch": 1.3652931854199684, "grad_norm": 3.6667541476313956, "learning_rate": 2.378171224630015e-06, "loss": 0.4744, "step": 1723 }, { "epoch": 1.3660855784469097, "grad_norm": 1.202796038530065, "learning_rate": 2.3727648406965274e-06, "loss": 0.4733, "step": 1724 }, { "epoch": 1.366877971473851, "grad_norm": 1.6514090644224984, "learning_rate": 2.3673626966121564e-06, "loss": 0.4839, "step": 1725 }, { "epoch": 1.3676703645007924, "grad_norm": 1.8595407747154968, "learning_rate": 2.361964801094918e-06, "loss": 0.4977, "step": 1726 }, { "epoch": 1.3684627575277337, "grad_norm": 1.2396629974494258, "learning_rate": 2.3565711628559728e-06, "loss": 0.4148, "step": 1727 }, { "epoch": 1.369255150554675, "grad_norm": 1.3759843027874732, "learning_rate": 2.3511817905996056e-06, "loss": 0.5237, "step": 1728 }, { "epoch": 1.3700475435816164, "grad_norm": 1.2787312512484557, "learning_rate": 2.3457966930232233e-06, "loss": 0.469, "step": 1729 }, { "epoch": 1.370839936608558, "grad_norm": 1.2775957214233717, "learning_rate": 2.3404158788173286e-06, "loss": 0.5572, "step": 1730 }, { "epoch": 1.3716323296354993, "grad_norm": 1.5491627660277973, "learning_rate": 2.3350393566655173e-06, "loss": 0.4835, "step": 1731 }, { "epoch": 1.3724247226624406, "grad_norm": 1.4366151288499551, "learning_rate": 2.3296671352444522e-06, "loss": 0.5523, "step": 1732 }, { "epoch": 1.373217115689382, "grad_norm": 1.2305499359149168, "learning_rate": 2.3242992232238625e-06, "loss": 0.5599, "step": 1733 }, { "epoch": 1.3740095087163233, "grad_norm": 1.6525723370918182, "learning_rate": 2.3189356292665167e-06, "loss": 0.5063, "step": 1734 }, { "epoch": 1.3748019017432647, "grad_norm": 1.5505424309973006, "learning_rate": 2.31357636202822e-06, "loss": 0.5248, "step": 1735 }, { "epoch": 1.375594294770206, "grad_norm": 1.458986051145064, "learning_rate": 2.308221430157787e-06, "loss": 0.519, "step": 1736 }, { "epoch": 1.3763866877971473, "grad_norm": 1.7535948838309785, "learning_rate": 2.3028708422970493e-06, "loss": 0.4594, "step": 1737 }, { "epoch": 1.3771790808240887, "grad_norm": 1.3246309212997815, "learning_rate": 2.297524607080815e-06, "loss": 0.4743, "step": 1738 }, { "epoch": 1.37797147385103, "grad_norm": 1.1435567994937914, "learning_rate": 2.292182733136876e-06, "loss": 0.4825, "step": 1739 }, { "epoch": 1.3787638668779714, "grad_norm": 1.4281797199375246, "learning_rate": 2.2868452290859805e-06, "loss": 0.4431, "step": 1740 }, { "epoch": 1.3795562599049127, "grad_norm": 1.4921944141194308, "learning_rate": 2.2815121035418297e-06, "loss": 0.5296, "step": 1741 }, { "epoch": 1.3803486529318543, "grad_norm": 1.2814379701518341, "learning_rate": 2.2761833651110532e-06, "loss": 0.5155, "step": 1742 }, { "epoch": 1.3811410459587956, "grad_norm": 1.3940944638726263, "learning_rate": 2.270859022393205e-06, "loss": 0.5495, "step": 1743 }, { "epoch": 1.381933438985737, "grad_norm": 1.6226656434982054, "learning_rate": 2.2655390839807457e-06, "loss": 0.4658, "step": 1744 }, { "epoch": 1.3827258320126783, "grad_norm": 1.495555124842164, "learning_rate": 2.2602235584590225e-06, "loss": 0.5697, "step": 1745 }, { "epoch": 1.3835182250396196, "grad_norm": 1.329150894161747, "learning_rate": 2.2549124544062695e-06, "loss": 0.5216, "step": 1746 }, { "epoch": 1.384310618066561, "grad_norm": 1.2738593190214214, "learning_rate": 2.2496057803935766e-06, "loss": 0.516, "step": 1747 }, { "epoch": 1.3851030110935023, "grad_norm": 1.3152647277129015, "learning_rate": 2.244303544984892e-06, "loss": 0.4769, "step": 1748 }, { "epoch": 1.3858954041204439, "grad_norm": 1.3062334823523214, "learning_rate": 2.2390057567369943e-06, "loss": 0.5496, "step": 1749 }, { "epoch": 1.3866877971473852, "grad_norm": 1.381789755217059, "learning_rate": 2.2337124241994924e-06, "loss": 0.5223, "step": 1750 }, { "epoch": 1.3874801901743266, "grad_norm": 1.267397159383875, "learning_rate": 2.2284235559147966e-06, "loss": 0.5575, "step": 1751 }, { "epoch": 1.388272583201268, "grad_norm": 1.3908143509953261, "learning_rate": 2.22313916041812e-06, "loss": 0.5445, "step": 1752 }, { "epoch": 1.3890649762282092, "grad_norm": 1.4244725119839767, "learning_rate": 2.2178592462374514e-06, "loss": 0.4341, "step": 1753 }, { "epoch": 1.3898573692551506, "grad_norm": 8.045303927236654, "learning_rate": 2.212583821893554e-06, "loss": 0.3835, "step": 1754 }, { "epoch": 1.390649762282092, "grad_norm": 1.48501462745672, "learning_rate": 2.207312895899938e-06, "loss": 0.5091, "step": 1755 }, { "epoch": 1.3914421553090333, "grad_norm": 1.5485304648418032, "learning_rate": 2.2020464767628598e-06, "loss": 0.5039, "step": 1756 }, { "epoch": 1.3922345483359746, "grad_norm": 1.4878006517165252, "learning_rate": 2.196784572981302e-06, "loss": 0.4563, "step": 1757 }, { "epoch": 1.393026941362916, "grad_norm": 1.2366154207098947, "learning_rate": 2.1915271930469565e-06, "loss": 0.4709, "step": 1758 }, { "epoch": 1.3938193343898573, "grad_norm": 1.3644912672291634, "learning_rate": 2.186274345444218e-06, "loss": 0.5181, "step": 1759 }, { "epoch": 1.3946117274167986, "grad_norm": 1.8177714580145707, "learning_rate": 2.1810260386501666e-06, "loss": 0.4353, "step": 1760 }, { "epoch": 1.39540412044374, "grad_norm": 1.4353358174418105, "learning_rate": 2.1757822811345546e-06, "loss": 0.521, "step": 1761 }, { "epoch": 1.3961965134706815, "grad_norm": 1.3707549843365874, "learning_rate": 2.170543081359789e-06, "loss": 0.5076, "step": 1762 }, { "epoch": 1.3969889064976229, "grad_norm": 1.5381183200781627, "learning_rate": 2.1653084477809285e-06, "loss": 0.4171, "step": 1763 }, { "epoch": 1.3977812995245642, "grad_norm": 1.407533591916754, "learning_rate": 2.1600783888456534e-06, "loss": 0.4304, "step": 1764 }, { "epoch": 1.3985736925515055, "grad_norm": 1.1508373913182226, "learning_rate": 2.154852912994272e-06, "loss": 0.5687, "step": 1765 }, { "epoch": 1.3993660855784469, "grad_norm": 1.4186064755640861, "learning_rate": 2.1496320286596865e-06, "loss": 0.4719, "step": 1766 }, { "epoch": 1.4001584786053882, "grad_norm": 1.4981523097374436, "learning_rate": 2.144415744267398e-06, "loss": 0.4552, "step": 1767 }, { "epoch": 1.4009508716323296, "grad_norm": 1.4897370474397602, "learning_rate": 2.1392040682354757e-06, "loss": 0.5586, "step": 1768 }, { "epoch": 1.4017432646592711, "grad_norm": 1.5233345269755243, "learning_rate": 2.1339970089745605e-06, "loss": 0.3959, "step": 1769 }, { "epoch": 1.4025356576862125, "grad_norm": 1.4202540959785204, "learning_rate": 2.128794574887834e-06, "loss": 0.5517, "step": 1770 }, { "epoch": 1.4033280507131538, "grad_norm": 1.7096519894401618, "learning_rate": 2.1235967743710227e-06, "loss": 0.4611, "step": 1771 }, { "epoch": 1.4041204437400951, "grad_norm": 1.24379916289674, "learning_rate": 2.118403615812367e-06, "loss": 0.5259, "step": 1772 }, { "epoch": 1.4049128367670365, "grad_norm": 1.1460733189628514, "learning_rate": 2.11321510759262e-06, "loss": 0.5131, "step": 1773 }, { "epoch": 1.4057052297939778, "grad_norm": 1.291467536773707, "learning_rate": 2.1080312580850333e-06, "loss": 0.5068, "step": 1774 }, { "epoch": 1.4064976228209192, "grad_norm": 1.396526320553894, "learning_rate": 2.1028520756553326e-06, "loss": 0.4833, "step": 1775 }, { "epoch": 1.4072900158478605, "grad_norm": 1.1538073418806967, "learning_rate": 2.09767756866172e-06, "loss": 0.5907, "step": 1776 }, { "epoch": 1.4080824088748018, "grad_norm": 1.271946747527655, "learning_rate": 2.0925077454548443e-06, "loss": 0.5407, "step": 1777 }, { "epoch": 1.4088748019017432, "grad_norm": 1.7358089016775209, "learning_rate": 2.0873426143778043e-06, "loss": 0.4461, "step": 1778 }, { "epoch": 1.4096671949286845, "grad_norm": 1.3663186280664754, "learning_rate": 2.0821821837661167e-06, "loss": 0.5125, "step": 1779 }, { "epoch": 1.4104595879556259, "grad_norm": 1.294488014432374, "learning_rate": 2.0770264619477227e-06, "loss": 0.5087, "step": 1780 }, { "epoch": 1.4112519809825674, "grad_norm": 1.46809992925011, "learning_rate": 2.071875457242954e-06, "loss": 0.4632, "step": 1781 }, { "epoch": 1.4120443740095088, "grad_norm": 1.1423818288041083, "learning_rate": 2.0667291779645426e-06, "loss": 0.5337, "step": 1782 }, { "epoch": 1.4128367670364501, "grad_norm": 1.5745849735380184, "learning_rate": 2.0615876324175816e-06, "loss": 0.4504, "step": 1783 }, { "epoch": 1.4136291600633915, "grad_norm": 1.2102639431983686, "learning_rate": 2.0564508288995342e-06, "loss": 0.5225, "step": 1784 }, { "epoch": 1.4144215530903328, "grad_norm": 1.3954493763171552, "learning_rate": 2.0513187757002038e-06, "loss": 0.4981, "step": 1785 }, { "epoch": 1.4152139461172741, "grad_norm": 1.8069766294717113, "learning_rate": 2.046191481101734e-06, "loss": 0.5022, "step": 1786 }, { "epoch": 1.4160063391442155, "grad_norm": 1.7735243136910073, "learning_rate": 2.0410689533785837e-06, "loss": 0.5028, "step": 1787 }, { "epoch": 1.4167987321711568, "grad_norm": 1.4600979971641832, "learning_rate": 2.035951200797524e-06, "loss": 0.4778, "step": 1788 }, { "epoch": 1.4175911251980984, "grad_norm": 1.539152579943756, "learning_rate": 2.030838231617614e-06, "loss": 0.4969, "step": 1789 }, { "epoch": 1.4183835182250397, "grad_norm": 1.9059466986891906, "learning_rate": 2.0257300540901975e-06, "loss": 0.441, "step": 1790 }, { "epoch": 1.419175911251981, "grad_norm": 1.365183528705041, "learning_rate": 2.0206266764588873e-06, "loss": 0.5546, "step": 1791 }, { "epoch": 1.4199683042789224, "grad_norm": 1.3770223489058118, "learning_rate": 2.0155281069595428e-06, "loss": 0.5449, "step": 1792 }, { "epoch": 1.4207606973058637, "grad_norm": 1.2051891396536776, "learning_rate": 2.0104343538202725e-06, "loss": 0.5102, "step": 1793 }, { "epoch": 1.421553090332805, "grad_norm": 1.3829985449541435, "learning_rate": 2.005345425261405e-06, "loss": 0.4311, "step": 1794 }, { "epoch": 1.4223454833597464, "grad_norm": 1.413190553465145, "learning_rate": 2.0002613294954892e-06, "loss": 0.4504, "step": 1795 }, { "epoch": 1.4231378763866878, "grad_norm": 1.3474542221848382, "learning_rate": 1.9951820747272684e-06, "loss": 0.5294, "step": 1796 }, { "epoch": 1.423930269413629, "grad_norm": 1.8700124298483034, "learning_rate": 1.99010766915368e-06, "loss": 0.5249, "step": 1797 }, { "epoch": 1.4247226624405704, "grad_norm": 1.2421297038347543, "learning_rate": 1.98503812096383e-06, "loss": 0.4859, "step": 1798 }, { "epoch": 1.4255150554675118, "grad_norm": 1.7400142740545153, "learning_rate": 1.9799734383389907e-06, "loss": 0.5095, "step": 1799 }, { "epoch": 1.4263074484944531, "grad_norm": 1.4885309190356282, "learning_rate": 1.9749136294525766e-06, "loss": 0.506, "step": 1800 }, { "epoch": 1.4270998415213947, "grad_norm": 1.604936356829248, "learning_rate": 1.969858702470142e-06, "loss": 0.4643, "step": 1801 }, { "epoch": 1.427892234548336, "grad_norm": 1.2625995339192406, "learning_rate": 1.964808665549362e-06, "loss": 0.5712, "step": 1802 }, { "epoch": 1.4286846275752774, "grad_norm": 1.5390293718486523, "learning_rate": 1.959763526840016e-06, "loss": 0.5271, "step": 1803 }, { "epoch": 1.4294770206022187, "grad_norm": 1.3144963240725454, "learning_rate": 1.954723294483983e-06, "loss": 0.4236, "step": 1804 }, { "epoch": 1.43026941362916, "grad_norm": 1.366145994802287, "learning_rate": 1.9496879766152243e-06, "loss": 0.5281, "step": 1805 }, { "epoch": 1.4310618066561014, "grad_norm": 1.416184425696729, "learning_rate": 1.9446575813597645e-06, "loss": 0.5354, "step": 1806 }, { "epoch": 1.4318541996830427, "grad_norm": 1.435381053362953, "learning_rate": 1.9396321168356912e-06, "loss": 0.5818, "step": 1807 }, { "epoch": 1.4326465927099843, "grad_norm": 1.4699057623007132, "learning_rate": 1.934611591153132e-06, "loss": 0.4452, "step": 1808 }, { "epoch": 1.4334389857369256, "grad_norm": 1.401080029968082, "learning_rate": 1.929596012414241e-06, "loss": 0.5423, "step": 1809 }, { "epoch": 1.434231378763867, "grad_norm": 1.3568719378264413, "learning_rate": 1.924585388713195e-06, "loss": 0.5094, "step": 1810 }, { "epoch": 1.4350237717908083, "grad_norm": 1.503504580964693, "learning_rate": 1.9195797281361673e-06, "loss": 0.5242, "step": 1811 }, { "epoch": 1.4358161648177497, "grad_norm": 1.3689445178211126, "learning_rate": 1.9145790387613287e-06, "loss": 0.4662, "step": 1812 }, { "epoch": 1.436608557844691, "grad_norm": 1.5084422232831738, "learning_rate": 1.909583328658821e-06, "loss": 0.4211, "step": 1813 }, { "epoch": 1.4374009508716323, "grad_norm": 1.9328634285661004, "learning_rate": 1.9045926058907571e-06, "loss": 0.5227, "step": 1814 }, { "epoch": 1.4381933438985737, "grad_norm": 1.3706542843437763, "learning_rate": 1.8996068785111943e-06, "loss": 0.4782, "step": 1815 }, { "epoch": 1.438985736925515, "grad_norm": 1.8542621900690495, "learning_rate": 1.8946261545661342e-06, "loss": 0.4496, "step": 1816 }, { "epoch": 1.4397781299524564, "grad_norm": 1.6635856253213317, "learning_rate": 1.8896504420934992e-06, "loss": 0.5004, "step": 1817 }, { "epoch": 1.4405705229793977, "grad_norm": 1.4964127352210366, "learning_rate": 1.8846797491231277e-06, "loss": 0.4673, "step": 1818 }, { "epoch": 1.441362916006339, "grad_norm": 1.2476866341345296, "learning_rate": 1.8797140836767574e-06, "loss": 0.5574, "step": 1819 }, { "epoch": 1.4421553090332804, "grad_norm": 1.3756267662227974, "learning_rate": 1.8747534537680084e-06, "loss": 0.4781, "step": 1820 }, { "epoch": 1.442947702060222, "grad_norm": 1.3830408873702964, "learning_rate": 1.8697978674023803e-06, "loss": 0.4666, "step": 1821 }, { "epoch": 1.4437400950871633, "grad_norm": 1.5042961113577142, "learning_rate": 1.8648473325772276e-06, "loss": 0.4472, "step": 1822 }, { "epoch": 1.4445324881141046, "grad_norm": 1.232775401458302, "learning_rate": 1.859901857281759e-06, "loss": 0.4578, "step": 1823 }, { "epoch": 1.445324881141046, "grad_norm": 1.442397032019925, "learning_rate": 1.8549614494970108e-06, "loss": 0.5088, "step": 1824 }, { "epoch": 1.4461172741679873, "grad_norm": 1.4746318685017357, "learning_rate": 1.8500261171958483e-06, "loss": 0.4395, "step": 1825 }, { "epoch": 1.4469096671949286, "grad_norm": 1.337652324065384, "learning_rate": 1.8450958683429399e-06, "loss": 0.5626, "step": 1826 }, { "epoch": 1.44770206022187, "grad_norm": 1.5032506342986982, "learning_rate": 1.8401707108947542e-06, "loss": 0.4924, "step": 1827 }, { "epoch": 1.4484944532488115, "grad_norm": 1.520509554211489, "learning_rate": 1.8352506527995423e-06, "loss": 0.4808, "step": 1828 }, { "epoch": 1.4492868462757529, "grad_norm": 1.357024529627513, "learning_rate": 1.8303357019973284e-06, "loss": 0.4209, "step": 1829 }, { "epoch": 1.4500792393026942, "grad_norm": 1.4277272372001524, "learning_rate": 1.8254258664198875e-06, "loss": 0.4952, "step": 1830 }, { "epoch": 1.4508716323296356, "grad_norm": 1.3852756993470035, "learning_rate": 1.8205211539907481e-06, "loss": 0.5549, "step": 1831 }, { "epoch": 1.451664025356577, "grad_norm": 1.4121869856174858, "learning_rate": 1.8156215726251626e-06, "loss": 0.4989, "step": 1832 }, { "epoch": 1.4524564183835182, "grad_norm": 1.9253255070957396, "learning_rate": 1.8107271302301116e-06, "loss": 0.523, "step": 1833 }, { "epoch": 1.4532488114104596, "grad_norm": 1.2530329881686644, "learning_rate": 1.8058378347042744e-06, "loss": 0.5038, "step": 1834 }, { "epoch": 1.454041204437401, "grad_norm": 1.630669722806891, "learning_rate": 1.8009536939380285e-06, "loss": 0.5276, "step": 1835 }, { "epoch": 1.4548335974643423, "grad_norm": 1.5275888502472068, "learning_rate": 1.7960747158134345e-06, "loss": 0.4567, "step": 1836 }, { "epoch": 1.4556259904912836, "grad_norm": 1.3812861643412049, "learning_rate": 1.7912009082042158e-06, "loss": 0.4566, "step": 1837 }, { "epoch": 1.456418383518225, "grad_norm": 1.3374993013520815, "learning_rate": 1.786332278975757e-06, "loss": 0.5655, "step": 1838 }, { "epoch": 1.4572107765451663, "grad_norm": 1.1781434208219055, "learning_rate": 1.7814688359850813e-06, "loss": 0.5545, "step": 1839 }, { "epoch": 1.4580031695721076, "grad_norm": 1.3131246342788663, "learning_rate": 1.7766105870808464e-06, "loss": 0.4729, "step": 1840 }, { "epoch": 1.4587955625990492, "grad_norm": 1.5365756243909439, "learning_rate": 1.7717575401033239e-06, "loss": 0.5241, "step": 1841 }, { "epoch": 1.4595879556259905, "grad_norm": 2.2899396902568894, "learning_rate": 1.7669097028843952e-06, "loss": 0.4673, "step": 1842 }, { "epoch": 1.4603803486529319, "grad_norm": 1.5015934594749192, "learning_rate": 1.7620670832475284e-06, "loss": 0.5486, "step": 1843 }, { "epoch": 1.4611727416798732, "grad_norm": 1.4790104349849034, "learning_rate": 1.7572296890077767e-06, "loss": 0.537, "step": 1844 }, { "epoch": 1.4619651347068146, "grad_norm": 2.208533058180828, "learning_rate": 1.7523975279717564e-06, "loss": 0.5007, "step": 1845 }, { "epoch": 1.462757527733756, "grad_norm": 1.4296531004071216, "learning_rate": 1.7475706079376426e-06, "loss": 0.5042, "step": 1846 }, { "epoch": 1.4635499207606972, "grad_norm": 1.5784857592706965, "learning_rate": 1.7427489366951471e-06, "loss": 0.4917, "step": 1847 }, { "epoch": 1.4643423137876388, "grad_norm": 1.2037296812792189, "learning_rate": 1.7379325220255162e-06, "loss": 0.4585, "step": 1848 }, { "epoch": 1.4651347068145801, "grad_norm": 1.4446425674770267, "learning_rate": 1.7331213717015117e-06, "loss": 0.521, "step": 1849 }, { "epoch": 1.4659270998415215, "grad_norm": 1.3836841744343025, "learning_rate": 1.728315493487397e-06, "loss": 0.6364, "step": 1850 }, { "epoch": 1.4667194928684628, "grad_norm": 1.3600150929190626, "learning_rate": 1.7235148951389308e-06, "loss": 0.4823, "step": 1851 }, { "epoch": 1.4675118858954042, "grad_norm": 1.682168884233084, "learning_rate": 1.7187195844033493e-06, "loss": 0.4438, "step": 1852 }, { "epoch": 1.4683042789223455, "grad_norm": 1.5162805638699377, "learning_rate": 1.7139295690193581e-06, "loss": 0.5055, "step": 1853 }, { "epoch": 1.4690966719492868, "grad_norm": 1.35604622811064, "learning_rate": 1.709144856717112e-06, "loss": 0.4878, "step": 1854 }, { "epoch": 1.4698890649762282, "grad_norm": 1.4879877005759048, "learning_rate": 1.7043654552182138e-06, "loss": 0.5284, "step": 1855 }, { "epoch": 1.4706814580031695, "grad_norm": 1.4050231849588632, "learning_rate": 1.6995913722356889e-06, "loss": 0.5168, "step": 1856 }, { "epoch": 1.4714738510301109, "grad_norm": 1.3768714463735499, "learning_rate": 1.6948226154739872e-06, "loss": 0.5096, "step": 1857 }, { "epoch": 1.4722662440570522, "grad_norm": 1.3822675102236703, "learning_rate": 1.6900591926289555e-06, "loss": 0.5343, "step": 1858 }, { "epoch": 1.4730586370839935, "grad_norm": 1.3037856777389876, "learning_rate": 1.6853011113878404e-06, "loss": 0.5413, "step": 1859 }, { "epoch": 1.473851030110935, "grad_norm": 1.3925052589651918, "learning_rate": 1.6805483794292599e-06, "loss": 0.5223, "step": 1860 }, { "epoch": 1.4746434231378764, "grad_norm": 1.3336917441756846, "learning_rate": 1.675801004423208e-06, "loss": 0.4787, "step": 1861 }, { "epoch": 1.4754358161648178, "grad_norm": 1.7515918910895967, "learning_rate": 1.6710589940310252e-06, "loss": 0.544, "step": 1862 }, { "epoch": 1.4762282091917591, "grad_norm": 1.7717931759052858, "learning_rate": 1.6663223559054021e-06, "loss": 0.4588, "step": 1863 }, { "epoch": 1.4770206022187005, "grad_norm": 1.3357453495596165, "learning_rate": 1.6615910976903527e-06, "loss": 0.5928, "step": 1864 }, { "epoch": 1.4778129952456418, "grad_norm": 1.2112228889454668, "learning_rate": 1.6568652270212143e-06, "loss": 0.5091, "step": 1865 }, { "epoch": 1.4786053882725831, "grad_norm": 1.4427664572803562, "learning_rate": 1.6521447515246281e-06, "loss": 0.4653, "step": 1866 }, { "epoch": 1.4793977812995245, "grad_norm": 1.3859004425448886, "learning_rate": 1.6474296788185256e-06, "loss": 0.4582, "step": 1867 }, { "epoch": 1.480190174326466, "grad_norm": 1.6369622917249316, "learning_rate": 1.642720016512125e-06, "loss": 0.4868, "step": 1868 }, { "epoch": 1.4809825673534074, "grad_norm": 2.112996060800528, "learning_rate": 1.638015772205906e-06, "loss": 0.4831, "step": 1869 }, { "epoch": 1.4817749603803487, "grad_norm": 1.3461681897379034, "learning_rate": 1.6333169534916127e-06, "loss": 0.4698, "step": 1870 }, { "epoch": 1.48256735340729, "grad_norm": 1.440564818608394, "learning_rate": 1.6286235679522245e-06, "loss": 0.4567, "step": 1871 }, { "epoch": 1.4833597464342314, "grad_norm": 1.2611251209935996, "learning_rate": 1.6239356231619618e-06, "loss": 0.4642, "step": 1872 }, { "epoch": 1.4841521394611727, "grad_norm": 1.4601369822139945, "learning_rate": 1.6192531266862554e-06, "loss": 0.5113, "step": 1873 }, { "epoch": 1.484944532488114, "grad_norm": 1.4942138361519044, "learning_rate": 1.6145760860817544e-06, "loss": 0.4463, "step": 1874 }, { "epoch": 1.4857369255150554, "grad_norm": 1.6600975151512385, "learning_rate": 1.6099045088962929e-06, "loss": 0.5485, "step": 1875 }, { "epoch": 1.4865293185419968, "grad_norm": 1.3539574740274323, "learning_rate": 1.6052384026688944e-06, "loss": 0.4553, "step": 1876 }, { "epoch": 1.487321711568938, "grad_norm": 1.4556955393611117, "learning_rate": 1.6005777749297496e-06, "loss": 0.474, "step": 1877 }, { "epoch": 1.4881141045958794, "grad_norm": 1.2370773449552845, "learning_rate": 1.5959226332002126e-06, "loss": 0.5306, "step": 1878 }, { "epoch": 1.4889064976228208, "grad_norm": 1.4892877894862921, "learning_rate": 1.5912729849927776e-06, "loss": 0.4743, "step": 1879 }, { "epoch": 1.4896988906497624, "grad_norm": 1.2928064645622583, "learning_rate": 1.5866288378110805e-06, "loss": 0.4671, "step": 1880 }, { "epoch": 1.4904912836767037, "grad_norm": 1.5186036259850726, "learning_rate": 1.5819901991498731e-06, "loss": 0.5069, "step": 1881 }, { "epoch": 1.491283676703645, "grad_norm": 1.342626422753686, "learning_rate": 1.5773570764950218e-06, "loss": 0.5015, "step": 1882 }, { "epoch": 1.4920760697305864, "grad_norm": 1.555161240088715, "learning_rate": 1.5727294773234913e-06, "loss": 0.6038, "step": 1883 }, { "epoch": 1.4928684627575277, "grad_norm": 1.049372181555983, "learning_rate": 1.5681074091033289e-06, "loss": 0.5618, "step": 1884 }, { "epoch": 1.493660855784469, "grad_norm": 1.4061300551228142, "learning_rate": 1.5634908792936598e-06, "loss": 0.5, "step": 1885 }, { "epoch": 1.4944532488114104, "grad_norm": 1.4084879875065197, "learning_rate": 1.5588798953446681e-06, "loss": 0.5129, "step": 1886 }, { "epoch": 1.495245641838352, "grad_norm": 1.3328103266925386, "learning_rate": 1.5542744646975922e-06, "loss": 0.4412, "step": 1887 }, { "epoch": 1.4960380348652933, "grad_norm": 3.20176899501074, "learning_rate": 1.5496745947847025e-06, "loss": 0.5717, "step": 1888 }, { "epoch": 1.4968304278922346, "grad_norm": 1.2622548264699458, "learning_rate": 1.5450802930293025e-06, "loss": 0.5677, "step": 1889 }, { "epoch": 1.497622820919176, "grad_norm": 1.6187317338908367, "learning_rate": 1.5404915668457027e-06, "loss": 0.4757, "step": 1890 }, { "epoch": 1.4984152139461173, "grad_norm": 1.4148878601098733, "learning_rate": 1.5359084236392218e-06, "loss": 0.5704, "step": 1891 }, { "epoch": 1.4992076069730587, "grad_norm": 1.5022634155766241, "learning_rate": 1.5313308708061636e-06, "loss": 0.4788, "step": 1892 }, { "epoch": 1.5, "grad_norm": 1.35305200148449, "learning_rate": 1.5267589157338135e-06, "loss": 0.5008, "step": 1893 }, { "epoch": 1.5007923930269413, "grad_norm": 1.9141722140205364, "learning_rate": 1.522192565800425e-06, "loss": 0.4457, "step": 1894 }, { "epoch": 1.5015847860538827, "grad_norm": 1.3997069315623551, "learning_rate": 1.5176318283751985e-06, "loss": 0.4738, "step": 1895 }, { "epoch": 1.502377179080824, "grad_norm": 1.2445316329889375, "learning_rate": 1.5130767108182838e-06, "loss": 0.5454, "step": 1896 }, { "epoch": 1.5031695721077654, "grad_norm": 1.4668836262649976, "learning_rate": 1.5085272204807605e-06, "loss": 0.488, "step": 1897 }, { "epoch": 1.5039619651347067, "grad_norm": 1.4353559044748645, "learning_rate": 1.5039833647046225e-06, "loss": 0.5155, "step": 1898 }, { "epoch": 1.504754358161648, "grad_norm": 1.2926419394445614, "learning_rate": 1.499445150822776e-06, "loss": 0.5152, "step": 1899 }, { "epoch": 1.5055467511885894, "grad_norm": 1.3261702456847015, "learning_rate": 1.494912586159021e-06, "loss": 0.4668, "step": 1900 }, { "epoch": 1.506339144215531, "grad_norm": 1.5459148129338742, "learning_rate": 1.4903856780280356e-06, "loss": 0.4964, "step": 1901 }, { "epoch": 1.5071315372424723, "grad_norm": 1.1437568965678802, "learning_rate": 1.4858644337353784e-06, "loss": 0.5739, "step": 1902 }, { "epoch": 1.5079239302694136, "grad_norm": 1.4017005238999416, "learning_rate": 1.4813488605774585e-06, "loss": 0.5395, "step": 1903 }, { "epoch": 1.508716323296355, "grad_norm": 1.2573023269837786, "learning_rate": 1.47683896584154e-06, "loss": 0.4838, "step": 1904 }, { "epoch": 1.5095087163232963, "grad_norm": 1.3793125236351922, "learning_rate": 1.4723347568057178e-06, "loss": 0.5041, "step": 1905 }, { "epoch": 1.5103011093502379, "grad_norm": 1.3869573851369859, "learning_rate": 1.4678362407389163e-06, "loss": 0.4586, "step": 1906 }, { "epoch": 1.5110935023771792, "grad_norm": 1.7045609918728348, "learning_rate": 1.4633434249008672e-06, "loss": 0.4772, "step": 1907 }, { "epoch": 1.5118858954041206, "grad_norm": 1.5837192948575387, "learning_rate": 1.4588563165421083e-06, "loss": 0.4928, "step": 1908 }, { "epoch": 1.512678288431062, "grad_norm": 1.2702307808959108, "learning_rate": 1.4543749229039617e-06, "loss": 0.55, "step": 1909 }, { "epoch": 1.5134706814580032, "grad_norm": 1.4072399852672892, "learning_rate": 1.4498992512185305e-06, "loss": 0.4523, "step": 1910 }, { "epoch": 1.5142630744849446, "grad_norm": 14.6174060803734, "learning_rate": 1.4454293087086845e-06, "loss": 0.5191, "step": 1911 }, { "epoch": 1.515055467511886, "grad_norm": 1.4180447834460186, "learning_rate": 1.4409651025880428e-06, "loss": 0.4826, "step": 1912 }, { "epoch": 1.5158478605388273, "grad_norm": 1.39988254417761, "learning_rate": 1.436506640060973e-06, "loss": 0.5156, "step": 1913 }, { "epoch": 1.5166402535657686, "grad_norm": 1.6603043128671362, "learning_rate": 1.4320539283225677e-06, "loss": 0.5299, "step": 1914 }, { "epoch": 1.51743264659271, "grad_norm": 1.3242091518169683, "learning_rate": 1.4276069745586451e-06, "loss": 0.5169, "step": 1915 }, { "epoch": 1.5182250396196513, "grad_norm": 1.2965739610895788, "learning_rate": 1.4231657859457255e-06, "loss": 0.5199, "step": 1916 }, { "epoch": 1.5190174326465926, "grad_norm": 1.3501422783137087, "learning_rate": 1.4187303696510302e-06, "loss": 0.4387, "step": 1917 }, { "epoch": 1.519809825673534, "grad_norm": 2.22355328488113, "learning_rate": 1.4143007328324598e-06, "loss": 0.5293, "step": 1918 }, { "epoch": 1.5206022187004753, "grad_norm": 1.4652012390618894, "learning_rate": 1.4098768826385922e-06, "loss": 0.5169, "step": 1919 }, { "epoch": 1.5213946117274166, "grad_norm": 1.6878259308820216, "learning_rate": 1.405458826208666e-06, "loss": 0.4849, "step": 1920 }, { "epoch": 1.5221870047543582, "grad_norm": 1.1717968151257796, "learning_rate": 1.4010465706725706e-06, "loss": 0.574, "step": 1921 }, { "epoch": 1.5229793977812995, "grad_norm": 1.605317423036306, "learning_rate": 1.3966401231508297e-06, "loss": 0.5593, "step": 1922 }, { "epoch": 1.5237717908082409, "grad_norm": 1.1720786056379227, "learning_rate": 1.3922394907546e-06, "loss": 0.5541, "step": 1923 }, { "epoch": 1.5245641838351822, "grad_norm": 1.4893093594707414, "learning_rate": 1.3878446805856466e-06, "loss": 0.4595, "step": 1924 }, { "epoch": 1.5253565768621236, "grad_norm": 1.5337721896723036, "learning_rate": 1.3834556997363468e-06, "loss": 0.5371, "step": 1925 }, { "epoch": 1.5261489698890651, "grad_norm": 1.4307048939896987, "learning_rate": 1.3790725552896627e-06, "loss": 0.4393, "step": 1926 }, { "epoch": 1.5269413629160065, "grad_norm": 1.4989448131842427, "learning_rate": 1.3746952543191428e-06, "loss": 0.4923, "step": 1927 }, { "epoch": 1.5277337559429478, "grad_norm": 1.2893579680934038, "learning_rate": 1.370323803888905e-06, "loss": 0.4743, "step": 1928 }, { "epoch": 1.5285261489698891, "grad_norm": 1.523993242109271, "learning_rate": 1.365958211053622e-06, "loss": 0.4683, "step": 1929 }, { "epoch": 1.5293185419968305, "grad_norm": 1.2543129453931465, "learning_rate": 1.3615984828585177e-06, "loss": 0.5653, "step": 1930 }, { "epoch": 1.5301109350237718, "grad_norm": 1.4227754700207826, "learning_rate": 1.3572446263393479e-06, "loss": 0.4446, "step": 1931 }, { "epoch": 1.5309033280507132, "grad_norm": 1.3821713854784685, "learning_rate": 1.352896648522397e-06, "loss": 0.5389, "step": 1932 }, { "epoch": 1.5316957210776545, "grad_norm": 1.3535307188186503, "learning_rate": 1.3485545564244567e-06, "loss": 0.5067, "step": 1933 }, { "epoch": 1.5324881141045958, "grad_norm": 1.3408100979700965, "learning_rate": 1.344218357052826e-06, "loss": 0.5433, "step": 1934 }, { "epoch": 1.5332805071315372, "grad_norm": 1.2714443828092974, "learning_rate": 1.3398880574052892e-06, "loss": 0.521, "step": 1935 }, { "epoch": 1.5340729001584785, "grad_norm": 1.5816207357027545, "learning_rate": 1.3355636644701148e-06, "loss": 0.5737, "step": 1936 }, { "epoch": 1.5348652931854199, "grad_norm": 1.4409723308453046, "learning_rate": 1.3312451852260328e-06, "loss": 0.4455, "step": 1937 }, { "epoch": 1.5356576862123612, "grad_norm": 1.5874057726533548, "learning_rate": 1.3269326266422362e-06, "loss": 0.5268, "step": 1938 }, { "epoch": 1.5364500792393025, "grad_norm": 1.3626881802011643, "learning_rate": 1.3226259956783565e-06, "loss": 0.4843, "step": 1939 }, { "epoch": 1.537242472266244, "grad_norm": 1.1562204584767626, "learning_rate": 1.3183252992844647e-06, "loss": 0.5453, "step": 1940 }, { "epoch": 1.5380348652931854, "grad_norm": 1.3532987971596078, "learning_rate": 1.3140305444010537e-06, "loss": 0.4963, "step": 1941 }, { "epoch": 1.5388272583201268, "grad_norm": 1.3984817261087996, "learning_rate": 1.3097417379590239e-06, "loss": 0.4996, "step": 1942 }, { "epoch": 1.5396196513470681, "grad_norm": 1.4128214777819643, "learning_rate": 1.3054588868796807e-06, "loss": 0.4924, "step": 1943 }, { "epoch": 1.5404120443740095, "grad_norm": 1.2047679609729147, "learning_rate": 1.3011819980747164e-06, "loss": 0.5464, "step": 1944 }, { "epoch": 1.541204437400951, "grad_norm": 1.820592131295195, "learning_rate": 1.2969110784462036e-06, "loss": 0.4226, "step": 1945 }, { "epoch": 1.5419968304278924, "grad_norm": 1.422476229950954, "learning_rate": 1.292646134886577e-06, "loss": 0.4253, "step": 1946 }, { "epoch": 1.5427892234548337, "grad_norm": 2.02449505189463, "learning_rate": 1.288387174278633e-06, "loss": 0.436, "step": 1947 }, { "epoch": 1.543581616481775, "grad_norm": 1.4453297914654606, "learning_rate": 1.2841342034955073e-06, "loss": 0.4847, "step": 1948 }, { "epoch": 1.5443740095087164, "grad_norm": 1.165275864092856, "learning_rate": 1.2798872294006743e-06, "loss": 0.4788, "step": 1949 }, { "epoch": 1.5451664025356577, "grad_norm": 1.2372584005466627, "learning_rate": 1.2756462588479257e-06, "loss": 0.5802, "step": 1950 }, { "epoch": 1.545958795562599, "grad_norm": 1.46734981959213, "learning_rate": 1.2714112986813693e-06, "loss": 0.4507, "step": 1951 }, { "epoch": 1.5467511885895404, "grad_norm": 1.1965158821483508, "learning_rate": 1.267182355735409e-06, "loss": 0.5176, "step": 1952 }, { "epoch": 1.5475435816164818, "grad_norm": 1.1575517585118504, "learning_rate": 1.2629594368347436e-06, "loss": 0.5385, "step": 1953 }, { "epoch": 1.548335974643423, "grad_norm": 1.4502244195781573, "learning_rate": 1.2587425487943434e-06, "loss": 0.4409, "step": 1954 }, { "epoch": 1.5491283676703644, "grad_norm": 1.4233976132475006, "learning_rate": 1.2545316984194528e-06, "loss": 0.4653, "step": 1955 }, { "epoch": 1.5499207606973058, "grad_norm": 1.4358673406446456, "learning_rate": 1.250326892505566e-06, "loss": 0.5257, "step": 1956 }, { "epoch": 1.5507131537242471, "grad_norm": 1.338319831219426, "learning_rate": 1.2461281378384287e-06, "loss": 0.5093, "step": 1957 }, { "epoch": 1.5515055467511885, "grad_norm": 1.4492950595572722, "learning_rate": 1.2419354411940187e-06, "loss": 0.5202, "step": 1958 }, { "epoch": 1.5522979397781298, "grad_norm": 1.2936895213686752, "learning_rate": 1.2377488093385342e-06, "loss": 0.4702, "step": 1959 }, { "epoch": 1.5530903328050714, "grad_norm": 1.2126757232731182, "learning_rate": 1.2335682490283924e-06, "loss": 0.5334, "step": 1960 }, { "epoch": 1.5538827258320127, "grad_norm": 1.296581770201938, "learning_rate": 1.2293937670102052e-06, "loss": 0.4633, "step": 1961 }, { "epoch": 1.554675118858954, "grad_norm": 1.5550820696190482, "learning_rate": 1.2252253700207816e-06, "loss": 0.5486, "step": 1962 }, { "epoch": 1.5554675118858954, "grad_norm": 1.377746305149134, "learning_rate": 1.2210630647871046e-06, "loss": 0.5421, "step": 1963 }, { "epoch": 1.5562599049128367, "grad_norm": 1.395455043232548, "learning_rate": 1.2169068580263315e-06, "loss": 0.478, "step": 1964 }, { "epoch": 1.5570522979397783, "grad_norm": 1.4020810428616433, "learning_rate": 1.2127567564457753e-06, "loss": 0.5018, "step": 1965 }, { "epoch": 1.5578446909667196, "grad_norm": 1.251594588085136, "learning_rate": 1.2086127667428982e-06, "loss": 0.5026, "step": 1966 }, { "epoch": 1.558637083993661, "grad_norm": 1.4192697081404184, "learning_rate": 1.2044748956052944e-06, "loss": 0.5061, "step": 1967 }, { "epoch": 1.5594294770206023, "grad_norm": 1.3313320011850855, "learning_rate": 1.2003431497106904e-06, "loss": 0.5476, "step": 1968 }, { "epoch": 1.5602218700475436, "grad_norm": 1.3604888416136887, "learning_rate": 1.1962175357269218e-06, "loss": 0.5135, "step": 1969 }, { "epoch": 1.561014263074485, "grad_norm": 1.3037983923927792, "learning_rate": 1.1920980603119342e-06, "loss": 0.5025, "step": 1970 }, { "epoch": 1.5618066561014263, "grad_norm": 1.4901567765704362, "learning_rate": 1.1879847301137599e-06, "loss": 0.4947, "step": 1971 }, { "epoch": 1.5625990491283677, "grad_norm": 1.4976721837089328, "learning_rate": 1.1838775517705215e-06, "loss": 0.431, "step": 1972 }, { "epoch": 1.563391442155309, "grad_norm": 1.2792058437342637, "learning_rate": 1.1797765319104065e-06, "loss": 0.5006, "step": 1973 }, { "epoch": 1.5641838351822503, "grad_norm": 1.485751347711323, "learning_rate": 1.1756816771516695e-06, "loss": 0.4859, "step": 1974 }, { "epoch": 1.5649762282091917, "grad_norm": 1.3070612727102997, "learning_rate": 1.171592994102615e-06, "loss": 0.4771, "step": 1975 }, { "epoch": 1.565768621236133, "grad_norm": 1.7705491442808756, "learning_rate": 1.1675104893615824e-06, "loss": 0.4822, "step": 1976 }, { "epoch": 1.5665610142630744, "grad_norm": 1.4806790950369961, "learning_rate": 1.1634341695169481e-06, "loss": 0.4716, "step": 1977 }, { "epoch": 1.5673534072900157, "grad_norm": 1.495252860205128, "learning_rate": 1.1593640411471002e-06, "loss": 0.4437, "step": 1978 }, { "epoch": 1.568145800316957, "grad_norm": 1.4691335503500114, "learning_rate": 1.1553001108204416e-06, "loss": 0.4718, "step": 1979 }, { "epoch": 1.5689381933438986, "grad_norm": 1.5559835754974911, "learning_rate": 1.1512423850953657e-06, "loss": 0.5091, "step": 1980 }, { "epoch": 1.56973058637084, "grad_norm": 1.4099999589221899, "learning_rate": 1.1471908705202594e-06, "loss": 0.5142, "step": 1981 }, { "epoch": 1.5705229793977813, "grad_norm": 1.3914051195745747, "learning_rate": 1.1431455736334817e-06, "loss": 0.5385, "step": 1982 }, { "epoch": 1.5713153724247226, "grad_norm": 1.4804001212434486, "learning_rate": 1.1391065009633601e-06, "loss": 0.5223, "step": 1983 }, { "epoch": 1.572107765451664, "grad_norm": 1.4789432182590372, "learning_rate": 1.1350736590281746e-06, "loss": 0.5315, "step": 1984 }, { "epoch": 1.5729001584786055, "grad_norm": 5.626552850122411, "learning_rate": 1.1310470543361523e-06, "loss": 0.4639, "step": 1985 }, { "epoch": 1.5736925515055469, "grad_norm": 1.3332538668197333, "learning_rate": 1.1270266933854556e-06, "loss": 0.4679, "step": 1986 }, { "epoch": 1.5744849445324882, "grad_norm": 1.4353870867834715, "learning_rate": 1.1230125826641664e-06, "loss": 0.4316, "step": 1987 }, { "epoch": 1.5752773375594296, "grad_norm": 1.3282469151046075, "learning_rate": 1.1190047286502837e-06, "loss": 0.5137, "step": 1988 }, { "epoch": 1.576069730586371, "grad_norm": 1.3739373915935746, "learning_rate": 1.1150031378117093e-06, "loss": 0.5332, "step": 1989 }, { "epoch": 1.5768621236133122, "grad_norm": 1.3333610143568608, "learning_rate": 1.1110078166062348e-06, "loss": 0.4715, "step": 1990 }, { "epoch": 1.5776545166402536, "grad_norm": 1.842533139708374, "learning_rate": 1.107018771481536e-06, "loss": 0.4882, "step": 1991 }, { "epoch": 1.578446909667195, "grad_norm": 1.1999637694824863, "learning_rate": 1.1030360088751613e-06, "loss": 0.5572, "step": 1992 }, { "epoch": 1.5792393026941363, "grad_norm": 1.3594135432042036, "learning_rate": 1.0990595352145156e-06, "loss": 0.4524, "step": 1993 }, { "epoch": 1.5800316957210776, "grad_norm": 1.449290287581183, "learning_rate": 1.0950893569168613e-06, "loss": 0.5178, "step": 1994 }, { "epoch": 1.580824088748019, "grad_norm": 1.38451035779332, "learning_rate": 1.0911254803892952e-06, "loss": 0.566, "step": 1995 }, { "epoch": 1.5816164817749603, "grad_norm": 1.5139016178962181, "learning_rate": 1.0871679120287487e-06, "loss": 0.5524, "step": 1996 }, { "epoch": 1.5824088748019016, "grad_norm": 1.4806889896011346, "learning_rate": 1.0832166582219694e-06, "loss": 0.4718, "step": 1997 }, { "epoch": 1.583201267828843, "grad_norm": 1.3264702242621367, "learning_rate": 1.0792717253455187e-06, "loss": 0.4154, "step": 1998 }, { "epoch": 1.5839936608557845, "grad_norm": 1.229131590548316, "learning_rate": 1.0753331197657517e-06, "loss": 0.46, "step": 1999 }, { "epoch": 1.5847860538827259, "grad_norm": 1.441153967435657, "learning_rate": 1.0714008478388193e-06, "loss": 0.4725, "step": 2000 }, { "epoch": 1.5855784469096672, "grad_norm": 1.3660862206531146, "learning_rate": 1.067474915910644e-06, "loss": 0.5655, "step": 2001 }, { "epoch": 1.5863708399366085, "grad_norm": 1.2035078535709274, "learning_rate": 1.0635553303169216e-06, "loss": 0.5222, "step": 2002 }, { "epoch": 1.5871632329635499, "grad_norm": 2.7303289451094677, "learning_rate": 1.0596420973831061e-06, "loss": 0.4916, "step": 2003 }, { "epoch": 1.5879556259904914, "grad_norm": 1.2869982882832727, "learning_rate": 1.0557352234243956e-06, "loss": 0.5191, "step": 2004 }, { "epoch": 1.5887480190174328, "grad_norm": 1.2413191313241974, "learning_rate": 1.051834714745731e-06, "loss": 0.5338, "step": 2005 }, { "epoch": 1.5895404120443741, "grad_norm": 1.3642625422143708, "learning_rate": 1.0479405776417762e-06, "loss": 0.5011, "step": 2006 }, { "epoch": 1.5903328050713155, "grad_norm": 1.6076800668669677, "learning_rate": 1.0440528183969173e-06, "loss": 0.4696, "step": 2007 }, { "epoch": 1.5911251980982568, "grad_norm": 1.3324707066410035, "learning_rate": 1.0401714432852422e-06, "loss": 0.4761, "step": 2008 }, { "epoch": 1.5919175911251982, "grad_norm": 1.231560290478674, "learning_rate": 1.0362964585705432e-06, "loss": 0.5079, "step": 2009 }, { "epoch": 1.5927099841521395, "grad_norm": 1.4853444362416295, "learning_rate": 1.0324278705062902e-06, "loss": 0.4928, "step": 2010 }, { "epoch": 1.5935023771790808, "grad_norm": 1.29407452237368, "learning_rate": 1.0285656853356418e-06, "loss": 0.4924, "step": 2011 }, { "epoch": 1.5942947702060222, "grad_norm": 1.878919694710146, "learning_rate": 1.0247099092914136e-06, "loss": 0.5191, "step": 2012 }, { "epoch": 1.5950871632329635, "grad_norm": 1.4423161071299941, "learning_rate": 1.0208605485960843e-06, "loss": 0.4841, "step": 2013 }, { "epoch": 1.5958795562599049, "grad_norm": 1.4757730423490873, "learning_rate": 1.017017609461775e-06, "loss": 0.5164, "step": 2014 }, { "epoch": 1.5966719492868462, "grad_norm": 1.7707712923674392, "learning_rate": 1.0131810980902473e-06, "loss": 0.5036, "step": 2015 }, { "epoch": 1.5974643423137875, "grad_norm": 1.2826877665230916, "learning_rate": 1.0093510206728862e-06, "loss": 0.5133, "step": 2016 }, { "epoch": 1.5982567353407289, "grad_norm": 1.2735870902172186, "learning_rate": 1.005527383390698e-06, "loss": 0.4727, "step": 2017 }, { "epoch": 1.5990491283676702, "grad_norm": 1.6016089416631372, "learning_rate": 1.0017101924142914e-06, "loss": 0.5231, "step": 2018 }, { "epoch": 1.5998415213946118, "grad_norm": 1.8033895537828986, "learning_rate": 9.97899453903874e-07, "loss": 0.4737, "step": 2019 }, { "epoch": 1.6006339144215531, "grad_norm": 1.3603894542065078, "learning_rate": 9.94095174009242e-07, "loss": 0.4931, "step": 2020 }, { "epoch": 1.6014263074484945, "grad_norm": 1.3661039369456853, "learning_rate": 9.902973588697646e-07, "loss": 0.544, "step": 2021 }, { "epoch": 1.6022187004754358, "grad_norm": 1.5384640457540872, "learning_rate": 9.865060146143824e-07, "loss": 0.5012, "step": 2022 }, { "epoch": 1.6030110935023771, "grad_norm": 4.1121625090639, "learning_rate": 9.8272114736159e-07, "loss": 0.5195, "step": 2023 }, { "epoch": 1.6038034865293187, "grad_norm": 1.1890348194822797, "learning_rate": 9.78942763219432e-07, "loss": 0.5104, "step": 2024 }, { "epoch": 1.60459587955626, "grad_norm": 1.47046786343687, "learning_rate": 9.751708682854871e-07, "loss": 0.562, "step": 2025 }, { "epoch": 1.6053882725832014, "grad_norm": 2.105181368564849, "learning_rate": 9.714054686468666e-07, "loss": 0.4836, "step": 2026 }, { "epoch": 1.6061806656101427, "grad_norm": 1.7078097033800033, "learning_rate": 9.676465703801941e-07, "loss": 0.478, "step": 2027 }, { "epoch": 1.606973058637084, "grad_norm": 1.428996530035106, "learning_rate": 9.638941795516067e-07, "loss": 0.4573, "step": 2028 }, { "epoch": 1.6077654516640254, "grad_norm": 1.3542394143535434, "learning_rate": 9.601483022167335e-07, "loss": 0.5159, "step": 2029 }, { "epoch": 1.6085578446909667, "grad_norm": 1.7586702317678933, "learning_rate": 9.564089444206998e-07, "loss": 0.54, "step": 2030 }, { "epoch": 1.609350237717908, "grad_norm": 1.423494747784008, "learning_rate": 9.526761121981015e-07, "loss": 0.4238, "step": 2031 }, { "epoch": 1.6101426307448494, "grad_norm": 2.4485119232100496, "learning_rate": 9.489498115730089e-07, "loss": 0.4623, "step": 2032 }, { "epoch": 1.6109350237717908, "grad_norm": 1.3256228333478717, "learning_rate": 9.45230048558951e-07, "loss": 0.4824, "step": 2033 }, { "epoch": 1.611727416798732, "grad_norm": 1.5683312893627543, "learning_rate": 9.415168291589056e-07, "loss": 0.4967, "step": 2034 }, { "epoch": 1.6125198098256734, "grad_norm": 1.3967649025868953, "learning_rate": 9.378101593652883e-07, "loss": 0.5749, "step": 2035 }, { "epoch": 1.6133122028526148, "grad_norm": 1.4937719292910694, "learning_rate": 9.341100451599483e-07, "loss": 0.5085, "step": 2036 }, { "epoch": 1.6141045958795561, "grad_norm": 1.6087863732276526, "learning_rate": 9.304164925141545e-07, "loss": 0.4904, "step": 2037 }, { "epoch": 1.6148969889064975, "grad_norm": 1.2810150195209964, "learning_rate": 9.267295073885835e-07, "loss": 0.4609, "step": 2038 }, { "epoch": 1.615689381933439, "grad_norm": 1.3811231524716925, "learning_rate": 9.230490957333194e-07, "loss": 0.4402, "step": 2039 }, { "epoch": 1.6164817749603804, "grad_norm": 1.3185921448033513, "learning_rate": 9.193752634878306e-07, "loss": 0.5126, "step": 2040 }, { "epoch": 1.6172741679873217, "grad_norm": 1.4562992001721649, "learning_rate": 9.157080165809745e-07, "loss": 0.4774, "step": 2041 }, { "epoch": 1.618066561014263, "grad_norm": 1.3910695524142715, "learning_rate": 9.120473609309755e-07, "loss": 0.5274, "step": 2042 }, { "epoch": 1.6188589540412044, "grad_norm": 2.6201350775990204, "learning_rate": 9.08393302445425e-07, "loss": 0.5291, "step": 2043 }, { "epoch": 1.619651347068146, "grad_norm": 1.2118874231666819, "learning_rate": 9.04745847021265e-07, "loss": 0.4991, "step": 2044 }, { "epoch": 1.6204437400950873, "grad_norm": 1.2412771228484814, "learning_rate": 9.011050005447847e-07, "loss": 0.4893, "step": 2045 }, { "epoch": 1.6212361331220286, "grad_norm": 1.4691478480925184, "learning_rate": 8.974707688916029e-07, "loss": 0.4714, "step": 2046 }, { "epoch": 1.62202852614897, "grad_norm": 1.5261939981075683, "learning_rate": 8.938431579266693e-07, "loss": 0.5562, "step": 2047 }, { "epoch": 1.6228209191759113, "grad_norm": 1.3773298220362464, "learning_rate": 8.902221735042427e-07, "loss": 0.4904, "step": 2048 }, { "epoch": 1.6236133122028527, "grad_norm": 1.2676276677467042, "learning_rate": 8.866078214678931e-07, "loss": 0.5163, "step": 2049 }, { "epoch": 1.624405705229794, "grad_norm": 1.1187794471574257, "learning_rate": 8.830001076504863e-07, "loss": 0.5121, "step": 2050 }, { "epoch": 1.6251980982567353, "grad_norm": 1.2502564782218113, "learning_rate": 8.793990378741712e-07, "loss": 0.4798, "step": 2051 }, { "epoch": 1.6259904912836767, "grad_norm": 1.5751272506807983, "learning_rate": 8.75804617950381e-07, "loss": 0.5165, "step": 2052 }, { "epoch": 1.626782884310618, "grad_norm": 2.4924485424774083, "learning_rate": 8.722168536798109e-07, "loss": 0.4932, "step": 2053 }, { "epoch": 1.6275752773375594, "grad_norm": 1.6983033602422986, "learning_rate": 8.686357508524207e-07, "loss": 0.489, "step": 2054 }, { "epoch": 1.6283676703645007, "grad_norm": 1.2666252054073726, "learning_rate": 8.650613152474141e-07, "loss": 0.5248, "step": 2055 }, { "epoch": 1.629160063391442, "grad_norm": 1.313599194977584, "learning_rate": 8.614935526332407e-07, "loss": 0.4851, "step": 2056 }, { "epoch": 1.6299524564183834, "grad_norm": 1.2026102615017729, "learning_rate": 8.579324687675788e-07, "loss": 0.531, "step": 2057 }, { "epoch": 1.6307448494453247, "grad_norm": 1.4610678642452748, "learning_rate": 8.543780693973292e-07, "loss": 0.4725, "step": 2058 }, { "epoch": 1.6315372424722663, "grad_norm": 1.88289363284461, "learning_rate": 8.508303602586026e-07, "loss": 0.4963, "step": 2059 }, { "epoch": 1.6323296354992076, "grad_norm": 1.3781051725767661, "learning_rate": 8.472893470767174e-07, "loss": 0.481, "step": 2060 }, { "epoch": 1.633122028526149, "grad_norm": 1.5526297736183563, "learning_rate": 8.437550355661811e-07, "loss": 0.4549, "step": 2061 }, { "epoch": 1.6339144215530903, "grad_norm": 1.2616213466445838, "learning_rate": 8.402274314306913e-07, "loss": 0.496, "step": 2062 }, { "epoch": 1.6347068145800316, "grad_norm": 1.3257957185064684, "learning_rate": 8.367065403631153e-07, "loss": 0.5939, "step": 2063 }, { "epoch": 1.6354992076069732, "grad_norm": 1.4923591693913194, "learning_rate": 8.331923680454928e-07, "loss": 0.5108, "step": 2064 }, { "epoch": 1.6362916006339145, "grad_norm": 1.2898699968088863, "learning_rate": 8.296849201490153e-07, "loss": 0.5098, "step": 2065 }, { "epoch": 1.6370839936608559, "grad_norm": 1.2062965131469185, "learning_rate": 8.261842023340261e-07, "loss": 0.5155, "step": 2066 }, { "epoch": 1.6378763866877972, "grad_norm": 1.3281407973070765, "learning_rate": 8.226902202500076e-07, "loss": 0.5438, "step": 2067 }, { "epoch": 1.6386687797147386, "grad_norm": 1.617112564047401, "learning_rate": 8.192029795355683e-07, "loss": 0.4021, "step": 2068 }, { "epoch": 1.63946117274168, "grad_norm": 1.690667585761148, "learning_rate": 8.15722485818442e-07, "loss": 0.5001, "step": 2069 }, { "epoch": 1.6402535657686212, "grad_norm": 1.3699577837872892, "learning_rate": 8.122487447154703e-07, "loss": 0.4484, "step": 2070 }, { "epoch": 1.6410459587955626, "grad_norm": 1.3951271982014504, "learning_rate": 8.087817618326005e-07, "loss": 0.5119, "step": 2071 }, { "epoch": 1.641838351822504, "grad_norm": 1.5788753991911932, "learning_rate": 8.053215427648708e-07, "loss": 0.4646, "step": 2072 }, { "epoch": 1.6426307448494453, "grad_norm": 1.4924006512607348, "learning_rate": 8.018680930964062e-07, "loss": 0.4856, "step": 2073 }, { "epoch": 1.6434231378763866, "grad_norm": 1.721341586263957, "learning_rate": 7.984214184004047e-07, "loss": 0.4934, "step": 2074 }, { "epoch": 1.644215530903328, "grad_norm": 1.298803775033887, "learning_rate": 7.949815242391346e-07, "loss": 0.4172, "step": 2075 }, { "epoch": 1.6450079239302693, "grad_norm": 1.5387049928701046, "learning_rate": 7.915484161639164e-07, "loss": 0.4365, "step": 2076 }, { "epoch": 1.6458003169572106, "grad_norm": 1.409139584786502, "learning_rate": 7.88122099715124e-07, "loss": 0.4742, "step": 2077 }, { "epoch": 1.6465927099841522, "grad_norm": 1.0876326377219203, "learning_rate": 7.847025804221702e-07, "loss": 0.5661, "step": 2078 }, { "epoch": 1.6473851030110935, "grad_norm": 1.350054307958026, "learning_rate": 7.812898638034944e-07, "loss": 0.4931, "step": 2079 }, { "epoch": 1.6481774960380349, "grad_norm": 1.3297699549840936, "learning_rate": 7.778839553665623e-07, "loss": 0.4166, "step": 2080 }, { "epoch": 1.6489698890649762, "grad_norm": 1.9906779473156369, "learning_rate": 7.744848606078525e-07, "loss": 0.4831, "step": 2081 }, { "epoch": 1.6497622820919176, "grad_norm": 1.55089245257458, "learning_rate": 7.710925850128437e-07, "loss": 0.6036, "step": 2082 }, { "epoch": 1.6505546751188591, "grad_norm": 1.2924687083914712, "learning_rate": 7.677071340560127e-07, "loss": 0.4732, "step": 2083 }, { "epoch": 1.6513470681458005, "grad_norm": 1.255203299614932, "learning_rate": 7.643285132008243e-07, "loss": 0.5321, "step": 2084 }, { "epoch": 1.6521394611727418, "grad_norm": 1.4752105162159985, "learning_rate": 7.60956727899716e-07, "loss": 0.4406, "step": 2085 }, { "epoch": 1.6529318541996831, "grad_norm": 1.6978604739780374, "learning_rate": 7.575917835940988e-07, "loss": 0.5131, "step": 2086 }, { "epoch": 1.6537242472266245, "grad_norm": 1.2877324339716785, "learning_rate": 7.542336857143395e-07, "loss": 0.4976, "step": 2087 }, { "epoch": 1.6545166402535658, "grad_norm": 1.2637691807074454, "learning_rate": 7.508824396797609e-07, "loss": 0.555, "step": 2088 }, { "epoch": 1.6553090332805072, "grad_norm": 1.4040174876460723, "learning_rate": 7.475380508986224e-07, "loss": 0.4566, "step": 2089 }, { "epoch": 1.6561014263074485, "grad_norm": 1.269902405606831, "learning_rate": 7.442005247681234e-07, "loss": 0.4642, "step": 2090 }, { "epoch": 1.6568938193343898, "grad_norm": 1.393876275450546, "learning_rate": 7.40869866674383e-07, "loss": 0.433, "step": 2091 }, { "epoch": 1.6576862123613312, "grad_norm": 1.2865552865142709, "learning_rate": 7.375460819924401e-07, "loss": 0.4345, "step": 2092 }, { "epoch": 1.6584786053882725, "grad_norm": 1.2875952268458162, "learning_rate": 7.342291760862386e-07, "loss": 0.5307, "step": 2093 }, { "epoch": 1.6592709984152139, "grad_norm": 1.421883492001532, "learning_rate": 7.30919154308623e-07, "loss": 0.4147, "step": 2094 }, { "epoch": 1.6600633914421552, "grad_norm": 1.3766128472056973, "learning_rate": 7.276160220013301e-07, "loss": 0.5466, "step": 2095 }, { "epoch": 1.6608557844690965, "grad_norm": 1.4215400200488515, "learning_rate": 7.243197844949734e-07, "loss": 0.5243, "step": 2096 }, { "epoch": 1.6616481774960379, "grad_norm": 1.6591747985400214, "learning_rate": 7.210304471090446e-07, "loss": 0.4798, "step": 2097 }, { "epoch": 1.6624405705229794, "grad_norm": 1.3269970403205613, "learning_rate": 7.177480151518951e-07, "loss": 0.5143, "step": 2098 }, { "epoch": 1.6632329635499208, "grad_norm": 1.3241077877686331, "learning_rate": 7.144724939207376e-07, "loss": 0.4645, "step": 2099 }, { "epoch": 1.6640253565768621, "grad_norm": 1.4142206113853795, "learning_rate": 7.11203888701627e-07, "loss": 0.5382, "step": 2100 }, { "epoch": 1.6648177496038035, "grad_norm": 1.317538185663743, "learning_rate": 7.079422047694617e-07, "loss": 0.4674, "step": 2101 }, { "epoch": 1.6656101426307448, "grad_norm": 1.2405583264650648, "learning_rate": 7.046874473879655e-07, "loss": 0.4805, "step": 2102 }, { "epoch": 1.6664025356576864, "grad_norm": 1.1777291995046708, "learning_rate": 7.014396218096914e-07, "loss": 0.5641, "step": 2103 }, { "epoch": 1.6671949286846277, "grad_norm": 1.3687562734478897, "learning_rate": 6.981987332759971e-07, "loss": 0.5946, "step": 2104 }, { "epoch": 1.667987321711569, "grad_norm": 1.3793329791935127, "learning_rate": 6.949647870170522e-07, "loss": 0.4296, "step": 2105 }, { "epoch": 1.6687797147385104, "grad_norm": 1.3658479769002536, "learning_rate": 6.917377882518184e-07, "loss": 0.4782, "step": 2106 }, { "epoch": 1.6695721077654517, "grad_norm": 1.220253861156063, "learning_rate": 6.88517742188049e-07, "loss": 0.5568, "step": 2107 }, { "epoch": 1.670364500792393, "grad_norm": 2.4580643640565083, "learning_rate": 6.853046540222735e-07, "loss": 0.5009, "step": 2108 }, { "epoch": 1.6711568938193344, "grad_norm": 2.2298917332674586, "learning_rate": 6.820985289397952e-07, "loss": 0.4633, "step": 2109 }, { "epoch": 1.6719492868462758, "grad_norm": 1.395892833969504, "learning_rate": 6.788993721146792e-07, "loss": 0.4201, "step": 2110 }, { "epoch": 1.672741679873217, "grad_norm": 1.3932858909979673, "learning_rate": 6.75707188709745e-07, "loss": 0.5043, "step": 2111 }, { "epoch": 1.6735340729001584, "grad_norm": 1.3493909558218407, "learning_rate": 6.725219838765607e-07, "loss": 0.4707, "step": 2112 }, { "epoch": 1.6743264659270998, "grad_norm": 1.5690917532903885, "learning_rate": 6.693437627554284e-07, "loss": 0.5132, "step": 2113 }, { "epoch": 1.6751188589540411, "grad_norm": 1.2224066400911915, "learning_rate": 6.661725304753841e-07, "loss": 0.4946, "step": 2114 }, { "epoch": 1.6759112519809825, "grad_norm": 1.3179873870204921, "learning_rate": 6.630082921541803e-07, "loss": 0.469, "step": 2115 }, { "epoch": 1.6767036450079238, "grad_norm": 2.109076865577146, "learning_rate": 6.598510528982882e-07, "loss": 0.4662, "step": 2116 }, { "epoch": 1.6774960380348651, "grad_norm": 1.4321713497858521, "learning_rate": 6.567008178028783e-07, "loss": 0.4952, "step": 2117 }, { "epoch": 1.6782884310618067, "grad_norm": 1.3924687210631648, "learning_rate": 6.535575919518222e-07, "loss": 0.4634, "step": 2118 }, { "epoch": 1.679080824088748, "grad_norm": 1.2722633989876262, "learning_rate": 6.504213804176773e-07, "loss": 0.5536, "step": 2119 }, { "epoch": 1.6798732171156894, "grad_norm": 1.3697454138647855, "learning_rate": 6.472921882616829e-07, "loss": 0.4687, "step": 2120 }, { "epoch": 1.6806656101426307, "grad_norm": 1.768018199060229, "learning_rate": 6.441700205337481e-07, "loss": 0.4685, "step": 2121 }, { "epoch": 1.681458003169572, "grad_norm": 1.5342302147166131, "learning_rate": 6.410548822724488e-07, "loss": 0.4221, "step": 2122 }, { "epoch": 1.6822503961965136, "grad_norm": 1.5774028701778366, "learning_rate": 6.379467785050142e-07, "loss": 0.5266, "step": 2123 }, { "epoch": 1.683042789223455, "grad_norm": 1.4729490664652998, "learning_rate": 6.34845714247323e-07, "loss": 0.4802, "step": 2124 }, { "epoch": 1.6838351822503963, "grad_norm": 1.1792694668689565, "learning_rate": 6.317516945038926e-07, "loss": 0.5667, "step": 2125 }, { "epoch": 1.6846275752773376, "grad_norm": 1.4433481752927728, "learning_rate": 6.286647242678734e-07, "loss": 0.4891, "step": 2126 }, { "epoch": 1.685419968304279, "grad_norm": 1.4280016094658188, "learning_rate": 6.255848085210353e-07, "loss": 0.4401, "step": 2127 }, { "epoch": 1.6862123613312203, "grad_norm": 1.663787499350234, "learning_rate": 6.22511952233768e-07, "loss": 0.5251, "step": 2128 }, { "epoch": 1.6870047543581617, "grad_norm": 1.550128058997489, "learning_rate": 6.194461603650681e-07, "loss": 0.5044, "step": 2129 }, { "epoch": 1.687797147385103, "grad_norm": 1.604279024032174, "learning_rate": 6.163874378625273e-07, "loss": 0.4521, "step": 2130 }, { "epoch": 1.6885895404120443, "grad_norm": 1.3689983577761853, "learning_rate": 6.133357896623343e-07, "loss": 0.5163, "step": 2131 }, { "epoch": 1.6893819334389857, "grad_norm": 1.517795638235943, "learning_rate": 6.10291220689257e-07, "loss": 0.4462, "step": 2132 }, { "epoch": 1.690174326465927, "grad_norm": 1.2520771897988163, "learning_rate": 6.072537358566427e-07, "loss": 0.4363, "step": 2133 }, { "epoch": 1.6909667194928684, "grad_norm": 1.2948948425798292, "learning_rate": 6.042233400664011e-07, "loss": 0.5074, "step": 2134 }, { "epoch": 1.6917591125198097, "grad_norm": 1.613985921449831, "learning_rate": 6.012000382090077e-07, "loss": 0.3996, "step": 2135 }, { "epoch": 1.692551505546751, "grad_norm": 1.2488508139711265, "learning_rate": 5.981838351634839e-07, "loss": 0.5332, "step": 2136 }, { "epoch": 1.6933438985736924, "grad_norm": 1.3051633893308652, "learning_rate": 5.951747357973998e-07, "loss": 0.5511, "step": 2137 }, { "epoch": 1.694136291600634, "grad_norm": 1.3871672919788256, "learning_rate": 5.921727449668573e-07, "loss": 0.494, "step": 2138 }, { "epoch": 1.6949286846275753, "grad_norm": 1.321568648282286, "learning_rate": 5.891778675164917e-07, "loss": 0.5053, "step": 2139 }, { "epoch": 1.6957210776545166, "grad_norm": 1.3878028583280804, "learning_rate": 5.861901082794525e-07, "loss": 0.3866, "step": 2140 }, { "epoch": 1.696513470681458, "grad_norm": 1.828240756483403, "learning_rate": 5.832094720774056e-07, "loss": 0.4419, "step": 2141 }, { "epoch": 1.6973058637083995, "grad_norm": 1.3505320801020646, "learning_rate": 5.802359637205235e-07, "loss": 0.4822, "step": 2142 }, { "epoch": 1.6980982567353409, "grad_norm": 1.308024974459687, "learning_rate": 5.772695880074703e-07, "loss": 0.4935, "step": 2143 }, { "epoch": 1.6988906497622822, "grad_norm": 1.5062128496639982, "learning_rate": 5.743103497254043e-07, "loss": 0.5069, "step": 2144 }, { "epoch": 1.6996830427892236, "grad_norm": 1.3433856124853267, "learning_rate": 5.713582536499612e-07, "loss": 0.4505, "step": 2145 }, { "epoch": 1.700475435816165, "grad_norm": 1.4527230161673095, "learning_rate": 5.684133045452556e-07, "loss": 0.4151, "step": 2146 }, { "epoch": 1.7012678288431062, "grad_norm": 1.479236026335396, "learning_rate": 5.654755071638629e-07, "loss": 0.4646, "step": 2147 }, { "epoch": 1.7020602218700476, "grad_norm": 1.430166247484007, "learning_rate": 5.625448662468208e-07, "loss": 0.4771, "step": 2148 }, { "epoch": 1.702852614896989, "grad_norm": 1.406972864685261, "learning_rate": 5.59621386523616e-07, "loss": 0.5263, "step": 2149 }, { "epoch": 1.7036450079239303, "grad_norm": 1.5808209994409599, "learning_rate": 5.567050727121808e-07, "loss": 0.4638, "step": 2150 }, { "epoch": 1.7044374009508716, "grad_norm": 2.225042061077702, "learning_rate": 5.537959295188789e-07, "loss": 0.4208, "step": 2151 }, { "epoch": 1.705229793977813, "grad_norm": 1.2992297032521385, "learning_rate": 5.508939616385061e-07, "loss": 0.5076, "step": 2152 }, { "epoch": 1.7060221870047543, "grad_norm": 1.2150811912924644, "learning_rate": 5.479991737542756e-07, "loss": 0.4772, "step": 2153 }, { "epoch": 1.7068145800316956, "grad_norm": 1.526830723309829, "learning_rate": 5.451115705378163e-07, "loss": 0.4132, "step": 2154 }, { "epoch": 1.707606973058637, "grad_norm": 1.4836292674230886, "learning_rate": 5.422311566491595e-07, "loss": 0.4508, "step": 2155 }, { "epoch": 1.7083993660855783, "grad_norm": 1.2967668855638792, "learning_rate": 5.393579367367375e-07, "loss": 0.5509, "step": 2156 }, { "epoch": 1.7091917591125199, "grad_norm": 1.469689582474703, "learning_rate": 5.364919154373693e-07, "loss": 0.4997, "step": 2157 }, { "epoch": 1.7099841521394612, "grad_norm": 1.5426406658878176, "learning_rate": 5.336330973762593e-07, "loss": 0.5175, "step": 2158 }, { "epoch": 1.7107765451664025, "grad_norm": 1.3482039036494904, "learning_rate": 5.30781487166988e-07, "loss": 0.4955, "step": 2159 }, { "epoch": 1.7115689381933439, "grad_norm": 1.3805002930606163, "learning_rate": 5.279370894115005e-07, "loss": 0.5415, "step": 2160 }, { "epoch": 1.7123613312202852, "grad_norm": 1.389147443699831, "learning_rate": 5.250999087001063e-07, "loss": 0.4559, "step": 2161 }, { "epoch": 1.7131537242472268, "grad_norm": 1.4748127567097775, "learning_rate": 5.222699496114636e-07, "loss": 0.5043, "step": 2162 }, { "epoch": 1.7139461172741681, "grad_norm": 1.4880573766643377, "learning_rate": 5.194472167125808e-07, "loss": 0.4959, "step": 2163 }, { "epoch": 1.7147385103011095, "grad_norm": 1.286009948078861, "learning_rate": 5.166317145588007e-07, "loss": 0.5231, "step": 2164 }, { "epoch": 1.7155309033280508, "grad_norm": 1.4076032508142964, "learning_rate": 5.138234476938009e-07, "loss": 0.4747, "step": 2165 }, { "epoch": 1.7163232963549921, "grad_norm": 1.5849628511796414, "learning_rate": 5.11022420649579e-07, "loss": 0.555, "step": 2166 }, { "epoch": 1.7171156893819335, "grad_norm": 1.2167878601848376, "learning_rate": 5.082286379464518e-07, "loss": 0.5309, "step": 2167 }, { "epoch": 1.7179080824088748, "grad_norm": 1.223759189659061, "learning_rate": 5.054421040930423e-07, "loss": 0.5512, "step": 2168 }, { "epoch": 1.7187004754358162, "grad_norm": 1.5026732761769306, "learning_rate": 5.026628235862785e-07, "loss": 0.4617, "step": 2169 }, { "epoch": 1.7194928684627575, "grad_norm": 1.5403327406500933, "learning_rate": 4.998908009113812e-07, "loss": 0.5315, "step": 2170 }, { "epoch": 1.7202852614896988, "grad_norm": 1.3480858635767712, "learning_rate": 4.971260405418577e-07, "loss": 0.5421, "step": 2171 }, { "epoch": 1.7210776545166402, "grad_norm": 1.4797234768073142, "learning_rate": 4.94368546939496e-07, "loss": 0.4532, "step": 2172 }, { "epoch": 1.7218700475435815, "grad_norm": 1.3345228463027068, "learning_rate": 4.916183245543588e-07, "loss": 0.5599, "step": 2173 }, { "epoch": 1.7226624405705229, "grad_norm": 1.251919329026473, "learning_rate": 4.888753778247707e-07, "loss": 0.5316, "step": 2174 }, { "epoch": 1.7234548335974642, "grad_norm": 1.2583867133651105, "learning_rate": 4.86139711177318e-07, "loss": 0.5249, "step": 2175 }, { "epoch": 1.7242472266244055, "grad_norm": 1.3506164250598658, "learning_rate": 4.834113290268383e-07, "loss": 0.5553, "step": 2176 }, { "epoch": 1.7250396196513471, "grad_norm": 1.6115784269183038, "learning_rate": 4.806902357764098e-07, "loss": 0.5018, "step": 2177 }, { "epoch": 1.7258320126782885, "grad_norm": 1.2172009941547763, "learning_rate": 4.779764358173533e-07, "loss": 0.5065, "step": 2178 }, { "epoch": 1.7266244057052298, "grad_norm": 1.3237271908171802, "learning_rate": 4.752699335292138e-07, "loss": 0.5211, "step": 2179 }, { "epoch": 1.7274167987321711, "grad_norm": 1.5785421734925447, "learning_rate": 4.725707332797652e-07, "loss": 0.4974, "step": 2180 }, { "epoch": 1.7282091917591125, "grad_norm": 1.313667757378316, "learning_rate": 4.69878839424992e-07, "loss": 0.5382, "step": 2181 }, { "epoch": 1.729001584786054, "grad_norm": 1.4587988869607669, "learning_rate": 4.6719425630909145e-07, "loss": 0.5358, "step": 2182 }, { "epoch": 1.7297939778129954, "grad_norm": 1.274962034603765, "learning_rate": 4.645169882644596e-07, "loss": 0.5407, "step": 2183 }, { "epoch": 1.7305863708399367, "grad_norm": 1.1708782284908161, "learning_rate": 4.6184703961169085e-07, "loss": 0.546, "step": 2184 }, { "epoch": 1.731378763866878, "grad_norm": 1.2918860664229823, "learning_rate": 4.5918441465956363e-07, "loss": 0.5839, "step": 2185 }, { "epoch": 1.7321711568938194, "grad_norm": 1.5078565306926424, "learning_rate": 4.5652911770504074e-07, "loss": 0.4966, "step": 2186 }, { "epoch": 1.7329635499207607, "grad_norm": 1.5221477682487845, "learning_rate": 4.538811530332582e-07, "loss": 0.45, "step": 2187 }, { "epoch": 1.733755942947702, "grad_norm": 1.5722425391898391, "learning_rate": 4.5124052491751615e-07, "loss": 0.4259, "step": 2188 }, { "epoch": 1.7345483359746434, "grad_norm": 1.2663166432730104, "learning_rate": 4.4860723761928016e-07, "loss": 0.5083, "step": 2189 }, { "epoch": 1.7353407290015848, "grad_norm": 1.1402274450842573, "learning_rate": 4.4598129538816424e-07, "loss": 0.4804, "step": 2190 }, { "epoch": 1.736133122028526, "grad_norm": 1.5458170185229463, "learning_rate": 4.4336270246193193e-07, "loss": 0.4643, "step": 2191 }, { "epoch": 1.7369255150554674, "grad_norm": 1.3666366966827823, "learning_rate": 4.40751463066485e-07, "loss": 0.516, "step": 2192 }, { "epoch": 1.7377179080824088, "grad_norm": 1.747258635927493, "learning_rate": 4.3814758141585813e-07, "loss": 0.4821, "step": 2193 }, { "epoch": 1.7385103011093501, "grad_norm": 1.1569929201470972, "learning_rate": 4.355510617122133e-07, "loss": 0.5504, "step": 2194 }, { "epoch": 1.7393026941362915, "grad_norm": 1.5200356840286033, "learning_rate": 4.329619081458314e-07, "loss": 0.5997, "step": 2195 }, { "epoch": 1.7400950871632328, "grad_norm": 1.3732160349371476, "learning_rate": 4.303801248951034e-07, "loss": 0.5674, "step": 2196 }, { "epoch": 1.7408874801901744, "grad_norm": 1.2126854633301962, "learning_rate": 4.2780571612652977e-07, "loss": 0.5568, "step": 2197 }, { "epoch": 1.7416798732171157, "grad_norm": 1.3398840299681007, "learning_rate": 4.2523868599470606e-07, "loss": 0.5207, "step": 2198 }, { "epoch": 1.742472266244057, "grad_norm": 1.3182805906860589, "learning_rate": 4.2267903864232406e-07, "loss": 0.4704, "step": 2199 }, { "epoch": 1.7432646592709984, "grad_norm": 1.3076201368216356, "learning_rate": 4.201267782001578e-07, "loss": 0.489, "step": 2200 }, { "epoch": 1.7440570522979397, "grad_norm": 1.223731454813357, "learning_rate": 4.175819087870625e-07, "loss": 0.474, "step": 2201 }, { "epoch": 1.7448494453248813, "grad_norm": 1.2601050927834168, "learning_rate": 4.150444345099641e-07, "loss": 0.5235, "step": 2202 }, { "epoch": 1.7456418383518226, "grad_norm": 1.5068700546496299, "learning_rate": 4.125143594638553e-07, "loss": 0.5247, "step": 2203 }, { "epoch": 1.746434231378764, "grad_norm": 1.4138260935231013, "learning_rate": 4.099916877317889e-07, "loss": 0.4971, "step": 2204 }, { "epoch": 1.7472266244057053, "grad_norm": 1.4544526779889762, "learning_rate": 4.0747642338486714e-07, "loss": 0.4656, "step": 2205 }, { "epoch": 1.7480190174326466, "grad_norm": 1.5056202213878, "learning_rate": 4.049685704822409e-07, "loss": 0.4519, "step": 2206 }, { "epoch": 1.748811410459588, "grad_norm": 1.3459503436190698, "learning_rate": 4.0246813307109887e-07, "loss": 0.5219, "step": 2207 }, { "epoch": 1.7496038034865293, "grad_norm": 1.3678005896067789, "learning_rate": 3.9997511518666433e-07, "loss": 0.5277, "step": 2208 }, { "epoch": 1.7503961965134707, "grad_norm": 1.5831134708101817, "learning_rate": 3.974895208521834e-07, "loss": 0.4807, "step": 2209 }, { "epoch": 1.751188589540412, "grad_norm": 1.3991009286124156, "learning_rate": 3.950113540789269e-07, "loss": 0.4271, "step": 2210 }, { "epoch": 1.7519809825673534, "grad_norm": 1.509337843656474, "learning_rate": 3.92540618866174e-07, "loss": 0.4604, "step": 2211 }, { "epoch": 1.7527733755942947, "grad_norm": 1.3429573560627899, "learning_rate": 3.900773192012158e-07, "loss": 0.4555, "step": 2212 }, { "epoch": 1.753565768621236, "grad_norm": 3.6849491847946174, "learning_rate": 3.8762145905933855e-07, "loss": 0.5125, "step": 2213 }, { "epoch": 1.7543581616481774, "grad_norm": 1.3264798447638058, "learning_rate": 3.851730424038275e-07, "loss": 0.4809, "step": 2214 }, { "epoch": 1.7551505546751187, "grad_norm": 2.0789798434814197, "learning_rate": 3.8273207318595183e-07, "loss": 0.4977, "step": 2215 }, { "epoch": 1.7559429477020603, "grad_norm": 1.2690912304770952, "learning_rate": 3.802985553449634e-07, "loss": 0.5281, "step": 2216 }, { "epoch": 1.7567353407290016, "grad_norm": 1.1063191186335855, "learning_rate": 3.7787249280809015e-07, "loss": 0.5481, "step": 2217 }, { "epoch": 1.757527733755943, "grad_norm": 1.5477633713312582, "learning_rate": 3.7545388949052797e-07, "loss": 0.524, "step": 2218 }, { "epoch": 1.7583201267828843, "grad_norm": 1.4621364922809532, "learning_rate": 3.7304274929543247e-07, "loss": 0.4651, "step": 2219 }, { "epoch": 1.7591125198098256, "grad_norm": 1.5413317166866458, "learning_rate": 3.706390761139184e-07, "loss": 0.5612, "step": 2220 }, { "epoch": 1.7599049128367672, "grad_norm": 1.4947105089268238, "learning_rate": 3.6824287382505044e-07, "loss": 0.5073, "step": 2221 }, { "epoch": 1.7606973058637085, "grad_norm": 1.1738843675413029, "learning_rate": 3.6585414629583315e-07, "loss": 0.5071, "step": 2222 }, { "epoch": 1.7614896988906499, "grad_norm": 1.741982872364115, "learning_rate": 3.634728973812124e-07, "loss": 0.5058, "step": 2223 }, { "epoch": 1.7622820919175912, "grad_norm": 1.4288475723668628, "learning_rate": 3.610991309240608e-07, "loss": 0.4614, "step": 2224 }, { "epoch": 1.7630744849445326, "grad_norm": 1.2339937078928718, "learning_rate": 3.587328507551807e-07, "loss": 0.5766, "step": 2225 }, { "epoch": 1.763866877971474, "grad_norm": 1.411261835932106, "learning_rate": 3.5637406069328807e-07, "loss": 0.4895, "step": 2226 }, { "epoch": 1.7646592709984152, "grad_norm": 2.9841218267298792, "learning_rate": 3.5402276454501563e-07, "loss": 0.4756, "step": 2227 }, { "epoch": 1.7654516640253566, "grad_norm": 1.3380640905383079, "learning_rate": 3.5167896610489803e-07, "loss": 0.529, "step": 2228 }, { "epoch": 1.766244057052298, "grad_norm": 1.2623374741997322, "learning_rate": 3.493426691553747e-07, "loss": 0.5084, "step": 2229 }, { "epoch": 1.7670364500792393, "grad_norm": 1.2849969367140448, "learning_rate": 3.470138774667753e-07, "loss": 0.4838, "step": 2230 }, { "epoch": 1.7678288431061806, "grad_norm": 2.1756309181096034, "learning_rate": 3.446925947973195e-07, "loss": 0.4846, "step": 2231 }, { "epoch": 1.768621236133122, "grad_norm": 1.5407104477798594, "learning_rate": 3.4237882489310756e-07, "loss": 0.4987, "step": 2232 }, { "epoch": 1.7694136291600633, "grad_norm": 1.2695961819490547, "learning_rate": 3.40072571488117e-07, "loss": 0.5124, "step": 2233 }, { "epoch": 1.7702060221870046, "grad_norm": 1.6168183278963624, "learning_rate": 3.377738383041956e-07, "loss": 0.4137, "step": 2234 }, { "epoch": 1.770998415213946, "grad_norm": 1.536983690183298, "learning_rate": 3.3548262905105157e-07, "loss": 0.5182, "step": 2235 }, { "epoch": 1.7717908082408875, "grad_norm": 1.4125845543969555, "learning_rate": 3.3319894742625534e-07, "loss": 0.4751, "step": 2236 }, { "epoch": 1.7725832012678289, "grad_norm": 1.4010701221869837, "learning_rate": 3.309227971152251e-07, "loss": 0.4839, "step": 2237 }, { "epoch": 1.7733755942947702, "grad_norm": 1.4679682595098038, "learning_rate": 3.286541817912292e-07, "loss": 0.5797, "step": 2238 }, { "epoch": 1.7741679873217115, "grad_norm": 1.8815103839666498, "learning_rate": 3.263931051153707e-07, "loss": 0.5667, "step": 2239 }, { "epoch": 1.7749603803486529, "grad_norm": 1.49827462997825, "learning_rate": 3.2413957073659294e-07, "loss": 0.4836, "step": 2240 }, { "epoch": 1.7757527733755945, "grad_norm": 1.8746736036064613, "learning_rate": 3.21893582291663e-07, "loss": 0.5637, "step": 2241 }, { "epoch": 1.7765451664025358, "grad_norm": 1.4689353677222232, "learning_rate": 3.19655143405172e-07, "loss": 0.4613, "step": 2242 }, { "epoch": 1.7773375594294771, "grad_norm": 1.4405776519737787, "learning_rate": 3.174242576895259e-07, "loss": 0.4627, "step": 2243 }, { "epoch": 1.7781299524564185, "grad_norm": 1.330404037556862, "learning_rate": 3.1520092874494366e-07, "loss": 0.5126, "step": 2244 }, { "epoch": 1.7789223454833598, "grad_norm": 1.7051118402078538, "learning_rate": 3.1298516015944746e-07, "loss": 0.5071, "step": 2245 }, { "epoch": 1.7797147385103012, "grad_norm": 1.304428652137305, "learning_rate": 3.1077695550885965e-07, "loss": 0.4684, "step": 2246 }, { "epoch": 1.7805071315372425, "grad_norm": 1.4080417650021073, "learning_rate": 3.085763183567941e-07, "loss": 0.5095, "step": 2247 }, { "epoch": 1.7812995245641838, "grad_norm": 1.4560584537290782, "learning_rate": 3.063832522546545e-07, "loss": 0.471, "step": 2248 }, { "epoch": 1.7820919175911252, "grad_norm": 1.7493996320305347, "learning_rate": 3.0419776074162357e-07, "loss": 0.495, "step": 2249 }, { "epoch": 1.7828843106180665, "grad_norm": 1.563381799137943, "learning_rate": 3.020198473446628e-07, "loss": 0.588, "step": 2250 }, { "epoch": 1.7836767036450079, "grad_norm": 1.3862124427568085, "learning_rate": 2.998495155785036e-07, "loss": 0.4248, "step": 2251 }, { "epoch": 1.7844690966719492, "grad_norm": 1.3566688627268757, "learning_rate": 2.9768676894563975e-07, "loss": 0.5408, "step": 2252 }, { "epoch": 1.7852614896988905, "grad_norm": 1.2017899712061606, "learning_rate": 2.9553161093632707e-07, "loss": 0.4929, "step": 2253 }, { "epoch": 1.7860538827258319, "grad_norm": 1.4388433792867596, "learning_rate": 2.9338404502857244e-07, "loss": 0.4531, "step": 2254 }, { "epoch": 1.7868462757527732, "grad_norm": 1.1574934648052593, "learning_rate": 2.9124407468813345e-07, "loss": 0.5223, "step": 2255 }, { "epoch": 1.7876386687797148, "grad_norm": 2.6479137371647217, "learning_rate": 2.8911170336850535e-07, "loss": 0.5093, "step": 2256 }, { "epoch": 1.7884310618066561, "grad_norm": 1.1834810704632697, "learning_rate": 2.869869345109255e-07, "loss": 0.5214, "step": 2257 }, { "epoch": 1.7892234548335975, "grad_norm": 1.272988629835715, "learning_rate": 2.848697715443577e-07, "loss": 0.5231, "step": 2258 }, { "epoch": 1.7900158478605388, "grad_norm": 1.2609788190867115, "learning_rate": 2.827602178854949e-07, "loss": 0.5015, "step": 2259 }, { "epoch": 1.7908082408874801, "grad_norm": 1.2727994252812558, "learning_rate": 2.806582769387467e-07, "loss": 0.5087, "step": 2260 }, { "epoch": 1.7916006339144217, "grad_norm": 1.2780126994415126, "learning_rate": 2.785639520962402e-07, "loss": 0.5684, "step": 2261 }, { "epoch": 1.792393026941363, "grad_norm": 1.4279092046331585, "learning_rate": 2.764772467378096e-07, "loss": 0.5061, "step": 2262 }, { "epoch": 1.7931854199683044, "grad_norm": 1.4800717407678046, "learning_rate": 2.743981642309951e-07, "loss": 0.4025, "step": 2263 }, { "epoch": 1.7939778129952457, "grad_norm": 1.5738722528093072, "learning_rate": 2.7232670793103153e-07, "loss": 0.4687, "step": 2264 }, { "epoch": 1.794770206022187, "grad_norm": 1.2601025591650001, "learning_rate": 2.7026288118085054e-07, "loss": 0.5227, "step": 2265 }, { "epoch": 1.7955625990491284, "grad_norm": 3.384181913653343, "learning_rate": 2.682066873110667e-07, "loss": 0.4732, "step": 2266 }, { "epoch": 1.7963549920760697, "grad_norm": 1.4186802077319463, "learning_rate": 2.6615812963998065e-07, "loss": 0.444, "step": 2267 }, { "epoch": 1.797147385103011, "grad_norm": 1.2895572859523559, "learning_rate": 2.641172114735679e-07, "loss": 0.4774, "step": 2268 }, { "epoch": 1.7979397781299524, "grad_norm": 1.2419696041701878, "learning_rate": 2.62083936105475e-07, "loss": 0.5988, "step": 2269 }, { "epoch": 1.7987321711568938, "grad_norm": 1.40630757238778, "learning_rate": 2.600583068170154e-07, "loss": 0.4756, "step": 2270 }, { "epoch": 1.799524564183835, "grad_norm": 1.2756872146476745, "learning_rate": 2.580403268771614e-07, "loss": 0.4777, "step": 2271 }, { "epoch": 1.8003169572107764, "grad_norm": 1.259624940155659, "learning_rate": 2.5602999954254413e-07, "loss": 0.5614, "step": 2272 }, { "epoch": 1.8011093502377178, "grad_norm": 1.3533274206532309, "learning_rate": 2.540273280574412e-07, "loss": 0.4345, "step": 2273 }, { "epoch": 1.8019017432646591, "grad_norm": 1.8004968057229853, "learning_rate": 2.520323156537785e-07, "loss": 0.3904, "step": 2274 }, { "epoch": 1.8026941362916005, "grad_norm": 1.4785491610885744, "learning_rate": 2.500449655511178e-07, "loss": 0.5149, "step": 2275 }, { "epoch": 1.803486529318542, "grad_norm": 1.3933669453871376, "learning_rate": 2.480652809566597e-07, "loss": 0.5014, "step": 2276 }, { "epoch": 1.8042789223454834, "grad_norm": 1.4463378926318542, "learning_rate": 2.4609326506523037e-07, "loss": 0.4929, "step": 2277 }, { "epoch": 1.8050713153724247, "grad_norm": 1.3873731970581806, "learning_rate": 2.441289210592818e-07, "loss": 0.4738, "step": 2278 }, { "epoch": 1.805863708399366, "grad_norm": 1.3524227980579497, "learning_rate": 2.4217225210888706e-07, "loss": 0.4907, "step": 2279 }, { "epoch": 1.8066561014263076, "grad_norm": 1.487590279474623, "learning_rate": 2.4022326137172924e-07, "loss": 0.4289, "step": 2280 }, { "epoch": 1.807448494453249, "grad_norm": 1.0791173696377545, "learning_rate": 2.3828195199310343e-07, "loss": 0.5713, "step": 2281 }, { "epoch": 1.8082408874801903, "grad_norm": 1.2773112450470414, "learning_rate": 2.3634832710590584e-07, "loss": 0.5106, "step": 2282 }, { "epoch": 1.8090332805071316, "grad_norm": 1.439754249568816, "learning_rate": 2.344223898306347e-07, "loss": 0.4668, "step": 2283 }, { "epoch": 1.809825673534073, "grad_norm": 1.545108701394539, "learning_rate": 2.325041432753783e-07, "loss": 0.5393, "step": 2284 }, { "epoch": 1.8106180665610143, "grad_norm": 1.6682943585844672, "learning_rate": 2.3059359053581587e-07, "loss": 0.4543, "step": 2285 }, { "epoch": 1.8114104595879557, "grad_norm": 1.3881637200845593, "learning_rate": 2.2869073469520998e-07, "loss": 0.5497, "step": 2286 }, { "epoch": 1.812202852614897, "grad_norm": 1.3379283590234996, "learning_rate": 2.2679557882440305e-07, "loss": 0.4899, "step": 2287 }, { "epoch": 1.8129952456418383, "grad_norm": 1.43603708683887, "learning_rate": 2.2490812598180756e-07, "loss": 0.4613, "step": 2288 }, { "epoch": 1.8137876386687797, "grad_norm": 1.5085861167067771, "learning_rate": 2.2302837921340858e-07, "loss": 0.4267, "step": 2289 }, { "epoch": 1.814580031695721, "grad_norm": 1.3424129701136376, "learning_rate": 2.211563415527529e-07, "loss": 0.539, "step": 2290 }, { "epoch": 1.8153724247226624, "grad_norm": 2.139422169569654, "learning_rate": 2.192920160209483e-07, "loss": 0.4623, "step": 2291 }, { "epoch": 1.8161648177496037, "grad_norm": 1.285798503980566, "learning_rate": 2.1743540562665366e-07, "loss": 0.5003, "step": 2292 }, { "epoch": 1.816957210776545, "grad_norm": 1.4788873619864535, "learning_rate": 2.1558651336608006e-07, "loss": 0.4574, "step": 2293 }, { "epoch": 1.8177496038034864, "grad_norm": 1.5227976934830103, "learning_rate": 2.1374534222298072e-07, "loss": 0.504, "step": 2294 }, { "epoch": 1.818541996830428, "grad_norm": 1.1102714987779154, "learning_rate": 2.1191189516865052e-07, "loss": 0.5163, "step": 2295 }, { "epoch": 1.8193343898573693, "grad_norm": 1.6399557018500388, "learning_rate": 2.100861751619182e-07, "loss": 0.4529, "step": 2296 }, { "epoch": 1.8201267828843106, "grad_norm": 1.5891084932620096, "learning_rate": 2.0826818514914194e-07, "loss": 0.5282, "step": 2297 }, { "epoch": 1.820919175911252, "grad_norm": 1.392867718720439, "learning_rate": 2.064579280642065e-07, "loss": 0.4751, "step": 2298 }, { "epoch": 1.8217115689381933, "grad_norm": 1.4091469087811899, "learning_rate": 2.0465540682851614e-07, "loss": 0.4843, "step": 2299 }, { "epoch": 1.8225039619651349, "grad_norm": 1.3465466981547645, "learning_rate": 2.0286062435099118e-07, "loss": 0.484, "step": 2300 }, { "epoch": 1.8232963549920762, "grad_norm": 1.425165076216207, "learning_rate": 2.0107358352806306e-07, "loss": 0.5769, "step": 2301 }, { "epoch": 1.8240887480190175, "grad_norm": 1.3213866996549266, "learning_rate": 1.9929428724367038e-07, "loss": 0.441, "step": 2302 }, { "epoch": 1.8248811410459589, "grad_norm": 1.3367471282138317, "learning_rate": 1.9752273836925185e-07, "loss": 0.6095, "step": 2303 }, { "epoch": 1.8256735340729002, "grad_norm": 1.305356810461015, "learning_rate": 1.957589397637466e-07, "loss": 0.4943, "step": 2304 }, { "epoch": 1.8264659270998416, "grad_norm": 1.6184636834418726, "learning_rate": 1.940028942735822e-07, "loss": 0.4964, "step": 2305 }, { "epoch": 1.827258320126783, "grad_norm": 1.3166764931313804, "learning_rate": 1.9225460473267732e-07, "loss": 0.4519, "step": 2306 }, { "epoch": 1.8280507131537242, "grad_norm": 1.4195309020927651, "learning_rate": 1.9051407396243227e-07, "loss": 0.5052, "step": 2307 }, { "epoch": 1.8288431061806656, "grad_norm": 1.2336132049549506, "learning_rate": 1.887813047717274e-07, "loss": 0.4942, "step": 2308 }, { "epoch": 1.829635499207607, "grad_norm": 1.3420260380417455, "learning_rate": 1.8705629995691644e-07, "loss": 0.5107, "step": 2309 }, { "epoch": 1.8304278922345483, "grad_norm": 1.4322340976230097, "learning_rate": 1.8533906230182474e-07, "loss": 0.4913, "step": 2310 }, { "epoch": 1.8312202852614896, "grad_norm": 1.3731402336980418, "learning_rate": 1.8362959457773944e-07, "loss": 0.4667, "step": 2311 }, { "epoch": 1.832012678288431, "grad_norm": 1.7212598002125414, "learning_rate": 1.819278995434115e-07, "loss": 0.4527, "step": 2312 }, { "epoch": 1.8328050713153723, "grad_norm": 1.2592610033011566, "learning_rate": 1.8023397994504866e-07, "loss": 0.52, "step": 2313 }, { "epoch": 1.8335974643423136, "grad_norm": 1.278984296827505, "learning_rate": 1.7854783851630696e-07, "loss": 0.4859, "step": 2314 }, { "epoch": 1.8343898573692552, "grad_norm": 1.478276087250712, "learning_rate": 1.768694779782948e-07, "loss": 0.4341, "step": 2315 }, { "epoch": 1.8351822503961965, "grad_norm": 1.5376269664625715, "learning_rate": 1.7519890103955884e-07, "loss": 0.4847, "step": 2316 }, { "epoch": 1.8359746434231379, "grad_norm": 1.4361300255057854, "learning_rate": 1.735361103960892e-07, "loss": 0.5481, "step": 2317 }, { "epoch": 1.8367670364500792, "grad_norm": 1.3271492973672285, "learning_rate": 1.718811087313066e-07, "loss": 0.5497, "step": 2318 }, { "epoch": 1.8375594294770206, "grad_norm": 1.4936962283330721, "learning_rate": 1.7023389871606454e-07, "loss": 0.449, "step": 2319 }, { "epoch": 1.8383518225039621, "grad_norm": 1.2809206833661824, "learning_rate": 1.6859448300864e-07, "loss": 0.5202, "step": 2320 }, { "epoch": 1.8391442155309035, "grad_norm": 1.3112630594384063, "learning_rate": 1.669628642547344e-07, "loss": 0.4258, "step": 2321 }, { "epoch": 1.8399366085578448, "grad_norm": 1.239018822463854, "learning_rate": 1.6533904508746256e-07, "loss": 0.5065, "step": 2322 }, { "epoch": 1.8407290015847861, "grad_norm": 1.3887925065044504, "learning_rate": 1.6372302812735663e-07, "loss": 0.506, "step": 2323 }, { "epoch": 1.8415213946117275, "grad_norm": 1.381101676260999, "learning_rate": 1.621148159823527e-07, "loss": 0.4273, "step": 2324 }, { "epoch": 1.8423137876386688, "grad_norm": 1.3871281077566735, "learning_rate": 1.6051441124779577e-07, "loss": 0.5202, "step": 2325 }, { "epoch": 1.8431061806656102, "grad_norm": 1.3933812646441284, "learning_rate": 1.5892181650642935e-07, "loss": 0.5152, "step": 2326 }, { "epoch": 1.8438985736925515, "grad_norm": 1.281509401849153, "learning_rate": 1.5733703432839252e-07, "loss": 0.4941, "step": 2327 }, { "epoch": 1.8446909667194928, "grad_norm": 1.2498906435405763, "learning_rate": 1.557600672712173e-07, "loss": 0.5159, "step": 2328 }, { "epoch": 1.8454833597464342, "grad_norm": 1.732914103393298, "learning_rate": 1.541909178798229e-07, "loss": 0.5476, "step": 2329 }, { "epoch": 1.8462757527733755, "grad_norm": 1.2090650826069245, "learning_rate": 1.5262958868651366e-07, "loss": 0.57, "step": 2330 }, { "epoch": 1.8470681458003169, "grad_norm": 1.5882689116417583, "learning_rate": 1.5107608221097082e-07, "loss": 0.4994, "step": 2331 }, { "epoch": 1.8478605388272582, "grad_norm": 1.3889928700103171, "learning_rate": 1.4953040096025606e-07, "loss": 0.5156, "step": 2332 }, { "epoch": 1.8486529318541995, "grad_norm": 1.152470688117002, "learning_rate": 1.4799254742879798e-07, "loss": 0.5614, "step": 2333 }, { "epoch": 1.8494453248811409, "grad_norm": 1.3544374259504812, "learning_rate": 1.4646252409839468e-07, "loss": 0.5431, "step": 2334 }, { "epoch": 1.8502377179080824, "grad_norm": 1.2084559721219592, "learning_rate": 1.4494033343820822e-07, "loss": 0.4985, "step": 2335 }, { "epoch": 1.8510301109350238, "grad_norm": 1.4432599787334146, "learning_rate": 1.4342597790475975e-07, "loss": 0.4718, "step": 2336 }, { "epoch": 1.8518225039619651, "grad_norm": 1.4607973223001187, "learning_rate": 1.419194599419249e-07, "loss": 0.4531, "step": 2337 }, { "epoch": 1.8526148969889065, "grad_norm": 1.2536649720312925, "learning_rate": 1.4042078198093388e-07, "loss": 0.463, "step": 2338 }, { "epoch": 1.8534072900158478, "grad_norm": 1.2757634942890657, "learning_rate": 1.3892994644036094e-07, "loss": 0.521, "step": 2339 }, { "epoch": 1.8541996830427894, "grad_norm": 1.536375554743217, "learning_rate": 1.374469557261282e-07, "loss": 0.4399, "step": 2340 }, { "epoch": 1.8549920760697307, "grad_norm": 1.3010027376497109, "learning_rate": 1.3597181223149403e-07, "loss": 0.5101, "step": 2341 }, { "epoch": 1.855784469096672, "grad_norm": 1.3938785916686196, "learning_rate": 1.3450451833705524e-07, "loss": 0.5115, "step": 2342 }, { "epoch": 1.8565768621236134, "grad_norm": 1.248776589623727, "learning_rate": 1.3304507641074049e-07, "loss": 0.513, "step": 2343 }, { "epoch": 1.8573692551505547, "grad_norm": 1.3145690530155092, "learning_rate": 1.3159348880780577e-07, "loss": 0.502, "step": 2344 }, { "epoch": 1.858161648177496, "grad_norm": 1.4675075938855577, "learning_rate": 1.3014975787083383e-07, "loss": 0.4718, "step": 2345 }, { "epoch": 1.8589540412044374, "grad_norm": 1.553650228214874, "learning_rate": 1.28713885929726e-07, "loss": 0.4904, "step": 2346 }, { "epoch": 1.8597464342313788, "grad_norm": 1.4596306871881886, "learning_rate": 1.27285875301702e-07, "loss": 0.5131, "step": 2347 }, { "epoch": 1.86053882725832, "grad_norm": 1.8810423418644362, "learning_rate": 1.2586572829129396e-07, "loss": 0.4917, "step": 2348 }, { "epoch": 1.8613312202852614, "grad_norm": 1.2460943489274583, "learning_rate": 1.244534471903458e-07, "loss": 0.497, "step": 2349 }, { "epoch": 1.8621236133122028, "grad_norm": 1.5033166512883434, "learning_rate": 1.2304903427800497e-07, "loss": 0.4809, "step": 2350 }, { "epoch": 1.8629160063391441, "grad_norm": 1.5986508759601388, "learning_rate": 1.2165249182072182e-07, "loss": 0.5096, "step": 2351 }, { "epoch": 1.8637083993660855, "grad_norm": 1.361412754527247, "learning_rate": 1.2026382207224629e-07, "loss": 0.4565, "step": 2352 }, { "epoch": 1.8645007923930268, "grad_norm": 1.5746183620002823, "learning_rate": 1.1888302727362188e-07, "loss": 0.5398, "step": 2353 }, { "epoch": 1.8652931854199684, "grad_norm": 1.3659910630778076, "learning_rate": 1.17510109653185e-07, "loss": 0.4832, "step": 2354 }, { "epoch": 1.8660855784469097, "grad_norm": 1.421705993221639, "learning_rate": 1.1614507142655884e-07, "loss": 0.4078, "step": 2355 }, { "epoch": 1.866877971473851, "grad_norm": 1.3527764509905587, "learning_rate": 1.1478791479665074e-07, "loss": 0.536, "step": 2356 }, { "epoch": 1.8676703645007924, "grad_norm": 1.305066858051019, "learning_rate": 1.1343864195364873e-07, "loss": 0.4508, "step": 2357 }, { "epoch": 1.8684627575277337, "grad_norm": 1.3375102780911898, "learning_rate": 1.1209725507501823e-07, "loss": 0.4819, "step": 2358 }, { "epoch": 1.8692551505546753, "grad_norm": 1.2239798915010842, "learning_rate": 1.1076375632549762e-07, "loss": 0.5903, "step": 2359 }, { "epoch": 1.8700475435816166, "grad_norm": 1.423108535455646, "learning_rate": 1.0943814785709716e-07, "loss": 0.5217, "step": 2360 }, { "epoch": 1.870839936608558, "grad_norm": 3.077326680383855, "learning_rate": 1.081204318090906e-07, "loss": 0.492, "step": 2361 }, { "epoch": 1.8716323296354993, "grad_norm": 1.6918940205008026, "learning_rate": 1.0681061030801853e-07, "loss": 0.5226, "step": 2362 }, { "epoch": 1.8724247226624406, "grad_norm": 1.2898415268840735, "learning_rate": 1.0550868546767733e-07, "loss": 0.4753, "step": 2363 }, { "epoch": 1.873217115689382, "grad_norm": 1.4781489263816994, "learning_rate": 1.0421465938912301e-07, "loss": 0.4817, "step": 2364 }, { "epoch": 1.8740095087163233, "grad_norm": 1.2132316461241917, "learning_rate": 1.0292853416066284e-07, "loss": 0.5101, "step": 2365 }, { "epoch": 1.8748019017432647, "grad_norm": 1.286195825495799, "learning_rate": 1.016503118578549e-07, "loss": 0.4835, "step": 2366 }, { "epoch": 1.875594294770206, "grad_norm": 1.185532376648748, "learning_rate": 1.0037999454350133e-07, "loss": 0.5769, "step": 2367 }, { "epoch": 1.8763866877971473, "grad_norm": 1.337615476892316, "learning_rate": 9.911758426764951e-08, "loss": 0.5178, "step": 2368 }, { "epoch": 1.8771790808240887, "grad_norm": 1.372735727279226, "learning_rate": 9.786308306758418e-08, "loss": 0.5478, "step": 2369 }, { "epoch": 1.87797147385103, "grad_norm": 1.2610524317881007, "learning_rate": 9.661649296782815e-08, "loss": 0.5204, "step": 2370 }, { "epoch": 1.8787638668779714, "grad_norm": 1.5777187546004892, "learning_rate": 9.537781598013662e-08, "loss": 0.5164, "step": 2371 }, { "epoch": 1.8795562599049127, "grad_norm": 2.148993441489355, "learning_rate": 9.414705410349501e-08, "loss": 0.4759, "step": 2372 }, { "epoch": 1.880348652931854, "grad_norm": 1.3033675049471536, "learning_rate": 9.292420932411395e-08, "loss": 0.4834, "step": 2373 }, { "epoch": 1.8811410459587956, "grad_norm": 1.5752760879804257, "learning_rate": 9.170928361542874e-08, "loss": 0.4513, "step": 2374 }, { "epoch": 1.881933438985737, "grad_norm": 1.351544443847087, "learning_rate": 9.050227893809438e-08, "loss": 0.5245, "step": 2375 }, { "epoch": 1.8827258320126783, "grad_norm": 1.3466042504353761, "learning_rate": 8.930319723998216e-08, "loss": 0.4825, "step": 2376 }, { "epoch": 1.8835182250396196, "grad_norm": 1.478795920260295, "learning_rate": 8.811204045617861e-08, "loss": 0.499, "step": 2377 }, { "epoch": 1.884310618066561, "grad_norm": 1.4078788994780171, "learning_rate": 8.692881050897995e-08, "loss": 0.5177, "step": 2378 }, { "epoch": 1.8851030110935025, "grad_norm": 1.3566356487604136, "learning_rate": 8.575350930789095e-08, "loss": 0.4759, "step": 2379 }, { "epoch": 1.8858954041204439, "grad_norm": 1.4530969103017881, "learning_rate": 8.458613874961996e-08, "loss": 0.5005, "step": 2380 }, { "epoch": 1.8866877971473852, "grad_norm": 1.6983214028294509, "learning_rate": 8.342670071807667e-08, "loss": 0.5194, "step": 2381 }, { "epoch": 1.8874801901743266, "grad_norm": 1.2597574311986388, "learning_rate": 8.227519708436993e-08, "loss": 0.5116, "step": 2382 }, { "epoch": 1.888272583201268, "grad_norm": 1.238772941291779, "learning_rate": 8.113162970680433e-08, "loss": 0.4592, "step": 2383 }, { "epoch": 1.8890649762282092, "grad_norm": 1.2769312767059193, "learning_rate": 7.99960004308753e-08, "loss": 0.4682, "step": 2384 }, { "epoch": 1.8898573692551506, "grad_norm": 1.265180345699842, "learning_rate": 7.886831108926963e-08, "loss": 0.4459, "step": 2385 }, { "epoch": 1.890649762282092, "grad_norm": 1.2222861971495296, "learning_rate": 7.774856350185878e-08, "loss": 0.535, "step": 2386 }, { "epoch": 1.8914421553090333, "grad_norm": 1.2650385349095845, "learning_rate": 7.663675947569949e-08, "loss": 0.5027, "step": 2387 }, { "epoch": 1.8922345483359746, "grad_norm": 1.5623587875333256, "learning_rate": 7.553290080502817e-08, "loss": 0.4759, "step": 2388 }, { "epoch": 1.893026941362916, "grad_norm": 1.0879087347172525, "learning_rate": 7.443698927125875e-08, "loss": 0.5897, "step": 2389 }, { "epoch": 1.8938193343898573, "grad_norm": 1.3908121410244032, "learning_rate": 7.334902664298039e-08, "loss": 0.517, "step": 2390 }, { "epoch": 1.8946117274167986, "grad_norm": 1.3011078765701682, "learning_rate": 7.22690146759547e-08, "loss": 0.5195, "step": 2391 }, { "epoch": 1.89540412044374, "grad_norm": 1.322367867460901, "learning_rate": 7.119695511311142e-08, "loss": 0.4846, "step": 2392 }, { "epoch": 1.8961965134706813, "grad_norm": 1.6121993874022422, "learning_rate": 7.01328496845477e-08, "loss": 0.4166, "step": 2393 }, { "epoch": 1.8969889064976229, "grad_norm": 1.6574067620484672, "learning_rate": 6.907670010752321e-08, "loss": 0.5419, "step": 2394 }, { "epoch": 1.8977812995245642, "grad_norm": 1.6405550447789055, "learning_rate": 6.802850808645956e-08, "loss": 0.4706, "step": 2395 }, { "epoch": 1.8985736925515055, "grad_norm": 1.448334243008069, "learning_rate": 6.698827531293584e-08, "loss": 0.5137, "step": 2396 }, { "epoch": 1.8993660855784469, "grad_norm": 1.5533472711560237, "learning_rate": 6.595600346568532e-08, "loss": 0.5344, "step": 2397 }, { "epoch": 1.9001584786053882, "grad_norm": 1.762409336794433, "learning_rate": 6.493169421059653e-08, "loss": 0.5246, "step": 2398 }, { "epoch": 1.9009508716323298, "grad_norm": 1.3778451285721345, "learning_rate": 6.391534920070552e-08, "loss": 0.5039, "step": 2399 }, { "epoch": 1.9017432646592711, "grad_norm": 1.1657969621747757, "learning_rate": 6.290697007619639e-08, "loss": 0.5914, "step": 2400 }, { "epoch": 1.9025356576862125, "grad_norm": 1.2633155856077207, "learning_rate": 6.190655846439797e-08, "loss": 0.446, "step": 2401 }, { "epoch": 1.9033280507131538, "grad_norm": 1.3771884184771834, "learning_rate": 6.091411597978158e-08, "loss": 0.4919, "step": 2402 }, { "epoch": 1.9041204437400951, "grad_norm": 1.2913452852621399, "learning_rate": 5.992964422395664e-08, "loss": 0.5547, "step": 2403 }, { "epoch": 1.9049128367670365, "grad_norm": 1.328164892028027, "learning_rate": 5.89531447856706e-08, "loss": 0.5328, "step": 2404 }, { "epoch": 1.9057052297939778, "grad_norm": 1.3789045320012572, "learning_rate": 5.798461924080401e-08, "loss": 0.4992, "step": 2405 }, { "epoch": 1.9064976228209192, "grad_norm": 3.014700837715783, "learning_rate": 5.702406915236991e-08, "loss": 0.4451, "step": 2406 }, { "epoch": 1.9072900158478605, "grad_norm": 1.3621635749307544, "learning_rate": 5.607149607051054e-08, "loss": 0.4312, "step": 2407 }, { "epoch": 1.9080824088748018, "grad_norm": 1.1378155664512395, "learning_rate": 5.5126901532494005e-08, "loss": 0.4903, "step": 2408 }, { "epoch": 1.9088748019017432, "grad_norm": 1.5380375356475968, "learning_rate": 5.419028706271423e-08, "loss": 0.4887, "step": 2409 }, { "epoch": 1.9096671949286845, "grad_norm": 1.5757684724257666, "learning_rate": 5.3261654172683806e-08, "loss": 0.4133, "step": 2410 }, { "epoch": 1.9104595879556259, "grad_norm": 1.3206552042026918, "learning_rate": 5.234100436103784e-08, "loss": 0.5147, "step": 2411 }, { "epoch": 1.9112519809825672, "grad_norm": 1.2416610766947054, "learning_rate": 5.1428339113526735e-08, "loss": 0.5233, "step": 2412 }, { "epoch": 1.9120443740095086, "grad_norm": 2.4947313006475427, "learning_rate": 5.052365990301567e-08, "loss": 0.4695, "step": 2413 }, { "epoch": 1.9128367670364501, "grad_norm": 1.251504851917564, "learning_rate": 4.9626968189481205e-08, "loss": 0.5223, "step": 2414 }, { "epoch": 1.9136291600633915, "grad_norm": 1.3404926477662842, "learning_rate": 4.8738265420010814e-08, "loss": 0.5147, "step": 2415 }, { "epoch": 1.9144215530903328, "grad_norm": 1.3717300686799507, "learning_rate": 4.7857553028798356e-08, "loss": 0.5133, "step": 2416 }, { "epoch": 1.9152139461172741, "grad_norm": 1.3167284544564508, "learning_rate": 4.698483243714358e-08, "loss": 0.4864, "step": 2417 }, { "epoch": 1.9160063391442155, "grad_norm": 1.505127372654279, "learning_rate": 4.612010505344933e-08, "loss": 0.4893, "step": 2418 }, { "epoch": 1.916798732171157, "grad_norm": 1.2542574159686712, "learning_rate": 4.526337227321653e-08, "loss": 0.4946, "step": 2419 }, { "epoch": 1.9175911251980984, "grad_norm": 1.3981714890134782, "learning_rate": 4.441463547904756e-08, "loss": 0.5326, "step": 2420 }, { "epoch": 1.9183835182250397, "grad_norm": 1.151904903932661, "learning_rate": 4.357389604063844e-08, "loss": 0.5455, "step": 2421 }, { "epoch": 1.919175911251981, "grad_norm": 1.3596106232176817, "learning_rate": 4.274115531478052e-08, "loss": 0.4372, "step": 2422 }, { "epoch": 1.9199683042789224, "grad_norm": 1.183638543444736, "learning_rate": 4.191641464535545e-08, "loss": 0.4956, "step": 2423 }, { "epoch": 1.9207606973058637, "grad_norm": 1.2323859751678963, "learning_rate": 4.109967536333637e-08, "loss": 0.5274, "step": 2424 }, { "epoch": 1.921553090332805, "grad_norm": 1.4009499500612368, "learning_rate": 4.0290938786781145e-08, "loss": 0.5587, "step": 2425 }, { "epoch": 1.9223454833597464, "grad_norm": 1.4227951590964807, "learning_rate": 3.949020622083466e-08, "loss": 0.5464, "step": 2426 }, { "epoch": 1.9231378763866878, "grad_norm": 1.4216346705157072, "learning_rate": 3.86974789577238e-08, "loss": 0.438, "step": 2427 }, { "epoch": 1.923930269413629, "grad_norm": 1.1843364302699264, "learning_rate": 3.7912758276758e-08, "loss": 0.5631, "step": 2428 }, { "epoch": 1.9247226624405704, "grad_norm": 1.4492398982638595, "learning_rate": 3.71360454443237e-08, "loss": 0.4832, "step": 2429 }, { "epoch": 1.9255150554675118, "grad_norm": 1.6416701813383336, "learning_rate": 3.636734171388545e-08, "loss": 0.4856, "step": 2430 }, { "epoch": 1.9263074484944531, "grad_norm": 1.6658542050269909, "learning_rate": 3.560664832598204e-08, "loss": 0.5208, "step": 2431 }, { "epoch": 1.9270998415213945, "grad_norm": 1.6937243502921013, "learning_rate": 3.485396650822592e-08, "loss": 0.4401, "step": 2432 }, { "epoch": 1.927892234548336, "grad_norm": 1.9366534523351078, "learning_rate": 3.410929747529934e-08, "loss": 0.4843, "step": 2433 }, { "epoch": 1.9286846275752774, "grad_norm": 1.7606342700152255, "learning_rate": 3.337264242895433e-08, "loss": 0.5146, "step": 2434 }, { "epoch": 1.9294770206022187, "grad_norm": 1.4553172107584722, "learning_rate": 3.264400255800937e-08, "loss": 0.4572, "step": 2435 }, { "epoch": 1.93026941362916, "grad_norm": 2.567281194850898, "learning_rate": 3.192337903834775e-08, "loss": 0.5519, "step": 2436 }, { "epoch": 1.9310618066561014, "grad_norm": 1.3250112530543252, "learning_rate": 3.121077303291753e-08, "loss": 0.405, "step": 2437 }, { "epoch": 1.931854199683043, "grad_norm": 1.358967498422095, "learning_rate": 3.050618569172603e-08, "loss": 0.4738, "step": 2438 }, { "epoch": 1.9326465927099843, "grad_norm": 4.078233963715423, "learning_rate": 2.9809618151841466e-08, "loss": 0.4348, "step": 2439 }, { "epoch": 1.9334389857369256, "grad_norm": 1.1155301976880994, "learning_rate": 2.9121071537388522e-08, "loss": 0.532, "step": 2440 }, { "epoch": 1.934231378763867, "grad_norm": 1.4895860212597862, "learning_rate": 2.8440546959548344e-08, "loss": 0.4388, "step": 2441 }, { "epoch": 1.9350237717908083, "grad_norm": 2.099130792830962, "learning_rate": 2.7768045516555762e-08, "loss": 0.4349, "step": 2442 }, { "epoch": 1.9358161648177497, "grad_norm": 1.2367741606649036, "learning_rate": 2.7103568293698203e-08, "loss": 0.4952, "step": 2443 }, { "epoch": 1.936608557844691, "grad_norm": 1.27976843394138, "learning_rate": 2.6447116363312875e-08, "loss": 0.4671, "step": 2444 }, { "epoch": 1.9374009508716323, "grad_norm": 1.3277041761021624, "learning_rate": 2.5798690784786807e-08, "loss": 0.54, "step": 2445 }, { "epoch": 1.9381933438985737, "grad_norm": 1.4156127785739498, "learning_rate": 2.5158292604552938e-08, "loss": 0.4382, "step": 2446 }, { "epoch": 1.938985736925515, "grad_norm": 1.6974672213345585, "learning_rate": 2.4525922856090124e-08, "loss": 0.4669, "step": 2447 }, { "epoch": 1.9397781299524564, "grad_norm": 1.2803172044540578, "learning_rate": 2.3901582559920923e-08, "loss": 0.4662, "step": 2448 }, { "epoch": 1.9405705229793977, "grad_norm": 1.4577514183509204, "learning_rate": 2.328527272360992e-08, "loss": 0.46, "step": 2449 }, { "epoch": 1.941362916006339, "grad_norm": 1.7783153970418826, "learning_rate": 2.267699434176096e-08, "loss": 0.4836, "step": 2450 }, { "epoch": 1.9421553090332804, "grad_norm": 1.4479753434411018, "learning_rate": 2.207674839601881e-08, "loss": 0.4819, "step": 2451 }, { "epoch": 1.9429477020602217, "grad_norm": 1.347827410536308, "learning_rate": 2.1484535855063605e-08, "loss": 0.5336, "step": 2452 }, { "epoch": 1.9437400950871633, "grad_norm": 1.5912978782431286, "learning_rate": 2.090035767461196e-08, "loss": 0.4433, "step": 2453 }, { "epoch": 1.9445324881141046, "grad_norm": 1.2677069850600389, "learning_rate": 2.03242147974142e-08, "loss": 0.5037, "step": 2454 }, { "epoch": 1.945324881141046, "grad_norm": 1.4053997746365663, "learning_rate": 1.9756108153253793e-08, "loss": 0.5049, "step": 2455 }, { "epoch": 1.9461172741679873, "grad_norm": 1.3093426445123542, "learning_rate": 1.9196038658944594e-08, "loss": 0.4507, "step": 2456 }, { "epoch": 1.9469096671949286, "grad_norm": 1.1667907523075371, "learning_rate": 1.8644007218330263e-08, "loss": 0.5458, "step": 2457 }, { "epoch": 1.9477020602218702, "grad_norm": 1.3821552392093372, "learning_rate": 1.8100014722283733e-08, "loss": 0.4596, "step": 2458 }, { "epoch": 1.9484944532488115, "grad_norm": 1.321262674957625, "learning_rate": 1.7564062048702202e-08, "loss": 0.5573, "step": 2459 }, { "epoch": 1.9492868462757529, "grad_norm": 1.5149425346481724, "learning_rate": 1.7036150062510472e-08, "loss": 0.5226, "step": 2460 }, { "epoch": 1.9500792393026942, "grad_norm": 1.346154170174331, "learning_rate": 1.6516279615655383e-08, "loss": 0.4745, "step": 2461 }, { "epoch": 1.9508716323296356, "grad_norm": 1.455920397885792, "learning_rate": 1.6004451547108613e-08, "loss": 0.5554, "step": 2462 }, { "epoch": 1.951664025356577, "grad_norm": 1.4511758479228456, "learning_rate": 1.5500666682861098e-08, "loss": 0.5018, "step": 2463 }, { "epoch": 1.9524564183835182, "grad_norm": 1.3055910710000465, "learning_rate": 1.500492583592361e-08, "loss": 0.5211, "step": 2464 }, { "epoch": 1.9532488114104596, "grad_norm": 1.4532767795872907, "learning_rate": 1.4517229806326194e-08, "loss": 0.5136, "step": 2465 }, { "epoch": 1.954041204437401, "grad_norm": 1.361114739396051, "learning_rate": 1.403757938111594e-08, "loss": 0.3822, "step": 2466 }, { "epoch": 1.9548335974643423, "grad_norm": 1.6042961566829361, "learning_rate": 1.3565975334355886e-08, "loss": 0.4481, "step": 2467 }, { "epoch": 1.9556259904912836, "grad_norm": 1.4533158017911443, "learning_rate": 1.3102418427123343e-08, "loss": 0.5024, "step": 2468 }, { "epoch": 1.956418383518225, "grad_norm": 1.2621840464849328, "learning_rate": 1.26469094075099e-08, "loss": 0.5011, "step": 2469 }, { "epoch": 1.9572107765451663, "grad_norm": 1.2321402967722315, "learning_rate": 1.2199449010618646e-08, "loss": 0.5139, "step": 2470 }, { "epoch": 1.9580031695721076, "grad_norm": 1.413992029188902, "learning_rate": 1.176003795856473e-08, "loss": 0.438, "step": 2471 }, { "epoch": 1.958795562599049, "grad_norm": 1.3676456753317028, "learning_rate": 1.1328676960471463e-08, "loss": 0.5335, "step": 2472 }, { "epoch": 1.9595879556259905, "grad_norm": 1.4151546200329561, "learning_rate": 1.090536671247311e-08, "loss": 0.4672, "step": 2473 }, { "epoch": 1.9603803486529319, "grad_norm": 1.3821848832110717, "learning_rate": 1.0490107897709323e-08, "loss": 0.5108, "step": 2474 }, { "epoch": 1.9611727416798732, "grad_norm": 1.2216260547471456, "learning_rate": 1.0082901186328486e-08, "loss": 0.4894, "step": 2475 }, { "epoch": 1.9619651347068146, "grad_norm": 1.3076077103326784, "learning_rate": 9.683747235483265e-09, "loss": 0.4417, "step": 2476 }, { "epoch": 1.962757527733756, "grad_norm": 2.075869946717767, "learning_rate": 9.292646689330054e-09, "loss": 0.4325, "step": 2477 }, { "epoch": 1.9635499207606975, "grad_norm": 1.435913363992939, "learning_rate": 8.909600179030642e-09, "loss": 0.4709, "step": 2478 }, { "epoch": 1.9643423137876388, "grad_norm": 1.450286499583669, "learning_rate": 8.534608322747218e-09, "loss": 0.5159, "step": 2479 }, { "epoch": 1.9651347068145801, "grad_norm": 1.4432027431041197, "learning_rate": 8.167671725644034e-09, "loss": 0.5288, "step": 2480 }, { "epoch": 1.9659270998415215, "grad_norm": 1.3442454758874278, "learning_rate": 7.808790979886293e-09, "loss": 0.5153, "step": 2481 }, { "epoch": 1.9667194928684628, "grad_norm": 1.4459330684244422, "learning_rate": 7.457966664637384e-09, "loss": 0.4356, "step": 2482 }, { "epoch": 1.9675118858954042, "grad_norm": 1.3549190879600757, "learning_rate": 7.115199346060531e-09, "loss": 0.4887, "step": 2483 }, { "epoch": 1.9683042789223455, "grad_norm": 1.7463404510990286, "learning_rate": 6.780489577315474e-09, "loss": 0.4093, "step": 2484 }, { "epoch": 1.9690966719492868, "grad_norm": 1.3962199101150996, "learning_rate": 6.453837898559023e-09, "loss": 0.5082, "step": 2485 }, { "epoch": 1.9698890649762282, "grad_norm": 1.3639331990406567, "learning_rate": 6.135244836943388e-09, "loss": 0.5049, "step": 2486 }, { "epoch": 1.9706814580031695, "grad_norm": 1.650092097402154, "learning_rate": 5.824710906617292e-09, "loss": 0.4642, "step": 2487 }, { "epoch": 1.9714738510301109, "grad_norm": 1.366589312230384, "learning_rate": 5.522236608720977e-09, "loss": 0.4249, "step": 2488 }, { "epoch": 1.9722662440570522, "grad_norm": 1.5300800986955854, "learning_rate": 5.227822431390639e-09, "loss": 0.4983, "step": 2489 }, { "epoch": 1.9730586370839935, "grad_norm": 1.3408324202896011, "learning_rate": 4.941468849752884e-09, "loss": 0.5086, "step": 2490 }, { "epoch": 1.9738510301109349, "grad_norm": 1.637031209794462, "learning_rate": 4.663176325927499e-09, "loss": 0.3958, "step": 2491 }, { "epoch": 1.9746434231378764, "grad_norm": 1.559683434735915, "learning_rate": 4.392945309024121e-09, "loss": 0.4407, "step": 2492 }, { "epoch": 1.9754358161648178, "grad_norm": 1.3210027336095467, "learning_rate": 4.130776235144462e-09, "loss": 0.467, "step": 2493 }, { "epoch": 1.9762282091917591, "grad_norm": 1.521286648610697, "learning_rate": 3.876669527377863e-09, "loss": 0.4529, "step": 2494 }, { "epoch": 1.9770206022187005, "grad_norm": 1.4932034481444167, "learning_rate": 3.6306255958029614e-09, "loss": 0.3932, "step": 2495 }, { "epoch": 1.9778129952456418, "grad_norm": 1.3055393597804053, "learning_rate": 3.392644837488246e-09, "loss": 0.4827, "step": 2496 }, { "epoch": 1.9786053882725834, "grad_norm": 1.455493197303617, "learning_rate": 3.162727636487617e-09, "loss": 0.4736, "step": 2497 }, { "epoch": 1.9793977812995247, "grad_norm": 1.21975952037152, "learning_rate": 2.9408743638437156e-09, "loss": 0.5347, "step": 2498 }, { "epoch": 1.980190174326466, "grad_norm": 1.3010374866035426, "learning_rate": 2.727085377584038e-09, "loss": 0.4855, "step": 2499 }, { "epoch": 1.9809825673534074, "grad_norm": 1.6271256375048215, "learning_rate": 2.5213610227226014e-09, "loss": 0.5149, "step": 2500 }, { "epoch": 1.9817749603803487, "grad_norm": 1.5137110023366467, "learning_rate": 2.3237016312599447e-09, "loss": 0.4408, "step": 2501 }, { "epoch": 1.98256735340729, "grad_norm": 1.2572376593098777, "learning_rate": 2.1341075221781306e-09, "loss": 0.3776, "step": 2502 }, { "epoch": 1.9833597464342314, "grad_norm": 1.1940234249775612, "learning_rate": 1.9525790014474077e-09, "loss": 0.6088, "step": 2503 }, { "epoch": 1.9841521394611727, "grad_norm": 1.3755371768934337, "learning_rate": 1.7791163620178853e-09, "loss": 0.4536, "step": 2504 }, { "epoch": 1.984944532488114, "grad_norm": 1.335202033720645, "learning_rate": 1.6137198838261925e-09, "loss": 0.4859, "step": 2505 }, { "epoch": 1.9857369255150554, "grad_norm": 1.5025533853192037, "learning_rate": 1.4563898337888183e-09, "loss": 0.4436, "step": 2506 }, { "epoch": 1.9865293185419968, "grad_norm": 1.4184998947459604, "learning_rate": 1.3071264658076621e-09, "loss": 0.4939, "step": 2507 }, { "epoch": 1.987321711568938, "grad_norm": 1.4059182546317321, "learning_rate": 1.1659300207633727e-09, "loss": 0.4932, "step": 2508 }, { "epoch": 1.9881141045958794, "grad_norm": 1.3849938678529574, "learning_rate": 1.0328007265203443e-09, "loss": 0.5703, "step": 2509 }, { "epoch": 1.9889064976228208, "grad_norm": 1.5651833457168252, "learning_rate": 9.077387979233853e-10, "loss": 0.4442, "step": 2510 }, { "epoch": 1.9896988906497621, "grad_norm": 1.204075858956804, "learning_rate": 7.907444367982742e-10, "loss": 0.5623, "step": 2511 }, { "epoch": 1.9904912836767037, "grad_norm": 1.5678980073393989, "learning_rate": 6.818178319517588e-10, "loss": 0.457, "step": 2512 }, { "epoch": 1.991283676703645, "grad_norm": 1.9810269480514038, "learning_rate": 5.809591591693364e-10, "loss": 0.4415, "step": 2513 }, { "epoch": 1.9920760697305864, "grad_norm": 1.203443373272525, "learning_rate": 4.881685812180292e-10, "loss": 0.5146, "step": 2514 }, { "epoch": 1.9928684627575277, "grad_norm": 1.3160087884037732, "learning_rate": 4.0344624784416366e-10, "loss": 0.4254, "step": 2515 }, { "epoch": 1.993660855784469, "grad_norm": 1.2108772873272484, "learning_rate": 3.267922957722602e-10, "loss": 0.5161, "step": 2516 }, { "epoch": 1.9944532488114106, "grad_norm": 1.539665689995048, "learning_rate": 2.582068487078093e-10, "loss": 0.5253, "step": 2517 }, { "epoch": 1.995245641838352, "grad_norm": 1.2660451875793903, "learning_rate": 1.9769001733394023e-10, "loss": 0.5253, "step": 2518 }, { "epoch": 1.9960380348652933, "grad_norm": 1.5474901775203977, "learning_rate": 1.4524189931308662e-10, "loss": 0.4315, "step": 2519 }, { "epoch": 1.9968304278922346, "grad_norm": 1.4775630761361287, "learning_rate": 1.0086257928698661e-10, "loss": 0.3701, "step": 2520 }, { "epoch": 1.997622820919176, "grad_norm": 1.5648034945947824, "learning_rate": 6.455212887446217e-11, "loss": 0.4944, "step": 2521 }, { "epoch": 1.9984152139461173, "grad_norm": 1.2312374798252292, "learning_rate": 3.6310606674194724e-11, "loss": 0.4948, "step": 2522 }, { "epoch": 1.9992076069730587, "grad_norm": 1.4367224942380747, "learning_rate": 1.6138058262504807e-11, "loss": 0.551, "step": 2523 }, { "epoch": 2.0, "grad_norm": 1.4255075089706701, "learning_rate": 4.034516193351934e-12, "loss": 0.4279, "step": 2524 } ], "logging_steps": 1, "max_steps": 2524, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1101470958288896.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }