{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.993258426966292, "eval_steps": 500, "global_step": 333, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008988764044943821, "grad_norm": 5.937067934578121, "learning_rate": 2.3529411764705885e-06, "loss": 0.8925, "step": 1 }, { "epoch": 0.017977528089887642, "grad_norm": 5.923591025846198, "learning_rate": 4.705882352941177e-06, "loss": 0.8908, "step": 2 }, { "epoch": 0.02696629213483146, "grad_norm": 5.45957942819648, "learning_rate": 7.058823529411766e-06, "loss": 0.8716, "step": 3 }, { "epoch": 0.035955056179775284, "grad_norm": 3.974971561642825, "learning_rate": 9.411764705882354e-06, "loss": 0.8395, "step": 4 }, { "epoch": 0.0449438202247191, "grad_norm": 2.1772095757152767, "learning_rate": 1.1764705882352942e-05, "loss": 0.7954, "step": 5 }, { "epoch": 0.05393258426966292, "grad_norm": 5.329495741252824, "learning_rate": 1.4117647058823532e-05, "loss": 0.8391, "step": 6 }, { "epoch": 0.06292134831460675, "grad_norm": 7.0082332084809424, "learning_rate": 1.647058823529412e-05, "loss": 0.8077, "step": 7 }, { "epoch": 0.07191011235955057, "grad_norm": 7.179904932078677, "learning_rate": 1.8823529411764708e-05, "loss": 0.8269, "step": 8 }, { "epoch": 0.08089887640449438, "grad_norm": 3.944730501336432, "learning_rate": 2.1176470588235296e-05, "loss": 0.7655, "step": 9 }, { "epoch": 0.0898876404494382, "grad_norm": 3.094924202191644, "learning_rate": 2.3529411764705884e-05, "loss": 0.7356, "step": 10 }, { "epoch": 0.09887640449438202, "grad_norm": 2.1418931101275476, "learning_rate": 2.5882352941176475e-05, "loss": 0.7015, "step": 11 }, { "epoch": 0.10786516853932585, "grad_norm": 1.8786098685363002, "learning_rate": 2.8235294117647063e-05, "loss": 0.6907, "step": 12 }, { "epoch": 0.11685393258426967, "grad_norm": 1.217510887771457, "learning_rate": 3.0588235294117644e-05, "loss": 0.6657, "step": 13 }, { "epoch": 0.1258426966292135, "grad_norm": 1.645912176172542, "learning_rate": 3.294117647058824e-05, "loss": 0.6501, "step": 14 }, { "epoch": 0.1348314606741573, "grad_norm": 1.1214763643282593, "learning_rate": 3.529411764705883e-05, "loss": 0.6431, "step": 15 }, { "epoch": 0.14382022471910114, "grad_norm": 1.3566330254296206, "learning_rate": 3.7647058823529415e-05, "loss": 0.6305, "step": 16 }, { "epoch": 0.15280898876404495, "grad_norm": 1.5612433580633596, "learning_rate": 4e-05, "loss": 0.6239, "step": 17 }, { "epoch": 0.16179775280898875, "grad_norm": 1.6192637419680402, "learning_rate": 4.235294117647059e-05, "loss": 0.6169, "step": 18 }, { "epoch": 0.1707865168539326, "grad_norm": 1.2309268080406381, "learning_rate": 4.470588235294118e-05, "loss": 0.6097, "step": 19 }, { "epoch": 0.1797752808988764, "grad_norm": 1.862538444377319, "learning_rate": 4.705882352941177e-05, "loss": 0.6134, "step": 20 }, { "epoch": 0.18876404494382024, "grad_norm": 1.5958037318215281, "learning_rate": 4.941176470588236e-05, "loss": 0.6007, "step": 21 }, { "epoch": 0.19775280898876405, "grad_norm": 1.357215808802562, "learning_rate": 5.176470588235295e-05, "loss": 0.5961, "step": 22 }, { "epoch": 0.20674157303370785, "grad_norm": 1.3402016135380905, "learning_rate": 5.411764705882354e-05, "loss": 0.5872, "step": 23 }, { "epoch": 0.2157303370786517, "grad_norm": 1.9136536863131055, "learning_rate": 5.6470588235294126e-05, "loss": 0.5779, "step": 24 }, { "epoch": 0.2247191011235955, "grad_norm": 1.0205411195750105, "learning_rate": 5.8823529411764714e-05, "loss": 0.5728, "step": 25 }, { "epoch": 0.23370786516853934, "grad_norm": 2.6452559257739265, "learning_rate": 6.117647058823529e-05, "loss": 0.5948, "step": 26 }, { "epoch": 0.24269662921348314, "grad_norm": 1.7949903848826065, "learning_rate": 6.352941176470589e-05, "loss": 0.5869, "step": 27 }, { "epoch": 0.251685393258427, "grad_norm": 1.9069324360886062, "learning_rate": 6.588235294117648e-05, "loss": 0.5816, "step": 28 }, { "epoch": 0.2606741573033708, "grad_norm": 1.7027526272642033, "learning_rate": 6.823529411764707e-05, "loss": 0.5674, "step": 29 }, { "epoch": 0.2696629213483146, "grad_norm": 1.6647039397669456, "learning_rate": 7.058823529411765e-05, "loss": 0.5801, "step": 30 }, { "epoch": 0.2786516853932584, "grad_norm": 1.2647930520355923, "learning_rate": 7.294117647058824e-05, "loss": 0.5622, "step": 31 }, { "epoch": 0.2876404494382023, "grad_norm": 1.8668167622362088, "learning_rate": 7.529411764705883e-05, "loss": 0.5628, "step": 32 }, { "epoch": 0.2966292134831461, "grad_norm": 1.6151821153814043, "learning_rate": 7.764705882352942e-05, "loss": 0.569, "step": 33 }, { "epoch": 0.3056179775280899, "grad_norm": 1.9972629519815388, "learning_rate": 8e-05, "loss": 0.5559, "step": 34 }, { "epoch": 0.3146067415730337, "grad_norm": 1.930654248915306, "learning_rate": 7.999779207981935e-05, "loss": 0.5578, "step": 35 }, { "epoch": 0.3235955056179775, "grad_norm": 1.1040009369498223, "learning_rate": 7.999116856302298e-05, "loss": 0.5553, "step": 36 }, { "epoch": 0.3325842696629214, "grad_norm": 1.9114787612288138, "learning_rate": 7.998013018082072e-05, "loss": 0.5518, "step": 37 }, { "epoch": 0.3415730337078652, "grad_norm": 2.5196506278998476, "learning_rate": 7.996467815180588e-05, "loss": 0.5587, "step": 38 }, { "epoch": 0.350561797752809, "grad_norm": 1.5471557908374793, "learning_rate": 7.994481418182082e-05, "loss": 0.5394, "step": 39 }, { "epoch": 0.3595505617977528, "grad_norm": 2.747162602890951, "learning_rate": 7.992054046376854e-05, "loss": 0.5602, "step": 40 }, { "epoch": 0.3685393258426966, "grad_norm": 1.9963506460084293, "learning_rate": 7.989185967737066e-05, "loss": 0.5489, "step": 41 }, { "epoch": 0.3775280898876405, "grad_norm": 2.356173456359232, "learning_rate": 7.985877498887149e-05, "loss": 0.554, "step": 42 }, { "epoch": 0.3865168539325843, "grad_norm": 1.9651033697846167, "learning_rate": 7.982129005068865e-05, "loss": 0.5432, "step": 43 }, { "epoch": 0.3955056179775281, "grad_norm": 1.7900792925575555, "learning_rate": 7.977940900100967e-05, "loss": 0.5476, "step": 44 }, { "epoch": 0.4044943820224719, "grad_norm": 1.8559704498293903, "learning_rate": 7.973313646333532e-05, "loss": 0.535, "step": 45 }, { "epoch": 0.4134831460674157, "grad_norm": 1.4415920297319014, "learning_rate": 7.968247754596908e-05, "loss": 0.5339, "step": 46 }, { "epoch": 0.42247191011235957, "grad_norm": 1.4135208728964568, "learning_rate": 7.962743784145323e-05, "loss": 0.5381, "step": 47 }, { "epoch": 0.4314606741573034, "grad_norm": 1.6688537146906361, "learning_rate": 7.956802342595152e-05, "loss": 0.5328, "step": 48 }, { "epoch": 0.4404494382022472, "grad_norm": 1.3978532275006461, "learning_rate": 7.950424085857827e-05, "loss": 0.5265, "step": 49 }, { "epoch": 0.449438202247191, "grad_norm": 1.7334227968766784, "learning_rate": 7.943609718067437e-05, "loss": 0.5245, "step": 50 }, { "epoch": 0.4584269662921348, "grad_norm": 0.7953199977805888, "learning_rate": 7.936359991502993e-05, "loss": 0.5167, "step": 51 }, { "epoch": 0.46741573033707867, "grad_norm": 1.6871487499435203, "learning_rate": 7.92867570650537e-05, "loss": 0.5325, "step": 52 }, { "epoch": 0.4764044943820225, "grad_norm": 1.304827694741159, "learning_rate": 7.920557711388967e-05, "loss": 0.5371, "step": 53 }, { "epoch": 0.4853932584269663, "grad_norm": 1.6036182086909745, "learning_rate": 7.912006902348045e-05, "loss": 0.5245, "step": 54 }, { "epoch": 0.4943820224719101, "grad_norm": 1.0607757354012914, "learning_rate": 7.903024223357797e-05, "loss": 0.5289, "step": 55 }, { "epoch": 0.503370786516854, "grad_norm": 2.0583378114050728, "learning_rate": 7.893610666070134e-05, "loss": 0.5346, "step": 56 }, { "epoch": 0.5123595505617977, "grad_norm": 1.4305584900288937, "learning_rate": 7.883767269704209e-05, "loss": 0.5251, "step": 57 }, { "epoch": 0.5213483146067416, "grad_norm": 1.5466083404792839, "learning_rate": 7.873495120931697e-05, "loss": 0.5242, "step": 58 }, { "epoch": 0.5303370786516854, "grad_norm": 1.3322841661439688, "learning_rate": 7.86279535375683e-05, "loss": 0.5266, "step": 59 }, { "epoch": 0.5393258426966292, "grad_norm": 1.2220900027281376, "learning_rate": 7.851669149391198e-05, "loss": 0.5228, "step": 60 }, { "epoch": 0.5483146067415731, "grad_norm": 1.5982554174133687, "learning_rate": 7.84011773612336e-05, "loss": 0.5231, "step": 61 }, { "epoch": 0.5573033707865168, "grad_norm": 1.146536447272186, "learning_rate": 7.828142389183239e-05, "loss": 0.5195, "step": 62 }, { "epoch": 0.5662921348314607, "grad_norm": 1.0879405901836858, "learning_rate": 7.815744430601344e-05, "loss": 0.5129, "step": 63 }, { "epoch": 0.5752808988764045, "grad_norm": 1.6921788651579457, "learning_rate": 7.802925229062823e-05, "loss": 0.521, "step": 64 }, { "epoch": 0.5842696629213483, "grad_norm": 1.0795726495535796, "learning_rate": 7.789686199756365e-05, "loss": 0.5172, "step": 65 }, { "epoch": 0.5932584269662922, "grad_norm": 1.1065195406196553, "learning_rate": 7.776028804217968e-05, "loss": 0.5134, "step": 66 }, { "epoch": 0.6022471910112359, "grad_norm": 1.2749574533792616, "learning_rate": 7.761954550169593e-05, "loss": 0.5161, "step": 67 }, { "epoch": 0.6112359550561798, "grad_norm": 1.3329895959967077, "learning_rate": 7.74746499135272e-05, "loss": 0.5153, "step": 68 }, { "epoch": 0.6202247191011236, "grad_norm": 1.233521082461672, "learning_rate": 7.732561727356811e-05, "loss": 0.5086, "step": 69 }, { "epoch": 0.6292134831460674, "grad_norm": 1.1760347925547088, "learning_rate": 7.717246403442735e-05, "loss": 0.5137, "step": 70 }, { "epoch": 0.6382022471910113, "grad_norm": 1.1780036452883795, "learning_rate": 7.701520710361129e-05, "loss": 0.4999, "step": 71 }, { "epoch": 0.647191011235955, "grad_norm": 0.948808130697301, "learning_rate": 7.685386384165748e-05, "loss": 0.5036, "step": 72 }, { "epoch": 0.6561797752808989, "grad_norm": 1.7675174539861906, "learning_rate": 7.668845206021812e-05, "loss": 0.5132, "step": 73 }, { "epoch": 0.6651685393258427, "grad_norm": 0.7501534484404624, "learning_rate": 7.651899002009375e-05, "loss": 0.5044, "step": 74 }, { "epoch": 0.6741573033707865, "grad_norm": 1.2580366817929822, "learning_rate": 7.634549642921725e-05, "loss": 0.5158, "step": 75 }, { "epoch": 0.6831460674157304, "grad_norm": 1.2116010404784998, "learning_rate": 7.616799044058867e-05, "loss": 0.5132, "step": 76 }, { "epoch": 0.6921348314606741, "grad_norm": 1.504394845229427, "learning_rate": 7.598649165016073e-05, "loss": 0.5051, "step": 77 }, { "epoch": 0.701123595505618, "grad_norm": 0.8286673874695757, "learning_rate": 7.58010200946755e-05, "loss": 0.5, "step": 78 }, { "epoch": 0.7101123595505618, "grad_norm": 1.6610859144234769, "learning_rate": 7.561159624945257e-05, "loss": 0.5094, "step": 79 }, { "epoch": 0.7191011235955056, "grad_norm": 0.8748352440236109, "learning_rate": 7.541824102612839e-05, "loss": 0.5009, "step": 80 }, { "epoch": 0.7280898876404495, "grad_norm": 1.4412943546400492, "learning_rate": 7.5220975770348e-05, "loss": 0.501, "step": 81 }, { "epoch": 0.7370786516853932, "grad_norm": 1.017225970339285, "learning_rate": 7.501982225940833e-05, "loss": 0.5009, "step": 82 }, { "epoch": 0.7460674157303371, "grad_norm": 1.3512165456613465, "learning_rate": 7.48148026998542e-05, "loss": 0.503, "step": 83 }, { "epoch": 0.755056179775281, "grad_norm": 1.0549525315803745, "learning_rate": 7.460593972502674e-05, "loss": 0.4953, "step": 84 }, { "epoch": 0.7640449438202247, "grad_norm": 1.1196254022912724, "learning_rate": 7.439325639256483e-05, "loss": 0.4897, "step": 85 }, { "epoch": 0.7730337078651686, "grad_norm": 1.1317585655215339, "learning_rate": 7.417677618185955e-05, "loss": 0.4969, "step": 86 }, { "epoch": 0.7820224719101123, "grad_norm": 1.2763979132088588, "learning_rate": 7.39565229914622e-05, "loss": 0.5046, "step": 87 }, { "epoch": 0.7910112359550562, "grad_norm": 1.2131663136230482, "learning_rate": 7.373252113644596e-05, "loss": 0.5006, "step": 88 }, { "epoch": 0.8, "grad_norm": 1.121494784629588, "learning_rate": 7.350479534572166e-05, "loss": 0.4949, "step": 89 }, { "epoch": 0.8089887640449438, "grad_norm": 0.9241654008148638, "learning_rate": 7.327337075930775e-05, "loss": 0.5003, "step": 90 }, { "epoch": 0.8179775280898877, "grad_norm": 1.4368488374102792, "learning_rate": 7.303827292555495e-05, "loss": 0.4907, "step": 91 }, { "epoch": 0.8269662921348314, "grad_norm": 1.2319104612634353, "learning_rate": 7.279952779832584e-05, "loss": 0.5043, "step": 92 }, { "epoch": 0.8359550561797753, "grad_norm": 0.8671468092587438, "learning_rate": 7.255716173412966e-05, "loss": 0.486, "step": 93 }, { "epoch": 0.8449438202247191, "grad_norm": 1.3364076805186555, "learning_rate": 7.23112014892126e-05, "loss": 0.4932, "step": 94 }, { "epoch": 0.8539325842696629, "grad_norm": 1.0720511804076949, "learning_rate": 7.20616742166041e-05, "loss": 0.49, "step": 95 }, { "epoch": 0.8629213483146068, "grad_norm": 0.8784137232709633, "learning_rate": 7.180860746311917e-05, "loss": 0.4911, "step": 96 }, { "epoch": 0.8719101123595505, "grad_norm": 1.0650244396836273, "learning_rate": 7.155202916631743e-05, "loss": 0.4922, "step": 97 }, { "epoch": 0.8808988764044944, "grad_norm": 1.15396517981509, "learning_rate": 7.129196765141886e-05, "loss": 0.4899, "step": 98 }, { "epoch": 0.8898876404494382, "grad_norm": 0.8409463006756092, "learning_rate": 7.10284516281768e-05, "loss": 0.4935, "step": 99 }, { "epoch": 0.898876404494382, "grad_norm": 0.8987420745372244, "learning_rate": 7.076151018770854e-05, "loss": 0.4933, "step": 100 }, { "epoch": 0.9078651685393259, "grad_norm": 1.3476943698616615, "learning_rate": 7.049117279928374e-05, "loss": 0.4941, "step": 101 }, { "epoch": 0.9168539325842696, "grad_norm": 1.0470417246613994, "learning_rate": 7.021746930707117e-05, "loss": 0.4878, "step": 102 }, { "epoch": 0.9258426966292135, "grad_norm": 1.222327504764566, "learning_rate": 6.994042992684406e-05, "loss": 0.478, "step": 103 }, { "epoch": 0.9348314606741573, "grad_norm": 0.8143177540157983, "learning_rate": 6.966008524264429e-05, "loss": 0.4868, "step": 104 }, { "epoch": 0.9438202247191011, "grad_norm": 1.3154017535872904, "learning_rate": 6.937646620340618e-05, "loss": 0.4921, "step": 105 }, { "epoch": 0.952808988764045, "grad_norm": 1.2828038188846893, "learning_rate": 6.908960411953973e-05, "loss": 0.4871, "step": 106 }, { "epoch": 0.9617977528089887, "grad_norm": 0.7484103692531322, "learning_rate": 6.879953065947416e-05, "loss": 0.4856, "step": 107 }, { "epoch": 0.9707865168539326, "grad_norm": 1.393234551386416, "learning_rate": 6.850627784616178e-05, "loss": 0.4884, "step": 108 }, { "epoch": 0.9797752808988764, "grad_norm": 0.6778085483847683, "learning_rate": 6.82098780535428e-05, "loss": 0.4844, "step": 109 }, { "epoch": 0.9887640449438202, "grad_norm": 0.95356400631233, "learning_rate": 6.791036400297142e-05, "loss": 0.4766, "step": 110 }, { "epoch": 0.9977528089887641, "grad_norm": 0.9616360385718218, "learning_rate": 6.760776875960347e-05, "loss": 0.4796, "step": 111 }, { "epoch": 1.006741573033708, "grad_norm": 1.8748279594605497, "learning_rate": 6.730212572874618e-05, "loss": 0.8205, "step": 112 }, { "epoch": 1.0157303370786517, "grad_norm": 0.7712220221008119, "learning_rate": 6.699346865217031e-05, "loss": 0.4547, "step": 113 }, { "epoch": 1.0247191011235954, "grad_norm": 1.1476060733360436, "learning_rate": 6.668183160438531e-05, "loss": 0.4598, "step": 114 }, { "epoch": 1.0337078651685394, "grad_norm": 1.091336171209942, "learning_rate": 6.636724898887751e-05, "loss": 0.4566, "step": 115 }, { "epoch": 1.0426966292134832, "grad_norm": 1.356832256792843, "learning_rate": 6.604975553431219e-05, "loss": 0.4636, "step": 116 }, { "epoch": 1.051685393258427, "grad_norm": 0.8230787378003628, "learning_rate": 6.572938629069959e-05, "loss": 0.4539, "step": 117 }, { "epoch": 1.060674157303371, "grad_norm": 0.7577254786527922, "learning_rate": 6.540617662552565e-05, "loss": 0.4626, "step": 118 }, { "epoch": 1.0696629213483146, "grad_norm": 0.9663188263281904, "learning_rate": 6.508016221984747e-05, "loss": 0.4507, "step": 119 }, { "epoch": 1.0786516853932584, "grad_norm": 1.4024310383300067, "learning_rate": 6.475137906435435e-05, "loss": 0.4613, "step": 120 }, { "epoch": 1.0876404494382022, "grad_norm": 0.7663106953730127, "learning_rate": 6.441986345539446e-05, "loss": 0.4656, "step": 121 }, { "epoch": 1.0966292134831461, "grad_norm": 1.1216710559366627, "learning_rate": 6.408565199096798e-05, "loss": 0.4529, "step": 122 }, { "epoch": 1.1056179775280899, "grad_norm": 0.8823684903577822, "learning_rate": 6.374878156668676e-05, "loss": 0.4588, "step": 123 }, { "epoch": 1.1146067415730336, "grad_norm": 0.7542533792179187, "learning_rate": 6.340928937170118e-05, "loss": 0.4551, "step": 124 }, { "epoch": 1.1235955056179776, "grad_norm": 1.1622893113229344, "learning_rate": 6.30672128845947e-05, "loss": 0.4587, "step": 125 }, { "epoch": 1.1325842696629214, "grad_norm": 0.6982046729289911, "learning_rate": 6.272258986924624e-05, "loss": 0.4527, "step": 126 }, { "epoch": 1.1415730337078651, "grad_norm": 0.5854116152358985, "learning_rate": 6.237545837066133e-05, "loss": 0.4518, "step": 127 }, { "epoch": 1.1505617977528089, "grad_norm": 0.47197050837846644, "learning_rate": 6.202585671077204e-05, "loss": 0.4474, "step": 128 }, { "epoch": 1.1595505617977528, "grad_norm": 0.4631791319657276, "learning_rate": 6.167382348420637e-05, "loss": 0.4573, "step": 129 }, { "epoch": 1.1685393258426966, "grad_norm": 0.6398622174196904, "learning_rate": 6.131939755402755e-05, "loss": 0.4551, "step": 130 }, { "epoch": 1.1775280898876406, "grad_norm": 1.110637394887812, "learning_rate": 6.09626180474438e-05, "loss": 0.4524, "step": 131 }, { "epoch": 1.1865168539325843, "grad_norm": 1.377350137912211, "learning_rate": 6.060352435148874e-05, "loss": 0.4559, "step": 132 }, { "epoch": 1.195505617977528, "grad_norm": 0.5283491845810788, "learning_rate": 6.024215610867327e-05, "loss": 0.4603, "step": 133 }, { "epoch": 1.2044943820224718, "grad_norm": 0.9966520169090612, "learning_rate": 5.9878553212609184e-05, "loss": 0.4516, "step": 134 }, { "epoch": 1.2134831460674158, "grad_norm": 1.4576011324058094, "learning_rate": 5.95127558036051e-05, "loss": 0.4596, "step": 135 }, { "epoch": 1.2224719101123596, "grad_norm": 0.697821502136658, "learning_rate": 5.9144804264235066e-05, "loss": 0.4576, "step": 136 }, { "epoch": 1.2314606741573033, "grad_norm": 1.5590501658526488, "learning_rate": 5.8774739214880554e-05, "loss": 0.4602, "step": 137 }, { "epoch": 1.2404494382022473, "grad_norm": 0.6045793781300405, "learning_rate": 5.840260150924609e-05, "loss": 0.45, "step": 138 }, { "epoch": 1.249438202247191, "grad_norm": 1.417376379462992, "learning_rate": 5.802843222984919e-05, "loss": 0.4564, "step": 139 }, { "epoch": 1.2584269662921348, "grad_norm": 0.7383419101529723, "learning_rate": 5.765227268348501e-05, "loss": 0.4533, "step": 140 }, { "epoch": 1.2674157303370785, "grad_norm": 1.4264362126630619, "learning_rate": 5.727416439666622e-05, "loss": 0.457, "step": 141 }, { "epoch": 1.2764044943820225, "grad_norm": 0.8638040394017062, "learning_rate": 5.689414911103867e-05, "loss": 0.4516, "step": 142 }, { "epoch": 1.2853932584269663, "grad_norm": 1.42829306450921, "learning_rate": 5.651226877877326e-05, "loss": 0.4556, "step": 143 }, { "epoch": 1.29438202247191, "grad_norm": 1.0514839616322087, "learning_rate": 5.612856555793459e-05, "loss": 0.4554, "step": 144 }, { "epoch": 1.303370786516854, "grad_norm": 1.2796676473517508, "learning_rate": 5.574308180782693e-05, "loss": 0.4491, "step": 145 }, { "epoch": 1.3123595505617978, "grad_norm": 1.0932581128919143, "learning_rate": 5.5355860084317787e-05, "loss": 0.4575, "step": 146 }, { "epoch": 1.3213483146067415, "grad_norm": 0.8711585447910877, "learning_rate": 5.496694313514009e-05, "loss": 0.4536, "step": 147 }, { "epoch": 1.3303370786516853, "grad_norm": 0.976493438245937, "learning_rate": 5.457637389517285e-05, "loss": 0.4616, "step": 148 }, { "epoch": 1.3393258426966292, "grad_norm": 0.600498943325478, "learning_rate": 5.4184195481701425e-05, "loss": 0.4474, "step": 149 }, { "epoch": 1.348314606741573, "grad_norm": 0.7706578711873172, "learning_rate": 5.3790451189657486e-05, "loss": 0.4511, "step": 150 }, { "epoch": 1.357303370786517, "grad_norm": 0.5178771915217835, "learning_rate": 5.339518448683945e-05, "loss": 0.4514, "step": 151 }, { "epoch": 1.3662921348314607, "grad_norm": 0.7334914676346533, "learning_rate": 5.2998439009113814e-05, "loss": 0.4522, "step": 152 }, { "epoch": 1.3752808988764045, "grad_norm": 0.5701908032527899, "learning_rate": 5.260025855559792e-05, "loss": 0.457, "step": 153 }, { "epoch": 1.3842696629213482, "grad_norm": 0.45968308679963693, "learning_rate": 5.2200687083824706e-05, "loss": 0.4501, "step": 154 }, { "epoch": 1.3932584269662922, "grad_norm": 0.5556354013625224, "learning_rate": 5.179976870488999e-05, "loss": 0.4508, "step": 155 }, { "epoch": 1.402247191011236, "grad_norm": 0.4445793572850935, "learning_rate": 5.1397547678582745e-05, "loss": 0.4507, "step": 156 }, { "epoch": 1.4112359550561797, "grad_norm": 0.5201992901363155, "learning_rate": 5.099406840849902e-05, "loss": 0.4535, "step": 157 }, { "epoch": 1.4202247191011237, "grad_norm": 0.5297850125662515, "learning_rate": 5.058937543713999e-05, "loss": 0.4436, "step": 158 }, { "epoch": 1.4292134831460674, "grad_norm": 0.3969459180847379, "learning_rate": 5.018351344099453e-05, "loss": 0.4532, "step": 159 }, { "epoch": 1.4382022471910112, "grad_norm": 0.47432246499785363, "learning_rate": 4.9776527225607274e-05, "loss": 0.4511, "step": 160 }, { "epoch": 1.447191011235955, "grad_norm": 0.31975394190454653, "learning_rate": 4.93684617206321e-05, "loss": 0.4485, "step": 161 }, { "epoch": 1.456179775280899, "grad_norm": 0.4164002143535295, "learning_rate": 4.89593619748722e-05, "loss": 0.4481, "step": 162 }, { "epoch": 1.4651685393258427, "grad_norm": 0.367528177243067, "learning_rate": 4.8549273151306795e-05, "loss": 0.4446, "step": 163 }, { "epoch": 1.4741573033707867, "grad_norm": 0.36494718836358464, "learning_rate": 4.8138240522105365e-05, "loss": 0.4516, "step": 164 }, { "epoch": 1.4831460674157304, "grad_norm": 0.2981123577566662, "learning_rate": 4.7726309463629733e-05, "loss": 0.439, "step": 165 }, { "epoch": 1.4921348314606742, "grad_norm": 0.31072790704633446, "learning_rate": 4.731352545142478e-05, "loss": 0.449, "step": 166 }, { "epoch": 1.501123595505618, "grad_norm": 0.3992742056784971, "learning_rate": 4.689993405519802e-05, "loss": 0.4433, "step": 167 }, { "epoch": 1.5101123595505617, "grad_norm": 0.2594367011131309, "learning_rate": 4.648558093378899e-05, "loss": 0.4501, "step": 168 }, { "epoch": 1.5191011235955056, "grad_norm": 0.2830467959001486, "learning_rate": 4.607051183012862e-05, "loss": 0.4422, "step": 169 }, { "epoch": 1.5280898876404494, "grad_norm": 0.28137593410743117, "learning_rate": 4.5654772566189415e-05, "loss": 0.4439, "step": 170 }, { "epoch": 1.5370786516853934, "grad_norm": 0.30222361671491843, "learning_rate": 4.5238409037926905e-05, "loss": 0.4464, "step": 171 }, { "epoch": 1.5460674157303371, "grad_norm": 0.2984953677611448, "learning_rate": 4.4821467210212924e-05, "loss": 0.4515, "step": 172 }, { "epoch": 1.5550561797752809, "grad_norm": 0.29477762612841285, "learning_rate": 4.4403993111761265e-05, "loss": 0.4487, "step": 173 }, { "epoch": 1.5640449438202246, "grad_norm": 0.27738094537303265, "learning_rate": 4.398603283004626e-05, "loss": 0.4349, "step": 174 }, { "epoch": 1.5730337078651684, "grad_norm": 0.27179761514817713, "learning_rate": 4.356763250621496e-05, "loss": 0.439, "step": 175 }, { "epoch": 1.5820224719101124, "grad_norm": 0.30145150102869067, "learning_rate": 4.314883832999326e-05, "loss": 0.4399, "step": 176 }, { "epoch": 1.5910112359550563, "grad_norm": 0.27593285844535975, "learning_rate": 4.272969653458685e-05, "loss": 0.449, "step": 177 }, { "epoch": 1.6, "grad_norm": 0.30712340293439133, "learning_rate": 4.231025339157714e-05, "loss": 0.4368, "step": 178 }, { "epoch": 1.6089887640449438, "grad_norm": 0.26832779433391346, "learning_rate": 4.189055520581315e-05, "loss": 0.4447, "step": 179 }, { "epoch": 1.6179775280898876, "grad_norm": 0.2800675843384406, "learning_rate": 4.147064831029959e-05, "loss": 0.445, "step": 180 }, { "epoch": 1.6269662921348313, "grad_norm": 0.3067111758990106, "learning_rate": 4.105057906108189e-05, "loss": 0.4452, "step": 181 }, { "epoch": 1.6359550561797753, "grad_norm": 0.24971049417224553, "learning_rate": 4.063039383212866e-05, "loss": 0.4425, "step": 182 }, { "epoch": 1.644943820224719, "grad_norm": 0.2020310973856206, "learning_rate": 4.021013901021225e-05, "loss": 0.4411, "step": 183 }, { "epoch": 1.653932584269663, "grad_norm": 0.2574200192037008, "learning_rate": 3.978986098978777e-05, "loss": 0.4517, "step": 184 }, { "epoch": 1.6629213483146068, "grad_norm": 0.2736980359380189, "learning_rate": 3.936960616787135e-05, "loss": 0.4446, "step": 185 }, { "epoch": 1.6719101123595506, "grad_norm": 0.24658217170903196, "learning_rate": 3.8949420938918124e-05, "loss": 0.4364, "step": 186 }, { "epoch": 1.6808988764044943, "grad_norm": 0.2767599168622161, "learning_rate": 3.852935168970042e-05, "loss": 0.4357, "step": 187 }, { "epoch": 1.689887640449438, "grad_norm": 0.2548602216487942, "learning_rate": 3.810944479418686e-05, "loss": 0.4399, "step": 188 }, { "epoch": 1.698876404494382, "grad_norm": 0.2640517946445997, "learning_rate": 3.768974660842287e-05, "loss": 0.4475, "step": 189 }, { "epoch": 1.7078651685393258, "grad_norm": 0.25913290546087026, "learning_rate": 3.727030346541317e-05, "loss": 0.437, "step": 190 }, { "epoch": 1.7168539325842698, "grad_norm": 0.21742230610782412, "learning_rate": 3.685116167000675e-05, "loss": 0.4376, "step": 191 }, { "epoch": 1.7258426966292135, "grad_norm": 0.2076811227564877, "learning_rate": 3.6432367493785056e-05, "loss": 0.438, "step": 192 }, { "epoch": 1.7348314606741573, "grad_norm": 0.3321278391017944, "learning_rate": 3.601396716995375e-05, "loss": 0.4388, "step": 193 }, { "epoch": 1.743820224719101, "grad_norm": 0.2478941601472815, "learning_rate": 3.559600688823875e-05, "loss": 0.4452, "step": 194 }, { "epoch": 1.7528089887640448, "grad_norm": 0.28877121684016926, "learning_rate": 3.517853278978708e-05, "loss": 0.442, "step": 195 }, { "epoch": 1.7617977528089888, "grad_norm": 0.2393813520807247, "learning_rate": 3.4761590962073115e-05, "loss": 0.4437, "step": 196 }, { "epoch": 1.7707865168539327, "grad_norm": 0.26771357483054375, "learning_rate": 3.434522743381061e-05, "loss": 0.4496, "step": 197 }, { "epoch": 1.7797752808988765, "grad_norm": 0.2506533050953217, "learning_rate": 3.39294881698714e-05, "loss": 0.4443, "step": 198 }, { "epoch": 1.7887640449438202, "grad_norm": 0.21252098175250356, "learning_rate": 3.3514419066211025e-05, "loss": 0.4383, "step": 199 }, { "epoch": 1.797752808988764, "grad_norm": 0.3227331899101581, "learning_rate": 3.310006594480199e-05, "loss": 0.4373, "step": 200 }, { "epoch": 1.8067415730337077, "grad_norm": 0.2595288336777384, "learning_rate": 3.268647454857524e-05, "loss": 0.4432, "step": 201 }, { "epoch": 1.8157303370786517, "grad_norm": 0.22248085130320008, "learning_rate": 3.227369053637028e-05, "loss": 0.4378, "step": 202 }, { "epoch": 1.8247191011235955, "grad_norm": 0.26022739350380103, "learning_rate": 3.1861759477894656e-05, "loss": 0.4511, "step": 203 }, { "epoch": 1.8337078651685395, "grad_norm": 0.2525774955154011, "learning_rate": 3.145072684869322e-05, "loss": 0.4408, "step": 204 }, { "epoch": 1.8426966292134832, "grad_norm": 0.19933336701791182, "learning_rate": 3.104063802512782e-05, "loss": 0.4428, "step": 205 }, { "epoch": 1.851685393258427, "grad_norm": 0.2618315402075057, "learning_rate": 3.063153827936792e-05, "loss": 0.4387, "step": 206 }, { "epoch": 1.8606741573033707, "grad_norm": 0.2612229535126854, "learning_rate": 3.0223472774392753e-05, "loss": 0.4373, "step": 207 }, { "epoch": 1.8696629213483145, "grad_norm": 0.23175557173041528, "learning_rate": 2.9816486559005482e-05, "loss": 0.4428, "step": 208 }, { "epoch": 1.8786516853932584, "grad_norm": 0.2659832710112894, "learning_rate": 2.9410624562860026e-05, "loss": 0.4374, "step": 209 }, { "epoch": 1.8876404494382022, "grad_norm": 0.2394604361123856, "learning_rate": 2.9005931591500974e-05, "loss": 0.4377, "step": 210 }, { "epoch": 1.8966292134831462, "grad_norm": 0.21085090202042212, "learning_rate": 2.860245232141726e-05, "loss": 0.4446, "step": 211 }, { "epoch": 1.90561797752809, "grad_norm": 0.24320843039780787, "learning_rate": 2.8200231295110012e-05, "loss": 0.4449, "step": 212 }, { "epoch": 1.9146067415730337, "grad_norm": 0.18825252205454282, "learning_rate": 2.7799312916175294e-05, "loss": 0.4474, "step": 213 }, { "epoch": 1.9235955056179774, "grad_norm": 0.21020622489218485, "learning_rate": 2.7399741444402087e-05, "loss": 0.4396, "step": 214 }, { "epoch": 1.9325842696629212, "grad_norm": 0.19967691714665464, "learning_rate": 2.7001560990886196e-05, "loss": 0.432, "step": 215 }, { "epoch": 1.9415730337078652, "grad_norm": 0.1723917394030639, "learning_rate": 2.6604815513160556e-05, "loss": 0.4347, "step": 216 }, { "epoch": 1.9505617977528091, "grad_norm": 0.2286024751356636, "learning_rate": 2.6209548810342517e-05, "loss": 0.4272, "step": 217 }, { "epoch": 1.9595505617977529, "grad_norm": 0.1609850927229707, "learning_rate": 2.5815804518298575e-05, "loss": 0.4373, "step": 218 }, { "epoch": 1.9685393258426966, "grad_norm": 0.17225954372932897, "learning_rate": 2.542362610482715e-05, "loss": 0.4346, "step": 219 }, { "epoch": 1.9775280898876404, "grad_norm": 0.14538818432484127, "learning_rate": 2.503305686485991e-05, "loss": 0.444, "step": 220 }, { "epoch": 1.9865168539325841, "grad_norm": 0.17741790417981537, "learning_rate": 2.464413991568222e-05, "loss": 0.4413, "step": 221 }, { "epoch": 1.9955056179775281, "grad_norm": 0.19631454210187727, "learning_rate": 2.4256918192173088e-05, "loss": 0.4383, "step": 222 }, { "epoch": 2.004494382022472, "grad_norm": 0.2777440257511273, "learning_rate": 2.3871434442065414e-05, "loss": 0.7545, "step": 223 }, { "epoch": 2.013483146067416, "grad_norm": 0.2471458608915711, "learning_rate": 2.3487731221226754e-05, "loss": 0.4154, "step": 224 }, { "epoch": 2.0224719101123596, "grad_norm": 0.1821697922847147, "learning_rate": 2.3105850888961348e-05, "loss": 0.4165, "step": 225 }, { "epoch": 2.0314606741573034, "grad_norm": 0.24585959006825425, "learning_rate": 2.272583560333379e-05, "loss": 0.4141, "step": 226 }, { "epoch": 2.040449438202247, "grad_norm": 0.1702964012359119, "learning_rate": 2.2347727316515e-05, "loss": 0.4173, "step": 227 }, { "epoch": 2.049438202247191, "grad_norm": 0.24547374442583564, "learning_rate": 2.1971567770150814e-05, "loss": 0.4039, "step": 228 }, { "epoch": 2.0584269662921346, "grad_norm": 0.19448387676230083, "learning_rate": 2.1597398490753917e-05, "loss": 0.4126, "step": 229 }, { "epoch": 2.067415730337079, "grad_norm": 0.1711476507488459, "learning_rate": 2.1225260785119456e-05, "loss": 0.4112, "step": 230 }, { "epoch": 2.0764044943820226, "grad_norm": 0.21251938925846103, "learning_rate": 2.0855195735764947e-05, "loss": 0.4136, "step": 231 }, { "epoch": 2.0853932584269663, "grad_norm": 0.19044392751354922, "learning_rate": 2.0487244196394912e-05, "loss": 0.4158, "step": 232 }, { "epoch": 2.09438202247191, "grad_norm": 0.1815645592240795, "learning_rate": 2.0121446787390822e-05, "loss": 0.414, "step": 233 }, { "epoch": 2.103370786516854, "grad_norm": 0.2175948364317461, "learning_rate": 1.9757843891326736e-05, "loss": 0.409, "step": 234 }, { "epoch": 2.1123595505617976, "grad_norm": 0.16627200046068014, "learning_rate": 1.939647564851127e-05, "loss": 0.4118, "step": 235 }, { "epoch": 2.121348314606742, "grad_norm": 0.18089353116200013, "learning_rate": 1.9037381952556217e-05, "loss": 0.4082, "step": 236 }, { "epoch": 2.1303370786516855, "grad_norm": 0.1938341058307494, "learning_rate": 1.8680602445972463e-05, "loss": 0.4208, "step": 237 }, { "epoch": 2.1393258426966293, "grad_norm": 0.15296391756902566, "learning_rate": 1.832617651579365e-05, "loss": 0.4143, "step": 238 }, { "epoch": 2.148314606741573, "grad_norm": 0.2007977943202684, "learning_rate": 1.797414328922797e-05, "loss": 0.4126, "step": 239 }, { "epoch": 2.157303370786517, "grad_norm": 0.14549535548231118, "learning_rate": 1.7624541629338676e-05, "loss": 0.4107, "step": 240 }, { "epoch": 2.1662921348314605, "grad_norm": 0.1643469642395592, "learning_rate": 1.7277410130753775e-05, "loss": 0.4147, "step": 241 }, { "epoch": 2.1752808988764043, "grad_norm": 0.1743166147174567, "learning_rate": 1.6932787115405318e-05, "loss": 0.4079, "step": 242 }, { "epoch": 2.1842696629213485, "grad_norm": 0.15594946749382455, "learning_rate": 1.6590710628298826e-05, "loss": 0.41, "step": 243 }, { "epoch": 2.1932584269662923, "grad_norm": 0.16475084403286017, "learning_rate": 1.6251218433313254e-05, "loss": 0.4091, "step": 244 }, { "epoch": 2.202247191011236, "grad_norm": 0.13895524394350253, "learning_rate": 1.591434800903203e-05, "loss": 0.408, "step": 245 }, { "epoch": 2.2112359550561798, "grad_norm": 0.14999957095600605, "learning_rate": 1.558013654460555e-05, "loss": 0.409, "step": 246 }, { "epoch": 2.2202247191011235, "grad_norm": 0.1411289442089014, "learning_rate": 1.5248620935645666e-05, "loss": 0.4108, "step": 247 }, { "epoch": 2.2292134831460673, "grad_norm": 0.13130064830617885, "learning_rate": 1.4919837780152544e-05, "loss": 0.4086, "step": 248 }, { "epoch": 2.238202247191011, "grad_norm": 0.1362901361388582, "learning_rate": 1.4593823374474374e-05, "loss": 0.4134, "step": 249 }, { "epoch": 2.247191011235955, "grad_norm": 0.1641330954517184, "learning_rate": 1.4270613709300429e-05, "loss": 0.4105, "step": 250 }, { "epoch": 2.256179775280899, "grad_norm": 0.11707105305267158, "learning_rate": 1.3950244465687833e-05, "loss": 0.4113, "step": 251 }, { "epoch": 2.2651685393258427, "grad_norm": 0.14883912799444574, "learning_rate": 1.3632751011122497e-05, "loss": 0.4117, "step": 252 }, { "epoch": 2.2741573033707865, "grad_norm": 0.14576734750428819, "learning_rate": 1.3318168395614697e-05, "loss": 0.4086, "step": 253 }, { "epoch": 2.2831460674157302, "grad_norm": 0.1325863043302826, "learning_rate": 1.3006531347829699e-05, "loss": 0.4075, "step": 254 }, { "epoch": 2.292134831460674, "grad_norm": 0.14803632152992002, "learning_rate": 1.2697874271253844e-05, "loss": 0.4092, "step": 255 }, { "epoch": 2.3011235955056177, "grad_norm": 0.1484465193910235, "learning_rate": 1.2392231240396542e-05, "loss": 0.4103, "step": 256 }, { "epoch": 2.310112359550562, "grad_norm": 0.13481968569964842, "learning_rate": 1.2089635997028592e-05, "loss": 0.4165, "step": 257 }, { "epoch": 2.3191011235955057, "grad_norm": 0.14559693501049936, "learning_rate": 1.1790121946457212e-05, "loss": 0.4166, "step": 258 }, { "epoch": 2.3280898876404494, "grad_norm": 0.12726242281070635, "learning_rate": 1.1493722153838239e-05, "loss": 0.407, "step": 259 }, { "epoch": 2.337078651685393, "grad_norm": 0.13666677240818353, "learning_rate": 1.120046934052585e-05, "loss": 0.413, "step": 260 }, { "epoch": 2.346067415730337, "grad_norm": 0.1267204820522753, "learning_rate": 1.0910395880460274e-05, "loss": 0.4112, "step": 261 }, { "epoch": 2.355056179775281, "grad_norm": 0.13846022363503335, "learning_rate": 1.062353379659383e-05, "loss": 0.4182, "step": 262 }, { "epoch": 2.364044943820225, "grad_norm": 0.13931951845911503, "learning_rate": 1.0339914757355718e-05, "loss": 0.4058, "step": 263 }, { "epoch": 2.3730337078651687, "grad_norm": 0.14081173124556376, "learning_rate": 1.0059570073155953e-05, "loss": 0.4224, "step": 264 }, { "epoch": 2.3820224719101124, "grad_norm": 0.11818135028408412, "learning_rate": 9.782530692928832e-06, "loss": 0.4175, "step": 265 }, { "epoch": 2.391011235955056, "grad_norm": 0.12675733200213124, "learning_rate": 9.508827200716273e-06, "loss": 0.4087, "step": 266 }, { "epoch": 2.4, "grad_norm": 0.12595759152751707, "learning_rate": 9.238489812291469e-06, "loss": 0.4081, "step": 267 }, { "epoch": 2.4089887640449437, "grad_norm": 0.12527623811173608, "learning_rate": 8.971548371823205e-06, "loss": 0.4082, "step": 268 }, { "epoch": 2.417977528089888, "grad_norm": 0.122998316643011, "learning_rate": 8.708032348581144e-06, "loss": 0.4073, "step": 269 }, { "epoch": 2.4269662921348316, "grad_norm": 0.10564845207022606, "learning_rate": 8.447970833682584e-06, "loss": 0.4113, "step": 270 }, { "epoch": 2.4359550561797754, "grad_norm": 0.10685572250478023, "learning_rate": 8.191392536880852e-06, "loss": 0.4071, "step": 271 }, { "epoch": 2.444943820224719, "grad_norm": 0.11554548264247637, "learning_rate": 7.938325783395924e-06, "loss": 0.4019, "step": 272 }, { "epoch": 2.453932584269663, "grad_norm": 0.11306318481457633, "learning_rate": 7.68879851078741e-06, "loss": 0.4116, "step": 273 }, { "epoch": 2.4629213483146066, "grad_norm": 0.10283238222894554, "learning_rate": 7.442838265870347e-06, "loss": 0.4097, "step": 274 }, { "epoch": 2.4719101123595504, "grad_norm": 0.11264537827542116, "learning_rate": 7.2004722016741605e-06, "loss": 0.4086, "step": 275 }, { "epoch": 2.4808988764044946, "grad_norm": 0.1184428903128185, "learning_rate": 6.961727074445055e-06, "loss": 0.4048, "step": 276 }, { "epoch": 2.4898876404494383, "grad_norm": 0.10381057720149338, "learning_rate": 6.726629240692255e-06, "loss": 0.406, "step": 277 }, { "epoch": 2.498876404494382, "grad_norm": 0.11411996855242165, "learning_rate": 6.4952046542783395e-06, "loss": 0.4084, "step": 278 }, { "epoch": 2.507865168539326, "grad_norm": 0.11085772018759905, "learning_rate": 6.2674788635540415e-06, "loss": 0.4088, "step": 279 }, { "epoch": 2.5168539325842696, "grad_norm": 0.11581754129424132, "learning_rate": 6.04347700853781e-06, "loss": 0.4091, "step": 280 }, { "epoch": 2.5258426966292133, "grad_norm": 0.10915675557144788, "learning_rate": 5.823223818140458e-06, "loss": 0.4135, "step": 281 }, { "epoch": 2.534831460674157, "grad_norm": 0.10472009125683931, "learning_rate": 5.606743607435183e-06, "loss": 0.4192, "step": 282 }, { "epoch": 2.5438202247191013, "grad_norm": 0.10196405389390528, "learning_rate": 5.394060274973267e-06, "loss": 0.4143, "step": 283 }, { "epoch": 2.552808988764045, "grad_norm": 0.09299288202132956, "learning_rate": 5.185197300145817e-06, "loss": 0.4073, "step": 284 }, { "epoch": 2.561797752808989, "grad_norm": 0.10859218890312015, "learning_rate": 4.980177740591678e-06, "loss": 0.408, "step": 285 }, { "epoch": 2.5707865168539326, "grad_norm": 0.1030110328672013, "learning_rate": 4.779024229652005e-06, "loss": 0.4086, "step": 286 }, { "epoch": 2.5797752808988763, "grad_norm": 0.09705963975610957, "learning_rate": 4.581758973871609e-06, "loss": 0.4053, "step": 287 }, { "epoch": 2.58876404494382, "grad_norm": 0.08893242924745905, "learning_rate": 4.3884037505474455e-06, "loss": 0.4051, "step": 288 }, { "epoch": 2.597752808988764, "grad_norm": 0.09476330135285733, "learning_rate": 4.198979905324496e-06, "loss": 0.4062, "step": 289 }, { "epoch": 2.606741573033708, "grad_norm": 0.1125307729702412, "learning_rate": 4.0135083498392905e-06, "loss": 0.4143, "step": 290 }, { "epoch": 2.6157303370786518, "grad_norm": 0.10546064232164497, "learning_rate": 3.832009559411338e-06, "loss": 0.407, "step": 291 }, { "epoch": 2.6247191011235955, "grad_norm": 0.09832745095728368, "learning_rate": 3.654503570782755e-06, "loss": 0.4094, "step": 292 }, { "epoch": 2.6337078651685393, "grad_norm": 0.09906441503157463, "learning_rate": 3.481009979906258e-06, "loss": 0.408, "step": 293 }, { "epoch": 2.642696629213483, "grad_norm": 0.09565058554084314, "learning_rate": 3.311547939781887e-06, "loss": 0.4064, "step": 294 }, { "epoch": 2.6516853932584272, "grad_norm": 0.09922131509032601, "learning_rate": 3.14613615834253e-06, "loss": 0.4044, "step": 295 }, { "epoch": 2.6606741573033705, "grad_norm": 0.10360258898579061, "learning_rate": 2.9847928963887198e-06, "loss": 0.4054, "step": 296 }, { "epoch": 2.6696629213483147, "grad_norm": 0.09576331652157899, "learning_rate": 2.8275359655726586e-06, "loss": 0.4065, "step": 297 }, { "epoch": 2.6786516853932585, "grad_norm": 0.08804843115482651, "learning_rate": 2.6743827264319012e-06, "loss": 0.4027, "step": 298 }, { "epoch": 2.6876404494382022, "grad_norm": 0.0940651708680553, "learning_rate": 2.5253500864728155e-06, "loss": 0.4048, "step": 299 }, { "epoch": 2.696629213483146, "grad_norm": 0.09087393252432437, "learning_rate": 2.3804544983040724e-06, "loss": 0.4025, "step": 300 }, { "epoch": 2.7056179775280897, "grad_norm": 0.09939517236482039, "learning_rate": 2.23971195782033e-06, "loss": 0.406, "step": 301 }, { "epoch": 2.714606741573034, "grad_norm": 0.15328626125954675, "learning_rate": 2.1031380024363645e-06, "loss": 0.4121, "step": 302 }, { "epoch": 2.7235955056179773, "grad_norm": 0.08802675816187339, "learning_rate": 1.9707477093717786e-06, "loss": 0.4083, "step": 303 }, { "epoch": 2.7325842696629215, "grad_norm": 0.08803502781546772, "learning_rate": 1.8425556939865696e-06, "loss": 0.4146, "step": 304 }, { "epoch": 2.741573033707865, "grad_norm": 0.08953466563820135, "learning_rate": 1.7185761081676222e-06, "loss": 0.4076, "step": 305 }, { "epoch": 2.750561797752809, "grad_norm": 0.08902482730609233, "learning_rate": 1.5988226387664151e-06, "loss": 0.4072, "step": 306 }, { "epoch": 2.7595505617977527, "grad_norm": 0.08914869211011264, "learning_rate": 1.4833085060880349e-06, "loss": 0.4116, "step": 307 }, { "epoch": 2.7685393258426965, "grad_norm": 0.0839377567882791, "learning_rate": 1.3720464624317108e-06, "loss": 0.4028, "step": 308 }, { "epoch": 2.7775280898876407, "grad_norm": 0.0860599340838286, "learning_rate": 1.2650487906830234e-06, "loss": 0.408, "step": 309 }, { "epoch": 2.7865168539325844, "grad_norm": 0.08469381729317074, "learning_rate": 1.1623273029579195e-06, "loss": 0.4124, "step": 310 }, { "epoch": 2.795505617977528, "grad_norm": 0.08378981417352441, "learning_rate": 1.063893339298674e-06, "loss": 0.4153, "step": 311 }, { "epoch": 2.804494382022472, "grad_norm": 0.08792748870981947, "learning_rate": 9.697577664220303e-07, "loss": 0.4128, "step": 312 }, { "epoch": 2.8134831460674157, "grad_norm": 0.08428338920338808, "learning_rate": 8.799309765195452e-07, "loss": 0.4047, "step": 313 }, { "epoch": 2.8224719101123594, "grad_norm": 0.08341919328157262, "learning_rate": 7.944228861103264e-07, "loss": 0.4035, "step": 314 }, { "epoch": 2.831460674157303, "grad_norm": 0.08371596387722409, "learning_rate": 7.132429349463011e-07, "loss": 0.4059, "step": 315 }, { "epoch": 2.8404494382022474, "grad_norm": 0.0792325512831226, "learning_rate": 6.364000849700791e-07, "loss": 0.4039, "step": 316 }, { "epoch": 2.849438202247191, "grad_norm": 0.08312852243276637, "learning_rate": 5.639028193256257e-07, "loss": 0.4067, "step": 317 }, { "epoch": 2.858426966292135, "grad_norm": 0.08209771251775913, "learning_rate": 4.957591414217344e-07, "loss": 0.4077, "step": 318 }, { "epoch": 2.8674157303370786, "grad_norm": 0.08673033811521172, "learning_rate": 4.3197657404848935e-07, "loss": 0.4163, "step": 319 }, { "epoch": 2.8764044943820224, "grad_norm": 0.07889124751001583, "learning_rate": 3.725621585467698e-07, "loss": 0.4074, "step": 320 }, { "epoch": 2.885393258426966, "grad_norm": 0.08351100735021297, "learning_rate": 3.1752245403092963e-07, "loss": 0.4047, "step": 321 }, { "epoch": 2.89438202247191, "grad_norm": 0.0841384102489667, "learning_rate": 2.6686353666468323e-07, "loss": 0.4015, "step": 322 }, { "epoch": 2.903370786516854, "grad_norm": 0.08172285090606024, "learning_rate": 2.2059099899033098e-07, "loss": 0.4007, "step": 323 }, { "epoch": 2.912359550561798, "grad_norm": 0.08042881118540746, "learning_rate": 1.7870994931135977e-07, "loss": 0.4088, "step": 324 }, { "epoch": 2.9213483146067416, "grad_norm": 0.08582343232361876, "learning_rate": 1.412250111285074e-07, "loss": 0.4081, "step": 325 }, { "epoch": 2.9303370786516854, "grad_norm": 0.0810315622092791, "learning_rate": 1.0814032262935315e-07, "loss": 0.4079, "step": 326 }, { "epoch": 2.939325842696629, "grad_norm": 0.07928028852132449, "learning_rate": 7.945953623146096e-08, "loss": 0.4076, "step": 327 }, { "epoch": 2.9483146067415733, "grad_norm": 0.08125928562829822, "learning_rate": 5.518581817918645e-08, "loss": 0.4127, "step": 328 }, { "epoch": 2.9573033707865166, "grad_norm": 0.07827386715600733, "learning_rate": 3.532184819412532e-08, "loss": 0.4074, "step": 329 }, { "epoch": 2.966292134831461, "grad_norm": 0.08205445804803274, "learning_rate": 1.9869819179292315e-08, "loss": 0.4099, "step": 330 }, { "epoch": 2.9752808988764046, "grad_norm": 0.08077759984225538, "learning_rate": 8.83143697702149e-09, "loss": 0.4092, "step": 331 }, { "epoch": 2.9842696629213483, "grad_norm": 0.08245700392469553, "learning_rate": 2.2079201806501916e-09, "loss": 0.4106, "step": 332 }, { "epoch": 2.993258426966292, "grad_norm": 0.08229489202929206, "learning_rate": 0.0, "loss": 0.4064, "step": 333 }, { "epoch": 2.993258426966292, "step": 333, "total_flos": 7.998180122580484e+18, "train_loss": 0.4748834180939305, "train_runtime": 32625.0257, "train_samples_per_second": 5.234, "train_steps_per_second": 0.01 } ], "logging_steps": 1.0, "max_steps": 333, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.998180122580484e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }