{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.971537001897533, "eval_steps": 500, "global_step": 655, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007590132827324478, "grad_norm": 5.824297945739171, "learning_rate": 1.2121212121212122e-06, "loss": 0.9218, "step": 1 }, { "epoch": 0.015180265654648957, "grad_norm": 5.892263786026164, "learning_rate": 2.4242424242424244e-06, "loss": 0.9293, "step": 2 }, { "epoch": 0.022770398481973434, "grad_norm": 5.8114634272581736, "learning_rate": 3.6363636363636366e-06, "loss": 0.9311, "step": 3 }, { "epoch": 0.030360531309297913, "grad_norm": 5.296649545877873, "learning_rate": 4.848484848484849e-06, "loss": 0.9125, "step": 4 }, { "epoch": 0.03795066413662239, "grad_norm": 3.7809427076070765, "learning_rate": 6.060606060606061e-06, "loss": 0.8611, "step": 5 }, { "epoch": 0.04554079696394687, "grad_norm": 2.1730858437477893, "learning_rate": 7.272727272727273e-06, "loss": 0.8592, "step": 6 }, { "epoch": 0.05313092979127135, "grad_norm": 4.234904575535682, "learning_rate": 8.484848484848486e-06, "loss": 0.8607, "step": 7 }, { "epoch": 0.06072106261859583, "grad_norm": 4.53949743914793, "learning_rate": 9.696969696969698e-06, "loss": 0.8652, "step": 8 }, { "epoch": 0.0683111954459203, "grad_norm": 4.029238499849355, "learning_rate": 1.0909090909090909e-05, "loss": 0.8177, "step": 9 }, { "epoch": 0.07590132827324478, "grad_norm": 3.9784894633891312, "learning_rate": 1.2121212121212122e-05, "loss": 0.8151, "step": 10 }, { "epoch": 0.08349146110056926, "grad_norm": 2.6969063855035493, "learning_rate": 1.3333333333333333e-05, "loss": 0.7815, "step": 11 }, { "epoch": 0.09108159392789374, "grad_norm": 1.6447992531334745, "learning_rate": 1.4545454545454546e-05, "loss": 0.7618, "step": 12 }, { "epoch": 0.09867172675521822, "grad_norm": 1.5209576084591174, "learning_rate": 1.575757575757576e-05, "loss": 0.7486, "step": 13 }, { "epoch": 0.1062618595825427, "grad_norm": 1.2740153307577036, "learning_rate": 1.6969696969696972e-05, "loss": 0.7247, "step": 14 }, { "epoch": 0.11385199240986717, "grad_norm": 0.9320600982322024, "learning_rate": 1.8181818181818182e-05, "loss": 0.711, "step": 15 }, { "epoch": 0.12144212523719165, "grad_norm": 1.0702991390933831, "learning_rate": 1.9393939393939395e-05, "loss": 0.7028, "step": 16 }, { "epoch": 0.12903225806451613, "grad_norm": 0.9459102023256077, "learning_rate": 2.0606060606060608e-05, "loss": 0.6918, "step": 17 }, { "epoch": 0.1366223908918406, "grad_norm": 0.8949299902760269, "learning_rate": 2.1818181818181818e-05, "loss": 0.6782, "step": 18 }, { "epoch": 0.1442125237191651, "grad_norm": 0.876771721092771, "learning_rate": 2.3030303030303034e-05, "loss": 0.6772, "step": 19 }, { "epoch": 0.15180265654648956, "grad_norm": 1.0086928620416316, "learning_rate": 2.4242424242424244e-05, "loss": 0.6644, "step": 20 }, { "epoch": 0.15939278937381404, "grad_norm": 0.9465491775161774, "learning_rate": 2.5454545454545457e-05, "loss": 0.6628, "step": 21 }, { "epoch": 0.16698292220113853, "grad_norm": 0.8899418440526895, "learning_rate": 2.6666666666666667e-05, "loss": 0.6579, "step": 22 }, { "epoch": 0.174573055028463, "grad_norm": 0.9194240562361043, "learning_rate": 2.7878787878787883e-05, "loss": 0.6488, "step": 23 }, { "epoch": 0.18216318785578747, "grad_norm": 1.140269189956398, "learning_rate": 2.9090909090909093e-05, "loss": 0.6545, "step": 24 }, { "epoch": 0.18975332068311196, "grad_norm": 1.2817416105473125, "learning_rate": 3.0303030303030306e-05, "loss": 0.6505, "step": 25 }, { "epoch": 0.19734345351043645, "grad_norm": 0.6815058078206016, "learning_rate": 3.151515151515152e-05, "loss": 0.6317, "step": 26 }, { "epoch": 0.2049335863377609, "grad_norm": 1.1874255778058744, "learning_rate": 3.272727272727273e-05, "loss": 0.6432, "step": 27 }, { "epoch": 0.2125237191650854, "grad_norm": 0.9363859174853021, "learning_rate": 3.3939393939393945e-05, "loss": 0.6312, "step": 28 }, { "epoch": 0.22011385199240988, "grad_norm": 0.8935811457744806, "learning_rate": 3.515151515151515e-05, "loss": 0.6285, "step": 29 }, { "epoch": 0.22770398481973433, "grad_norm": 1.2762386300886945, "learning_rate": 3.6363636363636364e-05, "loss": 0.6275, "step": 30 }, { "epoch": 0.23529411764705882, "grad_norm": 0.9210641452165423, "learning_rate": 3.7575757575757584e-05, "loss": 0.6264, "step": 31 }, { "epoch": 0.2428842504743833, "grad_norm": 1.2440524082474191, "learning_rate": 3.878787878787879e-05, "loss": 0.6144, "step": 32 }, { "epoch": 0.2504743833017078, "grad_norm": 1.3065985154977695, "learning_rate": 4e-05, "loss": 0.6141, "step": 33 }, { "epoch": 0.25806451612903225, "grad_norm": 0.8172081904989663, "learning_rate": 4.1212121212121216e-05, "loss": 0.6092, "step": 34 }, { "epoch": 0.2656546489563567, "grad_norm": 1.167931775101708, "learning_rate": 4.242424242424242e-05, "loss": 0.6134, "step": 35 }, { "epoch": 0.2732447817836812, "grad_norm": 1.681922162159049, "learning_rate": 4.3636363636363636e-05, "loss": 0.6164, "step": 36 }, { "epoch": 0.2808349146110057, "grad_norm": 1.1257832927645395, "learning_rate": 4.484848484848485e-05, "loss": 0.6011, "step": 37 }, { "epoch": 0.2884250474383302, "grad_norm": 1.790614581178023, "learning_rate": 4.606060606060607e-05, "loss": 0.6094, "step": 38 }, { "epoch": 0.29601518026565465, "grad_norm": 1.127806608018945, "learning_rate": 4.727272727272728e-05, "loss": 0.6011, "step": 39 }, { "epoch": 0.3036053130929791, "grad_norm": 2.093380998039163, "learning_rate": 4.848484848484849e-05, "loss": 0.615, "step": 40 }, { "epoch": 0.3111954459203036, "grad_norm": 0.9384468154974619, "learning_rate": 4.96969696969697e-05, "loss": 0.5974, "step": 41 }, { "epoch": 0.3187855787476281, "grad_norm": 2.4981151616674686, "learning_rate": 5.0909090909090914e-05, "loss": 0.6002, "step": 42 }, { "epoch": 0.32637571157495254, "grad_norm": 1.6534765579286679, "learning_rate": 5.212121212121213e-05, "loss": 0.6062, "step": 43 }, { "epoch": 0.33396584440227706, "grad_norm": 2.4370056170762453, "learning_rate": 5.333333333333333e-05, "loss": 0.6068, "step": 44 }, { "epoch": 0.3415559772296015, "grad_norm": 2.183022857065241, "learning_rate": 5.4545454545454546e-05, "loss": 0.5993, "step": 45 }, { "epoch": 0.349146110056926, "grad_norm": 1.8264041908559345, "learning_rate": 5.5757575757575766e-05, "loss": 0.5967, "step": 46 }, { "epoch": 0.3567362428842505, "grad_norm": 1.9185286131763832, "learning_rate": 5.696969696969698e-05, "loss": 0.6048, "step": 47 }, { "epoch": 0.36432637571157495, "grad_norm": 1.5433175224735158, "learning_rate": 5.8181818181818185e-05, "loss": 0.5991, "step": 48 }, { "epoch": 0.3719165085388994, "grad_norm": 1.6301636930901502, "learning_rate": 5.93939393939394e-05, "loss": 0.5973, "step": 49 }, { "epoch": 0.3795066413662239, "grad_norm": 1.6154604740395921, "learning_rate": 6.060606060606061e-05, "loss": 0.5839, "step": 50 }, { "epoch": 0.3870967741935484, "grad_norm": 1.5375798706049526, "learning_rate": 6.181818181818182e-05, "loss": 0.6014, "step": 51 }, { "epoch": 0.3946869070208729, "grad_norm": 1.8926585193561105, "learning_rate": 6.303030303030304e-05, "loss": 0.5903, "step": 52 }, { "epoch": 0.40227703984819735, "grad_norm": 0.9591201704735197, "learning_rate": 6.424242424242424e-05, "loss": 0.5787, "step": 53 }, { "epoch": 0.4098671726755218, "grad_norm": 2.3504740289658144, "learning_rate": 6.545454545454546e-05, "loss": 0.5836, "step": 54 }, { "epoch": 0.4174573055028463, "grad_norm": 1.9833219660676837, "learning_rate": 6.666666666666667e-05, "loss": 0.6021, "step": 55 }, { "epoch": 0.4250474383301708, "grad_norm": 1.979773818430796, "learning_rate": 6.787878787878789e-05, "loss": 0.5745, "step": 56 }, { "epoch": 0.43263757115749524, "grad_norm": 1.6918535634940701, "learning_rate": 6.90909090909091e-05, "loss": 0.5802, "step": 57 }, { "epoch": 0.44022770398481975, "grad_norm": 1.896304739161675, "learning_rate": 7.03030303030303e-05, "loss": 0.5967, "step": 58 }, { "epoch": 0.4478178368121442, "grad_norm": 1.7127150877307569, "learning_rate": 7.151515151515152e-05, "loss": 0.5873, "step": 59 }, { "epoch": 0.45540796963946867, "grad_norm": 1.7288522680268184, "learning_rate": 7.272727272727273e-05, "loss": 0.5822, "step": 60 }, { "epoch": 0.4629981024667932, "grad_norm": 2.4113594863743777, "learning_rate": 7.393939393939395e-05, "loss": 0.5892, "step": 61 }, { "epoch": 0.47058823529411764, "grad_norm": 1.5610600634117355, "learning_rate": 7.515151515151517e-05, "loss": 0.5888, "step": 62 }, { "epoch": 0.4781783681214421, "grad_norm": 1.554510024355238, "learning_rate": 7.636363636363637e-05, "loss": 0.5748, "step": 63 }, { "epoch": 0.4857685009487666, "grad_norm": 1.4238723235068915, "learning_rate": 7.757575757575758e-05, "loss": 0.5752, "step": 64 }, { "epoch": 0.49335863377609107, "grad_norm": 3.2737964188798, "learning_rate": 7.87878787878788e-05, "loss": 0.5991, "step": 65 }, { "epoch": 0.5009487666034156, "grad_norm": 1.3673718679696243, "learning_rate": 8e-05, "loss": 0.587, "step": 66 }, { "epoch": 0.50853889943074, "grad_norm": 3.10214817390346, "learning_rate": 7.999943101853146e-05, "loss": 0.5968, "step": 67 }, { "epoch": 0.5161290322580645, "grad_norm": 2.4426856945858635, "learning_rate": 7.999772409031277e-05, "loss": 0.6063, "step": 68 }, { "epoch": 0.523719165085389, "grad_norm": 2.384951983454804, "learning_rate": 7.999487926390452e-05, "loss": 0.5968, "step": 69 }, { "epoch": 0.5313092979127134, "grad_norm": 2.470269943289222, "learning_rate": 7.999089662023934e-05, "loss": 0.5976, "step": 70 }, { "epoch": 0.538899430740038, "grad_norm": 2.0615837527679926, "learning_rate": 7.99857762726198e-05, "loss": 0.5892, "step": 71 }, { "epoch": 0.5464895635673624, "grad_norm": 1.4595469442640645, "learning_rate": 7.997951836671498e-05, "loss": 0.5763, "step": 72 }, { "epoch": 0.5540796963946869, "grad_norm": 1.6686147644039993, "learning_rate": 7.997212308055656e-05, "loss": 0.5885, "step": 73 }, { "epoch": 0.5616698292220114, "grad_norm": 1.1588798823385231, "learning_rate": 7.996359062453354e-05, "loss": 0.5816, "step": 74 }, { "epoch": 0.5692599620493358, "grad_norm": 2.139844499195118, "learning_rate": 7.995392124138642e-05, "loss": 0.5815, "step": 75 }, { "epoch": 0.5768500948766604, "grad_norm": 1.6540433397238854, "learning_rate": 7.994311520620017e-05, "loss": 0.5782, "step": 76 }, { "epoch": 0.5844402277039848, "grad_norm": 1.04883299144272, "learning_rate": 7.993117282639648e-05, "loss": 0.5782, "step": 77 }, { "epoch": 0.5920303605313093, "grad_norm": 2.724444333560736, "learning_rate": 7.9918094441725e-05, "loss": 0.5861, "step": 78 }, { "epoch": 0.5996204933586338, "grad_norm": 1.8249890665939426, "learning_rate": 7.990388042425367e-05, "loss": 0.58, "step": 79 }, { "epoch": 0.6072106261859582, "grad_norm": 2.602399399727078, "learning_rate": 7.988853117835806e-05, "loss": 0.5814, "step": 80 }, { "epoch": 0.6148007590132827, "grad_norm": 1.5944678851416663, "learning_rate": 7.987204714071006e-05, "loss": 0.5826, "step": 81 }, { "epoch": 0.6223908918406073, "grad_norm": 2.2610913780974546, "learning_rate": 7.985442878026524e-05, "loss": 0.5754, "step": 82 }, { "epoch": 0.6299810246679317, "grad_norm": 1.7537341638428399, "learning_rate": 7.983567659824962e-05, "loss": 0.5845, "step": 83 }, { "epoch": 0.6375711574952562, "grad_norm": 1.8121108815331453, "learning_rate": 7.981579112814541e-05, "loss": 0.585, "step": 84 }, { "epoch": 0.6451612903225806, "grad_norm": 1.467756636378608, "learning_rate": 7.97947729356758e-05, "loss": 0.5777, "step": 85 }, { "epoch": 0.6527514231499051, "grad_norm": 1.5365204832241453, "learning_rate": 7.977262261878892e-05, "loss": 0.5763, "step": 86 }, { "epoch": 0.6603415559772297, "grad_norm": 1.4259830475580915, "learning_rate": 7.974934080764075e-05, "loss": 0.5662, "step": 87 }, { "epoch": 0.6679316888045541, "grad_norm": 1.290860497369316, "learning_rate": 7.972492816457723e-05, "loss": 0.5627, "step": 88 }, { "epoch": 0.6755218216318786, "grad_norm": 1.1578178204522984, "learning_rate": 7.969938538411543e-05, "loss": 0.5611, "step": 89 }, { "epoch": 0.683111954459203, "grad_norm": 1.8928883460003019, "learning_rate": 7.967271319292382e-05, "loss": 0.5715, "step": 90 }, { "epoch": 0.6907020872865275, "grad_norm": 1.5577040910573858, "learning_rate": 7.96449123498015e-05, "loss": 0.5712, "step": 91 }, { "epoch": 0.698292220113852, "grad_norm": 1.064793253865779, "learning_rate": 7.96159836456567e-05, "loss": 0.5675, "step": 92 }, { "epoch": 0.7058823529411765, "grad_norm": 2.0170128081260406, "learning_rate": 7.958592790348425e-05, "loss": 0.5755, "step": 93 }, { "epoch": 0.713472485768501, "grad_norm": 1.3379111611740009, "learning_rate": 7.955474597834217e-05, "loss": 0.5604, "step": 94 }, { "epoch": 0.7210626185958254, "grad_norm": 1.4656800007307322, "learning_rate": 7.952243875732735e-05, "loss": 0.5655, "step": 95 }, { "epoch": 0.7286527514231499, "grad_norm": 1.2799455534799504, "learning_rate": 7.948900715955025e-05, "loss": 0.5629, "step": 96 }, { "epoch": 0.7362428842504743, "grad_norm": 1.6331551992017197, "learning_rate": 7.94544521361089e-05, "loss": 0.5589, "step": 97 }, { "epoch": 0.7438330170777988, "grad_norm": 1.8686747850955692, "learning_rate": 7.941877467006168e-05, "loss": 0.5644, "step": 98 }, { "epoch": 0.7514231499051234, "grad_norm": 1.1116521915214885, "learning_rate": 7.938197577639942e-05, "loss": 0.5559, "step": 99 }, { "epoch": 0.7590132827324478, "grad_norm": 1.5062245938638401, "learning_rate": 7.934405650201658e-05, "loss": 0.5723, "step": 100 }, { "epoch": 0.7666034155597723, "grad_norm": 1.1108744133424633, "learning_rate": 7.930501792568138e-05, "loss": 0.5545, "step": 101 }, { "epoch": 0.7741935483870968, "grad_norm": 1.5427714103721983, "learning_rate": 7.926486115800511e-05, "loss": 0.556, "step": 102 }, { "epoch": 0.7817836812144212, "grad_norm": 1.764775365031586, "learning_rate": 7.922358734141064e-05, "loss": 0.5596, "step": 103 }, { "epoch": 0.7893738140417458, "grad_norm": 1.2296630078252206, "learning_rate": 7.918119765009979e-05, "loss": 0.5598, "step": 104 }, { "epoch": 0.7969639468690702, "grad_norm": 1.2833682166627998, "learning_rate": 7.913769329002e-05, "loss": 0.5489, "step": 105 }, { "epoch": 0.8045540796963947, "grad_norm": 1.1872477219429831, "learning_rate": 7.909307549883002e-05, "loss": 0.5646, "step": 106 }, { "epoch": 0.8121442125237192, "grad_norm": 1.820761375614486, "learning_rate": 7.904734554586464e-05, "loss": 0.5556, "step": 107 }, { "epoch": 0.8197343453510436, "grad_norm": 1.1423898687342118, "learning_rate": 7.900050473209868e-05, "loss": 0.5483, "step": 108 }, { "epoch": 0.8273244781783681, "grad_norm": 1.476252579811037, "learning_rate": 7.895255439010987e-05, "loss": 0.5479, "step": 109 }, { "epoch": 0.8349146110056926, "grad_norm": 1.3278512325760372, "learning_rate": 7.890349588404102e-05, "loss": 0.5499, "step": 110 }, { "epoch": 0.8425047438330171, "grad_norm": 0.8671841713875902, "learning_rate": 7.885333060956117e-05, "loss": 0.5571, "step": 111 }, { "epoch": 0.8500948766603416, "grad_norm": 1.0738508848999515, "learning_rate": 7.88020599938259e-05, "loss": 0.5449, "step": 112 }, { "epoch": 0.857685009487666, "grad_norm": 1.7715748163298473, "learning_rate": 7.87496854954367e-05, "loss": 0.5491, "step": 113 }, { "epoch": 0.8652751423149905, "grad_norm": 1.0525784243440264, "learning_rate": 7.869620860439956e-05, "loss": 0.543, "step": 114 }, { "epoch": 0.872865275142315, "grad_norm": 2.0621859992760427, "learning_rate": 7.864163084208245e-05, "loss": 0.5622, "step": 115 }, { "epoch": 0.8804554079696395, "grad_norm": 1.363047653212627, "learning_rate": 7.858595376117214e-05, "loss": 0.5515, "step": 116 }, { "epoch": 0.888045540796964, "grad_norm": 1.7242002751506365, "learning_rate": 7.852917894563e-05, "loss": 0.5599, "step": 117 }, { "epoch": 0.8956356736242884, "grad_norm": 1.4061990696892013, "learning_rate": 7.847130801064694e-05, "loss": 0.5605, "step": 118 }, { "epoch": 0.9032258064516129, "grad_norm": 1.7767323380908933, "learning_rate": 7.84123426025974e-05, "loss": 0.5494, "step": 119 }, { "epoch": 0.9108159392789373, "grad_norm": 1.1684328222434068, "learning_rate": 7.835228439899264e-05, "loss": 0.546, "step": 120 }, { "epoch": 0.9184060721062619, "grad_norm": 1.9834381552810127, "learning_rate": 7.829113510843288e-05, "loss": 0.5551, "step": 121 }, { "epoch": 0.9259962049335864, "grad_norm": 1.4942107378630478, "learning_rate": 7.82288964705588e-05, "loss": 0.5454, "step": 122 }, { "epoch": 0.9335863377609108, "grad_norm": 1.631303090634789, "learning_rate": 7.816557025600196e-05, "loss": 0.5403, "step": 123 }, { "epoch": 0.9411764705882353, "grad_norm": 1.2779932620673164, "learning_rate": 7.81011582663345e-05, "loss": 0.5551, "step": 124 }, { "epoch": 0.9487666034155597, "grad_norm": 0.826316123440516, "learning_rate": 7.803566233401784e-05, "loss": 0.5468, "step": 125 }, { "epoch": 0.9563567362428842, "grad_norm": 1.5355038345605292, "learning_rate": 7.796908432235056e-05, "loss": 0.5588, "step": 126 }, { "epoch": 0.9639468690702088, "grad_norm": 1.6053485472330935, "learning_rate": 7.79014261254154e-05, "loss": 0.5457, "step": 127 }, { "epoch": 0.9715370018975332, "grad_norm": 0.8709812572017568, "learning_rate": 7.783268966802539e-05, "loss": 0.5482, "step": 128 }, { "epoch": 0.9791271347248577, "grad_norm": 1.0328203561237506, "learning_rate": 7.776287690566906e-05, "loss": 0.5516, "step": 129 }, { "epoch": 0.9867172675521821, "grad_norm": 1.421726756731164, "learning_rate": 7.769198982445478e-05, "loss": 0.5644, "step": 130 }, { "epoch": 0.9943074003795066, "grad_norm": 0.9699818427155015, "learning_rate": 7.762003044105435e-05, "loss": 0.5333, "step": 131 }, { "epoch": 1.0018975332068312, "grad_norm": 2.203324310322431, "learning_rate": 7.754700080264554e-05, "loss": 0.6801, "step": 132 }, { "epoch": 1.0094876660341556, "grad_norm": 1.2850623970507653, "learning_rate": 7.747290298685392e-05, "loss": 0.5231, "step": 133 }, { "epoch": 1.01707779886148, "grad_norm": 1.0733692629279126, "learning_rate": 7.739773910169366e-05, "loss": 0.526, "step": 134 }, { "epoch": 1.0246679316888045, "grad_norm": 1.3517159638201317, "learning_rate": 7.732151128550767e-05, "loss": 0.5374, "step": 135 }, { "epoch": 1.032258064516129, "grad_norm": 0.9043349347274219, "learning_rate": 7.724422170690668e-05, "loss": 0.5316, "step": 136 }, { "epoch": 1.0398481973434535, "grad_norm": 1.2575116166876772, "learning_rate": 7.716587256470759e-05, "loss": 0.5264, "step": 137 }, { "epoch": 1.047438330170778, "grad_norm": 1.151643956702767, "learning_rate": 7.708646608787091e-05, "loss": 0.5236, "step": 138 }, { "epoch": 1.0550284629981024, "grad_norm": 1.1533411140892482, "learning_rate": 7.700600453543731e-05, "loss": 0.5327, "step": 139 }, { "epoch": 1.0626185958254268, "grad_norm": 1.5703445128955635, "learning_rate": 7.692449019646341e-05, "loss": 0.5189, "step": 140 }, { "epoch": 1.0702087286527515, "grad_norm": 1.503708861643817, "learning_rate": 7.684192538995664e-05, "loss": 0.5208, "step": 141 }, { "epoch": 1.077798861480076, "grad_norm": 0.6891325431467323, "learning_rate": 7.675831246480923e-05, "loss": 0.5176, "step": 142 }, { "epoch": 1.0853889943074004, "grad_norm": 1.862959746082954, "learning_rate": 7.667365379973142e-05, "loss": 0.519, "step": 143 }, { "epoch": 1.092979127134725, "grad_norm": 0.9255777898780981, "learning_rate": 7.658795180318381e-05, "loss": 0.5306, "step": 144 }, { "epoch": 1.1005692599620494, "grad_norm": 1.2860696781263434, "learning_rate": 7.650120891330878e-05, "loss": 0.5231, "step": 145 }, { "epoch": 1.1081593927893738, "grad_norm": 0.9866085500546973, "learning_rate": 7.641342759786116e-05, "loss": 0.5134, "step": 146 }, { "epoch": 1.1157495256166983, "grad_norm": 1.6012070200344108, "learning_rate": 7.632461035413805e-05, "loss": 0.5225, "step": 147 }, { "epoch": 1.1233396584440227, "grad_norm": 1.0880689445644633, "learning_rate": 7.623475970890775e-05, "loss": 0.52, "step": 148 }, { "epoch": 1.1309297912713472, "grad_norm": 1.0388918530802034, "learning_rate": 7.614387821833786e-05, "loss": 0.5234, "step": 149 }, { "epoch": 1.1385199240986716, "grad_norm": 1.3834068969901858, "learning_rate": 7.605196846792256e-05, "loss": 0.52, "step": 150 }, { "epoch": 1.146110056925996, "grad_norm": 1.0808645625405662, "learning_rate": 7.59590330724091e-05, "loss": 0.5199, "step": 151 }, { "epoch": 1.1537001897533208, "grad_norm": 0.8748353485698048, "learning_rate": 7.586507467572339e-05, "loss": 0.5054, "step": 152 }, { "epoch": 1.1612903225806452, "grad_norm": 0.9809721493446659, "learning_rate": 7.577009595089472e-05, "loss": 0.5156, "step": 153 }, { "epoch": 1.1688804554079697, "grad_norm": 1.385065545391808, "learning_rate": 7.567409959997984e-05, "loss": 0.5125, "step": 154 }, { "epoch": 1.1764705882352942, "grad_norm": 1.1835810733031538, "learning_rate": 7.557708835398595e-05, "loss": 0.5089, "step": 155 }, { "epoch": 1.1840607210626186, "grad_norm": 1.0550638017889524, "learning_rate": 7.547906497279315e-05, "loss": 0.5085, "step": 156 }, { "epoch": 1.191650853889943, "grad_norm": 1.0668629873488273, "learning_rate": 7.538003224507579e-05, "loss": 0.5151, "step": 157 }, { "epoch": 1.1992409867172675, "grad_norm": 1.2773079106743754, "learning_rate": 7.52799929882232e-05, "loss": 0.5217, "step": 158 }, { "epoch": 1.206831119544592, "grad_norm": 1.0653233150213854, "learning_rate": 7.517895004825956e-05, "loss": 0.5142, "step": 159 }, { "epoch": 1.2144212523719164, "grad_norm": 1.1811879803660237, "learning_rate": 7.507690629976291e-05, "loss": 0.516, "step": 160 }, { "epoch": 1.222011385199241, "grad_norm": 0.9358140704136899, "learning_rate": 7.497386464578329e-05, "loss": 0.5116, "step": 161 }, { "epoch": 1.2296015180265654, "grad_norm": 1.236267972600389, "learning_rate": 7.486982801776032e-05, "loss": 0.5176, "step": 162 }, { "epoch": 1.23719165085389, "grad_norm": 1.1810121004464773, "learning_rate": 7.476479937543967e-05, "loss": 0.5208, "step": 163 }, { "epoch": 1.2447817836812145, "grad_norm": 1.0715306401128548, "learning_rate": 7.465878170678887e-05, "loss": 0.5149, "step": 164 }, { "epoch": 1.252371916508539, "grad_norm": 1.4554615426026292, "learning_rate": 7.455177802791237e-05, "loss": 0.5176, "step": 165 }, { "epoch": 1.2599620493358634, "grad_norm": 0.8300456250776146, "learning_rate": 7.444379138296572e-05, "loss": 0.5111, "step": 166 }, { "epoch": 1.2675521821631879, "grad_norm": 0.8301260998594161, "learning_rate": 7.433482484406887e-05, "loss": 0.5149, "step": 167 }, { "epoch": 1.2751423149905123, "grad_norm": 1.036861982897111, "learning_rate": 7.42248815112189e-05, "loss": 0.5074, "step": 168 }, { "epoch": 1.2827324478178368, "grad_norm": 1.1061999056879284, "learning_rate": 7.411396451220177e-05, "loss": 0.5014, "step": 169 }, { "epoch": 1.2903225806451613, "grad_norm": 1.3047827647572592, "learning_rate": 7.400207700250333e-05, "loss": 0.5144, "step": 170 }, { "epoch": 1.2979127134724857, "grad_norm": 0.7526970536905354, "learning_rate": 7.388922216521953e-05, "loss": 0.5132, "step": 171 }, { "epoch": 1.3055028462998102, "grad_norm": 0.7452267427677111, "learning_rate": 7.377540321096595e-05, "loss": 0.5022, "step": 172 }, { "epoch": 1.3130929791271346, "grad_norm": 1.0513789114160723, "learning_rate": 7.366062337778637e-05, "loss": 0.5039, "step": 173 }, { "epoch": 1.3206831119544593, "grad_norm": 1.3299701167289224, "learning_rate": 7.354488593106068e-05, "loss": 0.5039, "step": 174 }, { "epoch": 1.3282732447817835, "grad_norm": 0.9881183562854784, "learning_rate": 7.342819416341202e-05, "loss": 0.5161, "step": 175 }, { "epoch": 1.3358633776091082, "grad_norm": 1.3838355156124555, "learning_rate": 7.331055139461305e-05, "loss": 0.5128, "step": 176 }, { "epoch": 1.3434535104364327, "grad_norm": 0.706807050794008, "learning_rate": 7.319196097149153e-05, "loss": 0.4995, "step": 177 }, { "epoch": 1.3510436432637571, "grad_norm": 1.2072275318255294, "learning_rate": 7.307242626783514e-05, "loss": 0.5117, "step": 178 }, { "epoch": 1.3586337760910816, "grad_norm": 0.8736304969731823, "learning_rate": 7.295195068429539e-05, "loss": 0.5093, "step": 179 }, { "epoch": 1.366223908918406, "grad_norm": 1.118370322707032, "learning_rate": 7.283053764829106e-05, "loss": 0.513, "step": 180 }, { "epoch": 1.3738140417457305, "grad_norm": 1.2165754217336513, "learning_rate": 7.270819061391049e-05, "loss": 0.5061, "step": 181 }, { "epoch": 1.381404174573055, "grad_norm": 1.0662810244952639, "learning_rate": 7.258491306181346e-05, "loss": 0.5074, "step": 182 }, { "epoch": 1.3889943074003794, "grad_norm": 1.550093405647991, "learning_rate": 7.24607084991321e-05, "loss": 0.5169, "step": 183 }, { "epoch": 1.396584440227704, "grad_norm": 0.7232302048062569, "learning_rate": 7.233558045937113e-05, "loss": 0.5187, "step": 184 }, { "epoch": 1.4041745730550286, "grad_norm": 1.3301692157689138, "learning_rate": 7.220953250230733e-05, "loss": 0.5101, "step": 185 }, { "epoch": 1.4117647058823528, "grad_norm": 0.9469277615633731, "learning_rate": 7.208256821388831e-05, "loss": 0.5115, "step": 186 }, { "epoch": 1.4193548387096775, "grad_norm": 1.461657389888908, "learning_rate": 7.195469120613041e-05, "loss": 0.518, "step": 187 }, { "epoch": 1.426944971537002, "grad_norm": 0.7145042956694666, "learning_rate": 7.182590511701604e-05, "loss": 0.5002, "step": 188 }, { "epoch": 1.4345351043643264, "grad_norm": 0.9602590784255072, "learning_rate": 7.169621361039009e-05, "loss": 0.4932, "step": 189 }, { "epoch": 1.4421252371916509, "grad_norm": 0.9348247562699835, "learning_rate": 7.156562037585576e-05, "loss": 0.5045, "step": 190 }, { "epoch": 1.4497153700189753, "grad_norm": 1.5691729872812523, "learning_rate": 7.143412912866954e-05, "loss": 0.5146, "step": 191 }, { "epoch": 1.4573055028462998, "grad_norm": 0.7191513604989822, "learning_rate": 7.130174360963562e-05, "loss": 0.5031, "step": 192 }, { "epoch": 1.4648956356736242, "grad_norm": 1.6999162113253339, "learning_rate": 7.116846758499933e-05, "loss": 0.5103, "step": 193 }, { "epoch": 1.4724857685009487, "grad_norm": 1.0965769424195349, "learning_rate": 7.103430484634009e-05, "loss": 0.5101, "step": 194 }, { "epoch": 1.4800759013282732, "grad_norm": 1.042633463565035, "learning_rate": 7.089925921046348e-05, "loss": 0.5133, "step": 195 }, { "epoch": 1.4876660341555978, "grad_norm": 1.5277163845081705, "learning_rate": 7.076333451929275e-05, "loss": 0.5166, "step": 196 }, { "epoch": 1.495256166982922, "grad_norm": 0.7588665368653583, "learning_rate": 7.062653463975938e-05, "loss": 0.5028, "step": 197 }, { "epoch": 1.5028462998102468, "grad_norm": 1.4802097799655463, "learning_rate": 7.048886346369321e-05, "loss": 0.5173, "step": 198 }, { "epoch": 1.510436432637571, "grad_norm": 0.8989137638919413, "learning_rate": 7.035032490771165e-05, "loss": 0.5058, "step": 199 }, { "epoch": 1.5180265654648957, "grad_norm": 1.3727603969798114, "learning_rate": 7.021092291310821e-05, "loss": 0.5196, "step": 200 }, { "epoch": 1.5256166982922201, "grad_norm": 0.95363755185113, "learning_rate": 7.007066144574052e-05, "loss": 0.5205, "step": 201 }, { "epoch": 1.5332068311195446, "grad_norm": 1.1663040985006814, "learning_rate": 6.992954449591731e-05, "loss": 0.5093, "step": 202 }, { "epoch": 1.540796963946869, "grad_norm": 0.7636048619329266, "learning_rate": 6.978757607828509e-05, "loss": 0.506, "step": 203 }, { "epoch": 1.5483870967741935, "grad_norm": 1.1069490833534057, "learning_rate": 6.964476023171378e-05, "loss": 0.516, "step": 204 }, { "epoch": 1.5559772296015182, "grad_norm": 0.6735693040775705, "learning_rate": 6.95011010191819e-05, "loss": 0.507, "step": 205 }, { "epoch": 1.5635673624288424, "grad_norm": 0.7757347897129492, "learning_rate": 6.935660252766092e-05, "loss": 0.5181, "step": 206 }, { "epoch": 1.571157495256167, "grad_norm": 0.7414965427945387, "learning_rate": 6.921126886799903e-05, "loss": 0.5074, "step": 207 }, { "epoch": 1.5787476280834913, "grad_norm": 0.8131364204912126, "learning_rate": 6.906510417480422e-05, "loss": 0.5153, "step": 208 }, { "epoch": 1.586337760910816, "grad_norm": 0.8512550944337758, "learning_rate": 6.891811260632653e-05, "loss": 0.5054, "step": 209 }, { "epoch": 1.5939278937381403, "grad_norm": 0.7855183043381698, "learning_rate": 6.877029834433992e-05, "loss": 0.5047, "step": 210 }, { "epoch": 1.601518026565465, "grad_norm": 0.8992512717445637, "learning_rate": 6.862166559402318e-05, "loss": 0.5025, "step": 211 }, { "epoch": 1.6091081593927894, "grad_norm": 0.9210792646776457, "learning_rate": 6.847221858384032e-05, "loss": 0.4974, "step": 212 }, { "epoch": 1.6166982922201139, "grad_norm": 0.9424266330757026, "learning_rate": 6.832196156542033e-05, "loss": 0.5062, "step": 213 }, { "epoch": 1.6242884250474383, "grad_norm": 1.0966101994750281, "learning_rate": 6.817089881343613e-05, "loss": 0.5054, "step": 214 }, { "epoch": 1.6318785578747628, "grad_norm": 1.009163727768516, "learning_rate": 6.801903462548308e-05, "loss": 0.5034, "step": 215 }, { "epoch": 1.6394686907020875, "grad_norm": 0.9725332248811417, "learning_rate": 6.786637332195659e-05, "loss": 0.5115, "step": 216 }, { "epoch": 1.6470588235294117, "grad_norm": 1.0170207658600694, "learning_rate": 6.771291924592929e-05, "loss": 0.5066, "step": 217 }, { "epoch": 1.6546489563567364, "grad_norm": 0.9422861500618195, "learning_rate": 6.755867676302747e-05, "loss": 0.504, "step": 218 }, { "epoch": 1.6622390891840606, "grad_norm": 0.9158879164554034, "learning_rate": 6.740365026130684e-05, "loss": 0.5032, "step": 219 }, { "epoch": 1.6698292220113853, "grad_norm": 0.7780361297463692, "learning_rate": 6.724784415112774e-05, "loss": 0.4888, "step": 220 }, { "epoch": 1.6774193548387095, "grad_norm": 0.5692137929082299, "learning_rate": 6.709126286502965e-05, "loss": 0.5022, "step": 221 }, { "epoch": 1.6850094876660342, "grad_norm": 0.5004905918093622, "learning_rate": 6.693391085760506e-05, "loss": 0.4995, "step": 222 }, { "epoch": 1.6925996204933587, "grad_norm": 0.5848868016251021, "learning_rate": 6.677579260537277e-05, "loss": 0.5055, "step": 223 }, { "epoch": 1.7001897533206831, "grad_norm": 0.734294502408837, "learning_rate": 6.661691260665057e-05, "loss": 0.5008, "step": 224 }, { "epoch": 1.7077798861480076, "grad_norm": 0.9781085041990851, "learning_rate": 6.64572753814272e-05, "loss": 0.5082, "step": 225 }, { "epoch": 1.715370018975332, "grad_norm": 1.1839289754443743, "learning_rate": 6.629688547123381e-05, "loss": 0.4966, "step": 226 }, { "epoch": 1.7229601518026565, "grad_norm": 0.6203375151514526, "learning_rate": 6.613574743901472e-05, "loss": 0.4976, "step": 227 }, { "epoch": 1.730550284629981, "grad_norm": 0.37377037948651215, "learning_rate": 6.597386586899766e-05, "loss": 0.4907, "step": 228 }, { "epoch": 1.7381404174573056, "grad_norm": 0.5842003288831636, "learning_rate": 6.58112453665633e-05, "loss": 0.5, "step": 229 }, { "epoch": 1.7457305502846299, "grad_norm": 1.1216009200042196, "learning_rate": 6.564789055811422e-05, "loss": 0.5118, "step": 230 }, { "epoch": 1.7533206831119545, "grad_norm": 1.1503531175618553, "learning_rate": 6.54838060909434e-05, "loss": 0.4856, "step": 231 }, { "epoch": 1.7609108159392788, "grad_norm": 0.5953762660752571, "learning_rate": 6.531899663310187e-05, "loss": 0.4933, "step": 232 }, { "epoch": 1.7685009487666035, "grad_norm": 0.4946507234843489, "learning_rate": 6.515346687326602e-05, "loss": 0.488, "step": 233 }, { "epoch": 1.776091081593928, "grad_norm": 0.6911888286239702, "learning_rate": 6.498722152060411e-05, "loss": 0.5024, "step": 234 }, { "epoch": 1.7836812144212524, "grad_norm": 0.9524817833599729, "learning_rate": 6.482026530464244e-05, "loss": 0.497, "step": 235 }, { "epoch": 1.7912713472485768, "grad_norm": 1.056867827538452, "learning_rate": 6.465260297513059e-05, "loss": 0.5001, "step": 236 }, { "epoch": 1.7988614800759013, "grad_norm": 0.9341896474591933, "learning_rate": 6.448423930190653e-05, "loss": 0.5056, "step": 237 }, { "epoch": 1.8064516129032258, "grad_norm": 0.7998775078188581, "learning_rate": 6.431517907476073e-05, "loss": 0.4965, "step": 238 }, { "epoch": 1.8140417457305502, "grad_norm": 0.6024227793682277, "learning_rate": 6.414542710330004e-05, "loss": 0.4918, "step": 239 }, { "epoch": 1.821631878557875, "grad_norm": 0.5054296948703985, "learning_rate": 6.397498821681073e-05, "loss": 0.4987, "step": 240 }, { "epoch": 1.8292220113851991, "grad_norm": 0.4915898095283207, "learning_rate": 6.380386726412122e-05, "loss": 0.489, "step": 241 }, { "epoch": 1.8368121442125238, "grad_norm": 0.5191126165622191, "learning_rate": 6.363206911346405e-05, "loss": 0.5062, "step": 242 }, { "epoch": 1.844402277039848, "grad_norm": 0.591888201694542, "learning_rate": 6.345959865233742e-05, "loss": 0.4928, "step": 243 }, { "epoch": 1.8519924098671727, "grad_norm": 0.6103884601516754, "learning_rate": 6.328646078736614e-05, "loss": 0.4983, "step": 244 }, { "epoch": 1.8595825426944972, "grad_norm": 0.5676870354041681, "learning_rate": 6.311266044416205e-05, "loss": 0.493, "step": 245 }, { "epoch": 1.8671726755218216, "grad_norm": 0.5025577878236349, "learning_rate": 6.293820256718388e-05, "loss": 0.4936, "step": 246 }, { "epoch": 1.874762808349146, "grad_norm": 0.5343665402941907, "learning_rate": 6.276309211959657e-05, "loss": 0.4976, "step": 247 }, { "epoch": 1.8823529411764706, "grad_norm": 0.684168766812062, "learning_rate": 6.25873340831301e-05, "loss": 0.4986, "step": 248 }, { "epoch": 1.889943074003795, "grad_norm": 0.971664414920718, "learning_rate": 6.241093345793777e-05, "loss": 0.4923, "step": 249 }, { "epoch": 1.8975332068311195, "grad_norm": 1.3291099108661037, "learning_rate": 6.22338952624539e-05, "loss": 0.5085, "step": 250 }, { "epoch": 1.9051233396584442, "grad_norm": 0.5887944838607679, "learning_rate": 6.205622453325113e-05, "loss": 0.4901, "step": 251 }, { "epoch": 1.9127134724857684, "grad_norm": 0.5766670451808246, "learning_rate": 6.18779263248971e-05, "loss": 0.4923, "step": 252 }, { "epoch": 1.920303605313093, "grad_norm": 1.1307550162308162, "learning_rate": 6.169900570981057e-05, "loss": 0.4991, "step": 253 }, { "epoch": 1.9278937381404173, "grad_norm": 1.138869550845278, "learning_rate": 6.151946777811729e-05, "loss": 0.4998, "step": 254 }, { "epoch": 1.935483870967742, "grad_norm": 0.6269758422232977, "learning_rate": 6.133931763750509e-05, "loss": 0.4933, "step": 255 }, { "epoch": 1.9430740037950665, "grad_norm": 0.7710149723845751, "learning_rate": 6.11585604130785e-05, "loss": 0.4944, "step": 256 }, { "epoch": 1.950664136622391, "grad_norm": 0.9641556034924468, "learning_rate": 6.097720124721311e-05, "loss": 0.4915, "step": 257 }, { "epoch": 1.9582542694497154, "grad_norm": 0.8101487252514183, "learning_rate": 6.079524529940911e-05, "loss": 0.4788, "step": 258 }, { "epoch": 1.9658444022770398, "grad_norm": 0.6731500817613972, "learning_rate": 6.0612697746144664e-05, "loss": 0.4887, "step": 259 }, { "epoch": 1.9734345351043643, "grad_norm": 0.66266631987093, "learning_rate": 6.0429563780728476e-05, "loss": 0.4888, "step": 260 }, { "epoch": 1.9810246679316887, "grad_norm": 0.5402551506844365, "learning_rate": 6.02458486131522e-05, "loss": 0.4831, "step": 261 }, { "epoch": 1.9886148007590134, "grad_norm": 0.6879216139275022, "learning_rate": 6.006155746994212e-05, "loss": 0.491, "step": 262 }, { "epoch": 1.9962049335863377, "grad_norm": 0.9539606050998473, "learning_rate": 5.98766955940105e-05, "loss": 0.5341, "step": 263 }, { "epoch": 2.0037950664136623, "grad_norm": 1.2929340536370602, "learning_rate": 5.969126824450643e-05, "loss": 0.5524, "step": 264 }, { "epoch": 2.0113851992409866, "grad_norm": 0.6792026166979978, "learning_rate": 5.9505280696666174e-05, "loss": 0.4671, "step": 265 }, { "epoch": 2.0189753320683113, "grad_norm": 0.6570978500488273, "learning_rate": 5.931873824166316e-05, "loss": 0.458, "step": 266 }, { "epoch": 2.0265654648956355, "grad_norm": 0.8625246084442377, "learning_rate": 5.913164618645738e-05, "loss": 0.4646, "step": 267 }, { "epoch": 2.03415559772296, "grad_norm": 0.8463370840972069, "learning_rate": 5.894400985364444e-05, "loss": 0.4503, "step": 268 }, { "epoch": 2.041745730550285, "grad_norm": 0.5846678229118594, "learning_rate": 5.875583458130417e-05, "loss": 0.452, "step": 269 }, { "epoch": 2.049335863377609, "grad_norm": 0.48959366327046705, "learning_rate": 5.856712572284868e-05, "loss": 0.4608, "step": 270 }, { "epoch": 2.0569259962049338, "grad_norm": 0.5808495151777524, "learning_rate": 5.8377888646870154e-05, "loss": 0.4572, "step": 271 }, { "epoch": 2.064516129032258, "grad_norm": 0.5154615059210003, "learning_rate": 5.818812873698809e-05, "loss": 0.4555, "step": 272 }, { "epoch": 2.0721062618595827, "grad_norm": 0.5247505353737575, "learning_rate": 5.799785139169606e-05, "loss": 0.4493, "step": 273 }, { "epoch": 2.079696394686907, "grad_norm": 0.6700114330504865, "learning_rate": 5.7807062024208256e-05, "loss": 0.4593, "step": 274 }, { "epoch": 2.0872865275142316, "grad_norm": 0.6564087028803952, "learning_rate": 5.761576606230538e-05, "loss": 0.4543, "step": 275 }, { "epoch": 2.094876660341556, "grad_norm": 0.6170822532663903, "learning_rate": 5.742396894818031e-05, "loss": 0.4585, "step": 276 }, { "epoch": 2.1024667931688805, "grad_norm": 0.5359408843960233, "learning_rate": 5.723167613828324e-05, "loss": 0.4571, "step": 277 }, { "epoch": 2.1100569259962048, "grad_norm": 0.42551634695058566, "learning_rate": 5.7038893103166425e-05, "loss": 0.4553, "step": 278 }, { "epoch": 2.1176470588235294, "grad_norm": 0.25776313987894806, "learning_rate": 5.684562532732859e-05, "loss": 0.4467, "step": 279 }, { "epoch": 2.1252371916508537, "grad_norm": 0.27351669144074725, "learning_rate": 5.665187830905888e-05, "loss": 0.4415, "step": 280 }, { "epoch": 2.1328273244781784, "grad_norm": 0.41764999814129333, "learning_rate": 5.645765756028045e-05, "loss": 0.459, "step": 281 }, { "epoch": 2.140417457305503, "grad_norm": 0.4715282529881882, "learning_rate": 5.626296860639364e-05, "loss": 0.4535, "step": 282 }, { "epoch": 2.1480075901328273, "grad_norm": 0.45181614089506017, "learning_rate": 5.606781698611879e-05, "loss": 0.4557, "step": 283 }, { "epoch": 2.155597722960152, "grad_norm": 0.3928688694632629, "learning_rate": 5.587220825133867e-05, "loss": 0.4529, "step": 284 }, { "epoch": 2.163187855787476, "grad_norm": 0.3422352007203858, "learning_rate": 5.567614796694056e-05, "loss": 0.4478, "step": 285 }, { "epoch": 2.170777988614801, "grad_norm": 0.3858181479438661, "learning_rate": 5.5479641710657867e-05, "loss": 0.461, "step": 286 }, { "epoch": 2.178368121442125, "grad_norm": 0.4901941376432685, "learning_rate": 5.528269507291152e-05, "loss": 0.4533, "step": 287 }, { "epoch": 2.18595825426945, "grad_norm": 0.6077838701042644, "learning_rate": 5.5085313656650856e-05, "loss": 0.4565, "step": 288 }, { "epoch": 2.193548387096774, "grad_norm": 0.6334250948792183, "learning_rate": 5.48875030771943e-05, "loss": 0.4526, "step": 289 }, { "epoch": 2.2011385199240987, "grad_norm": 0.5394180746780861, "learning_rate": 5.468926896206955e-05, "loss": 0.4474, "step": 290 }, { "epoch": 2.2087286527514234, "grad_norm": 0.3688187782872463, "learning_rate": 5.4490616950853484e-05, "loss": 0.4486, "step": 291 }, { "epoch": 2.2163187855787476, "grad_norm": 0.28612624363569344, "learning_rate": 5.4291552695011786e-05, "loss": 0.4473, "step": 292 }, { "epoch": 2.2239089184060723, "grad_norm": 0.3786323162444375, "learning_rate": 5.409208185773806e-05, "loss": 0.4537, "step": 293 }, { "epoch": 2.2314990512333965, "grad_norm": 0.45998197157742643, "learning_rate": 5.389221011379281e-05, "loss": 0.445, "step": 294 }, { "epoch": 2.239089184060721, "grad_norm": 0.4227537195267863, "learning_rate": 5.3691943149341976e-05, "loss": 0.4524, "step": 295 }, { "epoch": 2.2466793168880455, "grad_norm": 0.3375900744876679, "learning_rate": 5.3491286661795104e-05, "loss": 0.4543, "step": 296 }, { "epoch": 2.25426944971537, "grad_norm": 0.3936283250723083, "learning_rate": 5.3290246359643365e-05, "loss": 0.4549, "step": 297 }, { "epoch": 2.2618595825426944, "grad_norm": 0.4158907529340202, "learning_rate": 5.3088827962297055e-05, "loss": 0.4615, "step": 298 }, { "epoch": 2.269449715370019, "grad_norm": 0.3573969971167834, "learning_rate": 5.288703719992296e-05, "loss": 0.4627, "step": 299 }, { "epoch": 2.2770398481973433, "grad_norm": 0.29339247941077856, "learning_rate": 5.2684879813281324e-05, "loss": 0.4527, "step": 300 }, { "epoch": 2.284629981024668, "grad_norm": 0.3811958753473836, "learning_rate": 5.248236155356244e-05, "loss": 0.4511, "step": 301 }, { "epoch": 2.292220113851992, "grad_norm": 0.3947350372974727, "learning_rate": 5.227948818222317e-05, "loss": 0.4551, "step": 302 }, { "epoch": 2.299810246679317, "grad_norm": 0.2959934006358651, "learning_rate": 5.207626547082294e-05, "loss": 0.451, "step": 303 }, { "epoch": 2.3074003795066416, "grad_norm": 0.3009410854470416, "learning_rate": 5.1872699200859606e-05, "loss": 0.4504, "step": 304 }, { "epoch": 2.314990512333966, "grad_norm": 0.38647651793826754, "learning_rate": 5.1668795163604924e-05, "loss": 0.4575, "step": 305 }, { "epoch": 2.3225806451612905, "grad_norm": 0.34305172614808316, "learning_rate": 5.1464559159939814e-05, "loss": 0.4513, "step": 306 }, { "epoch": 2.3301707779886147, "grad_norm": 0.3120007036591175, "learning_rate": 5.125999700018934e-05, "loss": 0.4601, "step": 307 }, { "epoch": 2.3377609108159394, "grad_norm": 0.31088228173825794, "learning_rate": 5.105511450395742e-05, "loss": 0.4605, "step": 308 }, { "epoch": 2.3453510436432636, "grad_norm": 0.24185319509946887, "learning_rate": 5.084991749996121e-05, "loss": 0.4544, "step": 309 }, { "epoch": 2.3529411764705883, "grad_norm": 0.3141319871949889, "learning_rate": 5.064441182586538e-05, "loss": 0.4477, "step": 310 }, { "epoch": 2.3605313092979125, "grad_norm": 0.3437798737764119, "learning_rate": 5.0438603328115915e-05, "loss": 0.438, "step": 311 }, { "epoch": 2.3681214421252372, "grad_norm": 0.3413170865670166, "learning_rate": 5.023249786177388e-05, "loss": 0.4496, "step": 312 }, { "epoch": 2.375711574952562, "grad_norm": 0.32816099400302223, "learning_rate": 5.002610129034883e-05, "loss": 0.4457, "step": 313 }, { "epoch": 2.383301707779886, "grad_norm": 0.23652738280230934, "learning_rate": 4.981941948563197e-05, "loss": 0.4518, "step": 314 }, { "epoch": 2.3908918406072104, "grad_norm": 0.3332470802079381, "learning_rate": 4.961245832752916e-05, "loss": 0.4553, "step": 315 }, { "epoch": 2.398481973434535, "grad_norm": 0.30703993672772734, "learning_rate": 4.940522370389355e-05, "loss": 0.4511, "step": 316 }, { "epoch": 2.4060721062618597, "grad_norm": 0.3458797214503799, "learning_rate": 4.919772151035819e-05, "loss": 0.4483, "step": 317 }, { "epoch": 2.413662239089184, "grad_norm": 0.33817212823710935, "learning_rate": 4.898995765016822e-05, "loss": 0.4602, "step": 318 }, { "epoch": 2.4212523719165087, "grad_norm": 0.28768592124027254, "learning_rate": 4.878193803401294e-05, "loss": 0.441, "step": 319 }, { "epoch": 2.428842504743833, "grad_norm": 0.24625871004420682, "learning_rate": 4.85736685798577e-05, "loss": 0.4447, "step": 320 }, { "epoch": 2.4364326375711576, "grad_norm": 0.3114815791554252, "learning_rate": 4.836515521277548e-05, "loss": 0.4506, "step": 321 }, { "epoch": 2.444022770398482, "grad_norm": 0.43608825596037326, "learning_rate": 4.8156403864778376e-05, "loss": 0.4559, "step": 322 }, { "epoch": 2.4516129032258065, "grad_norm": 0.3872177355726424, "learning_rate": 4.7947420474648826e-05, "loss": 0.4596, "step": 323 }, { "epoch": 2.4592030360531307, "grad_norm": 0.2265303368613466, "learning_rate": 4.773821098777061e-05, "loss": 0.4529, "step": 324 }, { "epoch": 2.4667931688804554, "grad_norm": 0.26489937931522084, "learning_rate": 4.7528781355959836e-05, "loss": 0.4462, "step": 325 }, { "epoch": 2.47438330170778, "grad_norm": 0.32008600117514796, "learning_rate": 4.731913753729543e-05, "loss": 0.4489, "step": 326 }, { "epoch": 2.4819734345351043, "grad_norm": 0.30655482675440676, "learning_rate": 4.710928549594979e-05, "loss": 0.4542, "step": 327 }, { "epoch": 2.489563567362429, "grad_norm": 0.24961472010620386, "learning_rate": 4.689923120201907e-05, "loss": 0.455, "step": 328 }, { "epoch": 2.4971537001897532, "grad_norm": 0.3196073862864069, "learning_rate": 4.668898063135327e-05, "loss": 0.4401, "step": 329 }, { "epoch": 2.504743833017078, "grad_norm": 0.277810170883558, "learning_rate": 4.647853976538635e-05, "loss": 0.4429, "step": 330 }, { "epoch": 2.512333965844402, "grad_norm": 0.2770203193332356, "learning_rate": 4.626791459096592e-05, "loss": 0.4509, "step": 331 }, { "epoch": 2.519924098671727, "grad_norm": 0.26941306970885837, "learning_rate": 4.605711110018307e-05, "loss": 0.4485, "step": 332 }, { "epoch": 2.527514231499051, "grad_norm": 0.2128205627033176, "learning_rate": 4.584613529020177e-05, "loss": 0.4567, "step": 333 }, { "epoch": 2.5351043643263758, "grad_norm": 0.2612809484941453, "learning_rate": 4.563499316308832e-05, "loss": 0.4454, "step": 334 }, { "epoch": 2.5426944971537004, "grad_norm": 0.2611991188114079, "learning_rate": 4.542369072564062e-05, "loss": 0.4527, "step": 335 }, { "epoch": 2.5502846299810247, "grad_norm": 0.21775843029252434, "learning_rate": 4.5212233989217217e-05, "loss": 0.4533, "step": 336 }, { "epoch": 2.557874762808349, "grad_norm": 0.24689100702507727, "learning_rate": 4.500062896956632e-05, "loss": 0.4564, "step": 337 }, { "epoch": 2.5654648956356736, "grad_norm": 0.26478079829629153, "learning_rate": 4.47888816866547e-05, "loss": 0.4529, "step": 338 }, { "epoch": 2.5730550284629983, "grad_norm": 0.27076572953883926, "learning_rate": 4.457699816449632e-05, "loss": 0.443, "step": 339 }, { "epoch": 2.5806451612903225, "grad_norm": 0.2578704602776011, "learning_rate": 4.436498443098108e-05, "loss": 0.4474, "step": 340 }, { "epoch": 2.588235294117647, "grad_norm": 0.22049010549186773, "learning_rate": 4.4152846517703265e-05, "loss": 0.45, "step": 341 }, { "epoch": 2.5958254269449714, "grad_norm": 0.24125071259305053, "learning_rate": 4.394059045978994e-05, "loss": 0.4481, "step": 342 }, { "epoch": 2.603415559772296, "grad_norm": 0.226901700766956, "learning_rate": 4.372822229572927e-05, "loss": 0.4457, "step": 343 }, { "epoch": 2.6110056925996203, "grad_norm": 0.2538357888941769, "learning_rate": 4.3515748067198734e-05, "loss": 0.4467, "step": 344 }, { "epoch": 2.618595825426945, "grad_norm": 0.24051209192684073, "learning_rate": 4.33031738188933e-05, "loss": 0.4612, "step": 345 }, { "epoch": 2.6261859582542693, "grad_norm": 0.1851624882291598, "learning_rate": 4.309050559835335e-05, "loss": 0.4447, "step": 346 }, { "epoch": 2.633776091081594, "grad_norm": 0.23729717589403226, "learning_rate": 4.287774945579268e-05, "loss": 0.4546, "step": 347 }, { "epoch": 2.6413662239089186, "grad_norm": 0.2414632155732589, "learning_rate": 4.266491144392646e-05, "loss": 0.4547, "step": 348 }, { "epoch": 2.648956356736243, "grad_norm": 0.1961262640553002, "learning_rate": 4.245199761779889e-05, "loss": 0.4528, "step": 349 }, { "epoch": 2.656546489563567, "grad_norm": 0.2519131470563294, "learning_rate": 4.223901403461104e-05, "loss": 0.4468, "step": 350 }, { "epoch": 2.6641366223908918, "grad_norm": 0.2871531404330494, "learning_rate": 4.202596675354851e-05, "loss": 0.4524, "step": 351 }, { "epoch": 2.6717267552182165, "grad_norm": 0.2328323960543026, "learning_rate": 4.1812861835609055e-05, "loss": 0.4477, "step": 352 }, { "epoch": 2.6793168880455407, "grad_norm": 0.34379368220276085, "learning_rate": 4.1599705343430126e-05, "loss": 0.4473, "step": 353 }, { "epoch": 2.6869070208728654, "grad_norm": 0.3077170581200678, "learning_rate": 4.138650334111641e-05, "loss": 0.4482, "step": 354 }, { "epoch": 2.6944971537001896, "grad_norm": 0.27170499447729773, "learning_rate": 4.117326189406733e-05, "loss": 0.4456, "step": 355 }, { "epoch": 2.7020872865275143, "grad_norm": 0.23574025133073204, "learning_rate": 4.095998706880449e-05, "loss": 0.441, "step": 356 }, { "epoch": 2.709677419354839, "grad_norm": 0.24705781444445205, "learning_rate": 4.0746684932799035e-05, "loss": 0.4546, "step": 357 }, { "epoch": 2.717267552182163, "grad_norm": 0.27539198214734206, "learning_rate": 4.05333615542991e-05, "loss": 0.4531, "step": 358 }, { "epoch": 2.7248576850094874, "grad_norm": 0.23526848668490832, "learning_rate": 4.032002300215715e-05, "loss": 0.4453, "step": 359 }, { "epoch": 2.732447817836812, "grad_norm": 0.23006904681882934, "learning_rate": 4.01066753456573e-05, "loss": 0.4498, "step": 360 }, { "epoch": 2.740037950664137, "grad_norm": 0.2370274892416526, "learning_rate": 3.989332465434272e-05, "loss": 0.4453, "step": 361 }, { "epoch": 2.747628083491461, "grad_norm": 0.21309601710918213, "learning_rate": 3.9679976997842875e-05, "loss": 0.4477, "step": 362 }, { "epoch": 2.7552182163187857, "grad_norm": 0.24539024717561866, "learning_rate": 3.946663844570091e-05, "loss": 0.4476, "step": 363 }, { "epoch": 2.76280834914611, "grad_norm": 0.18994827411578624, "learning_rate": 3.925331506720097e-05, "loss": 0.4464, "step": 364 }, { "epoch": 2.7703984819734346, "grad_norm": 0.23585418564376148, "learning_rate": 3.9040012931195515e-05, "loss": 0.4518, "step": 365 }, { "epoch": 2.777988614800759, "grad_norm": 0.2781600210524234, "learning_rate": 3.8826738105932674e-05, "loss": 0.4446, "step": 366 }, { "epoch": 2.7855787476280836, "grad_norm": 0.22272463213128638, "learning_rate": 3.8613496658883593e-05, "loss": 0.4593, "step": 367 }, { "epoch": 2.793168880455408, "grad_norm": 0.21131624600480586, "learning_rate": 3.8400294656569894e-05, "loss": 0.4553, "step": 368 }, { "epoch": 2.8007590132827325, "grad_norm": 0.31839013252012877, "learning_rate": 3.818713816439096e-05, "loss": 0.4548, "step": 369 }, { "epoch": 2.808349146110057, "grad_norm": 0.28626008068744174, "learning_rate": 3.7974033246451496e-05, "loss": 0.4454, "step": 370 }, { "epoch": 2.8159392789373814, "grad_norm": 0.16805045978558228, "learning_rate": 3.7760985965388975e-05, "loss": 0.4533, "step": 371 }, { "epoch": 2.8235294117647056, "grad_norm": 0.27576140446733527, "learning_rate": 3.7548002382201126e-05, "loss": 0.4528, "step": 372 }, { "epoch": 2.8311195445920303, "grad_norm": 0.2587367405759775, "learning_rate": 3.7335088556073554e-05, "loss": 0.4525, "step": 373 }, { "epoch": 2.838709677419355, "grad_norm": 0.2177015252391427, "learning_rate": 3.712225054420732e-05, "loss": 0.4466, "step": 374 }, { "epoch": 2.846299810246679, "grad_norm": 0.22016300401684588, "learning_rate": 3.690949440164667e-05, "loss": 0.4507, "step": 375 }, { "epoch": 2.853889943074004, "grad_norm": 0.2168768585860928, "learning_rate": 3.669682618110671e-05, "loss": 0.4537, "step": 376 }, { "epoch": 2.861480075901328, "grad_norm": 0.23069857787158976, "learning_rate": 3.648425193280128e-05, "loss": 0.4514, "step": 377 }, { "epoch": 2.869070208728653, "grad_norm": 0.2308590735052973, "learning_rate": 3.627177770427075e-05, "loss": 0.4517, "step": 378 }, { "epoch": 2.8766603415559775, "grad_norm": 0.16332486719450007, "learning_rate": 3.6059409540210075e-05, "loss": 0.4437, "step": 379 }, { "epoch": 2.8842504743833017, "grad_norm": 0.2372916479550056, "learning_rate": 3.5847153482296734e-05, "loss": 0.4516, "step": 380 }, { "epoch": 2.891840607210626, "grad_norm": 0.25887011649794794, "learning_rate": 3.563501556901892e-05, "loss": 0.4484, "step": 381 }, { "epoch": 2.8994307400379506, "grad_norm": 0.19423313672626585, "learning_rate": 3.5423001835503696e-05, "loss": 0.4489, "step": 382 }, { "epoch": 2.9070208728652753, "grad_norm": 0.2299380518686083, "learning_rate": 3.521111831334532e-05, "loss": 0.4458, "step": 383 }, { "epoch": 2.9146110056925996, "grad_norm": 0.19304361317692467, "learning_rate": 3.4999371030433694e-05, "loss": 0.4527, "step": 384 }, { "epoch": 2.9222011385199242, "grad_norm": 0.19969286229808306, "learning_rate": 3.47877660107828e-05, "loss": 0.4417, "step": 385 }, { "epoch": 2.9297912713472485, "grad_norm": 0.26755657071262773, "learning_rate": 3.4576309274359394e-05, "loss": 0.4611, "step": 386 }, { "epoch": 2.937381404174573, "grad_norm": 0.20374725518267028, "learning_rate": 3.436500683691168e-05, "loss": 0.4582, "step": 387 }, { "epoch": 2.9449715370018974, "grad_norm": 0.2258822384811313, "learning_rate": 3.4153864709798234e-05, "loss": 0.4475, "step": 388 }, { "epoch": 2.952561669829222, "grad_norm": 0.18638500489261803, "learning_rate": 3.394288889981695e-05, "loss": 0.445, "step": 389 }, { "epoch": 2.9601518026565463, "grad_norm": 0.2080901768578384, "learning_rate": 3.373208540903409e-05, "loss": 0.4515, "step": 390 }, { "epoch": 2.967741935483871, "grad_norm": 0.22482754779566425, "learning_rate": 3.3521460234613664e-05, "loss": 0.4476, "step": 391 }, { "epoch": 2.9753320683111957, "grad_norm": 0.21178017233679877, "learning_rate": 3.331101936864674e-05, "loss": 0.4503, "step": 392 }, { "epoch": 2.98292220113852, "grad_norm": 0.1947806663328843, "learning_rate": 3.310076879798095e-05, "loss": 0.4415, "step": 393 }, { "epoch": 2.990512333965844, "grad_norm": 0.18579209774749325, "learning_rate": 3.2890714504050216e-05, "loss": 0.446, "step": 394 }, { "epoch": 2.998102466793169, "grad_norm": 0.23138108723134526, "learning_rate": 3.268086246270458e-05, "loss": 0.5364, "step": 395 }, { "epoch": 3.0056925996204935, "grad_norm": 0.26045279058003457, "learning_rate": 3.2471218644040184e-05, "loss": 0.4487, "step": 396 }, { "epoch": 3.0132827324478177, "grad_norm": 0.21695656813890685, "learning_rate": 3.2261789012229394e-05, "loss": 0.4084, "step": 397 }, { "epoch": 3.0208728652751424, "grad_norm": 0.23426334273283442, "learning_rate": 3.205257952535119e-05, "loss": 0.4079, "step": 398 }, { "epoch": 3.0284629981024667, "grad_norm": 0.26850763056052446, "learning_rate": 3.184359613522163e-05, "loss": 0.4223, "step": 399 }, { "epoch": 3.0360531309297913, "grad_norm": 0.2683676862064319, "learning_rate": 3.1634844787224525e-05, "loss": 0.4182, "step": 400 }, { "epoch": 3.0436432637571156, "grad_norm": 0.26248839521550266, "learning_rate": 3.1426331420142314e-05, "loss": 0.4171, "step": 401 }, { "epoch": 3.0512333965844403, "grad_norm": 0.24770151611304786, "learning_rate": 3.121806196598706e-05, "loss": 0.4023, "step": 402 }, { "epoch": 3.0588235294117645, "grad_norm": 0.2613619938203551, "learning_rate": 3.10100423498318e-05, "loss": 0.4095, "step": 403 }, { "epoch": 3.066413662239089, "grad_norm": 0.2563470656186417, "learning_rate": 3.0802278489641816e-05, "loss": 0.4101, "step": 404 }, { "epoch": 3.074003795066414, "grad_norm": 0.22055572471052182, "learning_rate": 3.0594776296106464e-05, "loss": 0.4105, "step": 405 }, { "epoch": 3.081593927893738, "grad_norm": 0.22294658539566273, "learning_rate": 3.0387541672470857e-05, "loss": 0.4038, "step": 406 }, { "epoch": 3.0891840607210628, "grad_norm": 0.24713851379128027, "learning_rate": 3.0180580514368037e-05, "loss": 0.406, "step": 407 }, { "epoch": 3.096774193548387, "grad_norm": 0.20233125052294548, "learning_rate": 2.997389870965118e-05, "loss": 0.4067, "step": 408 }, { "epoch": 3.1043643263757117, "grad_norm": 0.2268353462063368, "learning_rate": 2.976750213822613e-05, "loss": 0.4069, "step": 409 }, { "epoch": 3.111954459203036, "grad_norm": 0.22067595331379308, "learning_rate": 2.9561396671884105e-05, "loss": 0.414, "step": 410 }, { "epoch": 3.1195445920303606, "grad_norm": 0.19168123310877352, "learning_rate": 2.9355588174134627e-05, "loss": 0.4052, "step": 411 }, { "epoch": 3.127134724857685, "grad_norm": 0.21765986129210937, "learning_rate": 2.9150082500038794e-05, "loss": 0.4084, "step": 412 }, { "epoch": 3.1347248576850095, "grad_norm": 0.17465524431494953, "learning_rate": 2.8944885496042593e-05, "loss": 0.4039, "step": 413 }, { "epoch": 3.1423149905123338, "grad_norm": 0.16136937176528135, "learning_rate": 2.874000299981067e-05, "loss": 0.4077, "step": 414 }, { "epoch": 3.1499051233396584, "grad_norm": 0.1842039752946862, "learning_rate": 2.8535440840060196e-05, "loss": 0.4114, "step": 415 }, { "epoch": 3.157495256166983, "grad_norm": 0.16275539565392577, "learning_rate": 2.83312048363951e-05, "loss": 0.4122, "step": 416 }, { "epoch": 3.1650853889943074, "grad_norm": 0.17613818747948046, "learning_rate": 2.812730079914041e-05, "loss": 0.4078, "step": 417 }, { "epoch": 3.172675521821632, "grad_norm": 0.1802212697189579, "learning_rate": 2.7923734529177076e-05, "loss": 0.4105, "step": 418 }, { "epoch": 3.1802656546489563, "grad_norm": 0.14800969306703707, "learning_rate": 2.772051181777684e-05, "loss": 0.4153, "step": 419 }, { "epoch": 3.187855787476281, "grad_norm": 0.17753235380567503, "learning_rate": 2.7517638446437574e-05, "loss": 0.4184, "step": 420 }, { "epoch": 3.195445920303605, "grad_norm": 0.16819549000064993, "learning_rate": 2.7315120186718686e-05, "loss": 0.4065, "step": 421 }, { "epoch": 3.20303605313093, "grad_norm": 0.18462973385672243, "learning_rate": 2.7112962800077034e-05, "loss": 0.4076, "step": 422 }, { "epoch": 3.210626185958254, "grad_norm": 0.16361853377388974, "learning_rate": 2.6911172037702962e-05, "loss": 0.4095, "step": 423 }, { "epoch": 3.218216318785579, "grad_norm": 0.18569025030207767, "learning_rate": 2.6709753640356652e-05, "loss": 0.4099, "step": 424 }, { "epoch": 3.225806451612903, "grad_norm": 0.1703754043113873, "learning_rate": 2.650871333820491e-05, "loss": 0.411, "step": 425 }, { "epoch": 3.2333965844402277, "grad_norm": 0.16840742937643677, "learning_rate": 2.6308056850658038e-05, "loss": 0.4114, "step": 426 }, { "epoch": 3.2409867172675524, "grad_norm": 0.15285852823906035, "learning_rate": 2.6107789886207195e-05, "loss": 0.4064, "step": 427 }, { "epoch": 3.2485768500948766, "grad_norm": 0.1713250127448791, "learning_rate": 2.5907918142261944e-05, "loss": 0.4167, "step": 428 }, { "epoch": 3.2561669829222013, "grad_norm": 0.1734237035758403, "learning_rate": 2.5708447304988227e-05, "loss": 0.4053, "step": 429 }, { "epoch": 3.2637571157495255, "grad_norm": 0.17043360973692148, "learning_rate": 2.5509383049146532e-05, "loss": 0.4037, "step": 430 }, { "epoch": 3.27134724857685, "grad_norm": 0.16479095536499094, "learning_rate": 2.5310731037930474e-05, "loss": 0.4071, "step": 431 }, { "epoch": 3.2789373814041745, "grad_norm": 0.17168397871604932, "learning_rate": 2.5112496922805712e-05, "loss": 0.4141, "step": 432 }, { "epoch": 3.286527514231499, "grad_norm": 0.15615068257123285, "learning_rate": 2.4914686343349158e-05, "loss": 0.4051, "step": 433 }, { "epoch": 3.2941176470588234, "grad_norm": 0.16211177369118324, "learning_rate": 2.4717304927088493e-05, "loss": 0.4091, "step": 434 }, { "epoch": 3.301707779886148, "grad_norm": 0.17500505329691834, "learning_rate": 2.4520358289342143e-05, "loss": 0.4157, "step": 435 }, { "epoch": 3.3092979127134727, "grad_norm": 0.16178628883403032, "learning_rate": 2.4323852033059447e-05, "loss": 0.4108, "step": 436 }, { "epoch": 3.316888045540797, "grad_norm": 0.15428648205322795, "learning_rate": 2.412779174866134e-05, "loss": 0.4133, "step": 437 }, { "epoch": 3.324478178368121, "grad_norm": 0.1648054065245008, "learning_rate": 2.393218301388123e-05, "loss": 0.4083, "step": 438 }, { "epoch": 3.332068311195446, "grad_norm": 0.14344944433387424, "learning_rate": 2.3737031393606376e-05, "loss": 0.4115, "step": 439 }, { "epoch": 3.3396584440227706, "grad_norm": 0.16677658490660136, "learning_rate": 2.3542342439719565e-05, "loss": 0.4101, "step": 440 }, { "epoch": 3.347248576850095, "grad_norm": 0.16428323673049788, "learning_rate": 2.3348121690941125e-05, "loss": 0.4033, "step": 441 }, { "epoch": 3.3548387096774195, "grad_norm": 0.14998088085310035, "learning_rate": 2.3154374672671417e-05, "loss": 0.4116, "step": 442 }, { "epoch": 3.3624288425047437, "grad_norm": 0.1646349150252062, "learning_rate": 2.2961106896833588e-05, "loss": 0.4053, "step": 443 }, { "epoch": 3.3700189753320684, "grad_norm": 0.1463235531505828, "learning_rate": 2.2768323861716778e-05, "loss": 0.4045, "step": 444 }, { "epoch": 3.3776091081593926, "grad_norm": 0.18600338855222787, "learning_rate": 2.2576031051819704e-05, "loss": 0.4145, "step": 445 }, { "epoch": 3.3851992409867173, "grad_norm": 0.15182364989779723, "learning_rate": 2.2384233937694626e-05, "loss": 0.412, "step": 446 }, { "epoch": 3.3927893738140416, "grad_norm": 0.1891499854862187, "learning_rate": 2.2192937975791757e-05, "loss": 0.4039, "step": 447 }, { "epoch": 3.4003795066413662, "grad_norm": 0.1524505414622732, "learning_rate": 2.2002148608303947e-05, "loss": 0.4059, "step": 448 }, { "epoch": 3.407969639468691, "grad_norm": 0.1570451938127235, "learning_rate": 2.1811871263011924e-05, "loss": 0.4063, "step": 449 }, { "epoch": 3.415559772296015, "grad_norm": 0.14780715290004184, "learning_rate": 2.1622111353129832e-05, "loss": 0.4137, "step": 450 }, { "epoch": 3.42314990512334, "grad_norm": 0.154720916330876, "learning_rate": 2.1432874277151337e-05, "loss": 0.4072, "step": 451 }, { "epoch": 3.430740037950664, "grad_norm": 0.14741003990298276, "learning_rate": 2.124416541869586e-05, "loss": 0.4106, "step": 452 }, { "epoch": 3.4383301707779887, "grad_norm": 0.13756307467876858, "learning_rate": 2.1055990146355566e-05, "loss": 0.4176, "step": 453 }, { "epoch": 3.445920303605313, "grad_norm": 0.1478071810974749, "learning_rate": 2.0868353813542633e-05, "loss": 0.4068, "step": 454 }, { "epoch": 3.4535104364326377, "grad_norm": 0.14471344451828674, "learning_rate": 2.068126175833685e-05, "loss": 0.4118, "step": 455 }, { "epoch": 3.461100569259962, "grad_norm": 0.1488219736052461, "learning_rate": 2.0494719303333836e-05, "loss": 0.412, "step": 456 }, { "epoch": 3.4686907020872866, "grad_norm": 0.7113380874758816, "learning_rate": 2.0308731755493577e-05, "loss": 0.4155, "step": 457 }, { "epoch": 3.476280834914611, "grad_norm": 0.13854452311181958, "learning_rate": 2.012330440598952e-05, "loss": 0.4058, "step": 458 }, { "epoch": 3.4838709677419355, "grad_norm": 0.17245669157249005, "learning_rate": 1.9938442530057904e-05, "loss": 0.4158, "step": 459 }, { "epoch": 3.4914611005692597, "grad_norm": 0.13334621900310906, "learning_rate": 1.975415138684781e-05, "loss": 0.4064, "step": 460 }, { "epoch": 3.4990512333965844, "grad_norm": 0.1704714868106143, "learning_rate": 1.9570436219271534e-05, "loss": 0.4053, "step": 461 }, { "epoch": 3.506641366223909, "grad_norm": 0.13841108978778513, "learning_rate": 1.9387302253855353e-05, "loss": 0.4084, "step": 462 }, { "epoch": 3.5142314990512333, "grad_norm": 0.14973879725428832, "learning_rate": 1.9204754700590878e-05, "loss": 0.412, "step": 463 }, { "epoch": 3.521821631878558, "grad_norm": 0.14960927710961097, "learning_rate": 1.9022798752786896e-05, "loss": 0.4118, "step": 464 }, { "epoch": 3.5294117647058822, "grad_norm": 0.14735649088289546, "learning_rate": 1.8841439586921515e-05, "loss": 0.4066, "step": 465 }, { "epoch": 3.537001897533207, "grad_norm": 0.15821300680395564, "learning_rate": 1.8660682362494926e-05, "loss": 0.416, "step": 466 }, { "epoch": 3.544592030360531, "grad_norm": 0.14559017597813168, "learning_rate": 1.848053222188271e-05, "loss": 0.4095, "step": 467 }, { "epoch": 3.552182163187856, "grad_norm": 0.15314476683793304, "learning_rate": 1.8300994290189452e-05, "loss": 0.4094, "step": 468 }, { "epoch": 3.55977229601518, "grad_norm": 0.169795874861623, "learning_rate": 1.8122073675102935e-05, "loss": 0.418, "step": 469 }, { "epoch": 3.5673624288425048, "grad_norm": 0.1443798018236273, "learning_rate": 1.7943775466748867e-05, "loss": 0.4086, "step": 470 }, { "epoch": 3.5749525616698294, "grad_norm": 0.16667220421287227, "learning_rate": 1.7766104737546102e-05, "loss": 0.4079, "step": 471 }, { "epoch": 3.5825426944971537, "grad_norm": 0.1346641081419542, "learning_rate": 1.7589066542062253e-05, "loss": 0.4076, "step": 472 }, { "epoch": 3.590132827324478, "grad_norm": 0.15846447495361166, "learning_rate": 1.741266591686991e-05, "loss": 0.4059, "step": 473 }, { "epoch": 3.5977229601518026, "grad_norm": 0.1366720287265696, "learning_rate": 1.7236907880403447e-05, "loss": 0.4078, "step": 474 }, { "epoch": 3.6053130929791273, "grad_norm": 0.16252820939929033, "learning_rate": 1.7061797432816138e-05, "loss": 0.4073, "step": 475 }, { "epoch": 3.6129032258064515, "grad_norm": 0.1494339978487333, "learning_rate": 1.6887339555837948e-05, "loss": 0.4081, "step": 476 }, { "epoch": 3.620493358633776, "grad_norm": 0.14531376606247468, "learning_rate": 1.671353921263386e-05, "loss": 0.4072, "step": 477 }, { "epoch": 3.6280834914611004, "grad_norm": 0.1388375063144433, "learning_rate": 1.654040134766259e-05, "loss": 0.4075, "step": 478 }, { "epoch": 3.635673624288425, "grad_norm": 0.13505123860416815, "learning_rate": 1.6367930886535957e-05, "loss": 0.4145, "step": 479 }, { "epoch": 3.64326375711575, "grad_norm": 0.13194605801302411, "learning_rate": 1.619613273587879e-05, "loss": 0.4177, "step": 480 }, { "epoch": 3.650853889943074, "grad_norm": 0.14166452387033596, "learning_rate": 1.602501178318928e-05, "loss": 0.4161, "step": 481 }, { "epoch": 3.6584440227703983, "grad_norm": 0.13583541432935878, "learning_rate": 1.5854572896699977e-05, "loss": 0.4105, "step": 482 }, { "epoch": 3.666034155597723, "grad_norm": 0.15005698486131297, "learning_rate": 1.5684820925239273e-05, "loss": 0.398, "step": 483 }, { "epoch": 3.6736242884250476, "grad_norm": 0.13590326204495468, "learning_rate": 1.5515760698093485e-05, "loss": 0.408, "step": 484 }, { "epoch": 3.681214421252372, "grad_norm": 0.14280292743110926, "learning_rate": 1.5347397024869423e-05, "loss": 0.4102, "step": 485 }, { "epoch": 3.6888045540796965, "grad_norm": 0.14960004062226398, "learning_rate": 1.5179734695357584e-05, "loss": 0.4048, "step": 486 }, { "epoch": 3.6963946869070208, "grad_norm": 0.14308871822409527, "learning_rate": 1.5012778479395892e-05, "loss": 0.41, "step": 487 }, { "epoch": 3.7039848197343455, "grad_norm": 0.17743355555341997, "learning_rate": 1.4846533126733999e-05, "loss": 0.4066, "step": 488 }, { "epoch": 3.7115749525616697, "grad_norm": 0.13804122945298328, "learning_rate": 1.4681003366898132e-05, "loss": 0.4108, "step": 489 }, { "epoch": 3.7191650853889944, "grad_norm": 0.13970618164447296, "learning_rate": 1.4516193909056609e-05, "loss": 0.4029, "step": 490 }, { "epoch": 3.7267552182163186, "grad_norm": 0.16177493370388654, "learning_rate": 1.4352109441885786e-05, "loss": 0.4083, "step": 491 }, { "epoch": 3.7343453510436433, "grad_norm": 0.12082905813373115, "learning_rate": 1.4188754633436718e-05, "loss": 0.4013, "step": 492 }, { "epoch": 3.741935483870968, "grad_norm": 0.14795325276605165, "learning_rate": 1.4026134131002347e-05, "loss": 0.4101, "step": 493 }, { "epoch": 3.749525616698292, "grad_norm": 0.14127927316169223, "learning_rate": 1.3864252560985283e-05, "loss": 0.414, "step": 494 }, { "epoch": 3.7571157495256164, "grad_norm": 0.13865730984205682, "learning_rate": 1.3703114528766203e-05, "loss": 0.4029, "step": 495 }, { "epoch": 3.764705882352941, "grad_norm": 0.14552488808913097, "learning_rate": 1.35427246185728e-05, "loss": 0.4073, "step": 496 }, { "epoch": 3.772296015180266, "grad_norm": 0.1357309007526371, "learning_rate": 1.3383087393349436e-05, "loss": 0.4091, "step": 497 }, { "epoch": 3.77988614800759, "grad_norm": 0.1324930748382244, "learning_rate": 1.3224207394627241e-05, "loss": 0.4122, "step": 498 }, { "epoch": 3.7874762808349147, "grad_norm": 0.1362271179146437, "learning_rate": 1.306608914239496e-05, "loss": 0.4041, "step": 499 }, { "epoch": 3.795066413662239, "grad_norm": 0.12409927061704383, "learning_rate": 1.2908737134970367e-05, "loss": 0.4056, "step": 500 }, { "epoch": 3.8026565464895636, "grad_norm": 0.13343164571477334, "learning_rate": 1.2752155848872266e-05, "loss": 0.4096, "step": 501 }, { "epoch": 3.8102466793168883, "grad_norm": 0.11787497363701406, "learning_rate": 1.2596349738693162e-05, "loss": 0.3975, "step": 502 }, { "epoch": 3.8178368121442126, "grad_norm": 0.12436712059884664, "learning_rate": 1.2441323236972536e-05, "loss": 0.4103, "step": 503 }, { "epoch": 3.825426944971537, "grad_norm": 0.1217813385845422, "learning_rate": 1.2287080754070719e-05, "loss": 0.407, "step": 504 }, { "epoch": 3.8330170777988615, "grad_norm": 0.11341001168506011, "learning_rate": 1.2133626678043426e-05, "loss": 0.4113, "step": 505 }, { "epoch": 3.840607210626186, "grad_norm": 0.10943902346608907, "learning_rate": 1.1980965374516922e-05, "loss": 0.4042, "step": 506 }, { "epoch": 3.8481973434535104, "grad_norm": 0.13586692787728996, "learning_rate": 1.1829101186563876e-05, "loss": 0.4149, "step": 507 }, { "epoch": 3.855787476280835, "grad_norm": 0.11690469403831676, "learning_rate": 1.167803843457969e-05, "loss": 0.4174, "step": 508 }, { "epoch": 3.8633776091081593, "grad_norm": 0.11141063470580559, "learning_rate": 1.1527781416159684e-05, "loss": 0.4064, "step": 509 }, { "epoch": 3.870967741935484, "grad_norm": 0.14761043009465893, "learning_rate": 1.1378334405976829e-05, "loss": 0.4095, "step": 510 }, { "epoch": 3.878557874762808, "grad_norm": 0.12802936815914415, "learning_rate": 1.122970165566009e-05, "loss": 0.4126, "step": 511 }, { "epoch": 3.886148007590133, "grad_norm": 0.11715478782287307, "learning_rate": 1.1081887393673481e-05, "loss": 0.4039, "step": 512 }, { "epoch": 3.893738140417457, "grad_norm": 0.12113439474595457, "learning_rate": 1.0934895825195807e-05, "loss": 0.4039, "step": 513 }, { "epoch": 3.901328273244782, "grad_norm": 0.12045212518695263, "learning_rate": 1.0788731132000985e-05, "loss": 0.4157, "step": 514 }, { "epoch": 3.9089184060721065, "grad_norm": 0.11325751581490795, "learning_rate": 1.0643397472339103e-05, "loss": 0.4058, "step": 515 }, { "epoch": 3.9165085388994307, "grad_norm": 0.11669592480574363, "learning_rate": 1.0498898980818115e-05, "loss": 0.4082, "step": 516 }, { "epoch": 3.924098671726755, "grad_norm": 0.11461527608695707, "learning_rate": 1.035523976828623e-05, "loss": 0.419, "step": 517 }, { "epoch": 3.9316888045540797, "grad_norm": 0.11809514587490393, "learning_rate": 1.0212423921714923e-05, "loss": 0.4158, "step": 518 }, { "epoch": 3.9392789373814043, "grad_norm": 0.11115980306830088, "learning_rate": 1.0070455504082695e-05, "loss": 0.4095, "step": 519 }, { "epoch": 3.9468690702087286, "grad_norm": 0.114832121716292, "learning_rate": 9.92933855425951e-06, "loss": 0.4154, "step": 520 }, { "epoch": 3.9544592030360532, "grad_norm": 0.11130484634692327, "learning_rate": 9.789077086891802e-06, "loss": 0.4137, "step": 521 }, { "epoch": 3.9620493358633775, "grad_norm": 0.10695561553946319, "learning_rate": 9.649675092288366e-06, "loss": 0.4006, "step": 522 }, { "epoch": 3.969639468690702, "grad_norm": 0.12052592151934423, "learning_rate": 9.511136536306793e-06, "loss": 0.4082, "step": 523 }, { "epoch": 3.9772296015180264, "grad_norm": 0.10913274770762135, "learning_rate": 9.373465360240627e-06, "loss": 0.4134, "step": 524 }, { "epoch": 3.984819734345351, "grad_norm": 0.11482531714881333, "learning_rate": 9.236665480707266e-06, "loss": 0.405, "step": 525 }, { "epoch": 3.9924098671726753, "grad_norm": 0.11550707751512918, "learning_rate": 9.100740789536515e-06, "loss": 0.4061, "step": 526 }, { "epoch": 4.0, "grad_norm": 0.14100318690739433, "learning_rate": 8.96569515365993e-06, "loss": 0.5074, "step": 527 }, { "epoch": 4.007590132827325, "grad_norm": 0.19514222390213506, "learning_rate": 8.831532415000685e-06, "loss": 0.3785, "step": 528 }, { "epoch": 4.015180265654649, "grad_norm": 0.14576099759098526, "learning_rate": 8.698256390364386e-06, "loss": 0.373, "step": 529 }, { "epoch": 4.022770398481973, "grad_norm": 0.13042501540336135, "learning_rate": 8.565870871330463e-06, "loss": 0.3799, "step": 530 }, { "epoch": 4.030360531309298, "grad_norm": 0.1565052782903353, "learning_rate": 8.434379624144261e-06, "loss": 0.3881, "step": 531 }, { "epoch": 4.0379506641366225, "grad_norm": 0.16603797889226304, "learning_rate": 8.303786389609914e-06, "loss": 0.386, "step": 532 }, { "epoch": 4.045540796963947, "grad_norm": 0.16546797622934456, "learning_rate": 8.17409488298396e-06, "loss": 0.3847, "step": 533 }, { "epoch": 4.053130929791271, "grad_norm": 0.14996763626729143, "learning_rate": 8.0453087938696e-06, "loss": 0.3816, "step": 534 }, { "epoch": 4.060721062618596, "grad_norm": 0.14469013701568176, "learning_rate": 7.917431786111698e-06, "loss": 0.3814, "step": 535 }, { "epoch": 4.06831119544592, "grad_norm": 0.14926356991103681, "learning_rate": 7.790467497692678e-06, "loss": 0.3779, "step": 536 }, { "epoch": 4.075901328273245, "grad_norm": 0.1655187733066776, "learning_rate": 7.664419540628886e-06, "loss": 0.3884, "step": 537 }, { "epoch": 4.08349146110057, "grad_norm": 0.16471106758966367, "learning_rate": 7.539291500867918e-06, "loss": 0.3823, "step": 538 }, { "epoch": 4.0910815939278935, "grad_norm": 0.146447604920726, "learning_rate": 7.415086938186542e-06, "loss": 0.392, "step": 539 }, { "epoch": 4.098671726755218, "grad_norm": 0.14674866169218687, "learning_rate": 7.291809386089515e-06, "loss": 0.3807, "step": 540 }, { "epoch": 4.106261859582543, "grad_norm": 0.1654058535020922, "learning_rate": 7.169462351708958e-06, "loss": 0.3852, "step": 541 }, { "epoch": 4.1138519924098675, "grad_norm": 0.1447542059683869, "learning_rate": 7.048049315704611e-06, "loss": 0.3831, "step": 542 }, { "epoch": 4.121442125237191, "grad_norm": 0.1261716394650113, "learning_rate": 6.927573732164879e-06, "loss": 0.3831, "step": 543 }, { "epoch": 4.129032258064516, "grad_norm": 0.15942710883242464, "learning_rate": 6.808039028508475e-06, "loss": 0.3835, "step": 544 }, { "epoch": 4.136622390891841, "grad_norm": 0.13949828863354824, "learning_rate": 6.6894486053869525e-06, "loss": 0.3811, "step": 545 }, { "epoch": 4.144212523719165, "grad_norm": 0.1298354723268211, "learning_rate": 6.571805836587981e-06, "loss": 0.3771, "step": 546 }, { "epoch": 4.151802656546489, "grad_norm": 0.11985046208878425, "learning_rate": 6.455114068939323e-06, "loss": 0.3865, "step": 547 }, { "epoch": 4.159392789373814, "grad_norm": 0.1444577266660429, "learning_rate": 6.3393766222136445e-06, "loss": 0.3826, "step": 548 }, { "epoch": 4.1669829222011385, "grad_norm": 0.13012757211149162, "learning_rate": 6.224596789034061e-06, "loss": 0.3809, "step": 549 }, { "epoch": 4.174573055028463, "grad_norm": 0.11192946871125323, "learning_rate": 6.1107778347804814e-06, "loss": 0.3826, "step": 550 }, { "epoch": 4.182163187855788, "grad_norm": 0.11903117545520656, "learning_rate": 5.99792299749669e-06, "loss": 0.3768, "step": 551 }, { "epoch": 4.189753320683112, "grad_norm": 0.12263935198172611, "learning_rate": 5.886035487798229e-06, "loss": 0.3807, "step": 552 }, { "epoch": 4.197343453510436, "grad_norm": 0.11730507620931735, "learning_rate": 5.775118488781099e-06, "loss": 0.3822, "step": 553 }, { "epoch": 4.204933586337761, "grad_norm": 0.11587381154598554, "learning_rate": 5.665175155931133e-06, "loss": 0.3827, "step": 554 }, { "epoch": 4.212523719165086, "grad_norm": 0.1134319753947347, "learning_rate": 5.556208617034289e-06, "loss": 0.3766, "step": 555 }, { "epoch": 4.2201138519924095, "grad_norm": 0.1050634846149617, "learning_rate": 5.448221972087631e-06, "loss": 0.3792, "step": 556 }, { "epoch": 4.227703984819734, "grad_norm": 0.10861492093915034, "learning_rate": 5.341218293211143e-06, "loss": 0.3857, "step": 557 }, { "epoch": 4.235294117647059, "grad_norm": 0.10851653304619344, "learning_rate": 5.235200624560341e-06, "loss": 0.3795, "step": 558 }, { "epoch": 4.242884250474384, "grad_norm": 0.10296399140659968, "learning_rate": 5.130171982239685e-06, "loss": 0.3846, "step": 559 }, { "epoch": 4.250474383301707, "grad_norm": 0.10426146312115164, "learning_rate": 5.026135354216717e-06, "loss": 0.383, "step": 560 }, { "epoch": 4.258064516129032, "grad_norm": 0.10232852820462886, "learning_rate": 4.923093700237109e-06, "loss": 0.3868, "step": 561 }, { "epoch": 4.265654648956357, "grad_norm": 0.10102795561292162, "learning_rate": 4.821049951740442e-06, "loss": 0.3781, "step": 562 }, { "epoch": 4.273244781783681, "grad_norm": 0.10261093996817437, "learning_rate": 4.720007011776808e-06, "loss": 0.3802, "step": 563 }, { "epoch": 4.280834914611006, "grad_norm": 0.09919927002366413, "learning_rate": 4.6199677549242285e-06, "loss": 0.3837, "step": 564 }, { "epoch": 4.28842504743833, "grad_norm": 0.10836098588585662, "learning_rate": 4.520935027206857e-06, "loss": 0.3869, "step": 565 }, { "epoch": 4.2960151802656545, "grad_norm": 0.10526098051103763, "learning_rate": 4.4229116460140495e-06, "loss": 0.377, "step": 566 }, { "epoch": 4.303605313092979, "grad_norm": 0.09914351892145006, "learning_rate": 4.325900400020176e-06, "loss": 0.3786, "step": 567 }, { "epoch": 4.311195445920304, "grad_norm": 0.10303085522084827, "learning_rate": 4.229904049105287e-06, "loss": 0.3799, "step": 568 }, { "epoch": 4.318785578747628, "grad_norm": 0.09870659666807487, "learning_rate": 4.1349253242766265e-06, "loss": 0.3723, "step": 569 }, { "epoch": 4.326375711574952, "grad_norm": 0.09910026166356478, "learning_rate": 4.040966927590901e-06, "loss": 0.3839, "step": 570 }, { "epoch": 4.333965844402277, "grad_norm": 0.10308676223772309, "learning_rate": 3.9480315320774524e-06, "loss": 0.3819, "step": 571 }, { "epoch": 4.341555977229602, "grad_norm": 0.11412486094736646, "learning_rate": 3.856121781662148e-06, "loss": 0.3886, "step": 572 }, { "epoch": 4.349146110056926, "grad_norm": 0.09763532306888358, "learning_rate": 3.7652402910922513e-06, "loss": 0.3798, "step": 573 }, { "epoch": 4.35673624288425, "grad_norm": 0.09695937184022163, "learning_rate": 3.675389645861951e-06, "loss": 0.3855, "step": 574 }, { "epoch": 4.364326375711575, "grad_norm": 0.10172178456609236, "learning_rate": 3.5865724021388437e-06, "loss": 0.3893, "step": 575 }, { "epoch": 4.3719165085389, "grad_norm": 0.10300828152047768, "learning_rate": 3.4987910866912402e-06, "loss": 0.3873, "step": 576 }, { "epoch": 4.379506641366224, "grad_norm": 0.09672665094607047, "learning_rate": 3.4120481968162022e-06, "loss": 0.3875, "step": 577 }, { "epoch": 4.387096774193548, "grad_norm": 0.09743576074956742, "learning_rate": 3.32634620026858e-06, "loss": 0.3792, "step": 578 }, { "epoch": 4.394686907020873, "grad_norm": 0.09625146137130541, "learning_rate": 3.241687535190776e-06, "loss": 0.3867, "step": 579 }, { "epoch": 4.402277039848197, "grad_norm": 0.0945950664728936, "learning_rate": 3.1580746100433646e-06, "loss": 0.3824, "step": 580 }, { "epoch": 4.409867172675522, "grad_norm": 0.0980784024646366, "learning_rate": 3.0755098035365917e-06, "loss": 0.3839, "step": 581 }, { "epoch": 4.417457305502847, "grad_norm": 0.10142930578038363, "learning_rate": 2.9939954645626934e-06, "loss": 0.3831, "step": 582 }, { "epoch": 4.425047438330171, "grad_norm": 0.1024718070994225, "learning_rate": 2.913533912129105e-06, "loss": 0.3838, "step": 583 }, { "epoch": 4.432637571157495, "grad_norm": 0.09460467739691439, "learning_rate": 2.8341274352924197e-06, "loss": 0.3856, "step": 584 }, { "epoch": 4.44022770398482, "grad_norm": 0.09628762445931284, "learning_rate": 2.7557782930933298e-06, "loss": 0.3813, "step": 585 }, { "epoch": 4.447817836812145, "grad_norm": 0.09194772578210779, "learning_rate": 2.6784887144923445e-06, "loss": 0.3817, "step": 586 }, { "epoch": 4.455407969639468, "grad_norm": 0.08931743384621908, "learning_rate": 2.6022608983063522e-06, "loss": 0.3788, "step": 587 }, { "epoch": 4.462998102466793, "grad_norm": 0.09150589167868585, "learning_rate": 2.5270970131460937e-06, "loss": 0.3866, "step": 588 }, { "epoch": 4.470588235294118, "grad_norm": 0.09812252230877141, "learning_rate": 2.4529991973544664e-06, "loss": 0.3903, "step": 589 }, { "epoch": 4.478178368121442, "grad_norm": 0.09412962970253295, "learning_rate": 2.3799695589456695e-06, "loss": 0.3812, "step": 590 }, { "epoch": 4.485768500948766, "grad_norm": 0.09424880620907046, "learning_rate": 2.308010175545232e-06, "loss": 0.3838, "step": 591 }, { "epoch": 4.493358633776091, "grad_norm": 0.09122643056285845, "learning_rate": 2.2371230943309598e-06, "loss": 0.3896, "step": 592 }, { "epoch": 4.500948766603416, "grad_norm": 0.09096615886328271, "learning_rate": 2.1673103319746146e-06, "loss": 0.3785, "step": 593 }, { "epoch": 4.50853889943074, "grad_norm": 0.0997668049582305, "learning_rate": 2.0985738745846086e-06, "loss": 0.3873, "step": 594 }, { "epoch": 4.516129032258064, "grad_norm": 0.09598005147735333, "learning_rate": 2.0309156776494497e-06, "loss": 0.3755, "step": 595 }, { "epoch": 4.523719165085389, "grad_norm": 0.09405869487813164, "learning_rate": 1.964337665982172e-06, "loss": 0.3923, "step": 596 }, { "epoch": 4.531309297912713, "grad_norm": 0.0916297167969929, "learning_rate": 1.898841733665515e-06, "loss": 0.3836, "step": 597 }, { "epoch": 4.538899430740038, "grad_norm": 0.09844277128999404, "learning_rate": 1.8344297439980475e-06, "loss": 0.3814, "step": 598 }, { "epoch": 4.546489563567363, "grad_norm": 0.09263986116298609, "learning_rate": 1.7711035294412094e-06, "loss": 0.3874, "step": 599 }, { "epoch": 4.554079696394687, "grad_norm": 0.09337538395396143, "learning_rate": 1.7088648915671236e-06, "loss": 0.3819, "step": 600 }, { "epoch": 4.561669829222011, "grad_norm": 0.09644194634203436, "learning_rate": 1.6477156010073693e-06, "loss": 0.3859, "step": 601 }, { "epoch": 4.569259962049336, "grad_norm": 0.09104976943289746, "learning_rate": 1.5876573974026043e-06, "loss": 0.3859, "step": 602 }, { "epoch": 4.576850094876661, "grad_norm": 0.1036240608596365, "learning_rate": 1.5286919893530727e-06, "loss": 0.378, "step": 603 }, { "epoch": 4.584440227703984, "grad_norm": 0.10319393509093422, "learning_rate": 1.4708210543700019e-06, "loss": 0.3821, "step": 604 }, { "epoch": 4.592030360531309, "grad_norm": 0.09419955371127982, "learning_rate": 1.4140462388278641e-06, "loss": 0.382, "step": 605 }, { "epoch": 4.599620493358634, "grad_norm": 0.09325440545672938, "learning_rate": 1.3583691579175563e-06, "loss": 0.3796, "step": 606 }, { "epoch": 4.6072106261859584, "grad_norm": 0.0948062866747191, "learning_rate": 1.3037913956004444e-06, "loss": 0.3802, "step": 607 }, { "epoch": 4.614800759013283, "grad_norm": 0.08880917402719704, "learning_rate": 1.2503145045632903e-06, "loss": 0.3837, "step": 608 }, { "epoch": 4.622390891840607, "grad_norm": 0.10097582002452218, "learning_rate": 1.1979400061741075e-06, "loss": 0.3771, "step": 609 }, { "epoch": 4.629981024667932, "grad_norm": 0.09101806397046262, "learning_rate": 1.146669390438837e-06, "loss": 0.3806, "step": 610 }, { "epoch": 4.637571157495256, "grad_norm": 0.08881115056650345, "learning_rate": 1.0965041159589806e-06, "loss": 0.3891, "step": 611 }, { "epoch": 4.645161290322581, "grad_norm": 0.09427363876334546, "learning_rate": 1.047445609890132e-06, "loss": 0.3889, "step": 612 }, { "epoch": 4.652751423149905, "grad_norm": 0.09134492739832982, "learning_rate": 9.994952679013292e-07, "loss": 0.3805, "step": 613 }, { "epoch": 4.660341555977229, "grad_norm": 0.08733457555820553, "learning_rate": 9.526544541353622e-07, "loss": 0.3721, "step": 614 }, { "epoch": 4.667931688804554, "grad_norm": 0.09375394173236, "learning_rate": 9.069245011699901e-07, "loss": 0.3809, "step": 615 }, { "epoch": 4.675521821631879, "grad_norm": 0.09910629089900622, "learning_rate": 8.623067099800076e-07, "loss": 0.3781, "step": 616 }, { "epoch": 4.6831119544592035, "grad_norm": 0.08948539795632664, "learning_rate": 8.188023499002206e-07, "loss": 0.3852, "step": 617 }, { "epoch": 4.690702087286527, "grad_norm": 0.09627861222029245, "learning_rate": 7.764126585893694e-07, "loss": 0.3781, "step": 618 }, { "epoch": 4.698292220113852, "grad_norm": 0.08783688983476867, "learning_rate": 7.351388419948979e-07, "loss": 0.3837, "step": 619 }, { "epoch": 4.705882352941177, "grad_norm": 0.08787642986660921, "learning_rate": 6.949820743186353e-07, "loss": 0.3932, "step": 620 }, { "epoch": 4.713472485768501, "grad_norm": 0.08788042105203867, "learning_rate": 6.559434979834223e-07, "loss": 0.3821, "step": 621 }, { "epoch": 4.721062618595825, "grad_norm": 0.08638210961076086, "learning_rate": 6.180242236005818e-07, "loss": 0.385, "step": 622 }, { "epoch": 4.72865275142315, "grad_norm": 0.08376769284093517, "learning_rate": 5.812253299383308e-07, "loss": 0.3764, "step": 623 }, { "epoch": 4.7362428842504745, "grad_norm": 0.08758279977677409, "learning_rate": 5.455478638911071e-07, "loss": 0.3784, "step": 624 }, { "epoch": 4.743833017077799, "grad_norm": 0.08749897373635039, "learning_rate": 5.109928404497532e-07, "loss": 0.3864, "step": 625 }, { "epoch": 4.751423149905124, "grad_norm": 0.08812251812037451, "learning_rate": 4.775612426726684e-07, "loss": 0.3832, "step": 626 }, { "epoch": 4.759013282732448, "grad_norm": 0.08938212786163188, "learning_rate": 4.452540216578349e-07, "loss": 0.3778, "step": 627 }, { "epoch": 4.766603415559772, "grad_norm": 0.09686068566031868, "learning_rate": 4.140720965157519e-07, "loss": 0.3882, "step": 628 }, { "epoch": 4.774193548387097, "grad_norm": 0.0845764980027318, "learning_rate": 3.840163543433084e-07, "loss": 0.3778, "step": 629 }, { "epoch": 4.781783681214421, "grad_norm": 0.08252440850107318, "learning_rate": 3.550876501985112e-07, "loss": 0.3758, "step": 630 }, { "epoch": 4.7893738140417454, "grad_norm": 0.08576998372456936, "learning_rate": 3.272868070761881e-07, "loss": 0.3857, "step": 631 }, { "epoch": 4.79696394686907, "grad_norm": 0.08653227658708267, "learning_rate": 3.006146158845713e-07, "loss": 0.387, "step": 632 }, { "epoch": 4.804554079696395, "grad_norm": 0.0868336765504615, "learning_rate": 2.750718354227822e-07, "loss": 0.3918, "step": 633 }, { "epoch": 4.8121442125237195, "grad_norm": 0.08524574086673203, "learning_rate": 2.506591923592572e-07, "loss": 0.3879, "step": 634 }, { "epoch": 4.819734345351043, "grad_norm": 0.0854074630262595, "learning_rate": 2.273773812110802e-07, "loss": 0.3822, "step": 635 }, { "epoch": 4.827324478178368, "grad_norm": 0.08782569347415704, "learning_rate": 2.0522706432419382e-07, "loss": 0.389, "step": 636 }, { "epoch": 4.834914611005693, "grad_norm": 0.08728478399615792, "learning_rate": 1.842088718546009e-07, "loss": 0.3792, "step": 637 }, { "epoch": 4.842504743833017, "grad_norm": 0.08691066239057142, "learning_rate": 1.6432340175039253e-07, "loss": 0.3784, "step": 638 }, { "epoch": 4.850094876660341, "grad_norm": 0.09126251564727415, "learning_rate": 1.4557121973477472e-07, "loss": 0.3861, "step": 639 }, { "epoch": 4.857685009487666, "grad_norm": 0.08741494339348937, "learning_rate": 1.2795285928994372e-07, "loss": 0.3864, "step": 640 }, { "epoch": 4.8652751423149905, "grad_norm": 0.0876181788276896, "learning_rate": 1.1146882164193795e-07, "loss": 0.3847, "step": 641 }, { "epoch": 4.872865275142315, "grad_norm": 0.08754414570978997, "learning_rate": 9.611957574634734e-08, "loss": 0.3804, "step": 642 }, { "epoch": 4.88045540796964, "grad_norm": 0.08306030029527668, "learning_rate": 8.190555827499947e-08, "loss": 0.3815, "step": 643 }, { "epoch": 4.888045540796964, "grad_norm": 0.09068276050158985, "learning_rate": 6.882717360352065e-08, "loss": 0.3833, "step": 644 }, { "epoch": 4.895635673624288, "grad_norm": 0.08477830736645849, "learning_rate": 5.688479379984291e-08, "loss": 0.3865, "step": 645 }, { "epoch": 4.903225806451613, "grad_norm": 0.08365266522816545, "learning_rate": 4.607875861359024e-08, "loss": 0.3733, "step": 646 }, { "epoch": 4.910815939278938, "grad_norm": 0.08509382362942203, "learning_rate": 3.640937546646406e-08, "loss": 0.3801, "step": 647 }, { "epoch": 4.9184060721062615, "grad_norm": 0.08832189058708398, "learning_rate": 2.787691944345472e-08, "loss": 0.382, "step": 648 }, { "epoch": 4.925996204933586, "grad_norm": 0.08727763140855628, "learning_rate": 2.0481633285025505e-08, "loss": 0.3799, "step": 649 }, { "epoch": 4.933586337760911, "grad_norm": 0.08685568551948444, "learning_rate": 1.4223727380215935e-08, "loss": 0.3812, "step": 650 }, { "epoch": 4.9411764705882355, "grad_norm": 0.08827489813293968, "learning_rate": 9.103379760655451e-09, "loss": 0.3897, "step": 651 }, { "epoch": 4.94876660341556, "grad_norm": 0.08969524756207933, "learning_rate": 5.120736095483026e-09, "loss": 0.3803, "step": 652 }, { "epoch": 4.956356736242884, "grad_norm": 0.0849566492970198, "learning_rate": 2.2759096872260187e-09, "loss": 0.3774, "step": 653 }, { "epoch": 4.963946869070209, "grad_norm": 0.08816588035901379, "learning_rate": 5.689814685538863e-10, "loss": 0.3843, "step": 654 }, { "epoch": 4.971537001897533, "grad_norm": 0.09034351763433966, "learning_rate": 0.0, "loss": 0.3759, "step": 655 }, { "epoch": 4.971537001897533, "step": 655, "total_flos": 1.573215812367627e+19, "train_loss": 0.4743207697649948, "train_runtime": 64212.961, "train_samples_per_second": 5.249, "train_steps_per_second": 0.01 } ], "logging_steps": 1.0, "max_steps": 655, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.573215812367627e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }