{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.992, "eval_steps": 500, "global_step": 390, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0128, "grad_norm": 5.839018847816655, "learning_rate": 1.0256410256410257e-06, "loss": 0.9618, "step": 1 }, { "epoch": 0.0256, "grad_norm": 6.044975028236581, "learning_rate": 2.0512820512820513e-06, "loss": 0.9669, "step": 2 }, { "epoch": 0.0384, "grad_norm": 6.219047217466672, "learning_rate": 3.0769230769230774e-06, "loss": 1.021, "step": 3 }, { "epoch": 0.0512, "grad_norm": 5.4838569963975985, "learning_rate": 4.102564102564103e-06, "loss": 0.9571, "step": 4 }, { "epoch": 0.064, "grad_norm": 4.047643058048521, "learning_rate": 5.128205128205128e-06, "loss": 0.8816, "step": 5 }, { "epoch": 0.0768, "grad_norm": 2.2364546199415556, "learning_rate": 6.153846153846155e-06, "loss": 0.8964, "step": 6 }, { "epoch": 0.0896, "grad_norm": 2.06808539264327, "learning_rate": 7.17948717948718e-06, "loss": 0.8921, "step": 7 }, { "epoch": 0.1024, "grad_norm": 4.395847794140312, "learning_rate": 8.205128205128205e-06, "loss": 0.8709, "step": 8 }, { "epoch": 0.1152, "grad_norm": 5.136863261262288, "learning_rate": 9.230769230769232e-06, "loss": 0.9305, "step": 9 }, { "epoch": 0.128, "grad_norm": 4.504381262936195, "learning_rate": 1.0256410256410256e-05, "loss": 0.813, "step": 10 }, { "epoch": 0.1408, "grad_norm": 3.9344562843621285, "learning_rate": 1.1282051282051283e-05, "loss": 0.8366, "step": 11 }, { "epoch": 0.1536, "grad_norm": 3.14056368073828, "learning_rate": 1.230769230769231e-05, "loss": 0.8205, "step": 12 }, { "epoch": 0.1664, "grad_norm": 2.2854369325531834, "learning_rate": 1.3333333333333333e-05, "loss": 0.7825, "step": 13 }, { "epoch": 0.1792, "grad_norm": 2.55230766379796, "learning_rate": 1.435897435897436e-05, "loss": 0.7745, "step": 14 }, { "epoch": 0.192, "grad_norm": 2.1163531503614808, "learning_rate": 1.5384615384615387e-05, "loss": 0.7378, "step": 15 }, { "epoch": 0.2048, "grad_norm": 1.7545006007454378, "learning_rate": 1.641025641025641e-05, "loss": 0.7281, "step": 16 }, { "epoch": 0.2176, "grad_norm": 1.4874110741930364, "learning_rate": 1.7435897435897438e-05, "loss": 0.7435, "step": 17 }, { "epoch": 0.2304, "grad_norm": 1.6189725301930469, "learning_rate": 1.8461538461538465e-05, "loss": 0.743, "step": 18 }, { "epoch": 0.2432, "grad_norm": 1.4046995785004956, "learning_rate": 1.9487179487179488e-05, "loss": 0.7486, "step": 19 }, { "epoch": 0.256, "grad_norm": 0.9358645050027536, "learning_rate": 2.0512820512820512e-05, "loss": 0.7467, "step": 20 }, { "epoch": 0.2688, "grad_norm": 1.0721376654895611, "learning_rate": 2.153846153846154e-05, "loss": 0.6933, "step": 21 }, { "epoch": 0.2816, "grad_norm": 1.1516256662003364, "learning_rate": 2.2564102564102566e-05, "loss": 0.7067, "step": 22 }, { "epoch": 0.2944, "grad_norm": 0.9223029393111457, "learning_rate": 2.3589743589743593e-05, "loss": 0.7046, "step": 23 }, { "epoch": 0.3072, "grad_norm": 1.1474983271538444, "learning_rate": 2.461538461538462e-05, "loss": 0.7125, "step": 24 }, { "epoch": 0.32, "grad_norm": 0.8955100949974366, "learning_rate": 2.5641025641025646e-05, "loss": 0.6728, "step": 25 }, { "epoch": 0.3328, "grad_norm": 1.2251687166354326, "learning_rate": 2.6666666666666667e-05, "loss": 0.7381, "step": 26 }, { "epoch": 0.3456, "grad_norm": 0.7816429553280688, "learning_rate": 2.7692307692307694e-05, "loss": 0.6553, "step": 27 }, { "epoch": 0.3584, "grad_norm": 0.9763360804698599, "learning_rate": 2.871794871794872e-05, "loss": 0.6798, "step": 28 }, { "epoch": 0.3712, "grad_norm": 0.8354114194674345, "learning_rate": 2.9743589743589747e-05, "loss": 0.6958, "step": 29 }, { "epoch": 0.384, "grad_norm": 0.6887105015637863, "learning_rate": 3.0769230769230774e-05, "loss": 0.6497, "step": 30 }, { "epoch": 0.3968, "grad_norm": 0.8779524514630032, "learning_rate": 3.1794871794871795e-05, "loss": 0.6844, "step": 31 }, { "epoch": 0.4096, "grad_norm": 0.7537997504917305, "learning_rate": 3.282051282051282e-05, "loss": 0.6672, "step": 32 }, { "epoch": 0.4224, "grad_norm": 0.8385727211139176, "learning_rate": 3.384615384615385e-05, "loss": 0.6744, "step": 33 }, { "epoch": 0.4352, "grad_norm": 0.7602171774049509, "learning_rate": 3.4871794871794875e-05, "loss": 0.6666, "step": 34 }, { "epoch": 0.448, "grad_norm": 0.7678255126965418, "learning_rate": 3.58974358974359e-05, "loss": 0.6642, "step": 35 }, { "epoch": 0.4608, "grad_norm": 0.647081056305973, "learning_rate": 3.692307692307693e-05, "loss": 0.6638, "step": 36 }, { "epoch": 0.4736, "grad_norm": 0.8379083769655634, "learning_rate": 3.794871794871795e-05, "loss": 0.6732, "step": 37 }, { "epoch": 0.4864, "grad_norm": 0.9149239688006771, "learning_rate": 3.8974358974358976e-05, "loss": 0.6624, "step": 38 }, { "epoch": 0.4992, "grad_norm": 0.9772104074784475, "learning_rate": 4e-05, "loss": 0.6962, "step": 39 }, { "epoch": 0.512, "grad_norm": 1.1116947880572285, "learning_rate": 3.9999198907597046e-05, "loss": 0.6245, "step": 40 }, { "epoch": 0.5248, "grad_norm": 0.7227423304768713, "learning_rate": 3.9996795694563096e-05, "loss": 0.6225, "step": 41 }, { "epoch": 0.5376, "grad_norm": 0.9725040218297323, "learning_rate": 3.999279055341771e-05, "loss": 0.6604, "step": 42 }, { "epoch": 0.5504, "grad_norm": 0.955643555394975, "learning_rate": 3.998718380500971e-05, "loss": 0.6133, "step": 43 }, { "epoch": 0.5632, "grad_norm": 0.7567970803940156, "learning_rate": 3.997997589849145e-05, "loss": 0.6309, "step": 44 }, { "epoch": 0.576, "grad_norm": 0.7996454927598492, "learning_rate": 3.9971167411282835e-05, "loss": 0.6659, "step": 45 }, { "epoch": 0.5888, "grad_norm": 0.7366005891361118, "learning_rate": 3.99607590490251e-05, "loss": 0.6569, "step": 46 }, { "epoch": 0.6016, "grad_norm": 0.7885369085723203, "learning_rate": 3.9948751645524235e-05, "loss": 0.6313, "step": 47 }, { "epoch": 0.6144, "grad_norm": 0.6751447378852333, "learning_rate": 3.9935146162684206e-05, "loss": 0.6326, "step": 48 }, { "epoch": 0.6272, "grad_norm": 0.8571310551925032, "learning_rate": 3.9919943690429906e-05, "loss": 0.6201, "step": 49 }, { "epoch": 0.64, "grad_norm": 0.7550362899021348, "learning_rate": 3.9903145446619837e-05, "loss": 0.6329, "step": 50 }, { "epoch": 0.6528, "grad_norm": 0.8280593896023649, "learning_rate": 3.9884752776948564e-05, "loss": 0.6422, "step": 51 }, { "epoch": 0.6656, "grad_norm": 0.8811902923700886, "learning_rate": 3.9864767154838864e-05, "loss": 0.6354, "step": 52 }, { "epoch": 0.6784, "grad_norm": 1.1095721931863598, "learning_rate": 3.9843190181323744e-05, "loss": 0.6667, "step": 53 }, { "epoch": 0.6912, "grad_norm": 0.6445072194387576, "learning_rate": 3.982002358491817e-05, "loss": 0.6364, "step": 54 }, { "epoch": 0.704, "grad_norm": 0.9734320449566246, "learning_rate": 3.979526922148058e-05, "loss": 0.6218, "step": 55 }, { "epoch": 0.7168, "grad_norm": 0.6247852399111969, "learning_rate": 3.9768929074064206e-05, "loss": 0.6401, "step": 56 }, { "epoch": 0.7296, "grad_norm": 0.6158328565284048, "learning_rate": 3.9741005252758255e-05, "loss": 0.6168, "step": 57 }, { "epoch": 0.7424, "grad_norm": 0.6280970629663458, "learning_rate": 3.971149999451886e-05, "loss": 0.6423, "step": 58 }, { "epoch": 0.7552, "grad_norm": 0.876342261245329, "learning_rate": 3.9680415662989806e-05, "loss": 0.6644, "step": 59 }, { "epoch": 0.768, "grad_norm": 0.6782228770929252, "learning_rate": 3.9647754748313294e-05, "loss": 0.6425, "step": 60 }, { "epoch": 0.7808, "grad_norm": 0.6213554461131593, "learning_rate": 3.96135198669304e-05, "loss": 0.6114, "step": 61 }, { "epoch": 0.7936, "grad_norm": 0.8970158114530625, "learning_rate": 3.957771376137144e-05, "loss": 0.6523, "step": 62 }, { "epoch": 0.8064, "grad_norm": 0.6928394207466625, "learning_rate": 3.954033930003634e-05, "loss": 0.6168, "step": 63 }, { "epoch": 0.8192, "grad_norm": 0.6361855672357039, "learning_rate": 3.9501399476964806e-05, "loss": 0.6177, "step": 64 }, { "epoch": 0.832, "grad_norm": 0.7475080798399086, "learning_rate": 3.946089741159648e-05, "loss": 0.6338, "step": 65 }, { "epoch": 0.8448, "grad_norm": 0.7201665058372186, "learning_rate": 3.9418836348521045e-05, "loss": 0.6004, "step": 66 }, { "epoch": 0.8576, "grad_norm": 0.6665201343517307, "learning_rate": 3.937521965721831e-05, "loss": 0.6324, "step": 67 }, { "epoch": 0.8704, "grad_norm": 0.7733661737414401, "learning_rate": 3.933005083178828e-05, "loss": 0.6246, "step": 68 }, { "epoch": 0.8832, "grad_norm": 0.6768671192755346, "learning_rate": 3.928333349067125e-05, "loss": 0.6219, "step": 69 }, { "epoch": 0.896, "grad_norm": 0.7097685593783307, "learning_rate": 3.923507137635792e-05, "loss": 0.623, "step": 70 }, { "epoch": 0.9088, "grad_norm": 0.6536798448840015, "learning_rate": 3.9185268355089606e-05, "loss": 0.6269, "step": 71 }, { "epoch": 0.9216, "grad_norm": 0.6614424957485971, "learning_rate": 3.913392841654851e-05, "loss": 0.6351, "step": 72 }, { "epoch": 0.9344, "grad_norm": 0.6977238316575177, "learning_rate": 3.9081055673538093e-05, "loss": 0.6239, "step": 73 }, { "epoch": 0.9472, "grad_norm": 0.7934882507910082, "learning_rate": 3.902665436165364e-05, "loss": 0.6573, "step": 74 }, { "epoch": 0.96, "grad_norm": 0.6575515131086959, "learning_rate": 3.897072883894291e-05, "loss": 0.5857, "step": 75 }, { "epoch": 0.9728, "grad_norm": 0.7723302740017406, "learning_rate": 3.8913283585557054e-05, "loss": 0.6344, "step": 76 }, { "epoch": 0.9856, "grad_norm": 0.6941667379950092, "learning_rate": 3.885432320339167e-05, "loss": 0.5906, "step": 77 }, { "epoch": 0.9984, "grad_norm": 0.8021010313212673, "learning_rate": 3.879385241571817e-05, "loss": 0.6364, "step": 78 }, { "epoch": 1.0112, "grad_norm": 1.626971688308395, "learning_rate": 3.873187606680543e-05, "loss": 1.068, "step": 79 }, { "epoch": 1.024, "grad_norm": 0.9308988989110186, "learning_rate": 3.866839912153168e-05, "loss": 0.5233, "step": 80 }, { "epoch": 1.0368, "grad_norm": 0.9488953391338323, "learning_rate": 3.860342666498677e-05, "loss": 0.5753, "step": 81 }, { "epoch": 1.0496, "grad_norm": 0.7366798820619407, "learning_rate": 3.853696390206484e-05, "loss": 0.4947, "step": 82 }, { "epoch": 1.0624, "grad_norm": 0.820021588310417, "learning_rate": 3.846901615704734e-05, "loss": 0.5649, "step": 83 }, { "epoch": 1.0752, "grad_norm": 0.6449418020815564, "learning_rate": 3.839958887317649e-05, "loss": 0.4996, "step": 84 }, { "epoch": 1.088, "grad_norm": 0.9216006193216165, "learning_rate": 3.832868761221926e-05, "loss": 0.5333, "step": 85 }, { "epoch": 1.1008, "grad_norm": 0.6725609871315669, "learning_rate": 3.825631805402182e-05, "loss": 0.4864, "step": 86 }, { "epoch": 1.1136, "grad_norm": 0.9531128581935571, "learning_rate": 3.818248599605448e-05, "loss": 0.5237, "step": 87 }, { "epoch": 1.1264, "grad_norm": 0.6641456185314702, "learning_rate": 3.810719735294731e-05, "loss": 0.5167, "step": 88 }, { "epoch": 1.1392, "grad_norm": 0.8823519514010172, "learning_rate": 3.8030458156016326e-05, "loss": 0.4995, "step": 89 }, { "epoch": 1.152, "grad_norm": 0.5548207275448888, "learning_rate": 3.795227455278029e-05, "loss": 0.5106, "step": 90 }, { "epoch": 1.1648, "grad_norm": 0.8288761814178127, "learning_rate": 3.787265280646825e-05, "loss": 0.5217, "step": 91 }, { "epoch": 1.1776, "grad_norm": 0.5061675211682982, "learning_rate": 3.7791599295517825e-05, "loss": 0.5033, "step": 92 }, { "epoch": 1.1904, "grad_norm": 0.7190426537289591, "learning_rate": 3.7709120513064196e-05, "loss": 0.5028, "step": 93 }, { "epoch": 1.2032, "grad_norm": 0.6128890949411272, "learning_rate": 3.762522306641998e-05, "loss": 0.4779, "step": 94 }, { "epoch": 1.216, "grad_norm": 0.62309405530766, "learning_rate": 3.7539913676545874e-05, "loss": 0.5451, "step": 95 }, { "epoch": 1.2288000000000001, "grad_norm": 0.6334625248101861, "learning_rate": 3.745319917751229e-05, "loss": 0.5274, "step": 96 }, { "epoch": 1.2416, "grad_norm": 0.5653360526439104, "learning_rate": 3.736508651595188e-05, "loss": 0.539, "step": 97 }, { "epoch": 1.2544, "grad_norm": 0.6224327796221331, "learning_rate": 3.727558275050301e-05, "loss": 0.4945, "step": 98 }, { "epoch": 1.2671999999999999, "grad_norm": 0.7081659483273709, "learning_rate": 3.718469505124434e-05, "loss": 0.5272, "step": 99 }, { "epoch": 1.28, "grad_norm": 0.5845719830086954, "learning_rate": 3.709243069912041e-05, "loss": 0.5066, "step": 100 }, { "epoch": 1.2928, "grad_norm": 0.6882406051818343, "learning_rate": 3.699879708535838e-05, "loss": 0.5123, "step": 101 }, { "epoch": 1.3056, "grad_norm": 0.6051334323383424, "learning_rate": 3.69038017108759e-05, "loss": 0.5259, "step": 102 }, { "epoch": 1.3184, "grad_norm": 0.5879222709126353, "learning_rate": 3.680745218568026e-05, "loss": 0.473, "step": 103 }, { "epoch": 1.3312, "grad_norm": 0.5592006776867556, "learning_rate": 3.6709756228258735e-05, "loss": 0.5262, "step": 104 }, { "epoch": 1.3439999999999999, "grad_norm": 0.5242844070496, "learning_rate": 3.6610721664960236e-05, "loss": 0.4905, "step": 105 }, { "epoch": 1.3568, "grad_norm": 0.5941812532382632, "learning_rate": 3.65103564293684e-05, "loss": 0.5479, "step": 106 }, { "epoch": 1.3696, "grad_norm": 0.5387721208985127, "learning_rate": 3.640866856166601e-05, "loss": 0.4948, "step": 107 }, { "epoch": 1.3824, "grad_norm": 0.5854840592046221, "learning_rate": 3.6305666207990886e-05, "loss": 0.4785, "step": 108 }, { "epoch": 1.3952, "grad_norm": 0.522507074880471, "learning_rate": 3.6201357619783336e-05, "loss": 0.4754, "step": 109 }, { "epoch": 1.408, "grad_norm": 0.5625209088620615, "learning_rate": 3.609575115312511e-05, "loss": 0.4869, "step": 110 }, { "epoch": 1.4208, "grad_norm": 0.43969509643876364, "learning_rate": 3.598885526807003e-05, "loss": 0.4972, "step": 111 }, { "epoch": 1.4336, "grad_norm": 0.5619860822540647, "learning_rate": 3.5880678527966224e-05, "loss": 0.4735, "step": 112 }, { "epoch": 1.4464000000000001, "grad_norm": 0.4421063815613501, "learning_rate": 3.577122959877017e-05, "loss": 0.5055, "step": 113 }, { "epoch": 1.4592, "grad_norm": 0.5369412032192468, "learning_rate": 3.566051724835245e-05, "loss": 0.4923, "step": 114 }, { "epoch": 1.472, "grad_norm": 0.48910911632729354, "learning_rate": 3.554855034579532e-05, "loss": 0.512, "step": 115 }, { "epoch": 1.4848, "grad_norm": 0.5136909704915422, "learning_rate": 3.5435337860682304e-05, "loss": 0.4981, "step": 116 }, { "epoch": 1.4976, "grad_norm": 0.48631677940560963, "learning_rate": 3.532088886237956e-05, "loss": 0.4915, "step": 117 }, { "epoch": 1.5104, "grad_norm": 0.4840473244318828, "learning_rate": 3.520521251930941e-05, "loss": 0.4736, "step": 118 }, { "epoch": 1.5232, "grad_norm": 0.5064523309799703, "learning_rate": 3.5088318098215805e-05, "loss": 0.5403, "step": 119 }, { "epoch": 1.536, "grad_norm": 0.589977688148516, "learning_rate": 3.497021496342203e-05, "loss": 0.4694, "step": 120 }, { "epoch": 1.5488, "grad_norm": 0.45092131277939423, "learning_rate": 3.485091257608047e-05, "loss": 0.4979, "step": 121 }, { "epoch": 1.5615999999999999, "grad_norm": 0.5694759979218573, "learning_rate": 3.473042049341474e-05, "loss": 0.5191, "step": 122 }, { "epoch": 1.5744, "grad_norm": 0.5065392756850746, "learning_rate": 3.4608748367954064e-05, "loss": 0.4805, "step": 123 }, { "epoch": 1.5872000000000002, "grad_norm": 0.5466527945164157, "learning_rate": 3.4485905946759965e-05, "loss": 0.5199, "step": 124 }, { "epoch": 1.6, "grad_norm": 0.7191169252596367, "learning_rate": 3.4361903070645484e-05, "loss": 0.5183, "step": 125 }, { "epoch": 1.6128, "grad_norm": 0.519742080744315, "learning_rate": 3.423674967338681e-05, "loss": 0.5178, "step": 126 }, { "epoch": 1.6256, "grad_norm": 0.738483275841055, "learning_rate": 3.411045578092754e-05, "loss": 0.4837, "step": 127 }, { "epoch": 1.6383999999999999, "grad_norm": 0.5192697289694931, "learning_rate": 3.398303151057543e-05, "loss": 0.5115, "step": 128 }, { "epoch": 1.6512, "grad_norm": 0.7555019608159453, "learning_rate": 3.385448707019199e-05, "loss": 0.5102, "step": 129 }, { "epoch": 1.6640000000000001, "grad_norm": 0.525986636066217, "learning_rate": 3.372483275737468e-05, "loss": 0.5035, "step": 130 }, { "epoch": 1.6768, "grad_norm": 0.6237338996072936, "learning_rate": 3.359407895863199e-05, "loss": 0.4708, "step": 131 }, { "epoch": 1.6896, "grad_norm": 0.5347925953394986, "learning_rate": 3.34622361485514e-05, "loss": 0.5117, "step": 132 }, { "epoch": 1.7024, "grad_norm": 0.47804164439498226, "learning_rate": 3.332931488896029e-05, "loss": 0.5006, "step": 133 }, { "epoch": 1.7151999999999998, "grad_norm": 0.5097745208171317, "learning_rate": 3.319532582807977e-05, "loss": 0.4956, "step": 134 }, { "epoch": 1.728, "grad_norm": 0.472002785073484, "learning_rate": 3.30602796996717e-05, "loss": 0.5138, "step": 135 }, { "epoch": 1.7408000000000001, "grad_norm": 0.40000804145024127, "learning_rate": 3.2924187322178865e-05, "loss": 0.452, "step": 136 }, { "epoch": 1.7536, "grad_norm": 0.452939045182372, "learning_rate": 3.278705959785821e-05, "loss": 0.5132, "step": 137 }, { "epoch": 1.7664, "grad_norm": 0.4492202701835303, "learning_rate": 3.2648907511907544e-05, "loss": 0.51, "step": 138 }, { "epoch": 1.7792, "grad_norm": 0.5217146312190998, "learning_rate": 3.250974213158555e-05, "loss": 0.5055, "step": 139 }, { "epoch": 1.792, "grad_norm": 0.45917544834665075, "learning_rate": 3.23695746053251e-05, "loss": 0.5128, "step": 140 }, { "epoch": 1.8048, "grad_norm": 0.48608619608415743, "learning_rate": 3.222841616184025e-05, "loss": 0.4807, "step": 141 }, { "epoch": 1.8176, "grad_norm": 0.4153980294330535, "learning_rate": 3.208627810922665e-05, "loss": 0.4887, "step": 142 }, { "epoch": 1.8304, "grad_norm": 0.4635350891032439, "learning_rate": 3.194317183405573e-05, "loss": 0.5051, "step": 143 }, { "epoch": 1.8432, "grad_norm": 0.5452295259370951, "learning_rate": 3.1799108800462466e-05, "loss": 0.5138, "step": 144 }, { "epoch": 1.8559999999999999, "grad_norm": 0.45699342832042916, "learning_rate": 3.1654100549227024e-05, "loss": 0.472, "step": 145 }, { "epoch": 1.8688, "grad_norm": 0.5082490486526375, "learning_rate": 3.1508158696850275e-05, "loss": 0.4954, "step": 146 }, { "epoch": 1.8816000000000002, "grad_norm": 0.6384690552060961, "learning_rate": 3.136129493462312e-05, "loss": 0.4987, "step": 147 }, { "epoch": 1.8944, "grad_norm": 0.4371303247604508, "learning_rate": 3.121352102768998e-05, "loss": 0.4848, "step": 148 }, { "epoch": 1.9072, "grad_norm": 0.41511391912882867, "learning_rate": 3.106484881410628e-05, "loss": 0.4665, "step": 149 }, { "epoch": 1.92, "grad_norm": 0.4249372081891107, "learning_rate": 3.091529020389009e-05, "loss": 0.4994, "step": 150 }, { "epoch": 1.9327999999999999, "grad_norm": 0.44138936645964416, "learning_rate": 3.076485717806808e-05, "loss": 0.4927, "step": 151 }, { "epoch": 1.9456, "grad_norm": 0.43910408608963586, "learning_rate": 3.061356178771564e-05, "loss": 0.5066, "step": 152 }, { "epoch": 1.9584000000000001, "grad_norm": 0.43088729689416577, "learning_rate": 3.0461416152991555e-05, "loss": 0.4687, "step": 153 }, { "epoch": 1.9712, "grad_norm": 0.4278640166834245, "learning_rate": 3.0308432462167045e-05, "loss": 0.5163, "step": 154 }, { "epoch": 1.984, "grad_norm": 0.4742909576609465, "learning_rate": 3.015462297064936e-05, "loss": 0.471, "step": 155 }, { "epoch": 1.9968, "grad_norm": 0.4587718963346401, "learning_rate": 3.0000000000000004e-05, "loss": 0.5087, "step": 156 }, { "epoch": 2.0096, "grad_norm": 0.9966628173120351, "learning_rate": 2.98445759369477e-05, "loss": 0.8075, "step": 157 }, { "epoch": 2.0224, "grad_norm": 0.6863182224101623, "learning_rate": 2.9688363232396056e-05, "loss": 0.4051, "step": 158 }, { "epoch": 2.0352, "grad_norm": 2.0900892617168663, "learning_rate": 2.9531374400426158e-05, "loss": 0.3823, "step": 159 }, { "epoch": 2.048, "grad_norm": 0.7960735194376195, "learning_rate": 2.9373622017294075e-05, "loss": 0.3444, "step": 160 }, { "epoch": 2.0608, "grad_norm": 0.655937861814063, "learning_rate": 2.9215118720423375e-05, "loss": 0.3576, "step": 161 }, { "epoch": 2.0736, "grad_norm": 0.5911864199285094, "learning_rate": 2.9055877207392752e-05, "loss": 0.3585, "step": 162 }, { "epoch": 2.0864, "grad_norm": 0.6532713894850588, "learning_rate": 2.8895910234918828e-05, "loss": 0.3538, "step": 163 }, { "epoch": 2.0992, "grad_norm": 0.6592758774119294, "learning_rate": 2.873523061783426e-05, "loss": 0.3908, "step": 164 }, { "epoch": 2.112, "grad_norm": 0.6444342767269957, "learning_rate": 2.8573851228061084e-05, "loss": 0.4022, "step": 165 }, { "epoch": 2.1248, "grad_norm": 0.6495217170344411, "learning_rate": 2.8411784993579633e-05, "loss": 0.3443, "step": 166 }, { "epoch": 2.1376, "grad_norm": 0.5590722491328468, "learning_rate": 2.8249044897392814e-05, "loss": 0.3723, "step": 167 }, { "epoch": 2.1504, "grad_norm": 0.5648244135782439, "learning_rate": 2.80856439764861e-05, "loss": 0.3498, "step": 168 }, { "epoch": 2.1632, "grad_norm": 0.4820229598242042, "learning_rate": 2.792159532078314e-05, "loss": 0.3497, "step": 169 }, { "epoch": 2.176, "grad_norm": 0.5561337750606294, "learning_rate": 2.77569120720971e-05, "loss": 0.3677, "step": 170 }, { "epoch": 2.1888, "grad_norm": 0.5455054065895131, "learning_rate": 2.7591607423077932e-05, "loss": 0.3433, "step": 171 }, { "epoch": 2.2016, "grad_norm": 0.5916420617982933, "learning_rate": 2.7425694616155474e-05, "loss": 0.3971, "step": 172 }, { "epoch": 2.2144, "grad_norm": 0.6508258578592079, "learning_rate": 2.7259186942478656e-05, "loss": 0.3756, "step": 173 }, { "epoch": 2.2272, "grad_norm": 0.47618701005958114, "learning_rate": 2.7092097740850712e-05, "loss": 0.3403, "step": 174 }, { "epoch": 2.24, "grad_norm": 0.5593921807214691, "learning_rate": 2.692444039666066e-05, "loss": 0.3595, "step": 175 }, { "epoch": 2.2528, "grad_norm": 0.4198057083178645, "learning_rate": 2.6756228340810946e-05, "loss": 0.3314, "step": 176 }, { "epoch": 2.2656, "grad_norm": 0.5110303368655525, "learning_rate": 2.6587475048641596e-05, "loss": 0.3606, "step": 177 }, { "epoch": 2.2784, "grad_norm": 0.5011779517833854, "learning_rate": 2.6418194038850634e-05, "loss": 0.3799, "step": 178 }, { "epoch": 2.2912, "grad_norm": 0.5246653684584236, "learning_rate": 2.624839887241115e-05, "loss": 0.3464, "step": 179 }, { "epoch": 2.304, "grad_norm": 0.4855556982779856, "learning_rate": 2.607810315148494e-05, "loss": 0.366, "step": 180 }, { "epoch": 2.3168, "grad_norm": 0.5305125098314, "learning_rate": 2.5907320518332827e-05, "loss": 0.3433, "step": 181 }, { "epoch": 2.3296, "grad_norm": 0.4970558826142017, "learning_rate": 2.5736064654221808e-05, "loss": 0.36, "step": 182 }, { "epoch": 2.3424, "grad_norm": 0.499133817248476, "learning_rate": 2.5564349278329056e-05, "loss": 0.3337, "step": 183 }, { "epoch": 2.3552, "grad_norm": 0.4862098543776659, "learning_rate": 2.539218814664288e-05, "loss": 0.3614, "step": 184 }, { "epoch": 2.368, "grad_norm": 0.49489877938395055, "learning_rate": 2.521959505086075e-05, "loss": 0.364, "step": 185 }, { "epoch": 2.3808, "grad_norm": 0.47745055778943635, "learning_rate": 2.5046583817284437e-05, "loss": 0.3369, "step": 186 }, { "epoch": 2.3936, "grad_norm": 0.4518599993209865, "learning_rate": 2.487316830571244e-05, "loss": 0.3397, "step": 187 }, { "epoch": 2.4064, "grad_norm": 0.49190428712388085, "learning_rate": 2.4699362408329646e-05, "loss": 0.3609, "step": 188 }, { "epoch": 2.4192, "grad_norm": 0.42695016108008466, "learning_rate": 2.4525180048594452e-05, "loss": 0.3403, "step": 189 }, { "epoch": 2.432, "grad_norm": 0.48399579137589493, "learning_rate": 2.435063518012335e-05, "loss": 0.3559, "step": 190 }, { "epoch": 2.4448, "grad_norm": 0.43535579894833265, "learning_rate": 2.4175741785573177e-05, "loss": 0.3699, "step": 191 }, { "epoch": 2.4576000000000002, "grad_norm": 0.42236836330089367, "learning_rate": 2.4000513875520892e-05, "loss": 0.3389, "step": 192 }, { "epoch": 2.4704, "grad_norm": 0.4254052985368455, "learning_rate": 2.3824965487341247e-05, "loss": 0.3748, "step": 193 }, { "epoch": 2.4832, "grad_norm": 0.4124026511474054, "learning_rate": 2.3649110684082258e-05, "loss": 0.3649, "step": 194 }, { "epoch": 2.496, "grad_norm": 0.4358743463877152, "learning_rate": 2.3472963553338614e-05, "loss": 0.3911, "step": 195 }, { "epoch": 2.5088, "grad_norm": 0.3787171361989274, "learning_rate": 2.3296538206123134e-05, "loss": 0.3594, "step": 196 }, { "epoch": 2.5216, "grad_norm": 0.402330181025512, "learning_rate": 2.311984877573636e-05, "loss": 0.3528, "step": 197 }, { "epoch": 2.5343999999999998, "grad_norm": 0.3742935305188853, "learning_rate": 2.2942909416634326e-05, "loss": 0.328, "step": 198 }, { "epoch": 2.5472, "grad_norm": 0.41761460975640535, "learning_rate": 2.2765734303294666e-05, "loss": 0.393, "step": 199 }, { "epoch": 2.56, "grad_norm": 0.3555221908620572, "learning_rate": 2.2588337629081107e-05, "loss": 0.3705, "step": 200 }, { "epoch": 2.5728, "grad_norm": 0.4632160156775803, "learning_rate": 2.2410733605106462e-05, "loss": 0.3333, "step": 201 }, { "epoch": 2.5856, "grad_norm": 0.3854892191886437, "learning_rate": 2.2232936459094158e-05, "loss": 0.3494, "step": 202 }, { "epoch": 2.5984, "grad_norm": 0.3637067298468189, "learning_rate": 2.205496043423849e-05, "loss": 0.3405, "step": 203 }, { "epoch": 2.6112, "grad_norm": 0.40494042381970147, "learning_rate": 2.1876819788063586e-05, "loss": 0.3888, "step": 204 }, { "epoch": 2.624, "grad_norm": 0.3755956967947316, "learning_rate": 2.16985287912813e-05, "loss": 0.3255, "step": 205 }, { "epoch": 2.6368, "grad_norm": 0.4304305110638947, "learning_rate": 2.1520101726647922e-05, "loss": 0.3865, "step": 206 }, { "epoch": 2.6496, "grad_norm": 0.4062101646966682, "learning_rate": 2.1341552887820048e-05, "loss": 0.3513, "step": 207 }, { "epoch": 2.6624, "grad_norm": 0.4000767842561131, "learning_rate": 2.1162896578209517e-05, "loss": 0.3609, "step": 208 }, { "epoch": 2.6752000000000002, "grad_norm": 0.3945734513213183, "learning_rate": 2.0984147109837564e-05, "loss": 0.3219, "step": 209 }, { "epoch": 2.6879999999999997, "grad_norm": 0.42058336216796155, "learning_rate": 2.0805318802188307e-05, "loss": 0.3744, "step": 210 }, { "epoch": 2.7008, "grad_norm": 0.4132723550394805, "learning_rate": 2.0626425981061608e-05, "loss": 0.3658, "step": 211 }, { "epoch": 2.7136, "grad_norm": 0.39649934238958967, "learning_rate": 2.0447482977425465e-05, "loss": 0.3405, "step": 212 }, { "epoch": 2.7264, "grad_norm": 0.4208572250100482, "learning_rate": 2.0268504126267952e-05, "loss": 0.368, "step": 213 }, { "epoch": 2.7392, "grad_norm": 0.3978701788145883, "learning_rate": 2.008950376544887e-05, "loss": 0.3465, "step": 214 }, { "epoch": 2.752, "grad_norm": 0.4001369867888692, "learning_rate": 1.9910496234551132e-05, "loss": 0.3532, "step": 215 }, { "epoch": 2.7648, "grad_norm": 0.4034806906167456, "learning_rate": 1.9731495873732055e-05, "loss": 0.353, "step": 216 }, { "epoch": 2.7776, "grad_norm": 0.4307848928295819, "learning_rate": 1.9552517022574542e-05, "loss": 0.3569, "step": 217 }, { "epoch": 2.7904, "grad_norm": 0.3999064020664724, "learning_rate": 1.93735740189384e-05, "loss": 0.3568, "step": 218 }, { "epoch": 2.8032, "grad_norm": 0.4090043678053143, "learning_rate": 1.9194681197811703e-05, "loss": 0.365, "step": 219 }, { "epoch": 2.816, "grad_norm": 0.42532758658903885, "learning_rate": 1.901585289016244e-05, "loss": 0.3427, "step": 220 }, { "epoch": 2.8288, "grad_norm": 0.39419334493438113, "learning_rate": 1.8837103421790486e-05, "loss": 0.3772, "step": 221 }, { "epoch": 2.8416, "grad_norm": 0.4016749191362796, "learning_rate": 1.8658447112179952e-05, "loss": 0.3534, "step": 222 }, { "epoch": 2.8544, "grad_norm": 0.40847223940384836, "learning_rate": 1.8479898273352084e-05, "loss": 0.369, "step": 223 }, { "epoch": 2.8672, "grad_norm": 0.4001323176748629, "learning_rate": 1.83014712087187e-05, "loss": 0.3387, "step": 224 }, { "epoch": 2.88, "grad_norm": 0.4248079947067711, "learning_rate": 1.8123180211936417e-05, "loss": 0.3771, "step": 225 }, { "epoch": 2.8928000000000003, "grad_norm": 0.3945282895021062, "learning_rate": 1.794503956576152e-05, "loss": 0.3534, "step": 226 }, { "epoch": 2.9055999999999997, "grad_norm": 0.3497014179956054, "learning_rate": 1.776706354090585e-05, "loss": 0.3563, "step": 227 }, { "epoch": 2.9184, "grad_norm": 0.3986345065122307, "learning_rate": 1.758926639489354e-05, "loss": 0.376, "step": 228 }, { "epoch": 2.9312, "grad_norm": 0.370859934109157, "learning_rate": 1.7411662370918893e-05, "loss": 0.339, "step": 229 }, { "epoch": 2.944, "grad_norm": 0.3747895589693441, "learning_rate": 1.7234265696705344e-05, "loss": 0.3758, "step": 230 }, { "epoch": 2.9568, "grad_norm": 0.3685623113569053, "learning_rate": 1.7057090583365678e-05, "loss": 0.35, "step": 231 }, { "epoch": 2.9696, "grad_norm": 0.36104623960562526, "learning_rate": 1.6880151224263646e-05, "loss": 0.3553, "step": 232 }, { "epoch": 2.9824, "grad_norm": 0.40586050975133925, "learning_rate": 1.6703461793876876e-05, "loss": 0.3772, "step": 233 }, { "epoch": 2.9952, "grad_norm": 0.41959521563610697, "learning_rate": 1.6527036446661396e-05, "loss": 0.3486, "step": 234 }, { "epoch": 3.008, "grad_norm": 1.1794707358197836, "learning_rate": 1.635088931591775e-05, "loss": 0.6203, "step": 235 }, { "epoch": 3.0208, "grad_norm": 0.6207295094733497, "learning_rate": 1.6175034512658753e-05, "loss": 0.2424, "step": 236 }, { "epoch": 3.0336, "grad_norm": 0.5155664642605005, "learning_rate": 1.5999486124479115e-05, "loss": 0.2496, "step": 237 }, { "epoch": 3.0464, "grad_norm": 0.841858506357091, "learning_rate": 1.5824258214426833e-05, "loss": 0.2474, "step": 238 }, { "epoch": 3.0592, "grad_norm": 0.6482412245876622, "learning_rate": 1.5649364819876655e-05, "loss": 0.2404, "step": 239 }, { "epoch": 3.072, "grad_norm": 0.4828258709239939, "learning_rate": 1.547481995140556e-05, "loss": 0.24, "step": 240 }, { "epoch": 3.0848, "grad_norm": 0.5795204158855689, "learning_rate": 1.5300637591670357e-05, "loss": 0.2274, "step": 241 }, { "epoch": 3.0976, "grad_norm": 0.5853029607020683, "learning_rate": 1.5126831694287564e-05, "loss": 0.26, "step": 242 }, { "epoch": 3.1104, "grad_norm": 0.4828643470497328, "learning_rate": 1.4953416182715566e-05, "loss": 0.2544, "step": 243 }, { "epoch": 3.1232, "grad_norm": 0.4329500194495083, "learning_rate": 1.478040494913926e-05, "loss": 0.2206, "step": 244 }, { "epoch": 3.136, "grad_norm": 0.5165455521545157, "learning_rate": 1.460781185335713e-05, "loss": 0.2417, "step": 245 }, { "epoch": 3.1488, "grad_norm": 0.5275383950137368, "learning_rate": 1.443565072167095e-05, "loss": 0.2449, "step": 246 }, { "epoch": 3.1616, "grad_norm": 0.39340503477267014, "learning_rate": 1.4263935345778202e-05, "loss": 0.2199, "step": 247 }, { "epoch": 3.1744, "grad_norm": 0.4317866921063053, "learning_rate": 1.409267948166718e-05, "loss": 0.2351, "step": 248 }, { "epoch": 3.1872, "grad_norm": 0.4443169086208162, "learning_rate": 1.3921896848515064e-05, "loss": 0.2556, "step": 249 }, { "epoch": 3.2, "grad_norm": 0.4005458806639983, "learning_rate": 1.3751601127588849e-05, "loss": 0.2365, "step": 250 }, { "epoch": 3.2128, "grad_norm": 0.4181384968270446, "learning_rate": 1.3581805961149371e-05, "loss": 0.245, "step": 251 }, { "epoch": 3.2256, "grad_norm": 0.37836401384344864, "learning_rate": 1.341252495135841e-05, "loss": 0.2344, "step": 252 }, { "epoch": 3.2384, "grad_norm": 0.3713195178763742, "learning_rate": 1.324377165918906e-05, "loss": 0.2465, "step": 253 }, { "epoch": 3.2512, "grad_norm": 0.36043895430414663, "learning_rate": 1.3075559603339354e-05, "loss": 0.2338, "step": 254 }, { "epoch": 3.2640000000000002, "grad_norm": 0.34938091694810997, "learning_rate": 1.2907902259149287e-05, "loss": 0.228, "step": 255 }, { "epoch": 3.2768, "grad_norm": 0.3625745323291279, "learning_rate": 1.274081305752135e-05, "loss": 0.2532, "step": 256 }, { "epoch": 3.2896, "grad_norm": 0.32852523370479425, "learning_rate": 1.2574305383844528e-05, "loss": 0.2086, "step": 257 }, { "epoch": 3.3024, "grad_norm": 0.3579171159966551, "learning_rate": 1.2408392576922075e-05, "loss": 0.2375, "step": 258 }, { "epoch": 3.3152, "grad_norm": 0.33573195229266234, "learning_rate": 1.2243087927902905e-05, "loss": 0.2216, "step": 259 }, { "epoch": 3.328, "grad_norm": 0.3678272441344075, "learning_rate": 1.2078404679216864e-05, "loss": 0.2438, "step": 260 }, { "epoch": 3.3407999999999998, "grad_norm": 0.33596560147428767, "learning_rate": 1.1914356023513904e-05, "loss": 0.2449, "step": 261 }, { "epoch": 3.3536, "grad_norm": 0.35108960586375987, "learning_rate": 1.1750955102607193e-05, "loss": 0.2298, "step": 262 }, { "epoch": 3.3664, "grad_norm": 0.3505624560113376, "learning_rate": 1.1588215006420374e-05, "loss": 0.2163, "step": 263 }, { "epoch": 3.3792, "grad_norm": 0.3504767757115922, "learning_rate": 1.1426148771938915e-05, "loss": 0.2189, "step": 264 }, { "epoch": 3.392, "grad_norm": 0.34430891718788276, "learning_rate": 1.1264769382165748e-05, "loss": 0.2372, "step": 265 }, { "epoch": 3.4048, "grad_norm": 0.33778027498109425, "learning_rate": 1.110408976508118e-05, "loss": 0.2133, "step": 266 }, { "epoch": 3.4176, "grad_norm": 0.33636617129539925, "learning_rate": 1.094412279260726e-05, "loss": 0.2301, "step": 267 }, { "epoch": 3.4304, "grad_norm": 0.3504754683249517, "learning_rate": 1.0784881279576635e-05, "loss": 0.2486, "step": 268 }, { "epoch": 3.4432, "grad_norm": 0.34795897812025656, "learning_rate": 1.0626377982705929e-05, "loss": 0.2141, "step": 269 }, { "epoch": 3.456, "grad_norm": 0.36580061553563076, "learning_rate": 1.0468625599573842e-05, "loss": 0.2304, "step": 270 }, { "epoch": 3.4688, "grad_norm": 0.3701212255172201, "learning_rate": 1.0311636767603952e-05, "loss": 0.2431, "step": 271 }, { "epoch": 3.4816, "grad_norm": 0.3405446548694439, "learning_rate": 1.0155424063052306e-05, "loss": 0.2231, "step": 272 }, { "epoch": 3.4944, "grad_norm": 0.3553166437308296, "learning_rate": 1.0000000000000006e-05, "loss": 0.219, "step": 273 }, { "epoch": 3.5072, "grad_norm": 0.37282631650361, "learning_rate": 9.84537702935065e-06, "loss": 0.2347, "step": 274 }, { "epoch": 3.52, "grad_norm": 0.32702491987456384, "learning_rate": 9.691567537832964e-06, "loss": 0.2257, "step": 275 }, { "epoch": 3.5328, "grad_norm": 0.34275469394870306, "learning_rate": 9.538583847008452e-06, "loss": 0.2367, "step": 276 }, { "epoch": 3.5456, "grad_norm": 0.35933691031419707, "learning_rate": 9.386438212284372e-06, "loss": 0.2415, "step": 277 }, { "epoch": 3.5584, "grad_norm": 0.3571317614246153, "learning_rate": 9.235142821931928e-06, "loss": 0.2333, "step": 278 }, { "epoch": 3.5712, "grad_norm": 0.34359614641158914, "learning_rate": 9.084709796109907e-06, "loss": 0.2235, "step": 279 }, { "epoch": 3.584, "grad_norm": 0.3521601969192374, "learning_rate": 8.93515118589373e-06, "loss": 0.2573, "step": 280 }, { "epoch": 3.5968, "grad_norm": 0.3283557605845052, "learning_rate": 8.786478972310023e-06, "loss": 0.2111, "step": 281 }, { "epoch": 3.6096, "grad_norm": 0.3608233567186079, "learning_rate": 8.638705065376887e-06, "loss": 0.2451, "step": 282 }, { "epoch": 3.6224, "grad_norm": 0.33885700019167236, "learning_rate": 8.491841303149728e-06, "loss": 0.2345, "step": 283 }, { "epoch": 3.6352, "grad_norm": 0.3558837243949352, "learning_rate": 8.345899450772975e-06, "loss": 0.2463, "step": 284 }, { "epoch": 3.648, "grad_norm": 0.3417347616378375, "learning_rate": 8.200891199537549e-06, "loss": 0.2451, "step": 285 }, { "epoch": 3.6608, "grad_norm": 0.3265691533149346, "learning_rate": 8.056828165944282e-06, "loss": 0.2292, "step": 286 }, { "epoch": 3.6736, "grad_norm": 0.35543632739891745, "learning_rate": 7.913721890773354e-06, "loss": 0.2499, "step": 287 }, { "epoch": 3.6864, "grad_norm": 0.32721794505380586, "learning_rate": 7.771583838159756e-06, "loss": 0.2284, "step": 288 }, { "epoch": 3.6992000000000003, "grad_norm": 0.3473173577929293, "learning_rate": 7.630425394674903e-06, "loss": 0.2477, "step": 289 }, { "epoch": 3.7119999999999997, "grad_norm": 0.3267429658875211, "learning_rate": 7.49025786841445e-06, "loss": 0.2256, "step": 290 }, { "epoch": 3.7248, "grad_norm": 0.3308565691014178, "learning_rate": 7.3510924880924575e-06, "loss": 0.2525, "step": 291 }, { "epoch": 3.7376, "grad_norm": 0.32775713637571846, "learning_rate": 7.212940402141808e-06, "loss": 0.2286, "step": 292 }, { "epoch": 3.7504, "grad_norm": 0.3230110993671641, "learning_rate": 7.075812677821145e-06, "loss": 0.2281, "step": 293 }, { "epoch": 3.7632, "grad_norm": 0.34319449892245124, "learning_rate": 6.939720300328303e-06, "loss": 0.2574, "step": 294 }, { "epoch": 3.776, "grad_norm": 0.3052306427404134, "learning_rate": 6.8046741719202385e-06, "loss": 0.2053, "step": 295 }, { "epoch": 3.7888, "grad_norm": 0.3389886313628059, "learning_rate": 6.67068511103971e-06, "loss": 0.2561, "step": 296 }, { "epoch": 3.8016, "grad_norm": 0.3096027527849823, "learning_rate": 6.537763851448593e-06, "loss": 0.2167, "step": 297 }, { "epoch": 3.8144, "grad_norm": 0.3320520175361268, "learning_rate": 6.4059210413680175e-06, "loss": 0.2372, "step": 298 }, { "epoch": 3.8272, "grad_norm": 0.3406340780226723, "learning_rate": 6.275167242625331e-06, "loss": 0.2545, "step": 299 }, { "epoch": 3.84, "grad_norm": 0.2988000824396719, "learning_rate": 6.145512929808013e-06, "loss": 0.2171, "step": 300 }, { "epoch": 3.8528000000000002, "grad_norm": 0.30931376951446365, "learning_rate": 6.016968489424572e-06, "loss": 0.2231, "step": 301 }, { "epoch": 3.8656, "grad_norm": 0.33622255017983976, "learning_rate": 5.889544219072465e-06, "loss": 0.2371, "step": 302 }, { "epoch": 3.8784, "grad_norm": 0.32360161834108225, "learning_rate": 5.7632503266131925e-06, "loss": 0.2348, "step": 303 }, { "epoch": 3.8912, "grad_norm": 0.30292782196099155, "learning_rate": 5.638096929354522e-06, "loss": 0.2048, "step": 304 }, { "epoch": 3.904, "grad_norm": 0.3090810070307293, "learning_rate": 5.514094053240035e-06, "loss": 0.2562, "step": 305 }, { "epoch": 3.9168, "grad_norm": 0.29309856186061733, "learning_rate": 5.39125163204594e-06, "loss": 0.225, "step": 306 }, { "epoch": 3.9295999999999998, "grad_norm": 0.31956453050611927, "learning_rate": 5.269579506585259e-06, "loss": 0.2279, "step": 307 }, { "epoch": 3.9424, "grad_norm": 0.3112087941940966, "learning_rate": 5.149087423919541e-06, "loss": 0.2219, "step": 308 }, { "epoch": 3.9552, "grad_norm": 0.3153817831789216, "learning_rate": 5.029785036577976e-06, "loss": 0.2354, "step": 309 }, { "epoch": 3.968, "grad_norm": 0.31372150880773386, "learning_rate": 4.911681901784198e-06, "loss": 0.2422, "step": 310 }, { "epoch": 3.9808, "grad_norm": 0.32510096573711394, "learning_rate": 4.794787480690597e-06, "loss": 0.2316, "step": 311 }, { "epoch": 3.9936, "grad_norm": 0.29906942590577396, "learning_rate": 4.679111137620442e-06, "loss": 0.2275, "step": 312 }, { "epoch": 4.0064, "grad_norm": 1.2139953461603725, "learning_rate": 4.5646621393177e-06, "loss": 0.3807, "step": 313 }, { "epoch": 4.0192, "grad_norm": 0.5237410389498884, "learning_rate": 4.451449654204685e-06, "loss": 0.183, "step": 314 }, { "epoch": 4.032, "grad_norm": 0.4427029532721314, "learning_rate": 4.339482751647557e-06, "loss": 0.1775, "step": 315 }, { "epoch": 4.0448, "grad_norm": 0.34230370359902346, "learning_rate": 4.228770401229824e-06, "loss": 0.1509, "step": 316 }, { "epoch": 4.0576, "grad_norm": 0.29601757434559484, "learning_rate": 4.119321472033779e-06, "loss": 0.1573, "step": 317 }, { "epoch": 4.0704, "grad_norm": 0.3814257012353634, "learning_rate": 4.011144731929981e-06, "loss": 0.1619, "step": 318 }, { "epoch": 4.0832, "grad_norm": 0.5041931343111281, "learning_rate": 3.904248846874894e-06, "loss": 0.1693, "step": 319 }, { "epoch": 4.096, "grad_norm": 0.5823550587186476, "learning_rate": 3.7986423802166705e-06, "loss": 0.1767, "step": 320 }, { "epoch": 4.1088, "grad_norm": 0.4625808142431024, "learning_rate": 3.694333792009115e-06, "loss": 0.1588, "step": 321 }, { "epoch": 4.1216, "grad_norm": 0.37116127475914895, "learning_rate": 3.5913314383339937e-06, "loss": 0.1782, "step": 322 }, { "epoch": 4.1344, "grad_norm": 0.3294610298238965, "learning_rate": 3.4896435706316e-06, "loss": 0.1661, "step": 323 }, { "epoch": 4.1472, "grad_norm": 0.300193661172271, "learning_rate": 3.3892783350397675e-06, "loss": 0.1424, "step": 324 }, { "epoch": 4.16, "grad_norm": 0.4038054890315199, "learning_rate": 3.290243771741275e-06, "loss": 0.1734, "step": 325 }, { "epoch": 4.1728, "grad_norm": 0.3513663152569111, "learning_rate": 3.1925478143197418e-06, "loss": 0.1608, "step": 326 }, { "epoch": 4.1856, "grad_norm": 0.39787351845147634, "learning_rate": 3.0961982891241083e-06, "loss": 0.158, "step": 327 }, { "epoch": 4.1984, "grad_norm": 0.33655397686316324, "learning_rate": 3.001202914641628e-06, "loss": 0.1596, "step": 328 }, { "epoch": 4.2112, "grad_norm": 0.318527802100109, "learning_rate": 2.907569300879596e-06, "loss": 0.1704, "step": 329 }, { "epoch": 4.224, "grad_norm": 0.2974865136489592, "learning_rate": 2.815304948755664e-06, "loss": 0.1641, "step": 330 }, { "epoch": 4.2368, "grad_norm": 0.3016788088406415, "learning_rate": 2.7244172494969978e-06, "loss": 0.1561, "step": 331 }, { "epoch": 4.2496, "grad_norm": 0.32849297882511297, "learning_rate": 2.6349134840481294e-06, "loss": 0.1561, "step": 332 }, { "epoch": 4.2624, "grad_norm": 0.33069183157158233, "learning_rate": 2.546800822487714e-06, "loss": 0.1765, "step": 333 }, { "epoch": 4.2752, "grad_norm": 0.3540783313652625, "learning_rate": 2.4600863234541338e-06, "loss": 0.1731, "step": 334 }, { "epoch": 4.288, "grad_norm": 0.3086978594001584, "learning_rate": 2.374776933580025e-06, "loss": 0.1558, "step": 335 }, { "epoch": 4.3008, "grad_norm": 0.29331458561932094, "learning_rate": 2.2908794869358044e-06, "loss": 0.1557, "step": 336 }, { "epoch": 4.3136, "grad_norm": 0.3085758487601604, "learning_rate": 2.2084007044821764e-06, "loss": 0.1572, "step": 337 }, { "epoch": 4.3264, "grad_norm": 0.2906868443213144, "learning_rate": 2.127347193531757e-06, "loss": 0.1557, "step": 338 }, { "epoch": 4.3392, "grad_norm": 0.2970918857148492, "learning_rate": 2.0477254472197237e-06, "loss": 0.1514, "step": 339 }, { "epoch": 4.352, "grad_norm": 0.3173460879970613, "learning_rate": 1.96954184398368e-06, "loss": 0.1704, "step": 340 }, { "epoch": 4.3648, "grad_norm": 0.2961358934681771, "learning_rate": 1.8928026470526917e-06, "loss": 0.1469, "step": 341 }, { "epoch": 4.3776, "grad_norm": 0.3130008308479644, "learning_rate": 1.817514003945524e-06, "loss": 0.1632, "step": 342 }, { "epoch": 4.3904, "grad_norm": 0.30139540096728357, "learning_rate": 1.743681945978184e-06, "loss": 0.1694, "step": 343 }, { "epoch": 4.4032, "grad_norm": 0.30497921291683516, "learning_rate": 1.6713123877807413e-06, "loss": 0.1515, "step": 344 }, { "epoch": 4.416, "grad_norm": 0.2962093457026487, "learning_rate": 1.6004111268235156e-06, "loss": 0.1549, "step": 345 }, { "epoch": 4.4288, "grad_norm": 0.29145197933072053, "learning_rate": 1.5309838429526714e-06, "loss": 0.1691, "step": 346 }, { "epoch": 4.4416, "grad_norm": 0.2805215505614425, "learning_rate": 1.4630360979351644e-06, "loss": 0.16, "step": 347 }, { "epoch": 4.4544, "grad_norm": 0.2716802696786645, "learning_rate": 1.396573335013236e-06, "loss": 0.1572, "step": 348 }, { "epoch": 4.4672, "grad_norm": 0.29116806810849954, "learning_rate": 1.3316008784683265e-06, "loss": 0.1677, "step": 349 }, { "epoch": 4.48, "grad_norm": 0.30403120086929486, "learning_rate": 1.2681239331945695e-06, "loss": 0.1656, "step": 350 }, { "epoch": 4.4928, "grad_norm": 0.2888776980210721, "learning_rate": 1.2061475842818337e-06, "loss": 0.1575, "step": 351 }, { "epoch": 4.5056, "grad_norm": 0.28436489114232044, "learning_rate": 1.1456767966083393e-06, "loss": 0.1512, "step": 352 }, { "epoch": 4.5184, "grad_norm": 0.2945919286625033, "learning_rate": 1.086716414442952e-06, "loss": 0.1719, "step": 353 }, { "epoch": 4.5312, "grad_norm": 0.2915528020141999, "learning_rate": 1.0292711610570904e-06, "loss": 0.1525, "step": 354 }, { "epoch": 4.5440000000000005, "grad_norm": 0.29895984099909056, "learning_rate": 9.733456383463658e-07, "loss": 0.163, "step": 355 }, { "epoch": 4.5568, "grad_norm": 0.30792150165168203, "learning_rate": 9.189443264619102e-07, "loss": 0.1621, "step": 356 }, { "epoch": 4.5696, "grad_norm": 0.2909817524180524, "learning_rate": 8.660715834514977e-07, "loss": 0.1517, "step": 357 }, { "epoch": 4.5824, "grad_norm": 0.2950182202862721, "learning_rate": 8.147316449103959e-07, "loss": 0.1579, "step": 358 }, { "epoch": 4.5952, "grad_norm": 0.2786025132207038, "learning_rate": 7.649286236420806e-07, "loss": 0.1589, "step": 359 }, { "epoch": 4.608, "grad_norm": 0.28027422258603657, "learning_rate": 7.166665093287539e-07, "loss": 0.1516, "step": 360 }, { "epoch": 4.6208, "grad_norm": 0.2751820247938203, "learning_rate": 6.69949168211721e-07, "loss": 0.1532, "step": 361 }, { "epoch": 4.6336, "grad_norm": 0.2734147517610915, "learning_rate": 6.247803427816945e-07, "loss": 0.1528, "step": 362 }, { "epoch": 4.6464, "grad_norm": 0.28075210904691406, "learning_rate": 5.811636514789598e-07, "loss": 0.1584, "step": 363 }, { "epoch": 4.6592, "grad_norm": 0.2887601716240021, "learning_rate": 5.391025884035239e-07, "loss": 0.1637, "step": 364 }, { "epoch": 4.672, "grad_norm": 0.277925460841827, "learning_rate": 4.986005230351954e-07, "loss": 0.154, "step": 365 }, { "epoch": 4.6848, "grad_norm": 0.2844367789603032, "learning_rate": 4.5966069996365993e-07, "loss": 0.1517, "step": 366 }, { "epoch": 4.6975999999999996, "grad_norm": 0.28931672628419347, "learning_rate": 4.22286238628562e-07, "loss": 0.1613, "step": 367 }, { "epoch": 4.7104, "grad_norm": 0.2860034658051635, "learning_rate": 3.8648013306960664e-07, "loss": 0.1563, "step": 368 }, { "epoch": 4.7232, "grad_norm": 0.3089159326788425, "learning_rate": 3.522452516867048e-07, "loss": 0.1612, "step": 369 }, { "epoch": 4.736, "grad_norm": 0.29457815167214546, "learning_rate": 3.1958433701019697e-07, "loss": 0.168, "step": 370 }, { "epoch": 4.7488, "grad_norm": 0.27208637514557227, "learning_rate": 2.8850000548115155e-07, "loss": 0.1472, "step": 371 }, { "epoch": 4.7616, "grad_norm": 0.27590131884272984, "learning_rate": 2.5899474724174313e-07, "loss": 0.1586, "step": 372 }, { "epoch": 4.7744, "grad_norm": 0.28602629295970033, "learning_rate": 2.3107092593579905e-07, "loss": 0.1578, "step": 373 }, { "epoch": 4.7872, "grad_norm": 0.28428689397441836, "learning_rate": 2.0473077851942858e-07, "loss": 0.1497, "step": 374 }, { "epoch": 4.8, "grad_norm": 0.2836450607351416, "learning_rate": 1.799764150818306e-07, "loss": 0.1584, "step": 375 }, { "epoch": 4.8128, "grad_norm": 0.2631863358532374, "learning_rate": 1.5680981867625566e-07, "loss": 0.1423, "step": 376 }, { "epoch": 4.8256, "grad_norm": 0.2746300466010014, "learning_rate": 1.3523284516113955e-07, "loss": 0.1533, "step": 377 }, { "epoch": 4.8384, "grad_norm": 0.2916563865381194, "learning_rate": 1.1524722305144231e-07, "loss": 0.1534, "step": 378 }, { "epoch": 4.8512, "grad_norm": 0.2801924376684902, "learning_rate": 9.685455338016347e-08, "loss": 0.1624, "step": 379 }, { "epoch": 4.864, "grad_norm": 0.3066114462534377, "learning_rate": 8.005630957010014e-08, "loss": 0.1641, "step": 380 }, { "epoch": 4.8768, "grad_norm": 0.28936538978957893, "learning_rate": 6.485383731580142e-08, "loss": 0.1585, "step": 381 }, { "epoch": 4.8896, "grad_norm": 0.29197481999400926, "learning_rate": 5.1248354475768034e-08, "loss": 0.1606, "step": 382 }, { "epoch": 4.9024, "grad_norm": 0.27877119220344143, "learning_rate": 3.924095097489922e-08, "loss": 0.1486, "step": 383 }, { "epoch": 4.9152000000000005, "grad_norm": 0.29040742011215104, "learning_rate": 2.8832588717164766e-08, "loss": 0.172, "step": 384 }, { "epoch": 4.928, "grad_norm": 0.27366147923175643, "learning_rate": 2.0024101508555604e-08, "loss": 0.1655, "step": 385 }, { "epoch": 4.9408, "grad_norm": 0.2928012431572296, "learning_rate": 1.281619499029274e-08, "loss": 0.178, "step": 386 }, { "epoch": 4.9536, "grad_norm": 0.28735601580788767, "learning_rate": 7.209446582292501e-09, "loss": 0.1551, "step": 387 }, { "epoch": 4.9664, "grad_norm": 0.2946376966049083, "learning_rate": 3.2043054369057523e-09, "loss": 0.1647, "step": 388 }, { "epoch": 4.9792, "grad_norm": 0.2925440423275787, "learning_rate": 8.010924029533406e-10, "loss": 0.1696, "step": 389 }, { "epoch": 4.992, "grad_norm": 0.2681299045611345, "learning_rate": 0.0, "loss": 0.1485, "step": 390 }, { "epoch": 4.992, "step": 390, "total_flos": 7.862516792374067e+17, "train_loss": 0.3952271275795423, "train_runtime": 13928.0051, "train_samples_per_second": 3.59, "train_steps_per_second": 0.028 } ], "logging_steps": 1, "max_steps": 390, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.862516792374067e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }