{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 435, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011510791366906475, "grad_norm": 1.295614242553711, "learning_rate": 0.0, "loss": 2.0611, "step": 1 }, { "epoch": 0.02302158273381295, "grad_norm": 1.3193614482879639, "learning_rate": 5.000000000000001e-07, "loss": 2.2646, "step": 2 }, { "epoch": 0.034532374100719423, "grad_norm": 1.3311564922332764, "learning_rate": 1.0000000000000002e-06, "loss": 2.302, "step": 3 }, { "epoch": 0.0460431654676259, "grad_norm": 1.3216726779937744, "learning_rate": 1.5e-06, "loss": 2.2383, "step": 4 }, { "epoch": 0.05755395683453238, "grad_norm": 1.3751393556594849, "learning_rate": 2.0000000000000003e-06, "loss": 2.2098, "step": 5 }, { "epoch": 0.06906474820143885, "grad_norm": 1.3023695945739746, "learning_rate": 2.5e-06, "loss": 2.1331, "step": 6 }, { "epoch": 0.08057553956834532, "grad_norm": 1.271135926246643, "learning_rate": 3e-06, "loss": 2.2544, "step": 7 }, { "epoch": 0.0920863309352518, "grad_norm": 1.3196772336959839, "learning_rate": 3.5e-06, "loss": 2.3242, "step": 8 }, { "epoch": 0.10359712230215827, "grad_norm": 1.251950740814209, "learning_rate": 4.000000000000001e-06, "loss": 2.1899, "step": 9 }, { "epoch": 0.11510791366906475, "grad_norm": 1.2269312143325806, "learning_rate": 4.5e-06, "loss": 2.1637, "step": 10 }, { "epoch": 0.12661870503597122, "grad_norm": 1.3193552494049072, "learning_rate": 5e-06, "loss": 2.2172, "step": 11 }, { "epoch": 0.1381294964028777, "grad_norm": 1.2646515369415283, "learning_rate": 4.988235294117647e-06, "loss": 2.1069, "step": 12 }, { "epoch": 0.14964028776978416, "grad_norm": 1.301425576210022, "learning_rate": 4.976470588235294e-06, "loss": 2.2276, "step": 13 }, { "epoch": 0.16115107913669063, "grad_norm": 1.1908143758773804, "learning_rate": 4.964705882352942e-06, "loss": 2.1751, "step": 14 }, { "epoch": 0.17266187050359713, "grad_norm": 1.1371599435806274, "learning_rate": 4.9529411764705885e-06, "loss": 2.2119, "step": 15 }, { "epoch": 0.1841726618705036, "grad_norm": 1.1369460821151733, "learning_rate": 4.941176470588236e-06, "loss": 2.1352, "step": 16 }, { "epoch": 0.19568345323741007, "grad_norm": 1.2164900302886963, "learning_rate": 4.929411764705882e-06, "loss": 2.1389, "step": 17 }, { "epoch": 0.20719424460431654, "grad_norm": 1.1954495906829834, "learning_rate": 4.91764705882353e-06, "loss": 2.1272, "step": 18 }, { "epoch": 0.218705035971223, "grad_norm": 1.077544927597046, "learning_rate": 4.905882352941177e-06, "loss": 2.2048, "step": 19 }, { "epoch": 0.2302158273381295, "grad_norm": 1.064244270324707, "learning_rate": 4.894117647058824e-06, "loss": 2.1051, "step": 20 }, { "epoch": 0.24172661870503598, "grad_norm": 1.045543909072876, "learning_rate": 4.882352941176471e-06, "loss": 2.1624, "step": 21 }, { "epoch": 0.25323741007194245, "grad_norm": 1.1219857931137085, "learning_rate": 4.870588235294118e-06, "loss": 2.0821, "step": 22 }, { "epoch": 0.2647482014388489, "grad_norm": 1.0319204330444336, "learning_rate": 4.858823529411766e-06, "loss": 2.2232, "step": 23 }, { "epoch": 0.2762589928057554, "grad_norm": 0.9106553792953491, "learning_rate": 4.847058823529412e-06, "loss": 2.0455, "step": 24 }, { "epoch": 0.28776978417266186, "grad_norm": 0.9222265481948853, "learning_rate": 4.835294117647059e-06, "loss": 2.0065, "step": 25 }, { "epoch": 0.2992805755395683, "grad_norm": 0.9210799336433411, "learning_rate": 4.823529411764706e-06, "loss": 2.2051, "step": 26 }, { "epoch": 0.3107913669064748, "grad_norm": 0.95929354429245, "learning_rate": 4.811764705882354e-06, "loss": 2.293, "step": 27 }, { "epoch": 0.32230215827338127, "grad_norm": 0.9271309971809387, "learning_rate": 4.800000000000001e-06, "loss": 2.1646, "step": 28 }, { "epoch": 0.3338129496402878, "grad_norm": 0.8561931848526001, "learning_rate": 4.7882352941176475e-06, "loss": 2.151, "step": 29 }, { "epoch": 0.34532374100719426, "grad_norm": 0.8737633228302002, "learning_rate": 4.776470588235294e-06, "loss": 2.2063, "step": 30 }, { "epoch": 0.35683453237410073, "grad_norm": 0.8623224496841431, "learning_rate": 4.764705882352941e-06, "loss": 2.0332, "step": 31 }, { "epoch": 0.3683453237410072, "grad_norm": 0.8182441592216492, "learning_rate": 4.752941176470589e-06, "loss": 2.1344, "step": 32 }, { "epoch": 0.37985611510791367, "grad_norm": 0.8933826088905334, "learning_rate": 4.741176470588236e-06, "loss": 2.2029, "step": 33 }, { "epoch": 0.39136690647482014, "grad_norm": 0.870568037033081, "learning_rate": 4.729411764705883e-06, "loss": 2.2155, "step": 34 }, { "epoch": 0.4028776978417266, "grad_norm": 0.8473733067512512, "learning_rate": 4.717647058823529e-06, "loss": 2.1522, "step": 35 }, { "epoch": 0.4143884892086331, "grad_norm": 0.8416939377784729, "learning_rate": 4.705882352941177e-06, "loss": 2.1937, "step": 36 }, { "epoch": 0.42589928057553955, "grad_norm": 0.7861284613609314, "learning_rate": 4.694117647058824e-06, "loss": 2.1359, "step": 37 }, { "epoch": 0.437410071942446, "grad_norm": 0.7433235049247742, "learning_rate": 4.682352941176471e-06, "loss": 2.1636, "step": 38 }, { "epoch": 0.4489208633093525, "grad_norm": 0.7175543308258057, "learning_rate": 4.670588235294118e-06, "loss": 1.9667, "step": 39 }, { "epoch": 0.460431654676259, "grad_norm": 0.7294153571128845, "learning_rate": 4.658823529411765e-06, "loss": 2.1362, "step": 40 }, { "epoch": 0.4719424460431655, "grad_norm": 0.7124460935592651, "learning_rate": 4.647058823529412e-06, "loss": 2.0463, "step": 41 }, { "epoch": 0.48345323741007196, "grad_norm": 0.7458817362785339, "learning_rate": 4.635294117647059e-06, "loss": 2.18, "step": 42 }, { "epoch": 0.4949640287769784, "grad_norm": 0.6650211215019226, "learning_rate": 4.623529411764706e-06, "loss": 2.0119, "step": 43 }, { "epoch": 0.5064748201438849, "grad_norm": 0.7400155067443848, "learning_rate": 4.611764705882353e-06, "loss": 2.0537, "step": 44 }, { "epoch": 0.5179856115107914, "grad_norm": 0.6916301250457764, "learning_rate": 4.600000000000001e-06, "loss": 2.0584, "step": 45 }, { "epoch": 0.5294964028776978, "grad_norm": 0.6942080855369568, "learning_rate": 4.588235294117647e-06, "loss": 2.1126, "step": 46 }, { "epoch": 0.5410071942446043, "grad_norm": 0.694041907787323, "learning_rate": 4.5764705882352945e-06, "loss": 2.1958, "step": 47 }, { "epoch": 0.5525179856115108, "grad_norm": 0.6456537842750549, "learning_rate": 4.564705882352941e-06, "loss": 2.212, "step": 48 }, { "epoch": 0.5640287769784172, "grad_norm": 0.663451075553894, "learning_rate": 4.552941176470589e-06, "loss": 2.1401, "step": 49 }, { "epoch": 0.5755395683453237, "grad_norm": 0.7182263135910034, "learning_rate": 4.541176470588236e-06, "loss": 2.137, "step": 50 }, { "epoch": 0.5870503597122302, "grad_norm": 0.6442857384681702, "learning_rate": 4.529411764705883e-06, "loss": 2.0755, "step": 51 }, { "epoch": 0.5985611510791367, "grad_norm": 0.6306608319282532, "learning_rate": 4.51764705882353e-06, "loss": 2.0456, "step": 52 }, { "epoch": 0.6100719424460431, "grad_norm": 0.628402054309845, "learning_rate": 4.505882352941176e-06, "loss": 2.0548, "step": 53 }, { "epoch": 0.6215827338129496, "grad_norm": 0.6245840191841125, "learning_rate": 4.494117647058824e-06, "loss": 2.176, "step": 54 }, { "epoch": 0.6330935251798561, "grad_norm": 0.5973242521286011, "learning_rate": 4.482352941176471e-06, "loss": 2.004, "step": 55 }, { "epoch": 0.6446043165467625, "grad_norm": 0.6911327242851257, "learning_rate": 4.4705882352941184e-06, "loss": 2.0991, "step": 56 }, { "epoch": 0.6561151079136691, "grad_norm": 0.6195106506347656, "learning_rate": 4.458823529411765e-06, "loss": 2.0232, "step": 57 }, { "epoch": 0.6676258992805756, "grad_norm": 0.5956724882125854, "learning_rate": 4.447058823529412e-06, "loss": 2.1384, "step": 58 }, { "epoch": 0.679136690647482, "grad_norm": 0.6167479753494263, "learning_rate": 4.435294117647059e-06, "loss": 2.0524, "step": 59 }, { "epoch": 0.6906474820143885, "grad_norm": 0.5958898663520813, "learning_rate": 4.423529411764707e-06, "loss": 2.0854, "step": 60 }, { "epoch": 0.702158273381295, "grad_norm": 0.6348150372505188, "learning_rate": 4.411764705882353e-06, "loss": 2.1037, "step": 61 }, { "epoch": 0.7136690647482015, "grad_norm": 0.5777585506439209, "learning_rate": 4.4e-06, "loss": 2.0463, "step": 62 }, { "epoch": 0.7251798561151079, "grad_norm": 0.5744509696960449, "learning_rate": 4.388235294117648e-06, "loss": 2.0368, "step": 63 }, { "epoch": 0.7366906474820144, "grad_norm": 0.6182202696800232, "learning_rate": 4.376470588235294e-06, "loss": 2.1305, "step": 64 }, { "epoch": 0.7482014388489209, "grad_norm": 0.6051247715950012, "learning_rate": 4.3647058823529415e-06, "loss": 1.9982, "step": 65 }, { "epoch": 0.7597122302158273, "grad_norm": 0.585530161857605, "learning_rate": 4.352941176470588e-06, "loss": 2.1311, "step": 66 }, { "epoch": 0.7712230215827338, "grad_norm": 0.5718886256217957, "learning_rate": 4.341176470588236e-06, "loss": 2.0899, "step": 67 }, { "epoch": 0.7827338129496403, "grad_norm": 0.5871637463569641, "learning_rate": 4.329411764705883e-06, "loss": 2.1422, "step": 68 }, { "epoch": 0.7942446043165468, "grad_norm": 0.5348390340805054, "learning_rate": 4.31764705882353e-06, "loss": 2.0414, "step": 69 }, { "epoch": 0.8057553956834532, "grad_norm": 0.5360246896743774, "learning_rate": 4.305882352941177e-06, "loss": 2.0697, "step": 70 }, { "epoch": 0.8172661870503597, "grad_norm": 0.575326144695282, "learning_rate": 4.294117647058823e-06, "loss": 2.126, "step": 71 }, { "epoch": 0.8287769784172662, "grad_norm": 0.5389431118965149, "learning_rate": 4.282352941176471e-06, "loss": 1.9699, "step": 72 }, { "epoch": 0.8402877697841726, "grad_norm": 0.5181302428245544, "learning_rate": 4.270588235294118e-06, "loss": 2.0431, "step": 73 }, { "epoch": 0.8517985611510791, "grad_norm": 0.5258436799049377, "learning_rate": 4.2588235294117655e-06, "loss": 2.1398, "step": 74 }, { "epoch": 0.8633093525179856, "grad_norm": 0.5759520530700684, "learning_rate": 4.247058823529412e-06, "loss": 2.1257, "step": 75 }, { "epoch": 0.874820143884892, "grad_norm": 0.5312909483909607, "learning_rate": 4.235294117647059e-06, "loss": 2.0202, "step": 76 }, { "epoch": 0.8863309352517985, "grad_norm": 0.6128862500190735, "learning_rate": 4.223529411764706e-06, "loss": 2.1821, "step": 77 }, { "epoch": 0.897841726618705, "grad_norm": 0.5947574377059937, "learning_rate": 4.211764705882354e-06, "loss": 2.0586, "step": 78 }, { "epoch": 0.9093525179856116, "grad_norm": 0.5092775225639343, "learning_rate": 4.2000000000000004e-06, "loss": 2.0783, "step": 79 }, { "epoch": 0.920863309352518, "grad_norm": 0.5344525575637817, "learning_rate": 4.188235294117647e-06, "loss": 2.1457, "step": 80 }, { "epoch": 0.9323741007194245, "grad_norm": 0.5249314308166504, "learning_rate": 4.176470588235295e-06, "loss": 1.9904, "step": 81 }, { "epoch": 0.943884892086331, "grad_norm": 0.5562986135482788, "learning_rate": 4.164705882352941e-06, "loss": 2.053, "step": 82 }, { "epoch": 0.9553956834532374, "grad_norm": 0.5227307081222534, "learning_rate": 4.152941176470589e-06, "loss": 1.9463, "step": 83 }, { "epoch": 0.9669064748201439, "grad_norm": 0.5479752421379089, "learning_rate": 4.141176470588235e-06, "loss": 2.1546, "step": 84 }, { "epoch": 0.9784172661870504, "grad_norm": 0.49207690358161926, "learning_rate": 4.129411764705883e-06, "loss": 1.9769, "step": 85 }, { "epoch": 0.9899280575539569, "grad_norm": 0.5519751310348511, "learning_rate": 4.11764705882353e-06, "loss": 2.0985, "step": 86 }, { "epoch": 1.0, "grad_norm": 0.5100968480110168, "learning_rate": 4.105882352941177e-06, "loss": 2.0412, "step": 87 }, { "epoch": 1.0115107913669066, "grad_norm": 0.5197513103485107, "learning_rate": 4.094117647058824e-06, "loss": 2.0366, "step": 88 }, { "epoch": 1.023021582733813, "grad_norm": 0.475099116563797, "learning_rate": 4.082352941176471e-06, "loss": 1.9594, "step": 89 }, { "epoch": 1.0345323741007195, "grad_norm": 0.4655166566371918, "learning_rate": 4.070588235294118e-06, "loss": 1.988, "step": 90 }, { "epoch": 1.0460431654676259, "grad_norm": 0.47748327255249023, "learning_rate": 4.058823529411765e-06, "loss": 2.0573, "step": 91 }, { "epoch": 1.0575539568345325, "grad_norm": 0.4752672612667084, "learning_rate": 4.0470588235294125e-06, "loss": 2.0761, "step": 92 }, { "epoch": 1.0690647482014388, "grad_norm": 0.42830637097358704, "learning_rate": 4.0352941176470585e-06, "loss": 1.9, "step": 93 }, { "epoch": 1.0805755395683454, "grad_norm": 0.509665310382843, "learning_rate": 4.023529411764706e-06, "loss": 2.1022, "step": 94 }, { "epoch": 1.0920863309352518, "grad_norm": 0.4985044300556183, "learning_rate": 4.011764705882353e-06, "loss": 2.0004, "step": 95 }, { "epoch": 1.1035971223021583, "grad_norm": 0.4855203330516815, "learning_rate": 4.000000000000001e-06, "loss": 1.9497, "step": 96 }, { "epoch": 1.1151079136690647, "grad_norm": 0.5421211123466492, "learning_rate": 3.9882352941176475e-06, "loss": 2.0288, "step": 97 }, { "epoch": 1.1266187050359713, "grad_norm": 0.4415021538734436, "learning_rate": 3.976470588235294e-06, "loss": 1.9209, "step": 98 }, { "epoch": 1.1381294964028776, "grad_norm": 0.5151281356811523, "learning_rate": 3.964705882352942e-06, "loss": 2.0827, "step": 99 }, { "epoch": 1.1496402877697842, "grad_norm": 0.4920862913131714, "learning_rate": 3.952941176470588e-06, "loss": 2.058, "step": 100 }, { "epoch": 1.1611510791366906, "grad_norm": 0.4780770540237427, "learning_rate": 3.941176470588236e-06, "loss": 2.0177, "step": 101 }, { "epoch": 1.1726618705035972, "grad_norm": 0.5100148916244507, "learning_rate": 3.9294117647058824e-06, "loss": 2.0284, "step": 102 }, { "epoch": 1.1841726618705035, "grad_norm": 0.49509063363075256, "learning_rate": 3.91764705882353e-06, "loss": 1.9646, "step": 103 }, { "epoch": 1.19568345323741, "grad_norm": 0.518322229385376, "learning_rate": 3.905882352941177e-06, "loss": 1.9543, "step": 104 }, { "epoch": 1.2071942446043165, "grad_norm": 0.49948441982269287, "learning_rate": 3.894117647058824e-06, "loss": 1.9506, "step": 105 }, { "epoch": 1.218705035971223, "grad_norm": 0.4965578317642212, "learning_rate": 3.882352941176471e-06, "loss": 1.9751, "step": 106 }, { "epoch": 1.2302158273381294, "grad_norm": 0.4927343428134918, "learning_rate": 3.870588235294118e-06, "loss": 2.0053, "step": 107 }, { "epoch": 1.241726618705036, "grad_norm": 0.5349815487861633, "learning_rate": 3.858823529411765e-06, "loss": 1.9543, "step": 108 }, { "epoch": 1.2532374100719426, "grad_norm": 0.47634226083755493, "learning_rate": 3.847058823529412e-06, "loss": 2.0552, "step": 109 }, { "epoch": 1.264748201438849, "grad_norm": 0.4937780797481537, "learning_rate": 3.8352941176470596e-06, "loss": 2.0465, "step": 110 }, { "epoch": 1.2762589928057553, "grad_norm": 0.49782195687294006, "learning_rate": 3.8235294117647055e-06, "loss": 2.0029, "step": 111 }, { "epoch": 1.2877697841726619, "grad_norm": 0.4511786103248596, "learning_rate": 3.8117647058823532e-06, "loss": 2.0275, "step": 112 }, { "epoch": 1.2992805755395684, "grad_norm": 0.5700087547302246, "learning_rate": 3.8000000000000005e-06, "loss": 2.1454, "step": 113 }, { "epoch": 1.3107913669064748, "grad_norm": 0.46322473883628845, "learning_rate": 3.7882352941176477e-06, "loss": 2.1094, "step": 114 }, { "epoch": 1.3223021582733812, "grad_norm": 0.48486411571502686, "learning_rate": 3.776470588235294e-06, "loss": 1.9504, "step": 115 }, { "epoch": 1.3338129496402877, "grad_norm": 0.5094816088676453, "learning_rate": 3.7647058823529414e-06, "loss": 2.0014, "step": 116 }, { "epoch": 1.3453237410071943, "grad_norm": 0.4983859360218048, "learning_rate": 3.7529411764705886e-06, "loss": 2.0876, "step": 117 }, { "epoch": 1.3568345323741007, "grad_norm": 0.46220195293426514, "learning_rate": 3.741176470588236e-06, "loss": 1.9317, "step": 118 }, { "epoch": 1.3683453237410073, "grad_norm": 0.5100168585777283, "learning_rate": 3.7294117647058827e-06, "loss": 2.0247, "step": 119 }, { "epoch": 1.3798561151079136, "grad_norm": 0.45993947982788086, "learning_rate": 3.71764705882353e-06, "loss": 2.1151, "step": 120 }, { "epoch": 1.3913669064748202, "grad_norm": 0.45173344016075134, "learning_rate": 3.7058823529411767e-06, "loss": 1.9784, "step": 121 }, { "epoch": 1.4028776978417266, "grad_norm": 0.5567683577537537, "learning_rate": 3.6941176470588236e-06, "loss": 1.9727, "step": 122 }, { "epoch": 1.4143884892086331, "grad_norm": 0.5246084928512573, "learning_rate": 3.682352941176471e-06, "loss": 1.9025, "step": 123 }, { "epoch": 1.4258992805755395, "grad_norm": 0.4744240343570709, "learning_rate": 3.670588235294118e-06, "loss": 1.9713, "step": 124 }, { "epoch": 1.437410071942446, "grad_norm": 0.4525962769985199, "learning_rate": 3.6588235294117653e-06, "loss": 1.9845, "step": 125 }, { "epoch": 1.4489208633093524, "grad_norm": 0.5006890296936035, "learning_rate": 3.6470588235294117e-06, "loss": 1.9638, "step": 126 }, { "epoch": 1.460431654676259, "grad_norm": 0.4702132046222687, "learning_rate": 3.635294117647059e-06, "loss": 1.9444, "step": 127 }, { "epoch": 1.4719424460431654, "grad_norm": 0.5455424189567566, "learning_rate": 3.623529411764706e-06, "loss": 1.9387, "step": 128 }, { "epoch": 1.483453237410072, "grad_norm": 0.6158833503723145, "learning_rate": 3.6117647058823534e-06, "loss": 2.0112, "step": 129 }, { "epoch": 1.4949640287769785, "grad_norm": 0.4596414268016815, "learning_rate": 3.6000000000000003e-06, "loss": 2.0323, "step": 130 }, { "epoch": 1.506474820143885, "grad_norm": 0.46785110235214233, "learning_rate": 3.5882352941176475e-06, "loss": 2.065, "step": 131 }, { "epoch": 1.5179856115107913, "grad_norm": 0.46331682801246643, "learning_rate": 3.5764705882352948e-06, "loss": 1.93, "step": 132 }, { "epoch": 1.5294964028776978, "grad_norm": 0.47538137435913086, "learning_rate": 3.564705882352941e-06, "loss": 2.0613, "step": 133 }, { "epoch": 1.5410071942446044, "grad_norm": 0.5323189496994019, "learning_rate": 3.5529411764705884e-06, "loss": 2.0175, "step": 134 }, { "epoch": 1.5525179856115108, "grad_norm": 0.4505595266819, "learning_rate": 3.5411764705882356e-06, "loss": 1.9124, "step": 135 }, { "epoch": 1.5640287769784171, "grad_norm": 0.4327537417411804, "learning_rate": 3.529411764705883e-06, "loss": 1.9633, "step": 136 }, { "epoch": 1.5755395683453237, "grad_norm": 0.497213214635849, "learning_rate": 3.5176470588235297e-06, "loss": 1.9673, "step": 137 }, { "epoch": 1.5870503597122303, "grad_norm": 0.4779418706893921, "learning_rate": 3.5058823529411765e-06, "loss": 2.0615, "step": 138 }, { "epoch": 1.5985611510791367, "grad_norm": 0.45400741696357727, "learning_rate": 3.4941176470588238e-06, "loss": 2.0023, "step": 139 }, { "epoch": 1.610071942446043, "grad_norm": 0.4582999646663666, "learning_rate": 3.4823529411764706e-06, "loss": 2.104, "step": 140 }, { "epoch": 1.6215827338129496, "grad_norm": 0.436869353055954, "learning_rate": 3.470588235294118e-06, "loss": 2.0327, "step": 141 }, { "epoch": 1.6330935251798562, "grad_norm": 0.49151161313056946, "learning_rate": 3.458823529411765e-06, "loss": 1.8815, "step": 142 }, { "epoch": 1.6446043165467625, "grad_norm": 0.4952029585838318, "learning_rate": 3.4470588235294123e-06, "loss": 1.9371, "step": 143 }, { "epoch": 1.6561151079136691, "grad_norm": 0.4617934226989746, "learning_rate": 3.4352941176470587e-06, "loss": 2.0492, "step": 144 }, { "epoch": 1.6676258992805755, "grad_norm": 0.45421457290649414, "learning_rate": 3.423529411764706e-06, "loss": 1.9799, "step": 145 }, { "epoch": 1.679136690647482, "grad_norm": 0.45641016960144043, "learning_rate": 3.4117647058823532e-06, "loss": 2.1129, "step": 146 }, { "epoch": 1.6906474820143886, "grad_norm": 0.4528232514858246, "learning_rate": 3.4000000000000005e-06, "loss": 1.9968, "step": 147 }, { "epoch": 1.702158273381295, "grad_norm": 0.425224632024765, "learning_rate": 3.3882352941176473e-06, "loss": 2.0766, "step": 148 }, { "epoch": 1.7136690647482014, "grad_norm": 0.44971901178359985, "learning_rate": 3.3764705882352946e-06, "loss": 1.951, "step": 149 }, { "epoch": 1.725179856115108, "grad_norm": 0.5100486278533936, "learning_rate": 3.364705882352942e-06, "loss": 2.0126, "step": 150 }, { "epoch": 1.7366906474820145, "grad_norm": 0.43650567531585693, "learning_rate": 3.352941176470588e-06, "loss": 2.0752, "step": 151 }, { "epoch": 1.7482014388489209, "grad_norm": 0.4769532382488251, "learning_rate": 3.3411764705882354e-06, "loss": 2.005, "step": 152 }, { "epoch": 1.7597122302158272, "grad_norm": 0.4705875813961029, "learning_rate": 3.3294117647058827e-06, "loss": 2.0744, "step": 153 }, { "epoch": 1.7712230215827338, "grad_norm": 0.4628766179084778, "learning_rate": 3.31764705882353e-06, "loss": 2.108, "step": 154 }, { "epoch": 1.7827338129496404, "grad_norm": 0.4407738149166107, "learning_rate": 3.3058823529411763e-06, "loss": 1.9779, "step": 155 }, { "epoch": 1.7942446043165468, "grad_norm": 0.500913679599762, "learning_rate": 3.2941176470588236e-06, "loss": 1.8435, "step": 156 }, { "epoch": 1.8057553956834531, "grad_norm": 0.4094080626964569, "learning_rate": 3.282352941176471e-06, "loss": 1.972, "step": 157 }, { "epoch": 1.8172661870503597, "grad_norm": 0.46937379240989685, "learning_rate": 3.270588235294118e-06, "loss": 1.8564, "step": 158 }, { "epoch": 1.8287769784172663, "grad_norm": 0.46270328760147095, "learning_rate": 3.258823529411765e-06, "loss": 2.0629, "step": 159 }, { "epoch": 1.8402877697841726, "grad_norm": 0.452856183052063, "learning_rate": 3.247058823529412e-06, "loss": 2.0334, "step": 160 }, { "epoch": 1.851798561151079, "grad_norm": 0.4600447714328766, "learning_rate": 3.2352941176470594e-06, "loss": 1.9833, "step": 161 }, { "epoch": 1.8633093525179856, "grad_norm": 0.5156863927841187, "learning_rate": 3.2235294117647058e-06, "loss": 2.1083, "step": 162 }, { "epoch": 1.8748201438848922, "grad_norm": 0.4730561077594757, "learning_rate": 3.211764705882353e-06, "loss": 2.016, "step": 163 }, { "epoch": 1.8863309352517985, "grad_norm": 0.4580685496330261, "learning_rate": 3.2000000000000003e-06, "loss": 2.0209, "step": 164 }, { "epoch": 1.8978417266187049, "grad_norm": 0.438728004693985, "learning_rate": 3.1882352941176475e-06, "loss": 1.9078, "step": 165 }, { "epoch": 1.9093525179856115, "grad_norm": 0.4413028061389923, "learning_rate": 3.1764705882352943e-06, "loss": 1.8775, "step": 166 }, { "epoch": 1.920863309352518, "grad_norm": 0.4299080967903137, "learning_rate": 3.1647058823529416e-06, "loss": 2.0034, "step": 167 }, { "epoch": 1.9323741007194246, "grad_norm": 0.47266408801078796, "learning_rate": 3.1529411764705884e-06, "loss": 1.8302, "step": 168 }, { "epoch": 1.943884892086331, "grad_norm": 0.4524175226688385, "learning_rate": 3.1411764705882357e-06, "loss": 1.967, "step": 169 }, { "epoch": 1.9553956834532373, "grad_norm": 0.39914512634277344, "learning_rate": 3.1294117647058825e-06, "loss": 2.0307, "step": 170 }, { "epoch": 1.966906474820144, "grad_norm": 0.47265124320983887, "learning_rate": 3.1176470588235297e-06, "loss": 2.0066, "step": 171 }, { "epoch": 1.9784172661870505, "grad_norm": 0.3866676986217499, "learning_rate": 3.105882352941177e-06, "loss": 2.0306, "step": 172 }, { "epoch": 1.9899280575539569, "grad_norm": 0.4875778555870056, "learning_rate": 3.0941176470588234e-06, "loss": 1.926, "step": 173 }, { "epoch": 2.0, "grad_norm": 0.467013418674469, "learning_rate": 3.0823529411764706e-06, "loss": 2.0708, "step": 174 }, { "epoch": 2.0115107913669066, "grad_norm": 0.4312443435192108, "learning_rate": 3.070588235294118e-06, "loss": 1.9997, "step": 175 }, { "epoch": 2.023021582733813, "grad_norm": 0.44231534004211426, "learning_rate": 3.058823529411765e-06, "loss": 2.0316, "step": 176 }, { "epoch": 2.0345323741007193, "grad_norm": 0.43418049812316895, "learning_rate": 3.047058823529412e-06, "loss": 1.898, "step": 177 }, { "epoch": 2.046043165467626, "grad_norm": 0.42247310280799866, "learning_rate": 3.035294117647059e-06, "loss": 1.9524, "step": 178 }, { "epoch": 2.0575539568345325, "grad_norm": 0.47551754117012024, "learning_rate": 3.0235294117647064e-06, "loss": 1.9299, "step": 179 }, { "epoch": 2.069064748201439, "grad_norm": 0.39652958512306213, "learning_rate": 3.011764705882353e-06, "loss": 2.0617, "step": 180 }, { "epoch": 2.080575539568345, "grad_norm": 0.41868913173675537, "learning_rate": 3e-06, "loss": 1.9897, "step": 181 }, { "epoch": 2.0920863309352518, "grad_norm": 0.39825567603111267, "learning_rate": 2.9882352941176473e-06, "loss": 1.9624, "step": 182 }, { "epoch": 2.1035971223021583, "grad_norm": 0.4380688965320587, "learning_rate": 2.9764705882352946e-06, "loss": 1.9904, "step": 183 }, { "epoch": 2.115107913669065, "grad_norm": 0.39882150292396545, "learning_rate": 2.9647058823529414e-06, "loss": 1.9084, "step": 184 }, { "epoch": 2.126618705035971, "grad_norm": 0.4040652811527252, "learning_rate": 2.9529411764705882e-06, "loss": 2.1459, "step": 185 }, { "epoch": 2.1381294964028776, "grad_norm": 0.4166700839996338, "learning_rate": 2.9411764705882355e-06, "loss": 2.0026, "step": 186 }, { "epoch": 2.149640287769784, "grad_norm": 0.41756269335746765, "learning_rate": 2.9294117647058827e-06, "loss": 2.0675, "step": 187 }, { "epoch": 2.161151079136691, "grad_norm": 0.41332557797431946, "learning_rate": 2.9176470588235295e-06, "loss": 1.9391, "step": 188 }, { "epoch": 2.172661870503597, "grad_norm": 0.4298039972782135, "learning_rate": 2.9058823529411768e-06, "loss": 1.8671, "step": 189 }, { "epoch": 2.1841726618705035, "grad_norm": 0.45794206857681274, "learning_rate": 2.894117647058824e-06, "loss": 1.9466, "step": 190 }, { "epoch": 2.19568345323741, "grad_norm": 0.3940126597881317, "learning_rate": 2.8823529411764704e-06, "loss": 1.9557, "step": 191 }, { "epoch": 2.2071942446043167, "grad_norm": 0.45463988184928894, "learning_rate": 2.8705882352941177e-06, "loss": 1.9659, "step": 192 }, { "epoch": 2.218705035971223, "grad_norm": 0.4931933879852295, "learning_rate": 2.858823529411765e-06, "loss": 1.9761, "step": 193 }, { "epoch": 2.2302158273381294, "grad_norm": 0.42029982805252075, "learning_rate": 2.847058823529412e-06, "loss": 2.0658, "step": 194 }, { "epoch": 2.241726618705036, "grad_norm": 0.4499173164367676, "learning_rate": 2.835294117647059e-06, "loss": 2.0426, "step": 195 }, { "epoch": 2.2532374100719426, "grad_norm": 0.42561689019203186, "learning_rate": 2.8235294117647062e-06, "loss": 1.9447, "step": 196 }, { "epoch": 2.2647482014388487, "grad_norm": 0.4154174327850342, "learning_rate": 2.8117647058823535e-06, "loss": 1.8771, "step": 197 }, { "epoch": 2.2762589928057553, "grad_norm": 0.40316736698150635, "learning_rate": 2.8000000000000003e-06, "loss": 2.0181, "step": 198 }, { "epoch": 2.287769784172662, "grad_norm": 0.4346086084842682, "learning_rate": 2.788235294117647e-06, "loss": 1.9526, "step": 199 }, { "epoch": 2.2992805755395684, "grad_norm": 0.4031694829463959, "learning_rate": 2.7764705882352944e-06, "loss": 2.0371, "step": 200 }, { "epoch": 2.310791366906475, "grad_norm": 0.46281856298446655, "learning_rate": 2.7647058823529416e-06, "loss": 1.9795, "step": 201 }, { "epoch": 2.322302158273381, "grad_norm": 0.41117939352989197, "learning_rate": 2.7529411764705884e-06, "loss": 1.93, "step": 202 }, { "epoch": 2.3338129496402877, "grad_norm": 0.4585975110530853, "learning_rate": 2.7411764705882353e-06, "loss": 2.0624, "step": 203 }, { "epoch": 2.3453237410071943, "grad_norm": 0.43110424280166626, "learning_rate": 2.7294117647058825e-06, "loss": 1.9551, "step": 204 }, { "epoch": 2.356834532374101, "grad_norm": 0.44794800877571106, "learning_rate": 2.7176470588235297e-06, "loss": 1.9633, "step": 205 }, { "epoch": 2.368345323741007, "grad_norm": 0.44358718395233154, "learning_rate": 2.7058823529411766e-06, "loss": 1.992, "step": 206 }, { "epoch": 2.3798561151079136, "grad_norm": 0.3666572868824005, "learning_rate": 2.694117647058824e-06, "loss": 1.9791, "step": 207 }, { "epoch": 2.39136690647482, "grad_norm": 0.4338827431201935, "learning_rate": 2.682352941176471e-06, "loss": 1.9804, "step": 208 }, { "epoch": 2.402877697841727, "grad_norm": 0.3984374701976776, "learning_rate": 2.6705882352941175e-06, "loss": 2.0183, "step": 209 }, { "epoch": 2.414388489208633, "grad_norm": 0.44970276951789856, "learning_rate": 2.6588235294117647e-06, "loss": 1.9441, "step": 210 }, { "epoch": 2.4258992805755395, "grad_norm": 0.38036495447158813, "learning_rate": 2.647058823529412e-06, "loss": 2.0069, "step": 211 }, { "epoch": 2.437410071942446, "grad_norm": 0.40496423840522766, "learning_rate": 2.635294117647059e-06, "loss": 2.0429, "step": 212 }, { "epoch": 2.4489208633093527, "grad_norm": 0.4080514907836914, "learning_rate": 2.623529411764706e-06, "loss": 1.9265, "step": 213 }, { "epoch": 2.460431654676259, "grad_norm": 0.4119996130466461, "learning_rate": 2.6117647058823533e-06, "loss": 2.0675, "step": 214 }, { "epoch": 2.4719424460431654, "grad_norm": 0.4465183615684509, "learning_rate": 2.6e-06, "loss": 1.9325, "step": 215 }, { "epoch": 2.483453237410072, "grad_norm": 0.4121825397014618, "learning_rate": 2.5882352941176473e-06, "loss": 1.9198, "step": 216 }, { "epoch": 2.4949640287769785, "grad_norm": 0.41370537877082825, "learning_rate": 2.576470588235294e-06, "loss": 2.0955, "step": 217 }, { "epoch": 2.506474820143885, "grad_norm": 0.44053828716278076, "learning_rate": 2.5647058823529414e-06, "loss": 2.0383, "step": 218 }, { "epoch": 2.5179856115107913, "grad_norm": 0.42168861627578735, "learning_rate": 2.5529411764705887e-06, "loss": 1.9718, "step": 219 }, { "epoch": 2.529496402877698, "grad_norm": 0.4006345570087433, "learning_rate": 2.541176470588235e-06, "loss": 1.9606, "step": 220 }, { "epoch": 2.5410071942446044, "grad_norm": 0.40757259726524353, "learning_rate": 2.5294117647058823e-06, "loss": 2.0171, "step": 221 }, { "epoch": 2.5525179856115106, "grad_norm": 0.40500617027282715, "learning_rate": 2.5176470588235295e-06, "loss": 1.9153, "step": 222 }, { "epoch": 2.564028776978417, "grad_norm": 0.36510592699050903, "learning_rate": 2.505882352941177e-06, "loss": 1.8415, "step": 223 }, { "epoch": 2.5755395683453237, "grad_norm": 0.44662347435951233, "learning_rate": 2.4941176470588236e-06, "loss": 1.9275, "step": 224 }, { "epoch": 2.5870503597122303, "grad_norm": 0.4446622431278229, "learning_rate": 2.482352941176471e-06, "loss": 2.002, "step": 225 }, { "epoch": 2.598561151079137, "grad_norm": 0.46108031272888184, "learning_rate": 2.470588235294118e-06, "loss": 1.9739, "step": 226 }, { "epoch": 2.610071942446043, "grad_norm": 0.42075315117836, "learning_rate": 2.458823529411765e-06, "loss": 1.93, "step": 227 }, { "epoch": 2.6215827338129496, "grad_norm": 0.4763178527355194, "learning_rate": 2.447058823529412e-06, "loss": 1.8973, "step": 228 }, { "epoch": 2.633093525179856, "grad_norm": 0.39183807373046875, "learning_rate": 2.435294117647059e-06, "loss": 1.9485, "step": 229 }, { "epoch": 2.6446043165467623, "grad_norm": 0.4360307455062866, "learning_rate": 2.423529411764706e-06, "loss": 1.902, "step": 230 }, { "epoch": 2.656115107913669, "grad_norm": 0.46272391080856323, "learning_rate": 2.411764705882353e-06, "loss": 1.9535, "step": 231 }, { "epoch": 2.6676258992805755, "grad_norm": 0.43482983112335205, "learning_rate": 2.4000000000000003e-06, "loss": 1.9474, "step": 232 }, { "epoch": 2.679136690647482, "grad_norm": 0.38162457942962646, "learning_rate": 2.388235294117647e-06, "loss": 1.935, "step": 233 }, { "epoch": 2.6906474820143886, "grad_norm": 0.4534943401813507, "learning_rate": 2.3764705882352944e-06, "loss": 2.0316, "step": 234 }, { "epoch": 2.702158273381295, "grad_norm": 0.3955429792404175, "learning_rate": 2.3647058823529416e-06, "loss": 1.9407, "step": 235 }, { "epoch": 2.7136690647482014, "grad_norm": 0.3862835466861725, "learning_rate": 2.3529411764705885e-06, "loss": 1.7705, "step": 236 }, { "epoch": 2.725179856115108, "grad_norm": 0.42534133791923523, "learning_rate": 2.3411764705882357e-06, "loss": 1.9679, "step": 237 }, { "epoch": 2.7366906474820145, "grad_norm": 0.42100828886032104, "learning_rate": 2.3294117647058825e-06, "loss": 1.8997, "step": 238 }, { "epoch": 2.7482014388489207, "grad_norm": 0.4313439726829529, "learning_rate": 2.3176470588235293e-06, "loss": 1.865, "step": 239 }, { "epoch": 2.7597122302158272, "grad_norm": 0.42461809515953064, "learning_rate": 2.3058823529411766e-06, "loss": 1.8169, "step": 240 }, { "epoch": 2.771223021582734, "grad_norm": 0.39618465304374695, "learning_rate": 2.2941176470588234e-06, "loss": 1.8453, "step": 241 }, { "epoch": 2.7827338129496404, "grad_norm": 0.4130411148071289, "learning_rate": 2.2823529411764707e-06, "loss": 1.901, "step": 242 }, { "epoch": 2.794244604316547, "grad_norm": 0.40345144271850586, "learning_rate": 2.270588235294118e-06, "loss": 1.9913, "step": 243 }, { "epoch": 2.805755395683453, "grad_norm": 0.4513832926750183, "learning_rate": 2.258823529411765e-06, "loss": 1.9226, "step": 244 }, { "epoch": 2.8172661870503597, "grad_norm": 0.4180731475353241, "learning_rate": 2.247058823529412e-06, "loss": 1.9364, "step": 245 }, { "epoch": 2.8287769784172663, "grad_norm": 0.40142112970352173, "learning_rate": 2.2352941176470592e-06, "loss": 2.0484, "step": 246 }, { "epoch": 2.8402877697841724, "grad_norm": 0.42125147581100464, "learning_rate": 2.223529411764706e-06, "loss": 1.8964, "step": 247 }, { "epoch": 2.851798561151079, "grad_norm": 0.4866432249546051, "learning_rate": 2.2117647058823533e-06, "loss": 1.8468, "step": 248 }, { "epoch": 2.8633093525179856, "grad_norm": 0.43598270416259766, "learning_rate": 2.2e-06, "loss": 1.8239, "step": 249 }, { "epoch": 2.874820143884892, "grad_norm": 0.39764901995658875, "learning_rate": 2.188235294117647e-06, "loss": 1.9577, "step": 250 }, { "epoch": 2.8863309352517987, "grad_norm": 0.43190810084342957, "learning_rate": 2.176470588235294e-06, "loss": 1.8573, "step": 251 }, { "epoch": 2.897841726618705, "grad_norm": 0.43591439723968506, "learning_rate": 2.1647058823529414e-06, "loss": 1.9726, "step": 252 }, { "epoch": 2.9093525179856115, "grad_norm": 0.37023991346359253, "learning_rate": 2.1529411764705887e-06, "loss": 1.9219, "step": 253 }, { "epoch": 2.920863309352518, "grad_norm": 0.4382263422012329, "learning_rate": 2.1411764705882355e-06, "loss": 2.0037, "step": 254 }, { "epoch": 2.9323741007194246, "grad_norm": 0.4027315378189087, "learning_rate": 2.1294117647058827e-06, "loss": 1.7579, "step": 255 }, { "epoch": 2.9438848920863308, "grad_norm": 0.41298389434814453, "learning_rate": 2.1176470588235296e-06, "loss": 1.895, "step": 256 }, { "epoch": 2.9553956834532373, "grad_norm": 0.39726293087005615, "learning_rate": 2.105882352941177e-06, "loss": 1.9757, "step": 257 }, { "epoch": 2.966906474820144, "grad_norm": 0.4210617244243622, "learning_rate": 2.0941176470588236e-06, "loss": 1.9105, "step": 258 }, { "epoch": 2.9784172661870505, "grad_norm": 0.4142284095287323, "learning_rate": 2.0823529411764705e-06, "loss": 1.8864, "step": 259 }, { "epoch": 2.989928057553957, "grad_norm": 0.3998337686061859, "learning_rate": 2.0705882352941177e-06, "loss": 1.9273, "step": 260 }, { "epoch": 3.0, "grad_norm": 0.5197608470916748, "learning_rate": 2.058823529411765e-06, "loss": 1.9653, "step": 261 }, { "epoch": 3.0115107913669066, "grad_norm": 0.41634294390678406, "learning_rate": 2.047058823529412e-06, "loss": 2.0373, "step": 262 }, { "epoch": 3.023021582733813, "grad_norm": 0.45880216360092163, "learning_rate": 2.035294117647059e-06, "loss": 2.0428, "step": 263 }, { "epoch": 3.0345323741007193, "grad_norm": 0.4628404974937439, "learning_rate": 2.0235294117647063e-06, "loss": 1.9155, "step": 264 }, { "epoch": 3.046043165467626, "grad_norm": 0.434393972158432, "learning_rate": 2.011764705882353e-06, "loss": 1.9765, "step": 265 }, { "epoch": 3.0575539568345325, "grad_norm": 0.3955315053462982, "learning_rate": 2.0000000000000003e-06, "loss": 1.8562, "step": 266 }, { "epoch": 3.069064748201439, "grad_norm": 0.38154056668281555, "learning_rate": 1.988235294117647e-06, "loss": 1.9882, "step": 267 }, { "epoch": 3.080575539568345, "grad_norm": 0.39588648080825806, "learning_rate": 1.976470588235294e-06, "loss": 1.966, "step": 268 }, { "epoch": 3.0920863309352518, "grad_norm": 0.3876536786556244, "learning_rate": 1.9647058823529412e-06, "loss": 1.9866, "step": 269 }, { "epoch": 3.1035971223021583, "grad_norm": 0.4039998948574066, "learning_rate": 1.9529411764705885e-06, "loss": 1.8997, "step": 270 }, { "epoch": 3.115107913669065, "grad_norm": 0.4637863039970398, "learning_rate": 1.9411764705882353e-06, "loss": 1.8496, "step": 271 }, { "epoch": 3.126618705035971, "grad_norm": 0.37885233759880066, "learning_rate": 1.9294117647058825e-06, "loss": 1.9608, "step": 272 }, { "epoch": 3.1381294964028776, "grad_norm": 0.39651158452033997, "learning_rate": 1.9176470588235298e-06, "loss": 1.9621, "step": 273 }, { "epoch": 3.149640287769784, "grad_norm": 0.4272053837776184, "learning_rate": 1.9058823529411766e-06, "loss": 2.0647, "step": 274 }, { "epoch": 3.161151079136691, "grad_norm": 0.36566904187202454, "learning_rate": 1.8941176470588239e-06, "loss": 1.9796, "step": 275 }, { "epoch": 3.172661870503597, "grad_norm": 0.4320291578769684, "learning_rate": 1.8823529411764707e-06, "loss": 1.8802, "step": 276 }, { "epoch": 3.1841726618705035, "grad_norm": 0.43051987886428833, "learning_rate": 1.870588235294118e-06, "loss": 1.8472, "step": 277 }, { "epoch": 3.19568345323741, "grad_norm": 0.40524744987487793, "learning_rate": 1.858823529411765e-06, "loss": 1.978, "step": 278 }, { "epoch": 3.2071942446043167, "grad_norm": 0.40227210521698, "learning_rate": 1.8470588235294118e-06, "loss": 1.8278, "step": 279 }, { "epoch": 3.218705035971223, "grad_norm": 0.4132155478000641, "learning_rate": 1.835294117647059e-06, "loss": 1.8649, "step": 280 }, { "epoch": 3.2302158273381294, "grad_norm": 0.44230976700782776, "learning_rate": 1.8235294117647058e-06, "loss": 2.0472, "step": 281 }, { "epoch": 3.241726618705036, "grad_norm": 0.41526058316230774, "learning_rate": 1.811764705882353e-06, "loss": 1.9339, "step": 282 }, { "epoch": 3.2532374100719426, "grad_norm": 0.4362848401069641, "learning_rate": 1.8000000000000001e-06, "loss": 2.0415, "step": 283 }, { "epoch": 3.2647482014388487, "grad_norm": 0.36819082498550415, "learning_rate": 1.7882352941176474e-06, "loss": 1.9617, "step": 284 }, { "epoch": 3.2762589928057553, "grad_norm": 0.3629878759384155, "learning_rate": 1.7764705882352942e-06, "loss": 1.9392, "step": 285 }, { "epoch": 3.287769784172662, "grad_norm": 0.39181169867515564, "learning_rate": 1.7647058823529414e-06, "loss": 1.9113, "step": 286 }, { "epoch": 3.2992805755395684, "grad_norm": 0.38480955362319946, "learning_rate": 1.7529411764705883e-06, "loss": 1.796, "step": 287 }, { "epoch": 3.310791366906475, "grad_norm": 0.3901033103466034, "learning_rate": 1.7411764705882353e-06, "loss": 1.8803, "step": 288 }, { "epoch": 3.322302158273381, "grad_norm": 0.43883225321769714, "learning_rate": 1.7294117647058825e-06, "loss": 1.9635, "step": 289 }, { "epoch": 3.3338129496402877, "grad_norm": 0.36902785301208496, "learning_rate": 1.7176470588235294e-06, "loss": 1.9661, "step": 290 }, { "epoch": 3.3453237410071943, "grad_norm": 0.39565619826316833, "learning_rate": 1.7058823529411766e-06, "loss": 2.0416, "step": 291 }, { "epoch": 3.356834532374101, "grad_norm": 0.4097813367843628, "learning_rate": 1.6941176470588237e-06, "loss": 1.9529, "step": 292 }, { "epoch": 3.368345323741007, "grad_norm": 0.43599432706832886, "learning_rate": 1.682352941176471e-06, "loss": 2.011, "step": 293 }, { "epoch": 3.3798561151079136, "grad_norm": 0.3928837180137634, "learning_rate": 1.6705882352941177e-06, "loss": 1.8357, "step": 294 }, { "epoch": 3.39136690647482, "grad_norm": 0.40635019540786743, "learning_rate": 1.658823529411765e-06, "loss": 1.8737, "step": 295 }, { "epoch": 3.402877697841727, "grad_norm": 0.43016231060028076, "learning_rate": 1.6470588235294118e-06, "loss": 1.9542, "step": 296 }, { "epoch": 3.414388489208633, "grad_norm": 0.478292852640152, "learning_rate": 1.635294117647059e-06, "loss": 1.8912, "step": 297 }, { "epoch": 3.4258992805755395, "grad_norm": 0.40000081062316895, "learning_rate": 1.623529411764706e-06, "loss": 1.9357, "step": 298 }, { "epoch": 3.437410071942446, "grad_norm": 0.4033874273300171, "learning_rate": 1.6117647058823529e-06, "loss": 1.906, "step": 299 }, { "epoch": 3.4489208633093527, "grad_norm": 0.40947261452674866, "learning_rate": 1.6000000000000001e-06, "loss": 1.8173, "step": 300 }, { "epoch": 3.460431654676259, "grad_norm": 0.4071550965309143, "learning_rate": 1.5882352941176472e-06, "loss": 1.9455, "step": 301 }, { "epoch": 3.4719424460431654, "grad_norm": 0.4430578052997589, "learning_rate": 1.5764705882352942e-06, "loss": 1.8767, "step": 302 }, { "epoch": 3.483453237410072, "grad_norm": 0.41929903626441956, "learning_rate": 1.5647058823529412e-06, "loss": 1.9888, "step": 303 }, { "epoch": 3.4949640287769785, "grad_norm": 0.3640955090522766, "learning_rate": 1.5529411764705885e-06, "loss": 1.8597, "step": 304 }, { "epoch": 3.506474820143885, "grad_norm": 0.4024275839328766, "learning_rate": 1.5411764705882353e-06, "loss": 1.8771, "step": 305 }, { "epoch": 3.5179856115107913, "grad_norm": 0.38399186730384827, "learning_rate": 1.5294117647058826e-06, "loss": 1.9956, "step": 306 }, { "epoch": 3.529496402877698, "grad_norm": 0.39859694242477417, "learning_rate": 1.5176470588235296e-06, "loss": 1.9475, "step": 307 }, { "epoch": 3.5410071942446044, "grad_norm": 0.4364980459213257, "learning_rate": 1.5058823529411764e-06, "loss": 1.9514, "step": 308 }, { "epoch": 3.5525179856115106, "grad_norm": 0.4755602180957794, "learning_rate": 1.4941176470588237e-06, "loss": 1.9057, "step": 309 }, { "epoch": 3.564028776978417, "grad_norm": 0.39024367928504944, "learning_rate": 1.4823529411764707e-06, "loss": 1.9041, "step": 310 }, { "epoch": 3.5755395683453237, "grad_norm": 0.4508678615093231, "learning_rate": 1.4705882352941177e-06, "loss": 2.0103, "step": 311 }, { "epoch": 3.5870503597122303, "grad_norm": 0.4054012894630432, "learning_rate": 1.4588235294117648e-06, "loss": 2.0046, "step": 312 }, { "epoch": 3.598561151079137, "grad_norm": 0.40608781576156616, "learning_rate": 1.447058823529412e-06, "loss": 1.9209, "step": 313 }, { "epoch": 3.610071942446043, "grad_norm": 0.4027564525604248, "learning_rate": 1.4352941176470588e-06, "loss": 1.9453, "step": 314 }, { "epoch": 3.6215827338129496, "grad_norm": 0.37097108364105225, "learning_rate": 1.423529411764706e-06, "loss": 1.8778, "step": 315 }, { "epoch": 3.633093525179856, "grad_norm": 0.3786408007144928, "learning_rate": 1.4117647058823531e-06, "loss": 2.0226, "step": 316 }, { "epoch": 3.6446043165467623, "grad_norm": 0.5143160223960876, "learning_rate": 1.4000000000000001e-06, "loss": 1.8967, "step": 317 }, { "epoch": 3.656115107913669, "grad_norm": 0.4117061197757721, "learning_rate": 1.3882352941176472e-06, "loss": 1.9639, "step": 318 }, { "epoch": 3.6676258992805755, "grad_norm": 0.38462352752685547, "learning_rate": 1.3764705882352942e-06, "loss": 1.9823, "step": 319 }, { "epoch": 3.679136690647482, "grad_norm": 0.4092719852924347, "learning_rate": 1.3647058823529413e-06, "loss": 1.9542, "step": 320 }, { "epoch": 3.6906474820143886, "grad_norm": 0.3653268814086914, "learning_rate": 1.3529411764705883e-06, "loss": 1.9841, "step": 321 }, { "epoch": 3.702158273381295, "grad_norm": 0.46952006220817566, "learning_rate": 1.3411764705882355e-06, "loss": 1.8227, "step": 322 }, { "epoch": 3.7136690647482014, "grad_norm": 0.3843960464000702, "learning_rate": 1.3294117647058824e-06, "loss": 1.9256, "step": 323 }, { "epoch": 3.725179856115108, "grad_norm": 0.4532316327095032, "learning_rate": 1.3176470588235296e-06, "loss": 1.8819, "step": 324 }, { "epoch": 3.7366906474820145, "grad_norm": 0.4273243248462677, "learning_rate": 1.3058823529411766e-06, "loss": 1.7333, "step": 325 }, { "epoch": 3.7482014388489207, "grad_norm": 0.3750397264957428, "learning_rate": 1.2941176470588237e-06, "loss": 1.8715, "step": 326 }, { "epoch": 3.7597122302158272, "grad_norm": 0.37180638313293457, "learning_rate": 1.2823529411764707e-06, "loss": 1.8691, "step": 327 }, { "epoch": 3.771223021582734, "grad_norm": 0.45777222514152527, "learning_rate": 1.2705882352941175e-06, "loss": 2.0797, "step": 328 }, { "epoch": 3.7827338129496404, "grad_norm": 0.40992122888565063, "learning_rate": 1.2588235294117648e-06, "loss": 1.8158, "step": 329 }, { "epoch": 3.794244604316547, "grad_norm": 0.39492303133010864, "learning_rate": 1.2470588235294118e-06, "loss": 1.9402, "step": 330 }, { "epoch": 3.805755395683453, "grad_norm": 0.4207254946231842, "learning_rate": 1.235294117647059e-06, "loss": 1.9019, "step": 331 }, { "epoch": 3.8172661870503597, "grad_norm": 0.41245564818382263, "learning_rate": 1.223529411764706e-06, "loss": 1.9962, "step": 332 }, { "epoch": 3.8287769784172663, "grad_norm": 0.4352625906467438, "learning_rate": 1.211764705882353e-06, "loss": 1.9475, "step": 333 }, { "epoch": 3.8402877697841724, "grad_norm": 0.44661152362823486, "learning_rate": 1.2000000000000002e-06, "loss": 1.9815, "step": 334 }, { "epoch": 3.851798561151079, "grad_norm": 0.3469794988632202, "learning_rate": 1.1882352941176472e-06, "loss": 1.9558, "step": 335 }, { "epoch": 3.8633093525179856, "grad_norm": 0.3915042281150818, "learning_rate": 1.1764705882352942e-06, "loss": 1.9432, "step": 336 }, { "epoch": 3.874820143884892, "grad_norm": 0.40242835879325867, "learning_rate": 1.1647058823529413e-06, "loss": 2.0347, "step": 337 }, { "epoch": 3.8863309352517987, "grad_norm": 0.4240435063838959, "learning_rate": 1.1529411764705883e-06, "loss": 1.8546, "step": 338 }, { "epoch": 3.897841726618705, "grad_norm": 0.3874111771583557, "learning_rate": 1.1411764705882353e-06, "loss": 1.8743, "step": 339 }, { "epoch": 3.9093525179856115, "grad_norm": 0.4088236391544342, "learning_rate": 1.1294117647058826e-06, "loss": 1.9511, "step": 340 }, { "epoch": 3.920863309352518, "grad_norm": 0.4264145493507385, "learning_rate": 1.1176470588235296e-06, "loss": 1.8457, "step": 341 }, { "epoch": 3.9323741007194246, "grad_norm": 0.4625447392463684, "learning_rate": 1.1058823529411766e-06, "loss": 2.0004, "step": 342 }, { "epoch": 3.9438848920863308, "grad_norm": 0.3717285692691803, "learning_rate": 1.0941176470588235e-06, "loss": 2.018, "step": 343 }, { "epoch": 3.9553956834532373, "grad_norm": 0.38859859108924866, "learning_rate": 1.0823529411764707e-06, "loss": 2.0653, "step": 344 }, { "epoch": 3.966906474820144, "grad_norm": 0.4414234161376953, "learning_rate": 1.0705882352941177e-06, "loss": 1.9468, "step": 345 }, { "epoch": 3.9784172661870505, "grad_norm": 0.44937196373939514, "learning_rate": 1.0588235294117648e-06, "loss": 1.9817, "step": 346 }, { "epoch": 3.989928057553957, "grad_norm": 0.4258635938167572, "learning_rate": 1.0470588235294118e-06, "loss": 1.8073, "step": 347 }, { "epoch": 4.0, "grad_norm": 0.43822839856147766, "learning_rate": 1.0352941176470589e-06, "loss": 1.8991, "step": 348 }, { "epoch": 4.011510791366907, "grad_norm": 0.46056368947029114, "learning_rate": 1.023529411764706e-06, "loss": 1.9822, "step": 349 }, { "epoch": 4.023021582733813, "grad_norm": 0.37518858909606934, "learning_rate": 1.0117647058823531e-06, "loss": 1.813, "step": 350 }, { "epoch": 4.03453237410072, "grad_norm": 0.3921293020248413, "learning_rate": 1.0000000000000002e-06, "loss": 2.0461, "step": 351 }, { "epoch": 4.046043165467626, "grad_norm": 0.4025590419769287, "learning_rate": 9.88235294117647e-07, "loss": 1.9639, "step": 352 }, { "epoch": 4.057553956834532, "grad_norm": 0.40453991293907166, "learning_rate": 9.764705882352942e-07, "loss": 1.9503, "step": 353 }, { "epoch": 4.069064748201439, "grad_norm": 0.35269466042518616, "learning_rate": 9.647058823529413e-07, "loss": 1.9183, "step": 354 }, { "epoch": 4.080575539568345, "grad_norm": 0.4840276837348938, "learning_rate": 9.529411764705883e-07, "loss": 1.8452, "step": 355 }, { "epoch": 4.092086330935252, "grad_norm": 0.39707890152931213, "learning_rate": 9.411764705882353e-07, "loss": 1.9937, "step": 356 }, { "epoch": 4.103597122302158, "grad_norm": 0.4407122731208801, "learning_rate": 9.294117647058825e-07, "loss": 2.022, "step": 357 }, { "epoch": 4.115107913669065, "grad_norm": 0.41454342007637024, "learning_rate": 9.176470588235295e-07, "loss": 1.8907, "step": 358 }, { "epoch": 4.1266187050359715, "grad_norm": 0.45531314611434937, "learning_rate": 9.058823529411765e-07, "loss": 1.8662, "step": 359 }, { "epoch": 4.138129496402878, "grad_norm": 0.4606649577617645, "learning_rate": 8.941176470588237e-07, "loss": 1.8686, "step": 360 }, { "epoch": 4.149640287769784, "grad_norm": 0.3887675106525421, "learning_rate": 8.823529411764707e-07, "loss": 1.9282, "step": 361 }, { "epoch": 4.16115107913669, "grad_norm": 0.40791404247283936, "learning_rate": 8.705882352941177e-07, "loss": 1.8935, "step": 362 }, { "epoch": 4.172661870503597, "grad_norm": 0.4134286046028137, "learning_rate": 8.588235294117647e-07, "loss": 1.9222, "step": 363 }, { "epoch": 4.1841726618705035, "grad_norm": 0.45545920729637146, "learning_rate": 8.470588235294118e-07, "loss": 1.7464, "step": 364 }, { "epoch": 4.19568345323741, "grad_norm": 0.3929649889469147, "learning_rate": 8.352941176470589e-07, "loss": 1.9524, "step": 365 }, { "epoch": 4.207194244604317, "grad_norm": 0.3917909860610962, "learning_rate": 8.235294117647059e-07, "loss": 1.8608, "step": 366 }, { "epoch": 4.218705035971223, "grad_norm": 0.3615923821926117, "learning_rate": 8.11764705882353e-07, "loss": 2.0196, "step": 367 }, { "epoch": 4.23021582733813, "grad_norm": 0.42296525835990906, "learning_rate": 8.000000000000001e-07, "loss": 2.0074, "step": 368 }, { "epoch": 4.2417266187050355, "grad_norm": 0.38392674922943115, "learning_rate": 7.882352941176471e-07, "loss": 1.8838, "step": 369 }, { "epoch": 4.253237410071942, "grad_norm": 0.45736461877822876, "learning_rate": 7.764705882352942e-07, "loss": 1.9283, "step": 370 }, { "epoch": 4.264748201438849, "grad_norm": 0.4002780020236969, "learning_rate": 7.647058823529413e-07, "loss": 1.9358, "step": 371 }, { "epoch": 4.276258992805755, "grad_norm": 0.4492432475090027, "learning_rate": 7.529411764705882e-07, "loss": 1.9168, "step": 372 }, { "epoch": 4.287769784172662, "grad_norm": 0.4120420813560486, "learning_rate": 7.411764705882353e-07, "loss": 2.0267, "step": 373 }, { "epoch": 4.299280575539568, "grad_norm": 0.37060075998306274, "learning_rate": 7.294117647058824e-07, "loss": 1.7903, "step": 374 }, { "epoch": 4.310791366906475, "grad_norm": 0.41300690174102783, "learning_rate": 7.176470588235294e-07, "loss": 1.9742, "step": 375 }, { "epoch": 4.322302158273382, "grad_norm": 0.39079055190086365, "learning_rate": 7.058823529411766e-07, "loss": 1.9725, "step": 376 }, { "epoch": 4.333812949640288, "grad_norm": 0.37425902485847473, "learning_rate": 6.941176470588236e-07, "loss": 1.9327, "step": 377 }, { "epoch": 4.345323741007194, "grad_norm": 0.39660215377807617, "learning_rate": 6.823529411764706e-07, "loss": 1.9264, "step": 378 }, { "epoch": 4.3568345323741005, "grad_norm": 0.3530445098876953, "learning_rate": 6.705882352941178e-07, "loss": 2.0301, "step": 379 }, { "epoch": 4.368345323741007, "grad_norm": 0.3451977074146271, "learning_rate": 6.588235294117648e-07, "loss": 1.9497, "step": 380 }, { "epoch": 4.379856115107914, "grad_norm": 0.39704129099845886, "learning_rate": 6.470588235294118e-07, "loss": 2.0197, "step": 381 }, { "epoch": 4.39136690647482, "grad_norm": 0.4510452151298523, "learning_rate": 6.352941176470588e-07, "loss": 2.0232, "step": 382 }, { "epoch": 4.402877697841727, "grad_norm": 0.40996772050857544, "learning_rate": 6.235294117647059e-07, "loss": 1.9544, "step": 383 }, { "epoch": 4.414388489208633, "grad_norm": 0.39487242698669434, "learning_rate": 6.11764705882353e-07, "loss": 1.906, "step": 384 }, { "epoch": 4.42589928057554, "grad_norm": 0.38208380341529846, "learning_rate": 6.000000000000001e-07, "loss": 1.9622, "step": 385 }, { "epoch": 4.437410071942446, "grad_norm": 0.4923550486564636, "learning_rate": 5.882352941176471e-07, "loss": 1.8734, "step": 386 }, { "epoch": 4.448920863309352, "grad_norm": 0.3934495151042938, "learning_rate": 5.764705882352941e-07, "loss": 1.9016, "step": 387 }, { "epoch": 4.460431654676259, "grad_norm": 0.3597968518733978, "learning_rate": 5.647058823529413e-07, "loss": 1.8471, "step": 388 }, { "epoch": 4.471942446043165, "grad_norm": 0.3755582273006439, "learning_rate": 5.529411764705883e-07, "loss": 1.9416, "step": 389 }, { "epoch": 4.483453237410072, "grad_norm": 0.41233447194099426, "learning_rate": 5.411764705882354e-07, "loss": 1.8175, "step": 390 }, { "epoch": 4.4949640287769785, "grad_norm": 0.43287456035614014, "learning_rate": 5.294117647058824e-07, "loss": 2.0551, "step": 391 }, { "epoch": 4.506474820143885, "grad_norm": 0.3722149133682251, "learning_rate": 5.176470588235294e-07, "loss": 1.8064, "step": 392 }, { "epoch": 4.517985611510792, "grad_norm": 0.36640405654907227, "learning_rate": 5.058823529411766e-07, "loss": 1.8697, "step": 393 }, { "epoch": 4.529496402877697, "grad_norm": 0.42729923129081726, "learning_rate": 4.941176470588235e-07, "loss": 1.9544, "step": 394 }, { "epoch": 4.541007194244604, "grad_norm": 0.4456847012042999, "learning_rate": 4.823529411764706e-07, "loss": 1.9493, "step": 395 }, { "epoch": 4.5525179856115106, "grad_norm": 0.37357428669929504, "learning_rate": 4.7058823529411767e-07, "loss": 1.8166, "step": 396 }, { "epoch": 4.564028776978417, "grad_norm": 0.36978697776794434, "learning_rate": 4.5882352941176476e-07, "loss": 1.9415, "step": 397 }, { "epoch": 4.575539568345324, "grad_norm": 0.4100985825061798, "learning_rate": 4.4705882352941184e-07, "loss": 1.9396, "step": 398 }, { "epoch": 4.58705035971223, "grad_norm": 0.4530591368675232, "learning_rate": 4.352941176470588e-07, "loss": 2.0031, "step": 399 }, { "epoch": 4.598561151079137, "grad_norm": 0.43112170696258545, "learning_rate": 4.235294117647059e-07, "loss": 1.8297, "step": 400 }, { "epoch": 4.6100719424460435, "grad_norm": 0.4256054162979126, "learning_rate": 4.1176470588235295e-07, "loss": 1.9466, "step": 401 }, { "epoch": 4.62158273381295, "grad_norm": 0.41112977266311646, "learning_rate": 4.0000000000000003e-07, "loss": 1.9521, "step": 402 }, { "epoch": 4.633093525179856, "grad_norm": 0.4099273681640625, "learning_rate": 3.882352941176471e-07, "loss": 1.8238, "step": 403 }, { "epoch": 4.644604316546762, "grad_norm": 0.3838717043399811, "learning_rate": 3.764705882352941e-07, "loss": 2.0297, "step": 404 }, { "epoch": 4.656115107913669, "grad_norm": 0.40768367052078247, "learning_rate": 3.647058823529412e-07, "loss": 1.763, "step": 405 }, { "epoch": 4.6676258992805755, "grad_norm": 0.4218824803829193, "learning_rate": 3.529411764705883e-07, "loss": 1.9831, "step": 406 }, { "epoch": 4.679136690647482, "grad_norm": 0.4038139581680298, "learning_rate": 3.411764705882353e-07, "loss": 1.9038, "step": 407 }, { "epoch": 4.690647482014389, "grad_norm": 0.444604754447937, "learning_rate": 3.294117647058824e-07, "loss": 1.8462, "step": 408 }, { "epoch": 4.702158273381295, "grad_norm": 0.398887038230896, "learning_rate": 3.176470588235294e-07, "loss": 1.8827, "step": 409 }, { "epoch": 4.713669064748202, "grad_norm": 0.3743375837802887, "learning_rate": 3.058823529411765e-07, "loss": 1.9547, "step": 410 }, { "epoch": 4.725179856115108, "grad_norm": 0.36487120389938354, "learning_rate": 2.9411764705882356e-07, "loss": 1.9927, "step": 411 }, { "epoch": 4.736690647482014, "grad_norm": 0.38175249099731445, "learning_rate": 2.8235294117647064e-07, "loss": 1.8559, "step": 412 }, { "epoch": 4.748201438848921, "grad_norm": 0.45353925228118896, "learning_rate": 2.705882352941177e-07, "loss": 1.8788, "step": 413 }, { "epoch": 4.759712230215827, "grad_norm": 0.38203585147857666, "learning_rate": 2.588235294117647e-07, "loss": 1.9798, "step": 414 }, { "epoch": 4.771223021582734, "grad_norm": 0.4031854271888733, "learning_rate": 2.4705882352941175e-07, "loss": 1.924, "step": 415 }, { "epoch": 4.78273381294964, "grad_norm": 0.39555788040161133, "learning_rate": 2.3529411764705883e-07, "loss": 1.9385, "step": 416 }, { "epoch": 4.794244604316547, "grad_norm": 0.36034807562828064, "learning_rate": 2.2352941176470592e-07, "loss": 1.8572, "step": 417 }, { "epoch": 4.805755395683454, "grad_norm": 0.416274756193161, "learning_rate": 2.1176470588235296e-07, "loss": 1.9249, "step": 418 }, { "epoch": 4.817266187050359, "grad_norm": 0.40581023693084717, "learning_rate": 2.0000000000000002e-07, "loss": 2.0491, "step": 419 }, { "epoch": 4.828776978417266, "grad_norm": 0.433010995388031, "learning_rate": 1.8823529411764705e-07, "loss": 1.9709, "step": 420 }, { "epoch": 4.840287769784172, "grad_norm": 0.37076178193092346, "learning_rate": 1.7647058823529414e-07, "loss": 1.8053, "step": 421 }, { "epoch": 4.851798561151079, "grad_norm": 0.4253133237361908, "learning_rate": 1.647058823529412e-07, "loss": 1.8407, "step": 422 }, { "epoch": 4.863309352517986, "grad_norm": 0.3607207238674164, "learning_rate": 1.5294117647058826e-07, "loss": 1.9195, "step": 423 }, { "epoch": 4.874820143884892, "grad_norm": 0.39863321185112, "learning_rate": 1.4117647058823532e-07, "loss": 1.9297, "step": 424 }, { "epoch": 4.886330935251799, "grad_norm": 0.36778998374938965, "learning_rate": 1.2941176470588236e-07, "loss": 1.9024, "step": 425 }, { "epoch": 4.897841726618705, "grad_norm": 0.4625915288925171, "learning_rate": 1.1764705882352942e-07, "loss": 1.969, "step": 426 }, { "epoch": 4.909352517985612, "grad_norm": 0.39172741770744324, "learning_rate": 1.0588235294117648e-07, "loss": 1.9035, "step": 427 }, { "epoch": 4.920863309352518, "grad_norm": 0.38108983635902405, "learning_rate": 9.411764705882353e-08, "loss": 1.921, "step": 428 }, { "epoch": 4.932374100719424, "grad_norm": 0.4520784020423889, "learning_rate": 8.23529411764706e-08, "loss": 2.0255, "step": 429 }, { "epoch": 4.943884892086331, "grad_norm": 0.4058828055858612, "learning_rate": 7.058823529411766e-08, "loss": 1.9021, "step": 430 }, { "epoch": 4.955395683453237, "grad_norm": 0.38254988193511963, "learning_rate": 5.882352941176471e-08, "loss": 1.865, "step": 431 }, { "epoch": 4.966906474820144, "grad_norm": 0.4428682327270508, "learning_rate": 4.705882352941176e-08, "loss": 1.9649, "step": 432 }, { "epoch": 4.9784172661870505, "grad_norm": 0.41550108790397644, "learning_rate": 3.529411764705883e-08, "loss": 1.9868, "step": 433 }, { "epoch": 4.989928057553957, "grad_norm": 0.430279016494751, "learning_rate": 2.352941176470588e-08, "loss": 1.913, "step": 434 }, { "epoch": 5.0, "grad_norm": 0.4352044463157654, "learning_rate": 1.176470588235294e-08, "loss": 1.9378, "step": 435 } ], "logging_steps": 1, "max_steps": 435, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1776104868752e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }