{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.987012987012987, "eval_steps": 500, "global_step": 360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013852813852813853, "grad_norm": 10.690503047217376, "learning_rate": 2.222222222222222e-06, "loss": 1.664, "step": 1 }, { "epoch": 0.027705627705627706, "grad_norm": 10.549038500876918, "learning_rate": 4.444444444444444e-06, "loss": 1.6687, "step": 2 }, { "epoch": 0.04155844155844156, "grad_norm": 9.952372502868275, "learning_rate": 6.666666666666667e-06, "loss": 1.6436, "step": 3 }, { "epoch": 0.05541125541125541, "grad_norm": 7.525381680312214, "learning_rate": 8.888888888888888e-06, "loss": 1.5751, "step": 4 }, { "epoch": 0.06926406926406926, "grad_norm": 3.6488707097222806, "learning_rate": 1.1111111111111113e-05, "loss": 1.4732, "step": 5 }, { "epoch": 0.08311688311688312, "grad_norm": 5.9440833747387405, "learning_rate": 1.3333333333333333e-05, "loss": 1.4929, "step": 6 }, { "epoch": 0.09696969696969697, "grad_norm": 7.008224469434576, "learning_rate": 1.555555555555556e-05, "loss": 1.4342, "step": 7 }, { "epoch": 0.11082251082251082, "grad_norm": 9.129791969259458, "learning_rate": 1.7777777777777777e-05, "loss": 1.4508, "step": 8 }, { "epoch": 0.12467532467532468, "grad_norm": 7.157661170613076, "learning_rate": 2e-05, "loss": 1.3993, "step": 9 }, { "epoch": 0.13852813852813853, "grad_norm": 5.878397281654449, "learning_rate": 2.2222222222222227e-05, "loss": 1.3716, "step": 10 }, { "epoch": 0.1523809523809524, "grad_norm": 4.04814199716087, "learning_rate": 2.444444444444445e-05, "loss": 1.3279, "step": 11 }, { "epoch": 0.16623376623376623, "grad_norm": 4.367325147342624, "learning_rate": 2.6666666666666667e-05, "loss": 1.2918, "step": 12 }, { "epoch": 0.1800865800865801, "grad_norm": 3.013051181093589, "learning_rate": 2.888888888888889e-05, "loss": 1.2683, "step": 13 }, { "epoch": 0.19393939393939394, "grad_norm": 2.7017616202077597, "learning_rate": 3.111111111111112e-05, "loss": 1.2741, "step": 14 }, { "epoch": 0.2077922077922078, "grad_norm": 2.4447347796035936, "learning_rate": 3.3333333333333335e-05, "loss": 1.2498, "step": 15 }, { "epoch": 0.22164502164502164, "grad_norm": 2.3013073090511016, "learning_rate": 3.555555555555555e-05, "loss": 1.2356, "step": 16 }, { "epoch": 0.2354978354978355, "grad_norm": 2.676331737240606, "learning_rate": 3.777777777777778e-05, "loss": 1.2226, "step": 17 }, { "epoch": 0.24935064935064935, "grad_norm": 1.8653678395700215, "learning_rate": 4e-05, "loss": 1.1883, "step": 18 }, { "epoch": 0.2632034632034632, "grad_norm": 2.489502341694411, "learning_rate": 4.222222222222223e-05, "loss": 1.1903, "step": 19 }, { "epoch": 0.27705627705627706, "grad_norm": 2.2381168497877746, "learning_rate": 4.444444444444445e-05, "loss": 1.1823, "step": 20 }, { "epoch": 0.2909090909090909, "grad_norm": 1.0658561341621282, "learning_rate": 4.666666666666667e-05, "loss": 1.1644, "step": 21 }, { "epoch": 0.3047619047619048, "grad_norm": 3.3353632520282024, "learning_rate": 4.88888888888889e-05, "loss": 1.1866, "step": 22 }, { "epoch": 0.31861471861471863, "grad_norm": 2.0828413940584256, "learning_rate": 5.111111111111111e-05, "loss": 1.1606, "step": 23 }, { "epoch": 0.33246753246753247, "grad_norm": 2.0722285174850334, "learning_rate": 5.333333333333333e-05, "loss": 1.1689, "step": 24 }, { "epoch": 0.3463203463203463, "grad_norm": 2.6579102865439035, "learning_rate": 5.555555555555556e-05, "loss": 1.1555, "step": 25 }, { "epoch": 0.3601731601731602, "grad_norm": 1.9616156182284334, "learning_rate": 5.777777777777778e-05, "loss": 1.1683, "step": 26 }, { "epoch": 0.37402597402597404, "grad_norm": 3.2895161663522225, "learning_rate": 6.000000000000001e-05, "loss": 1.162, "step": 27 }, { "epoch": 0.3878787878787879, "grad_norm": 2.2524763564895447, "learning_rate": 6.222222222222223e-05, "loss": 1.1588, "step": 28 }, { "epoch": 0.4017316017316017, "grad_norm": 2.9587565231476036, "learning_rate": 6.444444444444446e-05, "loss": 1.1477, "step": 29 }, { "epoch": 0.4155844155844156, "grad_norm": 2.0001168739095387, "learning_rate": 6.666666666666667e-05, "loss": 1.1463, "step": 30 }, { "epoch": 0.42943722943722945, "grad_norm": 3.0781839410346756, "learning_rate": 6.88888888888889e-05, "loss": 1.1273, "step": 31 }, { "epoch": 0.4432900432900433, "grad_norm": 2.155490334097704, "learning_rate": 7.11111111111111e-05, "loss": 1.1468, "step": 32 }, { "epoch": 0.45714285714285713, "grad_norm": 2.3875247457053566, "learning_rate": 7.333333333333333e-05, "loss": 1.1379, "step": 33 }, { "epoch": 0.470995670995671, "grad_norm": 1.71586428053475, "learning_rate": 7.555555555555556e-05, "loss": 1.1309, "step": 34 }, { "epoch": 0.48484848484848486, "grad_norm": 2.6858291279872, "learning_rate": 7.777777777777778e-05, "loss": 1.1318, "step": 35 }, { "epoch": 0.4987012987012987, "grad_norm": 1.997759995167864, "learning_rate": 8e-05, "loss": 1.1323, "step": 36 }, { "epoch": 0.5125541125541125, "grad_norm": 2.629649063991005, "learning_rate": 7.999811966028904e-05, "loss": 1.1398, "step": 37 }, { "epoch": 0.5264069264069264, "grad_norm": 2.6927398202491544, "learning_rate": 7.999247881794007e-05, "loss": 1.1272, "step": 38 }, { "epoch": 0.5402597402597402, "grad_norm": 1.0260444389642347, "learning_rate": 7.998307800328803e-05, "loss": 1.1148, "step": 39 }, { "epoch": 0.5541125541125541, "grad_norm": 3.1260836757156496, "learning_rate": 7.996991810016922e-05, "loss": 1.1581, "step": 40 }, { "epoch": 0.567965367965368, "grad_norm": 2.408162449515958, "learning_rate": 7.995300034583802e-05, "loss": 1.1579, "step": 41 }, { "epoch": 0.5818181818181818, "grad_norm": 1.7233621870783713, "learning_rate": 7.993232633085074e-05, "loss": 1.1154, "step": 42 }, { "epoch": 0.5956709956709957, "grad_norm": 3.2143011392314524, "learning_rate": 7.990789799891592e-05, "loss": 1.1361, "step": 43 }, { "epoch": 0.6095238095238096, "grad_norm": 2.541057275107033, "learning_rate": 7.987971764671168e-05, "loss": 1.1437, "step": 44 }, { "epoch": 0.6233766233766234, "grad_norm": 2.554077948353239, "learning_rate": 7.984778792366983e-05, "loss": 1.1278, "step": 45 }, { "epoch": 0.6372294372294373, "grad_norm": 1.9556507030666455, "learning_rate": 7.981211183172663e-05, "loss": 1.125, "step": 46 }, { "epoch": 0.651082251082251, "grad_norm": 2.4591106418916024, "learning_rate": 7.977269272504075e-05, "loss": 1.1113, "step": 47 }, { "epoch": 0.6649350649350649, "grad_norm": 1.7374508763969678, "learning_rate": 7.972953430967773e-05, "loss": 1.1119, "step": 48 }, { "epoch": 0.6787878787878788, "grad_norm": 2.271122042411741, "learning_rate": 7.96826406432617e-05, "loss": 1.1047, "step": 49 }, { "epoch": 0.6926406926406926, "grad_norm": 1.385329225067948, "learning_rate": 7.963201613459381e-05, "loss": 1.1104, "step": 50 }, { "epoch": 0.7064935064935065, "grad_norm": 2.0797667060906853, "learning_rate": 7.957766554323778e-05, "loss": 1.1008, "step": 51 }, { "epoch": 0.7203463203463204, "grad_norm": 1.4769275764871517, "learning_rate": 7.951959397907237e-05, "loss": 1.1063, "step": 52 }, { "epoch": 0.7341991341991342, "grad_norm": 1.5969040026842134, "learning_rate": 7.945780690181096e-05, "loss": 1.0958, "step": 53 }, { "epoch": 0.7480519480519481, "grad_norm": 1.5076777523334957, "learning_rate": 7.939231012048833e-05, "loss": 1.1038, "step": 54 }, { "epoch": 0.7619047619047619, "grad_norm": 1.5353741235556218, "learning_rate": 7.932310979291441e-05, "loss": 1.088, "step": 55 }, { "epoch": 0.7757575757575758, "grad_norm": 1.6688683700597435, "learning_rate": 7.925021242509539e-05, "loss": 1.1005, "step": 56 }, { "epoch": 0.7896103896103897, "grad_norm": 1.5907176050250653, "learning_rate": 7.917362487062207e-05, "loss": 1.0885, "step": 57 }, { "epoch": 0.8034632034632034, "grad_norm": 1.5886283739500444, "learning_rate": 7.909335433002543e-05, "loss": 1.0889, "step": 58 }, { "epoch": 0.8173160173160173, "grad_norm": 1.1345065452265992, "learning_rate": 7.900940835009974e-05, "loss": 1.0809, "step": 59 }, { "epoch": 0.8311688311688312, "grad_norm": 1.6727620200346303, "learning_rate": 7.892179482319297e-05, "loss": 1.0844, "step": 60 }, { "epoch": 0.845021645021645, "grad_norm": 1.726654683160669, "learning_rate": 7.883052198646481e-05, "loss": 1.0868, "step": 61 }, { "epoch": 0.8588744588744589, "grad_norm": 0.7828989407478679, "learning_rate": 7.873559842111225e-05, "loss": 1.0711, "step": 62 }, { "epoch": 0.8727272727272727, "grad_norm": 1.3882694170960725, "learning_rate": 7.863703305156273e-05, "loss": 1.0752, "step": 63 }, { "epoch": 0.8865800865800866, "grad_norm": 1.5779873659792967, "learning_rate": 7.853483514463521e-05, "loss": 1.0766, "step": 64 }, { "epoch": 0.9004329004329005, "grad_norm": 1.4180034460400448, "learning_rate": 7.842901430866882e-05, "loss": 1.0725, "step": 65 }, { "epoch": 0.9142857142857143, "grad_norm": 0.9127219395084748, "learning_rate": 7.831958049261956e-05, "loss": 1.0612, "step": 66 }, { "epoch": 0.9281385281385282, "grad_norm": 1.0847846746337275, "learning_rate": 7.820654398512492e-05, "loss": 1.074, "step": 67 }, { "epoch": 0.941991341991342, "grad_norm": 1.8013647852774308, "learning_rate": 7.808991541353662e-05, "loss": 1.0954, "step": 68 }, { "epoch": 0.9558441558441558, "grad_norm": 1.377128616335908, "learning_rate": 7.796970574292136e-05, "loss": 1.0752, "step": 69 }, { "epoch": 0.9696969696969697, "grad_norm": 1.6958522149590192, "learning_rate": 7.784592627503004e-05, "loss": 1.0821, "step": 70 }, { "epoch": 0.9835497835497835, "grad_norm": 1.0049024746726356, "learning_rate": 7.771858864723504e-05, "loss": 1.068, "step": 71 }, { "epoch": 0.9974025974025974, "grad_norm": 2.6484071234844953, "learning_rate": 7.758770483143634e-05, "loss": 1.0771, "step": 72 }, { "epoch": 1.0112554112554113, "grad_norm": 4.246067022400895, "learning_rate": 7.745328713293573e-05, "loss": 1.948, "step": 73 }, { "epoch": 1.025108225108225, "grad_norm": 1.7220828208048158, "learning_rate": 7.731534818928004e-05, "loss": 1.0427, "step": 74 }, { "epoch": 1.0389610389610389, "grad_norm": 1.8447923963725428, "learning_rate": 7.71739009690729e-05, "loss": 1.0479, "step": 75 }, { "epoch": 1.0528138528138529, "grad_norm": 0.9341938628888585, "learning_rate": 7.702895877075563e-05, "loss": 1.0333, "step": 76 }, { "epoch": 1.0666666666666667, "grad_norm": 2.424773237088678, "learning_rate": 7.688053522135675e-05, "loss": 1.0579, "step": 77 }, { "epoch": 1.0805194805194804, "grad_norm": 1.6058600540175567, "learning_rate": 7.672864427521097e-05, "loss": 1.0636, "step": 78 }, { "epoch": 1.0943722943722944, "grad_norm": 2.091045151793165, "learning_rate": 7.657330021264718e-05, "loss": 1.0442, "step": 79 }, { "epoch": 1.1082251082251082, "grad_norm": 1.318962052033536, "learning_rate": 7.641451763864587e-05, "loss": 1.045, "step": 80 }, { "epoch": 1.122077922077922, "grad_norm": 2.317561720529343, "learning_rate": 7.625231148146601e-05, "loss": 1.0484, "step": 81 }, { "epoch": 1.135930735930736, "grad_norm": 1.4987484149413424, "learning_rate": 7.608669699124153e-05, "loss": 1.0484, "step": 82 }, { "epoch": 1.1497835497835498, "grad_norm": 2.3968225100015816, "learning_rate": 7.591768973854753e-05, "loss": 1.0453, "step": 83 }, { "epoch": 1.1636363636363636, "grad_norm": 2.0769969941809454, "learning_rate": 7.57453056129365e-05, "loss": 1.0473, "step": 84 }, { "epoch": 1.1774891774891776, "grad_norm": 1.5328425512954666, "learning_rate": 7.556956082144425e-05, "loss": 1.0432, "step": 85 }, { "epoch": 1.1913419913419914, "grad_norm": 1.5329379349699184, "learning_rate": 7.539047188706631e-05, "loss": 1.0502, "step": 86 }, { "epoch": 1.2051948051948052, "grad_norm": 1.2635424997786673, "learning_rate": 7.520805564720444e-05, "loss": 1.0389, "step": 87 }, { "epoch": 1.2190476190476192, "grad_norm": 0.9180899722416639, "learning_rate": 7.502232925208365e-05, "loss": 1.0297, "step": 88 }, { "epoch": 1.232900432900433, "grad_norm": 0.9088421536152287, "learning_rate": 7.483331016313969e-05, "loss": 1.026, "step": 89 }, { "epoch": 1.2467532467532467, "grad_norm": 0.9759584263195824, "learning_rate": 7.464101615137756e-05, "loss": 1.042, "step": 90 }, { "epoch": 1.2606060606060607, "grad_norm": 1.7816477052359974, "learning_rate": 7.444546529570055e-05, "loss": 1.0375, "step": 91 }, { "epoch": 1.2744588744588745, "grad_norm": 1.0505006199756568, "learning_rate": 7.424667598121067e-05, "loss": 1.0232, "step": 92 }, { "epoch": 1.2883116883116883, "grad_norm": 1.1076363899720796, "learning_rate": 7.404466689747999e-05, "loss": 1.0358, "step": 93 }, { "epoch": 1.3021645021645023, "grad_norm": 1.766746417129588, "learning_rate": 7.383945703679365e-05, "loss": 1.041, "step": 94 }, { "epoch": 1.316017316017316, "grad_norm": 1.1727210609875833, "learning_rate": 7.363106569236413e-05, "loss": 1.0373, "step": 95 }, { "epoch": 1.3298701298701299, "grad_norm": 1.3811377730593195, "learning_rate": 7.341951245651747e-05, "loss": 1.0232, "step": 96 }, { "epoch": 1.3437229437229437, "grad_norm": 1.8848088994220173, "learning_rate": 7.320481721885116e-05, "loss": 1.0331, "step": 97 }, { "epoch": 1.3575757575757577, "grad_norm": 1.5407669706222948, "learning_rate": 7.298700016436427e-05, "loss": 1.0392, "step": 98 }, { "epoch": 1.3714285714285714, "grad_norm": 1.6439258533934764, "learning_rate": 7.276608177155968e-05, "loss": 1.0302, "step": 99 }, { "epoch": 1.3852813852813852, "grad_norm": 1.6555083210158104, "learning_rate": 7.254208281051871e-05, "loss": 1.0359, "step": 100 }, { "epoch": 1.399134199134199, "grad_norm": 1.2444215446875204, "learning_rate": 7.231502434094845e-05, "loss": 1.0203, "step": 101 }, { "epoch": 1.412987012987013, "grad_norm": 1.4648122676877777, "learning_rate": 7.208492771020176e-05, "loss": 1.0198, "step": 102 }, { "epoch": 1.4268398268398268, "grad_norm": 0.9173692823505156, "learning_rate": 7.185181455127023e-05, "loss": 1.0217, "step": 103 }, { "epoch": 1.4406926406926406, "grad_norm": 1.1009749853774418, "learning_rate": 7.161570678075038e-05, "loss": 1.0128, "step": 104 }, { "epoch": 1.4545454545454546, "grad_norm": 1.0933932370696173, "learning_rate": 7.137662659678303e-05, "loss": 1.0238, "step": 105 }, { "epoch": 1.4683982683982684, "grad_norm": 1.1757437604660779, "learning_rate": 7.113459647696641e-05, "loss": 1.0182, "step": 106 }, { "epoch": 1.4822510822510822, "grad_norm": 0.7527900271083177, "learning_rate": 7.088963917624277e-05, "loss": 1.012, "step": 107 }, { "epoch": 1.4961038961038962, "grad_norm": 1.1702807594476543, "learning_rate": 7.064177772475912e-05, "loss": 1.0264, "step": 108 }, { "epoch": 1.50995670995671, "grad_norm": 0.6981814585755302, "learning_rate": 7.039103542570199e-05, "loss": 1.0151, "step": 109 }, { "epoch": 1.5238095238095237, "grad_norm": 1.1192032445094018, "learning_rate": 7.013743585310642e-05, "loss": 1.0162, "step": 110 }, { "epoch": 1.5376623376623377, "grad_norm": 1.0770568024481744, "learning_rate": 6.988100284963985e-05, "loss": 1.0199, "step": 111 }, { "epoch": 1.5515151515151515, "grad_norm": 1.2005325967972154, "learning_rate": 6.96217605243602e-05, "loss": 1.0242, "step": 112 }, { "epoch": 1.5653679653679653, "grad_norm": 0.7699858239179544, "learning_rate": 6.935973325044941e-05, "loss": 1.0241, "step": 113 }, { "epoch": 1.5792207792207793, "grad_norm": 1.1064626845196381, "learning_rate": 6.909494566292195e-05, "loss": 1.0082, "step": 114 }, { "epoch": 1.593073593073593, "grad_norm": 1.4162206055932687, "learning_rate": 6.882742265630859e-05, "loss": 1.0161, "step": 115 }, { "epoch": 1.6069264069264069, "grad_norm": 0.9857373401383442, "learning_rate": 6.855718938231597e-05, "loss": 1.0223, "step": 116 }, { "epoch": 1.6207792207792209, "grad_norm": 1.4328471449116547, "learning_rate": 6.828427124746191e-05, "loss": 1.0059, "step": 117 }, { "epoch": 1.6346320346320347, "grad_norm": 0.929598786782075, "learning_rate": 6.800869391068674e-05, "loss": 1.0161, "step": 118 }, { "epoch": 1.6484848484848484, "grad_norm": 1.5271277070860276, "learning_rate": 6.773048328094097e-05, "loss": 1.0109, "step": 119 }, { "epoch": 1.6623376623376624, "grad_norm": 0.7369342923177392, "learning_rate": 6.744966551474936e-05, "loss": 1.0187, "step": 120 }, { "epoch": 1.6761904761904762, "grad_norm": 1.1411511227164497, "learning_rate": 6.716626701375174e-05, "loss": 1.0131, "step": 121 }, { "epoch": 1.69004329004329, "grad_norm": 1.2904195611318852, "learning_rate": 6.688031442222091e-05, "loss": 1.0084, "step": 122 }, { "epoch": 1.703896103896104, "grad_norm": 0.5757097623806057, "learning_rate": 6.659183462455751e-05, "loss": 1.0095, "step": 123 }, { "epoch": 1.7177489177489178, "grad_norm": 0.9291802416250161, "learning_rate": 6.630085474276256e-05, "loss": 1.0117, "step": 124 }, { "epoch": 1.7316017316017316, "grad_norm": 1.0033464839111939, "learning_rate": 6.600740213388735e-05, "loss": 1.0055, "step": 125 }, { "epoch": 1.7454545454545456, "grad_norm": 1.0577865447630987, "learning_rate": 6.571150438746157e-05, "loss": 0.9998, "step": 126 }, { "epoch": 1.7593073593073592, "grad_norm": 0.9644457639091424, "learning_rate": 6.54131893228994e-05, "loss": 1.003, "step": 127 }, { "epoch": 1.7731601731601732, "grad_norm": 0.80334378142282, "learning_rate": 6.511248498688396e-05, "loss": 1.0044, "step": 128 }, { "epoch": 1.7870129870129872, "grad_norm": 0.823547694775696, "learning_rate": 6.480941965073041e-05, "loss": 1.0109, "step": 129 }, { "epoch": 1.8008658008658007, "grad_norm": 0.7273863270792912, "learning_rate": 6.450402180772811e-05, "loss": 1.0017, "step": 130 }, { "epoch": 1.8147186147186147, "grad_norm": 0.762963999004941, "learning_rate": 6.419632017046167e-05, "loss": 1.0018, "step": 131 }, { "epoch": 1.8285714285714287, "grad_norm": 0.8148201089426899, "learning_rate": 6.388634366811146e-05, "loss": 0.9993, "step": 132 }, { "epoch": 1.8424242424242423, "grad_norm": 0.8416363889161061, "learning_rate": 6.35741214437338e-05, "loss": 1.0095, "step": 133 }, { "epoch": 1.8562770562770563, "grad_norm": 1.142390867021583, "learning_rate": 6.325968285152107e-05, "loss": 1.0062, "step": 134 }, { "epoch": 1.87012987012987, "grad_norm": 0.7962536559784616, "learning_rate": 6.294305745404185e-05, "loss": 1.0052, "step": 135 }, { "epoch": 1.8839826839826839, "grad_norm": 0.5650336880636371, "learning_rate": 6.262427501946155e-05, "loss": 1.0067, "step": 136 }, { "epoch": 1.8978354978354979, "grad_norm": 0.5818038902731943, "learning_rate": 6.230336551874372e-05, "loss": 1.0063, "step": 137 }, { "epoch": 1.9116883116883117, "grad_norm": 0.9977727916003996, "learning_rate": 6.198035912283225e-05, "loss": 1.0011, "step": 138 }, { "epoch": 1.9255411255411254, "grad_norm": 0.9993134068472553, "learning_rate": 6.165528619981479e-05, "loss": 0.9934, "step": 139 }, { "epoch": 1.9393939393939394, "grad_norm": 0.6309774026937955, "learning_rate": 6.132817731206766e-05, "loss": 1.0023, "step": 140 }, { "epoch": 1.9532467532467532, "grad_norm": 0.5631788726393073, "learning_rate": 6.099906321338241e-05, "loss": 0.9875, "step": 141 }, { "epoch": 1.967099567099567, "grad_norm": 0.6994904876843244, "learning_rate": 6.0667974846074524e-05, "loss": 0.9969, "step": 142 }, { "epoch": 1.980952380952381, "grad_norm": 0.6611818685825782, "learning_rate": 6.033494333807422e-05, "loss": 1.0052, "step": 143 }, { "epoch": 1.9948051948051948, "grad_norm": 0.5004771960590909, "learning_rate": 6.000000000000001e-05, "loss": 0.9857, "step": 144 }, { "epoch": 2.0086580086580086, "grad_norm": 1.0514858543746186, "learning_rate": 5.9663176322214826e-05, "loss": 1.8002, "step": 145 }, { "epoch": 2.0225108225108226, "grad_norm": 1.5590490021626622, "learning_rate": 5.9324503971865545e-05, "loss": 0.9591, "step": 146 }, { "epoch": 2.036363636363636, "grad_norm": 0.613252686965761, "learning_rate": 5.8984014789905625e-05, "loss": 0.9555, "step": 147 }, { "epoch": 2.05021645021645, "grad_norm": 1.5183857859367584, "learning_rate": 5.8641740788101566e-05, "loss": 0.9637, "step": 148 }, { "epoch": 2.064069264069264, "grad_norm": 0.599406946967003, "learning_rate": 5.8297714146023236e-05, "loss": 0.9396, "step": 149 }, { "epoch": 2.0779220779220777, "grad_norm": 1.171195638149606, "learning_rate": 5.79519672080185e-05, "loss": 0.9523, "step": 150 }, { "epoch": 2.0917748917748917, "grad_norm": 0.6714077570634802, "learning_rate": 5.76045324801722e-05, "loss": 0.9595, "step": 151 }, { "epoch": 2.1056277056277057, "grad_norm": 1.2318697934613918, "learning_rate": 5.7255442627250146e-05, "loss": 0.9514, "step": 152 }, { "epoch": 2.1194805194805193, "grad_norm": 0.746989496141657, "learning_rate": 5.6904730469627985e-05, "loss": 0.9482, "step": 153 }, { "epoch": 2.1333333333333333, "grad_norm": 0.901261215101538, "learning_rate": 5.6552428980205575e-05, "loss": 0.9587, "step": 154 }, { "epoch": 2.1471861471861473, "grad_norm": 0.674529916922478, "learning_rate": 5.619857128130695e-05, "loss": 0.9562, "step": 155 }, { "epoch": 2.161038961038961, "grad_norm": 0.8844375890562896, "learning_rate": 5.584319064156628e-05, "loss": 0.9459, "step": 156 }, { "epoch": 2.174891774891775, "grad_norm": 0.5176842951829833, "learning_rate": 5.548632047280003e-05, "loss": 0.9528, "step": 157 }, { "epoch": 2.188744588744589, "grad_norm": 0.6248120662111469, "learning_rate": 5.5127994326865706e-05, "loss": 0.9482, "step": 158 }, { "epoch": 2.2025974025974024, "grad_norm": 0.6093640758186603, "learning_rate": 5.476824589250738e-05, "loss": 0.9429, "step": 159 }, { "epoch": 2.2164502164502164, "grad_norm": 0.5492958980107647, "learning_rate": 5.440710899218842e-05, "loss": 0.9674, "step": 160 }, { "epoch": 2.2303030303030305, "grad_norm": 0.5903789766574798, "learning_rate": 5.404461757891156e-05, "loss": 0.9667, "step": 161 }, { "epoch": 2.244155844155844, "grad_norm": 0.5486871479315714, "learning_rate": 5.368080573302676e-05, "loss": 0.9478, "step": 162 }, { "epoch": 2.258008658008658, "grad_norm": 0.45428134417688254, "learning_rate": 5.331570765902706e-05, "loss": 0.9409, "step": 163 }, { "epoch": 2.271861471861472, "grad_norm": 0.42847012632012216, "learning_rate": 5.294935768233285e-05, "loss": 0.9416, "step": 164 }, { "epoch": 2.2857142857142856, "grad_norm": 0.4848698252601225, "learning_rate": 5.258179024606455e-05, "loss": 0.9463, "step": 165 }, { "epoch": 2.2995670995670996, "grad_norm": 0.3534788789389581, "learning_rate": 5.2213039907804535e-05, "loss": 0.9491, "step": 166 }, { "epoch": 2.3134199134199136, "grad_norm": 0.5082308518432114, "learning_rate": 5.1843141336348e-05, "loss": 0.95, "step": 167 }, { "epoch": 2.327272727272727, "grad_norm": 0.33208032748656197, "learning_rate": 5.1472129308443616e-05, "loss": 0.953, "step": 168 }, { "epoch": 2.341125541125541, "grad_norm": 0.35843782187780426, "learning_rate": 5.1100038705523834e-05, "loss": 0.957, "step": 169 }, { "epoch": 2.354978354978355, "grad_norm": 0.33243645634228375, "learning_rate": 5.07269045104255e-05, "loss": 0.9348, "step": 170 }, { "epoch": 2.3688311688311687, "grad_norm": 0.37544932004082693, "learning_rate": 5.0352761804100835e-05, "loss": 0.9501, "step": 171 }, { "epoch": 2.3826839826839827, "grad_norm": 0.3396549463565156, "learning_rate": 4.9977645762319255e-05, "loss": 0.9548, "step": 172 }, { "epoch": 2.3965367965367967, "grad_norm": 0.27413219762637864, "learning_rate": 4.9601591652360244e-05, "loss": 0.9516, "step": 173 }, { "epoch": 2.4103896103896103, "grad_norm": 0.2935194857656813, "learning_rate": 4.922463482969761e-05, "loss": 0.9537, "step": 174 }, { "epoch": 2.4242424242424243, "grad_norm": 0.31679378581933954, "learning_rate": 4.884681073467551e-05, "loss": 0.9566, "step": 175 }, { "epoch": 2.4380952380952383, "grad_norm": 0.2917510642085385, "learning_rate": 4.846815488917644e-05, "loss": 0.9602, "step": 176 }, { "epoch": 2.451948051948052, "grad_norm": 0.29512012950255556, "learning_rate": 4.808870289328153e-05, "loss": 0.9513, "step": 177 }, { "epoch": 2.465800865800866, "grad_norm": 0.24808203045159094, "learning_rate": 4.7708490421923596e-05, "loss": 0.9453, "step": 178 }, { "epoch": 2.47965367965368, "grad_norm": 0.21937289844225158, "learning_rate": 4.7327553221533074e-05, "loss": 0.9581, "step": 179 }, { "epoch": 2.4935064935064934, "grad_norm": 0.20437241337234358, "learning_rate": 4.694592710667723e-05, "loss": 0.948, "step": 180 }, { "epoch": 2.5073593073593075, "grad_norm": 0.20182625174185811, "learning_rate": 4.656364795669297e-05, "loss": 0.9505, "step": 181 }, { "epoch": 2.5212121212121215, "grad_norm": 0.2157700828054003, "learning_rate": 4.618075171231363e-05, "loss": 0.955, "step": 182 }, { "epoch": 2.535064935064935, "grad_norm": 0.20198999241369922, "learning_rate": 4.579727437228987e-05, "loss": 0.9479, "step": 183 }, { "epoch": 2.548917748917749, "grad_norm": 0.19349997377276865, "learning_rate": 4.541325199000525e-05, "loss": 0.9444, "step": 184 }, { "epoch": 2.562770562770563, "grad_norm": 0.20821593855670595, "learning_rate": 4.502872067008652e-05, "loss": 0.9484, "step": 185 }, { "epoch": 2.5766233766233766, "grad_norm": 0.22714292711765166, "learning_rate": 4.464371656500921e-05, "loss": 0.9478, "step": 186 }, { "epoch": 2.5904761904761906, "grad_norm": 0.22439821970405607, "learning_rate": 4.425827587169873e-05, "loss": 0.9642, "step": 187 }, { "epoch": 2.6043290043290046, "grad_norm": 0.19017166723603593, "learning_rate": 4.387243482812717e-05, "loss": 0.9354, "step": 188 }, { "epoch": 2.618181818181818, "grad_norm": 0.2338760203213592, "learning_rate": 4.348622970990634e-05, "loss": 0.9608, "step": 189 }, { "epoch": 2.632034632034632, "grad_norm": 0.19433184424361064, "learning_rate": 4.309969682687724e-05, "loss": 0.9365, "step": 190 }, { "epoch": 2.6458874458874457, "grad_norm": 0.2006639594796061, "learning_rate": 4.271287251969637e-05, "loss": 0.943, "step": 191 }, { "epoch": 2.6597402597402597, "grad_norm": 0.19675542180216962, "learning_rate": 4.2325793156419035e-05, "loss": 0.9629, "step": 192 }, { "epoch": 2.6735930735930737, "grad_norm": 0.22882862992661218, "learning_rate": 4.193849512908013e-05, "loss": 0.9399, "step": 193 }, { "epoch": 2.6874458874458873, "grad_norm": 0.27628995792251587, "learning_rate": 4.155101485027268e-05, "loss": 0.9517, "step": 194 }, { "epoch": 2.7012987012987013, "grad_norm": 0.25152494788624064, "learning_rate": 4.116338874972446e-05, "loss": 0.9532, "step": 195 }, { "epoch": 2.7151515151515153, "grad_norm": 0.17237631990944813, "learning_rate": 4.077565327087298e-05, "loss": 0.9443, "step": 196 }, { "epoch": 2.729004329004329, "grad_norm": 0.22052058799944804, "learning_rate": 4.0387844867439143e-05, "loss": 0.9384, "step": 197 }, { "epoch": 2.742857142857143, "grad_norm": 0.2821185693525401, "learning_rate": 4e-05, "loss": 0.9506, "step": 198 }, { "epoch": 2.7567099567099564, "grad_norm": 0.23974193332071514, "learning_rate": 3.961215513256086e-05, "loss": 0.944, "step": 199 }, { "epoch": 2.7705627705627704, "grad_norm": 0.23881720962641614, "learning_rate": 3.9224346729127034e-05, "loss": 0.9423, "step": 200 }, { "epoch": 2.7844155844155845, "grad_norm": 0.1774343946075327, "learning_rate": 3.8836611250275546e-05, "loss": 0.9355, "step": 201 }, { "epoch": 2.798268398268398, "grad_norm": 0.23570113544248983, "learning_rate": 3.844898514972733e-05, "loss": 0.9519, "step": 202 }, { "epoch": 2.812121212121212, "grad_norm": 0.21653970566029948, "learning_rate": 3.806150487091989e-05, "loss": 0.951, "step": 203 }, { "epoch": 2.825974025974026, "grad_norm": 0.1881655573837289, "learning_rate": 3.767420684358097e-05, "loss": 0.9425, "step": 204 }, { "epoch": 2.8398268398268396, "grad_norm": 0.19487964543004402, "learning_rate": 3.7287127480303634e-05, "loss": 0.9496, "step": 205 }, { "epoch": 2.8536796536796536, "grad_norm": 0.21940934921746677, "learning_rate": 3.690030317312277e-05, "loss": 0.9326, "step": 206 }, { "epoch": 2.8675324675324676, "grad_norm": 0.22419835861035028, "learning_rate": 3.6513770290093674e-05, "loss": 0.958, "step": 207 }, { "epoch": 2.881385281385281, "grad_norm": 0.20379473922199545, "learning_rate": 3.612756517187284e-05, "loss": 0.9475, "step": 208 }, { "epoch": 2.895238095238095, "grad_norm": 0.15734328009114276, "learning_rate": 3.574172412830127e-05, "loss": 0.9446, "step": 209 }, { "epoch": 2.909090909090909, "grad_norm": 0.2577374514137676, "learning_rate": 3.535628343499079e-05, "loss": 0.9518, "step": 210 }, { "epoch": 2.9229437229437227, "grad_norm": 0.21560632289046236, "learning_rate": 3.49712793299135e-05, "loss": 0.9321, "step": 211 }, { "epoch": 2.9367965367965367, "grad_norm": 0.19086166501572058, "learning_rate": 3.458674800999477e-05, "loss": 0.939, "step": 212 }, { "epoch": 2.9506493506493507, "grad_norm": 0.1635737725085455, "learning_rate": 3.4202725627710136e-05, "loss": 0.9519, "step": 213 }, { "epoch": 2.9645021645021643, "grad_norm": 0.2063878664719065, "learning_rate": 3.3819248287686386e-05, "loss": 0.9408, "step": 214 }, { "epoch": 2.9783549783549783, "grad_norm": 0.21758034147643424, "learning_rate": 3.343635204330704e-05, "loss": 0.9366, "step": 215 }, { "epoch": 2.9922077922077923, "grad_norm": 0.1756516719858461, "learning_rate": 3.305407289332279e-05, "loss": 0.9261, "step": 216 }, { "epoch": 3.006060606060606, "grad_norm": 0.44437950709772883, "learning_rate": 3.267244677846693e-05, "loss": 1.6737, "step": 217 }, { "epoch": 3.01991341991342, "grad_norm": 0.5202547459859553, "learning_rate": 3.229150957807641e-05, "loss": 0.9065, "step": 218 }, { "epoch": 3.033766233766234, "grad_norm": 0.4201768177217496, "learning_rate": 3.191129710671849e-05, "loss": 0.8993, "step": 219 }, { "epoch": 3.0476190476190474, "grad_norm": 0.3469006955241282, "learning_rate": 3.153184511082359e-05, "loss": 0.8924, "step": 220 }, { "epoch": 3.0614718614718615, "grad_norm": 0.34894763894121467, "learning_rate": 3.1153189265324494e-05, "loss": 0.9091, "step": 221 }, { "epoch": 3.0753246753246755, "grad_norm": 0.3951659967368868, "learning_rate": 3.07753651703024e-05, "loss": 0.9103, "step": 222 }, { "epoch": 3.089177489177489, "grad_norm": 0.33506373060928457, "learning_rate": 3.0398408347639773e-05, "loss": 0.8895, "step": 223 }, { "epoch": 3.103030303030303, "grad_norm": 0.2808678451376146, "learning_rate": 3.0022354237680752e-05, "loss": 0.8954, "step": 224 }, { "epoch": 3.116883116883117, "grad_norm": 0.3452617358086684, "learning_rate": 2.9647238195899168e-05, "loss": 0.8954, "step": 225 }, { "epoch": 3.1307359307359306, "grad_norm": 0.32553230647238945, "learning_rate": 2.9273095489574502e-05, "loss": 0.897, "step": 226 }, { "epoch": 3.1445887445887446, "grad_norm": 0.2604914839354281, "learning_rate": 2.889996129447618e-05, "loss": 0.907, "step": 227 }, { "epoch": 3.1584415584415586, "grad_norm": 0.34111866816202957, "learning_rate": 2.8527870691556404e-05, "loss": 0.8981, "step": 228 }, { "epoch": 3.172294372294372, "grad_norm": 0.28026302405180475, "learning_rate": 2.8156858663652015e-05, "loss": 0.9033, "step": 229 }, { "epoch": 3.186147186147186, "grad_norm": 0.26870372034953893, "learning_rate": 2.778696009219548e-05, "loss": 0.9059, "step": 230 }, { "epoch": 3.2, "grad_norm": 0.3798626491614641, "learning_rate": 2.7418209753935464e-05, "loss": 0.8894, "step": 231 }, { "epoch": 3.2138528138528137, "grad_norm": 0.21379716918544014, "learning_rate": 2.7050642317667164e-05, "loss": 0.8937, "step": 232 }, { "epoch": 3.2277056277056277, "grad_norm": 0.31956814421124774, "learning_rate": 2.6684292340972936e-05, "loss": 0.9068, "step": 233 }, { "epoch": 3.2415584415584417, "grad_norm": 0.194502129845176, "learning_rate": 2.6319194266973256e-05, "loss": 0.8999, "step": 234 }, { "epoch": 3.2554112554112553, "grad_norm": 0.25288436825501515, "learning_rate": 2.5955382421088457e-05, "loss": 0.8876, "step": 235 }, { "epoch": 3.2692640692640693, "grad_norm": 0.2045328796636946, "learning_rate": 2.5592891007811594e-05, "loss": 0.9056, "step": 236 }, { "epoch": 3.2831168831168833, "grad_norm": 0.17690924985251477, "learning_rate": 2.523175410749263e-05, "loss": 0.9068, "step": 237 }, { "epoch": 3.296969696969697, "grad_norm": 0.20432688291964138, "learning_rate": 2.4872005673134307e-05, "loss": 0.8916, "step": 238 }, { "epoch": 3.310822510822511, "grad_norm": 0.17738981903795317, "learning_rate": 2.4513679527199986e-05, "loss": 0.9115, "step": 239 }, { "epoch": 3.324675324675325, "grad_norm": 0.16833331057473214, "learning_rate": 2.4156809358433728e-05, "loss": 0.8891, "step": 240 }, { "epoch": 3.3385281385281385, "grad_norm": 0.17407822439034182, "learning_rate": 2.3801428718693055e-05, "loss": 0.8936, "step": 241 }, { "epoch": 3.3523809523809525, "grad_norm": 0.16434385080662373, "learning_rate": 2.3447571019794438e-05, "loss": 0.9079, "step": 242 }, { "epoch": 3.3662337662337665, "grad_norm": 0.1647420511208294, "learning_rate": 2.3095269530372032e-05, "loss": 0.8904, "step": 243 }, { "epoch": 3.38008658008658, "grad_norm": 0.16465200281562736, "learning_rate": 2.274455737274987e-05, "loss": 0.8965, "step": 244 }, { "epoch": 3.393939393939394, "grad_norm": 0.1942259697042446, "learning_rate": 2.239546751982782e-05, "loss": 0.9039, "step": 245 }, { "epoch": 3.407792207792208, "grad_norm": 0.15418958599426286, "learning_rate": 2.2048032791981515e-05, "loss": 0.8921, "step": 246 }, { "epoch": 3.4216450216450216, "grad_norm": 0.15256309020808106, "learning_rate": 2.1702285853976774e-05, "loss": 0.8972, "step": 247 }, { "epoch": 3.4354978354978356, "grad_norm": 0.14590845303296213, "learning_rate": 2.135825921189846e-05, "loss": 0.8967, "step": 248 }, { "epoch": 3.449350649350649, "grad_norm": 0.1756342017642444, "learning_rate": 2.1015985210094385e-05, "loss": 0.9089, "step": 249 }, { "epoch": 3.463203463203463, "grad_norm": 0.14928130402546771, "learning_rate": 2.067549602813446e-05, "loss": 0.9116, "step": 250 }, { "epoch": 3.477056277056277, "grad_norm": 0.19622196885081308, "learning_rate": 2.033682367778518e-05, "loss": 0.9035, "step": 251 }, { "epoch": 3.4909090909090907, "grad_norm": 0.16833682605095, "learning_rate": 2.0000000000000012e-05, "loss": 0.9049, "step": 252 }, { "epoch": 3.5047619047619047, "grad_norm": 0.1700606136967009, "learning_rate": 1.966505666192579e-05, "loss": 0.9013, "step": 253 }, { "epoch": 3.5186147186147188, "grad_norm": 0.1795362591013133, "learning_rate": 1.9332025153925486e-05, "loss": 0.887, "step": 254 }, { "epoch": 3.5324675324675323, "grad_norm": 0.16623457555792936, "learning_rate": 1.90009367866176e-05, "loss": 0.9025, "step": 255 }, { "epoch": 3.5463203463203463, "grad_norm": 0.1724331408670692, "learning_rate": 1.867182268793236e-05, "loss": 0.902, "step": 256 }, { "epoch": 3.5601731601731603, "grad_norm": 0.156738658049747, "learning_rate": 1.8344713800185215e-05, "loss": 0.8935, "step": 257 }, { "epoch": 3.574025974025974, "grad_norm": 0.16288790800709219, "learning_rate": 1.8019640877167763e-05, "loss": 0.898, "step": 258 }, { "epoch": 3.587878787878788, "grad_norm": 0.15690946638171066, "learning_rate": 1.7696634481256293e-05, "loss": 0.8959, "step": 259 }, { "epoch": 3.601731601731602, "grad_norm": 0.16001262583220252, "learning_rate": 1.7375724980538465e-05, "loss": 0.8888, "step": 260 }, { "epoch": 3.6155844155844155, "grad_norm": 0.15064377615121663, "learning_rate": 1.7056942545958167e-05, "loss": 0.9089, "step": 261 }, { "epoch": 3.6294372294372295, "grad_norm": 0.13096790236650285, "learning_rate": 1.6740317148478932e-05, "loss": 0.9055, "step": 262 }, { "epoch": 3.643290043290043, "grad_norm": 0.14921599598853594, "learning_rate": 1.642587855626621e-05, "loss": 0.9154, "step": 263 }, { "epoch": 3.657142857142857, "grad_norm": 0.13367750739235254, "learning_rate": 1.6113656331888563e-05, "loss": 0.8954, "step": 264 }, { "epoch": 3.670995670995671, "grad_norm": 0.14168194296838715, "learning_rate": 1.580367982953833e-05, "loss": 0.8939, "step": 265 }, { "epoch": 3.6848484848484846, "grad_norm": 0.14492593957298525, "learning_rate": 1.5495978192271887e-05, "loss": 0.91, "step": 266 }, { "epoch": 3.6987012987012986, "grad_norm": 0.1316497818256666, "learning_rate": 1.5190580349269604e-05, "loss": 0.9027, "step": 267 }, { "epoch": 3.7125541125541126, "grad_norm": 0.15841380793742146, "learning_rate": 1.4887515013116067e-05, "loss": 0.9106, "step": 268 }, { "epoch": 3.726406926406926, "grad_norm": 0.13126491215447147, "learning_rate": 1.4586810677100608e-05, "loss": 0.8937, "step": 269 }, { "epoch": 3.74025974025974, "grad_norm": 0.1495403663254427, "learning_rate": 1.4288495612538427e-05, "loss": 0.9034, "step": 270 }, { "epoch": 3.754112554112554, "grad_norm": 0.12429246476808327, "learning_rate": 1.3992597866112667e-05, "loss": 0.8975, "step": 271 }, { "epoch": 3.7679653679653677, "grad_norm": 0.13097022929593902, "learning_rate": 1.369914525723746e-05, "loss": 0.8882, "step": 272 }, { "epoch": 3.7818181818181817, "grad_norm": 0.13482171999455558, "learning_rate": 1.3408165375442486e-05, "loss": 0.8906, "step": 273 }, { "epoch": 3.7956709956709958, "grad_norm": 0.12515899928871424, "learning_rate": 1.3119685577779105e-05, "loss": 0.9008, "step": 274 }, { "epoch": 3.8095238095238093, "grad_norm": 0.13069692054136395, "learning_rate": 1.2833732986248277e-05, "loss": 0.8853, "step": 275 }, { "epoch": 3.8233766233766233, "grad_norm": 0.13447223817691295, "learning_rate": 1.2550334485250661e-05, "loss": 0.9051, "step": 276 }, { "epoch": 3.8372294372294373, "grad_norm": 0.12306949358534137, "learning_rate": 1.2269516719059041e-05, "loss": 0.8979, "step": 277 }, { "epoch": 3.851082251082251, "grad_norm": 0.13274764900634733, "learning_rate": 1.1991306089313261e-05, "loss": 0.901, "step": 278 }, { "epoch": 3.864935064935065, "grad_norm": 0.12496506975650054, "learning_rate": 1.1715728752538103e-05, "loss": 0.8851, "step": 279 }, { "epoch": 3.878787878787879, "grad_norm": 0.12342700776133213, "learning_rate": 1.1442810617684046e-05, "loss": 0.8906, "step": 280 }, { "epoch": 3.8926406926406925, "grad_norm": 0.11718555769651504, "learning_rate": 1.1172577343691415e-05, "loss": 0.8945, "step": 281 }, { "epoch": 3.9064935064935065, "grad_norm": 0.11900571530829156, "learning_rate": 1.0905054337078051e-05, "loss": 0.8939, "step": 282 }, { "epoch": 3.9203463203463205, "grad_norm": 0.11761709393948508, "learning_rate": 1.0640266749550593e-05, "loss": 0.8987, "step": 283 }, { "epoch": 3.934199134199134, "grad_norm": 0.12426098474964, "learning_rate": 1.0378239475639823e-05, "loss": 0.8954, "step": 284 }, { "epoch": 3.948051948051948, "grad_norm": 0.11342564958505907, "learning_rate": 1.0118997150360169e-05, "loss": 0.8967, "step": 285 }, { "epoch": 3.961904761904762, "grad_norm": 0.12414751882404233, "learning_rate": 9.862564146893571e-06, "loss": 0.8942, "step": 286 }, { "epoch": 3.9757575757575756, "grad_norm": 0.11821007668599343, "learning_rate": 9.60896457429803e-06, "loss": 0.8981, "step": 287 }, { "epoch": 3.9896103896103896, "grad_norm": 0.11207748566968422, "learning_rate": 9.358222275240884e-06, "loss": 0.8969, "step": 288 }, { "epoch": 4.003463203463204, "grad_norm": 0.24776696231966608, "learning_rate": 9.110360823757235e-06, "loss": 1.6175, "step": 289 }, { "epoch": 4.017316017316017, "grad_norm": 0.1639268139321257, "learning_rate": 8.8654035230336e-06, "loss": 0.8757, "step": 290 }, { "epoch": 4.031168831168831, "grad_norm": 0.1430026414045171, "learning_rate": 8.623373403216972e-06, "loss": 0.8619, "step": 291 }, { "epoch": 4.045021645021645, "grad_norm": 0.13983259059672157, "learning_rate": 8.384293219249633e-06, "loss": 0.875, "step": 292 }, { "epoch": 4.058874458874459, "grad_norm": 0.14776698103121835, "learning_rate": 8.148185448729778e-06, "loss": 0.8712, "step": 293 }, { "epoch": 4.072727272727272, "grad_norm": 0.1453264011082169, "learning_rate": 7.915072289798247e-06, "loss": 0.8859, "step": 294 }, { "epoch": 4.086580086580087, "grad_norm": 0.15943779551259862, "learning_rate": 7.684975659051557e-06, "loss": 0.8662, "step": 295 }, { "epoch": 4.1004329004329, "grad_norm": 0.1456231807293276, "learning_rate": 7.457917189481301e-06, "loss": 0.8774, "step": 296 }, { "epoch": 4.114285714285714, "grad_norm": 0.14340143561096827, "learning_rate": 7.233918228440324e-06, "loss": 0.8774, "step": 297 }, { "epoch": 4.128138528138528, "grad_norm": 0.14023071744580373, "learning_rate": 7.0129998356357295e-06, "loss": 0.863, "step": 298 }, { "epoch": 4.141991341991342, "grad_norm": 0.14172173606520722, "learning_rate": 6.795182781148848e-06, "loss": 0.8767, "step": 299 }, { "epoch": 4.1558441558441555, "grad_norm": 0.1318876467621652, "learning_rate": 6.58048754348255e-06, "loss": 0.8709, "step": 300 }, { "epoch": 4.16969696969697, "grad_norm": 0.1517460979685681, "learning_rate": 6.368934307635881e-06, "loss": 0.8716, "step": 301 }, { "epoch": 4.1835497835497835, "grad_norm": 0.15120519716651545, "learning_rate": 6.160542963206357e-06, "loss": 0.8697, "step": 302 }, { "epoch": 4.197402597402597, "grad_norm": 0.12276533641084203, "learning_rate": 5.955333102520011e-06, "loss": 0.8628, "step": 303 }, { "epoch": 4.2112554112554115, "grad_norm": 0.1303847318332295, "learning_rate": 5.753324018789346e-06, "loss": 0.8708, "step": 304 }, { "epoch": 4.225108225108225, "grad_norm": 0.13706452110864129, "learning_rate": 5.554534704299448e-06, "loss": 0.8566, "step": 305 }, { "epoch": 4.238961038961039, "grad_norm": 0.15781002543920747, "learning_rate": 5.358983848622452e-06, "loss": 0.8764, "step": 306 }, { "epoch": 4.252813852813853, "grad_norm": 0.11520912795530423, "learning_rate": 5.1666898368603195e-06, "loss": 0.8749, "step": 307 }, { "epoch": 4.266666666666667, "grad_norm": 0.11508546810833122, "learning_rate": 4.97767074791637e-06, "loss": 0.8657, "step": 308 }, { "epoch": 4.28051948051948, "grad_norm": 0.14352142083453215, "learning_rate": 4.791944352795561e-06, "loss": 0.8919, "step": 309 }, { "epoch": 4.294372294372295, "grad_norm": 0.13642778141475553, "learning_rate": 4.609528112933688e-06, "loss": 0.8575, "step": 310 }, { "epoch": 4.308225108225108, "grad_norm": 0.11645525287361383, "learning_rate": 4.430439178555759e-06, "loss": 0.874, "step": 311 }, { "epoch": 4.322077922077922, "grad_norm": 0.11198885083380229, "learning_rate": 4.254694387063514e-06, "loss": 0.866, "step": 312 }, { "epoch": 4.335930735930736, "grad_norm": 0.11999719505276203, "learning_rate": 4.082310261452471e-06, "loss": 0.8809, "step": 313 }, { "epoch": 4.34978354978355, "grad_norm": 0.11431861199461578, "learning_rate": 3.913303008758491e-06, "loss": 0.8739, "step": 314 }, { "epoch": 4.363636363636363, "grad_norm": 0.1089763284328194, "learning_rate": 3.747688518534003e-06, "loss": 0.8764, "step": 315 }, { "epoch": 4.377489177489178, "grad_norm": 0.11083535668146678, "learning_rate": 3.585482361354138e-06, "loss": 0.874, "step": 316 }, { "epoch": 4.391341991341991, "grad_norm": 0.10462111723473196, "learning_rate": 3.42669978735283e-06, "loss": 0.8712, "step": 317 }, { "epoch": 4.405194805194805, "grad_norm": 0.11192874060919457, "learning_rate": 3.2713557247890447e-06, "loss": 0.865, "step": 318 }, { "epoch": 4.419047619047619, "grad_norm": 0.0998639300176411, "learning_rate": 3.1194647786432663e-06, "loss": 0.8628, "step": 319 }, { "epoch": 4.432900432900433, "grad_norm": 0.1037388404966585, "learning_rate": 2.9710412292443868e-06, "loss": 0.8744, "step": 320 }, { "epoch": 4.4467532467532465, "grad_norm": 0.10341839983438926, "learning_rate": 2.8260990309270987e-06, "loss": 0.8707, "step": 321 }, { "epoch": 4.460606060606061, "grad_norm": 0.10245055505097513, "learning_rate": 2.6846518107199782e-06, "loss": 0.869, "step": 322 }, { "epoch": 4.4744588744588745, "grad_norm": 0.10245685258161713, "learning_rate": 2.546712867064276e-06, "loss": 0.866, "step": 323 }, { "epoch": 4.488311688311688, "grad_norm": 0.10246348212442796, "learning_rate": 2.4122951685636674e-06, "loss": 0.869, "step": 324 }, { "epoch": 4.5021645021645025, "grad_norm": 0.10133630585516906, "learning_rate": 2.281411352764966e-06, "loss": 0.8661, "step": 325 }, { "epoch": 4.516017316017316, "grad_norm": 0.10385457357599492, "learning_rate": 2.1540737249699893e-06, "loss": 0.8665, "step": 326 }, { "epoch": 4.52987012987013, "grad_norm": 0.09787930849328196, "learning_rate": 2.0302942570786446e-06, "loss": 0.8587, "step": 327 }, { "epoch": 4.543722943722944, "grad_norm": 0.09875061097653641, "learning_rate": 1.9100845864633875e-06, "loss": 0.862, "step": 328 }, { "epoch": 4.557575757575758, "grad_norm": 0.10019109859451927, "learning_rate": 1.793456014875079e-06, "loss": 0.8667, "step": 329 }, { "epoch": 4.571428571428571, "grad_norm": 0.09607007590769094, "learning_rate": 1.6804195073804442e-06, "loss": 0.8609, "step": 330 }, { "epoch": 4.585281385281386, "grad_norm": 0.0995091150688806, "learning_rate": 1.5709856913311795e-06, "loss": 0.8631, "step": 331 }, { "epoch": 4.599134199134199, "grad_norm": 0.10237535339157534, "learning_rate": 1.4651648553647869e-06, "loss": 0.874, "step": 332 }, { "epoch": 4.612987012987013, "grad_norm": 0.09685943360360758, "learning_rate": 1.3629669484372722e-06, "loss": 0.8608, "step": 333 }, { "epoch": 4.626839826839827, "grad_norm": 0.10088872360008577, "learning_rate": 1.2644015788877684e-06, "loss": 0.8776, "step": 334 }, { "epoch": 4.640692640692641, "grad_norm": 0.09659731541025765, "learning_rate": 1.1694780135352013e-06, "loss": 0.8659, "step": 335 }, { "epoch": 4.654545454545454, "grad_norm": 0.09754069143347813, "learning_rate": 1.0782051768070477e-06, "loss": 0.8822, "step": 336 }, { "epoch": 4.668398268398269, "grad_norm": 0.09529068088084004, "learning_rate": 9.905916499002787e-07, "loss": 0.8632, "step": 337 }, { "epoch": 4.682251082251082, "grad_norm": 0.09443098915190634, "learning_rate": 9.066456699745774e-07, "loss": 0.8686, "step": 338 }, { "epoch": 4.696103896103896, "grad_norm": 0.09719204747726426, "learning_rate": 8.263751293779409e-07, "loss": 0.8709, "step": 339 }, { "epoch": 4.70995670995671, "grad_norm": 0.0989300648418707, "learning_rate": 7.497875749046124e-07, "loss": 0.8706, "step": 340 }, { "epoch": 4.723809523809524, "grad_norm": 0.09639393839499397, "learning_rate": 6.768902070856031e-07, "loss": 0.8661, "step": 341 }, { "epoch": 4.7376623376623375, "grad_norm": 0.09557188345066484, "learning_rate": 6.076898795116792e-07, "loss": 0.8662, "step": 342 }, { "epoch": 4.751515151515152, "grad_norm": 0.09944408893779064, "learning_rate": 5.421930981890455e-07, "loss": 0.877, "step": 343 }, { "epoch": 4.7653679653679655, "grad_norm": 0.0977504011176678, "learning_rate": 4.804060209276396e-07, "loss": 0.8658, "step": 344 }, { "epoch": 4.779220779220779, "grad_norm": 0.09464762553229625, "learning_rate": 4.223344567622212e-07, "loss": 0.8718, "step": 345 }, { "epoch": 4.7930735930735935, "grad_norm": 0.09515637845594775, "learning_rate": 3.679838654061874e-07, "loss": 0.8672, "step": 346 }, { "epoch": 4.806926406926407, "grad_norm": 0.09692757545190614, "learning_rate": 3.173593567383071e-07, "loss": 0.8762, "step": 347 }, { "epoch": 4.820779220779221, "grad_norm": 0.09525175615621749, "learning_rate": 2.704656903222791e-07, "loss": 0.8792, "step": 348 }, { "epoch": 4.834632034632035, "grad_norm": 0.09621257866702408, "learning_rate": 2.273072749592631e-07, "loss": 0.864, "step": 349 }, { "epoch": 4.848484848484849, "grad_norm": 0.09435391607466348, "learning_rate": 1.8788816827336686e-07, "loss": 0.8827, "step": 350 }, { "epoch": 4.862337662337662, "grad_norm": 0.09330676760639534, "learning_rate": 1.522120763301782e-07, "loss": 0.8634, "step": 351 }, { "epoch": 4.876190476190477, "grad_norm": 0.09377768092440732, "learning_rate": 1.2028235328831906e-07, "loss": 0.8782, "step": 352 }, { "epoch": 4.89004329004329, "grad_norm": 0.09540719747182097, "learning_rate": 9.21020010840934e-08, "loss": 0.8721, "step": 353 }, { "epoch": 4.903896103896104, "grad_norm": 0.09356725286148478, "learning_rate": 6.767366914927298e-08, "loss": 0.8784, "step": 354 }, { "epoch": 4.917748917748918, "grad_norm": 0.09257225973193513, "learning_rate": 4.699965416198549e-08, "loss": 0.8794, "step": 355 }, { "epoch": 4.931601731601732, "grad_norm": 0.09315617718680014, "learning_rate": 3.0081899830798345e-08, "loss": 0.8658, "step": 356 }, { "epoch": 4.945454545454545, "grad_norm": 0.09320193350709476, "learning_rate": 1.6921996711976028e-08, "loss": 0.8666, "step": 357 }, { "epoch": 4.95930735930736, "grad_norm": 0.09451963386678745, "learning_rate": 7.521182059946342e-09, "loss": 0.866, "step": 358 }, { "epoch": 4.973160173160173, "grad_norm": 0.09250072566157394, "learning_rate": 1.8803397109534715e-09, "loss": 0.8639, "step": 359 }, { "epoch": 4.987012987012987, "grad_norm": 0.09138839375450408, "learning_rate": 0.0, "loss": 0.8814, "step": 360 }, { "epoch": 4.987012987012987, "step": 360, "total_flos": 9.572466247992345e+18, "train_loss": 0.0, "train_runtime": 2.6399, "train_samples_per_second": 69987.374, "train_steps_per_second": 136.369 } ], "logging_steps": 1, "max_steps": 360, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.572466247992345e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }