| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.987012987012987, |
| "eval_steps": 500, |
| "global_step": 360, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.013852813852813853, |
| "grad_norm": 10.690503047217376, |
| "learning_rate": 2.222222222222222e-06, |
| "loss": 1.664, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.027705627705627706, |
| "grad_norm": 10.549038500876918, |
| "learning_rate": 4.444444444444444e-06, |
| "loss": 1.6687, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.04155844155844156, |
| "grad_norm": 9.952372502868275, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 1.6436, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.05541125541125541, |
| "grad_norm": 7.525381680312214, |
| "learning_rate": 8.888888888888888e-06, |
| "loss": 1.5751, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.06926406926406926, |
| "grad_norm": 3.6488707097222806, |
| "learning_rate": 1.1111111111111113e-05, |
| "loss": 1.4732, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.08311688311688312, |
| "grad_norm": 5.9440833747387405, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 1.4929, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.09696969696969697, |
| "grad_norm": 7.008224469434576, |
| "learning_rate": 1.555555555555556e-05, |
| "loss": 1.4342, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.11082251082251082, |
| "grad_norm": 9.129791969259458, |
| "learning_rate": 1.7777777777777777e-05, |
| "loss": 1.4508, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.12467532467532468, |
| "grad_norm": 7.157661170613076, |
| "learning_rate": 2e-05, |
| "loss": 1.3993, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.13852813852813853, |
| "grad_norm": 5.878397281654449, |
| "learning_rate": 2.2222222222222227e-05, |
| "loss": 1.3716, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.1523809523809524, |
| "grad_norm": 4.04814199716087, |
| "learning_rate": 2.444444444444445e-05, |
| "loss": 1.3279, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.16623376623376623, |
| "grad_norm": 4.367325147342624, |
| "learning_rate": 2.6666666666666667e-05, |
| "loss": 1.2918, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.1800865800865801, |
| "grad_norm": 3.013051181093589, |
| "learning_rate": 2.888888888888889e-05, |
| "loss": 1.2683, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.19393939393939394, |
| "grad_norm": 2.7017616202077597, |
| "learning_rate": 3.111111111111112e-05, |
| "loss": 1.2741, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.2077922077922078, |
| "grad_norm": 2.4447347796035936, |
| "learning_rate": 3.3333333333333335e-05, |
| "loss": 1.2498, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.22164502164502164, |
| "grad_norm": 2.3013073090511016, |
| "learning_rate": 3.555555555555555e-05, |
| "loss": 1.2356, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.2354978354978355, |
| "grad_norm": 2.676331737240606, |
| "learning_rate": 3.777777777777778e-05, |
| "loss": 1.2226, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.24935064935064935, |
| "grad_norm": 1.8653678395700215, |
| "learning_rate": 4e-05, |
| "loss": 1.1883, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.2632034632034632, |
| "grad_norm": 2.489502341694411, |
| "learning_rate": 4.222222222222223e-05, |
| "loss": 1.1903, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.27705627705627706, |
| "grad_norm": 2.2381168497877746, |
| "learning_rate": 4.444444444444445e-05, |
| "loss": 1.1823, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.2909090909090909, |
| "grad_norm": 1.0658561341621282, |
| "learning_rate": 4.666666666666667e-05, |
| "loss": 1.1644, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.3047619047619048, |
| "grad_norm": 3.3353632520282024, |
| "learning_rate": 4.88888888888889e-05, |
| "loss": 1.1866, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.31861471861471863, |
| "grad_norm": 2.0828413940584256, |
| "learning_rate": 5.111111111111111e-05, |
| "loss": 1.1606, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.33246753246753247, |
| "grad_norm": 2.0722285174850334, |
| "learning_rate": 5.333333333333333e-05, |
| "loss": 1.1689, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.3463203463203463, |
| "grad_norm": 2.6579102865439035, |
| "learning_rate": 5.555555555555556e-05, |
| "loss": 1.1555, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.3601731601731602, |
| "grad_norm": 1.9616156182284334, |
| "learning_rate": 5.777777777777778e-05, |
| "loss": 1.1683, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.37402597402597404, |
| "grad_norm": 3.2895161663522225, |
| "learning_rate": 6.000000000000001e-05, |
| "loss": 1.162, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.3878787878787879, |
| "grad_norm": 2.2524763564895447, |
| "learning_rate": 6.222222222222223e-05, |
| "loss": 1.1588, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.4017316017316017, |
| "grad_norm": 2.9587565231476036, |
| "learning_rate": 6.444444444444446e-05, |
| "loss": 1.1477, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.4155844155844156, |
| "grad_norm": 2.0001168739095387, |
| "learning_rate": 6.666666666666667e-05, |
| "loss": 1.1463, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.42943722943722945, |
| "grad_norm": 3.0781839410346756, |
| "learning_rate": 6.88888888888889e-05, |
| "loss": 1.1273, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.4432900432900433, |
| "grad_norm": 2.155490334097704, |
| "learning_rate": 7.11111111111111e-05, |
| "loss": 1.1468, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.45714285714285713, |
| "grad_norm": 2.3875247457053566, |
| "learning_rate": 7.333333333333333e-05, |
| "loss": 1.1379, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.470995670995671, |
| "grad_norm": 1.71586428053475, |
| "learning_rate": 7.555555555555556e-05, |
| "loss": 1.1309, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.48484848484848486, |
| "grad_norm": 2.6858291279872, |
| "learning_rate": 7.777777777777778e-05, |
| "loss": 1.1318, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.4987012987012987, |
| "grad_norm": 1.997759995167864, |
| "learning_rate": 8e-05, |
| "loss": 1.1323, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.5125541125541125, |
| "grad_norm": 2.629649063991005, |
| "learning_rate": 7.999811966028904e-05, |
| "loss": 1.1398, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.5264069264069264, |
| "grad_norm": 2.6927398202491544, |
| "learning_rate": 7.999247881794007e-05, |
| "loss": 1.1272, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.5402597402597402, |
| "grad_norm": 1.0260444389642347, |
| "learning_rate": 7.998307800328803e-05, |
| "loss": 1.1148, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.5541125541125541, |
| "grad_norm": 3.1260836757156496, |
| "learning_rate": 7.996991810016922e-05, |
| "loss": 1.1581, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.567965367965368, |
| "grad_norm": 2.408162449515958, |
| "learning_rate": 7.995300034583802e-05, |
| "loss": 1.1579, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.5818181818181818, |
| "grad_norm": 1.7233621870783713, |
| "learning_rate": 7.993232633085074e-05, |
| "loss": 1.1154, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.5956709956709957, |
| "grad_norm": 3.2143011392314524, |
| "learning_rate": 7.990789799891592e-05, |
| "loss": 1.1361, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.6095238095238096, |
| "grad_norm": 2.541057275107033, |
| "learning_rate": 7.987971764671168e-05, |
| "loss": 1.1437, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.6233766233766234, |
| "grad_norm": 2.554077948353239, |
| "learning_rate": 7.984778792366983e-05, |
| "loss": 1.1278, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.6372294372294373, |
| "grad_norm": 1.9556507030666455, |
| "learning_rate": 7.981211183172663e-05, |
| "loss": 1.125, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.651082251082251, |
| "grad_norm": 2.4591106418916024, |
| "learning_rate": 7.977269272504075e-05, |
| "loss": 1.1113, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.6649350649350649, |
| "grad_norm": 1.7374508763969678, |
| "learning_rate": 7.972953430967773e-05, |
| "loss": 1.1119, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.6787878787878788, |
| "grad_norm": 2.271122042411741, |
| "learning_rate": 7.96826406432617e-05, |
| "loss": 1.1047, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.6926406926406926, |
| "grad_norm": 1.385329225067948, |
| "learning_rate": 7.963201613459381e-05, |
| "loss": 1.1104, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.7064935064935065, |
| "grad_norm": 2.0797667060906853, |
| "learning_rate": 7.957766554323778e-05, |
| "loss": 1.1008, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.7203463203463204, |
| "grad_norm": 1.4769275764871517, |
| "learning_rate": 7.951959397907237e-05, |
| "loss": 1.1063, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.7341991341991342, |
| "grad_norm": 1.5969040026842134, |
| "learning_rate": 7.945780690181096e-05, |
| "loss": 1.0958, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.7480519480519481, |
| "grad_norm": 1.5076777523334957, |
| "learning_rate": 7.939231012048833e-05, |
| "loss": 1.1038, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.7619047619047619, |
| "grad_norm": 1.5353741235556218, |
| "learning_rate": 7.932310979291441e-05, |
| "loss": 1.088, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.7757575757575758, |
| "grad_norm": 1.6688683700597435, |
| "learning_rate": 7.925021242509539e-05, |
| "loss": 1.1005, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.7896103896103897, |
| "grad_norm": 1.5907176050250653, |
| "learning_rate": 7.917362487062207e-05, |
| "loss": 1.0885, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.8034632034632034, |
| "grad_norm": 1.5886283739500444, |
| "learning_rate": 7.909335433002543e-05, |
| "loss": 1.0889, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.8173160173160173, |
| "grad_norm": 1.1345065452265992, |
| "learning_rate": 7.900940835009974e-05, |
| "loss": 1.0809, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.8311688311688312, |
| "grad_norm": 1.6727620200346303, |
| "learning_rate": 7.892179482319297e-05, |
| "loss": 1.0844, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.845021645021645, |
| "grad_norm": 1.726654683160669, |
| "learning_rate": 7.883052198646481e-05, |
| "loss": 1.0868, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.8588744588744589, |
| "grad_norm": 0.7828989407478679, |
| "learning_rate": 7.873559842111225e-05, |
| "loss": 1.0711, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.8727272727272727, |
| "grad_norm": 1.3882694170960725, |
| "learning_rate": 7.863703305156273e-05, |
| "loss": 1.0752, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.8865800865800866, |
| "grad_norm": 1.5779873659792967, |
| "learning_rate": 7.853483514463521e-05, |
| "loss": 1.0766, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.9004329004329005, |
| "grad_norm": 1.4180034460400448, |
| "learning_rate": 7.842901430866882e-05, |
| "loss": 1.0725, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.9142857142857143, |
| "grad_norm": 0.9127219395084748, |
| "learning_rate": 7.831958049261956e-05, |
| "loss": 1.0612, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.9281385281385282, |
| "grad_norm": 1.0847846746337275, |
| "learning_rate": 7.820654398512492e-05, |
| "loss": 1.074, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.941991341991342, |
| "grad_norm": 1.8013647852774308, |
| "learning_rate": 7.808991541353662e-05, |
| "loss": 1.0954, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.9558441558441558, |
| "grad_norm": 1.377128616335908, |
| "learning_rate": 7.796970574292136e-05, |
| "loss": 1.0752, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.9696969696969697, |
| "grad_norm": 1.6958522149590192, |
| "learning_rate": 7.784592627503004e-05, |
| "loss": 1.0821, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.9835497835497835, |
| "grad_norm": 1.0049024746726356, |
| "learning_rate": 7.771858864723504e-05, |
| "loss": 1.068, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.9974025974025974, |
| "grad_norm": 2.6484071234844953, |
| "learning_rate": 7.758770483143634e-05, |
| "loss": 1.0771, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.0112554112554113, |
| "grad_norm": 4.246067022400895, |
| "learning_rate": 7.745328713293573e-05, |
| "loss": 1.948, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.025108225108225, |
| "grad_norm": 1.7220828208048158, |
| "learning_rate": 7.731534818928004e-05, |
| "loss": 1.0427, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.0389610389610389, |
| "grad_norm": 1.8447923963725428, |
| "learning_rate": 7.71739009690729e-05, |
| "loss": 1.0479, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.0528138528138529, |
| "grad_norm": 0.9341938628888585, |
| "learning_rate": 7.702895877075563e-05, |
| "loss": 1.0333, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.0666666666666667, |
| "grad_norm": 2.424773237088678, |
| "learning_rate": 7.688053522135675e-05, |
| "loss": 1.0579, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.0805194805194804, |
| "grad_norm": 1.6058600540175567, |
| "learning_rate": 7.672864427521097e-05, |
| "loss": 1.0636, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.0943722943722944, |
| "grad_norm": 2.091045151793165, |
| "learning_rate": 7.657330021264718e-05, |
| "loss": 1.0442, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.1082251082251082, |
| "grad_norm": 1.318962052033536, |
| "learning_rate": 7.641451763864587e-05, |
| "loss": 1.045, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.122077922077922, |
| "grad_norm": 2.317561720529343, |
| "learning_rate": 7.625231148146601e-05, |
| "loss": 1.0484, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.135930735930736, |
| "grad_norm": 1.4987484149413424, |
| "learning_rate": 7.608669699124153e-05, |
| "loss": 1.0484, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.1497835497835498, |
| "grad_norm": 2.3968225100015816, |
| "learning_rate": 7.591768973854753e-05, |
| "loss": 1.0453, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.1636363636363636, |
| "grad_norm": 2.0769969941809454, |
| "learning_rate": 7.57453056129365e-05, |
| "loss": 1.0473, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.1774891774891776, |
| "grad_norm": 1.5328425512954666, |
| "learning_rate": 7.556956082144425e-05, |
| "loss": 1.0432, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.1913419913419914, |
| "grad_norm": 1.5329379349699184, |
| "learning_rate": 7.539047188706631e-05, |
| "loss": 1.0502, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.2051948051948052, |
| "grad_norm": 1.2635424997786673, |
| "learning_rate": 7.520805564720444e-05, |
| "loss": 1.0389, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.2190476190476192, |
| "grad_norm": 0.9180899722416639, |
| "learning_rate": 7.502232925208365e-05, |
| "loss": 1.0297, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.232900432900433, |
| "grad_norm": 0.9088421536152287, |
| "learning_rate": 7.483331016313969e-05, |
| "loss": 1.026, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.2467532467532467, |
| "grad_norm": 0.9759584263195824, |
| "learning_rate": 7.464101615137756e-05, |
| "loss": 1.042, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.2606060606060607, |
| "grad_norm": 1.7816477052359974, |
| "learning_rate": 7.444546529570055e-05, |
| "loss": 1.0375, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.2744588744588745, |
| "grad_norm": 1.0505006199756568, |
| "learning_rate": 7.424667598121067e-05, |
| "loss": 1.0232, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.2883116883116883, |
| "grad_norm": 1.1076363899720796, |
| "learning_rate": 7.404466689747999e-05, |
| "loss": 1.0358, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.3021645021645023, |
| "grad_norm": 1.766746417129588, |
| "learning_rate": 7.383945703679365e-05, |
| "loss": 1.041, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.316017316017316, |
| "grad_norm": 1.1727210609875833, |
| "learning_rate": 7.363106569236413e-05, |
| "loss": 1.0373, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.3298701298701299, |
| "grad_norm": 1.3811377730593195, |
| "learning_rate": 7.341951245651747e-05, |
| "loss": 1.0232, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.3437229437229437, |
| "grad_norm": 1.8848088994220173, |
| "learning_rate": 7.320481721885116e-05, |
| "loss": 1.0331, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.3575757575757577, |
| "grad_norm": 1.5407669706222948, |
| "learning_rate": 7.298700016436427e-05, |
| "loss": 1.0392, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.3714285714285714, |
| "grad_norm": 1.6439258533934764, |
| "learning_rate": 7.276608177155968e-05, |
| "loss": 1.0302, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.3852813852813852, |
| "grad_norm": 1.6555083210158104, |
| "learning_rate": 7.254208281051871e-05, |
| "loss": 1.0359, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.399134199134199, |
| "grad_norm": 1.2444215446875204, |
| "learning_rate": 7.231502434094845e-05, |
| "loss": 1.0203, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.412987012987013, |
| "grad_norm": 1.4648122676877777, |
| "learning_rate": 7.208492771020176e-05, |
| "loss": 1.0198, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.4268398268398268, |
| "grad_norm": 0.9173692823505156, |
| "learning_rate": 7.185181455127023e-05, |
| "loss": 1.0217, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.4406926406926406, |
| "grad_norm": 1.1009749853774418, |
| "learning_rate": 7.161570678075038e-05, |
| "loss": 1.0128, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.4545454545454546, |
| "grad_norm": 1.0933932370696173, |
| "learning_rate": 7.137662659678303e-05, |
| "loss": 1.0238, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.4683982683982684, |
| "grad_norm": 1.1757437604660779, |
| "learning_rate": 7.113459647696641e-05, |
| "loss": 1.0182, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.4822510822510822, |
| "grad_norm": 0.7527900271083177, |
| "learning_rate": 7.088963917624277e-05, |
| "loss": 1.012, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.4961038961038962, |
| "grad_norm": 1.1702807594476543, |
| "learning_rate": 7.064177772475912e-05, |
| "loss": 1.0264, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.50995670995671, |
| "grad_norm": 0.6981814585755302, |
| "learning_rate": 7.039103542570199e-05, |
| "loss": 1.0151, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.5238095238095237, |
| "grad_norm": 1.1192032445094018, |
| "learning_rate": 7.013743585310642e-05, |
| "loss": 1.0162, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.5376623376623377, |
| "grad_norm": 1.0770568024481744, |
| "learning_rate": 6.988100284963985e-05, |
| "loss": 1.0199, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.5515151515151515, |
| "grad_norm": 1.2005325967972154, |
| "learning_rate": 6.96217605243602e-05, |
| "loss": 1.0242, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.5653679653679653, |
| "grad_norm": 0.7699858239179544, |
| "learning_rate": 6.935973325044941e-05, |
| "loss": 1.0241, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.5792207792207793, |
| "grad_norm": 1.1064626845196381, |
| "learning_rate": 6.909494566292195e-05, |
| "loss": 1.0082, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.593073593073593, |
| "grad_norm": 1.4162206055932687, |
| "learning_rate": 6.882742265630859e-05, |
| "loss": 1.0161, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.6069264069264069, |
| "grad_norm": 0.9857373401383442, |
| "learning_rate": 6.855718938231597e-05, |
| "loss": 1.0223, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.6207792207792209, |
| "grad_norm": 1.4328471449116547, |
| "learning_rate": 6.828427124746191e-05, |
| "loss": 1.0059, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.6346320346320347, |
| "grad_norm": 0.929598786782075, |
| "learning_rate": 6.800869391068674e-05, |
| "loss": 1.0161, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.6484848484848484, |
| "grad_norm": 1.5271277070860276, |
| "learning_rate": 6.773048328094097e-05, |
| "loss": 1.0109, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.6623376623376624, |
| "grad_norm": 0.7369342923177392, |
| "learning_rate": 6.744966551474936e-05, |
| "loss": 1.0187, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.6761904761904762, |
| "grad_norm": 1.1411511227164497, |
| "learning_rate": 6.716626701375174e-05, |
| "loss": 1.0131, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.69004329004329, |
| "grad_norm": 1.2904195611318852, |
| "learning_rate": 6.688031442222091e-05, |
| "loss": 1.0084, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.703896103896104, |
| "grad_norm": 0.5757097623806057, |
| "learning_rate": 6.659183462455751e-05, |
| "loss": 1.0095, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.7177489177489178, |
| "grad_norm": 0.9291802416250161, |
| "learning_rate": 6.630085474276256e-05, |
| "loss": 1.0117, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.7316017316017316, |
| "grad_norm": 1.0033464839111939, |
| "learning_rate": 6.600740213388735e-05, |
| "loss": 1.0055, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.7454545454545456, |
| "grad_norm": 1.0577865447630987, |
| "learning_rate": 6.571150438746157e-05, |
| "loss": 0.9998, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.7593073593073592, |
| "grad_norm": 0.9644457639091424, |
| "learning_rate": 6.54131893228994e-05, |
| "loss": 1.003, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.7731601731601732, |
| "grad_norm": 0.80334378142282, |
| "learning_rate": 6.511248498688396e-05, |
| "loss": 1.0044, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.7870129870129872, |
| "grad_norm": 0.823547694775696, |
| "learning_rate": 6.480941965073041e-05, |
| "loss": 1.0109, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.8008658008658007, |
| "grad_norm": 0.7273863270792912, |
| "learning_rate": 6.450402180772811e-05, |
| "loss": 1.0017, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.8147186147186147, |
| "grad_norm": 0.762963999004941, |
| "learning_rate": 6.419632017046167e-05, |
| "loss": 1.0018, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.8285714285714287, |
| "grad_norm": 0.8148201089426899, |
| "learning_rate": 6.388634366811146e-05, |
| "loss": 0.9993, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.8424242424242423, |
| "grad_norm": 0.8416363889161061, |
| "learning_rate": 6.35741214437338e-05, |
| "loss": 1.0095, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.8562770562770563, |
| "grad_norm": 1.142390867021583, |
| "learning_rate": 6.325968285152107e-05, |
| "loss": 1.0062, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.87012987012987, |
| "grad_norm": 0.7962536559784616, |
| "learning_rate": 6.294305745404185e-05, |
| "loss": 1.0052, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.8839826839826839, |
| "grad_norm": 0.5650336880636371, |
| "learning_rate": 6.262427501946155e-05, |
| "loss": 1.0067, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.8978354978354979, |
| "grad_norm": 0.5818038902731943, |
| "learning_rate": 6.230336551874372e-05, |
| "loss": 1.0063, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.9116883116883117, |
| "grad_norm": 0.9977727916003996, |
| "learning_rate": 6.198035912283225e-05, |
| "loss": 1.0011, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.9255411255411254, |
| "grad_norm": 0.9993134068472553, |
| "learning_rate": 6.165528619981479e-05, |
| "loss": 0.9934, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.9393939393939394, |
| "grad_norm": 0.6309774026937955, |
| "learning_rate": 6.132817731206766e-05, |
| "loss": 1.0023, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.9532467532467532, |
| "grad_norm": 0.5631788726393073, |
| "learning_rate": 6.099906321338241e-05, |
| "loss": 0.9875, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.967099567099567, |
| "grad_norm": 0.6994904876843244, |
| "learning_rate": 6.0667974846074524e-05, |
| "loss": 0.9969, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.980952380952381, |
| "grad_norm": 0.6611818685825782, |
| "learning_rate": 6.033494333807422e-05, |
| "loss": 1.0052, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.9948051948051948, |
| "grad_norm": 0.5004771960590909, |
| "learning_rate": 6.000000000000001e-05, |
| "loss": 0.9857, |
| "step": 144 |
| }, |
| { |
| "epoch": 2.0086580086580086, |
| "grad_norm": 1.0514858543746186, |
| "learning_rate": 5.9663176322214826e-05, |
| "loss": 1.8002, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.0225108225108226, |
| "grad_norm": 1.5590490021626622, |
| "learning_rate": 5.9324503971865545e-05, |
| "loss": 0.9591, |
| "step": 146 |
| }, |
| { |
| "epoch": 2.036363636363636, |
| "grad_norm": 0.613252686965761, |
| "learning_rate": 5.8984014789905625e-05, |
| "loss": 0.9555, |
| "step": 147 |
| }, |
| { |
| "epoch": 2.05021645021645, |
| "grad_norm": 1.5183857859367584, |
| "learning_rate": 5.8641740788101566e-05, |
| "loss": 0.9637, |
| "step": 148 |
| }, |
| { |
| "epoch": 2.064069264069264, |
| "grad_norm": 0.599406946967003, |
| "learning_rate": 5.8297714146023236e-05, |
| "loss": 0.9396, |
| "step": 149 |
| }, |
| { |
| "epoch": 2.0779220779220777, |
| "grad_norm": 1.171195638149606, |
| "learning_rate": 5.79519672080185e-05, |
| "loss": 0.9523, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.0917748917748917, |
| "grad_norm": 0.6714077570634802, |
| "learning_rate": 5.76045324801722e-05, |
| "loss": 0.9595, |
| "step": 151 |
| }, |
| { |
| "epoch": 2.1056277056277057, |
| "grad_norm": 1.2318697934613918, |
| "learning_rate": 5.7255442627250146e-05, |
| "loss": 0.9514, |
| "step": 152 |
| }, |
| { |
| "epoch": 2.1194805194805193, |
| "grad_norm": 0.746989496141657, |
| "learning_rate": 5.6904730469627985e-05, |
| "loss": 0.9482, |
| "step": 153 |
| }, |
| { |
| "epoch": 2.1333333333333333, |
| "grad_norm": 0.901261215101538, |
| "learning_rate": 5.6552428980205575e-05, |
| "loss": 0.9587, |
| "step": 154 |
| }, |
| { |
| "epoch": 2.1471861471861473, |
| "grad_norm": 0.674529916922478, |
| "learning_rate": 5.619857128130695e-05, |
| "loss": 0.9562, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.161038961038961, |
| "grad_norm": 0.8844375890562896, |
| "learning_rate": 5.584319064156628e-05, |
| "loss": 0.9459, |
| "step": 156 |
| }, |
| { |
| "epoch": 2.174891774891775, |
| "grad_norm": 0.5176842951829833, |
| "learning_rate": 5.548632047280003e-05, |
| "loss": 0.9528, |
| "step": 157 |
| }, |
| { |
| "epoch": 2.188744588744589, |
| "grad_norm": 0.6248120662111469, |
| "learning_rate": 5.5127994326865706e-05, |
| "loss": 0.9482, |
| "step": 158 |
| }, |
| { |
| "epoch": 2.2025974025974024, |
| "grad_norm": 0.6093640758186603, |
| "learning_rate": 5.476824589250738e-05, |
| "loss": 0.9429, |
| "step": 159 |
| }, |
| { |
| "epoch": 2.2164502164502164, |
| "grad_norm": 0.5492958980107647, |
| "learning_rate": 5.440710899218842e-05, |
| "loss": 0.9674, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.2303030303030305, |
| "grad_norm": 0.5903789766574798, |
| "learning_rate": 5.404461757891156e-05, |
| "loss": 0.9667, |
| "step": 161 |
| }, |
| { |
| "epoch": 2.244155844155844, |
| "grad_norm": 0.5486871479315714, |
| "learning_rate": 5.368080573302676e-05, |
| "loss": 0.9478, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.258008658008658, |
| "grad_norm": 0.45428134417688254, |
| "learning_rate": 5.331570765902706e-05, |
| "loss": 0.9409, |
| "step": 163 |
| }, |
| { |
| "epoch": 2.271861471861472, |
| "grad_norm": 0.42847012632012216, |
| "learning_rate": 5.294935768233285e-05, |
| "loss": 0.9416, |
| "step": 164 |
| }, |
| { |
| "epoch": 2.2857142857142856, |
| "grad_norm": 0.4848698252601225, |
| "learning_rate": 5.258179024606455e-05, |
| "loss": 0.9463, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.2995670995670996, |
| "grad_norm": 0.3534788789389581, |
| "learning_rate": 5.2213039907804535e-05, |
| "loss": 0.9491, |
| "step": 166 |
| }, |
| { |
| "epoch": 2.3134199134199136, |
| "grad_norm": 0.5082308518432114, |
| "learning_rate": 5.1843141336348e-05, |
| "loss": 0.95, |
| "step": 167 |
| }, |
| { |
| "epoch": 2.327272727272727, |
| "grad_norm": 0.33208032748656197, |
| "learning_rate": 5.1472129308443616e-05, |
| "loss": 0.953, |
| "step": 168 |
| }, |
| { |
| "epoch": 2.341125541125541, |
| "grad_norm": 0.35843782187780426, |
| "learning_rate": 5.1100038705523834e-05, |
| "loss": 0.957, |
| "step": 169 |
| }, |
| { |
| "epoch": 2.354978354978355, |
| "grad_norm": 0.33243645634228375, |
| "learning_rate": 5.07269045104255e-05, |
| "loss": 0.9348, |
| "step": 170 |
| }, |
| { |
| "epoch": 2.3688311688311687, |
| "grad_norm": 0.37544932004082693, |
| "learning_rate": 5.0352761804100835e-05, |
| "loss": 0.9501, |
| "step": 171 |
| }, |
| { |
| "epoch": 2.3826839826839827, |
| "grad_norm": 0.3396549463565156, |
| "learning_rate": 4.9977645762319255e-05, |
| "loss": 0.9548, |
| "step": 172 |
| }, |
| { |
| "epoch": 2.3965367965367967, |
| "grad_norm": 0.27413219762637864, |
| "learning_rate": 4.9601591652360244e-05, |
| "loss": 0.9516, |
| "step": 173 |
| }, |
| { |
| "epoch": 2.4103896103896103, |
| "grad_norm": 0.2935194857656813, |
| "learning_rate": 4.922463482969761e-05, |
| "loss": 0.9537, |
| "step": 174 |
| }, |
| { |
| "epoch": 2.4242424242424243, |
| "grad_norm": 0.31679378581933954, |
| "learning_rate": 4.884681073467551e-05, |
| "loss": 0.9566, |
| "step": 175 |
| }, |
| { |
| "epoch": 2.4380952380952383, |
| "grad_norm": 0.2917510642085385, |
| "learning_rate": 4.846815488917644e-05, |
| "loss": 0.9602, |
| "step": 176 |
| }, |
| { |
| "epoch": 2.451948051948052, |
| "grad_norm": 0.29512012950255556, |
| "learning_rate": 4.808870289328153e-05, |
| "loss": 0.9513, |
| "step": 177 |
| }, |
| { |
| "epoch": 2.465800865800866, |
| "grad_norm": 0.24808203045159094, |
| "learning_rate": 4.7708490421923596e-05, |
| "loss": 0.9453, |
| "step": 178 |
| }, |
| { |
| "epoch": 2.47965367965368, |
| "grad_norm": 0.21937289844225158, |
| "learning_rate": 4.7327553221533074e-05, |
| "loss": 0.9581, |
| "step": 179 |
| }, |
| { |
| "epoch": 2.4935064935064934, |
| "grad_norm": 0.20437241337234358, |
| "learning_rate": 4.694592710667723e-05, |
| "loss": 0.948, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.5073593073593075, |
| "grad_norm": 0.20182625174185811, |
| "learning_rate": 4.656364795669297e-05, |
| "loss": 0.9505, |
| "step": 181 |
| }, |
| { |
| "epoch": 2.5212121212121215, |
| "grad_norm": 0.2157700828054003, |
| "learning_rate": 4.618075171231363e-05, |
| "loss": 0.955, |
| "step": 182 |
| }, |
| { |
| "epoch": 2.535064935064935, |
| "grad_norm": 0.20198999241369922, |
| "learning_rate": 4.579727437228987e-05, |
| "loss": 0.9479, |
| "step": 183 |
| }, |
| { |
| "epoch": 2.548917748917749, |
| "grad_norm": 0.19349997377276865, |
| "learning_rate": 4.541325199000525e-05, |
| "loss": 0.9444, |
| "step": 184 |
| }, |
| { |
| "epoch": 2.562770562770563, |
| "grad_norm": 0.20821593855670595, |
| "learning_rate": 4.502872067008652e-05, |
| "loss": 0.9484, |
| "step": 185 |
| }, |
| { |
| "epoch": 2.5766233766233766, |
| "grad_norm": 0.22714292711765166, |
| "learning_rate": 4.464371656500921e-05, |
| "loss": 0.9478, |
| "step": 186 |
| }, |
| { |
| "epoch": 2.5904761904761906, |
| "grad_norm": 0.22439821970405607, |
| "learning_rate": 4.425827587169873e-05, |
| "loss": 0.9642, |
| "step": 187 |
| }, |
| { |
| "epoch": 2.6043290043290046, |
| "grad_norm": 0.19017166723603593, |
| "learning_rate": 4.387243482812717e-05, |
| "loss": 0.9354, |
| "step": 188 |
| }, |
| { |
| "epoch": 2.618181818181818, |
| "grad_norm": 0.2338760203213592, |
| "learning_rate": 4.348622970990634e-05, |
| "loss": 0.9608, |
| "step": 189 |
| }, |
| { |
| "epoch": 2.632034632034632, |
| "grad_norm": 0.19433184424361064, |
| "learning_rate": 4.309969682687724e-05, |
| "loss": 0.9365, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.6458874458874457, |
| "grad_norm": 0.2006639594796061, |
| "learning_rate": 4.271287251969637e-05, |
| "loss": 0.943, |
| "step": 191 |
| }, |
| { |
| "epoch": 2.6597402597402597, |
| "grad_norm": 0.19675542180216962, |
| "learning_rate": 4.2325793156419035e-05, |
| "loss": 0.9629, |
| "step": 192 |
| }, |
| { |
| "epoch": 2.6735930735930737, |
| "grad_norm": 0.22882862992661218, |
| "learning_rate": 4.193849512908013e-05, |
| "loss": 0.9399, |
| "step": 193 |
| }, |
| { |
| "epoch": 2.6874458874458873, |
| "grad_norm": 0.27628995792251587, |
| "learning_rate": 4.155101485027268e-05, |
| "loss": 0.9517, |
| "step": 194 |
| }, |
| { |
| "epoch": 2.7012987012987013, |
| "grad_norm": 0.25152494788624064, |
| "learning_rate": 4.116338874972446e-05, |
| "loss": 0.9532, |
| "step": 195 |
| }, |
| { |
| "epoch": 2.7151515151515153, |
| "grad_norm": 0.17237631990944813, |
| "learning_rate": 4.077565327087298e-05, |
| "loss": 0.9443, |
| "step": 196 |
| }, |
| { |
| "epoch": 2.729004329004329, |
| "grad_norm": 0.22052058799944804, |
| "learning_rate": 4.0387844867439143e-05, |
| "loss": 0.9384, |
| "step": 197 |
| }, |
| { |
| "epoch": 2.742857142857143, |
| "grad_norm": 0.2821185693525401, |
| "learning_rate": 4e-05, |
| "loss": 0.9506, |
| "step": 198 |
| }, |
| { |
| "epoch": 2.7567099567099564, |
| "grad_norm": 0.23974193332071514, |
| "learning_rate": 3.961215513256086e-05, |
| "loss": 0.944, |
| "step": 199 |
| }, |
| { |
| "epoch": 2.7705627705627704, |
| "grad_norm": 0.23881720962641614, |
| "learning_rate": 3.9224346729127034e-05, |
| "loss": 0.9423, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.7844155844155845, |
| "grad_norm": 0.1774343946075327, |
| "learning_rate": 3.8836611250275546e-05, |
| "loss": 0.9355, |
| "step": 201 |
| }, |
| { |
| "epoch": 2.798268398268398, |
| "grad_norm": 0.23570113544248983, |
| "learning_rate": 3.844898514972733e-05, |
| "loss": 0.9519, |
| "step": 202 |
| }, |
| { |
| "epoch": 2.812121212121212, |
| "grad_norm": 0.21653970566029948, |
| "learning_rate": 3.806150487091989e-05, |
| "loss": 0.951, |
| "step": 203 |
| }, |
| { |
| "epoch": 2.825974025974026, |
| "grad_norm": 0.1881655573837289, |
| "learning_rate": 3.767420684358097e-05, |
| "loss": 0.9425, |
| "step": 204 |
| }, |
| { |
| "epoch": 2.8398268398268396, |
| "grad_norm": 0.19487964543004402, |
| "learning_rate": 3.7287127480303634e-05, |
| "loss": 0.9496, |
| "step": 205 |
| }, |
| { |
| "epoch": 2.8536796536796536, |
| "grad_norm": 0.21940934921746677, |
| "learning_rate": 3.690030317312277e-05, |
| "loss": 0.9326, |
| "step": 206 |
| }, |
| { |
| "epoch": 2.8675324675324676, |
| "grad_norm": 0.22419835861035028, |
| "learning_rate": 3.6513770290093674e-05, |
| "loss": 0.958, |
| "step": 207 |
| }, |
| { |
| "epoch": 2.881385281385281, |
| "grad_norm": 0.20379473922199545, |
| "learning_rate": 3.612756517187284e-05, |
| "loss": 0.9475, |
| "step": 208 |
| }, |
| { |
| "epoch": 2.895238095238095, |
| "grad_norm": 0.15734328009114276, |
| "learning_rate": 3.574172412830127e-05, |
| "loss": 0.9446, |
| "step": 209 |
| }, |
| { |
| "epoch": 2.909090909090909, |
| "grad_norm": 0.2577374514137676, |
| "learning_rate": 3.535628343499079e-05, |
| "loss": 0.9518, |
| "step": 210 |
| }, |
| { |
| "epoch": 2.9229437229437227, |
| "grad_norm": 0.21560632289046236, |
| "learning_rate": 3.49712793299135e-05, |
| "loss": 0.9321, |
| "step": 211 |
| }, |
| { |
| "epoch": 2.9367965367965367, |
| "grad_norm": 0.19086166501572058, |
| "learning_rate": 3.458674800999477e-05, |
| "loss": 0.939, |
| "step": 212 |
| }, |
| { |
| "epoch": 2.9506493506493507, |
| "grad_norm": 0.1635737725085455, |
| "learning_rate": 3.4202725627710136e-05, |
| "loss": 0.9519, |
| "step": 213 |
| }, |
| { |
| "epoch": 2.9645021645021643, |
| "grad_norm": 0.2063878664719065, |
| "learning_rate": 3.3819248287686386e-05, |
| "loss": 0.9408, |
| "step": 214 |
| }, |
| { |
| "epoch": 2.9783549783549783, |
| "grad_norm": 0.21758034147643424, |
| "learning_rate": 3.343635204330704e-05, |
| "loss": 0.9366, |
| "step": 215 |
| }, |
| { |
| "epoch": 2.9922077922077923, |
| "grad_norm": 0.1756516719858461, |
| "learning_rate": 3.305407289332279e-05, |
| "loss": 0.9261, |
| "step": 216 |
| }, |
| { |
| "epoch": 3.006060606060606, |
| "grad_norm": 0.44437950709772883, |
| "learning_rate": 3.267244677846693e-05, |
| "loss": 1.6737, |
| "step": 217 |
| }, |
| { |
| "epoch": 3.01991341991342, |
| "grad_norm": 0.5202547459859553, |
| "learning_rate": 3.229150957807641e-05, |
| "loss": 0.9065, |
| "step": 218 |
| }, |
| { |
| "epoch": 3.033766233766234, |
| "grad_norm": 0.4201768177217496, |
| "learning_rate": 3.191129710671849e-05, |
| "loss": 0.8993, |
| "step": 219 |
| }, |
| { |
| "epoch": 3.0476190476190474, |
| "grad_norm": 0.3469006955241282, |
| "learning_rate": 3.153184511082359e-05, |
| "loss": 0.8924, |
| "step": 220 |
| }, |
| { |
| "epoch": 3.0614718614718615, |
| "grad_norm": 0.34894763894121467, |
| "learning_rate": 3.1153189265324494e-05, |
| "loss": 0.9091, |
| "step": 221 |
| }, |
| { |
| "epoch": 3.0753246753246755, |
| "grad_norm": 0.3951659967368868, |
| "learning_rate": 3.07753651703024e-05, |
| "loss": 0.9103, |
| "step": 222 |
| }, |
| { |
| "epoch": 3.089177489177489, |
| "grad_norm": 0.33506373060928457, |
| "learning_rate": 3.0398408347639773e-05, |
| "loss": 0.8895, |
| "step": 223 |
| }, |
| { |
| "epoch": 3.103030303030303, |
| "grad_norm": 0.2808678451376146, |
| "learning_rate": 3.0022354237680752e-05, |
| "loss": 0.8954, |
| "step": 224 |
| }, |
| { |
| "epoch": 3.116883116883117, |
| "grad_norm": 0.3452617358086684, |
| "learning_rate": 2.9647238195899168e-05, |
| "loss": 0.8954, |
| "step": 225 |
| }, |
| { |
| "epoch": 3.1307359307359306, |
| "grad_norm": 0.32553230647238945, |
| "learning_rate": 2.9273095489574502e-05, |
| "loss": 0.897, |
| "step": 226 |
| }, |
| { |
| "epoch": 3.1445887445887446, |
| "grad_norm": 0.2604914839354281, |
| "learning_rate": 2.889996129447618e-05, |
| "loss": 0.907, |
| "step": 227 |
| }, |
| { |
| "epoch": 3.1584415584415586, |
| "grad_norm": 0.34111866816202957, |
| "learning_rate": 2.8527870691556404e-05, |
| "loss": 0.8981, |
| "step": 228 |
| }, |
| { |
| "epoch": 3.172294372294372, |
| "grad_norm": 0.28026302405180475, |
| "learning_rate": 2.8156858663652015e-05, |
| "loss": 0.9033, |
| "step": 229 |
| }, |
| { |
| "epoch": 3.186147186147186, |
| "grad_norm": 0.26870372034953893, |
| "learning_rate": 2.778696009219548e-05, |
| "loss": 0.9059, |
| "step": 230 |
| }, |
| { |
| "epoch": 3.2, |
| "grad_norm": 0.3798626491614641, |
| "learning_rate": 2.7418209753935464e-05, |
| "loss": 0.8894, |
| "step": 231 |
| }, |
| { |
| "epoch": 3.2138528138528137, |
| "grad_norm": 0.21379716918544014, |
| "learning_rate": 2.7050642317667164e-05, |
| "loss": 0.8937, |
| "step": 232 |
| }, |
| { |
| "epoch": 3.2277056277056277, |
| "grad_norm": 0.31956814421124774, |
| "learning_rate": 2.6684292340972936e-05, |
| "loss": 0.9068, |
| "step": 233 |
| }, |
| { |
| "epoch": 3.2415584415584417, |
| "grad_norm": 0.194502129845176, |
| "learning_rate": 2.6319194266973256e-05, |
| "loss": 0.8999, |
| "step": 234 |
| }, |
| { |
| "epoch": 3.2554112554112553, |
| "grad_norm": 0.25288436825501515, |
| "learning_rate": 2.5955382421088457e-05, |
| "loss": 0.8876, |
| "step": 235 |
| }, |
| { |
| "epoch": 3.2692640692640693, |
| "grad_norm": 0.2045328796636946, |
| "learning_rate": 2.5592891007811594e-05, |
| "loss": 0.9056, |
| "step": 236 |
| }, |
| { |
| "epoch": 3.2831168831168833, |
| "grad_norm": 0.17690924985251477, |
| "learning_rate": 2.523175410749263e-05, |
| "loss": 0.9068, |
| "step": 237 |
| }, |
| { |
| "epoch": 3.296969696969697, |
| "grad_norm": 0.20432688291964138, |
| "learning_rate": 2.4872005673134307e-05, |
| "loss": 0.8916, |
| "step": 238 |
| }, |
| { |
| "epoch": 3.310822510822511, |
| "grad_norm": 0.17738981903795317, |
| "learning_rate": 2.4513679527199986e-05, |
| "loss": 0.9115, |
| "step": 239 |
| }, |
| { |
| "epoch": 3.324675324675325, |
| "grad_norm": 0.16833331057473214, |
| "learning_rate": 2.4156809358433728e-05, |
| "loss": 0.8891, |
| "step": 240 |
| }, |
| { |
| "epoch": 3.3385281385281385, |
| "grad_norm": 0.17407822439034182, |
| "learning_rate": 2.3801428718693055e-05, |
| "loss": 0.8936, |
| "step": 241 |
| }, |
| { |
| "epoch": 3.3523809523809525, |
| "grad_norm": 0.16434385080662373, |
| "learning_rate": 2.3447571019794438e-05, |
| "loss": 0.9079, |
| "step": 242 |
| }, |
| { |
| "epoch": 3.3662337662337665, |
| "grad_norm": 0.1647420511208294, |
| "learning_rate": 2.3095269530372032e-05, |
| "loss": 0.8904, |
| "step": 243 |
| }, |
| { |
| "epoch": 3.38008658008658, |
| "grad_norm": 0.16465200281562736, |
| "learning_rate": 2.274455737274987e-05, |
| "loss": 0.8965, |
| "step": 244 |
| }, |
| { |
| "epoch": 3.393939393939394, |
| "grad_norm": 0.1942259697042446, |
| "learning_rate": 2.239546751982782e-05, |
| "loss": 0.9039, |
| "step": 245 |
| }, |
| { |
| "epoch": 3.407792207792208, |
| "grad_norm": 0.15418958599426286, |
| "learning_rate": 2.2048032791981515e-05, |
| "loss": 0.8921, |
| "step": 246 |
| }, |
| { |
| "epoch": 3.4216450216450216, |
| "grad_norm": 0.15256309020808106, |
| "learning_rate": 2.1702285853976774e-05, |
| "loss": 0.8972, |
| "step": 247 |
| }, |
| { |
| "epoch": 3.4354978354978356, |
| "grad_norm": 0.14590845303296213, |
| "learning_rate": 2.135825921189846e-05, |
| "loss": 0.8967, |
| "step": 248 |
| }, |
| { |
| "epoch": 3.449350649350649, |
| "grad_norm": 0.1756342017642444, |
| "learning_rate": 2.1015985210094385e-05, |
| "loss": 0.9089, |
| "step": 249 |
| }, |
| { |
| "epoch": 3.463203463203463, |
| "grad_norm": 0.14928130402546771, |
| "learning_rate": 2.067549602813446e-05, |
| "loss": 0.9116, |
| "step": 250 |
| }, |
| { |
| "epoch": 3.477056277056277, |
| "grad_norm": 0.19622196885081308, |
| "learning_rate": 2.033682367778518e-05, |
| "loss": 0.9035, |
| "step": 251 |
| }, |
| { |
| "epoch": 3.4909090909090907, |
| "grad_norm": 0.16833682605095, |
| "learning_rate": 2.0000000000000012e-05, |
| "loss": 0.9049, |
| "step": 252 |
| }, |
| { |
| "epoch": 3.5047619047619047, |
| "grad_norm": 0.1700606136967009, |
| "learning_rate": 1.966505666192579e-05, |
| "loss": 0.9013, |
| "step": 253 |
| }, |
| { |
| "epoch": 3.5186147186147188, |
| "grad_norm": 0.1795362591013133, |
| "learning_rate": 1.9332025153925486e-05, |
| "loss": 0.887, |
| "step": 254 |
| }, |
| { |
| "epoch": 3.5324675324675323, |
| "grad_norm": 0.16623457555792936, |
| "learning_rate": 1.90009367866176e-05, |
| "loss": 0.9025, |
| "step": 255 |
| }, |
| { |
| "epoch": 3.5463203463203463, |
| "grad_norm": 0.1724331408670692, |
| "learning_rate": 1.867182268793236e-05, |
| "loss": 0.902, |
| "step": 256 |
| }, |
| { |
| "epoch": 3.5601731601731603, |
| "grad_norm": 0.156738658049747, |
| "learning_rate": 1.8344713800185215e-05, |
| "loss": 0.8935, |
| "step": 257 |
| }, |
| { |
| "epoch": 3.574025974025974, |
| "grad_norm": 0.16288790800709219, |
| "learning_rate": 1.8019640877167763e-05, |
| "loss": 0.898, |
| "step": 258 |
| }, |
| { |
| "epoch": 3.587878787878788, |
| "grad_norm": 0.15690946638171066, |
| "learning_rate": 1.7696634481256293e-05, |
| "loss": 0.8959, |
| "step": 259 |
| }, |
| { |
| "epoch": 3.601731601731602, |
| "grad_norm": 0.16001262583220252, |
| "learning_rate": 1.7375724980538465e-05, |
| "loss": 0.8888, |
| "step": 260 |
| }, |
| { |
| "epoch": 3.6155844155844155, |
| "grad_norm": 0.15064377615121663, |
| "learning_rate": 1.7056942545958167e-05, |
| "loss": 0.9089, |
| "step": 261 |
| }, |
| { |
| "epoch": 3.6294372294372295, |
| "grad_norm": 0.13096790236650285, |
| "learning_rate": 1.6740317148478932e-05, |
| "loss": 0.9055, |
| "step": 262 |
| }, |
| { |
| "epoch": 3.643290043290043, |
| "grad_norm": 0.14921599598853594, |
| "learning_rate": 1.642587855626621e-05, |
| "loss": 0.9154, |
| "step": 263 |
| }, |
| { |
| "epoch": 3.657142857142857, |
| "grad_norm": 0.13367750739235254, |
| "learning_rate": 1.6113656331888563e-05, |
| "loss": 0.8954, |
| "step": 264 |
| }, |
| { |
| "epoch": 3.670995670995671, |
| "grad_norm": 0.14168194296838715, |
| "learning_rate": 1.580367982953833e-05, |
| "loss": 0.8939, |
| "step": 265 |
| }, |
| { |
| "epoch": 3.6848484848484846, |
| "grad_norm": 0.14492593957298525, |
| "learning_rate": 1.5495978192271887e-05, |
| "loss": 0.91, |
| "step": 266 |
| }, |
| { |
| "epoch": 3.6987012987012986, |
| "grad_norm": 0.1316497818256666, |
| "learning_rate": 1.5190580349269604e-05, |
| "loss": 0.9027, |
| "step": 267 |
| }, |
| { |
| "epoch": 3.7125541125541126, |
| "grad_norm": 0.15841380793742146, |
| "learning_rate": 1.4887515013116067e-05, |
| "loss": 0.9106, |
| "step": 268 |
| }, |
| { |
| "epoch": 3.726406926406926, |
| "grad_norm": 0.13126491215447147, |
| "learning_rate": 1.4586810677100608e-05, |
| "loss": 0.8937, |
| "step": 269 |
| }, |
| { |
| "epoch": 3.74025974025974, |
| "grad_norm": 0.1495403663254427, |
| "learning_rate": 1.4288495612538427e-05, |
| "loss": 0.9034, |
| "step": 270 |
| }, |
| { |
| "epoch": 3.754112554112554, |
| "grad_norm": 0.12429246476808327, |
| "learning_rate": 1.3992597866112667e-05, |
| "loss": 0.8975, |
| "step": 271 |
| }, |
| { |
| "epoch": 3.7679653679653677, |
| "grad_norm": 0.13097022929593902, |
| "learning_rate": 1.369914525723746e-05, |
| "loss": 0.8882, |
| "step": 272 |
| }, |
| { |
| "epoch": 3.7818181818181817, |
| "grad_norm": 0.13482171999455558, |
| "learning_rate": 1.3408165375442486e-05, |
| "loss": 0.8906, |
| "step": 273 |
| }, |
| { |
| "epoch": 3.7956709956709958, |
| "grad_norm": 0.12515899928871424, |
| "learning_rate": 1.3119685577779105e-05, |
| "loss": 0.9008, |
| "step": 274 |
| }, |
| { |
| "epoch": 3.8095238095238093, |
| "grad_norm": 0.13069692054136395, |
| "learning_rate": 1.2833732986248277e-05, |
| "loss": 0.8853, |
| "step": 275 |
| }, |
| { |
| "epoch": 3.8233766233766233, |
| "grad_norm": 0.13447223817691295, |
| "learning_rate": 1.2550334485250661e-05, |
| "loss": 0.9051, |
| "step": 276 |
| }, |
| { |
| "epoch": 3.8372294372294373, |
| "grad_norm": 0.12306949358534137, |
| "learning_rate": 1.2269516719059041e-05, |
| "loss": 0.8979, |
| "step": 277 |
| }, |
| { |
| "epoch": 3.851082251082251, |
| "grad_norm": 0.13274764900634733, |
| "learning_rate": 1.1991306089313261e-05, |
| "loss": 0.901, |
| "step": 278 |
| }, |
| { |
| "epoch": 3.864935064935065, |
| "grad_norm": 0.12496506975650054, |
| "learning_rate": 1.1715728752538103e-05, |
| "loss": 0.8851, |
| "step": 279 |
| }, |
| { |
| "epoch": 3.878787878787879, |
| "grad_norm": 0.12342700776133213, |
| "learning_rate": 1.1442810617684046e-05, |
| "loss": 0.8906, |
| "step": 280 |
| }, |
| { |
| "epoch": 3.8926406926406925, |
| "grad_norm": 0.11718555769651504, |
| "learning_rate": 1.1172577343691415e-05, |
| "loss": 0.8945, |
| "step": 281 |
| }, |
| { |
| "epoch": 3.9064935064935065, |
| "grad_norm": 0.11900571530829156, |
| "learning_rate": 1.0905054337078051e-05, |
| "loss": 0.8939, |
| "step": 282 |
| }, |
| { |
| "epoch": 3.9203463203463205, |
| "grad_norm": 0.11761709393948508, |
| "learning_rate": 1.0640266749550593e-05, |
| "loss": 0.8987, |
| "step": 283 |
| }, |
| { |
| "epoch": 3.934199134199134, |
| "grad_norm": 0.12426098474964, |
| "learning_rate": 1.0378239475639823e-05, |
| "loss": 0.8954, |
| "step": 284 |
| }, |
| { |
| "epoch": 3.948051948051948, |
| "grad_norm": 0.11342564958505907, |
| "learning_rate": 1.0118997150360169e-05, |
| "loss": 0.8967, |
| "step": 285 |
| }, |
| { |
| "epoch": 3.961904761904762, |
| "grad_norm": 0.12414751882404233, |
| "learning_rate": 9.862564146893571e-06, |
| "loss": 0.8942, |
| "step": 286 |
| }, |
| { |
| "epoch": 3.9757575757575756, |
| "grad_norm": 0.11821007668599343, |
| "learning_rate": 9.60896457429803e-06, |
| "loss": 0.8981, |
| "step": 287 |
| }, |
| { |
| "epoch": 3.9896103896103896, |
| "grad_norm": 0.11207748566968422, |
| "learning_rate": 9.358222275240884e-06, |
| "loss": 0.8969, |
| "step": 288 |
| }, |
| { |
| "epoch": 4.003463203463204, |
| "grad_norm": 0.24776696231966608, |
| "learning_rate": 9.110360823757235e-06, |
| "loss": 1.6175, |
| "step": 289 |
| }, |
| { |
| "epoch": 4.017316017316017, |
| "grad_norm": 0.1639268139321257, |
| "learning_rate": 8.8654035230336e-06, |
| "loss": 0.8757, |
| "step": 290 |
| }, |
| { |
| "epoch": 4.031168831168831, |
| "grad_norm": 0.1430026414045171, |
| "learning_rate": 8.623373403216972e-06, |
| "loss": 0.8619, |
| "step": 291 |
| }, |
| { |
| "epoch": 4.045021645021645, |
| "grad_norm": 0.13983259059672157, |
| "learning_rate": 8.384293219249633e-06, |
| "loss": 0.875, |
| "step": 292 |
| }, |
| { |
| "epoch": 4.058874458874459, |
| "grad_norm": 0.14776698103121835, |
| "learning_rate": 8.148185448729778e-06, |
| "loss": 0.8712, |
| "step": 293 |
| }, |
| { |
| "epoch": 4.072727272727272, |
| "grad_norm": 0.1453264011082169, |
| "learning_rate": 7.915072289798247e-06, |
| "loss": 0.8859, |
| "step": 294 |
| }, |
| { |
| "epoch": 4.086580086580087, |
| "grad_norm": 0.15943779551259862, |
| "learning_rate": 7.684975659051557e-06, |
| "loss": 0.8662, |
| "step": 295 |
| }, |
| { |
| "epoch": 4.1004329004329, |
| "grad_norm": 0.1456231807293276, |
| "learning_rate": 7.457917189481301e-06, |
| "loss": 0.8774, |
| "step": 296 |
| }, |
| { |
| "epoch": 4.114285714285714, |
| "grad_norm": 0.14340143561096827, |
| "learning_rate": 7.233918228440324e-06, |
| "loss": 0.8774, |
| "step": 297 |
| }, |
| { |
| "epoch": 4.128138528138528, |
| "grad_norm": 0.14023071744580373, |
| "learning_rate": 7.0129998356357295e-06, |
| "loss": 0.863, |
| "step": 298 |
| }, |
| { |
| "epoch": 4.141991341991342, |
| "grad_norm": 0.14172173606520722, |
| "learning_rate": 6.795182781148848e-06, |
| "loss": 0.8767, |
| "step": 299 |
| }, |
| { |
| "epoch": 4.1558441558441555, |
| "grad_norm": 0.1318876467621652, |
| "learning_rate": 6.58048754348255e-06, |
| "loss": 0.8709, |
| "step": 300 |
| }, |
| { |
| "epoch": 4.16969696969697, |
| "grad_norm": 0.1517460979685681, |
| "learning_rate": 6.368934307635881e-06, |
| "loss": 0.8716, |
| "step": 301 |
| }, |
| { |
| "epoch": 4.1835497835497835, |
| "grad_norm": 0.15120519716651545, |
| "learning_rate": 6.160542963206357e-06, |
| "loss": 0.8697, |
| "step": 302 |
| }, |
| { |
| "epoch": 4.197402597402597, |
| "grad_norm": 0.12276533641084203, |
| "learning_rate": 5.955333102520011e-06, |
| "loss": 0.8628, |
| "step": 303 |
| }, |
| { |
| "epoch": 4.2112554112554115, |
| "grad_norm": 0.1303847318332295, |
| "learning_rate": 5.753324018789346e-06, |
| "loss": 0.8708, |
| "step": 304 |
| }, |
| { |
| "epoch": 4.225108225108225, |
| "grad_norm": 0.13706452110864129, |
| "learning_rate": 5.554534704299448e-06, |
| "loss": 0.8566, |
| "step": 305 |
| }, |
| { |
| "epoch": 4.238961038961039, |
| "grad_norm": 0.15781002543920747, |
| "learning_rate": 5.358983848622452e-06, |
| "loss": 0.8764, |
| "step": 306 |
| }, |
| { |
| "epoch": 4.252813852813853, |
| "grad_norm": 0.11520912795530423, |
| "learning_rate": 5.1666898368603195e-06, |
| "loss": 0.8749, |
| "step": 307 |
| }, |
| { |
| "epoch": 4.266666666666667, |
| "grad_norm": 0.11508546810833122, |
| "learning_rate": 4.97767074791637e-06, |
| "loss": 0.8657, |
| "step": 308 |
| }, |
| { |
| "epoch": 4.28051948051948, |
| "grad_norm": 0.14352142083453215, |
| "learning_rate": 4.791944352795561e-06, |
| "loss": 0.8919, |
| "step": 309 |
| }, |
| { |
| "epoch": 4.294372294372295, |
| "grad_norm": 0.13642778141475553, |
| "learning_rate": 4.609528112933688e-06, |
| "loss": 0.8575, |
| "step": 310 |
| }, |
| { |
| "epoch": 4.308225108225108, |
| "grad_norm": 0.11645525287361383, |
| "learning_rate": 4.430439178555759e-06, |
| "loss": 0.874, |
| "step": 311 |
| }, |
| { |
| "epoch": 4.322077922077922, |
| "grad_norm": 0.11198885083380229, |
| "learning_rate": 4.254694387063514e-06, |
| "loss": 0.866, |
| "step": 312 |
| }, |
| { |
| "epoch": 4.335930735930736, |
| "grad_norm": 0.11999719505276203, |
| "learning_rate": 4.082310261452471e-06, |
| "loss": 0.8809, |
| "step": 313 |
| }, |
| { |
| "epoch": 4.34978354978355, |
| "grad_norm": 0.11431861199461578, |
| "learning_rate": 3.913303008758491e-06, |
| "loss": 0.8739, |
| "step": 314 |
| }, |
| { |
| "epoch": 4.363636363636363, |
| "grad_norm": 0.1089763284328194, |
| "learning_rate": 3.747688518534003e-06, |
| "loss": 0.8764, |
| "step": 315 |
| }, |
| { |
| "epoch": 4.377489177489178, |
| "grad_norm": 0.11083535668146678, |
| "learning_rate": 3.585482361354138e-06, |
| "loss": 0.874, |
| "step": 316 |
| }, |
| { |
| "epoch": 4.391341991341991, |
| "grad_norm": 0.10462111723473196, |
| "learning_rate": 3.42669978735283e-06, |
| "loss": 0.8712, |
| "step": 317 |
| }, |
| { |
| "epoch": 4.405194805194805, |
| "grad_norm": 0.11192874060919457, |
| "learning_rate": 3.2713557247890447e-06, |
| "loss": 0.865, |
| "step": 318 |
| }, |
| { |
| "epoch": 4.419047619047619, |
| "grad_norm": 0.0998639300176411, |
| "learning_rate": 3.1194647786432663e-06, |
| "loss": 0.8628, |
| "step": 319 |
| }, |
| { |
| "epoch": 4.432900432900433, |
| "grad_norm": 0.1037388404966585, |
| "learning_rate": 2.9710412292443868e-06, |
| "loss": 0.8744, |
| "step": 320 |
| }, |
| { |
| "epoch": 4.4467532467532465, |
| "grad_norm": 0.10341839983438926, |
| "learning_rate": 2.8260990309270987e-06, |
| "loss": 0.8707, |
| "step": 321 |
| }, |
| { |
| "epoch": 4.460606060606061, |
| "grad_norm": 0.10245055505097513, |
| "learning_rate": 2.6846518107199782e-06, |
| "loss": 0.869, |
| "step": 322 |
| }, |
| { |
| "epoch": 4.4744588744588745, |
| "grad_norm": 0.10245685258161713, |
| "learning_rate": 2.546712867064276e-06, |
| "loss": 0.866, |
| "step": 323 |
| }, |
| { |
| "epoch": 4.488311688311688, |
| "grad_norm": 0.10246348212442796, |
| "learning_rate": 2.4122951685636674e-06, |
| "loss": 0.869, |
| "step": 324 |
| }, |
| { |
| "epoch": 4.5021645021645025, |
| "grad_norm": 0.10133630585516906, |
| "learning_rate": 2.281411352764966e-06, |
| "loss": 0.8661, |
| "step": 325 |
| }, |
| { |
| "epoch": 4.516017316017316, |
| "grad_norm": 0.10385457357599492, |
| "learning_rate": 2.1540737249699893e-06, |
| "loss": 0.8665, |
| "step": 326 |
| }, |
| { |
| "epoch": 4.52987012987013, |
| "grad_norm": 0.09787930849328196, |
| "learning_rate": 2.0302942570786446e-06, |
| "loss": 0.8587, |
| "step": 327 |
| }, |
| { |
| "epoch": 4.543722943722944, |
| "grad_norm": 0.09875061097653641, |
| "learning_rate": 1.9100845864633875e-06, |
| "loss": 0.862, |
| "step": 328 |
| }, |
| { |
| "epoch": 4.557575757575758, |
| "grad_norm": 0.10019109859451927, |
| "learning_rate": 1.793456014875079e-06, |
| "loss": 0.8667, |
| "step": 329 |
| }, |
| { |
| "epoch": 4.571428571428571, |
| "grad_norm": 0.09607007590769094, |
| "learning_rate": 1.6804195073804442e-06, |
| "loss": 0.8609, |
| "step": 330 |
| }, |
| { |
| "epoch": 4.585281385281386, |
| "grad_norm": 0.0995091150688806, |
| "learning_rate": 1.5709856913311795e-06, |
| "loss": 0.8631, |
| "step": 331 |
| }, |
| { |
| "epoch": 4.599134199134199, |
| "grad_norm": 0.10237535339157534, |
| "learning_rate": 1.4651648553647869e-06, |
| "loss": 0.874, |
| "step": 332 |
| }, |
| { |
| "epoch": 4.612987012987013, |
| "grad_norm": 0.09685943360360758, |
| "learning_rate": 1.3629669484372722e-06, |
| "loss": 0.8608, |
| "step": 333 |
| }, |
| { |
| "epoch": 4.626839826839827, |
| "grad_norm": 0.10088872360008577, |
| "learning_rate": 1.2644015788877684e-06, |
| "loss": 0.8776, |
| "step": 334 |
| }, |
| { |
| "epoch": 4.640692640692641, |
| "grad_norm": 0.09659731541025765, |
| "learning_rate": 1.1694780135352013e-06, |
| "loss": 0.8659, |
| "step": 335 |
| }, |
| { |
| "epoch": 4.654545454545454, |
| "grad_norm": 0.09754069143347813, |
| "learning_rate": 1.0782051768070477e-06, |
| "loss": 0.8822, |
| "step": 336 |
| }, |
| { |
| "epoch": 4.668398268398269, |
| "grad_norm": 0.09529068088084004, |
| "learning_rate": 9.905916499002787e-07, |
| "loss": 0.8632, |
| "step": 337 |
| }, |
| { |
| "epoch": 4.682251082251082, |
| "grad_norm": 0.09443098915190634, |
| "learning_rate": 9.066456699745774e-07, |
| "loss": 0.8686, |
| "step": 338 |
| }, |
| { |
| "epoch": 4.696103896103896, |
| "grad_norm": 0.09719204747726426, |
| "learning_rate": 8.263751293779409e-07, |
| "loss": 0.8709, |
| "step": 339 |
| }, |
| { |
| "epoch": 4.70995670995671, |
| "grad_norm": 0.0989300648418707, |
| "learning_rate": 7.497875749046124e-07, |
| "loss": 0.8706, |
| "step": 340 |
| }, |
| { |
| "epoch": 4.723809523809524, |
| "grad_norm": 0.09639393839499397, |
| "learning_rate": 6.768902070856031e-07, |
| "loss": 0.8661, |
| "step": 341 |
| }, |
| { |
| "epoch": 4.7376623376623375, |
| "grad_norm": 0.09557188345066484, |
| "learning_rate": 6.076898795116792e-07, |
| "loss": 0.8662, |
| "step": 342 |
| }, |
| { |
| "epoch": 4.751515151515152, |
| "grad_norm": 0.09944408893779064, |
| "learning_rate": 5.421930981890455e-07, |
| "loss": 0.877, |
| "step": 343 |
| }, |
| { |
| "epoch": 4.7653679653679655, |
| "grad_norm": 0.0977504011176678, |
| "learning_rate": 4.804060209276396e-07, |
| "loss": 0.8658, |
| "step": 344 |
| }, |
| { |
| "epoch": 4.779220779220779, |
| "grad_norm": 0.09464762553229625, |
| "learning_rate": 4.223344567622212e-07, |
| "loss": 0.8718, |
| "step": 345 |
| }, |
| { |
| "epoch": 4.7930735930735935, |
| "grad_norm": 0.09515637845594775, |
| "learning_rate": 3.679838654061874e-07, |
| "loss": 0.8672, |
| "step": 346 |
| }, |
| { |
| "epoch": 4.806926406926407, |
| "grad_norm": 0.09692757545190614, |
| "learning_rate": 3.173593567383071e-07, |
| "loss": 0.8762, |
| "step": 347 |
| }, |
| { |
| "epoch": 4.820779220779221, |
| "grad_norm": 0.09525175615621749, |
| "learning_rate": 2.704656903222791e-07, |
| "loss": 0.8792, |
| "step": 348 |
| }, |
| { |
| "epoch": 4.834632034632035, |
| "grad_norm": 0.09621257866702408, |
| "learning_rate": 2.273072749592631e-07, |
| "loss": 0.864, |
| "step": 349 |
| }, |
| { |
| "epoch": 4.848484848484849, |
| "grad_norm": 0.09435391607466348, |
| "learning_rate": 1.8788816827336686e-07, |
| "loss": 0.8827, |
| "step": 350 |
| }, |
| { |
| "epoch": 4.862337662337662, |
| "grad_norm": 0.09330676760639534, |
| "learning_rate": 1.522120763301782e-07, |
| "loss": 0.8634, |
| "step": 351 |
| }, |
| { |
| "epoch": 4.876190476190477, |
| "grad_norm": 0.09377768092440732, |
| "learning_rate": 1.2028235328831906e-07, |
| "loss": 0.8782, |
| "step": 352 |
| }, |
| { |
| "epoch": 4.89004329004329, |
| "grad_norm": 0.09540719747182097, |
| "learning_rate": 9.21020010840934e-08, |
| "loss": 0.8721, |
| "step": 353 |
| }, |
| { |
| "epoch": 4.903896103896104, |
| "grad_norm": 0.09356725286148478, |
| "learning_rate": 6.767366914927298e-08, |
| "loss": 0.8784, |
| "step": 354 |
| }, |
| { |
| "epoch": 4.917748917748918, |
| "grad_norm": 0.09257225973193513, |
| "learning_rate": 4.699965416198549e-08, |
| "loss": 0.8794, |
| "step": 355 |
| }, |
| { |
| "epoch": 4.931601731601732, |
| "grad_norm": 0.09315617718680014, |
| "learning_rate": 3.0081899830798345e-08, |
| "loss": 0.8658, |
| "step": 356 |
| }, |
| { |
| "epoch": 4.945454545454545, |
| "grad_norm": 0.09320193350709476, |
| "learning_rate": 1.6921996711976028e-08, |
| "loss": 0.8666, |
| "step": 357 |
| }, |
| { |
| "epoch": 4.95930735930736, |
| "grad_norm": 0.09451963386678745, |
| "learning_rate": 7.521182059946342e-09, |
| "loss": 0.866, |
| "step": 358 |
| }, |
| { |
| "epoch": 4.973160173160173, |
| "grad_norm": 0.09250072566157394, |
| "learning_rate": 1.8803397109534715e-09, |
| "loss": 0.8639, |
| "step": 359 |
| }, |
| { |
| "epoch": 4.987012987012987, |
| "grad_norm": 0.09138839375450408, |
| "learning_rate": 0.0, |
| "loss": 0.8814, |
| "step": 360 |
| }, |
| { |
| "epoch": 4.987012987012987, |
| "step": 360, |
| "total_flos": 9.572466247992345e+18, |
| "train_loss": 0.0, |
| "train_runtime": 2.6399, |
| "train_samples_per_second": 69987.374, |
| "train_steps_per_second": 136.369 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 360, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.572466247992345e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |