| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.7818411097099621, |
| "eval_steps": 100, |
| "global_step": 310, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0025220680958385876, |
| "grad_norm": 25.350475311279297, |
| "learning_rate": 0.0, |
| "loss": 2.5568, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.005044136191677175, |
| "grad_norm": 24.538068771362305, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 2.7748, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.007566204287515763, |
| "grad_norm": 23.780784606933594, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 2.5911, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.01008827238335435, |
| "grad_norm": 24.780380249023438, |
| "learning_rate": 1.2e-05, |
| "loss": 2.8427, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.012610340479192938, |
| "grad_norm": 22.699949264526367, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 2.709, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.015132408575031526, |
| "grad_norm": 22.106008529663086, |
| "learning_rate": 2e-05, |
| "loss": 2.5854, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.017654476670870115, |
| "grad_norm": 22.497045516967773, |
| "learning_rate": 1.9948979591836737e-05, |
| "loss": 2.5427, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.0201765447667087, |
| "grad_norm": 27.103275299072266, |
| "learning_rate": 1.9897959183673473e-05, |
| "loss": 2.6599, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.02269861286254729, |
| "grad_norm": 21.081985473632812, |
| "learning_rate": 1.9846938775510205e-05, |
| "loss": 2.5145, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.025220680958385876, |
| "grad_norm": 25.964981079101562, |
| "learning_rate": 1.979591836734694e-05, |
| "loss": 2.4247, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.027742749054224466, |
| "grad_norm": 25.353195190429688, |
| "learning_rate": 1.9744897959183677e-05, |
| "loss": 2.5092, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.03026481715006305, |
| "grad_norm": 18.94191551208496, |
| "learning_rate": 1.969387755102041e-05, |
| "loss": 2.4335, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.03278688524590164, |
| "grad_norm": 23.60140037536621, |
| "learning_rate": 1.9642857142857145e-05, |
| "loss": 2.544, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.03530895334174023, |
| "grad_norm": 24.298965454101562, |
| "learning_rate": 1.9591836734693877e-05, |
| "loss": 2.4987, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.03783102143757881, |
| "grad_norm": 20.745506286621094, |
| "learning_rate": 1.9540816326530613e-05, |
| "loss": 2.4985, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0403530895334174, |
| "grad_norm": 22.54330062866211, |
| "learning_rate": 1.948979591836735e-05, |
| "loss": 2.6892, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.04287515762925599, |
| "grad_norm": 21.46229362487793, |
| "learning_rate": 1.9438775510204085e-05, |
| "loss": 2.3998, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.04539722572509458, |
| "grad_norm": 20.54530143737793, |
| "learning_rate": 1.9387755102040817e-05, |
| "loss": 2.4244, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.04791929382093316, |
| "grad_norm": 18.8839111328125, |
| "learning_rate": 1.9336734693877553e-05, |
| "loss": 2.3911, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.05044136191677175, |
| "grad_norm": 16.924652099609375, |
| "learning_rate": 1.928571428571429e-05, |
| "loss": 2.3588, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.05296343001261034, |
| "grad_norm": 16.996627807617188, |
| "learning_rate": 1.9234693877551024e-05, |
| "loss": 2.3727, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.05548549810844893, |
| "grad_norm": 18.584613800048828, |
| "learning_rate": 1.9183673469387756e-05, |
| "loss": 2.2974, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.058007566204287514, |
| "grad_norm": 14.309200286865234, |
| "learning_rate": 1.9132653061224492e-05, |
| "loss": 2.4843, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.0605296343001261, |
| "grad_norm": 15.074164390563965, |
| "learning_rate": 1.9081632653061225e-05, |
| "loss": 2.4043, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.06305170239596469, |
| "grad_norm": 13.610542297363281, |
| "learning_rate": 1.903061224489796e-05, |
| "loss": 2.3762, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.06557377049180328, |
| "grad_norm": 15.666613578796387, |
| "learning_rate": 1.8979591836734696e-05, |
| "loss": 2.3249, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.06809583858764187, |
| "grad_norm": 14.475164413452148, |
| "learning_rate": 1.892857142857143e-05, |
| "loss": 2.3317, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.07061790668348046, |
| "grad_norm": 16.231687545776367, |
| "learning_rate": 1.8877551020408164e-05, |
| "loss": 2.5064, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.07313997477931904, |
| "grad_norm": 16.8968563079834, |
| "learning_rate": 1.88265306122449e-05, |
| "loss": 2.3932, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.07566204287515763, |
| "grad_norm": 17.74305534362793, |
| "learning_rate": 1.8775510204081636e-05, |
| "loss": 2.3329, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.07818411097099622, |
| "grad_norm": 16.41620445251465, |
| "learning_rate": 1.8724489795918368e-05, |
| "loss": 2.3982, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.0807061790668348, |
| "grad_norm": 17.965959548950195, |
| "learning_rate": 1.8673469387755104e-05, |
| "loss": 2.4227, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.0832282471626734, |
| "grad_norm": 19.92589569091797, |
| "learning_rate": 1.862244897959184e-05, |
| "loss": 2.5255, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.08575031525851198, |
| "grad_norm": 20.62932586669922, |
| "learning_rate": 1.8571428571428575e-05, |
| "loss": 2.1816, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.08827238335435057, |
| "grad_norm": 18.360614776611328, |
| "learning_rate": 1.8520408163265307e-05, |
| "loss": 2.2827, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.09079445145018916, |
| "grad_norm": 19.199546813964844, |
| "learning_rate": 1.8469387755102043e-05, |
| "loss": 2.1498, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.09331651954602774, |
| "grad_norm": 22.727521896362305, |
| "learning_rate": 1.8418367346938776e-05, |
| "loss": 2.3811, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.09583858764186633, |
| "grad_norm": 19.80649757385254, |
| "learning_rate": 1.836734693877551e-05, |
| "loss": 2.2342, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.09836065573770492, |
| "grad_norm": 22.24563217163086, |
| "learning_rate": 1.8316326530612247e-05, |
| "loss": 2.2287, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.1008827238335435, |
| "grad_norm": 25.384042739868164, |
| "learning_rate": 1.826530612244898e-05, |
| "loss": 2.1259, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.1034047919293821, |
| "grad_norm": 23.417089462280273, |
| "learning_rate": 1.8214285714285715e-05, |
| "loss": 2.0858, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.10592686002522068, |
| "grad_norm": 27.639497756958008, |
| "learning_rate": 1.816326530612245e-05, |
| "loss": 2.2243, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.10844892812105927, |
| "grad_norm": 27.390850067138672, |
| "learning_rate": 1.8112244897959187e-05, |
| "loss": 2.1314, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.11097099621689786, |
| "grad_norm": 27.956937789916992, |
| "learning_rate": 1.806122448979592e-05, |
| "loss": 2.1755, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.11349306431273644, |
| "grad_norm": 32.09632873535156, |
| "learning_rate": 1.8010204081632655e-05, |
| "loss": 2.2365, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.11601513240857503, |
| "grad_norm": 33.84647750854492, |
| "learning_rate": 1.795918367346939e-05, |
| "loss": 2.1671, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.11853720050441362, |
| "grad_norm": 32.027130126953125, |
| "learning_rate": 1.7908163265306123e-05, |
| "loss": 2.09, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.1210592686002522, |
| "grad_norm": 35.423587799072266, |
| "learning_rate": 1.785714285714286e-05, |
| "loss": 2.2479, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.1235813366960908, |
| "grad_norm": 31.041240692138672, |
| "learning_rate": 1.780612244897959e-05, |
| "loss": 1.9687, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.12610340479192939, |
| "grad_norm": 28.790103912353516, |
| "learning_rate": 1.7755102040816327e-05, |
| "loss": 2.1428, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.12862547288776796, |
| "grad_norm": 25.089313507080078, |
| "learning_rate": 1.7704081632653062e-05, |
| "loss": 2.0673, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.13114754098360656, |
| "grad_norm": 26.493867874145508, |
| "learning_rate": 1.7653061224489798e-05, |
| "loss": 2.0814, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.13366960907944514, |
| "grad_norm": 19.993173599243164, |
| "learning_rate": 1.760204081632653e-05, |
| "loss": 2.005, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.13619167717528374, |
| "grad_norm": 21.89765167236328, |
| "learning_rate": 1.7551020408163266e-05, |
| "loss": 2.2262, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.13871374527112232, |
| "grad_norm": 23.22844123840332, |
| "learning_rate": 1.7500000000000002e-05, |
| "loss": 2.0208, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.14123581336696092, |
| "grad_norm": 15.864526748657227, |
| "learning_rate": 1.7448979591836738e-05, |
| "loss": 2.0153, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.1437578814627995, |
| "grad_norm": 21.451187133789062, |
| "learning_rate": 1.7397959183673473e-05, |
| "loss": 2.1386, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.14627994955863807, |
| "grad_norm": 18.089811325073242, |
| "learning_rate": 1.7346938775510206e-05, |
| "loss": 1.9517, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.14880201765447668, |
| "grad_norm": 24.029157638549805, |
| "learning_rate": 1.729591836734694e-05, |
| "loss": 1.9719, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.15132408575031525, |
| "grad_norm": 18.722776412963867, |
| "learning_rate": 1.7244897959183674e-05, |
| "loss": 2.0623, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.15384615384615385, |
| "grad_norm": 20.211933135986328, |
| "learning_rate": 1.719387755102041e-05, |
| "loss": 2.0081, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.15636822194199243, |
| "grad_norm": 17.61188507080078, |
| "learning_rate": 1.7142857142857142e-05, |
| "loss": 1.8484, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.15889029003783103, |
| "grad_norm": 20.118955612182617, |
| "learning_rate": 1.7091836734693878e-05, |
| "loss": 2.0799, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.1614123581336696, |
| "grad_norm": 17.271841049194336, |
| "learning_rate": 1.7040816326530613e-05, |
| "loss": 1.9832, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.16393442622950818, |
| "grad_norm": 19.521392822265625, |
| "learning_rate": 1.698979591836735e-05, |
| "loss": 1.9129, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.1664564943253468, |
| "grad_norm": 22.660900115966797, |
| "learning_rate": 1.6938775510204085e-05, |
| "loss": 2.118, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.16897856242118536, |
| "grad_norm": 17.332427978515625, |
| "learning_rate": 1.6887755102040817e-05, |
| "loss": 1.9632, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.17150063051702397, |
| "grad_norm": 22.42765998840332, |
| "learning_rate": 1.6836734693877553e-05, |
| "loss": 1.954, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.17402269861286254, |
| "grad_norm": 23.6208553314209, |
| "learning_rate": 1.678571428571429e-05, |
| "loss": 1.9917, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.17654476670870115, |
| "grad_norm": 19.78505516052246, |
| "learning_rate": 1.673469387755102e-05, |
| "loss": 1.7964, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.17906683480453972, |
| "grad_norm": 19.453041076660156, |
| "learning_rate": 1.6683673469387757e-05, |
| "loss": 1.9587, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.18158890290037832, |
| "grad_norm": 24.731407165527344, |
| "learning_rate": 1.6632653061224492e-05, |
| "loss": 1.9945, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.1841109709962169, |
| "grad_norm": 20.977611541748047, |
| "learning_rate": 1.6581632653061225e-05, |
| "loss": 2.0617, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.18663303909205547, |
| "grad_norm": 22.959585189819336, |
| "learning_rate": 1.653061224489796e-05, |
| "loss": 1.98, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.18915510718789408, |
| "grad_norm": 21.952653884887695, |
| "learning_rate": 1.6479591836734696e-05, |
| "loss": 2.1094, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.19167717528373265, |
| "grad_norm": 22.320383071899414, |
| "learning_rate": 1.642857142857143e-05, |
| "loss": 1.8418, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.19419924337957126, |
| "grad_norm": 24.375411987304688, |
| "learning_rate": 1.6377551020408164e-05, |
| "loss": 1.8428, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.19672131147540983, |
| "grad_norm": 19.64323616027832, |
| "learning_rate": 1.63265306122449e-05, |
| "loss": 1.9194, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.19924337957124844, |
| "grad_norm": 22.459064483642578, |
| "learning_rate": 1.6275510204081636e-05, |
| "loss": 1.6649, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.201765447667087, |
| "grad_norm": 36.789764404296875, |
| "learning_rate": 1.6224489795918368e-05, |
| "loss": 2.0131, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2042875157629256, |
| "grad_norm": 22.109119415283203, |
| "learning_rate": 1.6173469387755104e-05, |
| "loss": 1.9603, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.2068095838587642, |
| "grad_norm": 19.196834564208984, |
| "learning_rate": 1.612244897959184e-05, |
| "loss": 2.0538, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.20933165195460277, |
| "grad_norm": 26.870800018310547, |
| "learning_rate": 1.6071428571428572e-05, |
| "loss": 1.9168, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.21185372005044137, |
| "grad_norm": 35.190696716308594, |
| "learning_rate": 1.6020408163265308e-05, |
| "loss": 2.0149, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.21437578814627994, |
| "grad_norm": 19.963472366333008, |
| "learning_rate": 1.596938775510204e-05, |
| "loss": 1.7871, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.21689785624211855, |
| "grad_norm": 20.292407989501953, |
| "learning_rate": 1.5918367346938776e-05, |
| "loss": 1.944, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.21941992433795712, |
| "grad_norm": 20.55329132080078, |
| "learning_rate": 1.586734693877551e-05, |
| "loss": 2.0175, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.22194199243379573, |
| "grad_norm": 17.27350616455078, |
| "learning_rate": 1.5816326530612247e-05, |
| "loss": 1.9308, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.2244640605296343, |
| "grad_norm": 22.471134185791016, |
| "learning_rate": 1.576530612244898e-05, |
| "loss": 1.9221, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.22698612862547288, |
| "grad_norm": 25.098316192626953, |
| "learning_rate": 1.5714285714285715e-05, |
| "loss": 1.9359, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.22950819672131148, |
| "grad_norm": 25.125213623046875, |
| "learning_rate": 1.566326530612245e-05, |
| "loss": 2.0087, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.23203026481715006, |
| "grad_norm": 20.038599014282227, |
| "learning_rate": 1.5612244897959187e-05, |
| "loss": 2.0939, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.23455233291298866, |
| "grad_norm": 19.016841888427734, |
| "learning_rate": 1.556122448979592e-05, |
| "loss": 2.0183, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.23707440100882723, |
| "grad_norm": 21.97820472717285, |
| "learning_rate": 1.5510204081632655e-05, |
| "loss": 1.8239, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.23959646910466584, |
| "grad_norm": 25.578901290893555, |
| "learning_rate": 1.545918367346939e-05, |
| "loss": 1.9388, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.2421185372005044, |
| "grad_norm": 23.74614143371582, |
| "learning_rate": 1.5408163265306123e-05, |
| "loss": 2.0492, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.244640605296343, |
| "grad_norm": 22.203304290771484, |
| "learning_rate": 1.535714285714286e-05, |
| "loss": 1.941, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.2471626733921816, |
| "grad_norm": 21.39324188232422, |
| "learning_rate": 1.530612244897959e-05, |
| "loss": 1.9042, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.24968474148802017, |
| "grad_norm": 18.99315643310547, |
| "learning_rate": 1.5255102040816327e-05, |
| "loss": 1.88, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.25220680958385877, |
| "grad_norm": 24.22341537475586, |
| "learning_rate": 1.5204081632653063e-05, |
| "loss": 1.8147, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.25220680958385877, |
| "eval_loss": 1.8966256380081177, |
| "eval_runtime": 6.9787, |
| "eval_samples_per_second": 101.022, |
| "eval_steps_per_second": 50.583, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2547288776796974, |
| "grad_norm": 18.296152114868164, |
| "learning_rate": 1.5153061224489798e-05, |
| "loss": 1.8605, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.2572509457755359, |
| "grad_norm": 26.404766082763672, |
| "learning_rate": 1.510204081632653e-05, |
| "loss": 2.1195, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.2597730138713745, |
| "grad_norm": 19.187122344970703, |
| "learning_rate": 1.5051020408163266e-05, |
| "loss": 1.9284, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.26229508196721313, |
| "grad_norm": 20.79934310913086, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 1.725, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.2648171500630517, |
| "grad_norm": 23.833288192749023, |
| "learning_rate": 1.4948979591836736e-05, |
| "loss": 1.9215, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.2673392181588903, |
| "grad_norm": 22.301727294921875, |
| "learning_rate": 1.4897959183673472e-05, |
| "loss": 1.8728, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.2698612862547289, |
| "grad_norm": 23.685596466064453, |
| "learning_rate": 1.4846938775510204e-05, |
| "loss": 2.0482, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.2723833543505675, |
| "grad_norm": 18.969186782836914, |
| "learning_rate": 1.479591836734694e-05, |
| "loss": 1.8724, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.27490542244640603, |
| "grad_norm": 23.994483947753906, |
| "learning_rate": 1.4744897959183676e-05, |
| "loss": 1.9542, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.27742749054224464, |
| "grad_norm": 16.84621238708496, |
| "learning_rate": 1.469387755102041e-05, |
| "loss": 1.9703, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.27994955863808324, |
| "grad_norm": 23.411087036132812, |
| "learning_rate": 1.4642857142857144e-05, |
| "loss": 1.9836, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.28247162673392184, |
| "grad_norm": 29.55487632751465, |
| "learning_rate": 1.4591836734693878e-05, |
| "loss": 1.9124, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.2849936948297604, |
| "grad_norm": 32.28921127319336, |
| "learning_rate": 1.4540816326530614e-05, |
| "loss": 1.8566, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.287515762925599, |
| "grad_norm": 24.007558822631836, |
| "learning_rate": 1.448979591836735e-05, |
| "loss": 1.8296, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.2900378310214376, |
| "grad_norm": 26.753524780273438, |
| "learning_rate": 1.4438775510204083e-05, |
| "loss": 1.7181, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.29255989911727615, |
| "grad_norm": 22.49270248413086, |
| "learning_rate": 1.4387755102040817e-05, |
| "loss": 1.8741, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.29508196721311475, |
| "grad_norm": 28.006656646728516, |
| "learning_rate": 1.4336734693877551e-05, |
| "loss": 1.9151, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.29760403530895335, |
| "grad_norm": 17.606775283813477, |
| "learning_rate": 1.4285714285714287e-05, |
| "loss": 1.9654, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.30012610340479196, |
| "grad_norm": 29.94802474975586, |
| "learning_rate": 1.4234693877551023e-05, |
| "loss": 1.8849, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.3026481715006305, |
| "grad_norm": 28.27743148803711, |
| "learning_rate": 1.4183673469387755e-05, |
| "loss": 1.8006, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3051702395964691, |
| "grad_norm": 19.11652183532715, |
| "learning_rate": 1.4132653061224491e-05, |
| "loss": 1.8539, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.3076923076923077, |
| "grad_norm": 24.255807876586914, |
| "learning_rate": 1.4081632653061225e-05, |
| "loss": 2.0162, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.31021437578814626, |
| "grad_norm": 22.508352279663086, |
| "learning_rate": 1.403061224489796e-05, |
| "loss": 1.858, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.31273644388398486, |
| "grad_norm": 27.028772354125977, |
| "learning_rate": 1.3979591836734696e-05, |
| "loss": 1.8175, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.31525851197982346, |
| "grad_norm": 22.697704315185547, |
| "learning_rate": 1.3928571428571429e-05, |
| "loss": 1.9789, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.31778058007566207, |
| "grad_norm": 31.604068756103516, |
| "learning_rate": 1.3877551020408165e-05, |
| "loss": 2.0039, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.3203026481715006, |
| "grad_norm": 27.71053695678711, |
| "learning_rate": 1.38265306122449e-05, |
| "loss": 1.9867, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.3228247162673392, |
| "grad_norm": 17.37586784362793, |
| "learning_rate": 1.3775510204081634e-05, |
| "loss": 1.6931, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.3253467843631778, |
| "grad_norm": 20.28536605834961, |
| "learning_rate": 1.3724489795918368e-05, |
| "loss": 1.9199, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.32786885245901637, |
| "grad_norm": 29.4377384185791, |
| "learning_rate": 1.3673469387755102e-05, |
| "loss": 1.9322, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.33039092055485497, |
| "grad_norm": 17.703046798706055, |
| "learning_rate": 1.3622448979591838e-05, |
| "loss": 1.8842, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.3329129886506936, |
| "grad_norm": 30.14008140563965, |
| "learning_rate": 1.3571428571428574e-05, |
| "loss": 2.1228, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.3354350567465322, |
| "grad_norm": 29.657262802124023, |
| "learning_rate": 1.3520408163265306e-05, |
| "loss": 1.8929, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.3379571248423707, |
| "grad_norm": 18.243854522705078, |
| "learning_rate": 1.3469387755102042e-05, |
| "loss": 1.8811, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.34047919293820933, |
| "grad_norm": 33.22247314453125, |
| "learning_rate": 1.3418367346938776e-05, |
| "loss": 1.9814, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.34300126103404793, |
| "grad_norm": 26.413856506347656, |
| "learning_rate": 1.3367346938775512e-05, |
| "loss": 1.9329, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.3455233291298865, |
| "grad_norm": 20.56089210510254, |
| "learning_rate": 1.3316326530612247e-05, |
| "loss": 1.8944, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.3480453972257251, |
| "grad_norm": 19.480737686157227, |
| "learning_rate": 1.326530612244898e-05, |
| "loss": 1.9221, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.3505674653215637, |
| "grad_norm": 22.788074493408203, |
| "learning_rate": 1.3214285714285716e-05, |
| "loss": 2.0445, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.3530895334174023, |
| "grad_norm": 20.722291946411133, |
| "learning_rate": 1.316326530612245e-05, |
| "loss": 1.9998, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.35561160151324084, |
| "grad_norm": 25.190189361572266, |
| "learning_rate": 1.3112244897959185e-05, |
| "loss": 1.8076, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.35813366960907944, |
| "grad_norm": 23.203886032104492, |
| "learning_rate": 1.3061224489795918e-05, |
| "loss": 1.7821, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.36065573770491804, |
| "grad_norm": 25.32374382019043, |
| "learning_rate": 1.3010204081632653e-05, |
| "loss": 2.0356, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.36317780580075665, |
| "grad_norm": 28.798864364624023, |
| "learning_rate": 1.2959183673469389e-05, |
| "loss": 1.8202, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.3656998738965952, |
| "grad_norm": 24.93810272216797, |
| "learning_rate": 1.2908163265306123e-05, |
| "loss": 1.9237, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.3682219419924338, |
| "grad_norm": 36.78353500366211, |
| "learning_rate": 1.2857142857142859e-05, |
| "loss": 2.0019, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.3707440100882724, |
| "grad_norm": 28.510663986206055, |
| "learning_rate": 1.2806122448979591e-05, |
| "loss": 1.9268, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.37326607818411095, |
| "grad_norm": 38.19087219238281, |
| "learning_rate": 1.2755102040816327e-05, |
| "loss": 1.9366, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.37578814627994955, |
| "grad_norm": 20.796728134155273, |
| "learning_rate": 1.2704081632653063e-05, |
| "loss": 1.8731, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.37831021437578816, |
| "grad_norm": 23.036758422851562, |
| "learning_rate": 1.2653061224489798e-05, |
| "loss": 1.8835, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.38083228247162676, |
| "grad_norm": 27.058195114135742, |
| "learning_rate": 1.260204081632653e-05, |
| "loss": 1.8013, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.3833543505674653, |
| "grad_norm": 25.390460968017578, |
| "learning_rate": 1.2551020408163267e-05, |
| "loss": 2.0623, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.3858764186633039, |
| "grad_norm": 27.993654251098633, |
| "learning_rate": 1.25e-05, |
| "loss": 1.6895, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.3883984867591425, |
| "grad_norm": 24.15807342529297, |
| "learning_rate": 1.2448979591836736e-05, |
| "loss": 1.9799, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.39092055485498106, |
| "grad_norm": 24.369815826416016, |
| "learning_rate": 1.2397959183673472e-05, |
| "loss": 1.9687, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.39344262295081966, |
| "grad_norm": 24.572988510131836, |
| "learning_rate": 1.2346938775510204e-05, |
| "loss": 1.8607, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.39596469104665827, |
| "grad_norm": 20.491390228271484, |
| "learning_rate": 1.229591836734694e-05, |
| "loss": 2.0677, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.39848675914249687, |
| "grad_norm": 25.128101348876953, |
| "learning_rate": 1.2244897959183674e-05, |
| "loss": 1.9233, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.4010088272383354, |
| "grad_norm": 18.843276977539062, |
| "learning_rate": 1.219387755102041e-05, |
| "loss": 1.781, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.403530895334174, |
| "grad_norm": 24.99994659423828, |
| "learning_rate": 1.2142857142857142e-05, |
| "loss": 1.962, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.4060529634300126, |
| "grad_norm": 20.679218292236328, |
| "learning_rate": 1.2091836734693878e-05, |
| "loss": 2.0055, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.4085750315258512, |
| "grad_norm": 26.00550651550293, |
| "learning_rate": 1.2040816326530614e-05, |
| "loss": 1.9761, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.4110970996216898, |
| "grad_norm": 33.80900192260742, |
| "learning_rate": 1.1989795918367348e-05, |
| "loss": 2.0502, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.4136191677175284, |
| "grad_norm": 25.639009475708008, |
| "learning_rate": 1.1938775510204084e-05, |
| "loss": 1.9088, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.416141235813367, |
| "grad_norm": 17.48627471923828, |
| "learning_rate": 1.1887755102040816e-05, |
| "loss": 1.9359, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.41866330390920553, |
| "grad_norm": 23.16074562072754, |
| "learning_rate": 1.1836734693877552e-05, |
| "loss": 1.8647, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.42118537200504413, |
| "grad_norm": 25.39946174621582, |
| "learning_rate": 1.1785714285714287e-05, |
| "loss": 1.8523, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.42370744010088274, |
| "grad_norm": 25.8050537109375, |
| "learning_rate": 1.1734693877551021e-05, |
| "loss": 1.8403, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.4262295081967213, |
| "grad_norm": 20.019033432006836, |
| "learning_rate": 1.1683673469387755e-05, |
| "loss": 2.0023, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.4287515762925599, |
| "grad_norm": 26.194847106933594, |
| "learning_rate": 1.1632653061224491e-05, |
| "loss": 1.9429, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.4312736443883985, |
| "grad_norm": 21.064212799072266, |
| "learning_rate": 1.1581632653061225e-05, |
| "loss": 1.8302, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.4337957124842371, |
| "grad_norm": 21.876129150390625, |
| "learning_rate": 1.1530612244897961e-05, |
| "loss": 1.8881, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.43631778058007564, |
| "grad_norm": 33.61103439331055, |
| "learning_rate": 1.1479591836734697e-05, |
| "loss": 2.0497, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.43883984867591425, |
| "grad_norm": 27.204744338989258, |
| "learning_rate": 1.1428571428571429e-05, |
| "loss": 1.8431, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.44136191677175285, |
| "grad_norm": 21.605751037597656, |
| "learning_rate": 1.1377551020408165e-05, |
| "loss": 1.9149, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.44388398486759145, |
| "grad_norm": 30.307472229003906, |
| "learning_rate": 1.1326530612244899e-05, |
| "loss": 2.0313, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.44640605296343, |
| "grad_norm": 23.69244384765625, |
| "learning_rate": 1.1275510204081635e-05, |
| "loss": 1.8676, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.4489281210592686, |
| "grad_norm": 25.619901657104492, |
| "learning_rate": 1.1224489795918367e-05, |
| "loss": 1.7905, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.4514501891551072, |
| "grad_norm": 28.0296573638916, |
| "learning_rate": 1.1173469387755103e-05, |
| "loss": 1.8567, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.45397225725094575, |
| "grad_norm": 36.4359130859375, |
| "learning_rate": 1.1122448979591838e-05, |
| "loss": 2.115, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.45649432534678436, |
| "grad_norm": 26.91726303100586, |
| "learning_rate": 1.1071428571428572e-05, |
| "loss": 2.0642, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.45901639344262296, |
| "grad_norm": 23.085880279541016, |
| "learning_rate": 1.1020408163265306e-05, |
| "loss": 1.9109, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.46153846153846156, |
| "grad_norm": 27.870641708374023, |
| "learning_rate": 1.096938775510204e-05, |
| "loss": 1.8999, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.4640605296343001, |
| "grad_norm": 32.0672607421875, |
| "learning_rate": 1.0918367346938776e-05, |
| "loss": 1.904, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.4665825977301387, |
| "grad_norm": 28.879365921020508, |
| "learning_rate": 1.0867346938775512e-05, |
| "loss": 1.7159, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.4691046658259773, |
| "grad_norm": 27.592771530151367, |
| "learning_rate": 1.0816326530612246e-05, |
| "loss": 1.8561, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.47162673392181587, |
| "grad_norm": 27.412763595581055, |
| "learning_rate": 1.076530612244898e-05, |
| "loss": 1.9282, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.47414880201765447, |
| "grad_norm": 30.12356185913086, |
| "learning_rate": 1.0714285714285714e-05, |
| "loss": 1.8726, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.4766708701134931, |
| "grad_norm": 39.9027214050293, |
| "learning_rate": 1.066326530612245e-05, |
| "loss": 1.7551, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.4791929382093317, |
| "grad_norm": 30.483945846557617, |
| "learning_rate": 1.0612244897959186e-05, |
| "loss": 1.7643, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.4817150063051702, |
| "grad_norm": 26.00415802001953, |
| "learning_rate": 1.0561224489795918e-05, |
| "loss": 2.0552, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.4842370744010088, |
| "grad_norm": 23.03282356262207, |
| "learning_rate": 1.0510204081632654e-05, |
| "loss": 2.0052, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.48675914249684743, |
| "grad_norm": 33.653221130371094, |
| "learning_rate": 1.045918367346939e-05, |
| "loss": 1.7367, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.489281210592686, |
| "grad_norm": 39.59351348876953, |
| "learning_rate": 1.0408163265306123e-05, |
| "loss": 1.9726, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.4918032786885246, |
| "grad_norm": 42.77714920043945, |
| "learning_rate": 1.0357142857142859e-05, |
| "loss": 2.0906, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.4943253467843632, |
| "grad_norm": 33.194549560546875, |
| "learning_rate": 1.0306122448979591e-05, |
| "loss": 2.0742, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.4968474148802018, |
| "grad_norm": 25.10793685913086, |
| "learning_rate": 1.0255102040816327e-05, |
| "loss": 1.9204, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.49936948297604034, |
| "grad_norm": 40.048404693603516, |
| "learning_rate": 1.0204081632653063e-05, |
| "loss": 1.7775, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.501891551071879, |
| "grad_norm": 26.085933685302734, |
| "learning_rate": 1.0153061224489797e-05, |
| "loss": 1.9459, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.5044136191677175, |
| "grad_norm": 18.375, |
| "learning_rate": 1.0102040816326531e-05, |
| "loss": 1.9536, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5044136191677175, |
| "eval_loss": 1.8626214265823364, |
| "eval_runtime": 6.6508, |
| "eval_samples_per_second": 106.002, |
| "eval_steps_per_second": 53.076, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5069356872635561, |
| "grad_norm": 33.858341217041016, |
| "learning_rate": 1.0051020408163265e-05, |
| "loss": 1.8191, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.5094577553593947, |
| "grad_norm": 22.895992279052734, |
| "learning_rate": 1e-05, |
| "loss": 2.0596, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.5119798234552333, |
| "grad_norm": 30.55072593688965, |
| "learning_rate": 9.948979591836737e-06, |
| "loss": 1.8904, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.5145018915510718, |
| "grad_norm": 26.542705535888672, |
| "learning_rate": 9.89795918367347e-06, |
| "loss": 1.9683, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.5170239596469105, |
| "grad_norm": 39.81034851074219, |
| "learning_rate": 9.846938775510205e-06, |
| "loss": 1.9726, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.519546027742749, |
| "grad_norm": 22.0065860748291, |
| "learning_rate": 9.795918367346939e-06, |
| "loss": 2.0575, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.5220680958385876, |
| "grad_norm": 19.012041091918945, |
| "learning_rate": 9.744897959183674e-06, |
| "loss": 1.8431, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.5245901639344263, |
| "grad_norm": 39.699974060058594, |
| "learning_rate": 9.693877551020408e-06, |
| "loss": 1.8231, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.5271122320302648, |
| "grad_norm": 21.391319274902344, |
| "learning_rate": 9.642857142857144e-06, |
| "loss": 1.7939, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.5296343001261034, |
| "grad_norm": 25.8063907623291, |
| "learning_rate": 9.591836734693878e-06, |
| "loss": 2.0366, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.532156368221942, |
| "grad_norm": 20.598569869995117, |
| "learning_rate": 9.540816326530612e-06, |
| "loss": 1.8323, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.5346784363177806, |
| "grad_norm": 29.391401290893555, |
| "learning_rate": 9.489795918367348e-06, |
| "loss": 2.0052, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.5372005044136192, |
| "grad_norm": 24.39499855041504, |
| "learning_rate": 9.438775510204082e-06, |
| "loss": 1.8461, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.5397225725094578, |
| "grad_norm": 24.16887092590332, |
| "learning_rate": 9.387755102040818e-06, |
| "loss": 1.9404, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.5422446406052963, |
| "grad_norm": 24.577871322631836, |
| "learning_rate": 9.336734693877552e-06, |
| "loss": 1.9202, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.544766708701135, |
| "grad_norm": 26.117361068725586, |
| "learning_rate": 9.285714285714288e-06, |
| "loss": 1.921, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.5472887767969735, |
| "grad_norm": 22.586837768554688, |
| "learning_rate": 9.234693877551022e-06, |
| "loss": 1.9692, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.5498108448928121, |
| "grad_norm": 18.438722610473633, |
| "learning_rate": 9.183673469387756e-06, |
| "loss": 1.9496, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.5523329129886507, |
| "grad_norm": 22.94545555114746, |
| "learning_rate": 9.13265306122449e-06, |
| "loss": 1.991, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.5548549810844893, |
| "grad_norm": 28.664562225341797, |
| "learning_rate": 9.081632653061225e-06, |
| "loss": 1.8352, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.5573770491803278, |
| "grad_norm": 25.63576316833496, |
| "learning_rate": 9.03061224489796e-06, |
| "loss": 1.9399, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.5598991172761665, |
| "grad_norm": 21.650251388549805, |
| "learning_rate": 8.979591836734695e-06, |
| "loss": 1.9565, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.562421185372005, |
| "grad_norm": 29.605735778808594, |
| "learning_rate": 8.92857142857143e-06, |
| "loss": 1.729, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.5649432534678437, |
| "grad_norm": 23.98230743408203, |
| "learning_rate": 8.877551020408163e-06, |
| "loss": 1.9399, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.5674653215636822, |
| "grad_norm": 20.37510108947754, |
| "learning_rate": 8.826530612244899e-06, |
| "loss": 1.7322, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.5699873896595208, |
| "grad_norm": 25.876188278198242, |
| "learning_rate": 8.775510204081633e-06, |
| "loss": 1.9444, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.5725094577553594, |
| "grad_norm": 32.07249069213867, |
| "learning_rate": 8.724489795918369e-06, |
| "loss": 1.9364, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.575031525851198, |
| "grad_norm": 28.014524459838867, |
| "learning_rate": 8.673469387755103e-06, |
| "loss": 1.757, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.5775535939470365, |
| "grad_norm": 30.82647132873535, |
| "learning_rate": 8.622448979591837e-06, |
| "loss": 1.9067, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.5800756620428752, |
| "grad_norm": 30.651660919189453, |
| "learning_rate": 8.571428571428571e-06, |
| "loss": 1.9906, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.5825977301387137, |
| "grad_norm": 25.239904403686523, |
| "learning_rate": 8.520408163265307e-06, |
| "loss": 1.8914, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.5851197982345523, |
| "grad_norm": 21.33747673034668, |
| "learning_rate": 8.469387755102042e-06, |
| "loss": 1.9999, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.587641866330391, |
| "grad_norm": 25.255064010620117, |
| "learning_rate": 8.418367346938776e-06, |
| "loss": 1.8941, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.5901639344262295, |
| "grad_norm": 24.443973541259766, |
| "learning_rate": 8.36734693877551e-06, |
| "loss": 1.7679, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.592686002522068, |
| "grad_norm": 25.473894119262695, |
| "learning_rate": 8.316326530612246e-06, |
| "loss": 1.7876, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.5952080706179067, |
| "grad_norm": 26.28467559814453, |
| "learning_rate": 8.26530612244898e-06, |
| "loss": 1.6761, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.5977301387137453, |
| "grad_norm": 24.488052368164062, |
| "learning_rate": 8.214285714285714e-06, |
| "loss": 2.0022, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.6002522068095839, |
| "grad_norm": 30.074064254760742, |
| "learning_rate": 8.16326530612245e-06, |
| "loss": 1.7747, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.6027742749054225, |
| "grad_norm": 23.73440170288086, |
| "learning_rate": 8.112244897959184e-06, |
| "loss": 1.8468, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.605296343001261, |
| "grad_norm": 22.338869094848633, |
| "learning_rate": 8.06122448979592e-06, |
| "loss": 1.8611, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.6078184110970997, |
| "grad_norm": 24.844266891479492, |
| "learning_rate": 8.010204081632654e-06, |
| "loss": 1.9285, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.6103404791929382, |
| "grad_norm": 29.65668487548828, |
| "learning_rate": 7.959183673469388e-06, |
| "loss": 1.935, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.6128625472887768, |
| "grad_norm": 26.01723289489746, |
| "learning_rate": 7.908163265306124e-06, |
| "loss": 1.7587, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.6153846153846154, |
| "grad_norm": 27.04817771911621, |
| "learning_rate": 7.857142857142858e-06, |
| "loss": 1.8878, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.617906683480454, |
| "grad_norm": 36.23786163330078, |
| "learning_rate": 7.806122448979593e-06, |
| "loss": 1.992, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.6204287515762925, |
| "grad_norm": 19.283294677734375, |
| "learning_rate": 7.755102040816327e-06, |
| "loss": 1.8066, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.6229508196721312, |
| "grad_norm": 24.24143409729004, |
| "learning_rate": 7.704081632653061e-06, |
| "loss": 1.8899, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.6254728877679697, |
| "grad_norm": 25.59832763671875, |
| "learning_rate": 7.653061224489796e-06, |
| "loss": 1.9601, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.6279949558638083, |
| "grad_norm": 27.195640563964844, |
| "learning_rate": 7.602040816326531e-06, |
| "loss": 1.9561, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.6305170239596469, |
| "grad_norm": 27.854570388793945, |
| "learning_rate": 7.551020408163265e-06, |
| "loss": 1.8781, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.6330390920554855, |
| "grad_norm": 25.715761184692383, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 1.8542, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.6355611601513241, |
| "grad_norm": 22.562984466552734, |
| "learning_rate": 7.448979591836736e-06, |
| "loss": 1.7681, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.6380832282471627, |
| "grad_norm": 20.540348052978516, |
| "learning_rate": 7.39795918367347e-06, |
| "loss": 1.9617, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.6406052963430012, |
| "grad_norm": 24.610937118530273, |
| "learning_rate": 7.346938775510205e-06, |
| "loss": 1.9694, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.6431273644388399, |
| "grad_norm": 27.93538475036621, |
| "learning_rate": 7.295918367346939e-06, |
| "loss": 1.8858, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.6456494325346784, |
| "grad_norm": 31.466445922851562, |
| "learning_rate": 7.244897959183675e-06, |
| "loss": 1.8252, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.648171500630517, |
| "grad_norm": 26.276226043701172, |
| "learning_rate": 7.193877551020409e-06, |
| "loss": 1.8865, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.6506935687263556, |
| "grad_norm": 22.52095603942871, |
| "learning_rate": 7.1428571428571436e-06, |
| "loss": 1.7649, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.6532156368221942, |
| "grad_norm": 20.15144157409668, |
| "learning_rate": 7.091836734693878e-06, |
| "loss": 2.0158, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.6557377049180327, |
| "grad_norm": 26.405349731445312, |
| "learning_rate": 7.0408163265306125e-06, |
| "loss": 1.8932, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.6582597730138714, |
| "grad_norm": 32.94384765625, |
| "learning_rate": 6.989795918367348e-06, |
| "loss": 1.7795, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.6607818411097099, |
| "grad_norm": 23.109092712402344, |
| "learning_rate": 6.938775510204082e-06, |
| "loss": 1.8383, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.6633039092055486, |
| "grad_norm": 21.75737190246582, |
| "learning_rate": 6.887755102040817e-06, |
| "loss": 1.8727, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.6658259773013872, |
| "grad_norm": 22.96916389465332, |
| "learning_rate": 6.836734693877551e-06, |
| "loss": 1.8544, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.6683480453972257, |
| "grad_norm": 25.62445640563965, |
| "learning_rate": 6.785714285714287e-06, |
| "loss": 1.7503, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.6708701134930644, |
| "grad_norm": 25.430530548095703, |
| "learning_rate": 6.734693877551021e-06, |
| "loss": 1.7938, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.6733921815889029, |
| "grad_norm": 26.462881088256836, |
| "learning_rate": 6.683673469387756e-06, |
| "loss": 1.8284, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.6759142496847415, |
| "grad_norm": 31.45004653930664, |
| "learning_rate": 6.63265306122449e-06, |
| "loss": 2.0328, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.6784363177805801, |
| "grad_norm": 30.525737762451172, |
| "learning_rate": 6.581632653061225e-06, |
| "loss": 1.8192, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.6809583858764187, |
| "grad_norm": 25.705707550048828, |
| "learning_rate": 6.530612244897959e-06, |
| "loss": 1.8533, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.6834804539722572, |
| "grad_norm": 39.90187454223633, |
| "learning_rate": 6.4795918367346946e-06, |
| "loss": 1.9483, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.6860025220680959, |
| "grad_norm": 28.0180721282959, |
| "learning_rate": 6.4285714285714295e-06, |
| "loss": 1.8132, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.6885245901639344, |
| "grad_norm": 34.821372985839844, |
| "learning_rate": 6.3775510204081635e-06, |
| "loss": 1.9599, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.691046658259773, |
| "grad_norm": 24.018394470214844, |
| "learning_rate": 6.326530612244899e-06, |
| "loss": 1.9248, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.6935687263556116, |
| "grad_norm": 24.074344635009766, |
| "learning_rate": 6.275510204081633e-06, |
| "loss": 2.0148, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.6960907944514502, |
| "grad_norm": 31.1939754486084, |
| "learning_rate": 6.224489795918368e-06, |
| "loss": 1.8959, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.6986128625472888, |
| "grad_norm": 25.481502532958984, |
| "learning_rate": 6.173469387755102e-06, |
| "loss": 1.9832, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.7011349306431274, |
| "grad_norm": 29.6664981842041, |
| "learning_rate": 6.122448979591837e-06, |
| "loss": 1.9222, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.7036569987389659, |
| "grad_norm": 26.30698585510254, |
| "learning_rate": 6.071428571428571e-06, |
| "loss": 1.9897, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.7061790668348046, |
| "grad_norm": 31.827558517456055, |
| "learning_rate": 6.020408163265307e-06, |
| "loss": 1.8615, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.7087011349306431, |
| "grad_norm": 24.80223846435547, |
| "learning_rate": 5.969387755102042e-06, |
| "loss": 1.9579, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.7112232030264817, |
| "grad_norm": 36.134700775146484, |
| "learning_rate": 5.918367346938776e-06, |
| "loss": 1.723, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.7137452711223203, |
| "grad_norm": 30.388233184814453, |
| "learning_rate": 5.867346938775511e-06, |
| "loss": 1.9736, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.7162673392181589, |
| "grad_norm": 32.231563568115234, |
| "learning_rate": 5.816326530612246e-06, |
| "loss": 1.9228, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.7187894073139974, |
| "grad_norm": 38.05869674682617, |
| "learning_rate": 5.7653061224489805e-06, |
| "loss": 1.9159, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.7213114754098361, |
| "grad_norm": 27.256147384643555, |
| "learning_rate": 5.7142857142857145e-06, |
| "loss": 1.8072, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.7238335435056746, |
| "grad_norm": 25.67181396484375, |
| "learning_rate": 5.663265306122449e-06, |
| "loss": 1.8633, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.7263556116015133, |
| "grad_norm": 31.8681697845459, |
| "learning_rate": 5.6122448979591834e-06, |
| "loss": 1.9822, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.7288776796973518, |
| "grad_norm": 32.85325241088867, |
| "learning_rate": 5.561224489795919e-06, |
| "loss": 1.8166, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.7313997477931904, |
| "grad_norm": 35.64312744140625, |
| "learning_rate": 5.510204081632653e-06, |
| "loss": 1.7166, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.733921815889029, |
| "grad_norm": 24.276235580444336, |
| "learning_rate": 5.459183673469388e-06, |
| "loss": 1.7593, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.7364438839848676, |
| "grad_norm": 29.371950149536133, |
| "learning_rate": 5.408163265306123e-06, |
| "loss": 1.8124, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.7389659520807061, |
| "grad_norm": 23.76220703125, |
| "learning_rate": 5.357142857142857e-06, |
| "loss": 1.7775, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.7414880201765448, |
| "grad_norm": 37.103050231933594, |
| "learning_rate": 5.306122448979593e-06, |
| "loss": 1.9253, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.7440100882723834, |
| "grad_norm": 20.0811767578125, |
| "learning_rate": 5.255102040816327e-06, |
| "loss": 1.8711, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.7465321563682219, |
| "grad_norm": 35.33123016357422, |
| "learning_rate": 5.204081632653062e-06, |
| "loss": 1.8764, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.7490542244640606, |
| "grad_norm": 31.880672454833984, |
| "learning_rate": 5.153061224489796e-06, |
| "loss": 1.8929, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.7515762925598991, |
| "grad_norm": 21.682334899902344, |
| "learning_rate": 5.1020408163265315e-06, |
| "loss": 2.0377, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.7540983606557377, |
| "grad_norm": 34.68608474731445, |
| "learning_rate": 5.0510204081632655e-06, |
| "loss": 1.9341, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.7566204287515763, |
| "grad_norm": 25.59632110595703, |
| "learning_rate": 5e-06, |
| "loss": 1.8264, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7566204287515763, |
| "eval_loss": 1.8502724170684814, |
| "eval_runtime": 6.6208, |
| "eval_samples_per_second": 106.483, |
| "eval_steps_per_second": 53.317, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7591424968474149, |
| "grad_norm": 33.780616760253906, |
| "learning_rate": 4.948979591836735e-06, |
| "loss": 1.8315, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.7616645649432535, |
| "grad_norm": 23.005069732666016, |
| "learning_rate": 4.897959183673469e-06, |
| "loss": 1.8434, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.7641866330390921, |
| "grad_norm": 27.338787078857422, |
| "learning_rate": 4.846938775510204e-06, |
| "loss": 1.9102, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.7667087011349306, |
| "grad_norm": 26.87493133544922, |
| "learning_rate": 4.795918367346939e-06, |
| "loss": 1.8408, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 33.59117126464844, |
| "learning_rate": 4.744897959183674e-06, |
| "loss": 1.8633, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.7717528373266078, |
| "grad_norm": 38.98092269897461, |
| "learning_rate": 4.693877551020409e-06, |
| "loss": 1.8187, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.7742749054224464, |
| "grad_norm": 28.7203369140625, |
| "learning_rate": 4.642857142857144e-06, |
| "loss": 1.8425, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.776796973518285, |
| "grad_norm": 30.91414451599121, |
| "learning_rate": 4.591836734693878e-06, |
| "loss": 1.8526, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.7793190416141236, |
| "grad_norm": 29.04154396057129, |
| "learning_rate": 4.540816326530613e-06, |
| "loss": 1.8913, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.7818411097099621, |
| "grad_norm": 29.638099670410156, |
| "learning_rate": 4.489795918367348e-06, |
| "loss": 1.8736, |
| "step": 310 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 397, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 10, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4524488042102784.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|