{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7818411097099621, "eval_steps": 100, "global_step": 310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025220680958385876, "grad_norm": 25.350475311279297, "learning_rate": 0.0, "loss": 2.5568, "step": 1 }, { "epoch": 0.005044136191677175, "grad_norm": 24.538068771362305, "learning_rate": 4.000000000000001e-06, "loss": 2.7748, "step": 2 }, { "epoch": 0.007566204287515763, "grad_norm": 23.780784606933594, "learning_rate": 8.000000000000001e-06, "loss": 2.5911, "step": 3 }, { "epoch": 0.01008827238335435, "grad_norm": 24.780380249023438, "learning_rate": 1.2e-05, "loss": 2.8427, "step": 4 }, { "epoch": 0.012610340479192938, "grad_norm": 22.699949264526367, "learning_rate": 1.6000000000000003e-05, "loss": 2.709, "step": 5 }, { "epoch": 0.015132408575031526, "grad_norm": 22.106008529663086, "learning_rate": 2e-05, "loss": 2.5854, "step": 6 }, { "epoch": 0.017654476670870115, "grad_norm": 22.497045516967773, "learning_rate": 1.9948979591836737e-05, "loss": 2.5427, "step": 7 }, { "epoch": 0.0201765447667087, "grad_norm": 27.103275299072266, "learning_rate": 1.9897959183673473e-05, "loss": 2.6599, "step": 8 }, { "epoch": 0.02269861286254729, "grad_norm": 21.081985473632812, "learning_rate": 1.9846938775510205e-05, "loss": 2.5145, "step": 9 }, { "epoch": 0.025220680958385876, "grad_norm": 25.964981079101562, "learning_rate": 1.979591836734694e-05, "loss": 2.4247, "step": 10 }, { "epoch": 0.027742749054224466, "grad_norm": 25.353195190429688, "learning_rate": 1.9744897959183677e-05, "loss": 2.5092, "step": 11 }, { "epoch": 0.03026481715006305, "grad_norm": 18.94191551208496, "learning_rate": 1.969387755102041e-05, "loss": 2.4335, "step": 12 }, { "epoch": 0.03278688524590164, "grad_norm": 23.60140037536621, "learning_rate": 1.9642857142857145e-05, "loss": 2.544, "step": 13 }, { "epoch": 0.03530895334174023, "grad_norm": 24.298965454101562, "learning_rate": 1.9591836734693877e-05, "loss": 2.4987, "step": 14 }, { "epoch": 0.03783102143757881, "grad_norm": 20.745506286621094, "learning_rate": 1.9540816326530613e-05, "loss": 2.4985, "step": 15 }, { "epoch": 0.0403530895334174, "grad_norm": 22.54330062866211, "learning_rate": 1.948979591836735e-05, "loss": 2.6892, "step": 16 }, { "epoch": 0.04287515762925599, "grad_norm": 21.46229362487793, "learning_rate": 1.9438775510204085e-05, "loss": 2.3998, "step": 17 }, { "epoch": 0.04539722572509458, "grad_norm": 20.54530143737793, "learning_rate": 1.9387755102040817e-05, "loss": 2.4244, "step": 18 }, { "epoch": 0.04791929382093316, "grad_norm": 18.8839111328125, "learning_rate": 1.9336734693877553e-05, "loss": 2.3911, "step": 19 }, { "epoch": 0.05044136191677175, "grad_norm": 16.924652099609375, "learning_rate": 1.928571428571429e-05, "loss": 2.3588, "step": 20 }, { "epoch": 0.05296343001261034, "grad_norm": 16.996627807617188, "learning_rate": 1.9234693877551024e-05, "loss": 2.3727, "step": 21 }, { "epoch": 0.05548549810844893, "grad_norm": 18.584613800048828, "learning_rate": 1.9183673469387756e-05, "loss": 2.2974, "step": 22 }, { "epoch": 0.058007566204287514, "grad_norm": 14.309200286865234, "learning_rate": 1.9132653061224492e-05, "loss": 2.4843, "step": 23 }, { "epoch": 0.0605296343001261, "grad_norm": 15.074164390563965, "learning_rate": 1.9081632653061225e-05, "loss": 2.4043, "step": 24 }, { "epoch": 0.06305170239596469, "grad_norm": 13.610542297363281, "learning_rate": 1.903061224489796e-05, "loss": 2.3762, "step": 25 }, { "epoch": 0.06557377049180328, "grad_norm": 15.666613578796387, "learning_rate": 1.8979591836734696e-05, "loss": 2.3249, "step": 26 }, { "epoch": 0.06809583858764187, "grad_norm": 14.475164413452148, "learning_rate": 1.892857142857143e-05, "loss": 2.3317, "step": 27 }, { "epoch": 0.07061790668348046, "grad_norm": 16.231687545776367, "learning_rate": 1.8877551020408164e-05, "loss": 2.5064, "step": 28 }, { "epoch": 0.07313997477931904, "grad_norm": 16.8968563079834, "learning_rate": 1.88265306122449e-05, "loss": 2.3932, "step": 29 }, { "epoch": 0.07566204287515763, "grad_norm": 17.74305534362793, "learning_rate": 1.8775510204081636e-05, "loss": 2.3329, "step": 30 }, { "epoch": 0.07818411097099622, "grad_norm": 16.41620445251465, "learning_rate": 1.8724489795918368e-05, "loss": 2.3982, "step": 31 }, { "epoch": 0.0807061790668348, "grad_norm": 17.965959548950195, "learning_rate": 1.8673469387755104e-05, "loss": 2.4227, "step": 32 }, { "epoch": 0.0832282471626734, "grad_norm": 19.92589569091797, "learning_rate": 1.862244897959184e-05, "loss": 2.5255, "step": 33 }, { "epoch": 0.08575031525851198, "grad_norm": 20.62932586669922, "learning_rate": 1.8571428571428575e-05, "loss": 2.1816, "step": 34 }, { "epoch": 0.08827238335435057, "grad_norm": 18.360614776611328, "learning_rate": 1.8520408163265307e-05, "loss": 2.2827, "step": 35 }, { "epoch": 0.09079445145018916, "grad_norm": 19.199546813964844, "learning_rate": 1.8469387755102043e-05, "loss": 2.1498, "step": 36 }, { "epoch": 0.09331651954602774, "grad_norm": 22.727521896362305, "learning_rate": 1.8418367346938776e-05, "loss": 2.3811, "step": 37 }, { "epoch": 0.09583858764186633, "grad_norm": 19.80649757385254, "learning_rate": 1.836734693877551e-05, "loss": 2.2342, "step": 38 }, { "epoch": 0.09836065573770492, "grad_norm": 22.24563217163086, "learning_rate": 1.8316326530612247e-05, "loss": 2.2287, "step": 39 }, { "epoch": 0.1008827238335435, "grad_norm": 25.384042739868164, "learning_rate": 1.826530612244898e-05, "loss": 2.1259, "step": 40 }, { "epoch": 0.1034047919293821, "grad_norm": 23.417089462280273, "learning_rate": 1.8214285714285715e-05, "loss": 2.0858, "step": 41 }, { "epoch": 0.10592686002522068, "grad_norm": 27.639497756958008, "learning_rate": 1.816326530612245e-05, "loss": 2.2243, "step": 42 }, { "epoch": 0.10844892812105927, "grad_norm": 27.390850067138672, "learning_rate": 1.8112244897959187e-05, "loss": 2.1314, "step": 43 }, { "epoch": 0.11097099621689786, "grad_norm": 27.956937789916992, "learning_rate": 1.806122448979592e-05, "loss": 2.1755, "step": 44 }, { "epoch": 0.11349306431273644, "grad_norm": 32.09632873535156, "learning_rate": 1.8010204081632655e-05, "loss": 2.2365, "step": 45 }, { "epoch": 0.11601513240857503, "grad_norm": 33.84647750854492, "learning_rate": 1.795918367346939e-05, "loss": 2.1671, "step": 46 }, { "epoch": 0.11853720050441362, "grad_norm": 32.027130126953125, "learning_rate": 1.7908163265306123e-05, "loss": 2.09, "step": 47 }, { "epoch": 0.1210592686002522, "grad_norm": 35.423587799072266, "learning_rate": 1.785714285714286e-05, "loss": 2.2479, "step": 48 }, { "epoch": 0.1235813366960908, "grad_norm": 31.041240692138672, "learning_rate": 1.780612244897959e-05, "loss": 1.9687, "step": 49 }, { "epoch": 0.12610340479192939, "grad_norm": 28.790103912353516, "learning_rate": 1.7755102040816327e-05, "loss": 2.1428, "step": 50 }, { "epoch": 0.12862547288776796, "grad_norm": 25.089313507080078, "learning_rate": 1.7704081632653062e-05, "loss": 2.0673, "step": 51 }, { "epoch": 0.13114754098360656, "grad_norm": 26.493867874145508, "learning_rate": 1.7653061224489798e-05, "loss": 2.0814, "step": 52 }, { "epoch": 0.13366960907944514, "grad_norm": 19.993173599243164, "learning_rate": 1.760204081632653e-05, "loss": 2.005, "step": 53 }, { "epoch": 0.13619167717528374, "grad_norm": 21.89765167236328, "learning_rate": 1.7551020408163266e-05, "loss": 2.2262, "step": 54 }, { "epoch": 0.13871374527112232, "grad_norm": 23.22844123840332, "learning_rate": 1.7500000000000002e-05, "loss": 2.0208, "step": 55 }, { "epoch": 0.14123581336696092, "grad_norm": 15.864526748657227, "learning_rate": 1.7448979591836738e-05, "loss": 2.0153, "step": 56 }, { "epoch": 0.1437578814627995, "grad_norm": 21.451187133789062, "learning_rate": 1.7397959183673473e-05, "loss": 2.1386, "step": 57 }, { "epoch": 0.14627994955863807, "grad_norm": 18.089811325073242, "learning_rate": 1.7346938775510206e-05, "loss": 1.9517, "step": 58 }, { "epoch": 0.14880201765447668, "grad_norm": 24.029157638549805, "learning_rate": 1.729591836734694e-05, "loss": 1.9719, "step": 59 }, { "epoch": 0.15132408575031525, "grad_norm": 18.722776412963867, "learning_rate": 1.7244897959183674e-05, "loss": 2.0623, "step": 60 }, { "epoch": 0.15384615384615385, "grad_norm": 20.211933135986328, "learning_rate": 1.719387755102041e-05, "loss": 2.0081, "step": 61 }, { "epoch": 0.15636822194199243, "grad_norm": 17.61188507080078, "learning_rate": 1.7142857142857142e-05, "loss": 1.8484, "step": 62 }, { "epoch": 0.15889029003783103, "grad_norm": 20.118955612182617, "learning_rate": 1.7091836734693878e-05, "loss": 2.0799, "step": 63 }, { "epoch": 0.1614123581336696, "grad_norm": 17.271841049194336, "learning_rate": 1.7040816326530613e-05, "loss": 1.9832, "step": 64 }, { "epoch": 0.16393442622950818, "grad_norm": 19.521392822265625, "learning_rate": 1.698979591836735e-05, "loss": 1.9129, "step": 65 }, { "epoch": 0.1664564943253468, "grad_norm": 22.660900115966797, "learning_rate": 1.6938775510204085e-05, "loss": 2.118, "step": 66 }, { "epoch": 0.16897856242118536, "grad_norm": 17.332427978515625, "learning_rate": 1.6887755102040817e-05, "loss": 1.9632, "step": 67 }, { "epoch": 0.17150063051702397, "grad_norm": 22.42765998840332, "learning_rate": 1.6836734693877553e-05, "loss": 1.954, "step": 68 }, { "epoch": 0.17402269861286254, "grad_norm": 23.6208553314209, "learning_rate": 1.678571428571429e-05, "loss": 1.9917, "step": 69 }, { "epoch": 0.17654476670870115, "grad_norm": 19.78505516052246, "learning_rate": 1.673469387755102e-05, "loss": 1.7964, "step": 70 }, { "epoch": 0.17906683480453972, "grad_norm": 19.453041076660156, "learning_rate": 1.6683673469387757e-05, "loss": 1.9587, "step": 71 }, { "epoch": 0.18158890290037832, "grad_norm": 24.731407165527344, "learning_rate": 1.6632653061224492e-05, "loss": 1.9945, "step": 72 }, { "epoch": 0.1841109709962169, "grad_norm": 20.977611541748047, "learning_rate": 1.6581632653061225e-05, "loss": 2.0617, "step": 73 }, { "epoch": 0.18663303909205547, "grad_norm": 22.959585189819336, "learning_rate": 1.653061224489796e-05, "loss": 1.98, "step": 74 }, { "epoch": 0.18915510718789408, "grad_norm": 21.952653884887695, "learning_rate": 1.6479591836734696e-05, "loss": 2.1094, "step": 75 }, { "epoch": 0.19167717528373265, "grad_norm": 22.320383071899414, "learning_rate": 1.642857142857143e-05, "loss": 1.8418, "step": 76 }, { "epoch": 0.19419924337957126, "grad_norm": 24.375411987304688, "learning_rate": 1.6377551020408164e-05, "loss": 1.8428, "step": 77 }, { "epoch": 0.19672131147540983, "grad_norm": 19.64323616027832, "learning_rate": 1.63265306122449e-05, "loss": 1.9194, "step": 78 }, { "epoch": 0.19924337957124844, "grad_norm": 22.459064483642578, "learning_rate": 1.6275510204081636e-05, "loss": 1.6649, "step": 79 }, { "epoch": 0.201765447667087, "grad_norm": 36.789764404296875, "learning_rate": 1.6224489795918368e-05, "loss": 2.0131, "step": 80 }, { "epoch": 0.2042875157629256, "grad_norm": 22.109119415283203, "learning_rate": 1.6173469387755104e-05, "loss": 1.9603, "step": 81 }, { "epoch": 0.2068095838587642, "grad_norm": 19.196834564208984, "learning_rate": 1.612244897959184e-05, "loss": 2.0538, "step": 82 }, { "epoch": 0.20933165195460277, "grad_norm": 26.870800018310547, "learning_rate": 1.6071428571428572e-05, "loss": 1.9168, "step": 83 }, { "epoch": 0.21185372005044137, "grad_norm": 35.190696716308594, "learning_rate": 1.6020408163265308e-05, "loss": 2.0149, "step": 84 }, { "epoch": 0.21437578814627994, "grad_norm": 19.963472366333008, "learning_rate": 1.596938775510204e-05, "loss": 1.7871, "step": 85 }, { "epoch": 0.21689785624211855, "grad_norm": 20.292407989501953, "learning_rate": 1.5918367346938776e-05, "loss": 1.944, "step": 86 }, { "epoch": 0.21941992433795712, "grad_norm": 20.55329132080078, "learning_rate": 1.586734693877551e-05, "loss": 2.0175, "step": 87 }, { "epoch": 0.22194199243379573, "grad_norm": 17.27350616455078, "learning_rate": 1.5816326530612247e-05, "loss": 1.9308, "step": 88 }, { "epoch": 0.2244640605296343, "grad_norm": 22.471134185791016, "learning_rate": 1.576530612244898e-05, "loss": 1.9221, "step": 89 }, { "epoch": 0.22698612862547288, "grad_norm": 25.098316192626953, "learning_rate": 1.5714285714285715e-05, "loss": 1.9359, "step": 90 }, { "epoch": 0.22950819672131148, "grad_norm": 25.125213623046875, "learning_rate": 1.566326530612245e-05, "loss": 2.0087, "step": 91 }, { "epoch": 0.23203026481715006, "grad_norm": 20.038599014282227, "learning_rate": 1.5612244897959187e-05, "loss": 2.0939, "step": 92 }, { "epoch": 0.23455233291298866, "grad_norm": 19.016841888427734, "learning_rate": 1.556122448979592e-05, "loss": 2.0183, "step": 93 }, { "epoch": 0.23707440100882723, "grad_norm": 21.97820472717285, "learning_rate": 1.5510204081632655e-05, "loss": 1.8239, "step": 94 }, { "epoch": 0.23959646910466584, "grad_norm": 25.578901290893555, "learning_rate": 1.545918367346939e-05, "loss": 1.9388, "step": 95 }, { "epoch": 0.2421185372005044, "grad_norm": 23.74614143371582, "learning_rate": 1.5408163265306123e-05, "loss": 2.0492, "step": 96 }, { "epoch": 0.244640605296343, "grad_norm": 22.203304290771484, "learning_rate": 1.535714285714286e-05, "loss": 1.941, "step": 97 }, { "epoch": 0.2471626733921816, "grad_norm": 21.39324188232422, "learning_rate": 1.530612244897959e-05, "loss": 1.9042, "step": 98 }, { "epoch": 0.24968474148802017, "grad_norm": 18.99315643310547, "learning_rate": 1.5255102040816327e-05, "loss": 1.88, "step": 99 }, { "epoch": 0.25220680958385877, "grad_norm": 24.22341537475586, "learning_rate": 1.5204081632653063e-05, "loss": 1.8147, "step": 100 }, { "epoch": 0.25220680958385877, "eval_loss": 1.8966256380081177, "eval_runtime": 6.9787, "eval_samples_per_second": 101.022, "eval_steps_per_second": 50.583, "step": 100 }, { "epoch": 0.2547288776796974, "grad_norm": 18.296152114868164, "learning_rate": 1.5153061224489798e-05, "loss": 1.8605, "step": 101 }, { "epoch": 0.2572509457755359, "grad_norm": 26.404766082763672, "learning_rate": 1.510204081632653e-05, "loss": 2.1195, "step": 102 }, { "epoch": 0.2597730138713745, "grad_norm": 19.187122344970703, "learning_rate": 1.5051020408163266e-05, "loss": 1.9284, "step": 103 }, { "epoch": 0.26229508196721313, "grad_norm": 20.79934310913086, "learning_rate": 1.5000000000000002e-05, "loss": 1.725, "step": 104 }, { "epoch": 0.2648171500630517, "grad_norm": 23.833288192749023, "learning_rate": 1.4948979591836736e-05, "loss": 1.9215, "step": 105 }, { "epoch": 0.2673392181588903, "grad_norm": 22.301727294921875, "learning_rate": 1.4897959183673472e-05, "loss": 1.8728, "step": 106 }, { "epoch": 0.2698612862547289, "grad_norm": 23.685596466064453, "learning_rate": 1.4846938775510204e-05, "loss": 2.0482, "step": 107 }, { "epoch": 0.2723833543505675, "grad_norm": 18.969186782836914, "learning_rate": 1.479591836734694e-05, "loss": 1.8724, "step": 108 }, { "epoch": 0.27490542244640603, "grad_norm": 23.994483947753906, "learning_rate": 1.4744897959183676e-05, "loss": 1.9542, "step": 109 }, { "epoch": 0.27742749054224464, "grad_norm": 16.84621238708496, "learning_rate": 1.469387755102041e-05, "loss": 1.9703, "step": 110 }, { "epoch": 0.27994955863808324, "grad_norm": 23.411087036132812, "learning_rate": 1.4642857142857144e-05, "loss": 1.9836, "step": 111 }, { "epoch": 0.28247162673392184, "grad_norm": 29.55487632751465, "learning_rate": 1.4591836734693878e-05, "loss": 1.9124, "step": 112 }, { "epoch": 0.2849936948297604, "grad_norm": 32.28921127319336, "learning_rate": 1.4540816326530614e-05, "loss": 1.8566, "step": 113 }, { "epoch": 0.287515762925599, "grad_norm": 24.007558822631836, "learning_rate": 1.448979591836735e-05, "loss": 1.8296, "step": 114 }, { "epoch": 0.2900378310214376, "grad_norm": 26.753524780273438, "learning_rate": 1.4438775510204083e-05, "loss": 1.7181, "step": 115 }, { "epoch": 0.29255989911727615, "grad_norm": 22.49270248413086, "learning_rate": 1.4387755102040817e-05, "loss": 1.8741, "step": 116 }, { "epoch": 0.29508196721311475, "grad_norm": 28.006656646728516, "learning_rate": 1.4336734693877551e-05, "loss": 1.9151, "step": 117 }, { "epoch": 0.29760403530895335, "grad_norm": 17.606775283813477, "learning_rate": 1.4285714285714287e-05, "loss": 1.9654, "step": 118 }, { "epoch": 0.30012610340479196, "grad_norm": 29.94802474975586, "learning_rate": 1.4234693877551023e-05, "loss": 1.8849, "step": 119 }, { "epoch": 0.3026481715006305, "grad_norm": 28.27743148803711, "learning_rate": 1.4183673469387755e-05, "loss": 1.8006, "step": 120 }, { "epoch": 0.3051702395964691, "grad_norm": 19.11652183532715, "learning_rate": 1.4132653061224491e-05, "loss": 1.8539, "step": 121 }, { "epoch": 0.3076923076923077, "grad_norm": 24.255807876586914, "learning_rate": 1.4081632653061225e-05, "loss": 2.0162, "step": 122 }, { "epoch": 0.31021437578814626, "grad_norm": 22.508352279663086, "learning_rate": 1.403061224489796e-05, "loss": 1.858, "step": 123 }, { "epoch": 0.31273644388398486, "grad_norm": 27.028772354125977, "learning_rate": 1.3979591836734696e-05, "loss": 1.8175, "step": 124 }, { "epoch": 0.31525851197982346, "grad_norm": 22.697704315185547, "learning_rate": 1.3928571428571429e-05, "loss": 1.9789, "step": 125 }, { "epoch": 0.31778058007566207, "grad_norm": 31.604068756103516, "learning_rate": 1.3877551020408165e-05, "loss": 2.0039, "step": 126 }, { "epoch": 0.3203026481715006, "grad_norm": 27.71053695678711, "learning_rate": 1.38265306122449e-05, "loss": 1.9867, "step": 127 }, { "epoch": 0.3228247162673392, "grad_norm": 17.37586784362793, "learning_rate": 1.3775510204081634e-05, "loss": 1.6931, "step": 128 }, { "epoch": 0.3253467843631778, "grad_norm": 20.28536605834961, "learning_rate": 1.3724489795918368e-05, "loss": 1.9199, "step": 129 }, { "epoch": 0.32786885245901637, "grad_norm": 29.4377384185791, "learning_rate": 1.3673469387755102e-05, "loss": 1.9322, "step": 130 }, { "epoch": 0.33039092055485497, "grad_norm": 17.703046798706055, "learning_rate": 1.3622448979591838e-05, "loss": 1.8842, "step": 131 }, { "epoch": 0.3329129886506936, "grad_norm": 30.14008140563965, "learning_rate": 1.3571428571428574e-05, "loss": 2.1228, "step": 132 }, { "epoch": 0.3354350567465322, "grad_norm": 29.657262802124023, "learning_rate": 1.3520408163265306e-05, "loss": 1.8929, "step": 133 }, { "epoch": 0.3379571248423707, "grad_norm": 18.243854522705078, "learning_rate": 1.3469387755102042e-05, "loss": 1.8811, "step": 134 }, { "epoch": 0.34047919293820933, "grad_norm": 33.22247314453125, "learning_rate": 1.3418367346938776e-05, "loss": 1.9814, "step": 135 }, { "epoch": 0.34300126103404793, "grad_norm": 26.413856506347656, "learning_rate": 1.3367346938775512e-05, "loss": 1.9329, "step": 136 }, { "epoch": 0.3455233291298865, "grad_norm": 20.56089210510254, "learning_rate": 1.3316326530612247e-05, "loss": 1.8944, "step": 137 }, { "epoch": 0.3480453972257251, "grad_norm": 19.480737686157227, "learning_rate": 1.326530612244898e-05, "loss": 1.9221, "step": 138 }, { "epoch": 0.3505674653215637, "grad_norm": 22.788074493408203, "learning_rate": 1.3214285714285716e-05, "loss": 2.0445, "step": 139 }, { "epoch": 0.3530895334174023, "grad_norm": 20.722291946411133, "learning_rate": 1.316326530612245e-05, "loss": 1.9998, "step": 140 }, { "epoch": 0.35561160151324084, "grad_norm": 25.190189361572266, "learning_rate": 1.3112244897959185e-05, "loss": 1.8076, "step": 141 }, { "epoch": 0.35813366960907944, "grad_norm": 23.203886032104492, "learning_rate": 1.3061224489795918e-05, "loss": 1.7821, "step": 142 }, { "epoch": 0.36065573770491804, "grad_norm": 25.32374382019043, "learning_rate": 1.3010204081632653e-05, "loss": 2.0356, "step": 143 }, { "epoch": 0.36317780580075665, "grad_norm": 28.798864364624023, "learning_rate": 1.2959183673469389e-05, "loss": 1.8202, "step": 144 }, { "epoch": 0.3656998738965952, "grad_norm": 24.93810272216797, "learning_rate": 1.2908163265306123e-05, "loss": 1.9237, "step": 145 }, { "epoch": 0.3682219419924338, "grad_norm": 36.78353500366211, "learning_rate": 1.2857142857142859e-05, "loss": 2.0019, "step": 146 }, { "epoch": 0.3707440100882724, "grad_norm": 28.510663986206055, "learning_rate": 1.2806122448979591e-05, "loss": 1.9268, "step": 147 }, { "epoch": 0.37326607818411095, "grad_norm": 38.19087219238281, "learning_rate": 1.2755102040816327e-05, "loss": 1.9366, "step": 148 }, { "epoch": 0.37578814627994955, "grad_norm": 20.796728134155273, "learning_rate": 1.2704081632653063e-05, "loss": 1.8731, "step": 149 }, { "epoch": 0.37831021437578816, "grad_norm": 23.036758422851562, "learning_rate": 1.2653061224489798e-05, "loss": 1.8835, "step": 150 }, { "epoch": 0.38083228247162676, "grad_norm": 27.058195114135742, "learning_rate": 1.260204081632653e-05, "loss": 1.8013, "step": 151 }, { "epoch": 0.3833543505674653, "grad_norm": 25.390460968017578, "learning_rate": 1.2551020408163267e-05, "loss": 2.0623, "step": 152 }, { "epoch": 0.3858764186633039, "grad_norm": 27.993654251098633, "learning_rate": 1.25e-05, "loss": 1.6895, "step": 153 }, { "epoch": 0.3883984867591425, "grad_norm": 24.15807342529297, "learning_rate": 1.2448979591836736e-05, "loss": 1.9799, "step": 154 }, { "epoch": 0.39092055485498106, "grad_norm": 24.369815826416016, "learning_rate": 1.2397959183673472e-05, "loss": 1.9687, "step": 155 }, { "epoch": 0.39344262295081966, "grad_norm": 24.572988510131836, "learning_rate": 1.2346938775510204e-05, "loss": 1.8607, "step": 156 }, { "epoch": 0.39596469104665827, "grad_norm": 20.491390228271484, "learning_rate": 1.229591836734694e-05, "loss": 2.0677, "step": 157 }, { "epoch": 0.39848675914249687, "grad_norm": 25.128101348876953, "learning_rate": 1.2244897959183674e-05, "loss": 1.9233, "step": 158 }, { "epoch": 0.4010088272383354, "grad_norm": 18.843276977539062, "learning_rate": 1.219387755102041e-05, "loss": 1.781, "step": 159 }, { "epoch": 0.403530895334174, "grad_norm": 24.99994659423828, "learning_rate": 1.2142857142857142e-05, "loss": 1.962, "step": 160 }, { "epoch": 0.4060529634300126, "grad_norm": 20.679218292236328, "learning_rate": 1.2091836734693878e-05, "loss": 2.0055, "step": 161 }, { "epoch": 0.4085750315258512, "grad_norm": 26.00550651550293, "learning_rate": 1.2040816326530614e-05, "loss": 1.9761, "step": 162 }, { "epoch": 0.4110970996216898, "grad_norm": 33.80900192260742, "learning_rate": 1.1989795918367348e-05, "loss": 2.0502, "step": 163 }, { "epoch": 0.4136191677175284, "grad_norm": 25.639009475708008, "learning_rate": 1.1938775510204084e-05, "loss": 1.9088, "step": 164 }, { "epoch": 0.416141235813367, "grad_norm": 17.48627471923828, "learning_rate": 1.1887755102040816e-05, "loss": 1.9359, "step": 165 }, { "epoch": 0.41866330390920553, "grad_norm": 23.16074562072754, "learning_rate": 1.1836734693877552e-05, "loss": 1.8647, "step": 166 }, { "epoch": 0.42118537200504413, "grad_norm": 25.39946174621582, "learning_rate": 1.1785714285714287e-05, "loss": 1.8523, "step": 167 }, { "epoch": 0.42370744010088274, "grad_norm": 25.8050537109375, "learning_rate": 1.1734693877551021e-05, "loss": 1.8403, "step": 168 }, { "epoch": 0.4262295081967213, "grad_norm": 20.019033432006836, "learning_rate": 1.1683673469387755e-05, "loss": 2.0023, "step": 169 }, { "epoch": 0.4287515762925599, "grad_norm": 26.194847106933594, "learning_rate": 1.1632653061224491e-05, "loss": 1.9429, "step": 170 }, { "epoch": 0.4312736443883985, "grad_norm": 21.064212799072266, "learning_rate": 1.1581632653061225e-05, "loss": 1.8302, "step": 171 }, { "epoch": 0.4337957124842371, "grad_norm": 21.876129150390625, "learning_rate": 1.1530612244897961e-05, "loss": 1.8881, "step": 172 }, { "epoch": 0.43631778058007564, "grad_norm": 33.61103439331055, "learning_rate": 1.1479591836734697e-05, "loss": 2.0497, "step": 173 }, { "epoch": 0.43883984867591425, "grad_norm": 27.204744338989258, "learning_rate": 1.1428571428571429e-05, "loss": 1.8431, "step": 174 }, { "epoch": 0.44136191677175285, "grad_norm": 21.605751037597656, "learning_rate": 1.1377551020408165e-05, "loss": 1.9149, "step": 175 }, { "epoch": 0.44388398486759145, "grad_norm": 30.307472229003906, "learning_rate": 1.1326530612244899e-05, "loss": 2.0313, "step": 176 }, { "epoch": 0.44640605296343, "grad_norm": 23.69244384765625, "learning_rate": 1.1275510204081635e-05, "loss": 1.8676, "step": 177 }, { "epoch": 0.4489281210592686, "grad_norm": 25.619901657104492, "learning_rate": 1.1224489795918367e-05, "loss": 1.7905, "step": 178 }, { "epoch": 0.4514501891551072, "grad_norm": 28.0296573638916, "learning_rate": 1.1173469387755103e-05, "loss": 1.8567, "step": 179 }, { "epoch": 0.45397225725094575, "grad_norm": 36.4359130859375, "learning_rate": 1.1122448979591838e-05, "loss": 2.115, "step": 180 }, { "epoch": 0.45649432534678436, "grad_norm": 26.91726303100586, "learning_rate": 1.1071428571428572e-05, "loss": 2.0642, "step": 181 }, { "epoch": 0.45901639344262296, "grad_norm": 23.085880279541016, "learning_rate": 1.1020408163265306e-05, "loss": 1.9109, "step": 182 }, { "epoch": 0.46153846153846156, "grad_norm": 27.870641708374023, "learning_rate": 1.096938775510204e-05, "loss": 1.8999, "step": 183 }, { "epoch": 0.4640605296343001, "grad_norm": 32.0672607421875, "learning_rate": 1.0918367346938776e-05, "loss": 1.904, "step": 184 }, { "epoch": 0.4665825977301387, "grad_norm": 28.879365921020508, "learning_rate": 1.0867346938775512e-05, "loss": 1.7159, "step": 185 }, { "epoch": 0.4691046658259773, "grad_norm": 27.592771530151367, "learning_rate": 1.0816326530612246e-05, "loss": 1.8561, "step": 186 }, { "epoch": 0.47162673392181587, "grad_norm": 27.412763595581055, "learning_rate": 1.076530612244898e-05, "loss": 1.9282, "step": 187 }, { "epoch": 0.47414880201765447, "grad_norm": 30.12356185913086, "learning_rate": 1.0714285714285714e-05, "loss": 1.8726, "step": 188 }, { "epoch": 0.4766708701134931, "grad_norm": 39.9027214050293, "learning_rate": 1.066326530612245e-05, "loss": 1.7551, "step": 189 }, { "epoch": 0.4791929382093317, "grad_norm": 30.483945846557617, "learning_rate": 1.0612244897959186e-05, "loss": 1.7643, "step": 190 }, { "epoch": 0.4817150063051702, "grad_norm": 26.00415802001953, "learning_rate": 1.0561224489795918e-05, "loss": 2.0552, "step": 191 }, { "epoch": 0.4842370744010088, "grad_norm": 23.03282356262207, "learning_rate": 1.0510204081632654e-05, "loss": 2.0052, "step": 192 }, { "epoch": 0.48675914249684743, "grad_norm": 33.653221130371094, "learning_rate": 1.045918367346939e-05, "loss": 1.7367, "step": 193 }, { "epoch": 0.489281210592686, "grad_norm": 39.59351348876953, "learning_rate": 1.0408163265306123e-05, "loss": 1.9726, "step": 194 }, { "epoch": 0.4918032786885246, "grad_norm": 42.77714920043945, "learning_rate": 1.0357142857142859e-05, "loss": 2.0906, "step": 195 }, { "epoch": 0.4943253467843632, "grad_norm": 33.194549560546875, "learning_rate": 1.0306122448979591e-05, "loss": 2.0742, "step": 196 }, { "epoch": 0.4968474148802018, "grad_norm": 25.10793685913086, "learning_rate": 1.0255102040816327e-05, "loss": 1.9204, "step": 197 }, { "epoch": 0.49936948297604034, "grad_norm": 40.048404693603516, "learning_rate": 1.0204081632653063e-05, "loss": 1.7775, "step": 198 }, { "epoch": 0.501891551071879, "grad_norm": 26.085933685302734, "learning_rate": 1.0153061224489797e-05, "loss": 1.9459, "step": 199 }, { "epoch": 0.5044136191677175, "grad_norm": 18.375, "learning_rate": 1.0102040816326531e-05, "loss": 1.9536, "step": 200 }, { "epoch": 0.5044136191677175, "eval_loss": 1.8626214265823364, "eval_runtime": 6.6508, "eval_samples_per_second": 106.002, "eval_steps_per_second": 53.076, "step": 200 }, { "epoch": 0.5069356872635561, "grad_norm": 33.858341217041016, "learning_rate": 1.0051020408163265e-05, "loss": 1.8191, "step": 201 }, { "epoch": 0.5094577553593947, "grad_norm": 22.895992279052734, "learning_rate": 1e-05, "loss": 2.0596, "step": 202 }, { "epoch": 0.5119798234552333, "grad_norm": 30.55072593688965, "learning_rate": 9.948979591836737e-06, "loss": 1.8904, "step": 203 }, { "epoch": 0.5145018915510718, "grad_norm": 26.542705535888672, "learning_rate": 9.89795918367347e-06, "loss": 1.9683, "step": 204 }, { "epoch": 0.5170239596469105, "grad_norm": 39.81034851074219, "learning_rate": 9.846938775510205e-06, "loss": 1.9726, "step": 205 }, { "epoch": 0.519546027742749, "grad_norm": 22.0065860748291, "learning_rate": 9.795918367346939e-06, "loss": 2.0575, "step": 206 }, { "epoch": 0.5220680958385876, "grad_norm": 19.012041091918945, "learning_rate": 9.744897959183674e-06, "loss": 1.8431, "step": 207 }, { "epoch": 0.5245901639344263, "grad_norm": 39.699974060058594, "learning_rate": 9.693877551020408e-06, "loss": 1.8231, "step": 208 }, { "epoch": 0.5271122320302648, "grad_norm": 21.391319274902344, "learning_rate": 9.642857142857144e-06, "loss": 1.7939, "step": 209 }, { "epoch": 0.5296343001261034, "grad_norm": 25.8063907623291, "learning_rate": 9.591836734693878e-06, "loss": 2.0366, "step": 210 }, { "epoch": 0.532156368221942, "grad_norm": 20.598569869995117, "learning_rate": 9.540816326530612e-06, "loss": 1.8323, "step": 211 }, { "epoch": 0.5346784363177806, "grad_norm": 29.391401290893555, "learning_rate": 9.489795918367348e-06, "loss": 2.0052, "step": 212 }, { "epoch": 0.5372005044136192, "grad_norm": 24.39499855041504, "learning_rate": 9.438775510204082e-06, "loss": 1.8461, "step": 213 }, { "epoch": 0.5397225725094578, "grad_norm": 24.16887092590332, "learning_rate": 9.387755102040818e-06, "loss": 1.9404, "step": 214 }, { "epoch": 0.5422446406052963, "grad_norm": 24.577871322631836, "learning_rate": 9.336734693877552e-06, "loss": 1.9202, "step": 215 }, { "epoch": 0.544766708701135, "grad_norm": 26.117361068725586, "learning_rate": 9.285714285714288e-06, "loss": 1.921, "step": 216 }, { "epoch": 0.5472887767969735, "grad_norm": 22.586837768554688, "learning_rate": 9.234693877551022e-06, "loss": 1.9692, "step": 217 }, { "epoch": 0.5498108448928121, "grad_norm": 18.438722610473633, "learning_rate": 9.183673469387756e-06, "loss": 1.9496, "step": 218 }, { "epoch": 0.5523329129886507, "grad_norm": 22.94545555114746, "learning_rate": 9.13265306122449e-06, "loss": 1.991, "step": 219 }, { "epoch": 0.5548549810844893, "grad_norm": 28.664562225341797, "learning_rate": 9.081632653061225e-06, "loss": 1.8352, "step": 220 }, { "epoch": 0.5573770491803278, "grad_norm": 25.63576316833496, "learning_rate": 9.03061224489796e-06, "loss": 1.9399, "step": 221 }, { "epoch": 0.5598991172761665, "grad_norm": 21.650251388549805, "learning_rate": 8.979591836734695e-06, "loss": 1.9565, "step": 222 }, { "epoch": 0.562421185372005, "grad_norm": 29.605735778808594, "learning_rate": 8.92857142857143e-06, "loss": 1.729, "step": 223 }, { "epoch": 0.5649432534678437, "grad_norm": 23.98230743408203, "learning_rate": 8.877551020408163e-06, "loss": 1.9399, "step": 224 }, { "epoch": 0.5674653215636822, "grad_norm": 20.37510108947754, "learning_rate": 8.826530612244899e-06, "loss": 1.7322, "step": 225 }, { "epoch": 0.5699873896595208, "grad_norm": 25.876188278198242, "learning_rate": 8.775510204081633e-06, "loss": 1.9444, "step": 226 }, { "epoch": 0.5725094577553594, "grad_norm": 32.07249069213867, "learning_rate": 8.724489795918369e-06, "loss": 1.9364, "step": 227 }, { "epoch": 0.575031525851198, "grad_norm": 28.014524459838867, "learning_rate": 8.673469387755103e-06, "loss": 1.757, "step": 228 }, { "epoch": 0.5775535939470365, "grad_norm": 30.82647132873535, "learning_rate": 8.622448979591837e-06, "loss": 1.9067, "step": 229 }, { "epoch": 0.5800756620428752, "grad_norm": 30.651660919189453, "learning_rate": 8.571428571428571e-06, "loss": 1.9906, "step": 230 }, { "epoch": 0.5825977301387137, "grad_norm": 25.239904403686523, "learning_rate": 8.520408163265307e-06, "loss": 1.8914, "step": 231 }, { "epoch": 0.5851197982345523, "grad_norm": 21.33747673034668, "learning_rate": 8.469387755102042e-06, "loss": 1.9999, "step": 232 }, { "epoch": 0.587641866330391, "grad_norm": 25.255064010620117, "learning_rate": 8.418367346938776e-06, "loss": 1.8941, "step": 233 }, { "epoch": 0.5901639344262295, "grad_norm": 24.443973541259766, "learning_rate": 8.36734693877551e-06, "loss": 1.7679, "step": 234 }, { "epoch": 0.592686002522068, "grad_norm": 25.473894119262695, "learning_rate": 8.316326530612246e-06, "loss": 1.7876, "step": 235 }, { "epoch": 0.5952080706179067, "grad_norm": 26.28467559814453, "learning_rate": 8.26530612244898e-06, "loss": 1.6761, "step": 236 }, { "epoch": 0.5977301387137453, "grad_norm": 24.488052368164062, "learning_rate": 8.214285714285714e-06, "loss": 2.0022, "step": 237 }, { "epoch": 0.6002522068095839, "grad_norm": 30.074064254760742, "learning_rate": 8.16326530612245e-06, "loss": 1.7747, "step": 238 }, { "epoch": 0.6027742749054225, "grad_norm": 23.73440170288086, "learning_rate": 8.112244897959184e-06, "loss": 1.8468, "step": 239 }, { "epoch": 0.605296343001261, "grad_norm": 22.338869094848633, "learning_rate": 8.06122448979592e-06, "loss": 1.8611, "step": 240 }, { "epoch": 0.6078184110970997, "grad_norm": 24.844266891479492, "learning_rate": 8.010204081632654e-06, "loss": 1.9285, "step": 241 }, { "epoch": 0.6103404791929382, "grad_norm": 29.65668487548828, "learning_rate": 7.959183673469388e-06, "loss": 1.935, "step": 242 }, { "epoch": 0.6128625472887768, "grad_norm": 26.01723289489746, "learning_rate": 7.908163265306124e-06, "loss": 1.7587, "step": 243 }, { "epoch": 0.6153846153846154, "grad_norm": 27.04817771911621, "learning_rate": 7.857142857142858e-06, "loss": 1.8878, "step": 244 }, { "epoch": 0.617906683480454, "grad_norm": 36.23786163330078, "learning_rate": 7.806122448979593e-06, "loss": 1.992, "step": 245 }, { "epoch": 0.6204287515762925, "grad_norm": 19.283294677734375, "learning_rate": 7.755102040816327e-06, "loss": 1.8066, "step": 246 }, { "epoch": 0.6229508196721312, "grad_norm": 24.24143409729004, "learning_rate": 7.704081632653061e-06, "loss": 1.8899, "step": 247 }, { "epoch": 0.6254728877679697, "grad_norm": 25.59832763671875, "learning_rate": 7.653061224489796e-06, "loss": 1.9601, "step": 248 }, { "epoch": 0.6279949558638083, "grad_norm": 27.195640563964844, "learning_rate": 7.602040816326531e-06, "loss": 1.9561, "step": 249 }, { "epoch": 0.6305170239596469, "grad_norm": 27.854570388793945, "learning_rate": 7.551020408163265e-06, "loss": 1.8781, "step": 250 }, { "epoch": 0.6330390920554855, "grad_norm": 25.715761184692383, "learning_rate": 7.500000000000001e-06, "loss": 1.8542, "step": 251 }, { "epoch": 0.6355611601513241, "grad_norm": 22.562984466552734, "learning_rate": 7.448979591836736e-06, "loss": 1.7681, "step": 252 }, { "epoch": 0.6380832282471627, "grad_norm": 20.540348052978516, "learning_rate": 7.39795918367347e-06, "loss": 1.9617, "step": 253 }, { "epoch": 0.6406052963430012, "grad_norm": 24.610937118530273, "learning_rate": 7.346938775510205e-06, "loss": 1.9694, "step": 254 }, { "epoch": 0.6431273644388399, "grad_norm": 27.93538475036621, "learning_rate": 7.295918367346939e-06, "loss": 1.8858, "step": 255 }, { "epoch": 0.6456494325346784, "grad_norm": 31.466445922851562, "learning_rate": 7.244897959183675e-06, "loss": 1.8252, "step": 256 }, { "epoch": 0.648171500630517, "grad_norm": 26.276226043701172, "learning_rate": 7.193877551020409e-06, "loss": 1.8865, "step": 257 }, { "epoch": 0.6506935687263556, "grad_norm": 22.52095603942871, "learning_rate": 7.1428571428571436e-06, "loss": 1.7649, "step": 258 }, { "epoch": 0.6532156368221942, "grad_norm": 20.15144157409668, "learning_rate": 7.091836734693878e-06, "loss": 2.0158, "step": 259 }, { "epoch": 0.6557377049180327, "grad_norm": 26.405349731445312, "learning_rate": 7.0408163265306125e-06, "loss": 1.8932, "step": 260 }, { "epoch": 0.6582597730138714, "grad_norm": 32.94384765625, "learning_rate": 6.989795918367348e-06, "loss": 1.7795, "step": 261 }, { "epoch": 0.6607818411097099, "grad_norm": 23.109092712402344, "learning_rate": 6.938775510204082e-06, "loss": 1.8383, "step": 262 }, { "epoch": 0.6633039092055486, "grad_norm": 21.75737190246582, "learning_rate": 6.887755102040817e-06, "loss": 1.8727, "step": 263 }, { "epoch": 0.6658259773013872, "grad_norm": 22.96916389465332, "learning_rate": 6.836734693877551e-06, "loss": 1.8544, "step": 264 }, { "epoch": 0.6683480453972257, "grad_norm": 25.62445640563965, "learning_rate": 6.785714285714287e-06, "loss": 1.7503, "step": 265 }, { "epoch": 0.6708701134930644, "grad_norm": 25.430530548095703, "learning_rate": 6.734693877551021e-06, "loss": 1.7938, "step": 266 }, { "epoch": 0.6733921815889029, "grad_norm": 26.462881088256836, "learning_rate": 6.683673469387756e-06, "loss": 1.8284, "step": 267 }, { "epoch": 0.6759142496847415, "grad_norm": 31.45004653930664, "learning_rate": 6.63265306122449e-06, "loss": 2.0328, "step": 268 }, { "epoch": 0.6784363177805801, "grad_norm": 30.525737762451172, "learning_rate": 6.581632653061225e-06, "loss": 1.8192, "step": 269 }, { "epoch": 0.6809583858764187, "grad_norm": 25.705707550048828, "learning_rate": 6.530612244897959e-06, "loss": 1.8533, "step": 270 }, { "epoch": 0.6834804539722572, "grad_norm": 39.90187454223633, "learning_rate": 6.4795918367346946e-06, "loss": 1.9483, "step": 271 }, { "epoch": 0.6860025220680959, "grad_norm": 28.0180721282959, "learning_rate": 6.4285714285714295e-06, "loss": 1.8132, "step": 272 }, { "epoch": 0.6885245901639344, "grad_norm": 34.821372985839844, "learning_rate": 6.3775510204081635e-06, "loss": 1.9599, "step": 273 }, { "epoch": 0.691046658259773, "grad_norm": 24.018394470214844, "learning_rate": 6.326530612244899e-06, "loss": 1.9248, "step": 274 }, { "epoch": 0.6935687263556116, "grad_norm": 24.074344635009766, "learning_rate": 6.275510204081633e-06, "loss": 2.0148, "step": 275 }, { "epoch": 0.6960907944514502, "grad_norm": 31.1939754486084, "learning_rate": 6.224489795918368e-06, "loss": 1.8959, "step": 276 }, { "epoch": 0.6986128625472888, "grad_norm": 25.481502532958984, "learning_rate": 6.173469387755102e-06, "loss": 1.9832, "step": 277 }, { "epoch": 0.7011349306431274, "grad_norm": 29.6664981842041, "learning_rate": 6.122448979591837e-06, "loss": 1.9222, "step": 278 }, { "epoch": 0.7036569987389659, "grad_norm": 26.30698585510254, "learning_rate": 6.071428571428571e-06, "loss": 1.9897, "step": 279 }, { "epoch": 0.7061790668348046, "grad_norm": 31.827558517456055, "learning_rate": 6.020408163265307e-06, "loss": 1.8615, "step": 280 }, { "epoch": 0.7087011349306431, "grad_norm": 24.80223846435547, "learning_rate": 5.969387755102042e-06, "loss": 1.9579, "step": 281 }, { "epoch": 0.7112232030264817, "grad_norm": 36.134700775146484, "learning_rate": 5.918367346938776e-06, "loss": 1.723, "step": 282 }, { "epoch": 0.7137452711223203, "grad_norm": 30.388233184814453, "learning_rate": 5.867346938775511e-06, "loss": 1.9736, "step": 283 }, { "epoch": 0.7162673392181589, "grad_norm": 32.231563568115234, "learning_rate": 5.816326530612246e-06, "loss": 1.9228, "step": 284 }, { "epoch": 0.7187894073139974, "grad_norm": 38.05869674682617, "learning_rate": 5.7653061224489805e-06, "loss": 1.9159, "step": 285 }, { "epoch": 0.7213114754098361, "grad_norm": 27.256147384643555, "learning_rate": 5.7142857142857145e-06, "loss": 1.8072, "step": 286 }, { "epoch": 0.7238335435056746, "grad_norm": 25.67181396484375, "learning_rate": 5.663265306122449e-06, "loss": 1.8633, "step": 287 }, { "epoch": 0.7263556116015133, "grad_norm": 31.8681697845459, "learning_rate": 5.6122448979591834e-06, "loss": 1.9822, "step": 288 }, { "epoch": 0.7288776796973518, "grad_norm": 32.85325241088867, "learning_rate": 5.561224489795919e-06, "loss": 1.8166, "step": 289 }, { "epoch": 0.7313997477931904, "grad_norm": 35.64312744140625, "learning_rate": 5.510204081632653e-06, "loss": 1.7166, "step": 290 }, { "epoch": 0.733921815889029, "grad_norm": 24.276235580444336, "learning_rate": 5.459183673469388e-06, "loss": 1.7593, "step": 291 }, { "epoch": 0.7364438839848676, "grad_norm": 29.371950149536133, "learning_rate": 5.408163265306123e-06, "loss": 1.8124, "step": 292 }, { "epoch": 0.7389659520807061, "grad_norm": 23.76220703125, "learning_rate": 5.357142857142857e-06, "loss": 1.7775, "step": 293 }, { "epoch": 0.7414880201765448, "grad_norm": 37.103050231933594, "learning_rate": 5.306122448979593e-06, "loss": 1.9253, "step": 294 }, { "epoch": 0.7440100882723834, "grad_norm": 20.0811767578125, "learning_rate": 5.255102040816327e-06, "loss": 1.8711, "step": 295 }, { "epoch": 0.7465321563682219, "grad_norm": 35.33123016357422, "learning_rate": 5.204081632653062e-06, "loss": 1.8764, "step": 296 }, { "epoch": 0.7490542244640606, "grad_norm": 31.880672454833984, "learning_rate": 5.153061224489796e-06, "loss": 1.8929, "step": 297 }, { "epoch": 0.7515762925598991, "grad_norm": 21.682334899902344, "learning_rate": 5.1020408163265315e-06, "loss": 2.0377, "step": 298 }, { "epoch": 0.7540983606557377, "grad_norm": 34.68608474731445, "learning_rate": 5.0510204081632655e-06, "loss": 1.9341, "step": 299 }, { "epoch": 0.7566204287515763, "grad_norm": 25.59632110595703, "learning_rate": 5e-06, "loss": 1.8264, "step": 300 }, { "epoch": 0.7566204287515763, "eval_loss": 1.8502724170684814, "eval_runtime": 6.6208, "eval_samples_per_second": 106.483, "eval_steps_per_second": 53.317, "step": 300 }, { "epoch": 0.7591424968474149, "grad_norm": 33.780616760253906, "learning_rate": 4.948979591836735e-06, "loss": 1.8315, "step": 301 }, { "epoch": 0.7616645649432535, "grad_norm": 23.005069732666016, "learning_rate": 4.897959183673469e-06, "loss": 1.8434, "step": 302 }, { "epoch": 0.7641866330390921, "grad_norm": 27.338787078857422, "learning_rate": 4.846938775510204e-06, "loss": 1.9102, "step": 303 }, { "epoch": 0.7667087011349306, "grad_norm": 26.87493133544922, "learning_rate": 4.795918367346939e-06, "loss": 1.8408, "step": 304 }, { "epoch": 0.7692307692307693, "grad_norm": 33.59117126464844, "learning_rate": 4.744897959183674e-06, "loss": 1.8633, "step": 305 }, { "epoch": 0.7717528373266078, "grad_norm": 38.98092269897461, "learning_rate": 4.693877551020409e-06, "loss": 1.8187, "step": 306 }, { "epoch": 0.7742749054224464, "grad_norm": 28.7203369140625, "learning_rate": 4.642857142857144e-06, "loss": 1.8425, "step": 307 }, { "epoch": 0.776796973518285, "grad_norm": 30.91414451599121, "learning_rate": 4.591836734693878e-06, "loss": 1.8526, "step": 308 }, { "epoch": 0.7793190416141236, "grad_norm": 29.04154396057129, "learning_rate": 4.540816326530613e-06, "loss": 1.8913, "step": 309 }, { "epoch": 0.7818411097099621, "grad_norm": 29.638099670410156, "learning_rate": 4.489795918367348e-06, "loss": 1.8736, "step": 310 } ], "logging_steps": 1, "max_steps": 397, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4524488042102784.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }