diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,2335 +1,3526 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.9981785063752278, + "epoch": 1.9987960510474356, "eval_steps": 100, - "global_step": 1372, + "global_step": 2076, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.007285974499089253, - "grad_norm": 1.0096142292022705, - "learning_rate": 6.532799061198535e-06, - "loss": 1.4658, - "mean_token_accuracy": 0.643508793356131, + "epoch": 0.004815795810257645, + "grad_norm": 1.217836856842041, + "learning_rate": 6.030637677527024e-06, + "loss": 1.5557, + "mean_token_accuracy": 0.6304981142282486, "step": 5 }, { - "epoch": 0.014571948998178506, - "grad_norm": 0.7324991226196289, - "learning_rate": 9.346322475460614e-06, - "loss": 1.4392, - "mean_token_accuracy": 0.6500748045920858, + "epoch": 0.00963159162051529, + "grad_norm": 0.9109130501747131, + "learning_rate": 8.627891955472085e-06, + "loss": 1.6, + "mean_token_accuracy": 0.6200483560562133, "step": 10 }, { - "epoch": 0.02185792349726776, - "grad_norm": 0.4901340901851654, - "learning_rate": 1.0992128167704883e-05, - "loss": 1.4079, - "mean_token_accuracy": 0.6516214780158487, + "epoch": 0.014447387430772935, + "grad_norm": 0.7003273367881775, + "learning_rate": 1.0147188312907547e-05, + "loss": 1.5115, + "mean_token_accuracy": 0.6351560086011887, "step": 15 }, { - "epoch": 0.029143897996357013, - "grad_norm": 0.5083264708518982, - "learning_rate": 1.215984588972269e-05, - "loss": 1.3607, - "mean_token_accuracy": 0.658668172936004, + "epoch": 0.01926318324103058, + "grad_norm": 0.5947560667991638, + "learning_rate": 1.1225146233417143e-05, + "loss": 1.4895, + "mean_token_accuracy": 0.6334351062774658, "step": 20 }, { - "epoch": 0.03642987249544627, - "grad_norm": 0.40738093852996826, - "learning_rate": 1.306559812239707e-05, - "loss": 1.2944, - "mean_token_accuracy": 0.6690186858817784, + "epoch": 0.024078979051288224, + "grad_norm": 0.6775269508361816, + "learning_rate": 1.2061275355054047e-05, + "loss": 1.3818, + "mean_token_accuracy": 0.6498163402080536, "step": 25 }, { - "epoch": 0.04371584699453552, - "grad_norm": 0.3954770267009735, - "learning_rate": 1.3805651581966963e-05, - "loss": 1.2738, - "mean_token_accuracy": 0.670574316072301, + "epoch": 0.02889477486154587, + "grad_norm": 0.5584117770195007, + "learning_rate": 1.2744442590852606e-05, + "loss": 1.3492, + "mean_token_accuracy": 0.6553073137998581, "step": 30 }, { - "epoch": 0.051001821493624776, - "grad_norm": 0.3460943102836609, - "learning_rate": 1.4431357866551495e-05, - "loss": 1.2281, - "mean_token_accuracy": 0.6792226428920372, + "epoch": 0.03371057067180352, + "grad_norm": 0.5681416988372803, + "learning_rate": 1.3322052258551255e-05, + "loss": 1.326, + "mean_token_accuracy": 0.658917811512947, "step": 35 }, { - "epoch": 0.058287795992714025, - "grad_norm": 0.3018144369125366, - "learning_rate": 1.4973369303984771e-05, - "loss": 1.2132, - "mean_token_accuracy": 0.6817354665363946, + "epoch": 0.03852636648206116, + "grad_norm": 0.5098866820335388, + "learning_rate": 1.3822400511362205e-05, + "loss": 1.2541, + "mean_token_accuracy": 0.6750771552324295, "step": 40 }, { - "epoch": 0.06557377049180328, - "grad_norm": 0.26513800024986267, - "learning_rate": 1.545145727421123e-05, - "loss": 1.1888, - "mean_token_accuracy": 0.6858985710796286, + "epoch": 0.043342162292318805, + "grad_norm": 0.5420442819595337, + "learning_rate": 1.4263738948288068e-05, + "loss": 1.2549, + "mean_token_accuracy": 0.6709121555089951, "step": 45 }, { - "epoch": 0.07285974499089254, - "grad_norm": 0.24335457384586334, - "learning_rate": 1.5879121536659146e-05, - "loss": 1.1716, - "mean_token_accuracy": 0.6894952979970689, + "epoch": 0.04815795810257645, + "grad_norm": 0.48938167095184326, + "learning_rate": 1.4658529632999107e-05, + "loss": 1.2226, + "mean_token_accuracy": 0.6783863663673401, "step": 50 }, { - "epoch": 0.08014571948998178, - "grad_norm": 0.24771854281425476, - "learning_rate": 1.626599092027313e-05, - "loss": 1.1691, - "mean_token_accuracy": 0.688681607230093, + "epoch": 0.0529737539128341, + "grad_norm": 0.4799329340457916, + "learning_rate": 1.5015661248291147e-05, + "loss": 1.2588, + "mean_token_accuracy": 0.6713581442832947, "step": 55 }, { - "epoch": 0.08743169398907104, - "grad_norm": 0.25512585043907166, - "learning_rate": 1.661917499622904e-05, - "loss": 1.1528, - "mean_token_accuracy": 0.6918442926674404, + "epoch": 0.05778954972309174, + "grad_norm": 0.4751971662044525, + "learning_rate": 1.5341696868797664e-05, + "loss": 1.259, + "mean_token_accuracy": 0.667513769865036, "step": 60 }, { - "epoch": 0.0947176684881603, - "grad_norm": 0.24238301813602448, - "learning_rate": 1.694407285125386e-05, - "loss": 1.1547, - "mean_token_accuracy": 0.6903882936378324, + "epoch": 0.06260534553334939, + "grad_norm": 0.4749854803085327, + "learning_rate": 1.5641620565746784e-05, + "loss": 1.2196, + "mean_token_accuracy": 0.675195860862732, "step": 65 }, { - "epoch": 0.10200364298724955, - "grad_norm": 0.25091490149497986, - "learning_rate": 1.7244881280813573e-05, - "loss": 1.1366, - "mean_token_accuracy": 0.6941881411822179, + "epoch": 0.06742114134360704, + "grad_norm": 0.4443358778953552, + "learning_rate": 1.5919306536496318e-05, + "loss": 1.2104, + "mean_token_accuracy": 0.6783170014619827, "step": 70 }, { - "epoch": 0.1092896174863388, - "grad_norm": 0.235865980386734, - "learning_rate": 1.7524927228903416e-05, - "loss": 1.1182, - "mean_token_accuracy": 0.6983726184660476, + "epoch": 0.07223693715386467, + "grad_norm": 0.4589794874191284, + "learning_rate": 1.617782599043457e-05, + "loss": 1.1866, + "mean_token_accuracy": 0.6849248081445694, "step": 75 }, { - "epoch": 0.11657559198542805, - "grad_norm": 0.2224230170249939, - "learning_rate": 1.7786892718246845e-05, - "loss": 1.1276, - "mean_token_accuracy": 0.6956857596482655, + "epoch": 0.07705273296412232, + "grad_norm": 0.4352623522281647, + "learning_rate": 1.641965478930726e-05, + "loss": 1.1985, + "mean_token_accuracy": 0.6817637622356415, "step": 80 }, { - "epoch": 0.12386156648451731, - "grad_norm": 0.24439820647239685, - "learning_rate": 1.8032971469982566e-05, - "loss": 1.1179, - "mean_token_accuracy": 0.6984550561797753, + "epoch": 0.08186852877437997, + "grad_norm": 0.5222891569137573, + "learning_rate": 1.6646818027905938e-05, + "loss": 1.1538, + "mean_token_accuracy": 0.6914296627044678, "step": 85 }, { - "epoch": 0.13114754098360656, - "grad_norm": 0.22702665627002716, - "learning_rate": 1.826498068847331e-05, - "loss": 1.1293, - "mean_token_accuracy": 0.6938858695652173, + "epoch": 0.08668432458463761, + "grad_norm": 0.47876179218292236, + "learning_rate": 1.686099322623313e-05, + "loss": 1.1641, + "mean_token_accuracy": 0.6872312694787979, "step": 90 }, { - "epoch": 0.1384335154826958, - "grad_norm": 0.24332156777381897, - "learning_rate": 1.8484442582360153e-05, - "loss": 1.117, - "mean_token_accuracy": 0.6968719467513433, + "epoch": 0.09150012039489526, + "grad_norm": 0.45317333936691284, + "learning_rate": 1.7063585584218894e-05, + "loss": 1.1775, + "mean_token_accuracy": 0.6858260571956635, "step": 95 }, { - "epoch": 0.14571948998178508, - "grad_norm": 0.240131676197052, - "learning_rate": 1.8692644950921228e-05, - "loss": 1.1094, - "mean_token_accuracy": 0.6984871152906692, + "epoch": 0.0963159162051529, + "grad_norm": 0.4628841280937195, + "learning_rate": 1.725578391094417e-05, + "loss": 1.1936, + "mean_token_accuracy": 0.6802503883838653, "step": 100 }, { - "epoch": 0.14571948998178508, - "eval_loss": 1.1161140203475952, - "eval_mean_token_accuracy": 0.6946953157815736, - "eval_runtime": 45.3066, - "eval_samples_per_second": 5.783, - "eval_steps_per_second": 0.375, + "epoch": 0.0963159162051529, + "eval_loss": 1.170312523841858, + "eval_mean_token_accuracy": 0.6898542321645297, + "eval_runtime": 7.3819, + "eval_samples_per_second": 13.547, + "eval_steps_per_second": 1.761, "step": 100 }, { - "epoch": 0.15300546448087432, - "grad_norm": 0.24882008135318756, - "learning_rate": 1.8890686973057843e-05, - "loss": 1.1004, - "mean_token_accuracy": 0.700412188568637, + "epoch": 0.10113171201541055, + "grad_norm": 0.4867287874221802, + "learning_rate": 1.7438602893931777e-05, + "loss": 1.1756, + "mean_token_accuracy": 0.6832725733518601, "step": 105 }, { - "epoch": 0.16029143897996356, - "grad_norm": 0.2562393546104431, - "learning_rate": 1.9079514334535213e-05, - "loss": 1.1024, - "mean_token_accuracy": 0.6994366756228629, + "epoch": 0.1059475078256682, + "grad_norm": 0.47164857387542725, + "learning_rate": 1.7612915526236206e-05, + "loss": 1.1882, + "mean_token_accuracy": 0.6813990294933319, "step": 110 }, { - "epoch": 0.16757741347905283, - "grad_norm": 0.256538987159729, - "learning_rate": 1.9259946540430108e-05, - "loss": 1.0851, - "mean_token_accuracy": 0.7041478382999509, + "epoch": 0.11076330363592583, + "grad_norm": 0.47079476714134216, + "learning_rate": 1.7779478319445623e-05, + "loss": 1.1365, + "mean_token_accuracy": 0.6930296301841736, "step": 115 }, { - "epoch": 0.17486338797814208, - "grad_norm": 0.2501155734062195, - "learning_rate": 1.943269841049112e-05, - "loss": 1.1104, - "mean_token_accuracy": 0.697314667806546, + "epoch": 0.11557909944618348, + "grad_norm": 0.4742228388786316, + "learning_rate": 1.7938951146742724e-05, + "loss": 1.1367, + "mean_token_accuracy": 0.6922942072153091, "step": 120 }, { - "epoch": 0.18214936247723132, - "grad_norm": 0.23555661737918854, - "learning_rate": 1.9598397183595605e-05, - "loss": 1.0897, - "mean_token_accuracy": 0.7015464704445529, + "epoch": 0.12039489525644112, + "grad_norm": 0.48846733570098877, + "learning_rate": 1.8091913032581076e-05, + "loss": 1.153, + "mean_token_accuracy": 0.6915476590394973, "step": 125 }, { - "epoch": 0.1894353369763206, - "grad_norm": 0.27323609590530396, - "learning_rate": 1.9757596265515943e-05, - "loss": 1.0836, - "mean_token_accuracy": 0.7026937829876132, + "epoch": 0.12521069106669877, + "grad_norm": 0.4500122666358948, + "learning_rate": 1.8238874843691844e-05, + "loss": 1.1292, + "mean_token_accuracy": 0.6947832733392716, "step": 130 }, { - "epoch": 0.19672131147540983, - "grad_norm": 0.28936222195625305, - "learning_rate": 1.9910786380717584e-05, - "loss": 1.08, - "mean_token_accuracy": 0.7054042501221296, + "epoch": 0.13002648687695642, + "grad_norm": 0.47640055418014526, + "learning_rate": 1.8380289583668592e-05, + "loss": 1.0912, + "mean_token_accuracy": 0.7037966132164002, "step": 135 }, { - "epoch": 0.2040072859744991, - "grad_norm": 0.2519981265068054, - "learning_rate": 2e-05, - "loss": 1.0728, - "mean_token_accuracy": 0.705013434294089, + "epoch": 0.13484228268721407, + "grad_norm": 0.48464760184288025, + "learning_rate": 1.8516560814441375e-05, + "loss": 1.1089, + "mean_token_accuracy": 0.6969835489988327, "step": 140 }, { - "epoch": 0.21129326047358835, - "grad_norm": 0.24323837459087372, - "learning_rate": 2e-05, - "loss": 1.0795, - "mean_token_accuracy": 0.7039799096238396, + "epoch": 0.1396580784974717, + "grad_norm": 0.4772765338420868, + "learning_rate": 1.8648049599297907e-05, + "loss": 1.1564, + "mean_token_accuracy": 0.6854357540607452, "step": 145 }, { - "epoch": 0.2185792349726776, - "grad_norm": 0.2754611074924469, - "learning_rate": 2e-05, - "loss": 1.0832, - "mean_token_accuracy": 0.7035051294577429, + "epoch": 0.14447387430772934, + "grad_norm": 0.47613686323165894, + "learning_rate": 1.877508026837963e-05, + "loss": 1.1364, + "mean_token_accuracy": 0.6942001134157181, "step": 150 }, { - "epoch": 0.22586520947176686, - "grad_norm": 0.24538645148277283, - "learning_rate": 2e-05, - "loss": 1.0684, - "mean_token_accuracy": 0.7052928065461651, + "epoch": 0.149289670117987, + "grad_norm": 0.46470990777015686, + "learning_rate": 1.889794523845897e-05, + "loss": 1.1249, + "mean_token_accuracy": 0.6940385103225708, "step": 155 }, { - "epoch": 0.2331511839708561, - "grad_norm": 0.2513323724269867, - "learning_rate": 2e-05, - "loss": 1.0832, - "mean_token_accuracy": 0.7013220566682951, + "epoch": 0.15410546592824464, + "grad_norm": 0.46193844079971313, + "learning_rate": 1.901690906725232e-05, + "loss": 1.0972, + "mean_token_accuracy": 0.700231596827507, "step": 160 }, { - "epoch": 0.24043715846994534, - "grad_norm": 0.24028229713439941, - "learning_rate": 2e-05, - "loss": 1.0543, - "mean_token_accuracy": 0.7086483268197361, + "epoch": 0.1589212617385023, + "grad_norm": 0.46510210633277893, + "learning_rate": 1.913221188367167e-05, + "loss": 1.1313, + "mean_token_accuracy": 0.6941230714321136, "step": 165 }, { - "epoch": 0.24772313296903462, - "grad_norm": 0.2462315559387207, - "learning_rate": 2e-05, - "loss": 1.0591, - "mean_token_accuracy": 0.7064622007816315, + "epoch": 0.16373705754875995, + "grad_norm": 0.4849300682544708, + "learning_rate": 1.9244072305850998e-05, + "loss": 1.1193, + "mean_token_accuracy": 0.6944386541843415, "step": 170 }, { - "epoch": 0.2550091074681239, - "grad_norm": 0.22473278641700745, - "learning_rate": 2e-05, - "loss": 1.0463, - "mean_token_accuracy": 0.7112435881778212, + "epoch": 0.16855285335901757, + "grad_norm": 0.4786815643310547, + "learning_rate": 1.935268993607828e-05, + "loss": 1.1276, + "mean_token_accuracy": 0.694945952296257, "step": 175 }, { - "epoch": 0.26229508196721313, - "grad_norm": 0.23117414116859436, - "learning_rate": 2e-05, - "loss": 1.0669, - "mean_token_accuracy": 0.7054256228627258, + "epoch": 0.17336864916927522, + "grad_norm": 0.48029711842536926, + "learning_rate": 1.945824750417819e-05, + "loss": 1.1089, + "mean_token_accuracy": 0.6993857860565186, "step": 180 }, { - "epoch": 0.26958105646630237, - "grad_norm": 0.23212246596813202, - "learning_rate": 2e-05, - "loss": 1.0721, - "mean_token_accuracy": 0.703644052271617, + "epoch": 0.17818444497953287, + "grad_norm": 0.4307546317577362, + "learning_rate": 1.9560912717162102e-05, + "loss": 1.0942, + "mean_token_accuracy": 0.7034429222345352, "step": 185 }, { - "epoch": 0.2768670309653916, - "grad_norm": 0.22645309567451477, - "learning_rate": 2e-05, - "loss": 1.0542, - "mean_token_accuracy": 0.7082223986321445, + "epoch": 0.18300024078979052, + "grad_norm": 0.47431886196136475, + "learning_rate": 1.9660839862163957e-05, + "loss": 1.0961, + "mean_token_accuracy": 0.6994453608989716, "step": 190 }, { - "epoch": 0.28415300546448086, - "grad_norm": 0.24246527254581451, - "learning_rate": 2e-05, - "loss": 1.0473, - "mean_token_accuracy": 0.7109749022960431, + "epoch": 0.18781603660004817, + "grad_norm": 0.504095196723938, + "learning_rate": 1.975817120112731e-05, + "loss": 1.0959, + "mean_token_accuracy": 0.7012527734041214, "step": 195 }, { - "epoch": 0.29143897996357016, - "grad_norm": 0.26072046160697937, - "learning_rate": 2e-05, - "loss": 1.0423, - "mean_token_accuracy": 0.7102268563751831, + "epoch": 0.1926318324103058, + "grad_norm": 0.4743957221508026, + "learning_rate": 1.9853038188889227e-05, + "loss": 1.1025, + "mean_token_accuracy": 0.6987982332706452, "step": 200 }, { - "epoch": 0.29143897996357016, - "eval_loss": 1.0579973459243774, - "eval_mean_token_accuracy": 0.7055148573028509, - "eval_runtime": 40.5396, - "eval_samples_per_second": 6.463, - "eval_steps_per_second": 0.419, + "epoch": 0.1926318324103058, + "eval_loss": 1.1085937023162842, + "eval_mean_token_accuracy": 0.7000226653539218, + "eval_runtime": 7.4464, + "eval_samples_per_second": 13.429, + "eval_steps_per_second": 1.746, "step": 200 }, { - "epoch": 0.2987249544626594, - "grad_norm": 0.2392524629831314, - "learning_rate": 2e-05, - "loss": 1.056, - "mean_token_accuracy": 0.7082315583781142, + "epoch": 0.19744762822056344, + "grad_norm": 0.4345133900642395, + "learning_rate": 1.9945562540834474e-05, + "loss": 1.0657, + "mean_token_accuracy": 0.7082561492919922, "step": 205 }, { - "epoch": 0.30601092896174864, - "grad_norm": 0.24252192676067352, + "epoch": 0.2022634240308211, + "grad_norm": 0.4903443157672882, "learning_rate": 2e-05, - "loss": 1.055, - "mean_token_accuracy": 0.7082407181240838, + "loss": 1.1104, + "mean_token_accuracy": 0.6973560303449631, "step": 210 }, { - "epoch": 0.3132969034608379, - "grad_norm": 0.2512829303741455, + "epoch": 0.20707921984107874, + "grad_norm": 0.4759863018989563, "learning_rate": 2e-05, - "loss": 1.0353, - "mean_token_accuracy": 0.7127122007816316, + "loss": 1.0581, + "mean_token_accuracy": 0.7085967868566513, "step": 215 }, { - "epoch": 0.3205828779599271, - "grad_norm": 0.2263517677783966, + "epoch": 0.2118950156513364, + "grad_norm": 0.48684269189834595, "learning_rate": 2e-05, - "loss": 1.0528, - "mean_token_accuracy": 0.7089963971665854, + "loss": 1.1051, + "mean_token_accuracy": 0.698865732550621, "step": 220 }, { - "epoch": 0.32786885245901637, - "grad_norm": 0.25571829080581665, + "epoch": 0.21671081146159402, + "grad_norm": 0.4599597156047821, "learning_rate": 2e-05, - "loss": 1.0373, - "mean_token_accuracy": 0.7127348798847855, + "loss": 1.1377, + "mean_token_accuracy": 0.6892463296651841, "step": 225 }, { - "epoch": 0.33515482695810567, - "grad_norm": 0.2718060314655304, + "epoch": 0.22152660727185167, + "grad_norm": 0.4799645245075226, "learning_rate": 2e-05, - "loss": 1.0231, - "mean_token_accuracy": 0.7161425256472889, + "loss": 1.0888, + "mean_token_accuracy": 0.7011725068092346, "step": 230 }, { - "epoch": 0.3424408014571949, - "grad_norm": 0.24943749606609344, + "epoch": 0.22634240308210932, + "grad_norm": 0.4817507565021515, "learning_rate": 2e-05, - "loss": 1.0364, - "mean_token_accuracy": 0.7126031834774758, + "loss": 1.0796, + "mean_token_accuracy": 0.7057040512561799, "step": 235 }, { - "epoch": 0.34972677595628415, - "grad_norm": 0.25987544655799866, + "epoch": 0.23115819889236697, + "grad_norm": 0.5366252064704895, "learning_rate": 2e-05, - "loss": 1.0302, - "mean_token_accuracy": 0.7140556301905225, + "loss": 1.0563, + "mean_token_accuracy": 0.7098583936691284, "step": 240 }, { - "epoch": 0.3570127504553734, - "grad_norm": 0.2467297464609146, + "epoch": 0.23597399470262462, + "grad_norm": 0.5398194193840027, "learning_rate": 2e-05, - "loss": 1.0313, - "mean_token_accuracy": 0.7123977161700048, + "loss": 1.0854, + "mean_token_accuracy": 0.7036941111087799, "step": 245 }, { - "epoch": 0.36429872495446264, - "grad_norm": 0.25790703296661377, + "epoch": 0.24078979051288224, + "grad_norm": 0.5043858289718628, "learning_rate": 2e-05, - "loss": 1.0158, - "mean_token_accuracy": 0.7178706643869076, + "loss": 1.098, + "mean_token_accuracy": 0.7009976714849472, "step": 250 }, { - "epoch": 0.37158469945355194, - "grad_norm": 0.2591741383075714, + "epoch": 0.2456055863231399, + "grad_norm": 0.48920267820358276, "learning_rate": 2e-05, - "loss": 1.0258, - "mean_token_accuracy": 0.7137533585735221, + "loss": 1.0846, + "mean_token_accuracy": 0.7016850769519806, "step": 255 }, { - "epoch": 0.3788706739526412, - "grad_norm": 0.2623481750488281, + "epoch": 0.25042138213339754, + "grad_norm": 0.5131216645240784, "learning_rate": 2e-05, - "loss": 1.0525, - "mean_token_accuracy": 0.7065553248656569, + "loss": 1.0724, + "mean_token_accuracy": 0.7033721059560776, "step": 260 }, { - "epoch": 0.3861566484517304, - "grad_norm": 0.22918273508548737, + "epoch": 0.2552371779436552, + "grad_norm": 0.47745051980018616, "learning_rate": 2e-05, - "loss": 1.027, - "mean_token_accuracy": 0.7136006961406938, + "loss": 1.0779, + "mean_token_accuracy": 0.7036916673183441, "step": 265 }, { - "epoch": 0.39344262295081966, - "grad_norm": 0.24080874025821686, + "epoch": 0.26005297375391284, + "grad_norm": 0.47006651759147644, "learning_rate": 2e-05, - "loss": 1.0324, - "mean_token_accuracy": 0.7127405883580941, + "loss": 1.0782, + "mean_token_accuracy": 0.7005774110555649, "step": 270 }, { - "epoch": 0.4007285974499089, - "grad_norm": 0.2676204741001129, + "epoch": 0.2648687695641705, + "grad_norm": 0.463060200214386, "learning_rate": 2e-05, - "loss": 1.0265, - "mean_token_accuracy": 0.7136129091353199, + "loss": 1.0874, + "mean_token_accuracy": 0.7018742352724076, "step": 275 }, { - "epoch": 0.4080145719489982, - "grad_norm": 0.24465428292751312, + "epoch": 0.26968456537442814, + "grad_norm": 0.4785911440849304, "learning_rate": 2e-05, - "loss": 1.0349, - "mean_token_accuracy": 0.7100161822178797, + "loss": 1.0793, + "mean_token_accuracy": 0.7015463501214981, "step": 280 }, { - "epoch": 0.41530054644808745, - "grad_norm": 0.24205349385738373, + "epoch": 0.2745003611846858, + "grad_norm": 0.47646117210388184, "learning_rate": 2e-05, - "loss": 1.0254, - "mean_token_accuracy": 0.7133365901319003, + "loss": 1.0935, + "mean_token_accuracy": 0.7005387544631958, "step": 285 }, { - "epoch": 0.4225865209471767, - "grad_norm": 0.2451147884130478, + "epoch": 0.2793161569949434, + "grad_norm": 0.5132871866226196, "learning_rate": 2e-05, - "loss": 1.0189, - "mean_token_accuracy": 0.715379213483146, + "loss": 1.0743, + "mean_token_accuracy": 0.7038295924663543, "step": 290 }, { - "epoch": 0.42987249544626593, - "grad_norm": 0.24288122355937958, + "epoch": 0.28413195280520104, + "grad_norm": 0.46936750411987305, "learning_rate": 2e-05, - "loss": 1.0282, - "mean_token_accuracy": 0.7133766909700059, + "loss": 1.0938, + "mean_token_accuracy": 0.698417204618454, "step": 295 }, { - "epoch": 0.4371584699453552, - "grad_norm": 0.23199407756328583, + "epoch": 0.2889477486154587, + "grad_norm": 0.4777081608772278, "learning_rate": 2e-05, - "loss": 1.012, - "mean_token_accuracy": 0.7167684416218857, + "loss": 1.0537, + "mean_token_accuracy": 0.7099823027849197, "step": 300 }, { - "epoch": 0.4371584699453552, - "eval_loss": 1.0286855697631836, - "eval_mean_token_accuracy": 0.7104119275712862, - "eval_runtime": 37.7776, - "eval_samples_per_second": 6.935, - "eval_steps_per_second": 0.45, + "epoch": 0.2889477486154587, + "eval_loss": 1.0759375095367432, + "eval_mean_token_accuracy": 0.7064682612052331, + "eval_runtime": 7.4519, + "eval_samples_per_second": 13.419, + "eval_steps_per_second": 1.745, "step": 300 }, { - "epoch": 0.4444444444444444, - "grad_norm": 0.24106916785240173, + "epoch": 0.29376354442571634, + "grad_norm": 0.46188250184059143, "learning_rate": 2e-05, - "loss": 1.0193, - "mean_token_accuracy": 0.714025097703957, + "loss": 1.059, + "mean_token_accuracy": 0.7085582405328751, "step": 305 }, { - "epoch": 0.4517304189435337, - "grad_norm": 0.2508867084980011, + "epoch": 0.298579340235974, + "grad_norm": 0.49645256996154785, "learning_rate": 2e-05, - "loss": 1.0078, - "mean_token_accuracy": 0.7173775647288717, + "loss": 1.0982, + "mean_token_accuracy": 0.6987593978643417, "step": 310 }, { - "epoch": 0.45901639344262296, - "grad_norm": 0.2557254731655121, + "epoch": 0.30339513604623164, + "grad_norm": 0.4283128082752228, "learning_rate": 2e-05, - "loss": 1.0002, - "mean_token_accuracy": 0.7212154982901808, + "loss": 1.1032, + "mean_token_accuracy": 0.6966762721538544, "step": 315 }, { - "epoch": 0.4663023679417122, - "grad_norm": 0.2555619180202484, + "epoch": 0.3082109318564893, + "grad_norm": 0.45126873254776, "learning_rate": 2e-05, - "loss": 1.0212, - "mean_token_accuracy": 0.7146265876893015, + "loss": 1.1416, + "mean_token_accuracy": 0.6872466802597046, "step": 320 }, { - "epoch": 0.47358834244080145, - "grad_norm": 0.2347564995288849, + "epoch": 0.31302672766674694, + "grad_norm": 0.4647647440433502, "learning_rate": 2e-05, - "loss": 0.9874, - "mean_token_accuracy": 0.7231909501709821, + "loss": 1.0582, + "mean_token_accuracy": 0.7075585454702378, "step": 325 }, { - "epoch": 0.4808743169398907, - "grad_norm": 0.24380792677402496, + "epoch": 0.3178425234770046, + "grad_norm": 0.5326666831970215, "learning_rate": 2e-05, - "loss": 0.9829, - "mean_token_accuracy": 0.7247633732291158, + "loss": 1.0665, + "mean_token_accuracy": 0.7061545759439468, "step": 330 }, { - "epoch": 0.48816029143898, - "grad_norm": 0.24516567587852478, + "epoch": 0.32265831928726224, + "grad_norm": 0.4525166153907776, "learning_rate": 2e-05, - "loss": 1.0198, - "mean_token_accuracy": 0.7143673417060667, + "loss": 1.0411, + "mean_token_accuracy": 0.7101572811603546, "step": 335 }, { - "epoch": 0.49544626593806923, - "grad_norm": 0.23901765048503876, + "epoch": 0.3274741150975199, + "grad_norm": 0.4440806806087494, "learning_rate": 2e-05, - "loss": 1.0099, - "mean_token_accuracy": 0.7166050928187591, + "loss": 1.0884, + "mean_token_accuracy": 0.701458466053009, "step": 340 }, { - "epoch": 0.5027322404371585, - "grad_norm": 0.252539724111557, + "epoch": 0.3322899109077775, + "grad_norm": 0.455497145652771, "learning_rate": 2e-05, - "loss": 0.9999, - "mean_token_accuracy": 0.7207071323888619, + "loss": 1.0372, + "mean_token_accuracy": 0.7121497482061386, "step": 345 }, { - "epoch": 0.5100182149362478, - "grad_norm": 0.2206733673810959, + "epoch": 0.33710570671803514, + "grad_norm": 0.4636366665363312, "learning_rate": 2e-05, - "loss": 0.9919, - "mean_token_accuracy": 0.7224993893502688, + "loss": 1.0269, + "mean_token_accuracy": 0.7159249395132065, "step": 350 }, { - "epoch": 0.517304189435337, - "grad_norm": 0.21977460384368896, + "epoch": 0.3419215025282928, + "grad_norm": 0.4710885286331177, "learning_rate": 2e-05, - "loss": 1.0083, - "mean_token_accuracy": 0.716026502198339, + "loss": 1.0242, + "mean_token_accuracy": 0.7154449820518494, "step": 355 }, { - "epoch": 0.5245901639344263, - "grad_norm": 0.25190576910972595, + "epoch": 0.34673729833855044, + "grad_norm": 0.49214205145835876, "learning_rate": 2e-05, - "loss": 1.0001, - "mean_token_accuracy": 0.7192278334147535, + "loss": 1.0676, + "mean_token_accuracy": 0.7049440264701843, "step": 360 }, { - "epoch": 0.5318761384335154, - "grad_norm": 0.2302297204732895, + "epoch": 0.3515530941488081, + "grad_norm": 0.4833540916442871, "learning_rate": 2e-05, - "loss": 0.9807, - "mean_token_accuracy": 0.7235634465070835, + "loss": 1.0756, + "mean_token_accuracy": 0.7041820973157883, "step": 365 }, { - "epoch": 0.5391621129326047, - "grad_norm": 0.2296873778104782, + "epoch": 0.35636888995906574, + "grad_norm": 0.4929426610469818, "learning_rate": 2e-05, - "loss": 0.979, - "mean_token_accuracy": 0.7253984489496823, + "loss": 1.0363, + "mean_token_accuracy": 0.7134405076503754, "step": 370 }, { - "epoch": 0.546448087431694, - "grad_norm": 0.22952046990394592, + "epoch": 0.3611846857693234, + "grad_norm": 0.4757974147796631, "learning_rate": 2e-05, - "loss": 1.0024, - "mean_token_accuracy": 0.7198125305324866, + "loss": 1.0506, + "mean_token_accuracy": 0.7101884871721268, "step": 375 }, { - "epoch": 0.5537340619307832, - "grad_norm": 0.26434800028800964, + "epoch": 0.36600048157958104, + "grad_norm": 0.49926406145095825, "learning_rate": 2e-05, - "loss": 1.0029, - "mean_token_accuracy": 0.7184065095261356, + "loss": 1.047, + "mean_token_accuracy": 0.7088975489139557, "step": 380 }, { - "epoch": 0.5610200364298725, - "grad_norm": 0.24686159193515778, + "epoch": 0.3708162773898387, + "grad_norm": 0.506790280342102, "learning_rate": 2e-05, - "loss": 0.9886, - "mean_token_accuracy": 0.7215569017124912, + "loss": 1.006, + "mean_token_accuracy": 0.7221918374300003, "step": 385 }, { - "epoch": 0.5683060109289617, - "grad_norm": 0.2560524344444275, + "epoch": 0.37563207320009634, + "grad_norm": 0.467853844165802, "learning_rate": 2e-05, - "loss": 0.9816, - "mean_token_accuracy": 0.7243008060576454, + "loss": 1.0646, + "mean_token_accuracy": 0.7053724229335785, "step": 390 }, { - "epoch": 0.575591985428051, - "grad_norm": 0.23923024535179138, + "epoch": 0.38044786901035393, + "grad_norm": 0.45863065123558044, "learning_rate": 2e-05, - "loss": 1.0118, - "mean_token_accuracy": 0.7144159135319982, + "loss": 1.0539, + "mean_token_accuracy": 0.7085458695888519, "step": 395 }, { - "epoch": 0.5828779599271403, - "grad_norm": 0.2485678642988205, + "epoch": 0.3852636648206116, + "grad_norm": 0.4951156675815582, "learning_rate": 2e-05, - "loss": 1.0, - "mean_token_accuracy": 0.7188690766976062, + "loss": 1.0268, + "mean_token_accuracy": 0.7149923086166382, "step": 400 }, { - "epoch": 0.5828779599271403, - "eval_loss": 1.008617639541626, - "eval_mean_token_accuracy": 0.714862565310479, - "eval_runtime": 38.6648, - "eval_samples_per_second": 6.776, - "eval_steps_per_second": 0.44, + "epoch": 0.3852636648206116, + "eval_loss": 1.0528124570846558, + "eval_mean_token_accuracy": 0.7111574503091666, + "eval_runtime": 7.4249, + "eval_samples_per_second": 13.468, + "eval_steps_per_second": 1.751, "step": 400 }, { - "epoch": 0.5901639344262295, - "grad_norm": 0.2388627827167511, + "epoch": 0.39007946063086923, + "grad_norm": 0.4989476203918457, "learning_rate": 2e-05, - "loss": 0.9903, - "mean_token_accuracy": 0.7215254030288225, + "loss": 1.1, + "mean_token_accuracy": 0.6973515421152114, "step": 405 }, { - "epoch": 0.5974499089253188, - "grad_norm": 0.23851706087589264, + "epoch": 0.3948952564411269, + "grad_norm": 0.4778280556201935, "learning_rate": 2e-05, - "loss": 1.002, - "mean_token_accuracy": 0.7170279677576941, + "loss": 1.0217, + "mean_token_accuracy": 0.7147800713777542, "step": 410 }, { - "epoch": 0.604735883424408, - "grad_norm": 0.25729191303253174, + "epoch": 0.39971105225138454, + "grad_norm": 0.47735705971717834, "learning_rate": 2e-05, - "loss": 1.0014, - "mean_token_accuracy": 0.7183484978016609, + "loss": 1.0789, + "mean_token_accuracy": 0.7017655581235885, "step": 415 }, { - "epoch": 0.6120218579234973, - "grad_norm": 0.24574624001979828, + "epoch": 0.4045268480616422, + "grad_norm": 0.4676768183708191, "learning_rate": 2e-05, - "loss": 1.0013, - "mean_token_accuracy": 0.7191163898387888, + "loss": 1.0402, + "mean_token_accuracy": 0.7099715679883957, "step": 420 }, { - "epoch": 0.6193078324225865, - "grad_norm": 0.24090902507305145, + "epoch": 0.40934264387189984, + "grad_norm": 0.5054851770401001, "learning_rate": 2e-05, - "loss": 1.0069, - "mean_token_accuracy": 0.7172478016609672, + "loss": 1.0594, + "mean_token_accuracy": 0.70656159222126, "step": 425 }, { - "epoch": 0.6265938069216758, - "grad_norm": 0.24890194833278656, + "epoch": 0.4141584396821575, + "grad_norm": 0.464779794216156, "learning_rate": 2e-05, - "loss": 0.9897, - "mean_token_accuracy": 0.7221360527601368, + "loss": 1.0399, + "mean_token_accuracy": 0.7095290720462799, "step": 430 }, { - "epoch": 0.6338797814207651, - "grad_norm": 0.27910518646240234, + "epoch": 0.41897423549241514, + "grad_norm": 0.470594584941864, "learning_rate": 2e-05, - "loss": 1.0073, - "mean_token_accuracy": 0.7160036028334147, + "loss": 1.0284, + "mean_token_accuracy": 0.7146120637655258, "step": 435 }, { - "epoch": 0.6411657559198543, - "grad_norm": 0.24221768975257874, + "epoch": 0.4237900313026728, + "grad_norm": 0.4631585478782654, "learning_rate": 2e-05, - "loss": 0.9711, - "mean_token_accuracy": 0.7259183995837477, + "loss": 1.0712, + "mean_token_accuracy": 0.7038343369960784, "step": 440 }, { - "epoch": 0.6484517304189436, - "grad_norm": 0.23564772307872772, + "epoch": 0.42860582711293044, + "grad_norm": 0.5031366348266602, "learning_rate": 2e-05, - "loss": 0.9914, - "mean_token_accuracy": 0.7203071568148511, + "loss": 1.0038, + "mean_token_accuracy": 0.7181661784648895, "step": 445 }, { - "epoch": 0.6557377049180327, - "grad_norm": 0.23250643908977509, + "epoch": 0.43342162292318803, + "grad_norm": 0.4990742802619934, "learning_rate": 2e-05, - "loss": 0.9863, - "mean_token_accuracy": 0.7207727772349781, + "loss": 1.0521, + "mean_token_accuracy": 0.7067010790109635, "step": 450 }, { - "epoch": 0.663023679417122, - "grad_norm": 0.22369219362735748, + "epoch": 0.4382374187334457, + "grad_norm": 0.505125105381012, "learning_rate": 2e-05, - "loss": 0.9786, - "mean_token_accuracy": 0.7255831704934049, + "loss": 1.0461, + "mean_token_accuracy": 0.7104395478963852, "step": 455 }, { - "epoch": 0.6703096539162113, - "grad_norm": 0.2398165464401245, + "epoch": 0.44305321454370333, + "grad_norm": 0.49441832304000854, "learning_rate": 2e-05, - "loss": 0.9824, - "mean_token_accuracy": 0.723346665852467, + "loss": 1.0268, + "mean_token_accuracy": 0.7156393229961395, "step": 460 }, { - "epoch": 0.6775956284153005, - "grad_norm": 0.2463538646697998, + "epoch": 0.447869010353961, + "grad_norm": 0.4587756395339966, "learning_rate": 2e-05, - "loss": 0.9823, - "mean_token_accuracy": 0.7212032852955546, + "loss": 1.0119, + "mean_token_accuracy": 0.7168730080127717, "step": 465 }, { - "epoch": 0.6848816029143898, - "grad_norm": 0.25979697704315186, + "epoch": 0.45268480616421863, + "grad_norm": 0.4372672438621521, "learning_rate": 2e-05, - "loss": 0.9844, - "mean_token_accuracy": 0.7220978871519298, + "loss": 1.0133, + "mean_token_accuracy": 0.7184069484472275, "step": 470 }, { - "epoch": 0.692167577413479, - "grad_norm": 0.25015923380851746, + "epoch": 0.4575006019744763, + "grad_norm": 0.4742435812950134, "learning_rate": 2e-05, - "loss": 0.9966, - "mean_token_accuracy": 0.717882877381534, + "loss": 1.0209, + "mean_token_accuracy": 0.7158730626106262, "step": 475 }, { - "epoch": 0.6994535519125683, - "grad_norm": 0.2696310877799988, + "epoch": 0.46231639778473393, + "grad_norm": 0.43648701906204224, "learning_rate": 2e-05, - "loss": 0.9742, - "mean_token_accuracy": 0.7241342766156815, + "loss": 1.0335, + "mean_token_accuracy": 0.7114817708730697, "step": 480 }, { - "epoch": 0.7067395264116576, - "grad_norm": 0.23021022975444794, + "epoch": 0.4671321935949916, + "grad_norm": 0.47067275643348694, "learning_rate": 2e-05, - "loss": 0.9685, - "mean_token_accuracy": 0.7260197850512945, + "loss": 1.0327, + "mean_token_accuracy": 0.7122172027826309, "step": 485 }, { - "epoch": 0.7140255009107468, - "grad_norm": 0.25251683592796326, + "epoch": 0.47194798940524924, + "grad_norm": 0.4419151246547699, "learning_rate": 2e-05, - "loss": 0.9619, - "mean_token_accuracy": 0.7277463971665853, + "loss": 1.0094, + "mean_token_accuracy": 0.7186745911836624, "step": 490 }, { - "epoch": 0.7213114754098361, - "grad_norm": 0.23041123151779175, + "epoch": 0.4767637852155069, + "grad_norm": 0.47076234221458435, "learning_rate": 2e-05, - "loss": 0.9733, - "mean_token_accuracy": 0.7242901196873474, + "loss": 1.0115, + "mean_token_accuracy": 0.7169291287660599, "step": 495 }, { - "epoch": 0.7285974499089253, - "grad_norm": 0.2576392889022827, + "epoch": 0.4815795810257645, + "grad_norm": 0.49152296781539917, "learning_rate": 2e-05, - "loss": 0.9852, - "mean_token_accuracy": 0.7223543600390816, + "loss": 0.9952, + "mean_token_accuracy": 0.7206105887889862, "step": 500 }, { - "epoch": 0.7285974499089253, - "eval_loss": 0.9932907819747925, - "eval_mean_token_accuracy": 0.7173129502108766, - "eval_runtime": 37.0409, - "eval_samples_per_second": 7.073, - "eval_steps_per_second": 0.459, + "epoch": 0.4815795810257645, + "eval_loss": 1.0356249809265137, + "eval_mean_token_accuracy": 0.7145319947829614, + "eval_runtime": 7.4076, + "eval_samples_per_second": 13.5, + "eval_steps_per_second": 1.755, "step": 500 }, { - "epoch": 0.7358834244080146, - "grad_norm": 0.2792870104312897, + "epoch": 0.48639537683602213, + "grad_norm": 0.4353288412094116, "learning_rate": 2e-05, - "loss": 0.9639, - "mean_token_accuracy": 0.7265312042012704, + "loss": 1.0562, + "mean_token_accuracy": 0.706000679731369, "step": 505 }, { - "epoch": 0.7431693989071039, - "grad_norm": 0.2572779059410095, + "epoch": 0.4912111726462798, + "grad_norm": 0.4375183582305908, "learning_rate": 2e-05, - "loss": 0.9807, - "mean_token_accuracy": 0.7220719345383488, + "loss": 1.0153, + "mean_token_accuracy": 0.7189254313707352, "step": 510 }, { - "epoch": 0.7504553734061931, - "grad_norm": 0.23189429938793182, + "epoch": 0.49602696845653743, + "grad_norm": 0.5375787615776062, "learning_rate": 2e-05, - "loss": 0.982, - "mean_token_accuracy": 0.7221726917440157, + "loss": 1.0458, + "mean_token_accuracy": 0.7079954385757447, "step": 515 }, { - "epoch": 0.7577413479052824, - "grad_norm": 0.23592406511306763, + "epoch": 0.5008427642667951, + "grad_norm": 0.46828216314315796, "learning_rate": 2e-05, - "loss": 0.9817, - "mean_token_accuracy": 0.7224322178798241, + "loss": 1.0113, + "mean_token_accuracy": 0.7164820760488511, "step": 520 }, { - "epoch": 0.7650273224043715, - "grad_norm": 0.24176359176635742, + "epoch": 0.5056585600770528, + "grad_norm": 0.48052704334259033, "learning_rate": 2e-05, - "loss": 0.9749, - "mean_token_accuracy": 0.7247175744992674, + "loss": 0.9916, + "mean_token_accuracy": 0.7223492801189423, "step": 525 }, { - "epoch": 0.7723132969034608, - "grad_norm": 0.2435123324394226, + "epoch": 0.5104743558873104, + "grad_norm": 0.4925815463066101, "learning_rate": 2e-05, - "loss": 0.9819, - "mean_token_accuracy": 0.7221314728871521, + "loss": 1.02, + "mean_token_accuracy": 0.7158171087503433, "step": 530 }, { - "epoch": 0.7795992714025501, - "grad_norm": 0.22645749151706696, + "epoch": 0.515290151697568, + "grad_norm": 0.47382616996765137, "learning_rate": 2e-05, - "loss": 0.9847, - "mean_token_accuracy": 0.7216826453346362, + "loss": 1.0491, + "mean_token_accuracy": 0.707519245147705, "step": 535 }, { - "epoch": 0.7868852459016393, - "grad_norm": 0.2561565339565277, + "epoch": 0.5201059475078257, + "grad_norm": 0.45917588472366333, "learning_rate": 2e-05, - "loss": 0.9747, - "mean_token_accuracy": 0.723244382022472, + "loss": 1.0204, + "mean_token_accuracy": 0.716147267818451, "step": 540 }, { - "epoch": 0.7941712204007286, - "grad_norm": 0.238953098654747, + "epoch": 0.5249217433180833, + "grad_norm": 0.47048982977867126, "learning_rate": 2e-05, - "loss": 0.978, - "mean_token_accuracy": 0.722381839276991, + "loss": 1.0341, + "mean_token_accuracy": 0.7119667261838913, "step": 545 }, { - "epoch": 0.8014571948998178, - "grad_norm": 0.27305781841278076, + "epoch": 0.529737539128341, + "grad_norm": 0.49886590242385864, "learning_rate": 2e-05, - "loss": 0.9671, - "mean_token_accuracy": 0.725273265754763, + "loss": 1.0018, + "mean_token_accuracy": 0.7183328688144683, "step": 550 }, { - "epoch": 0.8087431693989071, - "grad_norm": 0.23001207411289215, + "epoch": 0.5345533349385986, + "grad_norm": 0.43700650334358215, "learning_rate": 2e-05, - "loss": 0.9588, - "mean_token_accuracy": 0.7276065295012101, + "loss": 1.0127, + "mean_token_accuracy": 0.7179402559995651, "step": 555 }, { - "epoch": 0.8160291438979964, - "grad_norm": 0.24449627101421356, + "epoch": 0.5393691307488563, + "grad_norm": 0.45927077531814575, "learning_rate": 2e-05, - "loss": 0.9728, - "mean_token_accuracy": 0.7239069369809478, + "loss": 1.0199, + "mean_token_accuracy": 0.7139646023511886, "step": 560 }, { - "epoch": 0.8233151183970856, - "grad_norm": 0.23095275461673737, + "epoch": 0.5441849265591139, + "grad_norm": 0.466413289308548, "learning_rate": 2e-05, - "loss": 0.967, - "mean_token_accuracy": 0.7263602222765023, + "loss": 1.0538, + "mean_token_accuracy": 0.7058045387268066, "step": 565 }, { - "epoch": 0.8306010928961749, - "grad_norm": 0.2598857879638672, + "epoch": 0.5490007223693716, + "grad_norm": 0.4314544200897217, "learning_rate": 2e-05, - "loss": 0.976, - "mean_token_accuracy": 0.722152845627748, + "loss": 0.999, + "mean_token_accuracy": 0.7193293124437332, "step": 570 }, { - "epoch": 0.8378870673952641, - "grad_norm": 0.2525656223297119, + "epoch": 0.5538165181796292, + "grad_norm": 0.48159873485565186, "learning_rate": 2e-05, - "loss": 0.9756, - "mean_token_accuracy": 0.7226230459208598, + "loss": 1.055, + "mean_token_accuracy": 0.7064671397209168, "step": 575 }, { - "epoch": 0.8451730418943534, - "grad_norm": 0.23842041194438934, + "epoch": 0.5586323139898868, + "grad_norm": 0.48405176401138306, "learning_rate": 2e-05, - "loss": 0.9929, - "mean_token_accuracy": 0.7186629824132877, + "loss": 0.9989, + "mean_token_accuracy": 0.7200118064880371, "step": 580 }, { - "epoch": 0.8524590163934426, - "grad_norm": 0.26655957102775574, + "epoch": 0.5634481098001445, + "grad_norm": 0.5083625912666321, "learning_rate": 2e-05, - "loss": 0.9732, - "mean_token_accuracy": 0.7229268441621884, + "loss": 1.0452, + "mean_token_accuracy": 0.7077913284301758, "step": 585 }, { - "epoch": 0.8597449908925319, - "grad_norm": 0.2641935348510742, + "epoch": 0.5682639056104021, + "grad_norm": 0.4147779941558838, "learning_rate": 2e-05, - "loss": 0.9758, - "mean_token_accuracy": 0.7253282242305814, + "loss": 1.0126, + "mean_token_accuracy": 0.7150066405534744, "step": 590 }, { - "epoch": 0.8670309653916212, - "grad_norm": 0.24463647603988647, + "epoch": 0.5730797014206598, + "grad_norm": 0.454995721578598, "learning_rate": 2e-05, - "loss": 0.983, - "mean_token_accuracy": 0.7212307645334635, + "loss": 1.0548, + "mean_token_accuracy": 0.7070407778024673, "step": 595 }, { - "epoch": 0.8743169398907104, - "grad_norm": 0.24827370047569275, + "epoch": 0.5778954972309174, + "grad_norm": 0.4965215027332306, "learning_rate": 2e-05, - "loss": 0.9807, - "mean_token_accuracy": 0.7212093917928677, + "loss": 1.0068, + "mean_token_accuracy": 0.7180212557315826, "step": 600 }, { - "epoch": 0.8743169398907104, - "eval_loss": 0.979246199131012, - "eval_mean_token_accuracy": 0.7202548355395697, - "eval_runtime": 38.2386, - "eval_samples_per_second": 6.852, - "eval_steps_per_second": 0.445, + "epoch": 0.5778954972309174, + "eval_loss": 1.0229687690734863, + "eval_mean_token_accuracy": 0.7169581284889808, + "eval_runtime": 7.4219, + "eval_samples_per_second": 13.474, + "eval_steps_per_second": 1.752, "step": 600 }, { - "epoch": 0.8816029143897997, - "grad_norm": 0.23297545313835144, + "epoch": 0.5827112930411751, + "grad_norm": 0.4686574339866638, "learning_rate": 2e-05, - "loss": 0.9609, - "mean_token_accuracy": 0.7272982922789468, + "loss": 1.0378, + "mean_token_accuracy": 0.7109969854354858, "step": 605 }, { - "epoch": 0.8888888888888888, - "grad_norm": 0.23551233112812042, + "epoch": 0.5875270888514327, + "grad_norm": 0.4724644124507904, "learning_rate": 2e-05, - "loss": 0.9715, - "mean_token_accuracy": 0.7250076331216416, + "loss": 1.0339, + "mean_token_accuracy": 0.712575551867485, "step": 610 }, { - "epoch": 0.8961748633879781, - "grad_norm": 0.24120616912841797, + "epoch": 0.5923428846616904, + "grad_norm": 0.45253986120224, "learning_rate": 2e-05, - "loss": 0.9707, - "mean_token_accuracy": 0.7239664753297508, + "loss": 0.9974, + "mean_token_accuracy": 0.7197393357753754, "step": 615 }, { - "epoch": 0.9034608378870674, - "grad_norm": 0.2541744112968445, + "epoch": 0.597158680471948, + "grad_norm": 0.4631718695163727, "learning_rate": 2e-05, - "loss": 0.9695, - "mean_token_accuracy": 0.7240719724584961, + "loss": 1.0661, + "mean_token_accuracy": 0.7025329202413559, "step": 620 }, { - "epoch": 0.9107468123861566, - "grad_norm": 0.2749602198600769, + "epoch": 0.6019744762822057, + "grad_norm": 0.45256227254867554, "learning_rate": 2e-05, - "loss": 0.9714, - "mean_token_accuracy": 0.7238626648754275, + "loss": 0.9886, + "mean_token_accuracy": 0.7221844792366028, "step": 625 }, { - "epoch": 0.9180327868852459, - "grad_norm": 0.2778976559638977, + "epoch": 0.6067902720924633, + "grad_norm": 0.4730903208255768, "learning_rate": 2e-05, - "loss": 0.9765, - "mean_token_accuracy": 0.7213330483634588, + "loss": 1.003, + "mean_token_accuracy": 0.7192000895738602, "step": 630 }, { - "epoch": 0.9253187613843351, - "grad_norm": 0.24223344027996063, + "epoch": 0.6116060679027209, + "grad_norm": 0.45991525053977966, "learning_rate": 2e-05, - "loss": 0.9472, - "mean_token_accuracy": 0.7306683561309237, + "loss": 1.0419, + "mean_token_accuracy": 0.7091460645198822, "step": 635 }, { - "epoch": 0.9326047358834244, - "grad_norm": 0.2598780691623688, + "epoch": 0.6164218637129786, + "grad_norm": 0.43712058663368225, "learning_rate": 2e-05, - "loss": 0.9544, - "mean_token_accuracy": 0.7294760625305325, + "loss": 1.0274, + "mean_token_accuracy": 0.7129867494106292, "step": 640 }, { - "epoch": 0.9398907103825137, - "grad_norm": 0.2586725950241089, + "epoch": 0.6212376595232362, + "grad_norm": 0.5012054443359375, "learning_rate": 2e-05, - "loss": 0.9571, - "mean_token_accuracy": 0.7278074621397166, + "loss": 1.0087, + "mean_token_accuracy": 0.717068886756897, "step": 645 }, { - "epoch": 0.9471766848816029, - "grad_norm": 0.23097127676010132, + "epoch": 0.6260534553334939, + "grad_norm": 0.44968077540397644, "learning_rate": 2e-05, - "loss": 0.951, - "mean_token_accuracy": 0.7284211651196874, + "loss": 1.0275, + "mean_token_accuracy": 0.712301179766655, "step": 650 }, { - "epoch": 0.9544626593806922, - "grad_norm": 0.242562934756279, + "epoch": 0.6308692511437515, + "grad_norm": 0.5434072613716125, "learning_rate": 2e-05, - "loss": 0.9743, - "mean_token_accuracy": 0.7239756350757205, + "loss": 1.0335, + "mean_token_accuracy": 0.7124795228242874, "step": 655 }, { - "epoch": 0.9617486338797814, - "grad_norm": 0.24876342713832855, + "epoch": 0.6356850469540092, + "grad_norm": 0.4992160201072693, "learning_rate": 2e-05, - "loss": 0.9561, - "mean_token_accuracy": 0.7260304714215927, + "loss": 1.0437, + "mean_token_accuracy": 0.7077430039644241, "step": 660 }, { - "epoch": 0.9690346083788707, - "grad_norm": 0.25081324577331543, + "epoch": 0.6405008427642668, + "grad_norm": 0.4403627812862396, "learning_rate": 2e-05, - "loss": 0.9534, - "mean_token_accuracy": 0.7289707498778701, + "loss": 0.9781, + "mean_token_accuracy": 0.7234994322061539, "step": 665 }, { - "epoch": 0.97632058287796, - "grad_norm": 0.22985951602458954, + "epoch": 0.6453166385745245, + "grad_norm": 0.4874439239501953, "learning_rate": 2e-05, - "loss": 0.9596, - "mean_token_accuracy": 0.726289997557401, + "loss": 1.0354, + "mean_token_accuracy": 0.7110928893089294, "step": 670 }, { - "epoch": 0.9836065573770492, - "grad_norm": 0.24084888398647308, + "epoch": 0.6501324343847821, + "grad_norm": 0.47384676337242126, "learning_rate": 2e-05, - "loss": 0.9573, - "mean_token_accuracy": 0.7278120420127016, + "loss": 0.9852, + "mean_token_accuracy": 0.7246900945901871, "step": 675 }, { - "epoch": 0.9908925318761385, - "grad_norm": 0.2884751558303833, + "epoch": 0.6549482301950398, + "grad_norm": 0.47619345784187317, "learning_rate": 2e-05, - "loss": 0.9637, - "mean_token_accuracy": 0.7256558724047968, + "loss": 1.0197, + "mean_token_accuracy": 0.7132910996675491, "step": 680 }, { - "epoch": 0.9981785063752276, - "grad_norm": 0.23881231248378754, + "epoch": 0.6597640260052974, + "grad_norm": 0.4445981979370117, "learning_rate": 2e-05, - "loss": 0.9658, - "mean_token_accuracy": 0.7258991125185024, + "loss": 0.9935, + "mean_token_accuracy": 0.7201527744531632, "step": 685 }, { - "epoch": 1.0043715846994536, - "grad_norm": 0.2694164514541626, + "epoch": 0.664579821815555, + "grad_norm": 0.46052321791648865, "learning_rate": 2e-05, - "loss": 0.78, - "mean_token_accuracy": 0.7354216356791862, + "loss": 1.0267, + "mean_token_accuracy": 0.7131595432758331, "step": 690 }, { - "epoch": 1.0116575591985428, - "grad_norm": 0.28284427523612976, + "epoch": 0.6693956176258127, + "grad_norm": 0.46183568239212036, "learning_rate": 2e-05, - "loss": 0.9199, - "mean_token_accuracy": 0.7356466780654618, + "loss": 1.0137, + "mean_token_accuracy": 0.7151708722114563, "step": 695 }, { - "epoch": 1.018943533697632, - "grad_norm": 0.24294410645961761, + "epoch": 0.6742114134360703, + "grad_norm": 0.49695634841918945, "learning_rate": 2e-05, - "loss": 0.9269, - "mean_token_accuracy": 0.7344269052271617, + "loss": 1.0145, + "mean_token_accuracy": 0.7154181480407715, "step": 700 }, { - "epoch": 1.018943533697632, - "eval_loss": 0.9708074927330017, - "eval_mean_token_accuracy": 0.7221159128785493, - "eval_runtime": 39.0528, - "eval_samples_per_second": 6.709, - "eval_steps_per_second": 0.435, + "epoch": 0.6742114134360703, + "eval_loss": 1.0114063024520874, + "eval_mean_token_accuracy": 0.7193385500174302, + "eval_runtime": 7.4015, + "eval_samples_per_second": 13.511, + "eval_steps_per_second": 1.756, "step": 700 }, { - "epoch": 1.0262295081967212, - "grad_norm": 0.2431807816028595, + "epoch": 0.679027209246328, + "grad_norm": 0.48050031065940857, "learning_rate": 2e-05, - "loss": 0.9302, - "mean_token_accuracy": 0.7342864557889597, + "loss": 1.0213, + "mean_token_accuracy": 0.712389525771141, "step": 705 }, { - "epoch": 1.0335154826958106, - "grad_norm": 0.23510615527629852, + "epoch": 0.6838430050565856, + "grad_norm": 0.47063517570495605, "learning_rate": 2e-05, - "loss": 0.9161, - "mean_token_accuracy": 0.7370954445530045, + "loss": 1.0073, + "mean_token_accuracy": 0.7171070665121079, "step": 710 }, { - "epoch": 1.0408014571948998, - "grad_norm": 0.25080975890159607, + "epoch": 0.6886588008668433, + "grad_norm": 0.46786707639694214, "learning_rate": 2e-05, - "loss": 0.9284, - "mean_token_accuracy": 0.7337506106497316, + "loss": 1.0093, + "mean_token_accuracy": 0.71626777946949, "step": 715 }, { - "epoch": 1.048087431693989, - "grad_norm": 0.22864864766597748, + "epoch": 0.6934745966771009, + "grad_norm": 0.4665524959564209, "learning_rate": 2e-05, - "loss": 0.9224, - "mean_token_accuracy": 0.7355474474841233, + "loss": 1.0405, + "mean_token_accuracy": 0.7099679440259934, "step": 720 }, { - "epoch": 1.0553734061930784, - "grad_norm": 0.22231045365333557, + "epoch": 0.6982903924873586, + "grad_norm": 0.43055522441864014, "learning_rate": 2e-05, - "loss": 0.9309, - "mean_token_accuracy": 0.7322361993160723, + "loss": 1.063, + "mean_token_accuracy": 0.7021431714296341, "step": 725 }, { - "epoch": 1.0626593806921676, - "grad_norm": 0.26311782002449036, + "epoch": 0.7031061882976162, + "grad_norm": 0.4753313958644867, "learning_rate": 2e-05, - "loss": 0.927, - "mean_token_accuracy": 0.7335413663120914, + "loss": 0.9971, + "mean_token_accuracy": 0.7194192945957184, "step": 730 }, { - "epoch": 1.0699453551912568, - "grad_norm": 0.27380216121673584, + "epoch": 0.7079219841078739, + "grad_norm": 0.4904627501964569, "learning_rate": 2e-05, - "loss": 0.9099, - "mean_token_accuracy": 0.7381701880801176, + "loss": 1.0045, + "mean_token_accuracy": 0.7177853912115097, "step": 735 }, { - "epoch": 1.0772313296903462, - "grad_norm": 0.23722966015338898, + "epoch": 0.7127377799181315, + "grad_norm": 0.48672381043434143, "learning_rate": 2e-05, - "loss": 0.9104, - "mean_token_accuracy": 0.7384098680996581, + "loss": 1.0169, + "mean_token_accuracy": 0.7152146756649017, "step": 740 }, { - "epoch": 1.0845173041894354, - "grad_norm": 0.23841090500354767, + "epoch": 0.7175535757283891, + "grad_norm": 0.4979478418827057, "learning_rate": 2e-05, - "loss": 0.9326, - "mean_token_accuracy": 0.730987809040814, + "loss": 0.9924, + "mean_token_accuracy": 0.7202946066856384, "step": 745 }, { - "epoch": 1.0918032786885246, - "grad_norm": 0.21909235417842865, + "epoch": 0.7223693715386468, + "grad_norm": 0.4939980208873749, "learning_rate": 2e-05, - "loss": 0.9216, - "mean_token_accuracy": 0.7349217046220666, + "loss": 1.0003, + "mean_token_accuracy": 0.7201969802379609, "step": 750 }, { - "epoch": 1.0990892531876137, - "grad_norm": 0.23422910273075104, + "epoch": 0.7271851673489044, + "grad_norm": 0.5065310597419739, "learning_rate": 2e-05, - "loss": 0.9153, - "mean_token_accuracy": 0.737058805569126, + "loss": 0.9996, + "mean_token_accuracy": 0.7165877521038055, "step": 755 }, { - "epoch": 1.1063752276867032, - "grad_norm": 0.25514715909957886, + "epoch": 0.7320009631591621, + "grad_norm": 0.45522525906562805, "learning_rate": 2e-05, - "loss": 0.9185, - "mean_token_accuracy": 0.7353154005862238, + "loss": 1.0276, + "mean_token_accuracy": 0.7141087472438812, "step": 760 }, { - "epoch": 1.1136612021857923, - "grad_norm": 0.2466048300266266, + "epoch": 0.7368167589694197, + "grad_norm": 0.4365679919719696, "learning_rate": 2e-05, - "loss": 0.9171, - "mean_token_accuracy": 0.7364954811919885, + "loss": 0.943, + "mean_token_accuracy": 0.7309204876422882, "step": 765 }, { - "epoch": 1.1209471766848815, - "grad_norm": 0.2330339401960373, + "epoch": 0.7416325547796774, + "grad_norm": 0.44899559020996094, "learning_rate": 2e-05, - "loss": 0.9181, - "mean_token_accuracy": 0.7352024303859309, + "loss": 1.0011, + "mean_token_accuracy": 0.718404445052147, "step": 770 }, { - "epoch": 1.128233151183971, - "grad_norm": 0.23831795156002045, + "epoch": 0.746448350589935, + "grad_norm": 0.48060882091522217, "learning_rate": 2e-05, - "loss": 0.93, - "mean_token_accuracy": 0.7330208842208114, + "loss": 1.0183, + "mean_token_accuracy": 0.7138826489448548, "step": 775 }, { - "epoch": 1.1355191256830601, - "grad_norm": 0.23831504583358765, + "epoch": 0.7512641464001927, + "grad_norm": 0.4900418221950531, "learning_rate": 2e-05, - "loss": 0.928, - "mean_token_accuracy": 0.7325506839276994, + "loss": 0.9712, + "mean_token_accuracy": 0.7260754346847534, "step": 780 }, { - "epoch": 1.1428051001821493, - "grad_norm": 0.243895024061203, + "epoch": 0.7560799422104503, + "grad_norm": 0.46254783868789673, "learning_rate": 2e-05, - "loss": 0.926, - "mean_token_accuracy": 0.7334284929164633, + "loss": 0.9738, + "mean_token_accuracy": 0.7248203098773957, "step": 785 }, { - "epoch": 1.1500910746812387, - "grad_norm": 0.23136085271835327, + "epoch": 0.7608957380207079, + "grad_norm": 0.4280416667461395, "learning_rate": 2e-05, - "loss": 0.9135, - "mean_token_accuracy": 0.736092452369321, + "loss": 1.0079, + "mean_token_accuracy": 0.7169389814138413, "step": 790 }, { - "epoch": 1.157377049180328, - "grad_norm": 0.2278895527124405, + "epoch": 0.7657115338309656, + "grad_norm": 0.43759432435035706, "learning_rate": 2e-05, - "loss": 0.8998, - "mean_token_accuracy": 0.7414035784074257, + "loss": 0.9617, + "mean_token_accuracy": 0.7278865665197373, "step": 795 }, { - "epoch": 1.164663023679417, - "grad_norm": 0.21887780725955963, + "epoch": 0.7705273296412232, + "grad_norm": 0.469664990901947, "learning_rate": 2e-05, - "loss": 0.931, - "mean_token_accuracy": 0.7319369809477286, + "loss": 1.0503, + "mean_token_accuracy": 0.7059356421232224, "step": 800 }, { - "epoch": 1.164663023679417, - "eval_loss": 0.962488055229187, - "eval_mean_token_accuracy": 0.7237455079222439, - "eval_runtime": 35.8741, - "eval_samples_per_second": 7.303, - "eval_steps_per_second": 0.474, + "epoch": 0.7705273296412232, + "eval_loss": 1.001406192779541, + "eval_mean_token_accuracy": 0.7216103397882901, + "eval_runtime": 7.4383, + "eval_samples_per_second": 13.444, + "eval_steps_per_second": 1.748, "step": 800 }, { - "epoch": 1.1719489981785063, - "grad_norm": 0.24219731986522675, + "epoch": 0.7753431254514809, + "grad_norm": 0.48634785413742065, "learning_rate": 2e-05, - "loss": 0.9308, - "mean_token_accuracy": 0.7323140571568149, + "loss": 1.0094, + "mean_token_accuracy": 0.7159925192594528, "step": 805 }, { - "epoch": 1.1792349726775957, - "grad_norm": 0.23919643461704254, + "epoch": 0.7801589212617385, + "grad_norm": 0.54598468542099, "learning_rate": 2e-05, - "loss": 0.9198, - "mean_token_accuracy": 0.73474902296043, + "loss": 1.0256, + "mean_token_accuracy": 0.7139883726835251, "step": 810 }, { - "epoch": 1.1865209471766849, - "grad_norm": 0.2236970216035843, + "epoch": 0.7849747170719962, + "grad_norm": 0.47515490651130676, "learning_rate": 2e-05, - "loss": 0.8978, - "mean_token_accuracy": 0.7420615534929167, + "loss": 1.0059, + "mean_token_accuracy": 0.7149439841508866, "step": 815 }, { - "epoch": 1.193806921675774, - "grad_norm": 0.22984063625335693, + "epoch": 0.7897905128822538, + "grad_norm": 0.4998687505722046, "learning_rate": 2e-05, - "loss": 0.9075, - "mean_token_accuracy": 0.7390708964338057, + "loss": 0.9977, + "mean_token_accuracy": 0.7198412358760834, "step": 820 }, { - "epoch": 1.2010928961748635, - "grad_norm": 0.24438656866550446, + "epoch": 0.7946063086925115, + "grad_norm": 0.44265708327293396, "learning_rate": 2e-05, - "loss": 0.9321, - "mean_token_accuracy": 0.7320885006237531, + "loss": 0.987, + "mean_token_accuracy": 0.7209729939699173, "step": 825 }, { - "epoch": 1.2083788706739527, - "grad_norm": 0.23722946643829346, + "epoch": 0.7994221045027691, + "grad_norm": 0.4938831031322479, "learning_rate": 2e-05, - "loss": 0.912, - "mean_token_accuracy": 0.7364176233512458, + "loss": 0.9836, + "mean_token_accuracy": 0.7213312387466431, "step": 830 }, { - "epoch": 1.2156648451730419, - "grad_norm": 0.24531032145023346, + "epoch": 0.8042379003130268, + "grad_norm": 0.4545023739337921, "learning_rate": 2e-05, - "loss": 0.9119, - "mean_token_accuracy": 0.7369840009770396, + "loss": 0.9721, + "mean_token_accuracy": 0.72330262362957, "step": 835 }, { - "epoch": 1.222950819672131, - "grad_norm": 0.2706034481525421, + "epoch": 0.8090536961232844, + "grad_norm": 0.44935089349746704, "learning_rate": 2e-05, - "loss": 0.9024, - "mean_token_accuracy": 0.7401547997068881, + "loss": 0.9834, + "mean_token_accuracy": 0.7243214756250381, "step": 840 }, { - "epoch": 1.2302367941712204, - "grad_norm": 0.23218482732772827, + "epoch": 0.813869491933542, + "grad_norm": 0.49139270186424255, "learning_rate": 2e-05, - "loss": 0.9054, - "mean_token_accuracy": 0.7385793234000977, + "loss": 1.0201, + "mean_token_accuracy": 0.7126396864652633, "step": 845 }, { - "epoch": 1.2375227686703096, - "grad_norm": 0.23459894955158234, + "epoch": 0.8186852877437997, + "grad_norm": 0.4555511772632599, "learning_rate": 2e-05, - "loss": 0.922, - "mean_token_accuracy": 0.7354253175378604, + "loss": 1.0021, + "mean_token_accuracy": 0.7186207294464111, "step": 850 }, { - "epoch": 1.2448087431693988, - "grad_norm": 0.25812986493110657, + "epoch": 0.8235010835540573, + "grad_norm": 0.4603417217731476, "learning_rate": 2e-05, - "loss": 0.9228, - "mean_token_accuracy": 0.7338277706546423, + "loss": 0.9496, + "mean_token_accuracy": 0.7318733394145965, "step": 855 }, { - "epoch": 1.2520947176684882, - "grad_norm": 0.2745145261287689, + "epoch": 0.828316879364315, + "grad_norm": 0.4485034942626953, "learning_rate": 2e-05, - "loss": 0.9108, - "mean_token_accuracy": 0.7370404860771861, + "loss": 1.0077, + "mean_token_accuracy": 0.716553458571434, "step": 860 }, { - "epoch": 1.2593806921675774, - "grad_norm": 0.2526284158229828, + "epoch": 0.8331326751745726, + "grad_norm": 0.5173277258872986, "learning_rate": 2e-05, - "loss": 0.898, - "mean_token_accuracy": 0.7409898632144605, + "loss": 1.0268, + "mean_token_accuracy": 0.7155414998531342, "step": 865 }, { - "epoch": 1.2666666666666666, - "grad_norm": 0.24674515426158905, + "epoch": 0.8379484709848303, + "grad_norm": 0.4781360924243927, "learning_rate": 2e-05, - "loss": 0.8939, - "mean_token_accuracy": 0.7410646678065462, + "loss": 0.9907, + "mean_token_accuracy": 0.7216429501771927, "step": 870 }, { - "epoch": 1.273952641165756, - "grad_norm": 0.23100045323371887, + "epoch": 0.8427642667950879, + "grad_norm": 0.43977782130241394, "learning_rate": 2e-05, - "loss": 0.9145, - "mean_token_accuracy": 0.7354451636541282, + "loss": 0.974, + "mean_token_accuracy": 0.7252675861120224, "step": 875 }, { - "epoch": 1.2812386156648452, - "grad_norm": 0.23644477128982544, + "epoch": 0.8475800626053456, + "grad_norm": 0.42007747292518616, "learning_rate": 2e-05, - "loss": 0.9088, - "mean_token_accuracy": 0.7372084147532976, + "loss": 0.9493, + "mean_token_accuracy": 0.731750875711441, "step": 880 }, { - "epoch": 1.2885245901639344, - "grad_norm": 0.25217151641845703, + "epoch": 0.8523958584156032, + "grad_norm": 0.4776918590068817, "learning_rate": 2e-05, - "loss": 0.9217, - "mean_token_accuracy": 0.7347871885686371, + "loss": 1.0154, + "mean_token_accuracy": 0.715020352602005, "step": 885 }, { - "epoch": 1.2958105646630238, - "grad_norm": 0.25527337193489075, + "epoch": 0.8572116542258609, + "grad_norm": 0.45001938939094543, "learning_rate": 2e-05, - "loss": 0.9227, - "mean_token_accuracy": 0.7333750610649734, + "loss": 0.9748, + "mean_token_accuracy": 0.72476125061512, "step": 890 }, { - "epoch": 1.303096539162113, - "grad_norm": 0.2655749022960663, + "epoch": 0.8620274500361185, + "grad_norm": 0.461335152387619, "learning_rate": 2e-05, - "loss": 0.8971, - "mean_token_accuracy": 0.7408488269506435, + "loss": 0.9866, + "mean_token_accuracy": 0.7213943302631378, "step": 895 }, { - "epoch": 1.3103825136612022, - "grad_norm": 0.26281771063804626, + "epoch": 0.8668432458463761, + "grad_norm": 0.48666810989379883, "learning_rate": 2e-05, - "loss": 0.9074, - "mean_token_accuracy": 0.7382098803126528, + "loss": 0.9831, + "mean_token_accuracy": 0.7247642397880554, "step": 900 }, { - "epoch": 1.3103825136612022, - "eval_loss": 0.9559279680252075, - "eval_mean_token_accuracy": 0.7254805694923981, - "eval_runtime": 37.1818, - "eval_samples_per_second": 7.046, - "eval_steps_per_second": 0.457, + "epoch": 0.8668432458463761, + "eval_loss": 0.9918749928474426, + "eval_mean_token_accuracy": 0.7235597096956693, + "eval_runtime": 7.4533, + "eval_samples_per_second": 13.417, + "eval_steps_per_second": 1.744, "step": 900 }, { - "epoch": 1.3176684881602914, - "grad_norm": 0.24635250866413116, + "epoch": 0.8716590416566338, + "grad_norm": 0.5488170981407166, "learning_rate": 2e-05, - "loss": 0.9268, - "mean_token_accuracy": 0.7334086468001956, + "loss": 0.9742, + "mean_token_accuracy": 0.7267244845628739, "step": 905 }, { - "epoch": 1.3249544626593808, - "grad_norm": 0.2402939647436142, + "epoch": 0.8764748374668914, + "grad_norm": 0.492119699716568, "learning_rate": 2e-05, - "loss": 0.8963, - "mean_token_accuracy": 0.7412554958475819, + "loss": 0.9847, + "mean_token_accuracy": 0.7216699242591857, "step": 910 }, { - "epoch": 1.33224043715847, - "grad_norm": 0.23926663398742676, + "epoch": 0.8812906332771491, + "grad_norm": 0.44146597385406494, "learning_rate": 2e-05, - "loss": 0.9015, - "mean_token_accuracy": 0.7405654616511969, + "loss": 0.9681, + "mean_token_accuracy": 0.7264615386724472, "step": 915 }, { - "epoch": 1.3395264116575591, - "grad_norm": 0.23617114126682281, + "epoch": 0.8861064290874067, + "grad_norm": 0.470457524061203, "learning_rate": 2e-05, - "loss": 0.9204, - "mean_token_accuracy": 0.734159745969712, + "loss": 0.9703, + "mean_token_accuracy": 0.7229752570390702, "step": 920 }, { - "epoch": 1.3468123861566483, - "grad_norm": 0.24529297649860382, + "epoch": 0.8909222248976644, + "grad_norm": 0.5002897381782532, "learning_rate": 2e-05, - "loss": 0.91, - "mean_token_accuracy": 0.7378984489496825, + "loss": 1.007, + "mean_token_accuracy": 0.7148719429969788, "step": 925 }, { - "epoch": 1.3540983606557377, - "grad_norm": 0.2535146176815033, + "epoch": 0.895738020707922, + "grad_norm": 0.4409243166446686, "learning_rate": 2e-05, - "loss": 0.8946, - "mean_token_accuracy": 0.740582254518808, + "loss": 0.9826, + "mean_token_accuracy": 0.7208293199539184, "step": 930 }, { - "epoch": 1.361384335154827, - "grad_norm": 0.24595101177692413, + "epoch": 0.9005538165181797, + "grad_norm": 0.4750625193119049, "learning_rate": 2e-05, - "loss": 0.9149, - "mean_token_accuracy": 0.7362970200293113, + "loss": 0.9885, + "mean_token_accuracy": 0.7238674312829971, "step": 935 }, { - "epoch": 1.3686703096539161, - "grad_norm": 0.22996020317077637, + "epoch": 0.9053696123284373, + "grad_norm": 0.47139453887939453, "learning_rate": 2e-05, - "loss": 0.8912, - "mean_token_accuracy": 0.7419562164142649, + "loss": 0.9866, + "mean_token_accuracy": 0.7211822777986526, "step": 940 }, { - "epoch": 1.3759562841530055, - "grad_norm": 0.27776476740837097, + "epoch": 0.910185408138695, + "grad_norm": 0.4502057433128357, "learning_rate": 2e-05, - "loss": 0.9057, - "mean_token_accuracy": 0.7376755617977531, + "loss": 0.9875, + "mean_token_accuracy": 0.7202529609203339, "step": 945 }, { - "epoch": 1.3832422586520947, - "grad_norm": 0.2726893126964569, + "epoch": 0.9150012039489526, + "grad_norm": 0.4842672348022461, "learning_rate": 2e-05, - "loss": 0.8975, - "mean_token_accuracy": 0.7407517098192475, + "loss": 0.9412, + "mean_token_accuracy": 0.7316877156496048, "step": 950 }, { - "epoch": 1.390528233151184, - "grad_norm": 0.24319769442081451, + "epoch": 0.9198169997592102, + "grad_norm": 0.4380047023296356, "learning_rate": 2e-05, - "loss": 0.911, - "mean_token_accuracy": 0.7371152906692723, + "loss": 0.9675, + "mean_token_accuracy": 0.7267184883356095, "step": 955 }, { - "epoch": 1.3978142076502733, - "grad_norm": 0.24976621568202972, + "epoch": 0.9246327955694679, + "grad_norm": 0.49916306138038635, "learning_rate": 2e-05, - "loss": 0.9148, - "mean_token_accuracy": 0.7358390327308257, + "loss": 0.9711, + "mean_token_accuracy": 0.7259949505329132, "step": 960 }, { - "epoch": 1.4051001821493625, - "grad_norm": 0.23896424472332, + "epoch": 0.9294485913797255, + "grad_norm": 0.42182397842407227, "learning_rate": 2e-05, - "loss": 0.8832, - "mean_token_accuracy": 0.7439560942843186, + "loss": 1.0279, + "mean_token_accuracy": 0.7109717577695847, "step": 965 }, { - "epoch": 1.4123861566484517, - "grad_norm": 0.23795920610427856, + "epoch": 0.9342643871899832, + "grad_norm": 0.4488319158554077, "learning_rate": 2e-05, - "loss": 0.9021, - "mean_token_accuracy": 0.7406188935026868, + "loss": 0.974, + "mean_token_accuracy": 0.7252845972776413, "step": 970 }, { - "epoch": 1.419672131147541, - "grad_norm": 0.2664782404899597, + "epoch": 0.9390801830002408, + "grad_norm": 0.4408664107322693, "learning_rate": 2e-05, - "loss": 0.9101, - "mean_token_accuracy": 0.7370694919394236, + "loss": 0.9827, + "mean_token_accuracy": 0.7232415169477463, "step": 975 }, { - "epoch": 1.4269581056466303, - "grad_norm": 0.2434052973985672, + "epoch": 0.9438959788104985, + "grad_norm": 0.4980012774467468, "learning_rate": 2e-05, - "loss": 0.8896, - "mean_token_accuracy": 0.7435683317049342, + "loss": 0.966, + "mean_token_accuracy": 0.727419114112854, "step": 980 }, { - "epoch": 1.4342440801457195, - "grad_norm": 0.2583552300930023, + "epoch": 0.9487117746207561, + "grad_norm": 0.44837021827697754, "learning_rate": 2e-05, - "loss": 0.8936, - "mean_token_accuracy": 0.7419669027845627, + "loss": 0.9766, + "mean_token_accuracy": 0.7247399032115937, "step": 985 }, { - "epoch": 1.4415300546448089, - "grad_norm": 0.24104474484920502, + "epoch": 0.9535275704310138, + "grad_norm": 0.4687640368938446, "learning_rate": 2e-05, - "loss": 0.9149, - "mean_token_accuracy": 0.7361550439667807, + "loss": 1.0062, + "mean_token_accuracy": 0.7164921730756759, "step": 990 }, { - "epoch": 1.448816029143898, - "grad_norm": 0.247370645403862, + "epoch": 0.9583433662412714, + "grad_norm": 0.493757963180542, "learning_rate": 2e-05, - "loss": 0.9037, - "mean_token_accuracy": 0.7384266609672692, + "loss": 1.0161, + "mean_token_accuracy": 0.7133044749498367, "step": 995 }, { - "epoch": 1.4561020036429873, - "grad_norm": 0.22899129986763, + "epoch": 0.963159162051529, + "grad_norm": 0.45803388953208923, "learning_rate": 2e-05, - "loss": 0.9055, - "mean_token_accuracy": 0.7383884953590621, + "loss": 0.9919, + "mean_token_accuracy": 0.7180918484926224, "step": 1000 }, { - "epoch": 1.4561020036429873, - "eval_loss": 0.9484136700630188, - "eval_mean_token_accuracy": 0.7270321376011663, - "eval_runtime": 36.2695, - "eval_samples_per_second": 7.224, - "eval_steps_per_second": 0.469, + "epoch": 0.963159162051529, + "eval_loss": 0.983593761920929, + "eval_mean_token_accuracy": 0.7254445598675654, + "eval_runtime": 7.3943, + "eval_samples_per_second": 13.524, + "eval_steps_per_second": 1.758, "step": 1000 }, { - "epoch": 1.4633879781420764, - "grad_norm": 0.2408648431301117, + "epoch": 0.9679749578617867, + "grad_norm": 0.44389694929122925, "learning_rate": 2e-05, - "loss": 0.8974, - "mean_token_accuracy": 0.7416111993160724, + "loss": 0.938, + "mean_token_accuracy": 0.7324963569641113, "step": 1005 }, { - "epoch": 1.4706739526411656, - "grad_norm": 0.22463728487491608, + "epoch": 0.9727907536720443, + "grad_norm": 0.4311533570289612, "learning_rate": 2e-05, - "loss": 0.9098, - "mean_token_accuracy": 0.7365138006839276, + "loss": 0.986, + "mean_token_accuracy": 0.7234836369752884, "step": 1010 }, { - "epoch": 1.477959927140255, - "grad_norm": 0.2450459599494934, + "epoch": 0.977606549482302, + "grad_norm": 0.47805625200271606, "learning_rate": 2e-05, - "loss": 0.9068, - "mean_token_accuracy": 0.7383442232535418, + "loss": 0.9725, + "mean_token_accuracy": 0.7261527508497239, "step": 1015 }, { - "epoch": 1.4852459016393442, - "grad_norm": 0.25232186913490295, + "epoch": 0.9824223452925596, + "grad_norm": 0.4597644507884979, "learning_rate": 2e-05, - "loss": 0.8972, - "mean_token_accuracy": 0.7410585613092329, + "loss": 0.9498, + "mean_token_accuracy": 0.7301326423883439, "step": 1020 }, { - "epoch": 1.4925318761384334, - "grad_norm": 0.23277677595615387, + "epoch": 0.9872381411028173, + "grad_norm": 0.4078470766544342, "learning_rate": 2e-05, - "loss": 0.8895, - "mean_token_accuracy": 0.7428874572545188, + "loss": 0.9723, + "mean_token_accuracy": 0.7240770101547241, "step": 1025 }, { - "epoch": 1.4998178506375228, - "grad_norm": 0.24832560122013092, + "epoch": 0.9920539369130749, + "grad_norm": 0.5153449773788452, "learning_rate": 2e-05, - "loss": 0.8962, - "mean_token_accuracy": 0.7403868466047876, + "loss": 0.9672, + "mean_token_accuracy": 0.7258705139160156, "step": 1030 }, { - "epoch": 1.507103825136612, - "grad_norm": 0.24917817115783691, + "epoch": 0.9968697327233326, + "grad_norm": 0.4625481963157654, "learning_rate": 2e-05, - "loss": 0.899, - "mean_token_accuracy": 0.7395075109916951, + "loss": 0.954, + "mean_token_accuracy": 0.728279384970665, "step": 1035 }, { - "epoch": 1.5143897996357012, - "grad_norm": 0.2454776167869568, + "epoch": 1.0009631591620516, + "grad_norm": 0.4911911189556122, "learning_rate": 2e-05, - "loss": 0.925, - "mean_token_accuracy": 0.7319293695039306, + "loss": 0.8181, + "mean_token_accuracy": 0.72792368776658, "step": 1040 }, { - "epoch": 1.5216757741347906, - "grad_norm": 0.2297164499759674, + "epoch": 1.005778954972309, + "grad_norm": 0.5356068015098572, "learning_rate": 2e-05, - "loss": 0.9006, - "mean_token_accuracy": 0.7397594040058624, + "loss": 0.8554, + "mean_token_accuracy": 0.7529169410467148, "step": 1045 }, { - "epoch": 1.5289617486338798, - "grad_norm": 0.23188287019729614, + "epoch": 1.0105947507825668, + "grad_norm": 0.556844174861908, "learning_rate": 2e-05, - "loss": 0.909, - "mean_token_accuracy": 0.7355245481191989, + "loss": 0.9097, + "mean_token_accuracy": 0.7376857072114944, "step": 1050 }, { - "epoch": 1.536247723132969, - "grad_norm": 0.2465352863073349, + "epoch": 1.0154105465928245, + "grad_norm": 0.4455285668373108, "learning_rate": 2e-05, - "loss": 0.8836, - "mean_token_accuracy": 0.7430340131900344, + "loss": 0.9046, + "mean_token_accuracy": 0.7377484381198883, "step": 1055 }, { - "epoch": 1.5435336976320584, - "grad_norm": 0.24740713834762573, + "epoch": 1.0202263424030822, + "grad_norm": 0.49472665786743164, "learning_rate": 2e-05, - "loss": 0.9019, - "mean_token_accuracy": 0.7388159501709819, + "loss": 0.9295, + "mean_token_accuracy": 0.7324019372463226, "step": 1060 }, { - "epoch": 1.5508196721311476, - "grad_norm": 0.24196745455265045, + "epoch": 1.0250421382133397, + "grad_norm": 0.46856382489204407, "learning_rate": 2e-05, - "loss": 0.9029, - "mean_token_accuracy": 0.7383976551050319, + "loss": 0.94, + "mean_token_accuracy": 0.7307655364274979, "step": 1065 }, { - "epoch": 1.5581056466302368, - "grad_norm": 0.26808851957321167, + "epoch": 1.0298579340235974, + "grad_norm": 0.4596047103404999, "learning_rate": 2e-05, - "loss": 0.8949, - "mean_token_accuracy": 0.7395578895945287, + "loss": 0.9528, + "mean_token_accuracy": 0.7279935926198959, "step": 1070 }, { - "epoch": 1.5653916211293262, - "grad_norm": 0.23099803924560547, + "epoch": 1.034673729833855, + "grad_norm": 0.43763288855552673, "learning_rate": 2e-05, - "loss": 0.8969, - "mean_token_accuracy": 0.740501343429409, + "loss": 0.9156, + "mean_token_accuracy": 0.73514584004879, "step": 1075 }, { - "epoch": 1.5726775956284151, - "grad_norm": 0.23486842215061188, + "epoch": 1.0394895256441128, + "grad_norm": 0.4287591576576233, "learning_rate": 2e-05, - "loss": 0.8934, - "mean_token_accuracy": 0.7417089032730828, + "loss": 0.9454, + "mean_token_accuracy": 0.7282472938299179, "step": 1080 }, { - "epoch": 1.5799635701275045, - "grad_norm": 0.27474597096443176, + "epoch": 1.0443053214543703, + "grad_norm": 0.4557848572731018, "learning_rate": 2e-05, - "loss": 0.8868, - "mean_token_accuracy": 0.7421821568148511, + "loss": 0.9548, + "mean_token_accuracy": 0.7260845839977265, "step": 1085 }, { - "epoch": 1.587249544626594, - "grad_norm": 0.25371241569519043, + "epoch": 1.049121117264628, + "grad_norm": 0.508375883102417, "learning_rate": 2e-05, - "loss": 0.9051, - "mean_token_accuracy": 0.7380175256472887, + "loss": 0.9157, + "mean_token_accuracy": 0.7361098319292069, "step": 1090 }, { - "epoch": 1.594535519125683, - "grad_norm": 0.24320611357688904, + "epoch": 1.0539369130748857, + "grad_norm": 0.46058163046836853, "learning_rate": 2e-05, - "loss": 0.8896, - "mean_token_accuracy": 0.7424699888439653, + "loss": 0.9107, + "mean_token_accuracy": 0.7396865218877793, "step": 1095 }, { - "epoch": 1.6018214936247723, - "grad_norm": 0.24586845934391022, + "epoch": 1.0587527088851432, + "grad_norm": 0.4415583610534668, "learning_rate": 2e-05, - "loss": 0.9074, - "mean_token_accuracy": 0.7369397288715193, + "loss": 0.9443, + "mean_token_accuracy": 0.7304907143115997, "step": 1100 }, { - "epoch": 1.6018214936247723, - "eval_loss": 0.9426586627960205, - "eval_mean_token_accuracy": 0.7278746708682018, - "eval_runtime": 36.7765, - "eval_samples_per_second": 7.124, - "eval_steps_per_second": 0.462, + "epoch": 1.0587527088851432, + "eval_loss": 0.9793750047683716, + "eval_mean_token_accuracy": 0.7268870839705834, + "eval_runtime": 7.4707, + "eval_samples_per_second": 13.386, + "eval_steps_per_second": 1.74, "step": 1100 }, { - "epoch": 1.6091074681238615, - "grad_norm": 0.2685466408729553, + "epoch": 1.0635685046954009, + "grad_norm": 0.46880385279655457, "learning_rate": 2e-05, - "loss": 0.8912, - "mean_token_accuracy": 0.7420249145090378, + "loss": 0.9159, + "mean_token_accuracy": 0.7369724005460739, "step": 1105 }, { - "epoch": 1.6163934426229507, - "grad_norm": 0.24883808195590973, + "epoch": 1.0683843005056586, + "grad_norm": 0.46472156047821045, "learning_rate": 2e-05, - "loss": 0.8973, - "mean_token_accuracy": 0.7395487298485591, + "loss": 0.9196, + "mean_token_accuracy": 0.7389460921287536, "step": 1110 }, { - "epoch": 1.6236794171220401, - "grad_norm": 0.24616660177707672, + "epoch": 1.0732000963159163, + "grad_norm": 0.4124833643436432, "learning_rate": 2e-05, - "loss": 0.9001, - "mean_token_accuracy": 0.7390174645823157, + "loss": 0.9313, + "mean_token_accuracy": 0.7337959378957748, "step": 1115 }, { - "epoch": 1.6309653916211293, - "grad_norm": 0.24163974821567535, + "epoch": 1.0780158921261738, + "grad_norm": 0.4799756705760956, "learning_rate": 2e-05, - "loss": 0.8976, - "mean_token_accuracy": 0.7398876404494382, + "loss": 0.9289, + "mean_token_accuracy": 0.7316039860248565, "step": 1120 }, { - "epoch": 1.6382513661202185, - "grad_norm": 0.3287493586540222, + "epoch": 1.0828316879364315, + "grad_norm": 0.4218555986881256, "learning_rate": 2e-05, - "loss": 0.9024, - "mean_token_accuracy": 0.7385472780685156, + "loss": 0.9281, + "mean_token_accuracy": 0.7350360959768295, "step": 1125 }, { - "epoch": 1.645537340619308, - "grad_norm": 0.27125054597854614, + "epoch": 1.0876474837466892, + "grad_norm": 0.4582618176937103, "learning_rate": 2e-05, - "loss": 0.8934, - "mean_token_accuracy": 0.7404921836834394, + "loss": 0.9423, + "mean_token_accuracy": 0.729295802116394, "step": 1130 }, { - "epoch": 1.652823315118397, - "grad_norm": 0.2530953884124756, + "epoch": 1.0924632795569469, + "grad_norm": 0.42727431654930115, "learning_rate": 2e-05, - "loss": 0.8947, - "mean_token_accuracy": 0.7405977039159446, + "loss": 0.9252, + "mean_token_accuracy": 0.7355052590370178, "step": 1135 }, { - "epoch": 1.6601092896174863, - "grad_norm": 0.24643385410308838, + "epoch": 1.0972790753672044, + "grad_norm": 0.48448023200035095, "learning_rate": 2e-05, - "loss": 0.9148, - "mean_token_accuracy": 0.734823827552516, + "loss": 0.912, + "mean_token_accuracy": 0.7374375581741333, "step": 1140 }, { - "epoch": 1.6673952641165757, - "grad_norm": 0.2359907627105713, + "epoch": 1.102094871177462, + "grad_norm": 0.49189600348472595, "learning_rate": 2e-05, - "loss": 0.8908, - "mean_token_accuracy": 0.7428370786516856, + "loss": 0.904, + "mean_token_accuracy": 0.7397159487009048, "step": 1145 }, { - "epoch": 1.6746812386156649, - "grad_norm": 0.2252684086561203, + "epoch": 1.1069106669877198, + "grad_norm": 0.4637366235256195, "learning_rate": 2e-05, - "loss": 0.9013, - "mean_token_accuracy": 0.7393166829506594, + "loss": 0.8921, + "mean_token_accuracy": 0.741287299990654, "step": 1150 }, { - "epoch": 1.681967213114754, - "grad_norm": 0.24328413605690002, + "epoch": 1.1117264627979773, + "grad_norm": 0.4737195074558258, "learning_rate": 2e-05, - "loss": 0.9094, - "mean_token_accuracy": 0.7362924401563264, + "loss": 0.9369, + "mean_token_accuracy": 0.7317490667104721, "step": 1155 }, { - "epoch": 1.6892531876138435, - "grad_norm": 0.25157198309898376, + "epoch": 1.116542258608235, + "grad_norm": 0.42976248264312744, "learning_rate": 2e-05, - "loss": 0.8964, - "mean_token_accuracy": 0.740287616023449, + "loss": 0.9407, + "mean_token_accuracy": 0.731182512640953, "step": 1160 }, { - "epoch": 1.6965391621129327, - "grad_norm": 0.2484838217496872, + "epoch": 1.1213580544184927, + "grad_norm": 0.4201087951660156, "learning_rate": 2e-05, - "loss": 0.8947, - "mean_token_accuracy": 0.7410952002931117, + "loss": 0.8963, + "mean_token_accuracy": 0.7426417738199234, "step": 1165 }, { - "epoch": 1.7038251366120218, - "grad_norm": 0.23148389160633087, + "epoch": 1.1261738502287504, + "grad_norm": 0.4694991409778595, "learning_rate": 2e-05, - "loss": 0.8939, - "mean_token_accuracy": 0.7413776257938448, + "loss": 0.9174, + "mean_token_accuracy": 0.7342051297426224, "step": 1170 }, { - "epoch": 1.7111111111111112, - "grad_norm": 0.2488527148962021, + "epoch": 1.1309896460390079, + "grad_norm": 0.439586877822876, "learning_rate": 2e-05, - "loss": 0.8938, - "mean_token_accuracy": 0.7411043600390815, + "loss": 0.9323, + "mean_token_accuracy": 0.7301142245531083, "step": 1175 }, { - "epoch": 1.7183970856102002, - "grad_norm": 0.24698656797409058, + "epoch": 1.1358054418492656, + "grad_norm": 0.4932158887386322, "learning_rate": 2e-05, - "loss": 0.8933, - "mean_token_accuracy": 0.7404463849535907, + "loss": 0.917, + "mean_token_accuracy": 0.7341791063547134, "step": 1180 }, { - "epoch": 1.7256830601092896, - "grad_norm": 0.2506196200847626, + "epoch": 1.1406212376595233, + "grad_norm": 0.4529474079608917, "learning_rate": 2e-05, - "loss": 0.8828, - "mean_token_accuracy": 0.7443102711284807, + "loss": 0.9168, + "mean_token_accuracy": 0.7385017931461334, "step": 1185 }, { - "epoch": 1.732969034608379, - "grad_norm": 0.22465619444847107, + "epoch": 1.145437033469781, + "grad_norm": 0.5287330150604248, "learning_rate": 2e-05, - "loss": 0.8947, - "mean_token_accuracy": 0.740437225207621, + "loss": 0.9125, + "mean_token_accuracy": 0.7389725655317306, "step": 1190 }, { - "epoch": 1.740255009107468, - "grad_norm": 0.2372632622718811, + "epoch": 1.1502528292800385, + "grad_norm": 0.45688769221305847, "learning_rate": 2e-05, - "loss": 0.9029, - "mean_token_accuracy": 0.7392021861260379, + "loss": 0.9111, + "mean_token_accuracy": 0.7377701252698898, "step": 1195 }, { - "epoch": 1.7475409836065574, - "grad_norm": 0.259969025850296, + "epoch": 1.1550686250902962, + "grad_norm": 0.49863970279693604, "learning_rate": 2e-05, - "loss": 0.878, - "mean_token_accuracy": 0.7458796409379582, + "loss": 0.9408, + "mean_token_accuracy": 0.7300858527421952, "step": 1200 }, { - "epoch": 1.7475409836065574, - "eval_loss": 0.9372912645339966, - "eval_mean_token_accuracy": 0.7289227538789632, - "eval_runtime": 36.6965, - "eval_samples_per_second": 7.14, - "eval_steps_per_second": 0.463, + "epoch": 1.1550686250902962, + "eval_loss": 0.9731249809265137, + "eval_mean_token_accuracy": 0.7276574327395513, + "eval_runtime": 7.5058, + "eval_samples_per_second": 13.323, + "eval_steps_per_second": 1.732, "step": 1200 }, { - "epoch": 1.7548269581056466, - "grad_norm": 0.23141299188137054, + "epoch": 1.1598844209005539, + "grad_norm": 0.4945198595523834, "learning_rate": 2e-05, - "loss": 0.8977, - "mean_token_accuracy": 0.7405669882755255, + "loss": 0.8994, + "mean_token_accuracy": 0.7420838236808777, "step": 1205 }, { - "epoch": 1.7621129326047358, - "grad_norm": 0.241252601146698, + "epoch": 1.1647002167108114, + "grad_norm": 0.4849279820919037, "learning_rate": 2e-05, - "loss": 0.8983, - "mean_token_accuracy": 0.7395714566117425, + "loss": 0.8947, + "mean_token_accuracy": 0.7422223538160324, "step": 1210 }, { - "epoch": 1.7693989071038252, - "grad_norm": 0.2374604344367981, + "epoch": 1.169516012521069, + "grad_norm": 0.4697224795818329, "learning_rate": 2e-05, - "loss": 0.8941, - "mean_token_accuracy": 0.7401441133365902, + "loss": 0.8753, + "mean_token_accuracy": 0.7431703954935074, "step": 1215 }, { - "epoch": 1.7766848816029144, - "grad_norm": 0.2405879646539688, + "epoch": 1.1743318083313268, + "grad_norm": 0.4229016602039337, "learning_rate": 2e-05, - "loss": 0.8926, - "mean_token_accuracy": 0.7409089521250611, + "loss": 0.8972, + "mean_token_accuracy": 0.7435213953256607, "step": 1220 }, { - "epoch": 1.7839708561020036, - "grad_norm": 0.2612534761428833, + "epoch": 1.1791476041415845, + "grad_norm": 0.460322767496109, "learning_rate": 2e-05, - "loss": 0.8894, - "mean_token_accuracy": 0.741570077275746, + "loss": 0.8959, + "mean_token_accuracy": 0.7428202599287033, "step": 1225 }, { - "epoch": 1.791256830601093, - "grad_norm": 0.22933730483055115, + "epoch": 1.183963399951842, + "grad_norm": 0.4953286051750183, "learning_rate": 2e-05, - "loss": 0.9101, - "mean_token_accuracy": 0.736487848070347, + "loss": 0.9159, + "mean_token_accuracy": 0.7356004238128662, "step": 1230 }, { - "epoch": 1.7985428051001822, - "grad_norm": 0.22422315180301666, + "epoch": 1.1887791957620997, + "grad_norm": 0.4624522626399994, "learning_rate": 2e-05, - "loss": 0.8908, - "mean_token_accuracy": 0.7419745359062043, + "loss": 0.8787, + "mean_token_accuracy": 0.7456301301717758, "step": 1235 }, { - "epoch": 1.8058287795992713, - "grad_norm": 0.22796830534934998, + "epoch": 1.1935949915723574, + "grad_norm": 0.4406830370426178, "learning_rate": 2e-05, - "loss": 0.8922, - "mean_token_accuracy": 0.7410585613092332, + "loss": 0.8834, + "mean_token_accuracy": 0.7440876245498658, "step": 1240 }, { - "epoch": 1.8131147540983608, - "grad_norm": 0.244882270693779, + "epoch": 1.198410787382615, + "grad_norm": 0.46658602356910706, "learning_rate": 2e-05, - "loss": 0.8833, - "mean_token_accuracy": 0.7435210063507574, + "loss": 0.9317, + "mean_token_accuracy": 0.7305654168128968, "step": 1245 }, { - "epoch": 1.82040072859745, - "grad_norm": 0.24056288599967957, + "epoch": 1.2032265831928726, + "grad_norm": 0.4948701560497284, "learning_rate": 2e-05, - "loss": 0.8965, - "mean_token_accuracy": 0.7406986070429273, + "loss": 0.9875, + "mean_token_accuracy": 0.7164684683084488, "step": 1250 }, { - "epoch": 1.8276867030965391, - "grad_norm": 0.2528102695941925, + "epoch": 1.2080423790031303, + "grad_norm": 0.4903734028339386, "learning_rate": 2e-05, - "loss": 0.8877, - "mean_token_accuracy": 0.7422752808988765, + "loss": 0.9161, + "mean_token_accuracy": 0.7358790308237075, "step": 1255 }, { - "epoch": 1.8349726775956285, - "grad_norm": 0.2547694444656372, + "epoch": 1.212858174813388, + "grad_norm": 0.47133368253707886, "learning_rate": 2e-05, - "loss": 0.8942, - "mean_token_accuracy": 0.7405499980795623, + "loss": 0.8859, + "mean_token_accuracy": 0.7439457237720489, "step": 1260 }, { - "epoch": 1.8422586520947175, - "grad_norm": 0.24326466023921967, + "epoch": 1.2176739706236455, + "grad_norm": 0.4432521164417267, "learning_rate": 2e-05, - "loss": 0.8889, - "mean_token_accuracy": 0.7411135197850514, + "loss": 0.897, + "mean_token_accuracy": 0.7405682742595673, "step": 1265 }, { - "epoch": 1.849544626593807, - "grad_norm": 0.2283967137336731, + "epoch": 1.2224897664339032, + "grad_norm": 0.4732060134410858, "learning_rate": 2e-05, - "loss": 0.8918, - "mean_token_accuracy": 0.7406066805080608, + "loss": 0.8965, + "mean_token_accuracy": 0.7405115336179733, "step": 1270 }, { - "epoch": 1.8568306010928963, - "grad_norm": 0.2283177226781845, + "epoch": 1.2273055622441609, + "grad_norm": 0.4536789655685425, "learning_rate": 2e-05, - "loss": 0.9075, - "mean_token_accuracy": 0.7364283097215438, + "loss": 0.9809, + "mean_token_accuracy": 0.7194355845451355, "step": 1275 }, { - "epoch": 1.8641165755919853, - "grad_norm": 0.2293919175863266, + "epoch": 1.2321213580544186, + "grad_norm": 0.45061028003692627, "learning_rate": 2e-05, - "loss": 0.883, - "mean_token_accuracy": 0.7429531021006351, + "loss": 0.9005, + "mean_token_accuracy": 0.7391522288322449, "step": 1280 }, { - "epoch": 1.8714025500910747, - "grad_norm": 0.2358449548482895, + "epoch": 1.236937153864676, + "grad_norm": 0.43687281012535095, "learning_rate": 2e-05, - "loss": 0.873, - "mean_token_accuracy": 0.7455468368343917, + "loss": 0.9141, + "mean_token_accuracy": 0.7360376954078675, "step": 1285 }, { - "epoch": 1.8786885245901639, - "grad_norm": 0.24609410762786865, + "epoch": 1.2417529496749338, + "grad_norm": 0.4651852250099182, "learning_rate": 2e-05, - "loss": 0.8909, - "mean_token_accuracy": 0.7408952125061068, + "loss": 0.8546, + "mean_token_accuracy": 0.7502513557672501, "step": 1290 }, { - "epoch": 1.885974499089253, - "grad_norm": 0.24984125792980194, + "epoch": 1.2465687454851915, + "grad_norm": 0.47418808937072754, "learning_rate": 2e-05, - "loss": 0.8979, - "mean_token_accuracy": 0.7388785417684416, + "loss": 0.9286, + "mean_token_accuracy": 0.732248243689537, "step": 1295 }, { - "epoch": 1.8932604735883425, - "grad_norm": 0.2732260227203369, + "epoch": 1.2513845412954492, + "grad_norm": 0.47841620445251465, "learning_rate": 2e-05, - "loss": 0.8971, - "mean_token_accuracy": 0.7392968368343918, + "loss": 0.8714, + "mean_token_accuracy": 0.7448538273572922, "step": 1300 }, { - "epoch": 1.8932604735883425, - "eval_loss": 0.931566059589386, - "eval_mean_token_accuracy": 0.7301682973396459, - "eval_runtime": 39.2113, - "eval_samples_per_second": 6.682, - "eval_steps_per_second": 0.434, + "epoch": 1.2513845412954492, + "eval_loss": 0.96875, + "eval_mean_token_accuracy": 0.7282290825477014, + "eval_runtime": 7.4645, + "eval_samples_per_second": 13.397, + "eval_steps_per_second": 1.742, "step": 1300 }, { - "epoch": 1.9005464480874317, - "grad_norm": 0.2397509217262268, + "epoch": 1.2562003371057067, + "grad_norm": 0.4392741322517395, "learning_rate": 2e-05, - "loss": 0.8939, - "mean_token_accuracy": 0.7408295676599905, + "loss": 0.8927, + "mean_token_accuracy": 0.740854287147522, "step": 1305 }, { - "epoch": 1.9078324225865209, - "grad_norm": 0.24797864258289337, + "epoch": 1.2610161329159644, + "grad_norm": 0.45625588297843933, "learning_rate": 2e-05, - "loss": 0.9037, - "mean_token_accuracy": 0.7368725574010748, + "loss": 0.9038, + "mean_token_accuracy": 0.7398477971553803, "step": 1310 }, { - "epoch": 1.9151183970856103, - "grad_norm": 0.2255106419324875, + "epoch": 1.265831928726222, + "grad_norm": 0.4694783389568329, "learning_rate": 2e-05, - "loss": 0.8858, - "mean_token_accuracy": 0.7422493282852957, + "loss": 0.9006, + "mean_token_accuracy": 0.7389223098754882, "step": 1315 }, { - "epoch": 1.9224043715846995, - "grad_norm": 0.22497308254241943, + "epoch": 1.2706477245364796, + "grad_norm": 0.45548853278160095, "learning_rate": 2e-05, - "loss": 0.8869, - "mean_token_accuracy": 0.7429964063296582, + "loss": 0.8969, + "mean_token_accuracy": 0.7393833607435226, "step": 1320 }, { - "epoch": 1.9296903460837886, - "grad_norm": 0.23234759271144867, + "epoch": 1.2754635203467373, + "grad_norm": 0.4488835036754608, "learning_rate": 2e-05, - "loss": 0.8769, - "mean_token_accuracy": 0.7449453468490475, + "loss": 0.9385, + "mean_token_accuracy": 0.7314446687698364, "step": 1325 }, { - "epoch": 1.936976320582878, - "grad_norm": 0.23880702257156372, + "epoch": 1.280279316156995, + "grad_norm": 0.4357261061668396, "learning_rate": 2e-05, - "loss": 0.8829, - "mean_token_accuracy": 0.7424783219345384, + "loss": 0.9103, + "mean_token_accuracy": 0.7364071696996689, "step": 1330 }, { - "epoch": 1.9442622950819672, - "grad_norm": 0.24366316199302673, + "epoch": 1.2850951119672525, + "grad_norm": 0.44631990790367126, "learning_rate": 2e-05, - "loss": 0.8627, - "mean_token_accuracy": 0.7484535295554471, + "loss": 0.9388, + "mean_token_accuracy": 0.7308136492967605, "step": 1335 }, { - "epoch": 1.9515482695810564, - "grad_norm": 0.23241998255252838, + "epoch": 1.2899109077775102, + "grad_norm": 0.4615370035171509, "learning_rate": 2e-05, - "loss": 0.8974, - "mean_token_accuracy": 0.7385976428920371, + "loss": 0.9324, + "mean_token_accuracy": 0.7321425348520278, "step": 1340 }, { - "epoch": 1.9588342440801458, - "grad_norm": 0.2727753818035126, + "epoch": 1.2947267035877679, + "grad_norm": 0.5415944457054138, "learning_rate": 2e-05, - "loss": 0.885, - "mean_token_accuracy": 0.7431951350665288, + "loss": 0.9471, + "mean_token_accuracy": 0.7317284226417542, "step": 1345 }, { - "epoch": 1.966120218579235, - "grad_norm": 0.24258148670196533, + "epoch": 1.2995424993980256, + "grad_norm": 0.48141545057296753, "learning_rate": 2e-05, - "loss": 0.8952, - "mean_token_accuracy": 0.7398082559843674, + "loss": 0.8986, + "mean_token_accuracy": 0.7391999721527099, "step": 1350 }, { - "epoch": 1.9734061930783242, - "grad_norm": 0.2493925392627716, + "epoch": 1.3043582952082833, + "grad_norm": 0.4513304829597473, "learning_rate": 2e-05, - "loss": 0.8824, - "mean_token_accuracy": 0.7440339521250612, + "loss": 0.9184, + "mean_token_accuracy": 0.7361092567443848, "step": 1355 }, { - "epoch": 1.9806921675774136, - "grad_norm": 0.26210054755210876, + "epoch": 1.3091740910185408, + "grad_norm": 0.4669760465621948, "learning_rate": 2e-05, - "loss": 0.8782, - "mean_token_accuracy": 0.7450537371763556, + "loss": 0.9314, + "mean_token_accuracy": 0.7313281089067459, "step": 1360 }, { - "epoch": 1.9879781420765026, - "grad_norm": 0.28852924704551697, + "epoch": 1.3139898868287985, + "grad_norm": 0.4345858097076416, "learning_rate": 2e-05, - "loss": 0.876, - "mean_token_accuracy": 0.7450354176844163, + "loss": 0.8675, + "mean_token_accuracy": 0.7467694342136383, "step": 1365 }, { - "epoch": 1.995264116575592, - "grad_norm": 0.23531539738178253, + "epoch": 1.3188056826390562, + "grad_norm": 0.47558847069740295, "learning_rate": 2e-05, - "loss": 0.882, - "mean_token_accuracy": 0.7429299400447796, + "loss": 0.8863, + "mean_token_accuracy": 0.7418897211551666, "step": 1370 }, { - "epoch": 1.9981785063752278, - "mean_token_accuracy": 0.7337910661944309, - "step": 1372, + "epoch": 1.3236214784493137, + "grad_norm": 0.43032994866371155, + "learning_rate": 2e-05, + "loss": 0.9222, + "mean_token_accuracy": 0.7338301599025726, + "step": 1375 + }, + { + "epoch": 1.3284372742595714, + "grad_norm": 0.4723619520664215, + "learning_rate": 2e-05, + "loss": 0.9087, + "mean_token_accuracy": 0.7396367639303207, + "step": 1380 + }, + { + "epoch": 1.333253070069829, + "grad_norm": 0.4503762722015381, + "learning_rate": 2e-05, + "loss": 0.9168, + "mean_token_accuracy": 0.7358366668224334, + "step": 1385 + }, + { + "epoch": 1.3380688658800866, + "grad_norm": 0.48158782720565796, + "learning_rate": 2e-05, + "loss": 0.9284, + "mean_token_accuracy": 0.7334781587123871, + "step": 1390 + }, + { + "epoch": 1.3428846616903443, + "grad_norm": 0.4519921839237213, + "learning_rate": 2e-05, + "loss": 0.8668, + "mean_token_accuracy": 0.7482002139091491, + "step": 1395 + }, + { + "epoch": 1.347700457500602, + "grad_norm": 0.4667375981807709, + "learning_rate": 2e-05, + "loss": 0.9062, + "mean_token_accuracy": 0.7376438468694687, + "step": 1400 + }, + { + "epoch": 1.347700457500602, + "eval_loss": 0.9634374976158142, + "eval_mean_token_accuracy": 0.7297810866282537, + "eval_runtime": 7.4478, + "eval_samples_per_second": 13.427, + "eval_steps_per_second": 1.745, + "step": 1400 + }, + { + "epoch": 1.3525162533108597, + "grad_norm": 0.44705450534820557, + "learning_rate": 2e-05, + "loss": 0.8938, + "mean_token_accuracy": 0.7402571320533753, + "step": 1405 + }, + { + "epoch": 1.3573320491211174, + "grad_norm": 0.4210263788700104, + "learning_rate": 2e-05, + "loss": 0.8981, + "mean_token_accuracy": 0.7392288744449615, + "step": 1410 + }, + { + "epoch": 1.3621478449313749, + "grad_norm": 0.460750550031662, + "learning_rate": 2e-05, + "loss": 0.9077, + "mean_token_accuracy": 0.7354150235652923, + "step": 1415 + }, + { + "epoch": 1.3669636407416326, + "grad_norm": 0.457933634519577, + "learning_rate": 2e-05, + "loss": 0.9379, + "mean_token_accuracy": 0.7317300707101821, + "step": 1420 + }, + { + "epoch": 1.3717794365518903, + "grad_norm": 0.5365740656852722, + "learning_rate": 2e-05, + "loss": 0.9169, + "mean_token_accuracy": 0.7354377210140228, + "step": 1425 + }, + { + "epoch": 1.3765952323621478, + "grad_norm": 0.44313621520996094, + "learning_rate": 2e-05, + "loss": 0.9334, + "mean_token_accuracy": 0.7325593650341033, + "step": 1430 + }, + { + "epoch": 1.3814110281724055, + "grad_norm": 0.42803245782852173, + "learning_rate": 2e-05, + "loss": 0.8757, + "mean_token_accuracy": 0.7443858623504639, + "step": 1435 + }, + { + "epoch": 1.3862268239826632, + "grad_norm": 0.4595027565956116, + "learning_rate": 2e-05, + "loss": 0.9343, + "mean_token_accuracy": 0.7328775763511658, + "step": 1440 + }, + { + "epoch": 1.3910426197929207, + "grad_norm": 0.44830256700515747, + "learning_rate": 2e-05, + "loss": 0.9239, + "mean_token_accuracy": 0.7343266099691391, + "step": 1445 + }, + { + "epoch": 1.3958584156031784, + "grad_norm": 0.4366381764411926, + "learning_rate": 2e-05, + "loss": 0.8512, + "mean_token_accuracy": 0.7534618347883224, + "step": 1450 + }, + { + "epoch": 1.400674211413436, + "grad_norm": 0.4866580069065094, + "learning_rate": 2e-05, + "loss": 0.9385, + "mean_token_accuracy": 0.7313212603330612, + "step": 1455 + }, + { + "epoch": 1.4054900072236938, + "grad_norm": 0.47201380133628845, + "learning_rate": 2e-05, + "loss": 0.9216, + "mean_token_accuracy": 0.7338675051927567, + "step": 1460 + }, + { + "epoch": 1.4103058030339515, + "grad_norm": 0.46657127141952515, + "learning_rate": 2e-05, + "loss": 0.9866, + "mean_token_accuracy": 0.7204782664775848, + "step": 1465 + }, + { + "epoch": 1.415121598844209, + "grad_norm": 0.5041907429695129, + "learning_rate": 2e-05, + "loss": 0.8737, + "mean_token_accuracy": 0.7460451602935791, + "step": 1470 + }, + { + "epoch": 1.4199373946544667, + "grad_norm": 0.4409908950328827, + "learning_rate": 2e-05, + "loss": 0.8953, + "mean_token_accuracy": 0.7413113296031952, + "step": 1475 + }, + { + "epoch": 1.4247531904647244, + "grad_norm": 0.45194968581199646, + "learning_rate": 2e-05, + "loss": 0.8855, + "mean_token_accuracy": 0.7436691761016846, + "step": 1480 + }, + { + "epoch": 1.4295689862749819, + "grad_norm": 0.4848441481590271, + "learning_rate": 2e-05, + "loss": 0.8753, + "mean_token_accuracy": 0.7468056827783585, + "step": 1485 + }, + { + "epoch": 1.4343847820852396, + "grad_norm": 0.4938194453716278, + "learning_rate": 2e-05, + "loss": 0.9528, + "mean_token_accuracy": 0.7248939067125321, + "step": 1490 + }, + { + "epoch": 1.4392005778954973, + "grad_norm": 0.4727918803691864, + "learning_rate": 2e-05, + "loss": 0.926, + "mean_token_accuracy": 0.7339328557252884, + "step": 1495 + }, + { + "epoch": 1.4440163737057548, + "grad_norm": 0.42337730526924133, + "learning_rate": 2e-05, + "loss": 0.9011, + "mean_token_accuracy": 0.7394659042358398, + "step": 1500 + }, + { + "epoch": 1.4440163737057548, + "eval_loss": 0.9573437571525574, + "eval_mean_token_accuracy": 0.7310392810748174, + "eval_runtime": 7.4732, + "eval_samples_per_second": 13.381, + "eval_steps_per_second": 1.74, + "step": 1500 + }, + { + "epoch": 1.4488321695160125, + "grad_norm": 0.44844719767570496, + "learning_rate": 2e-05, + "loss": 0.9325, + "mean_token_accuracy": 0.7313905775547027, + "step": 1505 + }, + { + "epoch": 1.4536479653262702, + "grad_norm": 0.49171948432922363, + "learning_rate": 2e-05, + "loss": 0.9387, + "mean_token_accuracy": 0.7284117460250854, + "step": 1510 + }, + { + "epoch": 1.4584637611365279, + "grad_norm": 0.4760735034942627, + "learning_rate": 2e-05, + "loss": 0.9293, + "mean_token_accuracy": 0.7311459213495255, + "step": 1515 + }, + { + "epoch": 1.4632795569467856, + "grad_norm": 0.4475283622741699, + "learning_rate": 2e-05, + "loss": 0.9006, + "mean_token_accuracy": 0.7409524530172348, + "step": 1520 + }, + { + "epoch": 1.468095352757043, + "grad_norm": 0.4623259902000427, + "learning_rate": 2e-05, + "loss": 0.8953, + "mean_token_accuracy": 0.7404493808746337, + "step": 1525 + }, + { + "epoch": 1.4729111485673008, + "grad_norm": 0.4705229103565216, + "learning_rate": 2e-05, + "loss": 0.9136, + "mean_token_accuracy": 0.7354353934526443, + "step": 1530 + }, + { + "epoch": 1.4777269443775585, + "grad_norm": 0.5531764626502991, + "learning_rate": 2e-05, + "loss": 0.9307, + "mean_token_accuracy": 0.7317899644374848, + "step": 1535 + }, + { + "epoch": 1.482542740187816, + "grad_norm": 0.4909057021141052, + "learning_rate": 2e-05, + "loss": 0.8986, + "mean_token_accuracy": 0.7409513860940933, + "step": 1540 + }, + { + "epoch": 1.4873585359980737, + "grad_norm": 0.46586015820503235, + "learning_rate": 2e-05, + "loss": 0.8851, + "mean_token_accuracy": 0.7432068467140198, + "step": 1545 + }, + { + "epoch": 1.4921743318083314, + "grad_norm": 0.45466411113739014, + "learning_rate": 2e-05, + "loss": 0.8811, + "mean_token_accuracy": 0.7456226170063018, + "step": 1550 + }, + { + "epoch": 1.4969901276185889, + "grad_norm": 0.4437198340892792, + "learning_rate": 2e-05, + "loss": 0.919, + "mean_token_accuracy": 0.7341081619262695, + "step": 1555 + }, + { + "epoch": 1.5018059234288468, + "grad_norm": 0.4511047601699829, + "learning_rate": 2e-05, + "loss": 0.9047, + "mean_token_accuracy": 0.7383978575468063, + "step": 1560 + }, + { + "epoch": 1.5066217192391043, + "grad_norm": 0.45091575384140015, + "learning_rate": 2e-05, + "loss": 0.9084, + "mean_token_accuracy": 0.7376039415597916, + "step": 1565 + }, + { + "epoch": 1.5114375150493617, + "grad_norm": 0.48546507954597473, + "learning_rate": 2e-05, + "loss": 0.8924, + "mean_token_accuracy": 0.743433365225792, + "step": 1570 + }, + { + "epoch": 1.5162533108596197, + "grad_norm": 0.4690825343132019, + "learning_rate": 2e-05, + "loss": 0.8873, + "mean_token_accuracy": 0.743861198425293, + "step": 1575 + }, + { + "epoch": 1.5210691066698772, + "grad_norm": 0.49503663182258606, + "learning_rate": 2e-05, + "loss": 0.9284, + "mean_token_accuracy": 0.7325572490692138, + "step": 1580 + }, + { + "epoch": 1.5258849024801349, + "grad_norm": 0.4349866509437561, + "learning_rate": 2e-05, + "loss": 0.8989, + "mean_token_accuracy": 0.7411394506692887, + "step": 1585 + }, + { + "epoch": 1.5307006982903926, + "grad_norm": 0.5051383376121521, + "learning_rate": 2e-05, + "loss": 0.8699, + "mean_token_accuracy": 0.7465825587511062, + "step": 1590 + }, + { + "epoch": 1.53551649410065, + "grad_norm": 0.47513774037361145, + "learning_rate": 2e-05, + "loss": 0.8722, + "mean_token_accuracy": 0.7465938866138458, + "step": 1595 + }, + { + "epoch": 1.5403322899109078, + "grad_norm": 0.4388999938964844, + "learning_rate": 2e-05, + "loss": 0.8892, + "mean_token_accuracy": 0.7405003696680069, + "step": 1600 + }, + { + "epoch": 1.5403322899109078, + "eval_loss": 0.9518749713897705, + "eval_mean_token_accuracy": 0.7328802851530222, + "eval_runtime": 7.4831, + "eval_samples_per_second": 13.363, + "eval_steps_per_second": 1.737, + "step": 1600 + }, + { + "epoch": 1.5451480857211655, + "grad_norm": 0.4367406964302063, + "learning_rate": 2e-05, + "loss": 0.8861, + "mean_token_accuracy": 0.7440774708986282, + "step": 1605 + }, + { + "epoch": 1.549963881531423, + "grad_norm": 0.49486687779426575, + "learning_rate": 2e-05, + "loss": 0.9275, + "mean_token_accuracy": 0.7305904746055603, + "step": 1610 + }, + { + "epoch": 1.5547796773416809, + "grad_norm": 0.4388335943222046, + "learning_rate": 2e-05, + "loss": 0.9326, + "mean_token_accuracy": 0.7333998173475266, + "step": 1615 + }, + { + "epoch": 1.5595954731519384, + "grad_norm": 0.4478837251663208, + "learning_rate": 2e-05, + "loss": 0.9092, + "mean_token_accuracy": 0.7344666838645935, + "step": 1620 + }, + { + "epoch": 1.5644112689621958, + "grad_norm": 0.4690518379211426, + "learning_rate": 2e-05, + "loss": 0.9123, + "mean_token_accuracy": 0.736617586016655, + "step": 1625 + }, + { + "epoch": 1.5692270647724538, + "grad_norm": 0.5184943675994873, + "learning_rate": 2e-05, + "loss": 0.9083, + "mean_token_accuracy": 0.7396018832921982, + "step": 1630 + }, + { + "epoch": 1.5740428605827113, + "grad_norm": 0.4626966714859009, + "learning_rate": 2e-05, + "loss": 0.9016, + "mean_token_accuracy": 0.7409818112850189, + "step": 1635 + }, + { + "epoch": 1.578858656392969, + "grad_norm": 0.444965124130249, + "learning_rate": 2e-05, + "loss": 0.8611, + "mean_token_accuracy": 0.7501071333885193, + "step": 1640 + }, + { + "epoch": 1.5836744522032267, + "grad_norm": 0.4352843463420868, + "learning_rate": 2e-05, + "loss": 0.9074, + "mean_token_accuracy": 0.7390000373125076, + "step": 1645 + }, + { + "epoch": 1.5884902480134842, + "grad_norm": 0.44259950518608093, + "learning_rate": 2e-05, + "loss": 0.8765, + "mean_token_accuracy": 0.744164663553238, + "step": 1650 + }, + { + "epoch": 1.5933060438237419, + "grad_norm": 0.4687725305557251, + "learning_rate": 2e-05, + "loss": 0.8715, + "mean_token_accuracy": 0.7472029328346252, + "step": 1655 + }, + { + "epoch": 1.5981218396339996, + "grad_norm": 0.47756823897361755, + "learning_rate": 2e-05, + "loss": 0.906, + "mean_token_accuracy": 0.7397542387247086, + "step": 1660 + }, + { + "epoch": 1.602937635444257, + "grad_norm": 0.43353796005249023, + "learning_rate": 2e-05, + "loss": 0.8994, + "mean_token_accuracy": 0.7399900108575821, + "step": 1665 + }, + { + "epoch": 1.6077534312545148, + "grad_norm": 0.46565377712249756, + "learning_rate": 2e-05, + "loss": 0.8834, + "mean_token_accuracy": 0.7433773428201675, + "step": 1670 + }, + { + "epoch": 1.6125692270647725, + "grad_norm": 0.45986300706863403, + "learning_rate": 2e-05, + "loss": 0.8539, + "mean_token_accuracy": 0.7526435494422913, + "step": 1675 + }, + { + "epoch": 1.61738502287503, + "grad_norm": 0.43337032198905945, + "learning_rate": 2e-05, + "loss": 0.9317, + "mean_token_accuracy": 0.7318619042634964, + "step": 1680 + }, + { + "epoch": 1.6222008186852879, + "grad_norm": 0.45174193382263184, + "learning_rate": 2e-05, + "loss": 0.9006, + "mean_token_accuracy": 0.7419182240962983, + "step": 1685 + }, + { + "epoch": 1.6270166144955454, + "grad_norm": 0.46104565262794495, + "learning_rate": 2e-05, + "loss": 0.8887, + "mean_token_accuracy": 0.7427491843700409, + "step": 1690 + }, + { + "epoch": 1.631832410305803, + "grad_norm": 0.46335282921791077, + "learning_rate": 2e-05, + "loss": 0.9287, + "mean_token_accuracy": 0.7303118437528611, + "step": 1695 + }, + { + "epoch": 1.6366482061160608, + "grad_norm": 0.47474405169487, + "learning_rate": 2e-05, + "loss": 0.8764, + "mean_token_accuracy": 0.744899383187294, + "step": 1700 + }, + { + "epoch": 1.6366482061160608, + "eval_loss": 0.9485937356948853, + "eval_mean_token_accuracy": 0.733470096037938, + "eval_runtime": 7.4301, + "eval_samples_per_second": 13.459, + "eval_steps_per_second": 1.75, + "step": 1700 + }, + { + "epoch": 1.6414640019263183, + "grad_norm": 0.4701962471008301, + "learning_rate": 2e-05, + "loss": 0.8779, + "mean_token_accuracy": 0.7440233767032624, + "step": 1705 + }, + { + "epoch": 1.646279797736576, + "grad_norm": 0.4552370607852936, + "learning_rate": 2e-05, + "loss": 0.9152, + "mean_token_accuracy": 0.737464314699173, + "step": 1710 + }, + { + "epoch": 1.6510955935468337, + "grad_norm": 0.47667303681373596, + "learning_rate": 2e-05, + "loss": 0.919, + "mean_token_accuracy": 0.7344905406236648, + "step": 1715 + }, + { + "epoch": 1.6559113893570911, + "grad_norm": 0.4600849449634552, + "learning_rate": 2e-05, + "loss": 0.911, + "mean_token_accuracy": 0.7379329711198807, + "step": 1720 + }, + { + "epoch": 1.6607271851673489, + "grad_norm": 0.48880884051322937, + "learning_rate": 2e-05, + "loss": 0.898, + "mean_token_accuracy": 0.7387097924947739, + "step": 1725 + }, + { + "epoch": 1.6655429809776066, + "grad_norm": 0.5022911429405212, + "learning_rate": 2e-05, + "loss": 0.8979, + "mean_token_accuracy": 0.7413761407136917, + "step": 1730 + }, + { + "epoch": 1.670358776787864, + "grad_norm": 0.5053398609161377, + "learning_rate": 2e-05, + "loss": 0.8841, + "mean_token_accuracy": 0.7442550808191299, + "step": 1735 + }, + { + "epoch": 1.675174572598122, + "grad_norm": 0.4696631133556366, + "learning_rate": 2e-05, + "loss": 0.8464, + "mean_token_accuracy": 0.7531871020793914, + "step": 1740 + }, + { + "epoch": 1.6799903684083795, + "grad_norm": 0.46126672625541687, + "learning_rate": 2e-05, + "loss": 0.8808, + "mean_token_accuracy": 0.7435233056545257, + "step": 1745 + }, + { + "epoch": 1.6848061642186372, + "grad_norm": 0.45544305443763733, + "learning_rate": 2e-05, + "loss": 0.8818, + "mean_token_accuracy": 0.7450660765171051, + "step": 1750 + }, + { + "epoch": 1.6896219600288949, + "grad_norm": 0.43367689847946167, + "learning_rate": 2e-05, + "loss": 0.9102, + "mean_token_accuracy": 0.7359609037637711, + "step": 1755 + }, + { + "epoch": 1.6944377558391523, + "grad_norm": 0.5101639628410339, + "learning_rate": 2e-05, + "loss": 0.9781, + "mean_token_accuracy": 0.7192240744829178, + "step": 1760 + }, + { + "epoch": 1.69925355164941, + "grad_norm": 0.46075791120529175, + "learning_rate": 2e-05, + "loss": 0.8834, + "mean_token_accuracy": 0.7445993691682815, + "step": 1765 + }, + { + "epoch": 1.7040693474596678, + "grad_norm": 0.4832165539264679, + "learning_rate": 2e-05, + "loss": 0.911, + "mean_token_accuracy": 0.7357688426971436, + "step": 1770 + }, + { + "epoch": 1.7088851432699252, + "grad_norm": 0.5077883005142212, + "learning_rate": 2e-05, + "loss": 0.9091, + "mean_token_accuracy": 0.7364090234041214, + "step": 1775 + }, + { + "epoch": 1.713700939080183, + "grad_norm": 0.4622097313404083, + "learning_rate": 2e-05, + "loss": 0.9122, + "mean_token_accuracy": 0.7369868725538253, + "step": 1780 + }, + { + "epoch": 1.7185167348904407, + "grad_norm": 0.48189571499824524, + "learning_rate": 2e-05, + "loss": 0.8921, + "mean_token_accuracy": 0.7430964499711991, + "step": 1785 + }, + { + "epoch": 1.7233325307006981, + "grad_norm": 0.45552459359169006, + "learning_rate": 2e-05, + "loss": 0.8843, + "mean_token_accuracy": 0.7434415936470031, + "step": 1790 + }, + { + "epoch": 1.728148326510956, + "grad_norm": 0.4220450520515442, + "learning_rate": 2e-05, + "loss": 0.8936, + "mean_token_accuracy": 0.7402406454086303, + "step": 1795 + }, + { + "epoch": 1.7329641223212136, + "grad_norm": 0.49192172288894653, + "learning_rate": 2e-05, + "loss": 0.8959, + "mean_token_accuracy": 0.7403791964054107, + "step": 1800 + }, + { + "epoch": 1.7329641223212136, + "eval_loss": 0.9439062476158142, + "eval_mean_token_accuracy": 0.7341789053036616, + "eval_runtime": 7.4397, + "eval_samples_per_second": 13.441, + "eval_steps_per_second": 1.747, + "step": 1800 + }, + { + "epoch": 1.7377799181314713, + "grad_norm": 0.4940565526485443, + "learning_rate": 2e-05, + "loss": 0.938, + "mean_token_accuracy": 0.7320731997489929, + "step": 1805 + }, + { + "epoch": 1.742595713941729, + "grad_norm": 0.4866228699684143, + "learning_rate": 2e-05, + "loss": 0.8935, + "mean_token_accuracy": 0.7401677757501602, + "step": 1810 + }, + { + "epoch": 1.7474115097519864, + "grad_norm": 0.48419567942619324, + "learning_rate": 2e-05, + "loss": 0.9205, + "mean_token_accuracy": 0.7368877470493317, + "step": 1815 + }, + { + "epoch": 1.7522273055622442, + "grad_norm": 0.4500778317451477, + "learning_rate": 2e-05, + "loss": 0.9154, + "mean_token_accuracy": 0.735489484667778, + "step": 1820 + }, + { + "epoch": 1.7570431013725019, + "grad_norm": 0.4256117045879364, + "learning_rate": 2e-05, + "loss": 0.9003, + "mean_token_accuracy": 0.7396515876054763, + "step": 1825 + }, + { + "epoch": 1.7618588971827593, + "grad_norm": 0.43817195296287537, + "learning_rate": 2e-05, + "loss": 0.8895, + "mean_token_accuracy": 0.7437138319015503, + "step": 1830 + }, + { + "epoch": 1.766674692993017, + "grad_norm": 0.450078547000885, + "learning_rate": 2e-05, + "loss": 0.9105, + "mean_token_accuracy": 0.7374698102474213, + "step": 1835 + }, + { + "epoch": 1.7714904888032748, + "grad_norm": 0.41473084688186646, + "learning_rate": 2e-05, + "loss": 0.9145, + "mean_token_accuracy": 0.73527010679245, + "step": 1840 + }, + { + "epoch": 1.7763062846135322, + "grad_norm": 0.4389092028141022, + "learning_rate": 2e-05, + "loss": 0.8979, + "mean_token_accuracy": 0.7409280002117157, + "step": 1845 + }, + { + "epoch": 1.7811220804237902, + "grad_norm": 0.47870081663131714, + "learning_rate": 2e-05, + "loss": 0.8843, + "mean_token_accuracy": 0.7440291404724121, + "step": 1850 + }, + { + "epoch": 1.7859378762340476, + "grad_norm": 0.4589843153953552, + "learning_rate": 2e-05, + "loss": 0.9391, + "mean_token_accuracy": 0.7301819682121277, + "step": 1855 + }, + { + "epoch": 1.7907536720443054, + "grad_norm": 0.4766651690006256, + "learning_rate": 2e-05, + "loss": 0.8598, + "mean_token_accuracy": 0.7507995873689651, + "step": 1860 + }, + { + "epoch": 1.795569467854563, + "grad_norm": 0.44371894001960754, + "learning_rate": 2e-05, + "loss": 0.8883, + "mean_token_accuracy": 0.7420922189950943, + "step": 1865 + }, + { + "epoch": 1.8003852636648205, + "grad_norm": 0.4720803499221802, + "learning_rate": 2e-05, + "loss": 0.8807, + "mean_token_accuracy": 0.7464898467063904, + "step": 1870 + }, + { + "epoch": 1.8052010594750783, + "grad_norm": 0.44293534755706787, + "learning_rate": 2e-05, + "loss": 0.8856, + "mean_token_accuracy": 0.743656238913536, + "step": 1875 + }, + { + "epoch": 1.810016855285336, + "grad_norm": 0.44866520166397095, + "learning_rate": 2e-05, + "loss": 0.8883, + "mean_token_accuracy": 0.7446618676185608, + "step": 1880 + }, + { + "epoch": 1.8148326510955934, + "grad_norm": 0.4480263888835907, + "learning_rate": 2e-05, + "loss": 0.9138, + "mean_token_accuracy": 0.7357570141553879, + "step": 1885 + }, + { + "epoch": 1.8196484469058511, + "grad_norm": 0.46717023849487305, + "learning_rate": 2e-05, + "loss": 0.916, + "mean_token_accuracy": 0.7360194474458694, + "step": 1890 + }, + { + "epoch": 1.8244642427161089, + "grad_norm": 0.4781874716281891, + "learning_rate": 2e-05, + "loss": 0.9032, + "mean_token_accuracy": 0.7403118848800659, + "step": 1895 + }, + { + "epoch": 1.8292800385263663, + "grad_norm": 0.4285045266151428, + "learning_rate": 2e-05, + "loss": 0.8532, + "mean_token_accuracy": 0.7512061387300492, + "step": 1900 + }, + { + "epoch": 1.8292800385263663, + "eval_loss": 0.9396874904632568, + "eval_mean_token_accuracy": 0.7350479043447055, + "eval_runtime": 7.4322, + "eval_samples_per_second": 13.455, + "eval_steps_per_second": 1.749, + "step": 1900 + }, + { + "epoch": 1.8340958343366243, + "grad_norm": 0.4367077946662903, + "learning_rate": 2e-05, + "loss": 0.8795, + "mean_token_accuracy": 0.7435707747936249, + "step": 1905 + }, + { + "epoch": 1.8389116301468817, + "grad_norm": 0.4504747688770294, + "learning_rate": 2e-05, + "loss": 0.904, + "mean_token_accuracy": 0.7381240397691726, + "step": 1910 + }, + { + "epoch": 1.8437274259571395, + "grad_norm": 0.43157216906547546, + "learning_rate": 2e-05, + "loss": 0.9132, + "mean_token_accuracy": 0.7355201184749603, + "step": 1915 + }, + { + "epoch": 1.8485432217673972, + "grad_norm": 0.44472646713256836, + "learning_rate": 2e-05, + "loss": 0.8799, + "mean_token_accuracy": 0.7441627860069275, + "step": 1920 + }, + { + "epoch": 1.8533590175776546, + "grad_norm": 0.46139705181121826, + "learning_rate": 2e-05, + "loss": 0.8883, + "mean_token_accuracy": 0.7415470153093338, + "step": 1925 + }, + { + "epoch": 1.8581748133879123, + "grad_norm": 0.4556531310081482, + "learning_rate": 2e-05, + "loss": 0.8971, + "mean_token_accuracy": 0.741265720129013, + "step": 1930 + }, + { + "epoch": 1.86299060919817, + "grad_norm": 0.46431905031204224, + "learning_rate": 2e-05, + "loss": 0.8815, + "mean_token_accuracy": 0.74463951587677, + "step": 1935 + }, + { + "epoch": 1.8678064050084275, + "grad_norm": 0.4754282534122467, + "learning_rate": 2e-05, + "loss": 0.9187, + "mean_token_accuracy": 0.7326898455619812, + "step": 1940 + }, + { + "epoch": 1.8726222008186852, + "grad_norm": 0.46812552213668823, + "learning_rate": 2e-05, + "loss": 0.9112, + "mean_token_accuracy": 0.7361989319324493, + "step": 1945 + }, + { + "epoch": 1.877437996628943, + "grad_norm": 0.44834113121032715, + "learning_rate": 2e-05, + "loss": 0.8194, + "mean_token_accuracy": 0.7607882767915726, + "step": 1950 + }, + { + "epoch": 1.8822537924392004, + "grad_norm": 0.44423848390579224, + "learning_rate": 2e-05, + "loss": 0.9082, + "mean_token_accuracy": 0.7385355263948441, + "step": 1955 + }, + { + "epoch": 1.8870695882494584, + "grad_norm": 0.457197368144989, + "learning_rate": 2e-05, + "loss": 0.8985, + "mean_token_accuracy": 0.7388326019048691, + "step": 1960 + }, + { + "epoch": 1.8918853840597158, + "grad_norm": 0.4411796033382416, + "learning_rate": 2e-05, + "loss": 0.8979, + "mean_token_accuracy": 0.7395110338926315, + "step": 1965 + }, + { + "epoch": 1.8967011798699736, + "grad_norm": 0.4752340018749237, + "learning_rate": 2e-05, + "loss": 0.9014, + "mean_token_accuracy": 0.7413509577512741, + "step": 1970 + }, + { + "epoch": 1.9015169756802313, + "grad_norm": 0.4289093315601349, + "learning_rate": 2e-05, + "loss": 0.8975, + "mean_token_accuracy": 0.7393780171871185, + "step": 1975 + }, + { + "epoch": 1.9063327714904887, + "grad_norm": 0.4665391743183136, + "learning_rate": 2e-05, + "loss": 0.9019, + "mean_token_accuracy": 0.73647480905056, + "step": 1980 + }, + { + "epoch": 1.9111485673007464, + "grad_norm": 0.47752028703689575, + "learning_rate": 2e-05, + "loss": 0.8965, + "mean_token_accuracy": 0.7401424318552017, + "step": 1985 + }, + { + "epoch": 1.9159643631110042, + "grad_norm": 0.4596128761768341, + "learning_rate": 2e-05, + "loss": 0.902, + "mean_token_accuracy": 0.7377873957157135, + "step": 1990 + }, + { + "epoch": 1.9207801589212616, + "grad_norm": 0.46942925453186035, + "learning_rate": 2e-05, + "loss": 0.8943, + "mean_token_accuracy": 0.7391342610120774, + "step": 1995 + }, + { + "epoch": 1.9255959547315193, + "grad_norm": 0.4725325107574463, + "learning_rate": 2e-05, + "loss": 0.8672, + "mean_token_accuracy": 0.7467629760503769, + "step": 2000 + }, + { + "epoch": 1.9255959547315193, + "eval_loss": 0.9370312690734863, + "eval_mean_token_accuracy": 0.7346881352938138, + "eval_runtime": 7.4433, + "eval_samples_per_second": 13.435, + "eval_steps_per_second": 1.747, + "step": 2000 + }, + { + "epoch": 1.930411750541777, + "grad_norm": 0.4463973939418793, + "learning_rate": 2e-05, + "loss": 0.8966, + "mean_token_accuracy": 0.7401046574115753, + "step": 2005 + }, + { + "epoch": 1.9352275463520345, + "grad_norm": 0.4733448028564453, + "learning_rate": 2e-05, + "loss": 0.9018, + "mean_token_accuracy": 0.7389231294393539, + "step": 2010 + }, + { + "epoch": 1.9400433421622925, + "grad_norm": 0.5125485062599182, + "learning_rate": 2e-05, + "loss": 0.8444, + "mean_token_accuracy": 0.7513499796390534, + "step": 2015 + }, + { + "epoch": 1.94485913797255, + "grad_norm": 0.43975546956062317, + "learning_rate": 2e-05, + "loss": 0.8721, + "mean_token_accuracy": 0.744769099354744, + "step": 2020 + }, + { + "epoch": 1.9496749337828077, + "grad_norm": 0.46694791316986084, + "learning_rate": 2e-05, + "loss": 0.8803, + "mean_token_accuracy": 0.7435093283653259, + "step": 2025 + }, + { + "epoch": 1.9544907295930654, + "grad_norm": 0.4587364196777344, + "learning_rate": 2e-05, + "loss": 0.8996, + "mean_token_accuracy": 0.7398793309926986, + "step": 2030 + }, + { + "epoch": 1.9593065254033228, + "grad_norm": 0.44869858026504517, + "learning_rate": 2e-05, + "loss": 0.8561, + "mean_token_accuracy": 0.7505311012268067, + "step": 2035 + }, + { + "epoch": 1.9641223212135805, + "grad_norm": 0.4261813461780548, + "learning_rate": 2e-05, + "loss": 0.8906, + "mean_token_accuracy": 0.7425627171993255, + "step": 2040 + }, + { + "epoch": 1.9689381170238383, + "grad_norm": 0.42234715819358826, + "learning_rate": 2e-05, + "loss": 0.8838, + "mean_token_accuracy": 0.7437568008899689, + "step": 2045 + }, + { + "epoch": 1.9737539128340957, + "grad_norm": 0.4368042051792145, + "learning_rate": 2e-05, + "loss": 0.9199, + "mean_token_accuracy": 0.7325636714696884, + "step": 2050 + }, + { + "epoch": 1.9785697086443534, + "grad_norm": 0.4789094924926758, + "learning_rate": 2e-05, + "loss": 0.8941, + "mean_token_accuracy": 0.7423620015382767, + "step": 2055 + }, + { + "epoch": 1.9833855044546111, + "grad_norm": 0.4829238951206207, + "learning_rate": 2e-05, + "loss": 0.8621, + "mean_token_accuracy": 0.7491715341806412, + "step": 2060 + }, + { + "epoch": 1.9882013002648686, + "grad_norm": 0.4356552064418793, + "learning_rate": 2e-05, + "loss": 0.8892, + "mean_token_accuracy": 0.7427193284034729, + "step": 2065 + }, + { + "epoch": 1.9930170960751266, + "grad_norm": 0.4590350389480591, + "learning_rate": 2e-05, + "loss": 0.9224, + "mean_token_accuracy": 0.7345257490873337, + "step": 2070 + }, + { + "epoch": 1.997832891885384, + "grad_norm": 0.4588664472103119, + "learning_rate": 2e-05, + "loss": 0.9009, + "mean_token_accuracy": 0.7404451668262482, + "step": 2075 + }, + { + "epoch": 1.9987960510474356, + "mean_token_accuracy": 0.7274856269359589, + "step": 2076, "total_flos": 0.0, - "train_loss": 0.9694986148756377, - "train_runtime": 60321.7593, - "train_samples_per_second": 1.456, - "train_steps_per_second": 0.023 + "train_loss": 0.9815138741495063, + "train_runtime": 13832.1557, + "train_samples_per_second": 2.402, + "train_steps_per_second": 0.15 } ], "logging_steps": 5, - "max_steps": 1372, + "max_steps": 2076, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, @@ -2339,8 +3530,8 @@ "should_epoch_stop": false, "should_evaluate": false, "should_log": false, - "should_save": true, - "should_training_stop": true + "should_save": false, + "should_training_stop": false }, "attributes": {} }