| { |
| "best_metric": 0.19586918, |
| "best_model_checkpoint": "/share/project/gsai/kch/output/v9-20250120-041149/checkpoint-614", |
| "epoch": 4.996548223350254, |
| "eval_steps": 500, |
| "global_step": 1535, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.003248730964467005, |
| "grad_norm": 9.773202050760368, |
| "learning_rate": 1.2987012987012988e-06, |
| "loss": 1.5496090650558472, |
| "memory(GiB)": 35.94, |
| "step": 1, |
| "token_acc": 0.7444444444444445, |
| "train_speed(iter/s)": 0.013018 |
| }, |
| { |
| "epoch": 0.016243654822335026, |
| "grad_norm": 9.248677372850217, |
| "learning_rate": 6.493506493506493e-06, |
| "loss": 1.707068681716919, |
| "memory(GiB)": 36.33, |
| "step": 5, |
| "token_acc": 0.7226890756302521, |
| "train_speed(iter/s)": 0.016033 |
| }, |
| { |
| "epoch": 0.03248730964467005, |
| "grad_norm": 7.883848099402922, |
| "learning_rate": 1.2987012987012986e-05, |
| "loss": 1.7282501220703126, |
| "memory(GiB)": 36.33, |
| "step": 10, |
| "token_acc": 0.6761363636363636, |
| "train_speed(iter/s)": 0.016046 |
| }, |
| { |
| "epoch": 0.048730964467005075, |
| "grad_norm": 6.0889640814527155, |
| "learning_rate": 1.9480519480519483e-05, |
| "loss": 1.2976716995239257, |
| "memory(GiB)": 36.33, |
| "step": 15, |
| "token_acc": 0.7266355140186916, |
| "train_speed(iter/s)": 0.015999 |
| }, |
| { |
| "epoch": 0.0649746192893401, |
| "grad_norm": 2.6158303198283113, |
| "learning_rate": 2.5974025974025972e-05, |
| "loss": 0.7637146949768067, |
| "memory(GiB)": 36.33, |
| "step": 20, |
| "token_acc": 0.8190709046454768, |
| "train_speed(iter/s)": 0.016127 |
| }, |
| { |
| "epoch": 0.08121827411167512, |
| "grad_norm": 1.1712343980644169, |
| "learning_rate": 3.246753246753247e-05, |
| "loss": 0.5213486194610596, |
| "memory(GiB)": 36.91, |
| "step": 25, |
| "token_acc": 0.8802992518703242, |
| "train_speed(iter/s)": 0.016159 |
| }, |
| { |
| "epoch": 0.09746192893401015, |
| "grad_norm": 1.4361934956753106, |
| "learning_rate": 3.8961038961038966e-05, |
| "loss": 0.4833333969116211, |
| "memory(GiB)": 36.91, |
| "step": 30, |
| "token_acc": 0.8929440389294404, |
| "train_speed(iter/s)": 0.01608 |
| }, |
| { |
| "epoch": 0.11370558375634518, |
| "grad_norm": 1.1662861682771686, |
| "learning_rate": 4.545454545454546e-05, |
| "loss": 0.4054920196533203, |
| "memory(GiB)": 36.91, |
| "step": 35, |
| "token_acc": 0.8860103626943006, |
| "train_speed(iter/s)": 0.016078 |
| }, |
| { |
| "epoch": 0.1299492385786802, |
| "grad_norm": 1.0429297515235254, |
| "learning_rate": 5.1948051948051944e-05, |
| "loss": 0.43406662940979, |
| "memory(GiB)": 36.91, |
| "step": 40, |
| "token_acc": 0.8708333333333333, |
| "train_speed(iter/s)": 0.016008 |
| }, |
| { |
| "epoch": 0.14619289340101524, |
| "grad_norm": 1.2238750692730618, |
| "learning_rate": 5.844155844155844e-05, |
| "loss": 0.36366307735443115, |
| "memory(GiB)": 36.91, |
| "step": 45, |
| "token_acc": 0.9007832898172323, |
| "train_speed(iter/s)": 0.01607 |
| }, |
| { |
| "epoch": 0.16243654822335024, |
| "grad_norm": 1.2558032464123954, |
| "learning_rate": 6.493506493506494e-05, |
| "loss": 0.327667236328125, |
| "memory(GiB)": 36.91, |
| "step": 50, |
| "token_acc": 0.9095890410958904, |
| "train_speed(iter/s)": 0.016095 |
| }, |
| { |
| "epoch": 0.17868020304568527, |
| "grad_norm": 1.1346516950379935, |
| "learning_rate": 7.142857142857143e-05, |
| "loss": 0.2869602680206299, |
| "memory(GiB)": 36.91, |
| "step": 55, |
| "token_acc": 0.9400428265524625, |
| "train_speed(iter/s)": 0.016146 |
| }, |
| { |
| "epoch": 0.1949238578680203, |
| "grad_norm": 1.062592286052222, |
| "learning_rate": 7.792207792207793e-05, |
| "loss": 0.32817542552948, |
| "memory(GiB)": 36.91, |
| "step": 60, |
| "token_acc": 0.9162162162162162, |
| "train_speed(iter/s)": 0.016208 |
| }, |
| { |
| "epoch": 0.21116751269035533, |
| "grad_norm": 1.0747418170911354, |
| "learning_rate": 8.441558441558442e-05, |
| "loss": 0.3106029987335205, |
| "memory(GiB)": 36.91, |
| "step": 65, |
| "token_acc": 0.8882235528942116, |
| "train_speed(iter/s)": 0.016166 |
| }, |
| { |
| "epoch": 0.22741116751269036, |
| "grad_norm": 1.3626948899821127, |
| "learning_rate": 9.090909090909092e-05, |
| "loss": 0.2963001251220703, |
| "memory(GiB)": 36.91, |
| "step": 70, |
| "token_acc": 0.9046511627906977, |
| "train_speed(iter/s)": 0.016127 |
| }, |
| { |
| "epoch": 0.2436548223350254, |
| "grad_norm": 1.767990529674908, |
| "learning_rate": 9.74025974025974e-05, |
| "loss": 0.30068559646606446, |
| "memory(GiB)": 36.91, |
| "step": 75, |
| "token_acc": 0.9203539823008849, |
| "train_speed(iter/s)": 0.016153 |
| }, |
| { |
| "epoch": 0.2598984771573604, |
| "grad_norm": 1.1682901865357622, |
| "learning_rate": 9.99989553622803e-05, |
| "loss": 0.2741088390350342, |
| "memory(GiB)": 36.91, |
| "step": 80, |
| "token_acc": 0.9041394335511983, |
| "train_speed(iter/s)": 0.016134 |
| }, |
| { |
| "epoch": 0.27614213197969545, |
| "grad_norm": 1.3278362200249414, |
| "learning_rate": 9.999257162318026e-05, |
| "loss": 0.25543942451477053, |
| "memory(GiB)": 36.91, |
| "step": 85, |
| "token_acc": 0.9399538106235565, |
| "train_speed(iter/s)": 0.016119 |
| }, |
| { |
| "epoch": 0.2923857868020305, |
| "grad_norm": 1.1803595161351554, |
| "learning_rate": 9.998038523933224e-05, |
| "loss": 0.3038362503051758, |
| "memory(GiB)": 36.91, |
| "step": 90, |
| "token_acc": 0.9416058394160584, |
| "train_speed(iter/s)": 0.016104 |
| }, |
| { |
| "epoch": 0.3086294416243655, |
| "grad_norm": 1.1025992286590631, |
| "learning_rate": 9.996239762521151e-05, |
| "loss": 0.24188714027404784, |
| "memory(GiB)": 36.91, |
| "step": 95, |
| "token_acc": 0.9402298850574713, |
| "train_speed(iter/s)": 0.016109 |
| }, |
| { |
| "epoch": 0.3248730964467005, |
| "grad_norm": 1.7473005302414135, |
| "learning_rate": 9.993861086864293e-05, |
| "loss": 0.2190408945083618, |
| "memory(GiB)": 36.91, |
| "step": 100, |
| "token_acc": 0.9553349875930521, |
| "train_speed(iter/s)": 0.016079 |
| }, |
| { |
| "epoch": 0.3411167512690355, |
| "grad_norm": 0.9780470952963239, |
| "learning_rate": 9.990902773055866e-05, |
| "loss": 0.22316210269927977, |
| "memory(GiB)": 36.91, |
| "step": 105, |
| "token_acc": 0.9384236453201971, |
| "train_speed(iter/s)": 0.016094 |
| }, |
| { |
| "epoch": 0.35736040609137054, |
| "grad_norm": 1.2071939622104944, |
| "learning_rate": 9.987365164467767e-05, |
| "loss": 0.1844509482383728, |
| "memory(GiB)": 36.91, |
| "step": 110, |
| "token_acc": 0.9557291666666666, |
| "train_speed(iter/s)": 0.016096 |
| }, |
| { |
| "epoch": 0.37360406091370557, |
| "grad_norm": 1.3488873859555934, |
| "learning_rate": 9.983248671710714e-05, |
| "loss": 0.24020743370056152, |
| "memory(GiB)": 36.91, |
| "step": 115, |
| "token_acc": 0.91792656587473, |
| "train_speed(iter/s)": 0.016103 |
| }, |
| { |
| "epoch": 0.3898477157360406, |
| "grad_norm": 1.3346849143090171, |
| "learning_rate": 9.978553772586596e-05, |
| "loss": 0.17928496599197388, |
| "memory(GiB)": 36.91, |
| "step": 120, |
| "token_acc": 0.9523809523809523, |
| "train_speed(iter/s)": 0.016107 |
| }, |
| { |
| "epoch": 0.40609137055837563, |
| "grad_norm": 1.5370257812561328, |
| "learning_rate": 9.973281012033007e-05, |
| "loss": 0.22673540115356444, |
| "memory(GiB)": 36.91, |
| "step": 125, |
| "token_acc": 0.9307692307692308, |
| "train_speed(iter/s)": 0.016132 |
| }, |
| { |
| "epoch": 0.42233502538071066, |
| "grad_norm": 1.564641958549246, |
| "learning_rate": 9.967431002060002e-05, |
| "loss": 0.2366321563720703, |
| "memory(GiB)": 36.91, |
| "step": 130, |
| "token_acc": 0.9309576837416481, |
| "train_speed(iter/s)": 0.016147 |
| }, |
| { |
| "epoch": 0.4385786802030457, |
| "grad_norm": 2.562291519667129, |
| "learning_rate": 9.961004421679047e-05, |
| "loss": 0.1997455835342407, |
| "memory(GiB)": 36.91, |
| "step": 135, |
| "token_acc": 0.9694656488549618, |
| "train_speed(iter/s)": 0.01615 |
| }, |
| { |
| "epoch": 0.4548223350253807, |
| "grad_norm": 1.3505627897575587, |
| "learning_rate": 9.954002016824227e-05, |
| "loss": 0.23050181865692138, |
| "memory(GiB)": 36.91, |
| "step": 140, |
| "token_acc": 0.9395248380129589, |
| "train_speed(iter/s)": 0.016177 |
| }, |
| { |
| "epoch": 0.47106598984771575, |
| "grad_norm": 1.1439093152874722, |
| "learning_rate": 9.946424600265646e-05, |
| "loss": 0.2069091796875, |
| "memory(GiB)": 36.91, |
| "step": 145, |
| "token_acc": 0.9485294117647058, |
| "train_speed(iter/s)": 0.016153 |
| }, |
| { |
| "epoch": 0.4873096446700508, |
| "grad_norm": 1.3223308004820944, |
| "learning_rate": 9.938273051515098e-05, |
| "loss": 0.21799993515014648, |
| "memory(GiB)": 36.91, |
| "step": 150, |
| "token_acc": 0.9325581395348838, |
| "train_speed(iter/s)": 0.016154 |
| }, |
| { |
| "epoch": 0.5035532994923858, |
| "grad_norm": 1.2523275744092777, |
| "learning_rate": 9.929548316723982e-05, |
| "loss": 0.25325832366943357, |
| "memory(GiB)": 36.91, |
| "step": 155, |
| "token_acc": 0.9368421052631579, |
| "train_speed(iter/s)": 0.016145 |
| }, |
| { |
| "epoch": 0.5197969543147208, |
| "grad_norm": 0.9022910796931503, |
| "learning_rate": 9.920251408573483e-05, |
| "loss": 0.2051997184753418, |
| "memory(GiB)": 36.91, |
| "step": 160, |
| "token_acc": 0.9321266968325792, |
| "train_speed(iter/s)": 0.016138 |
| }, |
| { |
| "epoch": 0.5360406091370559, |
| "grad_norm": 1.3630797879167007, |
| "learning_rate": 9.910383406157018e-05, |
| "loss": 0.19534312486648558, |
| "memory(GiB)": 36.91, |
| "step": 165, |
| "token_acc": 0.9489795918367347, |
| "train_speed(iter/s)": 0.016146 |
| }, |
| { |
| "epoch": 0.5522842639593909, |
| "grad_norm": 1.2845653777954962, |
| "learning_rate": 9.899945454855006e-05, |
| "loss": 0.25403494834899903, |
| "memory(GiB)": 36.91, |
| "step": 170, |
| "token_acc": 0.9025974025974026, |
| "train_speed(iter/s)": 0.01615 |
| }, |
| { |
| "epoch": 0.5685279187817259, |
| "grad_norm": 1.2637865638643238, |
| "learning_rate": 9.888938766201907e-05, |
| "loss": 0.21994171142578126, |
| "memory(GiB)": 36.91, |
| "step": 175, |
| "token_acc": 0.9292452830188679, |
| "train_speed(iter/s)": 0.016148 |
| }, |
| { |
| "epoch": 0.584771573604061, |
| "grad_norm": 1.3035045872952578, |
| "learning_rate": 9.877364617745604e-05, |
| "loss": 0.21233229637145995, |
| "memory(GiB)": 36.91, |
| "step": 180, |
| "token_acc": 0.936046511627907, |
| "train_speed(iter/s)": 0.016163 |
| }, |
| { |
| "epoch": 0.601015228426396, |
| "grad_norm": 1.0837997073678936, |
| "learning_rate": 9.865224352899119e-05, |
| "loss": 0.20809760093688964, |
| "memory(GiB)": 36.91, |
| "step": 185, |
| "token_acc": 0.9612403100775194, |
| "train_speed(iter/s)": 0.016158 |
| }, |
| { |
| "epoch": 0.617258883248731, |
| "grad_norm": 1.6131697829206757, |
| "learning_rate": 9.852519380784686e-05, |
| "loss": 0.16450556516647338, |
| "memory(GiB)": 36.91, |
| "step": 190, |
| "token_acc": 0.9518716577540107, |
| "train_speed(iter/s)": 0.01615 |
| }, |
| { |
| "epoch": 0.6335025380710659, |
| "grad_norm": 1.0897399385105642, |
| "learning_rate": 9.839251176070184e-05, |
| "loss": 0.21039419174194335, |
| "memory(GiB)": 36.91, |
| "step": 195, |
| "token_acc": 0.943089430894309, |
| "train_speed(iter/s)": 0.016128 |
| }, |
| { |
| "epoch": 0.649746192893401, |
| "grad_norm": 1.0509670789538326, |
| "learning_rate": 9.825421278797983e-05, |
| "loss": 0.2035764217376709, |
| "memory(GiB)": 36.91, |
| "step": 200, |
| "token_acc": 0.9397260273972603, |
| "train_speed(iter/s)": 0.016132 |
| }, |
| { |
| "epoch": 0.665989847715736, |
| "grad_norm": 1.2329373260124112, |
| "learning_rate": 9.811031294206184e-05, |
| "loss": 0.21548199653625488, |
| "memory(GiB)": 36.91, |
| "step": 205, |
| "token_acc": 0.9368191721132898, |
| "train_speed(iter/s)": 0.01613 |
| }, |
| { |
| "epoch": 0.682233502538071, |
| "grad_norm": 0.8421449582235737, |
| "learning_rate": 9.796082892542302e-05, |
| "loss": 0.166330087184906, |
| "memory(GiB)": 36.91, |
| "step": 210, |
| "token_acc": 0.9555555555555556, |
| "train_speed(iter/s)": 0.016131 |
| }, |
| { |
| "epoch": 0.698477157360406, |
| "grad_norm": 3.430879327858774, |
| "learning_rate": 9.780577808869398e-05, |
| "loss": 0.2193552017211914, |
| "memory(GiB)": 36.91, |
| "step": 215, |
| "token_acc": 0.952020202020202, |
| "train_speed(iter/s)": 0.016136 |
| }, |
| { |
| "epoch": 0.7147208121827411, |
| "grad_norm": 1.5093266746905538, |
| "learning_rate": 9.764517842864696e-05, |
| "loss": 0.21606364250183105, |
| "memory(GiB)": 36.91, |
| "step": 220, |
| "token_acc": 0.9311926605504587, |
| "train_speed(iter/s)": 0.016141 |
| }, |
| { |
| "epoch": 0.7309644670050761, |
| "grad_norm": 1.3437389442959786, |
| "learning_rate": 9.747904858610681e-05, |
| "loss": 0.18983598947525024, |
| "memory(GiB)": 36.91, |
| "step": 225, |
| "token_acc": 0.9449035812672176, |
| "train_speed(iter/s)": 0.016146 |
| }, |
| { |
| "epoch": 0.7472081218274111, |
| "grad_norm": 0.9560028124850986, |
| "learning_rate": 9.730740784378753e-05, |
| "loss": 0.15862367153167725, |
| "memory(GiB)": 36.91, |
| "step": 230, |
| "token_acc": 0.9284009546539379, |
| "train_speed(iter/s)": 0.016153 |
| }, |
| { |
| "epoch": 0.7634517766497462, |
| "grad_norm": 0.9944797001481037, |
| "learning_rate": 9.713027612405395e-05, |
| "loss": 0.2057633638381958, |
| "memory(GiB)": 36.91, |
| "step": 235, |
| "token_acc": 0.9560975609756097, |
| "train_speed(iter/s)": 0.016148 |
| }, |
| { |
| "epoch": 0.7796954314720812, |
| "grad_norm": 1.3080304212648073, |
| "learning_rate": 9.694767398660942e-05, |
| "loss": 0.20023531913757325, |
| "memory(GiB)": 36.91, |
| "step": 240, |
| "token_acc": 0.930835734870317, |
| "train_speed(iter/s)": 0.016152 |
| }, |
| { |
| "epoch": 0.7959390862944162, |
| "grad_norm": 1.0766984239588557, |
| "learning_rate": 9.67596226261095e-05, |
| "loss": 0.17447829246520996, |
| "memory(GiB)": 36.91, |
| "step": 245, |
| "token_acc": 0.9543269230769231, |
| "train_speed(iter/s)": 0.016152 |
| }, |
| { |
| "epoch": 0.8121827411167513, |
| "grad_norm": 1.507367869013474, |
| "learning_rate": 9.656614386970173e-05, |
| "loss": 0.1656266212463379, |
| "memory(GiB)": 36.91, |
| "step": 250, |
| "token_acc": 0.9447368421052632, |
| "train_speed(iter/s)": 0.016157 |
| }, |
| { |
| "epoch": 0.8284263959390863, |
| "grad_norm": 0.9746901508793566, |
| "learning_rate": 9.636726017449236e-05, |
| "loss": 0.1971142530441284, |
| "memory(GiB)": 36.91, |
| "step": 255, |
| "token_acc": 0.9336384439359268, |
| "train_speed(iter/s)": 0.016144 |
| }, |
| { |
| "epoch": 0.8446700507614213, |
| "grad_norm": 1.2090195353569724, |
| "learning_rate": 9.616299462493952e-05, |
| "loss": 0.13225051164627075, |
| "memory(GiB)": 36.91, |
| "step": 260, |
| "token_acc": 0.9502369668246445, |
| "train_speed(iter/s)": 0.016132 |
| }, |
| { |
| "epoch": 0.8609137055837564, |
| "grad_norm": 2.0461505378854024, |
| "learning_rate": 9.595337093017404e-05, |
| "loss": 0.15409984588623046, |
| "memory(GiB)": 36.91, |
| "step": 265, |
| "token_acc": 0.9537444933920705, |
| "train_speed(iter/s)": 0.016136 |
| }, |
| { |
| "epoch": 0.8771573604060914, |
| "grad_norm": 2.0732984340431178, |
| "learning_rate": 9.57384134212473e-05, |
| "loss": 0.21368227005004883, |
| "memory(GiB)": 36.91, |
| "step": 270, |
| "token_acc": 0.9419642857142857, |
| "train_speed(iter/s)": 0.016136 |
| }, |
| { |
| "epoch": 0.8934010152284264, |
| "grad_norm": 0.7925657032904146, |
| "learning_rate": 9.551814704830734e-05, |
| "loss": 0.1758435010910034, |
| "memory(GiB)": 36.91, |
| "step": 275, |
| "token_acc": 0.948509485094851, |
| "train_speed(iter/s)": 0.016143 |
| }, |
| { |
| "epoch": 0.9096446700507614, |
| "grad_norm": 1.493944081608633, |
| "learning_rate": 9.529259737770269e-05, |
| "loss": 0.1807725191116333, |
| "memory(GiB)": 36.91, |
| "step": 280, |
| "token_acc": 0.9431524547803618, |
| "train_speed(iter/s)": 0.016126 |
| }, |
| { |
| "epoch": 0.9258883248730965, |
| "grad_norm": 1.6848601658017734, |
| "learning_rate": 9.506179058901503e-05, |
| "loss": 0.20769875049591063, |
| "memory(GiB)": 36.91, |
| "step": 285, |
| "token_acc": 0.9391304347826087, |
| "train_speed(iter/s)": 0.016132 |
| }, |
| { |
| "epoch": 0.9421319796954315, |
| "grad_norm": 1.3210817601987923, |
| "learning_rate": 9.482575347202047e-05, |
| "loss": 0.162405526638031, |
| "memory(GiB)": 36.91, |
| "step": 290, |
| "token_acc": 0.9507042253521126, |
| "train_speed(iter/s)": 0.016136 |
| }, |
| { |
| "epoch": 0.9583756345177665, |
| "grad_norm": 1.3496077516635223, |
| "learning_rate": 9.458451342358002e-05, |
| "loss": 0.19487454891204833, |
| "memory(GiB)": 36.91, |
| "step": 295, |
| "token_acc": 0.9321608040201005, |
| "train_speed(iter/s)": 0.016132 |
| }, |
| { |
| "epoch": 0.9746192893401016, |
| "grad_norm": 0.990005748680569, |
| "learning_rate": 9.433809844445969e-05, |
| "loss": 0.18303027153015136, |
| "memory(GiB)": 36.91, |
| "step": 300, |
| "token_acc": 0.9073170731707317, |
| "train_speed(iter/s)": 0.016134 |
| }, |
| { |
| "epoch": 0.9908629441624366, |
| "grad_norm": 1.2295422719869937, |
| "learning_rate": 9.40865371360804e-05, |
| "loss": 0.17322018146514892, |
| "memory(GiB)": 36.91, |
| "step": 305, |
| "token_acc": 0.9247311827956989, |
| "train_speed(iter/s)": 0.016141 |
| }, |
| { |
| "epoch": 0.9973604060913706, |
| "eval_loss": 0.1993405520915985, |
| "eval_runtime": 62.0419, |
| "eval_samples_per_second": 3.191, |
| "eval_steps_per_second": 0.806, |
| "eval_token_acc": 0.9332079021636877, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.0095431472081218, |
| "grad_norm": 0.6300057786945967, |
| "learning_rate": 9.382985869719825e-05, |
| "loss": 0.18641979694366456, |
| "memory(GiB)": 36.91, |
| "step": 310, |
| "token_acc": 0.9554234769687965, |
| "train_speed(iter/s)": 0.016081 |
| }, |
| { |
| "epoch": 1.0257868020304568, |
| "grad_norm": 0.6546315853574257, |
| "learning_rate": 9.35680929205154e-05, |
| "loss": 0.09114786386489868, |
| "memory(GiB)": 36.91, |
| "step": 315, |
| "token_acc": 0.972972972972973, |
| "train_speed(iter/s)": 0.016095 |
| }, |
| { |
| "epoch": 1.0420304568527918, |
| "grad_norm": 1.0908662736650971, |
| "learning_rate": 9.330127018922194e-05, |
| "loss": 0.10798045396804809, |
| "memory(GiB)": 36.91, |
| "step": 320, |
| "token_acc": 0.9705093833780161, |
| "train_speed(iter/s)": 0.016104 |
| }, |
| { |
| "epoch": 1.0582741116751269, |
| "grad_norm": 1.3297407747084764, |
| "learning_rate": 9.302942147346945e-05, |
| "loss": 0.1425997495651245, |
| "memory(GiB)": 36.91, |
| "step": 325, |
| "token_acc": 0.9705014749262537, |
| "train_speed(iter/s)": 0.016103 |
| }, |
| { |
| "epoch": 1.074517766497462, |
| "grad_norm": 0.9696985174488663, |
| "learning_rate": 9.275257832677623e-05, |
| "loss": 0.09851968884468079, |
| "memory(GiB)": 36.91, |
| "step": 330, |
| "token_acc": 0.9644549763033176, |
| "train_speed(iter/s)": 0.016115 |
| }, |
| { |
| "epoch": 1.090761421319797, |
| "grad_norm": 0.9656710998245678, |
| "learning_rate": 9.247077288236488e-05, |
| "loss": 0.11144424676895141, |
| "memory(GiB)": 36.91, |
| "step": 335, |
| "token_acc": 0.972568578553616, |
| "train_speed(iter/s)": 0.016119 |
| }, |
| { |
| "epoch": 1.107005076142132, |
| "grad_norm": 2.119365217816497, |
| "learning_rate": 9.21840378494325e-05, |
| "loss": 0.11279252767562867, |
| "memory(GiB)": 36.91, |
| "step": 340, |
| "token_acc": 0.9637462235649547, |
| "train_speed(iter/s)": 0.016124 |
| }, |
| { |
| "epoch": 1.123248730964467, |
| "grad_norm": 1.0607496749665157, |
| "learning_rate": 9.189240650935433e-05, |
| "loss": 0.15501840114593507, |
| "memory(GiB)": 36.91, |
| "step": 345, |
| "token_acc": 0.9662337662337662, |
| "train_speed(iter/s)": 0.016118 |
| }, |
| { |
| "epoch": 1.139492385786802, |
| "grad_norm": 1.1350038539205582, |
| "learning_rate": 9.159591271182058e-05, |
| "loss": 0.12092633247375488, |
| "memory(GiB)": 36.91, |
| "step": 350, |
| "token_acc": 0.9680232558139535, |
| "train_speed(iter/s)": 0.016126 |
| }, |
| { |
| "epoch": 1.155736040609137, |
| "grad_norm": 0.6471881138956326, |
| "learning_rate": 9.129459087090763e-05, |
| "loss": 0.09021483659744263, |
| "memory(GiB)": 36.91, |
| "step": 355, |
| "token_acc": 0.9718670076726342, |
| "train_speed(iter/s)": 0.016119 |
| }, |
| { |
| "epoch": 1.171979695431472, |
| "grad_norm": 0.5557368721254966, |
| "learning_rate": 9.098847596108351e-05, |
| "loss": 0.09125213623046875, |
| "memory(GiB)": 36.91, |
| "step": 360, |
| "token_acc": 0.9772727272727273, |
| "train_speed(iter/s)": 0.01612 |
| }, |
| { |
| "epoch": 1.188223350253807, |
| "grad_norm": 0.8767747521686889, |
| "learning_rate": 9.067760351314838e-05, |
| "loss": 0.10847616195678711, |
| "memory(GiB)": 36.91, |
| "step": 365, |
| "token_acc": 0.9425587467362925, |
| "train_speed(iter/s)": 0.016116 |
| }, |
| { |
| "epoch": 1.2044670050761421, |
| "grad_norm": 0.7043233347928591, |
| "learning_rate": 9.036200961011059e-05, |
| "loss": 0.14046638011932372, |
| "memory(GiB)": 36.91, |
| "step": 370, |
| "token_acc": 0.9632034632034632, |
| "train_speed(iter/s)": 0.016126 |
| }, |
| { |
| "epoch": 1.2207106598984772, |
| "grad_norm": 1.0689456764149206, |
| "learning_rate": 9.004173088299837e-05, |
| "loss": 0.13291985988616944, |
| "memory(GiB)": 36.91, |
| "step": 375, |
| "token_acc": 0.9565217391304348, |
| "train_speed(iter/s)": 0.016139 |
| }, |
| { |
| "epoch": 1.2369543147208122, |
| "grad_norm": 1.3657829465422844, |
| "learning_rate": 8.97168045066082e-05, |
| "loss": 0.11737120151519775, |
| "memory(GiB)": 36.91, |
| "step": 380, |
| "token_acc": 0.973421926910299, |
| "train_speed(iter/s)": 0.01615 |
| }, |
| { |
| "epoch": 1.2531979695431472, |
| "grad_norm": 0.991725434659403, |
| "learning_rate": 8.938726819518977e-05, |
| "loss": 0.1285269021987915, |
| "memory(GiB)": 36.91, |
| "step": 385, |
| "token_acc": 0.97, |
| "train_speed(iter/s)": 0.016149 |
| }, |
| { |
| "epoch": 1.2694416243654822, |
| "grad_norm": 0.7615458350738632, |
| "learning_rate": 8.905316019806868e-05, |
| "loss": 0.08999634981155395, |
| "memory(GiB)": 36.91, |
| "step": 390, |
| "token_acc": 0.9392405063291139, |
| "train_speed(iter/s)": 0.016141 |
| }, |
| { |
| "epoch": 1.2856852791878173, |
| "grad_norm": 1.0176469569030087, |
| "learning_rate": 8.871451929520663e-05, |
| "loss": 0.12240591049194335, |
| "memory(GiB)": 36.91, |
| "step": 395, |
| "token_acc": 0.9611872146118722, |
| "train_speed(iter/s)": 0.016137 |
| }, |
| { |
| "epoch": 1.3019289340101523, |
| "grad_norm": 1.5999057477034428, |
| "learning_rate": 8.837138479270036e-05, |
| "loss": 0.1078599214553833, |
| "memory(GiB)": 36.91, |
| "step": 400, |
| "token_acc": 0.9562982005141388, |
| "train_speed(iter/s)": 0.016137 |
| }, |
| { |
| "epoch": 1.3181725888324873, |
| "grad_norm": 1.8517636831594235, |
| "learning_rate": 8.802379651821938e-05, |
| "loss": 0.14071439504623412, |
| "memory(GiB)": 36.91, |
| "step": 405, |
| "token_acc": 0.9592875318066157, |
| "train_speed(iter/s)": 0.016131 |
| }, |
| { |
| "epoch": 1.3344162436548224, |
| "grad_norm": 1.333329930877741, |
| "learning_rate": 8.767179481638303e-05, |
| "loss": 0.13171937465667724, |
| "memory(GiB)": 36.91, |
| "step": 410, |
| "token_acc": 0.9744897959183674, |
| "train_speed(iter/s)": 0.016135 |
| }, |
| { |
| "epoch": 1.3506598984771574, |
| "grad_norm": 1.1709434640964491, |
| "learning_rate": 8.731542054407793e-05, |
| "loss": 0.10031242370605468, |
| "memory(GiB)": 36.91, |
| "step": 415, |
| "token_acc": 0.9507829977628636, |
| "train_speed(iter/s)": 0.016142 |
| }, |
| { |
| "epoch": 1.3669035532994924, |
| "grad_norm": 0.8550588073511182, |
| "learning_rate": 8.695471506571542e-05, |
| "loss": 0.09321081638336182, |
| "memory(GiB)": 36.91, |
| "step": 420, |
| "token_acc": 0.9667519181585678, |
| "train_speed(iter/s)": 0.016135 |
| }, |
| { |
| "epoch": 1.3831472081218275, |
| "grad_norm": 0.8651388677420173, |
| "learning_rate": 8.658972024843062e-05, |
| "loss": 0.11361520290374756, |
| "memory(GiB)": 36.91, |
| "step": 425, |
| "token_acc": 0.9243243243243243, |
| "train_speed(iter/s)": 0.016131 |
| }, |
| { |
| "epoch": 1.3993908629441625, |
| "grad_norm": 1.1539120381770573, |
| "learning_rate": 8.622047845722275e-05, |
| "loss": 0.11814072132110595, |
| "memory(GiB)": 36.91, |
| "step": 430, |
| "token_acc": 0.9747368421052631, |
| "train_speed(iter/s)": 0.016135 |
| }, |
| { |
| "epoch": 1.4156345177664975, |
| "grad_norm": 0.8277592112279485, |
| "learning_rate": 8.584703255003795e-05, |
| "loss": 0.11146994829177856, |
| "memory(GiB)": 36.91, |
| "step": 435, |
| "token_acc": 0.9720101781170484, |
| "train_speed(iter/s)": 0.016134 |
| }, |
| { |
| "epoch": 1.4318781725888325, |
| "grad_norm": 0.613271329664299, |
| "learning_rate": 8.546942587279465e-05, |
| "loss": 0.09394789338111878, |
| "memory(GiB)": 36.91, |
| "step": 440, |
| "token_acc": 0.9636803874092009, |
| "train_speed(iter/s)": 0.016134 |
| }, |
| { |
| "epoch": 1.4481218274111676, |
| "grad_norm": 1.0271786482031176, |
| "learning_rate": 8.508770225435244e-05, |
| "loss": 0.09493039846420288, |
| "memory(GiB)": 36.91, |
| "step": 445, |
| "token_acc": 0.9743589743589743, |
| "train_speed(iter/s)": 0.016139 |
| }, |
| { |
| "epoch": 1.4643654822335026, |
| "grad_norm": 1.0170609694346187, |
| "learning_rate": 8.470190600142486e-05, |
| "loss": 0.0872123122215271, |
| "memory(GiB)": 36.91, |
| "step": 450, |
| "token_acc": 0.9763779527559056, |
| "train_speed(iter/s)": 0.016139 |
| }, |
| { |
| "epoch": 1.4806091370558376, |
| "grad_norm": 1.6562131424643847, |
| "learning_rate": 8.43120818934367e-05, |
| "loss": 0.12921547889709473, |
| "memory(GiB)": 36.91, |
| "step": 455, |
| "token_acc": 0.9691516709511568, |
| "train_speed(iter/s)": 0.016142 |
| }, |
| { |
| "epoch": 1.4968527918781727, |
| "grad_norm": 1.9551348110028592, |
| "learning_rate": 8.39182751773264e-05, |
| "loss": 0.10002539157867432, |
| "memory(GiB)": 36.91, |
| "step": 460, |
| "token_acc": 0.9665924276169265, |
| "train_speed(iter/s)": 0.016147 |
| }, |
| { |
| "epoch": 1.5130964467005077, |
| "grad_norm": 1.376875063389563, |
| "learning_rate": 8.352053156229438e-05, |
| "loss": 0.0880006194114685, |
| "memory(GiB)": 36.91, |
| "step": 465, |
| "token_acc": 0.958128078817734, |
| "train_speed(iter/s)": 0.016149 |
| }, |
| { |
| "epoch": 1.5293401015228425, |
| "grad_norm": 1.688502126127077, |
| "learning_rate": 8.31188972144974e-05, |
| "loss": 0.08950616717338562, |
| "memory(GiB)": 36.91, |
| "step": 470, |
| "token_acc": 0.96996996996997, |
| "train_speed(iter/s)": 0.016152 |
| }, |
| { |
| "epoch": 1.5455837563451778, |
| "grad_norm": 1.3189009566745062, |
| "learning_rate": 8.27134187516901e-05, |
| "loss": 0.08834458589553833, |
| "memory(GiB)": 36.91, |
| "step": 475, |
| "token_acc": 0.9663865546218487, |
| "train_speed(iter/s)": 0.016152 |
| }, |
| { |
| "epoch": 1.5618274111675126, |
| "grad_norm": 1.4946742975658185, |
| "learning_rate": 8.23041432378141e-05, |
| "loss": 0.14390041828155517, |
| "memory(GiB)": 36.91, |
| "step": 480, |
| "token_acc": 0.9621212121212122, |
| "train_speed(iter/s)": 0.016158 |
| }, |
| { |
| "epoch": 1.5780710659898478, |
| "grad_norm": 1.3879821905262077, |
| "learning_rate": 8.18911181775353e-05, |
| "loss": 0.1267578125, |
| "memory(GiB)": 36.91, |
| "step": 485, |
| "token_acc": 0.9685230024213075, |
| "train_speed(iter/s)": 0.016166 |
| }, |
| { |
| "epoch": 1.5943147208121826, |
| "grad_norm": 1.0017173842059925, |
| "learning_rate": 8.147439151072994e-05, |
| "loss": 0.11637402772903442, |
| "memory(GiB)": 36.91, |
| "step": 490, |
| "token_acc": 0.945031712473573, |
| "train_speed(iter/s)": 0.016169 |
| }, |
| { |
| "epoch": 1.6105583756345179, |
| "grad_norm": 3.517464157304767, |
| "learning_rate": 8.105401160692023e-05, |
| "loss": 0.11228004693984986, |
| "memory(GiB)": 36.91, |
| "step": 495, |
| "token_acc": 0.9544513457556936, |
| "train_speed(iter/s)": 0.016174 |
| }, |
| { |
| "epoch": 1.6268020304568527, |
| "grad_norm": 1.123471909128111, |
| "learning_rate": 8.063002725966015e-05, |
| "loss": 0.1422884702682495, |
| "memory(GiB)": 36.91, |
| "step": 500, |
| "token_acc": 0.9502487562189055, |
| "train_speed(iter/s)": 0.016173 |
| }, |
| { |
| "epoch": 1.643045685279188, |
| "grad_norm": 0.6219224292611003, |
| "learning_rate": 8.020248768087188e-05, |
| "loss": 0.09764043688774109, |
| "memory(GiB)": 36.91, |
| "step": 505, |
| "token_acc": 0.9696312364425163, |
| "train_speed(iter/s)": 0.016171 |
| }, |
| { |
| "epoch": 1.6592893401015227, |
| "grad_norm": 0.6599500918289528, |
| "learning_rate": 7.977144249513391e-05, |
| "loss": 0.11226143836975097, |
| "memory(GiB)": 36.91, |
| "step": 510, |
| "token_acc": 0.9662447257383966, |
| "train_speed(iter/s)": 0.016171 |
| }, |
| { |
| "epoch": 1.675532994923858, |
| "grad_norm": 1.1327299497198065, |
| "learning_rate": 7.93369417339209e-05, |
| "loss": 0.15791513919830322, |
| "memory(GiB)": 36.91, |
| "step": 515, |
| "token_acc": 0.9535962877030162, |
| "train_speed(iter/s)": 0.016172 |
| }, |
| { |
| "epoch": 1.6917766497461928, |
| "grad_norm": 1.246895523664307, |
| "learning_rate": 7.88990358297967e-05, |
| "loss": 0.1254945158958435, |
| "memory(GiB)": 36.91, |
| "step": 520, |
| "token_acc": 0.9494252873563218, |
| "train_speed(iter/s)": 0.016169 |
| }, |
| { |
| "epoch": 1.708020304568528, |
| "grad_norm": 0.7907689981367572, |
| "learning_rate": 7.84577756105606e-05, |
| "loss": 0.11963515281677246, |
| "memory(GiB)": 36.91, |
| "step": 525, |
| "token_acc": 0.9851116625310173, |
| "train_speed(iter/s)": 0.016171 |
| }, |
| { |
| "epoch": 1.7242639593908629, |
| "grad_norm": 0.9327837359999639, |
| "learning_rate": 7.801321229334764e-05, |
| "loss": 0.0870942771434784, |
| "memory(GiB)": 36.91, |
| "step": 530, |
| "token_acc": 0.9618320610687023, |
| "train_speed(iter/s)": 0.01617 |
| }, |
| { |
| "epoch": 1.740507614213198, |
| "grad_norm": 1.0881384151057631, |
| "learning_rate": 7.756539747868394e-05, |
| "loss": 0.08531727194786072, |
| "memory(GiB)": 36.91, |
| "step": 535, |
| "token_acc": 0.9748603351955307, |
| "train_speed(iter/s)": 0.016168 |
| }, |
| { |
| "epoch": 1.756751269035533, |
| "grad_norm": 0.7767069783252919, |
| "learning_rate": 7.71143831444974e-05, |
| "loss": 0.11042824983596802, |
| "memory(GiB)": 36.91, |
| "step": 540, |
| "token_acc": 0.957983193277311, |
| "train_speed(iter/s)": 0.016168 |
| }, |
| { |
| "epoch": 1.7729949238578682, |
| "grad_norm": 1.4499212106775468, |
| "learning_rate": 7.666022164008457e-05, |
| "loss": 0.11432676315307617, |
| "memory(GiB)": 36.91, |
| "step": 545, |
| "token_acc": 0.975, |
| "train_speed(iter/s)": 0.016171 |
| }, |
| { |
| "epoch": 1.789238578680203, |
| "grad_norm": 1.0081688781849556, |
| "learning_rate": 7.620296568003449e-05, |
| "loss": 0.12327454090118409, |
| "memory(GiB)": 36.91, |
| "step": 550, |
| "token_acc": 0.9525, |
| "train_speed(iter/s)": 0.016174 |
| }, |
| { |
| "epoch": 1.8054822335025382, |
| "grad_norm": 0.9935491377578084, |
| "learning_rate": 7.57426683381101e-05, |
| "loss": 0.09574033617973328, |
| "memory(GiB)": 36.91, |
| "step": 555, |
| "token_acc": 0.9694117647058823, |
| "train_speed(iter/s)": 0.016174 |
| }, |
| { |
| "epoch": 1.821725888324873, |
| "grad_norm": 1.0191162814710237, |
| "learning_rate": 7.527938304108795e-05, |
| "loss": 0.10299128293991089, |
| "memory(GiB)": 36.91, |
| "step": 560, |
| "token_acc": 0.9694793536804309, |
| "train_speed(iter/s)": 0.016176 |
| }, |
| { |
| "epoch": 1.8379695431472083, |
| "grad_norm": 1.322632268427317, |
| "learning_rate": 7.481316356255698e-05, |
| "loss": 0.12594590187072754, |
| "memory(GiB)": 36.91, |
| "step": 565, |
| "token_acc": 0.946257197696737, |
| "train_speed(iter/s)": 0.016169 |
| }, |
| { |
| "epoch": 1.854213197969543, |
| "grad_norm": 1.2990436559927216, |
| "learning_rate": 7.434406401667695e-05, |
| "loss": 0.10811959505081177, |
| "memory(GiB)": 36.91, |
| "step": 570, |
| "token_acc": 0.9556650246305419, |
| "train_speed(iter/s)": 0.016173 |
| }, |
| { |
| "epoch": 1.8704568527918781, |
| "grad_norm": 1.141255912127714, |
| "learning_rate": 7.387213885189746e-05, |
| "loss": 0.10128064155578613, |
| "memory(GiB)": 36.91, |
| "step": 575, |
| "token_acc": 0.9654255319148937, |
| "train_speed(iter/s)": 0.016178 |
| }, |
| { |
| "epoch": 1.8867005076142132, |
| "grad_norm": 1.6575287534795722, |
| "learning_rate": 7.339744284463808e-05, |
| "loss": 0.09879794716835022, |
| "memory(GiB)": 36.91, |
| "step": 580, |
| "token_acc": 0.9805555555555555, |
| "train_speed(iter/s)": 0.016182 |
| }, |
| { |
| "epoch": 1.9029441624365482, |
| "grad_norm": 1.1141293923635756, |
| "learning_rate": 7.292003109293048e-05, |
| "loss": 0.0816422462463379, |
| "memory(GiB)": 36.91, |
| "step": 585, |
| "token_acc": 0.961038961038961, |
| "train_speed(iter/s)": 0.016187 |
| }, |
| { |
| "epoch": 1.9191878172588832, |
| "grad_norm": 0.9384463374768481, |
| "learning_rate": 7.243995901002312e-05, |
| "loss": 0.10118494033813477, |
| "memory(GiB)": 36.91, |
| "step": 590, |
| "token_acc": 0.978021978021978, |
| "train_speed(iter/s)": 0.016179 |
| }, |
| { |
| "epoch": 1.9354314720812182, |
| "grad_norm": 1.2458643327317989, |
| "learning_rate": 7.19572823179495e-05, |
| "loss": 0.13551709651947022, |
| "memory(GiB)": 36.91, |
| "step": 595, |
| "token_acc": 0.96, |
| "train_speed(iter/s)": 0.016178 |
| }, |
| { |
| "epoch": 1.9516751269035533, |
| "grad_norm": 1.2473685164472739, |
| "learning_rate": 7.147205704106046e-05, |
| "loss": 0.12769120931625366, |
| "memory(GiB)": 36.91, |
| "step": 600, |
| "token_acc": 0.9561586638830898, |
| "train_speed(iter/s)": 0.016179 |
| }, |
| { |
| "epoch": 1.9679187817258883, |
| "grad_norm": 0.7203387342947396, |
| "learning_rate": 7.098433949952146e-05, |
| "loss": 0.09962844252586364, |
| "memory(GiB)": 36.91, |
| "step": 605, |
| "token_acc": 0.9623115577889447, |
| "train_speed(iter/s)": 0.016178 |
| }, |
| { |
| "epoch": 1.9841624365482233, |
| "grad_norm": 0.9094364008463653, |
| "learning_rate": 7.049418630277542e-05, |
| "loss": 0.10799739360809327, |
| "memory(GiB)": 36.91, |
| "step": 610, |
| "token_acc": 0.9705159705159705, |
| "train_speed(iter/s)": 0.016178 |
| }, |
| { |
| "epoch": 1.9971573604060913, |
| "eval_loss": 0.19586917757987976, |
| "eval_runtime": 62.6829, |
| "eval_samples_per_second": 3.159, |
| "eval_steps_per_second": 0.798, |
| "eval_token_acc": 0.9416745061147695, |
| "step": 614 |
| }, |
| { |
| "epoch": 2.0028426395939087, |
| "grad_norm": 11.787067733742486, |
| "learning_rate": 7.000165434297214e-05, |
| "loss": 0.12140052318572998, |
| "memory(GiB)": 36.91, |
| "step": 615, |
| "token_acc": 0.951048951048951, |
| "train_speed(iter/s)": 0.016146 |
| }, |
| { |
| "epoch": 2.0190862944162435, |
| "grad_norm": 0.9036939767517369, |
| "learning_rate": 6.950680078836474e-05, |
| "loss": 0.0476156622171402, |
| "memory(GiB)": 36.91, |
| "step": 620, |
| "token_acc": 0.9901477832512315, |
| "train_speed(iter/s)": 0.016145 |
| }, |
| { |
| "epoch": 2.035329949238579, |
| "grad_norm": 0.8045933316745676, |
| "learning_rate": 6.900968307667423e-05, |
| "loss": 0.0368287205696106, |
| "memory(GiB)": 36.91, |
| "step": 625, |
| "token_acc": 0.9932584269662922, |
| "train_speed(iter/s)": 0.016144 |
| }, |
| { |
| "epoch": 2.0515736040609136, |
| "grad_norm": 0.9084110351960255, |
| "learning_rate": 6.851035890842259e-05, |
| "loss": 0.03829330801963806, |
| "memory(GiB)": 36.91, |
| "step": 630, |
| "token_acc": 0.9928741092636579, |
| "train_speed(iter/s)": 0.016139 |
| }, |
| { |
| "epoch": 2.067817258883249, |
| "grad_norm": 0.6115130889160721, |
| "learning_rate": 6.800888624023553e-05, |
| "loss": 0.04897831082344055, |
| "memory(GiB)": 36.91, |
| "step": 635, |
| "token_acc": 0.995, |
| "train_speed(iter/s)": 0.016139 |
| }, |
| { |
| "epoch": 2.0840609137055837, |
| "grad_norm": 0.2929609590178906, |
| "learning_rate": 6.750532327811547e-05, |
| "loss": 0.027808183431625368, |
| "memory(GiB)": 36.91, |
| "step": 640, |
| "token_acc": 0.9877750611246944, |
| "train_speed(iter/s)": 0.016139 |
| }, |
| { |
| "epoch": 2.100304568527919, |
| "grad_norm": 1.6659772014622232, |
| "learning_rate": 6.699972847068553e-05, |
| "loss": 0.04012786149978638, |
| "memory(GiB)": 36.91, |
| "step": 645, |
| "token_acc": 0.9892183288409704, |
| "train_speed(iter/s)": 0.016136 |
| }, |
| { |
| "epoch": 2.1165482233502537, |
| "grad_norm": 1.6942318499082378, |
| "learning_rate": 6.649216050240539e-05, |
| "loss": 0.03581180572509766, |
| "memory(GiB)": 36.91, |
| "step": 650, |
| "token_acc": 0.9848866498740554, |
| "train_speed(iter/s)": 0.016138 |
| }, |
| { |
| "epoch": 2.132791878172589, |
| "grad_norm": 1.7750332328595628, |
| "learning_rate": 6.598267828675979e-05, |
| "loss": 0.038441383838653566, |
| "memory(GiB)": 36.91, |
| "step": 655, |
| "token_acc": 0.9860724233983287, |
| "train_speed(iter/s)": 0.016136 |
| }, |
| { |
| "epoch": 2.149035532994924, |
| "grad_norm": 0.948452800180108, |
| "learning_rate": 6.547134095942044e-05, |
| "loss": 0.03809022605419159, |
| "memory(GiB)": 36.91, |
| "step": 660, |
| "token_acc": 0.9917355371900827, |
| "train_speed(iter/s)": 0.016135 |
| }, |
| { |
| "epoch": 2.165279187817259, |
| "grad_norm": 1.185267349759789, |
| "learning_rate": 6.495820787138209e-05, |
| "loss": 0.033171114325523374, |
| "memory(GiB)": 36.91, |
| "step": 665, |
| "token_acc": 0.9947916666666666, |
| "train_speed(iter/s)": 0.016132 |
| }, |
| { |
| "epoch": 2.181522842639594, |
| "grad_norm": 1.1780464513130944, |
| "learning_rate": 6.44433385820737e-05, |
| "loss": 0.03416465222835541, |
| "memory(GiB)": 36.91, |
| "step": 670, |
| "token_acc": 0.9948051948051948, |
| "train_speed(iter/s)": 0.01614 |
| }, |
| { |
| "epoch": 2.197766497461929, |
| "grad_norm": 0.5862751780031482, |
| "learning_rate": 6.392679285244538e-05, |
| "loss": 0.043843358755111694, |
| "memory(GiB)": 36.91, |
| "step": 675, |
| "token_acc": 0.9854014598540146, |
| "train_speed(iter/s)": 0.016137 |
| }, |
| { |
| "epoch": 2.214010152284264, |
| "grad_norm": 0.7314774852745054, |
| "learning_rate": 6.340863063803188e-05, |
| "loss": 0.03051617741584778, |
| "memory(GiB)": 36.91, |
| "step": 680, |
| "token_acc": 0.9970326409495549, |
| "train_speed(iter/s)": 0.016136 |
| }, |
| { |
| "epoch": 2.230253807106599, |
| "grad_norm": 1.4305053109603272, |
| "learning_rate": 6.288891208199353e-05, |
| "loss": 0.03859332203865051, |
| "memory(GiB)": 36.91, |
| "step": 685, |
| "token_acc": 0.9813829787234043, |
| "train_speed(iter/s)": 0.016138 |
| }, |
| { |
| "epoch": 2.246497461928934, |
| "grad_norm": 1.2676862153868658, |
| "learning_rate": 6.23676975081355e-05, |
| "loss": 0.03608715534210205, |
| "memory(GiB)": 36.91, |
| "step": 690, |
| "token_acc": 0.9923076923076923, |
| "train_speed(iter/s)": 0.016143 |
| }, |
| { |
| "epoch": 2.262741116751269, |
| "grad_norm": 0.717797595223322, |
| "learning_rate": 6.184504741390596e-05, |
| "loss": 0.024200823903083802, |
| "memory(GiB)": 36.91, |
| "step": 695, |
| "token_acc": 0.9932885906040269, |
| "train_speed(iter/s)": 0.016142 |
| }, |
| { |
| "epoch": 2.278984771573604, |
| "grad_norm": 1.2738346733999926, |
| "learning_rate": 6.132102246337407e-05, |
| "loss": 0.04924860596656799, |
| "memory(GiB)": 36.91, |
| "step": 700, |
| "token_acc": 0.989769820971867, |
| "train_speed(iter/s)": 0.016144 |
| }, |
| { |
| "epoch": 2.2952284263959393, |
| "grad_norm": 0.9709229547354659, |
| "learning_rate": 6.079568348018882e-05, |
| "loss": 0.04101951122283935, |
| "memory(GiB)": 36.91, |
| "step": 705, |
| "token_acc": 0.9838709677419355, |
| "train_speed(iter/s)": 0.016145 |
| }, |
| { |
| "epoch": 2.311472081218274, |
| "grad_norm": 0.34074159031019935, |
| "learning_rate": 6.02690914405191e-05, |
| "loss": 0.012625060975551605, |
| "memory(GiB)": 36.91, |
| "step": 710, |
| "token_acc": 0.9893162393162394, |
| "train_speed(iter/s)": 0.016143 |
| }, |
| { |
| "epoch": 2.3277157360406093, |
| "grad_norm": 1.405033686903226, |
| "learning_rate": 5.974130746597628e-05, |
| "loss": 0.023314157128334047, |
| "memory(GiB)": 36.91, |
| "step": 715, |
| "token_acc": 0.9845261121856866, |
| "train_speed(iter/s)": 0.016146 |
| }, |
| { |
| "epoch": 2.343959390862944, |
| "grad_norm": 0.393622080479984, |
| "learning_rate": 5.921239281651976e-05, |
| "loss": 0.03884749114513397, |
| "memory(GiB)": 36.91, |
| "step": 720, |
| "token_acc": 0.9844961240310077, |
| "train_speed(iter/s)": 0.016147 |
| }, |
| { |
| "epoch": 2.360203045685279, |
| "grad_norm": 0.8205162732404321, |
| "learning_rate": 5.868240888334653e-05, |
| "loss": 0.0408410519361496, |
| "memory(GiB)": 36.91, |
| "step": 725, |
| "token_acc": 0.9696969696969697, |
| "train_speed(iter/s)": 0.016147 |
| }, |
| { |
| "epoch": 2.376446700507614, |
| "grad_norm": 0.9254262259522679, |
| "learning_rate": 5.815141718176549e-05, |
| "loss": 0.03491292595863342, |
| "memory(GiB)": 36.91, |
| "step": 730, |
| "token_acc": 0.9818731117824774, |
| "train_speed(iter/s)": 0.016148 |
| }, |
| { |
| "epoch": 2.3926903553299494, |
| "grad_norm": 0.4613013276623316, |
| "learning_rate": 5.761947934405736e-05, |
| "loss": 0.041343241930007935, |
| "memory(GiB)": 36.91, |
| "step": 735, |
| "token_acc": 0.9923076923076923, |
| "train_speed(iter/s)": 0.01615 |
| }, |
| { |
| "epoch": 2.4089340101522843, |
| "grad_norm": 0.5995425123829327, |
| "learning_rate": 5.708665711232103e-05, |
| "loss": 0.026265931129455567, |
| "memory(GiB)": 36.91, |
| "step": 740, |
| "token_acc": 0.980225988700565, |
| "train_speed(iter/s)": 0.016147 |
| }, |
| { |
| "epoch": 2.425177664974619, |
| "grad_norm": 0.8947399880614664, |
| "learning_rate": 5.655301233130711e-05, |
| "loss": 0.026338309049606323, |
| "memory(GiB)": 36.91, |
| "step": 745, |
| "token_acc": 0.9891304347826086, |
| "train_speed(iter/s)": 0.01615 |
| }, |
| { |
| "epoch": 2.4414213197969543, |
| "grad_norm": 0.6528954286261448, |
| "learning_rate": 5.6018606941239615e-05, |
| "loss": 0.031349584460258484, |
| "memory(GiB)": 36.91, |
| "step": 750, |
| "token_acc": 0.9825870646766169, |
| "train_speed(iter/s)": 0.016153 |
| }, |
| { |
| "epoch": 2.4576649746192896, |
| "grad_norm": 0.9124965491201447, |
| "learning_rate": 5.548350297062659e-05, |
| "loss": 0.04390305280685425, |
| "memory(GiB)": 36.91, |
| "step": 755, |
| "token_acc": 0.9971181556195965, |
| "train_speed(iter/s)": 0.016158 |
| }, |
| { |
| "epoch": 2.4739086294416244, |
| "grad_norm": 1.2758793187917294, |
| "learning_rate": 5.494776252906036e-05, |
| "loss": 0.03932673335075378, |
| "memory(GiB)": 36.91, |
| "step": 760, |
| "token_acc": 0.9852941176470589, |
| "train_speed(iter/s)": 0.016155 |
| }, |
| { |
| "epoch": 2.490152284263959, |
| "grad_norm": 1.6183527750946778, |
| "learning_rate": 5.44114478000086e-05, |
| "loss": 0.040107494592666625, |
| "memory(GiB)": 36.91, |
| "step": 765, |
| "token_acc": 0.980722891566265, |
| "train_speed(iter/s)": 0.01616 |
| }, |
| { |
| "epoch": 2.5063959390862944, |
| "grad_norm": 0.8155608212943981, |
| "learning_rate": 5.387462103359655e-05, |
| "loss": 0.034613233804702756, |
| "memory(GiB)": 36.91, |
| "step": 770, |
| "token_acc": 0.9809885931558935, |
| "train_speed(iter/s)": 0.016158 |
| }, |
| { |
| "epoch": 2.5226395939086297, |
| "grad_norm": 0.72914335142115, |
| "learning_rate": 5.333734453938174e-05, |
| "loss": 0.03472020030021668, |
| "memory(GiB)": 36.91, |
| "step": 775, |
| "token_acc": 0.980722891566265, |
| "train_speed(iter/s)": 0.016157 |
| }, |
| { |
| "epoch": 2.5388832487309645, |
| "grad_norm": 0.715640193227215, |
| "learning_rate": 5.279968067912161e-05, |
| "loss": 0.03267112672328949, |
| "memory(GiB)": 36.91, |
| "step": 780, |
| "token_acc": 0.9949109414758269, |
| "train_speed(iter/s)": 0.016159 |
| }, |
| { |
| "epoch": 2.5551269035532993, |
| "grad_norm": 0.5201766196940287, |
| "learning_rate": 5.226169185953532e-05, |
| "loss": 0.06324458122253418, |
| "memory(GiB)": 36.91, |
| "step": 785, |
| "token_acc": 0.9822784810126582, |
| "train_speed(iter/s)": 0.016157 |
| }, |
| { |
| "epoch": 2.5713705583756346, |
| "grad_norm": 0.716527670309396, |
| "learning_rate": 5.1723440525060026e-05, |
| "loss": 0.036973622441291806, |
| "memory(GiB)": 36.91, |
| "step": 790, |
| "token_acc": 0.9828009828009828, |
| "train_speed(iter/s)": 0.016157 |
| }, |
| { |
| "epoch": 2.58761421319797, |
| "grad_norm": 0.9508048665101771, |
| "learning_rate": 5.118498915060307e-05, |
| "loss": 0.04134515523910522, |
| "memory(GiB)": 36.91, |
| "step": 795, |
| "token_acc": 0.9832402234636871, |
| "train_speed(iter/s)": 0.016159 |
| }, |
| { |
| "epoch": 2.6038578680203046, |
| "grad_norm": 0.1695737988935869, |
| "learning_rate": 5.064640023429043e-05, |
| "loss": 0.0396234929561615, |
| "memory(GiB)": 36.91, |
| "step": 800, |
| "token_acc": 0.9937888198757764, |
| "train_speed(iter/s)": 0.01616 |
| }, |
| { |
| "epoch": 2.6201015228426394, |
| "grad_norm": 1.353410357397197, |
| "learning_rate": 5.0107736290212603e-05, |
| "loss": 0.032366597652435304, |
| "memory(GiB)": 36.91, |
| "step": 805, |
| "token_acc": 0.9853658536585366, |
| "train_speed(iter/s)": 0.016161 |
| }, |
| { |
| "epoch": 2.6363451776649747, |
| "grad_norm": 0.9287301884362714, |
| "learning_rate": 4.956905984116858e-05, |
| "loss": 0.02025129795074463, |
| "memory(GiB)": 36.91, |
| "step": 810, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016156 |
| }, |
| { |
| "epoch": 2.65258883248731, |
| "grad_norm": 0.6605215469870417, |
| "learning_rate": 4.903043341140879e-05, |
| "loss": 0.027498137950897217, |
| "memory(GiB)": 36.91, |
| "step": 815, |
| "token_acc": 0.9890590809628009, |
| "train_speed(iter/s)": 0.016158 |
| }, |
| { |
| "epoch": 2.6688324873096447, |
| "grad_norm": 1.284202747583917, |
| "learning_rate": 4.84919195193779e-05, |
| "loss": 0.04052730202674866, |
| "memory(GiB)": 36.91, |
| "step": 820, |
| "token_acc": 0.9691714836223507, |
| "train_speed(iter/s)": 0.016161 |
| }, |
| { |
| "epoch": 2.6850761421319795, |
| "grad_norm": 1.054572423840406, |
| "learning_rate": 4.7953580670458345e-05, |
| "loss": 0.029700332880020143, |
| "memory(GiB)": 36.91, |
| "step": 825, |
| "token_acc": 0.9903381642512077, |
| "train_speed(iter/s)": 0.016161 |
| }, |
| { |
| "epoch": 2.701319796954315, |
| "grad_norm": 1.515148160249309, |
| "learning_rate": 4.7415479349715275e-05, |
| "loss": 0.03995212614536285, |
| "memory(GiB)": 36.91, |
| "step": 830, |
| "token_acc": 0.9887005649717514, |
| "train_speed(iter/s)": 0.016163 |
| }, |
| { |
| "epoch": 2.7175634517766496, |
| "grad_norm": 0.7966857436927859, |
| "learning_rate": 4.687767801464388e-05, |
| "loss": 0.029492130875587462, |
| "memory(GiB)": 36.91, |
| "step": 835, |
| "token_acc": 0.9946091644204852, |
| "train_speed(iter/s)": 0.016162 |
| }, |
| { |
| "epoch": 2.733807106598985, |
| "grad_norm": 0.6747809015160623, |
| "learning_rate": 4.634023908791999e-05, |
| "loss": 0.028040975332260132, |
| "memory(GiB)": 36.91, |
| "step": 840, |
| "token_acc": 0.9950372208436724, |
| "train_speed(iter/s)": 0.016165 |
| }, |
| { |
| "epoch": 2.7500507614213197, |
| "grad_norm": 0.7236373548114289, |
| "learning_rate": 4.5803224950154656e-05, |
| "loss": 0.022182533144950868, |
| "memory(GiB)": 36.91, |
| "step": 845, |
| "token_acc": 0.9973753280839895, |
| "train_speed(iter/s)": 0.016167 |
| }, |
| { |
| "epoch": 2.766294416243655, |
| "grad_norm": 0.8702609694851884, |
| "learning_rate": 4.5266697932653616e-05, |
| "loss": 0.03542717695236206, |
| "memory(GiB)": 36.91, |
| "step": 850, |
| "token_acc": 0.9930394431554525, |
| "train_speed(iter/s)": 0.016168 |
| }, |
| { |
| "epoch": 2.7825380710659897, |
| "grad_norm": 0.2339976820774803, |
| "learning_rate": 4.473072031018248e-05, |
| "loss": 0.017447268962860106, |
| "memory(GiB)": 36.91, |
| "step": 855, |
| "token_acc": 0.9897172236503856, |
| "train_speed(iter/s)": 0.016172 |
| }, |
| { |
| "epoch": 2.798781725888325, |
| "grad_norm": 1.7564108472908913, |
| "learning_rate": 4.4195354293738484e-05, |
| "loss": 0.040924933552742, |
| "memory(GiB)": 36.91, |
| "step": 860, |
| "token_acc": 0.9693396226415094, |
| "train_speed(iter/s)": 0.016172 |
| }, |
| { |
| "epoch": 2.8150253807106598, |
| "grad_norm": 1.749637468786309, |
| "learning_rate": 4.366066202332974e-05, |
| "loss": 0.0398847758769989, |
| "memory(GiB)": 36.91, |
| "step": 865, |
| "token_acc": 0.9884726224783862, |
| "train_speed(iter/s)": 0.016173 |
| }, |
| { |
| "epoch": 2.831269035532995, |
| "grad_norm": 1.6657986428559317, |
| "learning_rate": 4.312670556076244e-05, |
| "loss": 0.027478563785552978, |
| "memory(GiB)": 36.91, |
| "step": 870, |
| "token_acc": 0.9953379953379954, |
| "train_speed(iter/s)": 0.016178 |
| }, |
| { |
| "epoch": 2.84751269035533, |
| "grad_norm": 0.8830417040757416, |
| "learning_rate": 4.259354688243757e-05, |
| "loss": 0.05422350764274597, |
| "memory(GiB)": 36.91, |
| "step": 875, |
| "token_acc": 0.9813953488372092, |
| "train_speed(iter/s)": 0.016176 |
| }, |
| { |
| "epoch": 2.863756345177665, |
| "grad_norm": 1.4037166255295264, |
| "learning_rate": 4.206124787215714e-05, |
| "loss": 0.03585241138935089, |
| "memory(GiB)": 36.91, |
| "step": 880, |
| "token_acc": 0.9929577464788732, |
| "train_speed(iter/s)": 0.016178 |
| }, |
| { |
| "epoch": 2.88, |
| "grad_norm": 0.40929439648007787, |
| "learning_rate": 4.1529870313941386e-05, |
| "loss": 0.037713998556137086, |
| "memory(GiB)": 36.91, |
| "step": 885, |
| "token_acc": 0.9755555555555555, |
| "train_speed(iter/s)": 0.016182 |
| }, |
| { |
| "epoch": 2.896243654822335, |
| "grad_norm": 0.5649136450093045, |
| "learning_rate": 4.099947588485744e-05, |
| "loss": 0.02235218584537506, |
| "memory(GiB)": 36.91, |
| "step": 890, |
| "token_acc": 0.9738562091503268, |
| "train_speed(iter/s)": 0.016179 |
| }, |
| { |
| "epoch": 2.91248730964467, |
| "grad_norm": 0.9411441260021843, |
| "learning_rate": 4.047012614786055e-05, |
| "loss": 0.03756971955299378, |
| "memory(GiB)": 36.91, |
| "step": 895, |
| "token_acc": 0.9953596287703016, |
| "train_speed(iter/s)": 0.016182 |
| }, |
| { |
| "epoch": 2.928730964467005, |
| "grad_norm": 0.493632814272918, |
| "learning_rate": 3.994188254464838e-05, |
| "loss": 0.03068949580192566, |
| "memory(GiB)": 36.91, |
| "step": 900, |
| "token_acc": 0.9681372549019608, |
| "train_speed(iter/s)": 0.016183 |
| }, |
| { |
| "epoch": 2.94497461928934, |
| "grad_norm": 0.9098057371042104, |
| "learning_rate": 3.941480638852948e-05, |
| "loss": 0.060313427448272706, |
| "memory(GiB)": 36.91, |
| "step": 905, |
| "token_acc": 0.9809976247030879, |
| "train_speed(iter/s)": 0.016186 |
| }, |
| { |
| "epoch": 2.9612182741116753, |
| "grad_norm": 0.7111307711774197, |
| "learning_rate": 3.888895885730666e-05, |
| "loss": 0.017010049521923067, |
| "memory(GiB)": 36.91, |
| "step": 910, |
| "token_acc": 0.9949748743718593, |
| "train_speed(iter/s)": 0.016184 |
| }, |
| { |
| "epoch": 2.97746192893401, |
| "grad_norm": 1.1085076966021257, |
| "learning_rate": 3.836440098617611e-05, |
| "loss": 0.0352476716041565, |
| "memory(GiB)": 36.91, |
| "step": 915, |
| "token_acc": 0.9971264367816092, |
| "train_speed(iter/s)": 0.016185 |
| }, |
| { |
| "epoch": 2.9937055837563453, |
| "grad_norm": 1.0414881730973389, |
| "learning_rate": 3.784119366064293e-05, |
| "loss": 0.036097651720046996, |
| "memory(GiB)": 36.91, |
| "step": 920, |
| "token_acc": 0.9859484777517564, |
| "train_speed(iter/s)": 0.016183 |
| }, |
| { |
| "epoch": 2.996954314720812, |
| "eval_loss": 0.2438431978225708, |
| "eval_runtime": 61.9093, |
| "eval_samples_per_second": 3.198, |
| "eval_steps_per_second": 0.808, |
| "eval_token_acc": 0.9426152398871119, |
| "step": 921 |
| }, |
| { |
| "epoch": 3.0123857868020303, |
| "grad_norm": 0.40292122284066784, |
| "learning_rate": 3.731939760945423e-05, |
| "loss": 0.02739437222480774, |
| "memory(GiB)": 36.91, |
| "step": 925, |
| "token_acc": 0.9686609686609686, |
| "train_speed(iter/s)": 0.016163 |
| }, |
| { |
| "epoch": 3.0286294416243655, |
| "grad_norm": 2.9493043319197345, |
| "learning_rate": 3.6799073397550324e-05, |
| "loss": 0.023541851341724394, |
| "memory(GiB)": 36.91, |
| "step": 930, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016168 |
| }, |
| { |
| "epoch": 3.0448730964467003, |
| "grad_norm": 0.17930096671859505, |
| "learning_rate": 3.628028141903493e-05, |
| "loss": 0.011585032194852829, |
| "memory(GiB)": 36.91, |
| "step": 935, |
| "token_acc": 0.9955849889624724, |
| "train_speed(iter/s)": 0.016168 |
| }, |
| { |
| "epoch": 3.0611167512690356, |
| "grad_norm": 0.32421421634457975, |
| "learning_rate": 3.576308189016521e-05, |
| "loss": 0.01218060329556465, |
| "memory(GiB)": 36.91, |
| "step": 940, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016169 |
| }, |
| { |
| "epoch": 3.0773604060913704, |
| "grad_norm": 0.6594419595560748, |
| "learning_rate": 3.5247534842362486e-05, |
| "loss": 0.02207506597042084, |
| "memory(GiB)": 36.91, |
| "step": 945, |
| "token_acc": 0.988558352402746, |
| "train_speed(iter/s)": 0.016162 |
| }, |
| { |
| "epoch": 3.0936040609137057, |
| "grad_norm": 0.2767332960437252, |
| "learning_rate": 3.473370011524435e-05, |
| "loss": 0.007218687236309052, |
| "memory(GiB)": 36.91, |
| "step": 950, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016165 |
| }, |
| { |
| "epoch": 3.1098477157360405, |
| "grad_norm": 0.35071543831944074, |
| "learning_rate": 3.422163734967913e-05, |
| "loss": 0.01153595745563507, |
| "memory(GiB)": 36.91, |
| "step": 955, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016173 |
| }, |
| { |
| "epoch": 3.1260913705583757, |
| "grad_norm": 0.09053944993100493, |
| "learning_rate": 3.371140598086332e-05, |
| "loss": 0.0028192587196826935, |
| "memory(GiB)": 36.91, |
| "step": 960, |
| "token_acc": 0.9975247524752475, |
| "train_speed(iter/s)": 0.016172 |
| }, |
| { |
| "epoch": 3.1423350253807105, |
| "grad_norm": 0.2428779518534084, |
| "learning_rate": 3.3203065231422904e-05, |
| "loss": 0.0033150166273117065, |
| "memory(GiB)": 36.91, |
| "step": 965, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016172 |
| }, |
| { |
| "epoch": 3.1585786802030458, |
| "grad_norm": 0.3634314044068558, |
| "learning_rate": 3.269667410453944e-05, |
| "loss": 0.006601892411708832, |
| "memory(GiB)": 36.91, |
| "step": 970, |
| "token_acc": 0.9974160206718347, |
| "train_speed(iter/s)": 0.016171 |
| }, |
| { |
| "epoch": 3.1748223350253806, |
| "grad_norm": 0.09528591509222967, |
| "learning_rate": 3.2192291377101544e-05, |
| "loss": 0.006571587175130844, |
| "memory(GiB)": 36.91, |
| "step": 975, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016173 |
| }, |
| { |
| "epoch": 3.191065989847716, |
| "grad_norm": 1.3857004471442305, |
| "learning_rate": 3.1689975592882603e-05, |
| "loss": 0.010420820116996765, |
| "memory(GiB)": 36.91, |
| "step": 980, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016176 |
| }, |
| { |
| "epoch": 3.2073096446700506, |
| "grad_norm": 0.6960737288379213, |
| "learning_rate": 3.11897850557456e-05, |
| "loss": 0.013220900297164917, |
| "memory(GiB)": 36.91, |
| "step": 985, |
| "token_acc": 0.9951807228915662, |
| "train_speed(iter/s)": 0.016176 |
| }, |
| { |
| "epoch": 3.223553299492386, |
| "grad_norm": 0.9453732221306024, |
| "learning_rate": 3.0691777822875846e-05, |
| "loss": 0.01793895959854126, |
| "memory(GiB)": 36.91, |
| "step": 990, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016176 |
| }, |
| { |
| "epoch": 3.2397969543147207, |
| "grad_norm": 0.7409992990444315, |
| "learning_rate": 3.019601169804216e-05, |
| "loss": 0.019229742884635925, |
| "memory(GiB)": 36.91, |
| "step": 995, |
| "token_acc": 0.9945054945054945, |
| "train_speed(iter/s)": 0.016174 |
| }, |
| { |
| "epoch": 3.256040609137056, |
| "grad_norm": 0.5679417621370911, |
| "learning_rate": 2.9702544224887684e-05, |
| "loss": 0.024555668234825134, |
| "memory(GiB)": 36.91, |
| "step": 1000, |
| "token_acc": 0.9953161592505855, |
| "train_speed(iter/s)": 0.016175 |
| }, |
| { |
| "epoch": 3.2722842639593908, |
| "grad_norm": 0.08818412948467023, |
| "learning_rate": 2.9211432680250717e-05, |
| "loss": 0.009600495547056198, |
| "memory(GiB)": 36.91, |
| "step": 1005, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016176 |
| }, |
| { |
| "epoch": 3.288527918781726, |
| "grad_norm": 0.597788232010352, |
| "learning_rate": 2.872273406751664e-05, |
| "loss": 0.015477313101291657, |
| "memory(GiB)": 36.91, |
| "step": 1010, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016178 |
| }, |
| { |
| "epoch": 3.304771573604061, |
| "grad_norm": 0.5815875303347526, |
| "learning_rate": 2.823650511000142e-05, |
| "loss": 0.007314224541187286, |
| "memory(GiB)": 36.91, |
| "step": 1015, |
| "token_acc": 0.9928741092636579, |
| "train_speed(iter/s)": 0.016175 |
| }, |
| { |
| "epoch": 3.321015228426396, |
| "grad_norm": 0.06303638116527722, |
| "learning_rate": 2.7752802244367875e-05, |
| "loss": 0.0048162821680307385, |
| "memory(GiB)": 36.91, |
| "step": 1020, |
| "token_acc": 0.9976359338061466, |
| "train_speed(iter/s)": 0.016175 |
| }, |
| { |
| "epoch": 3.337258883248731, |
| "grad_norm": 1.530822467857818, |
| "learning_rate": 2.7271681614074973e-05, |
| "loss": 0.011756302416324615, |
| "memory(GiB)": 36.91, |
| "step": 1025, |
| "token_acc": 0.9976744186046511, |
| "train_speed(iter/s)": 0.016173 |
| }, |
| { |
| "epoch": 3.353502538071066, |
| "grad_norm": 0.03790601751186608, |
| "learning_rate": 2.679319906286122e-05, |
| "loss": 0.008612405508756638, |
| "memory(GiB)": 36.91, |
| "step": 1030, |
| "token_acc": 0.9927184466019418, |
| "train_speed(iter/s)": 0.016176 |
| }, |
| { |
| "epoch": 3.369746192893401, |
| "grad_norm": 0.21401768725028367, |
| "learning_rate": 2.6317410128262954e-05, |
| "loss": 0.006316320598125457, |
| "memory(GiB)": 36.91, |
| "step": 1035, |
| "token_acc": 0.9950124688279302, |
| "train_speed(iter/s)": 0.016179 |
| }, |
| { |
| "epoch": 3.385989847715736, |
| "grad_norm": 0.19540220508166592, |
| "learning_rate": 2.5844370035168073e-05, |
| "loss": 0.004939628392457962, |
| "memory(GiB)": 36.91, |
| "step": 1040, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016181 |
| }, |
| { |
| "epoch": 3.402233502538071, |
| "grad_norm": 0.8965894055639708, |
| "learning_rate": 2.537413368940601e-05, |
| "loss": 0.016151268780231477, |
| "memory(GiB)": 36.91, |
| "step": 1045, |
| "token_acc": 0.9898785425101214, |
| "train_speed(iter/s)": 0.016182 |
| }, |
| { |
| "epoch": 3.4184771573604062, |
| "grad_norm": 0.21427146738429803, |
| "learning_rate": 2.4906755671374903e-05, |
| "loss": 0.010773959755897521, |
| "memory(GiB)": 36.91, |
| "step": 1050, |
| "token_acc": 0.9977827050997783, |
| "train_speed(iter/s)": 0.016182 |
| }, |
| { |
| "epoch": 3.434720812182741, |
| "grad_norm": 0.09286838269357345, |
| "learning_rate": 2.4442290229706344e-05, |
| "loss": 0.004091666638851165, |
| "memory(GiB)": 36.91, |
| "step": 1055, |
| "token_acc": 0.9954233409610984, |
| "train_speed(iter/s)": 0.016183 |
| }, |
| { |
| "epoch": 3.4509644670050763, |
| "grad_norm": 0.13489614133107514, |
| "learning_rate": 2.3980791274968837e-05, |
| "loss": 0.018990179896354674, |
| "memory(GiB)": 36.91, |
| "step": 1060, |
| "token_acc": 0.9945054945054945, |
| "train_speed(iter/s)": 0.016184 |
| }, |
| { |
| "epoch": 3.467208121827411, |
| "grad_norm": 0.1825955700626613, |
| "learning_rate": 2.3522312373410276e-05, |
| "loss": 0.011526491492986679, |
| "memory(GiB)": 36.91, |
| "step": 1065, |
| "token_acc": 0.997275204359673, |
| "train_speed(iter/s)": 0.016188 |
| }, |
| { |
| "epoch": 3.4834517766497464, |
| "grad_norm": 0.2440094791459664, |
| "learning_rate": 2.3066906740740623e-05, |
| "loss": 0.019795812666416168, |
| "memory(GiB)": 36.91, |
| "step": 1070, |
| "token_acc": 0.9896373056994818, |
| "train_speed(iter/s)": 0.016187 |
| }, |
| { |
| "epoch": 3.499695431472081, |
| "grad_norm": 0.4913730237430669, |
| "learning_rate": 2.2614627235955026e-05, |
| "loss": 0.007270602881908417, |
| "memory(GiB)": 36.91, |
| "step": 1075, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016189 |
| }, |
| { |
| "epoch": 3.5159390862944164, |
| "grad_norm": 0.6922284750457558, |
| "learning_rate": 2.2165526355198605e-05, |
| "loss": 0.0127563938498497, |
| "memory(GiB)": 36.91, |
| "step": 1080, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016191 |
| }, |
| { |
| "epoch": 3.5321827411167512, |
| "grad_norm": 0.6450602563278425, |
| "learning_rate": 2.171965622567308e-05, |
| "loss": 0.007853203266859055, |
| "memory(GiB)": 36.91, |
| "step": 1085, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016193 |
| }, |
| { |
| "epoch": 3.548426395939086, |
| "grad_norm": 0.3234875973475892, |
| "learning_rate": 2.127706859958647e-05, |
| "loss": 0.008352670073509216, |
| "memory(GiB)": 36.91, |
| "step": 1090, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016193 |
| }, |
| { |
| "epoch": 3.5646700507614213, |
| "grad_norm": 0.09371017997182811, |
| "learning_rate": 2.0837814848146166e-05, |
| "loss": 0.001982194371521473, |
| "memory(GiB)": 36.91, |
| "step": 1095, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016191 |
| }, |
| { |
| "epoch": 3.5809137055837565, |
| "grad_norm": 0.8724610494447905, |
| "learning_rate": 2.0401945955596206e-05, |
| "loss": 0.0030656153336167335, |
| "memory(GiB)": 36.91, |
| "step": 1100, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016188 |
| }, |
| { |
| "epoch": 3.5971573604060914, |
| "grad_norm": 0.5650605008223917, |
| "learning_rate": 1.9969512513299664e-05, |
| "loss": 0.00554112084209919, |
| "memory(GiB)": 36.91, |
| "step": 1105, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.01619 |
| }, |
| { |
| "epoch": 3.613401015228426, |
| "grad_norm": 0.39939968413297244, |
| "learning_rate": 1.9540564713866387e-05, |
| "loss": 0.006034587323665619, |
| "memory(GiB)": 36.91, |
| "step": 1110, |
| "token_acc": 0.9948586118251928, |
| "train_speed(iter/s)": 0.016191 |
| }, |
| { |
| "epoch": 3.6296446700507614, |
| "grad_norm": 0.1065247660653177, |
| "learning_rate": 1.9115152345327152e-05, |
| "loss": 0.005482121184468269, |
| "memory(GiB)": 36.91, |
| "step": 1115, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016191 |
| }, |
| { |
| "epoch": 3.6458883248730967, |
| "grad_norm": 0.8174090560458377, |
| "learning_rate": 1.8693324785354822e-05, |
| "loss": 0.011324305832386018, |
| "memory(GiB)": 36.91, |
| "step": 1120, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016193 |
| }, |
| { |
| "epoch": 3.6621319796954315, |
| "grad_norm": 0.17850770204119407, |
| "learning_rate": 1.8275130995532974e-05, |
| "loss": 0.0144767165184021, |
| "memory(GiB)": 36.91, |
| "step": 1125, |
| "token_acc": 0.9978586723768736, |
| "train_speed(iter/s)": 0.016195 |
| }, |
| { |
| "epoch": 3.6783756345177663, |
| "grad_norm": 0.33877743892749795, |
| "learning_rate": 1.7860619515673033e-05, |
| "loss": 0.01116895154118538, |
| "memory(GiB)": 36.91, |
| "step": 1130, |
| "token_acc": 0.9953271028037384, |
| "train_speed(iter/s)": 0.016195 |
| }, |
| { |
| "epoch": 3.6946192893401015, |
| "grad_norm": 0.5168488777536275, |
| "learning_rate": 1.744983845818019e-05, |
| "loss": 0.0068625412881374356, |
| "memory(GiB)": 36.91, |
| "step": 1135, |
| "token_acc": 0.9978213507625272, |
| "train_speed(iter/s)": 0.0162 |
| }, |
| { |
| "epoch": 3.710862944162437, |
| "grad_norm": 0.7346145409084535, |
| "learning_rate": 1.7042835502468934e-05, |
| "loss": 0.002322973683476448, |
| "memory(GiB)": 36.91, |
| "step": 1140, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016197 |
| }, |
| { |
| "epoch": 3.7271065989847716, |
| "grad_norm": 0.6646625028373466, |
| "learning_rate": 1.6639657889429018e-05, |
| "loss": 0.018248292803764343, |
| "memory(GiB)": 36.91, |
| "step": 1145, |
| "token_acc": 0.9840182648401826, |
| "train_speed(iter/s)": 0.016195 |
| }, |
| { |
| "epoch": 3.7433502538071064, |
| "grad_norm": 0.8354437881107281, |
| "learning_rate": 1.624035241594213e-05, |
| "loss": 0.006459401547908783, |
| "memory(GiB)": 36.91, |
| "step": 1150, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016193 |
| }, |
| { |
| "epoch": 3.7595939086294416, |
| "grad_norm": 0.2958093671449778, |
| "learning_rate": 1.5844965429450132e-05, |
| "loss": 0.008441635966300964, |
| "memory(GiB)": 36.91, |
| "step": 1155, |
| "token_acc": 0.9834368530020704, |
| "train_speed(iter/s)": 0.016192 |
| }, |
| { |
| "epoch": 3.775837563451777, |
| "grad_norm": 0.4306627690474224, |
| "learning_rate": 1.545354282257562e-05, |
| "loss": 0.015231077373027802, |
| "memory(GiB)": 36.91, |
| "step": 1160, |
| "token_acc": 0.9976851851851852, |
| "train_speed(iter/s)": 0.016196 |
| }, |
| { |
| "epoch": 3.7920812182741117, |
| "grad_norm": 0.0801666210860899, |
| "learning_rate": 1.5066130027795044e-05, |
| "loss": 0.02225690186023712, |
| "memory(GiB)": 36.91, |
| "step": 1165, |
| "token_acc": 0.9886363636363636, |
| "train_speed(iter/s)": 0.0162 |
| }, |
| { |
| "epoch": 3.8083248730964465, |
| "grad_norm": 1.390297822775598, |
| "learning_rate": 1.4682772012165436e-05, |
| "loss": 0.011767344176769256, |
| "memory(GiB)": 36.91, |
| "step": 1170, |
| "token_acc": 0.9953810623556582, |
| "train_speed(iter/s)": 0.0162 |
| }, |
| { |
| "epoch": 3.8245685279187818, |
| "grad_norm": 0.576269037629794, |
| "learning_rate": 1.4303513272105057e-05, |
| "loss": 0.01135575920343399, |
| "memory(GiB)": 36.91, |
| "step": 1175, |
| "token_acc": 0.9976744186046511, |
| "train_speed(iter/s)": 0.016199 |
| }, |
| { |
| "epoch": 3.840812182741117, |
| "grad_norm": 0.6175307257021349, |
| "learning_rate": 1.3928397828228628e-05, |
| "loss": 0.00802643597126007, |
| "memory(GiB)": 36.91, |
| "step": 1180, |
| "token_acc": 0.9950738916256158, |
| "train_speed(iter/s)": 0.016201 |
| }, |
| { |
| "epoch": 3.857055837563452, |
| "grad_norm": 0.13098006216818975, |
| "learning_rate": 1.3557469220237962e-05, |
| "loss": 0.011502113938331605, |
| "memory(GiB)": 36.91, |
| "step": 1185, |
| "token_acc": 0.9935344827586207, |
| "train_speed(iter/s)": 0.016204 |
| }, |
| { |
| "epoch": 3.8732994923857866, |
| "grad_norm": 0.3987654668677921, |
| "learning_rate": 1.3190770501868243e-05, |
| "loss": 0.011363585293293, |
| "memory(GiB)": 36.91, |
| "step": 1190, |
| "token_acc": 0.9974160206718347, |
| "train_speed(iter/s)": 0.016203 |
| }, |
| { |
| "epoch": 3.889543147208122, |
| "grad_norm": 0.14976124575026759, |
| "learning_rate": 1.2828344235890726e-05, |
| "loss": 0.01089974120259285, |
| "memory(GiB)": 36.91, |
| "step": 1195, |
| "token_acc": 0.9933481152993349, |
| "train_speed(iter/s)": 0.016203 |
| }, |
| { |
| "epoch": 3.9057868020304567, |
| "grad_norm": 1.5199866835408566, |
| "learning_rate": 1.247023248917259e-05, |
| "loss": 0.009822697192430497, |
| "memory(GiB)": 36.91, |
| "step": 1200, |
| "token_acc": 0.9929742388758782, |
| "train_speed(iter/s)": 0.016204 |
| }, |
| { |
| "epoch": 3.922030456852792, |
| "grad_norm": 1.6580131250235997, |
| "learning_rate": 1.2116476827794104e-05, |
| "loss": 0.024014970660209654, |
| "memory(GiB)": 36.91, |
| "step": 1205, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016204 |
| }, |
| { |
| "epoch": 3.9382741116751268, |
| "grad_norm": 1.157754837023021, |
| "learning_rate": 1.1767118312224151e-05, |
| "loss": 0.007532584667205811, |
| "memory(GiB)": 36.91, |
| "step": 1210, |
| "token_acc": 0.9972375690607734, |
| "train_speed(iter/s)": 0.016207 |
| }, |
| { |
| "epoch": 3.954517766497462, |
| "grad_norm": 0.6972765226059477, |
| "learning_rate": 1.142219749255427e-05, |
| "loss": 0.004430451989173889, |
| "memory(GiB)": 36.91, |
| "step": 1215, |
| "token_acc": 0.9972677595628415, |
| "train_speed(iter/s)": 0.016207 |
| }, |
| { |
| "epoch": 3.970761421319797, |
| "grad_norm": 2.2979580480692188, |
| "learning_rate": 1.1081754403791999e-05, |
| "loss": 0.015141716599464417, |
| "memory(GiB)": 36.91, |
| "step": 1220, |
| "token_acc": 0.9954337899543378, |
| "train_speed(iter/s)": 0.016206 |
| }, |
| { |
| "epoch": 3.987005076142132, |
| "grad_norm": 0.2965970510784761, |
| "learning_rate": 1.0745828561214056e-05, |
| "loss": 0.021216361224651335, |
| "memory(GiB)": 36.91, |
| "step": 1225, |
| "token_acc": 0.9954337899543378, |
| "train_speed(iter/s)": 0.016206 |
| }, |
| { |
| "epoch": 3.996751269035533, |
| "eval_loss": 0.29802748560905457, |
| "eval_runtime": 62.08, |
| "eval_samples_per_second": 3.189, |
| "eval_steps_per_second": 0.805, |
| "eval_token_acc": 0.9388523047977423, |
| "step": 1228 |
| }, |
| { |
| "epoch": 4.0056852791878175, |
| "grad_norm": 0.7419564842144963, |
| "learning_rate": 1.041445895577977e-05, |
| "loss": 0.009254975616931916, |
| "memory(GiB)": 36.91, |
| "step": 1230, |
| "token_acc": 0.9668174962292609, |
| "train_speed(iter/s)": 0.016191 |
| }, |
| { |
| "epoch": 4.021928934010153, |
| "grad_norm": 0.1343462548929871, |
| "learning_rate": 1.008768404960535e-05, |
| "loss": 0.002759779617190361, |
| "memory(GiB)": 36.91, |
| "step": 1235, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016192 |
| }, |
| { |
| "epoch": 4.038172588832487, |
| "grad_norm": 0.08229350773537837, |
| "learning_rate": 9.765541771499659e-06, |
| "loss": 0.0012123636901378632, |
| "memory(GiB)": 36.91, |
| "step": 1240, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016192 |
| }, |
| { |
| "epoch": 4.054416243654822, |
| "grad_norm": 0.08190000464747839, |
| "learning_rate": 9.448069512561775e-06, |
| "loss": 0.0066297553479671475, |
| "memory(GiB)": 36.91, |
| "step": 1245, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016195 |
| }, |
| { |
| "epoch": 4.070659898477158, |
| "grad_norm": 0.12397302242146173, |
| "learning_rate": 9.135304121840976e-06, |
| "loss": 0.0012923330999910832, |
| "memory(GiB)": 36.91, |
| "step": 1250, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016192 |
| }, |
| { |
| "epoch": 4.086903553299492, |
| "grad_norm": 0.057048418793994596, |
| "learning_rate": 8.827281902059698e-06, |
| "loss": 0.0007107659243047237, |
| "memory(GiB)": 36.91, |
| "step": 1255, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016191 |
| }, |
| { |
| "epoch": 4.103147208121827, |
| "grad_norm": 0.16324844745357645, |
| "learning_rate": 8.524038605399886e-06, |
| "loss": 0.0021383626386523246, |
| "memory(GiB)": 36.91, |
| "step": 1260, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016189 |
| }, |
| { |
| "epoch": 4.1193908629441625, |
| "grad_norm": 0.06874787839714207, |
| "learning_rate": 8.225609429353187e-06, |
| "loss": 0.0028022559359669684, |
| "memory(GiB)": 36.91, |
| "step": 1265, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016188 |
| }, |
| { |
| "epoch": 4.135634517766498, |
| "grad_norm": 0.2526140368602798, |
| "learning_rate": 7.932029012635623e-06, |
| "loss": 0.003260459750890732, |
| "memory(GiB)": 36.91, |
| "step": 1270, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016187 |
| }, |
| { |
| "epoch": 4.151878172588832, |
| "grad_norm": 0.14918347721067196, |
| "learning_rate": 7.643331431167017e-06, |
| "loss": 0.004188637435436249, |
| "memory(GiB)": 36.91, |
| "step": 1275, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016188 |
| }, |
| { |
| "epoch": 4.168121827411167, |
| "grad_norm": 0.46928271799249704, |
| "learning_rate": 7.35955019411585e-06, |
| "loss": 0.011932872980833054, |
| "memory(GiB)": 36.91, |
| "step": 1280, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016191 |
| }, |
| { |
| "epoch": 4.184365482233503, |
| "grad_norm": 0.07080459315091195, |
| "learning_rate": 7.080718240009826e-06, |
| "loss": 0.004019932448863983, |
| "memory(GiB)": 36.91, |
| "step": 1285, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.01619 |
| }, |
| { |
| "epoch": 4.200609137055838, |
| "grad_norm": 0.7271340874397169, |
| "learning_rate": 6.806867932912653e-06, |
| "loss": 0.0061328854411840435, |
| "memory(GiB)": 36.91, |
| "step": 1290, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016194 |
| }, |
| { |
| "epoch": 4.216852791878172, |
| "grad_norm": 0.1265328539578886, |
| "learning_rate": 6.53803105866761e-06, |
| "loss": 0.006417517364025116, |
| "memory(GiB)": 36.91, |
| "step": 1295, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016196 |
| }, |
| { |
| "epoch": 4.233096446700507, |
| "grad_norm": 0.057288978819073196, |
| "learning_rate": 6.274238821208128e-06, |
| "loss": 0.003987757861614228, |
| "memory(GiB)": 36.91, |
| "step": 1300, |
| "token_acc": 0.9975062344139651, |
| "train_speed(iter/s)": 0.016195 |
| }, |
| { |
| "epoch": 4.249340101522843, |
| "grad_norm": 0.1481683428098521, |
| "learning_rate": 6.015521838935905e-06, |
| "loss": 0.0010721445083618163, |
| "memory(GiB)": 36.91, |
| "step": 1305, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016194 |
| }, |
| { |
| "epoch": 4.265583756345178, |
| "grad_norm": 0.10590383120253814, |
| "learning_rate": 5.7619101411671095e-06, |
| "loss": 0.002213609591126442, |
| "memory(GiB)": 36.91, |
| "step": 1310, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016195 |
| }, |
| { |
| "epoch": 4.281827411167512, |
| "grad_norm": 0.04714189372424805, |
| "learning_rate": 5.513433164646814e-06, |
| "loss": 0.0011348580941557885, |
| "memory(GiB)": 36.91, |
| "step": 1315, |
| "token_acc": 0.9976689976689976, |
| "train_speed(iter/s)": 0.016199 |
| }, |
| { |
| "epoch": 4.298071065989848, |
| "grad_norm": 0.476391282204877, |
| "learning_rate": 5.270119750132258e-06, |
| "loss": 0.004196888953447342, |
| "memory(GiB)": 36.91, |
| "step": 1320, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016201 |
| }, |
| { |
| "epoch": 4.314314720812183, |
| "grad_norm": 0.35042552841819846, |
| "learning_rate": 5.031998139045352e-06, |
| "loss": 0.0034095611423254012, |
| "memory(GiB)": 36.91, |
| "step": 1325, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016203 |
| }, |
| { |
| "epoch": 4.330558375634518, |
| "grad_norm": 0.05524764971116243, |
| "learning_rate": 4.799095970194628e-06, |
| "loss": 0.0037711452692747115, |
| "memory(GiB)": 36.91, |
| "step": 1330, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016203 |
| }, |
| { |
| "epoch": 4.346802030456852, |
| "grad_norm": 0.5445980593755461, |
| "learning_rate": 4.571440276567257e-06, |
| "loss": 0.0024499524384737014, |
| "memory(GiB)": 36.91, |
| "step": 1335, |
| "token_acc": 0.997624703087886, |
| "train_speed(iter/s)": 0.016206 |
| }, |
| { |
| "epoch": 4.363045685279188, |
| "grad_norm": 0.10598886435572437, |
| "learning_rate": 4.349057482191299e-06, |
| "loss": 0.004410183429718018, |
| "memory(GiB)": 36.91, |
| "step": 1340, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016206 |
| }, |
| { |
| "epoch": 4.379289340101523, |
| "grad_norm": 0.04699969388550453, |
| "learning_rate": 4.1319733990686446e-06, |
| "loss": 0.0011100947856903076, |
| "memory(GiB)": 36.91, |
| "step": 1345, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016205 |
| }, |
| { |
| "epoch": 4.395532994923858, |
| "grad_norm": 0.017045928815902597, |
| "learning_rate": 3.920213224179042e-06, |
| "loss": 0.00034863052424043416, |
| "memory(GiB)": 36.91, |
| "step": 1350, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016206 |
| }, |
| { |
| "epoch": 4.4117766497461925, |
| "grad_norm": 0.7161935581935048, |
| "learning_rate": 3.7138015365554833e-06, |
| "loss": 0.0035605177283287047, |
| "memory(GiB)": 36.91, |
| "step": 1355, |
| "token_acc": 0.9977220956719818, |
| "train_speed(iter/s)": 0.016207 |
| }, |
| { |
| "epoch": 4.428020304568528, |
| "grad_norm": 0.06887525802872778, |
| "learning_rate": 3.512762294431271e-06, |
| "loss": 0.006134101003408432, |
| "memory(GiB)": 36.91, |
| "step": 1360, |
| "token_acc": 0.9975186104218362, |
| "train_speed(iter/s)": 0.016208 |
| }, |
| { |
| "epoch": 4.444263959390863, |
| "grad_norm": 0.041826315852571724, |
| "learning_rate": 3.3171188324592427e-06, |
| "loss": 0.0012344198301434516, |
| "memory(GiB)": 36.91, |
| "step": 1365, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016208 |
| }, |
| { |
| "epoch": 4.460507614213198, |
| "grad_norm": 0.07787992465189252, |
| "learning_rate": 3.126893859003249e-06, |
| "loss": 0.0013754777610301971, |
| "memory(GiB)": 36.91, |
| "step": 1370, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016208 |
| }, |
| { |
| "epoch": 4.476751269035533, |
| "grad_norm": 0.9611581457799497, |
| "learning_rate": 2.9421094535024507e-06, |
| "loss": 0.004121043905615807, |
| "memory(GiB)": 36.91, |
| "step": 1375, |
| "token_acc": 0.9933920704845814, |
| "train_speed(iter/s)": 0.016206 |
| }, |
| { |
| "epoch": 4.492994923857868, |
| "grad_norm": 0.11072593270596472, |
| "learning_rate": 2.762787063908523e-06, |
| "loss": 0.0024029091000556946, |
| "memory(GiB)": 36.91, |
| "step": 1380, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016205 |
| }, |
| { |
| "epoch": 4.509238578680203, |
| "grad_norm": 0.02340550565254115, |
| "learning_rate": 2.5889475041961765e-06, |
| "loss": 0.001028289459645748, |
| "memory(GiB)": 36.91, |
| "step": 1385, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.01621 |
| }, |
| { |
| "epoch": 4.525482233502538, |
| "grad_norm": 0.08895116218405089, |
| "learning_rate": 2.4206109519473163e-06, |
| "loss": 0.0021161407232284544, |
| "memory(GiB)": 36.91, |
| "step": 1390, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016211 |
| }, |
| { |
| "epoch": 4.541725888324873, |
| "grad_norm": 0.24076601170504602, |
| "learning_rate": 2.2577969460089997e-06, |
| "loss": 0.0007429494522511959, |
| "memory(GiB)": 36.91, |
| "step": 1395, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016212 |
| }, |
| { |
| "epoch": 4.557969543147208, |
| "grad_norm": 0.19664829308024404, |
| "learning_rate": 2.100524384225555e-06, |
| "loss": 0.0008249727077782154, |
| "memory(GiB)": 36.91, |
| "step": 1400, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.01621 |
| }, |
| { |
| "epoch": 4.574213197969543, |
| "grad_norm": 0.06599531052332817, |
| "learning_rate": 1.948811521245131e-06, |
| "loss": 0.000786225963383913, |
| "memory(GiB)": 36.91, |
| "step": 1405, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016209 |
| }, |
| { |
| "epoch": 4.5904568527918785, |
| "grad_norm": 0.10702737644857346, |
| "learning_rate": 1.8026759664008465e-06, |
| "loss": 0.003063713386654854, |
| "memory(GiB)": 36.91, |
| "step": 1410, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016207 |
| }, |
| { |
| "epoch": 4.606700507614213, |
| "grad_norm": 0.41678449867799244, |
| "learning_rate": 1.6621346816668992e-06, |
| "loss": 0.00532943345606327, |
| "memory(GiB)": 36.91, |
| "step": 1415, |
| "token_acc": 0.9937629937629938, |
| "train_speed(iter/s)": 0.016207 |
| }, |
| { |
| "epoch": 4.622944162436548, |
| "grad_norm": 0.029982460463042173, |
| "learning_rate": 1.5272039796897786e-06, |
| "loss": 0.0017097776755690575, |
| "memory(GiB)": 36.91, |
| "step": 1420, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016209 |
| }, |
| { |
| "epoch": 4.639187817258883, |
| "grad_norm": 0.03591858354249925, |
| "learning_rate": 1.397899521894841e-06, |
| "loss": 0.0013645312748849392, |
| "memory(GiB)": 36.91, |
| "step": 1425, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016212 |
| }, |
| { |
| "epoch": 4.655431472081219, |
| "grad_norm": 0.04773799774300644, |
| "learning_rate": 1.2742363166685034e-06, |
| "loss": 0.0009639391675591469, |
| "memory(GiB)": 36.91, |
| "step": 1430, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.01621 |
| }, |
| { |
| "epoch": 4.671675126903553, |
| "grad_norm": 0.129000803673704, |
| "learning_rate": 1.15622871761622e-06, |
| "loss": 0.0005136763211339712, |
| "memory(GiB)": 36.91, |
| "step": 1435, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016212 |
| }, |
| { |
| "epoch": 4.687918781725888, |
| "grad_norm": 0.029179325549530243, |
| "learning_rate": 1.0438904218964319e-06, |
| "loss": 0.0004105303902179003, |
| "memory(GiB)": 36.91, |
| "step": 1440, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016213 |
| }, |
| { |
| "epoch": 4.7041624365482235, |
| "grad_norm": 0.04897256940654327, |
| "learning_rate": 9.372344686307655e-07, |
| "loss": 0.0009922079741954803, |
| "memory(GiB)": 36.91, |
| "step": 1445, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016212 |
| }, |
| { |
| "epoch": 4.720406091370558, |
| "grad_norm": 0.0393178010532892, |
| "learning_rate": 8.362732373905723e-07, |
| "loss": 0.0008288329467177391, |
| "memory(GiB)": 36.91, |
| "step": 1450, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016214 |
| }, |
| { |
| "epoch": 4.736649746192893, |
| "grad_norm": 0.08771738931354985, |
| "learning_rate": 7.410184467600001e-07, |
| "loss": 0.0005111692938953638, |
| "memory(GiB)": 36.91, |
| "step": 1455, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016214 |
| }, |
| { |
| "epoch": 4.752893401015228, |
| "grad_norm": 0.04916799951696976, |
| "learning_rate": 6.514811529758747e-07, |
| "loss": 0.007441927492618561, |
| "memory(GiB)": 36.91, |
| "step": 1460, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016214 |
| }, |
| { |
| "epoch": 4.769137055837564, |
| "grad_norm": 0.44716598217302617, |
| "learning_rate": 5.676717486443439e-07, |
| "loss": 0.0024275451898574827, |
| "memory(GiB)": 36.91, |
| "step": 1465, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016214 |
| }, |
| { |
| "epoch": 4.785380710659899, |
| "grad_norm": 0.12117859136787597, |
| "learning_rate": 4.895999615346314e-07, |
| "loss": 0.001637093722820282, |
| "memory(GiB)": 36.91, |
| "step": 1470, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016214 |
| }, |
| { |
| "epoch": 4.801624365482233, |
| "grad_norm": 0.01706819131966345, |
| "learning_rate": 4.1727485344994486e-07, |
| "loss": 0.0003483247943222523, |
| "memory(GiB)": 36.91, |
| "step": 1475, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016212 |
| }, |
| { |
| "epoch": 4.8178680203045685, |
| "grad_norm": 0.04859669108953238, |
| "learning_rate": 3.507048191756401e-07, |
| "loss": 0.0021356761455535887, |
| "memory(GiB)": 36.91, |
| "step": 1480, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016212 |
| }, |
| { |
| "epoch": 4.834111675126904, |
| "grad_norm": 0.03682429514387162, |
| "learning_rate": 2.8989758550487245e-07, |
| "loss": 0.0021858945488929748, |
| "memory(GiB)": 36.91, |
| "step": 1485, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016213 |
| }, |
| { |
| "epoch": 4.850355329949238, |
| "grad_norm": 0.06507640939116277, |
| "learning_rate": 2.3486021034170857e-07, |
| "loss": 0.002923069894313812, |
| "memory(GiB)": 36.91, |
| "step": 1490, |
| "token_acc": 0.9977064220183486, |
| "train_speed(iter/s)": 0.016212 |
| }, |
| { |
| "epoch": 4.866598984771573, |
| "grad_norm": 0.04259804746440851, |
| "learning_rate": 1.8559908188195418e-07, |
| "loss": 0.0019719479605555534, |
| "memory(GiB)": 36.91, |
| "step": 1495, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016212 |
| }, |
| { |
| "epoch": 4.882842639593909, |
| "grad_norm": 0.25393381486977334, |
| "learning_rate": 1.4211991787164147e-07, |
| "loss": 0.0011512625962495804, |
| "memory(GiB)": 36.91, |
| "step": 1500, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016215 |
| }, |
| { |
| "epoch": 4.899086294416244, |
| "grad_norm": 0.21720000107148496, |
| "learning_rate": 1.044277649433989e-07, |
| "loss": 0.003379678726196289, |
| "memory(GiB)": 36.91, |
| "step": 1505, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016213 |
| }, |
| { |
| "epoch": 4.915329949238579, |
| "grad_norm": 0.6636335728606932, |
| "learning_rate": 7.252699803065311e-08, |
| "loss": 0.014554958045482635, |
| "memory(GiB)": 36.91, |
| "step": 1510, |
| "token_acc": 0.9886792452830189, |
| "train_speed(iter/s)": 0.016211 |
| }, |
| { |
| "epoch": 4.9315736040609135, |
| "grad_norm": 0.042674818413491626, |
| "learning_rate": 4.6421319859862864e-08, |
| "loss": 0.0024311095476150513, |
| "memory(GiB)": 36.91, |
| "step": 1515, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016212 |
| }, |
| { |
| "epoch": 4.947817258883249, |
| "grad_norm": 0.07981897617268605, |
| "learning_rate": 2.6113760520735108e-08, |
| "loss": 0.0024462098255753515, |
| "memory(GiB)": 36.91, |
| "step": 1520, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.01621 |
| }, |
| { |
| "epoch": 4.964060913705584, |
| "grad_norm": 0.01695528976036472, |
| "learning_rate": 1.1606677114500697e-08, |
| "loss": 0.011407441645860671, |
| "memory(GiB)": 36.91, |
| "step": 1525, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.01621 |
| }, |
| { |
| "epoch": 4.980304568527918, |
| "grad_norm": 0.05383783400729952, |
| "learning_rate": 2.901753480361036e-09, |
| "loss": 0.005226198583841324, |
| "memory(GiB)": 36.91, |
| "step": 1530, |
| "token_acc": 0.9956521739130435, |
| "train_speed(iter/s)": 0.01621 |
| }, |
| { |
| "epoch": 4.996548223350254, |
| "grad_norm": 0.9774296313594534, |
| "learning_rate": 0.0, |
| "loss": 0.003532126545906067, |
| "memory(GiB)": 36.91, |
| "step": 1535, |
| "token_acc": 1.0, |
| "train_speed(iter/s)": 0.016209 |
| }, |
| { |
| "epoch": 4.996548223350254, |
| "eval_loss": 0.31882038712501526, |
| "eval_runtime": 62.2556, |
| "eval_samples_per_second": 3.18, |
| "eval_steps_per_second": 0.803, |
| "eval_token_acc": 0.9397930385700847, |
| "step": 1535 |
| }, |
| { |
| "epoch": 4.996548223350254, |
| "eval_loss": 0.31882038712501526, |
| "eval_runtime": 62.6813, |
| "eval_samples_per_second": 3.159, |
| "eval_steps_per_second": 0.798, |
| "eval_token_acc": 0.9397930385700847, |
| "step": 1535 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1535, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.2119246482890555e+19, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|