| { | |
| "best_metric": 0.19586918, | |
| "best_model_checkpoint": "/share/project/gsai/kch/output/v9-20250120-041149/checkpoint-614", | |
| "epoch": 4.996548223350254, | |
| "eval_steps": 500, | |
| "global_step": 1535, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.003248730964467005, | |
| "grad_norm": 9.773202050760368, | |
| "learning_rate": 1.2987012987012988e-06, | |
| "loss": 1.5496090650558472, | |
| "memory(GiB)": 35.94, | |
| "step": 1, | |
| "token_acc": 0.7444444444444445, | |
| "train_speed(iter/s)": 0.013018 | |
| }, | |
| { | |
| "epoch": 0.016243654822335026, | |
| "grad_norm": 9.248677372850217, | |
| "learning_rate": 6.493506493506493e-06, | |
| "loss": 1.707068681716919, | |
| "memory(GiB)": 36.33, | |
| "step": 5, | |
| "token_acc": 0.7226890756302521, | |
| "train_speed(iter/s)": 0.016033 | |
| }, | |
| { | |
| "epoch": 0.03248730964467005, | |
| "grad_norm": 7.883848099402922, | |
| "learning_rate": 1.2987012987012986e-05, | |
| "loss": 1.7282501220703126, | |
| "memory(GiB)": 36.33, | |
| "step": 10, | |
| "token_acc": 0.6761363636363636, | |
| "train_speed(iter/s)": 0.016046 | |
| }, | |
| { | |
| "epoch": 0.048730964467005075, | |
| "grad_norm": 6.0889640814527155, | |
| "learning_rate": 1.9480519480519483e-05, | |
| "loss": 1.2976716995239257, | |
| "memory(GiB)": 36.33, | |
| "step": 15, | |
| "token_acc": 0.7266355140186916, | |
| "train_speed(iter/s)": 0.015999 | |
| }, | |
| { | |
| "epoch": 0.0649746192893401, | |
| "grad_norm": 2.6158303198283113, | |
| "learning_rate": 2.5974025974025972e-05, | |
| "loss": 0.7637146949768067, | |
| "memory(GiB)": 36.33, | |
| "step": 20, | |
| "token_acc": 0.8190709046454768, | |
| "train_speed(iter/s)": 0.016127 | |
| }, | |
| { | |
| "epoch": 0.08121827411167512, | |
| "grad_norm": 1.1712343980644169, | |
| "learning_rate": 3.246753246753247e-05, | |
| "loss": 0.5213486194610596, | |
| "memory(GiB)": 36.91, | |
| "step": 25, | |
| "token_acc": 0.8802992518703242, | |
| "train_speed(iter/s)": 0.016159 | |
| }, | |
| { | |
| "epoch": 0.09746192893401015, | |
| "grad_norm": 1.4361934956753106, | |
| "learning_rate": 3.8961038961038966e-05, | |
| "loss": 0.4833333969116211, | |
| "memory(GiB)": 36.91, | |
| "step": 30, | |
| "token_acc": 0.8929440389294404, | |
| "train_speed(iter/s)": 0.01608 | |
| }, | |
| { | |
| "epoch": 0.11370558375634518, | |
| "grad_norm": 1.1662861682771686, | |
| "learning_rate": 4.545454545454546e-05, | |
| "loss": 0.4054920196533203, | |
| "memory(GiB)": 36.91, | |
| "step": 35, | |
| "token_acc": 0.8860103626943006, | |
| "train_speed(iter/s)": 0.016078 | |
| }, | |
| { | |
| "epoch": 0.1299492385786802, | |
| "grad_norm": 1.0429297515235254, | |
| "learning_rate": 5.1948051948051944e-05, | |
| "loss": 0.43406662940979, | |
| "memory(GiB)": 36.91, | |
| "step": 40, | |
| "token_acc": 0.8708333333333333, | |
| "train_speed(iter/s)": 0.016008 | |
| }, | |
| { | |
| "epoch": 0.14619289340101524, | |
| "grad_norm": 1.2238750692730618, | |
| "learning_rate": 5.844155844155844e-05, | |
| "loss": 0.36366307735443115, | |
| "memory(GiB)": 36.91, | |
| "step": 45, | |
| "token_acc": 0.9007832898172323, | |
| "train_speed(iter/s)": 0.01607 | |
| }, | |
| { | |
| "epoch": 0.16243654822335024, | |
| "grad_norm": 1.2558032464123954, | |
| "learning_rate": 6.493506493506494e-05, | |
| "loss": 0.327667236328125, | |
| "memory(GiB)": 36.91, | |
| "step": 50, | |
| "token_acc": 0.9095890410958904, | |
| "train_speed(iter/s)": 0.016095 | |
| }, | |
| { | |
| "epoch": 0.17868020304568527, | |
| "grad_norm": 1.1346516950379935, | |
| "learning_rate": 7.142857142857143e-05, | |
| "loss": 0.2869602680206299, | |
| "memory(GiB)": 36.91, | |
| "step": 55, | |
| "token_acc": 0.9400428265524625, | |
| "train_speed(iter/s)": 0.016146 | |
| }, | |
| { | |
| "epoch": 0.1949238578680203, | |
| "grad_norm": 1.062592286052222, | |
| "learning_rate": 7.792207792207793e-05, | |
| "loss": 0.32817542552948, | |
| "memory(GiB)": 36.91, | |
| "step": 60, | |
| "token_acc": 0.9162162162162162, | |
| "train_speed(iter/s)": 0.016208 | |
| }, | |
| { | |
| "epoch": 0.21116751269035533, | |
| "grad_norm": 1.0747418170911354, | |
| "learning_rate": 8.441558441558442e-05, | |
| "loss": 0.3106029987335205, | |
| "memory(GiB)": 36.91, | |
| "step": 65, | |
| "token_acc": 0.8882235528942116, | |
| "train_speed(iter/s)": 0.016166 | |
| }, | |
| { | |
| "epoch": 0.22741116751269036, | |
| "grad_norm": 1.3626948899821127, | |
| "learning_rate": 9.090909090909092e-05, | |
| "loss": 0.2963001251220703, | |
| "memory(GiB)": 36.91, | |
| "step": 70, | |
| "token_acc": 0.9046511627906977, | |
| "train_speed(iter/s)": 0.016127 | |
| }, | |
| { | |
| "epoch": 0.2436548223350254, | |
| "grad_norm": 1.767990529674908, | |
| "learning_rate": 9.74025974025974e-05, | |
| "loss": 0.30068559646606446, | |
| "memory(GiB)": 36.91, | |
| "step": 75, | |
| "token_acc": 0.9203539823008849, | |
| "train_speed(iter/s)": 0.016153 | |
| }, | |
| { | |
| "epoch": 0.2598984771573604, | |
| "grad_norm": 1.1682901865357622, | |
| "learning_rate": 9.99989553622803e-05, | |
| "loss": 0.2741088390350342, | |
| "memory(GiB)": 36.91, | |
| "step": 80, | |
| "token_acc": 0.9041394335511983, | |
| "train_speed(iter/s)": 0.016134 | |
| }, | |
| { | |
| "epoch": 0.27614213197969545, | |
| "grad_norm": 1.3278362200249414, | |
| "learning_rate": 9.999257162318026e-05, | |
| "loss": 0.25543942451477053, | |
| "memory(GiB)": 36.91, | |
| "step": 85, | |
| "token_acc": 0.9399538106235565, | |
| "train_speed(iter/s)": 0.016119 | |
| }, | |
| { | |
| "epoch": 0.2923857868020305, | |
| "grad_norm": 1.1803595161351554, | |
| "learning_rate": 9.998038523933224e-05, | |
| "loss": 0.3038362503051758, | |
| "memory(GiB)": 36.91, | |
| "step": 90, | |
| "token_acc": 0.9416058394160584, | |
| "train_speed(iter/s)": 0.016104 | |
| }, | |
| { | |
| "epoch": 0.3086294416243655, | |
| "grad_norm": 1.1025992286590631, | |
| "learning_rate": 9.996239762521151e-05, | |
| "loss": 0.24188714027404784, | |
| "memory(GiB)": 36.91, | |
| "step": 95, | |
| "token_acc": 0.9402298850574713, | |
| "train_speed(iter/s)": 0.016109 | |
| }, | |
| { | |
| "epoch": 0.3248730964467005, | |
| "grad_norm": 1.7473005302414135, | |
| "learning_rate": 9.993861086864293e-05, | |
| "loss": 0.2190408945083618, | |
| "memory(GiB)": 36.91, | |
| "step": 100, | |
| "token_acc": 0.9553349875930521, | |
| "train_speed(iter/s)": 0.016079 | |
| }, | |
| { | |
| "epoch": 0.3411167512690355, | |
| "grad_norm": 0.9780470952963239, | |
| "learning_rate": 9.990902773055866e-05, | |
| "loss": 0.22316210269927977, | |
| "memory(GiB)": 36.91, | |
| "step": 105, | |
| "token_acc": 0.9384236453201971, | |
| "train_speed(iter/s)": 0.016094 | |
| }, | |
| { | |
| "epoch": 0.35736040609137054, | |
| "grad_norm": 1.2071939622104944, | |
| "learning_rate": 9.987365164467767e-05, | |
| "loss": 0.1844509482383728, | |
| "memory(GiB)": 36.91, | |
| "step": 110, | |
| "token_acc": 0.9557291666666666, | |
| "train_speed(iter/s)": 0.016096 | |
| }, | |
| { | |
| "epoch": 0.37360406091370557, | |
| "grad_norm": 1.3488873859555934, | |
| "learning_rate": 9.983248671710714e-05, | |
| "loss": 0.24020743370056152, | |
| "memory(GiB)": 36.91, | |
| "step": 115, | |
| "token_acc": 0.91792656587473, | |
| "train_speed(iter/s)": 0.016103 | |
| }, | |
| { | |
| "epoch": 0.3898477157360406, | |
| "grad_norm": 1.3346849143090171, | |
| "learning_rate": 9.978553772586596e-05, | |
| "loss": 0.17928496599197388, | |
| "memory(GiB)": 36.91, | |
| "step": 120, | |
| "token_acc": 0.9523809523809523, | |
| "train_speed(iter/s)": 0.016107 | |
| }, | |
| { | |
| "epoch": 0.40609137055837563, | |
| "grad_norm": 1.5370257812561328, | |
| "learning_rate": 9.973281012033007e-05, | |
| "loss": 0.22673540115356444, | |
| "memory(GiB)": 36.91, | |
| "step": 125, | |
| "token_acc": 0.9307692307692308, | |
| "train_speed(iter/s)": 0.016132 | |
| }, | |
| { | |
| "epoch": 0.42233502538071066, | |
| "grad_norm": 1.564641958549246, | |
| "learning_rate": 9.967431002060002e-05, | |
| "loss": 0.2366321563720703, | |
| "memory(GiB)": 36.91, | |
| "step": 130, | |
| "token_acc": 0.9309576837416481, | |
| "train_speed(iter/s)": 0.016147 | |
| }, | |
| { | |
| "epoch": 0.4385786802030457, | |
| "grad_norm": 2.562291519667129, | |
| "learning_rate": 9.961004421679047e-05, | |
| "loss": 0.1997455835342407, | |
| "memory(GiB)": 36.91, | |
| "step": 135, | |
| "token_acc": 0.9694656488549618, | |
| "train_speed(iter/s)": 0.01615 | |
| }, | |
| { | |
| "epoch": 0.4548223350253807, | |
| "grad_norm": 1.3505627897575587, | |
| "learning_rate": 9.954002016824227e-05, | |
| "loss": 0.23050181865692138, | |
| "memory(GiB)": 36.91, | |
| "step": 140, | |
| "token_acc": 0.9395248380129589, | |
| "train_speed(iter/s)": 0.016177 | |
| }, | |
| { | |
| "epoch": 0.47106598984771575, | |
| "grad_norm": 1.1439093152874722, | |
| "learning_rate": 9.946424600265646e-05, | |
| "loss": 0.2069091796875, | |
| "memory(GiB)": 36.91, | |
| "step": 145, | |
| "token_acc": 0.9485294117647058, | |
| "train_speed(iter/s)": 0.016153 | |
| }, | |
| { | |
| "epoch": 0.4873096446700508, | |
| "grad_norm": 1.3223308004820944, | |
| "learning_rate": 9.938273051515098e-05, | |
| "loss": 0.21799993515014648, | |
| "memory(GiB)": 36.91, | |
| "step": 150, | |
| "token_acc": 0.9325581395348838, | |
| "train_speed(iter/s)": 0.016154 | |
| }, | |
| { | |
| "epoch": 0.5035532994923858, | |
| "grad_norm": 1.2523275744092777, | |
| "learning_rate": 9.929548316723982e-05, | |
| "loss": 0.25325832366943357, | |
| "memory(GiB)": 36.91, | |
| "step": 155, | |
| "token_acc": 0.9368421052631579, | |
| "train_speed(iter/s)": 0.016145 | |
| }, | |
| { | |
| "epoch": 0.5197969543147208, | |
| "grad_norm": 0.9022910796931503, | |
| "learning_rate": 9.920251408573483e-05, | |
| "loss": 0.2051997184753418, | |
| "memory(GiB)": 36.91, | |
| "step": 160, | |
| "token_acc": 0.9321266968325792, | |
| "train_speed(iter/s)": 0.016138 | |
| }, | |
| { | |
| "epoch": 0.5360406091370559, | |
| "grad_norm": 1.3630797879167007, | |
| "learning_rate": 9.910383406157018e-05, | |
| "loss": 0.19534312486648558, | |
| "memory(GiB)": 36.91, | |
| "step": 165, | |
| "token_acc": 0.9489795918367347, | |
| "train_speed(iter/s)": 0.016146 | |
| }, | |
| { | |
| "epoch": 0.5522842639593909, | |
| "grad_norm": 1.2845653777954962, | |
| "learning_rate": 9.899945454855006e-05, | |
| "loss": 0.25403494834899903, | |
| "memory(GiB)": 36.91, | |
| "step": 170, | |
| "token_acc": 0.9025974025974026, | |
| "train_speed(iter/s)": 0.01615 | |
| }, | |
| { | |
| "epoch": 0.5685279187817259, | |
| "grad_norm": 1.2637865638643238, | |
| "learning_rate": 9.888938766201907e-05, | |
| "loss": 0.21994171142578126, | |
| "memory(GiB)": 36.91, | |
| "step": 175, | |
| "token_acc": 0.9292452830188679, | |
| "train_speed(iter/s)": 0.016148 | |
| }, | |
| { | |
| "epoch": 0.584771573604061, | |
| "grad_norm": 1.3035045872952578, | |
| "learning_rate": 9.877364617745604e-05, | |
| "loss": 0.21233229637145995, | |
| "memory(GiB)": 36.91, | |
| "step": 180, | |
| "token_acc": 0.936046511627907, | |
| "train_speed(iter/s)": 0.016163 | |
| }, | |
| { | |
| "epoch": 0.601015228426396, | |
| "grad_norm": 1.0837997073678936, | |
| "learning_rate": 9.865224352899119e-05, | |
| "loss": 0.20809760093688964, | |
| "memory(GiB)": 36.91, | |
| "step": 185, | |
| "token_acc": 0.9612403100775194, | |
| "train_speed(iter/s)": 0.016158 | |
| }, | |
| { | |
| "epoch": 0.617258883248731, | |
| "grad_norm": 1.6131697829206757, | |
| "learning_rate": 9.852519380784686e-05, | |
| "loss": 0.16450556516647338, | |
| "memory(GiB)": 36.91, | |
| "step": 190, | |
| "token_acc": 0.9518716577540107, | |
| "train_speed(iter/s)": 0.01615 | |
| }, | |
| { | |
| "epoch": 0.6335025380710659, | |
| "grad_norm": 1.0897399385105642, | |
| "learning_rate": 9.839251176070184e-05, | |
| "loss": 0.21039419174194335, | |
| "memory(GiB)": 36.91, | |
| "step": 195, | |
| "token_acc": 0.943089430894309, | |
| "train_speed(iter/s)": 0.016128 | |
| }, | |
| { | |
| "epoch": 0.649746192893401, | |
| "grad_norm": 1.0509670789538326, | |
| "learning_rate": 9.825421278797983e-05, | |
| "loss": 0.2035764217376709, | |
| "memory(GiB)": 36.91, | |
| "step": 200, | |
| "token_acc": 0.9397260273972603, | |
| "train_speed(iter/s)": 0.016132 | |
| }, | |
| { | |
| "epoch": 0.665989847715736, | |
| "grad_norm": 1.2329373260124112, | |
| "learning_rate": 9.811031294206184e-05, | |
| "loss": 0.21548199653625488, | |
| "memory(GiB)": 36.91, | |
| "step": 205, | |
| "token_acc": 0.9368191721132898, | |
| "train_speed(iter/s)": 0.01613 | |
| }, | |
| { | |
| "epoch": 0.682233502538071, | |
| "grad_norm": 0.8421449582235737, | |
| "learning_rate": 9.796082892542302e-05, | |
| "loss": 0.166330087184906, | |
| "memory(GiB)": 36.91, | |
| "step": 210, | |
| "token_acc": 0.9555555555555556, | |
| "train_speed(iter/s)": 0.016131 | |
| }, | |
| { | |
| "epoch": 0.698477157360406, | |
| "grad_norm": 3.430879327858774, | |
| "learning_rate": 9.780577808869398e-05, | |
| "loss": 0.2193552017211914, | |
| "memory(GiB)": 36.91, | |
| "step": 215, | |
| "token_acc": 0.952020202020202, | |
| "train_speed(iter/s)": 0.016136 | |
| }, | |
| { | |
| "epoch": 0.7147208121827411, | |
| "grad_norm": 1.5093266746905538, | |
| "learning_rate": 9.764517842864696e-05, | |
| "loss": 0.21606364250183105, | |
| "memory(GiB)": 36.91, | |
| "step": 220, | |
| "token_acc": 0.9311926605504587, | |
| "train_speed(iter/s)": 0.016141 | |
| }, | |
| { | |
| "epoch": 0.7309644670050761, | |
| "grad_norm": 1.3437389442959786, | |
| "learning_rate": 9.747904858610681e-05, | |
| "loss": 0.18983598947525024, | |
| "memory(GiB)": 36.91, | |
| "step": 225, | |
| "token_acc": 0.9449035812672176, | |
| "train_speed(iter/s)": 0.016146 | |
| }, | |
| { | |
| "epoch": 0.7472081218274111, | |
| "grad_norm": 0.9560028124850986, | |
| "learning_rate": 9.730740784378753e-05, | |
| "loss": 0.15862367153167725, | |
| "memory(GiB)": 36.91, | |
| "step": 230, | |
| "token_acc": 0.9284009546539379, | |
| "train_speed(iter/s)": 0.016153 | |
| }, | |
| { | |
| "epoch": 0.7634517766497462, | |
| "grad_norm": 0.9944797001481037, | |
| "learning_rate": 9.713027612405395e-05, | |
| "loss": 0.2057633638381958, | |
| "memory(GiB)": 36.91, | |
| "step": 235, | |
| "token_acc": 0.9560975609756097, | |
| "train_speed(iter/s)": 0.016148 | |
| }, | |
| { | |
| "epoch": 0.7796954314720812, | |
| "grad_norm": 1.3080304212648073, | |
| "learning_rate": 9.694767398660942e-05, | |
| "loss": 0.20023531913757325, | |
| "memory(GiB)": 36.91, | |
| "step": 240, | |
| "token_acc": 0.930835734870317, | |
| "train_speed(iter/s)": 0.016152 | |
| }, | |
| { | |
| "epoch": 0.7959390862944162, | |
| "grad_norm": 1.0766984239588557, | |
| "learning_rate": 9.67596226261095e-05, | |
| "loss": 0.17447829246520996, | |
| "memory(GiB)": 36.91, | |
| "step": 245, | |
| "token_acc": 0.9543269230769231, | |
| "train_speed(iter/s)": 0.016152 | |
| }, | |
| { | |
| "epoch": 0.8121827411167513, | |
| "grad_norm": 1.507367869013474, | |
| "learning_rate": 9.656614386970173e-05, | |
| "loss": 0.1656266212463379, | |
| "memory(GiB)": 36.91, | |
| "step": 250, | |
| "token_acc": 0.9447368421052632, | |
| "train_speed(iter/s)": 0.016157 | |
| }, | |
| { | |
| "epoch": 0.8284263959390863, | |
| "grad_norm": 0.9746901508793566, | |
| "learning_rate": 9.636726017449236e-05, | |
| "loss": 0.1971142530441284, | |
| "memory(GiB)": 36.91, | |
| "step": 255, | |
| "token_acc": 0.9336384439359268, | |
| "train_speed(iter/s)": 0.016144 | |
| }, | |
| { | |
| "epoch": 0.8446700507614213, | |
| "grad_norm": 1.2090195353569724, | |
| "learning_rate": 9.616299462493952e-05, | |
| "loss": 0.13225051164627075, | |
| "memory(GiB)": 36.91, | |
| "step": 260, | |
| "token_acc": 0.9502369668246445, | |
| "train_speed(iter/s)": 0.016132 | |
| }, | |
| { | |
| "epoch": 0.8609137055837564, | |
| "grad_norm": 2.0461505378854024, | |
| "learning_rate": 9.595337093017404e-05, | |
| "loss": 0.15409984588623046, | |
| "memory(GiB)": 36.91, | |
| "step": 265, | |
| "token_acc": 0.9537444933920705, | |
| "train_speed(iter/s)": 0.016136 | |
| }, | |
| { | |
| "epoch": 0.8771573604060914, | |
| "grad_norm": 2.0732984340431178, | |
| "learning_rate": 9.57384134212473e-05, | |
| "loss": 0.21368227005004883, | |
| "memory(GiB)": 36.91, | |
| "step": 270, | |
| "token_acc": 0.9419642857142857, | |
| "train_speed(iter/s)": 0.016136 | |
| }, | |
| { | |
| "epoch": 0.8934010152284264, | |
| "grad_norm": 0.7925657032904146, | |
| "learning_rate": 9.551814704830734e-05, | |
| "loss": 0.1758435010910034, | |
| "memory(GiB)": 36.91, | |
| "step": 275, | |
| "token_acc": 0.948509485094851, | |
| "train_speed(iter/s)": 0.016143 | |
| }, | |
| { | |
| "epoch": 0.9096446700507614, | |
| "grad_norm": 1.493944081608633, | |
| "learning_rate": 9.529259737770269e-05, | |
| "loss": 0.1807725191116333, | |
| "memory(GiB)": 36.91, | |
| "step": 280, | |
| "token_acc": 0.9431524547803618, | |
| "train_speed(iter/s)": 0.016126 | |
| }, | |
| { | |
| "epoch": 0.9258883248730965, | |
| "grad_norm": 1.6848601658017734, | |
| "learning_rate": 9.506179058901503e-05, | |
| "loss": 0.20769875049591063, | |
| "memory(GiB)": 36.91, | |
| "step": 285, | |
| "token_acc": 0.9391304347826087, | |
| "train_speed(iter/s)": 0.016132 | |
| }, | |
| { | |
| "epoch": 0.9421319796954315, | |
| "grad_norm": 1.3210817601987923, | |
| "learning_rate": 9.482575347202047e-05, | |
| "loss": 0.162405526638031, | |
| "memory(GiB)": 36.91, | |
| "step": 290, | |
| "token_acc": 0.9507042253521126, | |
| "train_speed(iter/s)": 0.016136 | |
| }, | |
| { | |
| "epoch": 0.9583756345177665, | |
| "grad_norm": 1.3496077516635223, | |
| "learning_rate": 9.458451342358002e-05, | |
| "loss": 0.19487454891204833, | |
| "memory(GiB)": 36.91, | |
| "step": 295, | |
| "token_acc": 0.9321608040201005, | |
| "train_speed(iter/s)": 0.016132 | |
| }, | |
| { | |
| "epoch": 0.9746192893401016, | |
| "grad_norm": 0.990005748680569, | |
| "learning_rate": 9.433809844445969e-05, | |
| "loss": 0.18303027153015136, | |
| "memory(GiB)": 36.91, | |
| "step": 300, | |
| "token_acc": 0.9073170731707317, | |
| "train_speed(iter/s)": 0.016134 | |
| }, | |
| { | |
| "epoch": 0.9908629441624366, | |
| "grad_norm": 1.2295422719869937, | |
| "learning_rate": 9.40865371360804e-05, | |
| "loss": 0.17322018146514892, | |
| "memory(GiB)": 36.91, | |
| "step": 305, | |
| "token_acc": 0.9247311827956989, | |
| "train_speed(iter/s)": 0.016141 | |
| }, | |
| { | |
| "epoch": 0.9973604060913706, | |
| "eval_loss": 0.1993405520915985, | |
| "eval_runtime": 62.0419, | |
| "eval_samples_per_second": 3.191, | |
| "eval_steps_per_second": 0.806, | |
| "eval_token_acc": 0.9332079021636877, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.0095431472081218, | |
| "grad_norm": 0.6300057786945967, | |
| "learning_rate": 9.382985869719825e-05, | |
| "loss": 0.18641979694366456, | |
| "memory(GiB)": 36.91, | |
| "step": 310, | |
| "token_acc": 0.9554234769687965, | |
| "train_speed(iter/s)": 0.016081 | |
| }, | |
| { | |
| "epoch": 1.0257868020304568, | |
| "grad_norm": 0.6546315853574257, | |
| "learning_rate": 9.35680929205154e-05, | |
| "loss": 0.09114786386489868, | |
| "memory(GiB)": 36.91, | |
| "step": 315, | |
| "token_acc": 0.972972972972973, | |
| "train_speed(iter/s)": 0.016095 | |
| }, | |
| { | |
| "epoch": 1.0420304568527918, | |
| "grad_norm": 1.0908662736650971, | |
| "learning_rate": 9.330127018922194e-05, | |
| "loss": 0.10798045396804809, | |
| "memory(GiB)": 36.91, | |
| "step": 320, | |
| "token_acc": 0.9705093833780161, | |
| "train_speed(iter/s)": 0.016104 | |
| }, | |
| { | |
| "epoch": 1.0582741116751269, | |
| "grad_norm": 1.3297407747084764, | |
| "learning_rate": 9.302942147346945e-05, | |
| "loss": 0.1425997495651245, | |
| "memory(GiB)": 36.91, | |
| "step": 325, | |
| "token_acc": 0.9705014749262537, | |
| "train_speed(iter/s)": 0.016103 | |
| }, | |
| { | |
| "epoch": 1.074517766497462, | |
| "grad_norm": 0.9696985174488663, | |
| "learning_rate": 9.275257832677623e-05, | |
| "loss": 0.09851968884468079, | |
| "memory(GiB)": 36.91, | |
| "step": 330, | |
| "token_acc": 0.9644549763033176, | |
| "train_speed(iter/s)": 0.016115 | |
| }, | |
| { | |
| "epoch": 1.090761421319797, | |
| "grad_norm": 0.9656710998245678, | |
| "learning_rate": 9.247077288236488e-05, | |
| "loss": 0.11144424676895141, | |
| "memory(GiB)": 36.91, | |
| "step": 335, | |
| "token_acc": 0.972568578553616, | |
| "train_speed(iter/s)": 0.016119 | |
| }, | |
| { | |
| "epoch": 1.107005076142132, | |
| "grad_norm": 2.119365217816497, | |
| "learning_rate": 9.21840378494325e-05, | |
| "loss": 0.11279252767562867, | |
| "memory(GiB)": 36.91, | |
| "step": 340, | |
| "token_acc": 0.9637462235649547, | |
| "train_speed(iter/s)": 0.016124 | |
| }, | |
| { | |
| "epoch": 1.123248730964467, | |
| "grad_norm": 1.0607496749665157, | |
| "learning_rate": 9.189240650935433e-05, | |
| "loss": 0.15501840114593507, | |
| "memory(GiB)": 36.91, | |
| "step": 345, | |
| "token_acc": 0.9662337662337662, | |
| "train_speed(iter/s)": 0.016118 | |
| }, | |
| { | |
| "epoch": 1.139492385786802, | |
| "grad_norm": 1.1350038539205582, | |
| "learning_rate": 9.159591271182058e-05, | |
| "loss": 0.12092633247375488, | |
| "memory(GiB)": 36.91, | |
| "step": 350, | |
| "token_acc": 0.9680232558139535, | |
| "train_speed(iter/s)": 0.016126 | |
| }, | |
| { | |
| "epoch": 1.155736040609137, | |
| "grad_norm": 0.6471881138956326, | |
| "learning_rate": 9.129459087090763e-05, | |
| "loss": 0.09021483659744263, | |
| "memory(GiB)": 36.91, | |
| "step": 355, | |
| "token_acc": 0.9718670076726342, | |
| "train_speed(iter/s)": 0.016119 | |
| }, | |
| { | |
| "epoch": 1.171979695431472, | |
| "grad_norm": 0.5557368721254966, | |
| "learning_rate": 9.098847596108351e-05, | |
| "loss": 0.09125213623046875, | |
| "memory(GiB)": 36.91, | |
| "step": 360, | |
| "token_acc": 0.9772727272727273, | |
| "train_speed(iter/s)": 0.01612 | |
| }, | |
| { | |
| "epoch": 1.188223350253807, | |
| "grad_norm": 0.8767747521686889, | |
| "learning_rate": 9.067760351314838e-05, | |
| "loss": 0.10847616195678711, | |
| "memory(GiB)": 36.91, | |
| "step": 365, | |
| "token_acc": 0.9425587467362925, | |
| "train_speed(iter/s)": 0.016116 | |
| }, | |
| { | |
| "epoch": 1.2044670050761421, | |
| "grad_norm": 0.7043233347928591, | |
| "learning_rate": 9.036200961011059e-05, | |
| "loss": 0.14046638011932372, | |
| "memory(GiB)": 36.91, | |
| "step": 370, | |
| "token_acc": 0.9632034632034632, | |
| "train_speed(iter/s)": 0.016126 | |
| }, | |
| { | |
| "epoch": 1.2207106598984772, | |
| "grad_norm": 1.0689456764149206, | |
| "learning_rate": 9.004173088299837e-05, | |
| "loss": 0.13291985988616944, | |
| "memory(GiB)": 36.91, | |
| "step": 375, | |
| "token_acc": 0.9565217391304348, | |
| "train_speed(iter/s)": 0.016139 | |
| }, | |
| { | |
| "epoch": 1.2369543147208122, | |
| "grad_norm": 1.3657829465422844, | |
| "learning_rate": 8.97168045066082e-05, | |
| "loss": 0.11737120151519775, | |
| "memory(GiB)": 36.91, | |
| "step": 380, | |
| "token_acc": 0.973421926910299, | |
| "train_speed(iter/s)": 0.01615 | |
| }, | |
| { | |
| "epoch": 1.2531979695431472, | |
| "grad_norm": 0.991725434659403, | |
| "learning_rate": 8.938726819518977e-05, | |
| "loss": 0.1285269021987915, | |
| "memory(GiB)": 36.91, | |
| "step": 385, | |
| "token_acc": 0.97, | |
| "train_speed(iter/s)": 0.016149 | |
| }, | |
| { | |
| "epoch": 1.2694416243654822, | |
| "grad_norm": 0.7615458350738632, | |
| "learning_rate": 8.905316019806868e-05, | |
| "loss": 0.08999634981155395, | |
| "memory(GiB)": 36.91, | |
| "step": 390, | |
| "token_acc": 0.9392405063291139, | |
| "train_speed(iter/s)": 0.016141 | |
| }, | |
| { | |
| "epoch": 1.2856852791878173, | |
| "grad_norm": 1.0176469569030087, | |
| "learning_rate": 8.871451929520663e-05, | |
| "loss": 0.12240591049194335, | |
| "memory(GiB)": 36.91, | |
| "step": 395, | |
| "token_acc": 0.9611872146118722, | |
| "train_speed(iter/s)": 0.016137 | |
| }, | |
| { | |
| "epoch": 1.3019289340101523, | |
| "grad_norm": 1.5999057477034428, | |
| "learning_rate": 8.837138479270036e-05, | |
| "loss": 0.1078599214553833, | |
| "memory(GiB)": 36.91, | |
| "step": 400, | |
| "token_acc": 0.9562982005141388, | |
| "train_speed(iter/s)": 0.016137 | |
| }, | |
| { | |
| "epoch": 1.3181725888324873, | |
| "grad_norm": 1.8517636831594235, | |
| "learning_rate": 8.802379651821938e-05, | |
| "loss": 0.14071439504623412, | |
| "memory(GiB)": 36.91, | |
| "step": 405, | |
| "token_acc": 0.9592875318066157, | |
| "train_speed(iter/s)": 0.016131 | |
| }, | |
| { | |
| "epoch": 1.3344162436548224, | |
| "grad_norm": 1.333329930877741, | |
| "learning_rate": 8.767179481638303e-05, | |
| "loss": 0.13171937465667724, | |
| "memory(GiB)": 36.91, | |
| "step": 410, | |
| "token_acc": 0.9744897959183674, | |
| "train_speed(iter/s)": 0.016135 | |
| }, | |
| { | |
| "epoch": 1.3506598984771574, | |
| "grad_norm": 1.1709434640964491, | |
| "learning_rate": 8.731542054407793e-05, | |
| "loss": 0.10031242370605468, | |
| "memory(GiB)": 36.91, | |
| "step": 415, | |
| "token_acc": 0.9507829977628636, | |
| "train_speed(iter/s)": 0.016142 | |
| }, | |
| { | |
| "epoch": 1.3669035532994924, | |
| "grad_norm": 0.8550588073511182, | |
| "learning_rate": 8.695471506571542e-05, | |
| "loss": 0.09321081638336182, | |
| "memory(GiB)": 36.91, | |
| "step": 420, | |
| "token_acc": 0.9667519181585678, | |
| "train_speed(iter/s)": 0.016135 | |
| }, | |
| { | |
| "epoch": 1.3831472081218275, | |
| "grad_norm": 0.8651388677420173, | |
| "learning_rate": 8.658972024843062e-05, | |
| "loss": 0.11361520290374756, | |
| "memory(GiB)": 36.91, | |
| "step": 425, | |
| "token_acc": 0.9243243243243243, | |
| "train_speed(iter/s)": 0.016131 | |
| }, | |
| { | |
| "epoch": 1.3993908629441625, | |
| "grad_norm": 1.1539120381770573, | |
| "learning_rate": 8.622047845722275e-05, | |
| "loss": 0.11814072132110595, | |
| "memory(GiB)": 36.91, | |
| "step": 430, | |
| "token_acc": 0.9747368421052631, | |
| "train_speed(iter/s)": 0.016135 | |
| }, | |
| { | |
| "epoch": 1.4156345177664975, | |
| "grad_norm": 0.8277592112279485, | |
| "learning_rate": 8.584703255003795e-05, | |
| "loss": 0.11146994829177856, | |
| "memory(GiB)": 36.91, | |
| "step": 435, | |
| "token_acc": 0.9720101781170484, | |
| "train_speed(iter/s)": 0.016134 | |
| }, | |
| { | |
| "epoch": 1.4318781725888325, | |
| "grad_norm": 0.613271329664299, | |
| "learning_rate": 8.546942587279465e-05, | |
| "loss": 0.09394789338111878, | |
| "memory(GiB)": 36.91, | |
| "step": 440, | |
| "token_acc": 0.9636803874092009, | |
| "train_speed(iter/s)": 0.016134 | |
| }, | |
| { | |
| "epoch": 1.4481218274111676, | |
| "grad_norm": 1.0271786482031176, | |
| "learning_rate": 8.508770225435244e-05, | |
| "loss": 0.09493039846420288, | |
| "memory(GiB)": 36.91, | |
| "step": 445, | |
| "token_acc": 0.9743589743589743, | |
| "train_speed(iter/s)": 0.016139 | |
| }, | |
| { | |
| "epoch": 1.4643654822335026, | |
| "grad_norm": 1.0170609694346187, | |
| "learning_rate": 8.470190600142486e-05, | |
| "loss": 0.0872123122215271, | |
| "memory(GiB)": 36.91, | |
| "step": 450, | |
| "token_acc": 0.9763779527559056, | |
| "train_speed(iter/s)": 0.016139 | |
| }, | |
| { | |
| "epoch": 1.4806091370558376, | |
| "grad_norm": 1.6562131424643847, | |
| "learning_rate": 8.43120818934367e-05, | |
| "loss": 0.12921547889709473, | |
| "memory(GiB)": 36.91, | |
| "step": 455, | |
| "token_acc": 0.9691516709511568, | |
| "train_speed(iter/s)": 0.016142 | |
| }, | |
| { | |
| "epoch": 1.4968527918781727, | |
| "grad_norm": 1.9551348110028592, | |
| "learning_rate": 8.39182751773264e-05, | |
| "loss": 0.10002539157867432, | |
| "memory(GiB)": 36.91, | |
| "step": 460, | |
| "token_acc": 0.9665924276169265, | |
| "train_speed(iter/s)": 0.016147 | |
| }, | |
| { | |
| "epoch": 1.5130964467005077, | |
| "grad_norm": 1.376875063389563, | |
| "learning_rate": 8.352053156229438e-05, | |
| "loss": 0.0880006194114685, | |
| "memory(GiB)": 36.91, | |
| "step": 465, | |
| "token_acc": 0.958128078817734, | |
| "train_speed(iter/s)": 0.016149 | |
| }, | |
| { | |
| "epoch": 1.5293401015228425, | |
| "grad_norm": 1.688502126127077, | |
| "learning_rate": 8.31188972144974e-05, | |
| "loss": 0.08950616717338562, | |
| "memory(GiB)": 36.91, | |
| "step": 470, | |
| "token_acc": 0.96996996996997, | |
| "train_speed(iter/s)": 0.016152 | |
| }, | |
| { | |
| "epoch": 1.5455837563451778, | |
| "grad_norm": 1.3189009566745062, | |
| "learning_rate": 8.27134187516901e-05, | |
| "loss": 0.08834458589553833, | |
| "memory(GiB)": 36.91, | |
| "step": 475, | |
| "token_acc": 0.9663865546218487, | |
| "train_speed(iter/s)": 0.016152 | |
| }, | |
| { | |
| "epoch": 1.5618274111675126, | |
| "grad_norm": 1.4946742975658185, | |
| "learning_rate": 8.23041432378141e-05, | |
| "loss": 0.14390041828155517, | |
| "memory(GiB)": 36.91, | |
| "step": 480, | |
| "token_acc": 0.9621212121212122, | |
| "train_speed(iter/s)": 0.016158 | |
| }, | |
| { | |
| "epoch": 1.5780710659898478, | |
| "grad_norm": 1.3879821905262077, | |
| "learning_rate": 8.18911181775353e-05, | |
| "loss": 0.1267578125, | |
| "memory(GiB)": 36.91, | |
| "step": 485, | |
| "token_acc": 0.9685230024213075, | |
| "train_speed(iter/s)": 0.016166 | |
| }, | |
| { | |
| "epoch": 1.5943147208121826, | |
| "grad_norm": 1.0017173842059925, | |
| "learning_rate": 8.147439151072994e-05, | |
| "loss": 0.11637402772903442, | |
| "memory(GiB)": 36.91, | |
| "step": 490, | |
| "token_acc": 0.945031712473573, | |
| "train_speed(iter/s)": 0.016169 | |
| }, | |
| { | |
| "epoch": 1.6105583756345179, | |
| "grad_norm": 3.517464157304767, | |
| "learning_rate": 8.105401160692023e-05, | |
| "loss": 0.11228004693984986, | |
| "memory(GiB)": 36.91, | |
| "step": 495, | |
| "token_acc": 0.9544513457556936, | |
| "train_speed(iter/s)": 0.016174 | |
| }, | |
| { | |
| "epoch": 1.6268020304568527, | |
| "grad_norm": 1.123471909128111, | |
| "learning_rate": 8.063002725966015e-05, | |
| "loss": 0.1422884702682495, | |
| "memory(GiB)": 36.91, | |
| "step": 500, | |
| "token_acc": 0.9502487562189055, | |
| "train_speed(iter/s)": 0.016173 | |
| }, | |
| { | |
| "epoch": 1.643045685279188, | |
| "grad_norm": 0.6219224292611003, | |
| "learning_rate": 8.020248768087188e-05, | |
| "loss": 0.09764043688774109, | |
| "memory(GiB)": 36.91, | |
| "step": 505, | |
| "token_acc": 0.9696312364425163, | |
| "train_speed(iter/s)": 0.016171 | |
| }, | |
| { | |
| "epoch": 1.6592893401015227, | |
| "grad_norm": 0.6599500918289528, | |
| "learning_rate": 7.977144249513391e-05, | |
| "loss": 0.11226143836975097, | |
| "memory(GiB)": 36.91, | |
| "step": 510, | |
| "token_acc": 0.9662447257383966, | |
| "train_speed(iter/s)": 0.016171 | |
| }, | |
| { | |
| "epoch": 1.675532994923858, | |
| "grad_norm": 1.1327299497198065, | |
| "learning_rate": 7.93369417339209e-05, | |
| "loss": 0.15791513919830322, | |
| "memory(GiB)": 36.91, | |
| "step": 515, | |
| "token_acc": 0.9535962877030162, | |
| "train_speed(iter/s)": 0.016172 | |
| }, | |
| { | |
| "epoch": 1.6917766497461928, | |
| "grad_norm": 1.246895523664307, | |
| "learning_rate": 7.88990358297967e-05, | |
| "loss": 0.1254945158958435, | |
| "memory(GiB)": 36.91, | |
| "step": 520, | |
| "token_acc": 0.9494252873563218, | |
| "train_speed(iter/s)": 0.016169 | |
| }, | |
| { | |
| "epoch": 1.708020304568528, | |
| "grad_norm": 0.7907689981367572, | |
| "learning_rate": 7.84577756105606e-05, | |
| "loss": 0.11963515281677246, | |
| "memory(GiB)": 36.91, | |
| "step": 525, | |
| "token_acc": 0.9851116625310173, | |
| "train_speed(iter/s)": 0.016171 | |
| }, | |
| { | |
| "epoch": 1.7242639593908629, | |
| "grad_norm": 0.9327837359999639, | |
| "learning_rate": 7.801321229334764e-05, | |
| "loss": 0.0870942771434784, | |
| "memory(GiB)": 36.91, | |
| "step": 530, | |
| "token_acc": 0.9618320610687023, | |
| "train_speed(iter/s)": 0.01617 | |
| }, | |
| { | |
| "epoch": 1.740507614213198, | |
| "grad_norm": 1.0881384151057631, | |
| "learning_rate": 7.756539747868394e-05, | |
| "loss": 0.08531727194786072, | |
| "memory(GiB)": 36.91, | |
| "step": 535, | |
| "token_acc": 0.9748603351955307, | |
| "train_speed(iter/s)": 0.016168 | |
| }, | |
| { | |
| "epoch": 1.756751269035533, | |
| "grad_norm": 0.7767069783252919, | |
| "learning_rate": 7.71143831444974e-05, | |
| "loss": 0.11042824983596802, | |
| "memory(GiB)": 36.91, | |
| "step": 540, | |
| "token_acc": 0.957983193277311, | |
| "train_speed(iter/s)": 0.016168 | |
| }, | |
| { | |
| "epoch": 1.7729949238578682, | |
| "grad_norm": 1.4499212106775468, | |
| "learning_rate": 7.666022164008457e-05, | |
| "loss": 0.11432676315307617, | |
| "memory(GiB)": 36.91, | |
| "step": 545, | |
| "token_acc": 0.975, | |
| "train_speed(iter/s)": 0.016171 | |
| }, | |
| { | |
| "epoch": 1.789238578680203, | |
| "grad_norm": 1.0081688781849556, | |
| "learning_rate": 7.620296568003449e-05, | |
| "loss": 0.12327454090118409, | |
| "memory(GiB)": 36.91, | |
| "step": 550, | |
| "token_acc": 0.9525, | |
| "train_speed(iter/s)": 0.016174 | |
| }, | |
| { | |
| "epoch": 1.8054822335025382, | |
| "grad_norm": 0.9935491377578084, | |
| "learning_rate": 7.57426683381101e-05, | |
| "loss": 0.09574033617973328, | |
| "memory(GiB)": 36.91, | |
| "step": 555, | |
| "token_acc": 0.9694117647058823, | |
| "train_speed(iter/s)": 0.016174 | |
| }, | |
| { | |
| "epoch": 1.821725888324873, | |
| "grad_norm": 1.0191162814710237, | |
| "learning_rate": 7.527938304108795e-05, | |
| "loss": 0.10299128293991089, | |
| "memory(GiB)": 36.91, | |
| "step": 560, | |
| "token_acc": 0.9694793536804309, | |
| "train_speed(iter/s)": 0.016176 | |
| }, | |
| { | |
| "epoch": 1.8379695431472083, | |
| "grad_norm": 1.322632268427317, | |
| "learning_rate": 7.481316356255698e-05, | |
| "loss": 0.12594590187072754, | |
| "memory(GiB)": 36.91, | |
| "step": 565, | |
| "token_acc": 0.946257197696737, | |
| "train_speed(iter/s)": 0.016169 | |
| }, | |
| { | |
| "epoch": 1.854213197969543, | |
| "grad_norm": 1.2990436559927216, | |
| "learning_rate": 7.434406401667695e-05, | |
| "loss": 0.10811959505081177, | |
| "memory(GiB)": 36.91, | |
| "step": 570, | |
| "token_acc": 0.9556650246305419, | |
| "train_speed(iter/s)": 0.016173 | |
| }, | |
| { | |
| "epoch": 1.8704568527918781, | |
| "grad_norm": 1.141255912127714, | |
| "learning_rate": 7.387213885189746e-05, | |
| "loss": 0.10128064155578613, | |
| "memory(GiB)": 36.91, | |
| "step": 575, | |
| "token_acc": 0.9654255319148937, | |
| "train_speed(iter/s)": 0.016178 | |
| }, | |
| { | |
| "epoch": 1.8867005076142132, | |
| "grad_norm": 1.6575287534795722, | |
| "learning_rate": 7.339744284463808e-05, | |
| "loss": 0.09879794716835022, | |
| "memory(GiB)": 36.91, | |
| "step": 580, | |
| "token_acc": 0.9805555555555555, | |
| "train_speed(iter/s)": 0.016182 | |
| }, | |
| { | |
| "epoch": 1.9029441624365482, | |
| "grad_norm": 1.1141293923635756, | |
| "learning_rate": 7.292003109293048e-05, | |
| "loss": 0.0816422462463379, | |
| "memory(GiB)": 36.91, | |
| "step": 585, | |
| "token_acc": 0.961038961038961, | |
| "train_speed(iter/s)": 0.016187 | |
| }, | |
| { | |
| "epoch": 1.9191878172588832, | |
| "grad_norm": 0.9384463374768481, | |
| "learning_rate": 7.243995901002312e-05, | |
| "loss": 0.10118494033813477, | |
| "memory(GiB)": 36.91, | |
| "step": 590, | |
| "token_acc": 0.978021978021978, | |
| "train_speed(iter/s)": 0.016179 | |
| }, | |
| { | |
| "epoch": 1.9354314720812182, | |
| "grad_norm": 1.2458643327317989, | |
| "learning_rate": 7.19572823179495e-05, | |
| "loss": 0.13551709651947022, | |
| "memory(GiB)": 36.91, | |
| "step": 595, | |
| "token_acc": 0.96, | |
| "train_speed(iter/s)": 0.016178 | |
| }, | |
| { | |
| "epoch": 1.9516751269035533, | |
| "grad_norm": 1.2473685164472739, | |
| "learning_rate": 7.147205704106046e-05, | |
| "loss": 0.12769120931625366, | |
| "memory(GiB)": 36.91, | |
| "step": 600, | |
| "token_acc": 0.9561586638830898, | |
| "train_speed(iter/s)": 0.016179 | |
| }, | |
| { | |
| "epoch": 1.9679187817258883, | |
| "grad_norm": 0.7203387342947396, | |
| "learning_rate": 7.098433949952146e-05, | |
| "loss": 0.09962844252586364, | |
| "memory(GiB)": 36.91, | |
| "step": 605, | |
| "token_acc": 0.9623115577889447, | |
| "train_speed(iter/s)": 0.016178 | |
| }, | |
| { | |
| "epoch": 1.9841624365482233, | |
| "grad_norm": 0.9094364008463653, | |
| "learning_rate": 7.049418630277542e-05, | |
| "loss": 0.10799739360809327, | |
| "memory(GiB)": 36.91, | |
| "step": 610, | |
| "token_acc": 0.9705159705159705, | |
| "train_speed(iter/s)": 0.016178 | |
| }, | |
| { | |
| "epoch": 1.9971573604060913, | |
| "eval_loss": 0.19586917757987976, | |
| "eval_runtime": 62.6829, | |
| "eval_samples_per_second": 3.159, | |
| "eval_steps_per_second": 0.798, | |
| "eval_token_acc": 0.9416745061147695, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 2.0028426395939087, | |
| "grad_norm": 11.787067733742486, | |
| "learning_rate": 7.000165434297214e-05, | |
| "loss": 0.12140052318572998, | |
| "memory(GiB)": 36.91, | |
| "step": 615, | |
| "token_acc": 0.951048951048951, | |
| "train_speed(iter/s)": 0.016146 | |
| }, | |
| { | |
| "epoch": 2.0190862944162435, | |
| "grad_norm": 0.9036939767517369, | |
| "learning_rate": 6.950680078836474e-05, | |
| "loss": 0.0476156622171402, | |
| "memory(GiB)": 36.91, | |
| "step": 620, | |
| "token_acc": 0.9901477832512315, | |
| "train_speed(iter/s)": 0.016145 | |
| }, | |
| { | |
| "epoch": 2.035329949238579, | |
| "grad_norm": 0.8045933316745676, | |
| "learning_rate": 6.900968307667423e-05, | |
| "loss": 0.0368287205696106, | |
| "memory(GiB)": 36.91, | |
| "step": 625, | |
| "token_acc": 0.9932584269662922, | |
| "train_speed(iter/s)": 0.016144 | |
| }, | |
| { | |
| "epoch": 2.0515736040609136, | |
| "grad_norm": 0.9084110351960255, | |
| "learning_rate": 6.851035890842259e-05, | |
| "loss": 0.03829330801963806, | |
| "memory(GiB)": 36.91, | |
| "step": 630, | |
| "token_acc": 0.9928741092636579, | |
| "train_speed(iter/s)": 0.016139 | |
| }, | |
| { | |
| "epoch": 2.067817258883249, | |
| "grad_norm": 0.6115130889160721, | |
| "learning_rate": 6.800888624023553e-05, | |
| "loss": 0.04897831082344055, | |
| "memory(GiB)": 36.91, | |
| "step": 635, | |
| "token_acc": 0.995, | |
| "train_speed(iter/s)": 0.016139 | |
| }, | |
| { | |
| "epoch": 2.0840609137055837, | |
| "grad_norm": 0.2929609590178906, | |
| "learning_rate": 6.750532327811547e-05, | |
| "loss": 0.027808183431625368, | |
| "memory(GiB)": 36.91, | |
| "step": 640, | |
| "token_acc": 0.9877750611246944, | |
| "train_speed(iter/s)": 0.016139 | |
| }, | |
| { | |
| "epoch": 2.100304568527919, | |
| "grad_norm": 1.6659772014622232, | |
| "learning_rate": 6.699972847068553e-05, | |
| "loss": 0.04012786149978638, | |
| "memory(GiB)": 36.91, | |
| "step": 645, | |
| "token_acc": 0.9892183288409704, | |
| "train_speed(iter/s)": 0.016136 | |
| }, | |
| { | |
| "epoch": 2.1165482233502537, | |
| "grad_norm": 1.6942318499082378, | |
| "learning_rate": 6.649216050240539e-05, | |
| "loss": 0.03581180572509766, | |
| "memory(GiB)": 36.91, | |
| "step": 650, | |
| "token_acc": 0.9848866498740554, | |
| "train_speed(iter/s)": 0.016138 | |
| }, | |
| { | |
| "epoch": 2.132791878172589, | |
| "grad_norm": 1.7750332328595628, | |
| "learning_rate": 6.598267828675979e-05, | |
| "loss": 0.038441383838653566, | |
| "memory(GiB)": 36.91, | |
| "step": 655, | |
| "token_acc": 0.9860724233983287, | |
| "train_speed(iter/s)": 0.016136 | |
| }, | |
| { | |
| "epoch": 2.149035532994924, | |
| "grad_norm": 0.948452800180108, | |
| "learning_rate": 6.547134095942044e-05, | |
| "loss": 0.03809022605419159, | |
| "memory(GiB)": 36.91, | |
| "step": 660, | |
| "token_acc": 0.9917355371900827, | |
| "train_speed(iter/s)": 0.016135 | |
| }, | |
| { | |
| "epoch": 2.165279187817259, | |
| "grad_norm": 1.185267349759789, | |
| "learning_rate": 6.495820787138209e-05, | |
| "loss": 0.033171114325523374, | |
| "memory(GiB)": 36.91, | |
| "step": 665, | |
| "token_acc": 0.9947916666666666, | |
| "train_speed(iter/s)": 0.016132 | |
| }, | |
| { | |
| "epoch": 2.181522842639594, | |
| "grad_norm": 1.1780464513130944, | |
| "learning_rate": 6.44433385820737e-05, | |
| "loss": 0.03416465222835541, | |
| "memory(GiB)": 36.91, | |
| "step": 670, | |
| "token_acc": 0.9948051948051948, | |
| "train_speed(iter/s)": 0.01614 | |
| }, | |
| { | |
| "epoch": 2.197766497461929, | |
| "grad_norm": 0.5862751780031482, | |
| "learning_rate": 6.392679285244538e-05, | |
| "loss": 0.043843358755111694, | |
| "memory(GiB)": 36.91, | |
| "step": 675, | |
| "token_acc": 0.9854014598540146, | |
| "train_speed(iter/s)": 0.016137 | |
| }, | |
| { | |
| "epoch": 2.214010152284264, | |
| "grad_norm": 0.7314774852745054, | |
| "learning_rate": 6.340863063803188e-05, | |
| "loss": 0.03051617741584778, | |
| "memory(GiB)": 36.91, | |
| "step": 680, | |
| "token_acc": 0.9970326409495549, | |
| "train_speed(iter/s)": 0.016136 | |
| }, | |
| { | |
| "epoch": 2.230253807106599, | |
| "grad_norm": 1.4305053109603272, | |
| "learning_rate": 6.288891208199353e-05, | |
| "loss": 0.03859332203865051, | |
| "memory(GiB)": 36.91, | |
| "step": 685, | |
| "token_acc": 0.9813829787234043, | |
| "train_speed(iter/s)": 0.016138 | |
| }, | |
| { | |
| "epoch": 2.246497461928934, | |
| "grad_norm": 1.2676862153868658, | |
| "learning_rate": 6.23676975081355e-05, | |
| "loss": 0.03608715534210205, | |
| "memory(GiB)": 36.91, | |
| "step": 690, | |
| "token_acc": 0.9923076923076923, | |
| "train_speed(iter/s)": 0.016143 | |
| }, | |
| { | |
| "epoch": 2.262741116751269, | |
| "grad_norm": 0.717797595223322, | |
| "learning_rate": 6.184504741390596e-05, | |
| "loss": 0.024200823903083802, | |
| "memory(GiB)": 36.91, | |
| "step": 695, | |
| "token_acc": 0.9932885906040269, | |
| "train_speed(iter/s)": 0.016142 | |
| }, | |
| { | |
| "epoch": 2.278984771573604, | |
| "grad_norm": 1.2738346733999926, | |
| "learning_rate": 6.132102246337407e-05, | |
| "loss": 0.04924860596656799, | |
| "memory(GiB)": 36.91, | |
| "step": 700, | |
| "token_acc": 0.989769820971867, | |
| "train_speed(iter/s)": 0.016144 | |
| }, | |
| { | |
| "epoch": 2.2952284263959393, | |
| "grad_norm": 0.9709229547354659, | |
| "learning_rate": 6.079568348018882e-05, | |
| "loss": 0.04101951122283935, | |
| "memory(GiB)": 36.91, | |
| "step": 705, | |
| "token_acc": 0.9838709677419355, | |
| "train_speed(iter/s)": 0.016145 | |
| }, | |
| { | |
| "epoch": 2.311472081218274, | |
| "grad_norm": 0.34074159031019935, | |
| "learning_rate": 6.02690914405191e-05, | |
| "loss": 0.012625060975551605, | |
| "memory(GiB)": 36.91, | |
| "step": 710, | |
| "token_acc": 0.9893162393162394, | |
| "train_speed(iter/s)": 0.016143 | |
| }, | |
| { | |
| "epoch": 2.3277157360406093, | |
| "grad_norm": 1.405033686903226, | |
| "learning_rate": 5.974130746597628e-05, | |
| "loss": 0.023314157128334047, | |
| "memory(GiB)": 36.91, | |
| "step": 715, | |
| "token_acc": 0.9845261121856866, | |
| "train_speed(iter/s)": 0.016146 | |
| }, | |
| { | |
| "epoch": 2.343959390862944, | |
| "grad_norm": 0.393622080479984, | |
| "learning_rate": 5.921239281651976e-05, | |
| "loss": 0.03884749114513397, | |
| "memory(GiB)": 36.91, | |
| "step": 720, | |
| "token_acc": 0.9844961240310077, | |
| "train_speed(iter/s)": 0.016147 | |
| }, | |
| { | |
| "epoch": 2.360203045685279, | |
| "grad_norm": 0.8205162732404321, | |
| "learning_rate": 5.868240888334653e-05, | |
| "loss": 0.0408410519361496, | |
| "memory(GiB)": 36.91, | |
| "step": 725, | |
| "token_acc": 0.9696969696969697, | |
| "train_speed(iter/s)": 0.016147 | |
| }, | |
| { | |
| "epoch": 2.376446700507614, | |
| "grad_norm": 0.9254262259522679, | |
| "learning_rate": 5.815141718176549e-05, | |
| "loss": 0.03491292595863342, | |
| "memory(GiB)": 36.91, | |
| "step": 730, | |
| "token_acc": 0.9818731117824774, | |
| "train_speed(iter/s)": 0.016148 | |
| }, | |
| { | |
| "epoch": 2.3926903553299494, | |
| "grad_norm": 0.4613013276623316, | |
| "learning_rate": 5.761947934405736e-05, | |
| "loss": 0.041343241930007935, | |
| "memory(GiB)": 36.91, | |
| "step": 735, | |
| "token_acc": 0.9923076923076923, | |
| "train_speed(iter/s)": 0.01615 | |
| }, | |
| { | |
| "epoch": 2.4089340101522843, | |
| "grad_norm": 0.5995425123829327, | |
| "learning_rate": 5.708665711232103e-05, | |
| "loss": 0.026265931129455567, | |
| "memory(GiB)": 36.91, | |
| "step": 740, | |
| "token_acc": 0.980225988700565, | |
| "train_speed(iter/s)": 0.016147 | |
| }, | |
| { | |
| "epoch": 2.425177664974619, | |
| "grad_norm": 0.8947399880614664, | |
| "learning_rate": 5.655301233130711e-05, | |
| "loss": 0.026338309049606323, | |
| "memory(GiB)": 36.91, | |
| "step": 745, | |
| "token_acc": 0.9891304347826086, | |
| "train_speed(iter/s)": 0.01615 | |
| }, | |
| { | |
| "epoch": 2.4414213197969543, | |
| "grad_norm": 0.6528954286261448, | |
| "learning_rate": 5.6018606941239615e-05, | |
| "loss": 0.031349584460258484, | |
| "memory(GiB)": 36.91, | |
| "step": 750, | |
| "token_acc": 0.9825870646766169, | |
| "train_speed(iter/s)": 0.016153 | |
| }, | |
| { | |
| "epoch": 2.4576649746192896, | |
| "grad_norm": 0.9124965491201447, | |
| "learning_rate": 5.548350297062659e-05, | |
| "loss": 0.04390305280685425, | |
| "memory(GiB)": 36.91, | |
| "step": 755, | |
| "token_acc": 0.9971181556195965, | |
| "train_speed(iter/s)": 0.016158 | |
| }, | |
| { | |
| "epoch": 2.4739086294416244, | |
| "grad_norm": 1.2758793187917294, | |
| "learning_rate": 5.494776252906036e-05, | |
| "loss": 0.03932673335075378, | |
| "memory(GiB)": 36.91, | |
| "step": 760, | |
| "token_acc": 0.9852941176470589, | |
| "train_speed(iter/s)": 0.016155 | |
| }, | |
| { | |
| "epoch": 2.490152284263959, | |
| "grad_norm": 1.6183527750946778, | |
| "learning_rate": 5.44114478000086e-05, | |
| "loss": 0.040107494592666625, | |
| "memory(GiB)": 36.91, | |
| "step": 765, | |
| "token_acc": 0.980722891566265, | |
| "train_speed(iter/s)": 0.01616 | |
| }, | |
| { | |
| "epoch": 2.5063959390862944, | |
| "grad_norm": 0.8155608212943981, | |
| "learning_rate": 5.387462103359655e-05, | |
| "loss": 0.034613233804702756, | |
| "memory(GiB)": 36.91, | |
| "step": 770, | |
| "token_acc": 0.9809885931558935, | |
| "train_speed(iter/s)": 0.016158 | |
| }, | |
| { | |
| "epoch": 2.5226395939086297, | |
| "grad_norm": 0.72914335142115, | |
| "learning_rate": 5.333734453938174e-05, | |
| "loss": 0.03472020030021668, | |
| "memory(GiB)": 36.91, | |
| "step": 775, | |
| "token_acc": 0.980722891566265, | |
| "train_speed(iter/s)": 0.016157 | |
| }, | |
| { | |
| "epoch": 2.5388832487309645, | |
| "grad_norm": 0.715640193227215, | |
| "learning_rate": 5.279968067912161e-05, | |
| "loss": 0.03267112672328949, | |
| "memory(GiB)": 36.91, | |
| "step": 780, | |
| "token_acc": 0.9949109414758269, | |
| "train_speed(iter/s)": 0.016159 | |
| }, | |
| { | |
| "epoch": 2.5551269035532993, | |
| "grad_norm": 0.5201766196940287, | |
| "learning_rate": 5.226169185953532e-05, | |
| "loss": 0.06324458122253418, | |
| "memory(GiB)": 36.91, | |
| "step": 785, | |
| "token_acc": 0.9822784810126582, | |
| "train_speed(iter/s)": 0.016157 | |
| }, | |
| { | |
| "epoch": 2.5713705583756346, | |
| "grad_norm": 0.716527670309396, | |
| "learning_rate": 5.1723440525060026e-05, | |
| "loss": 0.036973622441291806, | |
| "memory(GiB)": 36.91, | |
| "step": 790, | |
| "token_acc": 0.9828009828009828, | |
| "train_speed(iter/s)": 0.016157 | |
| }, | |
| { | |
| "epoch": 2.58761421319797, | |
| "grad_norm": 0.9508048665101771, | |
| "learning_rate": 5.118498915060307e-05, | |
| "loss": 0.04134515523910522, | |
| "memory(GiB)": 36.91, | |
| "step": 795, | |
| "token_acc": 0.9832402234636871, | |
| "train_speed(iter/s)": 0.016159 | |
| }, | |
| { | |
| "epoch": 2.6038578680203046, | |
| "grad_norm": 0.1695737988935869, | |
| "learning_rate": 5.064640023429043e-05, | |
| "loss": 0.0396234929561615, | |
| "memory(GiB)": 36.91, | |
| "step": 800, | |
| "token_acc": 0.9937888198757764, | |
| "train_speed(iter/s)": 0.01616 | |
| }, | |
| { | |
| "epoch": 2.6201015228426394, | |
| "grad_norm": 1.353410357397197, | |
| "learning_rate": 5.0107736290212603e-05, | |
| "loss": 0.032366597652435304, | |
| "memory(GiB)": 36.91, | |
| "step": 805, | |
| "token_acc": 0.9853658536585366, | |
| "train_speed(iter/s)": 0.016161 | |
| }, | |
| { | |
| "epoch": 2.6363451776649747, | |
| "grad_norm": 0.9287301884362714, | |
| "learning_rate": 4.956905984116858e-05, | |
| "loss": 0.02025129795074463, | |
| "memory(GiB)": 36.91, | |
| "step": 810, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016156 | |
| }, | |
| { | |
| "epoch": 2.65258883248731, | |
| "grad_norm": 0.6605215469870417, | |
| "learning_rate": 4.903043341140879e-05, | |
| "loss": 0.027498137950897217, | |
| "memory(GiB)": 36.91, | |
| "step": 815, | |
| "token_acc": 0.9890590809628009, | |
| "train_speed(iter/s)": 0.016158 | |
| }, | |
| { | |
| "epoch": 2.6688324873096447, | |
| "grad_norm": 1.284202747583917, | |
| "learning_rate": 4.84919195193779e-05, | |
| "loss": 0.04052730202674866, | |
| "memory(GiB)": 36.91, | |
| "step": 820, | |
| "token_acc": 0.9691714836223507, | |
| "train_speed(iter/s)": 0.016161 | |
| }, | |
| { | |
| "epoch": 2.6850761421319795, | |
| "grad_norm": 1.054572423840406, | |
| "learning_rate": 4.7953580670458345e-05, | |
| "loss": 0.029700332880020143, | |
| "memory(GiB)": 36.91, | |
| "step": 825, | |
| "token_acc": 0.9903381642512077, | |
| "train_speed(iter/s)": 0.016161 | |
| }, | |
| { | |
| "epoch": 2.701319796954315, | |
| "grad_norm": 1.515148160249309, | |
| "learning_rate": 4.7415479349715275e-05, | |
| "loss": 0.03995212614536285, | |
| "memory(GiB)": 36.91, | |
| "step": 830, | |
| "token_acc": 0.9887005649717514, | |
| "train_speed(iter/s)": 0.016163 | |
| }, | |
| { | |
| "epoch": 2.7175634517766496, | |
| "grad_norm": 0.7966857436927859, | |
| "learning_rate": 4.687767801464388e-05, | |
| "loss": 0.029492130875587462, | |
| "memory(GiB)": 36.91, | |
| "step": 835, | |
| "token_acc": 0.9946091644204852, | |
| "train_speed(iter/s)": 0.016162 | |
| }, | |
| { | |
| "epoch": 2.733807106598985, | |
| "grad_norm": 0.6747809015160623, | |
| "learning_rate": 4.634023908791999e-05, | |
| "loss": 0.028040975332260132, | |
| "memory(GiB)": 36.91, | |
| "step": 840, | |
| "token_acc": 0.9950372208436724, | |
| "train_speed(iter/s)": 0.016165 | |
| }, | |
| { | |
| "epoch": 2.7500507614213197, | |
| "grad_norm": 0.7236373548114289, | |
| "learning_rate": 4.5803224950154656e-05, | |
| "loss": 0.022182533144950868, | |
| "memory(GiB)": 36.91, | |
| "step": 845, | |
| "token_acc": 0.9973753280839895, | |
| "train_speed(iter/s)": 0.016167 | |
| }, | |
| { | |
| "epoch": 2.766294416243655, | |
| "grad_norm": 0.8702609694851884, | |
| "learning_rate": 4.5266697932653616e-05, | |
| "loss": 0.03542717695236206, | |
| "memory(GiB)": 36.91, | |
| "step": 850, | |
| "token_acc": 0.9930394431554525, | |
| "train_speed(iter/s)": 0.016168 | |
| }, | |
| { | |
| "epoch": 2.7825380710659897, | |
| "grad_norm": 0.2339976820774803, | |
| "learning_rate": 4.473072031018248e-05, | |
| "loss": 0.017447268962860106, | |
| "memory(GiB)": 36.91, | |
| "step": 855, | |
| "token_acc": 0.9897172236503856, | |
| "train_speed(iter/s)": 0.016172 | |
| }, | |
| { | |
| "epoch": 2.798781725888325, | |
| "grad_norm": 1.7564108472908913, | |
| "learning_rate": 4.4195354293738484e-05, | |
| "loss": 0.040924933552742, | |
| "memory(GiB)": 36.91, | |
| "step": 860, | |
| "token_acc": 0.9693396226415094, | |
| "train_speed(iter/s)": 0.016172 | |
| }, | |
| { | |
| "epoch": 2.8150253807106598, | |
| "grad_norm": 1.749637468786309, | |
| "learning_rate": 4.366066202332974e-05, | |
| "loss": 0.0398847758769989, | |
| "memory(GiB)": 36.91, | |
| "step": 865, | |
| "token_acc": 0.9884726224783862, | |
| "train_speed(iter/s)": 0.016173 | |
| }, | |
| { | |
| "epoch": 2.831269035532995, | |
| "grad_norm": 1.6657986428559317, | |
| "learning_rate": 4.312670556076244e-05, | |
| "loss": 0.027478563785552978, | |
| "memory(GiB)": 36.91, | |
| "step": 870, | |
| "token_acc": 0.9953379953379954, | |
| "train_speed(iter/s)": 0.016178 | |
| }, | |
| { | |
| "epoch": 2.84751269035533, | |
| "grad_norm": 0.8830417040757416, | |
| "learning_rate": 4.259354688243757e-05, | |
| "loss": 0.05422350764274597, | |
| "memory(GiB)": 36.91, | |
| "step": 875, | |
| "token_acc": 0.9813953488372092, | |
| "train_speed(iter/s)": 0.016176 | |
| }, | |
| { | |
| "epoch": 2.863756345177665, | |
| "grad_norm": 1.4037166255295264, | |
| "learning_rate": 4.206124787215714e-05, | |
| "loss": 0.03585241138935089, | |
| "memory(GiB)": 36.91, | |
| "step": 880, | |
| "token_acc": 0.9929577464788732, | |
| "train_speed(iter/s)": 0.016178 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.40929439648007787, | |
| "learning_rate": 4.1529870313941386e-05, | |
| "loss": 0.037713998556137086, | |
| "memory(GiB)": 36.91, | |
| "step": 885, | |
| "token_acc": 0.9755555555555555, | |
| "train_speed(iter/s)": 0.016182 | |
| }, | |
| { | |
| "epoch": 2.896243654822335, | |
| "grad_norm": 0.5649136450093045, | |
| "learning_rate": 4.099947588485744e-05, | |
| "loss": 0.02235218584537506, | |
| "memory(GiB)": 36.91, | |
| "step": 890, | |
| "token_acc": 0.9738562091503268, | |
| "train_speed(iter/s)": 0.016179 | |
| }, | |
| { | |
| "epoch": 2.91248730964467, | |
| "grad_norm": 0.9411441260021843, | |
| "learning_rate": 4.047012614786055e-05, | |
| "loss": 0.03756971955299378, | |
| "memory(GiB)": 36.91, | |
| "step": 895, | |
| "token_acc": 0.9953596287703016, | |
| "train_speed(iter/s)": 0.016182 | |
| }, | |
| { | |
| "epoch": 2.928730964467005, | |
| "grad_norm": 0.493632814272918, | |
| "learning_rate": 3.994188254464838e-05, | |
| "loss": 0.03068949580192566, | |
| "memory(GiB)": 36.91, | |
| "step": 900, | |
| "token_acc": 0.9681372549019608, | |
| "train_speed(iter/s)": 0.016183 | |
| }, | |
| { | |
| "epoch": 2.94497461928934, | |
| "grad_norm": 0.9098057371042104, | |
| "learning_rate": 3.941480638852948e-05, | |
| "loss": 0.060313427448272706, | |
| "memory(GiB)": 36.91, | |
| "step": 905, | |
| "token_acc": 0.9809976247030879, | |
| "train_speed(iter/s)": 0.016186 | |
| }, | |
| { | |
| "epoch": 2.9612182741116753, | |
| "grad_norm": 0.7111307711774197, | |
| "learning_rate": 3.888895885730666e-05, | |
| "loss": 0.017010049521923067, | |
| "memory(GiB)": 36.91, | |
| "step": 910, | |
| "token_acc": 0.9949748743718593, | |
| "train_speed(iter/s)": 0.016184 | |
| }, | |
| { | |
| "epoch": 2.97746192893401, | |
| "grad_norm": 1.1085076966021257, | |
| "learning_rate": 3.836440098617611e-05, | |
| "loss": 0.0352476716041565, | |
| "memory(GiB)": 36.91, | |
| "step": 915, | |
| "token_acc": 0.9971264367816092, | |
| "train_speed(iter/s)": 0.016185 | |
| }, | |
| { | |
| "epoch": 2.9937055837563453, | |
| "grad_norm": 1.0414881730973389, | |
| "learning_rate": 3.784119366064293e-05, | |
| "loss": 0.036097651720046996, | |
| "memory(GiB)": 36.91, | |
| "step": 920, | |
| "token_acc": 0.9859484777517564, | |
| "train_speed(iter/s)": 0.016183 | |
| }, | |
| { | |
| "epoch": 2.996954314720812, | |
| "eval_loss": 0.2438431978225708, | |
| "eval_runtime": 61.9093, | |
| "eval_samples_per_second": 3.198, | |
| "eval_steps_per_second": 0.808, | |
| "eval_token_acc": 0.9426152398871119, | |
| "step": 921 | |
| }, | |
| { | |
| "epoch": 3.0123857868020303, | |
| "grad_norm": 0.40292122284066784, | |
| "learning_rate": 3.731939760945423e-05, | |
| "loss": 0.02739437222480774, | |
| "memory(GiB)": 36.91, | |
| "step": 925, | |
| "token_acc": 0.9686609686609686, | |
| "train_speed(iter/s)": 0.016163 | |
| }, | |
| { | |
| "epoch": 3.0286294416243655, | |
| "grad_norm": 2.9493043319197345, | |
| "learning_rate": 3.6799073397550324e-05, | |
| "loss": 0.023541851341724394, | |
| "memory(GiB)": 36.91, | |
| "step": 930, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016168 | |
| }, | |
| { | |
| "epoch": 3.0448730964467003, | |
| "grad_norm": 0.17930096671859505, | |
| "learning_rate": 3.628028141903493e-05, | |
| "loss": 0.011585032194852829, | |
| "memory(GiB)": 36.91, | |
| "step": 935, | |
| "token_acc": 0.9955849889624724, | |
| "train_speed(iter/s)": 0.016168 | |
| }, | |
| { | |
| "epoch": 3.0611167512690356, | |
| "grad_norm": 0.32421421634457975, | |
| "learning_rate": 3.576308189016521e-05, | |
| "loss": 0.01218060329556465, | |
| "memory(GiB)": 36.91, | |
| "step": 940, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016169 | |
| }, | |
| { | |
| "epoch": 3.0773604060913704, | |
| "grad_norm": 0.6594419595560748, | |
| "learning_rate": 3.5247534842362486e-05, | |
| "loss": 0.02207506597042084, | |
| "memory(GiB)": 36.91, | |
| "step": 945, | |
| "token_acc": 0.988558352402746, | |
| "train_speed(iter/s)": 0.016162 | |
| }, | |
| { | |
| "epoch": 3.0936040609137057, | |
| "grad_norm": 0.2767332960437252, | |
| "learning_rate": 3.473370011524435e-05, | |
| "loss": 0.007218687236309052, | |
| "memory(GiB)": 36.91, | |
| "step": 950, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016165 | |
| }, | |
| { | |
| "epoch": 3.1098477157360405, | |
| "grad_norm": 0.35071543831944074, | |
| "learning_rate": 3.422163734967913e-05, | |
| "loss": 0.01153595745563507, | |
| "memory(GiB)": 36.91, | |
| "step": 955, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016173 | |
| }, | |
| { | |
| "epoch": 3.1260913705583757, | |
| "grad_norm": 0.09053944993100493, | |
| "learning_rate": 3.371140598086332e-05, | |
| "loss": 0.0028192587196826935, | |
| "memory(GiB)": 36.91, | |
| "step": 960, | |
| "token_acc": 0.9975247524752475, | |
| "train_speed(iter/s)": 0.016172 | |
| }, | |
| { | |
| "epoch": 3.1423350253807105, | |
| "grad_norm": 0.2428779518534084, | |
| "learning_rate": 3.3203065231422904e-05, | |
| "loss": 0.0033150166273117065, | |
| "memory(GiB)": 36.91, | |
| "step": 965, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016172 | |
| }, | |
| { | |
| "epoch": 3.1585786802030458, | |
| "grad_norm": 0.3634314044068558, | |
| "learning_rate": 3.269667410453944e-05, | |
| "loss": 0.006601892411708832, | |
| "memory(GiB)": 36.91, | |
| "step": 970, | |
| "token_acc": 0.9974160206718347, | |
| "train_speed(iter/s)": 0.016171 | |
| }, | |
| { | |
| "epoch": 3.1748223350253806, | |
| "grad_norm": 0.09528591509222967, | |
| "learning_rate": 3.2192291377101544e-05, | |
| "loss": 0.006571587175130844, | |
| "memory(GiB)": 36.91, | |
| "step": 975, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016173 | |
| }, | |
| { | |
| "epoch": 3.191065989847716, | |
| "grad_norm": 1.3857004471442305, | |
| "learning_rate": 3.1689975592882603e-05, | |
| "loss": 0.010420820116996765, | |
| "memory(GiB)": 36.91, | |
| "step": 980, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016176 | |
| }, | |
| { | |
| "epoch": 3.2073096446700506, | |
| "grad_norm": 0.6960737288379213, | |
| "learning_rate": 3.11897850557456e-05, | |
| "loss": 0.013220900297164917, | |
| "memory(GiB)": 36.91, | |
| "step": 985, | |
| "token_acc": 0.9951807228915662, | |
| "train_speed(iter/s)": 0.016176 | |
| }, | |
| { | |
| "epoch": 3.223553299492386, | |
| "grad_norm": 0.9453732221306024, | |
| "learning_rate": 3.0691777822875846e-05, | |
| "loss": 0.01793895959854126, | |
| "memory(GiB)": 36.91, | |
| "step": 990, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016176 | |
| }, | |
| { | |
| "epoch": 3.2397969543147207, | |
| "grad_norm": 0.7409992990444315, | |
| "learning_rate": 3.019601169804216e-05, | |
| "loss": 0.019229742884635925, | |
| "memory(GiB)": 36.91, | |
| "step": 995, | |
| "token_acc": 0.9945054945054945, | |
| "train_speed(iter/s)": 0.016174 | |
| }, | |
| { | |
| "epoch": 3.256040609137056, | |
| "grad_norm": 0.5679417621370911, | |
| "learning_rate": 2.9702544224887684e-05, | |
| "loss": 0.024555668234825134, | |
| "memory(GiB)": 36.91, | |
| "step": 1000, | |
| "token_acc": 0.9953161592505855, | |
| "train_speed(iter/s)": 0.016175 | |
| }, | |
| { | |
| "epoch": 3.2722842639593908, | |
| "grad_norm": 0.08818412948467023, | |
| "learning_rate": 2.9211432680250717e-05, | |
| "loss": 0.009600495547056198, | |
| "memory(GiB)": 36.91, | |
| "step": 1005, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016176 | |
| }, | |
| { | |
| "epoch": 3.288527918781726, | |
| "grad_norm": 0.597788232010352, | |
| "learning_rate": 2.872273406751664e-05, | |
| "loss": 0.015477313101291657, | |
| "memory(GiB)": 36.91, | |
| "step": 1010, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016178 | |
| }, | |
| { | |
| "epoch": 3.304771573604061, | |
| "grad_norm": 0.5815875303347526, | |
| "learning_rate": 2.823650511000142e-05, | |
| "loss": 0.007314224541187286, | |
| "memory(GiB)": 36.91, | |
| "step": 1015, | |
| "token_acc": 0.9928741092636579, | |
| "train_speed(iter/s)": 0.016175 | |
| }, | |
| { | |
| "epoch": 3.321015228426396, | |
| "grad_norm": 0.06303638116527722, | |
| "learning_rate": 2.7752802244367875e-05, | |
| "loss": 0.0048162821680307385, | |
| "memory(GiB)": 36.91, | |
| "step": 1020, | |
| "token_acc": 0.9976359338061466, | |
| "train_speed(iter/s)": 0.016175 | |
| }, | |
| { | |
| "epoch": 3.337258883248731, | |
| "grad_norm": 1.530822467857818, | |
| "learning_rate": 2.7271681614074973e-05, | |
| "loss": 0.011756302416324615, | |
| "memory(GiB)": 36.91, | |
| "step": 1025, | |
| "token_acc": 0.9976744186046511, | |
| "train_speed(iter/s)": 0.016173 | |
| }, | |
| { | |
| "epoch": 3.353502538071066, | |
| "grad_norm": 0.03790601751186608, | |
| "learning_rate": 2.679319906286122e-05, | |
| "loss": 0.008612405508756638, | |
| "memory(GiB)": 36.91, | |
| "step": 1030, | |
| "token_acc": 0.9927184466019418, | |
| "train_speed(iter/s)": 0.016176 | |
| }, | |
| { | |
| "epoch": 3.369746192893401, | |
| "grad_norm": 0.21401768725028367, | |
| "learning_rate": 2.6317410128262954e-05, | |
| "loss": 0.006316320598125457, | |
| "memory(GiB)": 36.91, | |
| "step": 1035, | |
| "token_acc": 0.9950124688279302, | |
| "train_speed(iter/s)": 0.016179 | |
| }, | |
| { | |
| "epoch": 3.385989847715736, | |
| "grad_norm": 0.19540220508166592, | |
| "learning_rate": 2.5844370035168073e-05, | |
| "loss": 0.004939628392457962, | |
| "memory(GiB)": 36.91, | |
| "step": 1040, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016181 | |
| }, | |
| { | |
| "epoch": 3.402233502538071, | |
| "grad_norm": 0.8965894055639708, | |
| "learning_rate": 2.537413368940601e-05, | |
| "loss": 0.016151268780231477, | |
| "memory(GiB)": 36.91, | |
| "step": 1045, | |
| "token_acc": 0.9898785425101214, | |
| "train_speed(iter/s)": 0.016182 | |
| }, | |
| { | |
| "epoch": 3.4184771573604062, | |
| "grad_norm": 0.21427146738429803, | |
| "learning_rate": 2.4906755671374903e-05, | |
| "loss": 0.010773959755897521, | |
| "memory(GiB)": 36.91, | |
| "step": 1050, | |
| "token_acc": 0.9977827050997783, | |
| "train_speed(iter/s)": 0.016182 | |
| }, | |
| { | |
| "epoch": 3.434720812182741, | |
| "grad_norm": 0.09286838269357345, | |
| "learning_rate": 2.4442290229706344e-05, | |
| "loss": 0.004091666638851165, | |
| "memory(GiB)": 36.91, | |
| "step": 1055, | |
| "token_acc": 0.9954233409610984, | |
| "train_speed(iter/s)": 0.016183 | |
| }, | |
| { | |
| "epoch": 3.4509644670050763, | |
| "grad_norm": 0.13489614133107514, | |
| "learning_rate": 2.3980791274968837e-05, | |
| "loss": 0.018990179896354674, | |
| "memory(GiB)": 36.91, | |
| "step": 1060, | |
| "token_acc": 0.9945054945054945, | |
| "train_speed(iter/s)": 0.016184 | |
| }, | |
| { | |
| "epoch": 3.467208121827411, | |
| "grad_norm": 0.1825955700626613, | |
| "learning_rate": 2.3522312373410276e-05, | |
| "loss": 0.011526491492986679, | |
| "memory(GiB)": 36.91, | |
| "step": 1065, | |
| "token_acc": 0.997275204359673, | |
| "train_speed(iter/s)": 0.016188 | |
| }, | |
| { | |
| "epoch": 3.4834517766497464, | |
| "grad_norm": 0.2440094791459664, | |
| "learning_rate": 2.3066906740740623e-05, | |
| "loss": 0.019795812666416168, | |
| "memory(GiB)": 36.91, | |
| "step": 1070, | |
| "token_acc": 0.9896373056994818, | |
| "train_speed(iter/s)": 0.016187 | |
| }, | |
| { | |
| "epoch": 3.499695431472081, | |
| "grad_norm": 0.4913730237430669, | |
| "learning_rate": 2.2614627235955026e-05, | |
| "loss": 0.007270602881908417, | |
| "memory(GiB)": 36.91, | |
| "step": 1075, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016189 | |
| }, | |
| { | |
| "epoch": 3.5159390862944164, | |
| "grad_norm": 0.6922284750457558, | |
| "learning_rate": 2.2165526355198605e-05, | |
| "loss": 0.0127563938498497, | |
| "memory(GiB)": 36.91, | |
| "step": 1080, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016191 | |
| }, | |
| { | |
| "epoch": 3.5321827411167512, | |
| "grad_norm": 0.6450602563278425, | |
| "learning_rate": 2.171965622567308e-05, | |
| "loss": 0.007853203266859055, | |
| "memory(GiB)": 36.91, | |
| "step": 1085, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016193 | |
| }, | |
| { | |
| "epoch": 3.548426395939086, | |
| "grad_norm": 0.3234875973475892, | |
| "learning_rate": 2.127706859958647e-05, | |
| "loss": 0.008352670073509216, | |
| "memory(GiB)": 36.91, | |
| "step": 1090, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016193 | |
| }, | |
| { | |
| "epoch": 3.5646700507614213, | |
| "grad_norm": 0.09371017997182811, | |
| "learning_rate": 2.0837814848146166e-05, | |
| "loss": 0.001982194371521473, | |
| "memory(GiB)": 36.91, | |
| "step": 1095, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016191 | |
| }, | |
| { | |
| "epoch": 3.5809137055837565, | |
| "grad_norm": 0.8724610494447905, | |
| "learning_rate": 2.0401945955596206e-05, | |
| "loss": 0.0030656153336167335, | |
| "memory(GiB)": 36.91, | |
| "step": 1100, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016188 | |
| }, | |
| { | |
| "epoch": 3.5971573604060914, | |
| "grad_norm": 0.5650605008223917, | |
| "learning_rate": 1.9969512513299664e-05, | |
| "loss": 0.00554112084209919, | |
| "memory(GiB)": 36.91, | |
| "step": 1105, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.01619 | |
| }, | |
| { | |
| "epoch": 3.613401015228426, | |
| "grad_norm": 0.39939968413297244, | |
| "learning_rate": 1.9540564713866387e-05, | |
| "loss": 0.006034587323665619, | |
| "memory(GiB)": 36.91, | |
| "step": 1110, | |
| "token_acc": 0.9948586118251928, | |
| "train_speed(iter/s)": 0.016191 | |
| }, | |
| { | |
| "epoch": 3.6296446700507614, | |
| "grad_norm": 0.1065247660653177, | |
| "learning_rate": 1.9115152345327152e-05, | |
| "loss": 0.005482121184468269, | |
| "memory(GiB)": 36.91, | |
| "step": 1115, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016191 | |
| }, | |
| { | |
| "epoch": 3.6458883248730967, | |
| "grad_norm": 0.8174090560458377, | |
| "learning_rate": 1.8693324785354822e-05, | |
| "loss": 0.011324305832386018, | |
| "memory(GiB)": 36.91, | |
| "step": 1120, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016193 | |
| }, | |
| { | |
| "epoch": 3.6621319796954315, | |
| "grad_norm": 0.17850770204119407, | |
| "learning_rate": 1.8275130995532974e-05, | |
| "loss": 0.0144767165184021, | |
| "memory(GiB)": 36.91, | |
| "step": 1125, | |
| "token_acc": 0.9978586723768736, | |
| "train_speed(iter/s)": 0.016195 | |
| }, | |
| { | |
| "epoch": 3.6783756345177663, | |
| "grad_norm": 0.33877743892749795, | |
| "learning_rate": 1.7860619515673033e-05, | |
| "loss": 0.01116895154118538, | |
| "memory(GiB)": 36.91, | |
| "step": 1130, | |
| "token_acc": 0.9953271028037384, | |
| "train_speed(iter/s)": 0.016195 | |
| }, | |
| { | |
| "epoch": 3.6946192893401015, | |
| "grad_norm": 0.5168488777536275, | |
| "learning_rate": 1.744983845818019e-05, | |
| "loss": 0.0068625412881374356, | |
| "memory(GiB)": 36.91, | |
| "step": 1135, | |
| "token_acc": 0.9978213507625272, | |
| "train_speed(iter/s)": 0.0162 | |
| }, | |
| { | |
| "epoch": 3.710862944162437, | |
| "grad_norm": 0.7346145409084535, | |
| "learning_rate": 1.7042835502468934e-05, | |
| "loss": 0.002322973683476448, | |
| "memory(GiB)": 36.91, | |
| "step": 1140, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016197 | |
| }, | |
| { | |
| "epoch": 3.7271065989847716, | |
| "grad_norm": 0.6646625028373466, | |
| "learning_rate": 1.6639657889429018e-05, | |
| "loss": 0.018248292803764343, | |
| "memory(GiB)": 36.91, | |
| "step": 1145, | |
| "token_acc": 0.9840182648401826, | |
| "train_speed(iter/s)": 0.016195 | |
| }, | |
| { | |
| "epoch": 3.7433502538071064, | |
| "grad_norm": 0.8354437881107281, | |
| "learning_rate": 1.624035241594213e-05, | |
| "loss": 0.006459401547908783, | |
| "memory(GiB)": 36.91, | |
| "step": 1150, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016193 | |
| }, | |
| { | |
| "epoch": 3.7595939086294416, | |
| "grad_norm": 0.2958093671449778, | |
| "learning_rate": 1.5844965429450132e-05, | |
| "loss": 0.008441635966300964, | |
| "memory(GiB)": 36.91, | |
| "step": 1155, | |
| "token_acc": 0.9834368530020704, | |
| "train_speed(iter/s)": 0.016192 | |
| }, | |
| { | |
| "epoch": 3.775837563451777, | |
| "grad_norm": 0.4306627690474224, | |
| "learning_rate": 1.545354282257562e-05, | |
| "loss": 0.015231077373027802, | |
| "memory(GiB)": 36.91, | |
| "step": 1160, | |
| "token_acc": 0.9976851851851852, | |
| "train_speed(iter/s)": 0.016196 | |
| }, | |
| { | |
| "epoch": 3.7920812182741117, | |
| "grad_norm": 0.0801666210860899, | |
| "learning_rate": 1.5066130027795044e-05, | |
| "loss": 0.02225690186023712, | |
| "memory(GiB)": 36.91, | |
| "step": 1165, | |
| "token_acc": 0.9886363636363636, | |
| "train_speed(iter/s)": 0.0162 | |
| }, | |
| { | |
| "epoch": 3.8083248730964465, | |
| "grad_norm": 1.390297822775598, | |
| "learning_rate": 1.4682772012165436e-05, | |
| "loss": 0.011767344176769256, | |
| "memory(GiB)": 36.91, | |
| "step": 1170, | |
| "token_acc": 0.9953810623556582, | |
| "train_speed(iter/s)": 0.0162 | |
| }, | |
| { | |
| "epoch": 3.8245685279187818, | |
| "grad_norm": 0.576269037629794, | |
| "learning_rate": 1.4303513272105057e-05, | |
| "loss": 0.01135575920343399, | |
| "memory(GiB)": 36.91, | |
| "step": 1175, | |
| "token_acc": 0.9976744186046511, | |
| "train_speed(iter/s)": 0.016199 | |
| }, | |
| { | |
| "epoch": 3.840812182741117, | |
| "grad_norm": 0.6175307257021349, | |
| "learning_rate": 1.3928397828228628e-05, | |
| "loss": 0.00802643597126007, | |
| "memory(GiB)": 36.91, | |
| "step": 1180, | |
| "token_acc": 0.9950738916256158, | |
| "train_speed(iter/s)": 0.016201 | |
| }, | |
| { | |
| "epoch": 3.857055837563452, | |
| "grad_norm": 0.13098006216818975, | |
| "learning_rate": 1.3557469220237962e-05, | |
| "loss": 0.011502113938331605, | |
| "memory(GiB)": 36.91, | |
| "step": 1185, | |
| "token_acc": 0.9935344827586207, | |
| "train_speed(iter/s)": 0.016204 | |
| }, | |
| { | |
| "epoch": 3.8732994923857866, | |
| "grad_norm": 0.3987654668677921, | |
| "learning_rate": 1.3190770501868243e-05, | |
| "loss": 0.011363585293293, | |
| "memory(GiB)": 36.91, | |
| "step": 1190, | |
| "token_acc": 0.9974160206718347, | |
| "train_speed(iter/s)": 0.016203 | |
| }, | |
| { | |
| "epoch": 3.889543147208122, | |
| "grad_norm": 0.14976124575026759, | |
| "learning_rate": 1.2828344235890726e-05, | |
| "loss": 0.01089974120259285, | |
| "memory(GiB)": 36.91, | |
| "step": 1195, | |
| "token_acc": 0.9933481152993349, | |
| "train_speed(iter/s)": 0.016203 | |
| }, | |
| { | |
| "epoch": 3.9057868020304567, | |
| "grad_norm": 1.5199866835408566, | |
| "learning_rate": 1.247023248917259e-05, | |
| "loss": 0.009822697192430497, | |
| "memory(GiB)": 36.91, | |
| "step": 1200, | |
| "token_acc": 0.9929742388758782, | |
| "train_speed(iter/s)": 0.016204 | |
| }, | |
| { | |
| "epoch": 3.922030456852792, | |
| "grad_norm": 1.6580131250235997, | |
| "learning_rate": 1.2116476827794104e-05, | |
| "loss": 0.024014970660209654, | |
| "memory(GiB)": 36.91, | |
| "step": 1205, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016204 | |
| }, | |
| { | |
| "epoch": 3.9382741116751268, | |
| "grad_norm": 1.157754837023021, | |
| "learning_rate": 1.1767118312224151e-05, | |
| "loss": 0.007532584667205811, | |
| "memory(GiB)": 36.91, | |
| "step": 1210, | |
| "token_acc": 0.9972375690607734, | |
| "train_speed(iter/s)": 0.016207 | |
| }, | |
| { | |
| "epoch": 3.954517766497462, | |
| "grad_norm": 0.6972765226059477, | |
| "learning_rate": 1.142219749255427e-05, | |
| "loss": 0.004430451989173889, | |
| "memory(GiB)": 36.91, | |
| "step": 1215, | |
| "token_acc": 0.9972677595628415, | |
| "train_speed(iter/s)": 0.016207 | |
| }, | |
| { | |
| "epoch": 3.970761421319797, | |
| "grad_norm": 2.2979580480692188, | |
| "learning_rate": 1.1081754403791999e-05, | |
| "loss": 0.015141716599464417, | |
| "memory(GiB)": 36.91, | |
| "step": 1220, | |
| "token_acc": 0.9954337899543378, | |
| "train_speed(iter/s)": 0.016206 | |
| }, | |
| { | |
| "epoch": 3.987005076142132, | |
| "grad_norm": 0.2965970510784761, | |
| "learning_rate": 1.0745828561214056e-05, | |
| "loss": 0.021216361224651335, | |
| "memory(GiB)": 36.91, | |
| "step": 1225, | |
| "token_acc": 0.9954337899543378, | |
| "train_speed(iter/s)": 0.016206 | |
| }, | |
| { | |
| "epoch": 3.996751269035533, | |
| "eval_loss": 0.29802748560905457, | |
| "eval_runtime": 62.08, | |
| "eval_samples_per_second": 3.189, | |
| "eval_steps_per_second": 0.805, | |
| "eval_token_acc": 0.9388523047977423, | |
| "step": 1228 | |
| }, | |
| { | |
| "epoch": 4.0056852791878175, | |
| "grad_norm": 0.7419564842144963, | |
| "learning_rate": 1.041445895577977e-05, | |
| "loss": 0.009254975616931916, | |
| "memory(GiB)": 36.91, | |
| "step": 1230, | |
| "token_acc": 0.9668174962292609, | |
| "train_speed(iter/s)": 0.016191 | |
| }, | |
| { | |
| "epoch": 4.021928934010153, | |
| "grad_norm": 0.1343462548929871, | |
| "learning_rate": 1.008768404960535e-05, | |
| "loss": 0.002759779617190361, | |
| "memory(GiB)": 36.91, | |
| "step": 1235, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016192 | |
| }, | |
| { | |
| "epoch": 4.038172588832487, | |
| "grad_norm": 0.08229350773537837, | |
| "learning_rate": 9.765541771499659e-06, | |
| "loss": 0.0012123636901378632, | |
| "memory(GiB)": 36.91, | |
| "step": 1240, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016192 | |
| }, | |
| { | |
| "epoch": 4.054416243654822, | |
| "grad_norm": 0.08190000464747839, | |
| "learning_rate": 9.448069512561775e-06, | |
| "loss": 0.0066297553479671475, | |
| "memory(GiB)": 36.91, | |
| "step": 1245, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016195 | |
| }, | |
| { | |
| "epoch": 4.070659898477158, | |
| "grad_norm": 0.12397302242146173, | |
| "learning_rate": 9.135304121840976e-06, | |
| "loss": 0.0012923330999910832, | |
| "memory(GiB)": 36.91, | |
| "step": 1250, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016192 | |
| }, | |
| { | |
| "epoch": 4.086903553299492, | |
| "grad_norm": 0.057048418793994596, | |
| "learning_rate": 8.827281902059698e-06, | |
| "loss": 0.0007107659243047237, | |
| "memory(GiB)": 36.91, | |
| "step": 1255, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016191 | |
| }, | |
| { | |
| "epoch": 4.103147208121827, | |
| "grad_norm": 0.16324844745357645, | |
| "learning_rate": 8.524038605399886e-06, | |
| "loss": 0.0021383626386523246, | |
| "memory(GiB)": 36.91, | |
| "step": 1260, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016189 | |
| }, | |
| { | |
| "epoch": 4.1193908629441625, | |
| "grad_norm": 0.06874787839714207, | |
| "learning_rate": 8.225609429353187e-06, | |
| "loss": 0.0028022559359669684, | |
| "memory(GiB)": 36.91, | |
| "step": 1265, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016188 | |
| }, | |
| { | |
| "epoch": 4.135634517766498, | |
| "grad_norm": 0.2526140368602798, | |
| "learning_rate": 7.932029012635623e-06, | |
| "loss": 0.003260459750890732, | |
| "memory(GiB)": 36.91, | |
| "step": 1270, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016187 | |
| }, | |
| { | |
| "epoch": 4.151878172588832, | |
| "grad_norm": 0.14918347721067196, | |
| "learning_rate": 7.643331431167017e-06, | |
| "loss": 0.004188637435436249, | |
| "memory(GiB)": 36.91, | |
| "step": 1275, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016188 | |
| }, | |
| { | |
| "epoch": 4.168121827411167, | |
| "grad_norm": 0.46928271799249704, | |
| "learning_rate": 7.35955019411585e-06, | |
| "loss": 0.011932872980833054, | |
| "memory(GiB)": 36.91, | |
| "step": 1280, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016191 | |
| }, | |
| { | |
| "epoch": 4.184365482233503, | |
| "grad_norm": 0.07080459315091195, | |
| "learning_rate": 7.080718240009826e-06, | |
| "loss": 0.004019932448863983, | |
| "memory(GiB)": 36.91, | |
| "step": 1285, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.01619 | |
| }, | |
| { | |
| "epoch": 4.200609137055838, | |
| "grad_norm": 0.7271340874397169, | |
| "learning_rate": 6.806867932912653e-06, | |
| "loss": 0.0061328854411840435, | |
| "memory(GiB)": 36.91, | |
| "step": 1290, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016194 | |
| }, | |
| { | |
| "epoch": 4.216852791878172, | |
| "grad_norm": 0.1265328539578886, | |
| "learning_rate": 6.53803105866761e-06, | |
| "loss": 0.006417517364025116, | |
| "memory(GiB)": 36.91, | |
| "step": 1295, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016196 | |
| }, | |
| { | |
| "epoch": 4.233096446700507, | |
| "grad_norm": 0.057288978819073196, | |
| "learning_rate": 6.274238821208128e-06, | |
| "loss": 0.003987757861614228, | |
| "memory(GiB)": 36.91, | |
| "step": 1300, | |
| "token_acc": 0.9975062344139651, | |
| "train_speed(iter/s)": 0.016195 | |
| }, | |
| { | |
| "epoch": 4.249340101522843, | |
| "grad_norm": 0.1481683428098521, | |
| "learning_rate": 6.015521838935905e-06, | |
| "loss": 0.0010721445083618163, | |
| "memory(GiB)": 36.91, | |
| "step": 1305, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016194 | |
| }, | |
| { | |
| "epoch": 4.265583756345178, | |
| "grad_norm": 0.10590383120253814, | |
| "learning_rate": 5.7619101411671095e-06, | |
| "loss": 0.002213609591126442, | |
| "memory(GiB)": 36.91, | |
| "step": 1310, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016195 | |
| }, | |
| { | |
| "epoch": 4.281827411167512, | |
| "grad_norm": 0.04714189372424805, | |
| "learning_rate": 5.513433164646814e-06, | |
| "loss": 0.0011348580941557885, | |
| "memory(GiB)": 36.91, | |
| "step": 1315, | |
| "token_acc": 0.9976689976689976, | |
| "train_speed(iter/s)": 0.016199 | |
| }, | |
| { | |
| "epoch": 4.298071065989848, | |
| "grad_norm": 0.476391282204877, | |
| "learning_rate": 5.270119750132258e-06, | |
| "loss": 0.004196888953447342, | |
| "memory(GiB)": 36.91, | |
| "step": 1320, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016201 | |
| }, | |
| { | |
| "epoch": 4.314314720812183, | |
| "grad_norm": 0.35042552841819846, | |
| "learning_rate": 5.031998139045352e-06, | |
| "loss": 0.0034095611423254012, | |
| "memory(GiB)": 36.91, | |
| "step": 1325, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016203 | |
| }, | |
| { | |
| "epoch": 4.330558375634518, | |
| "grad_norm": 0.05524764971116243, | |
| "learning_rate": 4.799095970194628e-06, | |
| "loss": 0.0037711452692747115, | |
| "memory(GiB)": 36.91, | |
| "step": 1330, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016203 | |
| }, | |
| { | |
| "epoch": 4.346802030456852, | |
| "grad_norm": 0.5445980593755461, | |
| "learning_rate": 4.571440276567257e-06, | |
| "loss": 0.0024499524384737014, | |
| "memory(GiB)": 36.91, | |
| "step": 1335, | |
| "token_acc": 0.997624703087886, | |
| "train_speed(iter/s)": 0.016206 | |
| }, | |
| { | |
| "epoch": 4.363045685279188, | |
| "grad_norm": 0.10598886435572437, | |
| "learning_rate": 4.349057482191299e-06, | |
| "loss": 0.004410183429718018, | |
| "memory(GiB)": 36.91, | |
| "step": 1340, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016206 | |
| }, | |
| { | |
| "epoch": 4.379289340101523, | |
| "grad_norm": 0.04699969388550453, | |
| "learning_rate": 4.1319733990686446e-06, | |
| "loss": 0.0011100947856903076, | |
| "memory(GiB)": 36.91, | |
| "step": 1345, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016205 | |
| }, | |
| { | |
| "epoch": 4.395532994923858, | |
| "grad_norm": 0.017045928815902597, | |
| "learning_rate": 3.920213224179042e-06, | |
| "loss": 0.00034863052424043416, | |
| "memory(GiB)": 36.91, | |
| "step": 1350, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016206 | |
| }, | |
| { | |
| "epoch": 4.4117766497461925, | |
| "grad_norm": 0.7161935581935048, | |
| "learning_rate": 3.7138015365554833e-06, | |
| "loss": 0.0035605177283287047, | |
| "memory(GiB)": 36.91, | |
| "step": 1355, | |
| "token_acc": 0.9977220956719818, | |
| "train_speed(iter/s)": 0.016207 | |
| }, | |
| { | |
| "epoch": 4.428020304568528, | |
| "grad_norm": 0.06887525802872778, | |
| "learning_rate": 3.512762294431271e-06, | |
| "loss": 0.006134101003408432, | |
| "memory(GiB)": 36.91, | |
| "step": 1360, | |
| "token_acc": 0.9975186104218362, | |
| "train_speed(iter/s)": 0.016208 | |
| }, | |
| { | |
| "epoch": 4.444263959390863, | |
| "grad_norm": 0.041826315852571724, | |
| "learning_rate": 3.3171188324592427e-06, | |
| "loss": 0.0012344198301434516, | |
| "memory(GiB)": 36.91, | |
| "step": 1365, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016208 | |
| }, | |
| { | |
| "epoch": 4.460507614213198, | |
| "grad_norm": 0.07787992465189252, | |
| "learning_rate": 3.126893859003249e-06, | |
| "loss": 0.0013754777610301971, | |
| "memory(GiB)": 36.91, | |
| "step": 1370, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016208 | |
| }, | |
| { | |
| "epoch": 4.476751269035533, | |
| "grad_norm": 0.9611581457799497, | |
| "learning_rate": 2.9421094535024507e-06, | |
| "loss": 0.004121043905615807, | |
| "memory(GiB)": 36.91, | |
| "step": 1375, | |
| "token_acc": 0.9933920704845814, | |
| "train_speed(iter/s)": 0.016206 | |
| }, | |
| { | |
| "epoch": 4.492994923857868, | |
| "grad_norm": 0.11072593270596472, | |
| "learning_rate": 2.762787063908523e-06, | |
| "loss": 0.0024029091000556946, | |
| "memory(GiB)": 36.91, | |
| "step": 1380, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016205 | |
| }, | |
| { | |
| "epoch": 4.509238578680203, | |
| "grad_norm": 0.02340550565254115, | |
| "learning_rate": 2.5889475041961765e-06, | |
| "loss": 0.001028289459645748, | |
| "memory(GiB)": 36.91, | |
| "step": 1385, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.01621 | |
| }, | |
| { | |
| "epoch": 4.525482233502538, | |
| "grad_norm": 0.08895116218405089, | |
| "learning_rate": 2.4206109519473163e-06, | |
| "loss": 0.0021161407232284544, | |
| "memory(GiB)": 36.91, | |
| "step": 1390, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016211 | |
| }, | |
| { | |
| "epoch": 4.541725888324873, | |
| "grad_norm": 0.24076601170504602, | |
| "learning_rate": 2.2577969460089997e-06, | |
| "loss": 0.0007429494522511959, | |
| "memory(GiB)": 36.91, | |
| "step": 1395, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016212 | |
| }, | |
| { | |
| "epoch": 4.557969543147208, | |
| "grad_norm": 0.19664829308024404, | |
| "learning_rate": 2.100524384225555e-06, | |
| "loss": 0.0008249727077782154, | |
| "memory(GiB)": 36.91, | |
| "step": 1400, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.01621 | |
| }, | |
| { | |
| "epoch": 4.574213197969543, | |
| "grad_norm": 0.06599531052332817, | |
| "learning_rate": 1.948811521245131e-06, | |
| "loss": 0.000786225963383913, | |
| "memory(GiB)": 36.91, | |
| "step": 1405, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016209 | |
| }, | |
| { | |
| "epoch": 4.5904568527918785, | |
| "grad_norm": 0.10702737644857346, | |
| "learning_rate": 1.8026759664008465e-06, | |
| "loss": 0.003063713386654854, | |
| "memory(GiB)": 36.91, | |
| "step": 1410, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016207 | |
| }, | |
| { | |
| "epoch": 4.606700507614213, | |
| "grad_norm": 0.41678449867799244, | |
| "learning_rate": 1.6621346816668992e-06, | |
| "loss": 0.00532943345606327, | |
| "memory(GiB)": 36.91, | |
| "step": 1415, | |
| "token_acc": 0.9937629937629938, | |
| "train_speed(iter/s)": 0.016207 | |
| }, | |
| { | |
| "epoch": 4.622944162436548, | |
| "grad_norm": 0.029982460463042173, | |
| "learning_rate": 1.5272039796897786e-06, | |
| "loss": 0.0017097776755690575, | |
| "memory(GiB)": 36.91, | |
| "step": 1420, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016209 | |
| }, | |
| { | |
| "epoch": 4.639187817258883, | |
| "grad_norm": 0.03591858354249925, | |
| "learning_rate": 1.397899521894841e-06, | |
| "loss": 0.0013645312748849392, | |
| "memory(GiB)": 36.91, | |
| "step": 1425, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016212 | |
| }, | |
| { | |
| "epoch": 4.655431472081219, | |
| "grad_norm": 0.04773799774300644, | |
| "learning_rate": 1.2742363166685034e-06, | |
| "loss": 0.0009639391675591469, | |
| "memory(GiB)": 36.91, | |
| "step": 1430, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.01621 | |
| }, | |
| { | |
| "epoch": 4.671675126903553, | |
| "grad_norm": 0.129000803673704, | |
| "learning_rate": 1.15622871761622e-06, | |
| "loss": 0.0005136763211339712, | |
| "memory(GiB)": 36.91, | |
| "step": 1435, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016212 | |
| }, | |
| { | |
| "epoch": 4.687918781725888, | |
| "grad_norm": 0.029179325549530243, | |
| "learning_rate": 1.0438904218964319e-06, | |
| "loss": 0.0004105303902179003, | |
| "memory(GiB)": 36.91, | |
| "step": 1440, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016213 | |
| }, | |
| { | |
| "epoch": 4.7041624365482235, | |
| "grad_norm": 0.04897256940654327, | |
| "learning_rate": 9.372344686307655e-07, | |
| "loss": 0.0009922079741954803, | |
| "memory(GiB)": 36.91, | |
| "step": 1445, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016212 | |
| }, | |
| { | |
| "epoch": 4.720406091370558, | |
| "grad_norm": 0.0393178010532892, | |
| "learning_rate": 8.362732373905723e-07, | |
| "loss": 0.0008288329467177391, | |
| "memory(GiB)": 36.91, | |
| "step": 1450, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016214 | |
| }, | |
| { | |
| "epoch": 4.736649746192893, | |
| "grad_norm": 0.08771738931354985, | |
| "learning_rate": 7.410184467600001e-07, | |
| "loss": 0.0005111692938953638, | |
| "memory(GiB)": 36.91, | |
| "step": 1455, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016214 | |
| }, | |
| { | |
| "epoch": 4.752893401015228, | |
| "grad_norm": 0.04916799951696976, | |
| "learning_rate": 6.514811529758747e-07, | |
| "loss": 0.007441927492618561, | |
| "memory(GiB)": 36.91, | |
| "step": 1460, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016214 | |
| }, | |
| { | |
| "epoch": 4.769137055837564, | |
| "grad_norm": 0.44716598217302617, | |
| "learning_rate": 5.676717486443439e-07, | |
| "loss": 0.0024275451898574827, | |
| "memory(GiB)": 36.91, | |
| "step": 1465, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016214 | |
| }, | |
| { | |
| "epoch": 4.785380710659899, | |
| "grad_norm": 0.12117859136787597, | |
| "learning_rate": 4.895999615346314e-07, | |
| "loss": 0.001637093722820282, | |
| "memory(GiB)": 36.91, | |
| "step": 1470, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016214 | |
| }, | |
| { | |
| "epoch": 4.801624365482233, | |
| "grad_norm": 0.01706819131966345, | |
| "learning_rate": 4.1727485344994486e-07, | |
| "loss": 0.0003483247943222523, | |
| "memory(GiB)": 36.91, | |
| "step": 1475, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016212 | |
| }, | |
| { | |
| "epoch": 4.8178680203045685, | |
| "grad_norm": 0.04859669108953238, | |
| "learning_rate": 3.507048191756401e-07, | |
| "loss": 0.0021356761455535887, | |
| "memory(GiB)": 36.91, | |
| "step": 1480, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016212 | |
| }, | |
| { | |
| "epoch": 4.834111675126904, | |
| "grad_norm": 0.03682429514387162, | |
| "learning_rate": 2.8989758550487245e-07, | |
| "loss": 0.0021858945488929748, | |
| "memory(GiB)": 36.91, | |
| "step": 1485, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016213 | |
| }, | |
| { | |
| "epoch": 4.850355329949238, | |
| "grad_norm": 0.06507640939116277, | |
| "learning_rate": 2.3486021034170857e-07, | |
| "loss": 0.002923069894313812, | |
| "memory(GiB)": 36.91, | |
| "step": 1490, | |
| "token_acc": 0.9977064220183486, | |
| "train_speed(iter/s)": 0.016212 | |
| }, | |
| { | |
| "epoch": 4.866598984771573, | |
| "grad_norm": 0.04259804746440851, | |
| "learning_rate": 1.8559908188195418e-07, | |
| "loss": 0.0019719479605555534, | |
| "memory(GiB)": 36.91, | |
| "step": 1495, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016212 | |
| }, | |
| { | |
| "epoch": 4.882842639593909, | |
| "grad_norm": 0.25393381486977334, | |
| "learning_rate": 1.4211991787164147e-07, | |
| "loss": 0.0011512625962495804, | |
| "memory(GiB)": 36.91, | |
| "step": 1500, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016215 | |
| }, | |
| { | |
| "epoch": 4.899086294416244, | |
| "grad_norm": 0.21720000107148496, | |
| "learning_rate": 1.044277649433989e-07, | |
| "loss": 0.003379678726196289, | |
| "memory(GiB)": 36.91, | |
| "step": 1505, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016213 | |
| }, | |
| { | |
| "epoch": 4.915329949238579, | |
| "grad_norm": 0.6636335728606932, | |
| "learning_rate": 7.252699803065311e-08, | |
| "loss": 0.014554958045482635, | |
| "memory(GiB)": 36.91, | |
| "step": 1510, | |
| "token_acc": 0.9886792452830189, | |
| "train_speed(iter/s)": 0.016211 | |
| }, | |
| { | |
| "epoch": 4.9315736040609135, | |
| "grad_norm": 0.042674818413491626, | |
| "learning_rate": 4.6421319859862864e-08, | |
| "loss": 0.0024311095476150513, | |
| "memory(GiB)": 36.91, | |
| "step": 1515, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016212 | |
| }, | |
| { | |
| "epoch": 4.947817258883249, | |
| "grad_norm": 0.07981897617268605, | |
| "learning_rate": 2.6113760520735108e-08, | |
| "loss": 0.0024462098255753515, | |
| "memory(GiB)": 36.91, | |
| "step": 1520, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.01621 | |
| }, | |
| { | |
| "epoch": 4.964060913705584, | |
| "grad_norm": 0.01695528976036472, | |
| "learning_rate": 1.1606677114500697e-08, | |
| "loss": 0.011407441645860671, | |
| "memory(GiB)": 36.91, | |
| "step": 1525, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.01621 | |
| }, | |
| { | |
| "epoch": 4.980304568527918, | |
| "grad_norm": 0.05383783400729952, | |
| "learning_rate": 2.901753480361036e-09, | |
| "loss": 0.005226198583841324, | |
| "memory(GiB)": 36.91, | |
| "step": 1530, | |
| "token_acc": 0.9956521739130435, | |
| "train_speed(iter/s)": 0.01621 | |
| }, | |
| { | |
| "epoch": 4.996548223350254, | |
| "grad_norm": 0.9774296313594534, | |
| "learning_rate": 0.0, | |
| "loss": 0.003532126545906067, | |
| "memory(GiB)": 36.91, | |
| "step": 1535, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 0.016209 | |
| }, | |
| { | |
| "epoch": 4.996548223350254, | |
| "eval_loss": 0.31882038712501526, | |
| "eval_runtime": 62.2556, | |
| "eval_samples_per_second": 3.18, | |
| "eval_steps_per_second": 0.803, | |
| "eval_token_acc": 0.9397930385700847, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 4.996548223350254, | |
| "eval_loss": 0.31882038712501526, | |
| "eval_runtime": 62.6813, | |
| "eval_samples_per_second": 3.159, | |
| "eval_steps_per_second": 0.798, | |
| "eval_token_acc": 0.9397930385700847, | |
| "step": 1535 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1535, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.2119246482890555e+19, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |