{
"best_metric": 0.13022484,
"best_model_checkpoint": "/workspace/ms-swift/qwen_atlas/v1-20250305-211120/checkpoint-1152",
"epoch": 1.997939038941317,
"eval_steps": 100,
"global_step": 1152,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001735546154680551,
"grad_norm": 4.6760759353637695,
"learning_rate": 1.724137931034483e-06,
"loss": 1.6580421924591064,
"memory(GiB)": 35.37,
"step": 1,
"token_acc": 0.6974789915966386,
"train_speed(iter/s)": 0.008041
},
{
"epoch": 0.008677730773402755,
"grad_norm": 4.790174961090088,
"learning_rate": 8.620689655172414e-06,
"loss": 1.6793524026870728,
"memory(GiB)": 36.38,
"step": 5,
"token_acc": 0.6903924077628493,
"train_speed(iter/s)": 0.008451
},
{
"epoch": 0.01735546154680551,
"grad_norm": 4.196518421173096,
"learning_rate": 1.7241379310344828e-05,
"loss": 1.64276065826416,
"memory(GiB)": 39.49,
"step": 10,
"token_acc": 0.6917943452851051,
"train_speed(iter/s)": 0.008498
},
{
"epoch": 0.026033192320208266,
"grad_norm": 6.951071739196777,
"learning_rate": 2.5862068965517244e-05,
"loss": 1.35954008102417,
"memory(GiB)": 39.49,
"step": 15,
"token_acc": 0.7025507998270644,
"train_speed(iter/s)": 0.00851
},
{
"epoch": 0.03471092309361102,
"grad_norm": 5.639510154724121,
"learning_rate": 3.4482758620689657e-05,
"loss": 0.8004651069641113,
"memory(GiB)": 42.62,
"step": 20,
"token_acc": 0.7704440185273917,
"train_speed(iter/s)": 0.008512
},
{
"epoch": 0.04338865386701377,
"grad_norm": 2.946929693222046,
"learning_rate": 4.3103448275862066e-05,
"loss": 0.5983310222625733,
"memory(GiB)": 43.7,
"step": 25,
"token_acc": 0.8041049513330512,
"train_speed(iter/s)": 0.008515
},
{
"epoch": 0.05206638464041653,
"grad_norm": 2.4821643829345703,
"learning_rate": 5.172413793103449e-05,
"loss": 0.5139754772186279,
"memory(GiB)": 43.7,
"step": 30,
"token_acc": 0.8192055870798778,
"train_speed(iter/s)": 0.00852
},
{
"epoch": 0.060744115413819286,
"grad_norm": 2.31684947013855,
"learning_rate": 6.03448275862069e-05,
"loss": 0.4888479709625244,
"memory(GiB)": 43.7,
"step": 35,
"token_acc": 0.8198602583103959,
"train_speed(iter/s)": 0.008521
},
{
"epoch": 0.06942184618722204,
"grad_norm": 2.18639874458313,
"learning_rate": 6.896551724137931e-05,
"loss": 0.4743186473846436,
"memory(GiB)": 43.7,
"step": 40,
"token_acc": 0.8318414322250639,
"train_speed(iter/s)": 0.008525
},
{
"epoch": 0.07809957696062479,
"grad_norm": 2.380423069000244,
"learning_rate": 7.758620689655173e-05,
"loss": 0.4614367961883545,
"memory(GiB)": 43.7,
"step": 45,
"token_acc": 0.8394912427022518,
"train_speed(iter/s)": 0.008526
},
{
"epoch": 0.08677730773402755,
"grad_norm": 16.561281204223633,
"learning_rate": 8.620689655172413e-05,
"loss": 0.4493127346038818,
"memory(GiB)": 43.7,
"step": 50,
"token_acc": 0.8405128414519619,
"train_speed(iter/s)": 0.008526
},
{
"epoch": 0.09545503850743031,
"grad_norm": 2.035421848297119,
"learning_rate": 9.482758620689656e-05,
"loss": 0.43794918060302734,
"memory(GiB)": 43.7,
"step": 55,
"token_acc": 0.8461954318151101,
"train_speed(iter/s)": 0.008527
},
{
"epoch": 0.10413276928083307,
"grad_norm": 1.2835978269577026,
"learning_rate": 9.999917536092901e-05,
"loss": 0.43846426010131834,
"memory(GiB)": 43.7,
"step": 60,
"token_acc": 0.8478704525288376,
"train_speed(iter/s)": 0.008527
},
{
"epoch": 0.11281050005423582,
"grad_norm": 6.1280741691589355,
"learning_rate": 9.99898984837663e-05,
"loss": 0.4272454261779785,
"memory(GiB)": 43.7,
"step": 65,
"token_acc": 0.8532792155601994,
"train_speed(iter/s)": 0.008528
},
{
"epoch": 0.12148823082763857,
"grad_norm": 2.31329607963562,
"learning_rate": 9.997031584946868e-05,
"loss": 0.4156841278076172,
"memory(GiB)": 43.7,
"step": 70,
"token_acc": 0.8506665595888211,
"train_speed(iter/s)": 0.008529
},
{
"epoch": 0.13016596160104132,
"grad_norm": 4.000900745391846,
"learning_rate": 9.994043149512924e-05,
"loss": 0.40361742973327636,
"memory(GiB)": 43.7,
"step": 75,
"token_acc": 0.8494347592032796,
"train_speed(iter/s)": 0.00853
},
{
"epoch": 0.13884369237444408,
"grad_norm": 4.327010154724121,
"learning_rate": 9.99002515816106e-05,
"loss": 0.4148906707763672,
"memory(GiB)": 43.7,
"step": 80,
"token_acc": 0.8513289724269272,
"train_speed(iter/s)": 0.008532
},
{
"epoch": 0.14752142314784683,
"grad_norm": 2.917447805404663,
"learning_rate": 9.984978439227486e-05,
"loss": 0.3819650411605835,
"memory(GiB)": 43.7,
"step": 85,
"token_acc": 0.865501155338536,
"train_speed(iter/s)": 0.008532
},
{
"epoch": 0.15619915392124958,
"grad_norm": 2.645838499069214,
"learning_rate": 9.978904033127593e-05,
"loss": 0.3954829454421997,
"memory(GiB)": 43.7,
"step": 90,
"token_acc": 0.8565228073048891,
"train_speed(iter/s)": 0.008532
},
{
"epoch": 0.16487688469465234,
"grad_norm": 1.4669901132583618,
"learning_rate": 9.971803192141458e-05,
"loss": 0.36885552406311034,
"memory(GiB)": 43.7,
"step": 95,
"token_acc": 0.8682406076780949,
"train_speed(iter/s)": 0.008534
},
{
"epoch": 0.1735546154680551,
"grad_norm": 2.0109212398529053,
"learning_rate": 9.963677380155683e-05,
"loss": 0.3746215581893921,
"memory(GiB)": 43.7,
"step": 100,
"token_acc": 0.8680254347917632,
"train_speed(iter/s)": 0.008533
},
{
"epoch": 0.1735546154680551,
"eval_loss": 0.385597825050354,
"eval_runtime": 222.5947,
"eval_samples_per_second": 4.182,
"eval_steps_per_second": 0.84,
"eval_token_acc": 0.8657494207806096,
"step": 100
},
{
"epoch": 0.18223234624145787,
"grad_norm": 6.436813831329346,
"learning_rate": 9.954528272361607e-05,
"loss": 0.36356263160705565,
"memory(GiB)": 43.7,
"step": 105,
"token_acc": 0.8616467952308554,
"train_speed(iter/s)": 0.00838
},
{
"epoch": 0.19091007701486062,
"grad_norm": 4.178765773773193,
"learning_rate": 9.944357754909945e-05,
"loss": 0.3653342485427856,
"memory(GiB)": 43.7,
"step": 110,
"token_acc": 0.8718457996659594,
"train_speed(iter/s)": 0.008389
},
{
"epoch": 0.19958780778826338,
"grad_norm": 2.2793004512786865,
"learning_rate": 9.933167924521956e-05,
"loss": 0.36059207916259767,
"memory(GiB)": 43.7,
"step": 115,
"token_acc": 0.8739488320355951,
"train_speed(iter/s)": 0.008395
},
{
"epoch": 0.20826553856166613,
"grad_norm": 2.0142924785614014,
"learning_rate": 9.920961088057184e-05,
"loss": 0.3489166975021362,
"memory(GiB)": 43.7,
"step": 120,
"token_acc": 0.8721164521567498,
"train_speed(iter/s)": 0.0084
},
{
"epoch": 0.21694326933506888,
"grad_norm": 1.6888527870178223,
"learning_rate": 9.907739762037879e-05,
"loss": 0.3451664447784424,
"memory(GiB)": 43.7,
"step": 125,
"token_acc": 0.8695542472666106,
"train_speed(iter/s)": 0.008406
},
{
"epoch": 0.22562100010847164,
"grad_norm": 1.9126875400543213,
"learning_rate": 9.893506672130211e-05,
"loss": 0.336126446723938,
"memory(GiB)": 43.7,
"step": 130,
"token_acc": 0.8788651577940708,
"train_speed(iter/s)": 0.008411
},
{
"epoch": 0.2342987308818744,
"grad_norm": 1.2295624017715454,
"learning_rate": 9.878264752582341e-05,
"loss": 0.3329684495925903,
"memory(GiB)": 43.7,
"step": 135,
"token_acc": 0.8763455827765405,
"train_speed(iter/s)": 0.008415
},
{
"epoch": 0.24297646165527714,
"grad_norm": 2.3171041011810303,
"learning_rate": 9.86201714561952e-05,
"loss": 0.3186241865158081,
"memory(GiB)": 43.7,
"step": 140,
"token_acc": 0.8794609212217936,
"train_speed(iter/s)": 0.00842
},
{
"epoch": 0.2516541924286799,
"grad_norm": 1.815627932548523,
"learning_rate": 9.844767200796284e-05,
"loss": 0.318283748626709,
"memory(GiB)": 43.7,
"step": 145,
"token_acc": 0.891897272546287,
"train_speed(iter/s)": 0.008423
},
{
"epoch": 0.26033192320208265,
"grad_norm": 2.60662841796875,
"learning_rate": 9.826518474305932e-05,
"loss": 0.3101086378097534,
"memory(GiB)": 43.7,
"step": 150,
"token_acc": 0.8821918980194208,
"train_speed(iter/s)": 0.008427
},
{
"epoch": 0.26900965397548543,
"grad_norm": 2.4237709045410156,
"learning_rate": 9.807274728247389e-05,
"loss": 0.31526162624359133,
"memory(GiB)": 43.7,
"step": 155,
"token_acc": 0.8851850323858245,
"train_speed(iter/s)": 0.008429
},
{
"epoch": 0.27768738474888816,
"grad_norm": 2.208160638809204,
"learning_rate": 9.787039929849617e-05,
"loss": 0.30453202724456785,
"memory(GiB)": 43.7,
"step": 160,
"token_acc": 0.8869576413029484,
"train_speed(iter/s)": 0.008433
},
{
"epoch": 0.28636511552229094,
"grad_norm": 2.1958048343658447,
"learning_rate": 9.765818250653756e-05,
"loss": 0.3076713800430298,
"memory(GiB)": 43.7,
"step": 165,
"token_acc": 0.889782004881744,
"train_speed(iter/s)": 0.008437
},
{
"epoch": 0.29504284629569366,
"grad_norm": 3.110142707824707,
"learning_rate": 9.743614065653119e-05,
"loss": 0.29779419898986814,
"memory(GiB)": 43.7,
"step": 170,
"token_acc": 0.8947498864058822,
"train_speed(iter/s)": 0.00844
},
{
"epoch": 0.30372057706909644,
"grad_norm": 2.8065741062164307,
"learning_rate": 9.720431952391271e-05,
"loss": 0.29869651794433594,
"memory(GiB)": 43.7,
"step": 175,
"token_acc": 0.886225766767064,
"train_speed(iter/s)": 0.008442
},
{
"epoch": 0.31239830784249917,
"grad_norm": 3.4282145500183105,
"learning_rate": 9.696276690018329e-05,
"loss": 0.2898601293563843,
"memory(GiB)": 43.7,
"step": 180,
"token_acc": 0.896655905968049,
"train_speed(iter/s)": 0.008445
},
{
"epoch": 0.32107603861590195,
"grad_norm": 1.7516893148422241,
"learning_rate": 9.671153258305709e-05,
"loss": 0.2760122060775757,
"memory(GiB)": 43.7,
"step": 185,
"token_acc": 0.8976162563501368,
"train_speed(iter/s)": 0.008447
},
{
"epoch": 0.3297537693893047,
"grad_norm": 3.0154619216918945,
"learning_rate": 9.64506683661951e-05,
"loss": 0.29156625270843506,
"memory(GiB)": 43.7,
"step": 190,
"token_acc": 0.8932791728212703,
"train_speed(iter/s)": 0.00845
},
{
"epoch": 0.33843150016270745,
"grad_norm": 3.490983724594116,
"learning_rate": 9.61802280285276e-05,
"loss": 0.2764655590057373,
"memory(GiB)": 43.7,
"step": 195,
"token_acc": 0.895898926293695,
"train_speed(iter/s)": 0.008453
},
{
"epoch": 0.3471092309361102,
"grad_norm": 13.885275840759277,
"learning_rate": 9.59002673231672e-05,
"loss": 0.2801302671432495,
"memory(GiB)": 43.7,
"step": 200,
"token_acc": 0.9076711267908618,
"train_speed(iter/s)": 0.008455
},
{
"epoch": 0.3471092309361102,
"eval_loss": 0.28404700756073,
"eval_runtime": 220.1539,
"eval_samples_per_second": 4.229,
"eval_steps_per_second": 0.849,
"eval_token_acc": 0.9015362680449118,
"step": 200
},
{
"epoch": 0.35578696170951296,
"grad_norm": 1.6657084226608276,
"learning_rate": 9.561084396591494e-05,
"loss": 0.2741654396057129,
"memory(GiB)": 43.7,
"step": 205,
"token_acc": 0.8974332690967239,
"train_speed(iter/s)": 0.00838
},
{
"epoch": 0.36446469248291574,
"grad_norm": 3.5704126358032227,
"learning_rate": 9.53120176233619e-05,
"loss": 0.27562870979309084,
"memory(GiB)": 43.7,
"step": 210,
"token_acc": 0.8987169683065681,
"train_speed(iter/s)": 0.008384
},
{
"epoch": 0.37314242325631847,
"grad_norm": 1.3969208002090454,
"learning_rate": 9.500384990058841e-05,
"loss": 0.27314207553863523,
"memory(GiB)": 43.7,
"step": 215,
"token_acc": 0.8985029696525914,
"train_speed(iter/s)": 0.008388
},
{
"epoch": 0.38182015402972125,
"grad_norm": 2.852271795272827,
"learning_rate": 9.468640432846378e-05,
"loss": 0.2624432325363159,
"memory(GiB)": 43.7,
"step": 220,
"token_acc": 0.9034218361270852,
"train_speed(iter/s)": 0.008391
},
{
"epoch": 0.390497884803124,
"grad_norm": 2.3971970081329346,
"learning_rate": 9.4359746350549e-05,
"loss": 0.27120444774627683,
"memory(GiB)": 43.7,
"step": 225,
"token_acc": 0.9019728189390618,
"train_speed(iter/s)": 0.008395
},
{
"epoch": 0.39917561557652675,
"grad_norm": 2.9562697410583496,
"learning_rate": 9.402394330960506e-05,
"loss": 0.2618232727050781,
"memory(GiB)": 43.7,
"step": 230,
"token_acc": 0.9105880494451246,
"train_speed(iter/s)": 0.008397
},
{
"epoch": 0.4078533463499295,
"grad_norm": 2.3861279487609863,
"learning_rate": 9.367906443370984e-05,
"loss": 0.2571915864944458,
"memory(GiB)": 43.7,
"step": 235,
"token_acc": 0.9074556213017752,
"train_speed(iter/s)": 0.0084
},
{
"epoch": 0.41653107712333226,
"grad_norm": 2.1653852462768555,
"learning_rate": 9.332518082198624e-05,
"loss": 0.26276435852050783,
"memory(GiB)": 43.7,
"step": 240,
"token_acc": 0.9037630249806616,
"train_speed(iter/s)": 0.008403
},
{
"epoch": 0.425208807896735,
"grad_norm": 6.575355529785156,
"learning_rate": 9.296236542994463e-05,
"loss": 0.24440584182739258,
"memory(GiB)": 43.7,
"step": 245,
"token_acc": 0.9097762166002289,
"train_speed(iter/s)": 0.008406
},
{
"epoch": 0.43388653867013777,
"grad_norm": 6.715099811553955,
"learning_rate": 9.259069305444252e-05,
"loss": 0.2519469499588013,
"memory(GiB)": 43.7,
"step": 250,
"token_acc": 0.9191672993167379,
"train_speed(iter/s)": 0.008408
},
{
"epoch": 0.4425642694435405,
"grad_norm": 2.3563008308410645,
"learning_rate": 9.221024031826476e-05,
"loss": 0.2473703384399414,
"memory(GiB)": 43.7,
"step": 255,
"token_acc": 0.9128261606235174,
"train_speed(iter/s)": 0.008411
},
{
"epoch": 0.4512420002169433,
"grad_norm": 3.382643461227417,
"learning_rate": 9.18210856543272e-05,
"loss": 0.236089825630188,
"memory(GiB)": 43.7,
"step": 260,
"token_acc": 0.9163037833086972,
"train_speed(iter/s)": 0.008413
},
{
"epoch": 0.459919730990346,
"grad_norm": 2.673158884048462,
"learning_rate": 9.142330928950718e-05,
"loss": 0.24100546836853026,
"memory(GiB)": 43.7,
"step": 265,
"token_acc": 0.9135581499410161,
"train_speed(iter/s)": 0.008416
},
{
"epoch": 0.4685974617637488,
"grad_norm": 11.933094024658203,
"learning_rate": 9.101699322810424e-05,
"loss": 0.23190362453460694,
"memory(GiB)": 43.7,
"step": 270,
"token_acc": 0.9265802143189936,
"train_speed(iter/s)": 0.008418
},
{
"epoch": 0.47727519253715156,
"grad_norm": 22.01103973388672,
"learning_rate": 9.060222123493441e-05,
"loss": 0.22824497222900392,
"memory(GiB)": 43.7,
"step": 275,
"token_acc": 0.9234252124870919,
"train_speed(iter/s)": 0.00842
},
{
"epoch": 0.4859529233105543,
"grad_norm": 2.3091957569122314,
"learning_rate": 9.017907881806146e-05,
"loss": 0.22891411781311036,
"memory(GiB)": 43.7,
"step": 280,
"token_acc": 0.9208098411071246,
"train_speed(iter/s)": 0.008422
},
{
"epoch": 0.49463065408395707,
"grad_norm": 1.878693699836731,
"learning_rate": 8.974765321116886e-05,
"loss": 0.22780225276947022,
"memory(GiB)": 43.7,
"step": 285,
"token_acc": 0.92338361626043,
"train_speed(iter/s)": 0.008424
},
{
"epoch": 0.5033083848573598,
"grad_norm": 4.434957981109619,
"learning_rate": 8.930803335557602e-05,
"loss": 0.22615401744842528,
"memory(GiB)": 43.7,
"step": 290,
"token_acc": 0.9223257075684963,
"train_speed(iter/s)": 0.008426
},
{
"epoch": 0.5119861156307626,
"grad_norm": 3.4905004501342773,
"learning_rate": 8.886030988190232e-05,
"loss": 0.22355277538299562,
"memory(GiB)": 43.7,
"step": 295,
"token_acc": 0.9136541664790888,
"train_speed(iter/s)": 0.008428
},
{
"epoch": 0.5206638464041653,
"grad_norm": 2.17803692817688,
"learning_rate": 8.840457509138307e-05,
"loss": 0.2240373134613037,
"memory(GiB)": 43.7,
"step": 300,
"token_acc": 0.9161153744376819,
"train_speed(iter/s)": 0.00843
},
{
"epoch": 0.5206638464041653,
"eval_loss": 0.22916455566883087,
"eval_runtime": 220.0371,
"eval_samples_per_second": 4.231,
"eval_steps_per_second": 0.85,
"eval_token_acc": 0.9222598467296382,
"step": 300
},
{
"epoch": 0.529341577177568,
"grad_norm": 2.641390800476074,
"learning_rate": 8.794092293684098e-05,
"loss": 0.22017295360565187,
"memory(GiB)": 43.7,
"step": 305,
"token_acc": 0.9203932638956488,
"train_speed(iter/s)": 0.00838
},
{
"epoch": 0.5380193079509709,
"grad_norm": 2.450665235519409,
"learning_rate": 8.746944900331711e-05,
"loss": 0.2305682897567749,
"memory(GiB)": 43.7,
"step": 310,
"token_acc": 0.914595610214365,
"train_speed(iter/s)": 0.008382
},
{
"epoch": 0.5466970387243736,
"grad_norm": 1.7363907098770142,
"learning_rate": 8.699025048836541e-05,
"loss": 0.21577081680297852,
"memory(GiB)": 43.7,
"step": 315,
"token_acc": 0.9174962038130589,
"train_speed(iter/s)": 0.008384
},
{
"epoch": 0.5553747694977763,
"grad_norm": 4.199025630950928,
"learning_rate": 8.650342618201475e-05,
"loss": 0.22321650981903077,
"memory(GiB)": 43.7,
"step": 320,
"token_acc": 0.9186493336603712,
"train_speed(iter/s)": 0.008386
},
{
"epoch": 0.564052500271179,
"grad_norm": 3.0483787059783936,
"learning_rate": 8.60090764464027e-05,
"loss": 0.21764111518859863,
"memory(GiB)": 43.7,
"step": 325,
"token_acc": 0.9221584676398616,
"train_speed(iter/s)": 0.008389
},
{
"epoch": 0.5727302310445819,
"grad_norm": 1.9495664834976196,
"learning_rate": 8.550730319508516e-05,
"loss": 0.21329159736633302,
"memory(GiB)": 43.7,
"step": 330,
"token_acc": 0.9290431767522497,
"train_speed(iter/s)": 0.008391
},
{
"epoch": 0.5814079618179846,
"grad_norm": 1.935138463973999,
"learning_rate": 8.49982098720263e-05,
"loss": 0.21635849475860597,
"memory(GiB)": 43.7,
"step": 335,
"token_acc": 0.9326674848702865,
"train_speed(iter/s)": 0.008393
},
{
"epoch": 0.5900856925913873,
"grad_norm": 2.897500991821289,
"learning_rate": 8.448190143027269e-05,
"loss": 0.2165597438812256,
"memory(GiB)": 43.7,
"step": 340,
"token_acc": 0.9246217699825947,
"train_speed(iter/s)": 0.008395
},
{
"epoch": 0.5987634233647902,
"grad_norm": 3.2862493991851807,
"learning_rate": 8.395848431031672e-05,
"loss": 0.2119969129562378,
"memory(GiB)": 43.7,
"step": 345,
"token_acc": 0.927697031729785,
"train_speed(iter/s)": 0.008397
},
{
"epoch": 0.6074411541381929,
"grad_norm": 2.485685110092163,
"learning_rate": 8.342806641815304e-05,
"loss": 0.19694406986236573,
"memory(GiB)": 43.7,
"step": 350,
"token_acc": 0.9306310071040534,
"train_speed(iter/s)": 0.008399
},
{
"epoch": 0.6161188849115956,
"grad_norm": 1.790946364402771,
"learning_rate": 8.289075710303305e-05,
"loss": 0.20862700939178466,
"memory(GiB)": 43.7,
"step": 355,
"token_acc": 0.9261981517300666,
"train_speed(iter/s)": 0.008401
},
{
"epoch": 0.6247966156849983,
"grad_norm": 9.109935760498047,
"learning_rate": 8.234666713492178e-05,
"loss": 0.2021815538406372,
"memory(GiB)": 43.7,
"step": 360,
"token_acc": 0.9324708811238552,
"train_speed(iter/s)": 0.008403
},
{
"epoch": 0.6334743464584012,
"grad_norm": 8.062849998474121,
"learning_rate": 8.179590868166196e-05,
"loss": 0.20523991584777831,
"memory(GiB)": 43.7,
"step": 365,
"token_acc": 0.9240613952655837,
"train_speed(iter/s)": 0.008404
},
{
"epoch": 0.6421520772318039,
"grad_norm": 2.0501291751861572,
"learning_rate": 8.123859528584985e-05,
"loss": 0.1977332353591919,
"memory(GiB)": 43.7,
"step": 370,
"token_acc": 0.9301781085375032,
"train_speed(iter/s)": 0.008406
},
{
"epoch": 0.6508298080052066,
"grad_norm": 6.636540412902832,
"learning_rate": 8.067484184142759e-05,
"loss": 0.19646908044815065,
"memory(GiB)": 43.7,
"step": 375,
"token_acc": 0.9374709076803723,
"train_speed(iter/s)": 0.008407
},
{
"epoch": 0.6595075387786093,
"grad_norm": 4.470943927764893,
"learning_rate": 8.010476456999712e-05,
"loss": 0.1992442488670349,
"memory(GiB)": 43.7,
"step": 380,
"token_acc": 0.9223243653537995,
"train_speed(iter/s)": 0.008408
},
{
"epoch": 0.6681852695520122,
"grad_norm": 3.5556273460388184,
"learning_rate": 7.952848099686025e-05,
"loss": 0.18782631158828736,
"memory(GiB)": 43.7,
"step": 385,
"token_acc": 0.9409830807473338,
"train_speed(iter/s)": 0.00841
},
{
"epoch": 0.6768630003254149,
"grad_norm": 3.291494846343994,
"learning_rate": 7.894610992679008e-05,
"loss": 0.18964465856552123,
"memory(GiB)": 43.7,
"step": 390,
"token_acc": 0.933142197268347,
"train_speed(iter/s)": 0.008411
},
{
"epoch": 0.6855407310988176,
"grad_norm": 2.276304006576538,
"learning_rate": 7.835777141953853e-05,
"loss": 0.1859665870666504,
"memory(GiB)": 43.7,
"step": 395,
"token_acc": 0.9327179666652085,
"train_speed(iter/s)": 0.008413
},
{
"epoch": 0.6942184618722204,
"grad_norm": 9.960855484008789,
"learning_rate": 7.776358676508522e-05,
"loss": 0.1905187964439392,
"memory(GiB)": 43.7,
"step": 400,
"token_acc": 0.93853976001627,
"train_speed(iter/s)": 0.008414
},
{
"epoch": 0.6942184618722204,
"eval_loss": 0.19546635448932648,
"eval_runtime": 220.2237,
"eval_samples_per_second": 4.228,
"eval_steps_per_second": 0.849,
"eval_token_acc": 0.933366601318838,
"step": 400
},
{
"epoch": 0.7028961926456232,
"grad_norm": 2.7447469234466553,
"learning_rate": 7.716367845863273e-05,
"loss": 0.17779254913330078,
"memory(GiB)": 43.7,
"step": 405,
"token_acc": 0.9323472308400552,
"train_speed(iter/s)": 0.008377
},
{
"epoch": 0.7115739234190259,
"grad_norm": 2.3675718307495117,
"learning_rate": 7.655817017535339e-05,
"loss": 0.19050852060317994,
"memory(GiB)": 43.7,
"step": 410,
"token_acc": 0.9310586393044051,
"train_speed(iter/s)": 0.008379
},
{
"epoch": 0.7202516541924286,
"grad_norm": 3.075871706008911,
"learning_rate": 7.594718674489281e-05,
"loss": 0.19913625717163086,
"memory(GiB)": 43.7,
"step": 415,
"token_acc": 0.9342206089937487,
"train_speed(iter/s)": 0.008381
},
{
"epoch": 0.7289293849658315,
"grad_norm": 1.9185936450958252,
"learning_rate": 7.533085412563534e-05,
"loss": 0.19407714605331422,
"memory(GiB)": 43.7,
"step": 420,
"token_acc": 0.9339040074954218,
"train_speed(iter/s)": 0.008382
},
{
"epoch": 0.7376071157392342,
"grad_norm": 3.576767683029175,
"learning_rate": 7.470929937873695e-05,
"loss": 0.19026898145675658,
"memory(GiB)": 43.7,
"step": 425,
"token_acc": 0.9307452283502857,
"train_speed(iter/s)": 0.008384
},
{
"epoch": 0.7462848465126369,
"grad_norm": 18.04636573791504,
"learning_rate": 7.408265064193071e-05,
"loss": 0.1796002984046936,
"memory(GiB)": 43.7,
"step": 430,
"token_acc": 0.9397331762131224,
"train_speed(iter/s)": 0.008386
},
{
"epoch": 0.7549625772860397,
"grad_norm": 2.2993645668029785,
"learning_rate": 7.345103710311031e-05,
"loss": 0.18258541822433472,
"memory(GiB)": 43.7,
"step": 435,
"token_acc": 0.9341671247938428,
"train_speed(iter/s)": 0.008387
},
{
"epoch": 0.7636403080594425,
"grad_norm": 2.52563214302063,
"learning_rate": 7.281458897369707e-05,
"loss": 0.18408983945846558,
"memory(GiB)": 43.7,
"step": 440,
"token_acc": 0.9345260397152492,
"train_speed(iter/s)": 0.008388
},
{
"epoch": 0.7723180388328452,
"grad_norm": 2.247847318649292,
"learning_rate": 7.217343746179601e-05,
"loss": 0.1789139986038208,
"memory(GiB)": 43.7,
"step": 445,
"token_acc": 0.9334740027072219,
"train_speed(iter/s)": 0.00839
},
{
"epoch": 0.780995769606248,
"grad_norm": 2.011239767074585,
"learning_rate": 7.152771474514642e-05,
"loss": 0.183357834815979,
"memory(GiB)": 43.7,
"step": 450,
"token_acc": 0.9391916113998925,
"train_speed(iter/s)": 0.008392
},
{
"epoch": 0.7896735003796507,
"grad_norm": 1.3411664962768555,
"learning_rate": 7.087755394387251e-05,
"loss": 0.1795508623123169,
"memory(GiB)": 43.7,
"step": 455,
"token_acc": 0.9274950486523723,
"train_speed(iter/s)": 0.008394
},
{
"epoch": 0.7983512311530535,
"grad_norm": 5.725118637084961,
"learning_rate": 7.022308909303974e-05,
"loss": 0.18583301305770875,
"memory(GiB)": 43.7,
"step": 460,
"token_acc": 0.9388505371512602,
"train_speed(iter/s)": 0.008395
},
{
"epoch": 0.8070289619264562,
"grad_norm": 5.536004066467285,
"learning_rate": 6.956445511502264e-05,
"loss": 0.17879717350006102,
"memory(GiB)": 43.7,
"step": 465,
"token_acc": 0.9410176125244618,
"train_speed(iter/s)": 0.008397
},
{
"epoch": 0.815706692699859,
"grad_norm": 2.3825104236602783,
"learning_rate": 6.890178779168963e-05,
"loss": 0.17840908765792846,
"memory(GiB)": 43.7,
"step": 470,
"token_acc": 0.9386419190454489,
"train_speed(iter/s)": 0.008398
},
{
"epoch": 0.8243844234732618,
"grad_norm": 2.428227424621582,
"learning_rate": 6.823522373641066e-05,
"loss": 0.15655564069747924,
"memory(GiB)": 43.7,
"step": 475,
"token_acc": 0.9488992717935766,
"train_speed(iter/s)": 0.008399
},
{
"epoch": 0.8330621542466645,
"grad_norm": 3.9280638694763184,
"learning_rate": 6.756490036589346e-05,
"loss": 0.17563689947128297,
"memory(GiB)": 43.7,
"step": 480,
"token_acc": 0.9412151607333717,
"train_speed(iter/s)": 0.008401
},
{
"epoch": 0.8417398850200672,
"grad_norm": 3.4658777713775635,
"learning_rate": 6.68909558718541e-05,
"loss": 0.1809309244155884,
"memory(GiB)": 43.7,
"step": 485,
"token_acc": 0.9326161790017212,
"train_speed(iter/s)": 0.008402
},
{
"epoch": 0.85041761579347,
"grad_norm": 3.0663387775421143,
"learning_rate": 6.621352919252788e-05,
"loss": 0.16797908544540405,
"memory(GiB)": 43.7,
"step": 490,
"token_acc": 0.9391401202940521,
"train_speed(iter/s)": 0.008403
},
{
"epoch": 0.8590953465668728,
"grad_norm": 2.3330888748168945,
"learning_rate": 6.553275998402625e-05,
"loss": 0.16253708600997924,
"memory(GiB)": 43.7,
"step": 495,
"token_acc": 0.9451398355847482,
"train_speed(iter/s)": 0.008405
},
{
"epoch": 0.8677730773402755,
"grad_norm": 5.0757317543029785,
"learning_rate": 6.484878859154576e-05,
"loss": 0.16285682916641236,
"memory(GiB)": 43.7,
"step": 500,
"token_acc": 0.9440452863052471,
"train_speed(iter/s)": 0.008406
},
{
"epoch": 0.8677730773402755,
"eval_loss": 0.1776481717824936,
"eval_runtime": 220.271,
"eval_samples_per_second": 4.227,
"eval_steps_per_second": 0.849,
"eval_token_acc": 0.9389627517376582,
"step": 500
},
{
"epoch": 0.8764508081136783,
"grad_norm": 35.683311462402344,
"learning_rate": 6.416175602043489e-05,
"loss": 0.16943529844284058,
"memory(GiB)": 43.7,
"step": 505,
"token_acc": 0.9365593665522729,
"train_speed(iter/s)": 0.008377
},
{
"epoch": 0.885128538887081,
"grad_norm": 5.939138889312744,
"learning_rate": 6.347180390712497e-05,
"loss": 0.17387474775314332,
"memory(GiB)": 43.7,
"step": 510,
"token_acc": 0.9383922829581993,
"train_speed(iter/s)": 0.008378
},
{
"epoch": 0.8938062696604838,
"grad_norm": 2.826707601547241,
"learning_rate": 6.277907448993072e-05,
"loss": 0.16873899698257447,
"memory(GiB)": 43.7,
"step": 515,
"token_acc": 0.9365477021643599,
"train_speed(iter/s)": 0.00838
},
{
"epoch": 0.9024840004338865,
"grad_norm": 1.4783248901367188,
"learning_rate": 6.208371057972694e-05,
"loss": 0.16192502975463868,
"memory(GiB)": 43.7,
"step": 520,
"token_acc": 0.9410790980672871,
"train_speed(iter/s)": 0.008381
},
{
"epoch": 0.9111617312072893,
"grad_norm": 4.642916202545166,
"learning_rate": 6.13858555305071e-05,
"loss": 0.15619339942932128,
"memory(GiB)": 43.7,
"step": 525,
"token_acc": 0.9391466731620184,
"train_speed(iter/s)": 0.008383
},
{
"epoch": 0.919839461980692,
"grad_norm": 3.962188243865967,
"learning_rate": 6.068565320982982e-05,
"loss": 0.1607579231262207,
"memory(GiB)": 43.7,
"step": 530,
"token_acc": 0.9430901476746749,
"train_speed(iter/s)": 0.008384
},
{
"epoch": 0.9285171927540948,
"grad_norm": 4.833482265472412,
"learning_rate": 5.998324796915973e-05,
"loss": 0.15943752527236937,
"memory(GiB)": 43.7,
"step": 535,
"token_acc": 0.9461845102505695,
"train_speed(iter/s)": 0.008386
},
{
"epoch": 0.9371949235274976,
"grad_norm": 3.3520522117614746,
"learning_rate": 5.9278784614108375e-05,
"loss": 0.1695178270339966,
"memory(GiB)": 43.7,
"step": 540,
"token_acc": 0.9380854742089816,
"train_speed(iter/s)": 0.008387
},
{
"epoch": 0.9458726543009003,
"grad_norm": 5.150724411010742,
"learning_rate": 5.857240837458155e-05,
"loss": 0.16256927251815795,
"memory(GiB)": 43.7,
"step": 545,
"token_acc": 0.9492572786690434,
"train_speed(iter/s)": 0.008389
},
{
"epoch": 0.9545503850743031,
"grad_norm": 2.933917284011841,
"learning_rate": 5.7864264874839144e-05,
"loss": 0.15667448043823243,
"memory(GiB)": 43.7,
"step": 550,
"token_acc": 0.947681539807524,
"train_speed(iter/s)": 0.00839
},
{
"epoch": 0.9632281158477058,
"grad_norm": 2.804276704788208,
"learning_rate": 5.715450010347384e-05,
"loss": 0.15544140338897705,
"memory(GiB)": 43.7,
"step": 555,
"token_acc": 0.9417873014476174,
"train_speed(iter/s)": 0.008391
},
{
"epoch": 0.9719058466211086,
"grad_norm": 3.734673500061035,
"learning_rate": 5.644326038331439e-05,
"loss": 0.1483505606651306,
"memory(GiB)": 43.7,
"step": 560,
"token_acc": 0.9512087329296858,
"train_speed(iter/s)": 0.008392
},
{
"epoch": 0.9805835773945113,
"grad_norm": 2.5557937622070312,
"learning_rate": 5.5730692341260294e-05,
"loss": 0.1612637758255005,
"memory(GiB)": 43.7,
"step": 565,
"token_acc": 0.9437612864390748,
"train_speed(iter/s)": 0.008394
},
{
"epoch": 0.9892613081679141,
"grad_norm": 4.100978851318359,
"learning_rate": 5.501694287805361e-05,
"loss": 0.16001300811767577,
"memory(GiB)": 43.7,
"step": 570,
"token_acc": 0.9466726417208156,
"train_speed(iter/s)": 0.008395
},
{
"epoch": 0.9979390389413169,
"grad_norm": 2.458726167678833,
"learning_rate": 5.430215913799441e-05,
"loss": 0.1615024447441101,
"memory(GiB)": 43.7,
"step": 575,
"token_acc": 0.9395579236564958,
"train_speed(iter/s)": 0.008396
},
{
"epoch": 1.0052066384640417,
"grad_norm": 2.017334461212158,
"learning_rate": 5.358648847860599e-05,
"loss": 0.1611289620399475,
"memory(GiB)": 43.7,
"step": 580,
"token_acc": 0.9456555752581621,
"train_speed(iter/s)": 0.008408
},
{
"epoch": 1.0138843692374444,
"grad_norm": 2.5282626152038574,
"learning_rate": 5.287007844025604e-05,
"loss": 0.15156197547912598,
"memory(GiB)": 43.7,
"step": 585,
"token_acc": 0.9409188629667967,
"train_speed(iter/s)": 0.008409
},
{
"epoch": 1.0225621000108471,
"grad_norm": 2.413511037826538,
"learning_rate": 5.215307671574027e-05,
"loss": 0.1491085410118103,
"memory(GiB)": 43.7,
"step": 590,
"token_acc": 0.948726757418302,
"train_speed(iter/s)": 0.00841
},
{
"epoch": 1.0312398307842499,
"grad_norm": 2.100595235824585,
"learning_rate": 5.1435631119834526e-05,
"loss": 0.1547134280204773,
"memory(GiB)": 43.7,
"step": 595,
"token_acc": 0.9431145431145431,
"train_speed(iter/s)": 0.008411
},
{
"epoch": 1.0399175615576526,
"grad_norm": 2.0920307636260986,
"learning_rate": 5.071788955882171e-05,
"loss": 0.15303828716278076,
"memory(GiB)": 43.7,
"step": 600,
"token_acc": 0.9547254329480462,
"train_speed(iter/s)": 0.008412
},
{
"epoch": 1.0399175615576526,
"eval_loss": 0.15999968349933624,
"eval_runtime": 220.7406,
"eval_samples_per_second": 4.218,
"eval_steps_per_second": 0.847,
"eval_token_acc": 0.9457921939048298,
"step": 600
},
{
"epoch": 1.0485952923310555,
"grad_norm": 7.501833438873291,
"learning_rate": 5e-05,
"loss": 0.14910356998443602,
"memory(GiB)": 43.7,
"step": 605,
"token_acc": 0.9452935468219311,
"train_speed(iter/s)": 0.008387
},
{
"epoch": 1.0572730231044583,
"grad_norm": 3.8511178493499756,
"learning_rate": 4.92821104411783e-05,
"loss": 0.1561565399169922,
"memory(GiB)": 43.7,
"step": 610,
"token_acc": 0.9424419619479839,
"train_speed(iter/s)": 0.008388
},
{
"epoch": 1.065950753877861,
"grad_norm": 4.334895610809326,
"learning_rate": 4.856436888016549e-05,
"loss": 0.15020160675048827,
"memory(GiB)": 43.7,
"step": 615,
"token_acc": 0.9489557386472505,
"train_speed(iter/s)": 0.008389
},
{
"epoch": 1.0746284846512637,
"grad_norm": 6.407334804534912,
"learning_rate": 4.784692328425974e-05,
"loss": 0.15203957557678222,
"memory(GiB)": 43.7,
"step": 620,
"token_acc": 0.9531370872521477,
"train_speed(iter/s)": 0.00839
},
{
"epoch": 1.0833062154246664,
"grad_norm": 3.85739803314209,
"learning_rate": 4.712992155974397e-05,
"loss": 0.14752843379974365,
"memory(GiB)": 43.7,
"step": 625,
"token_acc": 0.9495189355168885,
"train_speed(iter/s)": 0.008391
},
{
"epoch": 1.0919839461980692,
"grad_norm": 59.63698959350586,
"learning_rate": 4.6413511521394026e-05,
"loss": 0.14950259923934936,
"memory(GiB)": 43.7,
"step": 630,
"token_acc": 0.9457502223351544,
"train_speed(iter/s)": 0.008392
},
{
"epoch": 1.1006616769714719,
"grad_norm": 2.6143155097961426,
"learning_rate": 4.56978408620056e-05,
"loss": 0.14179157018661498,
"memory(GiB)": 43.7,
"step": 635,
"token_acc": 0.9504487580880818,
"train_speed(iter/s)": 0.008393
},
{
"epoch": 1.1093394077448746,
"grad_norm": 7.679717540740967,
"learning_rate": 4.4983057121946414e-05,
"loss": 0.13978817462921142,
"memory(GiB)": 43.7,
"step": 640,
"token_acc": 0.9578175752787288,
"train_speed(iter/s)": 0.008394
},
{
"epoch": 1.1180171385182776,
"grad_norm": 4.3338518142700195,
"learning_rate": 4.426930765873971e-05,
"loss": 0.12927252054214478,
"memory(GiB)": 43.7,
"step": 645,
"token_acc": 0.9480657805044308,
"train_speed(iter/s)": 0.008395
},
{
"epoch": 1.1266948692916803,
"grad_norm": 8.904815673828125,
"learning_rate": 4.355673961668561e-05,
"loss": 0.142861807346344,
"memory(GiB)": 43.7,
"step": 650,
"token_acc": 0.9463022237196765,
"train_speed(iter/s)": 0.008396
},
{
"epoch": 1.135372600065083,
"grad_norm": 4.161317348480225,
"learning_rate": 4.2845499896526165e-05,
"loss": 0.14242833852767944,
"memory(GiB)": 43.7,
"step": 655,
"token_acc": 0.9534545454545454,
"train_speed(iter/s)": 0.008397
},
{
"epoch": 1.1440503308384857,
"grad_norm": 4.19969367980957,
"learning_rate": 4.213573512516086e-05,
"loss": 0.145144522190094,
"memory(GiB)": 43.7,
"step": 660,
"token_acc": 0.9491097208854667,
"train_speed(iter/s)": 0.008398
},
{
"epoch": 1.1527280616118885,
"grad_norm": 14.433186531066895,
"learning_rate": 4.142759162541847e-05,
"loss": 0.14555753469467164,
"memory(GiB)": 43.7,
"step": 665,
"token_acc": 0.9485169091252598,
"train_speed(iter/s)": 0.008398
},
{
"epoch": 1.1614057923852912,
"grad_norm": 2.4086694717407227,
"learning_rate": 4.072121538589164e-05,
"loss": 0.1370793342590332,
"memory(GiB)": 43.7,
"step": 670,
"token_acc": 0.9519276329959078,
"train_speed(iter/s)": 0.008399
},
{
"epoch": 1.170083523158694,
"grad_norm": 1.9969909191131592,
"learning_rate": 4.001675203084029e-05,
"loss": 0.13346275091171264,
"memory(GiB)": 43.7,
"step": 675,
"token_acc": 0.956349370225067,
"train_speed(iter/s)": 0.0084
},
{
"epoch": 1.1787612539320969,
"grad_norm": 6.479780673980713,
"learning_rate": 3.931434679017019e-05,
"loss": 0.1528043270111084,
"memory(GiB)": 43.7,
"step": 680,
"token_acc": 0.9442832532414543,
"train_speed(iter/s)": 0.008401
},
{
"epoch": 1.1874389847054996,
"grad_norm": 5.8431315422058105,
"learning_rate": 3.8614144469492914e-05,
"loss": 0.1413252830505371,
"memory(GiB)": 43.7,
"step": 685,
"token_acc": 0.9446944213883186,
"train_speed(iter/s)": 0.008402
},
{
"epoch": 1.1961167154789023,
"grad_norm": 4.603435039520264,
"learning_rate": 3.791628942027307e-05,
"loss": 0.13914390802383422,
"memory(GiB)": 43.7,
"step": 690,
"token_acc": 0.9466184864287768,
"train_speed(iter/s)": 0.008403
},
{
"epoch": 1.204794446252305,
"grad_norm": 3.8140857219696045,
"learning_rate": 3.72209255100693e-05,
"loss": 0.1497533917427063,
"memory(GiB)": 43.7,
"step": 695,
"token_acc": 0.9487390588439206,
"train_speed(iter/s)": 0.008404
},
{
"epoch": 1.2134721770257078,
"grad_norm": 3.3604190349578857,
"learning_rate": 3.6528196092875044e-05,
"loss": 0.13937609195709227,
"memory(GiB)": 43.7,
"step": 700,
"token_acc": 0.9550221300689821,
"train_speed(iter/s)": 0.008404
},
{
"epoch": 1.2134721770257078,
"eval_loss": 0.14972732961177826,
"eval_runtime": 220.5567,
"eval_samples_per_second": 4.221,
"eval_steps_per_second": 0.848,
"eval_token_acc": 0.9490785956157548,
"step": 700
},
{
"epoch": 1.2221499077991105,
"grad_norm": 1.7973473072052002,
"learning_rate": 3.5838243979565126e-05,
"loss": 0.14264475107192992,
"memory(GiB)": 43.7,
"step": 705,
"token_acc": 0.9420059621871812,
"train_speed(iter/s)": 0.008383
},
{
"epoch": 1.2308276385725132,
"grad_norm": 3.916630744934082,
"learning_rate": 3.5151211408454276e-05,
"loss": 0.14307937622070313,
"memory(GiB)": 43.7,
"step": 710,
"token_acc": 0.9455980249526077,
"train_speed(iter/s)": 0.008384
},
{
"epoch": 1.239505369345916,
"grad_norm": 1.8483539819717407,
"learning_rate": 3.446724001597375e-05,
"loss": 0.13684509992599486,
"memory(GiB)": 43.7,
"step": 715,
"token_acc": 0.9552911412420445,
"train_speed(iter/s)": 0.008385
},
{
"epoch": 1.2481831001193189,
"grad_norm": 4.51676607131958,
"learning_rate": 3.378647080747213e-05,
"loss": 0.1370592713356018,
"memory(GiB)": 43.7,
"step": 720,
"token_acc": 0.9479323691179466,
"train_speed(iter/s)": 0.008386
},
{
"epoch": 1.2568608308927216,
"grad_norm": 5.6989922523498535,
"learning_rate": 3.31090441281459e-05,
"loss": 0.13757110834121705,
"memory(GiB)": 43.7,
"step": 725,
"token_acc": 0.9518228006061146,
"train_speed(iter/s)": 0.008387
},
{
"epoch": 1.2655385616661243,
"grad_norm": 2.230487108230591,
"learning_rate": 3.2435099634106545e-05,
"loss": 0.14716867208480836,
"memory(GiB)": 43.7,
"step": 730,
"token_acc": 0.9534712267048945,
"train_speed(iter/s)": 0.008387
},
{
"epoch": 1.274216292439527,
"grad_norm": 4.099515438079834,
"learning_rate": 3.176477626358935e-05,
"loss": 0.1300519347190857,
"memory(GiB)": 43.7,
"step": 735,
"token_acc": 0.9527396126469128,
"train_speed(iter/s)": 0.008388
},
{
"epoch": 1.2828940232129298,
"grad_norm": 7.339303016662598,
"learning_rate": 3.1098212208310385e-05,
"loss": 0.13597266674041747,
"memory(GiB)": 43.7,
"step": 740,
"token_acc": 0.9525093572664708,
"train_speed(iter/s)": 0.008389
},
{
"epoch": 1.2915717539863325,
"grad_norm": 3.044003486633301,
"learning_rate": 3.0435544884977368e-05,
"loss": 0.13851481676101685,
"memory(GiB)": 43.7,
"step": 745,
"token_acc": 0.955963223921636,
"train_speed(iter/s)": 0.00839
},
{
"epoch": 1.3002494847597355,
"grad_norm": 2.625251054763794,
"learning_rate": 2.977691090696027e-05,
"loss": 0.13460161685943603,
"memory(GiB)": 43.7,
"step": 750,
"token_acc": 0.9536030797609977,
"train_speed(iter/s)": 0.00839
},
{
"epoch": 1.3089272155331382,
"grad_norm": 4.502344131469727,
"learning_rate": 2.912244605612749e-05,
"loss": 0.13218367099761963,
"memory(GiB)": 43.7,
"step": 755,
"token_acc": 0.9520431328036323,
"train_speed(iter/s)": 0.008391
},
{
"epoch": 1.317604946306541,
"grad_norm": 4.324583530426025,
"learning_rate": 2.8472285254853593e-05,
"loss": 0.13542770147323607,
"memory(GiB)": 43.7,
"step": 760,
"token_acc": 0.9550812224079551,
"train_speed(iter/s)": 0.008392
},
{
"epoch": 1.3262826770799436,
"grad_norm": 5.790456771850586,
"learning_rate": 2.7826562538204004e-05,
"loss": 0.1304815888404846,
"memory(GiB)": 43.7,
"step": 765,
"token_acc": 0.9475846709162122,
"train_speed(iter/s)": 0.008393
},
{
"epoch": 1.3349604078533464,
"grad_norm": 7.2669267654418945,
"learning_rate": 2.7185411026302964e-05,
"loss": 0.14250266551971436,
"memory(GiB)": 43.7,
"step": 770,
"token_acc": 0.9475920679886686,
"train_speed(iter/s)": 0.008394
},
{
"epoch": 1.343638138626749,
"grad_norm": 5.094613552093506,
"learning_rate": 2.654896289688972e-05,
"loss": 0.12874077558517455,
"memory(GiB)": 43.7,
"step": 775,
"token_acc": 0.9529416591449905,
"train_speed(iter/s)": 0.008394
},
{
"epoch": 1.3523158694001518,
"grad_norm": 6.989784240722656,
"learning_rate": 2.591734935806929e-05,
"loss": 0.12754837274551392,
"memory(GiB)": 43.7,
"step": 780,
"token_acc": 0.9427796380282287,
"train_speed(iter/s)": 0.008395
},
{
"epoch": 1.3609936001735545,
"grad_norm": 2.299795150756836,
"learning_rate": 2.5290700621263046e-05,
"loss": 0.12484978437423706,
"memory(GiB)": 43.7,
"step": 785,
"token_acc": 0.9552753758998168,
"train_speed(iter/s)": 0.008396
},
{
"epoch": 1.3696713309469573,
"grad_norm": 6.531172275543213,
"learning_rate": 2.4669145874364658e-05,
"loss": 0.12864662408828736,
"memory(GiB)": 43.7,
"step": 790,
"token_acc": 0.9534572241754562,
"train_speed(iter/s)": 0.008396
},
{
"epoch": 1.3783490617203602,
"grad_norm": 5.2566819190979,
"learning_rate": 2.4052813255107198e-05,
"loss": 0.12368242740631104,
"memory(GiB)": 43.7,
"step": 795,
"token_acc": 0.9542535446205171,
"train_speed(iter/s)": 0.008397
},
{
"epoch": 1.387026792493763,
"grad_norm": 2.911468029022217,
"learning_rate": 2.3441829824646604e-05,
"loss": 0.1311914801597595,
"memory(GiB)": 43.7,
"step": 800,
"token_acc": 0.9545532351934269,
"train_speed(iter/s)": 0.008398
},
{
"epoch": 1.387026792493763,
"eval_loss": 0.1414770781993866,
"eval_runtime": 220.3542,
"eval_samples_per_second": 4.225,
"eval_steps_per_second": 0.849,
"eval_token_acc": 0.9522295490999821,
"step": 800
},
{
"epoch": 1.3957045232671657,
"grad_norm": 43.0417594909668,
"learning_rate": 2.2836321541367272e-05,
"loss": 0.1324560523033142,
"memory(GiB)": 43.7,
"step": 805,
"token_acc": 0.9497365543294563,
"train_speed(iter/s)": 0.008379
},
{
"epoch": 1.4043822540405684,
"grad_norm": 3.845632791519165,
"learning_rate": 2.2236413234914805e-05,
"loss": 0.12397520542144776,
"memory(GiB)": 43.7,
"step": 810,
"token_acc": 0.953968391052768,
"train_speed(iter/s)": 0.00838
},
{
"epoch": 1.413059984813971,
"grad_norm": 32.551578521728516,
"learning_rate": 2.164222858046148e-05,
"loss": 0.12206425666809081,
"memory(GiB)": 43.7,
"step": 815,
"token_acc": 0.9575527717120638,
"train_speed(iter/s)": 0.008381
},
{
"epoch": 1.4217377155873738,
"grad_norm": 6.810664653778076,
"learning_rate": 2.105389007320992e-05,
"loss": 0.12367465496063232,
"memory(GiB)": 43.7,
"step": 820,
"token_acc": 0.9609237622601696,
"train_speed(iter/s)": 0.008382
},
{
"epoch": 1.4304154463607768,
"grad_norm": 6.067445755004883,
"learning_rate": 2.0471519003139762e-05,
"loss": 0.13576604127883912,
"memory(GiB)": 43.7,
"step": 825,
"token_acc": 0.9502264513472711,
"train_speed(iter/s)": 0.008383
},
{
"epoch": 1.4390931771341795,
"grad_norm": 2.4773712158203125,
"learning_rate": 1.9895235430002894e-05,
"loss": 0.1232601284980774,
"memory(GiB)": 43.7,
"step": 830,
"token_acc": 0.9555563932448733,
"train_speed(iter/s)": 0.008383
},
{
"epoch": 1.4477709079075822,
"grad_norm": 15.01673698425293,
"learning_rate": 1.9325158158572433e-05,
"loss": 0.12532531023025512,
"memory(GiB)": 43.7,
"step": 835,
"token_acc": 0.9559790297079138,
"train_speed(iter/s)": 0.008384
},
{
"epoch": 1.456448638680985,
"grad_norm": 4.682022571563721,
"learning_rate": 1.876140471415016e-05,
"loss": 0.13699095249176024,
"memory(GiB)": 43.7,
"step": 840,
"token_acc": 0.9564936637339575,
"train_speed(iter/s)": 0.008385
},
{
"epoch": 1.4651263694543877,
"grad_norm": 2.494595527648926,
"learning_rate": 1.820409131833804e-05,
"loss": 0.12996993064880372,
"memory(GiB)": 43.7,
"step": 845,
"token_acc": 0.9557536734356943,
"train_speed(iter/s)": 0.008386
},
{
"epoch": 1.4738041002277904,
"grad_norm": 5.67137336730957,
"learning_rate": 1.7653332865078242e-05,
"loss": 0.12316564321517945,
"memory(GiB)": 43.7,
"step": 850,
"token_acc": 0.9541896573848325,
"train_speed(iter/s)": 0.008387
},
{
"epoch": 1.4824818310011931,
"grad_norm": 3.5610580444335938,
"learning_rate": 1.710924289696697e-05,
"loss": 0.12576040029525756,
"memory(GiB)": 43.7,
"step": 855,
"token_acc": 0.958788442172128,
"train_speed(iter/s)": 0.008387
},
{
"epoch": 1.4911595617745959,
"grad_norm": 5.919505596160889,
"learning_rate": 1.6571933581846965e-05,
"loss": 0.13589413166046144,
"memory(GiB)": 43.7,
"step": 860,
"token_acc": 0.9557088249958451,
"train_speed(iter/s)": 0.008388
},
{
"epoch": 1.4998372925479986,
"grad_norm": 2.857247829437256,
"learning_rate": 1.604151568968328e-05,
"loss": 0.13363780975341796,
"memory(GiB)": 43.7,
"step": 865,
"token_acc": 0.9531686859273066,
"train_speed(iter/s)": 0.008389
},
{
"epoch": 1.5085150233214013,
"grad_norm": 4.2472734451293945,
"learning_rate": 1.55180985697273e-05,
"loss": 0.11843851804733277,
"memory(GiB)": 43.7,
"step": 870,
"token_acc": 0.9585031493145609,
"train_speed(iter/s)": 0.008389
},
{
"epoch": 1.5171927540948043,
"grad_norm": 2.154675006866455,
"learning_rate": 1.5001790127973719e-05,
"loss": 0.13474491834640503,
"memory(GiB)": 43.7,
"step": 875,
"token_acc": 0.9471295060080107,
"train_speed(iter/s)": 0.00839
},
{
"epoch": 1.525870484868207,
"grad_norm": 4.447220325469971,
"learning_rate": 1.449269680491484e-05,
"loss": 0.12090286016464233,
"memory(GiB)": 43.7,
"step": 880,
"token_acc": 0.9576200325661224,
"train_speed(iter/s)": 0.008391
},
{
"epoch": 1.5345482156416097,
"grad_norm": 5.527554035186768,
"learning_rate": 1.3990923553597312e-05,
"loss": 0.12562017440795897,
"memory(GiB)": 43.7,
"step": 885,
"token_acc": 0.9540816326530612,
"train_speed(iter/s)": 0.008391
},
{
"epoch": 1.5432259464150124,
"grad_norm": 5.432262420654297,
"learning_rate": 1.3496573817985264e-05,
"loss": 0.12086418867111207,
"memory(GiB)": 43.7,
"step": 890,
"token_acc": 0.9573405945087361,
"train_speed(iter/s)": 0.008392
},
{
"epoch": 1.5519036771884154,
"grad_norm": 1.7145942449569702,
"learning_rate": 1.30097495116346e-05,
"loss": 0.12666620016098024,
"memory(GiB)": 43.7,
"step": 895,
"token_acc": 0.9523390710725803,
"train_speed(iter/s)": 0.008393
},
{
"epoch": 1.560581407961818,
"grad_norm": 3.3511152267456055,
"learning_rate": 1.2530550996682905e-05,
"loss": 0.13257088661193847,
"memory(GiB)": 43.7,
"step": 900,
"token_acc": 0.9545784418356457,
"train_speed(iter/s)": 0.008393
},
{
"epoch": 1.560581407961818,
"eval_loss": 0.1344464272260666,
"eval_runtime": 220.4433,
"eval_samples_per_second": 4.223,
"eval_steps_per_second": 0.848,
"eval_token_acc": 0.9540331491712707,
"step": 900
},
{
"epoch": 1.5692591387352208,
"grad_norm": 2.53428053855896,
"learning_rate": 1.2059077063159035e-05,
"loss": 0.13223330974578856,
"memory(GiB)": 43.7,
"step": 905,
"token_acc": 0.9484811124360815,
"train_speed(iter/s)": 0.008377
},
{
"epoch": 1.5779368695086236,
"grad_norm": 2.2530903816223145,
"learning_rate": 1.1595424908616931e-05,
"loss": 0.12704041004180908,
"memory(GiB)": 43.7,
"step": 910,
"token_acc": 0.9567981213053689,
"train_speed(iter/s)": 0.008377
},
{
"epoch": 1.5866146002820263,
"grad_norm": 3.6312787532806396,
"learning_rate": 1.113969011809769e-05,
"loss": 0.124385404586792,
"memory(GiB)": 43.7,
"step": 915,
"token_acc": 0.9561269586179189,
"train_speed(iter/s)": 0.008378
},
{
"epoch": 1.595292331055429,
"grad_norm": 2.5788967609405518,
"learning_rate": 1.0691966644423985e-05,
"loss": 0.1238779067993164,
"memory(GiB)": 43.7,
"step": 920,
"token_acc": 0.9614115681770724,
"train_speed(iter/s)": 0.008379
},
{
"epoch": 1.6039700618288317,
"grad_norm": 3.791745185852051,
"learning_rate": 1.0252346788831136e-05,
"loss": 0.12241133451461791,
"memory(GiB)": 43.7,
"step": 925,
"token_acc": 0.9497440670079107,
"train_speed(iter/s)": 0.00838
},
{
"epoch": 1.6126477926022345,
"grad_norm": 2.4049911499023438,
"learning_rate": 9.820921181938547e-06,
"loss": 0.12140171527862549,
"memory(GiB)": 43.7,
"step": 930,
"token_acc": 0.9623712869189373,
"train_speed(iter/s)": 0.00838
},
{
"epoch": 1.6213255233756372,
"grad_norm": 3.9300315380096436,
"learning_rate": 9.39777876506559e-06,
"loss": 0.1210600733757019,
"memory(GiB)": 43.7,
"step": 935,
"token_acc": 0.9548552328862265,
"train_speed(iter/s)": 0.008381
},
{
"epoch": 1.63000325414904,
"grad_norm": 3.4940907955169678,
"learning_rate": 8.983006771895763e-06,
"loss": 0.12003093957901001,
"memory(GiB)": 43.7,
"step": 940,
"token_acc": 0.9573810994441013,
"train_speed(iter/s)": 0.008382
},
{
"epoch": 1.6386809849224426,
"grad_norm": 3.156331777572632,
"learning_rate": 8.57669071049283e-06,
"loss": 0.13336446285247802,
"memory(GiB)": 43.7,
"step": 945,
"token_acc": 0.9616925789389326,
"train_speed(iter/s)": 0.008382
},
{
"epoch": 1.6473587156958456,
"grad_norm": 2.2776732444763184,
"learning_rate": 8.1789143456728e-06,
"loss": 0.12087045907974243,
"memory(GiB)": 43.7,
"step": 950,
"token_acc": 0.9592353951890035,
"train_speed(iter/s)": 0.008383
},
{
"epoch": 1.6560364464692483,
"grad_norm": 4.188504695892334,
"learning_rate": 7.789759681735242e-06,
"loss": 0.12968361377716064,
"memory(GiB)": 43.7,
"step": 955,
"token_acc": 0.9590592711080368,
"train_speed(iter/s)": 0.008383
},
{
"epoch": 1.664714177242651,
"grad_norm": 2.595299243927002,
"learning_rate": 7.409306945557487e-06,
"loss": 0.12285364866256714,
"memory(GiB)": 43.7,
"step": 960,
"token_acc": 0.9505176420874709,
"train_speed(iter/s)": 0.008384
},
{
"epoch": 1.6733919080160538,
"grad_norm": 5.3159050941467285,
"learning_rate": 7.03763457005539e-06,
"loss": 0.117578125,
"memory(GiB)": 43.7,
"step": 965,
"token_acc": 0.9578746014877789,
"train_speed(iter/s)": 0.008385
},
{
"epoch": 1.6820696387894567,
"grad_norm": 6.753002643585205,
"learning_rate": 6.674819178013769e-06,
"loss": 0.12883116006851197,
"memory(GiB)": 43.7,
"step": 970,
"token_acc": 0.9502868869623488,
"train_speed(iter/s)": 0.008385
},
{
"epoch": 1.6907473695628594,
"grad_norm": 2.9135501384735107,
"learning_rate": 6.32093556629017e-06,
"loss": 0.1296370506286621,
"memory(GiB)": 43.7,
"step": 975,
"token_acc": 0.9569416670212313,
"train_speed(iter/s)": 0.008386
},
{
"epoch": 1.6994251003362622,
"grad_norm": 8.138466835021973,
"learning_rate": 5.97605669039496e-06,
"loss": 0.12514114379882812,
"memory(GiB)": 43.7,
"step": 980,
"token_acc": 0.959045205591433,
"train_speed(iter/s)": 0.008387
},
{
"epoch": 1.7081028311096649,
"grad_norm": 6.167930603027344,
"learning_rate": 5.640253649451016e-06,
"loss": 0.12624462842941284,
"memory(GiB)": 43.7,
"step": 985,
"token_acc": 0.9558782574099861,
"train_speed(iter/s)": 0.008387
},
{
"epoch": 1.7167805618830676,
"grad_norm": 3.414597749710083,
"learning_rate": 5.3135956715362205e-06,
"loss": 0.11242947578430176,
"memory(GiB)": 43.7,
"step": 990,
"token_acc": 0.9574127310061602,
"train_speed(iter/s)": 0.008388
},
{
"epoch": 1.7254582926564703,
"grad_norm": 5.249174118041992,
"learning_rate": 4.996150099411595e-06,
"loss": 0.1168135643005371,
"memory(GiB)": 43.7,
"step": 995,
"token_acc": 0.9566812923813323,
"train_speed(iter/s)": 0.008389
},
{
"epoch": 1.734136023429873,
"grad_norm": 16.526689529418945,
"learning_rate": 4.687982376638101e-06,
"loss": 0.12106761932373047,
"memory(GiB)": 43.7,
"step": 1000,
"token_acc": 0.950882838633188,
"train_speed(iter/s)": 0.008389
},
{
"epoch": 1.734136023429873,
"eval_loss": 0.13192416727542877,
"eval_runtime": 220.4784,
"eval_samples_per_second": 4.223,
"eval_steps_per_second": 0.848,
"eval_token_acc": 0.9551238638388879,
"step": 1000
},
{
"epoch": 1.7428137542032758,
"grad_norm": 8.070154190063477,
"learning_rate": 4.389156034085062e-06,
"loss": 0.11758378744125367,
"memory(GiB)": 43.7,
"step": 1005,
"token_acc": 0.9530685920577617,
"train_speed(iter/s)": 0.008374
},
{
"epoch": 1.7514914849766785,
"grad_norm": 4.704627513885498,
"learning_rate": 4.099732676832818e-06,
"loss": 0.11970211267471313,
"memory(GiB)": 43.7,
"step": 1010,
"token_acc": 0.9605290871561565,
"train_speed(iter/s)": 0.008375
},
{
"epoch": 1.7601692157500812,
"grad_norm": 2.2863786220550537,
"learning_rate": 3.8197719714724075e-06,
"loss": 0.13179272413253784,
"memory(GiB)": 43.7,
"step": 1015,
"token_acc": 0.9519187762769479,
"train_speed(iter/s)": 0.008376
},
{
"epoch": 1.768846946523484,
"grad_norm": 2.264657735824585,
"learning_rate": 3.5493316338049086e-06,
"loss": 0.11556450128555298,
"memory(GiB)": 43.7,
"step": 1020,
"token_acc": 0.9577792123950936,
"train_speed(iter/s)": 0.008376
},
{
"epoch": 1.777524677296887,
"grad_norm": 2.526029586791992,
"learning_rate": 3.2884674169429195e-06,
"loss": 0.12338602542877197,
"memory(GiB)": 43.7,
"step": 1025,
"token_acc": 0.9530650918866511,
"train_speed(iter/s)": 0.008377
},
{
"epoch": 1.7862024080702896,
"grad_norm": 2.9345407485961914,
"learning_rate": 3.037233099816705e-06,
"loss": 0.11988996267318726,
"memory(GiB)": 43.7,
"step": 1030,
"token_acc": 0.9589889775750665,
"train_speed(iter/s)": 0.008377
},
{
"epoch": 1.7948801388436924,
"grad_norm": 26.361116409301758,
"learning_rate": 2.7956804760872923e-06,
"loss": 0.1184123396873474,
"memory(GiB)": 43.7,
"step": 1035,
"token_acc": 0.9544123107388919,
"train_speed(iter/s)": 0.008376
},
{
"epoch": 1.803557869617095,
"grad_norm": 1.9186588525772095,
"learning_rate": 2.563859343468822e-06,
"loss": 0.11901482343673705,
"memory(GiB)": 43.7,
"step": 1040,
"token_acc": 0.957713331308837,
"train_speed(iter/s)": 0.008376
},
{
"epoch": 1.812235600390498,
"grad_norm": 2.4511313438415527,
"learning_rate": 2.3418174934624614e-06,
"loss": 0.11996428966522217,
"memory(GiB)": 43.7,
"step": 1045,
"token_acc": 0.9580075296843324,
"train_speed(iter/s)": 0.008376
},
{
"epoch": 1.8209133311639008,
"grad_norm": 4.2686381340026855,
"learning_rate": 2.1296007015038366e-06,
"loss": 0.12237482070922852,
"memory(GiB)": 43.7,
"step": 1050,
"token_acc": 0.9547814096844838,
"train_speed(iter/s)": 0.008376
},
{
"epoch": 1.8295910619373035,
"grad_norm": 2.8372726440429688,
"learning_rate": 1.927252717526118e-06,
"loss": 0.11317713260650634,
"memory(GiB)": 43.7,
"step": 1055,
"token_acc": 0.95649260918722,
"train_speed(iter/s)": 0.008377
},
{
"epoch": 1.8382687927107062,
"grad_norm": 2.8570916652679443,
"learning_rate": 1.734815256940675e-06,
"loss": 0.12497738599777222,
"memory(GiB)": 43.7,
"step": 1060,
"token_acc": 0.9487427466150871,
"train_speed(iter/s)": 0.008377
},
{
"epoch": 1.846946523484109,
"grad_norm": 3.211838722229004,
"learning_rate": 1.552327992037167e-06,
"loss": 0.1036531686782837,
"memory(GiB)": 43.7,
"step": 1065,
"token_acc": 0.9615865210205974,
"train_speed(iter/s)": 0.008378
},
{
"epoch": 1.8556242542575117,
"grad_norm": 3.4643843173980713,
"learning_rate": 1.379828543804812e-06,
"loss": 0.12307696342468262,
"memory(GiB)": 43.7,
"step": 1070,
"token_acc": 0.9607265161038623,
"train_speed(iter/s)": 0.008378
},
{
"epoch": 1.8643019850309144,
"grad_norm": 1.6095548868179321,
"learning_rate": 1.2173524741765917e-06,
"loss": 0.12708972692489623,
"memory(GiB)": 43.7,
"step": 1075,
"token_acc": 0.952561829999156,
"train_speed(iter/s)": 0.008379
},
{
"epoch": 1.872979715804317,
"grad_norm": 4.927311897277832,
"learning_rate": 1.064933278697905e-06,
"loss": 0.12298980951309205,
"memory(GiB)": 43.7,
"step": 1080,
"token_acc": 0.9553622478150781,
"train_speed(iter/s)": 0.00838
},
{
"epoch": 1.8816574465777198,
"grad_norm": 9.174079895019531,
"learning_rate": 9.22602379621218e-07,
"loss": 0.11775370836257934,
"memory(GiB)": 43.7,
"step": 1085,
"token_acc": 0.9566113042358697,
"train_speed(iter/s)": 0.00838
},
{
"epoch": 1.8903351773511226,
"grad_norm": 2.2587075233459473,
"learning_rate": 7.903891194281754e-07,
"loss": 0.12096530199050903,
"memory(GiB)": 43.7,
"step": 1090,
"token_acc": 0.9606564187785626,
"train_speed(iter/s)": 0.008381
},
{
"epoch": 1.8990129081245253,
"grad_norm": 2.2280704975128174,
"learning_rate": 6.683207547804382e-07,
"loss": 0.1250510573387146,
"memory(GiB)": 43.7,
"step": 1095,
"token_acc": 0.9589882943143813,
"train_speed(iter/s)": 0.008382
},
{
"epoch": 1.9076906388979282,
"grad_norm": 1.6023691892623901,
"learning_rate": 5.564224509005566e-07,
"loss": 0.1259409546852112,
"memory(GiB)": 43.7,
"step": 1100,
"token_acc": 0.9541287997492949,
"train_speed(iter/s)": 0.008382
},
{
"epoch": 1.9076906388979282,
"eval_loss": 0.13046810030937195,
"eval_runtime": 220.3988,
"eval_samples_per_second": 4.224,
"eval_steps_per_second": 0.848,
"eval_token_acc": 0.9554517911245767,
"step": 1100
},
{
"epoch": 1.916368369671331,
"grad_norm": 4.342984676361084,
"learning_rate": 4.5471727638394246e-07,
"loss": 0.11758095026016235,
"memory(GiB)": 43.7,
"step": 1105,
"token_acc": 0.9547268970774075,
"train_speed(iter/s)": 0.008369
},
{
"epoch": 1.9250461004447337,
"grad_norm": 1.968943476676941,
"learning_rate": 3.6322619844317286e-07,
"loss": 0.11470128297805786,
"memory(GiB)": 43.7,
"step": 1110,
"token_acc": 0.9598653765250316,
"train_speed(iter/s)": 0.008369
},
{
"epoch": 1.9337238312181364,
"grad_norm": 8.885481834411621,
"learning_rate": 2.8196807858543174e-07,
"loss": 0.12146815061569213,
"memory(GiB)": 43.7,
"step": 1115,
"token_acc": 0.957883686752047,
"train_speed(iter/s)": 0.00837
},
{
"epoch": 1.9424015619915393,
"grad_norm": 3.2066810131073,
"learning_rate": 2.1095966872407557e-07,
"loss": 0.12168284654617309,
"memory(GiB)": 43.7,
"step": 1120,
"token_acc": 0.9506006374111302,
"train_speed(iter/s)": 0.008371
},
{
"epoch": 1.951079292764942,
"grad_norm": 4.034703254699707,
"learning_rate": 1.5021560772514597e-07,
"loss": 0.12114499807357788,
"memory(GiB)": 43.7,
"step": 1125,
"token_acc": 0.9597590361445784,
"train_speed(iter/s)": 0.008371
},
{
"epoch": 1.9597570235383448,
"grad_norm": 4.031400203704834,
"learning_rate": 9.974841838941151e-08,
"loss": 0.124148690700531,
"memory(GiB)": 43.7,
"step": 1130,
"token_acc": 0.9580030654696737,
"train_speed(iter/s)": 0.008372
},
{
"epoch": 1.9684347543117475,
"grad_norm": 1.9329931735992432,
"learning_rate": 5.9568504870771704e-08,
"loss": 0.13172950744628906,
"memory(GiB)": 43.7,
"step": 1135,
"token_acc": 0.9577101598762249,
"train_speed(iter/s)": 0.008373
},
{
"epoch": 1.9771124850851503,
"grad_norm": 5.185240268707275,
"learning_rate": 2.9684150531317233e-08,
"loss": 0.1293073296546936,
"memory(GiB)": 43.7,
"step": 1140,
"token_acc": 0.9555131747946911,
"train_speed(iter/s)": 0.008373
},
{
"epoch": 1.985790215858553,
"grad_norm": 1.6715742349624634,
"learning_rate": 1.0101516233695928e-08,
"loss": 0.11854711771011353,
"memory(GiB)": 43.7,
"step": 1145,
"token_acc": 0.9571147482336105,
"train_speed(iter/s)": 0.008374
},
{
"epoch": 1.9944679466319557,
"grad_norm": 2.0126240253448486,
"learning_rate": 8.246390709787389e-10,
"loss": 0.12129169702529907,
"memory(GiB)": 43.7,
"step": 1150,
"token_acc": 0.9570041028217365,
"train_speed(iter/s)": 0.008375
},
{
"epoch": 1.997939038941317,
"eval_loss": 0.13022483885288239,
"eval_runtime": 220.4004,
"eval_samples_per_second": 4.224,
"eval_steps_per_second": 0.848,
"eval_token_acc": 0.9554517911245767,
"step": 1152
}
],
"logging_steps": 5,
"max_steps": 1152,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2250994394408157e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}