{
"best_global_step": 785,
"best_metric": 0.04556597024202347,
"best_model_checkpoint": "outputs/checkpoint-785",
"epoch": 14.0,
"eval_steps": 500,
"global_step": 2198,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06369426751592357,
"grad_norm": 0.5893169641494751,
"learning_rate": 0.000199991726421274,
"loss": 0.3173,
"step": 10
},
{
"epoch": 0.12738853503184713,
"grad_norm": 0.4055931568145752,
"learning_rate": 0.00019996312815368718,
"loss": 0.1199,
"step": 20
},
{
"epoch": 0.1910828025477707,
"grad_norm": 0.49983513355255127,
"learning_rate": 0.00019991410889510497,
"loss": 0.0987,
"step": 30
},
{
"epoch": 0.25477707006369427,
"grad_norm": 0.4363257586956024,
"learning_rate": 0.00019984467865943805,
"loss": 0.0775,
"step": 40
},
{
"epoch": 0.3184713375796178,
"grad_norm": 0.26337388157844543,
"learning_rate": 0.00019975485163025835,
"loss": 0.0573,
"step": 50
},
{
"epoch": 0.3821656050955414,
"grad_norm": 0.24820806086063385,
"learning_rate": 0.00019964464615790156,
"loss": 0.0647,
"step": 60
},
{
"epoch": 0.445859872611465,
"grad_norm": 0.1877005398273468,
"learning_rate": 0.0001995140847557183,
"loss": 0.0608,
"step": 70
},
{
"epoch": 0.5095541401273885,
"grad_norm": 0.2699015736579895,
"learning_rate": 0.00019936319409547513,
"loss": 0.0533,
"step": 80
},
{
"epoch": 0.5732484076433121,
"grad_norm": 0.32841813564300537,
"learning_rate": 0.00019919200500190587,
"loss": 0.0622,
"step": 90
},
{
"epoch": 0.6369426751592356,
"grad_norm": 0.2413034588098526,
"learning_rate": 0.00019900055244641447,
"loss": 0.0664,
"step": 100
},
{
"epoch": 0.7006369426751592,
"grad_norm": 0.3198252320289612,
"learning_rate": 0.000198788875539931,
"loss": 0.049,
"step": 110
},
{
"epoch": 0.7643312101910829,
"grad_norm": 0.27829986810684204,
"learning_rate": 0.00019855701752492176,
"loss": 0.0574,
"step": 120
},
{
"epoch": 0.8280254777070064,
"grad_norm": 0.2980740964412689,
"learning_rate": 0.00019830502576655552,
"loss": 0.0494,
"step": 130
},
{
"epoch": 0.89171974522293,
"grad_norm": 0.3273659348487854,
"learning_rate": 0.00019803295174302752,
"loss": 0.0486,
"step": 140
},
{
"epoch": 0.9554140127388535,
"grad_norm": 0.23626568913459778,
"learning_rate": 0.00019774085103504326,
"loss": 0.0408,
"step": 150
},
{
"epoch": 1.0,
"eval_loss": 0.05023716390132904,
"eval_runtime": 152.5451,
"eval_samples_per_second": 2.74,
"eval_steps_per_second": 0.347,
"step": 157
},
{
"epoch": 1.019108280254777,
"grad_norm": 0.2541586458683014,
"learning_rate": 0.00019742878331446414,
"loss": 0.0473,
"step": 160
},
{
"epoch": 1.0828025477707006,
"grad_norm": 0.27645406126976013,
"learning_rate": 0.00019709681233211733,
"loss": 0.0224,
"step": 170
},
{
"epoch": 1.1464968152866242,
"grad_norm": 0.3336308002471924,
"learning_rate": 0.0001967450059047726,
"loss": 0.0478,
"step": 180
},
{
"epoch": 1.2101910828025477,
"grad_norm": 0.1680356115102768,
"learning_rate": 0.00019637343590128809,
"loss": 0.0315,
"step": 190
},
{
"epoch": 1.2738853503184713,
"grad_norm": 0.1790059506893158,
"learning_rate": 0.00019598217822792892,
"loss": 0.0326,
"step": 200
},
{
"epoch": 1.3375796178343948,
"grad_norm": 0.22847947478294373,
"learning_rate": 0.00019557131281286024,
"loss": 0.0481,
"step": 210
},
{
"epoch": 1.4012738853503186,
"grad_norm": 0.20187054574489594,
"learning_rate": 0.0001951409235898194,
"loss": 0.0407,
"step": 220
},
{
"epoch": 1.4649681528662422,
"grad_norm": 0.14863349497318268,
"learning_rate": 0.0001946910984809694,
"loss": 0.0406,
"step": 230
},
{
"epoch": 1.5286624203821657,
"grad_norm": 0.17791222035884857,
"learning_rate": 0.00019422192937893775,
"loss": 0.0328,
"step": 240
},
{
"epoch": 1.5923566878980893,
"grad_norm": 0.15719226002693176,
"learning_rate": 0.00019373351212804404,
"loss": 0.0337,
"step": 250
},
{
"epoch": 1.6560509554140128,
"grad_norm": 0.2113306075334549,
"learning_rate": 0.0001932259465047206,
"loss": 0.0353,
"step": 260
},
{
"epoch": 1.7197452229299364,
"grad_norm": 0.19012148678302765,
"learning_rate": 0.0001926993361971293,
"loss": 0.0328,
"step": 270
},
{
"epoch": 1.78343949044586,
"grad_norm": 0.1509159654378891,
"learning_rate": 0.00019215378878397997,
"loss": 0.0407,
"step": 280
},
{
"epoch": 1.8471337579617835,
"grad_norm": 0.1625605821609497,
"learning_rate": 0.00019158941571255337,
"loss": 0.0379,
"step": 290
},
{
"epoch": 1.910828025477707,
"grad_norm": 0.12683314085006714,
"learning_rate": 0.0001910063322759343,
"loss": 0.037,
"step": 300
},
{
"epoch": 1.9745222929936306,
"grad_norm": 0.14843901991844177,
"learning_rate": 0.00019040465758945883,
"loss": 0.0353,
"step": 310
},
{
"epoch": 2.0,
"eval_loss": 0.0473560094833374,
"eval_runtime": 151.5547,
"eval_samples_per_second": 2.758,
"eval_steps_per_second": 0.35,
"step": 314
},
{
"epoch": 2.038216560509554,
"grad_norm": 0.16260643303394318,
"learning_rate": 0.00018978451456638088,
"loss": 0.0365,
"step": 320
},
{
"epoch": 2.1019108280254777,
"grad_norm": 0.09349235892295837,
"learning_rate": 0.00018914602989276294,
"loss": 0.0327,
"step": 330
},
{
"epoch": 2.1656050955414012,
"grad_norm": 0.17592753469944,
"learning_rate": 0.00018848933400159569,
"loss": 0.0258,
"step": 340
},
{
"epoch": 2.229299363057325,
"grad_norm": 0.1627691686153412,
"learning_rate": 0.00018781456104615272,
"loss": 0.0274,
"step": 350
},
{
"epoch": 2.2929936305732483,
"grad_norm": 0.14966121315956116,
"learning_rate": 0.00018712184887258494,
"loss": 0.0293,
"step": 360
},
{
"epoch": 2.356687898089172,
"grad_norm": 0.17441661655902863,
"learning_rate": 0.0001864113389917606,
"loss": 0.0304,
"step": 370
},
{
"epoch": 2.4203821656050954,
"grad_norm": 0.09554579854011536,
"learning_rate": 0.00018568317655035676,
"loss": 0.0321,
"step": 380
},
{
"epoch": 2.484076433121019,
"grad_norm": 0.2536601126194,
"learning_rate": 0.00018493751030120793,
"loss": 0.0257,
"step": 390
},
{
"epoch": 2.5477707006369426,
"grad_norm": 0.16189263761043549,
"learning_rate": 0.00018417449257291803,
"loss": 0.0279,
"step": 400
},
{
"epoch": 2.611464968152866,
"grad_norm": 0.1018817350268364,
"learning_rate": 0.00018339427923874207,
"loss": 0.0289,
"step": 410
},
{
"epoch": 2.6751592356687897,
"grad_norm": 0.14323534071445465,
"learning_rate": 0.00018259702968474327,
"loss": 0.0275,
"step": 420
},
{
"epoch": 2.738853503184713,
"grad_norm": 0.11969427019357681,
"learning_rate": 0.00018178290677723312,
"loss": 0.0281,
"step": 430
},
{
"epoch": 2.802547770700637,
"grad_norm": 0.2024673968553543,
"learning_rate": 0.00018095207682950005,
"loss": 0.0314,
"step": 440
},
{
"epoch": 2.8662420382165603,
"grad_norm": 0.17737647891044617,
"learning_rate": 0.00018010470956783406,
"loss": 0.0279,
"step": 450
},
{
"epoch": 2.9299363057324843,
"grad_norm": 0.18298077583312988,
"learning_rate": 0.00017924097809685424,
"loss": 0.0257,
"step": 460
},
{
"epoch": 2.9936305732484074,
"grad_norm": 0.2549296021461487,
"learning_rate": 0.00017836105886414596,
"loss": 0.0304,
"step": 470
},
{
"epoch": 3.0,
"eval_loss": 0.05053602159023285,
"eval_runtime": 152.6545,
"eval_samples_per_second": 2.738,
"eval_steps_per_second": 0.347,
"step": 471
},
{
"epoch": 3.0573248407643314,
"grad_norm": 0.17169545590877533,
"learning_rate": 0.00017746513162421535,
"loss": 0.0208,
"step": 480
},
{
"epoch": 3.121019108280255,
"grad_norm": 0.21220606565475464,
"learning_rate": 0.00017655337940176793,
"loss": 0.0226,
"step": 490
},
{
"epoch": 3.1847133757961785,
"grad_norm": 0.12988293170928955,
"learning_rate": 0.00017562598845431956,
"loss": 0.0217,
"step": 500
},
{
"epoch": 3.248407643312102,
"grad_norm": 0.3005841076374054,
"learning_rate": 0.00017468314823414669,
"loss": 0.0187,
"step": 510
},
{
"epoch": 3.3121019108280256,
"grad_norm": 0.21529339253902435,
"learning_rate": 0.000173725051349584,
"loss": 0.022,
"step": 520
},
{
"epoch": 3.375796178343949,
"grad_norm": 0.07448782026767731,
"learning_rate": 0.00017275189352567745,
"loss": 0.0311,
"step": 530
},
{
"epoch": 3.4394904458598727,
"grad_norm": 0.10172971338033676,
"learning_rate": 0.0001717638735642005,
"loss": 0.0223,
"step": 540
},
{
"epoch": 3.5031847133757963,
"grad_norm": 0.19833995401859283,
"learning_rate": 0.0001707611933030419,
"loss": 0.0249,
"step": 550
},
{
"epoch": 3.56687898089172,
"grad_norm": 0.234901562333107,
"learning_rate": 0.00016974405757497318,
"loss": 0.0383,
"step": 560
},
{
"epoch": 3.6305732484076434,
"grad_norm": 0.21425440907478333,
"learning_rate": 0.0001687126741658041,
"loss": 0.0266,
"step": 570
},
{
"epoch": 3.694267515923567,
"grad_norm": 0.08396715670824051,
"learning_rate": 0.00016766725377193557,
"loss": 0.0265,
"step": 580
},
{
"epoch": 3.7579617834394905,
"grad_norm": 0.08471404016017914,
"learning_rate": 0.00016660800995731693,
"loss": 0.0238,
"step": 590
},
{
"epoch": 3.821656050955414,
"grad_norm": 0.0868527814745903,
"learning_rate": 0.00016553515910981847,
"loss": 0.0269,
"step": 600
},
{
"epoch": 3.8853503184713376,
"grad_norm": 0.3129713535308838,
"learning_rate": 0.0001644489203970263,
"loss": 0.0223,
"step": 610
},
{
"epoch": 3.949044585987261,
"grad_norm": 0.11113307625055313,
"learning_rate": 0.00016334951572146965,
"loss": 0.0238,
"step": 620
},
{
"epoch": 4.0,
"eval_loss": 0.04629155993461609,
"eval_runtime": 148.9174,
"eval_samples_per_second": 2.807,
"eval_steps_per_second": 0.356,
"step": 628
},
{
"epoch": 4.012738853503185,
"grad_norm": 0.0971466675400734,
"learning_rate": 0.00016223716967528958,
"loss": 0.018,
"step": 630
},
{
"epoch": 4.076433121019108,
"grad_norm": 0.15950708091259003,
"learning_rate": 0.00016111210949435815,
"loss": 0.0158,
"step": 640
},
{
"epoch": 4.140127388535032,
"grad_norm": 0.20078063011169434,
"learning_rate": 0.00015997456501185727,
"loss": 0.0179,
"step": 650
},
{
"epoch": 4.203821656050955,
"grad_norm": 0.1425529569387436,
"learning_rate": 0.0001588247686113274,
"loss": 0.0245,
"step": 660
},
{
"epoch": 4.267515923566879,
"grad_norm": 0.1291467249393463,
"learning_rate": 0.00015766295517919497,
"loss": 0.0241,
"step": 670
},
{
"epoch": 4.3312101910828025,
"grad_norm": 0.08179380744695663,
"learning_rate": 0.00015648936205678838,
"loss": 0.023,
"step": 680
},
{
"epoch": 4.3949044585987265,
"grad_norm": 0.15069672465324402,
"learning_rate": 0.00015530422899185298,
"loss": 0.0304,
"step": 690
},
{
"epoch": 4.45859872611465,
"grad_norm": 0.14441800117492676,
"learning_rate": 0.00015410779808957385,
"loss": 0.0246,
"step": 700
},
{
"epoch": 4.522292993630574,
"grad_norm": 0.07218258827924728,
"learning_rate": 0.0001529003137631175,
"loss": 0.0232,
"step": 710
},
{
"epoch": 4.585987261146497,
"grad_norm": 0.28358036279678345,
"learning_rate": 0.0001516820226837017,
"loss": 0.0357,
"step": 720
},
{
"epoch": 4.649681528662421,
"grad_norm": 0.48727092146873474,
"learning_rate": 0.00015045317373020426,
"loss": 0.0192,
"step": 730
},
{
"epoch": 4.713375796178344,
"grad_norm": 0.14064273238182068,
"learning_rate": 0.00014921401793832094,
"loss": 0.0221,
"step": 740
},
{
"epoch": 4.777070063694268,
"grad_norm": 0.4577218294143677,
"learning_rate": 0.00014796480844928218,
"loss": 0.0171,
"step": 750
},
{
"epoch": 4.840764331210191,
"grad_norm": 0.07277490198612213,
"learning_rate": 0.0001467058004581404,
"loss": 0.0244,
"step": 760
},
{
"epoch": 4.904458598726115,
"grad_norm": 0.3607349693775177,
"learning_rate": 0.0001454372511616373,
"loss": 0.0227,
"step": 770
},
{
"epoch": 4.968152866242038,
"grad_norm": 1.5265377759933472,
"learning_rate": 0.00014415941970566233,
"loss": 0.0239,
"step": 780
},
{
"epoch": 5.0,
"eval_loss": 0.04556597024202347,
"eval_runtime": 148.8315,
"eval_samples_per_second": 2.809,
"eval_steps_per_second": 0.356,
"step": 785
},
{
"epoch": 5.031847133757962,
"grad_norm": 0.08108412474393845,
"learning_rate": 0.00014287256713231314,
"loss": 0.0179,
"step": 790
},
{
"epoch": 5.095541401273885,
"grad_norm": 0.07250893861055374,
"learning_rate": 0.00014157695632656837,
"loss": 0.0128,
"step": 800
},
{
"epoch": 5.159235668789809,
"grad_norm": 0.12970279157161713,
"learning_rate": 0.00014027285196258426,
"loss": 0.0127,
"step": 810
},
{
"epoch": 5.222929936305732,
"grad_norm": 0.20146997272968292,
"learning_rate": 0.00013896052044962557,
"loss": 0.014,
"step": 820
},
{
"epoch": 5.286624203821656,
"grad_norm": 0.165513277053833,
"learning_rate": 0.00013764022987764209,
"loss": 0.0182,
"step": 830
},
{
"epoch": 5.350318471337579,
"grad_norm": 0.1810760200023651,
"learning_rate": 0.00013631224996250185,
"loss": 0.0164,
"step": 840
},
{
"epoch": 5.414012738853503,
"grad_norm": 0.13012060523033142,
"learning_rate": 0.00013497685199089217,
"loss": 0.0162,
"step": 850
},
{
"epoch": 5.477707006369426,
"grad_norm": 0.1861809343099594,
"learning_rate": 0.00013363430876489976,
"loss": 0.0141,
"step": 860
},
{
"epoch": 5.54140127388535,
"grad_norm": 0.2427922487258911,
"learning_rate": 0.00013228489454628127,
"loss": 0.0191,
"step": 870
},
{
"epoch": 5.6050955414012735,
"grad_norm": 0.06246360391378403,
"learning_rate": 0.00013092888500043566,
"loss": 0.0143,
"step": 880
},
{
"epoch": 5.6687898089171975,
"grad_norm": 0.15271341800689697,
"learning_rate": 0.0001295665571400899,
"loss": 0.0185,
"step": 890
},
{
"epoch": 5.732484076433121,
"grad_norm": 0.04112791642546654,
"learning_rate": 0.00012819818926870942,
"loss": 0.0122,
"step": 900
},
{
"epoch": 5.796178343949045,
"grad_norm": 0.13756102323532104,
"learning_rate": 0.00012682406092364446,
"loss": 0.0205,
"step": 910
},
{
"epoch": 5.859872611464969,
"grad_norm": 0.1089109405875206,
"learning_rate": 0.00012544445281902512,
"loss": 0.0175,
"step": 920
},
{
"epoch": 5.923566878980892,
"grad_norm": 0.5035731792449951,
"learning_rate": 0.00012405964678841556,
"loss": 0.0164,
"step": 930
},
{
"epoch": 5.987261146496815,
"grad_norm": 0.04808522015810013,
"learning_rate": 0.0001226699257272393,
"loss": 0.0178,
"step": 940
},
{
"epoch": 6.0,
"eval_loss": 0.048118457198143005,
"eval_runtime": 148.8942,
"eval_samples_per_second": 2.807,
"eval_steps_per_second": 0.356,
"step": 942
},
{
"epoch": 6.050955414012739,
"grad_norm": 0.09140116721391678,
"learning_rate": 0.00012127557353498806,
"loss": 0.0095,
"step": 950
},
{
"epoch": 6.114649681528663,
"grad_norm": 0.0554346963763237,
"learning_rate": 0.00011987687505722532,
"loss": 0.0102,
"step": 960
},
{
"epoch": 6.178343949044586,
"grad_norm": 0.0764077678322792,
"learning_rate": 0.00011847411602739645,
"loss": 0.0097,
"step": 970
},
{
"epoch": 6.24203821656051,
"grad_norm": 0.0922919437289238,
"learning_rate": 0.00011706758300845771,
"loss": 0.0127,
"step": 980
},
{
"epoch": 6.305732484076433,
"grad_norm": 0.12583084404468536,
"learning_rate": 0.0001156575633343355,
"loss": 0.0119,
"step": 990
},
{
"epoch": 6.369426751592357,
"grad_norm": 0.14942176640033722,
"learning_rate": 0.00011424434505122851,
"loss": 0.0132,
"step": 1000
},
{
"epoch": 6.43312101910828,
"grad_norm": 0.2156478315591812,
"learning_rate": 0.00011282821685876399,
"loss": 0.012,
"step": 1010
},
{
"epoch": 6.496815286624204,
"grad_norm": 0.12194344401359558,
"learning_rate": 0.00011140946805102059,
"loss": 0.0136,
"step": 1020
},
{
"epoch": 6.560509554140127,
"grad_norm": 0.1328732818365097,
"learning_rate": 0.00010998838845743011,
"loss": 0.0131,
"step": 1030
},
{
"epoch": 6.624203821656051,
"grad_norm": 0.30128493905067444,
"learning_rate": 0.00010856526838356941,
"loss": 0.0109,
"step": 1040
},
{
"epoch": 6.687898089171974,
"grad_norm": 0.16975216567516327,
"learning_rate": 0.00010714039855185539,
"loss": 0.0149,
"step": 1050
},
{
"epoch": 6.751592356687898,
"grad_norm": 0.08274857699871063,
"learning_rate": 0.00010571407004215447,
"loss": 0.0155,
"step": 1060
},
{
"epoch": 6.8152866242038215,
"grad_norm": 0.1308615654706955,
"learning_rate": 0.00010428657423231969,
"loss": 0.0135,
"step": 1070
},
{
"epoch": 6.8789808917197455,
"grad_norm": 0.05005017668008804,
"learning_rate": 0.00010285820273866613,
"loss": 0.0157,
"step": 1080
},
{
"epoch": 6.942675159235669,
"grad_norm": 0.23810291290283203,
"learning_rate": 0.00010142924735639819,
"loss": 0.0141,
"step": 1090
},
{
"epoch": 7.0,
"eval_loss": 0.047865718603134155,
"eval_runtime": 148.8655,
"eval_samples_per_second": 2.808,
"eval_steps_per_second": 0.356,
"step": 1099
},
{
"epoch": 7.006369426751593,
"grad_norm": 0.09416891634464264,
"learning_rate": 0.0001,
"loss": 0.0146,
"step": 1100
},
{
"epoch": 7.070063694267516,
"grad_norm": 0.7533183097839355,
"learning_rate": 9.857075264360185e-05,
"loss": 0.0113,
"step": 1110
},
{
"epoch": 7.13375796178344,
"grad_norm": 0.16101513803005219,
"learning_rate": 9.714179726133388e-05,
"loss": 0.0075,
"step": 1120
},
{
"epoch": 7.197452229299363,
"grad_norm": 0.1908101737499237,
"learning_rate": 9.571342576768035e-05,
"loss": 0.009,
"step": 1130
},
{
"epoch": 7.261146496815287,
"grad_norm": 0.06933945417404175,
"learning_rate": 9.428592995784554e-05,
"loss": 0.0089,
"step": 1140
},
{
"epoch": 7.32484076433121,
"grad_norm": 0.054749008268117905,
"learning_rate": 9.285960144814465e-05,
"loss": 0.0097,
"step": 1150
},
{
"epoch": 7.388535031847134,
"grad_norm": 0.21135513484477997,
"learning_rate": 9.14347316164306e-05,
"loss": 0.011,
"step": 1160
},
{
"epoch": 7.452229299363057,
"grad_norm": 0.0507340133190155,
"learning_rate": 9.00116115425699e-05,
"loss": 0.0095,
"step": 1170
},
{
"epoch": 7.515923566878981,
"grad_norm": 0.06733115762472153,
"learning_rate": 8.859053194897942e-05,
"loss": 0.0108,
"step": 1180
},
{
"epoch": 7.579617834394904,
"grad_norm": 0.07039262354373932,
"learning_rate": 8.717178314123605e-05,
"loss": 0.0082,
"step": 1190
},
{
"epoch": 7.643312101910828,
"grad_norm": 0.08534280955791473,
"learning_rate": 8.575565494877147e-05,
"loss": 0.0099,
"step": 1200
},
{
"epoch": 7.707006369426751,
"grad_norm": 0.11997800320386887,
"learning_rate": 8.434243666566451e-05,
"loss": 0.011,
"step": 1210
},
{
"epoch": 7.770700636942675,
"grad_norm": 0.05447472259402275,
"learning_rate": 8.293241699154231e-05,
"loss": 0.0089,
"step": 1220
},
{
"epoch": 7.834394904458598,
"grad_norm": 0.045016925781965256,
"learning_rate": 8.152588397260357e-05,
"loss": 0.0087,
"step": 1230
},
{
"epoch": 7.898089171974522,
"grad_norm": 0.10037513077259064,
"learning_rate": 8.012312494277472e-05,
"loss": 0.0088,
"step": 1240
},
{
"epoch": 7.961783439490446,
"grad_norm": 0.09553356468677521,
"learning_rate": 7.872442646501199e-05,
"loss": 0.008,
"step": 1250
},
{
"epoch": 8.0,
"eval_loss": 0.05469883605837822,
"eval_runtime": 148.8965,
"eval_samples_per_second": 2.807,
"eval_steps_per_second": 0.356,
"step": 1256
},
{
"epoch": 8.02547770700637,
"grad_norm": 0.099692702293396,
"learning_rate": 7.733007427276075e-05,
"loss": 0.008,
"step": 1260
},
{
"epoch": 8.089171974522293,
"grad_norm": 0.05955551564693451,
"learning_rate": 7.594035321158445e-05,
"loss": 0.0062,
"step": 1270
},
{
"epoch": 8.152866242038217,
"grad_norm": 0.055873848497867584,
"learning_rate": 7.455554718097487e-05,
"loss": 0.0051,
"step": 1280
},
{
"epoch": 8.21656050955414,
"grad_norm": 0.11832093447446823,
"learning_rate": 7.317593907635558e-05,
"loss": 0.0068,
"step": 1290
},
{
"epoch": 8.280254777070065,
"grad_norm": 0.11872788518667221,
"learning_rate": 7.180181073129061e-05,
"loss": 0.0055,
"step": 1300
},
{
"epoch": 8.343949044585987,
"grad_norm": 0.12502700090408325,
"learning_rate": 7.043344285991012e-05,
"loss": 0.006,
"step": 1310
},
{
"epoch": 8.40764331210191,
"grad_norm": 0.07949739694595337,
"learning_rate": 6.907111499956439e-05,
"loss": 0.0056,
"step": 1320
},
{
"epoch": 8.471337579617835,
"grad_norm": 0.08610483258962631,
"learning_rate": 6.77151054537188e-05,
"loss": 0.0048,
"step": 1330
},
{
"epoch": 8.535031847133759,
"grad_norm": 0.08261114358901978,
"learning_rate": 6.636569123510027e-05,
"loss": 0.0047,
"step": 1340
},
{
"epoch": 8.598726114649681,
"grad_norm": 0.030890854075551033,
"learning_rate": 6.502314800910785e-05,
"loss": 0.0052,
"step": 1350
},
{
"epoch": 8.662420382165605,
"grad_norm": 0.07963161170482635,
"learning_rate": 6.368775003749816e-05,
"loss": 0.0099,
"step": 1360
},
{
"epoch": 8.726114649681529,
"grad_norm": 0.14875206351280212,
"learning_rate": 6.235977012235792e-05,
"loss": 0.006,
"step": 1370
},
{
"epoch": 8.789808917197453,
"grad_norm": 0.21349501609802246,
"learning_rate": 6.103947955037446e-05,
"loss": 0.0047,
"step": 1380
},
{
"epoch": 8.853503184713375,
"grad_norm": 0.05400541424751282,
"learning_rate": 5.972714803741577e-05,
"loss": 0.006,
"step": 1390
},
{
"epoch": 8.9171974522293,
"grad_norm": 0.14428143203258514,
"learning_rate": 5.842304367343161e-05,
"loss": 0.0095,
"step": 1400
},
{
"epoch": 8.980891719745223,
"grad_norm": 0.07769430428743362,
"learning_rate": 5.712743286768687e-05,
"loss": 0.0053,
"step": 1410
},
{
"epoch": 9.0,
"eval_loss": 0.05676256865262985,
"eval_runtime": 148.9241,
"eval_samples_per_second": 2.807,
"eval_steps_per_second": 0.356,
"step": 1413
},
{
"epoch": 9.044585987261147,
"grad_norm": 0.10749530047178268,
"learning_rate": 5.584058029433766e-05,
"loss": 0.0038,
"step": 1420
},
{
"epoch": 9.10828025477707,
"grad_norm": 0.016948334872722626,
"learning_rate": 5.4562748838362735e-05,
"loss": 0.0051,
"step": 1430
},
{
"epoch": 9.171974522292993,
"grad_norm": 0.008234160952270031,
"learning_rate": 5.329419954185965e-05,
"loss": 0.0043,
"step": 1440
},
{
"epoch": 9.235668789808917,
"grad_norm": 0.04994361847639084,
"learning_rate": 5.203519155071785e-05,
"loss": 0.0039,
"step": 1450
},
{
"epoch": 9.299363057324841,
"grad_norm": 0.037435177713632584,
"learning_rate": 5.078598206167912e-05,
"loss": 0.0033,
"step": 1460
},
{
"epoch": 9.363057324840764,
"grad_norm": 0.11694881319999695,
"learning_rate": 4.9546826269795765e-05,
"loss": 0.0036,
"step": 1470
},
{
"epoch": 9.426751592356688,
"grad_norm": 0.05153834447264671,
"learning_rate": 4.831797731629835e-05,
"loss": 0.0042,
"step": 1480
},
{
"epoch": 9.490445859872612,
"grad_norm": 0.09336938709020615,
"learning_rate": 4.709968623688254e-05,
"loss": 0.0028,
"step": 1490
},
{
"epoch": 9.554140127388536,
"grad_norm": 0.03732943907380104,
"learning_rate": 4.589220191042616e-05,
"loss": 0.0034,
"step": 1500
},
{
"epoch": 9.617834394904458,
"grad_norm": 0.14202427864074707,
"learning_rate": 4.469577100814705e-05,
"loss": 0.0031,
"step": 1510
},
{
"epoch": 9.681528662420382,
"grad_norm": 0.09861844778060913,
"learning_rate": 4.351063794321165e-05,
"loss": 0.003,
"step": 1520
},
{
"epoch": 9.745222929936306,
"grad_norm": 0.16652171313762665,
"learning_rate": 4.233704482080504e-05,
"loss": 0.0041,
"step": 1530
},
{
"epoch": 9.80891719745223,
"grad_norm": 0.05778292566537857,
"learning_rate": 4.11752313886726e-05,
"loss": 0.0042,
"step": 1540
},
{
"epoch": 9.872611464968152,
"grad_norm": 0.012399845756590366,
"learning_rate": 4.0025434988142766e-05,
"loss": 0.0037,
"step": 1550
},
{
"epoch": 9.936305732484076,
"grad_norm": 0.0798059031367302,
"learning_rate": 3.888789050564188e-05,
"loss": 0.0047,
"step": 1560
},
{
"epoch": 10.0,
"grad_norm": 0.04369504004716873,
"learning_rate": 3.776283032471044e-05,
"loss": 0.0029,
"step": 1570
},
{
"epoch": 10.0,
"eval_loss": 0.07112779468297958,
"eval_runtime": 148.9356,
"eval_samples_per_second": 2.807,
"eval_steps_per_second": 0.356,
"step": 1570
},
{
"epoch": 10.063694267515924,
"grad_norm": 0.02837546356022358,
"learning_rate": 3.6650484278530387e-05,
"loss": 0.0023,
"step": 1580
},
{
"epoch": 10.127388535031848,
"grad_norm": 0.00527458218857646,
"learning_rate": 3.5551079602973734e-05,
"loss": 0.0041,
"step": 1590
},
{
"epoch": 10.19108280254777,
"grad_norm": 0.013476898893713951,
"learning_rate": 3.446484089018153e-05,
"loss": 0.0028,
"step": 1600
},
{
"epoch": 10.254777070063694,
"grad_norm": 0.009121859446167946,
"learning_rate": 3.3391990042683055e-05,
"loss": 0.0032,
"step": 1610
},
{
"epoch": 10.318471337579618,
"grad_norm": 0.035027824342250824,
"learning_rate": 3.233274622806446e-05,
"loss": 0.0038,
"step": 1620
},
{
"epoch": 10.382165605095542,
"grad_norm": 0.05406291410326958,
"learning_rate": 3.1287325834195915e-05,
"loss": 0.0027,
"step": 1630
},
{
"epoch": 10.445859872611464,
"grad_norm": 0.021644996479153633,
"learning_rate": 3.025594242502684e-05,
"loss": 0.0022,
"step": 1640
},
{
"epoch": 10.509554140127388,
"grad_norm": 0.04471028223633766,
"learning_rate": 2.9238806696958087e-05,
"loss": 0.0034,
"step": 1650
},
{
"epoch": 10.573248407643312,
"grad_norm": 0.014225292019546032,
"learning_rate": 2.823612643579949e-05,
"loss": 0.002,
"step": 1660
},
{
"epoch": 10.636942675159236,
"grad_norm": 0.005709750112146139,
"learning_rate": 2.7248106474322554e-05,
"loss": 0.0021,
"step": 1670
},
{
"epoch": 10.700636942675159,
"grad_norm": 0.015249662101268768,
"learning_rate": 2.627494865041602e-05,
"loss": 0.002,
"step": 1680
},
{
"epoch": 10.764331210191083,
"grad_norm": 0.061606843024492264,
"learning_rate": 2.5316851765853344e-05,
"loss": 0.0035,
"step": 1690
},
{
"epoch": 10.828025477707007,
"grad_norm": 0.056455183774232864,
"learning_rate": 2.437401154568044e-05,
"loss": 0.0016,
"step": 1700
},
{
"epoch": 10.89171974522293,
"grad_norm": 0.050760120153427124,
"learning_rate": 2.3446620598232104e-05,
"loss": 0.0031,
"step": 1710
},
{
"epoch": 10.955414012738853,
"grad_norm": 0.0020009365398436785,
"learning_rate": 2.253486837578468e-05,
"loss": 0.0024,
"step": 1720
},
{
"epoch": 11.0,
"eval_loss": 0.07927798479795456,
"eval_runtime": 148.9334,
"eval_samples_per_second": 2.807,
"eval_steps_per_second": 0.356,
"step": 1727
},
{
"epoch": 11.019108280254777,
"grad_norm": 0.010104876011610031,
"learning_rate": 2.163894113585404e-05,
"loss": 0.0021,
"step": 1730
},
{
"epoch": 11.0828025477707,
"grad_norm": 0.00586892431601882,
"learning_rate": 2.075902190314578e-05,
"loss": 0.0018,
"step": 1740
},
{
"epoch": 11.146496815286625,
"grad_norm": 0.004691167734563351,
"learning_rate": 1.9895290432165935e-05,
"loss": 0.0021,
"step": 1750
},
{
"epoch": 11.210191082802547,
"grad_norm": 0.0037749160546809435,
"learning_rate": 1.904792317049996e-05,
"loss": 0.0016,
"step": 1760
},
{
"epoch": 11.273885350318471,
"grad_norm": 0.0812540128827095,
"learning_rate": 1.82170932227669e-05,
"loss": 0.0023,
"step": 1770
},
{
"epoch": 11.337579617834395,
"grad_norm": 0.03934706375002861,
"learning_rate": 1.740297031525674e-05,
"loss": 0.0027,
"step": 1780
},
{
"epoch": 11.401273885350319,
"grad_norm": 0.005765652749687433,
"learning_rate": 1.660572076125797e-05,
"loss": 0.0017,
"step": 1790
},
{
"epoch": 11.464968152866241,
"grad_norm": 0.006609324831515551,
"learning_rate": 1.5825507427081976e-05,
"loss": 0.0019,
"step": 1800
},
{
"epoch": 11.528662420382165,
"grad_norm": 0.046800799667835236,
"learning_rate": 1.5062489698792082e-05,
"loss": 0.0024,
"step": 1810
},
{
"epoch": 11.59235668789809,
"grad_norm": 0.0012369153555482626,
"learning_rate": 1.4316823449643257e-05,
"loss": 0.0015,
"step": 1820
},
{
"epoch": 11.656050955414013,
"grad_norm": 0.008111722767353058,
"learning_rate": 1.3588661008239412e-05,
"loss": 0.0023,
"step": 1830
},
{
"epoch": 11.719745222929935,
"grad_norm": 0.0041403137147426605,
"learning_rate": 1.2878151127415094e-05,
"loss": 0.0021,
"step": 1840
},
{
"epoch": 11.78343949044586,
"grad_norm": 0.19675485789775848,
"learning_rate": 1.2185438953847328e-05,
"loss": 0.0032,
"step": 1850
},
{
"epoch": 11.847133757961783,
"grad_norm": 0.006231395993381739,
"learning_rate": 1.1510665998404336e-05,
"loss": 0.0022,
"step": 1860
},
{
"epoch": 11.910828025477707,
"grad_norm": 0.0019039853941649199,
"learning_rate": 1.0853970107237088e-05,
"loss": 0.0028,
"step": 1870
},
{
"epoch": 11.97452229299363,
"grad_norm": 0.0023176763206720352,
"learning_rate": 1.0215485433619132e-05,
"loss": 0.0017,
"step": 1880
},
{
"epoch": 12.0,
"eval_loss": 0.08632908761501312,
"eval_runtime": 148.9624,
"eval_samples_per_second": 2.806,
"eval_steps_per_second": 0.356,
"step": 1884
},
{
"epoch": 12.038216560509554,
"grad_norm": 0.0012192321009933949,
"learning_rate": 9.595342410541209e-06,
"loss": 0.0017,
"step": 1890
},
{
"epoch": 12.101910828025478,
"grad_norm": 0.01771487295627594,
"learning_rate": 8.993667724065747e-06,
"loss": 0.002,
"step": 1900
},
{
"epoch": 12.165605095541402,
"grad_norm": 0.004799762275069952,
"learning_rate": 8.410584287446643e-06,
"loss": 0.0018,
"step": 1910
},
{
"epoch": 12.229299363057326,
"grad_norm": 0.0028903819620609283,
"learning_rate": 7.846211216020039e-06,
"loss": 0.0017,
"step": 1920
},
{
"epoch": 12.292993630573248,
"grad_norm": 0.002215326763689518,
"learning_rate": 7.3006638028707e-06,
"loss": 0.0018,
"step": 1930
},
{
"epoch": 12.356687898089172,
"grad_norm": 0.0046812682412564754,
"learning_rate": 6.77405349527942e-06,
"loss": 0.0022,
"step": 1940
},
{
"epoch": 12.420382165605096,
"grad_norm": 0.00792867224663496,
"learning_rate": 6.266487871955962e-06,
"loss": 0.0018,
"step": 1950
},
{
"epoch": 12.48407643312102,
"grad_norm": 0.0029917878564447165,
"learning_rate": 5.778070621062281e-06,
"loss": 0.0019,
"step": 1960
},
{
"epoch": 12.547770700636942,
"grad_norm": 0.0012242052471265197,
"learning_rate": 5.308901519030607e-06,
"loss": 0.0015,
"step": 1970
},
{
"epoch": 12.611464968152866,
"grad_norm": 0.0016652451595291495,
"learning_rate": 4.859076410180629e-06,
"loss": 0.0018,
"step": 1980
},
{
"epoch": 12.67515923566879,
"grad_norm": 0.0020181615836918354,
"learning_rate": 4.42868718713978e-06,
"loss": 0.0026,
"step": 1990
},
{
"epoch": 12.738853503184714,
"grad_norm": 0.01197089534252882,
"learning_rate": 4.017821772071084e-06,
"loss": 0.0018,
"step": 2000
},
{
"epoch": 12.802547770700636,
"grad_norm": 0.0036414351779967546,
"learning_rate": 3.6265640987119042e-06,
"loss": 0.0016,
"step": 2010
},
{
"epoch": 12.86624203821656,
"grad_norm": 0.005194537341594696,
"learning_rate": 3.2549940952274483e-06,
"loss": 0.0017,
"step": 2020
},
{
"epoch": 12.929936305732484,
"grad_norm": 0.0025584339164197445,
"learning_rate": 2.903187667882701e-06,
"loss": 0.0019,
"step": 2030
},
{
"epoch": 12.993630573248408,
"grad_norm": 0.08787062019109726,
"learning_rate": 2.5712166855359045e-06,
"loss": 0.0027,
"step": 2040
},
{
"epoch": 13.0,
"eval_loss": 0.09124071151018143,
"eval_runtime": 148.9651,
"eval_samples_per_second": 2.806,
"eval_steps_per_second": 0.356,
"step": 2041
},
{
"epoch": 13.05732484076433,
"grad_norm": 0.001067174132913351,
"learning_rate": 2.2591489649567587e-06,
"loss": 0.0019,
"step": 2050
},
{
"epoch": 13.121019108280255,
"grad_norm": 0.04148571938276291,
"learning_rate": 1.967048256972492e-06,
"loss": 0.0018,
"step": 2060
},
{
"epoch": 13.184713375796179,
"grad_norm": 0.0012596879387274384,
"learning_rate": 1.6949742334445018e-06,
"loss": 0.0017,
"step": 2070
},
{
"epoch": 13.248407643312103,
"grad_norm": 0.0008848529541864991,
"learning_rate": 1.4429824750782583e-06,
"loss": 0.0019,
"step": 2080
},
{
"epoch": 13.312101910828025,
"grad_norm": 0.004621226340532303,
"learning_rate": 1.211124460069013e-06,
"loss": 0.0022,
"step": 2090
},
{
"epoch": 13.375796178343949,
"grad_norm": 0.0037146620452404022,
"learning_rate": 9.99447553585542e-07,
"loss": 0.0018,
"step": 2100
},
{
"epoch": 13.439490445859873,
"grad_norm": 0.0031733817886561155,
"learning_rate": 8.079949980941526e-07,
"loss": 0.0016,
"step": 2110
},
{
"epoch": 13.503184713375797,
"grad_norm": 0.04481673985719681,
"learning_rate": 6.368059045248842e-07,
"loss": 0.0021,
"step": 2120
},
{
"epoch": 13.566878980891719,
"grad_norm": 0.008498159237205982,
"learning_rate": 4.859152442817205e-07,
"loss": 0.002,
"step": 2130
},
{
"epoch": 13.630573248407643,
"grad_norm": 0.003298922209069133,
"learning_rate": 3.5535384209846036e-07,
"loss": 0.0017,
"step": 2140
},
{
"epoch": 13.694267515923567,
"grad_norm": 0.003794416319578886,
"learning_rate": 2.4514836974165454e-07,
"loss": 0.0016,
"step": 2150
},
{
"epoch": 13.757961783439491,
"grad_norm": 0.005654670298099518,
"learning_rate": 1.5532134056196468e-07,
"loss": 0.0017,
"step": 2160
},
{
"epoch": 13.821656050955415,
"grad_norm": 0.0006748574669472873,
"learning_rate": 8.589110489505281e-08,
"loss": 0.0019,
"step": 2170
},
{
"epoch": 13.885350318471337,
"grad_norm": 0.05673813074827194,
"learning_rate": 3.687184631284701e-08,
"loss": 0.0023,
"step": 2180
},
{
"epoch": 13.949044585987261,
"grad_norm": 0.044436752796173096,
"learning_rate": 8.273578726014642e-09,
"loss": 0.002,
"step": 2190
},
{
"epoch": 14.0,
"eval_loss": 0.09217014908790588,
"eval_runtime": 148.9613,
"eval_samples_per_second": 2.806,
"eval_steps_per_second": 0.356,
"step": 2198
}
],
"logging_steps": 10,
"max_steps": 2198,
"num_input_tokens_seen": 0,
"num_train_epochs": 14,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.05580823441408e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}