{
"best_metric": 0.09926149994134903,
"best_model_checkpoint": "./xlam_lora_new_2560_1_delete_over_size_3epoch_multi/checkpoint-1384",
"epoch": 2.9994592321095954,
"eval_steps": 173,
"global_step": 1560,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.019227302769933306,
"grad_norm": 0.8050442337989807,
"learning_rate": 3.205128205128205e-06,
"loss": 0.6642,
"step": 10
},
{
"epoch": 0.03845460553986661,
"grad_norm": 0.45196670293807983,
"learning_rate": 6.41025641025641e-06,
"loss": 0.6377,
"step": 20
},
{
"epoch": 0.05768190830979992,
"grad_norm": 0.47538116574287415,
"learning_rate": 9.615384615384616e-06,
"loss": 0.6205,
"step": 30
},
{
"epoch": 0.07690921107973323,
"grad_norm": 0.34218236804008484,
"learning_rate": 1.282051282051282e-05,
"loss": 0.5579,
"step": 40
},
{
"epoch": 0.09613651384966652,
"grad_norm": 0.38529354333877563,
"learning_rate": 1.602564102564103e-05,
"loss": 0.4393,
"step": 50
},
{
"epoch": 0.11536381661959984,
"grad_norm": 0.3676348924636841,
"learning_rate": 1.923076923076923e-05,
"loss": 0.3965,
"step": 60
},
{
"epoch": 0.13459111938953314,
"grad_norm": 0.3427989184856415,
"learning_rate": 2.2435897435897437e-05,
"loss": 0.3488,
"step": 70
},
{
"epoch": 0.15381842215946645,
"grad_norm": 0.30555886030197144,
"learning_rate": 2.4999887657859027e-05,
"loss": 0.2224,
"step": 80
},
{
"epoch": 0.17304572492939974,
"grad_norm": 0.3501119315624237,
"learning_rate": 2.4995955894949523e-05,
"loss": 0.248,
"step": 90
},
{
"epoch": 0.19227302769933305,
"grad_norm": 0.36164987087249756,
"learning_rate": 2.4986409044149163e-05,
"loss": 0.2322,
"step": 100
},
{
"epoch": 0.21150033046926636,
"grad_norm": 0.3375028967857361,
"learning_rate": 2.4971251395358342e-05,
"loss": 0.2427,
"step": 110
},
{
"epoch": 0.23072763323919968,
"grad_norm": 0.3321882486343384,
"learning_rate": 2.495048975970308e-05,
"loss": 0.1967,
"step": 120
},
{
"epoch": 0.24995493600913296,
"grad_norm": 0.2828320562839508,
"learning_rate": 2.492413346647437e-05,
"loss": 0.1636,
"step": 130
},
{
"epoch": 0.2691822387790663,
"grad_norm": 0.3430372476577759,
"learning_rate": 2.4892194358936095e-05,
"loss": 0.2041,
"step": 140
},
{
"epoch": 0.28840954154899956,
"grad_norm": 0.3393559455871582,
"learning_rate": 2.4854686789003173e-05,
"loss": 0.1911,
"step": 150
},
{
"epoch": 0.3076368443189329,
"grad_norm": 0.36908936500549316,
"learning_rate": 2.4811627610792543e-05,
"loss": 0.2025,
"step": 160
},
{
"epoch": 0.3268641470888662,
"grad_norm": 0.38679710030555725,
"learning_rate": 2.4763036173049677e-05,
"loss": 0.1566,
"step": 170
},
{
"epoch": 0.3326323379198462,
"eval_loss": 0.14381718635559082,
"eval_runtime": 202.8561,
"eval_samples_per_second": 5.285,
"eval_steps_per_second": 5.285,
"step": 173
},
{
"epoch": 0.34609144985879947,
"grad_norm": 0.39648687839508057,
"learning_rate": 2.4708934310454207e-05,
"loss": 0.1943,
"step": 180
},
{
"epoch": 0.3653187526287328,
"grad_norm": 0.3979399800300598,
"learning_rate": 2.4649346333808458e-05,
"loss": 0.1594,
"step": 190
},
{
"epoch": 0.3845460553986661,
"grad_norm": 0.34854283928871155,
"learning_rate": 2.458429901911331e-05,
"loss": 0.1683,
"step": 200
},
{
"epoch": 0.40377335816859944,
"grad_norm": 0.26675811409950256,
"learning_rate": 2.4513821595536356e-05,
"loss": 0.1616,
"step": 210
},
{
"epoch": 0.4230006609385327,
"grad_norm": 0.4399104118347168,
"learning_rate": 2.44379457322777e-05,
"loss": 0.1664,
"step": 220
},
{
"epoch": 0.442227963708466,
"grad_norm": 0.5316939353942871,
"learning_rate": 2.4356705524339317e-05,
"loss": 0.1745,
"step": 230
},
{
"epoch": 0.46145526647839935,
"grad_norm": 0.5996547341346741,
"learning_rate": 2.4270137477204408e-05,
"loss": 0.1753,
"step": 240
},
{
"epoch": 0.48068256924833264,
"grad_norm": 0.4330001175403595,
"learning_rate": 2.417828049043353e-05,
"loss": 0.1997,
"step": 250
},
{
"epoch": 0.4999098720182659,
"grad_norm": 0.4255751073360443,
"learning_rate": 2.4081175840185022e-05,
"loss": 0.1728,
"step": 260
},
{
"epoch": 0.5191371747881992,
"grad_norm": 0.536382257938385,
"learning_rate": 2.3978867160667457e-05,
"loss": 0.147,
"step": 270
},
{
"epoch": 0.5383644775581325,
"grad_norm": 0.5623698830604553,
"learning_rate": 2.3871400424532493e-05,
"loss": 0.1863,
"step": 280
},
{
"epoch": 0.5575917803280659,
"grad_norm": 0.49679550528526306,
"learning_rate": 2.375882392221695e-05,
"loss": 0.1685,
"step": 290
},
{
"epoch": 0.5768190830979991,
"grad_norm": 0.5784851908683777,
"learning_rate": 2.36411882402434e-05,
"loss": 0.1506,
"step": 300
},
{
"epoch": 0.5960463858679325,
"grad_norm": 0.6098183393478394,
"learning_rate": 2.3518546238489e-05,
"loss": 0.1565,
"step": 310
},
{
"epoch": 0.6152736886378658,
"grad_norm": 0.5198598504066467,
"learning_rate": 2.339095302643273e-05,
"loss": 0.1433,
"step": 320
},
{
"epoch": 0.634500991407799,
"grad_norm": 0.5796005129814148,
"learning_rate": 2.325846593839188e-05,
"loss": 0.1668,
"step": 330
},
{
"epoch": 0.6537282941777324,
"grad_norm": 0.6006646752357483,
"learning_rate": 2.312114450775869e-05,
"loss": 0.1505,
"step": 340
},
{
"epoch": 0.6652646758396924,
"eval_loss": 0.1198095753788948,
"eval_runtime": 202.8938,
"eval_samples_per_second": 5.284,
"eval_steps_per_second": 5.284,
"step": 346
},
{
"epoch": 0.6729555969476657,
"grad_norm": 0.5787773728370667,
"learning_rate": 2.2979050440248896e-05,
"loss": 0.1442,
"step": 350
},
{
"epoch": 0.6921828997175989,
"grad_norm": 0.5230283141136169,
"learning_rate": 2.2832247586174118e-05,
"loss": 0.1555,
"step": 360
},
{
"epoch": 0.7114102024875323,
"grad_norm": 0.5551069378852844,
"learning_rate": 2.2680801911750558e-05,
"loss": 0.1422,
"step": 370
},
{
"epoch": 0.7306375052574656,
"grad_norm": 0.5769614577293396,
"learning_rate": 2.2524781469456928e-05,
"loss": 0.165,
"step": 380
},
{
"epoch": 0.7498648080273989,
"grad_norm": 0.6609200239181519,
"learning_rate": 2.2364256367454922e-05,
"loss": 0.161,
"step": 390
},
{
"epoch": 0.7690921107973322,
"grad_norm": 0.5530131459236145,
"learning_rate": 2.2199298738085907e-05,
"loss": 0.1709,
"step": 400
},
{
"epoch": 0.7883194135672655,
"grad_norm": 0.7019795775413513,
"learning_rate": 2.2029982705458107e-05,
"loss": 0.1471,
"step": 410
},
{
"epoch": 0.8075467163371989,
"grad_norm": 0.5327528715133667,
"learning_rate": 2.1856384352138765e-05,
"loss": 0.1913,
"step": 420
},
{
"epoch": 0.8267740191071321,
"grad_norm": 0.5548112988471985,
"learning_rate": 2.1678581684966235e-05,
"loss": 0.1509,
"step": 430
},
{
"epoch": 0.8460013218770654,
"grad_norm": 0.51619553565979,
"learning_rate": 2.149665459999743e-05,
"loss": 0.1341,
"step": 440
},
{
"epoch": 0.8652286246469988,
"grad_norm": 0.6642457842826843,
"learning_rate": 2.1310684846606346e-05,
"loss": 0.1458,
"step": 450
},
{
"epoch": 0.884455927416932,
"grad_norm": 0.48370271921157837,
"learning_rate": 2.1120755990749762e-05,
"loss": 0.1584,
"step": 460
},
{
"epoch": 0.9036832301868654,
"grad_norm": 0.8130201697349548,
"learning_rate": 2.092695337741671e-05,
"loss": 0.1389,
"step": 470
},
{
"epoch": 0.9229105329567987,
"grad_norm": 0.4986889958381653,
"learning_rate": 2.0729364092278456e-05,
"loss": 0.1263,
"step": 480
},
{
"epoch": 0.9421378357267319,
"grad_norm": 0.6791219711303711,
"learning_rate": 2.052807692255638e-05,
"loss": 0.1562,
"step": 490
},
{
"epoch": 0.9613651384966653,
"grad_norm": 0.6069239974021912,
"learning_rate": 2.0323182317125198e-05,
"loss": 0.1296,
"step": 500
},
{
"epoch": 0.9805924412665986,
"grad_norm": 0.6993957161903381,
"learning_rate": 2.011477234586957e-05,
"loss": 0.1695,
"step": 510
},
{
"epoch": 0.9978970137595385,
"eval_loss": 0.11108512431383133,
"eval_runtime": 202.9151,
"eval_samples_per_second": 5.283,
"eval_steps_per_second": 5.283,
"step": 519
},
{
"epoch": 0.9998197440365318,
"grad_norm": 0.5495030283927917,
"learning_rate": 1.9902940658312253e-05,
"loss": 0.1512,
"step": 520
},
{
"epoch": 1.0190470468064652,
"grad_norm": 0.5100754499435425,
"learning_rate": 1.968778244153246e-05,
"loss": 0.1088,
"step": 530
},
{
"epoch": 1.0382743495763984,
"grad_norm": 0.6836853623390198,
"learning_rate": 1.9469394377393335e-05,
"loss": 0.1524,
"step": 540
},
{
"epoch": 1.0575016523463319,
"grad_norm": 0.5304776430130005,
"learning_rate": 1.9247874599097714e-05,
"loss": 0.1239,
"step": 550
},
{
"epoch": 1.076728955116265,
"grad_norm": 0.6995298862457275,
"learning_rate": 1.9023322647091736e-05,
"loss": 0.1203,
"step": 560
},
{
"epoch": 1.0959562578861983,
"grad_norm": 0.579207181930542,
"learning_rate": 1.8795839424336097e-05,
"loss": 0.134,
"step": 570
},
{
"epoch": 1.1151835606561318,
"grad_norm": 0.4746134877204895,
"learning_rate": 1.8565527150965077e-05,
"loss": 0.1344,
"step": 580
},
{
"epoch": 1.134410863426065,
"grad_norm": 0.8127744793891907,
"learning_rate": 1.8332489318353655e-05,
"loss": 0.1157,
"step": 590
},
{
"epoch": 1.1536381661959982,
"grad_norm": 0.6949151158332825,
"learning_rate": 1.809683064261343e-05,
"loss": 0.1197,
"step": 600
},
{
"epoch": 1.1728654689659317,
"grad_norm": 0.6869731545448303,
"learning_rate": 1.7858657017538178e-05,
"loss": 0.1392,
"step": 610
},
{
"epoch": 1.192092771735865,
"grad_norm": 0.7461158037185669,
"learning_rate": 1.7618075467020213e-05,
"loss": 0.1262,
"step": 620
},
{
"epoch": 1.2113200745057981,
"grad_norm": 0.5442166924476624,
"learning_rate": 1.7375194096958946e-05,
"loss": 0.1258,
"step": 630
},
{
"epoch": 1.2305473772757316,
"grad_norm": 0.7670741081237793,
"learning_rate": 1.713012204668325e-05,
"loss": 0.1204,
"step": 640
},
{
"epoch": 1.2497746800456648,
"grad_norm": 0.3919640779495239,
"learning_rate": 1.6882969439909434e-05,
"loss": 0.1444,
"step": 650
},
{
"epoch": 1.269001982815598,
"grad_norm": 0.6234434247016907,
"learning_rate": 1.663384733525686e-05,
"loss": 0.1245,
"step": 660
},
{
"epoch": 1.2882292855855315,
"grad_norm": 0.7237009406089783,
"learning_rate": 1.638286767634353e-05,
"loss": 0.1258,
"step": 670
},
{
"epoch": 1.3074565883554647,
"grad_norm": 0.6398624181747437,
"learning_rate": 1.613014324148392e-05,
"loss": 0.1519,
"step": 680
},
{
"epoch": 1.326683891125398,
"grad_norm": 0.7676591873168945,
"learning_rate": 1.5875787593011784e-05,
"loss": 0.1545,
"step": 690
},
{
"epoch": 1.3305293516793848,
"eval_loss": 0.10604555904865265,
"eval_runtime": 203.0173,
"eval_samples_per_second": 5.28,
"eval_steps_per_second": 5.28,
"step": 692
},
{
"epoch": 1.3459111938953314,
"grad_norm": 0.5583875775337219,
"learning_rate": 1.5619915026250646e-05,
"loss": 0.1141,
"step": 700
},
{
"epoch": 1.3651384966652647,
"grad_norm": 0.5790243148803711,
"learning_rate": 1.536264051815491e-05,
"loss": 0.1326,
"step": 710
},
{
"epoch": 1.3843657994351979,
"grad_norm": 0.7467628121376038,
"learning_rate": 1.5104079675644706e-05,
"loss": 0.1439,
"step": 720
},
{
"epoch": 1.4035931022051313,
"grad_norm": 0.9867657423019409,
"learning_rate": 1.4844348683657616e-05,
"loss": 0.1385,
"step": 730
},
{
"epoch": 1.4228204049750646,
"grad_norm": 0.7909297347068787,
"learning_rate": 1.4583564252940735e-05,
"loss": 0.1259,
"step": 740
},
{
"epoch": 1.4420477077449978,
"grad_norm": 0.6159791350364685,
"learning_rate": 1.432184356760637e-05,
"loss": 0.1126,
"step": 750
},
{
"epoch": 1.4612750105149312,
"grad_norm": 0.6234619617462158,
"learning_rate": 1.4059304232475098e-05,
"loss": 0.1144,
"step": 760
},
{
"epoch": 1.4805023132848645,
"grad_norm": 0.7142959833145142,
"learning_rate": 1.3796064220229765e-05,
"loss": 0.1249,
"step": 770
},
{
"epoch": 1.4997296160547977,
"grad_norm": 0.6258341073989868,
"learning_rate": 1.3532241818404156e-05,
"loss": 0.1321,
"step": 780
},
{
"epoch": 1.5189569188247312,
"grad_norm": 0.5723307728767395,
"learning_rate": 1.326795557623022e-05,
"loss": 0.1193,
"step": 790
},
{
"epoch": 1.5381842215946644,
"grad_norm": 0.7454131841659546,
"learning_rate": 1.300332425136769e-05,
"loss": 0.1281,
"step": 800
},
{
"epoch": 1.5574115243645976,
"grad_norm": 0.5975070595741272,
"learning_rate": 1.273846675654003e-05,
"loss": 0.1321,
"step": 810
},
{
"epoch": 1.576638827134531,
"grad_norm": 0.7056507468223572,
"learning_rate": 1.2473502106100723e-05,
"loss": 0.1444,
"step": 820
},
{
"epoch": 1.5958661299044643,
"grad_norm": 0.7889280915260315,
"learning_rate": 1.2208549362553885e-05,
"loss": 0.1226,
"step": 830
},
{
"epoch": 1.6150934326743975,
"grad_norm": 0.7041313648223877,
"learning_rate": 1.194372758305325e-05,
"loss": 0.1316,
"step": 840
},
{
"epoch": 1.634320735444331,
"grad_norm": 0.7797935605049133,
"learning_rate": 1.1679155765903524e-05,
"loss": 0.132,
"step": 850
},
{
"epoch": 1.6535480382142642,
"grad_norm": 0.6426231861114502,
"learning_rate": 1.1414952797088248e-05,
"loss": 0.1101,
"step": 860
},
{
"epoch": 1.663161689599231,
"eval_loss": 0.10293085128068924,
"eval_runtime": 203.1567,
"eval_samples_per_second": 5.277,
"eval_steps_per_second": 5.277,
"step": 865
},
{
"epoch": 1.6727753409841974,
"grad_norm": 1.0461760759353638,
"learning_rate": 1.1151237396848058e-05,
"loss": 0.128,
"step": 870
},
{
"epoch": 1.692002643754131,
"grad_norm": 0.8692240118980408,
"learning_rate": 1.088812806633349e-05,
"loss": 0.1114,
"step": 880
},
{
"epoch": 1.7112299465240641,
"grad_norm": 0.5583866238594055,
"learning_rate": 1.0625743034356183e-05,
"loss": 0.1309,
"step": 890
},
{
"epoch": 1.7304572492939974,
"grad_norm": 0.5476118922233582,
"learning_rate": 1.0364200204262473e-05,
"loss": 0.1156,
"step": 900
},
{
"epoch": 1.7496845520639308,
"grad_norm": 0.8960713148117065,
"learning_rate": 1.0103617100953274e-05,
"loss": 0.1305,
"step": 910
},
{
"epoch": 1.768911854833864,
"grad_norm": 0.6927953958511353,
"learning_rate": 9.84411081807393e-06,
"loss": 0.1245,
"step": 920
},
{
"epoch": 1.7881391576037973,
"grad_norm": 0.5891989469528198,
"learning_rate": 9.585797965397949e-06,
"loss": 0.1125,
"step": 930
},
{
"epoch": 1.8073664603737307,
"grad_norm": 0.8319947123527527,
"learning_rate": 9.328794616428092e-06,
"loss": 0.1462,
"step": 940
},
{
"epoch": 1.826593763143664,
"grad_norm": 0.7439499497413635,
"learning_rate": 9.073216256238485e-06,
"loss": 0.1167,
"step": 950
},
{
"epoch": 1.8458210659135972,
"grad_norm": 0.7593638896942139,
"learning_rate": 8.8191777295811e-06,
"loss": 0.1356,
"step": 960
},
{
"epoch": 1.8650483686835306,
"grad_norm": 0.871376097202301,
"learning_rate": 8.56679318928e-06,
"loss": 0.1173,
"step": 970
},
{
"epoch": 1.884275671453464,
"grad_norm": 0.8772872090339661,
"learning_rate": 8.31617604493651e-06,
"loss": 0.1347,
"step": 980
},
{
"epoch": 1.903502974223397,
"grad_norm": 0.6309168934822083,
"learning_rate": 8.067438911968305e-06,
"loss": 0.1382,
"step": 990
},
{
"epoch": 1.9227302769933305,
"grad_norm": 0.775113046169281,
"learning_rate": 7.820693561005429e-06,
"loss": 0.1368,
"step": 1000
},
{
"epoch": 1.941957579763264,
"grad_norm": 0.9096739888191223,
"learning_rate": 7.576050867665876e-06,
"loss": 0.1263,
"step": 1010
},
{
"epoch": 1.961184882533197,
"grad_norm": 0.7637848258018494,
"learning_rate": 7.333620762733376e-06,
"loss": 0.1148,
"step": 1020
},
{
"epoch": 1.9804121853031305,
"grad_norm": 0.8084997534751892,
"learning_rate": 7.0935121827597245e-06,
"loss": 0.1457,
"step": 1030
},
{
"epoch": 1.995794027519077,
"eval_loss": 0.10069960355758667,
"eval_runtime": 203.0573,
"eval_samples_per_second": 5.279,
"eval_steps_per_second": 5.279,
"step": 1038
},
{
"epoch": 1.999639488073064,
"grad_norm": 1.0884274244308472,
"learning_rate": 6.855833021113886e-06,
"loss": 0.1641,
"step": 1040
},
{
"epoch": 2.018866790842997,
"grad_norm": 0.702237069606781,
"learning_rate": 6.620690079499835e-06,
"loss": 0.1159,
"step": 1050
},
{
"epoch": 2.0380940936129304,
"grad_norm": 0.6377178430557251,
"learning_rate": 6.388189019964976e-06,
"loss": 0.1103,
"step": 1060
},
{
"epoch": 2.057321396382864,
"grad_norm": 0.8843504786491394,
"learning_rate": 6.158434317420636e-06,
"loss": 0.1178,
"step": 1070
},
{
"epoch": 2.076548699152797,
"grad_norm": 0.42746174335479736,
"learning_rate": 5.931529212695996e-06,
"loss": 0.1143,
"step": 1080
},
{
"epoch": 2.0957760019227303,
"grad_norm": 0.7449749708175659,
"learning_rate": 5.70757566614661e-06,
"loss": 0.1262,
"step": 1090
},
{
"epoch": 2.1150033046926637,
"grad_norm": 0.6538805961608887,
"learning_rate": 5.48667431183824e-06,
"loss": 0.1344,
"step": 1100
},
{
"epoch": 2.1342306074625967,
"grad_norm": 0.8034993410110474,
"learning_rate": 5.268924412326709e-06,
"loss": 0.1447,
"step": 1110
},
{
"epoch": 2.15345791023253,
"grad_norm": 0.7438477277755737,
"learning_rate": 5.054423814054049e-06,
"loss": 0.1082,
"step": 1120
},
{
"epoch": 2.1726852130024636,
"grad_norm": 0.5646623373031616,
"learning_rate": 4.843268903380932e-06,
"loss": 0.1199,
"step": 1130
},
{
"epoch": 2.1919125157723967,
"grad_norm": 0.9965047240257263,
"learning_rate": 4.6355545632752575e-06,
"loss": 0.1303,
"step": 1140
},
{
"epoch": 2.21113981854233,
"grad_norm": 0.8709131479263306,
"learning_rate": 4.4313741306762495e-06,
"loss": 0.1107,
"step": 1150
},
{
"epoch": 2.2303671213122636,
"grad_norm": 0.6653530597686768,
"learning_rate": 4.230819354553279e-06,
"loss": 0.1053,
"step": 1160
},
{
"epoch": 2.2495944240821966,
"grad_norm": 0.766173243522644,
"learning_rate": 4.033980354678239e-06,
"loss": 0.1017,
"step": 1170
},
{
"epoch": 2.26882172685213,
"grad_norm": 0.5112572312355042,
"learning_rate": 3.840945581130008e-06,
"loss": 0.109,
"step": 1180
},
{
"epoch": 2.2880490296220635,
"grad_norm": 0.8744060397148132,
"learning_rate": 3.651801774549213e-06,
"loss": 0.1026,
"step": 1190
},
{
"epoch": 2.3072763323919965,
"grad_norm": 0.8215727806091309,
"learning_rate": 3.4666339271610836e-06,
"loss": 0.1058,
"step": 1200
},
{
"epoch": 2.32650363516193,
"grad_norm": 0.6597920656204224,
"learning_rate": 3.285525244584017e-06,
"loss": 0.1378,
"step": 1210
},
{
"epoch": 2.3284263654389235,
"eval_loss": 0.10013294219970703,
"eval_runtime": 203.5302,
"eval_samples_per_second": 5.267,
"eval_steps_per_second": 5.267,
"step": 1211
},
{
"epoch": 2.3457309379318634,
"grad_norm": 0.7206103205680847,
"learning_rate": 3.108557108440914e-06,
"loss": 0.1028,
"step": 1220
},
{
"epoch": 2.3649582407017964,
"grad_norm": 0.968497097492218,
"learning_rate": 2.9358090397901634e-06,
"loss": 0.1345,
"step": 1230
},
{
"epoch": 2.38418554347173,
"grad_norm": 0.7522798180580139,
"learning_rate": 2.767358663392658e-06,
"loss": 0.1029,
"step": 1240
},
{
"epoch": 2.4034128462416633,
"grad_norm": 0.8699542284011841,
"learning_rate": 2.6032816728309166e-06,
"loss": 0.1181,
"step": 1250
},
{
"epoch": 2.4226401490115963,
"grad_norm": 0.8779841661453247,
"learning_rate": 2.4436517964960005e-06,
"loss": 0.1028,
"step": 1260
},
{
"epoch": 2.4418674517815298,
"grad_norm": 0.6922764182090759,
"learning_rate": 2.2885407644574696e-06,
"loss": 0.1148,
"step": 1270
},
{
"epoch": 2.461094754551463,
"grad_norm": 0.7528237700462341,
"learning_rate": 2.1380182762313238e-06,
"loss": 0.1128,
"step": 1280
},
{
"epoch": 2.480322057321396,
"grad_norm": 0.8349286913871765,
"learning_rate": 1.992151969460333e-06,
"loss": 0.1027,
"step": 1290
},
{
"epoch": 2.4995493600913297,
"grad_norm": 0.8040717244148254,
"learning_rate": 1.8510073895209131e-06,
"loss": 0.1001,
"step": 1300
},
{
"epoch": 2.518776662861263,
"grad_norm": 0.8065551519393921,
"learning_rate": 1.7146479600701565e-06,
"loss": 0.1454,
"step": 1310
},
{
"epoch": 2.538003965631196,
"grad_norm": 0.7855721712112427,
"learning_rate": 1.5831349545462461e-06,
"loss": 0.1063,
"step": 1320
},
{
"epoch": 2.5572312684011296,
"grad_norm": 0.9087608456611633,
"learning_rate": 1.4565274686351022e-06,
"loss": 0.1155,
"step": 1330
},
{
"epoch": 2.576458571171063,
"grad_norm": 0.49701324105262756,
"learning_rate": 1.334882393715585e-06,
"loss": 0.1001,
"step": 1340
},
{
"epoch": 2.5956858739409965,
"grad_norm": 0.7943114638328552,
"learning_rate": 1.2182543912952178e-06,
"loss": 0.1107,
"step": 1350
},
{
"epoch": 2.6149131767109295,
"grad_norm": 0.8685261607170105,
"learning_rate": 1.1066958684479074e-06,
"loss": 0.1209,
"step": 1360
},
{
"epoch": 2.634140479480863,
"grad_norm": 1.0667730569839478,
"learning_rate": 1.0002569542646973e-06,
"loss": 0.1361,
"step": 1370
},
{
"epoch": 2.653367782250796,
"grad_norm": 0.6879278421401978,
"learning_rate": 8.989854773281486e-07,
"loss": 0.0925,
"step": 1380
},
{
"epoch": 2.6610587033587696,
"eval_loss": 0.09926149994134903,
"eval_runtime": 203.0153,
"eval_samples_per_second": 5.28,
"eval_steps_per_second": 5.28,
"step": 1384
},
{
"epoch": 2.6725950850207294,
"grad_norm": 0.7204756736755371,
"learning_rate": 8.029269442204348e-07,
"loss": 0.1148,
"step": 1390
},
{
"epoch": 2.691822387790663,
"grad_norm": 0.834997832775116,
"learning_rate": 7.121245190748708e-07,
"loss": 0.0918,
"step": 1400
},
{
"epoch": 2.7110496905605963,
"grad_norm": 0.8163384795188904,
"learning_rate": 6.266190041799805e-07,
"loss": 0.1345,
"step": 1410
},
{
"epoch": 2.7302769933305293,
"grad_norm": 0.6108123660087585,
"learning_rate": 5.464488216449154e-07,
"loss": 0.1235,
"step": 1420
},
{
"epoch": 2.7495042961004628,
"grad_norm": 0.8302232027053833,
"learning_rate": 4.716499961343698e-07,
"loss": 0.1163,
"step": 1430
},
{
"epoch": 2.7687315988703958,
"grad_norm": 0.670668363571167,
"learning_rate": 4.022561386808177e-07,
"loss": 0.1103,
"step": 1440
},
{
"epoch": 2.7879589016403292,
"grad_norm": 0.7220197319984436,
"learning_rate": 3.3829843158131175e-07,
"loss": 0.1228,
"step": 1450
},
{
"epoch": 2.8071862044102627,
"grad_norm": 0.5018804669380188,
"learning_rate": 2.798056143856462e-07,
"loss": 0.1225,
"step": 1460
},
{
"epoch": 2.826413507180196,
"grad_norm": 0.5343906283378601,
"learning_rate": 2.268039709821687e-07,
"loss": 0.0918,
"step": 1470
},
{
"epoch": 2.845640809950129,
"grad_norm": 0.6775656938552856,
"learning_rate": 1.7931731778705052e-07,
"loss": 0.0903,
"step": 1480
},
{
"epoch": 2.8648681127200626,
"grad_norm": 0.7841689586639404,
"learning_rate": 1.373669930423288e-07,
"loss": 0.1308,
"step": 1490
},
{
"epoch": 2.8840954154899956,
"grad_norm": 0.8570185303688049,
"learning_rate": 1.0097184722750592e-07,
"loss": 0.1287,
"step": 1500
},
{
"epoch": 2.903322718259929,
"grad_norm": 0.635200023651123,
"learning_rate": 7.014823458905001e-08,
"loss": 0.1011,
"step": 1510
},
{
"epoch": 2.9225500210298625,
"grad_norm": 0.7127873301506042,
"learning_rate": 4.4910005791570786e-08,
"loss": 0.1345,
"step": 1520
},
{
"epoch": 2.941777323799796,
"grad_norm": 0.9114808440208435,
"learning_rate": 2.526850169399103e-08,
"loss": 0.1132,
"step": 1530
},
{
"epoch": 2.961004626569729,
"grad_norm": 0.7554405927658081,
"learning_rate": 1.1232548253503616e-08,
"loss": 0.1091,
"step": 1540
},
{
"epoch": 2.9802319293396624,
"grad_norm": 0.7547165155410767,
"learning_rate": 2.8084525596064337e-09,
"loss": 0.0944,
"step": 1550
},
{
"epoch": 2.9936910412786157,
"eval_loss": 0.09935057163238525,
"eval_runtime": 203.0468,
"eval_samples_per_second": 5.28,
"eval_steps_per_second": 5.28,
"step": 1557
},
{
"epoch": 2.9994592321095954,
"grad_norm": 0.7488301992416382,
"learning_rate": 0.0,
"loss": 0.1237,
"step": 1560
},
{
"epoch": 2.9994592321095954,
"step": 1560,
"total_flos": 1.3623219564340838e+18,
"train_loss": 0.15373969880434182,
"train_runtime": 33993.1903,
"train_samples_per_second": 1.469,
"train_steps_per_second": 0.046
}
],
"logging_steps": 10,
"max_steps": 1560,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 173,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3623219564340838e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}