{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1923,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015600624024960999,
"grad_norm": 12.956160545349121,
"learning_rate": 2.7835051546391753e-06,
"loss": 1.1012,
"step": 10
},
{
"epoch": 0.031201248049921998,
"grad_norm": 12.30282974243164,
"learning_rate": 5.876288659793814e-06,
"loss": 0.8156,
"step": 20
},
{
"epoch": 0.046801872074883,
"grad_norm": 9.357769012451172,
"learning_rate": 8.969072164948454e-06,
"loss": 0.7969,
"step": 30
},
{
"epoch": 0.062402496099843996,
"grad_norm": 3.8810768127441406,
"learning_rate": 1.2061855670103093e-05,
"loss": 0.3807,
"step": 40
},
{
"epoch": 0.078003120124805,
"grad_norm": 3.557420253753662,
"learning_rate": 1.5154639175257731e-05,
"loss": 0.3692,
"step": 50
},
{
"epoch": 0.093603744149766,
"grad_norm": 1.7188315391540527,
"learning_rate": 1.824742268041237e-05,
"loss": 0.2744,
"step": 60
},
{
"epoch": 0.10920436817472699,
"grad_norm": 1.362113118171692,
"learning_rate": 2.134020618556701e-05,
"loss": 0.2794,
"step": 70
},
{
"epoch": 0.12480499219968799,
"grad_norm": 3.378025770187378,
"learning_rate": 2.443298969072165e-05,
"loss": 0.2633,
"step": 80
},
{
"epoch": 0.14040561622464898,
"grad_norm": 1.2982730865478516,
"learning_rate": 2.7525773195876287e-05,
"loss": 0.2838,
"step": 90
},
{
"epoch": 0.15600624024961,
"grad_norm": 1.5544312000274658,
"learning_rate": 2.9999911198761025e-05,
"loss": 0.2872,
"step": 100
},
{
"epoch": 0.17160686427457097,
"grad_norm": 2.317873001098633,
"learning_rate": 2.999680326579471e-05,
"loss": 0.2635,
"step": 110
},
{
"epoch": 0.187207488299532,
"grad_norm": 1.5369235277175903,
"learning_rate": 2.998925632224497e-05,
"loss": 0.2571,
"step": 120
},
{
"epoch": 0.20280811232449297,
"grad_norm": 1.3865573406219482,
"learning_rate": 2.9977272601985376e-05,
"loss": 0.2297,
"step": 130
},
{
"epoch": 0.21840873634945399,
"grad_norm": 1.2493661642074585,
"learning_rate": 2.9960855652162606e-05,
"loss": 0.2551,
"step": 140
},
{
"epoch": 0.23400936037441497,
"grad_norm": 1.4932328462600708,
"learning_rate": 2.994001033214654e-05,
"loss": 0.2507,
"step": 150
},
{
"epoch": 0.24960998439937598,
"grad_norm": 1.5236423015594482,
"learning_rate": 2.9914742812091878e-05,
"loss": 0.245,
"step": 160
},
{
"epoch": 0.26521060842433697,
"grad_norm": 1.3999335765838623,
"learning_rate": 2.9885060571111795e-05,
"loss": 0.2361,
"step": 170
},
{
"epoch": 0.28081123244929795,
"grad_norm": 0.8492459058761597,
"learning_rate": 2.985097239506416e-05,
"loss": 0.2581,
"step": 180
},
{
"epoch": 0.296411856474259,
"grad_norm": 1.2287479639053345,
"learning_rate": 2.9812488373950918e-05,
"loss": 0.2581,
"step": 190
},
{
"epoch": 0.31201248049922,
"grad_norm": 1.5606762170791626,
"learning_rate": 2.9769619898931505e-05,
"loss": 0.2614,
"step": 200
},
{
"epoch": 0.32761310452418096,
"grad_norm": 1.4384827613830566,
"learning_rate": 2.9722379658951095e-05,
"loss": 0.228,
"step": 210
},
{
"epoch": 0.34321372854914195,
"grad_norm": 0.8863910436630249,
"learning_rate": 2.9670781636984686e-05,
"loss": 0.2408,
"step": 220
},
{
"epoch": 0.358814352574103,
"grad_norm": 1.1458382606506348,
"learning_rate": 2.96148411058982e-05,
"loss": 0.223,
"step": 230
},
{
"epoch": 0.374414976599064,
"grad_norm": 1.1114429235458374,
"learning_rate": 2.955457462392777e-05,
"loss": 0.2607,
"step": 240
},
{
"epoch": 0.39001560062402496,
"grad_norm": 0.9094457626342773,
"learning_rate": 2.9490000029778514e-05,
"loss": 0.2267,
"step": 250
},
{
"epoch": 0.40561622464898595,
"grad_norm": 2.2145955562591553,
"learning_rate": 2.9421136437344358e-05,
"loss": 0.2514,
"step": 260
},
{
"epoch": 0.42121684867394693,
"grad_norm": 0.9296560287475586,
"learning_rate": 2.934800423005037e-05,
"loss": 0.2339,
"step": 270
},
{
"epoch": 0.43681747269890797,
"grad_norm": 1.1148079633712769,
"learning_rate": 2.927062505481933e-05,
"loss": 0.2093,
"step": 280
},
{
"epoch": 0.45241809672386896,
"grad_norm": 1.0592412948608398,
"learning_rate": 2.9189021815664287e-05,
"loss": 0.2627,
"step": 290
},
{
"epoch": 0.46801872074882994,
"grad_norm": 0.6646207571029663,
"learning_rate": 2.910321866690906e-05,
"loss": 0.2257,
"step": 300
},
{
"epoch": 0.4836193447737909,
"grad_norm": 1.14007568359375,
"learning_rate": 2.901324100603861e-05,
"loss": 0.2554,
"step": 310
},
{
"epoch": 0.49921996879875197,
"grad_norm": 1.2674685716629028,
"learning_rate": 2.8919115466181455e-05,
"loss": 0.2222,
"step": 320
},
{
"epoch": 0.514820592823713,
"grad_norm": 1.4763505458831787,
"learning_rate": 2.882086990822637e-05,
"loss": 0.2234,
"step": 330
},
{
"epoch": 0.5304212168486739,
"grad_norm": 0.9958274960517883,
"learning_rate": 2.8718533412575613e-05,
"loss": 0.2532,
"step": 340
},
{
"epoch": 0.5460218408736349,
"grad_norm": 1.1231701374053955,
"learning_rate": 2.8612136270537206e-05,
"loss": 0.2221,
"step": 350
},
{
"epoch": 0.5616224648985959,
"grad_norm": 1.0642913579940796,
"learning_rate": 2.8501709975358828e-05,
"loss": 0.223,
"step": 360
},
{
"epoch": 0.5772230889235569,
"grad_norm": 1.683724284172058,
"learning_rate": 2.8387287212905888e-05,
"loss": 0.2255,
"step": 370
},
{
"epoch": 0.592823712948518,
"grad_norm": 0.9343106150627136,
"learning_rate": 2.826890185198658e-05,
"loss": 0.2726,
"step": 380
},
{
"epoch": 0.608424336973479,
"grad_norm": 0.8196080923080444,
"learning_rate": 2.8146588934326855e-05,
"loss": 0.2227,
"step": 390
},
{
"epoch": 0.62402496099844,
"grad_norm": 1.024862289428711,
"learning_rate": 2.8020384664198134e-05,
"loss": 0.2337,
"step": 400
},
{
"epoch": 0.6396255850234009,
"grad_norm": 1.8752734661102295,
"learning_rate": 2.7890326397700974e-05,
"loss": 0.2206,
"step": 410
},
{
"epoch": 0.6552262090483619,
"grad_norm": 1.076338768005371,
"learning_rate": 2.7756452631707753e-05,
"loss": 0.2174,
"step": 420
},
{
"epoch": 0.6708268330733229,
"grad_norm": 1.7308673858642578,
"learning_rate": 2.7618802992467718e-05,
"loss": 0.2517,
"step": 430
},
{
"epoch": 0.6864274570982839,
"grad_norm": 0.9615013003349304,
"learning_rate": 2.747741822387772e-05,
"loss": 0.2595,
"step": 440
},
{
"epoch": 0.7020280811232449,
"grad_norm": 1.0114976167678833,
"learning_rate": 2.733234017542215e-05,
"loss": 0.2129,
"step": 450
},
{
"epoch": 0.717628705148206,
"grad_norm": 1.1466189622879028,
"learning_rate": 2.7183611789785597e-05,
"loss": 0.23,
"step": 460
},
{
"epoch": 0.733229329173167,
"grad_norm": 0.9956033229827881,
"learning_rate": 2.7031277090141938e-05,
"loss": 0.1992,
"step": 470
},
{
"epoch": 0.748829953198128,
"grad_norm": 0.8617722392082214,
"learning_rate": 2.687538116712363e-05,
"loss": 0.2076,
"step": 480
},
{
"epoch": 0.7644305772230889,
"grad_norm": 1.1923143863677979,
"learning_rate": 2.6715970165474982e-05,
"loss": 0.2123,
"step": 490
},
{
"epoch": 0.7800312012480499,
"grad_norm": 0.8156811594963074,
"learning_rate": 2.6553091270393456e-05,
"loss": 0.225,
"step": 500
},
{
"epoch": 0.7800312012480499,
"eval_loss": 0.2404618114233017,
"eval_runtime": 222.9357,
"eval_samples_per_second": 8.621,
"eval_steps_per_second": 8.621,
"step": 500
},
{
"epoch": 0.7956318252730109,
"grad_norm": 0.9273092150688171,
"learning_rate": 2.6386792693562992e-05,
"loss": 0.2312,
"step": 510
},
{
"epoch": 0.8112324492979719,
"grad_norm": 1.1608659029006958,
"learning_rate": 2.621712365888347e-05,
"loss": 0.209,
"step": 520
},
{
"epoch": 0.8268330733229329,
"grad_norm": 1.0634217262268066,
"learning_rate": 2.6044134387900598e-05,
"loss": 0.207,
"step": 530
},
{
"epoch": 0.8424336973478939,
"grad_norm": 0.863330066204071,
"learning_rate": 2.586787608494046e-05,
"loss": 0.2295,
"step": 540
},
{
"epoch": 0.858034321372855,
"grad_norm": 0.7286916971206665,
"learning_rate": 2.5688400921953197e-05,
"loss": 0.2349,
"step": 550
},
{
"epoch": 0.8736349453978159,
"grad_norm": 1.5327069759368896,
"learning_rate": 2.5505762023070265e-05,
"loss": 0.2505,
"step": 560
},
{
"epoch": 0.8892355694227769,
"grad_norm": 0.9913854002952576,
"learning_rate": 2.5320013448879812e-05,
"loss": 0.2405,
"step": 570
},
{
"epoch": 0.9048361934477379,
"grad_norm": 0.7204810976982117,
"learning_rate": 2.513121018042494e-05,
"loss": 0.1962,
"step": 580
},
{
"epoch": 0.9204368174726989,
"grad_norm": 0.8549244999885559,
"learning_rate": 2.4939408102929457e-05,
"loss": 0.2358,
"step": 590
},
{
"epoch": 0.9360374414976599,
"grad_norm": 0.8404679894447327,
"learning_rate": 2.474466398925601e-05,
"loss": 0.2517,
"step": 600
},
{
"epoch": 0.9516380655226209,
"grad_norm": 1.46381413936615,
"learning_rate": 2.4547035483101474e-05,
"loss": 0.2414,
"step": 610
},
{
"epoch": 0.9672386895475819,
"grad_norm": 1.4833369255065918,
"learning_rate": 2.43465810819346e-05,
"loss": 0.2236,
"step": 620
},
{
"epoch": 0.982839313572543,
"grad_norm": 1.524997353553772,
"learning_rate": 2.4143360119680928e-05,
"loss": 0.2404,
"step": 630
},
{
"epoch": 0.9984399375975039,
"grad_norm": 0.5853265523910522,
"learning_rate": 2.3937432749160113e-05,
"loss": 0.2316,
"step": 640
},
{
"epoch": 1.0140405616224648,
"grad_norm": 1.2552859783172607,
"learning_rate": 2.3728859924280858e-05,
"loss": 0.2286,
"step": 650
},
{
"epoch": 1.029641185647426,
"grad_norm": 1.6384315490722656,
"learning_rate": 2.351770338199875e-05,
"loss": 0.2048,
"step": 660
},
{
"epoch": 1.045241809672387,
"grad_norm": 1.1269832849502563,
"learning_rate": 2.3304025624042265e-05,
"loss": 0.1966,
"step": 670
},
{
"epoch": 1.0608424336973479,
"grad_norm": 1.2746951580047607,
"learning_rate": 2.308788989841249e-05,
"loss": 0.2279,
"step": 680
},
{
"epoch": 1.076443057722309,
"grad_norm": 0.9508239030838013,
"learning_rate": 2.2869360180661844e-05,
"loss": 0.2257,
"step": 690
},
{
"epoch": 1.0920436817472698,
"grad_norm": 0.7616346478462219,
"learning_rate": 2.264850115495752e-05,
"loss": 0.2217,
"step": 700
},
{
"epoch": 1.107644305772231,
"grad_norm": 0.8200326561927795,
"learning_rate": 2.2425378194935163e-05,
"loss": 0.1953,
"step": 710
},
{
"epoch": 1.1232449297971918,
"grad_norm": 1.595325231552124,
"learning_rate": 2.220005734434847e-05,
"loss": 0.247,
"step": 720
},
{
"epoch": 1.138845553822153,
"grad_norm": 0.860009491443634,
"learning_rate": 2.1972605297520388e-05,
"loss": 0.183,
"step": 730
},
{
"epoch": 1.154446177847114,
"grad_norm": 1.3904730081558228,
"learning_rate": 2.1743089379601842e-05,
"loss": 0.217,
"step": 740
},
{
"epoch": 1.1700468018720749,
"grad_norm": 0.9801428914070129,
"learning_rate": 2.1511577526643646e-05,
"loss": 0.2296,
"step": 750
},
{
"epoch": 1.185647425897036,
"grad_norm": 1.275903344154358,
"learning_rate": 2.1278138265487627e-05,
"loss": 0.2245,
"step": 760
},
{
"epoch": 1.2012480499219969,
"grad_norm": 1.0965229272842407,
"learning_rate": 2.1042840693482907e-05,
"loss": 0.1869,
"step": 770
},
{
"epoch": 1.216848673946958,
"grad_norm": 1.6734888553619385,
"learning_rate": 2.080575445803326e-05,
"loss": 0.2308,
"step": 780
},
{
"epoch": 1.2324492979719188,
"grad_norm": 1.082309603691101,
"learning_rate": 2.056694973598169e-05,
"loss": 0.1968,
"step": 790
},
{
"epoch": 1.24804992199688,
"grad_norm": 1.0727277994155884,
"learning_rate": 2.0326497212838283e-05,
"loss": 0.219,
"step": 800
},
{
"epoch": 1.2636505460218408,
"grad_norm": 1.4003663063049316,
"learning_rate": 2.008446806185751e-05,
"loss": 0.2348,
"step": 810
},
{
"epoch": 1.2792511700468019,
"grad_norm": 1.290280818939209,
"learning_rate": 1.9840933922971144e-05,
"loss": 0.203,
"step": 820
},
{
"epoch": 1.294851794071763,
"grad_norm": 1.0119068622589111,
"learning_rate": 1.9595966881583032e-05,
"loss": 0.1948,
"step": 830
},
{
"epoch": 1.3104524180967239,
"grad_norm": 0.9613682627677917,
"learning_rate": 1.9349639447232046e-05,
"loss": 0.1845,
"step": 840
},
{
"epoch": 1.3260530421216847,
"grad_norm": 1.4405962228775024,
"learning_rate": 1.9102024532129452e-05,
"loss": 0.1807,
"step": 850
},
{
"epoch": 1.3416536661466458,
"grad_norm": 1.6739627122879028,
"learning_rate": 1.8853195429577124e-05,
"loss": 0.2036,
"step": 860
},
{
"epoch": 1.357254290171607,
"grad_norm": 1.0754071474075317,
"learning_rate": 1.8603225792272897e-05,
"loss": 0.2174,
"step": 870
},
{
"epoch": 1.3728549141965678,
"grad_norm": 1.03872811794281,
"learning_rate": 1.8352189610509642e-05,
"loss": 0.2496,
"step": 880
},
{
"epoch": 1.388455538221529,
"grad_norm": 1.1556732654571533,
"learning_rate": 1.8100161190274293e-05,
"loss": 0.2099,
"step": 890
},
{
"epoch": 1.4040561622464898,
"grad_norm": 1.54631769657135,
"learning_rate": 1.7847215131253534e-05,
"loss": 0.2034,
"step": 900
},
{
"epoch": 1.4196567862714509,
"grad_norm": 2.124622344970703,
"learning_rate": 1.759342630475247e-05,
"loss": 0.1891,
"step": 910
},
{
"epoch": 1.435257410296412,
"grad_norm": 0.9314622282981873,
"learning_rate": 1.7338869831532962e-05,
"loss": 0.2302,
"step": 920
},
{
"epoch": 1.4508580343213728,
"grad_norm": 0.9277411103248596,
"learning_rate": 1.7083621059578093e-05,
"loss": 0.2167,
"step": 930
},
{
"epoch": 1.466458658346334,
"grad_norm": 1.0425158739089966,
"learning_rate": 1.6827755541789363e-05,
"loss": 0.2091,
"step": 940
},
{
"epoch": 1.4820592823712948,
"grad_norm": 1.8837333917617798,
"learning_rate": 1.657134901362329e-05,
"loss": 0.2039,
"step": 950
},
{
"epoch": 1.497659906396256,
"grad_norm": 1.4193438291549683,
"learning_rate": 1.6314477370673874e-05,
"loss": 0.2343,
"step": 960
},
{
"epoch": 1.513260530421217,
"grad_norm": 1.1180918216705322,
"learning_rate": 1.6057216646207774e-05,
"loss": 0.2061,
"step": 970
},
{
"epoch": 1.5288611544461779,
"grad_norm": 0.8356765508651733,
"learning_rate": 1.579964298865865e-05,
"loss": 0.2248,
"step": 980
},
{
"epoch": 1.5444617784711387,
"grad_norm": 1.0649689435958862,
"learning_rate": 1.554183263908745e-05,
"loss": 0.1944,
"step": 990
},
{
"epoch": 1.5600624024960998,
"grad_norm": 0.9574515223503113,
"learning_rate": 1.5283861908615286e-05,
"loss": 0.2144,
"step": 1000
},
{
"epoch": 1.5600624024960998,
"eval_loss": 0.22991187870502472,
"eval_runtime": 212.0865,
"eval_samples_per_second": 9.062,
"eval_steps_per_second": 9.062,
"step": 1000
},
{
"epoch": 1.575663026521061,
"grad_norm": 1.9467267990112305,
"learning_rate": 1.5025807155835557e-05,
"loss": 0.2127,
"step": 1010
},
{
"epoch": 1.5912636505460218,
"grad_norm": 1.6572612524032593,
"learning_rate": 1.4767744764212002e-05,
"loss": 0.1974,
"step": 1020
},
{
"epoch": 1.6068642745709827,
"grad_norm": 1.8733537197113037,
"learning_rate": 1.450975111946947e-05,
"loss": 0.2245,
"step": 1030
},
{
"epoch": 1.6224648985959438,
"grad_norm": 1.1123310327529907,
"learning_rate": 1.42519025869839e-05,
"loss": 0.1946,
"step": 1040
},
{
"epoch": 1.6380655226209049,
"grad_norm": 1.3432146310806274,
"learning_rate": 1.3994275489178445e-05,
"loss": 0.2209,
"step": 1050
},
{
"epoch": 1.653666146645866,
"grad_norm": 1.7226901054382324,
"learning_rate": 1.3736946082932203e-05,
"loss": 0.1922,
"step": 1060
},
{
"epoch": 1.6692667706708268,
"grad_norm": 0.8934373259544373,
"learning_rate": 1.347999053700846e-05,
"loss": 0.1996,
"step": 1070
},
{
"epoch": 1.6848673946957877,
"grad_norm": 1.0361180305480957,
"learning_rate": 1.3223484909508899e-05,
"loss": 0.1875,
"step": 1080
},
{
"epoch": 1.7004680187207488,
"grad_norm": 1.2863364219665527,
"learning_rate": 1.296750512536065e-05,
"loss": 0.1951,
"step": 1090
},
{
"epoch": 1.71606864274571,
"grad_norm": 1.3062769174575806,
"learning_rate": 1.2712126953842734e-05,
"loss": 0.2124,
"step": 1100
},
{
"epoch": 1.7316692667706708,
"grad_norm": 1.2328706979751587,
"learning_rate": 1.245742598615855e-05,
"loss": 0.182,
"step": 1110
},
{
"epoch": 1.7472698907956317,
"grad_norm": 1.0888112783432007,
"learning_rate": 1.2203477613061136e-05,
"loss": 0.2155,
"step": 1120
},
{
"epoch": 1.7628705148205928,
"grad_norm": 0.9680263996124268,
"learning_rate": 1.1950357002537672e-05,
"loss": 0.2049,
"step": 1130
},
{
"epoch": 1.7784711388455539,
"grad_norm": 1.1818790435791016,
"learning_rate": 1.1698139077560021e-05,
"loss": 0.2048,
"step": 1140
},
{
"epoch": 1.794071762870515,
"grad_norm": 1.0962156057357788,
"learning_rate": 1.1446898493907707e-05,
"loss": 0.2145,
"step": 1150
},
{
"epoch": 1.8096723868954758,
"grad_norm": 1.179892659187317,
"learning_rate": 1.1196709618070055e-05,
"loss": 0.2048,
"step": 1160
},
{
"epoch": 1.8252730109204367,
"grad_norm": 0.753470242023468,
"learning_rate": 1.0947646505233888e-05,
"loss": 0.2039,
"step": 1170
},
{
"epoch": 1.8408736349453978,
"grad_norm": 1.2412258386611938,
"learning_rate": 1.0699782877363435e-05,
"loss": 0.2305,
"step": 1180
},
{
"epoch": 1.856474258970359,
"grad_norm": 0.9623850584030151,
"learning_rate": 1.0453192101378812e-05,
"loss": 0.2028,
"step": 1190
},
{
"epoch": 1.8720748829953198,
"grad_norm": 1.9972094297409058,
"learning_rate": 1.0207947167439665e-05,
"loss": 0.185,
"step": 1200
},
{
"epoch": 1.8876755070202809,
"grad_norm": 1.424430251121521,
"learning_rate": 9.964120667340252e-06,
"loss": 0.2163,
"step": 1210
},
{
"epoch": 1.9032761310452417,
"grad_norm": 0.9559535980224609,
"learning_rate": 9.721784773022505e-06,
"loss": 0.1774,
"step": 1220
},
{
"epoch": 1.9188767550702028,
"grad_norm": 1.1195014715194702,
"learning_rate": 9.481011215213333e-06,
"loss": 0.212,
"step": 1230
},
{
"epoch": 1.934477379095164,
"grad_norm": 2.1014575958251953,
"learning_rate": 9.241871262192553e-06,
"loss": 0.1878,
"step": 1240
},
{
"epoch": 1.9500780031201248,
"grad_norm": 0.9159352779388428,
"learning_rate": 9.004435698697638e-06,
"loss": 0.1992,
"step": 1250
},
{
"epoch": 1.9656786271450857,
"grad_norm": 1.7032984495162964,
"learning_rate": 8.768774804971705e-06,
"loss": 0.2284,
"step": 1260
},
{
"epoch": 1.9812792511700468,
"grad_norm": 0.9082481861114502,
"learning_rate": 8.534958335960701e-06,
"loss": 0.1948,
"step": 1270
},
{
"epoch": 1.9968798751950079,
"grad_norm": 1.1760114431381226,
"learning_rate": 8.303055500666185e-06,
"loss": 0.1939,
"step": 1280
},
{
"epoch": 2.012480499219969,
"grad_norm": 1.515101432800293,
"learning_rate": 8.073134941659631e-06,
"loss": 0.2213,
"step": 1290
},
{
"epoch": 2.0280811232449296,
"grad_norm": 0.7668793201446533,
"learning_rate": 7.845264714764464e-06,
"loss": 0.1783,
"step": 1300
},
{
"epoch": 2.0436817472698907,
"grad_norm": 1.2395737171173096,
"learning_rate": 7.619512268911687e-06,
"loss": 0.201,
"step": 1310
},
{
"epoch": 2.059282371294852,
"grad_norm": 1.0414377450942993,
"learning_rate": 7.395944426175209e-06,
"loss": 0.1866,
"step": 1320
},
{
"epoch": 2.074882995319813,
"grad_norm": 1.3810758590698242,
"learning_rate": 7.174627361992733e-06,
"loss": 0.2028,
"step": 1330
},
{
"epoch": 2.090483619344774,
"grad_norm": 1.0247944593429565,
"learning_rate": 6.955626585577968e-06,
"loss": 0.1705,
"step": 1340
},
{
"epoch": 2.1060842433697347,
"grad_norm": 1.0820848941802979,
"learning_rate": 6.73900692053012e-06,
"loss": 0.2025,
"step": 1350
},
{
"epoch": 2.1216848673946958,
"grad_norm": 1.0407085418701172,
"learning_rate": 6.5248324856462825e-06,
"loss": 0.1858,
"step": 1360
},
{
"epoch": 2.137285491419657,
"grad_norm": 0.9962034821510315,
"learning_rate": 6.313166675942475e-06,
"loss": 0.1958,
"step": 1370
},
{
"epoch": 2.152886115444618,
"grad_norm": 1.0945219993591309,
"learning_rate": 6.104072143888874e-06,
"loss": 0.1727,
"step": 1380
},
{
"epoch": 2.1684867394695786,
"grad_norm": 1.7445493936538696,
"learning_rate": 5.897610780864885e-06,
"loss": 0.2164,
"step": 1390
},
{
"epoch": 2.1840873634945397,
"grad_norm": 1.2947014570236206,
"learning_rate": 5.693843698839448e-06,
"loss": 0.2124,
"step": 1400
},
{
"epoch": 2.199687987519501,
"grad_norm": 1.7516402006149292,
"learning_rate": 5.4928312122821106e-06,
"loss": 0.169,
"step": 1410
},
{
"epoch": 2.215288611544462,
"grad_norm": 0.8668001294136047,
"learning_rate": 5.294632820310068e-06,
"loss": 0.1479,
"step": 1420
},
{
"epoch": 2.230889235569423,
"grad_norm": 1.2017163038253784,
"learning_rate": 5.099307189076637e-06,
"loss": 0.1909,
"step": 1430
},
{
"epoch": 2.2464898595943836,
"grad_norm": 1.1504899263381958,
"learning_rate": 4.906912134406216e-06,
"loss": 0.188,
"step": 1440
},
{
"epoch": 2.2620904836193447,
"grad_norm": 1.597075343132019,
"learning_rate": 4.717504604680997e-06,
"loss": 0.1938,
"step": 1450
},
{
"epoch": 2.277691107644306,
"grad_norm": 1.7201188802719116,
"learning_rate": 4.531140663984368e-06,
"loss": 0.1867,
"step": 1460
},
{
"epoch": 2.293291731669267,
"grad_norm": 1.5416109561920166,
"learning_rate": 4.3478754755061526e-06,
"loss": 0.1807,
"step": 1470
},
{
"epoch": 2.308892355694228,
"grad_norm": 1.2527023553848267,
"learning_rate": 4.167763285214421e-06,
"loss": 0.1839,
"step": 1480
},
{
"epoch": 2.3244929797191887,
"grad_norm": 1.764147162437439,
"learning_rate": 3.990857405798876e-06,
"loss": 0.2142,
"step": 1490
},
{
"epoch": 2.3400936037441498,
"grad_norm": 1.8016437292099,
"learning_rate": 3.817210200890411e-06,
"loss": 0.195,
"step": 1500
},
{
"epoch": 2.3400936037441498,
"eval_loss": 0.24235089123249054,
"eval_runtime": 211.616,
"eval_samples_per_second": 9.082,
"eval_steps_per_second": 9.082,
"step": 1500
},
{
"epoch": 2.355694227769111,
"grad_norm": 1.1703391075134277,
"learning_rate": 3.6468730695616733e-06,
"loss": 0.1855,
"step": 1510
},
{
"epoch": 2.371294851794072,
"grad_norm": 1.0875886678695679,
"learning_rate": 3.479896431113043e-06,
"loss": 0.1939,
"step": 1520
},
{
"epoch": 2.3868954758190326,
"grad_norm": 0.8574866652488708,
"learning_rate": 3.3163297101486995e-06,
"loss": 0.1834,
"step": 1530
},
{
"epoch": 2.4024960998439937,
"grad_norm": 0.8828145265579224,
"learning_rate": 3.156221321947055e-06,
"loss": 0.1806,
"step": 1540
},
{
"epoch": 2.418096723868955,
"grad_norm": 1.8176056146621704,
"learning_rate": 2.999618658129983e-06,
"loss": 0.1717,
"step": 1550
},
{
"epoch": 2.433697347893916,
"grad_norm": 0.9186724424362183,
"learning_rate": 2.846568072635042e-06,
"loss": 0.1721,
"step": 1560
},
{
"epoch": 2.4492979719188765,
"grad_norm": 1.5324952602386475,
"learning_rate": 2.6971148679948256e-06,
"loss": 0.1928,
"step": 1570
},
{
"epoch": 2.4648985959438376,
"grad_norm": 1.231868028640747,
"learning_rate": 2.551303281927559e-06,
"loss": 0.1904,
"step": 1580
},
{
"epoch": 2.4804992199687987,
"grad_norm": 2.1853291988372803,
"learning_rate": 2.4091764742428483e-06,
"loss": 0.1843,
"step": 1590
},
{
"epoch": 2.49609984399376,
"grad_norm": 0.9732924103736877,
"learning_rate": 2.2707765140665256e-06,
"loss": 0.1801,
"step": 1600
},
{
"epoch": 2.511700468018721,
"grad_norm": 1.5212299823760986,
"learning_rate": 2.1361443673882688e-06,
"loss": 0.1802,
"step": 1610
},
{
"epoch": 2.5273010920436816,
"grad_norm": 1.6022634506225586,
"learning_rate": 2.0053198849358323e-06,
"loss": 0.1965,
"step": 1620
},
{
"epoch": 2.5429017160686427,
"grad_norm": 1.337381362915039,
"learning_rate": 1.8783417903793037e-06,
"loss": 0.202,
"step": 1630
},
{
"epoch": 2.5585023400936038,
"grad_norm": 2.099640369415283,
"learning_rate": 1.7552476688690482e-06,
"loss": 0.2049,
"step": 1640
},
{
"epoch": 2.574102964118565,
"grad_norm": 1.7517180442810059,
"learning_rate": 1.6360739559105786e-06,
"loss": 0.1762,
"step": 1650
},
{
"epoch": 2.589703588143526,
"grad_norm": 1.3570579290390015,
"learning_rate": 1.52085592657977e-06,
"loss": 0.1519,
"step": 1660
},
{
"epoch": 2.6053042121684866,
"grad_norm": 1.470625638961792,
"learning_rate": 1.409627685081531e-06,
"loss": 0.1724,
"step": 1670
},
{
"epoch": 2.6209048361934477,
"grad_norm": 1.7645882368087769,
"learning_rate": 1.3024221546550713e-06,
"loss": 0.1788,
"step": 1680
},
{
"epoch": 2.636505460218409,
"grad_norm": 1.4340243339538574,
"learning_rate": 1.1992710678286929e-06,
"loss": 0.17,
"step": 1690
},
{
"epoch": 2.6521060842433695,
"grad_norm": 1.6330922842025757,
"learning_rate": 1.100204957027079e-06,
"loss": 0.1713,
"step": 1700
},
{
"epoch": 2.667706708268331,
"grad_norm": 1.0880082845687866,
"learning_rate": 1.005253145533761e-06,
"loss": 0.1758,
"step": 1710
},
{
"epoch": 2.6833073322932917,
"grad_norm": 2.9010987281799316,
"learning_rate": 9.144437388115295e-07,
"loss": 0.2119,
"step": 1720
},
{
"epoch": 2.6989079563182528,
"grad_norm": 1.8082084655761719,
"learning_rate": 8.278036161832869e-07,
"loss": 0.1732,
"step": 1730
},
{
"epoch": 2.714508580343214,
"grad_norm": 1.6634337902069092,
"learning_rate": 7.453584228758553e-07,
"loss": 0.2212,
"step": 1740
},
{
"epoch": 2.7301092043681745,
"grad_norm": 1.1831082105636597,
"learning_rate": 6.671325624290503e-07,
"loss": 0.1902,
"step": 1750
},
{
"epoch": 2.7457098283931356,
"grad_norm": 1.9070043563842773,
"learning_rate": 5.931491894723107e-07,
"loss": 0.1737,
"step": 1760
},
{
"epoch": 2.7613104524180967,
"grad_norm": 1.6935715675354004,
"learning_rate": 5.234302028710008e-07,
"loss": 0.1841,
"step": 1770
},
{
"epoch": 2.776911076443058,
"grad_norm": 1.1902638673782349,
"learning_rate": 4.579962392443959e-07,
"loss": 0.1501,
"step": 1780
},
{
"epoch": 2.792511700468019,
"grad_norm": 1.1872891187667847,
"learning_rate": 3.968666668573179e-07,
"loss": 0.195,
"step": 1790
},
{
"epoch": 2.8081123244929795,
"grad_norm": 1.0661053657531738,
"learning_rate": 3.4005957988716473e-07,
"loss": 0.1911,
"step": 1800
},
{
"epoch": 2.8237129485179406,
"grad_norm": 1.0829426050186157,
"learning_rate": 2.8759179306810657e-07,
"loss": 0.17,
"step": 1810
},
{
"epoch": 2.8393135725429017,
"grad_norm": 1.3797425031661987,
"learning_rate": 2.3947883671396e-07,
"loss": 0.192,
"step": 1820
},
{
"epoch": 2.854914196567863,
"grad_norm": 1.4285950660705566,
"learning_rate": 1.9573495212126535e-07,
"loss": 0.1988,
"step": 1830
},
{
"epoch": 2.870514820592824,
"grad_norm": 0.9140704274177551,
"learning_rate": 1.5637308735390044e-07,
"loss": 0.2175,
"step": 1840
},
{
"epoch": 2.8861154446177846,
"grad_norm": 1.3677566051483154,
"learning_rate": 1.2140489341049777e-07,
"loss": 0.1775,
"step": 1850
},
{
"epoch": 2.9017160686427457,
"grad_norm": 1.7777496576309204,
"learning_rate": 9.084072077576999e-08,
"loss": 0.1381,
"step": 1860
},
{
"epoch": 2.9173166926677068,
"grad_norm": 1.4862127304077148,
"learning_rate": 6.468961635680893e-08,
"loss": 0.177,
"step": 1870
},
{
"epoch": 2.932917316692668,
"grad_norm": 1.7681442499160767,
"learning_rate": 4.295932080521925e-08,
"loss": 0.1928,
"step": 1880
},
{
"epoch": 2.948517940717629,
"grad_norm": 3.2976081371307373,
"learning_rate": 2.565626622591466e-08,
"loss": 0.1981,
"step": 1890
},
{
"epoch": 2.9641185647425896,
"grad_norm": 1.4172863960266113,
"learning_rate": 1.2785574273224132e-08,
"loss": 0.1983,
"step": 1900
},
{
"epoch": 2.9797191887675507,
"grad_norm": 1.8934816122055054,
"learning_rate": 4.3510546349045945e-09,
"loss": 0.1933,
"step": 1910
},
{
"epoch": 2.995319812792512,
"grad_norm": 1.8813408613204956,
"learning_rate": 3.552039044829591e-10,
"loss": 0.1862,
"step": 1920
},
{
"epoch": 3.0,
"step": 1923,
"total_flos": 2.3277958164873216e+17,
"train_loss": 0.22298803400137868,
"train_runtime": 7941.0421,
"train_samples_per_second": 5.811,
"train_steps_per_second": 0.242
}
],
"logging_steps": 10,
"max_steps": 1923,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.3277958164873216e+17,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}