ChessSLM-PM / trainer_state.json
FlameF0X's picture
Upload folder using huggingface_hub
5a85a89 verified
Raw
History Blame Contribute Delete
21.1 kB
{
"best_global_step": 4200,
"best_metric": 0.9021432995796204,
"best_model_checkpoint": "./chessslm-2000plus/checkpoint-4200",
"epoch": 0.6086074481959136,
"eval_steps": 200,
"global_step": 4200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007245326764237067,
"grad_norm": 0.28264278173446655,
"learning_rate": 2.45e-05,
"loss": 1.2346894073486328,
"step": 50
},
{
"epoch": 0.014490653528474133,
"grad_norm": 0.25636932253837585,
"learning_rate": 4.9500000000000004e-05,
"loss": 1.1820894622802733,
"step": 100
},
{
"epoch": 0.0217359802927112,
"grad_norm": 0.2744424343109131,
"learning_rate": 4.9993596199009453e-05,
"loss": 1.1531302642822265,
"step": 150
},
{
"epoch": 0.028981307056948267,
"grad_norm": 0.28049495816230774,
"learning_rate": 4.997386280918375e-05,
"loss": 1.1511387634277344,
"step": 200
},
{
"epoch": 0.028981307056948267,
"eval_loss": 1.1023718118667603,
"eval_runtime": 193.7165,
"eval_samples_per_second": 120.0,
"eval_steps_per_second": 3.753,
"step": 200
},
{
"epoch": 0.036226633821185336,
"grad_norm": 0.2879795730113983,
"learning_rate": 4.994080768959056e-05,
"loss": 1.1302296447753906,
"step": 250
},
{
"epoch": 0.0434719605854224,
"grad_norm": 0.29017066955566406,
"learning_rate": 4.989444847271928e-05,
"loss": 1.1334593963623047,
"step": 300
},
{
"epoch": 0.05071728734965947,
"grad_norm": 0.31129732728004456,
"learning_rate": 4.98348098878224e-05,
"loss": 1.1211315155029298,
"step": 350
},
{
"epoch": 0.057962614113896534,
"grad_norm": 0.29630768299102783,
"learning_rate": 4.9761923747724296e-05,
"loss": 1.1175349426269532,
"step": 400
},
{
"epoch": 0.057962614113896534,
"eval_loss": 1.0756858587265015,
"eval_runtime": 193.6918,
"eval_samples_per_second": 120.015,
"eval_steps_per_second": 3.753,
"step": 400
},
{
"epoch": 0.0652079408781336,
"grad_norm": 0.2720015347003937,
"learning_rate": 4.9675828931851424e-05,
"loss": 1.1160638427734375,
"step": 450
},
{
"epoch": 0.07245326764237067,
"grad_norm": 0.29248911142349243,
"learning_rate": 4.9576571365492965e-05,
"loss": 1.1049542236328125,
"step": 500
},
{
"epoch": 0.07969859440660773,
"grad_norm": 0.31138038635253906,
"learning_rate": 4.946420399530304e-05,
"loss": 1.1029164123535156,
"step": 550
},
{
"epoch": 0.0869439211708448,
"grad_norm": 0.2833043932914734,
"learning_rate": 4.933878676105756e-05,
"loss": 1.0982711791992188,
"step": 600
},
{
"epoch": 0.0869439211708448,
"eval_loss": 1.056943416595459,
"eval_runtime": 193.7026,
"eval_samples_per_second": 120.009,
"eval_steps_per_second": 3.753,
"step": 600
},
{
"epoch": 0.09418924793508188,
"grad_norm": 0.32613420486450195,
"learning_rate": 4.9200386563680734e-05,
"loss": 1.0998190307617188,
"step": 650
},
{
"epoch": 0.10143457469931894,
"grad_norm": 0.30005744099617004,
"learning_rate": 4.904907722955829e-05,
"loss": 1.0960050201416016,
"step": 700
},
{
"epoch": 0.10867990146355601,
"grad_norm": 0.3087840974330902,
"learning_rate": 4.8884939471156544e-05,
"loss": 1.0800698852539063,
"step": 750
},
{
"epoch": 0.11592522822779307,
"grad_norm": 0.3069000542163849,
"learning_rate": 4.8708060843968165e-05,
"loss": 1.088259506225586,
"step": 800
},
{
"epoch": 0.11592522822779307,
"eval_loss": 1.0397652387619019,
"eval_runtime": 193.5663,
"eval_samples_per_second": 120.093,
"eval_steps_per_second": 3.756,
"step": 800
},
{
"epoch": 0.12317055499203014,
"grad_norm": 0.3139374554157257,
"learning_rate": 4.8518535699807713e-05,
"loss": 1.0745314788818359,
"step": 850
},
{
"epoch": 0.1304158817562672,
"grad_norm": 0.29361429810523987,
"learning_rate": 4.831646513648189e-05,
"loss": 1.0730350494384766,
"step": 900
},
{
"epoch": 0.13766120852050429,
"grad_norm": 0.32080790400505066,
"learning_rate": 4.810195694386123e-05,
"loss": 1.0730361938476562,
"step": 950
},
{
"epoch": 0.14490653528474134,
"grad_norm": 0.3122475743293762,
"learning_rate": 4.787512554638205e-05,
"loss": 1.0709983825683593,
"step": 1000
},
{
"epoch": 0.14490653528474134,
"eval_loss": 1.023197054862976,
"eval_runtime": 193.618,
"eval_samples_per_second": 120.061,
"eval_steps_per_second": 3.755,
"step": 1000
},
{
"epoch": 0.1521518620489784,
"grad_norm": 0.33824747800827026,
"learning_rate": 4.763609194200942e-05,
"loss": 1.0684901428222657,
"step": 1050
},
{
"epoch": 0.15939718881321546,
"grad_norm": 0.30118507146835327,
"learning_rate": 4.7384983637693614e-05,
"loss": 1.0644412231445313,
"step": 1100
},
{
"epoch": 0.16664251557745255,
"grad_norm": 0.29680049419403076,
"learning_rate": 4.71219345813544e-05,
"loss": 1.0592123413085937,
"step": 1150
},
{
"epoch": 0.1738878423416896,
"grad_norm": 0.320403128862381,
"learning_rate": 4.684708509042971e-05,
"loss": 1.0508477020263671,
"step": 1200
},
{
"epoch": 0.1738878423416896,
"eval_loss": 1.0096791982650757,
"eval_runtime": 193.6179,
"eval_samples_per_second": 120.061,
"eval_steps_per_second": 3.755,
"step": 1200
},
{
"epoch": 0.18113316910592667,
"grad_norm": 0.3681316673755646,
"learning_rate": 4.656058177702647e-05,
"loss": 1.0457440185546876,
"step": 1250
},
{
"epoch": 0.18837849587016375,
"grad_norm": 0.3355785608291626,
"learning_rate": 4.626257746971382e-05,
"loss": 1.0470626068115234,
"step": 1300
},
{
"epoch": 0.1956238226344008,
"grad_norm": 0.339575856924057,
"learning_rate": 4.5953231132000175e-05,
"loss": 1.0423269653320313,
"step": 1350
},
{
"epoch": 0.20286914939863787,
"grad_norm": 0.33345919847488403,
"learning_rate": 4.563270777753791e-05,
"loss": 1.0385567474365234,
"step": 1400
},
{
"epoch": 0.20286914939863787,
"eval_loss": 0.9979937076568604,
"eval_runtime": 193.9537,
"eval_samples_per_second": 119.853,
"eval_steps_per_second": 3.748,
"step": 1400
},
{
"epoch": 0.21011447616287496,
"grad_norm": 0.33952876925468445,
"learning_rate": 4.530117838210059e-05,
"loss": 1.040812759399414,
"step": 1450
},
{
"epoch": 0.21735980292711202,
"grad_norm": 0.3470812737941742,
"learning_rate": 4.4958819792379846e-05,
"loss": 1.0452130126953125,
"step": 1500
},
{
"epoch": 0.22460512969134908,
"grad_norm": 0.37180599570274353,
"learning_rate": 4.460581463165071e-05,
"loss": 1.0368424224853516,
"step": 1550
},
{
"epoch": 0.23185045645558613,
"grad_norm": 0.3716419041156769,
"learning_rate": 4.424235120235537e-05,
"loss": 1.0325569915771484,
"step": 1600
},
{
"epoch": 0.23185045645558613,
"eval_loss": 0.9863654971122742,
"eval_runtime": 192.694,
"eval_samples_per_second": 120.637,
"eval_steps_per_second": 3.773,
"step": 1600
},
{
"epoch": 0.23909578321982322,
"grad_norm": 0.33746030926704407,
"learning_rate": 4.386862338565759e-05,
"loss": 1.0336730194091797,
"step": 1650
},
{
"epoch": 0.24634110998406028,
"grad_norm": 0.34745824337005615,
"learning_rate": 4.3484830538021324e-05,
"loss": 1.0262679290771484,
"step": 1700
},
{
"epoch": 0.25358643674829734,
"grad_norm": 0.33224305510520935,
"learning_rate": 4.3091177384868585e-05,
"loss": 1.0238925170898439,
"step": 1750
},
{
"epoch": 0.2608317635125344,
"grad_norm": 0.3336823880672455,
"learning_rate": 4.26878739113734e-05,
"loss": 1.021321792602539,
"step": 1800
},
{
"epoch": 0.2608317635125344,
"eval_loss": 0.9752877354621887,
"eval_runtime": 192.2236,
"eval_samples_per_second": 120.932,
"eval_steps_per_second": 3.782,
"step": 1800
},
{
"epoch": 0.26807709027677146,
"grad_norm": 0.3285761773586273,
"learning_rate": 4.2275135250450106e-05,
"loss": 1.0217689514160155,
"step": 1850
},
{
"epoch": 0.27532241704100857,
"grad_norm": 0.35993409156799316,
"learning_rate": 4.1853181567995645e-05,
"loss": 1.0172651672363282,
"step": 1900
},
{
"epoch": 0.28256774380524563,
"grad_norm": 0.3507116734981537,
"learning_rate": 4.142223794544715e-05,
"loss": 1.0248129272460937,
"step": 1950
},
{
"epoch": 0.2898130705694827,
"grad_norm": 0.3568938374519348,
"learning_rate": 4.0982534259717475e-05,
"loss": 1.0136798095703126,
"step": 2000
},
{
"epoch": 0.2898130705694827,
"eval_loss": 0.9648082256317139,
"eval_runtime": 192.1777,
"eval_samples_per_second": 120.961,
"eval_steps_per_second": 3.783,
"step": 2000
},
{
"epoch": 0.29705839733371975,
"grad_norm": 0.3777235746383667,
"learning_rate": 4.053430506057268e-05,
"loss": 1.0077315521240235,
"step": 2050
},
{
"epoch": 0.3043037240979568,
"grad_norm": 0.3593011498451233,
"learning_rate": 4.0077789445516814e-05,
"loss": 1.0111034393310547,
"step": 2100
},
{
"epoch": 0.31154905086219387,
"grad_norm": 0.3624265193939209,
"learning_rate": 3.9613230932250985e-05,
"loss": 1.0038584136962891,
"step": 2150
},
{
"epoch": 0.3187943776264309,
"grad_norm": 0.3455412983894348,
"learning_rate": 3.9140877328774375e-05,
"loss": 1.0052964782714844,
"step": 2200
},
{
"epoch": 0.3187943776264309,
"eval_loss": 0.9546002149581909,
"eval_runtime": 193.0198,
"eval_samples_per_second": 120.433,
"eval_steps_per_second": 3.766,
"step": 2200
},
{
"epoch": 0.32603970439066804,
"grad_norm": 0.35707926750183105,
"learning_rate": 3.866098060119684e-05,
"loss": 0.9974722290039062,
"step": 2250
},
{
"epoch": 0.3332850311549051,
"grad_norm": 0.3994494080543518,
"learning_rate": 3.817379673933341e-05,
"loss": 0.9981327056884766,
"step": 2300
},
{
"epoch": 0.34053035791914216,
"grad_norm": 0.36747175455093384,
"learning_rate": 3.767958562015246e-05,
"loss": 0.9972872924804688,
"step": 2350
},
{
"epoch": 0.3477756846833792,
"grad_norm": 0.36322903633117676,
"learning_rate": 3.717861086915026e-05,
"loss": 0.9997342681884765,
"step": 2400
},
{
"epoch": 0.3477756846833792,
"eval_loss": 0.9470997452735901,
"eval_runtime": 192.2555,
"eval_samples_per_second": 120.912,
"eval_steps_per_second": 3.781,
"step": 2400
},
{
"epoch": 0.3550210114476163,
"grad_norm": 0.33153679966926575,
"learning_rate": 3.6671139719726174e-05,
"loss": 0.9987025451660156,
"step": 2450
},
{
"epoch": 0.36226633821185333,
"grad_norm": 0.40384331345558167,
"learning_rate": 3.6157442870633096e-05,
"loss": 0.992891616821289,
"step": 2500
},
{
"epoch": 0.36951166497609045,
"grad_norm": 0.3685908913612366,
"learning_rate": 3.563779434157947e-05,
"loss": 0.991015396118164,
"step": 2550
},
{
"epoch": 0.3767569917403275,
"grad_norm": 0.37052083015441895,
"learning_rate": 3.511247132705986e-05,
"loss": 0.9884940338134766,
"step": 2600
},
{
"epoch": 0.3767569917403275,
"eval_loss": 0.939989447593689,
"eval_runtime": 192.4644,
"eval_samples_per_second": 120.781,
"eval_steps_per_second": 3.777,
"step": 2600
},
{
"epoch": 0.38400231850456457,
"grad_norm": 0.35503244400024414,
"learning_rate": 3.458175404849188e-05,
"loss": 0.9872270965576172,
"step": 2650
},
{
"epoch": 0.3912476452688016,
"grad_norm": 0.3497767746448517,
"learning_rate": 3.4045925604738604e-05,
"loss": 0.9846116638183594,
"step": 2700
},
{
"epoch": 0.3984929720330387,
"grad_norm": 0.3695673942565918,
"learning_rate": 3.350527182109603e-05,
"loss": 0.9809439849853515,
"step": 2750
},
{
"epoch": 0.40573829879727574,
"grad_norm": 0.36862626671791077,
"learning_rate": 3.296008109682616e-05,
"loss": 0.9826165008544921,
"step": 2800
},
{
"epoch": 0.40573829879727574,
"eval_loss": 0.9336075186729431,
"eval_runtime": 192.3219,
"eval_samples_per_second": 120.87,
"eval_steps_per_second": 3.78,
"step": 2800
},
{
"epoch": 0.4129836255615128,
"grad_norm": 0.34875786304473877,
"learning_rate": 3.241064425131708e-05,
"loss": 0.9786082458496094,
"step": 2850
},
{
"epoch": 0.4202289523257499,
"grad_norm": 0.3561500608921051,
"learning_rate": 3.185725436895209e-05,
"loss": 0.9792596435546875,
"step": 2900
},
{
"epoch": 0.427474279089987,
"grad_norm": 0.36314815282821655,
"learning_rate": 3.130020664277064e-05,
"loss": 0.9800941467285156,
"step": 2950
},
{
"epoch": 0.43471960585422403,
"grad_norm": 0.3698309361934662,
"learning_rate": 3.0739798217004354e-05,
"loss": 0.9847097015380859,
"step": 3000
},
{
"epoch": 0.43471960585422403,
"eval_loss": 0.927081286907196,
"eval_runtime": 192.3461,
"eval_samples_per_second": 120.855,
"eval_steps_per_second": 3.78,
"step": 3000
},
{
"epoch": 0.4419649326184611,
"grad_norm": 0.3639983534812927,
"learning_rate": 3.0176328028572408e-05,
"loss": 0.9701497650146484,
"step": 3050
},
{
"epoch": 0.44921025938269815,
"grad_norm": 0.3515409827232361,
"learning_rate": 2.9610096647620445e-05,
"loss": 0.9721237945556641,
"step": 3100
},
{
"epoch": 0.4564555861469352,
"grad_norm": 0.391891747713089,
"learning_rate": 2.9041406117188458e-05,
"loss": 0.9730013275146484,
"step": 3150
},
{
"epoch": 0.46370091291117227,
"grad_norm": 0.3907790780067444,
"learning_rate": 2.8470559792092855e-05,
"loss": 0.974240951538086,
"step": 3200
},
{
"epoch": 0.46370091291117227,
"eval_loss": 0.9227471351623535,
"eval_runtime": 192.3251,
"eval_samples_per_second": 120.868,
"eval_steps_per_second": 3.78,
"step": 3200
},
{
"epoch": 0.4709462396754094,
"grad_norm": 0.3531961143016815,
"learning_rate": 2.789786217710888e-05,
"loss": 0.9673224639892578,
"step": 3250
},
{
"epoch": 0.47819156643964644,
"grad_norm": 0.33910030126571655,
"learning_rate": 2.732361876453957e-05,
"loss": 0.9719451904296875,
"step": 3300
},
{
"epoch": 0.4854368932038835,
"grad_norm": 0.37699881196022034,
"learning_rate": 2.6748135871257955e-05,
"loss": 0.9684635925292969,
"step": 3350
},
{
"epoch": 0.49268221996812056,
"grad_norm": 0.3599157929420471,
"learning_rate": 2.617172047530939e-05,
"loss": 0.957724609375,
"step": 3400
},
{
"epoch": 0.49268221996812056,
"eval_loss": 0.91666579246521,
"eval_runtime": 192.0058,
"eval_samples_per_second": 121.069,
"eval_steps_per_second": 3.786,
"step": 3400
},
{
"epoch": 0.4999275467323576,
"grad_norm": 0.37489375472068787,
"learning_rate": 2.5594680052161206e-05,
"loss": 0.9689096069335937,
"step": 3450
},
{
"epoch": 0.5071728734965947,
"grad_norm": 0.3328556418418884,
"learning_rate": 2.5017322410687077e-05,
"loss": 0.96274169921875,
"step": 3500
},
{
"epoch": 0.5144182002608317,
"grad_norm": 0.35683172941207886,
"learning_rate": 2.4439955528973414e-05,
"loss": 0.9614187622070313,
"step": 3550
},
{
"epoch": 0.5216635270250688,
"grad_norm": 0.36412957310676575,
"learning_rate": 2.386288739003567e-05,
"loss": 0.964275131225586,
"step": 3600
},
{
"epoch": 0.5216635270250688,
"eval_loss": 0.9119325280189514,
"eval_runtime": 192.5165,
"eval_samples_per_second": 120.748,
"eval_steps_per_second": 3.776,
"step": 3600
},
{
"epoch": 0.5289088537893059,
"grad_norm": 0.3563174307346344,
"learning_rate": 2.3286425817531836e-05,
"loss": 0.9626367950439453,
"step": 3650
},
{
"epoch": 0.5361541805535429,
"grad_norm": 0.3758934438228607,
"learning_rate": 2.271087831156107e-05,
"loss": 0.9564736938476562,
"step": 3700
},
{
"epoch": 0.5433995073177801,
"grad_norm": 0.34257784485816956,
"learning_rate": 2.2136551884634864e-05,
"loss": 0.9561862945556641,
"step": 3750
},
{
"epoch": 0.5506448340820171,
"grad_norm": 0.38688501715660095,
"learning_rate": 2.1563752897908352e-05,
"loss": 0.9572794342041016,
"step": 3800
},
{
"epoch": 0.5506448340820171,
"eval_loss": 0.9085790514945984,
"eval_runtime": 193.5054,
"eval_samples_per_second": 120.131,
"eval_steps_per_second": 3.757,
"step": 3800
},
{
"epoch": 0.5578901608462542,
"grad_norm": 0.4174833297729492,
"learning_rate": 2.0992786897758974e-05,
"loss": 0.9556180572509766,
"step": 3850
},
{
"epoch": 0.5651354876104913,
"grad_norm": 0.35218948125839233,
"learning_rate": 2.0423958452799854e-05,
"loss": 0.9547309875488281,
"step": 3900
},
{
"epoch": 0.5723808143747283,
"grad_norm": 0.3613223433494568,
"learning_rate": 1.9857570991414737e-05,
"loss": 0.9548380279541016,
"step": 3950
},
{
"epoch": 0.5796261411389654,
"grad_norm": 0.35305657982826233,
"learning_rate": 1.9293926639901092e-05,
"loss": 0.9536138916015625,
"step": 4000
},
{
"epoch": 0.5796261411389654,
"eval_loss": 0.9045887589454651,
"eval_runtime": 193.4058,
"eval_samples_per_second": 120.193,
"eval_steps_per_second": 3.759,
"step": 4000
},
{
"epoch": 0.5868714679032024,
"grad_norm": 0.35594019293785095,
"learning_rate": 1.873332606130787e-05,
"loss": 0.961040267944336,
"step": 4050
},
{
"epoch": 0.5941167946674395,
"grad_norm": 0.3579290509223938,
"learning_rate": 1.8176068295053684e-05,
"loss": 0.953524169921875,
"step": 4100
},
{
"epoch": 0.6013621214316766,
"grad_norm": 0.37046733498573303,
"learning_rate": 1.7622450597411216e-05,
"loss": 0.9559716796875,
"step": 4150
},
{
"epoch": 0.6086074481959136,
"grad_norm": 0.41828927397727966,
"learning_rate": 1.7072768282942695e-05,
"loss": 0.9484828186035156,
"step": 4200
},
{
"epoch": 0.6086074481959136,
"eval_loss": 0.9021432995796204,
"eval_runtime": 193.4682,
"eval_samples_per_second": 120.154,
"eval_steps_per_second": 3.758,
"step": 4200
}
],
"logging_steps": 50,
"max_steps": 6901,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8789181767221248.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}