{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9860291834833903,
"eval_steps": 30,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024837007140639553,
"grad_norm": 11.272954940795898,
"learning_rate": 1.1111111111111112e-05,
"loss": 2.7848,
"step": 10
},
{
"epoch": 0.04967401428127911,
"grad_norm": 1.125125765800476,
"learning_rate": 2.345679012345679e-05,
"loss": 1.2296,
"step": 20
},
{
"epoch": 0.07451102142191866,
"grad_norm": 0.8439419269561768,
"learning_rate": 3.580246913580247e-05,
"loss": 0.6858,
"step": 30
},
{
"epoch": 0.07451102142191866,
"eval_loss": 0.6361502408981323,
"eval_runtime": 38.0472,
"eval_samples_per_second": 4.468,
"eval_steps_per_second": 4.468,
"step": 30
},
{
"epoch": 0.09934802856255821,
"grad_norm": 0.639077365398407,
"learning_rate": 4.814814814814815e-05,
"loss": 0.5125,
"step": 40
},
{
"epoch": 0.12418503570319776,
"grad_norm": 0.6465684771537781,
"learning_rate": 6.049382716049383e-05,
"loss": 0.3889,
"step": 50
},
{
"epoch": 0.14902204284383733,
"grad_norm": 0.688926100730896,
"learning_rate": 7.283950617283951e-05,
"loss": 0.3039,
"step": 60
},
{
"epoch": 0.14902204284383733,
"eval_loss": 0.31402018666267395,
"eval_runtime": 37.4247,
"eval_samples_per_second": 4.542,
"eval_steps_per_second": 4.542,
"step": 60
},
{
"epoch": 0.17385904998447688,
"grad_norm": 0.6076411008834839,
"learning_rate": 8.518518518518518e-05,
"loss": 0.2683,
"step": 70
},
{
"epoch": 0.19869605712511643,
"grad_norm": 1.1276863813400269,
"learning_rate": 9.753086419753087e-05,
"loss": 0.2392,
"step": 80
},
{
"epoch": 0.22353306426575598,
"grad_norm": 0.5334923267364502,
"learning_rate": 9.996995997963675e-05,
"loss": 0.2136,
"step": 90
},
{
"epoch": 0.22353306426575598,
"eval_loss": 0.2120552957057953,
"eval_runtime": 37.372,
"eval_samples_per_second": 4.549,
"eval_steps_per_second": 4.549,
"step": 90
},
{
"epoch": 0.24837007140639553,
"grad_norm": 0.43537846207618713,
"learning_rate": 9.984798425822163e-05,
"loss": 0.1974,
"step": 100
},
{
"epoch": 0.2732070785470351,
"grad_norm": 0.7943837642669678,
"learning_rate": 9.963242415487557e-05,
"loss": 0.1678,
"step": 110
},
{
"epoch": 0.29804408568767465,
"grad_norm": 0.30124473571777344,
"learning_rate": 9.932368436116915e-05,
"loss": 0.1648,
"step": 120
},
{
"epoch": 0.29804408568767465,
"eval_loss": 0.15894636511802673,
"eval_runtime": 37.5461,
"eval_samples_per_second": 4.528,
"eval_steps_per_second": 4.528,
"step": 120
},
{
"epoch": 0.3228810928283142,
"grad_norm": 0.26191410422325134,
"learning_rate": 9.892234450380547e-05,
"loss": 0.1602,
"step": 130
},
{
"epoch": 0.34771809996895375,
"grad_norm": 0.5726041197776794,
"learning_rate": 9.842915805643155e-05,
"loss": 0.1647,
"step": 140
},
{
"epoch": 0.3725551071095933,
"grad_norm": 0.4395739436149597,
"learning_rate": 9.784505092507031e-05,
"loss": 0.1525,
"step": 150
},
{
"epoch": 0.3725551071095933,
"eval_loss": 0.14860223233699799,
"eval_runtime": 37.4998,
"eval_samples_per_second": 4.533,
"eval_steps_per_second": 4.533,
"step": 150
},
{
"epoch": 0.39739211425023285,
"grad_norm": 0.33055049180984497,
"learning_rate": 9.717111970982869e-05,
"loss": 0.1555,
"step": 160
},
{
"epoch": 0.4222291213908724,
"grad_norm": 0.2741611897945404,
"learning_rate": 9.640862964614564e-05,
"loss": 0.1487,
"step": 170
},
{
"epoch": 0.44706612853151195,
"grad_norm": 0.3228336572647095,
"learning_rate": 9.555901222944468e-05,
"loss": 0.1564,
"step": 180
},
{
"epoch": 0.44706612853151195,
"eval_loss": 0.1397811621427536,
"eval_runtime": 37.4841,
"eval_samples_per_second": 4.535,
"eval_steps_per_second": 4.535,
"step": 180
},
{
"epoch": 0.47190313567215153,
"grad_norm": 0.23800568282604218,
"learning_rate": 9.462386252765087e-05,
"loss": 0.1443,
"step": 190
},
{
"epoch": 0.49674014281279105,
"grad_norm": 0.27476266026496887,
"learning_rate": 9.36049361866175e-05,
"loss": 0.1447,
"step": 200
},
{
"epoch": 0.5215771499534306,
"grad_norm": 0.1981421262025833,
"learning_rate": 9.250414613408427e-05,
"loss": 0.1448,
"step": 210
},
{
"epoch": 0.5215771499534306,
"eval_loss": 0.1324106752872467,
"eval_runtime": 37.3913,
"eval_samples_per_second": 4.547,
"eval_steps_per_second": 4.547,
"step": 210
},
{
"epoch": 0.5464141570940702,
"grad_norm": 0.21448366343975067,
"learning_rate": 9.132355898835556e-05,
"loss": 0.1401,
"step": 220
},
{
"epoch": 0.5712511642347097,
"grad_norm": 0.2850506603717804,
"learning_rate": 9.00653911784403e-05,
"loss": 0.1455,
"step": 230
},
{
"epoch": 0.5960881713753493,
"grad_norm": 0.22351579368114471,
"learning_rate": 8.873200478293826e-05,
"loss": 0.1478,
"step": 240
},
{
"epoch": 0.5960881713753493,
"eval_loss": 0.1282844841480255,
"eval_runtime": 37.4497,
"eval_samples_per_second": 4.539,
"eval_steps_per_second": 4.539,
"step": 240
},
{
"epoch": 0.6209251785159888,
"grad_norm": 0.2048240602016449,
"learning_rate": 8.732590309548416e-05,
"loss": 0.1357,
"step": 250
},
{
"epoch": 0.6457621856566284,
"grad_norm": 0.24408458173274994,
"learning_rate": 8.584972592507553e-05,
"loss": 0.1371,
"step": 260
},
{
"epoch": 0.670599192797268,
"grad_norm": 0.14031532406806946,
"learning_rate": 8.430624464010706e-05,
"loss": 0.1424,
"step": 270
},
{
"epoch": 0.670599192797268,
"eval_loss": 0.12568299472332,
"eval_runtime": 37.8149,
"eval_samples_per_second": 4.496,
"eval_steps_per_second": 4.496,
"step": 270
},
{
"epoch": 0.6954361999379075,
"grad_norm": 0.24387261271476746,
"learning_rate": 8.269835696541607e-05,
"loss": 0.1289,
"step": 280
},
{
"epoch": 0.720273207078547,
"grad_norm": 0.1631806641817093,
"learning_rate": 8.102908154210693e-05,
"loss": 0.1358,
"step": 290
},
{
"epoch": 0.7451102142191866,
"grad_norm": 0.1921030879020691,
"learning_rate": 7.93015522603677e-05,
"loss": 0.1466,
"step": 300
},
{
"epoch": 0.7451102142191866,
"eval_loss": 0.12537378072738647,
"eval_runtime": 37.7611,
"eval_samples_per_second": 4.502,
"eval_steps_per_second": 4.502,
"step": 300
},
{
"epoch": 0.7699472213598262,
"grad_norm": 0.17755988240242004,
"learning_rate": 7.751901237591887e-05,
"loss": 0.1423,
"step": 310
},
{
"epoch": 0.7947842285004657,
"grad_norm": 0.22009995579719543,
"learning_rate": 7.568480842113952e-05,
"loss": 0.1399,
"step": 320
},
{
"epoch": 0.8196212356411052,
"grad_norm": 0.19072045385837555,
"learning_rate": 7.380238392230257e-05,
"loss": 0.1429,
"step": 330
},
{
"epoch": 0.8196212356411052,
"eval_loss": 0.12434083968400955,
"eval_runtime": 37.8732,
"eval_samples_per_second": 4.489,
"eval_steps_per_second": 4.489,
"step": 330
},
{
"epoch": 0.8444582427817448,
"grad_norm": 0.13499405980110168,
"learning_rate": 7.187527293471385e-05,
"loss": 0.1388,
"step": 340
},
{
"epoch": 0.8692952499223844,
"grad_norm": 0.1576332151889801,
"learning_rate": 6.990709340789273e-05,
"loss": 0.1308,
"step": 350
},
{
"epoch": 0.8941322570630239,
"grad_norm": 0.13361729681491852,
"learning_rate": 6.790154039324975e-05,
"loss": 0.132,
"step": 360
},
{
"epoch": 0.8941322570630239,
"eval_loss": 0.12276456505060196,
"eval_runtime": 37.8657,
"eval_samples_per_second": 4.49,
"eval_steps_per_second": 4.49,
"step": 360
},
{
"epoch": 0.9189692642036634,
"grad_norm": 0.13873551785945892,
"learning_rate": 6.586237910701374e-05,
"loss": 0.1454,
"step": 370
},
{
"epoch": 0.9438062713443031,
"grad_norm": 0.12903927266597748,
"learning_rate": 6.379343786143184e-05,
"loss": 0.1325,
"step": 380
},
{
"epoch": 0.9686432784849426,
"grad_norm": 0.12467560917139053,
"learning_rate": 6.169860087751321e-05,
"loss": 0.1389,
"step": 390
},
{
"epoch": 0.9686432784849426,
"eval_loss": 0.1223841980099678,
"eval_runtime": 37.8137,
"eval_samples_per_second": 4.496,
"eval_steps_per_second": 4.496,
"step": 390
},
{
"epoch": 0.9934802856255821,
"grad_norm": 0.1413436084985733,
"learning_rate": 5.95818009928099e-05,
"loss": 0.1283,
"step": 400
},
{
"epoch": 1.0173859049984477,
"grad_norm": 0.17072723805904388,
"learning_rate": 5.744701227792538e-05,
"loss": 0.1351,
"step": 410
},
{
"epoch": 1.0422229121390871,
"grad_norm": 0.15790237486362457,
"learning_rate": 5.529824257561212e-05,
"loss": 0.1346,
"step": 420
},
{
"epoch": 1.0422229121390871,
"eval_loss": 0.12222303450107574,
"eval_runtime": 37.8365,
"eval_samples_per_second": 4.493,
"eval_steps_per_second": 4.493,
"step": 420
},
{
"epoch": 1.0670599192797268,
"grad_norm": 0.14720048010349274,
"learning_rate": 5.313952597646568e-05,
"loss": 0.1324,
"step": 430
},
{
"epoch": 1.0918969264203664,
"grad_norm": 0.12014240026473999,
"learning_rate": 5.097491524534106e-05,
"loss": 0.1315,
"step": 440
},
{
"epoch": 1.1167339335610058,
"grad_norm": 0.18464700877666473,
"learning_rate": 4.88084742127102e-05,
"loss": 0.1201,
"step": 450
},
{
"epoch": 1.1167339335610058,
"eval_loss": 0.12194804102182388,
"eval_runtime": 37.6494,
"eval_samples_per_second": 4.515,
"eval_steps_per_second": 4.515,
"step": 450
},
{
"epoch": 1.1415709407016454,
"grad_norm": 0.1334882378578186,
"learning_rate": 4.664427014524492e-05,
"loss": 0.1309,
"step": 460
},
{
"epoch": 1.166407947842285,
"grad_norm": 0.14938846230506897,
"learning_rate": 4.448636610994857e-05,
"loss": 0.1313,
"step": 470
},
{
"epoch": 1.1912449549829245,
"grad_norm": 0.11317029595375061,
"learning_rate": 4.2338813346172476e-05,
"loss": 0.1297,
"step": 480
},
{
"epoch": 1.1912449549829245,
"eval_loss": 0.12136982381343842,
"eval_runtime": 38.0285,
"eval_samples_per_second": 4.47,
"eval_steps_per_second": 4.47,
"step": 480
},
{
"epoch": 1.2160819621235641,
"grad_norm": 0.15464289486408234,
"learning_rate": 4.020564365983722e-05,
"loss": 0.1344,
"step": 490
},
{
"epoch": 1.2409189692642038,
"grad_norm": 0.13622425496578217,
"learning_rate": 3.80908618541384e-05,
"loss": 0.1321,
"step": 500
},
{
"epoch": 1.2657559764048432,
"grad_norm": 0.18376871943473816,
"learning_rate": 3.5998438210946937e-05,
"loss": 0.1252,
"step": 510
},
{
"epoch": 1.2657559764048432,
"eval_loss": 0.12068537622690201,
"eval_runtime": 37.8008,
"eval_samples_per_second": 4.497,
"eval_steps_per_second": 4.497,
"step": 510
},
{
"epoch": 1.2905929835454828,
"grad_norm": 0.1372678130865097,
"learning_rate": 3.393230103701989e-05,
"loss": 0.13,
"step": 520
},
{
"epoch": 1.3154299906861224,
"grad_norm": 0.12849152088165283,
"learning_rate": 3.1896329289014846e-05,
"loss": 0.1317,
"step": 530
},
{
"epoch": 1.3402669978267618,
"grad_norm": 0.11176390945911407,
"learning_rate": 2.9894345291154202e-05,
"loss": 0.13,
"step": 540
},
{
"epoch": 1.3402669978267618,
"eval_loss": 0.12049100548028946,
"eval_runtime": 37.9715,
"eval_samples_per_second": 4.477,
"eval_steps_per_second": 4.477,
"step": 540
},
{
"epoch": 1.3651040049674015,
"grad_norm": 0.16603630781173706,
"learning_rate": 2.793010755921068e-05,
"loss": 0.1272,
"step": 550
},
{
"epoch": 1.389941012108041,
"grad_norm": 0.1615874320268631,
"learning_rate": 2.6007303744286844e-05,
"loss": 0.1309,
"step": 560
},
{
"epoch": 1.4147780192486805,
"grad_norm": 0.13734523952007294,
"learning_rate": 2.4129543709635378e-05,
"loss": 0.1303,
"step": 570
},
{
"epoch": 1.4147780192486805,
"eval_loss": 0.12090769410133362,
"eval_runtime": 38.0585,
"eval_samples_per_second": 4.467,
"eval_steps_per_second": 4.467,
"step": 570
},
{
"epoch": 1.4396150263893202,
"grad_norm": 0.13437435030937195,
"learning_rate": 2.230035275351806e-05,
"loss": 0.1338,
"step": 580
},
{
"epoch": 1.4644520335299596,
"grad_norm": 0.12444904446601868,
"learning_rate": 2.0523164990826543e-05,
"loss": 0.1281,
"step": 590
},
{
"epoch": 1.4892890406705992,
"grad_norm": 0.14093923568725586,
"learning_rate": 1.8801316905890583e-05,
"loss": 0.1307,
"step": 600
},
{
"epoch": 1.4892890406705992,
"eval_loss": 0.11982201784849167,
"eval_runtime": 37.9037,
"eval_samples_per_second": 4.485,
"eval_steps_per_second": 4.485,
"step": 600
},
{
"epoch": 1.5141260478112386,
"grad_norm": 0.14758960902690887,
"learning_rate": 1.7138041088577267e-05,
"loss": 0.1299,
"step": 610
},
{
"epoch": 1.5389630549518785,
"grad_norm": 0.13439303636550903,
"learning_rate": 1.5536460165441323e-05,
"loss": 0.1285,
"step": 620
},
{
"epoch": 1.5638000620925179,
"grad_norm": 0.1315300315618515,
"learning_rate": 1.3999580937320011e-05,
"loss": 0.1294,
"step": 630
},
{
"epoch": 1.5638000620925179,
"eval_loss": 0.11952362954616547,
"eval_runtime": 37.8235,
"eval_samples_per_second": 4.495,
"eval_steps_per_second": 4.495,
"step": 630
},
{
"epoch": 1.5886370692331573,
"grad_norm": 0.13855591416358948,
"learning_rate": 1.2530288734378764e-05,
"loss": 0.1302,
"step": 640
},
{
"epoch": 1.613474076373797,
"grad_norm": 0.14461226761341095,
"learning_rate": 1.1131341999205274e-05,
"loss": 0.1241,
"step": 650
},
{
"epoch": 1.6383110835144366,
"grad_norm": 0.1513630747795105,
"learning_rate": 9.805367108121761e-06,
"loss": 0.1177,
"step": 660
},
{
"epoch": 1.6383110835144366,
"eval_loss": 0.11914250999689102,
"eval_runtime": 37.8427,
"eval_samples_per_second": 4.492,
"eval_steps_per_second": 4.492,
"step": 660
},
{
"epoch": 1.663148090655076,
"grad_norm": 0.3891820013523102,
"learning_rate": 8.554853440437805e-06,
"loss": 0.1297,
"step": 670
},
{
"epoch": 1.6879850977957156,
"grad_norm": 0.1408233791589737,
"learning_rate": 7.382148704900882e-06,
"loss": 0.1302,
"step": 680
},
{
"epoch": 1.7128221049363552,
"grad_norm": 0.15466411411762238,
"learning_rate": 6.289454532118444e-06,
"loss": 0.1288,
"step": 690
},
{
"epoch": 1.7128221049363552,
"eval_loss": 0.11917895078659058,
"eval_runtime": 37.8925,
"eval_samples_per_second": 4.486,
"eval_steps_per_second": 4.486,
"step": 690
},
{
"epoch": 1.7376591120769946,
"grad_norm": 0.09873683750629425,
"learning_rate": 5.278822341226519e-06,
"loss": 0.1296,
"step": 700
},
{
"epoch": 1.7624961192176343,
"grad_norm": 0.13227960467338562,
"learning_rate": 4.352149488564605e-06,
"loss": 0.1241,
"step": 710
},
{
"epoch": 1.787333126358274,
"grad_norm": 0.12939970195293427,
"learning_rate": 3.511175705587433e-06,
"loss": 0.1362,
"step": 720
},
{
"epoch": 1.787333126358274,
"eval_loss": 0.11892342567443848,
"eval_runtime": 37.9012,
"eval_samples_per_second": 4.485,
"eval_steps_per_second": 4.485,
"step": 720
},
{
"epoch": 1.8121701334989133,
"grad_norm": 0.15326713025569916,
"learning_rate": 2.75747983270091e-06,
"loss": 0.1287,
"step": 730
},
{
"epoch": 1.837007140639553,
"grad_norm": 0.13985218107700348,
"learning_rate": 2.0924768551542463e-06,
"loss": 0.1298,
"step": 740
},
{
"epoch": 1.8618441477801926,
"grad_norm": 0.14625447988510132,
"learning_rate": 1.517415246552978e-06,
"loss": 0.1295,
"step": 750
},
{
"epoch": 1.8618441477801926,
"eval_loss": 0.11896809190511703,
"eval_runtime": 37.7631,
"eval_samples_per_second": 4.502,
"eval_steps_per_second": 4.502,
"step": 750
},
{
"epoch": 1.886681154920832,
"grad_norm": 0.134150892496109,
"learning_rate": 1.033374624980249e-06,
"loss": 0.1252,
"step": 760
},
{
"epoch": 1.9115181620614716,
"grad_norm": 0.1421349197626114,
"learning_rate": 6.412637261266396e-07,
"loss": 0.1224,
"step": 770
},
{
"epoch": 1.9363551692021113,
"grad_norm": 0.1370449662208557,
"learning_rate": 3.418186972338977e-07,
"loss": 0.1305,
"step": 780
},
{
"epoch": 1.9363551692021113,
"eval_loss": 0.11897934973239899,
"eval_runtime": 37.8217,
"eval_samples_per_second": 4.495,
"eval_steps_per_second": 4.495,
"step": 780
},
{
"epoch": 1.9611921763427507,
"grad_norm": 0.14134632050991058,
"learning_rate": 1.356017150553557e-07,
"loss": 0.1298,
"step": 790
},
{
"epoch": 1.9860291834833903,
"grad_norm": 0.13970708847045898,
"learning_rate": 2.299993042786941e-08,
"loss": 0.1286,
"step": 800
}
],
"logging_steps": 10,
"max_steps": 806,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2428317974302515e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}