{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9458527682011764,
"eval_steps": 30,
"global_step": 1200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01622388967755019,
"grad_norm": 8.625337600708008,
"learning_rate": 1.4516129032258066e-05,
"loss": 2.6755,
"step": 10
},
{
"epoch": 0.03244777935510038,
"grad_norm": 0.859293520450592,
"learning_rate": 3.0645161290322585e-05,
"loss": 1.0056,
"step": 20
},
{
"epoch": 0.04867166903265058,
"grad_norm": 0.7715856432914734,
"learning_rate": 4.67741935483871e-05,
"loss": 0.6365,
"step": 30
},
{
"epoch": 0.04867166903265058,
"eval_loss": 0.5236709713935852,
"eval_runtime": 55.0101,
"eval_samples_per_second": 4.726,
"eval_steps_per_second": 4.726,
"step": 30
},
{
"epoch": 0.06489555871020077,
"grad_norm": 0.5313697457313538,
"learning_rate": 6.290322580645161e-05,
"loss": 0.4295,
"step": 40
},
{
"epoch": 0.08111944838775097,
"grad_norm": 1.1634626388549805,
"learning_rate": 7.903225806451613e-05,
"loss": 0.3387,
"step": 50
},
{
"epoch": 0.09734333806530115,
"grad_norm": 0.7073950171470642,
"learning_rate": 9.516129032258065e-05,
"loss": 0.2797,
"step": 60
},
{
"epoch": 0.09734333806530115,
"eval_loss": 0.2691636085510254,
"eval_runtime": 54.4477,
"eval_samples_per_second": 4.775,
"eval_steps_per_second": 4.775,
"step": 60
},
{
"epoch": 0.11356722774285134,
"grad_norm": 0.5980345606803894,
"learning_rate": 0.00011129032258064515,
"loss": 0.2525,
"step": 70
},
{
"epoch": 0.12979111742040153,
"grad_norm": 0.5646739602088928,
"learning_rate": 0.0001274193548387097,
"loss": 0.2208,
"step": 80
},
{
"epoch": 0.14601500709795173,
"grad_norm": 0.33468499779701233,
"learning_rate": 0.00014354838709677422,
"loss": 0.2018,
"step": 90
},
{
"epoch": 0.14601500709795173,
"eval_loss": 0.1886902004480362,
"eval_runtime": 54.4734,
"eval_samples_per_second": 4.773,
"eval_steps_per_second": 4.773,
"step": 90
},
{
"epoch": 0.16223889677550193,
"grad_norm": 0.3952212929725647,
"learning_rate": 0.00015967741935483872,
"loss": 0.1841,
"step": 100
},
{
"epoch": 0.1784627864530521,
"grad_norm": 0.34694239497184753,
"learning_rate": 0.00017580645161290325,
"loss": 0.1731,
"step": 110
},
{
"epoch": 0.1946866761306023,
"grad_norm": 0.3307943046092987,
"learning_rate": 0.00019193548387096775,
"loss": 0.1736,
"step": 120
},
{
"epoch": 0.1946866761306023,
"eval_loss": 0.1661035567522049,
"eval_runtime": 54.4184,
"eval_samples_per_second": 4.778,
"eval_steps_per_second": 4.778,
"step": 120
},
{
"epoch": 0.2109105658081525,
"grad_norm": 0.2933768928050995,
"learning_rate": 0.0001999899871766749,
"loss": 0.1616,
"step": 130
},
{
"epoch": 0.22713445548570269,
"grad_norm": 0.3262489438056946,
"learning_rate": 0.00019990989662046818,
"loss": 0.1619,
"step": 140
},
{
"epoch": 0.2433583451632529,
"grad_norm": 0.18724548816680908,
"learning_rate": 0.00019974977965945,
"loss": 0.1482,
"step": 150
},
{
"epoch": 0.2433583451632529,
"eval_loss": 0.14517094194889069,
"eval_runtime": 54.4554,
"eval_samples_per_second": 4.775,
"eval_steps_per_second": 4.775,
"step": 150
},
{
"epoch": 0.25958223484080306,
"grad_norm": 0.39443448185920715,
"learning_rate": 0.0001995097645450266,
"loss": 0.1571,
"step": 160
},
{
"epoch": 0.2758061245183533,
"grad_norm": 0.3215681314468384,
"learning_rate": 0.00019919004352588767,
"loss": 0.1518,
"step": 170
},
{
"epoch": 0.29203001419590346,
"grad_norm": 0.1917540580034256,
"learning_rate": 0.0001987908726940178,
"loss": 0.1477,
"step": 180
},
{
"epoch": 0.29203001419590346,
"eval_loss": 0.1380539983510971,
"eval_runtime": 54.5257,
"eval_samples_per_second": 4.768,
"eval_steps_per_second": 4.768,
"step": 180
},
{
"epoch": 0.30825390387345364,
"grad_norm": 0.27752983570098877,
"learning_rate": 0.00019831257177957044,
"loss": 0.1352,
"step": 190
},
{
"epoch": 0.32447779355100387,
"grad_norm": 0.3052772879600525,
"learning_rate": 0.00019775552389476864,
"loss": 0.1425,
"step": 200
},
{
"epoch": 0.34070168322855404,
"grad_norm": 0.17911162972450256,
"learning_rate": 0.00019712017522703764,
"loss": 0.1411,
"step": 210
},
{
"epoch": 0.34070168322855404,
"eval_loss": 0.13320107758045197,
"eval_runtime": 54.1867,
"eval_samples_per_second": 4.798,
"eval_steps_per_second": 4.798,
"step": 210
},
{
"epoch": 0.3569255729061042,
"grad_norm": 0.15296442806720734,
"learning_rate": 0.0001964070346816151,
"loss": 0.1378,
"step": 220
},
{
"epoch": 0.37314946258365445,
"grad_norm": 0.17346209287643433,
"learning_rate": 0.00019561667347392508,
"loss": 0.136,
"step": 230
},
{
"epoch": 0.3893733522612046,
"grad_norm": 0.16340570151805878,
"learning_rate": 0.00019474972467204297,
"loss": 0.1395,
"step": 240
},
{
"epoch": 0.3893733522612046,
"eval_loss": 0.13255620002746582,
"eval_runtime": 54.3562,
"eval_samples_per_second": 4.783,
"eval_steps_per_second": 4.783,
"step": 240
},
{
"epoch": 0.4055972419387548,
"grad_norm": 0.2969210147857666,
"learning_rate": 0.0001938068826896166,
"loss": 0.1368,
"step": 250
},
{
"epoch": 0.421821131616305,
"grad_norm": 0.20236295461654663,
"learning_rate": 0.00019278890272965096,
"loss": 0.1273,
"step": 260
},
{
"epoch": 0.4380450212938552,
"grad_norm": 0.2891747057437897,
"learning_rate": 0.00019169660017960137,
"loss": 0.129,
"step": 270
},
{
"epoch": 0.4380450212938552,
"eval_loss": 0.12807676196098328,
"eval_runtime": 54.3522,
"eval_samples_per_second": 4.784,
"eval_steps_per_second": 4.784,
"step": 270
},
{
"epoch": 0.45426891097140537,
"grad_norm": 0.20254789292812347,
"learning_rate": 0.0001905308499582597,
"loss": 0.141,
"step": 280
},
{
"epoch": 0.4704928006489556,
"grad_norm": 0.1715897023677826,
"learning_rate": 0.00018929258581495685,
"loss": 0.1329,
"step": 290
},
{
"epoch": 0.4867166903265058,
"grad_norm": 0.3499029278755188,
"learning_rate": 0.00018798279958164295,
"loss": 0.1355,
"step": 300
},
{
"epoch": 0.4867166903265058,
"eval_loss": 0.12350723892450333,
"eval_runtime": 54.5668,
"eval_samples_per_second": 4.765,
"eval_steps_per_second": 4.765,
"step": 300
},
{
"epoch": 0.502940580004056,
"grad_norm": 0.15775884687900543,
"learning_rate": 0.00018660254037844388,
"loss": 0.126,
"step": 310
},
{
"epoch": 0.5191644696816061,
"grad_norm": 0.19402213394641876,
"learning_rate": 0.00018515291377333112,
"loss": 0.1332,
"step": 320
},
{
"epoch": 0.5353883593591564,
"grad_norm": 0.13237500190734863,
"learning_rate": 0.0001836350808965776,
"loss": 0.1441,
"step": 330
},
{
"epoch": 0.5353883593591564,
"eval_loss": 0.12080405652523041,
"eval_runtime": 54.7379,
"eval_samples_per_second": 4.75,
"eval_steps_per_second": 4.75,
"step": 330
},
{
"epoch": 0.5516122490367066,
"grad_norm": 0.29119789600372314,
"learning_rate": 0.00018205025751070875,
"loss": 0.1245,
"step": 340
},
{
"epoch": 0.5678361387142568,
"grad_norm": 0.15898150205612183,
"learning_rate": 0.00018039971303669407,
"loss": 0.121,
"step": 350
},
{
"epoch": 0.5840600283918069,
"grad_norm": 0.13710354268550873,
"learning_rate": 0.000178684769537159,
"loss": 0.1346,
"step": 360
},
{
"epoch": 0.5840600283918069,
"eval_loss": 0.12007435411214828,
"eval_runtime": 54.7789,
"eval_samples_per_second": 4.746,
"eval_steps_per_second": 4.746,
"step": 360
},
{
"epoch": 0.6002839180693571,
"grad_norm": 0.12323573976755142,
"learning_rate": 0.0001769068006574317,
"loss": 0.1311,
"step": 370
},
{
"epoch": 0.6165078077469073,
"grad_norm": 0.14197634160518646,
"learning_rate": 0.00017506723052527242,
"loss": 0.122,
"step": 380
},
{
"epoch": 0.6327316974244576,
"grad_norm": 0.13415870070457458,
"learning_rate": 0.00017316753261016783,
"loss": 0.1242,
"step": 390
},
{
"epoch": 0.6327316974244576,
"eval_loss": 0.11815402656793594,
"eval_runtime": 54.9041,
"eval_samples_per_second": 4.736,
"eval_steps_per_second": 4.736,
"step": 390
},
{
"epoch": 0.6489555871020077,
"grad_norm": 0.14391979575157166,
"learning_rate": 0.00017120922854310257,
"loss": 0.1235,
"step": 400
},
{
"epoch": 0.6651794767795579,
"grad_norm": 0.12567928433418274,
"learning_rate": 0.00016919388689775464,
"loss": 0.1292,
"step": 410
},
{
"epoch": 0.6814033664571081,
"grad_norm": 0.24638038873672485,
"learning_rate": 0.0001671231219340903,
"loss": 0.124,
"step": 420
},
{
"epoch": 0.6814033664571081,
"eval_loss": 0.11765418201684952,
"eval_runtime": 54.8324,
"eval_samples_per_second": 4.742,
"eval_steps_per_second": 4.742,
"step": 420
},
{
"epoch": 0.6976272561346583,
"grad_norm": 0.11872219294309616,
"learning_rate": 0.00016499859230536466,
"loss": 0.1284,
"step": 430
},
{
"epoch": 0.7138511458122084,
"grad_norm": 0.14060816168785095,
"learning_rate": 0.00016282199972956425,
"loss": 0.1228,
"step": 440
},
{
"epoch": 0.7300750354897587,
"grad_norm": 0.1108463779091835,
"learning_rate": 0.00016059508762635482,
"loss": 0.1315,
"step": 450
},
{
"epoch": 0.7300750354897587,
"eval_loss": 0.11692557483911514,
"eval_runtime": 55.1996,
"eval_samples_per_second": 4.71,
"eval_steps_per_second": 4.71,
"step": 450
},
{
"epoch": 0.7462989251673089,
"grad_norm": 0.13592351973056793,
"learning_rate": 0.00015831963972062733,
"loss": 0.1235,
"step": 460
},
{
"epoch": 0.7625228148448591,
"grad_norm": 0.12181869149208069,
"learning_rate": 0.00015599747861375955,
"loss": 0.12,
"step": 470
},
{
"epoch": 0.7787467045224092,
"grad_norm": 0.37740716338157654,
"learning_rate": 0.00015363046432373824,
"loss": 0.119,
"step": 480
},
{
"epoch": 0.7787467045224092,
"eval_loss": 0.11858060956001282,
"eval_runtime": 55.2002,
"eval_samples_per_second": 4.71,
"eval_steps_per_second": 4.71,
"step": 480
},
{
"epoch": 0.7949705941999594,
"grad_norm": 0.3189144432544708,
"learning_rate": 0.00015122049279531143,
"loss": 0.1322,
"step": 490
},
{
"epoch": 0.8111944838775096,
"grad_norm": 0.15118102729320526,
"learning_rate": 0.00014876949438136347,
"loss": 0.1211,
"step": 500
},
{
"epoch": 0.8274183735550599,
"grad_norm": 0.09746048599481583,
"learning_rate": 0.0001462794322967299,
"loss": 0.124,
"step": 510
},
{
"epoch": 0.8274183735550599,
"eval_loss": 0.11739031225442886,
"eval_runtime": 54.9138,
"eval_samples_per_second": 4.735,
"eval_steps_per_second": 4.735,
"step": 510
},
{
"epoch": 0.84364226323261,
"grad_norm": 0.11682560294866562,
"learning_rate": 0.00014375230104569044,
"loss": 0.1219,
"step": 520
},
{
"epoch": 0.8598661529101602,
"grad_norm": 0.10706628113985062,
"learning_rate": 0.0001411901248243993,
"loss": 0.1198,
"step": 530
},
{
"epoch": 0.8760900425877104,
"grad_norm": 0.26870429515838623,
"learning_rate": 0.0001385949558995329,
"loss": 0.1253,
"step": 540
},
{
"epoch": 0.8760900425877104,
"eval_loss": 0.11531012505292892,
"eval_runtime": 54.9326,
"eval_samples_per_second": 4.733,
"eval_steps_per_second": 4.733,
"step": 540
},
{
"epoch": 0.8923139322652606,
"grad_norm": 0.09110305458307266,
"learning_rate": 0.0001359688729644536,
"loss": 0.1129,
"step": 550
},
{
"epoch": 0.9085378219428107,
"grad_norm": 0.11204719543457031,
"learning_rate": 0.00013331397947420576,
"loss": 0.129,
"step": 560
},
{
"epoch": 0.924761711620361,
"grad_norm": 0.15059269964694977,
"learning_rate": 0.00013063240196067836,
"loss": 0.1272,
"step": 570
},
{
"epoch": 0.924761711620361,
"eval_loss": 0.11525142192840576,
"eval_runtime": 55.1098,
"eval_samples_per_second": 4.718,
"eval_steps_per_second": 4.718,
"step": 570
},
{
"epoch": 0.9409856012979112,
"grad_norm": 0.11085380613803864,
"learning_rate": 0.00012792628832928302,
"loss": 0.1243,
"step": 580
},
{
"epoch": 0.9572094909754614,
"grad_norm": 0.10796050727367401,
"learning_rate": 0.00012519780613851254,
"loss": 0.1197,
"step": 590
},
{
"epoch": 0.9734333806530115,
"grad_norm": 0.08801472187042236,
"learning_rate": 0.00012244914086375724,
"loss": 0.1261,
"step": 600
},
{
"epoch": 0.9734333806530115,
"eval_loss": 0.11538033187389374,
"eval_runtime": 54.7547,
"eval_samples_per_second": 4.748,
"eval_steps_per_second": 4.748,
"step": 600
},
{
"epoch": 0.9896572703305617,
"grad_norm": 0.08395121991634369,
"learning_rate": 0.00011968249414677055,
"loss": 0.1215,
"step": 610
},
{
"epoch": 1.0048671669032652,
"grad_norm": 0.10106801986694336,
"learning_rate": 0.00011690008203218493,
"loss": 0.1206,
"step": 620
},
{
"epoch": 1.0210910565808153,
"grad_norm": 0.08509080857038498,
"learning_rate": 0.00011410413319249194,
"loss": 0.1125,
"step": 630
},
{
"epoch": 1.0210910565808153,
"eval_loss": 0.11403891444206238,
"eval_runtime": 54.6547,
"eval_samples_per_second": 4.757,
"eval_steps_per_second": 4.757,
"step": 630
},
{
"epoch": 1.0373149462583655,
"grad_norm": 0.09074535965919495,
"learning_rate": 0.00011129688714290729,
"loss": 0.1167,
"step": 640
},
{
"epoch": 1.0535388359359157,
"grad_norm": 0.10673358291387558,
"learning_rate": 0.00010848059244755093,
"loss": 0.1082,
"step": 650
},
{
"epoch": 1.0697627256134659,
"grad_norm": 0.11839364469051361,
"learning_rate": 0.00010565750491837925,
"loss": 0.1192,
"step": 660
},
{
"epoch": 1.0697627256134659,
"eval_loss": 0.11325760185718536,
"eval_runtime": 54.7532,
"eval_samples_per_second": 4.749,
"eval_steps_per_second": 4.749,
"step": 660
},
{
"epoch": 1.085986615291016,
"grad_norm": 0.08097315579652786,
"learning_rate": 0.00010282988580831183,
"loss": 0.1188,
"step": 670
},
{
"epoch": 1.1022105049685662,
"grad_norm": 0.10252730548381805,
"learning_rate": 0.0001,
"loss": 0.1217,
"step": 680
},
{
"epoch": 1.1184343946461164,
"grad_norm": 0.1087854653596878,
"learning_rate": 9.71701141916882e-05,
"loss": 0.1185,
"step": 690
},
{
"epoch": 1.1184343946461164,
"eval_loss": 0.11274029314517975,
"eval_runtime": 54.6489,
"eval_samples_per_second": 4.758,
"eval_steps_per_second": 4.758,
"step": 690
},
{
"epoch": 1.1346582843236666,
"grad_norm": 0.09857626259326935,
"learning_rate": 9.434249508162076e-05,
"loss": 0.1225,
"step": 700
},
{
"epoch": 1.1508821740012167,
"grad_norm": 0.09853193908929825,
"learning_rate": 9.151940755244912e-05,
"loss": 0.1185,
"step": 710
},
{
"epoch": 1.167106063678767,
"grad_norm": 0.0954899936914444,
"learning_rate": 8.870311285709274e-05,
"loss": 0.1336,
"step": 720
},
{
"epoch": 1.167106063678767,
"eval_loss": 0.11281841993331909,
"eval_runtime": 54.5506,
"eval_samples_per_second": 4.766,
"eval_steps_per_second": 4.766,
"step": 720
},
{
"epoch": 1.183329953356317,
"grad_norm": 0.0967244803905487,
"learning_rate": 8.58958668075081e-05,
"loss": 0.118,
"step": 730
},
{
"epoch": 1.1995538430338675,
"grad_norm": 0.09406362473964691,
"learning_rate": 8.309991796781511e-05,
"loss": 0.1218,
"step": 740
},
{
"epoch": 1.2157777327114176,
"grad_norm": 0.0989699587225914,
"learning_rate": 8.031750585322947e-05,
"loss": 0.1178,
"step": 750
},
{
"epoch": 1.2157777327114176,
"eval_loss": 0.11203999817371368,
"eval_runtime": 54.636,
"eval_samples_per_second": 4.759,
"eval_steps_per_second": 4.759,
"step": 750
},
{
"epoch": 1.2320016223889678,
"grad_norm": 0.10200025886297226,
"learning_rate": 7.755085913624274e-05,
"loss": 0.1126,
"step": 760
},
{
"epoch": 1.248225512066518,
"grad_norm": 0.09863891452550888,
"learning_rate": 7.48021938614875e-05,
"loss": 0.1253,
"step": 770
},
{
"epoch": 1.2644494017440682,
"grad_norm": 0.07942435145378113,
"learning_rate": 7.2073711670717e-05,
"loss": 0.1188,
"step": 780
},
{
"epoch": 1.2644494017440682,
"eval_loss": 0.11259657144546509,
"eval_runtime": 54.5511,
"eval_samples_per_second": 4.766,
"eval_steps_per_second": 4.766,
"step": 780
},
{
"epoch": 1.2806732914216183,
"grad_norm": 0.13004329800605774,
"learning_rate": 6.936759803932167e-05,
"loss": 0.1158,
"step": 790
},
{
"epoch": 1.2968971810991685,
"grad_norm": 0.08646363765001297,
"learning_rate": 6.668602052579424e-05,
"loss": 0.1173,
"step": 800
},
{
"epoch": 1.3131210707767187,
"grad_norm": 0.08868124336004257,
"learning_rate": 6.403112703554643e-05,
"loss": 0.1065,
"step": 810
},
{
"epoch": 1.3131210707767187,
"eval_loss": 0.1115630567073822,
"eval_runtime": 55.0171,
"eval_samples_per_second": 4.726,
"eval_steps_per_second": 4.726,
"step": 810
},
{
"epoch": 1.3293449604542689,
"grad_norm": 0.08065708726644516,
"learning_rate": 6.140504410046712e-05,
"loss": 0.1243,
"step": 820
},
{
"epoch": 1.345568850131819,
"grad_norm": 0.08410122245550156,
"learning_rate": 5.880987517560075e-05,
"loss": 0.114,
"step": 830
},
{
"epoch": 1.3617927398093692,
"grad_norm": 0.0926986113190651,
"learning_rate": 5.624769895430961e-05,
"loss": 0.1178,
"step": 840
},
{
"epoch": 1.3617927398093692,
"eval_loss": 0.11092434078454971,
"eval_runtime": 55.2141,
"eval_samples_per_second": 4.709,
"eval_steps_per_second": 4.709,
"step": 840
},
{
"epoch": 1.3780166294869196,
"grad_norm": 0.07351404428482056,
"learning_rate": 5.372056770327013e-05,
"loss": 0.1108,
"step": 850
},
{
"epoch": 1.3942405191644696,
"grad_norm": 0.08835868537425995,
"learning_rate": 5.123050561863657e-05,
"loss": 0.1127,
"step": 860
},
{
"epoch": 1.41046440884202,
"grad_norm": 0.08012858778238297,
"learning_rate": 4.877950720468859e-05,
"loss": 0.1118,
"step": 870
},
{
"epoch": 1.41046440884202,
"eval_loss": 0.11060689389705658,
"eval_runtime": 55.1249,
"eval_samples_per_second": 4.717,
"eval_steps_per_second": 4.717,
"step": 870
},
{
"epoch": 1.4266882985195701,
"grad_norm": 0.09063031524419785,
"learning_rate": 4.636953567626177e-05,
"loss": 0.1195,
"step": 880
},
{
"epoch": 1.4429121881971203,
"grad_norm": 0.07680074125528336,
"learning_rate": 4.4002521386240466e-05,
"loss": 0.1133,
"step": 890
},
{
"epoch": 1.4591360778746705,
"grad_norm": 0.09211437404155731,
"learning_rate": 4.168036027937267e-05,
"loss": 0.113,
"step": 900
},
{
"epoch": 1.4591360778746705,
"eval_loss": 0.1109917163848877,
"eval_runtime": 54.6876,
"eval_samples_per_second": 4.754,
"eval_steps_per_second": 4.754,
"step": 900
},
{
"epoch": 1.4753599675522207,
"grad_norm": 0.08036933839321136,
"learning_rate": 3.9404912373645185e-05,
"loss": 0.1183,
"step": 910
},
{
"epoch": 1.4915838572297708,
"grad_norm": 0.07682196795940399,
"learning_rate": 3.717800027043576e-05,
"loss": 0.1198,
"step": 920
},
{
"epoch": 1.507807746907321,
"grad_norm": 0.08601044863462448,
"learning_rate": 3.500140769463533e-05,
"loss": 0.1209,
"step": 930
},
{
"epoch": 1.507807746907321,
"eval_loss": 0.1104922965168953,
"eval_runtime": 54.6525,
"eval_samples_per_second": 4.757,
"eval_steps_per_second": 4.757,
"step": 930
},
{
"epoch": 1.5240316365848712,
"grad_norm": 0.08242760598659515,
"learning_rate": 3.287687806590971e-05,
"loss": 0.1127,
"step": 940
},
{
"epoch": 1.5402555262624213,
"grad_norm": 0.08664223551750183,
"learning_rate": 3.080611310224539e-05,
"loss": 0.1193,
"step": 950
},
{
"epoch": 1.5564794159399717,
"grad_norm": 0.08529651165008545,
"learning_rate": 2.879077145689746e-05,
"loss": 0.1126,
"step": 960
},
{
"epoch": 1.5564794159399717,
"eval_loss": 0.10984186083078384,
"eval_runtime": 54.6191,
"eval_samples_per_second": 4.76,
"eval_steps_per_second": 4.76,
"step": 960
},
{
"epoch": 1.5727033056175217,
"grad_norm": 0.07219547033309937,
"learning_rate": 2.6832467389832173e-05,
"loss": 0.1182,
"step": 970
},
{
"epoch": 1.588927195295072,
"grad_norm": 0.10246000438928604,
"learning_rate": 2.493276947472756e-05,
"loss": 0.1109,
"step": 980
},
{
"epoch": 1.605151084972622,
"grad_norm": 0.08859504014253616,
"learning_rate": 2.3093199342568318e-05,
"loss": 0.1137,
"step": 990
},
{
"epoch": 1.605151084972622,
"eval_loss": 0.10924239456653595,
"eval_runtime": 54.6323,
"eval_samples_per_second": 4.759,
"eval_steps_per_second": 4.759,
"step": 990
},
{
"epoch": 1.6213749746501724,
"grad_norm": 0.08224408328533173,
"learning_rate": 2.1315230462840985e-05,
"loss": 0.1168,
"step": 1000
},
{
"epoch": 1.6375988643277226,
"grad_norm": 0.09429515153169632,
"learning_rate": 1.9600286963305957e-05,
"loss": 0.114,
"step": 1010
},
{
"epoch": 1.6538227540052728,
"grad_norm": 0.08045922219753265,
"learning_rate": 1.7949742489291255e-05,
"loss": 0.111,
"step": 1020
},
{
"epoch": 1.6538227540052728,
"eval_loss": 0.10893593728542328,
"eval_runtime": 54.7365,
"eval_samples_per_second": 4.75,
"eval_steps_per_second": 4.75,
"step": 1020
},
{
"epoch": 1.670046643682823,
"grad_norm": 0.0847032219171524,
"learning_rate": 1.6364919103422393e-05,
"loss": 0.1142,
"step": 1030
},
{
"epoch": 1.6862705333603731,
"grad_norm": 0.08252795040607452,
"learning_rate": 1.4847086226668872e-05,
"loss": 0.1126,
"step": 1040
},
{
"epoch": 1.7024944230379233,
"grad_norm": 0.0862165316939354,
"learning_rate": 1.339745962155613e-05,
"loss": 0.1186,
"step": 1050
},
{
"epoch": 1.7024944230379233,
"eval_loss": 0.1090201735496521,
"eval_runtime": 54.7713,
"eval_samples_per_second": 4.747,
"eval_steps_per_second": 4.747,
"step": 1050
},
{
"epoch": 1.7187183127154735,
"grad_norm": 0.09021241962909698,
"learning_rate": 1.2017200418357078e-05,
"loss": 0.1178,
"step": 1060
},
{
"epoch": 1.7349422023930239,
"grad_norm": 0.07867942750453949,
"learning_rate": 1.0707414185043163e-05,
"loss": 0.1041,
"step": 1070
},
{
"epoch": 1.7511660920705738,
"grad_norm": 0.10106240212917328,
"learning_rate": 9.469150041740338e-06,
"loss": 0.1111,
"step": 1080
},
{
"epoch": 1.7511660920705738,
"eval_loss": 0.10924577713012695,
"eval_runtime": 54.6761,
"eval_samples_per_second": 4.755,
"eval_steps_per_second": 4.755,
"step": 1080
},
{
"epoch": 1.7673899817481242,
"grad_norm": 0.1259811371564865,
"learning_rate": 8.303399820398672e-06,
"loss": 0.1118,
"step": 1090
},
{
"epoch": 1.7836138714256742,
"grad_norm": 0.07995796203613281,
"learning_rate": 7.211097270349066e-06,
"loss": 0.1005,
"step": 1100
},
{
"epoch": 1.7998377611032246,
"grad_norm": 0.0811864510178566,
"learning_rate": 6.1931173103834115e-06,
"loss": 0.115,
"step": 1110
},
{
"epoch": 1.7998377611032246,
"eval_loss": 0.10901561379432678,
"eval_runtime": 54.5743,
"eval_samples_per_second": 4.764,
"eval_steps_per_second": 4.764,
"step": 1110
},
{
"epoch": 1.8160616507807745,
"grad_norm": 0.10539617389440536,
"learning_rate": 5.250275327957032e-06,
"loss": 0.1129,
"step": 1120
},
{
"epoch": 1.832285540458325,
"grad_norm": 0.15508656203746796,
"learning_rate": 4.383326526074916e-06,
"loss": 0.1258,
"step": 1130
},
{
"epoch": 1.848509430135875,
"grad_norm": 0.08507981151342392,
"learning_rate": 3.592965318384944e-06,
"loss": 0.1123,
"step": 1140
},
{
"epoch": 1.848509430135875,
"eval_loss": 0.1087607592344284,
"eval_runtime": 54.908,
"eval_samples_per_second": 4.735,
"eval_steps_per_second": 4.735,
"step": 1140
},
{
"epoch": 1.8647333198134253,
"grad_norm": 0.08362692594528198,
"learning_rate": 2.8798247729623806e-06,
"loss": 0.1164,
"step": 1150
},
{
"epoch": 1.8809572094909754,
"grad_norm": 0.08825893700122833,
"learning_rate": 2.2444761052313856e-06,
"loss": 0.1223,
"step": 1160
},
{
"epoch": 1.8971810991685256,
"grad_norm": 0.07475200295448303,
"learning_rate": 1.6874282204295766e-06,
"loss": 0.1101,
"step": 1170
},
{
"epoch": 1.8971810991685256,
"eval_loss": 0.108616404235363,
"eval_runtime": 55.1832,
"eval_samples_per_second": 4.712,
"eval_steps_per_second": 4.712,
"step": 1170
},
{
"epoch": 1.9134049888460758,
"grad_norm": 0.07395757734775543,
"learning_rate": 1.209127305982205e-06,
"loss": 0.1217,
"step": 1180
},
{
"epoch": 1.929628878523626,
"grad_norm": 0.1036606878042221,
"learning_rate": 8.099564741123166e-07,
"loss": 0.1167,
"step": 1190
},
{
"epoch": 1.9458527682011764,
"grad_norm": 0.07940636575222015,
"learning_rate": 4.902354549733978e-07,
"loss": 0.1111,
"step": 1200
},
{
"epoch": 1.9458527682011764,
"eval_loss": 0.10869967937469482,
"eval_runtime": 54.8601,
"eval_samples_per_second": 4.739,
"eval_steps_per_second": 4.739,
"step": 1200
}
],
"logging_steps": 10,
"max_steps": 1234,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.675362213715753e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}