{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0046453183320519995,
"eval_steps": 2000,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 2.3226591660259998e-05,
"grad_norm": 0.43654176592826843,
"learning_rate": 0.0009999930320225019,
"loss": 1.0703,
"step": 10
},
{
"epoch": 4.6453183320519996e-05,
"grad_norm": 0.29478368163108826,
"learning_rate": 0.0009999852898252817,
"loss": 1.1726,
"step": 20
},
{
"epoch": 6.967977498078e-05,
"grad_norm": 0.30410653352737427,
"learning_rate": 0.0009999775476280618,
"loss": 1.106,
"step": 30
},
{
"epoch": 9.290636664103999e-05,
"grad_norm": 0.3648824989795685,
"learning_rate": 0.0009999698054308417,
"loss": 1.1939,
"step": 40
},
{
"epoch": 0.00011613295830129999,
"grad_norm": 0.430895060300827,
"learning_rate": 0.0009999620632336215,
"loss": 1.2002,
"step": 50
},
{
"epoch": 0.00013935954996156,
"grad_norm": 0.3720713257789612,
"learning_rate": 0.0009999543210364014,
"loss": 1.0248,
"step": 60
},
{
"epoch": 0.00016258614162182,
"grad_norm": 0.354899138212204,
"learning_rate": 0.0009999465788391815,
"loss": 1.1271,
"step": 70
},
{
"epoch": 0.00018581273328207998,
"grad_norm": 0.35504820942878723,
"learning_rate": 0.0009999388366419614,
"loss": 1.1396,
"step": 80
},
{
"epoch": 0.00020903932494234,
"grad_norm": 0.4521724581718445,
"learning_rate": 0.0009999310944447412,
"loss": 1.1032,
"step": 90
},
{
"epoch": 0.00023226591660259997,
"grad_norm": 0.2742864787578583,
"learning_rate": 0.000999923352247521,
"loss": 1.1479,
"step": 100
},
{
"epoch": 0.00025549250826286,
"grad_norm": 0.41575589776039124,
"learning_rate": 0.0009999156100503012,
"loss": 1.1837,
"step": 110
},
{
"epoch": 0.00027871909992312,
"grad_norm": 0.27715566754341125,
"learning_rate": 0.000999907867853081,
"loss": 1.1597,
"step": 120
},
{
"epoch": 0.00030194569158338,
"grad_norm": 0.4537408649921417,
"learning_rate": 0.000999900125655861,
"loss": 1.1183,
"step": 130
},
{
"epoch": 0.00032517228324364,
"grad_norm": 0.2952319383621216,
"learning_rate": 0.0009998923834586408,
"loss": 1.1501,
"step": 140
},
{
"epoch": 0.00034839887490389996,
"grad_norm": 0.38295623660087585,
"learning_rate": 0.0009998846412614208,
"loss": 1.1381,
"step": 150
},
{
"epoch": 0.00037162546656415997,
"grad_norm": 0.3845287561416626,
"learning_rate": 0.0009998768990642007,
"loss": 1.0968,
"step": 160
},
{
"epoch": 0.00039485205822442,
"grad_norm": 0.25896570086479187,
"learning_rate": 0.0009998691568669806,
"loss": 1.0684,
"step": 170
},
{
"epoch": 0.00041807864988468,
"grad_norm": 0.2440153807401657,
"learning_rate": 0.0009998614146697604,
"loss": 1.1281,
"step": 180
},
{
"epoch": 0.00044130524154494,
"grad_norm": 0.3060740530490875,
"learning_rate": 0.0009998536724725403,
"loss": 1.1285,
"step": 190
},
{
"epoch": 0.00046453183320519995,
"grad_norm": 0.2703372538089752,
"learning_rate": 0.0009998459302753204,
"loss": 1.2085,
"step": 200
},
{
"epoch": 0.00048775842486545995,
"grad_norm": 0.3993639647960663,
"learning_rate": 0.0009998381880781003,
"loss": 1.2365,
"step": 210
},
{
"epoch": 0.00051098501652572,
"grad_norm": 0.41694164276123047,
"learning_rate": 0.0009998304458808801,
"loss": 1.161,
"step": 220
},
{
"epoch": 0.0005342116081859799,
"grad_norm": 0.2720717191696167,
"learning_rate": 0.00099982270368366,
"loss": 1.0553,
"step": 230
},
{
"epoch": 0.00055743819984624,
"grad_norm": 0.3238905072212219,
"learning_rate": 0.0009998149614864399,
"loss": 1.0825,
"step": 240
},
{
"epoch": 0.0005806647915064999,
"grad_norm": 0.39301878213882446,
"learning_rate": 0.00099980721928922,
"loss": 1.1421,
"step": 250
},
{
"epoch": 0.00060389138316676,
"grad_norm": 0.25302958488464355,
"learning_rate": 0.0009997994770919998,
"loss": 1.0533,
"step": 260
},
{
"epoch": 0.00062711797482702,
"grad_norm": 0.29384830594062805,
"learning_rate": 0.0009997917348947797,
"loss": 1.1011,
"step": 270
},
{
"epoch": 0.00065034456648728,
"grad_norm": 0.35217076539993286,
"learning_rate": 0.0009997839926975595,
"loss": 1.0289,
"step": 280
},
{
"epoch": 0.00067357115814754,
"grad_norm": 0.3412124216556549,
"learning_rate": 0.0009997762505003394,
"loss": 1.0974,
"step": 290
},
{
"epoch": 0.0006967977498077999,
"grad_norm": 0.2988780736923218,
"learning_rate": 0.0009997685083031195,
"loss": 1.1618,
"step": 300
},
{
"epoch": 0.00072002434146806,
"grad_norm": 0.43221724033355713,
"learning_rate": 0.0009997607661058994,
"loss": 1.1023,
"step": 310
},
{
"epoch": 0.0007432509331283199,
"grad_norm": 0.2644006013870239,
"learning_rate": 0.0009997530239086792,
"loss": 1.1548,
"step": 320
},
{
"epoch": 0.00076647752478858,
"grad_norm": 0.2950528860092163,
"learning_rate": 0.000999745281711459,
"loss": 1.1203,
"step": 330
},
{
"epoch": 0.00078970411644884,
"grad_norm": 0.20538517832756042,
"learning_rate": 0.0009997375395142392,
"loss": 1.0904,
"step": 340
},
{
"epoch": 0.0008129307081090999,
"grad_norm": 0.3531719446182251,
"learning_rate": 0.000999729797317019,
"loss": 1.0951,
"step": 350
},
{
"epoch": 0.00083615729976936,
"grad_norm": 0.3661258816719055,
"learning_rate": 0.000999722055119799,
"loss": 1.0885,
"step": 360
},
{
"epoch": 0.0008593838914296199,
"grad_norm": 0.4355231523513794,
"learning_rate": 0.0009997143129225788,
"loss": 1.1301,
"step": 370
},
{
"epoch": 0.00088261048308988,
"grad_norm": 0.3286990225315094,
"learning_rate": 0.0009997065707253588,
"loss": 1.0705,
"step": 380
},
{
"epoch": 0.0009058370747501399,
"grad_norm": 0.31140822172164917,
"learning_rate": 0.0009996988285281387,
"loss": 1.1873,
"step": 390
},
{
"epoch": 0.0009290636664103999,
"grad_norm": 0.2582302689552307,
"learning_rate": 0.0009996910863309186,
"loss": 1.1567,
"step": 400
},
{
"epoch": 0.00095229025807066,
"grad_norm": 0.36799147725105286,
"learning_rate": 0.0009996833441336984,
"loss": 1.2273,
"step": 410
},
{
"epoch": 0.0009755168497309199,
"grad_norm": 0.28618550300598145,
"learning_rate": 0.0009996756019364785,
"loss": 1.0851,
"step": 420
},
{
"epoch": 0.00099874344139118,
"grad_norm": 0.3006650507450104,
"learning_rate": 0.0009996678597392584,
"loss": 1.0341,
"step": 430
},
{
"epoch": 0.00102197003305144,
"grad_norm": 0.3651888072490692,
"learning_rate": 0.0009996601175420383,
"loss": 1.0212,
"step": 440
},
{
"epoch": 0.0010451966247116999,
"grad_norm": 0.32596904039382935,
"learning_rate": 0.0009996523753448181,
"loss": 1.0919,
"step": 450
},
{
"epoch": 0.0010684232163719598,
"grad_norm": 0.30658453702926636,
"learning_rate": 0.000999644633147598,
"loss": 1.0934,
"step": 460
},
{
"epoch": 0.00109164980803222,
"grad_norm": 0.49543142318725586,
"learning_rate": 0.0009996368909503779,
"loss": 1.1603,
"step": 470
},
{
"epoch": 0.00111487639969248,
"grad_norm": 0.24394716322422028,
"learning_rate": 0.000999629148753158,
"loss": 1.1455,
"step": 480
},
{
"epoch": 0.00113810299135274,
"grad_norm": 0.38373667001724243,
"learning_rate": 0.0009996214065559378,
"loss": 1.1498,
"step": 490
},
{
"epoch": 0.0011613295830129999,
"grad_norm": 0.5020566582679749,
"learning_rate": 0.0009996136643587177,
"loss": 1.076,
"step": 500
},
{
"epoch": 0.0011845561746732598,
"grad_norm": 0.3413016200065613,
"learning_rate": 0.0009996059221614975,
"loss": 1.1747,
"step": 510
},
{
"epoch": 0.00120778276633352,
"grad_norm": 0.3450530171394348,
"learning_rate": 0.0009995981799642774,
"loss": 1.1441,
"step": 520
},
{
"epoch": 0.00123100935799378,
"grad_norm": 0.3582036793231964,
"learning_rate": 0.0009995904377670575,
"loss": 1.1679,
"step": 530
},
{
"epoch": 0.00125423594965404,
"grad_norm": 0.30296868085861206,
"learning_rate": 0.0009995826955698373,
"loss": 1.0446,
"step": 540
},
{
"epoch": 0.0012774625413142999,
"grad_norm": 0.3772015869617462,
"learning_rate": 0.0009995749533726172,
"loss": 1.1239,
"step": 550
},
{
"epoch": 0.00130068913297456,
"grad_norm": 0.3441556692123413,
"learning_rate": 0.000999567211175397,
"loss": 1.112,
"step": 560
},
{
"epoch": 0.00132391572463482,
"grad_norm": 0.3211918771266937,
"learning_rate": 0.0009995594689781772,
"loss": 1.1344,
"step": 570
},
{
"epoch": 0.00134714231629508,
"grad_norm": 0.2808244824409485,
"learning_rate": 0.000999551726780957,
"loss": 1.1398,
"step": 580
},
{
"epoch": 0.0013703689079553399,
"grad_norm": 0.32571667432785034,
"learning_rate": 0.000999543984583737,
"loss": 1.1455,
"step": 590
},
{
"epoch": 0.0013935954996155998,
"grad_norm": 0.3554767668247223,
"learning_rate": 0.0009995362423865168,
"loss": 0.991,
"step": 600
},
{
"epoch": 0.00141682209127586,
"grad_norm": 0.253456711769104,
"learning_rate": 0.0009995285001892968,
"loss": 1.1686,
"step": 610
},
{
"epoch": 0.00144004868293612,
"grad_norm": 0.31393057107925415,
"learning_rate": 0.0009995207579920767,
"loss": 1.1034,
"step": 620
},
{
"epoch": 0.00146327527459638,
"grad_norm": 0.3797680735588074,
"learning_rate": 0.0009995130157948566,
"loss": 1.1224,
"step": 630
},
{
"epoch": 0.0014865018662566399,
"grad_norm": 0.3667146563529968,
"learning_rate": 0.0009995052735976364,
"loss": 1.1484,
"step": 640
},
{
"epoch": 0.0015097284579168998,
"grad_norm": 0.28348517417907715,
"learning_rate": 0.0009994975314004165,
"loss": 1.2004,
"step": 650
},
{
"epoch": 0.00153295504957716,
"grad_norm": 0.4176248610019684,
"learning_rate": 0.0009994897892031964,
"loss": 1.1415,
"step": 660
},
{
"epoch": 0.00155618164123742,
"grad_norm": 0.3170236647129059,
"learning_rate": 0.0009994820470059763,
"loss": 1.0853,
"step": 670
},
{
"epoch": 0.00157940823289768,
"grad_norm": 0.31185317039489746,
"learning_rate": 0.0009994743048087561,
"loss": 1.1353,
"step": 680
},
{
"epoch": 0.0016026348245579399,
"grad_norm": 0.33214762806892395,
"learning_rate": 0.000999466562611536,
"loss": 1.1504,
"step": 690
},
{
"epoch": 0.0016258614162181998,
"grad_norm": 0.3761586844921112,
"learning_rate": 0.000999458820414316,
"loss": 1.0549,
"step": 700
},
{
"epoch": 0.00164908800787846,
"grad_norm": 0.2806662619113922,
"learning_rate": 0.000999451078217096,
"loss": 1.1859,
"step": 710
},
{
"epoch": 0.00167231459953872,
"grad_norm": 0.39696329832077026,
"learning_rate": 0.0009994433360198758,
"loss": 1.1716,
"step": 720
},
{
"epoch": 0.0016955411911989799,
"grad_norm": 0.28009161353111267,
"learning_rate": 0.0009994355938226557,
"loss": 1.1932,
"step": 730
},
{
"epoch": 0.0017187677828592398,
"grad_norm": 0.2747149169445038,
"learning_rate": 0.0009994278516254355,
"loss": 1.0847,
"step": 740
},
{
"epoch": 0.0017419943745194998,
"grad_norm": 0.30023542046546936,
"learning_rate": 0.0009994201094282154,
"loss": 1.0696,
"step": 750
},
{
"epoch": 0.00176522096617976,
"grad_norm": 0.3453909158706665,
"learning_rate": 0.0009994123672309955,
"loss": 1.0967,
"step": 760
},
{
"epoch": 0.00178844755784002,
"grad_norm": 0.49272191524505615,
"learning_rate": 0.0009994046250337753,
"loss": 1.0573,
"step": 770
},
{
"epoch": 0.0018116741495002799,
"grad_norm": 0.2652382254600525,
"learning_rate": 0.0009993968828365552,
"loss": 1.1404,
"step": 780
},
{
"epoch": 0.0018349007411605398,
"grad_norm": 0.25675663352012634,
"learning_rate": 0.000999389140639335,
"loss": 1.0459,
"step": 790
},
{
"epoch": 0.0018581273328207998,
"grad_norm": 0.3685920834541321,
"learning_rate": 0.0009993813984421152,
"loss": 1.0117,
"step": 800
},
{
"epoch": 0.00188135392448106,
"grad_norm": 0.3216955363750458,
"learning_rate": 0.000999373656244895,
"loss": 1.1672,
"step": 810
},
{
"epoch": 0.00190458051614132,
"grad_norm": 0.4081834852695465,
"learning_rate": 0.000999365914047675,
"loss": 1.1555,
"step": 820
},
{
"epoch": 0.0019278071078015799,
"grad_norm": 0.3144775927066803,
"learning_rate": 0.0009993581718504548,
"loss": 1.2002,
"step": 830
},
{
"epoch": 0.0019510336994618398,
"grad_norm": 0.3642594814300537,
"learning_rate": 0.0009993504296532348,
"loss": 1.0547,
"step": 840
},
{
"epoch": 0.0019742602911220998,
"grad_norm": 0.3856127858161926,
"learning_rate": 0.0009993426874560147,
"loss": 1.2028,
"step": 850
},
{
"epoch": 0.00199748688278236,
"grad_norm": 0.41429170966148376,
"learning_rate": 0.0009993349452587946,
"loss": 1.0857,
"step": 860
},
{
"epoch": 0.0020207134744426197,
"grad_norm": 0.4278993606567383,
"learning_rate": 0.0009993272030615744,
"loss": 1.0574,
"step": 870
},
{
"epoch": 0.00204394006610288,
"grad_norm": 0.26868101954460144,
"learning_rate": 0.0009993194608643545,
"loss": 1.0538,
"step": 880
},
{
"epoch": 0.00206716665776314,
"grad_norm": 0.8726014494895935,
"learning_rate": 0.0009993117186671344,
"loss": 1.2263,
"step": 890
},
{
"epoch": 0.0020903932494233998,
"grad_norm": 0.39568719267845154,
"learning_rate": 0.0009993039764699143,
"loss": 1.1606,
"step": 900
},
{
"epoch": 0.00211361984108366,
"grad_norm": 0.3933831751346588,
"learning_rate": 0.0009992962342726941,
"loss": 1.1263,
"step": 910
},
{
"epoch": 0.0021368464327439197,
"grad_norm": 0.4326261579990387,
"learning_rate": 0.000999288492075474,
"loss": 1.0729,
"step": 920
},
{
"epoch": 0.00216007302440418,
"grad_norm": 0.3416406810283661,
"learning_rate": 0.000999280749878254,
"loss": 1.1538,
"step": 930
},
{
"epoch": 0.00218329961606444,
"grad_norm": 0.338379830121994,
"learning_rate": 0.000999273007681034,
"loss": 1.0347,
"step": 940
},
{
"epoch": 0.0022065262077246997,
"grad_norm": 0.34776318073272705,
"learning_rate": 0.0009992652654838138,
"loss": 1.1322,
"step": 950
},
{
"epoch": 0.00222975279938496,
"grad_norm": 0.23187178373336792,
"learning_rate": 0.0009992575232865937,
"loss": 1.0574,
"step": 960
},
{
"epoch": 0.0022529793910452196,
"grad_norm": 0.3015563189983368,
"learning_rate": 0.0009992497810893735,
"loss": 1.0911,
"step": 970
},
{
"epoch": 0.00227620598270548,
"grad_norm": 0.31411874294281006,
"learning_rate": 0.0009992420388921534,
"loss": 1.1008,
"step": 980
},
{
"epoch": 0.00229943257436574,
"grad_norm": 0.4988269805908203,
"learning_rate": 0.0009992342966949335,
"loss": 1.1292,
"step": 990
},
{
"epoch": 0.0023226591660259997,
"grad_norm": 0.3398004472255707,
"learning_rate": 0.0009992265544977133,
"loss": 1.1665,
"step": 1000
},
{
"epoch": 0.00234588575768626,
"grad_norm": 0.32879185676574707,
"learning_rate": 0.0009992188123004932,
"loss": 1.1131,
"step": 1010
},
{
"epoch": 0.0023691123493465196,
"grad_norm": 0.40583041310310364,
"learning_rate": 0.000999211070103273,
"loss": 1.0571,
"step": 1020
},
{
"epoch": 0.00239233894100678,
"grad_norm": 0.3514922559261322,
"learning_rate": 0.0009992033279060532,
"loss": 1.1166,
"step": 1030
},
{
"epoch": 0.00241556553266704,
"grad_norm": 1.3851335048675537,
"learning_rate": 0.000999195585708833,
"loss": 1.0532,
"step": 1040
},
{
"epoch": 0.0024387921243272997,
"grad_norm": 0.5054768919944763,
"learning_rate": 0.000999187843511613,
"loss": 1.16,
"step": 1050
},
{
"epoch": 0.00246201871598756,
"grad_norm": 0.37074124813079834,
"learning_rate": 0.0009991801013143928,
"loss": 1.2028,
"step": 1060
},
{
"epoch": 0.0024852453076478196,
"grad_norm": 0.3337225615978241,
"learning_rate": 0.0009991723591171728,
"loss": 1.1109,
"step": 1070
},
{
"epoch": 0.00250847189930808,
"grad_norm": 0.283372163772583,
"learning_rate": 0.0009991646169199527,
"loss": 1.063,
"step": 1080
},
{
"epoch": 0.00253169849096834,
"grad_norm": 0.3113659620285034,
"learning_rate": 0.0009991568747227326,
"loss": 1.1027,
"step": 1090
},
{
"epoch": 0.0025549250826285997,
"grad_norm": 0.43556565046310425,
"learning_rate": 0.0009991491325255124,
"loss": 1.1181,
"step": 1100
},
{
"epoch": 0.00257815167428886,
"grad_norm": 0.3736826479434967,
"learning_rate": 0.0009991413903282925,
"loss": 1.1035,
"step": 1110
},
{
"epoch": 0.00260137826594912,
"grad_norm": 0.3376559913158417,
"learning_rate": 0.0009991336481310724,
"loss": 1.0149,
"step": 1120
},
{
"epoch": 0.0026246048576093798,
"grad_norm": 0.3545368015766144,
"learning_rate": 0.0009991259059338523,
"loss": 1.1472,
"step": 1130
},
{
"epoch": 0.00264783144926964,
"grad_norm": 0.2400045394897461,
"learning_rate": 0.0009991181637366321,
"loss": 1.1423,
"step": 1140
},
{
"epoch": 0.0026710580409298997,
"grad_norm": 0.37132346630096436,
"learning_rate": 0.0009991104215394122,
"loss": 1.1802,
"step": 1150
},
{
"epoch": 0.00269428463259016,
"grad_norm": 0.26770955324172974,
"learning_rate": 0.000999102679342192,
"loss": 1.0859,
"step": 1160
},
{
"epoch": 0.00271751122425042,
"grad_norm": 0.3567134439945221,
"learning_rate": 0.000999094937144972,
"loss": 1.1699,
"step": 1170
},
{
"epoch": 0.0027407378159106798,
"grad_norm": 0.3370940387248993,
"learning_rate": 0.0009990871949477518,
"loss": 1.2679,
"step": 1180
},
{
"epoch": 0.00276396440757094,
"grad_norm": 0.3533010184764862,
"learning_rate": 0.0009990794527505317,
"loss": 1.1444,
"step": 1190
},
{
"epoch": 0.0027871909992311997,
"grad_norm": 0.227728933095932,
"learning_rate": 0.0009990717105533115,
"loss": 1.1105,
"step": 1200
},
{
"epoch": 0.00281041759089146,
"grad_norm": 0.39945659041404724,
"learning_rate": 0.0009990639683560916,
"loss": 1.0122,
"step": 1210
},
{
"epoch": 0.00283364418255172,
"grad_norm": 0.38961905241012573,
"learning_rate": 0.0009990562261588715,
"loss": 1.1677,
"step": 1220
},
{
"epoch": 0.0028568707742119798,
"grad_norm": 0.35965076088905334,
"learning_rate": 0.0009990484839616513,
"loss": 1.2045,
"step": 1230
},
{
"epoch": 0.00288009736587224,
"grad_norm": 0.3876691460609436,
"learning_rate": 0.0009990407417644312,
"loss": 1.1577,
"step": 1240
},
{
"epoch": 0.0029033239575324997,
"grad_norm": 0.3059842586517334,
"learning_rate": 0.000999032999567211,
"loss": 1.1294,
"step": 1250
},
{
"epoch": 0.00292655054919276,
"grad_norm": 0.31481969356536865,
"learning_rate": 0.0009990252573699912,
"loss": 1.1202,
"step": 1260
},
{
"epoch": 0.00294977714085302,
"grad_norm": 0.3077446222305298,
"learning_rate": 0.000999017515172771,
"loss": 1.0893,
"step": 1270
},
{
"epoch": 0.0029730037325132797,
"grad_norm": 0.30285683274269104,
"learning_rate": 0.000999009772975551,
"loss": 1.0844,
"step": 1280
},
{
"epoch": 0.00299623032417354,
"grad_norm": 0.32145956158638,
"learning_rate": 0.0009990020307783308,
"loss": 1.1524,
"step": 1290
},
{
"epoch": 0.0030194569158337996,
"grad_norm": 0.3908081352710724,
"learning_rate": 0.0009989942885811108,
"loss": 1.104,
"step": 1300
},
{
"epoch": 0.00304268350749406,
"grad_norm": 0.32902881503105164,
"learning_rate": 0.0009989865463838907,
"loss": 1.1161,
"step": 1310
},
{
"epoch": 0.00306591009915432,
"grad_norm": 0.3777260184288025,
"learning_rate": 0.0009989788041866706,
"loss": 1.1623,
"step": 1320
},
{
"epoch": 0.0030891366908145797,
"grad_norm": 0.4204845130443573,
"learning_rate": 0.0009989710619894504,
"loss": 1.1284,
"step": 1330
},
{
"epoch": 0.00311236328247484,
"grad_norm": 0.3189554810523987,
"learning_rate": 0.0009989633197922305,
"loss": 1.104,
"step": 1340
},
{
"epoch": 0.0031355898741350996,
"grad_norm": 0.30896514654159546,
"learning_rate": 0.0009989555775950104,
"loss": 1.1221,
"step": 1350
},
{
"epoch": 0.00315881646579536,
"grad_norm": 1.2486257553100586,
"learning_rate": 0.0009989478353977903,
"loss": 1.2578,
"step": 1360
},
{
"epoch": 0.00318204305745562,
"grad_norm": 0.433830201625824,
"learning_rate": 0.0009989400932005701,
"loss": 1.101,
"step": 1370
},
{
"epoch": 0.0032052696491158797,
"grad_norm": 0.3873724341392517,
"learning_rate": 0.0009989323510033502,
"loss": 1.1509,
"step": 1380
},
{
"epoch": 0.00322849624077614,
"grad_norm": 0.238771453499794,
"learning_rate": 0.00099892460880613,
"loss": 1.171,
"step": 1390
},
{
"epoch": 0.0032517228324363996,
"grad_norm": 0.3480624258518219,
"learning_rate": 0.00099891686660891,
"loss": 1.2122,
"step": 1400
},
{
"epoch": 0.00327494942409666,
"grad_norm": 0.35760608315467834,
"learning_rate": 0.0009989091244116898,
"loss": 1.0479,
"step": 1410
},
{
"epoch": 0.00329817601575692,
"grad_norm": 0.3133438527584076,
"learning_rate": 0.0009989013822144697,
"loss": 1.1176,
"step": 1420
},
{
"epoch": 0.0033214026074171797,
"grad_norm": 0.2956129014492035,
"learning_rate": 0.0009988936400172495,
"loss": 1.12,
"step": 1430
},
{
"epoch": 0.00334462919907744,
"grad_norm": 0.2697290778160095,
"learning_rate": 0.0009988858978200296,
"loss": 1.0247,
"step": 1440
},
{
"epoch": 0.0033678557907376996,
"grad_norm": 0.34495481848716736,
"learning_rate": 0.0009988781556228095,
"loss": 1.0775,
"step": 1450
},
{
"epoch": 0.0033910823823979598,
"grad_norm": 0.29800111055374146,
"learning_rate": 0.0009988704134255893,
"loss": 1.1489,
"step": 1460
},
{
"epoch": 0.00341430897405822,
"grad_norm": 0.29650014638900757,
"learning_rate": 0.0009988626712283692,
"loss": 1.0565,
"step": 1470
},
{
"epoch": 0.0034375355657184797,
"grad_norm": 0.35248780250549316,
"learning_rate": 0.000998854929031149,
"loss": 1.1121,
"step": 1480
},
{
"epoch": 0.00346076215737874,
"grad_norm": 0.2716731131076813,
"learning_rate": 0.0009988471868339292,
"loss": 1.0923,
"step": 1490
},
{
"epoch": 0.0034839887490389996,
"grad_norm": 0.4371800422668457,
"learning_rate": 0.000998839444636709,
"loss": 1.0155,
"step": 1500
},
{
"epoch": 0.0035072153406992598,
"grad_norm": 0.2633199691772461,
"learning_rate": 0.0009988317024394889,
"loss": 1.1037,
"step": 1510
},
{
"epoch": 0.00353044193235952,
"grad_norm": 0.2944166362285614,
"learning_rate": 0.0009988239602422688,
"loss": 1.0995,
"step": 1520
},
{
"epoch": 0.0035536685240197797,
"grad_norm": 0.2786024212837219,
"learning_rate": 0.0009988162180450488,
"loss": 1.0641,
"step": 1530
},
{
"epoch": 0.00357689511568004,
"grad_norm": 0.31116756796836853,
"learning_rate": 0.0009988084758478287,
"loss": 1.1015,
"step": 1540
},
{
"epoch": 0.0036001217073402996,
"grad_norm": 0.31829699873924255,
"learning_rate": 0.0009988007336506086,
"loss": 1.0519,
"step": 1550
},
{
"epoch": 0.0036233482990005597,
"grad_norm": 0.4150811433792114,
"learning_rate": 0.0009987929914533884,
"loss": 1.1509,
"step": 1560
},
{
"epoch": 0.00364657489066082,
"grad_norm": 0.2690746784210205,
"learning_rate": 0.0009987852492561685,
"loss": 1.0517,
"step": 1570
},
{
"epoch": 0.0036698014823210797,
"grad_norm": 0.3126815855503082,
"learning_rate": 0.0009987775070589484,
"loss": 1.1398,
"step": 1580
},
{
"epoch": 0.00369302807398134,
"grad_norm": 0.34572452306747437,
"learning_rate": 0.0009987697648617283,
"loss": 1.0342,
"step": 1590
},
{
"epoch": 0.0037162546656415996,
"grad_norm": 0.30171483755111694,
"learning_rate": 0.0009987620226645081,
"loss": 1.0517,
"step": 1600
},
{
"epoch": 0.0037394812573018597,
"grad_norm": 0.2483634054660797,
"learning_rate": 0.0009987542804672882,
"loss": 1.1146,
"step": 1610
},
{
"epoch": 0.00376270784896212,
"grad_norm": 0.41606566309928894,
"learning_rate": 0.000998746538270068,
"loss": 1.0997,
"step": 1620
},
{
"epoch": 0.0037859344406223796,
"grad_norm": 0.3014843761920929,
"learning_rate": 0.000998738796072848,
"loss": 1.0975,
"step": 1630
},
{
"epoch": 0.00380916103228264,
"grad_norm": 0.31974515318870544,
"learning_rate": 0.0009987310538756278,
"loss": 1.0963,
"step": 1640
},
{
"epoch": 0.0038323876239428996,
"grad_norm": 0.3185972273349762,
"learning_rate": 0.0009987233116784077,
"loss": 1.1598,
"step": 1650
},
{
"epoch": 0.0038556142156031597,
"grad_norm": 0.3430216908454895,
"learning_rate": 0.0009987155694811877,
"loss": 0.9476,
"step": 1660
},
{
"epoch": 0.00387884080726342,
"grad_norm": 0.4456688165664673,
"learning_rate": 0.0009987078272839676,
"loss": 1.1319,
"step": 1670
},
{
"epoch": 0.0039020673989236796,
"grad_norm": 0.4243941605091095,
"learning_rate": 0.0009987000850867475,
"loss": 1.1765,
"step": 1680
},
{
"epoch": 0.003925293990583939,
"grad_norm": 0.22148986160755157,
"learning_rate": 0.0009986923428895273,
"loss": 1.1305,
"step": 1690
},
{
"epoch": 0.0039485205822441995,
"grad_norm": 0.44649383425712585,
"learning_rate": 0.0009986846006923072,
"loss": 1.1282,
"step": 1700
},
{
"epoch": 0.00397174717390446,
"grad_norm": 0.35965171456336975,
"learning_rate": 0.000998676858495087,
"loss": 1.0997,
"step": 1710
},
{
"epoch": 0.00399497376556472,
"grad_norm": 0.4147953987121582,
"learning_rate": 0.0009986691162978672,
"loss": 1.0682,
"step": 1720
},
{
"epoch": 0.00401820035722498,
"grad_norm": 0.47538864612579346,
"learning_rate": 0.000998661374100647,
"loss": 1.1625,
"step": 1730
},
{
"epoch": 0.004041426948885239,
"grad_norm": 0.3181823194026947,
"learning_rate": 0.0009986536319034269,
"loss": 1.1683,
"step": 1740
},
{
"epoch": 0.0040646535405454995,
"grad_norm": 0.32929712533950806,
"learning_rate": 0.0009986458897062068,
"loss": 1.1306,
"step": 1750
},
{
"epoch": 0.00408788013220576,
"grad_norm": 0.34377196431159973,
"learning_rate": 0.0009986381475089868,
"loss": 1.1267,
"step": 1760
},
{
"epoch": 0.00411110672386602,
"grad_norm": 0.3156042695045471,
"learning_rate": 0.0009986304053117667,
"loss": 1.0523,
"step": 1770
},
{
"epoch": 0.00413433331552628,
"grad_norm": 0.35088011622428894,
"learning_rate": 0.0009986226631145466,
"loss": 1.0075,
"step": 1780
},
{
"epoch": 0.004157559907186539,
"grad_norm": 0.3740438222885132,
"learning_rate": 0.0009986149209173264,
"loss": 1.1788,
"step": 1790
},
{
"epoch": 0.0041807864988467995,
"grad_norm": 0.28393882513046265,
"learning_rate": 0.0009986071787201065,
"loss": 1.0374,
"step": 1800
},
{
"epoch": 0.00420401309050706,
"grad_norm": 0.2916342318058014,
"learning_rate": 0.0009985994365228864,
"loss": 1.0783,
"step": 1810
},
{
"epoch": 0.00422723968216732,
"grad_norm": 0.3398910462856293,
"learning_rate": 0.0009985916943256663,
"loss": 1.129,
"step": 1820
},
{
"epoch": 0.00425046627382758,
"grad_norm": 0.3244156837463379,
"learning_rate": 0.0009985839521284461,
"loss": 1.1812,
"step": 1830
},
{
"epoch": 0.004273692865487839,
"grad_norm": 0.5498040318489075,
"learning_rate": 0.0009985762099312262,
"loss": 1.1812,
"step": 1840
},
{
"epoch": 0.0042969194571480995,
"grad_norm": 0.27574270963668823,
"learning_rate": 0.000998568467734006,
"loss": 1.1414,
"step": 1850
},
{
"epoch": 0.00432014604880836,
"grad_norm": 0.3610564172267914,
"learning_rate": 0.000998560725536786,
"loss": 1.0763,
"step": 1860
},
{
"epoch": 0.00434337264046862,
"grad_norm": 0.33828043937683105,
"learning_rate": 0.0009985529833395658,
"loss": 1.0169,
"step": 1870
},
{
"epoch": 0.00436659923212888,
"grad_norm": 0.22078180313110352,
"learning_rate": 0.0009985452411423457,
"loss": 1.0513,
"step": 1880
},
{
"epoch": 0.004389825823789139,
"grad_norm": 0.4355666935443878,
"learning_rate": 0.0009985374989451257,
"loss": 1.1245,
"step": 1890
},
{
"epoch": 0.0044130524154493995,
"grad_norm": 0.3071712851524353,
"learning_rate": 0.0009985297567479056,
"loss": 1.1669,
"step": 1900
},
{
"epoch": 0.00443627900710966,
"grad_norm": 0.3043074905872345,
"learning_rate": 0.0009985220145506855,
"loss": 1.1917,
"step": 1910
},
{
"epoch": 0.00445950559876992,
"grad_norm": 0.33084383606910706,
"learning_rate": 0.0009985142723534653,
"loss": 1.0819,
"step": 1920
},
{
"epoch": 0.00448273219043018,
"grad_norm": 0.32064658403396606,
"learning_rate": 0.0009985065301562452,
"loss": 1.1362,
"step": 1930
},
{
"epoch": 0.004505958782090439,
"grad_norm": 0.34291279315948486,
"learning_rate": 0.0009984987879590253,
"loss": 1.0888,
"step": 1940
},
{
"epoch": 0.0045291853737506995,
"grad_norm": 0.4338567852973938,
"learning_rate": 0.0009984910457618052,
"loss": 1.0783,
"step": 1950
},
{
"epoch": 0.00455241196541096,
"grad_norm": 0.33047792315483093,
"learning_rate": 0.000998483303564585,
"loss": 1.0977,
"step": 1960
},
{
"epoch": 0.00457563855707122,
"grad_norm": 0.33728134632110596,
"learning_rate": 0.0009984755613673649,
"loss": 1.137,
"step": 1970
},
{
"epoch": 0.00459886514873148,
"grad_norm": 0.27301332354545593,
"learning_rate": 0.0009984678191701448,
"loss": 1.1413,
"step": 1980
},
{
"epoch": 0.004622091740391739,
"grad_norm": 0.2804515063762665,
"learning_rate": 0.0009984600769729248,
"loss": 1.0865,
"step": 1990
},
{
"epoch": 0.0046453183320519995,
"grad_norm": 0.33448469638824463,
"learning_rate": 0.0009984523347757047,
"loss": 1.1526,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 1291623,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.5412177861554176e+16,
"train_batch_size": 7,
"trial_name": null,
"trial_params": null
}