c9 / checkpoint-1600 /trainer_state.json
thetmon's picture
Upload merged Qwen3-4B-Instruct-2507 model (auto-generated README)
e9eee15 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.931896354047237,
"eval_steps": 30,
"global_step": 1600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018344416418252695,
"grad_norm": 11.986655235290527,
"learning_rate": 1.0975609756097562e-05,
"loss": 2.8049,
"step": 10
},
{
"epoch": 0.03668883283650539,
"grad_norm": 1.3442561626434326,
"learning_rate": 2.3170731707317075e-05,
"loss": 1.2643,
"step": 20
},
{
"epoch": 0.05503324925475808,
"grad_norm": 1.0081877708435059,
"learning_rate": 3.5365853658536584e-05,
"loss": 0.7278,
"step": 30
},
{
"epoch": 0.05503324925475808,
"eval_loss": 0.6342164874076843,
"eval_runtime": 47.3806,
"eval_samples_per_second": 4.854,
"eval_steps_per_second": 4.854,
"step": 30
},
{
"epoch": 0.07337766567301078,
"grad_norm": 0.8855087757110596,
"learning_rate": 4.75609756097561e-05,
"loss": 0.5154,
"step": 40
},
{
"epoch": 0.09172208209126347,
"grad_norm": 0.7658259272575378,
"learning_rate": 5.975609756097561e-05,
"loss": 0.4091,
"step": 50
},
{
"epoch": 0.11006649850951616,
"grad_norm": 1.1029750108718872,
"learning_rate": 7.195121951219513e-05,
"loss": 0.3197,
"step": 60
},
{
"epoch": 0.11006649850951616,
"eval_loss": 0.26837778091430664,
"eval_runtime": 47.1026,
"eval_samples_per_second": 4.883,
"eval_steps_per_second": 4.883,
"step": 60
},
{
"epoch": 0.12841091492776885,
"grad_norm": 0.5400053262710571,
"learning_rate": 8.414634146341464e-05,
"loss": 0.2753,
"step": 70
},
{
"epoch": 0.14675533134602156,
"grad_norm": 0.5874791145324707,
"learning_rate": 9.634146341463415e-05,
"loss": 0.243,
"step": 80
},
{
"epoch": 0.16509974776427425,
"grad_norm": 0.47156110405921936,
"learning_rate": 0.00010853658536585367,
"loss": 0.2144,
"step": 90
},
{
"epoch": 0.16509974776427425,
"eval_loss": 0.18793882429599762,
"eval_runtime": 46.9461,
"eval_samples_per_second": 4.899,
"eval_steps_per_second": 4.899,
"step": 90
},
{
"epoch": 0.18344416418252693,
"grad_norm": 0.6732025146484375,
"learning_rate": 0.00012073170731707318,
"loss": 0.1804,
"step": 100
},
{
"epoch": 0.20178858060077964,
"grad_norm": 0.4138810336589813,
"learning_rate": 0.0001329268292682927,
"loss": 0.1704,
"step": 110
},
{
"epoch": 0.22013299701903233,
"grad_norm": 0.3548312783241272,
"learning_rate": 0.0001451219512195122,
"loss": 0.1642,
"step": 120
},
{
"epoch": 0.22013299701903233,
"eval_loss": 0.14807145297527313,
"eval_runtime": 47.16,
"eval_samples_per_second": 4.877,
"eval_steps_per_second": 4.877,
"step": 120
},
{
"epoch": 0.238477413437285,
"grad_norm": 0.5005571842193604,
"learning_rate": 0.00015731707317073173,
"loss": 0.1688,
"step": 130
},
{
"epoch": 0.2568218298555377,
"grad_norm": 0.4382654130458832,
"learning_rate": 0.00016951219512195123,
"loss": 0.1479,
"step": 140
},
{
"epoch": 0.27516624627379044,
"grad_norm": 0.3288620114326477,
"learning_rate": 0.00018170731707317075,
"loss": 0.1542,
"step": 150
},
{
"epoch": 0.27516624627379044,
"eval_loss": 0.13612844049930573,
"eval_runtime": 46.9463,
"eval_samples_per_second": 4.899,
"eval_steps_per_second": 4.899,
"step": 150
},
{
"epoch": 0.2935106626920431,
"grad_norm": 0.48989060521125793,
"learning_rate": 0.00019390243902439025,
"loss": 0.1536,
"step": 160
},
{
"epoch": 0.3118550791102958,
"grad_norm": 0.2770105004310608,
"learning_rate": 0.00019999432180005332,
"loss": 0.1407,
"step": 170
},
{
"epoch": 0.3301994955285485,
"grad_norm": 1.1703704595565796,
"learning_rate": 0.00019994890006944105,
"loss": 0.1485,
"step": 180
},
{
"epoch": 0.3301994955285485,
"eval_loss": 0.13879014551639557,
"eval_runtime": 46.9507,
"eval_samples_per_second": 4.899,
"eval_steps_per_second": 4.899,
"step": 180
},
{
"epoch": 0.3485439119468012,
"grad_norm": 0.4198831617832184,
"learning_rate": 0.0001998580772407242,
"loss": 0.1488,
"step": 190
},
{
"epoch": 0.36688832836505386,
"grad_norm": 0.5260700583457947,
"learning_rate": 0.00019972189456954594,
"loss": 0.1589,
"step": 200
},
{
"epoch": 0.3852327447833066,
"grad_norm": 0.34109413623809814,
"learning_rate": 0.00019954041391594487,
"loss": 0.1429,
"step": 210
},
{
"epoch": 0.3852327447833066,
"eval_loss": 0.1368015855550766,
"eval_runtime": 46.9592,
"eval_samples_per_second": 4.898,
"eval_steps_per_second": 4.898,
"step": 210
},
{
"epoch": 0.4035771612015593,
"grad_norm": 0.5479871034622192,
"learning_rate": 0.00019931371771625544,
"loss": 0.1413,
"step": 220
},
{
"epoch": 0.42192157761981197,
"grad_norm": 0.2849060297012329,
"learning_rate": 0.00019904190894566194,
"loss": 0.1329,
"step": 230
},
{
"epoch": 0.44026599403806466,
"grad_norm": 0.2200181484222412,
"learning_rate": 0.00019872511107142261,
"loss": 0.15,
"step": 240
},
{
"epoch": 0.44026599403806466,
"eval_loss": 0.12958292663097382,
"eval_runtime": 46.9085,
"eval_samples_per_second": 4.903,
"eval_steps_per_second": 4.903,
"step": 240
},
{
"epoch": 0.45861041045631734,
"grad_norm": 0.31030556559562683,
"learning_rate": 0.00019836346799678568,
"loss": 0.1277,
"step": 250
},
{
"epoch": 0.47695482687457,
"grad_norm": 0.1974978893995285,
"learning_rate": 0.00019795714399562197,
"loss": 0.1293,
"step": 260
},
{
"epoch": 0.49529924329282277,
"grad_norm": 0.40907976031303406,
"learning_rate": 0.00019750632363780505,
"loss": 0.1437,
"step": 270
},
{
"epoch": 0.49529924329282277,
"eval_loss": 0.12063605338335037,
"eval_runtime": 47.4843,
"eval_samples_per_second": 4.844,
"eval_steps_per_second": 4.844,
"step": 270
},
{
"epoch": 0.5136436597110754,
"grad_norm": 0.37478116154670715,
"learning_rate": 0.00019701121170537125,
"loss": 0.1388,
"step": 280
},
{
"epoch": 0.5319880761293282,
"grad_norm": 0.24447715282440186,
"learning_rate": 0.00019647203309949913,
"loss": 0.1316,
"step": 290
},
{
"epoch": 0.5503324925475809,
"grad_norm": 0.44361746311187744,
"learning_rate": 0.00019588903273834953,
"loss": 0.14,
"step": 300
},
{
"epoch": 0.5503324925475809,
"eval_loss": 0.12744086980819702,
"eval_runtime": 47.4937,
"eval_samples_per_second": 4.843,
"eval_steps_per_second": 4.843,
"step": 300
},
{
"epoch": 0.5686769089658336,
"grad_norm": 0.734635591506958,
"learning_rate": 0.00019526247544581312,
"loss": 0.1403,
"step": 310
},
{
"epoch": 0.5870213253840862,
"grad_norm": 0.16803057491779327,
"learning_rate": 0.00019459264583121622,
"loss": 0.138,
"step": 320
},
{
"epoch": 0.6053657418023389,
"grad_norm": 0.29355189204216003,
"learning_rate": 0.00019387984816003867,
"loss": 0.1292,
"step": 330
},
{
"epoch": 0.6053657418023389,
"eval_loss": 0.11417385935783386,
"eval_runtime": 47.4735,
"eval_samples_per_second": 4.845,
"eval_steps_per_second": 4.845,
"step": 330
},
{
"epoch": 0.6237101582205916,
"grad_norm": 0.14734932780265808,
"learning_rate": 0.00019312440621570355,
"loss": 0.1319,
"step": 340
},
{
"epoch": 0.6420545746388443,
"grad_norm": 0.1784052848815918,
"learning_rate": 0.00019232666315250078,
"loss": 0.133,
"step": 350
},
{
"epoch": 0.660398991057097,
"grad_norm": 0.21272070705890656,
"learning_rate": 0.00019148698133971155,
"loss": 0.1244,
"step": 360
},
{
"epoch": 0.660398991057097,
"eval_loss": 0.11460884660482407,
"eval_runtime": 47.2816,
"eval_samples_per_second": 4.864,
"eval_steps_per_second": 4.864,
"step": 360
},
{
"epoch": 0.6787434074753497,
"grad_norm": 0.14382557570934296,
"learning_rate": 0.0001906057421970046,
"loss": 0.1252,
"step": 370
},
{
"epoch": 0.6970878238936024,
"grad_norm": 0.138260617852211,
"learning_rate": 0.00018968334602117906,
"loss": 0.1358,
"step": 380
},
{
"epoch": 0.715432240311855,
"grad_norm": 0.14098672568798065,
"learning_rate": 0.00018872021180433232,
"loss": 0.1183,
"step": 390
},
{
"epoch": 0.715432240311855,
"eval_loss": 0.11497555673122406,
"eval_runtime": 47.5258,
"eval_samples_per_second": 4.839,
"eval_steps_per_second": 4.839,
"step": 390
},
{
"epoch": 0.7337766567301077,
"grad_norm": 0.5053527355194092,
"learning_rate": 0.0001877167770435357,
"loss": 0.1181,
"step": 400
},
{
"epoch": 0.7521210731483605,
"grad_norm": 0.12653136253356934,
"learning_rate": 0.00018667349754210457,
"loss": 0.1274,
"step": 410
},
{
"epoch": 0.7704654895666132,
"grad_norm": 0.11409477144479752,
"learning_rate": 0.00018559084720255276,
"loss": 0.1336,
"step": 420
},
{
"epoch": 0.7704654895666132,
"eval_loss": 0.11223822832107544,
"eval_runtime": 47.4963,
"eval_samples_per_second": 4.842,
"eval_steps_per_second": 4.842,
"step": 420
},
{
"epoch": 0.7888099059848659,
"grad_norm": 0.12272641062736511,
"learning_rate": 0.00018446931781132553,
"loss": 0.1297,
"step": 430
},
{
"epoch": 0.8071543224031186,
"grad_norm": 0.1796967089176178,
"learning_rate": 0.00018330941881540915,
"loss": 0.1378,
"step": 440
},
{
"epoch": 0.8254987388213713,
"grad_norm": 0.11490087956190109,
"learning_rate": 0.00018211167709091802,
"loss": 0.1262,
"step": 450
},
{
"epoch": 0.8254987388213713,
"eval_loss": 0.11122166365385056,
"eval_runtime": 47.6342,
"eval_samples_per_second": 4.828,
"eval_steps_per_second": 4.828,
"step": 450
},
{
"epoch": 0.8438431552396239,
"grad_norm": 0.1275721788406372,
"learning_rate": 0.00018087663670376483,
"loss": 0.1244,
"step": 460
},
{
"epoch": 0.8621875716578766,
"grad_norm": 0.10838750749826431,
"learning_rate": 0.0001796048586625223,
"loss": 0.1278,
"step": 470
},
{
"epoch": 0.8805319880761293,
"grad_norm": 0.132126122713089,
"learning_rate": 0.00017829692066358914,
"loss": 0.1365,
"step": 480
},
{
"epoch": 0.8805319880761293,
"eval_loss": 0.11278139054775238,
"eval_runtime": 47.4269,
"eval_samples_per_second": 4.85,
"eval_steps_per_second": 4.85,
"step": 480
},
{
"epoch": 0.898876404494382,
"grad_norm": 0.130384624004364,
"learning_rate": 0.0001769534168287752,
"loss": 0.1215,
"step": 490
},
{
"epoch": 0.9172208209126347,
"grad_norm": 0.11609125882387161,
"learning_rate": 0.00017557495743542585,
"loss": 0.1178,
"step": 500
},
{
"epoch": 0.9355652373308874,
"grad_norm": 0.12103743851184845,
"learning_rate": 0.0001741621686392077,
"loss": 0.125,
"step": 510
},
{
"epoch": 0.9355652373308874,
"eval_loss": 0.11113429814577103,
"eval_runtime": 47.2207,
"eval_samples_per_second": 4.871,
"eval_steps_per_second": 4.871,
"step": 510
},
{
"epoch": 0.95390965374914,
"grad_norm": 0.13481061160564423,
"learning_rate": 0.00017271569218968175,
"loss": 0.1233,
"step": 520
},
{
"epoch": 0.9722540701673928,
"grad_norm": 0.12927769124507904,
"learning_rate": 0.00017123618513879295,
"loss": 0.1237,
"step": 530
},
{
"epoch": 0.9905984865856455,
"grad_norm": 0.1044018417596817,
"learning_rate": 0.00016972431954240906,
"loss": 0.1222,
"step": 540
},
{
"epoch": 0.9905984865856455,
"eval_loss": 0.11119447648525238,
"eval_runtime": 47.6107,
"eval_samples_per_second": 4.831,
"eval_steps_per_second": 4.831,
"step": 540
},
{
"epoch": 1.0073377665673011,
"grad_norm": 0.10839453339576721,
"learning_rate": 0.0001681807821550438,
"loss": 0.1076,
"step": 550
},
{
"epoch": 1.0256821829855538,
"grad_norm": 0.151426762342453,
"learning_rate": 0.00016660627411790329,
"loss": 0.1131,
"step": 560
},
{
"epoch": 1.0440265994038065,
"grad_norm": 0.1167786568403244,
"learning_rate": 0.00016500151064039766,
"loss": 0.1332,
"step": 570
},
{
"epoch": 1.0440265994038065,
"eval_loss": 0.11187437176704407,
"eval_runtime": 47.4208,
"eval_samples_per_second": 4.85,
"eval_steps_per_second": 4.85,
"step": 570
},
{
"epoch": 1.0623710158220592,
"grad_norm": 0.0833975151181221,
"learning_rate": 0.0001633672206752621,
"loss": 0.1236,
"step": 580
},
{
"epoch": 1.0807154322403119,
"grad_norm": 0.08196871727705002,
"learning_rate": 0.00016170414658743488,
"loss": 0.1177,
"step": 590
},
{
"epoch": 1.0990598486585645,
"grad_norm": 0.1361839324235916,
"learning_rate": 0.00016001304381684347,
"loss": 0.1103,
"step": 600
},
{
"epoch": 1.0990598486585645,
"eval_loss": 0.11141453683376312,
"eval_runtime": 47.4348,
"eval_samples_per_second": 4.849,
"eval_steps_per_second": 4.849,
"step": 600
},
{
"epoch": 1.1174042650768172,
"grad_norm": 0.1309792697429657,
"learning_rate": 0.00015829468053525102,
"loss": 0.1285,
"step": 610
},
{
"epoch": 1.13574868149507,
"grad_norm": 0.09610354155302048,
"learning_rate": 0.00015654983729731977,
"loss": 0.1278,
"step": 620
},
{
"epoch": 1.1540930979133226,
"grad_norm": 0.08027515560388565,
"learning_rate": 0.00015477930668604916,
"loss": 0.117,
"step": 630
},
{
"epoch": 1.1540930979133226,
"eval_loss": 0.11004864424467087,
"eval_runtime": 47.4023,
"eval_samples_per_second": 4.852,
"eval_steps_per_second": 4.852,
"step": 630
},
{
"epoch": 1.1724375143315753,
"grad_norm": 0.25526005029678345,
"learning_rate": 0.00015298389295275098,
"loss": 0.1179,
"step": 640
},
{
"epoch": 1.190781930749828,
"grad_norm": 0.2913426458835602,
"learning_rate": 0.00015116441165172328,
"loss": 0.1226,
"step": 650
},
{
"epoch": 1.2091263471680807,
"grad_norm": 0.12240047007799149,
"learning_rate": 0.00014932168926979074,
"loss": 0.1206,
"step": 660
},
{
"epoch": 1.2091263471680807,
"eval_loss": 0.11615979671478271,
"eval_runtime": 47.3821,
"eval_samples_per_second": 4.854,
"eval_steps_per_second": 4.854,
"step": 660
},
{
"epoch": 1.2274707635863333,
"grad_norm": 0.5275110602378845,
"learning_rate": 0.00014745656285087866,
"loss": 0.131,
"step": 670
},
{
"epoch": 1.245815180004586,
"grad_norm": 0.08649874478578568,
"learning_rate": 0.00014556987961579146,
"loss": 0.1273,
"step": 680
},
{
"epoch": 1.264159596422839,
"grad_norm": 0.21493305265903473,
"learning_rate": 0.00014366249657736866,
"loss": 0.1238,
"step": 690
},
{
"epoch": 1.264159596422839,
"eval_loss": 0.11726190149784088,
"eval_runtime": 47.4729,
"eval_samples_per_second": 4.845,
"eval_steps_per_second": 4.845,
"step": 690
},
{
"epoch": 1.2825040128410916,
"grad_norm": 0.2242494374513626,
"learning_rate": 0.00014173528015119246,
"loss": 0.1267,
"step": 700
},
{
"epoch": 1.3008484292593443,
"grad_norm": 0.1080465167760849,
"learning_rate": 0.0001397891057620247,
"loss": 0.1275,
"step": 710
},
{
"epoch": 1.319192845677597,
"grad_norm": 0.11881080269813538,
"learning_rate": 0.00013782485744615096,
"loss": 0.1265,
"step": 720
},
{
"epoch": 1.319192845677597,
"eval_loss": 0.11231047660112381,
"eval_runtime": 47.5156,
"eval_samples_per_second": 4.841,
"eval_steps_per_second": 4.841,
"step": 720
},
{
"epoch": 1.3375372620958497,
"grad_norm": 0.11510378122329712,
"learning_rate": 0.0001358434274498134,
"loss": 0.118,
"step": 730
},
{
"epoch": 1.3558816785141024,
"grad_norm": 0.1533641517162323,
"learning_rate": 0.00013384571582391393,
"loss": 0.1239,
"step": 740
},
{
"epoch": 1.374226094932355,
"grad_norm": 0.09978242218494415,
"learning_rate": 0.00013183263001517224,
"loss": 0.1206,
"step": 750
},
{
"epoch": 1.374226094932355,
"eval_loss": 0.1097174733877182,
"eval_runtime": 47.3489,
"eval_samples_per_second": 4.858,
"eval_steps_per_second": 4.858,
"step": 750
},
{
"epoch": 1.3925705113506077,
"grad_norm": 0.10042322427034378,
"learning_rate": 0.0001298050844539246,
"loss": 0.1244,
"step": 760
},
{
"epoch": 1.4109149277688604,
"grad_norm": 0.09835106879472733,
"learning_rate": 0.00012776400013875006,
"loss": 0.1157,
"step": 770
},
{
"epoch": 1.429259344187113,
"grad_norm": 0.0930342748761177,
"learning_rate": 0.00012571030421811314,
"loss": 0.1236,
"step": 780
},
{
"epoch": 1.429259344187113,
"eval_loss": 0.11065001785755157,
"eval_runtime": 47.4754,
"eval_samples_per_second": 4.845,
"eval_steps_per_second": 4.845,
"step": 780
},
{
"epoch": 1.4476037606053658,
"grad_norm": 0.08919622749090195,
"learning_rate": 0.0001236449295692131,
"loss": 0.1261,
"step": 790
},
{
"epoch": 1.4659481770236185,
"grad_norm": 0.0871356800198555,
"learning_rate": 0.00012156881437423103,
"loss": 0.1215,
"step": 800
},
{
"epoch": 1.4842925934418711,
"grad_norm": 0.11698926240205765,
"learning_rate": 0.00011948290169416682,
"loss": 0.1269,
"step": 810
},
{
"epoch": 1.4842925934418711,
"eval_loss": 0.10887381434440613,
"eval_runtime": 47.4819,
"eval_samples_per_second": 4.844,
"eval_steps_per_second": 4.844,
"step": 810
},
{
"epoch": 1.5026370098601238,
"grad_norm": 0.06746023893356323,
"learning_rate": 0.00011738813904046044,
"loss": 0.1122,
"step": 820
},
{
"epoch": 1.5209814262783765,
"grad_norm": 0.09794127196073532,
"learning_rate": 0.00011528547794459128,
"loss": 0.1161,
"step": 830
},
{
"epoch": 1.5393258426966292,
"grad_norm": 0.15457604825496674,
"learning_rate": 0.00011317587352585157,
"loss": 0.1281,
"step": 840
},
{
"epoch": 1.5393258426966292,
"eval_loss": 0.10888072103261948,
"eval_runtime": 47.3656,
"eval_samples_per_second": 4.856,
"eval_steps_per_second": 4.856,
"step": 840
},
{
"epoch": 1.5576702591148819,
"grad_norm": 0.08662489801645279,
"learning_rate": 0.00011106028405749005,
"loss": 0.1191,
"step": 850
},
{
"epoch": 1.5760146755331346,
"grad_norm": 0.08548998087644577,
"learning_rate": 0.00010893967053142296,
"loss": 0.1196,
"step": 860
},
{
"epoch": 1.5943590919513873,
"grad_norm": 0.10732567310333252,
"learning_rate": 0.00010681499622171005,
"loss": 0.1246,
"step": 870
},
{
"epoch": 1.5943590919513873,
"eval_loss": 0.10937893390655518,
"eval_runtime": 47.5072,
"eval_samples_per_second": 4.841,
"eval_steps_per_second": 4.841,
"step": 870
},
{
"epoch": 1.61270350836964,
"grad_norm": 0.07822810858488083,
"learning_rate": 0.00010468722624699401,
"loss": 0.1121,
"step": 880
},
{
"epoch": 1.6310479247878926,
"grad_norm": 0.07400278747081757,
"learning_rate": 0.00010255732713210206,
"loss": 0.124,
"step": 890
},
{
"epoch": 1.6493923412061453,
"grad_norm": 0.0789525955915451,
"learning_rate": 0.00010042626636900856,
"loss": 0.1181,
"step": 900
},
{
"epoch": 1.6493923412061453,
"eval_loss": 0.10813174396753311,
"eval_runtime": 47.5685,
"eval_samples_per_second": 4.835,
"eval_steps_per_second": 4.835,
"step": 900
},
{
"epoch": 1.667736757624398,
"grad_norm": 0.07806720584630966,
"learning_rate": 9.829501197735866e-05,
"loss": 0.1172,
"step": 910
},
{
"epoch": 1.6860811740426507,
"grad_norm": 0.09959676861763,
"learning_rate": 9.616453206475179e-05,
"loss": 0.1214,
"step": 920
},
{
"epoch": 1.7044255904609034,
"grad_norm": 0.07471271604299545,
"learning_rate": 9.40357943869858e-05,
"loss": 0.1212,
"step": 930
},
{
"epoch": 1.7044255904609034,
"eval_loss": 0.1081625297665596,
"eval_runtime": 47.4204,
"eval_samples_per_second": 4.85,
"eval_steps_per_second": 4.85,
"step": 930
},
{
"epoch": 1.722770006879156,
"grad_norm": 0.0786692202091217,
"learning_rate": 9.190976590846027e-05,
"loss": 0.1187,
"step": 940
},
{
"epoch": 1.7411144232974087,
"grad_norm": 0.09310892224311829,
"learning_rate": 8.978741236293973e-05,
"loss": 0.115,
"step": 950
},
{
"epoch": 1.7594588397156614,
"grad_norm": 0.10510563850402832,
"learning_rate": 8.766969781487578e-05,
"loss": 0.1254,
"step": 960
},
{
"epoch": 1.7594588397156614,
"eval_loss": 0.10792473703622818,
"eval_runtime": 47.4662,
"eval_samples_per_second": 4.846,
"eval_steps_per_second": 4.846,
"step": 960
},
{
"epoch": 1.777803256133914,
"grad_norm": 0.07966180145740509,
"learning_rate": 8.555758422148745e-05,
"loss": 0.121,
"step": 970
},
{
"epoch": 1.7961476725521668,
"grad_norm": 0.08049053698778152,
"learning_rate": 8.345203099579874e-05,
"loss": 0.11,
"step": 980
},
{
"epoch": 1.8144920889704195,
"grad_norm": 0.06832806766033173,
"learning_rate": 8.13539945708319e-05,
"loss": 0.1236,
"step": 990
},
{
"epoch": 1.8144920889704195,
"eval_loss": 0.10886505246162415,
"eval_runtime": 47.5036,
"eval_samples_per_second": 4.842,
"eval_steps_per_second": 4.842,
"step": 990
},
{
"epoch": 1.8328365053886724,
"grad_norm": 0.08461681008338928,
"learning_rate": 7.926442796515429e-05,
"loss": 0.1341,
"step": 1000
},
{
"epoch": 1.851180921806925,
"grad_norm": 0.07851795107126236,
"learning_rate": 7.71842803499764e-05,
"loss": 0.1306,
"step": 1010
},
{
"epoch": 1.8695253382251777,
"grad_norm": 0.0859571248292923,
"learning_rate": 7.51144966179972e-05,
"loss": 0.1155,
"step": 1020
},
{
"epoch": 1.8695253382251777,
"eval_loss": 0.107667475938797,
"eval_runtime": 47.4797,
"eval_samples_per_second": 4.844,
"eval_steps_per_second": 4.844,
"step": 1020
},
{
"epoch": 1.8878697546434304,
"grad_norm": 0.08879910409450531,
"learning_rate": 7.305601695419323e-05,
"loss": 0.1173,
"step": 1030
},
{
"epoch": 1.9062141710616831,
"grad_norm": 0.07893572747707367,
"learning_rate": 7.10097764087462e-05,
"loss": 0.1109,
"step": 1040
},
{
"epoch": 1.9245585874799358,
"grad_norm": 0.0726647600531578,
"learning_rate": 6.897670447230262e-05,
"loss": 0.1136,
"step": 1050
},
{
"epoch": 1.9245585874799358,
"eval_loss": 0.10811686515808105,
"eval_runtime": 47.401,
"eval_samples_per_second": 4.852,
"eval_steps_per_second": 4.852,
"step": 1050
},
{
"epoch": 1.9429030038981885,
"grad_norm": 0.08151935786008835,
"learning_rate": 6.69577246537593e-05,
"loss": 0.1145,
"step": 1060
},
{
"epoch": 1.9612474203164412,
"grad_norm": 0.09550312161445618,
"learning_rate": 6.495375406076573e-05,
"loss": 0.1215,
"step": 1070
},
{
"epoch": 1.9795918367346939,
"grad_norm": 0.20997484028339386,
"learning_rate": 6.296570298313431e-05,
"loss": 0.1222,
"step": 1080
},
{
"epoch": 1.9795918367346939,
"eval_loss": 0.1070978045463562,
"eval_runtime": 47.3536,
"eval_samples_per_second": 4.857,
"eval_steps_per_second": 4.857,
"step": 1080
},
{
"epoch": 1.9979362531529465,
"grad_norm": 0.07865249365568161,
"learning_rate": 6.099447447934743e-05,
"loss": 0.1113,
"step": 1090
},
{
"epoch": 2.0146755331346022,
"grad_norm": 0.08859091252088547,
"learning_rate": 5.904096396634935e-05,
"loss": 0.1253,
"step": 1100
},
{
"epoch": 2.033019949552855,
"grad_norm": 0.08467495441436768,
"learning_rate": 5.710605881280939e-05,
"loss": 0.1168,
"step": 1110
},
{
"epoch": 2.033019949552855,
"eval_loss": 0.10781675577163696,
"eval_runtime": 47.5138,
"eval_samples_per_second": 4.841,
"eval_steps_per_second": 4.841,
"step": 1110
},
{
"epoch": 2.0513643659711076,
"grad_norm": 0.07836291193962097,
"learning_rate": 5.519063793604067e-05,
"loss": 0.1138,
"step": 1120
},
{
"epoch": 2.0697087823893603,
"grad_norm": 0.10746737569570541,
"learning_rate": 5.329557140275801e-05,
"loss": 0.1146,
"step": 1130
},
{
"epoch": 2.088053198807613,
"grad_norm": 0.06803814321756363,
"learning_rate": 5.1421720033856216e-05,
"loss": 0.1148,
"step": 1140
},
{
"epoch": 2.088053198807613,
"eval_loss": 0.10701934248209,
"eval_runtime": 47.4053,
"eval_samples_per_second": 4.852,
"eval_steps_per_second": 4.852,
"step": 1140
},
{
"epoch": 2.1063976152258657,
"grad_norm": 0.08060242980718613,
"learning_rate": 4.9569935013388125e-05,
"loss": 0.1099,
"step": 1150
},
{
"epoch": 2.1247420316441183,
"grad_norm": 0.08353856205940247,
"learning_rate": 4.774105750192e-05,
"loss": 0.1203,
"step": 1160
},
{
"epoch": 2.143086448062371,
"grad_norm": 0.08800723403692245,
"learning_rate": 4.593591825444028e-05,
"loss": 0.112,
"step": 1170
},
{
"epoch": 2.143086448062371,
"eval_loss": 0.10760512948036194,
"eval_runtime": 47.4958,
"eval_samples_per_second": 4.843,
"eval_steps_per_second": 4.843,
"step": 1170
},
{
"epoch": 2.1614308644806237,
"grad_norm": 0.07223484665155411,
"learning_rate": 4.415533724299471e-05,
"loss": 0.1221,
"step": 1180
},
{
"epoch": 2.1797752808988764,
"grad_norm": 0.07669170200824738,
"learning_rate": 4.240012328421997e-05,
"loss": 0.1088,
"step": 1190
},
{
"epoch": 2.198119697317129,
"grad_norm": 0.07591399550437927,
"learning_rate": 4.067107367194397e-05,
"loss": 0.1115,
"step": 1200
},
{
"epoch": 2.198119697317129,
"eval_loss": 0.1069360300898552,
"eval_runtime": 47.5987,
"eval_samples_per_second": 4.832,
"eval_steps_per_second": 4.832,
"step": 1200
},
{
"epoch": 2.2164641137353818,
"grad_norm": 0.0850834846496582,
"learning_rate": 3.8968973815020806e-05,
"loss": 0.1149,
"step": 1210
},
{
"epoch": 2.2348085301536345,
"grad_norm": 0.08643563091754913,
"learning_rate": 3.729459688056427e-05,
"loss": 0.1125,
"step": 1220
},
{
"epoch": 2.253152946571887,
"grad_norm": 0.07351703196763992,
"learning_rate": 3.564870344274185e-05,
"loss": 0.1099,
"step": 1230
},
{
"epoch": 2.253152946571887,
"eval_loss": 0.10722808539867401,
"eval_runtime": 47.5417,
"eval_samples_per_second": 4.838,
"eval_steps_per_second": 4.838,
"step": 1230
},
{
"epoch": 2.27149736299014,
"grad_norm": 0.09461668133735657,
"learning_rate": 3.403204113728933e-05,
"loss": 0.1189,
"step": 1240
},
{
"epoch": 2.2898417794083925,
"grad_norm": 0.09557740390300751,
"learning_rate": 3.244534432190225e-05,
"loss": 0.1219,
"step": 1250
},
{
"epoch": 2.308186195826645,
"grad_norm": 0.09171107411384583,
"learning_rate": 3.088933374265919e-05,
"loss": 0.1199,
"step": 1260
},
{
"epoch": 2.308186195826645,
"eval_loss": 0.10666479170322418,
"eval_runtime": 47.7241,
"eval_samples_per_second": 4.819,
"eval_steps_per_second": 4.819,
"step": 1260
},
{
"epoch": 2.326530612244898,
"grad_norm": 0.09569697827100754,
"learning_rate": 2.936471620662763e-05,
"loss": 0.1018,
"step": 1270
},
{
"epoch": 2.3448750286631506,
"grad_norm": 0.10778363794088364,
"learning_rate": 2.7872184260801838e-05,
"loss": 0.1156,
"step": 1280
},
{
"epoch": 2.3632194450814032,
"grad_norm": 0.07144766300916672,
"learning_rate": 2.6412415877518238e-05,
"loss": 0.1171,
"step": 1290
},
{
"epoch": 2.3632194450814032,
"eval_loss": 0.10628043115139008,
"eval_runtime": 47.8715,
"eval_samples_per_second": 4.805,
"eval_steps_per_second": 4.805,
"step": 1290
},
{
"epoch": 2.381563861499656,
"grad_norm": 0.10883186757564545,
"learning_rate": 2.4986074146490967e-05,
"loss": 0.115,
"step": 1300
},
{
"epoch": 2.3999082779179086,
"grad_norm": 0.08375997096300125,
"learning_rate": 2.35938069736081e-05,
"loss": 0.1134,
"step": 1310
},
{
"epoch": 2.4182526943361613,
"grad_norm": 0.09004294127225876,
"learning_rate": 2.2236246786624792e-05,
"loss": 0.1067,
"step": 1320
},
{
"epoch": 2.4182526943361613,
"eval_loss": 0.10667029023170471,
"eval_runtime": 47.8027,
"eval_samples_per_second": 4.811,
"eval_steps_per_second": 4.811,
"step": 1320
},
{
"epoch": 2.436597110754414,
"grad_norm": 0.08431920409202576,
"learning_rate": 2.091401024788745e-05,
"loss": 0.1175,
"step": 1330
},
{
"epoch": 2.4549415271726667,
"grad_norm": 0.06736938655376434,
"learning_rate": 1.962769797421895e-05,
"loss": 0.1081,
"step": 1340
},
{
"epoch": 2.4732859435909194,
"grad_norm": 0.0825665220618248,
"learning_rate": 1.83778942640927e-05,
"loss": 0.1144,
"step": 1350
},
{
"epoch": 2.4732859435909194,
"eval_loss": 0.10638684034347534,
"eval_runtime": 47.5125,
"eval_samples_per_second": 4.841,
"eval_steps_per_second": 4.841,
"step": 1350
},
{
"epoch": 2.491630360009172,
"grad_norm": 0.09597857296466827,
"learning_rate": 1.716516683221906e-05,
"loss": 0.1174,
"step": 1360
},
{
"epoch": 2.509974776427425,
"grad_norm": 0.087304025888443,
"learning_rate": 1.5990066551664906e-05,
"loss": 0.1217,
"step": 1370
},
{
"epoch": 2.528319192845678,
"grad_norm": 0.1020963117480278,
"learning_rate": 1.4853127203623252e-05,
"loss": 0.1206,
"step": 1380
},
{
"epoch": 2.528319192845678,
"eval_loss": 0.10611271858215332,
"eval_runtime": 47.5951,
"eval_samples_per_second": 4.832,
"eval_steps_per_second": 4.832,
"step": 1380
},
{
"epoch": 2.5466636092639305,
"grad_norm": 0.09987211227416992,
"learning_rate": 1.3754865234946835e-05,
"loss": 0.1237,
"step": 1390
},
{
"epoch": 2.5650080256821832,
"grad_norm": 0.09537294507026672,
"learning_rate": 1.2695779523555829e-05,
"loss": 0.1074,
"step": 1400
},
{
"epoch": 2.583352442100436,
"grad_norm": 0.06951133906841278,
"learning_rate": 1.1676351151825804e-05,
"loss": 0.1113,
"step": 1410
},
{
"epoch": 2.583352442100436,
"eval_loss": 0.10620437562465668,
"eval_runtime": 47.5731,
"eval_samples_per_second": 4.835,
"eval_steps_per_second": 4.835,
"step": 1410
},
{
"epoch": 2.6016968585186886,
"grad_norm": 0.08393154293298721,
"learning_rate": 1.0697043188059475e-05,
"loss": 0.1082,
"step": 1420
},
{
"epoch": 2.6200412749369413,
"grad_norm": 0.08919060230255127,
"learning_rate": 9.75830047614117e-06,
"loss": 0.1129,
"step": 1430
},
{
"epoch": 2.638385691355194,
"grad_norm": 0.08663026988506317,
"learning_rate": 8.860549433469444e-06,
"loss": 0.1151,
"step": 1440
},
{
"epoch": 2.638385691355194,
"eval_loss": 0.10626456141471863,
"eval_runtime": 47.3921,
"eval_samples_per_second": 4.853,
"eval_steps_per_second": 4.853,
"step": 1440
},
{
"epoch": 2.6567301077734466,
"grad_norm": 0.09450593590736389,
"learning_rate": 8.004197857260042e-06,
"loss": 0.1154,
"step": 1450
},
{
"epoch": 2.6750745241916993,
"grad_norm": 0.08700842410326004,
"learning_rate": 7.189634739306705e-06,
"loss": 0.1113,
"step": 1460
},
{
"epoch": 2.693418940609952,
"grad_norm": 0.08897579461336136,
"learning_rate": 6.4172300892844425e-06,
"loss": 0.1068,
"step": 1470
},
{
"epoch": 2.693418940609952,
"eval_loss": 0.10611724108457565,
"eval_runtime": 47.6236,
"eval_samples_per_second": 4.83,
"eval_steps_per_second": 4.83,
"step": 1470
},
{
"epoch": 2.7117633570282047,
"grad_norm": 0.08531934022903442,
"learning_rate": 5.687334766675123e-06,
"loss": 0.1057,
"step": 1480
},
{
"epoch": 2.7301077734464574,
"grad_norm": 0.08912410587072372,
"learning_rate": 5.000280321392004e-06,
"loss": 0.1179,
"step": 1490
},
{
"epoch": 2.74845218986471,
"grad_norm": 0.0903608500957489,
"learning_rate": 4.356378843175446e-06,
"loss": 0.1137,
"step": 1500
},
{
"epoch": 2.74845218986471,
"eval_loss": 0.10605704039335251,
"eval_runtime": 47.5904,
"eval_samples_per_second": 4.833,
"eval_steps_per_second": 4.833,
"step": 1500
},
{
"epoch": 2.7667966062829628,
"grad_norm": 0.08206541836261749,
"learning_rate": 3.75592281982835e-06,
"loss": 0.121,
"step": 1510
},
{
"epoch": 2.7851410227012154,
"grad_norm": 0.08107905834913254,
"learning_rate": 3.1991850043555425e-06,
"loss": 0.1141,
"step": 1520
},
{
"epoch": 2.803485439119468,
"grad_norm": 0.09001165628433228,
"learning_rate": 2.6864182910676273e-06,
"loss": 0.1135,
"step": 1530
},
{
"epoch": 2.803485439119468,
"eval_loss": 0.10597212612628937,
"eval_runtime": 47.6034,
"eval_samples_per_second": 4.832,
"eval_steps_per_second": 4.832,
"step": 1530
},
{
"epoch": 2.821829855537721,
"grad_norm": 0.08551483601331711,
"learning_rate": 2.2178556007054872e-06,
"loss": 0.1209,
"step": 1540
},
{
"epoch": 2.8401742719559735,
"grad_norm": 0.09350485354661942,
"learning_rate": 1.793709774637653e-06,
"loss": 0.1174,
"step": 1550
},
{
"epoch": 2.858518688374226,
"grad_norm": 0.08843007683753967,
"learning_rate": 1.41417347817856e-06,
"loss": 0.1146,
"step": 1560
},
{
"epoch": 2.858518688374226,
"eval_loss": 0.10600461810827255,
"eval_runtime": 47.5109,
"eval_samples_per_second": 4.841,
"eval_steps_per_second": 4.841,
"step": 1560
},
{
"epoch": 2.876863104792479,
"grad_norm": 0.0968756377696991,
"learning_rate": 1.079419113071678e-06,
"loss": 0.1239,
"step": 1570
},
{
"epoch": 2.8952075212107315,
"grad_norm": 0.09828708320856094,
"learning_rate": 7.895987391771997e-07,
"loss": 0.1206,
"step": 1580
},
{
"epoch": 2.9135519376289842,
"grad_norm": 0.09327838569879532,
"learning_rate": 5.448440053999137e-07,
"loss": 0.1173,
"step": 1590
},
{
"epoch": 2.9135519376289842,
"eval_loss": 0.10602891445159912,
"eval_runtime": 47.5894,
"eval_samples_per_second": 4.833,
"eval_steps_per_second": 4.833,
"step": 1590
},
{
"epoch": 2.931896354047237,
"grad_norm": 0.07366887480020523,
"learning_rate": 3.45266089888574e-07,
"loss": 0.1076,
"step": 1600
}
],
"logging_steps": 10,
"max_steps": 1638,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.2220701474596557e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}