{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 30,
"global_step": 1162,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017219113215669393,
"grad_norm": 8.32422161102295,
"learning_rate": 1.5384615384615387e-05,
"loss": 2.589,
"step": 10
},
{
"epoch": 0.034438226431338786,
"grad_norm": 0.8107045292854309,
"learning_rate": 3.247863247863248e-05,
"loss": 0.9674,
"step": 20
},
{
"epoch": 0.05165733964700818,
"grad_norm": 0.7474735379219055,
"learning_rate": 4.9572649572649575e-05,
"loss": 0.5896,
"step": 30
},
{
"epoch": 0.05165733964700818,
"eval_loss": 0.5013840198516846,
"eval_runtime": 53.9761,
"eval_samples_per_second": 4.539,
"eval_steps_per_second": 4.539,
"step": 30
},
{
"epoch": 0.06887645286267757,
"grad_norm": 0.5421889424324036,
"learning_rate": 6.666666666666667e-05,
"loss": 0.439,
"step": 40
},
{
"epoch": 0.08609556607834697,
"grad_norm": 0.5523178577423096,
"learning_rate": 8.376068376068377e-05,
"loss": 0.334,
"step": 50
},
{
"epoch": 0.10331467929401636,
"grad_norm": 0.5268102884292603,
"learning_rate": 0.00010085470085470086,
"loss": 0.2835,
"step": 60
},
{
"epoch": 0.10331467929401636,
"eval_loss": 0.2452857494354248,
"eval_runtime": 53.6208,
"eval_samples_per_second": 4.569,
"eval_steps_per_second": 4.569,
"step": 60
},
{
"epoch": 0.12053379250968575,
"grad_norm": 0.4393221437931061,
"learning_rate": 0.00011794871794871796,
"loss": 0.2366,
"step": 70
},
{
"epoch": 0.13775290572535515,
"grad_norm": 0.5775641202926636,
"learning_rate": 0.00013504273504273505,
"loss": 0.2218,
"step": 80
},
{
"epoch": 0.15497201894102453,
"grad_norm": 0.4302266240119934,
"learning_rate": 0.00015213675213675214,
"loss": 0.1986,
"step": 90
},
{
"epoch": 0.15497201894102453,
"eval_loss": 0.1793452501296997,
"eval_runtime": 53.5304,
"eval_samples_per_second": 4.577,
"eval_steps_per_second": 4.577,
"step": 90
},
{
"epoch": 0.17219113215669393,
"grad_norm": 0.39770159125328064,
"learning_rate": 0.00016923076923076923,
"loss": 0.186,
"step": 100
},
{
"epoch": 0.1894102453723633,
"grad_norm": 0.4006386399269104,
"learning_rate": 0.00018632478632478634,
"loss": 0.1708,
"step": 110
},
{
"epoch": 0.20662935858803272,
"grad_norm": 0.4491441547870636,
"learning_rate": 0.00019999819242697418,
"loss": 0.1712,
"step": 120
},
{
"epoch": 0.20662935858803272,
"eval_loss": 0.15836507081985474,
"eval_runtime": 53.6179,
"eval_samples_per_second": 4.569,
"eval_steps_per_second": 4.569,
"step": 120
},
{
"epoch": 0.2238484718037021,
"grad_norm": 0.33016568422317505,
"learning_rate": 0.00019993493423217814,
"loss": 0.1638,
"step": 130
},
{
"epoch": 0.2410675850193715,
"grad_norm": 0.22399674355983734,
"learning_rate": 0.00019978136272187747,
"loss": 0.1533,
"step": 140
},
{
"epoch": 0.2582866982350409,
"grad_norm": 0.2746177911758423,
"learning_rate": 0.0001995376166818969,
"loss": 0.158,
"step": 150
},
{
"epoch": 0.2582866982350409,
"eval_loss": 0.14334778487682343,
"eval_runtime": 53.578,
"eval_samples_per_second": 4.573,
"eval_steps_per_second": 4.573,
"step": 150
},
{
"epoch": 0.2755058114507103,
"grad_norm": 0.2805308699607849,
"learning_rate": 0.00019920391639069242,
"loss": 0.1509,
"step": 160
},
{
"epoch": 0.29272492466637967,
"grad_norm": 0.4730603098869324,
"learning_rate": 0.00019878056342028102,
"loss": 0.1496,
"step": 170
},
{
"epoch": 0.30994403788204905,
"grad_norm": 0.23217393457889557,
"learning_rate": 0.00019826794036370362,
"loss": 0.1472,
"step": 180
},
{
"epoch": 0.30994403788204905,
"eval_loss": 0.13477234542369843,
"eval_runtime": 53.4805,
"eval_samples_per_second": 4.581,
"eval_steps_per_second": 4.581,
"step": 180
},
{
"epoch": 0.3271631510977185,
"grad_norm": 0.28313347697257996,
"learning_rate": 0.0001976665104892678,
"loss": 0.147,
"step": 190
},
{
"epoch": 0.34438226431338786,
"grad_norm": 0.15724846720695496,
"learning_rate": 0.00019697681732188218,
"loss": 0.1426,
"step": 200
},
{
"epoch": 0.36160137752905724,
"grad_norm": 0.24457155168056488,
"learning_rate": 0.00019619948415186173,
"loss": 0.1397,
"step": 210
},
{
"epoch": 0.36160137752905724,
"eval_loss": 0.13591289520263672,
"eval_runtime": 53.3683,
"eval_samples_per_second": 4.591,
"eval_steps_per_second": 4.591,
"step": 210
},
{
"epoch": 0.3788204907447266,
"grad_norm": 0.25315046310424805,
"learning_rate": 0.00019533521347164687,
"loss": 0.1454,
"step": 220
},
{
"epoch": 0.39603960396039606,
"grad_norm": 0.20438989996910095,
"learning_rate": 0.00019438478634094638,
"loss": 0.1404,
"step": 230
},
{
"epoch": 0.41325871717606544,
"grad_norm": 0.3646264672279358,
"learning_rate": 0.00019334906168087698,
"loss": 0.1371,
"step": 240
},
{
"epoch": 0.41325871717606544,
"eval_loss": 0.12897279858589172,
"eval_runtime": 53.3855,
"eval_samples_per_second": 4.589,
"eval_steps_per_second": 4.589,
"step": 240
},
{
"epoch": 0.4304778303917348,
"grad_norm": 0.24957729876041412,
"learning_rate": 0.00019222897549773848,
"loss": 0.1489,
"step": 250
},
{
"epoch": 0.4476969436074042,
"grad_norm": 0.17658768594264984,
"learning_rate": 0.00019102554003712466,
"loss": 0.1221,
"step": 260
},
{
"epoch": 0.46491605682307363,
"grad_norm": 0.26017439365386963,
"learning_rate": 0.00018973984286913584,
"loss": 0.1364,
"step": 270
},
{
"epoch": 0.46491605682307363,
"eval_loss": 0.12664556503295898,
"eval_runtime": 53.4534,
"eval_samples_per_second": 4.583,
"eval_steps_per_second": 4.583,
"step": 270
},
{
"epoch": 0.482135170038743,
"grad_norm": 0.13661746680736542,
"learning_rate": 0.0001883730459055186,
"loss": 0.128,
"step": 280
},
{
"epoch": 0.4993542832544124,
"grad_norm": 0.20260806381702423,
"learning_rate": 0.00018692638434962143,
"loss": 0.1341,
"step": 290
},
{
"epoch": 0.5165733964700818,
"grad_norm": 0.14534738659858704,
"learning_rate": 0.0001854011655801157,
"loss": 0.1393,
"step": 300
},
{
"epoch": 0.5165733964700818,
"eval_loss": 0.12330517172813416,
"eval_runtime": 53.8347,
"eval_samples_per_second": 4.551,
"eval_steps_per_second": 4.551,
"step": 300
},
{
"epoch": 0.5337925096857512,
"grad_norm": 0.15992437303066254,
"learning_rate": 0.0001837987679694894,
"loss": 0.1344,
"step": 310
},
{
"epoch": 0.5510116229014206,
"grad_norm": 0.17435793578624725,
"learning_rate": 0.0001821206396383831,
"loss": 0.1326,
"step": 320
},
{
"epoch": 0.56823073611709,
"grad_norm": 0.12753354012966156,
"learning_rate": 0.00018036829714689252,
"loss": 0.1265,
"step": 330
},
{
"epoch": 0.56823073611709,
"eval_loss": 0.12111596018075943,
"eval_runtime": 53.8413,
"eval_samples_per_second": 4.55,
"eval_steps_per_second": 4.55,
"step": 330
},
{
"epoch": 0.5854498493327593,
"grad_norm": 0.17733751237392426,
"learning_rate": 0.00017854332412402108,
"loss": 0.1321,
"step": 340
},
{
"epoch": 0.6026689625484287,
"grad_norm": 0.18490831553936005,
"learning_rate": 0.00017664736983652088,
"loss": 0.1336,
"step": 350
},
{
"epoch": 0.6198880757640981,
"grad_norm": 0.16691215336322784,
"learning_rate": 0.0001746821476984154,
"loss": 0.1286,
"step": 360
},
{
"epoch": 0.6198880757640981,
"eval_loss": 0.1179625391960144,
"eval_runtime": 53.9361,
"eval_samples_per_second": 4.542,
"eval_steps_per_second": 4.542,
"step": 360
},
{
"epoch": 0.6371071889797676,
"grad_norm": 0.1389610469341278,
"learning_rate": 0.000172649433722551,
"loss": 0.123,
"step": 370
},
{
"epoch": 0.654326302195437,
"grad_norm": 0.12287218123674393,
"learning_rate": 0.00017055106491557645,
"loss": 0.1264,
"step": 380
},
{
"epoch": 0.6715454154111064,
"grad_norm": 0.13567915558815002,
"learning_rate": 0.00016838893761780127,
"loss": 0.1258,
"step": 390
},
{
"epoch": 0.6715454154111064,
"eval_loss": 0.11948917806148529,
"eval_runtime": 53.9271,
"eval_samples_per_second": 4.543,
"eval_steps_per_second": 4.543,
"step": 390
},
{
"epoch": 0.6887645286267757,
"grad_norm": 0.1640617996454239,
"learning_rate": 0.00016616500578943273,
"loss": 0.1233,
"step": 400
},
{
"epoch": 0.7059836418424451,
"grad_norm": 0.10329368710517883,
"learning_rate": 0.0001638812792447406,
"loss": 0.1275,
"step": 410
},
{
"epoch": 0.7232027550581145,
"grad_norm": 0.11142633855342865,
"learning_rate": 0.0001615398218357457,
"loss": 0.132,
"step": 420
},
{
"epoch": 0.7232027550581145,
"eval_loss": 0.11622682213783264,
"eval_runtime": 53.961,
"eval_samples_per_second": 4.54,
"eval_steps_per_second": 4.54,
"step": 420
},
{
"epoch": 0.7404218682737839,
"grad_norm": 0.12253064662218094,
"learning_rate": 0.0001591427495870729,
"loss": 0.1272,
"step": 430
},
{
"epoch": 0.7576409814894532,
"grad_norm": 0.14540846645832062,
"learning_rate": 0.00015669222878365486,
"loss": 0.1317,
"step": 440
},
{
"epoch": 0.7748600947051227,
"grad_norm": 0.10412976145744324,
"learning_rate": 0.00015419047401301472,
"loss": 0.1317,
"step": 450
},
{
"epoch": 0.7748600947051227,
"eval_loss": 0.11769552528858185,
"eval_runtime": 53.9569,
"eval_samples_per_second": 4.541,
"eval_steps_per_second": 4.541,
"step": 450
},
{
"epoch": 0.7920792079207921,
"grad_norm": 0.12871715426445007,
"learning_rate": 0.0001516397461638962,
"loss": 0.129,
"step": 460
},
{
"epoch": 0.8092983211364615,
"grad_norm": 0.21784746646881104,
"learning_rate": 0.00014904235038305083,
"loss": 0.1233,
"step": 470
},
{
"epoch": 0.8265174343521309,
"grad_norm": 0.4632514417171478,
"learning_rate": 0.0001464006339920278,
"loss": 0.1363,
"step": 480
},
{
"epoch": 0.8265174343521309,
"eval_loss": 0.11895836144685745,
"eval_runtime": 54.1763,
"eval_samples_per_second": 4.522,
"eval_steps_per_second": 4.522,
"step": 480
},
{
"epoch": 0.8437365475678003,
"grad_norm": 0.15127432346343994,
"learning_rate": 0.00014371698436585004,
"loss": 0.1265,
"step": 490
},
{
"epoch": 0.8609556607834696,
"grad_norm": 0.17194333672523499,
"learning_rate": 0.0001409938267754926,
"loss": 0.13,
"step": 500
},
{
"epoch": 0.878174773999139,
"grad_norm": 0.15506021678447723,
"learning_rate": 0.0001382336221961141,
"loss": 0.1274,
"step": 510
},
{
"epoch": 0.878174773999139,
"eval_loss": 0.11823900789022446,
"eval_runtime": 53.9107,
"eval_samples_per_second": 4.545,
"eval_steps_per_second": 4.545,
"step": 510
},
{
"epoch": 0.8953938872148084,
"grad_norm": 0.13229945302009583,
"learning_rate": 0.00013543886508302148,
"loss": 0.1295,
"step": 520
},
{
"epoch": 0.9126130004304779,
"grad_norm": 0.19207294285297394,
"learning_rate": 0.00013261208111737765,
"loss": 0.1156,
"step": 530
},
{
"epoch": 0.9298321136461473,
"grad_norm": 0.20044037699699402,
"learning_rate": 0.00012975582492369016,
"loss": 0.1198,
"step": 540
},
{
"epoch": 0.9298321136461473,
"eval_loss": 0.11498646438121796,
"eval_runtime": 53.9988,
"eval_samples_per_second": 4.537,
"eval_steps_per_second": 4.537,
"step": 540
},
{
"epoch": 0.9470512268618166,
"grad_norm": 0.2503241002559662,
"learning_rate": 0.00012687267776114304,
"loss": 0.1325,
"step": 550
},
{
"epoch": 0.964270340077486,
"grad_norm": 0.1304081231355667,
"learning_rate": 0.0001239652451908579,
"loss": 0.1305,
"step": 560
},
{
"epoch": 0.9814894532931554,
"grad_norm": 0.17837955057621002,
"learning_rate": 0.0001210361547211936,
"loss": 0.1105,
"step": 570
},
{
"epoch": 0.9814894532931554,
"eval_loss": 0.1158880740404129,
"eval_runtime": 53.9013,
"eval_samples_per_second": 4.545,
"eval_steps_per_second": 4.545,
"step": 570
},
{
"epoch": 0.9987085665088248,
"grad_norm": 0.14954382181167603,
"learning_rate": 0.000118088053433211,
"loss": 0.1275,
"step": 580
},
{
"epoch": 1.0154972018941024,
"grad_norm": 0.16672547161579132,
"learning_rate": 0.00011512360558844994,
"loss": 0.122,
"step": 590
},
{
"epoch": 1.0327163151097718,
"grad_norm": 0.22516606748104095,
"learning_rate": 0.00011214549022117967,
"loss": 0.1215,
"step": 600
},
{
"epoch": 1.0327163151097718,
"eval_loss": 0.11643720418214798,
"eval_runtime": 54.0392,
"eval_samples_per_second": 4.534,
"eval_steps_per_second": 4.534,
"step": 600
},
{
"epoch": 1.0499354283254412,
"grad_norm": 0.10763582587242126,
"learning_rate": 0.00010915639871729874,
"loss": 0.1092,
"step": 610
},
{
"epoch": 1.0671545415411106,
"grad_norm": 0.12457219511270523,
"learning_rate": 0.00010615903238207292,
"loss": 0.1262,
"step": 620
},
{
"epoch": 1.08437365475678,
"grad_norm": 0.10427327454090118,
"learning_rate": 0.00010315609999890798,
"loss": 0.1221,
"step": 630
},
{
"epoch": 1.08437365475678,
"eval_loss": 0.11431042104959488,
"eval_runtime": 53.9648,
"eval_samples_per_second": 4.54,
"eval_steps_per_second": 4.54,
"step": 630
},
{
"epoch": 1.1015927679724493,
"grad_norm": 0.10417324304580688,
"learning_rate": 0.00010015031538136518,
"loss": 0.1243,
"step": 640
},
{
"epoch": 1.118811881188119,
"grad_norm": 0.10996241122484207,
"learning_rate": 9.71443949206304e-05,
"loss": 0.123,
"step": 650
},
{
"epoch": 1.1360309944037883,
"grad_norm": 0.09512694925069809,
"learning_rate": 9.41410551306537e-05,
"loss": 0.1161,
"step": 660
},
{
"epoch": 1.1360309944037883,
"eval_loss": 0.11319658905267715,
"eval_runtime": 54.1752,
"eval_samples_per_second": 4.522,
"eval_steps_per_second": 4.522,
"step": 660
},
{
"epoch": 1.1532501076194577,
"grad_norm": 0.08619705587625504,
"learning_rate": 9.114301019317854e-05,
"loss": 0.1138,
"step": 670
},
{
"epoch": 1.170469220835127,
"grad_norm": 0.099159836769104,
"learning_rate": 8.815296950487804e-05,
"loss": 0.116,
"step": 680
},
{
"epoch": 1.1876883340507964,
"grad_norm": 0.09249532967805862,
"learning_rate": 8.517363522881579e-05,
"loss": 0.1209,
"step": 690
},
{
"epoch": 1.1876883340507964,
"eval_loss": 0.11321850121021271,
"eval_runtime": 54.1757,
"eval_samples_per_second": 4.522,
"eval_steps_per_second": 4.522,
"step": 690
},
{
"epoch": 1.2049074472664658,
"grad_norm": 0.10436110198497772,
"learning_rate": 8.220769985244425e-05,
"loss": 0.1245,
"step": 700
},
{
"epoch": 1.2221265604821352,
"grad_norm": 0.08504263311624527,
"learning_rate": 7.925784375434629e-05,
"loss": 0.1116,
"step": 710
},
{
"epoch": 1.2393456736978046,
"grad_norm": 0.10974706709384918,
"learning_rate": 7.63267327819209e-05,
"loss": 0.1213,
"step": 720
},
{
"epoch": 1.2393456736978046,
"eval_loss": 0.11273372918367386,
"eval_runtime": 54.1807,
"eval_samples_per_second": 4.522,
"eval_steps_per_second": 4.522,
"step": 720
},
{
"epoch": 1.256564786913474,
"grad_norm": 0.0872415080666542,
"learning_rate": 7.341701584220006e-05,
"loss": 0.1164,
"step": 730
},
{
"epoch": 1.2737839001291433,
"grad_norm": 0.09694085270166397,
"learning_rate": 7.05313225079756e-05,
"loss": 0.1193,
"step": 740
},
{
"epoch": 1.2910030133448127,
"grad_norm": 0.09382116794586182,
"learning_rate": 6.767226064139841e-05,
"loss": 0.1212,
"step": 750
},
{
"epoch": 1.2910030133448127,
"eval_loss": 0.11223271489143372,
"eval_runtime": 53.9413,
"eval_samples_per_second": 4.542,
"eval_steps_per_second": 4.542,
"step": 750
},
{
"epoch": 1.308222126560482,
"grad_norm": 0.09150267392396927,
"learning_rate": 6.484241403719842e-05,
"loss": 0.1204,
"step": 760
},
{
"epoch": 1.3254412397761515,
"grad_norm": 0.10270262509584427,
"learning_rate": 6.204434008765458e-05,
"loss": 0.119,
"step": 770
},
{
"epoch": 1.3426603529918208,
"grad_norm": 0.10682746022939682,
"learning_rate": 5.9280567471425077e-05,
"loss": 0.1158,
"step": 780
},
{
"epoch": 1.3426603529918208,
"eval_loss": 0.11181723326444626,
"eval_runtime": 53.9263,
"eval_samples_per_second": 4.543,
"eval_steps_per_second": 4.543,
"step": 780
},
{
"epoch": 1.3598794662074902,
"grad_norm": 0.0856538936495781,
"learning_rate": 5.655359386832728e-05,
"loss": 0.1214,
"step": 790
},
{
"epoch": 1.3770985794231598,
"grad_norm": 0.08333317190408707,
"learning_rate": 5.386588370213124e-05,
"loss": 0.123,
"step": 800
},
{
"epoch": 1.394317692638829,
"grad_norm": 0.08669654279947281,
"learning_rate": 5.121986591340808e-05,
"loss": 0.1177,
"step": 810
},
{
"epoch": 1.394317692638829,
"eval_loss": 0.11222003400325775,
"eval_runtime": 53.9921,
"eval_samples_per_second": 4.538,
"eval_steps_per_second": 4.538,
"step": 810
},
{
"epoch": 1.4115368058544986,
"grad_norm": 0.09735915809869766,
"learning_rate": 4.861793176444479e-05,
"loss": 0.1155,
"step": 820
},
{
"epoch": 1.428755919070168,
"grad_norm": 0.10614746063947678,
"learning_rate": 4.6062432678209986e-05,
"loss": 0.1126,
"step": 830
},
{
"epoch": 1.4459750322858373,
"grad_norm": 0.09488119930028915,
"learning_rate": 4.355567811332311e-05,
"loss": 0.1153,
"step": 840
},
{
"epoch": 1.4459750322858373,
"eval_loss": 0.11169660836458206,
"eval_runtime": 54.1675,
"eval_samples_per_second": 4.523,
"eval_steps_per_second": 4.523,
"step": 840
},
{
"epoch": 1.4631941455015067,
"grad_norm": 0.10649088025093079,
"learning_rate": 4.109993347694781e-05,
"loss": 0.1289,
"step": 850
},
{
"epoch": 1.480413258717176,
"grad_norm": 0.08584726601839066,
"learning_rate": 3.8697418077495575e-05,
"loss": 0.1193,
"step": 860
},
{
"epoch": 1.4976323719328455,
"grad_norm": 0.08480769395828247,
"learning_rate": 3.635030311898975e-05,
"loss": 0.1167,
"step": 870
},
{
"epoch": 1.4976323719328455,
"eval_loss": 0.1113625019788742,
"eval_runtime": 54.0863,
"eval_samples_per_second": 4.53,
"eval_steps_per_second": 4.53,
"step": 870
},
{
"epoch": 1.5148514851485149,
"grad_norm": 0.09291431307792664,
"learning_rate": 3.4060709738902485e-05,
"loss": 0.1223,
"step": 880
},
{
"epoch": 1.5320705983641842,
"grad_norm": 0.06462642550468445,
"learning_rate": 3.183070709123781e-05,
"loss": 0.1113,
"step": 890
},
{
"epoch": 1.5492897115798536,
"grad_norm": 0.08776138722896576,
"learning_rate": 2.9662310476593492e-05,
"loss": 0.1159,
"step": 900
},
{
"epoch": 1.5492897115798536,
"eval_loss": 0.11135545372962952,
"eval_runtime": 54.2115,
"eval_samples_per_second": 4.519,
"eval_steps_per_second": 4.519,
"step": 900
},
{
"epoch": 1.566508824795523,
"grad_norm": 0.08839749544858932,
"learning_rate": 2.7557479520891104e-05,
"loss": 0.1226,
"step": 910
},
{
"epoch": 1.5837279380111924,
"grad_norm": 0.1001950353384018,
"learning_rate": 2.551811640442081e-05,
"loss": 0.1156,
"step": 920
},
{
"epoch": 1.6009470512268618,
"grad_norm": 0.08094879239797592,
"learning_rate": 2.354606414280045e-05,
"loss": 0.1178,
"step": 930
},
{
"epoch": 1.6009470512268618,
"eval_loss": 0.11072849482297897,
"eval_runtime": 54.2409,
"eval_samples_per_second": 4.517,
"eval_steps_per_second": 4.517,
"step": 930
},
{
"epoch": 1.6181661644425311,
"grad_norm": 0.0837637260556221,
"learning_rate": 2.1643104921403657e-05,
"loss": 0.1163,
"step": 940
},
{
"epoch": 1.6353852776582007,
"grad_norm": 0.08061401546001434,
"learning_rate": 1.98109584847609e-05,
"loss": 0.1215,
"step": 950
},
{
"epoch": 1.65260439087387,
"grad_norm": 0.08909575641155243,
"learning_rate": 1.805128058239014e-05,
"loss": 0.1214,
"step": 960
},
{
"epoch": 1.65260439087387,
"eval_loss": 0.11049379408359528,
"eval_runtime": 54.0663,
"eval_samples_per_second": 4.531,
"eval_steps_per_second": 4.531,
"step": 960
},
{
"epoch": 1.6698235040895395,
"grad_norm": 0.0853874459862709,
"learning_rate": 1.6365661472460946e-05,
"loss": 0.1223,
"step": 970
},
{
"epoch": 1.6870426173052087,
"grad_norm": 0.07660456001758575,
"learning_rate": 1.475562448464437e-05,
"loss": 0.1249,
"step": 980
},
{
"epoch": 1.7042617305208783,
"grad_norm": 0.08277535438537598,
"learning_rate": 1.3222624643447879e-05,
"loss": 0.1139,
"step": 990
},
{
"epoch": 1.7042617305208783,
"eval_loss": 0.11061510443687439,
"eval_runtime": 54.1562,
"eval_samples_per_second": 4.524,
"eval_steps_per_second": 4.524,
"step": 990
},
{
"epoch": 1.7214808437365474,
"grad_norm": 0.08193802833557129,
"learning_rate": 1.1768047353278721e-05,
"loss": 0.1163,
"step": 1000
},
{
"epoch": 1.738699956952217,
"grad_norm": 0.09207924455404282,
"learning_rate": 1.0393207146424766e-05,
"loss": 0.1171,
"step": 1010
},
{
"epoch": 1.7559190701678864,
"grad_norm": 0.07924116402864456,
"learning_rate": 9.09934649508375e-06,
"loss": 0.1215,
"step": 1020
},
{
"epoch": 1.7559190701678864,
"eval_loss": 0.1104968935251236,
"eval_runtime": 54.2108,
"eval_samples_per_second": 4.519,
"eval_steps_per_second": 4.519,
"step": 1020
},
{
"epoch": 1.7731381833835558,
"grad_norm": 0.0777198001742363,
"learning_rate": 7.887634688515e-06,
"loss": 0.1204,
"step": 1030
},
{
"epoch": 1.7903572965992252,
"grad_norm": 0.09511927515268326,
"learning_rate": 6.759166776327786e-06,
"loss": 0.1125,
"step": 1040
},
{
"epoch": 1.8075764098148945,
"grad_norm": 0.08975937217473984,
"learning_rate": 5.71496257886196e-06,
"loss": 0.1143,
"step": 1050
},
{
"epoch": 1.8075764098148945,
"eval_loss": 0.11050143837928772,
"eval_runtime": 54.1535,
"eval_samples_per_second": 4.524,
"eval_steps_per_second": 4.524,
"step": 1050
},
{
"epoch": 1.824795523030564,
"grad_norm": 0.08812955766916275,
"learning_rate": 4.755965765554637e-06,
"loss": 0.112,
"step": 1060
},
{
"epoch": 1.8420146362462333,
"grad_norm": 0.08198709785938263,
"learning_rate": 3.883043002126219e-06,
"loss": 0.1149,
"step": 1070
},
{
"epoch": 1.8592337494619027,
"grad_norm": 0.0873785987496376,
"learning_rate": 3.0969831673562042e-06,
"loss": 0.1142,
"step": 1080
},
{
"epoch": 1.8592337494619027,
"eval_loss": 0.11041069775819778,
"eval_runtime": 54.1647,
"eval_samples_per_second": 4.523,
"eval_steps_per_second": 4.523,
"step": 1080
},
{
"epoch": 1.876452862677572,
"grad_norm": 0.08640486001968384,
"learning_rate": 2.3984966401567e-06,
"loss": 0.1154,
"step": 1090
},
{
"epoch": 1.8936719758932417,
"grad_norm": 0.09290562570095062,
"learning_rate": 1.7882146575880166e-06,
"loss": 0.1163,
"step": 1100
},
{
"epoch": 1.9108910891089108,
"grad_norm": 0.0945608988404274,
"learning_rate": 1.266688744396327e-06,
"loss": 0.1162,
"step": 1110
},
{
"epoch": 1.9108910891089108,
"eval_loss": 0.11023778468370438,
"eval_runtime": 54.2854,
"eval_samples_per_second": 4.513,
"eval_steps_per_second": 4.513,
"step": 1110
},
{
"epoch": 1.9281102023245804,
"grad_norm": 0.09848543256521225,
"learning_rate": 8.343902145891646e-07,
"loss": 0.1112,
"step": 1120
},
{
"epoch": 1.9453293155402496,
"grad_norm": 0.09019118547439575,
"learning_rate": 4.917097454988584e-07,
"loss": 0.1156,
"step": 1130
},
{
"epoch": 1.9625484287559192,
"grad_norm": 0.09469816088676453,
"learning_rate": 2.389570247192552e-07,
"loss": 0.1218,
"step": 1140
},
{
"epoch": 1.9625484287559192,
"eval_loss": 0.11023097485303879,
"eval_runtime": 54.1686,
"eval_samples_per_second": 4.523,
"eval_steps_per_second": 4.523,
"step": 1140
},
{
"epoch": 1.9797675419715883,
"grad_norm": 0.08748683333396912,
"learning_rate": 7.636047023441561e-08,
"loss": 0.1202,
"step": 1150
},
{
"epoch": 1.996986655187258,
"grad_norm": 0.09518461674451828,
"learning_rate": 4.0670239925155725e-09,
"loss": 0.1136,
"step": 1160
}
],
"logging_steps": 10,
"max_steps": 1162,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6568373351982694e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}