{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 50,
"global_step": 1350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007410151908114116,
"grad_norm": 4.45390510559082,
"learning_rate": 1.3333333333333334e-07,
"loss": 1.2745,
"step": 10
},
{
"epoch": 0.014820303816228233,
"grad_norm": 4.346654415130615,
"learning_rate": 2.814814814814815e-07,
"loss": 1.2289,
"step": 20
},
{
"epoch": 0.02223045572434235,
"grad_norm": 3.9740023612976074,
"learning_rate": 4.296296296296296e-07,
"loss": 1.2757,
"step": 30
},
{
"epoch": 0.029640607632456465,
"grad_norm": 3.878234386444092,
"learning_rate": 5.777777777777777e-07,
"loss": 1.2404,
"step": 40
},
{
"epoch": 0.037050759540570584,
"grad_norm": 4.405137538909912,
"learning_rate": 7.259259259259259e-07,
"loss": 1.2627,
"step": 50
},
{
"epoch": 0.037050759540570584,
"eval_loss": 1.2372760772705078,
"eval_runtime": 208.6218,
"eval_samples_per_second": 2.723,
"eval_steps_per_second": 1.361,
"step": 50
},
{
"epoch": 0.0444609114486847,
"grad_norm": 3.6006131172180176,
"learning_rate": 8.740740740740741e-07,
"loss": 1.2097,
"step": 60
},
{
"epoch": 0.051871063356798815,
"grad_norm": 3.857161045074463,
"learning_rate": 1.0222222222222221e-06,
"loss": 1.0813,
"step": 70
},
{
"epoch": 0.05928121526491293,
"grad_norm": 2.1402502059936523,
"learning_rate": 1.1703703703703702e-06,
"loss": 1.0065,
"step": 80
},
{
"epoch": 0.06669136717302705,
"grad_norm": 2.2522549629211426,
"learning_rate": 1.3185185185185184e-06,
"loss": 0.885,
"step": 90
},
{
"epoch": 0.07410151908114117,
"grad_norm": 1.8222051858901978,
"learning_rate": 1.4666666666666665e-06,
"loss": 0.8144,
"step": 100
},
{
"epoch": 0.07410151908114117,
"eval_loss": 0.7972270846366882,
"eval_runtime": 208.2662,
"eval_samples_per_second": 2.727,
"eval_steps_per_second": 1.364,
"step": 100
},
{
"epoch": 0.08151167098925528,
"grad_norm": 1.2294323444366455,
"learning_rate": 1.614814814814815e-06,
"loss": 0.763,
"step": 110
},
{
"epoch": 0.0889218228973694,
"grad_norm": 1.1122323274612427,
"learning_rate": 1.762962962962963e-06,
"loss": 0.7137,
"step": 120
},
{
"epoch": 0.09633197480548351,
"grad_norm": 0.8735978007316589,
"learning_rate": 1.9111111111111112e-06,
"loss": 0.6903,
"step": 130
},
{
"epoch": 0.10374212671359763,
"grad_norm": 0.7982610464096069,
"learning_rate": 1.9999465148392903e-06,
"loss": 0.6381,
"step": 140
},
{
"epoch": 0.11115227862171174,
"grad_norm": 1.082419753074646,
"learning_rate": 1.999344872485215e-06,
"loss": 0.6285,
"step": 150
},
{
"epoch": 0.11115227862171174,
"eval_loss": 0.6020215749740601,
"eval_runtime": 208.1618,
"eval_samples_per_second": 2.729,
"eval_steps_per_second": 1.364,
"step": 150
},
{
"epoch": 0.11856243052982586,
"grad_norm": 0.9158598780632019,
"learning_rate": 1.9980751348850217e-06,
"loss": 0.5996,
"step": 160
},
{
"epoch": 0.12597258243794,
"grad_norm": 0.8013431429862976,
"learning_rate": 1.996138150900478e-06,
"loss": 0.5625,
"step": 170
},
{
"epoch": 0.1333827343460541,
"grad_norm": 0.6254628300666809,
"learning_rate": 1.9935352154697255e-06,
"loss": 0.5658,
"step": 180
},
{
"epoch": 0.14079288625416822,
"grad_norm": 0.7743931412696838,
"learning_rate": 1.99026806874157e-06,
"loss": 0.5308,
"step": 190
},
{
"epoch": 0.14820303816228234,
"grad_norm": 0.8001664280891418,
"learning_rate": 1.986338894912137e-06,
"loss": 0.4969,
"step": 200
},
{
"epoch": 0.14820303816228234,
"eval_loss": 0.5175904035568237,
"eval_runtime": 208.2728,
"eval_samples_per_second": 2.727,
"eval_steps_per_second": 1.364,
"step": 200
},
{
"epoch": 0.15561319007039645,
"grad_norm": 0.7069100141525269,
"learning_rate": 1.9817503207646603e-06,
"loss": 0.4996,
"step": 210
},
{
"epoch": 0.16302334197851057,
"grad_norm": 1.052976131439209,
"learning_rate": 1.9765054139133926e-06,
"loss": 0.4658,
"step": 220
},
{
"epoch": 0.17043349388662468,
"grad_norm": 0.739701509475708,
"learning_rate": 1.970607680752804e-06,
"loss": 0.4852,
"step": 230
},
{
"epoch": 0.1778436457947388,
"grad_norm": 1.0828092098236084,
"learning_rate": 1.9640610641134382e-06,
"loss": 0.4793,
"step": 240
},
{
"epoch": 0.1852537977028529,
"grad_norm": 0.8444927930831909,
"learning_rate": 1.956869940626001e-06,
"loss": 0.4198,
"step": 250
},
{
"epoch": 0.1852537977028529,
"eval_loss": 0.45384249091148376,
"eval_runtime": 208.3355,
"eval_samples_per_second": 2.726,
"eval_steps_per_second": 1.363,
"step": 250
},
{
"epoch": 0.19266394961096703,
"grad_norm": 0.9700266122817993,
"learning_rate": 1.9490391177954383e-06,
"loss": 0.4512,
"step": 260
},
{
"epoch": 0.20007410151908114,
"grad_norm": 0.8187345862388611,
"learning_rate": 1.940573830786956e-06,
"loss": 0.419,
"step": 270
},
{
"epoch": 0.20748425342719526,
"grad_norm": 0.7705732583999634,
"learning_rate": 1.9314797389261425e-06,
"loss": 0.4329,
"step": 280
},
{
"epoch": 0.21489440533530937,
"grad_norm": 0.8967396020889282,
"learning_rate": 1.921762921915517e-06,
"loss": 0.4267,
"step": 290
},
{
"epoch": 0.2223045572434235,
"grad_norm": 0.8268992304801941,
"learning_rate": 1.911429875770051e-06,
"loss": 0.3825,
"step": 300
},
{
"epoch": 0.2223045572434235,
"eval_loss": 0.3979480266571045,
"eval_runtime": 208.4142,
"eval_samples_per_second": 2.725,
"eval_steps_per_second": 1.363,
"step": 300
},
{
"epoch": 0.2297147091515376,
"grad_norm": 0.8976253867149353,
"learning_rate": 1.9004875084743622e-06,
"loss": 0.4011,
"step": 310
},
{
"epoch": 0.23712486105965172,
"grad_norm": 1.0648952722549438,
"learning_rate": 1.8889431353645002e-06,
"loss": 0.3914,
"step": 320
},
{
"epoch": 0.24453501296776584,
"grad_norm": 1.5843675136566162,
"learning_rate": 1.8768044742374006e-06,
"loss": 0.392,
"step": 330
},
{
"epoch": 0.25194516487588,
"grad_norm": 1.2248514890670776,
"learning_rate": 1.8640796401912805e-06,
"loss": 0.3595,
"step": 340
},
{
"epoch": 0.2593553167839941,
"grad_norm": 1.2559500932693481,
"learning_rate": 1.8507771402004266e-06,
"loss": 0.355,
"step": 350
},
{
"epoch": 0.2593553167839941,
"eval_loss": 0.34925511479377747,
"eval_runtime": 208.5081,
"eval_samples_per_second": 2.724,
"eval_steps_per_second": 1.362,
"step": 350
},
{
"epoch": 0.2667654686921082,
"grad_norm": 1.2216342687606812,
"learning_rate": 1.8369058674280002e-06,
"loss": 0.315,
"step": 360
},
{
"epoch": 0.2741756206002223,
"grad_norm": 1.0528122186660767,
"learning_rate": 1.8224750952806621e-06,
"loss": 0.3279,
"step": 370
},
{
"epoch": 0.28158577250833644,
"grad_norm": 0.948635995388031,
"learning_rate": 1.8074944712089923e-06,
"loss": 0.3288,
"step": 380
},
{
"epoch": 0.28899592441645056,
"grad_norm": 1.1430398225784302,
"learning_rate": 1.791974010257848e-06,
"loss": 0.3157,
"step": 390
},
{
"epoch": 0.29640607632456467,
"grad_norm": 1.2426784038543701,
"learning_rate": 1.7759240883709743e-06,
"loss": 0.2976,
"step": 400
},
{
"epoch": 0.29640607632456467,
"eval_loss": 0.3070007562637329,
"eval_runtime": 208.3886,
"eval_samples_per_second": 2.726,
"eval_steps_per_second": 1.363,
"step": 400
},
{
"epoch": 0.3038162282326788,
"grad_norm": 1.1574546098709106,
"learning_rate": 1.7593554354543415e-06,
"loss": 0.3031,
"step": 410
},
{
"epoch": 0.3112263801407929,
"grad_norm": 1.3627562522888184,
"learning_rate": 1.7422791282028455e-06,
"loss": 0.2916,
"step": 420
},
{
"epoch": 0.318636532048907,
"grad_norm": 1.4292229413986206,
"learning_rate": 1.7247065826951692e-06,
"loss": 0.2731,
"step": 430
},
{
"epoch": 0.32604668395702113,
"grad_norm": 1.2023752927780151,
"learning_rate": 1.706649546761755e-06,
"loss": 0.2734,
"step": 440
},
{
"epoch": 0.33345683586513525,
"grad_norm": 1.4670097827911377,
"learning_rate": 1.6881200921309913e-06,
"loss": 0.2547,
"step": 450
},
{
"epoch": 0.33345683586513525,
"eval_loss": 0.2757605016231537,
"eval_runtime": 208.5349,
"eval_samples_per_second": 2.724,
"eval_steps_per_second": 1.362,
"step": 450
},
{
"epoch": 0.34086698777324936,
"grad_norm": 1.222509503364563,
"learning_rate": 1.669130606358858e-06,
"loss": 0.2549,
"step": 460
},
{
"epoch": 0.3482771396813635,
"grad_norm": 1.0831719636917114,
"learning_rate": 1.6496937845474371e-06,
"loss": 0.2878,
"step": 470
},
{
"epoch": 0.3556872915894776,
"grad_norm": 1.6136417388916016,
"learning_rate": 1.6298226208578124e-06,
"loss": 0.2495,
"step": 480
},
{
"epoch": 0.3630974434975917,
"grad_norm": 1.2490746974945068,
"learning_rate": 1.6095303998230431e-06,
"loss": 0.2541,
"step": 490
},
{
"epoch": 0.3705075954057058,
"grad_norm": 1.3669530153274536,
"learning_rate": 1.5888306874670112e-06,
"loss": 0.2537,
"step": 500
},
{
"epoch": 0.3705075954057058,
"eval_loss": 0.2523694634437561,
"eval_runtime": 208.3135,
"eval_samples_per_second": 2.727,
"eval_steps_per_second": 1.363,
"step": 500
},
{
"epoch": 0.37791774731381994,
"grad_norm": 1.2288622856140137,
"learning_rate": 1.567737322235084e-06,
"loss": 0.2494,
"step": 510
},
{
"epoch": 0.38532789922193406,
"grad_norm": 1.4230667352676392,
"learning_rate": 1.546264405742654e-06,
"loss": 0.2437,
"step": 520
},
{
"epoch": 0.39273805113004817,
"grad_norm": 2.109126091003418,
"learning_rate": 1.5244262933477398e-06,
"loss": 0.2418,
"step": 530
},
{
"epoch": 0.4001482030381623,
"grad_norm": 1.7550323009490967,
"learning_rate": 1.5022375845539534e-06,
"loss": 0.2358,
"step": 540
},
{
"epoch": 0.4075583549462764,
"grad_norm": 1.5699337720870972,
"learning_rate": 1.4797131132502464e-06,
"loss": 0.2403,
"step": 550
},
{
"epoch": 0.4075583549462764,
"eval_loss": 0.23371076583862305,
"eval_runtime": 209.4346,
"eval_samples_per_second": 2.712,
"eval_steps_per_second": 1.356,
"step": 550
},
{
"epoch": 0.4149685068543905,
"grad_norm": 1.7377654314041138,
"learning_rate": 1.4568679377939617e-06,
"loss": 0.2189,
"step": 560
},
{
"epoch": 0.42237865876250463,
"grad_norm": 1.5883984565734863,
"learning_rate": 1.4337173309438233e-06,
"loss": 0.2268,
"step": 570
},
{
"epoch": 0.42978881067061875,
"grad_norm": 1.6136311292648315,
"learning_rate": 1.4102767696495883e-06,
"loss": 0.2187,
"step": 580
},
{
"epoch": 0.43719896257873286,
"grad_norm": 1.569895625114441,
"learning_rate": 1.3865619247051915e-06,
"loss": 0.2132,
"step": 590
},
{
"epoch": 0.444609114486847,
"grad_norm": 1.6274341344833374,
"learning_rate": 1.3625886502723008e-06,
"loss": 0.2022,
"step": 600
},
{
"epoch": 0.444609114486847,
"eval_loss": 0.21966929733753204,
"eval_runtime": 209.6627,
"eval_samples_per_second": 2.709,
"eval_steps_per_second": 1.355,
"step": 600
},
{
"epoch": 0.4520192663949611,
"grad_norm": 1.511763095855713,
"learning_rate": 1.338372973281281e-06,
"loss": 0.2026,
"step": 610
},
{
"epoch": 0.4594294183030752,
"grad_norm": 1.2806991338729858,
"learning_rate": 1.3139310827166612e-06,
"loss": 0.217,
"step": 620
},
{
"epoch": 0.4668395702111893,
"grad_norm": 1.5671716928482056,
"learning_rate": 1.2892793187942586e-06,
"loss": 0.2002,
"step": 630
},
{
"epoch": 0.47424972211930344,
"grad_norm": 1.7969197034835815,
"learning_rate": 1.2644341620372023e-06,
"loss": 0.2004,
"step": 640
},
{
"epoch": 0.48165987402741756,
"grad_norm": 1.4059193134307861,
"learning_rate": 1.2394122222581555e-06,
"loss": 0.2023,
"step": 650
},
{
"epoch": 0.48165987402741756,
"eval_loss": 0.2078457921743393,
"eval_runtime": 209.6676,
"eval_samples_per_second": 2.709,
"eval_steps_per_second": 1.355,
"step": 650
},
{
"epoch": 0.48907002593553167,
"grad_norm": 1.4455323219299316,
"learning_rate": 1.214230227455106e-06,
"loss": 0.2059,
"step": 660
},
{
"epoch": 0.4964801778436458,
"grad_norm": 1.5389137268066406,
"learning_rate": 1.1889050126281403e-06,
"loss": 0.2017,
"step": 670
},
{
"epoch": 0.50389032975176,
"grad_norm": 1.3474076986312866,
"learning_rate": 1.1634535085246902e-06,
"loss": 0.1841,
"step": 680
},
{
"epoch": 0.5113004816598741,
"grad_norm": 1.399523138999939,
"learning_rate": 1.1378927303207636e-06,
"loss": 0.2018,
"step": 690
},
{
"epoch": 0.5187106335679882,
"grad_norm": 1.4829585552215576,
"learning_rate": 1.112239766245735e-06,
"loss": 0.1985,
"step": 700
},
{
"epoch": 0.5187106335679882,
"eval_loss": 0.1990778148174286,
"eval_runtime": 209.7592,
"eval_samples_per_second": 2.708,
"eval_steps_per_second": 1.354,
"step": 700
},
{
"epoch": 0.5261207854761023,
"grad_norm": 1.460558295249939,
"learning_rate": 1.0865117661582956e-06,
"loss": 0.1789,
"step": 710
},
{
"epoch": 0.5335309373842164,
"grad_norm": 1.1899714469909668,
"learning_rate": 1.0607259300812045e-06,
"loss": 0.2072,
"step": 720
},
{
"epoch": 0.5409410892923305,
"grad_norm": 1.4120274782180786,
"learning_rate": 1.034899496702501e-06,
"loss": 0.1887,
"step": 730
},
{
"epoch": 0.5483512412004447,
"grad_norm": 1.3966922760009766,
"learning_rate": 1.0090497318508686e-06,
"loss": 0.1723,
"step": 740
},
{
"epoch": 0.5557613931085588,
"grad_norm": 1.6820755004882812,
"learning_rate": 9.831939169528563e-07,
"loss": 0.195,
"step": 750
},
{
"epoch": 0.5557613931085588,
"eval_loss": 0.19260452687740326,
"eval_runtime": 209.5514,
"eval_samples_per_second": 2.711,
"eval_steps_per_second": 1.355,
"step": 750
},
{
"epoch": 0.5631715450166729,
"grad_norm": 1.6537615060806274,
"learning_rate": 9.57349337479669e-07,
"loss": 0.1922,
"step": 760
},
{
"epoch": 0.570581696924787,
"grad_norm": 1.4756197929382324,
"learning_rate": 9.315332713912591e-07,
"loss": 0.1771,
"step": 770
},
{
"epoch": 0.5779918488329011,
"grad_norm": 1.5430341958999634,
"learning_rate": 9.057629775854314e-07,
"loss": 0.1693,
"step": 780
},
{
"epoch": 0.5854020007410152,
"grad_norm": 1.7049578428268433,
"learning_rate": 8.800556843597001e-07,
"loss": 0.1875,
"step": 790
},
{
"epoch": 0.5928121526491293,
"grad_norm": 1.6176693439483643,
"learning_rate": 8.544285778936002e-07,
"loss": 0.1737,
"step": 800
},
{
"epoch": 0.5928121526491293,
"eval_loss": 0.18725676834583282,
"eval_runtime": 209.6187,
"eval_samples_per_second": 2.71,
"eval_steps_per_second": 1.355,
"step": 800
},
{
"epoch": 0.6002223045572435,
"grad_norm": 1.9868125915527344,
"learning_rate": 8.288987907591518e-07,
"loss": 0.1754,
"step": 810
},
{
"epoch": 0.6076324564653576,
"grad_norm": 1.5091502666473389,
"learning_rate": 8.034833904671697e-07,
"loss": 0.1815,
"step": 820
},
{
"epoch": 0.6150426083734717,
"grad_norm": 2.074658155441284,
"learning_rate": 7.781993680570655e-07,
"loss": 0.1834,
"step": 830
},
{
"epoch": 0.6224527602815858,
"grad_norm": 1.2912707328796387,
"learning_rate": 7.530636267377706e-07,
"loss": 0.1816,
"step": 840
},
{
"epoch": 0.6298629121896999,
"grad_norm": 1.5129729509353638,
"learning_rate": 7.280929705873818e-07,
"loss": 0.1864,
"step": 850
},
{
"epoch": 0.6298629121896999,
"eval_loss": 0.18269772827625275,
"eval_runtime": 209.5398,
"eval_samples_per_second": 2.711,
"eval_steps_per_second": 1.355,
"step": 850
},
{
"epoch": 0.637273064097814,
"grad_norm": 1.635215163230896,
"learning_rate": 7.033040933190774e-07,
"loss": 0.1815,
"step": 860
},
{
"epoch": 0.6446832160059282,
"grad_norm": 1.7670304775238037,
"learning_rate": 6.787135671208126e-07,
"loss": 0.1873,
"step": 870
},
{
"epoch": 0.6520933679140423,
"grad_norm": 1.7472981214523315,
"learning_rate": 6.543378315762633e-07,
"loss": 0.19,
"step": 880
},
{
"epoch": 0.6595035198221564,
"grad_norm": 1.8691086769104004,
"learning_rate": 6.301931826744189e-07,
"loss": 0.1914,
"step": 890
},
{
"epoch": 0.6669136717302705,
"grad_norm": 1.5207786560058594,
"learning_rate": 6.062957619151703e-07,
"loss": 0.1805,
"step": 900
},
{
"epoch": 0.6669136717302705,
"eval_loss": 0.17914490401744843,
"eval_runtime": 209.5686,
"eval_samples_per_second": 2.71,
"eval_steps_per_second": 1.355,
"step": 900
},
{
"epoch": 0.6743238236383846,
"grad_norm": 1.5552278757095337,
"learning_rate": 5.826615455181821e-07,
"loss": 0.1887,
"step": 910
},
{
"epoch": 0.6817339755464987,
"grad_norm": 1.5720982551574707,
"learning_rate": 5.593063337422594e-07,
"loss": 0.1743,
"step": 920
},
{
"epoch": 0.6891441274546128,
"grad_norm": 1.781369924545288,
"learning_rate": 5.362457403223495e-07,
"loss": 0.1772,
"step": 930
},
{
"epoch": 0.696554279362727,
"grad_norm": 1.4669294357299805,
"learning_rate": 5.134951820312401e-07,
"loss": 0.1897,
"step": 940
},
{
"epoch": 0.7039644312708411,
"grad_norm": 1.097123384475708,
"learning_rate": 4.91069868372937e-07,
"loss": 0.1744,
"step": 950
},
{
"epoch": 0.7039644312708411,
"eval_loss": 0.1766466647386551,
"eval_runtime": 209.6411,
"eval_samples_per_second": 2.709,
"eval_steps_per_second": 1.355,
"step": 950
},
{
"epoch": 0.7113745831789552,
"grad_norm": 1.3973758220672607,
"learning_rate": 4.689847914146041e-07,
"loss": 0.1683,
"step": 960
},
{
"epoch": 0.7187847350870693,
"grad_norm": 1.736214280128479,
"learning_rate": 4.472547157638673e-07,
"loss": 0.1824,
"step": 970
},
{
"epoch": 0.7261948869951834,
"grad_norm": 1.3411122560501099,
"learning_rate": 4.258941686981864e-07,
"loss": 0.183,
"step": 980
},
{
"epoch": 0.7336050389032975,
"grad_norm": 1.4088919162750244,
"learning_rate": 4.0491743045288564e-07,
"loss": 0.1826,
"step": 990
},
{
"epoch": 0.7410151908114117,
"grad_norm": 1.4413363933563232,
"learning_rate": 3.843385246743417e-07,
"loss": 0.1853,
"step": 1000
},
{
"epoch": 0.7410151908114117,
"eval_loss": 0.17452707886695862,
"eval_runtime": 209.5833,
"eval_samples_per_second": 2.71,
"eval_steps_per_second": 1.355,
"step": 1000
},
{
"epoch": 0.7484253427195258,
"grad_norm": 1.4395395517349243,
"learning_rate": 3.6417120904471244e-07,
"loss": 0.1612,
"step": 1010
},
{
"epoch": 0.7558354946276399,
"grad_norm": 1.5045926570892334,
"learning_rate": 3.4442896608446647e-07,
"loss": 0.1717,
"step": 1020
},
{
"epoch": 0.763245646535754,
"grad_norm": 1.546872854232788,
"learning_rate": 3.2512499413887253e-07,
"loss": 0.1665,
"step": 1030
},
{
"epoch": 0.7706557984438681,
"grad_norm": 1.5870492458343506,
"learning_rate": 3.0627219855446664e-07,
"loss": 0.1653,
"step": 1040
},
{
"epoch": 0.7780659503519822,
"grad_norm": 1.694765329360962,
"learning_rate": 2.87883183051398e-07,
"loss": 0.1626,
"step": 1050
},
{
"epoch": 0.7780659503519822,
"eval_loss": 0.17290721833705902,
"eval_runtime": 209.5418,
"eval_samples_per_second": 2.711,
"eval_steps_per_second": 1.355,
"step": 1050
},
{
"epoch": 0.7854761022600963,
"grad_norm": 2.193631649017334,
"learning_rate": 2.699702412974254e-07,
"loss": 0.1637,
"step": 1060
},
{
"epoch": 0.7928862541682105,
"grad_norm": 1.4417670965194702,
"learning_rate": 2.525453486891908e-07,
"loss": 0.177,
"step": 1070
},
{
"epoch": 0.8002964060763246,
"grad_norm": 1.472188949584961,
"learning_rate": 2.356201543462678e-07,
"loss": 0.1444,
"step": 1080
},
{
"epoch": 0.8077065579844387,
"grad_norm": 1.3810269832611084,
"learning_rate": 2.192059733233408e-07,
"loss": 0.1771,
"step": 1090
},
{
"epoch": 0.8151167098925528,
"grad_norm": 1.764819622039795,
"learning_rate": 2.03313779045713e-07,
"loss": 0.1557,
"step": 1100
},
{
"epoch": 0.8151167098925528,
"eval_loss": 0.17193768918514252,
"eval_runtime": 209.8503,
"eval_samples_per_second": 2.707,
"eval_steps_per_second": 1.353,
"step": 1100
},
{
"epoch": 0.8225268618006669,
"grad_norm": 1.3674615621566772,
"learning_rate": 1.8795419597320717e-07,
"loss": 0.165,
"step": 1110
},
{
"epoch": 0.829937013708781,
"grad_norm": 1.6381874084472656,
"learning_rate": 1.7313749249736264e-07,
"loss": 0.1743,
"step": 1120
},
{
"epoch": 0.8373471656168952,
"grad_norm": 1.665426254272461,
"learning_rate": 1.5887357407667312e-07,
"loss": 0.1535,
"step": 1130
},
{
"epoch": 0.8447573175250093,
"grad_norm": 1.4278993606567383,
"learning_rate": 1.451719766144589e-07,
"loss": 0.1709,
"step": 1140
},
{
"epoch": 0.8521674694331234,
"grad_norm": 1.9010162353515625,
"learning_rate": 1.3204186008379925e-07,
"loss": 0.1621,
"step": 1150
},
{
"epoch": 0.8521674694331234,
"eval_loss": 0.1711866408586502,
"eval_runtime": 209.8853,
"eval_samples_per_second": 2.706,
"eval_steps_per_second": 1.353,
"step": 1150
},
{
"epoch": 0.8595776213412375,
"grad_norm": 1.5918656587600708,
"learning_rate": 1.1949200240378577e-07,
"loss": 0.1866,
"step": 1160
},
{
"epoch": 0.8669877732493516,
"grad_norm": 1.593591570854187,
"learning_rate": 1.0753079357119132e-07,
"loss": 0.1605,
"step": 1170
},
{
"epoch": 0.8743979251574657,
"grad_norm": 1.415703535079956,
"learning_rate": 9.61662300514795e-08,
"loss": 0.159,
"step": 1180
},
{
"epoch": 0.8818080770655798,
"grad_norm": 1.7163190841674805,
"learning_rate": 8.540590943290127e-08,
"loss": 0.1783,
"step": 1190
},
{
"epoch": 0.889218228973694,
"grad_norm": 1.6285511255264282,
"learning_rate": 7.525702534725443e-08,
"loss": 0.1682,
"step": 1200
},
{
"epoch": 0.889218228973694,
"eval_loss": 0.17063578963279724,
"eval_runtime": 209.8969,
"eval_samples_per_second": 2.706,
"eval_steps_per_second": 1.353,
"step": 1200
},
{
"epoch": 0.8966283808818081,
"grad_norm": 1.5694634914398193,
"learning_rate": 6.572636266070264e-08,
"loss": 0.1612,
"step": 1210
},
{
"epoch": 0.9040385327899222,
"grad_norm": 1.6561760902404785,
"learning_rate": 5.682029293786672e-08,
"loss": 0.1703,
"step": 1220
},
{
"epoch": 0.9114486846980363,
"grad_norm": 1.4719998836517334,
"learning_rate": 4.854477018222103e-08,
"loss": 0.1909,
"step": 1230
},
{
"epoch": 0.9188588366061504,
"grad_norm": 1.349411129951477,
"learning_rate": 4.090532685564618e-08,
"loss": 0.1851,
"step": 1240
},
{
"epoch": 0.9262689885142645,
"grad_norm": 1.8222600221633911,
"learning_rate": 3.390707017979311e-08,
"loss": 0.1744,
"step": 1250
},
{
"epoch": 0.9262689885142645,
"eval_loss": 0.17040617763996124,
"eval_runtime": 210.0132,
"eval_samples_per_second": 2.705,
"eval_steps_per_second": 1.352,
"step": 1250
},
{
"epoch": 0.9336791404223787,
"grad_norm": 1.5898839235305786,
"learning_rate": 2.755467872173567e-08,
"loss": 0.1565,
"step": 1260
},
{
"epoch": 0.9410892923304928,
"grad_norm": 1.6416816711425781,
"learning_rate": 2.185239926619431e-08,
"loss": 0.1699,
"step": 1270
},
{
"epoch": 0.9484994442386069,
"grad_norm": 2.081421375274658,
"learning_rate": 1.6804043976418438e-08,
"loss": 0.1601,
"step": 1280
},
{
"epoch": 0.955909596146721,
"grad_norm": 1.6453341245651245,
"learning_rate": 1.2412987845628498e-08,
"loss": 0.1553,
"step": 1290
},
{
"epoch": 0.9633197480548351,
"grad_norm": 1.314548134803772,
"learning_rate": 8.682166440721727e-09,
"loss": 0.1626,
"step": 1300
},
{
"epoch": 0.9633197480548351,
"eval_loss": 0.1703246533870697,
"eval_runtime": 209.9198,
"eval_samples_per_second": 2.706,
"eval_steps_per_second": 1.353,
"step": 1300
},
{
"epoch": 0.9707298999629492,
"grad_norm": 1.4979052543640137,
"learning_rate": 5.614073939747443e-09,
"loss": 0.1671,
"step": 1310
},
{
"epoch": 0.9781400518710633,
"grad_norm": 1.4579306840896606,
"learning_rate": 3.210761464466638e-09,
"loss": 0.1605,
"step": 1320
},
{
"epoch": 0.9855502037791775,
"grad_norm": 1.3856755495071411,
"learning_rate": 1.4738357091084174e-09,
"loss": 0.1662,
"step": 1330
},
{
"epoch": 0.9929603556872916,
"grad_norm": 1.5018383264541626,
"learning_rate": 4.0445786624199175e-10,
"loss": 0.1757,
"step": 1340
},
{
"epoch": 1.0,
"grad_norm": 1.9634699821472168,
"learning_rate": 3.3428504808696857e-12,
"loss": 0.1735,
"step": 1350
},
{
"epoch": 1.0,
"eval_loss": 0.1701827049255371,
"eval_runtime": 209.8395,
"eval_samples_per_second": 2.707,
"eval_steps_per_second": 1.353,
"step": 1350
}
],
"logging_steps": 10,
"max_steps": 1350,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.764415382282035e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}