{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.07476915024860742,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007476915024860742,
"grad_norm": 58.74357986450195,
"learning_rate": 2.4919013207077e-08,
"loss": 2.1224,
"step": 10
},
{
"epoch": 0.0014953830049721484,
"grad_norm": 52.66537094116211,
"learning_rate": 4.9838026414154e-08,
"loss": 2.0535,
"step": 20
},
{
"epoch": 0.002243074507458223,
"grad_norm": 64.12317657470703,
"learning_rate": 7.4757039621231e-08,
"loss": 2.225,
"step": 30
},
{
"epoch": 0.002990766009944297,
"grad_norm": 53.08683776855469,
"learning_rate": 9.9676052828308e-08,
"loss": 2.2341,
"step": 40
},
{
"epoch": 0.0037384575124303713,
"grad_norm": 60.34415054321289,
"learning_rate": 1.24595066035385e-07,
"loss": 2.1499,
"step": 50
},
{
"epoch": 0.004486149014916446,
"grad_norm": 59.9915657043457,
"learning_rate": 1.49514079242462e-07,
"loss": 2.1315,
"step": 60
},
{
"epoch": 0.00523384051740252,
"grad_norm": 53.04794692993164,
"learning_rate": 1.7443309244953902e-07,
"loss": 1.8725,
"step": 70
},
{
"epoch": 0.005981532019888594,
"grad_norm": 44.168540954589844,
"learning_rate": 1.99352105656616e-07,
"loss": 1.778,
"step": 80
},
{
"epoch": 0.006729223522374668,
"grad_norm": 42.75059509277344,
"learning_rate": 2.2427111886369301e-07,
"loss": 1.4302,
"step": 90
},
{
"epoch": 0.007476915024860743,
"grad_norm": 27.886425018310547,
"learning_rate": 2.4919013207077e-07,
"loss": 1.2755,
"step": 100
},
{
"epoch": 0.008224606527346816,
"grad_norm": 26.356143951416016,
"learning_rate": 2.74109145277847e-07,
"loss": 0.9265,
"step": 110
},
{
"epoch": 0.008972298029832891,
"grad_norm": 23.42557144165039,
"learning_rate": 2.99028158484924e-07,
"loss": 0.7193,
"step": 120
},
{
"epoch": 0.009719989532318965,
"grad_norm": 20.218666076660156,
"learning_rate": 3.2394717169200103e-07,
"loss": 0.5458,
"step": 130
},
{
"epoch": 0.01046768103480504,
"grad_norm": 15.449305534362793,
"learning_rate": 3.4886618489907804e-07,
"loss": 0.531,
"step": 140
},
{
"epoch": 0.011215372537291114,
"grad_norm": 14.092931747436523,
"learning_rate": 3.73785198106155e-07,
"loss": 0.3885,
"step": 150
},
{
"epoch": 0.011963064039777187,
"grad_norm": 14.002432823181152,
"learning_rate": 3.98704211313232e-07,
"loss": 0.4171,
"step": 160
},
{
"epoch": 0.012710755542263263,
"grad_norm": 14.803778648376465,
"learning_rate": 4.23623224520309e-07,
"loss": 0.2906,
"step": 170
},
{
"epoch": 0.013458447044749336,
"grad_norm": 12.300663948059082,
"learning_rate": 4.4854223772738603e-07,
"loss": 0.303,
"step": 180
},
{
"epoch": 0.01420613854723541,
"grad_norm": 11.724754333496094,
"learning_rate": 4.7346125093446304e-07,
"loss": 0.2489,
"step": 190
},
{
"epoch": 0.014953830049721485,
"grad_norm": 11.619375228881836,
"learning_rate": 4.9838026414154e-07,
"loss": 0.2303,
"step": 200
},
{
"epoch": 0.01570152155220756,
"grad_norm": 11.649147987365723,
"learning_rate": 5.23299277348617e-07,
"loss": 0.296,
"step": 210
},
{
"epoch": 0.016449213054693632,
"grad_norm": 9.424432754516602,
"learning_rate": 5.48218290555694e-07,
"loss": 0.2946,
"step": 220
},
{
"epoch": 0.017196904557179708,
"grad_norm": 8.53671646118164,
"learning_rate": 5.73137303762771e-07,
"loss": 0.3388,
"step": 230
},
{
"epoch": 0.017944596059665783,
"grad_norm": 7.045456886291504,
"learning_rate": 5.98056316969848e-07,
"loss": 0.2974,
"step": 240
},
{
"epoch": 0.018692287562151855,
"grad_norm": 6.465863227844238,
"learning_rate": 6.22975330176925e-07,
"loss": 0.2693,
"step": 250
},
{
"epoch": 0.01943997906463793,
"grad_norm": 10.001205444335938,
"learning_rate": 6.478943433840021e-07,
"loss": 0.252,
"step": 260
},
{
"epoch": 0.020187670567124005,
"grad_norm": 11.637846946716309,
"learning_rate": 6.728133565910791e-07,
"loss": 0.2691,
"step": 270
},
{
"epoch": 0.02093536206961008,
"grad_norm": 4.656905651092529,
"learning_rate": 6.977323697981561e-07,
"loss": 0.1984,
"step": 280
},
{
"epoch": 0.021683053572096152,
"grad_norm": 5.980122089385986,
"learning_rate": 7.22651383005233e-07,
"loss": 0.2565,
"step": 290
},
{
"epoch": 0.022430745074582228,
"grad_norm": 5.372318267822266,
"learning_rate": 7.4757039621231e-07,
"loss": 0.2042,
"step": 300
},
{
"epoch": 0.023178436577068303,
"grad_norm": 5.082016944885254,
"learning_rate": 7.72489409419387e-07,
"loss": 0.1794,
"step": 310
},
{
"epoch": 0.023926128079554375,
"grad_norm": 4.287816047668457,
"learning_rate": 7.97408422626464e-07,
"loss": 0.2197,
"step": 320
},
{
"epoch": 0.02467381958204045,
"grad_norm": 5.412531852722168,
"learning_rate": 8.22327435833541e-07,
"loss": 0.2527,
"step": 330
},
{
"epoch": 0.025421511084526525,
"grad_norm": 5.210222244262695,
"learning_rate": 8.47246449040618e-07,
"loss": 0.2304,
"step": 340
},
{
"epoch": 0.026169202587012597,
"grad_norm": 6.744383335113525,
"learning_rate": 8.72165462247695e-07,
"loss": 0.2496,
"step": 350
},
{
"epoch": 0.026916894089498673,
"grad_norm": 4.84708309173584,
"learning_rate": 8.970844754547721e-07,
"loss": 0.2347,
"step": 360
},
{
"epoch": 0.027664585591984748,
"grad_norm": 4.0091633796691895,
"learning_rate": 9.220034886618491e-07,
"loss": 0.2366,
"step": 370
},
{
"epoch": 0.02841227709447082,
"grad_norm": 4.7725348472595215,
"learning_rate": 9.469225018689261e-07,
"loss": 0.2645,
"step": 380
},
{
"epoch": 0.029159968596956895,
"grad_norm": 6.1945414543151855,
"learning_rate": 9.71841515076003e-07,
"loss": 0.2052,
"step": 390
},
{
"epoch": 0.02990766009944297,
"grad_norm": 3.8042452335357666,
"learning_rate": 9.9676052828308e-07,
"loss": 0.1775,
"step": 400
},
{
"epoch": 0.030655351601929046,
"grad_norm": 5.524064540863037,
"learning_rate": 1.0216795414901571e-06,
"loss": 0.2121,
"step": 410
},
{
"epoch": 0.03140304310441512,
"grad_norm": 3.3003029823303223,
"learning_rate": 1.046598554697234e-06,
"loss": 0.1952,
"step": 420
},
{
"epoch": 0.03215073460690119,
"grad_norm": 5.770596981048584,
"learning_rate": 1.0715175679043111e-06,
"loss": 0.2236,
"step": 430
},
{
"epoch": 0.032898426109387265,
"grad_norm": 6.644259929656982,
"learning_rate": 1.096436581111388e-06,
"loss": 0.2303,
"step": 440
},
{
"epoch": 0.03364611761187334,
"grad_norm": 2.906667470932007,
"learning_rate": 1.1213555943184652e-06,
"loss": 0.2226,
"step": 450
},
{
"epoch": 0.034393809114359415,
"grad_norm": 4.669227600097656,
"learning_rate": 1.146274607525542e-06,
"loss": 0.2409,
"step": 460
},
{
"epoch": 0.03514150061684549,
"grad_norm": 2.6083242893218994,
"learning_rate": 1.1711936207326192e-06,
"loss": 0.2273,
"step": 470
},
{
"epoch": 0.035889192119331566,
"grad_norm": 6.719790458679199,
"learning_rate": 1.196112633939696e-06,
"loss": 0.2079,
"step": 480
},
{
"epoch": 0.03663688362181764,
"grad_norm": 4.7992167472839355,
"learning_rate": 1.2210316471467732e-06,
"loss": 0.2181,
"step": 490
},
{
"epoch": 0.03738457512430371,
"grad_norm": 3.172945261001587,
"learning_rate": 1.24595066035385e-06,
"loss": 0.2258,
"step": 500
},
{
"epoch": 0.038132266626789785,
"grad_norm": 3.8808019161224365,
"learning_rate": 1.2708696735609272e-06,
"loss": 0.2436,
"step": 510
},
{
"epoch": 0.03887995812927586,
"grad_norm": 4.4543256759643555,
"learning_rate": 1.2957886867680041e-06,
"loss": 0.2319,
"step": 520
},
{
"epoch": 0.039627649631761935,
"grad_norm": 5.0955491065979,
"learning_rate": 1.3207076999750812e-06,
"loss": 0.2302,
"step": 530
},
{
"epoch": 0.04037534113424801,
"grad_norm": 3.1149497032165527,
"learning_rate": 1.3456267131821581e-06,
"loss": 0.2158,
"step": 540
},
{
"epoch": 0.041123032636734086,
"grad_norm": 3.299131155014038,
"learning_rate": 1.3705457263892353e-06,
"loss": 0.2355,
"step": 550
},
{
"epoch": 0.04187072413922016,
"grad_norm": 6.149620532989502,
"learning_rate": 1.3954647395963122e-06,
"loss": 0.243,
"step": 560
},
{
"epoch": 0.04261841564170623,
"grad_norm": 3.9118595123291016,
"learning_rate": 1.420383752803389e-06,
"loss": 0.2242,
"step": 570
},
{
"epoch": 0.043366107144192305,
"grad_norm": 2.8375048637390137,
"learning_rate": 1.445302766010466e-06,
"loss": 0.2153,
"step": 580
},
{
"epoch": 0.04411379864667838,
"grad_norm": 3.6035165786743164,
"learning_rate": 1.470221779217543e-06,
"loss": 0.2533,
"step": 590
},
{
"epoch": 0.044861490149164455,
"grad_norm": 2.9740023612976074,
"learning_rate": 1.49514079242462e-06,
"loss": 0.2069,
"step": 600
},
{
"epoch": 0.04560918165165053,
"grad_norm": 1.984499454498291,
"learning_rate": 1.5200598056316971e-06,
"loss": 0.2035,
"step": 610
},
{
"epoch": 0.046356873154136606,
"grad_norm": 3.3487439155578613,
"learning_rate": 1.544978818838774e-06,
"loss": 0.1891,
"step": 620
},
{
"epoch": 0.047104564656622674,
"grad_norm": 3.0334084033966064,
"learning_rate": 1.5698978320458511e-06,
"loss": 0.2165,
"step": 630
},
{
"epoch": 0.04785225615910875,
"grad_norm": 2.650395631790161,
"learning_rate": 1.594816845252928e-06,
"loss": 0.242,
"step": 640
},
{
"epoch": 0.048599947661594825,
"grad_norm": 4.511347770690918,
"learning_rate": 1.6197358584600052e-06,
"loss": 0.2383,
"step": 650
},
{
"epoch": 0.0493476391640809,
"grad_norm": 3.0503461360931396,
"learning_rate": 1.644654871667082e-06,
"loss": 0.2059,
"step": 660
},
{
"epoch": 0.050095330666566976,
"grad_norm": 3.2818543910980225,
"learning_rate": 1.6695738848741592e-06,
"loss": 0.2051,
"step": 670
},
{
"epoch": 0.05084302216905305,
"grad_norm": 3.457136392593384,
"learning_rate": 1.694492898081236e-06,
"loss": 0.2139,
"step": 680
},
{
"epoch": 0.051590713671539126,
"grad_norm": 2.4822287559509277,
"learning_rate": 1.7194119112883132e-06,
"loss": 0.2211,
"step": 690
},
{
"epoch": 0.052338405174025195,
"grad_norm": 3.445197820663452,
"learning_rate": 1.74433092449539e-06,
"loss": 0.2233,
"step": 700
},
{
"epoch": 0.05308609667651127,
"grad_norm": 2.7770423889160156,
"learning_rate": 1.7692499377024672e-06,
"loss": 0.2509,
"step": 710
},
{
"epoch": 0.053833788178997345,
"grad_norm": 5.7018513679504395,
"learning_rate": 1.7941689509095441e-06,
"loss": 0.2181,
"step": 720
},
{
"epoch": 0.05458147968148342,
"grad_norm": 2.4717321395874023,
"learning_rate": 1.8190879641166212e-06,
"loss": 0.2241,
"step": 730
},
{
"epoch": 0.055329171183969496,
"grad_norm": 3.148643732070923,
"learning_rate": 1.8440069773236981e-06,
"loss": 0.2172,
"step": 740
},
{
"epoch": 0.05607686268645557,
"grad_norm": 2.6590192317962646,
"learning_rate": 1.8689259905307753e-06,
"loss": 0.1997,
"step": 750
},
{
"epoch": 0.05682455418894164,
"grad_norm": 3.944225311279297,
"learning_rate": 1.8938450037378522e-06,
"loss": 0.2584,
"step": 760
},
{
"epoch": 0.057572245691427715,
"grad_norm": 2.639666795730591,
"learning_rate": 1.9187640169449293e-06,
"loss": 0.2518,
"step": 770
},
{
"epoch": 0.05831993719391379,
"grad_norm": 2.579602003097534,
"learning_rate": 1.943683030152006e-06,
"loss": 0.2181,
"step": 780
},
{
"epoch": 0.059067628696399865,
"grad_norm": 3.810966968536377,
"learning_rate": 1.968602043359083e-06,
"loss": 0.2164,
"step": 790
},
{
"epoch": 0.05981532019888594,
"grad_norm": 2.8656558990478516,
"learning_rate": 1.99352105656616e-06,
"loss": 0.258,
"step": 800
},
{
"epoch": 0.060563011701372016,
"grad_norm": 2.6037025451660156,
"learning_rate": 2.018440069773237e-06,
"loss": 0.2307,
"step": 810
},
{
"epoch": 0.06131070320385809,
"grad_norm": 2.0483126640319824,
"learning_rate": 2.0433590829803142e-06,
"loss": 0.2399,
"step": 820
},
{
"epoch": 0.06205839470634416,
"grad_norm": 2.1348931789398193,
"learning_rate": 2.068278096187391e-06,
"loss": 0.2341,
"step": 830
},
{
"epoch": 0.06280608620883024,
"grad_norm": 2.779850959777832,
"learning_rate": 2.093197109394468e-06,
"loss": 0.2077,
"step": 840
},
{
"epoch": 0.06355377771131632,
"grad_norm": 3.0291380882263184,
"learning_rate": 2.118116122601545e-06,
"loss": 0.2576,
"step": 850
},
{
"epoch": 0.06430146921380238,
"grad_norm": 2.3847033977508545,
"learning_rate": 2.1430351358086223e-06,
"loss": 0.2174,
"step": 860
},
{
"epoch": 0.06504916071628845,
"grad_norm": 2.7788398265838623,
"learning_rate": 2.167954149015699e-06,
"loss": 0.313,
"step": 870
},
{
"epoch": 0.06579685221877453,
"grad_norm": 3.205575466156006,
"learning_rate": 2.192873162222776e-06,
"loss": 0.2412,
"step": 880
},
{
"epoch": 0.0665445437212606,
"grad_norm": 3.200623035430908,
"learning_rate": 2.217792175429853e-06,
"loss": 0.1836,
"step": 890
},
{
"epoch": 0.06729223522374668,
"grad_norm": 4.147164344787598,
"learning_rate": 2.2427111886369303e-06,
"loss": 0.1856,
"step": 900
},
{
"epoch": 0.06803992672623276,
"grad_norm": 2.339167833328247,
"learning_rate": 2.267630201844007e-06,
"loss": 0.2139,
"step": 910
},
{
"epoch": 0.06878761822871883,
"grad_norm": 2.250478744506836,
"learning_rate": 2.292549215051084e-06,
"loss": 0.2164,
"step": 920
},
{
"epoch": 0.0695353097312049,
"grad_norm": 2.227017641067505,
"learning_rate": 2.317468228258161e-06,
"loss": 0.2217,
"step": 930
},
{
"epoch": 0.07028300123369098,
"grad_norm": 2.296231269836426,
"learning_rate": 2.3423872414652383e-06,
"loss": 0.1947,
"step": 940
},
{
"epoch": 0.07103069273617706,
"grad_norm": 1.9884122610092163,
"learning_rate": 2.3673062546723153e-06,
"loss": 0.2421,
"step": 950
},
{
"epoch": 0.07177838423866313,
"grad_norm": 2.3243603706359863,
"learning_rate": 2.392225267879392e-06,
"loss": 0.1988,
"step": 960
},
{
"epoch": 0.0725260757411492,
"grad_norm": 2.440160036087036,
"learning_rate": 2.417144281086469e-06,
"loss": 0.2126,
"step": 970
},
{
"epoch": 0.07327376724363528,
"grad_norm": 2.4434638023376465,
"learning_rate": 2.4420632942935464e-06,
"loss": 0.219,
"step": 980
},
{
"epoch": 0.07402145874612134,
"grad_norm": 12.352116584777832,
"learning_rate": 2.4669823075006233e-06,
"loss": 0.2414,
"step": 990
},
{
"epoch": 0.07476915024860742,
"grad_norm": 2.0996179580688477,
"learning_rate": 2.4919013207077e-06,
"loss": 0.2139,
"step": 1000
}
],
"logging_steps": 10,
"max_steps": 40122,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.5826787355262976e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}