MedicalBitNET-350M-v0.1 / trainer_state.json
3v324v23's picture
final additions
cfae895
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9891774989096567,
"eval_steps": 500,
"global_step": 110000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008992522717360514,
"grad_norm": 0.5267877578735352,
"learning_rate": 0.0004999002617394902,
"loss": 4.4189,
"step": 1000
},
{
"epoch": 0.017985045434721028,
"grad_norm": 0.4186614155769348,
"learning_rate": 0.0004996010866580058,
"loss": 3.2692,
"step": 2000
},
{
"epoch": 0.026977568152081546,
"grad_norm": 0.37206095457077026,
"learning_rate": 0.0004991027135159133,
"loss": 3.055,
"step": 3000
},
{
"epoch": 0.035970090869442056,
"grad_norm": 0.3365996479988098,
"learning_rate": 0.0004984055400477128,
"loss": 2.9514,
"step": 4000
},
{
"epoch": 0.04496261358680257,
"grad_norm": 0.3055461645126343,
"learning_rate": 0.0004975101226436211,
"loss": 2.8843,
"step": 5000
},
{
"epoch": 0.05395513630416309,
"grad_norm": 0.2653150260448456,
"learning_rate": 0.0004964171759055367,
"loss": 2.8359,
"step": 6000
},
{
"epoch": 0.0629476590215236,
"grad_norm": 0.26524779200553894,
"learning_rate": 0.0004951275720767395,
"loss": 2.7964,
"step": 7000
},
{
"epoch": 0.07194018173888411,
"grad_norm": 0.23818659782409668,
"learning_rate": 0.0004936423403457847,
"loss": 2.7717,
"step": 8000
},
{
"epoch": 0.08093270445624463,
"grad_norm": 0.24217627942562103,
"learning_rate": 0.0004919626660251412,
"loss": 2.7383,
"step": 9000
},
{
"epoch": 0.08992522717360514,
"grad_norm": 0.2376166582107544,
"learning_rate": 0.0004900898896052357,
"loss": 2.7244,
"step": 10000
},
{
"epoch": 0.09891774989096566,
"grad_norm": 0.2203313708305359,
"learning_rate": 0.00048802550568465263,
"loss": 2.7031,
"step": 11000
},
{
"epoch": 0.10791027260832618,
"grad_norm": 0.20930123329162598,
"learning_rate": 0.00048577116177734653,
"loss": 2.6876,
"step": 12000
},
{
"epoch": 0.11690279532568669,
"grad_norm": 0.2004300355911255,
"learning_rate": 0.0004833286569978177,
"loss": 2.6722,
"step": 13000
},
{
"epoch": 0.1258953180430472,
"grad_norm": 0.19555214047431946,
"learning_rate": 0.0004806999406253004,
"loss": 2.6577,
"step": 14000
},
{
"epoch": 0.13488784076040772,
"grad_norm": 0.1877773553133011,
"learning_rate": 0.0004778871105481104,
"loss": 2.647,
"step": 15000
},
{
"epoch": 0.14388036347776822,
"grad_norm": 0.18455323576927185,
"learning_rate": 0.0004748924115893922,
"loss": 2.6388,
"step": 16000
},
{
"epoch": 0.15287288619512876,
"grad_norm": 0.18309278786182404,
"learning_rate": 0.0004717214967118909,
"loss": 2.6254,
"step": 17000
},
{
"epoch": 0.16186540891248927,
"grad_norm": 0.17841506004333496,
"learning_rate": 0.0004683705487586517,
"loss": 2.6187,
"step": 18000
},
{
"epoch": 0.17085793162984977,
"grad_norm": 0.17425057291984558,
"learning_rate": 0.00046484532676522683,
"loss": 2.6115,
"step": 19000
},
{
"epoch": 0.17985045434721028,
"grad_norm": 0.17270472645759583,
"learning_rate": 0.00046114864409029877,
"loss": 2.6017,
"step": 20000
},
{
"epoch": 0.18884297706457082,
"grad_norm": 0.1661667376756668,
"learning_rate": 0.00045728739927797956,
"loss": 2.5935,
"step": 21000
},
{
"epoch": 0.19783549978193132,
"grad_norm": 0.16307614743709564,
"learning_rate": 0.0004532569441500434,
"loss": 2.593,
"step": 22000
},
{
"epoch": 0.20682802249929183,
"grad_norm": 0.15843474864959717,
"learning_rate": 0.0004490642766310399,
"loss": 2.5831,
"step": 23000
},
{
"epoch": 0.21582054521665237,
"grad_norm": 0.15826140344142914,
"learning_rate": 0.0004447171724872102,
"loss": 2.575,
"step": 24000
},
{
"epoch": 0.22481306793401287,
"grad_norm": 0.15876850485801697,
"learning_rate": 0.0004402103986613901,
"loss": 2.5718,
"step": 25000
},
{
"epoch": 0.23380559065137338,
"grad_norm": 0.15799590945243835,
"learning_rate": 0.0004355518244446819,
"loss": 2.5643,
"step": 26000
},
{
"epoch": 0.2427981133687339,
"grad_norm": 0.15866918861865997,
"learning_rate": 0.000430750047042202,
"loss": 2.5611,
"step": 27000
},
{
"epoch": 0.2517906360860944,
"grad_norm": 0.1551973819732666,
"learning_rate": 0.0004257992860597374,
"loss": 2.5532,
"step": 28000
},
{
"epoch": 0.26078315880345493,
"grad_norm": 0.1489175260066986,
"learning_rate": 0.00042070822570074265,
"loss": 2.5508,
"step": 29000
},
{
"epoch": 0.26977568152081544,
"grad_norm": 0.14987464249134064,
"learning_rate": 0.0004154809289657581,
"loss": 2.5451,
"step": 30000
},
{
"epoch": 0.27876820423817594,
"grad_norm": 0.14638318121433258,
"learning_rate": 0.00041013241527853406,
"loss": 2.5382,
"step": 31000
},
{
"epoch": 0.28776072695553645,
"grad_norm": 0.14251314103603363,
"learning_rate": 0.0004046455176043083,
"loss": 2.5351,
"step": 32000
},
{
"epoch": 0.296753249672897,
"grad_norm": 0.14875943958759308,
"learning_rate": 0.00039904087312981354,
"loss": 2.5315,
"step": 33000
},
{
"epoch": 0.3057457723902575,
"grad_norm": 0.14232666790485382,
"learning_rate": 0.00039331173496701843,
"loss": 2.5287,
"step": 34000
},
{
"epoch": 0.314738295107618,
"grad_norm": 0.1461074948310852,
"learning_rate": 0.00038746822462720277,
"loss": 2.5252,
"step": 35000
},
{
"epoch": 0.32373081782497853,
"grad_norm": 0.14645366370677948,
"learning_rate": 0.00038152101205969716,
"loss": 2.5184,
"step": 36000
},
{
"epoch": 0.33272334054233904,
"grad_norm": 0.14445528388023376,
"learning_rate": 0.00037546293799195995,
"loss": 2.5201,
"step": 37000
},
{
"epoch": 0.34171586325969955,
"grad_norm": 0.13728487491607666,
"learning_rate": 0.00036930473625947265,
"loss": 2.5149,
"step": 38000
},
{
"epoch": 0.35070838597706006,
"grad_norm": 0.13824057579040527,
"learning_rate": 0.00036305132151167983,
"loss": 2.5122,
"step": 39000
},
{
"epoch": 0.35970090869442056,
"grad_norm": 0.1395253986120224,
"learning_rate": 0.00035672045834706856,
"loss": 2.5069,
"step": 40000
},
{
"epoch": 0.3686934314117811,
"grad_norm": 0.14017699658870697,
"learning_rate": 0.0003502918267002188,
"loss": 2.5047,
"step": 41000
},
{
"epoch": 0.37768595412914163,
"grad_norm": 0.1350419521331787,
"learning_rate": 0.00034378315558863357,
"loss": 2.5011,
"step": 42000
},
{
"epoch": 0.38667847684650214,
"grad_norm": 0.14121927320957184,
"learning_rate": 0.00033719963935934026,
"loss": 2.496,
"step": 43000
},
{
"epoch": 0.39567099956386265,
"grad_norm": 0.1398804783821106,
"learning_rate": 0.0003305532181958758,
"loss": 2.4966,
"step": 44000
},
{
"epoch": 0.40466352228122315,
"grad_norm": 0.14015056192874908,
"learning_rate": 0.00032383589111626814,
"loss": 2.493,
"step": 45000
},
{
"epoch": 0.41365604499858366,
"grad_norm": 0.1367267668247223,
"learning_rate": 0.00031705963814611035,
"loss": 2.484,
"step": 46000
},
{
"epoch": 0.42264856771594417,
"grad_norm": 0.13933929800987244,
"learning_rate": 0.0003102367218727284,
"loss": 2.4843,
"step": 47000
},
{
"epoch": 0.43164109043330473,
"grad_norm": 0.13233740627765656,
"learning_rate": 0.0003033589288488015,
"loss": 2.4813,
"step": 48000
},
{
"epoch": 0.44063361315066524,
"grad_norm": 0.13488726317882538,
"learning_rate": 0.0002964385518951125,
"loss": 2.4786,
"step": 49000
},
{
"epoch": 0.44962613586802574,
"grad_norm": 0.1470656394958496,
"learning_rate": 0.0002894811139269912,
"loss": 2.4743,
"step": 50000
},
{
"epoch": 0.45861865858538625,
"grad_norm": 0.1314244419336319,
"learning_rate": 0.0002825061730718414,
"loss": 2.4751,
"step": 51000
},
{
"epoch": 0.46761118130274676,
"grad_norm": 0.13198289275169373,
"learning_rate": 0.0002754913419788723,
"loss": 2.4729,
"step": 52000
},
{
"epoch": 0.47660370402010727,
"grad_norm": 0.12992499768733978,
"learning_rate": 0.00026845616712076794,
"loss": 2.4694,
"step": 53000
},
{
"epoch": 0.4855962267374678,
"grad_norm": 0.13007935881614685,
"learning_rate": 0.00026141331841860756,
"loss": 2.463,
"step": 54000
},
{
"epoch": 0.49458874945482834,
"grad_norm": 0.1366134136915207,
"learning_rate": 0.000254354317669745,
"loss": 2.4592,
"step": 55000
},
{
"epoch": 0.5035812721721888,
"grad_norm": 0.12824617326259613,
"learning_rate": 0.0002472918418893963,
"loss": 2.4585,
"step": 56000
},
{
"epoch": 0.5125737948895494,
"grad_norm": 0.1312493234872818,
"learning_rate": 0.00024023858475626683,
"loss": 2.4571,
"step": 57000
},
{
"epoch": 0.5215663176069099,
"grad_norm": 0.12798364460468292,
"learning_rate": 0.0002331860555380091,
"loss": 2.4561,
"step": 58000
},
{
"epoch": 0.5305588403242704,
"grad_norm": 0.1297149360179901,
"learning_rate": 0.00022614694495174873,
"loss": 2.4524,
"step": 59000
},
{
"epoch": 0.5395513630416309,
"grad_norm": 0.12818291783332825,
"learning_rate": 0.00021912687067005265,
"loss": 2.4496,
"step": 60000
},
{
"epoch": 0.5485438857589914,
"grad_norm": 0.13012410700321198,
"learning_rate": 0.00021213841644168087,
"loss": 2.4465,
"step": 61000
},
{
"epoch": 0.5575364084763519,
"grad_norm": 0.13271279633045197,
"learning_rate": 0.00020518706618325146,
"loss": 2.4471,
"step": 62000
},
{
"epoch": 0.5665289311937124,
"grad_norm": 0.13144823908805847,
"learning_rate": 0.00019825751696178184,
"loss": 2.4405,
"step": 63000
},
{
"epoch": 0.5755214539110729,
"grad_norm": 0.1301935911178589,
"learning_rate": 0.00019136926164015156,
"loss": 2.4368,
"step": 64000
},
{
"epoch": 0.5845139766284334,
"grad_norm": 0.1324065774679184,
"learning_rate": 0.00018452779749851848,
"loss": 2.4383,
"step": 65000
},
{
"epoch": 0.593506499345794,
"grad_norm": 0.13288547098636627,
"learning_rate": 0.00017774534578113516,
"loss": 2.4307,
"step": 66000
},
{
"epoch": 0.6024990220631545,
"grad_norm": 0.13380388915538788,
"learning_rate": 0.00017101374175203582,
"loss": 2.4311,
"step": 67000
},
{
"epoch": 0.611491544780515,
"grad_norm": 0.12911923229694366,
"learning_rate": 0.00016434517394472685,
"loss": 2.4301,
"step": 68000
},
{
"epoch": 0.6204840674978755,
"grad_norm": 0.129085510969162,
"learning_rate": 0.0001577515286210997,
"loss": 2.4237,
"step": 69000
},
{
"epoch": 0.629476590215236,
"grad_norm": 0.12780623137950897,
"learning_rate": 0.00015122486833089863,
"loss": 2.4248,
"step": 70000
},
{
"epoch": 0.6384691129325966,
"grad_norm": 0.13019651174545288,
"learning_rate": 0.00014478344382118653,
"loss": 2.4217,
"step": 71000
},
{
"epoch": 0.6474616356499571,
"grad_norm": 0.13168035447597504,
"learning_rate": 0.00013841950088558575,
"loss": 2.4179,
"step": 72000
},
{
"epoch": 0.6564541583673176,
"grad_norm": 0.12985067069530487,
"learning_rate": 0.000132150835258465,
"loss": 2.4188,
"step": 73000
},
{
"epoch": 0.6654466810846781,
"grad_norm": 0.1350133717060089,
"learning_rate": 0.00012596990070111393,
"loss": 2.4159,
"step": 74000
},
{
"epoch": 0.6744392038020386,
"grad_norm": 0.13242337107658386,
"learning_rate": 0.00011988795030912905,
"loss": 2.4116,
"step": 75000
},
{
"epoch": 0.6834317265193991,
"grad_norm": 0.1263236254453659,
"learning_rate": 0.00011391576252872856,
"loss": 2.4121,
"step": 76000
},
{
"epoch": 0.6924242492367596,
"grad_norm": 0.1311461478471756,
"learning_rate": 0.00010804614802213383,
"loss": 2.4086,
"step": 77000
},
{
"epoch": 0.7014167719541201,
"grad_norm": 0.1291423887014389,
"learning_rate": 0.00010229552021442814,
"loss": 2.4061,
"step": 78000
},
{
"epoch": 0.7104092946714806,
"grad_norm": 0.13248379528522491,
"learning_rate": 9.66569564824003e-05,
"loss": 2.403,
"step": 79000
},
{
"epoch": 0.7194018173888411,
"grad_norm": 0.13015016913414001,
"learning_rate": 9.115167786655006e-05,
"loss": 2.4049,
"step": 80000
},
{
"epoch": 0.7283943401062017,
"grad_norm": 0.13090303540229797,
"learning_rate": 8.576201416635018e-05,
"loss": 2.4002,
"step": 81000
},
{
"epoch": 0.7373868628235623,
"grad_norm": 0.13276338577270508,
"learning_rate": 8.050342316576978e-05,
"loss": 2.3996,
"step": 82000
},
{
"epoch": 0.7463793855409228,
"grad_norm": 0.13524140417575836,
"learning_rate": 7.538010156580435e-05,
"loss": 2.3976,
"step": 83000
},
{
"epoch": 0.7553719082582833,
"grad_norm": 0.13214819133281708,
"learning_rate": 7.039613811358328e-05,
"loss": 2.3951,
"step": 84000
},
{
"epoch": 0.7643644309756438,
"grad_norm": 0.13076630234718323,
"learning_rate": 6.555551033928139e-05,
"loss": 2.3936,
"step": 85000
},
{
"epoch": 0.7733569536930043,
"grad_norm": 0.1300399899482727,
"learning_rate": 6.08620813817882e-05,
"loss": 2.3891,
"step": 86000
},
{
"epoch": 0.7823494764103648,
"grad_norm": 0.12917381525039673,
"learning_rate": 5.631959690566982e-05,
"loss": 2.3865,
"step": 87000
},
{
"epoch": 0.7913419991277253,
"grad_norm": 0.13091173768043518,
"learning_rate": 5.1940301332541934e-05,
"loss": 2.3814,
"step": 88000
},
{
"epoch": 0.8003345218450858,
"grad_norm": 0.13044433295726776,
"learning_rate": 4.771013851078279e-05,
"loss": 2.3799,
"step": 89000
},
{
"epoch": 0.8093270445624463,
"grad_norm": 0.12928803265094757,
"learning_rate": 4.364141628461085e-05,
"loss": 2.3801,
"step": 90000
},
{
"epoch": 0.8183195672798068,
"grad_norm": 0.1315431147813797,
"learning_rate": 3.973738176159078e-05,
"loss": 2.3749,
"step": 91000
},
{
"epoch": 0.8273120899971673,
"grad_norm": 0.13129934668540955,
"learning_rate": 3.6004802027335776e-05,
"loss": 2.3751,
"step": 92000
},
{
"epoch": 0.8363046127145278,
"grad_norm": 0.13383065164089203,
"learning_rate": 3.243918379103131e-05,
"loss": 2.3747,
"step": 93000
},
{
"epoch": 0.8452971354318883,
"grad_norm": 0.13397032022476196,
"learning_rate": 2.905049772583343e-05,
"loss": 2.3716,
"step": 94000
},
{
"epoch": 0.854289658149249,
"grad_norm": 0.13112303614616394,
"learning_rate": 2.5834664521170502e-05,
"loss": 2.3705,
"step": 95000
},
{
"epoch": 0.8632821808666095,
"grad_norm": 0.1314103603363037,
"learning_rate": 2.2797729977089537e-05,
"loss": 2.3649,
"step": 96000
},
{
"epoch": 0.87227470358397,
"grad_norm": 0.1317732334136963,
"learning_rate": 1.994488203786088e-05,
"loss": 2.3652,
"step": 97000
},
{
"epoch": 0.8812672263013305,
"grad_norm": 0.13032500445842743,
"learning_rate": 1.7272686436475505e-05,
"loss": 2.3646,
"step": 98000
},
{
"epoch": 0.890259749018691,
"grad_norm": 0.1355644017457962,
"learning_rate": 1.4788615534578526e-05,
"loss": 2.363,
"step": 99000
},
{
"epoch": 0.8992522717360515,
"grad_norm": 0.1347675770521164,
"learning_rate": 1.2489679000077859e-05,
"loss": 2.354,
"step": 100000
},
{
"epoch": 0.908244794453412,
"grad_norm": 0.13323768973350525,
"learning_rate": 1.0380291301085198e-05,
"loss": 2.3521,
"step": 101000
},
{
"epoch": 0.9172373171707725,
"grad_norm": 0.13284514844417572,
"learning_rate": 8.463957980162084e-06,
"loss": 2.3492,
"step": 102000
},
{
"epoch": 0.926229839888133,
"grad_norm": 0.1306186467409134,
"learning_rate": 6.738372149439731e-06,
"loss": 2.3454,
"step": 103000
},
{
"epoch": 0.9352223626054935,
"grad_norm": 0.13234929740428925,
"learning_rate": 5.206925074830115e-06,
"loss": 2.3452,
"step": 104000
},
{
"epoch": 0.944214885322854,
"grad_norm": 0.1304798424243927,
"learning_rate": 3.8708389516820544e-06,
"loss": 2.3448,
"step": 105000
},
{
"epoch": 0.9532074080402145,
"grad_norm": 0.1340903788805008,
"learning_rate": 2.733262717159124e-06,
"loss": 2.338,
"step": 106000
},
{
"epoch": 0.962199930757575,
"grad_norm": 0.13070346415042877,
"learning_rate": 1.7905451118825079e-06,
"loss": 2.3358,
"step": 107000
},
{
"epoch": 0.9711924534749355,
"grad_norm": 0.13099098205566406,
"learning_rate": 1.0459149536610202e-06,
"loss": 2.3329,
"step": 108000
},
{
"epoch": 0.980184976192296,
"grad_norm": 0.13041888177394867,
"learning_rate": 5.004130549572938e-07,
"loss": 2.3332,
"step": 109000
},
{
"epoch": 0.9891774989096567,
"grad_norm": 0.1311406046152115,
"learning_rate": 1.5338273892975818e-07,
"loss": 2.3253,
"step": 110000
}
],
"logging_steps": 1000,
"max_steps": 111203,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2000,
"total_flos": 9.3094825426944e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}