{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 300000000000000000,
"global_step": 4230,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02364066193853428,
"grad_norm": 1506.188232421875,
"learning_rate": 2.978723404255319e-06,
"loss": 19.5964,
"step": 50
},
{
"epoch": 0.04728132387706856,
"grad_norm": 1148.9078369140625,
"learning_rate": 6.4539007092198585e-06,
"loss": 7.2922,
"step": 100
},
{
"epoch": 0.07092198581560284,
"grad_norm": 625.9593505859375,
"learning_rate": 9.929078014184396e-06,
"loss": 2.0938,
"step": 150
},
{
"epoch": 0.09456264775413711,
"grad_norm": 190.0230255126953,
"learning_rate": 1.347517730496454e-05,
"loss": 1.5984,
"step": 200
},
{
"epoch": 0.1182033096926714,
"grad_norm": 201.42596435546875,
"learning_rate": 1.7021276595744682e-05,
"loss": 1.4525,
"step": 250
},
{
"epoch": 0.14184397163120568,
"grad_norm": 42.652748107910156,
"learning_rate": 2.0567375886524822e-05,
"loss": 1.299,
"step": 300
},
{
"epoch": 0.16548463356973994,
"grad_norm": 38.504234313964844,
"learning_rate": 2.4113475177304965e-05,
"loss": 1.2265,
"step": 350
},
{
"epoch": 0.18912529550827423,
"grad_norm": 12.436441421508789,
"learning_rate": 2.765957446808511e-05,
"loss": 1.157,
"step": 400
},
{
"epoch": 0.2127659574468085,
"grad_norm": 7.723984241485596,
"learning_rate": 2.9998524001298715e-05,
"loss": 1.0646,
"step": 450
},
{
"epoch": 0.2364066193853428,
"grad_norm": 6.159175395965576,
"learning_rate": 2.9977078965004553e-05,
"loss": 0.9783,
"step": 500
},
{
"epoch": 0.26004728132387706,
"grad_norm": 5.534773826599121,
"learning_rate": 2.9930139839249263e-05,
"loss": 0.9361,
"step": 550
},
{
"epoch": 0.28368794326241137,
"grad_norm": 4.956591606140137,
"learning_rate": 2.9857786524143804e-05,
"loss": 0.8943,
"step": 600
},
{
"epoch": 0.3073286052009456,
"grad_norm": 4.068881511688232,
"learning_rate": 2.976014218001153e-05,
"loss": 0.8575,
"step": 650
},
{
"epoch": 0.3309692671394799,
"grad_norm": 6.331308364868164,
"learning_rate": 2.963737301774379e-05,
"loss": 0.8035,
"step": 700
},
{
"epoch": 0.3546099290780142,
"grad_norm": 3.5792810916900635,
"learning_rate": 2.9489688015874604e-05,
"loss": 0.7974,
"step": 750
},
{
"epoch": 0.37825059101654845,
"grad_norm": 4.266261577606201,
"learning_rate": 2.9317338564855907e-05,
"loss": 0.7663,
"step": 800
},
{
"epoch": 0.40189125295508277,
"grad_norm": 3.804710865020752,
"learning_rate": 2.9120618039138956e-05,
"loss": 0.7613,
"step": 850
},
{
"epoch": 0.425531914893617,
"grad_norm": 4.593356132507324,
"learning_rate": 2.889986129779028e-05,
"loss": 0.7473,
"step": 900
},
{
"epoch": 0.4491725768321513,
"grad_norm": 3.493183135986328,
"learning_rate": 2.86554441144922e-05,
"loss": 0.7426,
"step": 950
},
{
"epoch": 0.4728132387706856,
"grad_norm": 5.607442855834961,
"learning_rate": 2.838778253789822e-05,
"loss": 0.7147,
"step": 1000
},
{
"epoch": 0.49645390070921985,
"grad_norm": 5.357724189758301,
"learning_rate": 2.8097332183432076e-05,
"loss": 0.7038,
"step": 1050
},
{
"epoch": 0.5200945626477541,
"grad_norm": 4.367136478424072,
"learning_rate": 2.7784587457735947e-05,
"loss": 0.6779,
"step": 1100
},
{
"epoch": 0.5437352245862884,
"grad_norm": 5.684892654418945,
"learning_rate": 2.7450080717087995e-05,
"loss": 0.6698,
"step": 1150
},
{
"epoch": 0.5673758865248227,
"grad_norm": 9.005069732666016,
"learning_rate": 2.7094381361221724e-05,
"loss": 0.692,
"step": 1200
},
{
"epoch": 0.5910165484633569,
"grad_norm": 6.619784832000732,
"learning_rate": 2.6718094864089753e-05,
"loss": 0.6623,
"step": 1250
},
{
"epoch": 0.6146572104018913,
"grad_norm": 5.136876583099365,
"learning_rate": 2.63218617432218e-05,
"loss": 0.6518,
"step": 1300
},
{
"epoch": 0.6382978723404256,
"grad_norm": 9.187308311462402,
"learning_rate": 2.590635646943119e-05,
"loss": 0.6466,
"step": 1350
},
{
"epoch": 0.6619385342789598,
"grad_norm": 9.047890663146973,
"learning_rate": 2.547228631872591e-05,
"loss": 0.6326,
"step": 1400
},
{
"epoch": 0.6855791962174941,
"grad_norm": 5.511709690093994,
"learning_rate": 2.5020390168378374e-05,
"loss": 0.6261,
"step": 1450
},
{
"epoch": 0.7092198581560284,
"grad_norm": 11.79383659362793,
"learning_rate": 2.4551437239203342e-05,
"loss": 0.6387,
"step": 1500
},
{
"epoch": 0.7328605200945626,
"grad_norm": 11.677322387695312,
"learning_rate": 2.4066225786184802e-05,
"loss": 0.6246,
"step": 1550
},
{
"epoch": 0.7565011820330969,
"grad_norm": 6.934109687805176,
"learning_rate": 2.3565581739680718e-05,
"loss": 0.607,
"step": 1600
},
{
"epoch": 0.7801418439716312,
"grad_norm": 10.238001823425293,
"learning_rate": 2.3050357299518546e-05,
"loss": 0.5975,
"step": 1650
},
{
"epoch": 0.8037825059101655,
"grad_norm": 4.702815055847168,
"learning_rate": 2.2521429484374676e-05,
"loss": 0.5886,
"step": 1700
},
{
"epoch": 0.8274231678486997,
"grad_norm": 8.547035217285156,
"learning_rate": 2.197969863890705e-05,
"loss": 0.5869,
"step": 1750
},
{
"epoch": 0.851063829787234,
"grad_norm": 6.0442399978637695,
"learning_rate": 2.1426086901182144e-05,
"loss": 0.5837,
"step": 1800
},
{
"epoch": 0.8747044917257684,
"grad_norm": 6.5389018058776855,
"learning_rate": 2.086153663300503e-05,
"loss": 0.5885,
"step": 1850
},
{
"epoch": 0.8983451536643026,
"grad_norm": 22.425125122070312,
"learning_rate": 2.0287008815824495e-05,
"loss": 0.5859,
"step": 1900
},
{
"epoch": 0.9219858156028369,
"grad_norm": 11.52462387084961,
"learning_rate": 1.9703481414943606e-05,
"loss": 0.5714,
"step": 1950
},
{
"epoch": 0.9456264775413712,
"grad_norm": 37.39026641845703,
"learning_rate": 1.9111947714820277e-05,
"loss": 0.565,
"step": 2000
},
{
"epoch": 0.9692671394799054,
"grad_norm": 5.455526351928711,
"learning_rate": 1.85134146282915e-05,
"loss": 0.5478,
"step": 2050
},
{
"epoch": 0.9929078014184397,
"grad_norm": 22.083446502685547,
"learning_rate": 1.7908900982599148e-05,
"loss": 0.5469,
"step": 2100
},
{
"epoch": 1.016548463356974,
"grad_norm": 10.494434356689453,
"learning_rate": 1.7299435785135098e-05,
"loss": 0.5444,
"step": 2150
},
{
"epoch": 1.0401891252955082,
"grad_norm": 20.982011795043945,
"learning_rate": 1.6686056471857595e-05,
"loss": 0.5356,
"step": 2200
},
{
"epoch": 1.0638297872340425,
"grad_norm": 10.873411178588867,
"learning_rate": 1.606980714136041e-05,
"loss": 0.5466,
"step": 2250
},
{
"epoch": 1.0874704491725768,
"grad_norm": 33.547916412353516,
"learning_rate": 1.5451736777600882e-05,
"loss": 0.5157,
"step": 2300
},
{
"epoch": 1.1111111111111112,
"grad_norm": 14.819494247436523,
"learning_rate": 1.4832897464312018e-05,
"loss": 0.5492,
"step": 2350
},
{
"epoch": 1.1347517730496455,
"grad_norm": 12.469233512878418,
"learning_rate": 1.4214342594138124e-05,
"loss": 0.5305,
"step": 2400
},
{
"epoch": 1.1583924349881798,
"grad_norm": 6.700783729553223,
"learning_rate": 1.3597125075542446e-05,
"loss": 0.5401,
"step": 2450
},
{
"epoch": 1.1820330969267139,
"grad_norm": 7.968286514282227,
"learning_rate": 1.2982295540538918e-05,
"loss": 0.528,
"step": 2500
},
{
"epoch": 1.2056737588652482,
"grad_norm": 6.891226291656494,
"learning_rate": 1.237090055629899e-05,
"loss": 0.5242,
"step": 2550
},
{
"epoch": 1.2293144208037825,
"grad_norm": 25.54634666442871,
"learning_rate": 1.1763980843677541e-05,
"loss": 0.4995,
"step": 2600
},
{
"epoch": 1.2529550827423168,
"grad_norm": 5.665143966674805,
"learning_rate": 1.1162569505690563e-05,
"loss": 0.4916,
"step": 2650
},
{
"epoch": 1.2765957446808511,
"grad_norm": 127.18929290771484,
"learning_rate": 1.0567690268959864e-05,
"loss": 0.4954,
"step": 2700
},
{
"epoch": 1.3002364066193852,
"grad_norm": 8.059895515441895,
"learning_rate": 9.980355741118442e-06,
"loss": 0.5046,
"step": 2750
},
{
"epoch": 1.3238770685579198,
"grad_norm": 5.609386920928955,
"learning_rate": 9.401565687142579e-06,
"loss": 0.5089,
"step": 2800
},
{
"epoch": 1.3475177304964538,
"grad_norm": 12.760987281799316,
"learning_rate": 8.832305327544893e-06,
"loss": 0.4951,
"step": 2850
},
{
"epoch": 1.3711583924349882,
"grad_norm": 3.802379846572876,
"learning_rate": 8.27354366132499e-06,
"loss": 0.4945,
"step": 2900
},
{
"epoch": 1.3947990543735225,
"grad_norm": 9.105464935302734,
"learning_rate": 7.726231816532574e-06,
"loss": 0.4808,
"step": 2950
},
{
"epoch": 1.4184397163120568,
"grad_norm": 41.7253303527832,
"learning_rate": 7.1913014312505226e-06,
"loss": 0.4898,
"step": 3000
},
{
"epoch": 1.442080378250591,
"grad_norm": 3.800203800201416,
"learning_rate": 6.6696630677540235e-06,
"loss": 0.4718,
"step": 3050
},
{
"epoch": 1.4657210401891252,
"grad_norm": 8.743084907531738,
"learning_rate": 6.162204662544992e-06,
"loss": 0.488,
"step": 3100
},
{
"epoch": 1.4893617021276595,
"grad_norm": 7.492750644683838,
"learning_rate": 5.66979001490036e-06,
"loss": 0.4593,
"step": 3150
},
{
"epoch": 1.5130023640661938,
"grad_norm": 3.9157655239105225,
"learning_rate": 5.193257316506778e-06,
"loss": 0.4883,
"step": 3200
},
{
"epoch": 1.5366430260047281,
"grad_norm": 6.936612606048584,
"learning_rate": 4.733417724684879e-06,
"loss": 0.464,
"step": 3250
},
{
"epoch": 1.5602836879432624,
"grad_norm": 7.2238688468933105,
"learning_rate": 4.2910539816315166e-06,
"loss": 0.4663,
"step": 3300
},
{
"epoch": 1.5839243498817965,
"grad_norm": 3.867790460586548,
"learning_rate": 3.866919082030514e-06,
"loss": 0.4485,
"step": 3350
},
{
"epoch": 1.607565011820331,
"grad_norm": 6.582269668579102,
"learning_rate": 3.461734991299779e-06,
"loss": 0.4421,
"step": 3400
},
{
"epoch": 1.6312056737588652,
"grad_norm": 5.653420448303223,
"learning_rate": 3.0761914166566895e-06,
"loss": 0.4673,
"step": 3450
},
{
"epoch": 1.6548463356973995,
"grad_norm": 12.142635345458984,
"learning_rate": 2.71094463309358e-06,
"loss": 0.4525,
"step": 3500
},
{
"epoch": 1.6784869976359338,
"grad_norm": 9.567633628845215,
"learning_rate": 2.3666163662618575e-06,
"loss": 0.4698,
"step": 3550
},
{
"epoch": 1.702127659574468,
"grad_norm": 5.014305591583252,
"learning_rate": 2.043792734166174e-06,
"loss": 0.4669,
"step": 3600
},
{
"epoch": 1.7257683215130024,
"grad_norm": 5.084635257720947,
"learning_rate": 1.7430232494702537e-06,
"loss": 0.449,
"step": 3650
},
{
"epoch": 1.7494089834515365,
"grad_norm": 14.368584632873535,
"learning_rate": 1.4648198841125453e-06,
"loss": 0.447,
"step": 3700
},
{
"epoch": 1.773049645390071,
"grad_norm": 5.743776321411133,
"learning_rate": 1.209656197823985e-06,
"loss": 0.4455,
"step": 3750
},
{
"epoch": 1.7966903073286051,
"grad_norm": 5.783755779266357,
"learning_rate": 9.779665320312675e-07,
"loss": 0.4416,
"step": 3800
},
{
"epoch": 1.8203309692671394,
"grad_norm": 7.874058723449707,
"learning_rate": 7.701452705178236e-07,
"loss": 0.4501,
"step": 3850
},
{
"epoch": 1.8439716312056738,
"grad_norm": 8.542813301086426,
"learning_rate": 5.865461681009542e-07,
"loss": 0.4626,
"step": 3900
},
{
"epoch": 1.867612293144208,
"grad_norm": 6.454606056213379,
"learning_rate": 4.2748174846788724e-07,
"loss": 0.4527,
"step": 3950
},
{
"epoch": 1.8912529550827424,
"grad_norm": 18.86402702331543,
"learning_rate": 2.9322277219574145e-07,
"loss": 0.4503,
"step": 4000
},
{
"epoch": 1.9148936170212765,
"grad_norm": 6.615504741668701,
"learning_rate": 1.839977758609801e-07,
"loss": 0.4526,
"step": 4050
},
{
"epoch": 1.938534278959811,
"grad_norm": 11.798493385314941,
"learning_rate": 9.99926830228265e-08,
"loss": 0.4485,
"step": 4100
},
{
"epoch": 1.962174940898345,
"grad_norm": 20.15699577331543,
"learning_rate": 4.135048774287553e-08,
"loss": 0.4371,
"step": 4150
},
{
"epoch": 1.9858156028368794,
"grad_norm": 15.80203628540039,
"learning_rate": 8.171011179587961e-09,
"loss": 0.4495,
"step": 4200
}
],
"logging_steps": 50,
"max_steps": 4230,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1254586343864402e+18,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}