{
"best_global_step": 2500,
"best_metric": 2.1584246158599854,
"best_model_checkpoint": "/kaggle/working/checkpoints/checkpoint-2500",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2691,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011157601115760112,
"grad_norm": 255.8717498779297,
"learning_rate": 0.0,
"loss": 108.23062133789062,
"num_input_tokens_seen": 8192,
"step": 1,
"train_runtime": 9.3992,
"train_tokens_per_second": 871.567
},
{
"epoch": 0.05578800557880056,
"grad_norm": 115.45601654052734,
"learning_rate": 0.000362962962962963,
"loss": 93.04728356186224,
"num_input_tokens_seen": 409600,
"step": 50,
"train_runtime": 173.2384,
"train_tokens_per_second": 2364.372
},
{
"epoch": 0.11157601115760112,
"grad_norm": 1.7184170484542847,
"learning_rate": 0.0007333333333333333,
"loss": 36.475849609375,
"num_input_tokens_seen": 819200,
"step": 100,
"train_runtime": 351.2989,
"train_tokens_per_second": 2331.918
},
{
"epoch": 0.16736401673640167,
"grad_norm": 0.7904146909713745,
"learning_rate": 0.0009945226917057904,
"loss": 17.753541259765626,
"num_input_tokens_seen": 1228800,
"step": 150,
"train_runtime": 530.547,
"train_tokens_per_second": 2316.1
},
{
"epoch": 0.22315202231520223,
"grad_norm": 0.6032889485359192,
"learning_rate": 0.000974960876369327,
"loss": 14.870999755859375,
"num_input_tokens_seen": 1638400,
"step": 200,
"train_runtime": 709.2031,
"train_tokens_per_second": 2310.199
},
{
"epoch": 0.2789400278940028,
"grad_norm": 0.5387754440307617,
"learning_rate": 0.0009553990610328639,
"loss": 14.170311279296875,
"num_input_tokens_seen": 2048000,
"step": 250,
"train_runtime": 887.3381,
"train_tokens_per_second": 2308.027
},
{
"epoch": 0.33472803347280333,
"grad_norm": 0.5034986734390259,
"learning_rate": 0.0009358372456964006,
"loss": 13.64815673828125,
"num_input_tokens_seen": 2457600,
"step": 300,
"train_runtime": 1065.8215,
"train_tokens_per_second": 2305.827
},
{
"epoch": 0.3905160390516039,
"grad_norm": 0.637877345085144,
"learning_rate": 0.0009162754303599374,
"loss": 13.331446533203126,
"num_input_tokens_seen": 2867200,
"step": 350,
"train_runtime": 1244.2112,
"train_tokens_per_second": 2304.432
},
{
"epoch": 0.44630404463040446,
"grad_norm": 0.5915430188179016,
"learning_rate": 0.0008967136150234741,
"loss": 12.95500732421875,
"num_input_tokens_seen": 3276800,
"step": 400,
"train_runtime": 1422.4179,
"train_tokens_per_second": 2303.683
},
{
"epoch": 0.502092050209205,
"grad_norm": 0.5109351873397827,
"learning_rate": 0.000877151799687011,
"loss": 12.745074462890624,
"num_input_tokens_seen": 3686400,
"step": 450,
"train_runtime": 1600.7409,
"train_tokens_per_second": 2302.934
},
{
"epoch": 0.5578800557880056,
"grad_norm": 0.5278257727622986,
"learning_rate": 0.0008575899843505478,
"loss": 12.500452880859376,
"num_input_tokens_seen": 4096000,
"step": 500,
"train_runtime": 1779.1422,
"train_tokens_per_second": 2302.233
},
{
"epoch": 0.5578800557880056,
"eval_loss": 2.4494383335113525,
"eval_runtime": 10.7867,
"eval_samples_per_second": 53.77,
"eval_steps_per_second": 6.768,
"num_input_tokens_seen": 4096000,
"step": 500
},
{
"epoch": 0.6136680613668062,
"grad_norm": 0.5274862051010132,
"learning_rate": 0.0008380281690140845,
"loss": 12.42565185546875,
"num_input_tokens_seen": 4505600,
"step": 550,
"train_runtime": 2008.4152,
"train_tokens_per_second": 2243.361
},
{
"epoch": 0.6694560669456067,
"grad_norm": 0.5122476816177368,
"learning_rate": 0.0008184663536776214,
"loss": 12.244449462890625,
"num_input_tokens_seen": 4915200,
"step": 600,
"train_runtime": 2186.6759,
"train_tokens_per_second": 2247.795
},
{
"epoch": 0.7252440725244073,
"grad_norm": 0.48029351234436035,
"learning_rate": 0.000798904538341158,
"loss": 12.14824951171875,
"num_input_tokens_seen": 5324800,
"step": 650,
"train_runtime": 2365.4468,
"train_tokens_per_second": 2251.076
},
{
"epoch": 0.7810320781032078,
"grad_norm": 0.5422759652137756,
"learning_rate": 0.0007793427230046949,
"loss": 12.082501220703126,
"num_input_tokens_seen": 5734400,
"step": 700,
"train_runtime": 2543.5192,
"train_tokens_per_second": 2254.514
},
{
"epoch": 0.8368200836820083,
"grad_norm": 0.5309758186340332,
"learning_rate": 0.0007597809076682316,
"loss": 11.98714599609375,
"num_input_tokens_seen": 6144000,
"step": 750,
"train_runtime": 2721.8962,
"train_tokens_per_second": 2257.25
},
{
"epoch": 0.8926080892608089,
"grad_norm": 0.5811660289764404,
"learning_rate": 0.0007402190923317684,
"loss": 11.878330078125,
"num_input_tokens_seen": 6553600,
"step": 800,
"train_runtime": 2899.6508,
"train_tokens_per_second": 2260.134
},
{
"epoch": 0.9483960948396095,
"grad_norm": 0.5693972706794739,
"learning_rate": 0.0007206572769953051,
"loss": 11.66504638671875,
"num_input_tokens_seen": 6963200,
"step": 850,
"train_runtime": 3077.6825,
"train_tokens_per_second": 2262.482
},
{
"epoch": 1.003347280334728,
"grad_norm": 0.5016735792160034,
"learning_rate": 0.000701095461658842,
"loss": 11.572359619140625,
"num_input_tokens_seen": 7366656,
"step": 900,
"train_runtime": 3253.1428,
"train_tokens_per_second": 2264.474
},
{
"epoch": 1.0591352859135286,
"grad_norm": 0.5279722809791565,
"learning_rate": 0.0006815336463223787,
"loss": 11.42222412109375,
"num_input_tokens_seen": 7776256,
"step": 950,
"train_runtime": 3431.1574,
"train_tokens_per_second": 2266.365
},
{
"epoch": 1.114923291492329,
"grad_norm": 0.5057054758071899,
"learning_rate": 0.0006619718309859155,
"loss": 11.35930419921875,
"num_input_tokens_seen": 8185856,
"step": 1000,
"train_runtime": 3609.1142,
"train_tokens_per_second": 2268.107
},
{
"epoch": 1.114923291492329,
"eval_loss": 2.3056819438934326,
"eval_runtime": 10.8358,
"eval_samples_per_second": 53.527,
"eval_steps_per_second": 6.737,
"num_input_tokens_seen": 8185856,
"step": 1000
},
{
"epoch": 1.1707112970711298,
"grad_norm": 0.4898532032966614,
"learning_rate": 0.0006424100156494523,
"loss": 11.516380615234375,
"num_input_tokens_seen": 8595456,
"step": 1050,
"train_runtime": 3839.7669,
"train_tokens_per_second": 2238.536
},
{
"epoch": 1.2264993026499302,
"grad_norm": 0.48597925901412964,
"learning_rate": 0.000622848200312989,
"loss": 11.34823974609375,
"num_input_tokens_seen": 9005056,
"step": 1100,
"train_runtime": 4018.0046,
"train_tokens_per_second": 2241.176
},
{
"epoch": 1.2822873082287307,
"grad_norm": 0.48646438121795654,
"learning_rate": 0.0006032863849765259,
"loss": 11.37316650390625,
"num_input_tokens_seen": 9414656,
"step": 1150,
"train_runtime": 4195.7027,
"train_tokens_per_second": 2243.881
},
{
"epoch": 1.3380753138075314,
"grad_norm": 0.5371856689453125,
"learning_rate": 0.0005837245696400626,
"loss": 11.138577880859375,
"num_input_tokens_seen": 9824256,
"step": 1200,
"train_runtime": 4373.778,
"train_tokens_per_second": 2246.172
},
{
"epoch": 1.393863319386332,
"grad_norm": 0.5028601884841919,
"learning_rate": 0.0005641627543035994,
"loss": 11.108404541015625,
"num_input_tokens_seen": 10233856,
"step": 1250,
"train_runtime": 4552.0261,
"train_tokens_per_second": 2248.198
},
{
"epoch": 1.4496513249651324,
"grad_norm": 0.5344403386116028,
"learning_rate": 0.0005446009389671362,
"loss": 11.08916015625,
"num_input_tokens_seen": 10643456,
"step": 1300,
"train_runtime": 4730.315,
"train_tokens_per_second": 2250.052
},
{
"epoch": 1.505439330543933,
"grad_norm": 0.4930146038532257,
"learning_rate": 0.000525039123630673,
"loss": 11.0640771484375,
"num_input_tokens_seen": 11053056,
"step": 1350,
"train_runtime": 4908.43,
"train_tokens_per_second": 2251.852
},
{
"epoch": 1.5612273361227336,
"grad_norm": 0.6248587965965271,
"learning_rate": 0.0005054773082942097,
"loss": 11.053743896484375,
"num_input_tokens_seen": 11462656,
"step": 1400,
"train_runtime": 5086.4814,
"train_tokens_per_second": 2253.553
},
{
"epoch": 1.617015341701534,
"grad_norm": 0.5878036618232727,
"learning_rate": 0.0004859154929577465,
"loss": 11.075479736328125,
"num_input_tokens_seen": 11872256,
"step": 1450,
"train_runtime": 5264.6386,
"train_tokens_per_second": 2255.094
},
{
"epoch": 1.6728033472803348,
"grad_norm": 0.5579462647438049,
"learning_rate": 0.0004663536776212833,
"loss": 11.019775390625,
"num_input_tokens_seen": 12281856,
"step": 1500,
"train_runtime": 5442.3024,
"train_tokens_per_second": 2256.739
},
{
"epoch": 1.6728033472803348,
"eval_loss": 2.2273507118225098,
"eval_runtime": 10.872,
"eval_samples_per_second": 53.348,
"eval_steps_per_second": 6.714,
"num_input_tokens_seen": 12281856,
"step": 1500
},
{
"epoch": 1.7285913528591352,
"grad_norm": 0.5162424445152283,
"learning_rate": 0.00044679186228482003,
"loss": 11.0057470703125,
"num_input_tokens_seen": 12691456,
"step": 1550,
"train_runtime": 5677.3627,
"train_tokens_per_second": 2235.449
},
{
"epoch": 1.7843793584379357,
"grad_norm": 0.5111306309700012,
"learning_rate": 0.00042723004694835684,
"loss": 10.911416015625,
"num_input_tokens_seen": 13101056,
"step": 1600,
"train_runtime": 5855.7999,
"train_tokens_per_second": 2237.279
},
{
"epoch": 1.8401673640167364,
"grad_norm": 0.5066443681716919,
"learning_rate": 0.0004076682316118936,
"loss": 10.815118408203125,
"num_input_tokens_seen": 13510656,
"step": 1650,
"train_runtime": 6033.7829,
"train_tokens_per_second": 2239.168
},
{
"epoch": 1.8959553695955371,
"grad_norm": 0.4963262677192688,
"learning_rate": 0.00038810641627543035,
"loss": 10.912720947265624,
"num_input_tokens_seen": 13920256,
"step": 1700,
"train_runtime": 6211.8611,
"train_tokens_per_second": 2240.916
},
{
"epoch": 1.9517433751743374,
"grad_norm": 0.5284143090248108,
"learning_rate": 0.00036854460093896715,
"loss": 10.824471435546876,
"num_input_tokens_seen": 14329856,
"step": 1750,
"train_runtime": 6389.8997,
"train_tokens_per_second": 2242.579
},
{
"epoch": 2.006694560669456,
"grad_norm": 0.5001941323280334,
"learning_rate": 0.0003489827856025039,
"loss": 10.686629638671874,
"num_input_tokens_seen": 14733312,
"step": 1800,
"train_runtime": 6565.3838,
"train_tokens_per_second": 2244.09
},
{
"epoch": 2.0624825662482564,
"grad_norm": 0.5786783695220947,
"learning_rate": 0.00032942097026604066,
"loss": 10.69493896484375,
"num_input_tokens_seen": 15142912,
"step": 1850,
"train_runtime": 6743.0758,
"train_tokens_per_second": 2245.698
},
{
"epoch": 2.118270571827057,
"grad_norm": 0.5400704145431519,
"learning_rate": 0.00030985915492957747,
"loss": 10.6007958984375,
"num_input_tokens_seen": 15552512,
"step": 1900,
"train_runtime": 6921.2091,
"train_tokens_per_second": 2247.08
},
{
"epoch": 2.174058577405858,
"grad_norm": 0.5565312504768372,
"learning_rate": 0.0002902973395931143,
"loss": 10.623209228515625,
"num_input_tokens_seen": 15962112,
"step": 1950,
"train_runtime": 7099.0243,
"train_tokens_per_second": 2248.494
},
{
"epoch": 2.229846582984658,
"grad_norm": 0.5031603574752808,
"learning_rate": 0.00027073552425665103,
"loss": 10.5346484375,
"num_input_tokens_seen": 16371712,
"step": 2000,
"train_runtime": 7277.2957,
"train_tokens_per_second": 2249.697
},
{
"epoch": 2.229846582984658,
"eval_loss": 2.1852738857269287,
"eval_runtime": 10.833,
"eval_samples_per_second": 53.54,
"eval_steps_per_second": 6.739,
"num_input_tokens_seen": 16371712,
"step": 2000
},
{
"epoch": 2.285634588563459,
"grad_norm": 0.5220429301261902,
"learning_rate": 0.00025117370892018784,
"loss": 10.5833935546875,
"num_input_tokens_seen": 16781312,
"step": 2050,
"train_runtime": 7514.4606,
"train_tokens_per_second": 2233.202
},
{
"epoch": 2.3414225941422595,
"grad_norm": 0.5597192049026489,
"learning_rate": 0.0002316118935837246,
"loss": 10.598099365234376,
"num_input_tokens_seen": 17190912,
"step": 2100,
"train_runtime": 7692.7208,
"train_tokens_per_second": 2234.699
},
{
"epoch": 2.3972105997210598,
"grad_norm": 0.5263229012489319,
"learning_rate": 0.00021205007824726135,
"loss": 10.5710595703125,
"num_input_tokens_seen": 17600512,
"step": 2150,
"train_runtime": 7870.818,
"train_tokens_per_second": 2236.173
},
{
"epoch": 2.4529986052998605,
"grad_norm": 0.5467224717140198,
"learning_rate": 0.00019248826291079813,
"loss": 10.567000732421874,
"num_input_tokens_seen": 18010112,
"step": 2200,
"train_runtime": 8049.3853,
"train_tokens_per_second": 2237.452
},
{
"epoch": 2.508786610878661,
"grad_norm": 0.5116831660270691,
"learning_rate": 0.0001729264475743349,
"loss": 10.55864013671875,
"num_input_tokens_seen": 18419712,
"step": 2250,
"train_runtime": 8227.2353,
"train_tokens_per_second": 2238.87
},
{
"epoch": 2.5645746164574614,
"grad_norm": 0.4988791048526764,
"learning_rate": 0.00015336463223787167,
"loss": 10.57171142578125,
"num_input_tokens_seen": 18829312,
"step": 2300,
"train_runtime": 8405.3452,
"train_tokens_per_second": 2240.159
},
{
"epoch": 2.620362622036262,
"grad_norm": 0.5179631114006042,
"learning_rate": 0.00013380281690140845,
"loss": 10.552501220703125,
"num_input_tokens_seen": 19238912,
"step": 2350,
"train_runtime": 8583.3621,
"train_tokens_per_second": 2241.419
},
{
"epoch": 2.676150627615063,
"grad_norm": 0.5133325457572937,
"learning_rate": 0.00011424100156494523,
"loss": 10.537681884765625,
"num_input_tokens_seen": 19648512,
"step": 2400,
"train_runtime": 8761.4066,
"train_tokens_per_second": 2242.621
},
{
"epoch": 2.731938633193863,
"grad_norm": 0.5093114972114563,
"learning_rate": 9.467918622848201e-05,
"loss": 10.546854248046875,
"num_input_tokens_seen": 20058112,
"step": 2450,
"train_runtime": 8939.3652,
"train_tokens_per_second": 2243.796
},
{
"epoch": 2.787726638772664,
"grad_norm": 0.534430742263794,
"learning_rate": 7.511737089201878e-05,
"loss": 10.509217529296874,
"num_input_tokens_seen": 20467712,
"step": 2500,
"train_runtime": 9117.1673,
"train_tokens_per_second": 2244.964
},
{
"epoch": 2.787726638772664,
"eval_loss": 2.1584246158599854,
"eval_runtime": 10.8364,
"eval_samples_per_second": 53.523,
"eval_steps_per_second": 6.737,
"num_input_tokens_seen": 20467712,
"step": 2500
},
{
"epoch": 2.8435146443514645,
"grad_norm": 0.5242642760276794,
"learning_rate": 5.555555555555555e-05,
"loss": 10.4308740234375,
"num_input_tokens_seen": 20877312,
"step": 2550,
"train_runtime": 9348.7798,
"train_tokens_per_second": 2233.159
},
{
"epoch": 2.8993026499302648,
"grad_norm": 0.5218517184257507,
"learning_rate": 3.599374021909233e-05,
"loss": 10.400145263671876,
"num_input_tokens_seen": 21286912,
"step": 2600,
"train_runtime": 9526.9521,
"train_tokens_per_second": 2234.388
},
{
"epoch": 2.9550906555090655,
"grad_norm": 0.5121810436248779,
"learning_rate": 1.643192488262911e-05,
"loss": 10.519163818359376,
"num_input_tokens_seen": 21696512,
"step": 2650,
"train_runtime": 9704.9471,
"train_tokens_per_second": 2235.614
},
{
"epoch": 3.0,
"num_input_tokens_seen": 22026240,
"step": 2691,
"total_flos": 3.99160296603648e+16,
"train_loss": 13.479616094991115,
"train_runtime": 9892.7445,
"train_samples_per_second": 17.395,
"train_steps_per_second": 0.272
}
],
"logging_steps": 50,
"max_steps": 2691,
"num_input_tokens_seen": 22026240,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.99160296603648e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}