gec_coedit_c4200m / checkpoint-46000 /trainer_state.json
sumitraut7's picture
Updated model to version 2
218c1b2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.9372115487817965,
"eval_steps": 500,
"global_step": 46000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05366534292154127,
"grad_norm": 0.3935315012931824,
"learning_rate": 4.9463346570784594e-05,
"loss": 0.7966,
"step": 500
},
{
"epoch": 0.10733068584308254,
"grad_norm": 0.26197516918182373,
"learning_rate": 4.892669314156917e-05,
"loss": 0.2163,
"step": 1000
},
{
"epoch": 0.1609960287646238,
"grad_norm": 0.34691885113716125,
"learning_rate": 4.839003971235376e-05,
"loss": 0.2089,
"step": 1500
},
{
"epoch": 0.2146613716861651,
"grad_norm": 0.239683598279953,
"learning_rate": 4.7853386283138354e-05,
"loss": 0.1973,
"step": 2000
},
{
"epoch": 0.26832671460770635,
"grad_norm": 0.3232818841934204,
"learning_rate": 4.731673285392294e-05,
"loss": 0.2035,
"step": 2500
},
{
"epoch": 0.3219920575292476,
"grad_norm": 0.3142314553260803,
"learning_rate": 4.678007942470752e-05,
"loss": 0.197,
"step": 3000
},
{
"epoch": 0.3756574004507889,
"grad_norm": 0.2924354076385498,
"learning_rate": 4.6243425995492114e-05,
"loss": 0.1913,
"step": 3500
},
{
"epoch": 0.4293227433723302,
"grad_norm": 0.4022532105445862,
"learning_rate": 4.57067725662767e-05,
"loss": 0.1931,
"step": 4000
},
{
"epoch": 0.4829880862938714,
"grad_norm": 0.2730918228626251,
"learning_rate": 4.517011913706129e-05,
"loss": 0.189,
"step": 4500
},
{
"epoch": 0.5366534292154127,
"grad_norm": 0.2549761235713959,
"learning_rate": 4.4633465707845874e-05,
"loss": 0.1861,
"step": 5000
},
{
"epoch": 0.5903187721369539,
"grad_norm": 0.28634166717529297,
"learning_rate": 4.409681227863046e-05,
"loss": 0.1853,
"step": 5500
},
{
"epoch": 0.6439841150584952,
"grad_norm": 0.267437607049942,
"learning_rate": 4.356015884941505e-05,
"loss": 0.1888,
"step": 6000
},
{
"epoch": 0.6976494579800365,
"grad_norm": 0.24703972041606903,
"learning_rate": 4.302350542019964e-05,
"loss": 0.1863,
"step": 6500
},
{
"epoch": 0.7513148009015778,
"grad_norm": 0.20265009999275208,
"learning_rate": 4.2486851990984225e-05,
"loss": 0.1854,
"step": 7000
},
{
"epoch": 0.804980143823119,
"grad_norm": 0.3746808171272278,
"learning_rate": 4.195019856176881e-05,
"loss": 0.1838,
"step": 7500
},
{
"epoch": 0.8586454867446603,
"grad_norm": 0.2632112503051758,
"learning_rate": 4.14135451325534e-05,
"loss": 0.1827,
"step": 8000
},
{
"epoch": 0.9123108296662016,
"grad_norm": 0.3191295564174652,
"learning_rate": 4.0876891703337986e-05,
"loss": 0.18,
"step": 8500
},
{
"epoch": 0.9659761725877428,
"grad_norm": 0.2685633599758148,
"learning_rate": 4.034023827412257e-05,
"loss": 0.1783,
"step": 9000
},
{
"epoch": 1.0,
"eval_loss": 0.18574130535125732,
"eval_runtime": 121.9059,
"eval_samples_per_second": 178.105,
"eval_steps_per_second": 11.132,
"step": 9317
},
{
"epoch": 1.0196415155092842,
"grad_norm": 0.20498144626617432,
"learning_rate": 3.980358484490716e-05,
"loss": 0.1754,
"step": 9500
},
{
"epoch": 1.0733068584308254,
"grad_norm": 0.2528730034828186,
"learning_rate": 3.9266931415691746e-05,
"loss": 0.1755,
"step": 10000
},
{
"epoch": 1.1269722013523666,
"grad_norm": 0.28406932950019836,
"learning_rate": 3.873027798647634e-05,
"loss": 0.1724,
"step": 10500
},
{
"epoch": 1.1806375442739079,
"grad_norm": 0.20382213592529297,
"learning_rate": 3.819362455726092e-05,
"loss": 0.1729,
"step": 11000
},
{
"epoch": 1.2343028871954491,
"grad_norm": 0.22263003885746002,
"learning_rate": 3.765697112804551e-05,
"loss": 0.1791,
"step": 11500
},
{
"epoch": 1.2879682301169906,
"grad_norm": 0.2563818395137787,
"learning_rate": 3.71203176988301e-05,
"loss": 0.1758,
"step": 12000
},
{
"epoch": 1.3416335730385316,
"grad_norm": 0.2475346475839615,
"learning_rate": 3.658366426961469e-05,
"loss": 0.1798,
"step": 12500
},
{
"epoch": 1.395298915960073,
"grad_norm": 0.246457040309906,
"learning_rate": 3.604701084039927e-05,
"loss": 0.1752,
"step": 13000
},
{
"epoch": 1.4489642588816143,
"grad_norm": 0.18007038533687592,
"learning_rate": 3.551035741118386e-05,
"loss": 0.1708,
"step": 13500
},
{
"epoch": 1.5026296018031555,
"grad_norm": 0.3777049779891968,
"learning_rate": 3.497370398196845e-05,
"loss": 0.174,
"step": 14000
},
{
"epoch": 1.5562949447246968,
"grad_norm": 0.29228395223617554,
"learning_rate": 3.443705055275303e-05,
"loss": 0.1723,
"step": 14500
},
{
"epoch": 1.609960287646238,
"grad_norm": 0.2725663483142853,
"learning_rate": 3.390039712353762e-05,
"loss": 0.173,
"step": 15000
},
{
"epoch": 1.6636256305677795,
"grad_norm": 0.2582469880580902,
"learning_rate": 3.336374369432221e-05,
"loss": 0.1739,
"step": 15500
},
{
"epoch": 1.7172909734893205,
"grad_norm": 0.3346666395664215,
"learning_rate": 3.28270902651068e-05,
"loss": 0.1723,
"step": 16000
},
{
"epoch": 1.770956316410862,
"grad_norm": 0.3053486943244934,
"learning_rate": 3.2290436835891384e-05,
"loss": 0.1702,
"step": 16500
},
{
"epoch": 1.8246216593324032,
"grad_norm": 0.231419175863266,
"learning_rate": 3.175378340667597e-05,
"loss": 0.1721,
"step": 17000
},
{
"epoch": 1.8782870022539444,
"grad_norm": 0.29271528124809265,
"learning_rate": 3.121712997746056e-05,
"loss": 0.1724,
"step": 17500
},
{
"epoch": 1.9319523451754856,
"grad_norm": 0.19697044789791107,
"learning_rate": 3.0680476548245145e-05,
"loss": 0.1737,
"step": 18000
},
{
"epoch": 1.9856176880970269,
"grad_norm": 0.19517184793949127,
"learning_rate": 3.014382311902973e-05,
"loss": 0.1762,
"step": 18500
},
{
"epoch": 2.0,
"eval_loss": 0.18117234110832214,
"eval_runtime": 121.7819,
"eval_samples_per_second": 178.286,
"eval_steps_per_second": 11.143,
"step": 18634
},
{
"epoch": 2.0392830310185683,
"grad_norm": 0.38678213953971863,
"learning_rate": 2.9607169689814317e-05,
"loss": 0.1706,
"step": 19000
},
{
"epoch": 2.0929483739401094,
"grad_norm": 0.37432995438575745,
"learning_rate": 2.9070516260598908e-05,
"loss": 0.1676,
"step": 19500
},
{
"epoch": 2.146613716861651,
"grad_norm": 0.230524942278862,
"learning_rate": 2.8533862831383496e-05,
"loss": 0.17,
"step": 20000
},
{
"epoch": 2.200279059783192,
"grad_norm": 0.28219106793403625,
"learning_rate": 2.7997209402168077e-05,
"loss": 0.1701,
"step": 20500
},
{
"epoch": 2.2539444027047333,
"grad_norm": 0.31730708479881287,
"learning_rate": 2.7460555972952668e-05,
"loss": 0.1694,
"step": 21000
},
{
"epoch": 2.3076097456262747,
"grad_norm": 0.20662575960159302,
"learning_rate": 2.6923902543737256e-05,
"loss": 0.1679,
"step": 21500
},
{
"epoch": 2.3612750885478158,
"grad_norm": 0.22145278751850128,
"learning_rate": 2.6387249114521844e-05,
"loss": 0.1697,
"step": 22000
},
{
"epoch": 2.414940431469357,
"grad_norm": 0.17506997287273407,
"learning_rate": 2.585059568530643e-05,
"loss": 0.1657,
"step": 22500
},
{
"epoch": 2.4686057743908982,
"grad_norm": 0.22657690942287445,
"learning_rate": 2.5313942256091016e-05,
"loss": 0.1682,
"step": 23000
},
{
"epoch": 2.5222711173124397,
"grad_norm": 0.2509589195251465,
"learning_rate": 2.4777288826875604e-05,
"loss": 0.1642,
"step": 23500
},
{
"epoch": 2.575936460233981,
"grad_norm": 0.1847866326570511,
"learning_rate": 2.4240635397660192e-05,
"loss": 0.1712,
"step": 24000
},
{
"epoch": 2.629601803155522,
"grad_norm": 0.33803707361221313,
"learning_rate": 2.370398196844478e-05,
"loss": 0.1688,
"step": 24500
},
{
"epoch": 2.683267146077063,
"grad_norm": 0.16116875410079956,
"learning_rate": 2.3167328539229368e-05,
"loss": 0.169,
"step": 25000
},
{
"epoch": 2.7369324889986046,
"grad_norm": 0.21728037297725677,
"learning_rate": 2.2630675110013955e-05,
"loss": 0.1673,
"step": 25500
},
{
"epoch": 2.790597831920146,
"grad_norm": 0.26892897486686707,
"learning_rate": 2.209402168079854e-05,
"loss": 0.167,
"step": 26000
},
{
"epoch": 2.844263174841687,
"grad_norm": 0.24722512066364288,
"learning_rate": 2.1557368251583128e-05,
"loss": 0.1702,
"step": 26500
},
{
"epoch": 2.8979285177632286,
"grad_norm": 0.26736003160476685,
"learning_rate": 2.1020714822367716e-05,
"loss": 0.1646,
"step": 27000
},
{
"epoch": 2.9515938606847696,
"grad_norm": 0.32340237498283386,
"learning_rate": 2.0484061393152303e-05,
"loss": 0.167,
"step": 27500
},
{
"epoch": 3.0,
"eval_loss": 0.17867647111415863,
"eval_runtime": 120.9369,
"eval_samples_per_second": 179.532,
"eval_steps_per_second": 11.221,
"step": 27951
},
{
"epoch": 3.005259203606311,
"grad_norm": 0.20410487055778503,
"learning_rate": 1.994740796393689e-05,
"loss": 0.1616,
"step": 28000
},
{
"epoch": 3.0589245465278525,
"grad_norm": 0.2781909704208374,
"learning_rate": 1.941075453472148e-05,
"loss": 0.1622,
"step": 28500
},
{
"epoch": 3.1125898894493935,
"grad_norm": 0.23636963963508606,
"learning_rate": 1.8874101105506064e-05,
"loss": 0.1671,
"step": 29000
},
{
"epoch": 3.166255232370935,
"grad_norm": 0.17557688057422638,
"learning_rate": 1.8337447676290655e-05,
"loss": 0.1637,
"step": 29500
},
{
"epoch": 3.219920575292476,
"grad_norm": 0.15285401046276093,
"learning_rate": 1.780079424707524e-05,
"loss": 0.1625,
"step": 30000
},
{
"epoch": 3.2735859182140175,
"grad_norm": 0.18128257989883423,
"learning_rate": 1.7264140817859827e-05,
"loss": 0.1634,
"step": 30500
},
{
"epoch": 3.3272512611355585,
"grad_norm": 0.2326362580060959,
"learning_rate": 1.6727487388644415e-05,
"loss": 0.1642,
"step": 31000
},
{
"epoch": 3.3809166040571,
"grad_norm": 0.28395962715148926,
"learning_rate": 1.6190833959429003e-05,
"loss": 0.1644,
"step": 31500
},
{
"epoch": 3.4345819469786414,
"grad_norm": 0.22677470743656158,
"learning_rate": 1.5654180530213587e-05,
"loss": 0.1648,
"step": 32000
},
{
"epoch": 3.4882472899001824,
"grad_norm": 0.27061572670936584,
"learning_rate": 1.5117527100998177e-05,
"loss": 0.1647,
"step": 32500
},
{
"epoch": 3.541912632821724,
"grad_norm": 0.23157773911952972,
"learning_rate": 1.4580873671782763e-05,
"loss": 0.1642,
"step": 33000
},
{
"epoch": 3.595577975743265,
"grad_norm": 0.22143514454364777,
"learning_rate": 1.4044220242567353e-05,
"loss": 0.1668,
"step": 33500
},
{
"epoch": 3.6492433186648063,
"grad_norm": 0.2169780731201172,
"learning_rate": 1.3507566813351939e-05,
"loss": 0.1658,
"step": 34000
},
{
"epoch": 3.7029086615863473,
"grad_norm": 0.3290941119194031,
"learning_rate": 1.2970913384136527e-05,
"loss": 0.1647,
"step": 34500
},
{
"epoch": 3.756574004507889,
"grad_norm": 0.29469916224479675,
"learning_rate": 1.2434259954921113e-05,
"loss": 0.1638,
"step": 35000
},
{
"epoch": 3.8102393474294303,
"grad_norm": 0.29265907406806946,
"learning_rate": 1.18976065257057e-05,
"loss": 0.1618,
"step": 35500
},
{
"epoch": 3.8639046903509713,
"grad_norm": 0.192903533577919,
"learning_rate": 1.1360953096490287e-05,
"loss": 0.1673,
"step": 36000
},
{
"epoch": 3.9175700332725127,
"grad_norm": 0.29646775126457214,
"learning_rate": 1.0824299667274874e-05,
"loss": 0.1631,
"step": 36500
},
{
"epoch": 3.9712353761940538,
"grad_norm": 0.2786768674850464,
"learning_rate": 1.0287646238059462e-05,
"loss": 0.1634,
"step": 37000
},
{
"epoch": 4.0,
"eval_loss": 0.17870686948299408,
"eval_runtime": 120.9355,
"eval_samples_per_second": 179.534,
"eval_steps_per_second": 11.221,
"step": 37268
},
{
"epoch": 4.024900719115595,
"grad_norm": 0.18881458044052124,
"learning_rate": 9.750992808844048e-06,
"loss": 0.1587,
"step": 37500
},
{
"epoch": 4.078566062037137,
"grad_norm": 0.2428637146949768,
"learning_rate": 9.214339379628636e-06,
"loss": 0.1664,
"step": 38000
},
{
"epoch": 4.132231404958677,
"grad_norm": 0.2574012279510498,
"learning_rate": 8.677685950413224e-06,
"loss": 0.1602,
"step": 38500
},
{
"epoch": 4.185896747880219,
"grad_norm": 0.2370443195104599,
"learning_rate": 8.14103252119781e-06,
"loss": 0.1619,
"step": 39000
},
{
"epoch": 4.23956209080176,
"grad_norm": 0.2101278007030487,
"learning_rate": 7.604379091982398e-06,
"loss": 0.1636,
"step": 39500
},
{
"epoch": 4.293227433723302,
"grad_norm": 0.1884097009897232,
"learning_rate": 7.067725662766986e-06,
"loss": 0.1613,
"step": 40000
},
{
"epoch": 4.346892776644843,
"grad_norm": 0.2265803962945938,
"learning_rate": 6.531072233551573e-06,
"loss": 0.1657,
"step": 40500
},
{
"epoch": 4.400558119566384,
"grad_norm": 0.2761909067630768,
"learning_rate": 5.99441880433616e-06,
"loss": 0.1641,
"step": 41000
},
{
"epoch": 4.454223462487925,
"grad_norm": 0.2776241898536682,
"learning_rate": 5.457765375120747e-06,
"loss": 0.1611,
"step": 41500
},
{
"epoch": 4.507888805409467,
"grad_norm": 0.17776153981685638,
"learning_rate": 4.921111945905334e-06,
"loss": 0.1641,
"step": 42000
},
{
"epoch": 4.561554148331008,
"grad_norm": 0.29574069380760193,
"learning_rate": 4.384458516689922e-06,
"loss": 0.1619,
"step": 42500
},
{
"epoch": 4.6152194912525495,
"grad_norm": 0.25601324439048767,
"learning_rate": 3.847805087474509e-06,
"loss": 0.1598,
"step": 43000
},
{
"epoch": 4.66888483417409,
"grad_norm": 0.2692703306674957,
"learning_rate": 3.3111516582590963e-06,
"loss": 0.1604,
"step": 43500
},
{
"epoch": 4.7225501770956315,
"grad_norm": 0.24300901591777802,
"learning_rate": 2.7744982290436837e-06,
"loss": 0.1631,
"step": 44000
},
{
"epoch": 4.776215520017173,
"grad_norm": 0.2995280623435974,
"learning_rate": 2.237844799828271e-06,
"loss": 0.1647,
"step": 44500
},
{
"epoch": 4.829880862938714,
"grad_norm": 0.26150044798851013,
"learning_rate": 1.7011913706128583e-06,
"loss": 0.1619,
"step": 45000
},
{
"epoch": 4.883546205860256,
"grad_norm": 0.22888287901878357,
"learning_rate": 1.1645379413974456e-06,
"loss": 0.1607,
"step": 45500
},
{
"epoch": 4.9372115487817965,
"grad_norm": 0.3330378234386444,
"learning_rate": 6.278845121820329e-07,
"loss": 0.1607,
"step": 46000
}
],
"logging_steps": 500,
"max_steps": 46585,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.490275612904653e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}