clean-baseline / trainer_state.json
deepaksamuel-cuk's picture
Epochs: 6000, loss: 0.052, time: 1.5 hrs, config: baseline
fd4a5ce verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 750.0,
"eval_steps": 100,
"global_step": 6000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 12.5,
"grad_norm": 0.9450863599777222,
"learning_rate": 5.94e-05,
"loss": 3.582935485839844,
"step": 100
},
{
"epoch": 25.0,
"grad_norm": 0.19262371957302094,
"learning_rate": 0.0001194,
"loss": 0.49665924072265627,
"step": 200
},
{
"epoch": 37.5,
"grad_norm": 0.15991109609603882,
"learning_rate": 0.00017939999999999997,
"loss": 0.4021767044067383,
"step": 300
},
{
"epoch": 50.0,
"grad_norm": 0.18609966337680817,
"learning_rate": 0.0002394,
"loss": 0.37090412139892576,
"step": 400
},
{
"epoch": 62.5,
"grad_norm": 0.24805887043476105,
"learning_rate": 0.00029939999999999996,
"loss": 0.3474049758911133,
"step": 500
},
{
"epoch": 75.0,
"grad_norm": 0.17464160919189453,
"learning_rate": 0.0002980838709677419,
"loss": 0.3306478118896484,
"step": 600
},
{
"epoch": 87.5,
"grad_norm": 0.24544841051101685,
"learning_rate": 0.00029614838709677416,
"loss": 0.3101034927368164,
"step": 700
},
{
"epoch": 100.0,
"grad_norm": 0.2018628716468811,
"learning_rate": 0.00029421290322580645,
"loss": 0.2937062835693359,
"step": 800
},
{
"epoch": 112.5,
"grad_norm": 0.2136959582567215,
"learning_rate": 0.0002922774193548387,
"loss": 0.2772307777404785,
"step": 900
},
{
"epoch": 125.0,
"grad_norm": 0.23597952723503113,
"learning_rate": 0.0002903419354838709,
"loss": 0.258614501953125,
"step": 1000
},
{
"epoch": 137.5,
"grad_norm": 0.30438488721847534,
"learning_rate": 0.0002884064516129032,
"loss": 0.2398568344116211,
"step": 1100
},
{
"epoch": 150.0,
"grad_norm": 0.27026715874671936,
"learning_rate": 0.00028647096774193546,
"loss": 0.21622713088989257,
"step": 1200
},
{
"epoch": 162.5,
"grad_norm": 0.2623114287853241,
"learning_rate": 0.0002845354838709677,
"loss": 0.19229209899902344,
"step": 1300
},
{
"epoch": 175.0,
"grad_norm": 0.34945833683013916,
"learning_rate": 0.0002826,
"loss": 0.16757678985595703,
"step": 1400
},
{
"epoch": 187.5,
"grad_norm": 0.29883235692977905,
"learning_rate": 0.0002806645161290322,
"loss": 0.14341635704040528,
"step": 1500
},
{
"epoch": 200.0,
"grad_norm": 0.31376898288726807,
"learning_rate": 0.0002787290322580645,
"loss": 0.1221920394897461,
"step": 1600
},
{
"epoch": 212.5,
"grad_norm": 0.28367292881011963,
"learning_rate": 0.00027679354838709675,
"loss": 0.1032716178894043,
"step": 1700
},
{
"epoch": 225.0,
"grad_norm": 0.2790682315826416,
"learning_rate": 0.000274858064516129,
"loss": 0.08668439865112304,
"step": 1800
},
{
"epoch": 237.5,
"grad_norm": 0.2293432205915451,
"learning_rate": 0.0002729225806451613,
"loss": 0.0720753002166748,
"step": 1900
},
{
"epoch": 250.0,
"grad_norm": 0.27616050839424133,
"learning_rate": 0.0002709870967741935,
"loss": 0.062033796310424806,
"step": 2000
},
{
"epoch": 262.5,
"grad_norm": 0.2692248225212097,
"learning_rate": 0.0002690516129032258,
"loss": 0.05264517307281494,
"step": 2100
},
{
"epoch": 275.0,
"grad_norm": 0.21932683885097504,
"learning_rate": 0.00026711612903225805,
"loss": 0.045755772590637206,
"step": 2200
},
{
"epoch": 287.5,
"grad_norm": 0.20013022422790527,
"learning_rate": 0.0002651806451612903,
"loss": 0.04000330924987793,
"step": 2300
},
{
"epoch": 300.0,
"grad_norm": 0.16391867399215698,
"learning_rate": 0.0002632451612903226,
"loss": 0.03490618944168091,
"step": 2400
},
{
"epoch": 312.5,
"grad_norm": 0.19230681657791138,
"learning_rate": 0.0002613096774193548,
"loss": 0.031311240196228024,
"step": 2500
},
{
"epoch": 325.0,
"grad_norm": 0.1750553548336029,
"learning_rate": 0.00025937419354838705,
"loss": 0.02794300317764282,
"step": 2600
},
{
"epoch": 337.5,
"grad_norm": 0.2386818677186966,
"learning_rate": 0.00025743870967741934,
"loss": 0.02533245801925659,
"step": 2700
},
{
"epoch": 350.0,
"grad_norm": 0.15160086750984192,
"learning_rate": 0.00025550322580645163,
"loss": 0.023450531959533692,
"step": 2800
},
{
"epoch": 362.5,
"grad_norm": 0.15656723082065582,
"learning_rate": 0.00025356774193548387,
"loss": 0.02200608015060425,
"step": 2900
},
{
"epoch": 375.0,
"grad_norm": 0.14112432301044464,
"learning_rate": 0.0002516322580645161,
"loss": 0.020031318664550782,
"step": 3000
},
{
"epoch": 387.5,
"grad_norm": 0.12787914276123047,
"learning_rate": 0.00024969677419354834,
"loss": 0.01899993300437927,
"step": 3100
},
{
"epoch": 400.0,
"grad_norm": 0.1528688669204712,
"learning_rate": 0.00024776129032258063,
"loss": 0.01810125231742859,
"step": 3200
},
{
"epoch": 412.5,
"grad_norm": 0.15528737008571625,
"learning_rate": 0.00024582580645161287,
"loss": 0.016684828996658324,
"step": 3300
},
{
"epoch": 425.0,
"grad_norm": 0.1281791627407074,
"learning_rate": 0.00024389032258064514,
"loss": 0.015172331333160401,
"step": 3400
},
{
"epoch": 437.5,
"grad_norm": 0.11617272347211838,
"learning_rate": 0.0002419548387096774,
"loss": 0.01434700846672058,
"step": 3500
},
{
"epoch": 450.0,
"grad_norm": 0.11877749860286713,
"learning_rate": 0.00024001935483870966,
"loss": 0.01436853289604187,
"step": 3600
},
{
"epoch": 462.5,
"grad_norm": 0.11250139772891998,
"learning_rate": 0.00023808387096774193,
"loss": 0.013647955656051636,
"step": 3700
},
{
"epoch": 475.0,
"grad_norm": 0.12692750990390778,
"learning_rate": 0.00023614838709677417,
"loss": 0.012936822175979613,
"step": 3800
},
{
"epoch": 487.5,
"grad_norm": 0.08776593208312988,
"learning_rate": 0.00023421290322580643,
"loss": 0.01205775499343872,
"step": 3900
},
{
"epoch": 500.0,
"grad_norm": 0.08575516194105148,
"learning_rate": 0.00023227741935483867,
"loss": 0.012118096351623536,
"step": 4000
},
{
"epoch": 512.5,
"grad_norm": 0.11763694882392883,
"learning_rate": 0.00023034193548387093,
"loss": 0.010872763395309449,
"step": 4100
},
{
"epoch": 525.0,
"grad_norm": 0.11833110451698303,
"learning_rate": 0.00022840645161290322,
"loss": 0.010899600982666015,
"step": 4200
},
{
"epoch": 537.5,
"grad_norm": 0.11374954879283905,
"learning_rate": 0.00022647096774193546,
"loss": 0.010327227115631103,
"step": 4300
},
{
"epoch": 550.0,
"grad_norm": 0.10840512067079544,
"learning_rate": 0.00022453548387096773,
"loss": 0.010270411968231202,
"step": 4400
},
{
"epoch": 562.5,
"grad_norm": 0.07199712842702866,
"learning_rate": 0.0002226,
"loss": 0.009772901535034179,
"step": 4500
},
{
"epoch": 575.0,
"grad_norm": 0.15016108751296997,
"learning_rate": 0.00022066451612903223,
"loss": 0.009401602745056152,
"step": 4600
},
{
"epoch": 587.5,
"grad_norm": 0.08698810636997223,
"learning_rate": 0.0002187290322580645,
"loss": 0.00952852725982666,
"step": 4700
},
{
"epoch": 600.0,
"grad_norm": 0.11057093739509583,
"learning_rate": 0.00021679354838709678,
"loss": 0.008922239542007446,
"step": 4800
},
{
"epoch": 612.5,
"grad_norm": 0.11917728185653687,
"learning_rate": 0.00021485806451612902,
"loss": 0.008766108751296997,
"step": 4900
},
{
"epoch": 625.0,
"grad_norm": 0.07486002892255783,
"learning_rate": 0.00021292258064516128,
"loss": 0.008633826971054076,
"step": 5000
},
{
"epoch": 637.5,
"grad_norm": 0.11766602843999863,
"learning_rate": 0.00021098709677419352,
"loss": 0.008536132574081421,
"step": 5100
},
{
"epoch": 650.0,
"grad_norm": 0.0582246296107769,
"learning_rate": 0.00020905161290322579,
"loss": 0.008086669445037841,
"step": 5200
},
{
"epoch": 662.5,
"grad_norm": 0.0658862367272377,
"learning_rate": 0.00020711612903225805,
"loss": 0.007744500637054444,
"step": 5300
},
{
"epoch": 675.0,
"grad_norm": 0.10022356361150742,
"learning_rate": 0.0002051806451612903,
"loss": 0.007699260115623474,
"step": 5400
},
{
"epoch": 687.5,
"grad_norm": 0.12475644052028656,
"learning_rate": 0.00020324516129032258,
"loss": 0.0076804465055465695,
"step": 5500
},
{
"epoch": 700.0,
"grad_norm": 0.12272350490093231,
"learning_rate": 0.00020130967741935484,
"loss": 0.007605299353599548,
"step": 5600
},
{
"epoch": 712.5,
"grad_norm": 0.08131624013185501,
"learning_rate": 0.00019937419354838708,
"loss": 0.0075324904918670655,
"step": 5700
},
{
"epoch": 725.0,
"grad_norm": 0.12169747799634933,
"learning_rate": 0.00019743870967741935,
"loss": 0.0071774739027023315,
"step": 5800
},
{
"epoch": 737.5,
"grad_norm": 0.05529671907424927,
"learning_rate": 0.00019550322580645158,
"loss": 0.007025536298751831,
"step": 5900
},
{
"epoch": 750.0,
"grad_norm": 0.07039643824100494,
"learning_rate": 0.00019356774193548385,
"loss": 0.006921111941337586,
"step": 6000
}
],
"logging_steps": 100,
"max_steps": 16000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2000,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7074582945792000.0,
"train_batch_size": 125,
"trial_name": null,
"trial_params": null
}