clean-baseline / trainer_state.json

Epochs: 6000, loss: 0.052, time: 1.5 hrs, config: baseline

fd4a5ce verified 2 months ago

11.2 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 750.0,
	"eval_steps": 100,
	"global_step": 6000,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 12.5,
	"grad_norm": 0.9450863599777222,
	"learning_rate": 5.94e-05,
	"loss": 3.582935485839844,
	"step": 100
	},
	{
	"epoch": 25.0,
	"grad_norm": 0.19262371957302094,
	"learning_rate": 0.0001194,
	"loss": 0.49665924072265627,
	"step": 200
	},
	{
	"epoch": 37.5,
	"grad_norm": 0.15991109609603882,
	"learning_rate": 0.00017939999999999997,
	"loss": 0.4021767044067383,
	"step": 300
	},
	{
	"epoch": 50.0,
	"grad_norm": 0.18609966337680817,
	"learning_rate": 0.0002394,
	"loss": 0.37090412139892576,
	"step": 400
	},
	{
	"epoch": 62.5,
	"grad_norm": 0.24805887043476105,
	"learning_rate": 0.00029939999999999996,
	"loss": 0.3474049758911133,
	"step": 500
	},
	{
	"epoch": 75.0,
	"grad_norm": 0.17464160919189453,
	"learning_rate": 0.0002980838709677419,
	"loss": 0.3306478118896484,
	"step": 600
	},
	{
	"epoch": 87.5,
	"grad_norm": 0.24544841051101685,
	"learning_rate": 0.00029614838709677416,
	"loss": 0.3101034927368164,
	"step": 700
	},
	{
	"epoch": 100.0,
	"grad_norm": 0.2018628716468811,
	"learning_rate": 0.00029421290322580645,
	"loss": 0.2937062835693359,
	"step": 800
	},
	{
	"epoch": 112.5,
	"grad_norm": 0.2136959582567215,
	"learning_rate": 0.0002922774193548387,
	"loss": 0.2772307777404785,
	"step": 900
	},
	{
	"epoch": 125.0,
	"grad_norm": 0.23597952723503113,
	"learning_rate": 0.0002903419354838709,
	"loss": 0.258614501953125,
	"step": 1000
	},
	{
	"epoch": 137.5,
	"grad_norm": 0.30438488721847534,
	"learning_rate": 0.0002884064516129032,
	"loss": 0.2398568344116211,
	"step": 1100
	},
	{
	"epoch": 150.0,
	"grad_norm": 0.27026715874671936,
	"learning_rate": 0.00028647096774193546,
	"loss": 0.21622713088989257,
	"step": 1200
	},
	{
	"epoch": 162.5,
	"grad_norm": 0.2623114287853241,
	"learning_rate": 0.0002845354838709677,
	"loss": 0.19229209899902344,
	"step": 1300
	},
	{
	"epoch": 175.0,
	"grad_norm": 0.34945833683013916,
	"learning_rate": 0.0002826,
	"loss": 0.16757678985595703,
	"step": 1400
	},
	{
	"epoch": 187.5,
	"grad_norm": 0.29883235692977905,
	"learning_rate": 0.0002806645161290322,
	"loss": 0.14341635704040528,
	"step": 1500
	},
	{
	"epoch": 200.0,
	"grad_norm": 0.31376898288726807,
	"learning_rate": 0.0002787290322580645,
	"loss": 0.1221920394897461,
	"step": 1600
	},
	{
	"epoch": 212.5,
	"grad_norm": 0.28367292881011963,
	"learning_rate": 0.00027679354838709675,
	"loss": 0.1032716178894043,
	"step": 1700
	},
	{
	"epoch": 225.0,
	"grad_norm": 0.2790682315826416,
	"learning_rate": 0.000274858064516129,
	"loss": 0.08668439865112304,
	"step": 1800
	},
	{
	"epoch": 237.5,
	"grad_norm": 0.2293432205915451,
	"learning_rate": 0.0002729225806451613,
	"loss": 0.0720753002166748,
	"step": 1900
	},
	{
	"epoch": 250.0,
	"grad_norm": 0.27616050839424133,
	"learning_rate": 0.0002709870967741935,
	"loss": 0.062033796310424806,
	"step": 2000
	},
	{
	"epoch": 262.5,
	"grad_norm": 0.2692248225212097,
	"learning_rate": 0.0002690516129032258,
	"loss": 0.05264517307281494,
	"step": 2100
	},
	{
	"epoch": 275.0,
	"grad_norm": 0.21932683885097504,
	"learning_rate": 0.00026711612903225805,
	"loss": 0.045755772590637206,
	"step": 2200
	},
	{
	"epoch": 287.5,
	"grad_norm": 0.20013022422790527,
	"learning_rate": 0.0002651806451612903,
	"loss": 0.04000330924987793,
	"step": 2300
	},
	{
	"epoch": 300.0,
	"grad_norm": 0.16391867399215698,
	"learning_rate": 0.0002632451612903226,
	"loss": 0.03490618944168091,
	"step": 2400
	},
	{
	"epoch": 312.5,
	"grad_norm": 0.19230681657791138,
	"learning_rate": 0.0002613096774193548,
	"loss": 0.031311240196228024,
	"step": 2500
	},
	{
	"epoch": 325.0,
	"grad_norm": 0.1750553548336029,
	"learning_rate": 0.00025937419354838705,
	"loss": 0.02794300317764282,
	"step": 2600
	},
	{
	"epoch": 337.5,
	"grad_norm": 0.2386818677186966,
	"learning_rate": 0.00025743870967741934,
	"loss": 0.02533245801925659,
	"step": 2700
	},
	{
	"epoch": 350.0,
	"grad_norm": 0.15160086750984192,
	"learning_rate": 0.00025550322580645163,
	"loss": 0.023450531959533692,
	"step": 2800
	},
	{
	"epoch": 362.5,
	"grad_norm": 0.15656723082065582,
	"learning_rate": 0.00025356774193548387,
	"loss": 0.02200608015060425,
	"step": 2900
	},
	{
	"epoch": 375.0,
	"grad_norm": 0.14112432301044464,
	"learning_rate": 0.0002516322580645161,
	"loss": 0.020031318664550782,
	"step": 3000
	},
	{
	"epoch": 387.5,
	"grad_norm": 0.12787914276123047,
	"learning_rate": 0.00024969677419354834,
	"loss": 0.01899993300437927,
	"step": 3100
	},
	{
	"epoch": 400.0,
	"grad_norm": 0.1528688669204712,
	"learning_rate": 0.00024776129032258063,
	"loss": 0.01810125231742859,
	"step": 3200
	},
	{
	"epoch": 412.5,
	"grad_norm": 0.15528737008571625,
	"learning_rate": 0.00024582580645161287,
	"loss": 0.016684828996658324,
	"step": 3300
	},
	{
	"epoch": 425.0,
	"grad_norm": 0.1281791627407074,
	"learning_rate": 0.00024389032258064514,
	"loss": 0.015172331333160401,
	"step": 3400
	},
	{
	"epoch": 437.5,
	"grad_norm": 0.11617272347211838,
	"learning_rate": 0.0002419548387096774,
	"loss": 0.01434700846672058,
	"step": 3500
	},
	{
	"epoch": 450.0,
	"grad_norm": 0.11877749860286713,
	"learning_rate": 0.00024001935483870966,
	"loss": 0.01436853289604187,
	"step": 3600
	},
	{
	"epoch": 462.5,
	"grad_norm": 0.11250139772891998,
	"learning_rate": 0.00023808387096774193,
	"loss": 0.013647955656051636,
	"step": 3700
	},
	{
	"epoch": 475.0,
	"grad_norm": 0.12692750990390778,
	"learning_rate": 0.00023614838709677417,
	"loss": 0.012936822175979613,
	"step": 3800
	},
	{
	"epoch": 487.5,
	"grad_norm": 0.08776593208312988,
	"learning_rate": 0.00023421290322580643,
	"loss": 0.01205775499343872,
	"step": 3900
	},
	{
	"epoch": 500.0,
	"grad_norm": 0.08575516194105148,
	"learning_rate": 0.00023227741935483867,
	"loss": 0.012118096351623536,
	"step": 4000
	},
	{
	"epoch": 512.5,
	"grad_norm": 0.11763694882392883,
	"learning_rate": 0.00023034193548387093,
	"loss": 0.010872763395309449,
	"step": 4100
	},
	{
	"epoch": 525.0,
	"grad_norm": 0.11833110451698303,
	"learning_rate": 0.00022840645161290322,
	"loss": 0.010899600982666015,
	"step": 4200
	},
	{
	"epoch": 537.5,
	"grad_norm": 0.11374954879283905,
	"learning_rate": 0.00022647096774193546,
	"loss": 0.010327227115631103,
	"step": 4300
	},
	{
	"epoch": 550.0,
	"grad_norm": 0.10840512067079544,
	"learning_rate": 0.00022453548387096773,
	"loss": 0.010270411968231202,
	"step": 4400
	},
	{
	"epoch": 562.5,
	"grad_norm": 0.07199712842702866,
	"learning_rate": 0.0002226,
	"loss": 0.009772901535034179,
	"step": 4500
	},
	{
	"epoch": 575.0,
	"grad_norm": 0.15016108751296997,
	"learning_rate": 0.00022066451612903223,
	"loss": 0.009401602745056152,
	"step": 4600
	},
	{
	"epoch": 587.5,
	"grad_norm": 0.08698810636997223,
	"learning_rate": 0.0002187290322580645,
	"loss": 0.00952852725982666,
	"step": 4700
	},
	{
	"epoch": 600.0,
	"grad_norm": 0.11057093739509583,
	"learning_rate": 0.00021679354838709678,
	"loss": 0.008922239542007446,
	"step": 4800
	},
	{
	"epoch": 612.5,
	"grad_norm": 0.11917728185653687,
	"learning_rate": 0.00021485806451612902,
	"loss": 0.008766108751296997,
	"step": 4900
	},
	{
	"epoch": 625.0,
	"grad_norm": 0.07486002892255783,
	"learning_rate": 0.00021292258064516128,
	"loss": 0.008633826971054076,
	"step": 5000
	},
	{
	"epoch": 637.5,
	"grad_norm": 0.11766602843999863,
	"learning_rate": 0.00021098709677419352,
	"loss": 0.008536132574081421,
	"step": 5100
	},
	{
	"epoch": 650.0,
	"grad_norm": 0.0582246296107769,
	"learning_rate": 0.00020905161290322579,
	"loss": 0.008086669445037841,
	"step": 5200
	},
	{
	"epoch": 662.5,
	"grad_norm": 0.0658862367272377,
	"learning_rate": 0.00020711612903225805,
	"loss": 0.007744500637054444,
	"step": 5300
	},
	{
	"epoch": 675.0,
	"grad_norm": 0.10022356361150742,
	"learning_rate": 0.0002051806451612903,
	"loss": 0.007699260115623474,
	"step": 5400
	},
	{
	"epoch": 687.5,
	"grad_norm": 0.12475644052028656,
	"learning_rate": 0.00020324516129032258,
	"loss": 0.0076804465055465695,
	"step": 5500
	},
	{
	"epoch": 700.0,
	"grad_norm": 0.12272350490093231,
	"learning_rate": 0.00020130967741935484,
	"loss": 0.007605299353599548,
	"step": 5600
	},
	{
	"epoch": 712.5,
	"grad_norm": 0.08131624013185501,
	"learning_rate": 0.00019937419354838708,
	"loss": 0.0075324904918670655,
	"step": 5700
	},
	{
	"epoch": 725.0,
	"grad_norm": 0.12169747799634933,
	"learning_rate": 0.00019743870967741935,
	"loss": 0.0071774739027023315,
	"step": 5800
	},
	{
	"epoch": 737.5,
	"grad_norm": 0.05529671907424927,
	"learning_rate": 0.00019550322580645158,
	"loss": 0.007025536298751831,
	"step": 5900
	},
	{
	"epoch": 750.0,
	"grad_norm": 0.07039643824100494,
	"learning_rate": 0.00019356774193548385,
	"loss": 0.006921111941337586,
	"step": 6000
	}
	],
	"logging_steps": 100,
	"max_steps": 16000,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 2000,
	"save_steps": 1000,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 7074582945792000.0,
	"train_batch_size": 125,
	"trial_name": null,
	"trial_params": null
	}