GPokeT2 / trainer_state.json

Upload model v0.1-wip-4200

69e558e verified about 1 month ago

16.3 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 11.444444444444445,
	"eval_steps": 500,
	"global_step": 4200,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.136332651670075,
	"grad_norm": 11.269988059997559,
	"learning_rate": 0.00010652173913043477,
	"loss": 39.8198193359375,
	"step": 50
	},
	{
	"epoch": 0.27266530334015,
	"grad_norm": 14.962249755859375,
	"learning_rate": 0.0002152173913043478,
	"loss": 26.48206298828125,
	"step": 100
	},
	{
	"epoch": 0.40899795501022496,
	"grad_norm": 16.134239196777344,
	"learning_rate": 0.0003239130434782608,
	"loss": 19.36090087890625,
	"step": 150
	},
	{
	"epoch": 0.5453306066803,
	"grad_norm": 12.964629173278809,
	"learning_rate": 0.00043260869565217385,
	"loss": 14.5181689453125,
	"step": 200
	},
	{
	"epoch": 0.6816632583503749,
	"grad_norm": 10.061964988708496,
	"learning_rate": 0.0005413043478260869,
	"loss": 12.766522216796876,
	"step": 250
	},
	{
	"epoch": 0.8179959100204499,
	"grad_norm": 6.430169105529785,
	"learning_rate": 0.0005999713580566041,
	"loss": 11.881512451171876,
	"step": 300
	},
	{
	"epoch": 0.9543285616905249,
	"grad_norm": 5.836061954498291,
	"learning_rate": 0.0005997115106245061,
	"loss": 11.075633544921875,
	"step": 350
	},
	{
	"epoch": 1.0899795501022496,
	"grad_norm": 4.700469017028809,
	"learning_rate": 0.000599181221756225,
	"loss": 10.015964965820313,
	"step": 400
	},
	{
	"epoch": 1.2263122017723245,
	"grad_norm": 4.282314777374268,
	"learning_rate": 0.0005983809699521793,
	"loss": 9.592711181640626,
	"step": 450
	},
	{
	"epoch": 1.3626448534423994,
	"grad_norm": 3.662461280822754,
	"learning_rate": 0.0005973114773109183,
	"loss": 9.211348266601563,
	"step": 500
	},
	{
	"epoch": 1.4989775051124745,
	"grad_norm": 3.783862590789795,
	"learning_rate": 0.0005959737088775463,
	"loss": 8.782565307617187,
	"step": 550
	},
	{
	"epoch": 1.6353101567825494,
	"grad_norm": 3.429069757461548,
	"learning_rate": 0.0005943688717729229,
	"loss": 8.386593627929688,
	"step": 600
	},
	{
	"epoch": 1.7716428084526243,
	"grad_norm": 2.529482841491699,
	"learning_rate": 0.0005924984141044315,
	"loss": 8.06916259765625,
	"step": 650
	},
	{
	"epoch": 1.9079754601226995,
	"grad_norm": 2.4761486053466797,
	"learning_rate": 0.0005903640236592949,
	"loss": 7.7736474609375,
	"step": 700
	},
	{
	"epoch": 2.043626448534424,
	"grad_norm": 2.3675243854522705,
	"learning_rate": 0.0005879676263816192,
	"loss": 7.4699859619140625,
	"step": 750
	},
	{
	"epoch": 2.179959100204499,
	"grad_norm": 2.305575132369995,
	"learning_rate": 0.0005853113846345384,
	"loss": 7.305302734375,
	"step": 800
	},
	{
	"epoch": 2.316291751874574,
	"grad_norm": 2.3954548835754395,
	"learning_rate": 0.0005823976952490298,
	"loss": 7.102890625,
	"step": 850
	},
	{
	"epoch": 2.452624403544649,
	"grad_norm": 2.2483468055725098,
	"learning_rate": 0.0005792291873611596,
	"loss": 6.9435498046875,
	"step": 900
	},
	{
	"epoch": 2.588957055214724,
	"grad_norm": 2.0154542922973633,
	"learning_rate": 0.00057580872003971,
	"loss": 6.761861572265625,
	"step": 950
	},
	{
	"epoch": 2.7252897068847988,
	"grad_norm": 2.155539035797119,
	"learning_rate": 0.00057213937970633,
	"loss": 6.58298583984375,
	"step": 1000
	},
	{
	"epoch": 2.861622358554874,
	"grad_norm": 1.7670893669128418,
	"learning_rate": 0.0005682244773505363,
	"loss": 6.419741821289063,
	"step": 1050
	},
	{
	"epoch": 2.997955010224949,
	"grad_norm": 2.3611533641815186,
	"learning_rate": 0.0005640675455420765,
	"loss": 6.288121948242187,
	"step": 1100
	},
	{
	"epoch": 3.1336059986366736,
	"grad_norm": 1.8783221244812012,
	"learning_rate": 0.0005596723352433551,
	"loss": 6.090737915039062,
	"step": 1150
	},
	{
	"epoch": 3.2699386503067487,
	"grad_norm": 2.2795541286468506,
	"learning_rate": 0.0005550428124247912,
	"loss": 5.979439086914063,
	"step": 1200
	},
	{
	"epoch": 3.4062713019768234,
	"grad_norm": 1.3074142932891846,
	"learning_rate": 0.0005501831544861696,
	"loss": 5.897046508789063,
	"step": 1250
	},
	{
	"epoch": 3.5426039536468985,
	"grad_norm": 1.259432077407837,
	"learning_rate": 0.0005450977464872081,
	"loss": 5.734913940429688,
	"step": 1300
	},
	{
	"epoch": 3.6789366053169736,
	"grad_norm": 1.4511066675186157,
	"learning_rate": 0.0005397911771907473,
	"loss": 5.604786987304688,
	"step": 1350
	},
	{
	"epoch": 3.8152692569870483,
	"grad_norm": 1.428958535194397,
	"learning_rate": 0.0005342682349221297,
	"loss": 5.445667114257812,
	"step": 1400
	},
	{
	"epoch": 3.9516019086571235,
	"grad_norm": 1.2921603918075562,
	"learning_rate": 0.000528533903248506,
	"loss": 5.391282958984375,
	"step": 1450
	},
	{
	"epoch": 4.087252897068848,
	"grad_norm": 1.3616167306900024,
	"learning_rate": 0.0005225933564819676,
	"loss": 5.183615112304688,
	"step": 1500
	},
	{
	"epoch": 4.223585548738923,
	"grad_norm": 1.2340154647827148,
	"learning_rate": 0.0005164519550105623,
	"loss": 5.060681457519531,
	"step": 1550
	},
	{
	"epoch": 4.359918200408998,
	"grad_norm": 1.5526384115219116,
	"learning_rate": 0.0005101152404614052,
	"loss": 4.902400817871094,
	"step": 1600
	},
	{
	"epoch": 4.496250852079073,
	"grad_norm": 1.4958291053771973,
	"learning_rate": 0.0005035889307002529,
	"loss": 4.787099304199219,
	"step": 1650
	},
	{
	"epoch": 4.632583503749148,
	"grad_norm": 1.4236118793487549,
	"learning_rate": 0.0004968789146720478,
	"loss": 4.660638427734375,
	"step": 1700
	},
	{
	"epoch": 4.768916155419223,
	"grad_norm": 1.3592256307601929,
	"learning_rate": 0.0004899912470870939,
	"loss": 4.454691162109375,
	"step": 1750
	},
	{
	"epoch": 4.905248807089298,
	"grad_norm": 1.502578616142273,
	"learning_rate": 0.00048293214295765303,
	"loss": 4.297479553222656,
	"step": 1800
	},
	{
	"epoch": 5.040899795501023,
	"grad_norm": 1.7106261253356934,
	"learning_rate": 0.0004757079719898968,
	"loss": 4.13409423828125,
	"step": 1850
	},
	{
	"epoch": 5.1772324471710975,
	"grad_norm": 1.2559808492660522,
	"learning_rate": 0.00046832525283627114,
	"loss": 3.96047607421875,
	"step": 1900
	},
	{
	"epoch": 5.313565098841172,
	"grad_norm": 1.2694923877716064,
	"learning_rate": 0.0004607906472134603,
	"loss": 3.8196981811523436,
	"step": 1950
	},
	{
	"epoch": 5.449897750511248,
	"grad_norm": 1.6992137432098389,
	"learning_rate": 0.0004531109538912596,
	"loss": 3.6628662109375,
	"step": 2000
	},
	{
	"epoch": 5.5862304021813225,
	"grad_norm": 1.453190803527832,
	"learning_rate": 0.00044529310255777855,
	"loss": 3.52033935546875,
	"step": 2050
	},
	{
	"epoch": 5.722563053851397,
	"grad_norm": 1.504873514175415,
	"learning_rate": 0.0004373441475665124,
	"loss": 3.3988775634765624,
	"step": 2100
	},
	{
	"epoch": 5.858895705521473,
	"grad_norm": 1.4465556144714355,
	"learning_rate": 0.00042927126157092204,
	"loss": 3.2702841186523437,
	"step": 2150
	},
	{
	"epoch": 5.9952283571915475,
	"grad_norm": 1.4014344215393066,
	"learning_rate": 0.0004210817290522684,
	"loss": 3.1291094970703126,
	"step": 2200
	},
	{
	"epoch": 6.130879345603272,
	"grad_norm": 1.645821213722229,
	"learning_rate": 0.00041278293974653904,
	"loss": 2.936179504394531,
	"step": 2250
	},
	{
	"epoch": 6.267211997273347,
	"grad_norm": 1.6427346467971802,
	"learning_rate": 0.00040438238197640066,
	"loss": 2.857735900878906,
	"step": 2300
	},
	{
	"epoch": 6.403544648943422,
	"grad_norm": 1.66471529006958,
	"learning_rate": 0.00039588763589419156,
	"loss": 2.748570556640625,
	"step": 2350
	},
	{
	"epoch": 6.539877300613497,
	"grad_norm": 1.5286723375320435,
	"learning_rate": 0.0003873063666420535,
	"loss": 2.6635064697265625,
	"step": 2400
	},
	{
	"epoch": 6.676209952283572,
	"grad_norm": 1.5245234966278076,
	"learning_rate": 0.00037864631743537395,
	"loss": 2.556291046142578,
	"step": 2450
	},
	{
	"epoch": 6.812542603953647,
	"grad_norm": 1.4000400304794312,
	"learning_rate": 0.000369915302575779,
	"loss": 2.4817964172363283,
	"step": 2500
	},
	{
	"epoch": 6.948875255623722,
	"grad_norm": 1.3968268632888794,
	"learning_rate": 0.00036112120039998323,
	"loss": 2.362508087158203,
	"step": 2550
	},
	{
	"epoch": 7.084526244035446,
	"grad_norm": 1.6692546606063843,
	"learning_rate": 0.0003522719461708582,
	"loss": 2.273824005126953,
	"step": 2600
	},
	{
	"epoch": 7.220858895705521,
	"grad_norm": 1.4489309787750244,
	"learning_rate": 0.00034337552491713324,
	"loss": 2.1658897399902344,
	"step": 2650
	},
	{
	"epoch": 7.357191547375597,
	"grad_norm": 1.5687353610992432,
	"learning_rate": 0.00033443996422819145,
	"loss": 2.108182220458984,
	"step": 2700
	},
	{
	"epoch": 7.493524199045671,
	"grad_norm": 1.6350905895233154,
	"learning_rate": 0.00032547332701046195,
	"loss": 1.99987060546875,
	"step": 2750
	},
	{
	"epoch": 7.629856850715746,
	"grad_norm": 1.5019129514694214,
	"learning_rate": 0.0003164837042119428,
	"loss": 1.9454510498046875,
	"step": 2800
	},
	{
	"epoch": 7.766189502385822,
	"grad_norm": 1.4423465728759766,
	"learning_rate": 0.00030747920752142186,
	"loss": 1.9158531188964845,
	"step": 2850
	},
	{
	"epoch": 7.902522154055896,
	"grad_norm": 1.5868362188339233,
	"learning_rate": 0.0002984679620489827,
	"loss": 1.8568917846679687,
	"step": 2900
	},
	{
	"epoch": 8.03817314246762,
	"grad_norm": 1.7355551719665527,
	"learning_rate": 0.0002894580989943989,
	"loss": 1.7664053344726562,
	"step": 2950
	},
	{
	"epoch": 8.174505794137696,
	"grad_norm": 1.4344327449798584,
	"learning_rate": 0.0002804577483100344,
	"loss": 1.6748054504394532,
	"step": 3000
	},
	{
	"epoch": 8.310838445807772,
	"grad_norm": 1.6083476543426514,
	"learning_rate": 0.00027147503136486895,
	"loss": 1.6389869689941405,
	"step": 3050
	},
	{
	"epoch": 8.447171097477845,
	"grad_norm": 1.412381649017334,
	"learning_rate": 0.0002625180536162685,
	"loss": 1.6107588195800782,
	"step": 3100
	},
	{
	"epoch": 8.583503749147921,
	"grad_norm": 1.4404499530792236,
	"learning_rate": 0.00025359489729611366,
	"loss": 1.558354034423828,
	"step": 3150
	},
	{
	"epoch": 8.719836400817996,
	"grad_norm": 1.394539713859558,
	"learning_rate": 0.0002447136141178857,
	"loss": 1.5231396484375,
	"step": 3200
	},
	{
	"epoch": 8.85616905248807,
	"grad_norm": 1.4844084978103638,
	"learning_rate": 0.00023588221801128917,
	"loss": 1.4771731567382813,
	"step": 3250
	},
	{
	"epoch": 8.992501704158146,
	"grad_norm": 1.3957374095916748,
	"learning_rate": 0.0002271086778909701,
	"loss": 1.4401710510253907,
	"step": 3300
	},
	{
	"epoch": 9.12815269256987,
	"grad_norm": 1.4386154413223267,
	"learning_rate": 0.00021840091046585182,
	"loss": 1.3497396850585937,
	"step": 3350
	},
	{
	"epoch": 9.264485344239946,
	"grad_norm": 1.4959100484848022,
	"learning_rate": 0.000209766773095578,
	"loss": 1.3368931579589844,
	"step": 3400
	},
	{
	"epoch": 9.400817995910021,
	"grad_norm": 1.3249437808990479,
	"learning_rate": 0.00020121405670051008,
	"loss": 1.297091064453125,
	"step": 3450
	},
	{
	"epoch": 9.537150647580095,
	"grad_norm": 1.3749561309814453,
	"learning_rate": 0.00019275047873167374,
	"loss": 1.260106658935547,
	"step": 3500
	},
	{
	"epoch": 9.67348329925017,
	"grad_norm": 1.4010766744613647,
	"learning_rate": 0.0001843836762070014,
	"loss": 1.239128646850586,
	"step": 3550
	},
	{
	"epoch": 9.809815950920246,
	"grad_norm": 1.5308102369308472,
	"learning_rate": 0.00017612119882015126,
	"loss": 1.1977056121826173,
	"step": 3600
	},
	{
	"epoch": 9.94614860259032,
	"grad_norm": 1.3873751163482666,
	"learning_rate": 0.00016797050212812275,
	"loss": 1.1842040252685546,
	"step": 3650
	},
	{
	"epoch": 10.081799591002046,
	"grad_norm": 1.3666012287139893,
	"learning_rate": 0.00015993894082381616,
	"loss": 1.1095658111572266,
	"step": 3700
	},
	{
	"epoch": 10.21813224267212,
	"grad_norm": 1.3528972864151,
	"learning_rate": 0.00015203376209960474,
	"loss": 1.103120346069336,
	"step": 3750
	},
	{
	"epoch": 10.354464894342195,
	"grad_norm": 1.3081281185150146,
	"learning_rate": 0.00014426209910790887,
	"loss": 1.0691104125976563,
	"step": 3800
	},
	{
	"epoch": 10.49079754601227,
	"grad_norm": 1.3515572547912598,
	"learning_rate": 0.00013663096452467343,
	"loss": 1.0644143676757813,
	"step": 3850
	},
	{
	"epoch": 10.627130197682344,
	"grad_norm": 1.2935131788253784,
	"learning_rate": 0.00012914724422155598,
	"loss": 1.0334495544433593,
	"step": 3900
	},
	{
	"epoch": 10.76346284935242,
	"grad_norm": 1.3209459781646729,
	"learning_rate": 0.00012181769105253435,
	"loss": 1.0103805541992188,
	"step": 3950
	},
	{
	"epoch": 10.899795501022496,
	"grad_norm": 1.324385643005371,
	"learning_rate": 0.00011464891876054252,
	"loss": 0.990460433959961,
	"step": 4000
	},
	{
	"epoch": 11.03544648943422,
	"grad_norm": 1.374879002571106,
	"learning_rate": 0.00010764739600963116,
	"loss": 0.9643755340576172,
	"step": 4050
	},
	{
	"epoch": 11.171779141104295,
	"grad_norm": 1.275993824005127,
	"learning_rate": 0.00010081944054803842,
	"loss": 0.936119155883789,
	"step": 4100
	},
	{
	"epoch": 11.308111792774369,
	"grad_norm": 1.2590258121490479,
	"learning_rate": 9.417121350743844e-05,
	"loss": 0.9281440734863281,
	"step": 4150
	},
	{
	"epoch": 11.444444444444445,
	"grad_norm": 1.2088381052017212,
	"learning_rate": 8.770871384351085e-05,
	"loss": 0.9070972442626953,
	"step": 4200
	}
	],
	"logging_steps": 50,
	"max_steps": 5505,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 15,
	"save_steps": 200,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 9.989600444122399e+17,
	"train_batch_size": 16,
	"trial_name": null,
	"trial_params": null
	}