GPokeT2 / trainer_state.json
iamthinbaker's picture
Upload model v0.1-wip-4200
69e558e verified
Raw
History Blame Contribute Delete
16.3 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 11.444444444444445,
"eval_steps": 500,
"global_step": 4200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.136332651670075,
"grad_norm": 11.269988059997559,
"learning_rate": 0.00010652173913043477,
"loss": 39.8198193359375,
"step": 50
},
{
"epoch": 0.27266530334015,
"grad_norm": 14.962249755859375,
"learning_rate": 0.0002152173913043478,
"loss": 26.48206298828125,
"step": 100
},
{
"epoch": 0.40899795501022496,
"grad_norm": 16.134239196777344,
"learning_rate": 0.0003239130434782608,
"loss": 19.36090087890625,
"step": 150
},
{
"epoch": 0.5453306066803,
"grad_norm": 12.964629173278809,
"learning_rate": 0.00043260869565217385,
"loss": 14.5181689453125,
"step": 200
},
{
"epoch": 0.6816632583503749,
"grad_norm": 10.061964988708496,
"learning_rate": 0.0005413043478260869,
"loss": 12.766522216796876,
"step": 250
},
{
"epoch": 0.8179959100204499,
"grad_norm": 6.430169105529785,
"learning_rate": 0.0005999713580566041,
"loss": 11.881512451171876,
"step": 300
},
{
"epoch": 0.9543285616905249,
"grad_norm": 5.836061954498291,
"learning_rate": 0.0005997115106245061,
"loss": 11.075633544921875,
"step": 350
},
{
"epoch": 1.0899795501022496,
"grad_norm": 4.700469017028809,
"learning_rate": 0.000599181221756225,
"loss": 10.015964965820313,
"step": 400
},
{
"epoch": 1.2263122017723245,
"grad_norm": 4.282314777374268,
"learning_rate": 0.0005983809699521793,
"loss": 9.592711181640626,
"step": 450
},
{
"epoch": 1.3626448534423994,
"grad_norm": 3.662461280822754,
"learning_rate": 0.0005973114773109183,
"loss": 9.211348266601563,
"step": 500
},
{
"epoch": 1.4989775051124745,
"grad_norm": 3.783862590789795,
"learning_rate": 0.0005959737088775463,
"loss": 8.782565307617187,
"step": 550
},
{
"epoch": 1.6353101567825494,
"grad_norm": 3.429069757461548,
"learning_rate": 0.0005943688717729229,
"loss": 8.386593627929688,
"step": 600
},
{
"epoch": 1.7716428084526243,
"grad_norm": 2.529482841491699,
"learning_rate": 0.0005924984141044315,
"loss": 8.06916259765625,
"step": 650
},
{
"epoch": 1.9079754601226995,
"grad_norm": 2.4761486053466797,
"learning_rate": 0.0005903640236592949,
"loss": 7.7736474609375,
"step": 700
},
{
"epoch": 2.043626448534424,
"grad_norm": 2.3675243854522705,
"learning_rate": 0.0005879676263816192,
"loss": 7.4699859619140625,
"step": 750
},
{
"epoch": 2.179959100204499,
"grad_norm": 2.305575132369995,
"learning_rate": 0.0005853113846345384,
"loss": 7.305302734375,
"step": 800
},
{
"epoch": 2.316291751874574,
"grad_norm": 2.3954548835754395,
"learning_rate": 0.0005823976952490298,
"loss": 7.102890625,
"step": 850
},
{
"epoch": 2.452624403544649,
"grad_norm": 2.2483468055725098,
"learning_rate": 0.0005792291873611596,
"loss": 6.9435498046875,
"step": 900
},
{
"epoch": 2.588957055214724,
"grad_norm": 2.0154542922973633,
"learning_rate": 0.00057580872003971,
"loss": 6.761861572265625,
"step": 950
},
{
"epoch": 2.7252897068847988,
"grad_norm": 2.155539035797119,
"learning_rate": 0.00057213937970633,
"loss": 6.58298583984375,
"step": 1000
},
{
"epoch": 2.861622358554874,
"grad_norm": 1.7670893669128418,
"learning_rate": 0.0005682244773505363,
"loss": 6.419741821289063,
"step": 1050
},
{
"epoch": 2.997955010224949,
"grad_norm": 2.3611533641815186,
"learning_rate": 0.0005640675455420765,
"loss": 6.288121948242187,
"step": 1100
},
{
"epoch": 3.1336059986366736,
"grad_norm": 1.8783221244812012,
"learning_rate": 0.0005596723352433551,
"loss": 6.090737915039062,
"step": 1150
},
{
"epoch": 3.2699386503067487,
"grad_norm": 2.2795541286468506,
"learning_rate": 0.0005550428124247912,
"loss": 5.979439086914063,
"step": 1200
},
{
"epoch": 3.4062713019768234,
"grad_norm": 1.3074142932891846,
"learning_rate": 0.0005501831544861696,
"loss": 5.897046508789063,
"step": 1250
},
{
"epoch": 3.5426039536468985,
"grad_norm": 1.259432077407837,
"learning_rate": 0.0005450977464872081,
"loss": 5.734913940429688,
"step": 1300
},
{
"epoch": 3.6789366053169736,
"grad_norm": 1.4511066675186157,
"learning_rate": 0.0005397911771907473,
"loss": 5.604786987304688,
"step": 1350
},
{
"epoch": 3.8152692569870483,
"grad_norm": 1.428958535194397,
"learning_rate": 0.0005342682349221297,
"loss": 5.445667114257812,
"step": 1400
},
{
"epoch": 3.9516019086571235,
"grad_norm": 1.2921603918075562,
"learning_rate": 0.000528533903248506,
"loss": 5.391282958984375,
"step": 1450
},
{
"epoch": 4.087252897068848,
"grad_norm": 1.3616167306900024,
"learning_rate": 0.0005225933564819676,
"loss": 5.183615112304688,
"step": 1500
},
{
"epoch": 4.223585548738923,
"grad_norm": 1.2340154647827148,
"learning_rate": 0.0005164519550105623,
"loss": 5.060681457519531,
"step": 1550
},
{
"epoch": 4.359918200408998,
"grad_norm": 1.5526384115219116,
"learning_rate": 0.0005101152404614052,
"loss": 4.902400817871094,
"step": 1600
},
{
"epoch": 4.496250852079073,
"grad_norm": 1.4958291053771973,
"learning_rate": 0.0005035889307002529,
"loss": 4.787099304199219,
"step": 1650
},
{
"epoch": 4.632583503749148,
"grad_norm": 1.4236118793487549,
"learning_rate": 0.0004968789146720478,
"loss": 4.660638427734375,
"step": 1700
},
{
"epoch": 4.768916155419223,
"grad_norm": 1.3592256307601929,
"learning_rate": 0.0004899912470870939,
"loss": 4.454691162109375,
"step": 1750
},
{
"epoch": 4.905248807089298,
"grad_norm": 1.502578616142273,
"learning_rate": 0.00048293214295765303,
"loss": 4.297479553222656,
"step": 1800
},
{
"epoch": 5.040899795501023,
"grad_norm": 1.7106261253356934,
"learning_rate": 0.0004757079719898968,
"loss": 4.13409423828125,
"step": 1850
},
{
"epoch": 5.1772324471710975,
"grad_norm": 1.2559808492660522,
"learning_rate": 0.00046832525283627114,
"loss": 3.96047607421875,
"step": 1900
},
{
"epoch": 5.313565098841172,
"grad_norm": 1.2694923877716064,
"learning_rate": 0.0004607906472134603,
"loss": 3.8196981811523436,
"step": 1950
},
{
"epoch": 5.449897750511248,
"grad_norm": 1.6992137432098389,
"learning_rate": 0.0004531109538912596,
"loss": 3.6628662109375,
"step": 2000
},
{
"epoch": 5.5862304021813225,
"grad_norm": 1.453190803527832,
"learning_rate": 0.00044529310255777855,
"loss": 3.52033935546875,
"step": 2050
},
{
"epoch": 5.722563053851397,
"grad_norm": 1.504873514175415,
"learning_rate": 0.0004373441475665124,
"loss": 3.3988775634765624,
"step": 2100
},
{
"epoch": 5.858895705521473,
"grad_norm": 1.4465556144714355,
"learning_rate": 0.00042927126157092204,
"loss": 3.2702841186523437,
"step": 2150
},
{
"epoch": 5.9952283571915475,
"grad_norm": 1.4014344215393066,
"learning_rate": 0.0004210817290522684,
"loss": 3.1291094970703126,
"step": 2200
},
{
"epoch": 6.130879345603272,
"grad_norm": 1.645821213722229,
"learning_rate": 0.00041278293974653904,
"loss": 2.936179504394531,
"step": 2250
},
{
"epoch": 6.267211997273347,
"grad_norm": 1.6427346467971802,
"learning_rate": 0.00040438238197640066,
"loss": 2.857735900878906,
"step": 2300
},
{
"epoch": 6.403544648943422,
"grad_norm": 1.66471529006958,
"learning_rate": 0.00039588763589419156,
"loss": 2.748570556640625,
"step": 2350
},
{
"epoch": 6.539877300613497,
"grad_norm": 1.5286723375320435,
"learning_rate": 0.0003873063666420535,
"loss": 2.6635064697265625,
"step": 2400
},
{
"epoch": 6.676209952283572,
"grad_norm": 1.5245234966278076,
"learning_rate": 0.00037864631743537395,
"loss": 2.556291046142578,
"step": 2450
},
{
"epoch": 6.812542603953647,
"grad_norm": 1.4000400304794312,
"learning_rate": 0.000369915302575779,
"loss": 2.4817964172363283,
"step": 2500
},
{
"epoch": 6.948875255623722,
"grad_norm": 1.3968268632888794,
"learning_rate": 0.00036112120039998323,
"loss": 2.362508087158203,
"step": 2550
},
{
"epoch": 7.084526244035446,
"grad_norm": 1.6692546606063843,
"learning_rate": 0.0003522719461708582,
"loss": 2.273824005126953,
"step": 2600
},
{
"epoch": 7.220858895705521,
"grad_norm": 1.4489309787750244,
"learning_rate": 0.00034337552491713324,
"loss": 2.1658897399902344,
"step": 2650
},
{
"epoch": 7.357191547375597,
"grad_norm": 1.5687353610992432,
"learning_rate": 0.00033443996422819145,
"loss": 2.108182220458984,
"step": 2700
},
{
"epoch": 7.493524199045671,
"grad_norm": 1.6350905895233154,
"learning_rate": 0.00032547332701046195,
"loss": 1.99987060546875,
"step": 2750
},
{
"epoch": 7.629856850715746,
"grad_norm": 1.5019129514694214,
"learning_rate": 0.0003164837042119428,
"loss": 1.9454510498046875,
"step": 2800
},
{
"epoch": 7.766189502385822,
"grad_norm": 1.4423465728759766,
"learning_rate": 0.00030747920752142186,
"loss": 1.9158531188964845,
"step": 2850
},
{
"epoch": 7.902522154055896,
"grad_norm": 1.5868362188339233,
"learning_rate": 0.0002984679620489827,
"loss": 1.8568917846679687,
"step": 2900
},
{
"epoch": 8.03817314246762,
"grad_norm": 1.7355551719665527,
"learning_rate": 0.0002894580989943989,
"loss": 1.7664053344726562,
"step": 2950
},
{
"epoch": 8.174505794137696,
"grad_norm": 1.4344327449798584,
"learning_rate": 0.0002804577483100344,
"loss": 1.6748054504394532,
"step": 3000
},
{
"epoch": 8.310838445807772,
"grad_norm": 1.6083476543426514,
"learning_rate": 0.00027147503136486895,
"loss": 1.6389869689941405,
"step": 3050
},
{
"epoch": 8.447171097477845,
"grad_norm": 1.412381649017334,
"learning_rate": 0.0002625180536162685,
"loss": 1.6107588195800782,
"step": 3100
},
{
"epoch": 8.583503749147921,
"grad_norm": 1.4404499530792236,
"learning_rate": 0.00025359489729611366,
"loss": 1.558354034423828,
"step": 3150
},
{
"epoch": 8.719836400817996,
"grad_norm": 1.394539713859558,
"learning_rate": 0.0002447136141178857,
"loss": 1.5231396484375,
"step": 3200
},
{
"epoch": 8.85616905248807,
"grad_norm": 1.4844084978103638,
"learning_rate": 0.00023588221801128917,
"loss": 1.4771731567382813,
"step": 3250
},
{
"epoch": 8.992501704158146,
"grad_norm": 1.3957374095916748,
"learning_rate": 0.0002271086778909701,
"loss": 1.4401710510253907,
"step": 3300
},
{
"epoch": 9.12815269256987,
"grad_norm": 1.4386154413223267,
"learning_rate": 0.00021840091046585182,
"loss": 1.3497396850585937,
"step": 3350
},
{
"epoch": 9.264485344239946,
"grad_norm": 1.4959100484848022,
"learning_rate": 0.000209766773095578,
"loss": 1.3368931579589844,
"step": 3400
},
{
"epoch": 9.400817995910021,
"grad_norm": 1.3249437808990479,
"learning_rate": 0.00020121405670051008,
"loss": 1.297091064453125,
"step": 3450
},
{
"epoch": 9.537150647580095,
"grad_norm": 1.3749561309814453,
"learning_rate": 0.00019275047873167374,
"loss": 1.260106658935547,
"step": 3500
},
{
"epoch": 9.67348329925017,
"grad_norm": 1.4010766744613647,
"learning_rate": 0.0001843836762070014,
"loss": 1.239128646850586,
"step": 3550
},
{
"epoch": 9.809815950920246,
"grad_norm": 1.5308102369308472,
"learning_rate": 0.00017612119882015126,
"loss": 1.1977056121826173,
"step": 3600
},
{
"epoch": 9.94614860259032,
"grad_norm": 1.3873751163482666,
"learning_rate": 0.00016797050212812275,
"loss": 1.1842040252685546,
"step": 3650
},
{
"epoch": 10.081799591002046,
"grad_norm": 1.3666012287139893,
"learning_rate": 0.00015993894082381616,
"loss": 1.1095658111572266,
"step": 3700
},
{
"epoch": 10.21813224267212,
"grad_norm": 1.3528972864151,
"learning_rate": 0.00015203376209960474,
"loss": 1.103120346069336,
"step": 3750
},
{
"epoch": 10.354464894342195,
"grad_norm": 1.3081281185150146,
"learning_rate": 0.00014426209910790887,
"loss": 1.0691104125976563,
"step": 3800
},
{
"epoch": 10.49079754601227,
"grad_norm": 1.3515572547912598,
"learning_rate": 0.00013663096452467343,
"loss": 1.0644143676757813,
"step": 3850
},
{
"epoch": 10.627130197682344,
"grad_norm": 1.2935131788253784,
"learning_rate": 0.00012914724422155598,
"loss": 1.0334495544433593,
"step": 3900
},
{
"epoch": 10.76346284935242,
"grad_norm": 1.3209459781646729,
"learning_rate": 0.00012181769105253435,
"loss": 1.0103805541992188,
"step": 3950
},
{
"epoch": 10.899795501022496,
"grad_norm": 1.324385643005371,
"learning_rate": 0.00011464891876054252,
"loss": 0.990460433959961,
"step": 4000
},
{
"epoch": 11.03544648943422,
"grad_norm": 1.374879002571106,
"learning_rate": 0.00010764739600963116,
"loss": 0.9643755340576172,
"step": 4050
},
{
"epoch": 11.171779141104295,
"grad_norm": 1.275993824005127,
"learning_rate": 0.00010081944054803842,
"loss": 0.936119155883789,
"step": 4100
},
{
"epoch": 11.308111792774369,
"grad_norm": 1.2590258121490479,
"learning_rate": 9.417121350743844e-05,
"loss": 0.9281440734863281,
"step": 4150
},
{
"epoch": 11.444444444444445,
"grad_norm": 1.2088381052017212,
"learning_rate": 8.770871384351085e-05,
"loss": 0.9070972442626953,
"step": 4200
}
],
"logging_steps": 50,
"max_steps": 5505,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.989600444122399e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}