{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9873039581777445,
"eval_steps": 500,
"global_step": 8000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.037341299477221805,
"grad_norm": 0.9237794876098633,
"learning_rate": 4.938386855862584e-05,
"loss": 8.8586,
"step": 100
},
{
"epoch": 0.07468259895444361,
"grad_norm": 1.3463598489761353,
"learning_rate": 4.876151356733881e-05,
"loss": 8.1136,
"step": 200
},
{
"epoch": 0.11202389843166542,
"grad_norm": 1.2067362070083618,
"learning_rate": 4.8139158576051786e-05,
"loss": 7.8791,
"step": 300
},
{
"epoch": 0.14936519790888722,
"grad_norm": 1.3990025520324707,
"learning_rate": 4.7516803584764754e-05,
"loss": 7.7099,
"step": 400
},
{
"epoch": 0.18670649738610903,
"grad_norm": 1.5422577857971191,
"learning_rate": 4.689444859347772e-05,
"loss": 7.5067,
"step": 500
},
{
"epoch": 0.22404779686333084,
"grad_norm": 1.439476490020752,
"learning_rate": 4.627209360219069e-05,
"loss": 7.4054,
"step": 600
},
{
"epoch": 0.26138909634055263,
"grad_norm": 1.4985345602035522,
"learning_rate": 4.564973861090366e-05,
"loss": 7.3043,
"step": 700
},
{
"epoch": 0.29873039581777444,
"grad_norm": 1.6559139490127563,
"learning_rate": 4.502738361961663e-05,
"loss": 7.1577,
"step": 800
},
{
"epoch": 0.33607169529499625,
"grad_norm": 1.677016258239746,
"learning_rate": 4.4405028628329605e-05,
"loss": 7.0927,
"step": 900
},
{
"epoch": 0.37341299477221807,
"grad_norm": 1.5764284133911133,
"learning_rate": 4.378267363704257e-05,
"loss": 6.9999,
"step": 1000
},
{
"epoch": 0.4107542942494399,
"grad_norm": 1.7246273756027222,
"learning_rate": 4.316031864575554e-05,
"loss": 6.9177,
"step": 1100
},
{
"epoch": 0.4480955937266617,
"grad_norm": 1.7886656522750854,
"learning_rate": 4.253796365446851e-05,
"loss": 6.8377,
"step": 1200
},
{
"epoch": 0.4854368932038835,
"grad_norm": 1.710162878036499,
"learning_rate": 4.191560866318148e-05,
"loss": 6.7706,
"step": 1300
},
{
"epoch": 0.5227781926811053,
"grad_norm": 1.8272024393081665,
"learning_rate": 4.129325367189445e-05,
"loss": 6.7233,
"step": 1400
},
{
"epoch": 0.5601194921583271,
"grad_norm": 1.8057252168655396,
"learning_rate": 4.0670898680607424e-05,
"loss": 6.6577,
"step": 1500
},
{
"epoch": 0.5974607916355489,
"grad_norm": 1.9049605131149292,
"learning_rate": 4.004854368932039e-05,
"loss": 6.6056,
"step": 1600
},
{
"epoch": 0.6348020911127707,
"grad_norm": 1.9891490936279297,
"learning_rate": 3.942618869803336e-05,
"loss": 6.5396,
"step": 1700
},
{
"epoch": 0.6721433905899925,
"grad_norm": 1.8199862241744995,
"learning_rate": 3.880383370674633e-05,
"loss": 6.4798,
"step": 1800
},
{
"epoch": 0.7094846900672144,
"grad_norm": 1.8243427276611328,
"learning_rate": 3.81814787154593e-05,
"loss": 6.4103,
"step": 1900
},
{
"epoch": 0.7468259895444361,
"grad_norm": 1.8216381072998047,
"learning_rate": 3.755912372417227e-05,
"loss": 6.4254,
"step": 2000
},
{
"epoch": 0.784167289021658,
"grad_norm": 1.7757889032363892,
"learning_rate": 3.693676873288524e-05,
"loss": 6.3402,
"step": 2100
},
{
"epoch": 0.8215085884988798,
"grad_norm": 1.9034113883972168,
"learning_rate": 3.631441374159821e-05,
"loss": 6.3114,
"step": 2200
},
{
"epoch": 0.8588498879761016,
"grad_norm": 1.8567832708358765,
"learning_rate": 3.569205875031118e-05,
"loss": 6.2969,
"step": 2300
},
{
"epoch": 0.8961911874533234,
"grad_norm": 1.7683762311935425,
"learning_rate": 3.5069703759024146e-05,
"loss": 6.2537,
"step": 2400
},
{
"epoch": 0.9335324869305451,
"grad_norm": 1.8633127212524414,
"learning_rate": 3.444734876773712e-05,
"loss": 6.2365,
"step": 2500
},
{
"epoch": 0.970873786407767,
"grad_norm": 2.112578868865967,
"learning_rate": 3.382499377645009e-05,
"loss": 6.1906,
"step": 2600
},
{
"epoch": 1.0082150858849888,
"grad_norm": 2.0527567863464355,
"learning_rate": 3.320263878516306e-05,
"loss": 6.1523,
"step": 2700
},
{
"epoch": 1.0455563853622105,
"grad_norm": 1.8975645303726196,
"learning_rate": 3.258028379387602e-05,
"loss": 6.0659,
"step": 2800
},
{
"epoch": 1.0828976848394325,
"grad_norm": 1.9186785221099854,
"learning_rate": 3.1957928802589e-05,
"loss": 6.0698,
"step": 2900
},
{
"epoch": 1.1202389843166543,
"grad_norm": 1.9995322227478027,
"learning_rate": 3.1335573811301965e-05,
"loss": 6.019,
"step": 3000
},
{
"epoch": 1.157580283793876,
"grad_norm": 1.8683958053588867,
"learning_rate": 3.071321882001494e-05,
"loss": 6.007,
"step": 3100
},
{
"epoch": 1.1949215832710978,
"grad_norm": 1.8966848850250244,
"learning_rate": 3.0090863828727907e-05,
"loss": 5.9987,
"step": 3200
},
{
"epoch": 1.2322628827483197,
"grad_norm": 2.010756731033325,
"learning_rate": 2.9468508837440878e-05,
"loss": 5.9598,
"step": 3300
},
{
"epoch": 1.2696041822255415,
"grad_norm": 2.081808567047119,
"learning_rate": 2.8846153846153845e-05,
"loss": 5.9516,
"step": 3400
},
{
"epoch": 1.3069454817027633,
"grad_norm": 1.923231601715088,
"learning_rate": 2.8223798854866816e-05,
"loss": 5.9389,
"step": 3500
},
{
"epoch": 1.344286781179985,
"grad_norm": 1.9322913885116577,
"learning_rate": 2.7601443863579784e-05,
"loss": 5.9107,
"step": 3600
},
{
"epoch": 1.3816280806572068,
"grad_norm": 1.9434425830841064,
"learning_rate": 2.697908887229276e-05,
"loss": 5.8912,
"step": 3700
},
{
"epoch": 1.4189693801344287,
"grad_norm": 2.046572208404541,
"learning_rate": 2.635673388100573e-05,
"loss": 5.8741,
"step": 3800
},
{
"epoch": 1.4563106796116505,
"grad_norm": 1.9436527490615845,
"learning_rate": 2.5734378889718697e-05,
"loss": 5.8457,
"step": 3900
},
{
"epoch": 1.4936519790888723,
"grad_norm": 2.0540173053741455,
"learning_rate": 2.5112023898431668e-05,
"loss": 5.8389,
"step": 4000
},
{
"epoch": 1.5309932785660942,
"grad_norm": 2.0585784912109375,
"learning_rate": 2.4489668907144635e-05,
"loss": 5.8202,
"step": 4100
},
{
"epoch": 1.568334578043316,
"grad_norm": 2.124342441558838,
"learning_rate": 2.3867313915857606e-05,
"loss": 5.8081,
"step": 4200
},
{
"epoch": 1.6056758775205378,
"grad_norm": 2.073033571243286,
"learning_rate": 2.3244958924570577e-05,
"loss": 5.8116,
"step": 4300
},
{
"epoch": 1.6430171769977595,
"grad_norm": 2.0749969482421875,
"learning_rate": 2.2622603933283545e-05,
"loss": 5.7896,
"step": 4400
},
{
"epoch": 1.6803584764749813,
"grad_norm": 2.076416015625,
"learning_rate": 2.2000248941996516e-05,
"loss": 5.7661,
"step": 4500
},
{
"epoch": 1.717699775952203,
"grad_norm": 2.149789810180664,
"learning_rate": 2.1377893950709483e-05,
"loss": 5.7682,
"step": 4600
},
{
"epoch": 1.7550410754294248,
"grad_norm": 2.0563135147094727,
"learning_rate": 2.0755538959422454e-05,
"loss": 5.7755,
"step": 4700
},
{
"epoch": 1.7923823749066468,
"grad_norm": 2.032025098800659,
"learning_rate": 2.0133183968135425e-05,
"loss": 5.7458,
"step": 4800
},
{
"epoch": 1.8297236743838685,
"grad_norm": 2.0097744464874268,
"learning_rate": 1.9510828976848393e-05,
"loss": 5.7221,
"step": 4900
},
{
"epoch": 1.8670649738610905,
"grad_norm": 2.105161190032959,
"learning_rate": 1.8888473985561364e-05,
"loss": 5.7195,
"step": 5000
},
{
"epoch": 1.9044062733383122,
"grad_norm": 2.0916309356689453,
"learning_rate": 1.8266118994274335e-05,
"loss": 5.7039,
"step": 5100
},
{
"epoch": 1.941747572815534,
"grad_norm": 2.057687520980835,
"learning_rate": 1.7643764002987302e-05,
"loss": 5.6782,
"step": 5200
},
{
"epoch": 1.9790888722927558,
"grad_norm": 2.152930498123169,
"learning_rate": 1.7021409011700273e-05,
"loss": 5.6739,
"step": 5300
},
{
"epoch": 2.0164301717699775,
"grad_norm": 2.140216827392578,
"learning_rate": 1.6399054020413244e-05,
"loss": 5.6407,
"step": 5400
},
{
"epoch": 2.0537714712471993,
"grad_norm": 2.0678555965423584,
"learning_rate": 1.5776699029126215e-05,
"loss": 5.5972,
"step": 5500
},
{
"epoch": 2.091112770724421,
"grad_norm": 2.1241703033447266,
"learning_rate": 1.5154344037839185e-05,
"loss": 5.5728,
"step": 5600
},
{
"epoch": 2.1284540702016432,
"grad_norm": 2.1589901447296143,
"learning_rate": 1.4531989046552156e-05,
"loss": 5.6079,
"step": 5700
},
{
"epoch": 2.165795369678865,
"grad_norm": 2.1543970108032227,
"learning_rate": 1.3909634055265125e-05,
"loss": 5.5878,
"step": 5800
},
{
"epoch": 2.2031366691560867,
"grad_norm": 2.368490219116211,
"learning_rate": 1.3287279063978094e-05,
"loss": 5.5668,
"step": 5900
},
{
"epoch": 2.2404779686333085,
"grad_norm": 2.3079488277435303,
"learning_rate": 1.2664924072691065e-05,
"loss": 5.5384,
"step": 6000
},
{
"epoch": 2.2778192681105303,
"grad_norm": 2.2130212783813477,
"learning_rate": 1.2042569081404033e-05,
"loss": 5.5472,
"step": 6100
},
{
"epoch": 2.315160567587752,
"grad_norm": 2.1821630001068115,
"learning_rate": 1.1420214090117002e-05,
"loss": 5.5582,
"step": 6200
},
{
"epoch": 2.3525018670649738,
"grad_norm": 2.238124132156372,
"learning_rate": 1.0797859098829975e-05,
"loss": 5.5296,
"step": 6300
},
{
"epoch": 2.3898431665421955,
"grad_norm": 2.233442544937134,
"learning_rate": 1.0175504107542944e-05,
"loss": 5.5563,
"step": 6400
},
{
"epoch": 2.4271844660194173,
"grad_norm": 2.1606245040893555,
"learning_rate": 9.553149116255913e-06,
"loss": 5.5595,
"step": 6500
},
{
"epoch": 2.4645257654966395,
"grad_norm": 2.3241500854492188,
"learning_rate": 8.930794124968882e-06,
"loss": 5.5406,
"step": 6600
},
{
"epoch": 2.5018670649738612,
"grad_norm": 2.2552995681762695,
"learning_rate": 8.308439133681853e-06,
"loss": 5.522,
"step": 6700
},
{
"epoch": 2.539208364451083,
"grad_norm": 2.2733113765716553,
"learning_rate": 7.686084142394823e-06,
"loss": 5.5386,
"step": 6800
},
{
"epoch": 2.5765496639283048,
"grad_norm": 2.2476372718811035,
"learning_rate": 7.063729151107793e-06,
"loss": 5.5117,
"step": 6900
},
{
"epoch": 2.6138909634055265,
"grad_norm": 2.283897638320923,
"learning_rate": 6.441374159820762e-06,
"loss": 5.5096,
"step": 7000
},
{
"epoch": 2.6512322628827483,
"grad_norm": 2.2005655765533447,
"learning_rate": 5.819019168533732e-06,
"loss": 5.4929,
"step": 7100
},
{
"epoch": 2.68857356235997,
"grad_norm": 2.318183422088623,
"learning_rate": 5.1966641772467014e-06,
"loss": 5.5228,
"step": 7200
},
{
"epoch": 2.725914861837192,
"grad_norm": 2.2970705032348633,
"learning_rate": 4.5743091859596715e-06,
"loss": 5.5041,
"step": 7300
},
{
"epoch": 2.7632561613144135,
"grad_norm": 2.26883602142334,
"learning_rate": 3.951954194672642e-06,
"loss": 5.4682,
"step": 7400
},
{
"epoch": 2.8005974607916357,
"grad_norm": 2.2655177116394043,
"learning_rate": 3.329599203385611e-06,
"loss": 5.484,
"step": 7500
},
{
"epoch": 2.8379387602688575,
"grad_norm": 2.264005184173584,
"learning_rate": 2.707244212098581e-06,
"loss": 5.4943,
"step": 7600
},
{
"epoch": 2.8752800597460793,
"grad_norm": 2.2275617122650146,
"learning_rate": 2.0848892208115507e-06,
"loss": 5.5058,
"step": 7700
},
{
"epoch": 2.912621359223301,
"grad_norm": 2.2127552032470703,
"learning_rate": 1.4625342295245209e-06,
"loss": 5.4636,
"step": 7800
},
{
"epoch": 2.9499626587005228,
"grad_norm": 2.3027756214141846,
"learning_rate": 8.401792382374907e-07,
"loss": 5.4856,
"step": 7900
},
{
"epoch": 2.9873039581777445,
"grad_norm": 2.233044147491455,
"learning_rate": 2.1782424695046054e-07,
"loss": 5.4813,
"step": 8000
}
],
"logging_steps": 100,
"max_steps": 8034,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8359993562628096.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}