Behemoth-T1-123B-LoRA / trainer_state.json
tacodevs's picture
Add files using upload-large-folder tool
242e266 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 15,
"global_step": 60,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.181640625,
"epoch": 0.03361344537815126,
"grad_norm": 1.7928742767831207,
"learning_rate": 0.0,
"loss": 1.3131,
"mean_token_accuracy": 0.6369074434041977,
"num_tokens": 75310.0,
"step": 1
},
{
"entropy": 1.119140625,
"epoch": 0.06722689075630252,
"grad_norm": 1.6506218424352948,
"learning_rate": 3e-05,
"loss": 1.3469,
"mean_token_accuracy": 0.6300242394208908,
"num_tokens": 153607.0,
"step": 2
},
{
"entropy": 1.1728515625,
"epoch": 0.13445378151260504,
"grad_norm": 1.3796606244503395,
"learning_rate": 2.9482758620689654e-05,
"loss": 1.289,
"mean_token_accuracy": 0.6425867825746536,
"num_tokens": 289000.0,
"step": 4
},
{
"entropy": 1.2216796875,
"epoch": 0.20168067226890757,
"grad_norm": 3.4206735699743493,
"learning_rate": 2.8448275862068966e-05,
"loss": 1.2113,
"mean_token_accuracy": 0.6542532071471214,
"num_tokens": 438702.0,
"step": 6
},
{
"entropy": 1.259765625,
"epoch": 0.2689075630252101,
"grad_norm": 2.2379649541100193,
"learning_rate": 2.741379310344828e-05,
"loss": 1.1519,
"mean_token_accuracy": 0.663833349943161,
"num_tokens": 580037.0,
"step": 8
},
{
"entropy": 1.2392578125,
"epoch": 0.33613445378151263,
"grad_norm": 1.1181315509763536,
"learning_rate": 2.6379310344827588e-05,
"loss": 1.097,
"mean_token_accuracy": 0.6762372255325317,
"num_tokens": 724254.0,
"step": 10
},
{
"entropy": 1.1494140625,
"epoch": 0.40336134453781514,
"grad_norm": 1.191042616939082,
"learning_rate": 2.5344827586206897e-05,
"loss": 1.088,
"mean_token_accuracy": 0.6783676072955132,
"num_tokens": 875036.0,
"step": 12
},
{
"entropy": 1.154296875,
"epoch": 0.47058823529411764,
"grad_norm": 1.249507656809331,
"learning_rate": 2.4310344827586206e-05,
"loss": 1.0712,
"mean_token_accuracy": 0.676607720553875,
"num_tokens": 1023337.0,
"step": 14
},
{
"epoch": 0.5042016806722689,
"eval_entropy": 1.1395089285714286,
"eval_loss": 1.0697113275527954,
"eval_mean_token_accuracy": 0.6743263857705253,
"eval_num_tokens": 1094042.0,
"eval_runtime": 14.7587,
"eval_samples_per_second": 3.388,
"eval_steps_per_second": 0.474,
"step": 15
},
{
"entropy": 1.1630859375,
"epoch": 0.5378151260504201,
"grad_norm": 1.3489797487532325,
"learning_rate": 2.327586206896552e-05,
"loss": 1.0461,
"mean_token_accuracy": 0.6843359917402267,
"num_tokens": 1168826.0,
"step": 16
},
{
"entropy": 1.1298828125,
"epoch": 0.6050420168067226,
"grad_norm": 1.4108132020883717,
"learning_rate": 2.2241379310344828e-05,
"loss": 1.0388,
"mean_token_accuracy": 0.6870746314525604,
"num_tokens": 1322246.0,
"step": 18
},
{
"entropy": 1.1669921875,
"epoch": 0.6722689075630253,
"grad_norm": 1.3354043806449676,
"learning_rate": 2.1206896551724137e-05,
"loss": 1.0249,
"mean_token_accuracy": 0.688920646905899,
"num_tokens": 1464063.0,
"step": 20
},
{
"entropy": 1.16015625,
"epoch": 0.7394957983193278,
"grad_norm": 1.666062653107746,
"learning_rate": 2.017241379310345e-05,
"loss": 1.0323,
"mean_token_accuracy": 0.6870173960924149,
"num_tokens": 1600863.0,
"step": 22
},
{
"entropy": 1.1806640625,
"epoch": 0.8067226890756303,
"grad_norm": 1.2035574738536507,
"learning_rate": 1.9137931034482762e-05,
"loss": 1.0128,
"mean_token_accuracy": 0.6902513056993484,
"num_tokens": 1772011.0,
"step": 24
},
{
"entropy": 1.1025390625,
"epoch": 0.8739495798319328,
"grad_norm": 1.0664858711055925,
"learning_rate": 1.8103448275862068e-05,
"loss": 1.014,
"mean_token_accuracy": 0.6933257803320885,
"num_tokens": 1928733.0,
"step": 26
},
{
"entropy": 1.12744140625,
"epoch": 0.9411764705882353,
"grad_norm": 1.070630709251991,
"learning_rate": 1.706896551724138e-05,
"loss": 0.994,
"mean_token_accuracy": 0.6967712193727493,
"num_tokens": 2068126.0,
"step": 28
},
{
"entropy": 1.1819196428571428,
"epoch": 1.0,
"grad_norm": 2.6295169243673575,
"learning_rate": 1.603448275862069e-05,
"loss": 0.9902,
"mean_token_accuracy": 0.6980065788541522,
"num_tokens": 2182360.0,
"step": 30
},
{
"epoch": 1.0,
"eval_entropy": 1.125,
"eval_loss": 1.0098644495010376,
"eval_mean_token_accuracy": 0.6880264622824532,
"eval_num_tokens": 2182360.0,
"eval_runtime": 14.5734,
"eval_samples_per_second": 3.431,
"eval_steps_per_second": 0.48,
"step": 30
},
{
"entropy": 1.099609375,
"epoch": 1.0672268907563025,
"grad_norm": 2.050074542522579,
"learning_rate": 1.5e-05,
"loss": 0.8933,
"mean_token_accuracy": 0.7218180298805237,
"num_tokens": 2325564.0,
"step": 32
},
{
"entropy": 1.16015625,
"epoch": 1.134453781512605,
"grad_norm": 1.0945630850199306,
"learning_rate": 1.396551724137931e-05,
"loss": 0.9022,
"mean_token_accuracy": 0.7185544371604919,
"num_tokens": 2478474.0,
"step": 34
},
{
"entropy": 1.0927734375,
"epoch": 1.2016806722689075,
"grad_norm": 1.4418349180955692,
"learning_rate": 1.293103448275862e-05,
"loss": 0.8661,
"mean_token_accuracy": 0.7244188115000725,
"num_tokens": 2626615.0,
"step": 36
},
{
"entropy": 1.052734375,
"epoch": 1.26890756302521,
"grad_norm": 1.1722563745135475,
"learning_rate": 1.1896551724137931e-05,
"loss": 0.8564,
"mean_token_accuracy": 0.7331436201930046,
"num_tokens": 2778957.0,
"step": 38
},
{
"entropy": 1.0771484375,
"epoch": 1.3361344537815127,
"grad_norm": 1.2206493994010927,
"learning_rate": 1.0862068965517242e-05,
"loss": 0.8483,
"mean_token_accuracy": 0.731599785387516,
"num_tokens": 2934688.0,
"step": 40
},
{
"entropy": 1.03759765625,
"epoch": 1.403361344537815,
"grad_norm": 1.213607284992565,
"learning_rate": 9.827586206896551e-06,
"loss": 0.8423,
"mean_token_accuracy": 0.7331928312778473,
"num_tokens": 3078722.0,
"step": 42
},
{
"entropy": 1.0634765625,
"epoch": 1.4705882352941178,
"grad_norm": 1.8432998759311154,
"learning_rate": 8.793103448275862e-06,
"loss": 0.8403,
"mean_token_accuracy": 0.7300194650888443,
"num_tokens": 3222738.0,
"step": 44
},
{
"epoch": 1.504201680672269,
"eval_entropy": 1.05859375,
"eval_loss": 0.999748706817627,
"eval_mean_token_accuracy": 0.6920892340796334,
"eval_num_tokens": 3300341.0,
"eval_runtime": 14.5813,
"eval_samples_per_second": 3.429,
"eval_steps_per_second": 0.48,
"step": 45
},
{
"entropy": 1.060546875,
"epoch": 1.53781512605042,
"grad_norm": 1.2747974851902248,
"learning_rate": 7.758620689655173e-06,
"loss": 0.8441,
"mean_token_accuracy": 0.7343461066484451,
"num_tokens": 3379604.0,
"step": 46
},
{
"entropy": 1.03125,
"epoch": 1.6050420168067228,
"grad_norm": 1.5541516528133,
"learning_rate": 6.724137931034483e-06,
"loss": 0.8433,
"mean_token_accuracy": 0.7312941700220108,
"num_tokens": 3524464.0,
"step": 48
},
{
"entropy": 1.099609375,
"epoch": 1.6722689075630253,
"grad_norm": 1.4541114741349364,
"learning_rate": 5.689655172413793e-06,
"loss": 0.8468,
"mean_token_accuracy": 0.7327947691082954,
"num_tokens": 3671601.0,
"step": 50
},
{
"entropy": 1.0830078125,
"epoch": 1.7394957983193278,
"grad_norm": 1.3250573197747766,
"learning_rate": 4.655172413793104e-06,
"loss": 0.8337,
"mean_token_accuracy": 0.7332699969410896,
"num_tokens": 3831435.0,
"step": 52
},
{
"entropy": 1.060546875,
"epoch": 1.8067226890756303,
"grad_norm": 1.9274897237742625,
"learning_rate": 3.620689655172414e-06,
"loss": 0.8375,
"mean_token_accuracy": 0.7363650351762772,
"num_tokens": 3970690.0,
"step": 54
},
{
"entropy": 1.0380859375,
"epoch": 1.8739495798319328,
"grad_norm": 1.3947288532125532,
"learning_rate": 2.5862068965517246e-06,
"loss": 0.8176,
"mean_token_accuracy": 0.7397993430495262,
"num_tokens": 4116319.0,
"step": 56
},
{
"entropy": 1.0576171875,
"epoch": 1.9411764705882353,
"grad_norm": 1.2609408019593042,
"learning_rate": 1.5517241379310346e-06,
"loss": 0.8193,
"mean_token_accuracy": 0.7374719008803368,
"num_tokens": 4261278.0,
"step": 58
},
{
"entropy": 1.0569196428571428,
"epoch": 2.0,
"grad_norm": 2.203503634167548,
"learning_rate": 5.172413793103448e-07,
"loss": 0.8165,
"mean_token_accuracy": 0.7374913011278424,
"num_tokens": 4371243.0,
"step": 60
},
{
"epoch": 2.0,
"eval_entropy": 1.0647321428571428,
"eval_loss": 0.9897834658622742,
"eval_mean_token_accuracy": 0.6942669238362994,
"eval_num_tokens": 4371243.0,
"eval_runtime": 14.5645,
"eval_samples_per_second": 3.433,
"eval_steps_per_second": 0.481,
"step": 60
}
],
"logging_steps": 2,
"max_steps": 60,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 30,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 209132409847808.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}