{
"best_metric": 0.7816067934036255,
"best_model_checkpoint": "./results/tinyllama-mentalchat16k/checkpoint-2000",
"epoch": 3.590127150336574,
"eval_steps": 100,
"global_step": 2400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014958863126402393,
"grad_norm": 0.7731354832649231,
"learning_rate": 2.469135802469136e-06,
"loss": 1.497,
"step": 1
},
{
"epoch": 0.07479431563201197,
"grad_norm": 0.5000666975975037,
"learning_rate": 0.0001234567901234568,
"loss": 1.228,
"step": 50
},
{
"epoch": 0.14958863126402394,
"grad_norm": 0.49768775701522827,
"learning_rate": 0.00019997346476623897,
"loss": 0.9602,
"step": 100
},
{
"epoch": 0.14958863126402394,
"eval_loss": 0.9461249709129333,
"eval_runtime": 18.5764,
"eval_samples_per_second": 25.409,
"eval_steps_per_second": 6.352,
"step": 100
},
{
"epoch": 0.2243829468960359,
"grad_norm": 0.519241452217102,
"learning_rate": 0.00019965023224267117,
"loss": 0.9233,
"step": 150
},
{
"epoch": 0.2991772625280479,
"grad_norm": 0.43131566047668457,
"learning_rate": 0.0001989608571521631,
"loss": 0.9058,
"step": 200
},
{
"epoch": 0.2991772625280479,
"eval_loss": 0.892902672290802,
"eval_runtime": 18.5677,
"eval_samples_per_second": 25.421,
"eval_steps_per_second": 6.355,
"step": 200
},
{
"epoch": 0.3739715781600598,
"grad_norm": 0.5047702193260193,
"learning_rate": 0.00019790787244982738,
"loss": 0.8891,
"step": 250
},
{
"epoch": 0.4487658937920718,
"grad_norm": 0.44667184352874756,
"learning_rate": 0.00019649514709324878,
"loss": 0.8474,
"step": 300
},
{
"epoch": 0.4487658937920718,
"eval_loss": 0.8695303797721863,
"eval_runtime": 18.5746,
"eval_samples_per_second": 25.411,
"eval_steps_per_second": 6.353,
"step": 300
},
{
"epoch": 0.5235602094240838,
"grad_norm": 0.4298022985458374,
"learning_rate": 0.0001947278718268621,
"loss": 0.8576,
"step": 350
},
{
"epoch": 0.5983545250560958,
"grad_norm": 0.4632634222507477,
"learning_rate": 0.00019261254010971866,
"loss": 0.8474,
"step": 400
},
{
"epoch": 0.5983545250560958,
"eval_loss": 0.8519686460494995,
"eval_runtime": 18.5692,
"eval_samples_per_second": 25.418,
"eval_steps_per_second": 6.355,
"step": 400
},
{
"epoch": 0.6731488406881077,
"grad_norm": 0.4614291787147522,
"learning_rate": 0.00019015692425671844,
"loss": 0.8404,
"step": 450
},
{
"epoch": 0.7479431563201197,
"grad_norm": 0.4503993093967438,
"learning_rate": 0.00018737004688097132,
"loss": 0.8393,
"step": 500
},
{
"epoch": 0.7479431563201197,
"eval_loss": 0.8392879962921143,
"eval_runtime": 18.5676,
"eval_samples_per_second": 25.421,
"eval_steps_per_second": 6.355,
"step": 500
},
{
"epoch": 0.8227374719521316,
"grad_norm": 0.44001927971839905,
"learning_rate": 0.00018426214774221636,
"loss": 0.842,
"step": 550
},
{
"epoch": 0.8975317875841436,
"grad_norm": 0.48289844393730164,
"learning_rate": 0.00018084464612310694,
"loss": 0.8246,
"step": 600
},
{
"epoch": 0.8975317875841436,
"eval_loss": 0.8246618509292603,
"eval_runtime": 18.5675,
"eval_samples_per_second": 25.421,
"eval_steps_per_second": 6.355,
"step": 600
},
{
"epoch": 0.9723261032161555,
"grad_norm": 0.46136462688446045,
"learning_rate": 0.00017713009887160194,
"loss": 0.815,
"step": 650
},
{
"epoch": 1.0471204188481675,
"grad_norm": 0.5062119364738464,
"learning_rate": 0.00017313215426362693,
"loss": 0.7652,
"step": 700
},
{
"epoch": 1.0471204188481675,
"eval_loss": 0.8210210204124451,
"eval_runtime": 18.558,
"eval_samples_per_second": 25.434,
"eval_steps_per_second": 6.358,
"step": 700
},
{
"epoch": 1.1219147344801794,
"grad_norm": 0.49691587686538696,
"learning_rate": 0.00016886550185552613,
"loss": 0.745,
"step": 750
},
{
"epoch": 1.1967090501121915,
"grad_norm": 0.5466598272323608,
"learning_rate": 0.00016434581851056202,
"loss": 0.7436,
"step": 800
},
{
"epoch": 1.1967090501121915,
"eval_loss": 0.8142367601394653,
"eval_runtime": 18.5738,
"eval_samples_per_second": 25.412,
"eval_steps_per_second": 6.353,
"step": 800
},
{
"epoch": 1.2715033657442034,
"grad_norm": 0.5322102904319763,
"learning_rate": 0.00015958971079777556,
"loss": 0.7507,
"step": 850
},
{
"epoch": 1.3462976813762153,
"grad_norm": 0.48673272132873535,
"learning_rate": 0.00015461465397484964,
"loss": 0.7248,
"step": 900
},
{
"epoch": 1.3462976813762153,
"eval_loss": 0.8076364398002625,
"eval_runtime": 18.5597,
"eval_samples_per_second": 25.431,
"eval_steps_per_second": 6.358,
"step": 900
},
{
"epoch": 1.4210919970082274,
"grad_norm": 0.5510265231132507,
"learning_rate": 0.00014943892777916998,
"loss": 0.7233,
"step": 950
},
{
"epoch": 1.4958863126402393,
"grad_norm": 0.5195121169090271,
"learning_rate": 0.00014408154926300447,
"loss": 0.7341,
"step": 1000
},
{
"epoch": 1.4958863126402393,
"eval_loss": 0.8008113503456116,
"eval_runtime": 18.5542,
"eval_samples_per_second": 25.439,
"eval_steps_per_second": 6.36,
"step": 1000
},
{
"epoch": 1.5706806282722514,
"grad_norm": 0.512640655040741,
"learning_rate": 0.00013856220291958335,
"loss": 0.7309,
"step": 1050
},
{
"epoch": 1.6454749439042633,
"grad_norm": 0.6050971746444702,
"learning_rate": 0.0001329011683568166,
"loss": 0.7295,
"step": 1100
},
{
"epoch": 1.6454749439042633,
"eval_loss": 0.7940045595169067,
"eval_runtime": 18.552,
"eval_samples_per_second": 25.442,
"eval_steps_per_second": 6.36,
"step": 1100
},
{
"epoch": 1.7202692595362752,
"grad_norm": 0.5502830147743225,
"learning_rate": 0.00012711924578439465,
"loss": 0.7282,
"step": 1150
},
{
"epoch": 1.795063575168287,
"grad_norm": 0.5085413455963135,
"learning_rate": 0.00012123767958805418,
"loss": 0.7073,
"step": 1200
},
{
"epoch": 1.795063575168287,
"eval_loss": 0.7896184325218201,
"eval_runtime": 18.5479,
"eval_samples_per_second": 25.448,
"eval_steps_per_second": 6.362,
"step": 1200
},
{
"epoch": 1.8698578908002992,
"grad_norm": 0.6039131879806519,
"learning_rate": 0.00011527808027181803,
"loss": 0.7308,
"step": 1250
},
{
"epoch": 1.9446522064323113,
"grad_norm": 0.5409778952598572,
"learning_rate": 0.00010926234505501502,
"loss": 0.7209,
"step": 1300
},
{
"epoch": 1.9446522064323113,
"eval_loss": 0.7835570573806763,
"eval_runtime": 18.5573,
"eval_samples_per_second": 25.435,
"eval_steps_per_second": 6.359,
"step": 1300
},
{
"epoch": 2.019446522064323,
"grad_norm": 0.6131395101547241,
"learning_rate": 0.00010321257741582816,
"loss": 0.6814,
"step": 1350
},
{
"epoch": 2.094240837696335,
"grad_norm": 0.6148830652236938,
"learning_rate": 9.715100587699098e-05,
"loss": 0.6142,
"step": 1400
},
{
"epoch": 2.094240837696335,
"eval_loss": 0.7934580445289612,
"eval_runtime": 18.5546,
"eval_samples_per_second": 25.438,
"eval_steps_per_second": 6.36,
"step": 1400
},
{
"epoch": 2.169035153328347,
"grad_norm": 0.6375288367271423,
"learning_rate": 9.10999023320352e-05,
"loss": 0.6042,
"step": 1450
},
{
"epoch": 2.243829468960359,
"grad_norm": 0.6075552701950073,
"learning_rate": 8.508150021218224e-05,
"loss": 0.6168,
"step": 1500
},
{
"epoch": 2.243829468960359,
"eval_loss": 0.7945839762687683,
"eval_runtime": 18.559,
"eval_samples_per_second": 25.432,
"eval_steps_per_second": 6.358,
"step": 1500
},
{
"epoch": 2.318623784592371,
"grad_norm": 0.6649960875511169,
"learning_rate": 7.911791279455607e-05,
"loss": 0.6216,
"step": 1550
},
{
"epoch": 2.393418100224383,
"grad_norm": 0.6594658493995667,
"learning_rate": 7.323105195187506e-05,
"loss": 0.6068,
"step": 1600
},
{
"epoch": 2.393418100224383,
"eval_loss": 0.790826678276062,
"eval_runtime": 18.556,
"eval_samples_per_second": 25.436,
"eval_steps_per_second": 6.359,
"step": 1600
},
{
"epoch": 2.468212415856395,
"grad_norm": 0.6420191526412964,
"learning_rate": 6.744254764215987e-05,
"loss": 0.6339,
"step": 1650
},
{
"epoch": 2.543006731488407,
"grad_norm": 0.6741716861724854,
"learning_rate": 6.177366843427392e-05,
"loss": 0.6264,
"step": 1700
},
{
"epoch": 2.543006731488407,
"eval_loss": 0.7899439334869385,
"eval_runtime": 18.5486,
"eval_samples_per_second": 25.447,
"eval_steps_per_second": 6.362,
"step": 1700
},
{
"epoch": 2.6178010471204187,
"grad_norm": 0.678027331829071,
"learning_rate": 5.624524336130754e-05,
"loss": 0.607,
"step": 1750
},
{
"epoch": 2.6925953627524306,
"grad_norm": 0.6342390775680542,
"learning_rate": 5.087758538893881e-05,
"loss": 0.6187,
"step": 1800
},
{
"epoch": 2.6925953627524306,
"eval_loss": 0.7833373546600342,
"eval_runtime": 18.5473,
"eval_samples_per_second": 25.448,
"eval_steps_per_second": 6.362,
"step": 1800
},
{
"epoch": 2.767389678384443,
"grad_norm": 0.6665670275688171,
"learning_rate": 4.569041677996858e-05,
"loss": 0.6165,
"step": 1850
},
{
"epoch": 2.842183994016455,
"grad_norm": 0.7102829217910767,
"learning_rate": 4.0702796629261964e-05,
"loss": 0.6049,
"step": 1900
},
{
"epoch": 2.842183994016455,
"eval_loss": 0.7841951847076416,
"eval_runtime": 18.5527,
"eval_samples_per_second": 25.441,
"eval_steps_per_second": 6.36,
"step": 1900
},
{
"epoch": 2.9169783096484667,
"grad_norm": 0.6682849526405334,
"learning_rate": 3.593305083535229e-05,
"loss": 0.617,
"step": 1950
},
{
"epoch": 2.9917726252804786,
"grad_norm": 0.7063353061676025,
"learning_rate": 3.139870476601171e-05,
"loss": 0.6007,
"step": 2000
},
{
"epoch": 2.9917726252804786,
"eval_loss": 0.7816067934036255,
"eval_runtime": 18.5552,
"eval_samples_per_second": 25.438,
"eval_steps_per_second": 6.359,
"step": 2000
},
{
"epoch": 3.0665669409124905,
"grad_norm": 0.6726052165031433,
"learning_rate": 2.7116418865193638e-05,
"loss": 0.5516,
"step": 2050
},
{
"epoch": 3.141361256544503,
"grad_norm": 0.7561970353126526,
"learning_rate": 2.310192743794496e-05,
"loss": 0.5308,
"step": 2100
},
{
"epoch": 3.141361256544503,
"eval_loss": 0.8031598329544067,
"eval_runtime": 18.5474,
"eval_samples_per_second": 25.448,
"eval_steps_per_second": 6.362,
"step": 2100
},
{
"epoch": 3.2161555721765147,
"grad_norm": 0.7770841717720032,
"learning_rate": 1.9369980838209156e-05,
"loss": 0.5417,
"step": 2150
},
{
"epoch": 3.2909498878085266,
"grad_norm": 0.6870452165603638,
"learning_rate": 1.5934291271938596e-05,
"loss": 0.5341,
"step": 2200
},
{
"epoch": 3.2909498878085266,
"eval_loss": 0.8018179535865784,
"eval_runtime": 18.5529,
"eval_samples_per_second": 25.441,
"eval_steps_per_second": 6.36,
"step": 2200
},
{
"epoch": 3.3657442034405385,
"grad_norm": 0.6838712096214294,
"learning_rate": 1.2807482414650063e-05,
"loss": 0.529,
"step": 2250
},
{
"epoch": 3.4405385190725504,
"grad_norm": 0.7482870817184448,
"learning_rate": 1.0001043028542834e-05,
"loss": 0.5376,
"step": 2300
},
{
"epoch": 3.4405385190725504,
"eval_loss": 0.8013114929199219,
"eval_runtime": 18.5497,
"eval_samples_per_second": 25.445,
"eval_steps_per_second": 6.361,
"step": 2300
},
{
"epoch": 3.5153328347045623,
"grad_norm": 0.7258043885231018,
"learning_rate": 7.5252847496027565e-06,
"loss": 0.531,
"step": 2350
},
{
"epoch": 3.590127150336574,
"grad_norm": 0.6967834830284119,
"learning_rate": 5.389304199794209e-06,
"loss": 0.5365,
"step": 2400
},
{
"epoch": 3.590127150336574,
"eval_loss": 0.8018017411231995,
"eval_runtime": 18.5459,
"eval_samples_per_second": 25.45,
"eval_steps_per_second": 6.363,
"step": 2400
}
],
"logging_steps": 50,
"max_steps": 2672,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 200,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.001
},
"attributes": {
"early_stopping_patience_counter": 4
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.072984234550886e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}