| { |
| "best_metric": 0.7816067934036255, |
| "best_model_checkpoint": "./results/tinyllama-mentalchat16k/checkpoint-2000", |
| "epoch": 3.590127150336574, |
| "eval_steps": 100, |
| "global_step": 2400, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0014958863126402393, |
| "grad_norm": 0.7731354832649231, |
| "learning_rate": 2.469135802469136e-06, |
| "loss": 1.497, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.07479431563201197, |
| "grad_norm": 0.5000666975975037, |
| "learning_rate": 0.0001234567901234568, |
| "loss": 1.228, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.14958863126402394, |
| "grad_norm": 0.49768775701522827, |
| "learning_rate": 0.00019997346476623897, |
| "loss": 0.9602, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.14958863126402394, |
| "eval_loss": 0.9461249709129333, |
| "eval_runtime": 18.5764, |
| "eval_samples_per_second": 25.409, |
| "eval_steps_per_second": 6.352, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2243829468960359, |
| "grad_norm": 0.519241452217102, |
| "learning_rate": 0.00019965023224267117, |
| "loss": 0.9233, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.2991772625280479, |
| "grad_norm": 0.43131566047668457, |
| "learning_rate": 0.0001989608571521631, |
| "loss": 0.9058, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.2991772625280479, |
| "eval_loss": 0.892902672290802, |
| "eval_runtime": 18.5677, |
| "eval_samples_per_second": 25.421, |
| "eval_steps_per_second": 6.355, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3739715781600598, |
| "grad_norm": 0.5047702193260193, |
| "learning_rate": 0.00019790787244982738, |
| "loss": 0.8891, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.4487658937920718, |
| "grad_norm": 0.44667184352874756, |
| "learning_rate": 0.00019649514709324878, |
| "loss": 0.8474, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4487658937920718, |
| "eval_loss": 0.8695303797721863, |
| "eval_runtime": 18.5746, |
| "eval_samples_per_second": 25.411, |
| "eval_steps_per_second": 6.353, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5235602094240838, |
| "grad_norm": 0.4298022985458374, |
| "learning_rate": 0.0001947278718268621, |
| "loss": 0.8576, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5983545250560958, |
| "grad_norm": 0.4632634222507477, |
| "learning_rate": 0.00019261254010971866, |
| "loss": 0.8474, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5983545250560958, |
| "eval_loss": 0.8519686460494995, |
| "eval_runtime": 18.5692, |
| "eval_samples_per_second": 25.418, |
| "eval_steps_per_second": 6.355, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6731488406881077, |
| "grad_norm": 0.4614291787147522, |
| "learning_rate": 0.00019015692425671844, |
| "loss": 0.8404, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.7479431563201197, |
| "grad_norm": 0.4503993093967438, |
| "learning_rate": 0.00018737004688097132, |
| "loss": 0.8393, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7479431563201197, |
| "eval_loss": 0.8392879962921143, |
| "eval_runtime": 18.5676, |
| "eval_samples_per_second": 25.421, |
| "eval_steps_per_second": 6.355, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.8227374719521316, |
| "grad_norm": 0.44001927971839905, |
| "learning_rate": 0.00018426214774221636, |
| "loss": 0.842, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.8975317875841436, |
| "grad_norm": 0.48289844393730164, |
| "learning_rate": 0.00018084464612310694, |
| "loss": 0.8246, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.8975317875841436, |
| "eval_loss": 0.8246618509292603, |
| "eval_runtime": 18.5675, |
| "eval_samples_per_second": 25.421, |
| "eval_steps_per_second": 6.355, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9723261032161555, |
| "grad_norm": 0.46136462688446045, |
| "learning_rate": 0.00017713009887160194, |
| "loss": 0.815, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.0471204188481675, |
| "grad_norm": 0.5062119364738464, |
| "learning_rate": 0.00017313215426362693, |
| "loss": 0.7652, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.0471204188481675, |
| "eval_loss": 0.8210210204124451, |
| "eval_runtime": 18.558, |
| "eval_samples_per_second": 25.434, |
| "eval_steps_per_second": 6.358, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.1219147344801794, |
| "grad_norm": 0.49691587686538696, |
| "learning_rate": 0.00016886550185552613, |
| "loss": 0.745, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.1967090501121915, |
| "grad_norm": 0.5466598272323608, |
| "learning_rate": 0.00016434581851056202, |
| "loss": 0.7436, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.1967090501121915, |
| "eval_loss": 0.8142367601394653, |
| "eval_runtime": 18.5738, |
| "eval_samples_per_second": 25.412, |
| "eval_steps_per_second": 6.353, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.2715033657442034, |
| "grad_norm": 0.5322102904319763, |
| "learning_rate": 0.00015958971079777556, |
| "loss": 0.7507, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.3462976813762153, |
| "grad_norm": 0.48673272132873535, |
| "learning_rate": 0.00015461465397484964, |
| "loss": 0.7248, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.3462976813762153, |
| "eval_loss": 0.8076364398002625, |
| "eval_runtime": 18.5597, |
| "eval_samples_per_second": 25.431, |
| "eval_steps_per_second": 6.358, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.4210919970082274, |
| "grad_norm": 0.5510265231132507, |
| "learning_rate": 0.00014943892777916998, |
| "loss": 0.7233, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.4958863126402393, |
| "grad_norm": 0.5195121169090271, |
| "learning_rate": 0.00014408154926300447, |
| "loss": 0.7341, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.4958863126402393, |
| "eval_loss": 0.8008113503456116, |
| "eval_runtime": 18.5542, |
| "eval_samples_per_second": 25.439, |
| "eval_steps_per_second": 6.36, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.5706806282722514, |
| "grad_norm": 0.512640655040741, |
| "learning_rate": 0.00013856220291958335, |
| "loss": 0.7309, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.6454749439042633, |
| "grad_norm": 0.6050971746444702, |
| "learning_rate": 0.0001329011683568166, |
| "loss": 0.7295, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.6454749439042633, |
| "eval_loss": 0.7940045595169067, |
| "eval_runtime": 18.552, |
| "eval_samples_per_second": 25.442, |
| "eval_steps_per_second": 6.36, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.7202692595362752, |
| "grad_norm": 0.5502830147743225, |
| "learning_rate": 0.00012711924578439465, |
| "loss": 0.7282, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.795063575168287, |
| "grad_norm": 0.5085413455963135, |
| "learning_rate": 0.00012123767958805418, |
| "loss": 0.7073, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.795063575168287, |
| "eval_loss": 0.7896184325218201, |
| "eval_runtime": 18.5479, |
| "eval_samples_per_second": 25.448, |
| "eval_steps_per_second": 6.362, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.8698578908002992, |
| "grad_norm": 0.6039131879806519, |
| "learning_rate": 0.00011527808027181803, |
| "loss": 0.7308, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.9446522064323113, |
| "grad_norm": 0.5409778952598572, |
| "learning_rate": 0.00010926234505501502, |
| "loss": 0.7209, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.9446522064323113, |
| "eval_loss": 0.7835570573806763, |
| "eval_runtime": 18.5573, |
| "eval_samples_per_second": 25.435, |
| "eval_steps_per_second": 6.359, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.019446522064323, |
| "grad_norm": 0.6131395101547241, |
| "learning_rate": 0.00010321257741582816, |
| "loss": 0.6814, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.094240837696335, |
| "grad_norm": 0.6148830652236938, |
| "learning_rate": 9.715100587699098e-05, |
| "loss": 0.6142, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.094240837696335, |
| "eval_loss": 0.7934580445289612, |
| "eval_runtime": 18.5546, |
| "eval_samples_per_second": 25.438, |
| "eval_steps_per_second": 6.36, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.169035153328347, |
| "grad_norm": 0.6375288367271423, |
| "learning_rate": 9.10999023320352e-05, |
| "loss": 0.6042, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.243829468960359, |
| "grad_norm": 0.6075552701950073, |
| "learning_rate": 8.508150021218224e-05, |
| "loss": 0.6168, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.243829468960359, |
| "eval_loss": 0.7945839762687683, |
| "eval_runtime": 18.559, |
| "eval_samples_per_second": 25.432, |
| "eval_steps_per_second": 6.358, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.318623784592371, |
| "grad_norm": 0.6649960875511169, |
| "learning_rate": 7.911791279455607e-05, |
| "loss": 0.6216, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.393418100224383, |
| "grad_norm": 0.6594658493995667, |
| "learning_rate": 7.323105195187506e-05, |
| "loss": 0.6068, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.393418100224383, |
| "eval_loss": 0.790826678276062, |
| "eval_runtime": 18.556, |
| "eval_samples_per_second": 25.436, |
| "eval_steps_per_second": 6.359, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.468212415856395, |
| "grad_norm": 0.6420191526412964, |
| "learning_rate": 6.744254764215987e-05, |
| "loss": 0.6339, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.543006731488407, |
| "grad_norm": 0.6741716861724854, |
| "learning_rate": 6.177366843427392e-05, |
| "loss": 0.6264, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.543006731488407, |
| "eval_loss": 0.7899439334869385, |
| "eval_runtime": 18.5486, |
| "eval_samples_per_second": 25.447, |
| "eval_steps_per_second": 6.362, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.6178010471204187, |
| "grad_norm": 0.678027331829071, |
| "learning_rate": 5.624524336130754e-05, |
| "loss": 0.607, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.6925953627524306, |
| "grad_norm": 0.6342390775680542, |
| "learning_rate": 5.087758538893881e-05, |
| "loss": 0.6187, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.6925953627524306, |
| "eval_loss": 0.7833373546600342, |
| "eval_runtime": 18.5473, |
| "eval_samples_per_second": 25.448, |
| "eval_steps_per_second": 6.362, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.767389678384443, |
| "grad_norm": 0.6665670275688171, |
| "learning_rate": 4.569041677996858e-05, |
| "loss": 0.6165, |
| "step": 1850 |
| }, |
| { |
| "epoch": 2.842183994016455, |
| "grad_norm": 0.7102829217910767, |
| "learning_rate": 4.0702796629261964e-05, |
| "loss": 0.6049, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.842183994016455, |
| "eval_loss": 0.7841951847076416, |
| "eval_runtime": 18.5527, |
| "eval_samples_per_second": 25.441, |
| "eval_steps_per_second": 6.36, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.9169783096484667, |
| "grad_norm": 0.6682849526405334, |
| "learning_rate": 3.593305083535229e-05, |
| "loss": 0.617, |
| "step": 1950 |
| }, |
| { |
| "epoch": 2.9917726252804786, |
| "grad_norm": 0.7063353061676025, |
| "learning_rate": 3.139870476601171e-05, |
| "loss": 0.6007, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.9917726252804786, |
| "eval_loss": 0.7816067934036255, |
| "eval_runtime": 18.5552, |
| "eval_samples_per_second": 25.438, |
| "eval_steps_per_second": 6.359, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.0665669409124905, |
| "grad_norm": 0.6726052165031433, |
| "learning_rate": 2.7116418865193638e-05, |
| "loss": 0.5516, |
| "step": 2050 |
| }, |
| { |
| "epoch": 3.141361256544503, |
| "grad_norm": 0.7561970353126526, |
| "learning_rate": 2.310192743794496e-05, |
| "loss": 0.5308, |
| "step": 2100 |
| }, |
| { |
| "epoch": 3.141361256544503, |
| "eval_loss": 0.8031598329544067, |
| "eval_runtime": 18.5474, |
| "eval_samples_per_second": 25.448, |
| "eval_steps_per_second": 6.362, |
| "step": 2100 |
| }, |
| { |
| "epoch": 3.2161555721765147, |
| "grad_norm": 0.7770841717720032, |
| "learning_rate": 1.9369980838209156e-05, |
| "loss": 0.5417, |
| "step": 2150 |
| }, |
| { |
| "epoch": 3.2909498878085266, |
| "grad_norm": 0.6870452165603638, |
| "learning_rate": 1.5934291271938596e-05, |
| "loss": 0.5341, |
| "step": 2200 |
| }, |
| { |
| "epoch": 3.2909498878085266, |
| "eval_loss": 0.8018179535865784, |
| "eval_runtime": 18.5529, |
| "eval_samples_per_second": 25.441, |
| "eval_steps_per_second": 6.36, |
| "step": 2200 |
| }, |
| { |
| "epoch": 3.3657442034405385, |
| "grad_norm": 0.6838712096214294, |
| "learning_rate": 1.2807482414650063e-05, |
| "loss": 0.529, |
| "step": 2250 |
| }, |
| { |
| "epoch": 3.4405385190725504, |
| "grad_norm": 0.7482870817184448, |
| "learning_rate": 1.0001043028542834e-05, |
| "loss": 0.5376, |
| "step": 2300 |
| }, |
| { |
| "epoch": 3.4405385190725504, |
| "eval_loss": 0.8013114929199219, |
| "eval_runtime": 18.5497, |
| "eval_samples_per_second": 25.445, |
| "eval_steps_per_second": 6.361, |
| "step": 2300 |
| }, |
| { |
| "epoch": 3.5153328347045623, |
| "grad_norm": 0.7258043885231018, |
| "learning_rate": 7.5252847496027565e-06, |
| "loss": 0.531, |
| "step": 2350 |
| }, |
| { |
| "epoch": 3.590127150336574, |
| "grad_norm": 0.6967834830284119, |
| "learning_rate": 5.389304199794209e-06, |
| "loss": 0.5365, |
| "step": 2400 |
| }, |
| { |
| "epoch": 3.590127150336574, |
| "eval_loss": 0.8018017411231995, |
| "eval_runtime": 18.5459, |
| "eval_samples_per_second": 25.45, |
| "eval_steps_per_second": 6.363, |
| "step": 2400 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 2672, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 5, |
| "early_stopping_threshold": 0.001 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 4 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.072984234550886e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|