{
  "best_global_step": 221,
  "best_metric": 0.4947091042995453,
  "best_model_checkpoint": "saves/test/checkpoint-221",
  "epoch": 1.0,
  "eval_steps": 13,
  "global_step": 249,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.020080321285140562,
      "grad_norm": 1.1343824863433838,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.9403,
      "num_input_tokens_seen": 832,
      "step": 5
    },
    {
      "epoch": 0.040160642570281124,
      "grad_norm": 0.9066250920295715,
      "learning_rate": 1.8e-05,
      "loss": 0.9316,
      "num_input_tokens_seen": 1760,
      "step": 10
    },
    {
      "epoch": 0.05220883534136546,
      "eval_loss": 0.9549338221549988,
      "eval_runtime": 1.0153,
      "eval_samples_per_second": 55.156,
      "eval_steps_per_second": 27.578,
      "num_input_tokens_seen": 2288,
      "step": 13
    },
    {
      "epoch": 0.060240963855421686,
      "grad_norm": 0.845765233039856,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 0.9858,
      "num_input_tokens_seen": 2608,
      "step": 15
    },
    {
      "epoch": 0.08032128514056225,
      "grad_norm": 1.3779510259628296,
      "learning_rate": 3.8e-05,
      "loss": 0.8997,
      "num_input_tokens_seen": 3536,
      "step": 20
    },
    {
      "epoch": 0.10040160642570281,
      "grad_norm": 1.0899865627288818,
      "learning_rate": 4.8e-05,
      "loss": 1.1199,
      "num_input_tokens_seen": 4496,
      "step": 25
    },
    {
      "epoch": 0.10441767068273092,
      "eval_loss": 0.8821852803230286,
      "eval_runtime": 0.9542,
      "eval_samples_per_second": 58.685,
      "eval_steps_per_second": 29.342,
      "num_input_tokens_seen": 4656,
      "step": 26
    },
    {
      "epoch": 0.12048192771084337,
      "grad_norm": 1.0002409219741821,
      "learning_rate": 4.996067037544542e-05,
      "loss": 1.0716,
      "num_input_tokens_seen": 5424,
      "step": 30
    },
    {
      "epoch": 0.14056224899598393,
      "grad_norm": 1.0406622886657715,
      "learning_rate": 4.980110583549062e-05,
      "loss": 0.8317,
      "num_input_tokens_seen": 6304,
      "step": 35
    },
    {
      "epoch": 0.1566265060240964,
      "eval_loss": 0.8176223039627075,
      "eval_runtime": 1.015,
      "eval_samples_per_second": 55.175,
      "eval_steps_per_second": 27.587,
      "num_input_tokens_seen": 6944,
      "step": 39
    },
    {
      "epoch": 0.1606425702811245,
      "grad_norm": 0.8227869272232056,
      "learning_rate": 4.951963201008076e-05,
      "loss": 0.9978,
      "num_input_tokens_seen": 7072,
      "step": 40
    },
    {
      "epoch": 0.18072289156626506,
      "grad_norm": 0.9921690821647644,
      "learning_rate": 4.91176324775594e-05,
      "loss": 0.7925,
      "num_input_tokens_seen": 7856,
      "step": 45
    },
    {
      "epoch": 0.20080321285140562,
      "grad_norm": 0.5807920098304749,
      "learning_rate": 4.8597083257709194e-05,
      "loss": 0.7882,
      "num_input_tokens_seen": 8880,
      "step": 50
    },
    {
      "epoch": 0.20883534136546184,
      "eval_loss": 0.7668408155441284,
      "eval_runtime": 1.1336,
      "eval_samples_per_second": 49.4,
      "eval_steps_per_second": 24.7,
      "num_input_tokens_seen": 9232,
      "step": 52
    },
    {
      "epoch": 0.22088353413654618,
      "grad_norm": 0.7548654079437256,
      "learning_rate": 4.796054309867053e-05,
      "loss": 0.7209,
      "num_input_tokens_seen": 9680,
      "step": 55
    },
    {
      "epoch": 0.24096385542168675,
      "grad_norm": 0.8454243540763855,
      "learning_rate": 4.721114089947181e-05,
      "loss": 0.8405,
      "num_input_tokens_seen": 10576,
      "step": 60
    },
    {
      "epoch": 0.26104417670682734,
      "grad_norm": 0.6962847113609314,
      "learning_rate": 4.6352560329995686e-05,
      "loss": 0.7909,
      "num_input_tokens_seen": 11424,
      "step": 65
    },
    {
      "epoch": 0.26104417670682734,
      "eval_loss": 0.6973205208778381,
      "eval_runtime": 1.0994,
      "eval_samples_per_second": 50.939,
      "eval_steps_per_second": 25.469,
      "num_input_tokens_seen": 11424,
      "step": 65
    },
    {
      "epoch": 0.28112449799196787,
      "grad_norm": 0.88432377576828,
      "learning_rate": 4.538902172398151e-05,
      "loss": 0.7974,
      "num_input_tokens_seen": 12224,
      "step": 70
    },
    {
      "epoch": 0.30120481927710846,
      "grad_norm": 0.6160923838615417,
      "learning_rate": 4.4325261334068426e-05,
      "loss": 0.7007,
      "num_input_tokens_seen": 13168,
      "step": 75
    },
    {
      "epoch": 0.3132530120481928,
      "eval_loss": 0.6643471121788025,
      "eval_runtime": 1.1689,
      "eval_samples_per_second": 47.909,
      "eval_steps_per_second": 23.954,
      "num_input_tokens_seen": 13760,
      "step": 78
    },
    {
      "epoch": 0.321285140562249,
      "grad_norm": 0.7029849886894226,
      "learning_rate": 4.316650805085068e-05,
      "loss": 0.6931,
      "num_input_tokens_seen": 14080,
      "step": 80
    },
    {
      "epoch": 0.3413654618473896,
      "grad_norm": 0.7781380414962769,
      "learning_rate": 4.1918457700381855e-05,
      "loss": 0.6361,
      "num_input_tokens_seen": 15056,
      "step": 85
    },
    {
      "epoch": 0.3614457831325301,
      "grad_norm": 0.9208192229270935,
      "learning_rate": 4.058724504646834e-05,
      "loss": 0.7416,
      "num_input_tokens_seen": 15904,
      "step": 90
    },
    {
      "epoch": 0.3654618473895582,
      "eval_loss": 0.6244128346443176,
      "eval_runtime": 1.1184,
      "eval_samples_per_second": 50.073,
      "eval_steps_per_second": 25.037,
      "num_input_tokens_seen": 16048,
      "step": 91
    },
    {
      "epoch": 0.3815261044176707,
      "grad_norm": 0.6942310333251953,
      "learning_rate": 3.9179413635373897e-05,
      "loss": 0.7075,
      "num_input_tokens_seen": 16688,
      "step": 95
    },
    {
      "epoch": 0.40160642570281124,
      "grad_norm": 0.7927543520927429,
      "learning_rate": 3.770188363116324e-05,
      "loss": 0.8212,
      "num_input_tokens_seen": 17552,
      "step": 100
    },
    {
      "epoch": 0.41767068273092367,
      "eval_loss": 0.5989917516708374,
      "eval_runtime": 1.0356,
      "eval_samples_per_second": 54.073,
      "eval_steps_per_second": 27.037,
      "num_input_tokens_seen": 18272,
      "step": 104
    },
    {
      "epoch": 0.42168674698795183,
      "grad_norm": 0.5144712328910828,
      "learning_rate": 3.616191779978907e-05,
      "loss": 0.71,
      "num_input_tokens_seen": 18400,
      "step": 105
    },
    {
      "epoch": 0.44176706827309237,
      "grad_norm": 0.5579636693000793,
      "learning_rate": 3.456708580912725e-05,
      "loss": 0.5662,
      "num_input_tokens_seen": 19456,
      "step": 110
    },
    {
      "epoch": 0.46184738955823296,
      "grad_norm": 0.6035380959510803,
      "learning_rate": 3.292522702044221e-05,
      "loss": 0.4927,
      "num_input_tokens_seen": 20288,
      "step": 115
    },
    {
      "epoch": 0.46987951807228917,
      "eval_loss": 0.5652104020118713,
      "eval_runtime": 1.0011,
      "eval_samples_per_second": 55.939,
      "eval_steps_per_second": 27.97,
      "num_input_tokens_seen": 20656,
      "step": 117
    },
    {
      "epoch": 0.4819277108433735,
      "grad_norm": 0.4272744059562683,
      "learning_rate": 3.1244411954180676e-05,
      "loss": 0.5256,
      "num_input_tokens_seen": 21328,
      "step": 120
    },
    {
      "epoch": 0.5020080321285141,
      "grad_norm": 0.5280653238296509,
      "learning_rate": 2.9532902619507462e-05,
      "loss": 0.4829,
      "num_input_tokens_seen": 22304,
      "step": 125
    },
    {
      "epoch": 0.5220883534136547,
      "grad_norm": 0.6218326687812805,
      "learning_rate": 2.7799111902582696e-05,
      "loss": 0.5708,
      "num_input_tokens_seen": 23056,
      "step": 130
    },
    {
      "epoch": 0.5220883534136547,
      "eval_loss": 0.5374971032142639,
      "eval_runtime": 0.9697,
      "eval_samples_per_second": 57.749,
      "eval_steps_per_second": 28.875,
      "num_input_tokens_seen": 23056,
      "step": 130
    },
    {
      "epoch": 0.5421686746987951,
      "grad_norm": 0.435881644487381,
      "learning_rate": 2.6051562213206632e-05,
      "loss": 0.4772,
      "num_input_tokens_seen": 23840,
      "step": 135
    },
    {
      "epoch": 0.5622489959839357,
      "grad_norm": 0.49277955293655396,
      "learning_rate": 2.429884359310328e-05,
      "loss": 0.4855,
      "num_input_tokens_seen": 24832,
      "step": 140
    },
    {
      "epoch": 0.5742971887550201,
      "eval_loss": 0.5332380533218384,
      "eval_runtime": 1.0793,
      "eval_samples_per_second": 51.887,
      "eval_steps_per_second": 25.943,
      "num_input_tokens_seen": 25312,
      "step": 143
    },
    {
      "epoch": 0.5823293172690763,
      "grad_norm": 0.45027071237564087,
      "learning_rate": 2.2549571491760986e-05,
      "loss": 0.5654,
      "num_input_tokens_seen": 25648,
      "step": 145
    },
    {
      "epoch": 0.6024096385542169,
      "grad_norm": 0.6077672243118286,
      "learning_rate": 2.0812344417381595e-05,
      "loss": 0.5382,
      "num_input_tokens_seen": 26496,
      "step": 150
    },
    {
      "epoch": 0.6224899598393574,
      "grad_norm": 0.26088064908981323,
      "learning_rate": 1.909570167110415e-05,
      "loss": 0.5239,
      "num_input_tokens_seen": 27392,
      "step": 155
    },
    {
      "epoch": 0.6265060240963856,
      "eval_loss": 0.5172758102416992,
      "eval_runtime": 1.6299,
      "eval_samples_per_second": 34.358,
      "eval_steps_per_second": 17.179,
      "num_input_tokens_seen": 27552,
      "step": 156
    },
    {
      "epoch": 0.642570281124498,
      "grad_norm": 0.32266005873680115,
      "learning_rate": 1.7408081372259632e-05,
      "loss": 0.52,
      "num_input_tokens_seen": 28272,
      "step": 160
    },
    {
      "epoch": 0.6626506024096386,
      "grad_norm": 0.33992618322372437,
      "learning_rate": 1.5757778980982626e-05,
      "loss": 0.4772,
      "num_input_tokens_seen": 29184,
      "step": 165
    },
    {
      "epoch": 0.678714859437751,
      "eval_loss": 0.5134099721908569,
      "eval_runtime": 0.9826,
      "eval_samples_per_second": 56.991,
      "eval_steps_per_second": 28.496,
      "num_input_tokens_seen": 29984,
      "step": 169
    },
    {
      "epoch": 0.6827309236947792,
      "grad_norm": 0.40361297130584717,
      "learning_rate": 1.4152906522061048e-05,
      "loss": 0.4832,
      "num_input_tokens_seen": 30128,
      "step": 170
    },
    {
      "epoch": 0.7028112449799196,
      "grad_norm": 0.35581162571907043,
      "learning_rate": 1.2601352710458313e-05,
      "loss": 0.494,
      "num_input_tokens_seen": 30976,
      "step": 175
    },
    {
      "epoch": 0.7228915662650602,
      "grad_norm": 0.30878543853759766,
      "learning_rate": 1.1110744174509952e-05,
      "loss": 0.4958,
      "num_input_tokens_seen": 31776,
      "step": 180
    },
    {
      "epoch": 0.7309236947791165,
      "eval_loss": 0.5050697922706604,
      "eval_runtime": 1.0024,
      "eval_samples_per_second": 55.865,
      "eval_steps_per_second": 27.932,
      "num_input_tokens_seen": 32080,
      "step": 182
    },
    {
      "epoch": 0.7429718875502008,
      "grad_norm": 0.6177117228507996,
      "learning_rate": 9.688407967401248e-06,
      "loss": 0.6199,
      "num_input_tokens_seen": 32608,
      "step": 185
    },
    {
      "epoch": 0.7630522088353414,
      "grad_norm": 0.39982450008392334,
      "learning_rate": 8.341335551199902e-06,
      "loss": 0.6049,
      "num_input_tokens_seen": 33360,
      "step": 190
    },
    {
      "epoch": 0.7831325301204819,
      "grad_norm": 0.559433102607727,
      "learning_rate": 7.076148430479321e-06,
      "loss": 0.6547,
      "num_input_tokens_seen": 34176,
      "step": 195
    },
    {
      "epoch": 0.7831325301204819,
      "eval_loss": 0.5061560273170471,
      "eval_runtime": 0.9678,
      "eval_samples_per_second": 57.864,
      "eval_steps_per_second": 28.932,
      "num_input_tokens_seen": 34176,
      "step": 195
    },
    {
      "epoch": 0.8032128514056225,
      "grad_norm": 0.3360762298107147,
      "learning_rate": 5.899065604459814e-06,
      "loss": 0.4622,
      "num_input_tokens_seen": 34992,
      "step": 200
    },
    {
      "epoch": 0.8232931726907631,
      "grad_norm": 0.3794894516468048,
      "learning_rate": 4.81587299765594e-06,
      "loss": 0.6246,
      "num_input_tokens_seen": 35888,
      "step": 205
    },
    {
      "epoch": 0.8353413654618473,
      "eval_loss": 0.5011698007583618,
      "eval_runtime": 1.0745,
      "eval_samples_per_second": 52.116,
      "eval_steps_per_second": 26.058,
      "num_input_tokens_seen": 36512,
      "step": 208
    },
    {
      "epoch": 0.8433734939759037,
      "grad_norm": 0.2998324930667877,
      "learning_rate": 3.831895019292897e-06,
      "loss": 0.553,
      "num_input_tokens_seen": 36848,
      "step": 210
    },
    {
      "epoch": 0.8634538152610441,
      "grad_norm": 0.384755939245224,
      "learning_rate": 2.9519683912911266e-06,
      "loss": 0.5892,
      "num_input_tokens_seen": 37888,
      "step": 215
    },
    {
      "epoch": 0.8835341365461847,
      "grad_norm": 0.2608492970466614,
      "learning_rate": 2.1804183734670277e-06,
      "loss": 0.5174,
      "num_input_tokens_seen": 38768,
      "step": 220
    },
    {
      "epoch": 0.8875502008032129,
      "eval_loss": 0.4947091042995453,
      "eval_runtime": 1.0001,
      "eval_samples_per_second": 55.993,
      "eval_steps_per_second": 27.996,
      "num_input_tokens_seen": 38912,
      "step": 221
    },
    {
      "epoch": 0.9036144578313253,
      "grad_norm": 0.3358154296875,
      "learning_rate": 1.5210375028143097e-06,
      "loss": 0.6159,
      "num_input_tokens_seen": 39488,
      "step": 225
    },
    {
      "epoch": 0.9236947791164659,
      "grad_norm": 0.34916290640830994,
      "learning_rate": 9.770669513725128e-07,
      "loss": 0.5318,
      "num_input_tokens_seen": 40336,
      "step": 230
    },
    {
      "epoch": 0.9397590361445783,
      "eval_loss": 0.49773862957954407,
      "eval_runtime": 0.9441,
      "eval_samples_per_second": 59.319,
      "eval_steps_per_second": 29.659,
      "num_input_tokens_seen": 41120,
      "step": 234
    },
    {
      "epoch": 0.9437751004016064,
      "grad_norm": 0.3469204604625702,
      "learning_rate": 5.5118059431781e-07,
      "loss": 0.6135,
      "num_input_tokens_seen": 41328,
      "step": 235
    },
    {
      "epoch": 0.963855421686747,
      "grad_norm": 0.28425994515419006,
      "learning_rate": 2.454718665888589e-07,
      "loss": 0.6099,
      "num_input_tokens_seen": 42176,
      "step": 240
    },
    {
      "epoch": 0.9839357429718876,
      "grad_norm": 0.3837321698665619,
      "learning_rate": 6.14434726538493e-08,
      "loss": 0.445,
      "num_input_tokens_seen": 43312,
      "step": 245
    },
    {
      "epoch": 0.9919678714859438,
      "eval_loss": 0.5010460019111633,
      "eval_runtime": 1.0639,
      "eval_samples_per_second": 52.635,
      "eval_steps_per_second": 26.317,
      "num_input_tokens_seen": 43600,
      "step": 247
    },
    {
      "epoch": 1.0,
      "num_input_tokens_seen": 43904,
      "step": 249,
      "total_flos": 256382402519040.0,
      "train_loss": 0.6638821186310795,
      "train_runtime": 70.9038,
      "train_samples_per_second": 7.024,
      "train_steps_per_second": 3.512
    }
  ],
  "logging_steps": 5,
  "max_steps": 249,
  "num_input_tokens_seen": 43904,
  "num_train_epochs": 1,
  "save_steps": 13,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 256382402519040.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}