| { | |
| "best_global_step": 16500, | |
| "best_metric": 0.10550642758607864, | |
| "best_model_checkpoint": "forward_model/checkpoint-16500", | |
| "epoch": 2.4041964155617075, | |
| "eval_steps": 500, | |
| "global_step": 16500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.07285443683520326, | |
| "grad_norm": 0.4506877362728119, | |
| "learning_rate": 0.00019818228180096167, | |
| "loss": 0.5081, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.07285443683520326, | |
| "eval_loss": 0.3113231360912323, | |
| "eval_runtime": 0.5561, | |
| "eval_samples_per_second": 179.835, | |
| "eval_steps_per_second": 23.378, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.14570887367040652, | |
| "grad_norm": 0.8721633553504944, | |
| "learning_rate": 0.0001963609208800816, | |
| "loss": 0.2947, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.14570887367040652, | |
| "eval_loss": 0.2740160822868347, | |
| "eval_runtime": 0.578, | |
| "eval_samples_per_second": 173.017, | |
| "eval_steps_per_second": 22.492, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2185633105056098, | |
| "grad_norm": 0.3565625250339508, | |
| "learning_rate": 0.0001945395599592015, | |
| "loss": 0.2593, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.2185633105056098, | |
| "eval_loss": 0.24169117212295532, | |
| "eval_runtime": 0.5889, | |
| "eval_samples_per_second": 169.798, | |
| "eval_steps_per_second": 22.074, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.29141774734081305, | |
| "grad_norm": 0.30462226271629333, | |
| "learning_rate": 0.00019271819903832145, | |
| "loss": 0.2435, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.29141774734081305, | |
| "eval_loss": 0.2248290479183197, | |
| "eval_runtime": 0.5621, | |
| "eval_samples_per_second": 177.893, | |
| "eval_steps_per_second": 23.126, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3642721841760163, | |
| "grad_norm": 0.5108212232589722, | |
| "learning_rate": 0.00019089683811744136, | |
| "loss": 0.2287, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.3642721841760163, | |
| "eval_loss": 0.2162795513868332, | |
| "eval_runtime": 0.5633, | |
| "eval_samples_per_second": 177.534, | |
| "eval_steps_per_second": 23.079, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.4371266210112196, | |
| "grad_norm": 0.5142472386360168, | |
| "learning_rate": 0.0001890754771965613, | |
| "loss": 0.2197, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.4371266210112196, | |
| "eval_loss": 0.20293699204921722, | |
| "eval_runtime": 0.5931, | |
| "eval_samples_per_second": 168.6, | |
| "eval_steps_per_second": 21.918, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5099810578464229, | |
| "grad_norm": 0.39115020632743835, | |
| "learning_rate": 0.0001872541162756812, | |
| "loss": 0.2112, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5099810578464229, | |
| "eval_loss": 0.1898386925458908, | |
| "eval_runtime": 0.5913, | |
| "eval_samples_per_second": 169.122, | |
| "eval_steps_per_second": 21.986, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5828354946816261, | |
| "grad_norm": 0.5981117486953735, | |
| "learning_rate": 0.0001854327553548011, | |
| "loss": 0.2016, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5828354946816261, | |
| "eval_loss": 0.1830490082502365, | |
| "eval_runtime": 0.583, | |
| "eval_samples_per_second": 171.516, | |
| "eval_steps_per_second": 22.297, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6556899315168294, | |
| "grad_norm": 0.3978489339351654, | |
| "learning_rate": 0.00018361139443392105, | |
| "loss": 0.1961, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.6556899315168294, | |
| "eval_loss": 0.17647209763526917, | |
| "eval_runtime": 0.5656, | |
| "eval_samples_per_second": 176.789, | |
| "eval_steps_per_second": 22.983, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.7285443683520326, | |
| "grad_norm": 0.41381800174713135, | |
| "learning_rate": 0.00018179003351304095, | |
| "loss": 0.1908, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.7285443683520326, | |
| "eval_loss": 0.16872741281986237, | |
| "eval_runtime": 0.5658, | |
| "eval_samples_per_second": 176.756, | |
| "eval_steps_per_second": 22.978, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8013988051872359, | |
| "grad_norm": 0.434097021818161, | |
| "learning_rate": 0.0001799686725921609, | |
| "loss": 0.1881, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.8013988051872359, | |
| "eval_loss": 0.17042766511440277, | |
| "eval_runtime": 0.559, | |
| "eval_samples_per_second": 178.906, | |
| "eval_steps_per_second": 23.258, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.8742532420224391, | |
| "grad_norm": 0.534040093421936, | |
| "learning_rate": 0.00017814731167128077, | |
| "loss": 0.1742, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.8742532420224391, | |
| "eval_loss": 0.1600825935602188, | |
| "eval_runtime": 0.5648, | |
| "eval_samples_per_second": 177.049, | |
| "eval_steps_per_second": 23.016, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.9471076788576425, | |
| "grad_norm": 0.39319896697998047, | |
| "learning_rate": 0.0001763259507504007, | |
| "loss": 0.173, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.9471076788576425, | |
| "eval_loss": 0.149684339761734, | |
| "eval_runtime": 0.5762, | |
| "eval_samples_per_second": 173.56, | |
| "eval_steps_per_second": 22.563, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.0199621156928458, | |
| "grad_norm": 0.31258702278137207, | |
| "learning_rate": 0.00017450458982952062, | |
| "loss": 0.1699, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.0199621156928458, | |
| "eval_loss": 0.15212562680244446, | |
| "eval_runtime": 0.5619, | |
| "eval_samples_per_second": 177.961, | |
| "eval_steps_per_second": 23.135, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.0928165525280489, | |
| "grad_norm": 0.4483049511909485, | |
| "learning_rate": 0.00017268322890864055, | |
| "loss": 0.1621, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.0928165525280489, | |
| "eval_loss": 0.14983947575092316, | |
| "eval_runtime": 0.5707, | |
| "eval_samples_per_second": 175.22, | |
| "eval_steps_per_second": 22.779, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.1656709893632522, | |
| "grad_norm": 0.4742591083049774, | |
| "learning_rate": 0.00017086186798776046, | |
| "loss": 0.1614, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.1656709893632522, | |
| "eval_loss": 0.14483533799648285, | |
| "eval_runtime": 0.564, | |
| "eval_samples_per_second": 177.314, | |
| "eval_steps_per_second": 23.051, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.2385254261984555, | |
| "grad_norm": 0.2674083113670349, | |
| "learning_rate": 0.00016904050706688037, | |
| "loss": 0.1566, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.2385254261984555, | |
| "eval_loss": 0.1402028650045395, | |
| "eval_runtime": 0.5685, | |
| "eval_samples_per_second": 175.899, | |
| "eval_steps_per_second": 22.867, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.3113798630336588, | |
| "grad_norm": 0.46651679277420044, | |
| "learning_rate": 0.0001672191461460003, | |
| "loss": 0.1543, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.3113798630336588, | |
| "eval_loss": 0.13286742568016052, | |
| "eval_runtime": 0.5592, | |
| "eval_samples_per_second": 178.818, | |
| "eval_steps_per_second": 23.246, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.384234299868862, | |
| "grad_norm": 0.47738003730773926, | |
| "learning_rate": 0.0001653977852251202, | |
| "loss": 0.1496, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.384234299868862, | |
| "eval_loss": 0.13214418292045593, | |
| "eval_runtime": 0.5666, | |
| "eval_samples_per_second": 176.48, | |
| "eval_steps_per_second": 22.942, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.4570887367040652, | |
| "grad_norm": 0.27837279438972473, | |
| "learning_rate": 0.00016357642430424015, | |
| "loss": 0.1514, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.4570887367040652, | |
| "eval_loss": 0.12786731123924255, | |
| "eval_runtime": 0.5651, | |
| "eval_samples_per_second": 176.951, | |
| "eval_steps_per_second": 23.004, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.5299431735392686, | |
| "grad_norm": 0.3466807007789612, | |
| "learning_rate": 0.00016175506338336006, | |
| "loss": 0.1459, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.5299431735392686, | |
| "eval_loss": 0.12736321985721588, | |
| "eval_runtime": 0.5602, | |
| "eval_samples_per_second": 178.504, | |
| "eval_steps_per_second": 23.205, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.6027976103744717, | |
| "grad_norm": 0.32600516080856323, | |
| "learning_rate": 0.00015993370246247996, | |
| "loss": 0.1446, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.6027976103744717, | |
| "eval_loss": 0.12436921149492264, | |
| "eval_runtime": 0.5633, | |
| "eval_samples_per_second": 177.529, | |
| "eval_steps_per_second": 23.079, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.6756520472096752, | |
| "grad_norm": 0.26724693179130554, | |
| "learning_rate": 0.00015811234154159987, | |
| "loss": 0.1408, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.6756520472096752, | |
| "eval_loss": 0.12056704610586166, | |
| "eval_runtime": 0.5598, | |
| "eval_samples_per_second": 178.631, | |
| "eval_steps_per_second": 23.222, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.7485064840448783, | |
| "grad_norm": 0.5725470185279846, | |
| "learning_rate": 0.0001562909806207198, | |
| "loss": 0.1411, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.7485064840448783, | |
| "eval_loss": 0.11899165064096451, | |
| "eval_runtime": 0.5579, | |
| "eval_samples_per_second": 179.25, | |
| "eval_steps_per_second": 23.302, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.8213609208800816, | |
| "grad_norm": 0.5043156743049622, | |
| "learning_rate": 0.00015446961969983972, | |
| "loss": 0.1405, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.8213609208800816, | |
| "eval_loss": 0.11793835461139679, | |
| "eval_runtime": 0.5784, | |
| "eval_samples_per_second": 172.898, | |
| "eval_steps_per_second": 22.477, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.894215357715285, | |
| "grad_norm": 0.3471129536628723, | |
| "learning_rate": 0.00015264825877895965, | |
| "loss": 0.1362, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.894215357715285, | |
| "eval_loss": 0.11688660830259323, | |
| "eval_runtime": 0.5574, | |
| "eval_samples_per_second": 179.401, | |
| "eval_steps_per_second": 23.322, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.967069794550488, | |
| "grad_norm": 0.3712925910949707, | |
| "learning_rate": 0.0001508268978580796, | |
| "loss": 0.1345, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.967069794550488, | |
| "eval_loss": 0.11329545080661774, | |
| "eval_runtime": 0.5595, | |
| "eval_samples_per_second": 178.744, | |
| "eval_steps_per_second": 23.237, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.0399242313856916, | |
| "grad_norm": 0.33280235528945923, | |
| "learning_rate": 0.00014900553693719947, | |
| "loss": 0.1288, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.0399242313856916, | |
| "eval_loss": 0.1130962073802948, | |
| "eval_runtime": 0.5627, | |
| "eval_samples_per_second": 177.721, | |
| "eval_steps_per_second": 23.104, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.1127786682208947, | |
| "grad_norm": 0.4070642590522766, | |
| "learning_rate": 0.0001471841760163194, | |
| "loss": 0.1301, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.1127786682208947, | |
| "eval_loss": 0.11047343909740448, | |
| "eval_runtime": 0.5653, | |
| "eval_samples_per_second": 176.895, | |
| "eval_steps_per_second": 22.996, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.1856331050560978, | |
| "grad_norm": 0.31729069352149963, | |
| "learning_rate": 0.0001453628150954393, | |
| "loss": 0.1269, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.1856331050560978, | |
| "eval_loss": 0.11027682572603226, | |
| "eval_runtime": 0.5874, | |
| "eval_samples_per_second": 170.237, | |
| "eval_steps_per_second": 22.131, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.2584875418913013, | |
| "grad_norm": 0.265652596950531, | |
| "learning_rate": 0.00014354145417455925, | |
| "loss": 0.1247, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.2584875418913013, | |
| "eval_loss": 0.10951703786849976, | |
| "eval_runtime": 0.5678, | |
| "eval_samples_per_second": 176.131, | |
| "eval_steps_per_second": 22.897, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.3313419787265044, | |
| "grad_norm": 0.4715974032878876, | |
| "learning_rate": 0.00014172009325367916, | |
| "loss": 0.124, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.3313419787265044, | |
| "eval_loss": 0.10613198578357697, | |
| "eval_runtime": 0.5536, | |
| "eval_samples_per_second": 180.629, | |
| "eval_steps_per_second": 23.482, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.4041964155617075, | |
| "grad_norm": 0.3433252274990082, | |
| "learning_rate": 0.00013989873233279907, | |
| "loss": 0.1247, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.4041964155617075, | |
| "eval_loss": 0.10550642758607864, | |
| "eval_runtime": 0.5616, | |
| "eval_samples_per_second": 178.071, | |
| "eval_steps_per_second": 23.149, | |
| "step": 16500 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 54904, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 8, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 3, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4466279448576000.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |