{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1569197239375226, "eval_steps": 100, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02905920813657828, "grad_norm": 1.8251163959503174, "learning_rate": 1.2903225806451613e-05, "loss": 2.6703, "num_input_tokens_seen": 686720, "step": 5, "train_runtime": 395.6486, "train_tokens_per_second": 1735.682 }, { "epoch": 0.05811841627315656, "grad_norm": 1.6785106658935547, "learning_rate": 2.9032258064516133e-05, "loss": 2.6024, "num_input_tokens_seen": 1373472, "step": 10, "train_runtime": 791.2815, "train_tokens_per_second": 1735.757 }, { "epoch": 0.08717762440973484, "grad_norm": 1.5060399770736694, "learning_rate": 4.516129032258064e-05, "loss": 2.2899, "num_input_tokens_seen": 2061472, "step": 15, "train_runtime": 1188.0329, "train_tokens_per_second": 1735.198 }, { "epoch": 0.11623683254631312, "grad_norm": 0.8076892495155334, "learning_rate": 6.129032258064517e-05, "loss": 1.7635, "num_input_tokens_seen": 2748800, "step": 20, "train_runtime": 1583.7612, "train_tokens_per_second": 1735.615 }, { "epoch": 0.1452960406828914, "grad_norm": 0.5285284519195557, "learning_rate": 7.741935483870968e-05, "loss": 1.4089, "num_input_tokens_seen": 3436480, "step": 25, "train_runtime": 1980.3723, "train_tokens_per_second": 1735.27 }, { "epoch": 0.17435524881946968, "grad_norm": 0.4370571970939636, "learning_rate": 9.35483870967742e-05, "loss": 1.2673, "num_input_tokens_seen": 4122560, "step": 30, "train_runtime": 2375.5923, "train_tokens_per_second": 1735.382 }, { "epoch": 0.20341445695604796, "grad_norm": 0.3576990067958832, "learning_rate": 9.99906754234138e-05, "loss": 1.1446, "num_input_tokens_seen": 4810240, "step": 35, "train_runtime": 2771.8387, "train_tokens_per_second": 1735.397 }, { "epoch": 0.23247366509262624, "grad_norm": 0.2466888129711151, "learning_rate": 9.993370449424153e-05, "loss": 1.0947, "num_input_tokens_seen": 5498080, "step": 40, "train_runtime": 3168.1072, "train_tokens_per_second": 1735.446 }, { "epoch": 0.2615328732292045, "grad_norm": 0.23221422731876373, "learning_rate": 9.982500190692845e-05, "loss": 1.0456, "num_input_tokens_seen": 6187968, "step": 45, "train_runtime": 3566.2467, "train_tokens_per_second": 1735.149 }, { "epoch": 0.2905920813657828, "grad_norm": 0.26600706577301025, "learning_rate": 9.966468027809582e-05, "loss": 1.0029, "num_input_tokens_seen": 6875040, "step": 50, "train_runtime": 3962.2914, "train_tokens_per_second": 1735.117 }, { "epoch": 0.31965128950236105, "grad_norm": 0.24320659041404724, "learning_rate": 9.945290570204359e-05, "loss": 0.974, "num_input_tokens_seen": 7561952, "step": 55, "train_runtime": 4357.8907, "train_tokens_per_second": 1735.232 }, { "epoch": 0.34871049763893935, "grad_norm": 0.22472068667411804, "learning_rate": 9.918989757867583e-05, "loss": 0.944, "num_input_tokens_seen": 8248800, "step": 60, "train_runtime": 4753.5334, "train_tokens_per_second": 1735.299 }, { "epoch": 0.3777697057755176, "grad_norm": 0.268387109041214, "learning_rate": 9.88759283862006e-05, "loss": 0.9328, "num_input_tokens_seen": 8937280, "step": 65, "train_runtime": 5150.3485, "train_tokens_per_second": 1735.277 }, { "epoch": 0.4068289139120959, "grad_norm": 0.21440809965133667, "learning_rate": 9.851132339884096e-05, "loss": 0.9074, "num_input_tokens_seen": 9625248, "step": 70, "train_runtime": 5546.5992, "train_tokens_per_second": 1735.342 }, { "epoch": 0.43588812204867416, "grad_norm": 0.228573739528656, "learning_rate": 9.80964603498485e-05, "loss": 0.8937, "num_input_tokens_seen": 10312960, "step": 75, "train_runtime": 5943.0082, "train_tokens_per_second": 1735.31 }, { "epoch": 0.46494733018525247, "grad_norm": 0.21550454199314117, "learning_rate": 9.763176904016913e-05, "loss": 0.8789, "num_input_tokens_seen": 11001696, "step": 80, "train_runtime": 6340.0803, "train_tokens_per_second": 1735.261 }, { "epoch": 0.4940065383218307, "grad_norm": 0.23164565861225128, "learning_rate": 9.711773089316645e-05, "loss": 0.8684, "num_input_tokens_seen": 11688192, "step": 85, "train_runtime": 6735.2127, "train_tokens_per_second": 1735.386 }, { "epoch": 0.523065746458409, "grad_norm": 0.2465435415506363, "learning_rate": 9.655487845586377e-05, "loss": 0.8422, "num_input_tokens_seen": 12375296, "step": 90, "train_runtime": 7131.144, "train_tokens_per_second": 1735.387 }, { "epoch": 0.5521249545949873, "grad_norm": 0.24671192467212677, "learning_rate": 9.594379484722184e-05, "loss": 0.8408, "num_input_tokens_seen": 13063552, "step": 95, "train_runtime": 7528.0327, "train_tokens_per_second": 1735.321 }, { "epoch": 0.5811841627315656, "grad_norm": 0.2649816572666168, "learning_rate": 9.528511315402358e-05, "loss": 0.8422, "num_input_tokens_seen": 13751648, "step": 100, "train_runtime": 7924.8612, "train_tokens_per_second": 1735.254 }, { "epoch": 0.5811841627315656, "eval_loss": 0.8274134397506714, "eval_runtime": 872.0056, "eval_samples_per_second": 6.314, "eval_steps_per_second": 1.579, "num_input_tokens_seen": 13751648, "step": 100 }, { "epoch": 0.6102433708681438, "grad_norm": 0.2663179636001587, "learning_rate": 9.457951577499187e-05, "loss": 0.8217, "num_input_tokens_seen": 14438496, "step": 105, "train_runtime": 9194.7951, "train_tokens_per_second": 1570.29 }, { "epoch": 0.6393025790047221, "grad_norm": 0.2964800000190735, "learning_rate": 9.382773371381985e-05, "loss": 0.8018, "num_input_tokens_seen": 15126496, "step": 110, "train_runtime": 9591.6416, "train_tokens_per_second": 1577.05 }, { "epoch": 0.6683617871413003, "grad_norm": 0.28969624638557434, "learning_rate": 9.303054582184609e-05, "loss": 0.8072, "num_input_tokens_seen": 15815136, "step": 115, "train_runtime": 9989.0582, "train_tokens_per_second": 1583.246 }, { "epoch": 0.6974209952778787, "grad_norm": 0.30194368958473206, "learning_rate": 9.218877799115928e-05, "loss": 0.8014, "num_input_tokens_seen": 16503360, "step": 120, "train_runtime": 10386.1584, "train_tokens_per_second": 1588.976 }, { "epoch": 0.726480203414457, "grad_norm": 0.2715190052986145, "learning_rate": 9.130330229896847e-05, "loss": 0.7902, "num_input_tokens_seen": 17190176, "step": 125, "train_runtime": 10782.1528, "train_tokens_per_second": 1594.318 }, { "epoch": 0.7555394115510352, "grad_norm": 0.2829165756702423, "learning_rate": 9.037503610412501e-05, "loss": 0.7874, "num_input_tokens_seen": 17877120, "step": 130, "train_runtime": 11178.1048, "train_tokens_per_second": 1599.298 }, { "epoch": 0.7845986196876135, "grad_norm": 0.3267139196395874, "learning_rate": 8.940494109673265e-05, "loss": 0.7963, "num_input_tokens_seen": 18563488, "step": 135, "train_runtime": 11573.6201, "train_tokens_per_second": 1603.948 }, { "epoch": 0.8136578278241918, "grad_norm": 0.31520357728004456, "learning_rate": 8.839402230183e-05, "loss": 0.7822, "num_input_tokens_seen": 19253216, "step": 140, "train_runtime": 11971.6869, "train_tokens_per_second": 1608.229 }, { "epoch": 0.8427170359607701, "grad_norm": 0.30459001660346985, "learning_rate": 8.734332703817771e-05, "loss": 0.7859, "num_input_tokens_seen": 19941568, "step": 145, "train_runtime": 12368.4401, "train_tokens_per_second": 1612.294 }, { "epoch": 0.8717762440973483, "grad_norm": 0.32623955607414246, "learning_rate": 8.625394383322914e-05, "loss": 0.7687, "num_input_tokens_seen": 20629312, "step": 150, "train_runtime": 12764.6653, "train_tokens_per_second": 1616.126 }, { "epoch": 0.9008354522339266, "grad_norm": 0.32089152932167053, "learning_rate": 8.512700129540847e-05, "loss": 0.7672, "num_input_tokens_seen": 21315136, "step": 155, "train_runtime": 13160.0163, "train_tokens_per_second": 1619.689 }, { "epoch": 0.9298946603705049, "grad_norm": 0.3055724799633026, "learning_rate": 8.396366694486466e-05, "loss": 0.7639, "num_input_tokens_seen": 22002976, "step": 160, "train_runtime": 13557.0617, "train_tokens_per_second": 1622.99 }, { "epoch": 0.9589538685070832, "grad_norm": 0.30428361892700195, "learning_rate": 8.276514600391272e-05, "loss": 0.7617, "num_input_tokens_seen": 22690560, "step": 165, "train_runtime": 13953.7665, "train_tokens_per_second": 1626.124 }, { "epoch": 0.9880130766436614, "grad_norm": 0.3108614981174469, "learning_rate": 8.153268014841506e-05, "loss": 0.7613, "num_input_tokens_seen": 23378048, "step": 170, "train_runtime": 14350.762, "train_tokens_per_second": 1629.046 }, { "epoch": 1.0116236832546313, "grad_norm": 0.3532414436340332, "learning_rate": 8.026754622139691e-05, "loss": 0.7645, "num_input_tokens_seen": 23937248, "step": 175, "train_runtime": 14673.5871, "train_tokens_per_second": 1631.315 }, { "epoch": 1.0406828913912096, "grad_norm": 0.32768887281417847, "learning_rate": 7.897105491022818e-05, "loss": 0.7563, "num_input_tokens_seen": 24623744, "step": 180, "train_runtime": 15069.3557, "train_tokens_per_second": 1634.028 }, { "epoch": 1.069742099527788, "grad_norm": 0.310390830039978, "learning_rate": 7.764454938874252e-05, "loss": 0.7389, "num_input_tokens_seen": 25312576, "step": 185, "train_runtime": 15466.8535, "train_tokens_per_second": 1636.569 }, { "epoch": 1.0988013076643661, "grad_norm": 0.3332880139350891, "learning_rate": 7.628940392569994e-05, "loss": 0.7544, "num_input_tokens_seen": 25999584, "step": 190, "train_runtime": 15863.0121, "train_tokens_per_second": 1639.007 }, { "epoch": 1.1278605158009445, "grad_norm": 0.3539334237575531, "learning_rate": 7.490702246103513e-05, "loss": 0.7455, "num_input_tokens_seen": 26685632, "step": 195, "train_runtime": 16258.3382, "train_tokens_per_second": 1641.351 }, { "epoch": 1.1569197239375226, "grad_norm": 0.3256574869155884, "learning_rate": 7.3498837151366e-05, "loss": 0.7465, "num_input_tokens_seen": 27371456, "step": 200, "train_runtime": 16653.8315, "train_tokens_per_second": 1643.553 }, { "epoch": 1.1569197239375226, "eval_loss": 0.7508572340011597, "eval_runtime": 873.0587, "eval_samples_per_second": 6.307, "eval_steps_per_second": 1.577, "num_input_tokens_seen": 27371456, "step": 200 } ], "logging_steps": 5, "max_steps": 519, "num_input_tokens_seen": 27371456, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2501177571560653e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }