{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 185, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005405405405405406, "grad_norm": 2.6678006649017334, "learning_rate": 1.0526315789473684e-05, "loss": 1.4457, "step": 1 }, { "epoch": 0.02702702702702703, "grad_norm": 2.4659605026245117, "learning_rate": 5.2631578947368424e-05, "loss": 1.4311, "step": 5 }, { "epoch": 0.05405405405405406, "grad_norm": 2.074678659439087, "learning_rate": 0.00010526315789473685, "loss": 1.376, "step": 10 }, { "epoch": 0.08108108108108109, "grad_norm": 1.9776719808578491, "learning_rate": 0.00015789473684210527, "loss": 1.2676, "step": 15 }, { "epoch": 0.10810810810810811, "grad_norm": 1.8173789978027344, "learning_rate": 0.00019998209226697376, "loss": 1.1517, "step": 20 }, { "epoch": 0.13513513513513514, "grad_norm": 1.0727437734603882, "learning_rate": 0.0001993559947963185, "loss": 1.0512, "step": 25 }, { "epoch": 0.16216216216216217, "grad_norm": 0.8763325214385986, "learning_rate": 0.00019784091409455728, "loss": 0.9971, "step": 30 }, { "epoch": 0.1891891891891892, "grad_norm": 0.6051992774009705, "learning_rate": 0.0001954504062771555, "loss": 0.9685, "step": 35 }, { "epoch": 0.21621621621621623, "grad_norm": 0.6363440752029419, "learning_rate": 0.00019220586030376134, "loss": 0.9609, "step": 40 }, { "epoch": 0.24324324324324326, "grad_norm": 0.7076846957206726, "learning_rate": 0.00018813630660146488, "loss": 0.938, "step": 45 }, { "epoch": 0.2702702702702703, "grad_norm": 0.5835558772087097, "learning_rate": 0.00018327815731637612, "loss": 0.9376, "step": 50 }, { "epoch": 0.2972972972972973, "grad_norm": 0.6069886088371277, "learning_rate": 0.00017767488051760857, "loss": 0.9303, "step": 55 }, { "epoch": 0.32432432432432434, "grad_norm": 0.7000340819358826, "learning_rate": 0.0001713766112687139, "loss": 0.9276, "step": 60 }, { "epoch": 0.35135135135135137, "grad_norm": 0.6510019898414612, "learning_rate": 0.0001644397030464877, "loss": 0.9282, "step": 65 }, { "epoch": 0.3783783783783784, "grad_norm": 0.6287819147109985, "learning_rate": 0.00015692622352080662, "loss": 0.9125, "step": 70 }, { "epoch": 0.40540540540540543, "grad_norm": 0.6086368560791016, "learning_rate": 0.00014890339920698334, "loss": 0.9291, "step": 75 }, { "epoch": 0.43243243243243246, "grad_norm": 0.5720112919807434, "learning_rate": 0.0001404430139595877, "loss": 0.9152, "step": 80 }, { "epoch": 0.4594594594594595, "grad_norm": 0.6146399974822998, "learning_rate": 0.0001316207666896824, "loss": 0.9201, "step": 85 }, { "epoch": 0.4864864864864865, "grad_norm": 0.5874430537223816, "learning_rate": 0.00012251559405226941, "loss": 0.9071, "step": 90 }, { "epoch": 0.5135135135135135, "grad_norm": 0.609653115272522, "learning_rate": 0.00011320896416417026, "loss": 0.9111, "step": 95 }, { "epoch": 0.5405405405405406, "grad_norm": 0.5806834101676941, "learning_rate": 0.00010378414767176705, "loss": 0.9008, "step": 100 }, { "epoch": 0.5675675675675675, "grad_norm": 0.5760082602500916, "learning_rate": 9.432547269069261e-05, "loss": 0.9053, "step": 105 }, { "epoch": 0.5945945945945946, "grad_norm": 0.6656131148338318, "learning_rate": 8.491757028386263e-05, "loss": 0.9029, "step": 110 }, { "epoch": 0.6216216216216216, "grad_norm": 0.6559913754463196, "learning_rate": 7.564461722890081e-05, "loss": 0.9103, "step": 115 }, { "epoch": 0.6486486486486487, "grad_norm": 0.5813584327697754, "learning_rate": 6.658958285026102e-05, "loss": 0.9033, "step": 120 }, { "epoch": 0.6756756756756757, "grad_norm": 0.538943350315094, "learning_rate": 5.7833486654981606e-05, "loss": 0.9059, "step": 125 }, { "epoch": 0.7027027027027027, "grad_norm": 0.5927494764328003, "learning_rate": 4.945467341434195e-05, "loss": 0.9031, "step": 130 }, { "epoch": 0.7297297297297297, "grad_norm": 0.6069759726524353, "learning_rate": 4.152811217759529e-05, "loss": 0.9027, "step": 135 }, { "epoch": 0.7567567567567568, "grad_norm": 0.5718995332717896, "learning_rate": 3.4124725489820645e-05, "loss": 0.8891, "step": 140 }, { "epoch": 0.7837837837837838, "grad_norm": 0.6052721738815308, "learning_rate": 2.7310754815685624e-05, "loss": 0.8972, "step": 145 }, { "epoch": 0.8108108108108109, "grad_norm": 0.6083750128746033, "learning_rate": 2.1147167846963422e-05, "loss": 0.8948, "step": 150 }, { "epoch": 0.8378378378378378, "grad_norm": 0.5737211108207703, "learning_rate": 1.5689112996891576e-05, "loss": 0.9016, "step": 155 }, { "epoch": 0.8648648648648649, "grad_norm": 0.6462253332138062, "learning_rate": 1.0985425962260343e-05, "loss": 0.8989, "step": 160 }, { "epoch": 0.8918918918918919, "grad_norm": 0.6070159077644348, "learning_rate": 7.078192768243486e-06, "loss": 0.8913, "step": 165 }, { "epoch": 0.918918918918919, "grad_norm": 0.5784199833869934, "learning_rate": 4.002373205607723e-06, "loss": 0.8962, "step": 170 }, { "epoch": 0.9459459459459459, "grad_norm": 0.6253275871276855, "learning_rate": 1.7854880295797405e-06, "loss": 0.8815, "step": 175 }, { "epoch": 0.972972972972973, "grad_norm": 0.5931078195571899, "learning_rate": 4.4737271914411236e-07, "loss": 0.8919, "step": 180 }, { "epoch": 1.0, "grad_norm": 0.6198851466178894, "learning_rate": 0.0, "loss": 0.893, "step": 185 }, { "epoch": 1.0, "eval_loss": 1.4024296998977661, "eval_runtime": 0.674, "eval_samples_per_second": 16.321, "eval_steps_per_second": 1.484, "step": 185 }, { "epoch": 1.0, "step": 185, "total_flos": 9.060485625492275e+17, "train_loss": 0.9603648733448338, "train_runtime": 724.3175, "train_samples_per_second": 57.2, "train_steps_per_second": 0.255 } ], "logging_steps": 5, "max_steps": 185, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.060485625492275e+17, "train_batch_size": 14, "trial_name": null, "trial_params": null }