{ "best_global_step": 4000, "best_metric": 2.539825201034546, "best_model_checkpoint": "./qlora_gpt2/checkpoint-4000", "epoch": 2.2785531187695813, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05696382796923953, "grad_norm": 0.22663557529449463, "learning_rate": 3.960000000000001e-05, "loss": 3.4511, "step": 100 }, { "epoch": 0.11392765593847906, "grad_norm": 0.2373093217611313, "learning_rate": 7.960000000000001e-05, "loss": 3.068, "step": 200 }, { "epoch": 0.1708914839077186, "grad_norm": 0.25586986541748047, "learning_rate": 0.00011960000000000001, "loss": 2.9652, "step": 300 }, { "epoch": 0.22785531187695812, "grad_norm": 0.24306726455688477, "learning_rate": 0.0001596, "loss": 2.9145, "step": 400 }, { "epoch": 0.28481913984619767, "grad_norm": 0.2453053742647171, "learning_rate": 0.0001996, "loss": 2.8514, "step": 500 }, { "epoch": 0.28481913984619767, "eval_loss": 2.73831844329834, "eval_runtime": 780.3351, "eval_samples_per_second": 15.997, "eval_steps_per_second": 1.001, "step": 500 }, { "epoch": 0.3417829678154372, "grad_norm": 0.2287718951702118, "learning_rate": 0.00019584731543624163, "loss": 2.8526, "step": 600 }, { "epoch": 0.39874679578467676, "grad_norm": 0.23401789367198944, "learning_rate": 0.0001916526845637584, "loss": 2.8242, "step": 700 }, { "epoch": 0.45571062375391624, "grad_norm": 0.2420588731765747, "learning_rate": 0.0001874580536912752, "loss": 2.8295, "step": 800 }, { "epoch": 0.5126744517231558, "grad_norm": 0.24802158772945404, "learning_rate": 0.00018326342281879197, "loss": 2.7982, "step": 900 }, { "epoch": 0.5696382796923953, "grad_norm": 0.24083346128463745, "learning_rate": 0.00017906879194630872, "loss": 2.7647, "step": 1000 }, { "epoch": 0.5696382796923953, "eval_loss": 2.657050371170044, "eval_runtime": 780.7988, "eval_samples_per_second": 15.987, "eval_steps_per_second": 1.0, "step": 1000 }, { "epoch": 0.6266021076616348, "grad_norm": 0.23890583217144012, "learning_rate": 0.0001748741610738255, "loss": 2.7541, "step": 1100 }, { "epoch": 0.6835659356308744, "grad_norm": 0.2339860498905182, "learning_rate": 0.00017067953020134227, "loss": 2.7423, "step": 1200 }, { "epoch": 0.7405297636001139, "grad_norm": 0.224105566740036, "learning_rate": 0.00016648489932885908, "loss": 2.7567, "step": 1300 }, { "epoch": 0.7974935915693535, "grad_norm": 0.21676279604434967, "learning_rate": 0.00016229026845637586, "loss": 2.7369, "step": 1400 }, { "epoch": 0.854457419538593, "grad_norm": 0.22006016969680786, "learning_rate": 0.00015809563758389263, "loss": 2.7527, "step": 1500 }, { "epoch": 0.854457419538593, "eval_loss": 2.614635467529297, "eval_runtime": 780.923, "eval_samples_per_second": 15.985, "eval_steps_per_second": 1.0, "step": 1500 }, { "epoch": 0.9114212475078325, "grad_norm": 0.2208578735589981, "learning_rate": 0.0001539010067114094, "loss": 2.7038, "step": 1600 }, { "epoch": 0.9683850754770721, "grad_norm": 0.245719775557518, "learning_rate": 0.00014970637583892616, "loss": 2.7085, "step": 1700 }, { "epoch": 1.0256337225861578, "grad_norm": 0.22791120409965515, "learning_rate": 0.00014551174496644294, "loss": 2.7286, "step": 1800 }, { "epoch": 1.0825975505553973, "grad_norm": 0.2143191248178482, "learning_rate": 0.00014131711409395975, "loss": 2.6748, "step": 1900 }, { "epoch": 1.1395613785246368, "grad_norm": 0.2522701323032379, "learning_rate": 0.00013712248322147652, "loss": 2.6871, "step": 2000 }, { "epoch": 1.1395613785246368, "eval_loss": 2.5919432640075684, "eval_runtime": 777.5861, "eval_samples_per_second": 16.054, "eval_steps_per_second": 1.004, "step": 2000 }, { "epoch": 1.1965252064938765, "grad_norm": 0.21982581913471222, "learning_rate": 0.0001329278523489933, "loss": 2.644, "step": 2100 }, { "epoch": 1.253489034463116, "grad_norm": 0.19931554794311523, "learning_rate": 0.00012873322147651008, "loss": 2.6782, "step": 2200 }, { "epoch": 1.3104528624323555, "grad_norm": 0.22992636263370514, "learning_rate": 0.00012453859060402686, "loss": 2.6368, "step": 2300 }, { "epoch": 1.367416690401595, "grad_norm": 0.257996529340744, "learning_rate": 0.00012034395973154362, "loss": 2.6744, "step": 2400 }, { "epoch": 1.4243805183708345, "grad_norm": 0.23609480261802673, "learning_rate": 0.0001161493288590604, "loss": 2.662, "step": 2500 }, { "epoch": 1.4243805183708345, "eval_loss": 2.572186231613159, "eval_runtime": 779.8917, "eval_samples_per_second": 16.006, "eval_steps_per_second": 1.001, "step": 2500 }, { "epoch": 1.481344346340074, "grad_norm": 0.22605575621128082, "learning_rate": 0.00011195469798657718, "loss": 2.6789, "step": 2600 }, { "epoch": 1.5383081743093134, "grad_norm": 0.2314230501651764, "learning_rate": 0.00010776006711409397, "loss": 2.6356, "step": 2700 }, { "epoch": 1.5952720022785531, "grad_norm": 0.23274995386600494, "learning_rate": 0.00010356543624161075, "loss": 2.661, "step": 2800 }, { "epoch": 1.6522358302477926, "grad_norm": 0.20582696795463562, "learning_rate": 9.937080536912751e-05, "loss": 2.643, "step": 2900 }, { "epoch": 1.7091996582170323, "grad_norm": 0.2208004742860794, "learning_rate": 9.51761744966443e-05, "loss": 2.6328, "step": 3000 }, { "epoch": 1.7091996582170323, "eval_loss": 2.558286190032959, "eval_runtime": 779.5251, "eval_samples_per_second": 16.014, "eval_steps_per_second": 1.002, "step": 3000 }, { "epoch": 1.7661634861862718, "grad_norm": 0.23239333927631378, "learning_rate": 9.098154362416108e-05, "loss": 2.6733, "step": 3100 }, { "epoch": 1.8231273141555113, "grad_norm": 0.2152535766363144, "learning_rate": 8.678691275167785e-05, "loss": 2.6505, "step": 3200 }, { "epoch": 1.8800911421247508, "grad_norm": 0.21094359457492828, "learning_rate": 8.259228187919464e-05, "loss": 2.6153, "step": 3300 }, { "epoch": 1.9370549700939903, "grad_norm": 0.20640310645103455, "learning_rate": 7.839765100671142e-05, "loss": 2.6411, "step": 3400 }, { "epoch": 1.9940187980632298, "grad_norm": 0.2932434678077698, "learning_rate": 7.42030201342282e-05, "loss": 2.6608, "step": 3500 }, { "epoch": 1.9940187980632298, "eval_loss": 2.5474600791931152, "eval_runtime": 779.7457, "eval_samples_per_second": 16.009, "eval_steps_per_second": 1.002, "step": 3500 }, { "epoch": 2.0506978068926234, "grad_norm": 0.22819621860980988, "learning_rate": 7.000838926174496e-05, "loss": 2.6262, "step": 3600 }, { "epoch": 2.107661634861863, "grad_norm": 0.23161746561527252, "learning_rate": 6.581375838926175e-05, "loss": 2.6318, "step": 3700 }, { "epoch": 2.1646254628311024, "grad_norm": 0.26975372433662415, "learning_rate": 6.161912751677853e-05, "loss": 2.6433, "step": 3800 }, { "epoch": 2.221589290800342, "grad_norm": 0.23308990895748138, "learning_rate": 5.74244966442953e-05, "loss": 2.6375, "step": 3900 }, { "epoch": 2.2785531187695813, "grad_norm": 0.23112072050571442, "learning_rate": 5.322986577181208e-05, "loss": 2.6248, "step": 4000 }, { "epoch": 2.2785531187695813, "eval_loss": 2.539825201034546, "eval_runtime": 779.2483, "eval_samples_per_second": 16.019, "eval_steps_per_second": 1.002, "step": 4000 } ], "logging_steps": 100, "max_steps": 5268, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.439796133185454e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }