| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| """ |
| Main entry point to run the experiments. Contains general setup and the proper inference code. |
| """ |
|
|
| import argparse |
| import gc |
| import json |
| import os |
| import sys |
| import time |
| from typing import Optional |
|
|
| import bitsandbytes |
| import torch |
| import transformers |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed |
| from utils import ( |
| BenchmarkConfig, |
| BenchmarkResult, |
| BenchmarkStatus, |
| get_memory_usage, |
| init_accelerator, |
| log_results, |
| validate_experiment_path, |
| ) |
|
|
| import peft |
| from data import prepare_benchmark_prompts |
| from peft import PeftConfig, get_peft_model |
|
|
|
|
def load_base_results(model_id: str) -> Optional[dict]:
    """Load cached base-model results for ``model_id`` if they exist.

    The cache file is looked up as ``base_results/base_<name>.json`` in the
    directory containing this file, where ``<name>`` is ``model_id`` with every
    ``/`` and ``-`` replaced by ``_``.

    Args:
        model_id: Model identifier, e.g. ``"org/model-name"``.

    Returns:
        The parsed JSON content of the cache file, or ``None`` when no such
        file exists.
    """
    model_name = model_id.replace("/", "_").replace("-", "_")
    filepath = os.path.join(os.path.dirname(__file__), "base_results", f"base_{model_name}.json")

    # EAFP: open the file directly instead of checking for existence first, so a
    # file that disappears between the check and the open cannot raise.
    try:
        with open(filepath, encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        return None
|
|
|
|
def measure_inference_time(model, tokenizer, prompts, max_new_tokens, num_runs, print_fn, category_generation_params):
    """Measure generation wall-clock time for each prompt category.

    Every prompt is generated ``num_runs`` times with ``min_new_tokens`` equal to
    ``max_new_tokens``. The runs are averaged per prompt, and the per-prompt
    averages are averaged per category.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Callable returning an object with ``.to(device)`` and an
            ``"input_ids"`` entry; must expose ``pad_token_id``.
        prompts: Mapping of category name to a list of prompts.
        max_new_tokens: Default number of tokens to generate.
        num_runs: Number of timed generations per prompt (must be >= 1).
        print_fn: Callable used for progress output.
        category_generation_params: Optional mapping of category name to a dict
            that may override ``"max_new_tokens"``; ``None`` means no overrides.

    Returns:
        A dict with the keys ``"inference_times"``, ``"time_per_token"``,
        ``"generated_tokens"`` (category -> average) and ``"individual_samples"``
        (category -> list of per-prompt records). Categories without prompts are
        omitted from all four.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        # Without this guard the averaging below fails with a bare ZeroDivisionError.
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")
    category_generation_params = category_generation_params or {}

    inference_times = {}
    time_per_token = {}
    generated_tokens = {}
    individual_samples = {}

    for category, category_prompts in prompts.items():
        print_fn(f"\nMeasuring inference time for {category} prompts...")

        # The per-category token budget does not depend on the prompt, so look it up once.
        cat_max_new_tokens = category_generation_params.get(category, {}).get("max_new_tokens", max_new_tokens)
        category_samples = []

        for prompt in category_prompts:
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            prompt_length = len(inputs["input_ids"][0])
            runs = []

            for _ in range(num_runs):
                # NOTE(review): plain wall-clock timing; assumes generate() has finished
                # its device work by the time it returns -- confirm for async accelerators.
                start_time = time.perf_counter()
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=cat_max_new_tokens,
                    min_new_tokens=cat_max_new_tokens,
                    pad_token_id=tokenizer.pad_token_id,
                )
                inference_time = time.perf_counter() - start_time

                # The output contains the prompt tokens followed by the newly generated ones.
                num_tokens = len(outputs[0]) - prompt_length
                runs.append(
                    {
                        "inference_time": inference_time,
                        "generated_tokens": num_tokens,
                        "time_per_token": inference_time / num_tokens if num_tokens > 0 else 0,
                    }
                )

            category_samples.append(
                {
                    "inference_time": _mean([run["inference_time"] for run in runs]),
                    "generated_tokens": _mean([run["generated_tokens"] for run in runs]),
                    "time_per_token": _mean([run["time_per_token"] for run in runs]),
                    "individual_runs": runs,
                }
            )

        # Categories without any prompt produce no entry at all.
        if category_samples:
            inference_times[category] = _mean([sample["inference_time"] for sample in category_samples])
            generated_tokens[category] = _mean([sample["generated_tokens"] for sample in category_samples])
            time_per_token[category] = _mean([sample["time_per_token"] for sample in category_samples])
            individual_samples[category] = category_samples

    return {
        "inference_times": inference_times,
        "time_per_token": time_per_token,
        "generated_tokens": generated_tokens,
        "individual_samples": individual_samples,
    }


def _mean(values):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)
|
|
|
|
def run_benchmark(
    benchmark_config: BenchmarkConfig, experiment_name: str, experiment_path: str, print_fn=print
) -> BenchmarkResult:
    """Run the inference benchmark for one PEFT experiment and return its result.

    Steps: load the cached base-model timings, load the tokenizer and the
    (optionally quantized) base model, wrap the model with the PEFT config found
    in ``experiment_path``, time generation for every prompt category and record
    the overhead relative to the cached base timings.

    Any exception raised while benchmarking is caught, reported through
    ``print_fn`` and recorded on the result with status ``FAILED``; it is not
    re-raised to the caller.

    Args:
        benchmark_config: Benchmark settings (model id, dtype, quantization flags,
            seed, generation parameters).
        experiment_name: Name stored on the result.
        experiment_path: Directory from which the PEFT config is loaded.
        print_fn: Callable used for progress output.

    Returns:
        The populated ``BenchmarkResult``; its ``status`` is ``SUCCESS`` or
        ``FAILED``.
    """
    result = BenchmarkResult(
        experiment_name=experiment_name,
        status=BenchmarkStatus.RUNNING,
        model_id=benchmark_config.model_id,
    )
    result.save()

    start_time = time.perf_counter()
    e_main_benchmark: Optional[Exception] = None
    # Initialised up front so the run-info section below can tell whether the
    # config was loaded without inspecting locals().
    peft_config = None

    try:
        print_fn("Initializing accelerator...")
        init_accelerator()
        set_seed(benchmark_config.seed)

        # Fail fast: the cached base timings are mandatory, so check for them
        # before spending time and memory on loading the base model.
        base_results = load_base_results(benchmark_config.model_id)
        if not base_results:
            raise FileNotFoundError(
                "No cached base results found. Please run `python run_base.py` first to generate base model results."
            )
        print_fn("Using cached base model results...")
        base_inference_times = base_results["inference_results"]

        supported_dtypes = {
            "float32": torch.float32,
            "float16": torch.float16,
            "bfloat16": torch.bfloat16,
        }
        if benchmark_config.dtype not in supported_dtypes:
            raise ValueError(f"Unsupported dtype: {benchmark_config.dtype}")
        torch_dtype = supported_dtypes[benchmark_config.dtype]

        print_fn(f"Loading base model: {benchmark_config.model_id}")
        tokenizer = AutoTokenizer.from_pretrained(benchmark_config.model_id)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model_kwargs = {
            "device_map": "auto" if (torch.cuda.is_available() or torch.xpu.is_available()) else None,
            "torch_dtype": torch_dtype,
        }

        # 8-bit takes precedence when both quantization flags are set.
        if benchmark_config.use_8bit:
            model_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True
            )
        elif benchmark_config.use_4bit:
            model_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch_dtype,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            )

        base_model = AutoModelForCausalLM.from_pretrained(benchmark_config.model_id, **model_kwargs)

        print_fn("Preparing benchmark prompts...")
        prompts = prepare_benchmark_prompts(
            config=benchmark_config,
            tokenizer=tokenizer,
            max_input_length=None,
            seed=benchmark_config.seed,
        )

        print_fn(f"Loading PEFT config from {experiment_path}")
        try:
            peft_config = PeftConfig.from_pretrained(experiment_path)
            print_fn(f"Loaded PEFT config: {peft_config.peft_type}, with parameters: {vars(peft_config)}")
            model = get_peft_model(base_model, peft_config)
        except Exception as exc:
            error_msg = f"Error loading PEFT config: {exc!s}"
            print_fn(error_msg)
            # Re-raise so the run is recorded as failed with the real cause rather
            # than a later NameError on the undefined ``model``.
            raise RuntimeError(error_msg) from exc

        # Drop the local reference to the unwrapped model and release cached
        # accelerator memory before taking the memory snapshot.
        del base_model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        elif torch.xpu.is_available():
            torch.xpu.empty_cache()

        ram, accelerator_allocated, accelerator_reserved = get_memory_usage()
        result.add_memory_log("peft_model_loaded", ram, accelerator_allocated, accelerator_reserved)

        trainable_params = model.get_nb_trainable_parameters()[0]
        total_params = sum(p.numel() for p in model.parameters())
        base_params = sum(p.numel() for p in model.base_model.parameters())
        # Sizes are estimated from parameter counts and the configured dtype width.
        dtype_bytes = 2 if benchmark_config.dtype in ["float16", "bfloat16"] else 4
        adapter_size_mb = trainable_params * dtype_bytes / (1024 * 1024)
        base_model_size_mb = base_params * dtype_bytes / (1024 * 1024)
        param_ratio = trainable_params / total_params if total_params > 0 else 0

        result.update_meta_info(
            param_counts={
                "base_params": base_params,
                "trainable_params": trainable_params,
                "total_params": total_params,
                "param_ratio": param_ratio,
            },
            size_info={"base_model_size_mb": base_model_size_mb, "adapter_size_mb": adapter_size_mb},
            package_info={
                "transformers-version": transformers.__version__,
                "peft-version": peft.__version__,
                "bitsandbytes-version": bitsandbytes.__version__ if hasattr(bitsandbytes, "__version__") else None,
            },
        )

        print_fn("Measuring PEFT model inference times...")
        peft_inference_times = measure_inference_time(
            model,
            tokenizer,
            prompts,
            max_new_tokens=benchmark_config.max_new_tokens,
            num_runs=benchmark_config.num_inference_runs,
            print_fn=print_fn,
            category_generation_params=benchmark_config.category_generation_params,
        )

        peft_times = peft_inference_times["inference_times"]
        base_times = base_inference_times["inference_times"]

        for category in prompts:
            if category not in peft_times:
                # measure_inference_time omits categories that have no prompts.
                print_fn(f"No measurements for category '{category}', skipping.")
                continue
            if category not in base_times:
                raise KeyError(
                    f"Cached base results have no entry for category '{category}'. "
                    "Please re-run `python run_base.py` to regenerate base model results."
                )

            base_time = base_times[category]
            peft_time = peft_times[category]
            category_metrics = {
                "inference_time": peft_time,
                "base_inference_time": base_time,
                # Relative slowdown of the PEFT model versus the base model, in percent.
                "inference_overhead_pct": (peft_time - base_time) / base_time * 100,
                "time_per_token": peft_inference_times["time_per_token"][category],
                "generated_tokens": peft_inference_times["generated_tokens"][category],
            }
            result.add_metrics_for_category(
                category, category_metrics, individual_samples=peft_inference_times["individual_samples"][category]
            )

        # NOTE(review): the peaks are computed before the "benchmark_complete"
        # snapshot below is logged, so that snapshot is not part of the peak.
        memory_logs = result.generation_info["memory"]["memory_logs"]
        result.update_generation_info(
            memory_data={
                "peak_accelerator_memory_mb": max((log["accelerator_allocated_mb"] for log in memory_logs), default=0),
                "peak_ram_memory_mb": max((log["ram_mb"] for log in memory_logs), default=0),
            }
        )

        ram, accelerator_allocated, accelerator_reserved = get_memory_usage()
        result.add_memory_log("benchmark_complete", ram, accelerator_allocated, accelerator_reserved)

        result.status = BenchmarkStatus.SUCCESS

    except Exception as exc:
        # Top-level boundary: record the failure on the result instead of crashing.
        print_fn(f"Benchmark failed with error: {exc}")
        result.status = BenchmarkStatus.FAILED
        e_main_benchmark = exc

    end_time = time.perf_counter()
    error_message = str(e_main_benchmark) if e_main_benchmark is not None else None

    peft_config_dict = None
    if peft_config is not None:
        # Sets are converted to lists, presumably so the config can be serialized
        # by update_run_info -- TODO confirm.
        peft_config_dict = {
            key: list(value) if isinstance(value, set) else value for key, value in peft_config.to_dict().items()
        }

    result.update_run_info(
        duration=end_time - start_time,
        status=result.status,
        error=error_message,
        peft_config=peft_config_dict,
        benchmark_config=benchmark_config.to_dict(),
    )

    return result
|
|
|
|
def main() -> int:
    """Parse the command line and run the benchmark for one experiment.

    The experiment path must lie inside the directory containing this file and
    must exist; otherwise a message is printed and the run is skipped.

    Returns:
        ``0`` in all cases, suitable for passing to ``sys.exit``.
    """
    parser = argparse.ArgumentParser(description="Run PEFT method benchmarks")
    parser.add_argument("experiment_path", help="Path to experiment directory")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose output")
    args = parser.parse_args()

    def _silent(*_args, **_kwargs) -> None:
        """Drop progress output when not running verbosely."""

    print_fn = print if args.verbose else _silent

    allowed_root = os.path.abspath(os.path.dirname(__file__))
    experiment_path = os.path.abspath(args.experiment_path)

    # Compare whole path components: a plain string-prefix test would also accept
    # sibling directories such as "<root>_other".
    try:
        inside_root = os.path.commonpath([allowed_root, experiment_path]) == allowed_root
    except ValueError:
        # Raised e.g. when the two paths are on different drives.
        inside_root = False

    if not inside_root:
        print(f"Experiment path must be inside {allowed_root}, got: {experiment_path}. Skipping execution.")
        return 0
    if not os.path.exists(experiment_path):
        print(f"Experiment path not found: {experiment_path}. Skipping execution.")
        return 0

    experiment_name, benchmark_config = validate_experiment_path(experiment_path)

    print_fn(f"Running benchmark for experiment: {experiment_name}")

    result = run_benchmark(
        benchmark_config=benchmark_config,
        experiment_name=experiment_name,
        experiment_path=experiment_path,
        print_fn=print_fn,
    )

    # Results are always printed, regardless of --verbose.
    log_results(experiment_name, result, print_fn=print)
    return 0
|
|
|
|
# Script entry point: run the benchmark and use main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|