| | |
| |
|
| | from __future__ import annotations |
| |
|
| | from pathlib import Path |
| | from urllib import request |
| | import os |
| | import shlex |
| | import shutil |
| | import subprocess |
| | import sys |
| | from typing import Any, Sequence |
| | import logging |
| | import json |
| | import argparse |
| |
|
# Directory containing this script; the default locations below are relative to it.
curdir = Path(__file__).parent

# Logger shared by every step of the benchmark.
logger = logging.getLogger("bench")

# Size tags that appear in the model file names ("TriLM_<size>B_...").
MODEL_SIZES = ("1.5", "2.4", "3.9")

# Quantization types tested by default, and the subset passed to the GPU run.
ALL_TYPES = ("TQ1_0", "TQ2_0", "Q4_K_M", "Q8_0", "F16", "BF16")
GPU_TYPES = ("TQ2_0", "Q4_K_M", "Q8_0", "F16")

# Default locations; the script entry point rebinds both from the command line.
LLAMA_CPP_PATH = curdir / "."
MODEL_DIR = curdir / "bench-TriLMs-models"
| |
|
| |
|
def gather_models(sizes: Sequence[str] = MODEL_SIZES):
    """Download the source model of every requested size that is not present yet.

    Each model is stored in ``MODEL_DIR`` as ``TriLM_<size>B_Unpacked-TQ1_0-F16.gguf``.
    Files that already exist are left untouched.

    Args:
        sizes: Size tags of the models to fetch.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when a download fails.
    """
    logger.info("Gathering models")
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    for size in sizes:
        filename = f"TriLM_{size}B_Unpacked-TQ1_0-F16.gguf"
        file = MODEL_DIR / filename
        if file.exists():
            continue
        url = f"https://huggingface.co/compilade/quant-tests/resolve/main/{filename}"
        logger.info("Fetching %s from %s", filename, url)
        # Download next to the final path and rename only once complete, so an
        # interrupted transfer never leaves a truncated file that a later run
        # would mistake for a finished download.
        partial = file.with_name(filename + ".part")
        try:
            request.urlretrieve(url, partial)
            partial.replace(file)
        finally:
            # No-op after a successful rename; cleans up after a failure.
            partial.unlink(missing_ok=True)
| |
|
| |
|
def build_llama_cpp(options: Sequence[str]):
    """Configure and build the llama.cpp tools used by the benchmark.

    Runs ``cmake .. <options>`` and then ``make`` for the ``llama-bench``,
    ``llama-quantize`` and ``test-backend-ops`` targets inside
    ``LLAMA_CPP_PATH / "build"``. The working directory of this process is
    not changed. A step that fails is logged; no exception is raised for it.

    Args:
        options: Extra command-line options passed to ``cmake``.
    """
    logger.info("Building llama.cpp")
    builddir = LLAMA_CPP_PATH / "build"
    if builddir.exists():
        # Drop the previous CMake state before configuring again.
        cmake_cache = builddir / "CMakeCache.txt"
        cmake_files = builddir / "CMakeFiles"
        logger.info("Removing %s and %s", cmake_cache, cmake_files)
        cmake_cache.unlink(missing_ok=True)
        shutil.rmtree(cmake_files, ignore_errors=True)
    builddir.mkdir(exist_ok=True)

    # os.cpu_count() may return None; fall back to a single job in that case.
    jobs = os.cpu_count() or 1
    commands = (
        ["cmake", "..", *options],
        ["make", f"-j{jobs}", "llama-bench", "llama-quantize", "test-backend-ops"],
    )
    for command in commands:
        printable = shlex.join(command)
        logger.info("Running: %s", printable)
        try:
            # cwd= runs the tool in the build directory without touching the
            # working directory of this process.
            returncode = subprocess.run(command, cwd=builddir).returncode
        except OSError as err:
            logger.error("Could not run %s: %s", printable, err)
            continue
        if returncode != 0:
            logger.error("%s exited with status %d", printable, returncode)
| |
|
| |
|
def quantize(types: Sequence[str] = ALL_TYPES, sizes: Sequence[str] = MODEL_SIZES):
    """Create the quantized variants of every source model with ``llama-quantize``.

    For each size and type, ``TriLM_<size>B_Unpacked-<type>.gguf`` is produced in
    ``MODEL_DIR`` from the ``TQ1_0-F16`` source model, unless a non-empty target
    already exists. Failures are logged and the failed target file is removed,
    so the next run quantizes it again; no exception is raised for them.

    Args:
        types: Quantization type names passed to ``llama-quantize``.
        sizes: Size tags of the models to quantize.
    """
    logger.info("Make all model types we'll test")
    quantize_bin = str(LLAMA_CPP_PATH / "build" / "bin" / "llama-quantize")
    for size in sizes:
        source = MODEL_DIR / f"TriLM_{size}B_Unpacked-TQ1_0-F16.gguf"
        for ty in types:
            target = MODEL_DIR / f"TriLM_{size}B_Unpacked-{ty}.gguf"
            # An existing target counts as done unless it is an empty file.
            if target.exists() and not (
                target.is_file() and target.stat().st_size == 0
            ):
                continue
            command = [
                quantize_bin,
                "--allow-requantize",
                str(source),
                str(target),
                ty,
            ]
            logger.info("Running: %s", shlex.join(command))
            try:
                ret = subprocess.run(command).returncode
            except OSError as err:
                logger.error("Could not start %s: %s", quantize_bin, err)
                ret = -1
            if ret != 0 or (target.is_file() and target.stat().st_size == 0):
                logger.error("Failed to quantize to %s", target)
                # Remove partial output; a non-empty leftover would otherwise
                # be treated as a finished model on the next run.
                if target.is_file():
                    target.unlink()
| | |
| |
|
| |
|
def llama_bench(
    repetitions: int = 5,
    types: Sequence[str] = ALL_TYPES,
    sizes: Sequence[str] = MODEL_SIZES,
) -> list[dict[str, Any]]:
    """Run ``llama-bench`` on every model variant for several thread counts.

    Thread counts are the powers of two from 1 to 16 that do not exceed the
    number of CPUs. Each run uses ``-p 512 -n 128`` and JSON output. When a run
    fails or its output cannot be parsed, the error is logged and the remaining
    thread counts for that model are skipped.

    Args:
        repetitions: Value passed to ``llama-bench -r``.
        types: Quantization type names of the models to test.
        sizes: Size tags of the models to test.

    Returns:
        The concatenated JSON entries produced by all successful runs.
    """
    logger.info("Test each model one by one for different numbers of threads")

    # os.cpu_count() may return None when the count cannot be determined.
    cpu_count = os.cpu_count() or 1
    threads = [2**i for i in range(5) if 2**i <= cpu_count]
    logger.info("Numbers of threads to be tested: %s", threads)

    bench_bin = str(LLAMA_CPP_PATH / "build" / "bin" / "llama-bench")
    out: list[dict[str, Any]] = []

    for size in sizes:
        for ty in types:
            model_path = MODEL_DIR / f"TriLM_{size}B_Unpacked-{ty}.gguf"
            for th in threads:
                command = [
                    bench_bin,
                    "-v",
                    "-m",
                    str(model_path),
                    "-t",
                    str(th),
                    "-r",
                    str(repetitions),
                    "-p",
                    "512",
                    "-n",
                    "128",
                    "-o",
                    "json",
                ]
                printable = " ".join(command)
                logger.info("Running: %s", printable)
                result = subprocess.run(command, capture_output=True)
                logger.debug(result.stderr.decode(errors="ignore"))
                if result.returncode != 0 or not result.stdout:
                    logger.error("Failed to run %s", printable)
                    # Skip the other thread counts for this model; presumably
                    # they would fail the same way.
                    break

                try:
                    new_output = json.loads(result.stdout)
                except ValueError:
                    # Covers invalid JSON as well as undecodable bytes; keep
                    # the results gathered so far instead of aborting.
                    logger.error("Could not parse the output of %s", printable)
                    break
                logger.info(json.dumps(new_output, indent=4))
                out.extend(new_output)
    return out
| |
|
| |
|
def test_backend_perf() -> str:
    """Run ``test-backend-ops perf -o MUL_MAT`` and return its standard output.

    The output is decoded as UTF-8 and also logged at debug level. The exit
    status of the tool is not checked.
    """
    logger.info("Test MUL_MAT performance")
    binary = LLAMA_CPP_PATH / "build" / "bin" / "test-backend-ops"
    completed = subprocess.run(
        [str(binary), "perf", "-o", "MUL_MAT"],
        capture_output=True,
    )
    report = completed.stdout.decode(encoding="utf-8")
    logger.debug(report)
    return report
| |
|
| |
|
def parse_args(args: Sequence[str]):
    """Parse the command line of the benchmark script.

    Args:
        args: Full argument vector; ``args[0]`` is used as the program name
            and the remaining items are parsed as options.

    Returns:
        The ``argparse.Namespace`` holding ``gpu``, ``cpu``, ``llama_cpp_path``,
        ``model_dir``, ``repetitions``, ``out`` and ``force``.
    """
    cli = argparse.ArgumentParser(description="Benchmark ternary models", prog=args[0])
    add = cli.add_argument

    # Which backends to benchmark.
    add("--gpu", help="Run benchmarks on GPU", action="store_true")
    add("--cpu", help="Run benchmarks on CPU", action="store_true")

    # Where the llama.cpp checkout and the models live.
    add(
        "--llama-cpp-path",
        help="Path to a llama.cpp checkout",
        default=LLAMA_CPP_PATH,
        type=Path,
    )
    add(
        "--model-dir",
        help="Where the tested models will be stored",
        default=MODEL_DIR,
        type=Path,
    )

    # Benchmark parameters and result handling.
    add(
        "--repetitions",
        help="How many repetitions are run for each test",
        default=5,
        type=int,
        required=False,
    )
    add(
        "--out",
        help="Path of the benchmark results to be written",
        default=Path(os.path.curdir, "result.json"),
        type=Path,
    )
    add(
        "--force",
        help="Overwrite the result file without asking",
        action="store_true",
    )

    return cli.parse_args(args[1:])
| |
|
| |
|
if __name__ == "__main__":
    # Script entry point: parse options, run the requested benchmarks and write
    # the collected results (plus machine information) to a JSON file.
    args = parse_args(sys.argv)

    logging.basicConfig(level=logging.DEBUG)

    # Rebind the module-level locations; the functions above read these globals.
    LLAMA_CPP_PATH = args.llama_cpp_path
    MODEL_DIR = args.model_dir

    output_file = Path(args.out).absolute()

    if output_file.exists() and not args.force:
        ask = input("Result file exists. Do you want to overwrite it? [y/N]")
        if not ask.strip().lower().startswith("y"):
            logger.info("Not running, leaving output file intact")
            # sys.exit() rather than the interactive-only exit() helper that
            # the site module installs.
            sys.exit()

    if not (args.cpu or args.gpu):
        logger.warning("Neither --cpu nor --gpu was given; no benchmark will be run")

    results = []
    mulmat_perf = []
    repetitions: int = args.repetitions

    if args.cpu:
        gather_models()
        build_llama_cpp(["-DGGML_NATIVE=ON", "-DGGML_CPU=ON"])
        quantize()
        mulmat_perf.append(test_backend_perf())
        results.extend(llama_bench(repetitions=repetitions))

    if args.gpu:
        gather_models()
        build_llama_cpp(["-DGGML_NATIVE=ON", "-DGGML_CUDA=ON", "-DGGML_CUDA_F16=ON"])
        quantize()
        mulmat_perf.append(test_backend_perf())
        results.extend(llama_bench(repetitions=repetitions, types=GPU_TYPES))

    final_result: dict[str, Any] = {
        "mulmat_perf": mulmat_perf,
        "results": results,
    }

    # Attach a description of the machine when the usual tools are available.
    if shutil.which("lscpu") is not None:
        logger.info("Getting CPU info")
        final_result["cpuinfo"] = subprocess.run(
            ["lscpu"], capture_output=True
        ).stdout.decode(encoding="utf-8")

    if args.gpu and shutil.which("nvidia-smi") is not None:
        logger.info("Getting NVIDIA GPU info")
        final_result["gpuinfo"] = subprocess.run(
            ["nvidia-smi", "-q"], capture_output=True
        ).stdout.decode(encoding="utf-8")

    logger.info("Writing output to: %s", output_file)
    logger.debug("Final results: %s", json.dumps(final_result, indent=4))
    # The with-block flushes and closes the file on exit.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(final_result, f, indent=4)
| |
|