|
|
""" |
|
|
This is a script for launching PyTorch inference on Intel(R) Xeon(R) Scalable Processors with optimal configurations. |
|
|
Single instance inference, multi-instance inference are enabled. |
|
|
|
|
|
Note: The term "instance" here doesn't refer to a cloud instance. This script is executed as a single process. It invokes
multiple "instances", each of which is formed from multiple threads. An "instance" is a group of threads in this
context.
|
|
|
|
|
Illustrated as below: |
|
|
|
|
|
:: |
|
|
|
|
|
+-----------------------------+----------------------+-------+ |
|
|
| process | thread | core | |
|
|
+=============================+======================+=======+ |
|
|
| torch.backends.xeon.run_cpu | instance 0: thread 0 | 0 | |
|
|
| | thread 1 | 1 | |
|
|
| +----------------------+-------+ |
|
|
| | instance 1: thread 0 | 2 | |
|
|
| | thread 1 | 3 | |
|
|
| +----------------------+-------+ |
|
|
| | ... | ... | |
|
|
| +----------------------+-------+ |
|
|
| | instance N: thread 0 | M | |
|
|
| | thread 1 | M+1 | |
|
|
+-----------------------------+----------------------+-------+ |
|
|
|
|
|
To get the peak performance on Intel(R) Xeon(R) Scalable Processors, the script optimizes the configuration of thread and memory |
|
|
management. For thread management, the script configures thread affinity and the preload of Intel OMP library. |
|
|
For memory management, it configures NUMA binding and preload optimized memory allocation library (e.g. tcmalloc, jemalloc). |
|
|
|
|
|
Environment variables that will be set by this script: |
|
|
|
|
|
+------------------+-------------------------------------------------------------------------------------------------+ |
|
|
| Environ Variable | Value | |
|
|
+==================+=================================================================================================+ |
|
|
| LD_PRELOAD | Depending on knobs you set, <lib>/libiomp5.so, <lib>/libjemalloc.so, <lib>/libtcmalloc.so might | |
|
|
| | be appended to LD_PRELOAD. | |
|
|
+------------------+-------------------------------------------------------------------------------------------------+ |
|
|
| KMP_AFFINITY | If libiomp5.so is preloaded, KMP_AFFINITY could be set to "granularity=fine,compact,1,0". | |
|
|
+------------------+-------------------------------------------------------------------------------------------------+ |
|
|
| KMP_BLOCKTIME | If libiomp5.so is preloaded, KMP_BLOCKTIME is set to "1". | |
|
|
+------------------+-------------------------------------------------------------------------------------------------+ |
|
|
| OMP_NUM_THREADS | value of ncores_per_instance | |
|
|
+------------------+-------------------------------------------------------------------------------------------------+ |
|
|
| MALLOC_CONF | If libjemalloc.so is preloaded, MALLOC_CONF will be set to | |
|
|
| | "oversize_threshold:1,background_thread:true,metadata_thp:auto". | |
|
|
+------------------+-------------------------------------------------------------------------------------------------+ |
|
|
|
|
|
*Note*: This script respects environment variables set preliminarily. I.e. If you set the environment variables |
|
|
mentioned above before running the script, the script will not overwrite the values in the script. |
|
|
|
|
|
How to use this module: |
|
|
~~~~~~~~~~~~~~~~~~~~~~~ |
|
|
|
|
|
Single instance inference |
|
|
------------------------- |
|
|
|
|
|
1. Run single-instance inference on a single node with all CPU nodes. |
|
|
|
|
|
:: |
|
|
|
|
|
python -m torch.backends.xeon.run_cpu --throughput_mode script.py args |
|
|
|
|
|
2. Run single-instance inference on a single CPU node. |
|
|
|
|
|
:: |
|
|
|
|
|
python -m torch.backends.xeon.run_cpu --node_id 1 script.py args |
|
|
|
|
|
Multi-instance inference |
|
|
------------------------ |
|
|
|
|
|
1. Multi-instance |
|
|
By default this tool runs one process per node. If you want to set the instance numbers and core per instance, |
|
|
--ninstances and --ncores_per_instance should be set. |
|
|
|
|
|
:: |
|
|
|
|
|
python -m torch.backends.xeon.run_cpu -- python_script args |
|
|
|
|
|
eg: on an Intel(R) Xeon(R) Scalable Processor with 14 instances, 4 cores per instance
|
|
|
|
|
:: |
|
|
|
|
|
python -m torch.backends.xeon.run_cpu --ninstances 14 --ncores_per_instance 4 python_script args |
|
|
|
|
|
2. Run single-instance inference among multiple instances. |
|
|
By default, runs all ninstances. If you want to independently run a single instance among ninstances, specify rank. |
|
|
|
|
|
eg: run 0th instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances (i.e., numactl -C 0-27)
|
|
|
|
|
:: |
|
|
|
|
|
python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 0 python_script args |
|
|
|
|
|
eg: run 1st instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances (i.e., numactl -C 28-55)
|
|
|
|
|
:: |
|
|
|
|
|
python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 1 python_script args |
|
|
|
|
|
eg: run 0th instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances, 2 cores per instance,
first four cores (i.e., numactl -C 0-1)
|
|
|
|
|
:: |
|
|
|
|
|
python -m torch.backends.xeon.run_cpu --core_list "0, 1, 2, 3" --ninstances 2 --ncores_per_instance 2 |
|
|
--rank 0 python_script args |
|
|
|
|
|
3. To look up what optional arguments this module offers: |
|
|
|
|
|
:: |
|
|
|
|
|
python -m torch.backends.xeon.run_cpu --help |
|
|
|
|
|
Memory allocator |
|
|
---------------- |
|
|
|
|
|
"--enable_tcmalloc" and "--enable_jemalloc" can be used to enable a different memory allocator.
|
|
|
|
|
""" |
|
|
|
|
|
import sys |
|
|
import platform |
|
|
import subprocess |
|
|
import os |
|
|
from os.path import expanduser |
|
|
import re |
|
|
import glob |
|
|
from argparse import ArgumentParser, REMAINDER |
|
|
from argparse import RawTextHelpFormatter |
|
|
import logging |
|
|
from torch.distributed.elastic.multiprocessing import Std, start_processes |
|
|
from typing import List, Dict |
|
|
|
|
|
# Layout of every log record: timestamp, logger name, severity, then the message.
format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# Configure the root logger at import time so INFO-level launcher messages are shown.
logging.basicConfig(format=format_str, level=logging.INFO)
# Module-level logger used throughout this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
class _CPUinfo():
    """
    Get CPU information, such as core lists and NUMA node membership.

    The topology is parsed from the output of ``lscpu --parse=CPU,Core,Socket,Node``
    (or from ``test_input``, a string in the same format, when it is non-empty).

    Attributes populated on Linux:
        cpuinfo: parsed rows, each a list ``[cpu, core, socket, node]`` of strings
            (``node`` is ``""`` when lscpu reports no NUMA node for the row).
        node_nums: highest node id found + 1.
        node_physical_cores: for each node, the distinct ``Core`` ids in first-seen order.
        node_logical_cores: for each node, the ``CPU`` ids in listing order.
        physical_core_node_map: ``Core`` id -> node id.
        logical_core_node_map: ``CPU`` id -> node id.
    """

    def __init__(self, test_input=""):
        """
        Parse the CPU topology.

        Args:
            test_input: optional lscpu-style text used instead of running ``lscpu``.

        Raises:
            RuntimeError: on Windows or macOS, which are not supported.
            ValueError: if no topology row can be parsed from the input.
        """
        self.cpuinfo = []
        if platform.system() in ["Windows", "Darwin"]:
            raise RuntimeError(f"{platform.system()} is not supported!!!")
        elif platform.system() == "Linux":
            if test_input == "":
                lscpu_cmd = ["lscpu", "--parse=CPU,Core,Socket,Node"]
                lscpu_info = subprocess.check_output(lscpu_cmd, universal_newlines=True).split("\n")
            else:
                lscpu_info = test_input.split("\n")

            # The node column is ``\d*``: it may be empty, and it may have several digits.
            # (A single optional digit would silently truncate node ids >= 10.)
            row_pattern = re.compile(r"^(\d+),(\d+),(\d+),(\d*)")
            for line in lscpu_info:
                regex_out = row_pattern.match(line)
                if regex_out:
                    self.cpuinfo.append(list(regex_out.groups()))

            if not self.cpuinfo:
                raise ValueError("no CPU topology rows could be parsed from the lscpu output")

            # Compare node ids numerically; an empty node column counts as node 0.
            self.node_nums = max(self._row_node_id(row) for row in self.cpuinfo) + 1
            self.node_physical_cores: List[List[int]] = [[] for _ in range(self.node_nums)]
            self.node_logical_cores: List[List[int]] = [[] for _ in range(self.node_nums)]
            self.physical_core_node_map = {}
            self.logical_core_node_map = {}

            # Single pass over the rows; a per-node set keeps the duplicate check O(1).
            seen_physical = [set() for _ in range(self.node_nums)]
            for row in self.cpuinfo:
                node_id = self._row_node_id(row)
                logical_id, physical_id = int(row[0]), int(row[1])
                if physical_id not in seen_physical[node_id]:
                    seen_physical[node_id].add(physical_id)
                    self.node_physical_cores[node_id].append(physical_id)
                self.node_logical_cores[node_id].append(logical_id)

            # Fill the maps node by node so that, if an id shows up under several
            # nodes, the highest node id wins (same result as a node-major scan).
            for node_id in range(self.node_nums):
                for core in self.node_physical_cores[node_id]:
                    self.physical_core_node_map[core] = node_id
                for core in self.node_logical_cores[node_id]:
                    self.logical_core_node_map[core] = node_id

    @staticmethod
    def _row_node_id(row):
        """Return the NUMA node id of a parsed row, treating an empty node column as node 0."""
        return int(row[3]) if row[3] != "" else 0

    def _check_node_id(self, node_id):
        """Raise ValueError unless ``node_id`` is within ``[0, node_nums - 1]``."""
        if node_id < 0 or node_id > self.node_nums - 1:
            raise ValueError(f"Invalid node id: {node_id}. Valid node ids: {list(range(len(self.node_physical_cores)))}")

    def _physical_core_nums(self):
        """Return the total number of physical cores over all nodes."""
        # Summing per node stays correct when nodes do not all have the same core count.
        return sum(len(cores) for cores in self.node_physical_cores)

    def _logical_core_nums(self):
        """Return the total number of logical cores over all nodes."""
        return sum(len(cores) for cores in self.node_logical_cores)

    def get_node_physical_cores(self, node_id):
        """Return the physical core ids of ``node_id``; raises ValueError for an invalid node id."""
        self._check_node_id(node_id)
        return self.node_physical_cores[node_id]

    def get_node_logical_cores(self, node_id):
        """Return the logical core ids of ``node_id``; raises ValueError for an invalid node id."""
        self._check_node_id(node_id)
        return self.node_logical_cores[node_id]

    def get_all_physical_cores(self):
        """Return a new list with the physical core ids of all nodes, in node order."""
        return [core for cores in self.node_physical_cores for core in cores]

    def get_all_logical_cores(self):
        """Return a new list with the logical core ids of all nodes, in node order."""
        return [core for cores in self.node_logical_cores for core in cores]

    def numa_aware_check(self, core_list):
        """
        Check whether all cores in core_list are in the same NUMA node. Cross NUMA will reduce performance.
        We strongly advise to not use cores on different nodes.

        Args:
            core_list: logical core ids to look up in ``logical_core_node_map``.

        Returns:
            The distinct NUMA node ids of the given cores, in first-seen order.

        Raises:
            KeyError: if a core id is not a known logical core.
            RuntimeError: if ``core_list`` is empty, i.e. no node id could be determined.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order.
        numa_ids = list(dict.fromkeys(self.logical_core_node_map[core] for core in core_list))
        if len(numa_ids) > 1:
            logger.warning(
                f"Numa Aware: cores:{str(core_list)} on different NUMA nodes:{str(numa_ids)}. To avoid "
                "this behavior, please use --ncores_per_instance knob to make sure number of cores is "
                "divisible by --ncores_per_instance. Alternatively, please use --skip_cross_node_cores knob."
            )
        if len(numa_ids) == 0:
            raise RuntimeError("invalid number of NUMA nodes; please make sure numa_ids >= 1")
        return numa_ids
|
|
|
|
|
class _Launcher():
    r"""
    Class for launcher.

    Works out which cores each instance gets, prepares the environment
    (LD_PRELOAD, OMP/KMP variables, allocator settings) and starts the instances
    through ``start_processes``.
    """

    # Template with two ``str.format`` slots: {0} = display name, {1} = library stem.
    msg_lib_notfound = (
        "Unable to find the {0} library file lib{1}.so in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib "
        "or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or "
        f"{expanduser('~')}/.local/lib/ so the LD_PRELOAD environment variable will not be set."
    )

    def __init__(self):
        """Collect the CPU topology of the current machine."""
        self.cpuinfo = _CPUinfo()

    def add_lib_preload(self, lib_type):
        """
        Enable TCMalloc/JeMalloc/Intel OpenMP by putting ``lib{lib_type}.so`` into LD_PRELOAD.

        Args:
            lib_type: library stem, e.g. ``"tcmalloc"``, ``"jemalloc"`` or ``"iomp5"``.

        Returns:
            True if the library is already listed in LD_PRELOAD, or was found in one of
            the search directories and prepended to LD_PRELOAD; False otherwise.
        """
        lib_name = f"lib{lib_type}.so"
        current_preload = os.getenv("LD_PRELOAD", "")
        # Nothing to do when the user already preloads this library.
        if any(item.endswith(lib_name) for item in current_preload.split(":")):
            return True

        library_paths = []
        if "CONDA_PREFIX" in os.environ:
            library_paths.append(f"{os.environ['CONDA_PREFIX']}/lib")
        if "VIRTUAL_ENV" in os.environ:
            library_paths.append(f"{os.environ['VIRTUAL_ENV']}/lib")
        library_paths += [f"{expanduser('~')}/.local/lib", "/usr/local/lib",
                          "/usr/local/lib64", "/usr/lib", "/usr/lib64"]

        for lib_path in library_paths:
            matches = glob.glob(os.path.join(lib_path, lib_name))
            if matches:
                # First hit wins; it goes in front of whatever was preloaded before.
                ld_preloads = [matches[0], current_preload]
                os.environ["LD_PRELOAD"] = os.pathsep.join(p.strip(os.pathsep) for p in ld_preloads if p)
                return True
        return False

    def set_memory_allocator(self, enable_tcmalloc=True, enable_jemalloc=False, use_default_allocator=False):
        """
        Enable TCMalloc/JeMalloc with LD_PRELOAD and set configuration for JeMalloc.
        By default, PTMalloc will be used for PyTorch, but TCMalloc and JeMalloc can get better
        memory reuse and reduce page fault to improve performance.

        Selection order: explicit TCMalloc, explicit JeMalloc, explicit default allocator
        (nothing is preloaded), otherwise try TCMalloc and then JeMalloc.

        Raises:
            RuntimeError: if both ``enable_tcmalloc`` and ``enable_jemalloc`` are set.
        """
        if enable_tcmalloc and enable_jemalloc:
            raise RuntimeError("Unable to enable TCMalloc and JEMalloc at the same time.")

        if enable_tcmalloc:
            find_tc = self.add_lib_preload(lib_type="tcmalloc")
            if not find_tc:
                msg = f'{self.msg_lib_notfound} you can use "conda install -c conda-forge gperftools" to install {{0}}'
                logger.warning(msg.format("TCmalloc", "tcmalloc"))
            else:
                logger.info("Use TCMalloc memory allocator")

        elif enable_jemalloc:
            find_je = self.add_lib_preload(lib_type="jemalloc")
            if not find_je:
                msg = f'{self.msg_lib_notfound} you can use "conda install -c conda-forge jemalloc" to install {{0}}'
                logger.warning(msg.format("Jemalloc", "jemalloc"))
            else:
                logger.info("Use JeMalloc memory allocator")
                self.set_env("MALLOC_CONF", "oversize_threshold:1,background_thread:true,metadata_thp:auto")

        elif use_default_allocator:
            pass

        else:
            find_tc = self.add_lib_preload(lib_type="tcmalloc")
            if find_tc:
                logger.info("Use TCMalloc memory allocator")
                return
            find_je = self.add_lib_preload(lib_type="jemalloc")
            if find_je:
                logger.info("Use JeMalloc memory allocator")
                return
            logger.warning(
                "Neither TCMalloc nor JeMalloc is found in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib "
                "or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or "
                f"{expanduser('~')}/.local/lib/ so the LD_PRELOAD environment variable will not be set. "
                "This may drop the performance"
            )

    def log_env_var(self, env_var_name=""):
        """Log ``name=value`` at INFO level if the environment variable is set."""
        if env_var_name in os.environ:
            logger.info(f"{env_var_name}={os.environ[env_var_name]}")

    def set_env(self, env_name, env_value):
        """
        Set an environment variable unless the user already set it.

        A value that is already present in the environment is never overwritten; a
        warning is logged when it differs from ``env_value``. A ``None`` value is
        reported and skipped, because ``os.environ`` only accepts strings.
        """
        if not env_value:
            logger.warning(f"{env_name} is None")
            if env_value is None:
                # Assigning None to os.environ would raise TypeError.
                return
        if env_name not in os.environ:
            os.environ[env_name] = env_value
        elif os.environ[env_name] != env_value:
            logger.warning(
                f"Overriding value with the one set in environment variable: {env_name}. "
                f"Value applied: {os.environ[env_name]}. Value ignored: {env_value}"
            )
        self.log_env_var(env_name)

    def set_multi_thread_and_allocator(self, ncores_per_instance,
                                       disable_iomp=False,
                                       set_kmp_affinity=True,
                                       enable_tcmalloc=True,
                                       enable_jemalloc=False,
                                       use_default_allocator=False):
        """
        Set multi-thread configuration and enable Intel openMP and TCMalloc/JeMalloc.
        By default, GNU openMP and PTMalloc are used in PyTorch. but Intel openMP and TCMalloc/JeMalloc are better alternatives
        to get performance benefit.

        Sets OMP_NUM_THREADS to ``ncores_per_instance``. Unless ``disable_iomp`` is set,
        libiomp5.so is preloaded and, when that succeeds, KMP_BLOCKTIME (and KMP_AFFINITY
        if ``set_kmp_affinity``) are set as well.
        """
        self.set_memory_allocator(enable_tcmalloc, enable_jemalloc, use_default_allocator)
        self.set_env("OMP_NUM_THREADS", str(ncores_per_instance))
        if not disable_iomp:
            find_iomp = self.add_lib_preload(lib_type="iomp5")
            if not find_iomp:
                msg = f'{self.msg_lib_notfound} you can use "conda install mkl" to install {{0}}'
                logger.warning(msg.format("iomp", "iomp5"))
            else:
                logger.info("Using Intel OpenMP")
                if set_kmp_affinity:
                    self.set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
                self.set_env("KMP_BLOCKTIME", "1")
        self.log_env_var("LD_PRELOAD")

    @staticmethod
    def _format_core_ranges(core_list):
        """Collapse core ids into a ``start-end`` list, e.g. ``[0, 1, 2, 5]`` -> ``"0-2,5-5"``."""
        ranges = []
        for core in core_list:
            if ranges and core - ranges[-1][1] == 1:
                # Consecutive id: extend the current range.
                ranges[-1][1] = core
            else:
                ranges.append([core, core])
        return ",".join(f"{start}-{end}" for start, end in ranges)

    def launch(self, args):
        r"""
        Launcher for single instance and multi-instance.

        Resolves ``args.ninstances`` / ``args.ncores_per_instance`` and the list of cores
        to use (both fields of ``args`` are updated in place), configures the environment,
        builds one command line per instance and runs them with ``start_processes``,
        waiting until they finish.

        Args:
            args: parsed command-line namespace as produced by ``create_args``.

        Raises:
            RuntimeError: for inconsistent core/instance settings.
            SystemExit: if ``--skip_cross_node_cores`` is combined with an
                ``--ncores_per_instance`` larger than the cores of one node.
        """
        cores = []
        set_kmp_affinity = True
        if args.core_list:
            cores = [int(x) for x in args.core_list.split(",")]
            if args.ncores_per_instance == -1:
                raise RuntimeError('please specify the "--ncores_per_instance" if you have pass the --core_list params')
            elif args.ninstances > 1 and args.ncores_per_instance * args.ninstances < len(cores):
                logger.warning(
                    f"only first {args.ncores_per_instance * args.ninstances} cores will be used, "
                    f"but you specify {len(cores)} cores in core_list"
                )
            else:
                args.ninstances = len(cores) // args.ncores_per_instance

        else:
            if args.use_logical_core:
                if args.node_id != -1:
                    cores = self.cpuinfo.get_node_logical_cores(args.node_id)
                else:
                    cores = self.cpuinfo.get_all_logical_cores()
                    # KMP_AFFINITY is not set when all logical cores on all nodes are used.
                    set_kmp_affinity = False
            else:
                if args.node_id != -1:
                    cores = self.cpuinfo.get_node_physical_cores(args.node_id)
                else:
                    cores = self.cpuinfo.get_all_physical_cores()

            if not args.multi_instance and args.ninstances == -1 and args.ncores_per_instance == -1:
                # Nothing specified: one instance over every selected core.
                args.ninstances = 1
                args.ncores_per_instance = len(cores)
            elif args.multi_instance and args.ninstances == -1 and args.ncores_per_instance == -1:
                # --multi_instance without sizes falls back to throughput mode (handled below).
                args.throughput_mode = True
            elif args.ncores_per_instance == -1 and args.ninstances != -1:
                if args.ninstances > len(cores):
                    raise RuntimeError(
                        f"there are {len(cores)} total cores but you specify {args.ninstances} ninstances; "
                        "please make sure ninstances <= total_cores)"
                    )
                else:
                    args.ncores_per_instance = len(cores) // args.ninstances
            elif args.ncores_per_instance != -1 and args.ninstances == -1:
                if not args.skip_cross_node_cores:
                    args.ninstances = len(cores) // args.ncores_per_instance
                else:
                    # Node 0 is taken as representative for the per-node core count.
                    ncore_per_node = len(self.cpuinfo.node_physical_cores[0])
                    num_leftover_cores = ncore_per_node % args.ncores_per_instance
                    if args.ncores_per_instance > ncore_per_node:
                        # An instance cannot fit into one node, so nothing can be skipped.
                        logger.warning(
                            "there are {} core(s) per socket, but you specify {} ncores_per_instance and "
                            "skip_cross_node_cores. Please make sure --ncores_per_instance <= core(s) per "
                            "socket".format(ncore_per_node, args.ncores_per_instance)
                        )
                        sys.exit(-1)
                    elif num_leftover_cores == 0:
                        # Instances tile each node exactly; there are no cross-node cores.
                        logger.info('--skip_cross_node_cores is set, but there are no cross-node cores.')
                        args.ninstances = len(cores) // args.ncores_per_instance
                    else:
                        # Drop the trailing cores of every node that would otherwise end
                        # up in an instance spanning two nodes.
                        i = 1
                        leftover_cores = set()
                        while ncore_per_node * i <= len(cores):
                            leftover_cores.update(cores[ncore_per_node * i - num_leftover_cores : ncore_per_node * i])
                            i += 1
                        cores = list(set(cores) - leftover_cores)
                        assert len(cores) % args.ncores_per_instance == 0
                        args.ninstances = len(cores) // args.ncores_per_instance
            else:
                if args.ninstances * args.ncores_per_instance > len(cores):
                    raise RuntimeError("Please make sure ninstances * ncores_per_instance <= total_cores")

            if args.latency_mode:
                logger.warning(
                    "--latency_mode is exclusive to --ninstances, --ncores_per_instance, --node_id and "
                    "--use_logical_core. They won't take effect even they are set explicitly."
                )
                args.ncores_per_instance = 4
                cores = self.cpuinfo.get_all_physical_cores()
                args.ninstances = len(cores) // args.ncores_per_instance

            if args.throughput_mode:
                logger.warning(
                    "--throughput_mode is exclusive to --ninstances, --ncores_per_instance, --node_id and "
                    "--use_logical_core. They won't take effect even they are set explicitly."
                )
                args.ninstances = self.cpuinfo.node_nums
                cores = self.cpuinfo.get_all_physical_cores()
                args.ncores_per_instance = len(cores) // args.ninstances

        if args.ninstances > 1 and args.rank != -1:
            logger.info(f"assigning {args.ncores_per_instance} cores for instance {args.rank}")

        self.set_multi_thread_and_allocator(args.ncores_per_instance,
                                            args.disable_iomp,
                                            set_kmp_affinity,
                                            args.enable_tcmalloc,
                                            args.enable_jemalloc,
                                            args.use_default_allocator)
        entrypoint = ""
        launch_args = {}
        launch_envs: Dict[int, Dict] = {}
        launch_tee = {}
        if not args.disable_numactl:
            # Sort once up front; every instance takes a contiguous slice of this list.
            cores = sorted(cores)
        for i in range(args.ninstances):
            cmd = []
            if not args.disable_numactl:
                cmd = ["numactl"]
                # With --rank only that instance's slice is used, otherwise the i-th one.
                slot = i if args.rank == -1 else args.rank
                core_list = cores[slot * args.ncores_per_instance : (slot + 1) * args.ncores_per_instance]
                numa_ids = ",".join(str(numa_id) for numa_id in self.cpuinfo.numa_aware_check(core_list))
                cmd.extend(["-C", self._format_core_ranges(core_list), "-m", numa_ids])
            if not args.no_python:
                cmd.append(sys.executable)
                cmd.append("-u")
            # "-m" is passed through as-is; it is only meaningful together with the interpreter.
            if args.module:
                cmd.append("-m")
            cmd.append(args.program)
            cmd.extend(args.program_args)
            logger.info(" ".join(cmd))
            # start_processes takes the executable separately from its arguments.
            if entrypoint == "":
                entrypoint = cmd[0]
            del cmd[0]
            launch_args[i] = tuple(cmd)
            launch_envs[i] = {}
            launch_tee[i] = Std.ALL

            if args.rank != -1:
                # Only the single requested instance is launched.
                break

        ctx = start_processes(name=args.log_file_prefix,
                              entrypoint=entrypoint,
                              args=launch_args,
                              envs=launch_envs,
                              log_dir=args.log_path,
                              tee=launch_tee)
        ctx.wait()
|
|
|
|
|
|
|
|
def _add_memory_allocator_params(parser):
    """Register the memory-allocator switches on ``parser`` in their own argument group."""
    allocator_group = parser.add_argument_group("Memory Allocator Parameters")
    # Every switch is a plain boolean flag that is off unless given on the command line.
    switches = (
        ("--enable_tcmalloc", "Enable tcmalloc allocator"),
        ("--enable_jemalloc", "Enable jemalloc allocator"),
        ("--use_default_allocator", "Use default memory allocator"),
    )
    for flag, description in switches:
        allocator_group.add_argument(flag, action="store_true", default=False, help=description)
|
|
|
|
|
def _add_multi_instance_params(parser):
    """
    Register the multi-instance options on ``parser`` in their own argument group.

    Integer options default to -1, which the launcher treats as "not specified".
    """
    group = parser.add_argument_group("Multi-instance Parameters")

    group.add_argument("--ncores_per_instance", metavar="\b", default=-1, type=int,
                       help="Cores per instance")
    group.add_argument("--ninstances", metavar="\b", default=-1, type=int,
                       help="For multi-instance, the number of instances to launch.")
    group.add_argument("--skip_cross_node_cores", action='store_true', default=False,
                       help="If specified --ncores_per_instance, skips cross-node cores.")
    # The default is a real int so it has the same type as a parsed value.
    group.add_argument("--rank", metavar="\b", default=-1, type=int,
                       help="Specify instance index to assign ncores_per_instance for rank; "
                       "otherwise ncores_per_instance will be assigned sequentially to ninstances. Please refer to "
                       "https://github.com/intel/intel-extension-for-pytorch/blob/master/docs/tutorials/"
                       "performance_tuning/launch_script.md")
    group.add_argument("--latency_mode", action="store_true", default=False,
                       help="By default 4 core per instance and use all physical cores")
    group.add_argument("--throughput_mode", action="store_true", default=False,
                       help="By default one instance per node and use all physical cores")
    group.add_argument("--node_id", metavar="\b", default=-1, type=int,
                       help="node id for multi-instance, by default all nodes will be used")
    group.add_argument("--use_logical_core", action="store_true", default=False,
                       help="Use logical cores instead of only physical cores")
    group.add_argument("--disable_numactl", action="store_true", default=False,
                       help="Disable numactl")
    group.add_argument("--core_list", metavar="\b", default=None, type=str,
                       help="Specify the core list as \"core_id, core_id, ....\", otherwise, all the cores will be used.")
    group.add_argument("--log_path", metavar="\b", default="logs", type=str,
                       help="The log file directory. Default path is \"logs\". "
                       "Set it to \"\" to disable logging to files.")
    group.add_argument("--log_file_prefix", metavar="\b", default="run", type=str,
                       help="log file prefix")
|
|
|
|
|
def _add_kmp_iomp_params(parser):
    """Register the Intel OpenMP opt-out switch on ``parser`` in its own argument group."""
    iomp_switch = {
        "action": "store_true",
        "default": False,
        "help": "By default, we use Intel OpenMP and libiomp5.so will be add to LD_PRELOAD",
    }
    parser.add_argument_group("IOMP Parameters").add_argument("--disable_iomp", **iomp_switch)
|
|
|
|
|
def create_args(parser=None):
    """
    Helper function that registers the launcher's command-line options.

    Args:
        parser: the ``ArgumentParser`` to populate. When omitted, a new parser is created.

    Returns:
        The populated ``ArgumentParser`` (the same object that was passed in, if any).
    """
    if parser is None:
        # The default argument used to crash on the first add_argument call.
        parser = ArgumentParser(formatter_class=RawTextHelpFormatter)

    parser.add_argument("--multi_instance", action="store_true", default=False,
                        help="Enable multi-instance, by default one instance per node")

    parser.add_argument("-m", "--module", default=False, action="store_true",
                        help="Changes each process to interpret the launch script "
                             "as a python module, executing with the same behavior as "
                             "\"python -m\".")

    parser.add_argument("--no_python", default=False, action="store_true",
                        help="Do not prepend the --program script with \"python\" - just exec "
                             "it directly. Useful when the script is not a Python script.")

    _add_memory_allocator_params(parser)
    _add_kmp_iomp_params(parser)
    _add_multi_instance_params(parser)

    # Positional: the program to launch.
    parser.add_argument("program", type=str,
                        help="The full path to the program/script to be launched, "
                             "followed by all the arguments for the script")

    # Everything after the program is handed to it untouched.
    parser.add_argument("program_args", nargs=REMAINDER)
    return parser
|
|
|
|
|
def main(args):
    """
    Validate the parsed options, clean up LD_PRELOAD and run the launcher.

    Args:
        args: parsed command-line namespace as produced by ``create_args``.

    Raises:
        RuntimeError: on Windows/macOS, when both ``--latency_mode`` and
            ``--throughput_mode`` are given, or when a program that is not a ``.py``
            script is launched without ``--no_python`` or ``-m``.
    """
    env_before = set(os.environ.keys())
    if platform.system() in ["Windows", "Darwin"]:
        raise RuntimeError(f"{platform.system()} is not supported!!!")

    if args.log_path:
        os.makedirs(args.log_path, exist_ok=True)

    if args.latency_mode and args.throughput_mode:
        raise RuntimeError("Only one of --latency_mode and --throughput_mode can be set")

    # With -m the program is a module name, which legitimately has no ".py" suffix.
    if not args.no_python and not args.module and not args.program.endswith(".py"):
        raise RuntimeError("For non Python script, you should use \"--no_python\" parameter.")

    # Drop LD_PRELOAD entries that do not exist on disk.
    if "LD_PRELOAD" in os.environ:
        lst_valid = []
        for item in os.environ["LD_PRELOAD"].split(":"):
            if glob.glob(item):
                lst_valid.append(item)
            else:
                logger.warning(f"{item} doesn't exist. Removing it from LD_PRELOAD.")
        # Joining an empty list yields "", i.e. an empty LD_PRELOAD.
        os.environ["LD_PRELOAD"] = ":".join(lst_valid)

    launcher = _Launcher()
    launcher.launch(args)
    # Report the variables that were added to the environment during this run.
    for x in sorted(set(os.environ.keys()) - env_before):
        logger.debug("%s=%s", x, os.environ[x])
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the parser (raw-text help keeps the usage banner's
    # line breaks), register all options, parse the command line and run.
    parser = ArgumentParser(
        description="".join((
            "This is a script for launching PyTorch inference on Intel(R) Xeon(R) Scalable ",
            "Processors with optimal configurations. Single instance inference, ",
            "multi-instance inference are enable. To get the peak performance on Intel(R) ",
            "Xeon(R) Scalable Processors, the script optimizes the configuration ",
            "of thread and memory management. For thread management, the script configures thread ",
            "affinity and the preload of Intel OMP library. For memory management, it configures ",
            "NUMA binding and preload optimized memory allocation library (e.g. tcmalloc, jemalloc) ",
            "\n################################# Basic usage ############################# \n",
            "\n 1. single instance\n",
            "\n   >>> python -m torch.backends.xeon.run_cpu python_script args \n",
            "\n2. multi-instance \n",
            "\n   >>> python -m torch.backends.xeon.run_cpu --ninstances xxx ",
            "--ncores_per_instance xx python_script args\n",
            "\n############################################################################# \n",
        )),
        formatter_class=RawTextHelpFormatter,
    )
    create_args(parser)
    args = parser.parse_args()
    main(args)
|
|
|