| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """Diagnose script for checking OS/hardware/python/pip/verl/network. |
| The output of this script can be a very good hint to issue/problem. |
| """ |
|
|
| import os |
| import platform |
| import socket |
| import subprocess |
| import sys |
| import time |
|
|
| import psutil |
|
|
| try: |
| from urllib.parse import urlparse |
| from urllib.request import urlopen |
| except ImportError: |
| from urllib2 import urlopen |
| from urlparse import urlparse |
| import argparse |
| import importlib.metadata |
|
|
| import torch |
|
|
# Sites probed by every network check, keyed by the label shown in the report.
URLS = {"PYPI": "https://pypi.python.org/pypi/pip"}

# Mirror sites for mainland China, keyed by the label shown in the report.
_CN_MIRRORS = {
    "PYPI(douban)": "https://pypi.douban.com/",
    "Conda(tsinghua)": "https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/",
}

# Extra sites to probe per lower-case region code; check_network() looks
# regions up here and adds the matching entries to the default sites.
REGIONAL_URLS = {"cn": _CN_MIRRORS}
|
|
|
|
def test_connection(name, url, timeout=10):
    """Probe ``url`` and print how long DNS resolution and the page load took.

    Args:
        name: Label for the site; used only in the printed messages.
        url: URL to resolve and fetch.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        None. Timings and failures are reported with ``print``; any exception
        raised by the DNS lookup or the request is caught and printed.
    """
    urlinfo = urlparse(url)
    start = time.time()
    try:
        # Resolve the bare host name: ``netloc`` may also carry ``user@`` and
        # ``:port`` parts, which ``gethostbyname`` cannot resolve.
        socket.gethostbyname(urlinfo.hostname)
    except Exception as e:
        print("Error resolving DNS for {}: {}, {}".format(name, url, e))
        return
    dns_elapsed = time.time() - start
    start = time.time()
    try:
        # Only the timing matters, so release the connection immediately.
        with urlopen(url, timeout=timeout):
            pass
    except Exception as e:
        print("Error open {}: {}, {}, DNS finished in {} sec.".format(name, url, e, dns_elapsed))
        return
    load_elapsed = time.time() - start
    print("Timing for {}: {}, DNS: {:.4f} sec, LOAD: {:.4f} sec.".format(name, url, dns_elapsed, load_elapsed))


# Despite the ``test_`` prefix this is a diagnostic helper, not a unit test;
# opt out so pytest does not try to collect it when this module is imported.
test_connection.__test__ = False
|
|
|
|
def check_python():
    """Print the running interpreter's version, compiler, build, and architecture."""
    print("----------Python Info----------")
    print(f"Version : {platform.python_version()}")
    print(f"Compiler : {platform.python_compiler()}")
    print(f"Build : {platform.python_build()}")
    print(f"Arch : {platform.architecture()}")
|
|
|
|
def check_pip():
    """Print the version and install directory of the ``pip`` this interpreter can import.

    Prints a notice instead when ``pip`` cannot be imported.
    """
    print("------------Pip Info-----------")
    try:
        import pip
    except ImportError:
        print("No corresponding pip install for current python.")
        return

    print("Version :", pip.__version__)
    print("Directory :", os.path.dirname(pip.__file__))
|
|
|
|
| def _get_current_git_commit(): |
| try: |
| result = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True) |
| return result.stdout.strip() |
| except subprocess.CalledProcessError as e: |
| print(f"Error running git command: {e.stderr.strip()}") |
| return None |
| except FileNotFoundError: |
| print("Did not find command: git") |
| return None |
|
|
|
|
| def check_verl(): |
| print("----------verl Info-----------") |
| try: |
| sys.path.insert(0, os.getcwd()) |
| import verl |
|
|
| print("Version :", verl.__version__) |
| verl_dir = os.path.dirname(verl.__file__) |
| print("Directory :", verl_dir) |
| try: |
| commit_hash = _get_current_git_commit() |
| print("Commit Hash :", commit_hash) |
| except AttributeError: |
| print("Commit hash not found. ") |
| except ImportError as e: |
| print(f"No verl installed: {e}") |
| except Exception as e: |
| import traceback |
|
|
| if not isinstance(e, IOError): |
| print("An error occurred trying to import verl.") |
| print("This is very likely due to missing or incompatible library files.") |
| print(traceback.format_exc()) |
|
|
|
|
def check_os():
    """Print the platform string, system name, node name, release, and version."""
    print("----------Platform Info----------")
    fields = (
        ("Platform :", platform.platform),
        ("system :", platform.system),
        ("node :", platform.node),
        ("release :", platform.release),
        ("version :", platform.version),
    )
    for label, query in fields:
        print(label, query())
|
|
|
|
def check_hardware():
    """Print the machine type, processor name, and platform-specific CPU details.

    CPU details come from an external tool chosen by ``sys.platform``:
    ``sysctl -a`` on macOS (only lines mentioning ``brand_string`` or
    ``features`` are printed), ``lscpu`` on Linux, and ``wmic cpu get name``
    on Windows. If the tool cannot be launched, a one-line notice is printed
    instead of raising.
    """
    print("----------Hardware Info----------")
    print("machine :", platform.machine())
    print("processor :", platform.processor())
    # The child tools write straight to the inherited stdout; flush our own
    # buffered lines first so they stay above the tool output when stdout is
    # redirected to a file or pipe.
    sys.stdout.flush()
    try:
        if sys.platform.startswith("darwin"):
            # Decode to text so the matching lines print as plain strings
            # rather than as bytes reprs (b'...').
            result = subprocess.run(("sysctl", "-a"), stdout=subprocess.PIPE, text=True, errors="replace")
            for line in result.stdout.splitlines():
                if "brand_string" in line or "features" in line:
                    print(line.strip())
        elif sys.platform.startswith("linux"):
            subprocess.call(["lscpu"])
        elif sys.platform.startswith("win32"):
            subprocess.call(["wmic", "cpu", "get", "name"])
    except OSError as e:
        # Raised when the tool is not installed or cannot be executed.
        print(f"Could not query CPU details: {e}")
|
|
|
|
def check_network(args):
    """Run a connection test against the default sites plus any regional mirrors.

    Args:
        args: Parsed command-line namespace. ``args.timeout`` is the timeout
            in seconds (0 or less disables it) and ``args.region`` is a
            comma-separated list of region codes looked up in
            ``REGIONAL_URLS``.

    Returns:
        None. Results are printed by ``test_connection``; unknown region
        codes produce a warning.
    """
    import warnings

    print("----------Network Test----------")
    # A non-positive value means "no timeout". Passing 0 through would put
    # the socket in non-blocking mode and make every request fail, so map it
    # to None, which lets the request wait indefinitely.
    timeout = args.timeout if args.timeout > 0 else None
    if timeout is not None:
        print("Setting timeout: {}".format(timeout))
        # Use the requested value for the process-wide default as well, so
        # the DNS lookup honours --timeout rather than a fixed 10 seconds.
        socket.setdefaulttimeout(timeout)

    # Work on a copy so repeated calls do not accumulate regional entries in
    # the module-level defaults.
    urls = dict(URLS)
    for region in args.region.strip().split(","):
        r = region.strip().lower()
        if not r:
            continue
        if r in REGIONAL_URLS:
            urls.update(REGIONAL_URLS[r])
        else:
            warnings.warn("Region {} do not need specific test, please refer to global sites.".format(r), stacklevel=2)
    for name, url in urls.items():
        test_connection(name, url, timeout)
|
|
|
|
def check_environment():
    """Print the environment variables relevant to diagnosis.

    A variable is printed when its name starts with ``VERL_``, ``OMP_`` or
    ``KMP_``, or is exactly ``CC`` or ``CXX``.
    """
    watched_prefixes = ("VERL_", "OMP_", "KMP_")
    watched_names = ("CC", "CXX")

    print("----------Environment----------")
    for key, value in os.environ.items():
        if key.startswith(watched_prefixes) or key in watched_names:
            print(f'{key}="{value}"')
|
|
|
|
def check_pip_package_versions():
    """Print the installed version of vllm, sglang, ray and torch, or "not found"."""
    for package in ("vllm", "sglang", "ray", "torch"):
        try:
            installed = importlib.metadata.version(package)
        except importlib.metadata.PackageNotFoundError:
            print(f"{package}\t : not found.")
        else:
            print(f"{package}\t : {installed}")
|
|
|
|
def check_cuda_versions():
    """Print the CUDA runtime version known to torch and the ``nvcc`` release line.

    Prints a notice when torch reports CUDA as unavailable, when ``nvcc``
    is not installed, or when any other error occurs during the check.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available.")
        return

    try:
        print(f"CUDA Runtime : {torch.version.cuda}")

        nvcc_text = subprocess.check_output(["nvcc", "--version"]).decode("utf-8")
        # The first line mentioning "release" carries the compiler version.
        release_line = None
        for candidate in nvcc_text.splitlines():
            if "release" in candidate:
                release_line = candidate
                break

        if release_line:
            print(f"CUDA Compiler : {release_line.strip()}")
        else:
            print("Could not determine CUDA compiler version.")
    except FileNotFoundError as e:
        print(f"CUDA compiler : Not found: {e}")
    except Exception as e:
        print(f"An error occurred while checking CUDA versions: {e}")
|
|
|
|
def _get_cpu_memory():
    """Return the total system memory reported by psutil, in GB (bytes / 1024**3)."""
    bytes_per_gb = 1024**3
    return psutil.virtual_memory().total / bytes_per_gb
|
|
|
|
| def _get_gpu_info(): |
| """ |
| Get GPU type, GPU memory, and GPU count using nvidia-smi command. |
| """ |
| try: |
| result = subprocess.run( |
| ["nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv,noheader,nounits"], |
| capture_output=True, |
| text=True, |
| check=True, |
| ) |
| gpu_lines = result.stdout.strip().split("\n") |
| gpu_count = len(gpu_lines) |
| gpu_info = [] |
| for line in gpu_lines: |
| gpu_name, gpu_memory = line.split(", ") |
| gpu_info.append( |
| { |
| "type": gpu_name, |
| "memory": float(gpu_memory) / 1024, |
| } |
| ) |
| return gpu_count, gpu_info |
| except (subprocess.CalledProcessError, FileNotFoundError): |
| print("Failed to execute nvidia-smi command.") |
| return 0, [] |
|
|
|
|
def _get_system_info():
    """Collect CPU memory and GPU details into one dict.

    Returns:
        A dict with keys ``"cpu_memory"`` (from ``_get_cpu_memory``) and
        ``"gpu_count"`` / ``"gpu_info"`` (from ``_get_gpu_info``).
    """
    summary = {"cpu_memory": _get_cpu_memory()}
    summary["gpu_count"], summary["gpu_info"] = _get_gpu_info()
    return summary
|
|
|
|
def check_system_info():
    """Print total CPU memory, the GPU count, and each GPU's type and memory.

    GPUs are numbered from 1 in the output.
    """
    print("----------System Info----------")
    info = _get_system_info()
    cpu_memory = info["cpu_memory"]
    gpu_count = info["gpu_count"]
    print(f"CPU Memory\t: {cpu_memory:.2f} GB")
    print(f"GPU Count\t: {gpu_count}")
    for number, gpu in enumerate(info["gpu_info"], start=1):
        gpu_type = gpu["type"]
        gpu_memory = gpu["memory"]
        print(f"GPU {number}\tType : {gpu_type}")
        print(f"GPU {number}\tMemory : {gpu_memory:.2f} GB")
|
|
|
|
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    Each diagnostic section has an integer switch: python, pip, verl, system,
    os and environment default to 1; network and hardware default to 0.
    ``--region`` and ``--timeout`` configure the network test.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    parser = argparse.ArgumentParser(
        description="Diagnose script for checking the current system.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # (section name, default switch value), in the order shown by --help.
    switches = [(section, 1) for section in ("python", "pip", "verl", "system", "os", "environment")]
    switches += [("network", 0), ("hardware", 0)]
    for section, default in switches:
        parser.add_argument(f"--{section}", default=default, type=int, help=f"Diagnose {section}.")

    parser.add_argument(
        "--region",
        default="",
        type=str,
        help=(
            "Additional sites in which region(s) to test. "
            "Specify 'cn' for example to test mirror sites in China."
        ),
    )
    parser.add_argument("--timeout", default=10, type=int, help="Connection test timeout threshold, 0 to disable.")
    return parser.parse_args()
|
|
|
|
if __name__ == "__main__":
    args = parse_args()

    # Each entry pairs a command-line switch with the checks it enables;
    # sections run in the order listed here.
    sections = (
        (args.python, (check_python,)),
        (args.pip, (check_pip, check_pip_package_versions)),
        (args.verl, (check_verl,)),
        (args.os, (check_os,)),
        (args.hardware, (check_hardware,)),
        (args.network, (lambda: check_network(args),)),
        (args.environment, (check_environment, check_cuda_versions)),
        (args.system, (check_system_info,)),
    )
    for enabled, checks in sections:
        if not enabled:
            continue
        for run_check in checks:
            run_check()
|
|