Spaces:
Sleeping
Sleeping
| import os | |
| import subprocess | |
| from typing import Optional, Dict, Tuple | |
# Lookup table used by ``get_manager_node_ip``: the key is the first three
# dot-separated fields of a node ip, the value is the ip returned as the
# manager node for that prefix. Several prefixes may share one manager.
MANAGER_NODE_TABLE: Dict[str, str] = {
    '10.198.8': '10.198.8.31',
    '10.198.6': '10.198.6.31',
    # 10.5.38 and 10.5.39 resolve to the same manager node
    '10.5.38': '10.5.38.31',
    '10.5.39': '10.5.38.31',
    # 10.5.36 and 10.5.37 resolve to the same manager node
    '10.5.36': '10.5.36.31',
    '10.5.37': '10.5.36.31',
    '10.10.30': '10.10.30.91',
}
def get_ip() -> str:
    """
    Overview:
        Get the ip of the current node, derived from the ``SLURMD_NODENAME`` environment variable.
    Returns:
        - ip (:obj:`str`): The last four ``-``-separated fields of the node name joined by ``.``, \
e.g. ``'SH-IDC1-10-5-36-64'`` gives ``'10.5.36.64'``.
    Raises:
        - AssertionError: If ``SLURMD_NODENAME`` is unset or empty.
    """
    # expecting nodename to be like: 'SH-IDC1-10-5-36-64'
    nodename = os.environ.get('SLURMD_NODENAME')
    if not nodename:
        # Raised explicitly instead of via ``assert`` so the check still runs
        # under ``python -O``; the exception type stays the same for callers.
        raise AssertionError('not found SLURMD_NODENAME env variable')
    return '.'.join(nodename.split('-')[-4:])
def get_manager_node_ip(node_ip: Optional[str] = None) -> str:
    """
    Overview:
        Resolve the manager node ip for the slurm cluster that a node belongs to, using \
``MANAGER_NODE_TABLE`` keyed by the first three fields of the node ip.
    Arguments:
        - node_ip (:obj:`Optional[str]`): The ip of the current node. When omitted (or empty), \
it is obtained from ``get_ip()``.
    Returns:
        - manager_ip (:obj:`str`): The table entry for the node's ip prefix, or ``'127.0.0.1'`` \
when ``SLURM_JOB_ID`` is not present in the environment.
    Raises:
        - KeyError: If the ip prefix has no entry in ``MANAGER_NODE_TABLE``.
    """
    running_on_slurm = 'SLURM_JOB_ID' in os.environ
    if not running_on_slurm:
        # Imported lazily: only needed to report this situation.
        from ditk import logging
        logging.error(
            'We are not running on slurm!, \'auto\' for manager_ip or '
            'coordinator_ip is only intended for running on multiple slurm clusters'
        )
        return '127.0.0.1'

    current_ip = node_ip if node_ip else get_ip()
    # Keep only the first three dot-separated fields, e.g. '10.5.36.64' -> '10.5.36'.
    prefix = '.'.join(current_ip.split('.')[:3])
    if prefix not in MANAGER_NODE_TABLE:
        raise KeyError("Cluster not found, please add it to the MANAGER_NODE_TABLE in {}".format(__file__))
    return MANAGER_NODE_TABLE[prefix]
| # get all info of cluster | |
def get_cls_info() -> Dict[str, list]:
    """
    Overview:
        Collect the available nodes of every partition by parsing the output of ``sinfo -Nh``.
    Returns:
        - info (:obj:`Dict[str, list]`): Maps each partition name to the list of its nodes whose \
state is ``idle`` or ``mix``. A partition with no such node maps to an empty list.
    """
    available: Dict[str, list] = {}
    for raw_line in subprocess.getoutput('sinfo -Nh').split('\n'):
        fields = raw_line.strip().split()
        # Only lines with exactly four whitespace-separated fields are considered.
        if len(fields) != 4:
            continue
        node, _, partition, state = fields
        nodes = available.setdefault(partition, [])
        # A node is expected to be listed at most once per partition.
        assert node not in nodes
        if state in ('idle', 'mix'):
            nodes.append(node)
    return available
def node_to_partition(target_node: str) -> str:
    """
    Overview:
        Get the partition of the target node by scanning the output of ``sinfo -Nh``.
    Arguments:
        - target_node (:obj:`str`): The target node
    Returns:
        - partition (:obj:`str`): The partition field of the first line whose node field \
equals ``target_node``.
    Raises:
        - RuntimeError: If no line of the ``sinfo`` output names ``target_node``.
    """
    for raw_line in subprocess.getoutput('sinfo -Nh').split('\n'):
        fields = raw_line.strip().split()
        # Only lines with exactly four whitespace-separated fields are considered.
        if len(fields) != 4:
            continue
        node, _, partition, _ = fields
        if node == target_node:
            return partition
    raise RuntimeError("not found target_node: {}".format(target_node))
def node_to_host(node: str) -> str:
    """
    Overview:
        Convert a node name into a host string by joining its last four ``-``-separated \
fields with ``.``, e.g. ``'SH-IDC1-10-5-36-64'`` becomes ``'10.5.36.64'``.
    Arguments:
        - node (:obj:`str`): The node
    Returns:
        - host (:obj:`str`): The dotted host string. Names with fewer than four fields are \
joined as they are.
    """
    fields = node.split('-')
    tail = fields[-4:]
    return '.'.join(tail)
def find_free_port_slurm(node: str) -> int:
    """
    Overview:
        Find a free port on the node by running ``ding.utils.find_free_port`` on it through ``srun``.
    Arguments:
        - node (:obj:`str`): The node
    Returns:
        - port (:obj:`int`): The number following the last ``port`` marker in the ``srun`` output.
    Raises:
        - RuntimeError: Propagated from ``node_to_partition`` when the node is not found.
        - ValueError: If the ``srun`` output contains no ``port<digits>`` marker.
    """
    # Function-scope import keeps this block self-contained.
    import re

    partition = node_to_partition(node)
    # The 'spring_scheduler' partition gets an extra --comment flag on the srun command line.
    comment = '--comment=spring-submit' if partition == 'spring_scheduler' else ''
    output = subprocess.getoutput(
        "srun -p {} -w {} {} python -c \"from ding.utils import find_free_port; print('port' + str(find_free_port(0)))\""  # noqa
        .format(partition, node, comment)
    )
    # ``getoutput`` returns stdout and stderr together, so other text may surround
    # (or follow) the marker; pick the digits of the last 'port<digits>' occurrence
    # instead of converting everything after the last 'port'.
    found = re.findall(r'port(\d+)', output)
    if not found:
        raise ValueError('can not find a free port in srun output: {!r}'.format(output))
    return int(found[-1])