| |
| |
| |
| |
|
|
| from enum import Enum |
| import os |
| from pathlib import Path |
| from typing import Any, Dict, Optional |
|
|
|
|
class ClusterType(Enum):
    """Cluster environments that the helpers in this module distinguish between.

    Each member's value is the lowercase string identifier of the cluster.
    """

    AWS = "aws"
    FAIR = "fair"
    RSC = "rsc"
|
|
|
|
def _guess_cluster_type() -> ClusterType:
    """Infer the cluster type from the host information reported by ``os.uname()``.

    Detection rules, in order:

    * not a Linux system -> ``ClusterType.FAIR``
    * kernel release ends with ``"-aws"`` -> ``ClusterType.AWS``
    * node name starts with ``"rsc"`` -> ``ClusterType.RSC``
    * anything else -> ``ClusterType.FAIR``

    Returns:
        The guessed ``ClusterType``; ``ClusterType.FAIR`` is the fallback.
    """
    host = os.uname()

    # Only Linux hosts are inspected further; everything else gets the fallback.
    if host.sysname != "Linux":
        return ClusterType.FAIR

    # The kernel release is checked before the host name, so a host matching
    # both rules is reported as AWS.
    if host.release.endswith("-aws"):
        return ClusterType.AWS
    if host.nodename.startswith("rsc"):
        return ClusterType.RSC

    return ClusterType.FAIR
|
|
|
|
def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]:
    """Return ``cluster_type`` if given, otherwise guess it from the current host.

    Args:
        cluster_type: An explicit cluster type, or ``None`` to auto-detect.

    Returns:
        The supplied ``cluster_type`` unchanged when it is not ``None``;
        otherwise the result of ``_guess_cluster_type()``.
    """
    return _guess_cluster_type() if cluster_type is None else cluster_type
|
|
|
|
def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
    """Return the absolute checkpoint root directory for a cluster.

    Args:
        cluster_type: The cluster to look up, or ``None`` to auto-detect it
            via ``get_cluster_type``.

    Returns:
        ``/checkpoints`` for AWS, ``/checkpoint`` for FAIR and
        ``/checkpoint/dino`` for RSC, or ``None`` when no cluster type could
        be resolved.

    Raises:
        KeyError: If the resolved cluster type has no entry in the mapping.
    """
    resolved = get_cluster_type(cluster_type)
    if resolved is None:
        return None

    # Directory (relative to the filesystem root) holding checkpoints per cluster.
    dirname_by_cluster = {
        ClusterType.AWS: "checkpoints",
        ClusterType.FAIR: "checkpoint",
        ClusterType.RSC: "checkpoint/dino",
    }
    root = Path("/")
    return root / dirname_by_cluster[resolved]
|
|
|
|
def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
    """Return the current user's directory under the cluster checkpoint root.

    The user name is read from the ``USER`` environment variable and appended
    to the path returned by ``get_checkpoint_path``.

    Args:
        cluster_type: The cluster to look up, or ``None`` to auto-detect it.

    Returns:
        ``<checkpoint root>/<USER>``, or ``None`` when ``get_checkpoint_path``
        returns ``None``.

    Raises:
        AssertionError: If the ``USER`` environment variable is not set.
    """
    checkpoint_path = get_checkpoint_path(cluster_type)
    if checkpoint_path is None:
        return None

    username = os.environ.get("USER")
    if username is None:
        # Raised explicitly rather than via `assert` so the check still runs
        # under `python -O`; otherwise a missing USER would surface as an
        # opaque TypeError from the path join below. The exception type is
        # kept as AssertionError for backward compatibility.
        raise AssertionError(
            "The USER environment variable is not set; cannot build the user checkpoint path"
        )
    return checkpoint_path / username
|
|
|
|
def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]:
    """Return the name of the Slurm partition to use on a cluster.

    Args:
        cluster_type: The cluster to look up, or ``None`` to auto-detect it
            via ``get_cluster_type``.

    Returns:
        ``"learnlab"`` for AWS and FAIR, ``"learn"`` for RSC, or ``None`` when
        no cluster type could be resolved.

    Raises:
        KeyError: If the resolved cluster type has no entry in the mapping.
    """
    resolved = get_cluster_type(cluster_type)
    if resolved is not None:
        partition_by_cluster = {
            ClusterType.AWS: "learnlab",
            ClusterType.FAIR: "learnlab",
            ClusterType.RSC: "learn",
        }
        return partition_by_cluster[resolved]

    return None
|
|
|
|
def get_slurm_executor_parameters(
    nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs
) -> Dict[str, Any]:
    """Build a dictionary of Slurm executor parameters for a job.

    One task is requested per GPU. ``cpus_per_task`` is 12 on AWS and RSC and
    10 otherwise. On AWS the ``"mem_gb"`` entry is dropped from the defaults.

    Args:
        nodes: Number of nodes to request.
        num_gpus_per_node: Number of GPUs (and tasks) per node.
        cluster_type: The target cluster, or ``None`` to auto-detect it.
        **kwargs: Extra parameters; applied last, so they add to or override
            the defaults and the cluster-specific adjustments.

    Returns:
        The parameter dictionary.
    """
    partition = get_slurm_partition(cluster_type)
    resolved = get_cluster_type(cluster_type)

    # Cluster-specific CPU count per task.
    if resolved == ClusterType.AWS or resolved == ClusterType.RSC:
        cpus_per_task = 12
    else:
        cpus_per_task = 10

    executor_params: Dict[str, Any] = {
        # NOTE(review): 0 presumably asks Slurm for all of a node's memory — confirm.
        "mem_gb": 0,
        "gpus_per_node": num_gpus_per_node,
        "tasks_per_node": num_gpus_per_node,  # one task per GPU
        "cpus_per_task": cpus_per_task,
        "nodes": nodes,
        "slurm_partition": partition,
    }

    # No memory request is emitted on AWS (unless the caller supplies one below).
    if resolved == ClusterType.AWS:
        executor_params.pop("mem_gb")

    # Caller-supplied values take precedence over everything computed above.
    executor_params.update(kwargs)
    return executor_params
|
|