| | |
| | |
| | |
| | |
| |
|
| | import argparse |
| | import logging |
| | import os |
| | from pathlib import Path |
| | from typing import List, Optional |
| |
|
| | import submitit |
| |
|
| | from dinov2.utils.cluster import ( |
| | get_slurm_executor_parameters, |
| | get_slurm_partition, |
| | get_user_checkpoint_path, |
| | ) |
| |
|
| |
|
# Module-level logger; it uses the fixed "dinov2" logger name rather than
# ``__name__``, so messages from this file go through that named logger.
logger = logging.getLogger("dinov2")
| |
|
| |
|
def get_args_parser(
    description: Optional[str] = None,
    parents: Optional[List[argparse.ArgumentParser]] = None,
    add_help: bool = True,
) -> argparse.ArgumentParser:
    """Build the argument parser holding the job-submission options.

    Args:
        description: Text forwarded to ``argparse.ArgumentParser`` as its
            description.
        parents: Parsers whose arguments should be inherited; a falsy value
            (``None`` or an empty list) means "no parents".
        add_help: Whether the parser registers its own ``-h/--help`` option.

    Returns:
        A parser exposing ``--ngpus`` (aliases ``--gpus``,
        ``--gpus-per-node``), ``--nodes`` (alias ``--nnodes``), ``--timeout``,
        ``--partition``, ``--use-volta32``, ``--comment`` and ``--exclude``.
    """
    # Resolved up front: the value becomes the default of ``--partition``.
    default_partition = get_slurm_partition()

    parser = argparse.ArgumentParser(
        description=description,
        parents=parents if parents else [],
        add_help=add_help,
    )

    # (option strings, add_argument keyword arguments), in registration order.
    option_table = (
        (
            ("--ngpus", "--gpus", "--gpus-per-node"),
            {"default": 8, "type": int, "help": "Number of GPUs to request on each node"},
        ),
        (
            ("--nodes", "--nnodes"),
            {"default": 1, "type": int, "help": "Number of nodes to request"},
        ),
        (
            ("--timeout",),
            {"default": 2800, "type": int, "help": "Duration of the job"},
        ),
        (
            ("--partition",),
            {"default": default_partition, "type": str, "help": "Partition where to submit"},
        ),
        (
            ("--use-volta32",),
            {"action": "store_true", "help": "Request V100-32GB GPUs"},
        ),
        (
            ("--comment",),
            {
                "default": "",
                "type": str,
                "help": "Comment to pass to scheduler, e.g. priority message",
            },
        ),
        (
            ("--exclude",),
            {"default": "", "type": str, "help": "Nodes to exclude"},
        ),
    )
    for option_strings, option_kwargs in option_table:
        parser.add_argument(*option_strings, **option_kwargs)

    return parser
| |
|
| |
|
def get_shared_folder() -> Path:
    """Return the ``experiments`` directory under the user checkpoint path.

    The directory is created if it does not exist yet. Only that final
    component is created: ``mkdir`` is called without ``parents=True``.

    Raises:
        RuntimeError: If ``get_user_checkpoint_path()`` returns ``None``.
    """
    checkpoint_root = get_user_checkpoint_path()
    if checkpoint_root is not None:
        experiments_dir = checkpoint_root / "experiments"
        # An already existing directory is fine.
        experiments_dir.mkdir(exist_ok=True)
        return experiments_dir
    raise RuntimeError("Path to user checkpoint cannot be determined")
| |
|
| |
|
def _job_independent_folder(folder) -> Path:
    """Return the longest leading part of ``folder`` with no ``%j`` placeholder."""
    stable = Path()
    for part in Path(folder).parts:
        if "%j" in part:
            break
        stable /= part
    return stable


def submit_jobs(task_class, args, name: str) -> None:
    """Submit ``task_class(args)`` as a job through a submitit executor.

    Args:
        task_class: Callable invoked as ``task_class(args)``; the result is
            handed to ``executor.submit``.
        args: Parsed arguments. Reads ``output_dir``, ``use_volta32``,
            ``comment``, ``exclude``, ``nodes``, ``ngpus``, ``timeout`` and
            ``partition``. ``output_dir`` is overwritten in place with
            ``<shared folder>/%j`` when it is empty.
        name: Job name passed to the executor.

    Raises:
        RuntimeError: Propagated from ``get_shared_folder()`` when no output
            directory is given and the shared folder cannot be determined.
    """
    if not args.output_dir:
        args.output_dir = str(get_shared_folder() / "%j")

    # ``%j`` stands for the job id, which is unknown until submission. Only
    # the part of the path that does not depend on it is created here;
    # creating the full path would leave a stray directory literally named
    # "%j" on disk.
    _job_independent_folder(args.output_dir).mkdir(parents=True, exist_ok=True)
    executor = submitit.AutoExecutor(folder=args.output_dir, slurm_max_num_timeout=30)

    # Optional scheduler settings are forwarded only when requested.
    kwargs = {}
    if args.use_volta32:
        kwargs["slurm_constraint"] = "volta32gb"
    if args.comment:
        kwargs["slurm_comment"] = args.comment
    if args.exclude:
        kwargs["slurm_exclude"] = args.exclude

    executor_params = get_slurm_executor_parameters(
        nodes=args.nodes,
        num_gpus_per_node=args.ngpus,
        timeout_min=args.timeout,
        slurm_signal_delay_s=120,
        slurm_partition=args.partition,
        **kwargs,
    )
    executor.update_parameters(name=name, **executor_params)

    task = task_class(args)
    job = executor.submit(task)

    logger.info("Submitted job_id: %s", job.job_id)
    # Report the concrete location by substituting the actual job id.
    str_output_dir = os.path.abspath(args.output_dir).replace("%j", str(job.job_id))
    logger.info("Logs and checkpoints will be saved at: %s", str_output_dir)
| |
|