| """ |
| utils for submitting to clusters, such as slurm |
| """ |
|
|
| import os |
| from omegaconf import DictConfig, OmegaConf |
| from datetime import datetime |
| from pathlib import Path |
|
|
| from utils.print_utils import cyan |
|
|
| |
| REPO_DIR = None |
|
|
|
|
| def submit_slurm_job( |
| cfg: DictConfig, |
| python_args: str, |
| project_root: Path, |
| ): |
| log_dir = project_root / "slurm_logs" / f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}-{cfg.name}" |
| log_dir.mkdir(exist_ok=True, parents=True) |
| (project_root / "slurm_logs" / "latest").unlink(missing_ok=True) |
| (project_root / "slurm_logs" / "latest").symlink_to(log_dir, target_is_directory=True) |
|
|
| params = dict(name=cfg.name, log_dir=log_dir, project_root=project_root, python_args=python_args) |
| params.update(cfg.cluster.params) |
|
|
| slurm_script = cfg.cluster.launch_template.format(**params) |
|
|
| slurm_script_path = log_dir / "job.slurm" |
| with slurm_script_path.open("w") as f: |
| f.write(slurm_script) |
|
|
| os.system(f"chmod +x {slurm_script_path}") |
| os.system(f"sbatch {slurm_script_path}") |
|
|
| print(f"\n{cyan('script:')} {slurm_script_path}\n{cyan('slurm errors and logs:')} {log_dir}\n") |
|
|
| return log_dir |
|
|