| | |
| | |
| |
|
| | import os |
| | import sys |
| | from pathlib import Path |
| |
|
| | import torch |
| | import yaml |
| |
|
| | FILE = Path(__file__).resolve() |
| | ROOT = FILE.parents[2] |
| | if str(ROOT) not in sys.path: |
| | sys.path.append(str(ROOT)) |
| |
|
| | port = 0 |
| | path = Path('').resolve() |
| | for last in path.rglob('*/**/last.pt'): |
| | ckpt = torch.load(last) |
| | if ckpt['optimizer'] is None: |
| | continue |
| |
|
| | |
| | with open(last.parent.parent / 'opt.yaml', errors='ignore') as f: |
| | opt = yaml.safe_load(f) |
| |
|
| | |
| | d = opt['device'].split(',') |
| | nd = len(d) |
| | ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1) |
| |
|
| | if ddp: |
| | port += 1 |
| | cmd = f'python -m torch.distributed.run --nproc_per_node {nd} --master_port {port} train.py --resume {last}' |
| | else: |
| | cmd = f'python train.py --resume {last}' |
| |
|
| | cmd += ' > /dev/null 2>&1 &' |
| | print(cmd) |
| | os.system(cmd) |
| |
|