Spaces:

xfys
/

detect

Configuration error

detect / utils /aws /resume.py

Upload 233 files

a82c176 almost 3 years ago

1.2 kB

	# Resume all interrupted trainings in yolov5/ dir including DDP trainings
	# Usage: $ python utils/aws/resume.py

	import os
	import shlex
	import sys
	from pathlib import Path

	import torch
	import yaml

	FILE = Path(__file__).resolve()
	ROOT = FILE.parents[2]  # YOLOv5 root directory
	if str(ROOT) not in sys.path:
	    sys.path.append(str(ROOT))  # add ROOT to PATH

	# Scan the current working directory tree for last.pt checkpoints and relaunch
	# train.py --resume for each one that still carries optimizer state.
	port = 0  # --master_port, bumped once per DDP launch so each run gets its own port
	path = Path('').resolve()  # search root: the directory the script is started from
	# Path.rglob() accepts relative patterns only; a pattern starting with '/' raises
	# NotImplementedError, so the pattern must be '*/last.pt'.
	for last in path.rglob('*/last.pt'):
	    # The checkpoint is only inspected here, so load it on the CPU instead of
	    # restoring tensors onto the device they were saved from.
	    ckpt = torch.load(last, map_location='cpu')
	    if ckpt['optimizer'] is None:
	        continue  # no optimizer state to resume from (presumably a finished run)

	    # Load opt.yaml (stored two levels above last.pt)
	    with open(last.parent.parent / 'opt.yaml', errors='ignore') as f:
	        opt = yaml.safe_load(f)

	    # Get device count
	    d = opt['device'].split(',')  # devices
	    nd = len(d)  # number of devices
	    # NOTE(review): str.split(',') always returns at least one element (''.split(',')
	    # == ['']), so `nd == 0` is never true and an empty device string is treated as a
	    # single-device run. Left unchanged to keep the launch mode identical; confirm the
	    # intended behaviour for multi-GPU machines with an empty device setting.
	    ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1)  # distributed data parallel

	    # Quote the path so directories containing spaces or shell metacharacters
	    # survive the trip through the shell invoked by os.system().
	    target = shlex.quote(str(last))

	    if ddp:  # multi-GPU
	        port += 1
	        cmd = f'python -m torch.distributed.run --nproc_per_node {nd} --master_port {port} train.py --resume {target}'
	    else:  # single-GPU
	        cmd = f'python train.py --resume {target}'

	    cmd += ' > /dev/null 2>&1 &'  # discard all output and run in the background
	    print(cmd)
	    os.system(cmd)