Add core reproduction code (binarization layers, PTv3, superpoint ops, min-repro pack)

7b95dc2 verified 14 days ago

14 kB

	from __future__ import print_function
	"""
	Volcano ML Platform launcher for Pointcept jobs on the Haozhe Vepfs tree.
	The default target is the current S3DIS superpoint reboot follow-up setup.

	Usage
	-----
	Preview only:
	python3 run.py

	Preview a 4-GPU superpoint reboot launch:
	MLP_NUM_GPUS=4 MLP_CUDA_VISIBLE_DEVICES=0,1,2,3 \\
	MLP_CONFIG=configs/s3dis/reboot_override.py python3 run.py

	Render a Haozhe-local launcher without poplab paths:
	MLP_RENDER_HAOZHE=1 \\
	MLP_HAOZHE_LAUNCHER_OUT=outputs/run_training_haozhe.py \\
	MLP_CONFIG=configs/s3dis/reboot_override.py python3 run.py

	Real submit (requires volcenginesdk* plus VOLC_AK / VOLC_SK):
	export VOLC_AK=... VOLC_SK=...
	MLP_SUBMIT=1 python3 run.py

	Common environment variables
	----------------------------
	MLP_POINTCEPT_ROOT Compute-side Pointcept root. Defaults to the Haozhe path.
	MLP_PYTHON Compute-side Python executable.
	MLP_NUM_GPUS Passed to tools/train.py --num-gpus.
	MLP_CUDA_VISIBLE_DEVICES
	Defaults to 0,1,...,N-1 based on MLP_NUM_GPUS.
	MLP_CONFIG Config path relative to MLP_POINTCEPT_ROOT.
	Default: configs/s3dis/semseg-pt-v3m1-0-rpe_sp_after_pool_reboot.py
	MLP_SAVE_PATH save_path passed through --options.
	Default: exp/<dataset>/<config>_<timestamp>
	MLP_OPTIONS Extra tools/train.py --options, for example:
	"resume=True weight=exp/.../model/model_last.pth"
	MLP_LOG_DIR Writable log directory on the compute side.
	MLP_CARD_TYPE Instance type id.
	MLP_RESOURCE_QUEUE_ID Resource queue id.
	MLP_VEPFS_ID Vepfs mount id.
	MLP_IMAGE Container image url.
	MLP_RENDER_HAOZHE When set to 1, also print a Haozhe-local launcher.
	MLP_HAOZHE_LAUNCHER_OUT
	Optional output path for the rendered launcher.

	Notes
	-----
	- This is the canonical poplab -> compute launcher.
	- Keep PAMI2026 on poplab; do not mirror it onto Haozhe2.
	- The compute-side code root stays /map-vepfs/haozhe/PAMI_superpoint/pointcept_framework.
	- The rendered Haozhe launcher contains only Haozhe-side paths.
	"""
	import datetime
	import os
	import shlex
	import sys

	# Provide a default no_proxy value for the Volcano endpoints; an existing
	# no_proxy setting in the caller's environment is left untouched.
	if "no_proxy" not in os.environ:
	    os.environ["no_proxy"] = "volces.com,volcengineapi.com"

	# Both switches are opt-in: only the exact string "1" enables them.
	ACTUALLY_SUBMIT = os.getenv("MLP_SUBMIT", "0") == "1"
	RENDER_HAOZHE = os.getenv("MLP_RENDER_HAOZHE", "0") == "1"


	def _default_cuda_visible(num_gpus):
	return ",".join(str(i) for i in range(max(1, num_gpus)))


	def _config_group(config_rel):
	parts = config_rel.replace("\\", "/").split("/")
	if len(parts) >= 3 and parts[0] == "configs":
	return parts[1]
	return "misc"


	def _config_tag(config_rel):
	return os.path.splitext(os.path.basename(config_rel))[0]


	def _default_save_path(config_rel, timestamp):
	    """Build the default save path ``exp/<group>/<tag>_<timestamp>``."""
	    group = _config_group(config_rel)
	    tag = _config_tag(config_rel)
	    return "exp/{}/{}_{}".format(group, tag, timestamp)


	def _compose_options_list(save_path, extra_options):
	options = ["save_path={}".format(save_path)]
	if extra_options:
	options.extend(shlex.split(extra_options))
	return options


	def _compose_options_text(options):
	return " ".join(options)


	def _int_env(name, default):
	try:
	return int(os.environ.get(name, str(default)))
	except (TypeError, ValueError):
	return int(default)


	def _render_haozhe_launcher(
	pointcept_root,
	py_exe,
	num_gpus,
	cuda_visible,
	config_rel,
	save_path,
	extra_options,
	torch_home,
	):
	return """
	import os
	import shlex
	import subprocess

	def run():
	root = os.environ.get("MLP_POINTCEPT_ROOT", {root!r})
	py_exe = os.environ.get("MLP_PYTHON", {py!r})
	num_gpus = int(os.environ.get("MLP_NUM_GPUS", {ngpu!r}))
	cuda_visible = os.environ.get("MLP_CUDA_VISIBLE_DEVICES", {cuda!r})
	config_rel = os.environ.get("MLP_CONFIG", {cfg!r})
	save_path = os.environ.get("MLP_SAVE_PATH", {save_path!r})
	extra_options = os.environ.get("MLP_OPTIONS", {extra!r}).strip()
	torch_home = os.environ.get("TORCH_HOME", {torch_home!r})
	options = ["save_path={{}}".format(save_path)]
	if extra_options:
	options.extend(shlex.split(extra_options))
	env = os.environ.copy()
	env["PYTHONPATH"] = "{{}}:{{}}/libs/pointops/build/lib.linux-x86_64-cpython-310:{{}}".format(
	root,
	root,
	env.get("PYTHONPATH", ""),
	)
	env["TORCH_HOME"] = torch_home
	env["CUDA_HOME"] = os.environ.get("CUDA_HOME", "/usr/local/cuda-12.1")
	env["CUDA_VISIBLE_DEVICES"] = cuda_visible
	cmd = [
	py_exe,
	"-u",
	"tools/train.py",
	"--num-gpus",
	str(num_gpus),
	"--dist-url",
	"auto",
	"--config-file",
	config_rel,
	"--options",
	]
	cmd.extend(options)
	os.chdir(root)
	subprocess.run(cmd, env=env, check=True)

	if __name__ == "__main__":
	run()
	""".lstrip().format(
	root=pointcept_root,
	py=py_exe,
	ngpu=str(num_gpus),
	cuda=cuda_visible,
	cfg=config_rel,
	save_path=save_path,
	extra=extra_options,
	torch_home=torch_home,
	)


	def _write_launcher_if_requested(launcher_text):
	out_path = os.environ.get("MLP_HAOZHE_LAUNCHER_OUT", "").strip()
	if not out_path:
	return None
	out_dir = os.path.dirname(out_path)
	if out_dir:
	os.makedirs(out_dir, exist_ok=True)
	with open(out_path, "w", encoding="utf-8", newline="\n") as f:
	f.write(launcher_text)
	return out_path


	if __name__ == "__main__":
	    # One timestamp is shared by the default save path, the log file name
	    # and the default job name.
	    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

	    # ---- Compute-side settings, each overridable through the environment.
	    pointcept_root = os.environ.get(
	        "MLP_POINTCEPT_ROOT",
	        "/map-vepfs/haozhe/PAMI_superpoint/pointcept_framework",
	    )
	    py_exe = os.environ.get(
	        "MLP_PYTHON",
	        "/map-vepfs/haozhe/PAMI_superpoint/superpoint/bin/python",
	    )
	    num_gpus = int(os.environ.get("MLP_NUM_GPUS", "1"))
	    num_machines = _int_env("MLP_NUM_MACHINES", 1)
	    role_replicas = _int_env("MLP_ROLE_REPLICAS", num_machines)
	    cuda_visible = os.environ.get(
	        "MLP_CUDA_VISIBLE_DEVICES",
	        _default_cuda_visible(num_gpus),
	    )
	    config_rel = os.environ.get(
	        "MLP_CONFIG",
	        "configs/s3dis/semseg-pt-v3m1-0-rpe_sp_after_pool_reboot.py",
	    )
	    config_tag = _config_tag(config_rel)
	    save_path = os.environ.get(
	        "MLP_SAVE_PATH",
	        _default_save_path(config_rel, timestamp),
	    )
	    extra_options = os.environ.get("MLP_OPTIONS", "").strip()
	    train_options = _compose_options_list(save_path, extra_options)
	    train_options_text = _compose_options_text(train_options)
	    log_dir = os.environ.get("MLP_LOG_DIR", os.path.join(pointcept_root, "logs"))
	    log_file_eval = os.path.join(log_dir, "volc_job_{}.log".format(timestamp))

	    torch_home = os.environ.get(
	        "TORCH_HOME",
	        "/map-vepfs/haozhe/PAMI_superpoint/.torch_cache",
	    )

	    # ---- Platform-side settings.
	    card_type = os.environ.get("MLP_CARD_TYPE", "ml.pni2l.7xlarge")
	    jobname = os.environ.get("MLP_JOB_NAME", "PAMI-{}-{}".format(config_tag, timestamp))
	    resource_queue_id = os.environ.get(
	        "MLP_RESOURCE_QUEUE_ID", "q-20241024095431-86z6j"
	    )
	    vepfs_id = os.environ.get("MLP_VEPFS_ID", "vepfs-cnbj26c39866e9ec1")
	    vepfs_host_path = "/mnt/{}".format(vepfs_id)
	    image_url = os.environ.get(
	        "MLP_IMAGE",
	        "vemlp-cn-beijing2.cr.volces.com/preset-images/ray:2.12.0-cuda-121-py310",
	    )
	    # Names of the environment variables the job reads at run time to find
	    # the master address/port and this machine's rank.
	    master_addr_env = os.environ.get("MLP_MASTER_ADDR_ENV", "MLP_WORKER_0_HOST")
	    master_port_env = os.environ.get("MLP_MASTER_PORT_ENV", "MLP_WORKER_0_PORT")
	    machine_rank_env = os.environ.get("MLP_MACHINE_RANK_ENV", "MLP_ROLE_INDEX")
	    default_master_port = os.environ.get("MLP_MASTER_PORT", "29500")

	    # ---- Shell script executed inside the job container.
	    # Path-like values are passed through shlex.quote so a value containing
	    # spaces or shell metacharacters stays a single word; values made only
	    # of shell-safe characters are emitted unchanged.
	    pointops_build = pointcept_root + "/libs/pointops/build/lib.linux-x86_64-cpython-310"
	    bash_lines = [
	        "set -euo pipefail",
	        "export PYTHONPATH={root}:{pointops}:${{PYTHONPATH:-}}",
	        "export TORCH_HOME=${{TORCH_HOME:-{torch_home}}}",
	        "export CUDA_HOME=/usr/local/cuda-12.1",
	        "export MLP_NUM_MACHINES=${{MLP_NUM_MACHINES:-{num_machines}}}",
	        "export MLP_ROLE_REPLICAS=${{MLP_ROLE_REPLICAS:-{role_replicas}}}",
	        "export MLP_MACHINE_RANK=${{{machine_rank_env}:-0}}",
	        "export MLP_MASTER_ADDR=${{{master_addr_env}:-127.0.0.1}}",
	        "export MLP_MASTER_PORT=${{{master_port_env}:-{default_master_port}}}",
	        "cd {root}",
	        # tee cannot create missing directories, and with pipefail a failing
	        # tee would fail the whole job, so make sure the log dir exists.
	        "mkdir -p {log_dir}",
	        "export CUDA_VISIBLE_DEVICES={cuda}",
	        'echo "[volc-ddp] host=$(hostname) rank=${{MLP_MACHINE_RANK}} master=${{MLP_MASTER_ADDR}}:${{MLP_MASTER_PORT}} num_machines=${{MLP_NUM_MACHINES}} num_gpus={ngpu} cuda=${{CUDA_VISIBLE_DEVICES}}"',
	        "{py} -u tools/train.py --num-gpus {ngpu} --num-machines ${{MLP_NUM_MACHINES}} --machine-rank ${{MLP_MACHINE_RANK}} --dist-url tcp://${{MLP_MASTER_ADDR}}:${{MLP_MASTER_PORT}} \\",
	        "--config-file {cfg} \\",
	        "--options {options} \\",
	        # A plain pipe: a backslash-escaped "|" would be handed to train.py
	        # as a literal argument instead of connecting the output to tee.
	        "2>&1 | tee {log}",
	    ]
	    bash_body = "\n".join(bash_lines).format(
	        root=shlex.quote(pointcept_root),
	        pointops=shlex.quote(pointops_build),
	        torch_home=shlex.quote(torch_home),
	        py=shlex.quote(py_exe),
	        num_machines=num_machines,
	        role_replicas=role_replicas,
	        machine_rank_env=machine_rank_env,
	        master_addr_env=master_addr_env,
	        master_port_env=master_port_env,
	        default_master_port=default_master_port,
	        cuda=shlex.quote(cuda_visible),
	        ngpu=num_gpus,
	        cfg=shlex.quote(config_rel),
	        options=train_options_text,
	        log_dir=shlex.quote(log_dir),
	        log=shlex.quote(log_file_eval),
	    )
	    bash_command = "\n{}\n".format(bash_body)

	    # ---- Preview: always printed, whether or not the job is submitted.
	    print("========== ML Platform Job Preview ==========")
	    print("ACTUALLY_SUBMIT:", ACTUALLY_SUBMIT)
	    print("jobname:", jobname)
	    print("card_type:", card_type)
	    print("resource_queue_id:", resource_queue_id)
	    print("vepfs_id:", vepfs_id, "-> /map-vepfs")
	    print("image:", image_url)
	    print("MLP_POINTCEPT_ROOT:", pointcept_root)
	    print("MLP_PYTHON:", py_exe)
	    print("MLP_NUM_GPUS / CUDA_VISIBLE_DEVICES:", num_gpus, "/", cuda_visible)
	    print("MLP_NUM_MACHINES / MLP_ROLE_REPLICAS:", num_machines, "/", role_replicas)
	    print("MLP_CONFIG:", config_rel)
	    print("MLP_SAVE_PATH:", save_path)
	    print("MLP_OPTIONS:", extra_options or "(none)")
	    print("MLP_RENDER_HAOZHE:", RENDER_HAOZHE)
	    print("log_file_eval:", log_file_eval)
	    print("----- bash_command -----")
	    print(bash_command)
	    print("------------------------")

	    if RENDER_HAOZHE:
	        launcher_text = _render_haozhe_launcher(
	            pointcept_root,
	            py_exe,
	            num_gpus,
	            cuda_visible,
	            config_rel,
	            save_path,
	            extra_options,
	            torch_home,
	        )
	        print("----- haozhe_launcher -----")
	        print(launcher_text)
	        print("---------------------------")
	        launcher_out = _write_launcher_if_requested(launcher_text)
	        if launcher_out:
	            print("haozhe_launcher_out:", launcher_out)

	    if not ACTUALLY_SUBMIT:
	        print(
	            "\nPreview only. Real create_job requires: "
	            "export VOLC_AK=... VOLC_SK=... && MLP_SUBMIT=1 python3 run.py"
	        )
	        sys.exit(0)

	    if not os.environ.get("VOLC_AK") or not os.environ.get("VOLC_SK"):
	        sys.exit("MLP_SUBMIT=1 requires VOLC_AK and VOLC_SK")

	    # The SDK is imported only on the submit path so that preview mode works
	    # without it installed.
	    try:
	        import volcenginesdkcore
	        import volcenginesdkmlplatform20240701
	        from volcenginesdkcore.rest import ApiException
	    except ImportError as e:
	        sys.exit(
	            "Please install the Volcano ML Platform SDK first: "
	            "pip install volcenginesdkcore ... Error: {}".format(e)
	        )

	    configuration = volcenginesdkcore.Configuration()
	    configuration.ak = os.environ["VOLC_AK"]
	    configuration.sk = os.environ["VOLC_SK"]
	    configuration.region = "cn-beijing2"

	    volcenginesdkcore.Configuration.set_default(configuration)
	    api_instance = volcenginesdkmlplatform20240701.MLPLATFORM20240701Api()

	    def create_job():
	        """Build the CreateJob request and submit it.

	        Returns the id from the API response, or ``None`` when the API call
	        raises ``ApiException``.
	        """
	        req_resource = volcenginesdkmlplatform20240701.ResourceForCreateJobInput(
	            instance_type_id=card_type,
	            type="Preset",
	            zone_id="cn-beijing2-a",
	        )
	        req_roles = volcenginesdkmlplatform20240701.RoleForCreateJobInput(
	            name="worker",
	            replicas=role_replicas,
	            resource=req_resource,
	        )
	        req_resource_config = volcenginesdkmlplatform20240701.ResourceConfigForCreateJobInput(
	            priority=2,
	            resource_queue_id=resource_queue_id,
	            roles=[req_roles],
	        )
	        req_image = volcenginesdkmlplatform20240701.ImageForCreateJobInput(
	            type="Prebuild",
	            url=image_url,
	        )
	        req_runtime_config = volcenginesdkmlplatform20240701.RuntimeConfigForCreateJobInput(
	            command=bash_command,
	            framework="PyTorch",
	            image=req_image,
	        )
	        req_vepfs = volcenginesdkmlplatform20240701.VepfsForCreateJobInput(
	            host_path=vepfs_host_path,
	            id=vepfs_id,
	        )
	        req_config = volcenginesdkmlplatform20240701.ConfigForCreateJobInput(
	            vepfs=req_vepfs,
	        )
	        req_storages = volcenginesdkmlplatform20240701.StorageForCreateJobInput(
	            config=req_config,
	            mount_path="/map-vepfs",
	            type="Vepfs",
	        )
	        req_storage_config = volcenginesdkmlplatform20240701.StorageConfigForCreateJobInput(
	            storages=[req_storages],
	        )
	        create_job_request = volcenginesdkmlplatform20240701.CreateJobRequest(
	            dry_run=False,
	            name=jobname,
	            resource_config=req_resource_config,
	            runtime_config=req_runtime_config,
	            storage_config=req_storage_config,
	        )

	        try:
	            response = api_instance.create_job(create_job_request)
	            job_id = response.id
	            print("Job submitted successfully")
	            print(" job_id:", job_id)
	            print(" job_name:", jobname)
	            return job_id
	        except ApiException as e:
	            print("Submit failed:", e)
	            return None

	    # Report a failed submission through the exit status so calling scripts
	    # can detect it.
	    if create_job() is None:
	        sys.exit(1)