# YYYYYYUUU's picture
# Add core reproduction code (binarization layers, PTv3, superpoint ops, min-repro pack)
# 7b95dc2 verified
# Raw
# History Blame Contribute Delete
# 14 kB
from __future__ import print_function
"""
Volcano ML Platform launcher for Pointcept jobs on the Haozhe Vepfs tree.
The default target is the current S3DIS superpoint reboot follow-up setup.
Usage
-----
Preview only:
python3 run.py
Preview a 4-GPU superpoint reboot launch:
MLP_NUM_GPUS=4 MLP_CUDA_VISIBLE_DEVICES=0,1,2,3 \\
MLP_CONFIG=configs/s3dis/reboot_override.py python3 run.py
Render a Haozhe-local launcher without poplab paths:
MLP_RENDER_HAOZHE=1 \\
MLP_HAOZHE_LAUNCHER_OUT=outputs/run_training_haozhe.py \\
MLP_CONFIG=configs/s3dis/reboot_override.py python3 run.py
Real submit (requires volcenginesdk* plus VOLC_AK / VOLC_SK):
export VOLC_AK=... VOLC_SK=...
MLP_SUBMIT=1 python3 run.py
Common environment variables
----------------------------
MLP_POINTCEPT_ROOT Compute-side Pointcept root. Defaults to the Haozhe path.
MLP_PYTHON Compute-side Python executable.
MLP_NUM_GPUS Passed to tools/train.py --num-gpus.
MLP_CUDA_VISIBLE_DEVICES
Defaults to 0,1,...,N-1 based on MLP_NUM_GPUS.
MLP_CONFIG Config path relative to MLP_POINTCEPT_ROOT.
Default: configs/s3dis/semseg-pt-v3m1-0-rpe_sp_after_pool_reboot.py
MLP_SAVE_PATH save_path passed through --options.
Default: exp/<dataset>/<config>_<timestamp>
MLP_OPTIONS Extra tools/train.py --options, for example:
"resume=True weight=exp/.../model/model_last.pth"
MLP_LOG_DIR Writable log directory on the compute side.
MLP_CARD_TYPE Instance type id.
MLP_RESOURCE_QUEUE_ID Resource queue id.
MLP_VEPFS_ID Vepfs mount id.
MLP_IMAGE Container image url.
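MLP_RENDER_HAOZHE When set to 1, also print a Haozhe-local launcher.
MLP_HAOZHE_LAUNCHER_OUT
Optional output path for the rendered launcher.
MLP_SUBMIT When set to 1, submit the job for real; otherwise preview only.
MLP_NUM_MACHINES Default machine count passed to tools/train.py --num-machines.
Default: 1
MLP_ROLE_REPLICAS Replica count of the "worker" role. Defaults to MLP_NUM_MACHINES.
MLP_JOB_NAME Job name. Default: PAMI-<config>_<timestamp> style name built
from the config file name and the launch timestamp.
TORCH_HOME Fallback TORCH_HOME exported on the compute side when it is unset there.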
Notes
-----
- This is the canonical poplab -> compute launcher.
- Keep PAMI2026 on poplab; do not mirror it onto Haozhe2.
- The compute-side code root stays /map-vepfs/haozhe/PAMI_superpoint/pointcept_framework.
- The rendered Haozhe launcher contains only Haozhe-side paths.
"""
import datetime
import os
import shlex
import sys
# Set no_proxy for the volces.com / volcengineapi.com domains unless the
# caller already provided a value of their own.
if "no_proxy" not in os.environ:
    os.environ["no_proxy"] = "volces.com,volcengineapi.com"

# Feature switches: both are enabled only by the exact string "1".
ACTUALLY_SUBMIT = os.getenv("MLP_SUBMIT", "0") == "1"
RENDER_HAOZHE = os.getenv("MLP_RENDER_HAOZHE", "0") == "1"
def _default_cuda_visible(num_gpus):
    """Return the device list "0,1,...,N-1" for ``num_gpus`` GPUs.

    At least one device is always listed, so values below 1 yield "0".
    """
    device_count = max(num_gpus, 1)
    return ",".join(map(str, range(device_count)))
def _config_group(config_rel):
    """Return the group directory of a config path, or "misc".

    For a path shaped like ``configs/<group>/<file>`` (backslashes are
    treated as separators too) the ``<group>`` component is returned; any
    other shape maps to "misc".
    """
    segments = config_rel.replace("\\", "/").split("/")
    is_grouped = segments[0] == "configs" and len(segments) >= 3
    return segments[1] if is_grouped else "misc"
def _config_tag(config_rel):
    """Return the config file name without directory and extension.

    Backslashes are treated as path separators, matching ``_config_group``,
    so a Windows-style path produces the same tag on every platform instead
    of leaking the whole path into the tag on POSIX.
    """
    normalized = config_rel.replace("\\", "/")
    return os.path.splitext(os.path.basename(normalized))[0]
def _default_save_path(config_rel, timestamp):
    """Build the default save path ``exp/<group>/<tag>_<timestamp>``.

    ``<group>`` and ``<tag>`` are derived from ``config_rel`` by
    ``_config_group`` and ``_config_tag``.
    """
    group = _config_group(config_rel)
    run_name = "{}_{}".format(_config_tag(config_rel), timestamp)
    return "exp/{}/{}".format(group, run_name)
def _compose_options_list(save_path, extra_options):
    """Return the ``--options`` entries as a list of strings.

    The list always starts with ``save_path=<save_path>``; when
    ``extra_options`` is non-empty it is split with shell-like rules and the
    resulting tokens are appended.
    """
    leading = "save_path={}".format(save_path)
    trailing = shlex.split(extra_options) if extra_options else []
    return [leading] + trailing
def _compose_options_text(options):
    """Join option tokens into text that is safe to paste into a shell line.

    The tokens typically come from ``shlex.split``, which strips the quoting
    the user supplied. Each token is therefore re-quoted with
    ``shlex.quote`` so that values containing spaces, parentheses or quotes
    reach the training script as a single argument. Tokens made only of
    shell-safe characters are emitted unchanged.
    """
    return " ".join(shlex.quote(option) for option in options)
def _int_env(name, default):
    """Read environment variable ``name`` as an int.

    Returns ``int(default)`` when the variable is unset or its value cannot
    be converted to an integer.
    """
    fallback = default
    try:
        raw_value = os.environ.get(name, str(fallback))
        parsed = int(raw_value)
    except (TypeError, ValueError):
        parsed = int(fallback)
    return parsed
def _render_haozhe_launcher(
    pointcept_root,
    py_exe,
    num_gpus,
    cuda_visible,
    config_rel,
    save_path,
    extra_options,
    torch_home,
):
    """Render the source of a standalone launcher script as a string.

    The arguments are baked into the generated script as ``repr`` literals
    and serve as defaults: at run time the generated script lets the
    MLP_POINTCEPT_ROOT, MLP_PYTHON, MLP_NUM_GPUS, MLP_CUDA_VISIBLE_DEVICES,
    MLP_CONFIG, MLP_SAVE_PATH, MLP_OPTIONS and TORCH_HOME environment
    variables override them. The generated script changes into the root
    directory and runs ``tools/train.py`` through ``subprocess.run``.

    Returns:
        The launcher source code (valid Python) as text.
    """
    # The template is kept at column 0 because its text IS the generated
    # file; the nested indentation below is what makes that file valid
    # Python. Doubled braces are literal braces that survive str.format.
    template = """
import os
import shlex
import subprocess
def run():
    root = os.environ.get("MLP_POINTCEPT_ROOT", {root!r})
    py_exe = os.environ.get("MLP_PYTHON", {py!r})
    num_gpus = int(os.environ.get("MLP_NUM_GPUS", {ngpu!r}))
    cuda_visible = os.environ.get("MLP_CUDA_VISIBLE_DEVICES", {cuda!r})
    config_rel = os.environ.get("MLP_CONFIG", {cfg!r})
    save_path = os.environ.get("MLP_SAVE_PATH", {save_path!r})
    extra_options = os.environ.get("MLP_OPTIONS", {extra!r}).strip()
    torch_home = os.environ.get("TORCH_HOME", {torch_home!r})
    options = ["save_path={{}}".format(save_path)]
    if extra_options:
        options.extend(shlex.split(extra_options))
    env = os.environ.copy()
    env["PYTHONPATH"] = "{{}}:{{}}/libs/pointops/build/lib.linux-x86_64-cpython-310:{{}}".format(
        root,
        root,
        env.get("PYTHONPATH", ""),
    )
    env["TORCH_HOME"] = torch_home
    env["CUDA_HOME"] = os.environ.get("CUDA_HOME", "/usr/local/cuda-12.1")
    env["CUDA_VISIBLE_DEVICES"] = cuda_visible
    cmd = [
        py_exe,
        "-u",
        "tools/train.py",
        "--num-gpus",
        str(num_gpus),
        "--dist-url",
        "auto",
        "--config-file",
        config_rel,
        "--options",
    ]
    cmd.extend(options)
    os.chdir(root)
    subprocess.run(cmd, env=env, check=True)
if __name__ == "__main__":
    run()
"""
    # lstrip() drops the newline that follows the opening triple quote.
    return template.lstrip().format(
        root=pointcept_root,
        py=py_exe,
        # Rendered as a string literal; the launcher converts it with int().
        ngpu=str(num_gpus),
        cuda=cuda_visible,
        cfg=config_rel,
        save_path=save_path,
        extra=extra_options,
        torch_home=torch_home,
    )
def _write_launcher_if_requested(launcher_text):
    """Write ``launcher_text`` to the path named by MLP_HAOZHE_LAUNCHER_OUT.

    Returns the path that was written, or None when the variable is unset
    or blank. A missing parent directory is created first. The file is
    written as UTF-8 with "\\n" line endings.
    """
    target = os.environ.get("MLP_HAOZHE_LAUNCHER_OUT", "").strip()
    if target:
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(target, "w", encoding="utf-8", newline="\n") as handle:
            handle.write(launcher_text)
        return target
    return None
if __name__ == "__main__":
    # One timestamp is shared by the default save path, the job name and
    # the log file name so they can be matched up afterwards.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # ---- Training settings, each overridable through the environment.
    pointcept_root = os.environ.get(
        "MLP_POINTCEPT_ROOT",
        "/map-vepfs/haozhe/PAMI_superpoint/pointcept_framework",
    )
    py_exe = os.environ.get(
        "MLP_PYTHON",
        "/map-vepfs/haozhe/PAMI_superpoint/superpoint/bin/python",
    )
    num_gpus = int(os.environ.get("MLP_NUM_GPUS", "1"))
    num_machines = _int_env("MLP_NUM_MACHINES", 1)
    role_replicas = _int_env("MLP_ROLE_REPLICAS", num_machines)
    cuda_visible = os.environ.get(
        "MLP_CUDA_VISIBLE_DEVICES",
        _default_cuda_visible(num_gpus),
    )
    config_rel = os.environ.get(
        "MLP_CONFIG",
        "configs/s3dis/semseg-pt-v3m1-0-rpe_sp_after_pool_reboot.py",
    )
    config_tag = _config_tag(config_rel)
    save_path = os.environ.get(
        "MLP_SAVE_PATH",
        _default_save_path(config_rel, timestamp),
    )
    extra_options = os.environ.get("MLP_OPTIONS", "").strip()
    train_options = _compose_options_list(save_path, extra_options)
    train_options_text = _compose_options_text(train_options)
    log_dir = os.environ.get("MLP_LOG_DIR", os.path.join(pointcept_root, "logs"))
    log_file_eval = os.path.join(log_dir, "volc_job_{}.log".format(timestamp))
    torch_home = os.environ.get(
        "TORCH_HOME",
        "/map-vepfs/haozhe/PAMI_superpoint/.torch_cache",
    )

    # ---- Platform settings (instance type, queue, storage, image).
    card_type = os.environ.get("MLP_CARD_TYPE", "ml.pni2l.7xlarge")
    jobname = os.environ.get("MLP_JOB_NAME", "PAMI-{}-{}".format(config_tag, timestamp))
    resource_queue_id = os.environ.get(
        "MLP_RESOURCE_QUEUE_ID", "q-20241024095431-86z6j"
    )
    vepfs_id = os.environ.get("MLP_VEPFS_ID", "vepfs-cnbj26c39866e9ec1")
    vepfs_host_path = "/mnt/{}".format(vepfs_id)
    image_url = os.environ.get(
        "MLP_IMAGE",
        "vemlp-cn-beijing2.cr.volces.com/preset-images/ray:2.12.0-cuda-121-py310",
    )

    # Names of the compute-side variables that carry the master address,
    # master port and machine rank; the bash command reads them indirectly.
    master_addr_env = os.environ.get("MLP_MASTER_ADDR_ENV", "MLP_WORKER_0_HOST")
    master_port_env = os.environ.get("MLP_MASTER_PORT_ENV", "MLP_WORKER_0_PORT")
    machine_rank_env = os.environ.get("MLP_MACHINE_RANK_ENV", "MLP_ROLE_INDEX")
    default_master_port = os.environ.get("MLP_MASTER_PORT", "29500")

    # Command executed on the compute side. Doubled braces are literal
    # shell braces. Path-like values are double-quoted (as "{py}" already
    # was) so that whitespace in them does not split the argument. The log
    # directory is created on a best-effort basis: without it `tee` cannot
    # open the log file and, under `pipefail`, the whole job is reported as
    # failed even though training ran.
    bash_command = """
set -euo pipefail
export PYTHONPATH="{root}:{root}/libs/pointops/build/lib.linux-x86_64-cpython-310:${{PYTHONPATH:-}}"
export TORCH_HOME="${{TORCH_HOME:-{torch_home}}}"
export CUDA_HOME=/usr/local/cuda-12.1
export MLP_NUM_MACHINES=${{MLP_NUM_MACHINES:-{num_machines}}}
export MLP_ROLE_REPLICAS=${{MLP_ROLE_REPLICAS:-{role_replicas}}}
export MLP_MACHINE_RANK=${{{machine_rank_env}:-0}}
export MLP_MASTER_ADDR=${{{master_addr_env}:-127.0.0.1}}
export MLP_MASTER_PORT=${{{master_port_env}:-{default_master_port}}}
cd "{root}"
export CUDA_VISIBLE_DEVICES="{cuda}"
mkdir -p "{log_dir}" || true
echo "[volc-ddp] host=$(hostname) rank=${{MLP_MACHINE_RANK}} master=${{MLP_MASTER_ADDR}}:${{MLP_MASTER_PORT}} num_machines=${{MLP_NUM_MACHINES}} num_gpus={ngpu} cuda=${{CUDA_VISIBLE_DEVICES}}"
"{py}" -u tools/train.py --num-gpus {ngpu} --num-machines ${{MLP_NUM_MACHINES}} --machine-rank ${{MLP_MACHINE_RANK}} --dist-url tcp://${{MLP_MASTER_ADDR}}:${{MLP_MASTER_PORT}} \\
--config-file "{cfg}" \\
--options {options} \\
2>&1 | tee "{log}"
""".format(
        root=pointcept_root,
        torch_home=torch_home,
        py=py_exe,
        num_machines=num_machines,
        role_replicas=role_replicas,
        machine_rank_env=machine_rank_env,
        master_addr_env=master_addr_env,
        master_port_env=master_port_env,
        default_master_port=default_master_port,
        cuda=cuda_visible,
        log_dir=log_dir,
        ngpu=num_gpus,
        cfg=config_rel,
        options=train_options_text,
        log=log_file_eval,
    )

    # ---- Preview: always printed, whether or not the job is submitted.
    print("========== ML Platform Job Preview ==========")
    print("ACTUALLY_SUBMIT:", ACTUALLY_SUBMIT)
    print("jobname:", jobname)
    print("card_type:", card_type)
    print("resource_queue_id:", resource_queue_id)
    print("vepfs_id:", vepfs_id, "-> /map-vepfs")
    print("image:", image_url)
    print("MLP_POINTCEPT_ROOT:", pointcept_root)
    print("MLP_PYTHON:", py_exe)
    print("MLP_NUM_GPUS / CUDA_VISIBLE_DEVICES:", num_gpus, "/", cuda_visible)
    print("MLP_NUM_MACHINES / MLP_ROLE_REPLICAS:", num_machines, "/", role_replicas)
    print("MLP_CONFIG:", config_rel)
    print("MLP_SAVE_PATH:", save_path)
    print("MLP_OPTIONS:", extra_options or "(none)")
    print("MLP_RENDER_HAOZHE:", RENDER_HAOZHE)
    print("log_file_eval:", log_file_eval)
    print("----- bash_command -----")
    print(bash_command)
    print("------------------------")

    # ---- Optional standalone launcher (printed, and written if requested).
    if RENDER_HAOZHE:
        launcher_text = _render_haozhe_launcher(
            pointcept_root,
            py_exe,
            num_gpus,
            cuda_visible,
            config_rel,
            save_path,
            extra_options,
            torch_home,
        )
        print("----- haozhe_launcher -----")
        print(launcher_text)
        print("---------------------------")
        launcher_out = _write_launcher_if_requested(launcher_text)
        if launcher_out:
            print("haozhe_launcher_out:", launcher_out)

    if not ACTUALLY_SUBMIT:
        print(
            "\nPreview only. Real create_job requires: "
            "export VOLC_AK=... VOLC_SK=... && MLP_SUBMIT=1 python3 run.py"
        )
        sys.exit(0)

    # ---- Real submission: credentials and SDK are required from here on.
    if not os.environ.get("VOLC_AK") or not os.environ.get("VOLC_SK"):
        sys.exit("MLP_SUBMIT=1 requires VOLC_AK and VOLC_SK")
    try:
        # Imported lazily so that preview mode works without the SDK.
        import volcenginesdkcore
        import volcenginesdkmlplatform20240701
        from volcenginesdkcore.rest import ApiException
    except ImportError as e:
        sys.exit(
            "Please install the Volcano ML Platform SDK first: "
            "pip install volcenginesdkcore ... Error: {}".format(e)
        )

    configuration = volcenginesdkcore.Configuration()
    configuration.ak = os.environ["VOLC_AK"]
    configuration.sk = os.environ["VOLC_SK"]
    configuration.region = "cn-beijing2"
    volcenginesdkcore.Configuration.set_default(configuration)
    api_instance = volcenginesdkmlplatform20240701.MLPLATFORM20240701Api()

    def create_job():
        """Build the CreateJob request from the settings above and submit it.

        Returns the job id on success, or None when the API call raises
        ApiException (the error is printed).
        """
        req_resource = volcenginesdkmlplatform20240701.ResourceForCreateJobInput(
            instance_type_id=card_type,
            type="Preset",
            zone_id="cn-beijing2-a",
        )
        req_roles = volcenginesdkmlplatform20240701.RoleForCreateJobInput(
            name="worker",
            replicas=role_replicas,
            resource=req_resource,
        )
        req_resource_config = volcenginesdkmlplatform20240701.ResourceConfigForCreateJobInput(
            priority=2,
            resource_queue_id=resource_queue_id,
            roles=[req_roles],
        )
        req_image = volcenginesdkmlplatform20240701.ImageForCreateJobInput(
            type="Prebuild",
            url=image_url,
        )
        req_runtime_config = volcenginesdkmlplatform20240701.RuntimeConfigForCreateJobInput(
            command=bash_command,
            framework="PyTorch",
            image=req_image,
        )
        # The Vepfs volume is mounted at /map-vepfs inside the job.
        req_vepfs = volcenginesdkmlplatform20240701.VepfsForCreateJobInput(
            host_path=vepfs_host_path,
            id=vepfs_id,
        )
        req_config = volcenginesdkmlplatform20240701.ConfigForCreateJobInput(
            vepfs=req_vepfs,
        )
        req_storages = volcenginesdkmlplatform20240701.StorageForCreateJobInput(
            config=req_config,
            mount_path="/map-vepfs",
            type="Vepfs",
        )
        req_storage_config = volcenginesdkmlplatform20240701.StorageConfigForCreateJobInput(
            storages=[req_storages],
        )
        create_job_request = volcenginesdkmlplatform20240701.CreateJobRequest(
            dry_run=False,
            name=jobname,
            resource_config=req_resource_config,
            runtime_config=req_runtime_config,
            storage_config=req_storage_config,
        )
        try:
            response = api_instance.create_job(create_job_request)
            job_id = response.id
            print("Job submitted successfully")
            print(" job_id:", job_id)
            print(" job_name:", jobname)
            return job_id
        except ApiException as e:
            print("Submit failed:", e)
            return None

    # A failed submit must not look like success to the calling shell.
    if create_job() is None:
        sys.exit(1)