| from __future__ import print_function |
| """ |
| Volcano ML Platform launcher for Pointcept jobs on the Haozhe Vepfs tree. |
| The default target is the current S3DIS superpoint reboot follow-up setup. |
| |
| Usage |
| ----- |
| Preview only: |
| python3 run.py |
| |
| Preview a 4-GPU superpoint reboot launch: |
| MLP_NUM_GPUS=4 MLP_CUDA_VISIBLE_DEVICES=0,1,2,3 \\ |
| MLP_CONFIG=configs/s3dis/reboot_override.py python3 run.py |
| |
| Render a Haozhe-local launcher without poplab paths: |
| MLP_RENDER_HAOZHE=1 \\ |
| MLP_HAOZHE_LAUNCHER_OUT=outputs/run_training_haozhe.py \\ |
| MLP_CONFIG=configs/s3dis/reboot_override.py python3 run.py |
| |
| Real submit (requires volcenginesdk* plus VOLC_AK / VOLC_SK): |
| export VOLC_AK=... VOLC_SK=... |
| MLP_SUBMIT=1 python3 run.py |
| |
| Common environment variables |
| ---------------------------- |
| MLP_POINTCEPT_ROOT Compute-side Pointcept root. Defaults to the Haozhe path. |
| MLP_PYTHON Compute-side Python executable. |
| MLP_NUM_GPUS Passed to tools/train.py --num-gpus. |
| MLP_CUDA_VISIBLE_DEVICES |
| Defaults to 0,1,...,N-1 based on MLP_NUM_GPUS. |
| MLP_CONFIG Config path relative to MLP_POINTCEPT_ROOT. |
| Default: configs/s3dis/semseg-pt-v3m1-0-rpe_sp_after_pool_reboot.py |
| MLP_SAVE_PATH save_path passed through --options. |
| Default: exp/<dataset>/<config>_<timestamp> |
| MLP_OPTIONS Extra tools/train.py --options, for example: |
| "resume=True weight=exp/.../model/model_last.pth" |
| MLP_LOG_DIR Writable log directory on the compute side. |
| MLP_CARD_TYPE Instance type id. |
| MLP_RESOURCE_QUEUE_ID Resource queue id. |
| MLP_VEPFS_ID Vepfs mount id. |
| MLP_IMAGE Container image url. |
| MLP_RENDER_HAOZHE When set to 1, also print a Haozhe-local launcher. |
| MLP_HAOZHE_LAUNCHER_OUT |
| Optional output path for the rendered launcher. |
| |
| Notes |
| ----- |
| - This is the canonical poplab -> compute launcher. |
| - Keep PAMI2026 on poplab; do not mirror it onto Haozhe2. |
| - The compute-side code root stays /map-vepfs/haozhe/PAMI_superpoint/pointcept_framework. |
| - The rendered Haozhe launcher contains only Haozhe-side paths. |
| """ |
| import datetime |
| import os |
| import shlex |
| import sys |
|
|
# Default no_proxy to the Volcano Engine domains, but never override a value
# the caller has already exported.
if "no_proxy" not in os.environ:
    os.environ["no_proxy"] = "volces.com,volcengineapi.com"

# Both switches are opt-in: only the exact string "1" turns them on.
ACTUALLY_SUBMIT = os.getenv("MLP_SUBMIT", "0") == "1"  # really create the job
RENDER_HAOZHE = os.getenv("MLP_RENDER_HAOZHE", "0") == "1"  # also render the Haozhe launcher
|
|
|
|
def _default_cuda_visible(num_gpus):
    """Return the default ``CUDA_VISIBLE_DEVICES`` string for ``num_gpus`` GPUs.

    Device ids count up from 0 and are comma-separated.  At least one device
    is always listed, so a count of zero or less still yields ``"0"``.
    """
    device_count = num_gpus if num_gpus > 1 else 1
    return ",".join(map(str, range(device_count)))
|
|
|
|
def _config_group(config_rel):
    """Return the directory name directly under ``configs/`` for a config path.

    Backslashes are treated as path separators.  Paths that do not start with
    ``configs`` or that have fewer than three components map to ``"misc"``.
    """
    segments = config_rel.replace("\\", "/").split("/")
    under_configs = segments[0] == "configs" and len(segments) >= 3
    return segments[1] if under_configs else "misc"
|
|
|
|
def _config_tag(config_rel):
    """Return the config file name with its directory and last extension removed."""
    file_name = os.path.basename(config_rel)
    stem, _extension = os.path.splitext(file_name)
    return stem
|
|
|
|
def _default_save_path(config_rel, timestamp):
    """Build the default ``save_path``: ``exp/<group>/<config tag>_<timestamp>``.

    The group and tag are both derived from ``config_rel`` via
    ``_config_group`` and ``_config_tag``.
    """
    group = _config_group(config_rel)
    tag = _config_tag(config_rel)
    return "exp/{}/{}_{}".format(group, tag, timestamp)
|
|
|
|
def _compose_options_list(save_path, extra_options):
    """Return the ``--options`` tokens for tools/train.py as a list.

    The list always starts with ``save_path=<save_path>``.  When
    ``extra_options`` is non-empty it is split with shell-like rules
    (``shlex.split``) and the resulting tokens are appended in order.
    """
    extra_tokens = shlex.split(extra_options) if extra_options else []
    return ["save_path={}".format(save_path)] + extra_tokens
|
|
|
|
def _compose_options_text(options):
    """Render option tokens as one shell-safe string for the bash command.

    Each token is passed through ``shlex.quote`` before joining.  The tokens
    come from ``shlex.split``, which has already consumed the user's quoting,
    so joining them with bare spaces would let the shell re-split or
    re-interpret any value containing whitespace or metacharacters.  Ordinary
    ``key=value`` tokens contain only safe characters and are emitted
    unchanged.

    Args:
        options: Iterable of option strings, e.g. ``["save_path=exp/x"]``.

    Returns:
        A single space-separated string; empty when ``options`` is empty.
    """
    return " ".join(shlex.quote(option) for option in options)
|
|
|
|
def _int_env(name, default):
    """Read environment variable ``name`` as an int.

    Falls back to ``int(default)`` when the variable is unset or when its
    value cannot be converted (``TypeError`` / ``ValueError``).
    """
    try:
        parsed = int(os.environ.get(name, str(default)))
    except (TypeError, ValueError):
        parsed = int(default)
    return parsed
|
|
|
|
def _render_haozhe_launcher(
    pointcept_root,
    py_exe,
    num_gpus,
    cuda_visible,
    config_rel,
    save_path,
    extra_options,
    torch_home,
):
    """Return the source text of a standalone Haozhe-side training launcher.

    The arguments are embedded (via ``repr``) as the fallback values the
    generated script uses when the matching environment variable
    (``MLP_POINTCEPT_ROOT``, ``MLP_PYTHON``, ``MLP_NUM_GPUS``, ...) is unset
    at launch time.  ``num_gpus`` is embedded as a string and converted back
    with ``int`` inside the generated script.
    """
    # Doubled braces are literal braces in the generated script; single-brace
    # fields are filled in below.
    template = """
import os
import shlex
import subprocess

def run():
    root = os.environ.get("MLP_POINTCEPT_ROOT", {root!r})
    py_exe = os.environ.get("MLP_PYTHON", {py!r})
    num_gpus = int(os.environ.get("MLP_NUM_GPUS", {ngpu!r}))
    cuda_visible = os.environ.get("MLP_CUDA_VISIBLE_DEVICES", {cuda!r})
    config_rel = os.environ.get("MLP_CONFIG", {cfg!r})
    save_path = os.environ.get("MLP_SAVE_PATH", {save_path!r})
    extra_options = os.environ.get("MLP_OPTIONS", {extra!r}).strip()
    torch_home = os.environ.get("TORCH_HOME", {torch_home!r})
    options = ["save_path={{}}".format(save_path)]
    if extra_options:
        options.extend(shlex.split(extra_options))
    env = os.environ.copy()
    env["PYTHONPATH"] = "{{}}:{{}}/libs/pointops/build/lib.linux-x86_64-cpython-310:{{}}".format(
        root,
        root,
        env.get("PYTHONPATH", ""),
    )
    env["TORCH_HOME"] = torch_home
    env["CUDA_HOME"] = os.environ.get("CUDA_HOME", "/usr/local/cuda-12.1")
    env["CUDA_VISIBLE_DEVICES"] = cuda_visible
    cmd = [
        py_exe,
        "-u",
        "tools/train.py",
        "--num-gpus",
        str(num_gpus),
        "--dist-url",
        "auto",
        "--config-file",
        config_rel,
        "--options",
    ]
    cmd.extend(options)
    os.chdir(root)
    subprocess.run(cmd, env=env, check=True)

if __name__ == "__main__":
    run()
"""
    substitutions = {
        "root": pointcept_root,
        "py": py_exe,
        "ngpu": str(num_gpus),
        "cuda": cuda_visible,
        "cfg": config_rel,
        "save_path": save_path,
        "extra": extra_options,
        "torch_home": torch_home,
    }
    # Strip the newline that follows the opening quotes so the script starts
    # directly with its first import.
    return template.lstrip().format(**substitutions)
|
|
|
|
def _write_launcher_if_requested(launcher_text):
    """Write ``launcher_text`` to the path in ``MLP_HAOZHE_LAUNCHER_OUT``.

    Returns the path that was written, or ``None`` when the variable is unset
    or blank.  Missing parent directories are created.  The file is written
    as UTF-8 with LF line endings.
    """
    target = os.environ.get("MLP_HAOZHE_LAUNCHER_OUT", "").strip()
    if target:
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(target, "w", encoding="utf-8", newline="\n") as handle:
            handle.write(launcher_text)
        return target
    return None
|
|
|
|
if __name__ == "__main__":
    # Paths rendered into the job command are interpreted on the compute side,
    # so they are joined with POSIX rules regardless of where this launcher is
    # started.
    import posixpath

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # ---- Training entry point and its arguments (compute-side values) ----
    pointcept_root = os.environ.get(
        "MLP_POINTCEPT_ROOT",
        "/map-vepfs/haozhe/PAMI_superpoint/pointcept_framework",
    )
    py_exe = os.environ.get(
        "MLP_PYTHON",
        "/map-vepfs/haozhe/PAMI_superpoint/superpoint/bin/python",
    )
    num_gpus = int(os.environ.get("MLP_NUM_GPUS", "1"))
    num_machines = _int_env("MLP_NUM_MACHINES", 1)
    # One worker replica per machine unless explicitly overridden.
    role_replicas = _int_env("MLP_ROLE_REPLICAS", num_machines)
    cuda_visible = os.environ.get(
        "MLP_CUDA_VISIBLE_DEVICES",
        _default_cuda_visible(num_gpus),
    )
    config_rel = os.environ.get(
        "MLP_CONFIG",
        "configs/s3dis/semseg-pt-v3m1-0-rpe_sp_after_pool_reboot.py",
    )
    config_tag = _config_tag(config_rel)
    save_path = os.environ.get(
        "MLP_SAVE_PATH",
        _default_save_path(config_rel, timestamp),
    )
    extra_options = os.environ.get("MLP_OPTIONS", "").strip()
    train_options = _compose_options_list(save_path, extra_options)
    train_options_text = _compose_options_text(train_options)
    log_dir = os.environ.get("MLP_LOG_DIR", posixpath.join(pointcept_root, "logs"))
    log_file_eval = posixpath.join(log_dir, "volc_job_{}.log".format(timestamp))

    torch_home = os.environ.get(
        "TORCH_HOME",
        "/map-vepfs/haozhe/PAMI_superpoint/.torch_cache",
    )

    # ---- Platform-side job settings ----
    card_type = os.environ.get("MLP_CARD_TYPE", "ml.pni2l.7xlarge")
    jobname = os.environ.get("MLP_JOB_NAME", "PAMI-{}-{}".format(config_tag, timestamp))
    resource_queue_id = os.environ.get(
        "MLP_RESOURCE_QUEUE_ID", "q-20241024095431-86z6j"
    )
    vepfs_id = os.environ.get("MLP_VEPFS_ID", "vepfs-cnbj26c39866e9ec1")
    vepfs_host_path = "/mnt/{}".format(vepfs_id)
    image_url = os.environ.get(
        "MLP_IMAGE",
        "vemlp-cn-beijing2.cr.volces.com/preset-images/ray:2.12.0-cuda-121-py310",
    )
    # Names of the variables the job command reads at run time to find the
    # master address/port and this machine's rank.
    master_addr_env = os.environ.get("MLP_MASTER_ADDR_ENV", "MLP_WORKER_0_HOST")
    master_port_env = os.environ.get("MLP_MASTER_PORT_ENV", "MLP_WORKER_0_PORT")
    machine_rank_env = os.environ.get("MLP_MACHINE_RANK_ENV", "MLP_ROLE_INDEX")
    default_master_port = os.environ.get("MLP_MASTER_PORT", "29500")

    # In the template, "${{...}}" renders as a literal shell "${...}" expansion,
    # while single-brace fields are filled in by str.format below.  The log
    # directory is created before the pipeline starts so tee can open its file.
    bash_command = """
set -euo pipefail
export PYTHONPATH={root}:{root}/libs/pointops/build/lib.linux-x86_64-cpython-310:${{PYTHONPATH:-}}
export TORCH_HOME=${{TORCH_HOME:-{torch_home}}}
export CUDA_HOME=/usr/local/cuda-12.1
export MLP_NUM_MACHINES=${{MLP_NUM_MACHINES:-{num_machines}}}
export MLP_ROLE_REPLICAS=${{MLP_ROLE_REPLICAS:-{role_replicas}}}
export MLP_MACHINE_RANK=${{{machine_rank_env}:-0}}
export MLP_MASTER_ADDR=${{{master_addr_env}:-127.0.0.1}}
export MLP_MASTER_PORT=${{{master_port_env}:-{default_master_port}}}
cd {root}
export CUDA_VISIBLE_DEVICES={cuda}
mkdir -p "{log_dir}"
echo "[volc-ddp] host=$(hostname) rank=${{MLP_MACHINE_RANK}} master=${{MLP_MASTER_ADDR}}:${{MLP_MASTER_PORT}} num_machines=${{MLP_NUM_MACHINES}} num_gpus={ngpu} cuda=${{CUDA_VISIBLE_DEVICES}}"
"{py}" -u tools/train.py --num-gpus {ngpu} --num-machines ${{MLP_NUM_MACHINES}} --machine-rank ${{MLP_MACHINE_RANK}} --dist-url tcp://${{MLP_MASTER_ADDR}}:${{MLP_MASTER_PORT}} \\
    --config-file {cfg} \\
    --options {options} \\
    2>&1 | tee "{log}"
""".format(
        root=pointcept_root,
        torch_home=torch_home,
        py=py_exe,
        num_machines=num_machines,
        role_replicas=role_replicas,
        machine_rank_env=machine_rank_env,
        master_addr_env=master_addr_env,
        master_port_env=master_port_env,
        default_master_port=default_master_port,
        cuda=cuda_visible,
        ngpu=num_gpus,
        cfg=config_rel,
        options=train_options_text,
        log_dir=log_dir,
        log=log_file_eval,
    )

    # ---- Preview (always printed, even for a real submit) ----
    print("========== ML Platform Job Preview ==========")
    print("ACTUALLY_SUBMIT:", ACTUALLY_SUBMIT)
    print("jobname:", jobname)
    print("card_type:", card_type)
    print("resource_queue_id:", resource_queue_id)
    print("vepfs_id:", vepfs_id, "-> /map-vepfs")
    print("image:", image_url)
    print("MLP_POINTCEPT_ROOT:", pointcept_root)
    print("MLP_PYTHON:", py_exe)
    print("MLP_NUM_GPUS / CUDA_VISIBLE_DEVICES:", num_gpus, "/", cuda_visible)
    print("MLP_NUM_MACHINES / MLP_ROLE_REPLICAS:", num_machines, "/", role_replicas)
    print("MLP_CONFIG:", config_rel)
    print("MLP_SAVE_PATH:", save_path)
    print("MLP_OPTIONS:", extra_options or "(none)")
    print("MLP_RENDER_HAOZHE:", RENDER_HAOZHE)
    print("log_file_eval:", log_file_eval)
    print("----- bash_command -----")
    print(bash_command)
    print("------------------------")

    if RENDER_HAOZHE:
        launcher_text = _render_haozhe_launcher(
            pointcept_root,
            py_exe,
            num_gpus,
            cuda_visible,
            config_rel,
            save_path,
            extra_options,
            torch_home,
        )
        print("----- haozhe_launcher -----")
        print(launcher_text)
        print("---------------------------")
        launcher_out = _write_launcher_if_requested(launcher_text)
        if launcher_out:
            print("haozhe_launcher_out:", launcher_out)

    if not ACTUALLY_SUBMIT:
        print(
            "\nPreview only. Real create_job requires: "
            "export VOLC_AK=... VOLC_SK=... && MLP_SUBMIT=1 python3 run.py"
        )
        sys.exit(0)

    # ---- Real submit ----
    if not os.environ.get("VOLC_AK") or not os.environ.get("VOLC_SK"):
        sys.exit("MLP_SUBMIT=1 requires VOLC_AK and VOLC_SK")

    # The SDK is imported lazily so that preview mode works without it.
    try:
        import volcenginesdkcore
        import volcenginesdkmlplatform20240701
        from volcenginesdkcore.rest import ApiException
    except ImportError as e:
        sys.exit(
            "Please install the Volcano ML Platform SDK first: "
            "pip install volcenginesdkcore ... Error: {}".format(e)
        )

    configuration = volcenginesdkcore.Configuration()
    configuration.ak = os.environ["VOLC_AK"]
    configuration.sk = os.environ["VOLC_SK"]
    configuration.region = "cn-beijing2"

    volcenginesdkcore.Configuration.set_default(configuration)
    api_instance = volcenginesdkmlplatform20240701.MLPLATFORM20240701Api()

    def create_job():
        """Build the CreateJob request from the settings above and submit it.

        Returns the job id reported by the platform.  On ``ApiException`` the
        error is printed and the script exits with status 1, so wrapping
        shells and CI can detect a failed submission.
        """
        mlp = volcenginesdkmlplatform20240701  # short alias for the request models

        req_resource = mlp.ResourceForCreateJobInput(
            instance_type_id=card_type,
            type="Preset",
            zone_id="cn-beijing2-a",
        )
        req_roles = mlp.RoleForCreateJobInput(
            name="worker",
            replicas=role_replicas,
            resource=req_resource,
        )
        req_resource_config = mlp.ResourceConfigForCreateJobInput(
            priority=2,
            resource_queue_id=resource_queue_id,
            roles=[req_roles],
        )
        req_image = mlp.ImageForCreateJobInput(
            type="Prebuild",
            url=image_url,
        )
        req_runtime_config = mlp.RuntimeConfigForCreateJobInput(
            command=bash_command,
            framework="PyTorch",
            image=req_image,
        )
        req_vepfs = mlp.VepfsForCreateJobInput(
            host_path=vepfs_host_path,
            id=vepfs_id,
        )
        req_config = mlp.ConfigForCreateJobInput(
            vepfs=req_vepfs,
        )
        req_storages = mlp.StorageForCreateJobInput(
            config=req_config,
            mount_path="/map-vepfs",
            type="Vepfs",
        )
        req_storage_config = mlp.StorageConfigForCreateJobInput(
            storages=[req_storages],
        )
        create_job_request = mlp.CreateJobRequest(
            dry_run=False,
            name=jobname,
            resource_config=req_resource_config,
            runtime_config=req_runtime_config,
            storage_config=req_storage_config,
        )

        try:
            response = api_instance.create_job(create_job_request)
        except ApiException as e:
            print("Submit failed:", e)
            # A failed submission must not look like success to the caller.
            sys.exit(1)

        job_id = response.id
        print("Job submitted successfully")
        print(" job_id:", job_id)
        print(" job_name:", jobname)
        return job_id

    create_job()
|
|