from __future__ import print_function """ Volcano ML Platform launcher for Pointcept jobs on the Haozhe Vepfs tree. The default target is the current S3DIS superpoint reboot follow-up setup. Usage ----- Preview only: python3 run.py Preview a 4-GPU superpoint reboot launch: MLP_NUM_GPUS=4 MLP_CUDA_VISIBLE_DEVICES=0,1,2,3 \\ MLP_CONFIG=configs/s3dis/reboot_override.py python3 run.py Render a Haozhe-local launcher without poplab paths: MLP_RENDER_HAOZHE=1 \\ MLP_HAOZHE_LAUNCHER_OUT=outputs/run_training_haozhe.py \\ MLP_CONFIG=configs/s3dis/reboot_override.py python3 run.py Real submit (requires volcenginesdk* plus VOLC_AK / VOLC_SK): export VOLC_AK=... VOLC_SK=... MLP_SUBMIT=1 python3 run.py Common environment variables ---------------------------- MLP_POINTCEPT_ROOT Compute-side Pointcept root. Defaults to the Haozhe path. MLP_PYTHON Compute-side Python executable. MLP_NUM_GPUS Passed to tools/train.py --num-gpus. MLP_CUDA_VISIBLE_DEVICES Defaults to 0,1,...,N-1 based on MLP_NUM_GPUS. MLP_CONFIG Config path relative to MLP_POINTCEPT_ROOT. Default: configs/s3dis/semseg-pt-v3m1-0-rpe_sp_after_pool_reboot.py MLP_SAVE_PATH save_path passed through --options. Default: exp//_ MLP_OPTIONS Extra tools/train.py --options, for example: "resume=True weight=exp/.../model/model_last.pth" MLP_LOG_DIR Writable log directory on the compute side. MLP_CARD_TYPE Instance type id. MLP_RESOURCE_QUEUE_ID Resource queue id. MLP_VEPFS_ID Vepfs mount id. MLP_IMAGE Container image url. MLP_RENDER_HAOZHE When set to 1, also print a Haozhe-local launcher. MLP_HAOZHE_LAUNCHER_OUT Optional output path for the rendered launcher. Notes ----- - This is the canonical poplab -> compute launcher. - Keep PAMI2026 on poplab; do not mirror it onto Haozhe2. - The compute-side code root stays /map-vepfs/haozhe/PAMI_superpoint/pointcept_framework. - The rendered Haozhe launcher contains only Haozhe-side paths. """ import datetime import os import shlex import sys os.environ.setdefault("no_proxy", "volces.com,volcengineapi.com") ACTUALLY_SUBMIT = os.environ.get("MLP_SUBMIT", "0") == "1" RENDER_HAOZHE = os.environ.get("MLP_RENDER_HAOZHE", "0") == "1" def _default_cuda_visible(num_gpus): return ",".join(str(i) for i in range(max(1, num_gpus))) def _config_group(config_rel): parts = config_rel.replace("\\", "/").split("/") if len(parts) >= 3 and parts[0] == "configs": return parts[1] return "misc" def _config_tag(config_rel): return os.path.splitext(os.path.basename(config_rel))[0] def _default_save_path(config_rel, timestamp): return "exp/{}/{}_{}".format( _config_group(config_rel), _config_tag(config_rel), timestamp, ) def _compose_options_list(save_path, extra_options): options = ["save_path={}".format(save_path)] if extra_options: options.extend(shlex.split(extra_options)) return options def _compose_options_text(options): return " ".join(options) def _int_env(name, default): try: return int(os.environ.get(name, str(default))) except (TypeError, ValueError): return int(default) def _render_haozhe_launcher( pointcept_root, py_exe, num_gpus, cuda_visible, config_rel, save_path, extra_options, torch_home, ): return """ import os import shlex import subprocess def run(): root = os.environ.get("MLP_POINTCEPT_ROOT", {root!r}) py_exe = os.environ.get("MLP_PYTHON", {py!r}) num_gpus = int(os.environ.get("MLP_NUM_GPUS", {ngpu!r})) cuda_visible = os.environ.get("MLP_CUDA_VISIBLE_DEVICES", {cuda!r}) config_rel = os.environ.get("MLP_CONFIG", {cfg!r}) save_path = os.environ.get("MLP_SAVE_PATH", {save_path!r}) extra_options = os.environ.get("MLP_OPTIONS", {extra!r}).strip() torch_home = os.environ.get("TORCH_HOME", {torch_home!r}) options = ["save_path={{}}".format(save_path)] if extra_options: options.extend(shlex.split(extra_options)) env = os.environ.copy() env["PYTHONPATH"] = "{{}}:{{}}/libs/pointops/build/lib.linux-x86_64-cpython-310:{{}}".format( root, root, env.get("PYTHONPATH", ""), ) env["TORCH_HOME"] = torch_home env["CUDA_HOME"] = os.environ.get("CUDA_HOME", "/usr/local/cuda-12.1") env["CUDA_VISIBLE_DEVICES"] = cuda_visible cmd = [ py_exe, "-u", "tools/train.py", "--num-gpus", str(num_gpus), "--dist-url", "auto", "--config-file", config_rel, "--options", ] cmd.extend(options) os.chdir(root) subprocess.run(cmd, env=env, check=True) if __name__ == "__main__": run() """.lstrip().format( root=pointcept_root, py=py_exe, ngpu=str(num_gpus), cuda=cuda_visible, cfg=config_rel, save_path=save_path, extra=extra_options, torch_home=torch_home, ) def _write_launcher_if_requested(launcher_text): out_path = os.environ.get("MLP_HAOZHE_LAUNCHER_OUT", "").strip() if not out_path: return None out_dir = os.path.dirname(out_path) if out_dir: os.makedirs(out_dir, exist_ok=True) with open(out_path, "w", encoding="utf-8", newline="\n") as f: f.write(launcher_text) return out_path if __name__ == "__main__": timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") pointcept_root = os.environ.get( "MLP_POINTCEPT_ROOT", "/map-vepfs/haozhe/PAMI_superpoint/pointcept_framework", ) py_exe = os.environ.get( "MLP_PYTHON", "/map-vepfs/haozhe/PAMI_superpoint/superpoint/bin/python", ) num_gpus = int(os.environ.get("MLP_NUM_GPUS", "1")) num_machines = _int_env("MLP_NUM_MACHINES", 1) role_replicas = _int_env("MLP_ROLE_REPLICAS", num_machines) cuda_visible = os.environ.get( "MLP_CUDA_VISIBLE_DEVICES", _default_cuda_visible(num_gpus), ) config_rel = os.environ.get( "MLP_CONFIG", "configs/s3dis/semseg-pt-v3m1-0-rpe_sp_after_pool_reboot.py", ) config_tag = _config_tag(config_rel) save_path = os.environ.get( "MLP_SAVE_PATH", _default_save_path(config_rel, timestamp), ) extra_options = os.environ.get("MLP_OPTIONS", "").strip() train_options = _compose_options_list(save_path, extra_options) train_options_text = _compose_options_text(train_options) log_dir = os.environ.get("MLP_LOG_DIR", os.path.join(pointcept_root, "logs")) log_file_eval = os.path.join(log_dir, "volc_job_{}.log".format(timestamp)) torch_home = os.environ.get( "TORCH_HOME", "/map-vepfs/haozhe/PAMI_superpoint/.torch_cache", ) card_type = os.environ.get("MLP_CARD_TYPE", "ml.pni2l.7xlarge") jobname = os.environ.get("MLP_JOB_NAME", "PAMI-{}-{}".format(config_tag, timestamp)) resource_queue_id = os.environ.get( "MLP_RESOURCE_QUEUE_ID", "q-20241024095431-86z6j" ) vepfs_id = os.environ.get("MLP_VEPFS_ID", "vepfs-cnbj26c39866e9ec1") vepfs_host_path = "/mnt/{}".format(vepfs_id) image_url = os.environ.get( "MLP_IMAGE", "vemlp-cn-beijing2.cr.volces.com/preset-images/ray:2.12.0-cuda-121-py310", ) master_addr_env = os.environ.get("MLP_MASTER_ADDR_ENV", "MLP_WORKER_0_HOST") master_port_env = os.environ.get("MLP_MASTER_PORT_ENV", "MLP_WORKER_0_PORT") machine_rank_env = os.environ.get("MLP_MACHINE_RANK_ENV", "MLP_ROLE_INDEX") default_master_port = os.environ.get("MLP_MASTER_PORT", "29500") bash_command = """ set -euo pipefail export PYTHONPATH={root}:{root}/libs/pointops/build/lib.linux-x86_64-cpython-310:${{PYTHONPATH:-}} export TORCH_HOME=${{TORCH_HOME:-{torch_home}}} export CUDA_HOME=/usr/local/cuda-12.1 export MLP_NUM_MACHINES=${{MLP_NUM_MACHINES:-{num_machines}}} export MLP_ROLE_REPLICAS=${{MLP_ROLE_REPLICAS:-{role_replicas}}} export MLP_MACHINE_RANK=${{{machine_rank_env}:-0}} export MLP_MASTER_ADDR=${{{master_addr_env}:-127.0.0.1}} export MLP_MASTER_PORT=${{{master_port_env}:-{default_master_port}}} cd {root} export CUDA_VISIBLE_DEVICES={cuda} echo "[volc-ddp] host=$(hostname) rank=${{MLP_MACHINE_RANK}} master=${{MLP_MASTER_ADDR}}:${{MLP_MASTER_PORT}} num_machines=${{MLP_NUM_MACHINES}} num_gpus={ngpu} cuda=${{CUDA_VISIBLE_DEVICES}}" "{py}" -u tools/train.py --num-gpus {ngpu} --num-machines ${{MLP_NUM_MACHINES}} --machine-rank ${{MLP_MACHINE_RANK}} --dist-url tcp://${{MLP_MASTER_ADDR}}:${{MLP_MASTER_PORT}} \\ --config-file {cfg} \\ --options {options} \\ 2>&1 | tee {log} """.format( root=pointcept_root, torch_home=torch_home, py=py_exe, num_machines=num_machines, role_replicas=role_replicas, machine_rank_env=machine_rank_env, master_addr_env=master_addr_env, master_port_env=master_port_env, default_master_port=default_master_port, cuda=cuda_visible, ngpu=num_gpus, cfg=config_rel, options=train_options_text, log=log_file_eval, ) print("========== ML Platform Job Preview ==========") print("ACTUALLY_SUBMIT:", ACTUALLY_SUBMIT) print("jobname:", jobname) print("card_type:", card_type) print("resource_queue_id:", resource_queue_id) print("vepfs_id:", vepfs_id, "-> /map-vepfs") print("image:", image_url) print("MLP_POINTCEPT_ROOT:", pointcept_root) print("MLP_PYTHON:", py_exe) print("MLP_NUM_GPUS / CUDA_VISIBLE_DEVICES:", num_gpus, "/", cuda_visible) print("MLP_NUM_MACHINES / MLP_ROLE_REPLICAS:", num_machines, "/", role_replicas) print("MLP_CONFIG:", config_rel) print("MLP_SAVE_PATH:", save_path) print("MLP_OPTIONS:", extra_options or "(none)") print("MLP_RENDER_HAOZHE:", RENDER_HAOZHE) print("log_file_eval:", log_file_eval) print("----- bash_command -----") print(bash_command) print("------------------------") if RENDER_HAOZHE: launcher_text = _render_haozhe_launcher( pointcept_root, py_exe, num_gpus, cuda_visible, config_rel, save_path, extra_options, torch_home, ) print("----- haozhe_launcher -----") print(launcher_text) print("---------------------------") launcher_out = _write_launcher_if_requested(launcher_text) if launcher_out: print("haozhe_launcher_out:", launcher_out) if not ACTUALLY_SUBMIT: print( "\nPreview only. Real create_job requires: " "export VOLC_AK=... VOLC_SK=... && MLP_SUBMIT=1 python3 run.py" ) sys.exit(0) if not os.environ.get("VOLC_AK") or not os.environ.get("VOLC_SK"): sys.exit("MLP_SUBMIT=1 requires VOLC_AK and VOLC_SK") try: import volcenginesdkcore import volcenginesdkmlplatform20240701 from volcenginesdkcore.rest import ApiException except ImportError as e: sys.exit( "Please install the Volcano ML Platform SDK first: " "pip install volcenginesdkcore ... Error: {}".format(e) ) configuration = volcenginesdkcore.Configuration() configuration.ak = os.environ["VOLC_AK"] configuration.sk = os.environ["VOLC_SK"] configuration.region = "cn-beijing2" volcenginesdkcore.Configuration.set_default(configuration) api_instance = volcenginesdkmlplatform20240701.MLPLATFORM20240701Api() def create_job(): req_resource = volcenginesdkmlplatform20240701.ResourceForCreateJobInput( instance_type_id=card_type, type="Preset", zone_id="cn-beijing2-a", ) req_roles = volcenginesdkmlplatform20240701.RoleForCreateJobInput( name="worker", replicas=role_replicas, resource=req_resource, ) req_resource_config = volcenginesdkmlplatform20240701.ResourceConfigForCreateJobInput( priority=2, resource_queue_id=resource_queue_id, roles=[req_roles], ) req_image = volcenginesdkmlplatform20240701.ImageForCreateJobInput( type="Prebuild", url=image_url, ) req_runtime_config = volcenginesdkmlplatform20240701.RuntimeConfigForCreateJobInput( command=bash_command, framework="PyTorch", image=req_image, ) req_vepfs = volcenginesdkmlplatform20240701.VepfsForCreateJobInput( host_path=vepfs_host_path, id=vepfs_id, ) req_config = volcenginesdkmlplatform20240701.ConfigForCreateJobInput( vepfs=req_vepfs, ) req_storages = volcenginesdkmlplatform20240701.StorageForCreateJobInput( config=req_config, mount_path="/map-vepfs", type="Vepfs", ) req_storage_config = volcenginesdkmlplatform20240701.StorageConfigForCreateJobInput( storages=[req_storages], ) create_job_request = volcenginesdkmlplatform20240701.CreateJobRequest( dry_run=False, name=jobname, resource_config=req_resource_config, runtime_config=req_runtime_config, storage_config=req_storage_config, ) try: response = api_instance.create_job(create_job_request) job_id = response.id print("Job submitted successfully") print(" job_id:", job_id) print(" job_name:", jobname) return job_id except ApiException as e: print("Submit failed:", e) return None create_job()