from __future__ import print_function
"""
Volcano ML Platform launcher for Pointcept jobs on the Haozhe Vepfs tree.
The default target is the current S3DIS superpoint reboot follow-up setup.

Usage
-----
  Preview only:
    python3 run.py

  Preview a 4-GPU superpoint reboot launch:
    MLP_NUM_GPUS=4 MLP_CUDA_VISIBLE_DEVICES=0,1,2,3 \\
    MLP_CONFIG=configs/s3dis/reboot_override.py python3 run.py

  Render a Haozhe-local launcher without poplab paths:
    MLP_RENDER_HAOZHE=1 \\
    MLP_HAOZHE_LAUNCHER_OUT=outputs/run_training_haozhe.py \\
    MLP_CONFIG=configs/s3dis/reboot_override.py python3 run.py

  Real submit (requires volcenginesdk* plus VOLC_AK / VOLC_SK):
    export VOLC_AK=... VOLC_SK=...
    MLP_SUBMIT=1 python3 run.py

Common environment variables
----------------------------
  MLP_POINTCEPT_ROOT   Compute-side Pointcept root. Defaults to the Haozhe path.
  MLP_PYTHON           Compute-side Python executable.
  MLP_NUM_GPUS         Passed to tools/train.py --num-gpus.
  MLP_CUDA_VISIBLE_DEVICES
                       Defaults to 0,1,...,N-1 based on MLP_NUM_GPUS.
  MLP_CONFIG           Config path relative to MLP_POINTCEPT_ROOT.
                       Default: configs/s3dis/semseg-pt-v3m1-0-rpe_sp_after_pool_reboot.py
  MLP_SAVE_PATH        save_path passed through --options.
                       Default: exp/<dataset>/<config>_<timestamp>
  MLP_OPTIONS          Extra tools/train.py --options, for example:
                       "resume=True weight=exp/.../model/model_last.pth"
  MLP_LOG_DIR          Writable log directory on the compute side.
  MLP_CARD_TYPE        Instance type id.
  MLP_RESOURCE_QUEUE_ID Resource queue id.
  MLP_VEPFS_ID         Vepfs mount id.
  MLP_IMAGE            Container image url.
  MLP_RENDER_HAOZHE    When set to 1, also print a Haozhe-local launcher.
  MLP_HAOZHE_LAUNCHER_OUT
                       Optional output path for the rendered launcher.

Notes
-----
  - This is the canonical poplab -> compute launcher.
  - Keep PAMI2026 on poplab; do not mirror it onto Haozhe2.
  - The compute-side code root stays /map-vepfs/haozhe/PAMI_superpoint/pointcept_framework.
  - The rendered Haozhe launcher contains only Haozhe-side paths.
"""
import datetime
import os
import shlex
import sys

# Keep any caller-provided no_proxy; otherwise exempt the Volcengine domains.
# NOTE(review): presumably so SDK calls bypass a configured proxy - confirm.
if "no_proxy" not in os.environ:
    os.environ["no_proxy"] = "volces.com,volcengineapi.com"

# Both switches are opt-in: only the exact string "1" enables them.
ACTUALLY_SUBMIT = os.getenv("MLP_SUBMIT", "0") == "1"
RENDER_HAOZHE = os.getenv("MLP_RENDER_HAOZHE", "0") == "1"


def _default_cuda_visible(num_gpus):
    """Return the device list "0,1,...,N-1" for ``num_gpus`` GPUs.

    At least one device is always listed, so any count below 1 yields "0".
    """
    device_count = num_gpus if num_gpus > 1 else 1
    return ",".join(map(str, range(device_count)))


def _config_group(config_rel):
    """Return the directory directly under ``configs/`` for a config path.

    Backslashes are treated as path separators. Paths that do not look like
    ``configs/<group>/<file>`` (at least three segments, first one
    ``configs``) are filed under ``"misc"``.
    """
    segments = config_rel.replace("\\", "/").split("/")
    is_grouped = len(segments) >= 3 and segments[0] == "configs"
    return segments[1] if is_grouped else "misc"


def _config_tag(config_rel):
    """Return the config file name without its directory or final extension.

    Backslashes are normalised to forward slashes first, matching
    ``_config_group``. Without this, a Windows-style path on a POSIX host
    would keep its directory part in the tag, and therefore in the default
    save path and job name.
    """
    normalized = config_rel.replace("\\", "/")
    return os.path.splitext(os.path.basename(normalized))[0]


def _default_save_path(config_rel, timestamp):
    """Build the default save path ``exp/<group>/<tag>_<timestamp>``.

    ``<group>`` and ``<tag>`` are derived from ``config_rel`` by
    ``_config_group`` and ``_config_tag``.
    """
    group = _config_group(config_rel)
    tag = _config_tag(config_rel)
    return "exp/{group}/{tag}_{stamp}".format(group=group, tag=tag, stamp=timestamp)


def _compose_options_list(save_path, extra_options):
    """Return the ``--options`` tokens: ``save_path=...`` first, then extras.

    ``extra_options`` is a shell-style string split with ``shlex.split``;
    a falsy value (empty string or None) adds nothing.
    """
    head = "save_path={}".format(save_path)
    if not extra_options:
        return [head]
    return [head] + shlex.split(extra_options)


def _compose_options_text(options):
    """Join option tokens into one shell-safe string.

    The tokens come from ``shlex.split`` and the result is pasted into a bash
    script. Each token is therefore re-quoted with ``shlex.quote`` so the
    shell reproduces exactly the same argv. A plain ``" ".join`` would let
    values containing spaces or shell metacharacters be split or expanded
    again. Tokens made only of shell-safe characters are emitted unchanged.
    """
    return " ".join(shlex.quote(option) for option in options)


def _int_env(name, default):
    """Read environment variable ``name`` as an int.

    Falls back to ``int(default)`` when the variable is unset or its value
    cannot be parsed as an integer.
    """
    try:
        raw = os.environ.get(name)
        value = int(str(default) if raw is None else raw)
    except (TypeError, ValueError):
        value = int(default)
    return value


def _render_haozhe_launcher(
    pointcept_root,
    py_exe,
    num_gpus,
    cuda_visible,
    config_rel,
    save_path,
    extra_options,
    torch_home,
):
    """Return the source text of a standalone single-node launcher script.

    Every argument is embedded (via ``repr``) as the fallback of the matching
    environment lookup in the generated script, so the script can still be
    re-targeted with MLP_POINTCEPT_ROOT, MLP_PYTHON, MLP_NUM_GPUS,
    MLP_CUDA_VISIBLE_DEVICES, MLP_CONFIG, MLP_SAVE_PATH, MLP_OPTIONS and
    TORCH_HOME. The generated ``run()`` prepares PYTHONPATH, TORCH_HOME,
    CUDA_HOME and CUDA_VISIBLE_DEVICES, changes into the Pointcept root and
    runs ``tools/train.py`` through ``subprocess.run(..., check=True)``.
    """
    substitutions = {
        "root": pointcept_root,
        "py": py_exe,
        # Rendered as a quoted string; the generated script int()s it again.
        "ngpu": str(num_gpus),
        "cuda": cuda_visible,
        "cfg": config_rel,
        "save_path": save_path,
        "extra": extra_options,
        "torch_home": torch_home,
    }
    # Doubled braces are literal braces in the generated script.
    template = """
import os
import shlex
import subprocess

def run():
    root = os.environ.get("MLP_POINTCEPT_ROOT", {root!r})
    py_exe = os.environ.get("MLP_PYTHON", {py!r})
    num_gpus = int(os.environ.get("MLP_NUM_GPUS", {ngpu!r}))
    cuda_visible = os.environ.get("MLP_CUDA_VISIBLE_DEVICES", {cuda!r})
    config_rel = os.environ.get("MLP_CONFIG", {cfg!r})
    save_path = os.environ.get("MLP_SAVE_PATH", {save_path!r})
    extra_options = os.environ.get("MLP_OPTIONS", {extra!r}).strip()
    torch_home = os.environ.get("TORCH_HOME", {torch_home!r})
    options = ["save_path={{}}".format(save_path)]
    if extra_options:
        options.extend(shlex.split(extra_options))
    env = os.environ.copy()
    env["PYTHONPATH"] = "{{}}:{{}}/libs/pointops/build/lib.linux-x86_64-cpython-310:{{}}".format(
        root,
        root,
        env.get("PYTHONPATH", ""),
    )
    env["TORCH_HOME"] = torch_home
    env["CUDA_HOME"] = os.environ.get("CUDA_HOME", "/usr/local/cuda-12.1")
    env["CUDA_VISIBLE_DEVICES"] = cuda_visible
    cmd = [
        py_exe,
        "-u",
        "tools/train.py",
        "--num-gpus",
        str(num_gpus),
        "--dist-url",
        "auto",
        "--config-file",
        config_rel,
        "--options",
    ]
    cmd.extend(options)
    os.chdir(root)
    subprocess.run(cmd, env=env, check=True)

if __name__ == "__main__":
    run()
"""
    # Drop the leading newline of the literal before filling in the fields.
    return template.lstrip().format(**substitutions)


def _write_launcher_if_requested(launcher_text):
    """Write ``launcher_text`` to the path named by MLP_HAOZHE_LAUNCHER_OUT.

    Returns the (whitespace-stripped) path that was written, or None when the
    variable is unset or blank. Missing parent directories are created, and
    the file is written as UTF-8 with "\\n" line endings.
    """
    target = os.environ.get("MLP_HAOZHE_LAUNCHER_OUT", "").strip()
    if target:
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(target, "w", encoding="utf-8", newline="\n") as handle:
            handle.write(launcher_text)
        return target
    return None


if __name__ == "__main__":
    # Script flow: resolve settings from the environment, print a preview of
    # the job, optionally render a local launcher, and only with MLP_SUBMIT=1
    # call the platform's create_job API.

    # One timestamp is shared by the default save path, job name and log file.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # ----- compute-side paths and launch shape (all overridable via env) -----
    pointcept_root = os.environ.get(
        "MLP_POINTCEPT_ROOT",
        "/map-vepfs/haozhe/PAMI_superpoint/pointcept_framework",
    )
    py_exe = os.environ.get(
        "MLP_PYTHON",
        "/map-vepfs/haozhe/PAMI_superpoint/superpoint/bin/python",
    )
    # Note: unlike the _int_env lookups below, a malformed MLP_NUM_GPUS raises
    # ValueError here instead of falling back to the default.
    num_gpus = int(os.environ.get("MLP_NUM_GPUS", "1"))
    num_machines = _int_env("MLP_NUM_MACHINES", 1)
    role_replicas = _int_env("MLP_ROLE_REPLICAS", num_machines)
    cuda_visible = os.environ.get(
        "MLP_CUDA_VISIBLE_DEVICES",
        _default_cuda_visible(num_gpus),
    )
    config_rel = os.environ.get(
        "MLP_CONFIG",
        "configs/s3dis/semseg-pt-v3m1-0-rpe_sp_after_pool_reboot.py",
    )
    config_tag = _config_tag(config_rel)
    save_path = os.environ.get(
        "MLP_SAVE_PATH",
        _default_save_path(config_rel, timestamp),
    )
    extra_options = os.environ.get("MLP_OPTIONS", "").strip()
    train_options = _compose_options_list(save_path, extra_options)
    train_options_text = _compose_options_text(train_options)
    log_dir = os.environ.get("MLP_LOG_DIR", os.path.join(pointcept_root, "logs"))
    log_file_eval = os.path.join(log_dir, "volc_job_{}.log".format(timestamp))

    torch_home = os.environ.get(
        "TORCH_HOME",
        "/map-vepfs/haozhe/PAMI_superpoint/.torch_cache",
    )

    # ----- platform-side job parameters -----
    card_type = os.environ.get("MLP_CARD_TYPE", "ml.pni2l.7xlarge")
    jobname = os.environ.get("MLP_JOB_NAME", "PAMI-{}-{}".format(config_tag, timestamp))
    resource_queue_id = os.environ.get(
        "MLP_RESOURCE_QUEUE_ID", "q-20241024095431-86z6j"
    )
    vepfs_id = os.environ.get("MLP_VEPFS_ID", "vepfs-cnbj26c39866e9ec1")
    vepfs_host_path = "/mnt/{}".format(vepfs_id)
    image_url = os.environ.get(
        "MLP_IMAGE",
        "vemlp-cn-beijing2.cr.volces.com/preset-images/ray:2.12.0-cuda-121-py310",
    )
    # These hold the *names* of the compute-side variables that the generated
    # bash script reads for master address, master port and machine rank.
    master_addr_env = os.environ.get("MLP_MASTER_ADDR_ENV", "MLP_WORKER_0_HOST")
    master_port_env = os.environ.get("MLP_MASTER_PORT_ENV", "MLP_WORKER_0_PORT")
    machine_rank_env = os.environ.get("MLP_MACHINE_RANK_ENV", "MLP_ROLE_INDEX")
    default_master_port = os.environ.get("MLP_MASTER_PORT", "29500")

    # Doubled braces below are literal bash ${...} expansions; single braces
    # are str.format fields. The mkdir line makes sure the log directory
    # exists: under `set -o pipefail` a tee that cannot open its file would
    # otherwise fail the whole pipeline even when training itself succeeded.
    bash_command = """
set -euo pipefail
export PYTHONPATH={root}:{root}/libs/pointops/build/lib.linux-x86_64-cpython-310:${{PYTHONPATH:-}}
export TORCH_HOME=${{TORCH_HOME:-{torch_home}}}
export CUDA_HOME=/usr/local/cuda-12.1
export MLP_NUM_MACHINES=${{MLP_NUM_MACHINES:-{num_machines}}}
export MLP_ROLE_REPLICAS=${{MLP_ROLE_REPLICAS:-{role_replicas}}}
export MLP_MACHINE_RANK=${{{machine_rank_env}:-0}}
export MLP_MASTER_ADDR=${{{master_addr_env}:-127.0.0.1}}
export MLP_MASTER_PORT=${{{master_port_env}:-{default_master_port}}}
cd {root}
export CUDA_VISIBLE_DEVICES={cuda}
mkdir -p "$(dirname {log})"
echo "[volc-ddp] host=$(hostname) rank=${{MLP_MACHINE_RANK}} master=${{MLP_MASTER_ADDR}}:${{MLP_MASTER_PORT}} num_machines=${{MLP_NUM_MACHINES}} num_gpus={ngpu} cuda=${{CUDA_VISIBLE_DEVICES}}"
"{py}" -u tools/train.py --num-gpus {ngpu} --num-machines ${{MLP_NUM_MACHINES}} --machine-rank ${{MLP_MACHINE_RANK}} --dist-url tcp://${{MLP_MASTER_ADDR}}:${{MLP_MASTER_PORT}} \\
  --config-file {cfg} \\
  --options {options} \\
  2>&1 | tee {log}
""".format(
        root=pointcept_root,
        torch_home=torch_home,
        py=py_exe,
        num_machines=num_machines,
        role_replicas=role_replicas,
        machine_rank_env=machine_rank_env,
        master_addr_env=master_addr_env,
        master_port_env=master_port_env,
        default_master_port=default_master_port,
        cuda=cuda_visible,
        ngpu=num_gpus,
        cfg=config_rel,
        options=train_options_text,
        log=log_file_eval,
    )

    # ----- preview (always printed, also before a real submit) -----
    print("========== ML Platform Job Preview ==========")
    print("ACTUALLY_SUBMIT:", ACTUALLY_SUBMIT)
    print("jobname:", jobname)
    print("card_type:", card_type)
    print("resource_queue_id:", resource_queue_id)
    print("vepfs_id:", vepfs_id, "-> /map-vepfs")
    print("image:", image_url)
    print("MLP_POINTCEPT_ROOT:", pointcept_root)
    print("MLP_PYTHON:", py_exe)
    print("MLP_NUM_GPUS / CUDA_VISIBLE_DEVICES:", num_gpus, "/", cuda_visible)
    print("MLP_NUM_MACHINES / MLP_ROLE_REPLICAS:", num_machines, "/", role_replicas)
    print("MLP_CONFIG:", config_rel)
    print("MLP_SAVE_PATH:", save_path)
    print("MLP_OPTIONS:", extra_options or "(none)")
    print("MLP_RENDER_HAOZHE:", RENDER_HAOZHE)
    print("log_file_eval:", log_file_eval)
    print("----- bash_command -----")
    print(bash_command)
    print("------------------------")

    # ----- optional: render (and maybe write) the local launcher -----
    if RENDER_HAOZHE:
        launcher_text = _render_haozhe_launcher(
            pointcept_root,
            py_exe,
            num_gpus,
            cuda_visible,
            config_rel,
            save_path,
            extra_options,
            torch_home,
        )
        print("----- haozhe_launcher -----")
        print(launcher_text)
        print("---------------------------")
        launcher_out = _write_launcher_if_requested(launcher_text)
        if launcher_out:
            print("haozhe_launcher_out:", launcher_out)

    if not ACTUALLY_SUBMIT:
        print(
            "\nPreview only. Real create_job requires: "
            "export VOLC_AK=... VOLC_SK=... && MLP_SUBMIT=1 python3 run.py"
        )
        sys.exit(0)

    # ----- real submit -----
    if not os.environ.get("VOLC_AK") or not os.environ.get("VOLC_SK"):
        sys.exit("MLP_SUBMIT=1 requires VOLC_AK and VOLC_SK")

    # The SDK is imported lazily so preview mode works without it installed.
    try:
        import volcenginesdkcore
        import volcenginesdkmlplatform20240701
        from volcenginesdkcore.rest import ApiException
    except ImportError as e:
        sys.exit(
            "Please install the Volcano ML Platform SDK first: "
            "pip install volcenginesdkcore ... Error: {}".format(e)
        )

    configuration = volcenginesdkcore.Configuration()
    configuration.ak = os.environ["VOLC_AK"]
    configuration.sk = os.environ["VOLC_SK"]
    configuration.region = "cn-beijing2"

    volcenginesdkcore.Configuration.set_default(configuration)
    api_instance = volcenginesdkmlplatform20240701.MLPLATFORM20240701Api()

    def create_job():
        """Build the CreateJob request from the resolved settings and submit it.

        Returns the id of the created job. If the API call raises
        ApiException, the script exits with a non-zero status and the error
        message on stderr, so a rejected submit is not reported as success.
        """
        req_resource = volcenginesdkmlplatform20240701.ResourceForCreateJobInput(
            instance_type_id=card_type,
            type="Preset",
            zone_id="cn-beijing2-a",
        )
        req_roles = volcenginesdkmlplatform20240701.RoleForCreateJobInput(
            name="worker",
            replicas=role_replicas,
            resource=req_resource,
        )
        req_resource_config = volcenginesdkmlplatform20240701.ResourceConfigForCreateJobInput(
            priority=2,
            resource_queue_id=resource_queue_id,
            roles=[req_roles],
        )
        req_image = volcenginesdkmlplatform20240701.ImageForCreateJobInput(
            type="Prebuild",
            url=image_url,
        )
        req_runtime_config = volcenginesdkmlplatform20240701.RuntimeConfigForCreateJobInput(
            command=bash_command,
            framework="PyTorch",
            image=req_image,
        )
        # The Vepfs volume (host path /mnt/<vepfs_id>) is mounted at /map-vepfs.
        req_vepfs = volcenginesdkmlplatform20240701.VepfsForCreateJobInput(
            host_path=vepfs_host_path,
            id=vepfs_id,
        )
        req_config = volcenginesdkmlplatform20240701.ConfigForCreateJobInput(
            vepfs=req_vepfs,
        )
        req_storages = volcenginesdkmlplatform20240701.StorageForCreateJobInput(
            config=req_config,
            mount_path="/map-vepfs",
            type="Vepfs",
        )
        req_storage_config = volcenginesdkmlplatform20240701.StorageConfigForCreateJobInput(
            storages=[req_storages],
        )
        create_job_request = volcenginesdkmlplatform20240701.CreateJobRequest(
            dry_run=False,
            name=jobname,
            resource_config=req_resource_config,
            runtime_config=req_runtime_config,
            storage_config=req_storage_config,
        )

        try:
            response = api_instance.create_job(create_job_request)
        except ApiException as e:
            # sys.exit with a string prints it to stderr and exits with status 1.
            sys.exit("Submit failed: {}".format(e))

        job_id = response.id
        print("Job submitted successfully")
        print("  job_id:", job_id)
        print("  job_name:", jobname)
        return job_id

    create_job()