# Puck's cloud brain: Holotron-12B on Modal via vLLM, OpenAI-compatible.
# Same contract as the local Ollama brain — point PUCK_BRAIN_URL at the
# deployed URL (+ set PUCK_BRAIN_MODEL=Hcompany/Holotron-12B, which must match
# MODEL below) and the daemon can't tell the difference.
#
#   modal token new            # once
#   modal deploy brain_modal.py
#   PUCK_BRAIN_URL=https://<you>--puck-brain-serve.modal.run/v1 \
#   PUCK_BRAIN_MODEL=Hcompany/Holotron-12B uv run app.py

import modal

# Hugging Face repo id handed to `vllm serve` in serve(). No served-model-name
# override is passed there, so clients presumably have to request this exact
# name — verify against the client's PUCK_BRAIN_MODEL setting.
MODEL = "Hcompany/Holotron-12B"  # Nemotron-derived CUA VLM; full-precision on the GPU
# Port vLLM is told to listen on; the same constant is passed to
# modal.web_server so the exposed port and the server port cannot drift apart.
PORT = 8000

# Container image for the inference server.
# CUDA *devel* base (ships nvcc): Holotron is a Nemotron-H hybrid, and vLLM's
# flashinfer JIT-compiles kernels at runtime — without nvcc the engine core dies.
image = (
    modal.Image.from_registry("nvidia/cuda:12.8.1-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])  # drop the base image's nvidia entrypoint
    .pip_install("vllm>=0.11", "huggingface_hub[hf_transfer]")  # server + hf_transfer extra
    # HF_HUB_ENABLE_HF_TRANSFER switches weight downloads to the hf_transfer
    # backend installed above; CUDA_HOME points at the toolkit root, presumably
    # so the runtime JIT can locate nvcc.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "CUDA_HOME": "/usr/local/cuda"})
)

# persist model weights between cold starts: a named Modal volume (created on
# first use) that serve() mounts over the Hugging Face cache directory.
hf_cache = modal.Volume.from_name("puck-hf-cache", create_if_missing=True)

# Modal app that the serve() function is registered on.
app = modal.App("puck-brain")


@app.function(
    image=image,
    gpu="L40S",
    min_containers=1,  # always hot through the deadline — no cold start. Set to 0 + redeploy when done.
    timeout=600,
    volumes={"/root/.cache/huggingface": hf_cache},
)
@modal.concurrent(max_inputs=16)
@modal.web_server(port=PORT, startup_timeout=600)
def serve():
    """Start the vLLM OpenAI-compatible server for MODEL on PORT.

    The server is spawned as a child process and this function returns
    without waiting on it; the process handle is intentionally not kept.
    The web_server decorator above is configured with the same PORT and a
    600-second startup allowance.
    """
    import subprocess

    # Base invocation: `vllm serve <repo id>`.
    vllm_cmd = ["vllm", "serve", MODEL]
    # Listen on the port that the web_server decorator was given.
    vllm_cmd += ["--port", str(PORT)]
    # Cap the context window at 12288 tokens.
    vllm_cmd += ["--max-model-len", "12288"]
    # Holotron is a multimodal hybrid, so its custom modeling code has to be trusted.
    vllm_cmd += ["--trust-remote-code"]
    # Bound the screenshot path to one image per prompt.
    # vLLM wants JSON for this flag, not the image=1 form.
    vllm_cmd += ["--limit-mm-per-prompt", '{"image": 1}']

    # Fire and forget: the server keeps running after this function returns.
    subprocess.Popen(vllm_cmd)