puck / server /brain_modal.py
vu1n's picture
Puck — desktop fairy familiar (HF Build Small)
3c124f3
Raw
History Blame Contribute Delete
2.02 kB
# Puck's cloud brain: Holotron-12B on Modal via vLLM, OpenAI-compatible.
# Same contract as the local Ollama brain — point PUCK_BRAIN_URL at the
# deployed URL (+ set PUCK_BRAIN_MODEL=Hcompany/Holotron-12B, matching MODEL
# below) and the daemon can't tell the difference.
#
#   modal token new   # once
#   modal deploy brain_modal.py
#   PUCK_BRAIN_URL=https://<you>--puck-brain-serve.modal.run/v1 \
#   PUCK_BRAIN_MODEL=Hcompany/Holotron-12B uv run app.py
import modal
# Hugging Face repo id handed to `vllm serve`.
# Nemotron-derived CUA VLM; full-precision on the GPU.
MODEL = "Hcompany/Holotron-12B"

# Port the vLLM server listens on inside the container.
PORT = 8000

# CUDA *devel* base (ships nvcc): Holotron is a Nemotron-H hybrid, and vLLM's
# flashinfer JIT-compiles kernels at runtime — without nvcc the engine core dies.
_cuda_base = modal.Image.from_registry(
    "nvidia/cuda:12.8.1-devel-ubuntu22.04",
    add_python="3.12",
)
image = (
    _cuda_base
    # Drop the base image's nvidia entrypoint.
    .entrypoint([])
    .pip_install(
        "vllm>=0.11",
        "huggingface_hub[hf_transfer]",
    )
    .env(
        {
            "HF_HUB_ENABLE_HF_TRANSFER": "1",
            "CUDA_HOME": "/usr/local/cuda",
        }
    )
)

# Named volume that persists downloaded model weights between cold starts.
hf_cache = modal.Volume.from_name("puck-hf-cache", create_if_missing=True)

app = modal.App("puck-brain")
@app.function(
    image=image,
    gpu="L40S",
    # Always hot through the deadline — no cold start.
    # Set to 0 + redeploy when done.
    min_containers=1,
    timeout=600,
    volumes={"/root/.cache/huggingface": hf_cache},
)
@modal.concurrent(max_inputs=16)
@modal.web_server(port=PORT, startup_timeout=600)
def serve():
    """Launch `vllm serve` for MODEL on PORT as a background child process.

    The process is started with `subprocess.Popen` and is not waited on, so
    this function returns as soon as the child has been spawned. The
    Hugging Face cache directory is mounted from the `hf_cache` volume by the
    `@app.function` decorator above.
    """
    import subprocess

    # Base invocation: which model to serve and where to listen.
    launch_argv = ["vllm", "serve", MODEL]
    launch_argv += ["--port", str(PORT)]

    # Context window cap passed to the engine.
    launch_argv += ["--max-model-len", "12288"]

    # Holotron is a multimodal hybrid → trust-remote-code for the custom
    # modeling, and cap images-per-prompt so the screenshot path is bounded.
    launch_argv.append("--trust-remote-code")
    # vLLM wants JSON here, not image=1.
    launch_argv += ["--limit-mm-per-prompt", '{"image": 1}']

    subprocess.Popen(launch_argv)