# Puck's cloud brain: Holotron-12B on Modal via vLLM, OpenAI-compatible.
# Same contract as the local Ollama brain — point PUCK_BRAIN_URL at the
# deployed URL (+ set PUCK_BRAIN_MODEL=Hcompany/Holotron-12B) and the daemon
# can't tell the difference.
#
# PUCK_BRAIN_MODEL must match MODEL below: no --served-model-name is passed,
# so vLLM registers the model under the name given to `vllm serve`.
#
#   modal token new                 # once
#   modal deploy brain_modal.py
#   PUCK_BRAIN_URL=https://<workspace>--puck-brain-serve.modal.run/v1 \
#   PUCK_BRAIN_MODEL=Hcompany/Holotron-12B uv run app.py

import modal

MODEL = "Hcompany/Holotron-12B"  # Nemotron-derived CUA VLM; full-precision on the GPU
PORT = 8000  # port vLLM listens on inside the container; exposed by @modal.web_server

# CUDA *devel* base (ships nvcc): Holotron is a Nemotron-H hybrid, and vLLM's
# flashinfer JIT-compiles kernels at runtime — without nvcc the engine core dies.
image = (
    modal.Image.from_registry("nvidia/cuda:12.8.1-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])  # drop the base image's nvidia entrypoint
    .pip_install("vllm>=0.11", "huggingface_hub[hf_transfer]")
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "CUDA_HOME": "/usr/local/cuda"})
)

# persist model weights between cold starts
hf_cache = modal.Volume.from_name("puck-hf-cache", create_if_missing=True)

app = modal.App("puck-brain")


@app.function(
    image=image,
    gpu="L40S",
    min_containers=1,  # always hot through the deadline — no cold start. Set to 0 + redeploy when done.
    timeout=600,
    volumes={"/root/.cache/huggingface": hf_cache},
)
@modal.concurrent(max_inputs=16)
@modal.web_server(port=PORT, startup_timeout=600)
def serve() -> None:
    """Start vLLM's OpenAI-compatible server for MODEL on PORT.

    The server is launched with ``subprocess.Popen`` and deliberately not
    waited on: this function returns immediately while the vLLM process keeps
    running in the container, and ``@modal.web_server`` routes traffic to PORT
    once something is listening there (within ``startup_timeout`` seconds).
    """
    import subprocess

    # Holotron is a multimodal hybrid → trust-remote-code for the custom modeling,
    # and cap images-per-prompt so the screenshot path is bounded.
    vllm_cmd = [
        "vllm",
        "serve",
        MODEL,
        # Modal's web_server needs the process listening on all interfaces,
        # not just loopback — be explicit rather than rely on vLLM's default.
        "--host",
        "0.0.0.0",
        "--port",
        str(PORT),
        "--max-model-len",
        "12288",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        '{"image": 1}',  # vLLM wants JSON here, not image=1
    ]
    # Argument list (no shell) so the JSON value is passed through verbatim.
    subprocess.Popen(vllm_cmd)