Spaces:
Running
Running
# Puck's cloud brain: Holotron-12B on Modal via vLLM, OpenAI-compatible.
# Same contract as the local Ollama brain — point PUCK_BRAIN_URL at the
# deployed URL (+ set PUCK_BRAIN_MODEL=Hcompany/Holotron-12B, matching MODEL
# below) and the daemon can't tell the difference.
#
#   modal token new   # once
#   modal deploy brain_modal.py
#   PUCK_BRAIN_URL=https://<you>--puck-brain-serve.modal.run/v1 \
#   PUCK_BRAIN_MODEL=Hcompany/Holotron-12B uv run app.py
import modal

# Hugging Face repo id handed to `vllm serve`: a Nemotron-derived CUA VLM,
# run at full precision on the GPU.
MODEL = "Hcompany/Holotron-12B"

# Port passed to `vllm serve` via --port.
PORT = 8000

# Container image, built up one layer at a time.
#
# The base is the CUDA *devel* image because it ships nvcc: Holotron is a
# Nemotron-H hybrid, and vLLM's flashinfer JIT-compiles kernels at runtime —
# without nvcc the engine core dies.
image = modal.Image.from_registry(
    "nvidia/cuda:12.8.1-devel-ubuntu22.04",
    add_python="3.12",
)
# Drop the base image's nvidia entrypoint.
image = image.entrypoint([])
image = image.pip_install("vllm>=0.11", "huggingface_hub[hf_transfer]")
image = image.env(
    {
        "HF_HUB_ENABLE_HF_TRANSFER": "1",
        "CUDA_HOME": "/usr/local/cuda",
    }
)

# Named volume meant to persist model weights between cold starts.
# NOTE(review): nothing visible in this view mounts it — presumably it is
# attached through the serving function's Modal decorator; confirm.
hf_cache = modal.Volume.from_name("puck-hf-cache", create_if_missing=True)

app = modal.App("puck-brain")
def serve():
    """Start `vllm serve` for MODEL on PORT as a child process.

    The process is spawned with ``subprocess.Popen`` and not waited on, so
    this function returns as soon as the server has been launched.

    NOTE(review): no Modal decorators are visible on this function in this
    view — presumably it is registered with ``app`` (image, GPU, volume,
    web-server port) by decorators that are not shown; confirm.
    """
    import subprocess

    # Holotron is a multimodal hybrid, so its custom modeling code needs
    # --trust-remote-code; images-per-prompt is capped so the screenshot
    # path stays bounded.
    image_limit = '{"image": 1}'  # vLLM wants JSON here, not image=1
    command = ["vllm", "serve", MODEL]
    command += ["--port", str(PORT)]
    command += ["--max-model-len", "12288"]
    command += ["--trust-remote-code"]
    command += ["--limit-mm-per-prompt", image_limit]

    subprocess.Popen(command)