Spaces:

build-small-hackathon
/

puck

Running

App Files Files Community

puck / server /brain_modal.py

vu1n

Puck — desktop fairy familiar (HF Build Small)

3c124f3 17 days ago

Raw

History Blame Contribute Delete

2.02 kB

	# Puck's cloud brain: Holotron-12B on Modal via vLLM, OpenAI-compatible.
	# Same contract as the local Ollama brain — point PUCK_BRAIN_URL at the
	# deployed URL (and set PUCK_BRAIN_MODEL=Hcompany/Holotron-12B, matching MODEL
	# below) and the daemon can't tell the difference.
	#
	# Usage:
	#   modal token new              # once
	#   modal deploy brain_modal.py
	#   PUCK_BRAIN_URL=https://<you>--puck-brain-serve.modal.run/v1 \
	#   PUCK_BRAIN_MODEL=Hcompany/Holotron-12B uv run app.py

	import modal

	# Checkpoint handed to `vllm serve`, and the port shared by the vLLM process
	# and the Modal web endpoint.
	MODEL = "Hcompany/Holotron-12B"  # Nemotron-derived CUA VLM; full-precision on the GPU
	PORT = 8000

	# Container image. The CUDA *devel* base is chosen because it ships nvcc:
	# Holotron is a Nemotron-H hybrid, and vLLM's flashinfer JIT-compiles kernels
	# at runtime — without nvcc the engine core dies.
	image = modal.Image.from_registry(
	    "nvidia/cuda:12.8.1-devel-ubuntu22.04",
	    add_python="3.12",
	).entrypoint(
	    []  # drop the base image's nvidia entrypoint
	).pip_install(
	    "vllm>=0.11",
	    "huggingface_hub[hf_transfer]",
	).env(
	    {
	        "HF_HUB_ENABLE_HF_TRANSFER": "1",
	        "CUDA_HOME": "/usr/local/cuda",
	    }
	)

	# Named volume (created on first use) so model weights persist between
	# cold starts instead of being downloaded again.
	hf_cache = modal.Volume.from_name("puck-hf-cache", create_if_missing=True)

	# Modal app under which `serve` is registered.
	app = modal.App("puck-brain")


	@app.function(
	    image=image,
	    gpu="L40S",
	    # Always hot through the deadline — no cold start.
	    # Set to 0 + redeploy when done.
	    min_containers=1,
	    timeout=600,
	    # Mount the weights volume where the Hugging Face cache lives.
	    volumes={"/root/.cache/huggingface": hf_cache},
	)
	@modal.concurrent(max_inputs=16)
	@modal.web_server(port=PORT, startup_timeout=600)
	def serve():
	    """Spawn the vLLM OpenAI-compatible server for MODEL on PORT.

	    The server is started as a child process and this function returns
	    without waiting for it; the `web_server` decorator above is configured
	    with the same PORT.
	    """
	    import subprocess

	    launch_cmd = ["vllm", "serve", MODEL, "--port", str(PORT)]
	    launch_cmd += ["--max-model-len", "12288"]
	    # Holotron is a multimodal hybrid → its custom modeling code needs
	    # trust-remote-code.
	    launch_cmd += ["--trust-remote-code"]
	    # Cap images-per-prompt so the screenshot path is bounded.
	    # vLLM wants JSON here, not image=1.
	    launch_cmd += ["--limit-mm-per-prompt", '{"image": 1}']

	    subprocess.Popen(launch_cmd)