Spaces:

lfoppiano
/

document-qa

Running

document-qa / deployment /modal_inference.py

add modal inference

f1ac57a 7 months ago

1.71 kB

	import os

	import modal

	# Container image used by the inference function below.
	#
	# * Debian slim base with Python 3.10.
	# * flashinfer is pinned to an exact build (the original author notes it is
	#   "very unstable"); its wheels come from the extra index for CUDA 12.4 /
	#   torch 2.5.
	# * HF_HUB_ENABLE_HF_TRANSFER=1 switches on the hf_transfer backend that the
	#   `huggingface_hub[hf_transfer]` extra installs, for faster model transfers.
	#
	# NOTE(review): "vllm" itself is not pinned while the flashinfer wheel index
	# targets torch 2.5 -- confirm the resolved versions stay compatible.
	vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install(
	    "vllm",
	    "huggingface_hub[hf_transfer]==0.26.2",
	    "flashinfer-python==0.2.0.post2",
	    extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
	).env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})

	# NOTE(review): MODELS_DIR is not referenced anywhere in the visible part of
	# this file -- confirm whether it is still needed.
	MODELS_DIR = "/llamas"
	# Hugging Face model id and the exact revision passed to `vllm serve` below.
	MODEL_NAME = "microsoft/Phi-4-mini-instruct"
	MODEL_REVISION = "c0fb9e74abda11b496b7907a9c6c9009a7a0488f"

	# Named Modal volumes (created on first use) that are mounted over the
	# Hugging Face and vLLM cache directories of the serving container.
	hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
	vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)


	# Modal application that owns the `serve` function defined below.
	app = modal.App("phi-4-mini-instruct-qa-vllm")

	# Number of GPUs requested per replica (interpolated into the `gpu=` spec).
	N_GPU = 1
	# Seconds per minute: multiply by this to express a duration in seconds.
	MINUTES = 60 # seconds
	# TCP port vLLM listens on and that Modal's web_server decorator exposes.
	VLLM_PORT = 8000


	@app.function(
	    image=vllm_image,
	    # gpu=f"L40S:{N_GPU}",
	    gpu=f"A10G:{N_GPU}",
	    # how long should we stay up with no requests?
	    scaledown_window=5 * MINUTES,
	    # persist downloaded weights and vLLM caches across container starts
	    volumes={
	        "/root/.cache/huggingface": hf_cache_vol,
	        "/root/.cache/vllm": vllm_cache_vol,
	    },
	    # read below as os.environ["API_KEY"]; presumably supplied by this secret
	    secrets=[modal.Secret.from_name("document-qa-api-key")],
	)
	@modal.concurrent(
	    max_inputs=5
	)  # how many requests can one replica handle? tune carefully!
	@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
	def serve():
	    """Start ``vllm serve`` for MODEL_NAME in the background.

	    The server is launched as a child process bound to 0.0.0.0:VLLM_PORT and
	    this function returns immediately without waiting for it; the
	    ``modal.web_server`` decorator is configured with the same port.

	    Raises:
	        KeyError: if the ``API_KEY`` environment variable is not set.
	    """
	    import subprocess

	    cmd = [
	        "vllm",
	        "serve",
	        "--uvicorn-log-level=info",
	        MODEL_NAME,
	        "--revision",
	        MODEL_REVISION,
	        "--max-model-len",
	        "32768",
	        "--host",
	        "0.0.0.0",
	        "--port",
	        str(VLLM_PORT),
	        "--api-key",
	        os.environ["API_KEY"],
	    ]

	    # Hand the argv list straight to the OS instead of joining it into a
	    # shell string: the secret API key (or any other argument) can then
	    # contain spaces or shell metacharacters without being re-parsed,
	    # truncated, or executed by a shell.
	    subprocess.Popen(cmd)