Spaces:

build-small-hackathon
/

secret-student

Running

App Files Files Community

secret-student / scripts /deploy_llamacpp_modal.py

asanwari

Add routed Modal multi-model deployment

efee902 4 days ago

Raw

History Blame Contribute Delete

13 kB

	from __future__ import annotations

	from contextlib import asynccontextmanager
	import os
	from pathlib import Path
	import shlex
	import shutil
	import subprocess
	import time
	from urllib.error import URLError
	from urllib.request import Request as UrlRequest
	from urllib.request import urlopen

	import modal
	from fastapi import FastAPI, HTTPException, Request
	from fastapi.responses import JSONResponse, StreamingResponse
	import httpx
	from starlette.background import BackgroundTask

	try:
	from scripts.llamacpp_modal_config import (
	DeploymentConfig,
	ModalAppConfig,
	ModelConfig,
	load_deployment_config,
	)
	except ModuleNotFoundError:
	from llamacpp_modal_config import ( # type: ignore[no-redef]
	DeploymentConfig,
	ModalAppConfig,
	ModelConfig,
	load_deployment_config,
	)


	MINUTES = 60
	DEFAULT_CONFIG_PATH = Path("config/modal-models.yaml")
	SERVER_BIN = os.getenv("MODAL_LLAMA_CPP_SERVER_BIN", "/app/llama-server")


	def deployment_from_environment() -> DeploymentConfig:
	configured_path = os.getenv("MODAL_LLAMA_CPP_CONFIG", "").strip()
	if configured_path:
	return load_deployment_config(configured_path)
	if DEFAULT_CONFIG_PATH.is_file():
	return load_deployment_config(DEFAULT_CONFIG_PATH)

	app_config = ModalAppConfig(
	name=os.getenv("MODAL_LLAMA_CPP_APP_NAME", ModalAppConfig.name),
	gpu=os.getenv("MODAL_LLAMA_CPP_GPU", ModalAppConfig.gpu),
	image=os.getenv("MODAL_LLAMA_CPP_IMAGE", ModalAppConfig.image),
	secret_name=os.getenv(
	"MODAL_LLAMA_CPP_SECRET_NAME", ModalAppConfig.secret_name
	),
	timeout_minutes=int(
	os.getenv("MODAL_LLAMA_CPP_TIMEOUT_MINUTES", "15")
	),
	scaledown_minutes=int(
	os.getenv("MODAL_LLAMA_CPP_SCALEDOWN_MINUTES", "15")
	),
	startup_timeout_seconds=int(
	os.getenv("MODAL_LLAMA_CPP_STARTUP_TIMEOUT", "900")
	),
	max_concurrent_inputs=int(
	os.getenv("MODAL_LLAMA_CPP_MAX_CONCURRENT_INPUTS", "12")
	),
	)
	model = ModelConfig(
	route="",
	role="text",
	model_ref=os.getenv(
	"MODAL_LLAMA_CPP_MODEL_REF", "openbmb/MiniCPM-V-4_5-gguf:Q4_K_M"
	),
	port=8001,
	ctx_size=int(os.getenv("MODAL_LLAMA_CPP_CTX_SIZE", "8192")),
	gpu_layers=int(os.getenv("MODAL_LLAMA_CPP_GPU_LAYERS", "999")),
	threads=int(os.getenv("MODAL_LLAMA_CPP_THREADS", "8")),
	parallel=int(os.getenv("MODAL_LLAMA_CPP_PARALLEL", "2")),
	extra_args=tuple(
	shlex.split(os.getenv("MODAL_LLAMA_CPP_EXTRA_ARGS", ""))
	),
	)
	return DeploymentConfig(
	app=app_config,
	models=(model,),
	source="legacy environment variables",
	legacy=True,
	)


	DEPLOYMENT = deployment_from_environment()
	APP_CONFIG = DEPLOYMENT.app
	RUNTIME_CONFIG_PATH = "/root/config/modal-models.yaml"

	app = modal.App(APP_CONFIG.name)
	hf_cache = modal.Volume.from_name("secret-student-hf-cache", create_if_missing=True)
	llama_cache = modal.Volume.from_name(
	"secret-student-llamacpp-cache", create_if_missing=True
	)
	image = (
	modal.Image.from_registry(APP_CONFIG.image, add_python="3.12")
	.entrypoint([])
	.pip_install(
	"fastapi>=0.136.3",
	"httpx>=0.28.1",
	"uvicorn[standard]>=0.49.0",
	)
	.add_local_python_source("scripts.llamacpp_modal_config", copy=True)
	.env(
	{
	"HF_XET_HIGH_PERFORMANCE": "1",
	"LLAMA_CACHE": "/root/.cache/llama.cpp",
	}
	)
	)
	if not DEPLOYMENT.legacy:
	image = image.add_local_file(
	DEPLOYMENT.source,
	RUNTIME_CONFIG_PATH,
	copy=True,
	).env({"MODAL_LLAMA_CPP_CONFIG": RUNTIME_CONFIG_PATH})


	def resolve_llama_server_bin() -> str:
	candidates = [
	SERVER_BIN,
	shutil.which("llama-server") or "",
	"/app/llama-server",
	"/usr/local/bin/llama-server",
	]
	for candidate in candidates:
	if candidate and os.path.isfile(candidate) and os.access(candidate, os.X_OK):
	return candidate
	raise FileNotFoundError(f"Could not find llama-server. Tried: {candidates}")


	def build_server_command(
	model: ModelConfig, server_bin: str, api_key: str
	) -> list[str]:
	return [
	server_bin,
	"--host",
	"127.0.0.1",
	"--port",
	str(model.port),
	"-hf",
	model.model_ref,
	"--ctx-size",
	str(model.ctx_size),
	"--n-gpu-layers",
	str(model.gpu_layers),
	"--threads",
	str(model.threads),
	"--parallel",
	str(model.parallel),
	"--api-key",
	api_key,
	*model.extra_args,
	]


	def start_model_servers(api_key: str) -> dict[str, subprocess.Popen]:
	server_bin = resolve_llama_server_bin()
	processes: dict[str, subprocess.Popen] = {}
	try:
	for model in DEPLOYMENT.models:
	command = build_server_command(model, server_bin, api_key)
	label = model.route or "root"
	printable = ["<redacted>" if value == api_key else value for value in command]
	print(f"Starting llama.cpp model {label}: {' '.join(printable)}")
	processes[model.route] = subprocess.Popen(
	command, start_new_session=True
	)
	wait_for_model_servers(processes, api_key)
	return processes
	except BaseException:
	stop_model_servers(processes)
	raise


	def wait_for_model_servers(
	processes: dict[str, subprocess.Popen], api_key: str
	) -> None:
	pending = {model.route: model for model in DEPLOYMENT.models}
	deadline = time.monotonic() + APP_CONFIG.startup_timeout_seconds
	headers = {"Authorization": f"Bearer {api_key}"}
	while pending and time.monotonic() < deadline:
	for route, model in tuple(pending.items()):
	process = processes[route]
	return_code = process.poll()
	if return_code is not None:
	raise RuntimeError(
	f"llama-server for {route or 'root'} exited during startup "
	f"with code {return_code}."
	)
	try:
	health_url = f"http://127.0.0.1:{model.port}/health"
	with urlopen(UrlRequest(health_url, headers=headers), timeout=2) as response:
	if response.status < 500:
	print(f"llama.cpp model {route or 'root'} is ready on port {model.port}")
	pending.pop(route)
	except (OSError, URLError):
	pass
	if pending:
	time.sleep(2)
	if pending:
	routes = ", ".join(route or "root" for route in pending)
	raise TimeoutError(
	f"llama-server models did not become ready within "
	f"{APP_CONFIG.startup_timeout_seconds}s: {routes}"
	)


	def stop_model_servers(processes: dict[str, subprocess.Popen]) -> None:
	for process in processes.values():
	if process.poll() is None:
	process.terminate()
	deadline = time.monotonic() + 15
	for process in processes.values():
	if process.poll() is not None:
	continue
	try:
	process.wait(timeout=max(0.1, deadline - time.monotonic()))
	except subprocess.TimeoutExpired:
	process.kill()


	def create_proxy_app(api_key: str):
	processes: dict[str, subprocess.Popen] = {}
	client: httpx.AsyncClient \| None = None
	models_by_route = {model.route: model for model in DEPLOYMENT.models}

	@asynccontextmanager
	async def lifespan(_app: FastAPI):
	nonlocal processes, client
	processes = start_model_servers(api_key)
	client = httpx.AsyncClient(timeout=None)
	try:
	yield
	finally:
	if client is not None:
	await client.aclose()
	stop_model_servers(processes)

	proxy = FastAPI(title="Secret Student llama.cpp router", lifespan=lifespan)

	@proxy.get("/health")
	async def aggregate_health():
	statuses = {
	(route or "root"): {
	"model_ref": models_by_route[route].model_ref,
	"port": models_by_route[route].port,
	"running": process.poll() is None,
	"return_code": process.poll(),
	}
	for route, process in processes.items()
	}
	ready = len(statuses) == len(models_by_route) and all(
	status["running"] for status in statuses.values()
	)
	return JSONResponse(
	status_code=200 if ready else 503,
	content={"ready": ready, "models": statuses},
	)

	async def forward(request: Request, route: str, path: str):
	if client is None:
	raise HTTPException(status_code=503, detail="Model router is starting.")
	model = models_by_route.get(route)
	if model is None:
	raise HTTPException(status_code=404, detail=f"Unknown model route: {route}")
	process = processes.get(route)
	if process is None or process.poll() is not None:
	raise HTTPException(
	status_code=503, detail=f"Model route {route or 'root'} is unavailable."
	)

	target = f"http://127.0.0.1:{model.port}/{path}"
	if request.url.query:
	target = f"{target}?{request.url.query}"
	excluded_request_headers = {"host", "content-length", "connection"}
	headers = {
	key: value
	for key, value in request.headers.items()
	if key.lower() not in excluded_request_headers
	}
	upstream_request = client.build_request(
	request.method,
	target,
	headers=headers,
	content=await request.body(),
	)
	upstream = await client.send(upstream_request, stream=True)
	excluded_response_headers = {
	"content-length",
	"connection",
	"keep-alive",
	"proxy-authenticate",
	"proxy-authorization",
	"te",
	"trailers",
	"transfer-encoding",
	"upgrade",
	}
	response_headers = {
	key: value
	for key, value in upstream.headers.items()
	if key.lower() not in excluded_response_headers
	}
	return StreamingResponse(
	upstream.aiter_raw(),
	status_code=upstream.status_code,
	headers=response_headers,
	background=BackgroundTask(upstream.aclose),
	)

	if DEPLOYMENT.legacy:

	@proxy.api_route(
	"/{path:path}", methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"]
	)
	async def forward_legacy(request: Request, path: str):
	return await forward(request, "", path)

	else:

	@proxy.api_route(
	"/{route}", methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"]
	)
	async def forward_route_root(request: Request, route: str):
	return await forward(request, route, "")

	@proxy.api_route(
	"/{route}/{path:path}",
	methods=["GET", "POST", "PUT", "PATCH", "DELETE", "OPTIONS"],
	)
	async def forward_route(request: Request, route: str, path: str):
	return await forward(request, route, path)

	return proxy


	@app.function(
	image=image,
	gpu=APP_CONFIG.gpu,
	timeout=APP_CONFIG.timeout_minutes * MINUTES,
	scaledown_window=APP_CONFIG.scaledown_minutes * MINUTES,
	secrets=[modal.Secret.from_name(APP_CONFIG.secret_name)],
	volumes={
	"/root/.cache/huggingface": hf_cache,
	"/root/.cache/llama.cpp": llama_cache,
	},
	)
	@modal.concurrent(max_inputs=APP_CONFIG.max_concurrent_inputs)
	@modal.asgi_app()
	def serve():
	api_key = (
	os.getenv("LLAMA_ARG_API_KEY", "").strip()
	or os.getenv("LLM_API_KEY", "").strip()
	)
	if not api_key:
	raise RuntimeError(
	f"Modal secret {APP_CONFIG.secret_name!r} must contain "
	"LLAMA_ARG_API_KEY or LLM_API_KEY."
	)
	return create_proxy_app(api_key)


	@app.local_entrypoint()
	async def main() -> None:
	url = (await serve.get_web_url.aio()).rstrip("/")
	print(f"Modal config={DEPLOYMENT.source}")
	for model in DEPLOYMENT.models:
	base_url = url if DEPLOYMENT.legacy else f"{url}/{model.route}"
	if model.role == "text":
	print(f"LLM_BASE_URL={base_url}")
	print(f"LLM_MODEL={model.model_ref}")
	elif model.role == "vision":
	print(f"VISION_LLM_BASE_URL={base_url}")
	print(f"VISION_LLM_MODEL={model.model_ref}")
	else:
	variable = f"{model.route.upper().replace('-', '_')}_BASE_URL"
	print(f"{variable}={base_url}")
	print(f"MODEL_ROUTE {model.route or '/'} -> {model.model_ref}")
	print(f"Modal GPU={APP_CONFIG.gpu}; llama.cpp image={APP_CONFIG.image}")