Spaces:

build-small-hackathon
/

TinyNarrator

Running

TinyNarrator / modal_workers /reader_brain.py

cvpfus

Reduce reader brain Modal memory use

718f2c4 17 days ago

3.04 kB

	"""Modal worker for Tiny Narrator's llama.cpp reader brain.

	Deploy:
	modal deploy modal_workers/reader_brain.py

	The deployed web server exposes llama.cpp's OpenAI-compatible API. Set:
	LLAMA_CPP_BASE_URL=https://<modal-app-url>/v1
	LLAMA_CPP_MODEL=narrator-brain
	LLAMA_CPP_TOKEN=<same value as Modal secret>

	The server intentionally binds to 0.0.0.0 because Modal's web_server proxy
	routes traffic to the container port.
	"""

	import os
	from pathlib import Path
	import subprocess
	import shutil

	import modal


	# Name under which the Modal app is created.
	APP_NAME = "tiny-narrator-reader-brain"
	# Model reference handed to llama-server through its `-hf` option.
	MODEL_REF = "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF:Q4_K_M"
	# Alias handed to llama-server through `--alias`; the module docstring
	# tells callers to use the same value for LLAMA_CPP_MODEL.
	MODEL_ALIAS = "narrator-brain"
	# Port llama-server listens on and that modal.web_server is pointed at.
	SERVER_PORT = 8080
	# Mount point of the cache volume; HF_HOME is placed underneath it.
	CACHE_DIR = "/cache"

	app = modal.App(APP_NAME)
	# Named volume, created on first use, mounted at CACHE_DIR by the server function.
	model_cache = modal.Volume.from_name(
	    "tiny-narrator-reader-brain-cache",
	    create_if_missing=True,
	)


	def _secret_names() -> list[modal.Secret]:
	    """Return the Modal secrets attached to the reader brain server function.

	    A single secret is referenced by name: ``tiny-narrator-reader-brain-token``.
	    NOTE(review): presumably this secret supplies ``LLAMA_CPP_TOKEN`` to the
	    container -- confirm against the Modal secret definition.
	    """
	    token_secret = modal.Secret.from_name("tiny-narrator-reader-brain-token")
	    return [token_secret]


	def _find_llama_server(
	    search_dirs: tuple[str, ...] = ("/app", "/usr/local/bin", "/usr/bin", "/bin"),
	) -> str:
	    """Locate the ``llama-server`` executable inside the container.

	    Lookup order: whatever ``shutil.which`` resolves on ``PATH``, then
	    ``<dir>/llama-server`` for each entry of ``search_dirs`` in order, and
	    finally ``/llama-server`` at the filesystem root.

	    Args:
	        search_dirs: Directories to probe for the binary and to list in the
	            error message. The default reproduces the historical hard-coded
	            locations.

	    Returns:
	        The first candidate path that exists, as a string.

	    Raises:
	        FileNotFoundError: If no candidate exists. The message lists, for
	            every existing search directory, the entries whose name contains
	            ``llama`` so a differently named binary is easy to spot.
	    """
	    candidates = [
	        shutil.which("llama-server"),
	        *(str(Path(directory) / "llama-server") for directory in search_dirs),
	        "/llama-server",
	    ]
	    for candidate in candidates:
	        # shutil.which returns None when nothing is on PATH; skip that entry.
	        if candidate and Path(candidate).exists():
	            return candidate

	    inspected = []
	    for directory in search_dirs:
	        path = Path(directory)
	        if path.exists():
	            # Wildcards on both sides: a bare 'llama' pattern would only match
	            # an entry named exactly "llama" and leave the diagnostic empty.
	            # Sorted so the message is deterministic.
	            names = sorted(item.name for item in path.glob("*llama*"))
	            inspected.append(f"{directory}: {names}")
	    raise FileNotFoundError(f"llama-server binary was not found. Inspected: {'; '.join(inspected)}")


	# Container image for the worker: the llama.cpp CUDA 12 server image from
	# ghcr.io with Python 3.12 added via `add_python`.
	reader_brain_image = (
	    modal.Image.from_registry("ghcr.io/ggml-org/llama.cpp:server-cuda12", add_python="3.12")
	    # Reset the image's ENTRYPOINT to an empty list.
	    # NOTE(review): presumably so the image's own startup command does not
	    # run in place of the Modal function -- confirm.
	    .dockerfile_commands("ENTRYPOINT []")
	    # Point HF_HOME at a directory under CACHE_DIR, where the cache volume is mounted.
	    .env({"HF_HOME": f"{CACHE_DIR}/huggingface"})
	)


	@app.function(
	    image=reader_brain_image,
	    gpu="T4",
	    volumes={CACHE_DIR: model_cache},
	    secrets=_secret_names(),
	    timeout=900,
	    scaledown_window=300,
	    max_containers=1,
	)
	@modal.concurrent(max_inputs=20)
	@modal.web_server(SERVER_PORT, startup_timeout=600)
	def reader_brain_server():
	    """Start llama-server in the background on ``SERVER_PORT``.

	    Builds the llama-server command line (model ``MODEL_REF`` served under the
	    alias ``MODEL_ALIAS``, 4096-token context, one parallel slot, reasoning
	    off, 999 GPU layers requested), appends ``--api-key`` when the
	    ``LLAMA_CPP_TOKEN`` environment variable is non-empty, prints the command
	    with the key masked, and spawns the process without waiting for it.

	    Raises:
	        FileNotFoundError: Propagated from ``_find_llama_server`` when the
	            binary cannot be located.
	    """
	    token = os.getenv("LLAMA_CPP_TOKEN", "")

	    server_args = [
	        _find_llama_server(),
	        "--host", "0.0.0.0",
	        "--port", str(SERVER_PORT),
	        "-hf", MODEL_REF,
	        "--alias", MODEL_ALIAS,
	        "--ctx-size", "4096",
	        "--parallel", "1",
	        "--reasoning", "off",
	        "--n-gpu-layers", "999",
	    ]
	    # Parallel copy used only for logging, so the real key never reaches stdout.
	    logged_args = list(server_args)

	    # With an empty or unset token the server is started without --api-key.
	    if token:
	        server_args += ["--api-key", token]
	        logged_args += ["--api-key", "***"]

	    print(f"[tiny-narrator-reader-brain] starting: {' '.join(logged_args)}", flush=True)
	    # Not waited on: this function returns while llama-server keeps running.
	    subprocess.Popen(server_args)