TinyNarrator / modal_workers /reader_brain.py
cvpfus
Reduce reader brain Modal memory use
718f2c4
Raw
History Blame Contribute Delete
3.04 kB
"""Modal worker for Tiny Narrator's llama.cpp reader brain.
Deploy:
modal deploy modal_workers/reader_brain.py
The deployed web server exposes llama.cpp's OpenAI-compatible API. Set:
LLAMA_CPP_BASE_URL=https://<modal-app-url>/v1
LLAMA_CPP_MODEL=narrator-brain
LLAMA_CPP_TOKEN=<same value as Modal secret>
The server intentionally binds to 0.0.0.0 because Modal's web_server proxy
routes traffic to the container port.
"""
import os
from pathlib import Path
import subprocess
import shutil
import modal
# Name under which the Modal app below is registered.
APP_NAME = "tiny-narrator-reader-brain"
# Model reference handed to llama-server through its -hf flag.
MODEL_REF = "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF:Q4_K_M"
# Value passed to llama-server's --alias flag; the module docstring tells
# clients to use the same string as LLAMA_CPP_MODEL.
MODEL_ALIAS = "narrator-brain"
# Port llama-server is told to listen on and that modal.web_server exposes.
SERVER_PORT = 8080
# Container path where the model_cache volume is mounted; HF_HOME is placed
# underneath it in the image definition.
CACHE_DIR = "/cache"
app = modal.App(APP_NAME)
# Named Modal volume, created on first use (create_if_missing=True), that the
# server function mounts at CACHE_DIR.
model_cache = modal.Volume.from_name("tiny-narrator-reader-brain-cache", create_if_missing=True)
def _secret_names() -> list[modal.Secret]:
    """Return the Modal secrets to attach to the reader-brain function.

    Despite the function's name, the list holds ``modal.Secret`` handles
    rather than plain strings. It currently contains a single entry, looked
    up by the name ``tiny-narrator-reader-brain-token``.

    NOTE(review): presumably this secret is what provides the
    ``LLAMA_CPP_TOKEN`` environment variable read by the server function;
    confirm against the secret's definition in Modal.
    """
    token_secret = modal.Secret.from_name("tiny-narrator-reader-brain-token")
    return [token_secret]
def _find_llama_server() -> str:
    """Locate the ``llama-server`` executable.

    ``PATH`` is consulted first (via ``shutil.which``); after that a fixed
    list of well-known install locations is probed in order.

    Returns:
        The path of the first candidate that is a regular file with execute
        permission.

    Raises:
        FileNotFoundError: If no candidate qualifies. The message lists every
            ``*llama*`` entry found in the directories of the fixed
            locations, to make a wrong image layout easy to diagnose.
    """
    fixed_locations = (
        "/app/llama-server",
        "/usr/local/bin/llama-server",
        "/usr/bin/llama-server",
        "/bin/llama-server",
        "/llama-server",
    )
    for candidate in (shutil.which("llama-server"), *fixed_locations):
        # shutil.which returns None when nothing is on PATH.  A directory or
        # a non-executable file would only fail later inside subprocess.Popen
        # with a less helpful error, so such candidates are skipped here.
        if candidate and Path(candidate).is_file() and os.access(candidate, os.X_OK):
            return candidate

    # Derive the directories to report from the probed locations so the
    # diagnostic always covers every place that was actually checked
    # (dict.fromkeys de-duplicates while keeping the probe order).
    probed_dirs = dict.fromkeys(str(Path(location).parent) for location in fixed_locations)
    inspected = []
    for directory in probed_dirs:
        path = Path(directory)
        if path.is_dir():
            inspected.append(f"{directory}: {[item.name for item in path.glob('*llama*')]}")
    raise FileNotFoundError(f"llama-server binary was not found. Inspected: {'; '.join(inspected)}")
# Container image for the server function: the llama.cpp CUDA 12 server image
# from ghcr.io with Python 3.12 added by Modal (add_python).
reader_brain_image = (
modal.Image.from_registry(
"ghcr.io/ggml-org/llama.cpp:server-cuda12",
add_python="3.12",
)
# Clear the base image's ENTRYPOINT; the server is started explicitly from
# reader_brain_server via subprocess instead.
.dockerfile_commands("ENTRYPOINT []")
.env(
{
# Place the Hugging Face cache under CACHE_DIR, where the model_cache volume
# is mounted.
# NOTE(review): presumably this keeps downloaded model files across
# container restarts; confirm that llama-server's -hf download honors HF_HOME.
"HF_HOME": f"{CACHE_DIR}/huggingface",
}
)
)
@app.function(
    image=reader_brain_image,
    gpu="T4",
    volumes={CACHE_DIR: model_cache},
    secrets=_secret_names(),
    timeout=900,
    scaledown_window=300,
    max_containers=1,
)
@modal.concurrent(max_inputs=20)
@modal.web_server(SERVER_PORT, startup_timeout=600)
def reader_brain_server() -> None:
    """Start llama-server in the background on ``SERVER_PORT``.

    The command line is assembled from the module constants, the server is
    spawned with ``subprocess.Popen`` (which does not wait for it), and the
    function returns immediately.  The ``modal.web_server`` decorator is
    given the same port with a 600 second startup timeout.

    When the ``LLAMA_CPP_TOKEN`` environment variable is non-empty it is
    passed to llama-server as ``--api-key``.  When it is missing or empty the
    server is still started, but a warning is logged because the endpoint
    will then run without an API key.

    The token never appears in the log output: the printed command shows
    ``***`` in its place.

    Raises:
        FileNotFoundError: Propagated from ``_find_llama_server`` when the
            binary cannot be located.
    """
    api_key = os.getenv("LLAMA_CPP_TOKEN", "")
    command = [
        _find_llama_server(),
        # Bind on all interfaces; see the module docstring for the rationale.
        "--host",
        "0.0.0.0",
        "--port",
        str(SERVER_PORT),
        "-hf",
        MODEL_REF,
        "--alias",
        MODEL_ALIAS,
        "--ctx-size",
        "4096",
        "--parallel",
        "1",
        "--reasoning",
        "off",
        # A layer count this large is presumably meant to offload every
        # layer to the GPU; verify against the llama.cpp documentation.
        "--n-gpu-layers",
        "999",
    ]

    # Build the loggable form alongside the real command so the secret is
    # never placed in the list that gets printed.
    display_command = list(command)
    if api_key:
        command.extend(["--api-key", api_key])
        display_command.extend(["--api-key", "***"])
    else:
        # Starting without a key is kept as a supported fallback, but it
        # must not happen silently.
        print(
            "[tiny-narrator-reader-brain] WARNING: LLAMA_CPP_TOKEN is not set; "
            "starting llama-server without --api-key",
            flush=True,
        )

    print(f"[tiny-narrator-reader-brain] starting: {' '.join(display_command)}", flush=True)
    # Fire and forget: the server keeps running after this function returns.
    subprocess.Popen(command)