QED / back_modal.py
SPP
Q.E.D — initial submission
ed428ff
Raw
History Blame Contribute Delete
3.88 kB
import modal
import subprocess
# Modal application object; the functions below register themselves on it
# through the @app.function / @app.local_entrypoint decorators.
app = modal.App("llama-server")
# --- Time helpers -----------------------------------------------------------
# Multiplier used to write timeouts as "<n> * MINUTES".
MINUTES = 60

# --- Hardware / base image --------------------------------------------------
# GPU type requested for the serving function.
GPU_CONFIG = "A100-40GB"
# Tag of the nvidia/cuda registry image that llama.cpp is compiled on.
cuda_tag = "12.4.0-devel-ubuntu22.04"

# --- Model selection --------------------------------------------------------
# Hugging Face repository and GGUF file that get downloaded and served.
REPO_ID = "unsloth/Qwen3.6-27B-GGUF"
MODEL_FILE = "Qwen3.6-27B-Q4_K_M.gguf"
# Alternative model configuration, currently disabled:
# REPO_ID = "google/gemma-4-31B-it-qat-q4_0-gguf"
# MODEL_FILE = "gemma-4-31B_q4_0-it.gguf"
# MMPROJ_FILE = "gemma-4-31B-it-mmproj.gguf"

# --- Storage ----------------------------------------------------------------
# Directory inside the containers where the model volume is mounted.
cache_dir = "/root/.cache/llama.cpp"
# Named Modal volume holding the downloaded GGUF file(s); created on first use.
# It is mounted at `cache_dir` by both the download and the serving function.
model_cache = modal.Volume.from_name("llamacpp-cache", create_if_missing=True)
# System packages installed before cloning and compiling llama.cpp.
_BUILD_PACKAGES = (
    "git",
    "build-essential",
    "cmake",
    "curl",
    "libcurl4-openssl-dev",
    "libssl-dev",
)

# Shell steps, each passed to its own run_commands() call, in this order:
# fetch the sources, configure the CUDA build, compile only the server
# target, then put the binary on PATH.
_CLONE_CMD = (
    "git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp && git pull origin master"
)
_CONFIGURE_CMD = (
    "cmake llama.cpp -B llama.cpp/build "
    "-DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON -DLLAMA_OPENSSL=ON"
)
_COMPILE_CMD = (
    "cmake --build llama.cpp/build --config Release -j "
    "--target llama-server "
)
_INSTALL_CMD = "cp llama.cpp/build/bin/llama-server /usr/local/bin/"

# Serving image: CUDA devel base with Python 3.11 and a freshly built
# `llama-server` binary in /usr/local/bin.
_cuda_base = modal.Image.from_registry(f"nvidia/cuda:{cuda_tag}", add_python="3.11")
image = (
    _cuda_base.apt_install(*_BUILD_PACKAGES)
    .run_commands(_CLONE_CMD)
    .run_commands(_CONFIGURE_CMD)
    .run_commands(_COMPILE_CMD)
    .run_commands(_INSTALL_CMD)
    # Set the image entrypoint to an empty list.
    .entrypoint([])
)
# Lightweight image used only by download_model(): Debian slim plus a pinned
# huggingface_hub with the hf_transfer extra, switched on via the environment.
_slim_base = modal.Image.debian_slim(python_version="3.11")
download_image = _slim_base.pip_install(
    "huggingface_hub[hf_transfer]==0.26.2"
).env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
@app.function(
    image=download_image,
    volumes={cache_dir: model_cache},
    timeout=20 * MINUTES,
)
def download_model():
    """Download the configured GGUF file(s) into the shared model volume.

    Each file is fetched from ``REPO_ID`` into ``cache_dir`` (where the volume
    is mounted), and the volume is committed afterwards.
    """
    from huggingface_hub import hf_hub_download

    # MMPROJ_FILE used to be listed here as well; it is currently disabled.
    wanted_files = (MODEL_FILE,)
    for wanted in wanted_files:
        hf_hub_download(repo_id=REPO_ID, filename=wanted, local_dir=cache_dir)

    model_cache.commit()
@app.function(
    image=image,
    gpu=GPU_CONFIG,
    volumes={cache_dir: model_cache},
    timeout=60 * MINUTES,
    max_containers=1,
    min_containers=1,  # for judging, so that judges can start with a warm container
)
@modal.web_server(port=8080, startup_timeout=5 * MINUTES)
def serve():
    """Start ``llama-server`` on port 8080 and warm it up with one request.

    Steps:
      1. Copy the model file from the mounted volume to ``/tmp`` unless it is
         already there.
      2. Launch ``llama-server`` as a background subprocess.
      3. Poll ``/health`` until it answers successfully.
      4. Send one small chat completion so the first real request does not
         pay the warmup cost.

    Raises:
        RuntimeError: if ``llama-server`` exits while startup is being polled.
        TimeoutError: if ``/health`` never succeeds within the polling window.
        urllib.error.URLError: if the warmup request fails.
    """
    import http.client
    import json
    import os
    import shutil
    import time
    import urllib.request

    base_url = "http://localhost:8080"
    health_attempts = 60
    health_interval_s = 5

    local_model = f"/tmp/{MODEL_FILE}"
    # local_mmproj = f"/tmp/{MMPROJ_FILE}"
    if not os.path.exists(local_model):
        print("Copying model to local storage...", flush=True)
        shutil.copy2(f"{cache_dir}/{MODEL_FILE}", local_model)
        # shutil.copy2(f"{cache_dir}/{MMPROJ_FILE}", local_mmproj)
        print("Copy complete.", flush=True)

    # Keep the handle so a crashed server can be detected while polling.
    server = subprocess.Popen([
        "llama-server",
        "-m", local_model,
        # "--mmproj", local_mmproj,
        "--host", "0.0.0.0",
        "--port", "8080",
        "--ctx-size", "4096",
        "-ngl", "999",
        "--flash-attn", "on",
        "-np", "1",
        "-b", "2048",
        "-ub", "512",
        "--cache-type-k", "q8_0",
        "--cache-type-v", "q8_0",
        "-t", "8",
        # "--no-mmap",   (loads weights into RAM; not needed because of /tmp)
        # "--no-warmup", (skip empty run)
    ])

    # Wait for the server to report ready.
    for _ in range(health_attempts):
        exit_code = server.poll()
        if exit_code is not None:
            # No point in polling further: the process is gone.
            raise RuntimeError(
                f"llama-server exited during startup with code {exit_code}"
            )
        try:
            # A non-2xx answer raises HTTPError (an OSError subclass), so
            # reaching the body of the `with` means the check succeeded.
            with urllib.request.urlopen(f"{base_url}/health", timeout=5):
                break
        except (OSError, http.client.HTTPException):
            time.sleep(health_interval_s)
    else:
        raise TimeoutError(
            f"llama-server did not become healthy after {health_attempts} checks"
        )

    # Fire a real request to compile actual CUDA graphs.
    payload = json.dumps({
        "model": "any",
        "messages": [{"role": "user", "content": "hi"}],
        "max_tokens": 10,
        "chat_template_kwargs": {"enable_thinking": False}
    }).encode()
    req = urllib.request.Request(
        f"{base_url}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    # Bounded wait, and the response is drained and closed instead of leaked.
    with urllib.request.urlopen(req, timeout=120) as resp:
        resp.read()
    print("Warmup complete.", flush=True)
@app.local_entrypoint()
def main():
    """Local entrypoint: run download_model remotely to populate the model volume."""
    download_model.remote()