"""Serve a GGUF model with llama.cpp's ``llama-server`` on Modal.

The module defines two Modal functions:

* ``download_model`` fetches the GGUF file from the Hugging Face Hub into a
  persistent Modal volume.
* ``serve`` copies the model from the volume to container-local storage,
  launches ``llama-server`` and blocks until it has answered a first chat
  completion.

The ``main`` local entrypoint triggers ``download_model`` remotely.
"""

import http.client
import json
import os
import shutil
import subprocess
import time
import urllib.request

import modal

app = modal.App("llama-server")

MINUTES = 60  # seconds per minute; Modal timeouts are written as N * MINUTES
GPU_CONFIG = "A100-40GB"
cache_dir = "/root/.cache/llama.cpp"

# Alternative model configuration (also needs the mmproj file everywhere
# MMPROJ_FILE is mentioned below):
#REPO_ID = "google/gemma-4-31B-it-qat-q4_0-gguf"
#MODEL_FILE = "gemma-4-31B_q4_0-it.gguf"
#MMPROJ_FILE = "gemma-4-31B-it-mmproj.gguf"
REPO_ID = "unsloth/Qwen3.6-27B-GGUF"
MODEL_FILE = "Qwen3.6-27B-Q4_K_M.gguf"

cuda_tag = "12.4.0-devel-ubuntu22.04"

# Port llama-server listens on; it must match the port given to
# ``modal.web_server`` on ``serve``.
SERVER_PORT = 8080
SERVER_URL = f"http://localhost:{SERVER_PORT}"

# Readiness polling: one /health probe every HEALTH_POLL_INTERVAL seconds,
# for at most HEALTH_TIMEOUT seconds in total.
HEALTH_POLL_INTERVAL = 5
HEALTH_TIMEOUT = 60 * HEALTH_POLL_INTERVAL
# Upper bound, in seconds, for the single warm-up chat completion.
WARMUP_TIMEOUT = 120

model_cache = modal.Volume.from_name("llamacpp-cache", create_if_missing=True)

# GPU image: CUDA devel base with llama-server built from source.
# The clone is not pinned, so the build uses whatever master is at image
# build time.
image = (
    modal.Image.from_registry(f"nvidia/cuda:{cuda_tag}", add_python="3.11")
    .apt_install(
        "git",
        "build-essential",
        "cmake",
        "curl",
        "libcurl4-openssl-dev",
        "libssl-dev",
    )
    .run_commands("git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp && git pull origin master")
    .run_commands(
        "cmake llama.cpp -B llama.cpp/build "
        "-DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON -DLLAMA_OPENSSL=ON"
    )
    .run_commands(
        "cmake --build llama.cpp/build --config Release -j "
        "--target llama-server "
    )
    .run_commands("cp llama.cpp/build/bin/llama-server /usr/local/bin/")
    .entrypoint([])
)

# Lightweight CPU image used only for downloading the model.
download_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("huggingface_hub[hf_transfer]==0.26.2")
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)


@app.function(
    image=download_image,
    volumes={cache_dir: model_cache},
    timeout=20 * MINUTES,
)
def download_model():
    """Download the model file(s) into the shared volume and commit it."""
    from huggingface_hub import hf_hub_download

    for filename in [MODEL_FILE]:  # add MMPROJ_FILE here for multimodal models
        hf_hub_download(
            repo_id=REPO_ID,
            filename=filename,
            local_dir=cache_dir,
        )
    model_cache.commit()


def _stage_model_locally():
    """Copy the model from the volume to ``/tmp`` and return the local path.

    The copy is written to a ``.part`` file and renamed afterwards, so an
    interrupted copy can never be mistaken for a complete model.

    Raises:
        FileNotFoundError: the model is not present in the volume.
    """
    cached_model = f"{cache_dir}/{MODEL_FILE}"
    local_model = f"/tmp/{MODEL_FILE}"
    #local_mmproj = f"/tmp/{MMPROJ_FILE}"

    if os.path.exists(local_model):
        return local_model

    if not os.path.exists(cached_model):
        raise FileNotFoundError(
            f"{cached_model} not found in the model volume; "
            "run download_model first"
        )

    print("Copying model to local storage...", flush=True)
    partial_copy = f"{local_model}.part"
    shutil.copy2(cached_model, partial_copy)
    os.replace(partial_copy, local_model)
    #shutil.copy2(f"{cache_dir}/{MMPROJ_FILE}", local_mmproj)
    print("Copy complete.", flush=True)
    return local_model


def _wait_until_healthy(server):
    """Block until ``llama-server`` answers ``/health`` successfully.

    Args:
        server: the ``subprocess.Popen`` handle of the running server.

    Raises:
        RuntimeError: the server process exited before becoming healthy.
        TimeoutError: the server was not healthy within ``HEALTH_TIMEOUT``.
    """
    health_url = f"{SERVER_URL}/health"
    deadline = time.monotonic() + HEALTH_TIMEOUT

    while time.monotonic() < deadline:
        # Fail fast instead of polling a dead process for minutes.
        exit_code = server.poll()
        if exit_code is not None:
            raise RuntimeError(
                f"llama-server exited with code {exit_code} before becoming healthy"
            )
        try:
            # urlopen raises on connection errors and on non-2xx statuses,
            # so leaving the ``with`` normally means the probe succeeded.
            with urllib.request.urlopen(health_url, timeout=HEALTH_POLL_INTERVAL):
                return
        except (OSError, http.client.HTTPException):
            time.sleep(HEALTH_POLL_INTERVAL)

    server.terminate()
    raise TimeoutError(
        f"llama-server did not become healthy within {HEALTH_TIMEOUT} seconds"
    )


def _warm_up():
    """Send one short chat completion so the first real request is not cold.

    Warm-up is an optimisation: a failure is reported but does not abort
    startup, because the server has already passed its health check.
    """
    payload = json.dumps({
        "model": "any",
        "messages": [{"role": "user", "content": "hi"}],
        "max_tokens": 10,
        "chat_template_kwargs": {"enable_thinking": False}
    }).encode()
    req = urllib.request.Request(
        f"{SERVER_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"}
    )
    try:
        with urllib.request.urlopen(req, timeout=WARMUP_TIMEOUT) as response:
            response.read()  # drain the body so the request fully completes
    except (OSError, http.client.HTTPException) as exc:
        print(f"Warmup request failed: {exc!r}", flush=True)
        return
    print("Warmup complete.", flush=True)


@app.function(
    image=image,
    gpu=GPU_CONFIG,
    volumes={cache_dir: model_cache},
    timeout=60 * MINUTES,
    max_containers=1,
    min_containers=1,  # for judging, so that judges can start with a warm container
)
@modal.web_server(port=SERVER_PORT, startup_timeout=5 * MINUTES)
def serve():
    """Launch ``llama-server`` and return once it is healthy and warmed up.

    Raises:
        FileNotFoundError: the model has not been downloaded to the volume.
        RuntimeError: ``llama-server`` exited during startup.
        TimeoutError: ``llama-server`` never reported healthy.
    """
    local_model = _stage_model_locally()

    server = subprocess.Popen([
        "llama-server",
        "-m", local_model,
        # "--mmproj", local_mmproj,
        "--host", "0.0.0.0",
        "--port", str(SERVER_PORT),
        "--ctx-size", "4096",
        "-ngl", "999",
        "--flash-attn", "on",
        "-np", "1",
        "-b", "2048",
        "-ub", "512",
        "--cache-type-k", "q8_0",
        "--cache-type-v", "q8_0",
        "-t", "8",
        #"--no-mmap",   loads weights into RAM; not needed because the model is in /tmp
        #"--no-warmup", skips the empty warm-up run
    ])

    # wait for server ready
    _wait_until_healthy(server)

    # fire a real request to compile actual CUDA graphs
    _warm_up()


@app.local_entrypoint()
def main():
    """Local entrypoint: download the model into the volume remotely."""
    download_model.remote()