Spaces:
Running
Running
| import modal | |
| import subprocess | |
# Modal application object that the functions in this file belong to.
app = modal.App("llama-server")

# --- Model selection ---------------------------------------------------------
# Hugging Face repository and GGUF file that get downloaded and served.
REPO_ID = "unsloth/Qwen3.6-27B-GGUF"
MODEL_FILE = "Qwen3.6-27B-Q4_K_M.gguf"
# Alternative configuration (includes a multimodal projector file), kept for
# reference:
#   REPO_ID = "google/gemma-4-31B-it-qat-q4_0-gguf"
#   MODEL_FILE = "gemma-4-31B_q4_0-it.gguf"
#   MMPROJ_FILE = "gemma-4-31B-it-mmproj.gguf"

# --- Runtime settings --------------------------------------------------------
# NOTE(review): MINUTES and GPU_CONFIG are not referenced in the visible code;
# presumably they feed function decorators (timeouts / GPU type) -- confirm.
MINUTES = 60
GPU_CONFIG = "A100-40GB"

# Tag of the nvidia/cuda base image used to build llama.cpp.
cuda_tag = "12.4.0-devel-ubuntu22.04"

# --- Model storage -----------------------------------------------------------
# Directory the model files are downloaded to and later read from.
cache_dir = "/root/.cache/llama.cpp"
# Named Modal volume (created on first use) that persists the downloaded files.
model_cache = modal.Volume.from_name("llamacpp-cache", create_if_missing=True)
# Serving image: CUDA development base image with Python 3.11, in which
# llama.cpp is cloned, configured with CUDA support and built; only the
# `llama-server` binary is installed onto PATH.
# NOTE(review): the build tracks upstream `master` and is not pinned to a
# commit, so image rebuilds may pick up different llama.cpp versions.
image = (
    modal.Image.from_registry("nvidia/cuda:" + cuda_tag, add_python="3.11")
    # Toolchain and libraries needed to compile llama.cpp with curl/OpenSSL.
    .apt_install(
        "git", "build-essential", "cmake",
        "curl", "libcurl4-openssl-dev", "libssl-dev",
    )
    # Fetch the sources.
    .run_commands(
        "git clone https://github.com/ggerganov/llama.cpp "
        "&& cd llama.cpp "
        "&& git pull origin master"
    )
    # Configure: static libraries, CUDA backend, curl and OpenSSL enabled.
    .run_commands(
        "cmake llama.cpp -B llama.cpp/build -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON -DLLAMA_OPENSSL=ON"
    )
    # Build only the server target.
    .run_commands(
        "cmake --build llama.cpp/build --config Release -j --target llama-server "
    )
    # Install the binary where it can be invoked by name.
    .run_commands("cp llama.cpp/build/bin/llama-server /usr/local/bin/")
    # Clear the base image's entrypoint.
    .entrypoint([])
)
# Lightweight image used only for fetching model files from the Hugging Face
# Hub; built step by step from a slim Debian base with Python 3.11.
download_image = modal.Image.debian_slim(python_version="3.11")
# Pinned hub client, including the optional `hf_transfer` extra.
download_image = download_image.pip_install("huggingface_hub[hf_transfer]==0.26.2")
# Environment switch that turns on the hf_transfer download backend.
download_image = download_image.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
def download_model():
    """Download the configured model file(s) into ``cache_dir``.

    Each file is fetched from ``REPO_ID`` on the Hugging Face Hub and written
    under ``cache_dir``; afterwards ``model_cache.commit()`` is called to
    persist the volume's changes.

    NOTE(review): this assumes ``cache_dir`` is where ``model_cache`` is
    mounted -- the mount configuration is not visible here, confirm.
    """
    from huggingface_hub import hf_hub_download

    # Append MMPROJ_FILE here when serving a model with a multimodal projector.
    wanted_files = (MODEL_FILE,)
    for wanted in wanted_files:
        hf_hub_download(repo_id=REPO_ID, filename=wanted, local_dir=cache_dir)

    model_cache.commit()
def serve():
    """Start ``llama-server`` on port 8080 and return once it has been warmed up.

    Steps:
      1. Copy the model file from ``cache_dir`` to ``/tmp`` unless it is
         already there.
      2. Launch ``llama-server`` as a background process.
      3. Poll ``/health`` until it answers successfully.
      4. Send one small chat completion so the first real request does not
         pay the warmup cost.

    Raises:
        RuntimeError: if the server process exits while starting up, or if
            ``/health`` has not succeeded after 60 polls.
        urllib.error.URLError: if the warmup request itself fails.
    """
    import http.client
    import json
    import os
    import shutil
    import time
    import urllib.request

    base_url = "http://localhost:8080"
    health_polls = 60
    poll_interval_s = 5

    local_model = f"/tmp/{MODEL_FILE}"
    # local_mmproj = f"/tmp/{MMPROJ_FILE}"
    if not os.path.exists(local_model):
        print("Copying model to local storage...", flush=True)
        # Copy under a temporary name and rename afterwards, so an interrupted
        # copy can never leave a truncated file that passes the exists() check.
        partial_copy = f"{local_model}.part"
        shutil.copy2(f"{cache_dir}/{MODEL_FILE}", partial_copy)
        os.replace(partial_copy, local_model)
        # shutil.copy2(f"{cache_dir}/{MMPROJ_FILE}", local_mmproj)
        print("Copy complete.", flush=True)

    # Keep the process handle so a crash during startup can be detected.
    server = subprocess.Popen([
        "llama-server",
        "-m", local_model,
        # "--mmproj", local_mmproj,
        "--host", "0.0.0.0",
        "--port", "8080",
        "--ctx-size", "4096",
        "-ngl", "999",
        "--flash-attn", "on",
        "-np", "1",
        "-b", "2048",
        "-ub", "512",
        "--cache-type-k", "q8_0",
        "--cache-type-v", "q8_0",
        "-t", "8",
        # "--no-mmap",    loads the weights into RAM; not needed because the
        #                 model already lives in /tmp
        # "--no-warmup",  would skip the server's own empty warmup run
    ])

    # Wait for the server to report ready.
    for _ in range(health_polls):
        exit_code = server.poll()
        if exit_code is not None:
            # The process is gone; further polling could never succeed.
            raise RuntimeError(
                f"llama-server exited during startup with code {exit_code}"
            )
        try:
            # urlopen raises for connection errors and non-2xx statuses, so
            # reaching the body of the `with` means the server is healthy.
            with urllib.request.urlopen(f"{base_url}/health", timeout=poll_interval_s):
                break
        except (OSError, http.client.HTTPException):
            time.sleep(poll_interval_s)
    else:
        # Never became healthy: stop the child rather than leaving it orphaned.
        server.terminate()
        raise RuntimeError(
            f"llama-server did not become healthy after {health_polls} health checks"
        )

    # Fire a real request to compile the actual CUDA graphs.
    payload = json.dumps({
        "model": "any",
        "messages": [{"role": "user", "content": "hi"}],
        "max_tokens": 10,
        "chat_template_kwargs": {"enable_thinking": False},
    }).encode()
    req = urllib.request.Request(
        f"{base_url}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    # Generous timeout: the first generation can be slow, but must not hang
    # forever. Reading the body ensures the response is complete, and the
    # context manager closes the connection.
    with urllib.request.urlopen(req, timeout=600) as response:
        response.read()
    print("Warmup complete.", flush=True)
def main():
    """Trigger the model download via ``download_model.remote()``.

    NOTE(review): assumes ``download_model`` is registered as a Modal function
    so that ``.remote()`` exists -- its decorator is not visible here.
    """
    download_model.remote()