Spaces:

build-small-hackathon
/

QED

Running

QED / back_modal.py

SPP

Q.E.D — initial submission

ed428ff 17 days ago

3.88 kB

	import modal
	import subprocess

	# Modal application object; every function below is registered on it through
	# its decorators (`@app.function`, `@app.local_entrypoint`).
	app = modal.App("llama-server")

	# Multiplier used in the `timeout=` / `startup_timeout=` arguments below
	# (e.g. `20 * MINUTES`); presumably seconds per minute -- confirm the unit
	# against the Modal API.
	MINUTES = 60
	# Passed as `gpu=` to the serving function.
	GPU_CONFIG = "A100-40GB"
	# Path at which `model_cache` is mounted (the key of each `volumes=` mapping
	# below); also the `local_dir` target of the download.
	cache_dir = "/root/.cache/llama.cpp"

	# Alternative model settings, currently disabled. MMPROJ_FILE is only
	# referenced from commented-out code in this file.
	#REPO_ID = "google/gemma-4-31B-it-qat-q4_0-gguf"
	#MODEL_FILE = "gemma-4-31B_q4_0-it.gguf"
	#MMPROJ_FILE = "gemma-4-31B-it-mmproj.gguf"


	# Repository id and file name handed to `hf_hub_download`; MODEL_FILE is also
	# the file name that the serving function copies to /tmp.
	REPO_ID = "unsloth/Qwen3.6-27B-GGUF"
	MODEL_FILE = "Qwen3.6-27B-Q4_K_M.gguf"


	# Tag interpolated into the `nvidia/cuda:{cuda_tag}` base image reference.
	cuda_tag = "12.4.0-devel-ubuntu22.04"

	# Named volume shared by the download and serving functions; created on
	# first use because of `create_if_missing=True`.
	model_cache = modal.Volume.from_name("llamacpp-cache", create_if_missing=True)

	# Serving image, built one layer at a time: CUDA base image, build
	# toolchain, llama.cpp checkout, CMake configure, build of the
	# `llama-server` target, then install of the binary on PATH.
	# NOTE(review): the checkout is not pinned to a commit or tag, so a rebuild
	# picks up whatever `master` is at that moment.
	image = modal.Image.from_registry(f"nvidia/cuda:{cuda_tag}", add_python="3.11")
	image = image.apt_install(
	    "git",
	    "build-essential",
	    "cmake",
	    "curl",
	    "libcurl4-openssl-dev",
	    "libssl-dev",
	)
	image = image.run_commands(
	    "git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp && git pull origin master"
	)
	image = image.run_commands(
	    "cmake llama.cpp -B llama.cpp/build -DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON -DLLAMA_OPENSSL=ON"
	)
	# The trailing space in the command string is kept from the original definition.
	image = image.run_commands(
	    "cmake --build llama.cpp/build --config Release -j --target llama-server "
	)
	image = image.run_commands("cp llama.cpp/build/bin/llama-server /usr/local/bin/")
	# Reset the entrypoint to an empty list.
	image = image.entrypoint([])

	# Image used only by `download_model`: Debian slim with a pinned
	# `huggingface_hub` (hf_transfer extra) and the hf_transfer switch set in
	# the environment.
	download_image = modal.Image.debian_slim(python_version="3.11")
	download_image = download_image.pip_install("huggingface_hub[hf_transfer]==0.26.2")
	download_image = download_image.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})


	@app.function(
	    image=download_image,
	    volumes={cache_dir: model_cache},
	    timeout=20 * MINUTES,
	)
	def download_model():
	    """Fetch the model file(s) from REPO_ID into the mounted volume.

	    Each file is downloaded with `hf_hub_download` into `cache_dir`, after
	    which `model_cache.commit()` is called once.
	    """
	    from huggingface_hub import hf_hub_download

	    # Only the main model file is fetched; MMPROJ_FILE would be added here
	    # if the multimodal projector were re-enabled.
	    wanted_files = (MODEL_FILE,)
	    for wanted in wanted_files:
	        hf_hub_download(repo_id=REPO_ID, filename=wanted, local_dir=cache_dir)

	    model_cache.commit()


	@app.function(
	    image=image,
	    gpu=GPU_CONFIG,
	    volumes={cache_dir: model_cache},
	    timeout=60 * MINUTES,
	    max_containers=1,
	    min_containers=1,  # kept at 1 for judging, so judges start with a warm container
	)
	@modal.web_server(port=8080, startup_timeout=5 * MINUTES)
	def serve():
	    """Launch `llama-server` on port 8080, wait until it is healthy, warm it up.

	    Steps:
	      1. Copy the model file from the mounted volume to /tmp, unless it is
	         already there.
	      2. Start `llama-server` as a child process.
	      3. Poll `/health` (up to 60 attempts, 5 s apart), failing fast if the
	         child process exits in the meantime.
	      4. Send one short chat completion as a warm-up. A failure here is
	         printed but does not abort start-up, because the server has
	         already reported healthy.

	    Raises:
	        FileNotFoundError: the model file is not present in the volume
	            (raised by `shutil.copy2`).
	        RuntimeError: `llama-server` exited before it became healthy.
	        TimeoutError: `llama-server` never reported healthy within the
	            polling budget.
	    """
	    import http.client
	    import json
	    import os
	    import shutil
	    import time
	    import urllib.request

	    base_url = "http://localhost:8080"
	    local_model = f"/tmp/{MODEL_FILE}"
	    # local_mmproj = f"/tmp/{MMPROJ_FILE}"

	    if not os.path.exists(local_model):
	        print("Copying model to local storage...", flush=True)
	        # Copy under a temporary name and rename afterwards, so an interrupted
	        # copy can never leave a truncated file that passes the check above.
	        partial_copy = f"{local_model}.part"
	        shutil.copy2(f"{cache_dir}/{MODEL_FILE}", partial_copy)
	        os.replace(partial_copy, local_model)
	        # shutil.copy2(f"{cache_dir}/{MMPROJ_FILE}", local_mmproj)
	        print("Copy complete.", flush=True)

	    # Flags are passed through verbatim; see the llama.cpp server
	    # documentation for their exact meaning.
	    server = subprocess.Popen([
	        "llama-server",
	        "-m", local_model,
	        # "--mmproj", local_mmproj,
	        "--host", "0.0.0.0",
	        "--port", "8080",
	        "--ctx-size", "4096",
	        "-ngl", "999",
	        "--flash-attn", "on",
	        "-np", "1",
	        "-b", "2048",
	        "-ub", "512",
	        "--cache-type-k", "q8_0",
	        "--cache-type-v", "q8_0",
	        "-t", "8",
	        # "--no-mmap": loads the weights into RAM; per the original author
	        #              not needed because the model is read from /tmp.
	        # "--no-warmup": would skip the server's own empty warm-up run.
	    ])

	    # Errors that mean "not reachable / not ready yet": refused connections,
	    # socket timeouts and HTTP error statuses are all OSError subclasses;
	    # HTTPException covers malformed or truncated responses.
	    transient_errors = (OSError, http.client.HTTPException)

	    # Wait for the server to report ready.
	    for _ in range(60):
	        exit_code = server.poll()
	        if exit_code is not None:
	            # The process is gone; further polling could never succeed.
	            raise RuntimeError(
	                f"llama-server exited with code {exit_code} before becoming ready"
	            )
	        try:
	            with urllib.request.urlopen(f"{base_url}/health", timeout=5):
	                break
	        except transient_errors:
	            time.sleep(5)
	    else:
	        # Never became healthy: stop the child and report the real cause
	        # instead of failing later inside the warm-up request.
	        server.terminate()
	        raise TimeoutError("llama-server did not become healthy in time")

	    # Fire one real request; the original author's intent is to get the
	    # actual CUDA graphs compiled before the first user request arrives.
	    payload = json.dumps({
	        "model": "any",
	        "messages": [{"role": "user", "content": "hi"}],
	        "max_tokens": 10,
	        "chat_template_kwargs": {"enable_thinking": False},
	    }).encode()
	    warmup_request = urllib.request.Request(
	        f"{base_url}/v1/chat/completions",
	        data=payload,
	        headers={"Content-Type": "application/json"},
	    )
	    try:
	        with urllib.request.urlopen(warmup_request, timeout=120) as response:
	            response.read()
	    except transient_errors as exc:
	        # Warm-up is only an optimisation; keep the healthy server running.
	        print(f"Warmup request failed, continuing without it: {exc}", flush=True)
	    else:
	        print("Warmup complete.", flush=True)




	@app.local_entrypoint()
	def main():
	    """Local entrypoint: invoke `download_model.remote()` to fill the model volume."""
	    download_model.remote()