import modal
import subprocess

# Modal application object; the functions below are registered on it.
app = modal.App("llama-server")

# Multiplier for expressing the timeout arguments below in minutes
# (assumes Modal timeouts are given in seconds — confirm against Modal docs).
MINUTES = 60
# GPU type requested for the serving function (passed as `gpu=`).
GPU_CONFIG = "A100-40GB"
# Mount point of the model volume inside both the download and serving containers.
cache_dir = "/root/.cache/llama.cpp"

# Alternative model, currently disabled. It also needs the separate MMPROJ_FILE,
# whose uses are commented out in download_model() and serve().
#REPO_ID = "google/gemma-4-31B-it-qat-q4_0-gguf"
#MODEL_FILE = "gemma-4-31B_q4_0-it.gguf"
#MMPROJ_FILE = "gemma-4-31B-it-mmproj.gguf"


# Active model: Hugging Face repository and the GGUF file to fetch from it.
REPO_ID = "unsloth/Qwen3.6-27B-GGUF"
MODEL_FILE = "Qwen3.6-27B-Q4_K_M.gguf"


# Tag of the nvidia/cuda base image used to build llama.cpp.
cuda_tag = "12.4.0-devel-ubuntu22.04"

# Named volume holding the downloaded model files; created on first use.
model_cache = modal.Volume.from_name("llamacpp-cache", create_if_missing=True)

# System packages installed before fetching and compiling llama.cpp.
_BUILD_PACKAGES = (
    "git",
    "build-essential",
    "cmake",
    "curl",
    "libcurl4-openssl-dev",
    "libssl-dev",
)

# Build steps, each executed as its own `run_commands` layer in this order.
_FETCH_SOURCES = (
    "git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp && git pull origin master"
)
_CONFIGURE_BUILD = (
    "cmake llama.cpp -B llama.cpp/build "
    "-DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON -DLLAMA_OPENSSL=ON"
)
_COMPILE_SERVER = (
    "cmake --build llama.cpp/build --config Release -j "
    "--target llama-server "
)
_INSTALL_SERVER = "cp llama.cpp/build/bin/llama-server /usr/local/bin/"

# Serving image: CUDA devel base with a statically linked, CUDA-enabled
# `llama-server` binary copied into /usr/local/bin.
image = (
    modal.Image.from_registry(f"nvidia/cuda:{cuda_tag}", add_python="3.11")
    .apt_install(*_BUILD_PACKAGES)
    .run_commands(_FETCH_SOURCES)
    .run_commands(_CONFIGURE_BUILD)
    .run_commands(_COMPILE_SERVER)
    .run_commands(_INSTALL_SERVER)
    # Set an empty entrypoint on the resulting image.
    .entrypoint([])
)

# Lightweight image used only for downloading model files from Hugging Face.
_HF_CLIENT_REQUIREMENT = "huggingface_hub[hf_transfer]==0.26.2"
_HF_TRANSFER_ENV = {"HF_HUB_ENABLE_HF_TRANSFER": "1"}

_slim_python = modal.Image.debian_slim(python_version="3.11")
download_image = _slim_python.pip_install(_HF_CLIENT_REQUIREMENT).env(_HF_TRANSFER_ENV)


@app.function(
    image=download_image,
    volumes={cache_dir: model_cache},
    timeout=20 * MINUTES,
)
def download_model():
    """Download the model file(s) from ``REPO_ID`` into the mounted volume.

    Files are written under ``cache_dir`` (where ``model_cache`` is mounted)
    and the volume is committed once all downloads have finished.
    """
    from huggingface_hub import hf_hub_download

    # Append MMPROJ_FILE here when using a model that needs it.
    wanted_files = (MODEL_FILE,)

    for gguf_name in wanted_files:
        hf_hub_download(repo_id=REPO_ID, filename=gguf_name, local_dir=cache_dir)

    model_cache.commit()

# Port llama-server listens on; must match the port exposed by `modal.web_server`.
_LLAMA_PORT = 8080


def _wait_until_healthy(server, url, deadline_seconds=300, poll_seconds=5):
    """Block until ``url`` answers successfully, or raise.

    Args:
        server: The ``subprocess.Popen`` handle of the llama-server process.
        url: Health endpoint to poll.
        deadline_seconds: Total time to wait before giving up.
        poll_seconds: Per-request timeout and pause between attempts.

    Raises:
        RuntimeError: If the server process exits while waiting, or if it does
            not become healthy before the deadline.
    """
    import http.client
    import time
    import urllib.request

    deadline = time.monotonic() + deadline_seconds
    while time.monotonic() < deadline:
        # Fail fast instead of polling a dead process for the whole deadline.
        exit_code = server.poll()
        if exit_code is not None:
            raise RuntimeError(
                f"llama-server exited during startup with code {exit_code}"
            )
        try:
            with urllib.request.urlopen(url, timeout=poll_seconds):
                return
        except (OSError, http.client.HTTPException):
            # Connection refused, timeout, or a non-2xx status (HTTPError is an
            # OSError subclass): the server is not ready yet.
            time.sleep(poll_seconds)

    raise RuntimeError(
        f"llama-server did not become healthy within {deadline_seconds} seconds"
    )


def _send_warmup_request(url, timeout_seconds=120):
    """Send one small chat completion so the first real request is not cold.

    The warmup is an optimisation only, so failures are reported and swallowed.

    Returns:
        True if the request succeeded, False otherwise.
    """
    import http.client
    import json
    import urllib.request

    payload = json.dumps({
        "model": "any",
        "messages": [{"role": "user", "content": "hi"}],
        "max_tokens": 10,
        "chat_template_kwargs": {"enable_thinking": False},
    }).encode()
    request = urllib.request.Request(
        url,
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
            response.read()
    except (OSError, http.client.HTTPException) as exc:
        print(f"Warmup request failed (continuing without warmup): {exc}", flush=True)
        return False
    return True


@app.function(
    image=image,
    gpu=GPU_CONFIG,
    volumes={cache_dir: model_cache},
    timeout=60 * MINUTES,
    max_containers=1,
    min_containers=1,  # keep one container warm so judges never hit a cold start
)
@modal.web_server(port=_LLAMA_PORT, startup_timeout=5 * MINUTES)
def serve():
    """Start llama-server on ``_LLAMA_PORT`` and warm it up.

    Steps:
      1. Copy the model from the mounted volume to ``/tmp`` (skipped if it is
         already there).
      2. Launch ``llama-server`` as a background process.
      3. Wait for its ``/health`` endpoint, then send one real chat request.

    Raises:
        FileNotFoundError: If the model file is missing from the volume.
        RuntimeError: If llama-server exits or never becomes healthy.
    """
    import os
    import shutil

    local_model = f"/tmp/{MODEL_FILE}"
    # local_mmproj = f"/tmp/{MMPROJ_FILE}"  # needed only for models with an mmproj file

    if not os.path.exists(local_model):
        volume_model = f"{cache_dir}/{MODEL_FILE}"
        if not os.path.exists(volume_model):
            raise FileNotFoundError(
                f"{volume_model} not found in the model volume; "
                "run download_model first"
            )
        print("Copying model to local storage...", flush=True)
        # Copy to a temporary name and rename afterwards so an interrupted copy
        # can never be mistaken for a complete model file.
        partial_copy = f"{local_model}.partial"
        shutil.copy2(volume_model, partial_copy)
        os.replace(partial_copy, local_model)
        # shutil.copy2(f"{cache_dir}/{MMPROJ_FILE}", local_mmproj)
        print("Copy complete.", flush=True)

    server = subprocess.Popen([
        "llama-server",
        "-m", local_model,
        # "--mmproj", local_mmproj,
        "--host", "0.0.0.0",
        "--port", str(_LLAMA_PORT),
        "--ctx-size", "4096",
        "-ngl", "999",
        "--flash-attn", "on",
        "-np", "1",
        "-b", "2048",
        "-ub", "512",
        "--cache-type-k", "q8_0",
        "--cache-type-v", "q8_0",
        "-t", "8",
        # "--no-mmap",    loads weights into RAM; not needed because the model is in /tmp
        # "--no-warmup",  skips llama-server's own empty warmup run
    ])

    base_url = f"http://localhost:{_LLAMA_PORT}"
    _wait_until_healthy(server, f"{base_url}/health")

    # Fire a real request so the actual inference path is exercised once
    # (intended to get CUDA graphs compiled before the first user request).
    if _send_warmup_request(f"{base_url}/v1/chat/completions"):
        print("Warmup complete.", flush=True)


@app.local_entrypoint()
def main() -> None:
    """Local entrypoint: invoke ``download_model`` remotely to populate the model volume."""
    download_model.remote()